**********************************
* 								 *
* California Body Donors Project *
*								 *
**********************************

* created: June 11, 2012
* last update: Sep 10, 2013 5pm

* Load the raw registrant file from the project directory.

cd "/Users/filizgarip/Desktop/Revisions Sep 13 for SSM"
use "UC_DDL_Registrants.dta", clear

codebook registeration_date

* The registration date is stored as text: characters 1-4 are taken as the
* year, 6-7 as the month and 9-10 as the day, then converted to numbers.
generate reg_yr = substr(registeration_date, 1, 4)
generate reg_mo = substr(registeration_date, 6, 2)
generate reg_da = substr(registeration_date, 9, 2)
destring reg_yr reg_mo reg_da, replace

* Stata daily date built from the three numeric parts.
generate reg_date = mdy(reg_mo, reg_da, reg_yr)

* Campus-specific conversion date; stays missing for any other campus code.
generate conversion_date = .
replace  conversion_date = mdy(7, 1, 2009)  if campus_code == "UCD"
replace  conversion_date = mdy(1, 22, 2009) if campus_code == "UCI"
replace  conversion_date = mdy(6, 22, 2009) if campus_code == "UCLA"
replace  conversion_date = mdy(3, 4, 2009)  if campus_code == "UCSF"

* Single conversion date applied to every observation.
generate main_conversion_date = mdy(9, 3, 2008)

format reg_date conversion_date main_conversion_date %d


	* age
	* Birth years outside 1900-1995 are blanked out. The two range checks have
	* to be joined with "|": no year is both below 1900 and above 1995, so an
	* "&" condition never matches. The "< ." guard keeps already-missing values
	* (which Stata ranks above every number) out of the replace.

replace birth_year = . if birth_year<1900 | (birth_year>1995 & birth_year<.)
gen age = reg_yr - birth_year
	* Ages below 18 or above 100 are set to missing; a missing age also
	* satisfies age>100 and simply stays missing.
replace age = . if age<18 | age >100

	* education level: shorten the variable name, then split the code into
	* five mutually exclusive indicators

rename educationlevel_id ed
codebook ed 

* Each indicator is left missing where ed equals the system missing value.
generate lths    = (ed < 15)            if ed != .
generate hs      = inlist(ed, 15, 16)   if ed != .
generate somecol = inlist(ed, 17, 18)   if ed != .
generate col     = (ed == 19)           if ed != .
generate adv     = (ed > 19)            if ed != .

	* sex: 1 when gender is "M", 0 when gender is "F", missing otherwise

generate sex = cond(gender == "M", 1, cond(gender == "F", 0, .))

	* marital status
	* married   = 1 for codes 2 and 5, 0 for codes 1, 3 and 4
	* separated = 1 for codes 3 and 4, 0 for codes 1, 2 and 5
	
recode marital_status_id (1 3 4 = 0) (2 5 = 1), generate(married)
recode marital_status_id (1 2 5 = 0) (3 4 = 1), generate(separated)
rename marital_status_id marital_status

	* race
	* Collapse race_id into five groups, then move everyone flagged as Hispanic
	* into a category of their own.

gen race=race_id
recode race 1=1 2=2 3/7=3 8/18=4 19=5  /*1=White; 2=African American; 3=Native and Islander; 4=Asian American; 5=other*/
	 
gen hispanic= hispanic_id
recode hispanic 1=0 2/7=1  /*1=yes*/
label define dummy 1 "yes" 0 "no"
label value  hispanic dummy
	 
	* Hispanic respondents are parked in code 6, then codes 5 and 6 are swapped
	* so that 5 = Hispanic and 6 = Others, matching the value label below.
replace race=6 if hispanic==1
recode race 5=6 6=5
label define race 1 "White" 2 "Black" 3"Native" 4 "Asian" 5 "Hispanic" 6 "Others"
label value race race

tab race

	* Indicators are built directly from the race codes. Dummies created by
	* -tabulate, generate()- are numbered by the categories that happen to be
	* present, so an empty category would silently shift which group each
	* numbered dummy refers to.
gen byte white = (race==1) if !missing(race)
gen byte black = (race==2) if !missing(race)
gen byte hisp  = (race==5) if !missing(race)
gen byte otr   = inlist(race, 3, 4, 6) if !missing(race)

	* Left missing when race is missing, like the indicators it is built from
	* (an unconditional comparison would code those rows as 0).
gen byte hispotr = (hisp==1 | otr==1) if !missing(race)


	* migrant status
	* A birthplace is "nonUS" when a country other than "USA" is recorded;
	* otherwise it is "US" when a birth state is present or the country is
	* "USA"; otherwise it stays blank. The same rule is applied to the
	* registrant, the father and the mother.
	
generate bplace = "nonUS" if birth_country != "" & birth_country != "USA"
replace  bplace = "US"    if bplace == "" & (birth_state_id != . | birth_country == "USA")

generate father_bplace = "nonUS" if father_birth_country != "" & father_birth_country != "USA"
replace  father_bplace = "US"    if father_bplace == "" & (father_birth_state_id != . | father_birth_country == "USA")

generate mother_bplace = "nonUS" if mother_birth_country != "" & mother_birth_country != "USA"
replace  mother_bplace = "US"    if mother_bplace == "" & (mother_birth_state_id != . | mother_birth_country == "USA")

* mig: 1 if the registrant was born outside the US; missing if birthplace is blank.
generate mig = (bplace == "nonUS") if bplace != ""

* migprt: 1 if either parent was born outside the US; missing only when both
* parents' birthplaces are blank.
generate migprt = (father_bplace == "nonUS" | mother_bplace == "nonUS") ///
	if father_bplace != "" | mother_bplace != ""

	* geographic location: one indicator per campus code, plus two paired
	* indicators (Irvine or LA; SF or Davis)
	
generate davis  = (campus_code == "UCD")
generate irvine = (campus_code == "UCI")
generate la     = (campus_code == "UCLA")
generate sf     = (campus_code == "UCSF")

generate irvine_la = inlist(campus_code, "UCI", "UCLA")
generate sf_davis  = inlist(campus_code, "UCSF", "UCD")

	* us_armed forces: 1 for "Yes", 0 for "No", missing for anything else
	* (including "Unknown" and blank)
	
generate in_army = cond(us_armed_forces == "Yes", 1, cond(us_armed_forces == "No", 0, .))

	* Keep observations after pre-need conversion for each campus (when
	* registration dates were entered accurately)
	* Stata ranks a missing value above every number, so a bare
	* "reg_date > main_conversion_date" would also keep records with no usable
	* registration date; they are excluded explicitly.
	
keep if reg_date>main_conversion_date & !missing(reg_date) // all dates after main conversion date are accurate
									  // with the exception of pre-need conversion dates
drop if reg_date==conversion_date	  // dropped here 


* Compare all registrants to actual donors (deceased registrants)

* died = 1 when the string death_year is non-empty, 0 otherwise.
gen died = (death_year~="")

* Two-group t tests of each covariate, split by died. The commands are kept
* one per line because the two-group ttest table does not print the variable
* name; the echoed command is what identifies each table in the output.
ttest age, by(died)
ttest sex, by(died)
ttest married, by(died)
ttest separated, by(died)
ttest white, by(died)
ttest black, by(died)
ttest hispotr, by(died)
ttest hs, by(died)
ttest somecol, by(died)
ttest col, by(died)
ttest adv, by(died)
ttest mig, by(died)
ttest migprt, by(died)
ttest davis, by(died)
ttest irvine, by(died)
ttest la, by(died)
ttest sf, by(died)
ttest irvine_la, by(died)
ttest sf_davis, by(died)


	* Restrict to registrants with a recorded death year -- the same form was
	* filled again, so few values are missing for them.
	
drop if death_year == ""

* Retain only the variables used from here on.
keep zipcode campus_code death_year us_armed_forces occupation in_army ///
	reg_yr reg_mo reg_da ///
	age sex married separated marital_status ///
	white black hisp otr hispotr race ///
	lths hs somecol col adv ed ///
	mig migprt ///
	davis irvine la sf irvine_la sf_davis

save calif_data_temp.dta, replace

* Outsheet to excel to code occupation manually.
* The complete-case sample is written twice: as a .dta (with id) and as a CSV
* holding id plus the fields needed to code occupation by hand. The in-memory
* data are restored afterwards.

preserve
drop if age==. | married==. | white==. | hs==. | mig==. | migprt==. | ed==.

* "stable" keeps tied observations in their current order. Without it Stata
* orders ties randomly, so id = _n could point at a different person on each
* run and no longer line up with the hand-coded file that is merged back on id.
* NOTE(review): a hand-coded CSV produced from an earlier, unstable run should
* be checked against the occupation text before it is reused.
sort age sex married separated white black hisp hs col mig migprt, stable
gen id = _n

save calif_data_no_occ.dta, replace

outsheet id age hs somecol col adv occupation in_army using calif_data_occ.csv,  delim(",") replace

restore


* Read the hand-coded occupation file back in and store it as a .dta keyed on id

insheet using calif_data_occ_manually_coded.csv, clear
keep id occupation occ_cat1 occ_cat2
sort id
save calif_data_occ_manually_coded.dta, replace

* Attach the two occupation category codes to the complete-case sample

use calif_data_no_occ.dta, clear
sort id
merge 1:1 id using calif_data_occ_manually_coded.dta, keepusing(occ_cat1 occ_cat2) nogenerate
drop id
save calif_data_occ1.dta, replace

	* Per reviewer's request, we need to see if the results are robust to excluding
	* parent's migration status (and consequently to keeping the observations with missing
	* values on that variable in). We need to code the occupation categories manually
	* for that tiny sample.
	
	* "clear" is the documented option of -use- for discarding the data in
	* memory; "replace" is not part of its documented syntax.
	use calif_data_temp.dta, clear

	* Complete cases on everything except migprt, restricted to rows where
	* migprt is missing; id is the row number in the current file order.
	drop if age==. | married==. | white==. | hs==. | mig==. | ed==.
	keep if migprt==.
	gen id = _n
	save calif_data_no_occ2.dta, replace

	outsheet id occupation using calif_data_occ2.csv,  delim(",") replace

	* Insheet the manually-coded data from excel

	clear
	insheet using calif_data_occ2_manually_coded.csv
	keep id occupation occ_cat1 occ_cat2
	sort id
	save calif_data_occ2_manually_coded.dta, replace
	
	* Attach the occupation codes to the migprt-missing subsample.
	use calif_data_no_occ2.dta, clear
	sort id
	merge 1:1 id using calif_data_occ2_manually_coded.dta, keepusing(occ_cat1 occ_cat2)
	drop id _merge
	save calif_data_occ1_migprt_missing.dta, replace
	
	* Append to the larger data set (note the cases with missing migprt aren't
	* in the occupation data)

	clear
	use calif_data_occ1.dta
	append using calif_data_occ1_migprt_missing.dta
	save calif_data_occ1_full.dta, replace //contains the observations w missing migprt
	

* Code the occupation categories

use calif_data_occ1_full.dta, clear

* Rows with a blank occ_cat1 have no occupation code. Their indicators are left
* missing rather than 0, so that any dataset still containing those rows does
* not count them as "not in this occupation" (the education indicators handle
* missing ed the same way).

gen prof = (occ_cat1=="p") if occ_cat1~=""
gen mngr = (occ_cat1=="m") if occ_cat1~=""
gen clrk = (occ_cat1=="c") if occ_cat1~=""
gen serv = (occ_cat1=="s") if occ_cat1~=""
gen wrk = (occ_cat1=="w") if occ_cat1~=""
gen unemp = (occ_cat1=="st" | occ_cat1=="r" | occ_cat1=="u" | occ_cat1=="h") if occ_cat1~=""

//unemp includes unemployed, retired, students and housewives.

	* additional categorization
	
	gen md = (occ_cat2=="md") if occ_cat1~="" //medical professionals
	gen tchr = (occ_cat2=="t") if occ_cat1~="" //teachers
	gen othprof = (md==0 & tchr==0 & prof==1) if occ_cat1~="" //other professionals
	
// Before dropping the missing observations on occupation, save the original 
// data set.

preserve
drop if migprt==.

// "stable" keeps tied observations in their current order; without it ties are
// ordered randomly and id = _n is not reproducible from run to run, which
// matters because cluster assignments are merged back on id.
sort age sex married separated white black hisp hs col mig migprt, stable
gen id = _n

// agec: 0 for the lower half of the age distribution, 1 for the upper half.
xtile agec = age, nq(2)
recode agec 1=0 2=1

save "calif_data_new_orig.dta", replace

//  Save the original data set only (no occupations, migprt included)

outsheet id age agec sex married separated white black hispotr hs somecol col adv mig migprt ///
	     using "calif_data_new_orig.csv", nonames replace

outsheet id age agec sex married separated white black hispotr hs somecol col adv mig migprt ///
		 using "calif_data_new_orig.raw", replace

restore
	

// Drop observations without an occupation code.
drop if occ_cat1== ""

// NOTE(review): this overwrites the file that the occupation-coding step loads
// with the reduced sample, so re-running from that step without rebuilding the
// file would start from data that already lack the uncoded rows.
save "calif_data_occ1_full.dta", replace

// "stable" keeps tied observations in their current order; without it ties are
// ordered randomly and id = _n is not reproducible from run to run, which
// matters because cluster assignments are merged back on id.
sort age sex married separated white black hisp hs col mig, stable
gen id = _n

// agec: 0 for the lower half of the age distribution, 1 for the upper half.
xtile agec = age, nq(2)
recode agec 1=0 2=1

save "calif_data_new_full.dta", replace

// Same variables written twice: CSV without a header row, .raw with one.
outsheet id age agec sex married separated white black hispotr hs somecol col adv mig ///
		 prof mngr clrk serv wrk unemp irvine la davis sf using "calif_data_new_full.csv", nonames replace

outsheet id age agec sex married separated white black hispotr hs somecol col adv mig ///
	     prof mngr clrk serv wrk unemp irvine la davis sf using "calif_data_new_full.raw", replace


// Save the data where the migprt is included (the original analysis)
	
// "clear" is the documented option of -use- for discarding the data in memory;
// "replace" is not part of its documented syntax.
use "calif_data_new_full.dta", clear

// id and agec are rebuilt below because the sample changes once rows with
// missing migprt are removed.
drop id agec
drop if migprt==.

// "stable" keeps tied observations in their current order; without it ties are
// ordered randomly and id = _n is not reproducible from run to run, which
// matters because cluster assignments are merged back on id.
sort age sex married separated white black hisp hs col mig migprt, stable
gen id = _n

// agec: 0 for the lower half of the age distribution, 1 for the upper half.
xtile agec = age, nq(2)
recode agec 1=0 2=1

save "calif_data_new.dta", replace

// Same variables written twice: CSV without a header row, .raw with one.
outsheet id age agec sex married separated white black hispotr hs somecol col adv mig migprt ///
	     prof mngr clrk serv wrk unemp using "calif_data_new.csv", nonames replace

outsheet id age agec sex married separated white black hispotr hs somecol col adv mig migprt ///
		 prof mngr clrk serv wrk unemp using "calif_data_new.raw", replace


// Coarser occupation grouping: professionals, managers and clerks are pooled
// into a single indicator, and the data are saved and exported again.

generate profmngr = inlist(1, prof, mngr, clrk)

save calif_data_new_temp.dta, replace

// Variables written to both export files (CSV without a header row, .raw with one).
local exportvars id age agec sex married separated white black hispotr ///
	hs somecol col adv mig migprt profmngr serv wrk unemp

outsheet `exportvars' using "calif_data_new_temp.csv", nonames replace
outsheet `exportvars' using "calif_data_new_temp.raw", replace


*********************************
*            Analysis           *
*********************************

set logtype text

* Read the cluster assignments (two numeric columns: id, clid) and store them
* as a .dta keyed on id.
* NOTE(review): cluster.txt is not created in this do-file; presumably it comes
* from clustering the exported calif_data_new file -- confirm.
clear
infile id clid using "cluster.txt"
sort id
save "cluster_id.dta", replace

* Attach the cluster id to the analysis sample by id.
use calif_data_new, clear
sort id
merge 1:1 id using cluster_id
drop _merge

	
		* Log overall and by-cluster summaries, then two-group t tests of each
		* covariate by clid (ttest, by() compares exactly two groups). Commands
		* are kept one per line because the two-group ttest table does not print
		* the variable name; the echoed command identifies each table in the log.
		log using "cluster_comparison_new.txt", replace
		sum age sex married separated white black hispotr hs somecol col adv prof mngr clrk serv wrk unemp mig migprt, sep(25)
		bys clid: sum age sex married separated white black hispotr hs somecol col adv prof mngr clrk serv wrk unemp mig migprt, sep(25)
		ttest age, by(clid)
		ttest sex, by(clid)
		ttest married, by(clid)
		ttest separated, by(clid)
		ttest white, by(clid)
		ttest black, by(clid)
		ttest hispotr, by(clid)
		ttest hs, by(clid)
		ttest somecol, by(clid)
		ttest col, by(clid)
		ttest adv, by(clid)
		ttest prof, by(clid)
		ttest mngr, by(clid)
		ttest clrk, by(clid)
		ttest serv, by(clid)
		ttest wrk, by(clid)
		ttest unemp, by(clid)
		ttest mig, by(clid)
		ttest migprt, by(clid)

		
		* Finer professional sub-categories.
		ttest md, by(clid)
		ttest tchr, by(clid)
		ttest othprof, by(clid)
	
		* Campus indicators.
		ttest irvine, by(clid)
		ttest la, by(clid)
		ttest davis, by(clid)
		ttest sf, by(clid)
		
		log cl


* Analysis sample with the cluster id attached.
save "calif_data_w_clusters.dta", replace

*********************************
* Analysis - FULL (No migprt)   *
*********************************

set logtype text

* Read the cluster assignments (two numeric columns: id, clid) and store them
* as a .dta keyed on id.
* NOTE(review): cluster_full.txt is not created in this do-file; presumably it
* comes from clustering the exported calif_data_new_full file -- confirm.
clear
infile id clid using "cluster_full.txt"
sort id
save "cluster_id_full.dta", replace

* Attach the cluster id to the full sample (rows with missing migprt included).
use calif_data_new_full, clear
sort id
merge 1:1 id using cluster_id_full
drop _merge

	
		* Log overall and by-cluster summaries, then two-group t tests of each
		* covariate by clid. migprt is left out of this run. Commands are kept
		* one per line because the two-group ttest table does not print the
		* variable name; the echoed command identifies each table in the log.
		log using "cluster_comparison_new_full.txt", replace
		sum age sex married separated white black hispotr hs somecol col adv prof mngr clrk serv wrk unemp mig, sep(25)
		bys clid: sum age sex married separated white black hispotr hs somecol col adv prof mngr clrk serv wrk unemp mig, sep(25)
		ttest age, by(clid)
		ttest sex, by(clid)
		ttest married, by(clid)
		ttest separated, by(clid)
		ttest white, by(clid)
		ttest black, by(clid)
		ttest hispotr, by(clid)
		ttest hs, by(clid)
		ttest somecol, by(clid)
		ttest col, by(clid)
		ttest adv, by(clid)
		ttest prof, by(clid)
		ttest mngr, by(clid)
		ttest clrk, by(clid)
		ttest serv, by(clid)
		ttest wrk, by(clid)
		ttest unemp, by(clid)
		ttest mig, by(clid)
	
		
		* Finer professional sub-categories.
		ttest md, by(clid)
		ttest tchr, by(clid)
		ttest othprof, by(clid)
	
		* Campus indicators.
		ttest irvine, by(clid)
		ttest la, by(clid)
		ttest davis, by(clid)
		ttest sf, by(clid)
		
		log cl


* Full sample with the cluster id attached.
save "calif_data_w_clusters_full.dta", replace


*********************************
*Analysis - TEMP (alt cat of occ)*
*********************************

set logtype text

* Read the cluster assignments (two numeric columns: id, clid) and store them
* as a .dta keyed on id.
* NOTE(review): cluster_temp.txt is not created in this do-file; presumably it
* comes from clustering the exported calif_data_new_temp file -- confirm.
clear
infile id clid using "cluster_temp.txt"
sort id
save "cluster_id_temp.dta", replace

* Attach the cluster id to the sample that carries the pooled profmngr category.
use calif_data_new_temp, clear
sort id
merge 1:1 id using cluster_id_temp
drop _merge

	
		* Log overall and by-cluster summaries, then two-group t tests by clid.
		* profmngr is added to the summary tables: it is the occupation variable
		* this run tests, and it was otherwise never summarized. The separate
		* prof, mngr and clrk columns are kept so the existing output is unchanged.
		log using "cluster_comparison_new_temp.txt", replace
		sum age sex married separated white black hispotr hs somecol col adv profmngr prof mngr clrk serv wrk unemp mig migprt, sep(25)
		bys clid: sum age sex married separated white black hispotr hs somecol col adv profmngr prof mngr clrk serv wrk unemp mig migprt, sep(25)
		* Commands are kept one per line because the two-group ttest table does
		* not print the variable name; the echoed command identifies each table.
		ttest age, by(clid)
		ttest sex, by(clid)
		ttest married, by(clid)
		ttest separated, by(clid)
		ttest white, by(clid)
		ttest black, by(clid)
		ttest hispotr, by(clid)
		ttest hs, by(clid)
		ttest somecol, by(clid)
		ttest col, by(clid)
		ttest adv, by(clid)
		ttest profmngr, by(clid)
		ttest serv, by(clid)
		ttest wrk, by(clid)
		ttest unemp, by(clid)
		ttest mig, by(clid)
		ttest migprt, by(clid)

		
		* Finer professional sub-categories.
		ttest md, by(clid)
		ttest tchr, by(clid)
		ttest othprof, by(clid)
	
		* Campus indicators.
		ttest irvine, by(clid)
		ttest la, by(clid)
		ttest davis, by(clid)
		ttest sf, by(clid)
		
		log cl


* Sample with the pooled occupation category and the cluster id attached.
save "calif_data_w_clusters_temp.dta", replace


*********************************
* Analysis - ORIG *
*********************************

set logtype text

* Read the cluster assignments (two numeric columns: id, clid) and store them
* as a .dta keyed on id.
* NOTE(review): cluster_orig.txt is not created in this do-file; presumably it
* comes from clustering the exported calif_data_new_orig file -- confirm.
clear
infile id clid using "cluster_orig.txt"
sort id
save "cluster_id_orig.dta", replace

* Attach the cluster id to the "orig" sample.
* NOTE(review): calif_data_new_orig.dta is saved before rows with a blank
* occ_cat1 are dropped, so check how the occupation indicators are coded for
* those rows before interpreting the occupation results below.
use calif_data_new_orig, clear
sort id
merge 1:1 id using cluster_id_orig
drop _merge

	
		* Log overall and by-cluster summaries, then two-group t tests of each
		* covariate by clid. Commands are kept one per line because the two-group
		* ttest table does not print the variable name; the echoed command
		* identifies each table in the log.
		log using "cluster_comparison_new_orig.txt", replace
		sum age sex married separated white black hispotr hs somecol col adv prof mngr clrk serv wrk unemp mig migprt, sep(25)
		bys clid: sum age sex married separated white black hispotr hs somecol col adv prof mngr clrk serv wrk unemp mig migprt, sep(25)
		ttest age, by(clid)
		ttest sex, by(clid)
		ttest married, by(clid)
		ttest separated, by(clid)
		ttest white, by(clid)
		ttest black, by(clid)
		ttest hispotr, by(clid)
		ttest hs, by(clid)
		ttest somecol, by(clid)
		ttest col, by(clid)
		ttest adv, by(clid)
		ttest prof, by(clid)
		ttest mngr, by(clid)
		ttest clrk, by(clid)
		ttest serv, by(clid)
		ttest wrk, by(clid)
		ttest unemp, by(clid)
		ttest mig, by(clid)
		ttest migprt, by(clid)

		
		* Finer professional sub-categories.
		ttest md, by(clid)
		ttest tchr, by(clid)
		ttest othprof, by(clid)
	
		* Campus indicators.
		ttest irvine, by(clid)
		ttest la, by(clid)
		ttest davis, by(clid)
		ttest sf, by(clid)
		
		log cl


* "Orig" sample with the cluster id attached.
save "calif_data_w_clusters_orig.dta", replace