********************************************************************************************************
******************This code was written by Sara Hart October 13, 2015***********************************
********************************************************************************************************; 

/*this is bringing in the raw data that I pulled directly from Qualtrics*/
/* Read the SPSS (.sav) export into WORK.MYDATA, replacing any existing copy. */
proc import
    out      = mydata
    datafile = "C:\Sara\Math in the Home\second go at data collection\Math_in_the_Home_take_2_FINAL10615.sav"
    dbms     = sav
    replace;
run;

/* List the variables and attributes of the imported data set. */
proc contents data = mydata; run;


/* One-way frequency tables for V10 and Q10, the two variables the filter below relies on. */
proc freq data = mydata;
    tables V10 Q10;
run;
/*V10 is the variable indicating they have completed the survey and Q10 is actually 
the education variable. I noticed that one of the participants answered the lowest value for each
question, and I decided to drop them as unusable data. I found I could identify them by the ed variable, 
as this person was the only one who indicated an education less than grade 6 (highly unlikely).*/


/* Here is where I drop the unfinished survey data and the one participant who answered the lowest value for everything.
As a note, it is common for MTurk participants to start the survey to see if they want to do it, and then quickly move on. */
/* MATH: keep only rows with V10 = 1 whose Q10 is not 1. */
data math;
    set mydata (where = (V10 = 1 and Q10 ne 1));
run;

/* MATH2: same rows as MATH plus id, the row's sequence number within MATH
   (1 for the first retained row, 2 for the second, ...). */
data math2;
    set math;
    id = _N_;
run;


  *this is bringing in the wscore data.  We have the participants go to the panamath website and do the task and copy back the 
  url of their results.  We then go to each url and harvest the W scores.  Here is the result of that process. Qualtrics also
  creates date data that can have the format messed up, so my RA also fixes all the dates to be in the right format and 
  calculates age. ;


****************;
/* Read the SPSS (.sav) file of w scores and dates into WORK.WSCORES1,
   replacing any existing copy. */
proc import
    out      = wscores1
    datafile = "C:\Sara\Math in the Home\second go at data collection\wscores dob final.sav"
    dbms     = sav
    replace;
run;

/* List the variables and attributes of the imported data set. */
proc contents data = wscores1;
run;


****bringing the wscore and age data into other data;
/* Both inputs must be ordered by id before the match-merge. */
proc sort data = math2;
    by id;
run;

proc sort data = wscores1;
    by id;
run;

/* Match-merge on id. No IN= filtering is applied, so an id that appears in
   only one of the two inputs still produces an output row.
   NOTE(review): assumes id in WSCORES1 uses the same numbering as the _N_-based
   id in MATH2 -- confirm against how the w-score file was built. */
data finalcombined;
    merge math2
          wscores1;
    by id;
run;


*Here I am creating the final "raw" dataset to be posted online from this paper.  The only thing I am doing to it here
  is dropping the few identifying location variables, MTurk ID, IP address, random junk variables etc ;
*also, before publication was asked by PLOS ONE to remove all birthdates;
/* FINALRAW101315: FINALCOMBINED minus the variables named in the DROP= list,
   then written out as an Excel workbook. */
data finalraw101315 (
    drop =
        /* LOCAT* and Location* variables */
        LOCAT0-LOCAT9
        LOCATA LOCATb LOCATc LOCATd LOCATe LOCATf LOCATg LOCATh LOCATi LOCATj
        LOCATk LOCATl LOCATm
        LocationLatitude LocationLongitude LocationAccuracy
        /* V* variables */
        V1-V10
        V3_0-V3_6 V4_0-V4_6 V5_0-V5_6 V6_0-V6_6 V7_0-V7_6
        /* numbered Q* fragments */
        Q13_10-Q13_16
        Q110_0-Q110_6
        Q44_F0-Q44_F6
        Q44_F8 Q44_F9
        Q44_FA Q44_FB Q44_FC Q44_FD Q44_FE
        Q44_FG Q44_FH Q44_FI Q44_FJ Q44_FK Q44_FL Q44_FM
        Q44_FO Q44_FP Q44_FQ Q44_FR Q44_FS Q44_FT Q44_FU
        Q44_FW Q44_FX Q44_FY Q44_FZ
        Q44_F10-Q44_F12
        Q116_0-Q116_7
        Q115_0-Q115_6
        Q98_0-Q98_6
        Q126_0-Q126_6
        Q4_TE0-Q4_TE6
        Q5_TE0-Q5_TE6
        Q8_TE0-Q8_TE6
        Q11_T1-Q11_T6
        Q105_0-Q105_6
        Q106_0-Q106_6
        /* remaining single variables */
        Q13_1b
        dob
    );
    set finalcombined;
run;

/* Write the trimmed data set to an .xlsx file, overwriting an existing file. */
proc export
    data    = finalraw101315
    outfile = "C:\Sara\Math in the Home\second go at data collection\finalraw12616.xlsx"
    dbms    = xlsx
    replace;
run;


***checking out age range.  Despite multiple checks for participants having a child within the 3-8 age range, some
did not report children this age;
/* Distribution of age_yr before trimming. */
proc freq data = finalraw101315;
    tables age_yr;
run;

/* Some values are out of range: keep only rows with 3 <= age_yr < 9. */
data math1a;
    set finalraw101315 (where = (age_yr >= 3 and age_yr < 9));
run;


**recoding the bulk of the questions into variable names I prefer to use;
/* MATH2A: copy each Qualtrics-named item into an analysis-named variable and
   drop the originals.  The two arrays are positional: element k of qualtrics_item
   is copied into element k of analysis_item, so the order of both lists matters.
   The loop index J is not dropped here and is therefore written to MATH2A. */
data math2a (
    drop =
        Q101 Q116 Q117 Q96 Q102 Q103 Q13_1a
        Q12 Q13_1c
        Q13_1d_1-Q13_1d_6
        Q127_1-Q127_20
        Q14_1 Q15_1 Q16_1 Q17_1 Q18_1 Q19_1 Q20_1 Q21_1
        Q120_1 Q122_1
        Q24_1-Q24_6
        Q22_a Q22_b Q23_a Q23_b Q110
        Q123
        Q41_1-Q41_6
        Q42_1-Q42_5
        Q124_1-Q124_13
        Q125_1-Q125_3
        Q4 Q5 Q6 Q104 Q108 Q7 Q8 Q9 Q10 Q11
        Q35_a_1 Q35_b_1 Q37_a_1 Q37_b_1 Q36_a_1 Q36_b_1
        Q38_a_1 Q38_b_1 Q39_a_1 Q39_b_1 Q40_a_1 Q40_b_1
        Q111 Q127_1_0 Q112
        Q31_a_1 Q31_b_1 Q32_a_1 Q32_b_1 Q33_a_1 Q33_b_1
        Q44_f_1 Q44_f_1_TEXT Q44_f_2 Q44_f_2_TEXT Q44_f_3 Q44_f_3_TEXT
        Q44_f_4 Q44_f_4_TEXT Q44_f_5 Q44_f_5_TEXT
        Q115 Q98 Q126
        Obs Q110_TEXT Q4_TEXT Q5_TEXT Q8_TEXT Q11_TEXT
        Q105 Q106 Q107
    );
    set math1a;

    /* Source items, in copy order (108 elements). */
    array qualtrics_item {*}
        Q12 Q13_1c
        Q13_1d_1-Q13_1d_6
        Q127_1-Q127_20
        Q14_1 Q15_1 Q16_1 Q17_1 Q18_1 Q19_1 Q20_1 Q21_1
        Q120_1 Q122_1
        Q24_1-Q24_6
        Q22_a Q22_b Q23_a Q23_b Q110
        Q123
        Q41_1-Q41_6
        Q42_1-Q42_5
        Q124_1-Q124_13
        Q125_1-Q125_3
        Q4 Q5 Q6 Q104 Q108 Q7 Q8 Q9 Q10 Q11
        Q35_a_1 Q35_b_1 Q37_a_1 Q37_b_1 Q36_a_1 Q36_b_1
        Q38_a_1 Q38_b_1 Q39_a_1 Q39_b_1 Q40_a_1 Q40_b_1
        Q111 Q127_1_0 Q112
        Q31_a_1 Q31_b_1 Q32_a_1 Q32_b_1 Q33_a_1 Q33_b_1;

    /* Target variables, position-for-position with the list above (108 elements). */
    array analysis_item {*}
        numberofchildren childgender
        compare_child_math compare_child_science compare_child_reading
        compare_child_writing compare_child_spatial compare_child_numbers
        ncldq_1-ncldq_20
        mathimportance1-mathimportance8
        readtoeveryday matheveryday
        compare_parent_computers compare_parent_math compare_parent_science
        compare_parent_writing compare_parent_reading compare_parent_spatial
        highschoolGPAblocks highschoolGPAexact collegeGPAblocks collegeGPAexact collegeGPAmax
        SATorACT
        anx1-anx6
        famrole1-famrole5
        benchmarks1-benchmarks13
        booksinhome mathbooksinhome adultbooksinhome
        USborn childUSborn homelocation gender Age ethnicity race income education occupation_coded
        ACT_composite ACT_composite_confidence
        ACT_math ACT_math_confidence
        ACT_english ACT_english_confidence
        ACT_reading ACT_reading_confidence
        ACT_science ACT_science_confidence
        ACT_writing ACT_writing_confidence
        SAT SAT_confidence highestSAT
        SAT_reading SAT_reading_confidence
        SAT_math SAT_math_confidence
        SAT_writing SAT_writing_confidence;

    /* Straight copy, one element at a time. */
    do J = 1 to dim(qualtrics_item);
        analysis_item{J} = qualtrics_item{J};
    end;
run;


/* Descriptive checks on MATH2A (the most recently created data set, named
   explicitly here instead of relying on the implicit _LAST_ data set). */

/* Distribution of the two math-score confidence ratings. */
proc freq data = math2a;
    tables ACT_math_confidence SAT_math_confidence;
run;

/* ACT math summary statistics, restricted to confidence above 50. */
proc means data = math2a;
    where ACT_math_confidence > 50;
    var ACT_math;
run;

/* SAT math summary statistics, restricted to confidence above 50. */
proc means data = math2a;
    where SAT_math_confidence > 50;
    var SAT_math;
run;

/* Print the rows where Q44_f_9 is 0. */
proc print data = math2a;
    where Q44_f_9 = 0;
run;

*recoding the home variables;
/* MATH3: copy the 52 Q44_* home items into mathathomeuse1-mathathomeuse52 and
   drop the originals.  The copy is positional, so mathathomeuseK holds the K-th
   variable of the home_item list below (the list is NOT in simple numeric order).
   The loop index J is not dropped here and is therefore written to MATH3. */
data math3 (
    drop =
        Q44_a_1 Q44_a_2 Q44_a_3 Q44_a_4 Q44_a_5 Q44_a_6
        Q44_a_7 Q44_a_8 Q44_a_9 Q44_a_10 Q44_a_11
        Q44_b_11 Q44_b_1 Q44_b_2 Q44_b_3 Q44_b_4 Q44_b_5
        Q44_b_6 Q44_b_7 Q44_b_8 Q44_b_9 Q44_b_10 Q44_b_22
        Q44_c_2 Q44_c_3 Q44_c_4 Q44_c_5 Q44_c_6 Q44_c_7
        Q44_c_8 Q44_c_9 Q44_c_10 Q44_c_11 Q44_c_12 Q44_c_13
        Q44_d_1 Q44_d_2 Q44_d_3 Q44_d_4 Q44_d_5 Q44_d_6
        Q44_d_8 Q44_d_10 Q44_d_11 Q44_d_13 Q44_d_14 Q44_d_12
        Q44_f_11 Q44_f_10 Q44_f_9 Q44_f_8 Q44_f_7
    );
    set math2a;

    /* Source items, in copy order (52 elements). */
    array home_item {*}
        Q44_a_1 Q44_a_2 Q44_a_3 Q44_a_4 Q44_a_5 Q44_a_6
        Q44_a_7 Q44_a_8 Q44_a_9 Q44_a_10 Q44_a_11
        Q44_b_11 Q44_b_1 Q44_b_2 Q44_b_3 Q44_b_4 Q44_b_5
        Q44_b_6 Q44_b_7 Q44_b_8 Q44_b_9 Q44_b_10 Q44_b_22
        Q44_c_2 Q44_c_3 Q44_c_4 Q44_c_5 Q44_c_6 Q44_c_7
        Q44_c_8 Q44_c_9 Q44_c_10 Q44_c_11 Q44_c_12 Q44_c_13
        Q44_d_1 Q44_d_2 Q44_d_3 Q44_d_4 Q44_d_5 Q44_d_6
        Q44_d_8 Q44_d_10 Q44_d_11 Q44_d_13 Q44_d_14 Q44_d_12
        Q44_f_11 Q44_f_10 Q44_f_9 Q44_f_8 Q44_f_7;

    /* Target variables (52 elements). */
    array home_use {*} mathathomeuse1-mathathomeuse52;

    do J = 1 to dim(home_item);
        home_use{J} = home_item{J};
    end;
run;

/* Distribution of every renamed home item. */
proc freq data = math3;
    tables mathathomeuse1-mathathomeuse52;
run;

**there were a few skipped items that were mistakenly coded as "0" rather than ".", so fixing here;
/* MATH3FIX: in items 46 and 48-52 a value of 0 is turned into missing.
   Item 47 is deliberately not in the list, matching the original code. */
data math3fix;
    set math3;

    array zero_is_skip {*} mathathomeuse46 mathathomeuse48-mathathomeuse52;

    /* _idx is a scratch index only; it is dropped so the output keeps the
       same variables as MATH3. */
    do _idx = 1 to dim(zero_is_skip);
        if zero_is_skip{_idx} = 0 then zero_is_skip{_idx} = .;
    end;
    drop _idx;
run;

/* Re-check the distributions after the fix. */
proc freq data = math3fix;
    tables mathathomeuse1-mathathomeuse52;
run;


**recoding the Colorado Learning Disabilities Questionnaire and creating subscores;
/* Tabulate the CLDQ items as they exist at this point in the program.
   The data set in use is MATH3FIX, where the items are still named
   ncldq_1-ncldq_20: the recoded cldq_1-cldq_20 are only created by the DATA step
   that follows, and Q127_1-Q127_20 were dropped when MATH2A was built.  Asking
   for either of those lists here stops PROC FREQ with a "variable not found"
   error, so the TABLES statement names the variables that actually exist. */
proc freq data = math3fix;
    tables ncldq_1-ncldq_20;
run;

/* MATH3A: recode each ncldq item into the matching cldq item and drop the
   originals together with the loop index J.
   Stored code -> recoded value: 6 -> 1, 8 -> 2, 9 -> 3, 5 -> 4, 4 -> 5.
   Any other stored value (including missing) leaves the cldq item missing. */
data math3a (drop = J ncldq_1-ncldq_20);
    set math3fix;

    array stored_code {*} ncldq_1-ncldq_20;
    array cldq_item   {*} cldq_1-cldq_20;

    do J = 1 to dim(stored_code);
        select (stored_code{J});
            when (6) cldq_item{J} = 1;
            when (8) cldq_item{J} = 2;
            when (9) cldq_item{J} = 3;
            when (5) cldq_item{J} = 4;
            when (4) cldq_item{J} = 5;
            otherwise;   /* leave cldq_item{J} missing */
        end;
    end;
run;
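/*here a score is left missing (the case itself is kept) when too many items are missing, which can happen with this scale:
the total score needs at least 18 of the 20 items (no more than 10% missing) and each subscale score needs all of its items*/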
/* MATH4: count the answered items per scale.
   Despite the "missing" prefix, each variable holds N(), i.e. the number of
   NON-missing items, not the number of missing ones. */
data math4;
    set math3a;
    missingcldqt  = n(of cldq_1-cldq_20);    /* all 20 items    */
    missingcldqr  = n(of cldq_1-cldq_6);     /* items 1-6       */
    missingcldqsc = n(of cldq_7-cldq_10);    /* items 7-10      */
    missingcldqsa = n(of cldq_11-cldq_13);   /* items 11-13     */
    missingcldqsp = n(of cldq_14-cldq_17);   /* items 14-17     */
    missingcldqmp = n(of cldq_18-cldq_20);   /* items 18-20     */
run;

/* Distribution of the answered-item counts. */
proc freq data = math4;
    tables missingcldqt missingcldqr missingcldqsc
           missingcldqsa missingcldqsp missingcldqmp;
run;

/* MATH5: sum scores.  The total is computed when at least 18 of 20 items are
   answered; each subscale only when every one of its items is answered.
   Otherwise the score stays missing.  The helper counts are dropped.
   NOTE(review): with 18 or 19 answered items the total is the plain sum of the
   answered items (SUM() skips missing values), i.e. it is not prorated --
   confirm this is intended. */
data math5 (drop = missingcldqt missingcldqr missingcldqsc
                   missingcldqsa missingcldqsp missingcldqmp);
    set math4;

    if missingcldqt  >= 18 then P_cldq_total   = sum(of cldq_1-cldq_20);
    if missingcldqr  =  6  then P_cldq_read    = sum(of cldq_1-cldq_6);
    if missingcldqsc =  4  then P_cldq_social  = sum(of cldq_7-cldq_10);
    if missingcldqsa =  3  then P_cldq_anxiety = sum(of cldq_11-cldq_13);
    if missingcldqsp =  4  then P_cldq_spatial = sum(of cldq_14-cldq_17);
    if missingcldqmp =  3  then P_cldq_math    = sum(of cldq_18-cldq_20);

    label
        P_cldq_total   = 'CLDQ Total Problems sum score'
        P_cldq_read    = 'CLDQ Reading Problems subscale sum score'
        P_cldq_social  = 'CLDQ Social Cognition Problems subscale sum score'
        P_cldq_anxiety = 'CLDQ Social Anxiety Problems subscale sum score'
        P_cldq_spatial = 'CLDQ Spatial Problems subscale sum score'
        P_cldq_math    = 'CLDQ Math Problems subscale sum score';
run;


/* Looking at the range of w scores for out-of-range values, which happen because of guessing.
If a value is greater than 4 SDs from the mean, we are removing it, based
on Halberda et al., 2012. First have to get rid of the super high value; it is messing
with the z-score. */
/* Raw distribution of wscore. */
proc freq data = math5;
    tables wscore;
run;

/* MATH8: blank out any wscore of 10 or more before standardising. */
data math8;
    set math5;
    if wscore >= 10.00 then wscore = .;
run;

/* MATH8Z: wscore replaced by its z-score (mean 0, SD 1). */
proc standard data = math8 out = math8z mean = 0 std = 1;
    var wscore;
run;

proc freq data = math8z;
    tables wscore;
run;

/* MATH8ZZ: flag z-scores above 4.
   NOTE(review): only the upper tail is flagged (z > 4); values below -4 are
   not -- confirm that is intended. */
data math8zz;
    set math8z;
    if wscore > 4.00 then flag = 1;
run;

/* List the ids of the flagged rows. */
proc print data = math8zz;
    where flag = 1;
    var id;
run;

/* MATH8NEW: starting again from the unstandardised MATH8, set wscore to missing
   for the listed ids.
   NOTE(review): the id list is hard-coded (presumably transcribed from the
   listing above) and will not follow the data if the input or row order
   changes; id 0 is in the list although the _N_-based ids start at 1 --
   verify both. */
data math8new;
    set math8;
    if id in (125, 258, 0, 297, 317, 330) then wscore = .;
run;

*one person answered gender = 3 (prefer not to answer) so recoding to missing;
*also switching the coding of parent gender to male = 1, female = 2, to match kid gender;
*also, recoding income so that 13 & 14, which are "don't know" and "don't wish to say"
are missing;

/* MATH9: recode parent gender and income.
     newgender : gender 2 -> 1, gender 1 -> 2, gender 3 (or missing) -> missing
     income    : codes 13 and 14 -> missing
   The original gender variable is left untouched. */
data math9;
    set math8new;

    if gender = 3 then newgender = .;
    else if gender = 2 then newgender = 1;
    else if gender = 1 then newgender = 2;

    if income in (13, 14) then income = .;
run;

/* Check the recode.  This tabulation has to come after the DATA step and has to
   name MATH9: newgender does not exist in any earlier data set, so running it
   first (against the previously created data set) fails with a
   "variable not found" error. */
proc freq data = math9;
    tables newgender;
run;


***creating a .csv file for posting online;
/* Route the PROC PRINT listing of MATH9 to a CSV file through the ODS CSV
   destination, then close the destination so the file is finalised. */
ods csv file="C:\Sara\Math in the Home\Hart et al., HNE paper code and data\FinalCleanedData.csv";

proc print data = math9;
run;

ods csv close;