/******************************************************************************
 * This code was written by Sara Hart, October 13, 2015.
 *
 * Purpose: import the raw Qualtrics survey export, merge in the Panamath
 * w scores and ages, strip identifying variables to build the public "raw"
 * file, rename and recode the survey items, build the CLDQ sum scores, screen
 * w score outliers, and write the final cleaned data set to a .csv file.
 *
 * Every PROC below names its input with data= so that no step depends on
 * whichever data set happened to be created last.
 ******************************************************************************/


/*----------------------------------------------------------------------------
  1. Bring in the raw data pulled directly from Qualtrics.
----------------------------------------------------------------------------*/
proc import
    datafile="C:\Sara\Math in the Home\second go at data collection\Math_in_the_Home_take_2_FINAL10615.sav"
    out=mydata
    dbms=sav
    replace;
run;

proc contents data=mydata;
run;

proc freq data=mydata;
    tables V10 Q10;
run;

/* V10 is the variable indicating they have completed the survey and Q10 is
   actually the education variable. I noticed that one of the participants
   answered the lowest value for each question, and I decided to drop them as
   unusable data. I found I could identify them by the ed variable, as this
   person was the only one who indicated an education less than grade 6
   (highly unlikely). */

/* Here is where I drop the unfinished survey data and the one participant who
   answered the lowest value for everything. As a note, it is common for MTurk
   participants to start the survey to see if they want to do it, and quickly
   move on. */
data math;
    set mydata;
    where V10 = 1 and Q10 ne 1;
run;

/* Creating an id variable (the row number within math, so it starts at 1). */
data math2;
    set math;
    id = _N_;
run;


/*----------------------------------------------------------------------------
  2. Bring in the w score data.
     We have the participants go to the panamath website and do the task and
     copy back the url of their results. We then go to each url and harvest
     the W scores. Here is the result of that process. Qualtrics also creates
     date data that can have the format messed up, so my RA also fixes all the
     dates to be in the right format and calculates age.
----------------------------------------------------------------------------*/
proc import
    datafile="C:\Sara\Math in the Home\second go at data collection\wscores dob final.sav"
    out=wscores1
    dbms=sav
    replace;
run;

proc contents data=wscores1;
run;

/* Bringing the w score and age data into the other data, matched on id. */
proc sort data=math2;
    by id;
run;

proc sort data=wscores1;
    by id;
run;

data finalcombined;
    merge math2 wscores1;
    by id;
run;


/*----------------------------------------------------------------------------
  3. Final "raw" data set to be posted online from this paper.
     The only thing done here is dropping the few identifying location
     variables, MTurk ID, IP address, random junk variables etc. Also, before
     publication we were asked by PLOS ONE to remove all birthdates (dob).
----------------------------------------------------------------------------*/
data finalraw101315 (drop=
        LOCAT0-LOCAT9 LOCATA LOCATb LOCATc LOCATd LOCATe LOCATf LOCATg LOCATh
        LOCATi LOCATj LOCATk LOCATl LOCATm
        LocationLatitude LocationLongitude LocationAccuracy
        V1-V10 V3_0-V3_6 V4_0-V4_6 V5_0-V5_6 V6_0-V6_6 V7_0-V7_6
        Q13_10-Q13_16 Q110_0-Q110_6
        Q44_F0-Q44_F6 Q44_F8 Q44_F9 Q44_FA Q44_FB Q44_FC Q44_FD Q44_FE Q44_FG
        Q44_FH Q44_FI Q44_FJ Q44_FK Q44_FL Q44_FM Q44_FO Q44_FP Q44_FQ Q44_FR
        Q44_FS Q44_FT Q44_FU Q44_FW Q44_FX Q44_FY Q44_FZ Q44_F10-Q44_F12
        Q116_0-Q116_7 Q115_0-Q115_6 Q98_0-Q98_6 Q126_0-Q126_6
        Q4_TE0-Q4_TE6 Q5_TE0-Q5_TE6 Q8_TE0-Q8_TE6 Q11_T1-Q11_T6
        Q105_0-Q105_6 Q106_0-Q106_6
        Q13_1b dob);
    set finalcombined;
run;

proc export
    data=finalraw101315
    dbms=xlsx
    outfile="C:\Sara\Math in the Home\second go at data collection\finalraw12616.xlsx"
    replace;
run;


/*----------------------------------------------------------------------------
  4. Age range. Despite multiple checks for participants having a kid within
     the 3-8 age range, some did not report kids this age. Those are out of
     the range, so trimming them (keep 3 <= age_yr < 9).
----------------------------------------------------------------------------*/
proc freq data=finalraw101315;
    tables age_yr;
run;

data math1a;
    set finalraw101315;
    where age_yr ge 3 and age_yr < 9;
run;


/*----------------------------------------------------------------------------
  5. Recoding the bulk of the questions into variable names I prefer to use.
     The Qualtrics names are listed once in a macro variable so the drop=
     list and the array cannot drift apart. The two lists are positional:
     element k of qualtrics_items is copied to element k of the new names
     (108 elements each).
----------------------------------------------------------------------------*/
%let qualtrics_items =
    Q12 Q13_1c Q13_1d_1-Q13_1d_6 Q127_1-Q127_20
    Q14_1 Q15_1 Q16_1 Q17_1 Q18_1 Q19_1 Q20_1 Q21_1
    Q120_1 Q122_1 Q24_1-Q24_6
    Q22_a Q22_b Q23_a Q23_b Q110 Q123
    Q41_1-Q41_6 Q42_1-Q42_5 Q124_1-Q124_13 Q125_1-Q125_3
    Q4 Q5 Q6 Q104 Q108 Q7 Q8 Q9 Q10 Q11
    Q35_a_1 Q35_b_1 Q37_a_1 Q37_b_1 Q36_a_1 Q36_b_1
    Q38_a_1 Q38_b_1 Q39_a_1 Q39_b_1 Q40_a_1 Q40_b_1
    Q111 Q127_1_0 Q112
    Q31_a_1 Q31_b_1 Q32_a_1 Q32_b_1 Q33_a_1 Q33_b_1;

data math2a (drop=
        Q101 Q116 Q117 Q96 Q102 Q103 Q13_1a
        &qualtrics_items
        Q44_f_1 Q44_f_1_TEXT Q44_f_2 Q44_f_2_TEXT Q44_f_3 Q44_f_3_TEXT
        Q44_f_4 Q44_f_4_TEXT Q44_f_5 Q44_f_5_TEXT
        Q115 Q98 Q126 Obs
        Q110_TEXT Q4_TEXT Q5_TEXT Q8_TEXT Q11_TEXT
        Q105 Q106 Q107);
    set math1a;

    array survey_old {*} &qualtrics_items;
    array survey_new {*}
        numberofchildren childgender
        compare_child_math compare_child_science compare_child_reading
        compare_child_writing compare_child_spatial compare_child_numbers
        ncldq_1-ncldq_20
        mathimportance1-mathimportance8
        readtoeveryday matheveryday
        compare_parent_computers compare_parent_math compare_parent_science
        compare_parent_writing compare_parent_reading compare_parent_spatial
        highschoolGPAblocks highschoolGPAexact collegeGPAblocks collegeGPAexact
        collegeGPAmax SATorACT
        anx1-anx6 famrole1-famrole5 benchmarks1-benchmarks13
        booksinhome mathbooksinhome adultbooksinhome
        USborn childUSborn homelocation gender Age ethnicity race income
        education occupation_coded
        ACT_composite ACT_composite_confidence
        ACT_math ACT_math_confidence
        ACT_english ACT_english_confidence
        ACT_reading ACT_reading_confidence
        ACT_science ACT_science_confidence
        ACT_writing ACT_writing_confidence
        SAT SAT_confidence highestSAT
        SAT_reading SAT_reading_confidence
        SAT_math SAT_math_confidence
        SAT_writing SAT_writing_confidence;

    /* J is left on the data set here; it is dropped later in math3a. */
    do J = 1 to dim(survey_old);
        survey_new{J} = survey_old{J};
    end;
run;

/* Quick look at the self-reported test scores and how confident people were. */
proc freq data=math2a;
    tables ACT_math_confidence SAT_math_confidence;
run;

proc means data=math2a;
    var ACT_math;
    where ACT_math_confidence > 50;
run;

proc means data=math2a;
    var SAT_math;
    where SAT_math_confidence > 50;
run;

proc print data=math2a;
    where Q44_f_9 = 0;
run;


/*----------------------------------------------------------------------------
  6. Recoding the home variables into mathathomeuse1-mathathomeuse52.
     Again positional: the k-th Q44 item becomes mathathomeuse{k}.
----------------------------------------------------------------------------*/
%let home_items =
    Q44_a_1 Q44_a_2 Q44_a_3 Q44_a_4 Q44_a_5 Q44_a_6 Q44_a_7 Q44_a_8 Q44_a_9
    Q44_a_10 Q44_a_11
    Q44_b_11 Q44_b_1 Q44_b_2 Q44_b_3 Q44_b_4 Q44_b_5 Q44_b_6 Q44_b_7 Q44_b_8
    Q44_b_9 Q44_b_10 Q44_b_22
    Q44_c_2 Q44_c_3 Q44_c_4 Q44_c_5 Q44_c_6 Q44_c_7 Q44_c_8 Q44_c_9 Q44_c_10
    Q44_c_11 Q44_c_12 Q44_c_13
    Q44_d_1 Q44_d_2 Q44_d_3 Q44_d_4 Q44_d_5 Q44_d_6 Q44_d_8 Q44_d_10 Q44_d_11
    Q44_d_13 Q44_d_14 Q44_d_12
    Q44_f_11 Q44_f_10 Q44_f_9 Q44_f_8 Q44_f_7;

data math3 (drop=&home_items);
    set math2a;

    array home_old {*} &home_items;
    array home_new {*} mathathomeuse1-mathathomeuse52;

    do J = 1 to dim(home_old);
        home_new{J} = home_old{J};
    end;
run;

proc freq data=math3;
    tables mathathomeuse1-mathathomeuse52;
run;

/* There were a few skipped items that were mistakenly coded as "0" rather
   than ".", so fixing here (items 46 and 48-52 only). */
data math3fix;
    set math3;

    array zero_means_skipped {*} mathathomeuse46 mathathomeuse48-mathathomeuse52;

    do J = 1 to dim(zero_means_skipped);
        if zero_means_skipped{J} = 0 then zero_means_skipped{J} = .;
    end;
run;

proc freq data=math3fix;
    tables mathathomeuse1-mathathomeuse52;
run;


/*----------------------------------------------------------------------------
  7. Recoding the Colorado Learning Disabilities Questionnaire (CLDQ) and
     creating subscores.
----------------------------------------------------------------------------*/

/* Raw response codes before recoding. At this point the items are named
   ncldq_1-ncldq_20 (the Q127_* names were dropped in math2a and cldq_* is not
   created until math3a), so those are the variables tabulated. */
proc freq data=math3fix;
    tables ncldq_1-ncldq_20;
run;

/* Map the stored response codes onto 1-5: 6->1, 8->2, 9->3, 5->4, 4->5.
   Any other stored value (including missing) leaves cldq_k missing. */
data math3a (drop=J ncldq_1-ncldq_20);
    set math3fix;

    array cldq_stored {*} ncldq_1-ncldq_20;
    array cldq_scored {*} cldq_1-cldq_20;

    do J = 1 to dim(cldq_stored);
        select (cldq_stored{J});
            when (6) cldq_scored{J} = 1;
            when (8) cldq_scored{J} = 2;
            when (9) cldq_scored{J} = 3;
            when (5) cldq_scored{J} = 4;
            when (4) cldq_scored{J} = 5;
            otherwise;
        end;
    end;
run;

/* Here I drop the case if there is greater than 10% missing, which can happen
   with this scale. Despite their names, the missingcldq* variables hold the
   number of NON-missing items (the N function) in each scale. */
data math4;
    set math3a;
    missingcldqt  = n(of cldq_1-cldq_20);
    missingcldqr  = n(of cldq_1-cldq_6);
    missingcldqsc = n(of cldq_7-cldq_10);
    missingcldqsa = n(of cldq_11-cldq_13);
    missingcldqsp = n(of cldq_14-cldq_17);
    missingcldqmp = n(of cldq_18-cldq_20);
run;

proc freq data=math4;
    tables missingcldqt missingcldqr missingcldqsc missingcldqsa missingcldqsp
           missingcldqmp;
run;

/* The total needs at least 18 of 20 items answered; each subscale needs all
   of its items answered. Otherwise the score is left missing. */
data math5 (drop=missingcldqt missingcldqr missingcldqsc missingcldqsa
                 missingcldqsp missingcldqmp);
    set math4;

    if missingcldqt ge 18 then P_cldq_total   = sum(of cldq_1-cldq_20);
    if missingcldqr  = 6  then P_cldq_read    = sum(of cldq_1-cldq_6);
    if missingcldqsc = 4  then P_cldq_social  = sum(of cldq_7-cldq_10);
    if missingcldqsa = 3  then P_cldq_anxiety = sum(of cldq_11-cldq_13);
    if missingcldqsp = 4  then P_cldq_spatial = sum(of cldq_14-cldq_17);
    if missingcldqmp = 3  then P_cldq_math    = sum(of cldq_18-cldq_20);

    label
        P_cldq_total   = 'CLDQ Total Problems sum score'
        P_cldq_read    = 'CLDQ Reading Problems subscale sum score'
        P_cldq_social  = 'CLDQ Social Cognition Problems subscale sum score'
        P_cldq_anxiety = 'CLDQ Social Anxiety Problems subscale sum score'
        P_cldq_spatial = 'CLDQ Spatial Problems subscale sum score'
        P_cldq_math    = 'CLDQ Math Problems subscale sum score';
run;


/*----------------------------------------------------------------------------
  8. Looking at the range of w scores for out of range values, which happens
     because of guessing. If a value is greater than 4 SDs from the mean, we
     are removing it, based on Halberda et al., 2012. First have to get rid of
     the super high value, because it is messing with the zscore.
----------------------------------------------------------------------------*/
proc freq data=math5;
    tables wscore;
run;

data math8;
    set math5;
    if wscore ge 10.00 then wscore = .;
run;

/* math8z holds wscore standardized to mean 0, SD 1; math8 keeps the raw scale. */
proc standard data=math8 out=math8z mean=0 std=1;
    var wscore;
run;

proc freq data=math8z;
    tables wscore;
run;

/* NOTE(review): only the upper tail (z > 4) is flagged here; a value more than
   4 SDs below the mean would not be flagged. */
data math8zz;
    set math8z;
    if wscore > 4.00 then flag = 1;
run;

proc print data=math8zz;
    var id;
    where flag = 1;
run;

/* Getting rid of w scores for ids over 4 SDs (list copied by hand from the
   printout above, applied to the unstandardized math8).
   NOTE(review): id is assigned from _N_ in math2, which starts at 1, so the
   value 0 in this list can never match a record. It may be a typo for another
   id; confirm against the flag = 1 printout. It is kept as-is so results do
   not change. */
data math8new;
    set math8;
    if id in (125, 258, 0, 297, 317, 330) then wscore = .;
run;


/*----------------------------------------------------------------------------
  9. Parent gender and income recodes.
     - One person answered gender = 3 (prefer not to answer), so recoding to
       missing.
     - Switching the coding of parent gender to male = 1, female = 2, to match
       kid gender (stored in the new variable newgender).
     - Recoding income so that 13 and 14, which are "don't know" and "don't
       wish to say", are missing.
----------------------------------------------------------------------------*/

/* Distribution before the recode. newgender does not exist yet at this point,
   so the original gender and income variables are tabulated. */
proc freq data=math8new;
    tables gender income;
run;

data math9;
    set math8new;

    if gender = 3 then newgender = .;
    else if gender = 2 then newgender = 1;
    else if gender = 1 then newgender = 2;

    if income in (13, 14) then income = .;
run;

/* Check the recoded values. */
proc freq data=math9;
    tables newgender income;
run;


/*----------------------------------------------------------------------------
  10. Creating a .csv file for posting online.
----------------------------------------------------------------------------*/
ods csv file="C:\Sara\Math in the Home\Hart et al., HNE paper code and data\FinalCleanedData.csv";

proc print data=math9;
run;

ods csv close;