* Survey-design analysis walkthrough for the NHANES 2011-2012 demographic file (demo_g);
* Covers PROC SURVEYMEANS, SURVEYFREQ, SURVEYREG, SURVEYLOGISTIC and SURVEYIMPUTE;
* Every survey procedure below declares the same design:
  weight wtint2yr, cluster sdmvpsu, strata sdmvstra;
* NOTE: statement-style comments must not contain unmatched quotation marks,
  so apostrophes are deliberately avoided in the comments of this program;

***** IMPORTING DATA *****;

* Replace C:\path\ with the path (directory) where the NHANES 2011-2012 demographic file is located;
libname xp xport 'C:\path\demo_g.xpt';

* create a SAS dataset of NHANES demo file;
data nhanes2012;
  set xp.demo_g;
run;

***** EXAMINING DESIGN VARIABLES *****;

* sampling weight;
proc means data = nhanes2012 n min mean max sum;
  var wtint2yr;
run;

* cluster and strata variables;
* 14 total strata;
proc freq data = nhanes2012;
  tables sdmvpsu sdmvstra;
run;

* cross-tab of PSU (cluster) and strata;
* some strata have 3 PSUs (clusters), most have 2 PSUs;
* 31 total PSUs;
proc freq data = nhanes2012;
  tables sdmvpsu*sdmvstra;
run;

***** PROC SURVEYMEANS (SLIDE) *****;

* for continuous variable descriptives, use PROC SURVEYMEANS;
* ridageyr is age;
proc surveymeans data = nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  var ridageyr;
run;

* compare to unweighted estimate and its standard error;
proc means data=nhanes2012 stderr;
  var ridageyr;
run;

* additional statistics for age;
proc surveymeans data = nhanes2012 mean min max range;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  var ridageyr;
run;

* percentiles for age;
proc surveymeans data = nhanes2012 percentile = (10 25 50 75 90);
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  var ridageyr;
run;

* indfmpir is ratio of family income to poverty;
* in addition to mean, also request:;
*   var   = variance;
*   nmiss = number missing;
*   df    = degrees of freedom = 31 PSU - 14 strata = 17;
*   cv    = coefficient of variation;
*   deff  = design effect;
proc surveymeans data = nhanes2012 mean var nmiss df cv deff;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  var indfmpir;
run;

* if multiple variables are specified on VAR, SAS will use all available data;
* for each estimate;
proc surveymeans data = nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  var ridageyr indfmpir;
run;

***** PROC SURVEYFREQ (SLIDE) *****;

* for categorical variable descriptives, use PROC SURVEYFREQ;
* riagendr is gender, 1=male, 2=female;
proc surveyfreq data = nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  tables riagendr;
run;

* cross-tabulation;
* dmdborn4 is country of birth (1=US, 2=others, 77=refused, 99=do not know);
proc surveyfreq data = nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  tables riagendr*dmdborn4;
run;

* formats (value labels) would be helpful;
* we will use some of the formats later;
proc format;
  value gendr
    1="male"
    2="female";
  value cob
    1="US"
    2="others"
    77="refused"
    99="don't know";
  value marstat
    1="married"
    2="widowed"
    3="divorced"
    4="separated"
    5="never married"
    6="living with partner"
    77="refused"
    99="don't know";
run;

* format variables for value labels;
proc surveyfreq data = nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  tables riagendr*dmdborn4;
  format riagendr gendr. dmdborn4 cob.;
run;

* row and column percentages;
* expected frequencies;
* deff=design effect on percentages;
* note: no WHERE restriction is applied here, so every dmdborn4 category is tabulated;
proc surveyfreq data = nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  tables riagendr*dmdborn4 / row col expected deff;
  format riagendr gendr. dmdborn4 cob.;
run;

* weighted frequency plot for single categorical variable;
* dmdhrmar is marital status of the household reference person;
proc surveyfreq data = nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  tables dmdhrmar / plots = wtfreqplot;
  format dmdhrmar marstat.;
run;

* mosaic plot to visualize crosstab;
* dmdmartl is marital status of the participant (same value coding as dmdhrmar);
proc surveyfreq data = nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  tables dmdmartl*dmdborn4 / plots = mosaicplot;
  format dmdmartl marstat. dmdborn4 cob.;
run;

* DESIGN-ADJUSTED CHI-SQUARE TESTS;
* chi-square tests: chisq=Rao-Scott, lrchisq=Rao-Scott likelihood ratio,
  wchisq=Wald, wllchisq=Wald log-linear;
* using where= to drop the refused (77) and do-not-know (99) codes,
  but do not do this in general (see SAS warning);
* the <= comparisons do not drop missing values (SAS numeric missing compares
  lower than any number), but PROC SURVEYFREQ leaves observations with missing
  TABLES values out of the table by default;
proc surveyfreq data = nhanes2012(where=((dmdborn4<=2) and (dmdmartl<=6)));
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  tables dmdmartl*dmdborn4 / row col expected chisq lrchisq wchisq wllchisq;
  format dmdmartl marstat. dmdborn4 cob.;
run;

***** SUBPOPULATION ANALYSIS with DOMAIN STATEMENTS (SLIDE) *****;

* mean number of people in household (dmdhhsiz) separately for males and females;
proc surveymeans data = nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  domain riagendr;
  var dmdhhsiz;
  format riagendr gendr.;
run;

* mean number of people in household (dmdhhsiz) separately for groups formed by
  gender and country of birth;
proc surveymeans data = nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  domain riagendr*dmdborn4;
  var dmdhhsiz;
  format riagendr gendr. dmdborn4 cob.;
run;

* mean number of people in household (dmdhhsiz) separately for just US-born females;
* domain levels are selected by their formatted values;
proc surveymeans data = nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  domain riagendr('female')*dmdborn4('US');
  var dmdhhsiz;
  format riagendr gendr. dmdborn4 cob.;
run;

***** PROC SURVEYREG (SLIDE) *****;

* essentially a t-test comparing household size between genders;
proc surveyreg data = nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  class riagendr;
  model dmdhhsiz = riagendr / solution;
  format riagendr gendr.;
run;

* same as above but no intercept so parameters represent means for males and females;
proc surveyreg data = nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  class riagendr;
  model dmdhhsiz = riagendr / noint solution;
  contrast 't-test: males vs females household size' riagendr 1 -1;
  format riagendr gendr.;
run;

* linear regression of ratio of income to poverty predicted by gender and age;
* lsmeans gives predicted ratio for each gender;
proc surveyreg data = nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  class riagendr;
  model indfmpir = riagendr ridageyr / solution;
  lsmeans riagendr;
  format riagendr gendr.;
run;

***** PROC SURVEYLOGISTIC (SLIDE) *****;

* binary variable ridstatr codes 1=interview only, 2=interview and medical exam;
* (event=last) says model probability last value, 2=interview and medical exam;
* (param=glm) on CLASS statement required for LSMEANS, which outputs predicted
  probabilities with ilink;
* note: no observations are filtered out in this step;
proc surveylogistic data = nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  class riagendr / param=glm;
  model ridstatr(event=last) = riagendr ridageyr;
  lsmeans riagendr / ilink;
  format riagendr gendr.;
run;

***** PROC SURVEYIMPUTE (SLIDE) *****;

* fialang is language of family interview, 1=English, 2=Spanish;
* has 105 missing values;
proc surveyfreq data=nhanes2012;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  tables fialang;
run;

* observation 9 is missing on fialang;
proc print data=nhanes2012(obs=10);
  var fialang;
run;

* imputation cells are defined by age (ridageyr) and citizenship status (dmdcitzn);
* observations with missing on fialang will have the value replaced by that of an
  observation with the same age and citizenship status;
proc surveyimpute data=nhanes2012 method=hotdeck seed=543; * seed sets the random seed;
  weight wtint2yr;
  cluster sdmvpsu;
  strata sdmvstra;
  class dmdcitzn;
  cells ridageyr dmdcitzn;
  var fialang; * variable to impute;
  output out=nhanes2012_imp donorid; * output data set with new variable identifying donor observation;
run;

* create a small data set for viewing (overwrites nhanes2012_imp with only the kept columns);
data nhanes2012_imp;
  set nhanes2012_imp;
  keep ridageyr dmdcitzn fialang donorid;
run;

* observation 9 now has fialang imputed using donor observation 5351;
proc print data=nhanes2012_imp(obs=10);
run;

* this is the donor group that has the same age and citizenship status
  as observation 9, including observation 5351;
proc print data=nhanes2012_imp(where=(ridageyr=21 and dmdcitzn=1));
run;