***** IMPORTING DATA *****;

* Point the XPORT engine at the NHANES 2011-2012 demographic transport file.
  Replace the C:\path\ portion below with the directory that holds demo_g.xpt;
LIBNAME xp XPORT 'C:\path\demo_g.xpt';

* Copy the demo_g member of the transport file into a SAS data set named nhanes2012;
DATA nhanes2012;
    SET xp.demo_g;
RUN;

***** EXAMINING DESIGN VARIABLES *****;

* Sampling weight wtint2yr: count, minimum, mean, maximum and sum;
PROC MEANS DATA=nhanes2012 N MIN MEAN MAX SUM;
    VAR wtint2yr;
RUN;

* One-way frequency tables of the cluster (sdmvpsu) and strata (sdmvstra) variables.
  There are 14 strata in total;
PROC FREQ DATA=nhanes2012;
    TABLES sdmvpsu sdmvstra;
RUN;

* Cross-tabulation of PSU (cluster) by strata.
  Most strata hold 2 PSUs, a few hold 3, for 31 PSUs in total;
PROC FREQ DATA=nhanes2012;
    TABLES sdmvpsu*sdmvstra;
RUN;

***** PROC SURVEYMEANS (SLIDE) *****;

* Descriptives for a continuous variable come from PROC SURVEYMEANS.
  The analysis variable ridageyr is age;
PROC SURVEYMEANS DATA=nhanes2012;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    VAR     ridageyr;
RUN;

* Unweighted comparison for age: the plain mean estimate and its standard error.
  PROC MEANS prints only the statistics that are listed once any keyword is given,
  so MEAN has to be requested next to STDERR for the estimate itself to appear;
PROC MEANS DATA=nhanes2012 MEAN STDERR;
    VAR ridageyr;
RUN;

* More statistics for age: mean, minimum, maximum and range;
PROC SURVEYMEANS DATA=nhanes2012 MEAN MIN MAX RANGE;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    VAR     ridageyr;
RUN;

* 10th, 25th, 50th, 75th and 90th percentiles of age;
PROC SURVEYMEANS DATA=nhanes2012 PERCENTILE=(10 25 50 75 90);
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    VAR     ridageyr;
RUN;


* indfmpir is the ratio of family income to poverty.
  Statistics requested in addition to the mean:
    var   = variance
    nmiss = number missing
    df    = degrees of freedom = 31 PSUs - 14 strata = 17
    cv    = coefficient of variation
    deff  = design effect;
PROC SURVEYMEANS DATA=nhanes2012 MEAN VAR NMISS DF CV DEFF;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    VAR     indfmpir;
RUN;

* With several variables on the VAR statement, SAS uses all available data
  for each estimate;
PROC SURVEYMEANS DATA=nhanes2012;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    VAR     ridageyr indfmpir;
RUN;


***** PROC SURVEYFREQ (SLIDE) *****;

* Descriptives for a categorical variable come from PROC SURVEYFREQ.
  riagendr is gender (1=male, 2=female);
PROC SURVEYFREQ DATA=nhanes2012;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    TABLES  riagendr;
RUN;

* Cross-tabulation of gender by country of birth.
  dmdborn4 is country of birth (1=US, 2=others, 77=refused, 99=do not know);
PROC SURVEYFREQ DATA=nhanes2012;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    TABLES  riagendr*dmdborn4;
RUN;

* Value labels (formats) make the tables easier to read.
  Some of these formats are only put to use in later steps;
PROC FORMAT;
    VALUE gendr
        1 = "male"
        2 = "female";
    VALUE cob
        1  = "US"
        2  = "others"
        77 = "refused"
        99 = "don't know";
    VALUE marstat
        1  = "married"
        2  = "widowed"
        3  = "divorced"
        4  = "separated"
        5  = "never married"
        6  = "living with partner"
        77 = "refused"
        99 = "don't know";
RUN;


* Same gender by country-of-birth table, now with value labels attached
  through the FORMAT statement;
PROC SURVEYFREQ DATA=nhanes2012;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    TABLES  riagendr*dmdborn4;
    FORMAT  riagendr gendr. dmdborn4 cob.;
RUN;

* Table options requested after the slash:
    row col  = row and column percentages
    expected = expected frequencies
    deff     = design effect on percentages.
  The full nhanes2012 data set is read here. No WHERE= subsetting is applied
  in this step;
PROC SURVEYFREQ DATA=nhanes2012;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    TABLES  riagendr*dmdborn4 / ROW COL EXPECTED DEFF;
    FORMAT  riagendr gendr. dmdborn4 cob.;
RUN;


* Weighted frequency plot for a single categorical variable.
  dmdhrmar is marital status;
PROC SURVEYFREQ DATA=nhanes2012;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    TABLES  dmdhrmar / PLOTS=wtfreqplot;
    FORMAT  dmdhrmar marstat.;
RUN;

* Mosaic plot as a picture of the dmdmartl by dmdborn4 cross-tabulation;
PROC SURVEYFREQ DATA=nhanes2012;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    TABLES  dmdmartl*dmdborn4 / PLOTS=mosaicplot;
    FORMAT  dmdmartl marstat. dmdborn4 cob.;
RUN;


* DESIGN-ADJUSTED CHI-SQUARE TESTS;
* Chi-square test options requested on TABLES:
    chisq    = Rao-Scott
    lrchisq  = Rao-Scott likelihood ratio
    wchisq   = Wald
    wllchisq = Wald log-linear.
  WHERE= drops the refused, do-not-know and missing responses. Both bounds are
  spelled out because SAS orders missing numeric values below every number, so
  an upper bound alone (for example dmdborn4<=2) would keep the missing values.
  Subsetting survey data with WHERE= is not something to do in general
  (see SAS warning);
PROC SURVEYFREQ DATA=nhanes2012(WHERE=((1<=dmdborn4<=2) AND (1<=dmdmartl<=6)));
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    TABLES  dmdmartl*dmdborn4 / ROW COL EXPECTED CHISQ LRCHISQ WCHISQ WLLCHISQ;
    FORMAT  dmdmartl marstat. dmdborn4 cob.;
RUN;


***** SUBPOPULATION ANALYSIS with DOMAIN STATEMENTS (SLIDE) *****;

* Mean household size (dmdhhsiz, number of people in household),
  estimated separately for males and females through the DOMAIN statement;
PROC SURVEYMEANS DATA=nhanes2012;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    DOMAIN  riagendr;
    VAR     dmdhhsiz;
    FORMAT  riagendr gendr.;
RUN;


* Mean household size (dmdhhsiz) within each group formed by crossing
  gender with country of birth;
PROC SURVEYMEANS DATA=nhanes2012;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    DOMAIN  riagendr*dmdborn4;
    VAR     dmdhhsiz;
    FORMAT  riagendr gendr. dmdborn4 cob.;
RUN;


* Mean household size (dmdhhsiz) for one domain only: US-born females.
  The levels are picked by their formatted values;
PROC SURVEYMEANS DATA=nhanes2012;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    DOMAIN  riagendr('female')*dmdborn4('US');
    VAR     dmdhhsiz;
    FORMAT  riagendr gendr. dmdborn4 cob.;
RUN;


***** PROC SURVEYREG (SLIDE) *****;

* Regression of household size on gender, which is essentially a t-test
  comparing household size between genders;
PROC SURVEYREG DATA=nhanes2012;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    CLASS   riagendr;
    MODEL   dmdhhsiz = riagendr / SOLUTION;
    FORMAT  riagendr gendr.;
RUN;

* Same model without an intercept, so the parameters represent the means
  for males and females. The CONTRAST statement compares the two;
PROC SURVEYREG DATA=nhanes2012;
    STRATA   sdmvstra;
    CLUSTER  sdmvpsu;
    WEIGHT   wtint2yr;
    CLASS    riagendr;
    MODEL    dmdhhsiz = riagendr / NOINT SOLUTION;
    CONTRAST 't-test: males vs females household size' riagendr 1 -1;
    FORMAT   riagendr gendr.;
RUN;

* Linear regression of the income-to-poverty ratio on gender and age.
  LSMEANS gives the predicted ratio for each gender;
PROC SURVEYREG DATA=nhanes2012;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    CLASS   riagendr;
    MODEL   indfmpir = riagendr ridageyr / SOLUTION;
    LSMEANS riagendr;
    FORMAT  riagendr gendr.;
RUN;


***** PROC SURVEYLOGISTIC (SLIDE) *****;

* Logistic regression of the binary variable ridstatr
  (1=interview only, 2=interview and medical exam) on gender and age.
    (event=last) models the probability of the last value, 2=interview and medical exam.
    (param=glm) on the CLASS statement is required for LSMEANS, which outputs
    predicted probabilities when ILINK is given.
  The full nhanes2012 data set is read here. No WHERE= subsetting is applied
  in this step;
PROC SURVEYLOGISTIC DATA=nhanes2012;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    CLASS   riagendr / PARAM=glm;
    MODEL   ridstatr(EVENT=last) = riagendr ridageyr;
    LSMEANS riagendr / ILINK;
    FORMAT  riagendr gendr.;
RUN;


***** PROC SURVEYIMPUTE (SLIDE) *****;

* fialang is the language of the family interview (1=English, 2=Spanish).
  It has 105 missing values;
PROC SURVEYFREQ DATA=nhanes2012;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    TABLES  fialang;
RUN;

* List fialang for the first 10 observations. Observation 9 is missing;
PROC PRINT DATA=nhanes2012(OBS=10);
    VAR fialang;
RUN;

* Hot-deck imputation of fialang.
  Imputation cells are defined by age (ridageyr) and dmdcitzn, so an observation
  with missing fialang has the value replaced by that of a donor observation
  with the same ridageyr and dmdcitzn.
  NOTE(review): dmdcitzn is presumably the NHANES citizenship-status variable -
  confirm against the codebook. It is not dmdborn4, the country-of-birth
  variable used in the earlier cross-tabulations.
    SEED=  sets the random seed
    VAR    names the variable to impute
    OUTPUT writes data set nhanes2012_imp with a new variable identifying the
           donor observation;
PROC SURVEYIMPUTE DATA=nhanes2012 METHOD=hotdeck SEED=543;
    STRATA  sdmvstra;
    CLUSTER sdmvpsu;
    WEIGHT  wtint2yr;
    CLASS   dmdcitzn;
    CELLS   ridageyr dmdcitzn;
    VAR     fialang;
    OUTPUT  OUT=nhanes2012_imp DONORID;
RUN;


* Shrink the imputed data set to the few columns needed for viewing.
  The data set is rewritten in place under the same name;
DATA nhanes2012_imp(KEEP=ridageyr dmdcitzn fialang donorid);
    SET nhanes2012_imp;
RUN;


* First 10 observations after imputation. Observation 9 now carries a fialang
  value imputed from donor observation 5351;
PROC PRINT DATA=nhanes2012_imp(OBS=10);
RUN;

* Donor group for observation 9: every observation with ridageyr=21 and
  dmdcitzn=1, the same imputation cell as observation 9. The group includes
  donor observation 5351.
  NOTE(review): dmdcitzn is presumably citizenship status rather than country
  of birth (dmdborn4) - confirm against the codebook;
PROC PRINT DATA=nhanes2012_imp(WHERE=(ridageyr=21 AND dmdcitzn=1));
RUN;