********** LOGISTIC REGRESSION IN STATA **********
/* load low birth weight data */
webuse lbw, clear

/* histograms of predictors */
hist low, discrete freq
hist age, freq
hist ptl, freq discrete
hist ht, freq discrete
hist smoke, freq discrete
hist race, freq discrete

/* logistic regression with low as outcome,
   age and ptl as continuous predictors,
   ht, smoke and race as categorical predictors */
logit low age ptl i.ht i.smoke i.race

/* rerunning model to express odds ratios */
logit low age ptl i.ht i.smoke i.race, or

/* adding interaction of age and smoke */
logit low c.age##i.smoke ptl i.ht i.race, or

********** MODEL PREDICTIONS **********

/* predp will be predicted probabilities */
predict predp

/* display model variables and predictions for first 10 obs */
list low age ptl ht smoke race predp in 1/10

/* compare distribution of predp between observations with low=1 vs low=0 */
graph box predp, over(low) b1title("observed low") ytitle("predicted Pr(low=1)")

/* probability of lbw, averaging over all predictors */
margins

/* means of low and predp equals the predictive margin above */
summ low predp

/* average probabilities of lbw for smokers and nonsmokers */
margins smoke

/* predictive margins for each combination of smoke and race */
margins smoke#race

/* average probability of lbw for 20-year old mothers */
margins, at(age=20)

/* average probabilities of lbw for smoking and non-smoking moms age 20 or 30 */
margins smoke, at(age=(20 30))

/* average probabilities of lbw for mothers aged 20 to 40 (in increments of 5) and 
   either 0 to 2 premature labors */
margins, at(age=(20(5)40) ptl=(0/2))

/* estimate marginal means for smoking and non-smoking mothers */
margins smoke
/* plot marginal means */
marginsplot

/* estimate marginal means for smoking and non-smoking mothers at ages 20, 30 and 40 */
margins smoke, at(age=(20 30 40))
* plot marginal means
marginsplot

/* estimate marginal means for smoking and non-smoking mothers at ages 20, 30 and 40 */
margins smoke, at(age=(20 30 40))
/* plot marginal means */
marginsplot, x(smoke)

/* margins with many predictors fixed at many levels */
margins smoke#race, at(age=(20 30 40) ptl=(0 1))
/* too many lines in marginsplot */
marginsplot

margins smoke#race, at(age=(20 30 40) ptl=(0 1))
/* separate graphs by race and ptl */
marginsplot, by(race ptl)

/* adjusted predictions of probability of lbw for smokers and non-smokers of each race
   who are age 30 with no previous premature labors and a history of hypertension */
margins smoke#race, at(age=30 ptl=0 ht=1)

/* adjusted predictions of probability of lbw for smokers and non-smokers of each race
   who have a history of hypertension and are at the mean age and number of premature labors */
margins smoke#race, at(ht=1) atmeans

/* marginal effect of history of hypertension */
margins, dydx(ht)

/* marginal effect of ht is difference between these predictive margins */
margins ht

/* marginal effect of ht by race, averaging over other predictors */
margins race, dydx(ht)

/* marginal effect of age is a derivative */
margins, dydx(age)

/* predictive margins for ages 14-45 */
margins, at(age=(14/45))
/* plot margins without confidence intervals */
marginsplot, noci


********** ASSESSING MODEL FIT AND MODEL ASSUMPTIONS **********

/* Pearson goodness-of-fit test */
estat gof

/* Hosmer and Lemeshow gof test with groups based on predicted probability deciles */
estat gof, group(10)

/* AIC and BIC */
estat ic

/* model without continuous predictors age and ptl*/
logit low i.ht i.smoke i.race, or
/* store model results as m_reduced */
estimates store m_reduced
/* AIC and BIC */
estat ic

/* refit full model */
logit low age ptl i.ht i.smoke i.race, or
/* store model results */
estimate store m_full
/* likelihood ratio test comparing full and reduced models */
lrtest m_full m_reduced

/* classify observations based on cutoff 0.5 */
gen pred_low = 0
replace pred_low = 1 if predp > 0.5
/* check agreement of observed and predicted outcomes, add row percentages */
tab low pred_low, row  

/* classify observations based on cutoff 0.25 */
gen pred_low_25 = 0
replace pred_low_25 = 1 if predp > 0.25
/* check agreement of observed and predicted outcomes with new cutoff */
tab low pred_low_25, row 

/* plot ROC curve and estimate AUC */
lroc

/* Pregibon's link test to assess linearity */
linktest

/* create a categorical age variable based on age quintiles */
egen age_cat = cut(age), group(5)
/* enter categorical age in logit model as factor */
logit low i.age_cat ptl i.ht i.smoke i.race, or
/* predicted logits across age categories, predict(xb) requests logits */
margins age_cat, predict(xb)
/* graph predicted logits across age categories */
marginsplot

/* calculate delta-beta influence statistic and save as variable dbeta */
predict dbeta, dbeta
/* sort data by dbeta, descending */
gsort -dbeta
/* list 10 observations with largest dbeta */
li dbeta in 1/10

/* rerun model without 2 observations with dbeta > 1 */
logit low age ptl i.ht i.smoke i.race if dbeta < 1, or



************ EXERCISES ***************************

/* Exercise 1 

To practice logistic regression in Stata, we will use the `nhanes2` dataset, coming from the National Health and Nutrition Examination Survey, that can be loaded with:

The model variables are:

* `highbp`: high blood pressure, 1=high blood pressure, 0=normal
* `region`: region of US, 1=NE, 2=MW, 3=S, 4=W
* `rural`: rural residence, 0=urban, 1= rural
* `weight`: weight in kg
* `age`: age in years

1. Perform a logistic regression of outcome `highbp` with categorical predictors (factors) `region` and `rural`, and linear relationships for continuous predictors `weight` and `age`, with coefficients expressed in logits (log odds). Interpret the coefficients.
2. Rerun the model with coefficients expressed as odds ratios and interpret the odds ratios.

*/
webuse nhanes2, clear




/* Exercise 2

Use the NHANES data and the previously run logistic regression to:

3. Estimate the following:
    + the average probability of high blood pressure in each region of the US
    + the average probability of high blood pressure for rural and urban residents in each region of the US
4. Run `margins, at(region=4 urban=0 weight=60 age=30)` and interpret the estimate
5. Estimate the average marginal effect of age
    + Estimate the predictive margins for every year of age from 20 to 70 and graph the predictive margins vs age
    + Can the marginal effect of be interpreted as approximately the increase in the probability of highbp for each additional year of age?
	
*/



/* Exercise 3

Assess the linearity assumption of the logistic regression model run on the NHANES data:

6. Evaluate the linearity assumption with `linktest`
7. Use the graphical method to get an idea of the form of the relationship between the logit of high blood pressure and age

*/