#run this if you have not installed the packages
#install.packages(c("MASS", "ggplot2", "tidyverse", "nnet", "mlogit", "ordinal", "brant"),
#                  dependencies = TRUE)

#load packages used in the workshop
library(tidyverse)
library(MASS)
library(nnet)
library(mlogit)
library(brant)
library(ordinal)

#factor
#creating variable x
x <- c("a", "b", "c", "d")
#print x
x
#class of object x
class(x)

#transform x to a factor and assign the result to x.f
x.f <- factor(x,
              levels = c("a", "b", "c", "d"),
              labels = c("a", "b", "c", "d"))
#print x.f
x.f
#class of x.f
class(x.f)

#relevel x.f to make "b" the reference level
relevel(x.f, ref = "b")
x.f
#assign the result to x.f
x.f <- relevel(x.f, ref = "b")
x.f

#map from 4 different values to three levels (combine c and d into c):
x.f2 <- x.f
levels(x.f2) <- c("b", "a", "c", "c")
x.f2
#or
x.f2 <- factor(x.f, labels = c("b", "a", "c", "c"))
x.f2

#ordered factor
x.of <- factor(x, ordered = TRUE)   #or ordered(x)
#change the order of the levels
factor(x.of, levels = c("d", "b", "c", "a"), ordered = TRUE)
#changing the labels
(x.of <- ordered(x.of, labels = c("strongly disagree", "disagree", "agree", "strongly agree")))
factor(x.of, levels = c("strongly agree", "agree", "disagree", "strongly disagree"))

#reading the hsb data
hsb <- read.csv("https://stats.idre.ucla.edu/stat/data/hsbdemo.csv")
#names of the variables
names(hsb)
#structure of the variables
str(hsb)

#making variable female a factor
hsb$female <- factor(hsb$female)
#levels
levels(hsb$female)
#frequency table
table(hsb$female)
#proportion in each level
prop.table(table(hsb$female))
#changing the reference level to "male"
hsb$female <- relevel(hsb$female, ref = "male")
#check that the reference changed
levels(hsb$female)

#transform variables schtyp, prog, honors and ses to factors
hsb$schtyp <- factor(hsb$schtyp)
prop.table(table(hsb$schtyp))
hsb$prog <- factor(hsb$prog)
prop.table(table(hsb$prog))
hsb$honors <- factor(hsb$honors)
prop.table(table(hsb$honors))
#ses as a factor with levels in the order "low", "middle", "high" (not an ordered factor)
hsb$ses <- factor(hsb$ses, levels = c("low", "middle", "high"))
prop.table(table(hsb$ses))
#relevel honors
hsb$honors <- relevel(hsb$honors, ref = "not enrolled")

#first we create a frequency table and
#then we use prop.table to calculate the probabilities
prog.tab <- table(hsb$prog)
(prog.p <- prop.table(prog.tab))
#odds of general vs. academic
(odds.general <- 0.225 / 0.525)
#odds of vocational vs. academic
(odds.vocational <- 0.250 / 0.525)
#recover the probabilities from the odds
p.academic   <- 1 / (1 + odds.general + odds.vocational)
p.general    <- p.academic * odds.general
p.vocational <- p.academic * odds.vocational
cbind(p.academic = p.academic, p.general = p.general, p.vocational = p.vocational)

#calculating the proportion of honors enrollment within each group of female and male
#table gives frequencies
(tab_honors_female <- table(hsb$honors, hsb$female))
#prop.table on the frequency table with margin = 2 returns column-wise proportions
prop.table(tab_honors_female, margin = 2)
#odds ratio of enrollment, female vs. male, from the proportions
(odds.ratio <- (0.3211009 / 0.6788991) / (0.1978022 / 0.8021978))
#the same odds ratio using frequencies
(35 / 74) / (18 / 73)
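#a small check (added for illustration; the *.chk and p.enrolled objects are not part
#of the original workshop code): the same odds and odds ratio computed directly from
#the stored tables, so no proportions need to be retyped by hand
(odds.general.chk    <- prog.p["general"]  / prog.p["academic"])
(odds.vocational.chk <- prog.p["vocation"] / prog.p["academic"])
#proportion enrolled = 1 - proportion "not enrolled" within each column (sex)
p.enrolled <- 1 - prop.table(tab_honors_female, margin = 2)["not enrolled", ]
(odds.ratio.chk <- (p.enrolled["female"] / (1 - p.enrolled["female"])) /
                   (p.enrolled["male"]   / (1 - p.enrolled["male"])))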
#3 x 2 table of program vs. female
table(hsb$prog, hsb$female)
#odds ratio using frequencies
(odds.ratio <- (24 / 58) / (21 / 47))

#to estimate the coefficients of the model we use glm with
#the option family = binomial(link = "logit")
logit.m <- glm(honors ~ math + female,
               family = binomial(link = "logit"), data = hsb)
summary(logit.m)
coef(summary(logit.m))
#confidence intervals for the coefficients
confint(logit.m)
#exponentiate the coefficients
exp(coef(logit.m))
#exponentiate the coefficients and column-bind them with the exponentiated CIs
cbind(OR = exp(coef(logit.m)), exp(confint(logit.m)))

#summary of the outcome prog
(tab.prog <- table(hsb$prog))
prop.table(tab.prog)

#running a multinomial regression model of prog on socioeconomic status and write
library(nnet)
m.multinom <- multinom(prog ~ ses + write, data = hsb)
#extracting the results of the model using summary
(m.results <- summary(m.multinom, digits = 3))
#exponentiation of the coefficients
exp(coef(m.multinom))
#calculating z-scores by dividing each estimated coefficient by its standard error
z <- m.results$coefficients / m.results$standard.errors
#using pnorm we get the two-tailed probability of the normal distribution for each z value
p <- (1 - pnorm(abs(z), 0, 1)) * 2
#print the z-scores and p-values
print(z, digits = 4)
print(p, digits = 4)

#likelihood ratio test for ses:
#update the model to exclude ses
m.multinom.r <- update(m.multinom, . ~ . - ses)
#anova of the restricted model vs. the full model
anova(m.multinom.r, m.multinom)

#fitted gives the fitted probabilities for the observations; head reports the first 6 rows
head(pp <- fitted(m.multinom))

#new data with all levels of ses and write held at the mean of write
newdata.ses <- data.frame(ses = c("low", "middle", "high"),
                          write = mean(hsb$write))
#column-bind the new data with the predicted probabilities
cbind(newdata.ses,
      probability = predict(m.multinom, newdata = newdata.ses, "probs"))

#check the range of write
range(hsb$write)
#in the new data, for write from 30 to 70, we replicate the levels of ses
newdata.write <- data.frame(ses = rep(c("low", "middle", "high"), each = 41),
                            write = rep(c(30:70), 3))
#predict gives the predicted probabilities for each row of newdata
pp.write <- cbind(newdata.write,
                  predict(m.multinom, newdata = newdata.write, type = "probs", se = TRUE))
#first few rows
head(pp.write)
#calculate the mean probabilities within each level of ses
by(pp.write[, 3:5], pp.write$ses, colMeans)

#pivot_longer from package tidyr reshapes the wide format to long format
library(tidyr)
long.pp <- pp.write %>%
  pivot_longer(cols = c("academic", "general", "vocation"),
               names_to = "Program", values_to = "Probability")
#the first few rows will be
head(long.pp)

library(ggplot2)
ggplot(long.pp, aes(x = write, y = Probability, colour = ses)) +
  geom_line(lwd = 1.2) +
  facet_grid(Program ~ ., scales = "free")

#package mlogit
library(mlogit)
#reshapes the data into a data frame with an alternative index (dfidx)
hsb.choice <- mlogit.data(hsb, shape = "wide", choice = "prog")
class(hsb.choice)
#each row is expanded to 3 rows, one per alternative
head(data.frame(hsb.choice))
#running multinomial logistic regression using mlogit
m.mlogit <- mlogit(prog ~ 0 | ses + write, data = hsb.choice)
#extract the results using summary
summary(m.mlogit)

#exercise solution
#first create variable c.math (math centered at its mean)
hsb$c.math <- scale(hsb$math, center = TRUE, scale = FALSE)
#run the multinomial regression with nnet::multinom()
ex.multinom <- multinom(ses ~ c.math + female + c.math:female, data = hsb)
(result <- summary(ex.multinom))
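#sketch (added for illustration, not in the original script): Wald z statistics and
#two-sided p-values for the exercise model, using the same steps applied to m.multinom above
ex.z <- result$coefficients / result$standard.errors
ex.p <- (1 - pnorm(abs(ex.z), 0, 1)) * 2
print(ex.z, digits = 4)
print(ex.p, digits = 4)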
#using mlogit we first need to reshape the data
hsb.choice <- mlogit.data(hsb, shape = "wide", choice = "ses")
m.mlogit <- mlogit(ses ~ 0 | c.math * female, data = hsb.choice)
#extract the results
summary(m.mlogit)

############# ordinal regression #############
#reading the data into R
dat <- read.csv("https://stats.idre.ucla.edu/stat/data/ologit.csv")
#structure of the data
str(dat)
#transform apply to an ordered factor
dat$apply <- factor(dat$apply,
                    labels = c("unlikely", "somewhat likely", "very likely"),
                    ordered = TRUE)
str(dat$apply)
#transform pared to a factor
dat$pared <- factor(dat$pared)
#checking the levels
levels(dat$pared)
#change the labels to "not attend" and "attend"
dat$pared <- factor(dat$pared, labels = c("not attend", "attend"))
#structure of pared
str(dat$pared)

#loading library ordinal
library(ordinal)
#running clm for a cumulative link (logit) model
ordinal.clm <- clm(apply ~ pared, data = dat)
#extract the results
summary(ordinal.clm)
(ci <- confint(ordinal.clm))   #the default method gives profiled CIs
confint.default(ordinal.clm)   #CIs assuming normality
#OR (on the same scale as the CI) together with the reciprocal of the odds ratio
exp(cbind(OR = coef(ordinal.clm)[3], inv.OR = -coef(ordinal.clm)[3], ci))

#loading library MASS
library(MASS)
#running the model with polr from package MASS
m.plr <- polr(apply ~ pared, data = dat)
summary(m.plr)
#store the coefficient table
(ctable <- coef(summary(m.plr)))
#calculate and store the p-values
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
#combined table
(ctable <- cbind(ctable, "p value" = p))

#likelihood ratio test for the proportional odds (parallel lines) assumption:
#ordinal logistic regression relaxing the proportional odds assumption for pared
ordinal.clm.ur <- clm(apply ~ 1, nominal = ~ pared, data = dat)
summary(ordinal.clm.ur)
anova(ordinal.clm.ur, ordinal.clm)
#Brant test of the proportional odds assumption on the polr object
library(brant)
brant::brant(m.plr)

#Exercise: ordinal logistic regression of `apply`
#on `pared`, `public` and `gpa`
#first we make variable public a factor
dat$public <- factor(dat$public, labels = c("private", "public"))
#run the clm model
m.clm <- clm(apply ~ pared + public + gpa, data = dat)
summary(m.clm)
#getting the CIs
(ci <- confint(m.clm))
#exponentiate to get the ORs (and their reciprocals) with the CIs
exp(cbind(OR = coef(m.clm)[3:5], inv.OR = -coef(m.clm)[3:5], ci))

#proportional odds assumption
#re-run the model with the proportional odds assumption relaxed
m.clm.relaxed <- clm(apply ~ 1, nominal = ~ pared + public + gpa, data = dat)
summary(m.clm.relaxed)
#likelihood ratio test with anova
anova(m.clm, m.clm.relaxed)
#Brant test of the proportional odds assumption on a polr object
#run the model using polr
m1.plr <- polr(apply ~ pared + public + gpa, data = dat)
#run the Brant test
brant::brant(m1.plr)

######################
#plot ordinal
#creating new data
newdat <- data.frame(
  pared  = rep(levels(dat$pared), 200),
  public = rep(levels(dat$public), each = 200),
  gpa    = rep(seq(from = 1.9, to = 4, length.out = 100), 4))
#using the function predict
newdat <- cbind(newdat, predict(m.clm, newdat, type = "prob"))
head(newdat)

newdat.long <- newdat %>%
  pivot_longer(cols = c(4, 5, 6),
               names_to = "apply", values_to = "Probability",
               names_pattern = "....(.*)")
head(newdat.long)

#plot of predicted probability vs. gpa, separated by parent education and school type
ggplot(newdat.long, aes(x = gpa, y = Probability, colour = apply)) +
  geom_line() +
  facet_grid(pared ~ public, labeller = "label_both")
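#closing sketch (added for illustration; not part of the original workshop code):
#reconstruct the predicted probabilities of the simple pared-only model by hand from
#its thresholds and slope, to make the cumulative-logit formula
#  P(apply <= j | x) = plogis(theta_j - x * beta)
#concrete; theta1, theta2 and eta are ad hoc names
theta1 <- coef(ordinal.clm)[1]   #threshold: unlikely | somewhat likely
theta2 <- coef(ordinal.clm)[2]   #threshold: somewhat likely | very likely
eta    <- coef(ordinal.clm)[3]   #effect of pared = "attend"
p.hand <- c(plogis(theta1 - eta),
            plogis(theta2 - eta) - plogis(theta1 - eta),
            1 - plogis(theta2 - eta))
p.hand
#this should match the model-based prediction for a student whose parent attended college
predict(ordinal.clm,
        newdata = data.frame(pared = factor("attend", levels = levels(dat$pared))),
        type = "prob")$fit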