#run this if you have not installed the packages
#install.packages(c("MASS", "ggplot2", "tidyverse", "nnet", "mlogit", "ordinal", "brant"),
#                  dependencies = TRUE)

#load packages used in the workshop
library(tidyverse)
library(MASS)
library(nnet)
library(mlogit)
library(brant)
library(ordinal)

#factor
#creating variable x
x <- c("a", "b", "c", "d")
#print x
x
#class of object x
class(x)

#transform x to a factor and assign the result to x.f
x.f <- factor(x,
              levels = c("a", "b", "c", "d"),
              labels = c("a", "b", "c", "d"))
#print x.f
x.f
#class of x.f
class(x.f)

#relevel x.f to make "b" the reference level
relevel(x.f, ref = "b")
x.f
#assign the result to x.f
x.f <- relevel(x.f, ref = "b")
x.f

#map from 4 different values to three levels (combine c and d into c):
x.f2 <- x.f
levels(x.f2) <- c("b", "a", "c", "c")
x.f2
#or
x.f2 <- factor(x.f, labels = c("b", "a", "c", "c"))
x.f2

#ordered factor
x.of <- factor(x, ordered = TRUE)   #or ordered(x)
#change the order of the levels
factor(x.of, levels = c("d", "b", "c", "a"), ordered = TRUE)
#changing the labels
(x.of <- ordered(x.of, labels = c("strongly disagree", "disagree", "agree", "strongly agree")))
factor(x.of, levels = c("strongly agree", "agree", "disagree", "strongly disagree"))

#reading the hsb data
hsb <- read.csv("https://stats.idre.ucla.edu/stat/data/hsbdemo.csv")
#names of the variables
names(hsb)
#structure of the variables
str(hsb)

#making variable female a factor
hsb$female <- factor(hsb$female)
#levels
levels(hsb$female)
#frequency table
table(hsb$female)
#proportion in each level
prop.table(table(hsb$female))
#changing the reference level to "male"
hsb$female <- relevel(hsb$female, ref = "male")
#check that the reference changed
levels(hsb$female)

#transform variables schtyp, prog, honors and ses to factors
hsb$schtyp <- factor(hsb$schtyp)
prop.table(table(hsb$schtyp))
hsb$prog <- factor(hsb$prog)
prop.table(table(hsb$prog))
hsb$honors <- factor(hsb$honors)
prop.table(table(hsb$honors))
#ses as a factor with levels in the order "low", "middle", "high" (not an ordered factor)
hsb$ses <- factor(hsb$ses, levels = c("low", "middle", "high"))
prop.table(table(hsb$ses))
#relevel honors
hsb$honors <- relevel(hsb$honors, ref = "not enrolled")

#first we create a frequency table and
#then we use prop.table to calculate the probabilities
prog.tab <- table(hsb$prog)
(prog.p <- prop.table(prog.tab))
#odds of general vs. academic
(odds.general <- 0.225 / 0.525)
#odds of vocational vs. academic
(odds.vocational <- 0.250 / 0.525)
#recover the probabilities from the odds
p.academic   <- 1 / (1 + odds.general + odds.vocational)
p.general    <- p.academic * odds.general
p.vocational <- p.academic * odds.vocational
cbind(p.academic = p.academic, p.general = p.general, p.vocational = p.vocational)

#calculating the proportion of honors enrollment within each group of female and male
#table gives frequencies
(tab_honors_female <- table(hsb$honors, hsb$female))
#prop.table on the frequency table with margin = 2 returns column-wise proportions
prop.table(tab_honors_female, margin = 2)
#odds ratio of enrollment, female vs. male, from the proportions
(odds.ratio <- (0.3211009 / 0.6788991) / (0.1978022 / 0.8021978))
#the same odds ratio using frequencies
(35 / 74) / (18 / 73)
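#a small check (added for illustration; the *.chk and p.enrolled objects are not part
#of the original workshop code): the same odds and odds ratio computed directly from
#the stored tables, so no proportions need to be retyped by hand
(odds.general.chk    <- prog.p["general"]  / prog.p["academic"])
(odds.vocational.chk <- prog.p["vocation"] / prog.p["academic"])
#proportion enrolled = 1 - proportion "not enrolled" within each column (sex)
p.enrolled <- 1 - prop.table(tab_honors_female, margin = 2)["not enrolled", ]
(odds.ratio.chk <- (p.enrolled["female"] / (1 - p.enrolled["female"])) /
                   (p.enrolled["male"]   / (1 - p.enrolled["male"])))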
#3 x 2 table of program vs. female
table(hsb$prog, hsb$female)
#odds ratio using frequencies
(odds.ratio <- (24 / 58) / (21 / 47))

#to estimate the coefficients of the model we use glm with
#the option family = binomial(link = "logit")
logit.m <- glm(honors ~ math + female,
               family = binomial(link = "logit"), data = hsb)
summary(logit.m)
coef(summary(logit.m))
#confidence intervals for the coefficients
confint(logit.m)
#exponentiate the coefficients
exp(coef(logit.m))
#exponentiate the coefficients and column-bind them with the exponentiated CIs
cbind(OR = exp(coef(logit.m)), exp(confint(logit.m)))

#summary of the outcome prog
(tab.prog <- table(hsb$prog))
prop.table(tab.prog)

#running a multinomial regression model of prog on socioeconomic status and write
library(nnet)
m.multinom <- multinom(prog ~ ses + write, data = hsb)
#extracting the results of the model using summary
(m.results <- summary(m.multinom, digits = 3))
#exponentiation of the coefficients
exp(coef(m.multinom))
#calculating z-scores by dividing each estimated coefficient by its standard error
z <- m.results$coefficients / m.results$standard.errors
#using pnorm we get the two-tailed probability of the normal distribution for each z value
p <- (1 - pnorm(abs(z), 0, 1)) * 2
#print the z-scores and p-values
print(z, digits = 4)
print(p, digits = 4)

#likelihood ratio test for ses:
#update the model to exclude ses
m.multinom.r <- update(m.multinom, . ~ . - ses)
#anova of the restricted model vs. the full model
anova(m.multinom.r, m.multinom)

#fitted gives the fitted probabilities for the observations; head reports the first 6 rows
head(pp <- fitted(m.multinom))

#new data with all levels of ses and write held at the mean of write
newdata.ses <- data.frame(ses = c("low", "middle", "high"),
                          write = mean(hsb$write))
#column-bind the new data with the predicted probabilities
cbind(newdata.ses,
      probability = predict(m.multinom, newdata = newdata.ses, "probs"))

#check the range of write
range(hsb$write)
#in the new data, for write from 30 to 70, we replicate the levels of ses
newdata.write <- data.frame(ses = rep(c("low", "middle", "high"), each = 41),
                            write = rep(c(30:70), 3))
#predict gives the predicted probabilities for each row of newdata
pp.write <- cbind(newdata.write,
                  predict(m.multinom, newdata = newdata.write, type = "probs", se = TRUE))
#first few rows
head(pp.write)
#calculate the mean probabilities within each level of ses
by(pp.write[, 3:5], pp.write$ses, colMeans)

#pivot_longer from package tidyr reshapes the wide format to long format
library(tidyr)
long.pp <- pp.write %>%
  pivot_longer(cols = c("academic", "general", "vocation"),
               names_to = "Program", values_to = "Probability")
#the first few rows will be
head(long.pp)

library(ggplot2)
ggplot(long.pp, aes(x = write, y = Probability, colour = ses)) +
  geom_line(lwd = 1.2) +
  facet_grid(Program ~ ., scales = "free")

#package mlogit
library(mlogit)
#reshapes the data into a data frame with an alternative index (dfidx)
hsb.choice <- mlogit.data(hsb, shape = "wide", choice = "prog")
class(hsb.choice)
#each row is expanded to 3 rows, one per alternative
head(data.frame(hsb.choice))
#running multinomial logistic regression using mlogit
m.mlogit <- mlogit(prog ~ 0 | ses + write, data = hsb.choice)
#extract the results using summary
summary(m.mlogit)

#exercise solution
#first create variable c.math (math centered at its mean)
hsb$c.math <- scale(hsb$math, center = TRUE, scale = FALSE)
#run the multinomial regression with nnet::multinom()
ex.multinom <- multinom(ses ~ c.math + female + c.math:female, data = hsb)
(result <- summary(ex.multinom))
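#sketch (added for illustration, not in the original script): Wald z statistics and
#two-sided p-values for the exercise model, using the same steps applied to m.multinom above
ex.z <- result$coefficients / result$standard.errors
ex.p <- (1 - pnorm(abs(ex.z), 0, 1)) * 2
print(ex.z, digits = 4)
print(ex.p, digits = 4)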
#using mlogit we first need to reshape the data
hsb.choice <- mlogit.data(hsb, shape = "wide", choice = "ses")
m.mlogit <- mlogit(ses ~ 0 | c.math * female, data = hsb.choice)
#extract the results
summary(m.mlogit)

############# ordinal regression #############
#reading the data into R
dat <- read.csv("https://stats.idre.ucla.edu/stat/data/ologit.csv")
#structure of the data
str(dat)
#transform apply to an ordered factor
dat$apply <- factor(dat$apply,
                    labels = c("unlikely", "somewhat likely", "very likely"),
                    ordered = TRUE)
str(dat$apply)
#transform pared to a factor
dat$pared <- factor(dat$pared)
#checking the levels
levels(dat$pared)
#change the labels to "not attend" and "attend"
dat$pared <- factor(dat$pared, labels = c("not attend", "attend"))
#structure of pared
str(dat$pared)

#loading library ordinal
library(ordinal)
#running clm for a cumulative link (logit) model
ordinal.clm <- clm(apply ~ pared, data = dat)
#extract the results
summary(ordinal.clm)
(ci <- confint(ordinal.clm))   #the default method gives profiled CIs
confint.default(ordinal.clm)   #CIs assuming normality
#OR (on the same scale as the CI) together with the reciprocal of the odds ratio
exp(cbind(OR = coef(ordinal.clm)[3], inv.OR = -coef(ordinal.clm)[3], ci))

#loading library MASS
library(MASS)
#running the model with polr from package MASS
m.plr <- polr(apply ~ pared, data = dat)
summary(m.plr)
#store the coefficient table
(ctable <- coef(summary(m.plr)))
#calculate and store the p-values
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
#combined table
(ctable <- cbind(ctable, "p value" = p))

#likelihood ratio test for the proportional odds (parallel lines) assumption:
#ordinal logistic regression relaxing the proportional odds assumption for pared
ordinal.clm.ur <- clm(apply ~ 1, nominal = ~ pared, data = dat)
summary(ordinal.clm.ur)
anova(ordinal.clm.ur, ordinal.clm)
#Brant test of the proportional odds assumption on the polr object
library(brant)
brant::brant(m.plr)

#Exercise: ordinal logistic regression of `apply`
#on `pared`, `public` and `gpa`
#first we make variable public a factor
dat$public <- factor(dat$public, labels = c("private", "public"))
#run the clm model
m.clm <- clm(apply ~ pared + public + gpa, data = dat)
summary(m.clm)
#getting the CIs
(ci <- confint(m.clm))
#exponentiate to get the ORs (and their reciprocals) with the CIs
exp(cbind(OR = coef(m.clm)[3:5], inv.OR = -coef(m.clm)[3:5], ci))

#proportional odds assumption
#re-run the model with the proportional odds assumption relaxed
m.clm.relaxed <- clm(apply ~ 1, nominal = ~ pared + public + gpa, data = dat)
summary(m.clm.relaxed)
#likelihood ratio test with anova
anova(m.clm, m.clm.relaxed)
#Brant test of the proportional odds assumption on a polr object
#run the model using polr
m1.plr <- polr(apply ~ pared + public + gpa, data = dat)
#run the Brant test
brant::brant(m1.plr)

######################
#plot ordinal
#creating new data
newdat <- data.frame(
  pared  = rep(levels(dat$pared), 200),
  public = rep(levels(dat$public), each = 200),
  gpa    = rep(seq(from = 1.9, to = 4, length.out = 100), 4))
#using the function predict
newdat <- cbind(newdat, predict(m.clm, newdat, type = "prob"))
head(newdat)

newdat.long <- newdat %>%
  pivot_longer(cols = c(4, 5, 6),
               names_to = "apply", values_to = "Probability",
               names_pattern = "....(.*)")
head(newdat.long)

#plot of predicted probability vs. gpa, separated by parent education and school type
ggplot(newdat.long, aes(x = gpa, y = Probability, colour = apply)) +
  geom_line() +
  facet_grid(pared ~ public, labeller = "label_both")
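#closing sketch (added for illustration; not part of the original workshop code):
#reconstruct the predicted probabilities of the simple pared-only model by hand from
#its thresholds and slope, to make the cumulative-logit formula
#  P(apply <= j | x) = plogis(theta_j - x * beta)
#concrete; theta1, theta2 and eta are ad hoc names
theta1 <- coef(ordinal.clm)[1]   #threshold: unlikely | somewhat likely
theta2 <- coef(ordinal.clm)[2]   #threshold: somewhat likely | very likely
eta    <- coef(ordinal.clm)[3]   #effect of pared = "attend"
p.hand <- c(plogis(theta1 - eta),
            plogis(theta2 - eta) - plogis(theta1 - eta),
            1 - plogis(theta2 - eta))
p.hand
#this should match the model-based prediction for a student whose parent attended college
predict(ordinal.clm,
        newdata = data.frame(pared = factor("attend", levels = levels(dat$pared))),
        type = "prob")$fit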