#reading the data elemapi2v2.csv from the OARC website
library(ggplot2)
library(COUNT)
dat <- read.csv(
  "https://stats.oarc.ucla.edu/wp-content/uploads/2019/02/elemapi2v2.csv")
head(dat[, c("api00", "enroll")])

#linear model of api00 vs. enroll
m1 <- lm(api00 ~ enroll, data = dat)
summary(m1)

#scatter plot
plot(api00 ~ enroll, data = dat,
     main = "Scatter plot of school performance vs. enrollment",
     ylab = "api00 (performance score in 2000)",
     xlab = "enroll",
     xlim = c(0, 1000))
abline(m1, col = "blue")

#do not run *******************************************
# dat1 <- dat[50:100, ]
# attach(dat1)
# plot(api00 ~ enroll, data = dat1)
# abline(m1, col = "blue")
# abline(h = mean(dat1$api00), col = "black")
# fit <- m1$fitted.values
# i <- 47
# arrows(enroll[i], api00[i], enroll[i], fit[i],
#        col = "blue", length = 0.05, angle = 30, code = 3)
# arrows(enroll[i], fit[i], enroll[i], mean(api00),
#        col = "red", length = 0.05, angle = 30, code = 3)
# text(860, 530, expression(y[i] - hat(y)[i]), col = "blue")
# text(860, 600, expression(hat(y)[i] - bar(y)), col = "red")
# arrows(enroll[i] - 20, api00[i], enroll[i] - 20, mean(api00),
#        col = "black", length = 0.05, angle = 30, code = 3)
# text(750, 570, expression(y[i] - bar(y)))
# text(850, 680, expression(paste(y, " = ", bar(y))))
# text(440, 720, col = "blue",
#      expression(paste(hat(y), " = ", hat(beta)[0], " + ", hat(beta)[1], x)))
# i <- 6
# arrows(enroll[i], api00[i], enroll[i], fit[i],
#        col = "blue", length = 0.05, angle = 30, code = 3)
# text(250, 700, expression(y[i] - hat(y)[i]), col = "blue")
#********************************************************************

#linear regression model diagnostics (optional)
# opar <- par(mfrow = c(2, 2),
#             mar = c(4.1, 4.1, 2.1, 1.1))
# plot(m1)
# par(opar)

#The graph below shows the plot of the PDF of a normal distribution
# ggplot(data = data.frame(x = c(2 - 3 * 1, 2 + 3 * 1)), aes(x)) +
#   ggtitle("PDF of normal(mean = 2, sd = 1)") +
#   stat_function(fun = dnorm, n = 101, args = list(mean = 2, sd = 1),
#                 colour = "dodgerblue") +
#   xlab("x") + ylab("f(x)")

#Bernoulli
# databernol <- data.frame(x = c(0, 1), p = c(.7, .3))
# ggplot(data = databernol, aes(x, p)) +
#   ggtitle("PMF of Bernoulli(p = 0.3)") +
#   geom_bar(stat = "identity", width = 1, colour = "dodgerblue", fill = "white") +
#   xlab("x") +
#   ylab("P(x)")

#binomial
# x <- seq(0, 10, 1)
# pdfbin <- data.frame(x = x, p = dbinom(x, 10, 0.2))
# ggplot(pdfbin, aes(x, p)) +
#   ggtitle("PMF of Binomial(n = 10, p = 0.2)") +
#   geom_bar(stat = "identity", width = 1, colour = "dodgerblue", fill = "white") +
#   xlab("x") +
#   ylab("P(x)") +
#   scale_x_continuous(breaks = x)

#reading the admission data from the OARC stats website:
mydata <- read.csv("https://stats.oarc.ucla.edu/stat/data/binary.csv")
#summary statistics of the variables
summary(mydata)
#standard deviations of the variables
sapply(mydata, sd)

#lm model for admission (linear probability model)
# lm.admit <- lm(admit ~ gre, data = mydata)
# summary(lm.admit)
# opar <- par(mfrow = c(2, 2),
#             mar = c(4.1, 4.1, 2.1, 1.1))
# plot(lm.admit)
# par(opar)
# phat <- lm.admit$fitted.values
# plot(phat * (1 - phat) ~ phat)
# plot(admit ~ gre, data = mydata, xlim = c(0, 800), ylim = c(-.2, 1))
# abline(lm.admit)

#to estimate the coefficients of the model we use glm() with
#the option family = binomial(link = "logit")
glm.mod <- glm(admit ~ gre,
               family = binomial(link = "logit"),
               data = mydata)
summary(glm.mod)
coef(summary(glm.mod))
confint(glm.mod)
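#A quick sketch (an addition, not from the original workshop code): the
#logit coefficients above are on the log-odds scale, so exponentiating
#them gives odds ratios, and predict(..., type = "response") converts
#the linear predictor to probabilities. The gre values 200, 500, and 800
#below are arbitrary illustration points.
exp(coef(glm.mod))      # odds ratio for a one-unit increase in gre
exp(confint(glm.mod))   # confidence interval on the odds-ratio scale
predict(glm.mod,
        newdata = data.frame(gre = c(200, 500, 800)),
        type = "response")  # predicted admission probabilities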
#----------------------------------------------------------------------------
##### Exercise: using the same data set, run a logistic regression of
##### admit predicted by gpa.
mydata <- read.csv("https://stats.oarc.ucla.edu/stat/data/binary.csv")
glm.mod.2 <- glm(admit ~ gpa,
                 data = mydata,
                 family = binomial(link = "logit"))
class(glm.mod.2)
coef(summary(glm.mod.2))[2, 1]  # the gpa coefficient, about 1.0511
exp(1.0511)        # odds ratio for a one-unit increase in gpa
exp(1.0511 * .2)   # odds ratio for a 0.2-unit increase in gpa
exp(1.0511) ^ .2   # the same quantity, computed as a power
#---------------------------------------------------------------------------

#logistic regression of admit on gre + gpa
glm.mod.3 <- glm(admit ~ gre + gpa,
                 family = binomial(link = "logit"),
                 data = mydata)
summary(glm.mod.3)
coef(summary(glm.mod.3))
exp(coef(summary(glm.mod.3))[, 1])
confint(glm.mod.3)

#logistic regression with the interaction gre * gpa
glm.mod.4 <- glm(admit ~ gre * gpa,
                 family = binomial(link = "logit"),
                 data = mydata)
summary(glm.mod.4)
coef(summary(glm.mod.4))
exp(coef(summary(glm.mod.4))[, 1])
confint(glm.mod.4)

#the simple odds ratio of gpa at gre = 500
exp(3.367918022 + 500 * -0.004326605)
# or
29.01805 * 0.9956827 ^ 500

#Poisson PMF
# x <- seq(0, 15, 1)
# pdfpois <- data.frame(x = x, p = dpois(x, 2))
# ggplot(pdfpois, aes(x, p)) +
#   ggtitle("PMF of Poisson(mu = 2)") +
#   geom_bar(stat = "identity", width = 1, colour = "dodgerblue", fill = "white") +
#   xlab("x") +
#   ylab("P(x)") +
#   scale_x_continuous(breaks = x)

#Poisson(2) probabilities for counts 0 through 8 (rounded in prob below)
dpois(0:8, lambda = 2)
prob <- c(0.1353, 0.2707, 0.2707, 0.1804, 0.0902,
          0.0361, 0.0120, 0.0034, 0.0009)
domain <- seq(0, 8, 1)
set.seed(1001)
y1 <- sample(x = domain, size = 20, replace = TRUE, prob = prob)
y1
mean(y1)
var(y1)
y2 <- c(y1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        5, 6, 8, 5, 6, 7, 4, 3, 6, 10, 15, 20)
mean(y2)
var(y2)
par(mfrow = c(1, 2))
hist(y1, breaks = 10)
hist(y2, breaks = 10)

#simulated Poisson draws: mean and variance are approximately equal
y <- rpois(1000, 2)
mean(y)
var(y)
hist(y, breaks = 15)

# Sunny day vs. rainy day accident data:
y <- c(rpois(20, lambda = 2), rpois(20, lambda = 5))
x <- rep(c(0, 1), each = 20)
print(data.frame(y = y, x = x))
accidents_model <- glm(y ~ x, family = poisson(link = "log"))
summary(accidents_model)
mu0 <- mean(y[x == 0])
mu1 <- mean(y[x == 1])
c(mu0, mu1, mu1 / mu0)

library(COUNT)
data("rwm1984")
head(rwm1984)
summary(rwm1984)

#centering the age predictor
rwm1984$cage <- rwm1984$age - mean(rwm1984$age)

#running the Poisson glm
pois_m <- glm(docvis ~ outwork + cage,
              family = poisson(link = "log"),
              data = rwm1984)
#extract the results with summary()
summary(pois_m)
exp(coef(pois_m))                             # incidence rate ratios
exp(coef(pois_m)) * sqrt(diag(vcov(pois_m)))  # delta-method SEs
exp(confint.default(pois_m))

#fit statistics for the Poisson model
pr <- sum(residuals(pois_m, type = "pearson") ^ 2)
pr / pois_m$df.residual          # Pearson dispersion statistic
pois_m$aic / (pois_m$df.null + 1)  # AIC per observation
COUNT::modelfit(pois_m)

#marginal effect at the mean
beta <- coef(pois_m)
mout <- mean(rwm1984$outwork)
mcage <- mean(rwm1984$cage)
xb <- sum(beta * c(1, mout, mcage))
dfdxb <- exp(xb) * beta[3]
dfdxb

#offset and exposure
data("fasttrakg")
head(fasttrakg)
summary(fast <- glm(die ~ anterior + hcabg + factor(killip),
                    family = poisson,
                    offset = log(cases),
                    data = fasttrakg))
exp(coef(fast))
exp(coef(fast)) * sqrt(diag(vcov(fast)))  # delta-method SEs
exp(confint.default(fast))
modelfit(fast)

#deviance goodness-of-fit test
dev <- deviance(pois_m)
df <- df.residual(pois_m)
p_value <- 1 - pchisq(dev, df)
print(matrix(c("Deviance GOF ", " ",
               "D =", round(dev, 4),
               "df = ", df,
               "P-value = ", p_value),
             byrow = TRUE, ncol = 2))

### 3.11 Negative binomial examples
# Poisson with default standard errors (variance equals mean), based on MLE
poiss_mle <- glm(docvis ~ outwork + age,
                 data = rwm1984,
                 family = poisson())
print(summary(poiss_mle), digits = 3)
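# Sketch (an addition, not from the original workshop code): before moving
# to robust and negative binomial standard errors, it helps to quantify the
# overdispersion directly. A Pearson dispersion statistic well above 1
# suggests the equidispersion assumption of the Poisson model fails.
disp <- sum(residuals(poiss_mle, type = "pearson") ^ 2) /
  df.residual(poiss_mle)
disp  # values much larger than 1 indicate overdispersion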
# Robust sandwich standard errors based on poiss_mle
library(sandwich)
#this gives the robust covariance matrix of the estimated model
cov.robust <- vcovHC(poiss_mle, type = "HC0")
#the diagonal holds the variances
se.robust <- sqrt(diag(cov.robust))
coeffs <- coef(poiss_mle)
z_.025 <- qnorm(.975)
t.robust <- coeffs / se.robust
poiss_mle_robust <- cbind(coeffs, se.robust, t.robust,
                          pvalue = round(2 * (1 - pnorm(abs(coeffs / se.robust))), 5),
                          lower = coeffs - z_.025 * se.robust,
                          upper = coeffs + z_.025 * se.robust)
print(poiss_mle_robust, digits = 3)

# Poisson bootstrap standard errors (variance equals mean)
# install.packages("boot")
library(boot)
boot.poiss <- function(data, indices) {
  data <- data[indices, ]
  model <- glm(docvis ~ outwork + age, family = poisson(), data = data)
  coefficients(model)
}
set.seed(10101)
# To speed things up we use only 400 bootstrap replications.
summary.poissboot <- boot(rwm1984, boot.poiss, 400)
print(summary.poissboot, digits = 3)

# Poisson with NB1 standard errors (assumes the variance is a multiple of
# the mean: w = theta * mu, or w = mu * (1 + alpha))
# also called Poisson quasi-MLE
poiss_qmle <- glm(docvis ~ outwork + age,
                  data = rwm1984,
                  family = quasipoisson())
print(summary(poiss_qmle), digits = 4)

# MASS fits just NB2
# install.packages("MASS")
library(MASS)
# Note: MASS reports the overdispersion parameter as 1/alpha, not alpha.
# In this example R reports 0.9285, and 1/0.9285 = 1.077.
# NB2 with default standard errors
NB2.MASS <- glm.nb(docvis ~ outwork + age, data = rwm1984)
print(summary(NB2.MASS), digits = 3)

##Weighted least squares
# set.seed(101)
# x <- sort(rep(seq(1, 3.5, .1), 10))
# Ey <- 220 - 60 * x
# y <- rpois(length(Ey), Ey)
# m <- lm(y ~ x)
# plot(x, y, xlab = "Price", ylab = "Number Sold"); abline(m)
# plot(predict(m), resid(m), xlab = "Predicted values", ylab = "Residuals")
# abline(h = 0)
# W <- 1 / predict(m)
# Wm <- lm(y ~ x, weights = W)
# plot(predict(Wm), sqrt(W) * resid(Wm)); abline(h = 0)
# glp.m <- glm(y ~ x, family = poisson(link = "identity"))
# summary(m)
# summary(Wm)
# summary(glp.m)

##IRLS (iteratively reweighted least squares) for the logistic model by hand
# y <- mydata$admit
# x <- mydata$gre
# mu <- rep(mean(y), nrow(mydata))        # initialize mu
# eta <- log(mu / (1 - mu))               # initialize eta
# for (i in 1:10) {                       # loop for 10 iterations
#   w <- mu * (1 - mu)                    # weight = variance
#   z <- eta + (y - mu) / (mu * (1 - mu)) # working response: eta + standardized error
#   mod <- lm(z ~ x, weights = w)         # weighted regression
#   eta <- mod$fit                        # linear predictor
#   mu <- 1 / (1 + exp(-eta))             # fitted values
#   cat(i, coef(mod), "\n")               # print the iteration log
# }
# coef(summary(mod))
# confint(mod)
# glm.mod <- glm(admit ~ gre, family = binomial, data = mydata)
# coef(summary(glm.mod))
# confint(glm.mod)
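# Sketch (an addition, not from the original workshop code): a quick way to
# compare the Poisson and NB2 fits estimated above. AIC favors the model
# with the smaller value; the likelihood-ratio test of alpha = 0 is on the
# boundary of the parameter space, so the usual chi-square p-value is
# conservative and is often halved.
AIC(poiss_mle, NB2.MASS)
lr <- as.numeric(2 * (logLik(NB2.MASS) - logLik(poiss_mle)))
pchisq(lr, df = 1, lower.tail = FALSE) / 2  # boundary-adjusted p-value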