#reading the data elemapi2v2.csv from the OARC website
library(ggplot2)
library(COUNT)
dat <- read.csv(
  "https://stats.oarc.ucla.edu/wp-content/uploads/2019/02/elemapi2v2.csv")
head(dat[, c("api00", "enroll")])

#linear model of api00 vs. enroll
m1 <- lm(api00 ~ enroll, data = dat)
summary(m1)

#scatter plot
plot(api00 ~ enroll, data = dat,
     main = "Scatter plot of school performance vs. enrollment",
     ylab = "api00 (performance score in 2000)",
     xlab = "enroll",
     xlim = c(0, 1000))
abline(m1, col = "blue")

#do not run *******************************************
# dat1 <- dat[50:100, ]
# attach(dat1)
# plot(api00 ~ enroll, data = dat1)
# abline(m1, col = "blue")
# abline(h = mean(dat1$api00), col = "black")
# fit <- m1$fitted.values
# i <- 47
# arrows(enroll[i], api00[i], enroll[i], fit[i],
#        col = "blue", length = 0.05, angle = 30, code = 3)
# arrows(enroll[i], fit[i], enroll[i], mean(api00),
#        col = "red", length = 0.05, angle = 30, code = 3)
# text(860, 530, expression(y[i] - hat(y)[i]), col = "blue")
# text(860, 600, expression(hat(y)[i] - bar(y)), col = "red")
# arrows(enroll[i] - 20, api00[i], enroll[i] - 20, mean(api00),
#        col = "black", length = 0.05, angle = 30, code = 3)
# text(750, 570, expression(y[i] - bar(y)))
# text(850, 680, expression(paste(y, " = ", bar(y))))
# text(440, 720, col = "blue",
#      expression(paste(hat(y), " = ", hat(beta)[0], " + ", hat(beta)[1], x)))
# i <- 6
# arrows(enroll[i], api00[i], enroll[i], fit[i],
#        col = "blue", length = 0.05, angle = 30, code = 3)
# text(250, 700, expression(y[i] - hat(y)[i]), col = "blue")
#********************************************************************

#linear regression model diagnostics (optional)
# opar <- par(mfrow = c(2, 2),
#             mar = c(4.1, 4.1, 2.1, 1.1))
# plot(m1)
# par(opar)

#The graph below shows the plot of the PDF of a normal distribution
# ggplot(data = data.frame(x = c(2 - 3 * 1, 2 + 3 * 1)), aes(x)) +
#   ggtitle("PDF of normal(mean = 2, sd = 1)") +
#   stat_function(fun = dnorm, n = 101, args = list(mean = 2, sd = 1),
#                 colour = "dodgerblue") +
#   xlab("x") + ylab("f(x)")

#Bernoulli
# databernol <- data.frame(x = c(0, 1), p = c(.7, .3))
# ggplot(data = databernol, aes(x, p)) +
#   ggtitle("PMF of Bernoulli(p = 0.3)") +
#   geom_bar(stat = "identity", width = 1, colour = "dodgerblue", fill = "white") +
#   xlab("x") +
#   ylab("P(x)")

#binomial
# x <- seq(0, 10, 1)
# pdfbin <- data.frame(x = x, p = dbinom(x, 10, 0.2))
# ggplot(pdfbin, aes(x, p)) +
#   ggtitle("PMF of Binomial(n = 10, p = 0.2)") +
#   geom_bar(stat = "identity", width = 1, colour = "dodgerblue", fill = "white") +
#   xlab("x") +
#   ylab("P(x)") +
#   scale_x_continuous(breaks = x)

#reading the admission data from the OARC stats website:
mydata <- read.csv("https://stats.oarc.ucla.edu/stat/data/binary.csv")
#summary statistics of the variables
summary(mydata)
#standard deviations of the variables
sapply(mydata, sd)

#lm model for admission (linear probability model)
# lm.admit <- lm(admit ~ gre, data = mydata)
# summary(lm.admit)
# opar <- par(mfrow = c(2, 2),
#             mar = c(4.1, 4.1, 2.1, 1.1))
# plot(lm.admit)
# par(opar)
# phat <- lm.admit$fitted.values
# plot(phat * (1 - phat) ~ phat)
# plot(admit ~ gre, data = mydata, xlim = c(0, 800), ylim = c(-.2, 1))
# abline(lm.admit)

#to estimate the coefficients of the model we use glm() with
#the option family = binomial(link = "logit")
glm.mod <- glm(admit ~ gre,
               family = binomial(link = "logit"),
               data = mydata)
summary(glm.mod)
coef(summary(glm.mod))
confint(glm.mod)
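#A quick sketch (an addition, not from the original workshop code): the
#logit coefficients above are on the log-odds scale, so exponentiating
#them gives odds ratios, and predict(..., type = "response") converts
#the linear predictor to probabilities. The gre values 200, 500, and 800
#below are arbitrary illustration points.
exp(coef(glm.mod))      # odds ratio for a one-unit increase in gre
exp(confint(glm.mod))   # confidence interval on the odds-ratio scale
predict(glm.mod,
        newdata = data.frame(gre = c(200, 500, 800)),
        type = "response")  # predicted admission probabilities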
#----------------------------------------------------------------------------
##### Exercise: using the same data set, run a logistic regression of
##### admit predicted by gpa.
mydata <- read.csv("https://stats.oarc.ucla.edu/stat/data/binary.csv")
glm.mod.2 <- glm(admit ~ gpa,
                 data = mydata,
                 family = binomial(link = "logit"))
class(glm.mod.2)
coef(summary(glm.mod.2))[2, 1]  # the gpa coefficient, about 1.0511
exp(1.0511)        # odds ratio for a one-unit increase in gpa
exp(1.0511 * .2)   # odds ratio for a 0.2-unit increase in gpa
exp(1.0511) ^ .2   # the same quantity, computed as a power
#---------------------------------------------------------------------------

#logistic regression of admit on gre + gpa
glm.mod.3 <- glm(admit ~ gre + gpa,
                 family = binomial(link = "logit"),
                 data = mydata)
summary(glm.mod.3)
coef(summary(glm.mod.3))
exp(coef(summary(glm.mod.3))[, 1])
confint(glm.mod.3)

#logistic regression with the interaction gre * gpa
glm.mod.4 <- glm(admit ~ gre * gpa,
                 family = binomial(link = "logit"),
                 data = mydata)
summary(glm.mod.4)
coef(summary(glm.mod.4))
exp(coef(summary(glm.mod.4))[, 1])
confint(glm.mod.4)

#the simple odds ratio of gpa at gre = 500
exp(3.367918022 + 500 * -0.004326605)
# or
29.01805 * 0.9956827 ^ 500

#Poisson PMF
# x <- seq(0, 15, 1)
# pdfpois <- data.frame(x = x, p = dpois(x, 2))
# ggplot(pdfpois, aes(x, p)) +
#   ggtitle("PMF of Poisson(mu = 2)") +
#   geom_bar(stat = "identity", width = 1, colour = "dodgerblue", fill = "white") +
#   xlab("x") +
#   ylab("P(x)") +
#   scale_x_continuous(breaks = x)

#Poisson(2) probabilities for counts 0 through 8 (rounded in prob below)
dpois(0:8, lambda = 2)
prob <- c(0.1353, 0.2707, 0.2707, 0.1804, 0.0902,
          0.0361, 0.0120, 0.0034, 0.0009)
domain <- seq(0, 8, 1)
set.seed(1001)
y1 <- sample(x = domain, size = 20, replace = TRUE, prob = prob)
y1
mean(y1)
var(y1)
y2 <- c(y1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        5, 6, 8, 5, 6, 7, 4, 3, 6, 10, 15, 20)
mean(y2)
var(y2)
par(mfrow = c(1, 2))
hist(y1, breaks = 10)
hist(y2, breaks = 10)

#simulated Poisson draws: mean and variance are approximately equal
y <- rpois(1000, 2)
mean(y)
var(y)
hist(y, breaks = 15)

# Sunny day vs. rainy day accident data:
y <- c(rpois(20, lambda = 2), rpois(20, lambda = 5))
x <- rep(c(0, 1), each = 20)
print(data.frame(y = y, x = x))
accidents_model <- glm(y ~ x, family = poisson(link = "log"))
summary(accidents_model)
mu0 <- mean(y[x == 0])
mu1 <- mean(y[x == 1])
c(mu0, mu1, mu1 / mu0)

library(COUNT)
data("rwm1984")
head(rwm1984)
summary(rwm1984)

#centering the age predictor
rwm1984$cage <- rwm1984$age - mean(rwm1984$age)

#running the Poisson glm
pois_m <- glm(docvis ~ outwork + cage,
              family = poisson(link = "log"),
              data = rwm1984)
#extract the results with summary()
summary(pois_m)
exp(coef(pois_m))                             # incidence rate ratios
exp(coef(pois_m)) * sqrt(diag(vcov(pois_m)))  # delta-method SEs
exp(confint.default(pois_m))

#fit statistics for the Poisson model
pr <- sum(residuals(pois_m, type = "pearson") ^ 2)
pr / pois_m$df.residual          # Pearson dispersion statistic
pois_m$aic / (pois_m$df.null + 1)  # AIC per observation
COUNT::modelfit(pois_m)

#marginal effect at the mean
beta <- coef(pois_m)
mout <- mean(rwm1984$outwork)
mcage <- mean(rwm1984$cage)
xb <- sum(beta * c(1, mout, mcage))
dfdxb <- exp(xb) * beta[3]
dfdxb

#offset and exposure
data("fasttrakg")
head(fasttrakg)
summary(fast <- glm(die ~ anterior + hcabg + factor(killip),
                    family = poisson,
                    offset = log(cases),
                    data = fasttrakg))
exp(coef(fast))
exp(coef(fast)) * sqrt(diag(vcov(fast)))  # delta-method SEs
exp(confint.default(fast))
modelfit(fast)

#deviance goodness-of-fit test
dev <- deviance(pois_m)
df <- df.residual(pois_m)
p_value <- 1 - pchisq(dev, df)
print(matrix(c("Deviance GOF ", " ",
               "D =", round(dev, 4),
               "df = ", df,
               "P-value = ", p_value),
             byrow = TRUE, ncol = 2))

### 3.11 Negative binomial examples
# Poisson with default standard errors (variance equals mean), based on MLE
poiss_mle <- glm(docvis ~ outwork + age,
                 data = rwm1984,
                 family = poisson())
print(summary(poiss_mle), digits = 3)
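# Sketch (an addition, not from the original workshop code): before moving
# to robust and negative binomial standard errors, it helps to quantify the
# overdispersion directly. A Pearson dispersion statistic well above 1
# suggests the equidispersion assumption of the Poisson model fails.
disp <- sum(residuals(poiss_mle, type = "pearson") ^ 2) /
  df.residual(poiss_mle)
disp  # values much larger than 1 indicate overdispersion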
# Robust sandwich standard errors based on poiss_mle
library(sandwich)
#this gives the robust covariance matrix of the estimated model
cov.robust <- vcovHC(poiss_mle, type = "HC0")
#the diagonal holds the variances
se.robust <- sqrt(diag(cov.robust))
coeffs <- coef(poiss_mle)
z_.025 <- qnorm(.975)
t.robust <- coeffs / se.robust
poiss_mle_robust <- cbind(coeffs, se.robust, t.robust,
                          pvalue = round(2 * (1 - pnorm(abs(coeffs / se.robust))), 5),
                          lower = coeffs - z_.025 * se.robust,
                          upper = coeffs + z_.025 * se.robust)
print(poiss_mle_robust, digits = 3)

# Poisson bootstrap standard errors (variance equals mean)
# install.packages("boot")
library(boot)
boot.poiss <- function(data, indices) {
  data <- data[indices, ]
  model <- glm(docvis ~ outwork + age, family = poisson(), data = data)
  coefficients(model)
}
set.seed(10101)
# To speed things up we use only 400 bootstrap replications.
summary.poissboot <- boot(rwm1984, boot.poiss, 400)
print(summary.poissboot, digits = 3)

# Poisson with NB1 standard errors (assumes the variance is a multiple of
# the mean: w = theta * mu, or w = mu * (1 + alpha))
# also called Poisson quasi-MLE
poiss_qmle <- glm(docvis ~ outwork + age,
                  data = rwm1984,
                  family = quasipoisson())
print(summary(poiss_qmle), digits = 4)

# MASS fits just NB2
# install.packages("MASS")
library(MASS)
# Note: MASS reports the overdispersion parameter as 1/alpha, not alpha.
# In this example R reports 0.9285, and 1/0.9285 = 1.077.
# NB2 with default standard errors
NB2.MASS <- glm.nb(docvis ~ outwork + age, data = rwm1984)
print(summary(NB2.MASS), digits = 3)

##Weighted least squares
# set.seed(101)
# x <- sort(rep(seq(1, 3.5, .1), 10))
# Ey <- 220 - 60 * x
# y <- rpois(length(Ey), Ey)
# m <- lm(y ~ x)
# plot(x, y, xlab = "Price", ylab = "Number Sold"); abline(m)
# plot(predict(m), resid(m), xlab = "Predicted values", ylab = "Residuals")
# abline(h = 0)
# W <- 1 / predict(m)
# Wm <- lm(y ~ x, weights = W)
# plot(predict(Wm), sqrt(W) * resid(Wm)); abline(h = 0)
# glp.m <- glm(y ~ x, family = poisson(link = "identity"))
# summary(m)
# summary(Wm)
# summary(glp.m)

##IRLS (iteratively reweighted least squares) for the logistic model by hand
# y <- mydata$admit
# x <- mydata$gre
# mu <- rep(mean(y), nrow(mydata))        # initialize mu
# eta <- log(mu / (1 - mu))               # initialize eta
# for (i in 1:10) {                       # loop for 10 iterations
#   w <- mu * (1 - mu)                    # weight = variance
#   z <- eta + (y - mu) / (mu * (1 - mu)) # working response: eta + standardized error
#   mod <- lm(z ~ x, weights = w)         # weighted regression
#   eta <- mod$fit                        # linear predictor
#   mu <- 1 / (1 + exp(-eta))             # fitted values
#   cat(i, coef(mod), "\n")               # print the iteration log
# }
# coef(summary(mod))
# confint(mod)
# glm.mod <- glm(admit ~ gre, family = binomial, data = mydata)
# coef(summary(glm.mod))
# confint(glm.mod)
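# Sketch (an addition, not from the original workshop code): a quick way to
# compare the Poisson and NB2 fits estimated above. AIC favors the model
# with the smaller value; the likelihood-ratio test of alpha = 0 is on the
# boundary of the parameter space, so the usual chi-square p-value is
# conservative and is often halved.
AIC(poiss_mle, NB2.MASS)
lr <- as.numeric(2 * (logLik(NB2.MASS) - logLik(poiss_mle)))
pchisq(lr, df = 1, lower.tail = FALSE) / 2  # boundary-adjusted p-value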