# Install and load required packages library(tidyverse) # For data manipulation and visualization library(AER) # Applied Econometrics with R library(pscl) # Political Science Computational Laboratory library(MASS) # Modern Applied Statistics with S library(sjPlot) # For visualizing regression models library(lmtest) # For hypothesis testing in linear models library(emmeans) # For estimated marginal means library(performance) # For model performance evaluation # Set the random seed set.seed(100) # Sample size n <- 100 # Draw Poisson random variables count.p <- rpois(n = n, lambda = 2) # Frequency table of counts table(count.p) # Sample mean and sample variance mean(count.p) var(count.p) # Create a barplot of the frequency table barplot(table(count.p), col = "lightblue", xlab = "Count", ylab = "Frequency") # Define values for y and corresponding probabilities for Poisson distribution pmfpois <- data.frame(y = seq(0, 15, 1), p = dpois(seq(0, 15, 1), lambda = 2)) # Create the PMF plot using ggplot ggplot(pmfpois, aes(y, p)) + ggtitle("PMF of Poisson (lambda = 2)") + geom_bar(stat = "identity", width = 1, colour = "dodgerblue", fill = "white") + xlab("y") + ylab("P(y)") + scale_x_continuous(breaks = seq(0, 15, 1)) # Drawing negative binomial random variables count.nb <- rnbinom(n = n, size = 1, mu = 2) # Frequency table of counts table(count.nb) # Sample mean and sample variance mean(count.nb) var(count.nb) # Create a barplot of the frequency table barplot(table(count.nb), col = "lightblue", xlab = "Count", ylab = "Frequency") # Define values for y and corresponding probabilities for Negative Binomial distribution pmfnb <- data.frame(y = seq(0, 15, 1), p = dnbinom(seq(0, 15, 1), size = 1, mu = 2)) # Create the PMF plot using ggplot ggplot(pmfnb, aes(y, p)) + ggtitle("PMF of Negative Binomial (mu = 2, size = 1)") + geom_bar(stat = "identity", width = 1, colour = "dodgerblue", fill = "white") + xlab("y") + ylab("P(y)") + scale_x_continuous(breaks = seq(0, 15, 1)) # Load the data from AER package data("NMES1988") # Select variables used in the workshop dat <- NMES1988[, c(1, 6:8, 13, 15, 18)] # Display the first few rows of the dataset head(dat) # Show the structure of the dataset str(dat) # Assign the counts to a variable counts <- dat$visits # Display summary statistics of the counts summary(counts) # Estimate mu from the sample mean mu <- mean(counts) mu # Estimate variance from the sample variance s2 <- var(counts) s2 # Create a bar plot of the frequency table plot(table(counts), col = "blue", ylab = "Frequency") # Sample count data counts <- dat$visits # Calculate the parameters for Poisson distribution mu <- mean(counts) # Estimate mu from the sample mean x <- 0:max(counts) # Values for the x-axis (counts) # Calculate the PMF of the Poisson distribution pmf_poisson <- dpois(x, lambda = mu) # Calculate the PMF of the Negative Binomial distribution theta <- 1 pmf_neg_binomial <- dnbinom(x, size = theta, mu = mu) # Create a data frame for plotting df <- data.frame(x = x, pmf_poisson = pmf_poisson, pmf_neg_binomial = pmf_neg_binomial) # Plot histogram of count data, Poisson PMF, and Negative Binomial PMF ggplot(data.frame(counts), aes(x = counts)) + geom_histogram(binwidth = 1, aes(fill = "Histogram"), color = "black", alpha = 0.5) + geom_line(data = df, aes(x = x, y = pmf_poisson * length(counts), color = "Poisson PMF"), linewidth = 1) + geom_line(data = df, aes(x = x, y = pmf_neg_binomial * length(counts), color = "Negative Binomial PMF"), linewidth = 1, linetype = "dashed") + labs(title = "Histogram of Count Data and PMFs", x = "Counts", y = "Frequency") + scale_fill_manual(values = "lightblue") + scale_color_manual(values = c("blue", "red")) + guides(fill = guide_legend(title = "Distribution"), color = guide_legend(title = "Distribution")) + theme_minimal() # Create a formula formula <- visits ~ hospital + health + chronic + gender + school + insurance # Fit Poisson model using glm function from stats m.pois <- glm(formula = formula, family = poisson(link = "log"), data = dat) summary(m.pois) # Using emmeans to extract coefficients coefficient <- coefficients(m.pois) CI <- confint(m.pois) colnames(CI) <- c("L", "U") # Calculate Rate Ratios RR <- exp(coefficient) CI_RR <- exp(confint(m.pois)) colnames(CI_RR) <- c("L_RR", "U_RR") # Combine coefficients, confidence intervals, Rate Ratios, and their confidence intervals print(round(cbind(coefficient, CI, RR, CI_RR), 4)) # Show model summary using sjPlot library(sjPlot) tab_model(m.pois, show.dev = TRUE) # Using dispersiontest from package AER dispersiontest(m.pois) # Using check_overdispersion from package performance library(performance) check_overdispersion(m.pois) # Compute sandwich covariance matrix from package lmtest coeftest(m.pois, vcov = sandwich) # Fit NB model using glm.nb function from MASS m.nb <- glm.nb(formula, data = dat, link = log) summary(m.nb) # Compare AIC between NB and Poisson models c(AIC_NB = m.nb$aic, AIC_Pois = m.pois$aic) # Check for overdispersion in NB model check_overdispersion(m.nb) # Calculate observed and expected number of zero counts obs.zero <- sum(dat$visits == 0) Ezero.pois <- round(sum(dpois(0,lambda = fitted(m.pois)))) Ezero.nb <- round(sum(dnbinom(0,mu = fitted(m.nb), size = m.nb$theta))) # Calculate ratios of observed to expected zeros obs.zero / Ezero.pois obs.zero / Ezero.nb # Check for zero inflation in Poisson and NB models check_zeroinflation(m.pois) check_zeroinflation(m.nb) # Fit zero-inflated Poisson and NB models m.zeroinf.pois <- zeroinfl( visits ~ hospital + health + chronic + gender + school + insurance | chronic + insurance + school + gender, data = dat, dist = "poisson") m.zeroinf.nb <- zeroinfl( visits ~ hospital + health + chronic + gender + school + insurance | chronic + insurance + school + gender, data = dat, dist = "negbin") summary(m.zeroinf.pois) summary(m.zeroinf.nb) # Show class of model objects class(m.zeroinf.pois) class(m.zeroinf.nb) # Extract coefficients for count and zero components coef(m.zeroinf.nb, model = "count") coef(m.zeroinf.nb, model = "zero") # Compute variance of count component diag(vcov(m.zeroinf.nb, model = "count")) # Predict response, count, zero, and probability of zero predict.data <- data.frame( response = predict(m.zeroinf.nb, type = "response" ), count = predict(m.zeroinf.nb, type = "count"), zero = predict(m.zeroinf.nb, type = "zero" ), prob0 = predict(m.zeroinf.nb, type = "prob")[, 1]) # Calculate predictions directly by computing linear predictions predict.data <- predict.data %>% mutate( xb.count = model.matrix(m.zeroinf.nb,"count") %*% coef(m.zeroinf.nb, "count"), xb.zero = model.matrix(m.zeroinf.nb, "zero") %*% coef(m.zeroinf.nb, "zero"), mu = exp(xb.count), pi = 1 / (1 + exp(-xb.zero)), p0 = pi + (1 - pi) * dnbinom(0, mu = mu, size = m.zeroinf.nb$theta) ) round(head(predict.data), 5) # Show model summaries using sjPlot tab_model(m.zeroinf.pois, m.zeroinf.nb, dv.labels = c("Zero-Inflated Poisson", "Zero-Inflated_NB")) # Predicted probability of the excess zeros for an average individual mean(dat$chronic) (predicted0 <- predict( m.zeroinf.nb, newdata = data.frame( hospital = mean(dat$hospital), health = "average", chronic = mean(dat$chronic), gender = "female", school = mean(dat$school), insurance = "no" ), type = "zero" )) # Calculate the odds (odds <- predicted0 / (1 - predicted0)) # Calculate odds changes odds * 0.2872938 # Create new data for predictions new.data <- data.frame( hospital = mean(dat$hospital), health = "poor", chronic = seq(0, 8, 0.1), gender = "female", school = mean(dat$school), insurance = "no" ) # Predicted response new.data$response <- predict(m.zeroinf.nb, newdata = new.data, type ="response") # Predicted count only in count component new.data$count <- predict(m.zeroinf.nb, newdata = new.data, type = "count") # Predicted probability of excess zero new.data$pi <- predict(m.zeroinf.nb, newdata = new.data, type = "zero") # Linear predictor new.data$linear.count <- log(new.data$count) # Plot predicted response against chronic variable ggplot(new.data, aes(x = chronic, y = response)) + geom_line(color = "blue", size = 1) # Plot predicted count against chronic variable ggplot(new.data, aes(x = chronic, y = count)) + geom_line(color = "lightblue", size = 1) # Plot both predicted response and count together new.data %>% pivot_longer(cols = c(count, response), names_to = "type", values_to = "visits") %>% ggplot(aes(x = chronic, y = visits, color = type)) + geom_line(size = 1) # Plot linear predictor ggplot(new.data, aes(x = chronic, y = linear.count)) + geom_line(color = "black", size = 1.5) + ylab("linear predictor count") # Plot probability of excess zero ggplot(new.data, aes(x = chronic, y = pi)) + geom_line(color = "red", size = 1.5) + ylab("probability of excess zero") # Use emmeans library(emmeans) emmip(m.zeroinf.nb, ~ chronic, at = list( hospital = mean(dat$hospital), health = "poor", chronic = 0:8, gender = "female", school = mean(dat$school), insurance = "no" ), lin.pred = FALSE, mode ="count", CIs = TRUE) + ylab("count") # Fit hurdle models m.hurdle.pois <- hurdle( visits ~ hospital + health + chronic + gender + school + insurance, data = dat, dist = "poisson") m.hurdle.nb <- hurdle( visits ~ hospital + health + chronic + gender + school + insurance, data = dat, dist = "negbin") summary(m.hurdle.pois) summary(m.hurdle.nb) #setting all positive count equal to one dat <- dat %>% mutate(visits.1 = ifelse(visits >= 1, 1, 0)) #run a logistic regression for visit1 m0 <- glm(visits.1 ~ hospital + health + chronic + gender + school + insurance, data = dat, family = binomial) summary(m0) # Show class of model objects class(m.hurdle.pois) class(m.hurdle.nb) # Extract coefficients for count and zero components coef(m.hurdle.nb, model = "count") coef(m.hurdle.nb, model = "zero") # Compute variance of count component diag(vcov(m.hurdle.nb, model = "count")) # Test no zero hurdle m.hurdle.nb.nb <- hurdle( visits ~ hospital + health + chronic + gender + school + insurance, data = dat, dist = "negbin", zero.dist = "negbin") hurdletest(m.hurdle.nb.nb) # Predict response, count, zero, and probability of zero predict.data <- data.frame( response = predict(m.hurdle.nb, type = "response" ), count = predict(m.hurdle.nb, type = "count", ), zero = predict(m.hurdle.nb, type = "zero" ), prob0 = predict(m.hurdle.nb, type = "prob")[,1] ) # Compute the prediction from linear prediction predict.data <- predict.data %>% mutate( xb.count = model.matrix(m.hurdle.nb,"count") %*% coef(m.hurdle.nb, "count"), xb.zero = model.matrix(m.hurdle.nb, "zero") %*% coef(m.hurdle.nb, "zero"), mu = exp(xb.count), f0 = dnbinom(0, mu = mu, size = m.hurdle.nb$theta), pi = 1 / (1 + exp(-xb.zero)), p0 = (1 - pi), zero2 = pi / (1 - f0), response3 <- mu * pi / (1 - f0) ) # Show the first few rows of the prediction data head(predict.data) # Use sjPlot to show model summaries tab_model(m.hurdle.pois, m.hurdle.nb, m0, dv.labels =c("Hurdle Poisson", "Hurdle NB", "logistic model")) # Plot probability of zero against chronic variable emmip(m.hurdle.nb, ~ chronic, at = list( hospital = mean(dat$hospital), health = "poor", chronic = 0:8, gender = "female", school = mean(dat$school), insurance = "no" ), lin.pred = FALSE, mode ="prob0", CIs = TRUE) + ylab("Probability of zero") # Using check_overdispersion from package performance # Compute dispersion statistics for different models Dispersion <- c( sum(residuals(m.pois, type = "pearson")^2) / m.pois$df.residual, sum(residuals(m.nb, type = "pearson")^2) / m.nb$df.residual, sum(residuals(m.zeroinf.pois, type = "pearson")^2) / m.zeroinf.pois$df.residual, sum(residuals(m.zeroinf.nb, type = "pearson")^2) / m.zeroinf.nb$df.residual, sum(residuals(m.hurdle.pois, type = "pearson")^2) / m.hurdle.pois$df.residual, sum(residuals(m.hurdle.nb, type = "pearson")^2) / m.hurdle.nb$df.residual ) # Predicted number of zeros Observed.zero <- sum(dat$visits == 0) # Calculate expected number of zeros Expected.zero <- round(c( "Pois" = sum(dpois(0, fitted(m.pois))), "NB" = sum(dnbinom(0, mu = fitted(m.nb), size = m.nb$theta)), "Zero-inflated Poisson" = sum(predict(m.zeroinf.pois, type = "prob")[,1]), "Zero-inflated NB" = sum(predict(m.zeroinf.nb, type = "prob")[,1]), "Hurdle Poisson" = sum(predict(m.hurdle.pois, type = "prob")[,1]), "Hurdle NB" = sum(predict(m.hurdle.nb, type = "prob")[,1]) )) # Calculate ratios of observed to expected zeros Ratio <- Observed.zero / Expected.zero # Combine expected zeros, ratios, and dispersion statistics cbind(Expected.zero, Ratio, Dispersion) #------------Exercise Analyzing Fish Catch Data---------------------- # Read the data from idre website ex.data <- read.csv("https://stats.idre.ucla.edu/stat/data/fish.csv") # Check the structure of the data str(ex.data) # Make the variable 'camper' a factor since it is binary ex.data$camper <- factor(ex.data$camper, labels = c("no", "yes")) # Fit the negative binomial model ex.m.nb <- glm.nb(count ~ child + camper + persons, data = ex.data) # Fit the zero-inflated negative binomial model ex.m.zeroinf.nb <- zeroinfl(count ~ child + camper | persons, dist = "negbin", data = ex.data) # Fit the hurdle negative binomial model ex.m.hurdle.nb <- hurdle(count ~ child + camper + persons | child + camper + persons, dist = "negbin", data = ex.data) # Summarize the results of each model summary(ex.m.nb) summary(ex.m.zeroinf.nb) summary(ex.m.hurdle.nb) # Use tab_model to display the models' summaries tab_model(ex.m.zeroinf.nb, ex.m.hurdle.nb, ex.m.nb)