# Install and load required packages
library(tidyverse)  # For data manipulation and visualization
library(AER)        # Applied Econometrics with R
library(pscl)       # Political Science Computational Laboratory
library(MASS)       # Modern Applied Statistics with S
library(sjPlot)     # For visualizing regression models
library(lmtest)     # For hypothesis testing in linear models
library(emmeans)    # For estimated marginal means
library(performance) # For model performance evaluation

# Set the random seed
set.seed(100)

# Sample size
n <- 100

# Draw Poisson random variables
count.p <- rpois(n = n, lambda = 2)

# Frequency table of counts
table(count.p)

# Sample mean and sample variance
mean(count.p)
var(count.p)

# Create a barplot of the frequency table
barplot(table(count.p), col = "lightblue", 
        xlab = "Count", ylab = "Frequency")

# Define values for y and corresponding probabilities for Poisson distribution
pmfpois <- data.frame(y = seq(0, 15, 1), p = dpois(seq(0, 15, 1), lambda = 2))

# Create the PMF plot using ggplot
ggplot(pmfpois, aes(y, p)) + 
  ggtitle("PMF of Poisson (lambda = 2)") +
  geom_bar(stat = "identity", width = 1, colour = "dodgerblue", fill = "white") +
  xlab("y") + 
  ylab("P(y)") +
  scale_x_continuous(breaks = seq(0, 15, 1))

# Drawing negative binomial random variables
count.nb <- rnbinom(n = n, size = 1, mu = 2)

# Frequency table of counts
table(count.nb)

# Sample mean and sample variance
mean(count.nb)
var(count.nb)

# Create a barplot of the frequency table
barplot(table(count.nb), col = "lightblue", 
        xlab = "Count", ylab = "Frequency")

# Define values for y and corresponding probabilities for Negative Binomial distribution
pmfnb <- data.frame(y = seq(0, 15, 1), p = dnbinom(seq(0, 15, 1), size = 1, mu = 2))

# Create the PMF plot using ggplot
ggplot(pmfnb, aes(y, p)) + 
  ggtitle("PMF of Negative Binomial (mu = 2, size = 1)") +
  geom_bar(stat = "identity", width = 1, colour = "dodgerblue", fill = "white") +
  xlab("y") + 
  ylab("P(y)") +
  scale_x_continuous(breaks = seq(0, 15, 1))

# Load the data from AER package
data("NMES1988")
# Select variables used in the workshop
dat <- NMES1988[, c(1, 6:8, 13, 15, 18)]
# Display the first few rows of the dataset
head(dat)
# Show the structure of the dataset
str(dat)

# Assign the counts to a variable
counts <- dat$visits
# Display summary statistics of the counts
summary(counts)

# Estimate mu from the sample mean
mu <- mean(counts)
mu

# Estimate variance from the sample variance
s2 <- var(counts)
s2

# Create a bar plot of the frequency table
plot(table(counts), col = "blue", ylab = "Frequency")

# Sample count data
counts <- dat$visits

# Calculate the parameters for Poisson distribution
mu <- mean(counts)  # Estimate mu from the sample mean
x <- 0:max(counts)  # Values for the x-axis (counts)

# Calculate the PMF of the Poisson distribution
pmf_poisson <- dpois(x, lambda = mu)

# Calculate the PMF of the Negative Binomial distribution
theta <- 1 
pmf_neg_binomial <- dnbinom(x, size = theta, mu = mu)

# Create a data frame for plotting
df <- data.frame(x = x, pmf_poisson = pmf_poisson, pmf_neg_binomial = pmf_neg_binomial)

# Plot histogram of count data, Poisson PMF, and Negative Binomial PMF
ggplot(data.frame(counts), aes(x = counts)) +
  geom_histogram(binwidth = 1, aes(fill = "Histogram"), color = "black", alpha = 0.5) +
  geom_line(data = df, aes(x = x, y = pmf_poisson * length(counts), color = "Poisson PMF"), linewidth = 1) +
  geom_line(data = df, aes(x = x, y = pmf_neg_binomial * length(counts), 
                           color = "Negative Binomial PMF"), linewidth = 1, linetype = "dashed") +
  labs(title = "Histogram of Count Data and PMFs", x = "Counts", y = "Frequency") +
  scale_fill_manual(values = "lightblue") +
  scale_color_manual(values = c("blue", "red")) +
  guides(fill = guide_legend(title = "Distribution"),
         color = guide_legend(title = "Distribution")) +
  theme_minimal()

# Create a formula
formula <- visits ~ hospital + health + chronic + gender + school + insurance
# Fit Poisson model using glm function from stats
m.pois <- glm(formula = formula,
              family  = poisson(link = "log"),
              data    = dat)
summary(m.pois)

# Using emmeans to extract coefficients
coefficient <- coefficients(m.pois)
CI <- confint(m.pois)
colnames(CI) <- c("L", "U")
# Calculate Rate Ratios
RR <- exp(coefficient)
CI_RR <- exp(confint(m.pois))
colnames(CI_RR) <- c("L_RR", "U_RR")

# Combine coefficients, confidence intervals, Rate Ratios, and their confidence intervals
print(round(cbind(coefficient, CI, RR, CI_RR), 4))

# Show model summary using sjPlot
library(sjPlot)
tab_model(m.pois, show.dev = TRUE)

# Using dispersiontest from package AER
dispersiontest(m.pois)

# Using check_overdispersion from package performance
library(performance)
check_overdispersion(m.pois)

# Compute sandwich covariance matrix from package lmtest 
coeftest(m.pois, vcov = sandwich)

# Fit NB model using glm.nb function from MASS
m.nb <- glm.nb(formula, data = dat, link = log)
summary(m.nb)

# Compare AIC between NB and Poisson models
c(AIC_NB = m.nb$aic, AIC_Pois = m.pois$aic)

# Check for overdispersion in NB model
check_overdispersion(m.nb)

# Calculate observed and expected number of zero counts
obs.zero <- sum(dat$visits == 0)
Ezero.pois <- round(sum(dpois(0,lambda = fitted(m.pois))))
Ezero.nb <- round(sum(dnbinom(0,mu = fitted(m.nb), size = m.nb$theta)))

# Calculate ratios of observed to expected zeros
obs.zero / Ezero.pois 
obs.zero / Ezero.nb 

# Check for zero inflation in Poisson and NB models
check_zeroinflation(m.pois)
check_zeroinflation(m.nb)

# Fit zero-inflated Poisson and NB models
m.zeroinf.pois <- zeroinfl(
  visits ~ hospital + health + chronic + gender + school + insurance | 
    chronic + insurance + school + gender,
  data = dat, dist = "poisson")
m.zeroinf.nb <- zeroinfl(
  visits ~ hospital + health + chronic + gender + school + insurance | 
    chronic + insurance + school + gender,
  data = dat, dist = "negbin")

summary(m.zeroinf.pois)
summary(m.zeroinf.nb)

# Show class of model objects
class(m.zeroinf.pois)
class(m.zeroinf.nb)

# Extract coefficients for count and zero components
coef(m.zeroinf.nb, model = "count")
coef(m.zeroinf.nb, model = "zero")

# Compute variance of count component
diag(vcov(m.zeroinf.nb, model = "count"))

# Predict response, count, zero, and probability of zero
predict.data <- data.frame(
  response = predict(m.zeroinf.nb, type = "response" ),
  count = predict(m.zeroinf.nb, type = "count"),
  zero = predict(m.zeroinf.nb, type = "zero" ),
  prob0 = predict(m.zeroinf.nb, type = "prob")[, 1])

# Calculate predictions directly by computing linear predictions
predict.data <- predict.data %>% 
  mutate(
    xb.count = model.matrix(m.zeroinf.nb,"count") %*% coef(m.zeroinf.nb, "count"),
    xb.zero  = model.matrix(m.zeroinf.nb, "zero") %*% coef(m.zeroinf.nb, "zero"),
    mu = exp(xb.count),
    pi = 1 / (1 + exp(-xb.zero)),
    p0 = pi + (1 - pi) * dnbinom(0, mu = mu, size = m.zeroinf.nb$theta)
  )

round(head(predict.data), 5)

# Show model summaries using sjPlot
tab_model(m.zeroinf.pois, m.zeroinf.nb, 
          dv.labels = c("Zero-Inflated Poisson", "Zero-Inflated_NB"))

# Predicted probability of the excess zeros for an average individual
mean(dat$chronic)
(predicted0 <- predict(
  m.zeroinf.nb, 
  newdata = data.frame(
    hospital = mean(dat$hospital), 
    health = "average",
    chronic = mean(dat$chronic), 
    gender = "female",
    school = mean(dat$school), 
    insurance = "no"
  ),
  type = "zero"
))

# Calculate the odds
(odds <- predicted0 / (1 - predicted0))

# Calculate odds changes
odds * 0.2872938

# Create new data for predictions
new.data <- data.frame(
  hospital = mean(dat$hospital),
  health = "poor",
  chronic = seq(0, 8, 0.1),
  gender = "female",
  school = mean(dat$school),
  insurance = "no"
)

# Predicted response
new.data$response <- predict(m.zeroinf.nb, newdata = new.data, type ="response")

# Predicted count only in count component
new.data$count <- predict(m.zeroinf.nb, newdata = new.data, type = "count")

# Predicted probability of excess zero
new.data$pi <- predict(m.zeroinf.nb, newdata = new.data, type = "zero")

# Linear predictor
new.data$linear.count <-  log(new.data$count)

# Plot predicted response against chronic variable
ggplot(new.data, aes(x = chronic, y = response)) +
  geom_line(color = "blue", size = 1)

# Plot predicted count against chronic variable
ggplot(new.data, aes(x = chronic, y = count)) +
  geom_line(color = "lightblue", size = 1)

# Plot both predicted response and count together
new.data %>% 
  pivot_longer(cols = c(count, response),
               names_to = "type", values_to = "visits") %>% 
  ggplot(aes(x = chronic, y = visits, color = type)) +
  geom_line(size = 1)

# Plot linear predictor
ggplot(new.data, aes(x = chronic, y = linear.count)) +
  geom_line(color = "black", size = 1.5) +
  ylab("linear predictor count")

# Plot probability of excess zero
ggplot(new.data, aes(x = chronic, y = pi)) +
  geom_line(color = "red", size = 1.5) +
  ylab("probability of excess zero")

# Use emmeans 
library(emmeans)
emmip(m.zeroinf.nb, ~ chronic,
      at = list(
        hospital = mean(dat$hospital),
        health = "poor",
        chronic = 0:8,
        gender = "female",
        school = mean(dat$school),
        insurance = "no"
      ), 
      lin.pred = FALSE, mode ="count", CIs = TRUE) +
  ylab("count")

# Fit hurdle models
m.hurdle.pois <- hurdle(
  visits ~ hospital + health + chronic + gender + school + insurance, 
  data = dat, dist = "poisson")
m.hurdle.nb <- hurdle(
  visits ~ hospital + health + chronic + gender + school + insurance,
  data = dat, dist = "negbin")

summary(m.hurdle.pois)
summary(m.hurdle.nb)


#setting all positive count equal to one
dat <- dat %>% mutate(visits.1 = ifelse(visits >= 1, 1, 0))
#run a logistic regression for visit1
m0 <- glm(visits.1 ~ hospital + health + chronic + gender + school + insurance, 
          data = dat, family = binomial)
summary(m0)


# Show class of model objects
class(m.hurdle.pois)
class(m.hurdle.nb)

# Extract coefficients for count and zero components
coef(m.hurdle.nb, model = "count")
coef(m.hurdle.nb, model = "zero")

# Compute variance of count component
diag(vcov(m.hurdle.nb, model = "count"))

# Test no zero hurdle
m.hurdle.nb.nb <- hurdle(
  visits ~ hospital + health + chronic + gender + school + insurance,
  data = dat, dist = "negbin", zero.dist = "negbin")
hurdletest(m.hurdle.nb.nb)

# Predict response, count, zero, and probability of zero
predict.data <- data.frame(
  response = predict(m.hurdle.nb, type = "response" ),
  count = predict(m.hurdle.nb, type = "count", ),
  zero = predict(m.hurdle.nb, type = "zero" ),
  prob0 = predict(m.hurdle.nb, type = "prob")[,1]
)

# Compute the prediction from linear prediction
predict.data <- predict.data %>% 
  mutate(
    xb.count = model.matrix(m.hurdle.nb,"count") %*% coef(m.hurdle.nb, "count"),
    xb.zero = model.matrix(m.hurdle.nb, "zero") %*% coef(m.hurdle.nb, "zero"),
    mu = exp(xb.count),
    f0 = dnbinom(0, mu = mu, size = m.hurdle.nb$theta),
    pi = 1 / (1 + exp(-xb.zero)),
    p0 = (1 - pi),
    zero2 =  pi  / (1 - f0), 
    response3 <- mu *  pi /  (1 - f0) 
  ) 

# Show the first few rows of the prediction data
head(predict.data)

# Use sjPlot to show model summaries
tab_model(m.hurdle.pois, m.hurdle.nb, m0, 
          dv.labels =c("Hurdle Poisson", "Hurdle NB", "logistic model"))

# Plot probability of zero against chronic variable
emmip(m.hurdle.nb, ~ chronic,
      at = list(
        hospital = mean(dat$hospital),
        health = "poor",
        chronic = 0:8,
        gender = "female",
        school = mean(dat$school),
        insurance = "no"
      ), 
      lin.pred = FALSE, mode ="prob0", CIs = TRUE) +
  ylab("Probability of zero")

# Using check_overdispersion from package performance

# Compute dispersion statistics for different models
Dispersion <- c(
  sum(residuals(m.pois, type = "pearson")^2) / m.pois$df.residual,
  sum(residuals(m.nb, type = "pearson")^2) / m.nb$df.residual,
  sum(residuals(m.zeroinf.pois, type = "pearson")^2) / m.zeroinf.pois$df.residual,
  sum(residuals(m.zeroinf.nb, type = "pearson")^2) / m.zeroinf.nb$df.residual,
  sum(residuals(m.hurdle.pois, type = "pearson")^2) / m.hurdle.pois$df.residual,
  sum(residuals(m.hurdle.nb, type = "pearson")^2) / m.hurdle.nb$df.residual
)

# Predicted number of zeros
Observed.zero <- sum(dat$visits == 0)

# Calculate expected number of zeros
Expected.zero <- round(c(
  "Pois" = sum(dpois(0, fitted(m.pois))),
  "NB" = sum(dnbinom(0, mu = fitted(m.nb), size = m.nb$theta)),
  "Zero-inflated Poisson" = sum(predict(m.zeroinf.pois, type = "prob")[,1]),
  "Zero-inflated NB" = sum(predict(m.zeroinf.nb, type = "prob")[,1]),
  "Hurdle Poisson" = sum(predict(m.hurdle.pois, type = "prob")[,1]),
  "Hurdle NB" = sum(predict(m.hurdle.nb, type = "prob")[,1])
))



# Calculate ratios of observed to expected zeros
Ratio <-  Observed.zero / Expected.zero 

# Combine expected zeros, ratios, and dispersion statistics
cbind(Expected.zero, Ratio, Dispersion)



#------------Exercise Analyzing Fish Catch Data----------------------
# Read the data from idre website
ex.data <- read.csv("https://stats.idre.ucla.edu/stat/data/fish.csv")

# Check the structure of the data
str(ex.data)

# Make the variable 'camper' a factor since it is binary
ex.data$camper <- factor(ex.data$camper, labels = c("no", "yes"))

# Fit the negative binomial model
ex.m.nb <- glm.nb(count ~ child + camper + persons, data = ex.data)

# Fit the zero-inflated negative binomial model
ex.m.zeroinf.nb <- zeroinfl(count ~ child + camper | persons, 
                            dist = "negbin", data = ex.data)

# Fit the hurdle negative binomial model
ex.m.hurdle.nb <- hurdle(count ~ child + camper + persons | 
                           child + camper + persons, 
                         dist = "negbin", data = ex.data)

# Summarize the results of each model
summary(ex.m.nb)
summary(ex.m.zeroinf.nb)
summary(ex.m.hurdle.nb)

# Use tab_model to display the models' summaries
tab_model(ex.m.zeroinf.nb, ex.m.hurdle.nb, ex.m.nb)