#----------------------------Intro to regression in R---------------------------

# Read the example data set (UCLA elemapi2v2) directly from the web.
d <- read.csv(
  "https://stats.idre.ucla.edu/wp-content/uploads/2019/02/elemapi2v2.csv"
)
# We can also read the data from the local computer;
# we can set the working directory by setwd():
# d <- read.csv("c:/.../elemapi2v2.csv", header = TRUE, sep = ",")

class(d)          # class of object d, returns "data.frame"
names(d)          # names of variables (columns)
dim(d)            # number of rows and columns of the data frame
str(d)            # structure of variables in the data frame d
summary(d)        # summary statistics of variables in d
str(d[, 1:4])     # structure of the first four columns only
summary(d[, 1:4]) # summary statistics of the first four columns only

help(lm) # shows R Help Documentation for function lm()

# Simple regression model of DV api00 on IV enroll.
m1 <- lm(api00 ~ enroll, data = d)
class(m1)
print(m1)   # prints coefficients of the model
summary(m1) # prints full summary of the model

# Correlation coefficient of api00 and enroll;
# its square equals R-squared in simple regression.
r <- cor(d$api00, d$enroll)
r ^ 2

# Components stored in an object of class "lm".
ls(m1)
m1$coefficients        # coefficients of the model
m1$fitted.values[1:10] # fitted values; here we show the first 10
# Vector of residuals (full name avoids partial matching of $resid).
residuals <- m1$residuals

coefficients(m1) # extractor function for coefficients
confint(m1)      # matrix of confidence intervals for coefficients
anova(m1)        # ANOVA table

# To predict, first create a new data frame including all predictors
# of the model.
new.data <- data.frame(enroll = c(500, 600, 700))
predict(m1, newdata = new.data)

# Optional: a nicely formatted table of regression results.
# install.packages("sjPlot")
library(sjPlot)
tab_model(m1)

# Scatter plot of api00 vs. enroll with the fitted regression line.
plot(api00 ~ enroll, data = d)
abline(m1, col = "blue")

# Same plot with school-number labels added above each point.
plot(api00 ~ enroll, data = d)
text(d$enroll, d$api00 + 20, labels = d$snum, cex = .7)
abline(m1, col = "blue")

# We can also use identify() to label points interactively by clicking.
plot(api00 ~ enroll, data = d)
abline(m1, col = "blue")
identify(d$enroll, d$api00, labels = d$snum, col = "red")

# Optional: the same scatter plot + fitted line with ggplot2.
library(ggplot2)
ggplot(d, aes(x = enroll, y = api00)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red")

#-------Regression model for categorical independent variable-------------------

class(d$mealcat) # mealcat is stored as integer
# Transform variable mealcat to a factor variable.
d$mealcat_F <- factor(d$mealcat)
str(d$mealcat_F) # now the class of mealcat_F is factor

class(d$yr_rnd) # yr_rnd is stored as integer
# Transform variable yr_rnd to a factor variable.
d$yr_rnd_F <- factor(d$yr_rnd)
levels(d$yr_rnd_F) # levels of factor variable yr_rnd_F ("0", "1")
# Rename the levels: 0 -> "No" (not year-round), 1 -> "Yes" (year-round).
levels(d$yr_rnd_F) <- c("No", "Yes")

table(d$yr_rnd_F)  # frequency table for yr_rnd_F
table(d$mealcat_F) # frequency table for mealcat_F

# Regression of api00 on the two-level factor yr_rnd_F.
m2 <- lm(api00 ~ yr_rnd_F, data = d)
summary(m2)
model.matrix(m2)[1:6, ] # first 6 rows of the model (design) matrix

# Scatter plot of api00 against the numeric yr_rnd with fitted line.
plot(api00 ~ yr_rnd, data = d)
abline(m2)

# Regression of api00 on the categorical variable mealcat_F (3 levels).
m3 <- lm(api00 ~ mealcat_F, data = d)
summary(m3)
# Mean of api00 for each category of mealcat_F.
aggregate(api00 ~ mealcat_F, FUN = mean, data = d)

# Relevel factor mealcat_F so that group "3" is the reference group.
d$mealcat_F <- relevel(d$mealcat_F, ref = "3")
m4 <- lm(api00 ~ mealcat_F, data = d)
summary(m4)

#------------------------------------Optional-----------------------------------
## Regression analysis with a single dummy variable and t-test (optional).

# Box plot of api00 against yr_rnd_F (plot() dispatches to boxplot for factors).
plot(api00 ~ yr_rnd_F, data = d)

# Mean of api00 when yr_rnd_F is "No": school is not year-round.
# (Levels were renamed above; comparing against "0"/"1" would match nothing.)
mean(d$api00[d$yr_rnd_F == "No"])
# Mean of api00 when yr_rnd_F is "Yes": school is year-round.
mean(d$api00[d$yr_rnd_F == "Yes"])
# Using aggregate to find the mean for each year-round group.
aggregate(api00 ~ yr_rnd_F, FUN = mean, data = d)

# t-test for equality of mean api00 between the two groups,
# with the equal-variance assumption (matches the regression F-test).
t.test(api00 ~ yr_rnd_F, data = d, var.equal = TRUE)
anova(m2) # ANOVA table
# Square of the t value from the t-test equals the F value from the ANOVA.
10.7815 ^ 2

#------------------Multiple regression and interaction---------------------------

# Multiple regression model of DV api00 on IVs enroll, meals, and full.
m5 <- lm(api00 ~ enroll + meals + full, data = d)
summary(m5) # summary of model m5
anova(m5)   # ANOVA table of model m5
# Sum of RSS and SSreg; use the full column name "Sum Sq"
# rather than relying on partial matching of $Sum.
sum(anova(m5)[["Sum Sq"]])
(400 - 1) * var(d$api00) # total sum of squares

# Standardized regression model: all coefficients are standardized.
m5.sd <- lm(scale(api00) ~ scale(enroll) + scale(meals) + scale(full), data = d)
summary(m5.sd)

# Model of api00 on enroll, meals, and their interaction enroll:meals.
m6 <- lm(api00 ~ enroll + meals + enroll:meals, data = d)
summary(m6) # summary of model m6

# Model of api00 on yr_rnd_F, mealcat_F, and their interaction
# (the * operator expands to main effects plus interaction).
m7 <- lm(api00 ~ yr_rnd_F * mealcat_F, data = d)
summary(m7) # summary of model m7
# The same model without the interaction terms.
m0 <- lm(api00 ~ yr_rnd_F + mealcat_F, data = d)
# F-test comparing the two nested models.
anova(m0, m7)

# Model of api00 on yr_rnd_F, enroll, and their interaction.
m8 <- lm(api00 ~ yr_rnd_F * enroll, data = d)
summary(m8) # summary of model m8

#-----------------------------Graph of Interaction-------------------------------

# First get the range of the continuous predictor.
range(d$enroll)
# Make a sequence over the range of enroll (130-1570), incrementing by 5.
new.enroll <- seq(130, 1570, 5)
# Because yr_rnd_F has two levels, we repeat the enroll sequence twice.
new.yr_rnd <- rep(levels(d$yr_rnd_F), each = length(new.enroll))
# Create new data for prediction.
new.data.plot <- data.frame(
  enroll = rep(new.enroll, times = 2),
  yr_rnd_F = new.yr_rnd
)
# Predict api00 from the interaction model m8.
new.data.plot$predicted_api00 <- predict(m8, newdata = new.data.plot)

# Plot one predicted line per level of yr_rnd_F with ggplot2.
library(ggplot2)
ggplot(new.data.plot,
       aes(x = enroll, y = predicted_api00, colour = yr_rnd_F)) +
  geom_line(lwd = 1.2)

#-----------------------------Regression Diagnostics-----------------------------

# Install packages for regression diagnostics (run once):
# install.packages("car")
# install.packages("alr4")
# install.packages("faraway")
# Load the packages into the working environment.
library(car)
library(alr4)
library(faraway)

# Multiple regression model of DV api00 on IVs enroll, meals, and full.
# NOTE: this reuses the name m2 (the tutorial's diagnostics section
# works on this model, not the earlier yr_rnd_F model).
m2 <- lm(api00 ~ enroll + meals + full, data = d)
scatterplotMatrix(~ api00 + enroll + meals + full, data = d)

#--------------------# Unusual and influential data

res.std <- rstandard(m2) # standardized residuals stored in vector res.std
# Plot standardized residuals on the y axis; the x axis is the row index.
plot(res.std, ylab = "Standardized Residual", ylim = c(-3.5, 3.5))
# Add horizontal lines at -3, 0, and 3 to identify extreme values.
abline(h = c(-3, 0, 3), lty = 2)
# Find which data points fall outside the +/- 3 standard-deviation cutoff;
# index holds the row numbers of those points.
index <- which(res.std > 3 | res.std < -3)
# Add the school number next to each extreme point.
text(index - 20, res.std[index], labels = d$snum[index])
print(index)          # row numbers of out-of-bounds values
print(d$snum[index])  # school numbers of out-of-bounds points

# Bonferroni p-values for testing outliers.
outlierTest(m2)

# Diagonal of the 'hat' matrix (leverages).
h <- influence(m2)$hat
# Half-normal plot of leverage, from package faraway.
halfnorm(influence(m2)$hat, ylab = "leverage")

# Cutoff value for Cook's distance.
cutoff <- 4 / ((nrow(d) - length(m2$coefficients) - 2))
plot(m2, which = 4, cook.levels = cutoff)
# Cook's distance, studentized residuals, and leverage in one plot.
influencePlot(m2, main = "Influence Plot",
              sub = "Circle size is proportional to Cook's Distance")
# Four diagnostic plots to identify influential points.
infIndexPlot(m2)

#---------------------------Checking Homoscedasticity----------------------------

# Residuals vs. fitted values plot for homoscedasticity.
plot(m2$residuals ~ m2$fitted.values)
# Add a horizontal line at 0.
abline(h = 0, lty = 2)

#---------------------------Checking Linearity-----------------------------------

# Residuals vs. fitted values and vs. all predictors, plus tests for curvature.
residualPlots(m2)

#---------------------------Issues of Independence-------------------------------

# Trend of residuals across school number (residual plot vs. school id).
plot(m2$residuals ~ d$snum)

#--------------------------Checking Normality of Residuals-----------------------

qqnorm(m2$residuals) # normal quantile-quantile plot
qqline(m2$residuals)

#--------------------------Checking for Multicollinearity------------------------

car::vif(m2) # variance inflation factors

#--------------------------Model Specification-----------------------------------

avPlots(m2) # added-variable plots, from package "car"