#----------------------------Intro to regression in R---------------------------

# Read the example data set (UCLA elemapi2v2) directly from the web.
d <- read.csv(
  "https://stats.idre.ucla.edu/wp-content/uploads/2019/02/elemapi2v2.csv"
)
# We can also read the data from the local computer;
# we can set the working directory by setwd():
# d <- read.csv("c:/.../elemapi2v2.csv", header = TRUE, sep = ",")

class(d)          # class of object d, returns "data.frame"
names(d)          # names of variables (columns)
dim(d)            # number of rows and columns of the data frame
str(d)            # structure of variables in the data frame d
summary(d)        # summary statistics of variables in d
str(d[, 1:4])     # structure of the first four columns only
summary(d[, 1:4]) # summary statistics of the first four columns only

help(lm) # shows R Help Documentation for function lm()

# Simple regression model of DV api00 on IV enroll.
m1 <- lm(api00 ~ enroll, data = d)
class(m1)
print(m1)   # prints coefficients of the model
summary(m1) # prints full summary of the model

# Correlation coefficient of api00 and enroll;
# its square equals R-squared in simple regression.
r <- cor(d$api00, d$enroll)
r ^ 2

# Components stored in an object of class "lm".
ls(m1)
m1$coefficients        # coefficients of the model
m1$fitted.values[1:10] # fitted values; here we show the first 10
# Vector of residuals (full name avoids partial matching of $resid).
residuals <- m1$residuals

coefficients(m1) # extractor function for coefficients
confint(m1)      # matrix of confidence intervals for coefficients
anova(m1)        # ANOVA table

# To predict, first create a new data frame including all predictors
# of the model.
new.data <- data.frame(enroll = c(500, 600, 700))
predict(m1, newdata = new.data)

# Optional: a nicely formatted table of regression results.
# install.packages("sjPlot")
library(sjPlot)
tab_model(m1)

# Scatter plot of api00 vs. enroll with the fitted regression line.
plot(api00 ~ enroll, data = d)
abline(m1, col = "blue")

# Same plot with school-number labels added above each point.
plot(api00 ~ enroll, data = d)
text(d$enroll, d$api00 + 20, labels = d$snum, cex = .7)
abline(m1, col = "blue")

# We can also use identify() to label points interactively by clicking.
plot(api00 ~ enroll, data = d)
abline(m1, col = "blue")
identify(d$enroll, d$api00, labels = d$snum, col = "red")

# Optional: the same scatter plot + fitted line with ggplot2.
library(ggplot2)
ggplot(d, aes(x = enroll, y = api00)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red")

#-------Regression model for categorical independent variable-------------------

class(d$mealcat) # mealcat is stored as integer
# Transform variable mealcat to a factor variable.
d$mealcat_F <- factor(d$mealcat)
str(d$mealcat_F) # now the class of mealcat_F is factor

class(d$yr_rnd) # yr_rnd is stored as integer
# Transform variable yr_rnd to a factor variable.
d$yr_rnd_F <- factor(d$yr_rnd)
levels(d$yr_rnd_F) # levels of factor variable yr_rnd_F ("0", "1")
# Rename the levels: 0 -> "No" (not year-round), 1 -> "Yes" (year-round).
levels(d$yr_rnd_F) <- c("No", "Yes")

table(d$yr_rnd_F)  # frequency table for yr_rnd_F
table(d$mealcat_F) # frequency table for mealcat_F

# Regression of api00 on the two-level factor yr_rnd_F.
m2 <- lm(api00 ~ yr_rnd_F, data = d)
summary(m2)
model.matrix(m2)[1:6, ] # first 6 rows of the model (design) matrix

# Scatter plot of api00 against the numeric yr_rnd with fitted line.
plot(api00 ~ yr_rnd, data = d)
abline(m2)

# Regression of api00 on the categorical variable mealcat_F (3 levels).
m3 <- lm(api00 ~ mealcat_F, data = d)
summary(m3)
# Mean of api00 for each category of mealcat_F.
aggregate(api00 ~ mealcat_F, FUN = mean, data = d)

# Relevel factor mealcat_F so that group "3" is the reference group.
d$mealcat_F <- relevel(d$mealcat_F, ref = "3")
m4 <- lm(api00 ~ mealcat_F, data = d)
summary(m4)

#------------------------------------Optional-----------------------------------
## Regression analysis with a single dummy variable and t-test (optional).

# Box plot of api00 against yr_rnd_F (plot() dispatches to boxplot for factors).
plot(api00 ~ yr_rnd_F, data = d)

# Mean of api00 when yr_rnd_F is "No": school is not year-round.
# (Levels were renamed above; comparing against "0"/"1" would match nothing.)
mean(d$api00[d$yr_rnd_F == "No"])
# Mean of api00 when yr_rnd_F is "Yes": school is year-round.
mean(d$api00[d$yr_rnd_F == "Yes"])
# Using aggregate to find the mean for each year-round group.
aggregate(api00 ~ yr_rnd_F, FUN = mean, data = d)

# t-test for equality of mean api00 between the two groups,
# with the equal-variance assumption (matches the regression F-test).
t.test(api00 ~ yr_rnd_F, data = d, var.equal = TRUE)
anova(m2) # ANOVA table
# Square of the t value from the t-test equals the F value from the ANOVA.
10.7815 ^ 2

#------------------Multiple regression and interaction---------------------------

# Multiple regression model of DV api00 on IVs enroll, meals, and full.
m5 <- lm(api00 ~ enroll + meals + full, data = d)
summary(m5) # summary of model m5
anova(m5)   # ANOVA table of model m5
# Sum of RSS and SSreg; use the full column name "Sum Sq"
# rather than relying on partial matching of $Sum.
sum(anova(m5)[["Sum Sq"]])
(400 - 1) * var(d$api00) # total sum of squares

# Standardized regression model: all coefficients are standardized.
m5.sd <- lm(scale(api00) ~ scale(enroll) + scale(meals) + scale(full), data = d)
summary(m5.sd)

# Model of api00 on enroll, meals, and their interaction enroll:meals.
m6 <- lm(api00 ~ enroll + meals + enroll:meals, data = d)
summary(m6) # summary of model m6

# Model of api00 on yr_rnd_F, mealcat_F, and their interaction
# (the * operator expands to main effects plus interaction).
m7 <- lm(api00 ~ yr_rnd_F * mealcat_F, data = d)
summary(m7) # summary of model m7
# The same model without the interaction terms.
m0 <- lm(api00 ~ yr_rnd_F + mealcat_F, data = d)
# F-test comparing the two nested models.
anova(m0, m7)

# Model of api00 on yr_rnd_F, enroll, and their interaction.
m8 <- lm(api00 ~ yr_rnd_F * enroll, data = d)
summary(m8) # summary of model m8

#-----------------------------Graph of Interaction-------------------------------

# First get the range of the continuous predictor.
range(d$enroll)
# Make a sequence over the range of enroll (130-1570), incrementing by 5.
new.enroll <- seq(130, 1570, 5)
# Because yr_rnd_F has two levels, we repeat the enroll sequence twice.
new.yr_rnd <- rep(levels(d$yr_rnd_F), each = length(new.enroll))
# Create new data for prediction.
new.data.plot <- data.frame(
  enroll = rep(new.enroll, times = 2),
  yr_rnd_F = new.yr_rnd
)
# Predict api00 from the interaction model m8.
new.data.plot$predicted_api00 <- predict(m8, newdata = new.data.plot)

# Plot one predicted line per level of yr_rnd_F with ggplot2.
library(ggplot2)
ggplot(new.data.plot,
       aes(x = enroll, y = predicted_api00, colour = yr_rnd_F)) +
  geom_line(lwd = 1.2)

#-----------------------------Regression Diagnostics-----------------------------

# Install packages for regression diagnostics (run once):
# install.packages("car")
# install.packages("alr4")
# install.packages("faraway")
# Load the packages into the working environment.
library(car)
library(alr4)
library(faraway)

# Multiple regression model of DV api00 on IVs enroll, meals, and full.
# NOTE: this reuses the name m2 (the tutorial's diagnostics section
# works on this model, not the earlier yr_rnd_F model).
m2 <- lm(api00 ~ enroll + meals + full, data = d)
scatterplotMatrix(~ api00 + enroll + meals + full, data = d)

#--------------------# Unusual and influential data

res.std <- rstandard(m2) # standardized residuals stored in vector res.std
# Plot standardized residuals on the y axis; the x axis is the row index.
plot(res.std, ylab = "Standardized Residual", ylim = c(-3.5, 3.5))
# Add horizontal lines at -3, 0, and 3 to identify extreme values.
abline(h = c(-3, 0, 3), lty = 2)
# Find which data points fall outside the +/- 3 standard-deviation cutoff;
# index holds the row numbers of those points.
index <- which(res.std > 3 | res.std < -3)
# Add the school number next to each extreme point.
text(index - 20, res.std[index], labels = d$snum[index])
print(index)          # row numbers of out-of-bounds values
print(d$snum[index])  # school numbers of out-of-bounds points

# Bonferroni p-values for testing outliers.
outlierTest(m2)

# Diagonal of the 'hat' matrix (leverages).
h <- influence(m2)$hat
# Half-normal plot of leverage, from package faraway.
halfnorm(influence(m2)$hat, ylab = "leverage")

# Cutoff value for Cook's distance.
cutoff <- 4 / ((nrow(d) - length(m2$coefficients) - 2))
plot(m2, which = 4, cook.levels = cutoff)
# Cook's distance, studentized residuals, and leverage in one plot.
influencePlot(m2, main = "Influence Plot",
              sub = "Circle size is proportional to Cook's Distance")
# Four diagnostic plots to identify influential points.
infIndexPlot(m2)

#---------------------------Checking Homoscedasticity----------------------------

# Residuals vs. fitted values plot for homoscedasticity.
plot(m2$residuals ~ m2$fitted.values)
# Add a horizontal line at 0.
abline(h = 0, lty = 2)

#---------------------------Checking Linearity-----------------------------------

# Residuals vs. fitted values and vs. all predictors, plus tests for curvature.
residualPlots(m2)

#---------------------------Issues of Independence-------------------------------

# Trend of residuals across school number (residual plot vs. school id).
plot(m2$residuals ~ d$snum)

#--------------------------Checking Normality of Residuals-----------------------

qqnorm(m2$residuals) # normal quantile-quantile plot
qqline(m2$residuals)

#--------------------------Checking for Multicollinearity------------------------

car::vif(m2) # variance inflation factors

#--------------------------Model Specification-----------------------------------

avPlots(m2) # added-variable plots, from package "car"