# R codes for MI in R
# Please make sure the packages that we are going to use are installed.
# install.packages("mice", dependencies = TRUE)
# install.packages("VIM")
# install.packages("dplyr")
# install.packages("tidyverse")
# install.packages("lattice")
# install.packages("sjPlot")
# install.packages("lavaan")
# install.packages("MASS")

# In R a missing value is represented by NA.

# This is not missing! `a` is NULL.
a <- c()
a

# This one means `a` is a missing value.
a <- NA
a

# x is a vector with its 4th element missing.
x <- c(1, 2, 3, NA, 5)
x

# is.na() detects whether a value is NA.
is.na(x)

# To get the locations of the NAs:
which(is.na(x))

# To get the total number of NAs in a vector:
sum(is.na(x))

# anyNA() checks whether any value is missing.
anyNA(x)

# This assignment form sets the 2nd element of x to NA.
is.na(x) <- 2
x

# Let's try to apply some computational functions to x.
# By default, most functions in R do not ignore NAs.
mean(x)
sum(x)
# ?mean

# Remove NAs before computing.
mean(x, na.rm = TRUE)
sum(x, na.rm = TRUE)

# Setting the seed.
set.seed(123)

# We draw a sample of size 25 from a standard normal distribution and store it in y1.
y1 <- rnorm(25)

# We draw a sample of size 25 from a normal distribution with mean 1 and standard deviation 5.
y2 <- rnorm(n = 25, mean = 1, sd = 5)

# Let's draw 25 random values between 0 and 1 with equally likely chances.
z1 <- runif(n = 25, min = 0, max = 1)

# We can create missing values in y1 wherever z1 is less than 0.5.
y1 <- replace(y1, z1 < 0.5, NA)

# Exercise 1.
# Draw a standard normal random vector of size 25 and, for each value greater
# than 1 or less than -1, replace the corresponding element of y2 with a missing value.
z2 <- rnorm(25, 0, 1)
y2 <- replace(y2, z2 > 1 | z2 < -1, NA)
y2

#--------------------
# A data frame is the usual way multivariate data are stored in R.
dat <- data.frame(y1 = y1, y2 = y2)

# This is how missing data look in R.
head(dat)

R <- !is.na(dat)
# Or we can code R as 0 and 1, where 0 indicates missing and 1 indicates observed.
R <- apply(R, 2, as.numeric)

# Name the columns of R.
colnames(R) <- c("R1", "R2")

# Display the data and the missing indicators together.
dat <- cbind(dat, R)
head(dat)

# Bivariate normal sample.
Y <- MASS::mvrnorm(100, mu = c(0, 0), Sigma = .5 * diag(2) + .5)
R2 <- Y[, 1] < -0.5
R1 <- Y[, 2] > 0.5

# Apply the missingness indicators R1 and R2 to the data.
dat.MAR <- data.frame(Y1 = replace(Y[, 1], R1 == TRUE, NA),
                      Y2 = replace(Y[, 2], R2 == TRUE, NA))
head(dat.MAR)

# Mean imputation with a for loop.
# First copy the data with missing values into a new data frame to impute.
dat_mean_im <- dat
for(i in 1:ncol(dat_mean_im)) {
  dat_mean_im[ , i][is.na(dat_mean_im[ , i])] <- mean(dat_mean_im[ , i], na.rm = TRUE)
}

# Mean and standard deviation of the data with missing values.
data.frame(mean = apply(dat, 2, mean, na.rm = TRUE),
           sd = apply(dat, 2, sd, na.rm = TRUE))

# Mean and standard deviation of the mean-imputed data.
data.frame(mean = apply(dat_mean_im, 2, mean),
           sd = apply(dat_mean_im, 2, sd))

#------------------ hsb2_mar data
# hsb2_mar is missing at random.
hsb2_mar <- read.csv("https://stats.idre.ucla.edu/wp-content/uploads/2021/03/hsb2_mar.csv")
summary(hsb2_mar)
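
# Optional quick check (not part of the original script, just a hedged sketch):
# before deciding how to handle the missingness, it can help to see the
# proportion of missing values in each variable of hsb2_mar.
colMeans(is.na(hsb2_mar))
# Equivalently, the counts of missing values per variable:
colSums(is.na(hsb2_mar))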
# Since the variables prog and female are categorical, we convert them to factors.
hsb2_mar$prog <- factor(hsb2_mar$prog)

# We can change the reference group of a factor.
hsb2_mar$prog <- relevel(hsb2_mar$prog, ref = "vocational")
hsb2_mar$female <- factor(hsb2_mar$female)
hsb2_mar$female <- relevel(hsb2_mar$female, ref = "male")

# Now the NAs appear in the summary.
summary(hsb2_mar)

m_complete <- lm(read ~ write + math, data = hsb2_mar)
summary(m_complete)
# sjPlot::tab_model(m_complete, show.se = TRUE, digits = 4)

# How many cases were used in the model above?
# We can see it from the summary: 158 residual df + 3 estimated coefficients = 161 cases.
summary(m_complete)
# Or: complete.cases() gives a logical vector, TRUE if a row is complete.
sum(complete.cases(hsb2_mar[c("read", "write", "math")]))

# Exercise: mean value imputation.
data_mean_im <- hsb2_mar[c("read", "write", "math")]
for(i in 1:ncol(data_mean_im)) {
  data_mean_im[ , i][is.na(data_mean_im[ , i])] <- mean(data_mean_im[ , i], na.rm = TRUE)
}

# Regression on the mean-imputed data.
m_mean_im <- lm(read ~ write + math, data = data_mean_im)
summary(m_mean_im)

# Comparing the two models side by side.
sjPlot::tab_model(m_complete, m_mean_im, show.p = FALSE, show.se = TRUE, digits = 4,
                  dv.labels = c("complete case", "mean imputed"))

# Regression model with lavaan, setting missing to FIML.
library(lavaan)
lm.fiml <- sem('read ~ write + math', data = hsb2_mar, missing = "FIML", fixed.x = F)
summary(lm.fiml)

library(mice)
library(lattice)

# Again we set the seed first.
set.seed(123)

# Check the missingness pattern for the hsb2_mar dataset.
md.pattern(hsb2_mar, rotate.names = TRUE)

# Plot with VIM.
library(VIM)
aggr_plot <- aggr(hsb2_mar, col = c('blue', 'red'), numbers = TRUE, sortVars = TRUE,
                  labels = names(hsb2_mar), cex.axis = .7, gap = 3,
                  ylab = c("Histogram of missing data", "Pattern"))

# VIM marginplot.
VIM::marginplot(hsb2_mar[c("read", "write")])

# Select the variables "read", "math", and "write" into a new data frame.
hsb2_mar_1 <- hsb2_mar[c("read", "math", "write")]

# Mean imputation in mice.
imp <- mice(hsb2_mar_1, method = "mean", m = 1, maxit = 1)

# The class of a multiply imputed data object is mids.
class(imp)

# First 10 rows of the imputed data.
head(complete(imp), 10)

# Means of the columns (variables) used for the imputation.
colMeans(hsb2_mar_1, na.rm = TRUE)

# Applying the analysis to the imputed data.
fit <- with(imp, lm(read ~ math + write))
summary(fit)

# The class of fit is mira.
class(fit)

# Regression imputation.
imp <- mice(hsb2_mar_1, method = "norm.predict", m = 1, maxit = 1)

# Running the model on the imputed data.
fit <- with(imp, lm(read ~ math + write))
summary(fit)

# Stochastic regression imputation.
imp <- mice(hsb2_mar_1, method = "norm.nob", m = 1, maxit = 1)

# Running the model on the imputed data.
fit <- with(imp, lm(read ~ math + write))
summary(fit)

# Stochastic regression imputation with the seed set to 123 gives the same result every time.
imp <- mice(hsb2_mar_1, method = "norm.nob", m = 1, maxit = 1, seed = 123)
fit <- with(imp, lm(read ~ math + write))
summary(fit)

imp$predictorMatrix

# Using the mice defaults.
imp <- mice(hsb2_mar_1, maxit = 2, print = F)
imp
imp$chainMean
imp$lastSeedValue

imp <- mice(hsb2_mar_1, maxit = 6, m = 5, seed = 123, print = F)
imp1 <- mice(hsb2_mar_1, maxit = 1, seed = 123, print = FALSE)
imp2 <- mice.mids(imp1, maxit = 4, print = FALSE)
all(imp$imp$read == imp2$imp$read)

# Information stored in the mids object.
attributes(imp)

# For example, we can extract the imputed values.
imp$imp

# Extract the completed data from the third imputation.
c3 <- mice::complete(imp, 3)

# Long format.
c.long <- complete(imp, "long")

# Wide format.
c.broad <- complete(imp, "broad")

# Analysis for the 5 imputed data sets:
fit <- with(imp, lm(read ~ math + write))
# summary(fit)
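
# Illustrative sketch (not part of the original exercise) of what pool() does
# under the hood: Rubin's rules applied by hand to the m fitted models stored
# in fit$analyses. It assumes `fit` is the mira object created just above.
est  <- sapply(fit$analyses, coef)                        # p x m matrix of estimates
se2  <- sapply(fit$analyses, function(a) diag(vcov(a)))   # squared standard errors
qbar <- rowMeans(est)                                      # pooled point estimates
ubar <- rowMeans(se2)                                      # within-imputation variance
b    <- apply(est, 1, var)                                 # between-imputation variance
m    <- length(fit$analyses)
tvar <- ubar + (1 + 1/m) * b                               # total variance (Rubin, 1987)
cbind(estimate = qbar, pooled.se = sqrt(tvar))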
# Three imputations.
imp <- mice(hsb2_mar_1, m = 3, print = F)

# Predictor matrix.
imp$pred

ini <- mice(hsb2_mar_1, maxit = 0, print = F)
pred <- ini$pred
pred
pred[ , "write"] <- 0

# Use your new predictor matrix in mice() as follows.
imp <- mice(hsb2_mar_1, pred = pred, print = F)

# Set the minimum correlation (rho) a variable must have to be used as a predictor.
ini <- mice(hsb2_mar_1, pred = quickpred(hsb2_mar_1, mincor = .6), print = F)
ini$pred

imp <- mice(hsb2_mar_1, print = F)

# Plot the trace lines.
plot(imp)

# Imputation methods.
imp$meth

# An up-to-date overview of the methods in mice can be found with:
methods(mice)

# Initial methods.
ini <- mice(hsb2_mar_1, maxit = 0)
meth <- ini$meth
meth
meth["write"] <- "norm"
meth

# Now we can use these new methods.
imp <- mice(hsb2_mar_1, meth = meth, print = F)
plot(imp)

# Additional imputation iterations, continuing from the previous run.
imp50 <- mice.mids(imp, maxit = 45, print = F)
plot(imp50)

# Strip plots showing whether the imputed values are plausible.
stripplot(imp, math ~ .imp, pch = 20, cex = 2)
stripplot(imp)

fit <- with(imp, lm(read ~ write + math))
# fit
class(fit)

# ls() lists the information stored in fit.
ls(fit)

# Results of the analysis on the second imputed data set.
summary(fit$analyses[[2]])

# Combine the results.
pool.fit <- pool(fit)
summary(pool.fit)
pool.fit

#
hsb_ex <- hsb2_mar[c("read", "write", "math", "science", "socst", "prog", "female")]
imp <- mice(hsb_ex, m = 10, maxit = 50, print = FALSE)
imp$method
imp$predictorMatrix
plot(imp)
fit <- with(imp, lm(read ~ write + math + science + prog + female))
pool.fit <- pool(fit)
summary(pool.fit)
print(pool.fit)
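
# Optional follow-up (not in the original script): two common checks on this
# final run. densityplot() from mice compares the distributions of observed and
# imputed values, and summary() with conf.int = TRUE adds 95% confidence
# intervals to the pooled estimates.
densityplot(imp)
summary(pool.fit, conf.int = TRUE)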