# R codes for MI in R
# Please make sure the packages that we are going to use are installed.
# install.packages("mice", dependencies = TRUE)
# install.packages("VIM")
# install.packages("dplyr")
# install.packages("tidyverse")
# install.packages("lattice")
# install.packages("sjPlot")
# install.packages("lavaan")
# install.packages("MASS")

# In R a missing value is represented by NA.

# This is not missing! `a` is NULL.
a <- c()
a

# This one means `a` is a missing value.
a <- NA
a

# x is a vector with its 4th element missing.
x <- c(1, 2, 3, NA, 5)
x

# is.na() detects whether a value is NA.
is.na(x)

# To get the locations of the NAs:
which(is.na(x))

# To get the total number of NAs in a vector:
sum(is.na(x))

# anyNA() checks whether any value is missing.
anyNA(x)

# This assignment form sets the 2nd element of x to NA.
is.na(x) <- 2
x

# Let's try to apply some computational functions to x.
# By default, most functions in R do not ignore NAs.
mean(x)
sum(x)
# ?mean

# Remove NAs before computing.
mean(x, na.rm = TRUE)
sum(x, na.rm = TRUE)

# Setting the seed.
set.seed(123)

# We draw a sample of size 25 from a standard normal distribution and store it in y1.
y1 <- rnorm(25)

# We draw a sample of size 25 from a normal distribution with mean 1 and standard deviation 5.
y2 <- rnorm(n = 25, mean = 1, sd = 5)

# Let's draw 25 random values between 0 and 1 with equally likely chances.
z1 <- runif(n = 25, min = 0, max = 1)

# We can create missing values in y1 wherever z1 is less than 0.5.
y1 <- replace(y1, z1 < 0.5, NA)

# Exercise 1.
# Draw a standard normal random vector of size 25 and, for each value greater
# than 1 or less than -1, replace the corresponding element of y2 with a missing value.
z2 <- rnorm(25, 0, 1)
y2 <- replace(y2, z2 > 1 | z2 < -1, NA)
y2

#--------------------
# A data frame is the usual way multivariate data are stored in R.
dat <- data.frame(y1 = y1, y2 = y2)

# This is how missing data look in R.
head(dat)

R <- !is.na(dat)
# Or we can code R as 0 and 1, where 0 indicates missing and 1 indicates observed.
R <- apply(R, 2, as.numeric)

# Name the columns of R.
colnames(R) <- c("R1", "R2")

# Display the data and the missing indicators together.
dat <- cbind(dat, R)
head(dat)

# Bivariate normal sample.
Y <- MASS::mvrnorm(100, mu = c(0, 0), Sigma = .5 * diag(2) + .5)
R2 <- Y[, 1] < -0.5
R1 <- Y[, 2] > 0.5

# Apply the missingness indicators R1 and R2 to the data.
dat.MAR <- data.frame(Y1 = replace(Y[, 1], R1 == TRUE, NA),
                      Y2 = replace(Y[, 2], R2 == TRUE, NA))
head(dat.MAR)

# Mean imputation with a for loop.
# First copy the data with missing values into a new data frame to impute.
dat_mean_im <- dat
for(i in 1:ncol(dat_mean_im)) {
  dat_mean_im[ , i][is.na(dat_mean_im[ , i])] <- mean(dat_mean_im[ , i], na.rm = TRUE)
}

# Mean and standard deviation of the data with missing values.
data.frame(mean = apply(dat, 2, mean, na.rm = TRUE),
           sd = apply(dat, 2, sd, na.rm = TRUE))

# Mean and standard deviation of the mean-imputed data.
data.frame(mean = apply(dat_mean_im, 2, mean),
           sd = apply(dat_mean_im, 2, sd))

#------------------ hsb2_mar data
# hsb2_mar is missing at random.
hsb2_mar <- read.csv("https://stats.idre.ucla.edu/wp-content/uploads/2021/03/hsb2_mar.csv")
summary(hsb2_mar)
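
# Optional quick check (not part of the original script, just a hedged sketch):
# before deciding how to handle the missingness, it can help to see the
# proportion of missing values in each variable of hsb2_mar.
colMeans(is.na(hsb2_mar))
# Equivalently, the counts of missing values per variable:
colSums(is.na(hsb2_mar))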
# Since the variables prog and female are categorical, we convert them to factors.
hsb2_mar$prog <- factor(hsb2_mar$prog)

# We can change the reference group of a factor.
hsb2_mar$prog <- relevel(hsb2_mar$prog, ref = "vocational")
hsb2_mar$female <- factor(hsb2_mar$female)
hsb2_mar$female <- relevel(hsb2_mar$female, ref = "male")

# Now the NAs appear in the summary.
summary(hsb2_mar)

m_complete <- lm(read ~ write + math, data = hsb2_mar)
summary(m_complete)
# sjPlot::tab_model(m_complete, show.se = TRUE, digits = 4)

# How many cases were used in the model above?
# We can see it from the summary: 158 residual df + 3 estimated coefficients = 161 cases.
summary(m_complete)
# Or: complete.cases() gives a logical vector, TRUE if a row is complete.
sum(complete.cases(hsb2_mar[c("read", "write", "math")]))

# Exercise: mean value imputation.
data_mean_im <- hsb2_mar[c("read", "write", "math")]
for(i in 1:ncol(data_mean_im)) {
  data_mean_im[ , i][is.na(data_mean_im[ , i])] <- mean(data_mean_im[ , i], na.rm = TRUE)
}

# Regression on the mean-imputed data.
m_mean_im <- lm(read ~ write + math, data = data_mean_im)
summary(m_mean_im)

# Comparing the two models side by side.
sjPlot::tab_model(m_complete, m_mean_im, show.p = FALSE, show.se = TRUE, digits = 4,
                  dv.labels = c("complete case", "mean imputed"))

# Regression model with lavaan, setting missing to FIML.
library(lavaan)
lm.fiml <- sem('read ~ write + math', data = hsb2_mar, missing = "FIML", fixed.x = F)
summary(lm.fiml)

library(mice)
library(lattice)

# Again we set the seed first.
set.seed(123)

# Check the missingness pattern for the hsb2_mar dataset.
md.pattern(hsb2_mar, rotate.names = TRUE)

# Plot with VIM.
library(VIM)
aggr_plot <- aggr(hsb2_mar, col = c('blue', 'red'), numbers = TRUE, sortVars = TRUE,
                  labels = names(hsb2_mar), cex.axis = .7, gap = 3,
                  ylab = c("Histogram of missing data", "Pattern"))

# VIM marginplot.
VIM::marginplot(hsb2_mar[c("read", "write")])

# Select the variables "read", "math", and "write" into a new data frame.
hsb2_mar_1 <- hsb2_mar[c("read", "math", "write")]

# Mean imputation in mice.
imp <- mice(hsb2_mar_1, method = "mean", m = 1, maxit = 1)

# The class of a multiply imputed data object is mids.
class(imp)

# First 10 rows of the imputed data.
head(complete(imp), 10)

# Means of the columns (variables) used for the imputation.
colMeans(hsb2_mar_1, na.rm = TRUE)

# Applying the analysis to the imputed data.
fit <- with(imp, lm(read ~ math + write))
summary(fit)

# The class of fit is mira.
class(fit)

# Regression imputation.
imp <- mice(hsb2_mar_1, method = "norm.predict", m = 1, maxit = 1)

# Running the model on the imputed data.
fit <- with(imp, lm(read ~ math + write))
summary(fit)

# Stochastic regression imputation.
imp <- mice(hsb2_mar_1, method = "norm.nob", m = 1, maxit = 1)

# Running the model on the imputed data.
fit <- with(imp, lm(read ~ math + write))
summary(fit)

# Stochastic regression imputation with the seed set to 123 gives the same result every time.
imp <- mice(hsb2_mar_1, method = "norm.nob", m = 1, maxit = 1, seed = 123)
fit <- with(imp, lm(read ~ math + write))
summary(fit)

imp$predictorMatrix

# Using the mice defaults.
imp <- mice(hsb2_mar_1, maxit = 2, print = F)
imp
imp$chainMean
imp$lastSeedValue

imp <- mice(hsb2_mar_1, maxit = 6, m = 5, seed = 123, print = F)
imp1 <- mice(hsb2_mar_1, maxit = 1, seed = 123, print = FALSE)
imp2 <- mice.mids(imp1, maxit = 4, print = FALSE)
all(imp$imp$read == imp2$imp$read)

# Information stored in the mids object.
attributes(imp)

# For example, we can extract the imputed values.
imp$imp

# Extract the completed data from the third imputation.
c3 <- mice::complete(imp, 3)

# Long format.
c.long <- complete(imp, "long")

# Wide format.
c.broad <- complete(imp, "broad")

# Analysis for the 5 imputed data sets:
fit <- with(imp, lm(read ~ math + write))
# summary(fit)
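
# Illustrative sketch (not part of the original exercise) of what pool() does
# under the hood: Rubin's rules applied by hand to the m fitted models stored
# in fit$analyses. It assumes `fit` is the mira object created just above.
est  <- sapply(fit$analyses, coef)                        # p x m matrix of estimates
se2  <- sapply(fit$analyses, function(a) diag(vcov(a)))   # squared standard errors
qbar <- rowMeans(est)                                      # pooled point estimates
ubar <- rowMeans(se2)                                      # within-imputation variance
b    <- apply(est, 1, var)                                 # between-imputation variance
m    <- length(fit$analyses)
tvar <- ubar + (1 + 1/m) * b                               # total variance (Rubin, 1987)
cbind(estimate = qbar, pooled.se = sqrt(tvar))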
# Three imputations.
imp <- mice(hsb2_mar_1, m = 3, print = F)

# Predictor matrix.
imp$pred

ini <- mice(hsb2_mar_1, maxit = 0, print = F)
pred <- ini$pred
pred
pred[ , "write"] <- 0

# Use your new predictor matrix in mice() as follows.
imp <- mice(hsb2_mar_1, pred = pred, print = F)

# Set the minimum correlation (rho) a variable must have to be used as a predictor.
ini <- mice(hsb2_mar_1, pred = quickpred(hsb2_mar_1, mincor = .6), print = F)
ini$pred

imp <- mice(hsb2_mar_1, print = F)

# Plot the trace lines.
plot(imp)

# Imputation methods.
imp$meth

# An up-to-date overview of the methods in mice can be found with:
methods(mice)

# Initial methods.
ini <- mice(hsb2_mar_1, maxit = 0)
meth <- ini$meth
meth
meth["write"] <- "norm"
meth

# Now we can use these new methods.
imp <- mice(hsb2_mar_1, meth = meth, print = F)
plot(imp)

# Additional imputation iterations, continuing from the previous run.
imp50 <- mice.mids(imp, maxit = 45, print = F)
plot(imp50)

# Strip plots showing whether the imputed values are plausible.
stripplot(imp, math ~ .imp, pch = 20, cex = 2)
stripplot(imp)

fit <- with(imp, lm(read ~ write + math))
# fit
class(fit)

# ls() lists the information stored in fit.
ls(fit)

# Results of the analysis on the second imputed data set.
summary(fit$analyses[[2]])

# Combine the results.
pool.fit <- pool(fit)
summary(pool.fit)
pool.fit

#
hsb_ex <- hsb2_mar[c("read", "write", "math", "science", "socst", "prog", "female")]
imp <- mice(hsb_ex, m = 10, maxit = 50, print = FALSE)
imp$method
imp$predictorMatrix
plot(imp)
fit <- with(imp, lm(read ~ write + math + science + prog + female))
pool.fit <- pool(fit)
summary(pool.fit)
print(pool.fit)
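
# Optional follow-up (not in the original script): two common checks on this
# final run. densityplot() from mice compares the distributions of observed and
# imputed values, and summary() with conf.int = TRUE adds 95% confidence
# intervals to the pooled estimates.
densityplot(imp)
summary(pool.fit, conf.int = TRUE)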