## INSTRUCTIONS:
## 1. TO RUN A COMMAND, PLACE CURSOR INSIDE COMMAND
##    OR HIGHLIGHT COMMAND(S) AND HIT CTRL-ENTER (COMMAND-ENTER FOR MACS)
## 2. TO RUN ALL CODE FROM BEGINNING OF FILE TO CURRENT LINE,
##    HIT CTRL-ALT-B
## 3. COMMANDS THAT BEGIN WITH "#" ARE COMMENTS AND WILL NOT BE EXECUTED
## 4. NOTE: A FEW COMMANDS PURPOSEFULLY RESULT IN ERRORS FOR TEACHING PURPOSES
## 5. USE CTRL-F TO FIND

## # uncomment (remove ##) to run
## install.packages("dplyr", dependencies=TRUE)
## install.packages("ggplot2", dependencies=TRUE)
## install.packages("rmarkdown", dependencies=TRUE)

## library(dplyr)
## library(ggplot2)

## # list all available vignettes
## vignette()

# Basic syntax ----

# Put operators like + at the end of lines, so R knows the
# expression continues on the next line
2 +
  3

# specifying arguments by name
log(x = 100, base = 10)
# specifying arguments by position
log(8, 2)

# Vectors ----

# create a vector
first_vec <- c(1, 3, 5)
first_vec

# length() returns the number of elements
char_vec <- c("these", "are", "some", "words")
length(char_vec)

# the result of this comparison is a logical vector
first_vec > c(2, 2, 2)

# first argument to rep is what to repeat
# the second argument is number of repetitions
rep(0, times = 3)
rep("abc", 4)

# arguments for seq are from, to, by
seq(from = 1, to = 5, by = 2)
seq(10, 0, -5)

# colon operator
3:7

# you can nest functions
rep(seq(1, 3, 1), times = 2)

# Subsetting vectors ----

# create a vector 10 to 1
# putting () around a command will cause the result to be printed
(a <- seq(10, 1, -1))

# second element
a[2]
# first 5 elements
a[seq(1, 5)]
# first, third, and fourth elements
a[c(1, 3, 4)]

# a logical index keeps the elements where the index is TRUE
scores <- c(55, 24, 43, 10)
scores[c(FALSE, TRUE, TRUE, FALSE)]

# this returns a logical vector...
scores < 30
# ...that we can now use to subset
scores[scores < 30]

# Importing and exporting data ----

## # basic syntax of read.csv, not run
## data <- read.csv("/path/to/file.csv")
##
## # specification for tab-delimited file
## dat.tab <- read.delim("/path/to/file.txt", sep="\t")

# read a csv file directly from a URL (requires an internet connection)
dat_csv <- read.csv("https://stats.oarc.ucla.edu/stat/data/hsbraw.csv")

## # write a csv file
## write.csv(dat_csv, file = "path/to/save/filename.csv")
##
## # save these objects to an .Rdata file
## save(dat_csv, mydata, file="path/to/save/filename.Rdata")

## View(dat_csv)

# first 2 rows
head(dat_csv, 2)
# last 8 rows
tail(dat_csv, 8)

# Data frames ----

# use data.frame() to create a data frame manually
mydata <- data.frame(
  patient = c("Smith", "Jones", "Williams"),
  height = c(72, 61, 66),
  diabetic = c(1, 0, 0)
)

# row 3 column 2
mydata[3, 2]
# first two rows of column height
mydata[1:2, "height"]
# all rows of columns patient and diabetic
mydata[, c("patient", "diabetic")]

# subsetting with $ creates a numeric vector
mydata$height
# just the second and third elements
mydata$height[2:3]

# get column names
colnames(mydata)
# assign column names (capitalizing them)
colnames(mydata) <- c("Patient", "Height", "Diabetic")
colnames(mydata)
# to change one variable name, just use vector indexing
colnames(mydata)[3] <- "Diabetes"
colnames(mydata)

# number of rows and columns
dim(mydata)
# mydata is of class "data.frame";
# str() also lists the type of each of its variables
str(mydata)

# this will add a column variable called logHeight to mydata
mydata$logHeight <- log(mydata$Height)
# now we see logHeight as a column in mydata
colnames(mydata)

# INTENTIONAL ERROR (for teaching): mydata has 3 rows, and the rep vector
# has 5, so the new column cannot be added
mydata$z <- rep(0, 5)

# Data management with dplyr ----

# load packages for this section
library(dplyr)

# creating some data manually
dog_data <- data.frame(
  id = c("Duke", "Lucy", "Buddy", "Daisy", "Bear", "Stella"),
  weight = c(25, 12, 58, 67, 33, 9),
  sex = c("M", "F", "M", "F", "M", "F"),
  location = c("north", "west", "north", "south", "west", "west")
)

# dogs weighing more than 40
filter(dog_data, weight > 40)
# female dogs in the north or south
# locations: keep female dogs whose location is north or south
filter(dog_data, (location == "north" | location == "south") & sex == "F")

# select 2 variables
select(dog_data, id, sex)
# select everything BUT id and sex
select(dog_data, -c(id, sex))

# Appending and merging ----

# make a data.frame of new dogs (-99 is used here as a missing-value code)
more_dogs <- data.frame(
  id = c("Jack", "Luna"),
  weight = c(38, -99),
  sex = c("M", "F"),
  location = c("east", "east")
)
# make sure that data frames have the same columns
names(dog_data)
names(more_dogs)
# appended dataset combines rows
all_dogs <- rbind(dog_data, more_dogs)
all_dogs

# new dog variable
# matching variables do not have to be sorted
dog_vax <- data.frame(
  id = c("Luna", "Duke", "Buddy", "Stella", "Daisy", "Lucy", "Jack", "Bear"),
  vaccinated = c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE)
)
# id appears in both datasets, so will be used to match observations
dogs <- inner_join(all_dogs, dog_vax)
dogs

# Missing values ----

# subset to weight values equal to -99, and then change
# them all to NA
dogs$weight[dogs$weight == -99] <- NA
dogs$weight

# a sum involving "undefined" is "undefined"
1 + 2 + NA
# NA could be larger or smaller or equal to 2
c(1, 2, 3, NA) > 2
# mean is undefined because of the presence of NA
dogs$weight
mean(dogs$weight)
# NA values will be removed first
sum(c(1, 2, NA), na.rm = TRUE)
mean(dogs$weight, na.rm = TRUE)

# one of the values is NA
x <- c(1, 2, NA)
# INTENTIONALLY WRONG (for teaching): checking for equality to NA using ==
# returns NA for every element
# RStudio may give you a warning about this (to use is.na() instead)
x == NA
# this is correct
is.na(x)

# Descriptive statistics ----

# create a new bloodtest data set
bloodtest <- data.frame(
  id = 1:10,
  gender = c("female", "male", "female", "female", "female",
             "male", "male", "female", "male", "female"),
  hospital = c("CLH", "MH", "MH", "MH", "CLH",
               "MH", "MDH", "MDH", "CLH", "MH"),
  doc_id = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 3),
  insured = c(0, 1, 1, 1, 0, 1, 1, 0, 1, 1),
  age = c(23, 45, 37, 49, 51, 55, 56, 37, 26, 40),
  test1 = c(47, 67, 41, 65, 60, 52, 68, 37, 44, 44),
  test2 = c(46, 57, 47, 65, 62, 51, 62, 44, 46, 61),
  test3 = c(49, 73, 50, 64, 77, 57, 75, 55, 62, 55),
  test4 = c(61, 61, 51, 71, 56, 57, 61, 46, 46, 46)
)

mean(bloodtest$age)
median(bloodtest$age)
var(bloodtest$age)
summary(bloodtest$test1)

# just a single correlation
cor(bloodtest$test1, bloodtest$test2)
# use dplyr select() to pull out just the test variables
# (note: this reuses the name `scores` for a data frame)
scores <- select(bloodtest, test1, test2, test3, test4)
cor(scores)

# table() produces counts
table(bloodtest$gender)
table(bloodtest$hospital)
# for proportions, use output of table()
# as input to prop.table()
prop.table(table(bloodtest$hospital))

# this time saving the freq table to an object
my2way <- table(bloodtest$gender, bloodtest$hospital)
# counts in each crossing of gender (rows) and hospital (columns)
my2way
# row proportions,
# proportion of each gender that falls into each hospital
prop.table(my2way, margin = 1)
# column proportions,
# proportion of each hospital that falls into each gender
prop.table(my2way, margin = 2)

# Statistical tests and models ----

# chi-square test of association between hospital and insurance status;
# with only 10 observations the expected counts are small, so R warns
# that the chi-squared approximation may be incorrect
chisq.test(bloodtest$hospital, bloodtest$insured)

# formula notation for independent samples t-test
t.test(test1 ~ gender, data = bloodtest)
# paired samples t-test
t.test(bloodtest$test1, bloodtest$test3, paired = TRUE)

# fit a linear model (ANOVA and linear regression)
m1 <- lm(test1 ~ age + gender, data = bloodtest)
# printing an lm object will list the coefficients only
m1
# summary produces regression table and model fit stats
summary(m1)
# just the coefficients
coef(m1)
# 95% confidence intervals
confint(m1)
# observed values, predicted values and residuals
# cbind() joins column vectors into a matrix
cbind(bloodtest$test1, predict(m1), residuals(m1))
# ANOVA sequential sums of squares
anova(m1)
# fit another linear regression model, adding hospital as predictor
# (two parameters added to model):
m2 <- lm(test1 ~ age + gender + hospital, data = bloodtest)
# F-test comparing the two nested models
anova(m2, m1)

# plots all 4 plots at once (otherwise one at a time)
layout(matrix(c(1, 2, 3, 4), 2, 2))
# 4 diagnostic plots
plot(m1)
# restore the single-plot layout
layout(1)

# family=binomial uses link=logit by default
m_ins <- glm(insured ~ age, data = bloodtest, family = binomial)
summary(m_ins)
# ORs: exponentiate the coefficients of the logistic model
# (m_ins, not the linear model m2)
exp(coef(m_ins))
# confidence intervals on ORs
exp(confint(m_ins))

# Base R graphics ----

plot(bloodtest$test1, bloodtest$test2)

plot(bloodtest$test1, bloodtest$test2,
     xlab = "Test 1",
     ylab = "Test 2",
     main = "Plot of Test1 vs Test2")

plot(bloodtest$test1, bloodtest$test2,
     xlab = "Test 1",
     ylab = "Test 2",
     main = "Plot of Test1 vs Test2",
     col = "steelblue",
     pch = 17)

hist(bloodtest$test1)

boxplot(bloodtest$test2 ~ bloodtest$insured)

boxplot(bloodtest$test2 ~ bloodtest$insured,
        xlab = "Insured",
        ylab = "Test 2",
        main = "Boxplots of Test2 by Insurance Status",
        col = "lightblue")

# stacked bars: gender counts within each hospital
tab <- table(bloodtest$gender, bloodtest$hospital)
barplot(tab, legend.text = TRUE)

# side-by-side bars with custom colors
tab <- table(bloodtest$gender, bloodtest$hospital)
barplot(tab,
        legend.text = TRUE,
        beside = TRUE,
        col = c("lawngreen", "sandybrown"),
        xlab = "hospital")

# ggplot2 graphics ----

# ggplot2 must be loaded before ggplot() can be called
library(ggplot2)

# a scatterplot of math vs write
ggplot(data = dat_csv, aes(x = math, y = write)) +
  geom_point()

# a scatterplot of math vs write with best fit line
ggplot(dat_csv, aes(x = math, y = write)) +
  geom_point() +
  geom_smooth(method = "lm")

# a scatterplot and best fit line, by gender
# color affects the best fit line, fill affects the confidence intervals
ggplot(dat_csv, aes(x = math, y = write, color = female, fill = female)) +
  geom_point() +
  geom_smooth(method = "lm")

# panel of scatterplot and best fit line, colored by gender, paneled by prog
ggplot(dat_csv, aes(x = math, y = write, color = female, fill = female)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~prog)

# same panel plot, drawn with theme_classic()
ggplot(dat_csv, aes(x = math, y = write, color = female, fill = female)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~prog) +
  theme_classic()

# same panel plot, drawn with theme_dark()
ggplot(dat_csv, aes(x = math, y = write, color = female, fill = female)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~prog) +
  theme_dark()

## # a scatterplot of read vs write
## ggplot(data=dat_csv, aes(x=read, y=write,
##                          color=ses)) +
##   geom_point() +
##   geom_smooth(method=lm, se=FALSE)

# Stacked bar chart from the built-in HairEyeColor table, using only the
# first slice of its third dimension: one bar per eye color, segments
# filled by hair color with a hand-picked hex palette
barplot(
  HairEyeColor[, , 1],
  col = c("#4d4d4d", "#bf812d", "#f4a582", "#f6e8c3"),
  legend.text = TRUE,
  xlab = "Eye Color",
  args.legend = list(title = "Hair Color")
)