This page contains all the code snippets from the book for easy copy/paste ppossibilities. # Chapter 1

library(MESS)   # Load the package that contains the data 
data(bees)      # Load the data 
head(bees)      # Show the first couple of lines of data 
load("penny.rda")                # Load data 
load("penny.rda", verbose=TRUE)  # Load data and show objects 
x <- c(1, 7, 3, 2) 
df <- data.frame(v1=c("A", "B", "C", "D"), v2=1:4) 
save(x, df, file="mydata.rda")      # Save two objects 
indata <- read.table("mydata.txt", header=TRUE) 
indata 
indata <- read.table("mydata.txt", header=TRUE, 
                     na.strings=c("NA", "")) 
indata 
indata <- read.table("mydata.txt", header=TRUE, 
                      na.strings=c("NA", "", ".")) 
indata 
data(trees) 
trees$Girth[2] <- NA    # Set 2nd obs of Girth to missing 
write.table(trees, file="savedata.txt") 
write.table(trees, file="mydata2.txt", row.names=FALSE, 
            dec=",", eol="\r\n", na=".", quote=FALSE) 
indata <- read.csv("mydata.csv") 
head(indata) 
data(trees) 
write.csv(trees, file = "savedata.csv") 
data(trees) 
write.csv(trees, file = "savedata2.csv", row.names=FALSE, quote=FALSE) 
mydata 
library(readxl) 
goesforth <- read_excel("data/cunningplan.xlsx", sheet=1) 
goesforth 
library(xlsx) 
data(trees) 
write.xlsx(trees, "mydata.xlsx") 
write.xlsx(trees, "mydata.xlsx", row.names=FALSE) # No row numbers 
library(haven) 
indata <- read_sas("hip.sas7bdat") 
head(indata) 
library(foreign) 
indata <- read.xport("somefile.xpt") 
head(indata) 
library(foreign) 
data(trees) 
write.foreign(trees, datafile = "toSAS.dat", 
              codefile="toSAS.sas", package="SAS") 
library(haven) 
indata <- read_sav("cancer.sav") 
head(indata) 
data(trees) 
write_sav(trees, "myspssfile.sav") 
library(haven) 
indata <- read_dta("statafile.dta") 
head(indata) 
data(trees) 
write.dta(trees, file="mydata.dta") 
library(foreign) 
indata <- read.systat("Play4.syd") 
head(indata) 
library(jsonlite) 
indata <- fromJSON("exampledata.json") 
class(indata)       # Note a list is returned 
indata 
df <- as.data.frame(indata) 
class(df) 
df 
toJSON(1:5)   # JSON string for a numeric vector 
 
outdata <- toJSON(df, pretty=TRUE) 
class(outdata) 
outdata # Default is row-wise 
outdata <- toJSON(df, dataframe="column", pretty=TRUE) 
outdata 
toJSON(df, dataframe="column", pretty=TRUE, na="null") 
cat(outdata, file="output.json") 
library(XML) 
url <- "http://www.rprimer.dk/download/mydata.xml" 
indata <- xmlToDataFrame(url) 
head(indata) 
library(XML) 
# Location of the example XML file 
url <- "http://www.rprimer.dk/download/bank.xml" 
# Parse the tree 
doc <- xmlTreeParse(url, useInternalNodes=TRUE) 
top <- xmlRoot(doc)         # Identify the root node 
xmlName(top)                # Show node name of root node 
names(top)                  # Name of root node children 
xmlValue(top[[1]])          # Access first element 
xmlValue(top[["author"]])   # First element with named index 
names(top[[3]])             # Children of node 3 
xmlAttrs(top[[3]][[1]])     # Extract tags from a Date node 
top[["rates"]][[1]][[1]]    # Tree from first bank and date 
xmlValue(top[[3]][[1]][[1]][[1]]) # Bank name is node value 
xmlAttrs(top[[3]][[1]][[1]][[1]]) # but has no tags 
xmlAttrs(top[[3]][[1]][[1]][[2]]) # The <exch> node has tags 
xmlValue(top[[3]][[1]][[1]][[2]]) # but no value 
# Search tree for all source nodes and return their value 
xpathSApply(doc, "//source", xmlValue) 
# Search full tree for all exch nodes where currency is "DKK" 
xpathApply(doc, "//exch[@currency='DKK']", xmlAttrs) 
res <- xpathApply(doc, "//exch", 
          function(ex) { 
             c(xmlAttrs(ex), 
               bank=xmlValue(xmlParent(ex)[["source"]]), 
               date=xmlGetAttr(xmlParent(xmlParent(ex)), "time")) 
            }) 
result <- do.call(rbind, res) 
result 
library(MESS) 
data(bees) 
write.xml(bees, file="bees.xml") 
library(htmltab) 
url <- paste0("http://en.wikipedia.org/wiki/", 
              "List_of_national_capitals_by_population") 
result <- htmltab(doc = url, which=2, 
                  rm_nodata_cols=FALSE)  # Select table 2 
head(result) 
library(rvest) 
webaddress <- "http://www.imdb.com/chart/top?ref_=nv_mv_250_6" 
htmlpage <- read_html(webaddress) 
titlenodes  <- html_nodes(htmlpage, ".titleColumn") 
ratingnodes <- html_nodes(htmlpage, ".imdbRating") 
head(ratingnodes,3) 
library(RMySQL) 
mydb <- dbConnect(MySQL(), user="rprimer", password="PASSword", 
                  dbname="rprimer", host="192.168.1.151") 
indata <- dbGetQuery(mydb, "select * from students") 
indata 
data(trees) 
dbWriteTable(mydb, name="newTableName", value=trees, overwrite=TRUE) 
dbListTables(mydb) 
dbDisconnect(mydb) 

Chapter 2

vec1 <- 4                     # Numeric vector of length 1 
vec1 
vec2 <- c(1, 2, 3.4, 5.6, 7)  # Numeric vector of length 5 
vec2 
# Create a character vector 
vec3 <- c("Spare", "a talent", "for an", "old", "ex-leper") 
vec3 
# And a vector of logical values 
vec4 <- c(TRUE, TRUE, FALSE, TRUE, FALSE) 
vec4 
f <- factor(vec3)             # Make factor based on vec3 
f 
x <- 1 + 2i                   # Enter a complex number 
x 
sqrt(-1)                      # R does not see a complex number 
sqrt(-1 + 0i)                 # but it does here 
m <- matrix(1:6, ncol=2)               # Create a matrix 
m 
m2 <- matrix(1:6, ncol=2, byrow=TRUE)  # Fill the matrix row-wise 
m2 
list(vec2, comp=x, m)      # Combine 3 different objects, one named 
data.frame(vec2, f, vec4)  # Make a data frame 
5/2 + 2*(5.1 - 2.3)  # Add, subtract, multiply and divide 
2**8                 # 2 to the power 8 
1.61^5               # 1.61 to the power 5 
10 %% 3              # 10 modulus 3 has remainder 1 
10 %/% 3             # integer division 
abs(-3.1)            # absolute value of -3.1 
sqrt(5)              # square root of 5 
ceiling(4.3)         # smallest integer larger than 4.3 
floor(4.3)           # largest integer smaller than 4.3 
trunc(4.3)           # remove decimals 
round(4.5) 
round(4.51) 
round(4.51, digits=1) 
# Angles for trigonometric functions use radians - not degrees 
cos(pi/2)            # cosine of pi/2. 
sin(pi/4)            # sine of pi/4 
tan(pi/6)            # tangent of (pi/6) 
log(5)               # natural logarithm of 5 
log(5, base=2)       # binary logarithm of 5 
log10(5)             # common logarithm of 5 
exp(log(5) + log(3)) # exponential function 
# Create two matrices 
x <- matrix(1:6,ncol=3) 
y <- matrix(c(1, 1, 0, 0, 0, 1), ncol=2) 
x 
y 
x %*% y      # Matrix multiplication 
y %*% x      # Matrix multiplication 
seq(5)            # vector with sequence from 1 to 5 
seq(4, 6)         # from 4 to 6 with step size 1 
4:6               # Alternative way to make sequence with step 1 
seq(6, 4)         # from 6 to 4 with step size 1 
seq(0, 3, .5)     # from 0 to 3 with steps 0.5 
 
x <- c(6, 8, 1:4, 9, 7, 5, 3) 
x 
length(x)         # length of x 
head(x)           # Show the start of x 
tail(x)           # and the bottom 
rev(x)            # reverse the order of elements in x 
class(1:3)        # class of a sequence (only has integers) 
class(x)          # class of x (numeric because of c()) 
class(c(1, 2.5))  # and now numeric variables 
class(c(TRUE, TRUE, FALSE)) 
x 
which.min(x)  # The 3rd element is smallest (value 1) 
which.max(x)  # The 2nd element is largest (value 8) 
y <- c(2, 4, 6, 2, 4, 9, 2) 
which.min(y)  # Return the first element if there are ties 
which.min(c(FALSE, TRUE, FALSE)) # FALSE = 0, TRUE = 1 
which.max(c(FALSE, TRUE, FALSE)) 
unique(y) 
duplicated(y) 
z1 <- c(TRUE, FALSE, FALSE) 
any(z1) 
all(z1) 
z2 <- c(TRUE, TRUE, TRUE) 
any(z2) 
all(z2) 
z3 <- c(FALSE, FALSE, FALSE) 
any(z3) 
all(z3) 
which(z1) 
which(z3) 
x 
is.na(x) 
x[c(2,4)] <- NA # redefine x 
x 
is.na(x)        # identify missing elements 
is.numeric(x)   # Check if x is a numeric vector 
is.character(x) # Check if x is a character vector 
x <- c("Suki", "walk", "to", "the", "well", NA) 
toupper(x)             # convert to upper case 
tolower(x)             # convert to lower case 
nchar(x)               # no. of characters in each element of x 
grep("a", x)           # elements that contain letter "a" 
grep("o", x)           # elements that contain letter "o" 
substr(x, 2, 3)        # extract 2nd to 3rd characters 
substr(x, 2, 3) <- "X" # replace 2nd char with X 
x 
substr(x, 2, 3) <- "XXXX" 
x 
paste(x, "y")          # paste "y" to end of each element 
paste(x, "y", sep="-") 
paste(x, "y", sep="-", collapse=" ") 
indata <- read.table("DKdata.txt", header=TRUE) 
indata 
x <- as.character(indata$name) 
x 
Encoding(x)              # Read the encoding 
Encoding(x) <- "latin1"  # Set it to latin1 
x                        # Correct answer 
x <- as.character(indata$name) 
x 
iconv(x, from="latin1", to="") 
x <- c(2, 4, 1, 5, 3, 4) 
x 
sort(x) 
sort(x, decreasing=TRUE) 
y <- c("b", "a", "A", "b", "B", "a") 
y 
sort(y) 
o <- order(x) 
o 
x[o[1]]  # Same as min(x) 
x[o]     # Same as sort(x) 
o1 <- order(y) 
o1 
o2 <- order(y, x) 
o2 
df <- data.frame(day=c(1, 2, 1, 2, 3), 
                 time=c("late", "late", "early", "late", "early")) 
df 
o <- order(df$day, df$time)   # Order by day and then time 
df[o,] 
data(airquality) 
names(airquality) 
celsius <- (airquality$Temp-32)/1.8            # New variable 
airquality$Celsius <- (airquality$Temp-32)/1.8 # Add to data frame 
head(airquality$Temp) 
head(airquality$Celsius) 
newdata <- transform(airquality, logOzone = log(Ozone)) 
head(newdata) 
x <- seq(0, pi/2, .2) 
x 
y <- sin(2*x) 
y 
x[which.max(y)] 
x <- c(1, 2, 3, 1, NA, 4) 
y <- c(1, 2, 5, NA) 
x %in% y 
y %in% x 
! x %in% y   # Invert result: find objects NOT present 
z <- matrix(2:7, ncol=2) 
z 
x %in% z 
# c(1,2) is found in both 
data.frame(a=c(1,2), b=c(3,4)) %in% data.frame(f=c(5,6), 
                                               d=c(1,2)) 
list(1, 2, 3) %in% list(a=c(1,2), b=3) 
data(airquality) 
tapply(airquality$Temp, airquality$Month, mean) 
when <- cut(airquality$Day, 3, labels=c("Early", "Mid", "Late")) 
tapply(airquality$Temp, list(airquality$Month, when), mean) 
library(MESS) 
id <- c(1, NA, NA, NA, 2, NA, NA, NA, 3, NA, NA, NA) 
date <- c("2016-02-05", NA, NA, NA, 
          NA, NA, NA, NA, 
          "2016-02-06", NA, NA, NA) 
newid <- filldown(id) 
newid 
tapply(date, newid, filldown)   # Returns a list 
unlist(tapply(date, newid, filldown)) 
x <- c("1,2", "1,5", "1,8", "1.9", "2")   # Example data 
as.numeric(sub(",", ".", x))  # Substitute , with . and convert 
f <- factor(x)                # It also works on factors 
as.numeric(sub(",", ".", f))  # Substitute , with . and convert 
as.numeric(x)       # Does not work - need to substitute first 
library(data.table) 
x <- 1:8 
shift(x) 
shift(x, n=3) 
shift(x, n=2, fill=0) 
shift(LETTERS[1:5], type="lead") 
df <- data.frame(1:5, LETTERS[1:5]) 
shift(df, n=1, type="lag") 
shift(x, n=c(1,2)) 
library(MESS) 
x <- 1:5 
y <- c(1, 0, 1, 1, 5) 
auc(x, y) 
 
x2 <- c(4, 5, 1, 3, 2)  # Reorder the same data 
y2 <- c(1, 5, 1, 1, 0) 
auc(x2, y2)             # Reordering is fine. Same result 
auc(x, y, from=0)                     # AUC from 0 to max(x) 
auc(x, y, from=0, rule=2)             # Allow extrapolation 
auc(x, y, from=0, rule=2, yleft=0)    # Use value 0 to the left 
auc(x, y, from=0, rule=2, yleft=.5)   # Use 1/2 to the left 
data(ChickWeight) 
result <- by(ChickWeight, ChickWeight$Chick, 
             function(df) { auc(df$Time, df$weight) } ) 
head(result) 
result2 <- by(ChickWeight, ChickWeight$Chick, function(df) { 
              auc(df$Time, df$weight, from=0, to=21, rule=2)}) 
head(result2) 
m <- matrix(c(NA, 2, 3, 4, 5, 6), ncol=2) 
m 
apply(m, 1, sum)               # Compute the sum for each row 
apply(m, 2, mean)              # Compute the mean for each column 
apply(m, 2, mean, na.rm=TRUE)  # Compute the mean for each column 
m <- matrix(c(1, 1, 2, 2, 7, 7), 2) 
m 
prop.table(m) 
prop.table(m, margin=1) 
prop.table(m, 2) 
m <- matrix(1:6, ncol=2)     # Create matrix 
m 
t(m)                         # Transpose 
mpdat <- data.frame(source=c("KU", "Spain", "Secret", "Attic"), 
                     rats=c(97, 119, 210, 358)) 
mpdat 
t(mpdat) 
t(mpdat[,2]) 
x <- rbinom(20, size=4, p=.5) 
x 
table(x) 
sex <- sample(c("Male", "Female"), 20, replace=TRUE) 
sex 
table(sex, x) 
treatment <- rep(c("Placebo", "Active"), 10) 
treatment 
table(sex, x, treatment) 
data(Titanic) 
ftable(Titanic) 
# Only look at males to keep output short 
maletable <- as.table(HairEyeColor[,,1]) 
maletable 
freq <- as.data.frame(maletable) 
freq 
library(epitools) 
fulldata <- expand.table(HairEyeColor) 
head(fulldata) 
summary(fulldata) 
library(lubridate) 
string1 <- "1971-08-20" 
string2 <- "11-04-01"   # Year with only two digits 
date1 <- ymd(string1) 
date2 <- ymd(string2)   # 11 becomes 2011 
date1 
date2 
string3 <- c("02/28/00", "Feb 29th, 2000", "March 01 00", "3-31-00") 
date3 <- mdy(string3)   # Super flexible 
date3 
date2 - date1           # Difference between dates 
date1 <- ymd(string1, tz="Europe/Copenhagen") 
date1 
ymd(date2, tz="UTC") - date1 # Difference changed a bit 
time1 <- hms(c("08:34:12", "15-32-41")) 
time1 
time2 <- hm(c("12:11", "5,30")) 
time2 
ydm_hms("71-20-08 09:05:21")  # Combine date and time 
ydm_hm("71-20-08 09:05")      # Combine date and time no seconds 
ydm_hm("71-20-08 09:05 PM")   # AM/PM specification allowed 
ydm_hm("Trial started on 71-20-08 at 09:05 PM and was successful") 
date4 <- mdy(c("Mar 31 2011", "Apr 1 2011")) 
date4 + 30        # Add 30 seconds 
date4 + days(30)  # Add 30 days 
date4 + months(1) # Add 1 month 
library(lubridate) 
string <- c("02/28/00", "Feb 29th, 2000", "March 01 00", "3-31-00") 
D <- mdy(string)          # Parse as dates 
D 
stamp("March 1, 1999")(D) # Make and apply stamp like "March 1, 1999" 
stamp("March 1, 1999", orders="dy")(D) # Only expect day and year 
D2 <- ydm_hm(c("71-20-08 09:05"), c("71-18-08 09:05"), 
             c("00-28-02 09:15"), c("01-20-12 13:10")) 
mystamp <- stamp("Date of birth: Sunday, Jan 1, 1999 3:34 pm", 
                 quiet=TRUE) 
mystamp(D2) 
 
mystamp <- stamp("Date of birth: Sunday, Jan 1, 1999 3:34", 
                 orders="mdyHM") 
mystamp(D2) 
f <- factor(c(1:4, "A", 8, NA, "B")) 
f 
vect <- as.numeric(levels(f))[f] 
vect 
as.numeric(f)                   # Not correct 
f <- factor(c(1:4, "A", 8, NA, "B")) 
f 
vect <- as.character(f) 
vect 
f <- factor(c(1, 2, 3, 2, 3, 1, 3, 1, 2, 1)) 
f 
f[c(1,2)] <- "A"   # Not working 
f 
f <- factor(f, levels=c(levels(f), "A")) 
f[c(1,2)] <- "A"   # Can now assign the value "new level" 
f 
f <- factor(c(1, 1, 2, 3, 2, 1, "A", 3, 3, 3, "A")) 
levels(f)    # Print existing levels 
levels(f) <- c("13", "2", "13", "A") 
f 
f <- factor(c(1, 1, 2, 3, 2, 1, "A", 3, 3, 3, "A")) 
f 
levels(f)[c(1,3)] <- "1&3" 
f 
f <- factor(c(1, 1, 2, 3, 2, 1, "A", 3, 3, 3, "A")) 
f 
levels(f) <- list("1+3" = c(1, 3), "2"=2, "A"="A") 
f 
f <- factor(c(1, 2, 3, 2, 3, 2, 1), levels=c(0, 1, 2, 3)) 
f 
ff <- droplevels(f)   # Drop unused levels from f 
ff 
df <- data.frame(f1=f, f2=f) 
newdf <- droplevels(df, except=2) 
newdf$f1 
newdf$f2 
f <- factor(LETTERS[1:5]) 
f 
f <- relevel(f, ref="D")     # Set level D as reference 
f 
x <- (1:15)**2 
x 
cut(x, 3)  # Cut the range of x into 3 interval of same length 
# Cut the range into 3 intervals of roughly the same size 
cut(x, quantile(x, probs=seq(0, 1, length=4)), 
    include.lowest=TRUE) 
library(MESS) 
data(earthquakes) 
subset(earthquakes, mag>7.5) 
subset(earthquakes, type=="mining explosion" & mag>3.6) 
subset(earthquakes, mag>7 & depth>100, 
       select=c("time", "place")) 
data(airquality) 
head(airquality) 
complete <- complete.cases(airquality) 
head(complete, 20) 
ccdata <- airquality[complete,] 
head(ccdata) 
complete2 <- complete.cases(airquality$Ozone, airquality$Temp) 
oztemp <- airquality[complete2,] 
head(oztemp) 
mydat <- data.frame(x=1:5, y=factor(c("A", "B", "A", "B", NA))) 
mydat 
mydat$y <- NULL   # Remove the y variable from mydat 
mydat 
library(MESS) 
data(earthquakes) 
lapply(earthquakes, FUN=class) # Apply the class function 
sapply(earthquakes, class)     # Simplify output 
sapply(earthquakes, mean)   # Compute the mean of each variable 
sapply(earthquakes, mean, trim=.1) 
newlist <- lapply(earthquakes, function(i) { 
                      if (is.factor(i)) {  # Check if it is a factor 
                          as.character(i)  # Convert 
                      } else {             # Otherwise do nothing 
                          i 
                      } 
                  }) 
sapply(newlist, class)     # New classes 
data(airquality) 
by(airquality, airquality$Month, colMeans, na.rm=TRUE) 
df1 <- data.frame(delta=seq(0.7, 1, .1), 
                  n=seq(20, 50, 10)) 
do.call("power.t.test", df1) 
df2 <- data.frame(delta=seq(0.7, 1, .1), 
                  n=seq(20, 50, 10), 
                  extra=1:4) 
df2 
do.call("power.t.test", df2) 
df3 <- data.frame(delta=seq(0.7, 1, .1), 
                  power=c(0.58, 0.86, 0.98, 0.998)) 
res <- mapply(power.t.test, delta=df3$delta, power=df3$power) 
res 
unlist(res[1,]) 
df1 <- data.frame(id=1:3, x=11:13, y=factor(c("A", "B", "C"))) 
df1 
df2 <- data.frame(id=6:10, y=factor(1:5), x=1:5) 
df2 
combined <- rbind(df1, df2) 
combined 
df1 <- data.frame(id=1:3, x=11:13, y=factor(c("A", "B", "C")), 
                  z=1:3) 
df2 <- data.frame(id=6:10, y=factor(1:5), x=1:5) 
df1$z <- NULL   # Remove the z variable from data frame df1 
rbind(df1,df2)  # Now we can join the data frames 
df1 <- data.frame(id=1:3, x=11:13, y=factor(c("A", "B", "C")), 
                  z=1:3) 
df2 <- data.frame(id=6:10, y=factor(1:5), x=1:5) 
df2$z <- NA     # Add a vector z of NAs to data frame df2 
rbind(df1,df2)  # Now we can join the data frames 
dfx <- data.frame(id=factor(1:6), var1=11:16, 
                  fac1=factor(c("A", "B", "C", "D", "E", "F"))) 
dfy <- data.frame(id=factor(c(4,5,6,7,8,9)), var2=15:10) 
dfx 
dfy 
merge(dfx, dfy, by="id") 
merge(dfx, dfy, by="id", all=TRUE) 
merge(dfx, dfy, by="id", all.x=TRUE) 
merge(dfx, dfy, by="id", all.y=TRUE) 
df1 <- data.frame(id=1:3, x=11:13, y=factor(c("A", "B", "C"))) 
df1 
newobs <- list(4, 10, "B")   # The new observation to add 
df2 <- rbind(df1, newobs) 
df2 
newobs2 <- list(4, "B", 10)  # Order different from data frame 
df3 <- rbind(df1, newobs2)   # Add the new observation 
df3 
df3$x      # The x variable is converted to character 
library(reshape2) 
df1 <- data.frame(time1=c(1, 2, 3, 4), 
                  time2=c(3, NA, 6, 7)) 
df1 
melt(df1) 
df1$id <- c("Reed", "Sue", "Ben", "Johnny") 
df1 
melt(df1) 
melt(df1, id.vars=c("time1")) 
library(reshape2) 
indata <- read.table("wide.txt", header=TRUE) 
indata 
long <- melt(indata, id.vars=c("person", "age", "gender")) 
long 
dcast(long, person ~ variable) 
library(MESS) 
data(bdstat) 
head(bdstat) 
wide <- dcast(bdstat, year ~ month, value.var="births") 
head(wide) 
indata <- read.table("widedata.txt", header=TRUE) 
indata 
long <- reshape(indata, 
                varying=list(c("y1", "y2", "y3", "y4"), 
                             c("t1", "t2", "t3", "t4")), 
                v.names=c("response", "day"), 
                direction="long") 
long 
reshape(long, idvar="person", timevar="time", 
        v.names=c("day", "response"), 
        direction="wide") 
reshape(long, idvar="person", timevar="time", 
        v.names=c("day", "response"), 
        varying=list(c("y1", "y2", "y3", "y4"), 
                     c("t1", "t2", "t3", "t4")), 
        direction="wide") 
df <- data.frame(a=rep(1,5), b=rep(2,5), c=rep(3,5)) 
df 
vector <- as.vector(as.matrix(df))  # Convert to vector 
vector 
df <- data.frame(a=rep(1,5), 
                 b=factor(c("A", "A", "A", "B", "B"))) 
df        # Show data frame with both numeric and factor 
as.vector(as.matrix(df))   # Result is vector of strings 

Chapter 3

f1 <- factor(c("A", "B", "C", "B", "A", "C")) 
f2 <- factor(c(1, 2, 1, 2, 1, 2)) 
interaction(f1,f2) 
interaction(f1,f2, drop=TRUE) # Drop unused factor levels 
lm(weight ~ offset(2.7*Time))   # Fix slope at 2.7 
x <- c(6, 8, 1:4) 
x 
mean(x)         # mean of x 
median(x)       # median 
sd(x)           # standard deviation 
IQR(x)          # inter-quartile range 
y <- (1:6)**2   # New vector 
y 
cor(x,y)        # Pearson correlation 
cor(x, y, method="spearman") # Spearman correlation 
cor(x, y, method="kendall")  # Kendall correlation 
sum(x)          # sum of elements in x 
cumsum(x)       # cumulative sum of all elements of x 
min(x)          # minimum value 
max(x)          # maximum value 
range(x)        # range. Same as the values above 
quantile(x)     # get quartiles 
quantile(x, probs=c(.15, .25, .99))  # specific quantiles 
x[c(2,4)] <- NA       # redefine x 
x                     # vector with missing values 
sum(x)                # sum returns missing 
sum(x, na.rm=TRUE)    # unless we remove the NA's 
mean(x)               # mean also returns missing 
mean(x, na.rm=TRUE)   # unless we remove the NA's 
library(isdals)   # Get the data 
data(fev) 
library(Publish) 
univariateTable(Smoke ~ Age + Ht + FEV + Gender, data=fev) 
univariateTable(Smoke ~ Age + Q(Ht) + FEV + Gender, 
                showTotals=FALSE, data=fev) 
data(trees) 
result <- lm(Volume ~ Height, data=trees) 
result 
summary(result) 
data(trees) 
result <- lm(Volume ~ Height + Girth, data=trees) 
result 
summary(result) 
year <- seq(1981, 1999) 
deaths <- c(339, 1201, 3153, 6368, 12044, 19404, 29105, 
            36126, 43499, 49546, 60573, 79657, 79879, 
            73086, 69984, 61124, 49379, 43225, 41356) 
 
newyear <- year - 1980 
model <- lm(deaths ~ newyear + I(newyear^2) + I(newyear^3)) 
summary(model) 
library(isdals) 
data(antibio) 
head(antibio) 
model <- lm(org ~ type, data=antibio) 
model 
summary(model) 
drop1(model, test="F") 
library(isdals) 
data(antibio) 
model <- lm(org ~ type, data=antibio) 
drop1(model, test="F") 
result <- TukeyHSD(aov(model)) 
result 
plot(result) 
library(isdals) 
data(fev) 
# Convert variables to factors and get meaningful labels 
fev$Gender <- factor(fev$Gender, labels=c("Female", "Male")) 
fev$Smoke <- factor(fev$Smoke, labels=c("No", "Yes")) 
interaction.plot(fev$Gender, fev$Smoke, fev$FEV, 
                 lwd=3, col=c("black", "blue")) 
model <- lm(FEV ~ Gender + Smoke + Gender*Smoke, data=fev) 
model 
drop1(model, test="F") 
model2 <- lm(FEV ~ Gender + Smoke, data=fev) 
drop1(model2, test="F") 
summary(model2) 
library(isdals) 
data(fev) 
# Convert variables to factors and get meaningful labels 
fev$Gender <- factor(fev$Gender, labels=c("Female", "Male")) 
fev$Smoke <- factor(fev$Smoke, labels=c("No", "Yes")) 
summary(fev) 
plot(FEV ~ Ht, pch=20, col=c("Blue", "Black")[Smoke], 
   xlab="Height (inches)", ylab="FEV (liters)",  data=fev) 
model <- lm(FEV ~ Ht + I(Ht^2) + Smoke*Gender + Smoke*Age, 
            data=fev) 
drop1(model, test="F") 
model2 <- lm(FEV ~ Ht + I(Ht^2) + Gender + Smoke*Age, 
             data=fev) 
drop1(model2, test="F") 
# Remove the insignificant interaction and refit 
model3 <- lm(FEV ~ Ht + I(Ht^2) + Gender + Smoke + Age, 
             data=fev) 
drop1(model3, test="F") 
summary(model3) 
head(coef(result, s=exp(-1))) 
cvres$lambda.min 
cvres$lambda.1se 
head(coef(cvres, s = "lambda.min")) 
head(predict(cvres, newx = x[1:3,], s = "lambda.min")) 
library(MASS) 
data(birthwt) 
model <- glm(low ~ factor(race) + smoke + lwt, family=binomial, 
             data=birthwt) 
summary(model) 
exp(1.060 +  c(-1,1)*1.96*0.3783)    # 95% CI for smoking 
drop1(model, test="Chisq") 
exp(-0.10922 + 1.06001 + 120*-0.01326) / 
    (1 + exp(-0.10922 + 1.06001 + 120*-0.01326)) # Logit 
pnorm(-0.072490 + 0.652215 + 120*-0.008067)      # Probit 
library(pROC) 
pred <- predict(model, type=c("response")) 
roccurve <- roc(low ~ pred, data=birthwt) 
plot(roccurve) 
library(survival) 
library(MESS) 
data(matched) 
model <- clogit(iscase=="Case" ~ vaccine + lung + strata(id), 
                data=matched) 
summary(model) 
drop1(model, test="Chisq") 
library(ordinal) 
library(MESS) 
data(icecreamads) 
resp <- ordered(icecreamads$Consumption) 
model <- clm(Consumption ~ Temperature + Price + Advertise, 
             data=icecreamads) 
summary(model) 
exp(10*0.1473 + 10*qnorm(.975)*c(-1,1)*0.04001) 
exp(3.56)/(1 + exp(3.56)) 
exp(6.941-(0.147*50-6.11*0.28))/(1+exp(6.941-(0.147*50-6.11*0.28))) - 
exp(3.560-(0.147*50-6.11*0.28))/(1+exp(3.560-(0.147*50-6.11*0.28))) 
head(predict(model, type="prob")$fit) 
drop1(model, test="Chisq") 
library(nnet) 
multi <- multinom(Consumption ~ Price + Advertise + Temperature, 
                  data=icecreamads) 
1-pchisq(-2*as.numeric(logLik(model)) - deviance(multi), 
         df=multi$edf - model$edf) 
library(nnet) 
library(isdals) 
data(alligator) 
head(alligator) 
model <- multinom(food ~ length, data=alligator) 
summary(model) 
head(predict(model, response="probs")) 
len <- seq(0.3, 4, .1) 
matplot(len, predict(model, newdata=data.frame(length=len), 
                     type="probs"), 
        type="l", xlab="Alligator length", ylab="Probability") 
null <- multinom(food ~ 1, data=alligator) 
anova(null, model)               # Compare the two models 
library(MESS) 
data(bees) 
beedata <- subset(bees, Time=="july1") # Look at one day only 
head(beedata) 
model <- glm(Number ~ Locality + Type*Color, 
             family=poisson, data=beedata) 
summary(model) 
exp(2.5649 + 1.8718 - 2.1342) 
drop1(model, test="Chisq") 
model2 <- glm(Number ~ Locality + Type*Color, 
              family=quasipoisson, data=beedata) 
drop1(model2, test="Chisq") 
summary(model2) 
exp(-0.4745 + c(-1,1)*1.96*0.2881) # 95% CI Kragevig/Havreholm 
library(lme4) 
data(ChickWeight) 
head(ChickWeight) 
model <- lmer(log(weight) ~ Time + Diet + Time*Diet + (1|Chick), 
              data=ChickWeight) 
summary(model) 
model <- lmer(log(weight) ~ Time + Diet + Time*Diet + (1|Chick), 
              data=ChickWeight, REML=FALSE) 
noint <- lmer(log(weight) ~ Time + Diet + (1|Chick), 
              data=ChickWeight, REML=FALSE) 
anova(model, noint) 
library(nlme) 
data(ChickWeight) 
head(ChickWeight) 
model <- lme(log(weight) ~ Diet + Time + Time*Diet, 
             random= ~ 1|Chick, 
             correlation=corGaus(form= ~ Time|Chick, 
                                 nugget=TRUE), 
             data=ChickWeight) 
model 
model2 <- lme(log(weight) ~ Diet + Time + Time*Diet, 
              random= ~ 1|Chick, 
              correlation=corGaus(form= ~Time|Chick, 
                                  nugget=TRUE), 
              method="ML", data=ChickWeight) 
model3 <- lme(log(weight) ~ Diet + Time, random= ~ 1|Chick, 
              correlation=corGaus(form= ~Time|Chick, 
                                  nugget=TRUE), 
              method="ML", data=ChickWeight) 
anova(model3, model2) 
summary(model) 
model4 <- lme(log(weight) ~ Diet + Time + Time*Diet, 
              random= ~ 1|Chick, 
              correlation=corGaus(form= ~ Time|Chick), 
              data=ChickWeight) 
anova(model4, model) 
model5 <- lme(log(weight) ~ Diet + Time + Time*Diet, 
              random= ~ 1|Chick, 
              correlation=corExp(form= ~ Time|Chick, 
                                 nugget=TRUE), 
              data=ChickWeight) 
AIC(model) 
AIC(model5) 
library(lme4) 
data(cbpp) 
model <- glmer(cbind(incidence, size-incidence) ~ 
               period + (1|herd), family=binomial, data=cbpp) 
summary(model) 
exp(-0.9923) 
model2 <- glmer(cbind(incidence, size-incidence) ~ 1 + 
                (1|herd), family=binomial, data=cbpp) 
anova(model2, model) 
cbpp$obs <- 1:nrow(cbpp)   # Construct vector for overdispersion 
model3 <- glmer(cbind(incidence, size - incidence) ~ period + 
                (1|herd) + (1|obs), family=binomial, data=cbpp) 
summary(model3) 
library(geepack) 
library(MESS) 
data(ohio) 
ordered.clusters(ohio$id)      # Check that clusters are ordered 
fage <- factor(ohio$age+9)     # Add 9 to get the right label 
model <- geeglm(resp ~ fage + smoke + fage:smoke, id=id, data = ohio, 
                family = binomial, corstr="exchangeable") 
summary(model) 
drop1(model)               # Generalized Wald test 
drop1(model, test="score") # Score test 
model2 <- geeglm(resp ~ fage + smoke, id=id, data = ohio, 
                 family = binomial, corstr="exchangeable") 
summary(model2) 
sqrt(diag(model$geese$vbeta.naiv)) 
model3 <- geeglm(resp ~ fage + smoke, id=id, data = ohio, 
                 family = binomial, corstr="ar1") 
summary(model3) 
library(MESS) 
data(greenland) 
temp <- ts(greenland$airtemp, start=1960) 
temp 
plot(temp) 
acf(temp, na.action=na.pass) 
acf(diff(temp), na.action=na.pass) 
pacf(temp, na.action=na.pass) 
library(forecast) 
model <- Arima(temp, order=c(1,1,1), include.drift=TRUE) 
model 
forecast(model, h=15) 
plot(forecast(model, h=15)) 
auto.arima(temp) 
data(AirPassengers) 
model <- stl(log(AirPassengers), s.window="periodic") 
plot(model) 
summary(model) 
acf(model$time.series[,3])  # Plot autocorrelation of residuals 
library(MASS) 
data(cats) 
head(cats) 
t.test(Bwt ~ Sex, data=cats) 
t.test(Bwt ~ Sex, data=cats, var.equal=TRUE) 
x <- c(1.3, 5.5, -2.1, 0.9, -0.4, 1.1) 
t.test(x, mu=0) 
library(MESS) 
data(qpcr) 
# Use data from just one of the 14 runs 
run1 <- subset(qpcr, transcript==1 & line=="wt") 
model <- nls(flour ~ fmax/(1+exp(-(cycle-c)/b))+fb, 
             start=list(c=25, b=1, fmax=100, fb=0), 
             data=run1) 
model 
plot(run1$cycle, run1$flour, 
     xlab="Cycle", ylab="Fluorescence") 
lines(run1$cycle, predict(model)) # Add the fitted line 
summary(model) 
confint(model) 
library(AER) 
library(MESS) 
data(nh4) 
model <- tobit(log(nh4) ~ factor(year), left=log(0.00999), data=nh4) 
summary(model) 
drop1(model, test="Chisq") 
library(pscl) 
library(AER) 
data(Affairs) 
model <- zeroinfl(affairs ~ age + gender + children, 
                  dist="poisson", data=Affairs) 
summary(model) 
m2 <- zeroinfl(affairs ~ age + gender + children | 1, 
               dist="poisson", data=Affairs) 
summary(m2) 
drop1(model, test="Chisq") 
library(MESS) 
data(lifeexpect) 
 
matplot(lifeexpect$myear, cbind(lifeexpect$male, lifeexpect$female), 
        xlab="Year", ylab="Life expectancy", 
        pch=20, col=c("black", "blue")) 
 
lo <- loess(male ~ myear, span=.25, data=lifeexpect) 
lo2 <- loess(male ~ myear, span=1, data=lifeexpect) 
lines(lifeexpect$myear, predict(lo)) 
lines(lifeexpect$myear, predict(lo2), lty=2) 
library(mgcv) 
library(isdals) 
data(fev) 
model <- gam(FEV ~ Gender + Smoke + s(Age) + s(Ht), data=fev) 
summary(model) 
plot(model) 
linmodel <- gam(FEV ~ Gender + Smoke + Age + s(Ht), data=fev) 
anova(linmodel, model, test="Chisq") 
predict(model, newdata=data.frame(Age=14, Ht=60, Smoke=1, Gender=0)) 
model2 <- gam(FEV ~ Gender + Smoke + s(Age, Ht), data=fev) 
summary(model2) 
plot(model2, pers=TRUE) 
library(metafor) 
yi  <- c(0.095, 0.277, 0.367, 0.664, 0.462, 0.185) 
sei <- c(0.033, 0.031, 0.050, 0.011, 0.043, 0.023) 
res <- rma(yi, sei=sei) 
res 
res2 <- rma(yi, sei=sei, method="FE") 
res2 
data("dat.bcg") 
dat.bcg 
rma(measure="RR", ai=tpos, bi=tneg, ci=cpos, di=cneg, data=dat.bcg) 
url <- "http://www.biostatistics.dk/data/winequality-red.csv" 
indata <- read.csv(url, sep=";", header=TRUE) 
indata$taste <- cut(indata$quality, breaks=c(0, 4, 6, 10), 
                    labels=c("Bad", "Normal", "Good")) 
head(indata) 
pick <- sample(nrow(indata), 0.63 * nrow(indata)) 
traindata <- indata[pick,]  # Select a random sample for training 
testdata <- indata[-pick,]  # and put the rest in test data 
traindata$quality <- NULL   # Remove the quality variable 
testdata$quality <- NULL    # from training and test data 
table(traindata$taste) 
library(caret) 
 
fit <- train(taste ~ . , data=traindata, method="rf", 
             trControl=trainControl(method="cv",number=5)) 
print(fit) 
mtry <- expand.grid(mtry = 5) 
train(taste ~ . , data=traindata, method="rf", tuneGrid=mtry, 
             trControl=trainControl(method="cv",number=5)) 
confusionMatrix(fit) 
varImp(fit, scale=FALSE) 
pred <- predict(fit, newdata=testdata) 
confusionMatrix(pred, testdata$taste) # Compare 
library(quantreg) 
library(isdals) 
data(fev) 
res <- rq(FEV ~ Age + Gender + Smoke, data=fev) 
summary(res) 
summary(res, se="nid") 
summary(lm(FEV ~ Age + Gender + Smoke, data=fev)) 
res <- rq(FEV ~ Age + Gender + Smoke, data=fev, 
          tau=c(0.1, 0.25, 0.5, 0.75, 0.9)) 
plot(summary(res), lcol="blue", level=0.95) 
anova(res, joint=FALSE) 
start <- rq(FEV ~ Age + Gender + Smoke, data=fev, tau=0.5) 
nested <- rq(FEV ~ Gender + Smoke, data=fev, tau=0.5) 
anova(start, nested) 
library(nortest) 
x <- rnorm(100, mean=5, sd=2)         # Normal distribution 
y <- (rnorm(100, mean=5, sd=1.5))**2  # Right-skewed distribution 
shapiro.test(x) 
ad.test(x) 
cvm.test(x) 
lillie.test(x) 
shapiro.test(y) 
ad.test(y) 
cvm.test(y) 
lillie.test(y) 
data(ToothGrowth) 
bartlett.test(len ~ factor(dose), data=ToothGrowth) 
library(car) 
leveneTest(len ~ factor(dose), data=ToothGrowth) 
fligner.test(len ~ factor(dose), data=ToothGrowth) 
bartlett.test(len ~ interaction(supp, dose), data=ToothGrowth) 
cr2 <- cumres(model2)  # Save the cumulated residuals 
plot(cr2)              # and make the plots 
m <- matrix(c(23, 20, 12, 5, 6, 9), ncol=3)   # Input data 
m 
chisq.test(m)                          # Chi-square test 
fisher.test(m)                         # Fisher's exact test 
m2 <- matrix(c(35, 25, 6, 9), ncol=2)  # Input data 
m2 
chisq.test(m2) 
chisq.test(m2, correct=FALSE)          # No continuity correction 
fisher.test(m2, alternative="greater") # One-sided alternative 
library(MESS) 
data(smokehealth) 
smokehealth 
m <- smokehealth 
m[,3] <- m[,3]+ m[,4]   # Collapse last two columns 
m[4,] <- m[4,] + m[5,]  # Collapse last two rows 
m <- m[1:4,1:3]         # Extract result 
m 
gkgamma(m) 
chisq.test(m) 
m <- matrix(c(29, 17, 5, 35), 2, 
            dimnames = list("Baseline"=c("Low PI", "High PI"), 
                            "4 weeks"=c("Low PI", "High PI"))) 
m 
mcnemar.test(m) 
binom.test(17, (5+17)) 
library(MASS) 
data(survey) 
mydata <- as.data.frame(xtabs(~ Sex + Clap + Exer, 
                              data=survey)) 
head(mydata) 
full <- glm(Freq ~ Sex*Clap*Exer, family=poisson, data=mydata) 
indep <- glm(Freq ~ Sex+Clap+Exer, family=poisson, data=mydata) 
anova(indep, full, test="Chisq") 
block1 <- glm(Freq ~ Sex + Clap*Exer, family=poisson, 
              data=mydata) 
anova(block1, full, test="Chisq") 
cond1 <- glm(Freq ~ Sex*Exer+Clap*Exer, family=poisson, 
             data=mydata) 
anova(cond1, full, test="Chisq") 
library(MethComp) 
library(MESS) 
data(rainman) 
head(rainman) 
mydata <- Meth(item=1, y=3:5, data=rainman) 
result <- BA.est(mydata, Transform="log")  # Use log transform 
result 
result2 <- BA.est(mydata, random.raters=TRUE, Transform="log") 
result2 
result2$LoA 
library(irr) 
data(anxiety) 
head(anxiety) 
ftable(anxiety[,1:2]) # Print the concordance table 
kappa2(anxiety[,1:2]) # Compute kappa for raters 1 and 2 
library(MESS) 
m <- matrix(c(10, 4, 1, 5, 9, 4, 0, 3, 13), 3) 
m 
kappa2(expand_table(m)) 
kappa2(anxiety[,1:2], weight="squared") 
kappam.fleiss(anxiety) 
library(lubridate) 
data(airquality) 
dayno <- yday(ymd(paste("1973", 
                        airquality$Month, 
                        airquality$Day, sep="-"))) 
head(dayno) 
model <- lm(cbind(Ozone, Temp, Solar.R) ~ dayno + I(dayno**2), 
            data=airquality) 
model 
anova(model) 
library(cluster) 
data(agriculture) 
fit2 <- kmeans(agriculture, centers=2)   # 2 cluster solution 
fit2 
# Calculate the total within sum of squares for different 
# number of clusters (here 2-6) 
sapply(2:6, function(i) { 
       sum(kmeans(agriculture, centers=i)$withinss)}) 
scale.agriculture <- scale(agriculture) 
fit2s <- kmeans(scale.agriculture, centers=2) 
fit2s 
sapply(2:6, function(i) { 
       sum(kmeans(scale.agriculture, centers=i)$withinss)}) 
distances <- dist(agriculture) 
hierclust <- hclust(distances) 
hierclust 
hierclust.single <- hclust(distances, method="single") 
plot(hierclust) 
plot(hierclust.single) 
cutree(hierclust, k=2)         # Cut tree to get 2 groups 
cutree(hierclust.single, h=5)  # Cut at height 5 
library(MASS) 
data(biopsy) 
names(biopsy) 
predictors <- biopsy[complete.cases(biopsy),2:10] 
fit <- prcomp(predictors, scale.=TRUE) 
summary(fit) 
plot(fit) 
biplot(fit, col=c("gray", "blue")) 
fit 
head(predict(fit)) 
fit <- prcomp(~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9, 
              data=biopsy, scale.=TRUE) 
library(MASS) 
data(biopsy) 
names(biopsy) 
fit <- prcomp(~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9, 
              data=biopsy, scale.=TRUE) 
summary(fit) 
pc <- predict(fit)[,1:4]  # Select first 4 principal comp. 
head(pc) 
y <- biopsy$class[complete.cases(biopsy)] # Complete outcome cases 
model <- glm(y ~ pc, family=binomial)     # Fit the pcr model 
summary(model) 
library(MASS) 
data(biopsy) 
fit <- lda(class ~ V1 + V2 + V3, data=biopsy) 
fit 
plot(fit, col="blue") 
lapply(predict(fit), head, 4)  # List the first 4 
result <- table(biopsy$class, predict(fit)$class) 
result 
sum(diag(result)) / sum(result) 
# Split the data up into random training and test samples 
train <- sample(1:nrow(biopsy), ceiling(nrow(biopsy)/3)) 
testdata <- biopsy[-train,] 
trainfit <- lda(class ~ V1 + V2 + V3, data=biopsy, subset=train) 
trainfit 
result2 <- table(testdata$class, 
                 predict(trainfit, testdata)$class) 
result2 
sum(diag(result2)) / sum(result2) 
library(pls) 
data(gasoline) 
gasTrain <- gasoline[1:50,]  # Training dataset with 50 samples 
gasTest <- gasoline[51:60,]  # Test dataset with 10 samples 
gas1 <- plsr(octane ~ NIR, ncomp=5, 
             data=gasTrain, validation="LOO") 
summary(gas1) 
plot(gas1, ncomp=3, line=TRUE) 
plot(gas1, "loadings", comps=1:3, col=c("black", "blue", "black")) 
RMSEP(gas1, newdata=gasTest) 
library(boot) 
set.seed(1)           # Set seed to reproduce simulations 
mydata <- rnorm(150, mean=rep(rnorm(30), 5)) 
mydata <- matrix(mydata,ncol=5) 
head(mydata) 
heritability <- function(data, indices=1:nrow(data)) { 
  r <- ncol(data) 
  df <- stack(as.data.frame(t(data[indices,]))) 
  result <- anova(lm(values ~ ind, data=df)) 
  mssire <- result[1,3] 
  mswithin <- result[2,3] 
  heritab <- 4*(mssire - mswithin) / ((r+1)*mssire - mswithin) 
  return(heritab) 
} 
 
heritability(mydata) 
result <- boot(mydata, heritability, R=10000) 
result 
plot(result) 
boot.ci(result) 
library(boot) 
data(trees) 
fit1 <- glm(Volume ~ Girth + Height, data=trees) 
cv.err.loo <- cv.glm(trees, fit1) 
cv.err.loo$delta 
cv.err.5 <- cv.glm(trees, fit1, K=5)  # 5 fold cross validation 
cv.err.5$delta 
res <- sapply(1:200, function(i) {cv.glm(trees, fit1, K=5)$delta[1]}) 
mean(res) 
fit2 <- glm(log(Volume) ~ log(Girth) + log(Height), data=trees) 
mycost <- function(responses, expected) { 
                   mean((exp(responses) - exp(expected))**2) } 
cv2.err.loo <- cv.glm(trees, fit2, mycost) 
cv2.err.loo$delta 
library(bootstrap)      # For the crossval function 
library(nnet)           # For the nnet function 
library(MASS)           # For the survey data frame 
data(survey) 
cc <- complete.cases(survey$Exer, survey$Sex, 
                     survey$Smoke, survey$Age) 
full <- survey[cc,]     # Keep only the complete cases 
fit <- nnet(Exer ~ Sex + Smoke + Age, data=full, 
            size=10, rang=.05, maxiter=200, trace=FALSE) 
fit 
table(full$Exer, predict(fit, type="class")) 
# Calculate the number of misclassifications 
sum(predict(fit, type="class") != full$Exer) 
index <- 1:nrow(full) # Work-around to identify obs. 
theta.fit <- function(x,y) { 
  nnet(Exer ~ Sex + Smoke + Age, data=full, size=10, 
       subset=x, rang=.05, maxiter=200, trace=FALSE) } 
theta.predict <- function(fit, x) { 
  predict(fit, full[x,], type="class") } 
 
o <- crossval(index, full$Exer, theta.fit, theta.predict, 
              ngroup=5) 
table(full$Exer, o$cv.fit) 
sum(o$cv.fit != full$Exer, na.rm=TRUE) 
power.t.test(delta=2, sd=3.6, power=.8, sig.level=0.05) 
power.t.test(n=20, delta=.70, sd=1, sig.level=0.05, 
             type="one.sample") 
power.prop.test(p1=0.43, p2=0.75, power=.8, sig.level=0.05) 
groupmeans <- c(4.5, 8, 6.2, 7.0) 
power.anova.test(groups = length(groupmeans), 
                 between.var=var(groupmeans), 
                 within.var=5.8, power=.80, sig.level=0.05) 
n <- seq(20, 50)          # Consider sample sizes from 20 to 50 
plot(n, power.t.test(n, delta=.7)$power, type="l", 
     ylim=c(0.55,1), ylab="Power", 
     xlab="Number of observations per group") 
lines(n, power.t.test(n, delta=.8)$power, lty=2) 
lines(n, power.t.test(n, delta=1)$power, lty=3) 
 
delta <- seq(0, 1.6, .1)  # Consider effect sizes from 0 to 1.6 
plot(delta, power.t.test(n=25, delta=delta)$power, 
     type="l", ylim=c(0,1), xlab="Effect size", 
     ylab="Power") 
lines(delta, power.t.test(n=40, delta=delta)$power, lty=2) 
lines(delta, power.t.test(n=60, delta=delta)$power, lty=3) 
x <- rbinom(50, size=1, prob=.6)  # Generate random data with 60% 1s 
x 
wilcox.test(x, mu=0.5)            # Test median equal to 0.5 
x2 <- x - 0.5                     # Deduct 0.5 from all observations 
wilcox.test(x2)                   # Make the same test with median 0 
sample1 <- rbinom(24, size=4, p=0.5) 
sample1 
sample2 <- sample1 + rbinom(24, size=3, p=0.5) - 1 
sample2 
wilcox.test(sample1, sample2, paired=TRUE) 
library(coin) 
mu <- rep(.5, length(x)) 
wilcoxsign_test(x ~ mu, distribution=exact()) 
wilcoxsign_test(sample1 ~ sample2, distribution=exact()) 
data(ToothGrowth) 
wilcox.test(len ~ supp, data=ToothGrowth) 
library(coin) 
wilcox_test(len ~ supp, data=ToothGrowth, distribution=exact()) 
data(ToothGrowth) 
# Use only data on ascorbic acid 
teeth <- subset(ToothGrowth, supp=="VC") 
kruskal.test(len ~ factor(dose), data=teeth) 
library(pgirmess) 
kruskalmc(len ~ factor(dose), data=teeth) 
library(coin) 
kruskal_test(len ~ factor(dose), data=teeth, 
             distribution=approximate(B=9999)) 
library(isdals) 
data(cabbage) 
friedman.test(yield ~ method | field, data=cabbage) 
library(coin) 
friedman_test(yield ~ method | factor(field), 
              data=cabbage, distribution=approximate(B=9999)) 
library(survival) 
data(ovarian) 
head(ovarian) 
# Fit Kaplan-Meier curves for each treatment 
model <- survfit(Surv(futime, fustat) ~ rx, data=ovarian) 
summary(model) 
plot(model, lty=2:3, xlab="Days", ylab="Survival") # Plot them 
survdiff(Surv(futime, fustat) ~ rx, data=ovarian) 
library(coin) 
logrank_test(Surv(futime, fustat) ~ factor(rx), 
             data=ovarian, distribution="exact") 
validate <- cox.zph(model) 
validate 
model4 <- coxph(Surv(futime, fustat) ~ rx + strata(resid.ds), 
                data=ovarian) 
summary(model4) 
library(survival) 
data(heart) 
head(jasa1) 
model <- coxph(Surv(start, stop, event) ~ 
               transplant + year + year*transplant, 
               data=jasa1) 
summary(model) 
cox.zph(model) 
nd <- data.frame(start=c(0, 50, 0), stop=c(50, 250, 300), event=1, 
                 transplant=c(0, 1, 0), year=1, id=c(1, 1, 2)) 
nd 
plot(survfit(model, newdata=nd, id=id), 
     xlab="Time since admission (days)", ylab="Survival", 
     col=c("black", "blue"), conf.int=TRUE) 
head(sort(pval), 8)                 # Show smallest p-values 
head(p.adjust(sort(pval)), 8)       # Holm correction 
head(p.adjust(sort(pval), method="fdr"), 8) # FDR 
library(MASS) 
data(trees) 
result <- boxcox(Volume ~ log(Height) + log(Girth), data=trees, 
                 lambda=seq(-0.5, 0.6, length=13)) 
result$x[which.max(result$y)] 
y <- rnorm(100) + 5       # Generate data 
y2 <- y**2                # Square the results 
result <- boxcox(y2 ~ 1, plotit=FALSE) 
result$x[which.max(result$y)] 

Chapter 4

x <- 1:5                     # Generate data 
y <- x^2                     # Generate data 
plot(x, y, pch=3:7)          # Use symbols 3-7 for the points 
plot(x, y, pch=c("a", "b"))  # Use symbols a b a b a 
plot(x, y, col="red")        # Each symbol is red 
plot(x, y, pch=c("a", "b"),  # Symbols are a b a b a in colors 
     col=c("red", "blue", "green")) # red blue green red blue 
# Plot circles with a thick red border and blue fill color 
plot(x, y, pch=21, lwd=3, col="red", bg="blue") 
plot(x, y, cex=seq(1,2,.25)) # Symbols with increasing size 
plot(x, y, type="l", lty=2)  # Line with line type 2 
plot(x, y, type="l", lwd=3)  # Line with line width 3 
par(lwd=3, pch=16)           # Line width 3 and symbol 16 
par(cex.lab=2, cex.main=3)   # Large labels and larger title 
plot(x, y, xlim=c(0, 10))    # x axis goes from 0 to 10 
plot(x, y, log="y")          # Logarithmic y axis 
plot(x, y, las=1)            # Rotate numbers on y axis 
colors() 
plot(1:10, col="cyan")         # Plot using cyan color 
plot(1:10, col="#cc0076")      # Plot using purplish color 
rgb(10, 20, 30, maxColorValue=255) 
rgb(10, 20, 30, maxColorValue=100) 
mycolor <- rgb(10, 20, 30, alpha=50, maxColorValue=100) 
plot(1:10, col=mycolor)   # Use the new colour 
palette()             # Show the current palette 
palette(c("red", "#1A334D", "black", rgb(.1, .8, 0))) 
palette() 
palette("default")    # Use the default palette 
library(RColorBrewer) 
head(brewer.pal.info, n=10) 
brewer.pal(n=5, name="Spectral")       # 5 colors from Spectral 
brewer.pal(n=8, name="Accent")        # 8 colors from Accent 
# Set colors locally for a single plotting call 
plot(1:8, col=brewer.pal(n=8, name="Reds"), pch=16, cex=4) 
palette(brewer.pal(n=8, name="Greens")) # Set global palette 
plot(1:8, col=1:8, pch=16, cex=4)       # Plot using colors 
library(MESS) 
data(happiness) 
with(happiness, symbols(tax, happy, circles=sqrt(population)/8, 
        inches=FALSE, bg="lightgray", 
        xlab="Tax (% of GDP)", ylab="Happiness")) 
data(LakeHuron) 
hist(LakeHuron, main=NULL)  # Create histogram with no title 
boxplot(ToothGrowth$len, horizontal=TRUE, range=0) 
barplot(c(2,5,6,3), col="blue", 
        names.arg=c("Four", "shades", "of", "blue")) 
 
m <- matrix(c(2,5,6,3),2) 
m 
 
barplot(m, col=c("lightgray", "blue"), 
        names.arg=c("Group 1", "Group 2")) 
m <- matrix(c(23, 20, 12, 5, 6, 9), ncol=3)  # Input data 
relfrq <- prop.table(m, margin=1)            # Calc rel. freqs 
relfrq 
# Make juxtaposed barplot 
barplot(relfrq,  beside=TRUE, 
        legend.text=c("Hadsten", "Hammel"), 
        names=c("<0.01", "0.01-0.10", ">0.10")) 
 
# Stacked relative barplot with labels added 
barplot(t(relfrq), names=c("Hadsten", "Hammel"), horiz=TRUE) 
airquality$Month <- factor(airquality$Month) 
model <- lm(Temp ~ Month - 1, data=airquality) 
mean.values <- coef(summary(model))[,1] 
mean.values 
 
sem <- coef(summary(model))[,2] 
scale <- qt(0.975, df=summary(model)$df[2]) 
barplot2(mean.values, col="blue", 
         xlab="Month", ylab="Temperature (F)", 
         plot.ci=TRUE, 
         ci.u = mean.values+scale*sem, 
         ci.l = mean.values) 
library(gplots) 
data(OrchardSprays) 
fit <- lm(decrease ~ treatment - 1, data=OrchardSprays) 
summary(fit) 
plotCI(coef(summary(fit))[,1], 
       uiw=qt(.975, df=56)*coef(summary(fit))[,2], 
       ylab="Estimates and 95% confidence intervals", xaxt="n") 
axis(1, 1:length(rownames(coef(summary(fit)))), 
     rownames(coef(summary(fit)))) 
library(plotrix) 
men <- c(334700, 357834, 328545, 367201, 411689, 
         359363, 336841, 178422, 72296, 9552, 139, 0) 
women <- c(318662, 340357, 320462, 365180, 401521, 
           357405, 345884, 208057, 117843, 27914, 761, 0) 
groups <- paste(seq(0, 110, 10), "-", seq(9, 119, 10), sep="") 
groups                             # Show group labels 
men <- 100*men/sum(men)            # Normalize men 
women <- 100*women/sum(women)      # and women 
pyramid.plot(men, women, labels=groups, 
            lxcol="lightgray", rxcol="lightgray") 
library(MASS) 
data(survey) 
# Tabulate exercise and smoking status by gender 
result <- by(survey, survey$Sex, 
             function(x) {table(x$Smoke, x$Exer)}) 
result 
x1 <- 1:6 
y1 <- x1 
x2 <- c(1.5, 2.5, 3.5, 4.5, NA, NA) 
y2 <- sqrt(x2) 
y3 <- 7-x1 
y3[4] <- NA 
y3 
matplot(cbind(x1, x2, x1), cbind(y1, y2, y3), 
        type=c("p", "p", "l"), lwd=3) 
x <- seq(0, 3, .2)    # Set grid points for x 
y <- seq(0, 4, .1)    # Set grid points for y 
# Compute function at all grid points 
z <- outer(x, y, FUN=function(xx,yy) { sin(xx*yy) }) 
contour(x, y, z)      # Make contour plot 
contour(x, y, z, levels=c(-1, -.3, 0, .1, .2, .3, .8, 1)) 
filled.contour(x, y, z, color.palette=grey.colors) 
 
library(RColorBrewer) 
blue.colors <- brewer.pal(9, "Blues") 
image(x, y, z, col=blue.colors) 
x <- seq(0, 3, .2)        # Set grid for x dimension 
y <- seq(0, 4, .1)        # Set grid for y dimension 
z <- outer(x, y, FUN=function(xx,yy) { sin(xx*yy) }) 
persp(x, y, z) 
library(scatterplot3d) 
data(trees) 
scatterplot3d(trees$Girth, trees$Height, trees$Volume, 
              xlab="Girth", ylab="Height", zlab="Volume") 
scatterplot3d(trees$Girth, trees$Height, trees$Volume, 
              type="h", color=grey(31:1/40), 
              xlab="Girth", ylab="Height", zlab="Volume") 
o <- order(trees$Volume) 
scatterplot3d(trees$Girth[o], trees$Height[o], trees$Volume[o], 
              type="l", scale.y=3, angle=75, 
              xlab="Girth", ylab="Height", zlab="Volume") 
myplot <- scatterplot3d(trees$Girth, trees$Height, trees$Volume, 
                        type="h", angle=60, 
                        xlab="Girth", ylab="Height", zlab="Volume") 
model <- lm(Volume ~ Girth + Height, data=trees)   # Fit a model 
myplot$plane3d(model, col="blue")                  # Add fitted plane 
data(eurodist) 
x <- as.matrix(eurodist) 
 
library(RColorBrewer) 
blue.colors <- brewer.pal(9, "Blues") 
heatmap(x, Rowv=NA, Colv=NA, col=blue.colors) 
colnames(x) 
# Add a color bar to the heatmap 
areas <- c("Greece", "Iberia", "Belgium", "France", "France", 
           "Germany", "Scandinavia", "Central Europe", "Iberia", 
           "Germany", "Holland", "Iberia", "France", "Iberia", 
           "France",  "Italy", "Germany", "France", "Italy", 
           "Scandinavia", "Central Europe") 
f.areas <- factor(areas) 
area.colors <- gray.colors(nlevels(f.areas)) 
heatmap(x, col=blue.colors, ColSideColors = area.colors[f.areas], 
        RowSideColors = area.colors[f.areas]) 
library(gplots)              # Load the gplots package 
list1 <- c("a", "b", "c", "d", "e", "f", "g", "h") 
list2 <- c("a", "b", "c", "i", "j", "k", "l", "m", "p") 
list3 <- c("a", "b", "d", "e", "i", "j", "k", "n", "o") 
venn(list(Group1=list1, list2=list2, "Set 3"=list3)) 
list4 <- c("a", "b", "d", "i", "p")    # Add extra group 
lets <- tolower(LETTERS)[1:17]         # Set possible names 
# Create a data frame 
df <- data.frame(g1=lets %in% list1, g2=lets %in% list2, 
                 g3=lets %in% list3, g4=lets %in% list4) 
head(df, 10) 
venn(df) 
library(plotrix) 
year <- 2001:2005 
prevalence <- c(10, 15, 12, 60, 65) 
prevalence[prevalence>40] <- prevalence[prevalence>40] - 40 
prevalence 
design <- matrix(c(1,1,0,2), 2, 2, byrow=TRUE) 
design                          # Show the layout matrix 
layout1 <- layout(design) 
layout.show(layout1) 
x <- exp(rnorm(100))            # Simulate data 
y <- (rnorm(100)+3)**2          # Simulate more data 
plot(x, y)                      # Make a scatter plot 
hist(x, main="")                # and a histogram 
data(CO2) 
matplot(matrix(CO2$conc, ncol=12), matrix(CO2$uptake, ncol=12), 
        type="l", lwd=2, lty=rep(c(1,2), c(6,6)), xlim=c(80, 1100), 
        ylim=c(0, 45), xlab="Concentration", ylab="Uptake", 
        col=rep(c("black", "blue", "black", "blue"), times=rep(3,4))) 
legend(600, 9, legend=c("Quebec - nonchilled", "Quebec - chilled", 
               "Mississippi - nonchilled", "Mississippi - chilled"), 
       lty=c(1, 1, 2, 2), col=rep(c("black", "blue"), 2)) 
library(plotrix) 
cats <- matrix(c(53, 115, 17, 502, 410, 14),ncol=2) 
rownames(cats) <- c("Yes", "No", "No info") 
colnames(cats) <- c("Problems", "No problems") 
cats 
origintreat <- interaction(CO2$Treatment, CO2$Type) 
head(origintreat) 
levels(origintreat) 
plotcode <- as.numeric(origintreat) 
plot(CO2$conc, CO2$uptake, cex=2.6, 
     pch=c(21, 22, 21, 22)[plotcode], 
     bg=c("black", "blue", "white", "lightblue")[plotcode], 
     xlab="Concentration", ylab="Uptake") 
library(rgl) 
data(trees) 
plot3d(trees$Height, trees$Girth, trees$Volume, 
       type="s", size=2) # Plot spheres 
plot3d(trees$Height, trees$Girth, trees$Volume, 
       type="h")         # Vertic. lines 
data(volcano) 
z <- volcano            # Height 
x <- 10 * (1:nrow(z))   # 10 meter spacing (S to N) 
y <- 10 * (1:ncol(z))   # 10 meter spacing (E to W) 
persp3d(x, y, z, smooth=TRUE, col="lightgray", 
        xlab="NS axis", ylab="EW axis", zlab="Height") 
persp3d(x, y, z, col="darkgray", front="line", back="cull", 
        xlab="NS axis", ylab="EW axis", zlab="Height") 
dev.print(file="pngfile.png", device=png, width=480) 
dev.print(file="jpgfile.jpeg", device=jpeg, 
          width=480, height=240) 
names(pdfFonts()) 

Chapter 5

sd 
boxplot 
methods(boxplot) 
boxplot.matrix   # See source for non-hidden method 
boxplot.formula  # Try to see source for hidden method 
getAnywhere("boxplot.formula") 
library(lme4) 
refit           # Try to show the source for the refit function 
showMethods("refit")     # Show available methods 
# List the source code for refit for class type numeric 
findMethods("refit", classes="numeric") 
library(microbenchmark) 
m1 <- matrix(rnorm(120000), ncol=12)  # Simulate data 
m2 <- rep(1, 10000)                   # Vector of 1s for multiplication 
microbenchmark(apply(m1, 2, sum),     # Compare apply, 
               m2 %*% m1,             # matrix multiplication, 
               colSums(m1))           # and colSums methods 
# Example of .Rprofile to have the foreign package loaded 
# automatically when R launches 
local({ 
       options(defaultPackages=c(getOption("defaultPackages"), 
                                           "foreign")) 
      }) 
x <- 1:5 
myfactor <- factor(c("a", "b", "b", "b", "c", "c")) 
ls()                # List local objects 
x                   # Print x 
rm(x)               # Remove object x 
x                   # Try to print x 
ls() 
ls()                     # List the current objects 
x <- 1:5                 # Create a new vector 
ls()                     # List the current objects 
save.image(".RData")     # Save the workspace to .RData 
ls()                     # List the current objects 
load(".RData")           # Load the saved image 
ls()                     # List the current objects 
x 
source("myanalysis.txt") 
ls() 
y 
when <- system("date", intern=TRUE) 
when 
session_info("MESS")