The R Primer

R code

Below is the raw R code from the book.

  1. Chapter 1: Importing data
  2. Chapter 2: Data handling
  3. Chapter 3: Statistical analyses
  4. Chapter 4: Graphics
  5. Chapter 5: R

Chapter 1: Importing data



Rule 1.1
indata <- read.table("mydata.txt", header=TRUE)
indata
indata <- read.table("mydata.txt", header=TRUE, 
                     na.strings=c("NA", ""))
indata
indata <- read.table("mydata.txt", header=TRUE, 
                     na.strings=c("NA", "", "."))
indata
indata <- read.table("d:/mydata.txt", header=TRUE)


Rule 1.2
library(XML)
url <- "http://www.statistics.life.ku.dk/primer/mydata.xml"
indata <- xmlToDataFrame(url)
head(indata)


Rule 1.3
library(XML)
# Location of the example XML file
url <- "http://www.statistics.life.ku.dk/primer/bank.xml"
# Parse the tree 
doc <- xmlTreeParse(url, useInternalNodes=TRUE)
top <- xmlRoot(doc)         # Identify the root node
xmlName(top)                # Show node name of root node
names(top)                  # Name of root node children
xmlValue(top[[1]])          # Access first element 
xmlValue(top[["author"]])   # First element with named index
names(top[[3]])             # Children of node 3
xmlAttrs(top[[3]][[1]])     # Extract tags from a Date node
top[["rates"]][[1]][[1]]    # Tree from first bank and date
xmlValue(top[[3]][[1]][[1]][[1]]) # Bank name is node value
xmlAttrs(top[[3]][[1]][[1]][[1]]) # but has no tags
xmlAttrs(top[[3]][[1]][[1]][[2]]) # The exch node has tags
xmlValue(top[[3]][[1]][[1]][[2]]) # but no value
# Search tree for all source nodes and return their value
xpathSApply(doc, "//source", xmlValue)
# Search full tree for all exch nodes where currency is "DKK"
xpathApply(doc, "//exch[@currency='DKK']", xmlAttrs)
res <- xpathApply(doc, "//exch",  
         function(ex) {
            c(xmlAttrs(ex),
              bank=xmlValue(xmlParent(ex)[["source"]]),
              date=xmlGetAttr(xmlParent(xmlParent(ex)), "time"))
           })
result <- do.call(rbind, res)
result


Rule 1.4
library(RODBC)
# Connect to SQL database with username and password
channel <- odbcConnect("mydata", uid="tv", pwd="office") 
sqlTables(channel)               # List tables in the database
mydata <- sqlFetch(channel, "paper")      # Fetch entire table
mydata
sqlQuery(channel, 
         "SELECT * FROM paper WHERE Sales>12 ORDER BY Person")
odbcClose(channel)


Rule 1.5
indata <- read.csv("mydata.csv")
head(indata)


Rule 1.6
library(xlsx)
goesforth <- read.xlsx("Documents/cunningplan.xlsx", 1)
goesforth


Rule 1.7
library(RODBC)
channel <- odbcConnectExcel("example.xls")
indata <- sqlFetch(channel, "Sheet1")
odbcClose(channel)
indata


Rule 1.8


Rule 1.9
mydata <- read.table(file="clipboard", header=TRUE) # Windows/X
mydata <- read.table(pipe("pbpaste"), header=TRUE)  # Mac OS X
mydata
mydata <- read.table(file="clipboard", sep="\t", header=TRUE)


Rule 1.10
library(foreign)
indata <- read.xport("somefile.xpt") 


Rule 1.11
library(foreign)
indata <- read.spss("spssfilename.sav", to.data.frame = TRUE)


Rule 1.12
library(foreign)
indata <- read.dta("statafile.dta")


Rule 1.13
library(foreign)
indata <- read.systat("systatfile.syd")


Rule 1.14
data(trees)
trees$Girth[2] <- NA    # Set to missing
write.table(trees, file="mydata.txt")
write.table(trees, file="mydata2.txt",
            dec=",", eol="\r\n", na=".", quote=FALSE)


Rule 1.15
data(trees)
write.csv(trees, file = "mydata.csv")


Rule 1.16


Rule 1.17
library(xlsReadWrite)
data(trees)
write.xls(trees, "mydata.xls") 


Rule 1.18
library(SASxport)
data(trees)
write.xport(trees, file="datafromR.xpt")


Rule 1.19
library(foreign) 
data(trees)
write.foreign(trees, datafile="mydata.dat",
              codefile="mydata.sps", package="SPSS")


Rule 1.20
library(foreign) 
data(trees)
write.dta(trees, file="mydata.dta")


Rule 1.21
library(XML)
data(trees)
mydata <- trees             # Use cherry trees data as example
doc <- newXMLDoc()          # Start empty XML document tree
# Start by adding a document tag at the root of the XML file
root <- newXMLNode("document", doc=doc) 
invisible(                             # Make output invisible
lapply(1:nrow(mydata),                 # Iterate over all rows
       function(rowi) {
         r <- newXMLNode("row", parent=root)   # Create row tag 
         for(var in names(mydata)) {   # Iterate over variables
                                       # Add each observation
           newXMLNode(var, mydata[rowi, var], parent = r) 
       }}))
# Now save the XML document tree as file mydata.xml
saveXML(doc, file="mydata.xml")


Rule 1.22

Chapter 2: Data handling

vec1 <- 4                     # Numeric vector of length 1
vec1
vec2 <- c(1, 2, 3.4, 5.6, 7)  # Numeric vector of length 5
vec2
# Create a character vector
vec3 <- c("Spare", "a talent", "for an", "old", "ex-leper")
vec3
# And a vector of logical values
vec4 <- c(TRUE, TRUE, FALSE, TRUE, FALSE)
vec4
f <- factor(vec3)             # Make factor based on vec 3
f
x <- 1 + 2i                   # Enter a complex number
x
sqrt(-1)                      # R doesn't return a complex number here
sqrt(-1 + 0i)                 # but it does for complex input
m <- matrix(1:6, ncol=2)      # Create a matrix
m
list(vec2, comp=x, m)         # Combine 3 different objects
data.frame(vec2, f, vec4)     # Make a data frame


Rule 2.1
5/2 + 2*(5.1 - 2.3)  # Add, subtract, multiply and divide
2**8                 # 2 to the power 8
1.61^5               # 1.61 to the power 5
10 %% 3              # 10 modulus 3 has remainder 1
10 %/% 3             # integer division
abs(-3.1)            # absolute value of -3.1
sqrt(5)              # square root of 5
ceiling(4.3)         # smallest integer larger than 4.3
floor(4.3)           # largest integer smaller than 4.3
trunc(4.3)           # remove decimals
round(4.5)
round(4.51)
round(4.51, digits=1)
# Angles for trigonometric functions use radians - not degrees
cos(pi/2)            # cosine of pi/2. 
sin(pi/4)            # sine of pi/4
tan(pi/6)            # tangent of (pi/6)
log(5)               # natural logarithm of 5
log(5, base=2)       # binary logarithm of 5
log10(5)             # common logarithm of 5
exp(log(5) + log(3)) # exponential function
# Create two matrices
x <- matrix(1:6,ncol=3)  
y <- matrix(c(1, 1, 0, 0, 0, 1), ncol=2)
x
y
x %*% y      # Matrix multiplication
y %*% x      # Matrix multiplication


Rule 2.2
x <- c(6, 8, 1:4)
x
mean(x)         # mean of x
median(x)       # median 
sd(x)           # standard deviation
IQR(x)          # inter-quartile range
y <- (1:6)**2   # New vector
y
cor(x,y)        # Pearson correlation
cor(x, y, method="spearman") # Spearman correlation
cor(x, y, method="kendall")  # Kendall correlation
length(x)       # length of x
sum(x)          # sum of elements in x
cumsum(x)       # cumulative sum of all elements of x
sort(x)         # sort the vector x
order(x)        # order the elements of x in ascending order
min(x)          # minimum value
max(x)          # maximum value
quantile(x)     # get quartiles
quantile(x, probs=c(.15, .25, .99))  # specific quantiles
x[c(2,4)] <- NA # redefine x
x
is.na(x)        # identify missing elements
df <- data.frame(x=c(1, 2, 3, 4), y=c(2, 1, 3, 2))
df
nrow(df)        # number of rows in matrix/data frame
ncol(df)        # number of columns in matrix/data frame
x                     # vector with missing values
sum(x)                # sum returns missing
sum(x, na.rm=TRUE)    # unless we remove the NA's
mean(x)               # mean also returns missing
mean(x, na.rm=TRUE)   # unless we remove the NA's
order(x)              # order places NA's last
sort(x)               # sort removes NA altogether
sort(x, na.last=TRUE) # unless we want to keep them
sort(x, na.last=TRUE, decreasing=TRUE)


Rule 2.3
today <- Sys.Date()       # Current date
today
date1 <-  as.Date("1971-08-20")
date2 <-  as.Date("2011-04-01")
date2 - date1             # Difference between dates
# Read dates in the format m/d/y
date3 <- c("02/28/00", "12/20/01")
as.Date(date3)            # error: not in a standard unambiguous format
as.Date(date3, "%m/%d/%y")
format(today, "%d %B %Y") # Print current date w text for month
format(today, "%a %d")


Rule 2.4
x <- c("Suki", "walk", "to", "the", "well", NA)
toupper(x)             # convert to upper case
tolower(x)             # convert to lower case
nchar(x)               # no. of characters in each element of x
grep("a", x)           # elements that contain letter "a"
grep("o", x)           # elements that contain letter "o"
substr(x, 2, 3)        # extract 2nd to 3rd characters
substr(x, 2, 3) <- "X" # replace 2nd char with X
x
substr(x, 2, 3) <- "XXXX"
x
paste(x, "y")          # paste "y" to end of each element
paste(x, "y", sep="-")
paste(x, "y", sep="-", collapse=" ")


Rule 2.5
x <- seq(0, pi/2, .2)
x
y <- sin(2*x)
y
x[which.max(y)]


Rule 2.6
x <- c(1, 2, 3, 1, NA, 4)
y <- c(1, 2, 5, NA)
x %in% y
y %in% x
! x %in% y   # Invert result: find objects NOT present
z <- matrix(1:6, ncol=2)
z
x %in% z
list(1, 2, 3) %in% list(a=c(1,2), b=3)
data.frame(a=c(1,2), b=c(3,4)) %in% data.frame(a=c(2,1), 
                                               d=c(1,2))


Rule 2.7
m <- matrix(1:6, ncol=2)     # Create matrix
m
t(m)                         # Transpose
mpdat <- data.frame(source=c("KU", "Spain", "Secret", "Attic"),
                     rats=c(97, 119, 210, 358))
mpdat
t(mpdat)
t(mpdat[,2])


Rule 2.8
# Start by creating a data frame with missing observation
id <- factor(rep(c("A", "B", "C"),4))
time <- rep(1:4, times=3)
x <- c(1, 2, 3, 4, 5, NA, NA, NA, NA, NA, NA, 6)
mydata <- data.frame(id, time, x)
mydata                  # Data frame
o <- order(mydata$id, mydata$time)
newdata <- mydata[o,]   # Ordered data frame
newdata
library(zoo)
# Just using na.locf will carry observations forward 
# across individuals as the following line show
na.locf(newdata$x)      # Not working correctly
# Create function that uses na.locf on the correct variable: x
result <- by(newdata, newdata$id, 
             function(df) { na.locf(df$x, na.rm=FALSE)})
result
newdata$newx <- unlist(result)  # unlist and add to data frame
newdata


Rule 2.9
x <- c("1,2", "1,5", "1,8", "1.9", "2")   # Example data
as.numeric(sub(",", ".", x))  # Substitute , with . and convert
f <- factor(x)                # It also works on factors
as.numeric(sub(",", ".", f))  # Substitute , with . and convert
as.numeric(x)       # Does not work - need to substitute first


Rule 2.10
data(ToothGrowth)
subset(ToothGrowth, dose < 2)
subset(ToothGrowth, dose < 2 & supp == "VC")
subset(ToothGrowth, dose < 2 & supp == "VC", 
       select=c("len", "supp"))


Rule 2.11
data(airquality)
head(airquality)
complete <- complete.cases(airquality)
complete
ccdata <- airquality[complete,]
head(ccdata)
complete2 <- complete.cases(airquality$Ozone, airquality$Temp)
oztemp <- airquality[complete2,]
head(oztemp)


Rule 2.12
mydat <- data.frame(x=1:5, y=factor(c("A", "B", "A", "B", NA)))
mydat
mydat$y <- NULL   # Remove the y variable from mydat
mydat


Rule 2.13
df1 <- data.frame(id=1:3, x=11:13, y=factor(c("A", "B", "C")))
df1
df2 <- data.frame(id=6:10, y=factor(1:5), x=1:5)
df2
combined <- rbind(df1, df2)
combined
df1 <- data.frame(id=1:3, x=11:13, y=factor(c("A", "B", "C")), 
                  z=1:3)
df2 <- data.frame(id=6:10, y=factor(1:5), x=1:5)
df1$z <- NULL   # Remove the z variable from data frame df1
rbind(df1,df2)  # Now we can join the data frames
df1 <- data.frame(id=1:3, x=11:13, y=factor(c("A", "B", "C")), 
                  z=1:3)
df2 <- data.frame(id=6:10, y=factor(1:5), x=1:5)
df2$z <- NA     # Add a vector z of NAs to data frame df2
rbind(df1,df2)  # Now we can join the data frames


Rule 2.14
# Two data frames sharing only the key variable id; merge() matches
# rows on it like a database join.
# (Fixed: the first data.frame() call was left unterminated by a
# trailing comma, which swallowed the dfy assignment.)
dfx <- data.frame(id=factor(1:5), var1=11:15)
dfy <- data.frame(id=factor(6:1), var2=15:10)
merge(dfx, dfy, by="id")               # Inner join: ids in both
merge(dfx, dfy, by="id", all.y=TRUE)   # Keep every row of dfy


Rule 2.15
df <- data.frame(x=c(1, 2, 3, 4, 5), y=c(3, NA, 6, 7, 8))
df
newdf <- stack(df)
newdf
t.test(df$x, df$y, var.equal=TRUE)    # T-test for two samples
summary(lm(values ~ ind, data=newdf))    # One-way anova


Rule 2.16
indata <- read.table("widedata.txt", header=TRUE)
indata
long <- reshape(indata,
                varying=list(c("y1", "y2", "y3", "y4"), 
                             c("t1", "t2", "t3", "t4")),
                v.names=c("response", "day"),
                direction="long")
long
reshape(long, idvar="person", timevar="time",
        v.names=c("day", "response"), 
        direction="wide")
# Reshape but specify names of variables in wide format
reshape(long, idvar="person", timevar="time",
        v.names=c("day", "response"), 
        varying=list(c("y1", "y2", "y3", "y4"), 
                     c("t1", "t2", "t3", "t4")),
        direction="wide")


Rule 2.17
x <- rbinom(20, size=4, p=.5)
x
table(x)
sex <- sample(c("Male", "Female"), 20, replace=TRUE)
sex
table(sex, x)
treatment <- rep(c("Placebo", "Active"), 10)
treatment
table(sex, x, treatment)
data(Titanic)
ftable(Titanic)


Rule 2.18
# Only look at males to keep output short
maletable <- as.table(HairEyeColor[,,1])
maletable
freq <- as.data.frame(maletable)
freq
library(epitools)
fulldata <- expand.table(HairEyeColor)
fulldata


Rule 2.19
df <- data.frame(a=rep(1,5), b=rep(2,5), c=rep(3,5))
df
vector <- as.vector(as.matrix(df))  # Convert to vector
vector
df <- data.frame(a=rep(1,5), 
                 b=factor(c("A", "A", "A", "B", "B")))
df        # Show data frame with both numeric and factor
as.vector(as.matrix(df))   # Result is vector of strings


Rule 2.20
f <- factor(c(1:4, "A", 8, NA, "B"))
f
vect <- as.numeric(levels(f))[f]
vect
as.numeric(f)                   # Not correct


Rule 2.21
f <- factor(c(1, 2, 3, 2, 3, 1, 3, 1, 2, 1))
f
f[c(1,2)] <- "new level"   # Not working
f
# Define the new factor level first
f <- factor(f, levels=c(levels(f), "new level"))
f[c(1,2)] <- "new level"   # Can now assign new levels
f


Rule 2.22
f <- factor(c(1, 1, 2, 3, 2, 1, "A", 3, 3, 3, "A"))
f
# Define the new factor levels. We wish to combine
# levels 1 and 3 so assign appropriate labels for 
# each of the original levels of the factor f
new.levels <- c("13", "2", "13", "A")
levels(f) <- new.levels
f


Rule 2.23
f <- factor(c(1, 2, 3, 2, 3, 2, 1), levels=c(0, 1, 2, 3))
f
ff <- droplevels(f)   # Drop unused levels from f
ff
# Create data frame with two identical factors
df <- data.frame(f1=f, f2=f)
# Drop levels from all factors in df except variable 2
newdf <- droplevels(df, except=2)
newdf$f1
newdf$f2


Rule 2.24
x <- (1:15)**2
x
cut(x, 3)  # Cut the range of x into 3 interval of same length
# Cut the range into 3 intervals of roughly the same size
cut(x, quantile(x, probs=seq(0, 1, length=4)), 
    include.lowest=TRUE)


Rule 2.25
x <- c(2, 4, 1, 5, 3, 4)
x
sort(x)
sort(x, decreasing=TRUE)
y <- c("b", "a", "A", "b", "b", "a")
y
sort(y)
order(x)
df <- data.frame(x,y)
df
df[order(df$y),]
df[order(df$y, df$x),]


Rule 2.26
data(airquality)
names(airquality)
# New variable
celsius <- (airquality$Temp-32)/1.8
# Store the variable inside the data frame
airquality$Celsius <- (airquality$Temp-32)/1.8
head(airquality$Temp)
head(airquality$Celsius)
newdata <- transform(airquality, logOzone = log(Ozone))
head(newdata)


Rule 2.27
m <- matrix(c(NA, 2, 3, 4, 5, 6), ncol=2)
m
apply(m, 1, sum)           # Compute the sum for each row
data(airquality)
sapply(airquality, mean)   # Compute the mean of each variable
sapply(airquality, mean, na.rm=TRUE, trim=.1)
by(airquality, airquality$Month, mean)


Rule 2.28
library(MASS)
data(trees)
result <- boxcox(Volume ~ log(Height) + log(Girth), data=trees,
                 lambda=seq(-0.5, 0.6, length=13))
# Find the lambda value with highest profile log-likelihood
result$x[which.max(result$y)] 
y <- rnorm(100) + 5       # Generate data
y2 <- y**2                # Square the results
result <- boxcox(y2 ~ 1, plotit=FALSE)
result$x[which.max(result$y)] 


Rule 2.29
# Area under the points (x, y) by the trapezoidal rule: each
# interval's width times the average of its two y-values.
# Assumes x is given in increasing order (see auc() for a version
# that tolerates unordered input).
trapezoid <- function(x, y) {
  widths <- diff(x)
  avg.heights <- (y[-1] + y[-length(y)]) / 2
  sum(widths * avg.heights)
}
x <- 1:4
y <- c(0, 1, 1, 5)
trapezoid(x, y)
x2 <- c(3, 4, 2, 1)  # Reorder the same data
y2 <- c(1, 5, 1, 0)
trapezoid(x2, y2)    # We get wrong result
# Area under the curve: numerically integrate a linear interpolation
# of the points (x, y) over [from, to]. Extra arguments (rule, yleft,
# yright, ...) are passed on to approxfun(), e.g. to allow
# extrapolation outside the range of x.
auc <- function(x, y, from=min(x), to=max(x), ...) {
  interp <- approxfun(x, y, ...)
  integrate(interp, from, to)$value
}
auc(x, y)
auc(x2, y2)                           # Reordering is fine
auc(x, y, from=0)                     # AUC from 0 to max(x)
auc(x, y, from=0, rule=2)             # Allow extrapolation
auc(x, y, from=0, rule=2, yleft=0)    # Use value 0 to the left
auc(x, y, from=0, rule=2, yleft=.5)   # Use 1/2 to the left
data(ChickWeight)
result <- by(ChickWeight, ChickWeight$Chick, 
             function(df) { auc(df$Time, df$weight) } )
head(result)
# Make sure that each area are based on the same interval
# from 0 to 21 days. Carry first and last observation
# backward and forward, respectively
result2 <- by(ChickWeight, ChickWeight$Chick, function(df) { 
              auc(df$Time, df$weight, from=0, to=21, rule=2)})
head(result2)

Chapter 3: Statistical analyses

data(ChickWeight)
attach(ChickWeight)
lm(weight ~ Time)          # Linear regression
lm(weight ~ Time - 1)      # Linear regression through origin
lm(weight ~ Diet)          # One-way ANOVA
lm(weight ~ Diet - 1)      # One-way ANOVA without ref. group
lm(weight ~ Diet + Chick)  # Two-way additive ANOVA
# Analysis of covariance, common slope, one intercept per diet
lm(weight ~ Time + Diet)   
lm(weight ~ Diet*Chick)    # Two-way ANOVA with interaction
lm(weight ~ Diet + Chick + Diet:Chick) # Same as above
# Analysis of covariance, one slope and intercept for each diet
lm(weight ~ Time*Diet)        
lm(weight ~ Diet/Chick)                # Nested model
lm(weight ~ Diet + Diet:Chick)         # Same as above
lm(weight ~ Diet*Chick - Chick)        # Same as above
data(pressure)
lm(log(pressure) ~ I(temperature*9/5+32) + 
                   I((temperature*9/5+32)^2), data=pressure)
f1 <- factor(c("A", "B", "C", "B", "A", "C"))
f2 <- factor(c(1, 2, 1, 2, 1, 2))
interaction(f1,f2)
interaction(f1,f2, drop=TRUE) # Drop unused factor levels
lm(weight ~ offset(Time))    # Fix the slope of time at 1
newy <- weight - Time        # Subtract the effect of time 
lm(newy ~ 1)                 # Same result as above
lm(weight ~ offset(2.7*Time))   # Fix slope at 2.7


Rule 3.1
library(pastecs)
options(scipen=100, digits=2)   # Prevent scientific notation
data(ToothGrowth)
stat.desc(ToothGrowth)
by(ToothGrowth, ToothGrowth$supp, stat.desc, basic=FALSE)
stat.desc(ToothGrowth, basic=FALSE, norm=TRUE)


Rule 3.2
data(trees)
result <- lm(Volume ~ Height, data=trees)
result
summary(result)
plot(trees$Height, trees$Volume,     # Make scatter plot
     xlab="Height", ylab="Volume")
abline(result)                       # Add fitted line


Rule 3.3
data(trees)
result <- lm(Volume ~ Height + Girth, data=trees)
result
summary(result)


Rule 3.4
year <- seq(1981, 1999)
deaths <- c(339, 1201, 3153, 6368, 12044, 19404, 29105, 
            36126, 43499, 49546, 60573, 79657, 79879, 
            73086, 69984, 61124, 49379, 43225, 41356)
newyear <- year - 1980
model <- lm(deaths ~ newyear + I(newyear^2) + I(newyear^3))
summary(model)
# Plot the data and add the fitted cubic line
plot(year, deaths)   
lines(seq(1981, 1999, .1), 
      predict(model, data.frame(newyear=seq(1, 19, .1))))


Rule 3.5
data(OrchardSprays)
head(OrchardSprays)
model <- lm(decrease ~ treatment, data=OrchardSprays)
model
summary(model)
drop1(model, test="F")


Rule 3.6
library(isdals)
data(fev)
# Convert variables to factors and get meaningful labels
fev$Gender <- factor(fev$Gender, labels=c("Female", "Male"))
fev$Smoke <- factor(fev$Smoke, labels=c("No", "Yes"))
attach(fev)
interaction.plot(Gender, Smoke, FEV)
model <- lm(FEV ~ Gender + Smoke + Gender*Smoke, data=fev)
model
drop1(model, test="F")
model2 <- lm(FEV ~ Gender + Smoke, data=fev)
drop1(model2, test="F")
summary(model2)


Rule 3.7
library(isdals)
data(fev)
# Convert variables to factors and get meaningful labels
fev$Gender <- factor(fev$Gender, labels=c("Female", "Male"))
fev$Smoke <- factor(fev$Smoke, labels=c("No", "Yes"))
summary(fev)
# Fit the initial model. Interactions like Smoke*Age 
# inherently include the main effects of Smoke and Age
model <- lm(FEV ~ Ht + I(Ht^2) + Smoke*Gender + Smoke*Age, 
            data=fev)
drop1(model, test="F")
model2 <- lm(FEV ~ Ht + I(Ht^2) + Gender + Smoke*Age,
             data=fev)
drop1(model2, test="F")
# Remove the insignificant interaction and refit
model3 <- lm(FEV ~ Ht + I(Ht^2) + Gender + Smoke + Age,
             data=fev)
drop1(model3, test="F")
summary(model3)


Rule 3.8
library(MASS)
data(birthwt)
model <- glm(low ~ factor(race) + smoke, family=binomial, 
             data=birthwt)
drop1(model, test="Chisq")
summary(model)
exp(1.1160 +  c(-1,1)*1.96*0.3692)    # 95% CI for smoking
# Probit link; the quasibinomial family additionally estimates a
# dispersion parameter. (Fixed the duplicated "model2 <- model2 <-"
# assignment, which was harmless but clearly a typo.)
model2 <- glm(low ~ factor(race) + smoke,
              family=quasibinomial(link="probit"),
              data=birthwt)
summary(model2)
exp(-1.8405 + 1.1160) / (1 + exp(-1.8405 + 1.1160))
pnorm(-1.1292 + 0.6842)


Rule 3.9
library(nnet)
library(isdals)
data(alligator)
head(alligator)
model <- multinom(food ~ length, data=alligator) 
null <- multinom(food ~ 1, data=alligator) 
anova(null, model)               # Compare the two models
summary(model)
len <- seq(0.3, 4, .1)
matplot(len, predict(model, newdata=data.frame(length=len),
                     type="probs"), type="l", 
        xlab="Alligator length", ylab="Probability")


Rule 3.10
library(kulife)
data(bees)
beedata <- subset(bees, Time=="july1") # Look at one day only
head(beedata)
model <- glm(Number ~ Locality + Type*Color,
             family=poisson, data=beedata)
drop1(model, test="Chisq")
summary(model)
exp(2.5649 + 1.8718 - 2.1342)
model2 <- glm(Number ~ Locality + Type*Color,
              family=quasipoisson, data=beedata)
drop1(model2, test="Chisq")
summary(model2)
exp(-0.4745 + c(-1,1)*1.96*0.2881) # 95% CI Kragevig/Havreholm


Rule 3.11
library(MASS)
data(survey)
resp <- ordered(survey$Exer, levels=c("None", "Some", "Freq"))
head(resp)
model <- polr(resp ~ Sex + Age + Smoke, data=survey)
drop1(model, test="Chisq")
model2 <- polr(resp ~ Sex + Smoke, data=survey)
drop1(model2, test="Chisq")
model3 <- polr(resp ~ Sex , data=survey)
drop1(model3, test="Chisq")
summary(model3)
exp(-1.9935) / (1 + exp(-1.9935))
exp(0.2708 - 0.4176) / (1 + exp(0.2708 - 0.4176)) - 
exp(-1.9935 - 0.4176) / (1 + exp(-1.9935 - 0.4176))
head(predict(model3, type="prob"))
exp(0.4176 + qnorm(.975)*c(-1,1)*0.2517)
library(nnet)
multi <- multinom(Exer ~ Sex + Smoke + Age, data=survey)
1-pchisq(deviance(model) - deviance(multi), 
         df=multi$edf - model$edf)


Rule 3.12
data(ChickWeight)
library(lme4)
head(ChickWeight)
model <- lmer(log(weight) ~ Time + Diet + Time*Diet + (1|Chick), 
              data=ChickWeight)
summary(model)
model <- lmer(log(weight) ~ Time + Diet + Time*Diet + (1|Chick),
              data=ChickWeight, REML=FALSE)
noint <- lmer(log(weight) ~ Time + Diet + (1|Chick), 
              data=ChickWeight, REML=FALSE)
anova(model, noint)


Rule 3.13
library(nlme)
data(ChickWeight)
head(ChickWeight)
model <- lme(log(weight) ~ Diet + Time + Time*Diet, 
             random= ~ 1|Chick, 
             correlation=corGaus(form= ~ Time|Chick, 
                                 nugget=TRUE), 
             data=ChickWeight)
model
model2 <- lme(log(weight) ~ Diet + Time + Time*Diet,
              random= ~ 1|Chick, 
              correlation=corGaus(form= ~Time|Chick, 
                                  nugget=TRUE), 
              method="ML", data=ChickWeight)
model3 <- lme(log(weight) ~ Diet + Time, random= ~ 1|Chick,                
              correlation=corGaus(form= ~Time|Chick, 
                                  nugget=TRUE), 
              method="ML", data=ChickWeight)
anova(model3, model2)
summary(model)
model4 <- lme(log(weight) ~ Diet + Time + Time*Diet, 
              random= ~ 1|Chick, 
              correlation=corGaus(form= ~ Time|Chick), 
              data=ChickWeight)
anova(model4, model)
model5 <- lme(log(weight) ~ Diet + Time + Time*Diet, 
              random= ~ 1|Chick, 
              correlation=corExp(form= ~ Time|Chick, 
                                 nugget=TRUE), 
              data=ChickWeight)
AIC(model)
AIC(model5)
plot(Variogram(model))
plot(Variogram(model, resType="n"))


Rule 3.14
library(lme4)
data(cbpp)
model <- glmer(cbind(incidence, size-incidence) ~ 
               period + (1|herd), family=binomial, data=cbpp)
summary(model)
model2 <- glmer(cbind(incidence, size-incidence) ~ 1 +
                (1|herd), family=binomial, data=cbpp)
anova(model2, model)
exp(-0.9923)
cbpp$obs <- 1:nrow(cbpp)
model3 <- glmer(cbind(incidence, size - incidence) ~ period +
                (1|herd) + (1|obs), family=binomial, data=cbpp)
summary(model3)


Rule 3.15
library(gee)
library(lme4)
data(cbpp)
cbpp <- cbpp[order(cbpp$herd),]  # Order data according to herd
model <- gee(cbind(incidence, size-incidence) ~ period, 
              corstr="exchangeable", family=binomial, 
              id=herd, data=cbpp)
summary(model)
prs <- c(2,3,4)       # Choose parameters to test equal to zero
VI <- solve(model$robust.variance[prs,prs])   # Invert variance
res <- as.numeric(coef(model)[prs] %*% VI %*% coef(model)[prs])
1-pchisq(res, df=length(prs))
model2 <- gee(cbind(incidence, size-incidence) ~ period, 
              corstr="unstructured", family=binomial, 
              id=herd, data=cbpp)
summary(model2)$coefficients
model2$working.correlation


Rule 3.16
data(AirPassengers)
model <- stl(log(AirPassengers), s.window="periodic")
plot(model)
summary(model)
acf(model$time.series[,3])  # Plot autocorrelation of residuals


Rule 3.17
library(kulife)
data(greenland)
temp <- ts(greenland$airtemp, start=1960)
temp
acf(temp, na.action=na.pass)
pacf(temp, na.action=na.pass)
model <- arima(temp, order=c(1,0,1))
model
forecast <- predict(model, 15)
forecast
ts.plot(temp, forecast$pred, lty=1:2)
lines(forecast$pred + 1.96*forecast$se,  lty=3)
lines(forecast$pred - 1.96*forecast$se,  lty=3)


Rule 3.18
library(MASS)
data(cats)
head(cats)
t.test(Bwt ~ Sex, data=cats)
t.test(Bwt ~ Sex, data=cats, var.equal=TRUE)
x <- c(1.3, 5.5, -2.1, 0.9, -0.4, 1.1)
t.test(x, mu=0)


Rule 3.19
library(kulife)
data(qpcr)
# Use data from just one of the 14 runs
run1 <- subset(qpcr, transcript==1 & line=="wt")
model <- nls(flour ~ fmax/(1+exp(-(cycle-c)/b))+fb, 
             start=list(c=25, b=1, fmax=100, fb=0), 
             data=run1)
model
summary(model)
# Plot the original data
plot(run1$cycle, run1$flour, 
     xlab="Cycle", ylab="Fluorescence")
lines(run1$cycle, predict(model)) # Add the fitted line
model2 <- nls(flour ~ fmax/(1+exp(-(cycle-c)/b))+fb, 
              start=list(c=25, b=1, fmax=100, fb=0),
              lower=c(-Inf, .9, -Inf, -Inf), algorithm="port",
              control=nls.control(maxiter=100, tol=1e-08),
              data=run1)
model2


Rule 3.20
library(AER)
data(Affairs)
model <- tobit(affairs ~ age + gender + children, data=Affairs)
summary(model)
drop1(model, test="Chisq")


Rule 3.21
library(nortest)
x <- rnorm(100, mean=5, sd=2)         # Normal distribution
y <- (rnorm(100, mean=5, sd=1.5))**2  # Right-skewed distrib.
shapiro.test(x)
ad.test(x)
cvm.test(y)
lillie.test(y)


Rule 3.22
data(ToothGrowth)
bartlett.test(len ~ factor(dose), data=ToothGrowth)
library(car)
leveneTest(len ~ factor(dose), data=ToothGrowth)
fligner.test(len ~ factor(dose), data=ToothGrowth)
bartlett.test(len ~ interaction(supp, dose), data=ToothGrowth)


Rule 3.23
library(gof)
data(trees)
attach(trees)
model <- lm(Volume ~ Girth + Height)  # Multiple regression
cumres(model)
lVol <- log(Volume)      # Log transform variables
lGir <- log(Girth)
lHei <- log(Height)
model2 <- lm(lVol ~ lGir + lHei)    # New model
cumres(model2)
cr2 <- cumres(model2)  # Save the cumulated residuals 
par(mfrow=c(2,2))      # Allow 2x2 plots in the same Figure
plot(cr2)


Rule 3.24
m <- matrix(c(23, 20, 12, 5, 6, 9), ncol=3)   # Input data
m
chisq.test(m)                          # Chi-square test
fisher.test(m)                         # Fisher's exact test
m2 <- matrix(c(35, 25, 6, 9), ncol=2)   # Input data
chisq.test(m2)
chisq.test(m2, correct=FALSE)       # No continuity correction
fisher.test(m2, alternative="greater") # One-sided alternative


Rule 3.25
library(MASS)
data(survey)
mydata <- as.data.frame(xtabs(~ Sex + Clap + Exer, 
                              data=survey))
head(mydata)
full <- glm(Freq ~ Sex*Clap*Exer, family=poisson, data=mydata)
indep <- glm(Freq ~ Sex+Clap+Exer, family=poisson, data=mydata)
anova(indep, full, test="Chisq")
block1 <- glm(Freq ~ Sex + Clap*Exer, family=poisson, 
              data=mydata)
anova(block1, full, test="Chisq")
cond1 <- glm(Freq ~ Sex*Exer+Clap*Exer, family=poisson, 
             data=mydata)
anova(cond1, full, test="Chisq")


Rule 3.26
hplc <- c(139, 120, 143, 496, 149, 52, 184, 190, 32, 312, 19)
gcms <- c(151, 93, 145, 443, 153, 58, 239, 256, 69, 321, 8)
average <- (hplc + gcms)/2           # Compute average
dif <- (hplc - gcms)                 # and difference
plot(average, dif, ylim=c(-80,80),   # Plot diff vs average
     xlab="Average", ylab="Difference")
# Calculate limit for 95% agreement
limit <- qnorm(.975)                 
# Add average bias and 95% limits of agreement
abline(h=mean(dif)+c(-limit,0,limit)*sd(dif),lty=c(3,1,3))
# Add line showing bias as function of magnitude
abline(lm(dif~average), lty=2)


Rule 3.27
library(MethComp)
library(kulife)
data(rainman)
head(rainman)
# Use column 1 to define items, and values from columns 3 to 5
# Meth automatically use the original column names for the 
# methods and creates the replicate vector
mydata <- Meth(item=1, y=3:5, data=rainman)
result <- BA.est(mydata, Transform="log")  # Use log transform
result
plot(mydata)
plot(result, wh.cmp=c(1,2), points=TRUE, pl.type="BA")
library(lme4)
mi <- interaction(mydata$meth, mydata$item)
ri <- interaction(mydata$repl, mydata$item)
lmer(log(y) ~ item + (1|meth) + (1|mi) + (1|ri), data=mydata)
c(-1,1)*1.96*sqrt(2*(0.0000 + 0.0020532 + 0.0245641))


Rule 3.28
library(irr)
data(anxiety)
head(anxiety)
kappa2(anxiety[,1:2]) # Compute kappa for raters 1,2
kappa2(anxiety[,1:2], weight="squared")
kappam.fleiss(anxiety)


Rule 3.29
data(airquality)
attach(airquality)
day <- as.Date(paste("1973", Month, Day, sep="-"))
dayno <- julian(day, origin = as.Date("1973-01-01"))
head(dayno)
model <- lm(cbind(Ozone, Temp, Solar.R) ~ dayno + I(dayno**2))
model
anova(model)


Rule 3.30
library(cluster)
data(agriculture)
fit2 <- kmeans(agriculture, centers=2)   # 2 cluster solution
fit2
# Calculate the total within sum of squares for different
# numbers of clusters (here 2-6) on the raw data.
# (Fixed: the first sapply() call was truncated — its body and
# closing braces were missing, which made the snippet unparsable.)
sapply(2:6, function(i) {
  sum(kmeans(agriculture, centers=i)$withinss)})
# Repeat on standardized variables so both columns get equal weight
scale.agriculture <- scale(agriculture)
fit2s <- kmeans(scale.agriculture, centers=2)
fit2s
sapply(2:6, function(i) {
  sum(kmeans(scale.agriculture, centers=i)$withinss)})
distances <- dist(agriculture)
hierclust <- hclust(distances)
hierclust
hierclust.single <- hclust(distances, method="single")
plot(hierclust)
plot(hierclust.single)
cutree(hierclust, k=2)         # Cut tree to get 2 groups
cutree(hierclust.single, h=5)  # Cut at height 5


Rule 3.31
# Principal component analysis of the biopsy predictors.
library(MASS)
data(biopsy)
names(biopsy)
predictors <- biopsy[complete.cases(biopsy),2:10]
fit <- prcomp(predictors, scale=TRUE)   # PCA on standardized variables
summary(fit)
plot(fit)     # Scree plot of variances
biplot(fit)
fit
head(predict(fit))   # Scores on the principal components
# Formula interface (handles missing values via na.action)
# NOTE(review): columns 2:9 here vs 2:10 above - confirm intended
predictors <- biopsy[,2:9]
fit <- prcomp(~ ., data=predictors, scale=TRUE)


Rule 3.32
# Principal component regression: logistic model on the first 4 PCs.
library(MASS)
data(biopsy)
names(biopsy)
predictors <- biopsy[complete.cases(biopsy),2:10]
fit <- prcomp(predictors, scale=TRUE)
summary(fit)
pc <- predict(fit)[,1:4]  # Select first 4 principal comp.
head(pc)
y <- biopsy$class[complete.cases(biopsy)]  # Get response
model <- glm(y ~ pc, family=binomial)      # Fit the pcr
summary(model)


Rule 3.33
# Linear discriminant analysis with training/test validation.
library(MASS)
data(biopsy)
fit <- lda(class ~ V1 + V2 + V3, data=biopsy)
fit
plot(fit, col="lightgray")
result <- table(biopsy$class, predict(fit)$class)
result
sum(diag(result)) / sum(result)   # Proportion correctly classified
# Split the data up into random training and test samples
train <- sample(1:nrow(biopsy), ceiling(nrow(biopsy)/3))
testdata <- biopsy[-train,]
trainfit <- lda(class ~ V1 + V2 + V3, data=biopsy, subset=train)
trainfit
result2 <- table(testdata$class, 
                 predict(trainfit, testdata)$class)
result2
sum(diag(result2)) / sum(result2)   # Accuracy on the test sample


Rule 3.34
# Partial least squares regression with leave-one-out validation.
library(pls)
data(gasoline)
gasTrain <- gasoline[1:50,]  # Training dataset with 50 samples
gasTest <- gasoline[51:60,]  # Test dataset with 10 samples
gas1 <- plsr(octane ~ NIR, ncomp=5, 
             data=gasTrain, validation="LOO")
summary(gas1)
plot(gas1, ncomp=3, line=TRUE)      # Predicted vs measured
plot(gas1, "loadings", comps=1:3)   # Loadings for components 1-3
RMSEP(gas1, newdata=gasTest)        # Prediction error on test data


Rule 3.35
# Bootstrap confidence interval for a heritability estimate.
library(boot)
set.seed(1)           # Set seed to reproduce simulations
mydata <- rnorm(150, mean=rep(rnorm(30), 5))
mydata <- matrix(mydata,ncol=5)
head(mydata)
# Statistic function; rows are resampled via 'indices' as boot() requires
heritability <- function(data, indices=1:nrow(data)) {
  r <- ncol(data) 
  df <- stack(as.data.frame(t(data[indices,])))
  result <- anova(lm(values ~ ind, data=df))
  mssire <- result[1,3]     # Mean square between sires
  mswithin <- result[2,3]   # Mean square within sires
  heritab <- 4*(mssire - mswithin) / ((r+1)*mssire - mswithin)
  return(heritab)
}
heritability(mydata)
result <- boot(mydata, heritability, R=10000)
result
plot(result)
boot.ci(result)   # Bootstrap confidence intervals


Rule 3.36
# Cross-validation: cv.glm for GLMs and crossval for a neural net.
library(boot)           # For cv.glm; makes this section self-contained
data(trees)
fit1 <- glm(Volume ~ Girth + Height, data=trees)
cv.err.loo <- cv.glm(trees, fit1)     # Leave-one-out CV
cv.err.loo$delta
cv.err.3 <- cv.glm(trees, fit1, K=3)  # 3 fold cross validation
cv.err.3$delta
# Log-scale model, but measure prediction error on the original scale
fit2 <- glm(log(Volume) ~ log(Girth) + log(Height), data=trees)
mycost <- function(responses, expected) { 
  mean((exp(responses) - exp(expected))**2) }
cv2.err.loo <- cv.glm(trees, fit2, mycost)
cv2.err.loo$delta
library(bootstrap)      # For the crossval function
library(nnet)           # For the nnet function
library(MASS)           # For the survey data frame
data(survey)
# nnet's iteration limit argument is 'maxit'; the original 'maxiter'
# matched no formal argument and was silently absorbed by '...'.
fit <-  nnet(Exer ~ Sex + Smoke + Age, data=survey, 
             size=10, rang=.05, maxit=200, trace=FALSE) 
fit
# Calculate the number of misclassifications
cc <- complete.cases(survey$Exer, survey$Sex, 
                     survey$Smoke, survey$Age)
sum(predict(fit, type="class") != survey$Exer[cc])
index <- 1:nrow(survey) # Work-around to identify obs.
# theta.fit fits the model. It uses the indices passed through  
# x to select the subset of the survey data frame. The response
# y is not really used since we get the correct response with 
# the subset argument to nnet
theta.fit <- function(x,y) {  
  nnet(Exer ~ Sex + Smoke + Age, data=survey, size=10, 
       subset=x, rang=.05, maxit=200, trace=FALSE) }
# theta.predict returns the predicted class. Again, indices for
# the test data are passed through x and selected from survey
theta.predict <- function(fit, x) {
  predict(fit, survey[x,], type="class") }                       
o <- crossval(index, survey$Exer, theta.fit, theta.predict, 
              ngroup=5)
table(survey$Exer, o$cv.fit)             # Cross-validated confusion table
sum(o$cv.fit != survey$Exer, na.rm=TRUE) # Misclassification count


Rule 3.37
# Power and sample-size calculations for t tests, proportions and ANOVA.
power.t.test(delta=2, sd=3.6, power=.8, sig.level=0.05)
 power.t.test(n=20, delta=.70, sd=1, sig.level=0.05, 
              type="one.sample")
power.prop.test(p1=0.43, p2=0.75, power=.8, sig.level=0.05)
groupmeans <- c(4.5, 8, 6.2, 7.0)
power.anova.test(groups = length(groupmeans),
                 between.var=var(groupmeans),
                 within.var=5.8, power=.80, sig.level=0.05) 
# Power curves as a function of sample size ...
n <- seq(20, 50)          # Consider sample sizes from 20 to 50
plot(n, power.t.test(n, delta=.7)$power, type="l", 
     ylim=c(0.55,1), ylab="Power",
     xlab="Number of observations per group")
lines(n, power.t.test(n, delta=.8)$power, lty=2)
lines(n, power.t.test(n, delta=1)$power, lty=3)
# ... and as a function of effect size
delta <- seq(0, 1.6, .1)  # Consider effect sizes from 0 to 1.6
plot(delta, power.t.test(n=25, delta=delta)$power, 
     type="l", ylim=c(0,1), xlab="Effect size", 
     ylab="Power")
lines(delta, power.t.test(n=40, delta=delta)$power, lty=2)
lines(delta, power.t.test(n=60, delta=delta)$power, lty=3)


Rule 3.38
# Multiple testing: per-gene ANOVA p-values with Holm and FDR correction.
library(kulife)
data(superroot2)
geneid <- unique(superroot2$gene)   # Get vector of gene names
# Analyze each gene separately and extract the p-value for the plant
# effect (row 3, column 5 of the anova table). seq_along(geneid)
# replaces the hard-coded 1:21500 so the code adapts to the actual
# number of genes present in the data.
pval <- sapply(seq_along(geneid), function(i) { 
               anova(lm(log(signal) ~ array + color + plant, 
                        data=superroot2, 
                        subset=(gene==geneid[i])))[3,5]})
head(sort(pval), 8)                 # Show smallest p-values
head(p.adjust(sort(pval)), 8)       # Holm correction
head(p.adjust(sort(pval), method="fdr"), 8) # FDR


Rule 3.39
# One-sample and paired Wilcoxon signed rank tests.
x <- rbinom(50, size=1, p=.6)  # Generate random data
x
wilcox.test(x, mu=0.5)         # Test median equal to 0.5
x2 <- x - 0.5               # Deduct 0.5 from all observations
wilcox.test(x2)             # Make the same test with median 0
sample1 <- rbinom(12, size=4, p=0.5)
sample1
sample2 <- sample1 + rbinom(12, size=2, p=0.5) - 1
sample2
wilcox.test(sample1, sample2, paired=TRUE)
# Exact versions from the coin package
library(coin)
mu <- rep(.5, length(x))
wilcoxsign_test(x ~ mu, distribution=exact()) 
wilcoxsign_test(sample1 ~ sample2, distribution=exact()) 


Rule 3.40
# Wilcoxon rank-sum (Mann-Whitney) test for two independent samples.
data(ToothGrowth)
wilcox.test(len ~ supp, data=ToothGrowth)
library(coin)
wilcox_test(len ~ supp, data=ToothGrowth, distribution=exact())


Rule 3.41
# Kruskal-Wallis test with post-hoc multiple comparisons.
data(ToothGrowth)
# Use only data on ascorbic acid
teeth <- subset(ToothGrowth, supp=="VC")
kruskal.test(len ~ factor(dose), data=teeth)
library(pgirmess)
kruskalmc(len ~ factor(dose), data=teeth)   # Pairwise comparisons
library(coin)
kruskal_test(len ~ factor(dose), data=teeth, 
             distribution=approximate())


Rule 3.42
# Friedman test for randomized block designs.
library(isdals)
data(cabbage)
friedman.test(yield ~ method | field, data=cabbage)
library(coin)
friedman_test(yield ~ method | factor(field), 
              data=cabbage, distribution=approximate())


Rule 3.43
# Kaplan-Meier curves and the log-rank test.
library(survival)
data(ovarian)
head(ovarian)
# Fit Kaplan-Meier curves for each treatment
model <- survfit(Surv(futime, fustat) ~ rx, data=ovarian) 
summary(model)
plot(model, lty=2:3, xlab="Days", ylab="Survival") # Plot them
survdiff(Surv(futime, fustat) ~ rx, data=ovarian)  # Log-rank test
library(coin)
# The distribution argument must be the call exact(), not the bare
# function object 'exact' (cf. the wilcoxsign_test examples above).
surv_test(Surv(futime, fustat) ~ factor(rx), 
          data=ovarian, distribution=exact())


Rule 3.44
# Cox proportional hazards regression.
library(survival)
data(ovarian)
ovarian$rx <- factor(ovarian$rx)   # Make sure rx is a factor
head(ovarian)
model <- coxph(Surv(futime, fustat) ~ rx*age, data=ovarian) 
drop1(model, test="Chisq")   # Test the interaction term
model2 <- coxph(Surv(futime, fustat) ~ rx + age, data=ovarian)
summary(model2)
validate <- cox.zph(model)   # Check proportional hazards assumption
validate
# Stratified model: separate baseline hazards for age above/below 60
model4 <- coxph(Surv(futime, fustat) ~ rx + strata(age>60), 
                data=ovarian)
summary(model4)


Rule 3.45
# Cox regression with time-dependent covariates (start/stop coding).
library(survival)
data(heart)
head(jasa1)
model <- coxph(Surv(start, stop, event) ~ transplant + 
               year + year*transplant, data=jasa1)
summary(model)
cox.zph(model)   # Check the proportional hazards assumption
# Stratify on transplant status. The '+' after transplant:year was
# missing in the original (a syntax error) and is restored here.
model2 <- coxph(Surv(start, stop, event) ~ transplant:year +
                year + strata(transplant), data=jasa1)
summary(model2)
plot(survfit(model2), conf.int=TRUE, lty=1:2, 
     xlab="Time (days)", ylab="Survival")

Chapter 4: Graphics

# Basic plotting parameters: symbols, colors, sizes, line types, axes.
x <- 1:5                     # Generate data
y <- x^2                     # Generate data
plot(x, y, pch=3:7)          # Use symbols 3-7 for the points
plot(x, y, pch=c("a", "b"))  # Use symbols a b a b a 
plot(x, y, col="red")        # Each symbol is red
plot(x, y, pch=c("a", "b"),  # Symbols are a b a b a in colors
     col=c("red", "blue", "green")) # red blue green red blue
# Plot circles with a thick red border and blue fill color
plot(x, y, pch=21, lwd=3, col="red", bg="blue")
plot(x, y, cex=seq(1,2,.25)) # Symbols with increasing size
plot(x, y, type="l", lty=2)  # Line with line type 2
plot(x, y, type="l", lwd=3)  # Line with line width 3
par(lwd=3, pch=16)           # Line width 3 and symbol 16
par(cex.lab=2, cex.main=3)   # Large labels and larger title
plot(x, y, xlim=c(0, 10))    # x axis goes from 0 to 10
plot(x, y, log="y")          # Logarithmic y axis
plot(x, y, las=1)            # Rotate numbers on y axis
text(1, 2, "Where's Wally?")  # Add text to existing plot
mtext("I'm out here", side=4) # and text to the right margin


Rule 4.1
# Mathematical annotation (plotmath expressions) in plots.
x <- seq(0, 5, .1)
# Plot sine and square root curve and add x axis label
plot(x, sin(x), type="l", ylim=c(-1, 2.5),
     xlab=expression(paste("Concentration ", mu[i])))
lines(x, sqrt(x), lty=2)
# Add a title containing a Greek letter. The original line was
# truncated mid-call in transcription; the call is closed here.
title(expression(paste("This looks like ", Gamma, "!")))
# Place equations at specific positions
text(4, 1.5, expression(hat(x) == sqrt(x)))
text(1.6, -.5, expression(paste(plain(sin)(x) == 
     sum(frac((-1)^n, paste((2*n+1), plain("!")))*x^(2*n+1), 
     n==0, infinity))))


Rule 4.2
# Specify colors by name, hex string, or RGB components.
colors()
plot(1:10, col="cyan")         # Plot using cyan color
plot(1:10, col="#cc0076")      # Plot using purplish color
rgb(10, 20, 30, maxColorValue=255)
rgb(10, 20, 30, maxColorValue=100)
rgb(10, 20, 30, alpha=50, maxColorValue=100)  # With transparency


Rule 4.3
# Color palettes, including RColorBrewer palettes.
palette()             # Show the current palette
palette(c("red", "#1A334D", "black", rgb(.1, .8, 0)))
palette()
palette("default")    # Use the default palette
library(RColorBrewer)
head(brewer.pal.info, n=10)
brewer.pal(n=5, name="Spectral")       # 5 colors from Spectral
brewer.pal(n=10, name="Accent")        # 10 colors from Accent
# Set colors locally for just one function call
plot(1:8, col=brewer.pal(n=8, name="Reds"), pch=16, cex=4)
palette(brewer.pal(n=8, name="Greens")) # Set global palette
plot(1:8, col=1:8, pch=16, cex=4)       # Plot using colors


Rule 4.4
# Scatter plots and customized scatter plot matrices.
data(airquality)
plot(airquality$Temp, airquality$Ozone,
     xlab="Temperature", ylab="Ozone",
     main="Airquality relationship?")
pairs(airquality)
# Create function to make histogram with density superimposed
panel.hist <- function(x, ...)  {
  # Set user coordinates of plotting region
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(usr[1:2], 0, 1.5))
  par(new=TRUE)                    # Do not start new plot
  hist(x, prob=TRUE, axes=FALSE, xlab="", ylab="", 
       main="", col="lightgray")
  lines(density(x, na.rm=TRUE))    # Add density curve
}
# Create function to compute and print R^2
panel.r2 <- function(x, y, digits=2, cex.cor, ...) {
  # Set user coordinates of plotting region
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- cor(x, y, use="complete.obs")**2  # Compute R^2
  txt <- format(c(r, 0.123456789), digits=digits)[1]
  if(missing(cex.cor)) cex.cor <- 1/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor * r)  # Text size scales with R^2
}
pairs(~ Ozone + Temp + Wind + Solar.R, data=airquality, 
      lower.panel=panel.smooth, upper.panel=panel.r2, 
      diag.panel=panel.hist)


Rule 4.5
# Histograms with custom bins and a density overlay.
data(LakeHuron)
hist(LakeHuron)
# Use the breaks option to specify the number of bins
# regardless of the size of the dataset
# Remove title and make bars gray
hist(LakeHuron, xlab="Level of Lake Huron (in feet)",
     freq=FALSE, breaks=c(574, 577, 578, 581, 585), 
     col="gray", main=NULL)
lines(density(LakeHuron))


Rule 4.6
# Boxplots, grouped and horizontal.
data(ToothGrowth)
teeth <- subset(ToothGrowth, supp=="VC") # Use only some data
boxplot(len ~ factor(dose), data=teeth)
# Horiz. boxplot with whiskers from minimum to maximum value
attach(ToothGrowth)                      # Use all data
boxplot(len, horizontal=TRUE, range=0)


Rule 4.7
# Two-town frequency data: juxtaposed and stacked relative barplots.
m <- matrix(c(23, 20, 12, 5, 6, 9), nrow=2)  # Counts, one row per town
relfrq <- prop.table(m, margin=1)            # Row-wise relative freqs
relfrq
# Side-by-side bars, grouped by category
barplot(relfrq, beside=TRUE,
        legend.text=c("Hadsten", "Hammel"),
        names=c("<0.01", "0.01-0.10", ">0.10"))
# Transpose so each town becomes one stacked horizontal bar
barplot(t(relfrq), names=c("Hadsten", "Hammel"), horiz=TRUE)


Rule 4.8
# Barplots with error bars (gplots::barplot2).
library(gplots)
data(airquality)
# Compute means and standard deviations for each month
mean.values <- by(airquality$Temp, airquality$Month, mean)
sd.values <- by(airquality$Temp, airquality$Month, sd)
# Rough 95% intervals based on the raw standard deviations
barplot2(mean.values,
         xlab="Month", ylab="Temperature (F)",
         plot.ci=TRUE,
         ci.u = mean.values+1.96*sd.values,
         ci.l = mean.values-1.96*sd.values)
# Model-based means and standard errors (cell-means coding)
airquality$Month <- factor(airquality$Month)
model <- lm(Temp ~ Month - 1, data=airquality)
mean.values <- coef(summary(model))[,1]
mean.values
sem <- coef(summary(model))[,2]
scale <- qt(0.975, df=summary(model)$df[2])  # t quantile for 95% CI
barplot2(mean.values,
         xlab="Month", ylab="Temperature (F)",
         plot.ci=TRUE,
         ci.u = mean.values+scale*sem,
         ci.l = mean.values-scale*sem)  # Lower limit: '-scale*sem' was missing


Rule 4.9
# Plot estimates with confidence intervals using plotCI.
library(gplots)
data(OrchardSprays)
fit <- lm(decrease ~ treatment - 1, data=OrchardSprays)
summary(fit)
# uiw is the interval half-width: t quantile times the standard error
plotCI(coef(summary(fit))[,1], 
       uiw=qt(.975, df=56)*coef(summary(fit))[,2],
       ylab="Estimates and 95% confidence intervals", xaxt="n")
axis(1, 1:length(rownames(coef(summary(fit)))), 
     rownames(coef(summary(fit))))


Rule 4.10
# Population pyramid plots (plotrix).
library(plotrix)
men <- c(334700, 357834, 328545, 367201, 411689, 
         359363, 336841, 178422, 72296, 9552, 139, 0)
women <- c(318662, 340357, 320462, 365180, 401521, 
           357405, 345884, 208057, 117843, 27914, 761, 0)
groups <- paste(seq(0, 110, 10), "-", seq(9, 119, 10), sep="")
groups                             # Show group labels
men <- 100*men/sum(men)            # Normalize men
women <- 100*women/sum(women)      # and women
pyramid.plot(men, women, labels=groups,
            lxcol="lightgray", rxcol="lightgray")
library(MASS)
data(survey)               
# Tabulate exercise and smoking status by gender
result <- by(survey, survey$Sex, 
             function(x) {table(x$Smoke, x$Exer)})
result
pyramid.plot(lx=result$Male, rx=result$Female,
             labels=row.names(result$Male),
             gap=10, unit="Number of students",
             top.labels=c("Male","Smoking Frequency","Female"),
             lxcol=c("darkgray", "white", "lightgray"),
             rxcol=c("darkgray", "white", "lightgray"))


Rule 4.11
# Plot several series at once with matplot; pad shorter series with NA.
x <- seq(0, 4*pi, .1)
matplot(x, cbind(sin(x), cos(x), sin(x/2)), type="l")
x1 <- 1:6
y1 <- x1**2
x2 <- seq(1.5, 4.5, 1)
x2
x2fix <- c(x2, NA, NA)   # Pad to the same length as x1
y2 <- sqrt(x2fix)
matplot(cbind(x1, x2fix), cbind(y1, y2))


Rule 4.12
# Contour, filled-contour and image displays of f(x, y) = sin(x * y)
# evaluated on a regular grid.
x <- seq(0, 3, .2)    # Grid points along x
y <- seq(0, 4, .1)    # Grid points along y
z <- outer(x, y, function(a, b) sin(a * b))  # Function values on the grid
contour(x, y, z)      # Default contour levels
# Choose the contour levels explicitly
contour(x, y, z, levels=c(-1, -.3, 0, .1, .2, .3, .8, 1))
filled.contour(x, y, z, color.palette=gray.colors)
image(x, y, z, col=gray.colors(10))


Rule 4.13
# Perspective (3d surface) plots.
x <- seq(0, 3, .2)        # Set grid for x dimension
y <- seq(0, 4, .1)        # Set grid for y dimension
z <- outer(x, y, FUN=function(xx,yy) { sin(xx*yy) })
persp(x, y, z)
persp(x, y, z, theta=40, phi=45, col="lightgray")   # Rotate viewpoint
persp(x, y, z, theta=40, phi=45, col="lightgray", shade=.6)
persp(x, y, z, theta=40, phi=45, col="lightgray", shade=.6,
      ticktype="detailed")


Rule 4.14
# 3d scatter plots with scatterplot3d.
library(scatterplot3d)
data(trees)
attach(trees)
scatterplot3d(Girth, Height, Volume)           # Scatter plot
scatterplot3d(Girth, Height, Volume, type="h", # Vertical lines
              color=grey(31:1/40))
# Order trees according to volume and then 
# plot curve from lowest to highest volume
o <- order(trees$Volume)  
scatterplot3d(Girth[o], Height[o], Volume[o], type="l", 
              scale.y=3, angle=75)
myplot <- scatterplot3d(Girth, Height, Volume, type="h", 
                        angle=60)
model <- lm(Volume ~ Girth + Height)   # Fit a model
myplot$plane3d(model)                  # Add fitted plane
detach(trees)


Rule 4.15
# Heatmaps with optional row/column color annotation.
data(eurodist)
x <- as.matrix(eurodist)
heatmap(x, Rowv=NA, Colv=NA)   # Suppress dendrograms/reordering
colnames(x)
# Add a color bar to the heatmap
countries <- c("Greece", "Spain", "Belgium", "France", 
"France", "Germany", "Denmark", "Switzerland", "Gibraltar", 
"Germany", "Holland", "Portugal", "France", "Spain", "France", 
"Italy", "Germany", "France", "Italy", "Sweden", "Austria")
f.countries <- factor(countries)
group.colors <- heat.colors(nlevels(f.countries))
country.color <- group.colors[as.integer(f.countries)]
heatmap(x, ColSideColors = country.color,
        RowSideColors = country.color)


Rule 4.16
# Visualize correlation matrices with the ellipse package.
library(ellipse)
data(airquality)
# cor only works on complete cases so we remove missing
complete <- airquality[complete.cases(airquality),]
plotcorr(cor(complete))
# Get lung capacity data and fit a model
library(isdals)
data(fev)              
model <- lm(FEV ~ Ht + I(Ht^2) + Gender + Smoke + Age,
            data=fev)
corr2 <- cov2cor(vcov(model))     # Correlation matrix
plotcorr(corr2,  type="lower", diag=TRUE)
par(new=TRUE)                     # Keep 1st plot
plotcorr(corr2, type="upper", diag=TRUE, numbers=TRUE)


Rule 4.17
# Normal quantile-quantile plots.
x <- rnorm(400, mean=5, sd=2)   # Normal data
y <- rt(400, df=4)              # Data from t distribution
qqnorm(x)                       # Create normal q-q plot
qqline(x)                       # Add comparison line
qqnorm(y)                       # Create normal q-q plot
qqline(y)                       # Add comparison line


Rule 4.18
# Regression model diagnostic plots.
data(trees)
result <- lm(Volume ~ Height + Girth, data=trees)
plot(result, which=1:6)   # Built-in diagnostic plots
plot(fitted(result), rstandard(result),
     xlab="Fitted values", ylab="Standardized residuals")
plot(trees$Height, rstandard(result),
     xlab="Height", ylab="Standardized residuals")


Rule 4.19
# Axis break marks with plotrix.
library(plotrix)
plot(2:6)
# Default slash break point on the x axis
axis.break()    
# Zigzag break point at 3.5 on y axis, set width 
# to 0.05 of plot width
axis.break(axis=2, breakpos=3.5, brw=0.05, style="zigzag")


Rule 4.20
# Two series with separate left/right y axes.
library(plotrix)
# Enter data
softdrinks <- c(19.13, 15.94, 22.96, 28.70, 49.75, 76.53, 
                70.15, 89.29, 56.12, 44.01, 31.89, 25.51)
accidents <- c(557, 527, 566, 598, 631, 657, 
               640, 733, 724, 663, 678, 627)
twoord.plot(1:12, softdrinks, 1:12, accidents, xlab="Month",
            ylab="Danish soft drink sales (million liters)",
            rylab="Danish traffic accidents", 
            xtickpos = 1:12, 
            xticklab=month.name[1:12])


Rule 4.21
# Rotated axis labels drawn with text().
# Set extra space for rotated labels
par(mar = c(7, 4, 1, 1) + 0.1)
# Create plot with no x axis and no x axis label
x <- 1:6
plot(x, sin(x), xaxt="n", xlab="", pch=1:6)
axis(1, labels=FALSE)
labels <- c("Are", "you", "master", "of", "your", "domain?")
# Add labels at default tick marks
text(x, y=par("usr")[3] - .1, srt=45, adj=1,
    labels=labels, xpd=TRUE)
# Add axis label at line 5 
mtext(1, text="Added label for x-axis", line=5)


Rule 4.22
# Multiple plots per page with layout().
design <- matrix(c(1,1,0,2), 2, 2, byrow=TRUE)
design                          # Show the layout matrix
layout1 <- layout(design)
layout.show(layout1)
x <- exp(rnorm(100))            # Simulate data
y <- (rnorm(100)+3)**2          # Simulate more data
plot(x, y)                      # Make a scatter plot
hist(x)                         # and a histogram
layout2 <- layout(matrix(c(1, 0, 2, 3), nrow=2, byrow=TRUE),
                  widths=c(3, 1),
                  heights=c(1, 3))
# Remove some of the white space around the plots
par(mar=c(4, 4.3, 0, 0) +.1)    
plot(density(x), main="", xlab="")
plot(x, y)
boxplot(y)


Rule 4.23
# Add a legend to a plot.
data(CO2)
matplot(matrix(CO2$conc, ncol=12), matrix(CO2$uptake, ncol=12), 
        type="l", lty=rep(c(1,2), c(6,6)), 
        xlab="Concentration", ylab="Uptake")
legend(750, 12, c("Quebec", "Mississippi"), lty=c(1,2))


Rule 4.24
# Add a table of numbers to an existing plot.
library(plotrix)
cats <- matrix(c(53, 115, 17, 502, 410, 14),ncol=2)
rownames(cats) <- c("Yes", "No", "No info")
colnames(cats) <- c("Problems", "No problems")
cats
# Calculate the group-wise relative frequencies
cattable <-  cats / apply(cats, 1, sum)
barplot(t(cattable), beside=TRUE)
addtable2plot(4.6, .8, cats, display.rownames=TRUE, bty="o",
              bg="lightgray", title="Cat behaviour data")


Rule 4.25
# Different plotting symbols/colors per group.
data(CO2)
plot(CO2$conc, CO2$uptake, pch=22, 
     bg=c("black", "white")[as.numeric(CO2$Type)],
     xlab="Concentration", ylab="Uptake")
# Combine Treatment and Type into a single grouping factor
origintreat <- interaction(CO2$Treatment, CO2$Type)
head(origintreat)
levels(origintreat)
plotcode <- as.numeric(origintreat)
plot(CO2$conc, CO2$uptake, 
     pch=c(21, 22, 21, 22)[plotcode], 
     bg=c("black", "black", "white", "white")[plotcode],
     xlab="Concentration", ylab="Uptake")


Rule 4.26
# Identify points interactively with the mouse.
x <- 1:10
y <- x**2
plot(x, y)
identify(x,y)   # Click points; indices are returned on exit
plot(x, y)
group <- rep(c("Odd", "Even"), 5)
identify(x,y, labels=group)


Rule 4.27
# Interactive 3d graphics with rgl.
library(rgl)
data(trees)
attach(trees)
plot3d(Height, Girth, Volume, type="s", size=2) # Plot spheres
plot3d(Height, Girth, Volume, type="h")         # Vertic. lines
data(volcano)
z <- volcano            # Height
x <- 10 * (1:nrow(z))   # 10 meter spacing (S to N)
y <- 10 * (1:ncol(z))   # 10 meter spacing (E to W)
persp3d(x, y, z, smooth=TRUE, col="lightgray",
        xlab="NS axis", ylab="EW axis", zlab="Height")
persp3d(x, y, z, col="darkgray", front="line", back="cull",
        xlab="NS axis", ylab="EW axis", zlab="Height")
movie3d(spin3d(axis=c(0,0,1), rpm=3), duration=60, dir=".")


Rule 4.28
# Save graphics to files (pdf, png, jpeg).
x <- 1:10
y <- rnorm(10)
plot(x,y)                            # Create a scatter plot
title("Very lovely graphics")        # Add title 
dev.copy2pdf(file="lovelygraph.pdf") # Copy graph to pdf file
dev.print(file="pngfile.png", device=png, width=480)
dev.print(file="jpgfile.jpeg", device=jpeg, 
          width=480, height=240)
pdf("gorgeousgraph.pdf")      # Opens a pdf device
plot(x,y)                     # Create a scatter plot
title("Very lovely graphics") # Add title 
dev.off()                     # Close device and file


Rule 4.29
# Export graphics as LaTeX/TikZ code.
library(tikzDevice)
data(airquality)
attach(airquality)
tikz(file="tikzexample.tex")       # Open tikz device
plot(Temp, Ozone)                  # Make scatter plot
model <- lm(Ozone ~ Temp)          # Estimate lin. reg.
abline(model, lwd=3, lty=2)        # Add line to plot
# Add a lowess smoothed line to the plot as well 
# lowess requires complete cases only
cc <- complete.cases(Temp, Ozone)  
lines(lowess(Temp[cc], Ozone[cc]), lwd=3, lty=1)
# Add a legend
legend(60, 150, c("Lowess", "Lin. reg."), lty=1:2, lwd=3)
dev.off()                          # Close tikz device
detach(airquality)


Rule 4.30
# Embed fonts in pdf files.
embedFonts("myplot.pdf")
names(pdfFonts())   # Available pdf font families
pdf(file="test.pdf", useDingbats=FALSE)
par(family="NimbusSan")
plot(1,1, pch=16)
title("All embedded")
dev.off()
embedFonts("test.pdf")
library(Cairo)
CairoPDF("mycairoplot.pdf")   # Cairo embeds the fonts it uses
hist(rnorm(100))
dev.off()

Chapter 5: R

Rule 5.1
# Getting help.
help(mean)      # Get help about the function mean 
help("mean")    # Same as above
?mean           # Same as above
help.start()    # Start browser and the general help
apropos("mean")       # Objects whose names contain "mean"
help.search("mean")   # Full-text search of installed help pages
??mean                # Same as above
RSiteSearch("sas2r")  # Search for the string sas2r in online 
                      # help manuals and archived mailing lists


Rule 5.2
# Viewing the source code of functions and methods.
sd
boxplot
methods(boxplot)
boxplot.matrix   # See source for non-hidden method
boxplot.formula  # Try to see source for hidden method
getAnywhere("boxplot.formula")
library(lme4)
refit           # Try to show the source for the refit function
showMethods("refit")     # Show available methods
# List the source code for refit for class type numeric
findMethods("refit", classes="numeric")


Rule 5.3
# Installing and loading packages.
install.packages("foreign")  # Install the foreign package 
install.packages("foreign", lib="C:/my/private/R-packages/")
.libPaths()     # Output from a machine running ubuntu
# Install a package from http://www.some.url.dk/Rfiles/
install.packages("newRpackage", 
                 repos="www.some.url.dk/Rfiles/")
# Install the newRpackage package from a local source file 
# located in the current working directory 
install.packages("newRpackage_1.0-1.tar.gz", repos=NULL)
library("foreign")  # Loads the foreign package
library("isdals", lib.loc="C:/my/private/R-packages/")


Rule 5.4
# Updating installed packages.
update.packages()
update.packages(lib.loc="C:/my/private/R-packages/")


Rule 5.5
# List installed packages.
library()


Rule 5.6
# Overview of the functions in a package.
help(package=foreign)


Rule 5.7
# Package vignettes.
vignette(all=FALSE)  # List all attached vignettes
vignette(all=TRUE)   # List all installed vignettes
library(Matrix)
vignette(all=FALSE, package="Matrix")
vignette("Intro2Matrix")          # Show vignette
vign <- vignette("Intro2Matrix")  # Store vignette
edit(vign)                        # Extract/edit source code


Rule 5.8
# Installing packages from BioConductor.
# NOTE(review): biocLite was superseded by BiocManager::install()
# in Bioconductor 3.8; this code only works with old R releases.
# Read the installation script directly from BioConductor site
source("http://bioconductor.org/biocLite.R")
biocLite("limma")  # Install the limma package
biocLite()         # Install the common BioConductor packages
source("http://bioconductor.org/biocLite.R")
update.packages(repos=biocinstallRepos(), ask=FALSE)


Rule 5.9
# Library search paths and the R_LIBS_USER environment variable.
.libPaths()   # Example for ubuntu installation
.libPaths()   # Example for Windows 7 installation
Sys.getenv("R_LIBS_USER")   # On machine running Ubuntu
Sys.getenv("R_LIBS_USER")   # On machine running WinXP
Sys.getenv("R_LIBS_USER")   # On machine running Mac OS X
.libPaths()


Rule 5.10
# Locating the startup profile files and the R installation.
Sys.getenv("R_PROFILE")
Sys.getenv("R_PROFILE_USER")
# Identify the installation directory
Sys.getenv("R_HOME")  # Output from machine running Ubuntu
Sys.getenv("R_HOME")  # Output from machine running Windows XP 


Rule 5.11
# Listing and removing workspace objects.
# Create two vectors 
x <- 1:5   
myfactor <- factor(c("a", "b", "b", "b", "c", "c"))
ls()                # List local objects
x                   # Print x
rm(x)               # Remove object x
x                   # Try to print x (now an error: x is gone)
ls()


Rule 5.12
# The working directory.
getwd()          # Get the current working directory
setwd("d:/")     # Set the current working directory to d:/


Rule 5.13
# Saving and restoring the workspace.
ls()                     # List the current objects
x <- 1:5                 # Create a new vector
ls()                     # List the current objects
save.image("tmp/.RData") # Save the workspace to tmp/.RData
q()                      # Quit R
ls()                     # List the current objects
load("tmp/.RData")       # Load the saved image
ls()                     # List the current objects
x


Rule 5.14
# Saving and re-running the command history.
x <- 1:5
y <- c(2, 3, 3, 1, 2)
lm(y ~ x)
savehistory("myanalysis.txt")  # Save commands entered so far
loadhistory("myanalysis.txt")  # Restores history, not objects
x                              # x is not found
source("myanalysis.txt")       # Re-run the saved commands
ls()


Rule 5.15
# File management from within R.
dir()             # List files in the current working directory
dir(pattern="^b") # List files starting with b
file.copy("boxcox.r", "boxcox2.r")     # Copy file to boxcox2.r
file.rename("boxcox2.r", "newrcode.r") # Rename to newrcode.r
file.exists("newrcode.r")              # Check if file exists
file.exists("boxcox2.r")               # Old file is gone
file.remove("newrcode.r")              # Delete the file
dir.create("newdir")                   # Make new directory
basename("d:/ont/mention/the/war.txt") # Final path component
dirname("d:/ont/mention/the/war.txt")  # Directory part of the path
# NOTE(review): "d:/ont" looks like a truncated "d:/dont" - confirm
# against the printed book.


Rule 5.16
# Choose a data file interactively.
mydata <- read.table(file.choose(), header=TRUE)
library(tcltk)
mydata <- read.table(tk_choose.files(), header=TRUE)


Rule 5.17
# Run operating system commands from R.
system("df -h")           # File system disk space usage (unix)
system("fsutil volume diskfree c:")  # Disk usage (Windows)
when <- system("date", intern=TRUE)  # Capture the output in R
when