Документ взят из кэша поисковой машины. Адрес оригинального документа : http://herba.msu.ru/shipunov/school/biol_240/en/visual_statistics.r
Дата изменения: Fri Apr 8 00:51:27 2016
Дата индексирования: Sun Apr 10 19:25:36 2016
Кодировка:

pdf.options(pointsize=16)

# \chapterX{Foreword}

# \chapter{The Data}

# \section{Origin of the data}

# \section{Population and sample}

# \section{How to obtain the data}

# \section{What to find in the data}

# \subsection{Why we need the data analysis}

# \subsection{What data analysis can do}

# \section{Answers to exercises}

# \chapter{How to process the data}

# \section{General purpose software}

# \section{Statistical software}

# \subsection{Graphical systems}

# \subsection{Statistical environments}

# \section{The very short history of the \S and \R}

# \section{Use, advantages and disadvantages of the \R}

b <- matrix(1:9, ncol=3)

# \section{How to download and install \R}

# \section{How to start with \R}

# \subsection{Launching \R}

# \subsection{First steps}

q

# \section{\R and data: the view from outside}

# \subsection{\R as the overgrown calculator}

# \subsection{How to load the data}

a <- c(1,2,3,4,5)

a

b <- 1:5

b

getwd()

getwd()

getwd()

dir("data")

read.table("data/mydata.txt", sep=";", head=TRUE)

read.table("data/mydata1.txt", sep="\t", head=TRUE)

read.table("data/mydata3.txt", dec=",", sep=";", h=T)

x <- "apple"

save(x, file="x.rd") # Save object "x"

exists("x")

rm(x)

exists("x")

dir()

load("x.rd") # Load object "x"

x

# \subsection{How to save the results}

write.table(file="trees.csv", trees, row.names=FALSE, sep=";",
quote=FALSE)

sink("1.txt", split=TRUE)

2+2

sink()

sink("1.txt", split=TRUE, append=TRUE)

print("2+2")

2+2

sink()

# \subsection{Graphics}

pdf(file="pics/05430.pdf"); oldpar <- par(mar=c(2,2,2,1))

plot(1:20, main="Title")

legend("topleft", pch=1, legend="My wonderful points")

par(oldpar); dev.off()

pdf(file="pics/05610.pdf"); oldpar <- par(mar=c(2,2,2,1))

plot(cars)

title(main="Cars from 1920s")

par(oldpar); dev.off()

pdf(file="pics/05960.pdf"); oldpar <- par(mar=c(2,2,1,1))

library(ggplot2)

qplot(1:20, 1:20, main="title")

par(oldpar); dev.off()

# \subsection{Graphical devices}

png(file="1-20.png", bg="transparent")

plot(1:20)

text(10, 20, "a")

dev.off()

# \subsection{Graphical options}

pdf(file="pics/06060.pdf"); oldpar <- par(mar=c(2,2,0,1))

old.par <- par(mfrow=c(2,1))

hist(cars$speed, main="")

hist(cars$dist, main="")

par(old.par)

par(oldpar); dev.off()

# \subsection{Interactive graphics}

# \section{Answers to exercises}

# \chapter{Types of data}

# \section{Degrees, hour and kilometers: measurement data}

pdf(file="pics/parametric_to_crop.pdf"); oldpar <- par(mar=c(0,0,0,0))

library(plotrix)

plot(c(-1,1), c(-1,1), type="n", xlab="", ylab="", axes=FALSE)

draw.circle(-.2, 0, .4)

draw.circle(.1, 0, .9)

text(-.2, 0, "parametric", cex=1.5)

text(.1, 0.6, "non-parametric", cex=1.5)

par(oldpar); dev.off()

x <- c(174, 162, 188, 192, 165, 168, 172.5)

str(x)

is.vector(x)

is.numeric(x)

# \section{Grades: ranked data}

# \section{Red, Yellow, Green: nominal data}

sex <- c("male", "female", "male", "male", "female", "male",
"male")

is.character(sex)

is.vector(sex)

str(sex)

sex

sex[1]

sex[2:3]

table(sex)

sex.f <- factor(sex)

sex.f

pdf(file="pics/08830.pdf"); oldpar <- par(mar=c(2,2,1,1))

plot(sex.f)

par(oldpar); dev.off()

is.factor(sex.f)

is.character(sex.f)

str(sex.f)

sex.f[5:6]

sex.f[6:7]

sex.f[6:7, drop=TRUE]

factor(as.character(sex.f[6:7]))

as.numeric(sex.f)

w <- c(69, 68, 93, 87, 59, 82, 72)

pdf(file="pics/09290.pdf"); oldpar <- par(mar=c(2,2,0,1))

plot(x, w, pch=as.numeric(sex.f), col=as.numeric(sex.f))

legend("topleft", pch=1:2, col=1:2, legend=levels(sex.f))

par(oldpar); dev.off()

m <- c("L", "S", "XL", "XXL", "S", "M", "L")

m.f <- factor(m)

m.f

m.o <- ordered(m.f, levels=c("S", "M", "L", "XL", "XXL"))

m.o

a <- factor(3:5)

a

as.numeric(a) # Incorrect conversion

as.numeric(as.character(a)) # Correct!

# \section{Fractions, counts and ranks: secondary data}

pdf(file="pics/09860.pdf"); oldpar <- par(mar=c(2,2,0,1))

daisy.t <- read.table("data/daisy.txt", sep="\t")

daisy <- daisy.t$V2

names(daisy) <- daisy.t$V1

oldpar <- par(mar = c(7, 4, 4, 2) + 0.1)

daisy.plot <- barplot(daisy, names.arg="")

text(daisy.plot, par("usr")[3]-0.5, srt=45, adj=1,
xpd=TRUE, labels=names(daisy))

par(oldpar)

par(oldpar); dev.off()

pdf(file="pics/10070.pdf"); oldpar <- par(mar=c(2,2,3,1))

dotchart(daisy)

par(oldpar); dev.off()

pdf(file="pics/06105.pdf")

cplants <- read.table("data/cplants.txt", sep="\t",
quote="", as.is=T)

library(wordcloud)

cfams <- data.frame(table(cplants$V3))

set.seed(5); wordcloud(cfams[,1], cfams[,2])

dev.off()

a1 <- c(1,2,3,4,4,5,7,7,7,9,15,17)

a2 <- c(1,2,3,4,5,7,7,7,9,15,17)

names(a1) <- rank(a1)

a1

names(a2) <- rank(a2)

a2

wilcox.test(a2)

# \section{Missing data}

h <- c(8, 10, NA, NA, 8, NA, 8)

h

mean(h)

mean(h, na.rm=TRUE)

mean(na.omit(h))

h.old <- h

h.old

h[is.na(h)] <- mean(h, na.rm=TRUE)

h

# \section{Outliers, and how to find them}

# \section{Changing data: basics of transformations}

a <- 1:10

b <- seq(100, 1000, 100)

d <- data.frame(a, b)

d

scale(d)

# \section{Inside \R: Matrices, lists and data frames}

# \subsection{Matrices}

m <- 1:4

m

ma <- matrix(m, ncol=2, byrow=TRUE)

ma

str(ma)

str(m)

mb <- m

mb

attr(mb, "dim") <- c(2,2)

mb

m3 <- 1:8

dim(m3) <- c(2,2,2)

m3

# \subsection{Lists}

l <- list("R", 1:3, TRUE, NA, list("r", 4))

l

fred <- list(name="Fred", wife.name="Mary", no.children=3,
child.ages=c(1,5,9))

fred

h[3]

ma[2, 1]

l[1]

str(l[1])

l[[1]]

str(l[[1]])

str(l[[5]])

names(l) <- c("first", "second", "third", "fourth", "fifth")

l$first

str(l$first)

names(w) <- c("Nick", "Jenny", "Peter", "Alex", "Kathryn",
"Vas", "George")

w

rownames(ma) <- c("a1","a2")

colnames(ma) <- c("b1","b2")

ma

w["Jenny"]

# \subsection{Data frames}

d <- data.frame(weight=w, height=x, size=m.o, sex=sex.f)

d

str(d)

d$weight

d[[1]]

d[,1]

d[,"weight"]

d[,2:4]

d[,-1]

d[d$sex=="female",]

d$sex=="female"

d[order(d$sex, d$height), ]

# \subsection{How to convert vectors, matrices, lists and data frames}

# \section{Answers to exercises}

rev(sort(daisy))

ma <- matrix(m, ncol=2, byrow=FALSE)

ma

d.sorted <- d[order(d$sex, d$height), ]

d.sorted[, order(names(d.sorted))]

# \chapter{One-dimensional data}

# \section{How to estimate general tendencies}

salary <- c(21, 19, 27, 11, 102, 25, 21)

mean(salary); median(salary)

a1 <- c(1,2,3,4,4,5,7,7,7,9,15,17)

a2 <- c(1,2,3,4,5,7,7,7,9,15,17)

median(a1)

median(a2)

attach(trees) # The first way

mean(Girth)

mean(Height)

mean(Volume/Height)

detach(trees)

with(trees, mean(Volume/Height)) # Second way

sapply(trees, mean) # Third way

sex <- c("male", "female", "male", "male", "female", "male",
"male")

t.sex <- table(sex)

mode <- names(t.sex[which.max(t.sex)])

mode

sd(salary); var(salary); IQR(salary)

attach(trees)

mean(Height)

median(Height)

sd(Height)

IQR(Height)

detach(trees)

pdf(file="pics/14390.pdf"); oldpar <- par(mar=c(2,2,0,1))

new.1000 <- sample((median(salary) - IQR(salary)) :
(median(salary) + IQR(salary)), 1000, replace=TRUE)

salary2 <- c(salary, new.1000)

boxplot(salary2, log="y")

par(oldpar); dev.off()

pdf(file="pics/14560.pdf"); oldpar <- par(mar=c(2,2,0,1))

boxplot(trees)

par(oldpar); dev.off()

pdf(file="pics/14730.pdf"); oldpar <- par(mar=c(2,2,0,1))

hist(salary2, breaks=20, main="")

par(oldpar); dev.off()

table(cut(salary2, 20))

stem(salary, scale=2)

pdf(file="pics/15260.pdf"); oldpar <- par(mar=c(2,2,0,1))

plot(density(salary2, adjust=2), main="")

rug(salary2)

par(oldpar); dev.off()

pdf(file="pics/15580.pdf"); oldpar <- par(mar=c(2,2,0,1))

library("beeswarm")

beeswarm(trees)

boxplot(trees, add=TRUE)

par(oldpar); dev.off()

pdf(file="pics/stripchart.pdf"); oldpar <- par(mar=c(2,2,0,1))

stripchart(data.frame(scale(trees)), vertical=TRUE)

par(oldpar); dev.off()

lapply(list(salary, salary2), summary)

summary(attenu)

100*sapply(trees, sd)/colMeans(trees)

# \section{Bad data}

dir("data")

err <- read.table("data/errors.txt", h=TRUE, sep="\t")

str(err)

summary(err)

# \section{One-dimensional statistical tests}

t.test(salary, mu=mean(salary))

wilcox.test(salary, mu=median(salary), conf.int=TRUE)

pdf(file="pics/17030.pdf"); oldpar <- par(mar=c(2,2,0,1))

qqnorm(salary2, main=""); qqline(salary2, col=2)

par(oldpar); dev.off()

shapiro.test(salary)

shapiro.test(salary2)

set.seed(1638)

shapiro.test(rnorm(100))

ks.test(salary2, "pnorm")

oldoptions <- options(scipen=100)

ks.test(salary2, "pnorm")

options(oldoptions)

# \section{How to create your own functions}

source("r/asmisc.r") # to load Normality()

Normality <- function(x, p=.05)
{
ifelse(shapiro.test(x)$p.value > p, "NORMAL", "NOT NORMAL")
}

sapply(list(salary, salary2), Normality)

sapply(trees, Normality)

sapply(log(trees+1), Normality)

# \section{Are the percentages always accurate?}

prop.test(x=356, n=476, p=0.7, alternative="two.sided")

# \section{Answers to exercises}

str(shapiro.test(rnorm(100)))

set.seed(1683)

shapiro.test(rnorm(100))$p.value

betula <- read.table("data/betula.txt", h=TRUE)

str(betula)

sapply(betula[,c(2:4,7:8)], Normality) # we skip categorical columns

CV <- function(x)

{

100*sd(x)/mean(x)

}

sapply(betula[,c(2:4,7:8)], CV)

prop.test(0.48*262, 262)

power.prop.test(p1=0.48, p2=0.52, power=0.8)

table(betula[betula$LOC < 3, c("LOC","LOBES")])

prop.test(table(betula[betula$LOC < 3, c("LOC","LOBES")]))

table(betula[, c("LOBES","WINGS")])

mcnemar.test(table(betula[, c("LOBES","WINGS")]))

# \chapter{Analyzing relationships: Two-dimensional data}

# \section{What is a statistical test?}

# \subsection{Statistical hypotheses}

# \subsection{Statistical errors}

# \section{Is there a difference? Comparing two samples}

sample1 <- c(1, 2, 3, 4, 5, 6, 7, 8, 9) # or 1:9

sample2 <- c(5, 5, 5, 5, 5, 5, 5, 5, 5) # or rep(5, 9)

pdf(file="pics/20320.pdf"); oldpar <- par(mar=c(2,2,0,1))

plot(extra ~ group, data = sleep)

par(oldpar); dev.off()

sapply(unstack(sleep, extra ~ group), Normality)

t.test(extra ~ group, data = sleep, paired=TRUE)

wilcox.test(Ozone ~ Month, data = airquality,
subset = Month %in% c(5, 8), conf.int=TRUE)

pdf(file="pics/20890.pdf"); oldpar <- par(mar=c(2,2,0,1))

boxplot(Ozone ~ Month, data = airquality,
subset = Month %in% c(5, 8))

par(oldpar); dev.off()

# \section{Effect sizes}

a <- 1:9; b <- rep(5, 9)

library(effsize)

cohen.d(a, b)

cohen.d(extra ~ group, data=sleep)

cliff.delta(Ozone ~ Month, data = airquality,
subset = Month %in% c(5, 8))

# \section{Is there an association? Analysis of tables}

with(airquality, table(cut(Temp, quantile(Temp)), Month))

ftable(Titanic, row.vars = 1:3)

d <- factor(rep(c("A","B","C"), 10), levels=c("A","B","C","D",
"E"))

is.na(d) <- 3:4

table(factor(d, exclude = NULL))

pdf(file="pics/21660.pdf"); oldpar <- par(mar=c(2,2,0,1))

titanic <- apply(Titanic, c(1, 4), sum)

titanic

titanic

mosaicplot(titanic, col = c("red", "green"), main = "",
cex.axis=1)

par(oldpar); dev.off()

x <- margin.table(HairEyeColor, 1:2)

chisq.test(x)

pdf(file="pics/21960.pdf"); oldpar <- par(mar=c(2,2,0,1))

x <- margin.table(HairEyeColor, c(1,2))

assocplot(x)

par(oldpar); dev.off()

tox <- read.table("data/poisoning.txt", h=TRUE)

head(tox)

for (m in 2:ncol(tox))
{
tmp <- chisq.test(tox$ILL, tox[,m])
print(paste(names(tox)[m], tmp$p.value))
}

pdf(file="pics/24120.pdf"); oldpar <- par(mar=c(4,4,0,1))

assocplot(table(ILL=tox$ILL, CAESAR=tox$CAESAR))

par(oldpar); dev.off()

tea <- matrix(c(3,1,1,3), nrow=2)

colnames(tea) <- row.names(tea) <- c("Milk", "Tea")

tea

chisq.test(tea)

fisher.test(tea)

chisq.test(tea, simulate.p.value=T)

pok <- read.table("data/pokorm.txt", h=TRUE, sep=";")

source("r/concordance.r")

cohen.kappa(as.matrix(pok))

# \section{Analysis of correlations}

cor(5:15, 7:17)

cor(5:15, c(7:16, 23))

cor(trees)

x <- rexp(50)

cor(x, log(x), method="spearman")

symnum(cor(longley))

pdf(file="pics/22880.pdf"); oldpar <- par(mar=c(2,3,0,1))

cor.l <- cor(longley)

colnames(cor.l) <- abbreviate(colnames(cor.l))

rownames(cor.l) <- abbreviate(rownames(cor.l))

image(1:ncol(cor.l), 1:nrow(cor.l), cor.l,
col=heat.colors(22), axes=FALSE, xlab="", ylab="")

axis(1, at=1:ncol(cor.l), labels=colnames(cor.l))

axis(2, at=1:nrow(cor.l), labels=rownames(cor.l),
las = 2)

par(oldpar); dev.off()

pdf(file="pics/23120.pdf"); oldpar <- par(mar=c(2,2,0,1))

library(ellipse)

plotcorr(cor.l, type="lower", mar=c(0,0,0,0))

par(oldpar); dev.off()

with(trees, cor.test(Girth, Height))

source("r/pleiad.r")

i <- read.table("data/abio.txt", sep="\t", h=TRUE)

Nnames(i)

pdf(file="pics/cover_to_crop.pdf")

abio.dyn <- cor(i[,1:6], i[,7:23],
method="spearman", use="complete.obs")

abio.dynn <- abs(abio.dyn[,-c(1, 7, 9,14)])

abio.dynn <- ifelse(abio.dynn < .2, 0, abio.dynn)

Pleiad(abio.dynn, abbr=2, legend=FALSE,
pch=21, cex=2, bg="white")

dev.off()

Cor(i, method="spearman", use="complete.obs")

Topm(abio.dyn)

# \section{Analysis of regression}

pdf(file="pics/23870.pdf"); oldpar <- par(mar=c(4,4,1,1))

women.lm <- lm(weight ~ height, data=women)

plot(weight ~ height, data=women,
xlab="Height, in", ylab="Weight, lb")

grid()

abline(women.lm, col="red")

source("r/asmisc.r") # to load Cladd()

Cladd(women.lm, data=women)

legend("bottomright", col=2:1, lty=1:2,
legend=c("linear relationship", "95% confidence bands"))

par(oldpar); dev.off()

summary(women.lm)

oldpar <- par(mfrow=c(3,3))

# Uniform variation (good):

for (i in 1:9) plot(1:50, rnorm(50), xlab="Fitted",
ylab="Residuals")

# Non-uniform variation (bad):

for (i in 1:9) plot(1:50, (1:50)*rnorm(50), xlab="Fitted",
ylab="Residuals")

par(oldpar)

women.lm2 <- lm(weight ~ height + I(height^2), data=women)

summary(women.lm2)

plot(women.lm2, which=1) # just residuals vs. fitted

plot(weight ~ height, data=women)

Cladd(women.lm2, data=women, ab.lty=1, ab.col="red")

# \subsection{Analysis of covariation}

pdf(file="pics/3265.pdf"); oldpar <- par(mar=c(4,4,0,1))

ipo <- read.table("data/ipomopsis.txt", h=T)

with(ipo, plot(Root, Fruit, pch=as.numeric(Grazing)))

abline(lm(Fruit ~ Root, data=subset(ipo, Grazing=="Grazed")))

abline(lm(Fruit ~ Root, data=subset(ipo, Grazing=="Ungrazed")), lty=2)

legend("topleft", lty=1:2, legend=c("Grazed","Ungrazed"))

par(oldpar); dev.off()

ipo.lm <- lm(Fruit ~ Root * Grazing, data=ipo)

summary(ipo.lm)

ipo.lm2 <- update(ipo.lm, . ~ . - Root:Grazing)

summary(ipo.lm2)

AIC(ipo.lm)

AIC(ipo.lm2)

AIC(women.lm)

AIC(women.lm2)

elections <- read.table("data/elections.txt", h=TRUE)

str(elections)

attach(elections)

PROP <- cbind(CAND.1, CAND.2, CAND.3) / VOTER

ATTEN <- (VALID + INVALID) / VOTER

pdf(file="pics/27760.pdf"); oldpar <- par(mar=c(4,4,0,1))

lm.1 <- lm(CAND.1/VOTER ~ ATTEN)

lm.2 <- lm(CAND.2/VOTER ~ ATTEN)

lm.3 <- lm(CAND.3/VOTER ~ ATTEN)

plot(CAND.3/VOTER ~ ATTEN, xlim=c(0,1), ylim=c(0,1),
xlab="Attendance", ylab="Percent voted for the candidate")

points(CAND.1/VOTER ~ ATTEN, pch=2)

points(CAND.2/VOTER ~ ATTEN, pch=3)

abline(lm.3)

abline(lm.2, lty=2)

abline(lm.1, lty=3)

legend("topleft", lty=c(3,2,1),
legend=c("Party 1","Party 2","Party 3"))

detach(elections)

par(oldpar); dev.off()

elections2 <- cbind(ATTEN, stack(data.frame(PROP)))

names(elections2) <- c("atten","percn","cand")

str(elections2)

ancova.v <- lm(percn ~ atten * cand, data=elections2)

summary(ancova.v)

pdf(file="pics/28790.pdf"); oldpar <- par(mar=c(4,4,0,1))

prp.coast <- read.table("data/primula.txt",
as.is=TRUE, h=TRUE)

plot(yfrac ~ nwse, data=prp.coast, type="n",
xlab="Distance from Novorossijsk, km",
ylab="Proportion of light-colored flowers, %")

rect(129, -10, 189, 110, col=gray(.8), border=NA); box()

mtext("129", at=128, side=3, line=0, cex=.8)

mtext("189", at=189, side=3, line=0, cex=.8)

points(yfrac ~ nwse, data=prp.coast)

abline(lm(yfrac ~ nwse, data=prp.coast), lty=2)

lines(loess.smooth(prp.coast$nwse, prp.coast$yfrac), lty=1)

par(oldpar); dev.off()

# \section{Probability of the success: logistic regression}

l <- read.table("data/logit.txt")

head(l)

l.logit <- glm(V2 ~ V1, family=binomial, data=l)

summary(l.logit)

tox.logit <- glm(formula=I(2-ILL) ~ CAESAR + TOMATO,
family=binomial, data=tox)

tox.logit2 <- update(tox.logit, . ~ . - TOMATO)

tox.logit3 <- update(tox.logit, . ~ . - CAESAR)

tox.logit$aic

tox.logit2$aic

tox.logit3$aic

# \section{If there are more than two samples: ANOVA}

pdf(file="pics/27420.pdf"); oldpar <- par(mar=c(2,2,0,1))

hwc <- read.table("data/hwc.txt", h=T)

str(hwc)

boxplot(WEIGHT ~ COLOR, data=hwc)

source("r/asmisc.r")

sapply(hwc[, 2:3], Normality)

wc <- aov(WEIGHT ~ COLOR, data=hwc)

summary(wc)

par(oldpar); dev.off()

pairwise.t.test(hwc$WEIGHT, hwc$COLOR)

TukeyHSD(wc)

kruskal.test(WEIGHT ~ COLOR, data=hwc)

# \section{Answers to exercises}

a <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)

b <- c(5, 5, 5, 5, 5, 5, 5, 5, 5)

dif <- a-b

pos.dif <- dif[dif > 0]

prop.test(length(pos.dif), length(dif))

ozone.month <- airquality[,c("Ozone","Month")]

ozone.month.list <- unstack(ozone.month)

sapply(ozone.month.list, Normality)

grades <- read.table("data/grades.txt")

classes <- split(grades$V1, grades$V2)

sapply(classes, Normality)

lapply(classes, function(.x) median(.x, na.rm=TRUE))

wilcox.test(classes$A1, classes$A2, paired=TRUE)

wilcox.test(classes$B1, classes$A1, alt="greater")

cashiers <- read.table("data/cashiers.txt", h=TRUE)

head(cashiers)

sapply(cashiers, Normality)

(cashiers.m <- sapply(cashiers, mean))

with(cashiers, t.test(CASHIER.1, CASHIER.2, alt="greater"))

pr <- read.table("data/seedlings.txt", h=TRUE)

str(pr)

(t1 <- chisq.test(table(pr[pr$CID %in% c(0,105),])))

(t2 <- chisq.test(table(pr[pr$CID %in% c(0,80),])))

(t3 <- chisq.test(table(pr[pr$CID %in% c(0,63),])))

p.adjust(c(t1$p.value, t2$p.value, t3$p.value), method="bonferroni")

seeing <- read.table("data/seeing.txt")

attach(seeing)

seeing.logit <- glm(formula=V3 ~ V2, family=binomial)

summary(seeing.logit)

pdf(file="pics/32450.pdf"); oldpar <- par(mar=c(2,2,0,1))

tries <- seq(1, 5, 0.081) # exactly 50 numbers from 1 to 5

seeing.p <- predict(seeing.logit, list(V2=tries),
type="response")

plot(V3 ~ jitter(V2, amount=.1))

lines(tries, seeing.p)

detach(seeing)

par(oldpar); dev.off()

summary(aov(HEIGHT ~ COLOR, data=hwc))

pairwise.t.test(hwc$HEIGHT, hwc$COLOR)

pdf(file="pics/26450.pdf"); oldpar <- par(mar=c(2,2,0,1))

cor.test(hwc$WEIGHT, hwc$HEIGHT)

w.h <- lm(WEIGHT ~ HEIGHT, data=hwc)

summary(w.h)

plot(WEIGHT ~ HEIGHT, data=hwc)

abline(w.h)

Cladd(w.h, data=hwc)

par(oldpar); dev.off()

# \chapter{Analysis of structure: Data mining}

# \section{Drawing the multivariate data}

# \subsection{3D scatterplots}

pdf(file="pics/27180.pdf"); oldpar <- par(mar=c(0,0,0,0))

pairs(iris[,1:4], pch=21, bg=as.numeric(iris[,5]))

par(oldpar); dev.off()

pdf(file="pics/27000.pdf"); oldpar <- par(mar=c(0,0,0,1))

library(lattice)

xyplot(Sepal.Length ~ Petal.Length + Petal.Width | Species,
data=iris, auto.key=TRUE)

par(oldpar); dev.off()

pdf(file="pics/35270.pdf"); oldpar <- par(mar=c(0,0,0,1))

coplot(percn ~ atten | cand, data=elections2)

par(oldpar); dev.off()

# \subsection{Pictographs}

pdf(file="pics/27400.pdf"); oldpar <- par(mar=c(2,2,0,1))

stars(mtcars[1:9, 1:7], cex=1.2)

par(oldpar); dev.off()

pdf(file="pics/27580_to_crop.pdf"); oldpar <- par(mar=c(2,2,0,1))

library(TeachingDemos)

faces(mtcars[1:9, 1:7])

par(oldpar); dev.off()

pdf(file="pics/34390.pdf"); oldpar <- par(mar=c(2,2,0,1))

library(MASS)

parcoord(iris[,-5], col=rep(1:3, table(iris[,5])))

legend("top", bty="n", lty=1, col=1:3, legend=names(table(iris[,5])))

par(oldpar); dev.off()

# \section{The shadows of multidimensional clouds: principal component analysis}

iris.pca <- princomp(scale(iris[,1:4]))

pdf(file="pics/27890.pdf"); oldpar <- par(mar=c(2,2,0,1))

plot(iris.pca, main="")

par(oldpar); dev.off()

summary(iris.pca)

pdf(file="pics/28180.pdf"); oldpar <- par(mar=c(4,4,0,1))

iris.p <- predict(iris.pca)

plot(iris.p[,1:2], type="n", xlab="PC1", ylab="PC2")

text(iris.p[,1:2],
labels=abbreviate(iris[,5],1, method="both.sides"))

par(oldpar); dev.off()

oldpar <- par()

pdf(file="pics/28360.pdf"); oldpar <- par(mar=c(2,2,2,2))

biplot(iris.pca)

par(oldpar); dev.off()

par(oldpar)

loadings(iris.pca)

pdf(file="pics/28750.pdf"); oldpar <- par(mar=c(2,2,0,1))

library(ade4)

iris.dudi <- dudi.pca(iris[,1:4], scannf=FALSE)

s.class(iris.dudi$li, iris[,5])

par(oldpar); dev.off()

iris.between <- bca(iris.dudi, iris[,5], scannf=FALSE)

randtest(iris.between)

# \section{Classification without learning: cluster analysis}

library(cluster)

iris.dist <- daisy(iris[,1:4], metric="manhattan")

pdf(file="pics/36340.pdf"); oldpar <- par(mar=c(4,4,0,1))

library(KernSmooth)

iris.c <- cmdscale(iris.dist)

est <- bkde2D(iris.c[,1:2], bandwidth=c(.7,1.5))

plot(iris.c[,1:2], type="n", xlab="Dim. 1", ylab="Dim. 2")

text(iris.c[,1:2],
labels=abbreviate(iris[,5],1, method="both.sides"))

contour(est$x1, est$x2, est$fhat, add=TRUE,
drawlabels=FALSE, lty=3)

par(oldpar); dev.off()

pdf(file="pics/29530.pdf"); oldpar <- par(mar=c(0,2,0,1))

iriss <- iris[seq(1,nrow(iris),5),]

iriss.dist <- daisy(iriss[,1:4])

iriss.h <- hclust(iriss.dist, method="ward")

plot(iriss.h, labels=abbreviate(iriss[,5], 1,
method="both.sides"), main="")

par(oldpar); dev.off()

pdf(file="pics/29730.pdf"); oldpar <- par(mar=c(2,2,1,1))

library(pvclust)

irisst <- t(iriss[,1:4])

colnames(irisst) <- paste(abbreviate(iriss[,5], 3),
colnames(irisst))

iriss.pv <- pvclust(irisst, method.dist="manhattan",
method.hclust="ward", nboot=100)

plot(iriss.pv, col.pv=c(1,0,0), main="")

par(oldpar); dev.off()

eq <- read.table("data/eq.txt", h=TRUE)

eq.k <- kmeans(eq[,-1], 2)

table(eq.k$cluster, eq$SPECIES)

pdf(file="pics/36040.pdf"); oldpar <- par(mar=c(2,2,0,1))

iris.f <- fanny(iris[,1:4], 3)

plot(iris.f, which=1, main="")

head(data.frame(sp=iris[,5], iris.f$membership))

par(oldpar); dev.off()

# \subsection{Correspondence analysis}

pdf(file="pics/37300.pdf"); oldpar <- par(mar=c(2,2,2,2))

library(MASS)

caith

biplot(corresp(caith, nf = 2))

par(oldpar); dev.off()

pdf(file="pics/4446.pdf"); oldpar <- par(mar=c(2,2,2,2))

require(vegan)

alla <- read.table("data/lakesea_abio.txt", sep="\t", h=T)

allc <- read.table("data/lakesea_bio.txt", sep="\t", h=T)

all.cca <- cca(allc, alla[,-14])

plot.all.cca <- plot(all.cca, display=c("sp","cn"))

points(all.cca, display="sites", pch=ifelse(alla[,14],15,0))

legend("topleft", pch=c(15,0), legend=c("lake","sea"))

text(-1.6, -4.2, "Carex.lasiocarpa", pos=4)

par(oldpar); dev.off()

# \section{Classification with learning: discriminant analysis}

library(MASS)

iris.train <- iris[seq(1,nrow(iris),5),]

iris.unknown <- iris[-seq(1,nrow(iris),5),]

iris.lda <- lda(iris.train[,1:4], iris.train[,5])

iris.ldap <- predict(iris.lda, iris.unknown[,1:4])$class

table(iris.ldap, iris.unknown[,5])

Misclass <- function(pred, obs) {
tbl <- table(pred, obs)
sum <- colSums(tbl)
dia <- diag(tbl)
msc <- (sum - dia)/sum * 100
m.m <- mean(msc)
cat("Classification table:", "\n")
print(tbl)
cat("Misclassification errors:", "\n")
print(round(msc, 1))
}

Misclass(iris.ldap, iris.unknown[,5])

ldam <- manova(as.matrix(iris.unknown[,1:4]) ~ iris.ldap)

summary(ldam, test="Wilks")

pdf(file="pics/30320.pdf"); oldpar <- par(mar=c(4,4,0,1))

iris.lda2 <- lda(iris[,1:4], iris[,5])

iris.ldap2 <- predict(iris.lda2, dimen=2)$x

plot(iris.ldap2, type="n", xlab="LD1", ylab="LD2")

text(iris.ldap2, labels=abbreviate(iris[,5], 1,
method="both.sides"))

par(oldpar); dev.off()

pdf(file="pics/30520.pdf"); oldpar <- par(mar=c(2,2,0,1))

library(tree)

iris.tree <- tree(iris[,5] ~ ., iris[,-5])

plot(iris.tree)

text(iris.tree)

par(oldpar); dev.off()

library(randomForest)

set.seed(17)

iris.rf <- randomForest(iris.train[,5] ~ .,
data=iris.train[,1:4])

iris.rfp <- predict(iris.rf, iris.unknown[,1:4])

table(iris.rfp, iris.unknown[,5])

pdf(file="pics/30890.pdf"); oldpar <- par(mar=c(2,2,0,1))

set.seed(17)

iris.urf <- randomForest(iris[,-5])

MDSplot(iris.urf, iris[,5])

par(oldpar); dev.off()

library(e1071)

iris.svm <- svm(Species ~ ., data = iris.train)

iris.svmp <- predict(iris.svm, iris[,1:4])

table(iris.svmp, iris[,5])

# \section{Answers to exercises}

beer <- read.table("data/beer.txt", sep="\t", h=TRUE)

head(beer)

pdf(file="pics/38270.pdf"); oldpar <- par(mar=c(0,2,0,1))

library(vegan)

beer.d <- vegdist(beer, "jaccard")

plot(hclust(beer.d, "ward"), main="", xlab="", sub="")

par(oldpar); dev.off()

pdf(file="pics/37980.pdf"); oldpar <- par(mar=c(2,2,0,1))

eq <- read.table("data/eq.txt", h=TRUE)

eq.tree <- tree(eq[,1] ~ ., eq[,-1])

plot(eq.tree); text(eq.tree)

par(oldpar); dev.off()

# \chapter{Statistical workflow}

# \section{Preliminary processing of the data}

# \section{Finding the correct method of data analysis}

# \section{The report}

Sweave("test-Sweave.Rnw")

citation()

# \chapter{Example of \R session}

# \section{Starting...}

dir("data")

data <- read.table("data/bugs.txt", h=TRUE)

data

str(data)

data.f <- data[data$SEX == 0,]

data.m.big <- data[data$SEX == 1 & data$LENGTH > 10,]

data$WEIGHT.R <- data$WEIGHT/data$LENGTH

write.table(data, "data/bugs_new.txt", quote=FALSE)

# \section{Describing...}

summary(data)

summary(data$WEIGHT)

min(data$WEIGHT)

max(data$WEIGHT)

median(data$WEIGHT)

mean(data$WEIGHT)

colMeans(data)

mean(data$WEIGHT, na.rm=TRUE)

data.o <- na.omit(data)

sum(data$WEIGHT)

sum(data[2,])

apply(data, 1, sum)

table(data$SEX)

table(data$COLOR)

100*table(data$SEX)/length(data$SEX)

round(100*table(data$SEX)/length(data$SEX), 0)

sd(data$WEIGHT)

100*sd(data$WEIGHT)/mean(data$WEIGHT)

tapply(data$WEIGHT, data$SEX, mean)

table(data$COLOR, data$SEX)

100*table(data$COLOR, data$SEX)/sum(data$COLOR, data$SEX)

tapply(data$WEIGHT, list(data$SEX, data$COLOR), mean)

# \section{Plotting...}

hist(data$WEIGHT, breaks=20)

hist(data$WEIGHT, breaks=c(seq(0,100,20)))

qqnorm(data$WEIGHT); qqline(data$WEIGHT)

plot(data$LENGTH, data$WEIGHT, type="p")

plot(data$LENGTH, data$WEIGHT, type="p", cex=0.5)

plot(data$LENGTH, data$WEIGHT, type="p", cex=2)

pchShow <- function(

extras = c("*",".", "o","O","0","+","-","|","%","#"),

cex=3, col="black", bg="gray", coltext="black",

cextext=1.2, main="")

{

nex <- length(extras)

np <- 26 + nex

ipch <- 0:(np-1)

k <- floor(sqrt(np))

dd <- c(-1,1)/2

rx <- dd + range(ix <- ipch %/% k)

ry <- dd + range(iy <- 3 + (k-1)- ipch %% k)

pch <- as.list(ipch) # list with integers & strings

if(nex > 0) pch[26+ 1:nex] <- as.list(extras)

oldpar <- par(mar=c(0,0,0,0))

plot(rx, ry, type="n", axes=FALSE, xlab="", ylab="", main=main)

abline(v = ix, h = iy, col="lightgray", lty="dotted")

for(i in 1:np)

{

pc <- pch[[i]]

## 'col' symbols with a 'bg'-colored interior (where available) :

points(ix[i], iy[i], pch = pc, col = col, bg = bg, cex = cex)

if(cextext > 0) text(ix[i] - 0.3, iy[i], pc, col = coltext, cex = cextext)

}

par(oldpar)

}

pdf(file="pics/47690.pdf"); oldpar <- par(mar=c(2,2,0,1))

pchShow(c("*",".","+"), cex = 2)

par(oldpar); dev.off()

plot(data$LENGTH, data$WEIGHT, type="p", pch=2)

plot(data$LENGTH, data$WEIGHT, type="n")

text(data$LENGTH, data$WEIGHT, labels=data$SEX)

plot(data$LENGTH, data$WEIGHT, type="n")

text(data$LENGTH, data$WEIGHT, labels=data$SEX, col=data$SEX+1)

plot(data$LENGTH, data$WEIGHT, type="n")

points(data$LENGTH, data$WEIGHT, pch=data$SEX)

pdf(file="pics/52050.pdf"); oldpar <- par(mar=c(4,4,1,1))

plot(data$LENGTH, data$WEIGHT, type="n", xlab="Length", ylab="Weight")

text(data$LENGTH, data$WEIGHT,
labels=ifelse(data$SEX, "\\MA", "\\VE"),
vfont=c("serif","plain"), cex=1.5)

par(oldpar); dev.off()

plot(data$LENGTH, data$WEIGHT, type="n")

points(data$LENGTH, data$WEIGHT, pch=data$SEX, col=data$SEX+1)

legend("bottomright", c("male", "female"), pch=c(0,1), col=c(1,2))

dev.copy(pdf, "graph.pdf")

dev.off()

pdf("graph.pdf")

plot(data$LENGTH, data$WEIGHT, type="n")

text(data$LENGTH, data$WEIGHT, pch=data$SEX, col=data$SEX+1)

legend("bottomright", c("male", "female"), pch=c(0,1), col=c(1,2))

dev.off()

plot(data[order(data$LENGTH), c("LENGTH", "WEIGHT")], type="o")

plot(data[order(data$COLOR), c("COLOR", "WEIGHT")], type="o",
ylim=c(5, 15))

lines(data[order(data$COLOR), c("COLOR", "LENGTH")], lty=3)

boxplot(data$LENGTH)

boxplot(data$LENGTH ~ factor(data$SEX))

# \section{Testing...}

t.test(data$WEIGHT, data$LENGTH, paired=TRUE)

t.test(data$WEIGHT, data$LENGTH, paired=FALSE)

t.test(data$WEIGHT ~ data$SEX)

wilcox.test(data$WEIGHT, data$LENGTH, paired=TRUE)

oneway.test(data$WEIGHT ~ data$COLOR)

pairwise.t.test(data$WEIGHT, data$COLOR, p.adj="bonferroni")

kruskal.test(data$WEIGHT ~ data$COLOR)

pairwise.wilcox.test(data$WEIGHT, data$COLOR)

chisq.test(data$COLOR, data$SEX)

prop.test(c(sum(data$SEX)), c(length(data$SEX)), 0.5)

cor.test(data$WEIGHT, data$LENGTH, method="pearson")

cor.test(data$WEIGHT, data$LENGTH, method="spearman")

summary(lm(data$LENGTH ~ data$SEX))

anova(lm(data$LENGTH ~ data$SEX))

# \section{Finishing...}

# \chapter{\R fragments}

# \section{\R and databases}

Recode <- function(var, from, to)
{
x <- as.vector(var)
x.tmp <- x
for (i in 1:length(from)) {x <- replace(x, x.tmp == from[i],
to[i])}
if(is.factor(var)) factor(x) else x
}

Recode4 <- function(var, from, to, missed="")
{
ifelse(Recode(var, from, to) == var,
missed,
Recode(var, from, to))
}

replace(rep(1:10,2), c(2,3,5), c("a","b","c"))

Recode(rep(1:10,2), c(2,3,5), c("a","b","c"))

locations <- read.table("data/eq_l.txt", h=T, sep=";")

measurements <- read.table("data/eq_s.txt", h=T, sep=";")

head(locations)

head(measurements)

loc.N.POP <- Recode(measurements$N.POP, locations$N.POP,
as.character(locations$SPECIES))

head(cbind(species=loc.N.POP, measurements))

m <- c("Plantago major", "Plantago lanceolata",
"Littorella uniflora")

do.call(rbind, strsplit(m, split=" ")) # one space inside quotes

m.o

model.matrix( ~ m.o - 1, data=data.frame(m.o))

sapply(levels(m.o),
function(.x) {d <- rep(0, length(m.o)); d[m.o==.x] <- 1; d})

# \section{\R and time}

dates.df <- data.frame(dates=c("2011-01-01","2011-01-02",
"2011-01-03","2011-01-04","2011-01-05"))

str(dates.df$dates)

dates.1 <- as.Date(dates.df$dates, "%Y-%m-%d")

str(dates.1)

d <- c(20130708, 19990203, 17650101)

as.Date(as.character(d), "%Y%m%d")

ts(1:10, # sequence
frequency = 4, # by quartile
start = c(1959, 2)) # start in the second quartile 1959

z <- ts(matrix(rnorm(300), 100, 3),
start=c(1961, 1), # start in January 1961
frequency=12) # by months

pdf(file="pics/32340.pdf"); oldpar <- par(mar=c(2,2,0,1))

plot(z,
plot.type="single", # place all types on one plot
lty=1:3)

par(oldpar); dev.off()

leaf <- read.table("data/sundew.txt", head=TRUE,
as.is=TRUE, sep=";")

str(leaf)

summary(leaf)

shape <- ts(leaf$SHAPE, frequency=36)

str(shape)

pdf(file="pics/33160.pdf"); oldpar <- par(mar=c(4,4,0,1))

(acf(shape, main=""))

par(oldpar); dev.off()

pdf(file="pics/33310.pdf"); oldpar <- par(mar=c(2,2,0,1))

plot(stl(shape, s.window="periodic")$time.series, main="")

par(oldpar); dev.off()

# \section{\R and bootstrap}

library(boot)

# Statistic to be bootstrapped:

ave.b <- function (data, indices)
{
d <- data[indices]
return(mean(d))
}

(result.b <- boot(data=trees$Height, statistic=ave.b, R=100))

pdf(file="pics/06074.pdf")

plot(result.b)

dev.off()

boot.ci(result.b, type="bca")

spur <- scan("data/spur.txt")

library(bootstrap)

result.b2 <- bootstrap(x=spur, 100, function(x) mean(x))

# Median of bootstrapped values:

median(result.b2$thetastar)

# Confidence interval:

quantile(result.b2$thetastar, probs=c(0.025, 0.975))

result.j <- jackknife(x=spur, function(x) mean(x))

# Median of jackknifed values:

median(result.j$jack.values)

# Standard error:

result.j$jack.se

# Confidence interval:

quantile(result.j$jack.values, probs=c(0.025, 0.975))

str(classes)

sapply(classes, function(.x) median(.x, na.rm=T))

set.seed(5998)

differences <- with(classes,
replicate(1000,
median(sample(A1, size=length(A1), replace=TRUE), na.rm=TRUE) -
median(sample(B1, size=length(B1), replace=TRUE), na.rm=TRUE)
)
)

quantile(differences, probs=c(0.025, 0.975))

# \section{Answers to exercises}

wet <- ts(leaf$WET, frequency=36)

str(wet)

acf(wet)

plot(stl(wet, s.window="periodic")$time.series)

# \chapter{Essential \R commands}

# \chapter{The short \R glossary}

pdf(file="pics/76680.pdf"); oldpar <- par(mar=c(2,2,0,1))

plot(density(rnorm(10^5)), main="", xlab="", ylab="")

par(oldpar); dev.off()

# \chapterX{References}