# Miscellaneous functions for R
# Alexey Shipunov, dactylorhiza@gmail.com
# v. 20160401
# see also recode.r and pleiad.r

#
# UTILITIES
#

# ===

# Mark all duplicates, including the first occurrence of each duplicated value
All.Dup <- function (value) {
duplicated(value) | duplicated(value, fromLast = TRUE)
}
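
# Example of usage (kept as a comment, like the other examples in this file):
# All.Dup(c("a", "b", "b", "c", "b")) # FALSE TRUE TRUE FALSE TRUE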

# ===

# Copies a table to the clipboard; works under Linux with the "xclip" utility installed
To.Clip <- function(x, sep="\t", row.names=FALSE, col.names=TRUE, ...)
{
con <- pipe("xclip -selection clipboard -i", open="w")
write.table(x, con, sep=sep, row.names=row.names, col.names=col.names, ...)
close(con)
}
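
# Example of usage (assumes Linux with "xclip" on the PATH):
# To.Clip(iris) # then paste into a spreadsheet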

# ===

#
# Light summary of each variable
#
Nnames <- function(x) {
xdim <- dimnames(x)[[2]]
xnum <- seq(1, along=xdim)
xna <- apply(x, 2, function(x) sum(is.na(x)))
xdf <- (paste(xnum, paste(xdim, "/", xna, sep="")))
cat("Hint: var.number var.name/number.of.NA", "\n")
noquote(xdf) }
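
# Example of usage (a toy data frame with one NA per column):
# Nnames(data.frame(a=c(1, NA, 3), b=c("x", "y", NA))) # shows: 1 a/1  2 b/1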

# ===

#
# Object browser
#
# The function has "mode" and "type" arguments (type is typically, but not
# always, an object's class attribute). If the user only wants to list
# data.frames, then the command ls.objects(type="data.frame") will list only
# information for data.frames (dputler@scu.edu).
#
ls.objects <- function (pos = 1, pattern, mode = "any", type = "any"){
#
Object.Name <- ls(pos = pos, envir = as.environment(pos), pattern = pattern)
Object.Mode <- rep("", length(Object.Name))
Object.Type <- rep("", length(Object.Name))
Variables <- rep("-", length(Object.Name))
Observations <- rep("-", length(Object.Name))
for (i in 1:length(Object.Name)){
Object.Mode[[i]] <- mode(get(Object.Name[[i]]))
if(is.list(get(Object.Name[[i]]))){
if((is.null(class(get(Object.Name[[i]]))) | is.null(attributes(get(Object.Name[[i]]))$class)))
Object.Type[[i]] <- c("unknown")
else { Object.Attrib <- attributes(get(Object.Name[[i]]))
Object.Type[[i]] <- Object.Attrib$class[1] # keep the first class only, in case there are several
if(Object.Type[[i]]=="data.frame"){
Variables[[i]] <- as.character(length(Object.Attrib$names))
Observations[[i]] <- as.character(length(Object.Attrib$row.names))
}}}
if(is.matrix(get(Object.Name[[i]]))){
Object.Attrib <- dim(get(Object.Name[[i]]))
Object.Type[[i]] <- c("matrix")
Variables[[i]] <- as.character(Object.Attrib[2])
Observations[[i]] <- as.character(Object.Attrib[1])
}
if(is.vector(get(Object.Name[[i]])) && (Object.Mode[[i]]=="character" || Object.Mode[[i]]=="numeric")){
Object.Type[[i]] <- c("vector")
Variables[[i]] <- c("1")
Observations[[i]] <- as.character(length(get(Object.Name[[i]])))
}
if(is.factor(get(Object.Name[[i]]))){
Object.Type[[i]] <- c("factor")
Variables[[i]] <- c("1")
Observations[[i]] <- as.character(length(get(Object.Name[[i]])))
}
if(is.function(get(Object.Name[[i]]))) Object.Type[[i]] <- c("function")
}
objList <- data.frame(Object.Name, Object.Mode, Object.Type, Observations, Variables)
if(mode != "any") objList <- objList[objList[["Object.Mode"]] == mode, ]
if(type != "any") objList <- objList[objList[["Object.Type"]] == type, ]
return(objList)
}
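
# Example of usage:
# ls.objects() # everything in the workspace
# ls.objects(type="data.frame") # data frames only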

# ===

#
# Read a lower triangular matrix (J. Fox, 09/10/2002, R-help)
#
read.tri.nts <- function (file, ...) {
elements <- scan(file, ...)
n <- (sqrt(1 + 8 * length(elements)) - 1)/2
U <- matrix(0, n, n)
U[outer(1:n, 1:n, '<=')] <- elements
return(t(U))
}
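
# Example of usage (six numbers fill a 3 x 3 lower triangle):
# read.tri.nts(textConnection("1  2 3  4 5 6"))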

# ===

#
# Transform a character or numeric vector into a matrix
# with 0/100 (absent/present) cells, one column per unique value
#
to.bin <- function (var) {
u.var <- sort(unique(var))
mat.var <- matrix((rep(var, length(u.var)) == rep(u.var, each=length(var)))*100, ncol=length(u.var))
colnames(mat.var) <- paste(deparse(substitute(var)), u.var, sep=".")
return(mat.var)
}
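
# Example of usage:
# coast <- c("N", "S", "N", "W")
# to.bin(coast) # columns "coast.N", "coast.S", "coast.W" with 0/100 cells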

# =================================================
# =================================================
# =================================================

#
# STATISTICS
#

#
# Normality (via Shapiro-Wilk test)
#

Normality <- function(x, p=.05)
{
ifelse(shapiro.test(x)$p.value > p, "NORMAL", "NOT NORMAL")
}

normality <- Normality
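
# Example of usage:
# Normality(rnorm(100)) # most often "NORMAL"
# Normality(runif(100)) # most often "NOT NORMAL"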

# ===

#
# CORRELATION: see "pleiad" package
#

# ===

#
# "Filling plot and filling vector" (sort of species accumulation curves)
# for floristic lists (partly from E. Altshuler)
#
# Columns are species!
infill <- function(x, n=10) {
x <- as.matrix(x)
x <- x[, colSums(x)!= 0]
mat <- numeric(0)
for (j in 1:n) {
ini <- rep(0, nrow(x))
sam <- sample(1:nrow(x), nrow(x))
dat <- cbind(1:nrow(x), x[sam, ])
for (i in 2:ncol(dat)){
nums <- dat[dat[, i] > 0, 1]
ini[nums[1]] <- ini[nums[1]] + 1}
ini <- cumsum(ini)
mat <- cbind(mat, ini)}
dimnames(mat)[[2]] <- 1:n
mat <- apply(mat, 1, mean)
class(mat) <- "infill"
attr(mat, "nspecies") <- ncol(x)
attr(mat, "nperm") <- n
mat}

plot.infill <- function(x, ...) {
sp <- attr(x, "nspecies")
n <- attr(x, "nperm")
plot(unclass(x), type="l", ylab="species", xlab="sites",
sub=paste(n, "permutations"),
axes=F, ...)
abline(h=.5*sp, lty=2, col="green")
abline(h=.75*sp, lty=2, col="blue")
abline(h=.9*sp, lty=2, col="red")
axis(1, seq(0, length(x), by=1))
axis(2, seq(0, max(x)))
box()}

summary.infill <- function(x, ...) {
sp <- attr(x, "nspecies")
n <- attr(x, "nperm")
cat("One site infill:", (x[1]/sp)*100, "%", "\n")
cat("50% infill:", which(abs(x - .5*sp) == min(abs(x - .5*sp)))[1], "sites", "\n")
cat("75% infill:", which(abs(x - .75*sp) == min(abs(x - .75*sp)))[1], "sites", "\n")
cat("90% infill:", which(abs(x - .9*sp) == min(abs(x - .9*sp)))[1], "sites", "\n")
cat(sp, "species", "/", length(x), "sites", "/", n, "permutations", "\n")}
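
# Example of usage (a random 0/1 table: 20 sites as rows, 40 species as columns):
# com <- matrix(rbinom(20*40, 1, .2), nrow=20)
# com.i <- infill(com)
# plot(com.i)
# summary(com.i)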

# ===

# Compare checklists
# From Abramova et al., 2003
# Functions for checklists:
# calculate the fraction (in %) of species shared between checklists with a _common base_,
# and name the "indicator" species which influence the difference most

sravn <- function(df1, df2)
{
df1.sp <- (rowSums(df1) > 0) * 1
df2.sp <- (rowSums(df2) > 0) * 1
procent <- sum((df1.sp > 0) * (df2.sp > 0)) / sum((((df1.sp + df2.sp) > 0) * 1))
sr.proc.1 <- apply(df1 > 0, 1, function(x) round(sum(x) / ncol(df1)*100, 2))
sr.proc.2 <- apply(df2 > 0, 1, function(x) round(sum(x) / ncol(df2)*100, 2))
ind.1 <- rev(sort(sr.proc.1 - sr.proc.2))
ind.2 <- rev(sort(sr.proc.2 - sr.proc.1))
sravn.list <- list(procent=procent, ind.1=ind.1, ind.2=ind.2)
class(sravn.list) <- "sravn"
invisible(sravn.list)
}

summary.sravn <- function(sravn.list, n=10)
{
cat("Fraction of species shared between the two groups:", round(sravn.list$procent*100, 2), "%\n")
cat("======================\n")
cat("The indicator species (top ", n, ") for group 1 are:\n", sep="")
print(head(sravn.list$ind.1, n=n))
cat("======================\n")
cat("The indicator species (top ", n, ") for group 2 are:\n", sep="")
print(head(sravn.list$ind.2, n=n))
}
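
# Example of usage (two random checklists over one 50-species base;
# species are rows, sites are columns):
# base <- paste("sp", 1:50, sep="")
# list1 <- data.frame(matrix(rbinom(50*5, 1, .4), nrow=50, dimnames=list(base, NULL)))
# list2 <- data.frame(matrix(rbinom(50*5, 1, .4), nrow=50, dimnames=list(base, NULL)))
# summary(sravn(list1, list2))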

# =================================================
# =================================================
# =================================================

#
# MULTIVARIATE
#

#
# E.S.Smirnov's scaled taxonomic coefficients: see "smirnov" package
#

# ===

#
# Misclassification error (for two-dimensional tables)
#
misclass <- function(pred, obs) {
tbl <- table(pred, obs)
sums <- colSums(tbl)
dia <- diag(tbl)
msc <- (sums - dia)/sums * 100
m.m <- mean(msc)
cat("Classification table:", "\n")
print(tbl)
cat("Misclassification errors:", "\n")
print(round(msc, 1))
cat("Mean misclassification error: ", round(m.m, 1), "%", "\n", sep="")
}
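
# Example of usage (discriminant analysis of iris; needs the "MASS" package):
# library(MASS)
# iris.lda <- lda(Species ~ ., data=iris)
# misclass(predict(iris.lda)$class, iris$Species)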

# ===

# PCO scores (fractions of explained variation)
# Taken from http://ecology.msu.montana.edu/labdsv/R/lab8/lab8.html
# For cmdscale ("PCO") objects only!
pco.scores <- function(cmds) {
scores <- cmds$eig/sum(cmds$eig)
names(scores) <- seq_along(scores)
return(scores)
}
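
# Example of usage:
# eu.p <- cmdscale(eurodist, k=2, eig=TRUE)
# pco.scores(eu.p)[1:2] # fractions for the two plotted axes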

# ===

#
# For using with "vegan" library
#
text.ordiplot <- function (x, what, lbl = rownames(x), ...)
{
require(vegan)
x <- scores(x, what)
text(x, labels = lbl, ...)
invisible()
}
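
# Example of usage (assumes the "vegan" package and its "dune" dataset):
# library(vegan)
# data(dune)
# dune.o <- ordiplot(decorana(dune), type="n")
# text.ordiplot(dune.o, what="sites")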

# ===

#
# Jackknife clustering (but the cluster bootstrap in "pvclust" is better)
# TODO:
# (1) return *all* clusters from the hclust object -> jackknife for all clusters
# (2) construct a (strict) consensus tree -> tree/dendrogram with labels
#

#
j.clust <- function(data, n.cl, iter=100, d.method="euclidean", c.method="ward")
{
j.res <- matrix(rep(0, nrow(data)^2), ncol=nrow(data))
for (i in 1:iter)
{
j.data <- data[, sample(seq(1:ncol(data)), round(.8*ncol(data)))]
j.dist <- dist(j.data, method=d.method)
j.clust <- cutree(hclust(j.dist, method=c.method), k=n.cl)
j.mat <- outer(j.clust, j.clust, "==")
j.res <- j.res + j.mat
}
j.supp <- c(rep(0, n.cl))
names(j.supp) <- paste("Cluster", 1:n.cl)
j.group <- cutree(hclust(dist(j.res, method=d.method), method=c.method), k=n.cl)
for(j in 1:n.cl)
{
j.which <- which(j.group == j)
j.subset <- j.res[j.which, j.which]
j.supp[j] <- median(as.vector(j.subset), na.rm=T)/iter
}
j.clust <- list(mat=j.res, gr=j.group, supp=j.supp, iter=iter, n.cl=n.cl)
class(j.clust) <- "j.clust"
j.clust
}

print.j.clust <- function (x, ...)
{
cat("\nJacknife support for", x$n.cl, "clusters, ", x$iter, "iterations: \n")
cat("\n")
print(x$supp)
cat("\n")
invisible(x)
}
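
# Example of usage (in newer R, c.method="ward.D" avoids the renaming warning):
# iris.j <- j.clust(iris[, 1:4], n.cl=3, iter=50)
# iris.j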

# =================================================
# =================================================
# =================================================

#
# GRAPHICS
#

#
# Geometric ellipses
#

## Sundar Dorai-Raj
# begin ellipse
ellipse <- function(x, y,
width, height=width, theta=2*pi,
npoints=100, plot=T, ...) {
# x = x coordinate of center
# y = y coordinate of center
# width = length of major axis
# height = length of minor axis
# theta = rotation
# npoints = number of points to send to polygon
# plot = if TRUE, add to current device
# = if FALSE, return list of components
a <- width/2
b <- height/2
xcoord <- seq(-a, a, length=npoints)
ycoord.neg <- sqrt(b^2*(1-(xcoord)^2/a^2))
ycoord.pos <- -sqrt(b^2*(1-(xcoord)^2/a^2))
xx <- c(xcoord, xcoord[npoints:1])
yy <- c(ycoord.neg, ycoord.pos)
x.theta <- xx*cos(2*pi-theta)+yy*sin(2*pi-theta)+x
y.theta <- yy*cos(2*pi-theta)-xx*sin(2*pi-theta)+y
if(plot)
invisible(polygon(x.theta, y.theta, density=0, ...))
else
invisible(list(coords=data.frame(x=x.theta, y=y.theta),
center=c(x, y),
theta=theta))
}
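
# Example of usage:
# plot(0:10, 0:10, type="n")
# ellipse(5, 5, width=8, height=4, border=2)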

# ===

#
# Confidence ellipses and hulls
#

# Plot an ellipse with "covariance matrix" C, center b, and P-content
# level according to the F(2, df) distribution.
# Sent to S-NEWS on May 19, 1999 by Roger Koenker
# Department of Economics
# University of Illinois
# Champaign, IL 61820
# url: http://www.econ.uiuc.edu
# email roger@ysidro.econ.uiuc.edu
# vox: 217-333-4558
# fax: 217-244-6678.
#
confelli <- function(b, C, df = 1000, level = 0.95, xlab = "", ylab = "", add=T, prec=51, ...)
{
d <- sqrt(diag(C))
dfvec <- c(2, df)
phase <- acos(C[1, 2]/(d[1] * d[2]))
angles <- seq( - (pi), pi, len = prec)
mult <- sqrt(dfvec[1] * qf(level, dfvec[1], dfvec[2]))
xpts <- b[1] + d[1] * mult * cos(angles)
ypts <- b[2] + d[2] * mult * cos(angles + phase)
if(add) lines(xpts, ypts, ...)
else plot(xpts, ypts, type = "l", xlab = xlab, ylab = ylab, ...)
}
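
# Example of usage (95% ellipse for two iris variables):
# plot(iris$Sepal.Length, iris$Petal.Length)
# confelli(c(mean(iris$Sepal.Length), mean(iris$Petal.Length)),
#          cov(iris[, c("Sepal.Length", "Petal.Length")]), col=2)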

# ===

#
# Confidence ellipses
#
ellipses <- function (pts, groups, match.color=TRUE, ...)
{
out <- seq(along = groups)
inds <- names(table(groups))
for (is in inds) {
if (match.color) {m.col <- is} else {m.col <- "black"}
gr <- out[groups == is]
if (length(gr) > 1) {
X <- pts[gr, ]
confelli(apply(X, 2, median), cov(X), col=m.col, ...)
}
}
invisible()
}
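
# Example of usage (iris in PCA space; group numbers double as palette colors):
# iris.p <- prcomp(iris[, 1:4])$x[, 1:2]
# plot(iris.p, col=as.numeric(iris$Species))
# ellipses(iris.p, as.numeric(iris$Species))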

# ===

#
# Groups' hulls
#
hulls <- function (pts, groups, match.color=TRUE, plot=TRUE, ...)
{
ppts <- list()
out <- seq(along = groups)
inds <- names(table(groups))
for (is in inds) {
if (match.color) {m.col <- is} else {m.col <- "black"}
gr <- out[groups == is]
if (length(gr) > 1) {
X <- pts[gr, ]
hpts <- chull(X)
ppts[[is]] <- X[hpts, ]
hpts.l <- c(hpts, hpts[1])
if (plot) {lines(X[hpts.l, ], col=m.col, ...)}
}
}
invisible(ppts)
}
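
# Example of usage (same layout as for ellipses() above):
# iris.p <- prcomp(iris[, 1:4])$x[, 1:2]
# plot(iris.p, col=as.numeric(iris$Species))
# iris.h <- hulls(iris.p, as.numeric(iris$Species))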

# ===

#
# Polygon overlap (requires gpclib)
#
overlap <- function(ppts)
{
require(gpclib)
len <- length(ppts)
ppol <- lapply(ppts, function(x) as(x, "gpc.poly"))
over.m <- matrix(ncol=len, nrow=len)
for (i in 2:len)
{
for (j in 1:i)
{
p.i <- ppol[[i]]
p.j <- ppol[[j]]
ij <- intersect(p.i, p.j)
if (length(attr(ij, "pts")) == 0)
{
over.m[j,i] <- over.m[i,j] <- NA
}
else
{
ij.a <- area.poly(ij)
over.m[j,i] <- ij.a/area.poly(p.j)
over.m[i,j] <- ij.a/area.poly(p.i)
}
}
}
diag(over.m) <- 1
dimnames(over.m) <- list(names(ppts), names(ppts))
class(over.m) <- "overlap"
invisible(over.m)
}

summary.overlap <- function(over.m, ...)
{
pops <- sort((rowSums(over.m, na.rm=T)-1)/(ncol(over.m)-1))
pops.m <- mean(pops, na.rm=T)
cat("The mean overlap for each population is:\n")
print(matrix(round(pops*100, 2), dimnames=list(names(pops), "% overlap")))
cat("The mean overlap for all dataset is", round(pops.m*100, 2), "%\n")
}

plot.overlap <- function(over.m, ...)
{
symnum(over.m,
corr=F,
cutpoints = c(-.1, 0, .1, .2, .3, .5, 1),
symbols = c("!", "X", "*", "+", ".", " "),
lower.triangular=F)
}
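
# Example of usage (takes the list returned by hulls() above; needs "gpclib"):
# iris.o <- overlap(iris.h)
# summary(iris.o)
# plot(iris.o)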

# ===

#
# Number of cases in each point coded by point size
#

psize <- function(x, y)
{
TAB.s <- table(paste(x, y))
TAB.x <- as.numeric(unlist(strsplit(names(TAB.s), " "))[seq(1, 2*length(TAB.s), by=2)])
TAB.y <- as.numeric(unlist(strsplit(names(TAB.s), " "))[seq(2, 2*length(TAB.s), by=2)])
size <- as.numeric(cut(TAB.s, 7))
points(TAB.x, TAB.y, cex=size, pch=1)
}
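
# Example of usage (rounding produces many coincident points):
# xr <- round(rnorm(100)); yr <- round(rnorm(100))
# plot(xr, yr, type="n")
# psize(xr, yr)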

# ===

# Two y-axes (and even two x-axes, if needed) in one graph:
# modified from Jim Lemon (bitwrit@ozemail.com.au)
# R-Help Message-ID: <39BCB5C0.DBF8CDC0@ozemail.com.au> Mon 11 Sep 2000

# Example of usage:
# sh <- read.table("schr-s.dat", h=T, sep=";")

# plot2ax(x1=sh$LOWER, y1=sh$STALK, y2=sh$SPIKE, xlab="LOWER",
# ylab="STALK", y2lab="SPIKE", main="Two y-axes")

# or even:

# plot2ax(x1=sh$LOWER, x2=sh$UPPER, y1=sh$STALK, y2=sh$SPIKE,
# xlab="LOWER", ylab="STALK", y2lab="SPIKE", x2lab="UPPER",
# col=sh$BLANK)
# Note: in the case of two x-axes, "main" is not recommended

plot2ax <- function(x1, y1, xlim1, ylim1, x2, y2, xlim2, ylim2,
x2lab, y2lab, type="p", pch1=1, pch2=2, col=1, ...)
{
oldmar <- par("mar")
par(mar=c(5,4,4,4))
if(missing(xlim1)) xlim1 <- range(x1)
if(missing(ylim1)) ylim1 <- range(y1)
plot(x1, y1, xlim=xlim1, ylim=ylim1, type="n", ...)
points(x1, y1, pch=pch1, type=type, col=col)
if(!missing(y2))
{
if(missing(x2)) x2.tmp <- x1 else x2.tmp <- x2
if(missing(xlim2)) xlim2 <- range(x2.tmp)
if(missing(ylim2)) ylim2 <- range(pretty(y2))
ax2val <- pretty(y2)
xmult <- (xlim1[2] - xlim1[1])/(xlim2[2] - xlim2[1])
ymult <- (ylim1[2] - ylim1[1])/(ylim2[2] - ylim2[1])
axis(4,(ax2val-ylim2[1])*ymult+ylim1[1], labels=as.character(ax2val))
if(!missing(x2))
{
ax4val <- pretty(x2)
axis(3,(ax4val-xlim2[1])*xmult+xlim1[1], labels=as.character(ax4val))
}
points((x2.tmp-xlim2[1])*xmult+xlim1[1], (y2-ylim2[1])*ymult+ylim1[1],
pch=pch2, type=type, col=col)
if(!missing(y2lab)) mtext(y2lab,4,2)
if(!missing(x2lab)) mtext(x2lab,3,2)
}
par(mar=oldmar)
}

# ===

# Histogram with percents on the top of bars

histPercentB <- function(x, breaks, ...) {
H <- hist(x, plot = FALSE, breaks=breaks)
H$density <- with(H, 100 * density * diff(breaks)[1])
labs <- paste(round(H$density), "%", sep="")
plot(H, freq = FALSE, labels = labs, ylim=c(0, 1.08*max(H$density)), ...)
}
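
# Example of usage:
# histPercentB(rnorm(100), breaks=10)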

# ===

# Adds confidence bands to a linear model plot
#
# Usage:
# Cladd(a.lm, data=a)
# (see also the worked example below)
Cladd <- function(model, data, level=.95, lty=2, ab.lty=0, col="black", ab.col="black")
{
var <- names(model$model)[2]
sel <- data[, var]
new.var <- seq(min(sel, na.rm=T), max(sel, na.rm=T), length.out=length(sel))
new <- data.frame(new.var)
names(new)[1] <- var
pp <- predict(model, interval="confidence", level=level, newdata=new)
matlines(new.var, pp, lty=c(ab.lty,lty,lty), col=c(ab.col,col,col))
}
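
# Example of usage (built-in "women" data):
# w.lm <- lm(weight ~ height, data=women)
# plot(weight ~ height, data=women)
# abline(w.lm)
# Cladd(w.lm, data=women)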