# Functions for lab 9 # by Tom Minka sort.levels <- function(f,x,fun=median) { if(length(x) == length(f)) x <- tapply(x,f,fun) if(length(x) != length(levels(f))) stop("wrong number of values to sort by") factor(f,levels=levels(f)[order(x)]) } update.default <- function (object, formula., ..., evaluate = TRUE) { call <- object$call if (is.null(call)) stop("need an object with call component") extras <- match.call(expand.dots = FALSE)$... if (!missing(formula.)) call$formula <- update.formula(formula(object), formula.) if(length(extras) > 0) { existing <- !is.na(match(names(extras), names(call))) ## do these individually to allow NULL to remove entries. for (a in names(extras)[existing]) call[[a]] <- extras[[a]] if(any(!existing)) { call <- c(as.list(call), extras[!existing]) call <- as.call(call) } } if(evaluate) { # minka: use environment of formula instead of parent.frame env<-environment(call$formula) if (is.null(env)) env<-parent.frame() eval(call,env) } else call } ############################################################################# # returns a model frame where the responses have been replaced by their # residuals under the fit residual.frame <- function(object,data) { if(missing(data)) { x <- model.frame(object) resp <- response.var(x) if(inherits(object,"glm")) { p <- object$fitted.value x[[resp]] <- (object$y - p)/p/(1-p) } else x[[resp]] <- residuals(object) } else { x <- data resp <- response.var(object) # only for lm, not glm if(inherits(object,"glm")) stop("can't compute residuals for glm") # expand p to include NAs p <- predict(object,x)[rownames(x)] names(p) <- NULL x[[resp]] <- x[[resp]] - p #attr(x,"terms") <- terms(object) } x } partial.frame <- function(object,data) { if(F) { if(missing(data)) { data.frame(residuals(object,type="partial")) } else { data <- as.matrix(data) pred <- predictor.vars(object) x <- rep.col(residuals(object),ncol(data)) colnames(x) <- colnames(data) x[,pred] <- residuals(object,type="partial") as.data.frame(x) } } else { if(!inherits(object,"lm")) stop("can't compute partial residuals") if(missing(data)) data <- model.frame(object) resp <- response.var(object) pred <- predictor.vars(object) x <- rep.col(residuals(object),ncol(data)) colnames(x) <- colnames(data) for(i in 1:length(pred)) { not.i <- pred[-i] if(F) { fmla <- as.formula(paste(resp,"~",paste(not.i,collapse="+"))) fit <- lm(fmla,data) } else if(T) { # this is more correct than above fit <- update(object,paste(".~. -",pred[i])) } else { data.mat <- as.matrix(data) fit <- lm.fit(data.mat[,not.i,drop=F],data.mat[,resp]) } #cat(pred[i],coef(fit),"\n") x[,pred[i]] <- residuals(fit) } as.data.frame(x) } } auto.legend <- function(labels, col=1, lty, pch, text.col, ...) { usr <- par("usr") top <- usr[4] inch.x <- diff(par("usr")[1:2])/par("pin")[1] inch.y <- diff(par("usr")[3:4])/par("pin")[2] left <- usr[2]-max(strwidth(labels))-0.04*inch.x y <- cumsum(strheight(labels)+0.04*inch.y) if(missing(lty) && missing(pch)) { text.only <- T if(missing(text.col)) text.col <- col } for(i in 1:length(labels)) { text(left,top-y[i],labels[i],col=text.col[i],adj=0,...) } } ############################################################################## predict.plot.data.frame <- function(x,given,given.lab,given.groups=4, layout,partial,type=c("prob","logit","probit"), highlight, identify.pred=F,scol="red",slwd=1, span=2/3,mar=NA,...) { type <- match.arg(type) resp <- response.var(x) pred <- predictor.vars(x) if(!missing(given) && !is.factor(given)) { # make given a factor b <- as.numeric(quantile(unique(given),seq(0,1,length=given.groups+1))) given <- rcut(given,b) } if(missing(layout)) layout <- auto.layout(length(pred) + !missing(given)) opar <- par(mfrow=layout) if(!is.null(mar)) { if(is.na(mar)) mar <- c(4.5,4,0,0.1) opar <- c(opar,par(mar=mar)) } on.exit(par(opar)) if(!missing(partial)) { pres <- partial.frame(partial,x) if(missing(highlight)) highlight <- predictor.vars(partial) } else if(missing(highlight)) highlight <- NULL for(i in pred) { if(!missing(partial)) { x[[resp]] <- pres[[make.names(i)]] if(is.null(x[[resp]])) stop(paste("partial of",i,"not found")) } if(is.factor(x[[i]]) && !is.ordered(x[[i]])) { x[[i]] <- sort.levels(x[[i]],as.numeric(x[[resp]])) } # plot points if(!missing(given) && is.factor(x[[i]])) { # given and factor plot.new() xlim <- length(levels(x[[i]])) plot.window(xlim=c(0.5,xlim+0.5),ylim=range(x[[resp]])) axis(1,1:xlim,labels=levels(x[[i]])) axis(2) box() title(xlab=i, ylab=resp) cat(paste("jittering",i,"\n")) } else { if(is.factor(x[[resp]])) { # no given, factor if(type=="prob") { if(is.factor(x[[i]])) { mosaicplot(table(x[[i]], x[[resp]]), xlab=i, ylab=resp) } else { plot(x[[i]], x[[resp]], xlab=i, ylab=resp, ...) } } } else { # no given, numeric plot(x[[i]], x[[resp]], xlab="", ylab=resp, ...) if(i %in% highlight) title(xlab=i,font.lab=2) else title(xlab=i) if(!missing(partial)) { a <- coef(partial) #if(!is.na(a[i])) abline(0,a[i],col=3,lty=2) } } } # smooth curve k <- !is.na(x[,i]) & !is.na(x[[resp]]) if(missing(given) && !is.na(scol)) { if(is.factor(x[[resp]])) { if(length(levels(x[[resp]]))==2 && !is.factor(x[[i]])) { if(type=="prob") { lines(loprob(x[k,i], x[k,resp]), col=scol) } else { xy <- loprob(x[k,i], x[k,resp]) p <- xy$y-1 p <- switch(type,logit=log(p/(1-p)),probit=qnorm(p)) xy$y <- p+1.5 plot(xy,col=scol,type="l",xlab=i,ylab=type) points(x[[i]],2*as.numeric(x[[resp]])-3) } } } else { lines(lowess(x[k,i], x[k,resp], f=span),col=scol,lwd=slwd) } } # identify if((identify.pred == T) || (i %in% identify.pred)) { identify(x[k,i],x[k,resp],labels=rownames(x)[k]) } # condition on the given variable if(!missing(given)) { lev <- levels(given) for(g in 1:length(lev)) { color <- ((g-1) %% 6) + 1 val <- lev[g] k <- (given == val) if(is.factor(x[[i]])) { jitter <- (runif(length(x[k,i]))-0.5)/5 points(as.numeric(x[k,i])+jitter, x[k,resp], col=color, ...) } else { points(x[k,i], x[k,resp], col=color, ...) } if(is.factor(x[[resp]])) { lines(loprob(x[k,i], x[k,resp]),col=color) } else { lines(lowess(x[k,i], x[k,resp], f=span),col=color) #abline(lm(x[k,resp]~x[k,i]),col=color) } } } } # show legend for the given variable, as a separate panel # this is better than putting a color key on each plot if(!missing(given)) { # legend plot.new() if(!missing(given.lab)) title(xlab=given.lab) y <- cumsum(strheight(lev)+0.02) for(i in 1:length(lev)) { color <- ((i-1) %% 6) + 1 val <- lev[i] text(0.5,0.75-y[i],val,col=color,adj=0.5) } } } predict.plot.lm <- function(object,data,partial=F,...) { if(!partial) { if(missing(data)) { res <- residual.frame(object) } else { res <- residual.frame(object,data) } if(F) { expr <- match.call(expand = F) expr$... <- NULL expr[[1]] <- as.name("residual.frame") res <- eval(expr, parent.frame()) } cat("plotting residuals\n") predict.plot.data.frame(res,highlight=predictor.vars(object),...) } else { if(missing(data)) data <- model.frame(object) cat("plotting partial residuals\n") predict.plot.data.frame(data,partial=object,...) } } predict.plot.formula <- function(formula,data=parent.frame(),...) { # formula has givens? rhs <- formula[[3]] if(is.call(rhs) && (deparse(rhs[[1]]) == "|")) { # remove givens from formula given <- deparse(rhs[[3]]) formula[[3]] <- rhs[[2]] if(is.environment(data)) g <- get(given,env=data) else g <- data[[given]] if(is.null(g)) stop(paste("variable \"",given,"\" not found",sep="")) return(predict.plot.formula(formula,data, given=g,given.lab=given,...)) } if(F) { expr <- match.call(expand = F) expr$... <- NULL expr$na.action <- na.pass expr[[1]] <- as.name("model.frame.default") x <- eval(expr, parent.frame()) } else { # formula has its own environment x <- model.frame.default(formula,data,na.action=na.pass) } predict.plot.data.frame(x,...) } predict.plot <- function(object, ...) UseMethod("predict.plot") step.up <- function(object) { resp <- response.var(object) pred <- predictor.vars(object) scope <- terms(formula(paste(resp,"~",paste(pred,collapse="*")))) step(object,scope) } formals(predict.plot.data.frame)$slwd <- 2 # Functions for lab 5 # by Tom Minka rep.mat <- function(x,n,m) { array(rep(x,n*m),c(length(x)*n,m)) } rep.col <- function(x,n) { rep.mat(x,1,n) } rep.row <- function(x,n) { t(rep.col(x,n)) } terms.data.frame <- function(x,env=parent.frame(),...) { fmla <- attr(x,"terms") if(is.null(fmla)) { # minka: assume the last column is the response nm <- make.names(names(x)) if(length(nm) > 1) { lhs <- nm[length(nm)] rhs <- nm[-length(nm)] } else { lhs <- NULL rhs <- nm } fmla <- terms(formula(paste(lhs,"~",paste(rhs,collapse="+")),env=env,...)) } fmla } formula.data.frame <- function(x,env=parent.frame(),...) { formula(terms(x,env=env),...) } # returns the name of the response variable # works for formulas, model frames, and fitted models response.var <- function(object) { if(inherits(object, "terms")) { a <- attributes(object) if(!a$response) return(character(0)) return(as.character(a$variables[2])) } if(inherits(object,"data.frame") && is.null(attr(object,"terms"))) { # shortcut to avoid make.names return(names(object)[length(object)]) } response.var(terms(object)) } # returns only the predictors which appear as bare terms predictor.vars <- function(object) { if(inherits(object, "terms")) { a <- attributes(object) return(a$term.labels[a$order == 1]) } if(inherits(object,"data.frame") && is.null(attr(object,"terms"))) { # shortcut to avoid make.names return(names(object)[-length(object)]) } predictor.vars(terms(object)) } ############################################################################## auto.layout <- function(len) { layout <- c(1,len) if(len > 3) { din <- par("din") asp <- din[2]/din[1] layout[2] <- ceiling(sqrt(len/asp)) layout[1] <- ceiling(len/layout[2]) } layout } hist.data.frame <- function(x,layout,...) { if(missing(layout)) layout <- auto.layout(length(x)) opar <- par(mfrow=layout) on.exit(par(opar)) for(i in names(x)) { hist(x[[i]],xlab=i,main="",...) } } scale.data.frame <- function(x,...) { i <- sapply(x,is.numeric) x[i] <- data.frame(scale(data.matrix(x[i]),...)) x } project <- function(x,w) { pred <- rownames(w) xw <- data.frame(data.matrix(x[pred]) %*% w) other <- setdiff(colnames(x),pred) if(length(other) > 0) cbind(xw, x[other]) else xw } pca <- function(x,k=1) { x <- data.matrix(x[sapply(x,is.numeric)]) s <- svd(x) cat("R-squared =",format(sum(s$d[1:k]^2)/sum(s$d^2)),"\n") w <- s$v[,1:k] if(k == 1) dim(w) <- c(length(w),1) rownames(w) <- colnames(x) colnames(w) <- paste("h",1:ncol(w),sep="") w } solve.chol <- function(a,b) { ch <- chol(a) backsolve(ch,forwardsolve(t(ch),b)) } pca2 <- function(x,k=1) { # PCA vis Roweis's EM algorithm # takes O(dnk) time x <- data.matrix(x[sapply(x,is.numeric)]) x <- t(x) d <- nrow(x) n <- ncol(x) w <- array(rnorm(d*k),c(d,k)) # EM for(iter in 1:100) { old.w <- w if(d >= n) { # h is k by n # flops is kdk + kdn + kkn + dnk + knk + kkn = O(dnk) h <- solve.chol(t(w) %*% w, t(w) %*% x) w <- x %*% t(solve.chol(h %*% t(h), h)) } else { # flops is kdk + kdn + kkd + knd + knk + kkd = O(dnk) h <- solve.chol(t(w) %*% w, t(w)) %*% x w <- t(solve.chol(h %*% t(h), h %*% t(x))) } if(max(abs(w - old.w)) < 1e-5) break } if(iter == 100) warning("not enough iterations") # postprocessing w <- qr.Q(qr(w)) wx <- t(w) %*% x s <- svd(wx) cat("R-squared =",format(sum(s$d^2)/sum(x*x)),"\n") w <- w %*% s$u rownames(w) <- rownames(x) colnames(w) <- paste("h",1:ncol(w),sep="") w } plot.axes <- function(w,col=2,origin,keep=NULL,cex=par("cex")) { if(is.null(keep)) keep <- 0.2 usr <- par("usr") # is the origin in the plot? if(missing(origin)) { origin <- (usr[1] < 0) && (usr[2] > 0) && (usr[3] < 0) && (usr[4] > 0) } if(origin) m <- c(0,0) else m <- c((usr[2]+usr[1])/2, (usr[4]+usr[3])/2) width <- strwidth(rownames(w),cex=cex)/2 height <- strheight(rownames(w),cex=cex)/2 # "a" is scale factor to place text at arrow tips a <- pmin(width/abs(w[,1]), height/abs(w[,2])) # txt.w is the offset of the text from the arrow tip txt.w <- w * rep.col(a,2) xlim <- usr[1:2]-m[1] # make room on left xlim[1] <- xlim[1] + diff(xlim)/par("pin")[1]*0.02 xscale <- c() # find xscale so that # xscale*w[,1] + txt.w[,1] - width > xlim[1] (w < 0) # xscale*w[,1] + txt.w[,1] + width < xlim[2] (w > 0) i <- (w[,1] < 0) xscale[1] <- min((xlim[1] + width[i] - txt.w[i,1])/w[i,1]) i <- (w[,1] > 0) xscale[2] <- min((xlim[2] - width[i] - txt.w[i,1])/w[i,1]) # if one of the sets is empty, the scale will be NA xscale <- min(xscale,na.rm=T) ylim <- usr[3:4]-m[2] # make room for color key ylim[2] <- ylim[2] - diff(ylim)/par("pin")[2]*0.06 yscale <- c() # find yscale so that # yscale*w[,2] + txt.w[,2] - height > ylim[1] (w < 0) i <- (w[,2] < 0) yscale[1] <- min((ylim[1] + height[i] - txt.w[i,2])/w[i,2]) i <- (w[,2] > 0) yscale[2] <- min((ylim[2] - height[i] - txt.w[i,2])/w[i,2]) yscale <- min(yscale,na.rm=T) # scale equally xscale <- min(c(xscale,yscale)) yscale <- xscale w <- scale(w,center=F,scale=1/c(xscale,yscale)) if(T) { # only plot arrows of significant length i <- (abs(w[,1])/(xlim[2]-xlim[1])*par("fin")[1] > keep) | (abs(w[,2])/(ylim[2]-ylim[1])*par("fin")[2] > keep) if(any(!i)) cat("omitting arrow for",rownames(w)[!i],"\n") } if(!any(i)) { warning("all arrows omitted"); return() } len <- min(par("pin"))/20 arrows(m[1], m[2], m[1]+w[i,1], m[2]+w[i,2], col=col, len=len) w <- scale(w,center=-m,scale=F) w <- w + txt.w # note: rownames(w[i,]) does not work if i picks only one text(w[i,1],w[i,2],labels=rownames(w)[i],col=col,cex=cex) } ############################################################################## # returns the minimum user limits for a plotting region of size pin # which would enclose the given labels at the given positions # with the justification adj at rotation srt. range.text <- function(x,y,labels,cex=NULL,adj=NULL,srt=0, pin=par("pin")) { lab.w <- strwidth(labels,"inches",cex=cex) lab.h <- strheight(labels,"inches",cex=cex) # bounding box of text, columns are (x1,y1)(x2,y1)(x2,y2)(x1,y2) n <- length(labels) ix <- seq(1,8,by=2) iy <- seq(2,8,by=2) hull <- array(0,c(n,8)) hull[,c(1,7,2,4)] <- rep(0,n) hull[,c(3,5,6,8)] <- rep(1,n) # put adj point at origin and scale if(is.null(adj)) adj <- c(0.5,0.5) if(length(adj) == 1) adj <- c(adj,0.5) hull[,ix] <- (hull[,ix] - adj[1])*rep.col(lab.w,4) hull[,iy] <- (hull[,iy] - adj[2])*rep.col(lab.h,4) # rotate srt <- srt/180*pi cr <- cos(srt) sr <- sin(srt) R <- array(c(cr,-sr,sr,cr),c(2,2)) R <- diag(4) %x% R hull <- hull %*% R bbox <- list() bbox$left <- apply(hull[,ix],1,min)/pin[1] bbox$right <- apply(hull[,ix],1,max)/pin[1] bbox$bottom <- apply(hull[,iy],1,min)/pin[2] bbox$top <- apply(hull[,iy],1,max)/pin[2] # solve constraints xmid <- mean(range(x)) ymid <- mean(range(y)) xlim <- c() # these come from the constraints # (x-xmin)/(xmax-xmin) + left > 0 # (x-xmin)/(xmax-xmin) + right < 1 # where xmax is fixed at 2*xmid - xmin min1 <- min((x + 2*bbox$left*xmid)/(1+2*bbox$left)) min2 <- min((2*(1-bbox$right)*xmid - x)/(1-2*bbox$right)) xlim[1] <- min(min1,min2) xlim[2] <- 2*xmid - xlim[1] ylim <- c() min1 <- min((y + 2*bbox$bottom*ymid)/(1+2*bbox$bottom)) min2 <- min((2*(1-bbox$top)*ymid - y)/(1-2*bbox$top)) ylim[1] <- min(min1,min2) ylim[2] <- 2*ymid - ylim[1] c(xlim,ylim) } text.plot <- function(object,...) UseMethod("text.plot") text.plot.formula <- function(formula,data=parent.frame(),...) { x <- model.frame.default(formula,data,na.action=na.pass) text.plot.data.frame(x,...) } text.plot.data.frame <- function(x,...) { pred <- predictor.vars(x)[1] resp <- response.var(x) x1 <- x[[pred]] x2 <- x[[resp]] labels <- rownames(x) text.plot.default(x1,x2,labels,xlab=pred,ylab=resp,...) } text.plot.default <- function(x,y,labels,xlab,ylab,xlim=NULL,ylim=NULL, cex=par("cex"),adj=NULL,srt=0,axes=T,...) { if(missing(xlab)) xlab <- deparse(substitute(x)) if(missing(ylab)) ylab <- deparse(substitute(y)) plot.new() lim <- range.text(x,y,labels,cex,adj,srt) # make room for labels if(is.null(xlim)) xlim <- lim[1:2] if(is.null(ylim)) ylim <- lim[3:4] plot(x,y,xlab=xlab,ylab=ylab,xlim=xlim,ylim=ylim,axes=axes,...,type="n") text(x,y,labels=labels,cex=cex,adj=adj,srt=srt,...) }