################################################################ ################################################################ # # High-level regression tree function that als creates a tree plot. # Calls reg.tree() and pretty.tree() for the actual work. # The rest is repackaging the output of reg.tree() so it # qualifies as a tree data structure recognized by other # tree-related functions supplied by R. # # Assumption: Categorical predictors are coded with integer values. # Because in the tree plot categories are shown as letters, # the category codes are supposed to be <= length(letters). # regression.tree <- function(xy, method = "XM", minsize = tree.control(dim(xy)[1])$minsize, catego = rep(FALSE, dim(xy)[2]), title = "Regression Tree") { cat("regression.tree(): start\n") p <- dim(xy)[2] x <- xy[,-p] y <- xy[,p] nobs <- length(y) xlevels <- attr(x, "column.levels") if (is.null(xlevels)) { xlevels <- rep(list(NULL), ncol(x)) names(xlevels) <- dimnames(x)[[2]] } pfit <- reg.tree(x, y, method = method, minsize = minsize, catego = catego) nnode <- length(pfit) fit <- list(node = rep(NA, nnode), var = rep(0, nnode), cutleft = rep("", nnode), cutright = rep("", nnode), n = rep(NA, nnode), dev = rep(NA, nnode), yval = rep(NA, nnode) ) for(i in 1:nnode) { k <- pfit[[i]]$depthcount fit$node[i] <- pfit[[k]]$binnum fit$n[i] <- pfit[[k]]$nobs fit$dev[i] <- ifelse(is.null(pfit[[k]]$rss), 0, pfit[[k]]$rss) fit$yval[i] <- pfit[[k]]$mean if(!is.null(pfit[[k]]$minth)) { # not a leaf fit$var[i] <- pfit[[k]]$minj if(!catego[pfit[[k]]$minj]) { # not a categorical predictor fit$cutleft[i] <- paste("<", round(pfit[[k]]$minth,2), sep = "") fit$cutright[i] <- paste(">", round(pfit[[k]]$minth,2), sep = "") } else { # a categorical predictor maxminj <- max(pfit[[k]]$minth, pfit[[k]]$minthR) ggl <- ggr <- rep(FALSE, maxminj) ggl[pfit[[k]]$minth] <- TRUE ggr[pfit[[k]]$minthR] <- TRUE gl <- letters[1:maxminj][ggl] gr <- letters[1:maxminj][ggr] for(m in 1:maxminj) { fit$cutleft[i] <-paste(fit$cutleft[i],gl[m],sep="") fit$cutright[i] <-paste(fit$cutright[i],gr[m],sep="") } fit$cutleft[i] <-paste(":",fit$cutleft[i],sep="") fit$cutright[i] <-paste(":",fit$cutright[i],sep="") } } } frame <- data.frame(fit[c("var", "n", "dev", "yval")]) frame$var <- factor(frame$var, levels = 0:length(xlevels), labels = c("", names(x))) frame$splits <- array(data = unlist(fit[c("cutleft", "cutright")]), dim = c(nnode, 2), dimnames = list(character(0), c("cutleft", "cutright")) ) row.names(frame) <- fit$node fit <- list(frame = frame, where = attr(pfit, "where"), terms = NULL, call = match.call(), y=y, x=x) attr(fit, "xlevels") <- xlevels; names(fit$where) <- row.names(x) attr(fit, "x") <- x attr(fit, "y") <- y attr(fit, "method") <- method attr(fit, "title") <- title if(nnode > 1) class(fit) <- "tree" else class(fit) <- c("singlenode", "tree") cat("regression.tree(): done\n") return(fit) } # ################ # regression tree # 6/15/97 initialized. # 3/19/98 : Stop-splitting criterion is added. # Instead, put nodemax argument. # 1/16/98: Add Likelihood method (Method=L) # 3/3/98: Add XM,NM 3/25/98 Add XQ50, NQ50(median) again. # Remove Stopsplit. # 3/4/98: change from 7 to 22 for Boston Housing Data. Add LO, LNO # reg.tree <- function(x, y, method = "XM", minsize = tree.control(length(y))$minsize, catego = rep(FALSE, dim(as.matrix(x))[2]), nq = 20, frac = 1, control = tree.control(length(y))) { regtree <- mtree <- list() nc <- nt <- 1 mtree[[nc]] <- rep(T, length(y)) regtree[[nc]] <- list(mother = 0, binnum = 1, nobs = length(y), mean = mean(y), std = sqrt(var(y)), rss = var(y) * (length(y)-1)) while(nc <= nt && sum(mtree[[nc]]) > 0 && nt <= control$nmax) { cat("node", nc, "\n") sel <- mtree[[nc]] split <- reg.split(x,y, sel, method, nq, frac, minsize, catego) if(split$mincrit < Inf) { sell <- sel & split$splitsel selr <- sel & !sell if(sum(sell) > 1 && sum(selr) > 1) { nt <- nt + 1 mtree[[nt]] <- sell regtree[[nt]] <- list(mother = nc, binnum = regtree[[nc]]$binnum*2, nobs = sum(sell), mean = mean(y[sell]), std = sqrt(var(y[sell])), rss = var(y[sell]) * (sum(sell)-1)) regtree[[nc]]$left <- nt nt <- nt + 1 mtree[[nt]] <- selr regtree[[nt]] <- list(mother = nc, binnum = regtree[[nc]]$binnum*2 + 1, nobs = sum(selr), mean = mean(y[selr]), std = sqrt(var(y[selr])), rss = var(y[selr]) * (sum(selr)-1)) regtree[[nc]]$right <- nt regtree[[nc]]$mincrit <- split$mincrit regtree[[nc]]$minj <- split$minj regtree[[nc]]$minth <- split$minth if(catego[split$minj]) regtree[[nc]]$minthR <- split$minthR } } nc <- nc + 1 } k <- m <- 1 # provide indices for the required sort in S tree structures vtree <- rep(F, length(regtree)) # vertical tree levels for plotting? while(k <= length(regtree) && !is.null(regtree[[m]]$mother)) { # Is the second condition never F? print(paste("k,m = ",k,m)) if(vtree[m] == F) { regtree[[k]]$depthcount <- m vtree[m] <- T k <- k + 1 } if(!is.null(regtree[[m]]$left) && vtree[regtree[[m]]$left] == F) m <- regtree[[m]]$left else if(!is.null(regtree[[m]]$right) && vtree[regtree[[m]]$right] == F) m <- regtree[[m]]$right else if(!is.null(regtree[[m]]$mother)) m <- regtree[[m]]$mother else break } where <- rep(NA, length(y)) for(i in 1:length(mtree)) if(is.null(regtree[[i]]$minj)) # leaf where[mtree[[i]]] <- i attr(regtree, "where") <- where regtree } # ################ # # regsplit - regression tree spilt # 3/19/98 : stopsplit crit is added. Add Likelihood method # reg.split <- function(x,y, sel, method, nq, frac, minsize, catego) { if(sum(y) == sum(sel)) return(list(mincrit = Inf)) xs <- x[sel,] ys <- y[sel] if(length(unique(ys)) <= 1 || length(ys) < minsize) return(list(mincrit = Inf)) for(k in 1:dim(xs)[2]) if(catego[k]) { ux <- sort(unique(xs[,k])) my <- rep(NA, length(ux)) for(q in 1:length(ux)) my[q] <- mean(ys[xs[,k] == ux[q]]) fr <- rbind(ux,my) xs[,k] <- fr[2,][match(xs[,k],fr[1,])] } xqs <- apply(xs, 2, quantile, seq(1/nq, 1 - (1/nq), by = 1/nq)) mincrit <- Inf minj <- minth <- minthR <- splitsel<- NULL for(j in 1:dim(xqs)[2]) { for(i in 1:dim(xqs)[1]) { critsel <- xs[, j] <= xqs[i, j] nl <- sum(critsel) nr <- sum(!critsel) if(nl >= minsize && nr >= minsize) { critval <- reg.crit(critsel, ys, method, frac) if(critval < mincrit) { mincrit <- critval minj <- j minth <- xqs[i, j] } } } } if (mincrit < Inf) { if(catego[minj]) { selx <- x[sel,minj] minthL <- sort(unique(selx[xs[,minj] <= minth])) minthR <- sort(unique(selx[xs[,minj] > minth])) minth <- minthL splitsel <- !is.na(match(x[,minj],minth)) } else { minth <- round(minth,4) splitsel <- x[,minj] <= minth } } return(list(mincrit = mincrit, minj = minj, minth = minth, splitsel = splitsel, minthR = minthR)) } # ################ # # 6/15/97 :Method A=AIC/n , B=BIC/n, H=Heuristic # 2/24/98: O = A,B without penalized value, # i.e. only with log(sigl) and log(sigr) # 2/24/98 : Change formulas from exponential(i.e.,Liklihood) # to logarithm(i.e.-2logLikelihood) # 3/25/98 : Add XQ50, NQ50(median) again. # Add XM=max of meanL and meanR. NM= min of.. # 3/4/98 : Change 7 to 23 for Boston Housing Data. Add LO, LNO # reg.crit <- function(critsel, yy, method, frac) { nl <- sum(critsel) nr <- sum(!critsel) n <- length(yy) meanparent <- mean(yy) meanl <- mean(yy[critsel]) meanr <- mean(yy[!critsel]) sigl <- var(yy[critsel]) sigr <- var(yy[!critsel]) lqtl <- sort(yy[critsel])[ceiling(50/100 * nl)] rqtl <- sort(yy[!critsel])[ceiling(50/100 * nr)] crit <- switch(method, A = min(log(sigl) + (frac*4/nl), log(sigr) + (frac*4/nr)), B = min(log(sigl) + (frac*2*log(nl) /nl), log(sigr) + (frac*2*log(nr) /nr)), CART = (nl/n) * sigl + (nr/n) * sigr , LO = (nl/n) * log(sigl) + (nr/n) * log(sigr), O = min(log(sigl), log(sigr)), DM = -max(abs(meanl - meanparent), abs(meanr - meanparent)), XM = -max(meanl, meanr), NM = min(meanl, meanr), XQ50 = -max(lqtl, rqtl), NQ50 = min(lqtl, rqtl), stop("reg.crit(): unknown method")) return(crit) } # # end regression tree functions # # ################################################################ ################################################################ # # pretty plotting of trees # show.tree <- function(xtree, ps=NULL, digits=2, width=12, height=13, type="proportional",...) { if(!is.null(ps)) { postscript(ps, hor=T) par(pty = "m", mar = c(1,1,4,1), cex = 0.5, mex = 0.5) } else { x11(width=width, height=height) par(pty = "m", mar = c(1,1,3,1), cex = 1.0, mex = 1.0) } title <- paste(attr(xtree, "title"), ": method =", attr(xtree, "method"), ", ") plot.tree(xtree, type=type) # treepl(treeco(xtree)) text(xtree, pretty = 20) pretty.tree(xtree, title, digits=2,...) if(!is.null(ps)) { graphics.off() system(paste("gv",ps,"!")) } } pretty.tree <- function(xtree, title = "", digits = 2, ...) { if(!inherits(xtree, "tree")) stop("pretty.tree(): not legitimate tree") frame <- xtree$frame col <- names(frame) cxy <- par("cxy")[2] if(!is.null(srt <- list(...)$srt) && srt == 90) cxy <- cxy * 4 xy <- treeco(xtree) leaves <- rep(T, nrow(frame)) leaves <- frame$var == "" smry <- summary(xtree) if( is.null( attr(xtree, "ylevels") ) ) { m <- format( signif( frame$yval, digits = digits ) ) s <- format( signif( sqrt(frame$dev/frame$n), digits = digits ) ) pct <- format( round( frame$n/length(xtree$where) * 100,1 ) ) stat <- paste("m=", m, ", sz=", pct, "%", sep = "") rsq2 <- 1 - smry$dev/smry$df/var(xtree$y) titl <- paste( title, " R2 = ", round(rsq2,2), sep="" ) } else { prm <- format( signif( round(frame$yprob,2), digits=digits ) ) prv <- prm[,1] for(i in 2:dim(prm)[2]) prv <- paste(prv,":",prm[,i],sep="") pct <- format( round( frame$n/length(xtree$where)*100,1) ) stat <- paste("p=", prv, ", sz=", pct, "%", sep = "") mis <- smry$misclass err <- paste( round(mis[1]/mis[2],digits), " (", mis[1], "/", mis[2], ")", sep="" ) titl <- paste( title, " misclass. error = ", err, sep="" ) } text(xy$x[leaves], xy$y[leaves] - 1.6 * cxy, stat[leaves], ...) text(xy$x[!leaves], xy$y[!leaves] + 1.3 * cxy, stat[!leaves], ...) mtext( titl, 3, 1, cex=1.2*par()$cex ) invisible() } # ################################################################ ################################################################ # treeco <- function (tree, uniform = paste(".Tree.unif", dev.cur(), sep = ".")) { frame <- tree$frame node <- as.numeric(row.names(frame)) depth <- tree.depth(node) x <- -depth if (exists(uniform)) uniform <- get(uniform) else uniform <- 0 if (uniform) y <- x else { y <- dev <- frame$dev depth <- split(seq(node), depth) parent <- match(node%/%2, node) sibling <- match(ifelse(node%%2, node - 1, node + 1), node) for (i in depth[-1]) y[i] <- y[parent[i]] - dev[parent[i]] + dev[i] + dev[sibling[i]] } depth <- -x leaves <- frame$var == "" x[leaves] <- seq(sum(leaves)) depth <- split(seq(node)[!leaves], depth[!leaves]) left.child <- match(node * 2, node) right.child <- match(node * 2 + 1, node) for (i in rev(depth)) x[i] <- 0.5 * (x[left.child[i]] + x[right.child[i]]) list(x = x, y = y) } treepl <- function (xy, node, erase = FALSE, ...) { x <- xy$x y <- xy$y parent <- match((node%/%2), node) sibling <- match(ifelse(node%%2, node - 1, node + 1), node) xx <- rbind(x, x, x[sibling], x[sibling], NA) yy <- rbind(y, y[parent], y[parent], y[sibling], NA) if (any(erase)) { lines(c(xx[, erase]), c(yy[, erase]), col = par("bg")) return(x = x[!erase], y = y[!erase]) } plot(range(x), range(y), type = "n", axes = FALSE, xlab = "", ylab = "") text(x[1], y[1], "|", ...) lines(c(xx[, -1]), c(yy[, -1]), ...) list(x = x, y = y) } tree.control <- function (nobs, mincut = 5, minsize = 10, mindev = 0.01) { mcut <- missing(mincut) msize <- missing(minsize) if (mincut > (minsize/2)) { if (mcut) mincut <- trunc(minsize/2) else if (msize && !mcut) minsize <- 2 * mincut else if (!msize && !mcut) stop("mincut cannot be greater than minsize/2") } mincut <- max(1, mincut) minsize <- max(2, minsize) nmax <- ceiling((4 * nobs)/(minsize - 1)) list(mincut = mincut, minsize = minsize, mindev = mindev, nmax = nmax, nobs = nobs) } tree.depth <- function (nodes) { depth <- floor(log(nodes, base = 2) + 1e-07) as.vector(depth - min(depth)) } plot.tree <- function (x, y = NULL, type = c("proportional", "uniform"), ...) { if (inherits(x, "singlenode")) stop("Cannot plot singlenode tree") if (!inherits(x, "tree")) stop("Not legitimate tree") type <- match.arg(type) uniform <- type == "uniform" dev <- dev.cur() if (dev == 1) dev <- 2 assign(paste(".Tree.unif", dev, sep = "."), uniform, envir = .GlobalEnv) invisible(treepl(treeco(x), node = as.numeric(row.names(x$frame)), ...)) } text.tree <- function (x, splits = TRUE, label = "yval", all = FALSE, pretty = NULL, digits = getOption("digits") - 3, adj = par("adj"), xpd = TRUE, ...) { oldxpd <- par(xpd = xpd) on.exit(par(oldxpd)) if (inherits(x, "singlenode")) stop("Cannot plot singlenode tree") if (!inherits(x, "tree")) stop("Not legitimate tree") frame <- x$frame column <- names(frame) if (!is.null(ylevels <- attr(x, "ylevels"))) column <- c(column, ylevels) if (!is.null(label) && is.na(match(label, column))) stop("Label must be a column label of the frame component of the tree") charht <- par("cxy")[2] if (!is.null(srt <- list(...)$srt) && srt == 90) { if (missing(adj)) adj <- 0 ladj <- 1 - adj } else ladj <- adj xy <- treeco(x) if (splits) { node <- as.numeric(row.names(frame)) left.child <- match(2 * node, node) rows <- labels.tree(x, pretty = pretty)[left.child] ind <- rows != "NA" text(xy$x[ind], xy$y[ind] + 0.5 * charht, rows[ind], adj = adj, ...) } if (!is.null(label)) { leaves <- if (all) rep(TRUE, nrow(frame)) else frame$var == "" if (label == "yval" & !is.null(ylevels)) stat <- as.character(frame$yval[leaves]) else if (!is.null(ylevels) && !is.na(lev <- match(label, ylevels))) stat <- format(signif(frame$yprob[leaves, lev], digits = digits)) else stat <- format(signif(frame[leaves, label], digits = digits)) if (!is.null(dim(stat)) && dim(stat)[2] > 1) { if (length(dimnames(stat)[[2]])) stat[1, ] <- paste(sep = ":", dimnames(stat)[[2]], stat[1, ]) stat <- do.call("paste", c(list(sep = "\n"), split(stat, col(stat)))) } text(xy$x[leaves], xy$y[leaves] - 0.5 * charht, labels = stat, adj = ladj, ...) } invisible() } labels.tree <- function (object, pretty = TRUE, collapse = TRUE, ...) { if (!inherits(object, "tree")) stop("Not legitimate tree") frame <- object$frame xlevels <- attr(object, "xlevels") var <- as.character(frame$var) splits <- matrix(sub("^>", " > ", sub("^<", " < ", frame$splits)), , 2) if (!is.null(pretty)) { if (pretty) xlevels <- lapply(xlevels, abbreviate, minlength = pretty) for (i in grep("^:", splits[, 1])) for (j in 1:2) { sh <- splits[i, j] nc <- nchar(sh) sh <- substring(sh, 2:nc, 2:nc) xl <- xlevels[[var[i]]][match(sh, letters)] splits[i, j] <- paste(": ", paste(as.vector(xl), collapse = ","), sep = "") } } if (!collapse) return(array(paste(var, splits, sep = ""), dim(splits))) node <- as.numeric(row.names(frame)) parent <- match((node%/%2), node) odd <- as.logical(node%%2) node[odd] <- paste(var[parent[odd]], splits[parent[odd], 2], sep = "") node[!odd] <- paste(var[parent[!odd]], splits[parent[!odd], 1], sep = "") node[1] <- "root" node } summary.tree <- function (object, ...) { obj <- list(call = object$call) frame <- object$frame obj$type <- if (is.reg <- is.null(attr(object, "ylevels"))) "\nRegression tree:\n" else "\nClassification tree:\n" leaves <- frame$var == "" variables <- names(attr(object, "xlevels")) used <- unique(frame$var[!leaves]) if (length(used) < length(variables)) obj$used <- used obj$size <- sum(leaves) obj$df <- frame$n[1] - obj$size obj$dev <- deviance.tree(object) if (!is.reg) obj$misclass <- c(misclass.tree(object), frame$n[1]) else obj$residuals <- residuals(object) class(obj) <- "summary.tree" obj } deviance.tree <- function (object, detail = FALSE, ...) { if (!inherits(object, "tree")) stop("Not legitimate tree") frame <- object$frame if (detail) frame$dev else sum(frame$dev[frame$var == ""]) }