### Text utilities ### Prepared by Robert Stine for Blalock Lectures: Text as Data ### ICPSR Summer Program, July 2017 dither <- function (x, s = 0.05) { x + s*sd(x, na.rm=T)*rnorm(length(x)) } reset <- function(lessMargin=T) { # bottom, left, top, right if (lessMargin) { mar <- c(2.5,2.5,1.5,0.5)+.1; mgp <- c(1.5,.5,0) } else { mar <- c(5,4,4,2)+.1; mgp <- c(3,1,0) } .old.par <<- par(mfrow=c(1,1), mar=mar ,mgp=mgp, fg="black",bg="white",col="black", col.main="black", col.sub="black", col.lab="black", col.axis="black"); palette("default"); } words_from_string <- function (text) { return( unlist(strsplit(prepare_text(text),"\\s+")) ) # gobble up double space } # input as if from a row of doc-term matrix; only adjusts p0,p1,p2 good_turing_probabilities <- function(counts) { N <- sum(counts) N0 <- length(i0 <- which(counts==0)) # number of types, not tokens N1 <- length(i1 <- which(counts==1)) N2 <- length(i2 <- which(counts==2)) N3 <- sum(counts==3) if(0==N0 || 0==N1 || 0==N2 || 0==N3) stop("Some type count is zero.") p0 <- (N1/N) /N0 p1 <- (2*N2/N)/N1 p2 <- (3*N3/N)/N2 prob <- counts/N prob[i0] <- p0 prob[i1] <- p1 prob[i2] <- p2 return (prob/sum(prob)) } zipf_plot <- function(freq, n.label=5, n.fit=200) { # Returns log-log regression freq <- sort(freq, decreasing=T) x<-1:length(freq); y<-freq df <- data.frame(list(x=x,y=y,lx=log(x),ly=log(y))) plot(y~x, xlab="Rank", ylab="Frequency", log="xy", data=df) common <- names(freq)[1:n.label] text(0.9*x[1:n.label],0.7*y[1:n.label],common,cex=0.75) regr<-lm(ly~lx, data=df[1:min(n.fit, nrow(df)),]); lx <- log(x<-c(1,5000)); y <- exp(predict(regr, data.frame(lx=lx))) lines(x,y,col="red") return(regr) } ### fix word names... must be better way to do with a table rename_punctuation <- function(str) { require(stringr) # s <- gsub("([\\])","_bslash", str) # yuck! do this one first s <- str_replace(str, "_,_", "comma_") # extra _ in case you have "comma" s <- str_replace(s , "_;_", "semi_") s <- str_replace(s , "_:_", "colon_") s <- str_replace(s , "_-_", "dash_") s <- str_replace(s , "_!_", "exclam_") s <- str_replace(s , "_/_", "slash_") s <- str_replace(s , "_§_", "section_") s <- str_replace(s , "_&_", "amp_") s <- str_replace(s , "_´_", "bquote_") s <- str_replace(s , "_¨_", "snake_") s <- str_replace(s , "_©_", "cw_") s <- str_replace(s , "_±_", "pm_") s <- str_replace(s , "_=_", "equal_") s <- str_replace(s , "_%_", "pct_") s <- str_replace(s , "_\\$_", "dollar_") s <- str_replace(s , "_\\+_", "plus_") # note escape char s <- str_replace(s , "_\\._", "period_") s <- str_replace(s , "_\\(_", "lparen_") s <- str_replace(s , "_\\)_", "rparen_") s <- str_replace(s , "_\\?_", "quest_") s <- str_replace(s , "_\\'_", "quote_") s <- str_replace(s , "_\"_", "dquote_") s <- str_replace(s , "_\\[_", "lbrac_") return (s) } coef_cloud <- function(coefs) { require(wordcloud) v <- as.vector(coefs)[-1] # no intercept nm <- (attributes(coefs)$Dimnames[[1]])[-1] par(mfrow=c(1,2)) neg <- v<0 wordcloud(nm[ neg], v[ neg]/max(v[ neg]) * 100, colors= 'red', random.color=F) # brewer colors RdBu, Dark2 wordcloud(nm[!neg], v[!neg]/max(v[!neg]) * 100, colors='blue', random.color=F) # wordcloud(nm[-1], abs(v[-1])/max(abs(v[-1])) * 100, colors=brewer.pal(9,"RdBu"), random.color=F) par(mfrow=c(1,1)) } ### Confusion matrix confusion_summary <- function(truth, pred) { # assume inputs are logical sens <- recall <- sum(pred & truth)/sum(truth) spec <- sum(!pred & !truth)/sum(!truth) prec <- sum(pred & truth)/ sum(pred) mclass <- sum(truth!=pred)/length(truth) z<-c(sens, spec, recall, prec, fm=2*prec*recall/(prec+recall), mclass) names(z) <- c("sens", "spec", "recall", "prec", "fm", "mclass") return(z) } ### Spectral methods random_projection_svd <- function (X, d, power.iter=1, verbose=F) { # matrix X, recover d dimensions n <- nrow(X); k <- ncol(X) omega <- matrix (rnorm (k*d), nrow=k, ncol=d) P <- X %*% omega if (power.iter >= 1) { if(verbose) cat("Starting power iterations...\n") for(i in 1:power.iter) { P <- X %*% (t(X) %*% P) if(verbose) cat("Completed power iteration...\n") } } Q <- qr.Q(qr(P)) if (verbose) cat("Completed QR factorization...\n") udv <- svd(t(Q)%*%X) udv$u <- (Q %*% udv$u) return (udv) } plot_loadings <- function(v,j1,j2,threshold=0.1, cex=0.7) { # labeled plots of loadings plot(v[,j1], v[,j2], col='gray', xlab=substitute("V"[j],list(j=j1)), ylab=substitute("V"[j], list(j=j2))) label <- threshold < sqrt(v[,j1]^2 + v[,j2]^2) if(is.null(rownames(v))) cat("Add row labels to input matrix.\n") text(v[label,j1], v[label,j2], rownames(v)[label], cex=cex, srt=45) } ### For simulating data with PCA patterns make_hidden_pca_data <- function (n) { e <- matrix(rnorm(n*25), nrow=n) u <- rnorm(n) y <- u^2 + rnorm(n) x <- e + .35*u # plot(x[,3],y) # plot(u,y) return(list(x=x,y=y)) } make_pca_data <- function(n, m, k=2, sd.signal=1) { e <- matrix(rnorm(n*m), nrow=n) u <- matrix(rnorm(n*k), nrow=n) vt <- matrix(rnorm(k*m), nrow=k) d <- rnorm(k) * sd.signal return(e + u %*% (d * vt)) } ### For topic models rdirichlet <- function(a, n=NULL) { if (is.null(n)) y <- rgamma(length(a), a, 1) else y <- rgamma(n, a, 1) return(y / sum(y)) } entropy <- function (p) { p <- p[p>0] p <- p/sum(p) return(-sum(p*log(p))) }