# **************** Discrete Random Variable Computations **************** ## source("http://stat.wharton.upenn.edu/~buja/STAT-101/src-probability.R") # Remove '## ' to expose the documentation and code examples. ## # make.RV(vals, probs.or.odds): Generate a random variable ## # vals: vector of possible values ## # probs.or.odds: vector of probabilities or odds in the order of 'vals'; ## # they don't need to add up to 1 as they will be ## # normalized by the function 'probs(RV)', ## # with negative values set to zero probability; ## # they can also be specified as a vector of strings ## # that represent algebraic expressions such as "1/6". ## # Implementation: The function simply attaches 'probs.or.odds' as the ## # names() attribute of 'vals' and returns the result. ## # Thus a RV is primarily the vector of possible outcome ## # values, and the probabilities are 'decoration'. ## # The reason for making the possible values primary ## # is that when we transform RVs, we operate on the values, ## # not the probabilities; similarly, when we formulate ## # events such as 'X>0', we mean the possible values, ## # not the probabilities. ## # ## # Examples: Each example is accompanied by plot of the RV. ## #---- ## # Make a 50:50 Bernoulli random variable: ## X.Bern <- make.RV(c(1,0), c(.5,.5)) ## plot(X.Bern) ## #---- ## # Make a fair coin flip game with payoffs +$1 and -$1: ## X.fair.coin <- make.RV(c(1,-1), c(.5,.5)) ## plot(X.fair.coin) ## #---- ## # Make a biased coin flip game with odds 1:2 and with fair payoffs +$2 and -$1 ## X.biased.coin <- make.RV(c(2,-1), c(1,2)) ## plot(X.biased.coin) ## # *** The probabilities are specified as unnormalized odds c(1,2). ## # When probabilities are needed, use 'probs(RV)', which normalizes the odds. ## #---- ## X.fair.die <- make.RV(1:6, rep("1/6",6)) ## plot(X.fair.die) ## # *** Probabilities can be given as strings representing algebraic expressions; ## # they will be evaluated when actual values are needed. ## #---- ## # Make a loaded die, specifying odds 1:1:1:1:2:4 rather than probabilities: ## X.loaded.die <- make.RV(1:6, c(1,1,1,1,2,4)) ## plot(X.loaded.die) ## #---- ## # P(), E(), V(), SD(): ## # Probability of composite events, expected values, variances, standard deviations: ## P(X.fair.die>3) ## P(X.loaded.die>3) ## P(X.loaded.die==6) ## E(X.Bern) ## E(X.fair.die) ## V(X.Bern) ## SD(X.Bern) ## # probs(): The vector of all probabilities ## #---- ## X.Bern ## probs(X.Bern) ## #---- ## X.fair.die ## probs(X.fair.die) ## #---- ## X.loaded.die ## probs(X.loaded.die) ## #---- ## # The function 'probs()' evaluates the 'names()' attribute of a RV, ## # normalizes the values to sum to +1 (setting negatives to zero), ## # and makes the possible outcome values the new 'names()' attribute. ## # This function is rarely used because 'P()', 'E()', 'V()', 'SD()' ## # encapsulate the major uses of probabilities. ## # rsim(): Simulated trials (random numbers) from a discrete random variable ## X.Bern.sim100 <- rsim(100, X.Bern) ## X.Bern.sim100 ## plot(X.Bern.sim100) ## X.loaded.die.sim100 <- rsim(100, X.loaded.die) ## X.loaded.die.sim100 ## plot(X.loaded.die.sim100) ## # The function 'rsim()' attaches the probabilities as names to the random draws. ## # To get the values only, use 'strip()': ## strip(X.Bern.sim100) ## strip(X.loaded.die.sim100) ## # props(): Tabulate simulations and show PROPORTIONS, not counts. ## props(X.Bern.sim100) ## props(X.loaded.die.sim100) ## # Note: 'props()' is the analog of 'probs()', but ## # 'props()' applies to SIMULATION DATA and tabulates them, whereas ## # 'probs()' applies to RANDOM VARIABLES and lists their probabilities. ## # By the LLN the results of 'props()' will be close to 'probs()' for ## # for large simulations. ## # Recommendation: Apply 'props()' to unstripped simulation data; ## # it will preserve the order of the random variable. ## # SofI(), SofIID(): Sums of independent RVs and IID RVs, e.g. S5 = X1 + X2 + X3 + X4 + X5 ## S5 <- SofI(X.Bern, X.Bern, X.Bern, X.Bern, X.Bern) ## S5 <- SofIID(X.Bern, 5) # Same thing ## # Note: SofI() can form sums of heterogeneous but independent RVs ## # SofIID() can only do sums of IID RVs, but for larger sums ## S.mix <- SofI(X.Bern, X.fair.die) # Independent but not IID ## plot(S.mix) ## # Larger example with CLT effect ## S128 <- SofIID(X.Bern, 128) # Takes a while! ## plot(S128, pch=16, cex=.2) ## # qqnorm(): Normal quantile plot for discrete random variables ## qqnorm(S128) # Normal quantile plot of the RV S128 from above ## qqnorm(S128, cex=1, type="l", lwd=2, main="Normal Quantile Plot of S128") ## S128.sim100000 <- rsim(100000, S128) # Simulated data ## qqnorm(S128.sim100000, cex=.5) # Regular normal quantile plot of simulated data ## qqnorm(S128, add=T, col="red", pch=1, cex=1.5) # RV normal quantile plot overlaid ## #---------------- ## # Example: A simple option 'X' ## # The code shows parallel operations on the RV and simulated data. ## X <- make.RV(c(100000,10000,0), c(0.00025,0.005,0.99475)) # The payoff RV ## X.sim <- rsim(200000, X) # A sample of 200,000 from RV ## probs(X); props(X.sim) # Probabilities and observed proportions in sample ## par(mfrow=c(2,1), mar=c(3,3,2,1), mgp=c(1.8,.5,0)) ## plot(X); plot(X.sim) # Plot the RV and the sample the same way ## P(X>0); Prop(X.sim>0) # Probability and observed proportion of event ## P(X==100000); Prop(X.sim==100000) # " ## P(X==2000); Prop(X.sim==2000) # " ## E(X); mean(X.sim) # Expected value of RV and observed mean in sample ## SD(X); sd(X.sim) # SD of RV and sd of sample ## # Transformation of random variables and simulated samples: form any function f(X) and f(X.sim) ## X.10times <- 10*X # total of 10 copies of the option, or one option 10 times the size ## X.sim.10times <- 10*X.sim # simulation analog ## X.fair <- X-75 ## X.sim.fair <- X.sim-75 # simulation analog ## M128 <- SofIID(X.Bern, 128)/128 # Proportion RV of successes in 128 iid Bernoullis ## # Figure out what to expect before executing these lines of code: ## E(X.10times); E(X) ## E(X.fair); E(X) ## SD(X.10times); SD(X) ## SD(X.fair); SD(X) ## # Transformations that collapse values: contract/consolidate with 'con()' ## con(ifelse(X.loaded.die>3,1,0)) # Bernoulli RV from 'X.loaded.di'; 'con()' contracts from 6 to 2 values ## con((X.fair.die - 3.5)^2) # squaring creates identical values; 'con()' contracts identical pairs #---------------------------------------------------------------- # RANDOM VARIABLE AND PROBABILITY FUNCTIONS: # Make a random variable consisting of a list of # possible outcome values and their probabilities or odds: # - Possible outcome values = main content # - Probabilities = names attribute make.RV <- function(vals, probs.or.odds) { names(vals) <- probs.or.odds; class(vals) <- "RV"; vals } # Obtain the list of probabilities from a random variable: p(x) probs <- function(X, scipen=10, digits=22) { pr <- sapply(names(X), function(pstr) eval(parse(text=pstr))); options(scipen=scipen) names(pr) <- X pr <- pmax(pr,0) if(any(is.na(pr))) pr[is.na(pr)] <- pmax(0, (1-sum(pr[!is.na(pr)]))/sum(is.na(pr))) pr/sum(pr) } # Turn a probability vector with possible outcome values in the 'names()' attribute # into a random variable: as.RV <- function(px) { X <- as.numeric(names(px)) names(X) <- px class(X) <- "RV" X } # Strip attributes, names and return plain content: strip <- as.vector # Calculate probabilitiess of events; expects logical with names=probs: P <- function(event) { sum(probs(event)[event]) } # Expected value of a random variable: E <- function(X) { sum(X*probs(X)) } # Variance of a random variable: V <- function(X) { E((X-E(X))^2) } # Standard deviation of a random variable: SD <- function(X) { sqrt(V(X)) } # Skewness of a random variable: SKEW <- function(X) { E((X-E(X))^3)/SD(X)^3 } # Curtosis of a random variable: CURT <- function(X) { E((X-E(X))^4)/V(X)^2 } # Sum of independent random variables: arbitrary number of arguments SofI <- function(..., digits=15, scipen=10) { LIST <- list(...) S <- LIST[[1]] LIST <- LIST[-1] while(length(LIST)>0) { X <- LIST[[1]] tmp <- tapply(outer(probs(S), probs(X), FUN="*"), outer(S, X, FUN="+"), sum) S <- as.numeric(names(tmp)) options(digits=digits, scipen=scipen) names(S) <- format(tmp) LIST <- LIST[-1] } class(S) <- "RV" return(S) } SofIID <- function(X, n=2, digits=15, scipen=10) { S <- X; i <- 2 while(i<=n) { tmp <- tapply(outer(probs(S), probs(X), FUN="*"), outer(S, X, FUN="+"), sum) S <- as.numeric(names(tmp)) options(digits=digits, scipen=scipen) names(S) <- format(tmp) if(i%%100==0) cat(i,"... ") i <- i+1 }; cat("\n") class(S) <- "RV" return(S) } # Plot a random variable of class "RV" in the first argument: plot.RV <- function(X, pch=16, cex=1.2, lwd=2, col="black", stretch.x=1.2, stretch.y=1.2, xlab="Possible Values", ylab="Probabilities", xlim={ rg <- range(X); mrg <- mean(rg); mrg + (rg-mrg)*stretch.x }, ylim=c(0, max(probs(X))*stretch.y), ...) { ## args <- list(...); print(args) x <- as.numeric(X) px <- probs(X) plot(x, px, type="h", lwd=lwd, col=col, xlab=xlab, ylab=ylab, xlim=xlim, ylim=ylim, ...) abline(h=0, col="gray") points(x, px, pch=pch, cex=cex, col=col) } # Print a random variable of class "RV": avoid printing the annoying class attribute print.RV <- function(X, ...) { attributes(X)$class <- NULL print(X) } # Normal quantile plot for RVs to answer the question how close to normal it is: qqnorm.RV <- function(x, pch=16, cex=.5, add=F, xlab="Normal Quantiles", ylab="Random Variable Quantiles", ...) { x <- sort(x[probs(x)>0]) pc <- cumsum(probs(x)) if(!add) { plot(qnorm(pc), x, pch=pch, cex=cex, xlab=xlab, ylab=ylab, ...) } else { points(qnorm(pc), x, pch=pch, cex=cex, ...) } } # Contraction/consolidation of a RV that has collapsed values, # as for Bernoullis formed with 'ifelse(X>3,1,0)', for example: con <- function(rv) { probs <- tapply(probs(rv), rv, sum) make.RV(as.numeric(names(probs)), probs) } # Mixing two random variables: default is 50:50 draw from each mix <- function(X, Y, fracx=.5, fracy=.5) { names(X) <- paste(names(X),fracx,sep="*") names(Y) <- paste(names(Y),fracy,sep="*") con(c(X,Y)) } # SIMULATION AND TABULATION FUNCTIONS: # Simulate n independent trials from a random variable X: rsim <- function(n, X) { tmp <- sample(X, size=n, replace=T, prob=probs(X)) attributes(tmp)$RV <- X; class(tmp) <- "RVsim"; tmp } # Proportions of observed outcomes in one or more vectors of simulated trials: props <- function(...) { LIST <- list(...) LIST <- lapply(LIST, function(x) { RV <- attributes(x)$RV if(!is.null(RV)) { factor(x, levels=as.character(RV)) } else { x } } ) tbl <- table(RV=LIST) tbl <- tbl/sum(tbl) tbl } # Proportion of an event observed in a vector of simulated trials: Prop <- function(X.sim) { sum(X.sim)/length(X.sim) } # Otherwise use the empirical functions # mean(X.sim); var(X.sim); sd(X.sim); and the empirical skewness: skew <- function(X.sim) { mean(scale(X.sim)^3) } # Plot simulations like a random variable: plot.RVsim <- function(Xsim, ...) { X <- as.RV(props(Xsim)) plot(X, ylab="Proportions", ...) } # SAVING: # Make sure the functions are saved for next time: save.image() #----------------------------------------------------------------