### Analysis of US unemployment numbers

################################################################################
# From Federal Reserve Bank of St Louis (UNRATE, monthly, seasonally adjusted)
################################################################################

Unempl <- read.csv("http://www-stat.wharton.upenn.edu/~stine/stat910/data/unrate.csv")
dim(Unempl)

# --- columns are monthly date and unemployment rate (1948:1 - 2009:1)
colnames(Unempl) <- c("Month", "Unemployment rate")
time <- 1948 + (0:732)/12
xt   <- Unempl[,2]

# --- begin with the sequence plot - always
plot(time, xt, type="o")

# --- huge correlations; partial correlations end abruptly by comparison
par(mfrow=c(2,1))
  acf(xt); pacf(xt)
par(mfrow=c(1,1))

# --- compare to the differenced data; can see the effect of rounding
plot(time[-1], diff(xt), type="o")

par(mfrow=c(2,1))
  acf(diff(xt)); pacf(diff(xt))
par(mfrow=c(1,1))

# It hardly seems right to model these data with an ARIMA model; they look
# much more like discrete data than continuous data.  Let's get something
# else to work with!

################################################################################
# Aggregate weekly hours index (AWHI, monthly, seasonally adjusted, 2002=100)
################################################################################

AWHI <- read.csv("http://www-stat.wharton.upenn.edu/~stine/stat910/data/awhi.csv")
dim(AWHI)

# --- columns are monthly date and hours index (1964:1 - 2009:1)
colnames(AWHI) <- c("Month", "Hours index")
time <- 1964 + (0:540)/12
xt   <- AWHI[,2]

# --- sequence plot
plot(time, xt, type="o")

# --- clearly in need of differencing; these have been rounded as well...
#     not really continuous data?
dtime <- time[-1]
dxt   <- diff(xt)
plot(dtime, dxt, type="o")

################################################################################
# Employment, all private industries (in thous, monthly, seasonally adjusted)
################################################################################

Empl <- read.csv("http://www-stat.wharton.upenn.edu/~stine/stat910/data/unpriv.csv")
dim(Empl)

# --- columns are month and employment (1939:1 - 2009:1)
colnames(Empl) <- c("Month", "Employed")
time <- 1939 + (0:840)/12
xt   <- Empl[,2]

# --- sequence plot
plot(time, xt, type="o")

# --- clearly in need of differencing: some big outliers to watch for
dtime <- time[-1]
dxt   <- diff(xt)
plot(dtime, dxt, type="o")
hist(dxt, probability=TRUE); lines(density(dxt))
qqnorm(dxt)

# --- autocorrelations of differenced data: geometric decay
par(mfrow=c(2,1))
  acf(dxt); pacf(dxt)
par(mfrow=c(1,1))

# --- try several high-order autoregressions; let the software pick the order using AIC
ar.yw <- ar(dxt, method="yw",   order.max=25); print(ar.yw)
ar.bg <- ar(dxt, method="burg", order.max=25); print(ar.bg)
ar.ls <- ar(dxt, method="ols",  order.max=25); print(ar.ls)
ar.ml <- ar(dxt, method="mle",  order.max=25); print(ar.ml)   # takes a while

# --- all like the AR(6); yw seems most different

# --- check out the zeros of the AR polynomial
plot(polyroot(c(1,-ar.yw$ar)))
x <- seq(0,2*pi,0.01); lines(sin(x)+cos(x)*1i)     # unit circle for reference
# add the zeros from the mle, ols, and burg fits
points(polyroot(c(1,-ar.ml$ar)), col="red",   pch=6)
points(polyroot(c(1,-ar.ls$ar)), col="blue",  pch=7)
points(polyroot(c(1,-ar.bg$ar)), col="green", pch=8)

# --- fit some ARMA models (really ARIMA since the data are differenced)
m200 <- arima(dxt, order=c(2,0,0), include.mean=TRUE, method="ML"); print(m200)
tsdiag(m200)      # too much dependence left

m400 <- arima(dxt, order=c(4,0,0), include.mean=TRUE, method="ML"); print(m400)
tsdiag(m400)      # too much dependence left

m600 <- arima(dxt, order=c(6,0,0), include.mean=TRUE, method="ML"); print(m600)
tsdiag(m600)
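# --- a direct version of what tsdiag() summarizes: a Ljung-Box test on the
#     residuals (the choice of 12 lags is arbitrary; fitdf counts the 6 AR
#     coefficients), plus a quick side-by-side AIC comparison of the fits so far
Box.test(residuals(m600), lag=12, type="Ljung-Box", fitdf=6)
sapply(list(AR2=m200, AR4=m400, AR6=m600), AIC)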
# --- put the AIC for each (p,q) into a matrix (indices off by 1)
maxp <- maxq <- 8
aic.table <- matrix(NA, nrow=1+maxp, ncol=1+maxq)
rownames(aic.table) <- paste("p", 0:maxp, sep="=")
colnames(aic.table) <- paste("q", 0:maxq, sep="=")
for (p in 0:maxp) for (q in 0:maxq) {
  aic.table[1+p,1+q] <- arima(dxt, order=c(p,0,q), include.mean=TRUE, method="ML")$aic
}

# look for the most negative values
aic.table

# deep blue sea
pp <- 0:(1+maxp); qq <- 0:(1+maxq)   # one more than nrow or ncol (cell boundaries)
image(pp, qq, aic.table, col=topo.colors(50))

# better image without q=0 or p=0 ... suggests p = 5, q = 8.  Whoa!
image(pp[-1], qq[-1], aic.table[-1,-1], col=topo.colors(50))

# what do those SEs mean?
m508 <- arima(dxt, order=c(5,0,8), include.mean=TRUE, method="ML"); print(m508)

# anything special about the fit?
plot(dtime, dxt)
fit508 <- dxt - residuals(m508)
lines(dtime, fit508, col="red")
tsdiag(m508)

# fit as a regression on lagged values and lagged residuals
wt <- as.vector(residuals(m508))
my.lag <- function(x,k) c(rep(NA,k), x[-(length(x)-0:(k-1))])
LagWt  <- cbind(my.lag( wt,1), my.lag( wt,2), my.lag( wt,3))
LagDxt <- cbind(my.lag(dxt,1), my.lag(dxt,2), my.lag(dxt,3))
m11  <- lm(dxt ~ LagDxt[,1] + LagWt[,1]); summary(m11)
m101 <- arima(dxt, order=c(1,0,1), include.mean=TRUE, method="ML"); print(m101)

# predict out with standard errors
n <- length(dxt)
past <- 1:100; future <- 101:131
plot(past, dxt[n+1-rev(past)], type="o", xlim=c(0, length(past)+length(future)))
lines(past, fit508[n+1-rev(past)], col="red")

# what do you think of the predicted changes?
pred <- predict(m508, n.ahead=length(future)); names(pred)
points(future, pred$pred, col="red")
lines(future, pred$pred - 2*pred$se, lty=2, col="red")
lines(future, pred$pred + 2*pred$se, lty=2, col="red")
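# --- one way to read these forecasts on the original employment scale (a
#     sketch, not part of the fit above): refit the same orders with d = 1 on
#     the undifferenced series so that predict() returns level forecasts with
#     matching standard errors; note that arima() drops the mean when d > 0,
#     so this is close to, but not identical to, m508 plus a drift
m518 <- arima(xt, order=c(5,1,8), method="ML")
pred.level <- predict(m518, n.ahead=length(future))
nx <- length(xt)
plot(past, xt[nx+1-rev(past)], type="o", xlim=c(0, length(past)+length(future)))
points(future, pred.level$pred, col="red")
lines(future, pred.level$pred - 2*pred.level$se, lty=2, col="red")
lines(future, pred.level$pred + 2*pred.level$se, lty=2, col="red")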