### Analysis of US unemployment numbers

################################################################################
# From Federal Reserve Bank of St Louis (UNRATE, monthly, seasonally adjusted)
################################################################################

Unempl <- read.csv("http://www-stat.wharton.upenn.edu/~stine/stat910/data/unrate.csv")
dim(Unempl)

# --- columns are monthly date and unemployment rate (1948:1 - 2009:1)
colnames(Unempl) <- c("Month", "Unemployment rate")
time <- 1948 + (0:732)/12
xt   <- Unempl[,2]

# --- begin with the sequence plot - always
plot(time, xt, type="o")

# --- huge correlations; partial correlations end abruptly by comparison
par(mfrow=c(2,1))
  acf(xt); pacf(xt)
par(mfrow=c(1,1))

# --- compare to the differenced data; can see the effect of rounding
plot(time[-1], diff(xt), type="o")

par(mfrow=c(2,1))
  acf(diff(xt)); pacf(diff(xt))
par(mfrow=c(1,1))

# It hardly seems right to model these data with an ARIMA model; they look
# much more like discrete data than continuous data.  Let's get something
# else to work with!

################################################################################
# Aggregate weekly hours index (AWHI, monthly, seasonally adjusted, 2002=100)
################################################################################

AWHI <- read.csv("http://www-stat.wharton.upenn.edu/~stine/stat910/data/awhi.csv")
dim(AWHI)

# --- columns are monthly date and hours index (1964:1 - 2009:1)
colnames(AWHI) <- c("Month", "Hours index")
time <- 1964 + (0:540)/12
xt   <- AWHI[,2]

# --- sequence plot
plot(time, xt, type="o")

# --- clearly in need of differencing; these have been rounded as well...
#     not really continuous data?
dtime <- time[-1]
dxt   <- diff(xt)
plot(dtime, dxt, type="o")

################################################################################
# Employment, all private industries (in thous, monthly, seasonally adjusted)
################################################################################

Empl <- read.csv("http://www-stat.wharton.upenn.edu/~stine/stat910/data/unpriv.csv")
dim(Empl)

# --- columns are month and employment (1939:1 - 2009:1)
colnames(Empl) <- c("Month", "Employed")
time <- 1939 + (0:840)/12
xt   <- Empl[,2]

# --- sequence plot
plot(time, xt, type="o")

# --- clearly in need of differencing: some big outliers to watch for
dtime <- time[-1]
dxt   <- diff(xt)
plot(dtime, dxt, type="o")
hist(dxt, probability=TRUE); lines(density(dxt))
qqnorm(dxt)

# --- autocorrelations of differenced data: geometric decay
par(mfrow=c(2,1))
  acf(dxt); pacf(dxt)
par(mfrow=c(1,1))

# --- try several high-order autoregressions; let the software pick the order using AIC
ar.yw <- ar(dxt, method="yw",   order.max=25); print(ar.yw)
ar.bg <- ar(dxt, method="burg", order.max=25); print(ar.bg)
ar.ls <- ar(dxt, method="ols",  order.max=25); print(ar.ls)
ar.ml <- ar(dxt, method="mle",  order.max=25); print(ar.ml)   # takes a while

# --- all like the AR(6); yw seems most different

# --- check out the zeros of the AR polynomial
plot(polyroot(c(1,-ar.yw$ar)))
x <- seq(0,2*pi,0.01); lines(sin(x)+cos(x)*1i)     # unit circle for reference
# add the zeros from the mle, ols, and burg fits
points(polyroot(c(1,-ar.ml$ar)), col="red",   pch=6)
points(polyroot(c(1,-ar.ls$ar)), col="blue",  pch=7)
points(polyroot(c(1,-ar.bg$ar)), col="green", pch=8)

# --- fit some ARMA models (really ARIMA since the data are differenced)
m200 <- arima(dxt, order=c(2,0,0), include.mean=TRUE, method="ML"); print(m200)
tsdiag(m200)      # too much dependence left

m400 <- arima(dxt, order=c(4,0,0), include.mean=TRUE, method="ML"); print(m400)
tsdiag(m400)      # too much dependence left

m600 <- arima(dxt, order=c(6,0,0), include.mean=TRUE, method="ML"); print(m600)
tsdiag(m600)
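# --- a direct version of what tsdiag() summarizes: a Ljung-Box test on the
#     residuals (the choice of 12 lags is arbitrary; fitdf counts the 6 AR
#     coefficients), plus a quick side-by-side AIC comparison of the fits so far
Box.test(residuals(m600), lag=12, type="Ljung-Box", fitdf=6)
sapply(list(AR2=m200, AR4=m400, AR6=m600), AIC)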
# --- put the AIC for each (p,q) into a matrix (indices off by 1)
maxp <- maxq <- 8
aic.table <- matrix(NA, nrow=1+maxp, ncol=1+maxq)
rownames(aic.table) <- paste("p", 0:maxp, sep="=")
colnames(aic.table) <- paste("q", 0:maxq, sep="=")
for (p in 0:maxp) for (q in 0:maxq) {
  aic.table[1+p,1+q] <- arima(dxt, order=c(p,0,q), include.mean=TRUE, method="ML")$aic
}

# look for the most negative values
aic.table

# deep blue sea
pp <- 0:(1+maxp); qq <- 0:(1+maxq)   # one more than nrow or ncol (cell boundaries)
image(pp, qq, aic.table, col=topo.colors(50))

# better image without q=0 or p=0 ... suggests p = 5, q = 8.  Whoa!
image(pp[-1], qq[-1], aic.table[-1,-1], col=topo.colors(50))

# what do those SEs mean?
m508 <- arima(dxt, order=c(5,0,8), include.mean=TRUE, method="ML"); print(m508)

# anything special about the fit?
plot(dtime, dxt)
fit508 <- dxt - residuals(m508)
lines(dtime, fit508, col="red")
tsdiag(m508)

# fit as a regression on lagged values and lagged residuals
wt <- as.vector(residuals(m508))
my.lag <- function(x,k) c(rep(NA,k), x[-(length(x)-0:(k-1))])
LagWt  <- cbind(my.lag( wt,1), my.lag( wt,2), my.lag( wt,3))
LagDxt <- cbind(my.lag(dxt,1), my.lag(dxt,2), my.lag(dxt,3))
m11  <- lm(dxt ~ LagDxt[,1] + LagWt[,1]); summary(m11)
m101 <- arima(dxt, order=c(1,0,1), include.mean=TRUE, method="ML"); print(m101)

# predict out with standard errors
n <- length(dxt)
past <- 1:100; future <- 101:131
plot(past, dxt[n+1-rev(past)], type="o", xlim=c(0, length(past)+length(future)))
lines(past, fit508[n+1-rev(past)], col="red")

# what do you think of the predicted changes?
pred <- predict(m508, n.ahead=length(future)); names(pred)
points(future, pred$pred, col="red")
lines(future, pred$pred - 2*pred$se, lty=2, col="red")
lines(future, pred$pred + 2*pred$se, lty=2, col="red")
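# --- one way to read these forecasts on the original employment scale (a
#     sketch, not part of the fit above): refit the same orders with d = 1 on
#     the undifferenced series so that predict() returns level forecasts with
#     matching standard errors; note that arima() drops the mean when d > 0,
#     so this is close to, but not identical to, m508 plus a drift
m518 <- arima(xt, order=c(5,1,8), method="ML")
pred.level <- predict(m518, n.ahead=length(future))
nx <- length(xt)
plot(past, xt[nx+1-rev(past)], type="o", xlim=c(0, length(past)+length(future)))
points(future, pred.level$pred, col="red")
lines(future, pred.level$pred - 2*pred.level$se, lty=2, col="red")
lines(future, pred.level$pred + 2*pred.level$se, lty=2, col="red")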