###  R commands for Lecture 5 on regressinon

setwd("/Users/bob/work/courses/stat910/")

#################################################
#
#  Functions
#
#################################################
	
mean.na <- function(x) { mean(x,na.rm = TRUE) }
sd.na   <- function(x) { sd  (x,na.rm = TRUE) }

lag.plot <- function(xt,lag) {
	n <- length(xt);
	plot(xt[-(n:(n-lag+1))],xt[-(1:lag)], 
	   xlab=paste("X[t-",lag,"]"), ylab="X_t")
}

generalized.difference <- function(xt, alpha) {
	n<-length(xt)
	c(NA, xt[-1]-alpha*xt[-n]) # jam a leading missing value
	}

#################################################
#
#  Examples
#
#################################################

### Global temperature since 1900

data <- scan("http://www.stat.pitt.edu/stoffer/tsa2/data/globtemp.dat")
xt   <- data[45:142]   
time <- 1900:1997

plot(time,xt, type="o")  # this type connect the dots


# Line explains 2/3 of variance (R2 = 0.65), but clearly leaves a pattern
# MIGHT this be a random walk?
regr <- lm(xt ~ time); summary(regr)
abline(regr, col="red")

# plot of regr gives variety of diagnostics
plot(regr)

# Sequence plot of the residuals
plot(time,residuals(regr),type="o"); abline(h=0, col="gray")

# Autocorrelation plot of residuals (where do horizontal lines come from?)
acf(residuals(regr), lag.max=25)
points(0:20,0.5^(0:25),col="red")

# Lag plots to make sure nothing odd (outliers, curvature)
e <- residuals(regr)
n <- length(e)
plot(e[-n],e[-1])

par(mfrow=c(2,2))
 for(i in 2:5) { lag.plot(e,i) }
par(mfrow=c(1,1))

# alternative would be to consider differencing the series (see text)
plot(time,xt, type="o")
plot(time[-1],diff(xt),type="o")
acf(diff(xt), lag.max=25)


### Temperature, particulates, and mortality
#     6-day averages, after smoothing (n = 508)

mort <- scan("http://www.stat.pitt.edu/stoffer/tsa2/data/cmort.dat")
temp <- scan("http://www.stat.pitt.edu/stoffer/tsa2/data/temp.dat")
part <- scan("http://www.stat.pitt.edu/stoffer/tsa2/data/part.dat")
time <- 1:(length(temp))

# mortality as the response, note seasonality
plot(mort, type="b",pch=20)

par(mfrow=c(3,1))
  plot(mort, type="o")
  plot(temp, type="o")
  plot(part, type="o")
par(mfrow=c(1,1))

# where's the seasonality (put time last)
pairs(cbind(mort,temp,part))  # ,time))

plot(temp, mort)
lines(lowess(temp,mort), col="red")

# vaguely quadratic pattern
ctemp <- temp-mean(temp)
ctemp2 <- ctemp^2
qregr <- lm(mort ~ ctemp + ctemp2); summary(qregr)

index <- order(temp)
lines(temp[index], fitted.values(qregr)[index], col="blue")
plot(residuals(qregr), type="l")

# add time trend as well
qregr <- lm(mort ~ time + ctemp + ctemp2 + part); summary(qregr)
plot(residuals(qregr), type="l")


# evident skewness in qq plot
plot(qregr)

# temporal dependence?  Lag enough for annual cycle
acf(residuals(qregr), lag.max=60)


plot(residuals(qregr), type="l")
for(j in seq(0,500,by=52)) abline(v=j, col="green", lty=2)

#################################################
#
#  OLS vs GLS
#
#################################################

# Generate series

n     <- 100
alpha <- 0.95
sigma <- sqrt(1/(1-alpha^2))
beta  <- 1.5
rho   <- 0.95

# simple example
filter(rep(1,10),0.5,method="recursive")

et <- as.vector(filter(rnorm(n),alpha, method="recursive"))
xt <- as.vector(filter(rnorm(n),  rho, method="recursive"))
yt <- beta*xt + et

plot(xt,yt)

# OLS regression, sans constant ...
ols.regr <- lm(yt~xt-1); summary(ols.regr)

plot(ols.regr)
plot(residuals(ols.regr), type="o");

# explain this? 
acf(residuals(ols.regr))


# Actual SE of the ols estimator (first term is close to OLS) 
sigma/sqrt(n*(1/(1-rho^2))) * sqrt((1+alpha*rho)/(1-alpha*rho))


# GLS regression, using generalized differences 
gxt <- generalized.difference(xt,rho)
gyt <- generalized.difference(yt,alpha)

gls.regr <- lm(gyt ~ gxt-1); summary(gls.regr)


# Simulate the estimators
# Work dates back to Rao&Griliches, JASA, 1969
n.reps <- 500
results <- matrix(0,nrow=n.reps, ncol=5)
colnames(results) <- c("ols","est.gls","gls","alpha","b")
for(i in 1:n.reps) {
	et <- as.vector(filter(rnorm(n),alpha, method="recursive"))
	xt <- as.vector(filter(rnorm(n),  rho, method="recursive"))
	yt <- beta*xt + et
	results[i,"ols"] <-  coefficients( ols.regr<-lm(yt~xt-1) )
	results[i,"alpha"] <- alpha.hat <- (acf(residuals(ols.regr),plot=FALSE)$acf)[2]
	gxt <- generalized.difference(xt,alpha.hat)
	gyt <- generalized.difference(yt,alpha.hat)
	results[i,"est.gls"] <- coefficients( lm(gyt ~ gxt-1) )
	gxt <- generalized.difference(xt,alpha)
	gyt <- generalized.difference(yt,alpha)
	results[i,"gls"] <- coefficients( lm(gyt ~ gxt-1) )
	newyt <- yt[-1]
	newxt <- xt[-1]
	lagyt <- yt[-n]
	lagxt <- xt[-n]
	big.regr <- lm(newyt ~ lagyt + newxt + lagxt - 1)
	results[i,"b"] <- coefficients(big.regr)[2]
}

apply(results,2,mean)
apply(results,2,sd)

boxplot(as.data.frame(results))

pairs(results)