# Script for Class 1
# Note that SS data is at www.stat.pitt.edu/stoffer/tsa2


# set working directory for convenience to place on your system

setwd("~/courses/stat910/")


### J&J Quarterly Earnings per Share ###

# read SS data on J&J quarterly earnings from SS web site
jj.raw <- scan("http://www.stat.pitt.edu/stoffer/tsa2/data/jj.dat")
plot(jj.raw)  
jj.ts <- ts(jj.raw, start=c(1960,1), frequency = 4)  # time labels, but come at cost
plot(jj.ts)
# log scale on y axis
plot(jj.ts, log="y")

# Want to fit a regression on time, but regression complicates ts objects
# Simply work with regular R series

jj      <- jj.raw
time    <- 1960+(0:83)/4
quarter <- as.factor(rep(1:4,21))  # quarter labels

plot(time,log(jj), main="Log of J&J Quarterly Earnings Per Share")

# Fit a regression on time, residuals have patterns
regr <- lm(log(jj) ~ time); summary(regr)
abline(regr, col="red")   # add to plot

plot(time, residuals(regr), col=1:4)  # color is nice for this

# Revise regression by adding quarterly indicators
regr.2 <- lm(log(jj) ~ time + quarter); summary(regr.2)
plot(time, residuals(regr.2), col=1:4)  # color is nice for this

regr.3 <- lm(log(jj) ~ time + quarter + quarter*time); summary(regr.3)
plot(time, residuals(regr.3), col=1:4)  # color is nice for this

# Q: What's happening?  Certainly not random pattern left in residuals.
# Q: What's the presence of a pattern here say about the regressions?


### Global temperature deviations ###

temp <- scan("http://stat.wharton.upenn.edu/~stine/stat910/data/nasa_global.dat")
time <- 1880 + (0:(length(temp)-1))/12
plot(time,temp, type="l", main="NASA Global Temperature Deviation",
     xlab="Year", ylab="1/100 Degree Deviation")

# plot deviations of deviations
smth <- loess(temp ~ time)
lines(time,smth$fitted, col="red")

smth.4 <- loess(temp ~ time, span=0.4)
lines(time,smth.4$fitted, col="blue")

plot(time, smth.4$residuals)


### Speech Signal ###

# Much longer, with quite different and regular repeating pattern

speech <- scan("http://www.stat.pitt.edu/stoffer/tsa2/data/speech.dat")
length(speech)
plot(speech, type="l")

# look at the autocorrelations
acf(speech)

# look at the periodogram
pg <- spec.pgram(speech, demean=TRUE)

s  <- spec.pgram(speech, taper=0.5, demean=TRUE, plot=FALSE, spans=c(25,13))
lines(s$freq, s$spec, col="red")

# periodogram of periodogram!
dev <- log(pg$spec)-log(s$spec)
plot(dev,type="l")

spec.pgram(dev, taper=0.5, demean=TRUE, plot=TRUE, spans=c(13,5))


### Southern Oscillation and Fish ###

# Arranged as two series, with 453 months in each

soi  <- scan("http://www.stat.pitt.edu/stoffer/tsa2/data/soi.dat")
fish <- scan("http://www.stat.pitt.edu/stoffer/tsa2/data/recruit.dat")

par(mfrow=c(2,1))  
  plot(soi, main="Southern Oscillation", type="l")
  plot(fish, main="Fish Recruitment", type="l")
par(mfrow=c(1,1))

# no contemporaneous correlation
plot(soi,fish)

# dependence at other lags?
plot(soi[-453], fish[-1])
plot(soi[-(453-(0:3))], fish[-(1:4)])
plot(soi[-(453-(0:7))], fish[-(1:8)])

# this function computes the correlations at varying lags: which is the leading indicator?
ccf(soi,fish)


### Brain Imaging ###

mri <- read.table("http://www.stat.pitt.edu/stoffer/tsa2/data/fmri.dat")
time <- 2*(1:128)

# these are very much in phase
 plot(time,mri[,2],type="l",col="red")
lines(time,mri[,3],type="l",col="blue")
lines(time,mri[,4],type="l",col="green")
lines(time,mri[,5],type="l",col="black")
lines(time,mri[,6],type="l",col="orange")

# these less so
 plot(time,mri[,7],type="l",col="red")
lines(time,mri[,8],type="l",col="blue")
lines(time,mri[,9],type="l",col="green")


### Earthquake or Explosion ###

# Arranged as 2048 of earthquake, then explosion (sampled at 40/second)
boom <- matrix(scan("http://www.stat.pitt.edu/stoffer/tsa2/data/eq5exp6.dat"), ncol=2)
length(boom[,1])
par(mfrow=c(2,1))  
  plot(boom[,1], main="Earthquake", type="l")
  plot(boom[,2], main="Explosion", type="l")
par(mfrow=c(1,1))


### Random Walk ###

# can be very hard to distinguish this sort of noise from results of a model
# since you only have one realization, one sample path

n <- 100;
x <- 1:n;
rw <- cumsum(rnorm(n)); plot(rw)
regr <- lm(rw ~ x)
summary(regr)

# summary adds significance
coefficients(regr)
coefficients(summary(regr))

# do a little simulation of the t-statistic of the slope
nreps <- 100;
tstats <- rep(0,nreps)
for(rep in 1:nreps) {
	rw <- cumsum(rnorm(n))
	regr <- lm(rw ~ x)
	tstats[rep] <- coefficients(summary(regr))[2,3]
}
hist(tstats)

# count how many |t| are larger than two; how many should be?
sum(abs(tstats)>2)


### Two different types of processes... both generated by the filter command:

### Linear Process ###
xt <- filter(rnorm(100), filter=c(.2,.4,.4,.4,.2), method="convolution"); plot(xt)


### Autoregression ###
xt <- filter(rnorm(100), filter=c(.2,.5,.2), method="recursive"); plot(xt)


# How would you distinguish among these?