# ================================== STAT 926 NOTES ===================================

## Mechanics of this script:
## - Lots of R code, for you to replicate and cannibalize.
## - For syntax highlighting, the file has extension '.R'.
## - The instructor uses the Emacs editor, in which he runs R,
##   for ease of code execution.
##   You can use any environment of your choice.
##   Let the instructor know what the latest in environments is.  RStudio?
## - Much of the explanatory text is in R comments.
## - This is an evolving document.
##   Feel free to get the initial 2012 version from your older colleagues,
##   but there is no guarantee that this course will be identical
##   to 2012.  In this second run of the course, the instructor
##   hopes to cover more material than in 2012.
## - Plan: We start with some R basics,
##   then proceed with an overview of multivariate data visualization,
##   and finally we'll spend much of the semester with
##   concepts of multivariate analysis,
##   idiosyncratically selected by the instructor.
##   Suggestions for topics are welcome.

#================================================================

# R SHORTCUTS, PACKAGES, LIBRARIES AND INSTRUCTOR'S FUNCTIONS:

# Sheer laziness:
len <- length
su  <- function(x) sort(unique(x))
cn  <- colnames
rn  <- rownames
# Caution! Do you think this will work?
mat <- cbind(1:3,4:6);  cn(mat) <- c("A","B")

repos <- "http://cran.us.r-project.org/"      # For installing packages
install.packages("vcd", repos=repos)
install.packages("YaleTookit", repos=repos)   # Ooops, not available

source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/tab-all.R")
source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/mosaic.R")
source("http://stat.wharton.upenn.edu/~buja/STAT-541/plot-plus.R")
source("http://stat.wharton.upenn.edu/~buja/STAT-541/pairs-plus.R")

.First <- function() {      # Gets called on startup of R, ...
  library(MASS)             #
  library(lattice)          # ... so these libraries are ready to use.
  library(vcd)              # But you can do many other things in .First as well.
  options(width=200)
}
save.image()   # So the functions are available next time you start up R.

# Examine what packages you got:
installed.packages()

# Available packages:
pckgs.all <- available.packages("http://cran.r-project.org/bin/windows/contrib/3.2")
str(pckgs.all)   # 2012/09/11: 3956 packages;  2013/09/04: 4723;  2014/09/03: ...
grep("ace", rownames(pckgs.all), val=T)    # If you remember part but not all of a name.
grep("norm", rownames(pckgs.all), val=T)   # If you remember part but not all of a name.
install.packages("mnormt", repos=repos)
install.packages("mvtnorm", repos=repos)
install.packages("acepack", repos=repos)

# Examine what datasets you got:
data()   # Note some datasets come with packages.

# PS: Interesting recent NYTimes article
#     "For Big-Data Scientists, 'Janitor Work' Is Key Hurdle to Insights"
#     http://www.nytimes.com/2014/08/18/technology/for-big-data-scientists-hurdle-to-insights-is-janitor-work.html

#----------------------------------------------------------------

# TOY DATA:

# Laser data (courtesy of P.
Tukey, Bellcore 1987): laser <- read.table("http://www-stat.wharton.upenn.edu/~buja/DATA/laser.dat", header=T) str(laser) # laser data: 2 inputs (current 1 and 2), 2 outputs (energy, wavelength) tab.all(laser) # FTSE stock data, 1991-05-13 till 2006-06-11 (courtesy of Kartik Ghia): ftse <- read.csv("http://stat.wharton.upenn.edu/~buja/DATA/FTSE-financials.csv") str(ftse) # Autism phenotype datasets: not on webpage autism.cdv <- read.csv("http://stat.wharton.upenn.edu/~buja/DATA/proband_core_descriptive_variables.csv") str(autism.cdv) # "cdv" = "core descriptive variables" autism.cuv <- read.csv("http://stat.wharton.upenn.edu/~buja/DATA/proband_commonly_used_variables.csv") str(autism.cuv) # "cuv" = "commonly used variables" # Italian olive oil data (from various R packages): olive <- read.csv("http://www-stat.wharton.upenn.edu/~buja/DATA/olive.csv") str(olive) # olives: 8 fatty acid contents, 3 regions, 9 subregions of Italy scan("http://rss.acs.unt.edu/Rdoc/library/classifly/html/olives.html", w="", sep="\n") # docu # Places Rated data (1980s): places <- read.table("http://www-stat.wharton.upenn.edu/~buja/DATA/places.dat") scan("http://stat.wharton.upenn.edu/~buja/DATA/places.README", w="", sep="\n") # docu str(places) # Boston Housing data (Belseley-Kuh-Welsch, Regression Diagnostics): data(Boston); help(Boston) Boston <- read.table("http://www-stat.wharton.upenn.edu/~buja/DATA/boston.dat", header=T) # Geography of the census tracts: rownames(Boston) <- paste(scan("http://www-stat.wharton.upenn.edu/~buja/DATA/boston.geography", w=""), 1:nrow(Boston)) # Iris data: NOT Fisher's iris data", rather: Edgar Anderson's iris data, as reanalyzed by R.A. Fisher data(iris); help(iris) # as a dataframe data(iris3) # as a 3-d array save.image() #================================================================ BASIC R GRAPHICS: * Fundamental plots: - Univariate: . quantitative: histogram density plot (violin plots) boxplot (one box) . categorical: barplot/chart dotplot pie chart - Bivariate: . both quantitative: scatterplot . both categorical: divided or side-by-side barplot/chart mosaic plot . mixed, case 1: quantitative Y conditional on categorical X boxplot (one box per category) . mixed, case 2: categorical conditional on quantitative ???(running conditional proportions?) HW: Is there a convincing solution to the last case in the lit? - Trivariate: . heatmaps (2 X forming a rectangular grid, 1Y) . contour plots (2X, 1Y) (shows a 2-D smooth, really) . glyph (plotting character/symbol) coding of Y, plot X1,X2 . 3D rotations (is in JMP, e.g.) - Multivariate: . scatterplot matrices ('splom') . mosaic plots (multivariate) . parallel coordinate plots . lattice/trellis displays, coplots, conditional plots ==> Data visualization (as opposed to ... medical 3D vis, network vis, ...) * Adding to fundamental plots: lines() points() abline() rect() polygon() text() mtext() Extreme case of building up your own plot: plot(..., type="n") # Set up a plot panel with axes but plot nothing Then use the above functions to build up your plot. * Making up your own axis: plot(..., xaxt="n") # No x-axis axis(..., side=1) # Fill the arguments for tic locations and labels #---------------------------------------------------------------- # DEFAULTS: Much of R plots waste space with too much margin # ==> Move the axes labels closer and narrow the margin. par(mgp=c(1.8,.5,0), mar=c(3,3,2,1)) # Call everytime you create a new window !!!!!! 
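# An added minimal sketch (not from the original notes) that combines the margin defaults
# above with the 'build up your own plot' idea from the outline: set up an empty panel with
# type="n", suppress the x-axis with xaxt="n", draw decorations first, add the points,
# then hand-make the axis.  It uses the iris data loaded above; the tic locations from
# pretty() are an ad-hoc choice.
par(mgp=c(1.8,.5,0), mar=c(3,3,2,1))                 # tighter margins, as recommended
plot(iris[,"Petal.Length"], iris[,"Petal.Width"],
     type="n", xaxt="n", xlab="Petal Length", ylab="Petal Width")  # axes set up, nothing plotted
abline(h=pretty(iris[,"Petal.Width"]), col="gray")   # decorations first, so they sit behind
points(iris[,"Petal.Length"], iris[,"Petal.Width"], pch=16)        # main content on top
axis(side=1, at=pretty(iris[,"Petal.Length"]))       # hand-made x-axis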
#---------------------------------------------------------------- # TIPS DATA: # Tips data (Di Cook, Iowa Stat U): # http://stat.wharton.upenn.edu/~buja/STAT-541/tips.html tips <- read.table("http://www-stat.wharton.upenn.edu/~buja/DATA/tips.dat", header=T) # First glance: str(tips) tab.all(tips) # Draw conclusions from the displays below. The data are quite neat! # There is a 'theory of social behavior' regarding tipping that we all know. #---- 1D PLOTS # HISTOGRAMS: A curiosity about moneys: there are preferred values hist(tips[,"TIP"], col="gray") hist(tips[,"TIP"], col="gray", breaks=10000, xlab="Tip", main="Tips Data") # ==> ALWAYS make a histogram with narrow bins as well (breaks=10000) !!!! # You will see the values and their ties!!! # Out of curiosity: Is there a rounding effect in total bill? hist(tips[,"TOTBILL"], col="gray", breaks=10000, xlab="TotBill", main="Tips Data") # What about total paid? hist(tips[,"TOTBILL"]+tips[,"TIP"], col="gray", breaks=10000, xlab="Tot+Tip", main="Tips Data") abline(v=c(10,15,20,25,30,35)+.1, col="red") # Add a dime so as not to overplot... # Histogram function in 'lattice': histogram(tips[,"TIP"]) # Less wasteful margins than 'hist()', no need for par(mgp=..., mar=...) histogram(~ TOTBILL, data=tips) # Better labeling with formula language histogram(~ TOTBILL, data=tips, main="Tips Data") # Title histogram(~ TOTBILL, data=tips, main="Tips Data", breaks=1000) histogram(~ I(TOTBILL+TIP), data=tips, main="Tips Data", breaks=1000, panel=function(...) { panel.abline(v=c(10,15,20,25,30,35), col="red", lwd=3, alpha=.2) panel.histogram(...) } ) # ==> Panel function permits drawing of abline decorations before main content. # It also has an alpha-blending parameter (beween 0 and 1) for transparency. # 'Drawback': May seem more cumbersome to add to a plot; # However: You will see the necessity below for conditional plots/coplots/trellis plots. # OTHER 1D DISPLAYS, just so you know: bwplot(~ TIP, data=tips) # A single-box 'boxplot' or 'box-and-whiskers plot' densityplot(~ TIP, data=tips) # Kernel density plot densityplot(~ TIP, data=tips, plot.points="rug") # Aesthetics: Rug instead of jittered points densityplot(~ TIP, data=tips, plot.points=F, ref=T) # Aesthetics: No points but reference line bwplot(~ TIP, data=tips, panel=panel.violin) # Variation on density plots: violin plots # NOTE ON HISTOGRAM AND DENSITY PLOTS: # For theoreticians the choice of 'bin width'/'bandwidth' in density estimation is a big deal. # In practice you should play with many choices, in particular very small widths; # they might show different aspects of the data. #---- 2D PLOTS # SCATTERPLOT: Several ways of plotting two variables of primary interest plot(tips[,"TOTBILL"], tips[,"TIP"], xlab="Total Bill", ylab="Tip", pch=16) plot(tips[,c("TOTBILL","TIP")], pch=16) plot(TIP ~ TOTBILL, data=tips, pch=16) # model formula: Y ~ X abline(a=0, b=.15, lwd=2, col="blue") # Can add a 15% line to any of the above abline(h=seq(.5,6,by=.5), col="gray") xyplot(TIP ~ TOTBILL, data=tips) # library(lattice) xyplot(TIP ~ TOTBILL, data=tips, pch=16, col="black") # library(lattice) # Critique of the previous three lines: The decoration lines overplot the main info, the dots. # This can be most easily corrected in the 'lattice' version: xyplot(TIP ~ TOTBILL, data=tips, pch=16, col="black", panel=function(...) { panel.abline(h=seq(.5,6,by=.5), col="gray") panel.abline(a=0, b=.15, lwd=2, col="blue") panel.xyplot(...) } ) # Important concept in 'lattice': the panel function! 
# It can assume that something like coordinates and axes have been provided. # 2 variables, X categorical ('is.factor'), Y quantiative ('is.numeric'): # COMPARISON BOXPLOT plot(x=factor(tips[,"SEX"]), y=tips[,"TIP"], xlab="SEX", ylab="TIP") # 'base' plot(TIP ~ factor(SEX), data=tips) # 'base' with formulas xyplot(TIP ~ factor(SEX), data=tips) # 'lattice': nope, just a scatterplot bwplot(TIP ~ factor(SEX), data=tips) # Comparison boxplot bwplot(TIP ~ factor(SEX), data=tips, panel=panel.violin) # Comparison boxplot # Side note on 'generic functions': # - plot() in 'base' is a 'generic function'; # that is, it is really many functions ('methods') # and which are called depending on the type fo the arguments. # ==> Essential feature of 'object-oriented programming' ('OO'). # - You can find out what methods for a generic function exist # by asking with 'methods()': methods(plot) # Another generic function: print(), methods(print) #---- COPLOTS (HIGH-D SECTIONS) # COPLOTS/CONDITIONAL PLOTS/LATTICE PLOTS/TRELLIS PLOTS: # condition on the categories of one or more factors # We will condition on SEX and SMOKER: first the hard way with 'base' graphics ... windows(width=8, height=8) # Aesthetics of aspect ratios: roughly squares par(mfrow=c(2,2), mgp=c(1.5,0.5,0), mar=c(3,3,2,1)) # Aesthetics of axes & /margins for(sex in 0:1) for(smo in 0:1) { sel <- tips[,"SEX"]==sex & tips[,"SMOKER"]==smo plot(tips[sel,"TOTBILL"], tips[sel,"TIP"], xlim=c(0,55), ylim=c(0,10), # Force round numbers on the ranges xlab="TOTBILL", ylab="TIP", # Just to show you can overrule var labels main=paste(c("Male,","Female,")[sex+1], # Choose main titles for the frames. c("Non-Smokers","Smokers")[smo+1]), type="n") # Plot nothing, only set up the panel. abline(a=0, b=.15, col="blue") # The 15% tip line; quibble: it overplots... points(tips[sel,"TOTBILL"], tips[sel,"TIP"], # Plot main contain AFTER the line! pch=20, cex=1.2) # Aesthetics of plotting glyphs/symbols } # A high-level function for the same (in 'base'): coplot(TIP ~ TOTBILL | factor(SMOKER)*factor(SEX), data=tips, pch=16) coplot(TIP ~ TOTBILL | factor(SMOKER)*factor(SEX), data=tips, pch=16, panel=function(...) { abline(a=0, b=.15, col="gray"); points(...) }) # Finally coplots in 'lattice': xyplot(TIP ~ TOTBILL | SMOKER*SEX, data=tips, pch=16) # library(lattice) xyplot(TIP ~ TOTBILL | SMOKER*SEX, data=tips, pch=16, col="black", panel=function(...) { panel.abline(a=0,b=.15, col="gray"); panel.xyplot(...) } ) # Sorry, this calls for better labeling, which is achieved # by using factors instead of numeric dummy variables: tips.df <- tips # Is a dataframe tips.df[,"SEX"] <- factor(tips[,"SEX"], levels=c(0,1), labels=c("Male","Female")) tips.df[,"SMOKER"] <- factor(tips[,"SMOKER"], levels=c(0,1), labels=c("Non","Smoker")) tips.df[,"DAY"] <- factor(tips[,"DAY"], levels=3:6, labels=c("Wed","Thu","Fri","Sat")) coplot(TIP ~ TOTBILL | SMOKER*SEX, data=tips.df, pch=16) xyplot(TIP ~ TOTBILL | SMOKER*SEX, data=tips.df, pch=16, col="black", panel=function(...) { panel.abline(a=0,b=.15, col="gray"); panel.xyplot(...) } ) # Condition even more: layout problems # In the following first version multiple pages are drawn, which disappear till the last. xyplot(TIP ~ TOTBILL | SMOKER*SEX*DAY, data=tips.df, pch=16, col="black", panel=function(...) { panel.abline(a=0,b=.15, col="gray"); panel.xyplot(...) 
} ) # We need to squeeze the plot into one page with 'layout' and make smaller labels with 'cex' xyplot(TIP ~ TOTBILL | SMOKER*SEX*DAY, data=tips.df, pch=16, col="black", layout=c(4,4), panel=function(...) { panel.abline(a=0,b=.15, col="gray"); panel.xyplot(...) } ) xyplot(TIP ~ TOTBILL | SMOKER*SEX*DAY, data=tips.df, pch=16, col="black", layout=c(4,4), par.strip.text = list(cex = 0.75), aspect=1, panel=function(...) { panel.abline(a=0,b=.15, col="gray"); panel.xyplot(...) } ) # Aesthetics: # - 'layout=c(4,4)' produces 4 columns by 4 rows, filled row-major. # - 'par.strip.text' has parameters for sizing the label strips (see help for more). # - 'aspect=1' produces an aspect ratio of the panels so slopes are on average +1. # ==> Learn about Cleveland's 'banking rule' !!!! # Displays should have aspect ratios that show the slopes of interest # at roughly 45 degrees. # Conditioning can be applied to any of the 'lattice' plot types: bwplot(TIP ~ factor(DAY) | SEX, data=tips) # Comparison boxplot for SEX bwplot(TIP ~ factor(DAY) | SEX, data=tips, aspect=1) # Better aesthetics? Maybe not; why? bwplot(TIP ~ factor(DAY) | SEX*SMOKER, data=tips.df, main="TIP conditioned on DAY") # CONDITIONING CREATES HIGH-DIMENSIONAL DISPLAYS!!! # What is the dimensionality of the last coplot as a whole, # that is, taking all four panels together? #---- HIGH-D PROJECTIONS # SCATTERPLOT MATRICES/SPLOMS: pairs(tips) # Hm... plot(tips) # Same: 'tips' is a dataframe, so plot() => plot.data.frame() splom(tips) # 'lattice': messy, would need to learn sizing labels pairs.plus(tips) # Instructor's version with automatic jitters and more space around point pairs.plus(tips, jitter=0.1, gap=.5) # (To show how to control jitter and gap) # Use color: tips.col <- c("black","red","blue","green")[2*tips[,"SEX"]+tips[,"SMOKER"]+1] # ==> black = male non smoker # red = male smoker # blue = female non smoker # green = female smoker pairs.plus(tips.df, gap=.5, col=tips.col, jit=.2) # Add color # For sploms, create a new version with jittered discrete variables 3:6: tips.jit <- sapply(1:ncol(tips), function(j) if(j<3) tips[,j] else jitter(tips[,j], factor=.5)) colnames(tips.jit) <- colnames(tips) splom(tips.jit, pch=16, col=tips.col, cex=.5, alpha=.5) # Thanks, Josh! # Aesthetics of labels and axes: splom(tips.jit, pch=16, col=tips.col, varname.cex=.8, axis.text.cex=.5, axis.line.tck=.5, pscales=4) # Coplot version: splom(~cbind(tips[,c("TOTBILL","TIP")],DAY=jitter(tips[,"DAY"])) | SEX*SMOKER, data=tips.df) splom(~cbind(tips[,c("TOTBILL","TIP")],DAY=jitter(tips[,"DAY"])) | SEX*SMOKER, data=tips.df, pch=16, varname.cex=.8, axis.text.cex=.5, axis.line.tck=.5, pscales=4, xlab="") # Aethetics! # Q: How many data dimensions are you 'looking at' in each display? # PARALLEL COORDINATE PLOTS: par(mfrow=c(1,1)) parcoord(tips) # 'MASS' parallelplot(tips) # 'lattice', random colors by default parallelplot(tips, horizontal.axis=F) # left-to-right parallelplot(tips.jit, horizontal.axis=F, col=tips.col) # meaningful colors # Coplot version enabled by 'lattice': parallelplot(~ tips.jit[,-c(3,4)] | SEX*SMOKER, data=tips.df, horizontal.axis=F, varname.cex=.8, groups=DAY) # Selects default colors for 'groups' parallelplot(~ tips.jit[,-c(3,4)] | SEX*SMOKER, data=tips.df, horizontal.axis=F, varname.cex=.8, groups=DAY, col=c("red","purple","blue","green")) # Choose your own colors for groups. # COLORS in 'lattice': # - This library tries to provide good default colors. # You may or may not agree. 
# - Colors for cases are conveyed by using the argument # 'groups=...". Default colors will be chosen. # - If you want to use your own colors, use in addition the argument # 'col=...". Its values will be matched to 'groups=...'. # - If you are NOT CONDITIONING, then just using 'col=...' will work # just like in 'base' graphics. # In 'lattice' IT WILL NOT WORK PROPERLY IF YOU CONDITION!!!!!! # - Unfortunately 'lattice' allows only one grouping variable # in 'groups=...', so if you intend to color according to # SEX*SMOKER, you have to create this variable first. #---------------------------------------------------------------- # NOTE ON AESTHETICS: # - The instructor's principle is that the main info should stick out # the most, which is why he forces the glyphs to be black and solid. # - 'lattice' graphics force us to add plot elements in the panel function. # This may seem like a nuisance but has the advantage that # we can plot decorations such as ablines before the main plot. # The reason is that 'lattice' sets up a plotting panel before # the panel function is called, hence axes have been created. # - Meta-principles regarding aesthetics: # . You should rarely be satisfied with the defaults of plotting software. # . You don't have to have the instructor's aesthetics, # but do have SOME aesthetics, that is, principles by which you plot. # It means that you can answer questions as to why the details of # a plot look the way they look. #---------------------------------------------------------------- # HISTOGRAMS: CORPORATE EARNINGS DATA -- NARROW INS # When I taught this course in Spring of 2004, a student in # accounting, Joseph Gerakos, showed us this article: # D. Burgstahler, I. Dichev, # "Earnings management to avoid earnings decreases and losses," # Journal of Accounting and Economics 24 (1997) 99-126. # Joseph Gerakos prepared for us a dataset that replicates # the Burgstahler-Dichev effect: Check data... burg.dich <- read.csv("http://www-stat.wharton.upenn.edu/~buja/DATA/burg_dich.csv", header=T) # First glance: str(burg.dich) tab.all(burg.dich) summary(burg.dich) # Plot Earnings earnings <- burg.dich[,"earnings"] # We find some strangely heavy tails: par(mgp=c(1.5,.5,0), mar=c(3,3,2,1)) hist(earnings, col="gray", breaks=100) # Where are these extreme negative values? List them: sort(earnings)[1:20] # Lower end. rev(sort(earnings))[1:20] # Upper end. # Based on the extremes, let's limit earnings to +-10: sel <- (-10) < earnings & earnings < 10 hist(earnings[sel], col="gray", breaks=100) # Still, very heavy-tailed. Subselect the middle 80% of earnings: lower <- quantile(earnings, 0.1) # Lowest decile. upper <- quantile(earnings, 0.9) # Highest decile. earnings.sub <- earnings[lower < earnings & earnings < upper] hist(earnings.sub, col="gray") # Ok, we like to see this: earnings have a mode in the positive range. # But there is left-skewness, which means that there are many more # extreme negative earnings than extreme positive earnings: # Earnings can drop faster than rise, which makes sense. # To get back to our story: let's use unreasonably narrow bins. hist(earnings.sub, breaks=100, col="gray") # Or even more extreme: hist(earnings.sub, breaks=1000, col="gray") abline(v=0, col="red") # Here we go! Companies try to avoid reporting negative earnings # if at all possible. If the earnings are only slightly negative, # there seem to exist ways to 'manage' (cook?) the books somewhat. 
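# An added rough numerical companion to the narrow-bin histogram (a sketch, not part of the
# original analysis; the half-width 'h' of the window around zero is an ad-hoc choice):
h <- 0.01
sum(earnings.sub >= -h & earnings.sub < 0)   # earnings just below zero
sum(earnings.sub >=  0 & earnings.sub < h)   # earnings just above zero
# If the dip-and-spike seen in the histogram is real, the second count should be clearly larger.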
# According to Joseph, a small specialty in accounting has emerged # that examines countries according to whether they show signs of # earnings management. # # Or is this so? A year later the accounting students in 2005 said # that there is now some debate in the literature about the causes of # the dip and spike near zero. It could potentially be an artifact of # tax laws. What will the students of 20xx report??? # # More recently, Sarah Zechman, Ph.D. student in Accounting who took # Stat 541 in 2007, showed me a more recent article on this topic. # Aparently, the dip-and-spike disappear if one computes earnings # not for reporting periods but for periods that are off the reporting dates. # So, indeed, we see human action at work: when it comes to reporting, # "earnings management" is done, throughout the year it isn't. #================================================================ # TITANIC DATA: BARCHARTS AND MOSAIC PLOTS # Titanic data: # - The tabulated form: contingency table data(Titanic); help(Titanic) # - Raw form: dataframe titanic <- read.table("http://www-stat.wharton.upenn.edu/~buja/DATA/titanic.dat", header=T) str(titanic) # [Note: table(titanic) is not identical with 'Titanic' # due to different ordering of the categories.] dimnames(table(titanic)) dimnames(Titanic) #---- # Barplot/chart functions barplot # 'base' barchart # 'lattice' # First experiments with barplot(): barplot(table(titanic[,"Class"])) # From dataframe barplot(margin.table(Titanic,1)) # From contingency table barplot(table(titanic[,c("Sex")])) # From dataframe barplot(margin.table(Titanic,2)) # From contingency table; reverse ordering # Same with barchart: barchart(table(titanic[,"Class"])) # Oops, why this default? barchart(margin.table(Titanic, 1)) # Dito barchart(table(titanic[,"Class"]), horizontal=F) # The 'lattice' default for horizontal bars is visually not good for bar comparison: # We are better at vertical height comparisons than horizontal width comparisons. # Reason: Our vision is trained for detail on a horizontal horizon. # The only reason for horizontal bars would be legibility of horizontal labels. # Horizontal labels on the x-axis get crowded if there are many, # and vertical labels cause our heads to tilt for reading. #---- # 2D BARCHARTS: DIVIDED AND NESTED BARCHARTS: # - DIVIDED BARCHARTS: barplot() accepts matrices # and uses their columns for 'stacked/divided barcharts': barplot(table(titanic[,c("Survived","Class")])) # From dataframe barplot(margin.table(Titanic,c(4,1))) # From contingency table # Need a legend to make sense of what is being plotted: barplot(table(titanic[,c("Survived","Class")]), legend.text=T) # Ooops # Aesthetics: par(mar=c(3,3,5,2)) barplot(table(titanic[,c("Survived","Class")]), col=c("gray70","gray90"), main="Survival on the Titanic by Class", legend.text=c("perished","survived"), args.legend=list(x="topleft") ) # # - NESTED BARCHARTS: barplot(table(titanic[,c("Survived","Class")]), beside=T) barplot(table(titanic[,c("Survived","Sex")]), beside=T) # - Comparison of divided and nested: par(mfrow=c(2,2), mgp=c(1.5,.5,0), mar=c(3,3,2,2), oma=c(0,0,3,0)) # Outer margin for title!!! barplot(table(titanic[,c("Survived","Class")]), legend=T, args.legend=list(x="topleft")) barplot(table(titanic[,c("Survived","Sex")])) barplot(table(titanic[,c("Survived","Class")]), beside=T) barplot(table(titanic[,c("Survived","Sex")]), beside=T) title("Divided and Nested Barcharts", outer=T, cex.main=1.5) # Title in outer margin # Divided barcharts: # . 
Facilitated comparisons: # + overall category counts # + first subcategory counts (here: perished) # + NOT: second and other subcategories (because they are shifted) # . Aesthetics are simple, fewer plotted bars # Nested barcharts: # . Faciliated comparisons: # + all subcategory counts # + NOT: overall category counts # . Aesthetics are more complex, more plotted bars #---- # COPLOTS WITH BARCHARTS: # barchart() in 'lattice' gives you much flexibility for conditionaing and layout: # refine... barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic)) # The instructor has no idea what the bar dividers are... Anyone? # Another problem: baseline is negative .... !!!! BUG !!! # This may be fixed in your possibly newer version of R. # Nested/grouped barcharts: nesting is by 'groups=...' barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic), groups= Survived) # default: 'stack=F' # Divided/stacked barcharts: dividing is by 'groups=...' barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic), groups= Survived, stack=T) # With legend: Justin the stickler -- No/Yes vs Yes/No ... how invert? barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic), groups= Survived, stack=T, auto.key=list(title="Survived") ) # Legend horizontal in 2 columns: barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic), groups= Survived, stack=T, auto.key=list(title="Survived", columns=2) ) # Force four plots into a row: barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic), groups= Survived, stack=T, auto.key=list(title="Survived", columns=2), layout=c(4,1)) # Play with aspect ratio: barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic), groups= Survived, stack=T, auto.key=list(title="Survived", columns=2), layout=c(4,1), aspect=1.5) # Allow separate scales on the vertical axes: barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic), groups= Survived, stack=T, auto.key=list(title="Survived", columns=2), layout=c(4,1), scales=list(y="free") ) # Final question: How-Many-D are these plots: #================================================================ # MOSAIC PLOTS: # - The standard R function (Scott Emerson at Yale): par(mfrow=c(1,1)) mosaicplot(Titanic) # How on earth do you decipher this? mosaicplot(~ Sex + Class + Survived, dir=c("v","v","h"), data=Titanic, off=1) help(mosaicplot) # - Rule: The most interpretable mosaic plot has one vertical variable and # one or more nested horizontal variables # ==> 'doubledecker plot' (Heike Hofmann) # - The 'vcd' package has a doubledecker plot: library(vcd) doubledecker(Survived ~ Sex + Class, data=Titanic) # This is a mosaic plot with one y variable (vertical), # and possibly several nested x variables (horizontal). # This is the mosaic plot that makes the most sense. # - The instructor's mosaic plot: detach("package:vcd") # conflict with package 'vcd' source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/mosaic.R") mosaic(titanic[,c("Sex","Class","Survived")]) # from dataframe mosaic.plot(Titanic, dims=c("Sex","Class","Survived")) # from contingency table (array) # - 'lattice': No mosaic plots apparently... # - Compare barcharts with mosaic plots: # . Barcharts compare conditional counts # . Mosaic plots compare conditional proportions # - Mosaic matrices: # If all variables are categorical, # plot all pairwise mosaic plots. # (Heike Hofmann cannibalized the instructor's mosaic function.) 
mosaic.matrix(titanic) # dataframe mosaic.matrix(Titanic) # contingency table # Q: Are symmetric cells the 45 degree reflections of each other? #---------------------------------------------------------------- # MARKETING DATA: PROFILING CUSTOMER SEGMENTS # Telecom marketing data (instructor's source): mktg <- read.table("http://www-stat.wharton.upenn.edu/~buja/DATA/mktg.dat", header=T) # First glance: str(mktg) # 3 quantitative variables for customer intensity; 5(6) categorical covariates tab.all(mktg) # Will need a version with categorials coded as factors: mktg.df <- mktg mktg.df[,"biz"] <- factor(mktg[,"biz"], levels=c(0,1), labels=c("biz.No", "biz.Yes")) mktg.df[,"reach"] <- factor(mktg[,"reach"], levels=c(0,1), labels=c("reach.No","reach.Yes")) mktg.df[,"kids"] <- factor(mktg[,"kids"], levels=c(0,1), labels=c("kids.No", "kids.Yes")) tmp <- su(mktg.df[,"age"]) # su <- function(x) sort(unique(x)) mktg.df[,"age"] <- factor(mktg[,"age"], levels=tmp, labels=paste("age",tmp,sep=".")) tmp <- su(mktg.df[,"edu"]) mktg.df[,"edu"] <- factor(mktg[,"edu"], levels=tmp, labels=paste("edu",tmp,sep=".")) tmp <- su(mktg.df[,"seg"]) mktg.df[,"seg"] <- factor(mktg[,"seg"], levels=tmp, labels=paste("seg",tmp,sep=".")) str(mktg.df) # HISTOGRAMS: windows(width=3, height=10) par(mfrow=c(ncol(mktg),1), mgp=c(1.5,0.5,0), mar=c(3,3,2,1)) # Shrink waste... for(i in 1:ncol(mktg)) hist(mktg[,i]) # Looks nasty. # Refine and improve: 2 columns, one with normal bins, one with narrow bins. windows(width=8, height=10) par(mfrow=c(ncol(mktg),2), mgp=c(1.5,.5,0), mar=c(2,3,1,1)) for(i in 1:ncol(mktg)) { hist(mktg[,i], main=names(mktg)[i], # default bin widths xlab="", ylab="", col="gray", cex.main=1, cex.axis=.8) hist(mktg[,i], main=names(mktg)[i], breaks=1000, # narrow bin widths xlab="", ylab="", col="gray", cex.main=1, cex.axis=.8) } # The display: # Can you find a more high-level way of doing this with 'lattice', say? # The data: # The narrow bin widths indicate how 'in.' and 'out' may have # been generated (transformations of counts). # SCATTERPLOT MATRIX: Grand overview pairs(mktg) pairs(mktg, gap=.1, pch=16, cex=.5) plot(as.data.frame(mktg), gap=.1, pch=".", cex=.5) # Instructor's function: jitter variables with few values, adds space around points pairs.plus(mktg) # source(....) # see above pairs.plus(mktg, gap=.1, pch=".") #---- # MOSAIC PLOTS: # While profiling the segments was the primary practical concern # for this dataset, one should also be interested in the relationship # between the observed variables ('in.',...,'edu'). # A variable of primary interest to a business is the revenue from each # custormer, hence we may want to see whether there exist differences in # revenue between groups defined in terms of 'bus','reach','kids'. # Here is a histogram-based coplot of the 8 groups defined by all # possible combinations of levels of these three categorical variables: # Marketers should keep in mind how little bottom line differences # there are. #---- # - Sometimes it is worthwhile to discretize a continuous variable # to a categorical variable, if for no other reason than making # it accessible to the power of mosaic plots. # # (If discretizing turns continuous into categorical variables, # recall that jittering did somewhat the opposite: turning # categorical/ordered variables into continuous ones.) # # In the marketing data, we will do the following: # * We will treat "revenue" as a response and try to describe its # association with other variables: "reach", "kids", "bus", "age", and "edu". 
# To this end, we will break "revenue" into 3 equal brackets. # * We will then use mosaic plots to profile the segments in terms of the # other categorical variables, including discretized "revenue". # # - We ignore incoming/outgoing calls and discretize "revenue" # to a categorical variable. # Hand-craft a contingency table with self-explanatory labels: # # 1) Break "revenue" into 3 categories according to terciles: mktg.rev <- cut(mktg[,3], breaks=c(-Inf, quantile(mktg[,3],c(1/3,2/3)), Inf), labels=c("-rev","~rev","+rev")) # The function 'cut()' is meant for such purposes. One gives it the # vectors of breaks (here computed as terciles # 2) Convert the categorical variables (coded by positive integers # in columns 4 through 9 in 'mktg') to factors with self-explanatory # labels for the categories. For example mktg.bus <- factor(mktg[,4], levels=0:1, labels=c("-bus","+bus")) mktg.reach <- factor(mktg[,5], levels=0:1, labels=c("-reach","+reach")) mktg.kids <- factor(mktg[,6], levels=0:1, labels=c("-kids","+kids")) mktg.age <- factor(mktg[,7], levels=0:4, labels=paste("age",1:5,sep="")) mktg.edu <- factor(mktg[,8], levels=0:6, labels=paste("edu",1:7,sep="")) mktg.seg <- factor(mktg[,9], levels=1:4, labels=paste("seg",1:4,sep="")) # Now throw all these factors into the tabulation: mktg.tab <- table(rev=mktg.rev, bus=mktg.bus, reach=mktg.reach, kids=mktg.kids, age=mktg.age, edu=mktg.edu, seg=mktg.seg) # This table is quite high-dimensional: dim(mktg.tab) # This shows that we have 7 dimensions with the following number of # groups in each dimension: 3, 2, 2, 2, 5, 7, 4, # The following shows the names of the levels/groups/categories for # each dimension: dimnames(mktg.tab) # # For a table this size, plotting all dimensions doesn't work. # Here is more clarity: # - Look at the 'rev' as a function of 'kids' within 'reach' within 'bus': mosaic.plot(mktg.tab, dims=c("bus","reach","kids","rev")) # - It may make sense to a few different nestings to see potential patterns # in how 'rev' depends on 'bus', 'reach', 'kids': windows(15,10) par(mfrow=c(2,3)) mosaic.plot(mktg.tab, dims=c("bus","reach","kids","rev"), cex=.8) mosaic.plot(mktg.tab, dims=c("reach","bus","kids","rev"), cex=.8) mosaic.plot(mktg.tab, dims=c("bus","kids","reach","rev"), cex=.8) mosaic.plot(mktg.tab, dims=c("kids","bus","reach","rev"), cex=.8) mosaic.plot(mktg.tab, dims=c("reach","kids","bus","rev"), cex=.8) mosaic.plot(mktg.tab, dims=c("kids","reach","bus","rev"), cex=.8, mar=c(0,0,2,0),main="gaga") # The most patterned plot may be the last: it shows that # the fraction of high-revenue households (blue bars) increases and # the fraction of low-revenue households (brown bars) decreases # . from '-bus' to '+bus', that is, with business needs, # . from '-reach' to '+reach', that is, with reach needs, # . both for '-kids' and '+kids', that is, households with and without children. # It seems that business and reach needs are slight drivers of high revenue. # # The Hartigan-Kleiner mosaic plots: turns out to be useful! windows() mosaic.plot(mktg.tab, dims=c("bus","reach","kids","rev"), hor.vert=c("h","v","h","v"), cex=.8, lab.dist.vert=.2, border=T) # Interesting observation: # - The four large blocks are broken almost cross-wise. # - This means that all proportions are roughly equal across the # 'reach' and 'bus' conditions. # - This in turn means that 'rev' and 'kids' are conditionally independent # given 'reach' and 'bus'. 
# # However, there are slight differences in proportions of # high/low revenue between '-kids' and '+kids' when one compares # the bottom left quarter with the top right quarter: # - for '-reach' and '-bus', adding kids seems to add to revenue, but # - for '+reach' and '+bus', adding kids seems to take away from revenuel # # (You might wonder whether we aren't over-interpreting the data by # reading meaning into such small differences in proportions. # Indeed, this kind of situation calls for inferential statistics # to tell us whether the differences are more likely to be due to randomness.) # # Next, how do age and education affect (better: relate to) revenue? # Education: mosaic.plot(mktg.tab, dims=c("edu","rev")) # The lowest two education groups are almost empty, so we remove them: mosaic.plot(mktg.tab[,,,,,-c(1,2),], dims=c("edu","rev")) # Marginally: the more education, the more high revenue, the less low revenue. # Age: mosaic.plot(mktg.tab, dims=c("age","rev")) # It would help to know the age brackets, but I don't. # Apparently, very young ('age1') and young mature ('age3') are the # strongest revenue producers for the company. # # Stratify by edu nested within age: Remove the lowest 3 education groups first. mosaic.plot(mktg.tab[,,,,,-(1:3),], dims=c("age","edu","rev")) # Education shows an effect on revenue for all age groups except 'age3'. # Profiling the segments: windows(width=15, height=10) par(mfrow=c(2,3), cex=.6) mosaic.plot(mktg.tab, dims=c("seg","bus"), lab.dist.hor=.2, lab.dist.vert=.2) mosaic.plot(mktg.tab, dims=c("seg","reach"), lab.dist.hor=.2, lab.dist.vert=.2) mosaic.plot(mktg.tab, dims=c("seg","kids"), lab.dist.hor=.2, lab.dist.vert=.2) mosaic.plot(mktg.tab, dims=c("seg","age"), lab.dist.hor=.2, lab.dist.vert=.2) mosaic.plot(mktg.tab[,,,,,-(1:3),], dims=c("seg","edu"), lab.dist.hor=.2, lab.dist.vert=.2) mosaic.plot(mktg.tab, dims=c("seg","rev"), lab.dist.hor=.2, lab.dist.vert=.2) # We knew all these things, didn't we? # Segments 2 and 4 are low revenue. # # More broken down, but segments vertical for colors: windows(width=15, height=15) mosaic.plot(mktg.tab, dims=c("bus","reach","kids","rev","seg")) # What do you see? Note that segments 1,2,3,4 have assigned # colors brown, green, blue, purple in this order # #---- # A couple of thought about this type of survey data: # - Who is willing to fill out questionnaires for a small sum of money? # => ... bias # - Which of the four managers who marketed to the four segments # performed best in the subsequent 12 months? # => "... to the ..." ################################################################ # # * SIMULATED DATA: a demonstration of the power of mosaic plots # applied to discretized continuous variables: # # We construct a pair of normally distributed data # that have a very weak (true) correlation, 0.05: for(i in 1:100) { cor.xy <- .05; N <- 10000 x <- rnorm(N) y <- cor.xy * x + sqrt(1-cor.xy^2) * rnorm(N) # [Justification of this construction: # If X and V are independent standard normal, # then Y = c*X + sqrt(1-c^2)*V is also standard normal # and X and Y have correlation c. # This remark is for students with sufficient stats background.] 
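# [Added check of the claim above, spelled out for those who want the two-line argument:
#    Cov(X,Y) = Cov(X, c*X + sqrt(1-c^2)*V) = c*Var(X) + sqrt(1-c^2)*Cov(X,V) = c*1 + 0 = c
#    Var(Y)   = c^2*Var(X) + (1-c^2)*Var(V) = c^2 + (1-c^2) = 1
#  so Cor(X,Y) = Cov(X,Y)/sqrt(Var(X)*Var(Y)) = c, and Y is standard normal because it is a
#  sum of independent normals with variances c^2 and 1-c^2.]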
# # Of course the data we generated do not have a correlation # of exactly 0.05: cor(x,y) # # Next discretize x and y: x.categ <- cut(x, breaks=c(-Inf,quantile(x,c(1/3,2/3)),Inf), labels=c("low.x","mid.x","high.x")) y.categ <- cut(y, breaks=c(-Inf,quantile(y,c(1/3,2/3)),Inf), labels=c("low.y","mid.y","high.y")) # # Finally compare a scatterplot of y versus x with a mosaic plot # of y.categ versus x.categ: # windows(15,7.5) # windows(w=16, h=8) par(mfrow=c(1,2)) plot(x,y, pch=16, cex=.2, xlim=c(-4,4), ylim=c(-4,4)) mosaic.plot(table(x.categ, y.categ), lab.dist.hor=.2, lab.dist.vert=.2, cex=.8) # Can you see the correlation in the scatterplot? The mosaic plot? # [If the trend in the mosaic plot seems wrong, think about it. # It does reflect a positive correlation.] # # [ # Animation: # You can wrap a for-loop around the statements of this section # and execute them a 100 or so times. # # ... dev.flush(); Sys.sleep(.5) } # But comment out any 'windows(...)' statement to avoid creating # 100 windows!!! This will make sure that we re-use the existing # one instead. # This loop will create an animation that shows you the # # SAMPLING VARIABILITY # or # DATASET-TO-DATASET VARIABILITY # # of standard normal data of size N and with true correlation 0.05. # You can now ask how often you see a correlation in the scatterplot # and how often a corresponding trend in the mosaic plot. # For the overwhelming majority of samples you will # NOT SEE the correlation in the scatterplot, but you will # SEE it in the mosaic plot. # ] # Raising the question of sampling variability brings us # to problems of statistical inference: # # IS WHAT WE SEE REAL? # # Here is an inferential version that provides an independence test: mosaic.plot.overallnull(data.frame(x=x.categ, y=y.categ), nrand=10, lab.dist.hor=.2, lab.dist.vert=.2, cex=.8) # Its not very successful, though. #---------------------------------------------------------------- # INTERACTIVE DATA VISUALIZATION SOFTWARE: # - Some commercial software packages contain such features: # . JMP (from SAS, John Sall''s project) # . Datadesk (Velleman) # - Fundamental operations: # . painting/brushing in linked plots # . dynamic projections, in particular 3D rotations, # but also grand tours, correlation tours, regression tours # . scaling # . others: subsampling, transforming, ... # - The modern challenge: MANY VARIABLES (large p versus large N) # . Example: Association Navigator # - Other areas for dynamic visualization: # . maps # . analytic graphs # - What does it take to program your own dynamic viz? # # . R: Simple identifications can be done with 'identify()' cars <- read.table("http://stat.wharton.upenn.edu/~buja/STAT-541/CarModels2003-4-NA.TXT", header=T, sep=",", na=".") # Use instrustor's fct for more space between borders and points: plot.plus(cars[,c("Horsepower","MPG.Highway")], extra=.2) identify(cars[,c("Horsepower","MPG.Highway")], labels=cars[,"Model"]) # Hit ESC to stop # Another function returns coordinates at click location, NOT neares point: for(i in 1:5) { xy <- locator(n=1); points(xy, pch=16, cex=2, col="red") } # Hit ESC to stop sooner for(i in 1:5) { xy <- locator(n=1); abline(v=xy$x, col="red"); abline(h=xy$y, col="red") } # # . Need also 'widgets', provided in R by libraries 'tcltk2', 'RGtk2' pckgs.all <- available.packages("http://cran.r-project.org/bin/windows/contrib/3.2") # update version 3.2! str(pckgs.all) # 2012/09/11: 3956 packages; 2014/09/10: 5698 grep("tk", pckgs.all[,1], val=T) # Prints packages related to graphics toolkits. 
# # . R: getGraphicsEvent() # Provides direct access to mouse clicks and mouse moves # and dispatch to user-programmed functions on these events. # (See the instructor's demo of two programs for dynamic visualization # - of commercial tuna fishing vessels in the eastern Pacific Ocean, and # - of large correlation tables of size up to 1000x1000. # To learn about this function, see help(getGraphicsEvent) # Here is a simple non-statistical demo of the getGraphicsEvent() function: source("http://stat.wharton.upenn.edu/~buja/STAT-926/etch-a-sketch.R") # Examine the code and cannibalize it to do something useful. # # . A program with a fixed set interactive dynamic capabilities: # ggobi from http://www.ggobi.org/ # # . For truly dynamic graphics with web capability learn javascript (which is not java). #---------------------------------------------------------------- # DATA VISUALIZATION AND STATISTICAL INFERENCE: # - Add # . confidence intervals/regions/bands or # . null intervals/regions/bands # to your plots whenever possible and meaninful. # - General principles that often apply (recall Stat 541) # . for confidence: ... # . for null: ... # - Issue: Simultaneity # - Examples: # . normal quantile plots # . empirical quantile plots # #---------------- # - EXAMPLE 1: Normal quantile plots with null and confidence bands places <- read.table("http://www-stat.wharton.upenn.edu/~buja/STAT-541/places.dat") places <- read.table("../A-STAT-541/Data/places.dat") str(places) # Housing costs: qqnorm(places[,"Housing"]) source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/qqplot.R") source("../A-STAT-541/Functions/qqplot.R") qqnorm.null.conf(places[,"Housing"]) # Some fun: search Box-Cox transformations to achieve normality by the lights # the null bands and CI bands: qqnorm.null.conf(places[,"Housing"], box.cox.par=0.5, new.window=F) qqnorm.null.conf(places[,"Housing"], box.cox.par=0, new.window=F) qqnorm.null.conf(places[,"Housing"], box.cox.par=-1, new.window=F) qqnorm.null.conf(places[,"Housing"], box.cox.par=-1.5, new.window=F) # - Problem with inference for quantiles: # We don't know the location and scale parameters: # y = mu + sig*rnorm where mu = E[y], sig=SD[y] # mu = intercept, sig = slope of line if y ~ N(mu,sig^2). # They are NOT of interest, i.e., we are not testing them. # => mu, sig are nuisance parameters. # Q: What is of interest in this context? # A: The SHAPE of the distribution, not its LOCATION and SCALE !!! # - General principle for eliminating nuisance parameters: # Find PIVOTAL test statistics. # DEF: A test statistic is PIVOTAL if its NULL DISTRIBUTION # does not depend on the nuisance parameters. # Example: The t-statistic for H0: mu=0 is pivotal wrt # the nuisance parameters sig in N(mu,sig^2). # - Q: How can we make the normal quantile plot 'pivotal' # wrt to the nuisance parameters mu and sig if # H0: y ~ N(mu,sig), and the alternative is y !~ N(,) ? # Note that this is a nonparametric alternative! # Rejection is desired for any other distributional shape. # # A: Transform the empirical quantiles (sorted values) y(i) # by standardizing them: (y(i) - mean(y))/sd(y) # Make up other examples of standardization! # # - Correct testing procedure: Generate NULL SAMPLES # by sampling ynull ~ rnorm(N,mu=0,s=1) # and form the pivotal sorted values: # (ynull(i) - mean(ynull))/sd(ynull) # ==> Overplot many NULL SAMPLES (so-called 'spaghetti plot') # in pivotized form. # Overlay the OBSERVED SAMPLE in pivotized form. # ==> Reject if 'observed' reaches outside the 'null band'. 
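# An added minimal base-R sketch of the spaghetti null band just described -- this is NOT the
# instructor's qqnorm.null.conf(), only an illustration; the number of null samples (100) is
# an arbitrary choice, and the 'Housing' variable of the places data serves as the example:
y    <- places[,"Housing"]
N    <- length(y)
q.th <- qnorm(ppoints(N))                        # theoretical normal quantiles
piv  <- function(z) (sort(z) - mean(z))/sd(z)    # pivotize: sort, then standardize
plot(q.th, piv(y), type="n",
     xlab="Normal Quantiles", ylab="Pivotized Sorted Values")
for(r in 1:100) lines(q.th, piv(rnorm(N)), col="gray")   # null 'spaghetti' first ...
points(q.th, piv(y), pch=16, cex=.6)                     # ... observed sample on top
abline(a=0, b=1, col="blue")                             # the 45 degree line
# Informally reject normality if the observed points escape the gray null band.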
# Check the null band code of the instructor's function: qqnorm.null.conf # - Confidence bands follow a different logic. # For iid data, the nonparametric bootstrap (BS) is often applicable. # Here the need for pivotal transformation is less, # because bootstrap resampling (NOT null sampling) # requires no knowledge of the nuisance parameters. # Intuitive use of a BS band: # If no straight line fits inside the band, # reject the null hypothesis of a normal distribution. # However, for an asymptotically valid test, you should # also use pivotized data. # Reason: In pivotized form you know WHICH LINE # the confidence band should capture -- # the vert=hor 45 degreed line. #---------------- # - EXAMPLE 2: Two-sample comparisons with permutation null bands # Mktg data -- compare 'rev' for the 4 segments # This is an alterantive to HW 1 where we use density comparison. # Here we use an emprirical QQ-plot for group comparison (x) # wrt to a quantitative variable (y). windows(9,9) x <- as.character(mktg.df[,'seg']) # factor = pain in the ... y <- mktg.df[,'rev'] grps <- su(x) # 4 segments ngrps <- len(grps) par(mfrow=c(ngrps, ngrps), mgp=c(1.5,.5,0), mar=rep(0,4)) for(ig in grps) for(jg in grps) { if(ig==jg) { hist(y[x==ig], breaks=20, col="gray", xlab="", ylab="", main="", xaxt="n", yaxt="n", xlim=range(y)) text(x=par()$usr[1], y=par()$usr[4], lab=ig, adj=c(-.1,1.1), cex=1.2) } else { qqplot.null(y[x==jg], y[x==ig], xlab="", ylab="", xaxt="n", yaxt="n", mgp=c(1.5,.5,0), mar=rep(0,4), nrep=1000) } } # # [Another data example: Autism and fathers' ages] # Q: Why do we NOT have a pivotality problem here? # Or do we? #---------------- # - Issues with bands: # . Spaghetti method is easy, universal, but ... # . Convert the spaghetti bands into actual bands # with controlled simultaneous null coverage 1-alpha # along the lines of Homework 1 for densities. # -------------- End of Data Viz ------------------- #================================================================ NEW CHAPTER: CLASSICAL EIGENVALUE-BASED MULTIVARIATE ANALYSIS * ROADMAP: - Review of principal component analysis (PCA) - Canonical correlation analysis - Generalized canonical analysis - Application of the above to Fisher''s optimal scoring and correspondence analysis #---------------------------------------------------------------- * PRINCIPLE OF MULTIVARIATE ANALYSIS (MVA): - Putting it negatively: Do not single out a response variable (that would be regression) - Putting it positively: Perform symmetric analyses of the variables - Goal: Dimension reduction, capturing many dimensions in few ================================================================ * PRINCIPAL COMPONENT ANALYSIS -- PCA ('review') - Setup: [We use pseudo-R like c(), cbind(), rbind()] . Population: Random variables Xj collected in a random vector: X.vec = (X1,...,Xp)^T (column vector px1) . Multivariate data in columns xj collected in data matrix: X.mat = cbind(x1,...,xp) = rbind(x.row1,...,x.rowN) (Nxp) X.mat <- as.matrix(mktg[,1:8]) # R; toy data from data frame 'mktg' - Connection: . We think of the rows x.rowi of X.mat as iid samples from the random vector X.vec. . The data matrix X.mat is used to estimate parameters of the distribution of X.vec. [We write a row x.rowi as a px1 COLUMN to correspond to X.vec.] [Our toy data are actually the result of sampling, unlike the Boston Housing data.] - First and second order properties: . 
Population parameters: mu.vec := E[X.vec] (px1) Sigma := V[X.vec] = E[(X.vec - mu.vec)(X.vec - mu.vec)^T] (pxp) Formula reminder: Sigma = E[X.vec X.vec^T] - E[X.vec] E[X.vec]^T . Sample estimates: mu.est <- colMeans(X.mat) # R Sig.est <- var(X.mat) # R Centered matrix: X.mat.c <- scale(X.mat, center=T, scale=F) - What object is being optimized in PCA, and what is the criterion? . The object: linear combinations of the variables a^T X.vec in the population X.mat %*% a in the data (R) . The criterion: variance of the linear combination V[a^T X.vec] in the population var(X.mat %*% a) in the data (R) - What is the variance of a linear combination? . Population: V[ a^T X.vec ] = a^T V[X.vec] a = a^T Sigma a (butterfly formula) . Estimated from data: a <- c(1,1,1,2,2,2,1,1) # R; an arbitrary coefficient vector for a lin comb... var(X.mat %*% a) # R t(a) %*% Sig.est %*% a # R - Constructing the PCA problem: . What is the intuition behind PCA in terms of the above concepts? ... Large variance direction have a chance of showing interesting structure. . What is the problem with this intuition? ... max Var(lincomb) = Inf . What is the conventional solution? ... Restrict the coefficient vector 'a' somehow. |a|^2 = 1 . What is the problem with the conventional solution? ... Apples and oranges wrt units (except in functional data...) . What is it that is conventionally done to 'solve' this problem? ... One scales the variables to unit variance (sd) . What does this imply about the covariance matrix that is actually used? ... Correlation rather than covariance . Think of contexts in which this problem does not arise, that is, in which the plain covariance matrix can be used. ... Functional data where all variables have the same units . Comments: + You will find some authors argue against the use of the correlation matrix in PCA on grounds of difficult or disadvantageous asymptotics. Ask them how they would solve the apples-and-oranges problem when the variables have heterogeneous units. + Compare: In regression, explanatory power of a predictor s measured by a t-statistic, which is free of units. In PCA we have no such tool and must decide a priori how to make the units of the variables comparable. Choosing the units to be the standard deviation across all variables, i.e., z-scoring/standardizing the variables, seems commonsensical. + Note that there are many ways to standardize. For example, one might choose median and MAD instead of mean and sd if robustness to heavy tails is an issue. But then you might also revamp the PCA criterion by optimizing a robust variance rather than the classical variance. However, you will no longer be performing eigenanalysis but successive extraction of eigenvectors with actual optimization. + Yet another possibility is nonlinear transformation of the variables so as to tame skewness and/or kurtosis. - Reformulate the PCA problem as an eigenvalue/vector problem: . Population: ... . Data: ... - Perform the PCA eigenanalysis for the mktg data in X.mat: + Two ways to get the correlation matrix: X.std <- scale(X.mat) # standardize to mean=0, sd=1 Sig.est <- var(X.std) # now this is the correlation matrix Sig.est <- cor(X.mat) # same as previous two lines: cor() rather than var(scale()) !! 
+ Eigenanalysis of the correlation matrix: Sig.est.eig <- eigen(Sig.est) str(Sig.est.eig) # components: ...$vectors, ...$values + Check reconstruction of Sig.est as V D V^T: max(abs( Sig.est - Sig.est.eig$vec %*% diag(Sig.est.eig$val) %*% t(Sig.est.eig$vec) )) - Interpretation of eigenvalues as variances: . Population: let v.j be the j''th eigenvector of Sigma Sigma v.j = lambda v.j v.j^T Sigma v.j = lambda |v.j|^2 = lambda.j (j''th eigenvalue) v.j^T Sigma v.j = V[v.j^T X.vec] ==> lambda.j = V[v.j^T X.vec] . Data: dito with Sig.est instead of Sigma Check the interpretation numerically: apply(X.std %*% Sig.est.eig$vec, 2, var) # var() of lin combs from e'vecs Sig.est.eig$val # eigenvalues ==> We call the PCA eigenvalues also 'PC variances'. Their roots are also called 'PC standard deviations' or 'PC sdevs'. . Reminder of variance 'budgets': tr <- function(mat) sum(diag(mat)) tr(Sig.est) # trace of covariance/correlation matrix sum(Sig.est.eig$val) # sum of eigenvalues Trace is invariant under basis change, hence sum of eigenvalues == sum of variable variances ==> If PCA is correlation-based (equiv: X.mat is standardized), then the eigenvalues sum up to p, the number of variables. - PC scores: Projecting the cases onto the PC dimensions "Linear Dimension Reduction" . Recall: The cases correspond to the rows in X.std. . The projection score of the i''th case x = X.std[i,] (row i) on a unit vector (direction) v is = x^T v. [Mathematically, a projection is IR^p --> IR^p, so a p-vector x projects to the p-vector (v v^T) x = v (v^T x), because v v^T is the rank-one or one-dimensional projection onto the direction v. What matters, of course, is the 'projection score' (v^T x).] . If the projection is onto the PC eigenvectors, the projection scores are called 'PC scores'. Example: case 5 projected onto the first eigen direction i <- 5; j <- 1 # case 5, PC 1 t(X.std[i,]) %*% Sig.est.eig$vec[,j] # x^T v sum( X.std[i,] * Sig.est.eig$vec[,j] ) # same . The N-vector of scores for the j''th PC direction is: X.std %*% Sig.est.eig$vec[,j] # all inner products ... The matrix of PC scores for all PC directions is: X.PCscores <- X.std %*% Sig.est.eig$vec Its size is ... dim(X.PCscores) . From the PC scores you can easily reconstruct the (standardized) data: max(abs( X.std - X.PCscores %*% t(Sig.est.eig$vec) )) The reason is that t(Sig.est.eig$vec) is the inverse of Sig.est.eig$vec because this is an orthogonal matrix. . Interpretation of PC scores: + re-expression of the (standardized) data matrix X.std in a new coordinate system where basis vectors = PC eigenvectors + The basis change creates new variables (the PC scores) that are 'DECORRELATED': round(var(X.PCscores), 5) # Off-diagonal==0 Sig.est.eig$val # Diagonal and centered if X.std was centered: colMeans(X.PCscores) + The variances of the PC score variables are the PC eigenvalues. [This better be so because we showed it earlier: The PC scores are the optimized linear combinations of the original variables.] + PC scores lend themselves for plotting: Think of the data cloud in p-space as forming a 'pizza'. plot(X.PCscores[,1:2]) # What's wrong with this? Scale!! lims <- c(-5,5) plot(X.PCscores[,1:2], xlim=lims, ylim=lims) abline(h=0); abline(v=0) ==> View of the 'pizza' from above. It is an oval pizza. Q: Why is scale of the essence? A: Projections have their individual variances which should be reflected by the plot. Here is another view: plot(X.PCscores[,c(1,ncol(X.PCscores))], xlim=lims, ylim=lims) ==> View of the 'pizza' edge-on. 
Here is the full view:
       pairs.plus(X.PCscores, pch=".")
     . How many points are plotted?
       ==> Don't be surprised if reshaping the plot is slow.
     . How do you explain the striations?
     . What's wrong with the scales?
       pairs.plus(X.PCscores, pch=".", xlim=lims, ylim=lims)
       ==> Proper size of the 8-dim point cloud in the PC directions.
     . Do you perceive correlations in any of the plots?
       ==> If so, it's a visual illusion.  We know there is no correlation:
       round(cor(X.PCscores), dig=7)

   - Properties of PC scores:
     . Population version of PC scores:
       For correlation-based PCA, use standardized variables:
         Xvec.std = (Xvec - E[Xvec])/SD[Xvec]   # SD and division componentwise
         Sigma = E[ Xvec.std %*% Xvec.std^T ]   # Population correlation matrix
       Evals (p) in pxp diagonal Lambda, Evecs in pxp o.n. V:
         Sigma = V Lambda V^T
       PC score variables: random vector
         Xvec.PCscores = V^T Xvec.std   (px1)
       1st order properties:
         E[Xvec.PCscores] = 0  if E[Xvec.std] = 0   (px1)
       2nd order properties:
         V[Xvec.PCscores] = V[V^T Xvec.std] = V^T V[Xvec.std] V = V^T Sigma V = Lambda
         ==> diagonal, hence decorrelated.
     . The same holds in samples with mean() instead of E[] and cov() instead of V[]:
         X.PCscores = X.std V.est
       where V.est = estimated matrix of eigenvectors (Sig.est.eig$vec).
       Note the transposition compared to the population version
       because the rows of X.std correspond to the column vector Xvec.std.
       + 1st order properties:
         If the columns of X.std have zero means,
         so do the columns of the linear transformation X.std V.est.
       + 2nd order properties:
         cov( X.PCscores ) = X.PCscores^T X.PCscores / (N-1)
                           = V.est^T (X.std^T X.std/(N-1)) V.est
                           = V.est^T Sig.est V.est
                           = diagonal matrix of estimated eigenvalues
         ==> decorrelated columns in X.PCscores
     . Under the hood: Eigendecompositions simultaneously diagonalize
       the quadratic forms of the Rayleigh numerator and denominator.
       ==> PC eigenvectors vj are simultaneously orthogonal wrt
           + the covariance:                 vj^T Sigma vk = 0   for j!=k
           + the Euclidean squared distance: vj^T vk = 0         for j!=k

   - Casting the PCA problem for data as an SVD problem:
     . Reminder: the 'singular value decomposition' or 'SVD'
       If X is Nxp full rank, then there exist U, D, V such that
         X = U D V^T
         U is Nxp, columns = left singular vectors
         V is pxp, columns = right singular vectors
         U^T U = I (pxp)   (U has o.n. columns)
         V^T V = I (pxp)   (V has o.n. columns)
         D diagonal (pxp)
     . SVD as optimization/stationarity of a bi-Rayleigh quotient:
                     (u^T X v)^2
         R(u,v)  =  -------------     (or its root)
                     |u|^2 |v|^2
       Write down the stationarity conditions:
         X (v/|v|) = R(u,v)^(1/2) u/|u| ,    X^T (u/|u|) = R(u,v)^(1/2) v/|v|
       As for eigen decomps, there exists a whole o.n. basis of such u and v,
       which we can collect in matrices U and V (U^T U = I_p = V^T V), so that:
         X V = U D,  where D is diagonal with the stationary values R(u,v)^(1/2) in it.
         X = U D V^T    (QED)
     . For correlation-based PCA, standardize X.mat first:
         X.std <- scale(X.mat)            # did this earlier already, just a reminder
       For covariance-based PCA, center X.mat first:
         X.std <- scale(X.mat, scale=F)
     . Perform an SVD of X.std:
         X.svd <- svd(X.std)
         str(X.svd)   # components ...$d, ...$u, ...$v
       Check the reconstruction of X.std:
         max(abs( X.std - X.svd$u %*% diag(X.svd$d) %*% t(X.svd$v) ))
     . The connection with PCA is obtained by recalling the correlation matrix:
         max(abs( Sig.est - t(X.std) %*% X.std / (nrow(X.std)-1) ))
       Hence, in math rather than R notation:
         Sig.est = X.std^T X.std / (N-1) = V D U^T U D V^T / (N-1) = V D^2 V^T / (N-1)
       ==> D^2/(N-1) are the eigenvalues or PC variances.
           D/sqrt(N-1) are the PC sdevs.
           V from svd(X.std) contains the PC eigenvectors.
     .
Numeric check of agreement between svd(X.std) and eigen(Sig.est): Sig.est.eig$val; X.svd$d^2 / (nrow(X.std)-1) # eigenvalue agreement max(abs( Sig.est.eig$vec - X.svd$v )) # eigenvector agreement -- oops!?!? plot(Sig.est.eig$vec, X.svd$v, pch=16) ==> Eigenvectors and singular vectors are determined only up to a sign change! Here is a better test (try to understand it): round(t(Sig.est.eig$vec) %*% X.svd$v, 13) . PC scores from the svd: X.std = U D V^T where V = eigenvectors X.std V = U D ==> U D is the PC score matrix. Check agreement with previous scores, but beware of sign flips: plot(X.svd$u %*% diag(X.svd$d), X.PCscores, pch=16) abline(a=0, b=1, col="red"); abline(a=0, b=-1, col="red"); - PC loadings: PC eigenvectors rescaled . In principle, the meaning of a PC can be recognized by comparing the entries of its eigenvector: an entry large in magnitude implies that this variables 'loads' strongly on this PC. . Interpretability of eigenvectors can be enhanced by rescaling them: X.std = U D V^T U D = PC score matrix U = PC score matrix with columns scaled to unit length sqrt(N-1)*U = PC score matrix with columns scaled to unit sdev ==> Correlation between X.std and PC scores can be obtained by cor(X.PCscores, X.std) = sqrt(N-1) * U^T X.std / (N-1) = U^T X.std / sqrt(N-1) = D V^T / sqrt(N-1) cor(X.std, X.PCscores) = (D V^T)^T / sqrt(N-1) = V D / sqrt(N-1) Recall that D/sqrt(N-1) are the PC sdevs. Hence V D / sqrt(N-1) are the eigenvectors rescaled by the PC sdevs. They are identical with the correlations between PC scores and the (stdized) data. - R functions: . Canned: prcomp(X.mat) # defaults are center=T, scale=F summary(prcomp(X.mat)) princomp(X.mat) # default: cov-based, not cor-based summary(princomp(X.mat)) . From scratch: much work for good labeling pca <- function(X, center=T, scale=T) { N <- nrow(X) p <- ncol(X) X.std <- scale(X, center=center, scale=scale) X.svd <- svd(X.std) # Eigenvalues and their siblings: evals.out <- rbind("variance" = X.svd$d^2 / (N-1), "sdev" = X.svd$d / sqrt(N-1), "% variance" = X.svd$d^2 / sum(X.svd$d^2) * 100, "% cumulative" = cumsum(X.svd$d^2 / sum(X.svd$d^2)) * 100, "% cum. wrt rest"= X.svd$d^2 / rev(cumsum(rev(X.svd$d^2))) * 100 ) colnames(evals.out) <- paste("PC",1:p,sep="") # Eigenvectors: evecs.out <- X.svd$v colnames(evecs.out) <- paste("PC",1:p,sep="") rownames(evecs.out) <- colnames(X) # Loadings: loads.out <- evecs.out %*% diag(evals.out["sdev",]) colnames(loads.out) <- paste("PC",1:p,sep="") # Scores: scores.out <- X.svd$u %*% diag(X.svd$d) colnames(scores.out) <- paste("PC",1:p,sep="") rownames(scores.out) <- rownames(X) # Return the goods: list(values=evals.out, vectors=evecs.out, loadings=loads.out, scores=scores.out ) } X.pca <- pca(X.mat) # Example - Worked example: . PC eigenvalues: round(X.pca$val, 2) ## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 ## variance 2.64 1.69 1.03 0.81 0.67 0.53 0.42 0.20 ## sdev 1.62 1.30 1.02 0.90 0.82 0.73 0.65 0.45 ## % variance 33.00 21.11 12.93 10.15 8.41 6.59 5.27 2.54 ## % cumulative 33.00 54.11 67.04 77.19 85.60 92.19 97.46 100.00 ## % cum. wrt rest 33.00 31.51 28.17 30.80 36.86 45.77 67.49 100.00 Trace plot: plot(X.pca$val["sdev",], pch=16, type="b", ylim=c(0,max(X.pca$val["sdev",])), ylab="PC sdev", xlab="PC") abline(h=c(0,1), col="blue") . PC loadings for meaning: round(X.pca$load,3) ## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 ## in. 
0.713 -0.429 0.090 -0.003 0.133 0.162 -0.503 -0.053 ## out 0.850 -0.337 0.102 0.001 0.020 -0.084 0.179 0.337 ## rev 0.812 -0.378 0.076 -0.053 -0.072 -0.120 0.291 -0.292 ## biz 0.401 0.404 -0.457 0.497 0.451 -0.125 0.019 -0.028 ## reach 0.473 0.482 0.039 0.424 -0.574 0.180 -0.043 0.002 ## kids 0.364 0.628 0.342 -0.261 0.296 0.426 0.136 -0.014 ## age -0.378 -0.672 -0.184 0.330 0.060 0.478 0.175 0.004 ## edu 0.301 0.088 -0.805 -0.454 -0.161 0.147 -0.001 0.018 Note: - read a story from PC1, PC2, PC8 - note the 'importance weighting' of the loadings according to PC sdevs - note the sign patterns imposed by orthogonalities - typical: the first PC is a weighted average of positively associated variables Profile plots of loadings: parallel coordinate plots p <- ncol(X.pca$load) windows(h=5, w=10) parcoord(cbind(p:1,t(X.pca$load)), lwd=3, ## col=paste("gray",round(seq(10,90,len=ncol(X.pca$load))),sep="") ) col=sample(colors()[25:60],size=p)) mtext(text=colnames(X.pca$load), side=2, at=seq(1,0,len=p), las=2, adj=0) abline(h=0, col="gray", lty=2) ==> Problem is that all axes are independently scaled. Hand-made parallel coordinate traces with identical scales: plot(c(-.5,p), c(-1,1), type="n", ylab="Loadings", xlab="Variables", xaxt="n") axis(side=1, at=0:p, lab=c("PC Order",rownames(X.pca$load)), las=2) text(lab=colnames(X.pca$load), y=seq(1,-1,len=p), x=0, adj=1.1, cex=.8) abline(h=0, col="gray") clrs <- colors()[c(70,76,91,97,119,134,254,461,653)] # Some strong colors for(j in 1:p) lines(x=0:p, rbind(seq(1,-1,len=p),X.pca$load)[,j], lwd=3, col=clrs[j]) . PC scores, plotted: We use segmentation for coloring windows() # New square window rg <- c(-1,1)*max(abs(X.pca$sco)) # Impose same ranges on all plots in the PC score splom pairs.plus(X.pca$sco, xlim=rg, ylim=rg, cex=.1, col=c("red","blue","green","brown")[mktg[,"seg"]]) # color by segment Focus on first (dimension reduction to 2 dims): plot.plus(X.pca$sco[,1:2], xlim=rg, ylim=rg, cex=1, # Just the first two PCs col=c("red","blue","green","brown")[mktg[,"seg"]]) Original standardized data, plotted: windows() rg <- c(-1,1)*max(abs(X.std)) # Impose same ranges on all plots in the PC score splom pairs.plus(X.std, xlim=rg, ylim=rg, cex=.1, col=c("red","blue","green","brown")[mktg[,"seg"]]) # color by segment . Compare: prcomp(X.mat, scale=T) princomp(X.mat, cor=T) summary(princomp(X.mat, cor=T)) - Inference for PCA: . Try the universal hammers: + permutation tests + nonparametric bootstrap But will they work? . Permutation testing: + H0: All variables are independent (only game for permutation approach) Q: Is global independence a meaningful null hypothesis? ... + Test statistics: estimated eigenvalues Try the LARGEST eigenvalue as test statistic. Q: Valid test of independence? A: ... + Try the SECOND LARGEST eigenvalue as test statistic. What are H0 and H1 now? (Recall: test stat <--> H1) ... What would the intuitively desired H0 and H1 be? ... Intuitively one would like to test whether the 2nd eval is significant -- never mind what the 1st was. However: If the 1st eval was powerful, the 2nd eval has a lesser chance due to the budget constraint: sum evals = p. + One is tempted to look for a conditional test for lambda2: H0: "Conditional on lambda1.est, the variables are independent." But the conditional independence distribution of X.mat given lambda1.est is not a permutation distribution! How would you sample from a conditional distribution of X.mat given lambda1.est? ... + Another idea: Consider lambda1 as a nuisance parameter when testing lambda2. 
==> Try to pivotize lambda1 out of lambda2.est. Any proposals? ... How about this: lambda2.est as proportion of remaining variance lambda2.est / sum_{j=2...p} lambdaj.est = lambda2.est / (p-lambda1.est) # correlation-based PCA But is the distribution of this statistic free of lambda1? ... Should it be conditional on the first eigenvector v1.est as well? ... Would it work at least for multivariate normal Xvec? ... How would you 'pull out' the first component? ... ('residualizing') max(abs(X.std - X.svd$u %*% diag(X.svd$d) %*% t(X.svd$v))) # Ok, we understand the decomposition of X.std. X.adj1 <- X.std - X.svd$u[,1] %*% t(X.svd$v[,1]) * X.svd$d[1] # Should we use correlation-based or covariance-based PCA on X.adj1? # Correlation-based would restandardize the adjusted variables. # Maybe we don't want this because it changes the eigenstructure, # whereas we want to continue testing the previous eigenstructure. X.adj1.pca <- pca(X.adj1, scale=F) rbind(X.adj1.pca$val, X.pca$val) # Good, we seem to have removed PC1 from X.pca and replaced it with a zero PC. # Next question: Is it possible that X.adj1 has independent columns? # Ooops, we introduced an eigenvalue zero, which violates the assumption # of independence! Here is another idea: ## If random permutations do not work, how about random rotations? ## The point is that we can constrain the random rotations to the ## space orthogonal to the first eigenvector (and in generality to ## spaces orthogonal to any sequence or leading eigenvectors). ## The null hypothesis is therefore not independence but ## rotational invariance of the remaining variability. ## Multivariate normal distributions would have this property ## of rotational invariance in subspaces of identical eigenvalues. ## The test would still not be exact because it is orthogonal to ## estimated eivenvectors, not the true eigenvectors. ## Hence we still need N large compared to p. + Social science interpretation of the PC testing problem: Only correlation-based PCA is used. Of interest is not H1: lambda2 > lambda3, but H1: lambda2 > 1. Reason: In your typical questionnaire dataset, a PC that has variance<=1 does not capture more variation than one variable/item (variance of a colum = 1 due to standardization). ==> Testing H0: lambda2=1 is reasonable. ==> Permutation testing of lambda2.est does approximately the right thing. (There is still the issue that a large lambda1 depresses lambda2.) ==> Augment the eigenvalue profile with a permutation null band. plot(X.pca$val["sdev",], type="n", ylim=c(0,max(X.pca$val["sdev",])), ylab="PC sdev", xlab="PC") # roots of eigenvalues for(j in 1:100) lines(sqrt(eigen(cor(apply(X.std,2,sample)))$val), col="gray") abline(h=c(0,1), col="blue") points(X.pca$val["sdev",], pch=16, type="b") Q: Is lambda3>1 significant? ... + If conditional null hypotheses do not work, what about changing the statistic for power against desirable alternatives? We may ask whether lambda2 is large compared to sum{j>=2} lambdaj. ==> Use percentage of variance wrt to remaining variance, as shown in row 5 of round(X.pca$val,2): ## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 ## variance 2.64 1.69 1.03 0.81 0.67 0.53 0.42 0.20 ## sdev 1.62 1.30 1.02 0.90 0.82 0.73 0.65 0.45 ## % variance 33.00 21.11 12.93 10.15 8.41 6.59 5.27 2.54 ## % cumulative 33.00 54.11 67.04 77.19 85.60 92.19 97.46 100.00 ## % cum. 
wrt rest 33.00 31.51 28.17 30.80 36.86 45.77 67.49 100.00 plot(X.pca$val[5,], type="n", ylim=c(0,max(X.pca$val[5,])), ylab="% var wrt rest", xlab="PC") # roots of eigenvalues for(j in 1:100) lines(pca(apply(X.std,2,sample))$val[5,], col="gray") lines(1/(p:1)*100, col="blue") # True profile under independence points(X.pca$val[5,], pch=16, type="b") ==> The eigenvalues are uniformly and significantly steeper than under independence. . Nonparametric confidence intervals/bands: + What could be simpler than nonparametric bootstrap, right? ==> obtain SEs for all eigenvalues PS: To make the inference problem harder, we reduce the sample size to 50. There is almost no inference problem for the full N=1926, so let''s select a small subset, more typical of social siences: N <- 30 X <- mktg[sample(nrow(mktg),size=N),1:8] X.pca <- pca(X) Nboot <- 10000 X.evals.bs <- replicate(Nboot, eigen(cor(X[sample(nrow(X),replace=T),]))$val) dim(X.evals.bs) rownames(X.evals.bs) <- paste("PC",1:nrow(X.evals.bs),sep="") str(X.evals.bs) X.evals.bs.SE <- apply(X.evals.bs, 1, sd) # SEs !!!! X.evals.bs.SE + Decorate/enhance the profile plot: plot(X.pca$val["variance",], type="b", ylim=c(0,max(X.pca$val["variance",])), ylab="PC Variance", xlab="PC", pch=16, cex=1.5, lwd=2) # roots of evals abline(h=c(0,1), col="blue") lines(c(rbind(1:p,1:p,NA)), c(rbind(X.pca$val["variance",]+2*X.evals.bs.SE, X.pca$val["variance",]-2*X.evals.bs.SE, NA)), lwd=2, col="red") + Q: Do you think it likely that these CIs cover the true eigenvalues? Objection: Eigenvalue estimates are biased for the true eigenvalues!!! Why? lambda1 = max_a E((a^T Xvec)^2) / |a|^2 (assuming E[Xvec]=0 wlog) Proof? Use E[max_a f(X,a)] >= max_a E[f(X,a)] lambda1.est ~ max_a mean((X.mat a)^2) / |a|^2 E[lambda1.est] ~ E[max_a mean((X.mat a)^2) / |a|^2] >= max_a E[ sum_i (X.mat[i,]^T a)^2 / N / |a|^2 ] = max_a sum_i E[ (X.mat[i,]]^T a)^2 / N / |a|^2 = max_a N E[(X.vec^T a)^2 ] / N / |a|^2 = max_a E[(X.vec^T a)^2 ] / |a|^2 = lambda1 Do you still think lambda1.est +- 2.SE[lambda1.est] covers the true lambda1 for 95% of datasets? + Do you remember from Stat 541 how to obtain bias-corrected BS CIs? ==> BS quantile method !!! Here is the model calculation -- start with a coverage statement as if you knew the true sampling distribution of lambdaj.est: P[ q.lo <= lambdaj.est <= q.hi ] ~ 0.95 P[ q.lo-lambdaj <= lambdaj.est-lambdaj <= q.hi-lambdaj ] ~ 0.95 P[ -c.lo <= lambdaj.est-lambdaj <= c.hi ] ~ 0.95 Invert the coverage statement to a CI statement: invert by multiplying with -1 P[ c.lo >= lambdaj-lambdaj.est >= -c.hi ] ~ 0.95 P[ lambdaj.est+c.lo >= lambdaj >= lambdaj.est-c.hi ] ~ 0.95 Finally reverse order: P[ lambdaj.est-c.hi <= lambdaj <= lambdaj.est+c.lo ] ~ 0.95 + Transfer the above calculation to the bootstrap simulation: BS principle: In a BS simulation, the sample becomes the population. 
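     Aside -- before spelling out the correspondence below, a quick simulation confirming the
     upward bias of lambda1.est derived above (a sketch: independent normal data, so the true
     correlation matrix is the identity and all true eigenvalues equal 1; N=30, p=8 as in the subset):
       set.seed(2)
       lambda1.sim <- replicate(2000, eigen(cor(matrix(rnorm(30*8), nrow=30)))$values[1])
       mean(lambda1.sim)    # clearly > 1, the true largest eigenvalue ==> upward bias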
Correspondence: lambdaj <-- lambdaj.est lambdaj.est <-- lambdaj.est* (* = from BS samples) BS Recipe: 1) obtain q.lo* and q.hi* as upper and lower 2.5% quantiles of the BS distribution of lambdaj.est* 2) obtain c.lo* <- lambdaj.est - q.lo* c.hi* <- q.hi - lambdaj.est 3) BS CI: [lambdaj.est - c.hi, lambdaj.est + c.lo] + Simplified formulas: [lambdaj.est - c.hi, lambdaj.est + c.lo] = [2*lambdaj.est - q.hi, 2*lambdaj.est + q.lo] ^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^ characteristic bias correction + R implementation: obtain X.evals.bs as above Nboot <- 10000 X.evals.bs <- replicate(Nboot, eigen(cor(X[sample(nrow(X),replace=T),]))$val) rownames(X.evals.bs) <- paste("PC",1:nrow(X.evals.bs),sep="") dim(X.evals.bs) X.evals.bs.q.lo.hi <- apply(X.evals.bs, 1, quantile, prob=c(.025,.975)) X.evals <- eigen(cor(X))$val # observed e'vals rbind(X.evals.bs.q.lo.hi, X.evals) # compare BS quantiles with observed e'vals Graphical presentation of inverted CIs for bias/skewness correction: plot(X.pca$val["variance",], type="b", ylim=c(0,.5+max(X.pca$val["variance",])), ylab="PC Variance", xlab="PC", pch=16, cex=2, lwd=2) # roots of eigenvalues abline(h=c(0,1), col="blue") lines(c(rbind(1:p,1:p,NA)), c(rbind(2*X.pca$val["variance",]-X.evals.bs.q.lo.hi[1,], 2*X.pca$val["variance",]-X.evals.bs.q.lo.hi[2,], NA)), lwd=2, col="red") # The following is WRONG: using the BS quantiles as boundaries of CIs plot(X.pca$val["variance",], type="b", ylim=c(0,.5+max(X.pca$val["variance",])), ylab="PC Variance", xlab="PC", pch=16, cex=2, lwd=2) # roots of eigenvalues abline(h=c(0,1), col="blue") lines(c(rbind(1:p,1:p,NA)), c(rbind(X.evals.bs.q.lo.hi[1,], X.evals.bs.q.lo.hi[2,], NA)), lwd=2, col="brown") # ==> These intervals exacerbate the bias. + Alternative: Perform bias correction, then use symmetric normal intervals +-2SE: X.evals.biascorr <- X.evals - (apply(X.evals.bs, 1, mean) - X.evals) # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ BS bias estimate X.evals.biascorr <- 2*X.evals - apply(X.evals.bs, 1, mean) # same!!! plot(X.pca$val["variance",], type="b", ylim=c(0,.5+max(X.pca$val["variance",])), ylab="PC Variance", xlab="PC", pch=16, cex=2, lwd=2) # roots of eigenvalues abline(h=c(0,1), col="blue") lines(c(rbind(1:p,1:p,NA)), # Show Bias-corrected symmetric CIs in red c(rbind(X.evals.biascorr - 2*X.evals.bs.SE, X.evals.biascorr + 2*X.evals.bs.SE, NA)), lwd=2, col="red") lines(c(rbind(1:p,1:p,NA))+.05, # Quantile-based CIs for comparison in green c(rbind(2*X.pca$val["variance",]-X.evals.bs.q.lo.hi[1,], 2*X.pca$val["variance",]-X.evals.bs.q.lo.hi[2,], NA)), lwd=2, col="green") points(X.evals.biascorr, pch=16, cex=1.5, col="red") # Bias-corrected e'vals title(main="Red: bias-corrected symmetric, Green: quantile inversion") # ==> Very similar intervals + Warning: There is no free lunch. ~ Bootstrap quantile methods are more noisy than symmetric intervals with BS SEs. ~ Bias correction adds noise also. However, the bias problem for large eigenvalues can be such that it must be taken care of. The same may hold for the non-normality of the sampling (dataset-to-dataset) distribution of the eigenvalue estimates. + Do you remember a diagnostic from Stat 541 for 'asymptotic normality'? 
windows(10,5) par(mfrow=c(2,4), mgp=c(1.8, .5, 0), mar=c(3,3,1,1)) for(j in 1:nrow(X.evals.bs)) { qqnorm(X.evals.bs[j,], main="", xlab="", ylab="", pch=16) abline(a=mean(X.evals.bs[j,]), b=sd(X.evals.bs[j,]), col="green", lwd=2) abline(v=c(-2,+2), col="red", lwd=2) } Depending on the random subsample of 'mktg' you are getting, the deviation from normality can be substantial. + Remaining issue: +-2SE is a pointwise benchmark. ==> Calibrate for simultaneity. Recipe: Examine the 8 intervals as to how many BS profiles they entirely contain. This is simple for the centered SE-based intervals: Do the bean counting for various multiples K*SE. You have to simulate nature, though, so you should check coverage from the mean profile of the eigenvalues. R code: for(K in seq(2,3.5,by=.01)) # A ladder of multiples cat(K,": ", mean(apply(abs(X.evals.bs - apply(X.evals.bs,1,mean)) <= K*X.evals.bs.SE, 2, all)), # We need the inequalities for all 8 e'vals; mean() makes proportions "\n") ==> K~2.76 seems to be required for 0.95 simultaneous coverage. (If you redo the exercise with different subsets of 'mktg', you will get different results. The instructor has seen required multipliers as high as K=3.1.) Compare with Bonferroni, assuming normality: -qnorm(.05/2/length(X.evals)) Funny, the multiplier for simultaneity is almost the same. Do not be surprised, though, if the simulation-based K is larger than the Bonferroni-based K. The simulation adjusts for non-normality, for example, whereas our Bonferroni multiplier is based on a normality assumption. - Outlook on past theory: There is some finite-sample and asymptotic theory for PCA under the assumption of normality. Typically it does not get used. - Outlook on modern theory: Random matrix theory There are some neat results that show how strong 'low-rank signals' should be to be correctly detectable at all. + For PCA: Lu 2002, Baik&Silverstein 2006, Paul (2007), Nadler (2008), ... + For a low-rank-mean+noise model: Shabalin&Nobel (2010) (See a slide from Dan Yang''s 2012 talk.) ------------------------- - Odds and Ends: . Background about eigendecomps: simultaneous diagonalization Given two quadratic forms Q1(x) = x^T S1 x and Q2(x) = x^T S2 x, where S1 and S2 are symmetric matrices, S2 positive definite, there exists a generalized eigendecomp for S1 wrt S2 based on the Rayleigh quotient R(x) = x^T S1 x / x^T S2 x = Q1(x) / Q2(x) The generalized eigendecomp of S1 wrt S2 diagonalizes S1 and S2 simultaneously. . The LS interpretation of the SVD of X (actually: X.std): + Frobenius squared norm of an arbitrary matrix X: |X|^2 = sum_nj X_nj^2 = tr(X X^T) = tr(X^T X) = Euclidean squared norm if X is strung out as a vector + LS criterion for rank-one approximation: |X - d1 U1 V1^T|^2 = min_{d1,U1,V1} ==> svd provides the minimizing d1,U1,V1 ==> Generalizes to arbitrary low-rank approximation. ==> Reduction of svd to two alternating regressions . SVD and Rayleigh quotient of an arbitrary matrix X: + R(U1,V1) = (U1^T X V1) / (|U1| |V1|) + The pairs of singular vectors are the stationary solutions for R(U1,U2). + The largest singular value is d1 = max_{U1,V1} R(U1,V1) + The 2nd largest singular value is d2 = max_{U2,V2} R(U2,V2) subject to U2 _|_ U1, V2 _|_ V1 ... . Power algorithms for eigenvectors of a symmetric matrix S: + Simple power iterations: Initialize V1 randomly !=0, normalize V1 <- V1/|V1| Repeat till convergence: V1 <- S V1 V1 <- V1/|V1| Convergence will be to an eigenvalue for the largest eigenvalue. 
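     A minimal sketch of these simple power iterations, run on Sig.est from above
     (assumes Sig.est and Sig.est.eig are still available for comparison):
       set.seed(3)
       v1 <- rnorm(ncol(Sig.est));  v1 <- v1/sqrt(sum(v1^2))        # random start, normalized
       for(it in 1:200) { v1 <- Sig.est %*% v1;  v1 <- v1/sqrt(sum(v1^2)) }
       c(crossprod(v1, Sig.est %*% v1))                 # Rayleigh quotient ~ largest eigenvalue
       Sig.est.eig$val[1]                               # compare
       round(abs(v1) - abs(Sig.est.eig$vec[,1]), 8)     # eigenvector agreement, up to sign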
+ 2nd eigenvector: Repeat power algorithm with V2 subject to _|_ V1. ... + Simultaneous power iterations: Initialize V=(V1,V2,...Vq), orthonormalize columns: V^T V = I Repeat till convergence: V <- S V Orthonormalize columns: V^T V = I Convergence will be to a q-tuple of eigenvectors for the largest q eigenvalues. Permitted: q=p, in which case one gets a basis of eigenvectors with eigenvalues sorted in descending magnitude. (Caution for negative eigenvalues: convergence of even or odd iterates.). . Power algorithms for SVDs of an arbitrary matrix X: + Simple power iterations: Initialize V1 randomly !=0, normalize V1 <- V1/|V1| Repeat till convergence: U1 <- S V1 U1 <- U1/|U1| V1 <- S^T U1 V1 <- V1/|V1| Convergence will be to ... + Simultaneous iterations obvious ... ================================================================ * CANONICAL CORRELATION ANALYSIS -- CCA: - Idea of CCA: ... Divide the variables into two blocks, called X, Y: sizes: Nxp, Nxq; this division is driven by subject matter. Ask for linear combinations X a, Y b such that --------------------------------------------- | | | (a,b) = argmax_{a,b != 0} cor(X a, Y b) | X, Y full rank | | --------------------------------------------- - Again, there is a population and a sample/data version: . Population: random vectors Xvec (px1), Yvec (qx1) with a joint distribution P(dx,dy) . Data: X (Nxp), Y (Nxq) = row-matched random matrices . Relation: + The rows Xvecn of X are iid draws from Xvec. + The rows Yvecn of Y are iid draws from Yvec. - Simple data example: 'mktg', volume variables versus needs&demographics X <- as.matrix(mktg[,4:8]) Y <- as.matrix(mktg[,1:3]) round(cor(X,Y), 2) XY.cca <- cancor(X,Y) XY.cca - Terminology: . Stationary solutions of the CCA problem are called 'canonical variates' or CVs . The X part is the 'left CV'; the Y part is the 'right CV'. - Basics: . How many CVs (Canonical Variates) are there for p=5, q=3? ... . Are the CVs uniquely determined when the canonical corrs are different? ... No, any POSITIVE multiple of a and b is also a solution, and simultaneous sign flips in a and b also produces a solution. . Do you expect CV coefficient vectors to be orthogonal to each other? ... 'orthogonal' in the Euclidean sense ... So what are they? ... 'orthogonal = uncorrelated linear combinations' makes sense . Can you immediately tell from the coefficients which variables are strong? ... What could you do for better comparison? ... ==> Standardize before running CCA! X.std <- scale(X) Y.std <- scale(Y) XY.std.cca <- cancor(X.std,Y.std) # Compare canonical correlations: rbind(XY.std.cca$cor, XY.cca$cor) # Why are the above the same? # Compare coefficients: XY.std.cca$xcoef; XY.cca$xcoef # Not the same XY.std.cca$ycoef; XY.cca$ycoef # Nearly the same because # mktg[,1:3] are nearly stdized. Read stories, if any, from these coefficients. XY.std.cca$xcoef; XY.std.cca$ycoef . Doesn''t it feel like cancorr is somewhere between regression and PCA? It is not a popular method, yet one needs to know about it because of other methods that derive from it. - Algebra of CCA: . Optimization problem: cor(X a, Y b) = max/min/stationary_{a,b} ??? cor(X a, Y b) = cov(X a, Y b)/sqrt(var(X a) var(Y b)) # (N-1) drops out = a^T CXY b / sqrt( a^T CXX a b^T CYY b ) where CXY, CXX, CYY are the covariance matrices: assuming X, Y are both centered, we have CXX = X^T X/ (N-1) CYY = Y^T Y/ (N-1) CXY = X^T Y/ (N-1) . Idea: Reduce to Euclidean denominators through 'sphering'/'pre-whitening' of the X and Y blocks. 
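     Numeric preview of what sphering/pre-whitening accomplishes, before the algebra continues
     below (a sketch; it assumes the 'mktg' data from earlier and uses chol() for one valid root RX):
       Xblk <- scale(as.matrix(mktg[,4:8]), scale=F)    # centered X block
       CXX  <- crossprod(Xblk) / (nrow(Xblk)-1)
       RX   <- chol(CXX)                                # upper triangular with RX^T RX = CXX
       Xpw  <- Xblk %*% solve(RX)                       # pre-whitened/sphered block
       round(crossprod(Xpw) / (nrow(Xpw)-1), 10)        # = identity: uncorrelated, unit variances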
==> Find roots RX and RY of CXX and CYY. Could be RX = CXX^{1/2}, RY = CYY^{1/2}, but any RX^T RX = CXX, RY^T RY = CYY will do. Can be obtained, e.g., with a Cholesky decomp in R: chol() CXX = RX^T RX CYY = RY^T RY aa = RX a # Coordinate/basis change bb = RY b # Same . Why the terms 'sphering' and 'pre-whitening'? Xpw := X RX^{-1} satisfies (assuming centered X): cov(Xpw) = Xpw^T Xpw / (N-1) = RX^{-T} X^T X RX^{-1} / (N-1) = RX^{-T} CXX RX^{-1} = I (pxp) ==> Xpw has columns that are uncorrelated and have equal variance (=1). ~ 'sphered' or 'spherically symmetric' covariance matrix ~ 'white noise' if Gaussian (data version, though) Dito for Ypw := Y RY^{-1}: cov(Ypw) = I (qxq) . Continue with CCA problem: cor(X a, Y b) = a^T CXY b / sqrt(a^T CXX a * b^T CYY b) = aa^T RX^{-T} CXY RY^{-1} bb / sqrt(aa^T aa bb^T b) = aa^T (RX^{-T} (X^T Y) RY^{-1}) bb / (|aa| |bb|) = max/min/stationary wrt aa and bb ==> Perform an SVD of RX^{-T} CXY RY^{-1}, then back-transform: a <- RX^{-1} aa; b <- RY^{-1} bb . The pretty form of the CCA problem: RX = CXX^{-1/2}, RY = CYY^{-1/2}, both symmetric. Hence: --------------------------------------------- | | | CCA = SVD of CXX^{-1/2} CXY CYY^{-1/2} | | | | up to a coordinate transformation | | | --------------------------------------------- Keep in mind the purpose of the front and back factors: 'sphering', 'pre-whitening' . Another way to think about sphering/pre-whitening: + Orthogonalize the columns of X and Y (separately) with modified Gram-Schmidt, + but use covariance as the inner product. and variance as squared length/norm. . Spatial/geometric interpretation: + Form the column spaces V.X = cspan(Xpw) = cspan(X) V.Y = cspan(Ypw) = cspan(Y) + Find x = Xpw aa in V.X and y = Ypw bb in V.Y that maximize cor(x,y) = cor(x, y) = cov(x, y) / (sd(x) sd(y)) = x^T y / (|x| |y|) (factor sqrt(N-1) cancels) ==> Maximize the cosine between x and y. ==> Minimize the angle between x and y. ------------------------------------------ | | | correlation = special case of cosine | | | ------------------------------------------ ==> Hierarchy of stationary cosines / stationary angles between the two subspaces. General fact: The stationary or principal angles between two subspaces are their orthogonal invariants. That is: Any two pairs of subspaces with the same principal angles can be orthogonally rotated into each other. HW: Show in the hierarchy of CC coeffs that xcoef[,1] _|_ ycoef[,2] in sphered coordinates or in the sense of covariance. . Another way to look at the CCA problem: + Preparation: The Nxp SVD as an (N+p)x(N+p) eigenproblem | I X | |u| |u| | X^T I | |v| = e |v| These eigenequations are the same as: u + X v = e u X^T u + v = e v or: X v = (e-1) u X^T u = (e-1) v i.e., the SVD problem with singular values d = e-1. Consequence: The Rayleigh quotient u^2 + v^2 + 2 u^T X v --------------------- is stationary u^2 + v^2 iff the bi-Rayleigh quotient (u^T X v)^2 ----------- is stationary. |u|^2 |v|^2 The stationary values of the latter are the stationary values of the former - 1. [Cute problem: Show u^2 = v^2 for the eigenvectors (u^T,v^T)^T of the eigenproblem.] + Applied to the sphered CCA problem: aa^T CXX^{-1/2} CXY CYY^{-1/2} bb --------------------------------- is stationary |aa| |bb| iff aa^2 + bb^2 + 2 aa^T CXX^{-1/2} CXY CYY^{-1/2} bb ------------------------------------------------- is stationary. 
aa^2 + bb^2 + Meaning of this Rayleigh quotient: do back translation a = CXX^{-1/2} aa b = CYY^{-1/2} bb aa = CXX^{1/2} a bb = CYY^{1/2} b aa^2 = a CXX a = V[X a] bb^2 = b CYY b = V[Y b] aa^T CXX^{-1/2} CXY CYY^{-1/2} bb = a^T CXY b = V[X a, Y b] Rayleigh quotient: V[X a] + V[Y b] + 2 V[X a, Y b] V[X a + Y b] ------------------------------- = --------------- V[X a] + V[Y b] V[X a] + V[Y b] Remember the connection to the canonical correlations: canonical correlations = Rayleigh stationary values - 1 . Similarity between regression and CCA: + What happens if you throw highly correlated variables into the X block? (Similar for the Y block.) ... + What is the problem called? ... + What can you do about it? ... subset selection in both blocks; shrink them; block-internal dim reduction + What kind of inference would you like to have for the coefficients? ... t-based inference + What would you miss if you performed inference like in regression? ... + How would you perform such inference anyway? ... - Software: The following is inefficient but pretty code. It uses the symmetric roots CXX^{1/2} and CXX^{-1/2}. cca <- function(X, Y, CV=FALSE, center=TRUE, scale=TRUE) { # CV=TRUE: want canonical variates X <- as.matrix(X) Y <- as.matrix(Y) N <- nrow(X); if(nrow(Y) != N) { cat("X and Y row numbers not the same...\n"); return() } Xs <- scale(X, center=center, scale=scale) Ys <- scale(Y, center=center, scale=scale) CXX <- t(Xs) %*% Xs / (N-1) # pedestrian cov() or cor() CYY <- t(Ys) %*% Ys / (N-1) CXY <- t(Xs) %*% Ys / (N-1) CXX.eig <- eigen(CXX); CXX.eig$values <- pmax(0,CXX.eig$values) CYY.eig <- eigen(CYY); CYY.eig$values <- pmax(0,CYY.eig$values) RX <- CXX.eig$vec %*% diag(CXX.eig$val^{ 1/2}) %*% t(CXX.eig$vec) RX.inv <- CXX.eig$vec %*% diag(CXX.eig$val^{-1/2}) %*% t(CXX.eig$vec) RY <- CYY.eig$vec %*% diag(CYY.eig$val^{ 1/2}) %*% t(CYY.eig$vec) RY.inv <- CYY.eig$vec %*% diag(CYY.eig$val^{-1/2}) %*% t(CYY.eig$vec) CXY.pw <- RX.inv %*% CXY %*% RY.inv CXY.pw.svd <- svd(CXY.pw) CCA <- list(cor = CXY.pw.svd$d, xcoef = RX.inv %*% CXY.pw.svd$u, ycoef = RY.inv %*% CXY.pw.svd$v ) rownames(CCA$xcoef) <- colnames(X) rownames(CCA$ycoef) <- colnames(Y) colnames(CCA$xcoef) <- paste("CV", 1:ncol(CCA$xcoef), sep="") colnames(CCA$ycoef) <- paste("CV", 1:ncol(CCA$ycoef), sep="") names(CCA$cor) <- paste("CV", 1:length(CCA$cor), sep="") if(CV) { CCA$xCV <- Xs %*% CCA$xcoef CCA$yCV <- Ys %*% CCA$ycoef CCA$xscaled <- Xs CCA$yscaled <- Ys } CCA } # Example application: X <- mktg[,4:8] Y <- mktg[,1:3] XY.cca <- cca(X,Y) XY.cca1 <- cancor(scale(X), scale(Y)) XY.cca; XY.cca1 # cancor() has strange scaling of coeffs: round(crossprod(scale(X) %*% XY.cca1$xcoef), 10) # ==> CVs have unit Euclidean length, not unit variance... # Our CVs are sphered, in particular they have unit variance: XY.cca <- cca(X,Y,CV=T) round(cov(XY.cca$xCV), 10) round(cov(XY.cca$yCV), 10) - Plots: . Canonical correlation profile: plot(XY.cca$cor, ylim=c(0,1.2*max(XY.cca$cor)), pch=16, cex=1.5, xaxt="n", xlab="CV Order", ylab="Can Corr") axis(side=1, at=seq(along=XY.cca$cor), lab=names(XY.cca$cor)) lines(XY.cca$cor, lwd=2) abline(h=0, col="blue") . 
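     One more numeric check before the coefficient plots (a sketch; it reuses XY.cca from the
     example above, which was computed with CV=T): matched left/right CVs reproduce the canonical
     correlations, while unmatched pairs are uncorrelated.
       round(diag(cor(XY.cca$xCV, XY.cca$yCV)) - XY.cca$cor, 10)   # matched pairs: the can. corrs
       round(cor(XY.cca$xCV, XY.cca$yCV), 3)                       # off-diagonal entries ~ 0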
CC coefficient plots: x-coeffs only; should be packaged in a function pq <- length(XY.cca$cor) clrs <- sample(colors()[25:60], size=pq) # Random colors plot(c(0,nrow(XY.cca$xcoef)+0.5), c(-1,1)*max(abs(XY.cca$xcoef)), xaxt="n", xlab="X-Variables", ylab="CCA X-Coeffs", type="n") abline(h=0, col="gray") for(i in 1:pq) lines(1:nrow(XY.cca$xcoef), XY.cca$xcoef[,i], lwd=2, col=clrs[i]) for(i in 1:pq) points(1:nrow(XY.cca$xcoef), XY.cca$xcoef[,i], pch=16, col=clrs[i]) text(rep(1,pq), XY.cca$xcoef[1,], names(XY.cca$cor), adj=1.2) axis(side=1, at=1:nrow(XY.cca$xcoef), lab=rownames(XY.cca$xcoef)) . CVs: In which frames do you see the canonical correlations? This is a considerable trick question!!! windows(); lims <- c(-4,4) pairs.plus(cbind(XY.cca$xCV, XY.cca$yCV), cex=.5, xlim=lims, ylim=lims) Remind yourself of the CCA structure: round(cor(cbind(XY.cca$xCV, XY.cca$yCV)), 3) Go back to the pairwise plots and wonder why some of them show zero correlations! Here is version with some jitter added: pairs.plus(cbind(XY.cca$xCV, XY.cca$yCV)+runif(len(c(XY.cca$xCV, XY.cca$yCV)),-.2,+.2), pch=".", cex=1.5, xlim=lims, ylim=lims) Do you see the zero correlations better now? - Permutation inference for canonical correlations: . Plot the profile, which has only length 3 in the example: N <- nrow(mktg) sel <- sample(N, size=400) # sel <- T # select all X <- mktg[sel,4:8] Y <- mktg[sel,1:3] XY.cca <- cca(X,Y,CV=T) pq <- length(XY.cca$cor) plot(XY.cca$cor, pch=16, cex=2, type="b", ylim=c(0,1.2*max(XY.cca$cor)), xlim=c(0,pq+1), ylab="Canonical Correlations", xaxt="n", xlab="Order of Canonical Correlations") axis(side=1, at=1:pq, lab=paste("CC",1:pq,sep="")) abline(h=0, col="gray") . Permutation inference: H0 = ... Spaghetti decoration: add to the above for(i in 1:100) lines(cca(X,Y[sample(nrow(Y)),])$cor, col="gray") lines(XY.cca$cor, lwd=2); points(XY.cca$cor, pch=16, cex=2) Vary 'size=...' above: 25, 100, 400, 1600 . Questions: + Do we have a budget constraint on the canonical correlations? ... + Is it possible that all can.corrs. = 1? What would it mean? ... + Implications for testing the canonical correlations with an independence assumption? ... - Bootstrap inference for canonical correlations: . BS simulation: Nboot <- 1000 XY.cca.bs <- replicate(Nboot, { sel <- sample(nrow(X), replace=T) cca(X[sel,], Y[sel,])$cor }) dim(XY.cca.bs) XY.cca.bs.q.lo.hi <- apply(XY.cca.bs, 1, quantile, prob=c(.025,.975)) XY.cca <- cca(X,Y) # observed . Graphical presentation: add bias/skewness-correcting CIs to the above plot lines(c(rbind(1:pq,1:pq,NA)), c(rbind(pmax(2*XY.cca$cor - XY.cca.bs.q.lo.hi[1,], 0), pmax(2*XY.cca$cor - XY.cca.bs.q.lo.hi[2,], 0), # Why pmax(...,0)? NA)), lwd=2, col="red") . Same for coefficients: XY.cca.xcoef.bs <- replicate(Nboot, { sel <- sample(nrow(X), replace=T) cca(X[sel,], Y[sel,])$xcoef }) dim(XY.cca.xcoef.bs) XY.cca.xcoef.bs.q.lo.hi <- apply(XY.cca.xcoef.bs, c(1,2), quantile, prob=c(.025,.975)) dim(XY.cca.xcoef.bs.q.lo.hi) XY.cca <- cca(X,Y) # observed # Print for each CV the lower bound, the estimate, and the upper bound: for(j in 1:ncol(XY.cca$xcoef)) { cat("\nCV",j,":\n") print(cbind(Lower = 2*XY.cca$xcoef[,j] - XY.cca.xcoef.bs.q.lo.hi[2,,j], Coeff = XY.cca$xcoef[,j], Upper = 2*XY.cca$xcoef[,j] - XY.cca.xcoef.bs.q.lo.hi[1,,j] )) } ================================================================ * GENERALIZED CANONICAL ANALYSIS -- GCA: - Consider a three-block situation . Data version: three matrices X (Nxp), Y (Nxq), Z (Nxr) Assume all columns centered. Lin combs: X a, Y b, Z c . 
   Population version: Xvec (px1), Yvec (qx1), Zvec (rx1)
     Assume all random variables centered.
     Lin combs: a^T Xvec, b^T Yvec, c^T Zvec
   . Proceed with data version in what follows.
   . Goal: Find linear combinations of the three blocks that somehow
     maximally correlate with each other...
     Q: What could that mean ?????
 - Block-internal correlation structure:
   . Should be irrelevant, as in CCA ==> sphere the blocks.
   . Sphering/pre-whitening:
       Xpw = X CXX^{-1/2}   ==>  Xpw^T Xpw / (N-1) = I
       Ypw = Y CYY^{-1/2}   ==>  Ypw^T Ypw / (N-1) = I
       Zpw = Z CZZ^{-1/2}   ==>  Zpw^T Zpw / (N-1) = I
       aa = CXX^{1/2} a     ==>  Xpw aa = X a
       bb = CYY^{1/2} b     ==>  Ypw bb = Y b
       cc = CZZ^{1/2} c     ==>  Zpw cc = Z c
   . If |aa|^2 = 1, then V[Xpw aa] = 1;  same for |bb|^2 = 1 and |cc|^2 = 1.
 - Approach 1:
   . Criterion for 3-block GCA: Oh, well, how about this?
       crit(a,b,c) := cor(Xa,Yb) + cor(Xa,Zc) + cor(Yb,Zc)
   . Pre-whitened:
       cor(Xa,Yb) = cor(Xpw aa, Ypw bb) = aa^T Xpw^T Ypw bb / (|aa||bb|)
       cor(Xa,Zc) = cor(Xpw aa, Zpw cc) = aa^T Xpw^T Zpw cc / (|aa||cc|)
       cor(Yb,Zc) = cor(Ypw bb, Zpw cc) = bb^T Ypw^T Zpw cc / (|bb||cc|)
   . Stationary solutions wrt aa, bb, cc:   'grad_aa' = gradient wrt aa
       grad_aa [ cor(Xpw aa, Ypw bb) + cor(Xpw aa, Zpw cc) + cor(Ypw bb, Zpw cc) ]
       = grad_aa [ cor(Xpw aa, Ypw bb) + cor(Xpw aa, Zpw cc) ]
       = grad_aa [ aa^T Xpw^T Ypw bb / (|aa||bb|) + aa^T Xpw^T Zpw cc / (|aa||cc|) ]
       = grad_aa [ (aa/|aa|)^T Xpw^T ( Ypw bb/|bb| + Zpw cc/|cc| ) ]
       = grad_aa [ (aa/|aa|)^T v ]     where v = all to the right on the previous line
       = ( |aa| v - (aa^T v) aa/|aa| ) / |aa|^2
       = 0   iff   v = (aa^T v)/|aa|^2 aa   where v = Xpw^T ( Ypw bb/|bb| + Zpw cc/|cc| )
     ==> Xpw^T ( Ypw bb/|bb| + Zpw cc/|cc| ) = const(aa) * aa
         Ypw^T ( Xpw aa/|aa| + Zpw cc/|cc| ) = const(bb) * bb
         Zpw^T ( Xpw aa/|aa| + Ypw bb/|bb| ) = const(cc) * cc
     ==> Not a set of eigen equations, whichever way you look at it,
         though this might be an interesting problem, too.
         What makes this NOT an eigenproblem?
         + The independent normalizations aa/|aa|, bb/|bb|, cc/|cc| in each block.
         + The different 'eigenvalues' const(aa), const(bb), const(cc)
           in each block of equations.
 - Approach 2:
   . Learn from Approach 1 and modify the stationary equations
     so they become eigenequations.
       Xpw^T ( Ypw bb + Zpw cc ) = alpha * aa
       Ypw^T ( Xpw aa + Zpw cc ) = alpha * bb
       Zpw^T ( Xpw aa + Ypw bb ) = alpha * cc
   . Recall:  Xpw^T Ypw /(N-1) = CXX^{-1/2} CXY CYY^{-1/2} =: CXY.pw
     Similar for the other two cross-covariances.
   . So the above system is the eigen equation for this matrix:
       |   0      CXY.pw  CXZ.pw |
       | CYX.pw     0     CYZ.pw |
       | CZX.pw  CZY.pw     0    |
   . We may add the identity and obtain the same eigenvectors,
     with eigenvalues shifted by +1:
       |   I      CXY.pw  CXZ.pw |
       | CYX.pw     I     CYZ.pw |
       | CZX.pw  CZY.pw     I    |
     These identity matrices can be interpreted as CXX.pw, CYY.pw, CZZ.pw:
       | CXX.pw  CXY.pw  CXZ.pw |
       | CYX.pw  CYY.pw  CYZ.pw |
       | CZX.pw  CZY.pw  CZZ.pw |
   . The eigen equations:
       | CXX.pw  CXY.pw  CXZ.pw | |aa|          |aa|
       | CYX.pw  CYY.pw  CYZ.pw | |bb|  = beta  |bb|
       | CZX.pw  CZY.pw  CZZ.pw | |cc|          |cc|
Associated Rayleigh quotient: V[ X.pw aa + Y.pw bb + Z.pw cc ] ------------------------------------------ V[ X.pw aa ] + V[ Y.pw bb ] + V[ Z.pw cc ] - Larger conclusion: --------------------------------------------------------------- | | | A NULL COMPARISON PRINCIPLE OF MULTIVARIATE ANALYSIS: | | | | Classical multivariate analysis is a variance comparison, | | one variance being the of observed linear combinations, | | the other being the variance calculated under an assumption | | of absent correlation. | | | --------------------------------------------------------------- . Special cases: Null/denominator variances for PCA, CCA and GCA o PCA: Var.0(X a) = sum aj^2 Var(Xj) o CCA: Var.0(X a + Y b) = Var(X a) + Var(Y b) o GCA: Var.0(X a + Y b + Z c) = Var(X a) + Var(Y b) + Var(Z c) . Correlation- vs covariance-based PCA in the null comparison framework: Var.0(X a) = sum aj^2 Var(Xj) results in correlation-based PCA. Covariance-based PCA can be covered by adding to the null assumption Var(Xj) = same (j=1...p) The null variance becomes: Var.0(X a) = sum aj^2 * ave(Var(Xj)) I.e., Var.0(X a) = sum aj^2, the Euclidean norm, as ave(Var(Xj)) is just a known constant. . Observations for GCA: + Unlike the two-block CCA problem, the three denominator variances will not generally be equal at stationarity. Example: If the Z block is uncorrelated from the X and Y blocks, then stationary solutions involving X and Y will set cc=0. + The three CCA problems of one block vs the union of the two others, in general do not produce the three-block GCA problem: CCA of X vs (Y,Z) CCA of Y vs (X,Z) CCA of Z vs (X,Y) GCA of X vs Y vs Z These are in general 4 separate problems. Which one is chosen depends on subject matter. + Lesson from Approach 1 and Approach 2: Do not obsess over one formulation of a problem. If a problem formulation is unwieldy, try another one that might be more manageable and whose solutions provide relevant answers. - Code for GCA: gca <- function(..., center=T, scale=T, GCV=FALSE) { # GCV=TRUE: want GCV variates blocks <- list(...) if(length(blocks)==1 & is.list(blocks[[1]])) blocks <- blocks[[1]] if(length(blocks)<2) { cat("Fewer than 2 blocks provided.\n"); return() } for(i in 1:length(blocks)) blocks[[i]] <- as.matrix(blocks[[i]]) N <- unique(sapply(blocks, nrow)) if(length(N)!=1) { cat("Row numbers of blocks not the same.\n"); return() } blocks.s <- lapply(blocks, scale, center=center, scale=scale) roots <- lapply(blocks.s, function(X) { CXX <- crossprod(X)/(nrow(X)-1); CXX.eig <- eigen(CXX) RX <- CXX.eig$vec %*% diag(CXX.eig$val^{ 1/2}) %*% t(CXX.eig$vec) RX.inv <- CXX.eig$vec %*% diag(CXX.eig$val^{-1/2}) %*% t(CXX.eig$vec) list(raw=RX, inv=RX.inv, eig=CXX.eig) } ) blocks.pw <- lapply(1:length(blocks), function(j) blocks.s[[j]] %*% roots[[j]]$inv ) # all.pw.cbind <- matrix(unlist(blocks.pw), nrow=N) # Better ways to cbind from a list??? # YES! 
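    # (Aside added here: do.call(f, lst) calls f with the elements of the list lst as its
    #  arguments, i.e., do.call(cbind, list(A,B,C)) is cbind(A,B,C) -- handy when the number
    #  of blocks is not fixed in advance.)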
Thanks to Josh: 'do.call()' all.pw.cbind <- do.call(cbind, blocks.pw) block.indeces <- rep(1:length(blocks.pw), sapply(blocks.pw, ncol)) COV <- cov(all.pw.cbind) GCA <- eigen(COV) names(GCA$values) <- paste("CV", 1:length(GCA$val), sep="") colnames(GCA$vectors) <- paste("CV", 1:ncol(GCA$vectors), sep="") for(j in unique(block.indeces)) GCA$vectors[block.indeces==j,] <- roots[[j]]$inv %*% GCA$vectors[block.indeces==j,] rownames(GCA$vectors) <- paste("(",block.indeces,") ",unlist(sapply(blocks,colnames)),sep="") colnames(GCA$vectors) <- paste("CV", 1:ncol(GCA$vectors), sep="") if(GCV) { GCA$CV <- GCA$vectors } GCA } # Example application: mktg.gca <- gca(mktg[,1:3], mktg[,4:6], mktg[,7:8]) mktg.gca # Yes, you can interpret the coefficients as 'strengths' # because by default 'gca()' standardizes all columns. - Correspondence between CCA and GCA: Two-block GCA should agree with CCA. XY.cca <- cca(mktg[,1:3], mktg[,4:8]) XY.gca <- gca(mktg[,1:3], mktg[,4:8]) XY.cca; XY.gca ==> The canonical correlation values are the first min(p,q) GCA eigenvalues minus 1. The GCA eigenvalues below +1 are 1-canonical correlations in reverse order. The GCA eigenvalues between min(p,q) from both ends are exactly +1. ==> GCA gives the same information as CCA. We can extract the first min(p,q) GCVs and toss the rest. Compare the coefficients: windows(w=2.7, h=8) par(mfrow=c(3,1), mgp=c(1.5,.5,0), mar=c(3,3,1,1)) for(j in 1:3) plot.plus(cbind(c(XY.cca$xcoef[,j]), c(XY.gca$vectors[1:3,j]))) for(j in 1:3) plot.plus(cbind(c(XY.cca$ycoef[,j]), c(XY.gca$vectors[4:8,j]))) ==> Up to scaling the coefficients are the same. ================================================================ * FISHER''S OPTIMAL SCORING: CCA APPLIED TO CATEGORICAL DATA - Consider two categorical variables such as mktg[,c("edu","seg")] Well, "edu" is ordinal, but still. - Q: How do we measure association between two categorical variables? (They can be nominal or ordinal.) A: ... Standard answer: the chi-squared statistic for independence Chi-squared recipe: . tabulate . calculate expected counts under independence . sum up over all cells: (obs.count - exp.count)^2/exp.count . compare with a chi-squared distribution with ... degrees of freedom . Stine-Foster transform chi-squared to Cramer''s V for normalization ... - Alternative: correlation . Naive: if the levels are ordinal, score them with integers 1,2,...; use the integer scores as if the variables were quantitative. . Idea: Assign scores to the categories, then treat the variable as quantitative. . Next idea: Optimize the scores of both variables wrt correlation. ==> Fisher''s optimal scoring! - Execution of the idea of Fisher''s optimal scoring: . Introduce two blocks of dummy variables associated with the two categorical variables: X <- cbind(....) Y <- cbind(....) . If the scores for the categories are collected in vectors 'a' and 'b', then the scored variables are: X a and Y b . Maximize 'a' and 'b' wrt to correlation: cor(X%*%a, Y%*%b) - Example: Dummify (spelling?) a categorical variable -- R coding make.dummies <- function(categ) { # Thanks, Josh! categ.f <- factor(categ) # Make sure the categorial variable is a factor. X <- do.call(cbind, lapply(levels(categ.f), function(lev) as.numeric(categ.f==lev))) colnames(X) <- levels(categ.f) # cbind() does not see the column labels rownames(X) <- names(categ) # Carry over the names if any. X } data.frame(mktg[1:10,"edu"], make.dummies(mktg[1:10,"edu"])) # Seems to work... - Example: Try CCA on the dummy blocks of two categorical variables. 
There shall be issues! x <- mktg[,"edu"] y <- mktg[,"age"] X <- make.dummies(x) Y <- make.dummies(y) cca(X, Y) # Where could this have gone wrong? # ... # Remember the issue with a complete set of dummies? # ... # How does it affect CCA? # ... # What is the standard solution? # ... XY.cca <- cca(X[,-1], Y[,-1]) XY.cca XY.cca <- cca(X[,-2], Y[,-2]) XY.cca # New issue: We are now missing the score # for the left-out dummy. What is it? # (Recall how the CVs are computed!) # ... # ==> This is an asymmetric way of scoring, # making the left-out category the base. - Example, same data but another approach: # Don't center, only scale the dummies! # (Check the code for cca() earlier; it accommodates # scale(..., center=T/F, scale=T/F).) XY.cca <- cca(X, Y, center=F, scale=F, CV=T) XY.cca[1:3] # Ignore the variates; corrs and coeffs only ## $cor ## CV1 CV2 CV3 CV4 CV5 ## 1.00000000 0.21718116 0.08657873 0.06404319 0.02222852 ## $xcoef ## CV1 CV2 CV3 CV4 CV5 ## 0 -0.9997404 -6.270201355 1.74918526 7.0569367 -9.42699080 ## 1 -0.9997404 -5.861549967 1.22444403 5.0793966 -4.41281153 ## 2 -0.9997404 -2.994472073 -1.97064994 0.8012265 3.60216868 ## 3 -0.9997404 -0.511181415 1.11051688 -0.9943952 0.01677355 ## 4 -0.9997404 0.310283343 0.13067035 0.4708330 0.26974546 ## 5 -0.9997404 1.155826412 0.07483341 1.0816856 0.12792218 ## 6 -0.9997404 -0.002502345 -1.64999925 -0.8206683 -1.00345941 ## $ycoef ## CV1 CV2 CV3 CV4 CV5 ## 0 -0.9997404 1.24516310 -0.2394911 2.3577747 1.5899922 ## 1 -0.9997404 0.86168024 1.7881059 -0.4111201 -0.4244355 ## 2 -0.9997404 0.74751366 -1.2375788 -0.1148203 -1.2301481 ## 3 -0.9997404 -0.03049887 -0.5300292 -1.4477710 1.4623189 ## 4 -0.9997404 -1.36176958 0.1514422 0.4519488 -0.2095481 # Tell the story of the second CCA!!! Age vs Edu # What do you expect the first CV to be? range(XY.cca.$xCV[,1]) range(XY.cca$yCV[,1]) # Explain: ... # Hence where do you expect to be the first non-trivial CV? plot.plus(cbind(-XY.cca$xCV[,2], -XY.cca$yCV[,2]), jitter=.5, xlab="edu", ylab="age", col=c("red","blue","green","brown")[mktg[,"seg"]]) abline(lm(XY.cca$yCV[,2] ~ XY.cca$xCV[,2]), col="gray", lwd=2) - Comparison of removing a dummy and not centering: . Is there a difference in scores for the categories as found in cca()$xcoef and cca()ycoef? ... cca(X[,-1], Y[,-1]) cca(X, Y, center=F) Yes, there is because + when removing a dummy, the base score is forced to zero; + when not centering, the CVs must have zero cross-moment with (1,1,1,...) because this is the first uncentered CV. . Is there a difference in CVs as found by cca()$xCV and cca()$yCV? XY.cca.0 <- cca(X[,-1],Y[,-1], CV=T) XY.cca.2 <- cca(X,Y,center=F, CV=T) pairs.plus(cbind(XY.cca.0$xCV, XY.cca.2$xCV)) Apparently not! Have a better look: windows(w=3, h=9) par(mfrow=c(3,1), mgp=c(1.5,.5,0), mar=c(3,3,1,1)) for(j in 1:3) { plot.plus(cbind(XY.cca.0$xCV[,j], XY.cca.2$xCV[,j+1]), jitter=0) abline(a=0, b=1); abline(a=0, b=-1) } . Q: How is it possible that the CVs are the same when the base category was scored with zero? That''s a tough one! ... A: When we drop one dummy, CCA centers the dummies!!! It isn''t the left-out dummy that has a zero coefficient; rather, it is its centered version!!! Centering of the included dummies moves the score of the base category away from zero. - Summary: . Fisher''s optimal scoring describes the association between two categorical variables in terms of correlation beween optimally scored categories in both variables. . 
The complete description of the association is given by + the hierarchy of scores and + the hierarchy of optimal correlations, of which there is more than one when both variables have more than one category. - Q: How would you describe the strength of the association? Is the optimal correlation the only way? XY.cca.2$cor What about the sum of stationary correlations squared? sum(XY.cca.2$cor[-1]^2) Show that this equals the squared Frobenius norm | CXX^{-1/2} CXY CYY^{-1/2} |^2 In some sense this is "total linear association" between the two blocks. - Matt''s observation: X^T Y can be interpreted as ... ================================================================ * MORE BACKGROUND ON CCA - Centering versus appending intercept columns: . In order to write the CCA problem with lin alg, we said we would center all the columns: X -> X.c, Y -> Y.c The CCA problem then becomes (X.c a)^T (Y.c b) cor(X a, Y b) = ----------------- = max/stationary_{a,b} |X.c a| * |Y.c b| . Alternatively, we can augment X and Y both with an intercept column: X -> X.i = (1,X), Y -> Y.i = (1,Y) The new CCA problem is now in terms of uncentered cross-moments: (X.i a)^T (Y.i b) cross(X.i a, Y.i b) = ----------------- = max/stationary_{a,b} |X.i a| * |Y.i b| This problem is equivalent to the correlation/centering-based problem because it generates a trivial solution 1=1 with perfect cross()=1, but all subsequent solutions will be orthogonal to 1, hence centered. . In the future we will therefore write X and Y but mean either a centered version or an intercept-augmented version. . Other variants of the idea include categorical variables with a complete block of dummies that are collinear with the intercept vector. . Q: Remind yourself, what is the notion of orthogonality in CCA? A: Orthogonal means the variates are uncorrelated or have zero cross-moment. The metric is |X a|, not |a|, and |Y b|, not |b|. Correspondingly orthogonality means (X a)^T (Y b) = 0, NOT a^T b = 0. - Equivalence of LS and max-corr for regression: Consider a regression situation, modeling y ~ X a. Consider three criteria: RSS(a) = |y - X a|^2 (X centered or with intercept) y^T X a cross(a) = --------- (Interpretation?) |y| |X a| |y - X a|^2 sph(a) = ----------- (Interpretation?) |X a|^2 Fact 0: argmin cross(a) is determined only up to a factor > 0. Fact 1: argmin cross(a) ~ argmin RSS(a) up to a factor > 0. Proof: Stationarity of cross(a) yields ... X^T y - ... X^T X a = 0 where ... are scalars. It follows a ~ (X^T X)^{-1} X^T y up to a factor.. Fact 2: argmin sph(a) ~ argmin RSS(a) up to a factor > 0. Proof: Stationarity of sph(a) yields (-2 X^T y + 2 X^T X a) ... - ... (2 X^T X a) = 0 where again ... are scalars. It follows a ~ (X^T X)^{-1} X^T y up to a factor.. - Extension of the equivalence to CCA: . The CCA problem is b^T Y^T X a cross(a) = ------------ |Y b| |X a| . Compare with the asymmetric LS criterion: |Y b - X a|^2 LS(a,b) = ------------- |Y b|^2 . Note: LS(b) = min_a LS(a,b) determines b only up to a factor. . Stationarities: d/da cross = 0: a ~ (X^T X)^{-1} X^T Y b d/db cross = 0: b ~ (Y^T Y)^{-1} Y^T X a d/da LS = 0: a = (X^T X)^{-1} X^T Y b d/db LS = 0: b ~ (Y^T Y)^{-1} Y^T X a ==> Directionally the same solutions. . Corollary: CCA can be solved by iterative regressions. . Minimize explicitly min_a LS(a,b): .................... min_a |Yb - Xa|^2/|Yb|^2 = |Yb - H_X Yb|^2 / |Yb|^2 = |(I-H_X)Yb|^2 / |Yb|^2 Partial wrt b: ~ Y^T (I-H_X) Y b * ... - ... * Y^T Y b = 0 ==> Y^T (I-H_X) Y b ~ Y^T Y b ==> (Y^T Y)^{-1} Y^T (I-H_X) Y b = ... 
b (eigen equation for b) ==> Y (Y^T Y)^{-1} Y^T (I-H_X) Y b = ... Yb (eigen equation for b) ==> H_y (I-H_X) Y b = ... Yb (eigen equation for b) ==> Yb - H_Y H_X Yb = ... Yb ==> H_Y H_X Yb = ... Yb ==> Yb is eigenvectors of H_Y H_X Do they exist??? Surprise: H_Y H_X: cspan(Y) -> cspan(Y) is symmetric and >=0 ! Proof: ... ================================================================ * FISHER''S LINEAR DISCRIMINANT ANALYSIS: - Data situation: . Grouped data, start with TWO groups (for now). . Formalize the data as coming from two populations. - Question 1: If we could observe only a single feature X to tell the populations/groups apart, what characteristics would we like it to have? Assume you know the class-conditional densities p(x|1) and p(x|0). ... - Question 2: If we were only told the two group means mu1 and mu2 and the two variances sig1^2 and sig2^2 of the quantitative feature X, what characteristics would we like them to have? ... - Question 3: Given p variables (collected in Xvec) characterizing the members of the two groups, if you could not observe Xvec but had the choice to make up an arbitrary univariate function f(Xvec) for telling the groups apart, what would f(Xvec) be? Again, assume you knew p(xvec|1) and p(xvec|0). ... - Question 4: Given p features/attributes/variables Xvec on the members of the two groups, if we only knew E(Xvec|0)=muvec0 and E(Xvec|1)=muvec1 and V(Xvec|0)=Sig0 and E(Xvec|1)=Sig1, and if you were given the choice to make up a linear (!) function of Xvec to tell the groups apart, what would that function beta^T Xvec be? ... - If you answered Q4 in Fisher''s way, you''d have a generalized Rayleigh quotient to maximize. If you don''t have it in Rayleigh form yet, get it there. It should be a ratio of two quadratic forms. ... [Fisher called the numerator "between-groups variance" and the denominator "within-groups variance".] a^T (mu1 - mu0)(mu1 - mu0)^T a ------------------------------ a^T (Sig1 + Sig0) a - Execute the proper sphering to reduce the generalized eigenproblem to a simple eigenproblem. ... a = (Sig1 + Sig0)^{-1/2} at at = (Sig1 + Sig0)^{+1/2} a at^T (Sig1 + Sig0)^{-1/2} (mu1 - mu0)(mu1 - mu0)^T (Sig1 + Sig0)^{-1/2} at -------------------------------------------------------------------------- at^T at - What rank is the matrix in the Rayleigh numerator? ... - Use the rank info to find the explicit solution to the eigenproblem, then backtransform. ... 1 eigenvector at ~ (Sig1 + Sig0)^{-1/2} (mu1 - mu0) backtransformed a ~ (Sig1 + Sig0)^{-1} (mu1 - mu0) - Deviating from Fisher, conventional linear discriminant analysis or "LDA" assumes the covariance matrices Sig0 and Sig1 are the same: Sig0 = Sig1 =: Sig Execute the simplification that results for the best linear function of the variables to tell the groups apart: ... a ~ Sig^{-1} (mu1 - mu0) Comment: As illuminating the 2-class case is, we don''t see the connection with CCA yet. It is better to deal with the multi-class case. ---------------- - Multiclass Discriminant Analysis (linear or quadratic): . Program: Make idealized model calculations for the general multi-class classification problem. . Model: For covariates x (a random vector, xvec earlier) and classes j=1,...J, we assume we know the class-conditional distribution of x: p(x|j) For each class we are also know a prior p(j) Think of it as the marginal distribution of the classes (nothing Bayesian here). ==> Statistical decision problem with a discrete parameter 'j'. . 
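   Quick aside before developing the Bayes rule below -- a numeric check of the two-class answer
   a ~ Sig^{-1} (mu1 - mu0) against MASS::lda(), using the first two 'mktg' segments
   (a sketch; it assumes 'mktg' with the segment variable in column 9, as used earlier):
     grp2 <- sort(unique(mktg[,9]))[1:2]                    # first two segments
     sel2 <- mktg[,9] %in% grp2
     X2 <- as.matrix(mktg[sel2, 1:8]);  g2 <- mktg[sel2, 9]
     mu.diff  <- colMeans(X2[g2==grp2[2],]) - colMeans(X2[g2==grp2[1],])
     n1 <- sum(g2==grp2[1]);  n2 <- sum(g2==grp2[2])
     Sig.pool <- ((n1-1)*cov(X2[g2==grp2[1],]) + (n2-1)*cov(X2[g2==grp2[2],])) / (n1+n2-2)
     a.fisher <- solve(Sig.pool, mu.diff)                   # Sig^{-1} (mu1 - mu0)
     a.lda    <- lda(X2, grouping=factor(g2))$scaling[,1]   # lda() from MASS
     round(a.fisher / a.lda, 4)    # constant entries (up to rounding) ==> same direction up to scale/sign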
   The Bayes procedure is to pick the j that has the highest posterior probability given x:
     p(j|x) = p(x|j)*p(j) / sum_{k=1...J} p(x|k)*p(k)     (Bayes theorem)
     jhat(x) <- argmax_j p(j|x)
              = argmax_j p(x|j)*p(j)
              = argmin_j [ -log( p(x|j) ) - log(p(j)) ]
   . Task: Assume the covariates to be multivariate normal.
       p(x|j) = N(mu.j, Sig.j)
     Initially allow different Sig.j in each group.
     Later specialize to a common Sig = Sig.j.
     Calculate jhat(x)!
     ...   (Too tedious for print.)
       jhat(x) = argmin_j [ (x-mu.j)^T Sig.j^{-1} (x-mu.j) + log det Sig.j - 2 log p(j) ]
     (this is -2 times the log-posterior, up to a constant that does not depend on j)
     ==> 'Nearest class center classifier' where 'nearest' is in the sense of
         Mahalanobis distance for each class, with penalties for
         o large Sig.j:        + log det Sig.j
         o small class prior:  - 2 log p(j)
   . Assume now identical Sig.j =: Sig:
       jhat(x) = argmin_j [ -2 mu.j^T Sig^{-1} x + mu.j^T Sig^{-1} mu.j - 2 log p(j) ]
     ==> Still a 'Nearest class center classifier', penalized for small class size,
         but it turns out to be based on linear functions,
         hence creates linear separation boundaries.
   . Q: What are the classification boundaries for general Sig.j?
        What are the classification boundaries for shared Sig?
        ...
     A: Quadratic decision boundaries for general Sig.j,
        Linear decision boundaries for shared Sig.
     ==> Quadratic Discriminant Analysis or QDA
         vs Linear Discriminant Analysis or LDA
   . Q: Think through the decision boundaries in LDA for 2 classes!
     ...
   . Q: Think through the decision boundaries in LDA for 2, 3 and 4 classes!
     ...
   . Estimation:
     o The above optimal procedures assume that mu.j, Sig.j or Sig are known.
     o In practice they need to be estimated.
       ==> Require a "large N & small p" regime.
           ("Large p" requires additional assumptions and regularizations.)
   . Q: What are the trade-offs between QDA and LDA?
     ...
   . Even for LDA there might be too many parameters to estimate.
     Generate some ideas to reduce their number!
     + Estimate mu.j^T Sig^{-1}, not Sig^{-1}
     + Naive Bayes: assume Sig = diagonal
     + Ridge: regularize Sig^{-1} by replacing it with (Sig + lambda*Omega)^{-1}
       (Omega fixed, e.g. diagonal > 0)
   . Q: What is the posterior probability p(j|x) in LDA under normal assumptions?
     A:
                     exp( mu.j^T Sig^{-1} x - mu.j^T Sig^{-1} mu.j / 2 ) * p(j)
       p(j|x) = -----------------------------------------------------------------
                 sum_k exp( mu.k^T Sig^{-1} x - mu.k^T Sig^{-1} mu.k / 2 ) * p(k)
       (the quadratic term x^T Sig^{-1} x cancels between numerator and denominator)
       It is this function that estimates the x-conditional class probabilities,
       not the linear function mu.j^T Sig^{-1} x.
       The formula for p(j|x) provides the link function that maps the
       linear function mu.j^T Sig^{-1} x to actual probabilities.
 - Covariances:
   . Define three covariance matrices:
     + Total (marginal):     Sig.T := C[x]               (x = predictor random vector)
     + Within (conditional): Sig.W := sum_j C[x|j]*p(j)  (= Sig in LDA)
     + Between:              Sig.B := C[mu.j]            (j is random)
     (See below for calculations that show what these are.)
   . Comments:
     + Sig.W can have two interpretations:
       o Not assuming shared Sig.j, it is the average Sig.j.
       o Assuming shared Sig.j as in LDA, it is the conditional C[x|j].
       Idea: Following Fisher, you can avoid the shared Sig assumption and work with
             an average Sig instead and get (suboptimal) linear boundaries and
             decision rules, thereby avoiding the optimal but more parameter-rich QDA.
     + mu.j is a random vector with distribution p(j),
       which is why defining Sig.B as C[mu.j] is meaningful.
     + Identities: No normal assumptions, just definitions and 1st & 2nd order calculations.
o mu.j = E[ x | j ] o mu = E[mu.j] = sum_j mu.j*p(j) = E[E[x|j]] = E[x] Wlog, we can assume the data 'x' to be centered, mu=0, if this is convenient. o Sig.B = E[ (mu.j - mu) (mu.j - mu)^T ] # j is random. = sum_j (mu.j - mu) (mu.j - mu)^T p(j) o Sig.W = E[ E[ (x - mu.j) (x - mu.j)^T | j ] ] = sum_j C[x|j] * p(j) LDA: same for all j Sig.W = E[ (x - mu.j) (x - mu.j)^T | j ] o Sig.T = E[ (x-mu)(x-mu)^T ] = E[ ((x-mu.j)+(mu.j-mu)) ((x-mu.j)+(mu.j-mu))^T ] # telescoping = E[ E[ ((x-mu.j)+(mu.j-mu)) ((x-mu.j)+(mu.j-mu))^T | j ] ] = E[ E[ (x-mu.j)(x-mu.j)^T + (mu.j-mu)(x-mu.j)^T + (x-mu.j)(mu.j-mu)^T # both zero! + (mu.j-mu)(mu.j-mu)^T | j ] ] = E[ E[ (x-mu.j)(x-mu.j)^T ] | j ] + E[ (mu.j-mu)(mu.j-mu)^T ] = E[ Sig.j ] + Sig.B = Sig.W + Sig.B ------------------------------ | | | Sig.T = Sig.W + Sig.B | | | ------------------------------ No assumption of identical C[x|j]! Sig.W is just the average C[x|j]. + Q: What rank is Sig.B? A: ... Sig.B is a weighted sum of rank one matrices (mu.j-mu)(mu.j-mu)^T. There are J vectors (mu.j-mu), but they span only a (J-1)-dim space due to sum_j (mu.j-mu) = 0. Therefore the rank of Sig.B is no more than (J-1). (It can be even less if the vectors (mu.j-mu) are in some other way collinear, e.g., if they form a perfect chain on a line. This is unlikely in practice. Your typical estimate of Sig.B is of rank (J-1).) - LDA Dimension Reduction -- Take 1: . Goal: dimension reduction! . Generalizing the thought experiment from last class, if you could only use K < J-1 dimensions (= linear combinations of x) for classification, what would you choose? . Fisher: Maximize the "between-centers" variation while keeping the "Within-class" variation small. a^T Sig.B a R1(a) = ----------- = max_a a^T Sig.W a . Execute using appropriate sphering. ... . Use of the eigenvectors from R1(a): + Sphere the data 'x' and centers: xt = Sig.W^{-1/2} x mut.j = Sig.W^{-1/2} mu.j + Obtain the eigenvectors 'at.k' of Sig.W^{-1/2} Sig.B Sig.W^{-1/2} + Project the sphered data 'xt' and centers 'mut.j' onto the 'at.k'. + Plot the projected centers: at.2^T mut.j vs at.1^T mut.j if a K=2 dim reduction is chosen. + Assign the projected covariate vectors xt to the nearest projected center, where 'nearest' is in Euclidean distance due to the use of sphered coords. + This projection plot will give you a sense of how reasonable the linear boundaries are and how spherically distributed the classes are. . Caution: There is no correspondence between the order of eigenvectors and classes! . An equivalent Rayleigh quotient: !!!!!!!!!!!!!! a^T Sig.B a R2(a) = ----------- = max_a a^T Sig.T a R2(a) and R1(a) are equivalent in the sense that they generate eigenproblems with the same eigen vectors. Proof: Stationarity conditions R1(a): 2* Sig.B a * ... - ... 2* Sig.W a = 0 R2(a): 2* Sig.B a * ... - ... 2* Sig.T a = 0 = 2* Sig.B a * ... - ... 2* (Sig.W+Sig.B) a - Estimation: . Data: xij ~ p(x|j) iid, j=1...J, i=1...N.j . Assumption: E[N.j/N] = p(j) !!!! May not be true either. . Estimates: Replace "E[]" and "E[|j]" with empirical means. ==> Put hats on mu''s and Sig''s. . Write down the estimation version of the max-posterior rule. ... . Is this rule finite sample optimal? ... - Connection with CCA: LDA Dimension reduction -- Take 2 . Setup: Dummify the groups: Y-block Covariates: X-block Y X 1 0 0 x111 x112 x113 ... 1 0 0 x211 x212 x213 ... 0 1 0 x121 x122 x123 ... 0 1 0 x221 x222 x223 ... 0 1 0 x321 x322 x323 ... 0 0 1 x131 x132 x133 ... . 
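    The mktg example below builds this Y-block with a helper make.dummies(), whose code is not
    shown in this part of the notes.  Here is a minimal sketch of what such a dummifier could
    look like (an assumed stand-in, not the original function):
      make.dummies.sketch <- function(f) {
        f <- as.factor(f)
        d <- sapply(levels(f), function(lev) as.numeric(f == lev))  # one 0/1 indicator column per level
        colnames(d) <- levels(f)
        d
      }
      make.dummies.sketch(c("a","b","b","c"))   # small check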
Recall an asymmetric CCA formulation: LS(a,b) = | Yb - Xa |^2 / | Xa |^2 = min_{a,b} Minimize wrt b: Yb = H_Y Xa = argmin_Yb LS(a,b) Numerator term with minimizing Yb: Xa - Yb = (I - H_Y) Xa The LS(a,b) problem becomes: min_b (min_a LS(a,b)) = a^T X^T (I - H_Y) X a / a^T X^T X a + Q: What does (I - H_Y) do? A: ... ==> (I - H_Y) X = groupwise centered X matrix ==> [(I - H_Y) X]^T [(I - H_Y) X] / N = Sig.W (estimated) + Q: What is X^T X? A: Assuming the columns are centered we have X^T X / N = Sig.T As a consequence the CCA problem | Yb - Xa |^2 / | Xa |^2 = min/stat_{a,b} is equivalent to the LDA problem a^T Sig.B a / a^T Sig.T a = max/stat_a - Final Q: Why/when would you even want to use fewer dimensions for classification? A: It is a form of regularization; it reduces parameters estimated. It may improve out-of-sample classification performance (to be found only with CV, not in-sample performance). - Example: Mktg data -- take the segments as the 4 classes X <- scale(mktg[,-9]) cl <- mktg[,9] Y <- make.dummies(cl) seg.cca <- cca(X, Y, CV=T, center=F, scale=F) str(seg.cca) plot.plus(seg.cca$xCV[,c(1,2)], col=c("red","blue","green","black")[cl]) seg.means <- t(sapply(su(cl), function(j) apply(seg.cca$xCV[cl==j,c(1,2)], 2, mean))) points(seg.means, col=c("red","blue","green","black"), pch=16, cex=4) Why are the segment separations so strikingly clear? ---------------------------------------------------------------- * MULTIPLE CORRESPONDENCE ANALYSIS: GCA APPLIED TO CATEGORICAL DATA - Plain correspondence analysis is really Fisher''s optimal scoring with some bells and whistles. It was a cult in France centered on Benzecri. - Multiple correspondence analysis gives one possible solution to the reasonable problem of how to describe the association between multiple categorical variables. - Try: mktg.mca <- gca(make.dummies(mktg[,'age']), make.dummies(mktg[,'edu']), make.dummies(mktg[,'seg'])) round(mktg.mca$val, 3) round(mktg.mca$vec, 3) # Check the zero eigenvectors: something went wrong # ==> The way the gca calculates matrix roots for prewhitening (using eigen decomps) # has enough rounding error to create numerically non-singular matrices. # So it doesn't bomb, instead gives a largely correct solution except for the zero-evals. # The following is correct: mktg.mca <- gca(make.dummies(mktg[,'age']), make.dummies(mktg[,'edu']), make.dummies(mktg[,'seg']), center=F, scale=F) round(mktg.mca$val, 3) round(mktg.mca$vec, 3) ==> Gives reasonably monotone scores to 'age' and 'edu', and potentially reasonable scores to the nominal 'seg'. ---------------------------------------------------------------- * AN ALTERNATIVE TO MULTIPLE CORRESPONDENCE ANALYSIS: 'PRINCALS' OR PCA WITH ALTERNATING LS SCORING - Recount: . Problem: We have some categorical variables and we wish to perform PCA. . Naive approach: Use ordinal variables as quantitative, using sores 0, 1, 2... . Criticism of the naive approach: o It assumes the scores 0, 1, 2, ... make the ordinal variables quantitative. o For truly nominal non-binary data there is no remedy. - Idea of PRINCALS: Generate one fixed set of scores for K (< fhat, R^N --> R^N is linear, and hence a hat matrix exists. If the hat matrix is a projection, we denote it by H: fhat <- H %*% y If the hat matrix is not a projection, we denote it by S for 'smoother': fhat <- S %*% y + In real applications the hat matrix is not needed, but for understanding a smoothing method, it is useful. Q: How can we obtain a hat matrix if the software only provides fits? 
A: Apply the smoothing algorithm to all canonical basis vectors as y! S[,j] <- smoothing.algorithm(x, y=j_th_basis_vector) # Pseudo-R-code + All smoothers have a complexity or bandwidth parameter. They are linear smoothers only for a fixed choice of this parameter. Data-driven choice such as with cross-validation makes them non-linear. + Some R infrastructure: # Toy Data: The X and Y variables: x1 <- scale(mktg[,'in.']); hist(x, col="gray") y1 <- scale(mktg[,'rev']); hist(x, col="gray") plot(x1, y1, pch=16) # ==> Ties in x! It is important for smoothers to be able to handle ties. # Parallel sort according to x: ord <- order(x1) x <- x1[ord] y <- y1[ord] N <- length(x) rm(x1,y1) # Generate the j'th canonical basis vector: canon.basis.vec <- function(j,N) as.numeric(seq(N)==j) # Generate smoother matrix: make.S <- function(x, smoother, ...) { N <- length(x) S <- matrix(NA, nrow=N, ncol=N) for(j in seq(N)) S[,j] <- smoother(x, y=canon.basis.vec(j,N), ...) S } # Plot a basis discretized to the observed x values:: plot.cols <- function(x, mat, type="l", col="blue", lwd=2, pch=16, cex=.5) { ord <- order(x) par(mfcol=c(ceiling(ncol(mat)/2),2), mar=c(2,2,1,1), mgp=c(1.5,.5,0)) for(j in seq(ncol(mat))) { plot(x, mat[,j], type="n", ylab="", xlab="", ylim=range(c(0,mat[,j]))) abline(h=0, col="gray") if(grepl("[bl]",type)) lines(x[ord], mat[ord,j], col=col, lwd=lwd) if(grepl("[bp]",type)) points(x, mat[,j], col=col, pch=pch, cex=cex) } } # Plot selected rows of a hat/smoother matrix plot.rows <- function(sel, x, mat, type="l", col="red", lwd=2, pch=16, cex=.5) { ord <- order(x) x.rows <- mat[sel,]; x.loc <- x[sel] par(mfcol=c(ceiling(length(sel)/2),2), mar=c(3,3,1,1), mgp=c(1.5,.5,0)) for(j in seq(along=sel)) { plot(x, x.rows[j,], type="n", ylab="", xlab="", ylim=range(c(0,x.rows[j,]))) abline(h=0, col="gray"); abline(v=x.loc[j], col="gray") if(grepl("[bl]",type)) lines(x[ord], x.rows[j,ord], col=col, lwd=lwd) if(grepl("[bp]",type)) points(x, x.rows[j,], col=col, pch=pch, cex=cex) } } . PROJECTION SMOOTHERS: Expand X in a basis to form multiple predictors, then use multiple regression. Examples: + POLYNOMIALS: o Nice because they provide a hierarchy of complexity (the degree) but they have unfortunate nonlocal behaviors. o If you ever use polynomials, do not use monomial bases. They may be numerically unstable. Instead use orthogonal polynomial bases such as Hermite polynomials. They have easy recurrence relationships. You can later orthogonalize them wrt to the discretization of the observed x values. o R - Monomials, Hermite polynomials, and empirically orthogonalized polynomials: # Monomial basis, just for curiosity: Not recommended but works for low orders x.mon <- sapply(0:5, function(j) x^j) plot.cols(x, x.mon); rug(jitter(x,5)) # Orthogonalize empirically with a Q-R decomposition: x.mon.orth <- qr.Q(qr(x.mon)) round(crossprod(x.mon.orth), 14) # Check orthogonality, same as t(...)%*%(...) plot.cols(x, x.mon.orth) # Recursion for Hermite polynomials: x.herm <- matrix(NA, ncol=6, nrow=length(x)) x.herm[,1] <- 1 x.herm[,2] <- x for(j in 3:ncol(x.herm)) x.herm[,j] <- x*x.herm[,j-1] - (j-1)*x.herm[,j-2] plot.cols(x, x.herm) # Orthogonolization wrt to sampled points: use a Q-R decomposition x.herm.orth <- qr.Q(qr(x.herm)) max(abs(x.herm.orth - x.mon.orth)) # The same; why? 
plot.cols(x, x.herm.orth) # Hat matrix H.poly <- tcrossprod(x.herm.orth) # same as: x.herm.orth %*% t(x.herm.orth) sel <- seq(1,1926,by=275) # Select columns in steps of 275 plot.rows(sel, x, H.poly) # Plot weights at 8 locations (8 rows) rug(x) # Add a 'rug' to show where the x values fall. # Note the vertical lines -- they indicate the location for which this row provides fhat(x). # Discuss these traces! # Would it be justified to call polynomial regression a 'local smoother'? # ... fhat.poly <- H.poly %*% y windows() plot(x, y, pch=16, cex=.5); lines(x, fhat.poly, lwd=2, col="green") # Looks nice in spite of the unreasonable looking row weights. + B-SPLINES REGRESSION ('B'=basis): o Preferred over polynomials because they are more local. o Idea: Place K (<= knot.j) 1st degree B-splines: locally linear, continuous basis functions: 1, x, (x-knot.k)*(x >= knot.j) 2nd degree B-splines: quadratic linear, 1x differentiable basis functions: 1, x, x^2, (x-knot.k)^2*(x >= knot.j) 3rd degree B-splines: 'cubic splines', most popular, 2x differentiable basis functions: 1, x, x^2, x^3, (x-knot.k)^3*(x >= knot.j) o R - Generating discretized spline bases of any order and any set of knots: library(splines) nknots <- 9 knots <- quantile(x, prob=(1:nknots)/(nknots+1)); deg <- 3 # Ex. 2 x.bsp <- bs(x, knots=knots, Boundary.knots=range(x), degree=deg, intercept=T) plot.cols(x, x.bsp) # Orthogonalize: look like wavelets x.bsp.orth <- qr.Q(qr(x.bsp)); round(crossprod(x.bsp.orth), 14) plot.cols(x, x.bsp.orth); rug(x) # Hat matrix: H.bsp <- tcrossprod(x.bsp.orth) sel <- seq(1,1926,by=275) # Plot 8 rows = weights at 8 locations plot.rows(sel, x, H.bsp) rug(jitter(x,5)) # Would you call B-spline smoothers more or less local than polynomial regression? fhat.bsp <- H.bsp %*% y windows() plot(x, y, pch=16, cex=.5); lines(x, fhat.bsp, lwd=2, col="blue") rug(jitter(x,5)) + Fourier/trig bases: for stationary data or data with a major period ('wrap-around') + Wavelet bases: see Tony Kai for details . SMOOTHING SPLINES: NOT the same as B-splines! + Idea: Place knots at all observed values of X (assuming no ties). ==> Saturated basis. When fitting to a response Y, penalize with some generalized Ridge penalty. Actually, the following criterion defines cubic smoothing splines: sum_i (yi - f(xi))^2 + lambda * Integral f''(x)^2 dx (over a range) While the RSS only speaks to the values of f() at observed xi, the penalty integral speaks to the values of f() at all values x in the range. ==> There exists one function that minimizes the criterion, and it happens to be a cubic spline with knots at all observed xi. ==> Study Grace Wahba''s papers on "Reproducing Kernel Hilbert Space Theory" (RKHS theory). Interesting functional analysis (a field of math, inf-dim lin alg w topology) + Linear algebra version of smoothing splines: at observed xi only y = (y1,...,yN)^T, f = (f1,...,fN)^T |y - f|^2 + lambda * f^T Omega f = min_f where Omega is a non-negative definite generalized Ridge penalty. Omega is a function of x=(x1,...,xN)^T. Look up how to construct it from x. It looks like a weighted second difference penalty but sphered! + Exercise: For intuition, write down an Omega such that f^T Omega f = | D2 f |^2 where D2 is the 2nd difference operator: D2 f = (f1+f3-2*f2, f2+f4-2*f3, ...) in R^{N-2} assuming that the xi are sorted: x1 < x2 < ... < xN REMINDER: This is NOT a smoothing spline! It does NOT originate from the integral penalty! For one thing, it does not account for unequispaced x! 
In R: The following is a straight illustration of linear algebra. This is not how you implement production code. o Assume x is sorted, or better check. (We ignore that there are ties...) N <- length(x); all(x[-N] <= x[-1]) # Check that x is sorted. o Form a 2nd difference matrix function diff.1st.mat <- function(N) { diag(N)[-N,] - diag(N)[-1,] } diff.1st.mat(6) # Toy example diff.2nd.mat <- function(N) { diff.1st.mat(N)[-(N-1),] - diff.1st.mat(N)[-1,] } diff.2nd.mat(6) # Toy example # Alternative: diff.2nd.mat <- function(N) { D2 <- matrix(0, nrow=N-2, ncol=N) D2[col(D2)==row(D2)] <- 1 D2[col(D2)==row(D2)+2] <- 1 D2[col(D2)==row(D2)+1] <- -2 D2 } o Note: For non-equispaced x one should really use differences (y{i} - y{i-1})/(x{i} - x{i-1}) (i=2,...,N), hence row entries should be not +-1 but +-1/(x{i}-x{i-1}) [Now you should realize the difficulty of programming splines: In the presence of ties or near-ties, the x values need to be bundled first before stable divisions by x-differences can be made.] o Form the associated penalty matrix Omega, representing f^T Omega f = |D2 f|^2 = f^T D2^T D2 f N <- length(x) Omega <- crossprod(diff.2nd.mat(N)) # (takes a while) dim(Omega) # quite big, of course Omega.eig <- eigen(Omega) # (takes a while) plot(Omega.eig$val, type="l") # Large values means function strongly penalized range(Omega.eig$val) plot.cols(x, Omega.eig$vec[,1:10]) # Strongly penalized plot.cols(x, Omega.eig$vec[,N-(0:9)], type="p") # Weakly penalized # (Steps are artifacts of improper treatment of ties: they should be bundled.) + 'Hat' matrix for smoothing splines: solve the generalized Ridge problem Stationarity wrt f: -2*y + 2*f + 2*Omega f = 0 ==> f = (I+Omega)^{-1} y ==> S = (I+Omega)^{-1} is the smoother or hat matrix. R: S.D2 <- solve(diag(N) + Omega) # Takes a minute! fhat.D2 <- S.D2 %*% y # This is quick fhat.D2 <- solve(diag(N) + Omega, y) # Obviates the need to form S.D2. plot(x, y, pch=16); lines(x, fhat.D2, lwd=2, col="red") # Ooops, we undersmoothed! How do we control the smoothness? # We seem to need a lot more of the penalty. + Analysis of S: o S is symmetric like Omega. o Omega and S have the same eigenvectors. o evals(S) = 1/(1+evals(Omega)) o evals(Omega) >= 0 ==> evals(S) in [0,1] o As Omega penalizes more, S shrinks more. o Zero penalty in direction u ==> Zero shrinkage in direction u that is, pure projection onto u. o Write the eigendecomp of S: S = U Lambda U^T = sum_k lambda.k u.k u.k^T o For cubic splines: lambda.1 = lambda.2 = 1 corresponding to u.1=1 and u.2=x. Why? Because both f(x)=1 and f(x)=x have vanishing properly x-weighted second differences. For unweighted 2nd differences the effective x is the order/rank of x. + For projections we have a notion of degrees of freedom: rank(H) Q: What would be a good notion of dfs for a smoothing spline smoother S? ... A: The trace of S. (There are alternative definitions, though.) ==> S = (I + lambda*Omega)^{-1} tr(S) = sum_i eval.i(S) = sum_i 1/(1+lambda*eval.i(Omega)) R: # Play with lambda till the trace of S is near the desired df=6. lambda <- 1; sum(1/(1+lambda*Omega.eig$val)) # Ooops, we used almost 750 dfs! # Search (preferably with bisection) till you, say, dfs~6: lambda <- 344020000; sum(1/(1+lambda*Omega.eig$val)) S.D2.df6 <- solve(diag(N) + lambda*Omega) # (takes a while) tr(S.D2.df6) # Ok, approximately 6, as targeted. fhat.D2.df6 <- S.D2.df6 %*% y fhat.D2.df6 <- solve(diag(N)+lambda*Omega, y) # Alternative w/o S windows() plot(x, y, pch=16); lines(x, fhat.D2.df6, lwd=4, col="red") # Not bad!!! 
(Except for improper handling of ties.) + R: # Don't calculate smoother matrices for smoothing splines! # Use functions that compute fits directly. apropos('spline') help('smooth.spline') par(mfcol=c(1,1)); plot(mktg[,c('in.','rev')], pch=16) lines(smooth.spline(x=mktg[,'in.'], y=mktg[,'rev'], df= 5), col="green", lwd=3) lines(smooth.spline(x=mktg[,'in.'], y=mktg[,'rev'], df= 10), col="blue", lwd=3) lines(smooth.spline(x=mktg[,'in.'], y=mktg[,'rev'], df= 20), col="red", lwd=3) + Hat/smoother matrix: S.smsp <- make.S(x, function(x,y) fitted(smooth.spline(x=x, y=y, df=10))) # (takes a while) max(abs(S.smsp - t(S.smsp))) # Symmetry check sel <- seq(1,1926,by=275) # Select rows in steps of 275 plot.rows(sel, x, S.smsp) # Plot row weights + Eigen analysis of the smoother: S.smsp.eig <- eigen(S.smsp) # (takes a while) # Look at 30 largest e'vals: cbind(S.smsp.eig$val[1:30]) # Eigen values: surprises? # Plot eigenvalue profile up to order 30: par(mfcol=c(1,1)) plot(S.smsp.eig$val[1:30], type="b", pch=16, ylab="E'vals"); abline(h=0) # Plot first 10 eigen vectors: interpret them as 'eigen functions' of the smoother plot.cols(x, S.smsp.eig$vec[,1:10]) # Looks like a system of orthogonal polynomials rug(jitter(x,5)) . LOCALLY KERNEL WEIGHTED POLYNOMIAL SMOOTHING: most often 'polynomial = line' + Idea: To obtain fhat(x0), fit a degree-p polynomial with kernel-weighted LS, where the a kernel is centered at x0 of interest. + For fixed degree p and fixed kernel width, the smoother is linear. However, the usual implementations have robustness or cross-validation features that make them nonlinear in y. + R function: loess(formula, data=...) [newer] or lowess(x, y) [older] A local polynomial smoother with robustness features (Bill Cleveland) help(lowess) # Plot data: par(mfcol=c(1,1)); plot(mktg[,c('in.','rev')], pch=16) lines(lowess(x=mktg[,'in.'], y=mktg[,'rev'], f=.5), col="green", lwd=3) lines(lowess(x=mktg[,'in.'], y=mktg[,'rev'], f=.2), col="blue", lwd=3) lines(lowess(x=mktg[,'in.'], y=mktg[,'rev'], f=.1), col="red", lwd=3) + Hat/smoother matrix: S.lowess <- make.S(x, function(x,y) lowess(x, y, f=.5)$y ) max(abs(S.lowess - t(S.lowess))) # Nearly symmetric but not quite plot.rows(sel, x, S.lowess) # Plot weights at a few locations + Eigen analysis of the smoother: S.lowess.eig <- eigen(S.lowess) # (Takes very long; works even though S is not symmetric.) cbind(round(S.lowess.eig$val[1:30],4)) # Eigen values: surprises? # Plot eigenvalue profile up to order 30: par(mfcol=c(2,1)) plot(Re(S.lowess.eig$val[1:30]), type="b", pch=16, ylab="E'vals"); abline(h=0) plot(Im(S.lowess.eig$val[1:30]), type="b", pch=16, ylab="E'vals"); abline(h=0) # Plot first 10 eigen vectors: interpret them as 'eigen functions' of the smoother plot.cols(x, Re(S.lowess.eig$vec[,1:10])) # Looks quite strange plot.cols(x, Im(S.lowess.eig$vec[,1:10])) # Looks quite strange rug(jitter(x,5)) + SVD analysis of the smoother: more appropriate S.lowess.svd <- svd(S.lowess) # Plot singular value profile up to order 30: plot(Re(S.lowess.svd$d[1:30]), type="b", pch=16, ylab="Singular valuess"); abline(h=0) # Right singular vectors: windows(); plot.cols(x, S.lowess.svd$u[,1:10]) # Left singular vectors: windows(); plot.cols(x, S.lowess.svd$v[,1:10]) # Plot left and right singular vectors against each other: windows(h=10, w=4); par(mfcol=c(5,2), mgp=c(1.5,.5,0), mar=c(1,1,1,1)); for(j in 1:10) plot(S.lowess.svd$u[,j], S.lowess.svd$v[,j], pch=16, cex=1, xlab="", ylab="") # Didn't know what to expect... + R: function supsmu(x, y) # J.H. 
Friedman & W. Stuetzle A local linear smoother with local cross-validation (J.H.Friedman and W.Stuetzle) ==> Fast and flexible, but may look raggedy. help(supsmu) par(mfcol=c(1,1)); plot(mktg[,c('in.','rev')], pch=16) lines(supsmu(x=mktg[,'in.'], y=mktg[,'rev'], span=.5), col="green", lwd=3) lines(supsmu(x=mktg[,'in.'], y=mktg[,'rev'], span=.2), col="blue", lwd=3) lines(supsmu(x=mktg[,'in.'], y=mktg[,'rev'], span=.1), col="red", lwd=3) Cumbersome to compute a smoother matrix because supsum() returns fewer values when there are ties. ---------------- * TRANSFORMATIONAL MA 0: NONLINEAR OPTIMAL CORRELATION AS A CCA PROBLEM - This is the analog of Fisher''s optimal scoring for two quantitative variables: max_{f,g} cor(f(X),g(Y)) We know this is really a CCA problem. How does this value compare with cor(X,Y)? See [Buja, AOS 18(3), 1990, pp. 1032-1069] for some population calculations. - Estimation of optimal correlation: . Data: Boston Housing site <- "http://www-stat.wharton.upenn.edu/~buja/STAT-541/" site <- "../A-STAT-541/Data/" boston <- read.table(paste(site,"boston.dat",sep=""), header=T) rownames(boston) <- paste(1:nrow(boston), scan(paste(site,"boston.geography",sep=""), w="") ) boston[1:5,] x.LSTAT <- boston[,"LSTAT"] y.MEDV <- boston[,"MEDV"] # Association: plot(x.LSTAT, y.MEDV, pch=16) cor(x.LSTAT, y.MEDV) . Polynomial estimation: # Monomial basis (with apologies): deg <- 5 x.LSTAT.mon <- scale(qr.Q(qr(sapply(1:deg, function(j) scale(x.LSTAT)^j)))) y.MEDV.mon <- scale(qr.Q(qr(sapply(1:deg, function(j) scale(y.MEDV)^j)))) dim(x.LSTAT.mon); dim(y.MEDV.mon) cca.MEDV.LSTAT.mon <- cca(x.LSTAT.mon, y.MEDV.mon) # Stationary correlations: round(cca.MEDV.LSTAT.mon$cor, 3) # Transformations = linear combinations: again, only the first is meaninful plot.cols(y.MEDV, y.MEDV.mon %*% cca.MEDV.LSTAT.mon$ycoef) plot.cols(x.LSTAT, x.LSTAT.mon %*% cca.MEDV.LSTAT.mon$xcoef) # Plot transformed against each other: windows() j <- 1 xx <- x.LSTAT.mon %*% cca.MEDV.LSTAT.mon$xcoef[,j] yy <- y.MEDV.mon %*% cca.MEDV.LSTAT.mon$ycoef[,j] plot(xx, yy, pch=16, cex=1) cor(xx, yy) cor(x.LSTAT, y.MEDV) . B-spline estimation: # B-spline bases: nknots <- 3 library(splines) # for bs() x.LSTAT.bsp <- scale(bs(x.LSTAT, knots=quantile(x.LSTAT, prob=(1:nknots)/((nknots+1))), Boundary.knots=range(x.LSTAT), degree=3, intercept=F ) ) # No intercept due to scaling y.MEDV.bsp <- scale(bs(y.MEDV, knots=quantile(y.MEDV, prob=(1:nknots)/((nknots+1))), Boundary.knots=range(y.MEDV), degree=3, intercept=F ) ) # No intercept due to scaling dim(x.LSTAT.bsp); dim(y.MEDV.bsp) cca.MEDV.LSTAT.bsp <- cca(x.LSTAT.bsp, y.MEDV.bsp) # Stationary correlations: round(cca.MEDV.LSTAT.bsp$cor, 3) # Transformations = linear combinations: only the first is meaningful plot.cols(y.MEDV, y.MEDV.bsp %*% cca.MEDV.LSTAT.bsp$ycoef) plot.cols(x.LSTAT, x.LSTAT.bsp %*% cca.MEDV.LSTAT.bsp$xcoef) # Transforms for orders 2 and higher complete an orthogonal system. . Compare with Breiman-Friedman''s ACE algorithm: install.packages("acepack", repos=repos) library(acepack) # Install first if you don't have it. 
# ACE uses a version of the nonlinear 'subsmu()' smoother (does local CV): ace.MEDV.LSTAT <- ace(x=x.LSTAT, y=y.MEDV) # Plot and compare the transformations: par(mfcol=c(2,3), mar=c(3,3,1,1), mgp=c(1.5,.5,0)) # ACE transforms: plot(ace.MEDV.LSTAT$x, ace.MEDV.LSTAT$tx, pch=16, cex=.5) plot(ace.MEDV.LSTAT$y, ace.MEDV.LSTAT$ty, pch=16, cex=.5) # Our polynomial transforms: plot(x.LSTAT, x.LSTAT.mon %*% cca.MEDV.LSTAT.mon$xcoef[,1], pch=16, cex=.5) plot(y.MEDV, y.MEDV.mon %*% cca.MEDV.LSTAT.mon$ycoef[,1], pch=16, cex=.5) # Our B-spline transforms: plot(x.LSTAT, - x.LSTAT.bsp %*% cca.MEDV.LSTAT.bsp$xcoef[,1], pch=16, cex=.5) plot(y.MEDV, - y.MEDV.bsp %*% cca.MEDV.LSTAT.bsp$ycoef[,1], pch=16, cex=.5) Questions: + Are these transformations trying tell us something? + Which of the three approaches do we trust most? ---------------- * TRANSFORMATIONAL MA 1: ACE AS A CCA PROBLEM - The ACE problem is also a maximal correlation problem, but the x-block will be permitted to consist of bases of several predictor variables. - R: Code for bases-based smoothing inside ACE; then run CCA We need to write general code that implements the multi-bases x-block. ace.basis <- function(x, y, basis.generator, o=1) { # o = desired orders of eigen transforms y.basis <- basis.generator(y) d <- ncol(y.basis) colnames(y.basis) <- paste("y.",seq(ncol(y.basis)),sep="") x.bases <- lapply(colnames(x), function(j) basis.generator(x[,j])) for(j in seq(x.bases)) { colnames(x.bases[[j]]) <- paste(colnames(x)[j],seq(ncol(x.bases[[j]])),sep=".") } x.bases <- do.call(cbind, x.bases) cc <- cca(x.bases, y.basis) # This is the actual computation; all else is (un)packaging. ty <- y.basis %*% cc$ycoef[,o] tx <- sapply(unique(sub("[.].+","",colnames(x.bases))), function(nam) { nams <- grep(nam,colnames(x.bases)); print(nams); print(nam) x.bases[,nams,drop=F] %*% cc$xcoef[nams,o,drop=F] } ) if(cor(y, ty) < 0) { ty <- -ty; tx <- -tx } tmp <- list(x=x, y=y, tx=tx, ty=ty, rsq=cc$cor[o]^2) class(tmp) <- "ace" tmp } # Polynomial basis: basis.generator.poly <- function(x, nn=3) { x.qr <- qr(sapply(1:nn, function(j) x^j)) scale(qr.Q(x.qr)[,seq(x.qr$rank),drop=F]) } # B-spline basis: basis.generator.bsp <- function(x, nn=2) { library(splines) x.qr <- qr(bs(x, knots=setdiff(unique(quantile(x, prob=(1:nn)/((nn+1)))), range(x)), Boundary.knots=range(x), degree=3, intercept=F)) scale(qr.Q(x.qr)[,seq(x.qr$rank),drop=F]) } # Original Breiman-Friedman ACE algorithm packaged: install.packages("acepack", repos=repos) # Install first if you don't have it. ace.orig <- function(...) { library(acepack) tmp <- ace(...) 
tmp$x <- t(tmp$x) # screwy detail in ace() class(tmp) <- "ace" tmp } # Method for generic function 'plot' to dispatch on class 'ace': plot.ace <- function(ace.str, mfrow=c(3,5)) { par(mfrow=mfrow, mgp=c(1.5,.5,0), mar=c(3,2,.5,.5)) rg <- range(c(ace.str$tx, ace.str$ty)) for(j in 1:ncol(ace.str$x)) { plot(ace.str$x[,j], ace.str$tx[,j], xlab=colnames(ace.str$x)[j], ylab="", pch=16, ylim=rg) text(x=par("usr")[1], y=par("usr")[4], adj=c(-.1,1), lab=paste("sd =",round(sd(ace.str$tx[,j]),3)) ) } plot(ace.str$y, ace.str$ty, xlab="y", ylab="", pch=16, ylim=rg) text(x=par("usr")[1], y=par("usr")[4], adj=c(-.1,1), lab=paste("sd =",round(sd(ace.str$ty),3)) ) plot(apply(ace.str$tx,1,sum), ace.str$ty, xlab="tx", ylab="ty", pch=16, xlim=rg, ylim=rg) text(x=par("usr")[1], y=par("usr")[4], adj=c(-.1,1), lab=paste("rsq =",round(ace.str$rsq,3)) ) } # Original and our version of ACE: boston.ace.orig <- ace.orig( boston[,-14], boston[,14]) # original ACE boston.ace.poly <- ace.basis(boston[,-14], boston[,14], basis.generator.poly) # polynomial boston.ace.bsp <- ace.basis(boston[,-14], boston[,14], basis.generator.bsp) # B-spline c(R2=boston.ace.orig$rsq, R=sqrt(boston.ace.orig$rsq)) c(R2=boston.ace.poly$rsq, R=sqrt(boston.ace.poly$rsq)) c(R2=boston.ace.bsp$rsq, R=sqrt(boston.ace.bsp$rsq)) windows(h=9,w=15) plot(boston.ace.orig) plot(boston.ace.poly) plot(boston.ace.bsp) - Null behaviors: boston.null <- boston boston.null[,"MEDV"] <- sample(boston.null[,"MEDV"]) # Scramble the response boston.ace.null.orig <- ace.orig( boston.null[,-14], boston.null[,14]) # original ACE boston.ace.null.poly <- ace.basis(boston.null[,-14], boston.null[,14], basis.generator.poly) # polynomial boston.ace.null.bsp <- ace.basis(boston.null[,-14], boston.null[,14], basis.generator.bsp) # B-spline boston.ace.null.orig$rsq; sqrt(boston.ace.null.orig$rsq) boston.ace.null.poly$rsq; sqrt(boston.ace.null.poly$rsq) boston.ace.null.bsp$rsq; sqrt(boston.ace.null.bsp$rsq) windows(); plot(boston.ace.null.orig) windows(); plot(boston.ace.null.poly) windows(); plot(boston.ace.null.bsp) Comparison: . g(Y): + Original ACE manages to create a roughly linear g(Y), + polynomial and B-Splinen ACE don''t. . Optimal correlations: + ACE overfits the least: max corr ~ .24) + Poly and B-Splines overfit: max corr ~ 0.38 and 0.42, resp. . 
Reasons for overfitting: Too many dfs in the additive model + dfs(poly) = 37 (= 13*3 - 2) + dfs(B-splines) = 59 (= 13*5 - 2 - 4) - A universal alternating algorithm for ACE: ace.alt <- function(x, y, smoother, niter=30) { x <- scale(x); y <- scale(y) # Initialize with linear model through origin: xy.lm <- lm(y ~ .-1, data=as.data.frame(x)) tx <- t(t(x)*coef(xy.lm)) # s(xj) = bj * xj ty <- y # initialize with identity transform res <- resid(xy.lm) # initialize ty-residuals for(iter in seq(niter)) { # tx updates (in random order, instructor's arbitrary decision): for(j in sample(seq(ncol(x)))) { res <- res + tx[,j] tx[,j] <- smoother(x=x[,j], y=res) res <- res - tx[,j] } # ty updates: res <- ty - res ty <- scale(smoother(x=y, y=res)) res <- ty - res } tmp <- list(x=x, y=y, tx=tx, ty=ty, rsq=cor(apply(tx, 1, sum), y)^2 ) class(tmp) <- "ace" tmp } # Wrapper for polynomial regression: smoother.poly <- function(x, y) { fitted(lm(y ~ x + I(x^2) + I(x^3))) } # Wrapper for smoothing spline: smoother.smspl <- function(x, y) { xx <- round(x,5) # bundle values if(length(unique(xx)) <=10) { ty <- fitted(lm(y ~ factor(xx))) } else { sm <- smooth.spline(x=xx, y=y, df= 5) ty <- approx(sm, xout=xx)$y } ty } # Wrapper for lowess(): it orders according to x smoother.low <- function(x, y) { xx <- x+runif(length(x),-sd(x)/100,+sd(x)/100) ord <- order(xx) ty <- lowess(xx[ord], y[ord])$y ty[ord] <- ty # Undo the ordering according to x ty } # Wrapper for supsmu: uniques and sorts the x values ==> jitter them randomly and unsort smoother.sup <- function(x, y) { xx <- x+runif(length(x),-sd(x)/100,+sd(x)/100) ord <- order(xx) ty <- supsmu(xx[ord], y[ord])$y ty[ord] <- ty # Undo the ordering according to x ty } # Run the next two lines with any of the above three smoothers: boston.ace.poly <- ace.alt(boston[,-14], boston[,14], smoother.poly) boston.ace.smspl <- ace.alt(boston[,-14], boston[,14], smoother.smspl) boston.ace.low <- ace.alt(boston[,-14], boston[,14], smoother.low) boston.ace.sup <- ace.alt(boston[,-14], boston[,14], smoother.sup) boston.ace.poly$rsq boston.ace.smspl$rsq boston.ace.low$rsq boston.ace.sup$rsq plot(boston.ace.poly) # RM and end dips not believable plot(boston.ace.smspl) # Too much believable plot(boston.ace.low) # Nice! plot(boston.ace.sup) # NOX not quite believable # There may be slight differences between runs due to random order of updates # and random jitter for supsmu(). - Question: Under what conditions does the above alternating algorithm converge? . This question will have a satisfying answer only for the linear versions of some of the smoother classes. Nonlinearity due to cross-validated bandwidth choice makes the convergence question most likely impenetrable. . BASIS EXPANSION METHODS: easy... The regularization is just subspace restriction. + Let g, f1, ..., fp be N vectors of transforms g(Y), g1(X1), ..., fp(Xp) evaluated at the observed values of these variables. + These vectors will be members of subspaces V.y, V.x1, ..., V.xp (subsets of R^N). + Because the sum f1(X1)+...+fp(Xp) represents an additive model, the subspace restriction for f1+...+fp is V.x = V.x1+...+V.xp (= span of V.x1, ..., V.xp). + Therefore, the alternating algorithm is just LS projection back and forth between V.y and V.x. + Let H.x and H.y be the orthogonal projectors onto V.y and V.x, resp. + Consider convergence of g: o Initialize: g.0 o Iterate: g.n = H.y H.x g.{n-1} and normalize g.n = g.n/|g.n| + Does this algorithm converge? 
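      Before the symmetry argument that follows, here is a quick numerical sanity check
      (a sketch with made-up random subspaces, not part of the original development):
      iterate g <- H.y H.x g and compare the limit with the top eigenvector of
      H.y H.x restricted to V.y.
        set.seed(1)
        N <- 200
        U.y <- qr.Q(qr(matrix(rnorm(N*3), ncol=3)))     # o.n. basis of a random V.y (dim 3)
        U.x <- qr.Q(qr(matrix(rnorm(N*5), ncol=5)))     # o.n. basis of a random V.x (dim 5)
        H.y <- tcrossprod(U.y)                          # projection onto V.y
        H.x <- tcrossprod(U.x)                          # projection onto V.x
        g <- H.y %*% rnorm(N);  g <- g/sqrt(sum(g^2))   # random start in V.y
        for(iter in 1:200) { g <- H.y %*% (H.x %*% g);  g <- g/sqrt(sum(g^2)) }
        M <- crossprod(U.y, H.x %*% U.y)                # H.y H.x on V.y, in the coordinates of U.y
        g.top <- U.y %*% eigen(M)$vectors[,1]           # its top eigenvector, mapped back to R^N
        abs(sum(g * g.top))                             # ~ 1 (generically): the iteration found it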
Wishful thinking: If only H.y H.x were symmetric and non-negative definite ...
    + Actually, it is, as a map V.y --> V.y, but NOT as a map R^N --> R^N !!
      Proof: Let g1 and g2 be elements in V.y.
        <H.y H.x g1, g2> = <H.x g1, H.y g2>     because H.y is symmetric on R^N
                         = <H.x g1, g2>         because g2 is in V.y
                         = <g1, H.x g2>         because H.x is symmetric on R^N
                         = <H.y g1, H.x g2>     because g1 is in V.y
                         = <g1, H.y H.x g2>     because H.y is symmetric on R^N
      QED: H.y H.x is symmetric on the subspace V.y.
      Let g be an element of V.y:
        <H.y H.x g, g> = <H.x g, H.y g> = <H.x g, g> = |H.x g|^2  >=  0
      QED: H.y H.x is non-negative definite.
    + Thus the alternating projections amount to a power algorithm,
      which we know to converge to the largest eigenvector.
      (Remind yourself of the proof!)
  . QUADRATIC PENALTY METHODS: needs a little more machinery, but works, too...
    + First: The same argument as for projections does not work
      if S.x and S.y are of the smoothing spline/generalized Ridge type.
    + Preliminaries for additive models with quadratic penalties:
        PLS = |y - (f1+...+fp)|^2 + f1^T Omega1 f1 + ... + fp^T Omegap fp
      Rewrite this with
      o a 'design matrix'  X = (I_N,...,I_N)  of size  N x (Np)
      o additive summands stacked:  f = (f1^T,...,fp^T)^T  of size  (Np) x 1
      o so that  f1+...+fp = X f  of size  N x 1
      o and a super penalty matrix  Omega = diag(Omega1,...,Omegap)  of size  (Np) x (Np)
      o so that  f1^T Omega1 f1 + ... + fp^T Omegap fp = f^T Omega f
      o Finally:  PLS = |y - X f|^2 + f^T Omega f
      o Normal equations:  - X^T y + X^T X f + Omega f = 0
      o Hat/additive fitting matrix:  f = (X^T X + Omega)^{-1} X^T y = H.x y
        [Informal HW: Write out what X^T X + Omega really is ...]
        [This is not how you would actually compute an additive fit with smoothing splines,
         but it gives us a way to think about it.  In particular we see that
         y |-> f = c(f1,...,fp) is just another linear operation, assuming fixed penalties.]
    + Back to the alternating algorithm:
      o We can consider a general setup where both blocks are additive models:
          Criterion.obs = |Y g - X f|^2 + g^T Omega.y g + f^T Omega.x f
      o What is the natural constraint on (f,g)?
        Expand the quadratic Criterion and set the cross-term between f and g to zero:
          Criterion.null = |Y g|^2 + |X f|^2 + g^T Omega.y g + f^T Omega.x f
        Crossterm: -2 g^T Y^T X f;  the penalties contribute nothing
      o Rayleigh quotient:
               |Y g - X f|^2 + g^T Omega.y g + f^T Omega.x f
        (*)    ------------------------------------------------- = min/max/stat
               |Y g|^2 + |X f|^2 + g^T Omega.y g + f^T Omega.x f
      o Stationary (eigen) equations:
          (- X^T Y g + X^T X f + Omega.x f)*... - ...*(X^T X f + Omega.x f) = 0
          ==> f ~ (X^T X + Omega.x)^{-1} X^T Y g = S.x Y g
          ==> g ~ (Y^T Y + Omega.y)^{-1} Y^T X f = S.y X f   (for symmetry)
        This has indeed the form of the alternating algorithm.
        Because the algorithm is the power algorithm of an eigen problem, it converges.
    + Equivalent criteria:
      o Asymmetric constraint for CCA to masquerade as regression:
               |Y g - X f|^2 + g^T Omega.y g + f^T Omega.x f
        (**)   --------------------------------------------- = min/stat
               |Y g|^2 + g^T Omega.y g
      o Penalized correlation criterion:
                                 (g^T Y^T X f)^2
        (***)  ----------------------------------------------------- = max/stat
               (|Y g|^2 + g^T Omega.y g) * (|X f|^2 + f^T Omega.x f)
      o The stationary solutions (f,g) of criteria (*), (**) and (***) are the same
        up to positive scalars.
    + Alternating algorithm:
      o Power algorithm applied to S.x S.y and S.y S.x:
          f --> S.x S.y f
          g --> S.y S.x g
      o Is S.x S.y (say) symmetric ('self-adjoint') in some sense?
        ==> We need an inner product!
        Q: What is the natural inner product on (f,g)-space?
        A: |||(f,g)|||^2 := Rayleigh denominator
                          = |Y g|^2 + |X f|^2 + g^T Omega.y g + f^T Omega.x f
      o Associated with a pos.-def. quadratic form is an inner product:
          <<(f1,g1), (f2,g2)>> = g1^T Y^T Y g2 + f1^T X^T X f2 + g1^T Omega.y g2 + f1^T Omega.x f2
      o Restricted to f-space this is:
          <<f1, f2>> = f1^T X^T X f2 + f1^T Omega.x f2
      o Is the mapping f --> S.x S.y f symmetric in this inner product?
        ...
  . LOCALLY KERNEL WEIGHTED POLYNOMIAL SMOOTHING: nothing is known here because this type of
    smoothing does not solve an optimization problem.
    The non-symmetric nature of the smoothing matrix suggests that occasional slight
    non-convergence due to 'Jordan cycling' might occur.
    This, however, has never been a practical problem.
    Even the use of locally cross-validating supsmu seems to result in convergence in practice.
    Asymptotically for N-->Inf, there is of course convergence because all properly bandwidthed
    smoothers converge to conditional expectation operators, which are orthogonal projections
    in Hilbert space, hence the arguments from 'BASIS EXPANSION METHODS' apply, modulo
    infinite-dimensional regularity conditions (E[] is best assumed to be a compact operator).
----------------------------------------------------------------
* TRANSFORMATIONAL MA 2: ADDITIVE IMPLICIT EQUATIONS FROM SMALLEST ADDITIVE PRINCIPAL COMPONENTS (APC)
- The problem:
  . Look for additive implicit equations of the form
      f1(X1) + ... + fp(Xp) ~ 0
  . Can be used as a 'concurvity' diagnostic for additive regression:
      Y ~ h1(X1) + ... + hp(Xp)
      0 ~ f1(X1) + ... + fp(Xp)   creates identifiability problems for the regression.
  . Has independent value as a symmetric analysis of variables.
- Criterion:
    V[ f1(X1) + ... + fp(Xp) ] = min_{f1,...,fp}   subject to ... ?
  Recall the "null principle" for deriving MA constraints:
    --------------------------------------------
    |   Null assumption of uncorrelated blocks  |
    --------------------------------------------
  This results in the following constraint:
    V[f1(X1)] + ... + V[fp(Xp)] = 1
  Rayleigh:
     V[ f1(X1) + ... + fp(Xp) ]
     ---------------------------  =  min_{f1,...,fp}
     V[f1(X1)] + ... + V[fp(Xp)]
  If the minimum is near zero, we will have found an approximate implicit equation for the data.
- We know this must result in an eigen problem:
  . Subspace projection smoothers: Expand all variables in bases, so  fj(Xj) = Xbas.j aj
    ==> GCA problem
          | Xbas.1 a1 + ... + Xbas.p ap |^2
        ---------------------------------------  =  min_{a1,...,ap}
        | Xbas.1 a1 |^2 + ... + | Xbas.p ap |^2
  .
Application: + Use the basis generators from earlier: basis.generator.poly <- function(x, nn=3) { # x.qr <- qr(sapply(1:nn, function(j) x^j)) # scale(qr.Q(x.qr)[,seq(x.qr$rank),drop=F]) } scale(poly(x, min(nn,length(unique(x))-1))) } basis.generator.bsp <- function(x, nn=2) { library(splines) x.qr <- qr(bs(x, knots=setdiff(unique(quantile(x, prob=(1:nn)/((nn+1)))), range(x)), Boundary.knots=range(x), degree=3, intercept=F)) scale(qr.Q(x.qr)[,seq(x.qr$rank),drop=F]) } + Apply gca: apc.bases <- function(X, basis.generator, ord=1, minimize=T) { Xs <- scale(X) X.bases <- lapply(seq(ncol(X)), function(j) basis.generator(Xs[,j])) names(X.bases) <- colnames(X) X.gca <- gca(X.bases) # The actual computation if(minimize) ord <- ncol(X.gca$vectors)+1-ord coefs <- tapply(X.gca$vectors[,ord], names(X.gca$vectors[,ord]), function(x) x) # print(coefs) Xt <- sapply(seq(ncol(X)), function(j) { sel <- grep(paste("[(]",j,"[)]",sep=""), names(X.gca$vectors[,ord])) X.bases[[j]] %*% X.gca$vectors[sel,ord] } ) colnames(Xt) <- colnames(X) tmp <- list(X=X, Xt=Xt, eval=X.gca$values[ord], all.evals=X.gca$values, ord=ord) class(tmp) <- "apc" tmp } plot.apc <- function(apc, jitter=F, fac=1) { par(mfrow=c(3,5), mgp=c(1.5,.5,0), mar=c(3,3,1,1)) ylim <- range(apc$Xt) for(j in seq(ncol(apc$X))) { if(jitter) { x <- jitter(apc$X[,j], fac); y <- jitter(apc$Xt[,j], fac) } else { x <- apc$X[,j]; y <- apc$Xt[,j] } plot(x, y, type="n", ylim=ylim, xlab=colnames(apc$X)[j], ylab="") abline(h=0, col="gray") points(x, y, pch=16, cex=.8) text(x=par("usr")[1], y=par("usr")[4], adj=c(-.1,1), lab=paste("sd =",round(sd(apc$Xt[,j]),3)) ) } } + Polynomial bases example: boston.apc.poly <- apc.bases(boston, basis.generator.poly, ord=1) boston.apc.poly <- apc.bases(boston, basis.generator.poly, ord=2) plot(boston.apc.poly); round(boston.apc.poly$all.evals,3); round(boston.apc.poly$eval,3) + B-spline bases example: boston.apc.bsp <- apc.bases(boston, basis.generator.bsp, ord=1) boston.apc.bsp <- apc.bases(boston, basis.generator.bsp, ord=2) boston.apc.bsp <- apc.bases(boston, basis.generator.bsp, ord=3) plot(boston.apc.bsp); boston.apc.bsp$all.evals; boston.apc.bsp$eval; + Comments: o B-splines (ord=1) generate a meaningless solution (constants) with zero eigenvalue o Polynomials (ord=1) and B-splines (ord=2) generate a solution involving INDUS, RAD, TAX, like in STAT 541 with linear smallest PCA o Polynomials (ord=2) and B-splines (ord=3) generate an interesting solution involving many variables, including CRIME, NOX and MEDV - APC with quadratic penalties: . Rayleigh quotient: |f1 + ... + fp|^2 + f1^T Omega1 f1 + ... + fp^T Omegap fp ----------------------------------------------------------------- = min_{f1,...,fp} |f1|^2 + ... + |fp|^2 + f1^T Omega1 f1 + ... + fp^T Omegap fp . Stationary/normal equations: gradient wrt fj [(f1 + ... + fp) + Omegaj fj] * ... - ... * [fj + Omegaj fj] = 0 [(f1 + ... + fp) - fj] ~ (I + Omegaj) fj . Suggested iterative algorithm: fj <- (I+Omegaj)^{-1} [(f1 + ... + fp) - fj] (for all j=1...,p) = Sj [(f1 + ... + fp) - fj] ==> Power algorithm with a big matrix (Np)x(pN) matrix S.super. Problem for you: Write down S.Super . Unfortunately these iterations converge to the max of the Rayleigh quotient, not the minimum. Q: How can we find the minimum eval? 
A: Flip the eigenvalue profile (spectrum) If alpha >= max evals of S.super, then the smallest eigenvalue of S.super corresponds to the largest eigenvalue of alpha*I - S.super ==> Power algorithm applied to alpha*I - S.super [It helps to have a good bound on max(evals(S.super)))] - APC iterations: We generalize the suggested iterations to work with arbitrary smoothers. apc.iter <- function(X, smoother, niter=50, minimize=T) { Xs <- scale(X) Xt <- Xs p <- ncol(X) N <- nrow(X) if(minimize) { alpha <- (p+1)/2 } else { alpha <- 0 } for(iter in 1:niter) { Xsum <- rowSums(Xt) Xt <- sapply(1:p, function(j) alpha*Xt[,j] - smoother(Xs[,j], Xsum-Xt[,j]) ) # s <- sd(c(Xt)); if(s<1E-16) cat(" sd=0\n") Xt <- Xt / (s*sqrt(N)) num.na <- sum(is.na(Xt)); num.inf <- sum(abs(Xt)==Inf) if(num.na>0 | num.inf>0) cat("#NA =",num.na," #Inf =",num.inf,"\n") } Xt <- scale(Xt, center=T, scale=F) colnames(Xt) <- colnames(X) tmp <- list(X=X, Xt=Xt, eval=sum(rowSums(Xt)^2)/sum(Xt^2)) class(tmp) <- "apc" tmp } # Wrapper for polynomial regression: smoother.poly <- function(x, y) { fitted(lm(y ~ x + I(x^2) + I(x^3))) } # Wrapper for smoothing spline: smoother.smspl <- function(x, y) { x <- x+runif(length(x),-sd(x)/100,+sd(x)/100) ord <- order(x) ty <- smooth.spline(x=x[ord], y=y[ord], df=4, tol=diff(range(x))*1E-04)$y ty[ord] <- ty ty } # Wrapper for lowess(): it orders according to x smoother.low <- function(x, y) { x <- x+runif(length(x),-sd(x)/100,+sd(x)/100) ord <- order(x) ty <- lowess(x[ord], y[ord])$y ty[ord] <- ty # Undo the ordering according to x ty } # Wrapper for supsmu: uniques and sorts the x values ==> jitter them randomly and unsort smoother.sup <- function(x, y) { x <- x+runif(length(x),-sd(x)/100,+sd(x)/100) ord <- order(x) ty <- supsmu(x[ord], y[ord])$y ty[ord] <- ty # Undo the ordering accordingn to x ty } - Examples: boston.apc.iter.poly <- apc.iter(boston, smoother.poly) windows(w=8, h=6); plot(boston.apc.iter.poly); boston.apc.iter.poly$eval plot(0:1,0:1, type="n", xaxt="n", yaxt="n", xlab="", ylab="", main="Poly") boston.apc.iter.smspl <- apc.iter(boston, smoother.smspl) windows(w=8, h=6); plot(boston.apc.iter.smspl); boston.apc.iter.smspl$eval plot(0:1,0:1, type="n", xaxt="n", yaxt="n", xlab="", ylab="", main="SmSpline") boston.apc.iter.low <- apc.iter(boston, smoother.low) windows(w=8, h=6); plot(boston.apc.iter.low); boston.apc.iter.low$eval plot(0:1,0:1, type="n", xaxt="n", yaxt="n", xlab="", ylab="", main="Lowess") boston.apc.iter.sup <- apc.iter(boston, smoother.sup) windows(w=8, h=6); plot(boston.apc.iter.sup); boston.apc.iter.sup$eval plot(0:1,0:1, type="n", xaxt="n", yaxt="n", xlab="", ylab="", main="SupSmu") We see here what corresponds to ord=2 and ord=3 for poly and B-spline bases solutions. ---------------------------------------------------------------- * TRANSFORMATIONAL MA 3: INDEPENDENT COMPONENT ANALYSIS (ICA) AND SMALLEST ADDITIVE PRINCIPAL COMPONENTS (APC) - Externalities: . Bach and Jordan (2002) proposed 'kernel ICA' based on APCs. - What is ICA? . It is the search for 'INDEPENDENT linear sources'. . 'Linear source' = linear combs. in multivariate language = latent variable linearly related to observables (but then estimated as a lin.comb. ...) . 'Independent' = stochasticaly independent, not just uncorrelated . The "model" is in random vector language: Xvec = A Yvec where Yvec[1],...,Yvec[p] are unknown sources that are INDEPENDENTLY but generally NOT IDENTICALLY distributed. The fixed parameter matrix A (pxp) must be non-singular. . 
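  A tiny generative illustration of this model (a sketch; the two sources and the mixing
  matrix A below are made up):
    set.seed(1)
    N <- 1000
    Y.src <- cbind(runif(N), rexp(N))         # two independent, non-identically distributed sources
    A <- matrix(c(1, .5, .3, 1), 2, 2)        # a non-singular mixing matrix (made up)
    X.obs <- Y.src %*% t(A)                   # observed rows: Xvec = A Yvec, case by case
    pairs(cbind(X.obs, Y.src), labels=c("X1","X2","Y1","Y2"), pch=16, cex=.3, gap=0)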
One should assume ICAs do not generally exist, but still one can try to find out how close to independent one can find p linear combinations to be. - Simplifying pre-processing: sphering !!! . Wlog we can assume that the components of Yvec are standardized: V[Yvec] = I_p The components of Yvec must be uncorrelated: Independent ==> uncorrelated. . Wlog we can also assume the p-variate random vector is sphered: V[Xvec] = I_p (Well, empirical sphering is really only an estimate of population sphering, but that''s ok.) . Q: If Xvec and Yvec are sphered, then this implies what for the columns of A? A: ... Reason: I = V[Xvec] = V[A Yvec] = A V[Yvec] A^T = A A^T ------------------- ==> | A is orthogonal | ------------------- ==> ICA searches for an o.n. basis or rotation U of R^p such that the transformed Yvec = Xvec U has as independent columns as possible. - Bach&Jordan''s proposal: . Use this fact: ----------------------------------------------- | Yvec has pairwise independent components | | <==> | | Cor[(f1(Yvec.1),...,fp(Yvec.p))] = I_p | | FOR ALL (nonlinear) fj. | ----------------------------------------------- . Note: Pairwise independence is not the same as full independence!!! But: Full independence ==> Pairwise independence Hence: B&J are only looking for a necessary but not sufficient condition for ICA . Connection with population APCs: Cor[(f1(Yvec.1),...,fp(Yvec.p))] = I_p for all fj(Yvec.j) <==> min_{f1,...,fp} min.eval[Cor[(f1(Yvec.1),...,fp(Yvec.p))]] = 1 <==> max_{f1,...,fp} max.eval[Cor[(f1(Yvec.1),...,fp(Yvec.p))]] = 1 <==> max_{f1,...,fp} det[Cor[(f1(Yvec.1),...,fp(Yvec.p))]] = 1 The first two criteria lend themselves for APCs. Bach&Jordan use the third criterion. . Proposal for ICA: solve one of the following problems (1) max_{U in SO(p)} min_{f1,...,fp} min.eval[Cor[(f1((U Xvec).1),...,fp(U Xvec).p)]] (2) min_{U in SO(p)} max_{f1,...,fp} max.eval[Cor[(f1((U Xvec).1),...,fp(U Xvec).p)]] (3) max_{U in SO(p)} min_{f1,...,fp} det[Cor[(f1((U Xvec).1),...,fp(U Xvec).p)]] Bach&Jordan: They minimize -log[det[...]], hence maximize det[...] wrt U. Hence they choose proposal (3). . [Why does (3) work? Reason: det = product of evals, but sum(evals) = p For any p numbers a1,...,ap>0 we have: max_{a1+...+ap=p, aj>=0} a1*...*ap == 1 <==> aj=1 for all j The product function is concave and permutation symmetric, hence its max is attained at the center of the symplex {a1+...+ap=p, aj>=0}. ] . For all proposals the closeness of the actual criterion value to +1 at the optimum indicates how well we have found Independent Components. - A crude R implementation: . Make up some artificial data: N <- 2000 Y <- scale(cbind("Norm"=rnorm(N), "t5"=rt(N,df=5), "Chi7"=rchisq(N,df=7), "Exp"=rexp(N), "Unif"=runif(N))) p <- ncol(Y) rg <- range(c(Y)) pairs(Y, pch=16, cex=.5, xlim=rg, ylim=rg, gap=0) . Set the true A=I, so X=Y, but start with a random rotation A, so the algorithm should find back to A=I. X <- Y . Use (1) or (2) above as the criterion function: ica.crit <- function(A) { # Orthonormalize the argument: A.o.n. <- qr.Q(qr(matrix(A, nrow=sqrt(length(A))))) X <- Y %*% A.o.n. 
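         # nlm() below passes 'A' as a plain numeric vector, hence the reshape to a square
         # matrix and the re-orthonormalization with qr.Q(): the search is effectively over rotations.
         # The value returned next is the largest nonlinear APC/GCA eigenvalue of the rotated data
         # (assuming gca() returns eigenvalues in decreasing order), so minimizing it with nlm()
         # implements proposal (2) above.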
# Some alternatives as APC building blocks: uncomment one at a time apc.bases(X, basis.generator.poly, minimize=F)$eval # apc.bases(X, basis.generator.bsp, minimize=F)$eval } # Initalize: ica.init <- qr.Q(qr(matrix(rnorm(p^2), ncol=p))) # Test the function: ica.crit(ica.init) # This function 'thinks' a long time: ica.nlm <- nlm(ica.crit, ica.init) # Local minimum value: c(init=ica.crit(ica.init), converged=ica.nlm$minimum) # Local minimum solution: round(matrix(ica.nlm$estimate, ncol=p), 2) # raw round(qr.Q(qr(matrix(ica.nlm$estimate, ncol=p))), 2) # o.n. # Some directions are found, but others not. - Summary: Not sure that ICA is a viable technique. Strongly depends on tail behavior of distributions. ==> Expect extreme non-robustness. File it away as an interesting concept. ---------------------------------------------------------------- * FUNCTIONAL MULTIVARIATE ANALYSIS (just briefly) - What makes a data analysis situation 'functional'? . We''re estimating discretized functions/signals/processes on a domain. . Typical domains: time, space, frequency . We only consider the case where all cases are sampled at the same locations in the domain. Examples: daily stock returns (case=company) mid-day temperatures at 200 locations across the US - What makes data two-way functional? A: In the data table both rows and columns have underlying domains. Ex.: Daily mid-day temperatures at 200 locations for 20 years rows: locations, cols: days - Multiple time series example: montly average temperatures in Philly 1950-199 Organized as multiple time series: 50 12-month series, or 12 50-yr series #X <- as.matrix(read.table("c:/D/DATA/TemperaturesAndPrecipitations/Philly-temp.txt")[,4:15]) X <- as.matrix(read.table("http://stat.wharton.upenn.edu/~buja/STAT-926/Philly-temp.txt")[,4:15]) str(X) X.svd <- svd(X - mean(X)) par(mfrow=c(2,4), mgp=c(1.5,.5,0), mar=c(3,3,1,1)) for(j in 1:4) plot(-X.svd$v[,j], pch=16, type="b") for(j in 1:4) plot(-X.svd$u[,j], pch=16, type="b") X.svd$d (Idiosyncratic remarks about this dataset: In light of the - One would want the yearly u-vectors to be smoother. Start with the svd as a low-rank approximation problem: | X - d u v^T |^2 (Frobenius matrix norm) - Aproach 1: Subspace restriction on u . Algebra: u = B a, where B is an o.n. basis: B^T B = I_d, hence |B a|^2 = |a|^2 hence |a|^2 = 1 due to |u|^2 = 1 Let H = B B^T be the hat matrix for the projection onto span(B). Then: | X - d u v^T |^2 = | X - d (Ba) v^T |^2 = | X - d B a v^T |^2 = | (I-H)X + HX - d B a v^T |^2 = | (I-H)X |^2 + | HX - d B a v^T |^2 ~ const + | B B^T X - d B a v^T |^2 ~ const + | B^T X - d a v^T |^2 ==> Projection of the X columns onto B, run an svd on the projection, reconstitute the original. . R code: # Orthogonal polynomials: B <- cbind(1,poly(1:nrow(X), degree=5)) # B-spline alternative: library(splines) B <- qr.Q(qr(bs(1:nrow(X), df=5))) # Common code: X.B <- crossprod(B, X-mean(X)) X.B.svd <- svd(X.B) X.B.svd.u <- B %*% X.B.svd$u par(mfrow=c(2,4), mgp=c(1.5,.5,0), mar=c(3,3,1,1)) for(j in 1:4) plot(-X.B.svd$v[,j], pch=16, type="b") for(j in 1:4) plot(-X.B.svd.u[,j], pch=16, type="b") X.B.svd$d Unsatisfactory, both with polynomials and with B-splines. - Approach 2: Quadratic penalty approach -- one way . Algebra: | X - d u v^T | + |D u|^2 | X - d u v^T | + u^T Omega u where Omega = D^T D ==> Does not work, or is conceptually odd. We are penalizing the smoothness of unit vector... Proposal: Absorb d in u and/or v and look for a penalty that is invariant under rescaling u -> cu, v -> v/c. 
| X - u v^T | + u^T Omega u * |v|^2 = | X |^2 - 2 tr[ X^T u v^T ] + |u|^2 |v|^2 + u^T Omega u |v|^2 = const - 2 tr[ X^T u v^T ] + [ u^T (I + Omega) u ] |v|^2 ~ - 2 tr[ X^T u v^T ] + [ u^T (I + Omega) u ] |v|^2 Define and substitute: S = (I + Omega)^{-1} = smoother matrix S.1/2 = (I + Omega)^{-1/2} = half-smoother matrix uu = (I + Omega)^{1/2} u u = S.1/2 uu = - 2 tr[ X^T S.1/2 uu v^T ] + |uu|^2 |v|^2 ~ | S.1/2 X |^2 - 2 tr[ X^T S.1/2 uu v^T ] + |uu|^2 |v|^2 = | S.1/2 X - uu v^T |^2 Recipe: Half-smooth the columns of X. Run an svd on the result, obtain uu and v. Half-smooth uu: u = S.1/2 uu . R code: diff.2nd.mat <- function(N) { diff.1st.mat(N)[-(N-1),] - diff.1st.mat(N)[-1,] } diff.2nd.mat(6) # Toy example N <- nrow(X) D <- diff.2nd.mat(N) Omega <- crossprod(D) Omega.eig <- eigen(Omega) lambda <- 10 # then try 1, then 10, then 100, then 1000... S <- Omega.eig$vec %*% diag(1/(1 + lambda*Omega.eig$val)) %*% t(Omega.eig$vec) S.half <- Omega.eig$vec %*% diag(1/sqrt(1 + lambda*Omega.eig$val)) %*% t(Omega.eig$vec) X.half <- S.half %*% (X-mean(X)) X.half.svd <- svd(X.half) X.half.svd.u <- S.half %*% X.half.svd$u par(mfrow=c(2,4), mgp=c(1.5,.5,0), mar=c(3,3,1,1)) for(j in 1:4) plot(-X.half.svd$v[,j], pch=16, type="b") for(j in 1:4) plot(-X.half.svd.u[,j], pch=16, type="b") X.half.svd$d - Approach 2: Quadratic penalty approach -- two way What if we also want to smoothe the second domain (the other dimension)? What two-way penalty would you propose? | X - d u v^T | + u^T Omega.u u + v^T Omega.v v ==> Same violation of scale invariance Next try: (*) | X - u v^T | + u^T Omega.u u * |v|^2 + |u|^2 * v^T Omega.v v This doesn''t lead to the obvious procedure either: We expect that the proper penalty results in two-way half-smoothing. Criterion (*) doesn''t. Here is what we need: (**)| X - u v^T | + u^T Omega.u u * |v|^2 + |u|^2 * v^T Omega.v v + u^T Omega.u u * v^T Omega.v v If you do the math, you will find that the two-way substitution with half-smoothing results in the solution of this criterion: S.u = (I + Omega.u)^{-1} = smoother matrix for u S.v = (I + Omega.v)^{-1} = smoother matrix for v S.u.1/2 = (I + Omega.u)^{-1/2} = half-smoother matrix for u S.v.1/2 = (I + Omega.v)^{-1/2} = half-smoother matrix for v uu = (I + Omega.u)^{1/2} u u = S.u.1/2 uu vv = (I + Omega.v)^{1/2} v v = S.v.1/2 v (**) = - 2 tr[ X^T S.1/2 uu v^T ] + |uu|^2 |v|^2 Recipe: Half-smooth the columns of X. Half-smooth the rows of X. Run an svd on the result, obtain uu and vv. Half-smooth uu and half-smooth vv. Done R code: diff.2nd.mat <- function(N) { diff.1st.mat(N)[-(N-1),] - diff.1st.mat(N)[-1,] } diff.2nd.mat(6) # Toy example N <- nrow(X) p <- ncol(X) D.u <- diff.2nd.mat(N) D.v <- diff.2nd.mat(p) Omega.u <- crossprod(D.u) Omega.v <- crossprod(D.v) Omega.u.eig <- eigen(Omega.u) Omega.v.eig <- eigen(Omega.v) lambda.u <- 10 # then try 1, then 10, then 100, then 1000... lambda.v <- 1000000 # then try 1, then 10, then 100, then 1000... 
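# (An alternative to trial and error for lambda.u and lambda.v: choose each lambda so that the
#  effective df, sum(1/(1 + lambda.u*Omega.u.eig$val)) resp. sum(1/(1 + lambda.v*Omega.v.eig$val)),
#  hits a target value, as was done for the one-way smoothing spline earlier.)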
S.u <- Omega.u.eig$vec %*% diag(1/(1 + lambda.u*Omega.u.eig$val)) %*% t(Omega.u.eig$vec) S.v <- Omega.v.eig$vec %*% diag(1/(1 + lambda.v*Omega.v.eig$val)) %*% t(Omega.v.eig$vec) S.u.half <- Omega.u.eig$vec %*% diag(1/sqrt(1 + lambda.u*Omega.u.eig$val)) %*% t(Omega.u.eig$vec) S.v.half <- Omega.v.eig$vec %*% diag(1/sqrt(1 + lambda.v*Omega.v.eig$val)) %*% t(Omega.v.eig$vec) X.half <- S.u.half %*% (X-mean(X)) %*% S.v.half X.half.svd <- svd(X.half) X.half.svd.u <- S.u.half %*% X.half.svd$u X.half.svd.v <- S.v.half %*% X.half.svd$v par(mfrow=c(2,4), mgp=c(1.5,.5,0), mar=c(3,3,1,1)) for(j in 1:4) plot(-X.half.svd.v[,j], pch=16, type="b") for(j in 1:4) plot(-X.half.svd.u[,j], pch=16, type="b") X.half.svd$d - Cyclic penalty for months: D.v <- diff.2nd.mat(p) rot <- function(x,n) c(x[-seq(n)],x[seq(n)]) D.v <- rbind(rot(D.v[1,],2), rot(D.v[1,],1), D.v) D.v # Looks right Now re-run the above code except the definition of D.v. - Note on the SVD approach: This is a mean model for the table X !!! Neither of the two 'dimensions' (months, years) is randomly sampled. So this breaks out of the mold of MVA where variance is decomposed. The SVD approach above is much more a form of latent mean model. One form of theory assumes X = U.r D.r V.r^T + Z E[X] = U.r D.r V.r^T where Z is a table of same size as X filled with iid N(0,sigma^2) and r is the 'true rank' of the table of expectated values. - Some obvious EDA we haven''t performed so far: . Plot row and column means of the data. plot(apply(X, 2, mean), type="b", pch=16, lwd=2) plot(apply(X, 1, mean), type="b", pch=16, lwd=2) Oh boy, this doesn''t quite look like global cooling at all... So where does the u-pattern come from in the SVD???? . Note that SVDs essentially estimate product interactions. That is, an SVD mean model decomposes the data table into a hierarchy of product interactions: X[i,j] ~ sum u[i]*v[j]. ==> Maybe it makes more sense to apply an SVD to the residuals from an additive fit, i.e., after removing row- and col-means. X.mean <- mean(X) X.row <- apply(X-X.mean,1,mean) X.col <- apply(X-X.mean,2,mean) X.res <- sweep(sweep(X, 1, X.row), 2, X.col) - X.mean X.res.svd <- svd(X.res) round(X.res.svd$d,5) # Sums of squares accounted for by the SVD product terms and the row/col effects: round(sum(X.col^2)*50,5) # SS due to Months round(sum(X.row^2)*12,5) # SS due to Years round(X.res.svd$d^2,5) # SS due to product interactions # ==> # Look at them: par(mfrow=c(2,4), mgp=c(1.5,0.5,0), mar=c(2.5,2.5,.5,.5)) plot(X.col, type="b", pch=16, xlab="Months", ylab="Temp") for(j in 1:3) plot(X.res.svd$v[,j], type="b", pch=16, xlab="Months", ylab="Temp") plot(X.row, type="b", pch=16, xlab="Years", ylab="Temp") for(j in 1:3) plot(X.res.svd$u[,j], type="b", pch=16, xlab="Years", ylab="Temp") These pictures are in sore need of smoothing, especially for the years. - Meta-observations: . In the two-way regularization problem we 'knew' what the 'obvious' algorithm was: Half-smooth both rows and columns of X, perform an SVD on the result, then half-smooth both the u- and v-vectors. . The problem was what criterion this algorithm minimizes. We managed to reverse-engineer the criterion from the algorithm. It seemed to make sense. . The minimization criterion can then be used for generalizations. Example: Propose one-way and two-way lasso-penalized SVDs! ... (squared lasso) ... ---------------------------------------------------------------- - FUNCTIONAL PCA: . So when do we have a functional PCA situation? 
    o The temperature data considered so far are two-way functional,
      and there is no randomly sampled dimension to provide variance.
    o For multivariate analysis we need one index to represent random samples.
      Examples:
      + Body temperature time series on randomly sampled individuals
      + Stock return series on randomly sampled small businesses
      + Energy consumption time series of randomly sampled residences
      + DNA sequences on randomly sampled patients

  . Subspace fPCA:

        V[ Xvec^T a ]
        -------------     max/min/stationary     subject to:  a = B aa
           |a|^2

  . Penalized fPCA:
    Once again you can use the null comparison principle.
    What is the numerator criterion under the assumption of absent correlation?

        V[ Xvec^T a ] + a^T Omega a
        ---------------------------     max/min/stationary             (*)
           |a|^2 + a^T Omega a

    assuming the variables are standardized.  Else:

        V[ Xvec^T a ] + a^T Omega a
        ----------------------------    max/min/stationary
        sum aj^2*V[Xj] + a^T Omega a

  . If the null assumption includes equal variances,
    then (*) is the criterion without standardization.

- Functional CCA: subspace restricted or penalized

  . Subspace fCCA:

        V[ Xvec^T a + Yvec^T b ]
        -------------------------    max/min/stationary    subject to:  a = B.a aa,  b = B.b bb
        V[Xvec^T a] + V[Yvec^T b]

  . Penalized fCCA:

        V[ Xvec^T a + Yvec^T b ] + a^T Omega.X a + b^T Omega.Y b
        -------------------------------------------------------------    max/min/stationary
        V[ Xvec^T a ] + V[ Yvec^T b ] + a^T Omega.X a + b^T Omega.Y b

- Functional GCA: subspace restricted or penalized ... you get it...

- Q: Which is more in need of regularization, PCA or CCA?
  A: ...

- Curiosity: B. Silverman co-authored two approaches to fPCA,
  neither of which agrees with the above.  Here they are:

        V[ Xvec^T a ] - a^T Omega a
        ---------------------------    max/stationary    (Rice & Silverman 1991)
                 |a|^2

          V[ Xvec^T a ]
        -------------------    max/stationary    (Silverman 1994)
        |a|^2 + a^T Omega a

  Both approaches work only for the upper end of the PCA spectrum.

----------------------------------------------------------------

* Self-serving literature:
  - Buja, Hastie, Tibshirani (1989, AS, w discussion)
    "Linear Smoothers and Additive Models"
  - Buja (1990, AS)
    "Remarks on Functional Canonical Variates, Alternating Least Squares Methods and ACE"
  - Hastie, Tibshirani, Buja (1994, JASA)
    "Flexible Discriminant Analysis by Optimal Scoring"
  - Hastie, Buja, Tibshirani (1995, AS)
    "Penalized Discriminant Analysis"
  - Donnell, Buja, Stuetzle (1994, AS, with discussion)
    "Analysis of Additive Dependencies and Concurvities Using Smallest Additive Principal Components"
  - Tan, Buja, Ma (2014)
    "Regularized Additive Principal Components"
  - Huang, Shen, Buja (2009, JASA)
    "The Analysis of Two-Way Functional Data Using Two-Way Regularized Singular Value Decompositions"

* Real literature:
  - Leurgans and Moyeed on penalized CCA:
    fully agrees with our penalized CCA above
  - Silverman (and Rice) on regularized PCA:
    interesting: neither gets the penalizations 'right'
  - books by Ramsay and Silverman (different approach)
  and much more ...

================================================================
#
# K-MEANS CLUSTER ANALYSIS
#
# The K-means clustering algorithm tries to divide the data
# into a pre-specified number of groups (K) in such a way
# that the groups are somehow homogeneous and separated
# from each other if possible.
#
# The algorithm works by placing K tentative cluster centers
# into data space.  The algorithm then moves the centers
# around till a certain criterion is minimized.
# The criterion is the sum of squared distances to the
# nearest cluster center:
#
#   sum_{i=1..N} min_{j=1..K} || x_i - m_j ||^2   -->   min over m_1,...,m_K
#
# where m_1,..., m_K are the cluster centers in p-space
# and x_1,..., x_N are the N data vectors/rows/cases/units/records.
# Obviously, the k-means clustering algorithm assumes quantitative
# variables.
# About the algorithm: How are the centers m_j moved around?
# Here are some heuristics: Each data row x_i "belongs" to one of the m_j,
# namely, the one that is nearest to x_i.
# Now turn things around: m_j has a collection of x_i's that "belong" to it
# because m_j is the nearest center.
# Now hold the collection of these x_i's fixed and ask: where should m_j
# be located to minimize the criterion?  Answer: m_j should be the
# mean of "its" x_i's.  That's because we are using a sum-of-squares criterion!
# So let's move m_j to the mean of "its" x_i's, and do it for all centers m_j.
# Having done this, we should wonder whether the assignments of x_i's to
# nearest m_j's might not have changed.  In fact, it will have changed,
# so we should do a re-assignment of the x_i's.
# Now you see how the game repeats:
#   Given centers m_j, assign the x_i's to nearest m_j.
#   Replace the m_j's with the means of "their" x_i's.
#   Having new centers m_j, re-assign the x_i's to nearest m_j.
#   ... till convergence.
# Is k-means a "clustering" algorithm?  Maybe it is better to think
# of it as a "partitioning" algorithm.  It partitions the data into
# as many groups as you tell it to.  Let's not ask the question
# of the "correct number of clusters" yet.  In fact, k-means is
# often applied when there are no clusters at all.  In marketing,
# for example, clustering really means "segmentation", that is,
# carving up into homogeneous segments, not natural clusters.
# There may exist better methods for finding natural clusters
# than k-means.
# Example in R: function 'kmeans()', data: telecom marketing
# We use a random selection of K=4 rows as initialization of
# the cluster centers.
dat <- scale(mktg[,-9])
N <- nrow(dat)
p <- ncol(dat)
K <- 4   # decide on 4 clusters/segments
mktg.km <- kmeans(dat, iter.max=1000, centers = dat[sample(N, size=K),] )
# Note that the number of clusters is implicit in the dimensions of 'centers';
# no need for a special argument for K.
# Examine the returned data structure:
class(mktg.km)
names(mktg.km)
# The following lists the breakdown into cluster sizes:
mktg.km$size
# The final centers are the following:
mktg.km$centers
# The assignment of the rows of 'mktg' to the clusters is in:
mktg.km$cluster
# But this grouping variable should be redundant with the size component:
table(mktg.km$cluster);  mktg.km$size   # Indeed.
# In what follows we will use a repackaged function that
# picks K random rows for initialization.  It is loaded with
source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/segs.R")
# source("functions/segs.R")   # Instructor's local source
# and its call is 'k.means(x, K=4)'.  (Look at the content.)
# It has functions also to align multiple clusterings,
# choosing labels as best matched as possible to each other.
# (A stand-alone sketch of this alignment idea follows below.)
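# The label alignment just mentioned is essentially a small matching problem:
# relabel a clustering so that it agrees as much as possible with a reference
# clustering.  Here is a minimal stand-alone sketch of the idea (NOT the code
# in 'segs.R'; the function name 'align.labels' is made up here):
# for small K we can simply search all K! relabelings.
align.labels <- function(clust, ref, K=max(ref)) {
  perms <- function(v) {                   # all permutations of a short vector
    if(length(v) <= 1) return(list(v))
    out <- list()
    for(i in seq_along(v)) for(p in perms(v[-i])) out <- c(out, list(c(v[i], p)))
    out
  }
  best <- clust;  best.err <- sum(clust != ref)
  for(p in perms(1:K)) {                   # p[j] = new label for old label j
    cand <- p[clust]
    err <- sum(cand != ref)
    if(err < best.err) { best <- cand;  best.err <- err }
  }
  best
}
# Quick check on made-up labels: a random relabeling should be undone exactly.
# cl <- sample(1:4, 100, replace=TRUE);  ref <- sample(1:4)[cl]
# table(align.labels(cl, ref), ref)       # all mass on the diagonal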
# # We'll look at the clusters in the first four principal component: mktg.pca <- pca(mktg[,-9]) # So here are the first 4 PCs colored according to windows(10,10); par(mgp=c(1.8,.5,0), mar=c(3,3,2,1)) xlim <- ylim <- range(mktg.pca$scores) pairs(mktg.pca$scores[,1:4], pch=16, cex=.2, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[mktg.km$cluster]) # our clustering:^^^^^^^^^^^^^^^ # Now this looks different from the original clustering/segmentation: windows(10,10); par(mgp=c(1.8,.5,0), mar=c(3,3,2,1)) pairs(mktg.pca$scores[,1:4], pch=16, cex=.2, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[mktg[,9]]) # the original clustering:^^^^^^^^ # Now let us compare a sequence of clusterings (we now omit the axes): par(mfrow=c(4,4), mar=rep(.5,4)) for(i in 1:16) { clust <- k.means(scale(mktg[,-9]), K=4)$cluster # using random restarts plot(mktg.pca$scores[,1:2], pch=16, cex=.2, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n", xlab="", ylab="", col=c("red","green","blue","brown")[clust]) } # There is a distraction in that the clusters are not matched in colors. # This can be helped: our 'k.means()' function can be given a clustering # vector, and the newly created solution will be immediately aligned # with the given clustering. This is done by giving 'k.means()' the # argument 'K=clustering.to.be.aligned.with'. In fact, the full name # of this argument is 'K.or.align'. If this argument is given, then # K is inferred from it. # # Here are the original clustering with 15 randomly initialized clusterings # aligned with the original clustering: plot(mktg.pca$scores[,1:2], pch=16, cex=.2, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n", xlab="", ylab="", col=c("red","green","blue","brown")[mktg[,9]]) # the initial segmentation for(i in 1:15) { # alignment below... clust <- k.means(dat, K.or.align=mktg[,9])$cluster # ^^^^^^^^ align colors with the original segs plot(mktg.pca$scores[,1:2], pch=16, cex=.3, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n", xlab="", ylab="", col=c("red","green","blue","brown")[clust]) } # So there seems to exist a degree of dependence on the initialization! # Comparing the 16 plots, however, it appears that there are several # clusterings that look identical. So maybe there are only a few discrete # local solutions for the kmeans criterion? # # If the k-means algorithm finds different solutions, how do they # perform in terms of the k-means criterion? Let's collect a # list of 1000 clusterings and analyze the criterion values: mktg.km.lst <- list() # takes a moment: <2sec/100 kmeans for(i in (length(mktg.km.lst)+1):1000) { if(i%%100==0) cat(i,"") mktg.km.lst[[i]] <- k.means(dat, K=mktg[,9]) } # (Among many calls to kmeans() it can happen that one of them crashes; # rerun, and it will continue from where it crashed. This is fail-safe code.) # # Peel off the criterion values: mktg.km.crit <- sapply(mktg.km.lst, function(x) sum(x$withinss)) # Look at them: windows(10,10) hist(mktg.km.crit, breaks=10000, col="gray", xlim=c(9100,9700)) # Oh, there are discrete sets of values that appear many times over!! 
# In fact, there are only few different values of the criterion: length(unique(mktg.km.crit)) ## [1] 29 # Let's collect them, sorted from smallest (best) to largest (worst): mktg.km.crit.uniq <- sort(unique(mktg.km.crit)) # and tabulate them: table(mktg.km.crit) 9240.92733734854 9313.91364263915 9313.91740353386 9313.97250010717 277 114 34 80 9314.0971039678 9319.76071481234 9319.8238068561 9319.91076889382 116 172 31 1 9336.89070097278 9336.90739225535 9336.9346563692 9370.29454375176 48 15 25 6 9370.4148403249 9377.17397780075 9377.31085261402 9449.83056805507 5 17 1 11 9450.00997410455 9450.41265146235 9487.3014103005 9487.32581863884 8 11 8 6 9488.15924588206 9491.312016306 9530.79434513218 9530.79668858917 1 2 5 4 9569.88679967423 9577.3159323069 1 1 # Turns out some of the values are nearly identical, such as the # 2nd, 3rd and 4th. # Could the values correspond to different geometric patterns of the # cluster centers? How could we confirm this hunch? Let's plot # in a 4x4 matrix layout again, but for one particular value # of the criterion at a time. Execute the following repeatedly, # but pick different values from the vector 'mktg.km.crit.uniq': icrit <- 1 # try for 1, 2, 3, ...; the values are sorted, best first! crit <- mktg.km.crit.uniq[icrit] windows() par(mfrow=c(4,4), mar=rep(.5,4)) xlim <- ylim <- range(mktg.pca$scores[,1:2]) for(i in 1:16) { clust <- mktg.km.lst[mktg.km.crit==crit][[i]]$cluster plot(mktg.pca$scores[,1:2], pch=16, cex=.2, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[clust], xaxt="n", yaxt="n", xlab="", ylab="") } text(x=par()$usr[2], y=par()$usr[3], lab=paste(icrit,":",crit), adj=c(1.1,-.2) ) # Interestingly, icrit=1, with crit = 9240.927, produces something # different from the segmentation that comes with the data in column 9. # A pattern like column 9 is found for icrit=2 with crit = 9313.914, # which is only second best! How come the analysts who produced # the segmentation of column 9 did not hit on the "best" clustering # in terms of the criterion? They probably did but preferred the # "second best" pattern anyway, most likely due to interpretability. # (In a moment we will see that the column 9 pattern is actually # worse: it belongs to the 5th largest criterion value...) # Next question: are the cluster assignments identical for all clusterings # with the same criterion value? So far the plots sure look like it. # In the following code we look at each criterion value at a time, # and for each value we check whether the cluster memberships are # identical. Realize that this exercise makes sense only because # we aligned the clusterings. Without alignment, identical clusterings # would differ by a random permutation of the cluster numberings. for(icrit in 1:length(mktg.km.crit.uniq)) { crit <- mktg.km.crit.uniq[icrit] # the unique sorted criterion values sel <- mktg.km.crit==crit # these are the solutions with criterion==crit if(sum(sel)>1) { # nothing to compare if only one clustering for this crit value clust1 <- mktg.km.lst[sel][[1]]$cluster # pick the first as reference for(i in 2:sum(sel)) # compare the others with the reference if(any(clust1 != mktg.km.lst[sel][[i]]$cluster)) # report only if different cat("differences found for",icrit,":", sum(clust1 != mktg.km.lst[sel][[i]]$cluster),"\n") }} # No complaint, all are identical clusterings in terms of case assignments! 
# Hence we can limit ourselves to one clustering per criterion value:
mktg.km.lst.red <- list()
for(icrit in 1:length(mktg.km.crit.uniq)) {
  crit <- mktg.km.crit.uniq[icrit]   # the unique sorted criterion values
  sel <- mktg.km.crit==crit          # these are the solutions with criterion==crit
  mktg.km.lst.red[[icrit]] <- mktg.km.lst[sel][[1]]   # pick the first; all others are equal
}
# check: length 29
length(mktg.km.lst.red)
# We lost the information about the number of times each clustering appeared in
# the 1000 random restarts, but this is not essential for what follows.
# From now on we're working with this reduced set that has redundant
# clusterings removed.
# Here is a plot of the first 16 of the 29 clusterings in the first 2 PCs:
par(mfrow=c(4,4), mar=rep(.5,4))
xlim <- ylim <- range(mktg.pca$scores[,1:2])
for(i in 1:16) {
  crit <- mktg.km.crit.uniq[i]
  clust <- mktg.km.lst.red[[i]]$cluster
  plot(mktg.pca$scores[,1:2], pch=16, cex=.2, xlim=xlim, ylim=ylim,
       xlab="", ylab="", xaxt="n", yaxt="n",
       col=c("red","green","blue","brown")[clust])
  text(xlim[2], ylim[2], lab=paste("crit:",round(crit,4)), adj=c(1,1))
  text(xlim[2], ylim[1], lab=paste("freq:", sum(mktg.km.crit==crit)), adj=c(1,0))
}           # freq of this crit value in multiple restarts:^^^^^^^^^^^^^^^^^^
# Well, the best solution with criterion ~ 9241 is numerically a league better
# than the others, but the next 7 patterns look very similar to each other.
# Let us now check which of the 29 clusterings has the same
# partitions as the column 9 segmentation by counting the
# mismatches in the cluster numberings.  Recall that this works
# because mktg[,9] is the reference for alignment of the numberings:
mktg.km.mis.9 <- sapply(mktg.km.lst.red, function(x) sum(x$cluster != mktg[,9]) )
names(mktg.km.mis.9) <- round(mktg.km.crit.uniq,4)   # use the criterion values as names
mktg.km.mis.9
# So, it is only the 5th that is identical to the column 9 clustering!
# Clusterings 2, 3, 4 are close, though, especially in view of N=1926.
# Let us get a more complete picture by forming a table of mismatch counts
# of all pairs of clusterings:
mktg.km.mis <- matrix(0, length(mktg.km.crit.uniq), length(mktg.km.crit.uniq))
for(i in seq(mktg.km.crit.uniq))
  for(j in seq(mktg.km.crit.uniq))
    mktg.km.mis[i,j] <- sum(mktg.km.lst.red[[i]]$cluster != mktg.km.lst.red[[j]]$cluster)
options(width=120)
mktg.km.mis
# We can make out patterns by going down the diagonal where we notice
# square blocks of small numbers that indicate groups of clusterings
# that are very close to each other.  Here is the 13x13 square from the top left:
mktg.km.mis[1:13,1:13]
#----------------
      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
 [1,]    0  810  809  806  814  791  794  792  823   821   821   836   835
 [2,]  810    0    3    4   10  104  105  101  492   490   494   604   615
 [3,]  809    3    0    7    7  101  102   98  491   489   493   603   614
 [4,]  806    4    7    0   14  108  109  105  491   489   493   607   618
 [5,]  814   10    7   14    0   94   95   91  490   488   492   600   611
 [6,]  791  104  101  108   94    0    6   10  429   433   437   546   557
 [7,]  794  105  102  109   95    6    0    6  424   428   432   541   552
 [8,]  792  101   98  105   91   10    6    0  426   430   434   543   554
 [9,]  823  492  491  491  490  429  424  426    0     6    10   207   218
[10,]  821  490  489  489  488  433  428  430    6     0     4   205   216
[11,]  821  494  493  493  492  437  432  434   10     4     0   202   213
[12,]  836  604  603  607  600  546  541  543  207   205   202     0    11
[13,]  835  615  614  618  611  557  552  554  218   216   213    11     0
#----------------
# We see that the first and best pattern is nothing like anything else.
# The next 4 patterns form a block and are all close to each other.
# Then comes a block of 3 patterns that are very close, # then another block of 3 patterns that are very close. # Did you notice what we did? We clustered the clustering patterns... # In light of this table it shouldn't be a surprise that clusterings # 2-5 looked about the same. Actually, even clusterings 6-9 have # small mismatch numbers with clusterings 2-5 percentagewise, # and even they look all alike. The next group of clusterings, 9-11, # look alike but different from everything else. # Summary points so far: # 1) K-means analysis has an issue (not a problem) of local minima, # and one needs methodology to get an overview of them: # collecting and analyzing many multiple restarts. # The analysis we used consisted of # a) listing and sorting the unique values of the criterion # b) checking that all clusterings for a given criterion value # are identical # c) plotting the clusterings of each criterion value as # colors in the first two principal components of the data # d) computing a mismatch matrix for all pairs of clusterings # e) comparing the mismatch matrix with the similarities among # the colored PC projection plots # 2) It seems the "best" clustering is not always the most interpretable. # At least that's what is suggested by the fact that the column 9 # segmentation that comes with the data is not numerically optimal. # Yet, the marketers who decided in its favor most likely had # interpretability in mind in their "suboptimal choice". # # 3) For comparing clusterings, one needs to align the numberings # of clusters across clusterings. But for alignment, one needs a # reference clustering. For the mktg data we used the column 9 # clustering as reference, but in other situations one may first # have to play with clusterings before one picks a reference for # aligning the other clusterings. #---------------- # # We continue with a systematic simulation study of # the geometry of kmeans clusterings and their (in)stabilities # Illustration: # Consider normal 2-D data with pre-defined correlations. # Plot the kmeans solutions for K=4 for varying correlations. N <- 20000 x1 <- rnorm(N); xa <- rnorm(N) # start with uncorrelated variables par(mfrow=c(6,6), mar=rep(.5,4))# for(co in seq(.00, .99, length=36)) { # loop over correlations ## for(co in seq(.6, .7, length=36)) { # zoom in on switch of pattern xx <- scale(cbind(x1,co*x1+sqrt(1-co^2)*xa)) xlim <- ylim <- 3.5*(c(-1,1)) clust <- kmeans(xx, xx[1:4,])$cluster plot(xx, xlim=xlim, ylim=ylim, pch=".", cex=0.5, col=c("red","green","blue","brown")[clust], xlab="", ylab="", xaxt="n", yaxt="n") text(x=-3.4, y=3.4, lab=round(co,8), adj=0, cex=.8) } # Something happens between correlation 0.6 and 0.7: # an abrupt switch from one geometric pattern to another. # One could think of the pattern for lower correlations # as slicing a pie, for higher correlations as slicing # a cigar. # # If in a practical situation the data are correlated # in a way that is conducive to multiple near-optimal patterns, # they can be found with multiple restarts, as we did with # the marketing data. 
This shows that these patterns are # indeed local minima of the kmeans criterion: co <- 0.648 # (choose near the switch of patterns for current x1 and xa) xx <- scale(cbind(x1,co*x1+sqrt(1-co^2)*xa)) for(irestart in 1:36) { clust <- k.means(xx, K=4)$cluster # our function plot(xx, xlim=xlim, ylim=ylim, pch=".", cex=0.5, col=c("red","green","blue","brown")[clust], xlab="", ylab="", xaxt="n", yaxt="n") } # # Q: What are the implications for statistical inference # for kmeans clustering? # Does bootstrap, for example, makes sense near correlation 0.6? # What is bootstrap going to tell you? # Let's try: pick a correlation near the switch point, and bootstrap. for(iboot in 1:36) { xb <- scale(xx[sample(N, replace=T),]) clust <- kmeans(xb, xb[1:4,])$cluster plot(xb, xlim=xlim, ylim=ylim, pch=".", cex=0.5, col=c("red","green","blue","brown")[clust], xlab="", ylab="", xaxt="n", yaxt="n") } # Indeed, bootstrap keeps flip-flopping between patterns, # with a strange attempt at a transition pattern at times. # Possible lesson: Yes, do bootstrap, but beware that the # multiplicity of solutions might be of a geometric nature. # # Come to think of it, we found multiple patterns in the # telecom marketing data also, just with multiple restarts! # The pattern had the colors blue and brown overlapping, # indicating that it needed a third dimension to accommodate # its four cluster centers: xlim <- ylim <- range(mktg.pca$scores[,1:2]) icrit <- 1 clust <- mktg.km.lst.red[[icrit]]$cluster pairs.plus(mktg.pca$scores[,1:4], pch=16, cex=.2, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[clust]) # It appears that blue and brown are best separated in the positive # diagonal direction of PCs 3 and 4, a fact that is somewhat obscured # by the green and red clusters that are smudged over everything. # #---------------- # # Summary points: # # 1) Patterns in k-means clusterings have to do with the # geometry of the point scatter. If the point scatter is just a # multivariate normal blob, the geometry is fully described by the # covariance structure. One should therefore not be surprised if # there is a connection between PCA and k-means partitions. # Both respond to the geometry of the best fitting ellipsoid # round the data. # # 2) For normal (Gaussian) point scatters it is quite plausible # that the k-means solutions are fully determined by the # covariance structure. A little more surprising is that # this seems to hold for such instances as the marketing # data as well, where most of the variables are discrete. # Who would have expected a pie-slicing pattern in the # first two PCs very similar to the bivariate normal simulation # with correlations below 0.6. # 3) Note that we said "covariance structure". In fact, # k-means is a metric method in that it relies on # Euclidean distances. Depending on how the variables # are scaled, one can get very different k-means results. # The issue should become clear by imagining data with two # variables whose standard deviations are 1 and 100, respectively. # K-means will consider this as a cigar and slice it on the # second variable, never mind the correlation. If one wants the # correlation structure to drive the clustering, one should # scale all variables to unit variance, so covariance and # correlation are identical. In general, one should make # sure the variables are on comparable scales, and the # conventional 'equalization' of scales is by standardization # of the variables (z-scores: subtract from each variable its mean, # divide it by its stdev). 
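# A tiny illustration of the scaling point in 3) above, on made-up data
# (not one of the course datasets): two independent normal variables with
# standard deviations 1 and 100.
set.seed(1)
toy <- cbind(x1=rnorm(2000, sd=1), x2=rnorm(2000, sd=100))
toy.km.raw <- kmeans(toy,        centers=4, nstart=20)   # unscaled
toy.km.std <- kmeans(scale(toy), centers=4, nstart=20)   # standardized
par(mfrow=c(1,2), mgp=c(1.8,.5,0), mar=c(3,3,2,1))
plot(toy, pch=".", col=c("red","green","blue","brown")[toy.km.raw$cluster],
     main="raw scales: slices essentially along x2")
plot(scale(toy), pch=".", col=c("red","green","blue","brown")[toy.km.std$cluster],
     main="standardized: symmetric 'pie' slicing")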
#---------------- # # Interpretation of cluster centers: # # Like regression coefficients and PCA loadings, # the kmeans centers are a major source for interpretation. # In what follows we plot cluster centers as profiles in # what is known as a "parallel coordinate plot": windows(wid=16, hei=12) par(mgp=c(1.8,.5,0), mar=c(2,2,1,1), mfcol=c(5,2)) for(icrit in 1:10) { # number of pattern; recall: icrit=5 ~ mktg[,9] crit <- mktg.km.crit.uniq[icrit] mktg.ctrs <- mktg.km.lst.red[[icrit]]$centers round(mktg.ctrs, 2) plot(x=c(.5,8.5), y=range(mktg.ctrs), type="n", xaxt="n", xlab=NA, ylab=NA) axis(side=1, at=1:ncol(mktg.ctrs), lab=colnames(mktg.ctrs)) lines(x=c(rbind(1:ncol(mktg.ctrs),1:ncol(mktg.ctrs),NA)), y=rep(c(par()$usr[3:4],NA),ncol(mktg.ctrs)), col="gray95") lines(par()$usr[1:2], c(0,0), col="gray95") for(i in 1:nrow(mktg.ctrs)) lines(mktg.ctrs[i,], col=c("red","green","blue","brown")[i]) text(x=par()$usr[2], y=par()$usr[3], lab=icrit, adj=c(1.5,-0.4)) text(x=par()$usr[2], y=par()$usr[4], lab=round(crit,3), adj=c(1.05,1.2)) } # The center coordinates indicate 'where' the cluster sits. # Judge their meaning in terms of: # - the first three variables: call volume # - the next three variables: dummies for needs # - age # - education # Which of these sets of centers is most interpretable? # Recall that we have natural groups of clusterings: # 1 by itself, 2-5 pretty much like mktg[,9], 6-8 also similar, # 9-10 different from all else. # (clustering 1 is interesting, and new to us: # red group by itself makes high volums, all else low vol; # blue high on business; green old age with low vol and no features) #---------------- # # Summary points about K-means clustering: # # - K-means is more often than not used to segment # data that do not fall into natural groups. # Segmenting of ungrouped data can be meaningful. # # - K-means arranges cluster centers in geometric patterns; # these patterns are local minima of the K-means criterion. # # - Which pattern solution one prefers depends not only on # the K-means criterion value but also on interpretability. # # - Flip-flopping between patterns can be an impediment # to statistical inference: one could conceive of # confidence regions for the cluster centers given # a fixed pattern, but the presence of multiple patterns # sabotages this idea. # # - PCA and K-means are naturally connected. In fact, there # exists theory according to which the K-means centers of # multivariate normally distributed data fall in PC # subspaces of large eigenvalues (Tarpey & Flury). # Therefore, plotting the first few PC projections with # color coded clusters may reveal the K-means patterns. # #---------------- # # Open questions: # # We have not addressed the choice of 'K', the number of # clusters. There exists a literature on this topic, # repleat with quasi-F-tests and the like. # I find these formal methods mostly irrelevant or even # misleading. If there are natural groups, you should # try to see them in a dimension reduction, such as a # scatterplot matrix of largest few principal component # projections. If this doesn't work try a non-linear # dimension reduction method such as multidimensional # scaling (MDS). [Ask the instructor about MDS; he loves # to talk about it.] # In soft sciences such as marketing and # social sciences in general, there are rarely natural # clusters present, hence clustering is really segmentation, # and the question of 'correct number of clusters' does not # exist. 
If anything, one should wonder whether the chosen # number of clusters is compatible with the geometry of the # data cloud in p-space. For example, I find K=5 generally # not congenial for situations where the first two PCs # cover a majority of variance. K-means then carves an # ellipsoid in 2-D into 5 slices, which just doesn't seem # right because it violates the symmetry of an ellipsoid. # If the data projected onto the first two PCs looks not # ellipsoidal but more like a polygon with odd number of # corners, then slicing into 3 or 5 might be sensible. # # #---------------------------------------------------------------- # # * ANOTHER TYPE OF CLUSTERING: HIERARCHICAL METHODS # # A very different type of cluster analysis is based on the idea # of hierarchical trees. Their purpose is to find natural groups, # unlike K-means, which is more often used for segmentation, that is, # carving up data that don't have natural groups. # Hierarchical methods work in a bottom up fashion: # they recursively form partitions of the data by # joining groups that are near each other in some sense. # They are also called 'agglomerative' because they # recursively form unions of pairs of groups. # Here are the steps: # - The input is a distance or dissimilarity matrix of size NxN. # Each entry is a proximity measure for a pair of points. # - One initializes the algorithm with the finest possible # partition in which every data point is its own group. # - One then joins the nearest two points to form the first # non-trivial group. # - Next one computes a distance measure of this two-point group # from all other one-point groups. This way one obtains an # (N-1)x(N-1) distance matrix. # - Now repeat: join the two nearest elements, which may be joining # another two points into a two-point group, or the existing # two-point group with another point into a three-point group. # - Which ever way, one again computes a distance of the newly # merged group from all others, resulting in an (N-2)x(N-2) # distance matrix. # - Recurse... # The output of this algorithms is represented as a hierarchical # tree which depicts all mergers of groups. # # Here is an example: the (famous) Swiss Fertility data # str(swiss) pairs.plus(swiss) rownames(swiss) # (Background: Cases = districts of French speaking Switzerland # in the late 1800s, the days of industrialization. # Social scientists, demographers and historians are # interested in the dependence of fertility on # socio-economics in that period. plot(Education ~ Catholic, data=swiss, pch=16, xlim=c(-20,120), ylim=c(-10,65)) with(swiss, identify(Catholic, Education, rownames(swiss))) # Again, one should scale the data first, then... 
swiss.dist <- dist(scale(swiss)) # see: help(dist) swiss.hclust <- hclust(swiss.dist) # see: help(hclust) plot(swiss.hclust) # Choose arguments for a less cluttered views: plot(swiss.hclust, hang=-1, main="", sub="", ylab="", xlab="", cex=1) # # The function 'hclust()' offers seven methods that # differ in how they form the reduced distance matrix after # each merger step: hclust.methods <- c("ward", "single", "complete", "average", "mcquitty", "median", "centroid") swiss.hclust <- hclust(swiss.dist, method="single") # see: help(hclust) # We'll try them out before we give more explanations: windows(12,8) par(mfrow=c(3,3), mgp=c(1.8,.5,0), mar=c(2,3,2,1)) for(meth in hclust.methods) plot(hclust(swiss.dist, method=meth), hang=-1, xlab="", main=meth, cex=.45) # # Here is the explanation from 'help(hclust)': # - WARD''S MINIMUM VARIANCE method aims at finding compact, spherical # clusters. # - The COMPLETE LINKAGE method finds similar clusters. # - The SINGLE LINKAGE method (which is closely related to the minimal # spanning tree) adopts a 'friends of friends' clustering strategy. # - The other methods can be regarded as aiming for clusters with # characteristics somewhere between the single and complete link # methods. # Here are details for some of the methods: # for two existing clusters A and B, the methods compute the # distance d(A,B) as follows. # - SINGLE LINKAGE: the minimum distance between points in A and B # - COMPLETE LINKAGE: the maximum distance between points in A and B # - AVERAGE LINKAGE: the mean of distances between points in A and B # - WARD'S METHODS: the mean of the squared distances between all # points in the union of A and B # - MEDIAN LINKAGE: the median of distances between points in A and B # These methods, except for the last, have simple updating rules # that facilitate the recursion: # - SINGLE LINKAGE: d(AuB,C) = min(d(A,C),d(B,C)) # - COMPLETE LINKAGE: d(AuB,C) = max(d(A,C),d(B,C)) # - AVERAGE LINKAGE: d(AuB,C) = (N_A*d(A,C)+N_B*d(B,C))/(N_A+N_B) # - WARD'S METHODS: SS(A,B) := sum{i in A, j in B} dij^2 # S(A) := SS(A,A) # V(A) := SS(A)/N_A/(N_A-1)/2 # d(A,B) := V(AuB) # Note: # SS(AuB) = SS(A)+SS(B)+SS(A,B) # hence one keeps track of all SS(A)'s and # all SS(A,B)'s and collects them in a diagonal # Which method gives the most useful results depends on the data # and on the purpose. # # As mentioned, a major purpose of these methods is to # find natural groups in multivariate data. Whether the # 'clusters' are natural or not should be checked graphically. # To this end, one needs a method for cutting a clustering tree # at a given level. The function 'cutree()' serves for this purpose: # it allows us to cut at a given 'height' of the tree # (on the vertical axis of the tree plot), # or for a given number of clusters. # We cut the 'swiss' fertility data into 6 clusters and # check what the various methods produce. 
# First we plot the tree once more: imeth <- 2 meth <- c("ward", "single", "complete", "average", "mcquitty", "median", "centroid")[imeth] # play with method swiss.hclust <- hclust(swiss.dist, method=meth) windows(8,8) plot(swiss.hclust, hang=-1, main=meth, xlab="", ylab="", sub="", cex=.75) # Cut the tree so we obtain six groups: swiss.grps <- cutree(swiss.hclust, k=6) # Tabulate so we know the sizes of the groups: table(swiss.grps) # Matrix-plot the data with color-coding of the groups: swiss.grps <- order(order(table(swiss.grps)))[swiss.grps] windows(10,10) clrs <- c("black","brown","green","blue","red","orange","magenta") pairs.plus(swiss, col=clrs[swiss.grps], main=meth, cex=.8) # (The first line is arcane: it orders the groups so their # number id is matched to colors: small groups in dark colors, # large ones in bright colors:) table(swiss.grps) # # Some comments on our experiences with these methods on this data: # - Ward's method does not work; there are no spherical clusters. # - 'Complete linkage' does a reasonable job: it isolates Geneva # and splits both catholics and protestants into two groups. # - 'Single linkage' does an extremely meaningful job: it # separates catholics and protestants in two groups and # isolates 3 singletons and one pair. # Single linkage is known to 'suffer' from 'chaining', that is, # clusters it creates can consist of long strands of points. # Maybe this isn't a real disadvantage. # Remaining mystery: what makes La Vallee so special?... ggobi #---------------- end of class # # Some artificial data with natural clusters, so we can find # out whether either K-means or hierarchical clustering finds them. # The following construction puts standard normal clusters at # 4 times the unit vectors (the vertices of the natural simplex): K <- 6 # number of natural clusters N <- K*100 # number of points per cluster = 100 p <- 6 # number of variables s <- 4 # cluster separation parameter: # center dists = s*sqrt(2)=5.66 clusdat <- matrix(rnorm(N*p), ncol=p) clusdat <- clusdat + s * ( ((row(clusdat)-1)%%K) == (col(clusdat)-1) ) pairs.plus(clusdat, cex=.3) # Make it clear that in each frame we are supposed to see two detached # clusters (top and right), and the four others plotted on top of each # other in the bottom left. # # - Does K-means find the clusters? clusdat.km <- k.means(clusdat, K=6) pairs.plus(clusdat, col=clrs[clusdat.km$cluster], cex=.3) # Great! K-means finds them, if we give it the proper cluster number # What if we give it the wrong number? clusdat.km <- k.means(clusdat, K=2) pairs.plus(clusdat, col=clrs[3+clusdat.km$cluster], cex=.3) # K=2: Lumps clusters together. # What about too many clusters? clusdat.km <- k.means(clusdat, K=7) pairs.plus(clusdat, col=clrs[clusdat.km$cluster], cex=.3) # some clusters are merged, others split... # K-means seems to be sensitive to choice of K. # # - Do hierarchical methods find the clusters? imeth <- 4 meth <- c("ward", "single", "complete", "average", "mcquitty", "median", "centroid")[imeth] # play with method clusdat.hc <- hclust(dist(clusdat), method=meth) #windows(8,8) plot(clusdat.hc, hang=-1, main=meth, xlab="", ylab="", sub="") # Indeed, they all do. Seems the problem was too easy. # We could reduce the separation of the centers, # which was large: 4*sqrt(2)=5.66 # At least there is comfort in the fact that clustering methods # do find the obvious natural clusters. 
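# A quick numerical check of cluster recovery, continuing with 'clusdat' and
# 'clusdat.hc' from above: cross-tabulate the tree cut at k=6 against the
# true memberships implied by the construction (row i belongs to cluster
# ((i-1) %% K) + 1).
clusdat.true <- ((seq_len(nrow(clusdat)) - 1) %% K) + 1
clusdat.cut  <- cutree(clusdat.hc, k=6)
table(true=clusdat.true, found=clusdat.cut)
# Up to a relabeling of the 'found' clusters, each row of this table should
# concentrate in a single column if the method recovered the clusters.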
#
#
#================================================================

KERNELIZING INTRO

* Purpose: Background for
  - smoothing splines and their generalizations
  - generalized quadratic penalty methods
  Source: Notes by Xin Lu

* We assume that you know some fundamentals of Hilbert spaces:
  They are inner product spaces whose topology
  (1) is generated by the associated norm and
  (2) is complete, i.e., all Cauchy sequences have limits.

* Fundamental notion: Reproducing Kernel Hilbert Space (RKHS)

  - Definition: A 'RKHS' is a Hilbert space of functions on a domain X such that
    function evaluations are bounded (continuous) linear functionals, i.e.:
        |f(x)| <= C(x)*||f||
    [Pedantically: Define a linear functional L(f) := f(x) for a fixed but
     arbitrary location x in the domain X.  The linear functional is bounded
     (hence norm-continuous) iff there exists a constant C such that
     |L(f)| <= C*||f|| for all f in the Hilbert space of functions.]

  - Fact: L2 spaces cannot be RKHSs if there exist non-empty sets of measure zero.
    Why?  ...  Function evaluation is not even well-defined.
    (L2 spaces can be RKHSs, e.g., if the domain is discrete and
     all atomic events have non-zero probability.)

  - Fact: RKHS norm-convergence implies pointwise convergence:
        ||fn - f|| --> 0   ==>   fn(x) --> f(x) for all x
    Why?  Because |fn(x) - f(x)| <= C(x) * ||fn - f||.

  - Definition: Let H be a Hilbert space of functions on a domain X.
    A 'reproducing kernel of H' is a function k(x,y) on X x X that satisfies
      (1) k(x,.) is in H for all x in X
      (2) <f, k(x,.)> = f(x)
    . Property (2) is called the 'reproducing' property.
    . k(x,y) is necessarily symmetric because <.,.> is symmetric:
        k(y,x) = <k(y,.), k(x,.)> = <k(x,.), k(y,.)> = k(x,y)

  - A Hilbert space of functions H is a RKHS iff it has a reproducing kernel.
    '=>': If H is a RKHS, then by the Riesz Representation Theorem there exists
          for every x a function k(x,.) in H that 'represents' the linear
          functional L(f) = f(x):  L(f) = <f, k(x,.)> = f(x).
          So k(x,y) satisfies (1) and (2).
    '<=': If H is a Hilbert space of functions and there exists a reproducing
          kernel k(x,y) in it, then evaluation functionals are bounded:
          |f(x)| = |<f, k(x,.)>| <= ||f||*||k(x,.)|| by the C-S inequality.
          Hence ||k(x,.)|| < Inf is a bounding constant C(x) because k(x,.) is in H.

  - Definition: A 'kernel' k(x,y) is a function on X x X for which there exists a
    (not necessarily reproducing) Hilbert space F (called the 'feature space')
    and a 'feature map' Phi: X -> F such that:
        k(x,y) = <Phi(x), Phi(y)>_F
    . Simple example: Let f1(x),...,fK(x) be real-valued functions on X.
      Let F = Reals^K with the Euclidean inner product <.,.>.
      Define Phi(x) = (f1(x),...,fK(x)),  X --> F.
      Then k(x,y) = <Phi(x), Phi(y)> = sum_k fk(x)*fk(y) defines a kernel.
    . More complex example: Let L2(Z,mu) be an L2 space on a domain Z with
      measure mu(dz).  Let h(x,z) be such that h(x,.) is in L2(Z,mu) for all x in X.
      Then x --> h(x,.),  X --> L2(Z,mu),  defines a feature map,
      and the associated kernel is
        k(x,y) = Integral h(x,z)*h(y,z) mu(dz)
    . Fact: Every reproducing kernel is a kernel.
      Proof: Set Phi(x) = k(x,.) and F = H, so that k(x,y) = <k(x,.), k(y,.)>_H.
    . We may also require that Phi(x1),...,Phi(xn) are lin.indep.
      for any distinct x1,...,xn and arbitrary n.
      This would be satisfied, e.g., if X=Reals and Phi(x)=(1,x,x^2,x^3,...).

  - Definition: A 'positive definite function' k(x,y) is a function that is
    symmetric, k(x,y) = k(y,x), and for which all matrices
    K = (k(xi,xj))_{i,j=1,...n} are positive semi-definite
    for all n and all choices x1,...,xn:
        sum_ij ai aj k(xi,xj) >= 0   for all (a1,...,an).
    . Fact: Every kernel obtained from a feature map Phi(x) is a pos.def. function.
      Proof: sum_ij ai aj k(xi,xj) = sum_ij ai aj <Phi(xi), Phi(xj)>_F
                                   = < sum_i ai Phi(xi), sum_j aj Phi(xj) >_F
                                   = ( || sum_i ai Phi(xi) ||_F )^2  >= 0  always.

  - THEOREM (Moore-Aronszajn): For any positive definite function k(x,y)
    there exists a unique RKHS with reproducing kernel k(x,y).
    Fact: H = topological completion of span{k(x,.): x in X}
    'Proof': Define H_0 := span{k(x,.): x in X}
      For f,g in H_0 we have:
        f(.) = sum_i ai k(xi,.)
        g(.) = sum_j bj k(yj,.)
      where both {xi} and {yj} are finite sequences of elements in X.
      Def.:  <f,g> := sum_ij ai bj k(xi,yj)
      Fact:  <f,g> = sum_j bj f(yj) = sum_i ai g(xi)
      Issue: At this point we don't know whether this is a coherent definition.
             What if f1 = f2 has two different representations with different
             locations x1i and x2i and different coefficients a1i and a2i?
             Could it then be that <f1,g> =/= <f2,g> ?
             It is not possible.  Check this:
               (1) <f1,g> = sum_j bj f1(yj)   (See above Fact)
               (2) <f2,g> = sum_j bj f2(yj)   (    "     "   )
             If f1(x)=f2(x) for all x, then (1) = (2).  QED
      Fact: <k(x,.), k(y,.)> = k(x,y)                    (Set {xi}={x}, {yj}={y})
      Def.: ||f||^2 := <f,f> = sum_ij ai aj k(xi,xj)
      Fact: ||k(x,.)||^2 = k(x,x)
      Fact: ||f||^2 >= 0                                 (k(x,y) is pos.def.)
      Fact: |<f,g>| <= ||f||*||g||                       (C-S, from previous line)
      Fact: <f, k(x,.)> = f(x)                           (reproducing property)
            Proof: <f, k(x,.)> = sum_i ai <k(xi,.), k(x,.)> = sum_i ai k(x,xi) = f(x)
            (A HIGHER TRIVIALITY, isn't it?)
      Fact: |f(x)| <= ||k(x,.)||*||f||                   (from f(x) = <f, k(x,.)> and C-S)
            (thus evaluation functionals are bounded)
      Fact: ||f|| = 0    ==>  f(x)=0 for all x           (see previous Fact)
      Fact: ||fn|| --> 0 ==>  fn(x) --> 0 pointwise      (from boundedness of eval.)
      Completion of the space H_0:
        Let fn be a Cauchy sequence wrt the norm: ||fn-fm|| --> 0 as n,m-->Inf
        Then, for each x, fn(x) is a Cauchy sequence of real numbers
        (see previous Fact), hence has a limit:  fn(x) --> f(x) for some function f.
        Let H be the space obtained by including all limits of Cauchy sequences in H_0.
        It can be shown that the inner product can be uniquely extended to H:
          <f,g> := lim <fn,gn>
        More details are needed...

  - Thus we have come full circle:
    . RKHSs have reproducing kernels.
    . Reproducing kernels are kernels (are representable with feature maps Phi(x)).
    . Kernels are positive definite functions.
    . Every positive definite function defines a unique RKHS.

* The Freedom of Kernelizing:

  - The feature representation of kernels is a great help in constructing RKHSs.

  - Kernel algebra: positive semi-definite kernels!
    . k(x,y) = c > 0
    . k(x,y) = f(x)*f(y) for arbitrary f
    . k(x,y) = k1(x,y) + k2(x,y)
    . k(x,y) = k1(x,y) * k2(x,y)
    . k(x,y) = lim kn(x,y) pointwise
    are all kernels.  Only the product case requires a proof:
    Given two pos. semi-def. nxn matrices K1 and K2, let their eigen-decomps be
        K1 = sum_i lambda1i * u1i u1i^T
        K2 = sum_j lambda2j * u2j u2j^T
    We will use "." for the componentwise product of matrices and vectors:
        K1 . K2 = sum_ij lambda1i lambda2j * (u1i u1i^T) . (u2j u2j^T)
                = sum_ij lambda1i lambda2j * (u1i . u2j) (u1i . u2j)^T
    Because lambda1i, lambda2j >= 0 and each (u1i . u2j)(u1i . u2j)^T is a
    positive semi-def. matrix of rank <= 1, the sum is again positive semi-def.  QED
    [Matt says there is a shorter proof via covariance matrices of random vectors]

  - Derived kernel rules: For any kernel k(x,y),
    . k(x,y)^n for n>=0
    . exp(k(x,y))
    are also kernels.

  - Kernels on R^K:
    . k(x,y) = <x,y>                 Euclidean inner product kernel on R^K
    . <x,y>^2
    . exp(-||x-y||^2)                Standard Gaussian Kernel
    . exp(-||x-y||^2/(2*sigma^2))    Gaussian Kernel with bandwidth sigma

* From functional analysis to linear algebra:

  - The Kimeldorf-Wahba observation (early 1970s):
    Theorem: Let H_n := span{k(x1,.),...,k(xn,.)}.
    The orthogonal complement of H_n is  { g in H: g(x1)=...=g(xn)=0 }
    Proof: g _|_ H_n  <==>  <g, k(xi,.)> = 0 for i=1...n  <==>  g(xi) = 0 for i=1...n

  - Typical statistical regularization problems look like this:
    Given a loss function sum_i l(yi|f(xi)) and a RKHS penalty ||f||^2,
    solve the following problem:
    --------------------------------------------------------------
    |  argmin_{f in H} [ sum_i l(yi|f(xi)) + lambda*||f||^2 ]    |   (*)
    --------------------------------------------------------------

  - Theorem: A solution of (*) can always be found in H_n = span{k(x1,.),...,k(xn,.)}.
    Proof: Decompose a given f as follows:
             f = fn + g   where fn in H_n and g _|_ H_n.
           Then:  ||f||^2 = ||fn||^2 + ||g||^2
                  f(xi) = fn(xi) + g(xi) = fn(xi)
           Thus:  ||fn||^2 <= ||f||^2
                  sum_i l(yi|f(xi)) = sum_i l(yi|fn(xi))

  - Reduction of (*) to linear algebra/n-dim minimization:
    . Notations:  yvec := (y1,...,yn)^T            n-vector
                  fvec := (fn(x1),...,fn(xn))^T    n-vector
                  K    := (k(xi,xj))_{i,j=1...n}   nxn matrix
    . Represent fn(.) in terms of k(xi,.):
        fn(.) = sum_i bi * k(xi,.)
      More notation:  b := (b1,...,bn)   n-vector
      Using the above notations fvec and K we get:
      --------------
      | fvec = K b |
      --------------
    . Loss:  L(yvec|fvec) := sum_i l(yi|f(xi))
             ------------------------
    . Fact:  | ||fn||^2 = b^T K b  |
             ------------------------
      Proof: ||fn||^2 = <fn, fn> = sum_ij bi*bj*<k(xi,.), k(xj,.)>
                      = sum_ij bi*bj*k(xi,xj) = b^T K b
    . Reduced version of problem (*):
      ---------------------------------------------------
      |  argmin_b [ L(yvec|K b) + lambda * b^T K b ]    |   (**)
      ---------------------------------------------------
      Back in terms of fvec = K b:  Assume K^{-1} exists, so b = K^{-1} fvec.
      ------------------------------------------------------------------
      |  argmin_fvec [ L(yvec|fvec) + lambda * fvec^T K^{-1} fvec ]    |   (***)
      ------------------------------------------------------------------
    . Connection between penalty matrices and RKHS kernel:
      ------------------
      | Omega = K^{-1} |
      ------------------
    . Solve normal equations for LS:  L(yvec|fvec) = |yvec - fvec|^2
      -----------------------------------------
      | fvec = (I + lambda * Omega)^{-1} yvec |
      -----------------------------------------
      This is old hat, the well-known formula for the smoother matrix S:
        S = (I + lambda * Omega)^{-1}
    . Extrapolation to locations x other than the observed xi:
      --------------------------------
      | b = K^{-1} fvec = Omega fvec |
      |                              |
      | f(.) = sum_i bi * k(xi,.)    |
      --------------------------------
    . Direct solution for b:
        argmin_b [ |yvec - K b|^2 + lambda * b^T K b ]
      1/2 gradient wrt b:
        - K yvec + K^2 b + lambda * K b = 0                     (+)
      To cancel one of the common factors K, we must constrain to the
      orthogonal complement Null(K)^{perp} = Range(K).
      To this end let P_K be the orthogonal projection onto Range(K):
        - P_K yvec + K b + lambda b = 0
      --------------------------------------
      | b = (lambda * I + K)^{-1} P_K yvec |
      --------------------------------------
      The inverse is well-defined on Range(K).
      This particular 'b' is one solution.  However, if Null(K) =/= {0},
      then any element of Null(K) can be added to b and it is still a solution to (+).

  - Examples:

y <- boston[,"MEDV"]
x <- boston[,"LSTAT"]
N <- length(x)

    . Gaussian kernel smoothing on the real line

# Constructing a Gaussian kernel:
sigma <- sd(x)/1    # To be tuned...
K <- exp( -outer(x, x, FUN="-")^2 / (2*sigma^2) )
# Solving for the coefficients 'b':
lambda <- .001      # Also to be tuned...
b <- solve(lambda * diag(N) + K) %*% y
f <- K %*% b
# The result:
plot(x, y, pch=16, xlab="LSTAT", ylab="MEDV")
ord <- order(x)
lines(x[ord], f[ord], lwd=3, col="red")

      Note: Gaussian kernels require two tuning parameters,
      . sigma,  the width of the Gaussian kernel,
      . lambda, the strength of the penalty.
      [If you hear someone give a talk presenting a Gaussian kernel example,
       ask how they chose the kernel width AND the penalty strength!]
      Results are mildly satisfying, but what's the downward bend on the left?
      These choices of sigma and lambda took some experimenting.

---------------- End of class

    . Linear regression with a linear kernel:

# Constructing a linear kernel:  Try both kernels.
K <- 0 + outer(x, x)   # Linear regression through the origin
K <- 1 + outer(x, x)   # Linear regression with intercept
# Solving for the coefficients 'b':
lambda <- .01          # Play with lambda: Crank it up/down to 1E6 and 1E-6
b <- solve(lambda * diag(N) + K) %*% y
f <- K %*% b
# The result:
plot(x, y, pch=16, xlab="LSTAT", ylab="MEDV")
ord <- order(x)
lines(x[ord], f[ord], lwd=2, col="red")
# Why the difference between the two kernels?
# Does it matter what non-zero constant you add?
# How does the penalty strength matter?

    . Quadratic regression with a quadratic kernel:

# Constructing a quadratic kernel:
K <- (1 + outer(x, x))^2   # Quadratic regression with intercept
# Solving for the coefficients 'b':
lambda <- 10               # Play with lambda: Crank it up/down to 1E6 and 1E-6
b <- solve(lambda * diag(N) + K) %*% y
f <- K %*% b
# The result:
plot(x, y, pch=16, xlab="LSTAT", ylab="MEDV")
ord <- order(x)
lines(x[ord], f[ord], lwd=2, col="red")
# Why the difference between the two kernels?
# Does it matter what non-zero constant you add?
# How does the penalty strength matter?
# Not bad!  Compare with the LS fit:
lines(x[ord], fitted(lm(y[ord] ~ x[ord] + I(x[ord]^2))), lwd=2, col="green")

    . Irresponsible step: Assuming that Omega = K^{-1} exists.
      Can't K have zero eigenvalues?  Examine:
      An eigenvector u=(u1...uN)^T for eigenvalue 0 would mean  K u = 0.
      In components:  sum_j uj k(xi,xj) = 0  for i=1...n
      This means the function sum_j uj k(xj,.) is orthogonal to k(xi,.) for all i.
      But the function is also in the span of the functions k(xi,.),
      hence it is zero:  sum_j uj k(xj,.) = 0
      But this does not mean the coefficients uj are zero!
      Consider the linear kernel k(x,y)=x*y:
      All functions k(x,.) are linearly dependent.

* Sobolev RKHS:

  - Define squared norms and inner products directly:
      ||f||^2 := Integral_[0,1] (f^(m))^2 dx
      <f,g>   := Integral_[0,1] f^(m) * g^(m) dx
    where f^(m) is the m'th derivative of f.
    ||.|| is only a semi-norm, though, because there exist non-zero functions f
    with ||f||=0, namely, the polynomials up to degree m-1.

  - What is the connection with RKHS?
    . Consider the functions with m derivatives s.th. f^(m) in L2([0,1],dx)
      SATISFYING THE BOUNDARY CONDITIONS
        f(0) = f^(1)(0) = ... = f^(m-1)(0) = 0.
    . This is obviously an L2 space isomorphic to L2([0,1],dx).  Call it H_1.
    . Its evaluation functionals are bounded:
        f(x) = Integral_[0,x] (x-t)^{m-1} / (m-1)! * f^(m)(t) dt
      hence |f(x)| <= C(x) * ||f^(m)|| by C-S.
      [The first line stems from repeated partial integration that leads to
       the Taylor expansion with m'th order remainder term:
       while increasing derivatives of f, increase the anti-derivatives of (x-t).
       This choice will ensure that the boundary values of (x-t)^l*f^{(k)}(t)
       evaluated at t=0 and t=x vanish: at t=0 by the assumed boundary conditions,
       at t=x because (x-t)^l vanishes for t=x.]
    . It follows that H_1 is a RKHS.
  - Heuristic observation: The integral above can be written as
        f(x) = Integral_[0,1] (x-t)_{+}^{m-1} / (m-1)! * f^(m)(t) dt
    Note the truncated spline functions:
        G(x,t) := (x-t)_{+}^{m-1} / (m-1)!                      (*)
        f(x) = Integral_[0,1] G(x,t) * f^(m)(t) dt              (**)

  - What is the kernel?
    A:  k(x,y) = Integral_[0,1] G(x,t) G(y,t) dt                (***)
    To prove: (1) k(x,.) is in H_1
              (2) <f, k(x,.)> = f(x) for f in H_1
    (1): Is the m'th derivative of k(x,.) in L2([0,1],dt)?
         Take m derivatives of y-->k(x,y) by taking m derivatives of y-->G(y,t)
         inside (***), which ends up being the Dirac one-point mass at t=y.
         This is heuristic, but one can also just reverse the partial integrations
         to get here.  Hence the m'th derivative in the second argument is:
             k(x,.)^{(m)} = G(x,.)
         This is in L2([0,1],dx).
    (2): Reproducing property:
             <k(x,.), f> = Integral_[0,1] k(x,t)^{(m)} * f^(m)(t) dt
                         = Integral_[0,1] G(x,t) * f^(m)(t) dt
                         = f(x)    by (**) above.

  - But how does this add up to a full space of fits w/o boundary conditions?
    . Expand the space H_1 by adding polynomials up to degree m-1.
      Call their space H_0.  They get zero penalty.
      So we have a mix of space-restricted LS and penalized LS:
        f = f0 + f1
      where f0 is a polynomial of degree m-1 and f1 is in the RKHS.
    . For computations you need to re-derive the normal equations for the
      mixed case that includes a space-restricted LS-component f0 and a
      penalized component f1.

* KERNEL PCA:

  - Kernel PCA is just an eigendecomposition, with the inner-product matrix of a
    multivariate dataset replaced by a kernel matrix thereof.
  - Classical PCA is obtained from the kernel k(x,y) = x^T y
    where x and y are p-vectors of variables.
    (A small sanity check of this claim follows the quadratic-kernel example below.)
  - You may obtain 'quadratic PCA' by using a kernel of the form
    k(x,y) = (c + x^T y)^2, as in the quadratic-kernel example below.

#----------------
# Infrastructure:
norm <- function(x) sqrt(sum(x^2))                   # Euclidean norm
normalize <- function(x) x/sqrt(sum(x^2))            # For mapping points to unit sphere
to.01 <- function(x) (x-min(x))/(max(x)-min(x))      # For mapping coords to rgb colors
dist.mat <- function(X) as.matrix(dist(X))           # Convert the distance structure to a matrix
kNN.crit <- function(dists.base,dists.rel,k=10) {    # Compare kNN neighborhoods of two dist matrices
  qtile <- k/ncol(dists.base)
  kNN.base <- apply(dists.base, 2, function(dcol) as.numeric(dcol <= quantile(dcol, qtile)))
  kNN.rel  <- apply(dists.rel,  2, function(dcol) as.numeric(dcol <= quantile(dcol, qtile)))
  apply(pmin(kNN.base, kNN.rel), 2, sum) / apply(kNN.base, 2, sum)
}
#----------------
#
# Two artificial data examples:
# * Circular data, equispaced, but 1-frac removed:
N <- 1000;  frac <- .5
tau <- sample(seq(0,2*pi, length=N+1)[-1][1:(N*frac)])   # Scrambled order
X <- cbind(cos(tau), sin(tau))
clrs <- rgb(red=to.01(X[,1]), green=to.01(X[,2]), blue=0)   # Colors
dim(X)
# Spacing, relevant for sigma:
2*pi/N
# * Spherical data, random, but 1-frac cap removed on last coordinate:
N <- 1000;  p <- 3;  frac <- 1
tmp <- t(apply(matrix(rnorm(N*p), ncol=p), 1, normalize))
X <- tmp[tmp[,p]<=quantile(tmp[,p], frac),]
clrs <- rgb(red=to.01(X[,p]), green=0, blue=0)   # Colors
dim(X)
# Plot original data:
windows()
lims <- range(X);  par(mfrow=c(1,1))
plot(as.data.frame(X), pch=16, cex=.8, col=clrs, xlim=lims, ylim=lims)
#----------------
# Kernels:
# * QUADRATIC KERNEL:
K <- (1000+tcrossprod(X))^2
# Then execute the code below headed "EIGENANALYSIS..." and "PLOTS..."
# Lessons:
# - The 1st eigenvector is a constant accounting for location.
# - The 2nd and 3rd eigenvectors reproduce the circle quite well.
# - Correspondingly there is one large eigenvalue for the constant
#   and two identical eigenvalues for the sine and cosine motion.
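# A small sanity check of the claim above that classical PCA corresponds to the
# linear kernel k(x,y) = x^T y (a sketch on the current data matrix X; signs and,
# in case of tied eigenvalues, rotations of individual vectors may differ):
Xc <- scale(X, center=TRUE, scale=FALSE)   # center the variables
K.lin <- tcrossprod(Xc)                    # N x N linear kernel matrix
K.lin.eig <- eigen(K.lin)
pc.scores <- prcomp(X, center=TRUE, scale.=FALSE)$x
round(cor(K.lin.eig$vectors[,1:2], pc.scores[,1:2]), 3)
# The leading eigenvectors of the linear kernel matrix should span the same
# directions as the classical PC scores (correlations near +-1 on the 'diagonal').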
# * GAUSSIAN KERNEL: dists <- dist.mat(X) # these are raw, NOT squared distances # What is the max of the m*N smallest distances? Relevant for kernel width 'sig': m <- 3; quantile(x=dists[col(dists) < row(dists)], prob=(m*N)/choose(N,2)) # Kernel width, must be chosen every time: sig <- 10 K <- exp(-dists^2/sig^2) # round(K[,1:6], 3) # checking bandedness, e.g. # Lessons? #---------------- # EIGENANALYSIS of kernel: K.eig <- eigen(K); K.eig$vectors <- t(t(K.eig$vectors)*sqrt(pmax(0,K.eig$values))) colnames(K.eig$vectors) <- paste("Evec",1:ncol(K.eig$vectors)) # PLOTS of a few eigen coordinates: rk <- 1:4 # embedding dimensions (ranks) shown lims <- range(K.eig$vectors[,rk]) lims <- range(K.eig$vectors[,rk[-1]]) pairs(K.eig$vectors[,rk], xlim=lims, ylim=lims, gap=0, oma=rep(2,4), col=clrs, pch=16, cex=.5) #pch=".") # EIGENVALUES, up to 20 largest: eval.ord <- 1:min(N,20) par(mfrow=c(3,1), mgp=c(1.8,.5,0), mar=c(3,3,1,1)) plot(x=eval.ord, y=K.eig$values[eval.ord], type="b", pch=16) # raw scale plot(x=eval.ord, y=sqrt(K.eig$values[eval.ord]), type="b", pch=16) # root scale plot(x=eval.ord, y=log(K.eig$values[eval.ord]), type="b", pch=16) # log scale round(K.eig$values[eval.ord], 8) # META-CRITERION of kNN overlap: first is base, second is relative to base rk <- 1:2 # dimensions used in kernel embedding mean(kNN.crit(dist.mat(X), dist.mat(K.eig$vectors[,rk]), k=10)) # recall mean(kNN.crit(dist.mat(K.eig$vectors[,rk]), dist.mat(X), k=10)) # precision # PLOTS of eigen coordinates against curve parameter tau for curve data such as circle: par(mfrow=c(2,2), mgp=c(1.8,.5,0), mar=c(3,3,1,1)) for(j in 1:4) plot(tau, K.eig$vectors[,j], pch=16, type="p", ylim=lims, ylab=paste("Eigenvector",j)) # PLOTS of eigen coordinates against original coordinates: par(mfrow=c(2,2), mgp=c(1.8,.5,0), mar=c(3,3,1,1)) plot(X[,1], K.eig$vectors[,1], pch=16, type="p") plot(X[,2], K.eig$vectors[,1], pch=16, type="p") plot(X[,1], K.eig$vectors[,2], pch=16, type="p") plot(X[,2], K.eig$vectors[,2], pch=16, type="p") ----------------------------------------------------------------
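#----------------
# A remark on centering: the eigenanalyses above were run on the raw kernel
# matrix, so one eigenvector gets 'used up' for location (the constant found
# with the quadratic kernel).  Standard kernel PCA instead double-centers the
# kernel matrix, which amounts to centering the implicit feature vectors
# Phi(x_i).  A minimal sketch of that double-centering:
center.kernel <- function(K) {
  N <- nrow(K)
  J <- matrix(1/N, N, N)                   # averaging operator
  K - J %*% K - K %*% J + J %*% K %*% J    # (I - J) K (I - J)
}
K.c <- center.kernel(K)                    # then rerun "EIGENANALYSIS..." on K.c
round(eigen(K.c)$values[1:6], 6)           # the constant direction is now gone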