############################################
# 2014-04-19 PCA analysis of temperature
#            stations
############################################


###############################################
# SUPPORT FUNCTIONS ARE DEFINED HERE
###############################################
# 

#####################################################
# MAIN PART
#####################################################

rm(list=ls())
dev.off()
par(mfrow=c(1,1))
print("Principal Component Analsysis")
print("=============================")
# read the data file with January 24 (20 June) stations
# that have comple
varname<-'tavg'
selmon<-'Jan'
startyr<-1950
endyr<-2012


##################################################
# FOR HOMEWORK 8: Control here if noise
#                 is added to the time series
#                 and the standard deviation
#                 of the random (gaussian) error
##################################################

addnoise<-FALSE
# random temperature anomalies standard deviation
# [deg C]
stddevnoise<-1.0




# get the data matrix
# NOTE: first column contains an index,
# second column the time
# first station data is at column 3

filename<-paste("data/NY/class21a_",varname,'_',selmon,'_',startyr,'-',endyr,'.csv',sep='')
# NOTE: READING FROM A CSV TABLE DOE NOT CREATE A 
#       MATRIX AUTOMATICALLY (it is a data.frame object)
# Here we 'force' a conversion to a matrix
buffer<-data.matrix(read.csv(filename))
time<-buffer[,2]
isize<-dim(buffer)
n<-isize[1]

#################################################
# For Homework 8: Manually control the 
#                 number of stations
#                 that are used in the PCA
#################################################
m<-2
# uncomment this line to use all available stations
#m<-isize[2]-2


X<-matrix(NA,n,m)
for (i in 1:m){
  X[,i]<-buffer[,i+2]
  # Add some random noise to the data
  if (addnoise) {
    X[,i]<-X[,i]+rnorm(n,sd=stddevnoise)
  }
}




print("Size of data matrix X: ")
print(dim(X))


# calculate COVARIANCE MATRIX
# note that rows in matrix sst represent grid locations
# and columns are the samples over the years
# that's why we use 

wait<-readline("press enter for the Covariance Matrix analysis")

C<-1/n*t(X)%*%X
# the variance of all the stations is stored in the diagonal entires
variances<-diag(C)
totalvar<-sum(variances)
print(paste("sum of the variances from all stations: ",totalvar,sep=''))

# here we plot the variances of the stations sorted in ascending order
index<-seq(1,m,1)
sortedvar<-sort(variances,decreasing=TRUE)
# NOTE: sort() returns the vector entries sorted in descending (or ascending)
# order



###########################################
# here we do the PCA 'by hand'
# instead of usign the function princomp()
############################################
# (1) first get the eigenvectors

wait<-readline("press enter for calculating the eigenvectors and eigenvalues of C")
# 
e<-eigen(C)

print("Dimension of the eigenvetor matrix")
print(dim(e$vectors))

# eigenvectors indicate how much of the total variance
# of the station data set projects onto the eigenvectors
# the more covariance among the stations the more
# will project onto the first few leading eigenvectors
# We can plot the 'eigenvalue spectrum' 

print(paste("sum of the eigenvalues: ",sum(e$values)))
# maximize the y-axis range to fit the largest eigenvalue 
yrange<-c(0,max(e$values[1],sortedvar[1]))

plot(c(1,m),yrange,typ='n',
     xlab="station rank (sorted)",ylab="variance (deg C)^2")
lines(sortedvar,typ='h',col=1,lwd=5)
lines(e$values,typ='h',col=3,lwd=3)
legend(1.2,yrange[2],c("stations","eigenvalues"),pch=c(19,19),col=c(1,3))


wait<-readline("press enter to visualize the eigenvectors:")

# NOTE: the dimension of our vector space is given by
# the numbers of stations. So it is a high-dimensional
# space, but we can plot the coordinates of the eigenvectors:
# the x-axis position is used to indicate the 
# station (i.e. each station 'represents one dimension)
# eignevectors are stored as column vectors 
plot(c(1,m),c(-1,1),typ='n',xlab='station',ylab='weight')
lines(index,e$vector[,1],col=1,lwd=3,typ='h')
lines(index,e$vector[,2],col=2,lwd=1,typ='h')
legend(1.2,1,c("eigenvector #1","eigenvector #2"),pch=c(19,19),col=c(1,2))

wait<-readline("press enter to project station data onto the eigenvectors (create PC time series)")

# Principal Component time series
# associated with eigenvectors
# are obtained by using the entries of the eigenvectors
# to form a weighted sum of the stations 
# temperature anomaly time series

# here the first two PCs are calculated
A<-X%*%e$vector[,1:2]
plot(time,A[,1],col=1,typ='l',lwd=2,xlab='time',ylab='PC [deg C]')
lines(time,A[,2],typ='l',col=2,lwd=2)
print(paste("variance of PC 1: ",var(A[,1])))
print(paste("variance of PC 2: ",var(A[,2])))

wait<-readline("Press enter to compare the PC time with station average")
# compare this with the station average time series
# NOTE: I you want to use a powerful R-function
# you can avoid loops with xmean<-apply(X,1,mean)
# see help(apply) for more information
xmean<-matrix(NA,n,1)
for (i in 1:n) {
  xmean[i]<-mean(X[i,])
}
# compare standardized time series
plot(time,A[,1]/sqrt(e$value[1]),col=1,lwd=2,typ='l',
      ,xlab='time',ylab='standardized anomalies')
lines(time,xmean/sd(xmean),col='green',lwd=1)

print(paste("correlation between PC #1",
            "and station mean time series: ",
            round(cor(A[,1],xmean),6)))
print(paste("correlation between PC #2",
            "and station mean time series: ",
            round(cor(A[,2],xmean),6)))