# 2014-03-31
# Introduction of matrix operations
# calculation of the covariance matrix
# clear memory
rm(list=ls())
# reset the plot window to full sized plot
par(mfrow=c(1,1))
source("scripts/loadano2.R")
source("scripts/myfunctions.R")

# note this logical variable will allow us 
# to calculate the correlation matrix
iscor<-FALSE

###############################################
# Select here different months Jan - Dec
###############################################
month<-"Jan"
varname<-"tavg"
start<-1950
end<-2012
# sample size, used to define matrix
n<-end-start+1
stationlist<-c("USW00014771","USW00094728")

# variables (stations), used to define matrix
m<-length(stationlist)

# define a n x m matrix filled with NA entries
X<-matrix(NA,n,m)
#####################################################
# here I form a loop to repeat the loading of the 
# data for the stations given in stationlist
#####################################################
for ( i in 1:m) {
  buffer<-loadano2(station=stationlist[i],state="NY",month=month,start=start,end=end)
  mhelp<-mean(buffer$ano)
  # now I assign the centered data vector
  # into the column i of the matrix
  if (iscor) {
    # standardize to get the correlation coefficients
    # NOTE: standard deviation function is using 1/(n-1)
    #       factor, but below we work with 1/n 
    #       That's why sqrt(n-1/n)
    shelp<-sd(buffer$ano)*sqrt((n-1)/n)
    X[,i]<-(buffer$ano-mhelp)/shelp
  } else {
    # default is the covariance will be calculated
    # from centered data
    X[,i]<-(buffer$ano-mhelp)
  }
}
# the function t() is forming the transpose of a matrix
print("size of the data matrix X: ")
print(dim(X))
print("size of the transpose of data matrix X: ")
print(dim(t(X)))

# the matrix multiplication
# %*% is the operator for matrix multiplication
C<-t(X)%*%X
# multiply the matrix with factor 1/n
# to get the covariance (correlation matrix)
C<-1/n*C

if (iscor) { 
  print("size of the correlation matrix: ")
  print(dim(C))
  print ("Correlation Matrix")
  print(C)
} else {
  print("size of the Covariance matrix: ")
  print(dim(C))
  print ("Covariance Matrix")
  print(C)
}

# The variance for the four variables (stations)
# are stored on the diagonal 
# (top left down to bottom right)
vsum<-0
for ( i in 1:m){
  print(paste("Variance for ",stationlist[i]," : ",round(C[i,i],3),sep=''))
  vsum<-vsum+C[i,i]
}
print("---------------------------------------")
print(paste("sum of variances         : ",round(vsum,3)),sep='')


wait<-readline("press enter to create a scatter-plot ")

###################################################
# create an empty plot with myplot()
# function from myfunctions.R (new 2014-04-10)
###################################################
xrange<-c(-5,5)
yrange<-xrange
myplot(xrange=xrange,yrange=yrange,frame=FALSE)
points(X[,1],X[,2],pch=1)




wait<-readline("press enter to put show the fitted regression line")

bfit<-cor(X[,1],X[,2])*(sd(X[,2])/sd(X[,1]))
afit<-mean(X[,2])-bfit*mean(X[,1])

yfit<-afit+bfit*X[,1]
lines(X[,1],yfit,col=5)

########################################################
# use plot_vector() from myfunctions.R to draw a vector
# first parameter is a vector with two entries
# for the x and y coordinate
# Vectors usually start from the point x=0,y=0
# You can use xoff and yoff to change that start point
# col, lwd determine the color and thinckness,
# scale scales the arrow head size
########################################################

plot_vector(c(1,bfit),col=4,yoff=afit,scale=2,lwd=3)


wait<-readline("press enter to show the eigenvector directions ")

########################################################
# finding the eigenvector
# using PCA analysis function princomp()
########################################################

res<-princomp(X)
print(summary(res))
print(names(res))

print("Eigenvectors are stored in res$loadings")
print("sqrt(Eigenvalues) are stored in res$sdev")
lambda<-res$sdev^2
e1<-res$loadings[,1]
e2<-res$loadings[,2]

plot_vector(e1,col=3,lwd=3)
plot_vector(e2,col=3,lwd=3)

pcafit1<-e1[2]/e1[1]
lines(X[,1],pcafit1*X[,1],col=3,lty=2)
pcafit2<-e2[2]/e2[1]
lines(X[,1],pcafit2*X[,1],col=3,lty=2)

dev.copy2pdf(file="class19b_scatterplot_org.pdf")
################################################
# Eigenvectors form a new set of basis vectors 
#
# Principal Component Anaysis preserves 
# the variance in the data 
# the squared eigenvalues are the variances
# of the sample cloud projected onto the
# eigenvectors
################################################ 

varcheck<-0
for ( i in 1:m) {
  varcheck<-varcheck+res$sdev[i]^2
}
print(paste("Sum of the squared eigenvalues :", round(varcheck,3)))
####################################################
# use the vector projection to 
# find the new coordinates 
# of our 'temperature vector' with respect to the basis
# vectors formed by the the eigenvectors
####################################################
# loop over years
# copy rename the matrix of eigenvectors (in columns) 
# into E
E<-res$loadings
A<-X%*%E

print("size of matrix A :")
print(dim(A))

# calculate the new covariance matrix
Caa<-1/n*t(A)%*%A
Caa<-round(Caa,3)
print("covariance matrix of our data expressed in the new coordinate system")
print(Caa)


wait<-readline("press enter for plotting the data in eigenvector coordinates")

myplot(xrange,yrange,xlab="e1",ylab="e2")
points(A)
plot_vector(c(1,0),col=3,lwd=3)
plot_vector(c(0,1),col=3,lwd=3)
dev.copy2pdf(file="class19b_scatter_in_eigenvector_basis.pdf")

#####################################
# Note that A stores the same
# information as X:
# n samples (here years of observations)
# of two now weighted averages of the 
# input temperatures
#####################################

# here we plot the original times series
# toegether with the new times
# (the principal component time series)

wait<-readline("press enter for plotting the data as time series")

buffer$time
plot(c(1950,2012),c(-8,8),typ='n',xlab="year",ylab="anomalies [deg D]")
lines(buffer$time,X[,1],col="green",lwd=2,typ='l')
lines(buffer$time,X[,2],col="blue",lwd=2,typ='l')
irev<-sign(cor(A[,1],X[,1]))
lines(buffer$time,irev*A[,1],col="black",lwd=2,typ='l')
irev<-sign(cor(A[,1],X[,2]))
lines(buffer$time,irev*A[,2],col="gray",lwd=2,typ='l')


text(1950,8,stationlist[1],adj=0,col="green")
text(1980,8,stationlist[2],adj=0,col="blue")
text(1950,-8,"'PC #1'",adj=0,col="black")
text(1980,-8,"'PC #2'",adj=0,col="gray")

print("NOTE: 'Principal Component (PC)' time series are equivalent to")
print("       the returned values from princomp named 'scores' :i.e. res$scores")
dev.copy2pdf(file="class19b_timeseries.pdf")


print("class19b.R finished successfully")
print("check for files names 'class19b*.pdf in your working directory for PDF figure output")