# 2014-03-31
# Introduction of matrix operations
# calculation of the covariance matrix
# clear memory
rm(list=ls())
# reset the plot window to full sized plot
par(mfrow=c(1,1))
source("scripts/loadano2.R")
source("scripts/myfunctions.R")
# note this logical variable will allow us 
# to calculate the correlation matrix
# by first normalizing the station data
# to be centered and normalized to standard 
# deviation 1
#
# Learning goal: find a vector that marks
# the direction where the scatter cloud
# is widest in its range.
# (2) understand that in 2-dim. case
#     the orthogonal vector then points
#     into direction of minimum spread
# (3) see that this direction is not
#     the same as the vector associated with
#     the slope of the regression line
# (3) relate this to the covariance 
#     matrix and it's eigenvectors and eigenvalues
# (4) use the vector projection 



iscor<-FALSE

varname<-"tavg"
month<-"Jun"


start<-1950
end<-2012
# sample size, used to define matrix
n<-end-start+1
stationlist<-c("USW00014771","USW00094728")
# variables (stations), used to define matrix
m<-length(stationlist)

# define a n x m matrix filled with NA entries
X<-matrix(NA,n,m)

# here I form a loop to repeat the loading of the 
# data for the stations given in stationlist

for ( i in 1:m) {
  buffer<-loadano2(station=stationlist[i],state="NY",month=month,start=start,end=end)
  mhelp<-mean(buffer$ano)
  # now I assign the centered data vector
  # into the column i of the matrix
  if (iscor) {
    # standardize to get the correlation coefficients
    # NOTE: standard deviation function is using 1/(n-1)
    #       factor, but below we work with 1/n 
    #       That's why sqrt(n-1/n)
    shelp<-sd(buffer$ano)*sqrt((n-1)/n)
    X[,i]<-(buffer$ano-mhelp)/shelp
  } else {
    # default is the covariance will be calculated
    # from centered data
    X[,i]<-(buffer$ano-mhelp)
  }
}

print("size of the data matrix X: ")
print(dim(X))

# the function t() is forming the transpose
# of matrix X
print("size of the transpose of data matrix X: ")
print(dim(t(X)))

# the matrix multiplication
# %*% is the operator for matrix multiplication
C<-t(X)%*%X
# multiply the matrix with factor 1/n
# to get the covariance (correlation matrix)
C<-1/n*C

if (iscor) { 
  print("size of the correlation matrix: ")
  print(dim(C))
  print ("Correlation Matrix")
  print(C)
} else {
  print("size of the Covariance matrix: ")
  print(dim(C))
  print ("Covariance Matrix")
  print(C)
}

# The variance for the four variables (stations)
# are stored on the diagonal (top left down to bottom right)
vsum<-0
for ( i in 1:m){
  print(paste("Variance for ",stationlist[i]," : ",round(C[i,i],3),sep=''))
  vsum<-vsum+C[i,i]
}
print("---------------------------------------")
print(paste("sum of variances         : ",round(vsum,3)),sep='')

# create an empty plot
xrange<-c(-5,5)
yrange<-xrange
myplot(xrange=xrange,yrange=yrange,frame=FALSE)

# put the scatter points into the plot

for (i in 1:n){
  points(X[i,1],X[i,2],pch=1)
}


bfit<-cor(X[,1],X[,2])*(sd(X[,2])/sd(X[,1]))
afit<-mean(X[,2])-bfit*mean(X[,1])

yfit<-afit+bfit*X[,1]
lines(X[,1],yfit,col=5)

res<-readline()

plot_vector(c(1,bfit),col=4,yoff=afit,scale=2,lwd=3)


# finding the eigenvector
# using PCA analysis princomp

res<-princomp(X)
print(summary(res))
print(names(res))

print("Eigenvectors are stored in res$loadings")
print("sqrt(Eigenvalues) are stored in res$sdev")
lambda<-res$sdev^2
e1<-res$loadings[,1]
e2<-res$loadings[,2]

plot_vector(e1,col=3,lwd=3)
plot_vector(e2,col=3,lwd=3)

pcafit1<-e1[2]/e1[1]
lines(X[,1],pcafit1*X[,1],col=3,lty=2)
pcafit2<-e2[2]/e2[1]
lines(X[,1],pcafit2*X[,1],col=3,lty=2)
# eigenvectors form a new set of basis vectors 
# 
# need projection of a vector x onto eigenvector
# e1, e2

# Principal Component Anaysis preserves 
# the variance in the data 
# the squared eigenvalues are the variances
# of the sample cloud projected onto the
# eigenvectors

varcheck<-0
for ( i in 1:m) {
  varcheck<-varcheck+res$sdev[i]^2
}
print(paste("Sum of the squared eigenvalues :", round(varcheck,3)))