# 2014-03-31 # Introduction of matrix operations # calculation of the covariance matrix # clear memory rm(list=ls()) # reset the plot window to full sized plot par(mfrow=c(1,1)) source("scripts/loadano2.R") source("scripts/myfunctions.R") # note this logical variable will allow us # to calculate the correlation matrix # by first normalizing the station data # to be centered and normalized to standard # deviation 1 # # Learning goal: find a vector that marks # the direction where the scatter cloud # is widest in its range. # (2) understand that in 2-dim. case # the orthogonal vector then points # into direction of minimum spread # (3) see that this direction is not # the same as the vector associated with # the slope of the regression line # (3) relate this to the covariance # matrix and it's eigenvectors and eigenvalues # (4) use the vector projection iscor<-FALSE varname<-"tavg" month<-"Jun" start<-1950 end<-2012 # sample size, used to define matrix n<-end-start+1 stationlist<-c("USW00014771","USW00094728") # variables (stations), used to define matrix m<-length(stationlist) # define a n x m matrix filled with NA entries X<-matrix(NA,n,m) # here I form a loop to repeat the loading of the # data for the stations given in stationlist for ( i in 1:m) { buffer<-loadano2(station=stationlist[i],state="NY",month=month,start=start,end=end) mhelp<-mean(buffer$ano) # now I assign the centered data vector # into the column i of the matrix if (iscor) { # standardize to get the correlation coefficients # NOTE: standard deviation function is using 1/(n-1) # factor, but below we work with 1/n # That's why sqrt(n-1/n) shelp<-sd(buffer$ano)*sqrt((n-1)/n) X[,i]<-(buffer$ano-mhelp)/shelp } else { # default is the covariance will be calculated # from centered data X[,i]<-(buffer$ano-mhelp) } } print("size of the data matrix X: ") print(dim(X)) # the function t() is forming the transpose # of matrix X print("size of the transpose of data matrix X: ") print(dim(t(X))) # the matrix multiplication # %*% is the operator for matrix multiplication C<-t(X)%*%X # multiply the matrix with factor 1/n # to get the covariance (correlation matrix) C<-1/n*C if (iscor) { print("size of the correlation matrix: ") print(dim(C)) print ("Correlation Matrix") print(C) } else { print("size of the Covariance matrix: ") print(dim(C)) print ("Covariance Matrix") print(C) } # The variance for the four variables (stations) # are stored on the diagonal (top left down to bottom right) vsum<-0 for ( i in 1:m){ print(paste("Variance for ",stationlist[i]," : ",round(C[i,i],3),sep='')) vsum<-vsum+C[i,i] } print("---------------------------------------") print(paste("sum of variances : ",round(vsum,3)),sep='') # create an empty plot xrange<-c(-5,5) yrange<-xrange myplot(xrange=xrange,yrange=yrange,frame=FALSE) # put the scatter points into the plot for (i in 1:n){ points(X[i,1],X[i,2],pch=1) } bfit<-cor(X[,1],X[,2])*(sd(X[,2])/sd(X[,1])) afit<-mean(X[,2])-bfit*mean(X[,1]) yfit<-afit+bfit*X[,1] lines(X[,1],yfit,col=5) res<-readline() plot_vector(c(1,bfit),col=4,yoff=afit,scale=2,lwd=3) # finding the eigenvector # using PCA analysis princomp res<-princomp(X) print(summary(res)) print(names(res)) print("Eigenvectors are stored in res$loadings") print("sqrt(Eigenvalues) are stored in res$sdev") lambda<-res$sdev^2 e1<-res$loadings[,1] e2<-res$loadings[,2] plot_vector(e1,col=3,lwd=3) plot_vector(e2,col=3,lwd=3) pcafit1<-e1[2]/e1[1] lines(X[,1],pcafit1*X[,1],col=3,lty=2) pcafit2<-e2[2]/e2[1] lines(X[,1],pcafit2*X[,1],col=3,lty=2) # eigenvectors form a new set of basis vectors # # need projection of a vector x onto eigenvector # e1, e2 # Principal Component Anaysis preserves # the variance in the data # the squared eigenvalues are the variances # of the sample cloud projected onto the # eigenvectors varcheck<-0 for ( i in 1:m) { varcheck<-varcheck+res$sdev[i]^2 } print(paste("Sum of the squared eigenvalues :", round(varcheck,3)))