# 2014-03-31 # Introduction of matrix operations # calculation of the covariance matrix # clear memory rm(list=ls()) # reset the plot window to full sized plot par(mfrow=c(1,1)) source("scripts/loadano2.R") source("scripts/myfunctions.R") # note this logical variable will allow us # to calculate the correlation matrix iscor<-FALSE ############################################### # Select here different months Jan - Dec ############################################### month<-"Jan" varname<-"tavg" start<-1950 end<-2012 # sample size, used to define matrix n<-end-start+1 stationlist<-c("USW00014771","USW00094728") # variables (stations), used to define matrix m<-length(stationlist) # define a n x m matrix filled with NA entries X<-matrix(NA,n,m) ##################################################### # here I form a loop to repeat the loading of the # data for the stations given in stationlist ##################################################### for ( i in 1:m) { buffer<-loadano2(station=stationlist[i],state="NY",month=month,start=start,end=end) mhelp<-mean(buffer$ano) # now I assign the centered data vector # into the column i of the matrix if (iscor) { # standardize to get the correlation coefficients # NOTE: standard deviation function is using 1/(n-1) # factor, but below we work with 1/n # That's why sqrt(n-1/n) shelp<-sd(buffer$ano)*sqrt((n-1)/n) X[,i]<-(buffer$ano-mhelp)/shelp } else { # default is the covariance will be calculated # from centered data X[,i]<-(buffer$ano-mhelp) } } # the function t() is forming the transpose of a matrix print("size of the data matrix X: ") print(dim(X)) print("size of the transpose of data matrix X: ") print(dim(t(X))) # the matrix multiplication # %*% is the operator for matrix multiplication C<-t(X)%*%X # multiply the matrix with factor 1/n # to get the covariance (correlation matrix) C<-1/n*C if (iscor) { print("size of the correlation matrix: ") print(dim(C)) print ("Correlation Matrix") print(C) } else { print("size of the Covariance matrix: ") print(dim(C)) print ("Covariance Matrix") print(C) } # The variance for the four variables (stations) # are stored on the diagonal # (top left down to bottom right) vsum<-0 for ( i in 1:m){ print(paste("Variance for ",stationlist[i]," : ",round(C[i,i],3),sep='')) vsum<-vsum+C[i,i] } print("---------------------------------------") print(paste("sum of variances : ",round(vsum,3)),sep='') wait<-readline("press enter to create a scatter-plot ") ################################################### # create an empty plot with myplot() # function from myfunctions.R (new 2014-04-10) ################################################### xrange<-c(-5,5) yrange<-xrange myplot(xrange=xrange,yrange=yrange,frame=FALSE) points(X[,1],X[,2],pch=1) wait<-readline("press enter to put show the fitted regression line") bfit<-cor(X[,1],X[,2])*(sd(X[,2])/sd(X[,1])) afit<-mean(X[,2])-bfit*mean(X[,1]) yfit<-afit+bfit*X[,1] lines(X[,1],yfit,col=5) ######################################################## # use plot_vector() from myfunctions.R to draw a vector # first parameter is a vector with two entries # for the x and y coordinate # Vectors usually start from the point x=0,y=0 # You can use xoff and yoff to change that start point # col, lwd determine the color and thinckness, # scale scales the arrow head size ######################################################## plot_vector(c(1,bfit),col=4,yoff=afit,scale=2,lwd=3) wait<-readline("press enter to show the eigenvector directions ") ######################################################## # finding the eigenvector # using PCA analysis function princomp() ######################################################## res<-princomp(X) print(summary(res)) print(names(res)) print("Eigenvectors are stored in res$loadings") print("sqrt(Eigenvalues) are stored in res$sdev") lambda<-res$sdev^2 e1<-res$loadings[,1] e2<-res$loadings[,2] plot_vector(e1,col=3,lwd=3) plot_vector(e2,col=3,lwd=3) pcafit1<-e1[2]/e1[1] lines(X[,1],pcafit1*X[,1],col=3,lty=2) pcafit2<-e2[2]/e2[1] lines(X[,1],pcafit2*X[,1],col=3,lty=2) dev.copy2pdf(file="class19b_scatterplot_org.pdf") ################################################ # Eigenvectors form a new set of basis vectors # # Principal Component Anaysis preserves # the variance in the data # the squared eigenvalues are the variances # of the sample cloud projected onto the # eigenvectors ################################################ varcheck<-0 for ( i in 1:m) { varcheck<-varcheck+res$sdev[i]^2 } print(paste("Sum of the squared eigenvalues :", round(varcheck,3))) #################################################### # use the vector projection to # find the new coordinates # of our 'temperature vector' with respect to the basis # vectors formed by the the eigenvectors #################################################### # loop over years # copy rename the matrix of eigenvectors (in columns) # into E E<-res$loadings A<-X%*%E print("size of matrix A :") print(dim(A)) # calculate the new covariance matrix Caa<-1/n*t(A)%*%A Caa<-round(Caa,3) print("covariance matrix of our data expressed in the new coordinate system") print(Caa) wait<-readline("press enter for plotting the data in eigenvector coordinates") myplot(xrange,yrange,xlab="e1",ylab="e2") points(A) plot_vector(c(1,0),col=3,lwd=3) plot_vector(c(0,1),col=3,lwd=3) dev.copy2pdf(file="class19b_scatter_in_eigenvector_basis.pdf") ##################################### # Note that A stores the same # information as X: # n samples (here years of observations) # of two now weighted averages of the # input temperatures ##################################### # here we plot the original times series # toegether with the new times # (the principal component time series) wait<-readline("press enter for plotting the data as time series") buffer$time plot(c(1950,2012),c(-8,8),typ='n',xlab="year",ylab="anomalies [deg D]") lines(buffer$time,X[,1],col="green",lwd=2,typ='l') lines(buffer$time,X[,2],col="blue",lwd=2,typ='l') irev<-sign(cor(A[,1],X[,1])) lines(buffer$time,irev*A[,1],col="black",lwd=2,typ='l') irev<-sign(cor(A[,1],X[,2])) lines(buffer$time,irev*A[,2],col="gray",lwd=2,typ='l') text(1950,8,stationlist[1],adj=0,col="green") text(1980,8,stationlist[2],adj=0,col="blue") text(1950,-8,"'PC #1'",adj=0,col="black") text(1980,-8,"'PC #2'",adj=0,col="gray") print("NOTE: 'Principal Component (PC)' time series are equivalent to") print(" the returned values from princomp named 'scores' :i.e. res$scores") dev.copy2pdf(file="class19b_timeseries.pdf") print("class19b.R finished successfully") print("check for files names 'class19b*.pdf in your working directory for PDF figure output")