############################################ # 2014-04-19 PCA analysis of temperature # stations ############################################ ############################################### # SUPPORT FUNCTIONS ARE DEFINED HERE ############################################### # ##################################################### # MAIN PART ##################################################### rm(list=ls()) dev.off() par(mfrow=c(1,1)) print("Principal Component Analsysis") print("=============================") # read the data file with January 24 (20 June) stations # that have comple varname<-'tavg' selmon<-'Jan' startyr<-1950 endyr<-2012 ################################################## # FOR HOMEWORK 8: Control here if noise # is added to the time series # and the standard deviation # of the random (gaussian) error ################################################## addnoise<-FALSE # random temperature anomalies standard deviation # [deg C] stddevnoise<-1.0 # get the data matrix # NOTE: first column contains an index, # second column the time # first station data is at column 3 filename<-paste("data/NY/class21a_",varname,'_',selmon,'_',startyr,'-',endyr,'.csv',sep='') # NOTE: READING FROM A CSV TABLE DOE NOT CREATE A # MATRIX AUTOMATICALLY (it is a data.frame object) # Here we 'force' a conversion to a matrix buffer<-data.matrix(read.csv(filename)) time<-buffer[,2] isize<-dim(buffer) n<-isize[1] ################################################# # For Homework 8: Manually control the # number of stations # that are used in the PCA ################################################# m<-2 # uncomment this line to use all available stations #m<-isize[2]-2 X<-matrix(NA,n,m) for (i in 1:m){ X[,i]<-buffer[,i+2] # Add some random noise to the data if (addnoise) { X[,i]<-X[,i]+rnorm(n,sd=stddevnoise) } } print("Size of data matrix X: ") print(dim(X)) # calculate COVARIANCE MATRIX # note that rows in matrix sst represent grid locations # and columns are the samples over the years # that's why we use wait<-readline("press enter for the Covariance Matrix analysis") C<-1/n*t(X)%*%X # the variance of all the stations is stored in the diagonal entires variances<-diag(C) totalvar<-sum(variances) print(paste("sum of the variances from all stations: ",totalvar,sep='')) # here we plot the variances of the stations sorted in ascending order index<-seq(1,m,1) sortedvar<-sort(variances,decreasing=TRUE) # NOTE: sort() returns the vector entries sorted in descending (or ascending) # order ########################################### # here we do the PCA 'by hand' # instead of usign the function princomp() ############################################ # (1) first get the eigenvectors wait<-readline("press enter for calculating the eigenvectors and eigenvalues of C") # e<-eigen(C) print("Dimension of the eigenvetor matrix") print(dim(e$vectors)) # eigenvectors indicate how much of the total variance # of the station data set projects onto the eigenvectors # the more covariance among the stations the more # will project onto the first few leading eigenvectors # We can plot the 'eigenvalue spectrum' print(paste("sum of the eigenvalues: ",sum(e$values))) # maximize the y-axis range to fit the largest eigenvalue yrange<-c(0,max(e$values[1],sortedvar[1])) plot(c(1,m),yrange,typ='n', xlab="station rank (sorted)",ylab="variance (deg C)^2") lines(sortedvar,typ='h',col=1,lwd=5) lines(e$values,typ='h',col=3,lwd=3) legend(1.2,yrange[2],c("stations","eigenvalues"),pch=c(19,19),col=c(1,3)) wait<-readline("press enter to visualize the eigenvectors:") # NOTE: the dimension of our vector space is given by # the numbers of stations. So it is a high-dimensional # space, but we can plot the coordinates of the eigenvectors: # the x-axis position is used to indicate the # station (i.e. each station 'represents one dimension) # eignevectors are stored as column vectors plot(c(1,m),c(-1,1),typ='n',xlab='station',ylab='weight') lines(index,e$vector[,1],col=1,lwd=3,typ='h') lines(index,e$vector[,2],col=2,lwd=1,typ='h') legend(1.2,1,c("eigenvector #1","eigenvector #2"),pch=c(19,19),col=c(1,2)) wait<-readline("press enter to project station data onto the eigenvectors (create PC time series)") # Principal Component time series # associated with eigenvectors # are obtained by using the entries of the eigenvectors # to form a weighted sum of the stations # temperature anomaly time series # here the first two PCs are calculated A<-X%*%e$vector[,1:2] plot(time,A[,1],col=1,typ='l',lwd=2,xlab='time',ylab='PC [deg C]') lines(time,A[,2],typ='l',col=2,lwd=2) print(paste("variance of PC 1: ",var(A[,1]))) print(paste("variance of PC 2: ",var(A[,2]))) wait<-readline("Press enter to compare the PC time with station average") # compare this with the station average time series # NOTE: I you want to use a powerful R-function # you can avoid loops with xmean<-apply(X,1,mean) # see help(apply) for more information xmean<-matrix(NA,n,1) for (i in 1:n) { xmean[i]<-mean(X[i,]) } # compare standardized time series plot(time,A[,1]/sqrt(e$value[1]),col=1,lwd=2,typ='l', ,xlab='time',ylab='standardized anomalies') lines(time,xmean/sd(xmean),col='green',lwd=1) print(paste("correlation between PC #1", "and station mean time series: ", round(cor(A[,1],xmean),6))) print(paste("correlation between PC #2", "and station mean time series: ", round(cor(A[,2],xmean),6)))