# class 11/12 examples for correlation # loader function for Albany, NY Central Park # station data (monthly mean anomalies) source("scripts/loadano.R") # set the window to two rows with one plot in each row par(mfrow=c(1,1)) # Albany station data: USW00014735 month<-"Ann" x1<-loadano(station="USW00014735",month=month,start=1880,end=2012) # New York Central Park station data: USW00094728 x2<-loadano(station="USW00094728",month=month,start=1880,end=2012) # loading the global mean temperature data from table # (source ishttp://www.metoffice.gov.uk/hadobs/hadcrut4/data/ # current/time_series/HadCRUT.4.2.0.0.annual_ns_avg.txt # column two has the data we need) buffer<-read.table("data/hadcrut4ann.txt") tglobe<-list(time=buffer[,1],ano=buffer[,2]) #plot(x1$time,x1$ano,typ='h',col=1) ipos<-tglobe$ano>0 ineg<-tglobe$ano<0 xrange<-c(1880,2013) yrange<-c(-2.5,2.5) plot(xrange,yrange,typ='n',xlab='time',ylab='temp. ano. [Deg C.]') lines(tglobe$time[ipos],tglobe$ano[ipos],typ='h',col=2) lines(tglobe$time[ineg],tglobe$ano[ineg],typ='h',col=4) # adding ALBANY and NYC station lines(x1$time,x1$ano,col=1) lines(x2$time,x2$ano,col="gray",lwd=1) res<-readline("next we create scatter plots") # Excercise: Transform the information into a scatter plot # (a) in the units of Deg. C. as given # (b) centered an # First we must make sure we select data from the same years iglobe<-tglobe$time>=1881 & tglobe$time<=2012 i1<-x1$time>=1881 & x1$time<=2012 i2<-x2$time>=1881 & x2$time<=2012 # global temp. anomalies on x-axis, Albany temp. anomalies on y-axis x<-tglobe$ano[iglobe] y<-x1$ano[i1] iuse<-! (is.na(x)) & ! (is.na(y)) plot ( x[iuse], y[iuse]) mx<-mean(x[iuse],na.rm=TRUE) my<-mean(y[iuse],na.rm=TRUE) # add some help lines to mark the mean of the scatter # Note we do mark the mean of the shown sample in the scatter # exactly because only pairs with existing data are plotted # and only these are used in the mean calculation. # (i.e. data from 1938-2012) lines(c(mx,mx),c(-3,3)) lines(c(-3,3),c(my,my)) # we standardize the variables: (a) center to the mean) res<-readline("centering the data (shifting to the mean)") x[iuse]<-x[iuse]-mx y[iuse]<-y[iuse]-my plot ( x[iuse], y[iuse],main="centered") lines(c(0,0),c(-3,3)) lines(c(-3,3),c(0,0)) # we standardize the the variables: (b) divide by the standard deviation res<-readline("centering the data (scaling to 1 standard deviation)") sx<-sd(x[iuse]) sy<-sd(y[iuse]) x[iuse]<-x[iuse]/sx y[iuse]<-y[iuse]/sy plot ( x[iuse], y[iuse],main="centered & scaled = standardized variables ") lines(c(0,0),c(-3,3)) lines(c(-3,3),c(0,0)) lines(c(-3,3),c(-3,3),lty=2,col='gray') # calculate the covariance of the standardized # data n<-length(x[iuse]) covxy<-1/(n)*sum(x[iuse]*y[iuse]) print(paste("covariance of the standardized variables: ", round(covxy,2))) # using the built-in functions print(paste("compare with R's cov()-function: ", round( cov (x[iuse],y[iuse]) ,2) )) # and correlation print(paste("compare with R's cor()-function: ", round( cor (x[iuse],y[iuse]) ,2) )) # Note difference due to teh factor (1/n we used and 1/(n-1) R uses)