# class 11/12  examples for correlation
# loader function for Albany, NY Central Park
# station data (monthly mean anomalies)
source("scripts/loadano.R")

# set the window to two rows with one plot in each row 
par(mfrow=c(1,1))

# Albany station data: USW00014735

month<-"Ann"
x1<-loadano(station="USW00014735",month=month,start=1880,end=2012)



# New York Central Park station data: USW00094728
x2<-loadano(station="USW00094728",month=month,start=1880,end=2012)

# loading the global mean temperature data from table
# (source ishttp://www.metoffice.gov.uk/hadobs/hadcrut4/data/
#  current/time_series/HadCRUT.4.2.0.0.annual_ns_avg.txt
# column two has the data we need)

buffer<-read.table("data/hadcrut4ann.txt")

tglobe<-list(time=buffer[,1],ano=buffer[,2])

#plot(x1$time,x1$ano,typ='h',col=1)
ipos<-tglobe$ano>0
ineg<-tglobe$ano<0
xrange<-c(1880,2013) 
yrange<-c(-2.5,2.5)
plot(xrange,yrange,typ='n',xlab='time',ylab='temp. ano. [Deg C.]')
lines(tglobe$time[ipos],tglobe$ano[ipos],typ='h',col=2)
lines(tglobe$time[ineg],tglobe$ano[ineg],typ='h',col=4)
# adding ALBANY and NYC station
lines(x1$time,x1$ano,col=1)
lines(x2$time,x2$ano,col="gray",lwd=1)



res<-readline("next we create scatter plots")

# Excercise: Transform the information into a scatter plot
# (a) in the units of Deg. C. as given
# (b) centered an

# First we must make sure we select data from the same years
iglobe<-tglobe$time>=1881 & tglobe$time<=2012
i1<-x1$time>=1881 & x1$time<=2012
i2<-x2$time>=1881 & x2$time<=2012
# global temp. anomalies on x-axis, Albany temp. anomalies on y-axis 
x<-tglobe$ano[iglobe]
y<-x1$ano[i1]
iuse<-! (is.na(x)) & ! (is.na(y))
plot ( x[iuse], y[iuse])

mx<-mean(x[iuse],na.rm=TRUE)
my<-mean(y[iuse],na.rm=TRUE)


# add some help lines to mark the mean of the scatter

# Note we do mark the mean of the shown sample in the scatter
# exactly because only pairs with existing data are plotted
# and only these are used in the mean calculation.
# (i.e. data from 1938-2012)

lines(c(mx,mx),c(-3,3))
lines(c(-3,3),c(my,my))




# we standardize the variables: (a) center to the mean)

res<-readline("centering the data (shifting to the mean)")



x[iuse]<-x[iuse]-mx
y[iuse]<-y[iuse]-my
plot ( x[iuse], y[iuse],main="centered")
lines(c(0,0),c(-3,3))
lines(c(-3,3),c(0,0))


# we standardize the the variables: (b) divide by the standard deviation

res<-readline("centering the data (scaling to 1 standard deviation)")

sx<-sd(x[iuse])
sy<-sd(y[iuse])

x[iuse]<-x[iuse]/sx
y[iuse]<-y[iuse]/sy

plot ( x[iuse], y[iuse],main="centered & scaled = standardized variables ")
lines(c(0,0),c(-3,3))
lines(c(-3,3),c(0,0))

lines(c(-3,3),c(-3,3),lty=2,col='gray')

# calculate the covariance of the standardized
# data
n<-length(x[iuse])
covxy<-1/(n)*sum(x[iuse]*y[iuse])
print(paste("covariance of the standardized variables: ",
            round(covxy,2)))

# using the built-in functions
print(paste("compare with R's cov()-function: ",
            round( cov (x[iuse],y[iuse]) ,2) ))

# and correlation
print(paste("compare with R's cor()-function: ",
            round( cor (x[iuse],y[iuse]) ,2) ))
# Note difference due to teh factor (1/n we used and 1/(n-1) R uses)