# 2014-4-16
# Multiple Linear Regression
# Introduction

rm(list=ls())
# reset the plot window to full sized plot
par(mfrow=c(1,1))
source("scripts/loadano2.R")
source("scripts/myfunctions.R")

# note this logical variable will allow us 
# to calculate the correlation matrix
iscor<-FALSE

###############################################
# Select here different months Jan - Dec
###############################################
month<-"Jun"
varname<-"tavg"
start<-1950
end<-2012
# sample size, used to define matrix
n<-end-start+1
stationlist<-c("USW00014735","USW00014750","USW00014771","USW00094728")

# variables (stations), used to define matrix
m<-length(stationlist)

# define a n x m matrix filled with NA entries
X<-matrix(NA,n,m)
#####################################################
# here I form a loop to repeat the loading of the 
# data for the stations given in stationlist
#####################################################
for ( i in 1:m) {
  buffer<-loadano2(station=stationlist[i],state="NY",month=month,start=start,end=end)
  mhelp<-mean(buffer$ano)
  # now I assign the centered data vector
  # into the column i of the matrix
  if (iscor) {
    # standardize to get the correlation coefficients
    # NOTE: standard deviation function is using 1/(n-1)
    #       factor, but below we work with 1/n 
    #       That's why sqrt(n-1/n)
    shelp<-sd(buffer$ano)*sqrt((n-1)/n)
    X[,i]<-(buffer$ano-mhelp)/shelp
  } else {
    # default is the covariance will be calculated
    # from centered data
    X[,i]<-(buffer$ano-mhelp)
  }
}
# the function t() is forming the transpose of a matrix
print("size of the data matrix X: ")
print(dim(X))
print("size of the transpose of data matrix X: ")
print(dim(t(X)))

# the matrix multiplication
# %*% is the operator for matrix multiplication
C<-t(X)%*%X
# multiply the matrix with factor 1/n
# to get the covariance (correlation matrix)
C<-1/n*C

if (iscor) { 
  print("size of the correlation matrix: ")
  print(dim(C))
  print ("Correlation Matrix")
  print(C)
} else {
  print("size of the Covariance matrix: ")
  print(dim(C))
  print ("Covariance Matrix")
  print(C)
}

# The variance for the four variables (stations)
# are stored on the diagonal 
# (top left down to bottom right)
vsum<-0
for ( i in 1:m){
  print(paste("Variance for ",stationlist[i]," : ",round(C[i,i],3),sep=''))
  vsum<-vsum+C[i,i]
}
print("---------------------------------------")
print(paste("sum of variances         : ",round(vsum,3)),sep='')




###################################################
# create an empty plot with myplot()
# function from myfunctions.R (new 2014-04-10)
###################################################
xrange<-c(-5,5)
yrange<-xrange

time<-buffer$time
plot(time,X[,1],pch=1,col=3,lwd=4,xlab="year",ylab="tavg ano [deg C]",typ='b')




wait<-readline("press enter to start multiple linear regression with lm()")
# Let's sse how good we can fit Albany temperature anomalies
# with the temp. anomalies from the other three stations

y<-X[,1]

# first I use only one station
res<-lm( y ~ X[,2:2])
b1fit<-res$coeff[2]
print(summary(res))
lines(time,res$fitted,col="blue")

wait<-readline("press enter to start multiple linear regression with lm()")
# Let's sse how good we can fit Albany temperature anomalies
# with the temp. anomalies from the other three stations

y<-X[,1]

# now I add a fit with another single station
res<-lm( y ~ X[,3:3])
b2fit<-res$coeff[2]
print(summary(res))
lines(time,res$fitted,col="cyan")



wait<-readline("press enter to start multiple linear regression with lm()")
#
# now the MLR fit with another single station
res<-lm( y ~ X[,2:3])
print(summary(res))
lines(time,res$fitted,col="black")

b1mlrfit<-res$coeff[2]
b2mlrfit<-res$coeff[3]



print ("compare fitted regression parameters:")
print (b1fit)
print (b2fit)
print (b1mlrfit)
print (b2mlrfit)
print("class20a finished")