# scatter covariance and correlation

# example using random numbers

# 
# seq(from, to, by) generates the values from `from` to `to`
# in increments of `by`;
# the smaller the increment, the more data points
# 2014-03-25 UPDATE: I changed the 
# slope and intercept variable names 
# to match the class14.ppt slides
# [please review your class notes for the 
# definitions of a and b]
# a= intercept, b = slope of the linear line
# y=b*x+a

# Build the noise-free reference line y = b*x + a on a regular x grid and
# draw it together with hand-made coordinate axes through the origin.

# x grid from -1 to 10 in steps of 0.05; n is the resulting sample size
x <- seq(-1, 10, 0.05)
n <- length(x)
print(paste("Note: the sample size will be: ", n))

# true model parameters: a = intercept, b = slope
a <- (-2)
b <- (1)
y <- b * x + a

# plotting limits are taken from the noise-free data
xrange <- range(x)
yrange <- range(y)
#yrange<-c(-4,4)

# Open an empty plot (type = "n" sets up the coordinates without drawing
# data), then add the true line. `type` is spelled out in full instead of
# relying on partial matching of `typ`.
plot(xrange, yrange, type = "n")
lines(x, y, type = "l")

# vertical axis at x = 0, with an arrow head and label at the top
lines(c(0, 0), range(y))
text(0, max(y), "^")
text(-0.35, max(y), "y")

# horizontal axis at y = 0, with an arrow head and label at the right end
lines(range(x), c(0, 0))
text(max(x), 0, ">")
text(max(x), -0.75, "x")

# Now we assume that the values of y are contaminated with random
# errors (noise). Only the y values (the dependent variable) are
# affected, not the x values. This convention comes from the history
# of using linear regression with controlled experimental data:
# the experimenter controls one variable x and is interested in 
# the outcome y, but the experiment cannot prevent other
# influencing factors from introducing errors in the outcome 
# (whether measurement errors or environmental factors
# that vary during the experiment).
# Simulate the observations: add Gaussian noise to the y values only.

# standard deviation of the gaussian-distributed random errors
se <- 1
print(paste("standard deviation or the gaussian random sample data ", se))

# One zero-mean error per x value. `mean` is spelled out in full instead of
# relying on partial matching of `m`.
e <- rnorm(n = length(x), mean = 0, sd = se)
yobs <- b * x + a + e

# overlay the noisy observations on the existing plot as "+" symbols
points(x, yobs, col = 4, pch = "+")

print(paste("correlation yobs and x", round(cor(yobs, x), 2)))

# estimate the regression line:
# slope bfit and intercept afit

# Least-squares estimates computed from summary statistics:
#   slope     bfit = r * sd(yobs) / sd(x)
#   intercept afit = mean(yobs) - bfit * mean(x)
r <- cor(x, yobs)
syobs <- sd(yobs)
sx <- sd(x)
bfit <- r * syobs / sx
afit <- mean(yobs) - bfit * mean(x)

# Round only for display. bfit and afit keep full precision, so the line
# drawn below is the actual fitted line and passes through the centre of
# the sample cloud (mean(x), mean(yobs)) rather than slightly beside it.
bfit_rounded <- round(bfit, 2)
afit_rounded <- round(afit, 2)
print(paste("estimated slope parameter b:", bfit_rounded))
print(paste("estimated intercept parameter a:", afit_rounded))
print(paste(
  "regression line model: yfit= ", afit_rounded, "+", bfit_rounded, "*x"
))

# create the regression line from the full-precision coefficients
yfit <- afit + bfit * x
lines(x, yfit, col = 3, lty = 2, lwd = 2)

# mark the center of the sample cloud

points(mean(x), mean(yobs), pch = "X", cex = 1.5, col = 3)


# Exercise:

# Change the correlation coefficient by increasing the variance of 
# the errors
#
# Change the sample size and see how it affects the estimated
# regression parameters
# and line.