# UNIFE_SEB prof. MINI
# 2019, March 8
##################### second lecture on MLRM #################

# RQ: do sugar, cereals and meat consumption affect alcoholic beverage consumption?
# To investigate this question, refer to the "eating" database.
# The model we should use is a MLRM.

# Step-by-step analysis:
# 0. prepare and describe the dataset and the variables of interest
# 1. graphical analysis of relationships (please, comment)
# 2. specify the model (command and definition of your equation model)
# 3. interpret the coefficients
# 4. range of xi and possible predictions
# 5. the goodness of fit of your model (R2)
# 6. compare different models (R2 adj)
# 7. the significance of your model (F test)
# 8. the significance of each single coefficient (t-test)
# 9. graphical analysis of the LRM conditions (e)
# 10. check for multicollinearity (VIF)
# 11. comment on the obtained results considering the initial research question

# 0. prepare and describe the dataset and the variables of interest
getwd()
# change your working directory if needed (setwd)
eating = read.csv2("eating.csv")
View(eating)
str(eating)
# the database is composed of 126 observations and 15 variables
attach(eating)

# we are interested in 4 variables (and all 126 observations),
# so we create a new dataset containing only the variables of interest:
# alcoholic beverages, sugar, cereals and meat
View(eating)
eating1 = eating[, -seq(1:2)]
View(eating1)
eating2 = eating1[, -3]
View(eating2)
eating3 = eating2[, -3]
View(eating3)
eating4 = eating3[, -4]
View(eating4)
eating5 = eating4[, -4]
View(eating5)
eating6 = eating5[, -5]
View(eating6)
eating7 = eating6[, -5]
eating8 = eating7[, -5]
eating9 = eating8[, -5]
eating10 = eating9[, -5]
View(eating10)
eating = eating10
# we obtained a database called "eating" containing only the variables of our interest
str(eating)

# 1. graphical analysis of relationships (please, make a comment)
y  = Alcoholic.Beverages
x1 = Cereals
x2 = Sugar
x3 = Meat
plot(x1, y)
plot(x2, y)
plot(x3, y)
pairs(x = eating, panel = panel.smooth)

# 2. specify the model (command and definition of your equation model)
# the model we are setting up is:
# Alcoholic.Beverages = b0 + b1*Cereals + b2*Sugar + b3*Meat
# y = b0 + b1*x1 + b2*x2 + b3*x3
reg = lm(y ~ x1 + x2 + x3)
reg
summary(reg)

# 3. interpret the coefficients
b0 = 11.44962
b1 = -0.0429
b2 = 0.42128
b3 = 0.14766
# make a comment for each coefficient

# 4. range of xi and possible predictions
range(x1)
range(x2)
range(x3)
# prediction question: imagine a policy maker in a country wants to fix
#   cereals consumption at 29 units
#   sugar consumption at 29 units
#   meat consumption at 29 units
# how much alcoholic beverage consumption do we expect to observe in that country?
predy = b0 + b1*29 + b2*29 + b3*29
predy
# 26.7047 units of alcoholic beverages

# prediction question: the year after, the policy maker wants to fix
#   cereals consumption at 20 units
#   sugar consumption at 20 units
#   meat consumption at 20 units
# how much alcoholic beverage consumption do we expect to observe in that country?
predy2 = b0 + b1*20 + b2*20 + b3*20
predy2
# 21.970 units of alcoholic beverages

# prediction question: the year after, the policy maker wants to fix
#   cereals consumption at 0 units
#   sugar consumption at 20 units
#   meat consumption at 20 units
# how much alcoholic beverage consumption do we expect to observe in that country?
predy3 = b0 + b1*0 + b2*20 + b3*20
predy3
# 22.828 units of alcoholic beverages
# (the same predictions can also be obtained with predict(); see the sketch after step 5)

# 5. the goodness of fit of your model (R2)
summary(reg)
# R2 = 0.07124 --> 7.124% of the total variance in alcoholic beverages is explained by the
# linear relationship with cereals, sugar and meat.
# our model does not fit the data well (92.876% of the variability is still unexplained)
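# --- sketch: predictions via predict() (same scenarios as in step 4) ---
# A possible alternative to copying the coefficients by hand: the built-in predict()
# function computes fitted values directly from the model object. This is only a
# minimal sketch; the new-data values (29/20/0 units) are the scenarios assumed in
# step 4, and the column names must match the regressor names used in lm() (x1, x2, x3).
newdata = data.frame(x1 = c(29, 20, 0),
                     x2 = c(29, 20, 20),
                     x3 = c(29, 20, 20))
predict(reg, newdata = newdata)
# the three values should be close to predy, predy2 and predy3 (about 26.70, 21.97, 22.83);
# small differences may appear because b0-b3 above are rounded coefficients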
# 6. compare different models (R2 adj)

# model A: we try to delete x1
# the new model is: Y = b0 + b2*x2 + b3*x3 --> alcoholic beverages = b0 + b2*sugar + b3*meat
regA = lm(y ~ x2 + x3)
summary(regA)
# let's compare the models:
#   R2 adj. reg  = 0.0484
#   R2 adj. regA = 0.04794
# the first model is a bit better than the second

# model B: we try to delete x2
# the new model is: Y = b0 + b1*x1 + b3*x3 --> alcoholic beverages = b0 + b1*cereals + b3*meat
regB = lm(y ~ x1 + x3)
summary(regB)
# let's compare the models:
#   R2 adj. reg  = 0.0484
#   R2 adj. regB = 0.01016
# the first model is much better than the second

# model C: we try to delete x3
# the new model is: Y = b0 + b1*x1 + b2*x2 --> alcoholic beverages = b0 + b1*cereals + b2*sugar
regC = lm(y ~ x1 + x2)
summary(regC)
# let's compare the models:
#   R2 adj. reg  = 0.0484
#   R2 adj. regC = 0.0405
# the first model is a bit better than the second

# after having compared different solutions, we can keep
#   - the original model
#   - or model A, because it gives almost the same R2 adj. while using fewer variables

# 7. the significance of your model (F test)
summary(reg)
# please, make a comment:
# F-statistic 3.119; DF: 3 and 122; p-value = 0.0286
# we may be confident at the 95% confidence level that the many-to-one relationship
# described by our model also exists within the population
# (in fact, we may be confident even at the 97% confidence level)

# 8. the significance of each single coefficient (t-test)
summary(reg)
# check each estimate and its associated p-value: make a comment

# 9. graphical analysis of the LRM conditions (e)
e1 = reg$residuals
pred = reg$fitted.values
plot(e1)
plot(pred, e1)
plot(x1, e1)
plot(x2, e1)
plot(x3, e1)
hist(e1)
# now let's produce a normal QQ plot of the residuals e1
qqnorm(e1)
# a QQ plot (quantile-quantile plot) compares the quantiles of a given sample with those
# of the normal distribution; a 45-degree reference line is also plotted.
# if the points lie along a line, the sample has the same shape (up to location and scale)
# as the theoretical distribution we assumed.
# as sample sizes become larger, the plots generally "stabilize" and the features become
# more clearly interpretable rather than representing noise (with some very heavy-tailed
# distributions, a rare large outlier might prevent the picture from stabilizing nicely
# even at quite large sample sizes).
# QQ plots are used to visually check the normality of the data.
# running qqline(e1) adds the reference line to the plot
# please make a comment

# 10. check for multicollinearity (VIF)
e1 = reg$residuals
SST = sum((y - mean(y))^2)
SST
SSE = sum(e1^2)
R1 = 1 - SSE/SST
VIF = 1/(1 - R1)
VIF
# note: this uses the R2 of the full model; for a real analysis of multicollinearity
# we need the auxiliary regressions of each regressor on the others:
reg1 = lm(x1 ~ x2 + x3)
summary(reg1)
reg2 = lm(x2 ~ x1 + x3)
summary(reg2)
reg3 = lm(x3 ~ x1 + x2)
summary(reg3)
R1 = 0.01817   # R2 value from reg1
R2 = 0.02019   # R2 value from reg2
R3 = 0.00983   # R2 value from reg3
VIF1 = 1/(1 - R1)
VIF2 = 1/(1 - R2)
VIF3 = 1/(1 - R3)
VIF1
VIF2
VIF3
# observe your results: VIF should be lower than or equal to 5
# (a packaged cross-check with car::vif() is sketched at the end of the script)

# 11. please comment on the obtained results considering the initial research question
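# --- sketch: VIF via the car package (cross-check for step 10) ---
# A minimal cross-check, assuming the "car" package is installed (it is not loaded
# anywhere above, so install.packages("car") may be needed first): car::vif() computes
# the variance inflation factors directly from the fitted model object.
library(car)
vif(reg)
# the three values should match VIF1, VIF2 and VIF3 computed by hand above
# (all close to 1, i.e. well below the usual threshold of 5)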