##############
# 11/01/2016 #
##############


# MATRIX ALGEBRA

# Matrix-vector product: a 2 x 3 matrix (written row by row) times a
# vector of length 3.
A <- rbind(c( 2, 1, 0),
           c(-3, 1, 3))
A
b <- c(1, 2, 3)
A %*% b

# Matrix-matrix product of two 3 x 3 matrices, followed by the determinants
# of the product and of each factor.
A <- rbind(c(1, 2, 3),
           c(0, 6, 4),
           c(3, 2, 1))
A
B <- rbind(c(4, 5, 6),
           c(3, 1, 5),
           c(4, 5, 6))
B
C <- A %*% B
C
det(C)
det(A)
det(B)

# REGRESSION 

# Tom Jackson develops applications (apps) for iPhones, smartphones and iPods.
# He believes that the success of an app (number of downloads) is
# linearly dependent on the number of websites with a link to the app.
# He considers data for four different apps.
# x: number of linking websites
# y: number of downloads

# case 1
# Data for case 1: linking websites (x) and downloads (y).
x <- c(20, 30, 40, 50)
y <- c(600, 800, 1000, 900)

# Scatter diagram of the observations.
plot(
  x, y,
  main = "Scatter diagram of DOWNLOADS (y) vs WEBSITES (x)",
  xlab = "linking websites",
  ylab = "number of downloads",
  col = "blue",
  lwd = 5
)

# Linear correlation coefficient and its square.
r <- cor(x, y)
r
R2 <- r^2
R2

# Simple linear regression of y on x; the fitted line is added to the plot.
reg <- lm(y ~ x)
summary(reg)
abline(reg, col = "red", lwd = 3)

# case 2
# Data for case 2: the downloads lie exactly on a straight line in x.
x <- c(20, 30, 40, 50)
y <- c(600, 800, 1000, 1200)

# Scatter diagram of the observations.
plot(
  x, y,
  main = "Scatter diagram of DOWNLOADS (y) vs WEBSITES (x)",
  xlab = "linking websites",
  ylab = "number of downloads",
  col = "blue",
  lwd = 5
)

# Linear correlation coefficient and its square.
r <- cor(x, y)
r
R2 <- r^2
R2

# Simple linear regression of y on x; the fitted line is added to the plot.
reg <- lm(y ~ x)
summary(reg)
abline(reg, col = "red", lwd = 3)

# case 3
# Data for case 3: linking websites (x) and downloads (y).
x <- c(20, 30, 40, 50)
y <- c(700, 600, 900, 750)

# Scatter diagram of the observations.
plot(
  x, y,
  main = "Scatter diagram of DOWNLOADS (y) vs WEBSITES (x)",
  xlab = "linking websites",
  ylab = "number of downloads",
  col = "blue",
  lwd = 5
)

# Linear correlation coefficient and its square.
r <- cor(x, y)
r
R2 <- r^2
R2

# Simple linear regression of y on x; the fitted line is added to the plot.
reg <- lm(y ~ x)
summary(reg)
abline(reg, col = "red", lwd = 3)

# case 4
# Data for case 4: linking websites (x) and downloads (y).
x <- c(20, 30, 40, 50)
y <- c(900, 700, 700, 900)

# Scatter diagram of the observations.
plot(
  x, y,
  main = "Scatter diagram of DOWNLOADS (y) vs WEBSITES (x)",
  xlab = "linking websites",
  ylab = "number of downloads",
  col = "blue",
  lwd = 5
)

# Linear correlation coefficient and its square.
r <- cor(x, y)
r
R2 <- r^2
R2

# Simple linear regression of y on x; the fitted line is added to the plot.
reg <- lm(y ~ x)
summary(reg)
abline(reg, col = "red", lwd = 3)

# CLUSTER ANALYSIS

# Hierarchical cluster analysis of n units described by two variables.
x1 <- c(1, 4, 2, 3, 4, 6, 5, 5, 6, 2)
x2 <- c(4, 3, 3, 6, 7, 7, 4, 5, 2, 5)
n <- length(x1)
n

groups <- seq_len(n)  # starting point: each unit is a cluster

plot(x1, x2, main = "Scatter diagram of X1 vs X2", xlab = "X1", ylab = "X2",
     col = "black", lwd = 5, xlim = c(0, 8), ylim = c(0, 8))

# distance between units: Euclidean
# distance between clusters: Nearest Neighbour (single linkage)

# Data matrix: the two variables plus the current cluster label of each unit.
X <- cbind(x1 = x1, x2 = x2, cluster = groups)
d <- dist(X[, c("x1", "x2")], method = "euclidean")  # distances between units
d
# Euclidean distance between u1 and u2 computed by hand, to compare with d.
d.1_2 <- sqrt((x1[1] - x1[2])^2 + (x2[1] - x2[2])^2)
d.1_2
min(d)
# The two units displayed below are the ones inspected for the minimum distance.
X[8, ]
X[7, ]

# Colours used for the final three clusters (cluster 1, 2, 3).
cluster.cols <- c("blue", "red", "green")

# Complete dendrogram with the nearest neighbour (single linkage) criterion.
HCA <- hclust(d, method = "single")
## TD=sum((x1-mean(x1))^2)+sum((x2-mean(x2))^2)

# Aggregation steps: after step s the units are split into n - s clusters.
# The loop stops when three clusters are left.
for (step in seq_len(n - 3)) {
  cat("step", step, "\n")
  groups <- cutree(HCA, k = n - step)  # group labels after this step
  X[, "cluster"] <- groups
  print(X)
  g <- max(groups)                     # number of clusters after this step
  print(g)
}
## Optional: R2 of the partition currently stored in X[,3]
## WD=rep(0,times=g)
## WD
## for (l in 1:g){
## WD[l]=sum((x1[X[,3]==l]-mean(x1[X[,3]==l]))^2)+sum((x2[X[,3]==l]-mean(x2[X[,3]==l]))^2)
## }
## R2=1-sum(WD)/TD

# Three-cluster partition, one colour per cluster, drawn in a single plot
# so that axes and title are not drawn several times on top of each other.
plot(x1, x2, main = "Scatter diagram of X1 vs X2", xlab = "X1", ylab = "X2",
     col = cluster.cols[groups], lwd = 5, xlim = c(0, 8), ylim = c(0, 8))

HCA$merge  # aggregation steps
plot(HCA)  # dendrogram representation


# Complete dendrogram with the farthest neighbour (complete linkage) criterion.
HCA <- hclust(d, method = "complete")
## TD=sum((x1-mean(x1))^2)+sum((x2-mean(x2))^2)

# Partition into three clusters, i.e. the group labels after step n - 3.
groups <- cutree(HCA, k = 3)
X[, "cluster"] <- groups
X
g <- max(groups)

plot(x1, x2, main = "Scatter diagram of X1 vs X2", xlab = "X1", ylab = "X2",
     col = cluster.cols[groups], lwd = 5, xlim = c(0, 8), ylim = c(0, 8))

HCA$merge  # aggregation steps
plot(HCA)  # dendrogram representation



##############
# 26/01/2016 #
##############


# MATRIX ALGEBRA

# Transpose of a 4 x 2 matrix (written row by row).
A <- rbind(c(1, 2),
           c(0, 1),
           c(3, 3),
           c(1, 0))
B <- t(A)  # transposed
B

# Products between two vectors of length 4.
a <- c(10, 15, 8, -3)
a
b <- c(2, -2, 0, 5)
b
a %*% b  # scalar product between vectors
a * b    # element by element product
A <- matrix(a, nrow = 1)  # a as a row
B <- matrix(b, ncol = 1)  # b as a column
B %*% A  # product between column and row

# Main diagonal of a 4 x 4 matrix and the sum of its elements.
A <- rbind(c(1, 0, 2, 1),
           c(3, 1, 2, 2),
           c(0, 1, 0, 1),
           c(4, 3, 2, 1))
diag(A)
sum(diag(A))


# REGRESSION 

# Tom Jackson develops applications (apps) for iPhones, smartphones and iPods 
# He believes that the success of an app (number of downloads) is  
# linearly dependent on the number of websites with a link to the app and 
# on the number of followers of the developer on twitter.
# He considers data for four different apps 
# x1: number of linking websites
# x2: twitter followers
# y: number of downloads

# Compute the ratio between RSS and TSS and prove that it is equal to 
# the coefficient of determination

# Multiple regression of downloads (y) on linking websites (x1) and
# twitter followers (x2), with the decomposition TSS = RSS + ESS.
x1 <- c(20, 30, 40, 50)
x2 <- c(290, 1210, 1980, 320)
y <- c(600, 800, 1000, 900)

plot(x1, y, main = "Scatter diagram of DOWNLOADS (y) vs WEBSITES (x)",
     xlab = "linking websites", ylab = "number of downloads", col = "blue", lwd = 5)
plot(x2, y, main = "Scatter diagram of DOWNLOADS (y) vs FOLLOWERS (x)",
     xlab = "NUMBER OF FOLLOWERS", ylab = "number of downloads", col = "red", lwd = 5)

TSS <- sum((y - mean(y))^2)  # Total Sum of Squares
TSS

reg <- lm(y ~ x1 + x2)  # regression analysis
attributes(reg)
reg$coefficients  # least squares estimates
b0 <- reg$coefficients[1]
b0
b1 <- reg$coefficients[2]
b1
b2 <- reg$coefficients[3]
b2
y.pred <- b0 + b1 * x1 + b2 * x2  # predicted values of y
y.pred
reg$fitted.values  # same values as y.pred, as stored by lm()

RSS <- sum((y.pred - mean(y))^2)  # Regression Sum of Squares
RSS

# Coefficient of determination as the ratio RSS/TSS, shown next to the
# value reported by summary() so that the two can be compared.
R.squared <- RSS / TSS
R.squared
summary(reg)$r.squared

# Prove that TSS=RSS+ESS

ESS <- sum((y - y.pred)^2)  # Error Sum of Squares
ESS
e <- reg$residuals
e
sum(e^2)
e %*% e

TSS
RSS + ESS

# Adjusted R squared: both sums of squares are divided by their degrees of
# freedom, taken from the data instead of being typed in as constants.
n.obs <- length(y)                  # number of observations
n.coef <- length(reg$coefficients)  # estimated parameters (intercept included)
adj.R.squared <- 1 - (ESS / (n.obs - n.coef)) / (TSS / (n.obs - 1))
adj.R.squared

summary(reg)


# CLUSTER ANALYSIS

# Distances between the clusters of a given partition of n units.
x1 <- c(1, 4, 2, 3, 4, 6, 5, 5, 5, 2)
x2 <- c(4, 3, 3, 6, 7, 7, 4, 5, 2, 5)
n <- length(x1)
n

plot(x1, x2, main = "Scatter diagram of X1 vs X2", xlab = "X1", ylab = "X2",
     col = "black", lwd = 5, xlim = c(0, 8), ylim = c(0, 8))

groups <- c(1, 2, 1, 3, 3, 3, 2, 2, 2, 1)  # hypothetical partition of units
X <- cbind(x1 = x1, x2 = x2, cluster = groups)
rownames(X) <- paste0("u", seq_len(n))

# distance between units: Euclidean
# distance between clusters: Nearest Neighbour (single linkage) first,
# then Farthest Neighbour (complete linkage)

# Plot of the three clusters, one colour each, drawn in a single plot so
# that axes and title are not drawn several times on top of each other.
cluster.cols <- c("blue", "red", "green")
plot(x1, x2, main = "Scatter diagram of X1 vs X2", xlab = "X1", ylab = "X2",
     col = cluster.cols[groups], lwd = 5, xlim = c(0, 8), ylim = c(0, 8))

# The legend uses the same symbol and line width as the plotted points.
legend("bottomright", legend = c("Cluster 1", "Cluster 2", "Cluster 3"),
       pch = 1, pt.lwd = 5, col = cluster.cols)
text(x1, x2, labels = rownames(X), pos = 4)

X

# Full matrix of Euclidean distances between units and cluster membership
# masks, used to check the pairs of units picked by eye below.
D <- as.matrix(dist(X[, c("x1", "x2")], method = "euclidean"))
in.C1 <- groups == 1
in.C2 <- groups == 2
in.C3 <- groups == 3

# Nearest Neighbour: distance between the two closest units of the clusters
d_C1.C2 <- sqrt((x1[3] - x1[2])^2 + (x2[3] - x2[2])^2)    # u3 and u2
d_C1.C2
d_C1.C3 <- sqrt((x1[10] - x1[4])^2 + (x2[10] - x2[4])^2)  # u10 and u4
d_C1.C3
d_C2.C3 <- sqrt((x1[8] - x1[4])^2 + (x2[8] - x2[4])^2)    # u8 and u4
d_C2.C3
# The hand-picked pairs must attain the minimum between-cluster distances.
stopifnot(isTRUE(all.equal(
  c(d_C1.C2, d_C1.C3, d_C2.C3),
  c(min(D[in.C1, in.C2]), min(D[in.C1, in.C3]), min(D[in.C2, in.C3]))
)))

# we should aggregate C1 and C3

# Farthest Neighbour: distance between the two farthest units of the clusters
d_C1.C2 <- sqrt((x1[1] - x1[9])^2 + (x2[1] - x2[9])^2)  # u1 and u9
d_C1.C2
d_C1.C3 <- sqrt((x1[1] - x1[6])^2 + (x2[1] - x2[6])^2)  # u1 and u6
d_C1.C3
d_C2.C3 <- sqrt((x1[9] - x1[5])^2 + (x2[9] - x2[5])^2)  # u9 and u5
d_C2.C3
# The hand-picked pairs must attain the maximum between-cluster distances.
stopifnot(isTRUE(all.equal(
  c(d_C1.C2, d_C1.C3, d_C2.C3),
  c(max(D[in.C1, in.C2]), max(D[in.C1, in.C3]), max(D[in.C2, in.C3]))
)))

# we should aggregate C1 and C2

# we should aggregate C1 and C2


##############
# 06/06/2016 #
##############


# MATRIX ALGEBRA

# 2 x 2 matrix, written row by row.
A <- rbind(c(3,  2),
           c(7, -4))
A

# transpose of A
B <- t(A)
B

# determinant of A
det(A)

# inverse of A
C <- solve(A)
C

# products AC and CA
A %*% C
C %*% A



# REGRESSION 

# Tom Jackson develops applications (apps) for iPhones, smartphones and iPods 
# He believes that the success of an app (number of downloads) is  
# linearly dependent on the number of websites with a link to the app and 
# on the number of followers of the developer on twitter.
# He considers data for four different apps 
# x1: number of linking websites
# x2: twitter followers
# y: number of downloads

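# Check for collinearity between the two regressors: compute the correlation
# between x1 and x2, regress x2 on x1 and use the coefficient of determination
# of that regression (1 - ESS/TSS) to obtain the Variance Inflation Factor (VIF)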

x1 <- c(20, 30, 40, 50)
x2 <- c(290, 1210, 1980, 320)
y <- c(600, 800, 1000, 900)

# correlation between the two regressors
cor(x1, x2)
# scatter diagram of x2 vs x1
plot(
  x1, x2,
  main = "Scatter diagram of FOLLOWERS (x2) vs WEBSITES (x1)",
  xlab = "linking websites",
  ylab = "number of followers",
  col = "blue",
  lwd = 5
)
# auxiliary regression of x2 on x1, with the fitted line added to the plot
reg1 <- lm(x2 ~ x1)
summary(reg1)
abline(reg1, col = "red", lwd = 3)

# Coefficient of determination of the auxiliary regression (1 - ESS/TSS)
# and the Variance Inflation Factor derived from it.
e1 <- residuals(reg1)
TSS <- sum((x2 - mean(x2))^2)
ESS <- sum(e1^2)
R1 <- 1 - ESS / TSS
R1
VIF1 <- 1 / (1 - R1)
VIF1


# CLUSTER ANALYSIS

# Manhattan, Euclidean and Chebyshev distances between units u1 and u2
# (5 variables)

# Two units observed on k variables.
u1 <- c(10, 8, 6, 8, 7)
u2 <- c(6, 7, 6, 8, 5)
k <- length(u1)
k

# Manhattan distance: sum of the absolute differences
u1 - u2
abs(u1 - u2)
d.1 <- sum(abs(u1 - u2))
d.1

# Euclidean distance: square root of the sum of the squared differences
(u1 - u2)^2
sum((u1 - u2)^2)
d.2 <- sqrt(sum((u1 - u2)^2))
d.2

# Chebyshev distance: largest absolute difference
abs(u1 - u2)
d.inf <- max(abs(u1 - u2))
d.inf

# Keep the distances just computed: they are the reference values for the
# relative increases evaluated at the end.
d.1.old <- d.1
d.2.old <- d.2
d.inf.old <- d.inf

# let us change the first value of u2 by replacing 6 with 3
u2[1] <- 3

# Manhattan distance
d.1 <- sum(abs(u1 - u2))
d.1

# Euclidean distance
d.2 <- sqrt(sum((u1 - u2)^2))
d.2

# Chebyshev distance
d.inf <- max(abs(u1 - u2))
d.inf

# Relative distance increase, computed from the stored values rather than
# from rounded figures typed in by hand.
# Manhattan
(d.1 - d.1.old) / d.1.old

# Euclidean
(d.2 - d.2.old) / d.2.old

# Chebyshev
(d.inf - d.inf.old) / d.inf.old