# Lab_Examples_Matrix.Regression.ClusterA
# Examples_Matrix.Regression.ClusterA.txt
# —
# Plain Text,
# 10 kB (10536 bytes)
# File contents
##############
# 11/01/2016 #
##############
# MATRIX ALGEBRA
# Matrix-by-vector product: a 2 x 3 matrix times a vector of length 3.
A <- rbind(
  c(2, 1, 0),
  c(-3, 1, 3)
)
A
b <- c(1, 2, 3)
A %*% b
# Matrix-by-matrix product of two 3 x 3 matrices, followed by the
# determinants of the product and of each factor.
A <- rbind(
  c(1, 2, 3),
  c(0, 6, 4),
  c(3, 2, 1)
)
A
B <- rbind(
  c(4, 5, 6),
  c(3, 1, 5),
  c(4, 5, 6)
)
B
C <- A %*% B
C
det(C)
det(A)
det(B)
# REGRESSION
# Tom Jackson develops applications (apps) for iPhones, smartphones and iPods.
# He believes that the success of an app (number of downloads) depends
# linearly on the number of websites with a link to the app.
# He considers data for four different apps:
#   x: number of linking websites
#   y: number of downloads
# case 1
x <- c(20, 30, 40, 50)
y <- c(600, 800, 1000, 900)
plot(
  x, y,
  col = "blue", lwd = 5,
  xlab = "linking websites", ylab = "number of downloads",
  main = "Scatter diagram of DOWNLOADS (y) vs WEBSITES (x)"
)
# linear correlation coefficient and its square
r <- cor(x, y)
r
R2 <- r^2
R2
# simple linear regression of y on x; the fitted line is added to the plot
reg <- lm(y ~ x)
summary(reg)
abline(reg, col = "red", lwd = 3)
# case 2: the four points lie exactly on a straight line
x <- c(20, 30, 40, 50)
y <- c(600, 800, 1000, 1200)
plot(
  x, y,
  col = "blue", lwd = 5,
  xlab = "linking websites", ylab = "number of downloads",
  main = "Scatter diagram of DOWNLOADS (y) vs WEBSITES (x)"
)
# linear correlation coefficient and its square
r <- cor(x, y)
r
R2 <- r^2
R2
# simple linear regression of y on x; the fitted line is added to the plot
reg <- lm(y ~ x)
summary(reg)
abline(reg, col = "red", lwd = 3)
# case 3: weak positive linear relationship
x <- c(20, 30, 40, 50)
y <- c(700, 600, 900, 750)
plot(
  x, y,
  col = "blue", lwd = 5,
  xlab = "linking websites", ylab = "number of downloads",
  main = "Scatter diagram of DOWNLOADS (y) vs WEBSITES (x)"
)
# linear correlation coefficient and its square
r <- cor(x, y)
r
R2 <- r^2
R2
# simple linear regression of y on x; the fitted line is added to the plot
reg <- lm(y ~ x)
summary(reg)
abline(reg, col = "red", lwd = 3)
# case 4: no linear relationship (the covariance between x and y is zero)
x <- c(20, 30, 40, 50)
y <- c(900, 700, 700, 900)
plot(
  x, y,
  col = "blue", lwd = 5,
  xlab = "linking websites", ylab = "number of downloads",
  main = "Scatter diagram of DOWNLOADS (y) vs WEBSITES (x)"
)
# linear correlation coefficient and its square
r <- cor(x, y)
r
R2 <- r^2
R2
# simple linear regression of y on x; the fitted line is added to the plot
reg <- lm(y ~ x)
summary(reg)
abline(reg, col = "red", lwd = 3)
# CLUSTER ANALYSIS
# Agglomerative hierarchical clustering of n units described by (x1, x2).
# The partition is inspected after each of the first 7 aggregation steps:
# after step s there are n - s clusters.
x1 <- c(1, 4, 2, 3, 4, 6, 5, 5, 6, 2)
x2 <- c(4, 3, 3, 6, 7, 7, 4, 5, 2, 5)
n <- length(x1)
n
groups <- seq_len(n) # starting point: each unit is a cluster
plot(x1, x2, main = "Scatter diagram of X1 vs X2", xlab = "X1", ylab = "X2",
     col = "black", lwd = 5, xlim = c(0, 8), ylim = c(0, 8))
# distance between units: Euclidean
# distance between clusters: Nearest Neighbour (single linkage)
X <- matrix(c(x1, x2, groups), nrow = n, ncol = 3, byrow = FALSE)
colnames(X) <- c("x1", "x2", "cluster")
d <- dist(X[, -3], method = "euclidean") # distance matrix between units
d
# Euclidean distance between u1 and u2 computed by hand
d.1_2 <- sqrt((x1[1] - x1[2])^2 + (x2[1] - x2[2])^2)
# smallest distance between two units, and the two units displayed next to it
min(d)
X[8, ]
X[7, ]
HCA <- hclust(d, method = "single") # full dendrogram, nearest neighbour
## TD=sum((x1-mean(x1))^2)+sum((x2-mean(x2))^2)
# step 1
groups <- cutree(HCA, k = n - 1) # group labels after step 1
X[, 3] <- groups
X
g <- max(groups) # number of clusters left
g
## WD=rep(0,times=g)
## WD
## for (l in 1:g){
## WD[l]=sum((x1[X[,3]==l]-mean(x1[X[,3]==l]))^2)+sum((x2[X[,3]==l]-mean(x2[X[,3]==l]))^2)
## }
## R2=1-sum(WD)/TD
# step 2
groups <- cutree(HCA, k = n - 2) # group labels after step 2
X[, 3] <- groups
X
g <- max(groups)
g
# step 3
groups <- cutree(HCA, k = n - 3) # group labels after step 3
X[, 3] <- groups
X
g <- max(groups)
g
# step 4
groups <- cutree(HCA, k = n - 4) # group labels after step 4
X[, 3] <- groups
X
g <- max(groups)
g
# step 5
groups <- cutree(HCA, k = n - 5) # group labels after step 5
X[, 3] <- groups
X
g <- max(groups)
g
# step 6
groups <- cutree(HCA, k = n - 6) # group labels after step 6
X[, 3] <- groups
X
g <- max(groups)
g
# step 7
groups <- cutree(HCA, k = n - 7) # group labels after step 7
X[, 3] <- groups
X
g <- max(groups)
g
# The three remaining clusters in one scatter diagram: the first cluster sets
# up the axes, the others are added with points() so that axes and title are
# drawn only once.
plot(x1[groups == 1], x2[groups == 1], main = "Scatter diagram of X1 vs X2",
     xlab = "X1", ylab = "X2", col = "blue", lwd = 5,
     xlim = c(0, 8), ylim = c(0, 8))
points(x1[groups == 2], x2[groups == 2], col = "red", lwd = 5)
points(x1[groups == 3], x2[groups == 3], col = "green", lwd = 5)
HCA$merge # aggregation steps
plot(HCA) # dendrogram representation
# Same distance matrix d, now aggregated with the Farthest Neighbour
# (complete linkage) criterion.
HCA <- hclust(d, method = "complete") # full dendrogram, farthest neighbour
## TD=sum((x1-mean(x1))^2)+sum((x2-mean(x2))^2)
# partition into 3 clusters, i.e. the group labels after step 7 for 10 units
groups <- cutree(HCA, k = 3)
X[, 3] <- groups
X
g <- max(groups)
# The three clusters in one scatter diagram: the first cluster sets up the
# axes, the others are added with points() so that axes and title are drawn
# only once.
plot(x1[groups == 1], x2[groups == 1], main = "Scatter diagram of X1 vs X2",
     xlab = "X1", ylab = "X2", col = "blue", lwd = 5,
     xlim = c(0, 8), ylim = c(0, 8))
points(x1[groups == 2], x2[groups == 2], col = "red", lwd = 5)
points(x1[groups == 3], x2[groups == 3], col = "green", lwd = 5)
HCA$merge # aggregation steps
plot(HCA) # dendrogram representation
##############
# 26/01/2016 #
##############
# MATRIX ALGEBRA
# Transpose of a 4 x 2 matrix.
A <- rbind(
  c(1, 2),
  c(0, 1),
  c(3, 3),
  c(1, 0)
)
B <- t(A) # transposed
B
# Two vectors of length 4 and their products.
a <- c(10, 15, 8, -3)
a
b <- c(2, -2, 0, 5)
b
a %*% b # scalar product between vectors
a * b # element by element product
# Column (4 x 1) times row (1 x 4) gives a 4 x 4 matrix.
A <- matrix(a, nrow = 1)
B <- matrix(b, ncol = 1)
B %*% A # product between column and row
# Main diagonal of a square matrix and its sum.
A <- rbind(
  c(1, 0, 2, 1),
  c(3, 1, 2, 2),
  c(0, 1, 0, 1),
  c(4, 3, 2, 1)
)
diag(A)
sum(diag(A))
# REGRESSION
# Tom Jackson develops applications (apps) for iPhones, smartphones and iPods.
# He believes that the success of an app (number of downloads) depends
# linearly on the number of websites with a link to the app and
# on the number of followers of the developer on twitter.
# He considers data for four different apps:
#   x1: number of linking websites
#   x2: twitter followers
#   y: number of downloads
# Compute the ratio between RSS and TSS and check that it is equal to
# the coefficient of determination reported by summary(reg).
x1 <- c(20, 30, 40, 50)
x2 <- c(290, 1210, 1980, 320)
y <- c(600, 800, 1000, 900)
plot(x1, y, main = "Scatter diagram of DOWNLOADS (y) vs WEBSITES (x)",
     xlab = "linking websites", ylab = "number of downloads",
     col = "blue", lwd = 5)
plot(x2, y, main = "Scatter diagram of DOWNLOADS (y) vs FOLLOWERS (x)",
     xlab = "NUMBER OF FOLLOWERS", ylab = "number of downloads",
     col = "red", lwd = 5)
TSS <- sum((y - mean(y))^2) # Total Sum of Squares
TSS
reg <- lm(y ~ x1 + x2) # regression analysis
attributes(reg)
reg$coefficients # least squares estimates
b0 <- reg$coefficients[1]
b0
b1 <- reg$coefficients[2]
b1
b2 <- reg$coefficients[3]
b2
y.pred <- b0 + b1 * x1 + b2 * x2 # predicted values of y
reg$fitted.values # same values, as stored in the fitted model
RSS <- sum((y.pred - mean(y))^2) # Regression Sum of Squares
RSS
R.squared <- RSS / TSS # coefficient of determination
R.squared # to be compared with "Multiple R-squared" in summary(reg)
# Check that TSS = RSS + ESS
ESS <- sum((y - y.pred)^2) # Error Sum of Squares
ESS
e <- reg$residuals
e
sum(e^2)
e %*% e
TSS
RSS + ESS
# Adjusted R squared: ESS and TSS are each divided by their degrees of
# freedom. reg$df.residual is (observations - estimated coefficients) and
# TSS has (observations - 1) degrees of freedom, so nothing is hard-coded.
adj.R.squared <- 1 - (ESS / reg$df.residual) / (TSS / (length(y) - 1))
adj.R.squared
summary(reg)
# CLUSTER ANALYSIS
# Given a partition of 10 units into 3 clusters, decide which two clusters
# should be aggregated next under two different linkage criteria.
x1 <- c(1, 4, 2, 3, 4, 6, 5, 5, 5, 2)
x2 <- c(4, 3, 3, 6, 7, 7, 4, 5, 2, 5)
n <- length(x1)
n
plot(x1, x2, main = "Scatter diagram of X1 vs X2", xlab = "X1", ylab = "X2",
     col = "black", lwd = 5, xlim = c(0, 8), ylim = c(0, 8))
groups <- c(1, 2, 1, 3, 3, 3, 2, 2, 2, 1) # hypothetical partition of units
X <- matrix(c(x1, x2, groups), nrow = length(x1), ncol = 3, byrow = FALSE)
colnames(X) <- c("x1", "x2", "cluster")
rownames(X) <- c("u1", "u2", "u3", "u4", "u5", "u6", "u7", "u8", "u9", "u10")
# distance between units: Euclidean
# distance between clusters: Nearest Neighbour (single linkage) first,
# then Farthest Neighbour (complete linkage)
# plot of cluster 1 (this call sets up axes and title)
plot(x1[groups == 1], x2[groups == 1], main = "Scatter diagram of X1 vs X2",
     xlab = "X1", ylab = "X2", col = "blue", lwd = 5,
     xlim = c(0, 8), ylim = c(0, 8))
# clusters 2 and 3 are added to the same diagram with points(), so axes and
# title are drawn only once
points(x1[groups == 2], x2[groups == 2], col = "red", lwd = 5)
points(x1[groups == 3], x2[groups == 3], col = "green", lwd = 5)
legend("bottomright", legend = c("Cluster 1", "Cluster 2", "Cluster 3"),
       pch = 19, col = c("blue", "red", "green"))
# label each unit to the right of its point
text(X[, "x1"], X[, "x2"], labels = rownames(X), pos = 4)
X
# Nearest Neighbour: distance between the two closest units of each pair
# of clusters
d_C1.C2 <- sqrt((x1[3] - x1[2])^2 + (x2[3] - x2[2])^2) # u3 and u2
d_C1.C2
d_C1.C3 <- sqrt((x1[10] - x1[4])^2 + (x2[10] - x2[4])^2) # u10 and u4
d_C1.C3
d_C2.C3 <- sqrt((x1[8] - x1[4])^2 + (x2[8] - x2[4])^2) # u8 and u4
d_C2.C3
# we should aggregate C1 and C3 (smallest of the three distances)
# Farthest Neighbour: distance between the two most distant units of each
# pair of clusters
d_C1.C2 <- sqrt((x1[1] - x1[9])^2 + (x2[1] - x2[9])^2) # u1 and u9
d_C1.C2
d_C1.C3 <- sqrt((x1[1] - x1[6])^2 + (x2[1] - x2[6])^2) # u1 and u6
d_C1.C3
d_C2.C3 <- sqrt((x1[9] - x1[5])^2 + (x2[9] - x2[5])^2) # u9 and u5
d_C2.C3
# we should aggregate C1 and C2 (smallest of the three distances)
##############
# 06/06/2016 #
##############
# MATRIX ALGEBRA
# Transpose, determinant and inverse of a 2 x 2 matrix.
A <- rbind(
  c(3, 2),
  c(7, -4)
)
A
# transpose of A
B <- t(A)
B
# determinant of A
det(A)
# inverse of A
C <- solve(A)
C
# both products A C and C A (identity matrix, up to rounding)
A %*% C
C %*% A
# REGRESSION
# Tom Jackson develops applications (apps) for iPhones, smartphones and iPods.
# He believes that the success of an app (number of downloads) depends
# linearly on the number of websites with a link to the app and
# on the number of followers of the developer on twitter.
# He considers data for four different apps:
#   x1: number of linking websites
#   x2: twitter followers
#   y: number of downloads
# Here the two explanatory variables are compared with each other:
# correlation, regression of x2 on x1 and the resulting VIF.
x1 <- c(20, 30, 40, 50)
x2 <- c(290, 1210, 1980, 320)
y <- c(600, 800, 1000, 900)
# correlation between x1 and x2
cor(x1, x2)
# scatter diagram of x2 vs x1
plot(
  x1, x2,
  col = "blue", lwd = 5,
  xlab = "linking websites", ylab = "number of followers",
  main = "Scatter diagram of FOLLOWERS (x2) vs WEBSITES (x1)"
)
# regression of x2 on x1, with the fitted line added to the diagram
reg1 <- lm(x2 ~ x1)
summary(reg1)
abline(reg1, col = "red", lwd = 3)
# VIF computation: R1 is one minus the ratio between the error sum of
# squares and the total sum of squares of x2, and VIF1 = 1 / (1 - R1)
e1 <- residuals(reg1)
TSS <- sum((x2 - mean(x2))^2)
ESS <- sum(e1^2)
R1 <- 1 - ESS / TSS
R1
VIF1 <- 1 / (1 - R1)
VIF1
# CLUSTER ANALYSIS
# Manhattan, Euclidean and Chebyshev distance between units u1 and u2
# (5 variables), and how each distance reacts when one coordinate changes.
u1 <- c(10, 8, 6, 8, 7)
u2 <- c(6, 7, 6, 8, 5)
k <- length(u1) # number of variables
k
# Manhattan distance: sum of the absolute differences
u1 - u2
abs(u1 - u2)
d.1 <- sum(abs(u1 - u2))
d.1
# Euclidean distance: square root of the sum of the squared differences
(u1 - u2)^2
sum((u1 - u2)^2)
d.2 <- sqrt(sum((u1 - u2)^2))
d.2
# Chebyshev distance: largest absolute difference
abs(u1 - u2)
d.inf <- max(abs(u1 - u2))
d.inf
# keep the distances computed so far, to measure the change below
d.1.old <- d.1
d.2.old <- d.2
d.inf.old <- d.inf
# let us change the first value of u2 by replacing 6 with 3
u1 <- c(10, 8, 6, 8, 7)
u2 <- c(3, 7, 6, 8, 5)
# Manhattan distance
d.1 <- sum(abs(u1 - u2))
d.1
# Euclidean distance
d.2 <- sqrt(sum((u1 - u2)^2))
d.2
# Chebyshev distance
d.inf <- max(abs(u1 - u2))
d.inf
# relative distance increase, computed from the unrounded distances
# Manhattan
(d.1 - d.1.old) / d.1.old
# Euclidean
(d.2 - d.2.old) / d.2.old
# Chebyshev
(d.inf - d.inf.old) / d.inf.old