##############
# 11/01/2016 #
##############

# MATRIX ALGEBRA ----

# 2 x 3 matrix (filled row by row) times a vector of length 3
A <- matrix(c(2, 1, 0, -3, 1, 3), nrow = 2, ncol = 3, byrow = TRUE)
A
b <- c(1, 2, 3)
A %*% b

# product of two 3 x 3 matrices and the determinants involved
A <- matrix(c(1, 2, 3, 0, 6, 4, 3, 2, 1), nrow = 3, ncol = 3, byrow = TRUE)
A
B <- matrix(c(4, 5, 6, 3, 1, 5, 4, 5, 6), nrow = 3, ncol = 3, byrow = TRUE)
B
C <- A %*% B
C
det(C)
det(A)
det(B)

# REGRESSION ----
# Tom Jackson develops applications (apps) for iPhones, smartphones and iPods.
# He believes that the success of an app (number of downloads) is
# linearly dependent on the number of websites with a link to the app.
# He considers data for four different apps.
# x: number of linking websites
# y: number of downloads

# case 1
x <- c(20, 30, 40, 50)
y <- c(600, 800, 1000, 900)
plot(
  x, y,
  main = "Scatter diagram of DOWNLOADS (y) vs WEBSITES (x)",
  xlab = "linking websites",
  ylab = "number of downloads",
  col = "blue",
  lwd = 5
)
r <- cor(x, y) # linear correlation coefficient
r
R2 <- r^2 # coefficient of determination of the simple regression
R2
reg <- lm(y ~ x)
summary(reg)
abline(reg, col = "red", lwd = 3) # fitted line over the scatter diagram

# case 2
x <- c(20, 30, 40, 50)
y <- c(600, 800, 1000, 1200)
plot(
  x, y,
  main = "Scatter diagram of DOWNLOADS (y) vs WEBSITES (x)",
  xlab = "linking websites",
  ylab = "number of downloads",
  col = "blue",
  lwd = 5
)
r <- cor(x, y)
r
R2 <- r^2
R2
reg <- lm(y ~ x)
summary(reg)
abline(reg, col = "red", lwd = 3)

# case 3
x <- c(20, 30, 40, 50)
y <- c(700, 600, 900, 750)
plot(
  x, y,
  main = "Scatter diagram of DOWNLOADS (y) vs WEBSITES (x)",
  xlab = "linking websites",
  ylab = "number of downloads",
  col = "blue",
  lwd = 5
)
r <- cor(x, y)
r
R2 <- r^2
R2
reg <- lm(y ~ x)
summary(reg)
abline(reg, col = "red", lwd = 3)

# case 4
x <- c(20, 30, 40, 50)
y <- c(900, 700, 700, 900)
plot(
  x, y,
  main = "Scatter diagram of DOWNLOADS (y) vs WEBSITES (x)",
  xlab = "linking websites",
  ylab = "number of downloads",
  col = "blue",
  lwd = 5
)
r <- cor(x, y)
r
R2 <- r^2
R2
reg <- lm(y ~ x)
summary(reg)
abline(reg, col = "red", lwd = 3)

# CLUSTER ANALYSIS ----

x1 <- c(1, 4, 2, 3, 4, 6, 5, 5, 6, 2)
x2 <- c(4, 3, 3, 6, 7, 7, 4, 5, 2, 5)
n <- length(x1)
n
groups <- 1:n # starting point: each unit is its own cluster
plot(
  x1, x2,
  main = "Scatter diagram of X1 vs X2",
  xlab = "X1",
  ylab = "X2",
  col = "black",
  lwd = 5,
  xlim = c(0, 8),
  ylim = c(0, 8)
)
# distance between units: Euclidean
# distance between clusters: Nearest Neighbour (single linkage)
# Data matrix: one row per unit, columns = coordinates + current cluster label
X <- matrix(c(x1, x2, groups), nrow = length(x1), ncol = 3, byrow = FALSE)
colnames(X) <- c("x1", "x2", "cluster")

d <- dist(X[, -3], method = "euclidean") # distance matrix between units
d
d.1_2 <- sqrt((x1[1] - x1[2])^2 + (x2[1] - x2[2])^2) # Euclidean distance between u1 and u2
min(d) # smallest distance between two units
X[8, ]
X[7, ]

# Nearest neighbour (single linkage): build the complete dendrogram
HCA <- hclust(d, method = "single")

## TD=sum((x1-mean(x1))^2)+sum((x2-mean(x2))^2)
## WD=rep(0,times=g)
## WD
## for (l in 1:g){
##   WD[l]=sum((x1[X[,3]==l]-mean(x1[X[,3]==l]))^2)+sum((x2[X[,3]==l]-mean(x2[X[,3]==l]))^2)
## }
## R2=1-sum(WD)/TD

# Aggregation steps 1 to 7: cutting the tree into (number of units - step)
# clusters gives the group labels after that step. The number of units is
# taken from the data matrix instead of being hard-coded.
for (step in seq_len(7)) {
  groups <- cutree(HCA, k = nrow(X) - step) # group labels after this step
  X[, 3] <- groups
  print(X)
  g <- max(groups) # number of clusters left
  print(g)
}

# Three clusters remain after step 7: draw them in a single plot, one colour
# per cluster (blue = 1, red = 2, green = 3).
cluster_cols <- c("blue", "red", "green")
plot(
  x1, x2,
  main = "Scatter diagram of X1 vs X2",
  xlab = "X1",
  ylab = "X2",
  col = cluster_cols[groups],
  lwd = 5,
  xlim = c(0, 8),
  ylim = c(0, 8)
)

HCA$merge # aggregation steps
plot(HCA) # dendrogram representation

# Farthest neighbour (complete linkage): build the complete dendrogram
HCA <- hclust(d, method = "complete")
## TD=sum((x1-mean(x1))^2)+sum((x2-mean(x2))^2)

groups <- cutree(HCA, k = 3) # group labels after step 7 (three clusters)
X[, 3] <- groups
X
g <- max(groups)

plot(
  x1, x2,
  main = "Scatter diagram of X1 vs X2",
  xlab = "X1",
  ylab = "X2",
  col = cluster_cols[groups],
  lwd = 5,
  xlim = c(0, 8),
  ylim = c(0, 8)
)

HCA$merge # aggregation steps
plot(HCA) # dendrogram representation

##############
# 26/01/2016 #
##############

# MATRIX ALGEBRA ----

A <- matrix(c(1, 2, 0, 1, 3, 3, 1, 0), nrow = 4, ncol = 2, byrow = TRUE)
B <- t(A) # transpose of A
B

a <- c(10, 15, 8, -3)
a
b <- c(2, -2, 0, 5)
b
a %*% b # scalar product between vectors
a * b # element by element product

A <- matrix(a, nrow = 1, ncol = 4)
B <- matrix(b, nrow = 4, ncol = 1)
B %*% A # product between column and row (4 x 4 matrix)

A <- matrix(
  c(1, 0, 2, 1, 3, 1, 2, 2, 0, 1, 0, 1, 4, 3, 2, 1),
  nrow = 4, ncol = 4, byrow = TRUE
)
diag(A) # main diagonal
sum(diag(A)) # trace

# REGRESSION ----
# Tom Jackson develops applications (apps) for iPhones, smartphones and iPods
# He believes that the success of an app (number of downloads) is
# linearly dependent on the number of websites with a link to the app and
# on the number of followers of the developer on twitter.
# He considers data for four different apps
# x1: number of linking websites
# x2: twitter followers
# y: number of downloads
# Compute the ratio between RSS and TSS and prove that it is equal to
# the coefficient of determination

x1 <- c(20, 30, 40, 50)
x2 <- c(290, 1210, 1980, 320)
y <- c(600, 800, 1000, 900)

plot(
  x1, y,
  main = "Scatter diagram of DOWNLOADS (y) vs WEBSITES (x)",
  xlab = "linking websites",
  ylab = "number of downloads",
  col = "blue",
  lwd = 5
)
plot(
  x2, y,
  main = "Scatter diagram of DOWNLOADS (y) vs FOLLOWERS (x)",
  xlab = "NUMBER OF FOLLOWERS",
  ylab = "number of downloads",
  col = "red",
  lwd = 5
)

TSS <- sum((y - mean(y))^2) # Total Sum of Squares
TSS

reg <- lm(y ~ x1 + x2) # regression analysis
attributes(reg)
reg$coefficients # least squares estimates
b0 <- reg$coefficients[1]
b0
b1 <- reg$coefficients[2]
b1
b2 <- reg$coefficients[3]
b2

y.pred <- b0 + b1 * x1 + b2 * x2 # predicted values of y
reg$fitted.values # same values as stored by lm()

RSS <- sum((y.pred - mean(y))^2) # Regression Sum of Squares
RSS
R.squared <- RSS / TSS # coefficient of determination
R.squared # compare with "Multiple R-squared" in summary(reg) below

# Prove that TSS=RSS+ESS
ESS <- sum((y - y.pred)^2) # Error Sum of Squares
ESS
e <- reg$residuals
e
sum(e^2)
e %*% e
TSS
RSS + ESS

# Adjusted R-squared: sums of squares divided by their degrees of freedom.
# The counts come from the data and the fitted model rather than being typed in.
n.obs <- length(y) # number of observations
n.par <- length(reg$coefficients) # estimated coefficients, intercept included
adj.R.squared <- 1 - (ESS / (n.obs - n.par)) / (TSS / (n.obs - 1))
adj.R.squared
summary(reg)

# CLUSTER ANALYSIS ----

x1 <- c(1, 4, 2, 3, 4, 6, 5, 5, 5, 2)
x2 <- c(4, 3, 3, 6, 7, 7, 4, 5, 2, 5)
n <- length(x1)
n
plot(
  x1, x2,
  main = "Scatter diagram of X1 vs X2",
  xlab = "X1",
  ylab = "X2",
  col = "black",
  lwd = 5,
  xlim = c(0, 8),
  ylim = c(0, 8)
)

groups <- c(1, 2, 1, 3, 3, 3, 2, 2, 2, 1) # hypothetical partition of units
X <- matrix(c(x1, x2, groups), nrow = length(x1), ncol = 3, byrow = FALSE)
colnames(X) <- c("x1", "x2", "cluster")
rownames(X) <- paste0("u", seq_along(x1)) # u1, u2, ..., one label per unit

# distance between units: Euclidean
# distance between clusters: Nearest Neighbour (single linkage)

# plot of the three clusters in one figure:
# cluster 1 = blue, cluster 2 = red, cluster 3 = green
cluster_cols <- c("blue", "red", "green")
plot(
  x1, x2,
  main = "Scatter diagram of X1 vs X2",
  xlab = "X1",
  ylab = "X2",
  col = cluster_cols[groups],
  lwd = 5,
  xlim = c(0, 8),
  ylim = c(0, 8)
)
legend(
  "bottomright",
  legend = c("Cluster 1", "Cluster 2", "Cluster 3"),
  pch = 19,
  col = c("blue", "red", "green")
)
text(X[, 2] ~ X[, 1], labels = row.names(X), pos = 4)
X

# Full matrix of Euclidean distances between units; rows/columns are then
# selected by cluster membership to get the between-cluster distances.
D <- as.matrix(dist(X[, c("x1", "x2")], method = "euclidean"))

# Nearest Neighbour: smallest distance between a unit of one cluster and a
# unit of the other
d_C1.C2 <- min(D[groups == 1, groups == 2]) # reached by u3 and u2
d_C1.C2
d_C1.C3 <- min(D[groups == 1, groups == 3]) # reached by u10 and u4
d_C1.C3
d_C2.C3 <- min(D[groups == 2, groups == 3]) # reached e.g. by u8 and u4
d_C2.C3
# we should aggregate C1 and C3

# Farthest Neighbour: largest distance between a unit of one cluster and a
# unit of the other
d_C1.C2 <- max(D[groups == 1, groups == 2]) # reached by u1 and u9
d_C1.C2
d_C1.C3 <- max(D[groups == 1, groups == 3]) # reached by u1 and u6
d_C1.C3
d_C2.C3 <- max(D[groups == 2, groups == 3]) # reached e.g. by u9 and u5
d_C2.C3
# we should aggregate C1 and C2

##############
# 06/06/2016 #
##############

# MATRIX ALGEBRA ----

A <- matrix(c(3, 2, 7, -4), nrow = 2, ncol = 2, byrow = TRUE)
A
# compute the transpose of A
B <- t(A)
B
# compute the determinant of A
det(A)
# compute the inverse of A
C <- solve(A)
C
# product AC and CA (both should be the identity matrix)
A %*% C
C %*% A

# REGRESSION ----
# Tom Jackson develops applications (apps) for iPhones, smartphones and iPods
# He believes that the success of an app (number of downloads) is
# linearly dependent on the number of websites with a link to the app and
# on the number of followers of the developer on twitter.
# He considers data for four different apps
# x1: number of linking websites
# x2: twitter followers
# y: number of downloads
# Compute the ratio between RSS and TSS and prove that it is equal to
# the coefficient of determination
# NOTE(review): the two lines above repeat the text of the previous session;
# the code below studies the collinearity between x1 and x2 (VIF) instead.

x1 <- c(20, 30, 40, 50)
x2 <- c(290, 1210, 1980, 320)
y <- c(600, 800, 1000, 900)

# compute the correlation between x1 and x2
cor(x1, x2)

# compute the scatter diagram of x2 vs x1
plot(
  x1, x2,
  main = "Scatter diagram of FOLLOWERS (x2) vs WEBSITES (x1)",
  xlab = "linking websites",
  ylab = "number of followers",
  col = "blue",
  lwd = 5
)

# regression of x2 vs x1 and VIF computation
reg1 <- lm(x2 ~ x1)
summary(reg1)
abline(reg1, col = "red", lwd = 3)
e1 <- reg1$residuals
TSS <- sum((x2 - mean(x2))^2) # total sum of squares of x2
ESS <- sum(e1^2) # error sum of squares of the auxiliary regression
R1 <- 1 - ESS / TSS # R-squared of the regression of x2 on x1
R1
VIF1 <- 1 / (1 - R1) # variance inflation factor
VIF1

# CLUSTER ANALYSIS ----
# Manhattan, Euclidean and Chebyshev distance between units u1 and u2
# (5 variables)

u1 <- c(10, 8, 6, 8, 7)
u2 <- c(6, 7, 6, 8, 5)
k <- length(u1) # number of variables
k

# Manhattan distance
abs(u1 - u2)
d.1 <- sum(abs(u1 - u2))
d.1

# Euclidean distance
(u1 - u2)^2
sum((u1 - u2)^2)
d.2 <- sqrt(sum((u1 - u2)^2))
d.2

# Chebyshev distance
abs(u1 - u2)
d.inf <- max(abs(u1 - u2))
d.inf

# Keep the distances obtained with the original data: they are the baseline
# for the relative increases computed at the end.
d.1.before <- d.1
d.2.before <- d.2
d.inf.before <- d.inf

# let us change the first value of u2 by replacing 6 with 3
u1 <- c(10, 8, 6, 8, 7)
u2 <- c(3, 7, 6, 8, 5)

# Manhattan distance
d.1 <- sum(abs(u1 - u2))
d.1

# Euclidean distance
d.2 <- sqrt(sum((u1 - u2)^2))
d.2

# Chebyshev distance
d.inf <- max(abs(u1 - u2))
d.inf

# relative distance increase, computed from the stored distances instead of
# hand-rounded figures
# Manhattan
incr.1 <- (d.1 - d.1.before) / d.1.before
incr.1
# Euclidean
incr.2 <- (d.2 - d.2.before) / d.2.before
incr.2
# Chebyshev
incr.inf <- (d.inf - d.inf.before) / d.inf.before
incr.inf