### SEB UNIFE 2019 - docente: MINI V. ###
#### non hierarchical cluster analysis ####
# we use the in-class experiment database: wh

# STEP-BY-STEP ANALYSIS
# 0. database creation/preparation
# 1. checking variables and observations
# 2. create and comment the plot
# 3. normalize the data
# 4. performing non hierarchical cluster analysis (k-means method)
# 5. observe and comment your clusters
# 6. plotting results and checking for the right number of clusters
# 7. make a final comment (considering the initial research questions)

######## 0. database creation and/or preparation
getwd()
# change the working directory if needed
wh=read.csv2("wh.csv")
attach(wh)

######## 1. checking variables and observations
str(wh)
View(wh)

######### 2. create and comment the plot
pairs(x=wh,panel=panel.smooth)
# or you can use:
plot(peso~altezza,wh)
# we may add the label m for male and f for female
with(wh,text(peso~altezza,labels=sex))
# we may adjust the labels' position and the text size
with(wh,text(peso~altezza,labels=sex,pos=1,cex=.6))
# pos=1 places the label below the dot (pos: 1=below, 2=left, 3=above, 4=right)
# cex=.6 sets the size of the text (the higher the number, the bigger the text)
# please, comment the scatter plot looking at the groups you may identify.

######## 3. normalize the data
# the average value of each variable will be 0 and
# the standard deviation will be approximately 1
# the standardized value = (observed value - mean)/standard deviation
# you can compute all the elements using R:
sd(peso)
mean(peso)
sd(altezza)
mean(altezza)
# and then compute each standardized value
# i.e. for the 1st student: Zpeso=(60-65.48)/12.94
# or we can use the command scale in R
# obviously we need to use only numeric variables
wh1=wh[,-1]
wh2=wh1[,-3]
str(wh2) # all the variables are numeric
z=wh2
m=apply(z,2,mean)
s=apply(z,2,sd)
z=scale(z,m,s)

######### 4. performing non hierarchical cluster analysis (k-means method)
# we use the kmeans command
# the arguments are:
# - the standardized values matrix (z)
# - the number of clusters we impose (3)
kc=kmeans(z,3) # we perform a non hierarchical cluster analysis based on the k-means method
# let's observe our performed n/h cluster analysis

######### 5. observe and comment your clusters
kc
# 5.A #### 1st OUTPUT ROW: we obtain general information about our analysis:
# K-means clustering with 3 clusters of sizes 23, 20, 15
# 5.B #### 2nd OUTPUT ROW: we may profile each cluster using the cluster means:
# i.e. the 3rd cluster is characterized by the tallest and heaviest students
# (in fact it's defined by the two highest average values)
# 5.C #### 3rd OUTPUT ROW: clustering vector
# here we may identify the cluster memberships
# i.e. the first student belongs to the first cluster
# i.e. the second student belongs to the first cluster too
# i.e. the 11th student belongs to the 3rd cluster ... and so on
# 5.D #### 4th OUTPUT ROW: WCSS (within-cluster sum of squares) in each cluster
# in other words we have information about the within deviance (WD)
# in addition we can see the R square value = BD/TD = BSS/TSS = 74.6%
# (remember: R square varies between 0 and 1 --> the lower the value, the weaker the performed analysis)
# in other words we are saying that, given a partition of the units in 3 groups,
# the proportion of global variability explained by this partition is 74.6%
# so we are explaining a large part of the global variability in our data
# 5.E #### last OUTPUT ROW: available components
# all the specific information we may ask for on our performed analysis
kc$cluster
kc$centers
kc$size
kc$iter
# etc.
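# (additional sketch, not part of the original exercise)
# the R square value can also be recomputed directly from the components stored in kc,
# as the ratio between the between-cluster and the total sum of squares:
kc$betweenss/kc$totss # should match the "between_SS / total_SS" ratio printed above (about 74.6%)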
######### 6. plotting results and checking for the right number of clusters
plot(peso~altezza,wh)
# using this visualization we are unable to see the created clusters
# to identify each created cluster we can color the points by cluster
plot(peso~altezza,wh,col=kc$cluster)
plot(peso~altezza,wh,col=wh$sex)
# generally, if the two plots are similar, it means you are performing a good classification/clustering
# please, make a comment: why are the two graphs not very similar?
# (see also the elbow-plot sketch at the end of this script for another way to check the number of clusters)

######### 7. make a final comment (considering the initial research questions)
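######### appendix (additional sketch, assuming the standardized matrix z from step 3)
# a common way to check for the right number of clusters is the "elbow" plot:
# we run kmeans for several values of k and plot the total within-cluster sum of squares (WCSS);
# the k where the curve bends (the "elbow") suggests a reasonable number of clusters
set.seed(123) # k-means starts from random centers: fix the seed for reproducibility
wss=numeric(6)
for(k in 1:6) wss[k]=kmeans(z,centers=k,nstart=25)$tot.withinss
plot(1:6,wss,type="b",xlab="number of clusters k",ylab="total WCSS")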