First have some common code… See the previous lecture about kmeans++
. Also, load the HS data and provide the necessary visualization functions.
set.seed(42)
library(RColorBrewer)
library(plotly)
Loading required package: ggplot2
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
library(vegan)
Loading required package: permute
Loading required package: lattice
This is vegan 2.5-6
library(clue)
library(dendextend)
Registered S3 method overwritten by 'dendextend':
method from
rev.hclust vegan
---------------------
Welcome to dendextend version 1.12.0
Type citation('dendextend') for how to cite the package.
Type browseVignettes(package = 'dendextend') for the package vignette.
The github page is: https://github.com/talgalili/dendextend/
Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
Or contact: <tal.galili@gmail.com>
To suppress this message use: suppressPackageStartupMessages(library(dendextend))
---------------------
Attaching package: ‘dendextend’
The following object is masked from ‘package:permute’:
shuffle
The following object is masked from ‘package:stats’:
cutree
library(MASS)
Attaching package: ‘MASS’
The following object is masked from ‘package:plotly’:
select
library(lle)
Loading required package: scatterplot3d
Loading required package: snowfall
Loading required package: snow
makepdf <- FALSE
n <- 500
rot <- matrix(c(cos(pi/4),-sin(pi/4),sin(pi/4),cos(pi/4)),2,2)
cols <- rep(brewer.pal(11,"RdYlBu"),each=ceiling(n/11))[1:n]
makecluster <- function(n=500) {
Xcluster <- matrix(rnorm(2*n),n,2)
Xcluster[,1] <- Xcluster[,1]/10
p <- sample.int(n,size=floor(n/2))
Xcluster[p,1] <- Xcluster[p,1]-0.3
Xcluster[-p,1] <- Xcluster[-p,1]+0.3
Ycluster <- matrix(-1,n,1)
Ycluster[p,1] <- 1
p <- order(Xcluster[,1])
Xcluster <- Xcluster[p,]
Xcluster <- Xcluster %*% t(rot)
Ycluster <- Ycluster[p,,drop=FALSE]
list(X=Xcluster,Y=Ycluster)
}
makeswiss <- function(n=500) {
X <- matrix(0,n,3)
s <- sort(runif(n))
r <- 1+s+0.05*runif(n)
X[,1] <- r*cos(2*pi*s)
X[,2] <- r*sin(2*pi*s)
X[,3] <- runif(n,min=-3,max=3)
list(X=X,Y=matrix(s,n,1))
}
spca <- function(X,Y=diag(dim(X)[1])) {
X <- scale(X,center=TRUE,scale=FALSE)
a <- svd(t(X) %*% Y %*% t(Y) %*% X)
p <- order(a$d,decreasing=TRUE)
d <- a$d[p]
U <- a$u[,p]
list(E=X %*% U,d=d,U=U)
}
eigenmap <- function(X,ndim=2,k=5,eps=.Machine$double.eps^0.25) {
W <- find_nn_k(X,k=k)
W <- 1*(W | t(W)) # symmetrise
D <- diag(apply(W,1,sum))
s <- svd(W-D)
p <- order(abs(s$d))
lambda <- s$d[p]
U <- s$u[,p,drop=FALSE]
i <- lambda>=eps
U[,i,drop=FALSE][,1:ndim,drop=FALSE]
}
doplots <- function(X,Y=diag(dim(X)[1]),col=cols,main="",prefix="",sp=FALSE) {
spca(X)
Xp <- spca(X)$E
Xi <- spca(X,Y)$E
y1 <- max(abs(c(as.matrix(X[,1]),Xp[,1],Xi[,1])))
y2 <- max(abs(c(as.matrix(X[,2]),Xp[,2],Xi[,2])))
y1 <- y2 <- max(y1,y2)
plot(c(-y1,y1),c(-y2,y2),type="n",bty="n",xlab="x",ylab="y",main=main)
points(X,col=col)
plot(c(-y1,y1),c(-y2,y2),type="n",bty="n",xlab="PC1",ylab="PC2",main=if(main=="") "PCA" else sprintf("%s (PCA)",main))
points(Xp,col=col)
for(i in sample.int(dim(Xp)[1])) rug(Xp[i,1],side=1,lwd=1,col=col[i])
if(sp) {
plot(c(-y1,y1),c(-y2,y2),type="n",bty="n",xlab="SPC1",ylab="SPC2",main=if(main=="") "supervised PCA" else sprintf("%s (supervised PCA)",main))
points(Xi,col=col)
for(i in sample.int(dim(Xi)[1])) rug(Xi[i,1],side=1,lwd=1,col=col[i])
}
}
partyplot <- function(xy,cl=class,col0=brewer.pal(length(levels(cl)),"Paired"),alpha=0.6,
acol=sapply(col0,function(x) adjustcolor(x,alpha=alpha)),
lab=levels(cl),main="Espoo 2017",textp=FALSE) {
plot(c(min(xy[,1])-if(textp) 0 else 0,max(xy[,1])),range(xy[,2]),bty="n",xlab="x",ylab="y",type="n",main=main)
p <- sample.int(dim(xy)[1])
if(textp) {
text(xy[p,],lab[cl[p]],cex=1,col=col0[cl[p]])
} else {
## legend("bottomleft",legend=lab,pch=19,col=col0,bty="n")
points(xy[p,],pch=19,col=acol[cl[p]])
aux <- aggregate(xy,by=list(cl),FUN=mean)
text(aux[,2:3],as.character(aux[,1]),font=2,cex=2,col=col0)
text(aux[,2:3],as.character(aux[,1]),pos=1,font=2)
}
}
partyplot3d <- function(xyz,cl=class,col0=brewer.pal(length(levels(cl)),"Paired"),alpha=0.6,
acol=sapply(col0,function(x) adjustcolor(x,alpha=alpha)),
lab=levels(cl),main="Espoo 2017") {
xyz <- as.data.frame(xyz)
colnames(xyz) <- c("x","y","z")
xyz <- cbind(xyz,w=as.integer(cl))
plot_ly(xyz,x=~x,y=~y,z=~z,color=~w,colors=acol,text=rownames(xyz)) %>% layout(scene=list(xaxis=list(title="x"),yaxis=list(title="y"),zaxis=list(title="z"))) %>% add_markers() %>% hide_colorbar() %>% hide_legend()
}
HS <- readRDS("HS.rds")
dHS <- dist(HS[,4:52])
mHS <- as.matrix(HS[,4:52])
## Do a trick to make the order of the factors to present
## similarity of answers within a party
a <- aggregate(svd(scale(mHS))$u[,1],by=list(class=HS[,"class"]),mean)
a <- a[order(a[,"x"]),]
HS[,"class"] <- factor(as.character(HS[,"class"]),levels=a[,"class"])
print(table(HS[,"class"]))
ps kok kd kesk pir rkp sdp vihr vas skp
59 102 39 33 6 43 84 108 36 5
# kmeansppcenters - Finds initial kmeans++ centroids
# Arguments:
# x numeric matrix, rows correspond to data items
# k integer, number of clusters
# Value:
# centers a matrix of cluster centroids
#
# Reference: Arthur & Vassilivitskii (2007) k-means++: the
# advantages of careful seeding. In Proc SODA '07, 1027-1035.
kmeansppcenters <- function(x,k) {
x <- as.matrix(x)
n <- dim(x)[1]
centers <- matrix(NA,k,dim(x)[2])
p <- rep(1/n,n)
d <- rep(NA,n)
for(i in 1:k) {
centers[i,] <- x[sample.int(n,size=1,prob=p),]
dd <- rowSums((x-(rep(1,n) %o% centers[i,]))^2)
d <- if(i>1) pmin(d,dd) else dd
if(max(d)>0) { p <- d/sum(d) }
}
centers
}
# kmeanspp - kmeans++ algorithm
# Arguments:
# x numeric matrix, rows correspond to data items
# k integer, number of clusters
# iter.max the maximum number of iterations allowed (default 1000)
# algorithm see kmeans documentation (default "Lloyd")
# Value:
# See kmeanspp documentation
#
# Reference: Arthur & Vassilivitskii (2007) k-means++: the
# advantages of careful seeding. In Proc SODA '07, 1027-1035.
kmeanspp <- function(x,k,iter.max=1000,algorithm="Lloyd",...) {
kmeans(x,centers=kmeansppcenters(x,k),iter.max=iter.max,
algorithm=algorithm,...)
}
# kmeansppn
# Run kmeanspp algorithm n times (default n=10) and return a solution
# with the smallest loss.
kmeansppn <- function(x,k,n=10,iter.max=1000,algorithm="Lloyd",...) {
best <- NULL
count <- 1
for(i in 1:n) {
res <- kmeanspp(x,k,iter.max=iter.max,algorithm=algorithm,...)
if(!is.null(best) && res$tot.withinss==best$tot.withinss) {
count <- count+1
}
if(is.null(best) || res$tot.withinss<best$tot.withinss) {
best <- res
count <- 1
}
}
## If we are really converged we might expect count to be >1.
cat(sprintf("count = %d\n",count))
best
}
a <- makecluster(n)
Xcluster <- a$X ; Ycluster <- a$Y
PCA rotates the data so that the first principal component points to the direction of the largest variance, the second principla component to the direction of the second largest variance etc.
if(makepdf) pdf("pca_spca.pdf")
doplots(Xcluster,Y=Ycluster)
if(makepdf) dev.off()
a <- makeswiss(n)
Xswiss <- a$X[,1:2] ; Yswiss <- a$Y
if(makepdf) pdf("pca_spca_swiss.pdf")
doplots(Xswiss,Y=Yswiss)
if(makepdf) dev.off()
compstress <- function(x,d,dhat=dist(x)) { sqrt(sum((dhat-d)^2)/sum(dhat^2)) }
shepard <- function(d,x,main="",pch=".") {
aux <- Shepard(d,x)
plot(range(aux$x),range(c(aux$y,aux$yf)),xlab="p",ylab="d(X)",bty="n",type="n",
main=main)
p <- sample.int(length(aux$x),size=min(2000,length(aux$x)))
points(aux$x[p],aux$y[p],pch=pch)
lines(aux$x,aux$yf,type="S")
}
wav2rgb <- function(wavelength, gamma=0.8){
## Based on code by Dan Bruton
## http://www.physics.sfasu.edu/astro/color/spectra.html
if (wavelength >= 380 & wavelength <= 440) {
attenuation = 0.3 + 0.7 * (wavelength - 380) / (440 - 380)
R = ((-(wavelength - 440) / (440 - 380)) * attenuation) ^ gamma
G = 0.0
B = (1.0 * attenuation) ^ gamma
}
else if (wavelength >= 440 & wavelength <= 490) {
R = 0.0
G = ((wavelength - 440) / (490 - 440)) ^ gamma
B = 1.0
}
else if (wavelength >= 490 & wavelength <= 510) {
R = 0.0
G = 1.0
B = (-(wavelength - 510) / (510 - 490)) ^ gamma
}
else if (wavelength >= 510 & wavelength <= 580) {
R = ((wavelength - 510) / (580 - 510)) ^ gamma
G = 1.0
B = 0.0
}
else if (wavelength >= 580 & wavelength <= 645) {
R = 1.0
G = (-(wavelength - 645) / (645 - 580)) ^ gamma
B = 0.0
}
else if (wavelength >= 645 & wavelength <= 750) {
attenuation = 0.3 + 0.7 * (750 - wavelength) / (750 - 645)
R = (1.0 * attenuation) ^ gamma
G = 0.0
B = 0.0
}
else {
R = 0.0
G = 0.0
B = 0.0
}
R = R * 255
G = G * 255
B = B * 255
return (rgb(floor(R), floor(G), floor(B), max=255))
}
colors <- function() {
wl <- c(434,445,465,472,490,504,537,555,584,600,610,628,651,674)
sim <- c(0.14,
0.58,0.50,
0.58,0.56,0.19,
0.82,0.78,0.53,0.46,
0.94,0.91,0.83,0.75,0.39,
0.93,0.93,0.90,0.90,0.69,0.38,
0.96,0.93,0.92,0.91,0.74,0.55,0.27,
0.98,0.98,0.98,0.98,0.93,0.86,0.78,0.67,
0.93,0.96,0.99,0.99,0.98,0.92,0.86,0.81,0.42,
0.91,0.93,0.98,1.00,0.98,0.98,0.95,0.96,0.63,0.26,
0.88,0.89,0.99,0.99,0.99,0.98,0.98,0.97,0.73,0.50,0.24,
0.87,0.87,0.95,0.98,0.98,0.98,0.98,0.98,0.80,0.59,0.38,0.15,
0.84,0.86,0.97,0.96,1.00,0.99,1.00,0.98,0.77,0.72,0.45,0.32,0.24)
res <- matrix(0,length(wl),length(wl),dimnames=list(wl,wl))
res[upper.tri(res)] <- sim
res <- t(res)
res[upper.tri(res)] <- sim
list(d=as.dist(res),col=sapply(wl,wav2rgb),labels=as.character(wl))
}
aux <- colors()
plot1d <- function(x,col=aux$col,labels=aux$labels,main="k = 1") {
plot(c(min(x),max(x)+0.05),c(1,length(x)),type="n",bty="n",xlab="x",ylab="index",main=main)
abline(v=x,col=aux$col)
lines(x,1:length(x),col="gray")
points(x,1:length(x),pch=19,col=aux$col)
text(x,1:length(x),labels=aux$labels,pos=4,cex=0.5)
}
if(makepdf) pdf("nmds1d.pdf")
plot1d(isoMDS(aux$d,k=1)$points,main="k = 1 (nonmetric MDS)")
initial value 31.847239
final value 28.919217
converged
if(makepdf) dev.off()
plot2d <- function(xy,col=aux$col,labels=aux$labels,main="k = 2") {
plot(c(min(xy[,1]),max(xy[,1])+0.05),range(xy[,2]),type="n",bty="n",xlab="x",ylab="y",main=main)
lines(xy,col="gray")
points(xy,pch=19,col=aux$col)
text(xy,labels=aux$labels,pos=4,cex=0.5)
}
if(makepdf) pdf("nmds2d.pdf")
plot2d(isoMDS(aux$d,k=2)$points,main="k = 2 (nonmetric MDS)")
initial value 5.841614
iter 5 value 3.335963
iter 10 value 3.089500
iter 15 value 2.936210
final value 2.921821
converged
if(makepdf) dev.off()
plot3d <- function(xyz,xy,col=aux$col,labels=aux$labels) {
xyz <- as.data.frame(xyz)
colnames(xyz) <- c("x","y","z")
rownames(xyz) <- labels
xyz <- cbind(xyz,w=1:dim(xyz)[1])
plot_ly(xyz,x=~x,y=~y,z=~z,color=~w,colors=aux$col) %>% layout(scene=list(xaxis=list(title="x"),yaxis=list(title="y"),zaxis=list(title="z"))) %>% add_trace(x=~x,y=~y,z=~z,mode="line") %>% add_markers() %>% hide_colorbar() %>% hide_legend()
}
plot3d(isoMDS(aux$d,k=3)$points)
initial value 4.363630
iter 5 value 2.598462
iter 10 value 2.205186
iter 15 value 2.032003
iter 20 value 1.886957
iter 25 value 1.857243
iter 30 value 1.818042
iter 35 value 1.786504
iter 40 value 1.767466
final value 1.763071
converged
No trace type specified:
Based on info supplied, a 'scatter3d' trace seems appropriate.
Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
s2 <- sapply(1:12,function(i) isoMDS(aux$d,k=i)$stress)
initial value 31.847239
final value 28.919217
converged
initial value 5.841614
iter 5 value 3.335963
iter 10 value 3.089500
iter 15 value 2.936210
final value 2.921821
converged
initial value 4.363630
iter 5 value 2.598462
iter 10 value 2.205186
iter 15 value 2.032003
iter 20 value 1.886957
iter 25 value 1.857243
iter 30 value 1.818042
iter 35 value 1.786504
iter 40 value 1.767466
final value 1.763071
converged
initial value 2.975333
iter 5 value 2.069891
iter 10 value 1.757177
iter 15 value 1.154414
iter 20 value 0.998939
iter 25 value 0.930378
iter 30 value 0.872730
iter 35 value 0.828782
iter 40 value 0.788755
iter 45 value 0.761303
iter 50 value 0.752342
final value 0.752342
stopped after 50 iterations
initial value 1.422477
iter 5 value 0.539959
iter 10 value 0.447943
iter 15 value 0.385682
iter 20 value 0.355484
iter 25 value 0.332173
iter 30 value 0.315434
iter 35 value 0.290758
iter 40 value 0.277736
iter 45 value 0.255893
iter 50 value 0.248739
final value 0.248739
stopped after 50 iterations
initial value 1.185698
iter 5 value 0.457878
iter 10 value 0.349676
iter 15 value 0.308997
iter 20 value 0.274385
iter 25 value 0.228370
iter 30 value 0.192654
iter 35 value 0.149698
iter 40 value 0.121093
iter 45 value 0.108874
iter 50 value 0.102501
final value 0.102501
stopped after 50 iterations
initial value 0.824420
iter 5 value 0.333363
iter 10 value 0.270529
iter 15 value 0.226512
iter 20 value 0.202023
iter 25 value 0.175213
iter 30 value 0.152422
iter 35 value 0.132591
iter 40 value 0.115870
iter 45 value 0.090601
iter 50 value 0.076804
final value 0.076804
stopped after 50 iterations
initial value 0.683179
iter 5 value 0.184667
iter 10 value 0.112932
iter 15 value 0.071743
iter 20 value 0.050152
iter 25 value 0.038713
iter 30 value 0.025375
iter 35 value 0.016901
iter 40 value 0.012782
final value 0.005211
converged
initial value 0.572289
iter 5 value 0.171035
iter 10 value 0.116950
iter 15 value 0.070949
iter 20 value 0.049898
iter 25 value 0.033451
iter 30 value 0.025718
iter 35 value 0.020287
iter 40 value 0.013010
final value 0.006061
converged
initial value 0.554579
iter 5 value 0.153384
iter 10 value 0.105945
iter 15 value 0.061520
iter 20 value 0.041735
iter 25 value 0.031939
iter 30 value 0.021863
iter 35 value 0.014101
final value 0.007455
converged
initial value 0.551530
iter 5 value 0.168459
iter 10 value 0.122992
iter 15 value 0.080473
iter 20 value 0.053504
iter 25 value 0.039326
iter 30 value 0.026801
iter 35 value 0.020222
iter 40 value 0.015187
iter 45 value 0.010223
iter 45 value 0.009978
iter 45 value 0.007526
final value 0.007526
converged
initial value 0.551530
iter 5 value 0.168459
iter 10 value 0.122992
iter 15 value 0.080473
iter 20 value 0.053504
iter 25 value 0.039326
iter 30 value 0.026801
iter 35 value 0.020222
iter 40 value 0.015187
iter 45 value 0.010223
iter 45 value 0.009978
iter 45 value 0.007526
final value 0.007526
converged
## s3 <- sapply(1:12,function(i) sammon(aux$d,k=i)$stress)
if(makepdf) pdf("screecol.pdf")
plot(c(1,12),c(0,max(c(s2))),type="n",xlab="k",ylab="stress",bty="n")
lines(1:12,s2)
## lines(1:12,s3,col=brewer.pal(2,"Set1")[2],lty="dotted")
points(1:12,s2,pch=1)
##points(1:12,s3,col=brewer.pal(2,"Set1")[2],pch=18)
legend("topright",legend=c("nonmetric MDS"),bty="n",
lty=c("solid"),pch=c(1))
if(makepdf) dev.off()
if(makepdf) pdf("shepardcolnmds.pdf")
shepard(aux$d,isoMDS(aux$d,k=2)$points,main="nonmetric MDS (k = 2)",pch=1)
initial value 5.841614
iter 5 value 3.335963
iter 10 value 3.089500
iter 15 value 2.936210
final value 2.921821
converged
if(makepdf) dev.off()
X <- makeswiss()$X
plotf <- function(x,main=main) plot(x,bty="n",xlab="",ylab="",xaxt="n",yaxt="n",col=cols,main=main)
if(makepdf) pdf("swiss.pdf")
pairs(X,col=cols)
plotf(spca(X)$E,main="PCA")
plotf(isoMDS(dist(X))$points,main="nonmetric MDS")
initial value 15.819080
final value 15.220408
converged
plotf(sammon(dist(X))$points,main="Sammon")
Initial stress : 0.05141
stress after 10 iters: 0.03963, magic = 0.500
stress after 20 iters: 0.03950, magic = 0.014
stress after 30 iters: 0.03948, magic = 0.150
plotf(isomap(dist(X),k=10)$points,main="ISOMAP")
plotf(lle(X,2,10)$Y,main="LLE")
finding neighbours
calculating weights
computing coordinates
plotf(eigenmap(X,2,10),main="Eigenmap")
if(makepdf) dev.off()
xyz <- data.frame(x=X[,1],y=X[,2],z=X[,3],w=1:dim(X)[1])
plot_ly(xyz,x=~x,y=~y,z=~z,color=~w,colors=cols) %>% layout(scene=list(xaxis=list(title="x"),yaxis=list(title="y"),zaxis=list(title="z"))) %>% add_markers() %>% hide_colorbar() %>% hide_legend()
HS <- readRDS("HS.rds")
dHS <- dist(HS[,4:52])
if(makepdf) pdf("HS.pdf")
partyplot(lle(HS[,4:52],2,10)$Y,cl=HS[,3],main="Espoo 2017 (LLE)")
finding neighbours
calculating weights
computing coordinates
partyplot(isomap(dHS,ndim=2,k=10)$points,cl=HS[,3],main="Espoo 2017 (ISOMAP)")
partyplot(eigenmap(HS[,4:52],2,10),cl=HS[,3],main="Espoo 2017 (Eigenmap)")
if(makepdf) dev.off()
partyplot3d(data.frame(lle(HS[,4:52],3,10)$Y,row.names=rownames(HS)),cl=HS[,3])
finding neighbours
calculating weights
computing coordinates
partyplot3d(data.frame(eigenmap(HS[,4:52],3,10),row.names=rownames(HS)),cl=HS[,3])
partyplot3d(data.frame(isomap(dHS,ndim=3,k=10)$points,row.names=rownames(HS)),cl=HS[,3])