Data is from https://github.com/HS-Datadesk/avoindata/tree/master/vaalikoneet/kuntavaalit2017
Also see:
# Fix the RNG seed so the stochastic steps below (kmeans++ initialisation,
# random point-plotting order) are reproducible.
set.seed(42)
library(RColorBrewer)
library(plotly)
Loading required package: ggplot2
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
library(vegan)
Loading required package: permute
Loading required package: lattice
This is vegan 2.5-6
library(clue)
library(dendextend)
Registered S3 method overwritten by 'dendextend':
method from
rev.hclust vegan
---------------------
Welcome to dendextend version 1.12.0
Type citation('dendextend') for how to cite the package.
Type browseVignettes(package = 'dendextend') for the package vignette.
The github page is: https://github.com/talgalili/dendextend/
Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
Or contact: <tal.galili@gmail.com>
To suppress this message use: suppressPackageStartupMessages(library(dendextend))
---------------------
Attaching package: ‘dendextend’
The following object is masked from ‘package:permute’:
shuffle
The following object is masked from ‘package:stats’:
cutree
library(MASS)
Attaching package: ‘MASS’
The following object is masked from ‘package:plotly’:
select
library(png)
library(grid)
# partyplot - 2-d scatter plot of candidates coloured by party.
# Arguments:
#   xy    two-column matrix of coordinates, one row per candidate
#   cl    factor of party labels, one per row of xy
#   col0  one colour per party level
#   alpha opacity used for the point colours
#   acol  translucent versions of col0
#   lab   text labels for the party levels
#   main  plot title
#   textp if TRUE draw party abbreviations instead of points
partyplot <- function(xy,cl=class,col0=brewer.pal(length(levels(cl)),"Paired"),alpha=0.6,
                      acol=sapply(col0,function(x) adjustcolor(x,alpha=alpha)),
                      lab=levels(cl),main="Espoo 2017",textp=FALSE) {
  ## Empty canvas spanning the data range.
  xlim <- c(min(xy[,1])-if(textp) 0 else 0,max(xy[,1]))
  plot(xlim,range(xy[,2]),bty="n",xlab="x",ylab="y",type="n",main=main)
  ## Plot in random order so no single party systematically overplots the rest.
  ord <- sample.int(dim(xy)[1])
  if(textp) {
    text(xy[ord,],lab[cl[ord]],cex=1,col=col0[cl[ord]])
  } else {
    ## legend("bottomleft",legend=lab,pch=19,col=col0,bty="n")
    points(xy[ord,],pch=19,col=acol[cl[ord]])
    ## Write each party's label at its centroid, with a plain copy just below.
    aux <- aggregate(xy,by=list(cl),FUN=mean)
    text(aux[,2:3],as.character(aux[,1]),font=2,cex=2,col=col0)
    text(aux[,2:3],as.character(aux[,1]),pos=1,font=2)
  }
}
# partyplot3d - Interactive 3-d scatter plot of candidates, coloured by party.
# Arguments mirror partyplot(); xyz has three coordinate columns.
# Value: a plotly figure object.
partyplot3d <- function(xyz,cl=class,col0=brewer.pal(length(levels(cl)),"Paired"),alpha=0.6,
                        acol=sapply(col0,function(x) adjustcolor(x,alpha=alpha)),
                        lab=levels(cl),main="Espoo 2017") {
  pts <- as.data.frame(xyz)
  colnames(pts) <- c("x","y","z")
  # Encode the party as an integer so plotly maps it onto the palette.
  pts <- cbind(pts,w=as.integer(cl))
  fig <- plot_ly(pts,x=~x,y=~y,z=~z,color=~w,colors=acol,text=rownames(pts))
  fig <- layout(fig,scene=list(xaxis=list(title="x"),yaxis=list(title="y"),zaxis=list(title="z")))
  fig <- add_markers(fig)
  fig <- hide_colorbar(fig)
  hide_legend(fig)
}
# Load the pre-processed candidate-survey data: one row per candidate,
# columns 4:52 hold the 49 numeric answers, column "class" the party.
HS <- readRDS("HS.rds")
dHS <- dist(HS[,4:52])      # Euclidean distances between candidates
mHS <- as.matrix(HS[,4:52]) # answer matrix for k-means etc.
## Do a trick to make the order of the factors to present
## similarity of answers within a party
# Reorder the party levels by each party's mean score along the first
# left-singular direction of the scaled data, so parties with similar
# answer profiles get adjacent factor levels (and hence nearby colours).
a <- aggregate(svd(scale(mHS))$u[,1],by=list(class=HS[,"class"]),mean)
a <- a[order(a[,"x"]),]
HS[,"class"] <- factor(as.character(HS[,"class"]),levels=a[,"class"])
print(table(HS[,"class"]))
ps kok kd kesk pir rkp sdp vihr vas skp
59 102 39 33 6 43 84 108 36 5
Let’s take a look at the data. Here we use non-metric multidimensional scaling (MDS) which “flattens” the 49-dimensional data into 2 or 3 dimensions, trying to preserve the inter-candidate distances \(p_{ij}\) as faithfully as possible.
partyplot(isoMDS(dHS,k=2)$points,cl=HS[,3],main="Espoo 2017 (MDS)")
initial value 28.005607
iter 5 value 22.431213
iter 10 value 20.964772
final value 20.249786
converged
partyplot3d(data.frame(isoMDS(dHS,k=3)$points,row.names=rownames(HS)),cl=HS[,3])
initial value 22.348438
iter 5 value 16.222087
iter 10 value 15.085854
iter 15 value 14.744647
iter 15 value 14.738425
iter 15 value 14.735757
final value 14.735757
converged
# kmeansppcenters - Finds initial kmeans++ centroids.
#
# The first centroid is drawn uniformly at random from the rows of x;
# each subsequent centroid is drawn with probability proportional to the
# squared Euclidean distance to the nearest centroid chosen so far.
#
# Arguments:
#  x numeric matrix, rows correspond to data items
#  k integer, number of clusters
# Value:
#  centers a k x ncol(x) matrix of cluster centroids
#
# Reference: Arthur & Vassilivitskii (2007) k-means++: the
# advantages of careful seeding. In Proc SODA '07, 1027-1035.
kmeansppcenters <- function(x, k) {
  x <- as.matrix(x)
  n <- nrow(x)
  centers <- matrix(NA_real_, k, ncol(x))
  p <- rep(1 / n, n)     # sampling weights; uniform for the first pick
  d <- rep(NA_real_, n)  # squared distance to the closest centroid so far
  for (i in seq_len(k)) {  # seq_len() is safe when k == 0; 1:k is not
    centers[i, ] <- x[sample.int(n, size = 1, prob = p), ]
    dd <- rowSums((x - (rep(1, n) %o% centers[i, ]))^2)
    d <- if (i > 1) pmin(d, dd) else dd
    # Keep the previous weights if every point coincides with a centroid,
    # avoiding an all-zero probability vector in sample.int().
    if (max(d) > 0) {
      p <- d / sum(d)
    }
  }
  centers
}
# kmeanspp - k-means clustering with kmeans++ initialisation.
# Arguments:
#  x         numeric matrix, rows correspond to data items
#  k         integer, number of clusters
#  iter.max  the maximum number of iterations allowed (default 1000)
#  algorithm see kmeans documentation (default "Lloyd")
#  ...       forwarded to kmeans()
# Value:
#  See kmeans documentation
#
# Reference: Arthur & Vassilivitskii (2007) k-means++: the
# advantages of careful seeding. In Proc SODA '07, 1027-1035.
kmeanspp <- function(x, k, iter.max = 1000, algorithm = "Lloyd", ...) {
  init <- kmeansppcenters(x, k)
  kmeans(x, centers = init, iter.max = iter.max, algorithm = algorithm, ...)
}
# kmeansppn - Run kmeanspp n times (default n = 10) and return the run
# with the smallest total within-cluster sum of squares. With ties, the
# earliest run wins (which.min returns the first minimum).
kmeansppn <- function(x, k, n = 10, iter.max = 1000, algorithm = "Lloyd", ...) {
  runs <- lapply(seq_len(n), function(i)
    kmeanspp(x, k, iter.max = iter.max, algorithm = algorithm, ...))
  losses <- vapply(runs, function(r) r$tot.withinss, numeric(1))
  runs[[which.min(losses)]]
}
K-means clustering with \(k=10\) and kmeans++ initialization. Let’s look at a contingency table where the rows represent the clusters and the columns the known party affiliations.
# Cluster the candidates into k=10 groups (one per party) and
# cross-tabulate the cluster labels against the true party affiliations.
a <- kmeansppn(mHS,k=10)
## t1 <- table(levels(HS[,"class"])[a$cluster],HS[,"class"])
t1 <- table(a$cluster,HS[,"class"])
print(t1)
ps kok kd kesk pir rkp sdp vihr vas skp
1 1 0 3 2 0 8 21 22 4 2
2 8 10 1 5 1 3 7 5 2 0
3 39 1 3 0 0 0 1 0 2 0
4 8 27 3 2 0 1 0 0 0 0
5 0 1 0 1 1 3 8 57 1 0
6 0 0 0 0 0 0 17 5 27 3
7 1 0 0 0 1 0 11 0 0 0
8 1 10 28 11 0 3 3 2 0 0
9 0 7 1 6 0 22 16 13 0 0
10 1 46 0 6 3 3 0 4 0 0
print(sum(diag(t1))/sum(t1))
[1] 0.05825243
We notice that the found clusters do not “match” the known party affiliations. Let’s find a permutation of cluster indices such that the cluster indices match the parties as well as possible. For this, we can use the Hungarian algorithm (e.g., solve_LSAP
from clue
library in R). The Hungarian algorithm finds a permutation of rows (cluster indices) such that the sum of entries in the diagonal is maximised. We find a better match with the clusters and parties already, but not perfect (is it surprising?).
# Reorder the rows with the Hungarian algorithm (clue::solve_LSAP) so the
# diagonal — cluster-to-party matches — is maximised.
t2 <- t1[order(solve_LSAP(t1,maximum=TRUE)),]
print(t2)
ps kok kd kesk pir rkp sdp vihr vas skp
3 39 1 3 0 0 0 1 0 2 0
10 1 46 0 6 3 3 0 4 0 0
8 1 10 28 11 0 3 3 2 0 0
2 8 10 1 5 1 3 7 5 2 0
7 1 0 0 0 1 0 11 0 0 0
9 0 7 1 6 0 22 16 13 0 0
1 1 0 3 2 0 8 21 22 4 2
5 0 1 0 1 1 3 8 57 1 0
6 0 0 0 0 0 0 17 5 27 3
4 8 27 3 2 0 1 0 0 0 0
print(sum(diag(t2))/sum(t2))
[1] 0.4776699
Let’s just see how the number of clusters affects the clustering error.
# Elbow plot: total within-cluster sum of squares as a function of k.
# vapply() instead of sapply() pins the result type to a numeric vector.
ks <- 1:20
r <- vapply(ks, function(k) kmeansppn(mHS, k)$tot.withinss, numeric(1))
plot(ks, r, type = "l", main = "", xlab = "number of clusters (k)",
     ylab = "total within-cluster sum of squares")
points(ks, r)
Let’s then do some image compression.
# Read the image, keeping only the RGB channels (dropping any alpha).
me <- readPNG("kaip.png")[,,1:3] # only rgb channels
# Flatten to a (pixels x 3) matrix: one RGB row per pixel.
mem <- matrix(me,dim(me)[1]*dim(me)[2],dim(me)[3])
colnames(mem) <- c("red","green","blue")
# Quantise: cluster pixel colours into 8 centroids and replace every
# pixel with the colour of its cluster centroid.
a <- kmeansppn(mem,8)
mem0 <- a$centers[a$cluster,]
me0 <- array(mem0,dim=dim(me))
# Render the ORIGINAL image to a PNG file.
mec <- rgb(me[,,1],me[,,2],me[,,3])
dim(mec) <- dim(me)[1:2]
png("kaip1.png")
grid.raster(mec,interpolate=FALSE)
dev.off()
null device
1
# Render the 8-colour COMPRESSED image to a PNG file.
mec0 <- rgb(me0[,,1],me0[,,2],me0[,,3])
dim(mec0) <- dim(me0)[1:2]
png("kaip0.png")
grid.raster(mec0,interpolate=FALSE)
dev.off()
null device
1
Original image:
Compressed image using k-means and 8 colour channels:
Here I have taken 200x200 = 40000 pixels as 3-dimensional rgb-vectors and clustered them into 8 clusters. Then I have replaced the actual pixel values with the values of respective cluster centroids. A pixel in the original image takes 3x8 = 24 bits and a pixel in the compressed image takes 3 bits, i.e., we obtain a compression ratio of 24/3 = 8 by this simple operation!
Let’s then try hierarchical clustering with different cluster indices.
# Hierarchical clusterings of the candidate distances with four linkage rules.
hc.ward <- hclust(dHS,method="ward.D2") # similar to k-means
hc.complete <- hclust(dHS,method="complete")
hc.single <- hclust(dHS,method="single")
hc.average <- hclust(dHS,method="average")
# plotdend - Plot a dendrogram with leaf labels coloured by party.
# Arguments:
#   hc   an hclust object over the rows of HS
#   main plot title
#   col0 palette indexed by the party factor HS[,"class"]
plotdend <- function(hc,main="",col0=brewer.pal(10,"Paired")) {
  d <- as.dendrogram(hc)
  labels_colors(d) <- col0[HS[,"class"]]
  # Shrink label text for readability, but restore the caller's graphics
  # parameters on exit instead of leaving cex=0.25 set globally.
  op <- par(cex=0.25)
  on.exit(par(op), add=TRUE)
  plot(d,main=main)
}
Ward is similar to kmeans, working with cluster centroids.
plotdend(hc.ward,main="ward")
Complete linkage leads to clusters with roughly similar diameters.
plotdend(hc.complete,main="complete")
# Single linkage merges by the closest pair of points and tends to chain.
plotdend(hc.single,main="single")
# Average linkage merges by the mean inter-cluster distance.
plotdend(hc.average,main="average")