First have some common code… See the previous lecture about kmeans++. Also, load the HS data and provide the necessary visualization functions.

set.seed(42)
library(RColorBrewer)
library(plotly)
Loading required package: ggplot2
Registered S3 method overwritten by 'dplyr':
  method           from
  print.rowwise_df     
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio

Attaching package: ‘plotly’

The following object is masked from ‘package:ggplot2’:

    last_plot

The following object is masked from ‘package:stats’:

    filter

The following object is masked from ‘package:graphics’:

    layout
library(vegan)
Loading required package: permute
Loading required package: lattice
This is vegan 2.5-6
library(clue)
library(dendextend)
Registered S3 method overwritten by 'dendextend':
  method     from 
  rev.hclust vegan

---------------------
Welcome to dendextend version 1.12.0
Type citation('dendextend') for how to cite the package.

Type browseVignettes(package = 'dendextend') for the package vignette.
The github page is: https://github.com/talgalili/dendextend/

Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
Or contact: <tal.galili@gmail.com>

    To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
---------------------


Attaching package: ‘dendextend’

The following object is masked from ‘package:permute’:

    shuffle

The following object is masked from ‘package:stats’:

    cutree
library(MASS)

Attaching package: ‘MASS’

The following object is masked from ‘package:plotly’:

    select
library(lle)
Loading required package: scatterplot3d
Loading required package: snowfall
Loading required package: snow
makepdf <- FALSE
n <- 500
rot <- matrix(c(cos(pi/4),-sin(pi/4),sin(pi/4),cos(pi/4)),2,2)
cols <- rep(brewer.pal(11,"RdYlBu"),each=ceiling(n/11))[1:n]

makecluster <- function(n=500) {
  Xcluster <- matrix(rnorm(2*n),n,2)
  Xcluster[,1] <- Xcluster[,1]/10
  p <- sample.int(n,size=floor(n/2))
  Xcluster[p,1] <- Xcluster[p,1]-0.3
  Xcluster[-p,1] <- Xcluster[-p,1]+0.3
  Ycluster <- matrix(-1,n,1)
  Ycluster[p,1] <- 1
  p <- order(Xcluster[,1])
  Xcluster <- Xcluster[p,]
  Xcluster <- Xcluster %*% t(rot)
  Ycluster <- Ycluster[p,,drop=FALSE]
  list(X=Xcluster,Y=Ycluster)
}

makeswiss <- function(n=500) {
  X <- matrix(0,n,3)
  s <- sort(runif(n))
  r <- 1+s+0.05*runif(n)
  X[,1] <- r*cos(2*pi*s)
  X[,2] <- r*sin(2*pi*s)
  X[,3] <- runif(n,min=-3,max=3)
  list(X=X,Y=matrix(s,n,1))
}

spca <- function(X,Y=diag(dim(X)[1])) {
  X <- scale(X,center=TRUE,scale=FALSE)
  a <- svd(t(X) %*% Y %*% t(Y) %*% X)
  p <- order(a$d,decreasing=TRUE)
  d <- a$d[p]
  U <- a$u[,p]
  list(E=X %*% U,d=d,U=U)
}

eigenmap <- function(X,ndim=2,k=5,eps=.Machine$double.eps^0.25) {
  W <- find_nn_k(X,k=k)
  W <- 1*(W | t(W)) # symmetrise
  D <- diag(apply(W,1,sum))
  s <- svd(W-D)
  p <- order(abs(s$d))
  lambda <- s$d[p]
  U <- s$u[,p,drop=FALSE]
  i <- lambda>=eps
  U[,i,drop=FALSE][,1:ndim,drop=FALSE]
}

doplots <- function(X,Y=diag(dim(X)[1]),col=cols,main="",prefix="",sp=FALSE) {
  spca(X)
  Xp <- spca(X)$E
  Xi <- spca(X,Y)$E
  y1 <- max(abs(c(as.matrix(X[,1]),Xp[,1],Xi[,1])))
  y2 <- max(abs(c(as.matrix(X[,2]),Xp[,2],Xi[,2])))
  y1 <- y2 <- max(y1,y2)
  plot(c(-y1,y1),c(-y2,y2),type="n",bty="n",xlab="x",ylab="y",main=main)
  points(X,col=col)
  plot(c(-y1,y1),c(-y2,y2),type="n",bty="n",xlab="PC1",ylab="PC2",main=if(main=="") "PCA" else sprintf("%s (PCA)",main))
  points(Xp,col=col)
  for(i in sample.int(dim(Xp)[1])) rug(Xp[i,1],side=1,lwd=1,col=col[i])
  if(sp) {
  plot(c(-y1,y1),c(-y2,y2),type="n",bty="n",xlab="SPC1",ylab="SPC2",main=if(main=="") "supervised PCA" else sprintf("%s (supervised PCA)",main))
  points(Xi,col=col)
  for(i in sample.int(dim(Xi)[1])) rug(Xi[i,1],side=1,lwd=1,col=col[i])
  }
}


partyplot <- function(xy,cl=class,col0=brewer.pal(length(levels(cl)),"Paired"),alpha=0.6,
                      acol=sapply(col0,function(x) adjustcolor(x,alpha=alpha)),
                      lab=levels(cl),main="Espoo 2017",textp=FALSE) {
  plot(c(min(xy[,1])-if(textp) 0 else 0,max(xy[,1])),range(xy[,2]),bty="n",xlab="x",ylab="y",type="n",main=main)
  
  p <- sample.int(dim(xy)[1])
  if(textp) {
    text(xy[p,],lab[cl[p]],cex=1,col=col0[cl[p]])
  } else {
    ## legend("bottomleft",legend=lab,pch=19,col=col0,bty="n")
    points(xy[p,],pch=19,col=acol[cl[p]])
    aux <- aggregate(xy,by=list(cl),FUN=mean)
    text(aux[,2:3],as.character(aux[,1]),font=2,cex=2,col=col0)
    text(aux[,2:3],as.character(aux[,1]),pos=1,font=2)
  }
}

partyplot3d <- function(xyz,cl=class,col0=brewer.pal(length(levels(cl)),"Paired"),alpha=0.6,
                      acol=sapply(col0,function(x) adjustcolor(x,alpha=alpha)),
                      lab=levels(cl),main="Espoo 2017") {
  xyz <- as.data.frame(xyz)
  colnames(xyz) <- c("x","y","z")
  xyz <- cbind(xyz,w=as.integer(cl))
  plot_ly(xyz,x=~x,y=~y,z=~z,color=~w,colors=acol,text=rownames(xyz)) %>% layout(scene=list(xaxis=list(title="x"),yaxis=list(title="y"),zaxis=list(title="z")))  %>% add_markers()  %>% hide_colorbar() %>% hide_legend()
}

HS <- readRDS("HS.rds")
dHS <- dist(HS[,4:52])
mHS <- as.matrix(HS[,4:52])

## Do a trick to make the order of the factors to present
## similarity of answers within a party
a <- aggregate(svd(scale(mHS))$u[,1],by=list(class=HS[,"class"]),mean)
a <- a[order(a[,"x"]),]
HS[,"class"] <- factor(as.character(HS[,"class"]),levels=a[,"class"])
print(table(HS[,"class"]))

  ps  kok   kd kesk  pir  rkp  sdp vihr  vas  skp 
  59  102   39   33    6   43   84  108   36    5 
# kmeansppcenters - Finds initial kmeans++ centroids
# Arguments:
# x        numeric matrix, rows correspond to data items
# k        integer, number of clusters
# Value:
# centers  a matrix of cluster centroids
#
# Reference: Arthur & Vassilivitskii (2007) k-means++: the
# advantages of careful seeding. In Proc SODA '07, 1027-1035. 
kmeansppcenters <- function(x,k) {
  x <- as.matrix(x)
  n <- dim(x)[1]
  centers <- matrix(NA,k,dim(x)[2])
  p <- rep(1/n,n)
  d <- rep(NA,n)
  for(i in 1:k) {
    centers[i,] <- x[sample.int(n,size=1,prob=p),]
    dd <- rowSums((x-(rep(1,n) %o% centers[i,]))^2)
    d <- if(i>1) pmin(d,dd) else dd
    if(max(d)>0) { p <- d/sum(d) }
  }
  centers
}

# kmeanspp - kmeans++ algorithm
# Arguments:
# x numeric matrix, rows correspond to data items
# k integer, number of clusters
# iter.max   the maximum number of iterations allowed (default 1000)
# algorithm  see kmeans documentation (default "Lloyd")
# Value:
# See kmeanspp documentation
#
# Reference: Arthur & Vassilivitskii (2007) k-means++: the
# advantages of careful seeding. In Proc SODA '07, 1027-1035. 
kmeanspp <- function(x,k,iter.max=1000,algorithm="Lloyd",...) {
  kmeans(x,centers=kmeansppcenters(x,k),iter.max=iter.max,
         algorithm=algorithm,...)
}

# kmeansppn 
# Run kmeanspp algorithm n times (default n=10) and return a solution
# with the smallest loss.
kmeansppn <- function(x,k,n=10,iter.max=1000,algorithm="Lloyd",...) {
  best <- NULL
  count <- 1
  for(i in 1:n) {
    res <- kmeanspp(x,k,iter.max=iter.max,algorithm=algorithm,...)
    if(!is.null(best) && res$tot.withinss==best$tot.withinss) {
      count <- count+1
    }
    if(is.null(best) || res$tot.withinss<best$tot.withinss) {
      best <- res
      count <- 1
    }
  }
  ## If we are really converged we might expect count to be >1.
  cat(sprintf("count = %d\n",count))
  best
}

a <- makecluster(n)
Xcluster <- a$X ; Ycluster <- a$Y

PCA rotates the data so that the first principal component points to the direction of the largest variance, the second principla component to the direction of the second largest variance etc.

if(makepdf) pdf("pca_spca.pdf")
doplots(Xcluster,Y=Ycluster)

if(makepdf) dev.off()
a <- makeswiss(n)
Xswiss <- a$X[,1:2] ; Yswiss <- a$Y
if(makepdf) pdf("pca_spca_swiss.pdf")
doplots(Xswiss,Y=Yswiss)

if(makepdf) dev.off()
compstress <- function(x,d,dhat=dist(x)) { sqrt(sum((dhat-d)^2)/sum(dhat^2)) }
shepard <- function(d,x,main="",pch=".") {
  aux <- Shepard(d,x)
  plot(range(aux$x),range(c(aux$y,aux$yf)),xlab="p",ylab="d(X)",bty="n",type="n",
       main=main)
  p <- sample.int(length(aux$x),size=min(2000,length(aux$x)))
  points(aux$x[p],aux$y[p],pch=pch)
  lines(aux$x,aux$yf,type="S")
}
wav2rgb <- function(wavelength, gamma=0.8){
##    Based on code by Dan Bruton
##    http://www.physics.sfasu.edu/astro/color/spectra.html
    if (wavelength >= 380 & wavelength <= 440) {
        attenuation = 0.3 + 0.7 * (wavelength - 380) / (440 - 380)
        R = ((-(wavelength - 440) / (440 - 380)) * attenuation) ^ gamma
        G = 0.0
        B = (1.0 * attenuation) ^ gamma
        }
    else if (wavelength >= 440 & wavelength <= 490) {
        R = 0.0
        G = ((wavelength - 440) / (490 - 440)) ^ gamma
        B = 1.0
        }
    else if (wavelength >= 490 & wavelength <= 510) {
        R = 0.0
        G = 1.0
        B = (-(wavelength - 510) / (510 - 490)) ^ gamma
        }
    else if (wavelength >= 510 & wavelength <= 580) {
        R = ((wavelength - 510) / (580 - 510)) ^ gamma
        G = 1.0
        B = 0.0
        }
    else if (wavelength >= 580 & wavelength <= 645) {
        R = 1.0
        G = (-(wavelength - 645) / (645 - 580)) ^ gamma
        B = 0.0
        }
    else if (wavelength >= 645 & wavelength <= 750) {
        attenuation = 0.3 + 0.7 * (750 - wavelength) / (750 - 645)
        R = (1.0 * attenuation) ^ gamma
        G = 0.0
        B = 0.0
        }
    else {
        R = 0.0
        G = 0.0
        B = 0.0
        }
    R = R * 255
    G = G * 255
    B = B * 255
    return (rgb(floor(R), floor(G), floor(B), max=255))
}

colors <- function() {
  wl <- c(434,445,465,472,490,504,537,555,584,600,610,628,651,674)
  sim <- c(0.14,
           0.58,0.50,
           0.58,0.56,0.19,
           0.82,0.78,0.53,0.46,
           0.94,0.91,0.83,0.75,0.39,
           0.93,0.93,0.90,0.90,0.69,0.38,
           0.96,0.93,0.92,0.91,0.74,0.55,0.27,
           0.98,0.98,0.98,0.98,0.93,0.86,0.78,0.67,
           0.93,0.96,0.99,0.99,0.98,0.92,0.86,0.81,0.42,
           0.91,0.93,0.98,1.00,0.98,0.98,0.95,0.96,0.63,0.26,
           0.88,0.89,0.99,0.99,0.99,0.98,0.98,0.97,0.73,0.50,0.24,
           0.87,0.87,0.95,0.98,0.98,0.98,0.98,0.98,0.80,0.59,0.38,0.15,
           0.84,0.86,0.97,0.96,1.00,0.99,1.00,0.98,0.77,0.72,0.45,0.32,0.24)
  res <- matrix(0,length(wl),length(wl),dimnames=list(wl,wl))
  res[upper.tri(res)] <- sim
  res <- t(res)
  res[upper.tri(res)] <- sim
  list(d=as.dist(res),col=sapply(wl,wav2rgb),labels=as.character(wl))
}
aux <- colors()
plot1d <- function(x,col=aux$col,labels=aux$labels,main="k = 1") {
  plot(c(min(x),max(x)+0.05),c(1,length(x)),type="n",bty="n",xlab="x",ylab="index",main=main)
  abline(v=x,col=aux$col)
  lines(x,1:length(x),col="gray")
  points(x,1:length(x),pch=19,col=aux$col)
  text(x,1:length(x),labels=aux$labels,pos=4,cex=0.5)
}
if(makepdf) pdf("nmds1d.pdf")
plot1d(isoMDS(aux$d,k=1)$points,main="k = 1 (nonmetric MDS)")
initial  value 31.847239 
final  value 28.919217 
converged

if(makepdf) dev.off()


plot2d <- function(xy,col=aux$col,labels=aux$labels,main="k = 2") {
  plot(c(min(xy[,1]),max(xy[,1])+0.05),range(xy[,2]),type="n",bty="n",xlab="x",ylab="y",main=main)
  lines(xy,col="gray")
  points(xy,pch=19,col=aux$col)
  text(xy,labels=aux$labels,pos=4,cex=0.5)
}

if(makepdf) pdf("nmds2d.pdf")
plot2d(isoMDS(aux$d,k=2)$points,main="k = 2 (nonmetric MDS)")
initial  value 5.841614 
iter   5 value 3.335963
iter  10 value 3.089500
iter  15 value 2.936210
final  value 2.921821 
converged

if(makepdf) dev.off()


plot3d <- function(xyz,xy,col=aux$col,labels=aux$labels) {
  xyz <- as.data.frame(xyz)
  colnames(xyz) <- c("x","y","z")
  rownames(xyz) <- labels
  xyz <- cbind(xyz,w=1:dim(xyz)[1])
  plot_ly(xyz,x=~x,y=~y,z=~z,color=~w,colors=aux$col) %>% layout(scene=list(xaxis=list(title="x"),yaxis=list(title="y"),zaxis=list(title="z"))) %>% add_trace(x=~x,y=~y,z=~z,mode="line")  %>% add_markers()  %>% hide_colorbar() %>% hide_legend()
}

plot3d(isoMDS(aux$d,k=3)$points)
initial  value 4.363630 
iter   5 value 2.598462
iter  10 value 2.205186
iter  15 value 2.032003
iter  20 value 1.886957
iter  25 value 1.857243
iter  30 value 1.818042
iter  35 value 1.786504
iter  40 value 1.767466
final  value 1.763071 
converged
No trace type specified:
  Based on info supplied, a 'scatter3d' trace seems appropriate.
  Read more about this trace type -> https://plot.ly/r/reference/#scatter3d

s2 <- sapply(1:12,function(i) isoMDS(aux$d,k=i)$stress)
initial  value 31.847239 
final  value 28.919217 
converged
initial  value 5.841614 
iter   5 value 3.335963
iter  10 value 3.089500
iter  15 value 2.936210
final  value 2.921821 
converged
initial  value 4.363630 
iter   5 value 2.598462
iter  10 value 2.205186
iter  15 value 2.032003
iter  20 value 1.886957
iter  25 value 1.857243
iter  30 value 1.818042
iter  35 value 1.786504
iter  40 value 1.767466
final  value 1.763071 
converged
initial  value 2.975333 
iter   5 value 2.069891
iter  10 value 1.757177
iter  15 value 1.154414
iter  20 value 0.998939
iter  25 value 0.930378
iter  30 value 0.872730
iter  35 value 0.828782
iter  40 value 0.788755
iter  45 value 0.761303
iter  50 value 0.752342
final  value 0.752342 
stopped after 50 iterations
initial  value 1.422477 
iter   5 value 0.539959
iter  10 value 0.447943
iter  15 value 0.385682
iter  20 value 0.355484
iter  25 value 0.332173
iter  30 value 0.315434
iter  35 value 0.290758
iter  40 value 0.277736
iter  45 value 0.255893
iter  50 value 0.248739
final  value 0.248739 
stopped after 50 iterations
initial  value 1.185698 
iter   5 value 0.457878
iter  10 value 0.349676
iter  15 value 0.308997
iter  20 value 0.274385
iter  25 value 0.228370
iter  30 value 0.192654
iter  35 value 0.149698
iter  40 value 0.121093
iter  45 value 0.108874
iter  50 value 0.102501
final  value 0.102501 
stopped after 50 iterations
initial  value 0.824420 
iter   5 value 0.333363
iter  10 value 0.270529
iter  15 value 0.226512
iter  20 value 0.202023
iter  25 value 0.175213
iter  30 value 0.152422
iter  35 value 0.132591
iter  40 value 0.115870
iter  45 value 0.090601
iter  50 value 0.076804
final  value 0.076804 
stopped after 50 iterations
initial  value 0.683179 
iter   5 value 0.184667
iter  10 value 0.112932
iter  15 value 0.071743
iter  20 value 0.050152
iter  25 value 0.038713
iter  30 value 0.025375
iter  35 value 0.016901
iter  40 value 0.012782
final  value 0.005211 
converged
initial  value 0.572289 
iter   5 value 0.171035
iter  10 value 0.116950
iter  15 value 0.070949
iter  20 value 0.049898
iter  25 value 0.033451
iter  30 value 0.025718
iter  35 value 0.020287
iter  40 value 0.013010
final  value 0.006061 
converged
initial  value 0.554579 
iter   5 value 0.153384
iter  10 value 0.105945
iter  15 value 0.061520
iter  20 value 0.041735
iter  25 value 0.031939
iter  30 value 0.021863
iter  35 value 0.014101
final  value 0.007455 
converged
initial  value 0.551530 
iter   5 value 0.168459
iter  10 value 0.122992
iter  15 value 0.080473
iter  20 value 0.053504
iter  25 value 0.039326
iter  30 value 0.026801
iter  35 value 0.020222
iter  40 value 0.015187
iter  45 value 0.010223
iter  45 value 0.009978
iter  45 value 0.007526
final  value 0.007526 
converged
initial  value 0.551530 
iter   5 value 0.168459
iter  10 value 0.122992
iter  15 value 0.080473
iter  20 value 0.053504
iter  25 value 0.039326
iter  30 value 0.026801
iter  35 value 0.020222
iter  40 value 0.015187
iter  45 value 0.010223
iter  45 value 0.009978
iter  45 value 0.007526
final  value 0.007526 
converged
## s3 <- sapply(1:12,function(i) sammon(aux$d,k=i)$stress)
if(makepdf) pdf("screecol.pdf")
plot(c(1,12),c(0,max(c(s2))),type="n",xlab="k",ylab="stress",bty="n")
lines(1:12,s2)
## lines(1:12,s3,col=brewer.pal(2,"Set1")[2],lty="dotted")
points(1:12,s2,pch=1)
##points(1:12,s3,col=brewer.pal(2,"Set1")[2],pch=18)
legend("topright",legend=c("nonmetric MDS"),bty="n",
       lty=c("solid"),pch=c(1))

if(makepdf) dev.off()


if(makepdf) pdf("shepardcolnmds.pdf")
shepard(aux$d,isoMDS(aux$d,k=2)$points,main="nonmetric MDS (k = 2)",pch=1)
initial  value 5.841614 
iter   5 value 3.335963
iter  10 value 3.089500
iter  15 value 2.936210
final  value 2.921821 
converged

if(makepdf) dev.off()
X <- makeswiss()$X
plotf <- function(x,main=main) plot(x,bty="n",xlab="",ylab="",xaxt="n",yaxt="n",col=cols,main=main)
if(makepdf) pdf("swiss.pdf")
pairs(X,col=cols)

plotf(spca(X)$E,main="PCA")

plotf(isoMDS(dist(X))$points,main="nonmetric MDS")
initial  value 15.819080 
final  value 15.220408 
converged

plotf(sammon(dist(X))$points,main="Sammon")
Initial stress        : 0.05141
stress after  10 iters: 0.03963, magic = 0.500
stress after  20 iters: 0.03950, magic = 0.014
stress after  30 iters: 0.03948, magic = 0.150

plotf(isomap(dist(X),k=10)$points,main="ISOMAP")

plotf(lle(X,2,10)$Y,main="LLE")
finding neighbours
calculating weights
computing coordinates

plotf(eigenmap(X,2,10),main="Eigenmap")

if(makepdf) dev.off()
xyz <- data.frame(x=X[,1],y=X[,2],z=X[,3],w=1:dim(X)[1])
plot_ly(xyz,x=~x,y=~y,z=~z,color=~w,colors=cols) %>% layout(scene=list(xaxis=list(title="x"),yaxis=list(title="y"),zaxis=list(title="z"))) %>% add_markers()  %>% hide_colorbar() %>% hide_legend()
HS <- readRDS("HS.rds")
dHS <- dist(HS[,4:52])
if(makepdf) pdf("HS.pdf")
partyplot(lle(HS[,4:52],2,10)$Y,cl=HS[,3],main="Espoo 2017 (LLE)")
finding neighbours
calculating weights
computing coordinates

partyplot(isomap(dHS,ndim=2,k=10)$points,cl=HS[,3],main="Espoo 2017 (ISOMAP)")

partyplot(eigenmap(HS[,4:52],2,10),cl=HS[,3],main="Espoo 2017 (Eigenmap)")

if(makepdf) dev.off()
partyplot3d(data.frame(lle(HS[,4:52],3,10)$Y,row.names=rownames(HS)),cl=HS[,3])
finding neighbours
calculating weights
computing coordinates

partyplot3d(data.frame(eigenmap(HS[,4:52],3,10),row.names=rownames(HS)),cl=HS[,3])

partyplot3d(data.frame(isomap(dHS,ndim=3,k=10)$points,row.names=rownames(HS)),cl=HS[,3])
---
title: "Dimensionality reduction"
output: html_notebook
---

First have some common code... See the previous lecture about `kmeans++`. Also, load the HS data and provide the necessary visualization functions.

```{r}
set.seed(42)
library(RColorBrewer)
library(plotly)
library(vegan)
library(clue)
library(dendextend)
library(MASS)
library(lle)

makepdf <- FALSE
n <- 500
rot <- matrix(c(cos(pi/4),-sin(pi/4),sin(pi/4),cos(pi/4)),2,2)
cols <- rep(brewer.pal(11,"RdYlBu"),each=ceiling(n/11))[1:n]

makecluster <- function(n=500) {
  Xcluster <- matrix(rnorm(2*n),n,2)
  Xcluster[,1] <- Xcluster[,1]/10
  p <- sample.int(n,size=floor(n/2))
  Xcluster[p,1] <- Xcluster[p,1]-0.3
  Xcluster[-p,1] <- Xcluster[-p,1]+0.3
  Ycluster <- matrix(-1,n,1)
  Ycluster[p,1] <- 1
  p <- order(Xcluster[,1])
  Xcluster <- Xcluster[p,]
  Xcluster <- Xcluster %*% t(rot)
  Ycluster <- Ycluster[p,,drop=FALSE]
  list(X=Xcluster,Y=Ycluster)
}

makeswiss <- function(n=500) {
  X <- matrix(0,n,3)
  s <- sort(runif(n))
  r <- 1+s+0.05*runif(n)
  X[,1] <- r*cos(2*pi*s)
  X[,2] <- r*sin(2*pi*s)
  X[,3] <- runif(n,min=-3,max=3)
  list(X=X,Y=matrix(s,n,1))
}

spca <- function(X,Y=diag(dim(X)[1])) {
  X <- scale(X,center=TRUE,scale=FALSE)
  a <- svd(t(X) %*% Y %*% t(Y) %*% X)
  p <- order(a$d,decreasing=TRUE)
  d <- a$d[p]
  U <- a$u[,p]
  list(E=X %*% U,d=d,U=U)
}

eigenmap <- function(X,ndim=2,k=5,eps=.Machine$double.eps^0.25) {
  W <- find_nn_k(X,k=k)
  W <- 1*(W | t(W)) # symmetrise
  D <- diag(apply(W,1,sum))
  s <- svd(W-D)
  p <- order(abs(s$d))
  lambda <- s$d[p]
  U <- s$u[,p,drop=FALSE]
  i <- lambda>=eps
  U[,i,drop=FALSE][,1:ndim,drop=FALSE]
}

doplots <- function(X,Y=diag(dim(X)[1]),col=cols,main="",prefix="",sp=FALSE) {
  spca(X)
  Xp <- spca(X)$E
  Xi <- spca(X,Y)$E
  y1 <- max(abs(c(as.matrix(X[,1]),Xp[,1],Xi[,1])))
  y2 <- max(abs(c(as.matrix(X[,2]),Xp[,2],Xi[,2])))
  y1 <- y2 <- max(y1,y2)
  plot(c(-y1,y1),c(-y2,y2),type="n",bty="n",xlab="x",ylab="y",main=main)
  points(X,col=col)
  plot(c(-y1,y1),c(-y2,y2),type="n",bty="n",xlab="PC1",ylab="PC2",main=if(main=="") "PCA" else sprintf("%s (PCA)",main))
  points(Xp,col=col)
  for(i in sample.int(dim(Xp)[1])) rug(Xp[i,1],side=1,lwd=1,col=col[i])
  if(sp) {
  plot(c(-y1,y1),c(-y2,y2),type="n",bty="n",xlab="SPC1",ylab="SPC2",main=if(main=="") "supervised PCA" else sprintf("%s (supervised PCA)",main))
  points(Xi,col=col)
  for(i in sample.int(dim(Xi)[1])) rug(Xi[i,1],side=1,lwd=1,col=col[i])
  }
}


partyplot <- function(xy,cl=class,col0=brewer.pal(length(levels(cl)),"Paired"),alpha=0.6,
                      acol=sapply(col0,function(x) adjustcolor(x,alpha=alpha)),
                      lab=levels(cl),main="Espoo 2017",textp=FALSE) {
  plot(c(min(xy[,1])-if(textp) 0 else 0,max(xy[,1])),range(xy[,2]),bty="n",xlab="x",ylab="y",type="n",main=main)
  
  p <- sample.int(dim(xy)[1])
  if(textp) {
    text(xy[p,],lab[cl[p]],cex=1,col=col0[cl[p]])
  } else {
    ## legend("bottomleft",legend=lab,pch=19,col=col0,bty="n")
    points(xy[p,],pch=19,col=acol[cl[p]])
    aux <- aggregate(xy,by=list(cl),FUN=mean)
    text(aux[,2:3],as.character(aux[,1]),font=2,cex=2,col=col0)
    text(aux[,2:3],as.character(aux[,1]),pos=1,font=2)
  }
}

partyplot3d <- function(xyz,cl=class,col0=brewer.pal(length(levels(cl)),"Paired"),alpha=0.6,
                      acol=sapply(col0,function(x) adjustcolor(x,alpha=alpha)),
                      lab=levels(cl),main="Espoo 2017") {
  xyz <- as.data.frame(xyz)
  colnames(xyz) <- c("x","y","z")
  xyz <- cbind(xyz,w=as.integer(cl))
  plot_ly(xyz,x=~x,y=~y,z=~z,color=~w,colors=acol,text=rownames(xyz)) %>% layout(scene=list(xaxis=list(title="x"),yaxis=list(title="y"),zaxis=list(title="z")))  %>% add_markers()  %>% hide_colorbar() %>% hide_legend()
}

HS <- readRDS("HS.rds")
dHS <- dist(HS[,4:52])
mHS <- as.matrix(HS[,4:52])

## Do a trick to make the order of the factors to present
## similarity of answers within a party
a <- aggregate(svd(scale(mHS))$u[,1],by=list(class=HS[,"class"]),mean)
a <- a[order(a[,"x"]),]
HS[,"class"] <- factor(as.character(HS[,"class"]),levels=a[,"class"])
print(table(HS[,"class"]))

# kmeansppcenters - Finds initial kmeans++ centroids
# Arguments:
# x        numeric matrix, rows correspond to data items
# k        integer, number of clusters
# Value:
# centers  a matrix of cluster centroids
#
# Reference: Arthur & Vassilivitskii (2007) k-means++: the
# advantages of careful seeding. In Proc SODA '07, 1027-1035. 
kmeansppcenters <- function(x,k) {
  x <- as.matrix(x)
  n <- dim(x)[1]
  centers <- matrix(NA,k,dim(x)[2])
  p <- rep(1/n,n)
  d <- rep(NA,n)
  for(i in 1:k) {
    centers[i,] <- x[sample.int(n,size=1,prob=p),]
    dd <- rowSums((x-(rep(1,n) %o% centers[i,]))^2)
    d <- if(i>1) pmin(d,dd) else dd
    if(max(d)>0) { p <- d/sum(d) }
  }
  centers
}

# kmeanspp - kmeans++ algorithm
# Arguments:
# x numeric matrix, rows correspond to data items
# k integer, number of clusters
# iter.max   the maximum number of iterations allowed (default 1000)
# algorithm  see kmeans documentation (default "Lloyd")
# Value:
# See kmeanspp documentation
#
# Reference: Arthur & Vassilivitskii (2007) k-means++: the
# advantages of careful seeding. In Proc SODA '07, 1027-1035. 
kmeanspp <- function(x,k,iter.max=1000,algorithm="Lloyd",...) {
  kmeans(x,centers=kmeansppcenters(x,k),iter.max=iter.max,
         algorithm=algorithm,...)
}

# kmeansppn 
# Run kmeanspp algorithm n times (default n=10) and return a solution
# with the smallest loss.
kmeansppn <- function(x,k,n=10,iter.max=1000,algorithm="Lloyd",...) {
  best <- NULL
  count <- 1
  for(i in 1:n) {
    res <- kmeanspp(x,k,iter.max=iter.max,algorithm=algorithm,...)
    if(!is.null(best) && res$tot.withinss==best$tot.withinss) {
      count <- count+1
    }
    if(is.null(best) || res$tot.withinss<best$tot.withinss) {
      best <- res
      count <- 1
    }
  }
  ## If we are really converged we might expect count to be >1.
  cat(sprintf("count = %d\n",count))
  best
}

a <- makecluster(n)
Xcluster <- a$X ; Ycluster <- a$Y
```


PCA rotates the data so that the first principal component points to the direction of the largest variance, the second principla component to the direction of the second largest variance etc.

```{r}
if(makepdf) pdf("pca_spca.pdf")
doplots(Xcluster,Y=Ycluster)
if(makepdf) dev.off()
a <- makeswiss(n)
Xswiss <- a$X[,1:2] ; Yswiss <- a$Y
if(makepdf) pdf("pca_spca_swiss.pdf")
doplots(Xswiss,Y=Yswiss)
if(makepdf) dev.off()
```
```{r}
compstress <- function(x,d,dhat=dist(x)) { sqrt(sum((dhat-d)^2)/sum(dhat^2)) }
shepard <- function(d,x,main="",pch=".") {
  aux <- Shepard(d,x)
  plot(range(aux$x),range(c(aux$y,aux$yf)),xlab="p",ylab="d(X)",bty="n",type="n",
       main=main)
  p <- sample.int(length(aux$x),size=min(2000,length(aux$x)))
  points(aux$x[p],aux$y[p],pch=pch)
  lines(aux$x,aux$yf,type="S")
}
wav2rgb <- function(wavelength, gamma=0.8){
##    Based on code by Dan Bruton
##    http://www.physics.sfasu.edu/astro/color/spectra.html
    if (wavelength >= 380 & wavelength <= 440) {
        attenuation = 0.3 + 0.7 * (wavelength - 380) / (440 - 380)
        R = ((-(wavelength - 440) / (440 - 380)) * attenuation) ^ gamma
        G = 0.0
        B = (1.0 * attenuation) ^ gamma
        }
    else if (wavelength >= 440 & wavelength <= 490) {
        R = 0.0
        G = ((wavelength - 440) / (490 - 440)) ^ gamma
        B = 1.0
        }
    else if (wavelength >= 490 & wavelength <= 510) {
        R = 0.0
        G = 1.0
        B = (-(wavelength - 510) / (510 - 490)) ^ gamma
        }
    else if (wavelength >= 510 & wavelength <= 580) {
        R = ((wavelength - 510) / (580 - 510)) ^ gamma
        G = 1.0
        B = 0.0
        }
    else if (wavelength >= 580 & wavelength <= 645) {
        R = 1.0
        G = (-(wavelength - 645) / (645 - 580)) ^ gamma
        B = 0.0
        }
    else if (wavelength >= 645 & wavelength <= 750) {
        attenuation = 0.3 + 0.7 * (750 - wavelength) / (750 - 645)
        R = (1.0 * attenuation) ^ gamma
        G = 0.0
        B = 0.0
        }
    else {
        R = 0.0
        G = 0.0
        B = 0.0
        }
    R = R * 255
    G = G * 255
    B = B * 255
    return (rgb(floor(R), floor(G), floor(B), max=255))
}

colors <- function() {
  wl <- c(434,445,465,472,490,504,537,555,584,600,610,628,651,674)
  sim <- c(0.14,
           0.58,0.50,
           0.58,0.56,0.19,
           0.82,0.78,0.53,0.46,
           0.94,0.91,0.83,0.75,0.39,
           0.93,0.93,0.90,0.90,0.69,0.38,
           0.96,0.93,0.92,0.91,0.74,0.55,0.27,
           0.98,0.98,0.98,0.98,0.93,0.86,0.78,0.67,
           0.93,0.96,0.99,0.99,0.98,0.92,0.86,0.81,0.42,
           0.91,0.93,0.98,1.00,0.98,0.98,0.95,0.96,0.63,0.26,
           0.88,0.89,0.99,0.99,0.99,0.98,0.98,0.97,0.73,0.50,0.24,
           0.87,0.87,0.95,0.98,0.98,0.98,0.98,0.98,0.80,0.59,0.38,0.15,
           0.84,0.86,0.97,0.96,1.00,0.99,1.00,0.98,0.77,0.72,0.45,0.32,0.24)
  res <- matrix(0,length(wl),length(wl),dimnames=list(wl,wl))
  res[upper.tri(res)] <- sim
  res <- t(res)
  res[upper.tri(res)] <- sim
  list(d=as.dist(res),col=sapply(wl,wav2rgb),labels=as.character(wl))
}
aux <- colors()
plot1d <- function(x,col=aux$col,labels=aux$labels,main="k = 1") {
  plot(c(min(x),max(x)+0.05),c(1,length(x)),type="n",bty="n",xlab="x",ylab="index",main=main)
  abline(v=x,col=aux$col)
  lines(x,1:length(x),col="gray")
  points(x,1:length(x),pch=19,col=aux$col)
  text(x,1:length(x),labels=aux$labels,pos=4,cex=0.5)
}
if(makepdf) pdf("nmds1d.pdf")
plot1d(isoMDS(aux$d,k=1)$points,main="k = 1 (nonmetric MDS)")
if(makepdf) dev.off()


plot2d <- function(xy,col=aux$col,labels=aux$labels,main="k = 2") {
  plot(c(min(xy[,1]),max(xy[,1])+0.05),range(xy[,2]),type="n",bty="n",xlab="x",ylab="y",main=main)
  lines(xy,col="gray")
  points(xy,pch=19,col=aux$col)
  text(xy,labels=aux$labels,pos=4,cex=0.5)
}

if(makepdf) pdf("nmds2d.pdf")
plot2d(isoMDS(aux$d,k=2)$points,main="k = 2 (nonmetric MDS)")
if(makepdf) dev.off()


plot3d <- function(xyz,xy,col=aux$col,labels=aux$labels) {
  xyz <- as.data.frame(xyz)
  colnames(xyz) <- c("x","y","z")
  rownames(xyz) <- labels
  xyz <- cbind(xyz,w=1:dim(xyz)[1])
  plot_ly(xyz,x=~x,y=~y,z=~z,color=~w,colors=aux$col) %>% layout(scene=list(xaxis=list(title="x"),yaxis=list(title="y"),zaxis=list(title="z"))) %>% add_trace(x=~x,y=~y,z=~z,mode="line")  %>% add_markers()  %>% hide_colorbar() %>% hide_legend()
}

plot3d(isoMDS(aux$d,k=3)$points)




s2 <- sapply(1:12,function(i) isoMDS(aux$d,k=i)$stress)
## s3 <- sapply(1:12,function(i) sammon(aux$d,k=i)$stress)
if(makepdf) pdf("screecol.pdf")
plot(c(1,12),c(0,max(c(s2))),type="n",xlab="k",ylab="stress",bty="n")
lines(1:12,s2)
## lines(1:12,s3,col=brewer.pal(2,"Set1")[2],lty="dotted")
points(1:12,s2,pch=1)
##points(1:12,s3,col=brewer.pal(2,"Set1")[2],pch=18)
legend("topright",legend=c("nonmetric MDS"),bty="n",
       lty=c("solid"),pch=c(1))
if(makepdf) dev.off()


if(makepdf) pdf("shepardcolnmds.pdf")
shepard(aux$d,isoMDS(aux$d,k=2)$points,main="nonmetric MDS (k = 2)",pch=1)
if(makepdf) dev.off()

```

```{r}
X <- makeswiss()$X
plotf <- function(x,main=main) plot(x,bty="n",xlab="",ylab="",xaxt="n",yaxt="n",col=cols,main=main)
if(makepdf) pdf("swiss.pdf")
pairs(X,col=cols)
plotf(spca(X)$E,main="PCA")
plotf(isoMDS(dist(X))$points,main="nonmetric MDS")
plotf(sammon(dist(X))$points,main="Sammon")
plotf(isomap(dist(X),k=10)$points,main="ISOMAP")
plotf(lle(X,2,10)$Y,main="LLE")
plotf(eigenmap(X,2,10),main="Eigenmap")
if(makepdf) dev.off()
xyz <- data.frame(x=X[,1],y=X[,2],z=X[,3],w=1:dim(X)[1])
plot_ly(xyz,x=~x,y=~y,z=~z,color=~w,colors=cols) %>% layout(scene=list(xaxis=list(title="x"),yaxis=list(title="y"),zaxis=list(title="z"))) %>% add_markers()  %>% hide_colorbar() %>% hide_legend()
```

```{r}
HS <- readRDS("HS.rds")
dHS <- dist(HS[,4:52])
if(makepdf) pdf("HS.pdf")
partyplot(lle(HS[,4:52],2,10)$Y,cl=HS[,3],main="Espoo 2017 (LLE)")
partyplot(isomap(dHS,ndim=2,k=10)$points,cl=HS[,3],main="Espoo 2017 (ISOMAP)")
partyplot(eigenmap(HS[,4:52],2,10),cl=HS[,3],main="Espoo 2017 (Eigenmap)")
if(makepdf) dev.off()
partyplot3d(data.frame(lle(HS[,4:52],3,10)$Y,row.names=rownames(HS)),cl=HS[,3])
partyplot3d(data.frame(eigenmap(HS[,4:52],3,10),row.names=rownames(HS)),cl=HS[,3])
partyplot3d(data.frame(isomap(dHS,ndim=3,k=10)$points,row.names=rownames(HS)),cl=HS[,3])
```

