Note: Some results may differ from the hard copy book due to the changing of sampling procedures introduced in R 3.6.0. See http://bit.ly/35D1SW7 for more details. Access and run the source code for this notebook here.

Hidden chapter requirements used in the book to set the plotting theme and load packages used in hidden code chunks:

knitr::opts_chunk$set(
  message = FALSE, 
  warning = FALSE, 
  cache = FALSE
)

# Set the graphical theme
ggplot2::theme_set(ggplot2::theme_light())

Figure 21.1:

knitr::include_graphics("images/dendrogram.png")

Prerequisites

For this chapter we’ll use the following packages:

# Helper packages
library(dplyr)       # for data manipulation
library(ggplot2)     # for data visualization
# Modeling packages
library(cluster)     # for general clustering algorithms
library(factoextra)  # for visualizing cluster results

The major concepts of hierarchical clustering will be illustrated using the Ames housing data:

ames_scale <- AmesHousing::make_ames() %>%
  select_if(is.numeric) %>%  # select numeric columns
  select(-Sale_Price) %>%    # remove target column
  mutate_all(as.double) %>%  # coerce to double type
  scale()                    # center & scale the resulting columns

Hierarchical clustering algorithms

Figure 21.2:

knitr::include_graphics("images/dendrogram2.png")

Figure 21.3:

knitr::include_graphics("images/dendrogram3.png")

Hierarchical clustering in R

Agglomerative hierarchical clustering

# For reproducibility
set.seed(123)
# Dissimilarity matrix
d <- dist(ames_scale, method = "euclidean")
# Hierarchical clustering using Complete Linkage
hc1 <- hclust(d, method = "complete" )
# For reproducibility
set.seed(123)
# Compute maximum or complete linkage clustering with agnes
hc2 <- agnes(ames_scale, method = "complete")
# Agglomerative coefficient
hc2$ac
[1] 0.926775
# methods to assess
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")
# function to compute coefficient
ac <- function(x) {
  agnes(ames_scale, method = x)$ac
}
# get agglomerative coefficient for each linkage method
purrr::map_dbl(m, ac)
  average    single  complete      ward 
0.9139303 0.8712890 0.9267750 0.9766577 

Divisive hierarchical clustering

# compute divisive hierarchical clustering
hc4 <- diana(ames_scale)
# Divise coefficient; amount of clustering structure found
hc4$dc
[1] 0.9191094

Determining optimal clusters

Working with dendrograms

# Construct dendorgram for the Ames housing example
hc5 <- hclust(d, method = "ward.D2" )
dend_plot <- fviz_dend(hc5)
dend_data <- attr(dend_plot, "dendrogram")
dend_cuts <- cut(dend_data, h = 8)
fviz_dend(dend_cuts$lower[[2]])

Figure 21.6:

df <- data.frame(
  x1 = c(-1.5, -1.3, -.9, -.6, .1, .1, .6, 1.2, 1.4),
  x2 = c(-.4, -1.5, -1.2, -1, -1.1, .6, -.2, -.5, -.3),
  label = c(3, 4, 6, 1, 2, 9, 8, 5, 7),
  row.names = c(3, 4, 6, 1, 2, 9, 8, 5, 7)
)
highlight <- filter(df, label %in% c(2 ,9))
p1 <- ggplot(df, aes(x1, x2, label = label)) +
  geom_label() +
  geom_label(data = highlight, fill = 'yellow')
df <- data.frame(
  x1 = c(-1.5, -1.3, -.9, -.6, .1, .1, .6, 1.2, 1.4),
  x2 = c(-.4, -1.5, -1.2, -1, -1.1, .6, -.2, -.5, -.3),
  row.names = c(3, 4, 6, 1, 2, 9, 8, 5, 7)
)
p2 <- factoextra::fviz_dend(hclust(dist(df)))
gridExtra::grid.arrange(p1, p2, nrow = 1)

# Ward's method
hc5 <- hclust(d, method = "ward.D2" )
# Cut tree into 4 groups
sub_grp <- cutree(hc5, k = 8)
# Number of members in each cluster
table(sub_grp)
sub_grp
   1    2    3    4    5    6    7    8 
1363  567  650   36  123  156   24   11 
# Plot full dendogram
fviz_dend(
  hc5,
  k = 8,
  horiz = TRUE,
  rect = TRUE,
  rect_fill = TRUE,
  rect_border = "jco",
  k_colors = "jco",
  cex = 0.1
)

dend_plot <- fviz_dend(hc5)                # create full dendogram
dend_data <- attr(dend_plot, "dendrogram") # extract plot info
dend_cuts <- cut(dend_data, h = 70.5)      # cut the dendogram at 
                                           # designated height
# Create sub dendrogram plots
p1 <- fviz_dend(dend_cuts$lower[[1]])
p2 <- fviz_dend(dend_cuts$lower[[1]], type = 'circular')
# Side by side plots
gridExtra::grid.arrange(p1, p2, nrow = 1)

LS0tCnRpdGxlOiAiQ2hhcHRlciAyMTogSGllcmFyY2hpY2FsIENsdXN0ZXJpbmciCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCl9fTm90ZV9fOiBTb21lIHJlc3VsdHMgbWF5IGRpZmZlciBmcm9tIHRoZSBoYXJkIGNvcHkgYm9vayBkdWUgdG8gdGhlIGNoYW5naW5nIG9mCnNhbXBsaW5nIHByb2NlZHVyZXMgaW50cm9kdWNlZCBpbiBSIDMuNi4wLiBTZWUgaHR0cDovL2JpdC5seS8zNUQxU1c3IGZvciBtb3JlCmRldGFpbHMuIEFjY2VzcyBhbmQgcnVuIHRoZSBzb3VyY2UgY29kZSBmb3IgdGhpcyBub3RlYm9vayBbaGVyZV0oaHR0cHM6Ly9yc3R1ZGlvLmNsb3VkL3Byb2plY3QvODAxMTg1KS4KCkhpZGRlbiBjaGFwdGVyIHJlcXVpcmVtZW50cyB1c2VkIGluIHRoZSBib29rIHRvIHNldCB0aGUgcGxvdHRpbmcgdGhlbWUgYW5kIGxvYWQKcGFja2FnZXMgdXNlZCBpbiBoaWRkZW4gY29kZSBjaHVua3M6CgpgYGB7ciBzZXR1cH0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KAogIG1lc3NhZ2UgPSBGQUxTRSwgCiAgd2FybmluZyA9IEZBTFNFLCAKICBjYWNoZSA9IEZBTFNFCikKCiMgU2V0IHRoZSBncmFwaGljYWwgdGhlbWUKZ2dwbG90Mjo6dGhlbWVfc2V0KGdncGxvdDI6OnRoZW1lX2xpZ2h0KCkpCmBgYAoKRmlndXJlIDIxLjE6CgpgYGB7ciBkZW5kcm9ncmFtLCBvdXQuaGVpZ2h0PSc2NSUnLCBvdXQud2lkdGg9JzY1JScsIGZpZy5jYXA9IklsbHVzdHJhdGl2ZSBkZW5kcm9ncmFtLiJ9CmtuaXRyOjppbmNsdWRlX2dyYXBoaWNzKCJpbWFnZXMvZGVuZHJvZ3JhbS5wbmciKQpgYGAKCiMjIFByZXJlcXVpc2l0ZXMKCkZvciB0aGlzIGNoYXB0ZXIgd2UnbGwgdXNlIHRoZSBmb2xsb3dpbmcgcGFja2FnZXM6CgpgYGB7ciBoaWVyYXJjaGljYWwtY2x1c3RlcmluZy1wa2dzfQojIEhlbHBlciBwYWNrYWdlcwpsaWJyYXJ5KGRwbHlyKSAgICAgICAjIGZvciBkYXRhIG1hbmlwdWxhdGlvbgpsaWJyYXJ5KGdncGxvdDIpICAgICAjIGZvciBkYXRhIHZpc3VhbGl6YXRpb24KIyBNb2RlbGluZyBwYWNrYWdlcwpsaWJyYXJ5KGNsdXN0ZXIpICAgICAjIGZvciBnZW5lcmFsIGNsdXN0ZXJpbmcgYWxnb3JpdGhtcwpsaWJyYXJ5KGZhY3RvZXh0cmEpICAjIGZvciB2aXN1YWxpemluZyBjbHVzdGVyIHJlc3VsdHMKYGBgCgpUaGUgbWFqb3IgY29uY2VwdHMgb2YgaGllcmFyY2hpY2FsIGNsdXN0ZXJpbmcgd2lsbCBiZSBpbGx1c3RyYXRlZCB1c2luZyB0aGUgQW1lcyBob3VzaW5nIGRhdGE6CgpgYGB7ciBoaWVyYXJjaGljYWwtY2x1c3RlcmluZy1kYXRhfQphbWVzX3NjYWxlIDwtIEFtZXNIb3VzaW5nOjptYWtlX2FtZXMoKSAlPiUKICBzZWxlY3RfaWYoaXMubnVtZXJpYykgJT4lICAjIHNlbGVjdCBudW1lcmljIGNvbHVtbnMKICBzZWxlY3QoLVNhbGVfUHJpY2UpICU+JSAgICAjIHJlbW92ZSB0YXJnZXQgY29sdW1uCiAgbXV0YXRlX2FsbChhcy5kb3VibGUpICU+JSAgIyBjb2VyY2UgdG8gZG91YmxlIHR5cGUKICBzY2FsZSgpICAgICAgICAgICAgICAgICAgICAjIGNlbnRlciAmIHNjYWxlIHRoZSByZXN1bHRpbmcgY29sdW1ucwpgYGAKCiMjIEhpZXJhcmNoaWNhbCBjbHVzdGVyaW5nIGFsZ29yaXRobXMKCkZpZ3VyZSAyMS4yOgoKYGBge3IgZGVuZHJvZ3JhbTIsICBvdXQuaGVpZ2h0PSc2NSUnLCBvdXQud2lkdGg9JzY1JScsIGZpZy5jYXA9IkFHTkVTIChib3R0b20tdXApIHZlcnN1cyBESUFOQSAodG9wLWRvd24pIGNsdXN0ZXJpbmcuIn0Ka25pdHI6OmluY2x1ZGVfZ3JhcGhpY3MoImltYWdlcy9kZW5kcm9ncmFtMi5wbmciKQpgYGAKCkZpZ3VyZSAyMS4zOgoKYGBge3IgZGVuZHJvZ3JhbTMsIGZpZy5jYXA9IkRpZmZlcmluZyBoaWVyYXJjaGljYWwgY2x1c3RlcmluZyBvdXRwdXRzIGJhc2VkIG9uIHNpbWlsYXJpdHkgbWVhc3VyZXMuIiwgb3V0LndpZHRoPSc5MCUnLCBvdXQuaGVpZ2h0PSc5MCUnfQprbml0cjo6aW5jbHVkZV9ncmFwaGljcygiaW1hZ2VzL2RlbmRyb2dyYW0zLnBuZyIpCmBgYAoKCiMjIEhpZXJhcmNoaWNhbCBjbHVzdGVyaW5nIGluIFIgCgojIyMgQWdnbG9tZXJhdGl2ZSBoaWVyYXJjaGljYWwgY2x1c3RlcmluZwoKYGBge3IgYWdnbG9tZXJhdGl2ZTF9CiMgRm9yIHJlcHJvZHVjaWJpbGl0eQpzZXQuc2VlZCgxMjMpCiMgRGlzc2ltaWxhcml0eSBtYXRyaXgKZCA8LSBkaXN0KGFtZXNfc2NhbGUsIG1ldGhvZCA9ICJldWNsaWRlYW4iKQojIEhpZXJhcmNoaWNhbCBjbHVzdGVyaW5nIHVzaW5nIENvbXBsZXRlIExpbmthZ2UKaGMxIDwtIGhjbHVzdChkLCBtZXRob2QgPSAiY29tcGxldGUiICkKYGBgCgpgYGB7ciBhZ2dsb21lcmF0aXZlMn0KIyBGb3IgcmVwcm9kdWNpYmlsaXR5CnNldC5zZWVkKDEyMykKIyBDb21wdXRlIG1heGltdW0gb3IgY29tcGxldGUgbGlua2FnZSBjbHVzdGVyaW5nIHdpdGggYWduZXMKaGMyIDwtIGFnbmVzKGFtZXNfc2NhbGUsIG1ldGhvZCA9ICJjb21wbGV0ZSIpCiMgQWdnbG9tZXJhdGl2ZSBjb2VmZmljaWVudApoYzIkYWMKYGBgCgpgYGB7ciBhZ2dsb21lcmF0aXZlLWNvbXBhcmV9CiMgbWV0aG9kcyB0byBhc3Nlc3MKbSA8LSBjKCAiYXZlcmFnZSIsICJzaW5nbGUiLCAiY29tcGxldGUiLCAid2FyZCIpCm5hbWVzKG0pIDwtIGMoICJhdmVyYWdlIiwgInNpbmdsZSIsICJjb21wbGV0ZSIsICJ3YXJkIikKIyBmdW5jdGlvbiB0byBjb21wdXRlIGNvZWZmaWNpZW50CmFjIDwtIGZ1bmN0aW9uKHgpIHsKICBhZ25lcyhhbWVzX3NjYWxlLCBtZXRob2QgPSB4KSRhYwp9CiMgZ2V0IGFnZ2xvbWVyYXRpdmUgY29lZmZpY2llbnQgZm9yIGVhY2ggbGlua2FnZSBtZXRob2QKcHVycnI6Om1hcF9kYmwobSwgYWMpCmBgYAoKCiMjIyBEaXZpc2l2ZSBoaWVyYXJjaGljYWwgY2x1c3RlcmluZwoKYGBge3IgZGl2aXNpdmV9CiMgY29tcHV0ZSBkaXZpc2l2ZSBoaWVyYXJjaGljYWwgY2x1c3RlcmluZwpoYzQgPC0gZGlhbmEoYW1lc19zY2FsZSkKIyBEaXZpc2UgY29lZmZpY2llbnQ7IGFtb3VudCBvZiBjbHVzdGVyaW5nIHN0cnVjdHVyZSBmb3VuZApoYzQkZGMKYGBgCgoKIyMgRGV0ZXJtaW5pbmcgb3B0aW1hbCBjbHVzdGVycwoKYGBge3IgaGNsdXN0LW9wdGltYWwtY2x1c3RlcnMtY29tcGFyZSwgZmlnLmNhcD0nQ29tcGFyaXNvbiBvZiB0aHJlZSBkaWZmZXJlbnQgbWV0aG9kcyB0byBpZGVudGlmeSB0aGUgb3B0aW1hbCBudW1iZXIgb2YgY2x1c3RlcnMuJ30KIyBQbG90IGNsdXN0ZXIgcmVzdWx0cwpwMSA8LSBmdml6X25iY2x1c3QoYW1lc19zY2FsZSwgRlVOID0gaGN1dCwgbWV0aG9kID0gIndzcyIsIAogICAgICAgICAgICAgICAgICAgay5tYXggPSAxMCkgKwogIGdndGl0bGUoIihBKSBFbGJvdyBtZXRob2QiKQpwMiA8LSBmdml6X25iY2x1c3QoYW1lc19zY2FsZSwgRlVOID0gaGN1dCwgbWV0aG9kID0gInNpbGhvdWV0dGUiLCAKICAgICAgICAgICAgICAgICAgIGsubWF4ID0gMTApICsKICBnZ3RpdGxlKCIoQikgU2lsaG91ZXR0ZSBtZXRob2QiKQpwMyA8LSBmdml6X25iY2x1c3QoYW1lc19zY2FsZSwgRlVOID0gaGN1dCwgbWV0aG9kID0gImdhcF9zdGF0IiwgCiAgICAgICAgICAgICAgICAgICBrLm1heCA9IDEwKSArCiAgZ2d0aXRsZSgiKEMpIEdhcCBzdGF0aXN0aWMiKQojIERpc3BsYXkgcGxvdHMgc2lkZSBieSBzaWRlCmdyaWRFeHRyYTo6Z3JpZC5hcnJhbmdlKHAxLCBwMiwgcDMsIG5yb3cgPSAxKQpgYGAKCiMjIFdvcmtpbmcgd2l0aCBkZW5kcm9ncmFtcwoKYGBge3IgaWxsdXN0cmF0aXZlLWRlbmRyb2dyYW0tcGxvdCwgZmlnLmNhcD0iQSBzdWJzZWN0aW9uIG9mIHRoZSBkZW5kcm9ncmFtIGZvciBpbGx1c3RyYXRpdmUgcHVycG9zZXMuIn0KIyBDb25zdHJ1Y3QgZGVuZG9yZ3JhbSBmb3IgdGhlIEFtZXMgaG91c2luZyBleGFtcGxlCmhjNSA8LSBoY2x1c3QoZCwgbWV0aG9kID0gIndhcmQuRDIiICkKZGVuZF9wbG90IDwtIGZ2aXpfZGVuZChoYzUpCmRlbmRfZGF0YSA8LSBhdHRyKGRlbmRfcGxvdCwgImRlbmRyb2dyYW0iKQpkZW5kX2N1dHMgPC0gY3V0KGRlbmRfZGF0YSwgaCA9IDgpCmZ2aXpfZGVuZChkZW5kX2N1dHMkbG93ZXJbWzJdXSkKYGBgCgpGaWd1cmUgMjEuNjoKCmBgYHtyIGNvbXBhcmluZy1kZW5kcm9ncmFtLXRvLWRpc3RhbmNlcywgZmlnLmNhcD0iQ29tcGFyaXNvbiBvZiBuaW5lIG9ic2VydmF0aW9ucyBtZWFzdXJlZCBhY3Jvc3MgdHdvIGZlYXR1cmVzIChsZWZ0KSBhbmQgdGhlIHJlc3VsdGluZyBkZW5kcm9ncmFtIGNyZWF0ZWQgYmFzZWQgb24gaGllcmFyY2hpY2FsIGNsdXN0ZXJpbmcgKHJpZ2h0KS4ifQpkZiA8LSBkYXRhLmZyYW1lKAogIHgxID0gYygtMS41LCAtMS4zLCAtLjksIC0uNiwgLjEsIC4xLCAuNiwgMS4yLCAxLjQpLAogIHgyID0gYygtLjQsIC0xLjUsIC0xLjIsIC0xLCAtMS4xLCAuNiwgLS4yLCAtLjUsIC0uMyksCiAgbGFiZWwgPSBjKDMsIDQsIDYsIDEsIDIsIDksIDgsIDUsIDcpLAogIHJvdy5uYW1lcyA9IGMoMywgNCwgNiwgMSwgMiwgOSwgOCwgNSwgNykKKQpoaWdobGlnaHQgPC0gZmlsdGVyKGRmLCBsYWJlbCAlaW4lIGMoMiAsOSkpCnAxIDwtIGdncGxvdChkZiwgYWVzKHgxLCB4MiwgbGFiZWwgPSBsYWJlbCkpICsKICBnZW9tX2xhYmVsKCkgKwogIGdlb21fbGFiZWwoZGF0YSA9IGhpZ2hsaWdodCwgZmlsbCA9ICd5ZWxsb3cnKQpkZiA8LSBkYXRhLmZyYW1lKAogIHgxID0gYygtMS41LCAtMS4zLCAtLjksIC0uNiwgLjEsIC4xLCAuNiwgMS4yLCAxLjQpLAogIHgyID0gYygtLjQsIC0xLjUsIC0xLjIsIC0xLCAtMS4xLCAuNiwgLS4yLCAtLjUsIC0uMyksCiAgcm93Lm5hbWVzID0gYygzLCA0LCA2LCAxLCAyLCA5LCA4LCA1LCA3KQopCnAyIDwtIGZhY3RvZXh0cmE6OmZ2aXpfZGVuZChoY2x1c3QoZGlzdChkZikpKQpncmlkRXh0cmE6OmdyaWQuYXJyYW5nZShwMSwgcDIsIG5yb3cgPSAxKQpgYGAKCmBgYHtyIHdvcmtpbmctd2l0aC1kZW5kcy0xfQojIFdhcmQncyBtZXRob2QKaGM1IDwtIGhjbHVzdChkLCBtZXRob2QgPSAid2FyZC5EMiIgKQojIEN1dCB0cmVlIGludG8gNCBncm91cHMKc3ViX2dycCA8LSBjdXRyZWUoaGM1LCBrID0gOCkKIyBOdW1iZXIgb2YgbWVtYmVycyBpbiBlYWNoIGNsdXN0ZXIKdGFibGUoc3ViX2dycCkKYGBgCgpgYGB7ciB3b3JraW5nLXdpdGgtZGVuZHMtMi1wbG90LCBmaWcuY2FwPSJUaGUgY29tcGxldGUgZGVuZG9ncmFtIGhpZ2hsaWdodGluZyBhbGwgOCBjbHVzdGVycy4iLCBvdXQuaGVpZ2h0PSc3NSUnLCBvdXQud2lkdGg9Jzc1JSd9CiMgUGxvdCBmdWxsIGRlbmRvZ3JhbQpmdml6X2RlbmQoCiAgaGM1LAogIGsgPSA4LAogIGhvcml6ID0gVFJVRSwKICByZWN0ID0gVFJVRSwKICByZWN0X2ZpbGwgPSBUUlVFLAogIHJlY3RfYm9yZGVyID0gImpjbyIsCiAga19jb2xvcnMgPSAiamNvIiwKICBjZXggPSAwLjEKKQpgYGAKCmBgYHtyIHpvb20taW50by1kZW5kcm9ncmFtLXBsb3QsIGZpZy5jYXA9IkEgc3Vic2VjdGlvbiBvZiB0aGUgZGVuZHJvZ3JhbSBoaWdobGlnaHRpbmcgY2x1c3RlciA3LiJ9CmRlbmRfcGxvdCA8LSBmdml6X2RlbmQoaGM1KSAgICAgICAgICAgICAgICAjIGNyZWF0ZSBmdWxsIGRlbmRvZ3JhbQpkZW5kX2RhdGEgPC0gYXR0cihkZW5kX3Bsb3QsICJkZW5kcm9ncmFtIikgIyBleHRyYWN0IHBsb3QgaW5mbwpkZW5kX2N1dHMgPC0gY3V0KGRlbmRfZGF0YSwgaCA9IDcwLjUpICAgICAgIyBjdXQgdGhlIGRlbmRvZ3JhbSBhdCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgZGVzaWduYXRlZCBoZWlnaHQKIyBDcmVhdGUgc3ViIGRlbmRyb2dyYW0gcGxvdHMKcDEgPC0gZnZpel9kZW5kKGRlbmRfY3V0cyRsb3dlcltbMV1dKQpwMiA8LSBmdml6X2RlbmQoZGVuZF9jdXRzJGxvd2VyW1sxXV0sIHR5cGUgPSAnY2lyY3VsYXInKQojIFNpZGUgYnkgc2lkZSBwbG90cwpncmlkRXh0cmE6OmdyaWQuYXJyYW5nZShwMSwgcDIsIG5yb3cgPSAxKQpgYGAK