Note: Some results may differ from the hard-copy book because of the change to the sampling procedure introduced in R 3.6.0. See http://bit.ly/35D1SW7 for more details. Access and run the source code for this notebook here.
Hidden chapter requirements used in the book to set the plotting theme and load packages used in hidden code chunks:
# Chunk defaults for every code chunk in this notebook: keep package
# messages and warnings out of the rendered output, and do not cache
# chunk results.
knitr::opts_chunk$set(message = FALSE, warning = FALSE, cache = FALSE)

# Make theme_light() the default theme for the ggplot2 figures that follow
ggplot2::theme_set(ggplot2::theme_light())
# packages for hidden code chunks
library(kableExtra)
library(pca3d)
Prerequisites
This chapter leverages the following packages:
library(dplyr) # basic data manipulation and plotting
library(ggplot2) # data visualization
library(h2o) # performing dimension reduction
To illustrate dimension reduction techniques, we’ll use the my_basket
data set:
# Read the my_basket data directly from the remote CSV file (requires
# network access); the result is stored in `my_basket` for the chunks below.
url <- "https://koalaverse.github.io/homlr/data/my_basket.csv"
my_basket <- readr::read_csv(url)
Parsed with column specification:
cols(
.default = col_double()
)
See spec(...) for full column specifications.
# Number of rows and columns in the basket data
dim(my_basket)
[1] 2000 42
The idea
Table 17.1:
# Pairwise correlation matrix of every column in the basket data
m <- cor(my_basket)

# Table of the item pairs whose correlation exceeds 0.25, strongest first.
# Only the upper triangle of `m` is read, so each pair is listed once and
# the diagonal (an item paired with itself) never enters the table.
data.frame(
  `Item 1` = rownames(m)[row(m)[upper.tri(m)]],
  `Item 2` = colnames(m)[col(m)[upper.tri(m)]],
  Correlation = m[upper.tri(m)],
  stringsAsFactors = FALSE,
  check.names = FALSE  # keep the space in "Item 1" / "Item 2"
) %>%
  filter(Correlation > .25, Correlation < 1) %>%
  mutate(Correlation = round(Correlation, 3)) %>%
  arrange(desc(Correlation)) %>%
  kable(caption = "Various items in our my basket data that are correlated.") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Various items in our my basket data that are correlated.
Item 1 |
Item 2 |
Correlation |
cheese |
mayonnaise |
0.345 |
bulmers |
fosters |
0.335 |
cheese |
bread |
0.320 |
lasagna |
pizza |
0.316 |
pepsi |
coke |
0.309 |
red.wine |
fosters |
0.308 |
milk |
muesli |
0.302 |
mars |
twix |
0.301 |
red.wine |
bulmers |
0.298 |
bulmers |
kronenbourg |
0.289 |
milk |
tea |
0.288 |
red.wine |
kronenbourg |
0.286 |
7up |
coke |
0.282 |
spinach |
broccoli |
0.282 |
mayonnaise |
bread |
0.278 |
peas |
potatoes |
0.271 |
peas |
carrots |
0.270 |
tea |
instant.coffee |
0.270 |
milk |
instant.coffee |
0.267 |
bread |
lettuce |
0.264 |
twix |
kitkat |
0.259 |
mars |
kitkat |
0.255 |
muesli |
instant.coffee |
0.251 |
Finding principal components
Figure 17.1:
# Figure 17.1: two correlated Ames housing features (log-transformed, then
# standardized) with hand-placed arrows labelled as the first and second
# principal components.
df <- AmesHousing::make_ames() %>%
  select(var1 = First_Flr_SF, var2 = Gr_Liv_Area) %>%
  filter(var1 != var2) %>%  # keep only rows where the two areas differ
  mutate_all(log) %>%
  scale() %>%               # scale() returns a matrix ...
  data.frame() %>%          # ... so convert back before filtering
  filter(var1 < 4)          # applied after scaling: cutoff is in scaled units

ggplot(df, aes(var1, var2)) +
  geom_jitter(alpha = .2, size = 1, color = "dodgerblue") +
  # The arrows have fixed coordinates, so they are drawn with annotate().
  # A geom_segment() layer with constant aes() values inherits `df` and draws
  # one identical arrow per observation, all stacked on top of each other.
  annotate(
    "segment",
    x = 0, xend = 1.5, y = 0, yend = 1.5,
    arrow = arrow(length = unit(0.25, "cm")), size = 0.75, color = "black"
  ) +
  annotate("text", x = 1, y = .2, label = "First principal component", size = 2.5, hjust = 0) +
  annotate("text", x = -3, y = .8, label = "Second principal component", size = 2.5, hjust = 0) +
  annotate(
    "segment",
    x = 0, xend = -0.27, y = 0, yend = .65,
    arrow = arrow(length = unit(0.25, "cm")), size = 0.75, color = "black"
  ) +
  # NOTE(review): the x axis shows var1 but is labelled "Feature 2" (and the
  # y axis shows var2 labelled "Feature 1") -- confirm the labels are intended.
  xlab("Feature 2") +
  ylab("Feature 1") +
  theme_bw()
Creates Figure 17.2. Uncomment snapshotPCA3d(file = "3D-PCA.png")
to save image to desired location.
# Data behind Figure 17.2: three Ames features, with the two area variables
# log-transformed and var3 (TotRms_AbvGrd) left on its original scale.
df <- AmesHousing::make_ames() %>%
  select(var1 = First_Flr_SF, var2 = Gr_Liv_Area, var3 = TotRms_AbvGrd) %>%
  filter(var1 != var2) %>%
  mutate_at(vars(var1, var2), log)

# PCA without rescaling the features. The argument is written out as
# `scale.` (prcomp()'s actual parameter name) instead of relying on partial
# matching of `scale`.
pca <- prcomp(df, scale. = FALSE)

# 3D plot of the PCA result (pca3d package)
pca3d(pca)
[1] 0.135693940 0.022140269 0.009712809
Creating new device
#snapshotPCA3d(file="3D-PCA.png")
Imports Figure 17.2:
# Embed the 3D PCA image from a relative path; the file must already exist
# under images/ (see the snapshotPCA3d() call above for producing it).
knitr::include_graphics("images/3D-PCA.png")
Selecting the number of principal components
Eigenvalue criterion
# Compute eigenvalues: the squared standard deviation of each principal
# component. unlist() flattens the one-row slice of the importance table into
# a plain numeric vector; as.vector() does not reliably do that for a
# data-frame row (from R 4.3.0 it yields a list, on which `^` fails).
# NOTE(review): assumes my_pca@model$importance has a "Standard deviation"
# row -- my_pca is created outside this excerpt; confirm.
# The name `eigen` shadows base::eigen() but is kept because the chunks
# below refer to it.
eigen <- unlist(
  my_pca@model$importance["Standard deviation", ],
  use.names = FALSE
)^2

# Sum of all eigenvalues equals number of variables
sum(eigen)
[1] 42
# Find the PCs whose individual eigenvalue is greater than or equal to 1
which(eigen >= 1)
[1] 1 2 3 4 5 6 7 8 9 10
Figure 17.5:
# Figure 17.5: eigenvalue of each principal component, with a dashed red
# reference line at 1 marking the eigenvalue-criterion cutoff.
ggplot(
  data.frame(PC = seq_along(eigen), Eigenvalue = unlist(eigen)),
  aes(x = PC, y = Eigenvalue)
) +
  geom_point() +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "red") +
  scale_y_continuous(breaks = 0:6) +
  labs(x = "PC") +
  annotate(
    "text",
    x = 15, y = 1, label = "eigenvalue criteria cutoff",
    colour = "red", size = 5, hjust = 0, vjust = -1
  )
Proportion of variance explained criterion
# Extract and plot PVE and CVE
data.frame(
PC = my_pca@model$importance %>% seq_along(),
PVE = my_pca@model$importance %>% .[2,] %>% unlist(),
CVE = my_pca@model$importance %>% .[3,] %>% unlist()
) %>%
tidyr::gather(metric, variance_explained, -PC) %>%
ggplot(aes(PC, variance_explained)) +
geom_point() +
facet_wrap(~ metric, ncol = 1, scales = "free")
# The same PVE/CVE values as a table, keyed by the names of the importance
# table's columns instead of a running index.
ve <- data.frame(
  PC  = names(my_pca@model$importance),
  PVE = unlist(my_pca@model$importance[2, ]),
  CVE = unlist(my_pca@model$importance[3, ])
)

# First position at which CVE reaches at least 0.75, i.e. the number of PCs
# needed to explain at least 75% of total variability
min(which(ve$CVE >= 0.75))
[1] 27
Scree plot criterion
# Scree plot: PVE of each component drawn as connected points, each point
# labelled with its component index.
ggplot(
  data.frame(
    PC  = seq_along(my_pca@model$importance),
    PVE = unlist(my_pca@model$importance[2, ])
  ),
  aes(x = PC, y = PVE, group = 1, label = PC)
) +
  geom_point() +
  geom_line() +
  geom_text(nudge_y = -.002)  # shift the labels slightly below the points
# Shut down the H2O instance; prompt = FALSE skips the confirmation prompt
h2o.shutdown(prompt = FALSE)
