Note: Some results may differ from the hard copy book due to the changing of sampling procedures introduced in R 3.6.0. See http://bit.ly/35D1SW7 for more details. Access and run the source code for this notebook here.
Hidden chapter requirements used in the book to set the plotting theme and load packages used in hidden code chunks:
# Set global R options
options(scipen = 999)
# Set the graphical theme
ggplot2::theme_set(ggplot2::theme_light())
# Set global knitr chunk options
knitr::opts_chunk$set(
warning = FALSE,
message = FALSE
)
library(tidyverse)
ames <- AmesHousing::make_ames()
Prerequisites
For this chapter we’ll use the following packages:
# Helper packages
library(dplyr) # for data wrangling
library(ggplot2) # for awesome graphics
library(rsample) # for creating validation splits
library(recipes) # for feature engineering
# Modeling packages
library(caret) # for fitting KNN models
To illustrate various concepts we’ll continue working with the ames_train
and ames_test
data sets. We’ll also illustrate the performance of KNNs on the employee attrition and MNIST data sets.
# create training (70%) set for the rsample::attrition data.
attrit <- attrition %>% mutate_if(is.ordered, factor, ordered = FALSE)
set.seed(123)
churn_split <- initial_split(attrit, prop = .7, strata = "Attrition")
churn_train <- training(churn_split)
# import MNIST training data
mnist <- dslabs::read_mnist()
names(mnist)
[1] "train" "test"
# stratified sampling with the rsample package
set.seed(123)
split <- rsample::initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- rsample::training(split)
Measuring similarity
Figure 8.1
library(ggmap)
library(recipes)
df <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_nzv(all_nominal()) %>%
step_integer(matches("Qual|Cond|QC|Qu")) %>%
step_center(all_numeric(), -all_outcomes()) %>%
step_scale(all_numeric(), -all_outcomes()) %>%
step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
prep(training = ames_train, retain = TRUE) %>%
juice() %>%
select(-Sale_Price)
home <- 30
k = 10
index <- as.vector(FNN::knnx.index(df[-home, ], df[home, ], k = k))
knn_homes <- ames_train[c(home, index), ]
knn_homes %>%
select(Longitude, Latitude) %>%
mutate(desc = factor(c('House of interest', rep('Closest neighbors', k)),
levels = c('House of interest', 'Closest neighbors'))) %>%
qmplot(Longitude, Latitude, data = .,
maptype = "toner-background", darken = .7, color = desc, size = I(2.5)) +
theme(legend.position = "top",
legend.title = element_blank())
Distance measures
(two_houses <- ames_train[1:2, c("Gr_Liv_Area", "Year_Built")])
# Euclidean
dist(two_houses, method = "euclidean")
1
2 760.0007
# Manhattan
dist(two_houses, method = "manhattan")
1
2 761
Figure 8.2:
p1 <- ggplot(two_houses, aes(Gr_Liv_Area, Year_Built)) +
geom_point() +
geom_line(lty = "dashed") +
ggtitle("(A) Euclidean distance")
p2 <- ggplot(two_houses, aes(Gr_Liv_Area, Year_Built)) +
geom_point() +
geom_step(lty = "dashed") +
ggtitle("(B) Manhattan distance")
gridExtra::grid.arrange(p1, p2, nrow = 1)
Pre-processing
home1 <- ames %>%
mutate(id = row_number()) %>%
select(Bedroom_AbvGr, Year_Built, id) %>%
filter(Bedroom_AbvGr == 4 & Year_Built == 2008) %>%
slice(1) %>%
mutate(home = "home1") %>%
select(home, everything())
home2 <- ames %>%
mutate(id = row_number()) %>%
select(Bedroom_AbvGr, Year_Built, id) %>%
filter(Bedroom_AbvGr == 2 & Year_Built == 2008) %>%
slice(1) %>%
mutate(home = "home2") %>%
select(home, everything())
home3 <- ames %>%
mutate(id = row_number()) %>%
select(Bedroom_AbvGr, Year_Built, id) %>%
filter(Bedroom_AbvGr == 3 & Year_Built == 1998) %>%
slice(1) %>%
mutate(home = "home3") %>%
select(home, everything())
home1
home2
home3
features <- c("Bedroom_AbvGr", "Year_Built")
# distance between home 1 and 2
dist(rbind(home1[,features], home2[,features]))
1
2 2
# distance between home 1 and 3
dist(rbind(home1[,features], home3[,features]))
1
2 10.04988
scaled_ames <- recipe(Sale_Price ~ ., ames_train) %>%
step_center(all_numeric()) %>%
step_scale(all_numeric()) %>%
prep(training = ames, retain = TRUE) %>%
juice()
home1_std <- scaled_ames %>%
mutate(id = row_number()) %>%
filter(id == home1$id) %>%
select(Bedroom_AbvGr, Year_Built, id) %>%
mutate(home = "home1") %>%
select(home, everything())
home2_std <- scaled_ames %>%
mutate(id = row_number()) %>%
filter(id == home2$id) %>%
select(Bedroom_AbvGr, Year_Built, id) %>%
mutate(home = "home2") %>%
select(home, everything())
home3_std <- scaled_ames %>%
mutate(id = row_number()) %>%
filter(id == home3$id) %>%
select(Bedroom_AbvGr, Year_Built, id) %>%
mutate(home = "home3") %>%
select(home, everything())
home1_std
home2_std
home3_std
# distance between home 1 and 2
dist(rbind(home1_std[,features], home2_std[,features]))
1
2 2.416244
# distance between home 1 and 3
dist(rbind(home1_std[,features], home3_std[,features]))
1
2 1.252547
Choosing k
# Create blueprint
blueprint <- recipe(Attrition ~ ., data = churn_train) %>%
step_nzv(all_nominal()) %>%
step_integer(contains("Satisfaction")) %>%
step_integer(WorkLifeBalance) %>%
step_integer(JobInvolvement) %>%
step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
step_center(all_numeric(), -all_outcomes()) %>%
step_scale(all_numeric(), -all_outcomes())
# Create a resampling method
cv <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary
)
# Create a hyperparameter grid search
hyper_grid <- expand.grid(
k = floor(seq(1, nrow(churn_train)/3, length.out = 20))
)
# Fit knn model and perform grid search
knn_grid <- train(
blueprint,
data = churn_train,
method = "knn",
trControl = cv,
tuneGrid = hyper_grid,
metric = "ROC"
)
ggplot(knn_grid)
MNIST example
set.seed(123)
index <- sample(nrow(mnist$train$images), size = 10000)
mnist_x <- mnist$train$images[index, ]
mnist_y <- factor(mnist$train$labels[index])
mnist_x %>%
as.data.frame() %>%
map_df(sd) %>%
gather(feature, sd) %>%
ggplot(aes(sd)) +
geom_histogram(binwidth = 1)
Figure 8.5:
nzv <- nearZeroVar(mnist_x)
par(mfrow = c(1, 4))
i <- 2
image(1:28, 1:28, matrix(mnist$test$images[i,], nrow=28)[ , 28:1],
col = gray(seq(0, 1, 0.05)), xlab = "", ylab="",
xaxt="n", yaxt="n", main = "(A) Example image \nfor digit 2")
i <- 7
image(1:28, 1:28, matrix(mnist$test$images[i,], nrow=28)[ , 28:1],
col = gray(seq(0, 1, 0.05)), xlab = "", ylab="",
xaxt="n", yaxt="n", main = "(B) Example image \nfor digit 4")
i <- 9
image(1:28, 1:28, matrix(mnist$test$images[i,], nrow=28)[ , 28:1],
col = gray(seq(0, 1, 0.05)), xlab = "", ylab="",
xaxt="n", yaxt="n", main = "(C) Example image \nfor digit 5")
image(matrix(!(1:784 %in% nzv), 28, 28), col = gray(seq(0, 1, 0.05)),
xaxt="n", yaxt="n", main = "(D) Typical variability \nin images.")
# Rename features
colnames(mnist_x) <- paste0("V", 1:ncol(mnist_x))
# Remove near zero variance features manually
nzv <- nearZeroVar(mnist_x)
index <- setdiff(1:ncol(mnist_x), nzv)
mnist_x <- mnist_x[, index]
# Use train/validate resampling method
cv <- trainControl(
method = "LGOCV",
p = 0.7,
number = 1,
savePredictions = TRUE
)
# Create a hyperparameter grid search
hyper_grid <- expand.grid(k = seq(3, 25, by = 2))
# Execute grid search
knn_mnist <- train(
mnist_x,
mnist_y,
method = "knn",
tuneGrid = hyper_grid,
preProc = c("center", "scale"),
trControl = cv
)
ggplot(knn_mnist)
# Create confusion matrix
cm <- confusionMatrix(knn_mnist$pred$pred, knn_mnist$pred$obs)
cm$byClass[, c(1:2, 11)] # sensitivity, specificity, & accuracy
Sensitivity Specificity Balanced Accuracy
Class: 0 0.9641638 0.9962374 0.9802006
Class: 1 0.9916667 0.9841210 0.9878938
Class: 2 0.9155666 0.9955114 0.9555390
Class: 3 0.9163952 0.9920325 0.9542139
Class: 4 0.8698630 0.9960538 0.9329584
Class: 5 0.9151404 0.9914891 0.9533148
Class: 6 0.9795322 0.9888684 0.9842003
Class: 7 0.9326520 0.9896962 0.9611741
Class: 8 0.8224382 0.9978798 0.9101590
Class: 9 0.9329897 0.9852687 0.9591292
# Top 20 most important features
vi <- varImp(knn_mnist)
vi
ROC curve variable importance
variables are sorted by maximum importance across the classes
only 20 most important variables shown (out of 249)
# Get median value for feature importance
imp <- vi$importance %>%
rownames_to_column(var = "feature") %>%
gather(response, imp, -feature) %>%
group_by(feature) %>%
summarize(imp = median(imp))
# Create tibble for all edge pixels
edges <- tibble(
feature = paste0("V", nzv),
imp = 0
)
# Combine and plot
imp <- rbind(imp, edges) %>%
mutate(ID = as.numeric(str_extract(feature, "\\d+"))) %>%
arrange(ID)
image(matrix(imp$imp, 28, 28), col = gray(seq(0, 1, 0.05)),
xaxt="n", yaxt="n")
# Get a few accurate predictions
set.seed(9)
good <- knn_mnist$pred %>%
filter(pred == obs) %>%
sample_n(4)
# Get a few inaccurate predictions
set.seed(9)
bad <- knn_mnist$pred %>%
filter(pred != obs) %>%
sample_n(4)
combine <- bind_rows(good, bad)
# Get original feature set with all pixel features
set.seed(123)
index <- sample(nrow(mnist$train$images), 10000)
X <- mnist$train$images[index,]
# Plot results
par(mfrow = c(4, 2), mar=c(1, 1, 1, 1))
layout(matrix(seq_len(nrow(combine)), 4, 2, byrow = FALSE))
for(i in seq_len(nrow(combine))) {
image(matrix(X[combine$rowIndex[i],], 28, 28)[, 28:1],
col = gray(seq(0, 1, 0.05)),
main = paste("Actual:", combine$obs[i], " ",
"Predicted:", combine$pred[i]),
xaxt="n", yaxt="n")
}
---
title: "Chapter 8: K-Nearest Neighbors"
output: html_notebook
---

__Note__: Some results may differ from the hard copy book due to the changing of sampling procedures introduced in R 3.6.0. See http://bit.ly/35D1SW7 for more details. Access and run the source code for this notebook [here](https://rstudio.cloud/project/801185). 

Hidden chapter requirements used in the book to set the plotting theme and load packages used in hidden code chunks:

```{r setup}
# Set global R options
options(scipen = 999)

# Set the graphical theme
ggplot2::theme_set(ggplot2::theme_light())

# Set global knitr chunk options
knitr::opts_chunk$set(
  warning = FALSE, 
  message = FALSE
)

library(tidyverse)
ames <- AmesHousing::make_ames()
```

## Prerequisites

For this chapter we'll use the following packages:

```{r knn-pkgs, message=FALSE}
# Helper packages
library(dplyr)      # for data wrangling
library(ggplot2)    # for awesome graphics
library(rsample)    # for creating validation splits
library(recipes)    # for feature engineering

# Modeling packages
library(caret)       # for fitting KNN models
```

To illustrate various concepts we'll continue working with the `ames_train` and `ames_test` data sets. We'll also illustrate the performance of KNNs on the employee attrition and MNIST data sets.

```{r knn-data-prereq}
# create training (70%) set for the rsample::attrition data.
attrit <- attrition %>% mutate_if(is.ordered, factor, ordered = FALSE)
set.seed(123)
churn_split <- initial_split(attrit, prop = .7, strata = "Attrition")
churn_train <- training(churn_split)

# import MNIST training data
mnist <- dslabs::read_mnist()
names(mnist)
```

```{r knn-ames-train, echo=TRUE}
# stratified sampling with the rsample package
set.seed(123)
split  <- rsample::initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train  <- rsample::training(split)
```

## Measuring similarity

Figure 8.1

```{r map-homes, fig.cap="The 10 nearest neighbors (blue) whose home attributes most closely resemble the house of interest (red).", echo=TRUE}
library(ggmap)
library(recipes)
df <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_nzv(all_nominal()) %>%
  step_integer(matches("Qual|Cond|QC|Qu")) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
  prep(training = ames_train, retain = TRUE) %>%
  juice() %>%
  select(-Sale_Price)

home <- 30
k = 10
index <- as.vector(FNN::knnx.index(df[-home, ], df[home, ], k = k))
knn_homes <- ames_train[c(home, index), ]

knn_homes %>% 
  select(Longitude, Latitude) %>%
  mutate(desc = factor(c('House of interest', rep('Closest neighbors', k)), 
                       levels = c('House of interest', 'Closest neighbors'))) %>%
  qmplot(Longitude, Latitude, data = ., 
         maptype = "toner-background", darken = .7, color = desc, size = I(2.5)) + 
  theme(legend.position = "top",
        legend.title = element_blank())
```

### Distance measures

```{r distance-btwn-two-houses}
(two_houses <- ames_train[1:2, c("Gr_Liv_Area", "Year_Built")])

# Euclidean
dist(two_houses, method = "euclidean")

# Manhattan
dist(two_houses, method = "manhattan")
```

Figure 8.2:

```{r difference-btwn-distance-measures, echo=TRUE, fig.height=3, fig.cap="Euclidean (A) versus Manhattan (B) distance."}
p1 <- ggplot(two_houses, aes(Gr_Liv_Area, Year_Built)) +
  geom_point() +
  geom_line(lty = "dashed") +
  ggtitle("(A) Euclidean distance")
  

p2 <- ggplot(two_houses, aes(Gr_Liv_Area, Year_Built)) +
  geom_point() +
  geom_step(lty = "dashed") +
  ggtitle("(B) Manhattan distance")

gridExtra::grid.arrange(p1, p2, nrow = 1)
```

### Pre-processing

```{r scale-impacts-distance-hidden, echo=TRUE}
home1 <- ames %>%
  mutate(id = row_number()) %>%
  select(Bedroom_AbvGr, Year_Built, id) %>%
  filter(Bedroom_AbvGr == 4 & Year_Built == 2008) %>%
  slice(1) %>%
  mutate(home = "home1") %>%
  select(home, everything())

home2 <- ames %>%
  mutate(id = row_number()) %>%
  select(Bedroom_AbvGr, Year_Built, id) %>%
  filter(Bedroom_AbvGr == 2 & Year_Built == 2008) %>%
  slice(1) %>%
  mutate(home = "home2") %>%
  select(home, everything())

home3 <- ames %>%
  mutate(id = row_number()) %>%
  select(Bedroom_AbvGr, Year_Built, id) %>%
  filter(Bedroom_AbvGr == 3 & Year_Built == 1998) %>%
  slice(1) %>%
  mutate(home = "home3") %>%
  select(home, everything())
```

```{r scale-impacts-distance}
home1
home2
home3
```

```{r scale-impacts-distance2}
features <- c("Bedroom_AbvGr", "Year_Built")

# distance between home 1 and 2
dist(rbind(home1[,features], home2[,features]))

# distance between home 1 and 3
dist(rbind(home1[,features], home3[,features]))
```

```{r scaling, echo=TRUE}
scaled_ames <- recipe(Sale_Price ~ ., ames_train) %>%
  step_center(all_numeric()) %>%
  step_scale(all_numeric()) %>%
  prep(training = ames, retain = TRUE) %>%
  juice()

home1_std <- scaled_ames %>%
  mutate(id = row_number()) %>%
  filter(id == home1$id) %>%
  select(Bedroom_AbvGr, Year_Built, id) %>%
  mutate(home = "home1") %>%
  select(home, everything())

home2_std <- scaled_ames %>%
  mutate(id = row_number()) %>%
  filter(id == home2$id) %>%
  select(Bedroom_AbvGr, Year_Built, id) %>%
  mutate(home = "home2") %>%
  select(home, everything())

home3_std <- scaled_ames %>%
  mutate(id = row_number()) %>%
  filter(id == home3$id) %>%
  select(Bedroom_AbvGr, Year_Built, id) %>%
  mutate(home = "home3") %>%
  select(home, everything())
```

```{r scale-impacts-distance3}
home1_std
home2_std
home3_std

# distance between home 1 and 2
dist(rbind(home1_std[,features], home2_std[,features]))

# distance between home 1 and 3
dist(rbind(home1_std[,features], home3_std[,features]))
```

## Choosing _k_

```{r range-k-values, fig.height=3, fig.cap="Cross validated search grid results for Attrition training data where 20 values between 1 and 343 are assessed for k. When k = 1, the predicted value is based on a single observation that is closest to the target sample and when k = 343, the predicted value is based on the response with the largest proportion for 1/3 of the training sample."}
# Create blueprint
blueprint <- recipe(Attrition ~ ., data = churn_train) %>%
  step_nzv(all_nominal()) %>%
  step_integer(contains("Satisfaction")) %>%
  step_integer(WorkLifeBalance) %>%
  step_integer(JobInvolvement) %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes())

# Create a resampling method
cv <- trainControl(
  method = "repeatedcv", 
  number = 10, 
  repeats = 5,
  classProbs = TRUE,                 
  summaryFunction = twoClassSummary
)

# Create a hyperparameter grid search
hyper_grid <- expand.grid(
  k = floor(seq(1, nrow(churn_train)/3, length.out = 20))
)

# Fit knn model and perform grid search
knn_grid <- train(
  blueprint, 
  data = churn_train, 
  method = "knn", 
  trControl = cv, 
  tuneGrid = hyper_grid,
  metric = "ROC"
)

ggplot(knn_grid)
```


## MNIST example 

```{r mnist-subsample}
set.seed(123)
index <- sample(nrow(mnist$train$images), size = 10000)
mnist_x <- mnist$train$images[index, ]
mnist_y <- factor(mnist$train$labels[index])
```

```{r mnist-plot-variance, fig.height=3, fig.cap="Distribution of variability across the MNIST features.  We see a significant number of zero variance features that should be removed."}
mnist_x %>%
  as.data.frame() %>%
  map_df(sd) %>%
  gather(feature, sd) %>%
  ggplot(aes(sd)) +
  geom_histogram(binwidth = 1)
```

Figure 8.5:

```{r mnist-plot-nzv-feature-image, echo=TRUE, fig.width=8, fig.height=3.5, fig.cap="Example images (A)-(C) from our data set and (D) highlights near-zero variance features around the edges of our images."}
nzv <- nearZeroVar(mnist_x)
par(mfrow = c(1, 4))
i <- 2
image(1:28, 1:28, matrix(mnist$test$images[i,], nrow=28)[ , 28:1], 
      col = gray(seq(0, 1, 0.05)), xlab = "", ylab="", 
      xaxt="n", yaxt="n", main = "(A) Example image \nfor digit 2")
i <- 7
image(1:28, 1:28, matrix(mnist$test$images[i,], nrow=28)[ , 28:1], 
      col = gray(seq(0, 1, 0.05)), xlab = "", ylab="", 
      xaxt="n", yaxt="n", main = "(B) Example image \nfor digit 4")
i <- 9
image(1:28, 1:28, matrix(mnist$test$images[i,], nrow=28)[ , 28:1], 
      col = gray(seq(0, 1, 0.05)), xlab = "", ylab="", 
      xaxt="n", yaxt="n", main = "(C) Example image \nfor digit 5")
image(matrix(!(1:784 %in% nzv), 28, 28), col = gray(seq(0, 1, 0.05)), 
      xaxt="n", yaxt="n", main = "(D) Typical variability \nin images.")
```

```{r prep-mnist-data}
# Rename features
colnames(mnist_x) <- paste0("V", 1:ncol(mnist_x))

# Remove near zero variance features manually
nzv <- nearZeroVar(mnist_x)
index <- setdiff(1:ncol(mnist_x), nzv)
mnist_x <- mnist_x[, index]
```

```{r mnist-initial-model, fig.height=3, fig.cap="KNN search grid results for the MNIST data"}
# Use train/validate resampling method
cv <- trainControl(
  method = "LGOCV", 
  p = 0.7,
  number = 1,
  savePredictions = TRUE
)

# Create a hyperparameter grid search
hyper_grid <- expand.grid(k = seq(3, 25, by = 2))

# Execute grid search
knn_mnist <- train(
  mnist_x,
  mnist_y,
  method = "knn",
  tuneGrid = hyper_grid,
  preProc = c("center", "scale"),
  trControl = cv
)

ggplot(knn_mnist)
```

```{r mnist-class-results}
# Create confusion matrix
cm <- confusionMatrix(knn_mnist$pred$pred, knn_mnist$pred$obs)
cm$byClass[, c(1:2, 11)]  # sensitivity, specificity, & accuracy
```

```{r mnist-vi}
# Top 20 most important features
vi <- varImp(knn_mnist)
vi
```

```{r plot-mnist-vi, fig.width=4, fig.height=4, fig.cap="Image heat map showing which features, on average, are most influential across all response classes for our KNN model."}
# Get median value for feature importance
imp <- vi$importance %>%
  rownames_to_column(var = "feature") %>%
  gather(response, imp, -feature) %>%
  group_by(feature) %>%
  summarize(imp = median(imp))

# Create tibble for all edge pixels
edges <- tibble(
  feature = paste0("V", nzv),
  imp = 0
)

# Combine and plot
imp <- rbind(imp, edges) %>%
  mutate(ID  = as.numeric(str_extract(feature, "\\d+"))) %>%
  arrange(ID)
image(matrix(imp$imp, 28, 28), col = gray(seq(0, 1, 0.05)), 
      xaxt="n", yaxt="n")
```

```{r correct-vs-incorrect, fig.height=7, fig.cap="Actual images from the MNIST data set along with our KNN model's predictions.  Left column illustrates a few accurate predictions and the right column illustrates a few inaccurate predictions."}
# Get a few accurate predictions
set.seed(9)
good <- knn_mnist$pred %>%
  filter(pred == obs) %>%
  sample_n(4)

# Get a few inaccurate predictions
set.seed(9)
bad <- knn_mnist$pred %>%
  filter(pred != obs) %>%
  sample_n(4)

combine <- bind_rows(good, bad)

# Get original feature set with all pixel features
set.seed(123)
index <- sample(nrow(mnist$train$images), 10000)
X <- mnist$train$images[index,]

# Plot results
par(mfrow = c(4, 2), mar=c(1, 1, 1, 1))
layout(matrix(seq_len(nrow(combine)), 4, 2, byrow = FALSE))
for(i in seq_len(nrow(combine))) {
  image(matrix(X[combine$rowIndex[i],], 28, 28)[, 28:1], 
        col = gray(seq(0, 1, 0.05)),
        main = paste("Actual:", combine$obs[i], "  ", 
                     "Predicted:", combine$pred[i]),
        xaxt="n", yaxt="n") 
}
```
