Note: Some results may differ from the hard copy book due to the changing of sampling procedures introduced in R 3.6.0. See http://bit.ly/35D1SW7 for more details. Access and run the source code for this notebook here.

Hidden chapter requirements used in the book to set the plotting theme and load packages used in hidden code chunks:

# Set global R options
options(scipen = 999)

# Set the graphical theme
ggplot2::theme_set(ggplot2::theme_light())

# Set global knitr chunk options
knitr::opts_chunk$set(
  warning = FALSE, 
  message = FALSE
)

library(rsample)

Prerequisites

This chapter leverages the following packages:

# Helper packages
library(dplyr)    # for data wrangling
library(ggplot2)  # for awesome graphics

# Modeling packages
library(ranger)   # a c++ implementation of random forest 
library(h2o)      # a java-based implementation of random forest

We’ll continue working with the ames_train data set:

# create Ames training data
set.seed(123)
ames <- AmesHousing::make_ames()
split  <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train  <- training(split)

Out-of-the-box performance

# number of features
n_features <- length(setdiff(names(ames_train), "Sale_Price"))

# train a default random forest model
ames_rf1 <- ranger(
  Sale_Price ~ ., 
  data = ames_train,
  mtry = floor(n_features / 3),
  respect.unordered.factors = "order",
  seed = 123
)

# get OOB RMSE
(default_rmse <- sqrt(ames_rf1$prediction.error))
[1] 26042.77

Hyperparameters

Number of trees

Figure 11.1:

# number of features
n_features <- ncol(ames_train) - 1

# tuning grid
tuning_grid <- expand.grid(
  trees = seq(10, 1000, by = 20),
  rmse  = NA
)

for(i in seq_len(nrow(tuning_grid))) {

  # Fit a random forest
  fit <- ranger(
    formula = Sale_Price ~ ., 
    data = ames_train, 
    num.trees = tuning_grid$trees[i],
    mtry = floor(n_features / 3),
    respect.unordered.factors = 'order',
    verbose = FALSE,
    seed = 123
  )
  
  # Extract OOB RMSE
  tuning_grid$rmse[i] <- sqrt(fit$prediction.error)
  
}

ggplot(tuning_grid, aes(trees, rmse)) +
  geom_line(size = 1) +
  ylab("OOB Error (RMSE)") +
  xlab("Number of trees")

\(m_{try}\)

Figure 11.2 (has an exceptionally long runtime):

tuning_grid <- expand.grid(
  trees = seq(10, 1000, by = 20),
  mtry  = floor(c(seq(2, 80, length.out = 5), 26)),
  rmse  = NA
)

for(i in seq_len(nrow(tuning_grid))) {
  fit <- ranger(
  formula    = Sale_Price ~ ., 
  data       = ames_train, 
  num.trees  = tuning_grid$trees[i],
  mtry       = tuning_grid$mtry[i],
  respect.unordered.factors = 'order',
  verbose    = FALSE,
  seed       = 123
)
  
  tuning_grid$rmse[i] <- sqrt(fit$prediction.error)
  
}

labels <- tuning_grid %>%
  filter(trees == 990) %>%
  mutate(mtry = as.factor(mtry))

tuning_grid %>%
  mutate(mtry = as.factor(mtry)) %>%
  ggplot(aes(trees, rmse, color = mtry)) +
  geom_line(size = 1, show.legend = FALSE) +
  ggrepel::geom_text_repel(data = labels, aes(trees, rmse, label = mtry), nudge_x = 50, show.legend = FALSE) +
  ylab("OOB Error (RMSE)") +
  xlab("Number of trees")

Tree complexity

Figure 11.3:

tuning_grid <- expand.grid(
  min.node.size = 1:20,
  run_time  = NA,
  rmse = NA
)

for(i in seq_len(nrow(tuning_grid))) {
  fit_time <- system.time({
    fit <- ranger(
    formula    = Sale_Price ~ ., 
    data       = ames_train, 
    num.trees  = 1000,
    mtry       = 26,
    min.node.size = tuning_grid$min.node.size[i],
    respect.unordered.factors = 'order',
    verbose    = FALSE,
    seed       = 123
  )
})
  
  tuning_grid$run_time[i] <- fit_time[[3]]
  tuning_grid$rmse[i] <- sqrt(fit$prediction.error)
  
}

min_node_size <- tuning_grid %>% 
  mutate(
    error_first = first(rmse),
    runtime_first = first(run_time),
    `Error Growth` = (rmse / error_first) - 1,
    `Run Time Reduction` = (run_time / runtime_first) - 1
    )

p1 <-  ggplot(min_node_size, aes(min.node.size, `Error Growth`)) +
  geom_smooth(size = 1, se = FALSE, color = "black") +
  scale_y_continuous("Percent growth in error estimate", labels = scales::percent) +
  xlab("Minimum node size") +
  ggtitle("A) Impact to error estimate")

p2 <-  ggplot(min_node_size, aes(min.node.size, `Run Time Reduction`)) +
  geom_smooth(size = 1, se = FALSE, color = "black") +
  scale_y_continuous("Reduction in run time", labels = scales::percent) +
  xlab("Minimum node size") +
  ggtitle("B) Impact to run time")

gridExtra::grid.arrange(p1, p2, nrow = 1)

Sampling scheme

Figure 11.4:

tuning_grid <- expand.grid(
  sample.fraction = seq(.05, .95, by = .05),
  replace  = c(TRUE, FALSE),
  rmse = NA
)

for(i in seq_len(nrow(tuning_grid))) {
  fit <- ranger(
    formula    = Sale_Price ~ ., 
    data       = ames_train, 
    num.trees  = 1000,
    mtry       = 26,
    sample.fraction = tuning_grid$sample.fraction[i],
    replace = tuning_grid$replace[i],
    respect.unordered.factors = 'order',
    verbose    = FALSE,
    seed       = 123
  )

  tuning_grid$rmse[i] <- sqrt(fit$prediction.error)
  
}

tuning_grid %>%
  ggplot(aes(sample.fraction, rmse, color = replace)) +
  geom_line(size = 1) +
  scale_x_continuous("Sample Fraction", breaks = seq(.1, .9, by = .1), labels = scales::percent) +
  ylab("OOB Error (RMSE)") +
  scale_color_discrete("Sample with Replacement") +
  theme(legend.position = c(0.8, 0.85),
        legend.key = element_blank(),
        legend.background = element_blank())

Tuning strategies

This grid search takes approximately 2 minutes.

# create hyperparameter grid
hyper_grid <- expand.grid(
  mtry = floor(n_features * c(.05, .15, .25, .333, .4)),
  min.node.size = c(1, 3, 5, 10), 
  replace = c(TRUE, FALSE),                               
  sample.fraction = c(.5, .63, .8),                       
  rmse = NA                                               
)

# execute full cartesian grid search
for(i in seq_len(nrow(hyper_grid))) {
  # fit model for ith hyperparameter combination
  fit <- ranger(
    formula         = Sale_Price ~ ., 
    data            = ames_train, 
    num.trees       = n_features * 10,
    mtry            = hyper_grid$mtry[i],
    min.node.size   = hyper_grid$min.node.size[i],
    replace         = hyper_grid$replace[i],
    sample.fraction = hyper_grid$sample.fraction[i],
    verbose         = FALSE,
    seed            = 123,
    respect.unordered.factors = 'order',
  )
  # export OOB error 
  hyper_grid$rmse[i] <- sqrt(fit$prediction.error)
}

# assess top 10 models
hyper_grid %>%
  arrange(rmse) %>%
  mutate(perc_gain = (default_rmse - rmse) / default_rmse * 100) %>%
  head(10)
h2o.no_progress()
h2o.init(max_mem_size = "5g")

H2O is memory intensive so you should not run the code that follows in RStudio Cloud.

# convert training data to h2o object
train_h2o <- as.h2o(ames_train)

# set the response column to Sale_Price
response <- "Sale_Price"

# set the predictor names
predictors <- setdiff(colnames(ames_train), response)
h2o_rf1 <- h2o.randomForest(
    x = predictors, 
    y = response,
    training_frame = train_h2o, 
    ntrees = n_features * 10,
    seed = 123
)

h2o_rf1
Model Details:
==============

H2ORegressionModel: drf
Model ID:  DRF_model_R_1577101028169_1 
Model Summary: 


H2ORegressionMetrics: drf
** Reported on training data. **
** Metrics reported on Out-Of-Bag training samples **

MSE:  680109951
RMSE:  26078.92
MAE:  15451.62
RMSLE:  0.1349113
Mean Residual Deviance :  680109951
# hyperparameter grid
hyper_grid <- list(
  mtries = floor(n_features * c(.05, .15, .25, .333, .4)),
  min_rows = c(1, 3, 5, 10),
  max_depth = c(10, 20, 30),
  sample_rate = c(.55, .632, .70, .80)
)

# random grid search strategy
search_criteria <- list(
  strategy = "RandomDiscrete",
  stopping_metric = "mse",
  stopping_tolerance = 0.001,   # stop if improvement is < 0.1%
  stopping_rounds = 10,         # over the last 10 models
  max_runtime_secs = 60*5      # or stop search after 5 min.
)

This grid search takes 5 minutes.

# perform grid search 
random_grid <- h2o.grid(
  algorithm = "randomForest",
  grid_id = "rf_random_grid",
  x = predictors, 
  y = response, 
  training_frame = train_h2o,
  hyper_params = hyper_grid,
  ntrees = n_features * 10,
  seed = 123,
  stopping_metric = "RMSE",   
  stopping_rounds = 10,           # stop if last 10 trees added 
  stopping_tolerance = 0.005,     # don't improve RMSE by 0.5%
  search_criteria = search_criteria
)
# collect the results and sort by our model performance metric 
# of choice
random_grid_perf <- h2o.getGrid(
  grid_id = "rf_random_grid", 
  sort_by = "mse", 
  decreasing = FALSE
)
random_grid_perf
H2O Grid Details
================

Grid ID: rf_random_grid 
Used hyper parameters: 
  -  max_depth 
  -  min_rows 
  -  mtries 
  -  sample_rate 
Number of models: 107 
Number of failed models: 0 

Hyper-Parameter Search Summary: ordered by increasing mse

---

Feature interpretation

# re-run model with impurity-based variable importance
rf_impurity <- ranger(
  formula = Sale_Price ~ ., 
  data = ames_train, 
  num.trees = 2000,
  mtry = 32,
  min.node.size = 1,
  sample.fraction = .80,
  replace = FALSE,
  importance = "impurity",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)

# re-run model with permutation-based variable importance
rf_permutation <- ranger(
  formula = Sale_Price ~ ., 
  data = ames_train, 
  num.trees = 2000,
  mtry = 32,
  min.node.size = 1,
  sample.fraction = .80,
  replace = FALSE,
  importance = "permutation",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)
p1 <- vip::vip(rf_impurity, num_features = 25, bar = FALSE)
p2 <- vip::vip(rf_permutation, num_features = 25, bar = FALSE)

gridExtra::grid.arrange(p1, p2, nrow = 1)

---
title: "Chapter 11: Random Forests"
output: html_notebook
---

__Note__: Some results may differ from the hard copy book due to the changing of sampling procedures introduced in R 3.6.0. See http://bit.ly/35D1SW7 for more details. Access and run the source code for this notebook [here](https://rstudio.cloud/project/801185). 

Hidden chapter requirements used in the book to set the plotting theme and load packages used in hidden code chunks:

```{r setup}
# Set global R options
options(scipen = 999)

# Set the graphical theme
ggplot2::theme_set(ggplot2::theme_light())

# Set global knitr chunk options
knitr::opts_chunk$set(
  warning = FALSE, 
  message = FALSE
)

library(rsample)
```

## Prerequisites

This chapter leverages the following packages:

```{r rf-pkg-req}
# Helper packages
library(dplyr)    # for data wrangling
library(ggplot2)  # for awesome graphics

# Modeling packages
library(ranger)   # a c++ implementation of random forest 
library(h2o)      # a java-based implementation of random forest
```

We'll continue working with the `ames_train` data set:

```{r rf-ames-train, echo=TRUE}
# create Ames training data
set.seed(123)
ames <- AmesHousing::make_ames()
split  <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train  <- training(split)
```

## Out-of-the-box performance

```{r out-of-box-rf}
# number of features
n_features <- length(setdiff(names(ames_train), "Sale_Price"))

# train a default random forest model
ames_rf1 <- ranger(
  Sale_Price ~ ., 
  data = ames_train,
  mtry = floor(n_features / 3),
  respect.unordered.factors = "order",
  seed = 123
)

# get OOB RMSE
(default_rmse <- sqrt(ames_rf1$prediction.error))
```

## Hyperparameters

### Number of trees

Figure 11.1:

```{r tuning-trees, echo=TRUE, fig.cap="The Ames data has 80 features and starting with 10 times the number of features typically ensures the error estimate converges.", fig.height=3}
# number of features
n_features <- ncol(ames_train) - 1

# tuning grid
tuning_grid <- expand.grid(
  trees = seq(10, 1000, by = 20),
  rmse  = NA
)

for(i in seq_len(nrow(tuning_grid))) {

  # Fit a random forest
  fit <- ranger(
    formula = Sale_Price ~ ., 
    data = ames_train, 
    num.trees = tuning_grid$trees[i],
    mtry = floor(n_features / 3),
    respect.unordered.factors = 'order',
    verbose = FALSE,
    seed = 123
  )
  
  # Extract OOB RMSE
  tuning_grid$rmse[i] <- sqrt(fit$prediction.error)
  
}

ggplot(tuning_grid, aes(trees, rmse)) +
  geom_line(size = 1) +
  ylab("OOB Error (RMSE)") +
  xlab("Number of trees")
```

### $m_{try}$ 

Figure 11.2 (has an exceptionally long runtime):

```{r tuning-mtry, echo=TRUE, fig.cap="For the Ames data, an mtry value slightly lower (21) than the default (26) improves performance.", fig.height=3.5}
tuning_grid <- expand.grid(
  trees = seq(10, 1000, by = 20),
  mtry  = floor(c(seq(2, 80, length.out = 5), 26)),
  rmse  = NA
)

for(i in seq_len(nrow(tuning_grid))) {
  fit <- ranger(
  formula    = Sale_Price ~ ., 
  data       = ames_train, 
  num.trees  = tuning_grid$trees[i],
  mtry       = tuning_grid$mtry[i],
  respect.unordered.factors = 'order',
  verbose    = FALSE,
  seed       = 123
)
  
  tuning_grid$rmse[i] <- sqrt(fit$prediction.error)
  
}

labels <- tuning_grid %>%
  filter(trees == 990) %>%
  mutate(mtry = as.factor(mtry))

tuning_grid %>%
  mutate(mtry = as.factor(mtry)) %>%
  ggplot(aes(trees, rmse, color = mtry)) +
  geom_line(size = 1, show.legend = FALSE) +
  ggrepel::geom_text_repel(data = labels, aes(trees, rmse, label = mtry), nudge_x = 50, show.legend = FALSE) +
  ylab("OOB Error (RMSE)") +
  xlab("Number of trees")
```

### Tree complexity

Figure 11.3:

```{r tuning-node-size, echo=TRUE, fig.cap="Increasing node size to reduce tree complexity will often have a larger impact on computation speed (right) than on your error estimate.", fig.width=10}
tuning_grid <- expand.grid(
  min.node.size = 1:20,
  run_time  = NA,
  rmse = NA
)

for(i in seq_len(nrow(tuning_grid))) {
  fit_time <- system.time({
    fit <- ranger(
    formula    = Sale_Price ~ ., 
    data       = ames_train, 
    num.trees  = 1000,
    mtry       = 26,
    min.node.size = tuning_grid$min.node.size[i],
    respect.unordered.factors = 'order',
    verbose    = FALSE,
    seed       = 123
  )
})
  
  tuning_grid$run_time[i] <- fit_time[[3]]
  tuning_grid$rmse[i] <- sqrt(fit$prediction.error)
  
}

min_node_size <- tuning_grid %>% 
  mutate(
    error_first = first(rmse),
    runtime_first = first(run_time),
    `Error Growth` = (rmse / error_first) - 1,
    `Run Time Reduction` = (run_time / runtime_first) - 1
    )

p1 <-  ggplot(min_node_size, aes(min.node.size, `Error Growth`)) +
  geom_smooth(size = 1, se = FALSE, color = "black") +
  scale_y_continuous("Percent growth in error estimate", labels = scales::percent) +
  xlab("Minimum node size") +
  ggtitle("A) Impact to error estimate")

p2 <-  ggplot(min_node_size, aes(min.node.size, `Run Time Reduction`)) +
  geom_smooth(size = 1, se = FALSE, color = "black") +
  scale_y_continuous("Reduction in run time", labels = scales::percent) +
  xlab("Minimum node size") +
  ggtitle("B) Impact to run time")

gridExtra::grid.arrange(p1, p2, nrow = 1)
```

### Sampling scheme

Figure 11.4:

```{r tuning-sampling-scheme, echo=TRUE, fig.cap="The Ames data has several imbalanced categorical features such as neighborhood, zoning, overall quality, and more. Consequently, sampling without replacement appears to improve performance as it leads to less biased split variable selection and more uncorrelated trees.", fig.height=3}
tuning_grid <- expand.grid(
  sample.fraction = seq(.05, .95, by = .05),
  replace  = c(TRUE, FALSE),
  rmse = NA
)

for(i in seq_len(nrow(tuning_grid))) {
  fit <- ranger(
    formula    = Sale_Price ~ ., 
    data       = ames_train, 
    num.trees  = 1000,
    mtry       = 26,
    sample.fraction = tuning_grid$sample.fraction[i],
    replace = tuning_grid$replace[i],
    respect.unordered.factors = 'order',
    verbose    = FALSE,
    seed       = 123
  )

  tuning_grid$rmse[i] <- sqrt(fit$prediction.error)
  
}

tuning_grid %>%
  ggplot(aes(sample.fraction, rmse, color = replace)) +
  geom_line(size = 1) +
  scale_x_continuous("Sample Fraction", breaks = seq(.1, .9, by = .1), labels = scales::percent) +
  ylab("OOB Error (RMSE)") +
  scale_color_discrete("Sample with Replacement") +
  theme(legend.position = c(0.8, 0.85),
        legend.key = element_blank(),
        legend.background = element_blank())
```

## Tuning strategies 

___This grid search takes approximately 2 minutes.___

```{r ranger-grid-search}
# create hyperparameter grid
hyper_grid <- expand.grid(
  mtry = floor(n_features * c(.05, .15, .25, .333, .4)),
  min.node.size = c(1, 3, 5, 10), 
  replace = c(TRUE, FALSE),                               
  sample.fraction = c(.5, .63, .8),                       
  rmse = NA                                               
)

# execute full cartesian grid search
for(i in seq_len(nrow(hyper_grid))) {
  # fit model for ith hyperparameter combination
  fit <- ranger(
    formula         = Sale_Price ~ ., 
    data            = ames_train, 
    num.trees       = n_features * 10,
    mtry            = hyper_grid$mtry[i],
    min.node.size   = hyper_grid$min.node.size[i],
    replace         = hyper_grid$replace[i],
    sample.fraction = hyper_grid$sample.fraction[i],
    verbose         = FALSE,
    seed            = 123,
    respect.unordered.factors = 'order',
  )
  # export OOB error 
  hyper_grid$rmse[i] <- sqrt(fit$prediction.error)
}

# assess top 10 models
hyper_grid %>%
  arrange(rmse) %>%
  mutate(perc_gain = (default_rmse - rmse) / default_rmse * 100) %>%
  head(10)
```

```{r h2o-init}
h2o.no_progress()
h2o.init(max_mem_size = "5g")
```

H2O is memory intensive so you should not run the code that follows in RStudio Cloud.

```{r h2o-objects}
# convert training data to h2o object
train_h2o <- as.h2o(ames_train)

# set the response column to Sale_Price
response <- "Sale_Price"

# set the predictor names
predictors <- setdiff(colnames(ames_train), response)
```

```{r h2o-baseline}
h2o_rf1 <- h2o.randomForest(
    x = predictors, 
    y = response,
    training_frame = train_h2o, 
    ntrees = n_features * 10,
    seed = 123
)

h2o_rf1
```

```{r h20-random-search-setup}
# hyperparameter grid
hyper_grid <- list(
  mtries = floor(n_features * c(.05, .15, .25, .333, .4)),
  min_rows = c(1, 3, 5, 10),
  max_depth = c(10, 20, 30),
  sample_rate = c(.55, .632, .70, .80)
)

# random grid search strategy
search_criteria <- list(
  strategy = "RandomDiscrete",
  stopping_metric = "mse",
  stopping_tolerance = 0.001,   # stop if improvement is < 0.1%
  stopping_rounds = 10,         # over the last 10 models
  max_runtime_secs = 60*5      # or stop search after 5 min.
)
```

___This grid search takes 5 minutes.___

```{r h20-random-search-execution}
# perform grid search 
random_grid <- h2o.grid(
  algorithm = "randomForest",
  grid_id = "rf_random_grid",
  x = predictors, 
  y = response, 
  training_frame = train_h2o,
  hyper_params = hyper_grid,
  ntrees = n_features * 10,
  seed = 123,
  stopping_metric = "RMSE",   
  stopping_rounds = 10,           # stop if last 10 trees added 
  stopping_tolerance = 0.005,     # don't improve RMSE by 0.5%
  search_criteria = search_criteria
)
```

```{r h2o-grid-search-results}
# collect the results and sort by our model performance metric 
# of choice
random_grid_perf <- h2o.getGrid(
  grid_id = "rf_random_grid", 
  sort_by = "mse", 
  decreasing = FALSE
)
random_grid_perf
```

## Feature interpretation

```{r feature-importance}
# re-run model with impurity-based variable importance
rf_impurity <- ranger(
  formula = Sale_Price ~ ., 
  data = ames_train, 
  num.trees = 2000,
  mtry = 32,
  min.node.size = 1,
  sample.fraction = .80,
  replace = FALSE,
  importance = "impurity",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)

# re-run model with permutation-based variable importance
rf_permutation <- ranger(
  formula = Sale_Price ~ ., 
  data = ames_train, 
  num.trees = 2000,
  mtry = 32,
  min.node.size = 1,
  sample.fraction = .80,
  replace = FALSE,
  importance = "permutation",
  respect.unordered.factors = "order",
  verbose = FALSE,
  seed  = 123
)
```

```{r feature-importance-plot, fig.cap="Top 25 most important variables based on impurity (left) and permutation (right).", fig.height=4.5, fig.width=10}
p1 <- vip::vip(rf_impurity, num_features = 25, bar = FALSE)
p2 <- vip::vip(rf_permutation, num_features = 25, bar = FALSE)

gridExtra::grid.arrange(p1, p2, nrow = 1)
```
