Note: Some results may differ from the hard copy book due to the change in sampling procedures introduced in R 3.6.0. See http://bit.ly/35D1SW7 for more details. Access and run the source code for this notebook here. Due to output size, most of this chapter’s code chunks should not be run on RStudio Cloud.
Hidden chapter requirements used in the book to set the plotting theme and load packages used in hidden code chunks:
Prerequisites
This chapter leverages the following packages, with the emphasis on h2o:
# Helper packages
library(rsample) # for creating our train-test splits
library(recipes) # for minor feature engineering tasks
# Modeling packages
library(h2o) # for fitting stacked models
h2o.no_progress()
h2o.init()
To illustrate key concepts we continue with the Ames housing example from previous chapters:
# Load and split the Ames housing data
ames <- AmesHousing::make_ames()
set.seed(123) # for reproducibility
split <- initial_split(ames, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)

# Make sure we have consistent categorical levels: step_other() is applied to
# every nominal column with a 5% threshold
blueprint <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal(), threshold = 0.05)

# Estimate the recipe once on the training data and reuse the same prepped
# object for both frames instead of prepping it twice
prepped_blueprint <- prep(blueprint, training = ames_train, retain = TRUE)

# Create training & test sets for h2o
train_h2o <- prepped_blueprint %>%
  juice() %>%
  as.h2o()
test_h2o <- prepped_blueprint %>%
  bake(new_data = ames_test) %>%
  as.h2o()

# Get response and feature names
Y <- "Sale_Price"
X <- setdiff(names(ames_train), Y)
Stacking existing models
# Train & cross-validate a GLM model
best_glm <- h2o.glm(
x = X, y = Y, training_frame = train_h2o, alpha = 0.1,
remove_collinear_columns = TRUE, nfolds = 10, fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE, seed = 123
)
# Train & cross-validate a RF model
best_rf <- h2o.randomForest(
x = X, y = Y, training_frame = train_h2o, ntrees = 1000, mtries = 20,
max_depth = 30, min_rows = 1, sample_rate = 0.8, nfolds = 10,
fold_assignment = "Modulo", keep_cross_validation_predictions = TRUE,
seed = 123, stopping_rounds = 50, stopping_metric = "RMSE",
stopping_tolerance = 0
)
# Train & cross-validate a GBM model
best_gbm <- h2o.gbm(
x = X, y = Y, training_frame = train_h2o, ntrees = 5000, learn_rate = 0.01,
max_depth = 7, min_rows = 5, sample_rate = 0.8, nfolds = 10,
fold_assignment = "Modulo", keep_cross_validation_predictions = TRUE,
seed = 123, stopping_rounds = 50, stopping_metric = "RMSE",
stopping_tolerance = 0
)
# Train & cross-validate an XGBoost model
best_xgb <- h2o.xgboost(
x = X, y = Y, training_frame = train_h2o, ntrees = 5000, learn_rate = 0.05,
max_depth = 3, min_rows = 3, sample_rate = 0.8, categorical_encoding = "Enum",
nfolds = 10, fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE, seed = 123, stopping_rounds = 50,
stopping_metric = "RMSE", stopping_tolerance = 0
)
# Train a stacked tree ensemble
ensemble_tree <- h2o.stackedEnsemble(
x = X, y = Y, training_frame = train_h2o, model_id = "my_tree_ensemble",
base_models = list(best_glm, best_rf, best_gbm, best_xgb),
metalearner_algorithm = "drf"
)
# Compute the RMSE of an H2O model on a hold-out frame.
#
# model:   a fitted H2O model (base learner or stacked ensemble).
# newdata: H2OFrame to score against; defaults to the test set `test_h2o`.
# Returns the RMSE as a single number.
get_rmse <- function(model, newdata = test_h2o) {
  results <- h2o.performance(model, newdata = newdata)
  h2o.rmse(results) # public accessor instead of reaching into @metrics
}

# Get results from base learners
list(best_glm, best_rf, best_gbm, best_xgb) %>%
  purrr::map_dbl(get_rmse)
[1] 36834.07 23635.47 19236.09 19725.81
# Stacked results, computed with the same helper used for the base learners
get_rmse(ensemble_tree)
[1] 20446.95
# Extract a model's cross-validation holdout predictions as a plain R vector
cv_holdout_preds <- function(model) {
  as.vector(h2o.cross_validation_holdout_predictions(model))
}

# Pairwise correlation of the base learners' holdout predictions
data.frame(
  GLM_pred = cv_holdout_preds(best_glm),
  RF_pred = cv_holdout_preds(best_rf),
  GBM_pred = cv_holdout_preds(best_gbm),
  XGB_pred = cv_holdout_preds(best_xgb)
) %>% cor()
GLM_pred RF_pred GBM_pred XGB_pred
GLM_pred 1.0000000 0.9600988 0.9537208 0.9539898
RF_pred 0.9600988 1.0000000 0.9936554 0.9851487
GBM_pred 0.9537208 0.9936554 1.0000000 0.9915265
XGB_pred 0.9539898 0.9851487 0.9915265 1.0000000
Stacking a grid search
# Define GBM hyperparameter grid (3 * 3 * 3 * 2 * 3 * 3 = 486 combinations)
hyper_grid <- list(
  max_depth = c(1, 3, 5),
  min_rows = c(1, 5, 10),
  learn_rate = c(0.01, 0.05, 0.1),
  learn_rate_annealing = c(0.99, 1),
  sample_rate = c(0.5, 0.75, 1),
  col_sample_rate = c(0.8, 0.9, 1)
)

# Define random grid search criteria: sample at most 25 of the combinations.
# The search gets its own seed so the sampled combinations are reproducible;
# the `seed` passed to h2o.grid() below is a model parameter.
search_criteria <- list(
  strategy = "RandomDiscrete",
  max_models = 25,
  seed = 123
)

# Build random grid search; every model is 10-fold cross-validated with the
# same "Modulo" fold assignment and keeps its holdout predictions
random_grid <- h2o.grid(
  algorithm = "gbm", grid_id = "gbm_grid", x = X, y = Y,
  training_frame = train_h2o, hyper_params = hyper_grid,
  search_criteria = search_criteria, ntrees = 5000, stopping_metric = "RMSE",
  stopping_rounds = 10, stopping_tolerance = 0, nfolds = 10,
  fold_assignment = "Modulo", keep_cross_validation_predictions = TRUE,
  seed = 123
)
# Sort results by RMSE
h2o.getGrid(
grid_id = "gbm_grid",
sort_by = "rmse"
)
H2O Grid Details
================
Grid ID: gbm_grid
Used hyper parameters:
- col_sample_rate
- learn_rate
- learn_rate_annealing
- max_depth
- min_rows
- sample_rate
Number of models: 25
Number of failed models: 0
Hyper-Parameter Search Summary: ordered by increasing rmse
---
# Grab the model_id for the top model. Rank the grid by RMSE explicitly rather
# than relying on whatever order `random_grid@model_ids` happens to be in.
grid_by_rmse <- h2o.getGrid(grid_id = "gbm_grid", sort_by = "rmse")
best_model_id <- grid_by_rmse@model_ids[[1]]
best_model <- h2o.getModel(best_model_id)

# Evaluate the selected model on the test frame
h2o.performance(best_model, newdata = test_h2o)
H2ORegressionMetrics: gbm
MSE: 387885567
RMSE: 19694.81
MAE: 12982.36
RMSLE: 0.1244279
Mean Residual Deviance : 387885567
# Train a stacked ensemble using the GBM grid
ensemble <- h2o.stackedEnsemble(
x = X, y = Y, training_frame = train_h2o, model_id = "ensemble_gbm_grid",
base_models = random_grid@model_ids, metalearner_algorithm = "gbm"
)
# Eval ensemble performance on a test set
h2o.performance(ensemble, newdata = test_h2o)
H2ORegressionMetrics: stackedensemble
MSE: 379961705
RMSE: 19492.61
MAE: 13131.34
RMSLE: 0.1234258
Mean Residual Deviance : 379961705
Automated machine learning
# Use AutoML to find a list of candidate models (i.e., leaderboard)
auto_ml <- h2o.automl(
x = X, y = Y, training_frame = train_h2o, nfolds = 5,
max_runtime_secs = 60 * 120, max_models = 50,
keep_cross_validation_predictions = TRUE, sort_metric = "RMSE", seed = 123,
stopping_rounds = 50, stopping_metric = "RMSE", stopping_tolerance = 0
)
# Assess the leader board; the following truncates the results to show the top
# 25 models. You can get the top model with auto_ml@leader
auto_ml@leaderboard %>%
as.data.frame() %>%
dplyr::select(model_id, rmse) %>%
dplyr::slice(1:25)
h2o.shutdown(prompt = FALSE)
[1] TRUE
---
title: "Chapter 15: Stacked Models"
output: html_notebook
---

__Note__: Some results may differ from the hard copy book due to the change in sampling procedures introduced in R 3.6.0. See http://bit.ly/35D1SW7 for more details. Access and run the source code for this notebook [here](https://rstudio.cloud/project/801185). Due to output size, most of this
chapter's code chunks should not be run on RStudio Cloud.

Hidden chapter requirements used in the book to set the plotting theme and load packages used in hidden code chunks:

```{r setup, include=FALSE}
# Chunk defaults: hide messages and warnings, and do not cache chunk results
knitr::opts_chunk$set(message = FALSE, warning = FALSE, cache = FALSE)
```

## Prerequisites

This chapter leverages the following packages, with the emphasis on __h2o__:

```{r pkg-req-12}
# Helper packages
library(rsample)   # for creating our train-test splits
library(recipes)   # for minor feature engineering tasks

# Modeling packages
library(h2o)       # for fitting stacked models
```

```{r}
# Turn off H2O progress bars so chunk output stays compact
h2o.no_progress()
# Connect to a running H2O instance, or start a local one
h2o.init()
```

To illustrate key concepts we continue with the Ames housing example from previous chapters:

```{r data-req-12}
# Load and split the Ames housing data
ames <- AmesHousing::make_ames()
set.seed(123)  # for reproducibility
split <- initial_split(ames, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)

# Make sure we have consistent categorical levels: step_other() is applied to
# every nominal column with a 5% threshold
blueprint <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal(), threshold = 0.05)

# Estimate the recipe once on the training data and reuse the same prepped
# object for both frames instead of prepping it twice
prepped_blueprint <- prep(blueprint, training = ames_train, retain = TRUE)

# Create training & test sets for h2o
train_h2o <- prepped_blueprint %>%
  juice() %>%
  as.h2o()
test_h2o <- prepped_blueprint %>%
  bake(new_data = ames_test) %>%
  as.h2o()

# Get response and feature names
Y <- "Sale_Price"
X <- setdiff(names(ames_train), Y)
```

## Stacking existing models

```{r}
# Base learners for the stack. Each one is fit with 10-fold cross-validation,
# the same "Modulo" fold assignment and seed, and keeps its cross-validation
# predictions.

# GLM
best_glm <- h2o.glm(
  x = X,
  y = Y,
  training_frame = train_h2o,
  alpha = 0.1,
  remove_collinear_columns = TRUE,
  nfolds = 10,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 123
)

# Random forest
best_rf <- h2o.randomForest(
  x = X,
  y = Y,
  training_frame = train_h2o,
  ntrees = 1000,
  mtries = 20,
  max_depth = 30,
  min_rows = 1,
  sample_rate = 0.8,
  nfolds = 10,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 123,
  stopping_rounds = 50,
  stopping_metric = "RMSE",
  stopping_tolerance = 0
)

# GBM
best_gbm <- h2o.gbm(
  x = X,
  y = Y,
  training_frame = train_h2o,
  ntrees = 5000,
  learn_rate = 0.01,
  max_depth = 7,
  min_rows = 5,
  sample_rate = 0.8,
  nfolds = 10,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 123,
  stopping_rounds = 50,
  stopping_metric = "RMSE",
  stopping_tolerance = 0
)

# XGBoost
best_xgb <- h2o.xgboost(
  x = X,
  y = Y,
  training_frame = train_h2o,
  ntrees = 5000,
  learn_rate = 0.05,
  max_depth = 3,
  min_rows = 3,
  sample_rate = 0.8,
  categorical_encoding = "Enum",
  nfolds = 10,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 123,
  stopping_rounds = 50,
  stopping_metric = "RMSE",
  stopping_tolerance = 0
)
```

```{r}
# Stack the four base learners, with a random forest ("drf") as metalearner
ensemble_tree <- h2o.stackedEnsemble(
  x = X,
  y = Y,
  training_frame = train_h2o,
  model_id = "my_tree_ensemble",
  base_models = list(best_glm, best_rf, best_gbm, best_xgb),
  metalearner_algorithm = "drf"
)
```

```{r}
# Compute the RMSE of an H2O model on a hold-out frame.
#
# model:   a fitted H2O model (base learner or stacked ensemble).
# newdata: H2OFrame to score against; defaults to the test set `test_h2o`.
# Returns the RMSE as a single number.
get_rmse <- function(model, newdata = test_h2o) {
  results <- h2o.performance(model, newdata = newdata)
  h2o.rmse(results)  # public accessor instead of reaching into @metrics
}

# Get results from base learners
list(best_glm, best_rf, best_gbm, best_xgb) %>%
  purrr::map_dbl(get_rmse)

# Stacked results, computed with the same helper
get_rmse(ensemble_tree)
```

```{r}
# Extract a model's cross-validation holdout predictions as a plain R vector
cv_holdout_preds <- function(model) {
  as.vector(h2o.cross_validation_holdout_predictions(model))
}

# Pairwise correlation of the base learners' holdout predictions
data.frame(
  GLM_pred = cv_holdout_preds(best_glm),
  RF_pred = cv_holdout_preds(best_rf),
  GBM_pred = cv_holdout_preds(best_gbm),
  XGB_pred = cv_holdout_preds(best_xgb)
) %>% cor()
```

## Stacking a grid search

```{r}
# Define GBM hyperparameter grid (3 * 3 * 3 * 2 * 3 * 3 = 486 combinations)
hyper_grid <- list(
  max_depth = c(1, 3, 5),
  min_rows = c(1, 5, 10),
  learn_rate = c(0.01, 0.05, 0.1),
  learn_rate_annealing = c(0.99, 1),
  sample_rate = c(0.5, 0.75, 1),
  col_sample_rate = c(0.8, 0.9, 1)
)

# Define random grid search criteria: sample at most 25 of the combinations.
# The search gets its own seed so the sampled combinations are reproducible;
# the `seed` passed to h2o.grid() below is a model parameter.
search_criteria <- list(
  strategy = "RandomDiscrete",
  max_models = 25,
  seed = 123
)

# Build random grid search; every model is 10-fold cross-validated with the
# same "Modulo" fold assignment and keeps its holdout predictions
random_grid <- h2o.grid(
  algorithm = "gbm", grid_id = "gbm_grid", x = X, y = Y,
  training_frame = train_h2o, hyper_params = hyper_grid,
  search_criteria = search_criteria, ntrees = 5000, stopping_metric = "RMSE",
  stopping_rounds = 10, stopping_tolerance = 0, nfolds = 10,
  fold_assignment = "Modulo", keep_cross_validation_predictions = TRUE,
  seed = 123
)
```

```{r}
# Print the "gbm_grid" search results ordered by RMSE
h2o.getGrid(grid_id = "gbm_grid", sort_by = "rmse")
```

```{r}
# Grab the model_id for the top model. Rank the grid by RMSE explicitly rather
# than relying on whatever order `random_grid@model_ids` happens to be in.
grid_by_rmse <- h2o.getGrid(grid_id = "gbm_grid", sort_by = "rmse")
best_model_id <- grid_by_rmse@model_ids[[1]]
best_model <- h2o.getModel(best_model_id)

# Evaluate the selected model on the test frame
h2o.performance(best_model, newdata = test_h2o)
```

```{r}
# Stack every model from the GBM grid, with a GBM as the metalearner
ensemble <- h2o.stackedEnsemble(
  x = X,
  y = Y,
  training_frame = train_h2o,
  model_id = "ensemble_gbm_grid",
  base_models = random_grid@model_ids,
  metalearner_algorithm = "gbm"
)

# Score the stacked ensemble on the test frame
h2o.performance(ensemble, newdata = test_h2o)
```

## Automated machine learning

```{r}
# Let AutoML build a leaderboard of candidate models, capped at 50 models or
# 60 * 120 seconds of run time, ranked by RMSE
auto_ml <- h2o.automl(
  x = X,
  y = Y,
  training_frame = train_h2o,
  nfolds = 5,
  max_runtime_secs = 60 * 120,
  max_models = 50,
  keep_cross_validation_predictions = TRUE,
  sort_metric = "RMSE",
  seed = 123,
  stopping_rounds = 50,
  stopping_metric = "RMSE",
  stopping_tolerance = 0
)
```

```{r}
# Show the model id and RMSE of the first 25 leaderboard rows. The single best
# model is also available as auto_ml@leader.
leaderboard_df <- as.data.frame(auto_ml@leaderboard)
leaderboard_df %>%
  dplyr::select(model_id, rmse) %>%
  head(n = 25)
```

```{r}
# Shut down the H2O instance without asking for confirmation
h2o.shutdown(prompt = FALSE)
```

