Note: Some results may differ from the hard copy book due to the change in sampling procedures introduced in R 3.6.0. See http://bit.ly/35D1SW7 for more details. Access and run the source code for this notebook here. Due to output size, most of this chapter’s code chunks should not be run on RStudio Cloud.
Hidden chapter requirements used in the book to set the plotting theme and load the packages needed by hidden code chunks:
knitr::opts_chunk$set(
message = FALSE,
warning = FALSE,
cache = FALSE
)
# Set the graphical theme
ggplot2::theme_set(ggplot2::theme_light())
# Helper packages
library(dplyr) # for data wrangling
library(ggplot2) # for awesome graphics
# Modeling packages
library(h2o) # for interfacing with H2O
library(recipes) # for ML recipes
library(rsample) # for data splitting
library(xgboost) # for fitting GBMs
# Model interpretability packages
library(pdp) # for partial dependence plots (and ICE curves)
library(vip) # for variable importance plots
library(iml) # for general IML-related functions
library(DALEX) # for general IML-related functions
library(lime) # for local interpretable model-agnostic explanations
To illustrate various concepts, we’ll continue working with the h2o version of the Ames housing data. We’ll also use the stacked ensemble model created here, which the following code re-creates.
# Connect to H2O
h2o.no_progress()
h2o.init(max_mem_size = "5g")
# Load and split Ames housing data
ames <- AmesHousing::make_ames()
set.seed(123) # for reproducibility
split <- initial_split(ames, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)
# Make sure we have consistent categorical levels
blueprint <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_other(all_nominal(), threshold = .005)
# Create training & test sets
train_h2o <- prep(blueprint, training = ames_train, retain = TRUE) %>%
juice() %>%
as.h2o()
test_h2o <- prep(blueprint, training = ames_train) %>%
bake(new_data = ames_test) %>%
as.h2o()
# Get names of response and features
Y <- "Sale_Price"
X <- setdiff(names(ames_train), Y)
# Train & cross-validate a GLM model
best_glm <- h2o.glm(
x = X, y = Y, training_frame = train_h2o, alpha = 0.1,
remove_collinear_columns = TRUE, nfolds = 10, fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE, seed = 123
)
# Train & cross-validate a RF model
best_rf <- h2o.randomForest(
x = X, y = Y, training_frame = train_h2o, ntrees = 1000, mtries = 20,
max_depth = 30, min_rows = 1, sample_rate = 0.8, nfolds = 10,
fold_assignment = "Modulo", keep_cross_validation_predictions = TRUE,
seed = 123, stopping_rounds = 50, stopping_metric = "RMSE",
stopping_tolerance = 0
)
# Train & cross-validate a GBM model
best_gbm <- h2o.gbm(
x = X, y = Y, training_frame = train_h2o, ntrees = 5000, learn_rate = 0.01,
max_depth = 7, min_rows = 5, sample_rate = 0.8, nfolds = 10,
fold_assignment = "Modulo", keep_cross_validation_predictions = TRUE,
seed = 123, stopping_rounds = 50, stopping_metric = "RMSE",
stopping_tolerance = 0
)
# Train & cross-validate an XGBoost model
best_xgb <- h2o.xgboost(
x = X, y = Y, training_frame = train_h2o, ntrees = 5000, learn_rate = 0.05,
max_depth = 3, min_rows = 3, sample_rate = 0.8, categorical_encoding = "Enum",
nfolds = 10, fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE, seed = 123, stopping_rounds = 50,
stopping_metric = "RMSE", stopping_tolerance = 0
)
# Train a stacked tree ensemble
ensemble_tree <- h2o.stackedEnsemble(
x = X, y = Y, training_frame = train_h2o, model_id = "my_tree_ensemble",
base_models = list(best_glm, best_rf, best_gbm, best_xgb),
metalearner_algorithm = "drf"
)
# Compute predictions
predictions <- predict(ensemble_tree, train_h2o) %>% as.vector()
# Print the highest and lowest predicted sales price
paste("Observation", which.max(predictions),
"has a predicted sale price of", scales::dollar(max(predictions)))
[1] "Observation 1315 has a predicted sale price of $697,273"
paste("Observation", which.min(predictions),
"has a predicted sale price of", scales::dollar(min(predictions)))
[1] "Observation 548 has a predicted sale price of $44,227.53"
# Grab feature values for observations with min/max predicted sales price
high_ob <- as.data.frame(train_h2o)[which.max(predictions), ] %>% select(-Sale_Price)
low_ob <- as.data.frame(train_h2o)[which.min(predictions), ] %>% select(-Sale_Price)
# 1) create a data frame with just the features
features <- as.data.frame(train_h2o) %>% select(-Sale_Price)
# 2) Create a vector with the actual responses
response <- as.data.frame(train_h2o) %>% pull(Sale_Price)
# 3) Create custom predict function that returns the predicted values as a vector
pred <- function(object, newdata) {
results <- as.vector(h2o.predict(object, as.h2o(newdata)))
return(results)
}
# Example of prediction output
pred(ensemble_tree, features) %>% head()
[1] 217750.2 109165.7 178811.5 200595.1 194434.3 209172.5
# iml model agnostic object
components_iml <- Predictor$new(
model = ensemble_tree,
data = features,
y = response,
predict.fun = pred
)
# DALEX model agnostic object
components_dalex <- DALEX::explain(
model = ensemble_tree,
data = features,
y = response,
predict_function = pred
)
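Both model-agnostic objects are reused by the interpretation functions later in this chapter. As a quick, hedged illustration (not part of the original chapter code), the iml Predictor object can also drive permutation-based feature importance directly; a minimal sketch, assuming the "rmse" loss:
# Hedged sketch: permutation-based feature importance via iml, reusing the
# Predictor object defined above (loss = "rmse" is an assumption for illustration)
imp_iml <- FeatureImp$new(components_iml, loss = "rmse")
plot(imp_iml)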
vip(
ensemble_tree,
train = as.data.frame(train_h2o),
method = "permute",
target = "Sale_Price",
metric = "RMSE",
nsim = 5,
sample_frac = 0.5,
pred_wrapper = pred
)
Figure 16.1:
knitr::include_graphics("images/pdp-illustration.png")
# Custom prediction function wrapper: returning the average prediction
# gives the partial dependence value for each grid point
pdp_pred <- function(object, newdata) {
results <- mean(as.vector(h2o.predict(object, as.h2o(newdata))))
return(results)
}
# Compute partial dependence values
pd_values <- partial(
ensemble_tree,
train = as.data.frame(train_h2o),
pred.var = "Gr_Liv_Area",
pred.fun = pdp_pred,
grid.resolution = 20
)
head(pd_values) # take a peek
# Partial dependence plot
autoplot(pd_values, rug = TRUE, train = as.data.frame(train_h2o))
Figure 16.2:
# Construct ICE curves
ice_non_centered <- partial(
ensemble_tree,
train = as.data.frame(train_h2o),
pred.var = "Gr_Liv_Area",
pred.fun = pred,
grid.resolution = 20
) %>%
autoplot(alpha = 0.05, center = FALSE) +
ggtitle("A) Non-centered ICE curves")
# Construct c-ICE curves
ice_centered <- partial(
ensemble_tree,
train = as.data.frame(train_h2o),
pred.var = "Gr_Liv_Area",
pred.fun = pred,
grid.resolution = 20
) %>%
autoplot(alpha = 0.05, center = TRUE) +
ggtitle("B) Centered ICE curves")
# Display plots side by side
gridExtra::grid.arrange(ice_non_centered, ice_centered, ncol = 2)
# Construct c-ICE curves
partial(
ensemble_tree,
train = as.data.frame(train_h2o),
pred.var = "Gr_Liv_Area",
pred.fun = pred,
grid.resolution = 20,
plot = TRUE,
center = TRUE,
plot.engine = "ggplot2"
)
interact_2way <- Interaction$new(components_iml, feature = "First_Flr_SF")
interact_2way$results %>%
arrange(desc(.interaction)) %>%
top_n(10)
# Two-way PDP using iml
interaction_pdp <- Partial$new(
components_iml,
c("First_Flr_SF", "Overall_Qual"),
ice = FALSE,
grid.size = 20
)
labels <- interaction_pdp$results %>% filter(First_Flr_SF == max(First_Flr_SF))
plot(interaction_pdp) +
ggrepel::geom_label_repel(
data = labels,
aes(label = Overall_Qual),
label.size = .05,
label.padding = .15
)
# Create explainer object
components_lime <- lime(
x = features,
model = ensemble_tree,
n_bins = 10
)
class(components_lime)
[1] "data_frame_explainer" "explainer" "list"
summary(components_lime)
Length Class Mode
model 1 H2ORegressionModel S4
preprocess 1 -none- function
bin_continuous 1 -none- logical
n_bins 1 -none- numeric
quantile_bins 1 -none- logical
use_density 1 -none- logical
feature_type 80 -none- character
bin_cuts 80 -none- list
feature_distribution 80 -none- list
# Use LIME to explain previously defined instances: high_ob and low_ob
lime_explanation <- lime::explain(
x = rbind(high_ob, low_ob),
explainer = components_lime,
n_permutations = 5000,
dist_fun = "gower",
kernel_width = 0.25,
n_features = 10,
feature_select = "highest_weights"
)
glimpse(lime_explanation)
Observations: 20
Variables: 11
$ model_type       <chr> "regression", "regression", "regression", "regression", "regression", "regr…
$ case             <chr> "1315", "1315", "1315", "1315", "1315", "1315", "1315", "1315", "1315", "13…
$ model_r2         <dbl> 0.50020562, 0.50020562, 0.50020562, 0.50020562, 0.50020562, 0.50020562, 0.5…
$ model_intercept  <dbl> 142204.0, 142204.0, 142204.0, 142204.0, 142204.0, 142204.0, 142204.0, 14220…
$ model_prediction <dbl> 700416.0, 700416.0, 700416.0, 700416.0, 700416.0, 700416.0, 700416.0, 70041…
$ feature          <chr> "Pool_Area", "Gr_Liv_Area", "Total_Bsmt_SF", "Overall_Qual", "First_Flr_SF"…
$ feature_value    <int> 555, 4476, 2396, 8, 2411, 15623, 2065, 1, 1, 3, 0, 612, 1, 0, 1940, 0, 4, 6…
$ feature_weight   <dbl> 338848.604, 48179.163, 41363.918, 26280.478, 23465.130, 21227.388, 19015.07…
$ feature_desc     <chr> "516.6 < Pool_Area <= 590.4", "2145 < Gr_Liv_Area", "1603 < Total_Bsmt_SF",…
$ data             <list> [[Two_Story_1946_and_Newer, Residential_Low_Density, 160, 15623, Pave, No_…
$ prediction       <dbl> 700415.52, 700415.52, 700415.52, 700415.52, 700415.52, 700415.52, 700415.52…
plot_features(lime_explanation, ncol = 1)
# Tune the LIME algorithm a bit
lime_explanation2 <- explain(
x = rbind(high_ob, low_ob),
explainer = components_lime,
n_permutations = 5000,
dist_fun = "euclidean",
kernel_width = 0.75,
n_features = 10,
feature_select = "lasso_path"
)
# Plot the results
plot_features(lime_explanation2, ncol = 1)
Figure 16.8:
knitr::include_graphics("images/approx-shapley-idea.png")
# Compute (approximate) Shapley values
(shapley <- Shapley$new(components_iml, x.interest = high_ob, sample.size = 1000))
Interpretation method: Shapley
Predicted value: 700415.520000, Average prediction: 181296.077636 (diff = 519119.442364)
Analysed predictor:
Prediction task: unknown
Analysed data:
Sampling from data.frame with 2199 rows and 80 columns.
Head of results:
# Plot results
plot(shapley)
# Reuse existing object
shapley$explain(x.interest = low_ob)
# Plot results
shapley$results %>%
top_n(25, wt = abs(phi)) %>%
ggplot(aes(phi, reorder(feature.value, phi), color = phi > 0)) +
geom_point(show.legend = FALSE)
# Compute tree SHAP for a previously obtained XGBoost model
X <- readr::read_rds("data/xgb-features.rds")
xgb.fit.final <- readr::read_rds("data/xgb-fit-final.rds")
# Try to re-scale features (low to high)
feature_values <- X %>%
as.data.frame() %>%
mutate_all(scale) %>%
gather(feature, feature_value) %>%
pull(feature_value)
# Compute SHAP values, wrangle a bit, compute SHAP-based importance, etc.
shap_df <- xgb.fit.final %>%
predict(newdata = X, predcontrib = TRUE) %>%
as.data.frame() %>%
select(-BIAS) %>%
gather(feature, shap_value) %>%
mutate(feature_value = feature_values) %>%
group_by(feature) %>%
mutate(shap_importance = mean(abs(shap_value)))
# SHAP contribution plot
p1 <- ggplot(shap_df, aes(x = shap_value, y = reorder(feature, shap_importance))) +
ggbeeswarm::geom_quasirandom(groupOnX = FALSE, varwidth = TRUE, size = 0.4, alpha = 0.25) +
xlab("SHAP value") +
ylab(NULL)
# SHAP importance plot
p2 <- shap_df %>%
select(feature, shap_importance) %>%
filter(row_number() == 1) %>%
ggplot(aes(x = reorder(feature, shap_importance), y = shap_importance)) +
geom_col() +
coord_flip() +
xlab(NULL) +
ylab("mean(|SHAP value|)")
# Combine plots
gridExtra::grid.arrange(p1, p2, nrow = 1)
shap_df %>%
filter(feature %in% c("Overall_Qual", "Gr_Liv_Area")) %>%
ggplot(aes(x = feature_value, y = shap_value)) +
geom_point(aes(color = shap_value)) +
scale_colour_viridis_c(name = "Feature value\n(standardized)", option = "C") +
facet_wrap(~ feature, scales = "free") +
scale_y_continuous('Shapley value', labels = scales::comma) +
xlab('Normalized feature value')
high_breakdown <- prediction_breakdown(components_dalex, observation = high_ob)
# class of prediction_breakdown output
class(high_breakdown)
# check out the top 10 influential variables for this observation
high_breakdown[1:10, 1:5]
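Note: in more recent DALEX releases, prediction_breakdown() has been superseded by predict_parts(); a minimal, hedged sketch of the equivalent call (assuming DALEX >= 1.0):
# Hedged sketch: break-down explanation with the newer DALEX interface
# (predict_parts() and type = "break_down" assume a DALEX >= 1.0 installation)
high_breakdown2 <- DALEX::predict_parts(
  components_dalex,
  new_observation = high_ob,
  type = "break_down"
)
plot(high_breakdown2)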
h2o.shutdown(prompt = FALSE)
[1] TRUE