Note: Some results may differ from the hard copy book due to changes in the sampling procedures introduced in R 3.6.0. See http://bit.ly/35D1SW7 for more details. Access and run the source code for this notebook here.
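If you need to reproduce the book's pre-3.6.0 results exactly, base R can revert to the old sampling algorithm (a minimal sketch; this affects only R's own sampling, e.g. set.seed()/sample(), not H2O's internal seeding):
# Optional: revert to the pre-R-3.6.0 sampling algorithm for exact reproducibility
suppressWarnings(RNGkind(sample.kind = "Rounding"))
set.seed(123)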
Hidden chapter setup used in the book to configure knitr options and set the plotting theme:
knitr::opts_chunk$set(
message = FALSE,
warning = FALSE,
cache = FALSE
)
# Set the graphical theme
ggplot2::theme_set(ggplot2::theme_light())
This chapter leverages the following packages:
# Helper packages
library(dplyr) # for data manipulation
library(ggplot2) # for data visualization
library(tidyr) # for data reshaping
# Modeling packages
library(h2o) # for fitting GLRMs
To illustrate GLRM concepts, we’ll continue using the my_basket
data set created in the previous chapter:
url <- "https://koalaverse.github.io/homlr/data/my_basket.csv"
my_basket <- readr::read_csv(url)
head(my_basket)
Figure 18.1:
knitr::include_graphics("images/glrm-example.png")
Figure 18.2:
knitr::include_graphics("images/quadratic-huber-loss.png")
h2o.no_progress() # turn off progress bars
h2o.init(max_mem_size = "5g") # connect to H2O instance
# convert data to h2o object
my_basket.h2o <- as.h2o(my_basket)
# run basic GLRM
basic_glrm <- h2o.glrm(
training_frame = my_basket.h2o,
k = 20,
loss = "Quadratic",
regularization_x = "None",
regularization_y = "None",
transform = "STANDARDIZE",
max_iterations = 2000,
seed = 123
)
# get top level summary information on our model
summary(basic_glrm)
Model Details:
==============
H2ODimReductionModel: glrm
Model Key: GLRM_model_R_1577620863288_5
Model Summary: (output truncated)

H2ODimReductionMetrics: glrm
** Reported on training data. **

Sum of Squared Error (Numeric): 31004.59
Misclassification Error (Categorical): 0
Number of Numeric Entries: 84000
Number of Categorical Entries: 0

Scoring History: (output truncated)
# Plot the objective function value by iteration
plot(basic_glrm)
# amount of variance explained by each archetype (aka "pc")
basic_glrm@model$importance
Importance of components: (output truncated)
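In the importance matrix, row 2 holds the proportion of variance explained (PVE) by each archetype and row 3 the cumulative proportion (CVE); the plotting code below extracts both. As a hedged sketch, the CVE row also tells you the smallest number of archetypes needed to reach a given threshold, say 75% of the variance:
# Smallest number of archetypes whose cumulative variance explained reaches 75%
cve <- basic_glrm@model$importance %>% .[3, ] %>% unlist()
min(which(cve >= 0.75))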
data.frame(
PC = basic_glrm@model$importance %>% seq_along(),
PVE = basic_glrm@model$importance %>% .[2,] %>% unlist(),
CVE = basic_glrm@model$importance %>% .[3,] %>% unlist()
) %>%
gather(metric, variance_explained, -PC) %>%
ggplot(aes(PC, variance_explained)) +
geom_point() +
facet_wrap(~ metric, ncol = 1, scales = "free")
# Preview the first 5 features' weights on the first 5 archetypes
t(basic_glrm@model$archetypes)[1:5, 1:5]
Arch1 Arch2 Arch3 Arch4 Arch5
7up -0.5783538 -1.5705325 0.9906612 -0.9306704 0.17552643
lasagna 0.2196728 0.1213954 -0.7068851 0.8436524 3.56206178
pepsi -0.2504310 -0.8156136 -0.7669562 -1.2551630 -0.47632696
yop -0.1856632 0.4000083 -0.4855958 1.1598919 -0.26142763
redwine -0.1372589 -0.1059148 -0.9579530 0.4641668 -0.08539977
p1 <- t(basic_glrm@model$archetypes) %>%
as.data.frame() %>%
mutate(feature = row.names(.)) %>%
ggplot(aes(Arch1, reorder(feature, Arch1))) +
geom_point()
p2 <- t(basic_glrm@model$archetypes) %>%
as.data.frame() %>%
mutate(feature = row.names(.)) %>%
ggplot(aes(Arch1, Arch2, label = feature)) +
geom_text()
gridExtra::grid.arrange(p1, p2, nrow = 1)
# Re-run model with k = 8
k8_glrm <- h2o.glrm(
training_frame = my_basket.h2o,
k = 8,
loss = "Quadratic",
regularization_x = "None",
regularization_y = "None",
transform = "STANDARDIZE",
max_iterations = 2000,
seed = 123
)
# Reconstruct to see how well the model did
my_reconstruction <- h2o.reconstruct(k8_glrm, my_basket.h2o, reverse_transform = TRUE)
# Raw predicted values
my_reconstruction[1:5, 1:5]
[5 rows x 5 columns]
# Round values to whole integers
my_reconstruction[1:5, 1:5] %>% round(0)
[5 rows x 5 columns]
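As a rough sanity check (my own sketch, not part of the book's code), we can pull the full reconstruction into R and compute the share of entries that the rounded reconstruction matches exactly:
# Proportion of entries where the rounded reconstruction equals the original data
reconstructed <- as.data.frame(my_reconstruction) %>% round(0)
mean(as.matrix(reconstructed) == as.matrix(my_basket))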
# Use non-negative regularization
k8_glrm_regularized <- h2o.glrm(
training_frame = my_basket.h2o,
k = 8,
loss = "Quadratic",
regularization_x = "NonNegative",
regularization_y = "NonNegative",
gamma_x = 0.5,
gamma_y = 0.5,
transform = "STANDARDIZE",
max_iterations = 2000,
seed = 123
)
# Show predicted values
predict(k8_glrm_regularized, my_basket.h2o)[1:5, 1:5]
[5 rows x 5 columns]
# Compare regularized versus non-regularized loss
par(mfrow = c(1, 2))
plot(k8_glrm)
plot(k8_glrm_regularized)
# Split data into train & validation
split <- h2o.splitFrame(my_basket.h2o, ratios = 0.75, seed = 123)
train <- split[[1]]
valid <- split[[2]]
# Create hyperparameter search grid
params <- expand.grid(
regularization_x = c("None", "NonNegative", "L1"),
regularization_y = c("None", "NonNegative", "L1"),
gamma_x = seq(0, 1, by = .25),
gamma_y = seq(0, 1, by = .25),
error = 0,
stringsAsFactors = FALSE
)
# Perform grid search
for(i in seq_len(nrow(params))) {
# Create model
glrm_model <- h2o.glrm(
training_frame = train,
k = 8,
loss = "Quadratic",
regularization_x = params$regularization_x[i],
regularization_y = params$regularization_y[i],
gamma_x = params$gamma_x[i],
gamma_y = params$gamma_y[i],
transform = "STANDARDIZE",
max_runtime_secs = 1000,
seed = 123
)
# Predict on validation set and extract error
validate <- h2o.performance(glrm_model, valid)
params$error[i] <- validate@metrics$numerr
}
# Look at the top 10 models with the lowest validation error
params %>%
arrange(error) %>%
head(10)
# Apply final model with optimal hyperparameters
final_glrm_model <- h2o.glrm(
training_frame = my_basket.h2o,
k = 8,
loss = "Quadratic",
regularization_x = "L1",
regularization_y = "NonNegative",
gamma_x = 1,
gamma_y = 0.25,
transform = "STANDARDIZE",
max_iterations = 2000,
seed = 123
)
# New observations to score
new_observations <- as.h2o(sample_n(my_basket, 2))
# Basic scoring
predict(final_glrm_model, new_observations) %>% round(0)
[2 rows x 42 columns]
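If you want the low-dimensional scores (the X matrix) for new observations rather than reconstructed values, the h2o package's h2o.proj_archetypes() projects a frame onto a GLRM model's archetypes; a minimal sketch:
# Project the new observations onto the final model's archetypes (their X scores)
h2o.proj_archetypes(final_glrm_model, new_observations)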
h2o.shutdown(prompt = FALSE)