Note: Some results may differ from the hard copy book due to the changing of sampling procedures introduced in R 3.6.0. See http://bit.ly/35D1SW7 for more details. Access and run the source code for this notebook here.
Hidden chapter requirements used in the book to set the plotting theme and load packages used in hidden code chunks:
# Set global R options
options(scipen = 999)
# Set the graphical theme
ggplot2::theme_set(ggplot2::theme_light())
# Set global knitr chunk options
knitr::opts_chunk$set(
fig.align = "center",
fig.height = 3.5
)
ames <- AmesHousing::make_ames()
Prerequisites
This chapter leverages the following packages:
# Helper packages
library(dplyr) # for data manipulation
library(ggplot2) # for awesome graphics
# Modeling packages
library(caret) # for cross-validation, etc.
# Model interpretability packages
library(vip) # variable importance
We’ll also continue working with the ames_train
data set:
library(rsample)
# stratified sampling with the rsample package
set.seed(123)
split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)
Simple linear regression
Estimation
model1 <- lm(Sale_Price ~ Gr_Liv_Area, data = ames_train)
Figure 4.1:
# Fitted regression line (full training data)
p1 <- model1 %>%
broom::augment() %>%
ggplot(aes(Gr_Liv_Area, Sale_Price)) +
geom_point(size = 1, alpha = 0.3) +
geom_smooth(se = FALSE, method = "lm") +
scale_y_continuous(labels = scales::dollar) +
ggtitle("Fitted regression line")
# Fitted regression line (restricted range)
p2 <- model1 %>%
broom::augment() %>%
ggplot(aes(Gr_Liv_Area, Sale_Price)) +
geom_segment(aes(x = Gr_Liv_Area, y = Sale_Price,
xend = Gr_Liv_Area, yend = .fitted),
alpha = 0.3) +
geom_point(size = 1, alpha = 0.3) +
geom_smooth(se = FALSE, method = "lm") +
scale_y_continuous(labels = scales::dollar) +
ggtitle("Fitted regression line (with residuals)")
# Side-by-side plots
grid.arrange(p1, p2, nrow = 1)
summary(model1)
Call:
lm(formula = Sale_Price ~ Gr_Liv_Area, data = ames_train)
Residuals:
Min 1Q Median 3Q Max
-361143 -30668 -2449 22838 331357
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8732.938 3996.613 2.185 0.029 *
Gr_Liv_Area 114.876 2.531 45.385 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 56700 on 2051 degrees of freedom
Multiple R-squared: 0.5011, Adjusted R-squared: 0.5008
F-statistic: 2060 on 1 and 2051 DF, p-value: < 0.00000000000000022
sigma(model1) # RMSE
[1] 56704.78
sigma(model1)^2 # MSE
[1] 3215432370
Inference
confint(model1, level = 0.95)
2.5 % 97.5 %
(Intercept) 895.0961 16570.7805
Gr_Liv_Area 109.9121 119.8399
Multiple linear regression
(model2 <- lm(Sale_Price ~ Gr_Liv_Area + Year_Built, data = ames_train))
Call:
lm(formula = Sale_Price ~ Gr_Liv_Area + Year_Built, data = ames_train)
Coefficients:
(Intercept) Gr_Liv_Area Year_Built
-2123054.21 99.18 1093.48
(model2 <- update(model1, . ~ . + Year_Built))
Call:
lm(formula = Sale_Price ~ Gr_Liv_Area + Year_Built, data = ames_train)
Coefficients:
(Intercept) Gr_Liv_Area Year_Built
-2123054.21 99.18 1093.48
lm(Sale_Price ~ Gr_Liv_Area + Year_Built + Gr_Liv_Area:Year_Built, data = ames_train)
Call:
lm(formula = Sale_Price ~ Gr_Liv_Area + Year_Built + Gr_Liv_Area:Year_Built,
data = ames_train)
Coefficients:
(Intercept) Gr_Liv_Area Year_Built
382194.3015 -1483.8810 -179.7979
Gr_Liv_Area:Year_Built
0.8037
Figure 4.2:
# Fitted models
fit1 <- lm(Sale_Price ~ Gr_Liv_Area + Year_Built, data = ames_train)
fit2 <- lm(Sale_Price ~ Gr_Liv_Area * Year_Built, data = ames_train)
# Regression plane data
plot_grid <- expand.grid(
Gr_Liv_Area = seq(from = min(ames_train$Gr_Liv_Area), to = max(ames_train$Gr_Liv_Area),
length = 100),
Year_Built = seq(from = min(ames_train$Year_Built), to = max(ames_train$Year_Built),
length = 100)
)
plot_grid$y1 <- predict(fit1, newdata = plot_grid)
plot_grid$y2 <- predict(fit2, newdata = plot_grid)
# Level plots
p1 <- ggplot(plot_grid, aes(x = Gr_Liv_Area, y = Year_Built,
z = y1, fill = y1)) +
geom_tile() +
geom_contour(color = "white") +
viridis::scale_fill_viridis(name = "Predicted\nvalue", option = "inferno") +
theme_bw() +
ggtitle("Main effects only")
p2 <- ggplot(plot_grid, aes(x = Gr_Liv_Area, y = Year_Built,
z = y2, fill = y1)) +
geom_tile() +
geom_contour(color = "white") +
viridis::scale_fill_viridis(name = "Predicted\nvalue", option = "inferno") +
theme_bw() +
ggtitle("Main effects with two-way interaction")
gridExtra::grid.arrange(p1, p2, nrow = 1)
# include all possible main effects
model3 <- lm(Sale_Price ~ ., data = ames_train)
# print estimated coefficients in a tidy data frame
broom::tidy(model3)
Assessing model accuracy
# Train model using 10-fold cross-validation
set.seed(123) # for reproducibility
(cv_model1 <- train(
form = Sale_Price ~ Gr_Liv_Area,
data = ames_train,
method = "lm",
trControl = trainControl(method = "cv", number = 10)
))
Linear Regression
2053 samples
1 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1846, 1848, 1848, 1848, 1848, 1848, ...
Resampling results:
RMSE Rsquared MAE
56410.89 0.5069425 39169.09
Tuning parameter 'intercept' was held constant at a value of TRUE
# model 2 CV
set.seed(123)
cv_model2 <- train(
Sale_Price ~ Gr_Liv_Area + Year_Built,
data = ames_train,
method = "lm",
trControl = trainControl(method = "cv", number = 10)
)
# model 3 CV
set.seed(123)
cv_model3 <- train(
Sale_Price ~ .,
data = ames_train,
method = "lm",
trControl = trainControl(method = "cv", number = 10)
)
# Extract out of sample performance measures
summary(resamples(list(
model1 = cv_model1,
model2 = cv_model2,
model3 = cv_model3
)))
Call:
summary.resamples(object = resamples(list(model1 = cv_model1, model2 =
cv_model2, model3 = cv_model3)))
Models: model1, model2, model3
Number of resamples: 10
MAE
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
model1 34457.58 36323.74 38943.81 39169.09 41660.81 45005.17 0
model2 28094.79 30594.47 31959.30 32246.86 34210.70 37441.82 0
model3 12458.27 15420.10 16484.77 16258.84 17262.39 19029.29 0
RMSE
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
model1 47211.34 52363.41 54948.96 56410.89 60672.31 67679.05 0
model2 37698.17 42607.11 45407.14 46292.38 49668.59 54692.06 0
model3 20844.33 22581.04 24947.45 26098.00 27695.65 39521.49 0
Rsquared
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
model1 0.3598237 0.4550791 0.5289068 0.5069425 0.5619841 0.5965793 0
model2 0.5714665 0.6392504 0.6800818 0.6703298 0.7067458 0.7348562 0
model3 0.7869022 0.9018567 0.9104351 0.8949642 0.9166564 0.9303504 0
Model concerns
Figure 4.3:
p1 <- ggplot(ames_train, aes(Year_Built, Sale_Price)) +
geom_point(size = 1, alpha = .4) +
geom_smooth(se = FALSE) +
scale_y_continuous("Sale price", labels = scales::dollar) +
xlab("Year built") +
ggtitle(paste("Non-transformed variables with a\n",
"non-linear relationship."))
p2 <- ggplot(ames_train, aes(Year_Built, Sale_Price)) +
geom_point(size = 1, alpha = .4) +
geom_smooth(method = "lm", se = FALSE) +
scale_y_log10("Sale price", labels = scales::dollar,
breaks = seq(0, 400000, by = 100000)) +
xlab("Year built") +
ggtitle(paste("Transforming variables can provide a\n",
"near-linear relationship."))
gridExtra::grid.arrange(p1, p2, nrow = 1)
Figure 4.4:
df1 <- broom::augment(cv_model1$finalModel, data = ames_train)
p1 <- ggplot(df1, aes(.fitted, .resid)) +
geom_point(size = 1, alpha = .4) +
xlab("Predicted values") +
ylab("Residuals") +
ggtitle("Model 1", subtitle = "Sale_Price ~ Gr_Liv_Area")
df2 <- broom::augment(cv_model3$finalModel, data = ames_train)
p2 <- ggplot(df2, aes(.fitted, .resid)) +
geom_point(size = 1, alpha = .4) +
xlab("Predicted values") +
ylab("Residuals") +
ggtitle("Model 3", subtitle = "Sale_Price ~ .")
gridExtra::grid.arrange(p1, p2, nrow = 1)
Figure 4.5:
df1 <- mutate(df1, id = row_number())
df2 <- mutate(df2, id = row_number())
p1 <- ggplot(df1, aes(id, .resid)) +
geom_point(size = 1, alpha = .4) +
xlab("Row ID") +
ylab("Residuals") +
ggtitle("Model 1", subtitle = "Correlated residuals.")
p2 <- ggplot(df2, aes(id, .resid)) +
geom_point(size = 1, alpha = .4) +
xlab("Row ID") +
ylab("Residuals") +
ggtitle("Model 3", subtitle = "Uncorrelated residuals.")
gridExtra::grid.arrange(p1, p2, nrow = 1)
# fit with two strongly correlated variables
summary(cv_model3) %>%
broom::tidy() %>%
filter(term %in% c("Garage_Area", "Garage_Cars"))
# model without Garage_Area
set.seed(123)
mod_wo_Garage_Cars <- train(
Sale_Price ~ .,
data = select(ames_train, -Garage_Cars),
method = "lm",
trControl = trainControl(method = "cv", number = 10)
)
prediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleading
summary(mod_wo_Garage_Cars) %>%
broom::tidy() %>%
filter(term == "Garage_Area")
Principal component regression
Figure 4.6:
knitr::include_graphics("images/pcr-steps.png")
# perform 10-fold cross validation on a PCR model tuning the
# number of principal components to use as predictors from 1-20
set.seed(123)
cv_model_pcr <- train(
Sale_Price ~ .,
data = ames_train,
method = "pcr",
trControl = trainControl(method = "cv", number = 10),
preProcess = c("zv", "center", "scale"),
tuneLength = 20
)
# model with lowest RMSE
cv_model_pcr$bestTune
# plot cross-validated RMSE
ggplot(cv_model_pcr)
Partial least squares
Figure 4.7:
knitr::include_graphics("images/pls-vs-pcr.png")
Figure 4.9:
library(AppliedPredictiveModeling)
library(recipes)
library(tidyr)
data(solubility)
df <- cbind(solTrainX, solTrainY)
pca_df <- recipe(solTrainY ~ ., data = df) %>%
step_center(all_predictors()) %>%
step_scale(all_predictors()) %>%
step_pca(all_predictors()) %>%
prep(training = df, retain = TRUE) %>%
juice() %>%
select(PC1, PC2, solTrainY) %>%
rename(`PCR Component 1` = "PC1", `PCR Component 2` = "PC2") %>%
gather(component, value, -solTrainY)
pls_df <- recipe(solTrainY ~ ., data = df) %>%
step_center(all_predictors()) %>%
step_scale(all_predictors()) %>%
step_pls(all_predictors(), outcome = "solTrainY") %>%
prep(training = df, retain = TRUE) %>%
juice() %>%
rename(`PLS Component 1` = "PLS1", `PLS Component 2` = "PLS2") %>%
gather(component, value, -solTrainY)
pca_df %>%
bind_rows(pls_df) %>%
ggplot(aes(value, solTrainY)) +
geom_point(alpha = .15) +
geom_smooth(method = "lm", se = FALSE, lty = "dashed") +
facet_wrap(~ component, scales = "free") +
labs(x = "PC Eigenvalues", y = "Response")
NA
# perform 10-fold cross validation on a PLS model tuning the
# number of principal components to use as predictors from 1-20
set.seed(123)
cv_model_pls <- train(
Sale_Price ~ .,
data = ames_train,
method = "pls",
trControl = trainControl(method = "cv", number = 10),
preProcess = c("zv", "center", "scale"),
tuneLength = 20
)
# model with lowest RMSE
cv_model_pls$bestTune
# plot cross-validated RMSE
ggplot(cv_model_pls)
Feature interpretation
vip(cv_model_pls, num_features = 20, method = "model")
Figure 4.12:
p1 <- pdp::partial(cv_model_pls, pred.var = "Gr_Liv_Area", grid.resolution = 20) %>%
autoplot() +
scale_y_continuous(limits = c(0, 300000), labels = scales::dollar)
p2 <- pdp::partial(cv_model_pls, pred.var = "First_Flr_SF", grid.resolution = 20) %>%
autoplot() +
scale_y_continuous(limits = c(0, 300000), labels = scales::dollar)
p3 <- pdp::partial(cv_model_pls, pred.var = "Total_Bsmt_SF", grid.resolution = 20) %>%
autoplot() +
scale_y_continuous(limits = c(0, 300000), labels = scales::dollar)
p4 <- pdp::partial(cv_model_pls, pred.var = "Garage_Cars", grid.resolution = 4) %>%
autoplot() +
scale_y_continuous(limits = c(0, 300000), labels = scales::dollar)
grid.arrange(p1, p2, p3, p4, nrow = 2)
# clean up
rm(list = ls())
---
title: "Chapter 4: Linear Regression"
output: html_notebook
---

__Note__: Some results may differ from the hard copy book due to the changing of sampling procedures introduced in R 3.6.0. See http://bit.ly/35D1SW7 for more details. Access and run the source code for this notebook [here](https://rstudio.cloud/project/801185). 

Hidden chapter requirements used in the book to set the plotting theme and load packages used in hidden code chunks:

```{r setup}
# Set global R options
options(scipen = 999)

# Set the graphical theme
ggplot2::theme_set(ggplot2::theme_light())

# Set global knitr chunk options
knitr::opts_chunk$set(
  fig.align = "center",
  fig.height = 3.5
)

ames <- AmesHousing::make_ames()
```


## Prerequisites

This chapter leverages the following packages:

```{r 04-pkgs, message=FALSE}
# Helper packages
library(dplyr)    # for data manipulation
library(ggplot2)  # for awesome graphics

# Modeling packages
library(caret)    # for cross-validation, etc.

# Model interpretability packages
library(vip)      # variable importance
```

We'll also continue working with the `ames_train` data set:

```{r 04-ames-train, echo=TRUE}
library(rsample)
# stratified sampling with the rsample package
set.seed(123)
split  <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train  <- training(split)
ames_test   <- testing(split)
```

## Simple linear regression

### Estimation

```{r 04-model1}
model1 <- lm(Sale_Price ~ Gr_Liv_Area, data = ames_train)
```

Figure 4.1:

```{r 04-visualize-model1, eval=TRUE, fig.width=10, fig.height=3.5, echo=TRUE, fig.cap="The least squares fit from regressing sale price on living space for the the Ames housing data. Left: Fitted regression line. Right: Fitted regression line with vertical grey bars representing the residuals."}
# Fitted regression line (full training data)
p1 <- model1 %>%
  broom::augment() %>%
  ggplot(aes(Gr_Liv_Area, Sale_Price)) + 
  geom_point(size = 1, alpha = 0.3) +
  geom_smooth(se = FALSE, method = "lm") +
  scale_y_continuous(labels = scales::dollar) +
  ggtitle("Fitted regression line")

# Fitted regression line (restricted range)
p2 <- model1 %>%
  broom::augment() %>%
  ggplot(aes(Gr_Liv_Area, Sale_Price)) + 
  geom_segment(aes(x = Gr_Liv_Area, y = Sale_Price,
                   xend = Gr_Liv_Area, yend = .fitted), 
               alpha = 0.3) +
  geom_point(size = 1, alpha = 0.3) +
  geom_smooth(se = FALSE, method = "lm") +
  scale_y_continuous(labels = scales::dollar) +
  ggtitle("Fitted regression line (with residuals)")

# Side-by-side plots
grid.arrange(p1, p2, nrow = 1)
```

```{r 04-model1-summary}
summary(model1) 
```

```{r model1-sigma}
sigma(model1)    # RMSE
sigma(model1)^2  # MSE
```

### Inference

```{r}
confint(model1, level = 0.95)
```

## Multiple linear regression {#multi-lm}

```{r model2, linewidth=56}
(model2 <- lm(Sale_Price ~ Gr_Liv_Area + Year_Built, data = ames_train))
```

```{r model2-using-update, linewidth=56}
(model2 <- update(model1, . ~ . + Year_Built))
```

```{r model2-w-interaction, linewidth=56}
lm(Sale_Price ~ Gr_Liv_Area + Year_Built + Gr_Liv_Area:Year_Built, data = ames_train)
```

Figure 4.2:

```{r 04-mlr-fit, echo=TRUE, fig.width=10, fig.height=4.5, fig.cap="In a three-dimensional setting, with two predictors and one response, the least squares regression line becomes a plane. The 'best-fit' plane minimizes the sum of squared errors between the actual sales price (individual dots) and the predicted sales price (plane)."}
# Fitted models
fit1 <- lm(Sale_Price ~ Gr_Liv_Area + Year_Built, data = ames_train)
fit2 <- lm(Sale_Price ~ Gr_Liv_Area * Year_Built, data = ames_train)

# Regression plane data
plot_grid <- expand.grid(
  Gr_Liv_Area = seq(from = min(ames_train$Gr_Liv_Area), to = max(ames_train$Gr_Liv_Area), 
                    length = 100), 
  Year_Built = seq(from = min(ames_train$Year_Built), to = max(ames_train$Year_Built), 
                   length = 100)
)
plot_grid$y1 <- predict(fit1, newdata = plot_grid)
plot_grid$y2 <- predict(fit2, newdata = plot_grid)

# Level plots
p1 <- ggplot(plot_grid, aes(x = Gr_Liv_Area, y = Year_Built, 
                            z = y1, fill = y1)) +
  geom_tile() +
  geom_contour(color = "white") +
  viridis::scale_fill_viridis(name = "Predicted\nvalue", option = "inferno") +
  theme_bw() +
  ggtitle("Main effects only")
p2 <- ggplot(plot_grid, aes(x = Gr_Liv_Area, y = Year_Built, 
                            z = y2, fill = y1)) +
  geom_tile() +
  geom_contour(color = "white") +
  viridis::scale_fill_viridis(name = "Predicted\nvalue", option = "inferno") +
  theme_bw() +
  ggtitle("Main effects with two-way interaction")

gridExtra::grid.arrange(p1, p2, nrow = 1)
```

```{r model3}
# include all possible main effects
model3 <- lm(Sale_Price ~ ., data = ames_train) 

# print estimated coefficients in a tidy data frame
broom::tidy(model3)  
```


## Assessing model accuracy

```{r model1-accuracy}
# Train model using 10-fold cross-validation
set.seed(123)  # for reproducibility
(cv_model1 <- train(
  form = Sale_Price ~ Gr_Liv_Area, 
  data = ames_train, 
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
))
```

```{r mult-models, message=FALSE, warning=FALSE}
# model 2 CV
set.seed(123)
cv_model2 <- train(
  Sale_Price ~ Gr_Liv_Area + Year_Built, 
  data = ames_train, 
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
)

# model 3 CV
set.seed(123)
cv_model3 <- train(
  Sale_Price ~ ., 
  data = ames_train, 
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
)

# Extract out of sample performance measures
summary(resamples(list(
  model1 = cv_model1, 
  model2 = cv_model2, 
  model3 = cv_model3
)))
```

## Model concerns {#lm-residuals}

Figure 4.3:

```{r 04-linear-relationship, fig.align='center', fig.width=8, fig.height=3.5, fig.cap="Linear regression assumes a linear relationship between the predictor(s) and the response variable; however, non-linear relationships can often be altered to be near-linear by applying a transformation to the variable(s)."}
p1 <- ggplot(ames_train, aes(Year_Built, Sale_Price)) + 
  geom_point(size = 1, alpha = .4) +
  geom_smooth(se = FALSE) +
  scale_y_continuous("Sale price", labels = scales::dollar) +
  xlab("Year built") +
  ggtitle(paste("Non-transformed variables with a\n",
                "non-linear relationship."))

p2 <- ggplot(ames_train, aes(Year_Built, Sale_Price)) + 
  geom_point(size = 1, alpha = .4) + 
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_log10("Sale price", labels = scales::dollar, 
                breaks = seq(0, 400000, by = 100000)) +
  xlab("Year built") +
  ggtitle(paste("Transforming variables can provide a\n",
                "near-linear relationship."))

gridExtra::grid.arrange(p1, p2, nrow = 1)
```

Figure 4.4:

```{r 04-homoskedasticity, fig.align='center', fig.width=8, fig.height=3.5, fig.cap="Linear regression assumes constant variance among the residuals. `model1` (left) shows definitive signs of heteroskedasticity whereas `model3` (right) appears to have constant variance."}

df1 <- broom::augment(cv_model1$finalModel, data = ames_train)

p1 <- ggplot(df1, aes(.fitted, .resid)) + 
  geom_point(size = 1, alpha = .4) +
  xlab("Predicted values") +
  ylab("Residuals") +
  ggtitle("Model 1", subtitle = "Sale_Price ~ Gr_Liv_Area")

df2 <- broom::augment(cv_model3$finalModel, data = ames_train)

p2 <- ggplot(df2, aes(.fitted, .resid)) + 
  geom_point(size = 1, alpha = .4)  +
  xlab("Predicted values") +
  ylab("Residuals") +
  ggtitle("Model 3", subtitle = "Sale_Price ~ .")

gridExtra::grid.arrange(p1, p2, nrow = 1)
```

Figure 4.5:

```{r 04-autocorrelation, fig.align='center', fig.width=8, fig.height=3.5, fig.cap="Linear regression assumes uncorrelated errors. The residuals in `model1` (left) have a distinct pattern suggesting that information about $\\epsilon_1$ provides information about $\\epsilon_2$. Whereas `model3` has no signs of autocorrelation."}

df1 <- mutate(df1, id = row_number())
df2 <- mutate(df2, id = row_number())

p1 <- ggplot(df1, aes(id, .resid)) + 
  geom_point(size = 1, alpha = .4) +
  xlab("Row ID") +
  ylab("Residuals") +
  ggtitle("Model 1", subtitle = "Correlated residuals.")

p2 <- ggplot(df2, aes(id, .resid)) + 
  geom_point(size = 1, alpha = .4) +
  xlab("Row ID") +
  ylab("Residuals") +
  ggtitle("Model 3", subtitle = "Uncorrelated residuals.")

gridExtra::grid.arrange(p1, p2, nrow = 1)
```

```{r collinearity-1}
# fit with two strongly correlated variables
summary(cv_model3) %>%
  broom::tidy() %>%
  filter(term %in% c("Garage_Area", "Garage_Cars"))
```

```{r collinearity-2}
# model without Garage_Area
set.seed(123)
mod_wo_Garage_Cars <- train(
  Sale_Price ~ ., 
  data = select(ames_train, -Garage_Cars), 
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
)

summary(mod_wo_Garage_Cars) %>%
  broom::tidy() %>%
  filter(term == "Garage_Area")
```

## Principal component regression

Figure 4.6:

```{r pcr-steps, echo=TRUE, out.height="70%", out.width="70%", fig.cap="A depiction of the steps involved in performing principal component regression."}
knitr::include_graphics("images/pcr-steps.png")
```

```{r pcr-regression, fig.height=3.5, fig.width=6, fig.cap="The 10-fold cross valdation RMSE obtained using PCR with 1-20 principal components."}
# perform 10-fold cross validation on a PCR model tuning the 
# number of principal components to use as predictors from 1-20
set.seed(123)
cv_model_pcr <- train(
  Sale_Price ~ ., 
  data = ames_train, 
  method = "pcr",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("zv", "center", "scale"),
  tuneLength = 20
  )

# model with lowest RMSE
cv_model_pcr$bestTune

# plot cross-validated RMSE
ggplot(cv_model_pcr)
```

## Partial least squares

Figure 4.7:

```{r pcr-vs-pls, echo=TRUE, fig.cap="A diagram depicting the differences between PCR (left) and PLS (right). PCR finds principal components (PCs) that maximally summarize the features independent of the response variable and then uses those PCs as predictor variables. PLS finds components that simultaneously summarize variation of the predictors while being optimally correlated with the outcome and then uses those PCs as predictors.", out.width="100%"}
knitr::include_graphics("images/pls-vs-pcr.png")
```

Figure 4.9:

```{r pls-vs-pcr-relationship, echo=TRUE, fig.cap="Illustration showing that the first two PCs when using PCR have very little relationship to the response variable (top row); however, the first two PCs when using PLS have a much stronger association to the response (bottom row).", fig.height=4.5, message=FALSE, warning=FALSE}
library(AppliedPredictiveModeling)
library(recipes)
library(tidyr)

data(solubility)
df <- cbind(solTrainX, solTrainY)

pca_df <- recipe(solTrainY ~ ., data = df) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_pca(all_predictors()) %>%
  prep(training = df, retain = TRUE) %>%
  juice() %>%
  select(PC1, PC2, solTrainY) %>%
  rename(`PCR Component 1` = "PC1", `PCR Component 2` = "PC2") %>%  
  gather(component, value, -solTrainY)

pls_df <- recipe(solTrainY ~ ., data = df) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_pls(all_predictors(), outcome = "solTrainY") %>%
  prep(training = df, retain = TRUE) %>%
  juice() %>%
  rename(`PLS Component 1` = "PLS1", `PLS Component 2` = "PLS2") %>%
  gather(component, value, -solTrainY)

pca_df %>% 
  bind_rows(pls_df) %>%
  ggplot(aes(value, solTrainY)) +
  geom_point(alpha = .15) +
  geom_smooth(method = "lm", se = FALSE, lty = "dashed") +
  facet_wrap(~ component, scales = "free") +
  labs(x = "PC Eigenvalues", y = "Response")
  
```

```{r pls-regression, fig.height=3.5, fig.width=6, fig.cap="The 10-fold cross valdation RMSE obtained using PLS with 1-20 principal components."}
# perform 10-fold cross validation on a PLS model tuning the 
# number of principal components to use as predictors from 1-20
set.seed(123)
cv_model_pls <- train(
  Sale_Price ~ ., 
  data = ames_train, 
  method = "pls",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("zv", "center", "scale"),
  tuneLength = 20
)

# model with lowest RMSE
cv_model_pls$bestTune

# plot cross-validated RMSE
ggplot(cv_model_pls)
```

## Feature interpretation

```{r pls-vip, fig.cap="Top 20 most important variables for the PLS model.", message=FALSE, warning=FALSE}
vip(cv_model_pls, num_features = 20, method = "model")
```

Figure 4.12:

```{r pls-pdp, echo=TRUE, fig.height=5, fig.width=7, fig.cap="Partial dependence plots for the first four most important variables."}
p1 <- pdp::partial(cv_model_pls, pred.var = "Gr_Liv_Area", grid.resolution = 20) %>% 
  autoplot() +
  scale_y_continuous(limits = c(0, 300000), labels = scales::dollar)

p2 <- pdp::partial(cv_model_pls, pred.var = "First_Flr_SF", grid.resolution = 20) %>% 
  autoplot() +
  scale_y_continuous(limits = c(0, 300000), labels = scales::dollar)

p3 <- pdp::partial(cv_model_pls, pred.var = "Total_Bsmt_SF", grid.resolution = 20) %>% 
  autoplot() +
  scale_y_continuous(limits = c(0, 300000), labels = scales::dollar)

p4 <- pdp::partial(cv_model_pls, pred.var = "Garage_Cars", grid.resolution = 4) %>% 
  autoplot() +
  scale_y_continuous(limits = c(0, 300000), labels = scales::dollar)

grid.arrange(p1, p2, p3, p4, nrow = 2)
```

```{r}
# clean up
rm(list = ls())
```
