---
title: "Chapter 5: Logisitic Regression"
output: html_notebook
---

__Note__: Some results may differ from the hard copy book due to changes in the sampling procedures introduced in R 3.6.0. See http://bit.ly/35D1SW7 for more details. Access and run the source code for this notebook [here](https://rstudio.cloud/project/801185).

Hidden chapter requirements used in the book to set the plotting theme and load packages used in hidden code chunks:

```{r setup}
# Set the graphical theme
ggplot2::theme_set(ggplot2::theme_light())

# Set global knitr chunk options
knitr::opts_chunk$set(
  fig.align = "center",
  fig.height = 3.5
)
```

## Prerequisites

For this section we'll use the following packages:

```{r 08-pkgs, message=FALSE}
# Helper packages
library(dplyr)     # for data wrangling
library(ggplot2)   # for awesome plotting
library(rsample)   # for data splitting
library(broom)     # for tidy model output (tidy(), augment())

# Modeling packages
library(caret)     # for logistic regression modeling

# Model interpretability packages
library(vip)       # variable importance
```

To illustrate logistic regression concepts we'll use the employee attrition data:

```{r logit-data-import}
df <- attrition %>% mutate_if(is.ordered, factor, ordered = FALSE)

# Create training (70%) and test (30%) sets for the 
# rsample::attrition data.
set.seed(123)  # for reproducibility
churn_split <- initial_split(df, prop = .7, strata = "Attrition")
churn_train <- training(churn_split)
churn_test  <- testing(churn_split)
```
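
A quick sanity check (an optional addition, not part of the book's code): because we stratified on `Attrition`, the proportion of attrition should be roughly the same (about 16%) in both splits.

```{r attrition-split-check}
# Attrition proportions in the training and test sets; stratified
# sampling should keep both close to the full-data rate (~16%)
prop.table(table(churn_train$Attrition))
prop.table(table(churn_test$Attrition))
```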


## Why logistic regression

Figure 5.1:

```{r whylogit, echo=TRUE, fig.height=3, fig.width=8, fig.cap="Comparing the predicted probabilities of linear regression (left) to logistic regression (right). Linear regression can produce predicted probabilities below 0 or above 1, whereas predicted probabilities from logistic regression always lie between 0 and 1."}
p1 <- ISLR::Default %>%
  mutate(prob = ifelse(default == "Yes", 1, 0)) %>%
  ggplot(aes(balance, prob)) +
  geom_point(alpha = .15) +
  geom_smooth(method = "lm") +
  ggtitle("Linear regression model fit") +
  xlab("Balance") +
  ylab("Probability of Default")

p2 <- ISLR::Default %>%
  mutate(prob = ifelse(default == "Yes", 1, 0)) %>%
  ggplot(aes(balance, prob)) +
  geom_point(alpha = .15) +
  geom_smooth(method = "glm", method.args = list(family = "binomial")) +
  ggtitle("Logistic regression model fit") +
  xlab("Balance") +
  ylab("Probability of Default")

gridExtra::grid.arrange(p1, p2, nrow = 1)
```
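
The right-hand fit stays bounded because logistic regression models the probability of the positive class with the logistic (sigmoid) function,

$$p(X) = \frac{e^{\beta_0 + \beta_1 X}}{1 + e^{\beta_0 + \beta_1 X}},$$

which always returns values between 0 and 1.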

## Simple logistic regression

```{r glm-model1}
model1 <- glm(Attrition ~ MonthlyIncome, family = "binomial", data = churn_train)
model2 <- glm(Attrition ~ OverTime, family = "binomial", data = churn_train)
```

Figure 5.2:

```{r glm-sigmoid, echo=TRUE, fig.width=8, fig.height=3, fig.cap="Predicted probabilities of employee attrition based on monthly income (left) and overtime (right). As monthly income increases, `model1` predicts a decreased probability of attrition; if employees work overtime, `model2` predicts an increased probability."}
churn_train2 <- churn_train %>% mutate(prob = ifelse(Attrition == "Yes", 1, 0))
churn_train2 <- broom::augment(model2, churn_train2) %>%
  mutate(.fitted = plogis(.fitted))  # back-transform log-odds to predicted probabilities

p1 <- ggplot(churn_train2, aes(MonthlyIncome, prob)) +
  geom_point(alpha = 0.15) +
  geom_smooth(method = "glm", method.args = list(family = "binomial")) +
  ggtitle("Predicted probabilities for model1") +
  xlab("Monthly Income") +
  ylab("Probability of Attrition")

p2 <- ggplot(churn_train2, aes(OverTime, .fitted, color = OverTime)) +
  geom_boxplot(show.legend = FALSE) +
  geom_rug(sides = "b", position = "jitter", alpha = 0.2, show.legend = FALSE) +
  ggtitle("Predicted probabilities for model2") +
  xlab("Over Time") +
  scale_y_continuous("Probability of Attrition", limits = c(0, 1))

gridExtra::grid.arrange(p1, p2, nrow = 1)
```

```{r}
tidy(model1)
tidy(model2)
```

```{r convert-odds-probs}
exp(coef(model1))
exp(coef(model2))
```
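
The exponentiated coefficients are odds ratios: for example, working overtime multiplies the odds of attrition by roughly 4. To read the overtime effect on the probability scale instead, we can apply the inverse logit to the linear predictor (a small illustrative sketch, not part of the book's code):

```{r odds-to-probability}
# Predicted probability of attrition without and with overtime,
# obtained by applying the inverse logit (plogis) to the linear predictor
plogis(coef(model2)["(Intercept)"])   # no overtime
plogis(sum(coef(model2)))             # overtime (intercept + OverTimeYes)
```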

```{r coef-confint}
confint(model1)  # for odds, you can use `exp(confint(model1))`
confint(model2)
```

## Multiple logistic regression

```{r glm-model3}
model3 <- glm(
  Attrition ~ MonthlyIncome + OverTime,
  family = "binomial", 
  data = churn_train
  )

tidy(model3)
```
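
As with the simple models, we can exponentiate the coefficients to interpret them as odds ratios, each holding the other predictor constant (an optional step mirroring the code above):

```{r model3-odds}
# Odds ratios for the multiple logistic regression model
exp(coef(model3))
```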

Figure 5.3:

```{r glm-sigmoid2, echo=TRUE, fig.width=6, fig.height=3, fig.cap="Predicted probability of attrition based on monthly income and whether or not employees work overtime."}
churn_train3 <- churn_train %>% mutate(prob = ifelse(Attrition == "Yes", 1, 0))
churn_train3 <- broom::augment(model3, churn_train3) %>%
  mutate(.fitted = plogis(.fitted))  # back-transform log-odds to predicted probabilities

ggplot(churn_train3, aes(MonthlyIncome, prob, color = OverTime)) +
  geom_point(alpha = .15) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE) +
  ggtitle("Predicted probabilities for model3") +
  xlab("Monthly Income") +
  ylab("Probability of Attrition")
```


## Assessing model accuracy

```{r mult-models-logistic}
set.seed(123)
cv_model1 <- train(
  Attrition ~ MonthlyIncome, 
  data = churn_train, 
  method = "glm",
  family = "binomial",
  trControl = trainControl(method = "cv", number = 10)
)

set.seed(123)
cv_model2 <- train(
  Attrition ~ MonthlyIncome + OverTime, 
  data = churn_train, 
  method = "glm",
  family = "binomial",
  trControl = trainControl(method = "cv", number = 10)
)

set.seed(123)
cv_model3 <- train(
  Attrition ~ ., 
  data = churn_train, 
  method = "glm",
  family = "binomial",
  trControl = trainControl(method = "cv", number = 10)
)

# extract out of sample performance measures
summary(
  resamples(
    list(
      model1 = cv_model1, 
      model2 = cv_model2, 
      model3 = cv_model3
    )
  )
)$statistics$Accuracy
```

```{r glm-confusion-matrix}
# predict class
pred_class <- predict(cv_model3, churn_train)

# create confusion matrix
confusionMatrix(
  data = relevel(pred_class, ref = "Yes"), 
  reference = relevel(churn_train$Attrition, ref = "Yes")
)
```

```{r logistic-regression-roc, fig.width=6, fig.height=4.5, fig.cap="ROC curve for cross-validated models 1 and 3. The increase in the AUC represents the 'lift' that we achieve with model 3.", message=FALSE, warning=FALSE}
library(ROCR)

# Compute predicted probabilities
m1_prob <- predict(cv_model1, churn_train, type = "prob")$Yes
m3_prob <- predict(cv_model3, churn_train, type = "prob")$Yes

# Compute AUC metrics for cv_model1 and cv_model3
perf1 <- prediction(m1_prob, churn_train$Attrition) %>%
  performance(measure = "tpr", x.measure = "fpr")
perf2 <- prediction(m3_prob, churn_train$Attrition) %>%
  performance(measure = "tpr", x.measure = "fpr")

# Plot ROC curves for cv_model1 and cv_model3
plot(perf1, col = "black", lty = 2)
plot(perf2, add = TRUE, col = "blue")
legend(0.8, 0.2, legend = c("cv_model1", "cv_model3"),
       col = c("black", "blue"), lty = 2:1, cex = 0.6)
```
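
To put a number on the lift visible in the ROC plot, we can also extract the AUC for each model (an illustrative addition, not shown in the book):

```{r roc-auc}
# Area under the ROC curve for cv_model1 and cv_model3
auc1 <- performance(prediction(m1_prob, churn_train$Attrition), measure = "auc")
auc3 <- performance(prediction(m3_prob, churn_train$Attrition), measure = "auc")
unlist(auc1@y.values)  # AUC for cv_model1
unlist(auc3@y.values)  # AUC for cv_model3
```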

```{r pls-logistic-regression, fig.height=3.5, fig.width=6, fig.cap="The 10-fold cross-validation accuracy obtained using PLS with 1--16 principal components."}
# Perform 10-fold CV on a PLS model tuning the number of PCs to 
# use as predictors
set.seed(123)
cv_model_pls <- train(
  Attrition ~ ., 
  data = churn_train, 
  method = "pls",
  family = "binomial",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("zv", "center", "scale"),
  tuneLength = 16
)

# Model with the highest cross-validated accuracy
cv_model_pls$bestTune

# Plot cross-validated accuracy
ggplot(cv_model_pls)
```
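
As an optional follow-up (not part of the book's code), we could also check how the tuned PLS model performs on the held-out 30% test set:

```{r pls-test-eval}
# Generalization performance of the tuned PLS model on churn_test
pls_pred <- predict(cv_model_pls, churn_test)
confusionMatrix(pls_pred, churn_test$Attrition, positive = "Yes")
```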

## Feature interpretation

```{r glm-vip, fig.cap="Top 20 most important variables for the cross-validated logistic regression model (`cv_model3`)."}
vip(cv_model3, num_features = 20)
```

Figure 5.7:

```{r glm-pdp, echo=TRUE, fig.height=5, fig.width=7, fig.cap="Partial dependence plots for the first four most important variables.  We can see how the predicted probability of attrition changes for each value of the influential predictors."}
pred.fun <- function(object, newdata) {
  Yes <- mean(predict(object, newdata, type = "prob")$Yes)
  as.data.frame(Yes)
}

p1 <- pdp::partial(cv_model3, pred.var = "OverTime", pred.fun = pred.fun) %>% 
  autoplot(rug = TRUE) + ylim(c(0, 1))

p2 <- pdp::partial(cv_model3, pred.var = "JobSatisfaction", pred.fun = pred.fun) %>% 
  autoplot() + ylim(c(0, 1))

p3 <- pdp::partial(cv_model3, pred.var = "NumCompaniesWorked", pred.fun = pred.fun, grid.resolution = 10) %>% 
  autoplot() + scale_x_continuous(breaks = 0:9) + ylim(c(0, 1))

p4 <- pdp::partial(cv_model3, pred.var = "EnvironmentSatisfaction", pred.fun = pred.fun) %>% 
  autoplot() + ylim(c(0, 1))

gridExtra::grid.arrange(p1, p2, p3, p4, nrow = 2)
```

```{r}
# clean up
rm(list = ls())
```

