Introduction

This example builds on the statistical learning chapter from Modern Data Science with R: http://mdsr-book.github.io/.

Data ingestion and processing

library(mdsr) 
census <- read.csv(
"http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", 
  header = FALSE)
names(census) <- c("age", "workclass", "fnlwgt", "education", 
  "education.num", "marital.status", "occupation", "relationship", 
  "race", "sex", "capital.gain", "capital.loss", "hours.per.week", 
  "native.country", "income")
glimpse(census)
## Observations: 32,561
## Variables: 15
## $ age            <int> 39, 50, 38, 53, 28, 37, 49, 52, 31, 42, 37, 30,...
## $ workclass      <fctr>  State-gov,  Self-emp-not-inc,  Private,  Priv...
## $ fnlwgt         <int> 77516, 83311, 215646, 234721, 338409, 284582, 1...
## $ education      <fctr>  Bachelors,  Bachelors,  HS-grad,  11th,  Bach...
## $ education.num  <int> 13, 13, 9, 7, 13, 14, 5, 9, 14, 13, 10, 13, 13,...
## $ marital.status <fctr>  Never-married,  Married-civ-spouse,  Divorced...
## $ occupation     <fctr>  Adm-clerical,  Exec-managerial,  Handlers-cle...
## $ relationship   <fctr>  Not-in-family,  Husband,  Not-in-family,  Hus...
## $ race           <fctr>  White,  White,  White,  Black,  Black,  White...
## $ sex            <fctr>  Male,  Male,  Male,  Male,  Female,  Female, ...
## $ capital.gain   <int> 2174, 0, 0, 0, 0, 0, 0, 0, 14084, 5178, 0, 0, 0...
## $ capital.loss   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ hours.per.week <int> 40, 13, 40, 40, 40, 40, 16, 45, 50, 40, 80, 40,...
## $ native.country <fctr>  United-States,  United-States,  United-States...
## $ income         <fctr>  <=50K,  <=50K,  <=50K,  <=50K,  <=50K,  <=50K...
set.seed(364)
n <- nrow(census)
test_idx <- sample.int(n, size = round(0.2 * n))
train <- census[-test_idx, ]
nrow(train)
## [1] 26049
test <- census[test_idx, ]
nrow(test)
## [1] 6512
pi_bar <- tally(~ income, data = train, format = "percent")[2]
pi_bar
##     >50K 
## 24.25045
tally(~ income, data = train, format = "percent")
## income
##    <=50K     >50K 
## 75.74955 24.25045
library(rpart)
dtree <- rpart(income ~ capital.gain, data = train)
# Extract the split point and per-node summaries from the fitted tree
split_val <- as.data.frame(dtree$splits)$index
dtree_frame <- dtree$frame %>%
  select(var, n, dev, yval) %>%
  mutate(pct = n / nrow(train), right = (n - dev) / n)
rpart(income ~ capital.gain, data = train)
## n= 26049 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 26049 6317  <=50K (0.75749549 0.24250451)  
##   2) capital.gain< 5095.5 24784 5115  <=50K (0.79361685 0.20638315) *
##   3) capital.gain>=5095.5 1265   63  >50K (0.04980237 0.95019763) *
split <- 5095.5  # the split point identified by rpart above
train <- train %>% mutate(hi_cap_gains = capital.gain >= split)

ggplot(data = train, aes(x = capital.gain, y = income)) +
  geom_count(aes(color = hi_cap_gains),
    position = position_jitter(width = 0, height = 0.1), alpha = 0.5) +
  geom_vline(xintercept = split, color = "dodgerblue", lty = 2) +
  scale_x_log10(labels = scales::dollar)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 23870 rows containing non-finite values (stat_sum).
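
The warnings are expected: most respondents report zero capital gains, and log10(0) is not finite, so those points are dropped from the log-scaled axis. A quick check (not part of the original analysis):

# How many rows does the log scale drop?
sum(train$capital.gain == 0)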

form <- as.formula("income ~ age + workclass + education + marital.status + 
  occupation + relationship + race + sex + capital.gain + capital.loss +
  hours.per.week")
mod_tree <- rpart(form, data = train)
mod_tree
## n= 26049 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 26049 6317  <=50K (0.75749549 0.24250451)  
##    2) relationship= Not-in-family, Other-relative, Own-child, Unmarried 14196  947  <=50K (0.93329107 0.06670893)  
##      4) capital.gain< 7073.5 13946  706  <=50K (0.94937617 0.05062383) *
##      5) capital.gain>=7073.5 250    9  >50K (0.03600000 0.96400000) *
##    3) relationship= Husband, Wife 11853 5370  <=50K (0.54695014 0.45304986)  
##      6) education= 10th, 11th, 12th, 1st-4th, 5th-6th, 7th-8th, 9th, Assoc-acdm, Assoc-voc, HS-grad, Preschool, Some-college 8280 2769  <=50K (0.66557971 0.33442029)  
##       12) capital.gain< 5095.5 7857 2355  <=50K (0.70026728 0.29973272) *
##       13) capital.gain>=5095.5 423    9  >50K (0.02127660 0.97872340) *
##      7) education= Bachelors, Doctorate, Masters, Prof-school 3573  972  >50K (0.27204030 0.72795970) *
plot(mod_tree)
text(mod_tree, use.n = TRUE, all = TRUE, cex = 0.7)

library(partykit)
## Loading required package: grid
plot(as.party(mod_tree))

options(digits = 5)
mod_tree_frame <- mod_tree$frame %>%
  select(var, n, dev, yval) %>%
  mutate(pct = n / nrow(train), right = (n - dev) / n)

Among married respondents without an advanced degree, about 70% of those with capital gains below the threshold made less than $50k.
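
We can verify this directly from the training data (a quick check against node 12 of the tree above):

# Married respondents without an advanced degree and with capital gains
# below the split: what fraction earn less than $50k?
train %>%
  filter(relationship %in% c(" Husband", " Wife"),
         !(education %in% c(" Bachelors", " Doctorate",
                            " Masters", " Prof-school")),
         capital.gain < 5095.5) %>%
  summarize(pct_under_50k = mean(income == " <=50K"))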

train <- train %>%
  mutate(husband_or_wife = relationship %in% c(" Husband", " Wife"),
         college_degree = husband_or_wife & education %in%
           c(" Bachelors", " Doctorate", " Masters", " Prof-school"),
         income_dtree = predict(mod_tree, type = "class"))

cg_splits <- data.frame(husband_or_wife = c(TRUE, FALSE),
                        vals = c(5095.5, 7073.5))

ggplot(data = train, aes(x = capital.gain, y = income)) +
  geom_count(aes(color = income_dtree, shape = college_degree),
             position = position_jitter(width = 0, height = 0.1),
             alpha = 0.5) +
  facet_wrap(~ husband_or_wife) +
  geom_vline(data = cg_splits, aes(xintercept = vals),
             color = "dodgerblue", lty = 2) +
  scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 23870 rows containing non-finite values (stat_sum).

printcp(mod_tree)
## 
## Classification tree:
## rpart(formula = form, data = train)
## 
## Variables actually used in tree construction:
## [1] capital.gain education    relationship
## 
## Root node error: 6317/26049 = 0.243
## 
## n= 26049 
## 
##       CP nsplit rel error xerror    xstd
## 1 0.1289      0     1.000  1.000 0.01095
## 2 0.0641      2     0.742  0.742 0.00982
## 3 0.0367      3     0.678  0.678 0.00947
## 4 0.0100      4     0.641  0.641 0.00926
# plotcp(mod_tree)
train <- train %>%
  mutate(income_dtree = predict(mod_tree, type = "class"))
confusion <- tally(income_dtree ~ income, data = train, format = "count")
confusion
##             income
## income_dtree  <=50K  >50K
##        <=50K  18742  3061
##        >50K     990  3256
sum(diag(confusion)) / nrow(train)
## [1] 0.84449

In this case, the accuracy of the decision tree classifier is 84.4%, a considerable improvement over the null model, which is correct only 75.7% of the time.
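
That baseline is easy to verify: the null model simply predicts the majority class for everyone.

# Accuracy of always predicting " <=50K"
mean(train$income == " <=50K")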

mod_tree2 <- rpart(form, data = train, control = rpart.control(cp = 0.002))
# Training accuracy of the more complex tree
train %>%
  mutate(income_dtree2 = predict(mod_tree2, type = "class")) %>%
  summarize(accuracy = mean(income_dtree2 == income))
library(randomForest)
mod_forest <- randomForest(form, data = train, ntree = 201, mtry = 3)
mod_forest
## 
## Call:
##  randomForest(formula = form, data = train, ntree = 201, mtry = 3) 
##                Type of random forest: classification
##                      Number of trees: 201
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 13.31%
## Confusion matrix:
##         <=50K  >50K class.error
##  <=50K  18471  1261    0.063906
##  >50K    2205  4112    0.349058
sum(diag(mod_forest$confusion)) / nrow(train)
## [1] 0.86694
library(tibble)
importance(mod_forest) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  arrange(desc(MeanDecreaseGini))
##           rowname MeanDecreaseGini
## 1             age          1068.76
## 2    capital.gain          1064.39
## 3       education           982.65
## 4    relationship           951.87
## 5      occupation           905.27
## 6  marital.status           880.90
## 7  hours.per.week           627.88
## 8       workclass           337.92
## 9    capital.loss           326.13
## 10           race           145.34
## 11            sex           110.08
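
The choice of mtry = 3 above was ours. randomForest also provides tuneRF(), which searches over values of mtry using the OOB error; a sketch (note that tuneRF() takes a predictor data frame rather than a formula):

# Hedged sketch: let tuneRF() search for a good mtry
predictors <- train %>%
  select(age, workclass, education, marital.status, occupation,
         relationship, race, sex, capital.gain, capital.loss,
         hours.per.week)
tuneRF(x = predictors, y = train$income, ntreeTry = 100)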
library(class)
# kNN's Euclidean distance metric only works with quantitative variables
train_q <- train %>%
  select(age, education.num, capital.gain, capital.loss, hours.per.week)
income_knn <- knn(train_q, test = train_q, cl = train$income, k = 10)

confusion <- tally(income_knn ~ income, data = train, format = "count")
confusion
##           income
## income_knn  <=50K  >50K
##      <=50K  18875  2997
##      >50K     857  3320
sum(diag(confusion)) / nrow(train)
## [1] 0.85205
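Note that the distances here are dominated by capital.gain, whose range (0 to 99999) dwarfs that of the other predictors. Standardizing the variables first is common practice; a sketch of what that might look like:

# Hedged sketch: standardize the quantitative predictors before kNN
train_q_std <- as.data.frame(scale(train_q))
income_knn_std <- knn(train_q_std, test = train_q_std,
                      cl = train$income, k = 10)
mean(income_knn_std == train$income)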
knn_error_rate <- function(x, y, numNeighbors, z = x, true_y = y) {
  # x, y: training predictors and labels; z: the cases to classify;
  # true_y: the true labels for z (defaults to y for in-sample error)
  y_hat <- knn(train = x, test = z, cl = y, k = numNeighbors)
  return(mean(y_hat != true_y))
}
ks <- c(1:15, 20, 30, 40, 50)
train_rates <- sapply(ks, FUN = knn_error_rate, x = train_q, y = train$income)
knn_error_rates <- data.frame(k = ks, train_rate = train_rates)
ggplot(data = knn_error_rates, aes(x = k, y = train_rate)) +
  geom_point() + geom_line() + ylab("Misclassification Rate")

library(e1071)
mod_nb <- naiveBayes(form, data = train)
income_nb <- predict(mod_nb, newdata = train)
confusion <- tally(income_nb ~ income, data = train, format = "count")
confusion
##          income
## income_nb  <=50K  >50K
##     <=50K  18587  3605
##     >50K    1145  2712
sum(diag(confusion)) / nrow(train)
## [1] 0.81765
library(nnet)
mod_nn <- nnet(form, data = train, size = 5)
## # weights:  296
## initial  value 21842.825468 
## iter  10 value 13198.315933
## iter  20 value 11190.055832
## iter  30 value 10252.441741
## iter  40 value 9937.073100
## iter  50 value 9591.448419
## iter  60 value 9319.908227
## iter  70 value 9062.177126
## iter  80 value 8918.313144
## iter  90 value 8826.858128
## iter 100 value 8729.189597
## final  value 8729.189597 
## stopped after 100 iterations
income_nn <- predict(mod_nn, newdata = train, type = "class")
confusion <- tally(income_nn ~ income, data = train, format = "count")
confusion
##          income
## income_nn  <=50K  >50K
##     <=50K  17871  2128
##     >50K    1861  4189
sum(diag(confusion)) / nrow(train)
## [1] 0.84687
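Note that the fit above stopped at nnet's default limit of 100 iterations without converging; raising maxit may improve the fit (a sketch; results will vary with the random starting weights):

# Hedged sketch: allow the network more iterations
mod_nn2 <- nnet(form, data = train, size = 5, maxit = 500)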
income_ensemble <- ifelse((income_knn == " >50K") +
                           (income_nb == " >50K") +
                           (income_nn == " >50K") >= 2, " >50K", " <=50K")
confusion <- tally(income_ensemble ~ income, data = train, format = "count")
confusion
##                income
## income_ensemble  <=50K  >50K
##           <=50K  18790  3039
##           >50K     942  3278
sum(diag(confusion)) / nrow(train)
## [1] 0.84717
income_probs <- mod_nb %>%
  predict(newdata = train, type = "raw") %>%
  as.data.frame()
head(income_probs, 3)
##     <=50K       >50K
## 1 0.98882 0.01117538
## 2 0.84292 0.15707605
## 3 0.99973 0.00026597
names(income_probs)
## [1] " <=50K" " >50K"
tally(~` >50K` > 0.5, data = income_probs, format = "percent")
## ` >50K` > 0.5
##   TRUE  FALSE 
## 14.807 85.193
tally(~` >50K` > 0.24, data = income_probs, format = "percent")
## ` >50K` > 0.24
##   TRUE  FALSE 
## 19.939 80.061
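The second cutoff is not arbitrary: 0.24 is roughly the base rate of high earners (pi_bar) computed earlier. Equivalently (recall that pi_bar is expressed as a percentage):

tally(~ ` >50K` > pi_bar / 100, data = income_probs, format = "percent")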
pred <- ROCR::prediction(income_probs[,2], train$income)
perf <- ROCR::performance(pred, 'tpr', 'fpr')
class(perf) # can also plot(perf)
## [1] "performance"
## attr(,"package")
## [1] "ROCR"
perf_df <- data.frame(perf@x.values, perf@y.values)
names(perf_df) <- c("fpr", "tpr")
roc <- ggplot(data = perf_df, aes(x = fpr, y = tpr)) +
  geom_line(color="blue") + geom_abline(intercept=0, slope=1, lty=3) +
  ylab(perf@y.name) + xlab(perf@x.name)
confusion <- tally(income_nb ~ income, data = train, format = "count")
confusion
##          income
## income_nb  <=50K  >50K
##     <=50K  18587  3605
##     >50K    1145  2712
sum(diag(confusion)) / nrow(train)
## [1] 0.81765
tpr <- confusion[" >50K", " >50K"] / sum(confusion[, " >50K"])
fpr <- confusion[" >50K", " <=50K"] / sum(confusion[, " <=50K"])
roc + geom_point(x = fpr, y = tpr, size = 3)

test_q <- test %>%
  select(age, education.num, capital.gain, capital.loss, hours.per.week)
test_rates <- sapply(ks, FUN = knn_error_rate, x = train_q,
                     y = train$income, z = test_q, true_y = test$income)
knn_error_rates <- knn_error_rates %>% mutate(test_rate = test_rates)
library(tidyr)
knn_error_rates_tidy <- knn_error_rates %>%
  gather(key = "type", value = "error_rate", -k)
ggplot(data = knn_error_rates_tidy, aes(x = k, y = error_rate)) +
  geom_point(aes(color = type)) + geom_line(aes(color = type)) +
  ylab("Misclassification Rate")

favstats(~ capital.gain, data = train)
##  min Q1 median Q3   max   mean     sd     n missing
##    0  0      0  0 99999 1084.4 7428.6 26049       0
favstats(~ capital.gain, data = test)
##  min Q1 median Q3   max   mean     sd    n missing
##    0  0      0  0 99999 1050.8 7210.1 6512       0
mod_null <- glm(income ~ 1, data = train, family = binomial)
mods <- list(mod_null, mod_tree, mod_forest, mod_nn, mod_nb)
lapply(mods, class)
## [[1]]
## [1] "glm" "lm" 
## 
## [[2]]
## [1] "rpart"
## 
## [[3]]
## [1] "randomForest.formula" "randomForest"        
## 
## [[4]]
## [1] "nnet.formula" "nnet"        
## 
## [[5]]
## [1] "naiveBayes"
predict_methods <- methods("predict")
predict_methods[grepl(pattern = "(glm|rpart|randomForest|nnet|naive)",
  predict_methods)]
## [1] "predict.glm"          "predict.glmmPQL"      "predict.glmtree"     
## [4] "predict.naiveBayes"   "predict.nnet"         "predict.randomForest"
## [7] "predict.rpart"
predictions_train <- data.frame(
  y = as.character(train$income),
  type = "train",
  mod_null = predict(mod_null, type = "response"),
  mod_tree = predict(mod_tree, type = "class"),
  mod_forest = predict(mod_forest, type = "class"),
  mod_nn = predict(mod_nn, type = "class"),
  mod_nb = predict(mod_nb, newdata = train, type = "class"))
predictions_test <- data.frame(
  y = as.character(test$income),
  type = "test",
  mod_null = predict(mod_null, newdata = test, type = "response"),
  mod_tree = predict(mod_tree, newdata = test, type = "class"),
  mod_forest = predict(mod_forest, newdata = test, type = "class"),
  mod_nn = predict(mod_nn, newdata = test, type = "class"),
  mod_nb = predict(mod_nb, newdata = test, type = "class"))
predictions <- bind_rows(predictions_train, predictions_test)
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
glimpse(predictions)
## Observations: 32,561
## Variables: 7
## $ y          <fctr>  <=50K,  <=50K,  <=50K,  <=50K,  <=50K,  <=50K,  <...
## $ type       <chr> "train", "train", "train", "train", "train", "train...
## $ mod_null   <dbl> 0.2425, 0.2425, 0.2425, 0.2425, 0.2425, 0.2425, 0.2...
## $ mod_tree   <fctr>  <=50K,  >50K,  <=50K,  <=50K,  >50K,  >50K,  <=50...
## $ mod_forest <fctr>  <=50K,  <=50K,  <=50K,  <=50K,  >50K,  >50K,  <=5...
## $ mod_nn     <fctr>  >50K,  <=50K,  <=50K,  <=50K,  >50K,  >50K,  <=50...
## $ mod_nb     <fctr>  <=50K,  <=50K,  <=50K,  <=50K,  <=50K,  <=50K,  <...
predictions_tidy <- predictions %>%
  mutate(mod_null = ifelse(mod_null < 0.5, " <=50K", " >50K")) %>%
  gather(key = "model", value = "y_hat", -type, -y)
## Warning: attributes are not identical across measure variables; they will
## be dropped
glimpse(predictions_tidy)
## Observations: 162,805
## Variables: 4
## $ y     <fctr>  <=50K,  <=50K,  <=50K,  <=50K,  <=50K,  <=50K,  <=50K,...
## $ type  <chr> "train", "train", "train", "train", "train", "train", "t...
## $ model <chr> "mod_null", "mod_null", "mod_null", "mod_null", "mod_nul...
## $ y_hat <chr> " <=50K", " <=50K", " <=50K", " <=50K", " <=50K", " <=50...
predictions_summary <- predictions_tidy %>%
  group_by(model, type) %>%
  summarize(N = n(), correct = sum(y == y_hat),
            positives = sum(y == " >50K"),
            true_pos = sum(y_hat == " >50K" & y == y_hat),
            false_pos = sum(y_hat == " >50K" & y != y_hat)) %>%
  mutate(accuracy = correct / N,
         tpr = true_pos / positives,
         fpr = false_pos / (N - positives)) %>%
  ungroup() %>%
  gather(val_type, val, -model, -type) %>%
  unite(temp1, type, val_type, sep = "_") %>%   # glue variables
  spread(temp1, val) %>%
  arrange(desc(test_accuracy)) %>%
  select(model, train_accuracy, test_accuracy, test_tpr, test_fpr)
predictions_summary
## # A tibble: 5 x 5
##        model train_accuracy test_accuracy test_tpr test_fpr
##        <chr>          <dbl>         <dbl>    <dbl>    <dbl>
## 1 mod_forest        0.86694       0.86364  0.64436 0.069366
## 2   mod_tree        0.84449       0.84459  0.50459 0.051524
## 3     mod_nn        0.84687       0.84413  0.65157 0.097033
## 4     mod_nb        0.81765       0.82217  0.41929 0.054731
## 5   mod_null        0.75750       0.76597  0.00000 0.000000
# Each model's predict() method uses a different `type` for probabilities
outputs <- c("response", "prob", "prob", "raw", "raw")
roc_test <- mapply(predict, mods, type = outputs,
    MoreArgs = list(newdata = test)) %>%
  as.data.frame() %>%
  select(1, 3, 5, 6, 8)  # keep only the P(income > $50K) column per model
names(roc_test) <-
  c("mod_null", "mod_tree", "mod_forest", "mod_nn", "mod_nb")
glimpse(roc_test)
## Observations: 6,512
## Variables: 5
## $ mod_null   <dbl> 0.2425, 0.2425, 0.2425, 0.2425, 0.2425, 0.2425, 0.2...
## $ mod_tree   <dbl> 0.299733, 0.727960, 0.299733, 0.299733, 0.299733, 0...
## $ mod_forest <dbl> 0.5920398, 0.5771144, 0.0945274, 0.0348259, 0.59701...
## $ mod_nn     <dbl> 0.669130, 0.649079, 0.249087, 0.166118, 0.695224, 0...
## $ mod_nb     <dbl> 1.8642e-01, 2.6567e-01, 4.8829e-02, 3.5218e-02, 4.2...
get_roc <- function(x, y) {
  pred <- ROCR::prediction(x$y_hat, y)
  perf <- ROCR::performance(pred, 'tpr', 'fpr')
  perf_df <- data.frame(perf@x.values, perf@y.values)
  names(perf_df) <- c("fpr", "tpr")
  return(perf_df)
}

roc_tidy <- roc_test %>%
  gather(key = "model", value = "y_hat") %>%
  group_by(model) %>%
  dplyr::do(get_roc(., y = test$income))
ggplot(data = roc_tidy, aes(x = fpr, y = tpr)) +
  geom_line(aes(color = model)) +
  geom_abline(intercept = 0, slope = 1, lty = 3) +
  ylab(perf@y.name) + xlab(perf@x.name) +
  geom_point(data = predictions_summary, size = 3,
    aes(x = test_fpr, y = test_tpr, color = model))