This example builds on the statistical learning chapter from Modern Data Science with R: http://mdsr-book.github.io/.
library(mdsr)
# read the 1994 Adult census extract from the UCI Machine Learning Repository
census <- read.csv(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
  header = FALSE)
names(census) <- c("age", "workclass", "fnlwgt", "education",
  "education.num", "marital.status", "occupation", "relationship",
  "race", "sex", "capital.gain", "capital.loss", "hours.per.week",
  "native.country", "income")
glimpse(census)
## Observations: 32,561
## Variables: 15
## $ age <int> 39, 50, 38, 53, 28, 37, 49, 52, 31, 42, 37, 30,...
## $ workclass <fctr> State-gov, Self-emp-not-inc, Private, Priv...
## $ fnlwgt <int> 77516, 83311, 215646, 234721, 338409, 284582, 1...
## $ education <fctr> Bachelors, Bachelors, HS-grad, 11th, Bach...
## $ education.num <int> 13, 13, 9, 7, 13, 14, 5, 9, 14, 13, 10, 13, 13,...
## $ marital.status <fctr> Never-married, Married-civ-spouse, Divorced...
## $ occupation <fctr> Adm-clerical, Exec-managerial, Handlers-cle...
## $ relationship <fctr> Not-in-family, Husband, Not-in-family, Hus...
## $ race <fctr> White, White, White, Black, Black, White...
## $ sex <fctr> Male, Male, Male, Male, Female, Female, ...
## $ capital.gain <int> 2174, 0, 0, 0, 0, 0, 0, 0, 14084, 5178, 0, 0, 0...
## $ capital.loss <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ hours.per.week <int> 40, 13, 40, 40, 40, 40, 16, 45, 50, 40, 80, 40,...
## $ native.country <fctr> United-States, United-States, United-States...
## $ income <fctr> <=50K, <=50K, <=50K, <=50K, <=50K, <=50K...
set.seed(364)
n <- nrow(census)
# hold out a random 20% of the rows as a test set
test_idx <- sample.int(n, size = round(0.2 * n))
train <- census[-test_idx, ]
nrow(train)
## [1] 26049
test <- census[test_idx, ]
nrow(test)
## [1] 6512
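A quick sanity check that the two pieces partition the original data:
# every row of census lands in exactly one of train and test
nrow(train) + nrow(test) == n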
# the null model: pi_bar is the observed proportion of high earners
pi_bar <- tally(~ income, data = train, format = "percent")[2]
pi_bar
## >50K
## 24.25045
tally(~ income, data = train, format = "percent")
## income
## <=50K >50K
## 75.74955 24.25045
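Since the null model always predicts the majority class, its training accuracy is simply the majority share, which we can confirm directly (output omitted):
# null model accuracy: the proportion of cases in the modal class
max(tally(~ income, data = train)) / nrow(train)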
library(rpart)
# a decision tree using capital.gain as the only predictor
dtree <- rpart(income ~ capital.gain, data = train)
split_val <- as.data.frame(dtree$splits)$index  # the threshold rpart chose
# pct: each node's share of observations; right: share correctly classified
dtree_frame <- dtree$frame %>%
  select(var, n, dev, yval) %>%
  mutate(pct = n / nrow(train), right = (n - dev) / n)
rpart(income ~ capital.gain, data = train)
## n= 26049
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 26049 6317 <=50K (0.75749549 0.24250451)
## 2) capital.gain< 5095.5 24784 5115 <=50K (0.79361685 0.20638315) *
## 3) capital.gain>=5095.5 1265 63 >50K (0.04980237 0.95019763) *
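The dtree_frame computed above gives a quick read on how much this single split buys us: weighting each leaf's within-node accuracy (right) by its share of the observations (pct) yields the tree's overall training accuracy, about 0.80 given the counts printed above.
# training accuracy of the one-split tree: leaf-weighted node accuracy
dtree_frame %>%
  filter(var == "<leaf>") %>%
  summarize(accuracy = sum(pct * right))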
split <- 5095.5  # the threshold rpart identified
train <- train %>% mutate(hi_cap_gains = capital.gain >= split)
ggplot(data = train, aes(x = capital.gain, y = income)) +
geom_count(aes(color = hi_cap_gains),
position = position_jitter(width = 0, height = 0.1), alpha = 0.5) +
geom_vline(xintercept = split, color = "dodgerblue", lty = 2) +
scale_x_log10(labels = scales::dollar)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 23870 rows containing non-finite values (stat_sum).
form <- as.formula("income ~ age + workclass + education + marital.status +
occupation + relationship + race + sex + capital.gain + capital.loss +
hours.per.week")
mod_tree <- rpart(form, data = train)
mod_tree
## n= 26049
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 26049 6317 <=50K (0.75749549 0.24250451)
## 2) relationship= Not-in-family, Other-relative, Own-child, Unmarried 14196 947 <=50K (0.93329107 0.06670893)
## 4) capital.gain< 7073.5 13946 706 <=50K (0.94937617 0.05062383) *
## 5) capital.gain>=7073.5 250 9 >50K (0.03600000 0.96400000) *
## 3) relationship= Husband, Wife 11853 5370 <=50K (0.54695014 0.45304986)
## 6) education= 10th, 11th, 12th, 1st-4th, 5th-6th, 7th-8th, 9th, Assoc-acdm, Assoc-voc, HS-grad, Preschool, Some-college 8280 2769 <=50K (0.66557971 0.33442029)
## 12) capital.gain< 5095.5 7857 2355 <=50K (0.70026728 0.29973272) *
## 13) capital.gain>=5095.5 423 9 >50K (0.02127660 0.97872340) *
## 7) education= Bachelors, Doctorate, Masters, Prof-school 3573 972 >50K (0.27204030 0.72795970) *
plot(mod_tree)
text(mod_tree, use.n = TRUE, all = TRUE, cex = 0.7)
library(partykit)
## Loading required package: grid
plot(as.party(mod_tree))
options(digits = 5)
mod_tree_frame <- mod_tree$frame %>%
select(var, n, dev, yval) %>%
mutate(pct = n / nrow(train), right = (n - dev) / n)
70% of those with capital gains below the $5,095.50 threshold made less than $50k.
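That figure is the right column of mod_tree_frame for the corresponding leaf:
# leaf-level summaries; the capital.gain < 5095.5 leaf has right near 0.70
mod_tree_frame %>% filter(var == "<leaf>")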
train <- train %>%
  mutate(husband_or_wife = relationship %in% c(" Husband", " Wife"),
    # note the leading spaces: read.csv left the whitespace in the raw values
    college_degree = husband_or_wife & education %in%
      c(" Bachelors", " Doctorate", " Masters", " Prof-school"),
    income_dtree = predict(mod_tree, type = "class"))
cg_splits <- data.frame(husband_or_wife = c(TRUE, FALSE),
vals = c(5095.5, 7073.5))
ggplot(data = train, aes(x = capital.gain, y = income)) +
geom_count(aes(color = income_dtree, shape = college_degree),
position = position_jitter(width = 0, height = 0.1),
alpha = 0.5) +
facet_wrap(~ husband_or_wife) +
geom_vline(data = cg_splits, aes(xintercept = vals),
color = "dodgerblue", lty = 2) +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 23870 rows containing non-finite values (stat_sum).
printcp(mod_tree)
##
## Classification tree:
## rpart(formula = form, data = train)
##
## Variables actually used in tree construction:
## [1] capital.gain education relationship
##
## Root node error: 6317/26049 = 0.243
##
## n= 26049
##
## CP nsplit rel error xerror xstd
## 1 0.1289 0 1.000 1.000 0.01095
## 2 0.0641 2 0.742 0.742 0.00982
## 3 0.0367 3 0.678 0.678 0.00947
## 4 0.0100 4 0.641 0.641 0.00926
# plotcp(mod_tree)
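The complexity table also shows how we could prune the tree: passing prune() a cp value between two rows of the table keeps only the splits above that threshold (a sketch; for example, cp = 0.05 falls between rows 2 and 3, retaining two splits):
# prune back to the two-split tree
mod_tree_pruned <- prune(mod_tree, cp = 0.05)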
train <- train %>%
mutate(income_dtree = predict(mod_tree, type = "class"))
confusion <- tally(income_dtree ~ income, data = train, format = "count")
confusion
## income
## income_dtree <=50K >50K
## <=50K 18742 3061
## >50K 990 3256
sum(diag(confusion)) / nrow(train)
## [1] 0.84449
In this case, the accuracy of the decision tree classifier is now 84.4%, a considerable improvement over the null model's 75.7%.
# a more complex tree, grown by lowering the complexity parameter
mod_tree2 <- rpart(form, data = train, control = rpart.control(cp = 0.002))
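To gauge whether the extra complexity helps, we can compute this tree's training accuracy just as before (a sketch; income_dtree2 and confusion2 are names we introduce, and the output is omitted):
income_dtree2 <- predict(mod_tree2, type = "class")
confusion2 <- tally(income_dtree2 ~ income, data = train, format = "count")
sum(diag(confusion2)) / nrow(train)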
library(randomForest)
# an ensemble of 201 trees, each choosing among 3 random predictors per split
mod_forest <- randomForest(form, data = train, ntree = 201, mtry = 3)
mod_forest
##
## Call:
## randomForest(formula = form, data = train, ntree = 201, mtry = 3)
## Type of random forest: classification
## Number of trees: 201
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 13.31%
## Confusion matrix:
## <=50K >50K class.error
## <=50K 18471 1261 0.063906
## >50K 2205 4112 0.349058
sum(diag(mod_forest$confusion)) / nrow(train)
## [1] 0.86694
library(tibble)
importance(mod_forest) %>%
as.data.frame() %>%
rownames_to_column() %>%
arrange(desc(MeanDecreaseGini))
## rowname MeanDecreaseGini
## 1 age 1068.76
## 2 capital.gain 1064.39
## 3 education 982.65
## 4 relationship 951.87
## 5 occupation 905.27
## 6 marital.status 880.90
## 7 hours.per.week 627.88
## 8 workclass 337.92
## 9 capital.loss 326.13
## 10 race 145.34
## 11 sex 110.08
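A quick way to visualize these scores (a sketch using the same data frame):
# bar chart of variable importance, most important variables on top
importance(mod_forest) %>%
  as.data.frame() %>%
  rownames_to_column("variable") %>%
  ggplot(aes(x = reorder(variable, MeanDecreaseGini), y = MeanDecreaseGini)) +
  geom_col() + coord_flip() + xlab(NULL)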
library(class)
# distance metric only works with quantitative variables
train_q <- train %>%
select(age, education.num, capital.gain, capital.loss, hours.per.week)
# classify each training case by its 10 nearest neighbors
income_knn <- knn(train_q, test = train_q, cl = train$income, k = 10)
confusion <- tally(income_knn ~ income, data = train, format = "count")
confusion
## income
## income_knn <=50K >50K
## <=50K 18875 2997
## >50K 857 3320
sum(diag(confusion)) / nrow(train)
## [1] 0.85205
knn_error_rate <- function(x, y, numNeighbors, z = x, y_z = y) {
  # x, y: training predictors and labels; z, y_z: evaluation set and labels
  y_hat <- knn(train = x, test = z, cl = y, k = numNeighbors)
  return(sum(y_hat != y_z) / nrow(z))
}
With the defaults z = x and y_z = y this returns the training error rate; supplying a test set and its labels (as we do below) gives the test error rate.
ks <- c(1:15, 20, 30, 40, 50)
train_rates <- sapply(ks, FUN = knn_error_rate, x = train_q, y = train$income)
knn_error_rates <- data.frame(k = ks, train_rate = train_rates)
ggplot(data = knn_error_rates, aes(x = k, y = train_rate)) +
geom_point() + geom_line() + ylab("Misclassification Rate")
library(e1071)
mod_nb <- naiveBayes(form, data = train)
income_nb <- predict(mod_nb, newdata = train)
confusion <- tally(income_nb ~ income, data = train, format = "count")
confusion
## income
## income_nb <=50K >50K
## <=50K 18587 3605
## >50K 1145 2712
sum(diag(confusion)) / nrow(train)
## [1] 0.81765
library(nnet)
# a feed-forward neural network with a single hidden layer of 5 nodes
mod_nn <- nnet(form, data = train, size = 5)
## # weights: 296
## initial value 21842.825468
## iter 10 value 13198.315933
## iter 20 value 11190.055832
## iter 30 value 10252.441741
## iter 40 value 9937.073100
## iter 50 value 9591.448419
## iter 60 value 9319.908227
## iter 70 value 9062.177126
## iter 80 value 8918.313144
## iter 90 value 8826.858128
## iter 100 value 8729.189597
## final value 8729.189597
## stopped after 100 iterations
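Note that the optimizer hit its default cap of 100 iterations while the objective was still decreasing, so the fit may improve if we let it run longer (a sketch; maxit is nnet's iteration limit):
# allow the optimizer more iterations before stopping
mod_nn2 <- nnet(form, data = train, size = 5, maxit = 500)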
income_nn <- predict(mod_nn, newdata = train, type = "class")
confusion <- tally(income_nn ~ income, data = train, format = "count")
confusion
## income
## income_nn <=50K >50K
## <=50K 17871 2128
## >50K 1861 4189
sum(diag(confusion)) / nrow(train)
## [1] 0.84687
# ensemble prediction: majority vote among the three classifiers
income_ensemble <- ifelse((income_knn == " >50K") +
  (income_nb == " >50K") +
  (income_nn == " >50K") >= 2, " >50K", " <=50K")
confusion <- tally(income_ensemble ~ income, data = train, format = "count")
confusion
## income
## income_ensemble <=50K >50K
## <=50K 18790 3039
## >50K 942 3278
sum(diag(confusion)) / nrow(train)
## [1] 0.84717
income_probs <- mod_nb %>%
predict(newdata = train, type = "raw") %>%
as.data.frame()
head(income_probs, 3)
## <=50K >50K
## 1 0.98882 0.01117538
## 2 0.84292 0.15707605
## 3 0.99973 0.00026597
names(income_probs)
## [1] " <=50K" " >50K"
tally(~` >50K` > 0.5, data = income_probs, format = "percent")
## ` >50K` > 0.5
## TRUE FALSE
## 14.807 85.193
# lowering the threshold toward the observed prevalence flags more positives
tally(~` >50K` > 0.24, data = income_probs, format = "percent")
## ` >50K` > 0.24
## TRUE FALSE
## 19.939 80.061
pred <- ROCR::prediction(income_probs[,2], train$income)
perf <- ROCR::performance(pred, 'tpr', 'fpr')
class(perf) # can also plot(perf)
## [1] "performance"
## attr(,"package")
## [1] "ROCR"
perf_df <- data.frame(perf@x.values, perf@y.values)
names(perf_df) <- c("fpr", "tpr")
roc <- ggplot(data = perf_df, aes(x = fpr, y = tpr)) +
geom_line(color="blue") + geom_abline(intercept=0, slope=1, lty=3) +
ylab(perf@y.name) + xlab(perf@x.name)
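The same ROCR objects also give the area under the curve (a sketch; pred is the naive Bayes prediction object built above):
# AUC for the naive Bayes classifier
auc <- ROCR::performance(pred, "auc")
unlist(auc@y.values)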
confusion <- tally(income_nb ~ income, data = train, format = "count")
confusion
## income
## income_nb <=50K >50K
## <=50K 18587 3605
## >50K 1145 2712
sum(diag(confusion)) / nrow(train)
## [1] 0.81765
tpr <- confusion[" >50K", " >50K"] / sum(confusion[, " >50K"])
fpr <- confusion[" >50K", " <=50K"] / sum(confusion[, " <=50K"])
roc + geom_point(x = fpr, y = tpr, size = 3)
test_q <- test %>%
select(age, education.num, capital.gain, capital.loss, hours.per.week)
test_rates <- sapply(ks, FUN = knn_error_rate, x = train_q,
  y = train$income, z = test_q, y_z = test$income)
knn_error_rates <- knn_error_rates %>% mutate(test_rate = test_rates)
library(tidyr)
knn_error_rates_tidy <- knn_error_rates %>%
gather(key = "type", value = "error_rate", -k)
ggplot(data = knn_error_rates_tidy, aes(x = k, y = error_rate)) +
geom_point(aes(color = type)) + geom_line(aes(color = type)) +
ylab("Misclassification Rate")
# sanity check: the test set's distribution should resemble the training set's
favstats(~ capital.gain, data = train)
## min Q1 median Q3 max mean sd n missing
## 0 0 0 0 99999 1084.4 7428.6 26049 0
favstats(~ capital.gain, data = test)
## min Q1 median Q3 max mean sd n missing
## 0 0 0 0 99999 1050.8 7210.1 6512 0
# the null model, expressed as an intercept-only logistic regression
mod_null <- glm(income ~ 1, data = train, family = binomial)
mods <- list(mod_null, mod_tree, mod_forest, mod_nn, mod_nb)
lapply(mods, class)
## [[1]]
## [1] "glm" "lm"
##
## [[2]]
## [1] "rpart"
##
## [[3]]
## [1] "randomForest.formula" "randomForest"
##
## [[4]]
## [1] "nnet.formula" "nnet"
##
## [[5]]
## [1] "naiveBayes"
predict_methods <- methods("predict")
predict_methods[grepl(pattern = "(glm|rpart|randomForest|nnet|naive)",
predict_methods)]
## [1] "predict.glm" "predict.glmmPQL" "predict.glmtree"
## [4] "predict.naiveBayes" "predict.nnet" "predict.randomForest"
## [7] "predict.rpart"
# collect each model's training-set predictions alongside the true labels
predictions_train <- data.frame(
y = as.character(train$income),
type = "train",
mod_null = predict(mod_null, type = "response"),
mod_tree = predict(mod_tree, type = "class"),
mod_forest = predict(mod_forest, type = "class"),
mod_nn = predict(mod_nn, type = "class"),
mod_nb = predict(mod_nb, newdata = train, type = "class"))
predictions_test <- data.frame(
y = as.character(test$income),
type = "test",
mod_null = predict(mod_null, newdata = test, type = "response"),
mod_tree = predict(mod_tree, newdata = test, type = "class"),
mod_forest = predict(mod_forest, newdata = test, type = "class"),
mod_nn = predict(mod_nn, newdata = test, type = "class"),
mod_nb = predict(mod_nb, newdata = test, type = "class"))
predictions <- bind_rows(predictions_train, predictions_test)
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
glimpse(predictions)
## Observations: 32,561
## Variables: 7
## $ y <fctr> <=50K, <=50K, <=50K, <=50K, <=50K, <=50K, <...
## $ type <chr> "train", "train", "train", "train", "train", "train...
## $ mod_null <dbl> 0.2425, 0.2425, 0.2425, 0.2425, 0.2425, 0.2425, 0.2...
## $ mod_tree <fctr> <=50K, >50K, <=50K, <=50K, >50K, >50K, <=50...
## $ mod_forest <fctr> <=50K, <=50K, <=50K, <=50K, >50K, >50K, <=5...
## $ mod_nn <fctr> >50K, <=50K, <=50K, <=50K, >50K, >50K, <=50...
## $ mod_nb <fctr> <=50K, <=50K, <=50K, <=50K, <=50K, <=50K, <...
predictions_tidy <- predictions %>%
mutate(mod_null = ifelse(mod_null < 0.5, " <=50K", " >50K")) %>%
gather(key = "model", value = "y_hat", -type, -y)
## Warning: attributes are not identical across measure variables; they will
## be dropped
glimpse(predictions_tidy)
## Observations: 162,805
## Variables: 4
## $ y <fctr> <=50K, <=50K, <=50K, <=50K, <=50K, <=50K, <=50K,...
## $ type <chr> "train", "train", "train", "train", "train", "train", "t...
## $ model <chr> "mod_null", "mod_null", "mod_null", "mod_null", "mod_nul...
## $ y_hat <chr> " <=50K", " <=50K", " <=50K", " <=50K", " <=50K", " <=50...
predictions_summary <- predictions_tidy %>%
group_by(model, type) %>%
summarize(N = n(), correct = sum(y == y_hat),
positives = sum(y == " >50K"),
true_pos = sum(y_hat == " >50K" & y == y_hat),
false_pos = sum(y_hat == " >50K" & y != y_hat)) %>%
mutate(accuracy = correct / N,
tpr = true_pos / positives,
fpr = false_pos / (N - positives)) %>%
ungroup() %>%
gather(val_type, val, -model, -type) %>%
unite(temp1, type, val_type, sep = "_") %>% # glue variables
spread(temp1, val) %>%
arrange(desc(test_accuracy)) %>%
select(model, train_accuracy, test_accuracy, test_tpr, test_fpr)
predictions_summary
## # A tibble: 5 x 5
## model train_accuracy test_accuracy test_tpr test_fpr
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 mod_forest 0.86694 0.86364 0.64436 0.069366
## 2 mod_tree 0.84449 0.84459 0.50459 0.051524
## 3 mod_nn 0.84687 0.84413 0.65157 0.097033
## 4 mod_nb 0.81765 0.82217 0.41929 0.054731
## 5 mod_null 0.75750 0.76597 0.00000 0.000000
# each model's predict method uses a different type argument for probabilities
outputs <- c("response", "prob", "prob", "raw", "raw")
roc_test <- mapply(predict, mods, type = outputs,
    MoreArgs = list(newdata = test)) %>%
  as.data.frame() %>%
  select(1, 3, 5, 6, 8)  # keep the >50K probability column for each model
names(roc_test) <-
c("mod_null", "mod_tree", "mod_forest", "mod_nn", "mod_nb")
glimpse(roc_test)
## Observations: 6,512
## Variables: 5
## $ mod_null <dbl> 0.2425, 0.2425, 0.2425, 0.2425, 0.2425, 0.2425, 0.2...
## $ mod_tree <dbl> 0.299733, 0.727960, 0.299733, 0.299733, 0.299733, 0...
## $ mod_forest <dbl> 0.5920398, 0.5771144, 0.0945274, 0.0348259, 0.59701...
## $ mod_nn <dbl> 0.669130, 0.649079, 0.249087, 0.166118, 0.695224, 0...
## $ mod_nb <dbl> 1.8642e-01, 2.6567e-01, 4.8829e-02, 3.5218e-02, 4.2...
get_roc <- function(x, y) {
pred <- ROCR::prediction(x$y_hat, y)
perf <- ROCR::performance(pred, 'tpr', 'fpr')
perf_df <- data.frame(perf@x.values, perf@y.values)
names(perf_df) <- c("fpr", "tpr")
return(perf_df)
}
roc_tidy <- roc_test %>%
gather(key = "model", value = "y_hat") %>%
group_by(model) %>%
dplyr::do(get_roc(., y = test$income))
ggplot(data = roc_tidy, aes(x = fpr, y = tpr)) +
geom_line(aes(color = model)) +
geom_abline(intercept = 0, slope = 1, lty = 3) +
ylab(perf@y.name) + xlab(perf@x.name) +
geom_point(data = predictions_summary, size = 3,
aes(x = test_fpr, y = test_tpr, color = model))