#=======================================================================================
#
# File: IntroToMachineLearning.R
# Author: Dave Langer
# Description: This code illustrates the usage of the caret package for the "An
#              Introduction to Machine Learning with R and Caret" Meetup dated
#              06/07/2017. More details on the Meetup are available at:
#
# https://www.meetup.com/data-science-dojo/events/239730653/
#
# NOTE - This file is provided "As-Is" and no warranty regarding its contents is
#        offered or implied. USE AT YOUR OWN RISK!
#
#=======================================================================================
#install.packages(c("e1071", "caret", "doSNOW", "ipred", "xgboost"))
library(caret)
library(doSNOW)
#=================================================================
# Load Data
#=================================================================
train <- read.csv("train.csv", stringsAsFactors = FALSE)
View(train)
#=================================================================
# Data Wrangling
#=================================================================
# Replace missing Embarked values with the mode ("S" is the most common).
table(train$Embarked)
train$Embarked[train$Embarked == ""] <- "S"
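# Sanity check (a minimal sketch, not part of the original walkthrough):
# confirm programmatically that "S" is the most frequent Embarked value
# rather than relying on eyeballing the table above.
names(which.max(table(train$Embarked)))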
# Add a feature for tracking missing ages.
summary(train$Age)
train$MissingAge <- ifelse(is.na(train$Age),
                           "Y", "N")
# Add a feature for family size.
train$FamilySize <- 1 + train$SibSp + train$Parch
# Set up factors.
train$Survived <- as.factor(train$Survived)
train$Pclass <- as.factor(train$Pclass)
train$Sex <- as.factor(train$Sex)
train$Embarked <- as.factor(train$Embarked)
train$MissingAge <- as.factor(train$MissingAge)
# Subset data to features we wish to keep/use.
features <- c("Survived", "Pclass", "Sex", "Age", "SibSp",
              "Parch", "Fare", "Embarked", "MissingAge",
              "FamilySize")
train <- train[, features]
str(train)
#=================================================================
# Impute Missing Ages
#=================================================================
# Caret supports a number of mechanisms for imputing (i.e.,
# predicting) missing values. Leverage bagged decision trees
# to impute missing values for the Age feature.
# First, transform all features to dummy variables.
dummy.vars <- dummyVars(~ ., data = train[, -1])
train.dummy <- predict(dummy.vars, train[, -1])
View(train.dummy)
# Now, impute!
pre.process <- preProcess(train.dummy, method = "bagImpute")
imputed.data <- predict(pre.process, train.dummy)
View(imputed.data)
# Column 6 of the imputed matrix is the Age feature; referencing it
# by name is more robust than by position.
train$Age <- imputed.data[, "Age"]
View(train)
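# Quick check (sketch): the bagged-tree imputation should leave no
# missing values in Age.
stopifnot(!any(is.na(train$Age)))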
#=================================================================
# Split Data
#=================================================================
# Use caret to create a 70/30 split of the training data,
# keeping the proportions of the Survived class label the
# same across splits.
set.seed(54321)
indexes <- createDataPartition(train$Survived,
                               times = 1,
                               p = 0.7,
                               list = FALSE)
titanic.train <- train[indexes,]
titanic.test <- train[-indexes,]
# Examine the proportions of the Survived class label across
# the datasets.
prop.table(table(train$Survived))
prop.table(table(titanic.train$Survived))
prop.table(table(titanic.test$Survived))
#=================================================================
# Train Model
#=================================================================
# Set up caret to perform 10-fold cross-validation repeated 3
# times and to use a grid search for optimal model hyperparameter
# values.
train.control <- trainControl(method = "repeatedcv",
                              number = 10,
                              repeats = 3,
                              search = "grid")
# Leverage a grid search of hyperparameters for xgboost. See
# the following presentation for more information:
# https://www.slideshare.net/odsc/owen-zhangopen-sourcetoolsanddscompetitions1
tune.grid <- expand.grid(eta = c(0.05, 0.075, 0.1),
                         nrounds = c(50, 75, 100),
                         max_depth = 6:8,
                         min_child_weight = c(2.0, 2.25, 2.5),
                         colsample_bytree = c(0.3, 0.4, 0.5),
                         gamma = 0,
                         subsample = 1)
View(tune.grid)
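# For a sense of the compute cost (a sketch): the grid above expands to
# 3 * 3 * 3 * 3 * 3 = 243 hyperparameter combinations, and each is
# evaluated on 10 folds x 3 repeats = 30 resamples, so caret will fit
# roughly 243 * 30 = 7,290 models.
nrow(tune.grid) * 10 * 3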
# Use the doSNOW package to enable caret to train in parallel.
# While there are many package options in this space, doSNOW
# has the advantage of working on both Windows and Mac OS X.
#
# Create a socket cluster using 10 processes.
#
# NOTE - Tune this number based on the number of cores/threads
# available on your machine!!!
#
cl <- makeCluster(10, type = "SOCK")
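# Alternatively (a sketch, not part of the original Meetup code), size
# the cluster from the detected core count, leaving one core free for
# the OS:
# cl <- makeCluster(parallel::detectCores() - 1, type = "SOCK")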
# Register cluster so that caret will know to train in parallel.
registerDoSNOW(cl)
# Train the xgboost model using 10-fold CV repeated 3 times
# and a hyperparameter grid search to find the optimal model.
caret.cv <- train(Survived ~ .,
                  data = titanic.train,
                  method = "xgbTree",
                  tuneGrid = tune.grid,
                  trControl = train.control)
stopCluster(cl)
# Examine caret's processing results
caret.cv
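# Optional (sketch): inspect the winning hyperparameter combination and
# visualize accuracy across the tuning grid.
caret.cv$bestTune
plot(caret.cv)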
# Make predictions on the test set using an xgboost model
# trained on all 625 rows of the training set using the
# optimal hyperparameter values found by the grid search.
preds <- predict(caret.cv, titanic.test)
# Use caret's confusionMatrix() function to estimate the
# effectiveness of this model on unseen, new data.
confusionMatrix(preds, titanic.test$Survived)
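# Optional (sketch): caret's varImp() shows which features the tuned
# xgboost model found most predictive.
varImp(caret.cv)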
# Happy learning!!