TitanicDataAnalysis4_Model_Cross_Validation.R
####################################################
# Model Cross Validation and Fine Tune
####################################################
# This file contains Cross Validation
# 1. CV-tree models
# 2. CV-random forest models
#
##### Predictor selection
library(caret)
library(rpart)
library(rpart.plot)
#read Re-engineered dataset
RE_data <- read.csv("RE_data.csv", header = TRUE)
#Factorize response variable
RE_data$Survived <- factor(RE_data$Survived)
#Separate Train and test data.
train <- RE_data[1:891, ]
test <- RE_data[892:1309, ]
# Setup model's train and valid dataset
set.seed(1000)
samp <- sample(nrow(train), 0.8 * nrow(train))
trainData <- train[samp, ]
validData <- train[-samp, ]
# set random for reproduction
set.seed(3214)
# specify parameters for cross validation
control <- trainControl(method = "repeatedcv",
number = 10, # number of folds
repeats = 5, # repeat times
search = "grid")
###############################################
# CV on Tree models
###############################################
### CV on tree model2
set.seed(1010)
# Create model from cross validation train data
# Tree_model2_cv <- train(Survived ~ Sex + Pclass + HasCabinNum + Deck + Fare_pp,
# data = trainData,
# method = "rpart",
# trControl = control)
# Due to the computation cost once a model is trained.
# it is better to save it and load later rather than compute a gain
#save(Tree_model2_cv, file = "Tree_model2_cv.rda")
load("Tree_model2_cv.rda")
# Show CV model's details
print.train(Tree_model2_cv)
# CV model's estimated accuracy
model_accuracy <- Tree_model2_cv$results$Accuracy[1]
paste("Estimated accuracy:", format(model_accuracy, digits = 4))
# Visualize cross validation tree
rpart.plot(Tree_model2_cv$finalModel, extra=4)
plot.train(Tree_model2_cv)
# Record the model's accuracy on *trainData*, *validData*,
# and *test* dataset. Remember *trainData* and *validData*
# are randomly partitioned from the train dataset.
### Access accuracy on different datasets
# prediction's Confusion Matrix on the trainData
predict_train <-predict(Tree_model2_cv, trainData)
conMat <- confusionMatrix(predict_train, trainData$Survived)
conMat$table
# prediction's Accuracy on the trainData
predict_train_accuracy <- conMat$overall["Accuracy"]
predict_train_accuracy
# prediction's Confusion Matrix on the validData
predict_valid <-predict(Tree_model2_cv, validData)
conMat <- confusionMatrix(predict_valid, validData$Survived)
conMat$table
# prediction's Accuracy on the validData
predict_valid_accuracy <- conMat$overall["Accuracy"]
predict_valid_accuracy
# predict on test
predict_test <-predict(Tree_model2_cv, test)
submit <- data.frame(PassengerId = test$PassengerId, Survived = as.factor(predict_test))
write.csv(submit, file = "Tree_model2_CV.CSV", row.names = FALSE)
# test accuracy 0.75837
paste("Test Accuracy:", 0.7584)
# accumulate model's accuracy
name <- c("Esti Accu", "Train Accu", "Valid Accu", "Test Accu")
Tree_model2_CV_accuracy <- c(model_accuracy, predict_train_accuracy, predict_valid_accuracy, 0.7584)
names(Tree_model2_CV_accuracy) <- name
Tree_model2_CV_accuracy
### CV on model3
#
set.seed(1234)
# Tree_model3_cv <- train(Survived ~ Sex + Fare_pp + Pclass + Title + Age_group + Group_size + Ticket_class + Embarked,
#
# data = trainData,
# method = "rpart",
# trControl = control)
#
# save(Tree_model3_cv, file = "Tree_model3_cv.rda")
load("Tree_model3_cv.rda")
# show model details
print.train(Tree_model3_cv)
# CV model's estimated accuracy
model_accuracy <- Tree_model3_cv$results$Accuracy[1]
paste("Estimated accuracy:", format(model_accuracy, digits = 4))
#Visualize cross validation tree
rpart.plot(Tree_model3_cv$finalModel, extra=4)
plot.train(Tree_model3_cv)
### Access accuracy on different datasets
# prediction's Confusion Matrix on the trainData
predict_train <-predict(Tree_model3_cv, trainData)
conMat <- confusionMatrix(predict_train, trainData$Survived)
conMat$table
# prediction's Accuracy on the trainData
predict_train_accuracy <- format(conMat$overall["Accuracy"], digits=4)
paste("trainData Accuracy:", predict_train_accuracy)
# prediction's Confusion Matrix on the validData
predict_valid <-predict(Tree_model3_cv, validData)
conMat <- confusionMatrix(predict_valid, validData$Survived)
conMat$table
# prediction's Accuracy on the validData
predict_valid_accuracy <- format(conMat$overall["Accuracy"], digits=4)
paste("validData Accuracy:", predict_valid_accuracy)
#predict on test
predict_test <-predict(Tree_model3_cv, test)
submit <- data.frame(PassengerId = test$PassengerId, Survived = as.factor(predict_test))
write.csv(submit, file = "Tree_model3_CV.CSV", row.names = FALSE)
## test accuracy is 0.77751
paste("Test Accuracy:", 0.7775)
# accumulate model's accuracy
Tree_model3_CV_accuracy <- c(model_accuracy, predict_train_accuracy, predict_valid_accuracy, 0.7775)
names(Tree_model3_CV_accuracy) <- name
Tree_model3_CV_accuracy
##############################################
### Cross Validation on Random Forest Models
##############################################
# Random Forest model RF_model1_cv
# set seed for reproduction
set.seed(2307)
# RF_model1_cv <- train(Survived ~ Sex + Pclass + HasCabinNum + Deck + Fare_pp,
# data = trainData,
# method = "rf",
# trControl = control)
# save(RF_model1_cv, file = "RF_model1_cv.rda")
load("RF_model1_cv.rda")
# Show CV mdoel's details
print(RF_model1_cv)
print(RF_model1_cv$results)
# Record model's accuracy
model_accuracy <- format(RF_model1_cv$results$Accuracy[2], digits = 4)
paste("Estimated accuracy:", model_accuracy)
### Access accuracy on different datasets
# prediction's Confusion Matrix on the trainData
predict_train <-predict(RF_model1_cv, trainData)
conMat <- confusionMatrix(predict_train, trainData$Survived)
conMat$table
# prediction's Accuracy on the trainData
predict_train_accuracy <- format(conMat$overall["Accuracy"], digits=4)
paste("trainData Accuracy:", predict_train_accuracy)
# prediction's Confusion Matrix on the validData
predict_valid <-predict(RF_model1_cv, validData)
conMat <- confusionMatrix(predict_valid, validData$Survived)
conMat$table
# prediction's Accuracy on the validData
predict_valid_accuracy <- format(conMat$overall["Accuracy"], digits=4)
paste("validData Accuracy:", predict_valid_accuracy)
# predict on test
predict_test <-predict(RF_model1_cv, test)
submit <- data.frame(PassengerId = test$PassengerId, Survived = as.factor(predict_test))
write.csv(submit, file = "RF_model1_CV.CSV", row.names = FALSE)
## test accuracy 0.74641
paste("Test Accuracy:", 0.7464)
# accumulate model's accuracy
RF_model1_cv_accuracy <- c(model_accuracy, predict_train_accuracy, predict_valid_accuracy, 0.7464)
names(RF_model1_cv_accuracy) <- name
RF_model1_cv_accuracy
###
# Random Forest model RF_model2_cv
##
# set seed for reproduction
set.seed(2300)
# RF_model2_cv <- train(Survived ~ Sex + Fare_pp + Pclass + Title + Age_group + Group_size + Ticket_class + Embarked,
# data = trainData,
# method = "rf",
# trControl = control)
# # This model will be used in chapter 12. so it is saved into a file for late to be loaded
# save(RF_model2_cv, file = "RF_model2_cv.rda")
load("RF_model2_cv.rda")
# Show CV mdoel's details
print(RF_model2_cv)
print(RF_model2_cv$results)
# Record model's accuracy
mode2_accuracy <- format(RF_model2_cv$results$Accuracy[2], digits = 4)
paste("Estimated accuracy:", mode2_accuracy)
### Access accuracy on different datasets
# prediction's Confusion Matrix on the trainData
predict_train <-predict(RF_model2_cv, trainData)
conMat <- confusionMatrix(predict_train, trainData$Survived)
conMat$table
# prediction's Accuracy on the trainData
predict_train_accuracy <- format(conMat$overall["Accuracy"], digits=4)
paste("trainData Accuracy:", predict_train_accuracy)
# prediction's Confusion Matrix on the validData
predict_valid <-predict(RF_model2_cv, validData)
conMat <- confusionMatrix(predict_valid, validData$Survived)
conMat$table
# prediction's Accuracy on the validData
predict_valid_accuracy <- format(conMat$overall["Accuracy"], digits=4)
paste("validData Accuracy:", predict_valid_accuracy)
#predict on test
predict_test <-predict(RF_model2_cv, test)
submit <- data.frame(PassengerId = test$PassengerId, Survived = as.factor(predict_test))
write.csv(submit, file = "RF_model2_CV.CSV", row.names = FALSE)
## test accuracy 0.75119
paste("Test Accuracy:", 0.7512)
# accumulate model's accuracy
RF_model2_cv_accuracy <- c(mode2_accuracy, predict_train_accuracy, predict_valid_accuracy, 0.7512)
names(RF_model2_cv_accuracy) <- name
RF_model2_cv_accuracy
### make a comparison among tree and random forest models
library(tidyr)
Model <- c("Tree_M2","Tree_M3","RF_model1","RF_model2")
# Show individual models' accuracy
Tree_model2_CV_accuracy
Tree_model3_CV_accuracy
RF_model1_cv_accuracy
RF_model2_cv_accuracy
#preparee for table construction
Pre <- c("Sex, Pclass, HasCabinNum, Deck, Fare_pp", "Sex, Fare_pp, Pclass, Title, Age_group, Group_size, Ticket_class, Embarked", "Sex, Pclass, HasCabinNum, Deck, Fare_pp", "Sex, Fare_pp, Pclass, Title, Age_group, Group_size, Ticket_class, Embarked")
#
Learn <- c(as.numeric(Tree_model2_CV_accuracy[1])*100, as.numeric(Tree_model3_CV_accuracy[1])*100, as.numeric(RF_model1_cv_accuracy[1])*100, as.numeric(RF_model2_cv_accuracy[1])*100)
#
Train <- c(as.numeric(Tree_model2_CV_accuracy[2])*100, as.numeric(Tree_model3_CV_accuracy[2])*100, as.numeric(RF_model1_cv_accuracy[2])*100, as.numeric(RF_model2_cv_accuracy[2])*100)
#
Valid <- c(as.numeric(Tree_model2_CV_accuracy[3])*100, as.numeric(Tree_model3_CV_accuracy[3])*100, as.numeric(RF_model1_cv_accuracy[3])*100, as.numeric(RF_model2_cv_accuracy[3])*100)
#
Test <- c(as.numeric(Tree_model2_CV_accuracy[4])*100, as.numeric(Tree_model3_CV_accuracy[4])*100, as.numeric(RF_model1_cv_accuracy[4])*100, as.numeric(RF_model2_cv_accuracy[4])*100)
# Construct Dataframe for table and plot
df1 <- data.frame(Model, Pre, Learn, Train, Valid, Test)
df2 <- data.frame(Model, Learn, Train, Valid, Test)
# show in table
knitr::kable(df1, longtable = TRUE, booktabs = TRUE, digits = 2, col.names =c("Models", "Predictors", "Accuracy on Learn", "Accuracy on Train", "Accuracy on Valid", "Accuracy on Test"),
caption = 'The Comparision among 4 CV models'
)
# plot results in bar chat
df.long <- gather(df2, Dataset, Accuracy, -Model, factor_key =TRUE)
ggplot(data = df.long, aes(x = Model, y = Accuracy, fill = Dataset)) +
geom_col(position = position_dodge())
#############################################
## Multiple Models Comparison
#############################################
#
### Regression Model for Titanic
LR_Model <- glm(formula = Survived ~ Pclass + Title + Sex + Age_group + Group_size + Ticket_class + Fare_pp + Embarked, family = binomial, data = trainData)
#summary(LR_Model_CV)
### Validate on trainData
Valid_trainData <- predict(LR_Model, newdata = trainData, type = "response") #prediction threshold
Valid_trainData <- ifelse(Valid_trainData > 0.5, 1, 0) # set binary
#produce confusion matrix
confusion_Mat<- confusionMatrix(as.factor(trainData$Survived),as.factor(Valid_trainData))
# accuracy on traindata
Regression_Acc_Train <- round(confusion_Mat$overall["Accuracy"]*100,2)
paste('Model Train Accuracy =', Regression_Acc_Train)
### Validate on validData
validData_Survived_predicted <- predict(LR_Model, newdata = validData, type = "response")
validData_Survived_predicted <- ifelse(validData_Survived_predicted > 0.5, 1, 0) # set binary prediction threshold
conMat<- confusionMatrix(as.factor(validData$Survived),as.factor(validData_Survived_predicted))
Regression_Acc_Valid <-round(conMat$overall["Accuracy"]*100,2)
paste('Model Valid Accuracy =', Regression_Acc_Valid)
### produce a prediction on test data
library(pROC)
auc(roc(trainData$Survived,Valid_trainData)) # calculate AUROC curve
#predict on test
test$Survived <- predict(LR_Model, newdata = test, type = "response")
test$Survived <- ifelse(test$Survived > 0.5, 1, 0) # set binary prediction threshold
submit <- data.frame(PassengerId = test$PassengerId, Survived = as.factor(test$Survived))
write.csv(submit, file = "LG_model1_CV.CSV", row.names = FALSE)
# Kaggle test accuracy score:0.76555
# record accuracy
Regr_Acc <- c(Regression_Acc_Train, Regression_Acc_Valid, 0.76555)
acc_names <- c("Train Accu", "Valid Accu", "Test Accu")
names(Regr_Acc) <- acc_names
Regr_Acc
### Support Vector Machine Model for Titanic
#load library
library(e1071)
# fit the model using default parameters
SVM_model<- svm(Survived ~ Pclass + Title + Sex + Age_group + Group_size + Ticket_class + Fare_pp + Deck + HasCabinNum + Embarked, data=trainData, kernel = 'radial', type="C-classification")
#summary(SVM_model)
### Validate on trainData
Valid_trainData <- predict(SVM_model, trainData)
#produce confusion matrix
confusion_Mat<- confusionMatrix(as.factor(trainData$Survived), as.factor(Valid_trainData))
# output accuracy
AVM_Acc_Train <- round(confusion_Mat$overall["Accuracy"]*100,4)
paste('Model Train Accuracy =', AVM_Acc_Train)
### Validate on validData
validData_Survived_predicted <- predict(SVM_model, validData) #produce confusion matrix
conMat<- confusionMatrix(as.factor(validData$Survived), as.factor(validData_Survived_predicted))
# output accuracy
AVM_Acc_Valid <- round(conMat$overall["Accuracy"]*100,4)
paste('Model Valid Accuracy =', AVM_Acc_Valid)
### make prediction on test
# SVM failed to produce a prediction on test because test has Survived col and it has value NA. A work around is assign it with a num like 1.
test$Survived <-1
# predict results on test
Survived <- predict(SVM_model, test)
solution <- data.frame(PassengerId=test$PassengerId, Survived =Survived)
write.csv(solution, file = 'svm_predicton.csv', row.names = F)
# prediction accuracy on test
SVM_Acc <- c(AVM_Acc_Train, AVM_Acc_Valid, 0.78947)
names(SVM_Acc) <- acc_names
# print out
SVM_Acc
### Neural Network Models
# load library
library(nnet)
# train the model
xTrain = train[ , c("Survived", "Pclass","Title", "Sex","Age_group","Group_size", "Ticket_class", "Fare_pp", "Deck", "HasCabinNum", "Embarked")]
NN_model1 <- nnet(Survived ~ ., data = xTrain, size=10, maxit=500, trace=FALSE)
#How do we do on the training data?
nn_pred_train_class = predict(NN_model1, xTrain, type="class" ) # yields "0", "1"
nn_train_pred = as.numeric(nn_pred_train_class ) #transform to 0, 1
confusion_Mat<-confusionMatrix(as.factor(nn_train_pred), train$Survived)
# output accuracy
NN_Acc_Train <- round(confusion_Mat$overall["Accuracy"]*100,4)
paste('Model Train Accuracy =', NN_Acc_Train)
#How do we do on the valid data?
nn_pred_valid_class = predict(NN_model1, validData, type="class" ) # yields "0", "1"
nn_valid_pred = as.numeric(nn_pred_valid_class ) #transform to 0, 1
confusion_Mat<-confusionMatrix(as.factor(nn_valid_pred), validData$Survived)
# output accuracy
NN_Acc_Valid <- round(confusion_Mat$overall["Accuracy"]*100,4)
paste('Model valid Accuracy =', NN_Acc_Valid)
#make a prediction on test data
nn_pred_test_class = predict(NN_model1, test, type="class" ) # yields "0", "1"
nn_pred_test = as.numeric(nn_pred_test_class ) #transform to 0, 1
solution <- data.frame(PassengerId=test$PassengerId, Survived = nn_pred_test)
write.csv(solution, file = 'NN_predicton.csv', row.names = F)
###
# 0.8934,0.8547, 0.71052
NN_Acc <- c(NN_Acc_Train, NN_Acc_Valid, 0.71052)
names(NN_Acc) <- acc_names
NN_Acc
### Comparision among Different Models
library(tidyr)
Model <- c("Regression","SVM","NN", "Decision tree", "Random Forest")
Train <- c(Regression_Acc_Train, AVM_Acc_Train, NN_Acc_Train, 82.72, 83.16)
Valid <- c(Regression_Acc_Valid, AVM_Acc_Valid, NN_Acc_Valid, 81.01, 92)
Test <- c(76.56, 78.95, 71.05, 77.75, 78.95)
df1 <- data.frame(Model, Train, Valid, Test)
# show in table
knitr::kable(df1, longtable = TRUE, booktabs = TRUE, digits = 2, col.names =c("Models", "Accuracy on Train", "Accuracy on Valid","Accuracy on Test"),
caption = 'The Comparision among 3 Machine Learning Models'
)
# plot in bar chat
df.long <- gather(df1, Dataset, Accuracy, -Model, factor_key =TRUE)
ggplot(data = df.long, aes(x = Model, y = Accuracy, fill = Dataset)) +
geom_col(position = position_dodge())
### END 4 Model Cross Validation #############################################