TitanicDataAnalysis1_Understand_Data.R

############################################################################
# Copyright 2020 Gangmin Li
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The base of this code was from Dave Langer "Intoduction to Data Science."
# Credit to him. You can find him on Github: https://github.com/EasyD
# I have also integrated other sources like Titanic Forum, Python code
# and some Chinese R code.
#
# Notably,
# https://www.kaggle.com/startupsci/titanic-data-science-solutions
#
# The whole purpose of this is to teach My students on Data Science.
#
# This R source code file corresponds to video 1 of the YouTube series
# "Introduction to Data Science with R" located at the following URL:
#     http://www.youtube.com/watch?v=32o0DnuRjfg
#
# The task is to build a model based on the train data
# then to predict test data who can survive in Kaggle Titanic competition
##########################################################################
# 1. Understand Data
##########################################################################

### Load raw data

train <- read.csv("train.csv", header = TRUE)
test <- read.csv("test.csv", header = TRUE)

# RStudio help function
# what help with any R function just type ?
# for example ?read.csv in Console

# First explore of datasets
# the first R code you can use is str.
# use ?str to find more

help(str)

# use str to explore train and test
str(train)
str(test)

# Add a "Survived" variable to the test set to allow for combining data sets
test <- data.frame(Survived = rep("NA", nrow(test)), test[,])

# Now check test. you can see that it has 12 variable now.
# And Survived is the first variable. because we used test[,]
# if you want add Survived into the second position Do this,
# test <- data.frame(test[1], Survived = rep("NA", nrow(test)), test[,2:ncol(test)])

# Combine datasets together. actually append test to train
data <- rbind(train, test)
# We may need to keep the raw data into a file in case we need it later.
write.csv(data, "data.cvs", row.names = FALSE )

#### We have done combined datasest with only a few line of code
# notice that the type of Survived has been changed to Chr.
# This is because we used "NA" as its value
# A bit about R data types
# ?str structure of dataset
# chr, int
# Factor in R is 'category'. it likes a selection from a list.
# for example, Cabin Factor w/187 levels. It means there are 187 selections.
# Sex Factor w/ 2 "female", "male", 2,1 means, two options 2- female, 1- male.
#
# NA : not available (absent value, missing value)

str(data)

# Exam PassengerID, type INT, we can check total number and the number of unique values.
# If they are equal and both equal to the number of records. it means there are
# unique and has no missing value.
length(data$PassengerId)
length(unique(data$PassengerId))

### Exam Survived
data$Survived <- as.factor(data$Survived)
table(data$Survived)

# Calculate the survive rate in train data is 38% and the death rate is 61.61%
prop.table(table(as.factor(train$Survived)))

### Examine Pclass value,
# Look into Kaggle's explanation about Pclass: it is a proxy for social class i.e. rich or poor
# It should be factor and it does not make sense to stay in int.
data$Pclass <- as.factor(data$Pclass)
test$Pclass <- as.factor(test$Pclass)
train$Pclass <- as.factor(train$Pclass)

# Distribution across classes
table(data$Pclass)

# Distribution across classes with survive
table(data$Pclass, data$Survived)

# Calculate the distribution on Pclass
# Overall passenger distribution on classes.
prop.table(table(data$Pclass))

# Train data passenger distribution on classes.
prop.table(table(train$Pclass))

# Test data passenger distribution on classes.
prop.table(table(test$Pclass))

# Calculate death distribution across classes with Train data
SurviveOverClass <- table(train$Pclass, train$Survived)
# Convert SurviveOverClass into data frame
SoC.data.fram <- data.frame(SurviveOverClass)
# Retrieve death distribution in classes
Death.distribution.on.class <- SoC.data.fram$Freq[SoC.data.fram$Var2==0]
prop.table(Death.distribution.on.class)

# Calculate death rate in train data
# Distribution across classes with survive in Train data
SurviveOverClass <- table(train$Pclass, train$Survived)
# Convert SurviveOverClass into data frame
SoC.data.fram <- data.frame(SurviveOverClass)
# Retrieve death distribution in classes
Death.distribution.on.class <- SoC.data.fram$Freq[SoC.data.fram$Var2==0]
prop.table(Death.distribution.on.class)

# Try summary on data
summary.data.frame(data)

## Exploratory data analysis with graph
# Load up ggplot2 package to use for visualizations
# load it into memory
library(ggplot2)

# High class passenger has more chance of survive than passenger with lower class
# Hypothesis - Rich passengers can but expensive ticket. class=1 is more expensive
# survived at a higher rate
ggplot(train, aes(x = Pclass, fill = factor(Survived))) +
  geom_bar(width = 0.5) +
  xlab("Pclass") +
  ylab("Total Count") +
  labs(fill = "Survived")

# It sort of proved the survive rate with social class
# more people perished in the third class

### Exam Name attribute
# the original name typed as factor, which we really don't want (shows the uniqueness)
# convert Name type
data$Name <- as.character(data$Name)

# Confirm the name has 1037 unique values
length(unique(data$Name))

# Find the two duplicate names
# First used which function to get the duplicate names and store them as a vector dup.names
# check it up ?which.
dup.names <- data[which(duplicated(data$Name)), "Name"]

# Echo out
dup.names

### Exam Sex attribute
# Retrial male and females. then check their numbers.
summary(data$Sex)

#Plot Sex distribution on entire dataset and get general an impression
ggplot(data[1:891,], aes(x = Sex)) +
  geom_bar(fill="steelblue") +
  xlab("Sex") +
  ylab("Total Count")

# plot Survived over Sex on train. use data[1:891,]
ggplot(data[1:891,], aes(x = Sex, fill = Survived)) +
  geom_bar() +
  xlab("Sex") +
  ylab("Total Count") +
  labs(fill = "Survived")

### Examine Age
# Summary over data, train and test.
summary(data$Age)
summary(train$Age)
summary(test$Age)

#It makes sense to change Age type to Factor to see distribution
summary(as.factor(data$Age))

# Plot distribution of age group
ggplot(data, aes(x = Age)) +
  geom_histogram(binwidth = 10, fill="steelblue") +
  xlab("Age") +
  ylab("Total Count")

# Plot Survived on age group on train dataset
ggplot(data[1:891,], aes(x = Age, fill = Survived)) +
  geom_histogram(binwidth = 10) +
  xlab("Age") +
  ylab("Total Count")

### Exam SibSp, Its original type is int
summary((data$SibSp))

# How many possible unique values?
length(unique(data$SibSp))

# Treat it as a factor, so we know the value distribution
data$SibSp <- as.factor(data$SibSp)
summary(data$SibSp)

# Plot entire SibSp distribution among the 7 values
ggplot(data, aes(x = SibSp)) +
  geom_bar() +
  xlab("SibSp") +
  ylab("Total Count")+
  coord_cartesian()

# Plot on the Survived on SibSp
ggplot(data[1:891,], aes(x = SibSp, fill = Survived)) +
  geom_bar() +
  xlab("SibSp") +
  ylab("Total Count")  +
  labs(fill = "Survived")

### Exam Parch
summary(data$Parch)

# How many possible values?
length(unique(data$Parch))

# Treat is as a factor so we know the value distribution
data$Parch <- as.factor(data$Parch)
summary(data$Parch)

# Plot entire Parch distribution among the 8 posibilites
ggplot(data, aes(x = Parch)) +
  geom_bar() +
  xlab("Parch") +
  ylab("Total Count")

# Plot on the Survived on SibSp
ggplot(data[1:891,], aes(x = Parch, fill = Survived)) +
  geom_bar() +
  xlab("Parch") +
  ylab("Total Count")  +
  labs(fill = "Survived")

### Exam  ticket
summary(data$Ticket)
length(unique(data$Ticket))
str(data$Ticket)
which(is.na(data$Ticket))

# Plot it value
ggplot(data[1:891,], aes(x = Ticket)) +
  geom_bar() +
  xlab("Ticket") +
  ylab("Total Count")

# Plot on the survive on Ticket
ggplot(data[1:891,], aes(x = Ticket, fill = Survived)) +
  geom_bar() +
  xlab("Ticket") +
  ylab("Total Count")  +
  labs(fill = "Survived")

### Examine Fare
summary(data$Fare)
length(unique(data$Fare))

# Can't make fare a factor, treat as numeric & visualize with histogram
ggplot(data, aes(x = Fare)) +
  geom_histogram(binwidth = 5) +
  ggtitle("Fare Distribution") +
  xlab("Fare") +
  ylab("Total Count") +
  ylim(0,200)

# Let's check to see if fare has predictive power
ggplot(data[1:891,], aes(x = Fare, fill = Survived)) +
  geom_histogram(binwidth = 5) +
  xlab("fare") +
  ylab("Total Count") +
  ylim(0,50) +
  labs(fill = "Survived")

# Explore Fare distribution between train and test to see if they are overlapped?
ggplot(train, aes(x = Fare)) +
  geom_histogram(binwidth = 5) +
  ggtitle("Fare Distribution") +
  xlab("Fare") +
  ylab("Total Count") +
  ylim(0,200)

ggplot(test, aes(x = Fare)) +
  geom_histogram(binwidth = 5) +
  ggtitle("Fare Distribution") +
  xlab("Fare") +
  ylab("Total Count") +
  ylim(0,200)

### Cabin
# Examine cabin values
str(data$Cabin)
# Cabin really isn't a factor, make a string and the display first 100
data$Cabin <- as.character(data$Cabin)
data$Cabin[1:100]

# Find number of the missing value
table(train[which(train$Cabin ==""), "Cabin"])

# Analysis of the cabin variable
str(data$Cabin)

# Cabin really isn't a factor, make a string and the display first 100
data$Cabin <- as.character(data$Cabin)
data$Cabin[1:100]

# Find out number of the missing value in the train
train$Cabin <- as.character(train$Cabin)
table(train[which(train$Cabin ==""), "Cabin"])

# Replace empty cabins with a "U"
#data[which(data$Cabin == ""), "Cabin"] <- "U"
data$Cabin[1:100]

# Take a look at just the first char as a factor
cabin.first.char <- as.factor(substr(data$Cabin, 1, 1))
str(cabin.first.char)
levels(cabin.first.char)

# Add to combined data set and plot
data$cabin.first.char <- cabin.first.char

# High level plot
ggplot(data[1:891,], aes(x = cabin.first.char, fill = Survived)) +
  geom_bar() +
  ggtitle("Survivability by cabin.first.char") +
  xlab("cabin.first.char") +
  ylab("Total Count") +
  ylim(0,750) +
  labs(fill = "Survived")

### Examine Embark
str(data$Embarked)
summary(data$Embarked)

# Plot Embarked data distribution and the Survived data over it
ggplot(data, aes(x = Embarked)) +
  geom_bar(width=0.5) +
  xlab("Passenger embarked port") +
  ylab("Total Count")

ggplot(data[1:891,], aes(x = Embarked, fill = Survived)) +
  geom_bar(width=0.5) +
   xlab("embarked") +
  ylab("Total Count") +
  labs(fill = "Survived")

##Calculate death RATE distribution over Embarked port with Train data
# We use table in R, you can check with ?table. A good example is
# mytable <- table(A,B) # A will be rows, B will be columns
# mytable # print table

# margin.table(mytable, 1) # A frequencies (summed over B)
# margin.table(mytable, 2) # B frequencies (summed over A)

# prop.table(mytable) # cell percentages
# prop.table(mytable, 1) # row percentages
# prop.table(mytable, 2) # column percentages

# We need prop.table to get column percentage which is the survived

# creat Embarked and Survived contingency table
SurviveOverEmbarkedTable <- table(train$Embarked, train$Survived)
# Death-0/survived-1 value distribution (percentage) based on embarked ports
# prop.table(mytable, 2) give us column (Survived) percentages
Deathandsurvivepercentage <- prop.table(SurviveOverEmbarkedTable, 2)
# Plot
M <- c("c-Cherbourg", "Q-Queenstown", "S-Southampton")
barplot(Deathandsurvivepercentage[2:4,1]*100, xlab =(""), ylim=c(0,100), ylab="Death distribution in percentage %",  names.arg = M, col="steelblue", main="Death distribution", border="black", beside=TRUE)
barplot(Deathandsurvivepercentage[2:4,2]*100, xlab =(""), ylim=c(0,100), ylab="Death distribution in percentage %",  names.arg = M, col="blue", main="Death distribution", border="black", beside=TRUE)

## Calculate survived RATE distribution based on embarked ports
# Death-0/survived-1 value distribution (percentage) based on embarked ports
# prop.table(mytable, 1) give us row (Port) percentages
# col-1 (Survived=0, perished) and col-2 (Survived =1, survived)
DeathandsurviveRateforeachport <- prop.table(SurviveOverEmbarkedTable, 1)
#plot
barplot(Deathandsurvivepercentage[2:4,1]*100, xlab =(""), ylim=c(0,100), ylab="Death rate in percentage %",  names.arg = M, col="red", main="Death rate comparison among mebarked ports", border="black", beside=TRUE)

##END 1 ######################################################