In [82]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [83]:
train = pd.read_csv("Data/Titanic/train.csv")
test = pd.read_csv("Data/Titanic/test.csv")
In [84]:
sex_pivot = train.pivot_table(index = "Sex", values = "Survived")
pclass_pivot = train.pivot_table(index = "Pclass", values = "Survived")
sex_pivot.plot.bar()
pclass_pivot.plot.bar()
Out[84]:
In [85]:
train["Age"].describe()
Out[85]:
Quite a few rows are missing Age, and ages below 1 are expressed as fractions; further exploration with graphs below.
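A quick sketch (added here, not an original cell) to verify both points:
print(train["Age"].isnull().sum())         # count of rows with a missing age
print(train.loc[train["Age"] < 1, "Age"])  # infants whose age is stored as a fraction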
In [86]:
survived = train[train["Survived"] == 1]
died = train[train["Survived"] == 0]
survived["Age"].plot.hist(alpha = 0.5, color = 'red', bins = 50)
died["Age"].plot.hist(alpha = 0.5, color = 'blue', bins = 50)
plt.legend(["Survived", "Died"])
plt.show()
In [87]:
# Survival varies markedly across age ranges, so apply pd.cut to bin Age into categories
def process_age(df, cut_points, label_names):
    df["Age"] = df["Age"].fillna(-0.5)
    df["Age_categories"] = pd.cut(df["Age"], cut_points, labels=label_names)
    return df
cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]
label_names = ["Missing", "Infant", "Child", "Teenager", "Young Adult", "Adult", "Senior"]
train = process_age(train, cut_points, label_names)
test = process_age(test, cut_points, label_names)
agecat_pivot = train.pivot_table(index = "Age_categories", values = "Survived")
agecat_pivot.plot.bar()
plt.show()
In [88]:
cols = ["Sex", "Age_categories", "Pclass"]
for col in cols:
    train[col] = train[col].astype("category")
    test[col] = test[col].astype("category")
dummies = pd.get_dummies(train[cols])
train = pd.concat([train, dummies], axis = 1)
dummies = pd.get_dummies(test[cols])
test = pd.concat([test, dummies], axis = 1)
In [89]:
train.info()
In [90]:
# select the uint8 dummy columns created by get_dummies as the feature matrix
cols = train.select_dtypes(include=['uint8'])
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
features = cols
target = train["Survived"]
train_features, test_features, train_target, test_target = train_test_split(features, target, test_size = 0.2, random_state = 0)
lr = LogisticRegression()
lr.fit(train_features, train_target)
predictions = lr.predict(test_features)
accuracy = accuracy_score(test_target, predictions)
accuracy
Out[90]:
In [91]:
# use k-fold cross-validation rather than a single hold-out split
from sklearn.model_selection import cross_val_score
import numpy as np
lr = LogisticRegression()
scores = cross_val_score(lr, features, target, cv = 10)
accuracy = np.mean(scores)
print(scores, accuracy)
In [92]:
cols = ['SibSp','Parch','Fare','Cabin','Embarked']
train[cols].describe(include = 'all', percentiles = [])
Out[92]:
In [93]:
# Cabin is mostly missing and nearly unique, while Embarked has a few missing values and 3 unique values;
# SibSp, Parch and Fare are numeric with no missing values but on very different scales
# fill the missing Embarked values with the mode ("S")
train["Embarked"] = train["Embarked"].fillna("S")
train["Embarked"] = train["Embarked"].astype("category")
dummy = pd.get_dummies(train["Embarked"], prefix = "Embarked")
train = pd.concat([train, dummy], axis = 1)
train.head()
Out[93]:
In [94]:
from sklearn.preprocessing import minmax_scale
new_cols = ['SibSp_scaled','Parch_scaled','Fare_scaled']
for i in range(3):
    train[new_cols[i]] = minmax_scale(train[cols[i]])
print(train.head())
In [95]:
test[cols].describe(include = 'all', percentiles = [])
Out[95]:
In [96]:
# the test data has one missing value in "Fare"; fill it with the mean and apply the same conversions as the training set
test["Fare"] = test["Fare"].fillna(test["Fare"].mean())
test["Embarked"] = test["Embarked"].astype("category")
dummy = pd.get_dummies(test["Embarked"], prefix = "Embarked")
test = pd.concat([test, dummy], axis = 1)
for i in range(3):
    test[new_cols[i]] = minmax_scale(test[cols[i]])
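One caveat (my note, not in the original cells): minmax_scale is fit separately on the train and test frames, so the same raw value can map to different scaled values in the two sets. A hedged sketch of the usual alternative, fitting a single MinMaxScaler on the training data and reusing it on the test data:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train[new_cols] = scaler.fit_transform(train[['SibSp', 'Parch', 'Fare']])
test[new_cols] = scaler.transform(test[['SibSp', 'Parch', 'Fare']])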
In [97]:
train.info()
In [98]:
cols = ['Age_categories_Missing', 'Age_categories_Infant',
'Age_categories_Child', 'Age_categories_Teenager',
'Age_categories_Young Adult', 'Age_categories_Adult',
'Age_categories_Senior', 'Pclass_1', 'Pclass_2', 'Pclass_3',
'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
'SibSp', 'Parch', 'Fare']
In [99]:
lr = LogisticRegression()
lr.fit(train[cols], train["Survived"])
coef = lr.coef_
feature_coef = pd.Series(coef[0], index = cols)
feature_coef.plot.barh()
Out[99]:
In [100]:
# show a bar plot of the absolute coefficient values, sorted
sort_feature_coef = feature_coef.abs().sort_values()
sort_feature_coef.plot.barh()
plt.show()
In [101]:
# select only the top 8 features and fit a model to check the accuracy
columns = ['Age_categories_Infant', 'SibSp_scaled', 'Sex_female', 'Sex_male',
'Pclass_1', 'Pclass_3', 'Age_categories_Senior', 'Parch_scaled']
lr = LogisticRegression()
scores = cross_val_score(lr, train[columns], train["Survived"], cv = 10)
accuracy = np.mean(scores)
print(accuracy)
In [104]:
survived = train["Fare"][train["Survived"] == 1]
died = train["Fare"][train["Survived"] == 0]
survived.plot.hist(alpha = 0.5, color = 'red', bins = 20, figsize = (12,8), xlim = (0, 150))
died.plot.hist(alpha = .5, color = 'blue', bins = 20, figsize = (12, 8), xlim = (0, 150))
plt.legend(["Survived", "Died"])
plt.show()
In [105]:
# Judging by the graph, it makes sense to bin the "Fare" variable
def process_fare(df, cut, label):
    df["Fare_categories"] = pd.cut(df["Fare"], cut, labels=label)
    return df
cut = [0, 12, 50, 100, 1000]
label = ["0-12", "12-50", "50-100", "100+"]
train = process_fare(train, cut, label)
test = process_fare(test, cut, label)
dummy = pd.get_dummies(train["Fare_categories"], prefix = "Fare_categories")
train = pd.concat([train, dummy], axis = 1)
dummy = pd.get_dummies(test["Fare_categories"], prefix = "Fare_categories")
test = pd.concat([test, dummy], axis = 1)
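A possible caveat (my addition): pd.cut uses right-closed intervals by default, so a fare of exactly 0 falls outside (0, 12] and ends up with a NaN category and all-zero dummies. A hedged sketch of a check plus a hypothetical variant, process_fare_v2, that closes the first bin on the left; it would need to be applied before the dummy step above:
print((train["Fare"] == 0).sum())  # number of zero fares left uncategorized by the cut above
def process_fare_v2(df, cut, label):
    # include_lowest=True makes the first interval [0, 12] instead of (0, 12]
    df["Fare_categories"] = pd.cut(df["Fare"], cut, labels=label, include_lowest=True)
    return df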
In [106]:
# helper function to create dummy variables
def create_dummies(df, col):
    dummy = pd.get_dummies(df[col], prefix=col)
    df = pd.concat([df, dummy], axis=1)
    return df
titles = {
    "Mr": "Mr",
    "Mme": "Mrs",
    "Ms": "Mrs",
    "Mrs": "Mrs",
    "Master": "Master",
    "Mlle": "Miss",
    "Miss": "Miss",
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Dr": "Officer",
    "Rev": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Countess": "Royalty",
    "Dona": "Royalty",
    "Lady": "Royalty"
}
extracted_titles = train["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
train["Title"] = extracted_titles.map(titles)
ext = test["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
test["Title"] = ext.map(titles)
train["Cabin_type"] = train["Cabin"].str[0]
test["Cabin_type"] = test["Cabin"].str[0]
train["Cabin_type"] = train["Cabin_type"].fillna("Unknown")
test["Cabin_type"] = test["Cabin_type"].fillna("Unknown")
cols = ["Title", "Cabin_type"]
dummies = pd.get_dummies(train[cols])
train = pd.concat([train, dummies], axis = 1)
dummies = pd.get_dummies(test[cols])
test = pd.concat([test, dummies], axis = 1)
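A caveat worth flagging (my addition): get_dummies only creates columns for the categories present in each frame, so train and test can end up with different dummy columns (for instance a cabin letter that appears in only one of them). A small hedged sketch that adds any training-only dummy columns to the test frame, filled with 0:
missing_cols = set(train.columns) - set(test.columns) - {"Survived"}
for c in missing_cols:
    test[c] = 0  # category never observed in the test set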
In [108]:
# check collinearity
import seaborn as sns
def plot_correlation_heatmap(df):
    corr = df.corr()
    sns.set(style="white")
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    f, ax = plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5})
    plt.show()
columns = ['Age_categories_Missing', 'Age_categories_Infant',
'Age_categories_Child', 'Age_categories_Teenager',
'Age_categories_Young Adult', 'Age_categories_Adult',
'Age_categories_Senior', 'Pclass_1', 'Pclass_2', 'Pclass_3',
'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
'SibSp_scaled', 'Parch_scaled', 'Fare_categories_0-12',
'Fare_categories_12-50','Fare_categories_50-100', 'Fare_categories_100+',
'Title_Master', 'Title_Miss', 'Title_Mr','Title_Mrs', 'Title_Officer',
'Title_Royalty', 'Cabin_type_A','Cabin_type_B', 'Cabin_type_C', 'Cabin_type_D',
'Cabin_type_E','Cabin_type_F', 'Cabin_type_G', 'Cabin_type_T', 'Cabin_type_Unknown']
plot_correlation_heatmap(train[columns])
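To complement the heatmap, a small added sketch (not an original cell) that lists the strongest pairwise correlations numerically:
corr = train[columns].corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
print(upper.stack().sort_values(ascending=False).head(10))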
In [110]:
# Drop one column from each dummy-variable set to reduce collinearity within the set; Sex and Title are highly correlated,
# so drop both Sex columns and keep Title, which captures sex with more nuance
# Removed columns: Pclass_2, Age_categories_Teenager, Fare_categories_12-50, Title_Master, Cabin_type_A, Sex_female, Sex_male
cols = ['Age_categories_Missing', 'Age_categories_Infant',
'Age_categories_Child', 'Age_categories_Young Adult',
'Age_categories_Adult', 'Age_categories_Senior', 'Pclass_1', 'Pclass_3',
'Embarked_C', 'Embarked_Q', 'Embarked_S', 'SibSp_scaled',
'Parch_scaled', 'Fare_categories_0-12', 'Fare_categories_50-100',
'Fare_categories_100+', 'Title_Miss', 'Title_Mr', 'Title_Mrs',
'Title_Officer', 'Title_Royalty', 'Cabin_type_B', 'Cabin_type_C',
'Cabin_type_D', 'Cabin_type_E', 'Cabin_type_F', 'Cabin_type_G',
'Cabin_type_T', 'Cabin_type_Unknown']
from sklearn.feature_selection import RFECV
features = train[cols]
target = train["Survived"]
lr = LogisticRegression()
selector = RFECV(lr, cv = 10)
selector.fit(features, target)
optimized_columns = features.columns[selector.support_]
In [111]:
optimized_columns
Out[111]:
In [112]:
features = train[optimized_columns]
lr = LogisticRegression()
scores = cross_val_score(lr, features, target, cv = 10)
accuracy = np.mean(scores)
print(accuracy)
In [115]:
# try a K-nearest neighbors model (reset the features to the full engineered set to compare base models)
from sklearn.neighbors import KNeighborsClassifier
# baseline model: logistic regression
all_X = train[cols]
all_y = train['Survived']
lr = LogisticRegression()
scores = cross_val_score(lr, all_X, all_y, cv=10)
accuracy_lr = scores.mean()
print(accuracy_lr)
In [116]:
# K-nearest neighbors with k = 1
knn = KNeighborsClassifier(n_neighbors=1)
scores = cross_val_score(knn, all_X, all_y, cv = 10)
accuracy_knn = scores.mean()
print(accuracy_knn)
In [118]:
# hyperparameter optimization: test odd values of k from 1 to 49
def plot_dict(dictionary):
    pd.Series(dictionary).plot.bar(figsize=(9, 6), ylim=(0.78, 0.83), rot=0)
    plt.show()
knn_scores = dict()
for k in range(1, 50, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, all_X, all_y, cv=10)
    accuracy_knn = scores.mean()
    knn_scores[k] = accuracy_knn
plot_dict(knn_scores)
print(knn_scores)
In [119]:
# Choose the best parameters with scikit-learn's GridSearchCV
from sklearn.model_selection import GridSearchCV
hyperparameters = {
    "n_neighbors": range(1, 20, 2),
    "weights": ["distance", "uniform"],
    "algorithm": ['brute'],
    "p": [1, 2]
}
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid = hyperparameters, cv = 10)
grid.fit(all_X, all_y)
best_params = grid.best_params_
best_score = grid.best_score_
print(best_params, best_score)
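The fitted grid also exposes the winning model directly; a short added sketch of reusing it (assuming the grid above has been fit):
best_knn = grid.best_estimator_
scores = cross_val_score(best_knn, all_X, all_y, cv=10)
print(scores.mean())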
In [120]:
# Random Forest Model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 1)
scores = cross_val_score(rf, all_X, all_y, cv = 10)
accuracy_rf = scores.mean()
print(accuracy_rf)
In [121]:
# Also search over different parameters with GridSearchCV
rf = RandomForestClassifier(random_state = 1)
hyperparameters = {
    "criterion": ["entropy", "gini"],
    "max_depth": [5, 10],
    "max_features": ["log2", "sqrt"],
    "min_samples_leaf": [1, 5],
    "min_samples_split": [3, 5],
    "n_estimators": [6, 9]
}
grid = GridSearchCV(rf, param_grid = hyperparameters, cv = 10)
grid.fit(all_X, all_y)
best_params = grid.best_params_
best_score = grid.best_score_
print(best_params, best_score)
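As a possible next step (my addition, not one of the original cells), the best random forest can predict on the prepared test frame; this assumes the dummy-column alignment sketched earlier so that test contains every column in cols:
best_rf = grid.best_estimator_            # GridSearchCV refits the best model on all of the training data
test_predictions = best_rf.predict(test[cols])
print(test_predictions[:10])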