Titanic: Predictive Analysis with Feature Selection

In [82]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [83]:
train = pd.read_csv("Data/Titanic/train.csv")
test = pd.read_csv("Data/Titanic/test.csv")
In [84]:
sex_pivot = train.pivot_table(index = "Sex", values = "Survived")
pclass_pivot = train.pivot_table(index = "Pclass", values = "Survived")
sex_pivot.plot.bar()
pclass_pivot.plot.bar()
Out[84]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1a35e6a0>
In [85]:
train["Age"].describe()
Out[85]:
count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

There are 177 rows with missing ages (891 total, 714 non-null), and ages under 1 are expressed as fractions. Further exploration with graphs below.
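
A quick check of the gap (a minimal sketch against the train frame loaded above):

train["Age"].isnull().sum()  # 891 - 714 = 177 missing ages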

In [86]:
survived = train[train["Survived"] == 1]
died = train[train["Survived"] == 0]
survived["Age"].plot.hist(alpha = 0.5, color = 'red', bins = 50)
died["Age"].plot.hist(alpha = 0.5, color = 'blue', bins = 50)
plt.legend(["Survived", "Died"])
plt.show()
In [87]:
# Age shows a group pattern between age groups, applying pd.cut to categorize them into groups
def process_age(df, cut_points, label_names):
    df["Age"] = df["Age"].fillna(-0.5)   # sentinel that falls into the (-1, 0] "Missing" bin
    df["Age_categories"] = pd.cut(df["Age"], cut_points, labels = label_names)
    return df

cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]
label_names = ["Missing", "Infant", "Child", "Teenager", "Young Adult", "Adult", "Senior"]
train = process_age(train, cut_points, label_names)
test = process_age(test, cut_points, label_names)
agecat_pivot = train.pivot_table(index = "Age_categories", values = "Survived")
agecat_pivot.plot.bar()
plt.show()
In [88]:
cols = ["Sex", "Age_categories", "Pclass"]
for col in cols:
    train[col] = train[col].astype("category")
    test[col] = test[col].astype("category")
dummies = pd.get_dummies(train[cols])
train = pd.concat([train, dummies], axis = 1)
dummies = pd.get_dummies(test[cols])
test = pd.concat([test, dummies], axis = 1)
In [89]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 25 columns):
PassengerId                   891 non-null int64
Survived                      891 non-null int64
Pclass                        891 non-null category
Name                          891 non-null object
Sex                           891 non-null category
Age                           891 non-null float64
SibSp                         891 non-null int64
Parch                         891 non-null int64
Ticket                        891 non-null object
Fare                          891 non-null float64
Cabin                         204 non-null object
Embarked                      889 non-null object
Age_categories                891 non-null category
Sex_female                    891 non-null uint8
Sex_male                      891 non-null uint8
Age_categories_Missing        891 non-null uint8
Age_categories_Infant         891 non-null uint8
Age_categories_Child          891 non-null uint8
Age_categories_Teenager       891 non-null uint8
Age_categories_Young Adult    891 non-null uint8
Age_categories_Adult          891 non-null uint8
Age_categories_Senior         891 non-null uint8
Pclass_1                      891 non-null uint8
Pclass_2                      891 non-null uint8
Pclass_3                      891 non-null uint8
dtypes: category(3), float64(2), int64(4), object(4), uint8(12)
memory usage: 83.3+ KB
In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# select_dtypes returns a DataFrame of the dummy (uint8) columns, not a list of names
features = train.select_dtypes(include = ['uint8'])
target = train["Survived"]
train_features, test_features, train_target, test_target = train_test_split(features, target, test_size = 0.2, random_state = 0)
lr = LogisticRegression()
lr.fit(train_features, train_target)
predictions = lr.predict(test_features)
accuracy = accuracy_score(test_target, predictions)
accuracy
Out[90]:
0.81005586592178769
In [91]:
# use k-fold cross-validation instead of a single hold-out split
from sklearn.model_selection import cross_val_score
import numpy as np
lr = LogisticRegression()
scores = cross_val_score(lr, features, target, cv = 10)
accuracy = np.mean(scores)
print(scores, accuracy)
[ 0.8         0.81111111  0.7752809   0.87640449  0.80898876  0.78651685
  0.76404494  0.76404494  0.83146067  0.80681818] 0.802467086596
In [92]:
cols = ['SibSp','Parch','Fare','Cabin','Embarked']
train[cols].describe(include = 'all', percentiles = [])
Out[92]:
        SibSp       Parch       Fare        Cabin  Embarked
count   891.000000  891.000000  891.000000  204    889
unique  NaN         NaN         NaN         147    3
top     NaN         NaN         NaN         G6     S
freq    NaN         NaN         NaN         4      644
mean    0.523008    0.381594    32.204208   NaN    NaN
std     1.102743    0.806057    49.693429   NaN    NaN
min     0.000000    0.000000    0.000000    NaN    NaN
50%     0.000000    0.000000    14.454200   NaN    NaN
max     8.000000    6.000000    512.329200  NaN    NaN
In [93]:
# Cabin is mostly missing with largely unique values, while Embarked has two missing values and 3 unique values;
# SibSp, Parch, and Fare are continuous variables with no missing values but very different scales
# fill the Embarked missing values with the mode ("S")
train["Embarked"] = train["Embarked"].fillna("S")
train["Embarked"] = train["Embarked"].astype("category")
dummy = pd.get_dummies(train["Embarked"], prefix = "Embarked")
train = pd.concat([train, dummy], axis = 1)
train.head()
Out[93]:
   PassengerId  Survived Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare  ...
0            1         0      3                            Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500  ...
1            2         1      1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833  ...
2            3         1      3                             Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250  ...
3            4         1      1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  ...
4            5         0      3                           Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500  ...

   Age_categories_Teenager  Age_categories_Young Adult  Age_categories_Adult  Age_categories_Senior  Pclass_1  Pclass_2  Pclass_3  Embarked_C  Embarked_Q  Embarked_S
0                        0                           1                     0                      0         0         0         1           0           0           1
1                        0                           0                     1                      0         1         0         0           1           0           0
2                        0                           1                     0                      0         0         0         1           0           0           1
3                        0                           1                     0                      0         1         0         0           0           0           1
4                        0                           1                     0                      0         0         0         1           0           0           1

5 rows × 28 columns

In [94]:
from sklearn.preprocessing import minmax_scale
new_cols = ['SibSp_scaled', 'Parch_scaled', 'Fare_scaled']
# rescale the first three entries of cols (SibSp, Parch, Fare) to the [0, 1] range
for i in range(3):
    train[new_cols[i]] = minmax_scale(train[cols[i]])
print(train.head())
   PassengerId  Survived Pclass  \
0            1         0      3   
1            2         1      1   
2            3         1      3   
3            4         1      1   
4            5         0      3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare     ...      Age_categories_Senior  \
0      0         A/5 21171   7.2500     ...                          0   
1      0          PC 17599  71.2833     ...                          0   
2      0  STON/O2. 3101282   7.9250     ...                          0   
3      0            113803  53.1000     ...                          0   
4      0            373450   8.0500     ...                          0   

  Pclass_1 Pclass_2  Pclass_3  Embarked_C  Embarked_Q  Embarked_S  \
0        0        0         1           0           0           1   
1        1        0         0           1           0           0   
2        0        0         1           0           0           1   
3        1        0         0           0           0           1   
4        0        0         1           0           0           1   

   SibSp_scaled  Parch_scaled  Fare_scaled  
0         0.125           0.0     0.014151  
1         0.125           0.0     0.139136  
2         0.000           0.0     0.015469  
3         0.125           0.0     0.103644  
4         0.000           0.0     0.015713  

[5 rows x 31 columns]
/Applications/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64.
  warnings.warn(msg, DataConversionWarning)
In [95]:
test[cols].describe(include = 'all', percentiles = [])
Out[95]:
        SibSp       Parch       Fare        Cabin            Embarked
count   418.000000  418.000000  417.000000  91               418
unique  NaN         NaN         NaN         76               3
top     NaN         NaN         NaN         B57 B59 B63 B66  S
freq    NaN         NaN         NaN         3                270
mean    0.447368    0.392344    35.627188   NaN              NaN
std     0.896760    0.981429    55.907576   NaN              NaN
min     0.000000    0.000000    0.000000    NaN              NaN
50%     0.000000    0.000000    14.454200   NaN              NaN
max     8.000000    9.000000    512.329200  NaN              NaN
In [96]:
# the test data has one missing value in "Fare"; fill it with the mean and apply the same conversions as on the training set
test["Fare"] = test["Fare"].fillna(test["Fare"].mean())
test["Embarked"] = test["Embarked"].astype("category")
dummy = pd.get_dummies(test["Embarked"], prefix = "Embarked")
test = pd.concat([test, dummy], axis = 1)
for i in range(3):
    test[new_cols[i]] = minmax_scale(test[cols[i]])
/Applications/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64.
  warnings.warn(msg, DataConversionWarning)
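
Note that minmax_scale above is fit on each frame separately, so train and test end up scaled by different min/max ranges. A stricter alternative (a sketch, reusing the cols and new_cols lists defined above) learns the ranges from the training data only:

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# fit the min/max ranges on the training data only, then reuse them for the test data
train[new_cols] = scaler.fit_transform(train[cols[:3]])
test[new_cols] = scaler.transform(test[cols[:3]])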
In [97]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 31 columns):
PassengerId                   891 non-null int64
Survived                      891 non-null int64
Pclass                        891 non-null category
Name                          891 non-null object
Sex                           891 non-null category
Age                           891 non-null float64
SibSp                         891 non-null int64
Parch                         891 non-null int64
Ticket                        891 non-null object
Fare                          891 non-null float64
Cabin                         204 non-null object
Embarked                      891 non-null category
Age_categories                891 non-null category
Sex_female                    891 non-null uint8
Sex_male                      891 non-null uint8
Age_categories_Missing        891 non-null uint8
Age_categories_Infant         891 non-null uint8
Age_categories_Child          891 non-null uint8
Age_categories_Teenager       891 non-null uint8
Age_categories_Young Adult    891 non-null uint8
Age_categories_Adult          891 non-null uint8
Age_categories_Senior         891 non-null uint8
Pclass_1                      891 non-null uint8
Pclass_2                      891 non-null uint8
Pclass_3                      891 non-null uint8
Embarked_C                    891 non-null uint8
Embarked_Q                    891 non-null uint8
Embarked_S                    891 non-null uint8
SibSp_scaled                  891 non-null float64
Parch_scaled                  891 non-null float64
Fare_scaled                   891 non-null float64
dtypes: category(4), float64(5), int64(4), object(3), uint8(15)
memory usage: 100.8+ KB
In [98]:
cols = ['Age_categories_Missing', 'Age_categories_Infant',
       'Age_categories_Child', 'Age_categories_Teenager',
       'Age_categories_Young Adult', 'Age_categories_Adult',
       'Age_categories_Senior', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'SibSp', 'Parch', 'Fare']
In [99]:
lr = LogisticRegression()
lr.fit(train[cols], train["Survived"])
coef = lr.coef_
feature_coef = pd.Series(coef[0], index = cols)
feature_coef.plot.barh()
Out[99]:
<matplotlib.axes._subplots.AxesSubplot at 0x1148d1550>
In [100]:
# plot the coefficients as absolute values, sorted
sort_feature_coef = feature_coef.abs().sort_values()
sort_feature_coef.plot.barh()
plt.show()
In [101]:
# select only the top 8 features by absolute coefficient (using the scaled versions of SibSp and Parch) and fit a model to test the accuracy
columns = ['Age_categories_Infant', 'SibSp_scaled', 'Sex_female', 'Sex_male',
       'Pclass_1', 'Pclass_3', 'Age_categories_Senior', 'Parch_scaled']
lr = LogisticRegression()
scores = cross_val_score(lr, train[columns], train["Survived"], cv = 10)
accuracy = np.mean(scores)
print(accuracy)
0.814801952105
In [104]:
survived = train["Fare"][train["Survived"] == 1]
died = train["Fare"][train["Survived"] == 0]
survived.plot.hist(alpha = 0.5, color = 'red', bins = 20, figsize = (12,8), xlim = (0, 150))
died.plot.hist(alpha = .5, color = 'blue', bins = 20, figsize = (12, 8), xlim = (0, 150))
plt.legend(["Survived", "Died"])
plt.show()
In [105]:
# Judging by the graph, it makes sense to bin the variable "Fare"
def process_fare(df, cut, label):
    # note: with right-closed bins, fares of exactly 0 fall outside (0, 12] and become NaN;
    # passing include_lowest = True to pd.cut would capture them
    df["Fare_categories"] = pd.cut(df["Fare"], cut, labels = label)
    return df
cut = [0, 12, 50, 100, 1000]
label = ["0-12", "12-50", "50-100", "100+"]
train = process_fare(train, cut, label)
test = process_fare(test, cut, label)
dummy = pd.get_dummies(train["Fare_categories"], prefix = "Fare_categories")
train = pd.concat([train, dummy], axis = 1)
dummy = pd.get_dummies(test["Fare_categories"], prefix = "Fare_categories")
test = pd.concat([test, dummy], axis = 1)
In [106]:
# create func to create dummy vars
def create_dummies(df, col):
    dummy = pd.get_dummies(df[col], prefix = col)
    df = pd.concat([df, dummy], axis = 1)
    return df
titles = {
    "Mr" :         "Mr",
    "Mme":         "Mrs",
    "Ms":          "Mrs",
    "Mrs" :        "Mrs",
    "Master" :     "Master",
    "Mlle":        "Miss",
    "Miss" :       "Miss",
    "Capt":        "Officer",
    "Col":         "Officer",
    "Major":       "Officer",
    "Dr":          "Officer",
    "Rev":         "Officer",
    "Jonkheer":    "Royalty",
    "Don":         "Royalty",
    "Sir" :        "Royalty",
    "Countess":    "Royalty",
    "Dona":        "Royalty",
    "Lady" :       "Royalty"
}

extracted_titles = train["Name"].str.extract(r' ([A-Za-z]+)\.', expand = False)
train["Title"] = extracted_titles.map(titles)
ext = test["Name"].str.extract(r' ([A-Za-z]+)\.', expand = False)
test["Title"] = ext.map(titles)
train["Cabin_type"] = train["Cabin"].str[0]
test["Cabin_type"] = test["Cabin"].str[0]

train["Cabin_type"] = train["Cabin_type"].fillna("Unknown")
test["Cabin_type"] = test["Cabin_type"].fillna("Unknown")
cols = ["Title", "Cabin_type"]
dummies = pd.get_dummies(train[cols])
train = pd.concat([train, dummies], axis = 1)
dummies = pd.get_dummies(test[cols])
test = pd.concat([test, dummies], axis = 1)
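
A quick sanity check (a sketch using the Title columns just created): any extracted title missing from the titles dict would map to NaN.

print(train["Title"].isnull().sum(), test["Title"].isnull().sum())  # expect 0 0 if every title was mapped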
In [108]:
# check collinearity
import seaborn as sns
def plot_correlation_heatmap(df):
    corr = df.corr()
    
    sns.set(style="white")
    mask = np.zeros_like(corr, dtype=bool)  # np.bool is deprecated; use the builtin bool
    mask[np.triu_indices_from(mask)] = True

    f, ax = plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)


    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
    plt.show()

columns = ['Age_categories_Missing', 'Age_categories_Infant',
       'Age_categories_Child', 'Age_categories_Teenager',
       'Age_categories_Young Adult', 'Age_categories_Adult',
       'Age_categories_Senior', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'SibSp_scaled', 'Parch_scaled', 'Fare_categories_0-12',
       'Fare_categories_12-50','Fare_categories_50-100', 'Fare_categories_100+',
       'Title_Master', 'Title_Miss', 'Title_Mr','Title_Mrs', 'Title_Officer',
       'Title_Royalty', 'Cabin_type_A','Cabin_type_B', 'Cabin_type_C', 'Cabin_type_D',
       'Cabin_type_E','Cabin_type_F', 'Cabin_type_G', 'Cabin_type_T', 'Cabin_type_Unknown']
plot_correlation_heatmap(train[columns])
In [110]:
# Remove one variable from each dummy set to reduce collinearity within the set; Sex and Title are highly
# correlated, so drop both Sex dummies and keep Title, which is the more nuanced feature
# Removed categories: Pclass_2, Age_categories_Teenager, Fare_categories_12-50, Title_Master, Cabin_type_A, and both Sex dummies
cols = ['Age_categories_Missing', 'Age_categories_Infant',
       'Age_categories_Child', 'Age_categories_Young Adult',
       'Age_categories_Adult', 'Age_categories_Senior', 'Pclass_1', 'Pclass_3',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'SibSp_scaled',
       'Parch_scaled', 'Fare_categories_0-12', 'Fare_categories_50-100',
       'Fare_categories_100+', 'Title_Miss', 'Title_Mr', 'Title_Mrs',
       'Title_Officer', 'Title_Royalty', 'Cabin_type_B', 'Cabin_type_C',
       'Cabin_type_D', 'Cabin_type_E', 'Cabin_type_F', 'Cabin_type_G',
       'Cabin_type_T', 'Cabin_type_Unknown']
from sklearn.feature_selection import RFECV
features = train[cols]
target = train["Survived"]
lr = LogisticRegression()
selector = RFECV(lr, cv = 10)
selector.fit(features, target)
optimized_columns = features.columns[selector.support_]
In [111]:
optimized_columns
Out[111]:
Index(['SibSp_scaled', 'Title_Mr', 'Title_Officer', 'Cabin_type_Unknown'], dtype='object')
In [112]:
features = train[optimized_columns]
lr = LogisticRegression()
scores = cross_val_score(lr, features, target, cv = 10)
accuracy = np.mean(scores)
print(accuracy)
0.822654919986
In [115]:
# try a K-nearest neighbors model (reset the features to the full set to compare base models)
from sklearn.neighbors import KNeighborsClassifier
# base model with logistic Regression
all_X = train[cols]
all_y = train['Survived']

lr = LogisticRegression()
scores = cross_val_score(lr, all_X, all_y, cv=10)
accuracy_lr = scores.mean()
print(accuracy_lr)
0.820495403473
In [116]:
# K-nearest neighbors
knn = KNeighborsClassifier(n_neighbors=1)
scores = cross_val_score(knn, all_X, all_y, cv = 10)
accuracy_knn = scores.mean()
print(accuracy_knn)
0.768985075474
In [118]:
# hyperparameter optimization: test odd values of k from 1 to 49
def plot_dict(dictionary):
    pd.Series(dictionary).plot.bar(figsize=(9,6),
                                   ylim=(0.78,0.83),rot=0)
    plt.show()

knn_scores = dict()

for k in range(1, 50, 2):
    knn = KNeighborsClassifier(n_neighbors = k)
    scores = cross_val_score(knn, all_X, all_y, cv = 10)
    accuracy_knn = scores.mean()
    knn_scores[k] = accuracy_knn
plot_dict(knn_scores)
print(knn_scores)
{1: 0.76898507547383954, 3: 0.80364118715242316, 5: 0.8238795255930087, 7: 0.81037055952786274, 9: 0.8002704006355692, 11: 0.80251759164680503, 13: 0.80259334922256276, 15: 0.80143116558846894, 17: 0.80595108387243219, 19: 0.79804817841334685, 21: 0.79919702644421742, 23: 0.80366643967767559, 25: 0.80702474180002282, 27: 0.80817358983089316, 29: 0.81040829644762236, 31: 0.8115449438202248, 33: 0.81937237543979113, 35: 0.81603932584269656, 37: 0.80824934740665078, 39: 0.80150720690046529, 41: 0.79701254114175457, 43: 0.79362898649415503, 45: 0.79477811826126421, 47: 0.78692515038020661, 49: 0.79028345250255361}
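
The scores peak at k = 5 rather than at the larger odd values. A one-liner to pull the best k out of the dictionary (a sketch using the knn_scores dict above):

best_k = max(knn_scores, key = knn_scores.get)
print(best_k, knn_scores[best_k])  # 5 0.8238795255930087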
In [119]:
# Choose the best params with scikit-learn's GridSearchCV
from sklearn.model_selection import GridSearchCV

hyperparameters = {
    "n_neighbors": range(1,20,2),
    "weights": ["distance", "uniform"],
    "algorithm": ['brute'],
    "p": [1,2]
}
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid = hyperparameters, cv = 10)
grid.fit(all_X, all_y)
best_params = grid.best_params_
best_score = grid.best_score_
print(best_params, best_score)
{'algorithm': 'brute', 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'} 0.82379349046
In [120]:
# Random Forest Model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 1)
scores = cross_val_score(rf, all_X, all_y, cv = 10)
accuracy_rf = scores.mean()
print(accuracy_rf)
0.823866473726
In [121]:
# Also test out different params with gridsearch
rf = RandomForestClassifier(random_state = 1)
hyperparameters = {
    "criterion": ["entropy", "gini"],
    "max_depth": [5, 10],
    "max_features": ["log2", "sqrt"],
    "min_samples_leaf": [1, 5],
    "min_samples_split": [3, 5],
    "n_estimators": [6, 9]
}
grid = GridSearchCV(rf, param_grid = hyperparameters, cv = 10)
grid.fit(all_X, all_y)
best_params = grid.best_params_
best_score = grid.best_score_
print(best_params, best_score)
{'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 6} 0.832772166105
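
With the tuned random forest as the strongest model so far, a minimal sketch of producing a Kaggle submission (the zero-filling of dummy columns absent from the test set and the output file name are assumptions, not part of the original run):

best_rf = grid.best_estimator_  # GridSearchCV refits the best model on all of the training data
# the test set has no deck-T cabin, so make sure every training dummy column exists before predicting
for col in cols:
    if col not in test.columns:
        test[col] = 0
holdout_predictions = best_rf.predict(test[cols])
submission = pd.DataFrame({"PassengerId": test["PassengerId"],
                           "Survived": holdout_predictions})
submission.to_csv("submission.csv", index = False)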
