Titanic: Predictive Analysis with Feature Selection

In [82]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [83]:
train = pd.read_csv("Data/Titanic/train.csv")
test = pd.read_csv("Data/Titanic/test.csv")
In [84]:
sex_pivot = train.pivot_table(index = "Sex", values = "Survived")
pclass_pivot = train.pivot_table(index = "Pclass", values = "Survived")
sex_pivot.plot.bar()
pclass_pivot.plot.bar()
Out[84]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1a35e6a0>
In [85]:
train["Age"].describe()
Out[85]:
count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

There are 177 rows with missing ages (891 total, 714 non-null), and ages under 1 are expressed as fractions. Further exploration with graphs below.
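
A quick check of the gap (a minimal sketch against the train frame loaded above):

train["Age"].isnull().sum()  # 891 - 714 = 177 missing ages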

In [86]:
survived = train[train["Survived"] == 1]
died = train[train["Survived"] == 0]
survived["Age"].plot.hist(alpha = 0.5, color = 'red', bins = 50)
died["Age"].plot.hist(alpha = 0.5, color = 'blue', bins = 50)
plt.legend(["Survived", "Died"])
plt.show()
In [87]:
# Age shows a group pattern between age groups, applying pd.cut to categorize them into groups
def process_age(df, cut_points, label_names):
    df["Age"] = df["Age"].fillna(-0.5)   # sentinel that falls into the (-1, 0] "Missing" bin
    df["Age_categories"] = pd.cut(df["Age"], cut_points, labels = label_names)
    return df

cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]
label_names = ["Missing", "Infant", "Child", "Teenager", "Young Adult", "Adult", "Senior"]
train = process_age(train, cut_points, label_names)
test = process_age(test, cut_points, label_names)
agecat_pivot = train.pivot_table(index = "Age_categories", values = "Survived")
agecat_pivot.plot.bar()
plt.show()
In [88]:
cols = ["Sex", "Age_categories", "Pclass"]
for col in cols:
    train[col] = train[col].astype("category")
    test[col] = test[col].astype("category")
dummies = pd.get_dummies(train[cols])
train = pd.concat([train, dummies], axis = 1)
dummies = pd.get_dummies(test[cols])
test = pd.concat([test, dummies], axis = 1)
In [89]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 25 columns):
PassengerId                   891 non-null int64
Survived                      891 non-null int64
Pclass                        891 non-null category
Name                          891 non-null object
Sex                           891 non-null category
Age                           891 non-null float64
SibSp                         891 non-null int64
Parch                         891 non-null int64
Ticket                        891 non-null object
Fare                          891 non-null float64
Cabin                         204 non-null object
Embarked                      889 non-null object
Age_categories                891 non-null category
Sex_female                    891 non-null uint8
Sex_male                      891 non-null uint8
Age_categories_Missing        891 non-null uint8
Age_categories_Infant         891 non-null uint8
Age_categories_Child          891 non-null uint8
Age_categories_Teenager       891 non-null uint8
Age_categories_Young Adult    891 non-null uint8
Age_categories_Adult          891 non-null uint8
Age_categories_Senior         891 non-null uint8
Pclass_1                      891 non-null uint8
Pclass_2                      891 non-null uint8
Pclass_3                      891 non-null uint8
dtypes: category(3), float64(2), int64(4), object(4), uint8(12)
memory usage: 83.3+ KB
In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# select_dtypes returns a DataFrame of the dummy (uint8) columns, not a list of names
features = train.select_dtypes(include = ['uint8'])
target = train["Survived"]
train_features, test_features, train_target, test_target = train_test_split(features, target, test_size = 0.2, random_state = 0)
lr = LogisticRegression()
lr.fit(train_features, train_target)
predictions = lr.predict(test_features)
accuracy = accuracy_score(test_target, predictions)
accuracy
Out[90]:
0.81005586592178769
In [91]:
# use k-fold cross-validation instead of a single hold-out split
from sklearn.model_selection import cross_val_score
import numpy as np
lr = LogisticRegression()
scores = cross_val_score(lr, features, target, cv = 10)
accuracy = np.mean(scores)
print(scores, accuracy)
[ 0.8         0.81111111  0.7752809   0.87640449  0.80898876  0.78651685
  0.76404494  0.76404494  0.83146067  0.80681818] 0.802467086596
In [92]:
cols = ['SibSp','Parch','Fare','Cabin','Embarked']
train[cols].describe(include = 'all', percentiles = [])
Out[92]:
        SibSp       Parch       Fare        Cabin  Embarked
count   891.000000  891.000000  891.000000  204    889
unique  NaN         NaN         NaN         147    3
top     NaN         NaN         NaN         G6     S
freq    NaN         NaN         NaN         4      644
mean    0.523008    0.381594    32.204208   NaN    NaN
std     1.102743    0.806057    49.693429   NaN    NaN
min     0.000000    0.000000    0.000000    NaN    NaN
50%     0.000000    0.000000    14.454200   NaN    NaN
max     8.000000    6.000000    512.329200  NaN    NaN
In [93]:
# Cabin is mostly missing with largely unique values, while Embarked has two missing values and 3 unique values;
# SibSp, Parch, and Fare are continuous variables with no missing values but very different scales
# fill the Embarked missing values with the mode ("S")
train["Embarked"] = train["Embarked"].fillna("S")
train["Embarked"] = train["Embarked"].astype("category")
dummy = pd.get_dummies(train["Embarked"], prefix = "Embarked")
train = pd.concat([train, dummy], axis = 1)
train.head()
Out[93]:
   PassengerId  Survived Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare  ...
0            1         0      3                            Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500  ...
1            2         1      1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833  ...
2            3         1      3                             Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250  ...
3            4         1      1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  ...
4            5         0      3                           Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500  ...

   Age_categories_Teenager  Age_categories_Young Adult  Age_categories_Adult  Age_categories_Senior  Pclass_1  Pclass_2  Pclass_3  Embarked_C  Embarked_Q  Embarked_S
0                        0                           1                     0                      0         0         0         1           0           0           1
1                        0                           0                     1                      0         1         0         0           1           0           0
2                        0                           1                     0                      0         0         0         1           0           0           1
3                        0                           1                     0                      0         1         0         0           0           0           1
4                        0                           1                     0                      0         0         0         1           0           0           1

5 rows × 28 columns

In [94]:
from sklearn.preprocessing import minmax_scale
new_cols = ['SibSp_scaled', 'Parch_scaled', 'Fare_scaled']
# rescale the first three entries of cols (SibSp, Parch, Fare) to the [0, 1] range
for i in range(3):
    train[new_cols[i]] = minmax_scale(train[cols[i]])
print(train.head())
   PassengerId  Survived Pclass  \
0            1         0      3   
1            2         1      1   
2            3         1      3   
3            4         1      1   
4            5         0      3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare     ...      Age_categories_Senior  \
0      0         A/5 21171   7.2500     ...                          0   
1      0          PC 17599  71.2833     ...                          0   
2      0  STON/O2. 3101282   7.9250     ...                          0   
3      0            113803  53.1000     ...                          0   
4      0            373450   8.0500     ...                          0   

  Pclass_1 Pclass_2  Pclass_3  Embarked_C  Embarked_Q  Embarked_S  \
0        0        0         1           0           0           1   
1        1        0         0           1           0           0   
2        0        0         1           0           0           1   
3        1        0         0           0           0           1   
4        0        0         1           0           0           1   

   SibSp_scaled  Parch_scaled  Fare_scaled  
0         0.125           0.0     0.014151  
1         0.125           0.0     0.139136  
2         0.000           0.0     0.015469  
3         0.125           0.0     0.103644  
4         0.000           0.0     0.015713  

[5 rows x 31 columns]
/Applications/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64.
  warnings.warn(msg, DataConversionWarning)
In [95]:
test[cols].describe(include = 'all', percentiles = [])
Out[95]:
        SibSp       Parch       Fare        Cabin            Embarked
count   418.000000  418.000000  417.000000  91               418
unique  NaN         NaN         NaN         76               3
top     NaN         NaN         NaN         B57 B59 B63 B66  S
freq    NaN         NaN         NaN         3                270
mean    0.447368    0.392344    35.627188   NaN              NaN
std     0.896760    0.981429    55.907576   NaN              NaN
min     0.000000    0.000000    0.000000    NaN              NaN
50%     0.000000    0.000000    14.454200   NaN              NaN
max     8.000000    9.000000    512.329200  NaN              NaN
In [96]:
# the test data has one missing value in "Fare"; fill it with the mean and apply the same conversions as on the training set
test["Fare"] = test["Fare"].fillna(test["Fare"].mean())
test["Embarked"] = test["Embarked"].astype("category")
dummy = pd.get_dummies(test["Embarked"], prefix = "Embarked")
test = pd.concat([test, dummy], axis = 1)
for i in range(3):
    test[new_cols[i]] = minmax_scale(test[cols[i]])
/Applications/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64.
  warnings.warn(msg, DataConversionWarning)
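
Note that minmax_scale above is fit on each frame separately, so train and test end up scaled by different min/max ranges. A stricter alternative (a sketch, reusing the cols and new_cols lists defined above) learns the ranges from the training data only:

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# fit the min/max ranges on the training data only, then reuse them for the test data
train[new_cols] = scaler.fit_transform(train[cols[:3]])
test[new_cols] = scaler.transform(test[cols[:3]])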
In [97]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 31 columns):
PassengerId                   891 non-null int64
Survived                      891 non-null int64
Pclass                        891 non-null category
Name                          891 non-null object
Sex                           891 non-null category
Age                           891 non-null float64
SibSp                         891 non-null int64
Parch                         891 non-null int64
Ticket                        891 non-null object
Fare                          891 non-null float64
Cabin                         204 non-null object
Embarked                      891 non-null category
Age_categories                891 non-null category
Sex_female                    891 non-null uint8
Sex_male                      891 non-null uint8
Age_categories_Missing        891 non-null uint8
Age_categories_Infant         891 non-null uint8
Age_categories_Child          891 non-null uint8
Age_categories_Teenager       891 non-null uint8
Age_categories_Young Adult    891 non-null uint8
Age_categories_Adult          891 non-null uint8
Age_categories_Senior         891 non-null uint8
Pclass_1                      891 non-null uint8
Pclass_2                      891 non-null uint8
Pclass_3                      891 non-null uint8
Embarked_C                    891 non-null uint8
Embarked_Q                    891 non-null uint8
Embarked_S                    891 non-null uint8
SibSp_scaled                  891 non-null float64
Parch_scaled                  891 non-null float64
Fare_scaled                   891 non-null float64
dtypes: category(4), float64(5), int64(4), object(3), uint8(15)
memory usage: 100.8+ KB
In [98]:
cols = ['Age_categories_Missing', 'Age_categories_Infant',
       'Age_categories_Child', 'Age_categories_Teenager',
       'Age_categories_Young Adult', 'Age_categories_Adult',
       'Age_categories_Senior', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'SibSp', 'Parch', 'Fare']
In [99]:
lr = LogisticRegression()
lr.fit(train[cols], train["Survived"])
coef = lr.coef_
feature_coef = pd.Series(coef[0], index = cols)
feature_coef.plot.barh()
Out[99]:
<matplotlib.axes._subplots.AxesSubplot at 0x1148d1550>
In [100]:
# plot the coefficients as absolute values, sorted
sort_feature_coef = feature_coef.abs().sort_values()
sort_feature_coef.plot.barh()
plt.show()
In [101]:
# select only the top 8 features by absolute coefficient (using the scaled versions of SibSp and Parch) and fit a model to test the accuracy
columns = ['Age_categories_Infant', 'SibSp_scaled', 'Sex_female', 'Sex_male',
       'Pclass_1', 'Pclass_3', 'Age_categories_Senior', 'Parch_scaled']
lr = LogisticRegression()
scores = cross_val_score(lr, train[columns], train["Survived"], cv = 10)
accuracy = np.mean(scores)
print(accuracy)
0.814801952105
In [104]:
survived = train["Fare"][train["Survived"] == 1]
died = train["Fare"][train["Survived"] == 0]
survived.plot.hist(alpha = 0.5, color = 'red', bins = 20, figsize = (12,8), xlim = (0, 150))
died.plot.hist(alpha = .5, color = 'blue', bins = 20, figsize = (12, 8), xlim = (0, 150))
plt.legend(["Survived", "Died"])
plt.show()
In [105]:
# Judging by the graph, it makes sense to bin the variable "Fare"
def process_fare(df, cut, label):
    # note: with right-closed bins, fares of exactly 0 fall outside (0, 12] and become NaN;
    # passing include_lowest = True to pd.cut would capture them
    df["Fare_categories"] = pd.cut(df["Fare"], cut, labels = label)
    return df
cut = [0, 12, 50, 100, 1000]
label = ["0-12", "12-50", "50-100", "100+"]
train = process_fare(train, cut, label)
test = process_fare(test, cut, label)
dummy = pd.get_dummies(train["Fare_categories"], prefix = "Fare_categories")
train = pd.concat([train, dummy], axis = 1)
dummy = pd.get_dummies(test["Fare_categories"], prefix = "Fare_categories")
test = pd.concat([test, dummy], axis = 1)
In [106]:
# create func to create dummy vars
def create_dummies(df, col):
    dummy = pd.get_dummies(df[col], prefix = col)
    df = pd.concat([df, dummy], axis = 1)
    return df
titles = {
    "Mr" :         "Mr",
    "Mme":         "Mrs",
    "Ms":          "Mrs",
    "Mrs" :        "Mrs",
    "Master" :     "Master",
    "Mlle":        "Miss",
    "Miss" :       "Miss",
    "Capt":        "Officer",
    "Col":         "Officer",
    "Major":       "Officer",
    "Dr":          "Officer",
    "Rev":         "Officer",
    "Jonkheer":    "Royalty",
    "Don":         "Royalty",
    "Sir" :        "Royalty",
    "Countess":    "Royalty",
    "Dona":        "Royalty",
    "Lady" :       "Royalty"
}

extracted_titles = train["Name"].str.extract(r' ([A-Za-z]+)\.', expand = False)
train["Title"] = extracted_titles.map(titles)
ext = test["Name"].str.extract(r' ([A-Za-z]+)\.', expand = False)
test["Title"] = ext.map(titles)
train["Cabin_type"] = train["Cabin"].str[0]
test["Cabin_type"] = test["Cabin"].str[0]

train["Cabin_type"] = train["Cabin_type"].fillna("Unknown")
test["Cabin_type"] = test["Cabin_type"].fillna("Unknown")
cols = ["Title", "Cabin_type"]
dummies = pd.get_dummies(train[cols])
train = pd.concat([train, dummies], axis = 1)
dummies = pd.get_dummies(test[cols])
test = pd.concat([test, dummies], axis = 1)
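
A quick sanity check (a sketch using the Title columns just created): any extracted title missing from the titles dict would map to NaN.

print(train["Title"].isnull().sum(), test["Title"].isnull().sum())  # expect 0 0 if every title was mapped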
In [108]:
# check collinearity
import seaborn as sns
def plot_correlation_heatmap(df):
    corr = df.corr()
    
    sns.set(style="white")
    mask = np.zeros_like(corr, dtype=bool)  # np.bool is deprecated; use the builtin bool
    mask[np.triu_indices_from(mask)] = True

    f, ax = plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)


    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
    plt.show()

columns = ['Age_categories_Missing', 'Age_categories_Infant',
       'Age_categories_Child', 'Age_categories_Teenager',
       'Age_categories_Young Adult', 'Age_categories_Adult',
       'Age_categories_Senior', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'SibSp_scaled', 'Parch_scaled', 'Fare_categories_0-12',
       'Fare_categories_12-50','Fare_categories_50-100', 'Fare_categories_100+',
       'Title_Master', 'Title_Miss', 'Title_Mr','Title_Mrs', 'Title_Officer',
       'Title_Royalty', 'Cabin_type_A','Cabin_type_B', 'Cabin_type_C', 'Cabin_type_D',
       'Cabin_type_E','Cabin_type_F', 'Cabin_type_G', 'Cabin_type_T', 'Cabin_type_Unknown']
plot_correlation_heatmap(train[columns])
In [110]:
# Remove one variable from each dummy set to reduce collinearity within the set; Sex and Title are highly
# correlated, so drop both Sex dummies and keep Title, which is the more nuanced feature
# Removed categories: Pclass_2, Age_categories_Teenager, Fare_categories_12-50, Title_Master, Cabin_type_A, and both Sex dummies
cols = ['Age_categories_Missing', 'Age_categories_Infant',
       'Age_categories_Child', 'Age_categories_Young Adult',
       'Age_categories_Adult', 'Age_categories_Senior', 'Pclass_1', 'Pclass_3',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'SibSp_scaled',
       'Parch_scaled', 'Fare_categories_0-12', 'Fare_categories_50-100',
       'Fare_categories_100+', 'Title_Miss', 'Title_Mr', 'Title_Mrs',
       'Title_Officer', 'Title_Royalty', 'Cabin_type_B', 'Cabin_type_C',
       'Cabin_type_D', 'Cabin_type_E', 'Cabin_type_F', 'Cabin_type_G',
       'Cabin_type_T', 'Cabin_type_Unknown']
from sklearn.feature_selection import RFECV
features = train[cols]
target = train["Survived"]
lr = LogisticRegression()
selector = RFECV(lr, cv = 10)
selector.fit(features, target)
optimized_columns = features.columns[selector.support_]
In [111]:
optimized_columns
Out[111]:
Index(['SibSp_scaled', 'Title_Mr', 'Title_Officer', 'Cabin_type_Unknown'], dtype='object')
In [112]:
features = train[optimized_columns]
lr = LogisticRegression()
scores = cross_val_score(lr, features, target, cv = 10)
accuracy = np.mean(scores)
print(accuracy)
0.822654919986
In [115]:
# try a K-nearest neighbors model (reset the features to the full set to compare base models)
from sklearn.neighbors import KNeighborsClassifier
# base model with logistic Regression
all_X = train[cols]
all_y = train['Survived']

lr = LogisticRegression()
scores = cross_val_score(lr, all_X, all_y, cv=10)
accuracy_lr = scores.mean()
print(accuracy_lr)
0.820495403473
In [116]:
# K-nearest neighbors
knn = KNeighborsClassifier(n_neighbors=1)
scores = cross_val_score(knn, all_X, all_y, cv = 10)
accuracy_knn = scores.mean()
print(accuracy_knn)
0.768985075474
In [118]:
# hyperparameter optimization: test odd values of k from 1 to 49
def plot_dict(dictionary):
    pd.Series(dictionary).plot.bar(figsize=(9,6),
                                   ylim=(0.78,0.83),rot=0)
    plt.show()

knn_scores = dict()

for k in range(1, 50, 2):
    knn = KNeighborsClassifier(n_neighbors = k)
    scores = cross_val_score(knn, all_X, all_y, cv = 10)
    accuracy_knn = scores.mean()
    knn_scores[k] = accuracy_knn
plot_dict(knn_scores)
print(knn_scores)
{1: 0.76898507547383954, 3: 0.80364118715242316, 5: 0.8238795255930087, 7: 0.81037055952786274, 9: 0.8002704006355692, 11: 0.80251759164680503, 13: 0.80259334922256276, 15: 0.80143116558846894, 17: 0.80595108387243219, 19: 0.79804817841334685, 21: 0.79919702644421742, 23: 0.80366643967767559, 25: 0.80702474180002282, 27: 0.80817358983089316, 29: 0.81040829644762236, 31: 0.8115449438202248, 33: 0.81937237543979113, 35: 0.81603932584269656, 37: 0.80824934740665078, 39: 0.80150720690046529, 41: 0.79701254114175457, 43: 0.79362898649415503, 45: 0.79477811826126421, 47: 0.78692515038020661, 49: 0.79028345250255361}
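
The scores peak at k = 5 rather than at the larger odd values. A one-liner to pull the best k out of the dictionary (a sketch using the knn_scores dict above):

best_k = max(knn_scores, key = knn_scores.get)
print(best_k, knn_scores[best_k])  # 5 0.8238795255930087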
In [119]:
# Choose the best params with scikit-learn's GridSearchCV
from sklearn.model_selection import GridSearchCV

hyperparameters = {
    "n_neighbors": range(1,20,2),
    "weights": ["distance", "uniform"],
    "algorithm": ['brute'],
    "p": [1,2]
}
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid = hyperparameters, cv = 10)
grid.fit(all_X, all_y)
best_params = grid.best_params_
best_score = grid.best_score_
print(best_params, best_score)
{'algorithm': 'brute', 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'} 0.82379349046
In [120]:
# Random Forest Model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 1)
scores = cross_val_score(rf, all_X, all_y, cv = 10)
accuracy_rf = scores.mean()
print(accuracy_rf)
0.823866473726
In [121]:
# Also test out different params with gridsearch
rf = RandomForestClassifier(random_state = 1)
hyperparameters = {
    "criterion": ["entropy", "gini"],
    "max_depth": [5, 10],
    "max_features": ["log2", "sqrt"],
    "min_samples_leaf": [1, 5],
    "min_samples_split": [3, 5],
    "n_estimators": [6, 9]
}
grid = GridSearchCV(rf, param_grid = hyperparameters, cv = 10)
grid.fit(all_X, all_y)
best_params = grid.best_params_
best_score = grid.best_score_
print(best_params, best_score)
{'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 6} 0.832772166105
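
With the tuned random forest as the strongest model so far, a minimal sketch of producing a Kaggle submission (the zero-filling of dummy columns absent from the test set and the output file name are assumptions, not part of the original run):

best_rf = grid.best_estimator_  # GridSearchCV refits the best model on all of the training data
# the test set has no deck-T cabin, so make sure every training dummy column exists before predicting
for col in cols:
    if col not in test.columns:
        test[col] = 0
holdout_predictions = best_rf.predict(test[cols])
submission = pd.DataFrame({"PassengerId": test["PassengerId"],
                           "Survived": holdout_predictions})
submission.to_csv("submission.csv", index = False)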
