Loan Club Data Predictive Analysis with data cleansing

In [47]:
import pandas as pd
loan = pd.read_csv("Data/lending club loan/LoanStats3a.csv", skiprows = 1)
/Applications/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (0,47) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
In [48]:
loan.head()
loan.shape
Out[48]:
(42538, 145)

this dataset contains complete loan data for all loans issued through 2007-2011, including the current loan status (Current, Late, Fully Paid, etc.) and latest payment information. data source-lending club data dictionary

In [49]:
half_count = len(loan) / 2
loan = loan.dropna(thresh = half_count, axis = 1)
loan = loan.drop(["desc"], axis = 1)
loan.to_csv('loan2007.csv', index = False)
In [50]:
sorted(loan.columns)
Out[50]:
['acc_now_delinq',
 'addr_state',
 'annual_inc',
 'application_type',
 'chargeoff_within_12_mths',
 'collection_recovery_fee',
 'collections_12_mths_ex_med',
 'debt_settlement_flag',
 'delinq_2yrs',
 'delinq_amnt',
 'disbursement_method',
 'dti',
 'earliest_cr_line',
 'emp_length',
 'emp_title',
 'funded_amnt',
 'funded_amnt_inv',
 'grade',
 'hardship_flag',
 'home_ownership',
 'initial_list_status',
 'inq_last_6mths',
 'installment',
 'int_rate',
 'issue_d',
 'last_credit_pull_d',
 'last_pymnt_amnt',
 'last_pymnt_d',
 'loan_amnt',
 'loan_status',
 'open_acc',
 'out_prncp',
 'out_prncp_inv',
 'policy_code',
 'pub_rec',
 'pub_rec_bankruptcies',
 'purpose',
 'pymnt_plan',
 'recoveries',
 'revol_bal',
 'revol_util',
 'sub_grade',
 'tax_liens',
 'term',
 'title',
 'total_acc',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_int',
 'total_rec_late_fee',
 'total_rec_prncp',
 'verification_status',
 'zip_code']
In [51]:
#Major reasons to remove columns:
'''
-leak information from the future (after the loan has already been funded)
-don't affect a borrower's ability to pay back a loan (e.g. a randomly generated ID value by Lending Club)
-formatted poorly and need to be cleaned up
-require more data or a lot of processing to turn into a useful feature
-contain redundant information
'''
cols_to_remove = ["funded_amnt", "funded_amnt_inv", "grade", "sub_grade", "emp_title", "issue_d"]
loan = loan.drop(cols_to_remove, axis = 1)
In [52]:
removecols2 = ["zip_code", "out_prncp", "out_prncp_inv", "total_pymnt", "total_pymnt_inv", "total_rec_prncp"]
loan = loan.drop(removecols2, axis = 1)
In [53]:
removecols3 = ["total_rec_int", "total_rec_late_fee", "recoveries", "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt"]
loan = loan.drop(removecols3, axis = 1)
loan.shape
Out[53]:
(42538, 35)
In [54]:
loan.drop(["hardship_flag", "disbursement_method", "debt_settlement_flag"], axis = 1, inplace = True)
In [55]:
loan.head()
Out[55]:
loan_amnt term int_rate installment emp_length home_ownership annual_inc verification_status loan_status pymnt_plan ... initial_list_status last_credit_pull_d collections_12_mths_ex_med policy_code application_type acc_now_delinq chargeoff_within_12_mths delinq_amnt pub_rec_bankruptcies tax_liens
0 5000.0 36 months 10.65% 162.87 10+ years RENT 24000.0 Verified Fully Paid n ... f Dec-2017 0.0 1.0 Individual 0.0 0.0 0.0 0.0 0.0
1 2500.0 60 months 15.27% 59.83 < 1 year RENT 30000.0 Source Verified Charged Off n ... f Oct-2016 0.0 1.0 Individual 0.0 0.0 0.0 0.0 0.0
2 2400.0 36 months 15.96% 84.33 10+ years RENT 12252.0 Not Verified Fully Paid n ... f Jun-2017 0.0 1.0 Individual 0.0 0.0 0.0 0.0 0.0
3 10000.0 36 months 13.49% 339.31 10+ years RENT 49200.0 Source Verified Fully Paid n ... f Apr-2016 0.0 1.0 Individual 0.0 0.0 0.0 0.0 0.0
4 3000.0 60 months 12.69% 67.79 1 year RENT 80000.0 Source Verified Fully Paid n ... f Jan-2017 0.0 1.0 Individual 0.0 0.0 0.0 0.0 0.0

5 rows × 32 columns

In [56]:
print(loan["loan_status"].value_counts())
Fully Paid                                             34116
Charged Off                                             5670
Does not meet the credit policy. Status:Fully Paid      1988
Does not meet the credit policy. Status:Charged Off      761
Name: loan_status, dtype: int64
In [57]:
mapping_dict = {
    "loan_status":{
        "Fully Paid": 1,
        "Charged Off": 0,
        "Does not meet the credit policy. Status:Fully Paid":1,
        "Does not meet the credit policy. Status:Charged Off":0
    }
}
loan = loan.replace(mapping_dict)
In [58]:
print(loan["loan_status"].value_counts())
1.0    36104
0.0     6431
Name: loan_status, dtype: int64
In [59]:
#remove cols with only one unique value
drop_columns = []
for col in loan.columns:
    col_series = loan[col].dropna().unique()
    if len(col_series) == 1:
        drop_columns.append(col)
loan = loan.drop(drop_columns, axis=1)
print(drop_columns)
['pymnt_plan', 'initial_list_status', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'chargeoff_within_12_mths']
In [61]:
loan.isnull().sum()
Out[61]:
loan_amnt                  3
term                       3
int_rate                   3
installment                3
emp_length                 3
home_ownership             3
annual_inc                 7
verification_status        3
loan_status                3
purpose                    3
title                     15
addr_state                 3
dti                        3
delinq_2yrs               32
earliest_cr_line          32
inq_last_6mths            32
open_acc                  32
pub_rec                   32
revol_bal                  3
revol_util                93
total_acc                 32
last_credit_pull_d         7
acc_now_delinq            32
delinq_amnt               32
pub_rec_bankruptcies    1368
tax_liens                108
dtype: int64
In [64]:
loan = loan.drop("pub_rec_bankruptcies", axis = 1)
In [65]:
loan = loan.dropna(axis = 0)
In [73]:
loan.dtypes.value_counts
Out[73]:
<bound method IndexOpsMixin.value_counts of loan_amnt              float64
term                    object
int_rate                object
installment            float64
emp_length              object
home_ownership          object
annual_inc             float64
verification_status     object
loan_status            float64
purpose                 object
title                   object
addr_state              object
dti                    float64
delinq_2yrs            float64
earliest_cr_line        object
inq_last_6mths         float64
open_acc               float64
pub_rec                float64
revol_bal              float64
revol_util              object
total_acc              float64
last_credit_pull_d      object
acc_now_delinq         float64
delinq_amnt            float64
tax_liens              float64
dtype: object>
In [74]:
obj_cols = loan.select_dtypes(include = ["object"])
obj_cols.head()
Out[74]:
term int_rate emp_length home_ownership verification_status purpose title addr_state earliest_cr_line revol_util last_credit_pull_d
0 36 months 10.65% 10+ years RENT Verified credit_card Computer AZ Jan-1985 83.7% Dec-2017
1 60 months 15.27% < 1 year RENT Source Verified car bike GA Apr-1999 9.4% Oct-2016
2 36 months 15.96% 10+ years RENT Not Verified small_business real estate business IL Nov-2001 98.5% Jun-2017
3 36 months 13.49% 10+ years RENT Source Verified other personel CA Feb-1996 21% Apr-2016
4 60 months 12.69% 1 year RENT Source Verified other Personal OR Jan-1996 53.9% Jan-2017
In [76]:
#explore the cols appear to be categorical
cols = ["home_ownership", "verification_status", "emp_length", "term", "addr_state", "purpose", "title"]
for col in cols:
    print(loan[col].value_counts())
RENT        20087
MORTGAGE    18902
OWN          3231
OTHER         134
NONE            2
Name: home_ownership, dtype: int64
Not Verified       18608
Verified           13458
Source Verified    10290
Name: verification_status, dtype: int64
10+ years    9355
< 1 year     5003
2 years      4727
3 years      4354
4 years      3633
1 year       3567
5 years      3449
6 years      2368
7 years      1868
8 years      1587
9 years      1337
n/a          1108
Name: emp_length, dtype: int64
 36 months    31363
 60 months    10993
Name: term, dtype: int64
CA    7418
NY    4044
FL    3085
TX    2906
NJ    1978
IL    1670
PA    1645
GA    1495
VA    1484
MA    1417
OH    1323
MD    1121
AZ     928
WA     887
CO     852
NC     824
CT     810
MI     795
MO     760
MN     649
NV     526
WI     509
SC     489
AL     483
OR     467
LA     460
KY     358
OK     316
KS     296
UT     278
AR     260
DC     222
RI     208
NM     205
NH     188
WV     187
HI     180
DE     135
MT      96
WY      86
AK      86
SD      66
VT      57
TN      32
MS      26
IN      15
IA      12
NE      11
ID       9
ME       2
Name: addr_state, dtype: int64
debt_consolidation    19736
credit_card            5450
other                  4371
home_improvement       3187
major_purchase         2304
small_business         1984
car                    1610
wedding                1003
medical                 751
moving                  624
house                   424
educational             409
vacation                397
renewable_energy        106
Name: purpose, dtype: int64
Debt Consolidation                    2255
Debt Consolidation Loan               1759
Personal Loan                          706
Consolidation                          545
debt consolidation                     532
Home Improvement                       372
Credit Card Consolidation              370
Debt consolidation                     345
Small Business Loan                    333
Personal                               330
Credit Card Loan                       323
personal loan                          264
Consolidation Loan                     263
Home Improvement Loan                  255
personal                               246
Loan                                   218
Wedding Loan                           217
consolidation                          217
Car Loan                               206
Other Loan                             192
Credit Card Payoff                     163
Wedding                                160
Credit Card Refinance                  147
Major Purchase Loan                    146
Consolidate                            141
Medical                                124
Credit Card                            122
home improvement                       117
My Loan                                 95
Credit Cards                            94
                                      ... 
Time To Get The Finances In Order!       1
FinalCCPayment                           1
Post-College Debt Consolidation          1
paybackcards                             1
Three Long Years                         1
High Interest CC Payoff                  1
My GREEN PROJECT IN HOME                 1
In Credit Card Recovery!!                1
Personal Consid                          1
Furniture Purchase                       1
pay off high interest debt               1
Furniture/Improvements                   1
consolidate debt and fund hobby          1
Loan to remodel my kitchen               1
Future                                   1
my credit loan                           1
business loan                            1
Bmw                                      1
Lower rates                              1
Consolodating Student Loans              1
Bridge to new contract                   1
Short Term Financing Needed              1
Paying too much interest                 1
My conslidation loan                     1
Life-Saver Loan                          1
Consolidate my credit cards              1
consolidation!!                          1
Move to Michigan                         1
JL Vacation                              1
Wheels                                   1
Name: title, Length: 21160, dtype: int64
In [77]:
# remove addr_state (as it would add too many dummy variables), last_credit_pull_d, earliest_cr_line, title(low data quality and overlap with purpose)
loan = loan.drop(["last_credit_pull_d", "addr_state", "title", "earliest_cr_line"], axis = 1)
In [79]:
#convert int_rate and revol_util to float
int_rate = loan["int_rate"].str.strip('%')
int_rate = int_rate.astype('float')
revol_util = loan["revol_util"].str.strip('%')
revol_util = revol_util.astype('float')
In [81]:
loan["int_rate"] = int_rate
loan["revol_util"] = revol_util
In [82]:
#convert emp_length to numerical
mapping_dict = {
    "emp_length":{
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0
    }
}
loan = loan.replace(mapping_dict)
In [84]:
# convert the remaining categorical cols as dummy cols
cols = ["home_ownership", "verification_status", "purpose", "term"]
for col in cols:
    loan[col] = loan[col].astype("category")
In [86]:
dummy_df = pd.get_dummies(loan[cols])
loan = pd.concat([loan, dummy_df],axis = 1)
loan = loan.drop(cols, axis = 1)
In [90]:
loan.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 42356 entries, 0 to 42478
Data columns (total 41 columns):
loan_amnt                              42356 non-null float64
int_rate                               42356 non-null float64
installment                            42356 non-null float64
emp_length                             42356 non-null int64
annual_inc                             42356 non-null float64
loan_status                            42356 non-null float64
dti                                    42356 non-null float64
delinq_2yrs                            42356 non-null float64
inq_last_6mths                         42356 non-null float64
open_acc                               42356 non-null float64
pub_rec                                42356 non-null float64
revol_bal                              42356 non-null float64
revol_util                             42356 non-null float64
total_acc                              42356 non-null float64
acc_now_delinq                         42356 non-null float64
delinq_amnt                            42356 non-null float64
tax_liens                              42356 non-null float64
home_ownership_MORTGAGE                42356 non-null uint8
home_ownership_NONE                    42356 non-null uint8
home_ownership_OTHER                   42356 non-null uint8
home_ownership_OWN                     42356 non-null uint8
home_ownership_RENT                    42356 non-null uint8
verification_status_Not Verified       42356 non-null uint8
verification_status_Source Verified    42356 non-null uint8
verification_status_Verified           42356 non-null uint8
purpose_car                            42356 non-null uint8
purpose_credit_card                    42356 non-null uint8
purpose_debt_consolidation             42356 non-null uint8
purpose_educational                    42356 non-null uint8
purpose_home_improvement               42356 non-null uint8
purpose_house                          42356 non-null uint8
purpose_major_purchase                 42356 non-null uint8
purpose_medical                        42356 non-null uint8
purpose_moving                         42356 non-null uint8
purpose_other                          42356 non-null uint8
purpose_renewable_energy               42356 non-null uint8
purpose_small_business                 42356 non-null uint8
purpose_vacation                       42356 non-null uint8
purpose_wedding                        42356 non-null uint8
term_ 36 months                        42356 non-null uint8
term_ 60 months                        42356 non-null uint8
dtypes: float64(16), int64(1), uint8(24)
memory usage: 6.8 MB
In [102]:
features = loan.columns
features = features.tolist()
features.remove("loan_status")
target = loan["loan_status"]
In [107]:
# fit with logistic regression model first
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_predict, KFold
lr = LogisticRegression()
kf = KFold(loan[features].shape[0], random_state = 1)
predictions = cross_val_predict(lr, loan[features], target, cv = kf)
predictions = pd.Series(predictions)
# False positives.
fp_filter = (predictions == 1) & (loan["loan_status"] == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (loan["loan_status"] == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (loan["loan_status"] == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (loan["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / (tp + fn) #recall
fpr = fp / (fp + tn) #fall-out

print(tpr, fpr)
0.9993034077293879 0.9985837922895358
In [108]:
# Both tpr and fpr are high as the data is highly category imbalanced. (1s are a lot more than 0s), setting the class_weight parameter to balanced will help to solve the problem
lr = LogisticRegression(class_weight = "balanced")
kf = KFold(loan[features].shape[0], random_state = 1)
predictions = cross_val_predict(lr, loan[features], target, cv = kf)
predictions = pd.Series(predictions)
# False positives.
fp_filter = (predictions == 1) & (loan["loan_status"] == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (loan["loan_status"] == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (loan["loan_status"] == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (loan["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / (tp + fn) #recall
fpr = fp / (fp + tn) #fall-out

print(tpr, fpr)
0.6338989662570704 0.5741935483870968

In this proj, we are particularly concerned about lowering the false positive rate, so we'd like to apply a harsher penalty for misclassifying the negative class

In [109]:
penalty = {0: 10, 1: 1}
lr = LogisticRegression(class_weight = penalty)
kf = KFold(loan[features].shape[0], random_state = 1)
predictions = cross_val_predict(lr, loan[features], target, cv = kf)
predictions = pd.Series(predictions)
# False positives.
fp_filter = (predictions == 1) & (loan["loan_status"] == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (loan["loan_status"] == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (loan["loan_status"] == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (loan["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / (tp + fn) #recall
fpr = fp / (fp + tn) #fall-out

print(tpr, fpr)
0.18181058262977515 0.16443745082612116
In [117]:
# try fitting a random forest, which is expected to perform better due to the nonlinear relationship between many variables and the target
from sklearn.ensemble import RandomForestClassifier
penalty = {0: 20, 1: 1}
rfc = RandomForestClassifier(class_weight = penalty, random_state = 1, max_depth = 15)
kf = KFold(loan[features].shape[0], random_state = 1)
predictions = cross_val_predict(rfc, loan[features], target, cv = kf)
predictions = pd.Series(predictions)
# False positives.
fp_filter = (predictions == 1) & (loan["loan_status"] == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (loan["loan_status"] == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (loan["loan_status"] == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (loan["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / (tp + fn) #recall
fpr = fp / (fp + tn) #fall-out

print(tpr, fpr)
0.5706483880854858 0.53705743509048

social