In [47]:
import pandas as pd
loan = pd.read_csv("Data/lending club loan/LoanStats3a.csv", skiprows = 1)
In [48]:
loan.head()
loan.shape
Out[48]:
This dataset contains complete loan data for all loans issued through 2007-2011, including the current loan status (Current, Late, Fully Paid, etc.) and the latest payment information. Data source: the Lending Club data dictionary.
In [49]:
# keep only columns where at least half of the values are present
half_count = len(loan) // 2  # integer division: dropna's thresh expects an int
loan = loan.dropna(thresh = half_count, axis = 1)
loan = loan.drop(["desc"], axis = 1)  # free-text loan description, not usable as-is
loan.to_csv('loan2007.csv', index = False)
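As a toy illustration (not part of the pipeline) of how thresh works: dropna(thresh = k, axis = 1) keeps only columns with at least k non-null values.
import numpy as np
toy = pd.DataFrame({"a": [1, 2, 3, 4], "b": [np.nan, np.nan, np.nan, 4]})
# "a" has 4 non-null values (kept); "b" has only 1 (dropped)
print(toy.dropna(thresh = len(toy) // 2, axis = 1).columns.tolist())  # ['a']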
In [50]:
sorted(loan.columns)
Out[50]:
In [51]:
# Major reasons to remove columns:
'''
- leak information from the future (after the loan has already been funded)
- don't affect a borrower's ability to pay back a loan (e.g. a randomly generated ID value by Lending Club)
- are formatted poorly and need to be cleaned up
- require more data or a lot of processing to turn into a useful feature
- contain redundant information
'''
cols_to_remove = ["funded_amnt", "funded_amnt_inv", "grade", "sub_grade", "emp_title", "issue_d"]
loan = loan.drop(cols_to_remove, axis = 1)
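If these cells may be rerun out of order, a defensive variant (a sketch; same result on the first run) skips any column that is already gone instead of raising a KeyError:
# rerun-safe drop: errors = "ignore" silently skips missing columns
loan = loan.drop(columns = cols_to_remove, errors = "ignore")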
In [52]:
removecols2 = ["zip_code", "out_prncp", "out_prncp_inv", "total_pymnt", "total_pymnt_inv", "total_rec_prncp"]
loan = loan.drop(removecols2, axis = 1)
In [53]:
removecols3 = ["total_rec_int", "total_rec_late_fee", "recoveries", "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt"]
loan = loan.drop(removecols3, axis = 1)
loan.shape
Out[53]:
In [54]:
loan.drop(["hardship_flag", "disbursement_method", "debt_settlement_flag"], axis = 1, inplace = True)
In [55]:
loan.head()
Out[55]:
In [56]:
print(loan["loan_status"].value_counts())
In [57]:
mapping_dict = {
"loan_status":{
"Fully Paid": 1,
"Charged Off": 0,
"Does not meet the credit policy. Status:Fully Paid":1,
"Does not meet the credit policy. Status:Charged Off":0
}
}
loan = loan.replace(mapping_dict)
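For reference, the same single-column recoding can be written with Series.map (a sketch, shown commented out since the replace above has already run); one behavioral difference is worth noting.
# Alternative to the replace call above (do not run in addition to it):
# status_map = mapping_dict["loan_status"]
# loan["loan_status"] = loan["loan_status"].map(status_map)
# Caveat: map sends any status missing from the dict to NaN, while replace
# leaves unlisted values untouched.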
In [58]:
print(loan["loan_status"].value_counts())
In [59]:
# remove columns with only one unique non-null value, as they carry no signal
drop_columns = []
for col in loan.columns:
    col_series = loan[col].dropna().unique()
    if len(col_series) == 1:
        drop_columns.append(col)
loan = loan.drop(drop_columns, axis=1)
print(drop_columns)
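The same columns can be found without a loop (a sketch, shown for reference; the columns were already dropped above): nunique counts the distinct non-null values per column.
# equivalent vectorized form of the loop above
single_valued = loan.columns[loan.nunique(dropna = True) == 1].tolist()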
In [61]:
loan.isnull().sum()
Out[61]:
In [64]:
loan = loan.drop("pub_rec_bankruptcies", axis = 1)
In [65]:
# drop the remaining rows with missing values, then reset the index so that
# later element-wise comparisons with the prediction Series align correctly
loan = loan.dropna(axis = 0).reset_index(drop = True)
In [73]:
loan.dtypes.value_counts()
Out[73]:
In [74]:
obj_cols = loan.select_dtypes(include = ["object"])
obj_cols.head()
Out[74]:
In [76]:
# explore the columns that appear to be categorical
cols = ["home_ownership", "verification_status", "emp_length", "term", "addr_state", "purpose", "title"]
for col in cols:
    print(loan[col].value_counts())
In [77]:
# remove addr_state (would add far too many dummy variables), title (low data quality,
# overlaps with purpose), and the date columns last_credit_pull_d and earliest_cr_line
# (would require extra processing to become useful features)
loan = loan.drop(["last_credit_pull_d", "addr_state", "title", "earliest_cr_line"], axis = 1)
In [79]:
# convert int_rate and revol_util from percent strings to float
int_rate = loan["int_rate"].str.strip('%')
int_rate = int_rate.astype('float')
revol_util = loan["revol_util"].str.strip('%')
revol_util = revol_util.astype('float')
In [81]:
loan["int_rate"] = int_rate
loan["revol_util"] = revol_util
In [82]:
# convert emp_length to a numeric scale; "n/a" and "< 1 year" both map to 0
mapping_dict = {
"emp_length":{
"10+ years": 10,
"9 years": 9,
"8 years": 8,
"7 years": 7,
"6 years": 6,
"5 years": 5,
"4 years": 4,
"3 years": 3,
"2 years": 2,
"1 year": 1,
"< 1 year": 0,
"n/a": 0
}
}
loan = loan.replace(mapping_dict)
In [84]:
# convert the remaining categorical columns to the category dtype before dummy encoding
cols = ["home_ownership", "verification_status", "purpose", "term"]
for col in cols:
    loan[col] = loan[col].astype("category")
In [86]:
dummy_df = pd.get_dummies(loan[cols])
loan = pd.concat([loan, dummy_df],axis = 1)
loan = loan.drop(cols, axis = 1)
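For reference, the three steps above collapse into a single call (a sketch; it would replace the cell above rather than run after it), since get_dummies with columns= encodes the listed columns and drops the originals in one pass:
# loan = pd.get_dummies(loan, columns = cols)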
In [90]:
loan.info()
In [102]:
features = loan.columns.tolist()
features.remove("loan_status")
target = loan["loan_status"]
In [107]:
# fit a logistic regression model first
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, KFold  # sklearn.cross_validation was removed
lr = LogisticRegression()
kf = KFold(n_splits = 3, shuffle = True, random_state = 1)
predictions = cross_val_predict(lr, loan[features], target, cv = kf)
predictions = pd.Series(predictions)
# False positives.
fp_filter = (predictions == 1) & (loan["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives.
tp_filter = (predictions == 1) & (loan["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives.
fn_filter = (predictions == 0) & (loan["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives
tn_filter = (predictions == 0) & (loan["loan_status"] == 0)
tn = len(predictions[tn_filter])
# Rates
tpr = tp / (tp + fn) #recall
fpr = fp / (fp + tn) #fall-out
print(tpr, fpr)
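As a cross-check (a sketch), sklearn's confusion_matrix produces the same four counts in one call; for binary 0/1 labels, ravel() unpacks them in (tn, fp, fn, tp) order.
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(loan["loan_status"], predictions).ravel()
print(tp / (tp + fn), fp / (fp + tn))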
In [108]:
# Both tpr and fpr are high because the classes are heavily imbalanced (1s far outnumber 0s),
# so the model can predict 1 almost everywhere. Setting class_weight to "balanced" counteracts this.
lr = LogisticRegression(class_weight = "balanced")
kf = KFold(n_splits = 3, shuffle = True, random_state = 1)
predictions = cross_val_predict(lr, loan[features], target, cv = kf)
predictions = pd.Series(predictions)
# False positives.
fp_filter = (predictions == 1) & (loan["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives.
tp_filter = (predictions == 1) & (loan["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives.
fn_filter = (predictions == 0) & (loan["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives
tn_filter = (predictions == 0) & (loan["loan_status"] == 0)
tn = len(predictions[tn_filter])
# Rates
tpr = tp / (tp + fn) #recall
fpr = fp / (fp + tn) #fall-out
print(tpr, fpr)
In this project we are particularly concerned with lowering the false positive rate, since a false positive means funding a loan that is never fully paid back, so we apply a harsher penalty for misclassifying the negative class.
In [109]:
penalty = {0: 10, 1: 1}
lr = LogisticRegression(class_weight = penalty)
kf = KFold(n_splits = 3, shuffle = True, random_state = 1)
predictions = cross_val_predict(lr, loan[features], target, cv = kf)
predictions = pd.Series(predictions)
# False positives.
fp_filter = (predictions == 1) & (loan["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives.
tp_filter = (predictions == 1) & (loan["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives.
fn_filter = (predictions == 0) & (loan["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives
tn_filter = (predictions == 0) & (loan["loan_status"] == 0)
tn = len(predictions[tn_filter])
# Rates
tpr = tp / (tp + fn) #recall
fpr = fp / (fp + tn) #fall-out
print(tpr, fpr)
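Rather than committing to a single weight, a small sweep (a sketch; the weights tried here are illustrative) makes the tpr/fpr trade-off visible:
for w in [2, 5, 10, 20]:
    lr = LogisticRegression(class_weight = {0: w, 1: 1})
    preds = pd.Series(cross_val_predict(lr, loan[features], target, cv = kf))
    tp = ((preds == 1) & (target == 1)).sum()
    fp = ((preds == 1) & (target == 0)).sum()
    fn = ((preds == 0) & (target == 1)).sum()
    tn = ((preds == 0) & (target == 0)).sum()
    print(w, tp / (tp + fn), fp / (fp + tn))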
In [117]:
# try a random forest next; it can capture nonlinear relationships between the features and the target
from sklearn.ensemble import RandomForestClassifier
penalty = {0: 20, 1: 1}
rfc = RandomForestClassifier(class_weight = penalty, random_state = 1, max_depth = 15)
kf = KFold(n_splits = 3, shuffle = True, random_state = 1)
predictions = cross_val_predict(rfc, loan[features], target, cv = kf)
predictions = pd.Series(predictions)
# False positives.
fp_filter = (predictions == 1) & (loan["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives.
tp_filter = (predictions == 1) & (loan["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives.
fn_filter = (predictions == 0) & (loan["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives
tn_filter = (predictions == 0) & (loan["loan_status"] == 0)
tn = len(predictions[tn_filter])
# Rates
tpr = tp / (tp + fn) #recall
fpr = fp / (fp + tn) #fall-out
print(tpr, fpr)
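A natural follow-up (a sketch, illustrative rather than a validated ranking): fit the forest once on the full data and inspect which features it leans on most.
rfc.fit(loan[features], target)
importances = pd.Series(rfc.feature_importances_, index = features)
print(importances.sort_values(ascending = False).head(10))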