In [47]:
import pandas as pd
loan = pd.read_csv("Data/lending club loan/LoanStats3a.csv", skiprows = 1)
In [48]:
loan.head()
loan.shape
Out[48]:
This dataset contains complete loan data for all loans issued through 2007-2011, including the current loan status (Current, Late, Fully Paid, etc.) and the latest payment information. Data source: the Lending Club data dictionary.
In [49]:
# keep only columns where at least half of the values are present
half_count = len(loan) // 2  # integer division: dropna's thresh expects an int
loan = loan.dropna(thresh = half_count, axis = 1)
loan = loan.drop(["desc"], axis = 1)  # free-text loan description, not usable as-is
loan.to_csv('loan2007.csv', index = False)
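As a toy illustration (not part of the pipeline) of how thresh works: dropna(thresh = k, axis = 1) keeps only columns with at least k non-null values.
import numpy as np
toy = pd.DataFrame({"a": [1, 2, 3, 4], "b": [np.nan, np.nan, np.nan, 4]})
# "a" has 4 non-null values (kept); "b" has only 1 (dropped)
print(toy.dropna(thresh = len(toy) // 2, axis = 1).columns.tolist())  # ['a']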
In [50]:
sorted(loan.columns)
Out[50]:
In [51]:
# Major reasons to remove columns:
'''
- leak information from the future (after the loan has already been funded)
- don't affect a borrower's ability to pay back a loan (e.g. a randomly generated ID value by Lending Club)
- are formatted poorly and need to be cleaned up
- require more data or a lot of processing to turn into a useful feature
- contain redundant information
'''
cols_to_remove = ["funded_amnt", "funded_amnt_inv", "grade", "sub_grade", "emp_title", "issue_d"]
loan = loan.drop(cols_to_remove, axis = 1)
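If these cells may be rerun out of order, a defensive variant (a sketch; same result on the first run) skips any column that is already gone instead of raising a KeyError:
# rerun-safe drop: errors = "ignore" silently skips missing columns
loan = loan.drop(columns = cols_to_remove, errors = "ignore")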
In [52]:
removecols2 = ["zip_code", "out_prncp", "out_prncp_inv", "total_pymnt", "total_pymnt_inv", "total_rec_prncp"]
loan = loan.drop(removecols2, axis = 1)
In [53]:
removecols3 = ["total_rec_int", "total_rec_late_fee", "recoveries", "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt"]
loan = loan.drop(removecols3, axis = 1)
loan.shape
Out[53]:
In [54]:
loan.drop(["hardship_flag", "disbursement_method", "debt_settlement_flag"], axis = 1, inplace = True)
In [55]:
loan.head()
Out[55]:
In [56]:
print(loan["loan_status"].value_counts())
In [57]:
mapping_dict = {
"loan_status":{
"Fully Paid": 1,
"Charged Off": 0,
"Does not meet the credit policy. Status:Fully Paid":1,
"Does not meet the credit policy. Status:Charged Off":0
}
}
loan = loan.replace(mapping_dict)
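For reference, the same single-column recoding can be written with Series.map (a sketch, shown commented out since the replace above has already run); one behavioral difference is worth noting.
# Alternative to the replace call above (do not run in addition to it):
# status_map = mapping_dict["loan_status"]
# loan["loan_status"] = loan["loan_status"].map(status_map)
# Caveat: map sends any status missing from the dict to NaN, while replace
# leaves unlisted values untouched.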
In [58]:
print(loan["loan_status"].value_counts())
In [59]:
# remove columns with only one unique non-null value, as they carry no signal
drop_columns = []
for col in loan.columns:
    col_series = loan[col].dropna().unique()
    if len(col_series) == 1:
        drop_columns.append(col)
loan = loan.drop(drop_columns, axis=1)
print(drop_columns)
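The same columns can be found without a loop (a sketch, shown for reference; the columns were already dropped above): nunique counts the distinct non-null values per column.
# equivalent vectorized form of the loop above
single_valued = loan.columns[loan.nunique(dropna = True) == 1].tolist()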
In [61]:
loan.isnull().sum()
Out[61]:
In [64]:
loan = loan.drop("pub_rec_bankruptcies", axis = 1)
In [65]:
# drop the remaining rows with missing values, then reset the index so that
# later element-wise comparisons with the prediction Series align correctly
loan = loan.dropna(axis = 0).reset_index(drop = True)
In [73]:
loan.dtypes.value_counts()
Out[73]:
In [74]:
obj_cols = loan.select_dtypes(include = ["object"])
obj_cols.head()
Out[74]:
In [76]:
# explore the columns that appear to be categorical
cols = ["home_ownership", "verification_status", "emp_length", "term", "addr_state", "purpose", "title"]
for col in cols:
    print(loan[col].value_counts())
In [77]:
# remove addr_state (would add far too many dummy variables), title (low data quality,
# overlaps with purpose), and the date columns last_credit_pull_d and earliest_cr_line
# (would require extra processing to become useful features)
loan = loan.drop(["last_credit_pull_d", "addr_state", "title", "earliest_cr_line"], axis = 1)
In [79]:
# convert int_rate and revol_util from percent strings to float
int_rate = loan["int_rate"].str.strip('%')
int_rate = int_rate.astype('float')
revol_util = loan["revol_util"].str.strip('%')
revol_util = revol_util.astype('float')
In [81]:
loan["int_rate"] = int_rate
loan["revol_util"] = revol_util
In [82]:
# convert emp_length to a numeric scale; "n/a" and "< 1 year" both map to 0
mapping_dict = {
"emp_length":{
"10+ years": 10,
"9 years": 9,
"8 years": 8,
"7 years": 7,
"6 years": 6,
"5 years": 5,
"4 years": 4,
"3 years": 3,
"2 years": 2,
"1 year": 1,
"< 1 year": 0,
"n/a": 0
}
}
loan = loan.replace(mapping_dict)
In [84]:
# convert the remaining categorical columns to the category dtype before dummy encoding
cols = ["home_ownership", "verification_status", "purpose", "term"]
for col in cols:
    loan[col] = loan[col].astype("category")
In [86]:
dummy_df = pd.get_dummies(loan[cols])
loan = pd.concat([loan, dummy_df],axis = 1)
loan = loan.drop(cols, axis = 1)
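For reference, the three steps above collapse into a single call (a sketch; it would replace the cell above rather than run after it), since get_dummies with columns= encodes the listed columns and drops the originals in one pass:
# loan = pd.get_dummies(loan, columns = cols)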
In [90]:
loan.info()
In [102]:
features = loan.columns.tolist()
features.remove("loan_status")
target = loan["loan_status"]
In [107]:
# fit a logistic regression model first
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, KFold  # sklearn.cross_validation was removed
lr = LogisticRegression()
kf = KFold(n_splits = 3, shuffle = True, random_state = 1)
predictions = cross_val_predict(lr, loan[features], target, cv = kf)
predictions = pd.Series(predictions)
# False positives.
fp_filter = (predictions == 1) & (loan["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives.
tp_filter = (predictions == 1) & (loan["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives.
fn_filter = (predictions == 0) & (loan["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives
tn_filter = (predictions == 0) & (loan["loan_status"] == 0)
tn = len(predictions[tn_filter])
# Rates
tpr = tp / (tp + fn) #recall
fpr = fp / (fp + tn) #fall-out
print(tpr, fpr)
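As a cross-check (a sketch), sklearn's confusion_matrix produces the same four counts in one call; for binary 0/1 labels, ravel() unpacks them in (tn, fp, fn, tp) order.
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(loan["loan_status"], predictions).ravel()
print(tp / (tp + fn), fp / (fp + tn))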
In [108]:
# Both tpr and fpr are high because the classes are heavily imbalanced (1s far outnumber 0s),
# so the model can predict 1 almost everywhere. Setting class_weight to "balanced" counteracts this.
lr = LogisticRegression(class_weight = "balanced")
kf = KFold(n_splits = 3, shuffle = True, random_state = 1)
predictions = cross_val_predict(lr, loan[features], target, cv = kf)
predictions = pd.Series(predictions)
# False positives.
fp_filter = (predictions == 1) & (loan["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives.
tp_filter = (predictions == 1) & (loan["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives.
fn_filter = (predictions == 0) & (loan["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives
tn_filter = (predictions == 0) & (loan["loan_status"] == 0)
tn = len(predictions[tn_filter])
# Rates
tpr = tp / (tp + fn) #recall
fpr = fp / (fp + tn) #fall-out
print(tpr, fpr)
In this project we are particularly concerned with lowering the false positive rate, since a false positive means funding a loan that is never fully paid back, so we apply a harsher penalty for misclassifying the negative class.
In [109]:
penalty = {0: 10, 1: 1}
lr = LogisticRegression(class_weight = penalty)
kf = KFold(n_splits = 3, shuffle = True, random_state = 1)
predictions = cross_val_predict(lr, loan[features], target, cv = kf)
predictions = pd.Series(predictions)
# False positives.
fp_filter = (predictions == 1) & (loan["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives.
tp_filter = (predictions == 1) & (loan["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives.
fn_filter = (predictions == 0) & (loan["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives
tn_filter = (predictions == 0) & (loan["loan_status"] == 0)
tn = len(predictions[tn_filter])
# Rates
tpr = tp / (tp + fn) #recall
fpr = fp / (fp + tn) #fall-out
print(tpr, fpr)
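Rather than committing to a single weight, a small sweep (a sketch; the weights tried here are illustrative) makes the tpr/fpr trade-off visible:
for w in [2, 5, 10, 20]:
    lr = LogisticRegression(class_weight = {0: w, 1: 1})
    preds = pd.Series(cross_val_predict(lr, loan[features], target, cv = kf))
    tp = ((preds == 1) & (target == 1)).sum()
    fp = ((preds == 1) & (target == 0)).sum()
    fn = ((preds == 0) & (target == 1)).sum()
    tn = ((preds == 0) & (target == 0)).sum()
    print(w, tp / (tp + fn), fp / (fp + tn))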
In [117]:
# try a random forest next; it can capture nonlinear relationships between the features and the target
from sklearn.ensemble import RandomForestClassifier
penalty = {0: 20, 1: 1}
rfc = RandomForestClassifier(class_weight = penalty, random_state = 1, max_depth = 15)
kf = KFold(n_splits = 3, shuffle = True, random_state = 1)
predictions = cross_val_predict(rfc, loan[features], target, cv = kf)
predictions = pd.Series(predictions)
# False positives.
fp_filter = (predictions == 1) & (loan["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives.
tp_filter = (predictions == 1) & (loan["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives.
fn_filter = (predictions == 0) & (loan["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives
tn_filter = (predictions == 0) & (loan["loan_status"] == 0)
tn = len(predictions[tn_filter])
# Rates
tpr = tp / (tp + fn) #recall
fpr = fp / (fp + tn) #fall-out
print(tpr, fpr)
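A natural follow-up (a sketch, illustrative rather than a validated ranking): fit the forest once on the full data and inspect which features it leans on most.
rfc.fit(loan[features], target)
importances = pd.Series(rfc.feature_importances_, index = features)
print(importances.sort_values(ascending = False).head(10))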