In [81]:
import pandas as pd
import numpy as np
# Let pandas show up to 99 columns so wide frames are not truncated in the output.
pd.set_option("display.max_columns", 99)
In [82]:
# The raw file has no header row, so the column names are supplied explicitly.
cols = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base",
    "length", "width", "height", "curb_weight", "engine_type",
    "num_cylinders", "engine_size", "fuel_system", "bore", "stroke",
    "compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg",
    "price",
]
auto = pd.read_csv("Data/auto/imports-85.data", names=cols, header=None)
# Preview the first rows to confirm the names line up with the data.
print(auto.head())
This dataset is based on the 1985 Ward's Automotive Yearbook.
Data description: https://archive.ics.uci.edu/ml/datasets/Automobile
In [83]:
# Turn the "?" placeholders into NaN so pandas treats them as missing values.
auto = auto.replace("?", np.nan)
# DataFrame.info() writes its report itself and returns None, so it is called on
# its own; passing it to print() would emit a stray "None" before the preview.
auto.info()
print(auto.head())
In [84]:
# Remove the text columns "make" and "fuel_type" first.
auto.drop(labels=["make", "fuel_type"], axis="columns", inplace=True)
In [85]:
# Print the frequency of each distinct value, one column at a time.
# Iterating a DataFrame yields its column labels.
for col in auto:
    print(col, auto[col].value_counts())
In [86]:
# Map the spelled-out categories of num_doors, aspiration and num_cylinders to numbers.
convert = {
    "num_doors": {"four": 4, "two": 2},
    "aspiration": {"std": 0, "turbo": 1},
    "num_cylinders": {
        "four": 4, "six": 6, "five": 5, "eight": 8,
        "two": 2, "twelve": 12, "three": 3,
    },
}
# replace() leaves the touched columns as object dtype on pandas versions that no
# longer downcast implicitly; infer_objects() restores numeric dtypes explicitly.
# Columns that still hold strings are left untouched by infer_objects().
auto = auto.replace(convert).infer_objects()
In [87]:
# Discard the remaining categorical columns in a single call.
unused_categoricals = [
    "body_style",
    "drive_wheels",
    "engine_location",
    "fuel_system",
    "engine_type",
]
auto.drop(labels=unused_categoricals, axis="columns", inplace=True)
In [90]:
# Preview the remaining columns, then report dtypes and non-null counts.
print(auto.head())
# info() prints its own report and returns None; wrapping it in print() would
# append a spurious "None" line to the output.
auto.info()
In [91]:
# Cast these columns to float in one step.
float_cols = ["normalized_losses", "bore", "stroke", "horsepower", "peak_rpm", "price"]
auto[float_cols] = auto[float_cols].astype(float)
In [92]:
# Report dtypes and non-null counts after the float cast, then display (rows, columns).
auto.info()
auto.shape
Out[92]:
In [93]:
# Fill missing normalized_losses with the column mean. The result is assigned back:
# an in-place method on `auto[...]` operates on a temporary Series and leaves the
# frame unchanged under pandas Copy-on-Write.
auto["normalized_losses"] = auto["normalized_losses"].fillna(auto["normalized_losses"].mean())
In [94]:
# Count the missing values left in every column.
print(auto.isnull().sum())
# NOTE(review): presumably the columns that still need attention; this list is
# not used in the cells visible here.
cols = ["num_doors", "bore", "stroke", "horsepower", "peak_rpm", "price"]
# Look at the rows without a door count and at the distribution of known values.
print(auto[auto["num_doors"].isnull()])
print(auto["num_doors"].value_counts())
# Fill the gaps with 4 (presumably the most frequent value in the counts printed
# above). Assign the filled column back: an in-place method on `auto[...]` would
# only modify a temporary Series under pandas Copy-on-Write.
auto["num_doors"] = auto["num_doors"].fillna(4)
In [95]:
# Show the rows where bore is missing before imputing.
print(auto[auto["bore"].isnull()])
# Replace missing bore and stroke values with each column's mean. The filled
# column is assigned back because an in-place method on `auto[...]` would only
# change a temporary Series under pandas Copy-on-Write.
for engine_col in ("bore", "stroke"):
    auto[engine_col] = auto[engine_col].fillna(auto[engine_col].mean())
In [96]:
# Tally the missing values remaining in each column.
missing_per_column = auto.isnull().sum()
print(missing_per_column)
In [97]:
# List the rows that have no horsepower value.
missing_horsepower = auto["horsepower"].isnull()
print(auto[missing_horsepower])
In [98]:
# Replace missing horsepower and peak_rpm values with each column's mean.
# fillna() is the direct way to do this, and the result is assigned back because
# an in-place method on `auto[...]` would only change a temporary Series under
# pandas Copy-on-Write.
for power_col in ("horsepower", "peak_rpm"):
    auto[power_col] = auto[power_col].fillna(auto[power_col].mean())
In [99]:
# price is the value being predicted, so it cannot be imputed: the 4 rows that
# are missing it have to be dropped. Display the shape before dropping them.
auto.shape
Out[99]:
In [100]:
# Remove every row whose price is missing.
auto.dropna(axis="index", subset=["price"], inplace=True)
In [101]:
# Min-max scale every column to the [0, 1] range, then put back the unscaled
# price so the errors stay in the original price units.
price = auto["price"]
column_min = auto.min()
column_span = auto.max() - column_min
auto = (auto - column_min) / column_span
auto["price"] = price
# Check the scaled ranges and confirm that no missing values remain.
print(auto.describe())
print(auto.isnull().sum())
In [165]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
# Univariate k-NN helper: train on one feature and score on held-out rows.
def knn_train_test(traincol, targetcol, dataframe, train_size=150):
    """Fit a default k-NN regressor on one feature and return the hold-out RMSE.

    The rows are shuffled with a fixed seed, the first ``train_size`` shuffled
    rows are used for training and the remaining rows for testing.

    Parameters
    ----------
    traincol : str
        Name of the single feature column.
    targetcol : str
        Name of the column to predict.
    dataframe : pandas.DataFrame
        Frame containing both columns.
    train_size : int, default 150
        Number of shuffled rows used for training.

    Returns
    -------
    float
        Root mean squared error of the predictions on the test rows.

    Raises
    ------
    ValueError
        If ``train_size`` would leave the training or the test split empty.
    """
    n_rows = len(dataframe)
    if not 0 < train_size < n_rows:
        raise ValueError(
            "train_size must be between 1 and len(dataframe) - 1, got "
            "%r for %d rows" % (train_size, n_rows)
        )
    # A private RandomState seeded with 1 produces the same permutation as
    # np.random.seed(1) followed by np.random.permutation, but does not reseed
    # the global NumPy generator as a side effect.
    order = np.random.RandomState(1).permutation(n_rows)
    shuffled = dataframe.iloc[order]
    train = shuffled.iloc[:train_size]
    test = shuffled.iloc[train_size:]
    model = KNeighborsRegressor()
    # Double brackets keep the single feature as a 2-D frame for the estimator.
    model.fit(train[[traincol]], train[targetcol])
    predictions = model.predict(test[[traincol]])
    return mean_squared_error(test[targetcol], predictions) ** 0.5
# Score every feature on its own with the default k.
train_cols = auto.columns.drop("price")
rmse_series = {
    feature: knn_train_test(feature, "price", auto) for feature in train_cols
}
# A Series can be sorted; the unsorted result is kept in rmse_result.
rmse_result = pd.Series(rmse_series)
rmse_result.sort_values()
Out[165]:
In [154]:
# Variant of the univariate helper that also takes the number of neighbours.
def knn_train_test2(k, traincol, targetcol, dataframe, train_size=150):
    """Fit a k-NN regressor with ``k`` neighbours on one feature; return the hold-out RMSE.

    The rows are shuffled with a fixed seed, the first ``train_size`` shuffled
    rows are used for training and the remaining rows for testing.

    Parameters
    ----------
    k : int
        Number of neighbours passed to ``KNeighborsRegressor``.
    traincol : str
        Name of the single feature column.
    targetcol : str
        Name of the column to predict.
    dataframe : pandas.DataFrame
        Frame containing both columns.
    train_size : int, default 150
        Number of shuffled rows used for training.

    Returns
    -------
    float
        Root mean squared error of the predictions on the test rows.

    Raises
    ------
    ValueError
        If ``train_size`` would leave the training or the test split empty.
    """
    n_rows = len(dataframe)
    if not 0 < train_size < n_rows:
        raise ValueError(
            "train_size must be between 1 and len(dataframe) - 1, got "
            "%r for %d rows" % (train_size, n_rows)
        )
    # A private RandomState seeded with 1 produces the same permutation as
    # np.random.seed(1) followed by np.random.permutation, but does not reseed
    # the global NumPy generator as a side effect.
    order = np.random.RandomState(1).permutation(n_rows)
    shuffled = dataframe.iloc[order]
    train = shuffled.iloc[:train_size]
    test = shuffled.iloc[train_size:]
    model = KNeighborsRegressor(n_neighbors=k)
    # Double brackets keep the single feature as a 2-D frame for the estimator.
    model.fit(train[[traincol]], train[targetcol])
    predictions = model.predict(test[[traincol]])
    return mean_squared_error(test[targetcol], predictions) ** 0.5
# Sweep k for every single-feature model: rows are features, columns are k values.
# The per-k scores are built without reassigning `rmse_series` / `rmse_result`,
# so the ranking cell further down keeps sorting the default-k baseline instead
# of whichever k happened to run last here.
train_cols = auto.columns.drop("price")
kvalues = [1, 3, 5, 7, 9]
result = pd.DataFrame(
    {
        k: pd.Series(
            {col: knn_train_test2(k, col, "price", auto) for col in train_cols},
            name=k,
        )
        for k in kvalues
    }
)
# Keep only the lowest RMSE in each k column; every other cell shows as NaN.
print(result[result == result.min()])
In [155]:
import matplotlib.pyplot as plt
%matplotlib inline
In [164]:
# Line chart of RMSE per feature, one line per k; the y-axis is limited to 2000-10000.
result.plot.line(ylim=(2000, 10000), figsize=(12, 9))
Out[164]:
In [168]:
# Multivariate helper: train on several feature columns with the default k.
def knn_train_test3(traincol, targetcol, dataframe, train_size=150):
    """Fit a default k-NN regressor on one or more features; return the hold-out RMSE.

    The rows are shuffled with a fixed seed, the first ``train_size`` shuffled
    rows are used for training and the remaining rows for testing.

    Parameters
    ----------
    traincol : str or list-like of str
        Feature column name(s). A single name is treated as a one-column list.
    targetcol : str
        Name of the column to predict.
    dataframe : pandas.DataFrame
        Frame containing the feature and target columns.
    train_size : int, default 150
        Number of shuffled rows used for training.

    Returns
    -------
    float
        Root mean squared error of the predictions on the test rows.

    Raises
    ------
    ValueError
        If ``train_size`` would leave the training or the test split empty.
    """
    n_rows = len(dataframe)
    if not 0 < train_size < n_rows:
        raise ValueError(
            "train_size must be between 1 and len(dataframe) - 1, got "
            "%r for %d rows" % (train_size, n_rows)
        )
    # Selecting with a list always yields a 2-D frame, even for one column.
    features = [traincol] if isinstance(traincol, str) else list(traincol)
    # A private RandomState seeded with 1 produces the same permutation as
    # np.random.seed(1) followed by np.random.permutation, but does not reseed
    # the global NumPy generator as a side effect.
    order = np.random.RandomState(1).permutation(n_rows)
    shuffled = dataframe.iloc[order]
    train = shuffled.iloc[:train_size]
    test = shuffled.iloc[train_size:]
    model = KNeighborsRegressor()
    model.fit(train[features], train[targetcol])
    predictions = model.predict(test[features])
    return mean_squared_error(test[targetcol], predictions) ** 0.5
# Order the features from lowest to highest RMSE. `rmse_result` is produced by an
# earlier cell, so re-run that cell first if the kernel state is in doubt.
ranking = rmse_result.sort_values()
In [180]:
# Evaluate models built from the 2, 3, 4 and 5 best-ranked features.
for n_top in range(2, 6):
    best_features = ranking.index[:n_top]
    print(best_features, knn_train_test3(best_features, "price", auto))
In [183]:
# Multivariate helper that also takes the number of neighbours.
def knn_train_test4(k, traincol, targetcol, dataframe, train_size=150):
    """Fit a k-NN regressor with ``k`` neighbours on one or more features; return the RMSE.

    The rows are shuffled with a fixed seed, the first ``train_size`` shuffled
    rows are used for training and the remaining rows for testing.

    Parameters
    ----------
    k : int
        Number of neighbours passed to ``KNeighborsRegressor``.
    traincol : str or list-like of str
        Feature column name(s). A single name is treated as a one-column list.
    targetcol : str
        Name of the column to predict.
    dataframe : pandas.DataFrame
        Frame containing the feature and target columns.
    train_size : int, default 150
        Number of shuffled rows used for training.

    Returns
    -------
    float
        Root mean squared error of the predictions on the test rows.

    Raises
    ------
    ValueError
        If ``train_size`` would leave the training or the test split empty.
    """
    n_rows = len(dataframe)
    if not 0 < train_size < n_rows:
        raise ValueError(
            "train_size must be between 1 and len(dataframe) - 1, got "
            "%r for %d rows" % (train_size, n_rows)
        )
    # Selecting with a list always yields a 2-D frame, even for one column.
    features = [traincol] if isinstance(traincol, str) else list(traincol)
    # A private RandomState seeded with 1 produces the same permutation as
    # np.random.seed(1) followed by np.random.permutation, but does not reseed
    # the global NumPy generator as a side effect.
    order = np.random.RandomState(1).permutation(n_rows)
    shuffled = dataframe.iloc[order]
    train = shuffled.iloc[:train_size]
    test = shuffled.iloc[train_size:]
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(train[features], train[targetcol])
    predictions = model.predict(test[features])
    return mean_squared_error(test[targetcol], predictions) ** 0.5
# For every k from 1 to 25: rank the features by their single-feature RMSE at
# that k, then score models built from the top 2 to 6 features. Each k becomes
# one column of kranking, with one row per feature count.
kranking = pd.DataFrame()
for k in range(1, 26):
    rmse_series = {
        col: knn_train_test4(k, [col], "price", auto) for col in train_cols
    }
    rmse_result = pd.Series(rmse_series, name=k)
    ranking = rmse_result.sort_values()
    varranking = {
        "numvar =" + str(i): knn_train_test4(k, ranking.index[:i], "price", auto)
        for i in range(2, 7)
    }
    varseries = pd.Series(varranking)
    kranking[k] = varseries
print(kranking)
In [185]:
# Transpose so k runs along the x-axis, with one line per feature count.
kranking.T.plot.line(figsize=(12, 9))
Out[185]: