In [81]:

import pandas as pd
import numpy as np

pd.options.display.max_columns = 99

In [82]:

cols = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration", "num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base", "length", "width", "height", "curb_weight", "engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]
auto = pd.read_csv("Data/auto/imports-85.data", header = None, names = cols)
print(auto.head())

   symboling normalized_losses         make fuel_type aspiration num_doors  \
0          3                 ?  alfa-romero       gas        std       two   
1          3                 ?  alfa-romero       gas        std       two   
2          1                 ?  alfa-romero       gas        std       two   
3          2               164         audi       gas        std      four   
4          2               164         audi       gas        std      four   

    body_style drive_wheels engine_location  wheel_base  length  width  \
0  convertible          rwd           front        88.6   168.8   64.1   
1  convertible          rwd           front        88.6   168.8   64.1   
2    hatchback          rwd           front        94.5   171.2   65.5   
3        sedan          fwd           front        99.8   176.6   66.2   
4        sedan          4wd           front        99.4   176.6   66.4   

   height  curb_weight engine_type num_cylinders  engine_size fuel_system  \
0    48.8         2548        dohc          four          130        mpfi   
1    48.8         2548        dohc          four          130        mpfi   
2    52.4         2823        ohcv           six          152        mpfi   
3    54.3         2337         ohc          four          109        mpfi   
4    54.3         2824         ohc          five          136        mpfi   

   bore stroke  compression_ratio horsepower peak_rpm  city_mpg  highway_mpg  \
0  3.47   2.68                9.0        111     5000        21           27   
1  3.47   2.68                9.0        111     5000        21           27   
2  2.68   3.47                9.0        154     5000        19           26   
3  3.19   3.40               10.0        102     5500        24           30   
4  3.19   3.40                8.0        115     5500        18           22   

   price  
0  13495  
1  16500  
2  16500  
3  13950  
4  17450

This dataset is based on 1985 Ward's Automotive Yearbook

data description: https://archive.ics.uci.edu/ml/datasets/Automobile

In [83]:

auto = auto.replace("?", np.nan)
print(auto.info(), auto.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized_losses    164 non-null object
make                 205 non-null object
fuel_type            205 non-null object
aspiration           205 non-null object
num_doors            203 non-null object
body_style           205 non-null object
drive_wheels         205 non-null object
engine_location      205 non-null object
wheel_base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb_weight          205 non-null int64
engine_type          205 non-null object
num_cylinders        205 non-null object
engine_size          205 non-null int64
fuel_system          205 non-null object
bore                 201 non-null object
stroke               201 non-null object
compression_ratio    205 non-null float64
horsepower           203 non-null object
peak_rpm             203 non-null object
city_mpg             205 non-null int64
highway_mpg          205 non-null int64
price                201 non-null object
dtypes: float64(5), int64(5), object(16)
memory usage: 41.7+ KB
None    symboling normalized_losses         make fuel_type aspiration num_doors  \
0          3               NaN  alfa-romero       gas        std       two   
1          3               NaN  alfa-romero       gas        std       two   
2          1               NaN  alfa-romero       gas        std       two   
3          2               164         audi       gas        std      four   
4          2               164         audi       gas        std      four   

    body_style drive_wheels engine_location  wheel_base  length  width  \
0  convertible          rwd           front        88.6   168.8   64.1   
1  convertible          rwd           front        88.6   168.8   64.1   
2    hatchback          rwd           front        94.5   171.2   65.5   
3        sedan          fwd           front        99.8   176.6   66.2   
4        sedan          4wd           front        99.4   176.6   66.4   

   height  curb_weight engine_type num_cylinders  engine_size fuel_system  \
0    48.8         2548        dohc          four          130        mpfi   
1    48.8         2548        dohc          four          130        mpfi   
2    52.4         2823        ohcv           six          152        mpfi   
3    54.3         2337         ohc          four          109        mpfi   
4    54.3         2824         ohc          five          136        mpfi   

   bore stroke  compression_ratio horsepower peak_rpm  city_mpg  highway_mpg  \
0  3.47   2.68                9.0        111     5000        21           27   
1  3.47   2.68                9.0        111     5000        21           27   
2  2.68   3.47                9.0        154     5000        19           26   
3  3.19   3.40               10.0        102     5500        24           30   
4  3.19   3.40                8.0        115     5500        18           22   

   price  
0  13495  
1  16500  
2  16500  
3  13950  
4  17450

In [84]:

#drop the char cols first
auto.drop(["make", "fuel_type"], axis = 1, inplace = True)

In [85]:

for col in auto.columns:
    print(col, auto[col].value_counts())

symboling  0    67
 1    54
 2    32
 3    27
-1    22
-2     3
Name: symboling, dtype: int64
normalized_losses 161    11
91      8
150     7
128     6
134     6
104     6
65      5
103     5
85      5
94      5
102     5
168     5
95      5
74      5
93      4
148     4
106     4
122     4
118     4
125     3
83      3
154     3
137     3
115     3
101     3
194     2
129     2
108     2
158     2
113     2
110     2
192     2
89      2
197     2
145     2
164     2
81      2
87      2
153     2
119     2
188     2
77      1
142     1
78      1
107     1
98      1
186     1
231     1
121     1
90      1
256     1
Name: normalized_losses, dtype: int64
aspiration std      168
turbo     37
Name: aspiration, dtype: int64
num_doors four    114
two      89
Name: num_doors, dtype: int64
body_style sedan          96
hatchback      70
wagon          25
hardtop         8
convertible     6
Name: body_style, dtype: int64
drive_wheels fwd    120
rwd     76
4wd      9
Name: drive_wheels, dtype: int64
engine_location front    202
rear       3
Name: engine_location, dtype: int64
wheel_base 94.5     21
93.7     20
95.7     13
96.5      8
98.4      7
97.3      7
96.3      6
107.9     6
98.8      6
99.1      6
104.3     6
100.4     6
93.1      5
97.2      5
102.4     5
109.1     5
95.9      5
101.2     4
97.0      4
114.2     4
95.3      4
105.8     3
103.5     3
110.0     3
89.5      3
99.8      2
86.6      2
113.0     2
104.5     2
104.9     2
88.6      2
96.9      2
91.3      2
115.6     2
102.9     2
96.1      2
103.3     2
112.0     1
99.5      1
88.4      1
99.2      1
96.0      1
102.0     1
106.7     1
93.0      1
99.4      1
93.3      1
102.7     1
120.9     1
95.1      1
96.6      1
94.3      1
108.0     1
Name: wheel_base, dtype: int64
length 157.3    15
188.8    11
166.3     7
171.7     7
186.7     7
165.3     6
177.8     6
176.2     6
186.6     6
176.8     5
172.0     5
175.6     5
173.2     5
172.4     4
169.0     4
168.7     4
198.9     4
168.9     4
192.7     3
158.7     3
155.9     3
170.7     3
169.7     3
159.1     3
150.0     3
175.4     3
168.8     2
167.3     2
170.2     2
173.4     2
         ..
173.6     2
166.8     2
187.5     1
165.6     1
178.5     1
172.6     1
178.4     1
181.5     1
208.1     1
159.3     1
197.0     1
162.4     1
165.7     1
158.8     1
180.3     1
169.1     1
157.1     1
156.9     1
171.2     1
163.4     1
178.2     1
191.7     1
199.2     1
141.1     1
181.7     1
157.9     1
177.3     1
175.7     1
193.8     1
183.1     1
Name: length, Length: 75, dtype: int64
width 63.8    24
66.5    23
65.4    15
63.6    11
64.4    10
68.4    10
64.0     9
65.5     8
65.2     7
66.3     6
64.2     6
67.2     6
65.6     6
67.9     5
66.9     5
68.9     4
64.8     4
65.7     4
65.0     3
63.9     3
71.4     3
71.7     3
70.3     3
64.6     2
64.1     2
67.7     2
66.1     2
68.3     2
69.6     2
61.8     1
72.0     1
68.8     1
62.5     1
66.0     1
63.4     1
70.5     1
72.3     1
68.0     1
70.9     1
66.2     1
66.4     1
60.3     1
70.6     1
66.6     1
Name: width, dtype: int64
height 50.8    14
52.0    12
55.7    12
54.5    10
54.1    10
55.5     9
56.7     8
54.3     8
51.6     7
56.1     7
52.6     7
50.2     6
53.0     6
54.9     6
52.8     6
53.7     5
55.1     5
50.6     5
53.3     4
58.7     4
49.6     4
57.5     3
53.5     3
49.7     3
59.1     3
56.2     3
52.5     3
49.4     2
48.8     2
56.3     2
54.4     2
59.8     2
56.5     2
54.7     2
51.4     2
53.9     2
50.5     2
56.0     1
51.0     1
52.4     1
55.4     1
55.6     1
53.1     1
58.3     1
47.8     1
54.8     1
55.9     1
53.2     1
55.2     1
Name: height, dtype: int64
curb_weight 2385    4
1989    3
1918    3
2275    3
3230    2
2410    2
3252    2
2337    2
2403    2
2414    2
2145    2
3139    2
2395    2
1876    2
1874    2
2128    2
1967    2
2380    2
4066    2
2756    2
2579    2
3075    2
2535    2
2548    2
1909    2
2191    2
2290    2
2024    2
2300    2
3151    1
       ..
2707    1
3217    1
3770    1
2190    1
2094    1
2302    1
2443    1
2954    1
2261    1
2695    1
3740    1
3485    1
1951    1
2976    1
2952    1
2563    1
1971    1
1985    1
3505    1
2480    1
2734    1
2221    1
2670    1
2926    1
1950    1
3750    1
1956    1
2037    1
2465    1
2304    1
Name: curb_weight, Length: 171, dtype: int64
engine_type ohc      148
ohcf      15
ohcv      13
dohc      12
l         12
rotor      4
dohcv      1
Name: engine_type, dtype: int64
num_cylinders four      159
six        24
five       11
eight       5
two         4
twelve      1
three       1
Name: num_cylinders, dtype: int64
engine_size 122    15
92     15
98     14
97     14
108    13
90     12
110    12
109     8
120     7
141     7
152     6
121     6
146     6
181     6
91      5
156     5
136     5
130     4
183     4
194     3
164     3
171     3
209     3
70      3
234     2
131     2
134     2
140     2
258     2
132     2
111     1
103     1
145     1
151     1
119     1
173     1
203     1
80      1
79      1
326     1
61      1
308     1
304     1
161     1
Name: engine_size, dtype: int64
fuel_system mpfi    94
2bbl    66
idi     20
1bbl    11
spdi     9
4bbl     3
mfi      1
spfi     1
Name: fuel_system, dtype: int64
bore 3.62    23
3.19    20
3.15    15
3.03    12
2.97    12
3.46     9
3.31     8
3.43     8
3.78     8
2.91     7
3.27     7
3.39     6
3.05     6
3.58     6
3.54     6
3.70     5
3.01     5
3.35     4
3.74     3
3.17     3
3.59     3
3.94     2
3.50     2
3.63     2
3.33     2
3.24     2
3.80     2
3.47     2
3.13     2
3.61     1
3.60     1
3.08     1
2.54     1
3.34     1
2.68     1
2.92     1
2.99     1
3.76     1
Name: bore, dtype: int64
stroke 3.40    20
3.23    14
3.03    14
3.15    14
3.39    13
2.64    11
3.29     9
3.35     9
3.46     8
3.58     6
3.27     6
3.11     6
3.41     6
3.50     6
3.19     6
3.07     6
3.64     5
3.52     5
3.54     4
3.47     4
3.86     4
3.90     3
2.90     3
2.19     2
2.80     2
4.17     2
2.68     2
3.10     2
3.08     2
2.76     1
2.07     1
2.87     1
3.16     1
3.12     1
3.21     1
2.36     1
Name: stroke, dtype: int64
compression_ratio 9.00     46
9.40     26
8.50     14
9.50     13
9.30     11
8.70      9
9.20      8
8.00      8
7.00      7
21.00     5
7.50      5
9.60      5
23.00     5
8.40      5
8.60      5
21.50     4
7.60      4
10.00     3
22.50     3
8.30      3
8.80      3
7.70      2
8.10      2
9.31      1
21.90     1
22.00     1
11.50     1
9.41      1
10.10     1
9.10      1
22.70     1
7.80      1
Name: compression_ratio, dtype: int64
horsepower 68     19
70     11
69     10
116     9
110     8
95      7
88      6
114     6
101     6
62      6
160     6
102     5
76      5
84      5
97      5
82      5
145     5
92      4
111     4
86      4
123     4
85      3
207     3
90      3
73      3
182     3
152     3
121     3
184     2
162     2
161     2
155     2
156     2
112     2
56      2
94      2
52      2
176     2
100     2
78      1
134     1
135     1
60      1
262     1
58      1
72      1
115     1
200     1
143     1
140     1
64      1
175     1
142     1
120     1
55      1
288     1
106     1
48      1
154     1
Name: horsepower, dtype: int64
peak_rpm 5500    37
4800    36
5000    27
5200    23
5400    13
6000     9
5800     7
5250     7
4500     7
4150     5
4200     5
4750     4
4350     4
5100     3
5900     3
4250     3
4400     3
6600     2
4650     1
5300     1
5750     1
5600     1
4900     1
Name: peak_rpm, dtype: int64
city_mpg 31    28
19    27
24    22
27    14
17    13
26    12
23    12
21     8
30     8
25     8
38     7
28     7
37     6
16     6
22     4
15     3
18     3
29     3
20     3
14     2
49     1
47     1
32     1
33     1
34     1
35     1
36     1
45     1
13     1
Name: city_mpg, dtype: int64
highway_mpg 25    19
24    17
38    17
30    16
32    16
34    14
37    13
28    13
29    10
33     9
31     8
22     8
23     7
27     5
43     4
41     3
42     3
26     3
20     2
19     2
18     2
16     2
36     2
39     2
46     2
47     2
53     1
50     1
17     1
54     1
Name: highway_mpg, dtype: int64
price 7609     2
7775     2
9279     2
7957     2
6692     2
5572     2
16500    2
13499    2
8921     2
18150    2
8495     2
7898     2
7295     2
8845     2
6229     2
10698    1
17199    1
32250    1
8778     1
36000    1
8449     1
11248    1
20970    1
7499     1
18920    1
10795    1
15580    1
17075    1
13495    1
15985    1
        ..
16925    1
14489    1
10245    1
5399     1
6095     1
9095     1
15510    1
13200    1
6189     1
16430    1
7099     1
8949     1
9549     1
13415    1
11900    1
7895     1
11199    1
11048    1
16630    1
7788     1
40960    1
17710    1
7463     1
28176    1
9895     1
24565    1
11694    1
6849     1
7999     1
16845    1
Name: price, Length: 186, dtype: int64

In [86]:

convert = {"num_doors":{"four":4, "two":2}, "aspiration":{"std":0, "turbo":1}, "num_cylinders":{"four": 4, "six": 6, "five": 5, "eight": 8, "two":2, "twelve":12, "three": 3}}
#convert to numeric: num_doors, turbo, num_cylinders
auto = auto.replace(convert)

In [87]:

auto.drop(["body_style", "drive_wheels", "engine_location", "fuel_system"], axis = 1, inplace = True)
auto.drop(["engine_type"], axis = 1, inplace = True)

In [90]:

print(auto.head())
print(auto.info())

   symboling normalized_losses  aspiration  num_doors  wheel_base  length  \
0          3               NaN           0        2.0        88.6   168.8   
1          3               NaN           0        2.0        88.6   168.8   
2          1               NaN           0        2.0        94.5   171.2   
3          2               164           0        4.0        99.8   176.6   
4          2               164           0        4.0        99.4   176.6   

   width  height  curb_weight  num_cylinders  engine_size  bore stroke  \
0   64.1    48.8         2548              4          130  3.47   2.68   
1   64.1    48.8         2548              4          130  3.47   2.68   
2   65.5    52.4         2823              6          152  2.68   3.47   
3   66.2    54.3         2337              4          109  3.19   3.40   
4   66.4    54.3         2824              5          136  3.19   3.40   

   compression_ratio horsepower peak_rpm  city_mpg  highway_mpg  price  
0                9.0        111     5000        21           27  13495  
1                9.0        111     5000        21           27  16500  
2                9.0        154     5000        19           26  16500  
3               10.0        102     5500        24           30  13950  
4                8.0        115     5500        18           22  17450  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 19 columns):
symboling            205 non-null int64
normalized_losses    164 non-null object
aspiration           205 non-null int64
num_doors            203 non-null float64
wheel_base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb_weight          205 non-null int64
num_cylinders        205 non-null int64
engine_size          205 non-null int64
bore                 201 non-null object
stroke               201 non-null object
compression_ratio    205 non-null float64
horsepower           203 non-null object
peak_rpm             203 non-null object
city_mpg             205 non-null int64
highway_mpg          205 non-null int64
price                201 non-null object
dtypes: float64(6), int64(7), object(6)
memory usage: 30.5+ KB
None

In [91]:

auto[["normalized_losses","bore", "stroke", "horsepower", "peak_rpm", "price"]] = auto[["normalized_losses","bore", "stroke", "horsepower", "peak_rpm", "price"]].astype(float)

In [92]:

auto.info()
auto.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 19 columns):
symboling            205 non-null int64
normalized_losses    164 non-null float64
aspiration           205 non-null int64
num_doors            203 non-null float64
wheel_base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb_weight          205 non-null int64
num_cylinders        205 non-null int64
engine_size          205 non-null int64
bore                 201 non-null float64
stroke               201 non-null float64
compression_ratio    205 non-null float64
horsepower           203 non-null float64
peak_rpm             203 non-null float64
city_mpg             205 non-null int64
highway_mpg          205 non-null int64
price                201 non-null float64
dtypes: float64(12), int64(7)
memory usage: 30.5 KB

Out[92]:

(205, 19)

In [93]:

auto["normalized_losses"].replace(np.nan, auto["normalized_losses"].mean(), inplace = True)

In [94]:

print(auto.isnull().sum())
cols = ["num_doors", "bore", "stroke", "horsepower", "peak_rpm", "price"]
print(auto[auto["num_doors"].isnull()])
print(auto["num_doors"].value_counts())
auto["num_doors"].replace(np.nan, 4, inplace = True)

symboling            0
normalized_losses    0
aspiration           0
num_doors            2
wheel_base           0
length               0
width                0
height               0
curb_weight          0
num_cylinders        0
engine_size          0
bore                 4
stroke               4
compression_ratio    0
horsepower           2
peak_rpm             2
city_mpg             0
highway_mpg          0
price                4
dtype: int64
    symboling  normalized_losses  aspiration  num_doors  wheel_base  length  \
27          1              148.0           1        NaN        93.7   157.3   
63          0              122.0           0        NaN        98.8   177.8   

    width  height  curb_weight  num_cylinders  engine_size  bore  stroke  \
27   63.8    50.6         2191              4           98  3.03    3.39   
63   66.5    55.5         2443              4          122  3.39    3.39   

    compression_ratio  horsepower  peak_rpm  city_mpg  highway_mpg    price  
27                7.6       102.0    5500.0        24           30   8558.0  
63               22.7        64.0    4650.0        36           42  10795.0  
4.0    114
2.0     89
Name: num_doors, dtype: int64

In [95]:

print(auto[auto["bore"].isnull()])
auto["bore"].replace(np.nan, auto["bore"].mean(), inplace = True)
auto["stroke"].replace(np.nan, auto["stroke"].mean(), inplace = True)

    symboling  normalized_losses  aspiration  num_doors  wheel_base  length  \
55          3              150.0           0        2.0        95.3   169.0   
56          3              150.0           0        2.0        95.3   169.0   
57          3              150.0           0        2.0        95.3   169.0   
58          3              150.0           0        2.0        95.3   169.0   

    width  height  curb_weight  num_cylinders  engine_size  bore  stroke  \
55   65.7    49.6         2380              2           70   NaN     NaN   
56   65.7    49.6         2380              2           70   NaN     NaN   
57   65.7    49.6         2385              2           70   NaN     NaN   
58   65.7    49.6         2500              2           80   NaN     NaN   

    compression_ratio  horsepower  peak_rpm  city_mpg  highway_mpg    price  
55                9.4       101.0    6000.0        17           23  10945.0  
56                9.4       101.0    6000.0        17           23  11845.0  
57                9.4       101.0    6000.0        17           23  13645.0  
58                9.4       135.0    6000.0        16           23  15645.0

In [96]:

print(auto.isnull().sum())

symboling            0
normalized_losses    0
aspiration           0
num_doors            0
wheel_base           0
length               0
width                0
height               0
curb_weight          0
num_cylinders        0
engine_size          0
bore                 0
stroke               0
compression_ratio    0
horsepower           2
peak_rpm             2
city_mpg             0
highway_mpg          0
price                4
dtype: int64

In [97]:

print(auto[auto["horsepower"].isnull()])

     symboling  normalized_losses  aspiration  num_doors  wheel_base  length  \
130          0              122.0           0        4.0        96.1   181.5   
131          2              122.0           0        2.0        96.1   176.8   

     width  height  curb_weight  num_cylinders  engine_size  bore  stroke  \
130   66.5    55.2         2579              4          132  3.46     3.9   
131   66.6    50.5         2460              4          132  3.46     3.9   

     compression_ratio  horsepower  peak_rpm  city_mpg  highway_mpg   price  
130                8.7         NaN       NaN        23           31  9295.0  
131                8.7         NaN       NaN        23           31  9895.0

In [98]:

auto["horsepower"].replace(np.nan, auto["horsepower"].mean(), inplace = True)
auto["peak_rpm"].replace(np.nan, auto["peak_rpm"].mean(), inplace = True)
# instead of using df.replace, the solution uses df.fillna(df.mean()) which is the same way I use here but much easier

In [99]:

#for the missing price ones, as it's the predictive values, have to drop these 4 rows
auto.shape

Out[99]:

(205, 19)

In [100]:

auto.dropna(subset = ["price"], axis = 0, inplace = True)

In [101]:

# Normalize all the rows
price = auto["price"]
auto = (auto - auto.min())/(auto.max() - auto.min())
auto["price"] = price
print(auto.describe())
print(auto.isnull().sum())

        symboling  normalized_losses  aspiration   num_doors  wheel_base  \
count  201.000000         201.000000  201.000000  201.000000  201.000000   
mean     0.568159           0.298429    0.179104    0.572139    0.355598   
std      0.250960           0.167520    0.384397    0.496004    0.176862   
min      0.000000           0.000000    0.000000    0.000000    0.000000   
25%      0.400000           0.188482    0.000000    0.000000    0.230321   
50%      0.600000           0.298429    0.000000    1.000000    0.303207   
75%      0.800000           0.376963    0.000000    1.000000    0.460641   
max      1.000000           1.000000    1.000000    1.000000    1.000000   

           length       width      height  curb_weight  num_cylinders  \
count  201.000000  201.000000  201.000000   201.000000     201.000000   
mean     0.494045    0.477697    0.497222     0.414145       0.236318   
std      0.183913    0.179613    0.203985     0.200658       0.105945   
min      0.000000    0.000000    0.000000     0.000000       0.000000   
25%      0.383582    0.324786    0.350000     0.264158       0.200000   
50%      0.479104    0.444444    0.525000     0.359193       0.200000   
75%      0.632836    0.538462    0.641667     0.557797       0.200000   
max      1.000000    1.000000    1.000000     1.000000       1.000000   

       engine_size        bore      stroke  compression_ratio  horsepower  \
count   201.000000  201.000000  201.000000         201.000000  201.000000   
mean      0.248587    0.564780    0.565178           0.197767    0.258904   
std       0.156781    0.191480    0.150499           0.250310    0.174606   
min       0.000000    0.000000    0.000000           0.000000    0.000000   
25%       0.139623    0.435714    0.495238           0.100000    0.102804   
50%       0.222642    0.550000    0.580952           0.125000    0.219626   
75%       0.301887    0.742857    0.638095           0.150000    0.317757   
max       1.000000    1.000000    1.000000           1.000000    1.000000   

         peak_rpm    city_mpg  highway_mpg         price  
count  201.000000  201.000000   201.000000    201.000000  
mean     0.394965    0.338308     0.386489  13207.129353  
std      0.195148    0.178423     0.179346   7947.066342  
min      0.000000    0.000000     0.000000   5118.000000  
25%      0.265306    0.166667     0.236842   7775.000000  
50%      0.398110    0.305556     0.368421  10295.000000  
75%      0.551020    0.472222     0.473684  16500.000000  
max      1.000000    1.000000     1.000000  45400.000000  
symboling            0
normalized_losses    0
aspiration           0
num_doors            0
wheel_base           0
length               0
width                0
height               0
curb_weight          0
num_cylinders        0
engine_size          0
bore                 0
stroke               0
compression_ratio    0
horsepower           0
peak_rpm             0
city_mpg             0
highway_mpg          0
price                0
dtype: int64

In [165]:

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
#knn train_test function
def knn_train_test(traincol, targetcol, dataframe):
    np.random.seed(1)
    df = dataframe.iloc[np.random.permutation(len(dataframe))]
    train = df[:150]
    test = df[150:]
    model = KNeighborsRegressor()
    model.fit(train[[traincol]], train[targetcol])
    predict = model.predict(test[[traincol]])
    mse = mean_squared_error(test[targetcol], predict)
    rmse = mse ** .5
    return rmse
rmse_series = {}
train_cols = auto.columns.drop("price")
for col in train_cols:
    rmse_series[col] = knn_train_test(col, "price", auto)
#convert to panda series to be sorted
rmse_result = pd.Series(rmse_series)
rmse_result.sort_values()

Out[165]:

engine_size           3051.434222
city_mpg              3684.803554
width                 3917.227670
curb_weight           4011.450036
wheel_base            4161.947972
highway_mpg           4323.502530
num_cylinders         4730.679949
horsepower            4756.983755
length                5416.294064
compression_ratio     5958.572328
normalized_losses     6231.311124
peak_rpm              6326.471744
bore                  6506.581507
height                6666.667678
stroke                6939.691440
aspiration            7293.297564
num_doors             7811.982033
symboling            11034.058934
dtype: float64

In [154]:

#modify the function to accept parameter for k value
def knn_train_test2(k, traincol, targetcol, dataframe):
    np.random.seed(1)
    df = dataframe.iloc[np.random.permutation(len(dataframe))]
    train = df[:150]
    test = df[150:]
    model = KNeighborsRegressor(k)
    model.fit(train[[traincol]], train[targetcol])
    predict = model.predict(test[[traincol]])
    mse = mean_squared_error(test[targetcol], predict)
    rmse = mse ** .5
    return rmse
result = pd.DataFrame()
train_cols = auto.columns.drop("price")
kvalues = [1, 3, 5, 7, 9]
for k in kvalues:
    rmse_series = {}
    for col in train_cols:
        rmse_series[col] = knn_train_test2(k, col, "price", auto)
    rmse_result = pd.Series(rmse_series, name = k)
    result[k]=rmse_result
print(result[result == result.min()])

                             1            3            5            7  \
aspiration                 NaN          NaN          NaN          NaN   
bore                       NaN          NaN          NaN          NaN   
city_mpg                   NaN          NaN          NaN          NaN   
compression_ratio          NaN          NaN          NaN          NaN   
curb_weight                NaN          NaN          NaN          NaN   
engine_size                NaN  2726.617989  3051.434222  3009.880997   
height                     NaN          NaN          NaN          NaN   
highway_mpg                NaN          NaN          NaN          NaN   
horsepower                 NaN          NaN          NaN          NaN   
length                     NaN          NaN          NaN          NaN   
normalized_losses          NaN          NaN          NaN          NaN   
num_cylinders              NaN          NaN          NaN          NaN   
num_doors                  NaN          NaN          NaN          NaN   
peak_rpm                   NaN          NaN          NaN          NaN   
stroke                     NaN          NaN          NaN          NaN   
symboling                  NaN          NaN          NaN          NaN   
wheel_base         2740.625723          NaN          NaN          NaN   
width                      NaN          NaN          NaN          NaN   

                             9  
aspiration                 NaN  
bore                       NaN  
city_mpg                   NaN  
compression_ratio          NaN  
curb_weight                NaN  
engine_size        2951.586311  
height                     NaN  
highway_mpg                NaN  
horsepower                 NaN  
length                     NaN  
normalized_losses          NaN  
num_cylinders              NaN  
num_doors                  NaN  
peak_rpm                   NaN  
stroke                     NaN  
symboling                  NaN  
wheel_base                 NaN  
width                      NaN

In [155]:

import matplotlib.pyplot as plt
%matplotlib inline

In [164]:

result.plot(figsize = (12,9),ylim = (2000,10000))

Out[164]:

<matplotlib.axes._subplots.AxesSubplot at 0x1a201e3c88>

In [168]:

#modify the function to incorporate multiple columns
def knn_train_test3(traincol, targetcol, dataframe):
    np.random.seed(1)
    df = dataframe.iloc[np.random.permutation(len(dataframe))]
    train = df[:150]
    test = df[150:]
    model = KNeighborsRegressor()
    model.fit(train[traincol], train[targetcol])
    predict = model.predict(test[traincol])
    mse = mean_squared_error(test[targetcol], predict)
    rmse = mse ** .5
    return rmse
ranking = rmse_result.sort_values()

In [180]:

#top combinations with top_n_ranked predictor
for i in range(2,6):
    print(ranking.index[:i], knn_train_test3(ranking.index[:i],"price",auto))

Index(['engine_size', 'city_mpg'], dtype='object') 2887.65323888
Index(['engine_size', 'city_mpg', 'width'], dtype='object') 2617.73526723
Index(['engine_size', 'city_mpg', 'width', 'curb_weight'], dtype='object') 2589.59592265
Index(['engine_size', 'city_mpg', 'width', 'curb_weight', 'wheel_base'], dtype='object') 2785.46102052

In [183]:

#modify the function to take in multivariable and k-value
def knn_train_test4(k, traincol, targetcol, dataframe):
    np.random.seed(1)
    df = dataframe.iloc[np.random.permutation(len(dataframe))]
    train = df[:150]
    test = df[150:]
    model = KNeighborsRegressor(k)
    model.fit(train[traincol], train[targetcol])
    predict = model.predict(test[traincol])
    mse = mean_squared_error(test[targetcol], predict)
    rmse = mse ** .5
    return rmse
kranking = pd.DataFrame()
for k in range(1,26):
    rmse_series = {}
    for col in train_cols:
        rmse_series[col] = knn_train_test4(k, [col], "price", auto)
    rmse_result = pd.Series(rmse_series, name = k)
    ranking = rmse_result.sort_values()
    varranking = dict()
    for i in range(2, 7):
#        print("k =", k, "num_var= ", i, ranking.index[:i], knn_train_test4(k, ranking.index[:i],"price",auto))
        varranking["numvar =" + str(i)] = knn_train_test4(k, ranking.index[:i],"price",auto)
    varseries = pd.Series(varranking)
    kranking[k] = varseries
print(kranking)

                    1            2            3            4            5   \
numvar =2  4437.156343  3180.781519  3051.542957  3197.997191  2792.012710   
numvar =3  2135.698164  2737.209363  2938.593962  3051.879296  2797.747685   
numvar =4  2208.623376  2353.175378  2621.713409  2297.287469  2587.380872   
numvar =5  2120.984118  2250.203873  2516.605248  2588.640862  2543.664239   
numvar =6  2123.715829  2191.497081  2453.977213  2532.360879  2537.128715   

                    6            7            8            9            10  \
numvar =2  2876.669568  2708.948135  2804.629833  2823.082238  2885.795088   
numvar =3  2754.592010  2634.729783  2652.873991  2608.116064  2673.756613   
numvar =4  2648.112514  2664.826976  2553.864609  2589.595923  2845.049675   
numvar =5  2714.255019  2613.434012  2663.381774  2722.911476  2932.146476   
numvar =6  2520.730726  2573.884855  2662.578452  2686.020827  2715.694788   

                    11           12           13           14           15  \
numvar =2  2769.319636  2829.822057  2833.806806  2921.067426  3050.007322   
numvar =3  2576.275117  2611.471402  2728.209506  2817.056137  2917.056104   
numvar =4  2726.062597  2729.783400  2738.782084  2874.022131  2995.089647   
numvar =5  2792.069266  2758.168368  2938.144811  2807.116801  2910.011132   
numvar =6  2660.545650  2624.831586  2613.194564  2786.228335  3033.047859   

                    16           17           18           19           20  \
numvar =2  3154.396933  3279.504358  3384.151151  3362.126575  3361.774115   
numvar =3  3037.625056  3160.157808  3251.134246  3325.312863  3356.755339   
numvar =4  3048.467891  3135.063968  3246.222736  3311.095032  3403.904990   
numvar =5  3009.283314  3122.868612  3196.264864  3433.676958  3539.817829   
numvar =6  3152.808446  3050.813571  3152.668142  3660.719550  3719.188530   

                    21           22           23           24           25  
numvar =2  3389.770380  3460.499282  3490.470105  3539.404533  3603.547655  
numvar =3  3460.788353  3535.511536  3588.816495  3647.562909  3710.698185  
numvar =4  3497.334580  3623.257610  3715.180210  3762.555635  3796.023608  
numvar =5  3551.839514  3647.366217  3717.166943  3744.742260  3760.588222  
numvar =6  3754.165046  3792.991058  3796.335246  3529.528828  3630.032775

In [185]:

kranking.transpose().plot(figsize = (12,9))

Out[185]:

<matplotlib.axes._subplots.AxesSubplot at 0x1a208e6cc0>

Phoebe's blog

Airport-Airline mapping with Basemap

social