S&P 500 stock closing price prediction with Linear Regression Model

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
In [2]:
data = pd.read_csv("Data/sphist.csv")
print(data.head())
         Date         Open         High          Low        Close  \
0  2015-12-07  2090.419922  2090.419922  2066.780029  2077.070068   
1  2015-12-04  2051.239990  2093.840088  2051.239990  2091.689941   
2  2015-12-03  2080.709961  2085.000000  2042.349976  2049.620117   
3  2015-12-02  2101.709961  2104.270020  2077.110107  2079.510010   
4  2015-12-01  2082.929932  2103.370117  2082.929932  2102.629883   

         Volume    Adj Close  
0  4.043820e+09  2077.070068  
1  4.214910e+09  2091.689941  
2  4.306490e+09  2049.620117  
3  3.950640e+09  2079.510010  
4  3.712120e+09  2102.629883  

this dataset contains S&P500 stock opening, closing, high/low prices of 1950-2015

In [3]:
df = data.copy()
df["Date"] = pd.to_datetime(df["Date"])
df.sort_values("Date", ascending = True, inplace = True)
print(df.head())
            Date   Open   High    Low  Close     Volume  Adj Close
16589 1950-01-03  16.66  16.66  16.66  16.66  1260000.0      16.66
16588 1950-01-04  16.85  16.85  16.85  16.85  1890000.0      16.85
16587 1950-01-05  16.93  16.93  16.93  16.93  2550000.0      16.93
16586 1950-01-06  16.98  16.98  16.98  16.98  2010000.0      16.98
16585 1950-01-09  17.08  17.08  17.08  17.08  2520000.0      17.08
In [5]:
i = 0
close_acc5 = 0
close_acc30 = 0
close_std5 = []
day5 = []
day30 = []
std5 = []

#calculation 
for idx, row in df.iterrows():
    i += 1
    if i <= 5:
        day5.append(0)
        day30.append(0)
        std5.append(0)
        close_acc5 += row["Close"]
        close_acc30 += row["Close"]
        close_std5.append(row["Close"])
    elif (i <= 30) & (i > 5):
        day5.append(close_acc5/5)
        day30.append(0)
        std5.append(np.std(close_std5))
        close_acc5 -= df.iloc[i-6,:]["Close"]
        close_acc5 += df.iloc[i-1,:]["Close"]
        close_std5.pop(0)
        close_std5.append(df.iloc[i-1,:]["Close"])
        close_acc30 += row["Close"]
    else:
        day30.append(close_acc30 / 30)
        close_acc30 -= df.iloc[i-31]["Close"]
        close_acc30 += df.iloc[i-1]["Close"]
        day5.append(close_acc5/5)
        std5.append(np.std(close_std5))
        close_std5.pop(0)
        close_std5.append(df.iloc[i-1,:]["Close"])
        close_acc5 -= df.iloc[i-6,:]["Close"]
        close_acc5 += df.iloc[i-1,:]["Close"]
df["day_5"] = day5
df["day_30"] = day30
df["std_5"] = std5
print(df.iloc[16330:16339])
[16.66, 16.85, 16.93, 16.98, 17.08]
[16.85, 16.93, 16.98, 17.08, 17.030001000000002]
[16.93, 16.98, 17.08, 17.030001000000002, 17.09]
[16.98, 17.08, 17.030001000000002, 17.09, 16.760000000000002]
[17.08, 17.030001000000002, 17.09, 16.760000000000002, 16.670000000000002]
[17.030001000000002, 17.09, 16.760000000000002, 16.670000000000002, 16.719998999999998]
[17.09, 16.760000000000002, 16.670000000000002, 16.719998999999998, 16.860001]
[16.760000000000002, 16.670000000000002, 16.719998999999998, 16.860001, 16.850000000000001]
[16.670000000000002, 16.719998999999998, 16.860001, 16.850000000000001, 16.870001000000002]
[16.719998999999998, 16.860001, 16.850000000000001, 16.870001000000002, 16.899999999999999]
[16.860001, 16.850000000000001, 16.870001000000002, 16.899999999999999, 16.920000000000002]
[16.850000000000001, 16.870001000000002, 16.899999999999999, 16.920000000000002, 16.860001]
[16.870001000000002, 16.899999999999999, 16.920000000000002, 16.860001, 16.739999999999998]
[16.899999999999999, 16.920000000000002, 16.860001, 16.739999999999998, 16.73]
[16.920000000000002, 16.860001, 16.739999999999998, 16.73, 16.82]
[16.860001, 16.739999999999998, 16.73, 16.82, 17.02]
[16.739999999999998, 16.73, 16.82, 17.02, 17.049999]
[16.73, 16.82, 17.02, 17.049999, 17.049999]
[16.82, 17.02, 17.049999, 17.049999, 17.23]
[17.02, 17.049999, 17.049999, 17.23, 17.290001]
[17.049999, 17.049999, 17.23, 17.290001, 17.32]
[17.049999, 17.23, 17.290001, 17.32, 17.23]
[17.23, 17.290001, 17.32, 17.23, 17.209999]
[17.290001, 17.32, 17.23, 17.209999, 17.280001000000002]
[17.32, 17.23, 17.209999, 17.280001000000002, 17.239999999999998]
            Date       Open       High        Low      Close     Volume  \
16589 1950-01-03  16.660000  16.660000  16.660000  16.660000  1260000.0   
16588 1950-01-04  16.850000  16.850000  16.850000  16.850000  1890000.0   
16587 1950-01-05  16.930000  16.930000  16.930000  16.930000  2550000.0   
16586 1950-01-06  16.980000  16.980000  16.980000  16.980000  2010000.0   
16585 1950-01-09  17.080000  17.080000  17.080000  17.080000  2520000.0   
16584 1950-01-10  17.030001  17.030001  17.030001  17.030001  2160000.0   
16583 1950-01-11  17.090000  17.090000  17.090000  17.090000  2630000.0   
16582 1950-01-12  16.760000  16.760000  16.760000  16.760000  2970000.0   
16581 1950-01-13  16.670000  16.670000  16.670000  16.670000  3330000.0   
16580 1950-01-16  16.719999  16.719999  16.719999  16.719999  1460000.0   
16579 1950-01-17  16.860001  16.860001  16.860001  16.860001  1790000.0   
16578 1950-01-18  16.850000  16.850000  16.850000  16.850000  1570000.0   
16577 1950-01-19  16.870001  16.870001  16.870001  16.870001  1170000.0   
16576 1950-01-20  16.900000  16.900000  16.900000  16.900000  1440000.0   
16575 1950-01-23  16.920000  16.920000  16.920000  16.920000  1340000.0   
16574 1950-01-24  16.860001  16.860001  16.860001  16.860001  1250000.0   
16573 1950-01-25  16.740000  16.740000  16.740000  16.740000  1700000.0   
16572 1950-01-26  16.730000  16.730000  16.730000  16.730000  1150000.0   
16571 1950-01-27  16.820000  16.820000  16.820000  16.820000  1250000.0   
16570 1950-01-30  17.020000  17.020000  17.020000  17.020000  1640000.0   
16569 1950-01-31  17.049999  17.049999  17.049999  17.049999  1690000.0   
16568 1950-02-01  17.049999  17.049999  17.049999  17.049999  1810000.0   
16567 1950-02-02  17.230000  17.230000  17.230000  17.230000  2040000.0   
16566 1950-02-03  17.290001  17.290001  17.290001  17.290001  2210000.0   
16565 1950-02-06  17.320000  17.320000  17.320000  17.320000  1490000.0   
16564 1950-02-07  17.230000  17.230000  17.230000  17.230000  1360000.0   
16563 1950-02-08  17.209999  17.209999  17.209999  17.209999  1470000.0   
16562 1950-02-09  17.280001  17.280001  17.280001  17.280001  1810000.0   
16561 1950-02-10  17.240000  17.240000  17.240000  17.240000  1790000.0   
16560 1950-02-14  17.059999  17.059999  17.059999  17.059999  2210000.0   
16559 1950-02-15  17.059999  17.059999  17.059999  17.059999  1730000.0   
16558 1950-02-16  16.990000  16.990000  16.990000  16.990000  1920000.0   
16557 1950-02-17  17.150000  17.150000  17.150000  17.150000  1940000.0   
16556 1950-02-20  17.200001  17.200001  17.200001  17.200001  1420000.0   
16555 1950-02-21  17.170000  17.170000  17.170000  17.170000  1260000.0   
16554 1950-02-23  17.209999  17.209999  17.209999  17.209999  1310000.0   
16553 1950-02-24  17.280001  17.280001  17.280001  17.280001  1710000.0   
16552 1950-02-27  17.280001  17.280001  17.280001  17.280001  1410000.0   
16551 1950-02-28  17.219999  17.219999  17.219999  17.219999  1310000.0   
16550 1950-03-01  17.240000  17.240000  17.240000  17.240000  1410000.0   

       Adj Close   day_5     day_30     std_5  
16589  16.660000   0.000   0.000000  0.000000  
16588  16.850000   0.000   0.000000  0.000000  
16587  16.930000   0.000   0.000000  0.000000  
16586  16.980000   0.000   0.000000  0.000000  
16585  17.080000   0.000   0.000000  0.000000  
16584  17.030001  16.900   0.000000  0.141280  
16583  17.090000  16.974   0.000000  0.079649  
16582  16.760000  17.022   0.000000  0.060465  
16581  16.670000  16.988   0.000000  0.120565  
16580  16.719999  16.926   0.000000  0.175795  
16579  16.860001  16.854   0.000000  0.171651  
16578  16.850000  16.820   0.000000  0.148728  
16577  16.870001  16.772   0.000000  0.073594  
16576  16.900000  16.794   0.000000  0.082608  
16575  16.920000  16.840   0.000000  0.062290  
16574  16.860001  16.880   0.000000  0.026077  
16573  16.740000  16.880   0.000000  0.026077  
16572  16.730000  16.858   0.000000  0.062738  
16571  16.820000  16.830   0.000000  0.080000  
16570  17.020000  16.814   0.000000  0.072000  
16569  17.049999  16.834   0.000000  0.104995  
16568  17.049999  16.872   0.000000  0.137025  
16567  17.230000  16.934   0.000000  0.133356  
16566  17.290001  17.034   0.000000  0.130323  
16565  17.320000  17.128   0.000000  0.109982  
16564  17.230000  17.188   0.000000  0.116345  
16563  17.209999  17.224   0.000000  0.093724  
16562  17.280001  17.256   0.000000  0.041762  
16561  17.240000  17.266   0.000000  0.040299  
16560  17.059999  17.256   0.000000  0.039294  
16559  17.059999  17.204  16.976667  0.075525  
16558  16.990000  17.170  16.990000  0.092521  
16557  17.150000  17.126  16.994667  0.113067  
16556  17.200001  17.100  17.002000  0.086487  
16555  17.170000  17.092  17.009333  0.074135  
16554  17.209999  17.114  17.012333  0.077615  
16553  17.280001  17.144  17.018333  0.079900  
16552  17.280001  17.202  17.024667  0.044453  
16551  17.219999  17.228  17.042000  0.044453  
16550  17.240000  17.232  17.060333  0.042615  
In [21]:
df = df[df["Date"] > datetime(year = 1950, month = 2, day = 14)]
train = df[df["Date"] < datetime(year = 2013, month = 1, day = 1)]
test = df[df["Date"] >= datetime(year = 2013, month = 1, day = 1)]
feature = ["day_5", "day_30", "std_5"]
In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
lr = LinearRegression()
target = "Close"
lr.fit(train[feature], train[target])
predict_close = lr.predict(test[feature])
mae = abs(predict_close - test[target]).mean()
print(train.tail())
          Date         Open         High          Low        Close  \
743 2012-12-24  1430.150024  1430.150024  1424.660034  1426.660034   
742 2012-12-26  1426.660034  1429.420044  1416.430054  1419.829956   
741 2012-12-27  1419.829956  1422.800049  1401.800049  1418.099976   
740 2012-12-28  1418.099976  1418.099976  1401.579956  1402.430054   
739 2012-12-31  1402.430054  1426.739990  1398.109985  1426.189941   

           Volume    Adj Close        day_5       day_30     std_5  
743  1.248960e+09  1426.660034  1437.360010  1405.926001  6.817332  
742  2.285030e+09  1419.829956  1436.620019  1407.486336  7.682855  
741  2.830180e+09  1418.099976  1431.228003  1408.813000  8.102333  
740  2.426680e+09  1402.430054  1427.685986  1410.265332  9.130821  
739  3.204330e+09  1426.189941  1419.434009  1411.830001  9.572036  

additional indicator to consider

  • The average volume over the past five days.
  • The average volume over the past year.
  • The ratio between the average volume for the past five days, and the average volume for the past year.
  • The standard deviation of the average volume over the past five days.
  • The standard deviation of the average volume over the past year.
  • The ratio between the standard deviation of the average volume for the past five days, and the standard deviation of the average volume for the past year.
  • The year component of the date.
  • The ratio between the lowest price in the past year and the current price.
  • The ratio between the highest price in the past year and the current price.
  • The year component of the date.
  • The month component of the date.
  • The day of week.
  • The day component of the date.
  • The number of holidays in the prior month.

social