import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
for dirname, _, filenames in os.walk('C:/Users/Stevie/Data_Analyasis/timeseries1'):
for filename in filenames:
print(os.path.join(dirname,filename))
holiday = pd.read_csv('C:/Users/Stevie/Data_Analysis/timeseries1/holidays_events.csv')
oil = pd.read_csv('C:/Users/Stevie/Data_Analysis/timeseries1/oil.csv')
sample = pd.read_csv('C:/Users/Stevie/Data_Analysis/timeseries1/sample_submission.csv')
stores = pd.read_csv('C:/Users/Stevie/Data_Analysis/timeseries1/stores.csv')
test = pd.read_csv('C:/Users/Stevie/Data_Analysis/timeseries1/test.csv')
train = pd.read_csv('C:/Users/Stevie/Data_Analysis/timeseries1/train.csv')
transaction = pd.read_csv('C:/Users/Stevie/Data_Analysis/timeseries1/transactions.csv')
list(stores)
['store_nbr', 'city', 'state', 'type', 'cluster']
# left joinning the store tbl to the train tbl
df = pd.merge(train,stores,how='left',on="store_nbr")
df
id | date | store_nbr | family | sales | onpromotion | city | state | type | cluster | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2013-01-01 | 1 | AUTOMOTIVE | 0.000 | 0 | Quito | Pichincha | D | 13 |
1 | 1 | 2013-01-01 | 1 | BABY CARE | 0.000 | 0 | Quito | Pichincha | D | 13 |
2 | 2 | 2013-01-01 | 1 | BEAUTY | 0.000 | 0 | Quito | Pichincha | D | 13 |
3 | 3 | 2013-01-01 | 1 | BEVERAGES | 0.000 | 0 | Quito | Pichincha | D | 13 |
4 | 4 | 2013-01-01 | 1 | BOOKS | 0.000 | 0 | Quito | Pichincha | D | 13 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3000883 | 3000883 | 2017-08-15 | 9 | POULTRY | 438.133 | 0 | Quito | Pichincha | B | 6 |
3000884 | 3000884 | 2017-08-15 | 9 | PREPARED FOODS | 154.553 | 1 | Quito | Pichincha | B | 6 |
3000885 | 3000885 | 2017-08-15 | 9 | PRODUCE | 2419.729 | 148 | Quito | Pichincha | B | 6 |
3000886 | 3000886 | 2017-08-15 | 9 | SCHOOL AND OFFICE SUPPLIES | 121.000 | 8 | Quito | Pichincha | B | 6 |
3000887 | 3000887 | 2017-08-15 | 9 | SEAFOOD | 16.000 | 0 | Quito | Pichincha | B | 6 |
3000888 rows × 10 columns
df["family"].nunique()
33
# Initial plot analysis
# Group aggregates
(df.groupby("family")['sales'].mean().sort_values(ascending=True)
.plot(kind='barh',figsize=(10,6),title="Total Amount of sales per product")
)
plt.xlabel('Amount')
plt.ylabel('Product')
df.groupby("family")['sales'].mean()
family AUTOMOTIVE 6.101236 BABY CARE 0.110528 BEAUTY 3.715723 BEVERAGES 2385.793151 BOOKS 0.070797 BREAD/BAKERY 463.336254 CELEBRATION 8.370469 CLEANING 1072.416744 DAIRY 709.154889 DELI 265.135067 EGGS 171.420516 FROZEN FOODS 154.766954 GROCERY I 3776.972100 GROCERY II 21.584048 HARDWARE 1.137833 HOME AND KITCHEN I 20.470342 HOME AND KITCHEN II 16.722420 HOME APPLIANCES 0.457476 HOME CARE 176.198029 LADIESWEAR 7.160629 LAWN AND GARDEN 6.035475 LINGERIE 7.182128 LIQUOR,WINE,BEER 85.187824 MAGAZINES 2.929082 MEATS 341.849965 PERSONAL CARE 270.432513 PET SUPPLIES 3.921263 PLAYERS AND ELECTRONICS 6.186857 POULTRY 350.532292 PREPARED FOODS 96.770202 PRODUCE 1349.352123 SCHOOL AND OFFICE SUPPLIES 2.961599 SEAFOOD 22.163190 Name: sales, dtype: float64
import plotly.offline as pyo
import plotly.express as px
pyo.init_notebook_mode()
d2 = df[['date','sales','onpromotion','type']].groupby('date').sum().reset_index()
fig = px.line(d2,x='date',y='sales',title="Total sales for all stores and products between 2013 and 2017")
fig.update_layout({
"xaxis":{"title":"Date"},
"yaxis":{'title':"Total Amount Sold"}
})
fig.show()
# Which store sells the most?
df2 = df.groupby(['family','store_nbr','type'])[["sales"]].mean().sort_values(by="sales",ascending=False).reset_index()
df_pie = df2.groupby('type')[['sales']].mean().reset_index().sort_values(by='sales', ascending=False)
# Converting into date columns and extracting monthly records
d2['date'] = pd.to_datetime(d2['date'])
d2['month'] = d2['date'].dt.month
def text(x):
return{
1:'Jan',
2:"Feb",
3:"Mar",
4:"Apr",
5:"May",
6:"Jun",
7:'July',
8:'Aug',
9:'Sept',
10:'Oct',
11:'Nov',
12:'Dec'
}[x]
d2['month'] = d2['month'].apply(text)
df_bar = d2.groupby('month')[['sales']].mean().reset_index().sort_values(by='sales',ascending=False)
# Color for bar chart
import warnings
warnings.filterwarnings(action='once')
df_bar['color'] = '#c3c6b7'
df_bar['color'][:1] = '#505c46'
df_bar['color'][1:2] = '#656f55'
df_bar['color'][2:3] = "#858b6c"
df_bar['color'][3:4] = "#999e87"
# chart
from plotly.subplots import make_subplots
import plotly.graph_objs as go
fig = make_subplots(rows=1, cols=2,
specs=[[{'type':'bar'},{'type':'pie'}]],
column_widths=[0.7, 0.3], vertical_spacing=0.5, horizontal_spacing=0.05,
subplot_titles=("Monthly Average Sales", "Sales per Store Types"))
fig.add_trace(go.Bar(x=df_bar['sales'], y=df_bar['month'],marker=dict(color= df_bar['color']),text=round(df_bar['sales'],2),
name='family', orientation='h'),
row=1, col=1)
fig.add_trace(go.Pie(values=df_pie['sales'], labels=df_pie['type'], name='Store type',
marker=dict(colors=['#505c46','#656f55','#858b6c','#999e87','#c3c6b7']), hole=0.7,
hoverinfo='label+percent+value', textinfo='label'),
row=1, col=2)
fig.update_layout(plot_bgcolor = "white")
C:\Users\Stevie\AppData\Local\Temp/ipykernel_5952/1769151063.py:31: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
d3 = pd.merge(train,stores,how='left',on="store_nbr")
d4 = pd.merge(d3,holiday,how='left',on="date")
d4.rename(columns={'type_x':'store_type','type_y':'type'},inplace=True)
pie2= d4.groupby(['store_type','type'])[['sales']].mean().reset_index()
fig3 = px.scatter(pie2,x='store_type',y='type',color='sales',size='sales',title="Average sales of each store type per holiday type")
fig3.update_layout(yaxis_title="Holiday Type",
plot_bgcolor="white"
)
pie2
store_type | type | sales | |
---|---|---|---|
0 | A | Additional | 957.698141 |
1 | A | Bridge | 969.824204 |
2 | A | Event | 813.558037 |
3 | A | Holiday | 723.282966 |
4 | A | Transfer | 984.633589 |
5 | A | Work Day | 803.755953 |
6 | B | Additional | 445.089571 |
7 | B | Bridge | 355.831136 |
8 | B | Event | 409.931014 |
9 | B | Holiday | 332.504335 |
10 | B | Transfer | 426.551892 |
11 | B | Work Day | 292.564950 |
12 | C | Additional | 267.189999 |
13 | C | Bridge | 239.883849 |
14 | C | Event | 233.702849 |
15 | C | Holiday | 193.947573 |
16 | C | Transfer | 226.688622 |
17 | C | Work Day | 202.484202 |
18 | D | Additional | 480.446294 |
19 | D | Bridge | 428.372398 |
20 | D | Event | 416.987452 |
21 | D | Holiday | 344.354777 |
22 | D | Transfer | 448.077986 |
23 | D | Work Day | 371.903094 |
24 | E | Additional | 374.019711 |
25 | E | Bridge | 310.129571 |
26 | E | Event | 343.214052 |
27 | E | Holiday | 269.561215 |
28 | E | Transfer | 379.704359 |
29 | E | Work Day | 197.666095 |
d2['weekday'] = d2['date'].dt.dayofweek
d2 = d2.sort_values(by='weekday')
def text2 (x):
return{ 1:'Monday',
2:'Tuesday',
3:'Wednesday',
4:'Thursday',
5:'Friday',
6:'Saturday',
0:"Sunday"}[x]
d2['weekday'] = d2['weekday'].apply(text2)
df_bar2 = d2.groupby('weekday')[['sales']].mean().reset_index()
# Plotting
fig2 = px.bar(df_bar2, x='sales', y='weekday',
color_discrete_sequence=['#999e87'],
title="Average sales per day of week",
category_orders={
"weekday":["Monday","Tuesday","Wednesday","Thursday", "Friday","Saturday","Sunday"]},
text=round(df_bar2['sales'],2))
fig2.update_layout(plot_bgcolor = "white",
yaxis_title=" ",
xaxis_title=" ",
margin=dict(t=80, b=0, l=0, r=20), height=300)
import statsmodels
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.arima_model import ARMA
d5 = train.loc[train["family"]=="GROCERY I"]
ts = d5.groupby('date')[['sales']].mean().reset_index()
ts['date'] = pd.to_datetime(ts['date'])
ts.set_index("date")
adft = adfuller(ts['sales'])
output_df = pd.DataFrame({"Values":[adft[0],adft[1],adft[2],adft[3], adft[4]['1%'], adft[4]['5%'], adft[4]['10%']] , "Metric":["Test Statistics","p-value","No. of lags used","Number of observations used",
"critical value (1%)", "critical value (5%)", "critical value (10%)"]})
print(output_df)
Values Metric 0 -3.448763 Test Statistics 1 0.009405 p-value 2 25.000000 No. of lags used 3 1658.000000 Number of observations used 4 -3.434300 critical value (1%) 5 -2.863285 critical value (5%) 6 -2.567699 critical value (10%)
autocorrelation_lag1 = ts['sales'].autocorr(lag=1)
print("One Month Lag: ", autocorrelation_lag1)
autocorrelation_lag5 = ts['sales'].autocorr(lag=5)
print("Five Month Lag: ", autocorrelation_lag5)
autocorrelation_lag9 = ts['sales'].autocorr(lag=9)
print("Nine Month Lag: ", autocorrelation_lag9)
One Month Lag: 0.6687285451788195 Five Month Lag: 0.26633735060260877 Nine Month Lag: 0.21353788135609938
# test and train datasets before differencing
ts['year'] = ts['date'].dt.year
ts.index = ts['date']
train = ts.loc[ts['year']<=2016]
del train['date']
del train['year']
test = ts.loc[ts['year']>2016]
del test['date']
del test['year']
fig_dims = (14, 8)
fig, ax = plt.subplots(figsize=fig_dims)
plt.plot(train,color='black')
plt.plot(test,color='red')
plt.ylabel('Average Sales')
plt.legend(('TEST','TRAIN'),fontsize=15)
<matplotlib.legend.Legend at 0x22200a763d0>
from imp import reload
reload(plt)
def difference(data,interval=1):
df = list()
for i in range(interval,len(data)):
x = data[i] - data[i-1]
df.append(x)
return pd.Series(df)
#plt.figure(figsize=(14,6))
#ts_d = pd.DataFrame(difference(ts['sales']),index=ts['date'].index,columns='sales')
#forecast = pd.DataFrame(forecast,index = test['Number'].index,columns=['Prediction'])
ts_d1 = pd.DataFrame(difference(ts['sales']))
ts_d2 = ts[['year']][1:].reset_index()
ts_d = pd.concat([ts_d1,ts_d2],axis=1)
ts_d.index = ts_d['date']
del ts_d['date']
ts_d.columns = ['sales','year']
train2 = ts_d.loc[ts_d['year']<=2016]
test2 = ts_d.loc[ts['year']>2016]
fig_dims = (14, 8)
fig, ax = plt.subplots(figsize=fig_dims)
plt.plot(train2['sales'],color='black')
plt.plot(test2['sales'],color='red')
plt.ylabel('Average Sales')
plt.xlabel('Year')
plt.legend(('TEST','TRAIN'),fontsize=11)
plt.title("Test / Train data after first-differencing")
Text(0.5, 1.0, 'Test / Train data after first-differencing')
from pmdarima import auto_arima
model2 = auto_arima(train2['sales'],trace=True,suppress_warnings=True,error_action='ignore')
model2.fit(train2['sales'])
forecast = model2.predict(n_periods=len(test2))
forecast = pd.DataFrame(forecast,index=test2.index,columns=['Prediction'])
Performing stepwise search to minimize aic ARIMA(2,0,2)(0,0,0)[0] intercept : AIC=23335.885, Time=1.29 sec ARIMA(0,0,0)(0,0,0)[0] intercept : AIC=23886.084, Time=0.02 sec ARIMA(1,0,0)(0,0,0)[0] intercept : AIC=23886.889, Time=0.06 sec ARIMA(0,0,1)(0,0,0)[0] intercept : AIC=23883.287, Time=0.08 sec ARIMA(0,0,0)(0,0,0)[0] : AIC=23884.111, Time=0.02 sec ARIMA(1,0,2)(0,0,0)[0] intercept : AIC=23348.813, Time=1.11 sec ARIMA(2,0,1)(0,0,0)[0] intercept : AIC=23346.705, Time=0.76 sec ARIMA(3,0,2)(0,0,0)[0] intercept : AIC=23364.443, Time=1.38 sec ARIMA(2,0,3)(0,0,0)[0] intercept : AIC=23330.426, Time=1.36 sec ARIMA(1,0,3)(0,0,0)[0] intercept : AIC=inf, Time=1.37 sec ARIMA(3,0,3)(0,0,0)[0] intercept : AIC=23078.063, Time=1.82 sec ARIMA(4,0,3)(0,0,0)[0] intercept : AIC=22904.970, Time=1.81 sec ARIMA(4,0,2)(0,0,0)[0] intercept : AIC=22914.009, Time=1.81 sec ARIMA(5,0,3)(0,0,0)[0] intercept : AIC=22868.238, Time=2.29 sec ARIMA(5,0,2)(0,0,0)[0] intercept : AIC=22923.196, Time=1.95 sec ARIMA(5,0,4)(0,0,0)[0] intercept : AIC=22693.707, Time=2.66 sec ARIMA(4,0,4)(0,0,0)[0] intercept : AIC=22783.774, Time=2.19 sec ARIMA(5,0,5)(0,0,0)[0] intercept : AIC=22629.690, Time=3.03 sec ARIMA(4,0,5)(0,0,0)[0] intercept : AIC=22667.924, Time=2.60 sec ARIMA(5,0,5)(0,0,0)[0] : AIC=inf, Time=2.41 sec Best model: ARIMA(5,0,5)(0,0,0)[0] intercept Total fit time: 30.038 seconds
plt.figure(figsize=(15,6))
plt.plot(train2['sales'],color='black')
plt.plot(test2['sales'],color='red')
plt.plot(forecast['Prediction'],color='blue')
plt.ylabel('Average Sales')
plt.xlabel('Year')
plt.title('Train/Test dataset versus forecast with ARIMA()')
plt.legend(['Train','Test','Forecast'],fontsize=13)
<matplotlib.legend.Legend at 0x22201e03040>
#### It seems like the forecast is suggesting that the sales will slowly decreases and flucutate less and less.
from math import sqrt
from sklearn.metrics import mean_squared_error
rms = sqrt(mean_squared_error(test2['sales'],forecast))
print("RMSE: ", rms)
RMSE: 1190.7776134067853