"""
Importing all libraries to execute code
"""

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Basic imports
import time
import numpy as np
import pandas as pd
from datetime import datetime

# Data visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from matplotlib.dates import date2num
from matplotlib.ticker import FuncFormatter
import seaborn as sns
%matplotlib inline

# Statistics
from statsmodels.distributions.empirical_distribution import ECDF
from statsmodels.tsa.api import ExponentialSmoothing
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm

# Time Series Analysis
from pmdarima import auto_arima

# Machine Learning: XGBoost
# import xgboost as xgb
# from xgboost.sklearn import XGBRegressor  # Wrapper
# from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Excel File Manipulation
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import openpyxl

# Data Profiling
from dataprofiler import Profiler, Data
from collections import defaultdict
pd.options.display.float_format = '{:.0f}'.format

# Custom Function (from original code block)
def millions(x, pos):
    return f'${x*1e-6:,.0f}M'
qualitative_colors = [
    "#1f77b4",  # Blue
    "#ff7f0e",  # Orange
    "#2ca02c",  # Green
    "#d62728",  # Red
    "#9467bd",  # Purple
    "#8c564b",  # Brown
    "#e377c2",  # Pink
    "#7f7f7f",  # Gray
    "#bcbd22",  # Olive
    "#17becf"   # Cyan
]

# importing train data to learn
train = pd.read_csv("train.csv", 
                    parse_dates = True, index_col = 'Date')

# additional store data
store = pd.read_csv("store.csv")
store = store.apply(lambda x: pd.to_numeric(x, errors='coerce') if x.dtype == 'float' else x)
store = store.round(1)
display(train)
display(store)

print("The data runs over ", train.index.nunique(), "days")
print("The unique number of stores ", train['Store'].nunique())


# Adding auxillary columns to aid in visualization and modelling
train['Year'] = train.index.year
train['Month'] = train.index.month
train['Day'] = train.index.day
train['WeekOfYear'] = train.index.isocalendar().week 

# adding SalesPerCustomer to better understand consumer trends
train['SalePerCustomer'] = train['Sales']/train['Customers']

The data runs over  942 days
The unique number of stores  1115

# Aggretating sales by day
train_aggregated = train.groupby(train.index).agg(
    Total_Sales=('Sales', 'sum'),
    Total_Customers=('Customers', 'sum')
)
train_weekly = train_aggregated.resample('W').sum()

sns.set_theme(style="ticks")
# Daily Sales Plot
plt.figure(figsize=(18,6))
plt.ylabel("Daily Total Sales",color = '#858585')
plt.xlabel("Date",color = '#858585')
formatter = FuncFormatter(millions)
plt.gca().yaxis.set_major_formatter(formatter)
plt.title("Daily Total Sales by Date", color = 'black',fontweight = 'bold')
plt.annotate(
    'Store Closures', 
    xy=(pd.to_datetime('2012-12-30'), 100000),  # Point near the 80% mark
    xytext=(date2num(pd.to_datetime('2012-11-20')), 14300000), 
    arrowprops=dict(facecolor='black',edgecolor='black', arrowstyle="->",linestyle = "--", lw=1.5),
    fontsize=10,
    color='black'
)
sns.lineplot(x="Date", y="Total_Sales",
           
             data=train_aggregated)

# Weekly Sales Plot to highlight Yearly Seasonality
plt.figure(figsize=(18,6))
plt.ylabel("Weekly Total Sales",color = '#858585')
plt.xlabel("Date",color = '#858585')
formatter = FuncFormatter(millions)
plt.gca().yaxis.set_major_formatter(formatter)
plt.title("Weekly Total Sales by Date", color = 'black',fontweight = 'bold')
plt.annotate(
    'Yearly Seasonality', 
    xy=(pd.to_datetime('2013-12-10'), 75000000),  # Point near the 80% mark
    xytext=(date2num(pd.to_datetime('2013-01-01')), 65000000), 
    arrowprops=dict(facecolor='black',edgecolor='black', arrowstyle="->", lw=1.5),
    fontsize=10,
    color='black'
)
plt.annotate(
    '', 
    xy=(pd.to_datetime('2013-01-19'), 50000000),  # Point near the 80% mark
    xytext=(date2num(pd.to_datetime('2013-01-01')), 65000000), 
    arrowprops=dict(facecolor='black',edgecolor='black', arrowstyle="->", lw=1.5),
    fontsize=10,
    color='black'
)
sns.lineplot(x=train_weekly.index, y="Total_Sales",
           
             data=train_weekly)
             
#Day of Week Sales plot to show weekly seasonality
train['DayOfWeek'] = train.index.dayofweek  # 0 = Monday, 6 = Sunday
# Aggregate sales by DayOfWeek
sales_by_dow = train.groupby('DayOfWeek')['Sales'].sum().reset_index()

plt.figure(figsize=(18, 6))
sns.set_theme(style="ticks")
sns.barplot(x='DayOfWeek', y='Sales', data=sales_by_dow, palette='rocket')
plt.ylabel("Total Sales",color = '#858585')
plt.xlabel("Day of the Week",color = '#858585')
plt.title('Total Sales by Day of the Week', color = 'black',fontweight = 'bold')
plt.xticks(ticks=range(7), labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.gca().yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x*1e-6:,.0f}M'))
plt.show()

sales= (
    train
    .groupby([train.index, 'Promo'])  # Group by index and Promo
    .agg({
        'Sales': 'sum',          # Sum "Sales"
        'Customers': 'sum'       # Sum "Customers"
    })
)
sales_6_months = sales['2013-01-01':'2013-06-01'].reset_index()

sns.set_theme(style="ticks")

# Initialize figure
plt.figure(figsize=(18,6))
plt.ylabel("Total Sales", color='#858585')
plt.xlabel("Date", color='#858585')

# Format y-axis
formatter = FuncFormatter(millions)
plt.gca().yaxis.set_major_formatter(formatter)

# Title
plt.title("Daily Sales over 6 Months with Promotional Periods Highlighted", color = 'black',fontweight = 'bold')
plt.annotate(
    'Yearly Seasonality', 
    xy=(pd.to_datetime('2013-12-10'), 14000000),  # Point near the 80% mark
    xytext=(date2num(pd.to_datetime('2013-01-01')), 14300000), 
    arrowprops=dict(facecolor='black',edgecolor='black', arrowstyle="->", lw=1.5),
    fontsize=10,
    color='black'
)
# Plot the Sales time series
sns.lineplot(x="Date", y="Sales", data=sales_6_months)


# Highlight Promo periods
for _, row in sales_6_months[sales_6_months['Promo'] == 1].iterrows():
    plt.axvspan(row['Date'] - pd.Timedelta(days=0.5), 
                row['Date'] + pd.Timedelta(days=0.5), 
                color='#ffb554', label="Promo" if _ == 0 else "",linewidth=None)
plt.legend()
plt.show()

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.

sns.set_theme(style = "ticks")

# basic color for plots
plt.figure(figsize = (18, 6))

plt.subplot(311)
cdf = ECDF(train['Sales'])
plt.annotate(
    '~20% of days have no sales', 
    xy=(3500, 0.2),  # Point near the 80% mark
    xytext=(7000, 0.2), 
    arrowprops=dict(facecolor='black',edgecolor='black', arrowstyle="->", lw=1.5),
    fontsize=10,
    color=qualitative_colors[0]
)
plt.plot(cdf.x, cdf.y, label = "statmodels", color = qualitative_colors[0]);
plt.xlabel('Sales',color = '#858585'); 
plt.ylabel('ECDF',color = '#858585');

# plot second ECDF  
plt.subplot(312)
cdf = ECDF(train['Customers'])
plt.annotate(
    '~20% of days have no customers', 
    xy=(350, 0.2),  # Point near the 80% mark
    xytext=(900, 0.2), 
    arrowprops=dict(facecolor='black',edgecolor='black', arrowstyle="->", lw=1.5),
    fontsize=10,
    color=qualitative_colors[0]
)
plt.plot(cdf.x, cdf.y, label = "statmodels", color = qualitative_colors[1]);
plt.xlabel('Customers',color = '#858585');
plt.ylabel('ECDF',color = '#858585');

# plot second ECDF  
plt.subplot(313)
cdf = ECDF(train['SalePerCustomer'])
plt.plot(cdf.x, cdf.y, label = "statmodels", color = qualitative_colors[2]);
plt.xlabel('Sale per Customer',color = '#858585');
plt.ylabel('ECDF',color = '#858585');
plt.tight_layout()

#Remove days with no sales 
train = train[(train["Open"] != 0) & (train['Sales'] != 0)]
#Checking how many null columns are in our store data.
store.info()
store['CompetitionDistance'].fillna(store['CompetitionDistance'].median(), inplace = True)
store.fillna(0, inplace = True)

print(store.groupby('StoreType').size())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1112 non-null   float64
 4   CompetitionOpenSinceMonth  761 non-null    float64
 5   CompetitionOpenSinceYear   761 non-null    float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            571 non-null    float64
 8   Promo2SinceYear            571 non-null    float64
 9   PromoInterval              571 non-null    object 
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB
StoreType
a    602
b     17
c    148
d    348
dtype: int64

print(train.shape)
print("Joining train set with an additional store information.")

# by specifying inner join we make sure that only those observations 
# that are present in both train and store sets are merged together
train_store = pd.merge(train.reset_index(), store, how='inner', on='Store')
train_store.set_index('Date', inplace=True)

(844338, 13)
Joining train set with an additional store information.

train_store.groupby('StoreType')['Sales'].describe()
train_store.groupby('StoreType')[['Customers', 'Sales']].sum()

# Compute total counts of Promo = 1 and Promo = 0 for each StoreType
promo_totals = (
    train_store.groupby(['StoreType', 'Promo'])
    .size()
    .reset_index(name='TotalDays')  # Add a 'TotalDays' column
)

# Create the plot
g = sns.catplot(
    data=train_store, 
    x='Month', 
    y='Sales', 
    col='StoreType',  # Separate plots for each store type
    hue='Promo',      # Separate lines for Promo = 0 and Promo = 1
    kind='point', 
    palette='plasma', 
    sharey=True
)

# Add annotations for total Promo counts
for ax, store_type in zip(g.axes.flat, train_store['StoreType'].unique()):
    # Filter totals for this StoreType
    store_promo_totals = promo_totals[promo_totals['StoreType'] == store_type]
    #print(store_type)
    for promo in [0, 1]:
        # Get the total count of days for Promo = 0 or 1
        total_days = store_promo_totals[
            store_promo_totals['Promo'] == promo
        ]['TotalDays']
        
        if not total_days.empty:
            # Annotate the total days
            if store_type !='d':
                ax.text(
                    x=2,  # Position text near the left edge
                    y=ax.get_ylim()[1] * 0.9 - promo * 1000,  # Offset for each promo line
                    s=f"Promo {promo}: {total_days.values[0]} days",
                    fontsize=10,
                    color='black',
                    ha='center'
                )
            else:
                ax.text(
                    x=2,  # Position text near the left edge
                    y=ax.get_ylim()[1] * 0.9 - promo * 1000,  # Offset for each promo line
                    s=f"Promo {promo}: {total_days.values[0]} days",
                    fontsize=10,
                    color='black',
                    ha='center'
                )

plt.show()

# Compute total counts of Promo = 1 and Promo = 0 for each StoreType
promo_totals = (
    train_store.groupby(['StoreType', 'Promo'])
    .size()
    .reset_index(name='TotalDays')  # Add a 'TotalDays' column
)

# Create the plot
g = sns.catplot(
    data=train_store, 
    x='Month', 
    y='Customers', 
    col='StoreType',  # Separate plots for each store type
    hue='Promo',      # Separate lines for Promo = 0 and Promo = 1
    kind='point', 
    palette='plasma', 
    sharey=True
)

# Add annotations for total Promo counts
for ax, store_type in zip(g.axes.flat, train_store['StoreType'].unique()):
    # Filter totals for this StoreType
    store_promo_totals = promo_totals[promo_totals['StoreType'] == store_type]
    #print(store_type)
    for promo in [0, 1]:
        # Get the total count of days for Promo = 0 or 1
        total_days = store_promo_totals[
            store_promo_totals['Promo'] == promo
        ]['TotalDays']
        
        if not total_days.empty:
            # Annotate the total days
            if store_type !='b':
                ax.text(
                    x=2,  # Position text near the left edge
                    y=ax.get_ylim()[1] * 0.9 - promo * 200,  # Offset for each promo line
                    
                    s=f"Promo {promo}: {total_days.values[0]} days",
                    fontsize=10,
                    color='black',
                    ha='center'
                )
            else:
                ax.text(
                    x=2,  # Position text near the left edge
                    y=ax.get_ylim()[1] * 0.6 - promo * 200,  # Offset for each promo line
                    s=f"Promo {promo}: {total_days.values[0]} days",
                    fontsize=10,
                    color='black',
                    ha='center'
                )

plt.show()

# Compute total counts of Promo = 1 and Promo = 0 for each StoreType
promo_totals = (
    train_store.groupby(['StoreType', 'Promo'])
    .size()
    .reset_index(name='TotalDays')  # Add a 'TotalDays' column
)

# Create the plot
g = sns.catplot(
    data=train_store, 
    x='Month', 
    y='SalePerCustomer', 
    col='StoreType',  # Separate plots for each store type
    hue='Promo',      # Separate lines for Promo = 0 and Promo = 1
    kind='point', 
    palette='plasma', 
    sharey=True
)

# Add annotations for total Promo counts
for ax, store_type in zip(g.axes.flat, train_store['StoreType'].unique()):
    # Filter totals for this StoreType
    store_promo_totals = promo_totals[promo_totals['StoreType'] == store_type]
    #print(store_type)
    for promo in [0, 1]:
        # Get the total count of days for Promo = 0 or 1
        total_days = store_promo_totals[
            store_promo_totals['Promo'] == promo
        ]['TotalDays']
        
        if not total_days.empty:
            # Annotate the total days
            if store_type !='d':
                ax.text(
                    x=2,  # Position text near the left edge
                    y=ax.get_ylim()[1] * 0.9 - promo * 0.7,  # Offset for each promo line
                    s=f"Promo {promo}: {total_days.values[0]} days",
                    fontsize=10,
                    color='black',
                    ha='center'
                )
            else:
                ax.text(
                    x=2,  # Position text near the left edge
                    y=ax.get_ylim()[1] * 0.6 - promo * 0.7,  # Offset for each promo line
                    s=f"Promo {promo}: {total_days.values[0]} days",
                    fontsize=10,
                    color='black',
                    ha='center'
                )

# Identify non-numeric columns
non_numeric_cols = train_store.select_dtypes(exclude=[np.number]).columns.tolist()
print("Non-numeric columns:", non_numeric_cols)

# Drop non-numeric columns
train_store_numeric = train_store.drop(non_numeric_cols + ['Open'], axis=1)

# Now compute the correlation matrix
corr_all = train_store_numeric.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr_all, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr_all, mask=mask, square=True, linewidths=.5, ax=ax, cmap="magma")      

plt.show()

Non-numeric columns: ['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']

train = pd.read_csv("train.csv", 
                    parse_dates = True, index_col = 'Date')

# additional store data
store = pd.read_csv("store.csv")

train['Year'] = train.index.year
train['Month'] = train.index.month
train['Day'] = train.index.day
train['WeekOfYear'] = train.index.isocalendar().week 

# adding new variable
train['SalePerCustomer'] = train['Sales']/train['Customers']

train_store = pd.merge(train.reset_index(), store, how='inner', on='Store')
train_store.set_index('Date', inplace=True)

train_a = train_store[train_store['StoreType']=='a']
train_a = train_a.sort_values(by='Date',ascending=True)
stores = train_a['Store'].unique().tolist()
promos = {}
for store in stores:
    store_df = train_a[train_a['Store']==store].copy()
    store_df = store_df.sort_values(by='Date',ascending=True)
    promos[store] = list(store_df['Promo'])

grouped_keys = defaultdict(list)

for key, value in promos.items():
    # Convert the list to a tuple (since lists are not hashable) and group keys
    grouped_keys[tuple(value)].append(key)

# Extract groups with more than one key
result = {value: keys for value, keys in grouped_keys.items() if len(keys) > 1}

#find the subset of stores that share promotion dates
first_key = list(result.keys())[0]
stores_a = result[first_key]
df = train_a[train_a['Store'].isin(stores_a)]
exog = df[df['Store']==1114][[ 'Promo', 'SchoolHoliday', 'StateHoliday','DayOfWeek']]

# Create a exog variable file and aggregrate data
df_agg = df.groupby(df.index).agg(
    sales=('Sales', 'sum'),
    customers=('Customers', 'sum'),
    avg_salespercustomer=('SalePerCustomer', 'mean')
)

data=df_agg
data.index = pd.DatetimeIndex(data.index, freq='D')
data = data.resample('D').sum()
print( min(data.index))
print( max(data.index))

data.loc[data.index.dayofweek == 6, 'sales'] = 0
y=data['sales']
x = date2num(data.index.to_pydatetime())
x = x.reshape(-1, 1)  # Reshape for sklearn which expects 2D array for features
y = data['sales'].values
display(data)

2013-01-01 00:00:00
2015-07-31 00:00:00

reg= LinearRegression().fit(x,y)
y_pred = reg.predict(x)
fig, ax = plt.subplots(figsize=(20,6))
ax.plot(data.index,y,marker='.', linestyle='-', linewidth=1, label='Weekly')
ax.plot(data.index,y_pred, linestyle='-', linewidth=3, label='Weekly')
ax.yaxis.set_major_formatter(FuncFormatter(millions))
plt.title('Rossman Daily Sales',fontsize=16)
plt.ylabel('Weekly Sales ($)',fontsize=12)
plt.xlabel('Date',fontsize=12)
plt.show()

y = np.array(y)
mse = mean_squared_error(y, y_pred)

# Calculate RMSE
rmse = mean_squared_error(y, y_pred, squared=False)

# Calculate MAE
mae = mean_absolute_error(y, y_pred)

# Calculate MAPE - Handling division by zero if y contains zeros
mape = np.mean(np.abs((y - y_pred) / y)) * 100 if np.all(y) else float('inf')

# Calculate R^2
r2 = r2_score(y, y_pred)

#Calculate the f1 score

print('MSE: {:,.2f}'.format(mse))
print('RMSE: {:,.2f}'.format(rmse))
print('MAE: {:,.2f}'.format(mae))
print('MAPE: {:.2f}%'.format(mape))  # Percentage values don't typically include commas
print('R^2: {:.2f}'.format(r2))

MSE: 2,429,575,656,288.68
RMSE: 1,558,709.61
MAE: 1,143,395.16
MAPE: inf%
R^2: 0.00

# 
#exog = exog.set_index('Date')
exog.index = pd.DatetimeIndex(exog.index, freq='D')
print(len(exog))
#print(type(exog['campaign']))

#public_holidays = ['2020-12-25','2021-01-01','2021-12-25','2022-01-01','2022-12-25','2023-01-01','2023-12-25', '2024-01-01','2021-04-02','2021-04-04','2022-04-15','2022-04-17','2023-04-07','2023-04-09']  

# Convert your date column to datetime format if it's not already
#df['date'] = pd.to_datetime(df['date'])
print(exog)
# Create a binary indicator for public holidays
#exog['is_public_holiday'] = exog.index.isin(public_holidays).astype(int)
#print(exog.to_string())
print(len(exog))
print(len(y))
exog['StateHoliday'] = exog['StateHoliday'].apply(lambda x: 0 if x == 0 else 1)
print(exog['StateHoliday'].unique())
exog['StateHoliday'] = pd.to_numeric(exog['StateHoliday'], errors='coerce')
exog['SchoolHoliday'] = pd.to_numeric(exog['SchoolHoliday'], errors='coerce')
exog['DayOfWeek'] = exog['DayOfWeek'].apply(lambda x: 1 if x == 7 else 0)
#exog.loc[exog.index < '2023-01-01', 'weekend'] = 0

942
            Promo  SchoolHoliday StateHoliday  DayOfWeek
Date                                                    
2013-01-01      0              1            a          2
2013-01-02      0              1            0          3
2013-01-03      0              1            0          4
2013-01-04      0              1            0          5
2013-01-05      0              0            0          6
...           ...            ...          ...        ...
2015-07-27      1              1            0          1
2015-07-28      1              1            0          2
2015-07-29      1              1            0          3
2015-07-30      1              1            0          4
2015-07-31      1              1            0          5

[942 rows x 4 columns]
942
942
[1 0]

def fourier_terms(data, period, K):
    # `data` is the DataFrame, `period` is the seasonal period (e.g., 365 for yearly),
    # and `K` is the number of sine/cosine term pairs
    T = len(data)
    t = np.arange(1, T + 1)
    
    fourier_terms = pd.DataFrame(index=data.index)
    
    for k in range(1, K + 1):
        fourier_terms[f'sin_{k}'] = np.sin(2 * np.pi * k * t / period)
        fourier_terms[f'cos_{k}'] = np.cos(2 * np.pi * k * t / period)
        
    return fourier_terms
K = 6  # Number of terms; adjust based on your model's needs
period = 365  # Yearly seasonality
fourier_terms_df = fourier_terms(data, period, K)
print(fourier_terms_df)
combined_exog = pd.concat([exog, fourier_terms_df], axis=1)
print(combined_exog)
exog=combined_exog

            sin_1  cos_1  sin_2  cos_2  sin_3  cos_3  sin_4  cos_4  sin_5  \
Date                                                                        
2013-01-01      0      1      0      1      0      1      0      1      0   
2013-01-02      0      1      0      1      0      1      0      1      0   
2013-01-03      0      1      0      1      0      1      0      1      0   
2013-01-04      0      1      0      1      0      1      0      1      0   
2013-01-05      0      1      0      1      0      1      0      1      0   
...           ...    ...    ...    ...    ...    ...    ...    ...    ...   
2015-07-27     -0     -1      1      1     -1     -0      1     -0     -1   
2015-07-28     -0     -1      1      1     -1     -0      1     -0     -1   
2015-07-29     -0     -1      1      1     -1     -0      1     -0     -1   
2015-07-30     -0     -1      1      1     -1     -0      1     -0     -1   
2015-07-31     -0     -1      1      1     -1     -0      1     -0     -1   

            cos_5  sin_6  cos_6  
Date                             
2013-01-01      1      0      1  
2013-01-02      1      0      1  
2013-01-03      1      0      1  
2013-01-04      1      0      1  
2013-01-05      1      0      1  
...           ...    ...    ...  
2015-07-27      1      0     -1  
2015-07-28      1      0     -1  
2015-07-29      1      0     -1  
2015-07-30      1      0     -1  
2015-07-31      1      0     -1  

[942 rows x 12 columns]
            Promo  SchoolHoliday  StateHoliday  DayOfWeek  sin_1  cos_1  \
Date                                                                      
2013-01-01      0              1             1          0      0      1   
2013-01-02      0              1             1          0      0      1   
2013-01-03      0              1             1          0      0      1   
2013-01-04      0              1             1          0      0      1   
2013-01-05      0              0             1          0      0      1   
...           ...            ...           ...        ...    ...    ...   
2015-07-27      1              1             1          0     -0     -1   
2015-07-28      1              1             1          0     -0     -1   
2015-07-29      1              1             1          0     -0     -1   
2015-07-30      1              1             1          0     -0     -1   
2015-07-31      1              1             1          0     -0     -1   

            sin_2  cos_2  sin_3  cos_3  sin_4  cos_4  sin_5  cos_5  sin_6  \
Date                                                                        
2013-01-01      0      1      0      1      0      1      0      1      0   
2013-01-02      0      1      0      1      0      1      0      1      0   
2013-01-03      0      1      0      1      0      1      0      1      0   
2013-01-04      0      1      0      1      0      1      0      1      0   
2013-01-05      0      1      0      1      0      1      0      1      0   
...           ...    ...    ...    ...    ...    ...    ...    ...    ...   
2015-07-27      1      1     -1     -0      1     -0     -1      1      0   
2015-07-28      1      1     -1     -0      1     -0     -1      1      0   
2015-07-29      1      1     -1     -0      1     -0     -1      1      0   
2015-07-30      1      1     -1     -0      1     -0     -1      1      0   
2015-07-31      1      1     -1     -0      1     -0     -1      1      0   

            cos_6  
Date               
2013-01-01      1  
2013-01-02      1  
2013-01-03      1  
2013-01-04      1  
2013-01-05      1  
...           ...  
2015-07-27     -1  
2015-07-28     -1  
2015-07-29     -1  
2015-07-30     -1  
2015-07-31     -1  

[942 rows x 16 columns]

y = data['sales']
y.loc[y.index.dayofweek == 6] = 0
print(y)
y.to_clipboard()
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(y,marker='.', linestyle='-', linewidth=0.5, label='Weekly')
ax.plot(y.resample('M').mean(),marker='o', markersize=8, linestyle='-', label='Monthly Mean Resample')
ax.set_ylabel('Sales')
ax.legend()

Date
2013-01-01       2907
2013-01-02    3404416
2013-01-03    3083285
2013-01-04    3145636
2013-01-05    2556636
               ...   
2015-07-27    4823911
2015-07-28    4215867
2015-07-29    3888677
2015-07-30    4014507
2015-07-31    4671897
Freq: D, Name: sales, Length: 942, dtype: int64

<matplotlib.legend.Legend at 0x363c08c10>

def seasonal_decompose (y):
    decomposition = sm.tsa.seasonal_decompose(y, model='additive',extrapolate_trend='freq',period=365)
    seasonal = decomposition.seasonal
    seasonal_df = pd.DataFrame(seasonal)
    fig = decomposition.plot()
    fig.set_size_inches(14,7)
    plt.show()
    return seasonal_df
seasonal_df = seasonal_decompose(y)
print(seasonal_df)

# def seasonal_decompose_to_df(y):
#     decomposition = sm.tsa.seasonal_decompose(y, model='additive', extrapolate_trend='freq', period=365)
#     # Decompose the time series
#     seasonal = decomposition.seasonal
#     # Convert the seasonal component to a DataFrame
#     seasonal_df = pd.DataFrame(seasonal)
#     # Plot the decomposition
#     fig = decomposition.plot()
#     fig.set_size_inches(14, 7)
#     plt.show()
#     # Return the DataFrame
#     return seasonal_df

            seasonal
Date                
2013-01-01  -2985649
2013-01-02    405492
2013-01-03    -30998
2013-01-04  -1054281
2013-01-05   -396293
...              ...
2015-07-27   -651578
2015-07-28    -39371
2015-07-29   1399854
2015-07-30   1280142
2015-07-31   1723254

[942 rows x 1 columns]

import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Example: Load your data into a pandas DataFrame
# Ensure your 'Date' column is a datetime object and set it as the index
# df = pd.read_csv('your_file.csv')  # Uncomment if loading from a CSV
# df['Date'] = pd.to_datetime(df['Date'])
# df.set_index('Date', inplace=True)

# Assuming your DataFrame is already prepared:
# df should have 'Date' as the index and a 'Sales' column

# Plot the ACF and PACF
fig, axes = plt.subplots(2, 1, figsize=(12, 8))

# ACF plot
plot_acf(data['sales'], ax=axes[0], lags=100)  # Adjust lags as needed
axes[0].set_title('Autocorrelation Function (ACF)')

# PACF plot
plot_pacf(data['sales'], ax=axes[1], lags=100, method='ywm')  # Method can be 'ywm', 'ols', or 'ld'
axes[1].set_title('Partial Autocorrelation Function (PACF)')

plt.tight_layout()
plt.show()

def test_stationarity(timeseries, title):
    
    #Determing rolling statistics
    rolmean = pd.Series(timeseries).rolling(window=7).mean() 
    rolstd = pd.Series(timeseries).rolling(window=7).std()
    
    fig, ax = plt.subplots(figsize=(16, 4))
    ax.plot(timeseries, label= title)
    ax.plot(rolmean, label='rolling mean');
    ax.plot(rolstd, label='rolling std (x10)');
    ax.legend()
pd.options.display.float_format = '{:.8f}'.format
test_stationarity(y,'raw data')

def ADF_test(timeseries, dataDesc):
    print(' > Is the {} stationary ?'.format(dataDesc))
    dftest = adfuller(timeseries.dropna(), autolag='AIC')
    print('Test statistic = {:.3f}'.format(dftest[0]))
    print('P-value = {:.3f}'.format(dftest[1]))
    print('Critical values :')
    for k, v in dftest[4].items():
        print('\t{}: {} - The data is {} stationary with {}% confidence'.format(k, v, 'not' if v<dftest[0] else '', 100-int(k[:-1])))
ADF_test(y,'raw data')

 > Is the raw data stationary ?
Test statistic = -6.046
P-value = 0.000
Critical values :
	1%: -3.4374778690219956 - The data is  stationary with 99% confidence
	5%: -2.864686684217556 - The data is  stationary with 95% confidence
	10%: -2.5684454926748583 - The data is  stationary with 90% confidence

# Detrending
y_float = y.apply(float)

y_detrend = (y_float - y_float.rolling(window=12).mean()) / y_float.rolling(window=12).std()

test_stationarity(y_detrend, 'de-trended data')
ADF_test(y_detrend, 'de-trended data')

 > Is the de-trended data stationary ?
Test statistic = -10.544
P-value = 0.000
Critical values :
	1%: -3.4375643702748078 - The data is  stationary with 99% confidence
	5%: -2.8647248254388096 - The data is  stationary with 95% confidence
	10%: -2.568465808810804 - The data is  stationary with 90% confidence

# Differencing
y_12lag =  y - y.shift(12)

test_stationarity(y_12lag,'12 lag differenced data')
ADF_test(y_12lag,'12 lag differenced data')

 > Is the 12 lag differenced data stationary ?
Test statistic = -9.915
P-value = 0.000
Critical values :
	1%: -3.4375723382479735 - The data is  stationary with 99% confidence
	5%: -2.8647283387229963 - The data is  stationary with 95% confidence
	10%: -2.568467680189796 - The data is  stationary with 90% confidence

y_12lag_detrend =  y_detrend - y_detrend.shift(12)

test_stationarity(y_12lag_detrend,'12 lag differenced de-trended data')
ADF_test(y_12lag_detrend,'12 lag differenced de-trended data')

 > Is the 12 lag differenced de-trended data stationary ?
Test statistic = -13.041
P-value = 0.000
Critical values :
	1%: -3.4376611618861697 - The data is  stationary with 99% confidence
	5%: -2.864767502722044 - The data is  stationary with 95% confidence
	10%: -2.5684885413039127 - The data is  stationary with 90% confidence

y_to_train = y[:'2015-06-30'] # dataset to train
y_to_val = y['2015-07-01':] # last X months for test  
predict_date = len(y) - len(y[:'2023-01-01']) # the number of data point
#y_float = y.astype(float)
#print(exog)
#exog=exog['campaign']
# exog=exog['2020-09-28':]
#exog=exog['2021-06-01':]
exog_to_train=exog[:'2015-06-30']
exog_to_val=exog['2015-07-01':]

# print(len(y_to_val))
# print(exog_to_val)
# print(type(y_to_val))
# print(y.index.equals(exog.index))
#exog=exog.drop(columns=['weekend'],axis=1)
print(len(exog))
print(len(y))
print(max(y.index))
print(min(y.index))
print(max(exog.index))
print(min(exog.index))

942
942
2015-07-31 00:00:00
2013-01-01 00:00:00
2015-07-31 00:00:00
2013-01-01 00:00:00

def rmspe(y_actual, y_pred):
    """
    Calculate Root Mean Squared Percentage Error (RMSPE).
    
    Args:
    - y_actual: Actual values (numpy array or pandas series).
    - y_pred: Predicted values (numpy array or pandas series).
    
    Returns:
    - RMSPE value as a float.
    """
    y_actual, y_pred = np.array(y_actual), np.array(y_pred)
    percentage_errors = (y_actual - y_pred) / y_actual
    return np.sqrt(np.mean(np.square(percentage_errors)))

def sarima_eva_autoarima(y, exog, seasonal_period, pred_date, y_to_test, exog_to_test):
    start_time = time.time()
    
    # Use auto_arima to find the best parameters
    print("Running auto_arima to find the best SARIMA parameters...")
    model = auto_arima(y, 
                       exogenous=exog, 
                       seasonal=True, 
                       m=seasonal_period, 
                       trace=True, 
                       error_action='ignore', 
                       suppress_warnings=True,
                       stepwise=False)
    
    print(f"Best SARIMA Model: {model.order}x{model.seasonal_order}")
    
    # Fit SARIMAX with the parameters found by auto_arima
    mod = sm.tsa.statespace.SARIMAX(endog=y,
                                    exog=exog,
                                    order=model.order,
                                    seasonal_order=model.seasonal_order,
                                    enforce_invertibility=False)
    results = mod.fit()
    print(results.summary().tables[1])
    
    results.plot_diagnostics(figsize=(16, 8))
    plt.show()
    
    # One-step ahead forecast
    pred = results.get_prediction(start=pd.to_datetime(pred_date), dynamic=False, exog=exog_to_test)
    pred_ci = pred.conf_int()
    y_forecasted = pred.predicted_mean
    
    # Calculate MSE and other metrics
    mse = ((y_forecasted - y_to_test) ** 2).mean()
    print(f"The Mean Squared Error of our one-step ahead forecasts is {mse:.2f}")
    
    # Plot observed vs forecasted
    ax = y.plot(label='Observed')
    start_date = min(y_to_test.index) - pd.DateOffset(days=2 * len(y_to_test))  # Start 2x the length of y_to_test before min
    end_date = max(y_to_test.index)  # End at max of y_to_test
    print(start_date,end_date)
    
    y_forecasted.plot(ax=ax, label='One-step ahead Forecast', alpha=0.7, figsize=(14, 7))
    ax.fill_between(pred_ci.index, pred_ci.iloc[:, 0], pred_ci.iloc[:, 1], color='k', alpha=0.2)
    ax.set_xlabel('Date')
    ax.set_ylabel('Sales $')
    ax.yaxis.set_major_formatter(FuncFormatter(millions))
    ax.set_xlim([start_date, end_date])   # Extend by 2x the forecasted window
   # ax.set_xlim([y.index.min(), y.index.max() + pd.DateOffset(days=2 * len(y_to_test))])
    plt.legend()
    plt.show()
    
    # Additional Metrics
    rmse = mean_squared_error(y_to_test, y_forecasted, squared=False)
    mae = mean_absolute_error(y_to_test, y_forecasted)
    non_zero_indices = y_to_test != 0  # Create a boolean mask for non-zero values
    mape = np.mean(np.abs((y_to_test[non_zero_indices] - y_forecasted[non_zero_indices]) / y_to_test[non_zero_indices])) * 100
    #mape = np.mean(np.abs((y_to_test - y_forecasted) / y_to_test)) * 100
    #mape = np.mean(np.abs((y_to_test - y_forecasted) / y_to_test)) * 100 if np.all(y_to_test != 0) else float('inf')
    r2 = r2_score(y_to_test, y_forecasted)
    #rmspe = rmspe(y_to_test, y_forecasted)
    percentage_errors = (y_to_test[non_zero_indices]- y_forecasted[non_zero_indices]) / y_to_test[non_zero_indices]
    np.sqrt(np.mean(np.square(percentage_errors)))
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"MAPE: {mape:.2f}%")
    print(f"R²: {r2:.2f}")
    print(f"RMSPE:{np.sqrt(np.mean(np.square(percentage_errors)))*100:.2f}%")
    
    # Store results in DataFrame
    df_forecasts = pd.DataFrame({
        'Actual': y_to_test,
        'Forecasted': y_forecasted
    })
    
    # Get coefficient details for the exogenous variable
    coeff_details = {}
    variable_name = 'Promo'  # Example variable name
    if variable_name in results.params.index:
        coeff_details['coefficient'] = results.params[variable_name]
        coeff_details['standard_error'] = results.bse[variable_name]
        coeff_details['confidence_interval'] = results.conf_int().loc[variable_name].values
    else:
        print(f"Variable '{variable_name}' not found in the model.")
    
    end_time = time.time()
    duration = end_time - start_time
    print(f"The model took {duration:.2f} seconds to run.")
    
    return results, df_forecasts, coeff_details

# Example Usage
# y, exog, y_to_test, exog_to_test should be defined as your data
# Replace the following with your data and parameters
model=sarima_eva_autoarima(y, exog, 7, '2015-07-01', y_to_val,exog_to_val)

Running auto_arima to find the best SARIMA parameters...
 ARIMA(0,0,0)(0,0,0)[7] intercept   : AIC=29543.085, Time=0.01 sec
 ARIMA(0,0,0)(0,0,1)[7] intercept   : AIC=29395.435, Time=0.04 sec

 ARIMA(0,0,0)(0,0,2)[7] intercept   : AIC=29192.238, Time=0.23 sec
 ARIMA(0,0,0)(1,0,0)[7] intercept   : AIC=29478.964, Time=0.09 sec
 ARIMA(0,0,0)(1,0,1)[7] intercept   : AIC=29387.659, Time=0.17 sec

 ARIMA(0,0,0)(1,0,2)[7] intercept   : AIC=29190.128, Time=0.51 sec

 ARIMA(0,0,0)(2,0,0)[7] intercept   : AIC=29423.663, Time=0.29 sec

 ARIMA(0,0,0)(2,0,1)[7] intercept   : AIC=29162.432, Time=0.49 sec

 ARIMA(0,0,0)(2,0,2)[7] intercept   : AIC=29148.026, Time=0.82 sec
 ARIMA(0,0,1)(0,0,0)[7] intercept   : AIC=29544.593, Time=0.04 sec
 ARIMA(0,0,1)(0,0,1)[7] intercept   : AIC=29380.561, Time=0.09 sec

 ARIMA(0,0,1)(0,0,2)[7] intercept   : AIC=29186.035, Time=0.23 sec
 ARIMA(0,0,1)(1,0,0)[7] intercept   : AIC=29455.320, Time=0.16 sec

 ARIMA(0,0,1)(1,0,1)[7] intercept   : AIC=29364.595, Time=0.31 sec

 ARIMA(0,0,1)(1,0,2)[7] intercept   : AIC=29182.095, Time=0.73 sec

 ARIMA(0,0,1)(2,0,0)[7] intercept   : AIC=29403.970, Time=0.39 sec

	Store	StoreType	Assortment	CompetitionDistance	CompetitionOpenSinceMonth	CompetitionOpenSinceYear	Promo2	Promo2SinceWeek	Promo2SinceYear	PromoInterval
0	1	c	a	1270	9	2008	0	NaN	NaN	NaN
1	2	a	a	570	11	2007	1	13	2010	Jan,Apr,Jul,Oct
2	3	a	a	14130	12	2006	1	14	2011	Jan,Apr,Jul,Oct
3	4	c	c	620	9	2009	0	NaN	NaN	NaN
4	5	a	a	29910	4	2015	0	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...
1110	1111	a	a	1900	6	2014	1	31	2013	Jan,Apr,Jul,Oct
1111	1112	c	c	1880	4	2006	0	NaN	NaN	NaN
1112	1113	a	c	9260	NaN	NaN	0	NaN	NaN	NaN
1113	1114	a	c	870	NaN	NaN	0	NaN	NaN	NaN
1114	1115	d	c	5350	NaN	NaN	1	22	2012	Mar,Jun,Sept,Dec

Feature	Description
Sales	The turnover for any given day (target variable).
Customers	The number of customers on a given day.
Open	Indicates whether the store was open: 0 = closed, 1 = open.
Promo	Indicates whether a store was running a promotion on that day.
StateHoliday	Indicates a state holiday. Normally, all stores, with few exceptions, are closed on state holidays.
SchoolHoliday	Indicates if the store was affected by the closure of public schools on that day.

Feature	Description
Store	A unique ID for each store.
StoreType	Differentiates between 4 different store models: a, b, c, d.
Assortment	Describes the assortment level: a = basic, b = extra, c = extended.
CompetitionDistance	Distance in meters to the nearest competitor store.
CompetitionOpenSince[Month/Year]	Gives the approximate year and month when the nearest competitor store was opened.
Promo2	Indicates if the store is participating in a continuing promotion: 0 = not participating, 1 = participating.
Promo2Since[Year/Week]	Describes the year and calendar week when the store started participating in Promo2.
PromoInterval	Describes the months when Promo2 starts. For example, "Feb, May, Aug, Nov" means Promo2 begins in these months annually.

	Customers	Sales
StoreType
a	363541431	3165334859
b	31465616	159231395
c	92129705	783221426
d	156904995	1765392943

Analyzing the Impact of Promotions on Rossman Sales using SARIMAX¶

Introduction¶

Step 1: Importing neccessary packages
¶

Rossman Store Sales Data¶

Dataset¶

Step 2: Ingesting the data
¶

Analyzing the dataset¶

Train Feature Descriptions¶

Store Feature Descriptions¶

Step 3: Extracting date columns
¶

We have extracted date columns to futher analyse the sales data¶

Step 4: Visualizing sales data
¶

Explatory Data Analysis (EDA)¶

Step 5: Examining the effect of promos on sales
¶

EDA Continued¶

But before we do so, we need to do a bit of data cleaning.¶

Step 6: ECDF Analysis
¶

20% of the data has no sales¶

Step 7: Data cleaning
¶

The Store dataset also includes nulls¶

Step 8: Merging store and train datasets
¶

Store Types A and D have the highest sales and customer counts. Store Types B a C have significantly less sales¶

Step 9:Analyzing sales by store type
¶

Step 10:Analyzing customers by store type
¶

Step 11:Analyzing sales per customer by store type
¶

Step 12:Analyzing the correlation between variables
¶

EDA Conclusion¶

Step 13: Isolating Store Type 'A' for further analysis
¶

	Store	DayOfWeek	Sales	Customers	Open	Promo	StateHoliday	SchoolHoliday
Date
2015-07-31	1	5	5263	555	1	1	0	1
2015-07-31	2	5	6064	625	1	1	0	1
2015-07-31	3	5	8314	821	1	1	0	1
2015-07-31	4	5	13995	1498	1	1	0	1
2015-07-31	5	5	4822	559	1	1	0	1
...	...	...	...	...	...	...	...	...
2013-01-01	1111	2	0	0	0	0	a	1
2013-01-01	1112	2	0	0	0	0	a	1
2013-01-01	1113	2	0	0	0	0	a	1
2013-01-01	1114	2	0	0	0	0	a	1
2013-01-01	1115	2	0	0	0	0	a	1

	sales	customers	avg_salespercustomer
Date
2013-01-01	2907	532	5
2013-01-02	3404416	432946	8
2013-01-03	3083285	392123	8
2013-01-04	3145636	394606	8
2013-01-05	2556636	317693	8
...	...	...	...
2015-07-27	4823911	466917	11
2015-07-28	4215867	432920	10
2015-07-29	3888677	407142	10
2015-07-30	4014507	418309	10
2015-07-31	4671897	476425	10

Analyzing the Impact of Promotions on Rossman Sales using SARIMAX¶

Introduction¶

Step 1: Importing neccessary packages¶

Rossman Store Sales Data¶

Dataset¶

Step 2: Ingesting the data¶

Analyzing the dataset¶

Train Feature Descriptions¶

Store Feature Descriptions¶

Step 3: Extracting date columns¶

We have extracted date columns to futher analyse the sales data¶

Step 4: Visualizing sales data¶

Explatory Data Analysis (EDA)¶

Step 5: Examining the effect of promos on sales¶

Promo's have a significant positive increase on Sales¶

EDA Continued¶

But before we do so, we need to do a bit of data cleaning.¶

Step 6: ECDF Analysis¶

20% of the data has no sales¶

Step 7: Data cleaning¶

The Store dataset also includes nulls¶

Step 8: Merging store and train datasets¶

Store Types A and D have the highest sales and customer counts. Store Types B a C have significantly less sales¶

Step 9:Analyzing sales by store type¶

The catplot above of sales over month for promo = 1 (Active) vs promo = 0 (Inactive) shows that sales are significantly higher on days when promos are active, even though the number of non-promo days are higher than promo days over the 2 year period.¶

Step 10:Analyzing customers by store type¶

The catplot above of sales over month for promo = 1 (Active) vs promo = 0 (Inactive) shows that customers in store are significantly higher on days when promos are active, even though the number of non-promo days are higher than promo days over the 2 year period.¶

Step 11:Analyzing sales per customer by store type¶

The catplot above of sales per customer over month for promo = 1 (Active) vs promo = 0 (Inactive) shows that customers are purchasing more per transaction when promos are active.¶

Step 12:Analyzing the correlation between variables¶

EDA Conclusion¶

Step 13: Isolating Store Type 'A' for further analysis¶

Step 1: Importing neccessary packages
¶

Step 2: Ingesting the data
¶

Step 3: Extracting date columns
¶

Step 4: Visualizing sales data
¶

Step 5: Examining the effect of promos on sales
¶

Step 6: ECDF Analysis
¶

Step 7: Data cleaning
¶

Step 8: Merging store and train datasets
¶

Step 9:Analyzing sales by store type
¶

Step 10:Analyzing customers by store type
¶

Step 11:Analyzing sales per customer by store type
¶

Step 12:Analyzing the correlation between variables
¶

Step 13: Isolating Store Type 'A' for further analysis
¶