# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Library to split the data
from sklearn.model_selection import train_test_split

# Libaries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)

# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 100)

# Import libraries for building linear regression model
from statsmodels.formula.api import ols
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

# Import library for preparing data
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

kart = pd.read_csv("SuperKart.csv")

# Copying data to another variable to avoid any changes to original data
data = kart.copy()

data.head()

data.tail()

print(f"There are {data.shape[0]} rows and {data.shape[1]} columns.")

There are 8763 rows and 12 columns.

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Product_Id                 8763 non-null   object 
 1   Product_Weight             8763 non-null   float64
 2   Product_Sugar_Content      8763 non-null   object 
 3   Product_Allocated_Area     8763 non-null   float64
 4   Product_Type               8763 non-null   object 
 5   Product_MRP                8763 non-null   float64
 6   Store_Id                   8763 non-null   object 
 7   Store_Establishment_Year   8763 non-null   int64  
 8   Store_Size                 8763 non-null   object 
 9   Store_Location_City_Type   8763 non-null   object 
 10  Store_Type                 8763 non-null   object 
 11  Product_Store_Sales_Total  8763 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 821.7+ KB

# Checking for missing values in the data
data.isnull().sum()

Product_Id                   0
Product_Weight               0
Product_Sugar_Content        0
Product_Allocated_Area       0
Product_Type                 0
Product_MRP                  0
Store_Id                     0
Store_Establishment_Year     0
Store_Size                   0
Store_Location_City_Type     0
Store_Type                   0
Product_Store_Sales_Total    0
dtype: int64

# Checking for duplicate values
data.duplicated().sum()

0

data.describe(include = "all").T

# Making a list of all categorical variables
cat_col = list(data.select_dtypes("object").columns)

# Printing the count of each unique value
for column in cat_col:
    print(data[column].value_counts())
    print("-" * 50)

Product_Id
FD6114    1
FD8349    1
DR3530    1
NC5926    1
FD3575    1
         ..
FD2396    1
FD681     1
FD8077    1
DR3118    1
FD306     1
Name: count, Length: 8763, dtype: int64
--------------------------------------------------
Product_Sugar_Content
Low Sugar    4885
Regular      2251
No Sugar     1519
reg           108
Name: count, dtype: int64
--------------------------------------------------
Product_Type
Fruits and Vegetables    1249
Snack Foods              1149
Frozen Foods              811
Dairy                     796
Household                 740
Baking Goods              716
Canned                    677
Health and Hygiene        628
Meat                      618
Soft Drinks               519
Breads                    200
Hard Drinks               186
Others                    151
Starchy Foods             141
Breakfast                 106
Seafood                    76
Name: count, dtype: int64
--------------------------------------------------
Store_Id
OUT004    4676
OUT001    1586
OUT003    1349
OUT002    1152
Name: count, dtype: int64
--------------------------------------------------
Store_Size
Medium    6025
High      1586
Small     1152
Name: count, dtype: int64
--------------------------------------------------
Store_Location_City_Type
Tier 2    6262
Tier 1    1349
Tier 3    1152
Name: count, dtype: int64
--------------------------------------------------
Store_Type
Supermarket Type2     4676
Supermarket Type1     1586
Departmental Store    1349
Food Mart             1152
Name: count, dtype: int64
--------------------------------------------------

# Replacing reg with Regular in Product_Sugar_Content feature
data.Product_Sugar_Content.replace(to_replace = ["reg"], value = ["Regular"], inplace = True)

data.Product_Sugar_Content.value_counts()

Product_Sugar_Content
Low Sugar    4885
Regular      2359
No Sugar     1519
Name: count, dtype: int64

# Extracting the first two characters from the Product_Id column and storing it in another column
data["Product_Id_char"] = data["Product_Id"].str[:2]

data.head()

data["Product_Id_char"].unique()

array(['FD', 'NC', 'DR'], dtype=object)

data.loc[data.Product_Id_char == "FD", "Product_Type"].unique()

array(['Frozen Foods', 'Dairy', 'Canned', 'Baking Goods', 'Snack Foods',
       'Meat', 'Fruits and Vegetables', 'Breads', 'Breakfast',
       'Starchy Foods', 'Seafood'], dtype=object)

data.loc[data.Product_Id_char == "DR", "Product_Type"].unique()

array(['Hard Drinks', 'Soft Drinks'], dtype=object)

data.loc[data.Product_Id_char == "NC", "Product_Type"].unique()

array(['Health and Hygiene', 'Household', 'Others'], dtype=object)

# Dropping both the columns
data = data.drop(["Product_Id"], axis = 1)

data.head()

# Function to plot a boxplot and a histogram along the same scale

def histogram_boxplot(data, feature, figsize = (12, 7), kde = False, bins = None):
    """
    Boxplot and histogram combined

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show density curve (default False)
    bins: number of bins for histogram (default None)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows = 2,      # Number of rows of the subplot grid = 2
        sharex = True,  # x-axis will be shared among all subplots
        gridspec_kw = {"height_ratios": (0.25, 0.75)},
        figsize = figsize,
    )                   # Creating the 2 subplots
    sns.boxplot(
        data = data, x = feature, ax = ax_box2, showmeans = True, color = "violet"
    )                   # Boxplot will be created and a star will indicate the mean value of the column
    sns.histplot(
        data = data, x = feature, kde = kde, ax = ax_hist2, bins = bins, palette = "winter"
    ) if bins else sns.histplot(
        data = data, x = feature, kde = kde, ax = ax_hist2
    )                   # For histogram
    ax_hist2.axvline(
        data[feature].mean(), color = "green", linestyle = "--"
    )                   # Add mean to the histogram
    ax_hist2.axvline(
        data[feature].median(), color = "black", linestyle = "-"
    )                   # Add median to the histogram

histogram_boxplot(data, "Product_Weight")

histogram_boxplot(data, "Product_Allocated_Area")

histogram_boxplot(data, "Product_MRP")

histogram_boxplot(data, "Product_Store_Sales_Total")

# Function to create labeled barplots

def labeled_barplot(data, feature, perc = False, n = None):
    """
    Barplot with percentage at the top

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """

    total = len(data[feature])            # Length of the column
    count = data[feature].nunique()
    if n is None:
        plt.figure(figsize = (count + 1, 5))
    else:
        plt.figure(figsize = (n + 1, 5))

    plt.xticks(rotation = 90, fontsize = 15)
    ax = sns.countplot(
        data = data,
        x = feature,
        palette = "Paired",
        order = data[feature].value_counts().index[:n].sort_values(),
    )

    for p in ax.patches:
        if perc == True:
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )                              # Percentage of each class of the category
        else:
            label = p.get_height()         # Count of each level of the category

        x = p.get_x() + p.get_width() / 2  # Width of the plot
        y = p.get_height()                 # Height of the plot

        ax.annotate(
            label,
            (x, y),
            ha = "center",
            va = "center",
            size = 12,
            xytext = (0, 5),
            textcoords = "offset points",
        )                                 # Annotate the percentage

    plt.show()                            # Show the plot

labeled_barplot(data, "Product_Sugar_Content", perc = True)

labeled_barplot(data, "Product_Type", perc = True)

labeled_barplot(data, "Store_Id", perc = True)

labeled_barplot(data, "Store_Size", perc = True)

labeled_barplot(data, "Store_Location_City_Type", perc = True)

labeled_barplot(data, "Store_Type", perc = True)

cols_list = data.select_dtypes(include = np.number).columns.tolist()

plt.figure(figsize = (10, 5))
sns.heatmap(
    data[cols_list].corr(), annot = True, vmin = -1, vmax = 1, fmt = ".2f", cmap = "Spectral"
)
plt.show()

plt.figure(figsize = [8, 6])
sns.scatterplot(x = data.Product_Weight, y = data.Product_Store_Sales_Total)
plt.show()

plt.figure(figsize = [8, 6])
sns.scatterplot(x = data.Product_Allocated_Area, y = data.Product_Store_Sales_Total)
plt.show()

plt.figure(figsize = [8, 6])
sns.scatterplot(x = data.Product_MRP, y = data.Product_Store_Sales_Total)
plt.show()

df_revenue1 = data.groupby(["Product_Type"], as_index = False)[
    "Product_Store_Sales_Total"
].sum()
plt.figure(figsize = [14, 8])
plt.xticks(rotation = 90)
a = sns.barplot(x = df_revenue1.Product_Type, y = df_revenue1.Product_Store_Sales_Total)
a.set_xlabel("Product Types")
a.set_ylabel("Revenue")
plt.show()

df_revenue2 = data.groupby(["Product_Sugar_Content"], as_index = False)[
    "Product_Store_Sales_Total"
].sum()
plt.figure(figsize = [8, 6])
plt.xticks(rotation = 90)
b = sns.barplot(
    x = df_revenue2.Product_Sugar_Content, y = df_revenue2.Product_Store_Sales_Total
)
b.set_xlabel("Product_Sugar_content")
b.set_ylabel("Revenue")
plt.show()

df_store_revenue = data.groupby(["Store_Id"], as_index = False)[
    "Product_Store_Sales_Total"
].sum()
plt.figure(figsize = [8, 6])
plt.xticks(rotation = 90)
r = sns.barplot(
    x = df_store_revenue.Store_Id, y = df_store_revenue.Product_Store_Sales_Total
)
r.set_xlabel("Stores")
r.set_ylabel("Revenue")
plt.show()

df_revenue3 = data.groupby(["Store_Size"], as_index = False)[
    "Product_Store_Sales_Total"
].sum()
plt.figure(figsize = [8, 6])
plt.xticks(rotation = 90)
c = sns.barplot(x = df_revenue3.Store_Size, y = df_revenue3.Product_Store_Sales_Total)
c.set_xlabel("Store_Size")
c.set_ylabel("Revenue")
plt.show()

df_revenue4 = data.groupby(["Store_Location_City_Type"], as_index = False)[
    "Product_Store_Sales_Total"
].sum()
plt.figure(figsize = [8, 6])
plt.xticks(rotation = 90)
d = sns.barplot(
    x = df_revenue4.Store_Location_City_Type, y = df_revenue4.Product_Store_Sales_Total
)
d.set_xlabel("Store_Location_City_Type")
d.set_ylabel("Revenue")
plt.show()

df_revenue5 = data.groupby(["Store_Type"], as_index = False)[
    "Product_Store_Sales_Total"
].sum()
plt.figure(figsize = [8, 6])
plt.xticks(rotation = 90)
e = sns.barplot(x=df_revenue5.Store_Type, y = df_revenue5.Product_Store_Sales_Total)
e.set_xlabel("Store_Type")
e.set_ylabel("Revenue")
plt.show()

plt.figure(figsize = [14, 8])
sns.boxplot(data = data, x = data.Store_Id, y = data.Product_Store_Sales_Total)
plt.xticks(rotation = 90)
plt.title("Boxplot - Store_Id Vs Product_Store_Sales_Total")
plt.xlabel("Stores")
plt.ylabel("Product_Store_Sales_Total (of each product)")
plt.show()

plt.figure(figsize = [14, 8])
sns.boxplot(data = data, x = data.Store_Size, y = data.Product_Store_Sales_Total)
plt.xticks(rotation = 90)
plt.title("Boxplot - Store_Size Vs Product_Store_Sales_Total")
plt.xlabel("Stores")
plt.ylabel("Product_Store_Sales_Total (of each product)")
plt.show()

plt.figure(figsize = [14, 8])
sns.boxplot(data = data, x = data.Product_Type, y = data.Product_Weight)
plt.xticks(rotation = 90)
plt.title("Boxplot - Product_Type Vs Product_Weight")
plt.xlabel("Types of Products")
plt.ylabel("Product_Weight")
plt.show()

plt.figure(figsize = [14, 8])
sns.boxplot(data = data, x = data.Product_Sugar_Content, y = data.Product_Weight)
plt.xticks(rotation = 90)
plt.title("Boxplot - Product_Sugar_Content Vs Product_Weight")
plt.xlabel("Product_Sugar_Content")
plt.ylabel("Product_Weight")
plt.show()

plt.figure(figsize = (14, 8))
sns.heatmap(
    pd.crosstab(data["Product_Sugar_Content"], data["Product_Type"]),
    annot = True,
    fmt = "g",
    cmap = "viridis",
)
plt.ylabel("Product_Sugar_Content")
plt.xlabel("Product_Type")
plt.show()

plt.figure(figsize = (14, 8))
sns.heatmap(
    pd.crosstab(data["Store_Id"], data["Product_Type"]),
    annot = True,
    fmt = "g",
    cmap = "viridis",
)
plt.ylabel("Stores")
plt.xlabel("Product_Type")
plt.show()

plt.figure(figsize = [14, 8])
sns.boxplot(data = data, x = data.Product_Type, y = data.Product_MRP)
plt.xticks(rotation = 90)
plt.title("Boxplot - Product_Type Vs Product_MRP")
plt.xlabel("Product_Type")
plt.ylabel("Product_MRP (of each product)")
plt.show()

plt.figure(figsize = [14, 8])
sns.boxplot(data = data, x = data.Store_Id, y = data.Product_MRP)
plt.xticks(rotation = 90)
plt.title("Boxplot - Store_Id Vs Product_MRP")
plt.xlabel("Stores")
plt.ylabel("Product_MRP (of each product)")
plt.show()

data.loc[data["Store_Id"] == "OUT001"].describe(include = "all").T

data.loc[data["Store_Id"] == "OUT001", "Product_Store_Sales_Total"].sum()

6223113.18

df_OUT001 = (
    data.loc[data["Store_Id"] == "OUT001"]
    .groupby(["Product_Type"], as_index = False)["Product_Store_Sales_Total"]
    .sum()
)
plt.figure(figsize = [14, 8])
plt.xticks(rotation = 90)
plt.xlabel("Product_Type")
plt.ylabel("Product_Store_Sales_Total")
plt.title("OUT001")
sns.barplot(x = df_OUT001.Product_Type, y = df_OUT001.Product_Store_Sales_Total)
plt.show()

data.loc[data["Store_Id"] == "OUT002"].describe(include = "all").T

data.loc[data["Store_Id"] == "OUT002", "Product_Store_Sales_Total"].sum()

2030909.72

df_OUT002 = (
    data.loc[data["Store_Id"] == "OUT002"]
    .groupby(["Product_Type"], as_index = False)["Product_Store_Sales_Total"]
    .sum()
)
plt.figure(figsize = [14, 8])
plt.xticks(rotation = 90)
plt.xlabel("Product_Type")
plt.ylabel("Product_Store_Sales_Total")
plt.title("OUT002")
sns.barplot(x = df_OUT002.Product_Type, y = df_OUT002.Product_Store_Sales_Total)
plt.show()

data.loc[data["Store_Id"] == "OUT003"].describe(include = "all").T

data.loc[data["Store_Id"] == "OUT003", "Product_Store_Sales_Total"].sum()

6673457.57

df_OUT003 = (
    data.loc[data["Store_Id"] == "OUT003"]
    .groupby(["Product_Type"], as_index = False)["Product_Store_Sales_Total"]
    .sum()
)
plt.figure(figsize = [14, 8])
plt.xticks(rotation = 90)
plt.xlabel("Product_Type")
plt.ylabel("Product_Store_Sales_Total")
plt.title("OUT003")
sns.barplot(x = df_OUT003.Product_Type, y = df_OUT003.Product_Store_Sales_Total)
plt.show()

data.loc[data["Store_Id"] == "OUT004"].describe(include = "all").T

data.loc[data["Store_Id"] == "OUT004", "Product_Store_Sales_Total"].sum()

15427583.43

df_OUT004 = (
    data.loc[data["Store_Id"] == "OUT004"]
    .groupby(["Product_Type"], as_index = False)["Product_Store_Sales_Total"]
    .sum()
)
plt.figure(figsize = [14, 8])
plt.xticks(rotation = 90)
plt.xlabel("Product_Type")
plt.ylabel("Product_Store_Sales_Total")
plt.title("OUT004")
sns.barplot(x = df_OUT004.Product_Type, y = df_OUT004.Product_Store_Sales_Total)
plt.show()

df1 = data.groupby(["Product_Type", "Store_Id"], as_index = False)[
    "Product_Store_Sales_Total"
].sum()
df1

df2 = data.groupby(["Product_Sugar_Content", "Store_Id"], as_index = False)[
    "Product_Store_Sales_Total"
].sum()
df2

# Outlet Age
data["Store_Age_Years"] = 2022 - data.Store_Establishment_Year

perishables = [
    "Dairy",
    "Meat",
    "Fruits and Vegetables",
    "Breakfast",
    "Breads",
    "Seafood",
]

def change(x):
    if x in perishables:
        return "Perishables"
    else:
        return "Non Perishables"


data.Product_Type.apply(change)

0       Non Perishables
1           Perishables
2       Non Perishables
3       Non Perishables
4       Non Perishables
             ...       
8758    Non Perishables
8759    Non Perishables
8760    Non Perishables
8761    Non Perishables
8762    Non Perishables
Name: Product_Type, Length: 8763, dtype: object

change1 = []
for i in range(0, len(data)):
    if data.Product_Type[i] in perishables:
        change1.append("Perishables")
    else:
        change1.append("Non Perishables")

data["Product_Type_Category"] = pd.Series(change1)

data.head()

# Outlier detection using boxplot
numeric_columns = data.select_dtypes(include=np.number).columns.tolist()
numeric_columns.remove("Store_Establishment_Year")
numeric_columns.remove("Store_Age_Years")


plt.figure(figsize = (15, 12))

for i, variable in enumerate(numeric_columns):
    plt.subplot(4, 4, i + 1)
    plt.boxplot(data[variable], whis = 1.5)
    plt.tight_layout()
    plt.title(variable)

plt.show()

plt.figure(figsize = (16, 8))
sns.heatmap(data.corr(numeric_only = True), annot = True)
plt.show()

data = data.drop(["Product_Type", "Store_Id", "Store_Establishment_Year"], axis = 1)

data = pd.get_dummies(
    data,
    columns = data.select_dtypes(include = ["object", "category"]).columns.tolist(),
    drop_first = True,
)

# Separating features and the target column
X = data.drop(["Product_Store_Sales_Total"], axis = 1)
y = data["Product_Store_Sales_Total"]

X = sm.add_constant(X)

# Splitting the data into train and test sets in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.30, random_state = 1)

X_train1 = X_train.astype(float)
y_train1 = y_train.astype(float)

from statsmodels.stats.outliers_influence import variance_inflation_factor

# Function to check VIF
def checking_vif(train):
    vif = pd.DataFrame()
    vif["feature"] = train.columns

    # Calculating VIF for each feature
    vif["VIF"] = [
        variance_inflation_factor(train.values, i) for i in range(len(train.columns))
    ]
    return vif

print(checking_vif(X_train1)) #made change to X_train1

                              feature       VIF
0                               const  0.000000
1                      Product_Weight  1.752928
2              Product_Allocated_Area  1.001184
3                         Product_MRP  1.885058
4                     Store_Age_Years       inf
5      Product_Sugar_Content_No Sugar       inf
6       Product_Sugar_Content_Regular  1.088506
7                   Store_Size_Medium       inf
8                    Store_Size_Small       inf
9     Store_Location_City_Type_Tier 2       inf
10    Store_Location_City_Type_Tier 3       inf
11               Store_Type_Food Mart       inf
12       Store_Type_Supermarket Type1       inf
13       Store_Type_Supermarket Type2       inf
14                 Product_Id_char_FD  2.774371
15                 Product_Id_char_NC       inf
16  Product_Type_Category_Perishables  1.217690

X_train1 = X_train1.drop('Store_Age_Years',axis = 1)

X_test = X_test.drop('Store_Age_Years',axis = 1)

from statsmodels.stats.outliers_influence import variance_inflation_factor

# Function to check VIF
def checking_vif(train):
    vif = pd.DataFrame()
    vif["feature"] = train.columns

    # Calculating VIF for each feature
    vif["VIF"] = [
        variance_inflation_factor(train.values, i) for i in range(len(train.columns))
    ]
    return vif


print(checking_vif(X_train1))

                              feature       VIF
0                               const  0.000000
1                      Product_Weight  1.752928
2              Product_Allocated_Area  1.001184
3                         Product_MRP  1.885058
4      Product_Sugar_Content_No Sugar       inf
5       Product_Sugar_Content_Regular  1.088506
6                   Store_Size_Medium       inf
7                    Store_Size_Small       inf
8     Store_Location_City_Type_Tier 2       inf
9     Store_Location_City_Type_Tier 3       inf
10               Store_Type_Food Mart       inf
11       Store_Type_Supermarket Type1       inf
12       Store_Type_Supermarket Type2       inf
13                 Product_Id_char_FD  2.774371
14                 Product_Id_char_NC       inf
15  Product_Type_Category_Perishables  1.217690

from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

# Model Performance on test and train data
def model_pref(olsmodel, x_train, x_test):

    # In-sample Prediction
    y_pred_train = olsmodel.predict(x_train)
    y_observed_train = y_train

    # Prediction on test data
    y_pred_test = olsmodel.predict(x_test)
    y_observed_test = y_test

    print(
        pd.DataFrame(
            {
                "Data": ["Train", "Test"],
                "RMSE": [
                    np.sqrt(mean_squared_error(y_pred_train, y_observed_train)),
                    np.sqrt(mean_squared_error(y_pred_test, y_observed_test)),
                ],
                "MAE": [
                    mean_absolute_error(y_pred_train, y_observed_train),
                    mean_absolute_error(y_pred_test, y_observed_test),
                ],
                
                "r2": [
                    r2_score(y_pred_train, y_observed_train),
                    r2_score(y_pred_test, y_observed_test),
                ],
            }
        )
    )

X_train1 = X_train1.astype(float)
y_train1 = y_train.astype(float)

# Create the model
model1 = sm.OLS(y_train1, X_train1).fit()

# Get the model summary
model1.summary()

# Checking model1 performance
model_pref(model1, X_train1, X_test)

    Data        RMSE         MAE        r2
0  Train  446.667654  262.110666  0.785927
1   Test  446.685287  266.690587  0.790208

X_train1 = X_train1.drop(["Product_Type_Category_Perishables", "Product_Id_char_FD"], axis = 1)

X_test1 = X_test.drop(["Product_Type_Category_Perishables", "Product_Id_char_FD"], axis = 1)

X_train1 = X_train1.astype(float)
y_train1 = y_train.astype(float)

# Create the model
model2 = sm.OLS(y_train, X_train1).fit()

# Get the model summary
model2.summary()

# Checking model2 performance
model_pref(model2, X_train1, X_test1)

    Data        RMSE         MAE        r2
0  Train  446.681336  262.125308  0.785911
1   Test  446.780252  266.790089  0.790113

residuals = model2.resid

np.mean(residuals)

7.361361746596404e-12

from statsmodels.stats.diagnostic import het_white

from statsmodels.compat import lzip

import statsmodels.stats.api as sms

name = ["F statistic", "p-value"]

test = sms.het_goldfeldquandt(y_train, X_train1)

lzip(name, test)

[('F statistic', 0.9832189770563932), ('p-value', 0.6800664494252238)]

# Predicted values
fitted = model2.fittedvalues

# sns.set_style("whitegrid")
sns.residplot(x = fitted, y = residuals, color = "lightblue", lowess = True)

plt.xlabel("Fitted Values")

plt.ylabel("Residual")

plt.title("Residual PLOT")

plt.show()

# Plot histogram of residuals
sns.histplot(residuals, kde = True)

<Axes: ylabel='Count'>

# Plot q-q plot of residuals
import pylab

import scipy.stats as stats

stats.probplot(residuals, dist = "norm", plot = pylab)

plt.show()

# Import the required function

from sklearn.model_selection import cross_val_score

# Build the regression model and cross-validate
linearregression = LinearRegression()                                    

cv_Score11 = cross_val_score(linearregression, X_train, y_train, cv = 10)
cv_Score12 = cross_val_score(linearregression, X_train, y_train, cv = 10, 
                             scoring = 'neg_mean_squared_error')                                  


print("RSquared: %0.3f (+/- %0.3f)" % (cv_Score11.mean(), cv_Score11.std() * 2))
print("Mean Squared Error: %0.3f (+/- %0.3f)" % (-1*cv_Score12.mean(), cv_Score12.std() * 2))

RSquared: 0.823 (+/- 0.045)
Mean Squared Error: 200712.689 (+/- 64085.512)

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
Product_Id	8763	8763	FD6114	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Weight	8763.0	NaN	NaN	NaN	12.653792	2.21732	4.0	11.15	12.66	14.18	22.0
Product_Sugar_Content	8763	4	Low Sugar	4885	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Allocated_Area	8763.0	NaN	NaN	NaN	0.068786	0.048204	0.004	0.031	0.056	0.096	0.298
Product_Type	8763	16	Fruits and Vegetables	1249	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_MRP	8763.0	NaN	NaN	NaN	147.032539	30.69411	31.0	126.16	146.74	167.585	266.0
Store_Id	8763	4	OUT004	4676	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Establishment_Year	8763.0	NaN	NaN	NaN	2002.032751	8.388381	1987.0	1998.0	2009.0	2009.0	2009.0
Store_Size	8763	3	Medium	6025	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Location_City_Type	8763	3	Tier 2	6262	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Type	8763	4	Supermarket Type2	4676	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Store_Sales_Total	8763.0	NaN	NaN	NaN	3464.00364	1065.630494	33.0	2761.715	3452.34	4145.165	8000.0

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
Product_Weight	1586.0	NaN	NaN	NaN	13.458865	2.064975	6.16	12.0525	13.96	14.95	17.97
Product_Sugar_Content	1586	3	Low Sugar	845	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Allocated_Area	1586.0	NaN	NaN	NaN	0.068768	0.047131	0.004	0.033	0.0565	0.094	0.295
Product_Type	1586	16	Snack Foods	202	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_MRP	1586.0	NaN	NaN	NaN	160.514054	30.359059	71.35	141.72	168.32	182.9375	226.59
Store_Id	1586	1	OUT001	1586	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Establishment_Year	1586.0	NaN	NaN	NaN	1987.0	0.0	1987.0	1987.0	1987.0	1987.0	1987.0
Store_Size	1586	1	High	1586	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Location_City_Type	1586	1	Tier 2	1586	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Type	1586	1	Supermarket Type1	1586	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Store_Sales_Total	1586.0	NaN	NaN	NaN	3923.778802	904.62901	2300.56	3285.51	4139.645	4639.4	4997.63
Product_Id_char	1586	3	FD	1163	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
Product_Weight	1152.0	NaN	NaN	NaN	9.911241	1.799846	4.0	8.7675	9.795	10.89	19.82
Product_Sugar_Content	1152	3	Low Sugar	658	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Allocated_Area	1152.0	NaN	NaN	NaN	0.067747	0.047567	0.006	0.031	0.0545	0.09525	0.292
Product_Type	1152	16	Fruits and Vegetables	168	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_MRP	1152.0	NaN	NaN	NaN	107.080634	24.912333	31.0	92.8275	104.675	117.8175	224.93
Store_Id	1152	1	OUT002	1152	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Establishment_Year	1152.0	NaN	NaN	NaN	1998.0	0.0	1998.0	1998.0	1998.0	1998.0	1998.0
Store_Size	1152	1	Small	1152	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Location_City_Type	1152	1	Tier 3	1152	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Type	1152	1	Food Mart	1152	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Store_Sales_Total	1152.0	NaN	NaN	NaN	1762.942465	462.862431	33.0	1495.4725	1889.495	2133.6225	2299.63
Product_Id_char	1152	3	FD	850	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
Product_Weight	1349.0	NaN	NaN	NaN	15.103692	1.893531	7.35	14.02	15.18	16.35	22.0
Product_Sugar_Content	1349	3	Low Sugar	750	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Allocated_Area	1349.0	NaN	NaN	NaN	0.068637	0.048708	0.004	0.031	0.057	0.094	0.298
Product_Type	1349	16	Snack Foods	186	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_MRP	1349.0	NaN	NaN	NaN	181.358725	24.796429	85.88	166.92	179.67	198.07	266.0
Store_Id	1349	1	OUT003	1349	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Establishment_Year	1349.0	NaN	NaN	NaN	1999.0	0.0	1999.0	1999.0	1999.0	1999.0	1999.0
Store_Size	1349	1	Medium	1349	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Location_City_Type	1349	1	Tier 1	1349	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Type	1349	1	Departmental Store	1349	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Store_Sales_Total	1349.0	NaN	NaN	NaN	4946.966323	677.539953	3069.24	4355.39	4958.29	5366.59	8000.0
Product_Id_char	1349	3	FD	1024	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
Product_Weight	4676.0	NaN	NaN	NaN	12.349613	1.428199	7.34	11.37	12.37	13.3025	17.79
Product_Sugar_Content	4676	3	Low Sugar	2632	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Allocated_Area	4676.0	NaN	NaN	NaN	0.069092	0.048584	0.004	0.031	0.056	0.097	0.297
Product_Type	4676	16	Fruits and Vegetables	700	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_MRP	4676.0	NaN	NaN	NaN	142.399709	17.513973	83.04	130.54	142.82	154.1925	197.66
Store_Id	4676	1	OUT004	4676	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Establishment_Year	4676.0	NaN	NaN	NaN	2009.0	0.0	2009.0	2009.0	2009.0	2009.0	2009.0
Store_Size	4676	1	Medium	4676	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Location_City_Type	4676	1	Tier 2	4676	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Type	4676	1	Supermarket Type2	4676	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Store_Sales_Total	4676.0	NaN	NaN	NaN	3299.312111	468.271692	1561.06	2942.085	3304.18	3646.9075	5462.86
Product_Id_char	4676	3	FD	3502	NaN	NaN	NaN	NaN	NaN	NaN	NaN

Python Code Public Examples

Juan D. Correa - Software Developher @ Veracruz,MX & California,USA

astropema@gmail.com

SuperKart Sales Prediction¶

Context:¶

Objective:¶

Data Description:¶

Importing the necessary libraries and overview of the dataset¶

Importing the Dataset¶

View the first and last 5 rows of the dataset¶

Understand the shape of the dataset¶

Check the data types of the columns for the dataset¶

Checking for missing values in the dataset¶

Exploratory Data Analysis¶

Univariate Analysis¶

Bivariate Analysis¶

Data Preprocessing¶

Feature Engineering¶

Outlier Check¶

Data Preparation for modeling¶

Check for Multicollinearity¶

Building Models¶

Drop insignificant variables (variables with p-value > 0.05) from the above model and create the regression model again.¶

Checking the below linear regression assumptions¶

1. Check for mean residuals¶

2. Check for homoscedasticity¶

3. Linearity of variables¶

4. Normality of error terms¶

Apply cross validation to improve the model and evaluate it using different evaluation metrics¶

Actionable Insights and Business Recommendations¶

	Product_Id	Product_Weight	Product_Sugar_Content	Product_Allocated_Area	Product_Type	Product_MRP	Store_Id	Store_Establishment_Year	Store_Size	Store_Location_City_Type	Store_Type	Product_Store_Sales_Total
8758	NC7546	14.80	No Sugar	0.016	Health and Hygiene	140.53	OUT004	2009	Medium	Tier 2	Supermarket Type2	3806.53
8759	NC584	14.06	No Sugar	0.142	Household	144.51	OUT004	2009	Medium	Tier 2	Supermarket Type2	5020.74
8760	NC2471	13.48	No Sugar	0.017	Health and Hygiene	88.58	OUT001	1987	High	Tier 2	Supermarket Type1	2443.42
8761	NC7187	13.89	No Sugar	0.193	Household	168.44	OUT001	1987	High	Tier 2	Supermarket Type1	4171.82
8762	FD306	14.73	Low Sugar	0.177	Snack Foods	224.93	OUT002	1998	Small	Tier 3	Food Mart	2186.08

	Product_Weight	Product_Sugar_Content	Product_Allocated_Area	Product_Type	Product_MRP	Store_Id	Store_Establishment_Year	Store_Size	Store_Location_City_Type	Store_Type	Product_Store_Sales_Total	Product_Id_char
0	12.66	Low Sugar	0.027	Frozen Foods	117.08	OUT004	2009	Medium	Tier 2	Supermarket Type2	2842.40	FD
1	16.54	Low Sugar	0.144	Dairy	171.43	OUT003	1999	Medium	Tier 1	Departmental Store	4830.02	FD
2	14.28	Regular	0.031	Canned	162.08	OUT001	1987	High	Tier 2	Supermarket Type1	4130.16	FD
3	12.10	Low Sugar	0.112	Baking Goods	186.31	OUT001	1987	High	Tier 2	Supermarket Type1	4132.18	FD
4	9.57	No Sugar	0.010	Health and Hygiene	123.67	OUT002	1998	Small	Tier 3	Food Mart	2279.36	NC

	Product_Type	Store_Id	Product_Store_Sales_Total
0	Baking Goods	OUT001	525131.04
1	Baking Goods	OUT002	169860.50
2	Baking Goods	OUT003	491908.20
3	Baking Goods	OUT004	1266086.26
4	Breads	OUT001	121274.09
5	Breads	OUT002	43419.47
6	Breads	OUT003	175391.93
7	Breads	OUT004	374856.75
8	Breakfast	OUT001	38161.10
9	Breakfast	OUT002	23396.10
10	Breakfast	OUT003	95634.08
11	Breakfast	OUT004	204939.13
12	Canned	OUT001	449016.38
13	Canned	OUT002	151467.66
14	Canned	OUT003	452445.17
15	Canned	OUT004	1247153.50
16	Dairy	OUT001	598767.62
17	Dairy	OUT002	178888.18
18	Dairy	OUT003	715814.94
19	Dairy	OUT004	1318447.30
20	Frozen Foods	OUT001	558556.81
21	Frozen Foods	OUT002	180295.95
22	Frozen Foods	OUT003	597608.42
23	Frozen Foods	OUT004	1473519.65
24	Fruits and Vegetables	OUT001	792992.59
25	Fruits and Vegetables	OUT002	298503.56
26	Fruits and Vegetables	OUT003	897437.46
27	Fruits and Vegetables	OUT004	2311899.66
28	Hard Drinks	OUT001	152920.74
29	Hard Drinks	OUT002	54281.85
30	Hard Drinks	OUT003	110760.30
31	Hard Drinks	OUT004	307851.73
32	Health and Hygiene	OUT001	435005.31
33	Health and Hygiene	OUT002	164660.81
34	Health and Hygiene	OUT003	439139.18
35	Health and Hygiene	OUT004	1124901.91
36	Household	OUT001	531371.38
37	Household	OUT002	184665.65
38	Household	OUT003	523981.64
39	Household	OUT004	1324721.50
40	Meat	OUT001	505867.28
41	Meat	OUT002	151800.01
42	Meat	OUT003	520939.68
43	Meat	OUT004	950604.97
44	Others	OUT001	123977.09
45	Others	OUT002	32835.73
46	Others	OUT003	159963.75
47	Others	OUT004	224719.73
48	Seafood	OUT001	52936.84
49	Seafood	OUT002	17663.35
50	Seafood	OUT003	65337.48
51	Seafood	OUT004	136466.37
52	Snack Foods	OUT001	806142.24
53	Snack Foods	OUT002	255317.57
54	Snack Foods	OUT003	918510.44
55	Snack Foods	OUT004	2009026.70
56	Soft Drinks	OUT001	410548.69
57	Soft Drinks	OUT002	103808.35
58	Soft Drinks	OUT003	365046.30
59	Soft Drinks	OUT004	917641.38
60	Starchy Foods	OUT001	120443.98
61	Starchy Foods	OUT002	20044.98
62	Starchy Foods	OUT003	143538.60
63	Starchy Foods	OUT004	234746.89

	Product_Sugar_Content	Store_Id	Product_Store_Sales_Total
0	Low Sugar	OUT001	3300834.93
1	Low Sugar	OUT002	1156758.85
2	Low Sugar	OUT003	3706903.24
3	Low Sugar	OUT004	8658908.78
4	No Sugar	OUT001	1090353.78
5	No Sugar	OUT002	382162.19
6	No Sugar	OUT003	1123084.57
7	No Sugar	OUT004	2674343.14
8	Regular	OUT001	1831924.47
9	Regular	OUT002	491988.68
10	Regular	OUT003	1843469.76
11	Regular	OUT004	4094331.51

Dep. Variable:	Product_Store_Sales_Total	R-squared:	0.824
Model:	OLS	Adj. R-squared:	0.823
Method:	Least Squares	F-statistic:	2860.
Date:	Sat, 18 Jan 2025	Prob (F-statistic):	0.00
Time:	07:35:20	Log-Likelihood:	-46132.
No. Observations:	6134	AIC:	9.229e+04
Df Residuals:	6123	BIC:	9.236e+04
Df Model:	10
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	176.6077	42.659	4.140	0.000	92.982	260.234
Product_Weight	135.0943	3.436	39.318	0.000	128.359	141.830
Product_Allocated_Area	-139.5679	118.876	-1.174	0.240	-372.608	93.472
Product_MRP	12.8696	0.254	50.702	0.000	12.372	13.367
Product_Sugar_Content_No Sugar	16.0039	12.324	1.299	0.194	-8.155	40.163
Product_Sugar_Content_Regular	30.8608	13.284	2.323	0.020	4.819	56.902
Store_Size_Medium	394.3947	26.547	14.856	0.000	342.353	446.436
Store_Size_Small	-383.1731	5.776	-66.339	0.000	-394.496	-371.850
Store_Location_City_Type_Tier 2	-308.1516	8.855	-34.801	0.000	-325.510	-290.794
Store_Location_City_Type_Tier 3	-383.1731	5.776	-66.339	0.000	-394.496	-371.850
Store_Type_Food Mart	-383.1731	5.776	-66.339	0.000	-394.496	-371.850
Store_Type_Supermarket Type1	165.3861	17.762	9.311	0.000	130.566	200.206
Store_Type_Supermarket Type2	-473.5378	17.176	-27.570	0.000	-507.208	-439.867
Product_Id_char_FD	0.3932	21.936	0.018	0.986	-42.609	43.396
Product_Id_char_NC	16.0039	12.324	1.299	0.194	-8.155	40.163
Product_Type_Category_Perishables	7.7111	13.233	0.583	0.560	-18.231	33.653

Omnibus:	1745.262	Durbin-Watson:	1.987
Prob(Omnibus):	0.000	Jarque-Bera (JB):	46497.082
Skew:	0.775	Prob(JB):	0.00
Kurtosis:	16.399	Cond. No.	2.46e+20

Omnibus:	1744.629	Durbin-Watson:	1.987
Prob(Omnibus):	0.000	Jarque-Bera (JB):	46526.548
Skew:	0.775	Prob(JB):	0.00
Kurtosis:	16.403	Cond. No.	1.78e+34

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	179.0666	40.949	4.373	0.000	98.793	259.340
Product_Weight	135.0918	3.435	39.323	0.000	128.357	141.827
Product_Allocated_Area	-140.3752	118.852	-1.181	0.238	-373.367	92.616
Product_MRP	12.8703	0.254	50.727	0.000	12.373	13.368
Product_Sugar_Content_No Sugar	14.1944	7.933	1.789	0.074	-1.358	29.747
Product_Sugar_Content_Regular	30.7219	13.272	2.315	0.021	4.704	56.740
Store_Size_Medium	395.6741	25.835	15.316	0.000	345.029	446.319
Store_Size_Small	-382.7595	5.395	-70.953	0.000	-393.335	-372.184
Store_Location_City_Type_Tier 2	-307.8490	8.636	-35.648	0.000	-324.778	-290.920
Store_Location_City_Type_Tier 3	-382.7595	5.395	-70.953	0.000	-393.335	-372.184
Store_Type_Food Mart	-382.7595	5.395	-70.953	0.000	-393.335	-372.184
Store_Type_Supermarket Type1	166.1521	17.353	9.575	0.000	132.135	200.169
Store_Type_Supermarket Type2	-474.0011	17.074	-27.762	0.000	-507.472	-440.530
Product_Id_char_NC	14.1944	7.933	1.789	0.074	-1.358	29.747