import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3 as sql
import warnings
warnings.filterwarnings('ignore')

!gdown https://drive.google.com/uc?id=11eJQmZrCrKwovqQlaf4BZ-DExivgNSro

!gdown https://drive.google.com/uc?id=10eArwzQnEBtzzXDlrz1U58Cw0OkwyUKc

!gdown https://drive.google.com/uc?id=11S24579EfQ1xeNbyLrBryuk2RIZMJpB8

!gdown https://drive.google.com/uc?id=11MThES6FJS5DsSqRitvRfleuTDBjRnqP

!gdown https://drive.google.com/uc?id=11B-S5D9GUOsELksIQX09UclVz9gGfII7

!gdown https://drive.google.com/uc?id=10sv_7qVoTmGYHn26mzvOYRka2wi95N5o

Downloading...
From: https://drive.google.com/uc?id=11eJQmZrCrKwovqQlaf4BZ-DExivgNSro
To: /content/customer_dim.csv
100% 426k/426k [00:00<00:00, 89.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=10eArwzQnEBtzzXDlrz1U58Cw0OkwyUKc
To: /content/fact_table.csv
100% 49.5M/49.5M [00:00<00:00, 130MB/s]
Downloading...
From: https://drive.google.com/uc?id=11S24579EfQ1xeNbyLrBryuk2RIZMJpB8
To: /content/item_dim.csv
100% 23.7k/23.7k [00:00<00:00, 44.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=11MThES6FJS5DsSqRitvRfleuTDBjRnqP
To: /content/store_dim.csv
100% 25.0k/25.0k [00:00<00:00, 46.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=11B-S5D9GUOsELksIQX09UclVz9gGfII7
To: /content/time_dim.csv
100% 5.19M/5.19M [00:00<00:00, 20.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=10sv_7qVoTmGYHn26mzvOYRka2wi95N5o
To: /content/Trans_dim.csv
100% 1.30k/1.30k [00:00<00:00, 5.76MB/s]

cust_df = pd.read_csv("customer_dim.csv", encoding='latin1')
fact_df= pd.read_csv('fact_table.csv', encoding='latin1')
item_df = pd.read_csv('item_dim.csv', encoding='latin1')
store_df = pd.read_csv('store_dim.csv', encoding='latin1')
time_df = pd.read_csv('time_dim.csv', encoding='latin1')
trans_df = pd.read_csv('Trans_dim.csv', encoding='latin1')

df1 = pd.merge(fact_df, cust_df, on='coustomer_key', how='left')
df2 = pd.merge(df1, item_df, on='item_key', how='left')
df3 = pd.merge(df2, store_df, on='store_key', how='left')
df4 = pd.merge(df3, time_df, on='time_key', how='left')
df5 = pd.merge(df4, trans_df, on='payment_key', how='left')

df5.shape

(1000000, 30)

df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 30 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   payment_key    1000000 non-null  object 
 1   coustomer_key  1000000 non-null  object 
 2   time_key       1000000 non-null  object 
 3   item_key       1000000 non-null  object 
 4   store_key      1000000 non-null  object 
 5   quantity       1000000 non-null  int64  
 6   unit_x         996277 non-null   object 
 7   unit_price_x   1000000 non-null  float64
 8   total_price    1000000 non-null  float64
 9   name           997010 non-null   object 
 10  contact_no     1000000 non-null  int64  
 11  nid            1000000 non-null  int64  
 12  item_name      1000000 non-null  object 
 13  desc           1000000 non-null  object 
 14  unit_price_y   1000000 non-null  float64
 15  man_country    1000000 non-null  object 
 16  supplier       1000000 non-null  object 
 17  unit_y         996277 non-null   object 
 18  division       1000000 non-null  object 
 19  district       1000000 non-null  object 
 20  upazila        1000000 non-null  object 
 21  date           1000000 non-null  object 
 22  hour           1000000 non-null  int64  
 23  day            1000000 non-null  int64  
 24  week           1000000 non-null  object 
 25  month          1000000 non-null  int64  
 26  quarter        1000000 non-null  object 
 27  year           1000000 non-null  int64  
 28  trans_type     1000000 non-null  object 
 29  bank_name      974410 non-null   object 
dtypes: float64(3), int64(7), object(20)
memory usage: 228.9+ MB

df5[['unit_x', 'unit_y', 'unit_price_y', 'unit_price_x']].sample(10)

df5['unit'] = df5['unit_x'].fillna(df5['unit_y'])
df5['unit_price'] = df5['unit_price_x'].fillna(df5['unit_price_y'])
df5['customer_key'] = df5['coustomer_key']
df5['sub-district'] = df5['upazila']
df5['item_description'] = df5['desc']

df = df5.drop(columns = ['unit_x', 'unit_y', 'unit_price_y', 'unit_price_x', 'coustomer_key', 'upazila', 'desc', 'name', 'nid', 'contact_no'])

df.shape

(1000000, 25)

df.describe()

df.sample(5)

df.isna().sum()

trans_type_with_null_bank = df.loc[df['bank_name'].isna(), 'trans_type']
trans_type_with_null_bank.value_counts()

df['bank_name'] = df['bank_name'].fillna('Not Applicable')

nul_unit = df.loc[df['unit'].isna(), ['item_name', 'item_description']]
nul_unit.value_counts().reset_index()

fill_unit = df.groupby('item_description')['unit'].agg(lambda x: x.mode()[0] if not x.mode().empty else np.nan)

df['unit'] = df['unit'].fillna(df['item_description'].map(fill_unit))

df.duplicated().sum()

0

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 25 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   payment_key       1000000 non-null  object 
 1   time_key          1000000 non-null  object 
 2   item_key          1000000 non-null  object 
 3   store_key         1000000 non-null  object 
 4   quantity          1000000 non-null  int64  
 5   total_price       1000000 non-null  float64
 6   item_name         1000000 non-null  object 
 7   man_country       1000000 non-null  object 
 8   supplier          1000000 non-null  object 
 9   division          1000000 non-null  object 
 10  district          1000000 non-null  object 
 11  date              1000000 non-null  object 
 12  hour              1000000 non-null  int64  
 13  day               1000000 non-null  int64  
 14  week              1000000 non-null  object 
 15  month             1000000 non-null  int64  
 16  quarter           1000000 non-null  object 
 17  year              1000000 non-null  int64  
 18  trans_type        1000000 non-null  object 
 19  bank_name         1000000 non-null  object 
 20  unit              1000000 non-null  object 
 21  unit_price        1000000 non-null  float64
 22  customer_key      1000000 non-null  object 
 23  sub-district      1000000 non-null  object 
 24  item_description  1000000 non-null  object 
dtypes: float64(2), int64(5), object(18)
memory usage: 190.7+ MB

df['date'] = pd.to_datetime(df['date'])

df['dates'] = df['date'].dt.date
df['month'] = df['date'].dt.month_name()
df['year'] = df['date'].dt.year
df['day'] = df['date'].dt.day_name()
df['time'] = df['date'].dt.time

df['dates'] = pd.to_datetime(df['dates'])

df.drop(columns = ['date', 'hour'], inplace = True)

num_cols = df.columns[(df.dtypes != 'object') & (df.dtypes != 'datetime64[ns]')]
num_cols

Index(['quantity', 'total_price', 'year', 'unit_price'], dtype='object')

def detect_outliers(data, col):
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR


    outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)].shape[0]
    return outliers

for col in num_cols:
    outliers = detect_outliers(df, col)
    print(f"Number of outliers in column '{col}': {outliers}")

Number of outliers in column 'quantity': 0
Number of outliers in column 'total_price': 31933
Number of outliers in column 'year': 0
Number of outliers in column 'unit_price': 75823

def treat_outliers(data, col):
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    data[col] = data[col].clip(lower=lower_bound, upper=upper_bound)

for col in num_cols:
    treat_outliers(df, col)

df.shape

(1000000, 25)

cat_cols = df.columns[(df.dtypes == 'object') & (df.columns != 'dates')]
cat_cols

Index(['payment_key', 'time_key', 'item_key', 'store_key', 'item_name',
       'man_country', 'supplier', 'division', 'district', 'day', 'week',
       'month', 'quarter', 'trans_type', 'bank_name', 'unit', 'customer_key',
       'sub-district', 'item_description', 'time'],
      dtype='object')

cols1 = df[['item_name','item_key','store_key', 'man_country', 'supplier', 'division', 'district','bank_name', 'trans_type', 'customer_key', 'sub-district']]
cols1.nunique().reset_index()

duplicate_item_names = df.groupby('item_name')['item_key'].nunique().reset_index().rename(columns={'item_key': 'unique_item_keys'})
duplicate_item_names = duplicate_item_names[duplicate_item_names['unique_item_keys'] > 1]
duplicate_item_names

filtered_rows = df[['item_key', 'item_name', 'unit_price', 'man_country']].drop_duplicates().sort_values(by='item_key')
filtered_rows = filtered_rows[filtered_rows['item_name'].isin(duplicate_item_names['item_name'])]
filtered_rows

df[['man_country', 'supplier']].drop_duplicates().sort_values(by = 'man_country')

def disp_val(bp):
  for p in bp.patches:
    bp.annotate(f'{int(p.get_height())}',
                 (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha='center', va='center',
                 xytext=(0, 10),
                 textcoords='offset points')

def leg():
  plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)

ecom_db = sql.connect('ecommerce.db')

df.to_sql('df', ecom_db, if_exists='replace', index=False)

1000000

q1 = """
with cte as (
  SELECT
    year, store_key,
    round(SUM(total_price), 1) AS total_sales
FROM
    df
    GROUP BY
    year, store_key
ORDER BY
    total_sales DESC
    )

  select year, store_key, total_sales
  from(
  select year, store_key, total_sales,
  dense_rank() over (partition by year order by total_sales desc) as rnk
  from cte
  ) as a
  where rnk = 1
  order by year
"""

top_sales_store_year = pd.read_sql_query(q1, ecom_db)
top_sales_store_year

q1 = """
select year, min(dates) as start_date, max(dates) as end_date
from df
group by year
order by 1
"""
pd.read_sql_query(q1, ecom_db)

plt.figure(figsize=(10, 5))
a = sns.barplot(data = top_sales_store_year[top_sales_store_year['year'] != 2021], x = 'year', hue = 'store_key', y = 'total_sales', palette='Set2')
plt.title('Every year highest sales in store')
plt.xlabel('Year')
plt.ylabel('Total Sales')

leg()
disp_val(a)

plt.tight_layout()
plt.show()

q1 = """
select year, item_description, total_quantity_sold, total_sales_amount, q_rnk
from(
select year, item_description, total_quantity_sold, total_sales_amount,
dense_rank() over (partition by year order by total_quantity_sold desc) as q_rnk
from(
select year, item_description, sum(quantity) as total_quantity_sold, round(sum(total_price), 1) as total_sales_amount
from df
group by year, item_description
order by total_quantity_sold desc) a
) b
where q_rnk = 1
order by year
"""
total_quantity_sold = pd.read_sql_query(q1, ecom_db)
total_quantity_sold

q1 = """
select year, item_description, total_quantity_sold, total_sales_amount, q_rnk
from(
select year, item_description, total_quantity_sold, total_sales_amount,
dense_rank() over (partition by year order by total_sales_amount desc) as q_rnk
from(
select year, item_description, sum(quantity) as total_quantity_sold, round(sum(total_price), 1) as total_sales_amount
from df
group by year, item_description
order by total_sales_amount desc) a
) b
where q_rnk = 1
order by year
"""
total_sales_amount = pd.read_sql_query(q1, ecom_db)
total_sales_amount

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

sns.barplot(data=total_quantity_sold[total_quantity_sold['year'] != 2021], x='year', y='total_quantity_sold', hue='item_description', palette='cividis', ax=ax1)
ax1.set_title('Every Year Highest Sold Item (Quantity)')
ax1.set_xlabel('Year')
ax1.set_ylabel('Total Quantity Sold')
leg()
disp_val(ax1)

sns.barplot(data=total_sales_amount[total_sales_amount['year'] != 2021], x='year', y='total_sales_amount', hue='item_description', palette='Set2', ax=ax2)
ax2.set_title('Every Year Highest Sold Item (Total Price)')
ax2.set_xlabel('Year')
ax2.set_ylabel('Total Sales Amount')
leg()
disp_val(ax2)

handles, labels = ax1.get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)

ax1.get_legend().remove()
ax2.get_legend().remove()

plt.tight_layout()
plt.show()

div_sales = df.groupby(['division'])[['total_price', 'quantity']].sum().sort_values(by=['total_price', 'quantity'], ascending=False).reset_index()


plt.figure(figsize=(10, 6))
a = sns.barplot(data=div_sales, x='division', y='total_price', palette='Set2')
plt.title('Division wise Total Sales Amount')
plt.xlabel('Division')
plt.ylabel('Total Sales Amount')

disp_val(a)

plt.tight_layout()
plt.show()

q1 = """
with cte1 as (
select division, item_description, total_quantity_sold, total_sales_amount,
dense_rank() over (partition by division order by total_sales_amount desc) as s_rnk
from(
select division, item_description, sum(quantity) as total_quantity_sold, round(sum(total_price), 1) as total_sales_amount
from df
group by division, item_description ) a)

select division, item_description, total_quantity_sold, total_sales_amount
from cte1
where s_rnk = 1
order by total_sales_amount desc
"""
div_item_sales = pd.read_sql_query(q1, ecom_db)

plt.figure(figsize=(10, 6))
a = sns.barplot(data=div_item_sales, x='division', y='total_sales_amount', hue='item_description' , palette='Set2')
plt.title('Division wise Total Sales Amount')
plt.xlabel('Division')
plt.ylabel('Total Sales Amount')

disp_val(a)

plt.tight_layout()
plt.show()

overall_sales = df.groupby(['year', 'division'])['total_price'].sum().reset_index()

plt.figure(figsize=(12, 6))
sns.lineplot(data=overall_sales[overall_sales['year'] !=2021], x='year', y='total_price', hue='division', palette='Set2')
plt.title('Overall Sales Trend')
plt.xlabel('Year')
plt.ylabel('Total Sales Amount')
leg()
plt.tight_layout()
plt.show()

store_cnt = df.groupby(['division'])['store_key'].nunique().reset_index()
store_cnt = store_cnt.sort_values(by='store_key', ascending=False)

plt.figure(figsize=(10, 6))
a = sns.barplot(data=store_cnt, x='division', y='store_key', palette='Set2')

plt.title('Number of Stores per Division')
plt.xlabel('Division')
plt.ylabel('Number of Stores')

disp_val(a)

plt.tight_layout()
plt.show()

quarter_sales = df.groupby('quarter')['total_price'].sum().reset_index()

plt.figure(figsize=(10, 6))
a = sns.lineplot(data=quarter_sales, x='quarter', y='total_price', palette='Set2')
disp_val(a)

plt.title('Sales in Different Quarters')
plt.xlabel('Quarter')
plt.ylabel('Total Sales Amount')
plt.show()

q1 = """
select quarter, item_description, total_quantity_sold
from(
select quarter, item_description, total_quantity_sold,
dense_rank() over (partition by quarter order by total_quantity_sold desc) as q_rnk
from(
select quarter, item_description, sum(quantity) as total_quantity_sold
from df
group by quarter, item_description
) a
) b
where q_rnk = 1 or q_rnk = 2
order by 1
"""
quart_item_sales = pd.read_sql(q1, ecom_db)

plt.figure(figsize=(10, 6))
a = sns.lineplot(data=quart_item_sales, x='quarter', y='total_quantity_sold', hue='item_description', palette='Set2')
plt.title('Sales in Different Quarters')
plt.xlabel('Quarter')
plt.ylabel('Total Quantity Sold')

leg()

plt.tight_layout()
plt.show()

df['month'] = pd.to_datetime(df['month'], format='%B').dt.month

month_sales = df.groupby('month')['total_price'].sum().reset_index()

plt.figure(figsize=(10, 6))
sns.lineplot(data=month_sales, x='month', y='total_price', palette='Set2')
plt.title('Monthwise Sales Trend')
plt.xlabel('Month')
plt.ylabel('Total Sales Amount')
plt.tight_layout()
plt.show()

pay_method = df['trans_type'].value_counts().reset_index()

explode = [0.2, 0, 0]
plt.figure(figsize=(10, 6))
plt.pie(pay_method['count'], labels=pay_method['trans_type'], autopct='%1.0f%%', explode=explode, startangle=150, colors=sns.color_palette('Pastel1'))

plt.title('Distribution of Payment Methods')
plt.tight_layout()
plt.show()

bank = df['bank_name'].value_counts().reset_index().head(10)

plt.figure(figsize=(15, 10))
a = sns.barplot(data=bank, x='bank_name', y='count', palette='Set2')
disp_val(a)
plt.title('Most used bank')
plt.xlabel('Bank Name')
plt.ylabel('Count')
plt.ylim(25650, max(bank['count'])+100)
wrapped_labels = ["\n".join(bank.split(" ", 2)[:2]) for bank in bank]
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

first_date = df.groupby('customer_key')['dates'].min().reset_index()
last_date = df.groupby('customer_key')['dates'].max().reset_index()

first_date.columns = ['customer_key', 'first_purchase_date']
last_date.columns = ['customer_key', 'last_purchase_date']

df = df.merge(first_date, on='customer_key', how='left')
df = df.merge(last_date, on='customer_key', how='left')

df['ref_date'] = df['dates'].max() + pd.DateOffset(days=1)

df['recency'] = (df['ref_date'] - df['last_purchase_date']).dt.days

df['number_of_purchase'] = df.groupby('customer_key')['customer_key'].transform('count')

df['tot_month'] = round((df['last_purchase_date'] - df['first_purchase_date']).dt.days/30, 0)

df['frequency'] = round(df['number_of_purchase'] / df['tot_month'], 2)
df['monetary'] = round(df.groupby('customer_key')['total_price'].transform('sum'), 2)

m20 = df['monetary'].quantile(0.20)
m40 = df['monetary'].quantile(0.40)
m60 = df['monetary'].quantile(0.60)
m80 = df['monetary'].quantile(0.80)
m100 = df['monetary'].quantile(1)
m20, m40, m60, m80, m100

(10130.38, 10842.5, 11507.0, 12274.75, 16320.75)

f20 = df['frequency'].quantile(0.20)
f40 = df['frequency'].quantile(0.40)
f60 = df['frequency'].quantile(0.60)
f80 = df['frequency'].quantile(0.80)
f100 = df['frequency'].quantile(1)
f20, f40, f60, f80, f100

(1.2, 1.28, 1.34, 1.42, 1.86)

r20 = df['recency'].quantile(0.20)
r40 = df['recency'].quantile(0.40)
r60 = df['recency'].quantile(0.60)
r80 = df['recency'].quantile(0.80)
r100 = df['recency'].quantile(1)
r20, r40, r60, r80, r100

(7.0, 14.0, 23.0, 39.0, 218.0)

df['m_score'] = pd.cut(df['monetary'], bins=[0, m20, m40, m60, m80, m100], labels=[1, 2, 3, 4, 5])
df['f_score'] = pd.cut(df['frequency'], bins=[0, f20, f40, f60, f80, f100], labels=[1, 2, 3, 4, 5])
df['r_score'] = pd.cut(df['recency'], bins=[0, r20, r40, r60, r80, r100], labels=[5, 4, 3, 2, 1])

df['fm_score'] = round((df['m_score'].astype(int) + df['f_score'].astype(int)) / 2, 0)

def cust_seg(r, fm):
  if (r == 5 and fm == 5) or (r == 5 and fm == 4) or  (r == 4 and fm == 5):
    return 'Champions'
  elif (r == 5 and fm == 3) or (r == 4 and fm == 4) or  (r == 3 and fm == 5) or (r == 3 and fm == 4):
    return 'Loyal Customers'
  elif (r == 5 and fm == 2) or (r == 4 and fm == 2) or  (r == 3 and fm == 3) or (r == 4 and fm == 3):
    return 'Potential Loyalists'
  elif (r == 5 and fm == 1):
    return 'Recent Customers'
  elif (r == 4 and fm == 1) or (r == 3 and fm == 1):
    return 'Promising'
  elif (r == 3 and fm == 2) or (r == 2 and fm == 3) or (r == 2 and fm == 2):
    return 'Customers Needing Attention'
  elif (r == 2 and fm == 4) or (r == 2 and fm == 5) or (r == 1 and fm == 3):
    return 'At Risk'
  elif (r == 1 and fm == 5) or (r == 1 and fm == 4):
    return 'Cant Lose Them'
  elif (r == 1 and fm == 2) or (r == 1 and fm == 1) or (r == 2 and fm == 1):
    return 'Hibernating'
  elif (r == 1 and fm == 1):
    return 'Lost'
  else:
    return r,fm

df['rfm_segment'] = df.apply(lambda x: cust_seg(x['r_score'], x['fm_score']), axis=1)

df['rfm_segment'].value_counts().reset_index()

df.to_csv('df_rfm.csv', index=False)

sizes = df['rfm_segment'].value_counts()
labels = sizes.index
explode = [0, 0, 0, 0, 0, 0.1, 0.1, 0, 0]

plt.figure(figsize=(8, 6))
plt.pie(sizes, labels=labels, autopct='%1.1f%%',  explode=explode, startangle=0, colors=sns.color_palette('Pastel1'))
plt.title("RFM Segment Distribution")
plt.show()

import statsmodels.api as sm

num_cols = ['total_price', 'quantity', 'recency', 'number_of_purchase', 'frequency', 'monetary', 'unit_price']

for col in num_cols:
  a = np.array(df[col])
  sm.qqplot(a, line="s")
  plt.title(f"Normality test for {col}")
  plt.show()

from scipy.stats import shapiro

for col in num_cols:
  a = np.array(df[col])
  stat, pvalue = shapiro(a)
  print(f"{col} : {pvalue}; Not normally distributed" if pvalue <= 0.05 else ";\t\t\tNormally distributed\n")

total_price : 8.958389509258947e-144; Not normally distributed
quantity : 1.0115475064741086e-138; Not normally distributed
recency : 9.258991473465387e-174; Not normally distributed
number_of_purchase : 7.526419825183871e-49; Not normally distributed
frequency : 3.646155091653796e-48; Not normally distributed
monetary : 4.1539248217887104e-50; Not normally distributed
unit_price : 1.2827231666404496e-139; Not normally distributed

df1['total_price'] = np.log1p(df1['total_price'])
sns.histplot(data=df1['total_price'], kde=True)
plt.show()

from scipy.stats import boxcox

df1["total_price_boxcox"], lambda_val = boxcox(df["total_price"] + 1)
print("After Box-Cox transformation:", df1["total_price_boxcox"].skew())
print(f"Best lambda value: {lambda_val}")
sns.histplot(data=df1['total_price_boxcox'], kde=True)
plt.show()

After Box-Cox transformation: -0.06505589448079488
Best lambda value: 0.3382599822222821

from scipy.stats import kruskal

sales_2014 = df[df['year'] == 2014]['total_price']
sales_2015 = df[df['year'] == 2015]['total_price']
sales_2016 = df[df['year'] == 2016]['total_price']
sales_2017 = df[df['year'] == 2017]['total_price']
sales_2019 = df[df['year'] == 2018]['total_price']
sales_2019 = df[df['year'] == 2019]['total_price']
sales_2020 = df[df['year'] == 2020]['total_price']

stat, p_value = kruskal(
    sales_2014,
    sales_2015,
    sales_2016,
    sales_2017,
    sales_2019,
    sales_2019,
    sales_2020
)

print(f'Statistic: {stat}')
print(f'P-Value: {p_value}')

if p_value < 0.05:
   print("Reject H0")
   print("At least one of the medians of the total price is different among the other sales year.")
else:
   print("Fail to reject H0")
   print("The medians of the total price are the same for all the sales years.")

Statistic: 1.1997009367452445
P-Value: 0.9768994813158671
Fail to reject H0
The medians of the total price are the same for all the sales years.

from scipy.stats import chi2_contingency

payment_counts = pd.crosstab(df['trans_type'], df['customer_key'])

chi2_stat, p_value, dof, expected = chi2_contingency(payment_counts)

print(f"Chi-Square Statistic: {chi2_stat}, P-Value: {p_value}")

if p_value < 0.05:
    print("Reject H0: Payment methods are not equally distributed among customers.")
else:
    print("Fail to Reject H0: Payment methods are equally distributed among customers.")

Chi-Square Statistic: 17907.371069052442, P-Value: 0.993486971758989
Fail to Reject H0: Payment methods are equally distributed among customers.

contingency_table = pd.crosstab(df["item_key"], df["customer_key"])

chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2:.2f}")
print(f"P-Value: {p:.5f}")

if p < 0.05:
    print("Reject H0: There is a significant relationship between items and customers.")
else:
    print("Fail to Reject H0: No significant relationship found.")

Chi-Square Statistic: 2419231.36
P-Value: 0.15185
Fail to Reject H0: No significant relationship found.

contingency_table = pd.crosstab(df["item_key"], df["rfm_segment"])

chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2:.2f}")
print(f"P-Value: {p:.5f}")

if p < 0.05:
    print("Reject H0: There is a significant relationship between items and customer segments.")
else:
    print("Fail to Reject H0: No significant relationship found.")

Chi-Square Statistic: 2366.16
P-Value: 0.00005
Reject H0: There is a significant relationship between items and customer segments.

contingency_table = pd.crosstab(df["item_description"], df["rfm_segment"])

chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2:.2f}")
print(f"P-Value: {p:.5f}")

if p < 0.05:
    print("Reject H0: There is a significant relationship between items and customer segments.")
else:
    print("Fail to Reject H0: No significant relationship found.")

Chi-Square Statistic: 420.37
P-Value: 0.00000
Reject H0: There is a significant relationship between items and customer segments.

contingency_percent = contingency_table.div(contingency_table.sum(axis=1), axis=0)

plt.figure(figsize=(12, 6))
sns.heatmap(contingency_percent, cmap="Set2", annot=True, fmt=".2f", linewidths=0.5)

plt.title("Customer Preference for Item Categories")
plt.xlabel("Customer Segment")
plt.ylabel("Item Description")
plt.show()

contingency_table = pd.crosstab(df["man_country"], df["customer_key"])

chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2:.2f}")
print(f"P-Value: {p:.5f}")

if p < 0.05:
    print("Reject H0: There is a significant relationship between manufacturing country and customers.")
else:
    print("Fail to Reject H0: No significant relationship found.")

Chi-Square Statistic: 82891.40
P-Value: 0.32732
Fail to Reject H0: No significant relationship found.

contingency_table = pd.crosstab(df["man_country"], df["rfm_segment"])

chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2:.2f}")
print(f"P-Value: {p:.5f}")

if p < 0.05:
    print("Reject H0: There is a significant relationship between manufacturing country and customer segments.")
else:
    print("Fail to Reject H0: No significant relationship found.")

Chi-Square Statistic: 65.17
P-Value: 0.70265
Fail to Reject H0: No significant relationship found.

contingency_table = pd.crosstab(df["month"], df["item_key"])

chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2:.2f}")
print(f"P-Value: {p:.5f}")

if p < 0.05:
    print("Reject H0: There is a significant relationship between months and items.")
else:
    print("Fail to Reject H0: No significant relationship found.")

Chi-Square Statistic: 3025.67
P-Value: 0.04209
Reject H0: There is a significant relationship between months and items.

contingency_table = pd.crosstab(df["month"], df["item_description"])

chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2:.2f}")
print(f"P-Value: {p:.5f}")

if p < 0.05:
    print("Reject H0: There is a significant relationship between months and items.")
else:
    print("Fail to Reject H0: No significant relationship found.")

Chi-Square Statistic: 351.77
P-Value: 0.10001
Fail to Reject H0: No significant relationship found.

contingency_table = pd.crosstab(df["quarter"], df["item_key"])

chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2:.2f}")
print(f"P-Value: {p:.5f}")

if p < 0.05:
    print("Reject H0: There is a significant relationship between quarters and items.")
else:
    print("Fail to Reject H0: No significant relationship found.")

Chi-Square Statistic: 802.37
P-Value: 0.36261
Fail to Reject H0: No significant relationship found.

contingency_table = pd.crosstab(df["quarter"], df["item_description"])

chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2:.2f}")
print(f"P-Value: {p:.5f}")

if p < 0.05:
    print("Reject H0: There is a significant relationship between quarters and items.")
else:
    print("Fail to Reject H0: No significant relationship found.")

Chi-Square Statistic: 88.06
P-Value: 0.44808
Fail to Reject H0: No significant relationship found.

contingency_table = pd.crosstab(df["division"], df["item_description"])

chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2:.2f}")
print(f"P-Value: {p:.5f}")

if p < 0.05:
    print("Reject H0: There is a significant relationship between divisions and items.")
else:
    print("Fail to Reject H0: No significant relationship found.")

Chi-Square Statistic: 156.29
P-Value: 0.82829
Fail to Reject H0: No significant relationship found.

contingency_table = pd.crosstab(df["division"], df["item_key"])

chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2:.2f}")
print(f"P-Value: {p:.5f}")

if p < 0.05:
    print("Reject H0: There is a significant relationship between divisions and items.")
else:
    print("Fail to Reject H0: No significant relationship found.")

Chi-Square Statistic: 1559.89
P-Value: 0.62239
Fail to Reject H0: No significant relationship found.

	unit_x	unit_y	unit_price_y	unit_price_x
107794	cans	cans	16.00	16.00
121289	ct	ct	26.00	26.00
97953	bottles	bottles	22.00	22.00
867614	ct.	ct.	13.00	13.00
477800	ct	ct	14.00	14.00
429294	pk	pk	15.00	15.00
793682	ct	ct	20.00	20.00
977247	cans	cans	16.25	16.25
918431	ct	ct	13.50	13.50
817054	cans	cans	11.50	11.50

	quantity	total_price	hour	day	month	year	unit_price
count	1000000.000000	1000000.000000	1000000.000000	1000000.00000	1000000.000000	1000000.000000	1000000.000000
mean	6.000185	105.401436	11.542591	15.77572	6.518765	2017.050811	17.561602
std	3.161932	80.829301	6.903028	8.80161	3.447577	2.016437	8.637097
min	1.000000	6.000000	0.000000	1.00000	1.000000	2014.000000	6.000000
25%	3.000000	47.250000	6.000000	8.00000	4.000000	2015.000000	13.000000
50%	6.000000	90.000000	12.000000	16.00000	7.000000	2017.000000	16.000000
75%	9.000000	144.000000	18.000000	23.00000	10.000000	2019.000000	20.000000
max	11.000000	605.000000	23.000000	31.00000	12.000000	2021.000000	55.000000

	payment_key	time_key	item_key	store_key	quantity	total_price	item_name	man_country	supplier	division	...	month	quarter	year	trans_type	bank_name	unit	unit_price	customer_key	sub-district	item_description
356930	P032	T044920	I00169	S00706	3	51.0	Sun Chips Variety Pack	India	Indo Count Industries Ltd	CHITTAGONG	...	9	Q3	2017	card	South Bangla Agriculture and Commerce Bank Lim...	ct	17.0	C007281	LOHAGARA	Food - Chips
786444	P010	T07812	I00262	S00675	6	84.0	Strong Everyday Napkins	Netherlands	Bolsius Boxmeer	DHAKA	...	11	Q4	2018	card	Dutch-Bangla Bank Limited	ct	14.0	C003758	SHER-E-BANGLA NAGAR	Kitchen Supplies
610030	P038	T060424	I00187	S00406	4	52.0	Belvita Hard Biscuits Cinnamon	Netherlands	Bolsius Boxmeer	CHITTAGONG	...	4	Q2	2020	mobile	Rocket	ct.	13.0	C007697	LAMA	Food - Healthy
197306	P022	T010240	I00249	S00633	10	160.0	Clorox Bleach Large Jugs	China	CHERRY GROUP CO.,LTD	DHAKA	...	7	Q3	2014	card	NRB Commercial Bank Ltd	pack	16.0	C005287	KHILKHET	Kitchen Supplies
523217	P004	T033285	I00036	S00218	10	160.0	La Croix Sparkling Orange 12 oz	China	CHERRY GROUP CO.,LTD	KHULNA	...	11	Q4	2019	card	Bank Asia Limited	cans	16.0	C005760	SREEPUR	a. Beverage Sparkling Water

	item_name	unique_item_keys
41	Coke Classic 12 oz cans	2
51	Diet Coke - 12 oz cans	2
161	Muscle Milk Protein Shake Van. 11oz	2
186	Pepsi - 12 oz cans	2
222	Sprite - 12 oz cans	2

	item_key	item_name	unit_price	man_country
129	I00006	Coke Classic 12 oz cans	16.25	Lithuania
47	I00007	Coke Classic 12 oz cans	6.75	India
199	I00009	Diet Coke - 12 oz cans	16.25	Netherlands
65	I00010	Diet Coke - 12 oz cans	6.75	Lithuania
11	I00023	Pepsi - 12 oz cans	6.75	Cambodia
156	I00024	Pepsi - 12 oz cans	16.25	Lithuania
414	I00026	Sprite - 12 oz cans	6.75	Lithuania
457	I00027	Sprite - 12 oz cans	16.25	poland
1130	I00057	Muscle Milk Protein Shake Van. 11oz	24.00	Germany
284	I00059	Muscle Milk Protein Shake Van. 11oz	22.00	Lithuania

	man_country	supplier
1	Bangladesh	DENIMACH LTD
8	Cambodia	NINGBO SEDUNO IMP & EXP CO.LTD
24	China	CHERRY GROUP CO.,LTD
30	Finland	HARDFORD AB
33	Germany	Friedola 1888 GmbH
17	India	Indo Count Industries Ltd
10	Lithuania	BIGSO AB
0	Netherlands	Bolsius Boxmeer
5	United States	MAESA SAS
2	poland	CHROMADURLIN S.A.S

	year	store_key	total_sales
0	2014	S0071	24243.3
1	2015	S0010	26320.1
2	2016	S0049	25037.0
3	2017	S00375	25507.3
4	2018	S00640	25559.6
5	2019	S00246	25335.8
6	2020	S00320	26149.4
7	2021	S00544	2568.1

	year	start_date	end_date
0	2014	2014-01-20 00:00:00	2014-12-31 00:00:00
1	2015	2015-01-01 00:00:00	2015-12-31 00:00:00
2	2016	2016-01-01 00:00:00	2016-12-31 00:00:00
3	2017	2017-01-01 00:00:00	2017-12-31 00:00:00
4	2018	2018-01-01 00:00:00	2018-12-31 00:00:00
5	2019	2019-01-01 00:00:00	2019-12-31 00:00:00
6	2020	2020-01-01 00:00:00	2020-12-31 00:00:00
7	2021	2021-01-01 00:00:00	2021-01-23 00:00:00

	year	item_description	total_quantity_sold	total_sales_amount	q_rnk
0	2014	a. Beverage - Soda	88879	877300.0	1
1	2015	Food - Healthy	94796	1497118.5	1
2	2016	a. Beverage - Soda	93637	924687.8	1
3	2017	a. Beverage - Soda	93508	916345.3	1
4	2018	a. Beverage - Soda	93935	929126.5	1
5	2019	a. Beverage - Soda	94702	933581.8	1
6	2020	a. Beverage - Soda	94988	927722.3	1
7	2021	a. Beverage - Soda	5698	55533.0	1

	year	item_description	total_quantity_sold	total_sales_amount	q_rnk
0	2014	Food - Healthy	88790	1406845.0	1
1	2015	Food - Healthy	94796	1497118.5	1
2	2016	Food - Healthy	92867	1465907.5	1
3	2017	Food - Healthy	92928	1474811.5	1
4	2018	Food - Healthy	93816	1486923.5	1
5	2019	Food - Healthy	93538	1480297.0	1
6	2020	Food - Healthy	93309	1479379.0	1
7	2021	Food - Healthy	5509	87756.0	1

	rfm_segment	count
0	Potential Loyalists	176063
1	Loyal Customers	166242
2	Customers Needing Attention	148114
3	Champions	119888
4	Hibernating	116854
5	At Risk	109725
6	Cant Lose Them	84745
7	Promising	50446
8	Recent Customers	27923

	count
trans_type
cash	25590

Importing Python Libraries¶

Downloading Datasets¶

Joing the Tables¶

Data Cleaning¶

Handling missing values¶

Handling duplicates¶

Handling date columns¶

Outlier treatment¶

Visual Analysis¶

Prelims¶

Discrepancies in the total counts between the item key and item name¶

Relationship between manufacturering country and supplier¶

Funtions for chart formatting¶

Analysis using SQL¶

Connecting to SQL DB¶

Every year highest sales in store¶

Every year highest sold item (quantity and total price)¶

Division wise items sales¶

Number of stores per division¶

Sales in Different Quarters¶

Monthwise sales trend¶

Payment methods¶

Most used bank¶

Customer Segmentation¶

Exploratory Data Analysis¶

Normality test¶

QQ plot¶

Shapiro-Wilk test¶

Data transformation for normalization¶

Kruskal-Wallis test (compare the total sales among different years)¶

Chi-Squared test¶

Payment methods vs customers¶

Items vs customers¶

Items vs customers segment¶

Manufacturing country vs customers¶

Item vs month¶

Items vs quarters¶

Item vs divisions¶

Key Insights & Recommendations¶

1. Sales Trends and Performance¶

2. Best-Selling Products¶

3. Customer Behavior and Segmentation (RFM Analysis)¶

4. Payment Method Analysis¶

5. Bank Preferences¶

6. Customer Preferences Based on Item and Location¶

7. Short-Term Actions¶

8. Long-Term Actions¶