import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from scipy.stats import shapiro, mannwhitneyu, chi2_contingency, boxcox
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
import math
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_validate, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report, confusion_matrix
)
import shap

df = pd.read_csv('ola_driver.csv')

df.shape

(19104, 14)

df.head(5)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19104 entries, 0 to 19103
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            19104 non-null  int64  
 1   MMM-YY                19104 non-null  object 
 2   Driver_ID             19104 non-null  int64  
 3   Age                   19043 non-null  float64
 4   Gender                19052 non-null  float64
 5   City                  19104 non-null  object 
 6   Education_Level       19104 non-null  int64  
 7   Income                19104 non-null  int64  
 8   Dateofjoining         19104 non-null  object 
 9   LastWorkingDate       1616 non-null   object 
 10  Joining Designation   19104 non-null  int64  
 11  Grade                 19104 non-null  int64  
 12  Total Business Value  19104 non-null  int64  
 13  Quarterly Rating      19104 non-null  int64  
dtypes: float64(2), int64(8), object(4)
memory usage: 2.0+ MB

df.drop(columns='Unnamed: 0', inplace=True)

# Converting to date time format
df['Reporting_date'] = pd.to_datetime(df['MMM-YY'], format='%d/%m/%y')
df['Dateofjoining'] = pd.to_datetime(df['Dateofjoining'], format='%d/%m/%y')
df['LastWorkingDate'] = pd.to_datetime(df['LastWorkingDate'], format='%d/%m/%y')

df.drop(columns='MMM-YY', inplace=True)

df.head(5)

df.describe()

df.nunique()

Driver_ID                2381
Age                        36
Gender                      2
City                       29
Education_Level             3
Income                   2383
Dateofjoining             869
LastWorkingDate           493
Joining Designation         5
Grade                       5
Total Business Value    10181
Quarterly Rating            4
Reporting_date             24
dtype: int64

# Percentage of null values
round(df.isna().sum()/len(df) * 100, 2)

Driver_ID                0.00
Age                      0.32
Gender                   0.27
City                     0.00
Education_Level          0.00
Income                   0.00
Dateofjoining            0.00
LastWorkingDate         91.54
Joining Designation      0.00
Grade                    0.00
Total Business Value     0.00
Quarterly Rating         0.00
Reporting_date           0.00
dtype: float64

df['Gender'] = df.groupby('Driver_ID')['Gender'].transform(
    lambda x: x.fillna(x.mode()[0] if not x.mode().empty else x.mean()))

df_num = df.select_dtypes(np.number)
df_num.head()

imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
imputer.fit(df_num)
df_imp = imputer.transform(df_num)
df_imp = pd.DataFrame(df_imp)
df_imp.columns = df_num.columns
df_imp.head()

rem_cols = list(set(df.columns).difference(set(df_imp.columns)))
rem_cols

['Dateofjoining', 'LastWorkingDate', 'City', 'Reporting_date']

df_final = pd.concat([df_imp, df[rem_cols]], axis = 1)
df_final.head()

# Checking the concat is correct or not
df[df["Driver_ID"] == 43]

df_final[df_final["Driver_ID"] == 43]

round(df_final.isna().sum()/len(df_final)*100, 2)

Driver_ID                0.00
Age                      0.00
Gender                   0.00
Education_Level          0.00
Income                   0.00
Joining Designation      0.00
Grade                    0.00
Total Business Value     0.00
Quarterly Rating         0.00
Dateofjoining            0.00
LastWorkingDate         91.54
City                     0.00
Reporting_date           0.00
dtype: float64

function_dict = {'Age':'max', 'Gender':'first','City':'first',
                 'Education_Level':'last', 'Income':'last',
                 'Joining Designation':'last','Grade':'last',
                 'Dateofjoining':'last','LastWorkingDate':'last',
                 'Total Business Value':'sum','Quarterly Rating':'last'}
df_new=df_final.groupby(['Driver_ID','Reporting_date']).aggregate(function_dict)
df_new.reset_index(inplace =True)
df_new.head()

df_new[df_new["Driver_ID"] == 43]

df1 = pd.DataFrame()
df1['Driver_ID']=df_final['Driver_ID'].unique()

#  Sort the DataFrame by 'Driver_ID' and 'Reporting date'
df_new_sorted = df_new.sort_values(['Driver_ID', 'Reporting_date'])

# Perform the aggregations
df1 = df_new_sorted.groupby('Driver_ID').agg({
    'Age': 'last',                      # Latest age per Driver_ID
    'Gender': 'first',                   # Latest gender per Driver_ID
    'City': 'last',                     # Latest city per Driver_ID
    'Education_Level': 'last',          # Latest education level per Driver_ID
    'Income': 'last',                   # Latest income per Driver_ID
    'Joining Designation': 'last',      # Latest joining designation per Driver_ID
    'Grade': 'last',                    # Latest grade per Driver_ID
    'Total Business Value': 'sum',      # Sum of business value per Driver_ID
    'Quarterly Rating': 'last'          # Latest quarterly rating per Driver_ID
}).reset_index()

df1.head()

df1['Gender'].value_counts()

Gender
0.0    1404
1.0     977
Name: count, dtype: int64

df1 = df1.merge(
    df_final.groupby('Driver_ID')['Quarterly Rating']
      .agg(['first', 'last'])
      .assign(Quarterly_Rating_Increased=lambda x: (x['last'] > x['first']).astype(int))
      .reset_index()[['Driver_ID', 'Quarterly_Rating_Increased']],
    on='Driver_ID',
    how='left'
)
df1.head()

# Determine if the last 'LastWorkingDate' is missing (NaN) for each Driver_ID
lwd = df_final.groupby('Driver_ID')['LastWorkingDate'].last().isna().reset_index()
lwd.rename(columns={'LastWorkingDate': 'target'}, inplace=True)
lwd['target'] = (~lwd['target']).astype(int)

lwd.head()

# Merge this information back into df1
df1 = df1.merge(lwd, on='Driver_ID', how='left')
df1.head()

df1['target'].value_counts()

target
1    1616
0     765
Name: count, dtype: int64

inc_ch = df_final.groupby('Driver_ID')['Income'].agg(['first', 'last']).reset_index()
inc_ch['income_change'] = (inc_ch['first'] < inc_ch['last']).astype(int)
inc_ch.sample(5)

df1 = df1.merge(inc_ch[['Driver_ID','income_change']], on='Driver_ID', how='left')
df1.sample(5)

df1.describe()

def outlier(data, col):
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # boolean mask for outliers
    mask = (data[col] < lower_bound) | (data[col] > upper_bound)

    # extract ONLY the outlier values
    outlier_vals = data.loc[mask, col]

    print(f"Outliers count for {col}: {mask.sum()}")
    print(f"\nOutlier values for {col}:\n{list(outlier_vals)}\n")

    return outlier_vals

num_cols = df1.select_dtypes(include=['float', 'int']).columns.tolist()
num_cols

['Driver_ID',
 'Age',
 'Gender',
 'Education_Level',
 'Income',
 'Joining Designation',
 'Grade',
 'Total Business Value',
 'Quarterly Rating',
 'Quarterly_Rating_Increased',
 'target',
 'income_change']

num_cols = ['Age', 'Income', 'Total Business Value']

for col in num_cols:
    outlier(df1, col)

Outliers count for Age: 25

Outlier values for Age:
[52.0, 51.0, 50.0, 51.0, 53.0, 52.0, 52.0, 54.0, 50.0, 51.0, 50.0, 50.0, 52.0, 52.0, 51.0, 52.0, 51.0, 55.0, 50.0, 52.0, 52.0, 58.0, 55.0, 53.0, 51.0]

Outliers count for Income: 48

Outlier values for Income:
[132577.0, 131347.0, 144978.0, 148588.0, 139139.0, 134529.0, 188418.0, 133752.0, 140769.0, 135879.0, 137082.0, 133783.0, 132558.0, 152234.0, 149403.0, 135436.0, 134302.0, 135414.0, 138069.0, 153109.0, 133978.0, 145861.0, 157124.0, 135337.0, 149637.0, 149354.0, 145483.0, 136307.0, 140833.0, 132819.0, 139882.0, 131847.0, 153766.0, 133579.0, 145116.0, 133489.0, 138520.0, 131567.0, 136960.0, 167758.0, 132505.0, 169549.0, 142543.0, 137450.0, 135231.0, 131568.0, 144726.0, 131805.0]

Outliers count for Total Business Value: 336

Outlier values for Total Business Value:
[36351110.0, 69867900.0, 21755910.0, 33823290.0, 11723470.0, 22791310.0, 49225520.0, 12293120.0, 11718580.0, 12543500.0, 27309820.0, 17959920.0, 43531510.0, 36029220.0, 21415440.0, 13121470.0, 12674940.0, 40152720.0, 13939940.0, 24273480.0, 12087570.0, 19866290.0, 10666090.0, 31533700.0, 12182590.0, 11947410.0, 28348090.0, 19169110.0, 33697790.0, 29901450.0, 56839380.0, 58024490.0, 30153680.0, 14717150.0, 11082000.0, 13594350.0, 13191530.0, 35460020.0, 30869270.0, 12205880.0, 19017290.0, 17483970.0, 27458220.0, 26927680.0, 16290800.0, 10732480.0, 26975690.0, 15036790.0, 51680810.0, 25128430.0, 16391700.0, 30318780.0, 22457270.0, 23995570.0, 11593960.0, 10574260.0, 13593340.0, 12922130.0, 11940020.0, 17244320.0, 23376420.0, 13721360.0, 35969490.0, 20915380.0, 41826510.0, 41667690.0, 29530380.0, 19649270.0, 16475160.0, 26819450.0, 18872890.0, 23650380.0, 20011620.0, 54792310.0, 16118660.0, 34707890.0, 39090190.0, 22269520.0, 13697710.0, 13234050.0, 36389370.0, 34913070.0, 23542730.0, 11318750.0, 23441990.0, 34007130.0, 17467630.0, 36806180.0, 38028190.0, 34495110.0, 15440080.0, 17134040.0, 28562340.0, 13627070.0, 15298690.0, 32819830.0, 20976830.0, 17369610.0, 17866060.0, 10697010.0, 10493340.0, 13197400.0, 12072740.0, 12941580.0, 23230860.0, 25943780.0, 18653790.0, 25951520.0, 22388420.0, 13942630.0, 15626920.0, 30755460.0, 16187110.0, 41669570.0, 24743820.0, 37135780.0, 26646720.0, 18248240.0, 10479300.0, 14899610.0, 14676410.0, 12132870.0, 14298980.0, 50382490.0, 10885280.0, 18472930.0, 17132800.0, 20284600.0, 41660820.0, 14970770.0, 19409190.0, 11622640.0, 25672950.0, 23381570.0, 29252710.0, 17287180.0, 53807450.0, 31372160.0, 35362450.0, 18340790.0, 15938480.0, 15537200.0, 12662250.0, 43100100.0, 14593130.0, 12219890.0, 57159050.0, 25133230.0, 27355810.0, 20792700.0, 29562230.0, 10702790.0, 34630160.0, 50986970.0, 37833950.0, 33998640.0, 12476730.0, 24857720.0, 23471030.0, 11686450.0, 12628580.0, 12707450.0, 17620700.0, 23297720.0, 20941450.0, 25624290.0, 16090900.0, 37693910.0, 19778240.0, 13715180.0, 18851330.0, 26696700.0, 47124880.0, 29181100.0, 17624660.0, 12796500.0, 14026730.0, 10990300.0, 21660330.0, 19301010.0, 17967580.0, 19559860.0, 12423580.0, 22800020.0, 12641110.0, 14313890.0, 60153830.0, 11769770.0, 15329900.0, 28842760.0, 16431480.0, 21404140.0, 21851100.0, 25121990.0, 10734200.0, 21717750.0, 14577970.0, 26073780.0, 30735660.0, 14723750.0, 18180000.0, 13366510.0, 10855850.0, 21356700.0, 14401180.0, 11165310.0, 33709640.0, 16483690.0, 34374620.0, 10735670.0, 18027620.0, 15570900.0, 20702280.0, 21029470.0, 32766560.0, 14208530.0, 29036280.0, 11614440.0, 20351490.0, 26818060.0, 17160130.0, 29146990.0, 17454920.0, 27099710.0, 52526160.0, 20918330.0, 25757010.0, 13626550.0, 16585750.0, 13395500.0, 16747520.0, 24461920.0, 26824210.0, 36239210.0, 21565150.0, 17600760.0, 16924970.0, 11244630.0, 24294670.0, 19132220.0, 29736510.0, 17504850.0, 20187150.0, 56969020.0, 59696450.0, 20041520.0, 11207560.0, 10804780.0, 13590220.0, 13936770.0, 18036100.0, 21746880.0, 13363600.0, 16342310.0, 28251450.0, 35896130.0, 10752230.0, 29040450.0, 66352800.0, 20412560.0, 21687570.0, 19504080.0, 13233470.0, 19422760.0, 36398090.0, 18076460.0, 13170380.0, 13468220.0, 15313810.0, 44861920.0, 23264700.0, 11611790.0, 22993970.0, 16350440.0, 52296540.0, 59380670.0, 13233930.0, 28596320.0, 15539040.0, 18608140.0, 14703740.0, 11742390.0, 30668490.0, 22212220.0, 12342600.0, 48017720.0, 37052330.0, 18059400.0, 22037490.0, 12943640.0, 11532320.0, 33738740.0, 17386610.0, 15457560.0, 12708940.0, 17784050.0, 23959960.0, 33139380.0, 95331060.0, 13977500.0, 34703880.0, 33882080.0, 10878820.0, 22155600.0, 11509380.0, 11155940.0, 33054410.0, 27474930.0, 43275060.0, 20222360.0, 17286750.0, 10913820.0, 15672850.0, 22976810.0, 13985380.0, 42883990.0, 23114450.0, 22738660.0, 11868520.0, 21260010.0, 22183450.0, 30121900.0, 11417820.0, 11666740.0, 29475240.0, 15075460.0, 21645050.0, 19356940.0, 14852170.0, 18503930.0, 12580670.0, 20843460.0, 61583040.0, 25164110.0, 19597020.0, 21748820.0]

plt.figure(figsize=(14, 8))

plt.subplot(1, 3, 1)
sns.boxplot(data=df1, y='Age')
plt.subplot(1, 3, 2)
sns.boxplot(data=df1, y='Income')
plt.subplot(1, 3, 3)
sns.boxplot(data=df1, y='Total Business Value')

plt.show()

df1_pcorr = df1.select_dtypes(include=['float', 'int']).corr(method='pearson')

plt.figure(figsize=(9, 6))
sns.heatmap(data=df1_pcorr, cmap ='viridis', annot=True, fmt=".2f", center=0)
plt.show()

sns.histplot(data=df1, x='Age', kde=True, palette='tab10')
plt.show()

def bp(dt, col1, col2, xtick_labels=None):
    cnt = dt.groupby(col1)[col2].value_counts().unstack().fillna(0)
    cnt = cnt.rename(columns={0: "Not Churned", 1: "Churned"})
    custom_colors = {
        "Not Churned": "#90EE90",   # light green
        "Churned": "#FF9999"        # light red
    }

    plt.figure(figsize=(6,12))
    ax1 = cnt.plot(kind='bar', stacked=True, figsize=(7,5), color=[custom_colors[col] for col in cnt.columns])

    # Add labels on each stacked segment
    for c in ax1.containers:
        for bar in c:
            height = bar.get_height()
            if height > 20:   # show only if bar tall enough
                ax1.text(
                    bar.get_x() + bar.get_width() / 2,
                    bar.get_y() + height / 2,
                    f"{int(height)}",
                    ha='center', va='center'
                )

    # Custom xticks if mapping is provided
    if xtick_labels:
        new_labels = [xtick_labels.get(x, x) for x in cnt.index]
        ax1.set_xticklabels(new_labels)

    plt.title(f"{col1} vs Churn")
    plt.xlabel(None)
    plt.ylabel("Count")
    plt.xticks(rotation=0)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.show()

bins = [18, 30, 45, 60]
labels = ['youngester', 'middle_age', 'elders']

df1['age_group'] = pd.cut(df1['Age'], bins=bins, labels=labels)
df1['age_group'].value_counts()

age_group
middle_age    1521
youngester     764
elders          96
Name: count, dtype: int64

def plot_churn_stacked_bars(
    df,
    cols,
    target='target',
    label_maps=None,   
    ncols=2,
    figsize=(14, 4)
):
    
    color_map = {
        "Not Churned": "#90EE90",   # light green
        "Churned": "#FF9999"        # light red
    }

    nrows = math.ceil(len(cols) / ncols)
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, squeeze=False)

    for idx, col in enumerate(cols):
        r = idx // ncols
        c = idx % ncols
        ax = axes[r, c]

        
        cnt = df.groupby(col)[target].value_counts().unstack().fillna(0)

        
        if 0 not in cnt.columns:
            cnt[0] = 0
        if 1 not in cnt.columns:
            cnt[1] = 0
        cnt = cnt[[0, 1]]

        
        cnt = cnt.rename(columns={0: "Not Churned", 1: "Churned"})

        
        perc = cnt.div(cnt.sum(axis=1), axis=0) * 100

        
        perc.plot(
            kind='bar',
            stacked=True,
            ax=ax,
            color=[color_map[col_name] for col_name in perc.columns]
        )

        
        for container in ax.containers:
            ax.bar_label(container, fmt="%.1f%%", label_type='center', fontsize=8)

        
        if label_maps and col in label_maps:
            mapping = label_maps[col]
            new_labels = [mapping.get(x, x) for x in perc.index]
            ax.set_xticklabels(new_labels, rotation=0)
        else:
            ax.set_xticklabels(perc.index, rotation=0)

        ax.set_title(f"{col} vs Churn (%)")
        ax.set_xlabel(None)
        ax.set_ylabel(None)
        ax.set_ylim(0, 100)

        
        if idx == 0:
            ax.legend(title="Churn Status", loc='center left', bbox_to_anchor=(1, 0.5))
        else:
            ax.legend_.remove()

    
    total_axes = nrows * ncols
    for j in range(len(cols), total_axes):
        r = j // ncols
        c = j % ncols
        fig.delaxes(axes[r, c])

    fig.tight_layout()
    plt.show()

label_maps = {
    'Education_Level': {0: 'Secondary School', 1: 'Higher secondary', 2: 'Graduate'},
    'income_change': {0: 'Not increased', 1: 'Increased'},
    'Gender': {0: 'Male', 1: 'Female'},
    'Quarterly_Rating_Increased': {0: 'Not increased', 1: 'Increased'}
}

lst_cols = ['Gender', 'Education_Level', 'Grade', 'income_change', 
            'Quarterly_Rating_Increased', 'Joining Designation', 'age_group']

plot_churn_stacked_bars(
    df=df1,
    cols=lst_cols,
    target='target',          
    label_maps=label_maps,
    ncols=2,                  
    figsize=(14, 10)         
)

custom_colors = {'0': "#90EE90", '1': "#FF9999"}

plt.figure(figsize=(10,6))

plt.subplot(1, 2, 1)
sns.boxplot(data=df1, x='target', y='Total Business Value', palette=custom_colors)
plt.xlabel(None)
plt.title("Churn vs Total Business value")
plt.xticks([0, 1], ['Not Churn', 'Churn'])

plt.subplot(1, 2, 2)
sns.boxplot(data=df1, x='target', y='Income', palette=custom_colors)
plt.xlabel(None)
plt.title("Churn vs Income")
plt.xticks([0, 1], ['Not Churn', 'Churn'])

plt.tight_layout()
plt.show()

def annotate_box_and_summary(ax, data, x_col, y_col, title):
    unique_x = sorted(data[x_col].unique())
    summary_lines = []  # for building the text box content

    for x_val in unique_x:
        subset = data[data[x_col] == x_val][y_col]

        mean_val = subset.mean()
        q1 = subset.quantile(0.25)
        q3 = subset.quantile(0.75)

        x_pos = unique_x.index(x_val)

        # Annotate values on top of the boxplot
        ax.text(x_pos, mean_val, f"Mean: {mean_val:,.0f}", 
                ha='center', va='bottom', fontsize=9, color='blue')
        ax.text(x_pos, q1, f"Q1: {q1:,.0f}", 
                ha='center', va='bottom', fontsize=9, color='purple')
        ax.text(x_pos, q3, f"Q3: {q3:,.0f}", 
                ha='center', va='bottom', fontsize=9, color='brown')

        # Build summary lines
        category_name = "Not Churn" if x_val == 0 else "Churn"
        summary_lines.append(
            f"{category_name}:\n"
            f" • Mean = {mean_val:,.0f}\n"
            f" • Q1 (25%) = {q1:,.0f}\n"
            f" • Q3 (75%) = {q3:,.0f}\n"
        )

    # Add the summary text box to the right side
    text_box = "\n".join(summary_lines)
    ax.text(
        1.15, 0.5, text_box,
        transform=ax.transAxes,
        fontsize=10,
        verticalalignment='center',
        bbox=dict(facecolor='white', edgecolor='black', alpha=0.8)
    )

    ax.set_title(title)
    ax.set_xlabel("")
    ax.set_ylabel("")

plt.figure(figsize=(16,6))


plt.subplot(1, 2, 1)
ax1 = sns.boxplot(
    data=df1, x='target', y='Total Business Value', palette=custom_colors
)
plt.xticks([0, 1], ['Not Churn', 'Churn'])
annotate_box_and_summary(ax1, df1, 'target', 'Total Business Value',
                         "Churn vs Total Business Value")


plt.subplot(1, 2, 2)
ax2 = sns.boxplot(
    data=df1, x='target', y='Income', palette=custom_colors
)
plt.xticks([0, 1], ['Not Churn', 'Churn'])
annotate_box_and_summary(ax2, df1, 'target', 'Income',
                         "Churn vs Income")

plt.tight_layout()
plt.show()

le_city = LabelEncoder()
df1['City_LE'] = le_city.fit_transform(df1['City'])

city_mapping = dict(zip(le_city.classes_, le_city.transform(le_city.classes_)))
print("City Mapping:", city_mapping)

City Mapping: {'C1': np.int64(0), 'C10': np.int64(1), 'C11': np.int64(2), 'C12': np.int64(3), 'C13': np.int64(4), 'C14': np.int64(5), 'C15': np.int64(6), 'C16': np.int64(7), 'C17': np.int64(8), 'C18': np.int64(9), 'C19': np.int64(10), 'C2': np.int64(11), 'C20': np.int64(12), 'C21': np.int64(13), 'C22': np.int64(14), 'C23': np.int64(15), 'C24': np.int64(16), 'C25': np.int64(17), 'C26': np.int64(18), 'C27': np.int64(19), 'C28': np.int64(20), 'C29': np.int64(21), 'C3': np.int64(22), 'C4': np.int64(23), 'C5': np.int64(24), 'C6': np.int64(25), 'C7': np.int64(26), 'C8': np.int64(27), 'C9': np.int64(28)}

norm_cols = ['Age', 'Income', 'Total Business Value']
for col in norm_cols:
    stats, p =shapiro(df1[col])
    if p < 0.05:
        print(f"{col} is not normally distributed")
    else:
        print(f"{col} is normally distributed")

Age is not normally distributed
Income is not normally distributed
Total Business Value is not normally distributed

df1['box_age'], fitted_lambda =  boxcox(df1['Age'])
df1['log_income'] = np.log(df1['Income'])
df1['log_bv'] = np.log1p(df1['Total Business Value'])

log_cols = ['box_age', 'log_income', 'log_bv']

for col in log_cols:
    stats, p =shapiro(df1[col])
    if p < 0.05:
        print(f"{col} is not normally distributed")
    else:
        print(f"{col} is normally distributed")

box_age is not normally distributed
log_income is not normally distributed
log_bv is normally distributed

def numerical_test(df, num_col, target='target'):
    group0 = df[df[target] == 0][num_col]
    group1 = df[df[target] == 1][num_col]
    
    stat, p = mannwhitneyu(group0, group1, alternative='two-sided')
    interpretation = "Significant" if p < 0.05 else "Not Significant"
    return {"Feature": num_col, "Test": "Mann-Whitney U", "Statistic": stat, "p-Value": p, "Interpretation": interpretation}

def categorical_test(df, cat_col, target='target'):
    table = pd.crosstab(df[cat_col], df[target])
    chi2, p, dof, expected = chi2_contingency(table)
    interpretation = "Significant" if p < 0.05 else "Not Significant"
    return {"Feature": cat_col, "Test": "Chi-square", "Statistic": chi2, "p-Value": p, "Interpretation": interpretation}

numerical_cols = ['Age', 'Income', 'Total Business Value']
ordinal_cols = ['Education_Level', 'Joining Designation', 'Grade', 'Quarterly Rating']
categorical_cols = ['Gender', 'income_change', 'Quarterly_Rating_Increased', 'City_LE']

results = []

for col in numerical_cols:
    results.append(numerical_test(df1, col))

for col in ordinal_cols:
    results.append(numerical_test(df1, col))

for col in categorical_cols:
    results.append(categorical_test(df1, col))

pd.DataFrame(results)

X = df1.drop(columns=['Driver_ID', 'target', 'age_group', 'City', 'log_income', 'log_bv', 'box_age'])
y = df1['target']

scaler = MinMaxScaler()
scaler.fit(X)

X_scaled = scaler.fit_transform(X)

X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.4, random_state=42, stratify=y)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

plt.figure(figsize=(4,6))

ax = sns.countplot(
    data=df1,
    x='target',
    palette={'0': "#90EE90", '1': "#FF9999"}
)

plt.xticks([0, 1], ['Not churn', 'Churn'])
plt.title("Overall churn rate")
plt.xlabel(None)
plt.ylabel(None)

total = len(df1)

for p in ax.patches:
    count = p.get_height()
    percentage = 100 * count / total
    x_pos = p.get_x() + p.get_width() / 2
    ax.text(
        x_pos,
        count + total * 0.01,    
        f"{percentage:.1f}%",
        ha='center',
        fontsize=12
    )

plt.show()

sm = SMOTE(random_state=42, k_neighbors=5)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

def eval(yt, yp):
    print(f"Recall score: {round(recall_score(yt, yp), 3)}")
    print(f"Precision score: {round(precision_score(yt, yp), 3)}")
    print(f"F1 score: {round(f1_score(yt, yp), 3)}")
    print(f"ROC_AUC score: {round(roc_auc_score(yt, yp), 3)}")
    print(f"Accuracy: {round(accuracy_score(yt, yp), 3)}")
    print(f"Classification report: {classification_report(yt, yp)}")
    print(f"Confusion Matrix: {confusion_matrix (yt, yp)}")

def make_pipeline(model, scale=False):
    steps = []
    if scale:
        steps.append(('scaler', StandardScaler()))
    steps.append(('model', model))
    return Pipeline(steps=steps)

models = {
    "Logistic Regression": make_pipeline(
        LogisticRegression(max_iter=500, class_weight='balanced'), scale=True
    ),
    "SVM (RBF)": make_pipeline(
        SVC(kernel='rbf', probability=True, class_weight='balanced'), scale=True
    ),
    "Decision Tree": make_pipeline(
        DecisionTreeClassifier(
            max_depth=6, min_samples_leaf=50, class_weight='balanced', random_state=42
        ), scale=False
    ),
    "Random Forest": make_pipeline(
        RandomForestClassifier(
            n_estimators=300, min_samples_leaf=20,
            n_jobs=-1, class_weight='balanced', random_state=42
        ), scale=False
    ),
    "GBDT (sklearn)": make_pipeline(
        GradientBoostingClassifier(
            n_estimators=300, learning_rate=0.05, max_depth=3, random_state=42
        ), scale=False
    ),
    "XGBoost": make_pipeline(
        XGBClassifier(
            n_estimators=300, learning_rate=0.05, max_depth=4,
            subsample=0.8, colsample_bytree=0.8,
            objective='binary:logistic', eval_metric='logloss',
            n_jobs=-1, random_state=42
        ), scale=False
    ),
    "LightGBM": make_pipeline(
        LGBMClassifier(
            n_estimators=300, learning_rate=0.05,
            num_leaves=31, subsample=0.8, colsample_bytree=0.8,
            class_weight='balanced', random_state=42
        ), scale=False
    ),
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

results = []

for name, pipe in models.items():
    print("="*80)
    print(name)

    cv_res = cross_validate(
    pipe,
    X_train, y_train,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
    return_train_score=False
    )

    print("CV Accuracy:  ", cv_res['test_accuracy'].mean().round(3))
    print("CV Precision: ", cv_res['test_precision'].mean().round(3))
    print("CV Recall:    ", cv_res['test_recall'].mean().round(3))
    print("CV F1:        ", cv_res['test_f1'].mean().round(3))
    print("CV ROC_AUC:   ", cv_res['test_roc_auc'].mean().round(3))

    
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    if hasattr(pipe.named_steps['model'], "predict_proba"):
        y_proba = pipe.predict_proba(X_test)[:, 1]
        test_auc = roc_auc_score(y_test, y_proba)
    else:
        y_proba = None
        test_auc = np.nan

    test_acc = accuracy_score(y_test, y_pred)
    test_prec = precision_score(y_test, y_pred)
    test_rec = recall_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred)

    print("\nTEST metrics:")
    print("  Accuracy :", round(test_acc, 3))
    print("  Precision:", round(test_prec, 3))
    print("  Recall   :", round(test_rec, 3))
    print("  F1-score :", round(test_f1, 3))
    print("  ROC_AUC  :", round(test_auc, 3))

    print("\nClassification report:\n", classification_report(y_test, y_pred))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

    results.append({
        "Model": name,
        "CV_Accuracy": cv_res['test_accuracy'].mean(),
        "CV_Precision": cv_res['test_precision'].mean(),
        "CV_Recall": cv_res['test_recall'].mean(),
        "CV_F1": cv_res['test_f1'].mean(),
        "CV_ROC_AUC": cv_res['test_roc_auc'].mean(),
        "Test_Accuracy": test_acc,
        "Test_Precision": test_prec,
        "Test_Recall": test_rec,
        "Test_F1": test_f1,
        "Test_ROC_AUC": test_auc,
    })

results_df = pd.DataFrame(results)
results_df.sort_values("Test_Recall", ascending=False)

================================================================================
Logistic Regression
CV Accuracy:   0.776
CV Precision:  0.821
CV Recall:     0.858
CV F1:         0.839
CV ROC_AUC:    0.801

TEST metrics:
  Accuracy : 0.792
  Precision: 0.84
  Recall   : 0.858
  F1-score : 0.849
  ROC_AUC  : 0.837

Classification report:
               precision    recall  f1-score   support

           0       0.68      0.65      0.67       153
           1       0.84      0.86      0.85       324

    accuracy                           0.79       477
   macro avg       0.76      0.76      0.76       477
weighted avg       0.79      0.79      0.79       477

Confusion matrix:
 [[100  53]
 [ 46 278]]
================================================================================
SVM (RBF)
CV Accuracy:   0.784
CV Precision:  0.83
CV Recall:     0.859
CV F1:         0.844
CV ROC_AUC:    0.807

TEST metrics:
  Accuracy : 0.788
  Precision: 0.831
  Recall   : 0.864
  F1-score : 0.847
  ROC_AUC  : 0.808

Classification report:
               precision    recall  f1-score   support

           0       0.69      0.63      0.66       153
           1       0.83      0.86      0.85       324

    accuracy                           0.79       477
   macro avg       0.76      0.75      0.75       477
weighted avg       0.78      0.79      0.79       477

Confusion matrix:
 [[ 96  57]
 [ 44 280]]
================================================================================
Decision Tree
CV Accuracy:   0.733
CV Precision:  0.828
CV Recall:     0.767
CV F1:         0.795
CV ROC_AUC:    0.782

TEST metrics:
  Accuracy : 0.778
  Precision: 0.854
  Recall   : 0.812
  F1-score : 0.832
  ROC_AUC  : 0.848

Classification report:
               precision    recall  f1-score   support

           0       0.64      0.71      0.67       153
           1       0.85      0.81      0.83       324

    accuracy                           0.78       477
   macro avg       0.75      0.76      0.75       477
weighted avg       0.78      0.78      0.78       477

Confusion matrix:
 [[108  45]
 [ 61 263]]
================================================================================
Random Forest
CV Accuracy:   0.772
CV Precision:  0.827
CV Recall:     0.84
CV F1:         0.833
CV ROC_AUC:    0.814

TEST metrics:
  Accuracy : 0.797
  Precision: 0.845
  Recall   : 0.858
  F1-score : 0.851
  ROC_AUC  : 0.84

Classification report:
               precision    recall  f1-score   support

           0       0.69      0.67      0.68       153
           1       0.84      0.86      0.85       324

    accuracy                           0.80       477
   macro avg       0.77      0.76      0.76       477
weighted avg       0.80      0.80      0.80       477

Confusion matrix:
 [[102  51]
 [ 46 278]]
================================================================================
GBDT (sklearn)
CV Accuracy:   0.784
CV Precision:  0.798
CV Recall:     0.912
CV F1:         0.851
CV ROC_AUC:    0.817

TEST metrics:
  Accuracy : 0.822
  Precision: 0.833
  Recall   : 0.923
  F1-score : 0.876
  ROC_AUC  : 0.848

Classification report:
               precision    recall  f1-score   support

           0       0.79      0.61      0.69       153
           1       0.83      0.92      0.88       324

    accuracy                           0.82       477
   macro avg       0.81      0.77      0.78       477
weighted avg       0.82      0.82      0.81       477

Confusion matrix:
 [[ 93  60]
 [ 25 299]]
================================================================================
XGBoost
CV Accuracy:   0.777
CV Precision:  0.797
CV Recall:     0.902
CV F1:         0.846
CV ROC_AUC:    0.807

TEST metrics:
  Accuracy : 0.82
  Precision: 0.832
  Recall   : 0.92
  F1-score : 0.874
  ROC_AUC  : 0.837

Classification report:
               precision    recall  f1-score   support

           0       0.78      0.61      0.68       153
           1       0.83      0.92      0.87       324

    accuracy                           0.82       477
   macro avg       0.81      0.76      0.78       477
weighted avg       0.82      0.82      0.81       477

Confusion matrix:
 [[ 93  60]
 [ 26 298]]
================================================================================
LightGBM
CV Accuracy:   0.767
CV Precision:  0.816
CV Recall:     0.849
CV F1:         0.832
CV ROC_AUC:    0.785
[LightGBM] [Info] Number of positive: 969, number of negative: 459
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000106 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 594
[LightGBM] [Info] Number of data points in the train set: 1428, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000

TEST metrics:
  Accuracy : 0.78
  Precision: 0.835
  Recall   : 0.843
  F1-score : 0.839
  ROC_AUC  : 0.828

Classification report:
               precision    recall  f1-score   support

           0       0.66      0.65      0.65       153
           1       0.83      0.84      0.84       324

    accuracy                           0.78       477
   macro avg       0.75      0.74      0.75       477
weighted avg       0.78      0.78      0.78       477

Confusion matrix:
 [[ 99  54]
 [ 51 273]]

rf_base = RandomForestClassifier(
        n_estimators=300,
        min_samples_leaf=20,
        n_jobs=-1,
        class_weight=None,
        random_state=42
    )

gbdt_base = GradientBoostingClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        random_state=42
    )

xgb_base = XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        eval_metric='logloss',
        n_jobs=-1,
        random_state=42
    )

lgbm_base = LGBMClassifier(
        n_estimators=300,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        class_weight=None,
        random_state=42
    )

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf_param_dist = {
    'n_estimators': [200, 300, 400, 500],
    'max_depth': [None, 5, 8, 12],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [5, 10, 20],
    'max_features': ['sqrt', 'log2', 0.5]
}

rf_search = RandomizedSearchCV(
    rf_base,
    param_distributions=rf_param_dist,
    n_iter=25,
    scoring='recall',
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1
)

rf_search.fit(X_train_sm, y_train_sm)
print("Best RF params:", rf_search.best_params_)
print("Best RF CV Recall:", rf_search.best_score_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best RF params: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 5}
Best RF CV Recall: 0.8771913893488594

xgb_param_dist = {
    'n_estimators': [200, 300, 400],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.3]
}

xgb_search = RandomizedSearchCV(
    xgb_base,
    param_distributions=xgb_param_dist,
    n_iter=30,
    scoring='recall',
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1
)

xgb_search.fit(X_train_sm, y_train_sm)
print("Best XGB params:", xgb_search.best_params_)
print("Best XGB CV Recall:", xgb_search.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best XGB params: {'subsample': 1.0, 'n_estimators': 400, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.8}
Best XGB CV Recall: 0.8596709577479835

lgbm_param_dist = {
    'n_estimators': [200, 300, 400],
    'num_leaves': [31, 50, 70],
    'max_depth': [-1, 5, 8, 12],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'min_child_samples': [10, 20, 50],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_lambda': [0, 0.5, 1.0]
}

lgbm_search = RandomizedSearchCV(
    lgbm_base,
    param_distributions=lgbm_param_dist,
    n_iter=30,
    scoring='recall',
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1
)

lgbm_search.fit(X_train_sm, y_train_sm)
print("Best LGBM params:", lgbm_search.best_params_)
print("Best LGBM CV Recall:", lgbm_search.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LightGBM] [Info] Number of positive: 969, number of negative: 969
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000591 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1169
[LightGBM] [Info] Number of data points in the train set: 1938, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Best LGBM params: {'subsample': 0.8, 'reg_lambda': 0.5, 'num_leaves': 70, 'n_estimators': 300, 'min_child_samples': 10, 'max_depth': 5, 'learning_rate': 0.03, 'colsample_bytree': 0.6}
Best LGBM CV Recall: 0.8545163185727258

rf_final = RandomForestClassifier(
    n_estimators=300,
    max_depth=3,
    class_weight=None,      
    random_state=42,
    n_jobs=-1
)


rf_final.fit(X_train_sm, y_train_sm)
y_pred = rf_final.predict(X_test)

eval(y_test, y_pred)

Recall score: 0.889
Precision score: 0.84
F1 score: 0.864
ROC_AUC score: 0.765
Accuracy: 0.809
Classification report:               precision    recall  f1-score   support

           0       0.73      0.64      0.68       153
           1       0.84      0.89      0.86       324

    accuracy                           0.81       477
   macro avg       0.79      0.76      0.77       477
weighted avg       0.80      0.81      0.81       477

Confusion Matrix: [[ 98  55]
 [ 36 288]]

importances = rf_final.feature_importances_

cols = X.columns

imp = pd.DataFrame({'Feature': cols, 'Importances':importances})
imp.sort_values(by='Importances', ascending=False)

# Let remove the low importances features
X = X.drop(columns=['income_change', 'City_LE', 'Gender', 'Education_Level'])

X_scaled=scaler.fit_transform(X)

X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.4, random_state=42, stratify=y)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

sm = SMOTE(sampling_strategy=0.6, random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

lr = LogisticRegression()

lr.fit(X_train_sm, y_train_sm)

y_pred = lr.predict(X_test)

print("Scores for Logistic Regression model")
eval(y_test, y_pred)

Scores for Logistic Regression model
Recall score: 0.923
Precision score: 0.824
F1 score: 0.87
ROC_AUC score: 0.752
Accuracy: 0.813
Classification report:               precision    recall  f1-score   support

           0       0.78      0.58      0.67       153
           1       0.82      0.92      0.87       324

    accuracy                           0.81       477
   macro avg       0.80      0.75      0.77       477
weighted avg       0.81      0.81      0.81       477

Confusion Matrix: [[ 89  64]
 [ 25 299]]

rf_final = RandomForestClassifier(
    n_estimators=300,
    max_depth=6,
    class_weight=None,      
    random_state=42,
    n_jobs=-1
)

rf_final.fit(X_train_sm, y_train_sm)
y_pred = rf_final.predict(X_test)

print("Scores for RF model")
eval(y_test, y_pred)

Scores for RF model
Recall score: 0.929
Precision score: 0.834
F1 score: 0.879
ROC_AUC score: 0.768
Accuracy: 0.826
Classification report:               precision    recall  f1-score   support

           0       0.80      0.61      0.69       153
           1       0.83      0.93      0.88       324

    accuracy                           0.83       477
   macro avg       0.82      0.77      0.79       477
weighted avg       0.82      0.83      0.82       477

Confusion Matrix: [[ 93  60]
 [ 23 301]]

gbdt_final = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.02,
        max_depth=3,
        random_state=42
    )

gbdt_final.fit(X_train_sm, y_train_sm)
y_pred = gbdt_final.predict(X_test)

print("Scores for GBDT model")
eval(y_test, y_pred)

Scores for GBDT model
Recall score: 0.935
Precision score: 0.842
F1 score: 0.886
ROC_AUC score: 0.781
Accuracy: 0.836
Classification report:               precision    recall  f1-score   support

           0       0.82      0.63      0.71       153
           1       0.84      0.94      0.89       324

    accuracy                           0.84       477
   macro avg       0.83      0.78      0.80       477
weighted avg       0.83      0.84      0.83       477

Confusion Matrix: [[ 96  57]
 [ 21 303]]

xgb_final = XGBClassifier(
        n_estimators=200,
        learning_rate=0.02,
        max_depth=3,
        subsample=0.7,
        colsample_bytree=0.8,
        objective='binary:logistic',
        eval_metric='logloss',
        n_jobs=-1,
        random_state=42
    )

xgb_final.fit(X_train_sm, y_train_sm)
y_pred = xgb_final.predict(X_test)

print("Scores for XGB model")
eval(y_test, y_pred)

Scores for XGB model
Recall score: 0.929
Precision score: 0.836
F1 score: 0.88
ROC_AUC score: 0.772
Accuracy: 0.828
Classification report:               precision    recall  f1-score   support

           0       0.80      0.61      0.70       153
           1       0.84      0.93      0.88       324

    accuracy                           0.83       477
   macro avg       0.82      0.77      0.79       477
weighted avg       0.83      0.83      0.82       477

Confusion Matrix: [[ 94  59]
 [ 23 301]]

lgbm_final = LGBMClassifier(
        n_estimators=200,
        learning_rate=0.01,
        num_leaves=31,
        subsample=0.9,
        colsample_bytree=0.8,
        class_weight=None,
        max_depth=5,
        random_state=42
    )

lgbm_final.fit(X_train_sm, y_train_sm)
y_pred = lgbm_final.predict(X_test)

print("Scores for LGBM model")
eval(y_test, y_pred)

[LightGBM] [Info] Number of positive: 969, number of negative: 581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000213 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 587
[LightGBM] [Info] Number of data points in the train set: 1550, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.625161 -> initscore=0.511514
[LightGBM] [Info] Start training from score 0.511514
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Scores for LGBM model
Recall score: 0.926
Precision score: 0.84
F1 score: 0.881
ROC_AUC score: 0.777
Accuracy: 0.83
Classification report:               precision    recall  f1-score   support

           0       0.80      0.63      0.70       153
           1       0.84      0.93      0.88       324

    accuracy                           0.83       477
   macro avg       0.82      0.78      0.79       477
weighted avg       0.83      0.83      0.82       477

Confusion Matrix: [[ 96  57]
 [ 24 300]]

y_pred_val = gbdt_final.predict(X_val)

print("Validation scores for GBDT:")
eval(y_val, y_pred_val)

Validation scores for GBDT:
Recall score: 0.938
Precision score: 0.837
F1 score: 0.885
ROC_AUC score: 0.776
Accuracy: 0.834
Classification report:               precision    recall  f1-score   support

           0       0.82      0.61      0.70       153
           1       0.84      0.94      0.88       323

    accuracy                           0.83       476
   macro avg       0.83      0.78      0.79       476
weighted avg       0.83      0.83      0.83       476

Confusion Matrix: [[ 94  59]
 [ 20 303]]

feature_names = X.columns

X_val_df = pd.DataFrame(X_val, columns=feature_names)

shap.initjs()

explainer = shap.TreeExplainer(gbdt_final)

X_val_sample = X_val_df.sample(min(300, len(X_val_df)), random_state=42)

shap_values = explainer.shap_values(X_val_sample)
shap_values.shape

(300, 7)

shap.summary_plot(
    shap_values,
    X_val_sample,
    plot_type="bar",  
    feature_names=feature_names
)

shap.summary_plot(
    shap_values,
    X_val_sample,
    feature_names=feature_names
)

mean_abs_shap = np.abs(shap_values).mean(axis=0)

importance_df = (
    pd.DataFrame({
        "Feature": feature_names,
        "MeanAbsSHAP": mean_abs_shap
    })
    .sort_values("MeanAbsSHAP", ascending=False)
    .reset_index(drop=True)
)

print(importance_df)

                      Feature  MeanAbsSHAP
0            Quarterly Rating     0.790650
1        Total Business Value     0.363237
2         Joining Designation     0.282037
3  Quarterly_Rating_Increased     0.105378
4                         Age     0.095511
5                       Grade     0.043770
6                      Income     0.042792

	Driver_ID	Age	Gender	Education_Level	Income	Dateofjoining	LastWorkingDate	Joining Designation	Grade	Total Business Value	Quarterly Rating	Reporting_date
count	19104.000000	19043.000000	19052.000000	19104.000000	19104.000000	19104	1616	19104.000000	19104.000000	1.910400e+04	19104.000000	19104
mean	1415.591133	34.668435	0.418749	1.021671	65652.025126	2018-04-20 21:49:17.788944896	2019-12-26 23:22:34.455445760	1.690536	2.252670	5.716621e+05	2.008899	2019-07-04 22:36:06.331658240
min	1.000000	21.000000	0.000000	0.000000	10747.000000	2013-01-04 00:00:00	2018-12-31 00:00:00	1.000000	1.000000	-6.000000e+06	1.000000	2019-01-01 00:00:00
25%	710.000000	30.000000	0.000000	0.000000	42383.000000	2016-11-28 00:00:00	2019-06-10 00:00:00	1.000000	1.000000	0.000000e+00	1.000000	2019-01-06 00:00:00
50%	1417.000000	34.000000	0.000000	1.000000	60087.000000	2018-09-19 00:00:00	2019-12-20 12:00:00	1.000000	2.000000	2.500000e+05	2.000000	2019-01-12 00:00:00
75%	2137.000000	39.000000	1.000000	2.000000	83969.000000	2019-10-25 00:00:00	2020-07-14 00:00:00	2.000000	3.000000	6.997000e+05	3.000000	2020-01-07 00:00:00
max	2788.000000	58.000000	1.000000	2.000000	188418.000000	2020-12-28 00:00:00	2020-12-28 00:00:00	5.000000	5.000000	3.374772e+07	4.000000	2020-01-12 00:00:00
std	810.705321	6.257912	0.493367	0.800167	30914.515344	NaN	NaN	0.836984	1.026512	1.128312e+06	1.009832	NaN

	Driver_ID	Age	Gender	Education_Level	Income	Joining Designation	Grade	Total Business Value	Quarterly Rating	Quarterly_Rating_Increased	target	income_change
count	2381.000000	2381.000000	2381.000000	2381.00000	2381.000000	2381.000000	2381.000000	2.381000e+03	2381.000000	2381.000000	2381.000000	2381.000000
mean	1397.559009	33.676018	0.410332	1.00756	59334.157077	1.820244	2.096598	4.586742e+06	1.427971	0.150357	0.678706	0.018060
std	806.161628	5.974057	0.491997	0.81629	28383.666384	0.841433	0.941522	9.127115e+06	0.809839	0.357496	0.467071	0.133195
min	1.000000	21.000000	0.000000	0.00000	10747.000000	1.000000	1.000000	-1.385530e+06	1.000000	0.000000	0.000000	0.000000
25%	695.000000	29.000000	0.000000	0.00000	39104.000000	1.000000	1.000000	0.000000e+00	1.000000	0.000000	0.000000	0.000000
50%	1400.000000	33.000000	0.000000	1.00000	55315.000000	2.000000	2.000000	8.176800e+05	1.000000	0.000000	1.000000	0.000000
75%	2100.000000	37.000000	1.000000	2.00000	75986.000000	2.000000	3.000000	4.173650e+06	2.000000	0.000000	1.000000	0.000000
max	2788.000000	58.000000	1.000000	2.00000	188418.000000	5.000000	5.000000	9.533106e+07	4.000000	1.000000	1.000000	1.000000

	Model	CV_Accuracy	CV_Precision	CV_Recall	CV_F1	CV_ROC_AUC	Test_Accuracy	Test_Precision	Test_Recall	Test_F1	Test_ROC_AUC
4	GBDT (sklearn)	0.783639	0.798492	0.912302	0.851323	0.817275	0.821803	0.832869	0.922840	0.875549	0.847979
5	XGBoost	0.776624	0.796518	0.901987	0.845622	0.806945	0.819706	0.832402	0.919753	0.873900	0.837025
1	SVM (RBF)	0.784314	0.830104	0.858603	0.843650	0.807044	0.788260	0.830861	0.864198	0.847201	0.808037
3	Random Forest	0.771736	0.827195	0.840035	0.832950	0.813636	0.796646	0.844985	0.858025	0.851455	0.840374
0	Logistic Regression	0.775905	0.820884	0.857593	0.838531	0.801453	0.792453	0.839879	0.858025	0.848855	0.836904
6	LightGBM	0.766821	0.815593	0.849346	0.831592	0.785316	0.779874	0.834862	0.842593	0.838710	0.827967
2	Decision Tree	0.733260	0.827977	0.766829	0.794730	0.782308	0.777778	0.853896	0.811728	0.832278	0.848180

OLA Driver Churn Prediction¶

This is a binary classification problem.

Importing Python Libraries¶

Loading the Data set and Preliminary Analysis¶

Handling missing values¶

For 'Gender' column - we use 'Group-wise mode imputation'¶

For the 'Age' column - we use 'kNN imputation'¶

Grouping by driver ID¶

Aggregation at driver level¶

Creating a column which tells if the quarterly rating has increased for that employee for those whose quarterly rating has increased we assign the value 1¶

Feature engineering¶

Creating the target column (if the driver left the company will have $1$ and driver still in the company will have $0$)¶

Creating a column which tells if the monthly income has increased for that employee for those whose monthly income has increased we assign the value 1¶

Statistical summary of the derived data¶

Outlier treatment¶

Exploratory Data Analysis¶

Visual analysis¶

Statistical analysis¶

ML Model Development¶

SMOTE¶

Reusable function for model evalution¶

Training ML models¶

Fine tuning the final model¶

Validation scores for GBDT¶

SHAP explainer for GBDT model¶

✅ Final Summary¶

Model Selection¶

1. Highest Recall (0.935)

2. Strong F1-score (0.886)

3. Stable validation performance

SHAP Explainability – Why Drivers Are Churning¶

Recommendations to Reduce Driver Churn¶

Final Conclusion¶

	Unnamed: 0	MMM-YY	Driver_ID	Age	City	Education_Level	Income	Dateofjoining	LastWorkingDate	Joining Designation	Grade	Total Business Value	Quarterly Rating
0	0	01/01/19	1	28.0	C23	2	57387	24/12/18	NaN	1	1	2381060	2
1	1	02/01/19	1	28.0	C23	2	57387	24/12/18	NaN	1	1	-665480	2
2	2	03/01/19	1	28.0	C23	2	57387	24/12/18	03/11/19	1	1	0	2
3	3	11/01/20	2	31.0	C7	2	67016	11/06/20	NaN	2	2	0	1
4	4	12/01/20	2	31.0	C7	2	67016	11/06/20	NaN	2	2	0	1

	Driver_ID	Age	City	Education_Level	Income	Dateofjoining	LastWorkingDate	Joining Designation	Grade	Total Business Value	Quarterly Rating	Reporting_date
0	1	28.0	C23	2	57387	2018-12-24	NaT	1	1	2381060	2	2019-01-01
1	1	28.0	C23	2	57387	2018-12-24	NaT	1	1	-665480	2	2019-01-02
2	1	28.0	C23	2	57387	2018-12-24	2019-11-03	1	1	0	2	2019-01-03
3	2	31.0	C7	2	67016	2020-06-11	NaT	2	2	0	1	2020-01-11
4	2	31.0	C7	2	67016	2020-06-11	NaT	2	2	0	1	2020-01-12

	Driver_ID	Age	Education_Level	Income	Joining Designation	Grade	Total Business Value	Quarterly Rating
0	1.0	28.0	2.0	57387.0	1.0	1.0	2381060.0	2.0
1	1.0	28.0	2.0	57387.0	1.0	1.0	-665480.0	2.0
2	1.0	28.0	2.0	57387.0	1.0	1.0	0.0	2.0
3	2.0	31.0	2.0	67016.0	2.0	2.0	0.0	1.0
4	2.0	31.0	2.0	67016.0	2.0	2.0	0.0	1.0

	Driver_ID	Age	Gender	City	Education_Level	Income	Dateofjoining	LastWorkingDate	Joining Designation	Grade	Total Business Value	Quarterly Rating	Reporting_date
239	43	27.0	1.0	C15	0	12906	2018-07-13	NaT	1	1	359890	1	2019-01-01
240	43	27.0	1.0	C15	0	12906	2018-07-13	2019-02-20	1	1	0	1	2019-01-02

	Driver_ID	Age	Gender	Education_Level	Income	Joining Designation	Grade	Total Business Value	Quarterly Rating	Dateofjoining	LastWorkingDate	City	Reporting_date
239	43.0	27.0	1.0	0.0	12906.0	1.0	1.0	359890.0	1.0	2018-07-13	NaT	C15	2019-01-01
240	43.0	27.0	1.0	0.0	12906.0	1.0	1.0	0.0	1.0	2018-07-13	2019-02-20	C15	2019-01-02

	Driver_ID	Age	Gender	City	Education_Level	Income	Joining Designation	Grade	Total Business Value	Quarterly Rating
0	1.0	28.0	0.0	C23	2.0	57387.0	1.0	1.0	1715580.0	2.0
1	2.0	31.0	0.0	C7	2.0	67016.0	2.0	2.0	0.0	1.0
2	4.0	43.0	0.0	C13	2.0	65603.0	2.0	2.0	350000.0	1.0
3	5.0	29.0	0.0	C9	0.0	46368.0	1.0	1.0	120360.0	1.0
4	6.0	31.0	1.0	C11	1.0	78728.0	3.0	3.0	1265000.0	2.0

	Driver_ID	first	last
698	819.0	114358.0	114358.0
507	588.0	17660.0	17660.0
1503	1763.0	33042.0	33042.0
144	170.0	91458.0	91458.0
308	363.0	67650.0	67650.0

	Driver_ID	Age	Gender	City	Education_Level	Income	Joining Designation	Grade	Total Business Value	Quarterly Rating	Quarterly_Rating_Increased	target	income_change
1313	1544.0	37.0	0.0	C13	1.0	90123.0	2.0	2.0	350000.0	1.0	0	1	0
1935	2270.0	34.0	1.0	C9	2.0	78164.0	3.0	3.0	232140.0	1.0	0	1	0
460	537.0	35.0	1.0	C29	1.0	84554.0	2.0	3.0	23376420.0	4.0	1	0	1
1854	2179.0	32.0	1.0	C22	1.0	59105.0	1.0	2.0	18036100.0	3.0	0	0	0
1224	1439.0	31.0	1.0	C7	1.0	49664.0	1.0	1.0	463890.0	1.0	0	1	0

	Feature	Test	Statistic	p-Value	Interpretation
0	Age	Mann-Whitney U	682792.000000	3.564764e-05	Significant
1	Income	Mann-Whitney U	774242.000000	2.141977e-23	Significant
2	Total Business Value	Mann-Whitney U	838829.000000	2.625202e-46	Significant
3	Education_Level	Mann-Whitney U	623793.500000	7.008911e-01	Not Significant
4	Joining Designation	Mann-Whitney U	710709.000000	2.402923e-10	Significant
5	Grade	Mann-Whitney U	786265.500000	1.641462e-29	Significant
6	Quarterly Rating	Mann-Whitney U	927619.000000	8.772323e-143	Significant
7	Gender	Chi-square	0.154374	6.943903e-01	Not Significant
8	income_change	Chi-square	71.647408	2.572999e-17	Significant
9	Quarterly_Rating_Increased	Chi-square	388.259008	1.981050e-86	Significant
10	City_LE	Chi-square	46.917176	1.397755e-02	Significant

	Feature	Importances
7	Quarterly Rating	0.431723
8	Quarterly_Rating_Increased	0.211003
6	Total Business Value	0.150477
5	Grade	0.065122
4	Joining Designation	0.057458
0	Age	0.038733
3	Income	0.028693
10	City_LE	0.010237
9	income_change	0.002603
2	Education_Level	0.002515
1	Gender	0.001435

Model	Recall	Precision	F1	Accuracy	ROC-AUC
GBDT	0.935	0.842	0.886	0.836	0.781
LightGBM	0.926	0.840	0.881	0.83	0.777
XGBoost	0.929	0.836	0.880	0.828	0.772
Random Forest	0.929	0.834	0.879	0.826	0.768
Logistic Regression	0.923	0.824	0.870	0.813	0.752