# Importing the basic libraries we will require for the project

# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Importing the Machine Learning models we require from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Importing the other functions we may require from Scikit-Learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# To get different metric scores
from sklearn.metrics import confusion_matrix,classification_report,roc_auc_score,precision_recall_curve,roc_curve,make_scorer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

from sklearn.tree import plot_tree

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


from sklearn.cluster import DBSCAN
from xgboost import XGBClassifier


# Code to ignore warnings from function usage
import warnings;

warnings.filterwarnings('ignore')

# Load the leads dataset from a CSV file located in the working directory
file_path = 'ExtraaLearn.csv'
data= pd.read_csv(file_path)

# Preview the first rows (only displayed when evaluated as a notebook cell)
data.head()

# Preview the last rows
data.tail()

# (number of rows, number of columns)
data.shape

(4612, 15)

# Column dtypes and non-null counts of the raw data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4612 entries, 0 to 4611
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     4612 non-null   object 
 1   age                    4612 non-null   int64  
 2   current_occupation     4612 non-null   object 
 3   first_interaction      4612 non-null   object 
 4   profile_completed      4612 non-null   object 
 5   website_visits         4612 non-null   int64  
 6   time_spent_on_website  4612 non-null   int64  
 7   page_views_per_visit   4612 non-null   float64
 8   last_activity          4612 non-null   object 
 9   print_media_type1      4612 non-null   object 
 10  print_media_type2      4612 non-null   object 
 11  digital_media          4612 non-null   object 
 12  educational_channels   4612 non-null   object 
 13  referral               4612 non-null   object 
 14  status                 4612 non-null   int64  
dtypes: float64(1), int64(4), object(10)
memory usage: 540.6+ KB

# Store page_views_per_visit as int64 instead of float64.
# NOTE(review): astype('int64') discards the fractional part of every value
# (the raw preview shows values such as 1.861) - confirm this loss is intended.
data['page_views_per_visit'] = data['page_views_per_visit'].astype('int64')

# Percentage of missing values per column, largest first
(
    data.isna()
    .mean()
    .mul(100)
    .round(2)
    .to_frame(name='% of Missing Values')
    .sort_values(by='% of Missing Values', ascending=False)
)

# Number of distinct values in every column
data.nunique()

ID                       4612
age                        46
current_occupation          3
first_interaction           2
profile_completed           3
website_visits             27
time_spent_on_website    1623
page_views_per_visit       17
last_activity               3
print_media_type1           2
print_media_type2           2
digital_media               2
educational_channels        2
referral                    2
status                      2
dtype: int64

# Remove the ID column in place (it has one distinct value per row)
data.drop(columns='ID', inplace=True)

# Summary statistics of the numeric columns, one row per column
data.describe().T

# Frequency tables for categorical columns
categorical_columns = data.select_dtypes(include=['object', 'category']).columns
for col in categorical_columns:
    counts = data[col].value_counts()
    # One call emits the header, the table and the trailing blank lines
    print(f"Frequency table for {col}:\n", counts, "\n", sep="\n")

Frequency table for current_occupation:

current_occupation
Professional    2616
Unemployed      1441
Student          555
Name: count, dtype: int64


Frequency table for first_interaction:

first_interaction
Website       2542
Mobile App    2070
Name: count, dtype: int64


Frequency table for profile_completed:

profile_completed
High      2264
Medium    2241
Low        107
Name: count, dtype: int64


Frequency table for last_activity:

last_activity
Email Activity      2278
Phone Activity      1234
Website Activity    1100
Name: count, dtype: int64


Frequency table for print_media_type1:

print_media_type1
No     4115
Yes     497
Name: count, dtype: int64


Frequency table for print_media_type2:

print_media_type2
No     4379
Yes     233
Name: count, dtype: int64


Frequency table for digital_media:

digital_media
No     4085
Yes     527
Name: count, dtype: int64


Frequency table for educational_channels:

educational_channels
No     3907
Yes     705
Name: count, dtype: int64


Frequency table for referral:

referral
No     4519
Yes      93
Name: count, dtype: int64

# Split the column names by dtype: numeric versus object/category
numeric_cols = data.select_dtypes(include=['number']).columns
cat_cols = data.select_dtypes(include=['object', 'category']).columns

print(
    f"Categorical columns: {cat_cols}",
    f"Numeric columns: {numeric_cols}",
    sep="\n",
)

Categorical columns: Index(['current_occupation', 'first_interaction', 'profile_completed',
       'last_activity', 'print_media_type1', 'print_media_type2',
       'digital_media', 'educational_channels', 'referral'],
      dtype='object')
Numeric columns: Index(['age', 'website_visits', 'time_spent_on_website',
       'page_views_per_visit', 'status'],
      dtype='object')

# Re-check dtypes and non-null counts after dropping ID and casting page_views_per_visit
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4612 entries, 0 to 4611
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   age                    4612 non-null   int64 
 1   current_occupation     4612 non-null   object
 2   first_interaction      4612 non-null   object
 3   profile_completed      4612 non-null   object
 4   website_visits         4612 non-null   int64 
 5   time_spent_on_website  4612 non-null   int64 
 6   page_views_per_visit   4612 non-null   int64 
 7   last_activity          4612 non-null   object
 8   print_media_type1      4612 non-null   object
 9   print_media_type2      4612 non-null   object
 10  digital_media          4612 non-null   object
 11  educational_channels   4612 non-null   object
 12  referral               4612 non-null   object
 13  status                 4612 non-null   int64 
dtypes: int64(5), object(9)
memory usage: 504.6+ KB

# Independent copy of the data, used by the univariate plots below
df = data.copy()

df.head()

# Defining the hist_box() function
def hist_box(data, col):
    """Plot the distribution of a single column of ``data``.

    Numeric columns get a box plot (mean marked) stacked on top of a histogram
    with a KDE curve, both sharing the x axis. Non-numeric columns get a count
    plot instead, because a box plot and a KDE are only defined for numeric data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``col``.
    col : str
        Name of the column to plot.
    """
    if not pd.api.types.is_numeric_dtype(data[col]):
        # Categorical column: show how often each level occurs
        plt.figure(figsize=(10, 6))
        sns.countplot(data=data, x=col)
        plt.show()
        return

    _, (ax_box, ax_hist) = plt.subplots(
        2,
        sharex=True,
        gridspec_kw={'height_ratios': (0.15, 0.85)},
        figsize=(10, 10),
    )
    # Adding a graph in each part
    sns.boxplot(data=data, x=col, ax=ax_box, showmeans=True)
    sns.histplot(data=data, x=col, kde=True, ax=ax_hist)
    plt.show()

# Numeric features: box plot + histogram, followed by a violin plot
for num_col in ['age', 'website_visits', 'time_spent_on_website']:
    hist_box(df, num_col)

    plt.figure(figsize=(10, 6))
    sns.violinplot(x=df[num_col])
    plt.show()

# Categorical features: box, KDE and violin plots are not meaningful for
# nominal string columns, so show the frequency of each level instead
for cat_col in ['first_interaction', 'current_occupation', 'referral']:
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x=cat_col)
    plt.show()

# Target column (stored as a 0/1 integer)
hist_box(df, 'status')

plt.figure(figsize=(10, 6))
sns.violinplot(x=df['status'])
plt.show()

# Defining the stacked_barplot() function
def stacked_barplot(data, predictor, target, figsize=(10, 6)):
    """Draw a stacked bar chart of ``target`` within each level of ``predictor``.

    The cross-tabulation is normalised per row, so every bar sums to 100.
    The y axis is labelled with the name of ``target`` and the legend is
    placed in the lower right corner.
    """
    row_shares = pd.crosstab(data[predictor], data[target], normalize='index')
    row_percentages = row_shares * 100
    row_percentages.plot(kind='bar', stacked=True, figsize=figsize)
    plt.legend(loc="lower right")
    plt.ylabel(target)

# The plot inside stacked_barplot() opens its own figure, so a preceding
# plt.figure() call only leaves an empty figure behind. Request the wider
# canvas through the figsize parameter instead (20 inches keeps the image
# at a practical size for the many age values on the x axis).
stacked_barplot(data, "age", "status", figsize=(20, 6))

<Figure size 10000x600 with 0 Axes>

stacked_barplot(data, "age", "current_occupation")

# Size is passed through figsize: a separate plt.figure() call would only
# create an empty figure, because the bar plot opens its own.
stacked_barplot(data, "profile_completed", "age", figsize=(20, 6))

<Figure size 7000x600 with 0 Axes>

# Pairwise correlation of the numeric columns, drawn as an annotated heatmap
cols_list = df.select_dtypes(include=np.number).columns.tolist()
corr_matrix = data[cols_list].corr()

plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()

# Share of each status value per occupation and per first interaction channel
for predictor in ("current_occupation", "first_interaction"):
    stacked_barplot(data, predictor, "status")

data.head()

# Share of each status value for every acquisition channel flag
for predictor in ("referral", "print_media_type1", "print_media_type2", "digital_media"):
    stacked_barplot(data, predictor, "status")

# Step 1: Identify categorical and numerical columns
categorical_columns = ['current_occupation', 'first_interaction', 'profile_completed', 'last_activity', 'print_media_type1',
                       'print_media_type2', 'digital_media', 'educational_channels', 'referral']
numerical_columns = ['age', 'website_visits', 'time_spent_on_website', 'page_views_per_visit']

# Step 2: Create preprocessing pipelines
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing categorical data
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical data
])

numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing numerical data
    ('scaler', StandardScaler())  # Standardize numerical data
])

# Combine both pipelines
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_columns),
    ('cat', categorical_pipeline, categorical_columns)
])

# Step 3: Split the raw data BEFORE fitting any preprocessing, so that the
# imputation values, scaling statistics and encoder categories are learned
# from the training rows only (fitting on all rows leaks test information).
X = data.drop(columns='status')
y = data['status']
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1, stratify=y
)

# Step 4: Fit the preprocessing on the training split, then apply it to the test split
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full feature matrix encoded with the train-fitted preprocessor
# (kept under its original name for any later use of the whole dataset)
X_processed = preprocessor.transform(X)

data.head(3)

# Baseline decision tree: default hyper-parameters with a fixed seed,
# fitted on the training split
dt_model = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)

# Predict the held-out test split
y_pred = dt_model.predict(X_test)

# Report overall accuracy and the per-class precision / recall / F1
accuracy = accuracy_score(y_test, y_pred)
print("Decision Tree Classifier Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Decision Tree Classifier Accuracy: 0.7955202312138728

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85       971
           1       0.65      0.69      0.67       413

    accuracy                           0.80      1384
   macro avg       0.76      0.76      0.76      1384
weighted avg       0.80      0.80      0.80      1384

# Candidate hyper-parameters for the decision tree (4 x 3 x 3 combinations)
param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search with 5-fold cross-validation on the training split,
# ranking the candidates by accuracy
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=1),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}

# Initialize the optimized Decision Tree Classifier with the hyper-parameters
# selected by the grid search. They are read from grid_search.best_params_
# rather than typed in by hand, so the model cannot drift away from the search
# result (the hand-typed values did not match the reported best parameters).
optimized_dt_model = DecisionTreeClassifier(
    random_state=1,
    **grid_search.best_params_
)

# Train the model on the training data
optimized_dt_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_optimized = optimized_dt_model.predict(X_test)

# Evaluate the model
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
print("Optimized Decision Tree Classifier Accuracy:", accuracy_optimized)
print("\nClassification Report:\n", classification_report(y_test, y_pred_optimized))

Optimized Decision Tree Classifier Accuracy: 0.7998554913294798

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.84      0.85       971
           1       0.65      0.71      0.68       413

    accuracy                           0.80      1384
   macro avg       0.76      0.77      0.77      1384
weighted avg       0.81      0.80      0.80      1384

# Visualize the Decision Tree
plt.figure(figsize=(20, 10))  # Adjust figure size for readability
plot_tree(
    optimized_dt_model,
    # The tree is trained on the preprocessor output (scaled numeric columns and
    # one-hot encoded categories), not on PCA components, so label the splits
    # with the real column names produced by the preprocessor.
    feature_names=list(preprocessor.get_feature_names_out()),
    class_names=['Class 0', 'Class 1'],  # Replace with actual class labels if available
    filled=True,  # Color nodes by class
    rounded=True,  # Rounded boxes
    fontsize=10  # Font size for readability
)
plt.title("Optimized Decision Tree Visualization", fontsize=16)
plt.show()

# Get feature importance from the trained model
feature_importance = optimized_dt_model.feature_importances_

# Label every importance with the column name produced by the preprocessor.
# The model inputs are scaled numeric columns and one-hot encoded categories,
# not PCA components, so "PCA_i" labels would be misleading.
features = preprocessor.get_feature_names_out()
importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

# Display the importance DataFrame
print(importance_df)

# Visualize feature importance
plt.figure(figsize=(10, 6))
plt.bar(importance_df['Feature'], importance_df['Importance'], align='center')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance in Decision Tree')
plt.xticks(rotation=45, ha='right')  # right-align so long names stay under their bar
plt.tight_layout()
plt.show()

   Feature  Importance
2    PCA_3    0.339551
7    PCA_8    0.333836
9   PCA_10    0.210528
13  PCA_14    0.081056
14  PCA_15    0.034732
0    PCA_1    0.000298
15  PCA_16    0.000000
23  PCA_24    0.000000
22  PCA_23    0.000000
21  PCA_22    0.000000
20  PCA_21    0.000000
19  PCA_20    0.000000
18  PCA_19    0.000000
17  PCA_18    0.000000
16  PCA_17    0.000000
12  PCA_13    0.000000
1    PCA_2    0.000000
11  PCA_12    0.000000
10  PCA_11    0.000000
8    PCA_9    0.000000
6    PCA_7    0.000000
5    PCA_6    0.000000
4    PCA_5    0.000000
3    PCA_4    0.000000
24  PCA_25    0.000000

# Random Forest: 100 trees limited to depth 3, with balanced class weights
# to handle the class imbalance; fitted on the training split
rf_model = RandomForestClassifier(
    class_weight='balanced',
    max_depth=3,
    n_estimators=100,
    random_state=1,
).fit(X_train, y_train)

# Predict the held-out test split
y_pred_rf = rf_model.predict(X_test)

# Report overall accuracy and the per-class metrics
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Classifier Accuracy:", rf_accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

Random Forest Classifier Accuracy: 0.8367052023121387

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.84      0.88       971
           1       0.69      0.82      0.75       413

    accuracy                           0.84      1384
   macro avg       0.80      0.83      0.81      1384
weighted avg       0.85      0.84      0.84      1384

# Get feature importance from the Random Forest model
rf_feature_importance = rf_model.feature_importances_

# Label every importance with the column name produced by the preprocessor
# (the model inputs are preprocessed features, not PCA components)
rf_importance_df = pd.DataFrame({
    'Feature': preprocessor.get_feature_names_out(),
    'Importance': rf_feature_importance
}).sort_values(by='Importance', ascending=False)

# Display the importance DataFrame
print(rf_importance_df)

# Visualize feature importance
plt.figure(figsize=(10, 6))
plt.bar(rf_importance_df['Feature'], rf_importance_df['Importance'], align='center')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance in Random Forest')
plt.xticks(rotation=45, ha='right')  # right-align so long names stay under their bar
plt.tight_layout()
plt.show()

   Feature  Importance
7    PCA_8    0.225474
8    PCA_9    0.224742
2    PCA_3    0.194957
9   PCA_10    0.127221
11  PCA_12    0.062989
5    PCA_6    0.028924
0    PCA_1    0.028524
13  PCA_14    0.027042
4    PCA_5    0.025269
14  PCA_15    0.023160
10  PCA_11    0.006484
12  PCA_13    0.006115
6    PCA_7    0.005611
24  PCA_25    0.004335
23  PCA_24    0.003308
1    PCA_2    0.002421
3    PCA_4    0.001511
22  PCA_23    0.000598
16  PCA_17    0.000472
21  PCA_22    0.000311
15  PCA_16    0.000172
17  PCA_18    0.000126
19  PCA_20    0.000115
20  PCA_21    0.000082
18  PCA_19    0.000035

# Decision tree grown without any depth restriction (only the seed is set),
# fitted on the training split
deeper_dt_model = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)

# Predict the held-out test split and report the metrics
y_pred_deeper_dt = deeper_dt_model.predict(X_test)
accuracy_deeper_dt = accuracy_score(y_test, y_pred_deeper_dt)

print("Deeper Decision Tree Accuracy:", accuracy_deeper_dt)
print("\nClassification Report:\n", classification_report(y_test, y_pred_deeper_dt))

Deeper Decision Tree Accuracy: 0.7955202312138728

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85       971
           1       0.65      0.69      0.67       413

    accuracy                           0.80      1384
   macro avg       0.76      0.76      0.76      1384
weighted avg       0.80      0.80      0.80      1384

# Random Forest of 100 trees with no depth restriction and balanced class
# weights, fitted on the training split
deeper_rf_model = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    random_state=1,
).fit(X_train, y_train)

# Predict the held-out test split
y_pred_deeper_rf = deeper_rf_model.predict(X_test)

# Report overall accuracy and the per-class metrics
accuracy_deeper_rf = accuracy_score(y_test, y_pred_deeper_rf)
print("Deeper Random Forest Accuracy:", accuracy_deeper_rf)
print("\nClassification Report:\n", classification_report(y_test, y_pred_deeper_rf))

Deeper Random Forest Accuracy: 0.8526011560693642

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.91      0.90       971
           1       0.78      0.71      0.74       413

    accuracy                           0.85      1384
   macro avg       0.83      0.81      0.82      1384
weighted avg       0.85      0.85      0.85      1384

# XGBoost settings, collected in one place
xgb_params = dict(
    max_depth=3,  # Same depth as previous models for fair comparison
    n_estimators=100,  # Number of boosting rounds
    learning_rate=0.1,  # Learning rate
    use_label_encoder=False,  # Suppress warnings
    eval_metric="logloss",  # Evaluation metric
    random_state=1,
)

# Build and train the Gradient Boosting model
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# Predict the held-out test split
y_pred_xgb = xgb_model.predict(X_test)

# Report overall accuracy and the per-class metrics
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print("Gradient Boosting Classifier Accuracy:", xgb_accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))

Gradient Boosting Classifier Accuracy: 0.861271676300578

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.91      0.90       971
           1       0.78      0.74      0.76       413

    accuracy                           0.86      1384
   macro avg       0.84      0.83      0.83      1384
weighted avg       0.86      0.86      0.86      1384

# Get feature importance from the XGBoost model
xgb_feature_importance = xgb_model.feature_importances_

# Label every importance with the column name produced by the preprocessor
# (the model inputs are preprocessed features, not PCA components)
xgb_importance_df = pd.DataFrame({
    'Feature': preprocessor.get_feature_names_out(),
    'Importance': xgb_feature_importance
}).sort_values(by='Importance', ascending=False)

# Display the importance DataFrame
print(xgb_importance_df)

# Visualize feature importance
plt.figure(figsize=(10, 6))
plt.bar(xgb_importance_df['Feature'], xgb_importance_df['Importance'], align='center')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance in Gradient Boosting')
plt.xticks(rotation=45, ha='right')  # right-align so long names stay under their bar
plt.tight_layout()
plt.show()

   Feature  Importance
7    PCA_8    0.183186
9   PCA_10    0.138852
11  PCA_12    0.109778
5    PCA_6    0.094576
2    PCA_3    0.093906
13  PCA_14    0.086900
4    PCA_5    0.073535
14  PCA_15    0.062918
0    PCA_1    0.027064
10  PCA_11    0.026937
23  PCA_24    0.023698
12  PCA_13    0.021121
21  PCA_22    0.016231
6    PCA_7    0.012190
3    PCA_4    0.009436
1    PCA_2    0.008318
15  PCA_16    0.006623
19  PCA_20    0.004732
8    PCA_9    0.000000
16  PCA_17    0.000000
17  PCA_18    0.000000
18  PCA_19    0.000000
20  PCA_21    0.000000
22  PCA_23    0.000000
24  PCA_25    0.000000

# Build the PCA representation used by every clustering step below.
# NOTE(review): X_pca was used without being defined anywhere in this file.
# Keeping all principal components is an assumption - confirm the intended
# n_components. The scatter plots only use the first two components.
X_dense = X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed
X_pca = PCA(random_state=1).fit_transform(X_dense)

# Elbow Method: fit K-Means for k = 2..10 and record the within-cluster
# sum of squares (inertia) and the silhouette score of each solution
sse = []
silhouette_scores = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=1)
    kmeans.fit(X_pca)
    sse.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_pca, kmeans.labels_))

# Plotting Elbow and Silhouette Scores
plt.figure(figsize=(14, 5))

plt.subplot(1, 2, 1)
plt.plot(range(2, 11), sse, marker='o')
plt.title('Elbow Method - SSE')
plt.xlabel('Number of Clusters')
plt.ylabel('SSE')

plt.subplot(1, 2, 2)
plt.plot(range(2, 11), silhouette_scores, marker='o')
plt.title('Silhouette Scores')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')

plt.tight_layout()
plt.show()

# Assuming 3 clusters based on visual inspection (adjust based on results)
optimal_clusters = 3
kmeans = KMeans(n_clusters=optimal_clusters, random_state=1)
kmeans_labels = kmeans.fit_predict(X_pca)

# Visualizing Clusters on the first two principal components
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_labels, cmap='viridis')
plt.title(f'K-Means Clustering with {optimal_clusters} Clusters')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster')
plt.show()

from scipy.cluster.hierarchy import dendrogram, linkage

# Agglomerative clustering of the PCA representation using Ward linkage
Z = linkage(X_pca, method='ward')

# Dendrogram truncated to the top 5 levels of the merge tree
plt.figure(figsize=(10, 7))
dendrogram(Z, p=5, truncate_mode='level')
plt.xlabel('Sample Index')
plt.ylabel('Distance')
plt.title('Hierarchical Clustering Dendrogram')
plt.show()

# First DBSCAN run with eps=0.5 and min_samples=5; label -1 marks noise points
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_pca)

# Scatter of the first two principal components, coloured by DBSCAN label
plt.figure(figsize=(8, 6))
plt.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=dbscan_labels,
    cmap='plasma',
)
plt.title('DBSCAN Clustering')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster')
plt.show()

# Grid of DBSCAN settings to try: 10 eps values x 4 min_samples values
eps_values = np.linspace(0.1, 1.0, 10)
min_samples_values = [3, 5, 10, 15]

# One (eps, min_samples, n_clusters, n_noise) row per combination
dbscan_results = []

for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(X_pca)
        # Label -1 is noise: count it separately and exclude it from the clusters
        is_noise = labels == -1
        n_noise = int(is_noise.sum())
        n_clusters = len(np.unique(labels[~is_noise]))
        dbscan_results.append((eps, min_samples, n_clusters, n_noise))

# Convert to DataFrame for better visualization
import pandas as pd
result_columns = ['eps', 'min_samples', 'n_clusters', 'n_noise']
dbscan_results_df = pd.DataFrame(dbscan_results, columns=result_columns)

# Display the 10 settings that produced the most clusters
dbscan_results_df.sort_values(by='n_clusters', ascending=False).head(10)

# DBSCAN with the chosen settings
best_eps = 0.5  # Replace with your chosen value
best_min_samples = 5  # Replace with your chosen value

dbscan_optimized = DBSCAN(eps=best_eps, min_samples=best_min_samples)
labels_optimized = dbscan_optimized.fit_predict(X_pca)

# Scatter of the first two principal components, coloured by the chosen clustering
plt.figure(figsize=(8, 6))
plt.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=labels_optimized,
    cmap='plasma',
)
plt.title(f'DBSCAN Clustering (eps={best_eps}, min_samples={best_min_samples})')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster')
plt.show()

# Attach DBSCAN labels to the dataset as a numeric 'Cluster' column
dbscan = DBSCAN(eps=0.5, min_samples=5)
labels = dbscan.fit_predict(X_pca)
data['Cluster'] = pd.to_numeric(labels, errors='coerce')

# Mean of every numeric column within each cluster
cluster_summary = data.groupby('Cluster').mean(numeric_only=True)
print(cluster_summary)

# Number of rows assigned to each cluster label
cluster_sizes = data['Cluster'].value_counts()
print(cluster_sizes)

               age  website_visits  time_spent_on_website  \
Cluster                                                     
-1       45.225628        3.780151             780.633417   
 0       57.360000        2.880000             285.760000   
 1       53.187500        2.687500             236.718750   
 2       21.125000        2.000000             168.375000   
 3       56.666667        0.000000               0.000000   
...            ...             ...                    ...   
 63      57.200000        3.200000             294.400000   
 64      20.714286        2.571429             152.857143   
 65      21.400000        1.600000             275.000000   
 66      53.400000        1.400000             213.200000   
 67      56.800000        2.600000            1550.600000   

         page_views_per_visit    status  
Cluster                                  
-1                   2.713568  0.298744  
 0                   2.000000  0.040000  
 1                   2.000000  0.187500  
 2                   2.000000  0.125000  
 3                   0.000000  0.000000  
...                       ...       ...  
 63                  2.000000  0.000000  
 64                  2.000000  0.142857  
 65                  2.000000  0.000000  
 66                  2.000000  0.000000  
 67                  2.000000  0.200000  

[69 rows x 5 columns]
Cluster
-1     3980
 1       32
 24      32
 6       27
 0       25
       ... 
 48       5
 33       5
 45       5
 9        5
 65       5
Name: count, Length: 69, dtype: int64

	count	mean	std	min	25%	50%	75%	max
age	4612.0	46.201214	13.161454	18.0	36.00	51.0	57.00	63.0
website_visits	4612.0	3.566782	2.829134	0.0	2.00	3.0	5.00	30.0
time_spent_on_website	4612.0	724.011275	743.828683	0.0	148.75	376.0	1336.75	2537.0
page_views_per_visit	4612.0	2.641804	1.879720	0.0	2.00	2.0	3.00	18.0
status	4612.0	0.298569	0.457680	0.0	0.00	0.0	1.00	1.0

	ID	age	current_occupation	first_interaction	profile_completed	website_visits	time_spent_on_website	page_views_per_visit	last_activity	print_media_type1	print_media_type2	digital_media	educational_channels	referral	status
0	EXT001	57	Unemployed	Website	High	7	1639	1.861	Website Activity	Yes	No	Yes	No	No	1
1	EXT002	56	Professional	Mobile App	Medium	2	83	0.320	Website Activity	No	No	No	Yes	No	0
2	EXT003	52	Professional	Website	Medium	3	330	0.074	Website Activity	No	No	Yes	No	No	0
3	EXT004	53	Unemployed	Website	High	4	464	2.057	Website Activity	No	No	No	No	No	1
4	EXT005	23	Student	Website	High	4	600	16.914	Email Activity	No	No	No	No	No	0

	ID	age	current_occupation	first_interaction	profile_completed	website_visits	time_spent_on_website	page_views_per_visit	last_activity	print_media_type1	print_media_type2	digital_media	educational_channels	referral	status
4607	EXT4608	35	Unemployed	Mobile App	Medium	15	360	2.170	Phone Activity	No	No	No	Yes	No	0
4608	EXT4609	55	Professional	Mobile App	Medium	8	2327	5.393	Email Activity	No	No	No	No	No	0
4609	EXT4610	58	Professional	Website	High	2	212	2.692	Email Activity	No	No	No	No	No	1
4610	EXT4611	57	Professional	Mobile App	Medium	1	154	3.879	Website Activity	Yes	No	No	No	No	0
4611	EXT4612	55	Professional	Website	Medium	4	2290	2.075	Phone Activity	No	No	No	No	No	0

	% of Missing Values
ID	0.0
age	0.0
current_occupation	0.0
first_interaction	0.0
profile_completed	0.0
website_visits	0.0
time_spent_on_website	0.0
page_views_per_visit	0.0
last_activity	0.0
print_media_type1	0.0
print_media_type2	0.0
digital_media	0.0
educational_channels	0.0
referral	0.0
status	0.0

	eps	min_samples	n_clusters	n_noise
20	0.6	3	216	2845
24	0.7	3	210	2295
16	0.5	3	190	3465
28	0.8	3	174	1871
32	0.9	3	165	1602
36	1.0	3	149	1374
12	0.4	3	136	3838
8	0.3	3	92	4252
29	0.8	5	86	2453
25	0.7	5	85	3046

Juan David Correa (BATCH: MIT-DSML-NOVEMBER 2024-B)¶

Course: "Classification and Hypothesis Testing".¶

ExtraaLearn Project -- January, 2025¶

Context¶

Objective¶

Data Description¶

Homework Description:¶

Importing necessary libraries and data¶

Dataset Exploration:¶

Data Overview¶

Exploratory Data Analysis (EDA)¶

Univariate Analysis¶

Bivariate Analysis¶

Multivariate Analysis¶

Time spent on web and age seem the most correlated, the rest do not seem to have much correlation.¶

Questions¶

1. Leads will have different expectations from the outcome of the course and the current occupation may play a key role in getting them to participate in the program. Find out how current occupation affects lead status.¶

2. The company's first impression on the customer must have an impact. Do the first channels of interaction have an impact on the lead status?¶

3. The company uses multiple modes to interact with prospects. Which way of interaction works best?¶

4. The company gets leads from various channels such as print media, digital media, referrals, etc. Which of these channels have the highest lead conversion rate?¶

5. People browsing the website or mobile application are generally required to create a profile by sharing their personal data before they can access additional information. Does having more details about a prospect increase the chances of conversion?¶

Data Preprocessing¶

EDA¶

Building a Decision Tree model¶

The Decision Tree Classifier achieved an accuracy of 79.55% on the test set. Here's a breakdown of the performance metrics:¶

The optimized Decision Tree model shows significant improvement in performance:¶

Observations:¶

From the bar chart, it’s clear that not all PCA components contribute equally to the Decision Tree's decision-making. Here are the key takeaways:¶

Do we need to prune the tree?¶

Building a Random Forest model¶

The Random Forest model achieved an accuracy of 83.67%, higher than the optimized Decision Tree's 79.99%. Here’s the detailed breakdown of the results:¶

Random Forest Feature Importance Analysis¶

Pruning Trees¶

Bonus Task: Train and Evaluate Deeper or Unpruned Trees¶

Key Observations:¶

Analysis of Deeper Random Forest Results¶

Bonus Task - Gradient Boosting Model Performance Analysis¶

Gradient Boosting Feature Importance Analysis¶

Bonus Task - Clusters Analysis¶

Observations:¶

1. Cluster Characteristics¶

Actionable Advice¶

Clustering: Unsupervised Learning¶

Cluster Analysis Adds Value By:¶

Insights Gained¶