import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn.datasets import make_blobs
import numpy as np
from latexify import latexify
# retina
%config InlineBackend.figure_format = 'retina'
Comparison of Sophisticated vs Dummy Baseline ML Algorithms for Imbalanced Datasets
ML
Tutorial
Classification
# Set a random seed for reproducibility
np.random.seed(42)

# Create an imbalanced two-class dataset with two features.
# Passing a list to n_samples gives one sample count per cluster:
# 4500 points for class 0 and 500 points for class 1.
X, y = make_blobs(
    n_samples=[4500, 500],
    n_features=2,  # Use only two features
    cluster_std=[4.0, 4.0],
    random_state=42,
)

# Total number of samples, then the size of the minority (1) and
# majority (0) classes.
print(len(y))
print(len(y[y == 1]))
print(len(y[y == 0]))
5000
500
4500
from sklearn.model_selection import train_test_split

# Split the data into train and test sets (80% / 20%); the fixed
# random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Print the shapes of the train and test sets
print("Train set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)
Train set shape: (4000, 2) (4000,)
Test set shape: (1000, 2) (1000,)
# Colour each training point by its class: blue for class 0, red otherwise.
colors = ['blue' if label == 0 else 'red' for label in y_train]

latexify(fig_width=7, fig_height=5)

# Scatter the two features against each other.
plt.scatter(X_train[:, 0], X_train[:, 1], c=colors, alpha=0.8)
plt.xlabel('$x_1$')
plt.ylabel('$x_2$')
plt.title('Training Data')
Text(0.5, 1.0, 'Training Data')
# Class balance of the training labels: count of class 1, then class 0.
for class_label in (1, 0):
    print(np.count_nonzero(y_train == class_label))
393
3607
# Class balance of the test labels: count of class 1, then class 0.
for class_label in (1, 0):
    print(np.count_nonzero(y_test == class_label))
107
893
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# Fit a random forest with default hyper-parameters on the training set.
# NOTE: no random_state is passed, so the exact scores may differ between runs.
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the accuracy and F1 score
print("Accuracy:", accuracy)
print("F1 Score:", f1)
Accuracy: 0.945
F1 Score: 0.7208121827411167
from sklearn.dummy import DummyClassifier

# Baseline that ignores the features: the 'stratified' strategy draws
# predictions at random according to the training class distribution.
dummy_classifier = DummyClassifier(strategy='stratified')
dummy_classifier.fit(X_train, y_train)

# Evaluate the baseline on the same test set and with the same metrics.
y_pred_dummy = dummy_classifier.predict(X_test)
accuracy_dummy = accuracy_score(y_test, y_pred_dummy)
f1_dummy = f1_score(y_test, y_pred_dummy)

# Print the accuracy and F1 score for the dummy classifier
print("Dummy Classifier Accuracy:", accuracy_dummy)
print("Dummy Classifier F1 Score:", f1_dummy)
Dummy Classifier Accuracy: 0.81
Dummy Classifier F1 Score: 0.07766990291262137
Regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
# Generate synthetic regression dataset with noise
np.random.seed(42)

# 500 evenly spaced inputs on [0, 1], shaped as a single-feature column.
X = np.linspace(0, 1, 500).reshape(-1, 1)

# Noise-free target is a line through the origin with this slope.
slope = 1.2
y_true = slope * X.squeeze()

# Observed target: the true line plus Gaussian noise (std 0.5).
y = y_true + np.random.normal(scale=0.5, size=len(X))
# Split the data into train and test sets (80% / 20%); the fixed
# random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Print the shapes of the train and test sets
print("Train set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)
Train set shape: (400, 1) (400,)
Test set shape: (100, 1) (100,)
# Scatter plot of the training data
plt.scatter(X_train, y_train, color='red', alpha=0.8, label='Actual')

# Overlay the noise-free line the data was generated from.
plt.plot(X, y_true, color='blue', label='True')
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.legend()
plt.title('Training Data')
Text(0.5, 1.0, 'Training Data')
# RandomForestRegressor: a small forest (10 trees) with a fixed seed.
rf_regressor = RandomForestRegressor(n_estimators=10, random_state=42)
rf_regressor.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_pred_rf = rf_regressor.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)

# Print the Mean Squared Error for the RandomForestRegressor
print("Random Forest Regressor MSE:", mse_rf)
Random Forest Regressor MSE: 0.3446175878909235
# Compare the random forest predictions with the noisy test targets
# and with the noise-free line.
plt.scatter(X_test, y_test, color='red', alpha=0.8, label='Actual')
plt.scatter(X_test, y_pred_rf, color='black', alpha=0.8, label='Predicted')
plt.plot(X_test, slope * X_test.squeeze(), color='blue', label='True')
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.title('Predictions')
plt.legend()
plt.show()
# DummyRegressor: baseline that ignores the features and always predicts
# the mean of the training targets.
dummy_regressor = DummyRegressor(strategy='mean')
dummy_regressor.fit(X_train, y_train)

# Evaluate the baseline on the same test set and with the same metric.
y_pred_dummy = dummy_regressor.predict(X_test)
mse_dummy = mean_squared_error(y_test, y_pred_dummy)

# Print the Mean Squared Error for the DummyRegressor
print("Dummy Regressor MSE:", mse_dummy)
Dummy Regressor MSE: 0.39550009552721027
# Compare the dummy baseline predictions with the noisy test targets
# and with the noise-free line.
plt.scatter(X_test, y_test, color='red', alpha=0.8, label='Actual')
plt.scatter(X_test, y_pred_dummy, color='black', alpha=0.8, label='Predicted')
plt.plot(X_test, slope * X_test.squeeze(), color='blue', label='True')
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.title('Predictions')
plt.legend()
plt.show()