# Install evidently if needed
# !pip install evidently

Data Drift Detection — Hands-On Notebook
CS 203 — Week 10
In this notebook we’ll: 1. Visualize drift with histograms 2. Detect drift with the KS test (scipy) 3. Compute PSI manually 4. Use Evidently for automated drift reports 5. Simulate gradual drift over time 6. Try a real-world scenario
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp, chi2_contingency
from sklearn.datasets import load_iris, load_wine
# Global plotting style + fixed RNG seed so results are reproducible.
plt.style.use("seaborn-v0_8-whitegrid")
np.random.seed(42)

# Part 1: Setup — Simulating Training vs Production Data
We’ll use the Iris dataset. Imagine: - Training data: data collected in January - Production data: data arriving in July (the world may have changed!)
# Load Iris and pretend the first 100 rows are "training" (January)
# and the last 50 are "production" (July).
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df["target"] = iris.target

# Split into "training" and "production"
df_train = df.iloc[:100].copy()
df_prod = df.iloc[100:].copy()

print(f"Training: {len(df_train)} rows")
print(f"Production: {len(df_prod)} rows")
print(f"\nTraining class distribution:\n{df_train['target'].value_counts().sort_index()}")
print(f"\nProduction class distribution:\n{df_prod['target'].value_counts().sort_index()}")

# Notice anything? The production data is mostly class 2 (virginica).
# This is already a form of drift — label drift!
Part 2: Visual Drift Detection — Histograms
The simplest way to spot drift: plot the same feature from both datasets.
# Overlay training vs production histograms, one panel per feature.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for ax, col in zip(axes.flat, iris.feature_names):
    ax.hist(df_train[col], bins=20, alpha=0.5, label="Training", color="steelblue", density=True)
    ax.hist(df_prod[col], bins=20, alpha=0.5, label="Production", color="coral", density=True)
    ax.set_title(col, fontsize=12)
    ax.legend()
plt.suptitle("Training vs Production — Do these look the same?", fontsize=14, fontweight="bold")
plt.tight_layout()
plt.show()

# Discussion: Which features look shifted? Which look stable?
The problem with visual inspection: it’s subjective. We need a number.
Part 3: KS Test — A Number for Drift
The Kolmogorov-Smirnov test compares two distributions and gives us: - KS statistic: the maximum gap between the cumulative distributions (0 = identical, 1 = completely different) - p-value: probability this gap happened by chance (< 0.05 = drift)
# Test each feature with a two-sample KS test; p < 0.05 → flag drift.
print(f"{'Feature':30s} {'KS stat':>8s} {'p-value':>8s} {'Verdict'}")
print("-" * 65)
for col in iris.feature_names:
    stat, p = ks_2samp(df_train[col], df_prod[col])
    verdict = "DRIFT" if p < 0.05 else "OK"
    print(f"{col:30s} {stat:8.4f} {p:8.4f} [{verdict}]")

# Visualizing the KS test
Let’s see what the KS test actually measures — the gap between cumulative distribution curves.
# Pick the feature with the most drift
feature = "petal length (cm)"

# Sort values and compute empirical CDF
train_sorted = np.sort(df_train[feature])
prod_sorted = np.sort(df_prod[feature])
train_cdf = np.arange(1, len(train_sorted) + 1) / len(train_sorted)
prod_cdf = np.arange(1, len(prod_sorted) + 1) / len(prod_sorted)

fig, ax = plt.subplots(figsize=(10, 5))
ax.step(train_sorted, train_cdf, label="Training CDF", color="steelblue", linewidth=2)
ax.step(prod_sorted, prod_cdf, label="Production CDF", color="coral", linewidth=2)

# Find and annotate the maximum gap (the KS statistic D)
stat, p = ks_2samp(df_train[feature], df_prod[feature])
ax.set_title(f"KS Test for '{feature}' — D = {stat:.3f}, p = {p:.4f}", fontsize=13)
ax.set_xlabel(feature)
ax.set_ylabel("Cumulative Probability")
ax.legend(fontsize=12)
ax.annotate(f"Max gap = {stat:.3f}", xy=(4.5, 0.5), fontsize=14,
            bbox=dict(boxstyle="round", fc="lightyellow", ec="orange"))
plt.tight_layout()
plt.show()

# Part 4: Simulate Artificial Drift
Let’s deliberately shift a feature and watch the KS test catch it.
# Shift petal_length by +2 cm in production
df_prod_shifted = df_prod.copy()
df_prod_shifted["petal length (cm)"] += 2.0

print("After shifting petal length by +2 cm:\n")
print(f"{'Feature':30s} {'KS stat':>8s} {'p-value':>8s} {'Verdict'}")
print("-" * 65)
for col in iris.feature_names:
    stat, p = ks_2samp(df_train[col], df_prod_shifted[col])
    verdict = "DRIFT" if p < 0.05 else "OK"
    marker = " ← shifted!" if col == "petal length (cm)" else ""
    print(f"{col:30s} {stat:8.4f} {p:8.4f} [{verdict}]{marker}")

# Visualize the shift
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Before shift
axes[0].hist(df_train["petal length (cm)"], bins=20, alpha=0.5, label="Training", color="steelblue", density=True)
axes[0].hist(df_prod["petal length (cm)"], bins=20, alpha=0.5, label="Production", color="coral", density=True)
axes[0].set_title("Before: No artificial drift", fontsize=12)
axes[0].legend()

# After shift
axes[1].hist(df_train["petal length (cm)"], bins=20, alpha=0.5, label="Training", color="steelblue", density=True)
axes[1].hist(df_prod_shifted["petal length (cm)"], bins=20, alpha=0.5, label="Production (shifted +2)", color="coral", density=True)
axes[1].set_title("After: Petal length shifted by +2 cm", fontsize=12)
axes[1].legend()
plt.suptitle("Can you see the drift?", fontsize=14, fontweight="bold")
plt.tight_layout()
plt.show()

# Part 5: PSI — Population Stability Index
PSI gives a single score instead of a p-value. Popular in banking/fintech.
Traffic light rules: - PSI < 0.1 → Green (no drift) - PSI 0.1–0.25 → Yellow (monitor) - PSI > 0.25 → Red (drift!)
def compute_psi(reference, current, bins=10):
    """Population Stability Index (PSI) between two 1-D samples.

    Bin edges come from quantiles of *reference*, so each reference bin
    holds roughly 1/bins of the reference mass. The outer edges are
    widened to +/-inf so that current values outside the reference range
    are counted in the edge bins. (The previous version dropped such
    values entirely — np.histogram ignores data outside its bins — so
    the current fractions no longer summed to 1 and PSI under-reported
    exactly the large shifts it is meant to catch.)

    Parameters
    ----------
    reference : array-like
        Baseline ("training") sample.
    current : array-like
        New ("production") sample to compare against the baseline.
    bins : int, default 10
        Number of quantile bins derived from the reference.

    Returns
    -------
    float
        PSI score. Rule of thumb: < 0.1 stable, 0.1-0.25 monitor,
        > 0.25 drift.
    """
    breakpoints = np.quantile(reference, np.linspace(0, 1, bins + 1))
    # Catch out-of-range current values in the first/last bin instead of
    # silently losing them.
    breakpoints[0] = -np.inf
    breakpoints[-1] = np.inf
    ref_frac = np.histogram(reference, bins=breakpoints)[0] / len(reference)
    cur_frac = np.histogram(current, bins=breakpoints)[0] / len(current)
    # Floor fractions at 0.001 to avoid log(0) / divide-by-zero on empty bins.
    ref_frac = np.clip(ref_frac, 0.001, None)
    cur_frac = np.clip(cur_frac, 0.001, None)
    return float(np.sum((cur_frac - ref_frac) * np.log(cur_frac / ref_frac)))
# Score every feature with PSI against the shifted production data,
# using the banking-style traffic-light thresholds.
print(f"{'Feature':30s} {'PSI':>8s} {'Verdict'}")
print("-" * 55)
for col in iris.feature_names:
    psi = compute_psi(df_train[col].values, df_prod_shifted[col].values)
    if psi < 0.1:
        verdict = "GREEN (no drift)"
    elif psi < 0.25:
        verdict = "YELLOW (monitor)"
    else:
        verdict = "RED (drift!)"
    print(f"{col:30s} {psi:8.3f} [{verdict}]")

# Part 6: Gradual Drift Over Time
Real drift usually happens slowly — not overnight. Let’s simulate 12 months of gradual shift.
# Simulate 12 months of slowly increasing drift and track all three
# drift signals (KS statistic, p-value, PSI) month by month.
feature = "petal length (cm)"
results = []
for month in range(1, 13):
    df_month = df_prod.copy()
    # Each month, the shift increases by 0.3 cm
    df_month[feature] += month * 0.3
    stat, p = ks_2samp(df_train[feature], df_month[feature])
    psi = compute_psi(df_train[feature].values, df_month[feature].values)
    results.append({"month": month, "ks_stat": stat, "p_value": p, "psi": psi})
df_results = pd.DataFrame(results)

fig, axes = plt.subplots(1, 3, figsize=(16, 4))

axes[0].plot(df_results["month"], df_results["ks_stat"], "o-", color="steelblue", linewidth=2)
axes[0].set_title("KS Statistic Over Time")
axes[0].set_xlabel("Month")
axes[0].set_ylabel("KS Statistic")

axes[1].plot(df_results["month"], df_results["p_value"], "o-", color="coral", linewidth=2)
axes[1].axhline(y=0.05, color="red", linestyle="--", label="p=0.05 threshold")
axes[1].set_title("p-value Over Time")
axes[1].set_xlabel("Month")
axes[1].set_ylabel("p-value")
axes[1].legend()

axes[2].plot(df_results["month"], df_results["psi"], "o-", color="green", linewidth=2)
axes[2].axhline(y=0.1, color="orange", linestyle="--", label="PSI=0.1 (monitor)")
axes[2].axhline(y=0.25, color="red", linestyle="--", label="PSI=0.25 (drift!)")
axes[2].set_title("PSI Over Time")
axes[2].set_xlabel("Month")
axes[2].set_ylabel("PSI")
axes[2].legend()

plt.suptitle("Drift Gets Worse Over Time", fontsize=14, fontweight="bold")
plt.tight_layout()
plt.show()

# Key insight: The KS statistic and PSI both grow as drift increases.
# The p-value drops to near zero quickly — it's very sensitive.
# PSI gives a more gradual signal.
Part 7: Evidently — Automated Drift Reports
Instead of checking features one by one, let Evidently do it all at once.
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

# Drop target column — we're only checking input features
train_features = df_train.drop("target", axis=1)
prod_features = df_prod_shifted.drop("target", axis=1)

report = Report(metrics=[DataDriftPreset()])
report.run(reference_data=train_features, current_data=prod_features)
report  # bare expression renders the interactive report in a notebook

# Explore the report above:
# - How many features drifted?
# - Click on each feature — see the histograms
# - Which statistical test did Evidently choose?
# - Do the results match our manual KS tests?

# Save the report as an HTML file you can share
report.save_html("drift_report.html")
print("Report saved to drift_report.html — open in browser!")

# Part 8: Real-World Scenario — Wine Dataset
Imagine you trained a wine quality model on wines from Region A.
Now you deploy it on wines from Region B. Will it still work?
# Build the wine "regions": train on classes 0+1, deploy on class 2.
wine = load_wine()
df_wine = pd.DataFrame(wine.data, columns=wine.feature_names)
df_wine["target"] = wine.target

# "Region A" = class 0 and 1, "Region B" = class 2
df_region_a = df_wine[df_wine["target"].isin([0, 1])].drop("target", axis=1)
df_region_b = df_wine[df_wine["target"] == 2].drop("target", axis=1)
print(f"Region A (training): {len(df_region_a)} samples")
print(f"Region B (production): {len(df_region_b)} samples")

# Manual KS test
print(f"{'Feature':30s} {'KS stat':>8s} {'p-value':>10s} {'Verdict'}")
print("-" * 65)
for col in wine.feature_names:
    stat, p = ks_2samp(df_region_a[col], df_region_b[col])
    verdict = "DRIFT" if p < 0.05 else "OK"
    print(f"{col:30s} {stat:8.4f} {p:10.6f} [{verdict}]")

# Evidently report for wine data
report_wine = Report(metrics=[DataDriftPreset()])
report_wine.run(reference_data=df_region_a, current_data=df_region_b)
report_wine  # renders inline in a notebook

# Part 9: Impact on Model Performance
Drift detection tells you the inputs changed. But does the model actually perform worse? Let’s check.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Train on Region A
X_a = df_wine[df_wine["target"].isin([0, 1])].drop("target", axis=1)
y_a = df_wine[df_wine["target"].isin([0, 1])]["target"]
model = RandomForestClassifier(random_state=42)
scores_a = cross_val_score(model, X_a, y_a, cv=5)
print(f"Accuracy on Region A (training distribution): {scores_a.mean():.3f} ± {scores_a.std():.3f}")

# Train on Region A, test on Region B
model.fit(X_a, y_a)
X_b = df_wine[df_wine["target"] == 2].drop("target", axis=1)
y_b = df_wine[df_wine["target"] == 2]["target"]

# Model was trained on classes 0 and 1 — it has never seen class 2!
# Every prediction on Region B will be wrong.
preds = model.predict(X_b)
print(f"\nPredictions on Region B: {np.unique(preds, return_counts=True)}")
print(f"Actual labels in Region B: all class 2")
print(f"Accuracy on Region B: {(preds == y_b.values).mean():.3f}")
print(f"\n→ The model completely fails on the drifted data!")

# Summary
| What we learned | Tool |
|---|---|
| Visual drift check | matplotlib histograms |
| Statistical test (numeric) | scipy.stats.ks_2samp() |
| Statistical test (categorical) | scipy.stats.chi2_contingency() |
| Stability score | PSI (custom function) |
| Automated full report | evidently (4 lines) |
| Impact verification | Train on old data, test on new |
Key takeaway: Detecting drift is easy. The hard part is deciding what to do about it.