Lecture 02: Data Foundation - Live Demos

Quick runnable examples for in-class demonstrations.

1. Train/Test Split

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

# Load the iris dataset (150 samples, 4 features, 3 classes).
features, labels = load_iris(return_X_y=True)
print(f"Total samples: {len(features)}")

# Hold out 20% of the samples for testing; fixed seed for a reproducible demo.
features_tr, features_te, labels_tr, labels_te = train_test_split(
    features, labels, test_size=0.2, random_state=42
)
print(f"Train: {len(features_tr)}, Test: {len(features_te)}")

# Fit a decision tree on the training split only.
clf = DecisionTreeClassifier()
clf.fit(features_tr, labels_tr)

# Compare accuracy on seen (train) vs. held-out (test) data.
train_acc = clf.score(features_tr, labels_tr)
test_acc = clf.score(features_te, labels_te)
print(f"\nTrain accuracy: {train_acc:.1%}")
print(f"Test accuracy:  {test_acc:.1%}")

2. When Accuracy Fails (Imbalanced Data)

import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

# Simulate a heavily imbalanced screening dataset: 990 healthy (0), 10 cancer (1).
y_true = np.array([0] * 990 + [1] * 10)  # 0=healthy, 1=cancer

# Baseline that ignores the input entirely and always predicts "healthy".
y_pred_dumb = np.zeros(1000)

# Despite ~99% accuracy, it catches zero positive cases — accuracy misleads here.
caught = sum((y_pred_dumb == 1) & (y_true == 1))
total_pos = sum(y_true == 1)
print("Dumb model (always predict healthy):")
print(f"  Accuracy: {accuracy_score(y_true, y_pred_dumb):.1%}")
print(f"  Cancer cases caught: {caught}/{total_pos}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred_dumb))

3. Confusion Matrix, Precision, Recall

from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score, f1_score,
    ConfusionMatrixDisplay
)
import matplotlib.pyplot as plt

# Slide scenario: 100 cancer patients (1), 900 healthy (0).
y_true = np.array([1] * 100 + [0] * 900)

# The model catches 85/100 cancers but also flags 50 healthy people as cancer.
pred_parts = [1] * 85 + [0] * 15 + [1] * 50 + [0] * 850
y_pred = np.array(pred_parts)

cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(cm)

# Compute each metric once, then report — precision penalizes false alarms,
# recall penalizes missed cases; F1 balances the two.
prec = precision_score(y_true, y_pred)
rec = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
acc = accuracy_score(y_true, y_pred)
print(f"\nPrecision: {prec:.2f}")
print(f"Recall:    {rec:.2f}")
print(f"F1 Score:  {f1:.2f}")
print(f"Accuracy:  {acc:.2f}")

# Render the confusion matrix as a labeled heatmap.
fig, axis = plt.subplots(figsize=(6, 5))
display = ConfusionMatrixDisplay(cm, display_labels=['Healthy', 'Cancer'])
display.plot(ax=axis)
axis.set_title('Cancer Screening Model')
plt.tight_layout()
plt.show()

4. One-Hot Encoding

import pandas as pd

# Tiny categorical dataset for the encoding demo.
rows = {
    'Color': ['Orange', 'Red', 'Orange', 'Yellow'],
    'Size': ['Small', 'Small', 'Medium', 'Large'],
    'Quality': ['Good', 'Good', 'Bad', 'Bad'],
}
df = pd.DataFrame(rows)

print("Original:")
print(df)

# get_dummies expands each categorical column into 0/1 indicator columns.
encoded = pd.get_dummies(df[['Color', 'Size']])
print("\nOne-Hot Encoded:")
print(encoded)

5. The sklearn Universal Pattern

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X, y = load_iris(return_X_y=True)
# No random_state here: each run gives a slightly different split (demo point).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Every sklearn estimator follows the same fit -> predict -> evaluate contract,
# so swapping models changes only the constructor call.
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Logistic Reg': LogisticRegression(max_iter=200),
    'KNN (k=5)': KNeighborsClassifier(),
}
for name, model in models.items():
    model.fit(X_train, y_train)           # Train
    predictions = model.predict(X_test)   # Predict
    acc = accuracy_score(y_test, predictions)  # Evaluate
    print(f"{name:15s}: {acc:.1%}")

6. Multi-Class F1 (Cat/Dog/Bird Example) — verify the hand calculations from the slides!

# Demo 6: multi-class F1 on the Cat/Dog/Bird slide example.
# (Reconstructed: this section's line breaks were lost in extraction.)
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import numpy as np

# From our slide: Cat/Dog/Bird confusion matrix — 10 true examples per class.
y_true = ['Cat'] * 10 + ['Dog'] * 10 + ['Bird'] * 10
y_pred = (['Cat'] * 8 + ['Dog'] * 1 + ['Bird'] * 1 +    # 8 correct cats
          ['Cat'] * 2 + ['Dog'] * 6 + ['Bird'] * 2 +    # 6 correct dogs
          ['Cat'] * 0 + ['Dog'] * 1 + ['Bird'] * 9)     # 9 correct birds

# Fix the label order so the matrix rows/columns match the slide.
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred, labels=['Cat', 'Dog', 'Bird']))

print("\nDetailed Report:")
print(classification_report(y_true, y_pred))

# Macro = unweighted mean of per-class F1; weighted = support-weighted mean;
# micro = global F1 over pooled counts (equals accuracy for single-label tasks).
print("F1 Scores:")
print(f"  Macro:    {f1_score(y_true, y_pred, average='macro'):.3f}")
print(f"  Weighted: {f1_score(y_true, y_pred, average='weighted'):.3f}")
print(f"  Micro:    {f1_score(y_true, y_pred, average='micro'):.3f}")
print("\nDo these match your hand calculations?")