"""Lecture 02: Data Foundation - Live Demos

Quick runnable examples for in-class demonstrations.
"""
# --- 1. Train/Test Split ---
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

# Load data: 150 iris samples, 3 classes
X, y = load_iris(return_X_y=True)
print(f"Total samples: {len(X)}")

# Split: 80% train, 20% test (random_state fixed so every run is identical)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Train: {len(X_train)}, Test: {len(X_test)}")

# Train and evaluate: compare train vs. test accuracy to show the
# train score alone overstates how well the model generalizes.
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)
print(f"\nTrain accuracy: {train_acc:.1%}")
print(f"Test accuracy: {test_acc:.1%}")

# --- 2. When Accuracy Fails (Imbalanced Data) ---
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

# Simulate a heavily imbalanced screening population: 990 healthy, 10 cancer
y_true = np.array([0]*990 + [1]*10)  # 0=healthy, 1=cancer

# Dumb model: always predicts healthy — still scores 99% accuracy
y_pred_dumb = np.zeros(1000)

print("Dumb model (always predict healthy):")
print(f" Accuracy: {accuracy_score(y_true, y_pred_dumb):.1%}")
# ...but it catches zero of the cases that actually matter
print(f" Cancer cases caught: {sum((y_pred_dumb == 1) & (y_true == 1))}/{sum(y_true == 1)}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred_dumb))

# --- 3. Confusion Matrix, Precision, Recall ---
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score, f1_score,
    ConfusionMatrixDisplay
)
import matplotlib.pyplot as plt

# Simulated predictions (from our slide example)
# 100 cancer patients, 900 healthy
y_true = np.array([1]*100 + [0]*900)
# Model: catches 85/100 cancer, but also flags 50 healthy as cancer
y_pred = np.array([1]*85 + [0]*15 + [1]*50 + [0]*850)

cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(cm)

# Precision = 85/135, Recall = 85/100 — each metric tells a different story
print(f"\nPrecision: {precision_score(y_true, y_pred):.2f}")
print(f"Recall: {recall_score(y_true, y_pred):.2f}")
print(f"F1 Score: {f1_score(y_true, y_pred):.2f}")
print(f"Accuracy: {accuracy_score(y_true, y_pred):.2f}")

# Plot the confusion matrix for the class discussion
fig, ax = plt.subplots(figsize=(6, 5))
ConfusionMatrixDisplay(cm, display_labels=['Healthy', 'Cancer']).plot(ax=ax)
ax.set_title('Cancer Screening Model')
plt.tight_layout()
plt.show()

# --- 4. One-Hot Encoding ---
import pandas as pd

# Tomato dataset: two categorical features plus the label
df = pd.DataFrame({
    'Color': ['Orange', 'Red', 'Orange', 'Yellow'],
    'Size': ['Small', 'Small', 'Medium', 'Large'],
    'Quality': ['Good', 'Good', 'Bad', 'Bad']
})
print("Original:")
print(df)

# Encode only the features, not the 'Quality' label
print("\nOne-Hot Encoded:")
print(pd.get_dummies(df[['Color', 'Size']]))

# --- 5. The sklearn Universal Pattern ---
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Same fit / predict / score pattern for ALL models!
for name, model in [
    ('Decision Tree', DecisionTreeClassifier()),
    ('Logistic Reg', LogisticRegression(max_iter=200)),
    ('KNN (k=5)', KNeighborsClassifier()),
]:
    model.fit(X_train, y_train)          # Train
    preds = model.predict(X_test)        # Predict
    acc = accuracy_score(y_test, preds)  # Evaluate
    print(f"{name:15s}: {acc:.1%}")

# --- 6. Multi-Class F1 (Cat/Dog/Bird Example) ---
# Verify the hand calculations from the slides!
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import numpy as np

# From our slide: Cat/Dog/Bird confusion matrix (10 true samples per class)
y_true = ['Cat']*10 + ['Dog']*10 + ['Bird']*10
y_pred = (['Cat']*8 + ['Dog']*1 + ['Bird']*1 +  # 8 correct cats
          ['Cat']*2 + ['Dog']*6 + ['Bird']*2 +  # 6 correct dogs
          ['Cat']*0 + ['Dog']*1 + ['Bird']*9)   # 9 correct birds

print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred, labels=['Cat', 'Dog', 'Bird']))
print("\nDetailed Report:")
print(classification_report(y_true, y_pred))

# Macro = unweighted mean of per-class F1; weighted = support-weighted;
# micro = global TP/FP/FN (equals accuracy for single-label multi-class).
print("F1 Scores:")
print(f" Macro: {f1_score(y_true, y_pred, average='macro'):.3f}")
print(f" Weighted: {f1_score(y_true, y_pred, average='weighted'):.3f}")
print(f" Micro: {f1_score(y_true, y_pred, average='micro'):.3f}")
print("\nDo these match your hand calculations?")