Lecture 3: Supervised Learning Deep Dive

Interactive Demo Notebook

In this notebook, we’ll implement and visualize:

1. Linear Regression - Predicting numbers
2. Logistic Regression - Classifying into categories
3. Decision Trees - If-then rules
4. K-Nearest Neighbors - Vote by neighbors
5. Evaluation Metrics - How good is our model?

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression, make_classification, load_iris
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
print("Ready to learn supervised learning! 🎯")

Part 1: Linear Regression - Fitting a Line

Our goal: Given house size, predict price

# Create simple data: House size → Price
np.random.seed(42)
house_size = np.array([1000, 1500, 2000, 2500, 3000, 1200, 1800, 2200, 2800, 3500])
price = 0.04 * house_size + np.random.randn(10) * 5 + 10  # Price in lakhs

# Visualize
plt.figure(figsize=(10, 6))
plt.scatter(house_size, price, s=100, color='#1e3a5f', edgecolors='white', linewidth=2)
plt.xlabel('House Size (sqft)', fontsize=12)
plt.ylabel('Price (₹ Lakhs)', fontsize=12)
plt.title('House Size vs Price', fontsize=14)
plt.grid(True, alpha=0.3)
plt.show()

print("🤔 Can you see a pattern? Bigger house → Higher price!")
# Fit a line!
from sklearn.linear_model import LinearRegression

X = house_size.reshape(-1, 1)  # sklearn needs 2D array
y = price

model = LinearRegression()
model.fit(X, y)

print(f"Slope (w): {model.coef_[0]:.4f}")
print(f"Intercept (b): {model.intercept_:.2f}")
print(f"\n📐 Equation: Price = {model.coef_[0]:.4f} × Size + {model.intercept_:.2f}")
# Visualize the fitted line
plt.figure(figsize=(10, 6))
plt.scatter(house_size, price, s=100, color='#1e3a5f', edgecolors='white', linewidth=2, label='Actual data')

# Plot the line
x_line = np.linspace(800, 3700, 100)
y_line = model.predict(x_line.reshape(-1, 1))
plt.plot(x_line, y_line, color='#e85a4f', linewidth=3, label='Best fit line')

plt.xlabel('House Size (sqft)', fontsize=12)
plt.ylabel('Price (₹ Lakhs)', fontsize=12)
plt.title('Linear Regression: Finding the Best Fit Line', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()

# Make a prediction
new_house = 1750
predicted_price = model.predict([[new_house]])[0]
print(f"\n🏠 A {new_house} sqft house is predicted to cost ₹{predicted_price:.1f} lakhs")

Part 2: Logistic Regression - Classification

Our goal: Predict if a student will pass (1) or fail (0) based on study hours

# Create data: Study hours → Pass/Fail
np.random.seed(42)
study_hours = np.array([1, 2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5, 6, 6.5, 7, 8, 9, 10])
passed = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1])

# Visualize
plt.figure(figsize=(10, 4))
colors = ['#e85a4f' if p == 0 else '#2a9d8f' for p in passed]
plt.scatter(study_hours, passed, c=colors, s=150, edgecolors='white', linewidth=2)
plt.xlabel('Study Hours', fontsize=12)
plt.ylabel('Passed (1) / Failed (0)', fontsize=12)
plt.title('Study Hours vs Exam Result', fontsize=14)
plt.yticks([0, 1], ['Failed', 'Passed'])
plt.grid(True, alpha=0.3)
plt.show()

print("🔴 Red = Failed, 🟢 Green = Passed")
# Fit logistic regression
from sklearn.linear_model import LogisticRegression

X = study_hours.reshape(-1, 1)
y = passed

log_model = LogisticRegression()
log_model.fit(X, y)

# The S-curve (sigmoid)
x_range = np.linspace(0, 11, 100)
y_prob = log_model.predict_proba(x_range.reshape(-1, 1))[:, 1]

plt.figure(figsize=(10, 5))
plt.scatter(study_hours, passed, c=colors, s=150, edgecolors='white', linewidth=2, zorder=5)
plt.plot(x_range, y_prob, color='#1e3a5f', linewidth=3, label='Probability of passing')
plt.axhline(y=0.5, color='gray', linestyle='--', alpha=0.7, label='Decision boundary (50%)')
plt.xlabel('Study Hours', fontsize=12)
plt.ylabel('Probability of Passing', fontsize=12)
plt.title('Logistic Regression: The Sigmoid Curve', fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Find the threshold
threshold_hours = x_range[np.argmin(np.abs(y_prob - 0.5))]
print(f"\n📊 Decision threshold: ~{threshold_hours:.1f} hours of study")
print(f"   Below: Likely to fail | Above: Likely to pass")

Part 3: Decision Trees - If-Then Rules

from sklearn.tree import DecisionTreeClassifier, plot_tree

# Use Iris data
iris = load_iris()
X = iris.data[:, 2:4]  # Just petal length and width
y = iris.target

# Train a simple tree (limited depth for clarity)
tree = DecisionTreeClassifier(max_depth=3, random_state=42)
tree.fit(X, y)

# Visualize the tree
plt.figure(figsize=(15, 8))
plot_tree(tree, 
          feature_names=['Petal Length', 'Petal Width'],
          class_names=iris.target_names,
          filled=True,
          rounded=True,
          fontsize=10)
plt.title('Decision Tree: If-Then Rules for Iris Classification', fontsize=14)
plt.tight_layout()
plt.show()

print("💡 Read from top to bottom: Each node asks a yes/no question!")
# Visualize the decision boundaries
from matplotlib.colors import ListedColormap

def plot_decision_boundary(model, X, y, title):
    """Shade the model's predicted class over a fine grid, then overlay the data."""
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                         np.linspace(y_min, y_max, 200))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    plt.figure(figsize=(10, 6))
    plt.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlGn')
    scatter = plt.scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlGn', edgecolors='white', s=80)
    plt.xlabel('Petal Length (cm)', fontsize=12)
    plt.ylabel('Petal Width (cm)', fontsize=12)
    plt.title(title, fontsize=14)
    plt.colorbar(scatter, label='Species')
    plt.show()

plot_decision_boundary(tree, X, y, 'Decision Tree: Rectangular Boundaries')
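
The rectangles come from axis-aligned splits; deeper trees carve ever finer rectangles and eventually memorize the training set. A quick sketch (supplementary, not in the original demo) makes this visible:

# Deeper trees fit the training data ever more closely (overfitting risk)
for depth in [1, 3, None]:
    t = DecisionTreeClassifier(max_depth=depth, random_state=42).fit(X, y)
    print(f"max_depth={depth}: training accuracy = {t.score(X, y):.2%}")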

Part 4: K-Nearest Neighbors - Vote by Neighbors

from sklearn.neighbors import KNeighborsClassifier

# Different values of K
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for idx, k in enumerate([1, 5, 15]):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, y)
    
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    axes[idx].contourf(xx, yy, Z, alpha=0.3, cmap='RdYlGn')
    axes[idx].scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlGn', edgecolors='white', s=50)
    axes[idx].set_title(f'K = {k}', fontsize=14)
    axes[idx].set_xlabel('Petal Length')
    if idx == 0:
        axes[idx].set_ylabel('Petal Width')

plt.suptitle('K-NN: Effect of Number of Neighbors', fontsize=16, y=1.02)
plt.tight_layout()
plt.show()

print("💡 K=1: Memorizes data (overfits) | K=15: Smoother boundaries")

Part 5: Evaluation Metrics

How do we know if our model is good?

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train and predict
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=iris.target_names, 
            yticklabels=iris.target_names,
            annot_kws={'size': 16})
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('Actual', fontsize=12)
plt.title('Confusion Matrix', fontsize=14)
plt.show()

print("📊 Reading the confusion matrix:")
print("   - Diagonal = Correct predictions")
print("   - Off-diagonal = Mistakes")
# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("📈 Evaluation Metrics:")
print(f"   Accuracy:  {accuracy:.1%}  (% correct overall)")
print(f"   Precision: {precision:.1%}  (of predicted positives, how many were right?)")
print(f"   Recall:    {recall:.1%}  (of actual positives, how many did we find?)")
print(f"   F1 Score:  {f1:.1%}  (balance between precision and recall)")

🎯 Exercises

# Exercise 1: Fit a linear regression on the California housing data
# from sklearn.datasets import fetch_california_housing
# Predict median house value from median income
# Exercise 2: Compare a Decision Tree vs. K-NN on the digits dataset
# Which gets higher accuracy? Why?

Summary

| Algorithm | Type | Key Idea | When to Use |
|---|---|---|---|
| Linear Regression | Regression | Fit a line | Predicting numbers |
| Logistic Regression | Classification | S-curve probability | Binary yes/no |
| Decision Trees | Both | If-then rules | When interpretability matters |
| K-NN | Both | Vote by neighbors | Simple baseline |