import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression, make_classification, load_iris
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
print("Ready to learn supervised learning! 🎯")

Lecture 3: Supervised Learning Deep Dive
Interactive Demo Notebook
In this notebook, we'll implement and visualize:

1. Linear Regression - Predicting numbers
2. Logistic Regression - Classifying into categories
3. Decision Trees - If-then rules
4. K-Nearest Neighbors - Vote by neighbors
5. Evaluation Metrics - How good is our model?
Part 1: Linear Regression - Fitting a Line
Our goal: Given house size, predict price
# --- Toy dataset: house size (sqft) → price (₹ lakhs) ---
np.random.seed(42)  # reproducible noise
house_size = np.array([1000, 1500, 2000, 2500, 3000, 1200, 1800, 2200, 2800, 3500])
# Underlying relationship: price ≈ 0.04 * size + 10, plus Gaussian noise
noise = np.random.randn(10) * 5
price = 0.04 * house_size + noise + 10  # Price in lakhs

# Scatter plot of the raw data
plt.figure(figsize=(10, 6))
plt.scatter(house_size, price, s=100, linewidth=2, edgecolors='white', color='#1e3a5f')
plt.title('House Size vs Price', fontsize=14)
plt.xlabel('House Size (sqft)', fontsize=12)
plt.ylabel('Price (₹ Lakhs)', fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()
print("🤔 Can you see a pattern? Bigger house → Higher price!")  # Fit a line!
# Fit ordinary least squares: price = w * size + b
from sklearn.linear_model import LinearRegression

X = house_size.reshape(-1, 1)  # sklearn expects a 2-D feature matrix
y = price
model = LinearRegression().fit(X, y)  # fit() returns the estimator itself

# Inspect the learned parameters
print(f"Slope (w): {model.coef_[0]:.4f}")
print(f"Intercept (b): {model.intercept_:.2f}")
print(f"\n📐 Equation: Price = {model.coef_[0]:.4f} × Size + {model.intercept_:.2f}")  # Visualize the fitted line
# Overlay the learned regression line on the scatter of actual data
plt.figure(figsize=(10, 6))
plt.scatter(house_size, price, s=100, linewidth=2, edgecolors='white',
            color='#1e3a5f', label='Actual data')

# Evaluate the model on a dense grid of sizes to draw the line
grid = np.linspace(800, 3700, 100)
plt.plot(grid, model.predict(grid.reshape(-1, 1)),
         color='#e85a4f', linewidth=3, label='Best fit line')

plt.title('Linear Regression: Finding the Best Fit Line', fontsize=14)
plt.xlabel('House Size (sqft)', fontsize=12)
plt.ylabel('Price (₹ Lakhs)', fontsize=12)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()

# Predict the price of a house the model has never seen
new_house = 1750
predicted_price = model.predict([[new_house]])[0]
print(f"\n🏠 A {new_house} sqft house is predicted to cost ₹{predicted_price:.1f} lakhs")

Part 2: Logistic Regression - Classification
Our goal: Predict if a student will pass (1) or fail (0) based on study hours
# --- Toy dataset: study hours → pass (1) / fail (0) ---
np.random.seed(42)
study_hours = np.array([1, 2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5, 6, 6.5, 7, 8, 9, 10])
passed = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1])

# One colour per outcome: green for pass, red for fail
colors = ['#2a9d8f' if p else '#e85a4f' for p in passed]

plt.figure(figsize=(10, 4))
plt.scatter(study_hours, passed, c=colors, s=150, edgecolors='white', linewidth=2)
plt.title('Study Hours vs Exam Result', fontsize=14)
plt.xlabel('Study Hours', fontsize=12)
plt.ylabel('Passed (1) / Failed (0)', fontsize=12)
plt.yticks([0, 1], ['Failed', 'Passed'])
plt.grid(True, alpha=0.3)
plt.show()
print("🔴 Red = Failed, 🟢 Green = Passed")  # Fit logistic regression
# Fit a logistic model: P(pass | hours) = sigmoid(w * hours + b)
from sklearn.linear_model import LogisticRegression

X = study_hours.reshape(-1, 1)
y = passed
log_model = LogisticRegression()
log_model.fit(X, y)

# Predicted probability of passing across a fine grid of hours
hours_grid = np.linspace(0, 11, 100)
pass_prob = log_model.predict_proba(hours_grid.reshape(-1, 1))[:, 1]

plt.figure(figsize=(10, 5))
plt.scatter(study_hours, passed, c=colors, s=150, edgecolors='white', linewidth=2, zorder=5)
plt.plot(hours_grid, pass_prob, color='#1e3a5f', linewidth=3, label='Probability of passing')
plt.axhline(y=0.5, color='gray', linestyle='--', alpha=0.7, label='Decision boundary (50%)')
plt.title('Logistic Regression: The Sigmoid Curve', fontsize=14)
plt.xlabel('Study Hours', fontsize=12)
plt.ylabel('Probability of Passing', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Grid point whose probability is closest to 0.5 ≈ the decision threshold
threshold_hours = hours_grid[np.argmin(np.abs(pass_prob - 0.5))]
print(f"\n📊 Decision threshold: ~{threshold_hours:.1f} hours of study")
print(f" Below: Likely to fail | Above: Likely to pass")

Part 3: Decision Trees - If-Then Rules
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Iris dataset, keeping only the two petal features for easy 2-D plots
iris = load_iris()
X = iris.data[:, 2:4]  # petal length, petal width
y = iris.target

# Shallow tree (depth 3) so the diagram stays readable
tree = DecisionTreeClassifier(max_depth=3, random_state=42)
tree.fit(X, y)

# Draw the learned if-then structure
plt.figure(figsize=(15, 8))
plot_tree(
    tree,
    feature_names=['Petal Length', 'Petal Width'],
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    fontsize=10,
)
plt.title('Decision Tree: If-Then Rules for Iris Classification', fontsize=14)
plt.tight_layout()
plt.show()
print("💡 Read from top to bottom: Each node asks a yes/no question!")  # Visualize the decision boundaries
from matplotlib.colors import ListedColormap  # NOTE(review): imported but unused below — confirm before removing


def plot_decision_boundary(model, X, y, title):
    """Shade the 2-D feature plane by predicted class and overlay the data.

    model : fitted classifier exposing .predict
    X     : (n_samples, 2) feature matrix (here: petal length, petal width)
    y     : class labels, used only to colour the scatter points
    title : figure title
    """
    # Bounding box of the data, padded by half a unit on each side
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5

    # Dense 200x200 grid covering the box
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                         np.linspace(y_min, y_max, 200))

    # Classify every grid point, then reshape predictions back onto the grid
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    plt.figure(figsize=(10, 6))
    plt.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlGn')
    scatter = plt.scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlGn', edgecolors='white', s=80)
    plt.xlabel('Petal Length (cm)', fontsize=12)
    plt.ylabel('Petal Width (cm)', fontsize=12)
    plt.title(title, fontsize=14)
    plt.colorbar(scatter, label='Species')
    plt.show()
plot_decision_boundary(tree, X, y, 'Decision Tree: Rectangular Boundaries')

Part 4: K-Nearest Neighbors - Vote by Neighbors
from sklearn.neighbors import KNeighborsClassifier

# Compare decision boundaries for three choices of K, side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for idx, (ax, k) in enumerate(zip(axes, (1, 5, 15))):
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X, y)

    # Grid covering the data, padded by half a unit on each side
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    ax.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlGn')
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlGn', edgecolors='white', s=50)
    ax.set_title(f'K = {k}', fontsize=14)
    ax.set_xlabel('Petal Length')
    if idx == 0:  # label the shared y-axis only once
        ax.set_ylabel('Petal Width')

plt.suptitle('K-NN: Effect of Number of Neighbors', fontsize=16, y=1.02)
plt.tight_layout()
plt.show()
print("💡 K=1: Memorizes data (overfits) | K=15: Smoother boundaries")

Part 5: Evaluation Metrics
How do we know if our model is good?
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns

# Hold out 30% of the data for honest evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit K-NN on the training split only, then predict the held-out set
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Confusion matrix: rows = actual class, columns = predicted class
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=iris.target_names,
            yticklabels=iris.target_names,
            annot_kws={'size': 16})
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('Actual', fontsize=12)
plt.title('Confusion Matrix', fontsize=14)
plt.show()

print("📊 Reading the confusion matrix:")
print(" - Diagonal = Correct predictions")
print(" - Off-diagonal = Mistakes")  # Calculate metrics
# Weighted averaging aggregates the per-class scores by class support
avg = 'weighted'
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=avg)
recall = recall_score(y_test, y_pred, average=avg)
f1 = f1_score(y_test, y_pred, average=avg)

print("📈 Evaluation Metrics:")
print(f" Accuracy: {accuracy:.1%} (% correct overall)")
print(f" Precision: {precision:.1%} (of predicted positives, how many were right?)")
print(f" Recall: {recall:.1%} (of actual positives, how many did we find?)")
print(f" F1 Score: {f1:.1%} (balance between precision and recall)")

🎯 Exercises
# Exercise 1: Create a linear regression on California housing data
# from sklearn.datasets import fetch_california_housing
# Predict house value from median income
# Exercise 2: Compare Decision Tree vs KNN on digits dataset
# Which gets higher accuracy? Why?
Summary
| Algorithm | Type | Key Idea | When to Use |
|---|---|---|---|
| Linear Regression | Regression | Fit a line | Predicting numbers |
| Logistic Regression | Classification | S-curve probability | Binary yes/no |
| Decision Trees | Both | If-then rules | When interpretability matters |
| K-NN | Both | Vote by neighbors | Simple baseline |