Lecture 6: Computer Vision - How Machines See

Interactive Demo Notebook

In this notebook, we'll explore:

1. Images as Data - Pixels and arrays
2. Convolution - The key operation
3. CNNs in Action - Real image classification
4. Object Detection - What AND where
5. YOLO Demo - Real-time detection

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plot styling: whitegrid look and a roomy default canvas.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)  # default figure size in inches
print("Ready to see how machines see! 👁️")

Part 1: Images as Data

To a computer, an image is just a grid of numbers!

# Load MNIST-style digits: 8x8 grayscale images with pixel values 0-16,
# where a larger value means more "ink" at that location.
from sklearn.datasets import load_digits

digits = load_digits()
sample_image = digits.images[0]  # first digit image (2-D numpy array)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Visual representation
axes[0].imshow(sample_image, cmap='gray')
axes[0].set_title(f'What we see: Digit "{digits.target[0]}"', fontsize=14)
axes[0].axis('off')

# Numerical representation: overlay each pixel's value on the image.
axes[1].imshow(sample_image, cmap='gray')
for i in range(8):
    for j in range(8):
        val = int(sample_image[i, j])
        # BUG FIX: with cmap='gray' a HIGH value renders as a BRIGHT pixel,
        # so the label there must be dark (and white on dark pixels).
        # The original condition was inverted, making labels invisible.
        color = 'black' if val > 8 else 'white'
        axes[1].text(j, i, val, ha='center', va='center', fontsize=8, color=color)
axes[1].set_title('What the computer sees: Numbers!', fontsize=14)
axes[1].axis('off')

plt.tight_layout()
plt.show()

print(f"Image shape: {sample_image.shape} (8×8 pixels)")
print(f"Pixel values range: {sample_image.min():.0f} to {sample_image.max():.0f}")
# Fixed to match the gray colormap display: high value -> bright pixel.
print("\n💡 Higher number = brighter pixel (more ink in the original digit)")
# RGB Color Images
# Build a synthetic RGB image made of three horizontal color stripes.
rgb_image = np.zeros((100, 100, 3), dtype=np.uint8)
rgb_image[:33, :, 0] = 255   # top third: pure red
rgb_image[33:66, :, 1] = 255  # middle third: pure green
rgb_image[66:, :, 2] = 255   # bottom third: pure blue

fig, axes = plt.subplots(1, 4, figsize=(14, 3))

axes[0].imshow(rgb_image)
axes[0].set_title('RGB Image', fontsize=12)
axes[0].axis('off')

# One panel per color channel, rendered with a matching colormap.
channel_specs = [(0, 'Reds', 'Red'), (1, 'Greens', 'Green'), (2, 'Blues', 'Blue')]
for panel, (channel, cmap_name, label) in enumerate(channel_specs, start=1):
    axes[panel].imshow(rgb_image[:, :, channel], cmap=cmap_name)
    axes[panel].set_title(f'{label} Channel', fontsize=12)
    axes[panel].axis('off')

plt.tight_layout()
plt.show()

print("💡 Color images have 3 channels: Red, Green, Blue")
print(f"   Shape: {rgb_image.shape} = (Height, Width, Channels)")

Part 2: Convolution - The Magic Operation

Slide a small filter over the image to detect patterns

from scipy import ndimage

# Tiny test image: a white square on black — nothing but sharp edges.
simple_image = np.zeros((10, 10))
simple_image[2:8, 2:8] = 1

# Sobel kernels: each responds to intensity change along one axis.
edge_filter_h = np.array([[-1, 0, 1],
                          [-2, 0, 2],
                          [-1, 0, 1]])

edge_filter_v = np.array([[-1, -2, -1],
                          [ 0,  0,  0],
                          [ 1,  2,  1]])

# Convolve, then fuse the two directional responses into edge magnitude.
edges_h = ndimage.convolve(simple_image, edge_filter_h)
edges_v = ndimage.convolve(simple_image, edge_filter_v)
edges_combined = np.sqrt(edges_h**2 + edges_v**2)

fig, axes = plt.subplots(1, 4, figsize=(14, 3))

# (image, colormap, title) for each of the four panels.
panels = [
    (simple_image, 'gray', 'Original Image'),
    (edges_h, 'RdBu', 'Horizontal Edges'),
    (edges_v, 'RdBu', 'Vertical Edges'),
    (edges_combined, 'hot', 'All Edges'),
]
for ax, (img, cmap_name, title) in zip(axes, panels):
    ax.imshow(img, cmap=cmap_name)
    ax.set_title(title, fontsize=12)
    ax.axis('off')

plt.tight_layout()
plt.show()

print("💡 Convolution detects patterns like edges, corners, textures")
# More fun with filters!
# Box-blur kernel: every output pixel is the mean of its 5x5 neighbourhood.
blur_filter = np.ones((5, 5)) / 25

# Sharpening kernel: boosts the centre pixel relative to its neighbours.
sharpen_filter = np.array([[0, -1, 0],
                           [-1, 5, -1],
                           [0, -1, 0]])

# Apply each filter to the sample digit image.
blurred = ndimage.convolve(sample_image, blur_filter)
sharpened = ndimage.convolve(sample_image, sharpen_filter)
edges = ndimage.convolve(sample_image, edge_filter_h)

fig, axes = plt.subplots(1, 4, figsize=(14, 3))

# Pair each result with its panel title and plot them all the same way.
gallery = [(sample_image, 'Original'), (blurred, 'Blurred'),
           (sharpened, 'Sharpened'), (edges, 'Edges')]
for ax, (img, title) in zip(axes, gallery):
    ax.imshow(img, cmap='gray')
    ax.set_title(title, fontsize=12)
    ax.axis('off')

plt.tight_layout()
plt.show()

print("💡 Different filters = Different features detected")
print("   CNNs LEARN the best filters automatically!")

Part 3: CNNs for Image Classification

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Reshape to (N, Channels, Height, Width) — the layout Conv2d expects.
X = digits.images.reshape(-1, 1, 8, 8)
y = digits.target

# Simple hold-out split: the first 1400 images train, the rest test.
split = 1400
X_train = torch.tensor(X[:split], dtype=torch.float32)
y_train = torch.tensor(y[:split], dtype=torch.long)
X_test = torch.tensor(X[split:], dtype=torch.float32)
y_test = torch.tensor(y[split:], dtype=torch.long)

print(f"Training set: {len(X_train)} images")
print(f"Test set: {len(X_test)} images")
print(f"Image shape: {X_train[0].shape} (1 channel, 8×8)")
# Define a simple CNN
class SimpleCNN(nn.Module):
    """Two conv/pool blocks followed by a small fully connected classifier.

    Input:  (N, 1, 8, 8) float tensor of digit images.
    Output: (N, 10) raw class scores (logits), one per digit class.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: padding=1 keeps spatial size through each conv.
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)   # (1, 8, 8) -> (16, 8, 8)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)  # (16, 4, 4) -> (32, 4, 4)
        self.pool = nn.MaxPool2d(2, 2)  # halves height and width
        self.relu = nn.ReLU()

        # Classifier head over the flattened 32 x 2 x 2 feature map.
        self.fc1 = nn.Linear(32 * 2 * 2, 64)
        self.fc2 = nn.Linear(64, 10)  # one logit per digit

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))  # 8x8 -> 4x4
        x = self.pool(self.relu(self.conv2(x)))  # 4x4 -> 2x2
        x = x.view(-1, 32 * 2 * 2)               # flatten spatial dims
        return self.fc2(self.relu(self.fc1(x)))

model = SimpleCNN()
print(model)
# Count every learnable tensor element (conv kernels, biases, FC weights).
print(f"\nTotal parameters: {sum(p.numel() for p in model.parameters()):,}")
# Training
criterion = nn.CrossEntropyLoss()  # expects raw logits + integer labels
optimizer = optim.Adam(model.parameters(), lr=0.01)

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

losses = []
for epoch in range(20):
    running_loss = 0.0
    for images, labels in train_loader:
        optimizer.zero_grad()                      # clear old gradients
        loss = criterion(model(images), labels)    # forward pass + loss
        loss.backward()                            # backpropagate
        optimizer.step()                           # update weights
        running_loss += loss.item()

    # Record the mean batch loss for this epoch; report every 5th epoch.
    losses.append(running_loss / len(train_loader))
    if epoch % 5 == 0:
        print(f"Epoch {epoch:2d}, Loss: {losses[-1]:.4f}")

print("\nTraining complete!")
# Evaluate
model.eval()  # inference mode flags (no dropout/batchnorm here, but good habit)
with torch.no_grad():
    outputs = model(X_test)
    _, predicted = torch.max(outputs, 1)
    accuracy = (predicted == y_test).float().mean()

print(f"🎯 Test Accuracy: {accuracy:.1%}")

# Show some predictions: green titles for hits, red for misses.
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for idx, ax in enumerate(axes.flat):
    ax.imshow(X_test[idx, 0].numpy(), cmap='gray')
    guess = predicted[idx].item()
    truth = y_test[idx].item()
    ax.set_title(f'Pred: {guess}, True: {truth}',
                 color='green' if guess == truth else 'red', fontsize=10)
    ax.axis('off')

plt.suptitle('CNN Predictions on Test Digits', fontsize=14)
plt.tight_layout()
plt.show()

Part 4: Object Detection Concepts

Classification: What is in the image?
Detection: What AND where?

# Visualize classification vs. localization vs. detection side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Create a simple "scene": two colored circles on a light gray background.
scene = np.ones((200, 300, 3)) * 0.9  # light gray background

# Vectorized circle drawing: one boolean mask per circle instead of a
# Python double loop over all 60,000 pixels (same output, much faster).
# The two circles do not overlap, so assignment order is irrelevant.
rows, cols = np.ogrid[:200, :300]
cat_mask = (rows - 80) ** 2 + (cols - 80) ** 2 < 30 ** 2
dog_mask = (rows - 120) ** 2 + (cols - 220) ** 2 < 40 ** 2
scene[cat_mask] = [0.8, 0.6, 0.4]  # brown circle ("cat")
scene[dog_mask] = [0.3, 0.5, 0.8]  # blue circle ("dog")

# Classification: a single label for the whole image.
axes[0].imshow(scene)
axes[0].set_title('Classification:\n"Cat and Dog"', fontsize=12)
axes[0].axis('off')

# Localization: one label plus one bounding box.
axes[1].imshow(scene)
rect = plt.Rectangle((50, 50), 60, 60, fill=False, edgecolor='red', linewidth=3)
axes[1].add_patch(rect)
axes[1].text(80, 45, 'Cat', ha='center', color='red', fontsize=12, fontweight='bold')
axes[1].set_title('Localization:\n"Cat" + bounding box', fontsize=12)
axes[1].axis('off')

# Detection: several labels, each with its own bounding box.
axes[2].imshow(scene)
rect1 = plt.Rectangle((50, 50), 60, 60, fill=False, edgecolor='red', linewidth=3)
rect2 = plt.Rectangle((180, 80), 80, 80, fill=False, edgecolor='blue', linewidth=3)
axes[2].add_patch(rect1)
axes[2].add_patch(rect2)
axes[2].text(80, 45, 'Cat', ha='center', color='red', fontsize=12, fontweight='bold')
axes[2].text(220, 75, 'Dog', ha='center', color='blue', fontsize=12, fontweight='bold')
axes[2].set_title('Detection:\nMultiple objects + boxes', fontsize=12)
axes[2].axis('off')

plt.tight_layout()
plt.show()
# IoU - Intersection over Union
def calculate_iou(box1, box2):
    """Return the IoU of two axis-aligned boxes given as [x1, y1, x2, y2].

    IoU = intersection area / union area; 0 when the boxes are disjoint.
    """
    # Corners of the intersection rectangle (may have negative extent).
    left = max(box1[0], box2[0])
    top = max(box1[1], box2[1])
    right = min(box1[2], box2[2])
    bottom = min(box1[3], box2[3])

    # Clamp to zero so non-overlapping boxes contribute no intersection.
    overlap = max(0, right - left) * max(0, bottom - top)

    # Union = sum of both areas minus the doubly-counted overlap.
    union_area = ((box1[2] - box1[0]) * (box1[3] - box1[1])
                  + (box2[2] - box2[0]) * (box2[3] - box2[1])
                  - overlap)

    return overlap / union_area if union_area > 0 else 0

# Visualize IoU
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

examples = [
    ([1, 1, 4, 4], [2, 2, 5, 5], 'Medium Overlap'),
    ([1, 1, 4, 4], [1, 1, 4, 4], 'Perfect Match'),
    ([1, 1, 3, 3], [4, 4, 6, 6], 'No Overlap'),
]

for ax, (truth_box, pred_box, title) in zip(axes, examples):
    ax.set_xlim(0, 7)
    ax.set_ylim(0, 7)

    # Ground truth in green, prediction in red; semi-transparent so the
    # overlap region is visible where the two boxes intersect.
    box_styles = [
        (truth_box, 'green', 'darkgreen', 'Ground Truth'),
        (pred_box, 'red', 'darkred', 'Prediction'),
    ]
    for box, face, edge, label in box_styles:
        ax.add_patch(plt.Rectangle((box[0], box[1]),
                                   box[2] - box[0], box[3] - box[1],
                                   fill=True, facecolor=face, edgecolor=edge,
                                   alpha=0.5, linewidth=2, label=label))

    score = calculate_iou(truth_box, pred_box)
    ax.set_title(f'{title}\nIoU = {score:.2f}', fontsize=12)
    ax.set_aspect('equal')
    ax.legend(loc='upper right', fontsize=9)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("💡 IoU measures how well predicted box matches ground truth")
print("   IoU > 0.5 is typically considered a 'good' detection")

Part 5: YOLO Demo (if ultralytics is installed)

# Try to load YOLO; fall back gracefully when ultralytics isn't installed.
try:
    from ultralytics import YOLO
    # NOTE(review): this rebinds `model`, replacing the trained SimpleCNN
    # from the cells above — rerun those cells if you need the CNN again.
    model = YOLO('yolov8n.pt')  # Nano model (smallest/fastest YOLOv8 variant)
    print("✅ YOLO loaded! Ready for detection.")
    YOLO_AVAILABLE = True
except ImportError:
    print("⚠️ ultralytics not installed. Run: pip install ultralytics")
    print("   Skipping YOLO demo.")
    YOLO_AVAILABLE = False
if YOLO_AVAILABLE:
    # Run detection on a sample image downloaded from the web.
    import urllib.request
    import os

    # Public image used in the ultralytics docs.
    url = "https://ultralytics.com/images/bus.jpg"
    img_path = "sample_detection.jpg"

    # Download once; reuse the cached file on subsequent runs.
    if not os.path.exists(img_path):
        urllib.request.urlretrieve(url, img_path)

    # Run YOLO inference on the image file.
    results = model(img_path)

    # Side by side: the original image vs. the annotated detections.
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    axes[0].imshow(Image.open(img_path))
    axes[0].set_title('Original Image', fontsize=14)
    axes[0].axis('off')

    axes[1].imshow(results[0].plot())  # image with boxes/labels drawn in
    axes[1].set_title('YOLO Detections', fontsize=14)
    axes[1].axis('off')

    plt.tight_layout()
    plt.show()

    # One line per detected box: class name and confidence score.
    print("\nDetected objects:")
    for box in results[0].boxes:
        label = model.names[int(box.cls[0])]
        print(f"  - {label}: {float(box.conf[0]):.1%} confidence")
else:
    print("Skipping YOLO demo (not installed)")

🎯 Exercises

# Exercise 1: Create your own edge detection filter
# Hint: Try different 3x3 patterns
# Exercise 2: Add more convolutional layers to the CNN
# Does it improve accuracy on digits?

Summary

| Concept     | What We Learned                    |
|-------------|------------------------------------|
| Images      | Grid of numbers (pixels)           |
| Convolution | Sliding filter detects patterns    |
| CNN         | Learns best filters automatically  |
| Detection   | Classification + Localization      |
| IoU         | Measures box overlap quality       |
| YOLO        | Fast, real-time detection          |