import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import warnings
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
print("Ready to see how machines see! 👁️")Lecture 6: Computer Vision - How Machines See
Interactive Demo Notebook
In this notebook, we'll explore:
1. Images as Data - Pixels and arrays
2. Convolution - The key operation
3. CNNs in Action - Real image classification
4. Object Detection - What AND where
5. YOLO Demo - Real-time detection
Part 1: Images as Data
To a computer, an image is just a grid of numbers!
# Load MNIST digits
from sklearn.datasets import load_digits
digits = load_digits()
sample_image = digits.images[0] # First digit image
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Visual representation
axes[0].imshow(sample_image, cmap='gray')
axes[0].set_title(f'What we see: Digit "{digits.target[0]}"', fontsize=14)
axes[0].axis('off')
# Numerical representation
axes[1].imshow(sample_image, cmap='gray')
for i in range(8):
    for j in range(8):
        val = int(sample_image[i, j])
        # Dark text on bright pixels, light text on dark pixels
        color = 'black' if val > 8 else 'white'
        axes[1].text(j, i, val, ha='center', va='center', fontsize=8, color=color)
axes[1].set_title('What the computer sees: Numbers!', fontsize=14)
axes[1].axis('off')
plt.tight_layout()
plt.show()
print(f"Image shape: {sample_image.shape} (8×8 pixels)")
print(f"Pixel values range: {sample_image.min():.0f} to {sample_image.max():.0f}")
print("\n💡 Higher number = darker pixel")# RGB Color Images
# Create a simple RGB image
rgb_image = np.zeros((100, 100, 3), dtype=np.uint8)
rgb_image[:33, :, 0] = 255 # Red stripe
rgb_image[33:66, :, 1] = 255 # Green stripe
rgb_image[66:, :, 2] = 255 # Blue stripe
fig, axes = plt.subplots(1, 4, figsize=(14, 3))
axes[0].imshow(rgb_image)
axes[0].set_title('RGB Image', fontsize=12)
axes[0].axis('off')
# Show individual channels
for i, (channel, color, name) in enumerate([(0, 'Reds', 'Red'),
                                            (1, 'Greens', 'Green'),
                                            (2, 'Blues', 'Blue')]):
    axes[i+1].imshow(rgb_image[:, :, channel], cmap=color)
    axes[i+1].set_title(f'{name} Channel', fontsize=12)
    axes[i+1].axis('off')
plt.tight_layout()
plt.show()
print("💡 Color images have 3 channels: Red, Green, Blue")
print(f" Shape: {rgb_image.shape} = (Height, Width, Channels)")Part 2: Convolution - The Magic Operation
Slide a small filter over the image to detect patterns
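To demystify the "sliding", here is a minimal hand-rolled version (no padding, stride 1; the helper name convolve2d is invented here, and real libraries like scipy handle boundaries and performance):

# Minimal sliding-window "convolution" (strictly cross-correlation,
# which is what CNN libraries compute; true convolution flips the kernel)
def convolve2d(image, kernel):
    kh, kw = kernel.shape
    out = np.zeros((image.shape[0] - kh + 1, image.shape[1] - kw + 1))
    for i in range(out.shape[0]):
        for j in range(out.shape[1]):
            # Element-wise multiply the patch under the kernel, then sum
            out[i, j] = np.sum(image[i:i+kh, j:j+kw] * kernel)
    return out

tiny = np.array([[0., 0., 1., 1.],
                 [0., 0., 1., 1.],
                 [0., 0., 1., 1.]])
diff = np.array([[-1., 1.]])  # 1×2 horizontal difference filter
print(convolve2d(tiny, diff))  # fires (value 1) exactly at the 0→1 edge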
from scipy import ndimage
# Create a simple image with edges
simple_image = np.zeros((10, 10))
simple_image[2:8, 2:8] = 1 # White square in center
# Edge-detection filters (Sobel)
sobel_x = np.array([[-1, 0, 1],
                    [-2, 0, 2],
                    [-1, 0, 1]])   # horizontal gradient → finds vertical edges
sobel_y = np.array([[-1, -2, -1],
                    [ 0,  0,  0],
                    [ 1,  2,  1]])  # vertical gradient → finds horizontal edges
# Apply convolution
edges_x = ndimage.convolve(simple_image, sobel_x)
edges_y = ndimage.convolve(simple_image, sobel_y)
edges_combined = np.sqrt(edges_x**2 + edges_y**2)  # gradient magnitude
fig, axes = plt.subplots(1, 4, figsize=(14, 3))
axes[0].imshow(simple_image, cmap='gray')
axes[0].set_title('Original Image', fontsize=12)
axes[0].axis('off')
axes[1].imshow(edges_x, cmap='RdBu')
axes[1].set_title('Vertical Edges (Sobel x)', fontsize=12)
axes[1].axis('off')
axes[2].imshow(edges_y, cmap='RdBu')
axes[2].set_title('Horizontal Edges (Sobel y)', fontsize=12)
axes[2].axis('off')
axes[3].imshow(edges_combined, cmap='hot')
axes[3].set_title('All Edges', fontsize=12)
axes[3].axis('off')
plt.tight_layout()
plt.show()
print("💡 Convolution detects patterns like edges, corners, textures")# More fun with filters!
# Blur filter
blur_filter = np.ones((5, 5)) / 25
# Sharpen filter
sharpen_filter = np.array([[ 0, -1,  0],
                           [-1,  5, -1],
                           [ 0, -1,  0]])
# Apply to digit image
blurred = ndimage.convolve(sample_image, blur_filter)
sharpened = ndimage.convolve(sample_image, sharpen_filter)
edges = ndimage.convolve(sample_image, sobel_x)
fig, axes = plt.subplots(1, 4, figsize=(14, 3))
axes[0].imshow(sample_image, cmap='gray')
axes[0].set_title('Original', fontsize=12)
axes[0].axis('off')
axes[1].imshow(blurred, cmap='gray')
axes[1].set_title('Blurred', fontsize=12)
axes[1].axis('off')
axes[2].imshow(sharpened, cmap='gray')
axes[2].set_title('Sharpened', fontsize=12)
axes[2].axis('off')
axes[3].imshow(edges, cmap='gray')
axes[3].set_title('Edges', fontsize=12)
axes[3].axis('off')
plt.tight_layout()
plt.show()
print("💡 Different filters = Different features detected")
print(" CNNs LEARN the best filters automatically!")Part 3: CNNs for Image Classification
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# Prepare data
X = digits.images.reshape(-1, 1, 8, 8) # (N, Channels, H, W)
y = digits.target
X_train = torch.tensor(X[:1400], dtype=torch.float32)
y_train = torch.tensor(y[:1400], dtype=torch.long)
X_test = torch.tensor(X[1400:], dtype=torch.float32)
y_test = torch.tensor(y[1400:], dtype=torch.long)
print(f"Training set: {len(X_train)} images")
print(f"Test set: {len(X_test)} images")
print(f"Image shape: {X_train[0].shape} (1 channel, 8×8)")# Define a simple CNN
class SimpleCNN(nn.Module):
    def __init__(self):
        super().__init__()
        # Convolution layers
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)   # 8x8 → 8x8
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)  # 4x4 → 4x4
        self.pool = nn.MaxPool2d(2, 2)  # Halve the spatial size
        self.relu = nn.ReLU()
        # Fully connected layers
        self.fc1 = nn.Linear(32 * 2 * 2, 64)
        self.fc2 = nn.Linear(64, 10)  # 10 digit classes

    def forward(self, x):
        # Conv block 1
        x = self.relu(self.conv1(x))
        x = self.pool(x)  # 8x8 → 4x4
        # Conv block 2
        x = self.relu(self.conv2(x))
        x = self.pool(x)  # 4x4 → 2x2
        # Flatten and classify
        x = x.view(-1, 32 * 2 * 2)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x
model = SimpleCNN()
print(model)
print(f"\nTotal parameters: {sum(p.numel() for p in model.parameters()):,}")# Training
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
losses = []
for epoch in range(20):
    epoch_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    losses.append(epoch_loss / len(train_loader))
    if epoch % 5 == 0:
        print(f"Epoch {epoch:2d}, Loss: {losses[-1]:.4f}")
print("\nTraining complete!")# Evaluate
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    _, predicted = torch.max(outputs, 1)
    accuracy = (predicted == y_test).float().mean()
print(f"🎯 Test Accuracy: {accuracy:.1%}")
# Show some predictions
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for i, ax in enumerate(axes.flat):
    ax.imshow(X_test[i, 0].numpy(), cmap='gray')
    pred = predicted[i].item()
    actual = y_test[i].item()
    color = 'green' if pred == actual else 'red'
    ax.set_title(f'Pred: {pred}, True: {actual}', color=color, fontsize=10)
    ax.axis('off')
plt.suptitle('CNN Predictions on Test Digits', fontsize=14)
plt.tight_layout()
plt.show()

Part 4: Object Detection Concepts
Classification: What is in the image?
Detection: What AND where?
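One way to picture the difference in outputs (a hypothetical minimal representation; real detectors return richer objects, but this is the essential information):

# Classification: one label for the whole image
classification = 'cat'
# Detection: a list of (label, confidence, [x1, y1, x2, y2]) per object
detections = [
    ('cat', 0.92, [50, 50, 110, 110]),
    ('dog', 0.87, [180, 80, 260, 160]),
]
for label, conf, box in detections:
    print(f"{label} ({conf:.0%}) at {box}")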
# Visualize the difference
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
# Create a simple "scene" with shapes
scene = np.ones((200, 300, 3)) * 0.9 # Light gray background
# Draw a "cat" (circle)
for i in range(200):
    for j in range(300):
        if (i - 80)**2 + (j - 80)**2 < 30**2:
            scene[i, j] = [0.8, 0.6, 0.4]  # Brown circle ("cat")
        if (i - 120)**2 + (j - 220)**2 < 40**2:
            scene[i, j] = [0.3, 0.5, 0.8]  # Blue circle ("dog")
# Classification
axes[0].imshow(scene)
axes[0].set_title('Classification:\n"Cat and Dog"', fontsize=12)
axes[0].axis('off')
# Localization (one object)
axes[1].imshow(scene)
rect = plt.Rectangle((50, 50), 60, 60, fill=False, edgecolor='red', linewidth=3)
axes[1].add_patch(rect)
axes[1].text(80, 45, 'Cat', ha='center', color='red', fontsize=12, fontweight='bold')
axes[1].set_title('Localization:\n"Cat" + bounding box', fontsize=12)
axes[1].axis('off')
# Detection (multiple objects)
axes[2].imshow(scene)
rect1 = plt.Rectangle((50, 50), 60, 60, fill=False, edgecolor='red', linewidth=3)
rect2 = plt.Rectangle((180, 80), 80, 80, fill=False, edgecolor='blue', linewidth=3)
axes[2].add_patch(rect1)
axes[2].add_patch(rect2)
axes[2].text(80, 45, 'Cat', ha='center', color='red', fontsize=12, fontweight='bold')
axes[2].text(220, 75, 'Dog', ha='center', color='blue', fontsize=12, fontweight='bold')
axes[2].set_title('Detection:\nMultiple objects + boxes', fontsize=12)
axes[2].axis('off')
plt.tight_layout()
plt.show()

# IoU - Intersection over Union
def calculate_iou(box1, box2):
    """Calculate IoU between two boxes [x1, y1, x2, y2]."""
    # Coordinates of the intersection rectangle
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    intersection = max(0, x2 - x1) * max(0, y2 - y1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - intersection
    return intersection / union if union > 0 else 0
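A quick sanity check by hand, using the 'Medium Overlap' pair visualized below: the boxes share a 2×2 intersection (area 4), each box has area 9, so IoU = 4 / (9 + 9 - 4) ≈ 0.29:

print(calculate_iou([1, 1, 4, 4], [2, 2, 5, 5]))  # 4 / 14 ≈ 0.2857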
# Visualize IoU
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
examples = [
    ([1, 1, 4, 4], [2, 2, 5, 5], 'Medium Overlap'),
    ([1, 1, 4, 4], [1, 1, 4, 4], 'Perfect Match'),
    ([1, 1, 3, 3], [4, 4, 6, 6], 'No Overlap'),
]
for ax, (box1, box2, title) in zip(axes, examples):
    ax.set_xlim(0, 7)
    ax.set_ylim(0, 7)
    # Draw boxes
    rect1 = plt.Rectangle((box1[0], box1[1]), box1[2]-box1[0], box1[3]-box1[1],
                          fill=True, facecolor='green', edgecolor='darkgreen',
                          alpha=0.5, linewidth=2, label='Ground Truth')
    rect2 = plt.Rectangle((box2[0], box2[1]), box2[2]-box2[0], box2[3]-box2[1],
                          fill=True, facecolor='red', edgecolor='darkred',
                          alpha=0.5, linewidth=2, label='Prediction')
    ax.add_patch(rect1)
    ax.add_patch(rect2)
    iou = calculate_iou(box1, box2)
    ax.set_title(f'{title}\nIoU = {iou:.2f}', fontsize=12)
    ax.set_aspect('equal')
    ax.legend(loc='upper right', fontsize=9)
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("💡 IoU measures how well predicted box matches ground truth")
print(" IoU > 0.5 is typically considered a 'good' detection")Part 5: YOLO Demo (if ultralytics is installed)
# Try to load YOLO
try:
    from ultralytics import YOLO
    yolo_model = YOLO('yolov8n.pt')  # Nano model (named yolo_model so it doesn't overwrite our CNN)
    print("✅ YOLO loaded! Ready for detection.")
    YOLO_AVAILABLE = True
except ImportError:
    print("⚠️ ultralytics not installed. Run: pip install ultralytics")
    print("   Skipping YOLO demo.")
    YOLO_AVAILABLE = False

if YOLO_AVAILABLE:
    # Run detection on a sample image
    # Download a sample image from the web
    import urllib.request
    import os
    # Use a public domain image
    url = "https://ultralytics.com/images/bus.jpg"
    img_path = "sample_detection.jpg"
    if not os.path.exists(img_path):
        urllib.request.urlretrieve(url, img_path)
    # Run YOLO
    results = yolo_model(img_path)
    # Display results
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    # Original
    img = Image.open(img_path)
    axes[0].imshow(img)
    axes[0].set_title('Original Image', fontsize=14)
    axes[0].axis('off')
    # With detections (plot() returns a BGR array, OpenCV-style; reverse to RGB)
    result_img = results[0].plot()
    axes[1].imshow(result_img[..., ::-1])
    axes[1].set_title('YOLO Detections', fontsize=14)
    axes[1].axis('off')
    plt.tight_layout()
    plt.show()
    # Print detections
    print("\nDetected objects:")
    for box in results[0].boxes:
        cls_id = int(box.cls[0])
        cls_name = yolo_model.names[cls_id]
        conf = float(box.conf[0])
        print(f"  - {cls_name}: {conf:.1%} confidence")
else:
    print("Skipping YOLO demo (not installed)")

🎯 Exercises
# Exercise 1: Create your own edge detection filter
# Hint: Try different 3x3 patterns (a starter sketch follows below)

# Exercise 2: Add more convolutional layers to the CNN
# Does it improve accuracy on digits?
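As a starting point for Exercise 1, here is one possible custom filter (a diagonal-edge detector invented for illustration; swap in your own 3×3 values):

# Starter for Exercise 1: a diagonal edge filter
my_filter = np.array([[ 2,  1,  0],
                      [ 1,  0, -1],
                      [ 0, -1, -2]])
my_edges = ndimage.convolve(sample_image, my_filter)
plt.imshow(my_edges, cmap='gray')
plt.title('Custom diagonal-edge filter')
plt.axis('off')
plt.show()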
Summary
| Concept | What We Learned |
|---|---|
| Images | Grid of numbers (pixels) |
| Convolution | Sliding filter detects patterns |
| CNN | Learns best filters automatically |
| Detection | Classification + Localization |
| IoU | Measures box overlap quality |
| YOLO | Fast, real-time detection |