Week 5 Lab: Data Augmentation

CS 203: Software Tools and Techniques for AI
IIT Gandhinagar


Learning Objectives

By the end of this lab, you will be able to:

  1. Apply image augmentation techniques using Albumentations
  2. Perform text augmentation using nlpaug
  3. Understand audio augmentation with audiomentations
  4. Design appropriate augmentation pipelines for different tasks
  5. Measure the impact of augmentation on model performance

Netflix Movie Theme

We’ll augment our movie data to improve model performance:

  - Movie posters: Image augmentation for genre classification
  - Movie reviews: Text augmentation for sentiment analysis
  - Audio clips: Audio augmentation for trailer classification


Part 1: Environment Setup

# Install required packages
!pip install albumentations opencv-python-headless pillow
!pip install nlpaug transformers
!pip install torch torchvision torchaudio
!pip install matplotlib seaborn pandas numpy
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import cv2
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

print("All imports successful!")

Part 2: Image Augmentation with Albumentations

2.1 Creating Sample Images

# Create a sample image (movie poster placeholder)
def create_sample_poster(width=400, height=600):
    """
    Create a synthetic movie poster image for demonstration.

    The layout is designed on a 400x600 reference canvas and scaled to the
    requested size, so the default call yields exactly the reference poster
    and other sizes keep every element inside the image.

    Args:
        width: Image width in pixels
        height: Image height in pixels

    Returns:
        img: uint8 array of shape (height, width, 3) in OpenCV's BGR order
    """
    base_width, base_height = 400, 600
    sx, sy = width / base_width, height / base_height

    def px(x, y):
        """Map a point of the 400x600 reference layout onto this canvas."""
        return int(round(x * sx)), int(round(y * sy))

    # Background: every row fades linearly to black from top to bottom.
    # Peak values are (B, G, R) = (20, 10, 40), i.e. a dark purple tint.
    fade = 1.0 - np.arange(height) / height
    img = np.zeros((height, width, 3), dtype=np.uint8)
    for channel, peak in enumerate((20, 10, 40)):
        # astype truncates towards zero, matching int() on each row value
        img[:, :, channel] = (peak * fade).astype(np.uint8)[:, np.newaxis]

    # Add some shapes to represent movie poster elements (colors are BGR)
    # Title area (filled rectangle)
    cv2.rectangle(img, px(50, 450), px(350, 550), (200, 180, 100), -1)

    # Character silhouette (filled ellipse: center, then half-axes)
    cv2.ellipse(img, px(200, 250), px(80, 150), 0, 0, 360, (100, 80, 60), -1)

    # Star rating: five filled circles along the bottom edge
    star_radius = max(1, int(round(15 * min(sx, sy))))
    for i in range(5):
        cv2.circle(img, px(80 + i * 60, 580), star_radius, (0, 200, 255), -1)

    return img

# Build the demo poster once; later cells reuse `sample_image`
sample_image = create_sample_poster()

# Matplotlib expects RGB while the poster is stored in OpenCV's BGR order
poster_rgb = cv2.cvtColor(sample_image, cv2.COLOR_BGR2RGB)

plt.figure(figsize=(6, 9))
plt.imshow(poster_rgb)
plt.title("Sample Movie Poster")
plt.axis('off')
plt.show()

print(f"Image shape: {sample_image.shape}")

2.2 Basic Albumentations

import albumentations as A

# One entry per panel; None marks the untouched reference image.
# Every transform uses p=1.0 so it is always applied in this demo.
transforms = {
    'Original': None,
    'Horizontal Flip': A.HorizontalFlip(p=1.0),
    'Rotate 15': A.Rotate(limit=15, p=1.0),
    'Brightness': A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0, p=1.0),
    'Contrast': A.RandomBrightnessContrast(brightness_limit=0, contrast_limit=0.3, p=1.0),
    'Blur': A.GaussianBlur(blur_limit=(7, 7), p=1.0),
}

# Lay the six results out on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 12))
axes = axes.flatten()

for ax, (name, transform) in zip(axes, transforms.items()):
    # Albumentations transforms take and return a dict keyed by 'image'
    augmented = sample_image if transform is None else transform(image=sample_image)['image']

    ax.imshow(cv2.cvtColor(augmented, cv2.COLOR_BGR2RGB))
    ax.set_title(name, fontsize=12)
    ax.axis('off')

plt.tight_layout()
plt.show()

Question 2.1 (Solved): Create an Augmentation Pipeline

# SOLVED: Create a composed augmentation pipeline
# NOTE(review): argument names (e.g. var_limit) follow the Albumentations API
# this lab was written against; newer releases may rename them - confirm
# against the installed version.

# Geometric transforms: each one fires independently with probability 0.5
geometric_stage = [
    A.HorizontalFlip(p=0.5),
    A.Rotate(limit=10, p=0.5),
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=5, p=0.5),
]

# Color transforms: half of the time, apply exactly one of the two
color_stage = A.OneOf([
    A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=1),
    A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=1),
], p=0.5)

# Noise and blur: 30% of the time, apply either blur or noise
degradation_stage = A.OneOf([
    A.GaussianBlur(blur_limit=3, p=1),
    A.GaussNoise(var_limit=(10, 50), p=1),
], p=0.3)

poster_augmentation = A.Compose([*geometric_stage, color_stage, degradation_stage])

# Generate multiple augmentations of the same poster on a 2x5 grid
fig, axes = plt.subplots(2, 5, figsize=(20, 10))
axes = axes.flatten()

for number, ax in enumerate(axes, start=1):
    augmented = poster_augmentation(image=sample_image)['image']
    ax.imshow(cv2.cvtColor(augmented, cv2.COLOR_BGR2RGB))
    ax.set_title(f"Augmented {number}")
    ax.axis('off')

plt.suptitle("10 Augmented Versions of the Same Poster", fontsize=14)
plt.tight_layout()
plt.show()

Question 2.2: Create a Domain-Specific Augmentation Pipeline

Create an augmentation pipeline specifically for movie poster genre classification.

# TODO: Create an augmentation pipeline that:
# 1. Preserves the overall composition (important for genre recognition)
# 2. Simulates different viewing conditions (brightness, contrast)
# 3. Is NOT too aggressive (we want to keep the poster recognizable)

# Exercise template: the pipeline contains no transforms until you add them
# to the list below.
genre_classification_augmentation = A.Compose([
    # Your augmentations here
])

# Test your pipeline
# (e.g. apply it to `sample_image` several times and plot the results)

Question 2.3: Advanced Augmentations (Cutout, MixUp)

# Cutout (CoarseDropout in Albumentations)
# Drops between 4 and 8 rectangular holes, each 20-50 px on a side, and
# fills them with 0 (black).
# NOTE(review): these min_*/max_*/fill_value argument names may differ in
# newer Albumentations releases - confirm against the installed version.
cutout_transform = A.CoarseDropout(
    min_holes=4,
    max_holes=8,
    min_height=20,
    max_height=50,
    min_width=20,
    max_width=50,
    fill_value=0,
    p=1.0,
)

# Apply once and show the result next to the untouched poster
cutout_image = cutout_transform(image=sample_image)['image']

fig, axes = plt.subplots(1, 2, figsize=(12, 8))
panels = (("Original", sample_image), ("With Cutout", cutout_image))
for ax, (title, picture) in zip(axes, panels):
    ax.imshow(cv2.cvtColor(picture, cv2.COLOR_BGR2RGB))
    ax.set_title(title)
    ax.axis('off')
plt.show()

Question 2.4: Implement MixUp

# TODO: Implement MixUp augmentation
# MixUp blends two images and their labels

def mixup(image1, image2, label1, label2, alpha=0.4):
    """
    Perform MixUp augmentation.

    NOTE: exercise stub - the body is empty, so the function currently
    returns None instead of the documented pair.

    Args:
        image1, image2: Two images to mix
        label1, label2: Their corresponding labels (one-hot or probabilities)
        alpha: Parameter for Beta distribution

    Returns:
        mixed_image: Blended image
        mixed_label: Blended label
    """
    # Your code here
    pass

# Second poster for the MixUp exercise: the same synthetic poster with a
# solid red (BGR 0, 0, 255) square painted over its centre.
sample_image2 = create_sample_poster().copy()
sample_image2 = cv2.rectangle(sample_image2, (100, 200), (300, 400), (0, 0, 255), -1)

# Test your mixup function

Question 2.5: Augmentation with Bounding Boxes

# Augmentation that preserves bounding box annotations.
# bbox_params tells Albumentations the box format and which extra argument
# carries the per-box labels, so boxes and labels are transformed together.
bbox_settings = A.BboxParams(format='pascal_voc', label_fields=['labels'])
bbox_transform = A.Compose(
    [
        A.HorizontalFlip(p=0.5),
        A.Rotate(limit=15, p=0.5),
        A.RandomBrightnessContrast(p=0.3),
    ],
    bbox_params=bbox_settings,
)

# Sample bounding boxes [x_min, y_min, x_max, y_max] with one label each
bboxes = [
    [50, 450, 350, 550],   # Title area
    [120, 100, 280, 400],  # Character
]
labels = ['title', 'character']

# Apply transform; the result dict has 'image', 'bboxes' and 'labels' keys
transformed = bbox_transform(image=sample_image, bboxes=bboxes, labels=labels)

# Draw bounding boxes
def draw_bboxes(image, bboxes, labels):
    """
    Draw labelled bounding boxes on a copy of ``image``.

    Args:
        image: Image array in OpenCV's BGR order (left unmodified)
        bboxes: Boxes whose first four values are x_min, y_min, x_max, y_max;
            any trailing values are ignored
        labels: One label per box, paired with ``bboxes`` by position

    Returns:
        Annotated copy of the image
    """
    img = image.copy()
    # BGR colors per known label; anything else falls back to red
    colors = {'title': (0, 255, 0), 'character': (255, 0, 0)}
    default_color = (0, 0, 255)
    for bbox, label in zip(bboxes, labels):
        # Coordinates may arrive as floats after augmentation
        x1, y1, x2, y2 = (int(c) for c in bbox[:4])
        color = colors.get(label, default_color)
        cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
        # Caption sits 10 px above the box; when the box touches the top of
        # the image that would be off-canvas, so place it just inside instead.
        text_y = y1 - 10 if y1 - 10 >= 10 else y1 + 15
        cv2.putText(img, str(label), (x1, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    return img

# Display the original and augmented annotations side by side
orig_with_boxes = draw_bboxes(sample_image, bboxes, labels)
aug_with_boxes = draw_bboxes(transformed['image'], transformed['bboxes'], transformed['labels'])

fig, axes = plt.subplots(1, 2, figsize=(12, 8))

panels = (
    ("Original with Bboxes", orig_with_boxes),
    ("Augmented with Bboxes", aug_with_boxes),
)
for ax, (title, annotated) in zip(axes, panels):
    ax.imshow(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
    ax.set_title(title)
    ax.axis('off')

plt.show()

Part 3: Text Augmentation with nlpaug

3.1 Setting Up nlpaug

import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.sentence as nas

# Sample movie reviews (mixed positive / negative sentiment) used by every
# text augmentation example below
sample_reviews = [
    "This movie was absolutely fantastic! A must-watch.",
    "Terrible film. Waste of time and money.",
    "The acting was superb, but the plot was confusing.",
    "One of the best movies I have seen this year.",
    "Boring and predictable. Would not recommend.",
]

print("Sample reviews:")
for number, text in enumerate(sample_reviews, start=1):
    print(f"{number}. {text}")

Question 3.1 (Solved): Synonym Augmentation

# SOLVED: Synonym replacement augmentation

# WordNet-backed augmenter; aug_max=3 caps the number of replaced words
synonym_aug = naw.SynonymAug(aug_src='wordnet', aug_max=3)

print("Synonym Augmentation Examples:\n")
for review in sample_reviews[:3]:
    # augment() output is indexed with [0] to get the augmented text
    augmented = synonym_aug.augment(review)
    print(f"Original: {review}", f"Augmented: {augmented[0]}", "", sep="\n")

Question 3.2: Random Word Operations

# TODO: Try different word-level augmentations:
# 1. Random word deletion
# 2. Random word swap
# 3. Random word insertion

# Your code here

Question 3.3: Contextual Word Embeddings (BERT)

# Contextual augmentation using BERT
# Note: This requires downloading the BERT model (may take a few minutes)

# Best-effort demo: any failure (missing package, failed download, ...) is
# reported and the notebook carries on.
try:
    bert_settings = {
        'model_path': 'bert-base-uncased',
        'action': 'substitute',
        'aug_max': 2,
    }
    bert_aug = naw.ContextualWordEmbsAug(**bert_settings)

    print("BERT Contextual Augmentation:\n")
    for review in sample_reviews[:2]:
        augmented = bert_aug.augment(review)
        print(f"Original: {review}", f"Augmented: {augmented[0]}", "", sep="\n")

except Exception as e:
    print(f"BERT augmentation not available: {e}")
    print("Try: pip install transformers torch")

Question 3.4: Character-Level Augmentation

# Character-level augmentation (simulating typos)

# Both augmenters share the same limits: at most 2 characters changed in at
# most 2 words.
char_limits = {'aug_char_max': 2, 'aug_word_max': 2}

# Keyboard typo augmenter
keyboard_aug = nac.KeyboardAug(**char_limits)

# OCR error augmenter
ocr_aug = nac.OcrAug(**char_limits)

print("Character-Level Augmentation Examples:\n")

review = sample_reviews[0]
print(f"Original: {review}")

keyboard_version = keyboard_aug.augment(review)[0]
print(f"Keyboard typos: {keyboard_version}")

ocr_version = ocr_aug.augment(review)[0]
print(f"OCR errors: {ocr_version}")

Question 3.5: Create a Complete Text Augmentation Pipeline

# TODO: Create a pipeline that applies multiple augmentations
# Use naf.Sequential or naf.Sometimes for probabilistic application

import nlpaug.flow as naf

def create_text_augmentation_pipeline():
    """
    Create a text augmentation pipeline for movie review sentiment.

    NOTE: exercise stub - the body is empty, so the function currently
    returns None.

    Requirements:
    - Should preserve sentiment (don't replace sentiment words)
    - Should maintain readability
    - Should be diverse
    """
    # Your code here
    pass

# Test your pipeline

Question 3.6: Back-Translation (Advanced)

# Back-translation: English -> Other Language -> English
# This requires downloading translation models

# Best-effort demo: any failure while loading or running the models is
# reported instead of stopping the notebook.
try:
    translation_models = {
        'from_model_name': 'facebook/wmt19-en-de',  # English -> German
        'to_model_name': 'facebook/wmt19-de-en',    # German -> English
    }
    back_trans_aug = naw.BackTranslationAug(**translation_models)

    print("Back-Translation Augmentation:\n")
    review = "This movie was absolutely fantastic!"
    augmented = back_trans_aug.augment(review)
    print(f"Original: {review}", f"Back-translated: {augmented[0]}", sep="\n")

except Exception as e:
    print(f"Back-translation not available: {e}")
    print("Requires: pip install transformers torch sentencepiece")

Part 4: Audio Augmentation

4.1 Creating Sample Audio

import numpy as np

# Create a synthetic audio signal (sine wave + noise)
def create_sample_audio(duration=2.0, sr=16000):
    """
    Create a sample audio signal: three sine tones (an A-major chord) plus
    Gaussian noise, peak-normalized to the range [-1, 1].

    A request for zero samples (``duration`` or ``sr`` equal to 0) returns an
    empty array instead of failing during normalization.

    Args:
        duration: Duration in seconds
        sr: Sample rate

    Returns:
        audio: Audio signal as a 1-D float32 numpy array with
            int(sr * duration) samples
    """
    # endpoint=False keeps the spacing at exactly 1 / sr seconds
    t = np.linspace(0, duration, int(sr * duration), endpoint=False)

    # Create a combination of frequencies (like a chord)
    audio = (
        0.3 * np.sin(2 * np.pi * 440 * t) +   # A4 note
        0.2 * np.sin(2 * np.pi * 554 * t) +   # C#5 note
        0.2 * np.sin(2 * np.pi * 659 * t) +   # E5 note
        0.1 * np.random.randn(len(t))          # Some noise
    )

    # Normalize to unit peak; skip when there is nothing to scale (an empty
    # or all-zero signal) to avoid an error or a division by zero
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = audio / peak

    return audio.astype(np.float32)

sample_audio = create_sample_audio()
sr = 16000  # must match the sample rate used by create_sample_audio()

print(f"Audio shape: {sample_audio.shape}")
print(f"Duration: {len(sample_audio) / sr:.2f} seconds")

# Plot waveform
# Sample k was generated at k / sr seconds (endpoint excluded), so the time
# axis is built the same way rather than stretched to include the endpoint.
time_axis = np.arange(len(sample_audio)) / sr

plt.figure(figsize=(12, 3))
plt.plot(time_axis, sample_audio)
plt.title("Sample Audio Waveform")
plt.xlabel("Time (s)")
plt.ylabel("Amplitude")
plt.show()

Question 4.1: Basic Audio Augmentations

# Manual audio augmentations

def add_noise(audio, noise_level=0.01):
    """
    Add zero-mean Gaussian noise to audio.

    Noise is drawn with the same shape as ``audio``, so multi-channel arrays
    get independent noise per sample and channel. Floating-point input keeps
    its dtype (float32 audio stays float32).

    Args:
        audio: Audio signal (array-like of any shape)
        noise_level: Standard deviation of the added noise

    Returns:
        Noisy copy of the signal with the same shape as the input
    """
    audio = np.asarray(audio)
    noise = np.asarray(np.random.standard_normal(audio.shape) * noise_level)
    if np.issubdtype(audio.dtype, np.floating):
        # Avoid silently promoting e.g. float32 audio to float64
        noise = noise.astype(audio.dtype, copy=False)
    return audio + noise

def time_shift(audio, shift_range=0.2):
    """
    Shift audio in time by a random offset.

    The offset is drawn uniformly from +/- ``shift_range`` times the signal
    length; samples pushed past one end wrap around to the other (np.roll).
    """
    max_offset = len(audio) * shift_range
    offset = int(max_offset * np.random.uniform(-1, 1))
    return np.roll(audio, offset)

def change_volume(audio, volume_range=(0.5, 1.5)):
    """
    Change audio volume.

    Multiplies the whole signal by a single gain drawn uniformly from
    ``volume_range`` (low, high).
    """
    gain = np.random.uniform(*volume_range)
    scaled = audio * gain
    return scaled

def pitch_shift_simple(audio, shift_factor=0.1):
    """
    Simple pitch shift by resampling.

    The signal is resampled by a random factor in
    [1 - shift_factor, 1 + shift_factor], then trimmed or zero-padded at the
    end so the output has the same length as the input.
    """
    from scipy.signal import resample

    original_len = len(audio)
    rate = 1 + np.random.uniform(-shift_factor, shift_factor)
    stretched = resample(audio, int(original_len / rate))

    # Restore the original length: trim if longer, zero-pad if not
    if len(stretched) > original_len:
        return stretched[:original_len]
    return np.pad(stretched, (0, original_len - len(stretched)))

# Apply augmentations (each helper draws its own random parameters)
augmentations = {
    'Original': sample_audio,
    'Added Noise': add_noise(sample_audio, 0.05),
    'Time Shifted': time_shift(sample_audio, 0.1),
    'Volume Changed': change_volume(sample_audio, (0.5, 1.5)),
    'Pitch Shifted': pitch_shift_simple(sample_audio, 0.1),
}

# Plot one waveform per row; the grid grows with the number of entries so
# adding an augmentation above needs no change here
n_rows = len(augmentations)
fig, axes = plt.subplots(n_rows, 1, figsize=(12, 2 * n_rows))
for ax, (name, audio) in zip(axes, augmentations.items()):
    ax.plot(audio)
    ax.set_title(name)
    ax.set_xlim(0, len(audio))
plt.tight_layout()
plt.show()

Question 4.2: Implement SpecAugment

# TODO: Implement SpecAugment (frequency and time masking on spectrograms)

from scipy.signal import spectrogram

def spec_augment(audio, sr, freq_mask_param=10, time_mask_param=20):
    """
    Apply SpecAugment to audio.

    NOTE: exercise stub - the body is empty, so the function currently
    returns None instead of a spectrogram.

    Args:
        audio: Audio signal
        sr: Sample rate
        freq_mask_param: Maximum frequency mask width
        time_mask_param: Maximum time mask width

    Returns:
        Augmented spectrogram
    """
    # Your code here
    pass

# Test your function

Part 5: Measuring Augmentation Impact

5.1 Setting Up a Classification Task

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load a small subset of 20 newsgroups: a two-class problem
# (baseball vs. hockey) with headers, footers and quoted replies removed
categories = ['rec.sport.baseball', 'rec.sport.hockey']
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Limit to smaller dataset for demonstration (first 500 documents)
texts, labels = newsgroups.data[:500], newsgroups.target[:500]

# Split 70/30 with a fixed seed so results are repeatable
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.3,
    random_state=42,
)

for split_name, split in (("Training", X_train), ("Test", X_test)):
    print(f"{split_name} samples: {len(split)}")

Question 5.1 (Solved): Baseline Without Augmentation

# SOLVED: Train baseline model without augmentation

# Vectorize: the vocabulary is fitted on the training split only and then
# reused for the test split
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# Evaluate on the held-out split; this is the number to beat in Question 5.2
y_pred = model.predict(X_test_vec)
baseline_accuracy = accuracy_score(y_test, y_pred)

print(f"Baseline Accuracy (no augmentation): {baseline_accuracy:.2%}")

Question 5.2: Train with Augmentation

# TODO: Augment training data and measure improvement
# 1. Augment each training sample 2-3 times
# 2. Combine with original data
# 3. Train model
# 4. Compare accuracy

# Your code here

Question 5.3: Learning Curve Comparison

# TODO: Plot learning curves with and without augmentation
# Train on 10%, 20%, ..., 100% of training data
# Compare the two curves

# Your code here

Part 6: Best Practices and Guidelines

Question 6.1: When NOT to Augment

# Demonstration: Bad augmentation that changes the label

# Example 1: Text sentiment - replacing key sentiment words
review = "This movie was terrible"
# Bad augmentation: swapping "terrible" for a word of opposite sentiment
bad_aug = "This movie was awesome"  # Completely changes meaning!

text_example = (
    "Example of BAD text augmentation:",
    f"Original (NEGATIVE): {review}",
    f"Bad augmentation: {bad_aug}",
    "The label changed from NEGATIVE to POSITIVE!",
)
print(*text_example, sep="\n")

print("\n" + "="*50 + "\n")

# Example 2: Digit classification with vertical flip
image_example = (
    "Example of BAD image augmentation:",
    "Original: Image of '6'",
    "Vertical flip: Image of '9'",
    "The label changed from '6' to '9'!",
)
print(*image_example, sep="\n")

Question 6.2: Create an Augmentation Checklist

# TODO: Write a checklist for designing augmentation pipelines
# Exercise template: replace each "[ ] ..." placeholder inside the string
# with a concrete checklist item, then run the cell to print it.

augmentation_checklist = """
# Augmentation Design Checklist

## Before Choosing Augmentations:
[ ] What is the task? (classification, detection, segmentation, etc.)
[ ] What modality? (image, text, audio, video)
[ ] What domain? (natural images, medical, documents, etc.)

## Validation Questions:
[ ] Does this augmentation preserve the label?
[ ] ...
[ ] ...

## Implementation:
[ ] ...

## Testing:
[ ] ...
"""

# Complete the checklist
print(augmentation_checklist)

Challenge Problems

Challenge 1: Test-Time Augmentation (TTA)

# Challenge: Implement Test-Time Augmentation
# Apply augmentations at inference time and average predictions

def tta_predict(model, vectorizer, text, n_augments=5):
    """
    Make prediction using Test-Time Augmentation.

    NOTE: exercise stub - the body is empty, so the function currently
    returns None.

    Args:
        model: Trained classifier
        vectorizer: Fitted TfidfVectorizer
        text: Text to classify
        n_augments: Number of augmented versions to use

    Returns:
        Averaged prediction probabilities
    """
    # Your code here
    pass

Challenge 2: Augmentation for Imbalanced Classes

# Challenge: Use augmentation to balance an imbalanced dataset
# More augmentation for minority classes

def class_balanced_augmentation(X, y, target_count=None):
    """
    Augment data to balance class distribution.

    NOTE: exercise stub - the body is empty, so the function currently
    returns None instead of the documented pair.

    Args:
        X: Features
        y: Labels
        target_count: Target number of samples per class (default: max class count)

    Returns:
        X_balanced, y_balanced: Balanced dataset
    """
    # Your code here
    pass

Summary

In this lab, you learned:

  1. Image Augmentation: Geometric and color transforms with Albumentations
  2. Text Augmentation: Synonym replacement, back-translation with nlpaug
  3. Audio Augmentation: Time-domain and frequency-domain augmentations
  4. Pipeline Design: Creating appropriate augmentation pipelines for tasks
  5. Impact Measurement: Comparing model performance with/without augmentation

Key Takeaways

Modality | Key Transforms                | Libraries
Image    | Flip, rotate, color, cutout   | Albumentations
Text     | Synonym, back-translation     | nlpaug
Audio    | Noise, pitch, SpecAugment     | audiomentations

Best Practices

  1. Always verify augmentations preserve labels
  2. Start simple, add complexity gradually
  3. Don’t augment validation/test data
  4. Measure impact before and after
  5. Domain-specific choices matter

Next Week

Week 6: LLM APIs - Integrating large language models into your applications!