# Install required packages
!pip install modAL-python scikit-learn pandas numpy matplotlib seaborn
!pip install snorkel
!pip install cleanlab
!pip install google-genai  # For Gemini API (provides the `google.genai` client used in Part 5)
Week 4 Lab: Optimizing the Labeling Process
CS 203: Software Tools and Techniques for AI
IIT Gandhinagar
Learning Objectives
By the end of this lab, you will be able to:
- Implement Active Learning with uncertainty sampling using modAL
- Write and apply labeling functions for Weak Supervision with Snorkel
- Use LLMs (like Gemini) to generate labels programmatically
- Detect and handle noisy labels with cleanlab
- Compare the cost and quality trade-offs of different labeling approaches
Netflix Movie Theme
We need to label 100,000 movie reviews for sentiment analysis. Manual labeling would cost $30,000 and take months. Let’s use smarter approaches!
Part 1: Environment Setup
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
np.random.seed(42)
print("All imports successful!")Part 2: Create Movie Review Dataset
# Create synthetic movie review dataset
positive_templates = [
    "Amazing movie! Loved every minute of {movie}.",
    "{movie} was absolutely fantastic. A must-watch!",
    "Best film I've seen this year. {movie} is a masterpiece.",
    "Incredible performances in {movie}. Highly recommend!",
    "{movie} exceeded all my expectations. Perfect 10/10.",
    "Brilliant storytelling in {movie}. Oscar-worthy!",
    "Loved {movie}! The acting was superb.",
    "{movie} is a triumph. Beautifully crafted film.",
]
negative_templates = [
    "Terrible movie. {movie} was a complete waste of time.",
    "{movie} was awful. Avoid at all costs.",
    "Worst film I've ever seen. {movie} is garbage.",
    "{movie} was boring and predictable. Very disappointed.",
    "Don't waste your money on {movie}. Horrible!",
    "{movie} has no redeeming qualities. Truly bad.",
    "Painful to watch. {movie} is a disaster.",
    "{movie} is insulting to audiences. Just awful.",
]
neutral_templates = [
    "{movie} was okay. Nothing special.",
    "Average film. {movie} had its moments.",
    "{movie} is fine. Not great, not terrible.",
    "Meh. {movie} was just okay.",
    "{movie} was watchable but forgettable.",
    "It's alright. {movie} passes the time.",
]
movies = ['Inception', 'The Matrix', 'Avatar', 'Titanic', 'Interstellar',
          'The Godfather', 'Pulp Fiction', 'Parasite', 'Joker', 'Gladiator']
def generate_reviews(n_samples=1000):
    reviews = []
    labels = []
    for _ in range(n_samples):
        movie = np.random.choice(movies)
        sentiment = np.random.choice(['positive', 'negative', 'neutral'], p=[0.4, 0.4, 0.2])
        if sentiment == 'positive':
            template = np.random.choice(positive_templates)
            label = 1
        elif sentiment == 'negative':
            template = np.random.choice(negative_templates)
            label = 0
        else:
            template = np.random.choice(neutral_templates)
            label = 2  # We'll treat neutral as a separate class
        reviews.append(template.format(movie=movie))
        labels.append(label)
    return reviews, labels
# Generate dataset
reviews, labels = generate_reviews(n_samples=1000)
df = pd.DataFrame({'review': reviews, 'label': labels})
print(f"Generated {len(df)} reviews")
print(f"\nLabel distribution:")
print(df['label'].value_counts())
print(f"\nSample reviews:")
df.head()
# Convert to binary classification (positive vs negative, drop neutral for simplicity)
df_binary = df[df['label'] != 2].copy()
print(f"Binary dataset size: {len(df_binary)}")
# Vectorize text
vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
X = vectorizer.fit_transform(df_binary['review']).toarray()
y = df_binary['label'].values
# Split into train (unlabeled pool), validation, and test
X_pool, X_test, y_pool, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_pool, X_val, y_pool, y_val = train_test_split(X_pool, y_pool, test_size=0.1, random_state=42)
print(f"Pool size: {len(X_pool)}")
print(f"Validation size: {len(X_val)}")
print(f"Test size: {len(X_test)}")Part 3: Active Learning with modAL
3.1 The Active Learning Loop
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling, margin_sampling, entropy_sampling
# Start with a small seed set (simulating initial labeled data)
n_initial = 20
initial_idx = np.random.choice(range(len(X_pool)), size=n_initial, replace=False)
X_initial = X_pool[initial_idx]
y_initial = y_pool[initial_idx]
# Remove initial samples from pool
mask = np.ones(len(X_pool), dtype=bool)
mask[initial_idx] = False
X_pool_remaining = X_pool[mask]
y_pool_remaining = y_pool[mask]
print(f"Initial labeled set: {len(X_initial)}")
print(f"Remaining pool: {len(X_pool_remaining)}")Question 3.1 (Solved): Create Active Learner with Uncertainty Sampling
# SOLVED: Create and train Active Learner
# Initialize the learner
learner = ActiveLearner(
    estimator=RandomForestClassifier(n_estimators=100, random_state=42),
    query_strategy=uncertainty_sampling,
    X_training=X_initial,
    y_training=y_initial
)
# Check initial performance
initial_accuracy = learner.score(X_test, y_test)
print(f"Initial accuracy with {n_initial} samples: {initial_accuracy:.2%}")Question 3.2 (Solved): Run Active Learning Loop
# SOLVED: Active Learning Loop
n_queries = 50 # Number of samples to query
performance_history = [initial_accuracy]
# Copy pool to work with
X_pool_al = X_pool_remaining.copy()
y_pool_al = y_pool_remaining.copy()
for i in range(n_queries):
    # Query for the most uncertain sample
    query_idx, query_instance = learner.query(X_pool_al)
    # Get the label (in practice, this would be a human annotator)
    y_new = y_pool_al[query_idx]
    # Teach the model
    learner.teach(X_pool_al[query_idx].reshape(1, -1), y_new.reshape(-1))
    # Remove queried sample from pool
    X_pool_al = np.delete(X_pool_al, query_idx, axis=0)
    y_pool_al = np.delete(y_pool_al, query_idx)
    # Track performance
    accuracy = learner.score(X_test, y_test)
    performance_history.append(accuracy)
    if (i + 1) % 10 == 0:
        print(f"Query {i+1}: Accuracy = {accuracy:.2%}")
print(f"\nFinal accuracy: {performance_history[-1]:.2%}")Question 3.3: Compare with Random Sampling
# TODO: Implement random sampling baseline and compare with active learning
# Create a learning curve comparing both approaches
def random_sampling(classifier, X_pool, n_instances=1):
    """Random query strategy for baseline comparison."""
    # Your code here
    pass
# Run random sampling loop and collect performance
# Your code here
Question 3.4: Plot Learning Curves
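One way to draw the two curves (a sketch; it assumes performance_history from Question 3.2 and a random_history list like the one sketched in Question 3.3).
# Sketch: learning curves for Active Learning vs Random Sampling
plt.figure(figsize=(8, 5))
plt.plot(n_initial + np.arange(len(performance_history)), performance_history, marker='o', label='Uncertainty sampling')
plt.plot(n_initial + np.arange(len(random_history)), random_history, marker='s', label='Random sampling')
plt.xlabel('Number of labeled samples')
plt.ylabel('Test accuracy')
plt.title('Active Learning vs Random Sampling')
plt.legend()
plt.grid(True)
plt.show()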
# TODO: Plot learning curves for Active Learning vs Random Sampling
# X-axis: Number of labeled samples
# Y-axis: Test accuracy
# Your code here
Question 3.5: Try Different Query Strategies
Compare uncertainty sampling, margin sampling, and entropy sampling.
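A sketch of one way to structure the comparison follows; the helper run_active_learning is our own wrapper, not part of modAL.
# Sketch: rerun the loop once per query strategy and plot the histories
def run_active_learning(query_strategy, n_queries=50):
    learner_qs = ActiveLearner(
        estimator=RandomForestClassifier(n_estimators=100, random_state=42),
        query_strategy=query_strategy,
        X_training=X_initial, y_training=y_initial
    )
    X_p, y_p = X_pool_remaining.copy(), y_pool_remaining.copy()
    history = [learner_qs.score(X_test, y_test)]
    for _ in range(n_queries):
        idx, _ = learner_qs.query(X_p)
        learner_qs.teach(X_p[idx], y_p[idx])
        X_p, y_p = np.delete(X_p, idx, axis=0), np.delete(y_p, idx)
        history.append(learner_qs.score(X_test, y_test))
    return history

strategies = {'uncertainty': uncertainty_sampling, 'margin': margin_sampling, 'entropy': entropy_sampling}
plt.figure(figsize=(8, 5))
for name, strategy in strategies.items():
    plt.plot(run_active_learning(strategy), label=name)
plt.xlabel('Number of queries')
plt.ylabel('Test accuracy')
plt.legend()
plt.show()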
# TODO: Compare different query strategies
# Run active learning with:
# 1. uncertainty_sampling
# 2. margin_sampling
# 3. entropy_sampling
# Plot all three on the same graph
# Your code here
Part 4: Weak Supervision with Snorkel
4.1 Understanding Labeling Functions
from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis
from snorkel.labeling.model import LabelModel
# Constants for labels
POSITIVE = 1
NEGATIVE = 0
ABSTAIN = -1
# Prepare DataFrame for Snorkel
df_snorkel = df_binary.copy().reset_index(drop=True)
print(f"Dataset size: {len(df_snorkel)}")
df_snorkel.head()
Question 4.1 (Solved): Write Labeling Functions
# SOLVED: Labeling Functions for Sentiment Analysis
@labeling_function()
def lf_contains_amazing(x):
    """If review contains 'amazing', label as positive."""
    return POSITIVE if 'amazing' in x.review.lower() else ABSTAIN

@labeling_function()
def lf_contains_terrible(x):
    """If review contains 'terrible', label as negative."""
    return NEGATIVE if 'terrible' in x.review.lower() else ABSTAIN

@labeling_function()
def lf_contains_loved(x):
    """If review contains 'loved', label as positive."""
    return POSITIVE if 'loved' in x.review.lower() else ABSTAIN

@labeling_function()
def lf_contains_awful(x):
    """If review contains 'awful', label as negative."""
    return NEGATIVE if 'awful' in x.review.lower() else ABSTAIN

@labeling_function()
def lf_contains_masterpiece(x):
    """If review contains 'masterpiece', label as positive."""
    return POSITIVE if 'masterpiece' in x.review.lower() else ABSTAIN

@labeling_function()
def lf_contains_waste(x):
    """If review contains 'waste', label as negative."""
    return NEGATIVE if 'waste' in x.review.lower() else ABSTAIN

@labeling_function()
def lf_exclamation_positive(x):
    """Multiple exclamations with positive words."""
    positive_words = ['great', 'amazing', 'love', 'best', 'fantastic']
    if x.review.count('!') >= 2:
        if any(word in x.review.lower() for word in positive_words):
            return POSITIVE
    return ABSTAIN

@labeling_function()
def lf_rating_positive(x):
    """If mentions high rating (9/10, 10/10), positive."""
    import re
    match = re.search(r'(9|10)/10', x.review)
    return POSITIVE if match else ABSTAIN

# Collect all LFs
lfs = [
    lf_contains_amazing, lf_contains_terrible, lf_contains_loved,
    lf_contains_awful, lf_contains_masterpiece, lf_contains_waste,
    lf_exclamation_positive, lf_rating_positive
]
print(f"Created {len(lfs)} labeling functions")
Question 4.2: Apply Labeling Functions
# Apply LFs to data
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df_snorkel)
print(f"Label matrix shape: {L_train.shape}")
print(f"Labels: {L_train[:5]}")Question 4.3 (Solved): Analyze Labeling Functions
# SOLVED: Analyze LF performance
analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary(Y=df_snorkel['label'].values)
print("Labeling Function Analysis:")
analysis
Question 4.4: Train Label Model
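A minimal sketch of the fit/predict calls (the names label_model_sketch, probs, and preds are ours; the TODO cell below asks you to do the same with label_model).
# Sketch: fit the LabelModel on the LF votes and read off probabilistic labels
label_model_sketch = LabelModel(cardinality=2, verbose=True)
label_model_sketch.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=42)
probs = label_model_sketch.predict_proba(L=L_train)   # soft (probabilistic) labels
preds = label_model_sketch.predict(L=L_train)         # hard labels; ties/no votes -> ABSTAIN
covered = preds != ABSTAIN
print(f"Coverage: {covered.mean():.2%}")
print(f"Accuracy on covered points: {(preds[covered] == df_snorkel['label'].values[covered]).mean():.2%}")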
# TODO: Train the Label Model and get probabilistic labels
# Your code here
label_model = LabelModel(cardinality=2, verbose=True)
# Train the model
# Get predictions
Question 4.5: Write More Labeling Functions
Create 3 additional labeling functions to improve coverage.
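For example, one extra function could key off the word 'boring' (just an illustration, not one of the three you must write).
# Example pattern for an additional labeling function
@labeling_function()
def lf_contains_boring(x):
    """If review contains 'boring' or 'predictable', label as negative."""
    return NEGATIVE if any(w in x.review.lower() for w in ['boring', 'predictable']) else ABSTAIN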
# TODO: Write 3 more labeling functions
# Consider: word patterns, punctuation, sentence structure
@labeling_function()
def lf_your_function_1(x):
    """Description"""
    # Your code here
    return ABSTAIN

@labeling_function()
def lf_your_function_2(x):
    """Description"""
    # Your code here
    return ABSTAIN

@labeling_function()
def lf_your_function_3(x):
    """Description"""
    # Your code here
    return ABSTAIN
Question 4.6: Train Downstream Model with Weak Labels
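One possible shape (a sketch; it assumes you have fit label_model in Question 4.4, and it uses snorkel's filter_unlabeled_dataframe to drop rows where every LF abstained).
# Sketch: downstream classifier on weak labels vs. one trained on true labels
from snorkel.labeling import filter_unlabeled_dataframe

probs_train = label_model.predict_proba(L=L_train)
df_filtered, probs_filtered = filter_unlabeled_dataframe(X=df_snorkel, y=probs_train, L=L_train)

X_weak = vectorizer.transform(df_filtered['review']).toarray()
y_weak = probs_filtered.argmax(axis=1)                 # hard labels from the label model
y_true_filtered = df_filtered['label'].values

clf_weak = LogisticRegression(max_iter=1000).fit(X_weak, y_weak)
clf_true = LogisticRegression(max_iter=1000).fit(X_weak, y_true_filtered)

# Note: df_snorkel overlaps X_test here; for a stricter comparison, restrict to the training pool.
print(f"Weak-label model accuracy:  {clf_weak.score(X_test, y_test):.2%}")
print(f"True-label model accuracy:  {clf_true.score(X_test, y_test):.2%}")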
# TODO: Train a classifier using the probabilistic labels from the Label Model
# Compare performance with a model trained on true labels
# Your code here
Part 5: LLM-Based Labeling
5.1 Setting Up Gemini API
import os
# Set your API key (get from https://makersuite.google.com/app/apikey)
# os.environ['GEMINI_API_KEY'] = 'your-api-key-here'
try:
    from google import genai
    client = genai.Client(api_key=os.environ.get('GEMINI_API_KEY', ''))
    print("Gemini client initialized!")
except Exception as e:
    print(f"Error: {e}")
    print("Set GEMINI_API_KEY environment variable to use LLM labeling")
Question 5.1 (Solved): Create LLM Labeling Function
# SOLVED: LLM Labeling Function (mock implementation for demo)
def label_with_llm(review, use_api=False):
    """
    Label a movie review using an LLM.
    Args:
        review: The review text
        use_api: If True, call actual API; else use mock
    Returns:
        dict with 'label' and 'confidence'
    """
    if use_api and 'GEMINI_API_KEY' in os.environ:
        prompt = f"""
        Classify the following movie review as POSITIVE or NEGATIVE.
        Review: "{review}"
        Respond with JSON format:
        {{"label": "POSITIVE" or "NEGATIVE", "confidence": 0.0-1.0}}
        """
        response = client.models.generate_content(
            model="gemini-2.0-flash-exp",
            contents=prompt
        )
        import json
        # Note: if the model wraps its JSON in markdown fences, strip them before parsing
        result = json.loads(response.text)
        return result
    else:
        # Mock implementation based on keywords
        positive_words = ['amazing', 'loved', 'fantastic', 'masterpiece', 'best', 'incredible']
        negative_words = ['terrible', 'awful', 'worst', 'waste', 'boring', 'disaster']
        review_lower = review.lower()
        pos_count = sum(1 for w in positive_words if w in review_lower)
        neg_count = sum(1 for w in negative_words if w in review_lower)
        if pos_count > neg_count:
            return {'label': 'POSITIVE', 'confidence': min(1.0, 0.8 + 0.1 * pos_count)}
        elif neg_count > pos_count:
            return {'label': 'NEGATIVE', 'confidence': min(1.0, 0.8 + 0.1 * neg_count)}
        else:
            return {'label': 'POSITIVE', 'confidence': 0.5}  # Default when no keywords match

# Test
test_review = "Amazing movie! Loved every minute of Inception."
result = label_with_llm(test_review)
print(f"Review: {test_review}")
print(f"Label: {result['label']}, Confidence: {result['confidence']:.2f}")
Question 5.2: Batch Label with LLM
# TODO: Label a batch of reviews using the LLM function
# Track time and estimated cost
sample_reviews = df_binary['review'].head(20).tolist()
# Your code here
Question 5.3: Evaluate LLM Labels Against Ground Truth
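A sketch of the evaluation, assuming llm_labels holds 0/1 predictions for the first 20 reviews as in the batch-labeling sketch above.
# Sketch: compare LLM labels against the ground-truth labels
from sklearn.metrics import precision_score, recall_score

y_true_sample = df_binary['label'].head(20).values
print(f"Accuracy:  {accuracy_score(y_true_sample, llm_labels):.2%}")
print(f"Precision: {precision_score(y_true_sample, llm_labels):.2%}")
print(f"Recall:    {recall_score(y_true_sample, llm_labels):.2%}")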
# TODO: Calculate accuracy, precision, recall of LLM labels
# Compare with human labels (ground truth)
# Your code here
Question 5.4: Hybrid Labeling Strategy
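One possible routing rule (a sketch; the function name hybrid_labeling_sketch is ours so it does not collide with the stub below).
# Sketch: accept high-confidence LLM labels, queue the rest for humans
def hybrid_labeling_sketch(reviews, confidence_threshold=0.8):
    llm_labeled = []   # (review, label) pairs accepted from the LLM
    human_queue = []   # reviews that need human review
    for review in reviews:
        result = label_with_llm(review)
        if result['confidence'] >= confidence_threshold:
            llm_labeled.append((review, result['label']))
        else:
            human_queue.append(review)
    return llm_labeled, human_queue

auto, queue = hybrid_labeling_sketch(df_binary['review'].head(50).tolist())
print(f"LLM-labeled: {len(auto)}, sent to humans: {len(queue)}")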
# TODO: Implement hybrid labeling
# - Use LLM for high-confidence labels
# - Send low-confidence examples to human review
def hybrid_labeling(reviews, confidence_threshold=0.8):
    """
    Label reviews using hybrid approach.
    Returns:
        llm_labeled: Reviews labeled by LLM
        human_queue: Reviews needing human review
    """
    # Your code here
    pass
Part 6: Noisy Label Detection with cleanlab
6.1 Setting Up cleanlab
from cleanlab import Datalab
from sklearn.model_selection import cross_val_predict
# Create a dataset with some noisy labels
X_noisy = X.copy()
y_noisy = y.copy()
# Flip 10% of labels to simulate noise
noise_idx = np.random.choice(len(y_noisy), size=int(0.1 * len(y_noisy)), replace=False)
y_noisy[noise_idx] = 1 - y_noisy[noise_idx] # Flip labels
print(f"Added noise to {len(noise_idx)} labels ({len(noise_idx)/len(y_noisy):.1%})")Question 6.1 (Solved): Find Label Issues
# SOLVED: Use cleanlab to find label issues
# Get out-of-fold predictions
clf = LogisticRegression(max_iter=1000)
pred_probs = cross_val_predict(clf, X_noisy, y_noisy, cv=5, method='predict_proba')
# Create Datalab and find issues
data_dict = {
    'labels': y_noisy
}
lab = Datalab(data=data_dict, label_name='labels')
lab.find_issues(pred_probs=pred_probs)
# Get summary
print("Issue Summary:")
print(lab.get_issue_summary())
Question 6.2: Analyze Detected Issues
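One way to score the detector (a sketch; it treats noise_idx from the setup cell as ground truth and reads the flags straight from lab).
# Sketch: how well did cleanlab recover the labels we actually flipped?
detected_idx = set(np.where(lab.get_issues()['is_label_issue'])[0])
true_noisy_idx = set(noise_idx)

true_positives = len(detected_idx & true_noisy_idx)
precision = true_positives / max(len(detected_idx), 1)
recall = true_positives / len(true_noisy_idx)
print(f"Detected {len(detected_idx)} issues; precision = {precision:.2%}, recall = {recall:.2%}")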
# Get detailed issues
issues = lab.get_issues()
print("\nLabel Issues:")
label_issues = issues[issues['is_label_issue'] == True]
print(f"Found {len(label_issues)} potential label errors")
# TODO: Calculate precision/recall of cleanlab's detection
# How many of the actual noisy labels did it find?
# How many false positives?
# Your code here
Question 6.3: Train on Clean Data
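One way to run the three-way comparison (a sketch; it reuses the issues DataFrame from Question 6.2 and always evaluates against the original clean labels y).
# Sketch: noisy vs. cleanlab-cleaned vs. oracle-clean training labels
idx_train, idx_eval = train_test_split(np.arange(len(y)), test_size=0.2, random_state=0)
flagged = issues['is_label_issue'].values              # boolean mask from cleanlab

def fit_and_score(train_rows, labels):
    clf = LogisticRegression(max_iter=1000).fit(X[train_rows], labels[train_rows])
    return clf.score(X[idx_eval], y[idx_eval])          # always evaluate on clean labels

kept_rows = idx_train[~flagged[idx_train]]              # training rows cleanlab did not flag
print(f"Noisy labels:        {fit_and_score(idx_train, y_noisy):.2%}")
print(f"Cleanlab-cleaned:    {fit_and_score(kept_rows, y_noisy):.2%}")
print(f"Oracle clean labels: {fit_and_score(idx_train, y):.2%}")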
# TODO: Compare model performance:
# 1. Trained on all data (with noise)
# 2. Trained on clean data (removing detected issues)
# 3. Trained on true clean data (oracle)
# Your code here
Part 7: Cost Comparison
Question 7.1: Calculate Labeling Costs
# Calculate and compare costs of different approaches
def calculate_costs(n_items, method):
    """
    Calculate estimated cost for different labeling methods.
    Args:
        n_items: Number of items to label
        method: 'manual', 'active_learning', 'weak_supervision', 'llm', 'hybrid'
    Returns:
        dict with cost, time, quality estimates
    """
    costs = {
        'manual': {
            'cost_per_item': 0.30,   # USD
            'time_per_item': 0.1,    # minutes
            'quality': 0.95,         # accuracy
            'items_needed': n_items
        },
        'active_learning': {
            'cost_per_item': 0.30,
            'time_per_item': 0.1,
            'quality': 0.90,
            'items_needed': n_items // 3  # 3x reduction
        },
        'weak_supervision': {
            'cost_per_item': 0.0,    # Just engineer time
            'setup_cost': 500,       # Writing LFs
            'time_per_item': 0.0,
            'quality': 0.80,
            'items_needed': 0
        },
        'llm': {
            'cost_per_item': 0.002,  # API cost
            'time_per_item': 0.01,   # minutes
            'quality': 0.85,
            'items_needed': n_items
        },
        'hybrid': {
            'cost_per_item': 0.05,   # Average
            'time_per_item': 0.02,
            'quality': 0.90,
            'items_needed': n_items
        }
    }
    method_info = costs[method]
    total_cost = method_info['cost_per_item'] * method_info['items_needed']
    if 'setup_cost' in method_info:
        total_cost += method_info['setup_cost']
    total_time = method_info['time_per_item'] * method_info['items_needed']
    return {
        'method': method,
        'total_cost': total_cost,
        'total_time_hours': total_time / 60,
        'expected_quality': method_info['quality']
    }
# Compare all methods for 100,000 items
n_items = 100000
methods = ['manual', 'active_learning', 'weak_supervision', 'llm', 'hybrid']
print(f"Cost comparison for {n_items:,} items:\n")
print(f"{'Method':<20} {'Cost ($)':<12} {'Time (hrs)':<12} {'Quality':<10}")
print("-" * 54)
for method in methods:
    result = calculate_costs(n_items, method)
    print(f"{result['method']:<20} ${result['total_cost']:>10,.2f} {result['total_time_hours']:>10,.1f} {result['expected_quality']:>8.0%}")
Challenge Problems
Challenge 1: Batch Active Learning
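If you want a starting point, here is one hedged sketch of the cluster-then-pick-most-uncertain idea; the helper name and the choice of KMeans over the raw TF-IDF vectors are ours.
# Sketch: pick the most uncertain sample from each of n_instances KMeans clusters
from sklearn.cluster import KMeans
from modAL.uncertainty import classifier_uncertainty

def diversity_uncertainty_sampling_sketch(classifier, X_pool, n_instances=10):
    uncertainty = classifier_uncertainty(classifier, X_pool)      # 1 - max class probability
    clusters = KMeans(n_clusters=n_instances, n_init=10, random_state=42).fit_predict(X_pool)
    query_idx = []
    for c in range(n_instances):
        members = np.where(clusters == c)[0]
        query_idx.append(members[np.argmax(uncertainty[members])])
    query_idx = np.array(query_idx)
    return query_idx, X_pool[query_idx]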
# Challenge: Implement batch active learning with diversity
# Query 10 samples at a time, ensuring diversity using clustering
from sklearn.cluster import KMeans
def diversity_uncertainty_sampling(classifier, X_pool, n_instances=10):
    """
    Select n_instances that are both uncertain AND diverse.
    Strategy:
    1. Get uncertainty scores for all samples
    2. Cluster samples into n_instances clusters
    3. Pick most uncertain sample from each cluster
    """
    # Your code here
    pass
Challenge 2: Combine All Methods
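One hedged sketch of how the routing inside label() could work, reusing the pieces from Parts 4-6 (smart_label is an illustrative standalone function, not the required class).
# Sketch: possible routing logic for the hybrid pipeline (illustrative only)
def smart_label(df_unlabeled, lfs, confidence_threshold=0.8):
    # 1) Weak supervision: LFs + LabelModel give probabilistic labels
    L = PandasLFApplier(lfs=lfs).apply(df_unlabeled)
    lm = LabelModel(cardinality=2, verbose=False)
    lm.fit(L_train=L, n_epochs=500, seed=42)
    probs = lm.predict_proba(L=L)
    labels = probs.argmax(axis=1)
    confidence = probs.max(axis=1)
    sources = np.where(confidence >= confidence_threshold, 'weak', 'undecided')

    # 2) LLM pass over the undecided examples
    for i in np.where(sources == 'undecided')[0]:
        result = label_with_llm(df_unlabeled['review'].iloc[i])
        if result['confidence'] >= confidence_threshold:
            labels[i] = 1 if result['label'] == 'POSITIVE' else 0
            confidence[i] = result['confidence']
            sources[i] = 'llm'
        else:
            sources[i] = 'human'   # 3) remaining hard cases -> active learning / human review

    # 4) A cleanlab pass (as in Part 6) could then flag suspicious final labels.
    return labels, sources, confidence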
# Challenge: Build a complete labeling pipeline that:
# 1. Uses weak supervision for initial bulk labels
# 2. Uses LLM to label uncertain examples
# 3. Uses active learning for remaining hard cases
# 4. Uses cleanlab to detect and fix errors
class SmartLabelingPipeline:
    def __init__(self, labeling_functions, llm_client=None, confidence_threshold=0.8):
        self.lfs = labeling_functions
        self.llm_client = llm_client
        self.threshold = confidence_threshold

    def label(self, data):
        """
        Label data using the hybrid pipeline.
        Returns:
            labels: Final labels
            sources: Source of each label (weak/llm/human)
            confidence: Confidence scores
        """
        # Your code here
        pass
Challenge 3: Active Learning with Human-in-the-Loop UI
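A tiny sketch of what such a widget could look like (names like queue and collected are ours; it assumes ipywidgets is installed).
# Sketch: minimal two-button annotation widget
import ipywidgets as widgets
from IPython.display import display

queue = df_binary['review'].head(10).tolist()    # reviews waiting for a label
collected = []                                   # (review, label) pairs gathered so far
out = widgets.Output()
pos_btn = widgets.Button(description='Positive', button_style='success')
neg_btn = widgets.Button(description='Negative', button_style='danger')

def show_next():
    with out:
        out.clear_output()
        print(queue[0] if queue else "All done!")

def record(label):
    def handler(_):
        if queue:
            collected.append((queue.pop(0), label))
        show_next()
    return handler

pos_btn.on_click(record(1))
neg_btn.on_click(record(0))
display(widgets.HBox([pos_btn, neg_btn]), out)
show_next()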
# Challenge: Create an interactive labeling interface
# Use ipywidgets to create a simple annotation UI
try:
    import ipywidgets as widgets
    from IPython.display import display, clear_output
    # Create interactive labeling widget
    # Your code here
except ImportError:
    print("Install ipywidgets for interactive labeling: pip install ipywidgets")
Summary
In this lab, you learned:
- Active Learning: Query strategies (uncertainty, margin, entropy) to label smarter
- Weak Supervision: Writing labeling functions to generate noisy but cheap labels
- LLM Labeling: Using GPT/Gemini as automated annotators
- Noisy Label Detection: Using cleanlab to find and fix label errors
- Cost Comparison: Trade-offs between different labeling approaches
Key Takeaways
| Method | Best When | Typical Savings |
|---|---|---|
| Active Learning | Limited budget | 2-10x |
| Weak Supervision | Patterns exist | 10-100x |
| LLM Labeling | Well-defined task | 10-50x |
| Hybrid | Large scale projects | 5-20x |
Next Week
Week 5: Data Augmentation - Create more training data without any labeling!