🎯 Project: Build Your Own Simple Language Model

Goal: Create an AI that writes text!

By the end of this notebook, you will have built a character-level language model that can generate new text in the style of your training data.

What we’ll build: - A model that learns patterns from text (like Shakespeare or song lyrics) - A generator that produces new text in a similar style!

Prerequisites: - Basic Python (loops, functions) - Completed L05 (Neural Networks) and L07 (Language Models) lectures

Step 0: Setup

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

# Pick the compute device: the GPU when one is available, otherwise the CPU.
# (Everything in this notebook also runs fine on CPU.)
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")
print("\nβœ… Ready to build your language model!")

Step 1: Get Some Text Data

We’ll use a small piece of text to train our model. You can change this to anything!

# Our training text - feel free to replace with your own!
# This is a short passage from Hamlet. .lower() keeps the vocabulary small
# (no separate upper-case characters) and .strip() removes the blank lines
# that the triple-quoted literal adds at the start and end.
text = """
To be or not to be that is the question
Whether tis nobler in the mind to suffer
The slings and arrows of outrageous fortune
Or to take arms against a sea of troubles
And by opposing end them to die to sleep
No more and by a sleep to say we end
The heartache and the thousand natural shocks
That flesh is heir to tis a consummation
Devoutly to be wished to die to sleep
To sleep perchance to dream ay there s the rub
For in that sleep of death what dreams may come
When we have shuffled off this mortal coil
Must give us pause there s the respect
That makes calamity of so long life
""".lower().strip()

print(f"πŸ“– Training text length: {len(text)} characters")
print(f"\nFirst 100 characters:")
print(text[:100])

Step 2: Create a Vocabulary

We need to convert characters to numbers (computers only understand numbers!)

# Build the character vocabulary: every distinct character, in sorted order
# so the id assignment is deterministic across runs.
chars = sorted(set(text))
vocab_size = len(chars)

print(f"πŸ“ Vocabulary size: {vocab_size} unique characters")
print(f"Characters: {chars}")
# Look-up tables between characters and integer ids (each the inverse of
# the other), built in a single pass over the vocabulary.
char_to_idx = {}
idx_to_char = {}
for i, ch in enumerate(chars):
    char_to_idx[ch] = i
    idx_to_char[i] = ch

# Helper functions
def encode(s):
    """Map each character of *s* to its integer vocabulary id.

    Raises KeyError if a character is not in the vocabulary.
    """
    ids = []
    for ch in s:
        ids.append(char_to_idx[ch])
    return ids

def decode(l):
    """Map a sequence of integer vocabulary ids back to the string they encode."""
    return ''.join(map(idx_to_char.__getitem__, l))

# Round-trip sanity check: a word should survive encode β†’ decode unchanged.
sample = "hello"
as_ints = encode(sample)
round_trip = decode(as_ints)

print(f"'{sample}' β†’ {as_ints} β†’ '{round_trip}'")
print("\nβœ… Encoding/decoding works!")

Step 3: Prepare Training Data

For language modeling, we predict the next character given previous characters.

# Convert entire text to numbers
# data is a 1-D int64 tensor of character ids (nn.Embedding requires long input).
data = torch.tensor(encode(text), dtype=torch.long)

print(f"Data shape: {data.shape}")
print(f"First 20 values: {data[:20].tolist()}")
print(f"Which is: '{decode(data[:20].tolist())}'")
# Create input-output pairs
# Input: characters 0 to n-1
# Output: characters 1 to n (shifted by 1)

# Context length: how many characters to look at
# (each training example is a window of this many consecutive characters)
context_length = 32

def get_batch(batch_size=32):
    """Sample a random batch of (input, target) character windows.

    Args:
        batch_size: number of windows to draw (with replacement).

    Returns:
        x: (batch_size, context_length) tensor of input character ids.
        y: (batch_size, context_length) tensor of targets -- the same
           windows shifted one character to the right.
    """
    # A window starting at i needs data[i : i+context_length+1] to exist,
    # so valid starts are 0 .. len(data) - context_length - 1.
    # torch.randint's upper bound is exclusive, so the bound below already
    # includes the last valid window; the previous extra "- 1" skipped it.
    starts = torch.randint(len(data) - context_length, (batch_size,))

    # Input is the window itself; target is the window shifted by one,
    # i.e. the "next character" at every position.
    x = torch.stack([data[i:i + context_length] for i in starts])
    y = torch.stack([data[i + 1:i + context_length + 1] for i in starts])

    return x, y

# Sanity-check the batcher: inspect shapes and decode one example pair.
xb, yb = get_batch(4)
print(f"Input batch shape: {xb.shape}")
print(f"Target batch shape: {yb.shape}")

print("\nExample (first item in batch):")
print(f"  Input:  '{decode(xb[0].tolist())}'")
print(f"  Target: '{decode(yb[0].tolist())}'")
print("\nπŸ’‘ Notice: Target is shifted by 1 character!")

Step 4: Build the Model! πŸ—οΈ

We’ll build a simple neural network with: 1. Embedding layer: Convert character indices to vectors 2. Linear layer: Learn patterns 3. Output layer: Predict next character

class SimpleLanguageModel(nn.Module):
    """A tiny per-character MLP language model.

    Each input character id is embedded, pushed through one hidden layer
    with a ReLU, and projected to a score (logit) for every character in
    the vocabulary.  Positions are processed independently -- there is no
    mixing of context here (that is what attention/recurrence would add).
    """

    def __init__(self, vocab_size, embed_size=64, hidden_size=128):
        super().__init__()
        # Submodules are created in a fixed order (embedding, fc1, fc2) so
        # seeded random initialization is reproducible.
        self.embedding = nn.Embedding(vocab_size, embed_size)  # id -> vector
        self.fc1 = nn.Linear(embed_size, hidden_size)          # pattern layer
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, vocab_size)          # vector -> logits

    def forward(self, x):
        """Map (batch, context) int ids to (batch, context, vocab_size) logits."""
        hidden = self.relu(self.fc1(self.embedding(x)))
        return self.fc2(hidden)

# Instantiate the model and report its architecture and size.
model = SimpleLanguageModel(vocab_size)
print(model)
n_params = sum(p.numel() for p in model.parameters())
print(f"\nπŸ“Š Total parameters: {n_params:,}")

Step 5: Train the Model! πŸš€

# Training setup: Adam optimizer plus cross-entropy over next-char predictions.
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Loss history for plotting below.
losses = []
num_steps = 2000
batch_size = 32

print("Training...")
for step in range(num_steps):
    x, y = get_batch(batch_size)

    # Forward pass: per-position logits over the vocabulary.
    logits = model(x)  # (batch, context, vocab_size)

    # Flatten (batch, context, vocab) -> (batch*context, vocab) so that
    # CrossEntropyLoss sees one independent prediction per position.
    B, T, C = logits.shape
    loss = criterion(logits.view(B * T, C), y.view(B * T))

    # Backward pass and parameter update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    losses.append(loss.item())

    if step % 500 == 0:
        print(f"  Step {step:4d}: Loss = {loss.item():.4f}")

print("\nβœ… Training complete!")
# Visualize how the loss fell over the course of training.
plt.figure(figsize=(10, 4))
plt.plot(losses, alpha=0.7)
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Training Loss Over Time')
plt.grid(True, alpha=0.3)
plt.show()

print(f"Final loss: {losses[-1]:.4f}")

Step 6: Generate Text! ✨

Now the fun part - let’s make our model write!

@torch.no_grad()
def generate(model, start_text, max_length=100, temperature=1.0):
    """Generate text starting from start_text"""
    model.eval()
    
    # Encode the starting text
    context = torch.tensor([encode(start_text)], dtype=torch.long)
    
    generated = list(start_text)
    
    for _ in range(max_length):
        # Get predictions for the last character position
        logits = model(context)  # (1, seq_len, vocab_size)
        logits = logits[0, -1, :] / temperature  # Just last position
        
        # Convert to probabilities
        probs = torch.softmax(logits, dim=0)
        
        # Sample next character
        next_idx = torch.multinomial(probs, 1).item()
        next_char = idx_to_char[next_idx]
        
        generated.append(next_char)
        
        # Update context (add new character)
        next_tensor = torch.tensor([[next_idx]], dtype=torch.long)
        context = torch.cat([context, next_tensor], dim=1)
        
        # Keep only last context_length characters
        if context.shape[1] > context_length:
            context = context[:, -context_length:]
    
    return ''.join(generated)

# Generate a sample continuation from a Shakespearean seed.
print("🎭 Generated text (starting with 'to be'):")
print("="*50)
print(generate(model, "to be", max_length=150))
print("="*50)
# Sample at several temperatures to see how it changes the output.
print("\n🌑️ Effect of temperature:\n")

for temperature in [0.5, 1.0, 1.5]:
    print(f"Temperature = {temperature}:")
    print(generate(model, "the ", max_length=80, temperature=temperature))
    print()

print("πŸ’‘ Lower temperature = more predictable")
print("   Higher temperature = more creative/random")

πŸŽ‰ Congratulations!

You just built a language model! The same basic idea powers ChatGPT and Claude:

  1. Data: Text from the internet (we used Shakespeare)
  2. Task: Predict the next character/token
  3. Model: Neural network (we used a simple one, GPT uses Transformers)
  4. Generation: Sample from predictions, use temperature to control creativity

πŸš€ Challenges to Try:

  1. Different text: Train on song lyrics, code, or your own writing
  2. Longer training: Increase num_steps to 10000
  3. Bigger model: Increase embed_size and hidden_size
  4. Word-level: Instead of characters, use words as tokens
# CHALLENGE: Train on your own text!
# Replace the text variable and run all cells again
# (paste your data between the triple quotes below, then assign it to
# `text` in Step 1 and re-run the notebook from that cell onward).

your_text = """
# Paste your own text here!
# Ideas: song lyrics, poems, code, stories
"""