import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
# Check if we have a GPU (optional, works without)
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")
print("\n✅ Ready to build your language model!")
🎯 Project: Build Your Own Simple Language Model
Goal: Create an AI that writes text!
By the end of this notebook, you will have built a character-level language model that can generate new text in the style of your training data.
What we'll build: - A model that learns patterns from text (like Shakespeare or song lyrics) - Generate new text that sounds similar!
Prerequisites: - Basic Python (loops, functions) - Completed L05 (Neural Networks) and L07 (Language Models) lectures
Step 0: Setup
Step 1: Get Some Text Data
We'll use a small piece of text to train our model. You can change this to anything!
# Our training text - feel free to replace with your own!
# Lowercased and stripped so the vocabulary stays small.
text = """
To be or not to be that is the question
Whether tis nobler in the mind to suffer
The slings and arrows of outrageous fortune
Or to take arms against a sea of troubles
And by opposing end them to die to sleep
No more and by a sleep to say we end
The heartache and the thousand natural shocks
That flesh is heir to tis a consummation
Devoutly to be wished to die to sleep
To sleep perchance to dream ay there s the rub
For in that sleep of death what dreams may come
When we have shuffled off this mortal coil
Must give us pause there s the respect
That makes calamity of so long life
""".lower().strip()

# Fix: the emoji in this message was mojibake-corrupted in the export.
print(f"📖 Training text length: {len(text)} characters")
print("\nFirst 100 characters:")  # fix: was an f-string with no placeholders
print(text[:100])
Step 2: Create a Vocabulary
We need to convert characters to numbers (computers only understand numbers!)
# Get all unique characters (sorted so indices are deterministic across runs)
chars = sorted(set(text))  # fix: redundant list() around set() removed
vocab_size = len(chars)
# Fix: emoji in this message was mojibake-corrupted in the export.
print(f"📚 Vocabulary size: {vocab_size} unique characters")
print(f"Characters: {chars}")

# Create mappings: character <-> number
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}
# Helper functions
def encode(s):
    """Map each character of `s` to its integer vocabulary index.

    Uses the module-level `char_to_idx` table; indexing raises KeyError
    for characters outside the training vocabulary.
    """
    return [char_to_idx[ch] for ch in s]
def decode(l):
    """Turn a list of integer indices back into a string via `idx_to_char`."""
    return ''.join(idx_to_char[i] for i in l)
# Test it! Round-trip a word through encode -> decode.
test = "hello"
encoded = encode(test)
decoded = decode(encoded)
# Fix: the arrows in this message were mojibake-corrupted in the export.
print(f"'{test}' → {encoded} → '{decoded}'")
print("\n✅ Encoding/decoding works!")
Step 3: Prepare Training Data
For language modeling, we predict the next character given previous characters.
# Convert entire text to numbers (one int64 per character)
data = torch.tensor(encode(text), dtype=torch.long)
print(f"Data shape: {data.shape}")
print(f"First 20 values: {data[:20].tolist()}")
print(f"Which is: '{decode(data[:20].tolist())}'")
# Create input-output pairs
# Input: characters 0 to n-1
# Output: characters 1 to n (shifted by 1)
# Context length: how many characters to look at
context_length = 32
def get_batch(batch_size=32):
    """Sample a random batch of (input, target) character sequences.

    Draws `batch_size` random windows of `context_length` characters from
    the module-level `data` tensor; each target sequence is the same window
    shifted one character to the right (next-character prediction).
    """
    # Random starting offsets, leaving room for the window plus one target char
    offsets = torch.randint(len(data) - context_length - 1, (batch_size,))
    inputs = torch.stack([data[o:o + context_length] for o in offsets])
    targets = torch.stack([data[o + 1:o + context_length + 1] for o in offsets])
    return inputs, targets
# Test it: both tensors should be (4, context_length)
x_batch, y_batch = get_batch(4)
print(f"Input batch shape: {x_batch.shape}")
print(f"Target batch shape: {y_batch.shape}")
print("\nExample (first item in batch):")
print(f" Input: '{decode(x_batch[0].tolist())}'")
print(f" Target: '{decode(y_batch[0].tolist())}'")
print("\n💡 Notice: Target is shifted by 1 character!")
Step 4: Build the Model! 🏗️
Weβll build a simple neural network with: 1. Embedding layer: Convert character indices to vectors 2. Linear layer: Learn patterns 3. Output layer: Predict next character
class SimpleLanguageModel(nn.Module):
    """Per-position MLP language model: embedding -> hidden layer -> vocab logits.

    Each position in the input is mapped independently; no information flows
    between positions (unlike an RNN or Transformer).
    """

    def __init__(self, vocab_size, embed_size=64, hidden_size=128):
        super().__init__()
        # Each character index gets a learned embed_size-dim vector
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Two-layer feed-forward head producing next-character logits
        self.fc1 = nn.Linear(embed_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Map (batch, context) int indices to (batch, context, vocab_size) logits."""
        hidden = self.relu(self.fc1(self.embedding(x)))
        return self.fc2(hidden)
# Create the model
# Uses the default embed_size=64 / hidden_size=128.
# NOTE(review): the model stays on CPU; `device` from Step 0 is never used here.
model = SimpleLanguageModel(vocab_size)
print(model)
print(f"\n📊 Total parameters: {sum(p.numel() for p in model.parameters()):,}")
Step 5: Train the Model! 🚀
# Training setup
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training loop
losses = []
num_steps = 2000
batch_size = 32

print("Training...")
for step in range(num_steps):
    # Get a batch of (context, next-character) pairs
    x, y = get_batch(batch_size)

    # Forward pass: logits for every position of every sequence
    logits = model(x)  # (batch, context, vocab_size)

    # CrossEntropyLoss expects (N, C) logits and (N,) targets, so flatten
    # the batch and context dimensions together.
    B, T, C = logits.shape
    loss = criterion(logits.view(B * T, C), y.view(B * T))

    # Backward pass and parameter update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    losses.append(loss.item())
    if step % 500 == 0:
        print(f" Step {step:4d}: Loss = {loss.item():.4f}")

# Fix: the final status print was split across lines / mojibake-corrupted
# in the export (invalid syntax); restored here.
print("\n✅ Training complete!")
# Plot training loss
# Visualize how the loss fell over the training steps.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(losses, alpha=0.7)
ax.set_xlabel('Step')
ax.set_ylabel('Loss')
ax.set_title('Training Loss Over Time')
ax.grid(True, alpha=0.3)
plt.show()
print(f"Final loss: {losses[-1]:.4f}")
Step 6: Generate Text! ✨
Now the fun part - letβs make our model write!
@torch.no_grad()
def generate(model, start_text, max_length=100, temperature=1.0):
    """Autoregressively sample up to `max_length` characters after `start_text`.

    Lower `temperature` sharpens the distribution (more predictable output);
    higher flattens it (more random output). Uses the module-level `encode`,
    `idx_to_char`, and `context_length`. Note: leaves the model in eval mode.
    """
    model.eval()
    # The running context window, shape (1, seq_len)
    window = torch.tensor([encode(start_text)], dtype=torch.long)
    out_chars = list(start_text)

    for _ in range(max_length):
        # Only the logits at the last position predict the next character.
        last_logits = model(window)[0, -1, :] / temperature
        probs = torch.softmax(last_logits, dim=0)
        choice = torch.multinomial(probs, 1).item()
        out_chars.append(idx_to_char[choice])

        # Append the sampled character and cap the window at context_length.
        window = torch.cat(
            [window, torch.tensor([[choice]], dtype=torch.long)], dim=1
        )
        if window.shape[1] > context_length:
            window = window[:, -context_length:]

    return ''.join(out_chars)
# Generate some text!
# Fix: emoji in the messages below were mojibake-corrupted in the export.
print("🎭 Generated text (starting with 'to be'):")
print("=" * 50)
print(generate(model, "to be", max_length=150))
print("=" * 50)

# Try different temperatures!
print("\n🌡️ Effect of temperature:\n")
for temp in [0.5, 1.0, 1.5]:
    print(f"Temperature = {temp}:")
    print(generate(model, "the ", max_length=80, temperature=temp))
    print()

print("💡 Lower temperature = more predictable")
print(" Higher temperature = more creative/random")
🎉 Congratulations!
You just built a language model! The same basic idea powers ChatGPT and Claude:
- Data: Text from the internet (we used Shakespeare)
- Task: Predict the next character/token
- Model: Neural network (we used a simple one, GPT uses Transformers)
- Generation: Sample from predictions, use temperature to control creativity
🚀 Challenges to Try:
- Different text: Train on song lyrics, code, or your own writing
- Longer training: Increase num_steps to 10000
- Bigger model: Increase embed_size and hidden_size
- Word-level: Instead of characters, use words as tokens
# CHALLENGE: Train on your own text!
# Replace the text variable and run all cells again
# (e.g. set text = your_text.lower().strip() in Step 1, then re-run everything)
your_text = """
# Paste your own text here!
# Ideas: song lyrics, poems, code, stories
"""