import torch
import torch.nn.functional as F
from torch import nn
import matplotlib.pyplot as plt
from collections import Counter
import re
import os
import requests
# Reproducible sampling/initialization across runs.
torch.manual_seed(42)
# Use the GPU when available; everything below also works on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Part 3: BPE Tokenizer from Scratch
From characters to subwords
In Parts 1 and 2, we built character-level language models. They work, but have fundamental limitations:
- No word understanding: The model sees “the” as three separate decisions: t→h→e
- Long sequences: A sentence of 10 words might be 60+ characters
- Inefficient learning: Must learn spelling patterns from scratch
Modern LLMs use subword tokenization, which finds a middle ground between characters and words.
Byte-Pair Encoding (BPE)
BPE is an algorithm that learns the most common character pairs and merges them. Repeat until you reach a target vocabulary size.
Example:
Text: "low lower lowest"
Initial: ['l', 'o', 'w', ' ', 'l', 'o', 'w', 'e', 'r', ' ', 'l', 'o', 'w', 'e', 's', 't']
After merge 'l'+'o' → 'lo': ['lo', 'w', ' ', 'lo', 'w', 'e', 'r', ' ', 'lo', 'w', 'e', 's', 't']
After merge 'lo'+'w' → 'low': ['low', ' ', 'low', 'e', 'r', ' ', 'low', 'e', 's', 't']
...
Setup
Step 1: Implement BPE from Scratch
class BPETokenizer:
    """Minimal from-scratch Byte-Pair Encoding tokenizer.

    Training starts from a character-level vocabulary and repeatedly
    merges the most frequent adjacent pair into a new token until the
    vocabulary reaches ``vocab_size``.
    """

    def __init__(self, vocab_size=256):
        self.vocab_size = vocab_size   # target vocabulary size after training
        self.merges = {}               # (left, right) -> merged string, in learned order
        self.vocab = {}                # token id -> token string
        self.inverse_vocab = {}        # token string -> token id

    def _get_pairs(self, tokens):
        """Return a Counter of adjacent token pairs (first-seen order preserved)."""
        return Counter(zip(tokens, tokens[1:]))

    def _merge(self, tokens, pair, new_token):
        """Replace every non-overlapping occurrence of `pair`, scanning left to right."""
        merged = []
        i, n = 0, len(tokens)
        while i < n:
            if i + 1 < n and (tokens[i], tokens[i + 1]) == pair:
                merged.append(new_token)
                i += 2
            else:
                merged.append(tokens[i])
                i += 1
        return merged

    def train(self, text, verbose=True):
        """Learn BPE merges from `text`.

        Args:
            text: Training text.
            verbose: Print progress.
        """
        symbols = list(text)

        # Seed the vocabulary with every distinct character, sorted for determinism.
        base_chars = sorted(set(symbols))
        for idx, ch in enumerate(base_chars):
            self.vocab[idx] = ch
            self.inverse_vocab[ch] = idx
        next_id = len(base_chars)

        if verbose:
            print(f"Initial vocabulary size: {len(base_chars)}")
            print(f"Target vocabulary size: {self.vocab_size}")
            print(f"Merges to learn: {self.vocab_size - len(base_chars)}")
            print("-" * 50)

        for merge_idx in range(self.vocab_size - len(base_chars)):
            pair_counts = self._get_pairs(symbols)
            if not pair_counts:
                if verbose:
                    print("No more pairs to merge!")
                break

            # max() is stable, so ties resolve to the pair encountered first.
            best_pair, best_count = max(pair_counts.items(), key=lambda kv: kv[1])
            new_token = best_pair[0] + best_pair[1]

            # Record the merge, extend the vocabulary, then rewrite the corpus.
            self.merges[best_pair] = new_token
            self.vocab[next_id] = new_token
            self.inverse_vocab[new_token] = next_id
            symbols = self._merge(symbols, best_pair, new_token)
            next_id += 1

            if verbose and merge_idx % 50 == 0:
                print(f"Merge {merge_idx}: '{best_pair[0]}' + '{best_pair[1]}' → '{new_token}' (count: {best_count})")

        if verbose:
            print("-" * 50)
            print(f"Final vocabulary size: {len(self.vocab)}")
            print(f"Compression: {len(text)} chars → {len(symbols)} tokens")
            print(f"Compression ratio: {len(text) / len(symbols):.2f}x")

    def encode(self, text):
        """Encode `text` into token ids by replaying merges in learned order.

        NOTE(review): characters never seen during training raise KeyError here.
        """
        pieces = list(text)
        for pair, merged in self.merges.items():
            pieces = self._merge(pieces, pair, merged)
        return [self.inverse_vocab[p] for p in pieces]

    def decode(self, ids):
        """Map token ids back to their strings and concatenate."""
        return ''.join(self.vocab[i] for i in ids)

# Step 2: Train the Tokenizer
Let’s train on Shakespeare text:
# Load Shakespeare: download the Tiny Shakespeare corpus once and cache it.
if not os.path.exists('shakespeare.txt'):
    url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    response = requests.get(url, timeout=30)
    # Fail loudly on a bad download instead of silently caching an error page.
    response.raise_for_status()
    with open('shakespeare.txt', 'w', encoding='utf-8') as f:
        f.write(response.text)

# Pin the encoding so reads don't depend on the platform's default locale.
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
    shakespeare = f.read()

print(f"Training data: {len(shakespeare):,} characters")
# Output: Training data: 1,115,394 characters
# Train BPE tokenizer
# A 300-token vocabulary means 235 merges on top of the corpus's 65
# distinct characters (see the run transcript below).
tokenizer = BPETokenizer(vocab_size=300) # Start small
tokenizer.train(shakespeare)Initial vocabulary size: 65
Target vocabulary size: 300
Merges to learn: 235
--------------------------------------------------
Merge 0: 'e' + ' ' → 'e ' (count: 27643)
Merge 50: 'l' + 'i' → 'li' (count: 2358)
Merge 100: 'w' + 'ith' → 'with' (count: 1258)
Merge 150: 'a' + 'y ' → 'ay ' (count: 849)
Merge 200: 's' + 'ha' → 'sha' (count: 636)
--------------------------------------------------
Final vocabulary size: 300
Compression: 1115394 chars → 578590 tokens
Compression ratio: 1.93x
Step 3: See What It Learned
# Inspect the learned vocabulary.
print("Sample vocabulary entries (sorted by length):")
print("-" * 50)

# Order entries from single characters up to the longest learned subwords.
by_length = sorted(tokenizer.vocab.items(), key=lambda entry: len(entry[1]))

# Single characters first, then the longest merged subwords.
print("\nShortest tokens (individual characters):")
for token_id, token_str in by_length[:10]:
    # repr() makes whitespace tokens like '\n' visible.
    print(f" {token_id}: '{repr(token_str)[1:-1]}'")

print("\nLongest tokens (learned subwords):")
for token_id, token_str in by_length[-20:]:
    print(f" {token_id}: '{token_str}'")
# Output: Sample vocabulary entries (sorted by length):
--------------------------------------------------
Shortest tokens (individual characters):
0: '\n'
1: ' '
2: '!'
3: '$'
4: '&'
5: '''
6: ','
7: '-'
8: '.'
9: '3'
Longest tokens (learned subwords):
247: 'e.
'
271: 'thy '
277: ' the'
283: 'KING'
284: ' him'
292: 'But '
293: 'est '
295: 'but '
113: ' the '
166: 'have '
170: 'that '
202: 'ould '
243: ' and '
260: ' his '
268: 'That '
285: 'this '
299: ' with'
231: ',
And '
286: ' this '
288: ' that '
# Round-trip a few sample strings through the tokenizer.
test_texts = [
    "To be or not to be",
    "ROMEO: Wherefore art thou",
    "the king",
]

print("Encoding examples:")
print("=" * 60)
for sample in test_texts:
    token_ids = tokenizer.encode(sample)
    roundtrip = tokenizer.decode(token_ids)
    print(f"\nOriginal: '{sample}'")
    print(f"Token IDs: {token_ids}")
    print(f"Tokens: {[tokenizer.vocab[i] for i in token_ids]}")
    print(f"Decoded: '{roundtrip}'")
    print(f"Compression: {len(sample)} chars → {len(token_ids)} tokens")
# Output: Encoding examples:
============================================================
Original: 'To be or not to be'
Token IDs: [227, 197, 77, 1, 136, 92, 178]
Tokens: ['To ', 'be ', 'or', ' ', 'not ', 'to ', 'be']
Decoded: 'To be or not to be'
Compression: 18 chars → 7 tokens
Original: 'ROMEO: Wherefore art thou'
Token IDs: [30, 27, 25, 17, 27, 10, 1, 169, 72, 43, 109, 65, 81, 67, 185]
Tokens: ['R', 'O', 'M', 'E', 'O', ':', ' ', 'Wh', 'er', 'e', 'for', 'e ', 'ar', 't ', 'thou']
Decoded: 'ROMEO: Wherefore art thou'
Compression: 25 chars → 15 tokens
Original: 'the king'
Token IDs: [104, 49, 100]
Tokens: ['the ', 'k', 'ing']
Decoded: 'the king'
Compression: 8 chars → 3 tokens
What BPE Learns
Common patterns in Shakespeare: - “the”, “and”, “to” become single tokens - Common suffixes like “ing”, “ed”, “er” - Character names like “ROMEO” might merge partially
The tokenizer learns the statistical structure of the language!
Step 4: Build Language Model with BPE
Now let’s train a language model using our BPE tokenizer:
# Encode the entire text
# NOTE(review): this replays every learned merge over ~1.1M characters,
# so the cell is slow — expected for this simple O(merges * text) encoder.
encoded = tokenizer.encode(shakespeare)
print(f"Original: {len(shakespeare):,} characters")
print(f"Encoded: {len(encoded):,} tokens")
print(f"Compression ratio: {len(shakespeare) / len(encoded):.2f}x")Original: 1,115,394 characters
Encoded: 578,590 tokens
Compression ratio: 1.93x
# Create training dataset
# Each training example is a window of 32 token ids; the target is the
# single token id that follows the window.
block_size = 32 # Now 32 TOKENS, not characters
def build_dataset_bpe(encoded_text, block_size):
    """Build (context, target) next-token training pairs from a token-id sequence.

    Args:
        encoded_text: sequence of token ids (list or 1-D tensor).
        block_size: context length in tokens.

    Returns:
        X: LongTensor of shape (N, block_size) — sliding context windows.
        Y: LongTensor of shape (N,) — the token that follows each window.
    """
    data = torch.as_tensor(encoded_text, dtype=torch.long)
    if len(data) <= block_size:
        # No complete (window, target) pair exists; return consistently
        # shaped empty long tensors.
        return torch.empty(0, block_size, dtype=torch.long), torch.empty(0, dtype=torch.long)
    # Vectorized sliding windows via unfold instead of a Python loop over
    # ~578K positions; drop the final window, which has no following target.
    X = data.unfold(0, block_size, 1)[:-1].contiguous()
    Y = data[block_size:]
    return X, Y
# Materialize the full sliding-window dataset (578,590 tokens → 578,558 examples).
X, Y = build_dataset_bpe(encoded, block_size)
print(f"Dataset size: {len(X):,} examples")Dataset size: 578,558 examples
# Same model architecture!
class TokenLM(nn.Module):
    """MLP language model over BPE token ids.

    Embeds a fixed-size block of token ids, flattens the embeddings, and
    predicts logits for the next token — the same architecture as the
    character-level model, just with a larger vocabulary.
    """

    def __init__(self, vocab_size, block_size, emb_dim, hidden_size):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.hidden = nn.Linear(block_size * emb_dim, hidden_size)
        self.output = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        # (batch, block) -> (batch, block, emb) -> (batch, block * emb)
        flat = self.emb(x).flatten(start_dim=1)
        hidden_act = torch.tanh(self.hidden(flat))
        return self.output(hidden_act)
# Create model with larger capacity
vocab_size = len(tokenizer.vocab)  # 300 ids from the trained tokenizer
emb_dim = 64 # Increased from 32
hidden_size = 512 # Increased from 256
model = TokenLM(vocab_size, block_size, emb_dim, hidden_size).to(device)
num_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {num_params:,}")
# Model scale context
# Orders-of-magnitude comparison against well-known model sizes.
print(f"\n--- Model Scale Context ---")
print(f"Our model: ~{num_params/1000:.0f}K parameters")
print(f"GPT-2 Small: 124M parameters ({124_000_000/num_params:.0f}x larger)")
print(f"GPT-3: 175B parameters ({175_000_000_000/num_params:.0f}x larger)")
print(f"Claude/GPT-4: ~1T+ parameters ({1_000_000_000_000/num_params:.0f}x larger)")Model parameters: 1,222,188
--- Model Scale Context ---
Our model: ~1222K parameters
GPT-2 Small: 124M parameters (101x larger)
GPT-3: 175B parameters (143186x larger)
Claude/GPT-4: ~1T+ parameters (818205x larger)
# Train
def train(model, X, Y, epochs=500, batch_size=2048, lr=0.001):
    """Minibatch training loop with per-epoch reshuffling.

    Args:
        model: network mapping an id batch to next-token logits.
        X: (N, block_size) long tensor of context windows.
        Y: (N,) long tensor of target token ids.
        epochs: full passes over the dataset.
        batch_size: minibatch size.
        lr: AdamW learning rate.

    Returns:
        List of mean training losses, one per epoch.
    """
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    # Move the whole dataset to the device once, up front.
    X, Y = X.to(device), Y.to(device)

    losses = []
    for epoch in range(epochs):
        # Fresh shuffle each epoch.
        perm = torch.randperm(X.shape[0])
        epoch_loss = 0.0
        batch_count = 0
        for start in range(0, X.shape[0], batch_size):
            batch_idx = perm[start:start + batch_size]
            logits = model(X[batch_idx])
            loss = loss_fn(logits, Y[batch_idx])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            batch_count += 1
        losses.append(epoch_loss / batch_count)
        if epoch % 100 == 0:
            print(f"Epoch {epoch}: Loss = {losses[-1]:.4f}")
    return losses
# Fit the token-level model (transcript below is from the original run).
losses = train(model, X, Y, epochs=500)Epoch 0: Loss = 4.3560
Epoch 100: Loss = 0.2567
Epoch 200: Loss = 0.0432
Epoch 300: Loss = 0.0364
Epoch 400: Loss = 0.0320
Step 5: Generate with BPE
@torch.no_grad()
def generate_bpe(model, tokenizer, seed_text, length=200, temperature=0.8):
    """Sample `length` tokens from the model, seeded with `seed_text`.

    Args:
        model: trained TokenLM expecting block_size token ids per example.
        tokenizer: trained BPETokenizer used for encode/decode.
        seed_text: prompt; must contain only characters seen during training.
        length: number of tokens to sample.
        temperature: softmax temperature (<1 sharpens, >1 flattens).

    Returns:
        Decoded seed text followed by the sampled continuation.
    """
    model.eval()
    seed_ids = tokenizer.encode(seed_text)

    # Fixed-size context window: left-pad short seeds with token 0,
    # truncate long ones to the most recent block_size tokens.
    if len(seed_ids) < block_size:
        context = [0] * (block_size - len(seed_ids)) + seed_ids
    else:
        context = seed_ids[-block_size:]

    # BUG FIX: collect only the real seed plus sampled tokens. The original
    # seeded this list with the padded context, so short prompts decoded
    # with a run of spurious pad tokens (id 0 = '\n') at the front.
    generated_ids = list(seed_ids)
    for _ in range(length):
        x = torch.tensor([context[-block_size:]]).to(device)
        logits = model(x)
        probs = F.softmax(logits / temperature, dim=-1)
        next_id = torch.multinomial(probs, 1).item()
        generated_ids.append(next_id)
        # Slide the context window forward by one token.
        context = context[1:] + [next_id]

    return tokenizer.decode(generated_ids)
# Sample 150 tokens from the trained model, seeded with a speaker tag.
print("=" * 60)
print("GENERATED TEXT (BPE Tokenization)")
print("=" * 60)
print(generate_bpe(model, tokenizer, "ROMEO:\n", length=150))============================================================
GENERATED TEXT (BPE Tokenization)
============================================================
ROMEO:
why thine in you woul
s
On the inientans. Come, if you beor paice
Dodest but by get at I a wronge: proclaim:
And leave me to be evenge, I have drans:
Comarurious,
And, if you may appos. Bomingbatch herce!
ANREBIO:
Tell, moll hear you; made intwent
O, 'tood but joath. Thushy stafe, being?
M
Comparison: Character vs BPE
| Aspect | Character-Level | BPE (vocab=300) |
|---|---|---|
| Vocab size | 65 | 300 |
| Sequence for “the king” | 8 tokens | 3 tokens |
| Context window (32 tokens) | 32 characters | ~60 characters |
| Compression ratio | 1x | ~1.9x (grows with vocab size) |
| Learns word boundaries | No | Partially |
Why BPE is Better
- Efficiency: Fewer tokens = longer effective context
- Semantics: Whole words/subwords carry more meaning
- Generalization: Handles rare words by splitting into known subwords
- Scalability: How GPT-2/3/4 and modern LLMs tokenize
A 32-token context with BPE sees ~60 characters at our measured 1.93x ratio (more with a larger vocabulary), vs only 32 with character-level!
How Modern Tokenizers Extend This
GPT-2/3 use a variant called Byte-level BPE:
- Start with 256 byte values (not characters)
- Can represent ANY text (Unicode, emojis, etc.)
- Vocabulary size ~50,000 tokens
Our implementation is simplified but captures the core algorithm.
Train on Names with BPE
Let’s also test BPE on names to compare:
# Download names if needed (cached locally after the first run).
if not os.path.exists('names.csv'):
    url = "https://raw.githubusercontent.com/balasahebgulave/Dataset-Indian-Names/master/Indian_Names.csv"
    response = requests.get(url, timeout=30)
    # Fail loudly on a bad download instead of silently caching an error page.
    response.raise_for_status()
    with open('names.csv', 'w', encoding='utf-8') as f:
        f.write(response.text)

import pandas as pd

# Normalize: lowercase, strip whitespace, keep only 3-9 letter alphabetic names.
names = pd.read_csv('names.csv')["Name"].str.lower().str.strip()
names = names[names.str.len().between(3, 9)]
names = names[names.apply(lambda x: str(x).isalpha())]
names_text = '\n'.join(names.tolist())
print(f"Names corpus: {len(names_text):,} characters")
# Output: Names corpus: 44,324 characters
# Train smaller BPE for names
# 100 tokens over a 27-character base alphabet leaves 73 merges — enough
# to pick up common name fragments (e.g. 'eet' in the transcript below).
names_tokenizer = BPETokenizer(vocab_size=100)
names_tokenizer.train(names_text)Initial vocabulary size: 27
Target vocabulary size: 100
Merges to learn: 73
--------------------------------------------------
Merge 0: 'a' + '
' → 'a
' (count: 1510)
Merge 50: 'ee' + 't' → 'eet' (count: 112)
--------------------------------------------------
Final vocabulary size: 100
Compression: 44324 chars → 26636 tokens
Compression ratio: 1.66x
# Show how a few sample names split into learned subword pieces.
test_names = ["nipun", "priya", "arjun", "krishna"]

print("\nName encoding examples:")
for sample_name in test_names:
    piece_ids = names_tokenizer.encode(sample_name)
    pieces = [names_tokenizer.vocab[i] for i in piece_ids]
    print(f" {sample_name} → {pieces}")
Name encoding examples:
nipun → ['n', 'i', 'p', 'un']
priya → ['p', 'r', 'i', 'y', 'a']
arjun → ['ar', 'j', 'un']
krishna → ['k', 'r', 'ish', 'n', 'a']
Save Model and Tokenizer
Export both the BPE model and tokenizer for later use and comparison.
import pickle
# Create models directory
os.makedirs('../models', exist_ok=True)
# Save BPE tokenizer
# NOTE(review): pickle is fine for our own artifacts, but never unpickle
# files from untrusted sources — pickle.load can execute arbitrary code.
tokenizer_data = {
    'vocab_size': len(tokenizer.vocab),
    'merges': tokenizer.merges,                # (left, right) -> merged token, in learned order
    'vocab': tokenizer.vocab,                  # id -> token string
    'inverse_vocab': tokenizer.inverse_vocab,  # token string -> id
}
with open('../models/bpe_tokenizer_shakespeare.pkl', 'wb') as f:
    pickle.dump(tokenizer_data, f)
print(f"Tokenizer saved to ../models/bpe_tokenizer_shakespeare.pkl")
# Save model checkpoint
# model_config records everything needed to rebuild TokenLM before loading weights.
checkpoint = {
    'model_state_dict': model.state_dict(),
    'model_config': {
        'vocab_size': vocab_size,
        'block_size': block_size,
        'emb_dim': emb_dim,
        'hidden_size': hidden_size,
    },
    'final_loss': losses[-1] if losses else None,
}
torch.save(checkpoint, '../models/bpe_lm_shakespeare.pt')
print(f"Model saved to ../models/bpe_lm_shakespeare.pt")
# Verify
print(f"\nTokenizer vocab size: {len(tokenizer.vocab)}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")Tokenizer saved to ../models/bpe_tokenizer_shakespeare.pkl
Model saved to ../models/bpe_lm_shakespeare.pt
Tokenizer vocab size: 300
Model parameters: 1,222,188
Clean Up GPU Memory
import gc
# Delete model and data tensors
# Dropping the Python references lets PyTorch free their device allocations.
del model
del X, Y
# Clear CUDA cache
# NOTE(review): indentation was lost in this export — gc.collect() is read
# here as unconditional (runs on CPU too); confirm against the original notebook.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
gc.collect()
print("GPU memory cleared!")
if torch.cuda.is_available():
    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**2:.1f} MB")
    print(f"GPU memory cached: {torch.cuda.memory_reserved() / 1024**2:.1f} MB")GPU memory cleared!
GPU memory allocated: 20.9 MB
GPU memory cached: 62.0 MB
Summary
In this part, we built BPE tokenization from scratch:
- Algorithm: Iteratively merge most common pairs
- Vocabulary: Learns subword units from data
- Compression: ~2x fewer tokens than characters at vocab 300 (larger vocabularies compress further)
- Longer context: Same number of tokens sees more text
| Step | What happens |
|---|---|
| Initialize | Start with characters |
| Count pairs | Find adjacent token frequencies |
| Merge best | Create new token from pair |
| Repeat | Until target vocab size |
What’s Next
In Part 4, we’ll add self-attention—the key mechanism that makes transformers powerful. Our MLP treats all context positions equally; attention lets the model focus on relevant parts.
Exercises
- Vary vocab size: Try 100, 500, 1000. How does compression change?
- Different data: Train BPE on Python code or another language
- Subword analysis: What common subwords does it learn?
- Byte-level BPE: Modify to work with bytes instead of characters
- OOV handling: What happens with words not in training data?