Lecture 7: Language Models - How Machines Understand Text

Interactive Demo Notebook

In this notebook, we’ll build understanding from the ground up:

1. Text as Data — characters, tokens, and numbers
2. Next Character Prediction — the simplest language model
3. N-gram Models — counting patterns
4. Word Embeddings — words as vectors
5. Temperature & Sampling — controlling generation

import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import warnings
warnings.filterwarnings('ignore')

# Configure plotting: clean whitegrid theme and a roomy default figure size.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})
print("Ready to understand language models! πŸ“–")

Part 1: Text as Data

Computers don’t understand words - they understand numbers!

# A tiny example sentence, viewed first as raw characters, then as word tokens.
text = "The cat sat on the mat"

print("Text:", text)
print("\n--- Character Level ---")
characters = list(text)
print("Characters:", characters)
print("As numbers (ASCII):", [ord(ch) for ch in characters])

print("\n--- Word Level ---")
words = text.lower().split()
print("Words:", words)

# Build a vocabulary: each distinct word gets a stable integer id.
vocab = sorted(set(words))
word_to_idx = dict(zip(vocab, range(len(vocab))))
print("Vocabulary:", vocab)
print("Word β†’ Index:", word_to_idx)
print("As indices:", [word_to_idx[w] for w in words])
# Visualize tokenization: draw each word as a colored box with its
# vocabulary index underneath.  Repeated words share a color and index.
fig, ax = plt.subplots(figsize=(14, 3))

palette = plt.cm.Set3(np.linspace(0, 1, len(vocab)))
x_pos = 0
for word in words:
    idx = word_to_idx[word]
    box_w = len(word) * 0.3 + 0.2  # box width roughly tracks word length
    ax.add_patch(plt.Rectangle((x_pos, 0), box_w, 1,
                               facecolor=palette[idx],
                               edgecolor='black', linewidth=2))
    ax.text(x_pos + box_w / 2, 0.5, word,
            ha='center', va='center', fontsize=14, fontweight='bold')
    ax.text(x_pos + box_w / 2, -0.3, f'idx={idx}',
            ha='center', va='center', fontsize=10)
    x_pos += box_w + 0.1

ax.set_xlim(-0.2, x_pos)
ax.set_ylim(-0.6, 1.3)
ax.axis('off')
ax.set_title('Tokenization: Words β†’ Numbers', fontsize=14)
plt.show()

print("πŸ’‘ Same words (like 'the') get the same index!")

Part 2: Next Character Prediction

The core idea of language models: Predict what comes next!

# Simple text for training
# Small training corpus: four simple, overlapping lines of text.
training_text = """
the cat sat on the mat
the dog sat on the log
the cat and the dog sat together
the mat was soft and warm
""".lower().strip()

# Build the character vocabulary and index mappings in both directions.
chars = sorted(set(training_text))
char_to_idx = {c: i for i, c in enumerate(chars)}
idx_to_char = {i: c for c, i in char_to_idx.items()}

print(f"Vocabulary size: {len(chars)} characters")
print(f"Characters: {chars}")
# Count, for each character, which character follows it (a bigram table).
next_char_counts = defaultdict(Counter)

for i in range(len(training_text) - 1):
    current_char = training_text[i]
    next_char = training_text[i + 1]
    next_char_counts[current_char][next_char] += 1

# Look at what follows 't'
print("After 't', we see:")
for char, count in next_char_counts['t'].most_common():
    char_display = repr(char) if char in '\n ' else char
    print(f"  '{char_display}': {count} times")
# Convert raw follow-counts into a probability distribution.
def get_next_char_probs(char):
    """Return ``{next_char: probability}`` for characters seen after ``char``.

    Returns an empty dict for a character never seen in training.
    Uses ``.get`` instead of indexing so that querying an unseen character
    does not insert an empty Counter into the defaultdict as a side effect.
    """
    counts = next_char_counts.get(char, Counter())
    total = sum(counts.values())
    return {c: count / total for c, count in counts.items()}

# Visualize the next-character distributions after 't' and after 'h'.
probs_after_h = get_next_char_probs('h')
probs_after_t = get_next_char_probs('t')

fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# One horizontal bar chart per conditioning character.
for ax, dist, source, color in [
    (axes[0], probs_after_t, 't', '#1e3a5f'),
    (axes[1], probs_after_h, 'h', '#e85a4f'),
]:
    # Show whitespace characters by their repr so they stay visible.
    labels = [repr(c) if c in '\n ' else c for c in dist]
    ax.barh(labels, list(dist.values()), color=color)
    ax.set_xlabel('Probability')
    ax.set_title(f'P(next | current="{source}")', fontsize=12)

plt.tight_layout()
plt.show()

print("πŸ’‘ After 'h', 'e' is most likely (in 'the')")
print("   This is the essence of language models!")
# Generate text by repeatedly sampling the next character from the
# learned bigram distribution.
def generate_text(start_char, length=50):
    """Sample up to `length` characters, starting from `start_char`."""
    pieces = [start_char]
    current = start_char

    for _ in range(length):
        dist = get_next_char_probs(current)
        if not dist:  # dead end: no follower ever observed
            break

        # Draw the next character according to the learned probabilities.
        options = list(dist.keys())
        weights = list(dist.values())
        current = np.random.choice(options, p=weights)
        pieces.append(current)

    return ''.join(pieces)

print("Generated text (starting with 't'):")
for i in range(3):
    print(f"  {i+1}. {generate_text('t', 40)}")

print("\nπŸ’‘ It learns patterns from the training data!")

Part 3: N-gram Models

Look at more context for better predictions

# Bigram (2-gram) at the word level: pair each word with its successor.
words = training_text.split()
bigrams = list(zip(words, words[1:]))

print("Sample bigrams:")
for first, second in bigrams[:8]:
    print(f"  {first} β†’ {second}")

# Tally how often each (word, next_word) pair occurs.
bigram_counts = Counter(bigrams)
print(f"\nMost common bigrams:")
for (w1, w2), count in bigram_counts.most_common(5):
    print(f"  '{w1} {w2}': {count} times")
# Word-level prediction: count which words follow each word.
word_next_counts = defaultdict(Counter)
for w1, w2 in bigrams:
    word_next_counts[w1][w2] += 1

def get_next_word_probs(word):
    """Return ``{next_word: probability}`` for words seen after ``word``.

    Returns an empty dict for unseen words.  Uses ``.get`` instead of
    indexing so the lookup does not insert an empty Counter into the
    defaultdict as a side effect.
    """
    counts = word_next_counts.get(word, Counter())
    total = sum(counts.values())
    return {w: c / total for w, c in counts.items()}

# Visualize the distribution over words that follow "the".
probs_after_the = get_next_word_probs('the')

plt.figure(figsize=(10, 4))
candidates = list(probs_after_the)
probabilities = [probs_after_the[w] for w in candidates]

plt.barh(candidates, probabilities, color='#2a9d8f')
plt.xlabel('Probability', fontsize=12)
plt.title('P(next_word | previous_word="the")', fontsize=14)
plt.tight_layout()
plt.show()

print("πŸ’‘ 'The cat', 'the dog', 'the mat' are all common patterns!")

Part 4: Word Embeddings - Words as Vectors

Similar words should have similar vectors

# Simple "embeddings" from co-occurrence: two words that appear within a
# +/- 2-word window of each other get a higher count in the matrix.
unique_words = sorted(set(words))
n_words = len(unique_words)
word_to_idx = {w: i for i, w in enumerate(unique_words)}

cooccur = np.zeros((n_words, n_words))
window_size = 2

for i, word in enumerate(words):
    row = word_to_idx[word]
    lo = max(0, i - window_size)
    hi = min(len(words), i + window_size + 1)
    for j in range(lo, hi):
        if j != i:  # skip the word itself
            cooccur[row, word_to_idx[words[j]]] += 1

print("Co-occurrence matrix (simplified embeddings):")
print(f"Shape: {cooccur.shape}")
print(f"Words: {unique_words}")
# Reduce the co-occurrence rows to 2D with PCA so we can plot them.
from sklearn.decomposition import PCA

# Tiny noise keeps PCA well-behaved even if some rows are all zeros.
cooccur_noisy = cooccur + np.random.randn(*cooccur.shape) * 0.01

embeddings_2d = PCA(n_components=2).fit_transform(cooccur_noisy)

plt.figure(figsize=(10, 8))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
            s=100, c='#1e3a5f', edgecolors='white')

# Label every point with its word.
for point, word in zip(embeddings_2d, unique_words):
    plt.annotate(word, (point[0], point[1]),
                 fontsize=12, ha='center', va='bottom',
                 xytext=(0, 5), textcoords='offset points')

plt.xlabel('Dimension 1', fontsize=12)
plt.ylabel('Dimension 2', fontsize=12)
plt.title('Words as Vectors (2D Projection)', fontsize=14)
plt.grid(True, alpha=0.3)
plt.show()

print("πŸ’‘ Words that appear in similar contexts end up close together!")

print("πŸ’‘ Words that appear in similar contexts end up close together!")

Part 5: Temperature & Sampling

Controlling creativity vs. consistency

def softmax_with_temperature(logits, temperature=1.0):
    """Convert raw scores to probabilities, scaled by temperature.

    Args:
        logits: sequence of raw (unnormalized) scores.
        temperature: must be > 0.  Values below 1 sharpen the distribution
            toward the top score; values above 1 flatten it toward uniform.

    Returns:
        np.ndarray of probabilities summing to 1.

    Raises:
        ValueError: if temperature is not strictly positive (previously this
            silently produced division-by-zero / NaN, hidden by the
            notebook-wide warnings filter).
    """
    if temperature <= 0:
        raise ValueError("temperature must be > 0")
    scaled = np.asarray(logits, dtype=float) / temperature
    # Subtract the max before exponentiating for numerical stability.
    exp_scaled = np.exp(scaled - np.max(scaled))
    return exp_scaled / exp_scaled.sum()

# Example: probabilities for the next word after "the".
# Logits are the raw scores before softmax.
logits = [2.0, 1.5, 1.0, 0.5, 0.2]  # cat, dog, mat, and, was
words_example = ['cat', 'dog', 'mat', 'and', 'was']

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Same logits, three temperatures: watch the distribution sharpen/flatten.
for ax, temp in zip(axes, [0.5, 1.0, 2.0]):
    probs = softmax_with_temperature(logits, temp)

    ax.bar(words_example, probs, color='#1e3a5f')
    ax.set_ylim(0, 0.8)
    ax.set_xlabel('Word')
    ax.set_ylabel('Probability')

    if temp < 1.0:
        desc = 'Low temp = More focused'
    elif temp > 1.0:
        desc = 'High temp = More random'
    else:
        desc = 'Default'

    ax.set_title(f'Temperature = {temp}\n({desc})', fontsize=12)

plt.tight_layout()
plt.show()

print("πŸ’‘ Temperature controls creativity:")
print("   Low (0.1-0.5):  Safe, predictable, repetitive")
print("   Medium (0.7-1): Balanced")
print("   High (1.5-2):   Creative, diverse, possibly weird")
# Demo: sample text with the bigram counts re-weighted by temperature.
def generate_with_temp(start_char, length=30, temperature=1.0):
    """Generate text, treating follow-counts as logits softened by `temperature`."""
    pieces = [start_char]
    current = start_char

    for _ in range(length):
        counts = next_char_counts[current]
        if not counts:  # no observed follower: stop generating
            break

        options = list(counts.keys())
        probs = softmax_with_temperature([counts[c] for c in options],
                                         temperature)
        current = np.random.choice(options, p=probs)
        pieces.append(current)

    return ''.join(pieces)

print("Generated text with different temperatures:\n")
for temp in [0.3, 1.0, 2.0]:
    print(f"Temperature = {temp}:")
    for _ in range(2):
        print(f"  {generate_with_temp('t', 40, temp)}")
    print()

Bonus: The Attention Intuition

Why transformers revolutionized NLP

# Visualize the attention idea: when processing "it", the model can look
# back at every other word and weight how relevant each one is.
sentence = "The cat sat because it was tired".split()

# Hand-crafted attention weights for "it" over all seven words.
attention_weights = np.array([0.1, 0.6, 0.05, 0.05, 0.0, 0.1, 0.1])

fig, ax = plt.subplots(figsize=(12, 4))

# Draw each word in a rounded box; highlight "it" in red.
for pos, word in enumerate(sentence):
    box_color = '#e85a4f' if word == 'it' else '#1e3a5f'
    ax.text(pos, 0, word, ha='center', va='center', fontsize=14,
            fontweight='bold',
            bbox=dict(boxstyle='round', facecolor=box_color, alpha=0.7),
            color='white')

# Draw arrows from "it" (index 4) to each word it attends to strongly;
# arrow opacity and thickness scale with the attention weight.
it_idx = 4
for pos, weight in enumerate(attention_weights):
    if pos != it_idx and weight > 0.05:
        ax.annotate('', xy=(pos, 0.15), xytext=(it_idx, -0.15),
                    arrowprops=dict(arrowstyle='->', color='green',
                                    alpha=weight * 1.5, linewidth=weight * 5))
        ax.text((pos + it_idx) / 2, 0.3, f'{weight:.0%}',
                ha='center', fontsize=10, color='green')

ax.set_xlim(-1, 7)
ax.set_ylim(-0.5, 0.8)
ax.axis('off')
ax.set_title('Attention: "it" attends most to "cat" (60%)', fontsize=14)
plt.show()

print("πŸ’‘ The word 'it' needs to know what it refers to!")
print("   Attention lets the model look at all words and decide which matter.")

🎯 Exercises

# Exercise 1: Train on different text (e.g., Shakespeare)
# Does the generated text change style?
# Hint: rebuild next_char_counts from `shakespeare` the same way Part 2
# built it from training_text, then call generate_text and compare output.
shakespeare = """
to be or not to be that is the question
whether tis nobler in the mind to suffer
the slings and arrows of outrageous fortune
"""
# Your code here:
# Exercise 2: Build a trigram model (3 characters of context)
# Does it generate better text?
# Hint: key the counts on the previous TWO characters, e.g.
#   counts[(text[i], text[i+1])][text[i+2]] += 1
# Your code here:

Summary

| Concept               | What We Learned                     |
|-----------------------|-------------------------------------|
| Tokenization          | Convert text to numbers             |
| Next Token Prediction | Core LM task: guess what’s next     |
| N-grams               | More context = better predictions   |
| Embeddings            | Words as vectors in space           |
| Temperature           | Control creativity vs consistency   |
| Attention             | Let model decide what to focus on   |