import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import warnings
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
print("Ready to understand language models! π")Lecture 7: Language Models - How Machines Understand Text
Interactive Demo Notebook
In this notebook, we'll build understanding from the ground up: 1. Text as Data - Characters, tokens, and numbers 2. Next Character Prediction - The simplest language model 3. N-gram Models - Counting patterns 4. Word Embeddings - Words as vectors 5. Temperature & Sampling - Controlling generation
Part 1: Text as Data
Computers don't understand words - they understand numbers!
# --- Part 1 demo: represent the same sentence at character and word level ---
text = "The cat sat on the mat"
print("Text:", text)

print("\n--- Character Level ---")
print("Characters:", list(text))
# ord() yields each character's Unicode code point (plain ASCII for this text).
print("As numbers (ASCII):", [ord(c) for c in text])

print("\n--- Word Level ---")
words = text.lower().split()
print("Words:", words)

# Create vocabulary: sorted unique words, each assigned a stable integer id.
vocab = sorted(set(words))
word_to_idx = {w: i for i, w in enumerate(vocab)}
print("Vocabulary:", vocab)
print("Word → Index:", word_to_idx)
print("As indices:", [word_to_idx[w] for w in words])

# Visualize tokenization
fig, ax = plt.subplots(figsize=(14, 3))
# Draw boxes for each token
colors = plt.cm.Set3(np.linspace(0, 1, len(vocab)))
x = 0
for i, word in enumerate(words):
color = colors[word_to_idx[word]]
width = len(word) * 0.3 + 0.2
rect = plt.Rectangle((x, 0), width, 1, facecolor=color, edgecolor='black', linewidth=2)
ax.add_patch(rect)
ax.text(x + width/2, 0.5, word, ha='center', va='center', fontsize=14, fontweight='bold')
ax.text(x + width/2, -0.3, f'idx={word_to_idx[word]}', ha='center', va='center', fontsize=10)
x += width + 0.1
ax.set_xlim(-0.2, x)
ax.set_ylim(-0.6, 1.3)
ax.axis('off')
ax.set_title('Tokenization: Words β Numbers', fontsize=14)
plt.show()
print("π‘ Same words (like 'the') get the same index!")Part 2: Next Character Prediction
The core idea of language models: Predict what comes next!
# Simple text for training
training_text = """
the cat sat on the mat
the dog sat on the log
the cat and the dog sat together
the mat was soft and warm
""".lower().strip()

# Build character vocabulary: every distinct character (including space and
# newline) gets an integer id, plus the reverse mapping for decoding.
chars = sorted(set(training_text))
char_to_idx = {c: i for i, c in enumerate(chars)}
idx_to_char = {i: c for c, i in char_to_idx.items()}
print(f"Vocabulary size: {len(chars)} characters")
print(f"Characters: {chars}")

# Count what character comes after each character
# Character-level bigram statistics:
# next_char_counts[c] is a Counter of which characters follow c in the corpus.
next_char_counts = defaultdict(Counter)
for current_char, next_char in zip(training_text, training_text[1:]):
    next_char_counts[current_char][next_char] += 1

# Look at what follows 't'
print("After 't', we see:")
for char, count in next_char_counts['t'].most_common():
    # Show whitespace characters in a visible (repr) form.
    char_display = repr(char) if char in '\n ' else char
    print(f" '{char_display}': {count} times")

# Convert counts to probabilities
def get_next_char_probs(char):
    """Return {next_char: probability} for characters observed after *char*.

    Returns an empty dict for a never-seen character. Uses .get() so that
    merely probing an unseen key does not insert an empty Counter into the
    next_char_counts defaultdict (the original indexed lookup did).
    """
    counts = next_char_counts.get(char)
    if not counts:
        return {}
    total = sum(counts.values())
    return {c: count / total for c, count in counts.items()}
# Visualize probabilities after 'th'
probs_after_h = get_next_char_probs('h')
probs_after_t = get_next_char_probs('t')
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
# After 't'
chars_t = list(probs_after_t.keys())
probs_t = list(probs_after_t.values())
labels_t = [repr(c) if c in '\n ' else c for c in chars_t]
axes[0].barh(labels_t, probs_t, color='#1e3a5f')
axes[0].set_xlabel('Probability')
axes[0].set_title('P(next | current="t")', fontsize=12)
# After 'h'
chars_h = list(probs_after_h.keys())
probs_h = list(probs_after_h.values())
labels_h = [repr(c) if c in '\n ' else c for c in chars_h]
axes[1].barh(labels_h, probs_h, color='#e85a4f')
axes[1].set_xlabel('Probability')
axes[1].set_title('P(next | current="h")', fontsize=12)
plt.tight_layout()
plt.show()
print("π‘ After 'h', 'e' is most likely (in 'the')")
print(" This is the essence of language models!")# Generate text!
def generate_text(start_char, length=50):
    """Sample up to *length* extra characters starting from *start_char*.

    Each step draws the next character from the empirical bigram
    distribution; generation stops early at a dead end (a character that
    was never observed followed by anything).
    """
    pieces = [start_char]
    current = start_char
    for _ in range(length):
        distribution = get_next_char_probs(current)
        if not distribution:
            break  # dead end: no observed successor
        candidates = list(distribution)
        weights = list(distribution.values())
        current = np.random.choice(candidates, p=weights)
        pieces.append(current)
    return ''.join(pieces)
print("Generated text (starting with 't'):")
for i in range(3):
print(f" {i+1}. {generate_text('t', 40)}")
print("\nπ‘ It learns patterns from the training data!")Part 3: N-gram Models
Look at more context for better predictions
# Bigram (2-gram): Consider previous word
words = training_text.split()
# All adjacent word pairs, in corpus order.
bigrams = list(zip(words, words[1:]))

print("Sample bigrams:")
for w1, w2 in bigrams[:8]:
    print(f" {w1} → {w2}")

# Count bigrams
bigram_counts = Counter(bigrams)
print("\nMost common bigrams:")
for (w1, w2), count in bigram_counts.most_common(5):
    print(f" '{w1} {w2}': {count} times")

# Word-level prediction
# word_next_counts[w] counts which words follow w in the corpus.
word_next_counts = defaultdict(Counter)
for w1, w2 in bigrams:
    word_next_counts[w1][w2] += 1

def get_next_word_probs(word):
    """Return {next_word: probability} for words observed after *word*.

    Empty dict for unseen words; .get() avoids inserting empty Counters
    into the defaultdict as a lookup side effect.
    """
    counts = word_next_counts.get(word)
    if not counts:
        return {}
    total = sum(counts.values())
    return {w: c / total for w, c in counts.items()}
# Visualize
probs_after_the = get_next_word_probs('the')
plt.figure(figsize=(10, 4))
words_list = list(probs_after_the.keys())
probs_list = list(probs_after_the.values())
plt.barh(words_list, probs_list, color='#2a9d8f')
plt.xlabel('Probability', fontsize=12)
plt.title('P(next_word | previous_word="the")', fontsize=14)
plt.tight_layout()
plt.show()
print("π‘ 'The cat', 'the dog', 'the mat' are all common patterns!")Part 4: Word Embeddings - Words as Vectors
Similar words should have similar vectors
# Simple example: Create embeddings based on co-occurrence
unique_words = sorted(set(words))
n_words = len(unique_words)
word_to_idx = {w: i for i, w in enumerate(unique_words)}

# Co-occurrence matrix (simplified embedding):
# cooccur[a, b] = how often word b appears within `window_size` positions of word a.
cooccur = np.zeros((n_words, n_words))
window_size = 2
for i, word in enumerate(words):
    idx = word_to_idx[word]
    # Clamp the context window to the corpus boundaries; skip the word itself.
    for j in range(max(0, i - window_size), min(len(words), i + window_size + 1)):
        if i != j:
            neighbor_idx = word_to_idx[words[j]]
            cooccur[idx, neighbor_idx] += 1

print("Co-occurrence matrix (simplified embeddings):")
print(f"Shape: {cooccur.shape}")
print(f"Words: {unique_words}")

# Reduce to 2D for visualization
from sklearn.decomposition import PCA
# Add small noise to avoid zero vectors
cooccur_noisy = cooccur + np.random.randn(*cooccur.shape) * 0.01
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(cooccur_noisy)
plt.figure(figsize=(10, 8))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], s=100, c='#1e3a5f', edgecolors='white')
for i, word in enumerate(unique_words):
plt.annotate(word, (embeddings_2d[i, 0], embeddings_2d[i, 1]),
fontsize=12, ha='center', va='bottom',
xytext=(0, 5), textcoords='offset points')
plt.xlabel('Dimension 1', fontsize=12)
plt.ylabel('Dimension 2', fontsize=12)
plt.title('Words as Vectors (2D Projection)', fontsize=14)
plt.grid(True, alpha=0.3)
plt.show()
print("π‘ Words that appear in similar contexts end up close together!")Part 5: Temperature & Sampling
Controlling creativity vs. consistency
def softmax_with_temperature(logits, temperature=1.0):
    """Apply softmax with temperature.

    Args:
        logits: sequence of raw (unnormalized) scores.
        temperature: > 0. Values < 1 sharpen the distribution toward the
            argmax; values > 1 flatten it toward uniform.

    Returns:
        np.ndarray of probabilities summing to 1.

    Raises:
        ValueError: if temperature is not strictly positive (the original
            silently produced NaNs here, masked by the warning filter).
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")
    scaled = np.asarray(logits, dtype=float) / temperature
    # Subtract the max before exponentiating for numerical stability.
    exp_scaled = np.exp(scaled - np.max(scaled))
    return exp_scaled / exp_scaled.sum()
# Example: Probabilities for next word after "the"
# Logits (raw scores before softmax)
logits = [2.0, 1.5, 1.0, 0.5, 0.2]  # cat, dog, mat, and, was
words_example = ['cat', 'dog', 'mat', 'and', 'was']

# Same logits rendered at three temperatures: sharpened, unchanged, flattened.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for idx, temp in enumerate([0.5, 1.0, 2.0]):
    probs = softmax_with_temperature(logits, temp)
    bars = axes[idx].bar(words_example, probs, color='#1e3a5f')
    axes[idx].set_ylim(0, 0.8)
    axes[idx].set_xlabel('Word')
    axes[idx].set_ylabel('Probability')
    if temp < 1.0:
        desc = 'Low temp = More focused'
    elif temp > 1.0:
        desc = 'High temp = More random'
    else:
        desc = 'Default'
    axes[idx].set_title(f'Temperature = {temp}\n({desc})', fontsize=12)
plt.tight_layout()
plt.show()
print("💡 Temperature controls creativity:")
print(" Low (0.1-0.5): Safe, predictable, repetitive")
print(" Medium (0.7-1): Balanced")
print(" High (1.5-2): Creative, diverse, possibly weird")

# Demo: Generate with different temperatures
def generate_with_temp(start_char, length=30, temperature=1.0):
    """Generate text from the character bigram counts with temperature-
    controlled sampling (low = conservative, high = adventurous).

    Stops early when the current character has no observed successor.
    """
    out = [start_char]
    current = start_char
    for _ in range(length):
        followers = next_char_counts[current]
        if not followers:
            break
        candidates = list(followers.keys())
        # Raw counts act as logits; temperature reshapes them into probabilities.
        raw_scores = [followers[c] for c in candidates]
        distribution = softmax_with_temperature(raw_scores, temperature)
        current = np.random.choice(candidates, p=distribution)
        out.append(current)
    return ''.join(out)
print("Generated text with different temperatures:\n")
for temp in [0.3, 1.0, 2.0]:
print(f"Temperature = {temp}:")
for i in range(2):
text = generate_with_temp('t', 40, temp)
print(f" {text}")
print()Bonus: The Attention Intuition
Why transformers revolutionized NLP
# Visualize attention concept
sentence = "The cat sat because it was tired".split()
# Fake attention weights for "it" looking at other words
attention_weights = np.array([0.1, 0.6, 0.05, 0.05, 0.0, 0.1, 0.1])
fig, ax = plt.subplots(figsize=(12, 4))
# Draw words
for i, word in enumerate(sentence):
color = '#e85a4f' if word == 'it' else '#1e3a5f'
ax.text(i, 0, word, ha='center', va='center', fontsize=14, fontweight='bold',
bbox=dict(boxstyle='round', facecolor=color, alpha=0.7),
color='white')
# Draw attention lines from "it" (index 4) to other words
it_idx = 4
for i, weight in enumerate(attention_weights):
if i != it_idx and weight > 0.05:
ax.annotate('', xy=(i, 0.15), xytext=(it_idx, -0.15),
arrowprops=dict(arrowstyle='->', color='green',
alpha=weight*1.5, linewidth=weight*5))
ax.text((i + it_idx)/2, 0.3, f'{weight:.0%}', ha='center', fontsize=10, color='green')
ax.set_xlim(-1, 7)
ax.set_ylim(-0.5, 0.8)
ax.axis('off')
ax.set_title('Attention: "it" attends most to "cat" (60%)', fontsize=14)
plt.show()
print("π‘ The word 'it' needs to know what it refers to!")
print(" Attention lets the model look at all words and decide which matter.")π― Exercises
# Exercise 1: Train on different text (e.g., Shakespeare)
# Does the generated text change style?
# Hint: rebuild next_char_counts from `shakespeare` below, then call
# generate_text('t', 40) again and compare the output.
shakespeare = """
to be or not to be that is the question
whether tis nobler in the mind to suffer
the slings and arrows of outrageous fortune
"""
# Your code here:
# Exercise 2: Build a trigram model (3 characters of context)
# Does it generate better text?
# Hint: key a Counter on the previous TWO characters, e.g. counts[(c1, c2)][c3],
# and seed generation with a two-character prefix.
# Your code here:
Summary
| Concept | What We Learned |
|---|---|
| Tokenization | Convert text to numbers |
| Next Token Prediction | Core LM task: guess what's next |
| N-grams | More context = better predictions |
| Embeddings | Words as vectors in space |
| Temperature | Control creativity vs consistency |
| Attention | Let model decide what to focus on |