RNN

ML
Author

Nipun Batra

Published

May 30, 2024

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, reduce, repeat
!wget https://raw.githubusercontent.com/MASTREX/List-of-Indian-Names/master/2.%20First.txt -O names-indian.txt
--2024-05-30 09:41:48--  https://raw.githubusercontent.com/MASTREX/List-of-Indian-Names/master/2.%20First.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8752 (8.5K) [text/plain]
Saving to: ‘names-indian.txt’

names-indian.txt    100%[===================>]   8.55K  --.-KB/s    in 0s      

2024-05-30 09:41:49 (33.8 MB/s) - ‘names-indian.txt’ saved [8752/8752]
import pandas as pd
pd.read_csv('names-indian.txt', header=None)
0
0 Abhishek
1 Aman
2 Harsh
3 Ayush
4 Aditi
... ...
1160 Prasoon
1161 Madhusudan
1162 Prastuti
1163 Rampratap
1164 Madhukar

1165 rows × 1 columns

# convert all names to lowercase
names = pd.read_csv('names-indian.txt', header=None)[0].str.lower().values
names
array(['abhishek', 'aman', 'harsh', ..., 'prastuti', 'rampratap',
       'madhukar'], dtype=object)
# Histogram of name lengths
plt.figure(figsize=(8, 4))
plt.hist([len(name) for name in names], bins=range(1, 20), density=True, alpha=0.7)
plt.xlabel('Name length')
plt.ylabel('Density')
Text(0, 0.5, 'Density')
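A quick numeric summary of the length distribution complements the histogram (a small extra sketch; the exact values depend on the downloaded file):

# Summary statistics of name lengths
lengths = np.array([len(name) for name in names])
print('min:', lengths.min(), 'mean:', round(lengths.mean(), 2), 'max:', lengths.max())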

# Attach START and END tokens to each name. Need to add these two to the vocabulary.
start_symbol = '^'
end_symbol = '$'

names = [start_symbol + name + end_symbol for name in names]
names[:5]
['^abhishek$', '^aman$', '^harsh$', '^ayush$', '^aditi$']
# Find unique characters in the dataset
vocab = set(''.join(names))
vocab = sorted(vocab)
print(vocab, len(vocab))
['$', '^', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 28
# Create a mapping between characters and integer indices for the vocabulary
class CharTable:
    def __init__(self, vocab):
        self.vocab = vocab
        self.char2index = {c: i for i, c in enumerate(vocab)}
        self.index2char = {i: c for i, c in enumerate(vocab)}
        self.vocab_size = len(vocab)
    
    def encode(self, name):
        return torch.tensor([self.char2index[c] for c in name])
    
    def decode(self, tensor):
        if type(tensor) == torch.Tensor:
            tensor = tensor.cpu().numpy()
        return ''.join([self.index2char[i] for i in tensor])
ct = CharTable(vocab)
ct.encode('^'), ct.encode('$'), ct.encode('a'), ct.encode('z'), ct.encode('ab'), ct.encode('za')
(tensor([1]),
 tensor([0]),
 tensor([2]),
 tensor([27]),
 tensor([2, 3]),
 tensor([27,  2]))
ct.decode([1]), ct.decode(torch.tensor([1])), ct.decode(torch.tensor([1, 2, 3]))
('^', '^', '^ab')
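As a quick sanity check (a small extra sketch, not in the original cells), encoding a full name and decoding it back should recover the original string:

# Round-trip check: decode(encode(name)) returns the original name
print(ct.decode(ct.encode(names[0])))  # '^abhishek$'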
# create embedding layer
class CharEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(CharEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        
    def forward(self, x):
        return self.embedding(x)
char_embedding = CharEmbedding(ct.vocab_size, 2)
def plot_2d_embeddings(embedding, vocab):
    # Plot the first two dimensions of each character's embedding vector
    plt.figure(figsize=(4, 4))
    for i, char in enumerate(vocab):
        tensor = ct.encode(char)
        emb = embedding(tensor)
        plt.scatter(emb[0, 0].item(), emb[0, 1].item())
        plt.text(emb[0, 0].item(), emb[0, 1].item(), char)
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')

plot_2d_embeddings(char_embedding, vocab)
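Under the hood, nn.Embedding is just a learnable lookup table of shape (vocab_size, embed_size); indexing its weight matrix gives the same vectors the layer returns. A small sketch to verify this with the char_embedding defined above:

# The embedding layer stores a (vocab_size, embed_size) weight matrix
print(char_embedding.embedding.weight.shape)  # torch.Size([28, 2])
idx = ct.char2index['a']
# Row lookup is equivalent to the layer's forward pass
print(torch.allclose(char_embedding(torch.tensor([idx]))[0],
                     char_embedding.embedding.weight[idx]))  # True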

import torch.nn.functional as F

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        hidden = torch.tanh(self.i2h(input) + self.h2h(hidden))
        output = self.h2o(hidden)
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)
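In equations, the forward pass above is the standard Elman recurrence (writing $W_{ih}$, $W_{hh}$, $W_{ho}$ for the weights of i2h, h2h and h2o, with biases omitted):

$$h_t = \tanh(W_{ih} x_t + W_{hh} h_{t-1}), \qquad o_t = \log\operatorname{softmax}(W_{ho} h_t)$$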
rnn = RNN(2, 128, ct.vocab_size)
# Predict the next character given the current character
current_char = "a"
print("Current character:", current_char)
# convert to tensor
current_tensor = ct.encode(current_char)
print("Curent tensor:", current_tensor)
# Look up the embedding
current_embedding = char_embedding(current_tensor)
print("Current embedding:", current_embedding)
# Initialize the hidden state
hidden = rnn.init_hidden()
#print(hidden)
# Pass the embedding and hidden state through the RNN
output, hidden = rnn(current_embedding, hidden)
print(output)

# Print the predicted character (most probable)
_, predicted_index = output.topk(1)
# flatten the tensor
predicted_index = predicted_index.squeeze().item()
# convert to character
predicted_char = ct.decode([predicted_index])
print("Predicted character:", predicted_char)
Current character: a
Current tensor: tensor([2])
Current embedding: tensor([[-1.4545,  0.9880]], grad_fn=<EmbeddingBackward0>)
tensor([[-2.5902, -3.3533, -3.8653, -3.9548, -3.5940, -2.8801, -3.4821, -3.0470,
         -3.5943, -3.5595, -3.6062, -3.5047, -3.6877, -3.3012, -3.7079, -4.4289,
         -2.9308, -3.6200, -3.3797, -3.7172, -2.8883, -2.6247, -3.7265, -3.3239,
         -3.7247, -2.9247, -3.4027, -3.2497]], grad_fn=<LogSoftmaxBackward0>)
Predicted character: $
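The cell above takes a single step. To process a whole name, the same step is applied repeatedly while the hidden state is carried forward. A minimal sketch of this unrolling, reusing the untrained rnn and char_embedding from above:

# Unroll the RNN over a full name, carrying the hidden state forward
name_tensor = ct.encode('^aman$')
hidden = rnn.init_hidden()
for idx in name_tensor:
    emb = char_embedding(idx.unsqueeze(0))  # shape (1, embed_size)
    output, hidden = rnn(emb, hidden)       # hidden feeds into the next step
print(output.shape, hidden.shape)  # torch.Size([1, 28]) torch.Size([1, 128])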
# Generate a word (sequence of characters) given a starting sequence of characters.
# Generation stops when the END token is predicted or when the generated
# word reaches 10 characters.
def create_name(start_string, rnn, char_embedding, ct):
    with torch.no_grad():
        # start with the last character in the start_string
        current_char = start_string[-1]
        current_tensor = ct.encode(current_char)
        current_embedding = char_embedding(current_tensor)
        hidden = rnn.init_hidden()
        name = start_string
        while current_char != end_symbol and len(name) < 10:
            output, hidden = rnn(current_embedding, hidden)
            # Find the next character by sampling from the output distribution
            predicted_index = torch.multinomial(torch.exp(output), 1).item()
            current_char = ct.decode([predicted_index])
            name += current_char
            current_tensor = ct.encode(current_char)
            current_embedding = char_embedding(current_tensor)
        return name
    
create_name('^a', rnn, char_embedding, ct)
'^anm$'
create_name('^c', rnn, char_embedding, ct)
'^c$'
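A common variation (not used in the original code) is temperature sampling: dividing the log-probabilities by a temperature T before exponentiating makes generation greedier (T < 1) or more diverse (T > 1). A hedged sketch of a hypothetical helper that could replace the sampling line inside create_name:

# Temperature-scaled sampling: T < 1 sharpens, T > 1 flattens the distribution
def sample_with_temperature(log_probs, temperature=1.0):
    probs = torch.exp(log_probs / temperature)
    probs = probs / probs.sum()
    return torch.multinomial(probs, 1).item()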
# Generate dataset for training
def generate_data(names, ct):
    X = []
    Y = []
    for name in names:
        for i in range(1, len(name)):
            X.append(name[i-1])
            Y.append(name[i])
    X = [ct.encode(x) for x in X]
    Y = [ct.encode(y) for y in Y]
    return X, Y

X, Y = generate_data(names, ct)
X[0], Y[0], X[1], Y[1], X[2], Y[2]
(tensor([1]),
 tensor([2]),
 tensor([2]),
 tensor([3]),
 tensor([3]),
 tensor([9]))
print(names[0])
print(ct.decode(X[0]), ct.decode(Y[0]))
print(ct.decode(X[1]), ct.decode(Y[1]))
print(ct.decode(X[2]), ct.decode(Y[2]))
^abhishek$
^ a
a b
b h
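Each name of length L (including the start and end symbols) contributes L - 1 (input, target) pairs, so the dataset size can be checked directly (a small sketch; the exact count depends on the name list):

# Each name contributes len(name) - 1 training pairs
expected = sum(len(name) - 1 for name in names)
print(len(X), expected, len(X) == expected)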
# Training loop
num_epochs = 12
learning_rate = 3e-4
embedding_size = 8
hidden_size = 32
rnn = RNN(embedding_size, hidden_size, ct.vocab_size)
embedding = CharEmbedding(ct.vocab_size, embedding_size)

optimizer = torch.optim.Adam(list(rnn.parameters()) + list(embedding.parameters()), lr=learning_rate)

criterion = nn.NLLLoss()

for epoch in range(num_epochs):
    total_loss = 0
    for i in range(len(X)):
        optimizer.zero_grad()
        hidden = rnn.init_hidden()  # fresh hidden state for every (input, target) pair
        input_tensor = X[i]
        target_tensor = Y[i]  # shape (1,): index of the next character
        input_embedding = embedding(input_tensor)
        output, hidden = rnn(input_embedding, hidden)

        loss = criterion(output, target_tensor)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
        #print(i, loss.item())
    print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {total_loss/len(X)}')
Epoch: 1/12, Loss: 2.684675631081001
Epoch: 2/12, Loss: 2.4274482760898484
Epoch: 3/12, Loss: 2.3604175581492934
Epoch: 4/12, Loss: 2.3314669918697972
Epoch: 5/12, Loss: 2.3155676853116023
Epoch: 6/12, Loss: 2.3054449003057
Epoch: 7/12, Loss: 2.2983417296262845
Epoch: 8/12, Loss: 2.2929774504282614
Epoch: 9/12, Loss: 2.2887099773854604
Epoch: 10/12, Loss: 2.2851798680263626
Epoch: 11/12, Loss: 2.2821793051528485
Epoch: 12/12, Loss: 2.2795761335450453
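Since NLLLoss averages the negative log-likelihood per character prediction, exponentiating the epoch loss gives the per-character perplexity, a convenient way to read the numbers above (a quick sketch):

import math
# Perplexity from the final average NLL (~2.28): roughly exp(2.28) ≈ 9.8,
# compared with 28 for a uniform guess over the vocabulary
print(math.exp(total_loss / len(X)))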
plot_2d_embeddings(embedding, vocab)

create_name('^a', rnn, embedding, ct)
'^an$'
create_name('^b', rnn, embedding, ct)
'^bhan$'
create_name('^c', rnn, embedding, ct)
'^chan$'
create_name('^d', rnn, embedding, ct)
'^dan$'
create_name('^n', rnn, embedding, ct)
'^n$'
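Because create_name samples from the model's output distribution, repeated calls with the same prefix can produce different names. A small closing sketch that draws a few samples (output varies from run to run):

# Sample one name per starting letter; results differ across runs
for letter in 'aknsv':
    print(create_name('^' + letter, rnn, embedding, ct))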