Commit dcec0acc authored by Brian McMahan

removing some extra code

parent dc9b5633
%% Cell type:markdown id: tags:
# Conditional Surname Generation with RNNs
## Dataset Info
The surnames dataset has been collected from a couple of different sources.
#### Value Counts for the Nationality:
```
russian 9408
english 3668
arabic 2000
japanese 991
german 724
italian 709
czech 519
spanish 298
dutch 297
french 277
chinese 268
irish 232
greek 203
polish 139
scottish 100
korean 94
portuguese 74
vietnamese 73
Name: nationality, dtype: int64
```
## Model Info
The `ConditionalCharRNN` first conditions on an embedding of the nationality and is then trained to generate the surnames. In this way, the model can learn nationality-specific representations for surnames.
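%% Cell type:markdown id: tags:
To make the conditioning idea concrete before the full implementation below, here is a minimal, hypothetical sketch: the nationality index is embedded into a vector of the same size as the hidden state and passed in as the RNN's initial hidden state. It uses PyTorch's built-in `nn.GRU` rather than the `ExplicitRNN` defined later, and all sizes are made up.
%% Cell type:code id: tags:
``` python
# A minimal sketch (not the ConditionalCharRNN defined below) of conditioning an
# RNN on a label embedding via its initial hidden state. Sizes are illustrative.
import torch
import torch.nn as nn

hidden_size, vocab_size, num_labels = 8, 20, 5
char_emb = nn.Embedding(vocab_size, 4, padding_idx=0)  # character embeddings
label_emb = nn.Embedding(num_labels, hidden_size)      # one vector per nationality
rnn = nn.GRU(input_size=4, hidden_size=hidden_size, batch_first=True)

x = torch.randint(1, vocab_size, (2, 6))   # a batch of 2 sequences of length 6
labels = torch.tensor([0, 3])              # one nationality index per sequence
h0 = label_emb(labels).unsqueeze(0)        # shape (num_layers=1, batch, hidden)
out, _ = rnn(char_emb(x), h0)              # the label now biases every time step
print(out.shape)                           # torch.Size([2, 6, 8])
```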
%% Cell type:code id: tags:
``` python
from argparse import Namespace
from collections import Counter
import json
import os
os.environ['OMP_NUM_THREADS'] = '4'
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook
from vocabulary import Vocabulary
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (14, 6)
START_TOKEN = "^"
END_TOKEN = "_"
IGNORE_INDEX_VALUE = -100
```
%% Cell type:markdown id: tags:
# Dataset
%% Cell type:markdown id: tags:
## Dataset Utilities
%% Cell type:code id: tags:
``` python
def count_tokens(x_data_list):
"""Count the tokens in the data list
Args:
x_data_list (list(list(str))): a list of lists, each sublist is a list of string tokens.
In other words, a list of the data points where the data points have been tokenized.
Returns:
dict: a mapping from tokens to their counts
"""
# alternatively
# return Counter([token for x_data in x_data_list for token in x_data])
counter = Counter()
for x_data in x_data_list:
for token in x_data:
counter[token] += 1
return counter
def add_splits(df, target_y_column=None, split_proportions=(0.7, 0.15, 0.15), seed=0):
"""Add 'train', 'val', and 'test' splits to the dataset
Args:
df (pd.DataFrame): the data frame to assign splits to
target_y_column (str): [default=None]
the name of the label column; in order to preserve the class distribution
between splits, the label column is used to group the datapoints and splits are
assigned within these groups. If None, the assumption is this is an unsupervised
task and the splits will not respect any class distribution
split_proportions (tuple(float, float, float)): three floats which represent the
proportion in 'train', 'val', and 'test'. Must sum to 1.
seed (int): the random seed for making the shuffling deterministic. If the dataset and seed
are kept the same, the split assignment is deterministic.
Returns:
pd.DataFrame: the input dataframe with a new column for split assignments; note: row order
will have changed
"""
if target_y_column is not None:
# partition on the y label
df_by_label = {label: [] for label in df[target_y_column].unique()}
for _, row in df.iterrows():
df_by_label[row[target_y_column]].append(row.to_dict())
else:
# no y label to partition on
df_by_label = dict(unlabeled=df.to_dict('records'))
np.random.seed(seed)
assert np.isclose(sum(split_proportions), 1.0), "`split_proportions` should sum to 1"
train_p, val_p, test_p = split_proportions
out_df = []
# to ensure consistent behavior, lexicographically sort the dictionary
for _, data_points in sorted(df_by_label.items()):
np.random.shuffle(data_points)
n_total = len(data_points)
n_train = int(train_p * n_total)
n_val = int(val_p * n_total)
for data_point in data_points[:n_train]:
data_point['split'] = 'train'
for data_point in data_points[n_train:n_train+n_val]:
data_point['split'] = 'val'
for data_point in data_points[n_train+n_val:]:
data_point['split'] = 'test'
out_df.extend(data_points)
return pd.DataFrame(out_df)
```
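%% Cell type:markdown id: tags:
As a quick sanity check, the utilities above can be exercised on a tiny, made-up DataFrame (the surnames and split proportions below are illustrative only):
%% Cell type:code id: tags:
``` python
# Exercise add_splits and count_tokens on a toy DataFrame (values are made up).
toy_df = pd.DataFrame({
    'surname': ['smith', 'jones', 'ivanov', 'petrov', 'sato', 'tanaka'],
    'nationality': ['english', 'english', 'russian', 'russian', 'japanese', 'japanese']
})
toy_df = add_splits(toy_df, target_y_column='nationality',
                    split_proportions=(0.5, 0.25, 0.25), seed=0)
print(toy_df['split'].value_counts())
# count_tokens expects tokenized data points, e.g. surnames split into characters
print(count_tokens(toy_df['surname'].apply(list).tolist()).most_common(3))
```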
%% Cell type:markdown id: tags:
# Text Sequence Vectorizer
%% Cell type:code id: tags:
``` python
class TextSequenceVectorizer:
"""A composite data structure that uses Vocabularies to map text and its labels to integers
This differs from the SupervisedTextVectorizer in that it outputs the sequences
in a source-target pairing: the source is all indices except the final index and
the target is all indices except the first index. In this pairing, the computational
goal is to learn a function which maps source to target at every timestep.
The variable name of the label has changed in various places as well. This is to indicate
that the label column is no longer the y-variable of the machine learning problem.
Attributes:
token_vocab (Vocabulary): the vocabulary managing the mapping between text tokens and
the unique indices that represent them
label_vocab (Vocabulary): the vocabulary managing the mapping between labels and the
unique indices that represent them.
max_seq_length (int): the length of the longest sequence (including start or end tokens
that will be prepended or appended).
"""
def __init__(self, token_vocab, label_vocab, max_seq_length):
"""Initialize the TextSequenceVectorizer
Args:
token_vocab (Vocabulary): the vocabulary managing the mapping between text tokens and
the unique indices that represent them
label_vocab (Vocabulary): the vocabulary managing the mapping between labels and the
unique indices that represent them.
max_seq_length (int): the length of the longest sequence (including start or end tokens
that will be prepended or appended).
"""
self.token_vocab = token_vocab
self.label_vocab = label_vocab
self.max_seq_length = max_seq_length
def _wrap_with_start_end(self, x_data):
"""Prepend the start token and append the end token.
Args:
x_data (list(str)): the list of string tokens in the data point
Returns:
list(str): the list of string tokens with start token prepended and end token appended
"""
return [self.token_vocab.start_token] + x_data + [self.token_vocab.end_token]
def vectorize(self, x_data, label):
"""Convert the data point and its label into their integer form
Args:
x_data (list(str)): the list of string tokens in the data point
label (str,int): the label associated with the data point
Returns:
numpy.ndarray, np.ndarray, int:
the first two outputs are x_data in two vectorized forms:
the first is the source vector (x_data[:-1])
the second is the target vector (x_data[1:])
the third output is the label mapped to the integer that represents it
"""
x_data = self._wrap_with_start_end(x_data)
x_source_vector = np.zeros(self.max_seq_length).astype(np.int64)
y_target_vector = np.ones(self.max_seq_length).astype(np.int64) * IGNORE_INDEX_VALUE
x_data_indices = [self.token_vocab[token] for token in x_data]
x_source_indices = x_data_indices[:-1]
y_target_indices = x_data_indices[1:]
x_source_vector[:len(x_source_indices)] = x_source_indices
y_target_vector[:len(y_target_indices)] = y_target_indices
label_index = self.label_vocab[label]
return x_source_vector, y_target_vector, label_index
def transform(self, x_data_list, label_list):
"""Transform a dataset by vectorizing each datapoint
Args:
x_data_list (list(list(str))): a list of lists, each sublist contains string tokens
label_list (list(str,int)): a list of either strings or integers. the label can come
as strings or integers, but they are remapped with the label_vocab to a unique integer
Returns:
numpy.ndarray, np.ndarray, np.ndarray: [
shape=(dataset_size, max_seq_length),
shape=(dataset_size, max_seq_length),
shape=(dataset_size,)
]
the first two outputs are x_data in two vectorized forms:
the first is all of the source vectors (x_data[:-1])
the second is all of the target vectors (x_data[1:])
the third output is the vector of labels mapped to the integers that represent them """
x_source_matrix = []
y_target_matrix = []
label_vector = []
for x_data, label in zip(x_data_list, label_list):
x_source_vector, y_target_vector, label_index = self.vectorize(x_data, label)
x_source_matrix.append(x_source_vector)
y_target_matrix.append(y_target_vector)
label_vector.append(label_index)
return np.stack(x_source_matrix), np.stack(y_target_matrix), np.stack(label_vector)
@classmethod
def from_df(cls, df, target_x_column, label_column, token_count_cutoff=0):
"""Instantiate the TextSequenceVectorizer from a standardized dataframe
Standardized DataFrame has a special meaning:
there is a column that has been tokenized into a list of strings
Args:
df (pd.DataFrame): the dataset with a tokenized text column and a label column
target_x_column (str): the name of the tokenized text column
label_column (str): the name of the label column
token_count_cutoff (int): [default=0] the minimum token frequency to add to the
token_vocab. Any tokens that are less frequent will not be added.
Returns:
TextSequenceVectorizer: the instantiated vectorizer
"""
# get the x data (the observations)
target_x_list = df[target_x_column].tolist()
# max sequence length: wrapping adds start and end tokens (+2), but source/target each drop one, so +1
max_seq_length = max(map(len, target_x_list)) + 1
# populate token vocab
token_vocab = Vocabulary(use_unks=False,
use_mask=True,
use_start_end=True,
start_token=START_TOKEN,
end_token=END_TOKEN)
counts = count_tokens(target_x_list)
# sort counts in reverse order
for token, count in sorted(counts.items(), key=lambda x: x[1], reverse=True):
if count <= token_count_cutoff:
break
token_vocab.add(token)
token_vocab.freeze()
# populate label vocab
label_vocab = Vocabulary(use_unks=False, use_start_end=False, use_mask=False)
label_vocab.add_many(sorted(df[label_column].unique()))
label_vocab.freeze()
return cls(token_vocab, label_vocab, max_seq_length)
def save(self, filename):
"""Save the vectorizer using json to the file specified
Args:
filename (str): the output file
"""
vec_dict = {"token_vocab": self.token_vocab.get_serializable_contents(),
"label_vocab": self.label_vocab.get_serializable_contents(),
"max_seq_length": self.max_seq_length}
with open(filename, "w") as fp:
json.dump(vec_dict, fp)
@classmethod
def load(cls, filename):
"""Load the vectorizer from the json file it was saved to
Args:
filename (str): the file into which the vectorizer was saved.
Returns:
TextSequenceVectorizer: the instantiated vectorizer
"""
with open(filename, "r") as fp:
contents = json.load(fp)
contents["token_vocab"] = Vocabulary.deserialize_from_contents(contents["token_vocab"])
contents["token_vocab"].freeze()
contents["label_vocab"] = Vocabulary.deserialize_from_contents(contents["label_vocab"])
return cls(**contents)
```
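%% Cell type:markdown id: tags:
The key behavior of the vectorizer is the source/target shift. A small, hypothetical example (relying on the repo's `Vocabulary` class exactly as it is used above) shows the source vector starting with the start token and the target vector ending with the end token, with the padded positions of the target set to `IGNORE_INDEX_VALUE`:
%% Cell type:code id: tags:
``` python
# Illustrate the source/target shift on a tiny, made-up DataFrame.
toy_df = pd.DataFrame({'tokenized': [list('lee'), list('kim')],
                       'nationality': ['korean', 'korean']})
toy_vectorizer = TextSequenceVectorizer.from_df(toy_df,
                                                target_x_column='tokenized',
                                                label_column='nationality')
source, target, label_index = toy_vectorizer.vectorize(list('lee'), 'korean')
# source encodes "^lee" (zero-padded); target encodes "lee_" (padded with -100)
print(source, target, label_index)
```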
%% Cell type:markdown id: tags:
# Text Sequence Dataset
%% Cell type:code id: tags:
``` python
class TextSequenceDataset(Dataset):
"""
Attributes:
vectorizer (TextSequenceVectorizer): an instantiated vectorizer
active_split (str): the string name of the active split
# internal use
_split_df (dict): a mapping from split name to partitioned DataFrame
_vectorized (dict): a mapping from split to a (source matrix, target matrix, label vector) tuple
_active_df (pd.DataFrame): the DataFrame corresponding to the split
_active_source (np.ndarray): a matrix of the vectorized source text data
_active_target (np.ndarray): a matrix of the vectorized target text data
_active_labels (np.ndarray): a vector of the vectorized labels
"""
def __init__(self, df, vectorizer, target_x_column, label_column):
"""Initialize the TextSequenceDataset
Args:
df (pd.DataFrame): the dataset with a text and label column
vectorizer (TextSequenceVectorizer): an instantiated vectorizer
target_x_column (str): the column containing the tokenized text
label_column (str): the column containing the label
"""
self._split_df = {
'train': df[df.split=='train'],
'val': df[df.split=='val'],
'test': df[df.split=='test']
}
self._vectorized = {}
for split_name, split_df in self._split_df.items():
self._vectorized[split_name] = \
vectorizer.transform(x_data_list=split_df[target_x_column].tolist(),
label_list=split_df[label_column].str.lower().tolist())
self.vectorizer = vectorizer
self.active_split = None
self._active_df = None
self._active_source = None
self._active_target = None
self._active_labels = None
self.set_split("train")
def set_split(self, split_name):
"""Set the active split
Args:
split_name (str): the name of the split to make active; should
be one of 'train', 'val', or 'test'
"""
self.active_split = split_name
self._active_source, self._active_target, self._active_labels = self._vectorized[split_name]
self._active_df = self._split_df[split_name]
def __getitem__(self, index):
"""Return the data point corresponding to the index
Args:
index (int): an int between 0 and len(self._active_source)
Returns:
dict: the data for this data point. Has the following form:
{"x_source": the vectorized source text data point,
"y_target": the vectorized target text data point,
"label_indices": the index of the label for this data point,
"x_lengths": method: the number of nonzeros in the vector,
"data_index": the provided index for bookkeeping}
"""
return {
"x_source": self._active_source[index],
"y_target": self._active_target[index],
"label_indices": self._active_labels[index],
"x_lengths": len(self._active_source[index].nonzero()[0]),
"data_index": index
}
def __len__(self):
"""The length of the active dataset
Returns:
int: the number of data points in the active split
"""
return self._active_source.shape[0]
```
%% Cell type:markdown id: tags:
### Dataset Loading Function
%% Cell type:code id: tags:
``` python
def character_tokenizer(input_string):
"""Tokenized a string a list of its characters
Args:
input_string (str): the character string to tokenize
Returns:
list: a list of characters
"""
return list(input_string)
def load_surname_dataset(dataset_csv, tokenizer_func, saved_vectorizer_file=None):
"""Load the surname dataset
Args:
dataset_csv (str): the location of the dataset
tokenizer_func (function): the tokenizing function to turn each datapoint into
its tokenized form
saved_vectorizer_file (str or None): [default=None] if not None, load the vectorizer
from the file
Returns:
TextSequenceDataset: the instantiated dataset
"""
df = add_splits(pd.read_csv(dataset_csv),
target_y_column='nationality')
df['tokenized'] = df.surname.apply(tokenizer_func)
df['nationality'] = df.nationality.str.lower()
if saved_vectorizer_file is not None:
vectorizer = TextSequenceVectorizer.load(saved_vectorizer_file)
else:
vectorizer = TextSequenceVectorizer.from_df(df,
target_x_column='tokenized',
label_column='nationality',
token_count_cutoff=0)
dataset = TextSequenceDataset(df=df,
vectorizer=vectorizer,
target_x_column='tokenized',
label_column='nationality')
return dataset
```
%% Cell type:markdown id: tags:
### Verify it loads
%% Cell type:code id: tags:
``` python
import pandas as pd
```
%% Cell type:code id: tags:
``` python
pd.read_csv("../data/surnames.csv").nationality.value_counts()
```
%% Output
russian 9408
english 3668
arabic 2000
japanese 991
german 724
italian 709
czech 519
spanish 298
dutch 297
french 277
chinese 268
irish 232
greek 203
polish 139
scottish 100
korean 94
portuguese 74
vietnamese 73
Name: nationality, dtype: int64
%% Cell type:code id: tags:
``` python
dataset = load_surname_dataset("../data/surnames.csv",
character_tokenizer)
#"../modelzoo/surnames.vectorizer")
dataset[0]
```
%% Output
{'x_source': array([ 1, 31, 9, 4, 15, 8, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0]),
'y_target': array([ 31, 9, 4, 15, 8, 6, 2, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100]),
'label_indices': 0,
'x_lengths': 7,
'data_index': 0}
%% Cell type:markdown id: tags:
# Model
%% Cell type:markdown id: tags:
### Model Utilities
%% Cell type:code id: tags:
``` python
def new_parameter(*size):
"""Initialize a new parameter
Args:
*size (int): pass in any number of ints to create a
parameter tensor of that size.
Returns:
nn.Parameter: a Tensor that has some extra bookkeeping
"""
out = torch.randn(*size, requires_grad=True, dtype=torch.float32)
torch.nn.init.xavier_normal_(out)
return nn.Parameter(out)
def column_gather(y_out, x_lengths):
'''Get a specific vector from each batch datapoint in `y_out`.
More precisely, iterate over batch row indices, get the vector that's at
the position indicated by the corresponding value in `x_lengths` at the row
index.
Args:
y_out (torch.FloatTensor, torch.cuda.FloatTensor)
shape: (batch, sequence, feature)
x_lengths (torch.LongTensor, torch.cuda.LongTensor)
shape: (batch,)
Returns:
y_out (torch.FloatTensor, torch.cuda.FloatTensor)
shape: (batch, feature)
'''
x_lengths = x_lengths.long().detach().cpu().numpy() - 1
out = []
for batch_index, column_index in enumerate(x_lengths):
out.append(y_out[batch_index, column_index])
return torch.stack(out)
def apply_across_sequence_looping(in_tensor, func, batch_first=True):
"""Apply a computational function over the sequence dimension in a tensor
Args:
in_tensor (torch.FloatTensor): [
shape=(batch_size, max_seq_size, feature_size)
or shape=(max_seq_size, batch_size, feature_size)
]
func (torch.nn.Module or function): either a PyTorch module (such as a Linear layer)
or a function; if a function, should output PyTorch tensors. `func`
is to be applied to the feature vectors in `in_tensor` at each time step.
batch_first (bool): [default=True] if True, dim=0 of `in_tensor` is the batch dimension
Returns:
torch.FloatTensor: the resultant from the `func` applied to each time step
The shape will be the same as the in_tensor except dim=2 will be whatever sized
vectors `func` outputs.
Notes:
This function applies `func` through the use of looping and is meant for explanation purposes.
For a faster runtime, use the reshape version.
"""
if batch_first:
in_tensor = in_tensor.permute(1, 0, 2)
out_tensor = []
for index in range(in_tensor.shape[0]):
out_i = func(in_tensor[index])
out_tensor.append(out_i)
out_tensor = torch.stack(out_tensor)
if batch_first:
out_tensor = out_tensor.permute(1, 0, 2)
return out_tensor
def apply_across_sequence_reshape(in_tensor, func):
"""Apply a computational function over the sequence dimension in a tensor
Args:
in_tensor (torch.FloatTensor): [
shape=(batch_size, max_seq_size, feature_size)
or shape=(max_seq_size, batch_size, feature_size)
]
func (torch.nn.Module or function): either a PyTorch module (such as a Linear layer)
or a function; if a function, should output PyTorch tensors. `func`
is to be applied to the feature vectors in `in_tensor` at each time step.
Returns:
torch.FloatTensor: the resultant from the `func` applied to each time step
The shape will be the same as the in_tensor except dim=2 will be whatever sized
vectors `func` outputs.
Notes:
This function applies `func` by reshaping `in_tensor` so that it appears to be a matrix,
then reshaping the resultant back to the expanded 3-dim shape.
"""
dim0, dim1, dim2 = in_tensor.size()
# reshape into a matrix so we can apply a linear layer
in_tensor = in_tensor.contiguous().view(dim0 * dim1, dim2)
# now that it's a matrix, can apply func
out_tensor = func(in_tensor)
# return the tensor reshaped
return out_tensor.view(dim0, dim1, -1)
```
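%% Cell type:markdown id: tags:
The looping and reshape helpers should produce the same result; here is a quick check on random tensors (the sizes are arbitrary):
%% Cell type:code id: tags:
``` python
# Verify that the looping and reshape versions agree on a random input.
fc = nn.Linear(in_features=4, out_features=3)
dummy = torch.randn(2, 5, 4)   # (batch, sequence, feature)
out_loop = apply_across_sequence_looping(dummy, fc, batch_first=True)
out_reshape = apply_across_sequence_reshape(dummy, fc)
print(out_loop.shape, torch.allclose(out_loop, out_reshape))
```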
%% Cell type:markdown id: tags:
### Model Definitions
%% Cell type:markdown id: tags:
#### Module: ExplicitRNN
%% Cell type:code id: tags:
``` python
class ExplicitRNN(nn.Module):
"""An explicit implementation of the RNN for teaching purposes
Attributes:
W_in2hid (nn.Parameter): the mapping from input features to hidden features
W_hid2hid (nn.Parameter): the mapping from hidden features to new hidden features
b_hid (nn.Parameter): the bias term for the hidden state computation
hidden_size (int): the size of the hidden state
batch_first (bool): [default=False] if True, the batch dimension
will be permuted to the dim=1 position and the sequence
dimension will take the dim=0 position for faster indexing
of the tensor.
"""
def __init__(self, input_size, hidden_size, batch_first=False):
"""Initialize the ExplicitRNN
Args:
input_size (int): the size of the input feature vectors
hidden_size (int): the size of the RNN's hidden state
batch_first (bool): [default=False] if True, the batch dimension
will be permuted to the dim=1 position and the sequence
dimension will take the dim=0 position for faster indexing
of the tensor.
"""
super(ExplicitRNN, self).__init__()
self.W_in2hid = new_parameter(input_size, hidden_size)
self.W_hid2hid = new_parameter(hidden_size, hidden_size)
self.b_hid = new_parameter(1, hidden_size)
self.hidden_size = hidden_size
self.batch_first = batch_first
def _compute_next_hidden(self, x, h):
"""The core update computation for the RNN
Args:
x (torch.FloatTensor): [shape=(batch_size, input_size)]
The input data for the current time step (t)
h (torch.FloatTensor): [shape=(batch_size, hidden_size)]
The hidden state for the previous time step (t-1)
Returns:
torch.FloatTensor: [shape=(batch_size, hidden_size)]
The hidden state for the current time step (t)
"""
return torch.tanh(x.matmul(self.W_in2hid) +
h.matmul(self.W_hid2hid) +
self.b_hid)
def forward(self, x_in, hid_t=None):
"""The forward computation of the RNN
Args:
x_in (torch.FloatTensor): [shape=(batch_size, seq_size, input_size)
or shape=(seq_size, batch_size, input_size)]
The input data with batch either on dim=0 or dim=1. Conceptually,
this is a batch of sequences of vectors. The default mode in PyTorch's
RNN, LSTM, and GRU implementations is to expect batch to be on dim=1
(as indicated by `batch_first`=False). This is usually done for speed or
simplicity in an algorithm.
hid_t (torch.FloatTensor): [default=None; shape=(batch_size, hidden_size)]
An optional hidden state; This can be used to condition or bias the RNN
to certain states. Some image captioning models use an image's feature vector
to bias an RNN to produce the captions.
"""
if self.batch_first:
batch_size, seq_size, feat_size = x_in.size()
x_in = x_in.permute(1, 0, 2)
else:
seq_size, batch_size, feat_size = x_in.size()
hiddens = []
if hid_t is None:
hid_t = torch.ones((batch_size, self.hidden_size))
if x_in.is_cuda:
hid_t = hid_t.cuda()
for t in range(seq_size):
x_t = x_in[t]
# assert x_t.shape == (batch_size, feat_size)
hid_t = self._compute_next_hidden(x_t, hid_t)
hiddens.append(hid_t)
hiddens = torch.stack(hiddens)
if self.batch_first:
hiddens = hiddens.permute(1, 0, 2)
return hiddens
```
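%% Cell type:markdown id: tags:
A quick shape check of the `ExplicitRNN` on random inputs, with and without a conditioning hidden state (the sizes are arbitrary):
%% Cell type:code id: tags:
``` python
# ExplicitRNN returns one hidden state per time step: (batch, sequence, hidden).
toy_rnn = ExplicitRNN(input_size=4, hidden_size=6, batch_first=True)
toy_input = torch.randn(3, 5, 4)     # (batch, sequence, feature)
toy_condition = torch.randn(3, 6)    # an optional conditioning hidden state
print(toy_rnn(toy_input).shape)                        # torch.Size([3, 5, 6])
print(toy_rnn(toy_input, hid_t=toy_condition).shape)   # torch.Size([3, 5, 6])
```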
%% Cell type:markdown id: tags:
#### ConditionalCharRNN
%% Cell type:code id: tags:
``` python
class ConditionalCharRNN(nn.Module):
"""The Character-level RNN for sequence-predictions that conditions on initial hidden states
Attributes:
emb (nn.Embedding): the embedding for tokens
conditional_emb (nn.Embedding): the embedding for the states to condition on
rnn (ExplicitRNN): the RNN implementation
fc (nn.Linear): a mapping from RNN outputs to the prediction vector
"""
def __init__(self, embedding_size, num_embeddings, num_conditioning_states,
num_classes, hidden_size):
"""Initialize the CharRNN
Args:
embedding_size (int): size of each embedding vector
num_embeddings (int): number of input characters
num_classes (int): number of characters to predict to
hidden_size (int): the intermediate representation size
"""
super(ConditionalCharRNN, self).__init__()
self.emb = nn.Embedding(embedding_dim=embedding_size,
num_embeddings=num_embeddings,
padding_idx=0)
self.conditional_emb = nn.Embedding(embedding_dim=hidden_size,
num_embeddings=num_conditioning_states)
self.rnn = ExplicitRNN(input_size=embedding_size,
hidden_size=hidden_size,
batch_first=True)
self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)
def forward(self, x_in, state_indices, apply_softmax=False):
"""The forward pass of the sequence model
Args:
x_in (torch.Tensor): [shape=(batch_size, max_seq_length)]
The input data tensor.
state_indices (torch.Tensor): [shape=(batch_size,)]
The conditioning state indices for each batch item
apply_softmax (bool): [default=False]
A flag for the softmax activation. This should be
false if used with the Cross Entropy losses. See note below.
Returns:
torch.FloatTensor: [shape=(batch_size, max_seq_length, num_classes)]
The vector for each data point in each sequence in the batch:
if `apply_softmax=False`, it is the pre-softmax prediction vector
else, it is the softmax'ed prediction vector
Note:
It is useful to not softmax the prediction vector because there is
a corresponding loss function optimized for it. In essence, the loss
function associated with optimizing probabilities of multinomials is called
Negative Log Likelihood (NLL). To apply NLL, you first apply the log function.
This function cancels out with the exponential function of the softmax
and so some simplification can occur to shortcut the extra computations.
"""
# x_in.shape == (batch_size, max_seq_length)
x_in = self.emb(x_in)
# x_in.shape == (batch_size, max_seq_length, embedding_size)
conditional_state_embedding = self.conditional_emb(state_indices)
y_out = self.rnn(x_in, hid_t=conditional_state_embedding)
# y_out.shape == (batch_size, max_seq_length, hidden_size)
# apply the linear layer to each vector in each sequence
y_out = apply_across_sequence_looping(y_out, self.fc)
# alternatively:
# y_out = apply_across_sequence_reshape(y_out, self.fc)
# y_out.shape == (batch_size, max_seq_length, num_classes)
# optionally apply the softmax to the last dim
if apply_softmax:
y_out = F.softmax(y_out, dim=2)
return y_out
```
%% Cell type:markdown id: tags:
### Prototyping
%% Cell type:code id: tags:
``` python
batch = next(iter(DataLoader(dataset, batch_size=8)))
model = ConditionalCharRNN(embedding_size=8,
num_embeddings=len(dataset.vectorizer.token_vocab),
num_classes=len(dataset.vectorizer.token_vocab),
num_conditioning_states=len(dataset.vectorizer.label_vocab),
hidden_size=8)
model(batch['x_source'], batch['label_indices']).shape
```
%% Output
torch.Size([8, 21, 90])
%% Cell type:markdown id: tags:
# Training
%% Cell type:markdown id: tags:
### Training Utilities
%% Cell type:code id: tags:
``` python
def compute_accuracy(y_pred, y_true, mask_index=IGNORE_INDEX_VALUE):
"""Compute the accuracy between a tensor of predictions and a matrix of label indices
Args:
y_pred (torch.FloatTensor): [shape=(batch_size, max_sequence_size, num_classes)]
The tensor of predictions, 1 prediction per batch item per sequence step
y_true (torch.LongTensor): [shape=(batch_size, max_sequence_size)]
The matrix of label indices, 1 index per batch item per sequence step
mask_index (int): [default=IGNORE_INDEX_VALUE=-100]
the mask index is used to identify the positions in y_true that
correspond to masked (padded) positions. A negative number is suggested so that
a label can have index 0.
Returns:
float: an accuracy computation that is sensitive to variable length sequences
"""
y_pred = y_pred.contiguous().view(-1, y_pred.shape[2])
# y_pred.shape == (batch * sequence_length, prediction_vector)
y_true = y_true.contiguous().view(-1)
# y_true.shape == (batch * sequence_length, )
y_pred_indices = y_pred.argmax(dim=1)
correct_indices = torch.eq(y_pred_indices, y_true).float()
valid_indices = torch.ne(y_true, mask_index).float()
n_correct = (correct_indices * valid_indices).sum().item()
n_valid = valid_indices.sum().item()
return n_correct / n_valid * 100
def loss_func(y_pred, y_true, mask_index=IGNORE_INDEX_VALUE):
"""Compute the cross entropy loss sequence-wide with variable length handling
Args:
y_pred (torch.FloatTensor): [shape=(batch_size, max_sequence_size, num_classes)]
The tensor of predictions, 1 prediction per batch item per sequence step
y_true (torch.LongTensor): [shape=(batch_size, max_sequence_size)]
The matrix of label indices, 1 index per batch item per sequence step
mask_index (int): [default=IGNORE_INDEX_VALUE=-100]
the mask index is used to identify the positions in y_true that
correspond to masked (padded) positions. A negative number is suggested so that
a label can have index 0.
Returns:
torch.FloatTensor: a scalar representing the loss across the sequence
"""
y_pred = y_pred.contiguous().view(-1, y_pred.shape[2])
# y_pred.shape == (batch * sequence_length, prediction_vector)
y_true = y_true.contiguous().view(-1)
# y_true.shape == (batch * sequence_length, )
return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)
def generate_batches(dataset, batch_size, shuffle=True,
drop_last=True, device="cpu", dataloader_kwargs=None):
"""Generate batches from a dataset
Args:
dataset (torch.utils.data.Dataset): the instantiated dataset
batch_size (int): the size of the batches
shuffle (bool): [default=True] batches are formed from shuffled indices
drop_last (bool): [default=True] don't return the final batch if it's smaller
than the specified batch size
device (str): [default="cpu"] the device to move the tensors to
dataloader_kwargs (dict or None): [default=None] Any additional arguments to the
DataLoader can be specified
Yields:
dict: a dictionary mapping from tensor name to tensor object where the first
dimension of tensor object is the batch dimension
Note:
This function is mostly an iterator for the DataLoader, but has the added
feature that it moves the tensors to a target device.
"""
dataloader_kwargs = dataloader_kwargs or {}
dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
shuffle=shuffle, drop_last=drop_last, **dataloader_kwargs)
for data_dict in dataloader:
out_data_dict = {}
for name, tensor in data_dict.items():
out_data_dict[name] = data_dict[name].to(device)
yield out_data_dict
class TrainState:
"""A data structure for managing training state operations.
The TrainState will monitor the validation loss, and every time a new best loss
(lower is better) is observed, a couple of things happen:
1. The model is checkpointed
2. Patience is reset
Attributes:
model (torch.nn.Module): the model being trained and will be
checkpointed during training.
dataset (SupervisedTextDataset, TextSequenceDataset): the dataset
which is being iterated over during training; must have the `active_split`
attribute.
log_dir (str): the directory to output the checkpointed model
patience (int): the number of epochs since a new best loss was observed
# Internal Use
_full_model_path (str): `log_dir/model_state_file`
_split (str): the active split
_best_loss (float): the best observed loss
"""
def __init__(self, model, dataset, log_dir, model_state_file="model.pth"):
"""Initialize the TrainState
Args:
model (torch.nn.Module): the model to be checkpointed during training
dataset (SupervisedTextDataset, TextSequenceDataset): the dataset
which is being iterated over during training; must have the `active_split`
attribute.
log_dir (str): the directory to output the checkpointed model
model_state_file (str): the name of the checkpoint model
"""
self.model = model
self.dataset = dataset
self._full_model_path = os.path.join(log_dir, model_state_file)
if not os.path.exists(log_dir):
os.makedirs(log_dir)
self.log_dir = log_dir
self._metrics_by_split = {
'train': {},
'val': {},
'test': {}
}
self._split = 'train'
self._best_loss = 10**10
self.patience = 0
def _init_metric(self, split, metric_name):
"""Initialize a metric to the specified split
A dictionary is created in `self._metrics_by_split` with
the keys 'running', 'count', and 'history'.
Args:
split (str): the target split to record the metric
metric_name (str): the name of the metric
"""
self._metrics_by_split[split][metric_name] = {
'running': 0.,
'count': 0,
'history': []
}
def _update_metric(self, metric_name, metric_value):
"""Update a metric with an observed value
Specifically, the running average is updated.
Args:
metric_name (str): the name of the metric
metric_value (float): the observed value of the metric
"""
if metric_name not in self._metrics_by_split[self._split]:
self._init_metric(self._split, metric_name)
metric = self._metrics_by_split[self._split][metric_name]
metric['count'] += 1
metric['running'] += (metric_value - metric['running']) / metric['count']
def set_split(self, split):
"""Set the dataset split
Args:
split (str): the target split to set
"""
self._split = split
def get_history(self, split, metric_name):
"""Get the history of values for any metric in any split
Args:
split (str): the target split
metric_name (str): the target metric
Returns:
list(float): the running average of each epoch for `metric_name` in `split`
"""
return self._metrics_by_split[split][metric_name]['history']
def get_value_of(self, split, metric_name):
"""Retrieve the running average of any metric in any split
Args:
split (str): the target split
metric_name (str): the target metric
Returns:
float: the running average for `metric_name` in `split`
"""
return self._metrics_by_split[split][metric_name]['running']
def log_metrics(self, **metrics):
"""Log some values for some metrics
Args:
metrics (kwargs): pass keyword args with the form `metric_name=metric_value`
to log the metric values into the attribute `_metrics_by_split`.
"""
self._split = self.dataset.active_split
for metric_name, metric_value in metrics.items():
self._update_metric(metric_name, metric_value)
def log_epoch_end(self):
"""Log the end of the epoch.
Some key functions happen at the end of the epoch:
- for each metric in each split running averages, counts,
and history are updated
- the model is checkpointed if a new best value is observed
- patience is incremented if a new best value is not observed
"""
for split_dict in self._metrics_by_split.values():
for metric_dict in split_dict.values():
metric_dict['history'].append(metric_dict['running'])
metric_dict['running'] = 0.0
metric_dict['count'] = 0
if 'loss' in self._metrics_by_split['val']:
val_loss = self._metrics_by_split['val']['loss']['history'][-1]
if val_loss < self._best_loss:
self._best_loss = val_loss
self.save_model()
self.patience = 0
else:
self.patience += 1
def save_model(self):
""" Save `model` to `log_dir/model_state_file` """
torch.save(self.model.state_dict(), self._full_model_path)
def reload_best(self):
""" reload `log_dir/model_state_file` to `model` """
if os.path.exists(self._full_model_path):
self.model.load_state_dict(torch.load(self._full_model_path))
```
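%% Cell type:markdown id: tags:
A tiny demonstration of how the masked loss and accuracy treat positions equal to `IGNORE_INDEX_VALUE` (the predictions below are random and purely illustrative):
%% Cell type:code id: tags:
``` python
# Positions marked with IGNORE_INDEX_VALUE contribute to neither loss nor accuracy.
toy_pred = torch.randn(2, 3, 5)   # (batch, sequence, num_classes)
toy_true = torch.tensor([[1, 4, IGNORE_INDEX_VALUE],
                         [2, IGNORE_INDEX_VALUE, IGNORE_INDEX_VALUE]])
print(loss_func(toy_pred, toy_true).item())
print(compute_accuracy(toy_pred, toy_true))
```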
%% Cell type:markdown id: tags:
### Args
%% Cell type:code id: tags:
``` python
args = Namespace(
# Dataset
surname_csv="../data/surnames.csv",
# Model hyper parameters
embedding_size=16,
hidden_size=64,
num_classes=-1,
num_conditioning_states=-1,
num_embeddings=-1,
# Training options
batch_size = 128,
cuda=False,
learning_rate=0.001,
num_epochs=100,
patience_threshold=3,
# model reload options
load_zoo_model=True,
zoo={
'model': '../modelzoo/charnn_emb16_hid64_surnames_conditionally_predict.pth',
'vectorizer': '../modelzoo/surnames.vectorizer',
'comments': 'pre-trained surname conditioned sequence prediction (& conditioned generation)',
'parameters': {
'embedding_size': 16,
'hidden_size': 64
}
}
)
# Check CUDA
if not torch.cuda.is_available():
args.cuda = False
print("Using CUDA: {}".format(args.cuda))
args.device = torch.device("cuda" if args.cuda else "cpu")
args.device
```
%% Output
Using CUDA: False
device(type='cpu')
%% Cell type:markdown id: tags:
### Instantiation
%% Cell type:code id: tags:
``` python
args.load_zoo_model = True
args.load_zoo_model = (
args.load_zoo_model
and os.path.exists(args.zoo['vectorizer'])
and os.path.exists(args.zoo['model'])
)
print(f"Loading zoo model: {args.load_zoo_model}")
if args.load_zoo_model:
dataset = load_surname_dataset(dataset_csv=args.surname_csv,
tokenizer_func=character_tokenizer,
saved_vectorizer_file=args.zoo['vectorizer'])
args.embedding_size = args.zoo['parameters']['embedding_size']
args.hidden_size = args.zoo['parameters']['hidden_size']
else:
dataset = load_surname_dataset(dataset_csv=args.surname_csv,
tokenizer_func=character_tokenizer)
args.num_embeddings = len(dataset.vectorizer.token_vocab)
args.num_classes = len(dataset.vectorizer.token_vocab)
args.num_conditioning_states = len(dataset.vectorizer.label_vocab)
model = ConditionalCharRNN(embedding_size=args.embedding_size,
hidden_size=args.hidden_size,
num_embeddings=args.num_embeddings,
num_classes=args.num_classes,
num_conditioning_states=args.num_conditioning_states)
if args.load_zoo_model:
model.load_state_dict(
torch.load(args.zoo['model'], map_location=lambda storage, loc: storage)
)
```
%% Output
Loading zoo model: True
%% Cell type:markdown id: tags:
### Training Routine
%% Cell type:code id: tags:
``` python
model = model.to(args.device)
train_state = TrainState(model, dataset=dataset, log_dir='./logs/conditional_charrnn_predict_surnames',
model_state_file='model.pth')
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
# loss_func is defined in the Training Utilities above:
# a sequence-wide cross entropy loss that ignores masked (padded) positions
# progress bars
epoch_bar = tqdm_notebook(desc='epochs', total=args.num_epochs, position=1)
dataset.set_split("train")
train_bar = tqdm_notebook(desc='training', total=len(dataset)//args.batch_size)
dataset.set_split("val")
val_bar = tqdm_notebook(desc='validation', total=len(dataset)//args.batch_size)
try:
for _ in range(args.num_epochs):
model.train()
dataset.set_split("train")
# TODO: deprecate in favor of single source of truth
train_state.set_split("train")
for batch in generate_batches(dataset, batch_size=args.batch_size, device=args.device):
# Step 1: clear the gradients
optimizer.zero_grad()
# Step 2: compute the outputs
y_prediction = model(batch['x_source'], batch['label_indices'])
# Step 3: compute the loss
loss = loss_func(y_prediction, batch['y_target'])
# Step 4: propagate the gradients
loss.backward()
# Step 5: update the model weights
optimizer.step()
# Auxiliary: logging
train_state.log_metrics(loss=loss.item(),
accuracy=compute_accuracy(y_prediction, batch['y_target']))
train_bar.set_postfix(loss=train_state.get_value_of(split="train", metric_name="loss"),
acc=train_state.get_value_of(split="train", metric_name="accuracy"))
train_bar.update()
# loop over the validation dataset
model.eval()
dataset.set_split("val")
train_state.set_split("val")
for batch in generate_batches(dataset, batch_size=args.batch_size, device=args.device):
# Step 1: compute the outputs
y_prediction = model(batch['x_source'], batch['label_indices'])
# Step 2: compute the loss
loss = loss_func(y_prediction, batch['y_target'])
# Auxiliary: logging
train_state.log_metrics(loss=loss.item(),
accuracy=compute_accuracy(y_prediction, batch['y_target']))
val_bar.set_postfix(loss=train_state.get_value_of(split="val", metric_name="loss"),
acc=train_state.get_value_of(split="val", metric_name="accuracy"))
val_bar.update()
epoch_bar.set_postfix(train_loss=train_state.get_value_of(split="train",
metric_name="loss"),
train_accuracy=train_state.get_value_of(split="train",
metric_name="accuracy"),
val_loss=train_state.get_value_of(split="val",
metric_name="loss"),
val_accuracy=train_state.get_value_of(split="val",
metric_name="accuracy"),
patience=train_state.patience)
epoch_bar.update()
train_state.log_epoch_end()
train_bar.n = 0
val_bar.n = 0
if train_state.patience > args.patience_threshold:
break
train_state.reload_best()
model.eval()
dataset.set_split("test")
test_bar = tqdm_notebook(desc='test', total=len(dataset)//args.batch_size)
for batch in generate_batches(dataset, batch_size=args.batch_size, device=args.device):
# Step 1: compute the outputs
y_prediction = model(batch['x_source'], batch['label_indices'])
# Step 2: compute the loss
loss = loss_func(y_prediction, batch['y_target'])
# Auxiliary: logging
train_state.log_metrics(loss=loss.item(),
accuracy=compute_accuracy(y_prediction, batch['y_target']))
test_bar.set_postfix(loss=train_state.get_value_of(split="test", metric_name="loss"),
acc=train_state.get_value_of(split="test", metric_name="accuracy"))
test_bar.update()
except KeyboardInterrupt:
print("...")
```
%% Output
%% Cell type:markdown id: tags:
# Using the Model to Sample
We can define a subclass that reuses the stored modules/parameters but overrides the forward function.
Then, we can use that forward function to generate outputs sampled from the model's predictions.
One method, `interpolated_sample`, has been left partially complete for you to finish.
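%% Cell type:markdown id: tags:
As background for the `temperature` argument used below, here is a small illustration of how dividing the logits by a temperature changes the peakiness of the resulting distribution (the logits are made up):
%% Cell type:code id: tags:
``` python
# Lower temperature -> peakier distribution; higher temperature -> flatter one.
logits = torch.tensor([[2.0, 1.0, 0.1]])
for temperature in (0.5, 1.0, 2.0):
    print(temperature, F.softmax(logits / temperature, dim=1).numpy().round(3))
```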
%% Cell type:code id: tags:
``` python
class SamplingCharRNN(ConditionalCharRNN):
def sample(self, initial_string, nationality, token_vocab, label_vocab, max_length=30, temperature=1):
"""Sample from the model in a human-interfacing way
Args:
initial_string (str): the initial letters of the surname to be generated. Passing an empty
string is acceptable.
nationality (str): one of the nationalities to condition on; must be in `label_vocab`
token_vocab (Vocabulary): the vocabulary that maps tokens to integers
label_vocab (Vocabulary): the vocabulary that maps nationality labels to integers
max_length (int): the maximum length to generate
temperature (float): modulate the peakiness of the probability distribution using
temperature; see the Boltzmann Distribution (https://en.wikipedia.org/wiki/Boltzmann_distribution)
Returns:
str: the generated sequence of characters (including the seeded `initial_string`)
"""
if len(initial_string) == 0 or initial_string[0] != token_vocab.start_token:
initial_string = token_vocab.start_token + initial_string
x_data = torch.LongTensor([token_vocab[token] for token in initial_string]).view(1, -1)
initial_hidden_indices = torch.LongTensor([label_vocab[nationality]]).view(-1)
hid_t = self.conditional_emb(initial_hidden_indices)
output_indices = self.forward(x_data, hid_t, max_length, temperature).view(-1).detach().numpy()
# remove start token
output_indices = output_indices[1:]
output = ""
for index in output_indices:
if index == token_vocab.end_index:
break
output += token_vocab.lookup(index)
return output
def interpolated_sample(self, initial_string, nationality1, nationality2, interpolation_weight,
token_vocab, label_vocab, max_length=30, temperature=1):
"""Sample from a state conditioned on 2 interpolated nationalities
Args:
initial_string (str): the initial letters of the surname to be generated. Passing an empty
string is acceptable.
nationality1 (str): one of the nationalities to condition on; must be in `label_vocab`
nationality2 (str): one of the nationalities to condition on; must be in `label_vocab`
interpolation_weight (float): [0 < interpolation_weight < 1]
The amount to interpolate between the two nationalities;
`interpolation_weight * nationality1 + (1 - interpolation_weight) * nationality2`
token_vocab (Vocabulary): the vocabulary that maps tokens to integers
label_vocab (Vocabulary): the vocabulary that maps nationality labels to integers
max_length (int): the maximum length to generate
temperature (float): modulate the peakiness of the probability distribution using
temperature; see the Boltzmann Distribution (https://en.wikipedia.org/wiki/Boltzmann_distribution)
Returns:
str: the generated sequence of characters (including the seeded `initial_string`)
"""
if len(initial_string) == 0 or initial_string[0] != token_vocab.start_token:
initial_string = token_vocab.start_token + initial_string
x_data = torch.LongTensor([token_vocab[token] for token in initial_string]).view(1, -1)
###
# COMPUTE THE INTERPOLATED NATIONALITY HERE
###
output_indices = self.forward(x_data, hid_t, max_length, temperature).view(-1).detach().numpy()
# remove start token
output_indices = output_indices[1:]
output = ""
for index in output_indices:
if index == token_vocab.end_index:
break
output += token_vocab.lookup(index)
return output
def forward(self, x_source, hid_t, max_length, temperature=1):
"""Compute a new forward pass which samples from the representations
Args:
x_source (torch.LongTensor): The indices to seed the sampling
hid_t (torch.FloatTensor): the conditioning hidden state (e.g., a nationality embedding)
max_length (int): the number of samples to compute
temperature (float): modulate the peakiness of the probability distribution using
temperature; see the Boltzmann Distribution (https://en.wikipedia.org/wiki/Boltzmann_distribution)
"""
# get to the hidden state needed to generate by applying the rnn to the input
# note: this could have also been done by applying the rnn to x_source
x_source = x_source.permute(1, 0)
for t in range(x_source.shape[0]):
indices_t = x_source[t]
x_embedding_t = self.emb(indices_t)
hid_t = self.rnn._compute_next_hidden(x_embedding_t, hid_t)
# compute the initial indices
prediction_t = F.softmax(self.fc(hid_t), dim=1)
indices_t = torch.multinomial(prediction_t, num_samples=1).view(-1)
# start caching the indices
generated_indices = [indices_t]
for t in range(max_length - x_source.shape[0]):
# embed the indices
x_embedded_t = self.emb(indices_t)
# compute the next hidden
hid_t = self.rnn._compute_next_hidden(x_embedded_t, hid_t)
# compute the probability distribution by passing the hidden through the linear layer
# and applying the softmax function
prediction_t = F.softmax(self.fc(hid_t) / temperature, dim=1)
# use torch.multinomial to sample from the probability distribution
indices_t = torch.multinomial(prediction_t, num_samples=1).view(-1)
# cache the resulting indices
generated_indices.append(indices_t)
# concatenate the passed in tensor with the generated one
return torch.cat([x_source.permute(1, 0), torch.stack(generated_indices, dim=1)], dim=1)
```
%% Cell type:code id: tags:
``` python
model = SamplingCharRNN(embedding_size=args.embedding_size,
hidden_size=args.hidden_size,
num_embeddings=args.num_embeddings,
num_classes=args.num_classes,
num_conditioning_states=args.num_conditioning_states)
train_state.model = model
train_state.reload_best()
```
%% Cell type:code id: tags:
``` python
print(f"Options: {', '.join(dataset.vectorizer.label_vocab.keys())}")
```
%% Output
Options: arabic, russian, czech, japanese, english, german, portuguese, italian, greek, french, dutch, irish, chinese, spanish, korean, scottish, vietnamese, polish
%% Cell type:code id: tags:
``` python
model.sample(initial_string="",
nationality="german",
token_vocab=dataset.vectorizer.token_vocab,
label_vocab=dataset.vectorizer.label_vocab,
max_length=20,
temperature=0.7)
```
%% Output
'Borferen'
%% Cell type:code id: tags:
``` python
for nationality in sorted(dataset.vectorizer.label_vocab.keys()):
generated_surname = model.sample(initial_string="",
nationality=nationality,
token_vocab=dataset.vectorizer.token_vocab,
label_vocab=dataset.vectorizer.label_vocab,
max_length=20,
temperature=0.9)
print(f"{nationality} -> {generated_surname}")
```
%% Output
arabic -> Hana
chinese -> Chan
czech -> Hovimo
dutch -> Asteis
english -> Hergharag
french -> Lessigv
german -> Kachley
greek -> Kontari
irish -> O'Plad
italian -> Racpur
japanese -> Towada
korean -> Shek
polish -> Hodze
portuguese -> Carto
russian -> Mihevets
scottish -> Witt
spanish -> Vortili
vietnamese -> Hen