Recurrent translations

Recurrent translations#

This notebook considers the solution of the translation task using recurrent layers.

from tqdm import tqdm
from datasets import load_dataset

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

Data#

As an example, the English->Russian data set tatoeda is considered. Tatoeda - collections of sentences and their translations. In particular, it’s hugging face implementation was used. The following cell loads data, transforms it into a more convenient format and displays some sentences from the dataset.

dataset = load_dataset(
    "tatoeba",
    lang1="en",
    lang2="ru",
    trust_remote_code=True
)
dataset = [
    (translation["en"], translation["ru"])
    for translation in dataset["train"]["translation"]
]
dataset[200:205]

The next cell prepares the following:

Service tokens.
Vocabulary.
Tokenization transformation.
Mappings between tokens and indices (both directions).

# Special tokens
PAD_TOKEN = "<PAD>"
# End of sentence
EOS_TOKEN = "<EOS>"
# Start of sentence
SOS_TOKEN = "<SOS>"
# Unknown
UNK_TOKEN = "<UNK>"


def tokenize(sentence: str) -> list[str]:
    '''Tokinelattion of the given stirng  by words'''
    return sentence.lower().split()

EN_VOCAB = {PAD_TOKEN, EOS_TOKEN, SOS_TOKEN, UNK_TOKEN}
RU_VOCAB = {PAD_TOKEN, EOS_TOKEN, SOS_TOKEN, UNK_TOKEN}
for en, ru in dataset:
    EN_VOCAB.update(tokenize(en))
    RU_VOCAB.update(tokenize(ru))

def create_mappings(vocab: set) -> tuple[dict[str, int], dict[int, str]]:
    '''
    Create mappings.
    '''
    word2int = {word: i for i, word in enumerate(vocab)}
    int2word = {i: word for word, i in word2int.items()}
    return word2int, int2word

EN_WORD2INT, EN_INT2WORD = create_mappings(EN_VOCAB)
RU_WORD2INT, RU_INT2WORD = create_mappings(RU_VOCAB)

Now, torch.utils.data.Dataset is set up to iterate over sentence pairs, already transformed into tensors containing token indices.

def tensor_tokenize(
    sentence: str,
    word2int: dict[str, int],
    device: torch.device = DEVICE
) -> torch.Tensor:
    '''Transform sentence into tensor with indeces of tokens.'''
    return torch.tensor(
        [
            word2int.get(word, word2int[UNK_TOKEN])
            for word in tokenize(sentence)
        ]
        + [word2int[EOS_TOKEN]],
        dtype=torch.long,
        device=device
    )


class TranslationDataset(torch.utils.data.Dataset):
    '''A data set iterating over input->translation pairs.'''
    def __init__(
        self,
        pairs: list[tuple[str, str]],
        en_word2int: dict[str, int],
        ru_word2int: dict[str, int]
    ):
        self.pairs = pairs
        self.en_word2int = en_word2int
        self.ru_word2int = ru_word2int

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        eng, rus = self.pairs[idx]
        eng_tensor = tensor_tokenize(sentence=eng, word2int=self.en_word2int)
        rus_tensor = tensor_tokenize(sentence=rus, word2int=self.ru_word2int)
        return eng_tensor, rus_tensor

translation_dataset = TranslationDataset(
    pairs=dataset,
    en_word2int=EN_WORD2INT,
    ru_word2int=RU_WORD2INT
)

Here is an example of using the created dataset - it simply returns a pair of tensors containing the indices of English and Russian tokens, respectively.

next(iter(translation_dataset))

(tensor([12845, 47486, 33172, 36617, 54900, 13416, 17676, 15429, 27838, 33949,
         35532, 23035, 39765, 51507,  9368], device='cuda:0'),
 tensor([ 58057,   7555,  41970, 109240,  52815,  38096,  93771, 115141,  73088,
          44994,  40218,  11276], device='cuda:0'))

And finally, a dataloader constructs minibatches of pairs and pads shorter sentences so that data across a set of sentences can be represented as a single tensor.

def collate_fn(batch: list[torch.Tensor, torch.Tensor]):
    '''
    Transforms a list of tokinized sentences into the torch tensor. Should be
    used as `collate_fn` argument of the dataloader. The main purpose is to
    pad all sentences to have the same length.
    '''
    eng_batch, rus_batch = zip(*batch)
    eng_batch_padded = pad_sequence(
        eng_batch, batch_first=True, padding_value=EN_WORD2INT[PAD_TOKEN]
    )
    rus_batch_padded = pad_sequence(
        rus_batch, batch_first=True, padding_value=RU_WORD2INT[PAD_TOKEN]
    )
    return eng_batch_padded, rus_batch_padded

batch_size = 64
translation_dataloader = torch.utils.data.DataLoader(
    translation_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=True,
    collate_fn=collate_fn,
)
batch = next(iter(translation_dataloader))
(batch[0][:2], batch[1][:2])

(tensor([[12845, 47486, 33172, 36617, 54900, 13416, 17676, 15429, 27838, 33949,
          35532, 23035, 39765, 51507,  9368],
         [40990, 30170, 54430,  9368, 50966, 50966, 50966, 50966, 50966, 50966,
          50966, 50966, 50966, 50966, 50966]], device='cuda:0'),
 tensor([[ 58057,   7555,  41970, 109240,  52815,  38096,  93771, 115141,  73088,
           44994,  40218,  11276, 129051, 129051, 129051, 129051],
         [ 82115, 128804,  49943,  11276, 129051, 129051, 129051, 129051, 129051,
          129051, 129051, 129051, 129051, 129051, 129051, 129051]],
        device='cuda:0'))

Model#

This section discusses the model architecture, with the key components being the encoder and the decoder. The central idea is that the encoder transforms the input text into a vector representation. This vector serves as the initial hidden state for the recurrent decoder, which generates the translation by processing the already translated part of the sentence as its input sequence.

Encoder#

The encoder extracts the last hidden state from each layer. Since there are both forward and backward directional layers, the total number of states corresponds to twice the value of the num_layers parameter. All these outputs are then concatenated into a single vector for each sample.

class Encoder(nn.Module):

    def __init__(
        self,
        vocab_size: int,
        embed_size: int,
        hidden_size: int,
        num_layers: int = 1
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(
            input_size=embed_size,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True,
            num_layers=num_layers
        )

    def forward(self, x: torch.Tensor):
        '''
        Parameters
        ----------
        x: torch.Tensor
            With dimentionality (samples_number, sequence_length).
            Original language senteces transformed to indeces.

        Returns
        -------
            ouputs: torch.Tensor
                (samples_number, sequence_len, hidden_size)
                All states.
            hidden: torch.Tensor
                (samples_number, hidden_size)
                Final state.
        '''
        embedded = self.embedding(x)
        outputs, hidden = self.rnn(embedded)
        # Concatenating results of all layers and all directions to one long
        # sequence
        hidden = torch.cat(
            [hidden[i, :, :] for i in range(len(hidden))], dim=1
        ).unsqueeze(0)
        return outputs, hidden

Here is the result of applying the recurrent layer to the two transformed sentences.

encoder = Encoder(
    vocab_size=1000,
    embed_size=50,
    hidden_size=10,
    num_layers=5
)
encoder(torch.tensor([[0,1,2,3,4], [3,2,5,7,1]], dtype=torch.long))[1].shape

torch.Size([1, 2, 100])

Note: An extra dimension is added to the output of the decoder just to be used as the initial hidden state of a layer RNN in `decoder’.

Decoder#

The decoder is designed to take the text already generated in the previous steps and the hidden state from the previous steps. Note: In the first step, the decoder is supposed to use the start-of-sentence token as the already translated part and the encoder’s hidden state, which contains information about the text to be translated.

The following cell implements decoder:

class Decoder(nn.Module):
    def __init__(
        self: int,
        vocab_size: int,
        embed_size: int,
        hidden_size: int
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(
            input_size=embed_size,
            hidden_size=hidden_size,
            batch_first=True
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x: torch.Tensor, hidden: torch.Tensor):
        '''
        Parameters
        ----------
        x: torch.Tensor
            (batch_size, sequence_length)
            Each sentence transformed to the indices of tokens.
        hidden: torch.Tensor
            (1, batch_size, hidden_size)
            Hidden state at the previous step.
        '''
        out = self.embedding(x)
        out, hidden = self.rnn(out, hidden)
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden

The following cell shows how Decoder can be applied to the two 3-token sentences with a 3-dimensional hidden vector.

decoder = Decoder(
    vocab_size=len(RU_VOCAB),
    embed_size=7,
    hidden_size=10
)

decoder(
    torch.tensor([[1, 2, 3], [3, 2, 2]], dtype=torch.long),
    torch.randn(1, 2, 10)
)

(tensor([[-0.3121, -0.0889,  0.3479,  ...,  0.4920,  0.5068,  0.7837],
         [-1.1442,  0.4230, -0.1281,  ..., -0.2087,  0.1295,  0.3339]],
        grad_fn=<ViewBackward0>),
 tensor([[[-0.7554, -0.1857,  0.8682,  0.2983,  0.3260,  0.7916,  0.3769,
           -0.6647, -0.7988, -0.7135],
          [-0.1838,  0.1077,  0.2179, -0.1817, -0.3542,  0.0289, -0.5906,
           -0.4652, -0.3920, -0.1775]]], grad_fn=<StackBackward0>))

Composition#

Have a closer look at how it all works together.

We’ll look at how it works using the first batch of the data loader we created earlier.

x = next(iter(translation_dataloader))
print("EN shape", x[0].shape)
print("RU shape", x[1].shape)

EN shape torch.Size([64, 15])
RU shape torch.Size([64, 16])

The following cell applies encoder to the input batch.

embed_size = 10
hidden_size = 10
num_layers = 4

encoder = Encoder(
    vocab_size=len(EN_VOCAB),
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_layers=num_layers
).to(DEVICE)
hidden = encoder(x[0])[1]
hidden.shape

torch.Size([1, 64, 80])

As a result, we obtain a vector for each observation in the batch. This vector’s dimensionality is determined by the following logic: the hidden state size of each recurrent layer is multiplied by two because each layer processes the sequence in both forward and reverse directions (bidirectional=True).

decoder = Decoder(
    vocab_size=len(RU_VOCAB),
    embed_size=10,
    hidden_size=hidden_size*num_layers*2
).to(DEVICE)
decoder(x[1], hidden)[0].shape

torch.Size([64, 2145632])

Inference#

Consider how the final model can be used in production. The following function represents how the given encoder and decoder can be applied to the arbitrary set.

def translate(
    encoder: Encoder,
    decoder: Decoder,
    sentence: str,
    en_word2int: dict[str: int],
    ru_int2word: dict[int: str],
    ru_word2int: dict[str: int],
    max_length: int = 15,
    device: torch.device = DEVICE
) -> str:
    """
    Apply model to a given sentence.
    """
    encoder.eval()
    decoder.eval()

    with torch.inference_mode():
        input_tensor = tensor_tokenize(sentence=sentence, word2int=en_word2int)
        input_tensor = input_tensor.view(1, -1).to(device)

        # Pass input sentence through encoder
        _, encoder_hidden = encoder(input_tensor)
        # Intialise hidden state of decoder
        decoder_hidden = encoder_hidden

        decoded_words = []
        last_word = torch.tensor([[en_word2int[SOS_TOKEN]]]).to(device)
        for _ in range(max_length):
            # Pass last predicted token through decoder
            logits, decoder_hidden = decoder(last_word, decoder_hidden)
            # Selecting the most probable token
            next_token = logits.argmax(dim=1)
            last_word = next_token.unsqueeze(0).to(device)
            if next_token.item() == ru_word2int[EOS_TOKEN]:
                break
            else:
                decoded_words.append(ru_int2word.get(next_token.item()))

    # return predicted words as a string
    return " ".join(decoded_words)

Here is an example of use on an unfitted encoder/decoder combination:

translate(
    encoder=encoder,
    decoder=decoder,
    sentence="hello world",
    en_word2int=EN_WORD2INT,
    ru_int2word=RU_INT2WORD,
    ru_word2int=RU_WORD2INT,
    max_length=20
)

'оранжевые? проиллюстрировать, сбивать засужу! победители! якоже задул ванну "...они виктории, серии. окликнул сельского вечеринка. якоже волнуетесь. сплошным окиньте остановите. терминах'

Fitting#

Consider fitting procedure for such model.

The following cell sets up a parameter of the models that’ll be used for fitting.

encoder_hidden_size = 32
num_layers = 5

encoder = Encoder(
    vocab_size=len(EN_VOCAB),
    embed_size=256,
    hidden_size=encoder_hidden_size,
    num_layers=num_layers
).to(DEVICE)

decoder = Decoder(
    vocab_size=len(RU_VOCAB),
    embed_size=256,
    hidden_size=encoder_hidden_size*num_layers*2
).to(DEVICE)

Following code implements fitting procedure:

# CrossEntropy loss shouldn't consider padding during optimilation
loss_fn = nn.CrossEntropyLoss(ignore_index=EN_WORD2INT[PAD_TOKEN])

encoder_optimizer = optim.AdamW(encoder.parameters())
decoder_optimizer = optim.AdamW(decoder.parameters())

num_epochs = 1

encoder.train()
decoder.train()

for epoch in range(num_epochs):
    iterator = tqdm(
        enumerate(translation_dataloader),
        total=len(translation_dataloader)
    )
    for i, (input_tensor, target_tensor) in iterator:
        input_tensor, target_tensor = (
            input_tensor.to(DEVICE),
            target_tensor.to(DEVICE)
        )

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        target_length = target_tensor.size(1)

        _, encoder_hidden = encoder(input_tensor)

        decoder_input = torch.full(
            (batch_size, 1),
            EN_WORD2INT[SOS_TOKEN],
            dtype=torch.long
        ).to(DEVICE)
        decoder_hidden = encoder_hidden

        loss = torch.tensor(0.0, device=DEVICE, requires_grad=True)
        for di in range(target_length):
            logits, decoder_hidden = decoder(decoder_input, decoder_hidden)
            loss = loss + loss_fn(logits, target_tensor[:, di])
            decoder_input = target_tensor[:, di].reshape(batch_size, 1)

        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

        if i % 100 == 0:
            print(
                f"Epoch {epoch}, ",
                f"Batch {i}, ",
                f"Loss: {loss.item() / target_length:.4f}"
            )

Result#

Consider results of the gotten models.

def simple_tranlation(input: str) -> str:
    return translate(
        encoder=encoder,
        decoder=decoder,
        sentence=input,
        en_word2int=EN_WORD2INT,
        ru_int2word=RU_INT2WORD,
        ru_word2int=RU_WORD2INT,
        max_length=20
    )

Few examples and their results:

simple_tranlation("Hello world")

'у неё не было.'

simple_tranlation("What is this?")

'что такое любовь?'

simple_tranlation("How task can be solved in other way?")

'как долго он узнал о смерти брата?'

Well, the translation is really far from correct, but at least it looks like sentences.