This example demonstrates how to implement an autoregressive language model using a miniature version of the GPT model. The model consists of a single Transformer block with causal masking in its attention layer. We use the text from the IMDB sentiment classification dataset for training and generate new movie reviews for a given prompt. When using this script with your own dataset, make sure it has at least 1 million words.
import numpy as np
import os
import re
import string
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
def causal_attention_mask(n_dest, n_src, device=None):
    """
    Build a boolean causal mask for self attention.
    True entries (the upper triangle) mark future positions that a token is
    not allowed to attend to, which is the convention expected by the
    `attn_mask` argument of nn.MultiheadAttention. Allowed positions form the
    lower triangle, counting from the lower-right corner, so information
    cannot flow from future tokens to the current token.
    """
    i = torch.arange(n_dest, device=device)[:, None]
    j = torch.arange(n_src, device=device)
    return j - n_src + n_dest > i
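As a quick sanity check (an illustrative snippet, not part of the original script), the mask for a length-4 sequence is True strictly above the diagonal, so each position can only attend to itself and earlier positions:
# Illustrative check of the mask pattern for a length-4 sequence.
print(causal_attention_mask(4, 4))
# tensor([[False,  True,  True,  True],
#         [False, False,  True,  True],
#         [False, False, False,  True],
#         [False, False, False, False]])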
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        # batch_first=True so the block accepts inputs of shape (batch, seq, embed)
        self.att = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )
        self.layernorm1 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.dropout1 = nn.Dropout(rate)
        self.dropout2 = nn.Dropout(rate)
    def forward(self, x):
        seq_len = x.size(1)
        # Boolean mask (True above the diagonal) blocks attention to future tokens
        causal_mask = causal_attention_mask(seq_len, seq_len, device=x.device)
        attention_output, _ = self.att(x, x, x, attn_mask=causal_mask, need_weights=False)
        attention_output = self.dropout1(attention_output)
        out1 = self.layernorm1(x + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)
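For orientation, a hypothetical shape check (the block sizes below are chosen to match the hyperparameters defined further down, not taken from the original script) shows that the block maps a (batch, seq, embed) tensor to a tensor of the same shape:
# Hypothetical shape check: the block preserves the (batch, seq, embed) shape.
block = TransformerBlock(embed_dim=256, num_heads=2, ff_dim=256)
dummy = torch.randn(4, 80, 256)
print(block(dummy).shape)  # torch.Size([4, 80, 256])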
class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(maxlen, embed_dim)
    def forward(self, x):
        seq_len = x.size(1)
        # Position indices 0..seq_len-1, created on the same device as the token ids
        positions = torch.arange(0, seq_len, dtype=torch.long, device=x.device)
        positions = positions.unsqueeze(0).expand_as(x)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
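A brief, hypothetical usage example (the concrete sizes are assumptions matching the hyperparameters below): integer token ids of shape (batch, seq) come out as summed token-plus-position embeddings of shape (batch, seq, embed_dim).
# Hypothetical example: (batch, seq) token ids -> (batch, seq, embed_dim) embeddings.
emb = TokenAndPositionEmbedding(maxlen=80, vocab_size=20000, embed_dim=256)
ids = torch.randint(0, 20000, (4, 80))
print(emb(ids).shape)  # torch.Size([4, 80, 256])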
vocab_size = 20000 # Only consider the top 20k words
maxlen = 80 # Max sequence size
embed_dim = 256 # Embedding size for each token
num_heads = 2 # Number of attention heads
feed_forward_dim = 256 # Hidden layer size in feed forward network inside transformer
class Transformer(nn.Module):
    def __init__(self, maxlen, vocab_size, embed_dim, num_heads, feed_forward_dim):
        super(Transformer, self).__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, feed_forward_dim)
        self.dense = nn.Linear(embed_dim, vocab_size)
    def forward(self, x):
        x = self.embedding_layer(x)
        x = self.transformer_block(x)
        x = self.dense(x)
        return x
model = Transformer(maxlen, vocab_size, embed_dim, num_heads, feed_forward_dim)