feature/attention block rewrite #9

Open
wants to merge 17 commits into base: feature/gpt-example

Changes from 6 commits
242 changes: 130 additions & 112 deletions examples/gpt.py
@@ -29,32 +29,38 @@
# 2. Dataclasses for Model and Hyperparameters
@dataclass
class Hyperparameters:
batch_size: int
block_size: int
max_iters: int
eval_interval: int
learning_rate: float
device: str
eval_iters: int
num_embeds: int
num_heads: int
num_layers: int
dropout: float


@dataclass
class ModelArgs:
seq_len: int
d_model: int
n_heads: int
vocab_size: int
num_layers: int
esp: float
# Model architecture parameters
vocab_size: int # Size of the vocabulary
d_model: int # Dimensionality of the token embeddings (equivalent to num_embeds)
num_layers: int # Number of transformer blocks
num_heads: int # Number of attention heads in each transformer block
d_ff: int # Dimensionality of the feed-forward layer within each transformer block
dropout_rate: float # Dropout rate applied to several components within the transformer blocks (equivalent to dropout)
max_position_embeddings: int # Maximum sequence length that this model might ever be used with (could align with block_size)
eps: float # Epsilon used for layer normalization modules

# Training-specific parameters
batch_size: int # Number of sequences per training batch
block_size: int # Length of the sequence to be processed (could align with max_position_embeddings)
max_iters: int # Maximum number of training iterations
eval_interval: int # Interval (in iterations) at which to evaluate the model
learning_rate: float # Learning rate for the optimizer
device: str # Training device ('cpu' or 'cuda')
eval_iters: int # Number of iterations to perform during evaluation

# Additional training hyperparameters (suggested)
num_epochs: int = 1 # Total number of training epochs (default to 1 for flexibility)
warmup_steps: int = 0 # Number of warmup steps for learning rate scheduling (default to 0)
gradient_accumulation_steps: int = (
1 # Number of steps to accumulate gradients before performing a backward/update pass (default to 1)
)
max_grad_norm: float = 1.0 # Maximum gradient norm (for gradient clipping, default to 1.0)
save_interval: int = 1000 # Interval (in steps) at which to save model checkpoints (default to 1000)
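
For orientation, a hypothetical instantiation of the consolidated dataclass above might look like the following; the values are placeholders chosen for illustration and are not taken from this PR.

# placeholder values, for illustration only
hparams = Hyperparameters(
    vocab_size=1000,
    d_model=128,
    num_layers=2,
    num_heads=4,
    d_ff=512,
    dropout_rate=0.2,
    max_position_embeddings=128,
    eps=1e-5,
    batch_size=64,
    block_size=128,
    max_iters=5000,
    eval_interval=500,
    learning_rate=3e-4,
    device="cpu",
    eval_iters=200,
)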


# 3. Helper Functions
@Tensor.no_grad()
def estimate_loss(model, train_data, val_data, hyperparameters):
def estimate_loss(model, train_data, val_data, hparams: Hyperparameters):
"""
Estimates the loss of the model over a number of iterations.

@@ -79,8 +85,8 @@ def estimate_loss(model, train_data, val_data, hyperparameters):

for split in ["train", "val"]:
losses = []
for k in range(hyperparameters.eval_iters):
data, targets = get_batch(split, train_data, val_data, hyperparameters)
for k in range(hparams.eval_iters):
data, targets = get_batch(split, train_data, val_data, hparams)
logits = model(data)

batch_size, time_step, channels = logits.shape
@@ -96,7 +102,7 @@ def estimate_loss(model, train_data, val_data, hyperparameters):
return out


def get_batch(split, train_data, val_data, hyperparameters):
def get_batch(split, train_data, val_data, hparams: Hyperparameters):
"""
Generates a batch of data for training or validation.

@@ -119,20 +125,20 @@ def get_batch(split, train_data, val_data, hyperparameters):
len_data = len(data)

# randomly select starting indices for the sequences
idx = np.random.randint(0, len_data - hyperparameters.block_size, hyperparameters.batch_size)
idx = np.random.randint(0, len_data - hparams.block_size, hparams.batch_size)

# create input (x) and target (y) sequences based on block_size
# target (y) sequence is offset by one (common practice in language modeling)
x = Tensor.stack([data[i : i + hyperparameters.block_size] for i in idx])
y = Tensor.stack([data[i + 1 : i + hyperparameters.block_size + 1] for i in idx])
x = Tensor.stack([data[i : i + hparams.block_size] for i in idx])
y = Tensor.stack([data[i + 1 : i + hparams.block_size + 1] for i in idx])

# move the tensor to the specified device
x, y = x.to(hyperparameters.device), y.to(hyperparameters.device)
x, y = x.to(hparams.device), y.to(hparams.device)
return x, y
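
As a rough sketch of the batching logic above, written in plain NumPy rather than punytorch's Tensor and purely for illustration: the target batch is the input batch shifted one position to the right.

import numpy as np

data = np.arange(20)            # toy token stream: 0, 1, 2, ..., 19
block_size, batch_size = 5, 2

# random starting positions, leaving room for a full block plus the shifted target
idx = np.random.randint(0, len(data) - block_size, batch_size)

x = np.stack([data[i : i + block_size] for i in idx])          # inputs
y = np.stack([data[i + 1 : i + block_size + 1] for i in idx])  # targets, offset by one
# e.g. for i == 3: the x row is [3, 4, 5, 6, 7] and the y row is [4, 5, 6, 7, 8]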


@Tensor.no_grad()
def generate(model, idx, max_new_tokens, hyperparameters):
def generate(model, idx, max_new_tokens, hparams: Hyperparameters):
"""
Generates new tokens using the trained model.

@@ -149,9 +155,9 @@ def generate(model, idx, max_new_tokens, hyperparameters):
Returns:
Tensor: A tensor containing the indices of the generated tokens.
"""
idx = Tensor.zeros((1, hyperparameters.block_size)).to(hyperparameters.device).long()
idx = Tensor.zeros((1, hparams.block_size)).to(hparams.device).long()
for i in range(max_new_tokens):
idx_cond = idx[:, -hyperparameters.block_size :]
idx_cond = idx[:, -hparams.block_size :]
logits = model(idx_cond)
logits = logits[:, -1, :] # only take the last token, since we're predicting the "next" token

@@ -164,91 +170,108 @@ def generate(model, idx, max_new_tokens, hyperparameters):

# return the model to training mode
model.train()
return idx[:, hyperparameters.block_size :]
return idx[:, hparams.block_size :]
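
The loop above follows the usual autoregressive pattern: crop the running context to the last block_size tokens, take the logits for the final position, sample the next token, and append it. A stripped-down sketch of that control flow, with the model and sampling stubbed out for illustration:

def generate_sketch(next_token_fn, block_size, max_new_tokens):
    # start from a context of zeros, mirroring the zero-initialized idx above
    idx = [0] * block_size
    for _ in range(max_new_tokens):
        context = idx[-block_size:]          # keep only the last block_size tokens
        idx.append(next_token_fn(context))   # "predict" and append the next token
    return idx[block_size:]                  # drop the zero-padding prefix

# e.g. with a dummy predictor that always returns token 7:
print(generate_sketch(lambda ctx: 7, block_size=4, max_new_tokens=3))  # [7, 7, 7]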


# 4. Model Components (MHA, MLP, RMSNorm, Block, GPT)
class MHA(nn.Module):
def __init__(self, model_args: ModelArgs) -> None:
"""
Initializes the Multi-Head Attention module.
# 4. Model Components (single-head and multi-head Attention, MLP, RMSNorm, Block, GPT)
class Head(nn.Module):
"""
A single attention head.

Args:
model_args (ModelArgs): The arguments for the model, including dimensions and sequence length.
"""
This class implements a single attention head, which is a key component of the transformer architecture.
It computes the attention scores, applies a mask, and performs the attention operation.

Args:
hparams (Hyperparameters): The hyperparameters of the model.
"""

def __init__(self, hparams: Hyperparameters):
super().__init__()
self.key = nn.Linear(model_args.d_model, model_args.d_model)
self.query = nn.Linear(model_args.d_model, model_args.d_model)
self.value = nn.Linear(model_args.d_model, model_args.d_model)
self.proj = nn.Linear(model_args.d_model, model_args.d_model)
self.head_dim = model_args.d_model // model_args.n_heads

self.n_heads = model_args.n_heads
mask = np.tril(np.ones((model_args.seq_len, model_args.seq_len)))
mask = np.triu(mask, k=1) * -np.inf
# Repeat the mask for each head
mask = np.repeat(mask[np.newaxis, np.newaxis, :, :], self.n_heads, axis=1)
self.register_buffer("mask", Tensor(mask).float())
self.key = nn.Linear(hparams.num_embeds, hparams.head_size)
self.query = nn.Linear(hparams.num_embeds, hparams.head_size)
self.value = nn.Linear(hparams.num_embeds, hparams.head_size)
self.register_buffer("tril", Tensor(np.tril(np.ones((hparams.block_size, hparams.block_size)))))
self.dropout = nn.Dropout(hparams.dropout)  # placeholder; dropout still needs to be fully implemented

def forward(self, x: Tensor) -> Tensor:
def forward(self, x):
"""
Defines the computation performed at every call.
Computes the forward pass of the attention head.

Args:
x (Tensor): The input data.
x (Tensor): The input tensor of shape (batch_size, sequence_length, num_embeds).

Returns:
Tensor: The output of the Multi-Head Attention layer.
Tensor: The output tensor after applying attention, of shape (batch_size, sequence_length, head_size).
"""
if not isinstance(x, Tensor):
raise TypeError(f"Expected x to be a Tensor, but got {type(x).__name__}")
batch_size, sequence_length, channels = x.shape

batch_size, time_step, channels = x.shape
key = self.key(x)
query = self.query(x)
value = self.value(x)
key = key.reshape(batch_size, time_step, self.n_heads, channels // self.n_heads).transpose(1, 2)
query = query.reshape(batch_size, time_step, self.n_heads, channels // self.n_heads).transpose(1, 2)
value = value.reshape(batch_size, time_step, self.n_heads, channels // self.n_heads).transpose(1, 2)
# Compute key, query, and value projections
k = self.key(x)
q = self.query(x)
v = self.value(x)

# Call the static attention method
x = MHA.attention(key, query, value, self.mask)
# Compute attention scores
attention_scores = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5

return x
# Apply mask to attention scores
masked_attention_scores = attention_scores.masked_fill(
self.tril[:sequence_length, :sequence_length] == 0, float("-inf")
)

@staticmethod
def attention(key, query, value, mask) -> Tensor:
# Compute attention probabilities
attention_probs = Softmax().forward(masked_attention_scores)
attention_probs = self.dropout(attention_probs)

# Compute the attended values
v = self.value(x)
out = attention_probs @ v

return out
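
At its core the head computes masked scaled dot-product attention, softmax(Q K^T / sqrt(head_size)) applied to V with a causal (lower-triangular) mask. A minimal NumPy sketch of the same computation, independent of punytorch's Tensor API and shown only to make the shapes explicit:

import numpy as np

def single_head_attention(q, k, v):
    # q, k, v: (batch, seq_len, head_size)
    seq_len, head_size = q.shape[1], q.shape[2]
    scores = q @ k.transpose(0, 2, 1) * head_size ** -0.5        # (batch, seq_len, seq_len)
    causal = np.tril(np.ones((seq_len, seq_len)))                 # 1s on and below the diagonal
    scores = np.where(causal == 0, -np.inf, scores)               # block attention to future tokens
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))   # numerically stable softmax
    probs = probs / probs.sum(axis=-1, keepdims=True)
    return probs @ v                                              # (batch, seq_len, head_size)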


class MultiHeadAttention(nn.Module):
"""
Multi-head attention module.

This module applies multiple attention heads in parallel and concatenates their outputs.
The concatenated output is then projected to the original embedding dimension.

Args:
hparams (Hyperparameters): The hyperparameters of the model.
"""

def __init__(self, hparams: Hyperparameters):
super().__init__()
self.heads = nn.ModuleList([Head(hparams) for _ in range(hparams.num_heads)])
self.proj = nn.Linear(hparams.num_embeds, hparams.num_embeds)
self.dropout = nn.Dropout(hparams.dropout)

def forward(self, x):
"""
Computes the attention scores.
Computes the forward pass of the multi-head attention module.

Args:
key (Tensor): The key vectors.
query (Tensor): The query vectors.
value (Tensor): The value vectors.
mask (Tensor): The mask tensor.
x (Tensor): The input tensor of shape (batch_size, sequence_length, num_embeds).

Returns:
Tensor: The output of the attention mechanism.
Tensor: The output tensor after applying multi-head attention, of shape (batch_size, sequence_length, num_embeds).
"""
logger.debug(
f"key shape: {key.shape}, query shape: {query.shape}, value shape: {value.shape}, mask shape: {mask.shape}"
)
batch_size, n_head, time_step, channels = key.shape
scaling_factor = Tensor(channels**-0.5)
attention_scores = (query @ key.transpose(-2, -1)) * scaling_factor
attention_scores = mask[:, :, :time_step, :time_step] + attention_scores
attention_scores = Softmax().forward(attention_scores, dim=-1)
logger.debug(f"value shape: {value.shape}, attention_scores shape: {attention_scores.shape}")
# Apply attention heads in parallel
head_outputs = [h(x) for h in self.heads]

value = value.reshape(batch_size, n_head, time_step, channels, 1)
attention_scores = attention_scores.reshape(batch_size, n_head, time_step, 1, time_step)
# Concatenate the outputs of all attention heads
concatenated = Tensor.cat(head_outputs, dim=-1)

matmul_result = value @ attention_scores
x = matmul_result.sum(axis=-1)
# Project the concatenated output back to the original embedding dimension
out = self.proj(concatenated)

if not isinstance(x, Tensor):
raise TypeError(f"Expected x to be a Tensor, but got {type(x).__name__}")
return x
# Apply dropout regularization
out = self.dropout(out)

return out
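
For the concatenation and projection above to line up, each head is typically sized so that num_heads * head_size == num_embeds; the names and numbers below are assumptions used only to illustrate the shape bookkeeping.

# hypothetical sizes, for illustration only
num_embeds, num_heads = 64, 4
head_size = num_embeds // num_heads   # 16 per head

# each head maps (batch, seq_len, num_embeds) -> (batch, seq_len, head_size);
# concatenating num_heads head outputs along the last axis restores num_embeds,
# so the final projection is square: num_embeds -> num_embeds
assert num_heads * head_size == num_embeds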


class MLP(nn.Module):
@@ -339,18 +362,18 @@ class Block(nn.Module):
layer normalization.
"""

def __init__(self, model_args: ModelArgs) -> None:
def __init__(self, hparams: Hyperparameters) -> None:
"""
Initializes the Block module.

Args:
model_args (ModelArgs): The arguments for the model, including dimensions and sequence length.
"""
super().__init__()
self.attn = MHA(model_args)
self.ffn = MLP(model_args.d_model, model_args.d_model)
self.l1 = RMSNorm(model_args.d_model, eps=model_args.esp)
self.l2 = RMSNorm(model_args.d_model, eps=model_args.esp)
self.attn = MultiHeadAttention(hparams)
self.ffn = MLP(hparams.d_model, hparams.d_model)
self.l1 = RMSNorm(hparams.d_model, eps=hparams.eps)
self.l2 = RMSNorm(hparams.d_model, eps=hparams.eps)

def forward(self, x):
"""
@@ -373,7 +396,7 @@ class GPT(nn.Module):
a list of transformer blocks, a normalization layer, and a projection layer.
"""

def __init__(self, model_args: ModelArgs, device: str):
def __init__(self, hparams: Hyperparameters) -> None:
"""
Initializes the GPT model.

@@ -382,12 +405,12 @@ def __init__(self, model_args: ModelArgs, device: str):
device (str): The device to run the model on ("cpu" or "gpu").
"""
super().__init__()
self.device = device
self.token_embedding = nn.Embedding(model_args.vocab_size, model_args.d_model)
self.position_embedding = nn.Embedding(model_args.seq_len, model_args.d_model)
self.layers = nn.ModuleList([Block(model_args) for _ in range(model_args.num_layers)])
self.norm = RMSNorm(model_args.d_model)
self.proj = nn.Linear(model_args.d_model, model_args.vocab_size)
self.device = hparams.device
self.token_embedding = nn.Embedding(hparams.vocab_size, hparams.d_model)
self.position_embedding = nn.Embedding(hparams.seq_len, hparams.d_model)
self.layers = nn.ModuleList([Block(hparams) for _ in range(hparams.num_layers)])
self.norm = RMSNorm(hparams.d_model)
self.proj = nn.Linear(hparams.d_model, hparams.vocab_size)

def forward(self, x: Tensor) -> Tensor:
"""
@@ -417,7 +440,7 @@ def forward(self, x: Tensor) -> Tensor:

# 5. Main Function
def main():
# hyperparameters and modelargs
# hyperparameters for the model and training run
hyperparameters = Hyperparameters(
batch_size=64,
block_size=128,
@@ -430,20 +453,15 @@ def main():
num_heads=4,
num_layers=2,
dropout=0.2,
)

# fmt: off
model_args = ModelArgs(
seq_len=1000,
d_model=16,
n_heads=2,
vocab_size=1000,
num_layers=2,
esp=1e-5,
)
# fmt: on

model = GPT(model_args, hyperparameters.device).to(hyperparameters.device)
model = GPT(hyperparameters).to(hyperparameters.device)
optimizer = Adam(model.parameters(), lr=hyperparameters.learning_rate)
tokenizer = CharTokenizer(filepath="datasets/input.txt")

1 change: 1 addition & 0 deletions punytorch/nn/__init__.py
@@ -1 +1,2 @@
from .modules import *
from .dropout import *