From c48d2c9d2a8e94f8e0cffcc1cb9d5f9caf2cae31 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 12:45:43 -0700 Subject: [PATCH 01/17] remove model_args and just use hyperparameters --- examples/gpt.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/examples/gpt.py b/examples/gpt.py index 5173a93..15354f0 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -42,16 +42,6 @@ class Hyperparameters: dropout: float -@dataclass -class ModelArgs: - seq_len: int - d_model: int - n_heads: int - vocab_size: int - num_layers: int - esp: float - - # 3. Helper Functions @Tensor.no_grad() def estimate_loss(model, train_data, val_data, hyperparameters): From 1c332bc01e9d3ad27ac1666470674e06b4314585 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 12:46:20 -0700 Subject: [PATCH 02/17] make sure the hyperparameters is an instance of the dataclass --- examples/gpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gpt.py b/examples/gpt.py index 15354f0..edefe34 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -122,7 +122,7 @@ def get_batch(split, train_data, val_data, hyperparameters): @Tensor.no_grad() -def generate(model, idx, max_new_tokens, hyperparameters): +def generate(model, idx, max_new_tokens, hyperparameters: Hyperparameters): """ Generates new tokens using the trained model. From d4add5c76c47394ad9ef2caf299b9794f885e5fd Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 12:48:10 -0700 Subject: [PATCH 03/17] splitting MHA into a single Head class and an MHA class that concatenates multiple Heads --- examples/gpt.py | 137 +++++++++++++++++++++++++++--------------------- 1 file changed, 77 insertions(+), 60 deletions(-) diff --git a/examples/gpt.py b/examples/gpt.py index edefe34..7358a2a 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -157,88 +157,105 @@ def generate(model, idx, max_new_tokens, hyperparameters: Hyperparameters): return idx[:, hyperparameters.block_size :] -# 4. Model Components (MHA, MLP, RMSNorm, Block, GPT) -class MHA(nn.Module): - def __init__(self, model_args: ModelArgs) -> None: - """ - Initializes the Multi-Head Attention module. +# 4. Model Components (Attention (Single head and MHA), MLP, RMSNorm, Block, GPT) +class Head(nn.Module): + """ + A single attention head. - Args: - model_args (ModelArgs): The arguments for the model, including dimensions and sequence length. - """ + This class implements a single attention head, which is a key component of the transformer architecture. + It computes the attention scores, applies a mask, and performs the attention operation. + + Args: + head_size (int): The size of the attention head. + hyperparameters (Hyperparameters): The hyperparameters of the model. 
+ """ + + def __init__(self, head_size, hyperparameters): super().__init__() - self.key = nn.Linear(model_args.d_model, model_args.d_model) - self.query = nn.Linear(model_args.d_model, model_args.d_model) - self.value = nn.Linear(model_args.d_model, model_args.d_model) - self.proj = nn.Linear(model_args.d_model, model_args.d_model) - self.head_dim = model_args.d_model // model_args.n_heads - - self.n_heads = model_args.n_heads - mask = np.tril(np.ones((model_args.seq_len, model_args.seq_len))) - mask = np.triu(mask, k=1) * -np.inf - # Repeat the mask for each head - mask = np.repeat(mask[np.newaxis, np.newaxis, :, :], self.n_heads, axis=1) - self.register_buffer("mask", Tensor(mask).float()) + self.key = nn.Linear(hyperparameters.num_embeds, head_size) + self.query = nn.Linear(hyperparameters.num_embeds, head_size) + self.value = nn.Linear(hyperparameters.num_embeds, head_size) + self.register_buffer("tril", Tensor(np.tril(np.ones((hyperparameters.block_size, hyperparameters.block_size))))) + self.dropout = nn.Dropout(hyperparameters.dropout) # this is placeholder, need to implement fully - def forward(self, x: Tensor) -> Tensor: + def forward(self, x): """ - Defines the computation performed at every call. + Computes the forward pass of the attention head. Args: - x (Tensor): The input data. + x (Tensor): The input tensor of shape (batch_size, sequence_length, num_embeds). Returns: - Tensor: The output of the Multi-Head Attention layer. + Tensor: The output tensor after applying attention, of shape (batch_size, sequence_length, head_size). """ - if not isinstance(x, Tensor): - raise TypeError(f"Expected x to be a Tensor, but got {type(x).__name__}") + batch_size, sequence_length, channels = x.shape - batch_size, time_step, channels = x.shape - key = self.key(x) - query = self.query(x) - value = self.value(x) - key = key.reshape(batch_size, time_step, self.n_heads, channels // self.n_heads).transpose(1, 2) - query = query.reshape(batch_size, time_step, self.n_heads, channels // self.n_heads).transpose(1, 2) - value = value.reshape(batch_size, time_step, self.n_heads, channels // self.n_heads).transpose(1, 2) + # Compute key, query, and value projections + k = self.key(x) + q = self.query(x) + v = self.value(x) - # Call the static attention method - x = MHA.attention(key, query, value, self.mask) + # Compute attention scores + attention_scores = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5 - return x + # Apply mask to attention scores + masked_attention_scores = attention_scores.masked_fill( + self.tril[:sequence_length, :sequence_length] == 0, float("-inf") + ) - @staticmethod - def attention(key, query, value, mask) -> Tensor: + # Compute attention probabilities + attention_probs = Softmax().forward(masked_attention_scores) + attention_probs = self.dropout(attention_probs) + + # Compute the attended values + v = self.value(x) + out = attention_probs @ v + + return out + + +class MultiHeadAttention(nn.Module): + """ + Multi-head attention module. + + This module applies multiple attention heads in parallel and concatenates their outputs. + The concatenated output is then projected to the original embedding dimension. + + Args: + num_heads (int): The number of attention heads. + head_size (int): The size of each attention head. + hyperparameters (Hyperparameters): The hyperparameters of the model. 
+ """ + + def __init__(self, num_heads, head_size, hyperparameters): + super().__init__() + self.heads = nn.ModuleList([Head(head_size, hyperparameters) for _ in range(num_heads)]) + self.proj = nn.Linear(hyperparameters.num_embeds, hyperparameters.num_embeds) + self.dropout = nn.Dropout(hyperparameters.dropout) + + def forward(self, x): """ - Computes the attention scores. + Computes the forward pass of the multi-head attention module. Args: - key (Tensor): The key vectors. - query (Tensor): The query vectors. - value (Tensor): The value vectors. - mask (Tensor): The mask tensor. + x (Tensor): The input tensor of shape (batch_size, sequence_length, num_embeds). Returns: - Tensor: The output of the attention mechanism. + Tensor: The output tensor after applying multi-head attention, of shape (batch_size, sequence_length, num_embeds). """ - logger.debug( - f"key shape: {key.shape}, query shape: {query.shape}, value shape: {value.shape}, mask shape: {mask.shape}" - ) - batch_size, n_head, time_step, channels = key.shape - scaling_factor = Tensor(channels**-0.5) - attention_scores = (query @ key.transpose(-2, -1)) * scaling_factor - attention_scores = mask[:, :, :time_step, :time_step] + attention_scores - attention_scores = Softmax().forward(attention_scores, dim=-1) - logger.debug(f"value shape: {value.shape}, attention_scores shape: {attention_scores.shape}") + # Apply attention heads in parallel + head_outputs = [h(x) for h in self.heads] - value = value.reshape(batch_size, n_head, time_step, channels, 1) - attention_scores = attention_scores.reshape(batch_size, n_head, time_step, 1, time_step) + # Concatenate the outputs of all attention heads + concatenated = Tensor.cat(head_outputs, dim=-1) - matmul_result = value @ attention_scores - x = matmul_result.sum(axis=-1) + # Project the concatenated output back to the original embedding dimension + out = self.proj(concatenated) - if not isinstance(x, Tensor): - raise TypeError(f"Expected x to be a Tensor, but got {type(x).__name__}") - return x + # Apply dropout regularization + out = self.dropout(out) + + return out class MLP(nn.Module): From 455f03c467cd437671f7902c045a179027662401 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 12:48:32 -0700 Subject: [PATCH 04/17] creating a dropout class that inherits from nn.Module --- punytorch/nn/__init__.py | 1 + punytorch/nn/dropout.py | 50 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 punytorch/nn/dropout.py diff --git a/punytorch/nn/__init__.py b/punytorch/nn/__init__.py index 270dceb..e882c22 100644 --- a/punytorch/nn/__init__.py +++ b/punytorch/nn/__init__.py @@ -1 +1,2 @@ from .modules import * +from .dropout import * diff --git a/punytorch/nn/dropout.py b/punytorch/nn/dropout.py new file mode 100644 index 0000000..64d7996 --- /dev/null +++ b/punytorch/nn/dropout.py @@ -0,0 +1,50 @@ +import numpy as np +import punytorch.nn as nn +from punytorch.tensor import Tensor + + +class Dropout(nn.Module): + def __init__(self, p: float = 0.5, seed: int = None): + """ + Initializes the Dropout layer. + + Args: + p (float or Tensor): The probability of an element to be zeroed. Default: 0.5 + Can be a float for a constant dropout rate, or a Tensor for element-wise rates. + seed (int, optional): The seed for the random number generator. If provided, + ensures reproducibility of dropout mask across runs. Default: None + + Raises: + TypeError: If `p` is not a float or a Tensor. 
+ """ + super().__init__() + self.p = p + self.seed = seed + if isinstance(p, float): + self.p = float(p) + elif isinstance(p, Tensor): + self.p = p.data + else: + raise TypeError(f"p must be a float or a Tensor, got {type(p)}") + + def forward(self, input: Tensor, train: bool = True) -> Tensor: + """ + Applies Dropout to the input Tensor during training. + + Args: + input (Tensor): Input tensor. + train (bool): If True, apply dropout. If False, return the input as is. + + Returns: + Tensor: Output tensor after applying dropout. + """ + if train: + # Generate a mask with the same shape as the input + # Elements are drawn from a Bernoulli distribution + self.mask = (np.random.rand(*input.shape) > self.p) / (1 - self.p) + return input * self.mask + else: + return input + + def __call__(self, input: Tensor, train: bool = True) -> Tensor: + return self.forward(input, train) From 9005bc2dc4d77ad64712ce9bfadb8e2691c15fba Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 12:48:41 -0700 Subject: [PATCH 05/17] basic dropout test --- tests/test_dropout.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tests/test_dropout.py diff --git a/tests/test_dropout.py b/tests/test_dropout.py new file mode 100644 index 0000000..b3cd6f5 --- /dev/null +++ b/tests/test_dropout.py @@ -0,0 +1,26 @@ +import numpy as np +from punytorch.nn.dropout import Dropout +from punytorch.tensor import Tensor + + +def test_dropout(): + np.random.seed(42) # For reproducibility + input_tensor = Tensor(np.array([1.0, 2.0, 3.0, 4.0])) + dropout_layer = Dropout(p=0.5) + + # Test during training + output_tensor_train = dropout_layer(input_tensor, train=True) + assert ( + output_tensor_train.shape == input_tensor.shape + ), "Output tensor shape should match input tensor shape during training." + + # Test during evaluation + output_tensor_eval = dropout_layer(input_tensor, train=False) + assert np.array_equal( + output_tensor_eval.data, input_tensor.data + ), "Output tensor should match input tensor during evaluation." + + # Check if dropout is applied (not a rigorous test due to randomness, but a basic check) + assert not np.array_equal( + output_tensor_train.data, input_tensor.data + ), "Output tensor should not match input tensor during training (with dropout applied)." From 29cca1ffde728f09759d91b46cc8d92d9fbfe9bc Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:14:18 -0700 Subject: [PATCH 06/17] i'm going to use one dataclass called Hyperparameters and just reference that everywhere --- examples/gpt.py | 115 ++++++++++++++++++++++++++---------------------- 1 file changed, 63 insertions(+), 52 deletions(-) diff --git a/examples/gpt.py b/examples/gpt.py index 7358a2a..e036787 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -29,22 +29,38 @@ # 2. 
Dataclasses for Model and Hyperparameters @dataclass class Hyperparameters: - batch_size: int - block_size: int - max_iters: int - eval_interval: int - learning_rate: float - device: str - eval_iters: int - num_embeds: int - num_heads: int - num_layers: int - dropout: float + # Model architecture parameters + vocab_size: int # Size of the vocabulary + d_model: int # Dimensionality of the token embeddings (equivalent to num_embeds) + num_layers: int # Number of transformer blocks + num_heads: int # Number of attention heads in each transformer block + d_ff: int # Dimensionality of the feed-forward layer within each transformer block + dropout_rate: float # Dropout rate applied to several components within the transformer blocks (equivalent to dropout) + max_position_embeddings: int # Maximum sequence length that this model might ever be used with (could align with block_size) + eps: float # Epsilon used for layer normalization modules + + # Training-specific parameters + batch_size: int # Number of sequences per training batch + block_size: int # Length of the sequence to be processed (could align with max_position_embeddings) + max_iters: int # Maximum number of training iterations + eval_interval: int # Interval (in iterations) at which to evaluate the model + learning_rate: float # Learning rate for the optimizer + device: str # Training device ('cpu' or 'cuda') + eval_iters: int # Number of iterations to perform during evaluation + + # Additional training hyperparameters (suggested) + num_epochs: int = 1 # Total number of training epochs (default to 1 for flexibility) + warmup_steps: int = 0 # Number of warmup steps for learning rate scheduling (default to 0) + gradient_accumulation_steps: int = ( + 1 # Number of steps to accumulate gradients before performing a backward/update pass (default to 1) + ) + max_grad_norm: float = 1.0 # Maximum gradient norm (for gradient clipping, default to 1.0) + save_interval: int = 1000 # Interval (in steps) at which to save model checkpoints (default to 1000) # 3. Helper Functions @Tensor.no_grad() -def estimate_loss(model, train_data, val_data, hyperparameters): +def estimate_loss(model, train_data, val_data, hparams: Hyperparameters): """ Estimates the loss of the model over a number of iterations. @@ -69,8 +85,8 @@ def estimate_loss(model, train_data, val_data, hyperparameters): for split in ["train", "val"]: losses = [] - for k in range(hyperparameters.eval_iters): - data, targets = get_batch(split, train_data, val_data, hyperparameters) + for k in range(hparams.eval_iters): + data, targets = get_batch(split, train_data, val_data, hparams) logits = model(data) batch_size, time_step, channels = logits.shape @@ -86,7 +102,7 @@ def estimate_loss(model, train_data, val_data, hyperparameters): return out -def get_batch(split, train_data, val_data, hyperparameters): +def get_batch(split, train_data, val_data, hparams: Hyperparameters): """ Generates a batch of data for training or validation. 
@@ -109,20 +125,20 @@ def get_batch(split, train_data, val_data, hyperparameters): len_data = len(data) # randomly select starting indices for the sequences - idx = np.random.randint(0, len_data - hyperparameters.block_size, hyperparameters.batch_size) + idx = np.random.randint(0, len_data - hparams.block_size, hparams.batch_size) # create input (x) and target (y) sequences based on block_size # target (y) sequence is offset by one (common practice in language modeling) - x = Tensor.stack([data[i : i + hyperparameters.block_size] for i in idx]) - y = Tensor.stack([data[i + 1 : i + hyperparameters.block_size + 1] for i in idx]) + x = Tensor.stack([data[i : i + hparams.block_size] for i in idx]) + y = Tensor.stack([data[i + 1 : i + hparams.block_size + 1] for i in idx]) # move the tensor to the specified device - x, y = x.to(hyperparameters.device), y.to(hyperparameters.device) + x, y = x.to(hparams.device), y.to(hparams.device) return x, y @Tensor.no_grad() -def generate(model, idx, max_new_tokens, hyperparameters: Hyperparameters): +def generate(model, idx, max_new_tokens, hparams: Hyperparameters): """ Generates new tokens using the trained model. @@ -139,9 +155,9 @@ def generate(model, idx, max_new_tokens, hyperparameters: Hyperparameters): Returns: Tensor: A tensor containing the indices of the generated tokens. """ - idx = Tensor.zeros((1, hyperparameters.block_size)).to(hyperparameters.device).long() + idx = Tensor.zeros((1, hparams.block_size)).to(hparams.device).long() for i in range(max_new_tokens): - idx_cond = idx[:, -hyperparameters.block_size :] + idx_cond = idx[:, -hparams.block_size :] logits = model(idx_cond) logits = logits[:, -1, :] # only take the last token, since we're predicting the "next" token @@ -154,7 +170,7 @@ def generate(model, idx, max_new_tokens, hyperparameters: Hyperparameters): # return the model to training mode model.train() - return idx[:, hyperparameters.block_size :] + return idx[:, hparams.block_size :] # 4. Model Components (Attention (Single head and MHA), MLP, RMSNorm, Block, GPT) @@ -170,13 +186,13 @@ class Head(nn.Module): hyperparameters (Hyperparameters): The hyperparameters of the model. """ - def __init__(self, head_size, hyperparameters): + def __init__(self, hparams: Hyperparameters): super().__init__() - self.key = nn.Linear(hyperparameters.num_embeds, head_size) - self.query = nn.Linear(hyperparameters.num_embeds, head_size) - self.value = nn.Linear(hyperparameters.num_embeds, head_size) - self.register_buffer("tril", Tensor(np.tril(np.ones((hyperparameters.block_size, hyperparameters.block_size))))) - self.dropout = nn.Dropout(hyperparameters.dropout) # this is placeholder, need to implement fully + self.key = nn.Linear(hparams.num_embeds, hparams.head_size) + self.query = nn.Linear(hparams.num_embeds, hparams.head_size) + self.value = nn.Linear(hparams.num_embeds, hparams.head_size) + self.register_buffer("tril", Tensor(np.tril(np.ones((hparams.block_size, hparams.block_size))))) + self.dropout = nn.Dropout(hparams.dropout) # this is placeholder, need to implement fully def forward(self, x): """ @@ -227,11 +243,11 @@ class MultiHeadAttention(nn.Module): hyperparameters (Hyperparameters): The hyperparameters of the model. 
""" - def __init__(self, num_heads, head_size, hyperparameters): + def __init__(self, hparams: Hyperparameters): super().__init__() - self.heads = nn.ModuleList([Head(head_size, hyperparameters) for _ in range(num_heads)]) - self.proj = nn.Linear(hyperparameters.num_embeds, hyperparameters.num_embeds) - self.dropout = nn.Dropout(hyperparameters.dropout) + self.heads = nn.ModuleList([Head(hparams) for _ in range(hparams.num_heads)]) + self.proj = nn.Linear(hparams.num_embeds, hparams.num_embeds) + self.dropout = nn.Dropout(hparams.dropout) def forward(self, x): """ @@ -346,7 +362,7 @@ class Block(nn.Module): layer normalization. """ - def __init__(self, model_args: ModelArgs) -> None: + def __init__(self, hparams: Hyperparameters) -> None: """ Initializes the Block module. @@ -354,10 +370,10 @@ def __init__(self, model_args: ModelArgs) -> None: model_args (ModelArgs): The arguments for the model, including dimensions and sequence length. """ super().__init__() - self.attn = MHA(model_args) - self.ffn = MLP(model_args.d_model, model_args.d_model) - self.l1 = RMSNorm(model_args.d_model, eps=model_args.esp) - self.l2 = RMSNorm(model_args.d_model, eps=model_args.esp) + self.attn = MultiHeadAttention(hparams) + self.ffn = MLP(hparams.d_model, hparams.d_model) + self.l1 = RMSNorm(hparams.d_model, eps=hparams.eps) + self.l2 = RMSNorm(hparams.d_model, eps=hparams.eps) def forward(self, x): """ @@ -380,7 +396,7 @@ class GPT(nn.Module): a list of transformer blocks, a normalization layer, and a projection layer. """ - def __init__(self, model_args: ModelArgs, device: str): + def __init__(self, hparams: Hyperparameters) -> None: """ Initializes the GPT model. @@ -389,12 +405,12 @@ def __init__(self, model_args: ModelArgs, device: str): device (str): The device to run the model on ("cpu" or "gpu"). """ super().__init__() - self.device = device - self.token_embedding = nn.Embedding(model_args.vocab_size, model_args.d_model) - self.position_embedding = nn.Embedding(model_args.seq_len, model_args.d_model) - self.layers = nn.ModuleList([Block(model_args) for _ in range(model_args.num_layers)]) - self.norm = RMSNorm(model_args.d_model) - self.proj = nn.Linear(model_args.d_model, model_args.vocab_size) + self.device = hparams.device + self.token_embedding = nn.Embedding(hparams.vocab_size, hparams.d_model) + self.position_embedding = nn.Embedding(hparams.seq_len, hparams.d_model) + self.layers = nn.ModuleList([Block(hparams) for _ in range(hparams.num_layers)]) + self.norm = RMSNorm(hparams.d_model) + self.proj = nn.Linear(hparams.d_model, hparams.vocab_size) def forward(self, x: Tensor) -> Tensor: """ @@ -424,7 +440,7 @@ def forward(self, x: Tensor) -> Tensor: # 5. 
Main Function def main(): - # hyperparameters and modelargs + # hyperparameters for the model and training run hyperparameters = Hyperparameters( batch_size=64, block_size=128, @@ -437,10 +453,6 @@ def main(): num_heads=4, num_layers=2, dropout=0.2, - ) - - # fmt: off - model_args = ModelArgs( seq_len=1000, d_model=16, n_heads=2, @@ -448,9 +460,8 @@ def main(): num_layers=2, esp=1e-5, ) - # fmt: on - model = GPT(model_args, hyperparameters.device).to(hyperparameters.device) + model = GPT(hyperparameters).to(hyperparameters.device) optimizer = Adam(model.parameters(), lr=hyperparameters.learning_rate) tokenizer = CharTokenizer(filepath="datasets/input.txt") From 08dc1ffd380f907f22ce07682b6b62171703e557 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:21:46 -0700 Subject: [PATCH 07/17] fixing some minor things per ellipsis --- examples/gpt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/gpt.py b/examples/gpt.py index e036787..fa2e4ae 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -408,7 +408,7 @@ def __init__(self, hparams: Hyperparameters) -> None: self.device = hparams.device self.token_embedding = nn.Embedding(hparams.vocab_size, hparams.d_model) self.position_embedding = nn.Embedding(hparams.seq_len, hparams.d_model) - self.layers = nn.ModuleList([Block(hparams) for _ in range(hparams.num_layers)]) + self.layers = nn.ModuleList([Block(hparams=hparams) for _ in range(hparams.num_layers)]) self.norm = RMSNorm(hparams.d_model) self.proj = nn.Linear(hparams.d_model, hparams.vocab_size) @@ -461,7 +461,7 @@ def main(): esp=1e-5, ) - model = GPT(hyperparameters).to(hyperparameters.device) + model = GPT(hparams=hyperparameters).to(hyperparameters.device) optimizer = Adam(model.parameters(), lr=hyperparameters.learning_rate) tokenizer = CharTokenizer(filepath="datasets/input.txt") From 5e2fbaea0b436e28ac48872fcb6a5afa4ec60840 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 17:08:18 -0700 Subject: [PATCH 08/17] trimming down that dataclass a bit --- examples/gpt.py | 38 +++++++++++--------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/examples/gpt.py b/examples/gpt.py index fa2e4ae..410f1b3 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -29,33 +29,17 @@ # 2. 
Dataclasses for Model and Hyperparameters @dataclass class Hyperparameters: - # Model architecture parameters - vocab_size: int # Size of the vocabulary - d_model: int # Dimensionality of the token embeddings (equivalent to num_embeds) - num_layers: int # Number of transformer blocks - num_heads: int # Number of attention heads in each transformer block - d_ff: int # Dimensionality of the feed-forward layer within each transformer block - dropout_rate: float # Dropout rate applied to several components within the transformer blocks (equivalent to dropout) - max_position_embeddings: int # Maximum sequence length that this model might ever be used with (could align with block_size) - eps: float # Epsilon used for layer normalization modules - - # Training-specific parameters - batch_size: int # Number of sequences per training batch - block_size: int # Length of the sequence to be processed (could align with max_position_embeddings) - max_iters: int # Maximum number of training iterations - eval_interval: int # Interval (in iterations) at which to evaluate the model - learning_rate: float # Learning rate for the optimizer - device: str # Training device ('cpu' or 'cuda') - eval_iters: int # Number of iterations to perform during evaluation - - # Additional training hyperparameters (suggested) - num_epochs: int = 1 # Total number of training epochs (default to 1 for flexibility) - warmup_steps: int = 0 # Number of warmup steps for learning rate scheduling (default to 0) - gradient_accumulation_steps: int = ( - 1 # Number of steps to accumulate gradients before performing a backward/update pass (default to 1) - ) - max_grad_norm: float = 1.0 # Maximum gradient norm (for gradient clipping, default to 1.0) - save_interval: int = 1000 # Interval (in steps) at which to save model checkpoints (default to 1000) + batch_size: int + block_size: int + max_iters: int + eval_interval: int + learning_rate: float + device: str + eval_iters: int + num_embeds: int + num_heads: int + num_layers: int + dropout: float # 3. Helper Functions From bbdb4d96321f2e2030004ed53a6363e000a8ea24 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 17:10:25 -0700 Subject: [PATCH 09/17] because we trimmed down, need to rename --- examples/gpt.py | 102 +++++++++++++++++++++++++++---------------- punytorch/helpers.py | 3 ++ 2 files changed, 68 insertions(+), 37 deletions(-) diff --git a/examples/gpt.py b/examples/gpt.py index 410f1b3..fcd8577 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -170,13 +170,13 @@ class Head(nn.Module): hyperparameters (Hyperparameters): The hyperparameters of the model. """ - def __init__(self, hparams: Hyperparameters): + def __init__(self, head_size, hparams: Hyperparameters): super().__init__() - self.key = nn.Linear(hparams.num_embeds, hparams.head_size) - self.query = nn.Linear(hparams.num_embeds, hparams.head_size) - self.value = nn.Linear(hparams.num_embeds, hparams.head_size) + self.key = nn.Linear(hparams.num_embeds, head_size) + self.query = nn.Linear(hparams.num_embeds, head_size) + self.value = nn.Linear(hparams.num_embeds, head_size) self.register_buffer("tril", Tensor(np.tril(np.ones((hparams.block_size, hparams.block_size))))) - self.dropout = nn.Dropout(hparams.dropout) # this is placeholder, need to implement fully + self.dropout = nn.Dropout(hparams.dropout) def forward(self, x): """ @@ -227,9 +227,9 @@ class MultiHeadAttention(nn.Module): hyperparameters (Hyperparameters): The hyperparameters of the model. 
""" - def __init__(self, hparams: Hyperparameters): + def __init__(self, head_size, hparams: Hyperparameters): super().__init__() - self.heads = nn.ModuleList([Head(hparams) for _ in range(hparams.num_heads)]) + self.heads = nn.ModuleList([Head(head_size, hparams) for _ in range(hparams.num_heads)]) self.proj = nn.Linear(hparams.num_embeds, hparams.num_embeds) self.dropout = nn.Dropout(hparams.dropout) @@ -354,10 +354,11 @@ def __init__(self, hparams: Hyperparameters) -> None: model_args (ModelArgs): The arguments for the model, including dimensions and sequence length. """ super().__init__() - self.attn = MultiHeadAttention(hparams) - self.ffn = MLP(hparams.d_model, hparams.d_model) - self.l1 = RMSNorm(hparams.d_model, eps=hparams.eps) - self.l2 = RMSNorm(hparams.d_model, eps=hparams.eps) + head_size = hparams.num_embeds // hparams.num_heads + self.attn = MultiHeadAttention(head_size, hparams) + self.ffn = MLP(in_features=hparams.num_embeds, out_features=hparams.num_embeds) + self.l1 = RMSNorm(hparams.num_embeds, eps=hparams.dropout) + self.l2 = RMSNorm(hparams.num_embeds, eps=hparams.dropout) def forward(self, x): """ @@ -380,7 +381,7 @@ class GPT(nn.Module): a list of transformer blocks, a normalization layer, and a projection layer. """ - def __init__(self, hparams: Hyperparameters) -> None: + def __init__(self, hparams: Hyperparameters, vocab_size) -> None: """ Initializes the GPT model. @@ -390,11 +391,19 @@ def __init__(self, hparams: Hyperparameters) -> None: """ super().__init__() self.device = hparams.device - self.token_embedding = nn.Embedding(hparams.vocab_size, hparams.d_model) - self.position_embedding = nn.Embedding(hparams.seq_len, hparams.d_model) + self.token_embedding = nn.Embedding(vocab_size, hparams.num_embeds) + self.position_embedding = nn.Embedding(hparams.block_size, hparams.num_embeds) self.layers = nn.ModuleList([Block(hparams=hparams) for _ in range(hparams.num_layers)]) - self.norm = RMSNorm(hparams.d_model) - self.proj = nn.Linear(hparams.d_model, hparams.vocab_size) + self.norm = RMSNorm(hparams.num_embeds) + self.proj = nn.Linear(hparams.num_embeds, vocab_size) + + self.apply(self._init_weights) + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() def forward(self, x: Tensor) -> Tensor: """ @@ -411,15 +420,38 @@ def forward(self, x: Tensor) -> Tensor: B, T = x.shape token_embedding = self.token_embedding(x) position_embedding = self.position_embedding(Tensor(np.arange(T)).to(self.device)) - x = token_embedding + position_embedding - - for layer in self.layers: - x = layer(x) - - x = self.norm(x) - logits = self.proj(x) + x = token_embedding + position_embedding # (B,T,C) + x = self.layers(x) # (B,T,C) + x = self.norm(x) # (B,T,C) + logits = self.lm_head(x) # (B,T,vocab_size) + + if targets is None: + loss = None + else: + B, T, C = logits.shape + logits = logits.view(B * T, C) + targets = targets.view(B * T) + loss = CrossEntropyLoss.forward(logits, targets) - return logits + return logits, loss + + def generate(self, idx, max_new_tokens, hparams: Hyperparameters): + # idx is (B, T) array of indices in the current context + for _ in range(max_new_tokens): + # crop idx to the last block_size tokens + idx_cond = idx[:, -hparams.block_size :] + # get the predictions + logits, loss = self(idx_cond) + # focus only on the last time step + logits = logits[:, -1, :] # becomes (B, C) + # 
apply softmax to get probabilities + probs = Softmax.forward(logits, dim=-1) # (B, C) + # sample from the distribution - don't have access to torch.multinomial, so we're making our own + idx_next = np.array([np.random.choice(len(probs[b]), 1, p=probs[b]) for b in range(len(probs))]) # (B, 1) + idx_next = Tensor(idx_next).to(probs.device) # Convert to Tensor and move to the same device as probs + # append sampled index to the running sequence + idx = Tensor.cat((idx, idx_next), dim=1) # (B, T+1) + return idx # 5. Main Function @@ -427,32 +459,28 @@ def main(): # hyperparameters for the model and training run hyperparameters = Hyperparameters( batch_size=64, - block_size=128, + block_size=256, max_iters=5000, eval_interval=500, learning_rate=3e-4, device="cpu", - eval_iters=100, - num_embeds=128 * 2, - num_heads=4, - num_layers=2, + eval_iters=200, + num_embeds=384, + num_heads=6, + num_layers=6, dropout=0.2, - seq_len=1000, - d_model=16, - n_heads=2, - vocab_size=1000, - num_layers=2, - esp=1e-5, ) - model = GPT(hparams=hyperparameters).to(hyperparameters.device) - optimizer = Adam(model.parameters(), lr=hyperparameters.learning_rate) tokenizer = CharTokenizer(filepath="datasets/input.txt") data = Tensor(tokenizer.encode(tokenizer.text)).long() n = int(0.95 * len(data)) train_data = data[:n] val_data = data[n:] + vocab_size = tokenizer.get_vocab_size() + + model = GPT(hparams=hyperparameters, vocab_size=vocab_size).to(hyperparameters.device) + optimizer = Adam(model.parameters(), lr=hyperparameters.learning_rate) # type checks before moving on if not all(isinstance(x, Tensor) for x in [data, train_data, val_data]): diff --git a/punytorch/helpers.py b/punytorch/helpers.py index a73c1e4..d777f5a 100644 --- a/punytorch/helpers.py +++ b/punytorch/helpers.py @@ -59,3 +59,6 @@ def encode(self, text): def decode(self, encoded_chars): return "".join([self.int_to_char[i] for i in encoded_chars]) + + def get_vocab_size(self): + return self.vocab_size From 811fc4f1f8ac97d595f66f83eb55900903b6d279 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 17:15:40 -0700 Subject: [PATCH 10/17] oops wrong name --- examples/gpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gpt.py b/examples/gpt.py index fcd8577..0e6a468 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -423,7 +423,7 @@ def forward(self, x: Tensor) -> Tensor: x = token_embedding + position_embedding # (B,T,C) x = self.layers(x) # (B,T,C) x = self.norm(x) # (B,T,C) - logits = self.lm_head(x) # (B,T,vocab_size) + logits = self.proj(x) # (B,T,vocab_size) if targets is None: loss = None From a50ccd0daefbfa0f15af5f08ac1e77085f9953f5 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 17:16:28 -0700 Subject: [PATCH 11/17] set target to None --- examples/gpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gpt.py b/examples/gpt.py index 0e6a468..d5b1ab8 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -405,7 +405,7 @@ def _init_weights(self, module): if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() - def forward(self, x: Tensor) -> Tensor: + def forward(self, x: Tensor, targets=None) -> Tensor: """ Defines the computation performed at every call. 
From 96d361185c9d0071f561889f69fee76d839f78ec Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:49:27 -0700 Subject: [PATCH 12/17] pre-commit changes --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fece537..3933897 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,4 +3,4 @@ repos: rev: 23.12.1 hooks: - id: black - args: ["."] \ No newline at end of file + args: [".", "--line-length", "120"] \ No newline at end of file From 9a3db129d6d2cd4675684ddd22191f0efe76c05c Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:49:43 -0700 Subject: [PATCH 13/17] adding the target argument to the GPT docstring --- examples/gpt.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/examples/gpt.py b/examples/gpt.py index d5b1ab8..0b959a2 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -411,6 +411,8 @@ def forward(self, x: Tensor, targets=None) -> Tensor: Args: x (Tensor): The input data. + targets (Tensor, optional): The target values. If provided, the method will compute and return the loss. + If not provided, the method will only return the logits. Defaults to None. Returns: Tensor: The output of the GPT model. @@ -436,6 +438,22 @@ def forward(self, x: Tensor, targets=None) -> Tensor: return logits, loss def generate(self, idx, max_new_tokens, hparams: Hyperparameters): + """ + Generates text based on the provided context. + + This function takes an initial context of indices and generates text by repeatedly predicting the next token + until the specified maximum number of new tokens is reached. The generation process involves sampling from the + probability distribution over the vocabulary for each new token. + + Args: + idx (Tensor): The initial context represented as a tensor of token indices with shape (B, T), where B is + the batch size and T is the sequence length of the context. + max_new_tokens (int): The maximum number of new tokens to generate. + hparams (Hyperparameters): The hyperparameters for the model, including block size. + + Returns: + Tensor: The generated indices including the initial context and the new tokens, with shape (B, T + max_new_tokens). + """ # idx is (B, T) array of indices in the current context for _ in range(max_new_tokens): # crop idx to the last block_size tokens From e766d25b0beabf6dda6867aa5f8415c84d4cd63d Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Thu, 11 Apr 2024 09:16:40 -0700 Subject: [PATCH 14/17] adds the masked_fill() method to the Tensor class --- punytorch/tensor.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/punytorch/tensor.py b/punytorch/tensor.py index a99a502..aff2047 100644 --- a/punytorch/tensor.py +++ b/punytorch/tensor.py @@ -109,6 +109,21 @@ def __lt__(self, other): else: return self.data < other + def masked_fill(self, mask, value): + """ + Fills elements of this tensor with `value` where `mask` is True. + + Args: + mask (Tensor): The boolean mask. + value (float): The value to fill in with. + + Returns: + Tensor: A new tensor with filled values. 
+ """ + mask = self.ensure_tensor(mask) + result_data = np.where(mask.data, value, self.data) + return Tensor(result_data, requires_grad=self.requires_grad) + @staticmethod def data_to_numpy(data): if isinstance(data, (int, float)): From 9b302d322758b83c5d504ed3d7f006e9653f2e4c Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Thu, 11 Apr 2024 10:37:05 -0700 Subject: [PATCH 15/17] changes to binary operations in both tensor.py and ops.py --- punytorch/ops.py | 173 +++++++++++++++----------------------------- punytorch/tensor.py | 101 ++++++++++++-------------- 2 files changed, 106 insertions(+), 168 deletions(-) diff --git a/punytorch/ops.py b/punytorch/ops.py index cb917ec..9c1edbc 100644 --- a/punytorch/ops.py +++ b/punytorch/ops.py @@ -16,28 +16,34 @@ def apply(self, *args): Returns: The result of applying the function. """ - return self.forward(*args) + return self.op(*args) class Operation: + def __call__(self, *args): + self.inputs = args + self.outputs = self.forward(*args) + return self.outputs + def forward(self, *args): raise NotImplementedError - def backward(self, context, grad): + def backward(self, grad): raise NotImplementedError -class Add(Operation): - @staticmethod - def forward(x, y): - from punytorch.tensor import Tensor +def ensure_numpy(x): + from punytorch.tensor import Tensor + + return x.data if isinstance(x, Tensor) else np.array(x) - x_data = x.data if isinstance(x, Tensor) else np.array(x) - y_data = y.data if isinstance(y, Tensor) else np.array(y) - return np.add(x_data, y_data) - @staticmethod - def backward(context, grad): +class Add(Operation): + def forward(self, x, y): + self.x, self.y = ensure_numpy(x), ensure_numpy(y) + return np.add(self.x, self.y) + + def backward(self, grad): # grad is assumed to be a NumPy array # The gradient of the sum is distributed equally to both operands # No need to change the shape of grad since addition is element-wise @@ -45,58 +51,35 @@ def backward(context, grad): class Sub(Operation): - @staticmethod - def forward(x, y): - from punytorch.tensor import Tensor - - # Ensure that x and y are NumPy arrays - x_data = x.data if isinstance(x, Tensor) else np.array(x) - y_data = y.data if isinstance(y, Tensor) else np.array(y) - # Use NumPy's subtraction - return np.subtract(x_data, y_data) - - @staticmethod - def backward(context, grad): + def forward(self, x, y): + self.x, self.y = ensure_numpy(x), ensure_numpy(y) + return np.subtract(self.x, self.y) + + def backward(self, grad): # The gradient with respect to the first operand is 1 # The gradient with respect to the second operand is -1 return grad, -grad class Mul(Operation): - @staticmethod - def forward(x, y): - from punytorch.tensor import Tensor - - # Ensure that x and y are NumPy arrays - x_data = x.data if isinstance(x, Tensor) else np.array(x) - y_data = y.data if isinstance(y, Tensor) else np.array(y) - # Use NumPy's multiplication - return np.multiply(x_data, y_data) - - @staticmethod - def backward(context, grad): - x, y = context.args + def forward(self, x, y): + self.x, self.y = ensure_numpy(x), ensure_numpy(y) + return np.multiply(self.x, self.y) + + def backward(self, grad): # The gradient with respect to x is y, and vice versa - return grad * y.data, grad * x.data + return grad * self.y, grad * self.x class TrueDiv(Operation): - @staticmethod - def forward(x, y): - from punytorch.tensor import Tensor - - # Ensure that x and y are NumPy arrays - x_data = x.data if isinstance(x, Tensor) else np.array(x) - y_data 
= y.data if isinstance(y, Tensor) else np.array(y) - # Use NumPy's true division - return np.divide(x_data, y_data) - - @staticmethod - def backward(context, grad): - x, y = context.args + def forward(self, x, y): + self.x, self.y = ensure_numpy(x), ensure_numpy(y) + return np.divide(self.x, self.y) + + def backward(self, grad): # The gradient with respect to x is 1/y # The gradient with respect to y is -x/y^2 - return grad.data / y.data, -x.data * grad.data / (y.data**2) + return grad / self.y, -self.x * grad / (self.y**2) class Mod(Operation): @@ -109,82 +92,46 @@ class Mod(Operation): for all use cases. """ - @staticmethod - def forward(x, y): - from punytorch.tensor import Tensor - - # Ensure that x and y are NumPy arrays - x_data = x.data if isinstance(x, Tensor) else np.array(x) - y_data = y.data if isinstance(y, Tensor) else np.array(y) - # Use NumPy's mod - return np.mod(x_data, y_data) + def forward(self, x, y): + self.x, self.y = ensure_numpy(x), ensure_numpy(y) + return np.mod(self.x, self.y) - @staticmethod - def backward(context, grad): - x, y = context.args + def backward(self, grad): # The gradient of x % y with respect to x is 1, and with respect to y is 0 # Check if all elements in `y.data` are integers and raise a ValueError if they're not - if not np.all(y.data.astype(int) == y.data): + if not np.all(self.y.astype(int) == self.y): raise ValueError("The derivative with respect to `y` is undefined for non-integer values.") - return grad, np.zeros_like(y.data) + return grad, np.zeros_like(self.y) class Pow(Operation): - @staticmethod - def forward(x, y): - from punytorch.tensor import Tensor - - # Ensure that x and y are NumPy arrays - x_data = x.data if isinstance(x, Tensor) else np.array(x) - y_data = y.data if isinstance(y, Tensor) else np.array(y) - # Use NumPy's power function - return np.power(x_data, y_data) - - @staticmethod - def backward(context, grad): - x, y = context.args + def forward(self, x, y): + self.x, self.y = ensure_numpy(x), ensure_numpy(y) + return np.power(self.x, self.y) + + def backward(self, grad): # The gradient with respect to x is y * x^(y - 1) # The gradient with respect to y is x^y * log(x) - grad_x = grad * y.data * np.power(x.data, y.data - 1) - grad_y = grad * np.power(x.data, y.data) * np.log(x.data) + grad_x = grad * self.y * np.power(self.x, self.y - 1) + grad_y = grad * np.power(self.x, self.y) * np.log(self.x) return grad_x, grad_y class MatMul(Operation): - @staticmethod - def forward(x, y): - from punytorch.tensor import Tensor - - # Ensure that x and y are NumPy arrays - x_data = x.data if isinstance(x, Tensor) else np.array(x) - y_data = y.data if isinstance(y, Tensor) else np.array(y) - # Use NumPy's matmul - return np.matmul(x_data, y_data) - - @staticmethod - def backward(context, grad): - x, y = context.args + def forward(self, x, y): + self.x, self.y = ensure_numpy(x), ensure_numpy(y) + return np.matmul(self.x, self.y) + + def backward(self, grad): # If Z = X @ Y, then d(Z)/dX = grad @ Y^T and d(Z)/dY = X^T @ grad - return grad.data @ np.transpose(y.data), np.transpose(x.data) @ grad.data + return np.dot(grad, self.y.T), np.dot(self.x.T, grad) class Tanh(Operation): - @staticmethod - def forward(x): - from punytorch.tensor import Tensor - - # Ensure that x is a NumPy array - x_data = x.data if isinstance(x, Tensor) else np.array(x) - # Use NumPy's tanh - return np.tanh(x_data) - - @staticmethod - def backward(context, grad): - from punytorch.tensor import Tensor - - x = context.args[0] - # The gradient of tanh is (1 - 
tanh^2(x)) - x_data = x.data if isinstance(x, Tensor) else np.array(x) - tanh_x_data = np.tanh(x_data) - grad_tanh = (1 - np.square(tanh_x_data)) * grad - return grad_tanh + def forward(self, x): + self.x = ensure_numpy(x) + return np.tanh(self.x) + + def backward(self, grad): + tanh_x = np.tanh(self.x) + return (1 - np.square(tanh_x)) * grad diff --git a/punytorch/tensor.py b/punytorch/tensor.py index aff2047..d6bfabd 100644 --- a/punytorch/tensor.py +++ b/punytorch/tensor.py @@ -225,76 +225,67 @@ def reshape(self, *shape): BINARY OPS """ - def __add__(self, other): - result = Tensor( - Add.forward(self, other), requires_grad=self.requires_grad or getattr(other, "requires_grad", False) - ) - if result.requires_grad: - result.context = Function(Add, self, other) - return result + def _binary_op(self, other, op, op_class): + """ + Helper function to perform binary operations and handle gradients. - def __sub__(self, other): - result = Tensor( - Sub.forward(self, other), requires_grad=self.requires_grad or getattr(other, "requires_grad", False) - ) - if result.requires_grad: - result.context = Function(Sub, self, other) - return result + Args: + other (Tensor, float, int): The right operand. + op (function): The operation to perform (e.g., np.add, np.subtract). + op_class (class): The class representing the operation for gradient computation. - def __mul__(self, other): + Returns: + Tensor: The result of the binary operation. + """ if isinstance(other, (int, float)): - return Tensor(self.data * other, requires_grad=self.requires_grad) - else: - result = Tensor( - Mul.forward(self, other), requires_grad=self.requires_grad or getattr(other, "requires_grad", False) - ) + result_data = op(self.data, other) + return Tensor(result_data, requires_grad=self.requires_grad) + elif isinstance(other, Tensor): + result_data = op(self.data, other.data) + result = Tensor(result_data, requires_grad=self.requires_grad or other.requires_grad) if result.requires_grad: - result.context = Function(Mul, self, other) + result.context = op_class(self, other) return result - - def __truediv__(self, other): - if isinstance(other, (int, float)): - return Tensor(self.data // other, requires_grad=self.requires_grad) else: - result = Tensor( - TrueDiv.forward(self, other), requires_grad=self.requires_grad or getattr(other, "requires_grad", False) + raise TypeError( + f"Unsupported operand type(s) for {op.__name__}: '{type(self).__name__}' and '{type(other).__name__}'" ) - if result.requires_grad: - result.context = Function(TrueDiv, self, other) - return result + + def __add__(self, other): + return self._binary_op(other, np.add, Add) + + def __sub__(self, other): + return self._binary_op(other, np.subtract, Sub) + + def __mul__(self, other): + return self._binary_op(other, np.multiply, Mul) + + def __truediv__(self, other): + return self._binary_op(other, np.divide, TrueDiv) def __mod__(self, other): - result = Tensor( - Mod.forward(self, other), requires_grad=self.requires_grad or getattr(other, "requires_grad", False) - ) - if result.requires_grad: - result.context = Function(Mod, self, other) - return result + return self._binary_op(other, np.mod, Mod) def __pow__(self, other): - result = Tensor( - Pow.forward(self, other), requires_grad=self.requires_grad or getattr(other, "requires_grad", False) - ) - if result.requires_grad: - result.context = Function(Pow, self, other) - return result + return self._binary_op(other, np.power, Pow) def __matmul__(self, other): - if isinstance(other, (int, float)): - return 
Tensor(self.data @ other, requires_grad=self.requires_grad) + # __matmul__ requires special handling due to reshaping for vectors. + if not isinstance(other, Tensor): + raise TypeError(f"Unsupported operand type(s) for @: '{type(self).__name__}' and '{type(other).__name__}'") + + if self.data.ndim == 1: + self_data = self.data.reshape(1, -1) else: - result = Tensor( - MatMul.forward(self, other), requires_grad=self.requires_grad or getattr(other, "requires_grad", False) - ) - if result.requires_grad: - result.context = Function(MatMul, self, other) - return result + self_data = self.data - def __tanh__(self): - result = Tensor(Tanh.forward(self), requires_grad=self.requires_grad) - if result.requires_grad: - result.context = Function(Tanh, self) - return result + if other.data.ndim == 1: + other_data = other.data.reshape(-1, 1) + else: + other_data = other.data + + result_data = np.matmul(self_data, other_data) + return Tensor(result_data, requires_grad=self.requires_grad or other.requires_grad) """ UNARY OPS From a07794751f519de30f1a5a1c969def46752e2676 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Fri, 12 Apr 2024 09:53:32 -0700 Subject: [PATCH 16/17] adding apply() method to the Module class --- punytorch/nn/modules.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/punytorch/nn/modules.py b/punytorch/nn/modules.py index 185b408..755561e 100644 --- a/punytorch/nn/modules.py +++ b/punytorch/nn/modules.py @@ -48,6 +48,20 @@ def parameters(self) -> list[Parameter]: params.extend(module.parameters()) return list(set(params)) + def apply(self, fn): + for module in self.children(): + module.apply(fn) + fn(self) + return self + + def children(self): + """ + Returns an iterator over immediate children modules. + """ + for name, module in self.__dict__.items(): + if isinstance(module, Module): + yield module + def state_dict(self): """ Returns a dictionary containing a whole state of the module. 
From 059c8a00d925279e2484376294d4072f802cbe7f Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Fri, 12 Apr 2024 09:54:00 -0700 Subject: [PATCH 17/17] some more attention block rewrites --- examples/gpt.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/examples/gpt.py b/examples/gpt.py index 0b959a2..275350b 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -196,7 +196,9 @@ def forward(self, x): v = self.value(x) # Compute attention scores - attention_scores = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5 + attention_scores = q @ k.transpose(-2, -1) + scale_factor = Tensor(k.shape[-1] ** -0.5) # Wrap the scalar in a Tensor + attention_scores = attention_scores * scale_factor # Ensure element-wise multiplication # Apply mask to attention scores masked_attention_scores = attention_scores.masked_fill( @@ -209,7 +211,9 @@ def forward(self, x): # Compute the attended values v = self.value(x) - out = attention_probs @ v + logger.debug(f"v: {v.shape}, {type(v)}") + logger.debug(f"attention_probs: {attention_probs.shape}, {type(attention_probs)}") + out = attention_probs.data @ v.data return out @@ -401,13 +405,16 @@ def __init__(self, hparams: Hyperparameters, vocab_size) -> None: def _init_weights(self, module): if isinstance(module, (nn.Linear, nn.Embedding)): - module.weight.data.normal_(mean=0.0, std=0.02) + # Assuming module.weight.data is a numpy array + mean = 0.0 + std = 0.02 + module.weight.data = np.random.normal(mean, std, module.weight.data.shape) if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() + module.bias.data = np.zeros_like(module.bias.data) def forward(self, x: Tensor, targets=None) -> Tensor: """ - Defines the computation performed at every call. + Overrides the base forward method in the nn.Module class to define the computation performed at every call. Args: x (Tensor): The input data. @@ -422,18 +429,19 @@ def forward(self, x: Tensor, targets=None) -> Tensor: B, T = x.shape token_embedding = self.token_embedding(x) position_embedding = self.position_embedding(Tensor(np.arange(T)).to(self.device)) - x = token_embedding + position_embedding # (B,T,C) - x = self.layers(x) # (B,T,C) - x = self.norm(x) # (B,T,C) - logits = self.proj(x) # (B,T,vocab_size) + x = token_embedding + position_embedding # Combine token and position embeddings (B,T,C) + for layer in self.layers: + x = layer(x) # Pass through each transformer block (B,T,C) + x = self.norm(x) # Apply normalization (B,T,C) + logits = self.proj(x) # Project to vocabulary size (B,T,vocab_size) - if targets is None: - loss = None - else: + if targets is not None: B, T, C = logits.shape - logits = logits.view(B * T, C) - targets = targets.view(B * T) - loss = CrossEntropyLoss.forward(logits, targets) + logits_flat = logits.view(B * T, C) + targets_flat = targets.view(B * T) + loss = CrossEntropyLoss.forward(logits_flat, targets_flat) + else: + loss = None return logits, loss
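
Illustrative usage sketch (not part of the commits above): the Tensor.masked_fill() method added in PATCH 14 ships without a standalone example, so the snippet below shows the causal-mask pattern that Head.forward relies on. It assumes Tensor wraps a NumPy array, that Tensor(ndarray) and masked_fill(mask, value) behave as defined in punytorch/tensor.py in PATCH 14, and that an already-constructed Tensor is accepted as the mask argument; shapes and values are toy placeholders.

import numpy as np
from punytorch.tensor import Tensor

# Toy attention scores for a sequence of length 4 (batch and head dims omitted).
T = 4
scores = Tensor(np.random.randn(T, T).astype(np.float32))

# Causal mask: True strictly above the diagonal, i.e. the "future" positions a
# token must not attend to. Head.forward keeps the complementary lower-triangular
# buffer ("tril") and masks where tril == 0; the boolean mask here is equivalent.
future = Tensor(np.triu(np.ones((T, T), dtype=bool), k=1))

# masked_fill replaces the masked positions with the given value (here -inf),
# so a subsequent softmax assigns them zero probability.
masked_scores = scores.masked_fill(future, float("-inf"))

print(masked_scores.data)  # upper triangle is -inf, lower triangle keeps the raw scores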