From c48d2c9d2a8e94f8e0cffcc1cb9d5f9caf2cae31 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 12:45:43 -0700 Subject: [PATCH 01/17] remove model_args and just use hyperparameters --- examples/gpt.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/examples/gpt.py b/examples/gpt.py index 5173a93..15354f0 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -42,16 +42,6 @@ class Hyperparameters: dropout: float -@dataclass -class ModelArgs: - seq_len: int - d_model: int - n_heads: int - vocab_size: int - num_layers: int - esp: float - - # 3. Helper Functions @Tensor.no_grad() def estimate_loss(model, train_data, val_data, hyperparameters): From 1c332bc01e9d3ad27ac1666470674e06b4314585 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 12:46:20 -0700 Subject: [PATCH 02/17] make sure the hyperparameters is an instance of the dataclass --- examples/gpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gpt.py b/examples/gpt.py index 15354f0..edefe34 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -122,7 +122,7 @@ def get_batch(split, train_data, val_data, hyperparameters): @Tensor.no_grad() -def generate(model, idx, max_new_tokens, hyperparameters): +def generate(model, idx, max_new_tokens, hyperparameters: Hyperparameters): """ Generates new tokens using the trained model. From d4add5c76c47394ad9ef2caf299b9794f885e5fd Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 12:48:10 -0700 Subject: [PATCH 03/17] splitting MHA into a single Head class and an MHA class that concatenates multiple Heads --- examples/gpt.py | 137 +++++++++++++++++++++++++++--------------------- 1 file changed, 77 insertions(+), 60 deletions(-) diff --git a/examples/gpt.py b/examples/gpt.py index edefe34..7358a2a 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -157,88 +157,105 @@ def generate(model, idx, max_new_tokens, hyperparameters: Hyperparameters): return idx[:, hyperparameters.block_size :] -# 4. Model Components (MHA, MLP, RMSNorm, Block, GPT) -class MHA(nn.Module): - def __init__(self, model_args: ModelArgs) -> None: - """ - Initializes the Multi-Head Attention module. +# 4. Model Components (Attention (Single head and MHA), MLP, RMSNorm, Block, GPT) +class Head(nn.Module): + """ + A single attention head. - Args: - model_args (ModelArgs): The arguments for the model, including dimensions and sequence length. - """ + This class implements a single attention head, which is a key component of the transformer architecture. + It computes the attention scores, applies a mask, and performs the attention operation. + + Args: + head_size (int): The size of the attention head. + hyperparameters (Hyperparameters): The hyperparameters of the model. 
+ """ + + def __init__(self, head_size, hyperparameters): super().__init__() - self.key = nn.Linear(model_args.d_model, model_args.d_model) - self.query = nn.Linear(model_args.d_model, model_args.d_model) - self.value = nn.Linear(model_args.d_model, model_args.d_model) - self.proj = nn.Linear(model_args.d_model, model_args.d_model) - self.head_dim = model_args.d_model // model_args.n_heads - - self.n_heads = model_args.n_heads - mask = np.tril(np.ones((model_args.seq_len, model_args.seq_len))) - mask = np.triu(mask, k=1) * -np.inf - # Repeat the mask for each head - mask = np.repeat(mask[np.newaxis, np.newaxis, :, :], self.n_heads, axis=1) - self.register_buffer("mask", Tensor(mask).float()) + self.key = nn.Linear(hyperparameters.num_embeds, head_size) + self.query = nn.Linear(hyperparameters.num_embeds, head_size) + self.value = nn.Linear(hyperparameters.num_embeds, head_size) + self.register_buffer("tril", Tensor(np.tril(np.ones((hyperparameters.block_size, hyperparameters.block_size))))) + self.dropout = nn.Dropout(hyperparameters.dropout) # this is placeholder, need to implement fully - def forward(self, x: Tensor) -> Tensor: + def forward(self, x): """ - Defines the computation performed at every call. + Computes the forward pass of the attention head. Args: - x (Tensor): The input data. + x (Tensor): The input tensor of shape (batch_size, sequence_length, num_embeds). Returns: - Tensor: The output of the Multi-Head Attention layer. + Tensor: The output tensor after applying attention, of shape (batch_size, sequence_length, head_size). """ - if not isinstance(x, Tensor): - raise TypeError(f"Expected x to be a Tensor, but got {type(x).__name__}") + batch_size, sequence_length, channels = x.shape - batch_size, time_step, channels = x.shape - key = self.key(x) - query = self.query(x) - value = self.value(x) - key = key.reshape(batch_size, time_step, self.n_heads, channels // self.n_heads).transpose(1, 2) - query = query.reshape(batch_size, time_step, self.n_heads, channels // self.n_heads).transpose(1, 2) - value = value.reshape(batch_size, time_step, self.n_heads, channels // self.n_heads).transpose(1, 2) + # Compute key, query, and value projections + k = self.key(x) + q = self.query(x) + v = self.value(x) - # Call the static attention method - x = MHA.attention(key, query, value, self.mask) + # Compute attention scores + attention_scores = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5 - return x + # Apply mask to attention scores + masked_attention_scores = attention_scores.masked_fill( + self.tril[:sequence_length, :sequence_length] == 0, float("-inf") + ) - @staticmethod - def attention(key, query, value, mask) -> Tensor: + # Compute attention probabilities + attention_probs = Softmax().forward(masked_attention_scores) + attention_probs = self.dropout(attention_probs) + + # Compute the attended values + v = self.value(x) + out = attention_probs @ v + + return out + + +class MultiHeadAttention(nn.Module): + """ + Multi-head attention module. + + This module applies multiple attention heads in parallel and concatenates their outputs. + The concatenated output is then projected to the original embedding dimension. + + Args: + num_heads (int): The number of attention heads. + head_size (int): The size of each attention head. + hyperparameters (Hyperparameters): The hyperparameters of the model. 
+ """ + + def __init__(self, num_heads, head_size, hyperparameters): + super().__init__() + self.heads = nn.ModuleList([Head(head_size, hyperparameters) for _ in range(num_heads)]) + self.proj = nn.Linear(hyperparameters.num_embeds, hyperparameters.num_embeds) + self.dropout = nn.Dropout(hyperparameters.dropout) + + def forward(self, x): """ - Computes the attention scores. + Computes the forward pass of the multi-head attention module. Args: - key (Tensor): The key vectors. - query (Tensor): The query vectors. - value (Tensor): The value vectors. - mask (Tensor): The mask tensor. + x (Tensor): The input tensor of shape (batch_size, sequence_length, num_embeds). Returns: - Tensor: The output of the attention mechanism. + Tensor: The output tensor after applying multi-head attention, of shape (batch_size, sequence_length, num_embeds). """ - logger.debug( - f"key shape: {key.shape}, query shape: {query.shape}, value shape: {value.shape}, mask shape: {mask.shape}" - ) - batch_size, n_head, time_step, channels = key.shape - scaling_factor = Tensor(channels**-0.5) - attention_scores = (query @ key.transpose(-2, -1)) * scaling_factor - attention_scores = mask[:, :, :time_step, :time_step] + attention_scores - attention_scores = Softmax().forward(attention_scores, dim=-1) - logger.debug(f"value shape: {value.shape}, attention_scores shape: {attention_scores.shape}") + # Apply attention heads in parallel + head_outputs = [h(x) for h in self.heads] - value = value.reshape(batch_size, n_head, time_step, channels, 1) - attention_scores = attention_scores.reshape(batch_size, n_head, time_step, 1, time_step) + # Concatenate the outputs of all attention heads + concatenated = Tensor.cat(head_outputs, dim=-1) - matmul_result = value @ attention_scores - x = matmul_result.sum(axis=-1) + # Project the concatenated output back to the original embedding dimension + out = self.proj(concatenated) - if not isinstance(x, Tensor): - raise TypeError(f"Expected x to be a Tensor, but got {type(x).__name__}") - return x + # Apply dropout regularization + out = self.dropout(out) + + return out class MLP(nn.Module): From 455f03c467cd437671f7902c045a179027662401 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 12:48:32 -0700 Subject: [PATCH 04/17] creating a dropout class that inherits from nn.Module --- punytorch/nn/__init__.py | 1 + punytorch/nn/dropout.py | 50 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 punytorch/nn/dropout.py diff --git a/punytorch/nn/__init__.py b/punytorch/nn/__init__.py index 270dceb..e882c22 100644 --- a/punytorch/nn/__init__.py +++ b/punytorch/nn/__init__.py @@ -1 +1,2 @@ from .modules import * +from .dropout import * diff --git a/punytorch/nn/dropout.py b/punytorch/nn/dropout.py new file mode 100644 index 0000000..64d7996 --- /dev/null +++ b/punytorch/nn/dropout.py @@ -0,0 +1,50 @@ +import numpy as np +import punytorch.nn as nn +from punytorch.tensor import Tensor + + +class Dropout(nn.Module): + def __init__(self, p: float = 0.5, seed: int = None): + """ + Initializes the Dropout layer. + + Args: + p (float or Tensor): The probability of an element to be zeroed. Default: 0.5 + Can be a float for a constant dropout rate, or a Tensor for element-wise rates. + seed (int, optional): The seed for the random number generator. If provided, + ensures reproducibility of dropout mask across runs. Default: None + + Raises: + TypeError: If `p` is not a float or a Tensor. 
+ """ + super().__init__() + self.p = p + self.seed = seed + if isinstance(p, float): + self.p = float(p) + elif isinstance(p, Tensor): + self.p = p.data + else: + raise TypeError(f"p must be a float or a Tensor, got {type(p)}") + + def forward(self, input: Tensor, train: bool = True) -> Tensor: + """ + Applies Dropout to the input Tensor during training. + + Args: + input (Tensor): Input tensor. + train (bool): If True, apply dropout. If False, return the input as is. + + Returns: + Tensor: Output tensor after applying dropout. + """ + if train: + # Generate a mask with the same shape as the input + # Elements are drawn from a Bernoulli distribution + self.mask = (np.random.rand(*input.shape) > self.p) / (1 - self.p) + return input * self.mask + else: + return input + + def __call__(self, input: Tensor, train: bool = True) -> Tensor: + return self.forward(input, train) From 9005bc2dc4d77ad64712ce9bfadb8e2691c15fba Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 12:48:41 -0700 Subject: [PATCH 05/17] basic dropout test --- tests/test_dropout.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tests/test_dropout.py diff --git a/tests/test_dropout.py b/tests/test_dropout.py new file mode 100644 index 0000000..b3cd6f5 --- /dev/null +++ b/tests/test_dropout.py @@ -0,0 +1,26 @@ +import numpy as np +from punytorch.nn.dropout import Dropout +from punytorch.tensor import Tensor + + +def test_dropout(): + np.random.seed(42) # For reproducibility + input_tensor = Tensor(np.array([1.0, 2.0, 3.0, 4.0])) + dropout_layer = Dropout(p=0.5) + + # Test during training + output_tensor_train = dropout_layer(input_tensor, train=True) + assert ( + output_tensor_train.shape == input_tensor.shape + ), "Output tensor shape should match input tensor shape during training." + + # Test during evaluation + output_tensor_eval = dropout_layer(input_tensor, train=False) + assert np.array_equal( + output_tensor_eval.data, input_tensor.data + ), "Output tensor should match input tensor during evaluation." + + # Check if dropout is applied (not a rigorous test due to randomness, but a basic check) + assert not np.array_equal( + output_tensor_train.data, input_tensor.data + ), "Output tensor should not match input tensor during training (with dropout applied)." From 29cca1ffde728f09759d91b46cc8d92d9fbfe9bc Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:14:18 -0700 Subject: [PATCH 06/17] i'm going to use one dataclass called Hyperparameters and just reference that everywhere --- examples/gpt.py | 115 ++++++++++++++++++++++++++---------------------- 1 file changed, 63 insertions(+), 52 deletions(-) diff --git a/examples/gpt.py b/examples/gpt.py index 7358a2a..e036787 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -29,22 +29,38 @@ # 2. 
Dataclasses for Model and Hyperparameters @dataclass class Hyperparameters: - batch_size: int - block_size: int - max_iters: int - eval_interval: int - learning_rate: float - device: str - eval_iters: int - num_embeds: int - num_heads: int - num_layers: int - dropout: float + # Model architecture parameters + vocab_size: int # Size of the vocabulary + d_model: int # Dimensionality of the token embeddings (equivalent to num_embeds) + num_layers: int # Number of transformer blocks + num_heads: int # Number of attention heads in each transformer block + d_ff: int # Dimensionality of the feed-forward layer within each transformer block + dropout_rate: float # Dropout rate applied to several components within the transformer blocks (equivalent to dropout) + max_position_embeddings: int # Maximum sequence length that this model might ever be used with (could align with block_size) + eps: float # Epsilon used for layer normalization modules + + # Training-specific parameters + batch_size: int # Number of sequences per training batch + block_size: int # Length of the sequence to be processed (could align with max_position_embeddings) + max_iters: int # Maximum number of training iterations + eval_interval: int # Interval (in iterations) at which to evaluate the model + learning_rate: float # Learning rate for the optimizer + device: str # Training device ('cpu' or 'cuda') + eval_iters: int # Number of iterations to perform during evaluation + + # Additional training hyperparameters (suggested) + num_epochs: int = 1 # Total number of training epochs (default to 1 for flexibility) + warmup_steps: int = 0 # Number of warmup steps for learning rate scheduling (default to 0) + gradient_accumulation_steps: int = ( + 1 # Number of steps to accumulate gradients before performing a backward/update pass (default to 1) + ) + max_grad_norm: float = 1.0 # Maximum gradient norm (for gradient clipping, default to 1.0) + save_interval: int = 1000 # Interval (in steps) at which to save model checkpoints (default to 1000) # 3. Helper Functions @Tensor.no_grad() -def estimate_loss(model, train_data, val_data, hyperparameters): +def estimate_loss(model, train_data, val_data, hparams: Hyperparameters): """ Estimates the loss of the model over a number of iterations. @@ -69,8 +85,8 @@ def estimate_loss(model, train_data, val_data, hyperparameters): for split in ["train", "val"]: losses = [] - for k in range(hyperparameters.eval_iters): - data, targets = get_batch(split, train_data, val_data, hyperparameters) + for k in range(hparams.eval_iters): + data, targets = get_batch(split, train_data, val_data, hparams) logits = model(data) batch_size, time_step, channels = logits.shape @@ -86,7 +102,7 @@ def estimate_loss(model, train_data, val_data, hyperparameters): return out -def get_batch(split, train_data, val_data, hyperparameters): +def get_batch(split, train_data, val_data, hparams: Hyperparameters): """ Generates a batch of data for training or validation. 
@@ -109,20 +125,20 @@ def get_batch(split, train_data, val_data, hyperparameters): len_data = len(data) # randomly select starting indices for the sequences - idx = np.random.randint(0, len_data - hyperparameters.block_size, hyperparameters.batch_size) + idx = np.random.randint(0, len_data - hparams.block_size, hparams.batch_size) # create input (x) and target (y) sequences based on block_size # target (y) sequence is offset by one (common practice in language modeling) - x = Tensor.stack([data[i : i + hyperparameters.block_size] for i in idx]) - y = Tensor.stack([data[i + 1 : i + hyperparameters.block_size + 1] for i in idx]) + x = Tensor.stack([data[i : i + hparams.block_size] for i in idx]) + y = Tensor.stack([data[i + 1 : i + hparams.block_size + 1] for i in idx]) # move the tensor to the specified device - x, y = x.to(hyperparameters.device), y.to(hyperparameters.device) + x, y = x.to(hparams.device), y.to(hparams.device) return x, y @Tensor.no_grad() -def generate(model, idx, max_new_tokens, hyperparameters: Hyperparameters): +def generate(model, idx, max_new_tokens, hparams: Hyperparameters): """ Generates new tokens using the trained model. @@ -139,9 +155,9 @@ def generate(model, idx, max_new_tokens, hyperparameters: Hyperparameters): Returns: Tensor: A tensor containing the indices of the generated tokens. """ - idx = Tensor.zeros((1, hyperparameters.block_size)).to(hyperparameters.device).long() + idx = Tensor.zeros((1, hparams.block_size)).to(hparams.device).long() for i in range(max_new_tokens): - idx_cond = idx[:, -hyperparameters.block_size :] + idx_cond = idx[:, -hparams.block_size :] logits = model(idx_cond) logits = logits[:, -1, :] # only take the last token, since we're predicting the "next" token @@ -154,7 +170,7 @@ def generate(model, idx, max_new_tokens, hyperparameters: Hyperparameters): # return the model to training mode model.train() - return idx[:, hyperparameters.block_size :] + return idx[:, hparams.block_size :] # 4. Model Components (Attention (Single head and MHA), MLP, RMSNorm, Block, GPT) @@ -170,13 +186,13 @@ class Head(nn.Module): hyperparameters (Hyperparameters): The hyperparameters of the model. """ - def __init__(self, head_size, hyperparameters): + def __init__(self, hparams: Hyperparameters): super().__init__() - self.key = nn.Linear(hyperparameters.num_embeds, head_size) - self.query = nn.Linear(hyperparameters.num_embeds, head_size) - self.value = nn.Linear(hyperparameters.num_embeds, head_size) - self.register_buffer("tril", Tensor(np.tril(np.ones((hyperparameters.block_size, hyperparameters.block_size))))) - self.dropout = nn.Dropout(hyperparameters.dropout) # this is placeholder, need to implement fully + self.key = nn.Linear(hparams.num_embeds, hparams.head_size) + self.query = nn.Linear(hparams.num_embeds, hparams.head_size) + self.value = nn.Linear(hparams.num_embeds, hparams.head_size) + self.register_buffer("tril", Tensor(np.tril(np.ones((hparams.block_size, hparams.block_size))))) + self.dropout = nn.Dropout(hparams.dropout) # this is placeholder, need to implement fully def forward(self, x): """ @@ -227,11 +243,11 @@ class MultiHeadAttention(nn.Module): hyperparameters (Hyperparameters): The hyperparameters of the model. 
""" - def __init__(self, num_heads, head_size, hyperparameters): + def __init__(self, hparams: Hyperparameters): super().__init__() - self.heads = nn.ModuleList([Head(head_size, hyperparameters) for _ in range(num_heads)]) - self.proj = nn.Linear(hyperparameters.num_embeds, hyperparameters.num_embeds) - self.dropout = nn.Dropout(hyperparameters.dropout) + self.heads = nn.ModuleList([Head(hparams) for _ in range(hparams.num_heads)]) + self.proj = nn.Linear(hparams.num_embeds, hparams.num_embeds) + self.dropout = nn.Dropout(hparams.dropout) def forward(self, x): """ @@ -346,7 +362,7 @@ class Block(nn.Module): layer normalization. """ - def __init__(self, model_args: ModelArgs) -> None: + def __init__(self, hparams: Hyperparameters) -> None: """ Initializes the Block module. @@ -354,10 +370,10 @@ def __init__(self, model_args: ModelArgs) -> None: model_args (ModelArgs): The arguments for the model, including dimensions and sequence length. """ super().__init__() - self.attn = MHA(model_args) - self.ffn = MLP(model_args.d_model, model_args.d_model) - self.l1 = RMSNorm(model_args.d_model, eps=model_args.esp) - self.l2 = RMSNorm(model_args.d_model, eps=model_args.esp) + self.attn = MultiHeadAttention(hparams) + self.ffn = MLP(hparams.d_model, hparams.d_model) + self.l1 = RMSNorm(hparams.d_model, eps=hparams.eps) + self.l2 = RMSNorm(hparams.d_model, eps=hparams.eps) def forward(self, x): """ @@ -380,7 +396,7 @@ class GPT(nn.Module): a list of transformer blocks, a normalization layer, and a projection layer. """ - def __init__(self, model_args: ModelArgs, device: str): + def __init__(self, hparams: Hyperparameters) -> None: """ Initializes the GPT model. @@ -389,12 +405,12 @@ def __init__(self, model_args: ModelArgs, device: str): device (str): The device to run the model on ("cpu" or "gpu"). """ super().__init__() - self.device = device - self.token_embedding = nn.Embedding(model_args.vocab_size, model_args.d_model) - self.position_embedding = nn.Embedding(model_args.seq_len, model_args.d_model) - self.layers = nn.ModuleList([Block(model_args) for _ in range(model_args.num_layers)]) - self.norm = RMSNorm(model_args.d_model) - self.proj = nn.Linear(model_args.d_model, model_args.vocab_size) + self.device = hparams.device + self.token_embedding = nn.Embedding(hparams.vocab_size, hparams.d_model) + self.position_embedding = nn.Embedding(hparams.seq_len, hparams.d_model) + self.layers = nn.ModuleList([Block(hparams) for _ in range(hparams.num_layers)]) + self.norm = RMSNorm(hparams.d_model) + self.proj = nn.Linear(hparams.d_model, hparams.vocab_size) def forward(self, x: Tensor) -> Tensor: """ @@ -424,7 +440,7 @@ def forward(self, x: Tensor) -> Tensor: # 5. 
Main Function def main(): - # hyperparameters and modelargs + # hyperparameters for the model and training run hyperparameters = Hyperparameters( batch_size=64, block_size=128, @@ -437,10 +453,6 @@ def main(): num_heads=4, num_layers=2, dropout=0.2, - ) - - # fmt: off - model_args = ModelArgs( seq_len=1000, d_model=16, n_heads=2, @@ -448,9 +460,8 @@ def main(): num_layers=2, esp=1e-5, ) - # fmt: on - model = GPT(model_args, hyperparameters.device).to(hyperparameters.device) + model = GPT(hyperparameters).to(hyperparameters.device) optimizer = Adam(model.parameters(), lr=hyperparameters.learning_rate) tokenizer = CharTokenizer(filepath="datasets/input.txt") From 08dc1ffd380f907f22ce07682b6b62171703e557 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:21:46 -0700 Subject: [PATCH 07/17] fixing some minor things per ellipsis --- examples/gpt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/gpt.py b/examples/gpt.py index e036787..fa2e4ae 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -408,7 +408,7 @@ def __init__(self, hparams: Hyperparameters) -> None: self.device = hparams.device self.token_embedding = nn.Embedding(hparams.vocab_size, hparams.d_model) self.position_embedding = nn.Embedding(hparams.seq_len, hparams.d_model) - self.layers = nn.ModuleList([Block(hparams) for _ in range(hparams.num_layers)]) + self.layers = nn.ModuleList([Block(hparams=hparams) for _ in range(hparams.num_layers)]) self.norm = RMSNorm(hparams.d_model) self.proj = nn.Linear(hparams.d_model, hparams.vocab_size) @@ -461,7 +461,7 @@ def main(): esp=1e-5, ) - model = GPT(hyperparameters).to(hyperparameters.device) + model = GPT(hparams=hyperparameters).to(hyperparameters.device) optimizer = Adam(model.parameters(), lr=hyperparameters.learning_rate) tokenizer = CharTokenizer(filepath="datasets/input.txt") From 5e2fbaea0b436e28ac48872fcb6a5afa4ec60840 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 17:08:18 -0700 Subject: [PATCH 08/17] trimming down that dataclass a bit --- examples/gpt.py | 38 +++++++++++--------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/examples/gpt.py b/examples/gpt.py index fa2e4ae..410f1b3 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -29,33 +29,17 @@ # 2. 
Dataclasses for Model and Hyperparameters @dataclass class Hyperparameters: - # Model architecture parameters - vocab_size: int # Size of the vocabulary - d_model: int # Dimensionality of the token embeddings (equivalent to num_embeds) - num_layers: int # Number of transformer blocks - num_heads: int # Number of attention heads in each transformer block - d_ff: int # Dimensionality of the feed-forward layer within each transformer block - dropout_rate: float # Dropout rate applied to several components within the transformer blocks (equivalent to dropout) - max_position_embeddings: int # Maximum sequence length that this model might ever be used with (could align with block_size) - eps: float # Epsilon used for layer normalization modules - - # Training-specific parameters - batch_size: int # Number of sequences per training batch - block_size: int # Length of the sequence to be processed (could align with max_position_embeddings) - max_iters: int # Maximum number of training iterations - eval_interval: int # Interval (in iterations) at which to evaluate the model - learning_rate: float # Learning rate for the optimizer - device: str # Training device ('cpu' or 'cuda') - eval_iters: int # Number of iterations to perform during evaluation - - # Additional training hyperparameters (suggested) - num_epochs: int = 1 # Total number of training epochs (default to 1 for flexibility) - warmup_steps: int = 0 # Number of warmup steps for learning rate scheduling (default to 0) - gradient_accumulation_steps: int = ( - 1 # Number of steps to accumulate gradients before performing a backward/update pass (default to 1) - ) - max_grad_norm: float = 1.0 # Maximum gradient norm (for gradient clipping, default to 1.0) - save_interval: int = 1000 # Interval (in steps) at which to save model checkpoints (default to 1000) + batch_size: int + block_size: int + max_iters: int + eval_interval: int + learning_rate: float + device: str + eval_iters: int + num_embeds: int + num_heads: int + num_layers: int + dropout: float # 3. Helper Functions From bbdb4d96321f2e2030004ed53a6363e000a8ea24 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 17:10:25 -0700 Subject: [PATCH 09/17] because we trimmed down, need to rename --- examples/gpt.py | 102 +++++++++++++++++++++++++++---------------- punytorch/helpers.py | 3 ++ 2 files changed, 68 insertions(+), 37 deletions(-) diff --git a/examples/gpt.py b/examples/gpt.py index 410f1b3..fcd8577 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -170,13 +170,13 @@ class Head(nn.Module): hyperparameters (Hyperparameters): The hyperparameters of the model. """ - def __init__(self, hparams: Hyperparameters): + def __init__(self, head_size, hparams: Hyperparameters): super().__init__() - self.key = nn.Linear(hparams.num_embeds, hparams.head_size) - self.query = nn.Linear(hparams.num_embeds, hparams.head_size) - self.value = nn.Linear(hparams.num_embeds, hparams.head_size) + self.key = nn.Linear(hparams.num_embeds, head_size) + self.query = nn.Linear(hparams.num_embeds, head_size) + self.value = nn.Linear(hparams.num_embeds, head_size) self.register_buffer("tril", Tensor(np.tril(np.ones((hparams.block_size, hparams.block_size))))) - self.dropout = nn.Dropout(hparams.dropout) # this is placeholder, need to implement fully + self.dropout = nn.Dropout(hparams.dropout) def forward(self, x): """ @@ -227,9 +227,9 @@ class MultiHeadAttention(nn.Module): hyperparameters (Hyperparameters): The hyperparameters of the model. 
""" - def __init__(self, hparams: Hyperparameters): + def __init__(self, head_size, hparams: Hyperparameters): super().__init__() - self.heads = nn.ModuleList([Head(hparams) for _ in range(hparams.num_heads)]) + self.heads = nn.ModuleList([Head(head_size, hparams) for _ in range(hparams.num_heads)]) self.proj = nn.Linear(hparams.num_embeds, hparams.num_embeds) self.dropout = nn.Dropout(hparams.dropout) @@ -354,10 +354,11 @@ def __init__(self, hparams: Hyperparameters) -> None: model_args (ModelArgs): The arguments for the model, including dimensions and sequence length. """ super().__init__() - self.attn = MultiHeadAttention(hparams) - self.ffn = MLP(hparams.d_model, hparams.d_model) - self.l1 = RMSNorm(hparams.d_model, eps=hparams.eps) - self.l2 = RMSNorm(hparams.d_model, eps=hparams.eps) + head_size = hparams.num_embeds // hparams.num_heads + self.attn = MultiHeadAttention(head_size, hparams) + self.ffn = MLP(in_features=hparams.num_embeds, out_features=hparams.num_embeds) + self.l1 = RMSNorm(hparams.num_embeds, eps=hparams.dropout) + self.l2 = RMSNorm(hparams.num_embeds, eps=hparams.dropout) def forward(self, x): """ @@ -380,7 +381,7 @@ class GPT(nn.Module): a list of transformer blocks, a normalization layer, and a projection layer. """ - def __init__(self, hparams: Hyperparameters) -> None: + def __init__(self, hparams: Hyperparameters, vocab_size) -> None: """ Initializes the GPT model. @@ -390,11 +391,19 @@ def __init__(self, hparams: Hyperparameters) -> None: """ super().__init__() self.device = hparams.device - self.token_embedding = nn.Embedding(hparams.vocab_size, hparams.d_model) - self.position_embedding = nn.Embedding(hparams.seq_len, hparams.d_model) + self.token_embedding = nn.Embedding(vocab_size, hparams.num_embeds) + self.position_embedding = nn.Embedding(hparams.block_size, hparams.num_embeds) self.layers = nn.ModuleList([Block(hparams=hparams) for _ in range(hparams.num_layers)]) - self.norm = RMSNorm(hparams.d_model) - self.proj = nn.Linear(hparams.d_model, hparams.vocab_size) + self.norm = RMSNorm(hparams.num_embeds) + self.proj = nn.Linear(hparams.num_embeds, vocab_size) + + self.apply(self._init_weights) + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() def forward(self, x: Tensor) -> Tensor: """ @@ -411,15 +420,38 @@ def forward(self, x: Tensor) -> Tensor: B, T = x.shape token_embedding = self.token_embedding(x) position_embedding = self.position_embedding(Tensor(np.arange(T)).to(self.device)) - x = token_embedding + position_embedding - - for layer in self.layers: - x = layer(x) - - x = self.norm(x) - logits = self.proj(x) + x = token_embedding + position_embedding # (B,T,C) + x = self.layers(x) # (B,T,C) + x = self.norm(x) # (B,T,C) + logits = self.lm_head(x) # (B,T,vocab_size) + + if targets is None: + loss = None + else: + B, T, C = logits.shape + logits = logits.view(B * T, C) + targets = targets.view(B * T) + loss = CrossEntropyLoss.forward(logits, targets) - return logits + return logits, loss + + def generate(self, idx, max_new_tokens, hparams: Hyperparameters): + # idx is (B, T) array of indices in the current context + for _ in range(max_new_tokens): + # crop idx to the last block_size tokens + idx_cond = idx[:, -hparams.block_size :] + # get the predictions + logits, loss = self(idx_cond) + # focus only on the last time step + logits = logits[:, -1, :] # becomes (B, C) + # 
apply softmax to get probabilities + probs = Softmax.forward(logits, dim=-1) # (B, C) + # sample from the distribution - don't have access to torch.multinomial, so we're making our own + idx_next = np.array([np.random.choice(len(probs[b]), 1, p=probs[b]) for b in range(len(probs))]) # (B, 1) + idx_next = Tensor(idx_next).to(probs.device) # Convert to Tensor and move to the same device as probs + # append sampled index to the running sequence + idx = Tensor.cat((idx, idx_next), dim=1) # (B, T+1) + return idx # 5. Main Function @@ -427,32 +459,28 @@ def main(): # hyperparameters for the model and training run hyperparameters = Hyperparameters( batch_size=64, - block_size=128, + block_size=256, max_iters=5000, eval_interval=500, learning_rate=3e-4, device="cpu", - eval_iters=100, - num_embeds=128 * 2, - num_heads=4, - num_layers=2, + eval_iters=200, + num_embeds=384, + num_heads=6, + num_layers=6, dropout=0.2, - seq_len=1000, - d_model=16, - n_heads=2, - vocab_size=1000, - num_layers=2, - esp=1e-5, ) - model = GPT(hparams=hyperparameters).to(hyperparameters.device) - optimizer = Adam(model.parameters(), lr=hyperparameters.learning_rate) tokenizer = CharTokenizer(filepath="datasets/input.txt") data = Tensor(tokenizer.encode(tokenizer.text)).long() n = int(0.95 * len(data)) train_data = data[:n] val_data = data[n:] + vocab_size = tokenizer.get_vocab_size() + + model = GPT(hparams=hyperparameters, vocab_size=vocab_size).to(hyperparameters.device) + optimizer = Adam(model.parameters(), lr=hyperparameters.learning_rate) # type checks before moving on if not all(isinstance(x, Tensor) for x in [data, train_data, val_data]): diff --git a/punytorch/helpers.py b/punytorch/helpers.py index a73c1e4..d777f5a 100644 --- a/punytorch/helpers.py +++ b/punytorch/helpers.py @@ -59,3 +59,6 @@ def encode(self, text): def decode(self, encoded_chars): return "".join([self.int_to_char[i] for i in encoded_chars]) + + def get_vocab_size(self): + return self.vocab_size From 811fc4f1f8ac97d595f66f83eb55900903b6d279 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 17:15:40 -0700 Subject: [PATCH 10/17] oops wrong name --- examples/gpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gpt.py b/examples/gpt.py index fcd8577..0e6a468 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -423,7 +423,7 @@ def forward(self, x: Tensor) -> Tensor: x = token_embedding + position_embedding # (B,T,C) x = self.layers(x) # (B,T,C) x = self.norm(x) # (B,T,C) - logits = self.lm_head(x) # (B,T,vocab_size) + logits = self.proj(x) # (B,T,vocab_size) if targets is None: loss = None From a50ccd0daefbfa0f15af5f08ac1e77085f9953f5 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Mon, 8 Apr 2024 17:16:28 -0700 Subject: [PATCH 11/17] set target to None --- examples/gpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gpt.py b/examples/gpt.py index 0e6a468..d5b1ab8 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -405,7 +405,7 @@ def _init_weights(self, module): if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() - def forward(self, x: Tensor) -> Tensor: + def forward(self, x: Tensor, targets=None) -> Tensor: """ Defines the computation performed at every call. 
From 96d361185c9d0071f561889f69fee76d839f78ec Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:49:27 -0700 Subject: [PATCH 12/17] pre-commit changes --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fece537..3933897 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,4 +3,4 @@ repos: rev: 23.12.1 hooks: - id: black - args: ["."] \ No newline at end of file + args: [".", "--line-length", "120"] \ No newline at end of file From 9a3db129d6d2cd4675684ddd22191f0efe76c05c Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:49:43 -0700 Subject: [PATCH 13/17] adding the target argument to the GPT docstring --- examples/gpt.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/examples/gpt.py b/examples/gpt.py index d5b1ab8..0b959a2 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -411,6 +411,8 @@ def forward(self, x: Tensor, targets=None) -> Tensor: Args: x (Tensor): The input data. + targets (Tensor, optional): The target values. If provided, the method will compute and return the loss. + If not provided, the method will only return the logits. Defaults to None. Returns: Tensor: The output of the GPT model. @@ -436,6 +438,22 @@ def forward(self, x: Tensor, targets=None) -> Tensor: return logits, loss def generate(self, idx, max_new_tokens, hparams: Hyperparameters): + """ + Generates text based on the provided context. + + This function takes an initial context of indices and generates text by repeatedly predicting the next token + until the specified maximum number of new tokens is reached. The generation process involves sampling from the + probability distribution over the vocabulary for each new token. + + Args: + idx (Tensor): The initial context represented as a tensor of token indices with shape (B, T), where B is + the batch size and T is the sequence length of the context. + max_new_tokens (int): The maximum number of new tokens to generate. + hparams (Hyperparameters): The hyperparameters for the model, including block size. + + Returns: + Tensor: The generated indices including the initial context and the new tokens, with shape (B, T + max_new_tokens). + """ # idx is (B, T) array of indices in the current context for _ in range(max_new_tokens): # crop idx to the last block_size tokens From e766d25b0beabf6dda6867aa5f8415c84d4cd63d Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Thu, 11 Apr 2024 09:16:40 -0700 Subject: [PATCH 14/17] adds the masked_fill() method to the Tensor class --- punytorch/tensor.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/punytorch/tensor.py b/punytorch/tensor.py index a99a502..aff2047 100644 --- a/punytorch/tensor.py +++ b/punytorch/tensor.py @@ -109,6 +109,21 @@ def __lt__(self, other): else: return self.data < other + def masked_fill(self, mask, value): + """ + Fills elements of this tensor with `value` where `mask` is True. + + Args: + mask (Tensor): The boolean mask. + value (float): The value to fill in with. + + Returns: + Tensor: A new tensor with filled values. 
+ """ + mask = self.ensure_tensor(mask) + result_data = np.where(mask.data, value, self.data) + return Tensor(result_data, requires_grad=self.requires_grad) + @staticmethod def data_to_numpy(data): if isinstance(data, (int, float)): From 9b302d322758b83c5d504ed3d7f006e9653f2e4c Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Thu, 11 Apr 2024 10:37:05 -0700 Subject: [PATCH 15/17] changes to binary operations in both tensor.py and ops.py --- punytorch/ops.py | 173 +++++++++++++++----------------------------- punytorch/tensor.py | 101 ++++++++++++-------------- 2 files changed, 106 insertions(+), 168 deletions(-) diff --git a/punytorch/ops.py b/punytorch/ops.py index cb917ec..9c1edbc 100644 --- a/punytorch/ops.py +++ b/punytorch/ops.py @@ -16,28 +16,34 @@ def apply(self, *args): Returns: The result of applying the function. """ - return self.forward(*args) + return self.op(*args) class Operation: + def __call__(self, *args): + self.inputs = args + self.outputs = self.forward(*args) + return self.outputs + def forward(self, *args): raise NotImplementedError - def backward(self, context, grad): + def backward(self, grad): raise NotImplementedError -class Add(Operation): - @staticmethod - def forward(x, y): - from punytorch.tensor import Tensor +def ensure_numpy(x): + from punytorch.tensor import Tensor + + return x.data if isinstance(x, Tensor) else np.array(x) - x_data = x.data if isinstance(x, Tensor) else np.array(x) - y_data = y.data if isinstance(y, Tensor) else np.array(y) - return np.add(x_data, y_data) - @staticmethod - def backward(context, grad): +class Add(Operation): + def forward(self, x, y): + self.x, self.y = ensure_numpy(x), ensure_numpy(y) + return np.add(self.x, self.y) + + def backward(self, grad): # grad is assumed to be a NumPy array # The gradient of the sum is distributed equally to both operands # No need to change the shape of grad since addition is element-wise @@ -45,58 +51,35 @@ def backward(context, grad): class Sub(Operation): - @staticmethod - def forward(x, y): - from punytorch.tensor import Tensor - - # Ensure that x and y are NumPy arrays - x_data = x.data if isinstance(x, Tensor) else np.array(x) - y_data = y.data if isinstance(y, Tensor) else np.array(y) - # Use NumPy's subtraction - return np.subtract(x_data, y_data) - - @staticmethod - def backward(context, grad): + def forward(self, x, y): + self.x, self.y = ensure_numpy(x), ensure_numpy(y) + return np.subtract(self.x, self.y) + + def backward(self, grad): # The gradient with respect to the first operand is 1 # The gradient with respect to the second operand is -1 return grad, -grad class Mul(Operation): - @staticmethod - def forward(x, y): - from punytorch.tensor import Tensor - - # Ensure that x and y are NumPy arrays - x_data = x.data if isinstance(x, Tensor) else np.array(x) - y_data = y.data if isinstance(y, Tensor) else np.array(y) - # Use NumPy's multiplication - return np.multiply(x_data, y_data) - - @staticmethod - def backward(context, grad): - x, y = context.args + def forward(self, x, y): + self.x, self.y = ensure_numpy(x), ensure_numpy(y) + return np.multiply(self.x, self.y) + + def backward(self, grad): # The gradient with respect to x is y, and vice versa - return grad * y.data, grad * x.data + return grad * self.y, grad * self.x class TrueDiv(Operation): - @staticmethod - def forward(x, y): - from punytorch.tensor import Tensor - - # Ensure that x and y are NumPy arrays - x_data = x.data if isinstance(x, Tensor) else np.array(x) - y_data 
= y.data if isinstance(y, Tensor) else np.array(y) - # Use NumPy's true division - return np.divide(x_data, y_data) - - @staticmethod - def backward(context, grad): - x, y = context.args + def forward(self, x, y): + self.x, self.y = ensure_numpy(x), ensure_numpy(y) + return np.divide(self.x, self.y) + + def backward(self, grad): # The gradient with respect to x is 1/y # The gradient with respect to y is -x/y^2 - return grad.data / y.data, -x.data * grad.data / (y.data**2) + return grad / self.y, -self.x * grad / (self.y**2) class Mod(Operation): @@ -109,82 +92,46 @@ class Mod(Operation): for all use cases. """ - @staticmethod - def forward(x, y): - from punytorch.tensor import Tensor - - # Ensure that x and y are NumPy arrays - x_data = x.data if isinstance(x, Tensor) else np.array(x) - y_data = y.data if isinstance(y, Tensor) else np.array(y) - # Use NumPy's mod - return np.mod(x_data, y_data) + def forward(self, x, y): + self.x, self.y = ensure_numpy(x), ensure_numpy(y) + return np.mod(self.x, self.y) - @staticmethod - def backward(context, grad): - x, y = context.args + def backward(self, grad): # The gradient of x % y with respect to x is 1, and with respect to y is 0 # Check if all elements in `y.data` are integers and raise a ValueError if they're not - if not np.all(y.data.astype(int) == y.data): + if not np.all(self.y.astype(int) == self.y): raise ValueError("The derivative with respect to `y` is undefined for non-integer values.") - return grad, np.zeros_like(y.data) + return grad, np.zeros_like(self.y) class Pow(Operation): - @staticmethod - def forward(x, y): - from punytorch.tensor import Tensor - - # Ensure that x and y are NumPy arrays - x_data = x.data if isinstance(x, Tensor) else np.array(x) - y_data = y.data if isinstance(y, Tensor) else np.array(y) - # Use NumPy's power function - return np.power(x_data, y_data) - - @staticmethod - def backward(context, grad): - x, y = context.args + def forward(self, x, y): + self.x, self.y = ensure_numpy(x), ensure_numpy(y) + return np.power(self.x, self.y) + + def backward(self, grad): # The gradient with respect to x is y * x^(y - 1) # The gradient with respect to y is x^y * log(x) - grad_x = grad * y.data * np.power(x.data, y.data - 1) - grad_y = grad * np.power(x.data, y.data) * np.log(x.data) + grad_x = grad * self.y * np.power(self.x, self.y - 1) + grad_y = grad * np.power(self.x, self.y) * np.log(self.x) return grad_x, grad_y class MatMul(Operation): - @staticmethod - def forward(x, y): - from punytorch.tensor import Tensor - - # Ensure that x and y are NumPy arrays - x_data = x.data if isinstance(x, Tensor) else np.array(x) - y_data = y.data if isinstance(y, Tensor) else np.array(y) - # Use NumPy's matmul - return np.matmul(x_data, y_data) - - @staticmethod - def backward(context, grad): - x, y = context.args + def forward(self, x, y): + self.x, self.y = ensure_numpy(x), ensure_numpy(y) + return np.matmul(self.x, self.y) + + def backward(self, grad): # If Z = X @ Y, then d(Z)/dX = grad @ Y^T and d(Z)/dY = X^T @ grad - return grad.data @ np.transpose(y.data), np.transpose(x.data) @ grad.data + return np.dot(grad, self.y.T), np.dot(self.x.T, grad) class Tanh(Operation): - @staticmethod - def forward(x): - from punytorch.tensor import Tensor - - # Ensure that x is a NumPy array - x_data = x.data if isinstance(x, Tensor) else np.array(x) - # Use NumPy's tanh - return np.tanh(x_data) - - @staticmethod - def backward(context, grad): - from punytorch.tensor import Tensor - - x = context.args[0] - # The gradient of tanh is (1 - 
tanh^2(x)) - x_data = x.data if isinstance(x, Tensor) else np.array(x) - tanh_x_data = np.tanh(x_data) - grad_tanh = (1 - np.square(tanh_x_data)) * grad - return grad_tanh + def forward(self, x): + self.x = ensure_numpy(x) + return np.tanh(self.x) + + def backward(self, grad): + tanh_x = np.tanh(self.x) + return (1 - np.square(tanh_x)) * grad diff --git a/punytorch/tensor.py b/punytorch/tensor.py index aff2047..d6bfabd 100644 --- a/punytorch/tensor.py +++ b/punytorch/tensor.py @@ -225,76 +225,67 @@ def reshape(self, *shape): BINARY OPS """ - def __add__(self, other): - result = Tensor( - Add.forward(self, other), requires_grad=self.requires_grad or getattr(other, "requires_grad", False) - ) - if result.requires_grad: - result.context = Function(Add, self, other) - return result + def _binary_op(self, other, op, op_class): + """ + Helper function to perform binary operations and handle gradients. - def __sub__(self, other): - result = Tensor( - Sub.forward(self, other), requires_grad=self.requires_grad or getattr(other, "requires_grad", False) - ) - if result.requires_grad: - result.context = Function(Sub, self, other) - return result + Args: + other (Tensor, float, int): The right operand. + op (function): The operation to perform (e.g., np.add, np.subtract). + op_class (class): The class representing the operation for gradient computation. - def __mul__(self, other): + Returns: + Tensor: The result of the binary operation. + """ if isinstance(other, (int, float)): - return Tensor(self.data * other, requires_grad=self.requires_grad) - else: - result = Tensor( - Mul.forward(self, other), requires_grad=self.requires_grad or getattr(other, "requires_grad", False) - ) + result_data = op(self.data, other) + return Tensor(result_data, requires_grad=self.requires_grad) + elif isinstance(other, Tensor): + result_data = op(self.data, other.data) + result = Tensor(result_data, requires_grad=self.requires_grad or other.requires_grad) if result.requires_grad: - result.context = Function(Mul, self, other) + result.context = op_class(self, other) return result - - def __truediv__(self, other): - if isinstance(other, (int, float)): - return Tensor(self.data // other, requires_grad=self.requires_grad) else: - result = Tensor( - TrueDiv.forward(self, other), requires_grad=self.requires_grad or getattr(other, "requires_grad", False) + raise TypeError( + f"Unsupported operand type(s) for {op.__name__}: '{type(self).__name__}' and '{type(other).__name__}'" ) - if result.requires_grad: - result.context = Function(TrueDiv, self, other) - return result + + def __add__(self, other): + return self._binary_op(other, np.add, Add) + + def __sub__(self, other): + return self._binary_op(other, np.subtract, Sub) + + def __mul__(self, other): + return self._binary_op(other, np.multiply, Mul) + + def __truediv__(self, other): + return self._binary_op(other, np.divide, TrueDiv) def __mod__(self, other): - result = Tensor( - Mod.forward(self, other), requires_grad=self.requires_grad or getattr(other, "requires_grad", False) - ) - if result.requires_grad: - result.context = Function(Mod, self, other) - return result + return self._binary_op(other, np.mod, Mod) def __pow__(self, other): - result = Tensor( - Pow.forward(self, other), requires_grad=self.requires_grad or getattr(other, "requires_grad", False) - ) - if result.requires_grad: - result.context = Function(Pow, self, other) - return result + return self._binary_op(other, np.power, Pow) def __matmul__(self, other): - if isinstance(other, (int, float)): - return 
Tensor(self.data @ other, requires_grad=self.requires_grad) + # __matmul__ requires special handling due to reshaping for vectors. + if not isinstance(other, Tensor): + raise TypeError(f"Unsupported operand type(s) for @: '{type(self).__name__}' and '{type(other).__name__}'") + + if self.data.ndim == 1: + self_data = self.data.reshape(1, -1) else: - result = Tensor( - MatMul.forward(self, other), requires_grad=self.requires_grad or getattr(other, "requires_grad", False) - ) - if result.requires_grad: - result.context = Function(MatMul, self, other) - return result + self_data = self.data - def __tanh__(self): - result = Tensor(Tanh.forward(self), requires_grad=self.requires_grad) - if result.requires_grad: - result.context = Function(Tanh, self) - return result + if other.data.ndim == 1: + other_data = other.data.reshape(-1, 1) + else: + other_data = other.data + + result_data = np.matmul(self_data, other_data) + return Tensor(result_data, requires_grad=self.requires_grad or other.requires_grad) """ UNARY OPS From a07794751f519de30f1a5a1c969def46752e2676 Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Fri, 12 Apr 2024 09:53:32 -0700 Subject: [PATCH 16/17] adding apply() method to the Module class --- punytorch/nn/modules.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/punytorch/nn/modules.py b/punytorch/nn/modules.py index 185b408..755561e 100644 --- a/punytorch/nn/modules.py +++ b/punytorch/nn/modules.py @@ -48,6 +48,20 @@ def parameters(self) -> list[Parameter]: params.extend(module.parameters()) return list(set(params)) + def apply(self, fn): + for module in self.children(): + module.apply(fn) + fn(self) + return self + + def children(self): + """ + Returns an iterator over immediate children modules. + """ + for name, module in self.__dict__.items(): + if isinstance(module, Module): + yield module + def state_dict(self): """ Returns a dictionary containing a whole state of the module. 
From 059c8a00d925279e2484376294d4072f802cbe7f Mon Sep 17 00:00:00 2001 From: Josh Black-Star <23347017+jdblackstar@users.noreply.github.com> Date: Fri, 12 Apr 2024 09:54:00 -0700 Subject: [PATCH 17/17] some more attention block rewrites --- examples/gpt.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/examples/gpt.py b/examples/gpt.py index 0b959a2..275350b 100644 --- a/examples/gpt.py +++ b/examples/gpt.py @@ -196,7 +196,9 @@ def forward(self, x): v = self.value(x) # Compute attention scores - attention_scores = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5 + attention_scores = q @ k.transpose(-2, -1) + scale_factor = Tensor(k.shape[-1] ** -0.5) # Wrap the scalar in a Tensor + attention_scores = attention_scores * scale_factor # Ensure element-wise multiplication # Apply mask to attention scores masked_attention_scores = attention_scores.masked_fill( @@ -209,7 +211,9 @@ def forward(self, x): # Compute the attended values v = self.value(x) - out = attention_probs @ v + logger.debug(f"v: {v.shape}, {type(v)}") + logger.debug(f"attention_probs: {attention_probs.shape}, {type(attention_probs)}") + out = attention_probs.data @ v.data return out @@ -401,13 +405,16 @@ def __init__(self, hparams: Hyperparameters, vocab_size) -> None: def _init_weights(self, module): if isinstance(module, (nn.Linear, nn.Embedding)): - module.weight.data.normal_(mean=0.0, std=0.02) + # Assuming module.weight.data is a numpy array + mean = 0.0 + std = 0.02 + module.weight.data = np.random.normal(mean, std, module.weight.data.shape) if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() + module.bias.data = np.zeros_like(module.bias.data) def forward(self, x: Tensor, targets=None) -> Tensor: """ - Defines the computation performed at every call. + Overrides the base forward method in the nn.Module class to define the computation performed at every call. Args: x (Tensor): The input data. @@ -422,18 +429,19 @@ def forward(self, x: Tensor, targets=None) -> Tensor: B, T = x.shape token_embedding = self.token_embedding(x) position_embedding = self.position_embedding(Tensor(np.arange(T)).to(self.device)) - x = token_embedding + position_embedding # (B,T,C) - x = self.layers(x) # (B,T,C) - x = self.norm(x) # (B,T,C) - logits = self.proj(x) # (B,T,vocab_size) + x = token_embedding + position_embedding # Combine token and position embeddings (B,T,C) + for layer in self.layers: + x = layer(x) # Pass through each transformer block (B,T,C) + x = self.norm(x) # Apply normalization (B,T,C) + logits = self.proj(x) # Project to vocabulary size (B,T,vocab_size) - if targets is None: - loss = None - else: + if targets is not None: B, T, C = logits.shape - logits = logits.view(B * T, C) - targets = targets.view(B * T) - loss = CrossEntropyLoss.forward(logits, targets) + logits_flat = logits.view(B * T, C) + targets_flat = targets.view(B * T) + loss = CrossEntropyLoss.forward(logits_flat, targets_flat) + else: + loss = None return logits, loss
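
Illustrative usage sketch (not part of the commits above): the Tensor.masked_fill() method added in PATCH 14 ships without a standalone example, so the snippet below shows the causal-mask pattern that Head.forward relies on. It assumes Tensor wraps a NumPy array, that Tensor(ndarray) and masked_fill(mask, value) behave as defined in punytorch/tensor.py in PATCH 14, and that an already-constructed Tensor is accepted as the mask argument; shapes and values are toy placeholders.

import numpy as np
from punytorch.tensor import Tensor

# Toy attention scores for a sequence of length 4 (batch and head dims omitted).
T = 4
scores = Tensor(np.random.randn(T, T).astype(np.float32))

# Causal mask: True strictly above the diagonal, i.e. the "future" positions a
# token must not attend to. Head.forward keeps the complementary lower-triangular
# buffer ("tril") and masks where tril == 0; the boolean mask here is equivalent.
future = Tensor(np.triu(np.ones((T, T), dtype=bool), k=1))

# masked_fill replaces the masked positions with the given value (here -inf),
# so a subsequent softmax assigns them zero probability.
masked_scores = scores.masked_fill(future, float("-inf"))

print(masked_scores.data)  # upper triangle is -inf, lower triangle keeps the raw scores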