feature/attention block rewrite #9

Open
wants to merge 17 commits into base: feature/gpt-example

Changes from 6 commits
242 changes: 130 additions & 112 deletions examples/gpt.py
@@ -29,32 +29,38 @@
# 2. Dataclasses for Model and Hyperparameters
@dataclass
class Hyperparameters:
batch_size: int
block_size: int
max_iters: int
eval_interval: int
learning_rate: float
device: str
eval_iters: int
num_embeds: int
num_heads: int
num_layers: int
dropout: float


@dataclass
class ModelArgs:
seq_len: int
d_model: int
n_heads: int
vocab_size: int
num_layers: int
esp: float
# Model architecture parameters
vocab_size: int # Size of the vocabulary
d_model: int # Dimensionality of the token embeddings (equivalent to num_embeds)
num_layers: int # Number of transformer blocks
num_heads: int # Number of attention heads in each transformer block
d_ff: int # Dimensionality of the feed-forward layer within each transformer block
dropout_rate: float # Dropout rate applied to several components within the transformer blocks (equivalent to dropout)
max_position_embeddings: int # Maximum sequence length that this model might ever be used with (could align with block_size)
eps: float # Epsilon used for layer normalization modules

# Training-specific parameters
batch_size: int # Number of sequences per training batch
block_size: int # Length of the sequence to be processed (could align with max_position_embeddings)
max_iters: int # Maximum number of training iterations
eval_interval: int # Interval (in iterations) at which to evaluate the model
learning_rate: float # Learning rate for the optimizer
device: str # Training device ('cpu' or 'cuda')
eval_iters: int # Number of iterations to perform during evaluation

# Additional training hyperparameters (suggested)
num_epochs: int = 1 # Total number of training epochs (default to 1 for flexibility)
warmup_steps: int = 0 # Number of warmup steps for learning rate scheduling (default to 0)
gradient_accumulation_steps: int = (
1 # Number of steps to accumulate gradients before performing a backward/update pass (default to 1)
)
max_grad_norm: float = 1.0 # Maximum gradient norm (for gradient clipping, default to 1.0)
save_interval: int = 1000 # Interval (in steps) at which to save model checkpoints (default to 1000)
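
For orientation, a hypothetical instantiation of the consolidated dataclass above might look like the following; the values are placeholders chosen for illustration and are not taken from this PR.

# placeholder values, for illustration only
hparams = Hyperparameters(
    vocab_size=1000,
    d_model=128,
    num_layers=2,
    num_heads=4,
    d_ff=512,
    dropout_rate=0.2,
    max_position_embeddings=128,
    eps=1e-5,
    batch_size=64,
    block_size=128,
    max_iters=5000,
    eval_interval=500,
    learning_rate=3e-4,
    device="cpu",
    eval_iters=200,
)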


# 3. Helper Functions
@Tensor.no_grad()
def estimate_loss(model, train_data, val_data, hyperparameters):
def estimate_loss(model, train_data, val_data, hparams: Hyperparameters):
"""
Estimates the loss of the model over a number of iterations.

@@ -79,8 +85,8 @@ def estimate_loss(model, train_data, val_data, hyperparameters):

for split in ["train", "val"]:
losses = []
for k in range(hyperparameters.eval_iters):
data, targets = get_batch(split, train_data, val_data, hyperparameters)
for k in range(hparams.eval_iters):
data, targets = get_batch(split, train_data, val_data, hparams)
logits = model(data)

batch_size, time_step, channels = logits.shape
@@ -96,7 +102,7 @@ def estimate_loss(model, train_data, val_data, hyperparameters):
return out


def get_batch(split, train_data, val_data, hyperparameters):
def get_batch(split, train_data, val_data, hparams: Hyperparameters):
"""
Generates a batch of data for training or validation.

@@ -119,20 +125,20 @@ def get_batch(split, train_data, val_data, hyperparameters):
len_data = len(data)

# randomly select starting indices for the sequences
idx = np.random.randint(0, len_data - hyperparameters.block_size, hyperparameters.batch_size)
idx = np.random.randint(0, len_data - hparams.block_size, hparams.batch_size)

# create input (x) and target (y) sequences based on block_size
# target (y) sequence is offset by one (common practice in language modeling)
x = Tensor.stack([data[i : i + hyperparameters.block_size] for i in idx])
y = Tensor.stack([data[i + 1 : i + hyperparameters.block_size + 1] for i in idx])
x = Tensor.stack([data[i : i + hparams.block_size] for i in idx])
y = Tensor.stack([data[i + 1 : i + hparams.block_size + 1] for i in idx])

# move the tensor to the specified device
x, y = x.to(hyperparameters.device), y.to(hyperparameters.device)
x, y = x.to(hparams.device), y.to(hparams.device)
return x, y
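
As a rough sketch of the batching logic above, written in plain NumPy rather than punytorch's Tensor and purely for illustration: the target batch is the input batch shifted one position to the right.

import numpy as np

data = np.arange(20)            # toy token stream: 0, 1, 2, ..., 19
block_size, batch_size = 5, 2

# random starting positions, leaving room for a full block plus the shifted target
idx = np.random.randint(0, len(data) - block_size, batch_size)

x = np.stack([data[i : i + block_size] for i in idx])          # inputs
y = np.stack([data[i + 1 : i + block_size + 1] for i in idx])  # targets, offset by one
# e.g. for i == 3: the x row is [3, 4, 5, 6, 7] and the y row is [4, 5, 6, 7, 8]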


@Tensor.no_grad()
def generate(model, idx, max_new_tokens, hyperparameters):
def generate(model, idx, max_new_tokens, hparams: Hyperparameters):
"""
Generates new tokens using the trained model.

@@ -149,9 +155,9 @@ def generate(model, idx, max_new_tokens, hyperparameters):
Returns:
Tensor: A tensor containing the indices of the generated tokens.
"""
idx = Tensor.zeros((1, hyperparameters.block_size)).to(hyperparameters.device).long()
idx = Tensor.zeros((1, hparams.block_size)).to(hparams.device).long()
for i in range(max_new_tokens):
idx_cond = idx[:, -hyperparameters.block_size :]
idx_cond = idx[:, -hparams.block_size :]
logits = model(idx_cond)
logits = logits[:, -1, :] # only take the last token, since we're predicting the "next" token

@@ -164,91 +170,108 @@ def generate(model, idx, max_new_tokens, hyperparameters):

# return the model to training mode
model.train()
return idx[:, hyperparameters.block_size :]
return idx[:, hparams.block_size :]
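
The loop above follows the usual autoregressive pattern: crop the running context to the last block_size tokens, take the logits for the final position, sample the next token, and append it. A stripped-down sketch of that control flow, with the model and sampling stubbed out for illustration:

def generate_sketch(next_token_fn, block_size, max_new_tokens):
    # start from a context of zeros, mirroring the zero-initialized idx above
    idx = [0] * block_size
    for _ in range(max_new_tokens):
        context = idx[-block_size:]          # keep only the last block_size tokens
        idx.append(next_token_fn(context))   # "predict" and append the next token
    return idx[block_size:]                  # drop the zero-padding prefix

# e.g. with a dummy predictor that always returns token 7:
print(generate_sketch(lambda ctx: 7, block_size=4, max_new_tokens=3))  # [7, 7, 7]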


# 4. Model Components (MHA, MLP, RMSNorm, Block, GPT)
class MHA(nn.Module):
def __init__(self, model_args: ModelArgs) -> None:
"""
Initializes the Multi-Head Attention module.
# 4. Model Components (single-head and multi-head Attention, MLP, RMSNorm, Block, GPT)
class Head(nn.Module):
"""
A single attention head.

Args:
model_args (ModelArgs): The arguments for the model, including dimensions and sequence length.
"""
This class implements a single attention head, which is a key component of the transformer architecture.
It computes the attention scores, applies a mask, and performs the attention operation.

Args:
hparams (Hyperparameters): The hyperparameters of the model.
"""

def __init__(self, hparams: Hyperparameters):
super().__init__()
self.key = nn.Linear(model_args.d_model, model_args.d_model)
self.query = nn.Linear(model_args.d_model, model_args.d_model)
self.value = nn.Linear(model_args.d_model, model_args.d_model)
self.proj = nn.Linear(model_args.d_model, model_args.d_model)
self.head_dim = model_args.d_model // model_args.n_heads

self.n_heads = model_args.n_heads
mask = np.tril(np.ones((model_args.seq_len, model_args.seq_len)))
mask = np.triu(mask, k=1) * -np.inf
# Repeat the mask for each head
mask = np.repeat(mask[np.newaxis, np.newaxis, :, :], self.n_heads, axis=1)
self.register_buffer("mask", Tensor(mask).float())
self.key = nn.Linear(hparams.num_embeds, hparams.head_size)
self.query = nn.Linear(hparams.num_embeds, hparams.head_size)
self.value = nn.Linear(hparams.num_embeds, hparams.head_size)
self.register_buffer("tril", Tensor(np.tril(np.ones((hparams.block_size, hparams.block_size)))))
self.dropout = nn.Dropout(hparams.dropout)  # placeholder; dropout still needs to be fully implemented

def forward(self, x: Tensor) -> Tensor:
def forward(self, x):
"""
Defines the computation performed at every call.
Computes the forward pass of the attention head.

Args:
x (Tensor): The input data.
x (Tensor): The input tensor of shape (batch_size, sequence_length, num_embeds).

Returns:
Tensor: The output of the Multi-Head Attention layer.
Tensor: The output tensor after applying attention, of shape (batch_size, sequence_length, head_size).
"""
if not isinstance(x, Tensor):
raise TypeError(f"Expected x to be a Tensor, but got {type(x).__name__}")
batch_size, sequence_length, channels = x.shape

batch_size, time_step, channels = x.shape
key = self.key(x)
query = self.query(x)
value = self.value(x)
key = key.reshape(batch_size, time_step, self.n_heads, channels // self.n_heads).transpose(1, 2)
query = query.reshape(batch_size, time_step, self.n_heads, channels // self.n_heads).transpose(1, 2)
value = value.reshape(batch_size, time_step, self.n_heads, channels // self.n_heads).transpose(1, 2)
# Compute key, query, and value projections
k = self.key(x)
q = self.query(x)
v = self.value(x)

# Call the static attention method
x = MHA.attention(key, query, value, self.mask)
# Compute attention scores
attention_scores = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5

return x
# Apply mask to attention scores
masked_attention_scores = attention_scores.masked_fill(
self.tril[:sequence_length, :sequence_length] == 0, float("-inf")
)

@staticmethod
def attention(key, query, value, mask) -> Tensor:
# Compute attention probabilities
attention_probs = Softmax().forward(masked_attention_scores)
attention_probs = self.dropout(attention_probs)

# Compute the attended values
v = self.value(x)
out = attention_probs @ v

return out
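
At its core the head computes masked scaled dot-product attention, softmax(Q K^T / sqrt(head_size)) applied to V with a causal (lower-triangular) mask. A minimal NumPy sketch of the same computation, independent of punytorch's Tensor API and shown only to make the shapes explicit:

import numpy as np

def single_head_attention(q, k, v):
    # q, k, v: (batch, seq_len, head_size)
    seq_len, head_size = q.shape[1], q.shape[2]
    scores = q @ k.transpose(0, 2, 1) * head_size ** -0.5        # (batch, seq_len, seq_len)
    causal = np.tril(np.ones((seq_len, seq_len)))                 # 1s on and below the diagonal
    scores = np.where(causal == 0, -np.inf, scores)               # block attention to future tokens
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))   # numerically stable softmax
    probs = probs / probs.sum(axis=-1, keepdims=True)
    return probs @ v                                              # (batch, seq_len, head_size)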


class MultiHeadAttention(nn.Module):
"""
Multi-head attention module.

This module applies multiple attention heads in parallel and concatenates their outputs.
The concatenated output is then projected to the original embedding dimension.

Args:
hparams (Hyperparameters): The hyperparameters of the model.
"""

def __init__(self, hparams: Hyperparameters):
super().__init__()
self.heads = nn.ModuleList([Head(hparams) for _ in range(hparams.num_heads)])
self.proj = nn.Linear(hparams.num_embeds, hparams.num_embeds)
self.dropout = nn.Dropout(hparams.dropout)

def forward(self, x):
"""
Computes the attention scores.
Computes the forward pass of the multi-head attention module.

Args:
key (Tensor): The key vectors.
query (Tensor): The query vectors.
value (Tensor): The value vectors.
mask (Tensor): The mask tensor.
x (Tensor): The input tensor of shape (batch_size, sequence_length, num_embeds).

Returns:
Tensor: The output of the attention mechanism.
Tensor: The output tensor after applying multi-head attention, of shape (batch_size, sequence_length, num_embeds).
"""
logger.debug(
f"key shape: {key.shape}, query shape: {query.shape}, value shape: {value.shape}, mask shape: {mask.shape}"
)
batch_size, n_head, time_step, channels = key.shape
scaling_factor = Tensor(channels**-0.5)
attention_scores = (query @ key.transpose(-2, -1)) * scaling_factor
attention_scores = mask[:, :, :time_step, :time_step] + attention_scores
attention_scores = Softmax().forward(attention_scores, dim=-1)
logger.debug(f"value shape: {value.shape}, attention_scores shape: {attention_scores.shape}")
# Apply attention heads in parallel
head_outputs = [h(x) for h in self.heads]

value = value.reshape(batch_size, n_head, time_step, channels, 1)
attention_scores = attention_scores.reshape(batch_size, n_head, time_step, 1, time_step)
# Concatenate the outputs of all attention heads
concatenated = Tensor.cat(head_outputs, dim=-1)

matmul_result = value @ attention_scores
x = matmul_result.sum(axis=-1)
# Project the concatenated output back to the original embedding dimension
out = self.proj(concatenated)

if not isinstance(x, Tensor):
raise TypeError(f"Expected x to be a Tensor, but got {type(x).__name__}")
return x
# Apply dropout regularization
out = self.dropout(out)

return out
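
For the concatenation and projection above to line up, each head is typically sized so that num_heads * head_size == num_embeds; the names and numbers below are assumptions used only to illustrate the shape bookkeeping.

# hypothetical sizes, for illustration only
num_embeds, num_heads = 64, 4
head_size = num_embeds // num_heads   # 16 per head

# each head maps (batch, seq_len, num_embeds) -> (batch, seq_len, head_size);
# concatenating num_heads head outputs along the last axis restores num_embeds,
# so the final projection is square: num_embeds -> num_embeds
assert num_heads * head_size == num_embeds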


class MLP(nn.Module):
@@ -339,18 +362,18 @@ class Block(nn.Module):
layer normalization.
"""

def __init__(self, model_args: ModelArgs) -> None:
def __init__(self, hparams: Hyperparameters) -> None:
"""
Initializes the Block module.

Args:
model_args (ModelArgs): The arguments for the model, including dimensions and sequence length.
"""
super().__init__()
self.attn = MHA(model_args)
self.ffn = MLP(model_args.d_model, model_args.d_model)
self.l1 = RMSNorm(model_args.d_model, eps=model_args.esp)
self.l2 = RMSNorm(model_args.d_model, eps=model_args.esp)
self.attn = MultiHeadAttention(hparams)
self.ffn = MLP(hparams.d_model, hparams.d_model)
self.l1 = RMSNorm(hparams.d_model, eps=hparams.eps)
self.l2 = RMSNorm(hparams.d_model, eps=hparams.eps)

def forward(self, x):
"""
@@ -373,7 +396,7 @@ class GPT(nn.Module):
a list of transformer blocks, a normalization layer, and a projection layer.
"""

def __init__(self, model_args: ModelArgs, device: str):
def __init__(self, hparams: Hyperparameters) -> None:
"""
Initializes the GPT model.

@@ -382,12 +405,12 @@ def __init__(self, model_args: ModelArgs, device: str):
device (str): The device to run the model on ("cpu" or "gpu").
"""
super().__init__()
self.device = device
self.token_embedding = nn.Embedding(model_args.vocab_size, model_args.d_model)
self.position_embedding = nn.Embedding(model_args.seq_len, model_args.d_model)
self.layers = nn.ModuleList([Block(model_args) for _ in range(model_args.num_layers)])
self.norm = RMSNorm(model_args.d_model)
self.proj = nn.Linear(model_args.d_model, model_args.vocab_size)
self.device = hparams.device
self.token_embedding = nn.Embedding(hparams.vocab_size, hparams.d_model)
self.position_embedding = nn.Embedding(hparams.seq_len, hparams.d_model)
self.layers = nn.ModuleList([Block(hparams) for _ in range(hparams.num_layers)])
self.norm = RMSNorm(hparams.d_model)
self.proj = nn.Linear(hparams.d_model, hparams.vocab_size)

def forward(self, x: Tensor) -> Tensor:
"""
@@ -417,7 +440,7 @@ def forward(self, x: Tensor) -> Tensor:

# 5. Main Function
def main():
# hyperparameters and modelargs
# hyperparameters for the model and training run
hyperparameters = Hyperparameters(
batch_size=64,
block_size=128,
@@ -430,20 +453,15 @@ def main():
num_heads=4,
num_layers=2,
dropout=0.2,
)

# fmt: off
model_args = ModelArgs(
seq_len=1000,
d_model=16,
n_heads=2,
vocab_size=1000,
num_layers=2,
esp=1e-5,
)
# fmt: on

model = GPT(model_args, hyperparameters.device).to(hyperparameters.device)
model = GPT(hyperparameters).to(hyperparameters.device)
optimizer = Adam(model.parameters(), lr=hyperparameters.learning_rate)
tokenizer = CharTokenizer(filepath="datasets/input.txt")

1 change: 1 addition & 0 deletions punytorch/nn/__init__.py
@@ -1 +1,2 @@
from .modules import *
from .dropout import *