feature/attention block rewrite #9

Open · wants to merge 17 commits into base: feature/gpt-example
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -3,4 +3,4 @@ repos:
     rev: 23.12.1
     hooks:
       - id: black
-        args: ["."]
+        args: [".", "--line-length", "120"]
302 changes: 179 additions & 123 deletions examples/gpt.py

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions punytorch/helpers.py
@@ -59,3 +59,6 @@ def encode(self, text):
 
     def decode(self, encoded_chars):
         return "".join([self.int_to_char[i] for i in encoded_chars])
+
+    def get_vocab_size(self):
+        return self.vocab_size
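
For reference, a minimal usage sketch of the new accessor. The class name CharTokenizer and its constructor below are illustrative placeholders for whatever helper in punytorch/helpers.py defines encode/decode, not names taken from this diff:

# Hypothetical sketch -- "CharTokenizer" stands in for the helper class edited above,
# and the constructor argument is an assumption about how its vocabulary is built.
from punytorch.helpers import CharTokenizer

tokenizer = CharTokenizer("hello world")
ids = tokenizer.encode("hello")          # list of ints
assert tokenizer.decode(ids) == "hello"  # round-trips via int_to_char
vocab_size = tokenizer.get_vocab_size()  # new accessor added in this PR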
1 change: 1 addition & 0 deletions punytorch/nn/__init__.py
@@ -1 +1,2 @@
 from .modules import *
+from .dropout import *
50 changes: 50 additions & 0 deletions punytorch/nn/dropout.py
@@ -0,0 +1,50 @@
+import numpy as np
+import punytorch.nn as nn
+from punytorch.tensor import Tensor
+
+
+class Dropout(nn.Module):
+    def __init__(self, p: float = 0.5, seed: int = None):
+        """
+        Initializes the Dropout layer.
+
+        Args:
+            p (float or Tensor): The probability of an element to be zeroed. Default: 0.5
+                Can be a float for a constant dropout rate, or a Tensor for element-wise rates.
+            seed (int, optional): The seed for the random number generator. If provided,
+                ensures reproducibility of dropout mask across runs. Default: None
+
+        Raises:
+            TypeError: If `p` is not a float or a Tensor.
+        """
+        super().__init__()
+        self.p = p
+        self.seed = seed
+        if isinstance(p, float):
+            self.p = float(p)
+        elif isinstance(p, Tensor):
+            self.p = p.data
+        else:
+            raise TypeError(f"p must be a float or a Tensor, got {type(p)}")
+
+    def forward(self, input: Tensor, train: bool = True) -> Tensor:
+        """
+        Applies Dropout to the input Tensor during training.
+
+        Args:
+            input (Tensor): Input tensor.
+            train (bool): If True, apply dropout. If False, return the input as is.
+
+        Returns:
+            Tensor: Output tensor after applying dropout.
+        """
+        if train:
+            # Generate a mask with the same shape as the input
+            # Elements are drawn from a Bernoulli distribution
+            self.mask = (np.random.rand(*input.shape) > self.p) / (1 - self.p)
+            return input * self.mask
+        else:
+            return input
+
+    def __call__(self, input: Tensor, train: bool = True) -> Tensor:
+        return self.forward(input, train)
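
A note on the scaling in forward above: this is inverted dropout, where the surviving activations are divided by (1 - p) so the expected value of the output matches the input and no rescaling is needed at inference time. A standalone NumPy sketch of the same mask construction (illustration only, not part of the diff):

import numpy as np

# Standalone illustration of the inverted-dropout mask used in Dropout.forward.
np.random.seed(0)
p = 0.5                                          # drop probability
x = np.ones((1000, 100))                         # stand-in activations
mask = (np.random.rand(*x.shape) > p) / (1 - p)  # keep with prob (1 - p), scale survivors
y = x * mask

print(round(y.mean(), 2))  # ~1.0: expectation preserved, so eval mode can just return the input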
14 changes: 14 additions & 0 deletions punytorch/nn/modules.py
@@ -48,6 +48,20 @@ def parameters(self) -> list[Parameter]:
             params.extend(module.parameters())
         return list(set(params))
 
+    def apply(self, fn):
+        for module in self.children():
+            module.apply(fn)
+        fn(self)
+        return self
+
+    def children(self):
+        """
+        Returns an iterator over immediate children modules.
+        """
+        for name, module in self.__dict__.items():
+            if isinstance(module, Module):
+                yield module
+
     def state_dict(self):
         """
         Returns a dictionary containing a whole state of the module.
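
A small sketch of how apply and children above compose, in the spirit of PyTorch's model.apply(init_fn). The toy modules below are illustrative only and assume Module can be subclassed with a no-argument constructor, which the methods in this file suggest:

import punytorch.nn as nn

# Illustrative toy modules -- not part of this PR.
class Leaf(nn.Module):
    pass

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.a = Leaf()  # found by children() because it is a Module-valued attribute
        self.b = Leaf()

visited = []
Net().apply(lambda m: visited.append(type(m).__name__))
print(visited)  # ['Leaf', 'Leaf', 'Net'] -- apply() visits children before the parent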
173 changes: 60 additions & 113 deletions punytorch/ops.py
@@ -16,87 +16,70 @@ def apply(self, *args):
         Returns:
             The result of applying the function.
         """
-        return self.forward(*args)
+        return self.op(*args)
 
 
 class Operation:
+    def __call__(self, *args):
+        self.inputs = args
+        self.outputs = self.forward(*args)
+        return self.outputs
+
     def forward(self, *args):
         raise NotImplementedError
 
-    def backward(self, context, grad):
+    def backward(self, grad):
         raise NotImplementedError
 
 
-class Add(Operation):
-    @staticmethod
-    def forward(x, y):
-        from punytorch.tensor import Tensor
-
-        x_data = x.data if isinstance(x, Tensor) else np.array(x)
-        y_data = y.data if isinstance(y, Tensor) else np.array(y)
-        return np.add(x_data, y_data)
-
-    @staticmethod
-    def backward(context, grad):
+def ensure_numpy(x):
+    from punytorch.tensor import Tensor
+
+    return x.data if isinstance(x, Tensor) else np.array(x)
+
+
+class Add(Operation):
+    def forward(self, x, y):
+        self.x, self.y = ensure_numpy(x), ensure_numpy(y)
+        return np.add(self.x, self.y)
+
+    def backward(self, grad):
         # grad is assumed to be a NumPy array
         # The gradient of the sum is distributed equally to both operands
         # No need to change the shape of grad since addition is element-wise
         return grad, grad
 
 
 class Sub(Operation):
-    @staticmethod
-    def forward(x, y):
-        from punytorch.tensor import Tensor
-
-        # Ensure that x and y are NumPy arrays
-        x_data = x.data if isinstance(x, Tensor) else np.array(x)
-        y_data = y.data if isinstance(y, Tensor) else np.array(y)
-        # Use NumPy's subtraction
-        return np.subtract(x_data, y_data)
-
-    @staticmethod
-    def backward(context, grad):
+    def forward(self, x, y):
+        self.x, self.y = ensure_numpy(x), ensure_numpy(y)
+        return np.subtract(self.x, self.y)
+
+    def backward(self, grad):
         # The gradient with respect to the first operand is 1
         # The gradient with respect to the second operand is -1
         return grad, -grad
 
 
 class Mul(Operation):
-    @staticmethod
-    def forward(x, y):
-        from punytorch.tensor import Tensor
-
-        # Ensure that x and y are NumPy arrays
-        x_data = x.data if isinstance(x, Tensor) else np.array(x)
-        y_data = y.data if isinstance(y, Tensor) else np.array(y)
-        # Use NumPy's multiplication
-        return np.multiply(x_data, y_data)
-
-    @staticmethod
-    def backward(context, grad):
-        x, y = context.args
+    def forward(self, x, y):
+        self.x, self.y = ensure_numpy(x), ensure_numpy(y)
+        return np.multiply(self.x, self.y)
+
+    def backward(self, grad):
         # The gradient with respect to x is y, and vice versa
-        return grad * y.data, grad * x.data
+        return grad * self.y, grad * self.x
 
 
 class TrueDiv(Operation):
-    @staticmethod
-    def forward(x, y):
-        from punytorch.tensor import Tensor
-
-        # Ensure that x and y are NumPy arrays
-        x_data = x.data if isinstance(x, Tensor) else np.array(x)
-        y_data = y.data if isinstance(y, Tensor) else np.array(y)
-        # Use NumPy's true division
-        return np.divide(x_data, y_data)
-
-    @staticmethod
-    def backward(context, grad):
-        x, y = context.args
+    def forward(self, x, y):
+        self.x, self.y = ensure_numpy(x), ensure_numpy(y)
+        return np.divide(self.x, self.y)
+
+    def backward(self, grad):
         # The gradient with respect to x is 1/y
         # The gradient with respect to y is -x/y^2
-        return grad.data / y.data, -x.data * grad.data / (y.data**2)
+        return grad / self.y, -self.x * grad / (self.y**2)
 
 
 class Mod(Operation):
@@ -109,82 +92,46 @@ class Mod(Operation):
     for all use cases.
     """
 
-    @staticmethod
-    def forward(x, y):
-        from punytorch.tensor import Tensor
-
-        # Ensure that x and y are NumPy arrays
-        x_data = x.data if isinstance(x, Tensor) else np.array(x)
-        y_data = y.data if isinstance(y, Tensor) else np.array(y)
-        # Use NumPy's mod
-        return np.mod(x_data, y_data)
+    def forward(self, x, y):
+        self.x, self.y = ensure_numpy(x), ensure_numpy(y)
+        return np.mod(self.x, self.y)
 
-    @staticmethod
-    def backward(context, grad):
-        x, y = context.args
+    def backward(self, grad):
         # The gradient of x % y with respect to x is 1, and with respect to y is 0
         # Check if all elements in `y.data` are integers and raise a ValueError if they're not
-        if not np.all(y.data.astype(int) == y.data):
+        if not np.all(self.y.astype(int) == self.y):
             raise ValueError("The derivative with respect to `y` is undefined for non-integer values.")
-        return grad, np.zeros_like(y.data)
+        return grad, np.zeros_like(self.y)
 
 
 class Pow(Operation):
-    @staticmethod
-    def forward(x, y):
-        from punytorch.tensor import Tensor
-
-        # Ensure that x and y are NumPy arrays
-        x_data = x.data if isinstance(x, Tensor) else np.array(x)
-        y_data = y.data if isinstance(y, Tensor) else np.array(y)
-        # Use NumPy's power function
-        return np.power(x_data, y_data)
-
-    @staticmethod
-    def backward(context, grad):
-        x, y = context.args
+    def forward(self, x, y):
+        self.x, self.y = ensure_numpy(x), ensure_numpy(y)
+        return np.power(self.x, self.y)
+
+    def backward(self, grad):
         # The gradient with respect to x is y * x^(y - 1)
         # The gradient with respect to y is x^y * log(x)
-        grad_x = grad * y.data * np.power(x.data, y.data - 1)
-        grad_y = grad * np.power(x.data, y.data) * np.log(x.data)
+        grad_x = grad * self.y * np.power(self.x, self.y - 1)
+        grad_y = grad * np.power(self.x, self.y) * np.log(self.x)
         return grad_x, grad_y
 
 
 class MatMul(Operation):
-    @staticmethod
-    def forward(x, y):
-        from punytorch.tensor import Tensor
-
-        # Ensure that x and y are NumPy arrays
-        x_data = x.data if isinstance(x, Tensor) else np.array(x)
-        y_data = y.data if isinstance(y, Tensor) else np.array(y)
-        # Use NumPy's matmul
-        return np.matmul(x_data, y_data)
-
-    @staticmethod
-    def backward(context, grad):
-        x, y = context.args
+    def forward(self, x, y):
+        self.x, self.y = ensure_numpy(x), ensure_numpy(y)
+        return np.matmul(self.x, self.y)
+
+    def backward(self, grad):
         # If Z = X @ Y, then d(Z)/dX = grad @ Y^T and d(Z)/dY = X^T @ grad
-        return grad.data @ np.transpose(y.data), np.transpose(x.data) @ grad.data
+        return np.dot(grad, self.y.T), np.dot(self.x.T, grad)
 
 
 class Tanh(Operation):
-    @staticmethod
-    def forward(x):
-        from punytorch.tensor import Tensor
-
-        # Ensure that x is a NumPy array
-        x_data = x.data if isinstance(x, Tensor) else np.array(x)
-        # Use NumPy's tanh
-        return np.tanh(x_data)
-
-    @staticmethod
-    def backward(context, grad):
-        from punytorch.tensor import Tensor
-
-        x = context.args[0]
-        # The gradient of tanh is (1 - tanh^2(x))
-        x_data = x.data if isinstance(x, Tensor) else np.array(x)
-        tanh_x_data = np.tanh(x_data)
-        grad_tanh = (1 - np.square(tanh_x_data)) * grad
-        return grad_tanh
+    def forward(self, x):
+        self.x = ensure_numpy(x)
+        return np.tanh(self.x)
+
+    def backward(self, grad):
+        tanh_x = np.tanh(self.x)
+        return (1 - np.square(tanh_x)) * grad
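
To sanity-check the rewritten backward methods, here is a small standalone gradient check against the rule stated in MatMul.backward above (dZ/dX = grad @ Y^T, dZ/dY = X^T @ grad). It assumes the Operation subclasses can be driven directly with NumPy arrays, which ensure_numpy permits; this is only a sketch, not part of the PR:

import numpy as np
from punytorch.ops import MatMul

rng = np.random.default_rng(0)
x = rng.standard_normal((3, 4))
y = rng.standard_normal((4, 2))

op = MatMul()
z = op(x, y)                # Operation.__call__ stores the inputs and runs forward
grad = np.ones_like(z)      # upstream gradient of a sum() loss
dx, dy = op.backward(grad)  # analytic gradients from the new instance-based API

# Finite-difference check of d(sum(x @ y)) / dx[0, 0]
eps = 1e-6
x_perturbed = x.copy()
x_perturbed[0, 0] += eps
numeric = ((x_perturbed @ y).sum() - (x @ y).sum()) / eps
print(np.isclose(dx[0, 0], numeric, atol=1e-4))  # expect True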