Skip to content

Commit

Permalink
botched merge (#463)
Browse files Browse the repository at this point in the history
  • Loading branch information
dlwh authored Feb 12, 2024
1 parent 8f9a3de commit 0b8b6e9
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 3 deletions.
32 changes: 32 additions & 0 deletions config/gpt2_1536_sophiah.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
data:
train_urls:
- "gs://pubmed-mosaic/openwebtext-sharded/openwebtext_train.{1..128}-of-128.jsonl.gz"
validation_urls:
- "gs://pubmed-mosaic/openwebtext-sharded/openwebtext_val.{1..8}-of-8.jsonl.gz"
cache_dir: "gs://levanter-data/tokenized/openwebtext/"
tokenizer: "gpt2"
model:
type: gpt2
hidden_dim: 1536
num_heads: 24
num_layers: 48
seq_len: 1024
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
trainer:
tracker:
project: "levanter"
tags: [ "openwebtext", "gpt2"]

mp: p=f32,c=bfloat16
model_axis_size: 1
per_device_parallelism: 2
per_device_eval_parallelism: 8
optimizer:
type: sophia-h
learning_rate: 2E-4
weight_decay: 0.2
min_lr_ratio: 0.1
gamma: 0.01
# sophia needs a minimum amount of warmup or it doesn't do well
warmup: 2000
7 changes: 4 additions & 3 deletions src/levanter/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,13 +493,14 @@ def _take_train_step(self, state: S, model, grads, *batch, **batch_kwargs) -> S:
"""
# only train on the trainable parameters. We're leaning on JAX to do dead code elimination for us
with hax.axis_mapping(self.parameter_axis_mapping):
opt_state = state.opt_state
train_grads = _partition_trainable_params(grads, state.is_trainable)[0]
trainable_model = _partition_trainable_params(model, state.is_trainable)[0]
updates, opt_state = self.optimizer.update(train_grads, state.opt_state, params=trainable_model)

partial_fn = lambda model: self.loss_fn(model, *batch, **batch_kwargs)

updates, opt_state = self.optimizer.update(grads, opt_state, params=trainable_model, obj_fn=partial_fn)
updates, opt_state = self.optimizer.update(
train_grads, opt_state, params=trainable_model, obj_fn=partial_fn
)
model = eqx.apply_updates(model, updates)

return dataclasses.replace(state, _step=state._step + 1, model=model, opt_state=opt_state)
Expand Down

0 comments on commit 0b8b6e9

Please sign in to comment.