Merge branch 'merge_trackers' into doremi
dlwh committed Feb 9, 2024
2 parents 3e3c9da + 7ba2b39 commit c5fb7a6
Showing 37 changed files with 1,399 additions and 483 deletions.
11 changes: 11 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,11 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file

version: 2
updates:
  - package-ecosystem: "pip" # See documentation for possible values
    directory: "/" # Location of package manifests
    schedule:
      interval: "weekly"
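
As an aside, not part of this commit: the comments above note that each package ecosystem Dependabot should watch gets its own entry under `updates`. If, for example, the repository's GitHub Actions workflows were also to be kept up to date, a second entry of the same shape could be appended; a minimal sketch:

```
version: 2
updates:
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "weekly"
  - package-ecosystem: "github-actions"  # hypothetical second ecosystem, not in this commit
    directory: "/"                       # for github-actions this resolves to .github/workflows
    schedule:
      interval: "weekly"
```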
6 changes: 3 additions & 3 deletions config/gpt2_nano.yaml
@@ -1,5 +1,5 @@
-#data:
-#  id: dlwh/wikitext_103_detokenized
+data:
+  id: dlwh/wikitext_103_detokenized
model:
type: gpt2
hidden_dim: 32
@@ -14,7 +14,7 @@ trainer:
- every: 50
save_interval: 5m

-per_device_parallelism: 16
+per_device_parallelism: -1
train_batch_size: 32

tensor_parallel_axes: ["mlp", "heads"]
3 changes: 1 addition & 2 deletions config/gpt2_nano_tb.yaml
@@ -14,8 +14,7 @@ trainer:
- every: 50
save_interval: 5m

-per_device_eval_parallelism: 1
-per_device_parallelism: 1
+per_device_parallelism: -1
train_batch_size: 32

tensor_parallel_axes: ["mlp", "heads"]
2 changes: 1 addition & 1 deletion config/gpt2_small.yaml
@@ -14,7 +14,7 @@ trainer:

mp: p=f32,c=bfloat16
model_axis_size: 1
-per_device_parallelism: 4
+per_device_parallelism: -1

train_batch_size: 512
optimizer:
28 changes: 28 additions & 0 deletions config/mistral_7b.yaml
@@ -0,0 +1,28 @@
data:
  train_urls:
    - "gs://pubmed-mosaic/openwebtext-sharded/openwebtext_train.{1..128}-of-128.jsonl.gz"
  validation_urls:
    - "gs://pubmed-mosaic/openwebtext-sharded/openwebtext_val.{1..8}-of-8.jsonl.gz"
  cache_dir: "gs://levanter-data/tokenized/openwebtext_llama/"
  tokenizer: "mistralai/Mistral-7B-v0.1"
model:
  type: mistral
# TODO: uncomment this once we resolve the resource exhaustion issue
#  initialize_from_hf: "mistralai/Mistral-7B-v0.1"
#  use_hf_model_config: true
trainer:
  wandb:
    project: "levanter"
    tags: ["openwebtext", "mistral"]

  mp: p=f32,c=bfloat16
  train_batch_size: 256  # set for v4-64 TPU
  num_train_steps: 1000
  steps_per_eval: 50
  tensor_parallel_axes: ["mlp", "heads"]
  fsdp_axis: "embed"
  batch_axis: "batch"
optimizer:
  learning_rate: 1.2E-5  # set low for fine-tuning
  weight_decay: 0.1
  min_lr_ratio: 0.1
2 changes: 1 addition & 1 deletion docs/Configuration-Guide.md
@@ -13,7 +13,7 @@ class TrainLmConfig:
data: LMDatasetConfig = field(default_factory=LMDatasetConfig)
trainer: TrainerConfig = field(default_factory=TrainerConfig)
model: LmConfig = field(default_factory=Gpt2Config)
-optimizer: OptimizerConfig = field(default_factory=OptimizerConfig)
+optimizer: OptimizerConfig = field(default_factory=AdamConfig)
```

Your training run will typically be associated with a single config file. For instance, you might have a file
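
As a hedged illustration only (not part of the commit): using the section names from `TrainLmConfig` above, with values copied from the config files earlier in this commit, such a single config file might look roughly like:

```
# hypothetical example config; values are illustrative, taken from configs in this commit
data:
  id: dlwh/wikitext_103_detokenized
model:
  type: gpt2
  hidden_dim: 32
trainer:
  train_batch_size: 32
  per_device_parallelism: -1
optimizer:
  learning_rate: 1.2E-5
  weight_decay: 0.1
```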