diff --git a/config/backpack.yaml b/config/backpack.yaml index 493be77a3..735d40c01 100644 --- a/config/backpack.yaml +++ b/config/backpack.yaml @@ -18,7 +18,6 @@ trainer: num_train_steps: 50000 train_batch_size: 1024 - per_device_parallelism: 4 model_axis_size: 1 optimizer: diff --git a/config/backpack_nano.yaml b/config/backpack_nano.yaml index 41b97d160..7bcc8ab6f 100644 --- a/config/backpack_nano.yaml +++ b/config/backpack_nano.yaml @@ -15,7 +15,6 @@ trainer: num_train_steps: 100 train_batch_size: 32 - per_device_parallelism: 1 model_axis_size: 1 optimizer: diff --git a/config/gpt2_1536.yaml b/config/gpt2_1536.yaml index a3633bf65..bbce6e1f6 100644 --- a/config/gpt2_1536.yaml +++ b/config/gpt2_1536.yaml @@ -14,7 +14,6 @@ trainer: mp: p=f32,c=bfloat16 model_axis_size: 1 - per_device_parallelism: 2 per_device_eval_parallelism: 8 optimizer: learning_rate: 1E-4 diff --git a/config/gpt2_1536_sophiah.yaml b/config/gpt2_1536_sophiah.yaml index 0d1008106..83338d202 100644 --- a/config/gpt2_1536_sophiah.yaml +++ b/config/gpt2_1536_sophiah.yaml @@ -20,8 +20,6 @@ trainer: mp: p=f32,c=bfloat16 model_axis_size: 1 - per_device_parallelism: 2 - per_device_eval_parallelism: 8 optimizer: type: sophia-h learning_rate: 2E-4 diff --git a/config/gpt2_20b.yaml b/config/gpt2_20b.yaml index 6f5f40e1b..e33067338 100644 --- a/config/gpt2_20b.yaml +++ b/config/gpt2_20b.yaml @@ -19,7 +19,6 @@ trainer: mp: p=f32,c=bfloat16 - per_device_parallelism: 4 per_device_eval_parallelism: 4 train_batch_size: 1024 diff --git a/config/gpt2_medium.yaml b/config/gpt2_medium.yaml index 47e21799c..2451153ac 100644 --- a/config/gpt2_medium.yaml +++ b/config/gpt2_medium.yaml @@ -14,7 +14,6 @@ trainer: mp: p=f32,c=bfloat16 model_axis_size: 1 - per_device_parallelism: 16 optimizer: learning_rate: 3E-4 weight_decay: 0.1 diff --git a/config/gpt2_micro.yaml b/config/gpt2_micro.yaml index 0a8283e78..d2ef2c3d5 100644 --- a/config/gpt2_micro.yaml +++ b/config/gpt2_micro.yaml @@ -13,5 +13,4 @@ trainer: mp: p=f32,c=bfloat16 num_train_steps: 100 per_device_eval_parallelism: 1 - per_device_parallelism: 4 train_batch_size: 32 diff --git a/config/gpt2_nano_mixture.yaml b/config/gpt2_nano_mixture.yaml index b089d090e..46ee59853 100644 --- a/config/gpt2_nano_mixture.yaml +++ b/config/gpt2_nano_mixture.yaml @@ -22,7 +22,6 @@ trainer: save_interval: 5m per_device_eval_parallelism: 1 - per_device_parallelism: 1 train_batch_size: 32 tensor_parallel_axes: ["mlp", "heads"] diff --git a/config/gpt2_small_fast_mix.yaml b/config/gpt2_small_fast_mix.yaml index ca9fa2ca6..deb2fd7c0 100644 --- a/config/gpt2_small_fast_mix.yaml +++ b/config/gpt2_small_fast_mix.yaml @@ -27,7 +27,6 @@ trainer: mp: p=f32,c=bfloat16 model_axis_size: 1 - per_device_parallelism: 8 train_batch_size: 256 num_train_steps: 20000 diff --git a/config/gpt2_small_fast_pile.yaml b/config/gpt2_small_fast_pile.yaml index a0336da45..3a21732a7 100644 --- a/config/gpt2_small_fast_pile.yaml +++ b/config/gpt2_small_fast_pile.yaml @@ -14,7 +14,6 @@ trainer: mp: p=f32,c=bfloat16 model_axis_size: 1 - per_device_parallelism: 8 train_batch_size: 256 num_train_steps: 20000 diff --git a/config/gpt2_small_fast_sophia_h.yaml b/config/gpt2_small_fast_sophia_h.yaml index 671acec8f..0037664f1 100644 --- a/config/gpt2_small_fast_sophia_h.yaml +++ b/config/gpt2_small_fast_sophia_h.yaml @@ -14,7 +14,6 @@ trainer: mp: p=f32,c=bfloat16 model_axis_size: 1 - per_device_parallelism: 8 train_batch_size: 256 num_train_steps: 20000 diff --git a/config/gpt2_small_pile.yaml b/config/gpt2_small_pile.yaml index 19512c3dd..07aeb24ee 100644 --- a/config/gpt2_small_pile.yaml +++ b/config/gpt2_small_pile.yaml @@ -14,7 +14,6 @@ trainer: mp: p=f32,c=bfloat16 model_axis_size: 1 - per_device_parallelism: 8 train_batch_size: 256 num_train_steps: 50000 diff --git a/config/gpt2_small_pile_mixture.yaml b/config/gpt2_small_pile_mixture.yaml index a79ec8052..c6c5338cd 100644 --- a/config/gpt2_small_pile_mixture.yaml +++ b/config/gpt2_small_pile_mixture.yaml @@ -14,7 +14,6 @@ trainer: mp: p=f32,c=bfloat16 model_axis_size: 1 - per_device_parallelism: 8 train_batch_size: 256 num_train_steps: 50000 diff --git a/config/gpt2_xl.yaml b/config/gpt2_xl.yaml index 026fc077e..70239ad1b 100644 --- a/config/gpt2_xl.yaml +++ b/config/gpt2_xl.yaml @@ -12,7 +12,6 @@ trainer: project: "levanter" tags: [ "openwebtext", "gpt2"] mp: p=f32,c=bfloat16 - per_device_parallelism: 1 optimizer: learning_rate: 1E-4 weight_decay: 0.1 diff --git a/config/llama2_7b_continued.yaml b/config/llama2_7b_continued.yaml index edb72a7e4..1c16a2f16 100644 --- a/config/llama2_7b_continued.yaml +++ b/config/llama2_7b_continued.yaml @@ -14,7 +14,6 @@ trainer: mp: p=f32,c=bfloat16 model_axis_size: 1 - per_device_parallelism: 4 per_device_eval_parallelism: 4 train_batch_size: 1024