add default llama2 config converter (#563)
Co-authored-by: Duy Phung <[email protected]>
1 parent ffa5ba1, commit 78c7faa
Showing 2 changed files with 167 additions and 1 deletion.
@@ -0,0 +1,166 @@
name: megatron_llama2
restore_from_path: null # used when starting from a .nemo file

trainer:
  devices: 8
  num_nodes: 4
  accelerator: gpu
  precision: bf16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
  max_steps: 200 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 1
  val_check_interval: 20
  # check_val_every_n_epoch: null
  limit_val_batches: 2
  limit_test_batches: 0
  accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
  gradient_clip_val: 1.0
  benchmark: False
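
The consumed_samples comment above combines with the batch and parallelism settings defined later in the model section. A minimal sketch of the arithmetic, assuming the values in this config (the variable names are illustrative and not part of the trlX or NeMo APIs):

```python
# Batch-size bookkeeping for this config (illustrative only).
devices, num_nodes = 8, 4                     # trainer.devices, trainer.num_nodes
tensor_parallel, pipeline_parallel = 4, 1     # model.tensor/pipeline_model_parallel_size
micro_batch_size, global_batch_size = 1, 32   # model.micro_batch_size, model.global_batch_size
max_steps = 200                               # trainer.max_steps

world_size = devices * num_nodes                                                   # 32 GPUs
data_parallel_size = world_size // (tensor_parallel * pipeline_parallel)           # 8
grad_accumulation = global_batch_size // (micro_batch_size * data_parallel_size)   # 4

# consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches,
# which with automatic gradient accumulation amounts to global_step * global_batch_size.
consumed_samples = max_steps * global_batch_size
print(world_size, data_parallel_size, grad_accumulation, consumed_samples)  # 32 8 4 6400
```
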
exp_manager:
  # set this to save checkpoints
  explicit_log_dir: ppo_sentiments_logs
  exp_dir: null
  name: megatron_gpt_7b_ppo_sentiments
  create_tensorboard_logger: False
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: trlxnemo
    name: megatron_gpt_7b_ppo_sentiments
  resume_if_exists: False
  resume_ignore_no_checkpoint: True
  # set this to save checkpoints
  create_checkpoint_callback: False
  checkpoint_callback_params:
    monitor: reduced_train_loss
    save_top_k: 1
    mode: min
    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
    save_nemo_on_train_end: True # not recommended when training large models on clusters with short time limits
    filename: 'megatron_gpt-{reduced_train_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
  log_step_timing: True
  step_timing_kwargs:
    sync_cuda: True
    buffer_size: 5
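
The model_parallel_size entry above uses an OmegaConf ${multiply:...} interpolation. The training code registers the multiply resolver itself; the sketch below registers it explicitly only to stay self-contained, and the file name is a placeholder for wherever this config is saved:

```python
from omegaconf import OmegaConf

# The 'multiply' resolver is normally registered by the training code;
# registering it here keeps the example self-contained.
OmegaConf.register_new_resolver("multiply", lambda x, y: x * y, replace=True)

cfg = OmegaConf.load("megatron_llama2.yaml")  # placeholder path to this config file
mp_size = cfg.exp_manager.checkpoint_callback_params.model_parallel_size
print(mp_size)  # 4 * 1 = 4 with tensor_model_parallel_size=4, pipeline_model_parallel_size=1
```
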
model:
  padded_vocab_size: 32000
  micro_batch_size: 1
  global_batch_size: 32
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 1
  resume_from_checkpoint: null # manually set the checkpoint file to load from
  # model architecture
  encoder_seq_length: 2048
  max_position_embeddings: 2048
  num_layers: 32
  hidden_size: 4096
  ffn_hidden_size: 11008 # Transformer FFN hidden size. Usually 4 * hidden_size.
  num_attention_heads: 32
  init_method_std: 0.02 # Standard deviation of the zero-mean normal distribution used for weight initialization.
  use_scaled_init_method: True # use scaled residuals initialization
  hidden_dropout: 0. # Dropout probability for hidden state transformer.
  attention_dropout: 0. # Dropout probability for attention
  ffn_dropout: 0.0 # Dropout probability in the feed-forward layer.
  kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
  apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number.
  layernorm_epsilon: 1e-6
  transformer_block_type: pre_ln
  do_layer_norm_weight_decay: False # True means weight decay on all params
  make_vocab_size_divisible_by: 1 # Pad the vocab size to be divisible by this value for computation efficiency.
  pre_process: True # add embedding
  post_process: True # add pooler
  persist_layer_norm: True # Use of persistent fused layer norm kernel.
  # Need to set this to remove bias terms in LN in NeMo
  normalization: rmsnorm
  bias: False # Whether to use bias terms in all weight matrices.
  bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
  bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
  activation: 'swiglu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu']
  headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
  rotary_percentage: 1
  megatron_legacy: True
  position_embedding_type: 'rope' # Options ['rotary', 'learned', 'rope']
  share_embeddings_and_output_weights: false
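
The architecture numbers above match Llama-2-7B. A rough sanity check of the derived quantities, assuming standard multi-head-attention and SwiGLU parameter shapes (an estimate, not NeMo's exact accounting):

```python
# Rough parameter count from the hyperparameters above (ignores the small RMSNorm
# weights; biases are disabled in this config).
vocab, hidden, layers, ffn, heads = 32000, 4096, 32, 11008, 32

head_dim = hidden // heads                 # kv_channels resolves to 128 when left null
attn_params = 4 * hidden * hidden          # Q, K, V and output projections
ffn_params = 3 * hidden * ffn              # SwiGLU uses gate, up and down projections
per_layer = attn_params + ffn_params

# share_embeddings_and_output_weights is false, so input embedding and output head are separate.
total = layers * per_layer + 2 * vocab * hidden
print(head_dim, round(total / 1e9, 2))     # 128, ~6.74 billion parameters
```
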
  ## Activation Checkpointing
  activations_checkpoint_granularity: 'selective' # 'selective' or 'full'
  activations_checkpoint_method: 'uniform' # 'uniform', 'block', not used with 'selective'
  activations_checkpoint_num_layers: null # not used with 'selective'

  ## Sequence Parallelism
  sequence_parallel: True

  # tokenizer:
  #   library: 'megatron'
  #   type: 'GPT2BPETokenizer'
  #   model: null
  #   vocab_file: null
  #   merge_file: null
  #   delimiter: null # only used for tabular tokenizer
  #   sentencepiece_legacy: false # Legacy=True allows you to add special tokens to sentencepiece tokenizers.
  tokenizer:
    library: 'huggingface'
    type: 'NousResearch/Llama-2-7b-hf'
    model: null
    vocab_file: null
    merge_file: null
    delimiter: null # only used for tabular tokenizer
    sentencepiece_legacy: false # Legacy=True allows you to add special tokens to sentencepiece tokenizers.
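
With library: 'huggingface', the tokenizer is pulled from the Hugging Face Hub by name. A quick way to check that its vocabulary matches padded_vocab_size, using the standard transformers API (downloading requires network access):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf")
print(tokenizer.vocab_size)               # 32000, matching model.padded_vocab_size
print(tokenizer("Hello Llama").input_ids)
```
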
  # precision
  native_amp_init_scale: 4294967296 # 2 ** 32
  native_amp_growth_interval: 1000
  hysteresis: 2 # Gradient scale hysteresis
  fp32_residual_connection: False # Move residual connections to fp32
  fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

  # Megatron O2-style half-precision
  megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters
  grad_allreduce_chunk_size_mb: 125
  sync_batch_comm: False
  # miscellaneous
  seed: 1234
  use_cpu_initialization: False # Init weights on the CPU (slow for large models)
  onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
  apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
  gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

  data:
    data_prefix:
      - dataset: hh
    index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
    data_impl: mmap
    splits_string: 900,50,50
    seq_length: ${model.encoder_seq_length}
    skip_warmup: True
    num_workers: 2
    dataloader_type: cyclic
    reset_position_ids: False # Reset position ids after end-of-document token
    reset_attention_mask: False # Reset attention mask after end-of-document token
    eod_mask_loss: False # Mask loss for the end of document tokens
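
splits_string gives relative weights for the train/validation/test splits. A small sketch of how 900,50,50 divides the data (an illustration of the convention, not NeMo's actual parsing code):

```python
splits_string = "900,50,50"

weights = [float(w) for w in splits_string.split(",")]
fractions = [w / sum(weights) for w in weights]
print(fractions)  # [0.9, 0.05, 0.05] -> 90% train, 5% validation, 5% test
```
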

  # Nsys profiling options
  nsys_profile:
    enabled: False
    start_step: 10 # Global batch to start profiling
    end_step: 10 # Global batch to end profiling
    ranks: [0, 4, 8, 12] # Global rank IDs to profile
    gen_shape: False # Generate model and kernel details including input shapes
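
When nsys_profile.enabled is turned on, the trainer brackets the chosen global steps with CUDA profiler start/stop calls so an externally launched Nsight Systems session captures only that window. A minimal sketch of the idea (hypothetical loop, not NeMo's actual implementation):

```python
import torch

# Hypothetical training loop showing how start_step/end_step/ranks could gate profiling.
start_step, end_step, ranks = 10, 10, [0, 4, 8, 12]
global_rank = 0  # would come from the distributed runtime

for step in range(200):
    if step == start_step and global_rank in ranks:
        torch.cuda.profiler.start()   # cudaProfilerStart(): Nsight begins capturing
    # ... forward / backward / optimizer step ...
    if step == end_step and global_rank in ranks:
        torch.cuda.profiler.stop()    # cudaProfilerStop(): Nsight stops capturing
```
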

  optim:
    name: distributed_fused_adam
    lr: 1.0e-6
    weight_decay: 0.0 # 1.0e-6
    betas:
      - 0.9
      - 0.95
    sched:
      name: CosineAnnealing
      max_steps: 200
      min_lr: 1.0e-6
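
The scheduler is a standard cosine annealing over max_steps. Note that lr and sched.min_lr are both 1.0e-6 here, so the curve is effectively flat; the sketch below shows the usual cosine-decay formula and what it yields with these values (an illustration, not NeMo's scheduler code):

```python
import math

lr, min_lr, max_steps = 1.0e-6, 1.0e-6, 200

def cosine_annealing(step: int) -> float:
    # Standard cosine decay from lr down to min_lr over max_steps (no warmup).
    progress = min(step, max_steps) / max_steps
    return min_lr + 0.5 * (lr - min_lr) * (1 + math.cos(math.pi * progress))

print(cosine_annealing(0), cosine_annealing(100), cosine_annealing(200))
# 1e-06 1e-06 1e-06 -> constant because lr == min_lr in this config
```
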