cherry pick of NVIDIA#9266 (NVIDIA#9411)

* add deprecation warnings for non-mcore models Signed-off-by: dimapihtar <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* change warning default time Signed-off-by: dimapihtar <[email protected]>
* remove unused import Signed-off-by: dimapihtar <[email protected]>
* Apply isort and black reformatting Signed-off-by: dimapihtar <[email protected]>
* remove deprecated tests Signed-off-by: dimapihtar <[email protected]>
* set mcore_gpt to True Signed-off-by: dimapihtar <[email protected]>
* set mcore_bert to True Signed-off-by: dimapihtar <[email protected]>
* remove deprecated tests Signed-off-by: dimapihtar <[email protected]>
* remove deprecated unit tests Signed-off-by: dimapihtar <[email protected]>
* add deprecation warning Signed-off-by: dimapihtar <[email protected]>
* Apply isort and black reformatting Signed-off-by: dimapihtar <[email protected]>
* remove deprecated playbook Signed-off-by: dimapihtar <[email protected]>
* remove deprecated tutorial Signed-off-by: dimapihtar <[email protected]>
* turn off FA for Bert Signed-off-by: dimapihtar <[email protected]>
* turn of FA for Bert Signed-off-by: dimapihtar <[email protected]>
* change mcore commit Signed-off-by: dimapihtar <[email protected]>
* adjustments
* update TE commit Signed-off-by: dimapihtar <[email protected]>
* fix mcore precision issue Signed-off-by: dimapihtar <[email protected]>
* change precision for bert Signed-off-by: dimapihtar <[email protected]>
* change precision for fine-tuning Signed-off-by: dimapihtar <[email protected]>
* turn off fused attention for bert Signed-off-by: dimapihtar <[email protected]>
* fix bert test Signed-off-by: dimapihtar <[email protected]>
* revert tests Signed-off-by: dimapihtar <[email protected]>
* fix typo Signed-off-by: dimapihtar <[email protected]>
* remove unnecessary Signed-off-by: dimapihtar <[email protected]>

---------

Signed-off-by: dimapihtar <[email protected]>
Signed-off-by: dimapihtar <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: dimapihtar <[email protected]>
Co-authored-by: Pablo Garay <[email protected]>
jgerh · Jun 10, 2024 · 27de845 · 27de845
1 parent 0fe2194
commit 27de845
Show file tree

Hide file tree

Showing 58 changed files with 1,252 additions and 8,709 deletions.
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml
@@ -5,7 +5,7 @@ trainer:
   devices: 1
   num_nodes: 1
   accelerator: gpu
-  precision: 16
+  precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
   use_distributed_sampler: False
@@ -41,7 +41,7 @@ exp_manager:
 
 model:
   # model parallelism 
-  mcore_bert: False
+  mcore_bert: True
   micro_batch_size: 4
   global_batch_size: 8
   tensor_model_parallel_size: 1
@@ -85,7 +85,7 @@ model:
   fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16
 
   # Megatron O2-style half-precision
-  megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
+  megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters
   grad_allreduce_chunk_size_mb: 125
   grad_div_ar_fusion: False 
 
@@ -158,4 +158,4 @@ model:
       name: CosineAnnealing
       warmup_steps: 500
       constant_steps: 50000
-      min_lr: 2e-5
+      min_lr: 2e-5
diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -9,7 +9,7 @@ trainer:
   devices: 1
   num_nodes: 1
   accelerator: gpu
-  precision: 16
+  precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
   use_distributed_sampler: False
@@ -56,7 +56,7 @@ exp_manager:
 
 model:
   # use GPTModel from megatron.core
-  mcore_gpt: False
+  mcore_gpt: True
 
   # specify micro_batch_size, global_batch_size, and model parallelism
   # gradient accumulation will be done automatically based on data_parallel_size
@@ -121,7 +121,7 @@ model:
   fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16
 
   # Megatron O2-style half-precision
-  megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
+  megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters
   grad_allreduce_chunk_size_mb: 125
 
   # Fusion

diff --git a/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py
@@ -17,6 +17,7 @@
 
 from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor
 from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample
+from nemo.utils.decorators import deprecated_warning
 
 __all__ = ['DialogueAssistantDataProcessor']
 
@@ -31,6 +32,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg):
             data_dir: path to data directory
             tokenizer: tokenizer object
         """
+        # deprecation warning
+        deprecated_warning("DialogueAssistantDataProcessor")
+
         self.data_dir = data_dir
         self._tokenizer = tokenizer
         self.cfg = cfg
@@ -69,16 +73,15 @@ def open_file(self, filename):
 
     @staticmethod
     def get_continuous_slots(slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids):
-
         """
         Extract continuous spans of slot_ids
 
-        To accomodate slots with distinct labels for B-label1 and I-label1, 
+        To accomodate slots with distinct labels for B-label1 and I-label1,
         slot_id = self.bio_slot_ids_to_unified_slot_ids[slot_id] is called to map them both to label1
-        
+
         Args:
             Slot: list of int representing slot of each word token
-            For instance, 54 54 54 54 54 54 54 54 18 54 44 44 54 46 46 54 12 
+            For instance, 54 54 54 54 54 54 54 54 18 54 44 44 54 46 46 54 12
             Corresponds to "please set an alarm clock for my next meeting with the team at three pm next friday"
             Except for the empty_slot_id (54 in this case), we hope to extract the continuous spans of tokens,
             each containing a start position and an exclusive end position
@@ -124,7 +127,7 @@ def map_bio_format_slots_to_unified_slots(slots):
     def get_dialog_examples(self, dataset_split: str):
         """
         Process raw files into DialogueInputExample
-        Args: 
+        Args:
             dataset_split: {train, dev, test}
         For the assistant dataset, there is no explicit dev set (instead uses the test set as the dev set)
         Therefore, this function creates a dev set and a new train set from the train set.
@@ -177,7 +180,11 @@ def get_dialog_examples(self, dataset_split: str):
                 "labels": {"service": intent.split('_')[0], "intent": intent, "slots": slot_to_words},
                 "label_positions": {
                     "slots": {
-                        slot: {"start": position[0], "exclusive_end": position[1], "slot": slot,}
+                        slot: {
+                            "start": position[0],
+                            "exclusive_end": position[1],
+                            "slot": slot,
+                        }
                         for slot, position in slot_to_start_and_exclusive_end.items()
                     }
                 },

diff --git a/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py
@@ -17,6 +17,7 @@
 import random
 
 from nemo.collections.nlp.data.data_utils.data_preprocessing import DataProcessor
+from nemo.utils.decorators import deprecated_warning
 
 __all__ = ['DialogueDataProcessor']
 
@@ -40,6 +41,9 @@ class DialogueDataProcessor(DataProcessor):
     """
 
     def __init__(self):
+        # deprecation warning
+        deprecated_warning("DialogueDataProcessor")
+
         raise NotImplementedError()
 
     def get_train_examples(self):
@@ -58,8 +62,8 @@ def get_test_examples(self):
     def get_relevant_idxs(dataset_split, n_samples, dev_proportion):
         """
         Obtain indexes for each dataset_split, when train and dev sets are not in separate files
-        
-        Args: 
+
+        Args:
             dataset_split: train, dev or test
             n_samples: total number of samples
             dev_proportion: value from 1 to 99 that represent proportion of data in dev set

diff --git a/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py
@@ -19,6 +19,7 @@
 
 from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor
 from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample
+from nemo.utils.decorators import deprecated_warning
 
 __all__ = ['DialogueDesignDataProcessor']
 
@@ -34,6 +35,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None):
             tokenizer: tokenizer object
             cfg: cfg container for dataset
         """
+        # deprecation warning
+        deprecated_warning("DialogueDesignDataProcessor")
+
         self.data_dir = data_dir
         self._tokenizer = tokenizer
         self.cfg = cfg
@@ -50,7 +54,7 @@ def open_csv(self, filename):
     def get_dialog_examples(self, dataset_split: str):
         """
         Process raw files into DialogueInputExample
-        Args: 
+        Args:
             dataset_split: {train, dev, test}
         Dev set contains self.cfg.dev_proportion % of samples with the rest going into the train set
         Test set contains the whole dataset (Dev + Train) as this dataset is small (~100) and primarily used in a zero shot setting

diff --git a/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py
@@ -19,13 +19,13 @@
 
 from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor
 from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample
+from nemo.utils.decorators import deprecated_warning
 
 __all__ = ['DialogueMellonQADataProcessor']
 
 
 class DialogueMellonQADataProcessor(DialogueDataProcessor):
-    """Data Processor for Mellon QA dialogues. 
-    """
+    """Data Processor for Mellon QA dialogues."""
 
     def __init__(self, data_dir: str, tokenizer: object, cfg=None):
         """
@@ -35,6 +35,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None):
             tokenizer: tokenizer object
             cfg: cfg container for dataset
         """
+        # deprecation warning
+        deprecated_warning("DialogueMellonQADataProcessor")
+
         self.data_dir = data_dir
         self._tokenizer = tokenizer
         self.cfg = cfg
@@ -51,7 +54,7 @@ def open_csv(self, filename):
     def get_dialog_examples(self, dataset_split: str):
         """
         Process raw files into DialogueInputExample
-        Args: 
+        Args:
             dataset_split: {train, dev, test}
         For the Mellon QA dataset, there is no explicit dev set (instead uses the test set as the dev set)
         Therefore, this function creates a dev set and a new train set from the train set.
@@ -82,7 +85,11 @@ def get_dialog_examples(self, dataset_split: str):
             input_example = {
                 "utterance": utterance,
                 "example_id": i,
-                "labels": {"response": answer, "fluent_response": well_formed_answer, "passage": passage,},
+                "labels": {
+                    "response": answer,
+                    "fluent_response": well_formed_answer,
+                    "passage": passage,
+                },
             }
             example = DialogueInputExample(input_example)
             examples.append(example)

diff --git a/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py
@@ -19,15 +19,16 @@
 
 from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor
 from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample
+from nemo.utils.decorators import deprecated_warning
 
 __all__ = ['DialogueMSMarcoDataProcessor']
 
 
 class DialogueMSMarcoDataProcessor(DialogueDataProcessor):
     """Data Processor for MS Marco dialogues. (https://github.com/microsoft/MSMARCO-Question-Answering)
-       Please agree to the Terms of Use before downloading data at 
-       https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz
-       https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz
+    Please agree to the Terms of Use before downloading data at
+    https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz
+    https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz
     """
 
     def __init__(self, data_dir: str, tokenizer: object, cfg=None):
@@ -39,6 +40,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None):
             debug_mode: reduce number of samples to load in order to increase speed of processing
             cfg: cfg container for dataset
         """
+        # deprecation warning
+        deprecated_warning("DialogueMSMarcoDataProcessor")
+
         self.data_dir = data_dir
         self._tokenizer = tokenizer
         self.cfg = cfg
@@ -55,7 +59,7 @@ def open_json(self, filename):
     def get_dialog_examples(self, dataset_split: str):
         """
         Process raw files into DialogueInputExample
-        Args: 
+        Args:
             dataset_split: {train, dev, test}
         For the MS Marco dataset, there is no explicit dev set (instead uses the test set as the dev set)
         Therefore, this function creates a dev set and a new train set from the train set.

diff --git a/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py
@@ -28,6 +28,7 @@
 from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample
 from nemo.collections.nlp.data.dialogue.sgd.schema import Schema
 from nemo.utils import logging
+from nemo.utils.decorators import deprecated_warning
 from nemo.utils.get_rank import is_global_rank_zero
 
 __all__ = ['DialogueSGDDataProcessor']
@@ -51,7 +52,7 @@ class DialogueSGDDataProcessor(DialogueDataProcessor):
         #   git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue.git
 
     ***Data format***
-    SGD data comes with a JSON schema file and dialogue files for each dataset split. 
+    SGD data comes with a JSON schema file and dialogue files for each dataset split.
 
     In the following we will show an example for a service entry in the schema file.
     * service_name
@@ -70,7 +71,7 @@ class DialogueSGDDataProcessor(DialogueDataProcessor):
         * result_slots (not used)
 
 
-    In the following we will show an example for a dialogue. 
+    In the following we will show an example for a dialogue.
     * dialogue_id
     * services
     * turns
@@ -87,14 +88,18 @@ class DialogueSGDDataProcessor(DialogueDataProcessor):
             * state
                 * active_intent
                 * requeste_slots
-                * slot_values 
+                * slot_values
         * speaker - [USER, SYSTEM]
         * utterance
 
     """
 
     def __init__(
-        self, data_dir: str, dialogues_example_dir: str, tokenizer: object, cfg=None,
+        self,
+        data_dir: str,
+        dialogues_example_dir: str,
+        tokenizer: object,
+        cfg=None,
     ):
         """
         Constructs DialogueSGDDataProcessor
@@ -104,6 +109,9 @@ def __init__(
             tokenizer: tokenizer object
             cfg: cfg container for dataset
         """
+        # deprecation warning
+        deprecated_warning("DialogueSGDDataProcessor")
+
         self.data_dir = data_dir
         self.cfg = cfg
 
@@ -213,7 +221,7 @@ def get_labels(self):
 
     def get_dialog_examples(self, dataset_split: str) -> List[object]:
         """
-        Loads preprocessed dialogue examples from disk. 
+        Loads preprocessed dialogue examples from disk.
         Args:
             dataset_split: dataset split
         Returns:
@@ -260,7 +268,7 @@ def _generate_dialog_examples(self, dataset_split: str, schemas: object, subsamp
         Returns a list of `InputExample`s of the data splits' dialogues.
         Args:
             dataset_split: data split, can be "train", "dev", or "test".
-            schemas: schema for all services of all datasets 
+            schemas: schema for all services of all datasets
             subsample: whether to balance postive and negative samples in the dataset
         Returns:
             examples: a list of `InputExample`s.
@@ -447,9 +455,9 @@ def _create_examples_from_turn(
                 "example_id_num": example_id_num,
                 "utterance": user_utterance,
                 "system_utterance": system_utterance,
-                "system_slots": {slot["slot"]: slot for slot in system_frame["slots"]}
-                if system_frame is not None
-                else None,
+                "system_slots": (
+                    {slot["slot"]: slot for slot in system_frame["slots"]} if system_frame is not None else None
+                ),
                 "system_actions": system_frame["actions"] if system_frame is not None else None,
                 "labels": {
                     "service": service,
@@ -464,9 +472,11 @@ def _create_examples_from_turn(
                         for intent in schemas.get_service_schema(service).intents
                     ],
                     "slots": {
-                        slot: schemas.get_service_schema(service).get_categorical_slot_values(slot)
-                        if slot in categorical_slots
-                        else []
+                        slot: (
+                            schemas.get_service_schema(service).get_categorical_slot_values(slot)
+                            if slot in categorical_slots
+                            else []
+                        )
                         for slot in all_possible_slots
                     },
                 },