diff --git a/demo/NeMo/.gitignore b/demo/NeMo/.gitignore deleted file mode 100644 index af9bae11c..000000000 --- a/demo/NeMo/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -apex/ -Megatron-LM/ -NeMo/ -temp/ -__pycache__/ diff --git a/demo/NeMo/GPT3/GPT3ModelConfig.py b/demo/NeMo/GPT3/GPT3ModelConfig.py deleted file mode 100644 index 0e50d6cec..000000000 --- a/demo/NeMo/GPT3/GPT3ModelConfig.py +++ /dev/null @@ -1,87 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Base Class -import sys -sys.path.append('../../HuggingFace') # Include HuggingFace directory -from NNDF.networks import NNConfig, NetworkMetadata - -class GPT3ModelTRTConfig(NNConfig): - - NETWORK_FULL_NAME = "full" - TARGET_MODELS = [ - "gpt-126m", - "gpt-1.3b", - "gpt-5b", - ] - - def __init__( - self, - metadata, - **kwargs - ): - super().__init__( - network_name="GPT3", - **kwargs - ) - self.nemo_config = None - self.use_mask = False - self.metadata = metadata - self.variant = metadata.variant - - def from_nemo_config(self, nemo_config): - self.nemo_config = nemo_config - - def get_metadata_string(self, metadata: NetworkMetadata) -> str: - """ - Serializes a Metadata object into string. - String will be checked if friendly to filenames across Windows and Linux operating systems. - This function is a modified version from HuggingFace/NNDF/networks.py. - - returns: - string: -[-]*- - """ - - enabled_precisions = self.nemo_config.trt_export_options - precision_str = "-".join( - [ - k for k, v in { - "fp8": enabled_precisions.use_fp8, - "fp16": enabled_precisions.use_fp16, - "bf16": enabled_precisions.use_bf16, - }.items() if v - ] - ) - - result = [self.network_name, metadata.variant] - if precision_str: - result.append(precision_str) - - # Append max sequence length - result.append("ms" + str(self.nemo_config.model.max_seq_len)) - - if metadata.use_cache: - result.append("kv_cache") - - final_str = "-".join(result) - assert self._is_valid_filename( - final_str - ), "Metadata for current network {} is not filename friendly: {}.".format( - self.network_name, final_str - ) - - return final_str diff --git a/demo/NeMo/GPT3/decoding.py b/demo/NeMo/GPT3/decoding.py deleted file mode 100644 index 2edf66e7b..000000000 --- a/demo/NeMo/GPT3/decoding.py +++ /dev/null @@ -1,453 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -from collections.abc import Iterable -import sys -from typing import List - -from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator -from megatron.core import parallel_state -from nemo.collections.nlp.modules.common.text_generation_strategy import GPTModelTextGenerationStrategy -from nemo.utils import AppState -import torch -import torch.nn.functional as F - -from GPT3.trt_utils import GPTTRTDecoder - -sys.path.append('../../HuggingFace') # Include HuggingFace -from NNDF.logger import G_LOGGER - - -def sample_sequence_batch( - model, - inference_strategy, - context_tokens, - context_lengths, - tokens_to_generate, - all_probs=False, - temperature=None, - extra={}, -): - def repetition_penalty(logits, repetition_penalty, used_tokens): - """ Implement the repetition penalty, check paper - https://arxiv.org/pdf/1909.05858.pdf - """ - if used_tokens is not None and repetition_penalty != 1.0: - logits_update = torch.gather(logits, 1, used_tokens) - logits = torch.scatter(logits, 1, used_tokens, logits_update / repetition_penalty) - return logits - - def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf'), started=None): - """ - This function has been mostly taken from huggingface conversational - ai code at - https://medium.com/huggingface/how-to-build-a-state-of-the-art- - conversational-ai-with-transfer-learning-2d818ac26313 - - @param logits: logits tensor - @param top_k: keep only top k tokens with highest probability - @param top_p: keep the top tokens with cumulative probability - @filter_value: value to set filtered tokens to - @started: a tensor of bools indicating whether the text generation starts for the batch - returns the filtered logits - """ - if top_k > 0: - # Remove all tokens with a probability less than the - # last token of the top-k - indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] - if started is not None: - for i in torch.arange(indices_to_remove.size(0))[started]: - logits[i, indices_to_remove[i]] = filter_value - else: - logits[indices_to_remove] = filter_value - - if top_p > 0.0: - # Cconvert to 1D - sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1) - cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) - - # Remove tokens with cumulative probability above the threshold - sorted_indices_to_remove = cumulative_probs > top_p - # Shift the indices to the right to keep also the first token - # above the threshold - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() - sorted_indices_to_remove[..., 0] = 0 - if started is not None: - for i in torch.arange(sorted_indices.size(0))[started]: - indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]] - logits[i, indices_to_remove] = filter_value - else: - for i in range(sorted_indices.size(0)): - indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]] - logits[i, indices_to_remove] = filter_value - - return logits - - app_state = AppState() - batch_size = context_tokens.shape[0] - if not (hasattr(model, "trt") or hasattr(model, "onnx")): - _reconfigure_microbatch_calculator( - rank=app_state.global_rank, - rampup_batch_size=None, - global_batch_size=batch_size, - micro_batch_size=batch_size, - data_parallel_size=1, - ) - - tokenizer = model.tokenizer - # initialize the batch - with torch.no_grad(): - context_length = context_lengths.min().item() - 
context_lengths_cpu = context_lengths.cpu() - inference_strategy.init_batch(context_tokens, context_length) - # added eos_id to support the function generate_samples_eval that passes - # eos_id as an argument and needs termination when that id id found. - eod_id = tokenizer.eos_id - counter = 0 - - tokens = context_tokens - output_logits = None - all_generated_indices = None # used to track all generated indices - # Generate enough tokens for the longest sequence - maxlen = tokens_to_generate + context_lengths.max().item() - maxlen = inference_strategy.clip_max_len(maxlen) - - is_done = torch.zeros([batch_size]).byte() - lengths = torch.ones([batch_size]).long() * maxlen - - use_cache = extra.get("use_cache", False) - is_onnx = hasattr(model, "onnx") - is_trt = hasattr(model, "trt") - - if is_trt: - assert isinstance(model.trt, GPTTRTDecoder) - input_ids_name = model.trt.get_input_ids_name() - input_ids_type = model.trt.get_torch_type(input_ids_name) - position_ids_name = model.trt.get_position_ids_name() - position_ids_type = model.trt.get_torch_type(position_ids_name) - attention_mask_name = model.trt.get_attention_mask_name() - if attention_mask_name != None: - attention_mask_type = model.trt.get_torch_type(attention_mask_name) - - position_ids = inference_strategy.position_ids - attention_mask = inference_strategy.attention_mask - - torch.cuda.nvtx.range_pop() # "Prepare Batch" - while context_length < maxlen: - torch.cuda.nvtx.range_push("I/O Setup") - - output = None - if is_onnx and use_cache: - G_LOGGER.warn(f"ONNX runtime path does not support KV-cache.") - - # Modify counter based on using cache or not. - if is_trt: - # TRT input preprocessing doesn't use nemo function - pass - elif not is_onnx and use_cache: - batch, tensor_shape = inference_strategy.prepare_batch_at_step( - tokens, maxlen, batch_size, counter, context_length - ) - else: - batch, tensor_shape = inference_strategy.prepare_batch_at_step( - tokens, maxlen, batch_size, 0, context_length # step is always 0 - ) - - # inputs input_ids: [BS, SEQ], position_ids: [BS, SEQ], attention_mask: [1, 1, SEQ, SEQ] - if is_trt: - context_mode = (use_cache and counter == 0) or not use_cache - if context_mode or not use_cache: - # context mode - batch_tokens = tokens[:, :context_length] - batch_position_ids = position_ids[:, :context_length] - else: - # generate mode - batch_tokens = tokens[:, context_length - 1].view(batch_size, -1) - batch_position_ids = position_ids[:, context_length - 1].view(batch_size, -1) - seq_len = batch_tokens.shape[1] - batch_attention_mask = attention_mask[0:1, 0:1, :seq_len, :seq_len] - input_ids = batch_tokens.type(input_ids_type).contiguous().cuda() - tensor_dict = {input_ids_name : (input_ids.data_ptr(), input_ids.shape)} - if position_ids_name != None: - batch_position_ids = batch_position_ids.type(position_ids_type).contiguous().cuda() - tensor_dict[position_ids_name] = (batch_position_ids.data_ptr(), batch_position_ids.shape) - if attention_mask_name != None: - batch_attention_mask = batch_attention_mask.type(attention_mask_type).contiguous().cuda() - tensor_dict[attention_mask_name] = (batch_attention_mask.data_ptr(), batch_attention_mask.shape) - - logits_name = model.trt.get_output_name() - torch.cuda.nvtx.range_pop() # "I/O Setup" - output = model.trt.run(logits_name, tensor_dict, seq_len, context_mode) - - elif is_onnx: - assert len(batch) == 5, "Length of batch must be 5." 
- ( - batch_tokens, - attention_mask, - position_ids, - set_inference_key_value_memory, - _, - ) = batch - seq_len = batch_tokens.shape[1] - attention_mask = attention_mask[0:1, 0:1, 0:seq_len, 0:seq_len] - - from onnxruntime import InferenceSession - assert isinstance(model.onnxrt, InferenceSession) - # Currently only support onnx runtime with cpu - # Our fp8 models don't currently use a user-provided attention_mask - tensor_dict = {'input_ids': batch_tokens.cpu().detach().numpy(), - 'position_ids': position_ids.cpu().detach().numpy()} - - def have_attention_mask(sess): - return any(inp.name == 'attention_mask' for inp in all_inputs) - - if have_attention_mask(model.onnxrt): - tensor_dict['attention_mask'] = attention_mask.cpu().detach().numpy() - torch.cuda.nvtx.range_pop() # "I/O Setup" - output = model.onnxrt.run(['logits'], tensor_dict)[0] - output = torch.Tensor(output).cuda() - # output logits: [BS, SEQ, 50304] - else: - # nemo path - torch.cuda.nvtx.range_pop() # "I/O Setup" - output = inference_strategy.forward_step(batch, tensor_shape) - output = output[0]['logits'].float() - - assert output is not None - torch.cuda.nvtx.range_push("Output Sampling") - output = output.float() - logits = output[:, -1].view(batch_size, -1).contiguous() - - # make sure it will generate at least min_length - min_length = extra.get('min_tokens_to_generate', 0) - if min_length > 0: - within_min_length = (context_length - context_lengths) < min_length - logits[within_min_length, eod_id] = -float('Inf') - - # make sure it won't sample outside the vocab_size range - logits[:, tokenizer.vocab_size :] = -float('Inf') - - # started indicates whether the current token step passes the context_length, so we make sure not to overwrite the context tokens - started = context_lengths_cpu <= context_length - if extra.get('greedy', False): - prev = torch.argmax(logits, dim=-1).view(-1) - else: - logits = logits.float() - logits /= temperature - # handle repetition penality - logits = repetition_penalty(logits, extra.get('repetition_penalty', 1.0), all_generated_indices) - logits = top_k_logits( - logits, top_k=extra.get('top_k', 0), top_p=extra.get('top_p', 0.9), started=started - ) - probs = F.softmax(logits, dim=-1) - prev = torch.multinomial(probs, num_samples=1).view(-1) - - prev = prev.cpu() - # Clamp the predicted out of vocabulary tokens - prev = torch.clamp(prev, max=tokenizer.vocab_size - 1) - # Replace sampled tokens w/ done token if EOD has already been sampled - new_tokens = torch.where(is_done, eod_id, prev) - # post process the inference tokens based on the strategy - inference_strategy.post_process(tokens, new_tokens, context_length) - - # Insert either new predicted or next prompt token - if extra.get("accuracy_mode", False): - # We only update the last token for accuracy mode. 
- at_prediction_index = (context_lengths + tokens_to_generate - 1 == context_length) - tokens[:, context_length] = torch.where(at_prediction_index, new_tokens.cuda(), tokens[:, context_length]) - else: - tokens[:, context_length] = torch.where(started.cuda(), new_tokens.cuda(), tokens[:, context_length]) - - if not extra.get("benchmark_mode", False): - if output_logits is None: - output = F.log_softmax(output[:, :context_length, :], 2) - indices = torch.unsqueeze(tokens[:, 1 : context_length + 1], 2) - output_logits = torch.gather(output, 2, indices).squeeze(2) - all_generated_indices = indices[:, :, 0] - if all_probs: - full_logits = output - else: - output = F.log_softmax(output, 2) - indices = torch.unsqueeze(new_tokens.cuda(), 1).unsqueeze(2) - new_output_logits = torch.gather(output, 2, indices).squeeze(2) - - # This copy can be optimized out by pre-allocating the memory. - output_logits = torch.cat([output_logits, new_output_logits], 1) - all_generated_indices = torch.cat([all_generated_indices, indices[:, :, 0]], 1) - if all_probs: - if extra.get("use_cache", False): - full_logits = torch.cat([full_logits, output], 1) - else: - full_logits = output - - done_token = (prev == eod_id) - done_token = done_token.byte() & started.byte() - - just_finished = (done_token & ~is_done).bool() - lengths[just_finished.view(-1)] = context_length - is_done = is_done | done_token - - done = torch.all(is_done) - torch.cuda.nvtx.range_pop() # "Output Sampling" - - context_length += 1 - counter += 1 - if done and not extra.get("benchmark_mode", False): - break - - if all_probs: - return tokens, context_length, lengths, output_logits, full_logits - return tokens, context_length, lengths, output_logits, None - -def initialize_ddp(model, cfg): - # check whether the DDP is initialized - if cfg.runtime == "nemo" and parallel_state.is_unitialized(): - def dummy(): - return - if model.trainer.strategy.launcher is not None: - model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) - model.trainer.strategy.setup_environment() - - if model.cfg.get('transformer_engine', False): - model.setup_transformer_engine_tp_groups() - -def get_special_tokens(tokenizer): - special_tokens = set() - if hasattr(tokenizer, 'pad_token') and tokenizer.pad_token is not None: - special_tokens.add(tokenizer.pad_token) - if hasattr(tokenizer, 'eos_token') and tokenizer.eos_token is not None: - special_tokens.add(tokenizer.eos_token) - if hasattr(tokenizer, 'bos_token') and tokenizer.bos_token is not None: - special_tokens.add(tokenizer.bos_token) - if hasattr(tokenizer, 'cls_token') and tokenizer.cls_token is not None: - special_tokens.add(tokenizer.cls_token) - if hasattr(tokenizer, 'unk_token') and tokenizer.unk_token is not None: - special_tokens.add(tokenizer.unk_token) - if hasattr(tokenizer, 'sep_token') and tokenizer.sep_token is not None: - special_tokens.add(tokenizer.sep_token) - if hasattr(tokenizer, 'mask_token') and tokenizer.mask_token is not None: - special_tokens.add(tokenizer.mask_token) - return special_tokens - -def process_output(model, output, return_segments=False): - torch.cuda.nvtx.range_push("Process Output") - inference_strategy = GPTModelTextGenerationStrategy(model) - tokenizer = model.tokenizer - if output is not None: - decode_tokens, output_logits, full_logits = output - decode_tokens = decode_tokens.cpu().numpy().tolist() - - # convert ids to text by applying tokenizer - resp_sentences = list(map(tokenizer.ids_to_text, decode_tokens)) - - all_offsets = [] - resp_sentences_seg = [] - if 
return_segments: - # segments sentences into words. - for decode_token in decode_tokens: - words = [] - for token in decode_token: - if not isinstance(token, Iterable): - token = [token] - word = tokenizer.ids_to_tokens(token) - if isinstance(word, Iterable): - word = word[0] - if hasattr(tokenizer.tokenizer, 'byte_decoder'): - word = bytearray([tokenizer.tokenizer.byte_decoder[c] for c in word]).decode( - 'utf-8', errors='replace' - ) - words.append(word) - resp_sentences_seg.append(words) - - # offsets calculation - special_tokens = get_special_tokens(tokenizer) - for item in resp_sentences_seg: - offsets = [0] - for index, token in enumerate(item): - if index != len(item) - 1: - if token in special_tokens: - offsets.append(offsets[-1]) - else: - offsets.append(len(token) + offsets[-1]) - all_offsets.append(offsets) - - output = {} - output['sentences'] = resp_sentences - output['tokens'] = resp_sentences_seg - output['logprob'] = output_logits - output['full_logprob'] = full_logits - output['token_ids'] = decode_tokens - output['offsets'] = all_offsets - output = inference_strategy.post_generation_process(output) - torch.cuda.nvtx.range_pop() # "Process Output" - return output - -def generate(model, inputs, cfg): - torch.cuda.nvtx.range_push("Prepare Batch") - initialize_ddp(model, cfg) - - tokens_to_generate = cfg.inference.tokens_to_generate - min_tokens_to_generate = cfg.inference.min_tokens_to_generate - add_BOS = cfg.inference.add_BOS - all_probs = cfg.inference.all_probs - temperature = cfg.inference.temperature - is_benchmark_mode = True if cfg.mode == "benchmark" else False - is_accuracy_mode = True if cfg.mode == "accuracy" else False - - inference_strategy = GPTModelTextGenerationStrategy(model) - if isinstance(inputs, tuple): - context_tokens_tensor, context_length_tensor = inputs - else: - context_tokens_tensor, context_length_tensor = inference_strategy.tokenize_batch( - inputs, tokens_to_generate, add_BOS - ) - - context_length = context_length_tensor.min().item() - - batch_token_result = sample_sequence_batch( - model, - inference_strategy, - context_tokens_tensor, - context_length_tensor, - tokens_to_generate, - all_probs, - temperature=temperature, - extra={ - "top_p": cfg.inference.top_p, - "top_k": cfg.inference.top_k, - "greedy": cfg.inference.greedy, - "repetition_penalty": cfg.inference.repetition_penalty, - "min_tokens_to_generate": min_tokens_to_generate, - "use_cache": cfg.use_cache, - "benchmark_mode": is_benchmark_mode, - "accuracy_mode": is_accuracy_mode, - "use_fp8_storage": cfg.onnx_export_options.use_fp8_storage, - }, - ) - - tokens, context_length, _, output_logits, full_logits = batch_token_result - - output = None - if tokens is not None: - output = tokens[:, :context_length], output_logits, full_logits - return output - -def full_inference(model, inputs, cfg): - output = generate(model, inputs, cfg) - if output is not None: - output = process_output(model, output, return_segments=(cfg.mode is not "benchmark")) - return output diff --git a/demo/NeMo/GPT3/frameworks.py b/demo/NeMo/GPT3/frameworks.py deleted file mode 100644 index 851f4cdf4..000000000 --- a/demo/NeMo/GPT3/frameworks.py +++ /dev/null @@ -1,81 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import sys - -import omegaconf - -# Add syspath for custom library -if __name__ == "__main__": - filepath = os.path.dirname(os.path.abspath(__file__)) - project_root = os.path.join(filepath, os.pardir) - sys.path.append(project_root) - -from GPT3.nemo_utils import load_nemo_model -from GPT3.GPT3ModelConfig import GPT3ModelTRTConfig -from interface import NeMoCommand - -sys.path.append('../../HuggingFace') # Include HuggingFace -from NNDF.interface import FRAMEWORK_NATIVE -from NNDF.networks import ( - NetworkModel, - NetworkModels, -) - -class GPT3NeMoTorch(NeMoCommand): - def __init__( - self, - nemo_cfg, - config_class=GPT3ModelTRTConfig, - description="Runs framework results for GPT3 model with NeMo.", - **kwargs - ): - super().__init__(nemo_cfg, config_class, description, model_classes=None, **kwargs) - self.framework_name = FRAMEWORK_NATIVE - - def setup_tokenizer_and_model(self): - self.nemo_cfg.runtime = 'nemo' - self.model = load_nemo_model(self.nemo_cfg) - self.tokenizer = self.model.tokenizer - - torch_models = [ - NetworkModel( - name=GPT3ModelTRTConfig.NETWORK_FULL_NAME, fpath=self.workspace.torch_path - ) - ] - return NetworkModels(torch=torch_models, onnx=None, trt=None) - - def process_framework_specific_arguments(self, onnx_model: str = None, **kwargs): - if onnx_model: - raise RuntimeError( - "native framework does not support loading an ONNX file via `onnx-model` yet. Please specify the NeMo model using `nemo-model` instead." - ) - - -# Entry point -def getGPT3NeMoTorch(): - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../config.yaml") - nemo_cfg = omegaconf.OmegaConf.load(config_path) - return GPT3NeMoTorch(nemo_cfg) - -# Entry point -RUN_CMD = getGPT3NeMoTorch() - -if __name__ == "__main__": - result = RUN_CMD() - print("Results: {}".format(result)) diff --git a/demo/NeMo/GPT3/lambada_dataset.py b/demo/NeMo/GPT3/lambada_dataset.py deleted file mode 100644 index a7945cec7..000000000 --- a/demo/NeMo/GPT3/lambada_dataset.py +++ /dev/null @@ -1,126 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import os -import collections -import json -import requests -import sys -import torch -from torch.nn.utils.rnn import pad_sequence - -# Add syspath for custom library -if __name__ == "__main__": - filepath = os.path.dirname(os.path.abspath(__file__)) - project_root = os.path.join(filepath, os.pardir) - sys.path.append(project_root) - -from nemo_export import create_dir_if_not_exist - -__all__ = ['Lambada'] - - -class Lambada(): - - def __init__(self, base_dir, tokens_to_generate, padding = -1, max_length = 2048): - assert tokens_to_generate >= 1 - assert padding == -1 or tokens_to_generate == 1 - self.base_dir = base_dir - self.tokens_to_generate = tokens_to_generate - self.padding = padding - self.max_length = max_length - self.download() - - def get_data_file_path(self): - path = os.path.join(self.base_dir, "lambada") - path = os.path.join(path, "lambada_test.jsonl") - create_dir_if_not_exist(path) - return path - - def download(self): - path = self.get_data_file_path() - if not os.path.exists(path): - url = "https://github.com/cybertronai/bflm/raw/master/lambada_test.jsonl" - with requests.get(url) as r, open(path, 'wb') as fh: - fh.write(r.content) - - def load(self): - path = self.get_data_file_path() - with open(path) as fh: - for line in fh: - yield json.loads(line) - - def _preprocess(self, text): - text = text.replace("“", '"') - text = text.replace("”", '"') - text = text.replace("’", "'") - text = text.replace("‘", "'") - return text - - def doc_to_text(self, doc): - return "\n" + self._preprocess(doc["text"].rsplit(" ", 1)[0]).strip() - - def doc_to_target(self, doc): - split_text = doc["text"].rsplit(" ", 1) - if len(split_text) <= 1: - raise ValueError(f"Input doc '{doc}' does not have target.") - return " " + self._preprocess(split_text[1]) - - def preprocess_input(self, tokenizer, docs): - _Input = collections.namedtuple("_DS_Input", ["inputs", "inp_enc", "lens", "lens_pad", "conti_len"]) - batch_size = len(docs) - tokens = [] - conti_lens = [] - lens = [] - inp_encs = [] - for doc in docs: - # Handle padded text - if not doc["text"]: - inp_enc = [0] - conti_len = 0 - else: - text = self.doc_to_text(doc) - target = self.doc_to_target(doc) - - context_enc = tokenizer.text_to_ids(text) - continuation_enc = tokenizer.text_to_ids(target) - - inp_enc = (context_enc + continuation_enc)[-(self.max_length + 1) :] - conti_len = len(continuation_enc) - - inp_encs.append(inp_enc) - conti_lens.append(conti_len) - tokens.append(torch.tensor(inp_enc)) - lens.append(len(inp_enc) - 1) - max_lens = max(lens) - - tokens_pad = pad_sequence(tokens, batch_first=False, padding_value=tokenizer.eos_id) - if self.padding != -1 and max_lens % self.padding != 0: - # We need align the context length to multiple of 8 for FP8 run using NeMo framework. 
- extra_pad_len = self.padding - (max_lens % self.padding) - - extra_pad = torch.ones(extra_pad_len, batch_size) * tokenizer.eos_id - extra_pad = extra_pad.type_as(tokens_pad) - inp_enc_pad = torch.vstack((tokens_pad, extra_pad)).T - - lens_pad = max_lens + extra_pad_len - else: - inp_enc_pad = tokens_pad.T - lens_pad = max_lens + 1 - self.tokens_to_generate - - inputs = (torch.tensor(inp_enc_pad).cuda(), (torch.ones(batch_size, dtype=torch.int32) * lens_pad).cuda()) - return _Input(inputs=inputs, inp_enc=inp_encs, lens=lens, lens_pad=lens_pad, conti_len=conti_lens) - diff --git a/demo/NeMo/GPT3/nemo_utils.py b/demo/NeMo/GPT3/nemo_utils.py deleted file mode 100644 index f6d5bca7c..000000000 --- a/demo/NeMo/GPT3/nemo_utils.py +++ /dev/null @@ -1,161 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import gc -import os -import sys - -# Only print out error messages from NeMo -from nemo.utils.nemo_logging import Logger as NG_LOGGER -nemo_logger = NG_LOGGER(False) -nemo_logger.setLevel(nemo_logger.ERROR) - -from nemo.utils.app_state import AppState -from nemo.utils.model_utils import inject_model_parallel_rank -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector -from omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer -import torch - -sys.path.append('../../HuggingFace') # Include HuggingFace directory. -from NNDF.logger import G_LOGGER - - -def get_computeprob_response(tokenizer, response, inputs): - """ - This function is a modified version from: - https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/modules/common/text_generation_utils.py#L139 - - So parallel state does not need to be initialized before calling this function. 
- """ - compute_prob_response = {} - new_token_ids = [] - new_tokens = [] - new_texts = [] - log_probs = [] - full_logprobs = [] - offsets = [] - for batch_id in range(len(response['tokens'])): - if isinstance(inputs, (list, tuple)): - if isinstance(inputs[0], str): - new_token_id = tokenizer.text_to_ids(inputs[batch_id]) - new_text = inputs[batch_id] - token_len = len(new_token_id) - elif isinstance(inputs[0], torch.Tensor): - token_len = int(inputs[1][batch_id].item()) - new_token_id = inputs[0][batch_id][:token_len].tolist() - new_text = tokenizer.ids_to_text(new_token_id) - new_token_ids.append(new_token_id) - new_tokens.append(response['tokens'][batch_id][:token_len]) - new_texts.append(new_text) - log_probs.append(response['logprob'][batch_id][:token_len]) - full_logprobs.append(response['full_logprob'][batch_id][:token_len]) - offsets.append(response['offsets'][batch_id][:-1]) - compute_prob_response['sentences'] = new_texts - compute_prob_response['tokens'] = new_tokens - compute_prob_response['token_ids'] = new_token_ids - compute_prob_response['logprob'] = log_probs - compute_prob_response['full_logprob'] = full_logprobs - compute_prob_response['offsets'] = offsets - return compute_prob_response - - -def load_nemo_model(cfg, model_class=MegatronGPTModel): - # Trainer is required for restoring model parallel models - trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) - - if cfg.gpt_model_file and cfg.checkpoint_dir: - raise ValueError(f"NeMo model and checkpoint cannot be both set.") - - if cfg.gpt_model_file: - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.gpt_model_file): - save_restore_connector.model_extracted_dir = cfg.gpt_model_file - - pretrained_cfg = MegatronGPTModel.restore_from( - restore_path=cfg.gpt_model_file, - trainer=trainer, - return_config=True, - save_restore_connector=save_restore_connector, - ) - OmegaConf.set_struct(pretrained_cfg, True) - with open_dict(pretrained_cfg): - pretrained_cfg.sequence_parallel = False - pretrained_cfg.activations_checkpoint_granularity = None - pretrained_cfg.activations_checkpoint_method = None - pretrained_cfg.precision = trainer.precision - if trainer.precision == "16": - pretrained_cfg.megatron_amp_O2 = False - model = model_class.restore_from( - restore_path=cfg.gpt_model_file, - trainer=trainer, - override_config_path=pretrained_cfg, - save_restore_connector=save_restore_connector, - ) - G_LOGGER.info(f"{type(model)} has been successfully restored from {cfg.gpt_model_file}") - elif cfg.checkpoint_dir: - checkpoint_file= os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name) - if not os.path.exists(checkpoint_file): - raise ValueError(f"File {checkpoint_file} does not exist.") - - app_state = AppState() - if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: - app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size - app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size - ( - app_state.tensor_model_parallel_rank, - app_state.pipeline_model_parallel_rank, - app_state.model_parallel_size, - app_state.data_parallel_size, - app_state.pipeline_model_parallel_split_rank, - app_state.virtual_pipeline_model_parallel_rank, - ) = fake_initialize_model_parallel( - world_size=app_state.model_parallel_size, - rank=trainer.global_rank, - tensor_model_parallel_size_=cfg.tensor_model_parallel_size, - 
pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, - pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, - ) - checkpoint_path = inject_model_parallel_rank(checkpoint_file) - model = model_class.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer) - G_LOGGER.info(f"{type(model)} has been successfully restored from checkpoint {checkpoint_path}") - else: - raise ValueError("Need to provide a nemo gpt model through config file.") - - model.freeze() - - # Have to turn off activations_checkpoint_method for inference - try: - model.model.language_model.encoder.activations_checkpoint_method = None - except AttributeError: - pass - - model.eval() - G_LOGGER.debug(f"Model configuration: {model.cfg}") - G_LOGGER.debug(f"Vocabulary size: {model.tokenizer.vocab_size}") - return model.cuda() - -def release_nemo_model(model): - print(f"Releaseing nemo model.") - model.model.cpu() - del model.model - gc.collect() - torch.cuda.empty_cache() - model.model = None diff --git a/demo/NeMo/GPT3/onnxrt.py b/demo/NeMo/GPT3/onnxrt.py deleted file mode 100644 index 78bd0acab..000000000 --- a/demo/NeMo/GPT3/onnxrt.py +++ /dev/null @@ -1,112 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import sys - -import onnxruntime as ort -import onnx -import omegaconf -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel - -# Add syspath for custom library -if __name__ == "__main__": - filepath = os.path.dirname(os.path.abspath(__file__)) - project_root = os.path.join(filepath, os.pardir) - sys.path.append(project_root) - -from interface import NeMoCommand, BaseModel -from nemo_export import NeMoConverter -from GPT3.GPT3ModelConfig import GPT3ModelTRTConfig - -sys.path.append('../../HuggingFace') # Include HuggingFace -from NNDF.interface import FRAMEWORK_ONNXRT -from NNDF.logger import G_LOGGER -from NNDF.networks import ( - NetworkModel, - NetworkModels, -) - -class GPT3NeMoOnnxRT(NeMoCommand): - def __init__( - self, - nemo_cfg, - config_class=GPT3ModelTRTConfig, - description="Runs ONNX Runtime results for GPT3 model.", - **kwargs - ): - super().__init__(nemo_cfg, config_class, description, model_classes=None, **kwargs) - self.framework_name = FRAMEWORK_ONNXRT - - - def load_onnx_model(self): - G_LOGGER.info(f'Loading ONNX model from {self.nemo_cfg.onnx_model_file}') - - def get_opset_version(name : str) -> int: - """Returns opset. - - `model` here is local in scope and python's gc will collect - it without manual memory management via `del`. 
- """ - model = onnx.load(name, load_external_data=False) - return model.opset_import[0].version - - assert get_opset_version(self.nemo_cfg.onnx_model_file) == 17 - return ort.InferenceSession(self.nemo_cfg.onnx_model_file) - - - def setup_tokenizer_and_model(self): - self.nemo_cfg.runtime = 'onnx' - self.model = BaseModel() - self.model.cfg = self.nemo_cfg.model - self.model.tokenizer = get_tokenizer(tokenizer_name='megatron-gpt-345m', vocab_file=None, merges_file=None) - - if not self.nemo_cfg.onnx_model_file: - self.nemo_cfg.onnx_model_file = os.path.join( - self.workspace.dpath, - f"onnx/model-{self.nemo_cfg.trainer.precision}.onnx", - ) - - converter = NeMoConverter(self.nemo_cfg, MegatronGPTModel) - if not os.path.isfile(self.nemo_cfg.onnx_model_file): - # Convert NeMo model to ONNX model - onnx_name = converter.nemo_to_onnx() - self.nemo_cfg.onnx_model_file = onnx_name - - # The ONNX model is in opset17 by default. - self.model.onnxrt = self.load_onnx_model() - self.tokenizer = self.model.tokenizer - onnx_models = [ - NetworkModel( - name=GPT3ModelTRTConfig.NETWORK_FULL_NAME, fpath=self.nemo_cfg.onnx_model_file, - ) - ] - return NetworkModels(torch=None, onnx=onnx_models, trt=None) - -# Entry point -def getGPT3NeMoOnnxRT(): - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../config.yaml") - nemo_cfg = omegaconf.OmegaConf.load(config_path) - return GPT3NeMoOnnxRT(nemo_cfg) - -# Entry point -RUN_CMD = getGPT3NeMoOnnxRT() - -if __name__ == "__main__": - result = RUN_CMD() - print("Results: {}".format(result)) diff --git a/demo/NeMo/GPT3/sequence_perplexity.py b/demo/NeMo/GPT3/sequence_perplexity.py deleted file mode 100644 index 9fc9ef29c..000000000 --- a/demo/NeMo/GPT3/sequence_perplexity.py +++ /dev/null @@ -1,76 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import math -import numpy as np -import torch - -__all__ = ['SequencePerplexity'] - -class SequencePerplexity(): - def __init__(self, topN): - super().__init__() - self.ppls = [] - self.sequence_ppls = [] - self.topN_equals = [0] * len(topN) - self.topN = topN - - def update(self, ds_input, response, tokenizer): - for batch, tokens in enumerate(response['token_ids']): - inp_len = ds_input.lens[batch] - if inp_len == 0: - continue - - conti_len = ds_input.conti_len[batch] - - response_token_ids = tokens[:inp_len] - assert response_token_ids == ds_input.inp_enc[batch][:-1], f"Mismatch in input tokens." - full_log_probs = response['full_logprob'][batch][:inp_len] - - # calculate ppl with whole sequence. - label = torch.tensor([ds_input.inp_enc[batch][1:]]).cuda() - log_probs = full_log_probs.unsqueeze(0).permute((0, 2, 1)) - ppl = torch.nn.CrossEntropyLoss()(log_probs, label) - self.sequence_ppls.append(ppl.cpu()) - - # calculate topN. 
- log_probs = full_log_probs[-conti_len:] - conti_token_ids = ds_input.inp_enc[batch][-conti_len:] - conti_tokens = tokenizer.ids_to_tokens(conti_token_ids) - - for index, topN in enumerate(self.topN): - if conti_token_ids[0] in log_probs.topk(topN, dim=-1).indices: - self.topN_equals[index] += 1 - - # calculate ppl with last token. - log_probs = log_probs.cpu().to(torch.float32) - conti_enc = torch.tensor(tokenizer.tokens_to_ids(conti_tokens)) - conti_probs = torch.gather(log_probs, 1, conti_enc.unsqueeze(-1)).squeeze(-1) - - ppl = float(conti_probs.sum()) - self.ppls.append(ppl) - - def compute(self): - ppls = math.exp(-np.mean(np.array(self.ppls))) - sequence_ppls = math.exp(np.mean(np.array(self.sequence_ppls))) - acc = [equals / len(self.ppls) for equals in self.topN_equals] - txt = [] - for i, j in zip(self.topN, acc): - txt.append("acc(top{}): {:.4f}".format(i, j)) - acc_text = ", ".join(txt) - return ppls, sequence_ppls, acc, acc_text - diff --git a/demo/NeMo/GPT3/trt.py b/demo/NeMo/GPT3/trt.py deleted file mode 100644 index 189c1ba32..000000000 --- a/demo/NeMo/GPT3/trt.py +++ /dev/null @@ -1,236 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import sys - -import omegaconf -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel - -# Add syspath for custom library -if __name__ == "__main__": - filepath = os.path.dirname(os.path.abspath(__file__)) - project_root = os.path.join(filepath, os.pardir) - sys.path.append(project_root) - -from nemo_export import NeMoConverter, create_dir_if_not_exist -from GPT3.GPT3ModelConfig import GPT3ModelTRTConfig -from GPT3.trt_utils import load_trt_model -from interface import NeMoCommand, BaseModel -import onnx - -sys.path.append('../../HuggingFace') # Include HuggingFace -from NNDF.interface import FRAMEWORK_TENSORRT -from NNDF.logger import G_LOGGER -from NNDF.models import _log_fake_perf_metrics -from NNDF.networks import ( - NetworkModel, - NetworkModels, -) - -class GPT3NeMoTRT(NeMoCommand): - def __init__( - self, - nemo_cfg, - config_class=GPT3ModelTRTConfig, - description="Runs TensorRT results for GPT3 model.", - **kwargs - ): - super().__init__(nemo_cfg, config_class, description, model_classes=None, **kwargs) - self.framework_name = FRAMEWORK_TENSORRT - - - def setup_tokenizer_and_model(self): - self.nemo_cfg.runtime = 'trt' - self.model = BaseModel() - self.model.cfg = self.nemo_cfg.model - self.model.tokenizer = get_tokenizer(tokenizer_name='megatron-gpt-345m', vocab_file=None, merges_file=None) - - # Path to write new onnx models if need arises. Prevents overwrite of - # user-provided onnx files in case opset_version needs to be upgraded - # to 19 or onnx files with kv-cache needs to be written. 
- onnx_workpath = os.path.join( - self.workspace.dpath, - "onnx", - ) - if self.nemo_cfg.onnx_model_file: - # Input by user, can be a read-only location. - onnx_name = self.nemo_cfg.onnx_model_file - else: - onnx_name = os.path.join( - onnx_workpath, - f"model-{self.nemo_cfg.trainer.precision}.onnx", - ) - self.nemo_cfg.onnx_model_file = onnx_name - self.nemo_cfg.trt_export_options.timing_cache = self.timing_cache - - converter = NeMoConverter(self.nemo_cfg, MegatronGPTModel) - if not os.path.isfile(onnx_name): - # Convert NeMo model to ONNX model - onnx_name = converter.nemo_to_onnx() - - def get_opset_version(name : str) -> int: - """Returns opset. - - `model` here is local in scope and python's gc will collect - it without manual memory management via `del`. - """ - model = onnx.load(name, load_external_data=False) - return model.opset_import[0].version - - opset_version = get_opset_version(onnx_name) - if opset_version < 19: - opset19_onnx_name = NeMoConverter.get_opset19_onnx_fpath( - onnx_name, onnx_workpath - ) - if not os.path.isfile(opset19_onnx_name): - opset19_onnx_name = NeMoConverter.onnx_to_opset19( - onnx_name, onnx_workpath - ) - - if opset19_onnx_name != None: - onnx_name = opset19_onnx_name - - # Add KV cache to ONNX model - kv_output_policy = "kv_new" - - converter = NeMoConverter(self.nemo_cfg) - - def has_kv_cache_support( - model_name: str, match_names=("key", "value", "kv") - ) -> bool: - """To detect onnx models with kv_cache exported, input node names - contain match_names. - """ - model = onnx.load(model_name, load_external_data=False) - - # Get network inputs. - input_all = [node.name for node in model.graph.input] - input_initializer = [node.name for node in model.graph.initializer] - net_input_names = list(set(input_all) - set(input_initializer)) - - kv_nodes = filter( - lambda name: any(map(lambda match: match in name, match_names)), - net_input_names, - ) - return any(kv_nodes) and len(net_input_names) > 2 - - if (not self.nemo_cfg.use_cache) and (has_kv_cache_support(onnx_name)): - raise RuntimeError( - "ONNX model has been exported with kv-cache enabled, but " - "runtime configuration has kv-cache disabled. Consider " - "enabling kv-cache support via the `use-cache` option." 
- ) - - if self.nemo_cfg.use_cache and (not has_kv_cache_support(onnx_name)): - G_LOGGER.info(f"Converting {onnx_name} with KV-cache support") - new_dir = onnx_workpath + f"_{kv_output_policy}" - if self.nemo_cfg.onnx_export_options.use_fp8_storage: - new_dir += f"_fp8_storage" - onnx_output_fpath = os.path.join(new_dir, onnx_name.split("/")[-1]) - - if not os.path.isfile(onnx_output_fpath): - create_dir_if_not_exist(onnx_output_fpath) - converter.create_onnx(onnx_name, onnx_output_fpath, kv_output_policy) - onnx_name = onnx_output_fpath - - if self.nemo_cfg.onnx_export_options.prune: - onnx_name = converter.prune_onnx(onnx_name) - - # Convert ONNX model to TRT engine - self.nemo_cfg.trt_export_options.use_strongly_typed = self.use_strongly_typed - self.nemo_cfg.trt_export_options.timing_cache = self.timing_cache - self.nemo_cfg.trt_export_options.opt_seq_len = self.opt_seq_len - - suffixes = [] - suffixes.append("bs" + str(self.nemo_cfg.batch_size)) - if self.nemo_cfg.trt_export_options.opt_seq_len != None: - suffixes.append("opt" + str(self.nemo_cfg.trt_export_options.opt_seq_len)) - if self.nemo_cfg.use_cache: - suffixes.append("kv") - if self.nemo_cfg.onnx_export_options.use_fp8_storage: - suffixes.append("fp8_storage") - if self.nemo_cfg.trt_export_options.sparse: - suffixes.append("sp") - if not self.nemo_cfg.trt_export_options.use_strongly_typed: - suffixes.append("no_strongly_typed") - suffix = "-".join(suffixes) - trt_fpath = os.path.join(self.workspace.dpath, f"trt-{suffix}.plan") - - if os.path.isfile(trt_fpath): - G_LOGGER.debug(f"TRT Engine plan exists at location {trt_fpath}.") - _log_fake_perf_metrics() - else: - converter.onnx_to_trt(onnx_name, trt_fpath) - - self.nemo_cfg.trt_engine_file = trt_fpath - self.model.trt = load_trt_model(self.nemo_cfg) - self.tokenizer = self.model.tokenizer - onnx_models = [ - NetworkModel( - name=GPT3ModelTRTConfig.NETWORK_FULL_NAME, fpath=self.nemo_cfg.onnx_model_file, - ) - ] - return NetworkModels(torch=None, onnx=onnx_models, trt=None) - - def add_args(self): - super().add_args() - engine_group = self._parser.add_argument_group("trt engine") - engine_group.add_argument( - "--opt-seq-len", - default=None, - help="Set optimized input sequence length to be used in engine building", - type=int, - ) - engine_group.add_argument( - "--no-timing-cache", - default=False, - help="Set to not use timing cache for speeding up engine building", - action="store_true", - ) - engine_group.add_argument( - "--no-strongly-typed", - default=False, - help="Disable strongly typed mode in engine building", - action="store_true", - ) - - def process_framework_specific_arguments( - self, - opt_seq_len: int = None, - no_timing_cache: bool = False, - no_strongly_typed: bool = False, - **kwargs - ): - self.opt_seq_len = opt_seq_len - self.use_timing_cache = not no_timing_cache - self.use_strongly_typed = not no_strongly_typed - self.timing_cache = self.workspace.get_timing_cache() if self.use_timing_cache else None - -# Entry point -def getGPT3NeMoTRT(): - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../config.yaml") - nemo_cfg = omegaconf.OmegaConf.load(config_path) - return GPT3NeMoTRT(nemo_cfg) - -# Entry point -RUN_CMD = getGPT3NeMoTRT() - -if __name__ == "__main__": - result = RUN_CMD() - print("Results: {}".format(result)) diff --git a/demo/NeMo/GPT3/trt_utils.py b/demo/NeMo/GPT3/trt_utils.py deleted file mode 100644 index a146cf7e8..000000000 --- a/demo/NeMo/GPT3/trt_utils.py +++ /dev/null @@ -1,231 +0,0 @@ -# -# 
SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import sys - -import numpy as np -import tensorrt as trt -import torch -from transformers.configuration_utils import PretrainedConfig - -sys.path.append('../../HuggingFace') # Include HuggingFace directory -from NNDF.models import TRTEngineFile -from NNDF.networks import NetworkMetadata -from NNDF.tensorrt_utils import TRTNativeRunner -from NNDF.logger import G_LOGGER -from Seq2Seq.export import DecoderTRTEngine - -from HuggingFace.NNDF.tensorrt_utils import TRTNativeRunner, CUASSERT -from cuda import cudart - - -class GPTTRTDecoder(TRTNativeRunner): - - INPUT_IDS_INDEX = 0 - POSITION_IDS_INDEX = 1 - ATTENTION_MASK_INDEX = 2 - - def __init__( - self, - trt_engine_file: TRTEngineFile, - use_cache: bool, - use_fp8_storage: bool, - cfg, - network_metadata: NetworkMetadata = None, - hf_config: PretrainedConfig = None, - ): - super().__init__(trt_engine_file, network_metadata, hf_config) - self.use_cache = use_cache - self.use_fp8_storage = use_fp8_storage - if self.use_cache: - self._set_context_mode_trt_context() - self.io_names = set() - self.input_tensor_names = set() - for i in range(self.trt_engine.num_io_tensors): - tensor_name = self.trt_engine.get_tensor_name(i) - self.io_names.add(tensor_name) - if self.trt_engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT: - self.input_tensor_names.add(tensor_name) - - self.cfg = cfg - logits_size = self.cfg.batch_size * self.cfg.model.max_seq_len * self.cfg.model.vocab_size - - self.batch_size = self.cfg.batch_size - self.max_seq_len = self.cfg.model.max_seq_len - self.num_layers = self.cfg.model.num_layers - self.nb_heads = self.cfg.model.nb_heads - self.head_size = self.cfg.model.head_size - - dtype = self.get_torch_type(self.get_output_name()) - self.logits = torch.zeros(logits_size, dtype=dtype).contiguous().cuda() - - - self.init_kv_cache() - self.past_decoder_length = 0 - - # Setting next input shape when executing gpu kernel. - # Use dict to record which inputs have changed. 
- self.input_shape_change_record = dict() - - def init_kv_cache(self): - # kv cache buffer - self.attention_kv_cache_buffer = dict() - cache_dtype = torch.float16 - if self.use_fp8_storage: - cache_dtype = torch.uint8 - for i in range(self.num_layers): - for code in ["key", "value"]: - attention_kv_cache_name = self.make_kv_cache_name(i, code) - self.attention_kv_cache_buffer[attention_kv_cache_name] = torch.empty( - self.max_seq_len, - self.batch_size, - self.nb_heads, - self.head_size, - dtype=cache_dtype, - device=torch.cuda.current_device(), - ).contiguous().cuda() - - - def make_kv_cache_name(self, layer, code): - return f"key_values.{layer}.decoder.{code}" - - def _set_context_mode_trt_context(self): - # Create TRT context for context mode (1st decoder run) with optimization profile index = 1 - self.context_trt_context = self.trt_engine.create_execution_context() - self.context_trt_context.set_optimization_profile_async(1, self.stream) - - def get_torch_type(self, name): - trt_type = self.trt_engine.get_tensor_dtype(name) - mapping = { - trt.float32: torch.float32, - trt.float16: torch.float16, - trt.int8: torch.int8, - trt.int32: torch.int32, - trt.int64: torch.int64, - trt.bool: torch.bool, - trt.uint8: torch.uint8, - trt.bfloat16: torch.bfloat16, - } - if trt_type in mapping: - return mapping[trt_type] - raise ValueError(f"Got unexpected tensorrt dtype {trt_type} in get_torch_type().") - - def get_input_ids_name(self): - return self.trt_engine.get_tensor_name(self.INPUT_IDS_INDEX) - - def has_position_ids(self): - # If the input at POSITION_IDS_INDEX has a dimension of 2, assume it is position_ids. - return len(self.trt_engine.get_tensor_shape(self.trt_engine.get_tensor_name(self.POSITION_IDS_INDEX))) == 2 - - def get_position_ids_name(self): - if self.has_position_ids(): - return self.trt_engine.get_tensor_name(self.POSITION_IDS_INDEX) - else: - return None - - def get_output_name(self): - return "logits" - - def has_attention_mask(self): - if self.ATTENTION_MASK_INDEX < self.trt_engine.num_io_tensors: - return self.trt_engine.get_tensor_name(self.ATTENTION_MASK_INDEX) == "attention_mask" - return False - - def get_attention_mask_name(self): - if self.has_attention_mask(): - return self.trt_engine.get_tensor_name(self.ATTENTION_MASK_INDEX) - return None - - def run(self, output_name, io_descs, seq_len, context_mode=False): - torch.cuda.nvtx.range_push("TRT Setup") - if self.use_cache: - if context_mode: - self.past_decoder_length = 0 - else: - # When kv-cache is used, seq_len is always 1 in Generation phase. 
- seq_len = 1 - cur_shape = (self.past_decoder_length, self.batch_size, self.nb_heads, self.head_size) - new_shape = (seq_len, self.batch_size, self.nb_heads, self.head_size) - assert self.past_decoder_length + seq_len < self.max_seq_len - offset = self.batch_size*self.nb_heads*self.head_size*self.past_decoder_length - for i in range(self.num_layers): - for code in ["key", "value"]: - attention_kv_cache_name = self.make_kv_cache_name(i, code) - cur_address = self.attention_kv_cache_buffer[attention_kv_cache_name].data_ptr() - # new kv address start from the past kv-cache data end - io_descs[f"past_{attention_kv_cache_name}"] = (cur_address, cur_shape) - new_address = cur_address + offset*self.attention_kv_cache_buffer[attention_kv_cache_name].element_size() - modifier = "" - if self.use_fp8_storage: - modifier = "_qfp8" - new_kv_name = f"new_{attention_kv_cache_name}{modifier}" - io_descs[new_kv_name] = (new_address, new_shape) - self.past_decoder_length += seq_len - else: - self.past_decoder_length = 0 - # Set active optimization profile and active execution context. - self.trt_context.set_optimization_profile_async(self.profile_idx, self.stream) - active_context = self.trt_context - if context_mode and self.use_cache: - active_context = self.context_trt_context - - # Set up input bindings. - for name, tensor_shape in io_descs.items(): - active_context.set_tensor_address(name, tensor_shape[0]) - if name in self.input_tensor_names: - if name in self.input_shape_change_record and \ - self.input_shape_change_record[name][0] == active_context and \ - self.input_shape_change_record[name][1] == tensor_shape[1]: - continue - else: - active_context.set_input_shape(name, tensor_shape[1]) - elif self.use_cache: - pass - else: - assert False, "All tensors must be inputs for non-KV mode" - assert active_context.all_shape_inputs_specified - - # Set up output bindings. - assert output_name == self.get_output_name() - engine_out_torch_type = self.get_torch_type(output_name) - if self.logits.dtype != engine_out_torch_type: - raise ValueError(f"Output data type does not match, {self.logits.dtype} vs. {engine_out_torch_type}.") - shape = active_context.get_tensor_shape(output_name) - active_context.set_tensor_address(output_name, self.logits.data_ptr()) - - - # Execute inference. 
- torch.cuda.nvtx.range_pop() # "TRT Setup" - active_context.execute_async_v3(self.stream) - if not context_mode and self.use_cache: - self.input_shape_change_record.clear() - for i in range(self.num_layers): - for code in ["key", "value"]: - next_past_shape = (self.past_decoder_length, self.batch_size, self.nb_heads, self.head_size) - attention_kv_cache_name = self.make_kv_cache_name(i, code) - # set next iter input shape when cpu idle - active_context.set_input_shape(f"past_{attention_kv_cache_name}", next_past_shape) - self.input_shape_change_record[f"past_{attention_kv_cache_name}"] = [active_context, next_past_shape] - CUASSERT(cudart.cudaStreamSynchronize(self.stream)) - if len(shape) != 3: - raise ValueError("Output must have a dimension of 3.") - output = self.logits[:shape[0] * shape[1] * shape[2]].view(tuple(shape)) - return output - -def load_trt_model(cfg): - G_LOGGER.info(f'Loading TensorRT engine from {cfg.trt_engine_file} with use_cache={cfg.use_cache}, use_fp8_storage={cfg.onnx_export_options.use_fp8_storage} ') - trt_engine_file = DecoderTRTEngine(cfg.trt_engine_file) - return GPTTRTDecoder(trt_engine_file, cfg.use_cache, cfg.onnx_export_options.use_fp8_storage, cfg) diff --git a/demo/NeMo/README.md b/demo/NeMo/README.md deleted file mode 100644 index 44f183dd6..000000000 --- a/demo/NeMo/README.md +++ /dev/null @@ -1,156 +0,0 @@ -# TensorRT FP8 Inference for NeMo models -**Deprecation:** For all users using TensorRT to accelerate Large Language Model inference, please use [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/). TensorRT-LLM includes full model coverage and functionalities of HuggingFace demo and NeMo demo. It also contains more optimizations and functionalities (e.g. model quantization, in-flight batching, etc.), multi-GPU support, better model coverage and much better inference performance. HuggingFace Demo and NeMo demo will not be maintained, and they will be removed from OSS in TRT 10.0 release. - -This repository demonstrates TensorRT inference with NeMo Megatron models in FP8/FP16/BF16 precision. - -Currently, this repository supports [NeMo GPT](https://huggingface.co/nvidia/nemo-megatron-gpt-5B/tree/fp8) models only. - -# Environment Setup -It's recommended to run inside a container to avoid conflicts when installing dependencies. Please check out [`NGC TensorRT`](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tensorrt/tags) and find a container with TensorRT 9.0 or above. A GPU with compute capability 8.9 or above is required to run the demo with FP8 precision. - -``` -# Run inside a TensorRT container -sh install.sh [--deps ] [-j ] [--ninja] -``` - -All arguments are optional. `--deps` indicates the relative dependency download directory, `-j` indicates number of parallel jobs for building and `--ninja` installs the `ninja` build system which can speed up installation. See `sh install.sh --help` for more details on the arguments. - -> The script will install required dependencies and it can take around 30 minutes or more. - -**Please note that the [HuggingFace demo directory](demo/HuggingFace) needs to be visible when running this demo, so utility functions can be correctly imported.** - -# File Structure -This demo follows simliar structure and command-line interface as in [HuggingFace demo](/demo/HuggingFace). -``` -. 
-├── GPT3 # GPT3 directory -│ ├── GPT3ModelConfig.py # model configuration and variant-specific parameters -│ ├── frameworks.py # NeMo PyTorch inference script -│ ├── onnxrt.py # OnnxRT inference script -│ ├── trt.py # TensorRT inference script -│ ├── decoding.py # main inference logic for all runtimes -│ └── ... # files with utility functions for model export and inference -├── config.yaml # full configuration for model export and inference -├── interface.py # definitions of setup functions -├── nemo_export.py # export functions for NeMo model -> ONNX model -> TRT engine -└── run.py # main entry script -``` - -# Overview - -This demo contains two scripts `run.py` and `nemo_export.py`. Script `run.py` accepts a NeMo model or an ONNX model as input, and performs end-to-end inference with various actions specified by the user. Script `nemo_export.py` accepts a NeMo model or an ONNX model as input, and exports the input to an ONNX model or a TensorRT engine. - -# How to run inference -The `run` action will run end-to-end inference on sentences specified in [config.yaml](/demo/NeMo/config.yaml). A model, a variant, and precision are required to run this command. -``` -python3 run.py run GPT3 --variant gpt-5b --working-dir $(pwd)/temp --fp8 --bf16 --nemo-model= -``` - -Expected output for the second sentence: -``` -Batch 1: {'sentences': ['TensorRT is a Deep Learning compiler used for deep learning. It is a compiler for TensorFlow, CNTK, and Torch. It is a compiler for the TensorFlow, CNTK,'], - 'tokens': [['<|endoftext|>', 'T', 'ensor', 'RT', ' is', ' a', ' Deep', ' Learning', ' compiler', ' used', ' for', ' deep', ' learning', '.', ' It', ' is', ' a', ' compiler', ' for', ' T', 'ensor', 'Flow', ',', ' C', 'NT', 'K', ',', ' and', ' Torch', '.', ' It', ' is', ' a', ' compiler', ' for', ' the', ' T', 'ensor', 'Flow', ',', ' C', 'NT', 'K', ',']], - 'logprob': tensor([[-4.6415e+00, -6.9270e+00, -7.4458e+00, -1.9856e+00, -5.9787e-01, - -8.1058e+00, -7.9629e-02, -5.8013e+00, -5.5222e+00, -1.4401e+00, - -5.5644e+00, -3.3747e-01, -3.3463e+00, -1.1306e+00, -1.3685e+00, - -1.7793e+00, -2.8960e+00, -1.4127e+00, -2.3209e+00, -7.3454e-04, - -9.8682e-02, -1.3268e+00, -2.1373e+00, -3.9281e-01, -6.5222e-04, - -2.9425e-01, -1.4167e+00, -1.8416e+00, -9.2462e-01, -1.4805e+00, - -1.4299e+00, -2.0632e+00, -2.9947e+00, -9.1487e-01, -2.6651e+00, - -2.2772e+00, -4.7057e-03, -2.2852e-01, -2.4777e+00, -2.4731e-01, - -7.0602e-03, -4.7339e-04, -1.1645e-01]], device='cuda:0'), - 'full_logprob': None, - 'token_ids': [[50256, 51, 22854, 14181, 318, 257, 10766, 18252, 17050, 973, 329, 2769, 4673, 13, 632, 318, 257, 17050, 329, 309, 22854, 37535, 11, 327, 11251, 42, 11, 290, 34868, 13, 632, 318, 257, 17050, 329, 262, 309, 22854, 37535, 11, 327, 11251, 42, 11]], - 'offsets': [[0, 0, 1, 6, 8, 11, 13, 18, 27, 36, 41, 45, 50, 59, 60, 63, 66, 68, 77, 81, 83, 88, 92, 93, 95, 97, 98, 99, 103, 109, 110, 113, 116, 118, 127, 131, 135, 137, 142, 146, 147, 149, 151, 152]]} -``` - -# How to run with various configurations -- FP8, FP16, and BF16 precisions are supported, and they can be set through `--fp8`, `--fp16`, and `--bf16` respectively. Currently, the script has constraints on how precisions are specified, and supported combinations are: - 1. Pure FP16: `--fp16` (default) - 2. Pure BF16: `--bf16` - 3. FP8-FP16: `--fp8 --fp16` - 4. FP8-BF16: `--fp8 --bf16` - -- `--nemo-model=` or `--nemo-checkpoint=` can be used to load a NeMo model or checkpoint from a specified path, respectively. 
If these arguments are not provided, a NeMo model will be downloaded (and cached/re-used for subsequent runs) in the working directory. - -- K-V cache can be enabled through `--use-cache` - -- Batch size can be changed through `--batch-size=<batch_size>` - -- The default max sequence length is `256`; it can be changed through `--max-seq-len=<max_seq_len>` - -# How to run performance benchmark -The `benchmark` action will run inference multiple times with the specified input and output sequence lengths. -``` -python3 run.py benchmark GPT3 --variant gpt-5b --working-dir $(pwd)/temp --fp8 --bf16 --nemo-model=<nemo_model_path> --batch-size=16 --input-seq-len=128 --output-seq-len=20 --use-cache --warmup=10 --iterations=100 -``` - -Expected output for `trt`: -``` -*************************** -Running 100 iterations with batch size: 16, input sequence length: 128 and output sequence length: 20 -[E2E inference] Total Time: 11.55453 s, Average Time: 0.11555 s, 95th Percentile Time: 0.11581 s, 99th Percentile Time: 0.11587 s, Throughput: 2769.48 tokens/s -[Without tokenizer] Total Time: 10.44539 s, Average Time: 0.10445 s, 95th Percentile Time: 0.10459 s, 99th Percentile Time: 0.10465 s, Throughput: 3063.55 tokens/s -*************************** -``` - -Expected output for `frameworks`: -``` -*************************** -Running 100 iterations with batch size: 16, input sequence length: 128 and output sequence length: 20 -[E2E inference] Total Time: 55.23503 s, Average Time: 0.55235 s, 95th Percentile Time: 0.55525 s, 99th Percentile Time: 0.56992 s, Throughput: 579.34 tokens/s -[Without tokenizer] Total Time: 54.06591 s, Average Time: 0.54066 s, 95th Percentile Time: 0.54369 s, 99th Percentile Time: 0.55839 s, Throughput: 591.87 tokens/s -*************************** -``` - -# How to run accuracy check -The `accuracy` action runs an accuracy check on a dataset. The default is the [LAMBADA](https://paperswithcode.com/dataset/lambada) dataset. -``` -python3 run.py accuracy GPT3 --variant gpt-5b --working-dir $(pwd)/temp --fp8 --bf16 --nemo-model=<nemo_model_path> --use-cache -``` - -Expected output for `trt`: -``` -*************************** -Lambada ppl(last token): 4.4756, ppl(sequence): 18.3254, acc(top1): 0.6722, acc(top3): 0.8597, acc(top5): 0.9076 -*************************** -``` - -Expected output for `frameworks`: -``` -*************************** -Lambada ppl(last token): 4.4669, ppl(sequence): 18.3161, acc(top1): 0.6765, acc(top3): 0.8612, acc(top5): 0.9082 -*************************** -``` - -# How to export a NeMo model to ONNX -NeMo to ONNX conversion consists of 3 steps: -1. Export ONNX from NeMo. -2. NeMo uses TransformerEngine to export FP8 models to ONNX (step 1), and the exported ONNX contains custom TensorRT Q/DQ nodes. The script `convert_te_onnx_to_trt_onnx.py` can be used to convert the custom operators into standard opset19 ONNX Q/DQ nodes. -3. Add KV-cache inputs and outputs to the exported ONNX, so inference on the model is faster. - -`nemo_export.py` has `--opset19` and `--use-cache` options to decide whether to perform step 2 and step 3, respectively: -``` -python3 nemo_export.py --nemo-model=model.nemo --onnx=onnx/model.onnx --opset19 --use-cache -``` -`--extra-configs` can be used to specify configs that are defined in `config.yaml` but not exposed through the existing command-line interface. -Please specify `--help` to see more options. - - -# How to run sparsity for benchmark - -*Note: this is for performance analysis. The pruned model should not be used for accuracy purposes unless it was fine-tuned for sparsity.
The pruning may take minutes or hours depending on the model size.* - - -1. Enable sparsity knobs in `config.yaml`: - * Set `onnx_export_options.prune` to `True` to enable pruning of the ONNX model. - * Set `trt_export_options.sparse` to `True` to enable sparse tactics profiling in TensorRT. -2. Run the scripts. You should be able to see logs like below. - -``` -[2023-07-28 00:15:03,015][OSS][INFO] Prune ONNX model with: polygraphy surgeon prune ${OSS_ROOT}/demo/NeMo/temp/gpt-5b/GPT3-gpt-5b-fp8-fp16-ms256/onnx/model-16.opset19.onnx -o ${OSS_ROOT}/demo/NeMo/temp/gpt-5b/GPT3-gpt-5b-fp8-fp16-ms256/onnx/pruned.model-16.opset19.onnx --save-external-data ${OSS_ROOT}/demo/NeMo/temp/gpt-5b/GPT3-gpt-5b-fp8-fp16-ms256/onnx/pruned.model-16.opset19.onnx_data -[2023-07-28 00:15:03,016][OSS][INFO] This may take a while... -... - -[2023-07-28 03:36:52,307][OSS][DEBUG] trtexec --onnx=${OSS_ROOT}/demo/NeMo/temp/gpt-5b/GPT3-gpt-5b-fp8-fp16-ms256/onnx/pruned.model-16.opset19.onnx --minShapes=input_ids:1x1,position_ids:1x1 --optShapes=input_ids:1x128,position_ids:1x128 --maxShapes=input_ids:1x256,position_ids:1x256 --fp8 --fp16 --sparsity=enable --timingCacheFile=functional.cache -``` diff --git a/demo/NeMo/apex.patch b/demo/NeMo/apex.patch deleted file mode 100644 index daa1b6153..000000000 --- a/demo/NeMo/apex.patch +++ /dev/null @@ -1,29 +0,0 @@ -diff --git a/setup.py b/setup.py -index cb1a790..949f877 100644 ---- a/setup.py -+++ b/setup.py -@@ -29,15 +29,15 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir): - print("\nCompiling cuda extensions with") - print(raw_output + "from " + cuda_dir + "/bin\n") - -- if (bare_metal_version != torch_binary_version): -- raise RuntimeError( -- "Cuda extensions are being compiled with a version of Cuda that does " -- "not match the version used to compile Pytorch binaries. " -- "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) -- + "In some cases, a minor-version mismatch will not cause later errors: " -- "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. " -- "You can try commenting out this check (at your own risk)." -- ) -+ # if (bare_metal_version != torch_binary_version): -+ # raise RuntimeError( -+ # "Cuda extensions are being compiled with a version of Cuda that does " -+ # "not match the version used to compile Pytorch binaries. " -+ # "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) -+ # + "In some cases, a minor-version mismatch will not cause later errors: " -+ # "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. " -+ # "You can try commenting out this check (at your own risk)." -+ # ) - - - def raise_if_cuda_home_none(global_option: str) -> None: diff --git a/demo/NeMo/config.yaml b/demo/NeMo/config.yaml deleted file mode 100644 index 2b1888bb8..000000000 --- a/demo/NeMo/config.yaml +++ /dev/null @@ -1,87 +0,0 @@ -runtime: null -gpt_model_file: null # GPT nemo file path -onnx_model_file: null # ONNX file path -trt_engine_file: null # TRT engine file path - -# Parameters for loading from a checkpoint -checkpoint_dir: null # Path to a folder that contains a .ckpt file -checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. -hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. - -batch_size: 1 -use_cache: True -use_one_input: False # export ONNX model with only one input -prompts: # prompts for GPT inference - - "How are you?" - - "TensorRT is a Deep Learning compiler used for deep learning." 
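Because the demo loads this file with OmegaConf, the knobs shown in `config.yaml` (including the sparsity switches mentioned in the steps above) can also be flipped programmatically before a run. The snippet below is only an illustrative sketch of that idea; the override values are arbitrary examples, not recommendations.

```python
# Illustrative only: load the demo-style config.yaml with OmegaConf and override a few
# of the knobs discussed above (the values here are example assumptions).
from omegaconf import OmegaConf

cfg = OmegaConf.load("config.yaml")
cfg.batch_size = 4
cfg.use_cache = True
cfg.onnx_export_options.prune = True   # prune the ONNX model to the 2:4 sparsity pattern
cfg.trt_export_options.sparse = True   # enable sparse tactics profiling in TensorRT
print(OmegaConf.to_yaml(cfg))
```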
- -mode: 'inference' # Could change to accuracy or benchmark - -inference: - greedy: True # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - add_BOS: True # add the bos token at the begining of the prompt - tokens_to_generate: 30 # The maximum length of the sequence to be generated. - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. - compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - seed: 1234 - -accuracy: - dataset: Lambada - metric: Perplexity - top_n: 1,3,5 - tokens_to_generate: 5 - -benchmark: - input_seq_len: 20 - output_seq_len: 20 - -# for nemo to onnx export -onnx_export_options: - runtime_check: False - verbose: False - onnx_opset: 17 - do_constant_folding: True - cache_support: False - prune: False # Prune the ONNX model for Sparse Tensor Cores 2:4 pattern - device: 'cuda' - check_tolerance: 0.01 - use_fp8_storage: False - quantize_bmms: False - -# for onnx to trt export -trt_export_options: - opt_seq_len: 128 # define the optimized sequence length - use_tf32: True - use_fp16: False - use_fp8: False - use_bf16: False - use_strongly_typed: True # enable strongly typed mode will invalidate `use_[fp8|fp16|bf16]` flags. - sparse: False # enable sparse in TRT engine builder - timing_cache: 'functional.cache' - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - logger: False # logger provided by exp_manager - precision: 32 # 16, 32, or bf16 - -tensor_model_parallel_size: 1 -pipeline_model_parallel_size: 1 -pipeline_model_parallel_split_rank: 0 # used for encoder and decoder model (0 for others) - -# model architecture -model: - max_seq_len: 256 # define the max sequence length for attention mask - encoder_seq_length: 2048 - max_position_embeddings: ${.encoder_seq_length} - num_layers: 24 - hidden_size: 4096 - nb_heads: 32 - head_size: 128 - vocab_size: 50304 diff --git a/demo/NeMo/install.sh b/demo/NeMo/install.sh deleted file mode 100644 index 277f250a4..000000000 --- a/demo/NeMo/install.sh +++ /dev/null @@ -1,485 +0,0 @@ -#!/bin/sh -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Sourcing messes up the directory detection with readlink. -if [ ! "${0##*/}" = "install.sh" ]; then - echo "Please run this install script, don't source it." >&2 - echo "Use -h for usage and help." 
>&2 - return 1 -fi - -NEMO_DIR=$(dirname "$(readlink -f "$0")") -DEMO_DIR=$(dirname "${NEMO_DIR}") -SCRIPT_DIR=$(dirname "${DEMO_DIR}")/scripts - -DEPENDENCIES_DIR="temp" -BUILD_SRCLIBS=1 -BUILD_NINJA=0 -ARG_JOBS=1 -ARG_HELP=0 - -install_essential_tools() { - pip_not_found=$(pip --version 2>&1 | grep -o "not found") - if [ "$pip_not_found" != "" ]; then - echo " > Installing pip..." - apt-get update - apt-get install -y python3-dev - cd "${1}" || exit - if [ ! -f "get-pip.py" ]; then - apt-get install -y wget - wget https://bootstrap.pypa.io/get-pip.py - fi - python3 get-pip.py - cd .. - fi - - git_not_found=$(git --version 2>&1 | grep -o "not found") - if [ "$git_not_found" != "" ]; then - echo " > Installing git..." - apt-get update - apt-get install -y git - fi -} - -install_ninja() { - if [ ! -d "ninja" ]; then - git clone https://github.com/ninja-build/ninja.git - fi - cd ninja || exit - git checkout v1.11.1 - - if [ ! -x "./ninja" ]; then - CMD="python3 configure.py --bootstrap" - echo " >> ${CMD}" - eval "${CMD}" - unset CMD - else - echo " > ninja already built!" - fi - - PATH_WITH_NINJA="$(pwd):${PATH}" - # Path exported for the current program scope only. - export PATH="${PATH_WITH_NINJA}" - unset PATH_WITH_NINJA - cd .. -} - -PACKAGE_NEEDS_REINSTALL=0 - -check_if_managed_install() { - PACKAGE_NEEDS_REINSTALL=0 - dist_path="${1}" - # https://packaging.python.org/en/latest/specifications/direct-url/ - if [ ! -f "${dist_path}/direct_url.json" ]; then - PACKAGE_NEEDS_REINSTALL=1 - return - fi - if [ "$(grep -c "${NEMO_DIR}" "${dist_path}/direct_url.json")" != "1" ]; then - PACKAGE_NEEDS_REINSTALL=1 - fi -} - -apex_install_logic() { - if [ ! -d "apex" ]; then - git clone https://github.com/NVIDIA/apex.git - fi - - cd apex || exit - APEX_PATH="$(pwd)" - git config --global --add safe.directory "${APEX_PATH}" - unset APEX_PATH - - git checkout 5b5d41034b506591a316c308c3d2cd14d5187e23 - git apply "${NEMO_DIR}"/apex.patch # Bypass CUDA version check in apex - - torchcppext=$(pip show torch | grep Location | cut -d' ' -f2)"/torch/utils/cpp_extension.py" - if [ ! -f "$torchcppext" ]; then - echo "Could not locate torch installation using pip" - exit 1 - fi - sed -i 's/raise RuntimeError(CUDA_MISMATCH_MESSAGE.format(cuda_str_version, torch.version.cuda))/pass/' "$torchcppext" # Bypass CUDA version check in torch - unset torchcppext - - CMD="MAX_JOBS=${ARG_JOBS} python3 setup.py bdist_wheel -v --cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" - echo " >> ${CMD}" - eval "${CMD}" - unset CMD - - python3 -m pip install "$(find './dist' -name '*.whl' | head -n1)" - cd ../ -} - -check_if_apex_needs_reinstall() { - apex_loc="$(pip show apex | grep '^Location' | awk '{print $2}')" - apex_dist_loc="$(find "${apex_loc}" -depth -maxdepth 1 -name 'apex*dist-info' -type d | head -n1)" - - check_if_managed_install "${apex_dist_loc}" - apex_needs_reinstall=${PACKAGE_NEEDS_REINSTALL} - echo "${apex_needs_reinstall}" - - unset apex_dist_loc - unset apex_loc -} - -install_apex() { - has_apex=$(pip list | grep "^apex " | grep "apex" -o | awk '{print $1}' | awk '{print length}') - apex_needs_reinstall=0 - - if [ "$has_apex" != "4" ]; then - apex_install_logic - else - check_if_apex_needs_reinstall - if [ "$apex_needs_reinstall" != "0" ]; then - echo " > Reinstalling Apex per demo version..." - python3 -m pip uninstall -y apex - apex_install_logic - else - echo " > Apex already installed!" 
- fi - fi - unset apex_needs_reinstall - unset has_apex -} - -megatron_install_logic() { - if [ ! -d "Megatron-LM" ]; then - git clone -b main https://github.com/NVIDIA/Megatron-LM.git - fi - - cd Megatron-LM || exit - MEGATRON_PATH="$(pwd)" - git config --global --add safe.directory "${MEGATRON_PATH}" - unset MEGATRON_PATH - - git checkout 992da75a1fd90989eb1a97be8d9ff3eca993aa83 - CMD="python3 -m pip install ./" - echo " >> ${CMD}" - eval "${CMD}" - unset CMD - cd ../ -} - -check_if_megatron_needs_reinstall() { - megatron_loc="$(pip show megatron-core | grep '^Location' | awk '{print $2}')" - megatron_dist_loc="$(find "${megatron_loc}" -depth -maxdepth 1 -name 'megatron*dist-info' -type d | head -n1)" - - check_if_managed_install "${megatron_dist_loc}" - megatron_needs_reinstall=${PACKAGE_NEEDS_REINSTALL} - - unset megatron_dist_loc - unset megatron_loc -} - -install_megatron() { - has_megatron=$(pip list | grep "^megatron-core " | grep "megatron-core" -o | awk '{print $1}' | awk '{print length}') - megatron_needs_reinstall=0 - - if [ "$has_megatron" != "13" ]; then - megatron_install_logic - else - check_if_megatron_needs_reinstall - if [ "$megatron_needs_reinstall" != "0" ]; then - echo " > Reinstalling Megatron per demo version..." - python3 -m pip uninstall -y megatron-core - megatron_install_logic - else - echo " > Megatron already installed!" - fi - fi - unset megatron_needs_reinstall - unset has_megatron -} - -flash_attention_install_logic() { - if [ ! -d "flash-attention" ]; then - git clone https://github.com/HazyResearch/flash-attention.git - fi - - cd flash-attention || exit - FLASH_ATTENTION_PATH="$(pwd)" - git config --global --add safe.directory "${FLASH_ATTENTION_PATH}" - unset FLASH_ATTENTION_PATH - - git checkout v1.0.6 - CMD="MAX_JOBS=${ARG_JOBS} python3 setup.py bdist_wheel" - echo " >> ${CMD}" - eval "${CMD}" - unset CMD - python3 -m pip install "$(find './dist' -name '*.whl' | head -n1)" - cd .. -} - -check_if_flash_attention_needs_reinstall() { - flash_attn_loc="$(pip show flash-attn | grep '^Location' | awk '{print $2}')" - flash_attn_dist_loc="$(find "${flash_attn_loc}" -depth -maxdepth 1 -name 'flash_attn*dist-info' -type d | head -n1)" - - check_if_managed_install "${flash_attn_dist_loc}" - flash_attn_needs_reinstall=${PACKAGE_NEEDS_REINSTALL} - - unset flash_attn_dist_loc - unset flash_attn_loc -} - -install_flash_attention() { - has_flashattn=$(pip list | grep "^flash-attn " | grep "flash-attn" -o | awk '{print $1}' | awk '{print length}') - flash_attn_needs_reinstall=0 - - if [ "$has_flashattn" != "10" ]; then - flash_attention_install_logic - else - check_if_flash_attention_needs_reinstall - if [ "$flash_attn_needs_reinstall" != "0" ]; then - echo " > Reinstalling flash_attn per demo version..." - python3 -m pip uninstall -y flash-attn - flash_attention_install_logic - else - echo " > flash-attention already installed!" - fi - fi - - unset flash_attn_needs_reinstall - unset has_flashattn -} - -transformer_engine_install_logic() { - if [ ! 
-d "TransformerEngine" ]; then - git clone https://github.com/NVIDIA/TransformerEngine.git - fi - - cd TransformerEngine || exit - TRANSFORMER_ENGINE_PATH="$(pwd)" - git config --global --add safe.directory "${TRANSFORMER_ENGINE_PATH}" - unset TRANSFORMER_ENGINE_PATH - - git checkout 804f120322a13cd5f21ea8268860607dcecd055c - git submodule update --recursive --init - CMD="MAKEFLAGS=-j${ARG_JOBS} MAX_JOBS=${ARG_JOBS} python3 setup.py bdist_wheel --framework=pytorch" - echo " >> ${CMD}" - eval "${CMD}" - unset CMD - python3 -m pip install "$(find './dist' -name '*.whl' | head -n1)" - cd .. - - # Check for common point of failure with TE. - has_te_loc=$(pip list | grep "^transformer-engine " | grep "transformer-engine" -o | awk '{print $1}' | awk '{print length}') - [ "$has_te_loc" != "18" ] && { - echo " > TransformerEngine install failed. Probable cause of failures:" - echo " - CUDNN location was not picked up. If your CUDNN include dir" - echo " is /path/to/cudnn/include and lib is /path/to/cudnn/lib, " - echo " Invoke the script as CUDNN_PATH=/path/to/cudnn sh install.sh ..." - exit 1 - } - unset has_te_loc -} - -check_if_transformer_engine_needs_reinstall() { - te_loc="$(pip show transformer-engine | grep '^Location' | awk '{print $2}')" - te_dist_loc="$(find "${te_loc}" -depth -maxdepth 1 -name 'transformer_engine*dist-info' -type d | head -n1)" - - check_if_managed_install "${te_dist_loc}" - te_needs_reinstall=${PACKAGE_NEEDS_REINSTALL} - - unset te_dist_loc - unset te_loc -} - -install_transformer_engine() { - has_te=$(pip list | grep "^transformer-engine " | grep "transformer-engine" -o | awk '{print $1}' | awk '{print length}') - te_needs_reinstall=0 - - if [ "$has_te" != "18" ]; then - transformer_engine_install_logic - else - check_if_transformer_engine_needs_reinstall - if [ "$te_needs_reinstall" != "0" ]; then - echo " > Reinstalling TransformerEngine per demo version..." - python3 -m pip uninstall -y transformer-engine - transformer_engine_install_logic - else - echo " > TransformerEngine already installed!" - fi - fi - - unset te_needs_reinstall - unset has_te - - # Patch TE files. - sh "${NEMO_DIR}/patch_te.sh" -} - -nemo_install_logic() { - if [ ! -d "NeMo" ]; then - git clone --branch main --single-branch https://github.com/NVIDIA/NeMo.git NeMo - fi - - cd NeMo || exit - NeMo_PATH="$(pwd)" - git config --global --add safe.directory "${NeMo_PATH}" - unset NeMo_PATH - - git checkout bf270794267e0240d8a8b2f2514c80c6929c76f1 - bash reinstall.sh - cd ../ -} - -check_if_nemo_needs_reinstall() { - nemo_loc="$(pip show nemo-toolkit | grep '^Location' | awk '{print $2}')" - nemo_dist_loc="$(find "${nemo_loc}" -depth -maxdepth 1 -name 'nemo_toolkit*dist-info' -type d | head -n1)" - - check_if_managed_install "${nemo_dist_loc}" - nemo_needs_reinstall=${PACKAGE_NEEDS_REINSTALL} - - unset nemo_dist_loc - unset nemo_loc -} - -install_nemo() { - has_nemo=$(pip list | grep "^nemo-toolkit " | grep "nemo-toolkit" -o | awk '{print $1}' | awk '{print length}') - nemo_needs_reinstall=0 - - if [ "$has_nemo" != "12" ]; then - nemo_install_logic - else - check_if_nemo_needs_reinstall - if [ "$nemo_needs_reinstall" != "0" ]; then - echo " > Reinstalling NeMo per demo version..." - python3 -m pip uninstall -y nemo-toolkit - nemo_install_logic - else - echo " > NeMo already installed!" 
- fi - fi -} - -while [ "$#" -gt 0 ]; do - case $1 in - --deps) - DEPENDENCIES_DIR="$2" - shift - ;; - -j | --jobs) - ARG_JOBS="$2" - shift - ;; - --ninja) BUILD_NINJA=1 ;; - --skipsrc) BUILD_SRCLIBS=0 ;; - -h | --help) ARG_HELP=1 ;; - *) - echo "Unknown parameter passed: $1" - echo "For help type: $0 --help" - exit 1 - ;; - esac - shift -done - -if [ "$ARG_HELP" -eq "1" ]; then - echo "Usage: sh $0 [options]" - echo "All arguments are optional." - echo " --help or -h : Print this help menu." - echo " [--deps] {temp} : Path to download and build dependencies." - echo " [-j | --jobs] {1} : Number of jobs to use for building from source." - echo " [--ninja] : Flag to build ninja (if not present) to speed up installation." - # skipsrc is not documented to prevent users from invoking it directly. - exit -fi - -DEPENDENCIES_DIR="${NEMO_DIR}/${DEPENDENCIES_DIR}" -echo " > Using ${DEPENDENCIES_DIR}' to store dependencies." -mkdir -p "${DEPENDENCIES_DIR}" -install_essential_tools "${DEPENDENCIES_DIR}" - -echo " > Installing Requirements.txt..." -pip install --upgrade pip -pip install nvidia-pyindex || { - echo "Could not install nvidia-pyindex, stopping install" - exit 1 -} -# # One of the hidden dependencies require Cython, but doesn't specify it. -# # https://github.com/VKCOM/YouTokenToMe/pull/108 -# # WAR by installing Cython before requirements. -pip install "Cython==0.29.36" || { - echo "Could not install Cython, stopping install" - exit 1 -} -# PyYaml, Cython and pip don't play well together. -# https://github.com/yaml/pyyaml/issues/601 -pip install "pyyaml==5.4.1" --no-build-isolation || { - echo "Could not install PyYaml, stopping install" - exit 1 -} -# Install a specific version of opencc to WAR a GLIBC not found error. -pip install "opencc==1.1.6" || { - echo "Could not install OpenCC, stopping install" - exit 1 -} -pip install -r requirements.txt || { - echo "Could not install dependencies, stopping install" - exit 1 -} - -# Installation from source -if [ "$BUILD_SRCLIBS" -eq "1" ]; then - (command -v -- "ninja" >/dev/null 2>&1) || [ "$BUILD_NINJA" -eq "0" ] && echo " > Could not locate ninja, consider passing the --ninja flag to speedup dependency installation." -fi - -cd "${DEPENDENCIES_DIR}" || exit -if (! command -v -- "ninja" >/dev/null 2>&1) && [ "$BUILD_NINJA" -eq "1" ]; then - echo " > Building ninja..." - install_ninja -fi - -if [ "$BUILD_SRCLIBS" -eq "1" ]; then - echo " > Installing Apex..." - install_apex -fi - -echo " > Installing Megatron-LM..." -install_megatron - -if [ "$BUILD_SRCLIBS" -eq "1" ]; then - echo " > Installing flash-attention..." - install_flash_attention -fi - -if [ "$BUILD_SRCLIBS" -eq "1" ]; then - echo " > Installing TransformerEngine..." - install_transformer_engine -fi - -echo " > Installing NeMo..." -install_nemo - -if [ ! -f "${NEMO_DIR}/GPT3/convert_te_onnx_to_trt_onnx.py" ]; then - echo " > Copying opset19 conversion script..." - if [ ! 
-f "${SCRIPT_DIR}/convert_te_onnx_to_trt_onnx.py" ]; then - echo "Opset19 conversion script is not located at /scripts/convert_te_onnx_to_trt_onnx.py" - return 1 - fi - cp "${SCRIPT_DIR}/convert_te_onnx_to_trt_onnx.py" "${NEMO_DIR}/GPT3/convert_te_onnx_to_trt_onnx.py" -fi - -cd ../ - -unset ARG_HELP -unset ARG_JOBS -unset BUILD_NINJA -unset DEPENDENCIES_DIR -unset SCRIPT_DIR -unset DEMO_DIR -unset NEMO_DIR diff --git a/demo/NeMo/interface.py b/demo/NeMo/interface.py deleted file mode 100644 index ec3dcbf7e..000000000 --- a/demo/NeMo/interface.py +++ /dev/null @@ -1,727 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from datetime import datetime -import os -import random -import sys -import time -from typing import List, Union, Dict -from copy import copy - -from cuda import cuda -from tqdm import tqdm -import numpy as np -import torch - -from transformers import PretrainedConfig -from omegaconf import OmegaConf, listconfig - -# Add syspath for custom library -if __name__ == "__main__": - filepath = os.path.dirname(os.path.abspath(__file__)) - project_root = os.path.join(filepath, os.pardir) - sys.path.append(project_root) - -from GPT3.decoding import full_inference, generate, process_output -from GPT3.GPT3ModelConfig import GPT3ModelTRTConfig -from GPT3.lambada_dataset import Lambada -from GPT3.nemo_utils import get_computeprob_response -from GPT3.sequence_perplexity import SequencePerplexity - -sys.path.append('../HuggingFace') # Include HuggingFace -from NNDF.general_utils import NNFolderWorkspace -from NNDF.logger import G_LOGGER -from NNDF.networks import ( - Precision, - NetworkMetadata, - TimingProfile, - BenchmarkingResult, - NetworkResult, - NetworkCheckpointResult, -) -from NNDF.interface import NetworkCommand - -# Manually set by referring to examples/nlp/language_modeling/conf/megatron_gpt_config.yaml -# If a field cannot be found, set to None. 
-DEFAULT_CONFIG = { - "is_encoder_decoder": False, - "is_decoder": True, - "architectures": [ "GPT3NeMoModel" ], -} - -GPT3CONFIG_MAPPINGS = { - "gpt-126m": PretrainedConfig.from_dict(dict({"_name_or_path": "gpt-126m", - "num_heads": 12, - "num_layers": 12, - "hidden_size": 768, - "max_position_embeddings": 2048, - "min_seq_len": 0, - }, **DEFAULT_CONFIG)), - "gpt-1.3b": PretrainedConfig.from_dict(dict({"_name_or_path": "gpt-1.3b", - "num_heads": 16, - "num_layers": 24, - "hidden_size": 2048, - "max_position_embeddings": 2048, - "min_seq_len": 0, - }, **DEFAULT_CONFIG)), - "gpt-5b": PretrainedConfig.from_dict(dict({"_name_or_path": "gpt-5b", - "num_heads": 32, - "num_layers": 24, - "hidden_size": 4096, - "max_position_embeddings": 2048, - "min_seq_len": 16, - }, **DEFAULT_CONFIG)), -} - -def _hf_hub_metadata(variant: str, fp8: bool) -> Dict[str, str]: - repo_mappings = { - "gpt-1.3b": "nvidia/nemo-megatron-gpt-1.3B", - "gpt-5b": "nvidia/nemo-megatron-gpt-5B", - } - - try: - repo_id = repo_mappings[variant] - except KeyError: - raise RuntimeError( - "Variant should be one of {}, got {}".format( - list(repo_mappings.keys()), variant - ) - ) - - file_key = (variant, "fp8" if fp8 else "fp16") - file_mappings = { - ("gpt-1.3b", "fp8"): ("nemo_gpt1.3B_fp16.nemo", None), - ("gpt-1.3b", "fp16"): ("nemo_gpt1.3B_fp16.nemo", None), - ("gpt-5b", "fp8"): ("nemo_gpt5B_fp8_bf16_tp1.nemo", "fp8"), - ("gpt-5b", "fp16"): ("nemo_gpt5B_fp16_tp1.nemo", None), - } - - try: - filename, branch = file_mappings[file_key] - except KeyError: - raise RuntimeError( - "Downloading nemo file for variant : {}, precision : {} from huggingface hub is unsupported. Consider passing a nemo-model or onnx-model from the command line.".format( - file_key[0], file_key[1] - ) - ) - - return {"repo_id": repo_id, "filename": filename, "revision": branch} - - -def download_model(dst_dir: str, cache_dir: str, *args, **kwargs) -> str: - from huggingface_hub import hf_hub_download - - os.makedirs(dst_dir, exist_ok=True) - os.makedirs(cache_dir, exist_ok=True) - - model_metadata = _hf_hub_metadata(*args, **kwargs) - return hf_hub_download( - local_dir=str(dst_dir), - local_dir_use_symlinks="auto", - cache_dir=cache_dir, - **model_metadata, - ) - - -def load_dataset(dataset_name, base_dir, tokens_to_generate, padding): - ds_map = {"Lambada": Lambada(base_dir, tokens_to_generate, padding)} - return ds_map[dataset_name] - -def get_accuracy_metric(cfg): - topN = [int(i.strip()) for i in cfg.top_n.split(",")] - m_map = {"Perplexity": SequencePerplexity(topN)} - return m_map[cfg.metric] - -def remove_padded_prompts(output, nb_paddings): - if nb_paddings == 0: - return output - result = {} - for k, v in output.items(): - if v != None and (type(v) is list or type(v) is torch.Tensor): - v = v[:-nb_paddings] - result[k] = v - return result - -def get_random_input(tokenizer, batch_size, in_seq_len, out_seq_len): - vocab_size = tokenizer.tokenizer.vocab_size - return (torch.randint(0, vocab_size, (batch_size, in_seq_len + out_seq_len), dtype=torch.int64).cuda(), - (torch.ones(batch_size, dtype=torch.int64) * in_seq_len).cuda()) - -class BaseModel(torch.nn.Module): - def __init__(self): - super(BaseModel, self).__init__() - self.model = None - def forward(self, x): - raise Exception("BaseModel forward method is not intended to be called.") - -class NeMoCommand(NetworkCommand): - def __init__( - self, - nemo_cfg, - config_class, - description, - **kwargs - ): - self.nemo_cfg = nemo_cfg - super().__init__(config_class, description, **kwargs) - - def 
validate_and_set_precision(self, fp8, fp16, bf16, use_fp8_storage, quantize_bmms): - if fp8: - if fp16: - G_LOGGER.info("Use FP8-FP16 precision.") - if bf16: - G_LOGGER.info("Use FP8-BF16 precision.") - elif fp16: - G_LOGGER.info("Use pure FP16 precision.") - elif bf16: - G_LOGGER.info("Use pure BF16 precision.") - else: - fp16 = True - G_LOGGER.warn("Precision is not specified. Use pure FP16 precision by default.") - - self.fp8, self.fp16, self.bf16 = fp8, fp16, bf16 - self.nemo_cfg.trt_export_options.use_fp8 = fp8 - self.nemo_cfg.trt_export_options.use_fp16 = fp16 - self.nemo_cfg.trt_export_options.use_bf16 = bf16 - self.nemo_cfg.onnx_export_options.use_fp8_storage = use_fp8_storage - self.nemo_cfg.onnx_export_options.quantize_bmms = quantize_bmms - - if fp16: - self.nemo_cfg.trainer.precision = "16" - elif bf16: - self.nemo_cfg.trainer.precision = "bf16" - else: - self.nemo_cfg.trainer.precision = "32" - - def update_hyperparams(self, model_config): - self.nemo_cfg.model.num_layers = model_config.num_layers - self.nemo_cfg.model.nb_heads = model_config.num_heads - self.nemo_cfg.model.head_size = model_config.hidden_size // model_config.num_heads - self.nemo_cfg.model.hidden_size = model_config.hidden_size - self.nemo_cfg.model.encoder_seq_length = model_config.max_position_embeddings - self.nemo_cfg.model.max_position_embeddings = model_config.max_position_embeddings - - def setup_environment( - self, - variant: str, - working_dir: str = "temp", - batch_size: int = 1, - num_beams: int = 1, - use_cache: bool = True, - verbose: bool = False, - info: bool = False, - iterations: int = None, - warmup: int = None, - number: int = None, - duration: int = None, - percentile: int = None, - cleanup: bool = False, - action: str = None, - max_seq_len: int = None, - fp8: bool = True, - fp16: bool = False, - bf16: bool = False, - use_fp8_storage: bool = False, - quantize_bmms: bool = False, - input_seq_len: int = None, - output_seq_len: int = None, - nemo_model: str = None, - nemo_checkpoint: str = None, - nemo_hparams: str = None, - onnx_model: str = None, - **kwargs, - ) -> None: - """ - Use Arguments from command line or user specified to setup config for the model. 
- """ - self.validate_and_set_precision(fp8, fp16, bf16, use_fp8_storage, quantize_bmms) - - if not torch.cuda.is_available(): - raise EnvironmentError("GPU is required for NeMo demo.") - - # Initialize CUDA Driver API - err, = cuda.cuInit(0) - if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError("Cuda initialization failed with error: {}".format(err)) - - # See https://pytorch.org/docs/stable/_modules/torch.html#set_float32_matmul_precision - torch.set_float32_matmul_precision('medium') - - if max_seq_len != None: - self.nemo_cfg.model.max_seq_len = max_seq_len - - assert action != None, "Action must be specified" - if action == "accuracy": - self.nemo_cfg.mode = "accuracy" - self.nemo_cfg.inference.compute_logprob = True - self.nemo_cfg.inference.all_probs = True - self.nemo_cfg.inference.greedy = True - self.nemo_cfg.inference.add_BOS = False - self.nemo_cfg.inference.tokens_to_generate = 1 - self.nemo_cfg.inference.min_tokens_to_generate = 0 - self.nemo_cfg.inference.temperature = 1.0 - self.nemo_cfg.inference.top_k = 0 - self.nemo_cfg.inference.top_p = 0.9 - self.nemo_cfg.inference.repetition_penalty = 1.0 - elif action == "benchmark": - self.nemo_cfg.mode = "benchmark" - if input_seq_len != None: - self.nemo_cfg.benchmark.input_seq_len = input_seq_len - if output_seq_len != None: - self.nemo_cfg.benchmark.output_seq_len = output_seq_len - self.nemo_cfg.inference.tokens_to_generate = self.nemo_cfg.benchmark.output_seq_len - self.nemo_cfg.inference.min_tokens_to_generate = self.nemo_cfg.benchmark.output_seq_len - - if self.nemo_cfg.model.max_seq_len < (self.nemo_cfg.benchmark.input_seq_len + self.nemo_cfg.benchmark.output_seq_len): - raise ValueError(f"Max sequence length of the model needs to be greater than or equal to the sum of input sequence length and output sequence length. Got {self.nemo_cfg.model.max_seq_len} < {self.nemo_cfg.benchmark.input_seq_len} + {self.nemo_cfg.benchmark.output_seq_len}.") - - if (nemo_model or nemo_checkpoint) and onnx_model: - raise RuntimeError( - "Both nemo-model and onnx-model cannot be specified together. Please specify either nemo-model or onnx-model." - ) - - assert variant in GPT3CONFIG_MAPPINGS - model_config = GPT3CONFIG_MAPPINGS[variant] - - if self.nemo_cfg.model.max_seq_len > model_config.max_position_embeddings: - G_LOGGER.warn( - f"Updating max_position_embeddings to be the same as max_seq_len {self.nemo_cfg.model.max_seq_len}." - ) - G_LOGGER.warn( - f"Outputs longer than {model_config.max_position_embeddings} might be unmeaningful." - ) - model_config.max_position_embeddings = self.nemo_cfg.model.max_seq_len - - if self.nemo_cfg.model.max_seq_len < model_config.min_seq_len: - G_LOGGER.warn( - f"Force updating max_seq_len to minimum required length {model_config.min_seq_len}." - ) - self.nemo_cfg.model.max_seq_len = model_config.min_seq_len - - self.nemo_cfg.batch_size = batch_size - self.nemo_cfg.use_cache = use_cache - - if nemo_checkpoint != None: - # Set NeMo checkpoint configs - self.nemo_cfg.checkpoint_dir = os.path.dirname(nemo_checkpoint) - if not self.nemo_cfg.checkpoint_dir: - raise ValueError(f"NeMo checkpoint needs to be provided with full path.") - self.nemo_cfg.checkpoint_name = os.path.basename(nemo_checkpoint) - self.nemo_cfg.hparams_file = nemo_hparams - else: - if onnx_model != None: - G_LOGGER.info(f"Using onnx model {onnx_model} for inference.") - if os.path.exists(onnx_model): - self.nemo_cfg.onnx_model_file = onnx_model - else: - raise IOError( - f"Could not find the specified onnx file {onnx_model}." 
- ) - else: - if nemo_model != None: - if os.path.exists(nemo_model): - self.nemo_cfg.gpt_model_file = nemo_model - else: - raise IOError( - f"Could not find the specified nemo file {nemo_model}." - ) - else: - G_LOGGER.info("Downloading nemo model from HuggingFace Hub") - # Download nemo model if it does not exist. - # Setup temporary metadata, config to create a workspace to put the - # downloaded artefacts in - download_metadata = NetworkMetadata( - variant=variant, - precision=Precision(fp16=self.fp16), - use_cache=use_cache, - num_beams=num_beams, - batch_size=batch_size - ) - - download_config = self.config_class(metadata=download_metadata) - download_config.from_nemo_config(copy(self.nemo_cfg)) - download_workspace = NNFolderWorkspace(download_config, working_dir) - - self.nemo_cfg.gpt_model_file = download_model( - dst_dir=download_workspace.dpath + "/artefacts", - cache_dir=download_workspace.dpath + "/cache", - variant=variant, - fp8=fp8, - ) - - if self.nemo_cfg.gpt_model_file == None and self.nemo_cfg.checkpoint_dir == None and onnx_model == None: - G_LOGGER.error("No model exists based on specified configs and precisions.") - raise ValueError("Model not found.") - - self.update_hyperparams(model_config) - - # HuggingFace code - if verbose: - G_LOGGER.setLevel(level=G_LOGGER.DEBUG) - elif info: - G_LOGGER.setLevel(level=G_LOGGER.INFO) - - if variant is None: - G_LOGGER.error("You need to specify --variant to run NeMo demo") - return - - if self._args is not None: - G_LOGGER.info("Setting up environment with arguments: {}".format(self._args)) - else: - G_LOGGER.info("User-customized API is called") - - self.metadata = NetworkMetadata( - variant=variant, - precision=Precision(fp16=self.fp16), - use_cache=use_cache, - num_beams=num_beams, - batch_size=batch_size - ) - - self.config = self.config_class( - metadata = self.metadata - ) - - self.config.from_nemo_config(self.nemo_cfg) - - self.workspace = NNFolderWorkspace( - self.config, working_dir - ) - - self.timing_profile = TimingProfile( - iterations=iterations, - number=number, - warmup=warmup, - duration=duration, - percentile=percentile, - ) - - self.keep_torch_model = not cleanup - self.keep_onnx_model = not cleanup - self.keep_trt_engine = not cleanup - - self.process_framework_specific_arguments(onnx_model=onnx_model, **kwargs) - - def process_framework_specific_arguments(self, **kwargs): - pass - - def run(self) -> Union[List[NetworkResult], BenchmarkingResult]: - """ - Main entry point of our function which compiles and generates our model data for command-line mode. 
- The general process for the commands are all the same: - (1) Download the model - (2) Run either checkpoint or benchmark - (3) Returns the result - """ - t0 = time.time() - self.models = self.setup_tokenizer_and_model() - t1 = time.time() - G_LOGGER.info("setup_tokenizer_and_model() takes {:.4f}s in total.".format(t1 - t0)) - - results = [] - ppl = None - random.seed(self.nemo_cfg.inference.seed) - np.random.seed(self.nemo_cfg.inference.seed) - torch.manual_seed(self.nemo_cfg.inference.seed) - if self.nemo_cfg.mode == "accuracy": - G_LOGGER.debug("Run in accuracy mode.") - eval_ppl = get_accuracy_metric(self.nemo_cfg.accuracy) - has_align_requirement = self.nemo_cfg.runtime == 'nemo' and hasattr(self.model.cfg, "fp8") and self.model.cfg.fp8 == True - if has_align_requirement and self.nemo_cfg.accuracy.tokens_to_generate > 1: - self.nemo_cfg.accuracy.tokens_to_generate = 1 - G_LOGGER.warn("Force set tokens_to_generate=1 for FP8 run in NeMo framework.") - dataset = load_dataset(self.nemo_cfg.accuracy.dataset, self.workspace.rootdir, self.nemo_cfg.accuracy.tokens_to_generate, 8 if has_align_requirement else -1) - tokenizer = self.tokenizer - - def eval_ppl_with_batch_input(eval_ppl, batch_input): - ds_input = dataset.preprocess_input(tokenizer, batch_input) - self.nemo_cfg.inference.tokens_to_generate = self.nemo_cfg.accuracy.tokens_to_generate - self.nemo_cfg.inference.min_tokens_to_generate = self.nemo_cfg.accuracy.tokens_to_generate - - inputs = ds_input.inputs - response = full_inference( - model=self.model, - inputs=inputs, - cfg=self.nemo_cfg, - ) - - # It is still predication task even when tokens_to_generate > 1, so we need restore the context length. - batch_size = ds_input.inputs[0].shape[0] - real_ctx_length = ds_input.inputs[0].shape[1] - 1 - inputs = (ds_input.inputs[0], torch.ones(batch_size, dtype=torch.int32) * real_ctx_length) - - response = get_computeprob_response(tokenizer, response, inputs) - eval_ppl.update(ds_input=ds_input, response=response, tokenizer=tokenizer) - - batch_input = [] - for doc in tqdm(dataset.load()): - batch_input.append(doc) - - if len(batch_input) == self.nemo_cfg.batch_size: - eval_ppl_with_batch_input(eval_ppl, batch_input) - batch_input.clear() - - if len(batch_input): - # Pad empty text to batch size - while (len(batch_input) % self.nemo_cfg.batch_size) != 0: - batch_input.append({"text": ""}) - eval_ppl_with_batch_input(eval_ppl, batch_input) - - ppl, sequence_ppl, _, acc_text = eval_ppl.compute() - print("***************************") - print("{} ppl(last token): {:.4f}, ppl(sequence): {:.4f}, {}".format(self.nemo_cfg.accuracy.dataset, ppl, sequence_ppl, acc_text)) - print("***************************") - elif self.nemo_cfg.mode == "benchmark": - G_LOGGER.debug("Run in benchmark mode.") - rand_input = get_random_input(self.model.tokenizer, self.nemo_cfg.batch_size, self.nemo_cfg.benchmark.input_seq_len, self.nemo_cfg.benchmark.output_seq_len) - - for _ in range(self.timing_profile.warmup): - output = full_inference(self.model, rand_input, self.nemo_cfg) - - class BenchmarkTimer: - def __init__(self, name): - self.name = name - self.started = False - self.start_time = None - self.times = [] - - def start(self): - assert not self.started - self.started = True - self.start_time = time.perf_counter() - - def end(self): - assert self.started - self.started = False - self.times.append(time.perf_counter() - self.start_time) - - def stats_str(self, num_tokens): - total_time = sum(self.times) - avg_time = total_time / float(len(self.times)) - 
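# Note: once self.times is sorted in ascending order below, int(len(self.times) * 0.95) and int(len(self.times) * 0.99) index the 95th- and 99th-percentile iteration latencies; throughput is the number of generated tokens divided by the average iteration time.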
self.times.sort() - percentile95 = self.times[int(len(self.times) * 0.95)] - percentile99 = self.times[int(len(self.times) * 0.99)] - throughput = float(num_tokens) / avg_time - return("[{:10s}] Total Time: {:0.5f} s, Average Time: {:0.5f} s, 95th Percentile Time: {:0.5f} s, 99th Percentile Time: {:0.5f} s, Throughput: {:0.2f} tokens/s".format(self.name, total_time, avg_time, percentile95, percentile99, throughput)) - - G_LOGGER.info("Warm up finished. Start benchmarking...") - e2e_timer = BenchmarkTimer("E2E inference") - core_timer = BenchmarkTimer("Without tokenizer") - start_time = datetime.now() - iter_idx = 0 - cur_duration = 0 - while iter_idx < self.timing_profile.iterations or cur_duration < self.timing_profile.duration: - core_timer.start() - e2e_timer.start() - output = generate(self.model, rand_input, self.nemo_cfg) - core_timer.end() - - output = process_output(self.model, output) - e2e_timer.end() - - iter_idx += 1 - cur_duration = (datetime.now() - start_time).total_seconds() - - num_tokens = self.nemo_cfg.batch_size * self.nemo_cfg.benchmark.output_seq_len - print("***************************") - print(f"Running {iter_idx} iterations with duration: {cur_duration}s, batch size: {self.nemo_cfg.batch_size}, input sequence length: {self.nemo_cfg.benchmark.input_seq_len} and output sequence length: {self.nemo_cfg.benchmark.output_seq_len}") - print(f"{e2e_timer.stats_str(num_tokens)}") - print(f"{core_timer.stats_str(num_tokens)}") - print("***************************") - else: - G_LOGGER.debug("Run in inference mode.") - assert self.nemo_cfg.mode == "inference" - if self.nemo_cfg.runtime == 'nemo' and hasattr(self.model.cfg, "fp8") and self.model.cfg.fp8 == True and self.nemo_cfg.batch_size % 8 != 0: - new_batch_size = ((self.nemo_cfg.batch_size + 7) // 8) * 8 - print("Update batch size from {} to {} for NeMo FP8 inference.".format(self.nemo_cfg.batch_size, new_batch_size)) - self.nemo_cfg.batch_size = new_batch_size - - nb_paddings = 0 - while (len(self.nemo_cfg.prompts) % self.nemo_cfg.batch_size) != 0: - self.nemo_cfg.prompts.append(self.nemo_cfg.prompts[-1]) - nb_paddings += 1 - - batch_idx = 0 - start = 0 - while True: - inputs = OmegaConf.to_container(listconfig.ListConfig(self.nemo_cfg.prompts[start:start+self.nemo_cfg.batch_size])) - output = full_inference(self.model, inputs, self.nemo_cfg) - output = remove_padded_prompts(output, nb_paddings) - print("***************************") - print("Batch {}: {}".format(batch_idx, output)) - print("***************************") - batch_idx += 1 - start += self.nemo_cfg.batch_size - if start >= len(self.nemo_cfg.prompts): - break - - t2 = time.time() - G_LOGGER.info("Inference session is {:.4f}s in total.".format(t2 - t1)) - - # Release runtime objects - if self.nemo_cfg.runtime == 'onnx': - del self.model.onnxrt - elif self.nemo_cfg.runtime == 'trt': - del self.model.trt - - return results, ppl - - def add_args(self) -> None: - general_group = self._parser.add_argument_group("general") - general_group.add_argument( - "--help", - "-h", - help="Shows help message for NeMo commands.", - action="store_true", - ) - general_group.add_argument( - "--verbose", "-v", - help="Display verbose logs.", - action="store_true" - ) - general_group.add_argument( - "--info", help="Display info logs.", action="store_true" - ) - general_group.add_argument( - "--working-dir", "-wd", - help="Location of where to save the model and other downloaded files.", - required=True, - ) - - timing_group = self._parser.add_argument_group("inference 
measurement") - timing_group.add_argument( - "--duration", - type=int, - help="Minimal duration of inference iterations to measure in seconds.", - default=NetworkCommand.DEFAULT_DURATION, - ) - timing_group.add_argument( - "--iterations", - type=int, - help="Number of iterations to measure.", - default=NetworkCommand.DEFAULT_ITERATIONS, - ) - timing_group.add_argument( - "--warmup", - type=int, - help="Number of warmup iterations before actual measurement occurs.", - default=NetworkCommand.DEFAULT_WARMUP, - ) - - model_config_group = self._parser.add_argument_group("model") - model_config_group.add_argument( - "--nemo-model", - help="Set a NeMo model to be used.", - type=str, - default=None - ) - model_config_group.add_argument( - "--nemo-checkpoint", - help="Set a NeMo checkpoint to be used.", - type=str, - default=None - ) - model_config_group.add_argument( - "--nemo-hparams", - help="Set a NeMo hparams.yaml to be used.", - type=str, - default=None - ) - model_config_group.add_argument( - "--onnx-model", - help="Set a onnx model (exported from a NeMo model) to be used. See `export_utils.py` in the model directory for exporting onnx files", - type=str, - default=None, - ) - model_config_group.add_argument( - "--max-seq-len", - help="Set maximum sequence lengths used for a GPT model.", - type=int, - default=None, - ) - model_config_group.add_argument( - "--batch-size", "-b", - help="Set batch size for inference", - required=False, - type=int, - default=1 - ) - model_config_group.add_argument( - "--variant", "-m", - help="Model to generate", - required=True, - choices=GPT3ModelTRTConfig.TARGET_MODELS, - ) - model_config_group.add_argument( - "--use-cache", - "-kv", - help="Enable KV cache", - action="store_true", - default=False, - ) - model_config_group.add_argument( - "--fp8", - action="store_true", - help="Use FP8 precision.", - default=False - ) - model_config_group.add_argument( - "--fp16", - action="store_true", - help="Use FP16 precision.", - default=False - ) - model_config_group.add_argument( - "--bf16", - action="store_true", - help="Use BF16 precision.", - default=False - ) - model_config_group.add_argument( - "--use-fp8-storage", - action="store_true", - help="Use FP8 storage precision.", - default=False - ) - model_config_group.add_argument( - "--quantize-bmms", - help="Quantize attention BMMs", - action="store_true", - default=False, - ) - - def __call__(self): - t0 = time.time() - self.add_args() - self._args = self._parser.parse_args() - if "help" in self._args and self._args.help == True: - self._parser.print_help() - exit(0) - - self.setup_environment( - **vars(self._args), - ) - t1 = time.time() - G_LOGGER.info("Set up environment takes {:.4f}s.".format(t1 - t0)) - - network_results, ppl_results = self.run() - return NetworkCheckpointResult( - network_results=network_results, - accuracy=0, - perplexity=0, - ) diff --git a/demo/NeMo/nemo_export.py b/demo/NeMo/nemo_export.py deleted file mode 100644 index b9f5ad3a9..000000000 --- a/demo/NeMo/nemo_export.py +++ /dev/null @@ -1,922 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -import subprocess as sp -import shlex -import omegaconf -import os -import sys -import warnings -from typing import Dict, List, Optional, Tuple -import numpy as np - -# nemo -from nemo.core import ModelPT -from nemo.core.classes import Exportable -from nemo.core.neural_types import ChannelType, NeuralType -from nemo.utils.export_utils import augment_filename -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel, MegatronGPTExportableModel - -# onnx -import onnx -import onnx_graphsurgeon as gs - -# polygraphy -from polygraphy.backend.trt import Profile, CreateConfig, engine_from_network, NetworkFromOnnxPath, save_engine -from polygraphy.logger import G_LOGGER as PG_LOGGER - -import torch -import transformer_engine - -if __name__ == "__main__": - filepath = os.path.dirname(os.path.abspath(__file__)) - project_root = os.path.join(filepath, os.pardir, "HuggingFace") - sys.path.append(project_root) - -# Add syspath for custom library -from GPT3.nemo_utils import load_nemo_model, release_nemo_model -from GPT3.convert_te_onnx_to_trt_onnx import replace_customop_qdq_with_onnx_qdq - -# HuggingFace utils -from NNDF.logger import G_LOGGER -from NNDF.models import _calculate_polygraphy_verbosity - -# ONNX conversion script - -# Set polygraphy logging level here. -PG_LOGGER.module_severity = PG_LOGGER.INFO - -class MegatronGPTSingleInputExportableModel(MegatronGPTExportableModel): - """ - Wrapper for MegatronGPTExportableModel to export ONNX with a single input - """ - - def __init__(self, model, max_seq_len): - super().__init__(model) - self.cfg = model.cfg - self.max_seq_len = max_seq_len - - def forward(self, tokens): - def model_forward(tokens): - position_ids, attention_mask = self.get_position_ids_and_mask(tokens, self.max_seq_len) - assert tokens.shape == position_ids.shape - assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] - return self.model.forward( - tokens=tokens.cuda(), - text_position_ids=position_ids.cuda(), - attention_mask=attention_mask.cuda(), - labels=None, - ) - - with torch.no_grad(), torch.inference_mode(), torch.autocast( - 'cuda', dtype=self.dtype - ), warnings.catch_warnings(): - warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') - if self.fp8_enabled: - with transformer_engine.pytorch.onnx_export(self.fp8_enabled), transformer_engine.pytorch.fp8_autocast( - enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe - ): - output_tensor = model_forward(tokens) - else: - output_tensor = model_forward(tokens) - return output_tensor - - def get_position_ids_and_mask(self, data, max_seq_len): - seq_len = data.size()[1] - # Attention mask (lower triangular). - attention_mask = torch.tril(torch.ones( - (1, max_seq_len, max_seq_len), device=data.device)).view( - 1, 1, max_seq_len, max_seq_len) - - # Position ids. 
- position_ids = torch.arange(max_seq_len, dtype=torch.long, - device=data.device) - position_ids = position_ids[:seq_len].unsqueeze(0).expand_as(data) - - # Convert attention mask to binary: - attention_mask = (attention_mask < 0.5) - - return position_ids, attention_mask[:1, :1, :seq_len, :seq_len] - - def input_example(self): - ids = self.model.tokenizer.text_to_ids("how is the weather on Sunday morning?") - id_tensors = torch.unsqueeze(torch.LongTensor(ids), dim=0) - G_LOGGER.debug(f"Calling input_example shape {id_tensors.shape}") - return id_tensors, # return a tuple - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "input_ids": NeuralType(('B', 'T'), ChannelType()), - } - - @property - def input_names(self) -> List[str]: - return ['input_ids'] - -def get_trtexec_cmd(onnx_fpath, cfg, bs): - max_seq_len = cfg.model.max_seq_len - opt_seq_len = cfg.trt_export_options.opt_seq_len if cfg.trt_export_options.opt_seq_len else (max_seq_len // 2) - trtexec_cmd = f"trtexec --onnx={onnx_fpath}" - min_shapes = f"--minShapes=input_ids:{bs}x1" - opt_shapes = f"--optShapes=input_ids:{bs}x{opt_seq_len}" - max_shapes = f"--maxShapes=input_ids:{bs}x{max_seq_len}" - if not cfg.use_one_input: - min_shapes += f",position_ids:{bs}x1" - opt_shapes += f",position_ids:{bs}x{opt_seq_len}" - max_shapes += f",position_ids:{bs}x{max_seq_len}" - if not cfg.trt_export_options.use_fp8: - min_shapes += ",attention_mask:1x1x1x1" - opt_shapes += f",attention_mask:1x1x{opt_seq_len}x{opt_seq_len}" - max_shapes += f",attention_mask:1x1x{max_seq_len}x{max_seq_len}" - - if cfg.use_cache: - trtexec_cmd += " --profile=0" - nbheads, headsize = cfg.model.nb_heads, cfg.model.head_size - input_k = get_past_key_name('*') - input_v = get_past_value_name('*') - # ("sequence", "batch", nbheads, headsize) - min_shapes += f",{input_k}:0x{bs}x{nbheads}x{headsize},{input_v}:0x{bs}x{nbheads}x{headsize}" - opt_shapes += f",{input_k}:0x{bs}x{nbheads}x{headsize},{input_v}:0x{bs}x{nbheads}x{headsize}" - max_shapes += f",{input_k}:0x{bs}x{nbheads}x{headsize},{input_v}:0x{bs}x{nbheads}x{headsize}" - trtexec_cmd += f" {min_shapes} {opt_shapes} {max_shapes}" - - if cfg.use_cache: - trtexec_cmd += " --profile=1" - - min_shapes = f"--minShapes=input_ids:{bs}x1" - opt_shapes = f"--optShapes=input_ids:{bs}x1" - max_shapes = f"--maxShapes=input_ids:{bs}x1" - if not cfg.use_one_input: - min_shapes += f",position_ids:{bs}x1" - opt_shapes += f",position_ids:{bs}x1" - max_shapes += f",position_ids:{bs}x1" - if not cfg.trt_export_options.use_fp8: - min_shapes += ",attention_mask:1x1x1x1" - opt_shapes += f",attention_mask:1x1x{opt_seq_len}x{opt_seq_len}" - max_shapes += f",attention_mask:1x1x{max_seq_len}x{max_seq_len}" - - nbheads, headsize = cfg.model.nb_heads, cfg.model.head_size - input_k = get_past_key_name('*') - input_v = get_past_value_name('*') - # ("sequence", "batch", nbheads, headsize) - min_shapes += f",{input_k}:1x{bs}x{nbheads}x{headsize},{input_v}:1x{bs}x{nbheads}x{headsize}" - opt_shapes += f",{input_k}:{opt_seq_len}x{bs}x{nbheads}x{headsize},{input_v}:{opt_seq_len}x{bs}x{nbheads}x{headsize}" - max_shapes += f",{input_k}:{max_seq_len - 1}x{bs}x{nbheads}x{headsize},{input_v}:{max_seq_len - 1}x{bs}x{nbheads}x{headsize}" - trtexec_cmd += f" {min_shapes} {opt_shapes} {max_shapes}" - - use_tf32 = cfg.trt_export_options.use_tf32 - use_fp8 = cfg.trt_export_options.use_fp8 - use_fp16 = cfg.trt_export_options.use_fp16 - use_bf16 = cfg.trt_export_options.use_bf16 - use_strongly_typed = 
cfg.trt_export_options.use_strongly_typed - sparse = cfg.trt_export_options.sparse - trtexec_cmd += " --noTF32" if not use_tf32 else "" - trtexec_cmd += " --fp8" if (use_fp8 and not use_strongly_typed) else "" - trtexec_cmd += " --fp16" if (use_fp16 and not use_strongly_typed) else "" - trtexec_cmd += " --bf16" if (use_bf16 and not use_strongly_typed) else "" - trtexec_cmd += " --stronglyTyped" if use_strongly_typed else "" - trtexec_cmd += " --sparsity=enable" if sparse else "" - trtexec_cmd += " --timingCacheFile=functional.cache" - return trtexec_cmd - - -def add_zero_point(g, base_name, dtype): - """Add Q/DQ zero-point constant""" - _zp_fp8_value = onnx.helper.make_tensor(base_name + "_zp_fp8_value", dtype, (1,), [0.0]) - zero_point_fp8 = gs.Variable(base_name + "_zero_point", dtype=dtype, shape=(1,)) - zero_point_const = gs.Node(op="Constant", name= base_name + "_zero_point_const", inputs=[], outputs=[zero_point_fp8], attrs={"value": _zp_fp8_value}) - g.nodes.append(zero_point_const) - return zero_point_fp8 - - -def add_scale(g, base_name, dtype, value): - """Add Q/DQ scale constant""" - _scale_value = onnx.helper.make_tensor(base_name + "_scale_value", dtype, (1,), [value]) - scale = gs.Variable(base_name + "_scale", dtype=dtype, shape=(1,)) - scale_const = gs.Node(op="Constant", name=base_name + "_scale_const", inputs=[], outputs=[scale], attrs={"value": _scale_value}) - g.nodes.append(scale_const) - return scale - - -def add_cast(g, inp, outp_dtype, cast_name): - """Add Cast operator """ - cast_outp = gs.Variable(cast_name+"_out", dtype=outp_dtype) - new_cast = gs.Node( - op="Cast", - name=cast_name, - inputs=[inp], - outputs=[cast_outp], - attrs={"to": outp_dtype} - ) - g.nodes.append(new_cast) - return cast_outp - - -def add_q(g, inp, hp_dtype, q_dtype, q_name=None): - """Add QuantizeLinear operator""" - scale_dtype = hp_dtype - q_name = q_name or f"{inp.name}_qfp8" - q_out = gs.Variable(q_name, dtype=q_dtype) - q = gs.Node(op="QuantizeLinear", name=q_name, - inputs=[ - inp, - add_scale(g, inp.name, scale_dtype, 1.0), - add_zero_point(g, inp.name, q_dtype) - ], - outputs=[q_out]) - g.nodes.append(q) - return q_out - - -def add_dq(g, inp, hp_dtype, dq_dtype): - """Add DequantizeLinear operator""" - dq_name = f"{inp.name}_dqfp8" - scale_dtype = hp_dtype - dq_out = gs.Variable(dq_name, dtype=hp_dtype) - dq = gs.Node(op="DequantizeLinear", name=dq_name, - inputs=[ - inp, - add_scale(g, inp.name, scale_dtype, 1.0), - add_zero_point(g, inp.name, dq_dtype)], - outputs=[dq_out]) - g.nodes.append(dq) - return dq_out - - -def quantize_all_bmms(g, dtype_high_prec, use_fp8_storage): - """Quantize the inputs of all batched matmul operators""" - - def quantize_bmm(g, bmm, dtype_high_prec): - assert len(bmm.inputs) == 2 - dq_outputs = [] - for i in range(len(bmm.inputs)): - if i == 0 or not use_fp8_storage: - q_outp = add_q(g, bmm.inputs[i], dtype_high_prec, onnx.TensorProto.FLOAT8E4M3FN) - dq_out = add_dq(g, q_outp, dtype_high_prec, onnx.TensorProto.FLOAT8E4M3FN) - else: - # mm.inputs[1] is the input from K or V which we don't quantize if is stored - # in the cache in quantized type. 
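# In short, add_q()/add_dq() above wire a QuantizeLinear -> DequantizeLinear pair with a
# unit scale and an FP8 zero point around a tensor, which is how quantize_bmm() rewrites
# each MatMul input. A rough usage sketch, where `g` is assumed to be a gs.Graph imported
# from ONNX and `matmul` one of its MatMul nodes (both are placeholders, not names from
# this file):
import onnx
hp_dtype = onnx.TensorProto.FLOAT16
q_out = add_q(g, matmul.inputs[0], hp_dtype, onnx.TensorProto.FLOAT8E4M3FN)
matmul.inputs[0] = add_dq(g, q_out, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN)
g.cleanup().toposort()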
- dq_out = add_dq(g, bmm.inputs[i], dtype_high_prec, onnx.TensorProto.FLOAT8E4M3FN) - dq_outputs.append(dq_out) - bmm.inputs = dq_outputs - - bmm_nodes = [node for node in g.nodes if node.op == "MatMul"] - G_LOGGER.info("Quantizing attention BMMs") - G_LOGGER.info(f"Found {len(bmm_nodes)} MatMul operator nodes") - for bmm in bmm_nodes: - # Do not quantize the Matmul at the head of GPT3 (it is used ) - if bmm.name == "/model/module/MatMul": - continue - quantize_bmm(g, bmm, dtype_high_prec) - - -# Use ONNX graphsurgeon to add KV-cache to ONNX file -# Reusing the HF demo names. -def get_past_key_name(layer_id): - past_key_name = f"past_key_values.{layer_id}.decoder.key" - return past_key_name - -def get_past_value_name(layer_id): - past_value_name = f"past_key_values.{layer_id}.decoder.value" - return past_value_name - -def get_past_shape(nbheads, headsize): - return ("sequence_past_decoder_length", "batch", nbheads, headsize) - -def get_present_key_name(layer_id: int): - present_key_name = f"present_key_values.{layer_id}.decoder.key" - return present_key_name - -def get_present_value_name(layer_id: int): - present_value_name = f"present_key_values.{layer_id}.decoder.value" - return present_value_name - -def get_present_shape(nbheads, headsize): - return ("sequence_present_decoder_length", "batch", nbheads, headsize) - -def get_new_key_name(layer_id: int): - new_key_name = f"new_key_values.{layer_id}.decoder.key" - return new_key_name - -def get_new_value_name(layer_id: int): - new_value_name = f"new_key_values.{layer_id}.decoder.value" - return new_value_name - -def get_new_shape(nbheads, headsize): - return ("sequence", "batch", nbheads, headsize) - -def quantize_new_k_v(g, key_new, value_new, hp_dtype): - key_new_q_outp = add_q(g, key_new, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN) - key_new_dq_out = add_dq(g, key_new_q_outp, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN) - value_new_q_outp = add_q(g, value_new, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN) - value_new_dq_out = add_dq(g, value_new_q_outp, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN) - return key_new_dq_out, value_new_dq_out - -def add_kvcache_for( - g, layer_id, qkv_split, nbheads, headsize, dtype, kv_output_policy, hp_dtype, use_fp8_storage, quantize_bmms): - _, key_new, value_new = qkv_split.outputs - key_consumers = [c for c in key_new.outputs] - value_consumers = [c for c in value_new.outputs] - - def add_graph_past_inputs(use_fp8_storage): - past_key = gs.Variable( - name=get_past_key_name(layer_id), - dtype=dtype, - shape=get_past_shape(nbheads, headsize)) - past_value = gs.Variable( - name=get_past_value_name(layer_id), - dtype=dtype, - shape=get_past_shape(nbheads, headsize)) - g.inputs.append(past_key) - g.inputs.append(past_value) - - if use_fp8_storage and not quantize_bmms: - past_key_dq = add_dq(g, past_key, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN) - past_value_dq = add_dq(g, past_value, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN) - return past_key_dq, past_value_dq - - return past_key, past_value - - def add_concat(concat_name, input0, input1, output_name): - concat_out = gs.Variable( - output_name, - dtype=dtype, - shape=get_present_shape(nbheads, headsize)) - - concat = gs.Node(op="Concat", name=concat_name, - inputs=[input0, input1], outputs=[concat_out], - attrs={"axis": 0}) - g.nodes.append(concat) - return concat_out - - def add_cache_outputs(kv_output_policy, use_fp8_storage, hp_dtype): - if kv_output_policy == "kv_cache_concat": - new_key_output, new_value_output = key_concat_out, value_concat_out - elif 
kv_output_policy == "kv_new": - key_new.dtype = dtype - key_new.shape = get_new_shape(nbheads, headsize) - key_new.name = get_new_key_name(layer_id) - value_new.dtype = dtype - value_new.shape = get_new_shape(nbheads, headsize) - value_new.name = get_new_value_name(layer_id) - - if use_fp8_storage: - key_new_q = add_q(g, key_new, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN, - f"{key_new.name}_qfp8") - value_new_q = add_q(g, value_new, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN, - f"{value_new.name}_qfp8") - new_key_output, new_value_output = key_new_q, value_new_q - else: - new_key_output, new_value_output = key_new, value_new - else: - raise ValueError(f"Unsupported kv_output_policy: {kv_output_policy}") - g.outputs.append(new_key_output) - g.outputs.append(new_value_output) - return new_key_output, new_value_output - - past_key, past_value = add_graph_past_inputs(use_fp8_storage) - new_key_output, new_value_output = add_cache_outputs(kv_output_policy, use_fp8_storage, hp_dtype) - - if quantize_bmms: - if use_fp8_storage: - key_new = new_key_output - value_new = new_value_output - else: - key_new, value_new = quantize_new_k_v(g, key_new, value_new, hp_dtype) - key_concat_out = add_concat(f"key.{layer_id}.concat", - past_key, key_new, get_present_key_name(layer_id)) - value_concat_out = add_concat(f"value.{layer_id}.concat", - past_value, value_new, get_present_value_name(layer_id)) - - for c in key_consumers: - c.inputs[0] = key_concat_out - for c in value_consumers: - c.inputs[0] = value_concat_out - - -def add_kvcache(g, nbheads, headsize, dtype, kv_output_policy, hp_dtype, use_fp8_storage, quantize_bmms): - """Add KV-cache to each Transformer layer's QKV split """ - G_LOGGER.info("Adding KV-cache") - qkv_split_nodes = [node for node in g.nodes if node.op == "Split"] - G_LOGGER.debug(f"Found {len(qkv_split_nodes)} QKV-split nodes") - - for layer_id, qkv_split in enumerate(qkv_split_nodes): - add_kvcache_for( - g, layer_id, qkv_split, nbheads, headsize, dtype, kv_output_policy, hp_dtype, use_fp8_storage, quantize_bmms) - - G_LOGGER.debug("Done adding cache operations") - return len(qkv_split_nodes) - - -def normalize_dyn_axes_to_hf_names(g, vocab_size): - g.inputs[0].name = "input_ids" - g.inputs[0].shape = ("batch", "sequence") - if len(g.inputs) > 1: - g.inputs[1].name = "position_ids" - g.inputs[1].shape = ("batch", "sequence") - g.outputs[0].name = "logits" - g.outputs[0].shape = ("batch", "sequence", vocab_size) - G_LOGGER.debug("Done normalizing dynamic axes names to HuggingFace demo names") - - -def process_onnx( - kv_output_policy, - onnx_input_fpath, - onnx_output_fpath, - separate_param_files, - use_cache, - quantize_bmms, - nbheads, headsize, vocab_size, dtype, hp_dtype, use_fp8_storage): - """ - Process an ONNX model, add KV cache inputs and output, save result model to a specified path. - """ - G_LOGGER.info(f"Importing {onnx_input_fpath}... 
this will take some time") - g = gs.import_onnx(onnx.load(onnx_input_fpath)) - normalize_dyn_axes_to_hf_names(g, vocab_size) - num_layers = 0 - if use_cache: - num_layers = add_kvcache(g, nbheads, headsize, dtype, kv_output_policy, hp_dtype, use_fp8_storage, quantize_bmms) - g.cleanup().toposort() - - if quantize_bmms: - quantize_all_bmms(g, hp_dtype, use_fp8_storage) - g.cleanup().toposort() - - G_LOGGER.info(f"Exporting {onnx_output_fpath}") - model = gs.export_onnx(g) - G_LOGGER.info(f"Saving {onnx_output_fpath}") - if separate_param_files: - onnx.save_model(model, onnx_output_fpath, save_as_external_data=True, - all_tensors_to_one_file = False, convert_attribute=False) - else: - onnx.save_model(model, onnx_output_fpath, save_as_external_data=False) - G_LOGGER.info(f"Done: {onnx_output_fpath}") - return num_layers - - -def create_dir_if_not_exist(path): - dir = os.path.dirname(path) - if not os.path.exists(dir) and dir != "": - G_LOGGER.info(f"Making directory {dir}") - os.makedirs(dir) - - -class NeMoConverter(): - """ - A class to convert a NeMo model to an ONNX file, and convert an ONNX file to a TensorRT engine. - """ - def __init__(self, cfg, model_type=ModelPT): - self.model_type = model_type - self.cfg = cfg - self.model = None - self.export_envvars() - - def export_envvars(self) -> None: - if self.cfg.trt_export_options.use_fp8: - G_LOGGER.info( - f"Setting max sequence length to {self.cfg.model.max_seq_len}" - ) - os.environ["NVTE_ONNX_KVCACHE_MAX_SEQ_LEN"] = str( - self.cfg.model.max_seq_len - ) - - def nemo_to_onnx(self) -> str: - """ - Convert a NeMo model to an ONNX model, return the file path to the ONNX model. - """ - if self.model == None: - self.model = load_nemo_model(self.cfg, self.model_type) - - if not isinstance(self.model, Exportable): - G_LOGGER.error("Your NeMo model class ({}) is not Exportable.".format(self.model.__class__.__name__)) - sys.exit(1) - - if hasattr(self.model.cfg, "fp8") and self.model.cfg.fp8 == True: - if self.cfg.trt_export_options.use_fp8 == False: - G_LOGGER.info("Turning on trt_export_options.use_fp8 because NeMo model is in FP8 precision.") - self.cfg.trt_export_options.use_fp8 = True - else: - if self.cfg.trt_export_options.use_fp8 == True: - G_LOGGER.info("Turning off trt_export_options.use_fp8 because NeMo model is not in FP8 precision.") - self.cfg.trt_export_options.use_fp8 = False - - onnx_out = self.cfg.onnx_model_file - create_dir_if_not_exist(onnx_out) - check_trace = self.cfg.onnx_export_options.runtime_check - onnx_names = [] - - dynamic_axes={ - 'input_ids': {0: "batch", 1: "sequence"}, - 'position_ids': {0: "batch", 1: "sequence"}, - 'logits': {0: "batch", 1: "sequence"}, - } - - if self.cfg.use_one_input: - # Use a wrapper class to get rid of inputs other than input_ids. 
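# The dynamic_axes mapping above is what ultimately reaches torch.onnx.export underneath
# NeMo's Exportable.export(). A stripped-down sketch of the equivalent call for a plain
# single-input module (`wrapped_model`, `example_ids`, and the opset value are
# placeholders, not values taken from the config):
import torch
torch.onnx.export(
    wrapped_model,
    (example_ids,),                       # e.g. a (1, seq_len) LongTensor of token ids
    "model.onnx",
    input_names=["input_ids"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": {0: "batch", 1: "sequence"},
        "logits": {0: "batch", 1: "sequence"},
    },
    opset_version=17,
)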
- self.model = MegatronGPTSingleInputExportableModel(self.model, self.cfg.model.max_seq_len) - del dynamic_axes['position_ids'] - - try: - self.model.to(device=self.cfg.onnx_export_options.device).freeze() - self.model.eval() - if not self.cfg.trt_export_options.use_fp8: - G_LOGGER.info("Exporting ONNX with attention_mask") - dynamic_axes['attention_mask'] = {2: "sequence", 3: "sequence"} - - self.model.export( - onnx_out, - onnx_opset_version=self.cfg.onnx_export_options.onnx_opset, - do_constant_folding=self.cfg.onnx_export_options.do_constant_folding, - dynamic_axes=dynamic_axes, - check_trace=check_trace, - check_tolerance=self.cfg.onnx_export_options.check_tolerance, - verbose=self.cfg.onnx_export_options.verbose, - ) - onnx_names = [augment_filename(onnx_out, subnet_name) for subnet_name in self.model.list_export_subnets()] - - except Exception as e: - G_LOGGER.error( - "Export failed. Please make sure your NeMo model class ({}) has working export() and that you have the latest NeMo package installed with [all] dependencies.".format( - self.model.__class__ - ) - ) - raise e - - release_nemo_model(self.model) - assert len(onnx_names) == 1 - os.rename(onnx_names[0], onnx_out) - return onnx_out - - def prune_onnx(self, input_path) -> str: - """ - Prune the input ONNX model to be structured sparsity pattern by using polygraphy. - """ - if not self.cfg.trt_export_options.sparse: - G_LOGGER.warning(f"Model pruning is enabled but sparsity is not enabled for TRT engine builder.") - - ibname = os.path.basename(input_path) - obname = "pruned." + ibname - opath = os.path.join(os.path.dirname(input_path), obname) - o_data_real_path = opath + "_data" - if os.path.exists(opath) and os.path.exists(o_data_real_path): - return opath - - o_data_bname = os.path.basename(o_data_real_path) - cmds = f"polygraphy surgeon prune {input_path} -o {opath} --save-external-data {o_data_bname}" - G_LOGGER.info(f"Prune ONNX model with: {cmds}") - G_LOGGER.info(f"This may take a while...") - sp.run(shlex.split(cmds), check=True, stdout=sp.PIPE, stderr=sp.STDOUT) - return opath - - - def create_onnx(self, onnx_input_fpath, onnx_output_fpath, kv_output_policy="kv_new"): - """ - Create an ONNX model with modifications from `onnx_input_fpath`, save the ONNX model to `onnx_output_fpath`. - The ONNX is modified to use a KV-Cache and/or quantize the attention batched matrix-multiplication ops. - No return value for this function. - """ - assert os.path.splitext(onnx_input_fpath)[1] == ".onnx", "Input ONNX file must end with '.onnx'." - assert os.path.splitext(onnx_output_fpath)[1] == ".onnx", "Output ONNX file must end with '.onnx'." - - quantize_bmms = self.cfg.onnx_export_options.quantize_bmms - use_cache = self.cfg.use_cache - nbheads, headsize = self.cfg.model.nb_heads, self.cfg.model.head_size - hp_dtype = onnx.TensorProto.BFLOAT16 if self.cfg.trt_export_options.use_bf16 else onnx.TensorProto.FLOAT16 - dtype = hp_dtype - if self.cfg.onnx_export_options.use_fp8_storage: - dtype = onnx.TensorProto.FLOAT8E4M3FN - assert nbheads * headsize == self.cfg.model.hidden_size, "Model hidden size does not match." 
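# Once process_onnx() below has run with use_cache=True, every transformer layer gains a
# pair of cache inputs and outputs named via the helpers above. Under the default
# kv_output_policy="kv_new", a hypothetical two-layer model would expose:
for layer_id in range(2):
    print(get_past_key_name(layer_id), get_past_value_name(layer_id))  # inputs:  past_key_values.<i>.decoder.{key,value}
    print(get_new_key_name(layer_id), get_new_value_name(layer_id))    # outputs: new_key_values.<i>.decoder.{key,value}
# Past inputs carry shape ("sequence_past_decoder_length", "batch", nb_heads, head_size),
# while the new-KV outputs carry ("sequence", "batch", nb_heads, head_size), matching
# get_past_shape() and get_new_shape() above.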
- num_qkvs = process_onnx(kv_output_policy, - onnx_input_fpath, onnx_output_fpath, separate_param_files=True, - use_cache=use_cache, quantize_bmms=quantize_bmms, - nbheads=nbheads, headsize=headsize, vocab_size=self.cfg.model.vocab_size, dtype=dtype, hp_dtype=hp_dtype, use_fp8_storage=self.cfg.onnx_export_options.use_fp8_storage) - - G_LOGGER.info(f"Number of QKV subgraphs = {num_qkvs}, number of layers = {self.cfg.model.num_layers}") - if num_qkvs != self.cfg.model.num_layers: - raise ValueError("Number of QKV subgraphs must be the same as number of layers in the model.") - G_LOGGER.info(f"Saved KV-cache onnx to {onnx_output_fpath}") - - - # Reads an onnx file and creates a trt engine file - def onnx_to_trt(self, onnx_fpath, trt_fpath): - """ - Convert an ONNX model from `onnx_fpath` to a TensorRT engine, and save the result to `trt_fpath`. - """ - # Set up polygraphy config - use_tf32 = self.cfg.trt_export_options.use_tf32 - use_fp16 = self.cfg.trt_export_options.use_fp16 - use_fp8 = self.cfg.trt_export_options.use_fp8 - use_bf16 = self.cfg.trt_export_options.use_bf16 - strongly_typed = self.cfg.trt_export_options.use_strongly_typed - sparse = self.cfg.trt_export_options.sparse - if sparse and not self.cfg.onnx_export_options.prune: - G_LOGGER.warning("Sparsity for TRT engine builder is enabled, but model pruning is not.") - - # Create optimization profiles - bs = self.cfg.batch_size - max_seq_len = self.cfg.model.max_seq_len - opt_seq_len = self.cfg.trt_export_options.opt_seq_len if self.cfg.trt_export_options.opt_seq_len else (max_seq_len // 2) - profile_non_kv = Profile() - profile_non_kv.add(name="input_ids", min=(bs, 1), opt=(bs, opt_seq_len), max=(bs, max_seq_len)) # (batch, sequence) - if not self.cfg.use_one_input: - profile_non_kv.add(name="position_ids", min=(bs, 1), opt=(bs, opt_seq_len), max=(bs, max_seq_len)) # (batch, sequence) - # For FP8 precision, attention mask is created inside transformer_engine. - if not self.cfg.trt_export_options.use_fp8: - profile_non_kv.add(name="attention_mask", min=(1, 1, 1, 1), opt=(1, 1, opt_seq_len, opt_seq_len), max=(1, 1, max_seq_len, max_seq_len)) # (1, 1, sequence, sequence) - - num_layers, nbheads, headsize = self.cfg.model.num_layers, self.cfg.model.nb_heads, self.cfg.model.head_size - if self.cfg.use_cache: - for i in range(num_layers): - input_k = get_past_key_name(i) - input_v = get_past_value_name(i) - # (sequence, batch, nbheads, headsize) - profile_non_kv.add(name=input_k, min=(0, bs, nbheads, headsize), opt=(0, bs, nbheads, headsize), max=(0, bs, nbheads, headsize)) - profile_non_kv.add(name=input_v, min=(0, bs, nbheads, headsize), opt=(0, bs, nbheads, headsize), max=(0, bs, nbheads, headsize)) - - profiles = [profile_non_kv] - - # When enabling KV-cache, use first profile for context phase and second profile for generation phase - if self.cfg.use_cache: - profile_kv = Profile() - profile_kv.add(name="input_ids", min=(bs, 1), opt=(bs, 1), max=(bs, 1)) # (batch, sequence) - if not self.cfg.use_one_input: - profile_kv.add(name="position_ids", min=(bs, 1), opt=(bs, 1), max=(bs, 1)) # (batch, sequence) - # For FP8 precision, attention mask is created inside transformer_engine. 
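# As an aside, the two-profile idea here in minimal standalone Polygraphy form (toy sizes,
# layer-0 key binding only; the value binding is analogous): one profile covers the
# full-prompt context phase with an empty cache, the other covers single-token generation
# with a growing cache.
from polygraphy.backend.trt import Profile
bs, max_len, opt_len, nb_heads, head_size = 1, 256, 128, 12, 64
ctx_profile = Profile()
ctx_profile.add("input_ids", min=(bs, 1), opt=(bs, opt_len), max=(bs, max_len))
ctx_profile.add("past_key_values.0.decoder.key",
                min=(0, bs, nb_heads, head_size), opt=(0, bs, nb_heads, head_size), max=(0, bs, nb_heads, head_size))
gen_profile = Profile()
gen_profile.add("input_ids", min=(bs, 1), opt=(bs, 1), max=(bs, 1))
gen_profile.add("past_key_values.0.decoder.key",
                min=(1, bs, nb_heads, head_size), opt=(opt_len, bs, nb_heads, head_size), max=(max_len - 1, bs, nb_heads, head_size))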
- if not self.cfg.trt_export_options.use_fp8: - profile_kv.add(name="attention_mask", min=(1, 1, 1, 1), opt=(1, 1, opt_seq_len, opt_seq_len), max=(1, 1, max_seq_len, max_seq_len)) # (1, 1, sequence, sequence) - - assert num_layers > 0 - nbheads, headsize = self.cfg.model.nb_heads, self.cfg.model.head_size - for i in range(num_layers): - input_k = get_past_key_name(i) - input_v = get_past_value_name(i) - # (sequence, batch, nbheads, headsize) - profile_kv.add(name=input_k, min=(1, bs, nbheads, headsize), opt=(opt_seq_len, bs, nbheads, headsize), max=(max_seq_len-1, bs, nbheads, headsize)) - profile_kv.add(name=input_v, min=(1, bs, nbheads, headsize), opt=(opt_seq_len, bs, nbheads, headsize), max=(max_seq_len-1, bs, nbheads, headsize)) - profiles = [profile_kv, profile_non_kv] - - - # Read about these arguments here: - # https://github.com/NVIDIA/TensorRT/blob/main/tools/Polygraphy/polygraphy/backend/trt/config.py - # Note that the precision args below *enable*, not *require*, the specified precision - preview_features = [] - - trt_config = CreateConfig( - tf32= use_tf32, - fp16=False if strongly_typed else use_fp16, - bf16=False if strongly_typed else use_bf16, - sparse_weights=sparse, - profiles=profiles, - precision_constraints=None if strongly_typed else "obey", - preview_features=preview_features, - fp8=False if strongly_typed else use_fp8, - load_timing_cache=self.cfg.trt_export_options.timing_cache, - ) - - # Print out trtexec command for debugging - G_LOGGER.debug(" >>> trtexec command for debugging:") - G_LOGGER.debug(get_trtexec_cmd(onnx_fpath, self.cfg, bs)) - - with PG_LOGGER.verbosity(_calculate_polygraphy_verbosity()): - G_LOGGER.info(f"Reading ONNX file at {onnx_fpath}") - network = NetworkFromOnnxPath(onnx_fpath, strongly_typed=strongly_typed) - G_LOGGER.info("Building TRT engine") - engine = engine_from_network(network, config=trt_config) - G_LOGGER.info(f"Saving TRT engine to {trt_fpath}") - save_engine(engine, trt_fpath) - - @staticmethod - def _resolve_opset19_paths(onnx_fpath, results_path: Optional[str] = None) -> str: - foldername, filename = os.path.split(onnx_fpath) - return foldername if not results_path else results_path, filename - - @staticmethod - def get_opset19_onnx_fpath(onnx_fpath, results_path: Optional[str] = None) -> str: - suffix = ".opset19.onnx" - results_path, filename = NeMoConverter._resolve_opset19_paths( - onnx_fpath, results_path - ) - return os.path.join(results_path, os.path.splitext(filename)[0] + suffix) - - - @staticmethod - def onnx_to_opset19(onnx_fpath, results_path: Optional[str] = None) -> str: - """ - Convert a ONNX model `onnx_fpath` to be with standard opset19 Q/DQ nodes, return a string - contains a file path to the result ONNX if any conversion is performed, otherwise return `None`. 
- """ - mappings = replace_customop_qdq_with_onnx_qdq( - [onnx_fpath], - NeMoConverter._resolve_opset19_paths(onnx_fpath, results_path)[0], - create_netron_compatible_model=False, - remove_cast_before_q=False, - remove_cast_after_dq=False, - change_qdq_scale_precision="", - ) - if ( - (not mappings) - or (onnx_fpath not in mappings) - or (mappings[onnx_fpath] == None) - ): - G_LOGGER.error(f"Opset19 onnx file conversion failed for {onnx_fpath}.") - assert False - - G_LOGGER.info(f"Converted {onnx_fpath} to {mappings[onnx_fpath]} for opset19.") - return mappings[onnx_fpath] - -def parse_args(): - parser = argparse.ArgumentParser(description='NeMo export script arguments', add_help=True) - parser.add_argument( - "--nemo-model", - help="Set a NeMo model to be used.", - required=False, - default=None, - type=str, - ) - parser.add_argument( - "--nemo-checkpoint", - help="Set a NeMo checkpoint to be used.", - required=False, - default=None, - type=str, - ) - parser.add_argument( - "--onnx-model", - help="A path to load an ONNX model for conversion.", - required=False, - default=None, - type=str, - ) - parser.add_argument( - "--save-onnx-dir", - help="A directory to save the generated ONNX model. Must be writable.", - required=True, - ) - parser.add_argument( - "--opset19", - action="store_true", - help="If set, the ONNX will be converted to opset19.", - default=False - ) - parser.add_argument( - "--use-cache", - action="store_true", - help="If set, the ONNX will have KV-cache inputs and outputs.", - default=False - ) - parser.add_argument( - "--quantize-bmms", - help="Quantize attention BMMs", - action="store_true", - default=False, - ) - parser.add_argument( - "--save-engine", - required=False, - help="If set to a path, a TensorRT engine will be built from ONNX and save to the path.", - ) - parser.add_argument( - "--fp8", - action="store_true", - help="Use FP8 precision during conversion.", - default=False - ) - parser.add_argument( - "--fp16", - action="store_true", - help="Use FP16 precision during conversion.", - default=False - ) - parser.add_argument( - "--bf16", - action="store_true", - help="Use BF16 precision during conversion.", - default=False - ) - parser.add_argument( - "--extra-configs", - required=False, - help='Use this flag to set fields specified in config.yml with a format of --extra-configs="[=][ =]*". 
Values specified by this flag will not override any value set from other flags.', - default=None, - type=str, - ) - args = parser.parse_args() - return args - -def main(): - G_LOGGER.setLevel(level=G_LOGGER.INFO) - - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.yaml") - cfg = omegaconf.OmegaConf.load(config_path) - G_LOGGER.info(f"Loaded configs = {cfg}") - - args = parse_args() - if (args.nemo_model != None or args.nemo_checkpoint != None) and args.onnx_model != None: - G_LOGGER.error("NeMo model and ONNX model cannot be both set.") - exit(1) - - if args.nemo_model == None and args.nemo_checkpoint == None and args.onnx_model == None: - G_LOGGER.error("Either one of --nemo-model, --nemo-checkpoint, or --onnx-model needs to be set.") - exit(1) - - if args.extra_configs != None: - kwargs = args.extra_configs.split(" ") - for kwarg in kwargs: - kw = kwarg.split("=") - if len(kw) != 2: - raise ValueError(f'Arg {kwarg} is not in a format of "="') - def nested_set(dic, keys, value): - for i in range(len(keys)): - if not hasattr(dic, keys[i]): - raise ValueError(f"Cannot find key {keys[:i+1]} in the config.") - if i == len(keys) - 1: - dic[keys[i]] = value - else: - dic = dic[keys[i]] - - G_LOGGER.info(f"Setting {kw[0]} to {kw[1]}") - nested_set(cfg, kw[0].split("."), kw[1]) - G_LOGGER.info(f"Modified Configs = {cfg}") - - # Set precision for conversion - if args.fp16: - cfg.trainer.precision = "16" - cfg.trt_export_options.use_fp16 = True - elif args.bf16: - cfg.trainer.precision = "bf16" - cfg.trt_export_options.use_bf16 = True - else: - cfg.trainer.precision = "32" - - if args.fp8: - cfg.trt_export_options.use_fp8 = True - - if args.quantize_bmms: - cfg.onnx_export_options.quantize_bmms = True - - if os.path.exists(args.save_onnx_dir) and not os.path.isdir(args.save_onnx_dir): - raise ValueError(f"{args.save_onnx_dir} is not a directory.") - - cfg.onnx_model_file = os.path.join(args.save_onnx_dir, "model.onnx") - create_dir_if_not_exist(cfg.onnx_model_file) - - # Convert NeMo model to ONNX model - converter = None - if args.nemo_model or args.nemo_checkpoint: - cfg.gpt_model_file = args.nemo_model - if args.nemo_checkpoint: - cfg.checkpoint_dir = os.path.dirname(args.nemo_checkpoint) - cfg.checkpoint_name = os.path.basename(args.nemo_checkpoint) - converter = NeMoConverter(cfg, MegatronGPTModel) - onnx_name = converter.nemo_to_onnx() - G_LOGGER.info(f"ONNX exported from NeMo {onnx_name}") - elif args.onnx_model: - onnx_name = args.onnx_model - - # Convert Q/DQ nodes to use standard opset19 operators - if args.opset19: - op19_onnx = NeMoConverter.onnx_to_opset19(onnx_name, args.save_onnx_dir) - if op19_onnx != None: - G_LOGGER.info(f"Get opset19 onnx file {op19_onnx}") - onnx_name = op19_onnx - - # Add KV cache to ONNX model - if cfg.use_cache: - G_LOGGER.info(f"Converting {onnx_name} with KV-cache support") - kv_output_policy = "kv_new" - new_dir = os.path.join(args.save_onnx_dir, f"{kv_output_policy}") - onnx_output_fpath = os.path.join(new_dir, onnx_name.split("/")[-1]) - create_dir_if_not_exist(onnx_output_fpath) - if not converter: - converter = NeMoConverter(cfg, MegatronGPTModel) - converter.create_onnx(onnx_name, onnx_output_fpath, kv_output_policy) - onnx_name = onnx_output_fpath - - if cfg.onnx_export_options.prune: - onnx_name = converter.prune_onnx(onnx_name) - - # Convert ONNX model to TRT engine - if args.save_engine: - create_dir_if_not_exist(args.save_engine) - if not converter: - converter = NeMoConverter(cfg, MegatronGPTModel) - 
converter.onnx_to_trt(onnx_name, args.save_engine) - -if __name__ == '__main__': - main() diff --git a/demo/NeMo/patch_te.sh b/demo/NeMo/patch_te.sh deleted file mode 100644 index 4f060dd84..000000000 --- a/demo/NeMo/patch_te.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/sh -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Sourcing messes up the directory detection with readlink. -if [ ! "${0##*/}" = "patch_te.sh" ]; then - echo "Please run this patch script, don't source it." >&2 - return 1 -fi - -NEMO_DIR=$(dirname "$(readlink -f "$0")") - -te_loc="$(pip show transformer_engine | grep '^Location' | awk '{print $2}')" -cd "${te_loc}/transformer_engine" || { - echo "Could not locate transformer-engine python package. Please check if installation proceeded correctly." - exit 1 -} -# Use sys.executable when calling pip within subprocess to recognize virtualenv. -# If patch is already applied, skip it and proceed with the rest of the script, quit otherwise. -# NOTE: patch needs to be updated to track the commit of TE in install.sh. -OUT="$(patch --forward common/__init__.py <"${NEMO_DIR}"/transformer_engine.patch)" || echo "${OUT}" | grep "Skipping patch" -q || { - echo "Could not patch transformer engine because ${OUT}" - exit 1 -} -unset OUT -cd - || exit -unset te_loc diff --git a/demo/NeMo/requirements.txt b/demo/NeMo/requirements.txt deleted file mode 100644 index c715ed76a..000000000 --- a/demo/NeMo/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -nemo-toolkit[nlp]==1.17.0 -onnx==1.14.0 -protobuf==3.20.3 -onnxruntime==1.13.1 -transformers==4.27.0 -cuda-python==12.1.0 -setuptools==65.5.1 -tqdm ---pre --extra-index-url https://download.pytorch.org/whl/cu121 -torch==2.1.0 -torchaudio==2.1.0 -torchvision==0.16.0 -onnx-graphsurgeon==0.3.27 diff --git a/demo/NeMo/run.py b/demo/NeMo/run.py deleted file mode 100644 index 5ba00b5a5..000000000 --- a/demo/NeMo/run.py +++ /dev/null @@ -1,200 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Demonstrates TensorRT capabilities with networks trained by NeMo. 
-Requires Python 3.6+ -""" - -import argparse -import os -import sys -from typing import List, Tuple - -ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(ROOT_DIR) - -sys.path.append('../') # Include one-level up directory so to reuse HuggingFace utils. -from HuggingFace.run import ( - Action, - NetworkScriptAction, - WRAPPER_LIST_ACTION, -) -from HuggingFace.NNDF.logger import G_LOGGER -from HuggingFace.NNDF.general_utils import register_network_folders -from HuggingFace.NNDF.cuda_bootstrapper import bootstrap_ld_library_path - -WRAPPER_RUN_ACTION = "run" -WRAPPER_ACCURACY_ACTION = "accuracy" -WRAPPER_BENCHMARK_ACTION = "benchmark" -WRAPPER_ACTIONS = [WRAPPER_LIST_ACTION, WRAPPER_RUN_ACTION, WRAPPER_ACCURACY_ACTION, WRAPPER_BENCHMARK_ACTION] - -class ListAction(Action): - def __init__(self, networks: List[str], parser: argparse.ArgumentParser): - super().__init__(networks, parser) - self.networks = networks - - def execute(self, args: argparse.Namespace): - print("Networks that are supported by NeMo Demo:") - [print(n) for n in self.networks] - return 0 - -class RunAction(NetworkScriptAction): - def execute(self, args: argparse.Namespace): - module = self.load_script(args.script, args) - module.RUN_CMD._parser = self.parser - - old_path = os.getcwd() - # Execute script in each relevant folder - try: - os.chdir(args.network) - _ = module.RUN_CMD() - finally: - os.chdir(old_path) - - return 0 - - def add_args(self, parser: argparse.ArgumentParser): - super().add_args(parser) - run_group = parser.add_argument_group("run args") - run_group.add_argument("script", choices=self.PER_NETWORK_SCRIPTS) - -class BenchmarkAction(NetworkScriptAction): - def execute(self, args: argparse.Namespace): - module = self.load_script(args.script, args) - module.RUN_CMD._parser = self.parser - - old_path = os.getcwd() - # Execute script in each relevant folder - try: - os.chdir(args.network) - _ = module.RUN_CMD() - finally: - os.chdir(old_path) - - return 0 - - def add_args(self, parser: argparse.ArgumentParser): - super().add_args(parser) - benchmarking_group = parser.add_argument_group("benchmark args") - benchmarking_group.add_argument("script", choices=self.PER_NETWORK_SCRIPTS) - benchmarking_group.add_argument( - "--input-seq-len", - type=int, - help="Specify fixed input sequence length for perf benchmarking. Required for benchmark except when both input_profile_max and output_profile_max are provided for trt", - ) - benchmarking_group.add_argument( - "--output-seq-len", - type=int, - help="Specify fixed output sequence length for perf benchmarking. 
Required for benchmark except when both input_profile_max and output_profile_max are provided for trt", - ) - -class AccuracyAction(NetworkScriptAction): - def execute(self, args: argparse.Namespace): - module = self.load_script(args.script, args) - module.RUN_CMD._parser = self.parser - - old_path = os.getcwd() - # Execute script in each relevant folder - try: - os.chdir(args.network) - _ = module.RUN_CMD() - finally: - os.chdir(old_path) - - return 0 - - def add_args(self, parser: argparse.ArgumentParser): - super().add_args(parser) - accuracy_group = parser.add_argument_group("accuracy args") - accuracy_group.add_argument("script", choices=self.PER_NETWORK_SCRIPTS) - accuracy_group.add_argument( - "--task", - type=str, - default="lambada", - choices=["lambada"], - help="Specify which task to be used for accuracy check.", - ) - -def get_action( - action_name: str, networks: List[str], parser: argparse.ArgumentParser -) -> Action: - return { - WRAPPER_LIST_ACTION: ListAction, - WRAPPER_RUN_ACTION: RunAction, - WRAPPER_BENCHMARK_ACTION: BenchmarkAction, - WRAPPER_ACCURACY_ACTION: AccuracyAction, - }[action_name](networks, parser) - -def verify_python_version(): - if sys.version_info.major < 3 or sys.version_info.minor <= 6: - raise RuntimeError("NeMo OSS Demo does not support Python <= 3.6 due to end-of-life.") - if sys.version_info.major < 3 or sys.version_info.minor < 8 or (sys.version_info.minor == 8 and sys.version_info.micro < 10): - G_LOGGER.warn("NeMo OSS Demo is not tested for Python < 3.8.10") - -def get_default_parser( - description: str = "", add_default_help=False -) -> Tuple[argparse.ArgumentParser, bool]: - """ - Returns argparser for use by main(). Allows the ability to toggle default help message with a custom help flag - so that argparser does not throw SystemExit when --help is passed in. Useful for custom --help functionality. - - Returns: - (argparse.ArgumentParser): argparser used by main() - """ - # This variable is set so that usage errors don't show up in wrapper - parser = argparse.ArgumentParser( - conflict_handler="resolve", - description=description, - add_help=add_default_help, - prog="run.py", - ) - - required_group = parser.add_argument_group("required wrapper arguments") - required_group.add_argument("action", choices=WRAPPER_ACTIONS) - return parser - -def main() -> None: - """ - Parses network folders and responsible for passing --help flags to subcommands if --network is provided. - """ - # Verify python version support - verify_python_version() - - # Get all available network scripts - networks = register_network_folders(os.getcwd()) - - # Add network folder for entry point - description = "Runs TensorRT networks that are based-off of NeMo variants." - parser = get_default_parser(description) - - # Get the general network wrapper help - known_args, _ = parser.parse_known_args() - - # Delegate parser to action specifics - action = get_action(known_args.action, networks, parser) - known_args, _ = parser.parse_known_args() - - # If bootstrap occurs, then the spawned process completes the rest of demo. - # We can exit safely. We spawn after parsing basic args to reduce loading churn on rudimentary help commands. 
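# For reference, the wrapper is typically driven along these lines (the <script> value is
# resolved from the registered network folders, so the exact names are placeholders):
#
#   python3 run.py list
#   python3 run.py run <script> ...
#   python3 run.py accuracy <script> --task lambada ...
#   python3 run.py benchmark <script> --input-seq-len 128 --output-seq-len 20 ...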
- if bootstrap_ld_library_path(): - sys.exit(0) - - return action.execute(known_args) - -if __name__ == "__main__": - main() diff --git a/demo/NeMo/transformer_engine.patch b/demo/NeMo/transformer_engine.patch deleted file mode 100644 index c4c96dea5..000000000 --- a/demo/NeMo/transformer_engine.patch +++ /dev/null @@ -1,17 +0,0 @@ ---- common/__init__.py 2023-06-22 17:22:59.046208583 +0000 -+++ common/backup.py 2023-06-22 20:53:01.154819280 +0000 -@@ -7,12 +7,13 @@ - import os - import platform - import subprocess -+import sys - - - def get_te_path(): - """Find Transformer Engine install path using pip""" - -- command = ["pip", "show", "transformer_engine"] -+ command = [sys.executable, "-m", "pip", "show", "transformer_engine"] - result = subprocess.run(command, capture_output=True, check=True, text=True) - result = result.stdout.replace("\n", ":").split(":") - return result[result.index("Location")+1].strip()
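The patch above swaps the bare "pip show" invocation for "sys.executable -m pip show" so the lookup always resolves against the interpreter that is actually running, which matters inside virtualenvs. A minimal standalone sketch of the same pattern, independent of Transformer Engine (the package name below is only an example):

import subprocess
import sys

def package_location(name: str) -> str:
    """Return the install location reported by `pip show` for the current interpreter."""
    result = subprocess.run(
        [sys.executable, "-m", "pip", "show", name],
        capture_output=True, check=True, text=True,
    )
    fields = result.stdout.replace("\n", ":").split(":")
    return fields[fields.index("Location") + 1].strip()

print(package_location("onnx"))  # e.g. .../lib/python3.10/site-packages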