diff --git a/demo/NeMo/.gitignore b/demo/NeMo/.gitignore deleted file mode 100644 index af9bae11c..000000000 --- a/demo/NeMo/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -apex/ -Megatron-LM/ -NeMo/ -temp/ -__pycache__/ diff --git a/demo/NeMo/GPT3/GPT3ModelConfig.py b/demo/NeMo/GPT3/GPT3ModelConfig.py deleted file mode 100644 index 0e50d6cec..000000000 --- a/demo/NeMo/GPT3/GPT3ModelConfig.py +++ /dev/null @@ -1,87 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Base Class -import sys -sys.path.append('../../HuggingFace') # Include HuggingFace directory -from NNDF.networks import NNConfig, NetworkMetadata - -class GPT3ModelTRTConfig(NNConfig): - - NETWORK_FULL_NAME = "full" - TARGET_MODELS = [ - "gpt-126m", - "gpt-1.3b", - "gpt-5b", - ] - - def __init__( - self, - metadata, - **kwargs - ): - super().__init__( - network_name="GPT3", - **kwargs - ) - self.nemo_config = None - self.use_mask = False - self.metadata = metadata - self.variant = metadata.variant - - def from_nemo_config(self, nemo_config): - self.nemo_config = nemo_config - - def get_metadata_string(self, metadata: NetworkMetadata) -> str: - """ - Serializes a Metadata object into string. - String will be checked if friendly to filenames across Windows and Linux operating systems. - This function is a modified version from HuggingFace/NNDF/networks.py. - - returns: - string: -[-]*- - """ - - enabled_precisions = self.nemo_config.trt_export_options - precision_str = "-".join( - [ - k for k, v in { - "fp8": enabled_precisions.use_fp8, - "fp16": enabled_precisions.use_fp16, - "bf16": enabled_precisions.use_bf16, - }.items() if v - ] - ) - - result = [self.network_name, metadata.variant] - if precision_str: - result.append(precision_str) - - # Append max sequence length - result.append("ms" + str(self.nemo_config.model.max_seq_len)) - - if metadata.use_cache: - result.append("kv_cache") - - final_str = "-".join(result) - assert self._is_valid_filename( - final_str - ), "Metadata for current network {} is not filename friendly: {}.".format( - self.network_name, final_str - ) - - return final_str diff --git a/demo/NeMo/GPT3/decoding.py b/demo/NeMo/GPT3/decoding.py deleted file mode 100644 index 2edf66e7b..000000000 --- a/demo/NeMo/GPT3/decoding.py +++ /dev/null @@ -1,453 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -from collections.abc import Iterable -import sys -from typing import List - -from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator -from megatron.core import parallel_state -from nemo.collections.nlp.modules.common.text_generation_strategy import GPTModelTextGenerationStrategy -from nemo.utils import AppState -import torch -import torch.nn.functional as F - -from GPT3.trt_utils import GPTTRTDecoder - -sys.path.append('../../HuggingFace') # Include HuggingFace -from NNDF.logger import G_LOGGER - - -def sample_sequence_batch( - model, - inference_strategy, - context_tokens, - context_lengths, - tokens_to_generate, - all_probs=False, - temperature=None, - extra={}, -): - def repetition_penalty(logits, repetition_penalty, used_tokens): - """ Implement the repetition penalty, check paper - https://arxiv.org/pdf/1909.05858.pdf - """ - if used_tokens is not None and repetition_penalty != 1.0: - logits_update = torch.gather(logits, 1, used_tokens) - logits = torch.scatter(logits, 1, used_tokens, logits_update / repetition_penalty) - return logits - - def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf'), started=None): - """ - This function has been mostly taken from huggingface conversational - ai code at - https://medium.com/huggingface/how-to-build-a-state-of-the-art- - conversational-ai-with-transfer-learning-2d818ac26313 - - @param logits: logits tensor - @param top_k: keep only top k tokens with highest probability - @param top_p: keep the top tokens with cumulative probability - @filter_value: value to set filtered tokens to - @started: a tensor of bools indicating whether the text generation starts for the batch - returns the filtered logits - """ - if top_k > 0: - # Remove all tokens with a probability less than the - # last token of the top-k - indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] - if started is not None: - for i in torch.arange(indices_to_remove.size(0))[started]: - logits[i, indices_to_remove[i]] = filter_value - else: - logits[indices_to_remove] = filter_value - - if top_p > 0.0: - # Cconvert to 1D - sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1) - cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) - - # Remove tokens with cumulative probability above the threshold - sorted_indices_to_remove = cumulative_probs > top_p - # Shift the indices to the right to keep also the first token - # above the threshold - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() - sorted_indices_to_remove[..., 0] = 0 - if started is not None: - for i in torch.arange(sorted_indices.size(0))[started]: - indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]] - logits[i, indices_to_remove] = filter_value - else: - for i in range(sorted_indices.size(0)): - indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]] - logits[i, indices_to_remove] = filter_value - - return logits - - app_state = AppState() - batch_size = context_tokens.shape[0] - if not (hasattr(model, "trt") or hasattr(model, "onnx")): - _reconfigure_microbatch_calculator( - rank=app_state.global_rank, - rampup_batch_size=None, - global_batch_size=batch_size, - micro_batch_size=batch_size, - data_parallel_size=1, - ) - - tokenizer = model.tokenizer - # initialize the batch - with torch.no_grad(): - context_length = context_lengths.min().item() - 
context_lengths_cpu = context_lengths.cpu() - inference_strategy.init_batch(context_tokens, context_length) - # added eos_id to support the function generate_samples_eval that passes - # eos_id as an argument and needs termination when that id id found. - eod_id = tokenizer.eos_id - counter = 0 - - tokens = context_tokens - output_logits = None - all_generated_indices = None # used to track all generated indices - # Generate enough tokens for the longest sequence - maxlen = tokens_to_generate + context_lengths.max().item() - maxlen = inference_strategy.clip_max_len(maxlen) - - is_done = torch.zeros([batch_size]).byte() - lengths = torch.ones([batch_size]).long() * maxlen - - use_cache = extra.get("use_cache", False) - is_onnx = hasattr(model, "onnx") - is_trt = hasattr(model, "trt") - - if is_trt: - assert isinstance(model.trt, GPTTRTDecoder) - input_ids_name = model.trt.get_input_ids_name() - input_ids_type = model.trt.get_torch_type(input_ids_name) - position_ids_name = model.trt.get_position_ids_name() - position_ids_type = model.trt.get_torch_type(position_ids_name) - attention_mask_name = model.trt.get_attention_mask_name() - if attention_mask_name != None: - attention_mask_type = model.trt.get_torch_type(attention_mask_name) - - position_ids = inference_strategy.position_ids - attention_mask = inference_strategy.attention_mask - - torch.cuda.nvtx.range_pop() # "Prepare Batch" - while context_length < maxlen: - torch.cuda.nvtx.range_push("I/O Setup") - - output = None - if is_onnx and use_cache: - G_LOGGER.warn(f"ONNX runtime path does not support KV-cache.") - - # Modify counter based on using cache or not. - if is_trt: - # TRT input preprocessing doesn't use nemo function - pass - elif not is_onnx and use_cache: - batch, tensor_shape = inference_strategy.prepare_batch_at_step( - tokens, maxlen, batch_size, counter, context_length - ) - else: - batch, tensor_shape = inference_strategy.prepare_batch_at_step( - tokens, maxlen, batch_size, 0, context_length # step is always 0 - ) - - # inputs input_ids: [BS, SEQ], position_ids: [BS, SEQ], attention_mask: [1, 1, SEQ, SEQ] - if is_trt: - context_mode = (use_cache and counter == 0) or not use_cache - if context_mode or not use_cache: - # context mode - batch_tokens = tokens[:, :context_length] - batch_position_ids = position_ids[:, :context_length] - else: - # generate mode - batch_tokens = tokens[:, context_length - 1].view(batch_size, -1) - batch_position_ids = position_ids[:, context_length - 1].view(batch_size, -1) - seq_len = batch_tokens.shape[1] - batch_attention_mask = attention_mask[0:1, 0:1, :seq_len, :seq_len] - input_ids = batch_tokens.type(input_ids_type).contiguous().cuda() - tensor_dict = {input_ids_name : (input_ids.data_ptr(), input_ids.shape)} - if position_ids_name != None: - batch_position_ids = batch_position_ids.type(position_ids_type).contiguous().cuda() - tensor_dict[position_ids_name] = (batch_position_ids.data_ptr(), batch_position_ids.shape) - if attention_mask_name != None: - batch_attention_mask = batch_attention_mask.type(attention_mask_type).contiguous().cuda() - tensor_dict[attention_mask_name] = (batch_attention_mask.data_ptr(), batch_attention_mask.shape) - - logits_name = model.trt.get_output_name() - torch.cuda.nvtx.range_pop() # "I/O Setup" - output = model.trt.run(logits_name, tensor_dict, seq_len, context_mode) - - elif is_onnx: - assert len(batch) == 5, "Length of batch must be 5." 
- ( - batch_tokens, - attention_mask, - position_ids, - set_inference_key_value_memory, - _, - ) = batch - seq_len = batch_tokens.shape[1] - attention_mask = attention_mask[0:1, 0:1, 0:seq_len, 0:seq_len] - - from onnxruntime import InferenceSession - assert isinstance(model.onnxrt, InferenceSession) - # Currently only support onnx runtime with cpu - # Our fp8 models don't currently use a user-provided attention_mask - tensor_dict = {'input_ids': batch_tokens.cpu().detach().numpy(), - 'position_ids': position_ids.cpu().detach().numpy()} - - def have_attention_mask(sess): - return any(inp.name == 'attention_mask' for inp in all_inputs) - - if have_attention_mask(model.onnxrt): - tensor_dict['attention_mask'] = attention_mask.cpu().detach().numpy() - torch.cuda.nvtx.range_pop() # "I/O Setup" - output = model.onnxrt.run(['logits'], tensor_dict)[0] - output = torch.Tensor(output).cuda() - # output logits: [BS, SEQ, 50304] - else: - # nemo path - torch.cuda.nvtx.range_pop() # "I/O Setup" - output = inference_strategy.forward_step(batch, tensor_shape) - output = output[0]['logits'].float() - - assert output is not None - torch.cuda.nvtx.range_push("Output Sampling") - output = output.float() - logits = output[:, -1].view(batch_size, -1).contiguous() - - # make sure it will generate at least min_length - min_length = extra.get('min_tokens_to_generate', 0) - if min_length > 0: - within_min_length = (context_length - context_lengths) < min_length - logits[within_min_length, eod_id] = -float('Inf') - - # make sure it won't sample outside the vocab_size range - logits[:, tokenizer.vocab_size :] = -float('Inf') - - # started indicates whether the current token step passes the context_length, so we make sure not to overwrite the context tokens - started = context_lengths_cpu <= context_length - if extra.get('greedy', False): - prev = torch.argmax(logits, dim=-1).view(-1) - else: - logits = logits.float() - logits /= temperature - # handle repetition penality - logits = repetition_penalty(logits, extra.get('repetition_penalty', 1.0), all_generated_indices) - logits = top_k_logits( - logits, top_k=extra.get('top_k', 0), top_p=extra.get('top_p', 0.9), started=started - ) - probs = F.softmax(logits, dim=-1) - prev = torch.multinomial(probs, num_samples=1).view(-1) - - prev = prev.cpu() - # Clamp the predicted out of vocabulary tokens - prev = torch.clamp(prev, max=tokenizer.vocab_size - 1) - # Replace sampled tokens w/ done token if EOD has already been sampled - new_tokens = torch.where(is_done, eod_id, prev) - # post process the inference tokens based on the strategy - inference_strategy.post_process(tokens, new_tokens, context_length) - - # Insert either new predicted or next prompt token - if extra.get("accuracy_mode", False): - # We only update the last token for accuracy mode. 
- at_prediction_index = (context_lengths + tokens_to_generate - 1 == context_length) - tokens[:, context_length] = torch.where(at_prediction_index, new_tokens.cuda(), tokens[:, context_length]) - else: - tokens[:, context_length] = torch.where(started.cuda(), new_tokens.cuda(), tokens[:, context_length]) - - if not extra.get("benchmark_mode", False): - if output_logits is None: - output = F.log_softmax(output[:, :context_length, :], 2) - indices = torch.unsqueeze(tokens[:, 1 : context_length + 1], 2) - output_logits = torch.gather(output, 2, indices).squeeze(2) - all_generated_indices = indices[:, :, 0] - if all_probs: - full_logits = output - else: - output = F.log_softmax(output, 2) - indices = torch.unsqueeze(new_tokens.cuda(), 1).unsqueeze(2) - new_output_logits = torch.gather(output, 2, indices).squeeze(2) - - # This copy can be optimized out by pre-allocating the memory. - output_logits = torch.cat([output_logits, new_output_logits], 1) - all_generated_indices = torch.cat([all_generated_indices, indices[:, :, 0]], 1) - if all_probs: - if extra.get("use_cache", False): - full_logits = torch.cat([full_logits, output], 1) - else: - full_logits = output - - done_token = (prev == eod_id) - done_token = done_token.byte() & started.byte() - - just_finished = (done_token & ~is_done).bool() - lengths[just_finished.view(-1)] = context_length - is_done = is_done | done_token - - done = torch.all(is_done) - torch.cuda.nvtx.range_pop() # "Output Sampling" - - context_length += 1 - counter += 1 - if done and not extra.get("benchmark_mode", False): - break - - if all_probs: - return tokens, context_length, lengths, output_logits, full_logits - return tokens, context_length, lengths, output_logits, None - -def initialize_ddp(model, cfg): - # check whether the DDP is initialized - if cfg.runtime == "nemo" and parallel_state.is_unitialized(): - def dummy(): - return - if model.trainer.strategy.launcher is not None: - model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) - model.trainer.strategy.setup_environment() - - if model.cfg.get('transformer_engine', False): - model.setup_transformer_engine_tp_groups() - -def get_special_tokens(tokenizer): - special_tokens = set() - if hasattr(tokenizer, 'pad_token') and tokenizer.pad_token is not None: - special_tokens.add(tokenizer.pad_token) - if hasattr(tokenizer, 'eos_token') and tokenizer.eos_token is not None: - special_tokens.add(tokenizer.eos_token) - if hasattr(tokenizer, 'bos_token') and tokenizer.bos_token is not None: - special_tokens.add(tokenizer.bos_token) - if hasattr(tokenizer, 'cls_token') and tokenizer.cls_token is not None: - special_tokens.add(tokenizer.cls_token) - if hasattr(tokenizer, 'unk_token') and tokenizer.unk_token is not None: - special_tokens.add(tokenizer.unk_token) - if hasattr(tokenizer, 'sep_token') and tokenizer.sep_token is not None: - special_tokens.add(tokenizer.sep_token) - if hasattr(tokenizer, 'mask_token') and tokenizer.mask_token is not None: - special_tokens.add(tokenizer.mask_token) - return special_tokens - -def process_output(model, output, return_segments=False): - torch.cuda.nvtx.range_push("Process Output") - inference_strategy = GPTModelTextGenerationStrategy(model) - tokenizer = model.tokenizer - if output is not None: - decode_tokens, output_logits, full_logits = output - decode_tokens = decode_tokens.cpu().numpy().tolist() - - # convert ids to text by applying tokenizer - resp_sentences = list(map(tokenizer.ids_to_text, decode_tokens)) - - all_offsets = [] - resp_sentences_seg = [] - if 
return_segments: - # segments sentences into words. - for decode_token in decode_tokens: - words = [] - for token in decode_token: - if not isinstance(token, Iterable): - token = [token] - word = tokenizer.ids_to_tokens(token) - if isinstance(word, Iterable): - word = word[0] - if hasattr(tokenizer.tokenizer, 'byte_decoder'): - word = bytearray([tokenizer.tokenizer.byte_decoder[c] for c in word]).decode( - 'utf-8', errors='replace' - ) - words.append(word) - resp_sentences_seg.append(words) - - # offsets calculation - special_tokens = get_special_tokens(tokenizer) - for item in resp_sentences_seg: - offsets = [0] - for index, token in enumerate(item): - if index != len(item) - 1: - if token in special_tokens: - offsets.append(offsets[-1]) - else: - offsets.append(len(token) + offsets[-1]) - all_offsets.append(offsets) - - output = {} - output['sentences'] = resp_sentences - output['tokens'] = resp_sentences_seg - output['logprob'] = output_logits - output['full_logprob'] = full_logits - output['token_ids'] = decode_tokens - output['offsets'] = all_offsets - output = inference_strategy.post_generation_process(output) - torch.cuda.nvtx.range_pop() # "Process Output" - return output - -def generate(model, inputs, cfg): - torch.cuda.nvtx.range_push("Prepare Batch") - initialize_ddp(model, cfg) - - tokens_to_generate = cfg.inference.tokens_to_generate - min_tokens_to_generate = cfg.inference.min_tokens_to_generate - add_BOS = cfg.inference.add_BOS - all_probs = cfg.inference.all_probs - temperature = cfg.inference.temperature - is_benchmark_mode = True if cfg.mode == "benchmark" else False - is_accuracy_mode = True if cfg.mode == "accuracy" else False - - inference_strategy = GPTModelTextGenerationStrategy(model) - if isinstance(inputs, tuple): - context_tokens_tensor, context_length_tensor = inputs - else: - context_tokens_tensor, context_length_tensor = inference_strategy.tokenize_batch( - inputs, tokens_to_generate, add_BOS - ) - - context_length = context_length_tensor.min().item() - - batch_token_result = sample_sequence_batch( - model, - inference_strategy, - context_tokens_tensor, - context_length_tensor, - tokens_to_generate, - all_probs, - temperature=temperature, - extra={ - "top_p": cfg.inference.top_p, - "top_k": cfg.inference.top_k, - "greedy": cfg.inference.greedy, - "repetition_penalty": cfg.inference.repetition_penalty, - "min_tokens_to_generate": min_tokens_to_generate, - "use_cache": cfg.use_cache, - "benchmark_mode": is_benchmark_mode, - "accuracy_mode": is_accuracy_mode, - "use_fp8_storage": cfg.onnx_export_options.use_fp8_storage, - }, - ) - - tokens, context_length, _, output_logits, full_logits = batch_token_result - - output = None - if tokens is not None: - output = tokens[:, :context_length], output_logits, full_logits - return output - -def full_inference(model, inputs, cfg): - output = generate(model, inputs, cfg) - if output is not None: - output = process_output(model, output, return_segments=(cfg.mode is not "benchmark")) - return output diff --git a/demo/NeMo/GPT3/frameworks.py b/demo/NeMo/GPT3/frameworks.py deleted file mode 100644 index 851f4cdf4..000000000 --- a/demo/NeMo/GPT3/frameworks.py +++ /dev/null @@ -1,81 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import sys - -import omegaconf - -# Add syspath for custom library -if __name__ == "__main__": - filepath = os.path.dirname(os.path.abspath(__file__)) - project_root = os.path.join(filepath, os.pardir) - sys.path.append(project_root) - -from GPT3.nemo_utils import load_nemo_model -from GPT3.GPT3ModelConfig import GPT3ModelTRTConfig -from interface import NeMoCommand - -sys.path.append('../../HuggingFace') # Include HuggingFace -from NNDF.interface import FRAMEWORK_NATIVE -from NNDF.networks import ( - NetworkModel, - NetworkModels, -) - -class GPT3NeMoTorch(NeMoCommand): - def __init__( - self, - nemo_cfg, - config_class=GPT3ModelTRTConfig, - description="Runs framework results for GPT3 model with NeMo.", - **kwargs - ): - super().__init__(nemo_cfg, config_class, description, model_classes=None, **kwargs) - self.framework_name = FRAMEWORK_NATIVE - - def setup_tokenizer_and_model(self): - self.nemo_cfg.runtime = 'nemo' - self.model = load_nemo_model(self.nemo_cfg) - self.tokenizer = self.model.tokenizer - - torch_models = [ - NetworkModel( - name=GPT3ModelTRTConfig.NETWORK_FULL_NAME, fpath=self.workspace.torch_path - ) - ] - return NetworkModels(torch=torch_models, onnx=None, trt=None) - - def process_framework_specific_arguments(self, onnx_model: str = None, **kwargs): - if onnx_model: - raise RuntimeError( - "native framework does not support loading an ONNX file via `onnx-model` yet. Please specify the NeMo model using `nemo-model` instead." - ) - - -# Entry point -def getGPT3NeMoTorch(): - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../config.yaml") - nemo_cfg = omegaconf.OmegaConf.load(config_path) - return GPT3NeMoTorch(nemo_cfg) - -# Entry point -RUN_CMD = getGPT3NeMoTorch() - -if __name__ == "__main__": - result = RUN_CMD() - print("Results: {}".format(result)) diff --git a/demo/NeMo/GPT3/lambada_dataset.py b/demo/NeMo/GPT3/lambada_dataset.py deleted file mode 100644 index a7945cec7..000000000 --- a/demo/NeMo/GPT3/lambada_dataset.py +++ /dev/null @@ -1,126 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import os -import collections -import json -import requests -import sys -import torch -from torch.nn.utils.rnn import pad_sequence - -# Add syspath for custom library -if __name__ == "__main__": - filepath = os.path.dirname(os.path.abspath(__file__)) - project_root = os.path.join(filepath, os.pardir) - sys.path.append(project_root) - -from nemo_export import create_dir_if_not_exist - -__all__ = ['Lambada'] - - -class Lambada(): - - def __init__(self, base_dir, tokens_to_generate, padding = -1, max_length = 2048): - assert tokens_to_generate >= 1 - assert padding == -1 or tokens_to_generate == 1 - self.base_dir = base_dir - self.tokens_to_generate = tokens_to_generate - self.padding = padding - self.max_length = max_length - self.download() - - def get_data_file_path(self): - path = os.path.join(self.base_dir, "lambada") - path = os.path.join(path, "lambada_test.jsonl") - create_dir_if_not_exist(path) - return path - - def download(self): - path = self.get_data_file_path() - if not os.path.exists(path): - url = "https://github.com/cybertronai/bflm/raw/master/lambada_test.jsonl" - with requests.get(url) as r, open(path, 'wb') as fh: - fh.write(r.content) - - def load(self): - path = self.get_data_file_path() - with open(path) as fh: - for line in fh: - yield json.loads(line) - - def _preprocess(self, text): - text = text.replace("“", '"') - text = text.replace("”", '"') - text = text.replace("’", "'") - text = text.replace("‘", "'") - return text - - def doc_to_text(self, doc): - return "\n" + self._preprocess(doc["text"].rsplit(" ", 1)[0]).strip() - - def doc_to_target(self, doc): - split_text = doc["text"].rsplit(" ", 1) - if len(split_text) <= 1: - raise ValueError(f"Input doc '{doc}' does not have target.") - return " " + self._preprocess(split_text[1]) - - def preprocess_input(self, tokenizer, docs): - _Input = collections.namedtuple("_DS_Input", ["inputs", "inp_enc", "lens", "lens_pad", "conti_len"]) - batch_size = len(docs) - tokens = [] - conti_lens = [] - lens = [] - inp_encs = [] - for doc in docs: - # Handle padded text - if not doc["text"]: - inp_enc = [0] - conti_len = 0 - else: - text = self.doc_to_text(doc) - target = self.doc_to_target(doc) - - context_enc = tokenizer.text_to_ids(text) - continuation_enc = tokenizer.text_to_ids(target) - - inp_enc = (context_enc + continuation_enc)[-(self.max_length + 1) :] - conti_len = len(continuation_enc) - - inp_encs.append(inp_enc) - conti_lens.append(conti_len) - tokens.append(torch.tensor(inp_enc)) - lens.append(len(inp_enc) - 1) - max_lens = max(lens) - - tokens_pad = pad_sequence(tokens, batch_first=False, padding_value=tokenizer.eos_id) - if self.padding != -1 and max_lens % self.padding != 0: - # We need align the context length to multiple of 8 for FP8 run using NeMo framework. 
- extra_pad_len = self.padding - (max_lens % self.padding) - - extra_pad = torch.ones(extra_pad_len, batch_size) * tokenizer.eos_id - extra_pad = extra_pad.type_as(tokens_pad) - inp_enc_pad = torch.vstack((tokens_pad, extra_pad)).T - - lens_pad = max_lens + extra_pad_len - else: - inp_enc_pad = tokens_pad.T - lens_pad = max_lens + 1 - self.tokens_to_generate - - inputs = (torch.tensor(inp_enc_pad).cuda(), (torch.ones(batch_size, dtype=torch.int32) * lens_pad).cuda()) - return _Input(inputs=inputs, inp_enc=inp_encs, lens=lens, lens_pad=lens_pad, conti_len=conti_lens) - diff --git a/demo/NeMo/GPT3/nemo_utils.py b/demo/NeMo/GPT3/nemo_utils.py deleted file mode 100644 index f6d5bca7c..000000000 --- a/demo/NeMo/GPT3/nemo_utils.py +++ /dev/null @@ -1,161 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import gc -import os -import sys - -# Only print out error messages from NeMo -from nemo.utils.nemo_logging import Logger as NG_LOGGER -nemo_logger = NG_LOGGER(False) -nemo_logger.setLevel(nemo_logger.ERROR) - -from nemo.utils.app_state import AppState -from nemo.utils.model_utils import inject_model_parallel_rank -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector -from omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer -import torch - -sys.path.append('../../HuggingFace') # Include HuggingFace directory. -from NNDF.logger import G_LOGGER - - -def get_computeprob_response(tokenizer, response, inputs): - """ - This function is a modified version from: - https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/modules/common/text_generation_utils.py#L139 - - So parallel state does not need to be initialized before calling this function. 
- """ - compute_prob_response = {} - new_token_ids = [] - new_tokens = [] - new_texts = [] - log_probs = [] - full_logprobs = [] - offsets = [] - for batch_id in range(len(response['tokens'])): - if isinstance(inputs, (list, tuple)): - if isinstance(inputs[0], str): - new_token_id = tokenizer.text_to_ids(inputs[batch_id]) - new_text = inputs[batch_id] - token_len = len(new_token_id) - elif isinstance(inputs[0], torch.Tensor): - token_len = int(inputs[1][batch_id].item()) - new_token_id = inputs[0][batch_id][:token_len].tolist() - new_text = tokenizer.ids_to_text(new_token_id) - new_token_ids.append(new_token_id) - new_tokens.append(response['tokens'][batch_id][:token_len]) - new_texts.append(new_text) - log_probs.append(response['logprob'][batch_id][:token_len]) - full_logprobs.append(response['full_logprob'][batch_id][:token_len]) - offsets.append(response['offsets'][batch_id][:-1]) - compute_prob_response['sentences'] = new_texts - compute_prob_response['tokens'] = new_tokens - compute_prob_response['token_ids'] = new_token_ids - compute_prob_response['logprob'] = log_probs - compute_prob_response['full_logprob'] = full_logprobs - compute_prob_response['offsets'] = offsets - return compute_prob_response - - -def load_nemo_model(cfg, model_class=MegatronGPTModel): - # Trainer is required for restoring model parallel models - trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) - - if cfg.gpt_model_file and cfg.checkpoint_dir: - raise ValueError(f"NeMo model and checkpoint cannot be both set.") - - if cfg.gpt_model_file: - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.gpt_model_file): - save_restore_connector.model_extracted_dir = cfg.gpt_model_file - - pretrained_cfg = MegatronGPTModel.restore_from( - restore_path=cfg.gpt_model_file, - trainer=trainer, - return_config=True, - save_restore_connector=save_restore_connector, - ) - OmegaConf.set_struct(pretrained_cfg, True) - with open_dict(pretrained_cfg): - pretrained_cfg.sequence_parallel = False - pretrained_cfg.activations_checkpoint_granularity = None - pretrained_cfg.activations_checkpoint_method = None - pretrained_cfg.precision = trainer.precision - if trainer.precision == "16": - pretrained_cfg.megatron_amp_O2 = False - model = model_class.restore_from( - restore_path=cfg.gpt_model_file, - trainer=trainer, - override_config_path=pretrained_cfg, - save_restore_connector=save_restore_connector, - ) - G_LOGGER.info(f"{type(model)} has been successfully restored from {cfg.gpt_model_file}") - elif cfg.checkpoint_dir: - checkpoint_file= os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name) - if not os.path.exists(checkpoint_file): - raise ValueError(f"File {checkpoint_file} does not exist.") - - app_state = AppState() - if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: - app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size - app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size - ( - app_state.tensor_model_parallel_rank, - app_state.pipeline_model_parallel_rank, - app_state.model_parallel_size, - app_state.data_parallel_size, - app_state.pipeline_model_parallel_split_rank, - app_state.virtual_pipeline_model_parallel_rank, - ) = fake_initialize_model_parallel( - world_size=app_state.model_parallel_size, - rank=trainer.global_rank, - tensor_model_parallel_size_=cfg.tensor_model_parallel_size, - 
pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, - pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, - ) - checkpoint_path = inject_model_parallel_rank(checkpoint_file) - model = model_class.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer) - G_LOGGER.info(f"{type(model)} has been successfully restored from checkpoint {checkpoint_path}") - else: - raise ValueError("Need to provide a nemo gpt model through config file.") - - model.freeze() - - # Have to turn off activations_checkpoint_method for inference - try: - model.model.language_model.encoder.activations_checkpoint_method = None - except AttributeError: - pass - - model.eval() - G_LOGGER.debug(f"Model configuration: {model.cfg}") - G_LOGGER.debug(f"Vocabulary size: {model.tokenizer.vocab_size}") - return model.cuda() - -def release_nemo_model(model): - print(f"Releaseing nemo model.") - model.model.cpu() - del model.model - gc.collect() - torch.cuda.empty_cache() - model.model = None diff --git a/demo/NeMo/GPT3/onnxrt.py b/demo/NeMo/GPT3/onnxrt.py deleted file mode 100644 index 78bd0acab..000000000 --- a/demo/NeMo/GPT3/onnxrt.py +++ /dev/null @@ -1,112 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import sys - -import onnxruntime as ort -import onnx -import omegaconf -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel - -# Add syspath for custom library -if __name__ == "__main__": - filepath = os.path.dirname(os.path.abspath(__file__)) - project_root = os.path.join(filepath, os.pardir) - sys.path.append(project_root) - -from interface import NeMoCommand, BaseModel -from nemo_export import NeMoConverter -from GPT3.GPT3ModelConfig import GPT3ModelTRTConfig - -sys.path.append('../../HuggingFace') # Include HuggingFace -from NNDF.interface import FRAMEWORK_ONNXRT -from NNDF.logger import G_LOGGER -from NNDF.networks import ( - NetworkModel, - NetworkModels, -) - -class GPT3NeMoOnnxRT(NeMoCommand): - def __init__( - self, - nemo_cfg, - config_class=GPT3ModelTRTConfig, - description="Runs ONNX Runtime results for GPT3 model.", - **kwargs - ): - super().__init__(nemo_cfg, config_class, description, model_classes=None, **kwargs) - self.framework_name = FRAMEWORK_ONNXRT - - - def load_onnx_model(self): - G_LOGGER.info(f'Loading ONNX model from {self.nemo_cfg.onnx_model_file}') - - def get_opset_version(name : str) -> int: - """Returns opset. - - `model` here is local in scope and python's gc will collect - it without manual memory management via `del`. 
- """ - model = onnx.load(name, load_external_data=False) - return model.opset_import[0].version - - assert get_opset_version(self.nemo_cfg.onnx_model_file) == 17 - return ort.InferenceSession(self.nemo_cfg.onnx_model_file) - - - def setup_tokenizer_and_model(self): - self.nemo_cfg.runtime = 'onnx' - self.model = BaseModel() - self.model.cfg = self.nemo_cfg.model - self.model.tokenizer = get_tokenizer(tokenizer_name='megatron-gpt-345m', vocab_file=None, merges_file=None) - - if not self.nemo_cfg.onnx_model_file: - self.nemo_cfg.onnx_model_file = os.path.join( - self.workspace.dpath, - f"onnx/model-{self.nemo_cfg.trainer.precision}.onnx", - ) - - converter = NeMoConverter(self.nemo_cfg, MegatronGPTModel) - if not os.path.isfile(self.nemo_cfg.onnx_model_file): - # Convert NeMo model to ONNX model - onnx_name = converter.nemo_to_onnx() - self.nemo_cfg.onnx_model_file = onnx_name - - # The ONNX model is in opset17 by default. - self.model.onnxrt = self.load_onnx_model() - self.tokenizer = self.model.tokenizer - onnx_models = [ - NetworkModel( - name=GPT3ModelTRTConfig.NETWORK_FULL_NAME, fpath=self.nemo_cfg.onnx_model_file, - ) - ] - return NetworkModels(torch=None, onnx=onnx_models, trt=None) - -# Entry point -def getGPT3NeMoOnnxRT(): - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../config.yaml") - nemo_cfg = omegaconf.OmegaConf.load(config_path) - return GPT3NeMoOnnxRT(nemo_cfg) - -# Entry point -RUN_CMD = getGPT3NeMoOnnxRT() - -if __name__ == "__main__": - result = RUN_CMD() - print("Results: {}".format(result)) diff --git a/demo/NeMo/GPT3/sequence_perplexity.py b/demo/NeMo/GPT3/sequence_perplexity.py deleted file mode 100644 index 9fc9ef29c..000000000 --- a/demo/NeMo/GPT3/sequence_perplexity.py +++ /dev/null @@ -1,76 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import math -import numpy as np -import torch - -__all__ = ['SequencePerplexity'] - -class SequencePerplexity(): - def __init__(self, topN): - super().__init__() - self.ppls = [] - self.sequence_ppls = [] - self.topN_equals = [0] * len(topN) - self.topN = topN - - def update(self, ds_input, response, tokenizer): - for batch, tokens in enumerate(response['token_ids']): - inp_len = ds_input.lens[batch] - if inp_len == 0: - continue - - conti_len = ds_input.conti_len[batch] - - response_token_ids = tokens[:inp_len] - assert response_token_ids == ds_input.inp_enc[batch][:-1], f"Mismatch in input tokens." - full_log_probs = response['full_logprob'][batch][:inp_len] - - # calculate ppl with whole sequence. - label = torch.tensor([ds_input.inp_enc[batch][1:]]).cuda() - log_probs = full_log_probs.unsqueeze(0).permute((0, 2, 1)) - ppl = torch.nn.CrossEntropyLoss()(log_probs, label) - self.sequence_ppls.append(ppl.cpu()) - - # calculate topN. 
- log_probs = full_log_probs[-conti_len:] - conti_token_ids = ds_input.inp_enc[batch][-conti_len:] - conti_tokens = tokenizer.ids_to_tokens(conti_token_ids) - - for index, topN in enumerate(self.topN): - if conti_token_ids[0] in log_probs.topk(topN, dim=-1).indices: - self.topN_equals[index] += 1 - - # calculate ppl with last token. - log_probs = log_probs.cpu().to(torch.float32) - conti_enc = torch.tensor(tokenizer.tokens_to_ids(conti_tokens)) - conti_probs = torch.gather(log_probs, 1, conti_enc.unsqueeze(-1)).squeeze(-1) - - ppl = float(conti_probs.sum()) - self.ppls.append(ppl) - - def compute(self): - ppls = math.exp(-np.mean(np.array(self.ppls))) - sequence_ppls = math.exp(np.mean(np.array(self.sequence_ppls))) - acc = [equals / len(self.ppls) for equals in self.topN_equals] - txt = [] - for i, j in zip(self.topN, acc): - txt.append("acc(top{}): {:.4f}".format(i, j)) - acc_text = ", ".join(txt) - return ppls, sequence_ppls, acc, acc_text - diff --git a/demo/NeMo/GPT3/trt.py b/demo/NeMo/GPT3/trt.py deleted file mode 100644 index 189c1ba32..000000000 --- a/demo/NeMo/GPT3/trt.py +++ /dev/null @@ -1,236 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import sys - -import omegaconf -from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel - -# Add syspath for custom library -if __name__ == "__main__": - filepath = os.path.dirname(os.path.abspath(__file__)) - project_root = os.path.join(filepath, os.pardir) - sys.path.append(project_root) - -from nemo_export import NeMoConverter, create_dir_if_not_exist -from GPT3.GPT3ModelConfig import GPT3ModelTRTConfig -from GPT3.trt_utils import load_trt_model -from interface import NeMoCommand, BaseModel -import onnx - -sys.path.append('../../HuggingFace') # Include HuggingFace -from NNDF.interface import FRAMEWORK_TENSORRT -from NNDF.logger import G_LOGGER -from NNDF.models import _log_fake_perf_metrics -from NNDF.networks import ( - NetworkModel, - NetworkModels, -) - -class GPT3NeMoTRT(NeMoCommand): - def __init__( - self, - nemo_cfg, - config_class=GPT3ModelTRTConfig, - description="Runs TensorRT results for GPT3 model.", - **kwargs - ): - super().__init__(nemo_cfg, config_class, description, model_classes=None, **kwargs) - self.framework_name = FRAMEWORK_TENSORRT - - - def setup_tokenizer_and_model(self): - self.nemo_cfg.runtime = 'trt' - self.model = BaseModel() - self.model.cfg = self.nemo_cfg.model - self.model.tokenizer = get_tokenizer(tokenizer_name='megatron-gpt-345m', vocab_file=None, merges_file=None) - - # Path to write new onnx models if need arises. Prevents overwrite of - # user-provided onnx files in case opset_version needs to be upgraded - # to 19 or onnx files with kv-cache needs to be written. 
- onnx_workpath = os.path.join( - self.workspace.dpath, - "onnx", - ) - if self.nemo_cfg.onnx_model_file: - # Input by user, can be a read-only location. - onnx_name = self.nemo_cfg.onnx_model_file - else: - onnx_name = os.path.join( - onnx_workpath, - f"model-{self.nemo_cfg.trainer.precision}.onnx", - ) - self.nemo_cfg.onnx_model_file = onnx_name - self.nemo_cfg.trt_export_options.timing_cache = self.timing_cache - - converter = NeMoConverter(self.nemo_cfg, MegatronGPTModel) - if not os.path.isfile(onnx_name): - # Convert NeMo model to ONNX model - onnx_name = converter.nemo_to_onnx() - - def get_opset_version(name : str) -> int: - """Returns opset. - - `model` here is local in scope and python's gc will collect - it without manual memory management via `del`. - """ - model = onnx.load(name, load_external_data=False) - return model.opset_import[0].version - - opset_version = get_opset_version(onnx_name) - if opset_version < 19: - opset19_onnx_name = NeMoConverter.get_opset19_onnx_fpath( - onnx_name, onnx_workpath - ) - if not os.path.isfile(opset19_onnx_name): - opset19_onnx_name = NeMoConverter.onnx_to_opset19( - onnx_name, onnx_workpath - ) - - if opset19_onnx_name != None: - onnx_name = opset19_onnx_name - - # Add KV cache to ONNX model - kv_output_policy = "kv_new" - - converter = NeMoConverter(self.nemo_cfg) - - def has_kv_cache_support( - model_name: str, match_names=("key", "value", "kv") - ) -> bool: - """To detect onnx models with kv_cache exported, input node names - contain match_names. - """ - model = onnx.load(model_name, load_external_data=False) - - # Get network inputs. - input_all = [node.name for node in model.graph.input] - input_initializer = [node.name for node in model.graph.initializer] - net_input_names = list(set(input_all) - set(input_initializer)) - - kv_nodes = filter( - lambda name: any(map(lambda match: match in name, match_names)), - net_input_names, - ) - return any(kv_nodes) and len(net_input_names) > 2 - - if (not self.nemo_cfg.use_cache) and (has_kv_cache_support(onnx_name)): - raise RuntimeError( - "ONNX model has been exported with kv-cache enabled, but " - "runtime configuration has kv-cache disabled. Consider " - "enabling kv-cache support via the `use-cache` option." 
- ) - - if self.nemo_cfg.use_cache and (not has_kv_cache_support(onnx_name)): - G_LOGGER.info(f"Converting {onnx_name} with KV-cache support") - new_dir = onnx_workpath + f"_{kv_output_policy}" - if self.nemo_cfg.onnx_export_options.use_fp8_storage: - new_dir += f"_fp8_storage" - onnx_output_fpath = os.path.join(new_dir, onnx_name.split("/")[-1]) - - if not os.path.isfile(onnx_output_fpath): - create_dir_if_not_exist(onnx_output_fpath) - converter.create_onnx(onnx_name, onnx_output_fpath, kv_output_policy) - onnx_name = onnx_output_fpath - - if self.nemo_cfg.onnx_export_options.prune: - onnx_name = converter.prune_onnx(onnx_name) - - # Convert ONNX model to TRT engine - self.nemo_cfg.trt_export_options.use_strongly_typed = self.use_strongly_typed - self.nemo_cfg.trt_export_options.timing_cache = self.timing_cache - self.nemo_cfg.trt_export_options.opt_seq_len = self.opt_seq_len - - suffixes = [] - suffixes.append("bs" + str(self.nemo_cfg.batch_size)) - if self.nemo_cfg.trt_export_options.opt_seq_len != None: - suffixes.append("opt" + str(self.nemo_cfg.trt_export_options.opt_seq_len)) - if self.nemo_cfg.use_cache: - suffixes.append("kv") - if self.nemo_cfg.onnx_export_options.use_fp8_storage: - suffixes.append("fp8_storage") - if self.nemo_cfg.trt_export_options.sparse: - suffixes.append("sp") - if not self.nemo_cfg.trt_export_options.use_strongly_typed: - suffixes.append("no_strongly_typed") - suffix = "-".join(suffixes) - trt_fpath = os.path.join(self.workspace.dpath, f"trt-{suffix}.plan") - - if os.path.isfile(trt_fpath): - G_LOGGER.debug(f"TRT Engine plan exists at location {trt_fpath}.") - _log_fake_perf_metrics() - else: - converter.onnx_to_trt(onnx_name, trt_fpath) - - self.nemo_cfg.trt_engine_file = trt_fpath - self.model.trt = load_trt_model(self.nemo_cfg) - self.tokenizer = self.model.tokenizer - onnx_models = [ - NetworkModel( - name=GPT3ModelTRTConfig.NETWORK_FULL_NAME, fpath=self.nemo_cfg.onnx_model_file, - ) - ] - return NetworkModels(torch=None, onnx=onnx_models, trt=None) - - def add_args(self): - super().add_args() - engine_group = self._parser.add_argument_group("trt engine") - engine_group.add_argument( - "--opt-seq-len", - default=None, - help="Set optimized input sequence length to be used in engine building", - type=int, - ) - engine_group.add_argument( - "--no-timing-cache", - default=False, - help="Set to not use timing cache for speeding up engine building", - action="store_true", - ) - engine_group.add_argument( - "--no-strongly-typed", - default=False, - help="Disable strongly typed mode in engine building", - action="store_true", - ) - - def process_framework_specific_arguments( - self, - opt_seq_len: int = None, - no_timing_cache: bool = False, - no_strongly_typed: bool = False, - **kwargs - ): - self.opt_seq_len = opt_seq_len - self.use_timing_cache = not no_timing_cache - self.use_strongly_typed = not no_strongly_typed - self.timing_cache = self.workspace.get_timing_cache() if self.use_timing_cache else None - -# Entry point -def getGPT3NeMoTRT(): - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../config.yaml") - nemo_cfg = omegaconf.OmegaConf.load(config_path) - return GPT3NeMoTRT(nemo_cfg) - -# Entry point -RUN_CMD = getGPT3NeMoTRT() - -if __name__ == "__main__": - result = RUN_CMD() - print("Results: {}".format(result)) diff --git a/demo/NeMo/GPT3/trt_utils.py b/demo/NeMo/GPT3/trt_utils.py deleted file mode 100644 index a146cf7e8..000000000 --- a/demo/NeMo/GPT3/trt_utils.py +++ /dev/null @@ -1,231 +0,0 @@ -# -# 
SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import sys - -import numpy as np -import tensorrt as trt -import torch -from transformers.configuration_utils import PretrainedConfig - -sys.path.append('../../HuggingFace') # Include HuggingFace directory -from NNDF.models import TRTEngineFile -from NNDF.networks import NetworkMetadata -from NNDF.tensorrt_utils import TRTNativeRunner -from NNDF.logger import G_LOGGER -from Seq2Seq.export import DecoderTRTEngine - -from HuggingFace.NNDF.tensorrt_utils import TRTNativeRunner, CUASSERT -from cuda import cudart - - -class GPTTRTDecoder(TRTNativeRunner): - - INPUT_IDS_INDEX = 0 - POSITION_IDS_INDEX = 1 - ATTENTION_MASK_INDEX = 2 - - def __init__( - self, - trt_engine_file: TRTEngineFile, - use_cache: bool, - use_fp8_storage: bool, - cfg, - network_metadata: NetworkMetadata = None, - hf_config: PretrainedConfig = None, - ): - super().__init__(trt_engine_file, network_metadata, hf_config) - self.use_cache = use_cache - self.use_fp8_storage = use_fp8_storage - if self.use_cache: - self._set_context_mode_trt_context() - self.io_names = set() - self.input_tensor_names = set() - for i in range(self.trt_engine.num_io_tensors): - tensor_name = self.trt_engine.get_tensor_name(i) - self.io_names.add(tensor_name) - if self.trt_engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT: - self.input_tensor_names.add(tensor_name) - - self.cfg = cfg - logits_size = self.cfg.batch_size * self.cfg.model.max_seq_len * self.cfg.model.vocab_size - - self.batch_size = self.cfg.batch_size - self.max_seq_len = self.cfg.model.max_seq_len - self.num_layers = self.cfg.model.num_layers - self.nb_heads = self.cfg.model.nb_heads - self.head_size = self.cfg.model.head_size - - dtype = self.get_torch_type(self.get_output_name()) - self.logits = torch.zeros(logits_size, dtype=dtype).contiguous().cuda() - - - self.init_kv_cache() - self.past_decoder_length = 0 - - # Setting next input shape when executing gpu kernel. - # Use dict to record which inputs have changed. 
- self.input_shape_change_record = dict() - - def init_kv_cache(self): - # kv cache buffer - self.attention_kv_cache_buffer = dict() - cache_dtype = torch.float16 - if self.use_fp8_storage: - cache_dtype = torch.uint8 - for i in range(self.num_layers): - for code in ["key", "value"]: - attention_kv_cache_name = self.make_kv_cache_name(i, code) - self.attention_kv_cache_buffer[attention_kv_cache_name] = torch.empty( - self.max_seq_len, - self.batch_size, - self.nb_heads, - self.head_size, - dtype=cache_dtype, - device=torch.cuda.current_device(), - ).contiguous().cuda() - - - def make_kv_cache_name(self, layer, code): - return f"key_values.{layer}.decoder.{code}" - - def _set_context_mode_trt_context(self): - # Create TRT context for context mode (1st decoder run) with optimization profile index = 1 - self.context_trt_context = self.trt_engine.create_execution_context() - self.context_trt_context.set_optimization_profile_async(1, self.stream) - - def get_torch_type(self, name): - trt_type = self.trt_engine.get_tensor_dtype(name) - mapping = { - trt.float32: torch.float32, - trt.float16: torch.float16, - trt.int8: torch.int8, - trt.int32: torch.int32, - trt.int64: torch.int64, - trt.bool: torch.bool, - trt.uint8: torch.uint8, - trt.bfloat16: torch.bfloat16, - } - if trt_type in mapping: - return mapping[trt_type] - raise ValueError(f"Got unexpected tensorrt dtype {trt_type} in get_torch_type().") - - def get_input_ids_name(self): - return self.trt_engine.get_tensor_name(self.INPUT_IDS_INDEX) - - def has_position_ids(self): - # If the input at POSITION_IDS_INDEX has a dimension of 2, assume it is position_ids. - return len(self.trt_engine.get_tensor_shape(self.trt_engine.get_tensor_name(self.POSITION_IDS_INDEX))) == 2 - - def get_position_ids_name(self): - if self.has_position_ids(): - return self.trt_engine.get_tensor_name(self.POSITION_IDS_INDEX) - else: - return None - - def get_output_name(self): - return "logits" - - def has_attention_mask(self): - if self.ATTENTION_MASK_INDEX < self.trt_engine.num_io_tensors: - return self.trt_engine.get_tensor_name(self.ATTENTION_MASK_INDEX) == "attention_mask" - return False - - def get_attention_mask_name(self): - if self.has_attention_mask(): - return self.trt_engine.get_tensor_name(self.ATTENTION_MASK_INDEX) - return None - - def run(self, output_name, io_descs, seq_len, context_mode=False): - torch.cuda.nvtx.range_push("TRT Setup") - if self.use_cache: - if context_mode: - self.past_decoder_length = 0 - else: - # When kv-cache is used, seq_len is always 1 in Generation phase. 
- seq_len = 1 - cur_shape = (self.past_decoder_length, self.batch_size, self.nb_heads, self.head_size) - new_shape = (seq_len, self.batch_size, self.nb_heads, self.head_size) - assert self.past_decoder_length + seq_len < self.max_seq_len - offset = self.batch_size*self.nb_heads*self.head_size*self.past_decoder_length - for i in range(self.num_layers): - for code in ["key", "value"]: - attention_kv_cache_name = self.make_kv_cache_name(i, code) - cur_address = self.attention_kv_cache_buffer[attention_kv_cache_name].data_ptr() - # new kv address start from the past kv-cache data end - io_descs[f"past_{attention_kv_cache_name}"] = (cur_address, cur_shape) - new_address = cur_address + offset*self.attention_kv_cache_buffer[attention_kv_cache_name].element_size() - modifier = "" - if self.use_fp8_storage: - modifier = "_qfp8" - new_kv_name = f"new_{attention_kv_cache_name}{modifier}" - io_descs[new_kv_name] = (new_address, new_shape) - self.past_decoder_length += seq_len - else: - self.past_decoder_length = 0 - # Set active optimization profile and active execution context. - self.trt_context.set_optimization_profile_async(self.profile_idx, self.stream) - active_context = self.trt_context - if context_mode and self.use_cache: - active_context = self.context_trt_context - - # Set up input bindings. - for name, tensor_shape in io_descs.items(): - active_context.set_tensor_address(name, tensor_shape[0]) - if name in self.input_tensor_names: - if name in self.input_shape_change_record and \ - self.input_shape_change_record[name][0] == active_context and \ - self.input_shape_change_record[name][1] == tensor_shape[1]: - continue - else: - active_context.set_input_shape(name, tensor_shape[1]) - elif self.use_cache: - pass - else: - assert False, "All tensors must be inputs for non-KV mode" - assert active_context.all_shape_inputs_specified - - # Set up output bindings. - assert output_name == self.get_output_name() - engine_out_torch_type = self.get_torch_type(output_name) - if self.logits.dtype != engine_out_torch_type: - raise ValueError(f"Output data type does not match, {self.logits.dtype} vs. {engine_out_torch_type}.") - shape = active_context.get_tensor_shape(output_name) - active_context.set_tensor_address(output_name, self.logits.data_ptr()) - - - # Execute inference. 
- torch.cuda.nvtx.range_pop() # "TRT Setup" - active_context.execute_async_v3(self.stream) - if not context_mode and self.use_cache: - self.input_shape_change_record.clear() - for i in range(self.num_layers): - for code in ["key", "value"]: - next_past_shape = (self.past_decoder_length, self.batch_size, self.nb_heads, self.head_size) - attention_kv_cache_name = self.make_kv_cache_name(i, code) - # set next iter input shape when cpu idle - active_context.set_input_shape(f"past_{attention_kv_cache_name}", next_past_shape) - self.input_shape_change_record[f"past_{attention_kv_cache_name}"] = [active_context, next_past_shape] - CUASSERT(cudart.cudaStreamSynchronize(self.stream)) - if len(shape) != 3: - raise ValueError("Output must have a dimension of 3.") - output = self.logits[:shape[0] * shape[1] * shape[2]].view(tuple(shape)) - return output - -def load_trt_model(cfg): - G_LOGGER.info(f'Loading TensorRT engine from {cfg.trt_engine_file} with use_cache={cfg.use_cache}, use_fp8_storage={cfg.onnx_export_options.use_fp8_storage} ') - trt_engine_file = DecoderTRTEngine(cfg.trt_engine_file) - return GPTTRTDecoder(trt_engine_file, cfg.use_cache, cfg.onnx_export_options.use_fp8_storage, cfg) diff --git a/demo/NeMo/README.md b/demo/NeMo/README.md deleted file mode 100644 index 44f183dd6..000000000 --- a/demo/NeMo/README.md +++ /dev/null @@ -1,156 +0,0 @@ -# TensorRT FP8 Inference for NeMo models -**Deprecation:** For all users using TensorRT to accelerate Large Language Model inference, please use [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/). TensorRT-LLM includes full model coverage and functionalities of HuggingFace demo and NeMo demo. It also contains more optimizations and functionalities (e.g. model quantization, in-flight batching, etc.), multi-GPU support, better model coverage and much better inference performance. HuggingFace Demo and NeMo demo will not be maintained, and they will be removed from OSS in TRT 10.0 release. - -This repository demonstrates TensorRT inference with NeMo Megatron models in FP8/FP16/BF16 precision. - -Currently, this repository supports [NeMo GPT](https://huggingface.co/nvidia/nemo-megatron-gpt-5B/tree/fp8) models only. - -# Environment Setup -It's recommended to run inside a container to avoid conflicts when installing dependencies. Please check out [`NGC TensorRT`](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tensorrt/tags) and find a container with TensorRT 9.0 or above. A GPU with compute capability 8.9 or above is required to run the demo with FP8 precision. - -``` -# Run inside a TensorRT container -sh install.sh [--deps ] [-j ] [--ninja] -``` - -All arguments are optional. `--deps` indicates the relative dependency download directory, `-j` indicates number of parallel jobs for building and `--ninja` installs the `ninja` build system which can speed up installation. See `sh install.sh --help` for more details on the arguments. - -> The script will install required dependencies and it can take around 30 minutes or more. - -**Please note that the [HuggingFace demo directory](demo/HuggingFace) needs to be visible when running this demo, so utility functions can be correctly imported.** - -# File Structure -This demo follows simliar structure and command-line interface as in [HuggingFace demo](/demo/HuggingFace). -``` -. 
-├── GPT3 # GPT3 directory -│ ├── GPT3ModelConfig.py # model configuration and variant-specific parameters -│ ├── frameworks.py # NeMo PyTorch inference script -│ ├── onnxrt.py # OnnxRT inference script -│ ├── trt.py # TensorRT inference script -│ ├── decoding.py # main inference logic for all runtimes -│ └── ... # files with utility functions for model export and inference -├── config.yaml # full configuration for model export and inference -├── interface.py # definitions of setup functions -├── nemo_export.py # export functions for NeMo model -> ONNX model -> TRT engine -└── run.py # main entry script -``` - -# Overview - -This demo contains two scripts `run.py` and `nemo_export.py`. Script `run.py` accepts a NeMo model or an ONNX model as input, and performs end-to-end inference with various actions specified by the user. Script `nemo_export.py` accepts a NeMo model or an ONNX model as input, and exports the input to an ONNX model or a TensorRT engine. - -# How to run inference -The `run` action will run end-to-end inference on sentences specified in [config.yaml](/demo/NeMo/config.yaml). A model, a variant, and precision are required to run this command. -``` -python3 run.py run GPT3 --variant gpt-5b --working-dir $(pwd)/temp --fp8 --bf16 --nemo-model= -``` - -Expected output for the second sentence: -``` -Batch 1: {'sentences': ['TensorRT is a Deep Learning compiler used for deep learning. It is a compiler for TensorFlow, CNTK, and Torch. It is a compiler for the TensorFlow, CNTK,'], - 'tokens': [['<|endoftext|>', 'T', 'ensor', 'RT', ' is', ' a', ' Deep', ' Learning', ' compiler', ' used', ' for', ' deep', ' learning', '.', ' It', ' is', ' a', ' compiler', ' for', ' T', 'ensor', 'Flow', ',', ' C', 'NT', 'K', ',', ' and', ' Torch', '.', ' It', ' is', ' a', ' compiler', ' for', ' the', ' T', 'ensor', 'Flow', ',', ' C', 'NT', 'K', ',']], - 'logprob': tensor([[-4.6415e+00, -6.9270e+00, -7.4458e+00, -1.9856e+00, -5.9787e-01, - -8.1058e+00, -7.9629e-02, -5.8013e+00, -5.5222e+00, -1.4401e+00, - -5.5644e+00, -3.3747e-01, -3.3463e+00, -1.1306e+00, -1.3685e+00, - -1.7793e+00, -2.8960e+00, -1.4127e+00, -2.3209e+00, -7.3454e-04, - -9.8682e-02, -1.3268e+00, -2.1373e+00, -3.9281e-01, -6.5222e-04, - -2.9425e-01, -1.4167e+00, -1.8416e+00, -9.2462e-01, -1.4805e+00, - -1.4299e+00, -2.0632e+00, -2.9947e+00, -9.1487e-01, -2.6651e+00, - -2.2772e+00, -4.7057e-03, -2.2852e-01, -2.4777e+00, -2.4731e-01, - -7.0602e-03, -4.7339e-04, -1.1645e-01]], device='cuda:0'), - 'full_logprob': None, - 'token_ids': [[50256, 51, 22854, 14181, 318, 257, 10766, 18252, 17050, 973, 329, 2769, 4673, 13, 632, 318, 257, 17050, 329, 309, 22854, 37535, 11, 327, 11251, 42, 11, 290, 34868, 13, 632, 318, 257, 17050, 329, 262, 309, 22854, 37535, 11, 327, 11251, 42, 11]], - 'offsets': [[0, 0, 1, 6, 8, 11, 13, 18, 27, 36, 41, 45, 50, 59, 60, 63, 66, 68, 77, 81, 83, 88, 92, 93, 95, 97, 98, 99, 103, 109, 110, 113, 116, 118, 127, 131, 135, 137, 142, 146, 147, 149, 151, 152]]} -``` - -# How to run with various configurations -- FP8, FP16, and BF16 precisions are supported, and they can be set through `--fp8`, `--fp16`, and `--bf16` respectively. Currently, the script has constraints on how precisions are specified, and supported combinations are: - 1. Pure FP16: `--fp16` (default) - 2. Pure BF16: `--bf16` - 3. FP8-FP16: `--fp8 --fp16` - 4. FP8-BF16: `--fp8 --bf16` - -- `--nemo-model=` or `--nemo-checkpoint=` can be used to load a NeMo model or checkpoint from a specified path, respectively. 
If these arguments are not provided, a NeMo model will be downloaded (and cached/re-used for subsequent runs) in the working directory. - -- K-V cache can be enabled through `--use-cache` - -- Batch size can be changed through `--batch-size=<batch_size>` - -- The default max sequence length is `256`; it can be changed through `--max-seq-len=<max_seq_len>` - -# How to run performance benchmark -The `benchmark` action will run inference multiple times with the specified input and output sequence lengths. -``` -python3 run.py benchmark GPT3 --variant gpt-5b --working-dir $(pwd)/temp --fp8 --bf16 --nemo-model=<nemo_model_path> --batch-size=16 --input-seq-len=128 --output-seq-len=20 --use-cache --warmup=10 --iterations=100 -``` - -Expected output for `trt`: -``` -*************************** -Running 100 iterations with batch size: 16, input sequence length: 128 and output sequence length: 20 -[E2E inference] Total Time: 11.55453 s, Average Time: 0.11555 s, 95th Percentile Time: 0.11581 s, 99th Percentile Time: 0.11587 s, Throughput: 2769.48 tokens/s -[Without tokenizer] Total Time: 10.44539 s, Average Time: 0.10445 s, 95th Percentile Time: 0.10459 s, 99th Percentile Time: 0.10465 s, Throughput: 3063.55 tokens/s -*************************** -``` - -Expected output for `frameworks`: -``` -*************************** -Running 100 iterations with batch size: 16, input sequence length: 128 and output sequence length: 20 -[E2E inference] Total Time: 55.23503 s, Average Time: 0.55235 s, 95th Percentile Time: 0.55525 s, 99th Percentile Time: 0.56992 s, Throughput: 579.34 tokens/s -[Without tokenizer] Total Time: 54.06591 s, Average Time: 0.54066 s, 95th Percentile Time: 0.54369 s, 99th Percentile Time: 0.55839 s, Throughput: 591.87 tokens/s -*************************** -``` - -# How to run accuracy check -The `accuracy` action runs an accuracy check on a dataset. The default is the [LAMBADA](https://paperswithcode.com/dataset/lambada) dataset. -``` -python3 run.py accuracy GPT3 --variant gpt-5b --working-dir $(pwd)/temp --fp8 --bf16 --nemo-model=<nemo_model_path> --use-cache -``` - -Expected output for `trt`: -``` -*************************** -Lambada ppl(last token): 4.4756, ppl(sequence): 18.3254, acc(top1): 0.6722, acc(top3): 0.8597, acc(top5): 0.9076 -*************************** -``` - -Expected output for `frameworks`: -``` -*************************** -Lambada ppl(last token): 4.4669, ppl(sequence): 18.3161, acc(top1): 0.6765, acc(top3): 0.8612, acc(top5): 0.9082 -*************************** -``` - -# How to export a NeMo model to ONNX -NeMo to ONNX conversion consists of 3 steps: -1. Export ONNX from NeMo. -2. NeMo uses TransformerEngine to export FP8 models to ONNX (step 1), and the exported ONNX contains custom TensorRT Q/DQ nodes. The script `convert_te_onnx_to_trt_onnx.py` can be used to convert the custom operators into standard opset19 ONNX Q/DQ nodes. -3. Add KV-cache inputs and outputs to the exported ONNX, so inference on the model is faster. - -`nemo_export.py` has `--opset19` and `--use-cache` options to decide whether to perform step 2 and step 3, respectively: -``` -python3 nemo_export.py --nemo-model=model.nemo --onnx=onnx/model.onnx --opset19 --use-cache -``` -`--extra-configs` can be used to specify configs that are defined in `config.yaml` but not exposed through the existing command-line interface. -Please specify `--help` to see more options. - - -# How to run sparsity for benchmark - -*Note: this is for performance analysis. The pruned model should not be used for accuracy purposes unless it was fine-tuned for sparsity.
The pruning may take minutes or hours depending on the model size.* - - -1. Enable sparsity knobs in `config.yaml`: - * Set `onnx_export_options.prune` to `True` to enable pruning of the ONNX model. - * Set `trt_export_options.sparse` to `True` to enable sparse tactics profiling in TensorRT. -2. Run the scripts. You should be able to see logs like below. - -``` -[2023-07-28 00:15:03,015][OSS][INFO] Prune ONNX model with: polygraphy surgeon prune ${OSS_ROOT}/demo/NeMo/temp/gpt-5b/GPT3-gpt-5b-fp8-fp16-ms256/onnx/model-16.opset19.onnx -o ${OSS_ROOT}/demo/NeMo/temp/gpt-5b/GPT3-gpt-5b-fp8-fp16-ms256/onnx/pruned.model-16.opset19.onnx --save-external-data ${OSS_ROOT}/demo/NeMo/temp/gpt-5b/GPT3-gpt-5b-fp8-fp16-ms256/onnx/pruned.model-16.opset19.onnx_data -[2023-07-28 00:15:03,016][OSS][INFO] This may take a while... -... - -[2023-07-28 03:36:52,307][OSS][DEBUG] trtexec --onnx=${OSS_ROOT}/demo/NeMo/temp/gpt-5b/GPT3-gpt-5b-fp8-fp16-ms256/onnx/pruned.model-16.opset19.onnx --minShapes=input_ids:1x1,position_ids:1x1 --optShapes=input_ids:1x128,position_ids:1x128 --maxShapes=input_ids:1x256,position_ids:1x256 --fp8 --fp16 --sparsity=enable --timingCacheFile=functional.cache -``` diff --git a/demo/NeMo/apex.patch b/demo/NeMo/apex.patch deleted file mode 100644 index daa1b6153..000000000 --- a/demo/NeMo/apex.patch +++ /dev/null @@ -1,29 +0,0 @@ -diff --git a/setup.py b/setup.py -index cb1a790..949f877 100644 ---- a/setup.py -+++ b/setup.py -@@ -29,15 +29,15 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir): - print("\nCompiling cuda extensions with") - print(raw_output + "from " + cuda_dir + "/bin\n") - -- if (bare_metal_version != torch_binary_version): -- raise RuntimeError( -- "Cuda extensions are being compiled with a version of Cuda that does " -- "not match the version used to compile Pytorch binaries. " -- "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) -- + "In some cases, a minor-version mismatch will not cause later errors: " -- "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. " -- "You can try commenting out this check (at your own risk)." -- ) -+ # if (bare_metal_version != torch_binary_version): -+ # raise RuntimeError( -+ # "Cuda extensions are being compiled with a version of Cuda that does " -+ # "not match the version used to compile Pytorch binaries. " -+ # "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) -+ # + "In some cases, a minor-version mismatch will not cause later errors: " -+ # "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. " -+ # "You can try commenting out this check (at your own risk)." -+ # ) - - - def raise_if_cuda_home_none(global_option: str) -> None: diff --git a/demo/NeMo/config.yaml b/demo/NeMo/config.yaml deleted file mode 100644 index 2b1888bb8..000000000 --- a/demo/NeMo/config.yaml +++ /dev/null @@ -1,87 +0,0 @@ -runtime: null -gpt_model_file: null # GPT nemo file path -onnx_model_file: null # ONNX file path -trt_engine_file: null # TRT engine file path - -# Parameters for loading from a checkpoint -checkpoint_dir: null # Path to a folder that contains a .ckpt file -checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. -hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. - -batch_size: 1 -use_cache: True -use_one_input: False # export ONNX model with only one input -prompts: # prompts for GPT inference - - "How are you?" - - "TensorRT is a Deep Learning compiler used for deep learning." 
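Because the demo loads this file with OmegaConf, the knobs shown in `config.yaml` (including the sparsity switches mentioned in the steps above) can also be flipped programmatically before a run. The snippet below is only an illustrative sketch of that idea; the override values are arbitrary examples, not recommendations.

```python
# Illustrative only: load the demo-style config.yaml with OmegaConf and override a few
# of the knobs discussed above (the values here are example assumptions).
from omegaconf import OmegaConf

cfg = OmegaConf.load("config.yaml")
cfg.batch_size = 4
cfg.use_cache = True
cfg.onnx_export_options.prune = True   # prune the ONNX model to the 2:4 sparsity pattern
cfg.trt_export_options.sparse = True   # enable sparse tactics profiling in TensorRT
print(OmegaConf.to_yaml(cfg))
```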
- -mode: 'inference' # Could change to accuracy or benchmark - -inference: - greedy: True # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - add_BOS: True # add the bos token at the begining of the prompt - tokens_to_generate: 30 # The maximum length of the sequence to be generated. - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. - compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - seed: 1234 - -accuracy: - dataset: Lambada - metric: Perplexity - top_n: 1,3,5 - tokens_to_generate: 5 - -benchmark: - input_seq_len: 20 - output_seq_len: 20 - -# for nemo to onnx export -onnx_export_options: - runtime_check: False - verbose: False - onnx_opset: 17 - do_constant_folding: True - cache_support: False - prune: False # Prune the ONNX model for Sparse Tensor Cores 2:4 pattern - device: 'cuda' - check_tolerance: 0.01 - use_fp8_storage: False - quantize_bmms: False - -# for onnx to trt export -trt_export_options: - opt_seq_len: 128 # define the optimized sequence length - use_tf32: True - use_fp16: False - use_fp8: False - use_bf16: False - use_strongly_typed: True # enable strongly typed mode will invalidate `use_[fp8|fp16|bf16]` flags. - sparse: False # enable sparse in TRT engine builder - timing_cache: 'functional.cache' - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - logger: False # logger provided by exp_manager - precision: 32 # 16, 32, or bf16 - -tensor_model_parallel_size: 1 -pipeline_model_parallel_size: 1 -pipeline_model_parallel_split_rank: 0 # used for encoder and decoder model (0 for others) - -# model architecture -model: - max_seq_len: 256 # define the max sequence length for attention mask - encoder_seq_length: 2048 - max_position_embeddings: ${.encoder_seq_length} - num_layers: 24 - hidden_size: 4096 - nb_heads: 32 - head_size: 128 - vocab_size: 50304 diff --git a/demo/NeMo/install.sh b/demo/NeMo/install.sh deleted file mode 100644 index 277f250a4..000000000 --- a/demo/NeMo/install.sh +++ /dev/null @@ -1,485 +0,0 @@ -#!/bin/sh -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Sourcing messes up the directory detection with readlink. -if [ ! "${0##*/}" = "install.sh" ]; then - echo "Please run this install script, don't source it." >&2 - echo "Use -h for usage and help." 
>&2 - return 1 -fi - -NEMO_DIR=$(dirname "$(readlink -f "$0")") -DEMO_DIR=$(dirname "${NEMO_DIR}") -SCRIPT_DIR=$(dirname "${DEMO_DIR}")/scripts - -DEPENDENCIES_DIR="temp" -BUILD_SRCLIBS=1 -BUILD_NINJA=0 -ARG_JOBS=1 -ARG_HELP=0 - -install_essential_tools() { - pip_not_found=$(pip --version 2>&1 | grep -o "not found") - if [ "$pip_not_found" != "" ]; then - echo " > Installing pip..." - apt-get update - apt-get install -y python3-dev - cd "${1}" || exit - if [ ! -f "get-pip.py" ]; then - apt-get install -y wget - wget https://bootstrap.pypa.io/get-pip.py - fi - python3 get-pip.py - cd .. - fi - - git_not_found=$(git --version 2>&1 | grep -o "not found") - if [ "$git_not_found" != "" ]; then - echo " > Installing git..." - apt-get update - apt-get install -y git - fi -} - -install_ninja() { - if [ ! -d "ninja" ]; then - git clone https://github.com/ninja-build/ninja.git - fi - cd ninja || exit - git checkout v1.11.1 - - if [ ! -x "./ninja" ]; then - CMD="python3 configure.py --bootstrap" - echo " >> ${CMD}" - eval "${CMD}" - unset CMD - else - echo " > ninja already built!" - fi - - PATH_WITH_NINJA="$(pwd):${PATH}" - # Path exported for the current program scope only. - export PATH="${PATH_WITH_NINJA}" - unset PATH_WITH_NINJA - cd .. -} - -PACKAGE_NEEDS_REINSTALL=0 - -check_if_managed_install() { - PACKAGE_NEEDS_REINSTALL=0 - dist_path="${1}" - # https://packaging.python.org/en/latest/specifications/direct-url/ - if [ ! -f "${dist_path}/direct_url.json" ]; then - PACKAGE_NEEDS_REINSTALL=1 - return - fi - if [ "$(grep -c "${NEMO_DIR}" "${dist_path}/direct_url.json")" != "1" ]; then - PACKAGE_NEEDS_REINSTALL=1 - fi -} - -apex_install_logic() { - if [ ! -d "apex" ]; then - git clone https://github.com/NVIDIA/apex.git - fi - - cd apex || exit - APEX_PATH="$(pwd)" - git config --global --add safe.directory "${APEX_PATH}" - unset APEX_PATH - - git checkout 5b5d41034b506591a316c308c3d2cd14d5187e23 - git apply "${NEMO_DIR}"/apex.patch # Bypass CUDA version check in apex - - torchcppext=$(pip show torch | grep Location | cut -d' ' -f2)"/torch/utils/cpp_extension.py" - if [ ! -f "$torchcppext" ]; then - echo "Could not locate torch installation using pip" - exit 1 - fi - sed -i 's/raise RuntimeError(CUDA_MISMATCH_MESSAGE.format(cuda_str_version, torch.version.cuda))/pass/' "$torchcppext" # Bypass CUDA version check in torch - unset torchcppext - - CMD="MAX_JOBS=${ARG_JOBS} python3 setup.py bdist_wheel -v --cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" - echo " >> ${CMD}" - eval "${CMD}" - unset CMD - - python3 -m pip install "$(find './dist' -name '*.whl' | head -n1)" - cd ../ -} - -check_if_apex_needs_reinstall() { - apex_loc="$(pip show apex | grep '^Location' | awk '{print $2}')" - apex_dist_loc="$(find "${apex_loc}" -depth -maxdepth 1 -name 'apex*dist-info' -type d | head -n1)" - - check_if_managed_install "${apex_dist_loc}" - apex_needs_reinstall=${PACKAGE_NEEDS_REINSTALL} - echo "${apex_needs_reinstall}" - - unset apex_dist_loc - unset apex_loc -} - -install_apex() { - has_apex=$(pip list | grep "^apex " | grep "apex" -o | awk '{print $1}' | awk '{print length}') - apex_needs_reinstall=0 - - if [ "$has_apex" != "4" ]; then - apex_install_logic - else - check_if_apex_needs_reinstall - if [ "$apex_needs_reinstall" != "0" ]; then - echo " > Reinstalling Apex per demo version..." - python3 -m pip uninstall -y apex - apex_install_logic - else - echo " > Apex already installed!" 
- fi - fi - unset apex_needs_reinstall - unset has_apex -} - -megatron_install_logic() { - if [ ! -d "Megatron-LM" ]; then - git clone -b main https://github.com/NVIDIA/Megatron-LM.git - fi - - cd Megatron-LM || exit - MEGATRON_PATH="$(pwd)" - git config --global --add safe.directory "${MEGATRON_PATH}" - unset MEGATRON_PATH - - git checkout 992da75a1fd90989eb1a97be8d9ff3eca993aa83 - CMD="python3 -m pip install ./" - echo " >> ${CMD}" - eval "${CMD}" - unset CMD - cd ../ -} - -check_if_megatron_needs_reinstall() { - megatron_loc="$(pip show megatron-core | grep '^Location' | awk '{print $2}')" - megatron_dist_loc="$(find "${megatron_loc}" -depth -maxdepth 1 -name 'megatron*dist-info' -type d | head -n1)" - - check_if_managed_install "${megatron_dist_loc}" - megatron_needs_reinstall=${PACKAGE_NEEDS_REINSTALL} - - unset megatron_dist_loc - unset megatron_loc -} - -install_megatron() { - has_megatron=$(pip list | grep "^megatron-core " | grep "megatron-core" -o | awk '{print $1}' | awk '{print length}') - megatron_needs_reinstall=0 - - if [ "$has_megatron" != "13" ]; then - megatron_install_logic - else - check_if_megatron_needs_reinstall - if [ "$megatron_needs_reinstall" != "0" ]; then - echo " > Reinstalling Megatron per demo version..." - python3 -m pip uninstall -y megatron-core - megatron_install_logic - else - echo " > Megatron already installed!" - fi - fi - unset megatron_needs_reinstall - unset has_megatron -} - -flash_attention_install_logic() { - if [ ! -d "flash-attention" ]; then - git clone https://github.com/HazyResearch/flash-attention.git - fi - - cd flash-attention || exit - FLASH_ATTENTION_PATH="$(pwd)" - git config --global --add safe.directory "${FLASH_ATTENTION_PATH}" - unset FLASH_ATTENTION_PATH - - git checkout v1.0.6 - CMD="MAX_JOBS=${ARG_JOBS} python3 setup.py bdist_wheel" - echo " >> ${CMD}" - eval "${CMD}" - unset CMD - python3 -m pip install "$(find './dist' -name '*.whl' | head -n1)" - cd .. -} - -check_if_flash_attention_needs_reinstall() { - flash_attn_loc="$(pip show flash-attn | grep '^Location' | awk '{print $2}')" - flash_attn_dist_loc="$(find "${flash_attn_loc}" -depth -maxdepth 1 -name 'flash_attn*dist-info' -type d | head -n1)" - - check_if_managed_install "${flash_attn_dist_loc}" - flash_attn_needs_reinstall=${PACKAGE_NEEDS_REINSTALL} - - unset flash_attn_dist_loc - unset flash_attn_loc -} - -install_flash_attention() { - has_flashattn=$(pip list | grep "^flash-attn " | grep "flash-attn" -o | awk '{print $1}' | awk '{print length}') - flash_attn_needs_reinstall=0 - - if [ "$has_flashattn" != "10" ]; then - flash_attention_install_logic - else - check_if_flash_attention_needs_reinstall - if [ "$flash_attn_needs_reinstall" != "0" ]; then - echo " > Reinstalling flash_attn per demo version..." - python3 -m pip uninstall -y flash-attn - flash_attention_install_logic - else - echo " > flash-attention already installed!" - fi - fi - - unset flash_attn_needs_reinstall - unset has_flashattn -} - -transformer_engine_install_logic() { - if [ ! 
-d "TransformerEngine" ]; then - git clone https://github.com/NVIDIA/TransformerEngine.git - fi - - cd TransformerEngine || exit - TRANSFORMER_ENGINE_PATH="$(pwd)" - git config --global --add safe.directory "${TRANSFORMER_ENGINE_PATH}" - unset TRANSFORMER_ENGINE_PATH - - git checkout 804f120322a13cd5f21ea8268860607dcecd055c - git submodule update --recursive --init - CMD="MAKEFLAGS=-j${ARG_JOBS} MAX_JOBS=${ARG_JOBS} python3 setup.py bdist_wheel --framework=pytorch" - echo " >> ${CMD}" - eval "${CMD}" - unset CMD - python3 -m pip install "$(find './dist' -name '*.whl' | head -n1)" - cd .. - - # Check for common point of failure with TE. - has_te_loc=$(pip list | grep "^transformer-engine " | grep "transformer-engine" -o | awk '{print $1}' | awk '{print length}') - [ "$has_te_loc" != "18" ] && { - echo " > TransformerEngine install failed. Probable cause of failures:" - echo " - CUDNN location was not picked up. If your CUDNN include dir" - echo " is /path/to/cudnn/include and lib is /path/to/cudnn/lib, " - echo " Invoke the script as CUDNN_PATH=/path/to/cudnn sh install.sh ..." - exit 1 - } - unset has_te_loc -} - -check_if_transformer_engine_needs_reinstall() { - te_loc="$(pip show transformer-engine | grep '^Location' | awk '{print $2}')" - te_dist_loc="$(find "${te_loc}" -depth -maxdepth 1 -name 'transformer_engine*dist-info' -type d | head -n1)" - - check_if_managed_install "${te_dist_loc}" - te_needs_reinstall=${PACKAGE_NEEDS_REINSTALL} - - unset te_dist_loc - unset te_loc -} - -install_transformer_engine() { - has_te=$(pip list | grep "^transformer-engine " | grep "transformer-engine" -o | awk '{print $1}' | awk '{print length}') - te_needs_reinstall=0 - - if [ "$has_te" != "18" ]; then - transformer_engine_install_logic - else - check_if_transformer_engine_needs_reinstall - if [ "$te_needs_reinstall" != "0" ]; then - echo " > Reinstalling TransformerEngine per demo version..." - python3 -m pip uninstall -y transformer-engine - transformer_engine_install_logic - else - echo " > TransformerEngine already installed!" - fi - fi - - unset te_needs_reinstall - unset has_te - - # Patch TE files. - sh "${NEMO_DIR}/patch_te.sh" -} - -nemo_install_logic() { - if [ ! -d "NeMo" ]; then - git clone --branch main --single-branch https://github.com/NVIDIA/NeMo.git NeMo - fi - - cd NeMo || exit - NeMo_PATH="$(pwd)" - git config --global --add safe.directory "${NeMo_PATH}" - unset NeMo_PATH - - git checkout bf270794267e0240d8a8b2f2514c80c6929c76f1 - bash reinstall.sh - cd ../ -} - -check_if_nemo_needs_reinstall() { - nemo_loc="$(pip show nemo-toolkit | grep '^Location' | awk '{print $2}')" - nemo_dist_loc="$(find "${nemo_loc}" -depth -maxdepth 1 -name 'nemo_toolkit*dist-info' -type d | head -n1)" - - check_if_managed_install "${nemo_dist_loc}" - nemo_needs_reinstall=${PACKAGE_NEEDS_REINSTALL} - - unset nemo_dist_loc - unset nemo_loc -} - -install_nemo() { - has_nemo=$(pip list | grep "^nemo-toolkit " | grep "nemo-toolkit" -o | awk '{print $1}' | awk '{print length}') - nemo_needs_reinstall=0 - - if [ "$has_nemo" != "12" ]; then - nemo_install_logic - else - check_if_nemo_needs_reinstall - if [ "$nemo_needs_reinstall" != "0" ]; then - echo " > Reinstalling NeMo per demo version..." - python3 -m pip uninstall -y nemo-toolkit - nemo_install_logic - else - echo " > NeMo already installed!" 
- fi - fi -} - -while [ "$#" -gt 0 ]; do - case $1 in - --deps) - DEPENDENCIES_DIR="$2" - shift - ;; - -j | --jobs) - ARG_JOBS="$2" - shift - ;; - --ninja) BUILD_NINJA=1 ;; - --skipsrc) BUILD_SRCLIBS=0 ;; - -h | --help) ARG_HELP=1 ;; - *) - echo "Unknown parameter passed: $1" - echo "For help type: $0 --help" - exit 1 - ;; - esac - shift -done - -if [ "$ARG_HELP" -eq "1" ]; then - echo "Usage: sh $0 [options]" - echo "All arguments are optional." - echo " --help or -h : Print this help menu." - echo " [--deps] {temp} : Path to download and build dependencies." - echo " [-j | --jobs] {1} : Number of jobs to use for building from source." - echo " [--ninja] : Flag to build ninja (if not present) to speed up installation." - # skipsrc is not documented to prevent users from invoking it directly. - exit -fi - -DEPENDENCIES_DIR="${NEMO_DIR}/${DEPENDENCIES_DIR}" -echo " > Using ${DEPENDENCIES_DIR}' to store dependencies." -mkdir -p "${DEPENDENCIES_DIR}" -install_essential_tools "${DEPENDENCIES_DIR}" - -echo " > Installing Requirements.txt..." -pip install --upgrade pip -pip install nvidia-pyindex || { - echo "Could not install nvidia-pyindex, stopping install" - exit 1 -} -# # One of the hidden dependencies require Cython, but doesn't specify it. -# # https://github.com/VKCOM/YouTokenToMe/pull/108 -# # WAR by installing Cython before requirements. -pip install "Cython==0.29.36" || { - echo "Could not install Cython, stopping install" - exit 1 -} -# PyYaml, Cython and pip don't play well together. -# https://github.com/yaml/pyyaml/issues/601 -pip install "pyyaml==5.4.1" --no-build-isolation || { - echo "Could not install PyYaml, stopping install" - exit 1 -} -# Install a specific version of opencc to WAR a GLIBC not found error. -pip install "opencc==1.1.6" || { - echo "Could not install OpenCC, stopping install" - exit 1 -} -pip install -r requirements.txt || { - echo "Could not install dependencies, stopping install" - exit 1 -} - -# Installation from source -if [ "$BUILD_SRCLIBS" -eq "1" ]; then - (command -v -- "ninja" >/dev/null 2>&1) || [ "$BUILD_NINJA" -eq "0" ] && echo " > Could not locate ninja, consider passing the --ninja flag to speedup dependency installation." -fi - -cd "${DEPENDENCIES_DIR}" || exit -if (! command -v -- "ninja" >/dev/null 2>&1) && [ "$BUILD_NINJA" -eq "1" ]; then - echo " > Building ninja..." - install_ninja -fi - -if [ "$BUILD_SRCLIBS" -eq "1" ]; then - echo " > Installing Apex..." - install_apex -fi - -echo " > Installing Megatron-LM..." -install_megatron - -if [ "$BUILD_SRCLIBS" -eq "1" ]; then - echo " > Installing flash-attention..." - install_flash_attention -fi - -if [ "$BUILD_SRCLIBS" -eq "1" ]; then - echo " > Installing TransformerEngine..." - install_transformer_engine -fi - -echo " > Installing NeMo..." -install_nemo - -if [ ! -f "${NEMO_DIR}/GPT3/convert_te_onnx_to_trt_onnx.py" ]; then - echo " > Copying opset19 conversion script..." - if [ ! 
-f "${SCRIPT_DIR}/convert_te_onnx_to_trt_onnx.py" ]; then - echo "Opset19 conversion script is not located at /scripts/convert_te_onnx_to_trt_onnx.py" - return 1 - fi - cp "${SCRIPT_DIR}/convert_te_onnx_to_trt_onnx.py" "${NEMO_DIR}/GPT3/convert_te_onnx_to_trt_onnx.py" -fi - -cd ../ - -unset ARG_HELP -unset ARG_JOBS -unset BUILD_NINJA -unset DEPENDENCIES_DIR -unset SCRIPT_DIR -unset DEMO_DIR -unset NEMO_DIR diff --git a/demo/NeMo/interface.py b/demo/NeMo/interface.py deleted file mode 100644 index ec3dcbf7e..000000000 --- a/demo/NeMo/interface.py +++ /dev/null @@ -1,727 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from datetime import datetime -import os -import random -import sys -import time -from typing import List, Union, Dict -from copy import copy - -from cuda import cuda -from tqdm import tqdm -import numpy as np -import torch - -from transformers import PretrainedConfig -from omegaconf import OmegaConf, listconfig - -# Add syspath for custom library -if __name__ == "__main__": - filepath = os.path.dirname(os.path.abspath(__file__)) - project_root = os.path.join(filepath, os.pardir) - sys.path.append(project_root) - -from GPT3.decoding import full_inference, generate, process_output -from GPT3.GPT3ModelConfig import GPT3ModelTRTConfig -from GPT3.lambada_dataset import Lambada -from GPT3.nemo_utils import get_computeprob_response -from GPT3.sequence_perplexity import SequencePerplexity - -sys.path.append('../HuggingFace') # Include HuggingFace -from NNDF.general_utils import NNFolderWorkspace -from NNDF.logger import G_LOGGER -from NNDF.networks import ( - Precision, - NetworkMetadata, - TimingProfile, - BenchmarkingResult, - NetworkResult, - NetworkCheckpointResult, -) -from NNDF.interface import NetworkCommand - -# Manually set by referring to examples/nlp/language_modeling/conf/megatron_gpt_config.yaml -# If a field cannot be found, set to None. 
-DEFAULT_CONFIG = { - "is_encoder_decoder": False, - "is_decoder": True, - "architectures": [ "GPT3NeMoModel" ], -} - -GPT3CONFIG_MAPPINGS = { - "gpt-126m": PretrainedConfig.from_dict(dict({"_name_or_path": "gpt-126m", - "num_heads": 12, - "num_layers": 12, - "hidden_size": 768, - "max_position_embeddings": 2048, - "min_seq_len": 0, - }, **DEFAULT_CONFIG)), - "gpt-1.3b": PretrainedConfig.from_dict(dict({"_name_or_path": "gpt-1.3b", - "num_heads": 16, - "num_layers": 24, - "hidden_size": 2048, - "max_position_embeddings": 2048, - "min_seq_len": 0, - }, **DEFAULT_CONFIG)), - "gpt-5b": PretrainedConfig.from_dict(dict({"_name_or_path": "gpt-5b", - "num_heads": 32, - "num_layers": 24, - "hidden_size": 4096, - "max_position_embeddings": 2048, - "min_seq_len": 16, - }, **DEFAULT_CONFIG)), -} - -def _hf_hub_metadata(variant: str, fp8: bool) -> Dict[str, str]: - repo_mappings = { - "gpt-1.3b": "nvidia/nemo-megatron-gpt-1.3B", - "gpt-5b": "nvidia/nemo-megatron-gpt-5B", - } - - try: - repo_id = repo_mappings[variant] - except KeyError: - raise RuntimeError( - "Variant should be one of {}, got {}".format( - list(repo_mappings.keys()), variant - ) - ) - - file_key = (variant, "fp8" if fp8 else "fp16") - file_mappings = { - ("gpt-1.3b", "fp8"): ("nemo_gpt1.3B_fp16.nemo", None), - ("gpt-1.3b", "fp16"): ("nemo_gpt1.3B_fp16.nemo", None), - ("gpt-5b", "fp8"): ("nemo_gpt5B_fp8_bf16_tp1.nemo", "fp8"), - ("gpt-5b", "fp16"): ("nemo_gpt5B_fp16_tp1.nemo", None), - } - - try: - filename, branch = file_mappings[file_key] - except KeyError: - raise RuntimeError( - "Downloading nemo file for variant : {}, precision : {} from huggingface hub is unsupported. Consider passing a nemo-model or onnx-model from the command line.".format( - file_key[0], file_key[1] - ) - ) - - return {"repo_id": repo_id, "filename": filename, "revision": branch} - - -def download_model(dst_dir: str, cache_dir: str, *args, **kwargs) -> str: - from huggingface_hub import hf_hub_download - - os.makedirs(dst_dir, exist_ok=True) - os.makedirs(cache_dir, exist_ok=True) - - model_metadata = _hf_hub_metadata(*args, **kwargs) - return hf_hub_download( - local_dir=str(dst_dir), - local_dir_use_symlinks="auto", - cache_dir=cache_dir, - **model_metadata, - ) - - -def load_dataset(dataset_name, base_dir, tokens_to_generate, padding): - ds_map = {"Lambada": Lambada(base_dir, tokens_to_generate, padding)} - return ds_map[dataset_name] - -def get_accuracy_metric(cfg): - topN = [int(i.strip()) for i in cfg.top_n.split(",")] - m_map = {"Perplexity": SequencePerplexity(topN)} - return m_map[cfg.metric] - -def remove_padded_prompts(output, nb_paddings): - if nb_paddings == 0: - return output - result = {} - for k, v in output.items(): - if v != None and (type(v) is list or type(v) is torch.Tensor): - v = v[:-nb_paddings] - result[k] = v - return result - -def get_random_input(tokenizer, batch_size, in_seq_len, out_seq_len): - vocab_size = tokenizer.tokenizer.vocab_size - return (torch.randint(0, vocab_size, (batch_size, in_seq_len + out_seq_len), dtype=torch.int64).cuda(), - (torch.ones(batch_size, dtype=torch.int64) * in_seq_len).cuda()) - -class BaseModel(torch.nn.Module): - def __init__(self): - super(BaseModel, self).__init__() - self.model = None - def forward(self, x): - raise Exception("BaseModel forward method is not intended to be called.") - -class NeMoCommand(NetworkCommand): - def __init__( - self, - nemo_cfg, - config_class, - description, - **kwargs - ): - self.nemo_cfg = nemo_cfg - super().__init__(config_class, description, **kwargs) - - def 
validate_and_set_precision(self, fp8, fp16, bf16, use_fp8_storage, quantize_bmms): - if fp8: - if fp16: - G_LOGGER.info("Use FP8-FP16 precision.") - if bf16: - G_LOGGER.info("Use FP8-BF16 precision.") - elif fp16: - G_LOGGER.info("Use pure FP16 precision.") - elif bf16: - G_LOGGER.info("Use pure BF16 precision.") - else: - fp16 = True - G_LOGGER.warn("Precision is not specified. Use pure FP16 precision by default.") - - self.fp8, self.fp16, self.bf16 = fp8, fp16, bf16 - self.nemo_cfg.trt_export_options.use_fp8 = fp8 - self.nemo_cfg.trt_export_options.use_fp16 = fp16 - self.nemo_cfg.trt_export_options.use_bf16 = bf16 - self.nemo_cfg.onnx_export_options.use_fp8_storage = use_fp8_storage - self.nemo_cfg.onnx_export_options.quantize_bmms = quantize_bmms - - if fp16: - self.nemo_cfg.trainer.precision = "16" - elif bf16: - self.nemo_cfg.trainer.precision = "bf16" - else: - self.nemo_cfg.trainer.precision = "32" - - def update_hyperparams(self, model_config): - self.nemo_cfg.model.num_layers = model_config.num_layers - self.nemo_cfg.model.nb_heads = model_config.num_heads - self.nemo_cfg.model.head_size = model_config.hidden_size // model_config.num_heads - self.nemo_cfg.model.hidden_size = model_config.hidden_size - self.nemo_cfg.model.encoder_seq_length = model_config.max_position_embeddings - self.nemo_cfg.model.max_position_embeddings = model_config.max_position_embeddings - - def setup_environment( - self, - variant: str, - working_dir: str = "temp", - batch_size: int = 1, - num_beams: int = 1, - use_cache: bool = True, - verbose: bool = False, - info: bool = False, - iterations: int = None, - warmup: int = None, - number: int = None, - duration: int = None, - percentile: int = None, - cleanup: bool = False, - action: str = None, - max_seq_len: int = None, - fp8: bool = True, - fp16: bool = False, - bf16: bool = False, - use_fp8_storage: bool = False, - quantize_bmms: bool = False, - input_seq_len: int = None, - output_seq_len: int = None, - nemo_model: str = None, - nemo_checkpoint: str = None, - nemo_hparams: str = None, - onnx_model: str = None, - **kwargs, - ) -> None: - """ - Use Arguments from command line or user specified to setup config for the model. 
- """ - self.validate_and_set_precision(fp8, fp16, bf16, use_fp8_storage, quantize_bmms) - - if not torch.cuda.is_available(): - raise EnvironmentError("GPU is required for NeMo demo.") - - # Initialize CUDA Driver API - err, = cuda.cuInit(0) - if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError("Cuda initialization failed with error: {}".format(err)) - - # See https://pytorch.org/docs/stable/_modules/torch.html#set_float32_matmul_precision - torch.set_float32_matmul_precision('medium') - - if max_seq_len != None: - self.nemo_cfg.model.max_seq_len = max_seq_len - - assert action != None, "Action must be specified" - if action == "accuracy": - self.nemo_cfg.mode = "accuracy" - self.nemo_cfg.inference.compute_logprob = True - self.nemo_cfg.inference.all_probs = True - self.nemo_cfg.inference.greedy = True - self.nemo_cfg.inference.add_BOS = False - self.nemo_cfg.inference.tokens_to_generate = 1 - self.nemo_cfg.inference.min_tokens_to_generate = 0 - self.nemo_cfg.inference.temperature = 1.0 - self.nemo_cfg.inference.top_k = 0 - self.nemo_cfg.inference.top_p = 0.9 - self.nemo_cfg.inference.repetition_penalty = 1.0 - elif action == "benchmark": - self.nemo_cfg.mode = "benchmark" - if input_seq_len != None: - self.nemo_cfg.benchmark.input_seq_len = input_seq_len - if output_seq_len != None: - self.nemo_cfg.benchmark.output_seq_len = output_seq_len - self.nemo_cfg.inference.tokens_to_generate = self.nemo_cfg.benchmark.output_seq_len - self.nemo_cfg.inference.min_tokens_to_generate = self.nemo_cfg.benchmark.output_seq_len - - if self.nemo_cfg.model.max_seq_len < (self.nemo_cfg.benchmark.input_seq_len + self.nemo_cfg.benchmark.output_seq_len): - raise ValueError(f"Max sequence length of the model needs to be greater than or equal to the sum of input sequence length and output sequence length. Got {self.nemo_cfg.model.max_seq_len} < {self.nemo_cfg.benchmark.input_seq_len} + {self.nemo_cfg.benchmark.output_seq_len}.") - - if (nemo_model or nemo_checkpoint) and onnx_model: - raise RuntimeError( - "Both nemo-model and onnx-model cannot be specified together. Please specify either nemo-model or onnx-model." - ) - - assert variant in GPT3CONFIG_MAPPINGS - model_config = GPT3CONFIG_MAPPINGS[variant] - - if self.nemo_cfg.model.max_seq_len > model_config.max_position_embeddings: - G_LOGGER.warn( - f"Updating max_position_embeddings to be the same as max_seq_len {self.nemo_cfg.model.max_seq_len}." - ) - G_LOGGER.warn( - f"Outputs longer than {model_config.max_position_embeddings} might be unmeaningful." - ) - model_config.max_position_embeddings = self.nemo_cfg.model.max_seq_len - - if self.nemo_cfg.model.max_seq_len < model_config.min_seq_len: - G_LOGGER.warn( - f"Force updating max_seq_len to minimum required length {model_config.min_seq_len}." - ) - self.nemo_cfg.model.max_seq_len = model_config.min_seq_len - - self.nemo_cfg.batch_size = batch_size - self.nemo_cfg.use_cache = use_cache - - if nemo_checkpoint != None: - # Set NeMo checkpoint configs - self.nemo_cfg.checkpoint_dir = os.path.dirname(nemo_checkpoint) - if not self.nemo_cfg.checkpoint_dir: - raise ValueError(f"NeMo checkpoint needs to be provided with full path.") - self.nemo_cfg.checkpoint_name = os.path.basename(nemo_checkpoint) - self.nemo_cfg.hparams_file = nemo_hparams - else: - if onnx_model != None: - G_LOGGER.info(f"Using onnx model {onnx_model} for inference.") - if os.path.exists(onnx_model): - self.nemo_cfg.onnx_model_file = onnx_model - else: - raise IOError( - f"Could not find the specified onnx file {onnx_model}." 
- ) - else: - if nemo_model != None: - if os.path.exists(nemo_model): - self.nemo_cfg.gpt_model_file = nemo_model - else: - raise IOError( - f"Could not find the specified nemo file {nemo_model}." - ) - else: - G_LOGGER.info("Downloading nemo model from HuggingFace Hub") - # Download nemo model if it does not exist. - # Setup temporary metadata, config to create a workspace to put the - # downloaded artefacts in - download_metadata = NetworkMetadata( - variant=variant, - precision=Precision(fp16=self.fp16), - use_cache=use_cache, - num_beams=num_beams, - batch_size=batch_size - ) - - download_config = self.config_class(metadata=download_metadata) - download_config.from_nemo_config(copy(self.nemo_cfg)) - download_workspace = NNFolderWorkspace(download_config, working_dir) - - self.nemo_cfg.gpt_model_file = download_model( - dst_dir=download_workspace.dpath + "/artefacts", - cache_dir=download_workspace.dpath + "/cache", - variant=variant, - fp8=fp8, - ) - - if self.nemo_cfg.gpt_model_file == None and self.nemo_cfg.checkpoint_dir == None and onnx_model == None: - G_LOGGER.error("No model exists based on specified configs and precisions.") - raise ValueError("Model not found.") - - self.update_hyperparams(model_config) - - # HuggingFace code - if verbose: - G_LOGGER.setLevel(level=G_LOGGER.DEBUG) - elif info: - G_LOGGER.setLevel(level=G_LOGGER.INFO) - - if variant is None: - G_LOGGER.error("You need to specify --variant to run NeMo demo") - return - - if self._args is not None: - G_LOGGER.info("Setting up environment with arguments: {}".format(self._args)) - else: - G_LOGGER.info("User-customized API is called") - - self.metadata = NetworkMetadata( - variant=variant, - precision=Precision(fp16=self.fp16), - use_cache=use_cache, - num_beams=num_beams, - batch_size=batch_size - ) - - self.config = self.config_class( - metadata = self.metadata - ) - - self.config.from_nemo_config(self.nemo_cfg) - - self.workspace = NNFolderWorkspace( - self.config, working_dir - ) - - self.timing_profile = TimingProfile( - iterations=iterations, - number=number, - warmup=warmup, - duration=duration, - percentile=percentile, - ) - - self.keep_torch_model = not cleanup - self.keep_onnx_model = not cleanup - self.keep_trt_engine = not cleanup - - self.process_framework_specific_arguments(onnx_model=onnx_model, **kwargs) - - def process_framework_specific_arguments(self, **kwargs): - pass - - def run(self) -> Union[List[NetworkResult], BenchmarkingResult]: - """ - Main entry point of our function which compiles and generates our model data for command-line mode. 
- The general process for the commands are all the same: - (1) Download the model - (2) Run either checkpoint or benchmark - (3) Returns the result - """ - t0 = time.time() - self.models = self.setup_tokenizer_and_model() - t1 = time.time() - G_LOGGER.info("setup_tokenizer_and_model() takes {:.4f}s in total.".format(t1 - t0)) - - results = [] - ppl = None - random.seed(self.nemo_cfg.inference.seed) - np.random.seed(self.nemo_cfg.inference.seed) - torch.manual_seed(self.nemo_cfg.inference.seed) - if self.nemo_cfg.mode == "accuracy": - G_LOGGER.debug("Run in accuracy mode.") - eval_ppl = get_accuracy_metric(self.nemo_cfg.accuracy) - has_align_requirement = self.nemo_cfg.runtime == 'nemo' and hasattr(self.model.cfg, "fp8") and self.model.cfg.fp8 == True - if has_align_requirement and self.nemo_cfg.accuracy.tokens_to_generate > 1: - self.nemo_cfg.accuracy.tokens_to_generate = 1 - G_LOGGER.warn("Force set tokens_to_generate=1 for FP8 run in NeMo framework.") - dataset = load_dataset(self.nemo_cfg.accuracy.dataset, self.workspace.rootdir, self.nemo_cfg.accuracy.tokens_to_generate, 8 if has_align_requirement else -1) - tokenizer = self.tokenizer - - def eval_ppl_with_batch_input(eval_ppl, batch_input): - ds_input = dataset.preprocess_input(tokenizer, batch_input) - self.nemo_cfg.inference.tokens_to_generate = self.nemo_cfg.accuracy.tokens_to_generate - self.nemo_cfg.inference.min_tokens_to_generate = self.nemo_cfg.accuracy.tokens_to_generate - - inputs = ds_input.inputs - response = full_inference( - model=self.model, - inputs=inputs, - cfg=self.nemo_cfg, - ) - - # It is still predication task even when tokens_to_generate > 1, so we need restore the context length. - batch_size = ds_input.inputs[0].shape[0] - real_ctx_length = ds_input.inputs[0].shape[1] - 1 - inputs = (ds_input.inputs[0], torch.ones(batch_size, dtype=torch.int32) * real_ctx_length) - - response = get_computeprob_response(tokenizer, response, inputs) - eval_ppl.update(ds_input=ds_input, response=response, tokenizer=tokenizer) - - batch_input = [] - for doc in tqdm(dataset.load()): - batch_input.append(doc) - - if len(batch_input) == self.nemo_cfg.batch_size: - eval_ppl_with_batch_input(eval_ppl, batch_input) - batch_input.clear() - - if len(batch_input): - # Pad empty text to batch size - while (len(batch_input) % self.nemo_cfg.batch_size) != 0: - batch_input.append({"text": ""}) - eval_ppl_with_batch_input(eval_ppl, batch_input) - - ppl, sequence_ppl, _, acc_text = eval_ppl.compute() - print("***************************") - print("{} ppl(last token): {:.4f}, ppl(sequence): {:.4f}, {}".format(self.nemo_cfg.accuracy.dataset, ppl, sequence_ppl, acc_text)) - print("***************************") - elif self.nemo_cfg.mode == "benchmark": - G_LOGGER.debug("Run in benchmark mode.") - rand_input = get_random_input(self.model.tokenizer, self.nemo_cfg.batch_size, self.nemo_cfg.benchmark.input_seq_len, self.nemo_cfg.benchmark.output_seq_len) - - for _ in range(self.timing_profile.warmup): - output = full_inference(self.model, rand_input, self.nemo_cfg) - - class BenchmarkTimer: - def __init__(self, name): - self.name = name - self.started = False - self.start_time = None - self.times = [] - - def start(self): - assert not self.started - self.started = True - self.start_time = time.perf_counter() - - def end(self): - assert self.started - self.started = False - self.times.append(time.perf_counter() - self.start_time) - - def stats_str(self, num_tokens): - total_time = sum(self.times) - avg_time = total_time / float(len(self.times)) - 
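# Note: once self.times is sorted in ascending order below, int(len(self.times) * 0.95) and int(len(self.times) * 0.99) index the 95th- and 99th-percentile iteration latencies; throughput is the number of generated tokens divided by the average iteration time.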
self.times.sort() - percentile95 = self.times[int(len(self.times) * 0.95)] - percentile99 = self.times[int(len(self.times) * 0.99)] - throughput = float(num_tokens) / avg_time - return("[{:10s}] Total Time: {:0.5f} s, Average Time: {:0.5f} s, 95th Percentile Time: {:0.5f} s, 99th Percentile Time: {:0.5f} s, Throughput: {:0.2f} tokens/s".format(self.name, total_time, avg_time, percentile95, percentile99, throughput)) - - G_LOGGER.info("Warm up finished. Start benchmarking...") - e2e_timer = BenchmarkTimer("E2E inference") - core_timer = BenchmarkTimer("Without tokenizer") - start_time = datetime.now() - iter_idx = 0 - cur_duration = 0 - while iter_idx < self.timing_profile.iterations or cur_duration < self.timing_profile.duration: - core_timer.start() - e2e_timer.start() - output = generate(self.model, rand_input, self.nemo_cfg) - core_timer.end() - - output = process_output(self.model, output) - e2e_timer.end() - - iter_idx += 1 - cur_duration = (datetime.now() - start_time).total_seconds() - - num_tokens = self.nemo_cfg.batch_size * self.nemo_cfg.benchmark.output_seq_len - print("***************************") - print(f"Running {iter_idx} iterations with duration: {cur_duration}s, batch size: {self.nemo_cfg.batch_size}, input sequence length: {self.nemo_cfg.benchmark.input_seq_len} and output sequence length: {self.nemo_cfg.benchmark.output_seq_len}") - print(f"{e2e_timer.stats_str(num_tokens)}") - print(f"{core_timer.stats_str(num_tokens)}") - print("***************************") - else: - G_LOGGER.debug("Run in inference mode.") - assert self.nemo_cfg.mode == "inference" - if self.nemo_cfg.runtime == 'nemo' and hasattr(self.model.cfg, "fp8") and self.model.cfg.fp8 == True and self.nemo_cfg.batch_size % 8 != 0: - new_batch_size = ((self.nemo_cfg.batch_size + 7) // 8) * 8 - print("Update batch size from {} to {} for NeMo FP8 inference.".format(self.nemo_cfg.batch_size, new_batch_size)) - self.nemo_cfg.batch_size = new_batch_size - - nb_paddings = 0 - while (len(self.nemo_cfg.prompts) % self.nemo_cfg.batch_size) != 0: - self.nemo_cfg.prompts.append(self.nemo_cfg.prompts[-1]) - nb_paddings += 1 - - batch_idx = 0 - start = 0 - while True: - inputs = OmegaConf.to_container(listconfig.ListConfig(self.nemo_cfg.prompts[start:start+self.nemo_cfg.batch_size])) - output = full_inference(self.model, inputs, self.nemo_cfg) - output = remove_padded_prompts(output, nb_paddings) - print("***************************") - print("Batch {}: {}".format(batch_idx, output)) - print("***************************") - batch_idx += 1 - start += self.nemo_cfg.batch_size - if start >= len(self.nemo_cfg.prompts): - break - - t2 = time.time() - G_LOGGER.info("Inference session is {:.4f}s in total.".format(t2 - t1)) - - # Release runtime objects - if self.nemo_cfg.runtime == 'onnx': - del self.model.onnxrt - elif self.nemo_cfg.runtime == 'trt': - del self.model.trt - - return results, ppl - - def add_args(self) -> None: - general_group = self._parser.add_argument_group("general") - general_group.add_argument( - "--help", - "-h", - help="Shows help message for NeMo commands.", - action="store_true", - ) - general_group.add_argument( - "--verbose", "-v", - help="Display verbose logs.", - action="store_true" - ) - general_group.add_argument( - "--info", help="Display info logs.", action="store_true" - ) - general_group.add_argument( - "--working-dir", "-wd", - help="Location of where to save the model and other downloaded files.", - required=True, - ) - - timing_group = self._parser.add_argument_group("inference 
measurement") - timing_group.add_argument( - "--duration", - type=int, - help="Minimal duration of inference iterations to measure in seconds.", - default=NetworkCommand.DEFAULT_DURATION, - ) - timing_group.add_argument( - "--iterations", - type=int, - help="Number of iterations to measure.", - default=NetworkCommand.DEFAULT_ITERATIONS, - ) - timing_group.add_argument( - "--warmup", - type=int, - help="Number of warmup iterations before actual measurement occurs.", - default=NetworkCommand.DEFAULT_WARMUP, - ) - - model_config_group = self._parser.add_argument_group("model") - model_config_group.add_argument( - "--nemo-model", - help="Set a NeMo model to be used.", - type=str, - default=None - ) - model_config_group.add_argument( - "--nemo-checkpoint", - help="Set a NeMo checkpoint to be used.", - type=str, - default=None - ) - model_config_group.add_argument( - "--nemo-hparams", - help="Set a NeMo hparams.yaml to be used.", - type=str, - default=None - ) - model_config_group.add_argument( - "--onnx-model", - help="Set a onnx model (exported from a NeMo model) to be used. See `export_utils.py` in the model directory for exporting onnx files", - type=str, - default=None, - ) - model_config_group.add_argument( - "--max-seq-len", - help="Set maximum sequence lengths used for a GPT model.", - type=int, - default=None, - ) - model_config_group.add_argument( - "--batch-size", "-b", - help="Set batch size for inference", - required=False, - type=int, - default=1 - ) - model_config_group.add_argument( - "--variant", "-m", - help="Model to generate", - required=True, - choices=GPT3ModelTRTConfig.TARGET_MODELS, - ) - model_config_group.add_argument( - "--use-cache", - "-kv", - help="Enable KV cache", - action="store_true", - default=False, - ) - model_config_group.add_argument( - "--fp8", - action="store_true", - help="Use FP8 precision.", - default=False - ) - model_config_group.add_argument( - "--fp16", - action="store_true", - help="Use FP16 precision.", - default=False - ) - model_config_group.add_argument( - "--bf16", - action="store_true", - help="Use BF16 precision.", - default=False - ) - model_config_group.add_argument( - "--use-fp8-storage", - action="store_true", - help="Use FP8 storage precision.", - default=False - ) - model_config_group.add_argument( - "--quantize-bmms", - help="Quantize attention BMMs", - action="store_true", - default=False, - ) - - def __call__(self): - t0 = time.time() - self.add_args() - self._args = self._parser.parse_args() - if "help" in self._args and self._args.help == True: - self._parser.print_help() - exit(0) - - self.setup_environment( - **vars(self._args), - ) - t1 = time.time() - G_LOGGER.info("Set up environment takes {:.4f}s.".format(t1 - t0)) - - network_results, ppl_results = self.run() - return NetworkCheckpointResult( - network_results=network_results, - accuracy=0, - perplexity=0, - ) diff --git a/demo/NeMo/nemo_export.py b/demo/NeMo/nemo_export.py deleted file mode 100644 index b9f5ad3a9..000000000 --- a/demo/NeMo/nemo_export.py +++ /dev/null @@ -1,922 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -import subprocess as sp -import shlex -import omegaconf -import os -import sys -import warnings -from typing import Dict, List, Optional, Tuple -import numpy as np - -# nemo -from nemo.core import ModelPT -from nemo.core.classes import Exportable -from nemo.core.neural_types import ChannelType, NeuralType -from nemo.utils.export_utils import augment_filename -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel, MegatronGPTExportableModel - -# onnx -import onnx -import onnx_graphsurgeon as gs - -# polygraphy -from polygraphy.backend.trt import Profile, CreateConfig, engine_from_network, NetworkFromOnnxPath, save_engine -from polygraphy.logger import G_LOGGER as PG_LOGGER - -import torch -import transformer_engine - -if __name__ == "__main__": - filepath = os.path.dirname(os.path.abspath(__file__)) - project_root = os.path.join(filepath, os.pardir, "HuggingFace") - sys.path.append(project_root) - -# Add syspath for custom library -from GPT3.nemo_utils import load_nemo_model, release_nemo_model -from GPT3.convert_te_onnx_to_trt_onnx import replace_customop_qdq_with_onnx_qdq - -# HuggingFace utils -from NNDF.logger import G_LOGGER -from NNDF.models import _calculate_polygraphy_verbosity - -# ONNX conversion script - -# Set polygraphy logging level here. -PG_LOGGER.module_severity = PG_LOGGER.INFO - -class MegatronGPTSingleInputExportableModel(MegatronGPTExportableModel): - """ - Wrapper for MegatronGPTExportableModel to export ONNX with a single input - """ - - def __init__(self, model, max_seq_len): - super().__init__(model) - self.cfg = model.cfg - self.max_seq_len = max_seq_len - - def forward(self, tokens): - def model_forward(tokens): - position_ids, attention_mask = self.get_position_ids_and_mask(tokens, self.max_seq_len) - assert tokens.shape == position_ids.shape - assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] - return self.model.forward( - tokens=tokens.cuda(), - text_position_ids=position_ids.cuda(), - attention_mask=attention_mask.cuda(), - labels=None, - ) - - with torch.no_grad(), torch.inference_mode(), torch.autocast( - 'cuda', dtype=self.dtype - ), warnings.catch_warnings(): - warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') - if self.fp8_enabled: - with transformer_engine.pytorch.onnx_export(self.fp8_enabled), transformer_engine.pytorch.fp8_autocast( - enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe - ): - output_tensor = model_forward(tokens) - else: - output_tensor = model_forward(tokens) - return output_tensor - - def get_position_ids_and_mask(self, data, max_seq_len): - seq_len = data.size()[1] - # Attention mask (lower triangular). - attention_mask = torch.tril(torch.ones( - (1, max_seq_len, max_seq_len), device=data.device)).view( - 1, 1, max_seq_len, max_seq_len) - - # Position ids. 
- position_ids = torch.arange(max_seq_len, dtype=torch.long, - device=data.device) - position_ids = position_ids[:seq_len].unsqueeze(0).expand_as(data) - - # Convert attention mask to binary: - attention_mask = (attention_mask < 0.5) - - return position_ids, attention_mask[:1, :1, :seq_len, :seq_len] - - def input_example(self): - ids = self.model.tokenizer.text_to_ids("how is the weather on Sunday morning?") - id_tensors = torch.unsqueeze(torch.LongTensor(ids), dim=0) - G_LOGGER.debug(f"Calling input_example shape {id_tensors.shape}") - return id_tensors, # return a tuple - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "input_ids": NeuralType(('B', 'T'), ChannelType()), - } - - @property - def input_names(self) -> List[str]: - return ['input_ids'] - -def get_trtexec_cmd(onnx_fpath, cfg, bs): - max_seq_len = cfg.model.max_seq_len - opt_seq_len = cfg.trt_export_options.opt_seq_len if cfg.trt_export_options.opt_seq_len else (max_seq_len // 2) - trtexec_cmd = f"trtexec --onnx={onnx_fpath}" - min_shapes = f"--minShapes=input_ids:{bs}x1" - opt_shapes = f"--optShapes=input_ids:{bs}x{opt_seq_len}" - max_shapes = f"--maxShapes=input_ids:{bs}x{max_seq_len}" - if not cfg.use_one_input: - min_shapes += f",position_ids:{bs}x1" - opt_shapes += f",position_ids:{bs}x{opt_seq_len}" - max_shapes += f",position_ids:{bs}x{max_seq_len}" - if not cfg.trt_export_options.use_fp8: - min_shapes += ",attention_mask:1x1x1x1" - opt_shapes += f",attention_mask:1x1x{opt_seq_len}x{opt_seq_len}" - max_shapes += f",attention_mask:1x1x{max_seq_len}x{max_seq_len}" - - if cfg.use_cache: - trtexec_cmd += " --profile=0" - nbheads, headsize = cfg.model.nb_heads, cfg.model.head_size - input_k = get_past_key_name('*') - input_v = get_past_value_name('*') - # ("sequence", "batch", nbheads, headsize) - min_shapes += f",{input_k}:0x{bs}x{nbheads}x{headsize},{input_v}:0x{bs}x{nbheads}x{headsize}" - opt_shapes += f",{input_k}:0x{bs}x{nbheads}x{headsize},{input_v}:0x{bs}x{nbheads}x{headsize}" - max_shapes += f",{input_k}:0x{bs}x{nbheads}x{headsize},{input_v}:0x{bs}x{nbheads}x{headsize}" - trtexec_cmd += f" {min_shapes} {opt_shapes} {max_shapes}" - - if cfg.use_cache: - trtexec_cmd += " --profile=1" - - min_shapes = f"--minShapes=input_ids:{bs}x1" - opt_shapes = f"--optShapes=input_ids:{bs}x1" - max_shapes = f"--maxShapes=input_ids:{bs}x1" - if not cfg.use_one_input: - min_shapes += f",position_ids:{bs}x1" - opt_shapes += f",position_ids:{bs}x1" - max_shapes += f",position_ids:{bs}x1" - if not cfg.trt_export_options.use_fp8: - min_shapes += ",attention_mask:1x1x1x1" - opt_shapes += f",attention_mask:1x1x{opt_seq_len}x{opt_seq_len}" - max_shapes += f",attention_mask:1x1x{max_seq_len}x{max_seq_len}" - - nbheads, headsize = cfg.model.nb_heads, cfg.model.head_size - input_k = get_past_key_name('*') - input_v = get_past_value_name('*') - # ("sequence", "batch", nbheads, headsize) - min_shapes += f",{input_k}:1x{bs}x{nbheads}x{headsize},{input_v}:1x{bs}x{nbheads}x{headsize}" - opt_shapes += f",{input_k}:{opt_seq_len}x{bs}x{nbheads}x{headsize},{input_v}:{opt_seq_len}x{bs}x{nbheads}x{headsize}" - max_shapes += f",{input_k}:{max_seq_len - 1}x{bs}x{nbheads}x{headsize},{input_v}:{max_seq_len - 1}x{bs}x{nbheads}x{headsize}" - trtexec_cmd += f" {min_shapes} {opt_shapes} {max_shapes}" - - use_tf32 = cfg.trt_export_options.use_tf32 - use_fp8 = cfg.trt_export_options.use_fp8 - use_fp16 = cfg.trt_export_options.use_fp16 - use_bf16 = cfg.trt_export_options.use_bf16 - use_strongly_typed = 
cfg.trt_export_options.use_strongly_typed - sparse = cfg.trt_export_options.sparse - trtexec_cmd += " --noTF32" if not use_tf32 else "" - trtexec_cmd += " --fp8" if (use_fp8 and not use_strongly_typed) else "" - trtexec_cmd += " --fp16" if (use_fp16 and not use_strongly_typed) else "" - trtexec_cmd += " --bf16" if (use_bf16 and not use_strongly_typed) else "" - trtexec_cmd += " --stronglyTyped" if use_strongly_typed else "" - trtexec_cmd += " --sparsity=enable" if sparse else "" - trtexec_cmd += " --timingCacheFile=functional.cache" - return trtexec_cmd - - -def add_zero_point(g, base_name, dtype): - """Add Q/DQ zero-point constant""" - _zp_fp8_value = onnx.helper.make_tensor(base_name + "_zp_fp8_value", dtype, (1,), [0.0]) - zero_point_fp8 = gs.Variable(base_name + "_zero_point", dtype=dtype, shape=(1,)) - zero_point_const = gs.Node(op="Constant", name= base_name + "_zero_point_const", inputs=[], outputs=[zero_point_fp8], attrs={"value": _zp_fp8_value}) - g.nodes.append(zero_point_const) - return zero_point_fp8 - - -def add_scale(g, base_name, dtype, value): - """Add Q/DQ scale constant""" - _scale_value = onnx.helper.make_tensor(base_name + "_scale_value", dtype, (1,), [value]) - scale = gs.Variable(base_name + "_scale", dtype=dtype, shape=(1,)) - scale_const = gs.Node(op="Constant", name=base_name + "_scale_const", inputs=[], outputs=[scale], attrs={"value": _scale_value}) - g.nodes.append(scale_const) - return scale - - -def add_cast(g, inp, outp_dtype, cast_name): - """Add Cast operator """ - cast_outp = gs.Variable(cast_name+"_out", dtype=outp_dtype) - new_cast = gs.Node( - op="Cast", - name=cast_name, - inputs=[inp], - outputs=[cast_outp], - attrs={"to": outp_dtype} - ) - g.nodes.append(new_cast) - return cast_outp - - -def add_q(g, inp, hp_dtype, q_dtype, q_name=None): - """Add QuantizeLinear operator""" - scale_dtype = hp_dtype - q_name = q_name or f"{inp.name}_qfp8" - q_out = gs.Variable(q_name, dtype=q_dtype) - q = gs.Node(op="QuantizeLinear", name=q_name, - inputs=[ - inp, - add_scale(g, inp.name, scale_dtype, 1.0), - add_zero_point(g, inp.name, q_dtype) - ], - outputs=[q_out]) - g.nodes.append(q) - return q_out - - -def add_dq(g, inp, hp_dtype, dq_dtype): - """Add DequantizeLinear operator""" - dq_name = f"{inp.name}_dqfp8" - scale_dtype = hp_dtype - dq_out = gs.Variable(dq_name, dtype=hp_dtype) - dq = gs.Node(op="DequantizeLinear", name=dq_name, - inputs=[ - inp, - add_scale(g, inp.name, scale_dtype, 1.0), - add_zero_point(g, inp.name, dq_dtype)], - outputs=[dq_out]) - g.nodes.append(dq) - return dq_out - - -def quantize_all_bmms(g, dtype_high_prec, use_fp8_storage): - """Quantize the inputs of all batched matmul operators""" - - def quantize_bmm(g, bmm, dtype_high_prec): - assert len(bmm.inputs) == 2 - dq_outputs = [] - for i in range(len(bmm.inputs)): - if i == 0 or not use_fp8_storage: - q_outp = add_q(g, bmm.inputs[i], dtype_high_prec, onnx.TensorProto.FLOAT8E4M3FN) - dq_out = add_dq(g, q_outp, dtype_high_prec, onnx.TensorProto.FLOAT8E4M3FN) - else: - # mm.inputs[1] is the input from K or V which we don't quantize if is stored - # in the cache in quantized type. 
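# In short, add_q()/add_dq() above wire a QuantizeLinear -> DequantizeLinear pair with a
# unit scale and an FP8 zero point around a tensor, which is how quantize_bmm() rewrites
# each MatMul input. A rough usage sketch, where `g` is assumed to be a gs.Graph imported
# from ONNX and `matmul` one of its MatMul nodes (both are placeholders, not names from
# this file):
import onnx
hp_dtype = onnx.TensorProto.FLOAT16
q_out = add_q(g, matmul.inputs[0], hp_dtype, onnx.TensorProto.FLOAT8E4M3FN)
matmul.inputs[0] = add_dq(g, q_out, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN)
g.cleanup().toposort()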
- dq_out = add_dq(g, bmm.inputs[i], dtype_high_prec, onnx.TensorProto.FLOAT8E4M3FN) - dq_outputs.append(dq_out) - bmm.inputs = dq_outputs - - bmm_nodes = [node for node in g.nodes if node.op == "MatMul"] - G_LOGGER.info("Quantizing attention BMMs") - G_LOGGER.info(f"Found {len(bmm_nodes)} MatMul operator nodes") - for bmm in bmm_nodes: - # Do not quantize the Matmul at the head of GPT3 (it is used ) - if bmm.name == "/model/module/MatMul": - continue - quantize_bmm(g, bmm, dtype_high_prec) - - -# Use ONNX graphsurgeon to add KV-cache to ONNX file -# Reusing the HF demo names. -def get_past_key_name(layer_id): - past_key_name = f"past_key_values.{layer_id}.decoder.key" - return past_key_name - -def get_past_value_name(layer_id): - past_value_name = f"past_key_values.{layer_id}.decoder.value" - return past_value_name - -def get_past_shape(nbheads, headsize): - return ("sequence_past_decoder_length", "batch", nbheads, headsize) - -def get_present_key_name(layer_id: int): - present_key_name = f"present_key_values.{layer_id}.decoder.key" - return present_key_name - -def get_present_value_name(layer_id: int): - present_value_name = f"present_key_values.{layer_id}.decoder.value" - return present_value_name - -def get_present_shape(nbheads, headsize): - return ("sequence_present_decoder_length", "batch", nbheads, headsize) - -def get_new_key_name(layer_id: int): - new_key_name = f"new_key_values.{layer_id}.decoder.key" - return new_key_name - -def get_new_value_name(layer_id: int): - new_value_name = f"new_key_values.{layer_id}.decoder.value" - return new_value_name - -def get_new_shape(nbheads, headsize): - return ("sequence", "batch", nbheads, headsize) - -def quantize_new_k_v(g, key_new, value_new, hp_dtype): - key_new_q_outp = add_q(g, key_new, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN) - key_new_dq_out = add_dq(g, key_new_q_outp, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN) - value_new_q_outp = add_q(g, value_new, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN) - value_new_dq_out = add_dq(g, value_new_q_outp, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN) - return key_new_dq_out, value_new_dq_out - -def add_kvcache_for( - g, layer_id, qkv_split, nbheads, headsize, dtype, kv_output_policy, hp_dtype, use_fp8_storage, quantize_bmms): - _, key_new, value_new = qkv_split.outputs - key_consumers = [c for c in key_new.outputs] - value_consumers = [c for c in value_new.outputs] - - def add_graph_past_inputs(use_fp8_storage): - past_key = gs.Variable( - name=get_past_key_name(layer_id), - dtype=dtype, - shape=get_past_shape(nbheads, headsize)) - past_value = gs.Variable( - name=get_past_value_name(layer_id), - dtype=dtype, - shape=get_past_shape(nbheads, headsize)) - g.inputs.append(past_key) - g.inputs.append(past_value) - - if use_fp8_storage and not quantize_bmms: - past_key_dq = add_dq(g, past_key, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN) - past_value_dq = add_dq(g, past_value, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN) - return past_key_dq, past_value_dq - - return past_key, past_value - - def add_concat(concat_name, input0, input1, output_name): - concat_out = gs.Variable( - output_name, - dtype=dtype, - shape=get_present_shape(nbheads, headsize)) - - concat = gs.Node(op="Concat", name=concat_name, - inputs=[input0, input1], outputs=[concat_out], - attrs={"axis": 0}) - g.nodes.append(concat) - return concat_out - - def add_cache_outputs(kv_output_policy, use_fp8_storage, hp_dtype): - if kv_output_policy == "kv_cache_concat": - new_key_output, new_value_output = key_concat_out, value_concat_out - elif 
kv_output_policy == "kv_new": - key_new.dtype = dtype - key_new.shape = get_new_shape(nbheads, headsize) - key_new.name = get_new_key_name(layer_id) - value_new.dtype = dtype - value_new.shape = get_new_shape(nbheads, headsize) - value_new.name = get_new_value_name(layer_id) - - if use_fp8_storage: - key_new_q = add_q(g, key_new, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN, - f"{key_new.name}_qfp8") - value_new_q = add_q(g, value_new, hp_dtype, onnx.TensorProto.FLOAT8E4M3FN, - f"{value_new.name}_qfp8") - new_key_output, new_value_output = key_new_q, value_new_q - else: - new_key_output, new_value_output = key_new, value_new - else: - raise ValueError(f"Unsupported kv_output_policy: {kv_output_policy}") - g.outputs.append(new_key_output) - g.outputs.append(new_value_output) - return new_key_output, new_value_output - - past_key, past_value = add_graph_past_inputs(use_fp8_storage) - new_key_output, new_value_output = add_cache_outputs(kv_output_policy, use_fp8_storage, hp_dtype) - - if quantize_bmms: - if use_fp8_storage: - key_new = new_key_output - value_new = new_value_output - else: - key_new, value_new = quantize_new_k_v(g, key_new, value_new, hp_dtype) - key_concat_out = add_concat(f"key.{layer_id}.concat", - past_key, key_new, get_present_key_name(layer_id)) - value_concat_out = add_concat(f"value.{layer_id}.concat", - past_value, value_new, get_present_value_name(layer_id)) - - for c in key_consumers: - c.inputs[0] = key_concat_out - for c in value_consumers: - c.inputs[0] = value_concat_out - - -def add_kvcache(g, nbheads, headsize, dtype, kv_output_policy, hp_dtype, use_fp8_storage, quantize_bmms): - """Add KV-cache to each Transformer layer's QKV split """ - G_LOGGER.info("Adding KV-cache") - qkv_split_nodes = [node for node in g.nodes if node.op == "Split"] - G_LOGGER.debug(f"Found {len(qkv_split_nodes)} QKV-split nodes") - - for layer_id, qkv_split in enumerate(qkv_split_nodes): - add_kvcache_for( - g, layer_id, qkv_split, nbheads, headsize, dtype, kv_output_policy, hp_dtype, use_fp8_storage, quantize_bmms) - - G_LOGGER.debug("Done adding cache operations") - return len(qkv_split_nodes) - - -def normalize_dyn_axes_to_hf_names(g, vocab_size): - g.inputs[0].name = "input_ids" - g.inputs[0].shape = ("batch", "sequence") - if len(g.inputs) > 1: - g.inputs[1].name = "position_ids" - g.inputs[1].shape = ("batch", "sequence") - g.outputs[0].name = "logits" - g.outputs[0].shape = ("batch", "sequence", vocab_size) - G_LOGGER.debug("Done normalizing dynamic axes names to HuggingFace demo names") - - -def process_onnx( - kv_output_policy, - onnx_input_fpath, - onnx_output_fpath, - separate_param_files, - use_cache, - quantize_bmms, - nbheads, headsize, vocab_size, dtype, hp_dtype, use_fp8_storage): - """ - Process an ONNX model, add KV cache inputs and output, save result model to a specified path. - """ - G_LOGGER.info(f"Importing {onnx_input_fpath}... 
this will take some time") - g = gs.import_onnx(onnx.load(onnx_input_fpath)) - normalize_dyn_axes_to_hf_names(g, vocab_size) - num_layers = 0 - if use_cache: - num_layers = add_kvcache(g, nbheads, headsize, dtype, kv_output_policy, hp_dtype, use_fp8_storage, quantize_bmms) - g.cleanup().toposort() - - if quantize_bmms: - quantize_all_bmms(g, hp_dtype, use_fp8_storage) - g.cleanup().toposort() - - G_LOGGER.info(f"Exporting {onnx_output_fpath}") - model = gs.export_onnx(g) - G_LOGGER.info(f"Saving {onnx_output_fpath}") - if separate_param_files: - onnx.save_model(model, onnx_output_fpath, save_as_external_data=True, - all_tensors_to_one_file = False, convert_attribute=False) - else: - onnx.save_model(model, onnx_output_fpath, save_as_external_data=False) - G_LOGGER.info(f"Done: {onnx_output_fpath}") - return num_layers - - -def create_dir_if_not_exist(path): - dir = os.path.dirname(path) - if not os.path.exists(dir) and dir != "": - G_LOGGER.info(f"Making directory {dir}") - os.makedirs(dir) - - -class NeMoConverter(): - """ - A class to convert a NeMo model to an ONNX file, and convert an ONNX file to a TensorRT engine. - """ - def __init__(self, cfg, model_type=ModelPT): - self.model_type = model_type - self.cfg = cfg - self.model = None - self.export_envvars() - - def export_envvars(self) -> None: - if self.cfg.trt_export_options.use_fp8: - G_LOGGER.info( - f"Setting max sequence length to {self.cfg.model.max_seq_len}" - ) - os.environ["NVTE_ONNX_KVCACHE_MAX_SEQ_LEN"] = str( - self.cfg.model.max_seq_len - ) - - def nemo_to_onnx(self) -> str: - """ - Convert a NeMo model to an ONNX model, return the file path to the ONNX model. - """ - if self.model == None: - self.model = load_nemo_model(self.cfg, self.model_type) - - if not isinstance(self.model, Exportable): - G_LOGGER.error("Your NeMo model class ({}) is not Exportable.".format(self.model.__class__.__name__)) - sys.exit(1) - - if hasattr(self.model.cfg, "fp8") and self.model.cfg.fp8 == True: - if self.cfg.trt_export_options.use_fp8 == False: - G_LOGGER.info("Turning on trt_export_options.use_fp8 because NeMo model is in FP8 precision.") - self.cfg.trt_export_options.use_fp8 = True - else: - if self.cfg.trt_export_options.use_fp8 == True: - G_LOGGER.info("Turning off trt_export_options.use_fp8 because NeMo model is not in FP8 precision.") - self.cfg.trt_export_options.use_fp8 = False - - onnx_out = self.cfg.onnx_model_file - create_dir_if_not_exist(onnx_out) - check_trace = self.cfg.onnx_export_options.runtime_check - onnx_names = [] - - dynamic_axes={ - 'input_ids': {0: "batch", 1: "sequence"}, - 'position_ids': {0: "batch", 1: "sequence"}, - 'logits': {0: "batch", 1: "sequence"}, - } - - if self.cfg.use_one_input: - # Use a wrapper class to get rid of inputs other than input_ids. 
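# The dynamic_axes mapping above is what ultimately reaches torch.onnx.export underneath
# NeMo's Exportable.export(). A stripped-down sketch of the equivalent call for a plain
# single-input module (`wrapped_model`, `example_ids`, and the opset value are
# placeholders, not values taken from the config):
import torch
torch.onnx.export(
    wrapped_model,
    (example_ids,),                       # e.g. a (1, seq_len) LongTensor of token ids
    "model.onnx",
    input_names=["input_ids"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": {0: "batch", 1: "sequence"},
        "logits": {0: "batch", 1: "sequence"},
    },
    opset_version=17,
)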
- self.model = MegatronGPTSingleInputExportableModel(self.model, self.cfg.model.max_seq_len) - del dynamic_axes['position_ids'] - - try: - self.model.to(device=self.cfg.onnx_export_options.device).freeze() - self.model.eval() - if not self.cfg.trt_export_options.use_fp8: - G_LOGGER.info("Exporting ONNX with attention_mask") - dynamic_axes['attention_mask'] = {2: "sequence", 3: "sequence"} - - self.model.export( - onnx_out, - onnx_opset_version=self.cfg.onnx_export_options.onnx_opset, - do_constant_folding=self.cfg.onnx_export_options.do_constant_folding, - dynamic_axes=dynamic_axes, - check_trace=check_trace, - check_tolerance=self.cfg.onnx_export_options.check_tolerance, - verbose=self.cfg.onnx_export_options.verbose, - ) - onnx_names = [augment_filename(onnx_out, subnet_name) for subnet_name in self.model.list_export_subnets()] - - except Exception as e: - G_LOGGER.error( - "Export failed. Please make sure your NeMo model class ({}) has working export() and that you have the latest NeMo package installed with [all] dependencies.".format( - self.model.__class__ - ) - ) - raise e - - release_nemo_model(self.model) - assert len(onnx_names) == 1 - os.rename(onnx_names[0], onnx_out) - return onnx_out - - def prune_onnx(self, input_path) -> str: - """ - Prune the input ONNX model to be structured sparsity pattern by using polygraphy. - """ - if not self.cfg.trt_export_options.sparse: - G_LOGGER.warning(f"Model pruning is enabled but sparsity is not enabled for TRT engine builder.") - - ibname = os.path.basename(input_path) - obname = "pruned." + ibname - opath = os.path.join(os.path.dirname(input_path), obname) - o_data_real_path = opath + "_data" - if os.path.exists(opath) and os.path.exists(o_data_real_path): - return opath - - o_data_bname = os.path.basename(o_data_real_path) - cmds = f"polygraphy surgeon prune {input_path} -o {opath} --save-external-data {o_data_bname}" - G_LOGGER.info(f"Prune ONNX model with: {cmds}") - G_LOGGER.info(f"This may take a while...") - sp.run(shlex.split(cmds), check=True, stdout=sp.PIPE, stderr=sp.STDOUT) - return opath - - - def create_onnx(self, onnx_input_fpath, onnx_output_fpath, kv_output_policy="kv_new"): - """ - Create an ONNX model with modifications from `onnx_input_fpath`, save the ONNX model to `onnx_output_fpath`. - The ONNX is modified to use a KV-Cache and/or quantize the attention batched matrix-multiplication ops. - No return value for this function. - """ - assert os.path.splitext(onnx_input_fpath)[1] == ".onnx", "Input ONNX file must end with '.onnx'." - assert os.path.splitext(onnx_output_fpath)[1] == ".onnx", "Output ONNX file must end with '.onnx'." - - quantize_bmms = self.cfg.onnx_export_options.quantize_bmms - use_cache = self.cfg.use_cache - nbheads, headsize = self.cfg.model.nb_heads, self.cfg.model.head_size - hp_dtype = onnx.TensorProto.BFLOAT16 if self.cfg.trt_export_options.use_bf16 else onnx.TensorProto.FLOAT16 - dtype = hp_dtype - if self.cfg.onnx_export_options.use_fp8_storage: - dtype = onnx.TensorProto.FLOAT8E4M3FN - assert nbheads * headsize == self.cfg.model.hidden_size, "Model hidden size does not match." 
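# Once process_onnx() below has run with use_cache=True, every transformer layer gains a
# pair of cache inputs and outputs named via the helpers above. Under the default
# kv_output_policy="kv_new", a hypothetical two-layer model would expose:
for layer_id in range(2):
    print(get_past_key_name(layer_id), get_past_value_name(layer_id))  # inputs:  past_key_values.<i>.decoder.{key,value}
    print(get_new_key_name(layer_id), get_new_value_name(layer_id))    # outputs: new_key_values.<i>.decoder.{key,value}
# Past inputs carry shape ("sequence_past_decoder_length", "batch", nb_heads, head_size),
# while the new-KV outputs carry ("sequence", "batch", nb_heads, head_size), matching
# get_past_shape() and get_new_shape() above.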
- num_qkvs = process_onnx(kv_output_policy, - onnx_input_fpath, onnx_output_fpath, separate_param_files=True, - use_cache=use_cache, quantize_bmms=quantize_bmms, - nbheads=nbheads, headsize=headsize, vocab_size=self.cfg.model.vocab_size, dtype=dtype, hp_dtype=hp_dtype, use_fp8_storage=self.cfg.onnx_export_options.use_fp8_storage) - - G_LOGGER.info(f"Number of QKV subgraphs = {num_qkvs}, number of layers = {self.cfg.model.num_layers}") - if num_qkvs != self.cfg.model.num_layers: - raise ValueError("Number of QKV subgraphs must be the same as number of layers in the model.") - G_LOGGER.info(f"Saved KV-cache onnx to {onnx_output_fpath}") - - - # Reads an onnx file and creates a trt engine file - def onnx_to_trt(self, onnx_fpath, trt_fpath): - """ - Convert an ONNX model from `onnx_fpath` to a TensorRT engine, and save the result to `trt_fpath`. - """ - # Set up polygraphy config - use_tf32 = self.cfg.trt_export_options.use_tf32 - use_fp16 = self.cfg.trt_export_options.use_fp16 - use_fp8 = self.cfg.trt_export_options.use_fp8 - use_bf16 = self.cfg.trt_export_options.use_bf16 - strongly_typed = self.cfg.trt_export_options.use_strongly_typed - sparse = self.cfg.trt_export_options.sparse - if sparse and not self.cfg.onnx_export_options.prune: - G_LOGGER.warning("Sparsity for TRT engine builder is enabled, but model pruning is not.") - - # Create optimization profiles - bs = self.cfg.batch_size - max_seq_len = self.cfg.model.max_seq_len - opt_seq_len = self.cfg.trt_export_options.opt_seq_len if self.cfg.trt_export_options.opt_seq_len else (max_seq_len // 2) - profile_non_kv = Profile() - profile_non_kv.add(name="input_ids", min=(bs, 1), opt=(bs, opt_seq_len), max=(bs, max_seq_len)) # (batch, sequence) - if not self.cfg.use_one_input: - profile_non_kv.add(name="position_ids", min=(bs, 1), opt=(bs, opt_seq_len), max=(bs, max_seq_len)) # (batch, sequence) - # For FP8 precision, attention mask is created inside transformer_engine. - if not self.cfg.trt_export_options.use_fp8: - profile_non_kv.add(name="attention_mask", min=(1, 1, 1, 1), opt=(1, 1, opt_seq_len, opt_seq_len), max=(1, 1, max_seq_len, max_seq_len)) # (1, 1, sequence, sequence) - - num_layers, nbheads, headsize = self.cfg.model.num_layers, self.cfg.model.nb_heads, self.cfg.model.head_size - if self.cfg.use_cache: - for i in range(num_layers): - input_k = get_past_key_name(i) - input_v = get_past_value_name(i) - # (sequence, batch, nbheads, headsize) - profile_non_kv.add(name=input_k, min=(0, bs, nbheads, headsize), opt=(0, bs, nbheads, headsize), max=(0, bs, nbheads, headsize)) - profile_non_kv.add(name=input_v, min=(0, bs, nbheads, headsize), opt=(0, bs, nbheads, headsize), max=(0, bs, nbheads, headsize)) - - profiles = [profile_non_kv] - - # When enabling KV-cache, use first profile for context phase and second profile for generation phase - if self.cfg.use_cache: - profile_kv = Profile() - profile_kv.add(name="input_ids", min=(bs, 1), opt=(bs, 1), max=(bs, 1)) # (batch, sequence) - if not self.cfg.use_one_input: - profile_kv.add(name="position_ids", min=(bs, 1), opt=(bs, 1), max=(bs, 1)) # (batch, sequence) - # For FP8 precision, attention mask is created inside transformer_engine. 
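# As an aside, the two-profile idea here in minimal standalone Polygraphy form (toy sizes,
# layer-0 key binding only; the value binding is analogous): one profile covers the
# full-prompt context phase with an empty cache, the other covers single-token generation
# with a growing cache.
from polygraphy.backend.trt import Profile
bs, max_len, opt_len, nb_heads, head_size = 1, 256, 128, 12, 64
ctx_profile = Profile()
ctx_profile.add("input_ids", min=(bs, 1), opt=(bs, opt_len), max=(bs, max_len))
ctx_profile.add("past_key_values.0.decoder.key",
                min=(0, bs, nb_heads, head_size), opt=(0, bs, nb_heads, head_size), max=(0, bs, nb_heads, head_size))
gen_profile = Profile()
gen_profile.add("input_ids", min=(bs, 1), opt=(bs, 1), max=(bs, 1))
gen_profile.add("past_key_values.0.decoder.key",
                min=(1, bs, nb_heads, head_size), opt=(opt_len, bs, nb_heads, head_size), max=(max_len - 1, bs, nb_heads, head_size))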
- if not self.cfg.trt_export_options.use_fp8: - profile_kv.add(name="attention_mask", min=(1, 1, 1, 1), opt=(1, 1, opt_seq_len, opt_seq_len), max=(1, 1, max_seq_len, max_seq_len)) # (1, 1, sequence, sequence) - - assert num_layers > 0 - nbheads, headsize = self.cfg.model.nb_heads, self.cfg.model.head_size - for i in range(num_layers): - input_k = get_past_key_name(i) - input_v = get_past_value_name(i) - # (sequence, batch, nbheads, headsize) - profile_kv.add(name=input_k, min=(1, bs, nbheads, headsize), opt=(opt_seq_len, bs, nbheads, headsize), max=(max_seq_len-1, bs, nbheads, headsize)) - profile_kv.add(name=input_v, min=(1, bs, nbheads, headsize), opt=(opt_seq_len, bs, nbheads, headsize), max=(max_seq_len-1, bs, nbheads, headsize)) - profiles = [profile_kv, profile_non_kv] - - - # Read about these arguments here: - # https://github.com/NVIDIA/TensorRT/blob/main/tools/Polygraphy/polygraphy/backend/trt/config.py - # Note that the precision args below *enable*, not *require*, the specified precision - preview_features = [] - - trt_config = CreateConfig( - tf32= use_tf32, - fp16=False if strongly_typed else use_fp16, - bf16=False if strongly_typed else use_bf16, - sparse_weights=sparse, - profiles=profiles, - precision_constraints=None if strongly_typed else "obey", - preview_features=preview_features, - fp8=False if strongly_typed else use_fp8, - load_timing_cache=self.cfg.trt_export_options.timing_cache, - ) - - # Print out trtexec command for debugging - G_LOGGER.debug(" >>> trtexec command for debugging:") - G_LOGGER.debug(get_trtexec_cmd(onnx_fpath, self.cfg, bs)) - - with PG_LOGGER.verbosity(_calculate_polygraphy_verbosity()): - G_LOGGER.info(f"Reading ONNX file at {onnx_fpath}") - network = NetworkFromOnnxPath(onnx_fpath, strongly_typed=strongly_typed) - G_LOGGER.info("Building TRT engine") - engine = engine_from_network(network, config=trt_config) - G_LOGGER.info(f"Saving TRT engine to {trt_fpath}") - save_engine(engine, trt_fpath) - - @staticmethod - def _resolve_opset19_paths(onnx_fpath, results_path: Optional[str] = None) -> str: - foldername, filename = os.path.split(onnx_fpath) - return foldername if not results_path else results_path, filename - - @staticmethod - def get_opset19_onnx_fpath(onnx_fpath, results_path: Optional[str] = None) -> str: - suffix = ".opset19.onnx" - results_path, filename = NeMoConverter._resolve_opset19_paths( - onnx_fpath, results_path - ) - return os.path.join(results_path, os.path.splitext(filename)[0] + suffix) - - - @staticmethod - def onnx_to_opset19(onnx_fpath, results_path: Optional[str] = None) -> str: - """ - Convert a ONNX model `onnx_fpath` to be with standard opset19 Q/DQ nodes, return a string - contains a file path to the result ONNX if any conversion is performed, otherwise return `None`. 
- """ - mappings = replace_customop_qdq_with_onnx_qdq( - [onnx_fpath], - NeMoConverter._resolve_opset19_paths(onnx_fpath, results_path)[0], - create_netron_compatible_model=False, - remove_cast_before_q=False, - remove_cast_after_dq=False, - change_qdq_scale_precision="", - ) - if ( - (not mappings) - or (onnx_fpath not in mappings) - or (mappings[onnx_fpath] == None) - ): - G_LOGGER.error(f"Opset19 onnx file conversion failed for {onnx_fpath}.") - assert False - - G_LOGGER.info(f"Converted {onnx_fpath} to {mappings[onnx_fpath]} for opset19.") - return mappings[onnx_fpath] - -def parse_args(): - parser = argparse.ArgumentParser(description='NeMo export script arguments', add_help=True) - parser.add_argument( - "--nemo-model", - help="Set a NeMo model to be used.", - required=False, - default=None, - type=str, - ) - parser.add_argument( - "--nemo-checkpoint", - help="Set a NeMo checkpoint to be used.", - required=False, - default=None, - type=str, - ) - parser.add_argument( - "--onnx-model", - help="A path to load an ONNX model for conversion.", - required=False, - default=None, - type=str, - ) - parser.add_argument( - "--save-onnx-dir", - help="A directory to save the generated ONNX model. Must be writable.", - required=True, - ) - parser.add_argument( - "--opset19", - action="store_true", - help="If set, the ONNX will be converted to opset19.", - default=False - ) - parser.add_argument( - "--use-cache", - action="store_true", - help="If set, the ONNX will have KV-cache inputs and outputs.", - default=False - ) - parser.add_argument( - "--quantize-bmms", - help="Quantize attention BMMs", - action="store_true", - default=False, - ) - parser.add_argument( - "--save-engine", - required=False, - help="If set to a path, a TensorRT engine will be built from ONNX and save to the path.", - ) - parser.add_argument( - "--fp8", - action="store_true", - help="Use FP8 precision during conversion.", - default=False - ) - parser.add_argument( - "--fp16", - action="store_true", - help="Use FP16 precision during conversion.", - default=False - ) - parser.add_argument( - "--bf16", - action="store_true", - help="Use BF16 precision during conversion.", - default=False - ) - parser.add_argument( - "--extra-configs", - required=False, - help='Use this flag to set fields specified in config.yml with a format of --extra-configs="[=][ =]*". 
Values specified by this flag will not override any value set from other flags.', - default=None, - type=str, - ) - args = parser.parse_args() - return args - -def main(): - G_LOGGER.setLevel(level=G_LOGGER.INFO) - - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.yaml") - cfg = omegaconf.OmegaConf.load(config_path) - G_LOGGER.info(f"Loaded configs = {cfg}") - - args = parse_args() - if (args.nemo_model != None or args.nemo_checkpoint != None) and args.onnx_model != None: - G_LOGGER.error("NeMo model and ONNX model cannot be both set.") - exit(1) - - if args.nemo_model == None and args.nemo_checkpoint == None and args.onnx_model == None: - G_LOGGER.error("Either one of --nemo-model, --nemo-checkpoint, or --onnx-model needs to be set.") - exit(1) - - if args.extra_configs != None: - kwargs = args.extra_configs.split(" ") - for kwarg in kwargs: - kw = kwarg.split("=") - if len(kw) != 2: - raise ValueError(f'Arg {kwarg} is not in a format of "="') - def nested_set(dic, keys, value): - for i in range(len(keys)): - if not hasattr(dic, keys[i]): - raise ValueError(f"Cannot find key {keys[:i+1]} in the config.") - if i == len(keys) - 1: - dic[keys[i]] = value - else: - dic = dic[keys[i]] - - G_LOGGER.info(f"Setting {kw[0]} to {kw[1]}") - nested_set(cfg, kw[0].split("."), kw[1]) - G_LOGGER.info(f"Modified Configs = {cfg}") - - # Set precision for conversion - if args.fp16: - cfg.trainer.precision = "16" - cfg.trt_export_options.use_fp16 = True - elif args.bf16: - cfg.trainer.precision = "bf16" - cfg.trt_export_options.use_bf16 = True - else: - cfg.trainer.precision = "32" - - if args.fp8: - cfg.trt_export_options.use_fp8 = True - - if args.quantize_bmms: - cfg.onnx_export_options.quantize_bmms = True - - if os.path.exists(args.save_onnx_dir) and not os.path.isdir(args.save_onnx_dir): - raise ValueError(f"{args.save_onnx_dir} is not a directory.") - - cfg.onnx_model_file = os.path.join(args.save_onnx_dir, "model.onnx") - create_dir_if_not_exist(cfg.onnx_model_file) - - # Convert NeMo model to ONNX model - converter = None - if args.nemo_model or args.nemo_checkpoint: - cfg.gpt_model_file = args.nemo_model - if args.nemo_checkpoint: - cfg.checkpoint_dir = os.path.dirname(args.nemo_checkpoint) - cfg.checkpoint_name = os.path.basename(args.nemo_checkpoint) - converter = NeMoConverter(cfg, MegatronGPTModel) - onnx_name = converter.nemo_to_onnx() - G_LOGGER.info(f"ONNX exported from NeMo {onnx_name}") - elif args.onnx_model: - onnx_name = args.onnx_model - - # Convert Q/DQ nodes to use standard opset19 operators - if args.opset19: - op19_onnx = NeMoConverter.onnx_to_opset19(onnx_name, args.save_onnx_dir) - if op19_onnx != None: - G_LOGGER.info(f"Get opset19 onnx file {op19_onnx}") - onnx_name = op19_onnx - - # Add KV cache to ONNX model - if cfg.use_cache: - G_LOGGER.info(f"Converting {onnx_name} with KV-cache support") - kv_output_policy = "kv_new" - new_dir = os.path.join(args.save_onnx_dir, f"{kv_output_policy}") - onnx_output_fpath = os.path.join(new_dir, onnx_name.split("/")[-1]) - create_dir_if_not_exist(onnx_output_fpath) - if not converter: - converter = NeMoConverter(cfg, MegatronGPTModel) - converter.create_onnx(onnx_name, onnx_output_fpath, kv_output_policy) - onnx_name = onnx_output_fpath - - if cfg.onnx_export_options.prune: - onnx_name = converter.prune_onnx(onnx_name) - - # Convert ONNX model to TRT engine - if args.save_engine: - create_dir_if_not_exist(args.save_engine) - if not converter: - converter = NeMoConverter(cfg, MegatronGPTModel) - 
converter.onnx_to_trt(onnx_name, args.save_engine) - -if __name__ == '__main__': - main() diff --git a/demo/NeMo/patch_te.sh b/demo/NeMo/patch_te.sh deleted file mode 100644 index 4f060dd84..000000000 --- a/demo/NeMo/patch_te.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/sh -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Sourcing messes up the directory detection with readlink. -if [ ! "${0##*/}" = "patch_te.sh" ]; then - echo "Please run this patch script, don't source it." >&2 - return 1 -fi - -NEMO_DIR=$(dirname "$(readlink -f "$0")") - -te_loc="$(pip show transformer_engine | grep '^Location' | awk '{print $2}')" -cd "${te_loc}/transformer_engine" || { - echo "Could not locate transformer-engine python package. Please check if installation proceeded correctly." - exit 1 -} -# Use sys.executable when calling pip within subprocess to recognize virtualenv. -# If patch is already applied, skip it and proceed with the rest of the script, quit otherwise. -# NOTE: patch needs to be updated to track the commit of TE in install.sh. -OUT="$(patch --forward common/__init__.py <"${NEMO_DIR}"/transformer_engine.patch)" || echo "${OUT}" | grep "Skipping patch" -q || { - echo "Could not patch transformer engine because ${OUT}" - exit 1 -} -unset OUT -cd - || exit -unset te_loc diff --git a/demo/NeMo/requirements.txt b/demo/NeMo/requirements.txt deleted file mode 100644 index c715ed76a..000000000 --- a/demo/NeMo/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -nemo-toolkit[nlp]==1.17.0 -onnx==1.14.0 -protobuf==3.20.3 -onnxruntime==1.13.1 -transformers==4.27.0 -cuda-python==12.1.0 -setuptools==65.5.1 -tqdm ---pre --extra-index-url https://download.pytorch.org/whl/cu121 -torch==2.1.0 -torchaudio==2.1.0 -torchvision==0.16.0 -onnx-graphsurgeon==0.3.27 diff --git a/demo/NeMo/run.py b/demo/NeMo/run.py deleted file mode 100644 index 5ba00b5a5..000000000 --- a/demo/NeMo/run.py +++ /dev/null @@ -1,200 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Demonstrates TensorRT capabilities with networks trained by NeMo. 
-Requires Python 3.6+ -""" - -import argparse -import os -import sys -from typing import List, Tuple - -ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(ROOT_DIR) - -sys.path.append('../') # Include one-level up directory so to reuse HuggingFace utils. -from HuggingFace.run import ( - Action, - NetworkScriptAction, - WRAPPER_LIST_ACTION, -) -from HuggingFace.NNDF.logger import G_LOGGER -from HuggingFace.NNDF.general_utils import register_network_folders -from HuggingFace.NNDF.cuda_bootstrapper import bootstrap_ld_library_path - -WRAPPER_RUN_ACTION = "run" -WRAPPER_ACCURACY_ACTION = "accuracy" -WRAPPER_BENCHMARK_ACTION = "benchmark" -WRAPPER_ACTIONS = [WRAPPER_LIST_ACTION, WRAPPER_RUN_ACTION, WRAPPER_ACCURACY_ACTION, WRAPPER_BENCHMARK_ACTION] - -class ListAction(Action): - def __init__(self, networks: List[str], parser: argparse.ArgumentParser): - super().__init__(networks, parser) - self.networks = networks - - def execute(self, args: argparse.Namespace): - print("Networks that are supported by NeMo Demo:") - [print(n) for n in self.networks] - return 0 - -class RunAction(NetworkScriptAction): - def execute(self, args: argparse.Namespace): - module = self.load_script(args.script, args) - module.RUN_CMD._parser = self.parser - - old_path = os.getcwd() - # Execute script in each relevant folder - try: - os.chdir(args.network) - _ = module.RUN_CMD() - finally: - os.chdir(old_path) - - return 0 - - def add_args(self, parser: argparse.ArgumentParser): - super().add_args(parser) - run_group = parser.add_argument_group("run args") - run_group.add_argument("script", choices=self.PER_NETWORK_SCRIPTS) - -class BenchmarkAction(NetworkScriptAction): - def execute(self, args: argparse.Namespace): - module = self.load_script(args.script, args) - module.RUN_CMD._parser = self.parser - - old_path = os.getcwd() - # Execute script in each relevant folder - try: - os.chdir(args.network) - _ = module.RUN_CMD() - finally: - os.chdir(old_path) - - return 0 - - def add_args(self, parser: argparse.ArgumentParser): - super().add_args(parser) - benchmarking_group = parser.add_argument_group("benchmark args") - benchmarking_group.add_argument("script", choices=self.PER_NETWORK_SCRIPTS) - benchmarking_group.add_argument( - "--input-seq-len", - type=int, - help="Specify fixed input sequence length for perf benchmarking. Required for benchmark except when both input_profile_max and output_profile_max are provided for trt", - ) - benchmarking_group.add_argument( - "--output-seq-len", - type=int, - help="Specify fixed output sequence length for perf benchmarking. 
Required for benchmark except when both input_profile_max and output_profile_max are provided for trt", - ) - -class AccuracyAction(NetworkScriptAction): - def execute(self, args: argparse.Namespace): - module = self.load_script(args.script, args) - module.RUN_CMD._parser = self.parser - - old_path = os.getcwd() - # Execute script in each relevant folder - try: - os.chdir(args.network) - _ = module.RUN_CMD() - finally: - os.chdir(old_path) - - return 0 - - def add_args(self, parser: argparse.ArgumentParser): - super().add_args(parser) - accuracy_group = parser.add_argument_group("accuracy args") - accuracy_group.add_argument("script", choices=self.PER_NETWORK_SCRIPTS) - accuracy_group.add_argument( - "--task", - type=str, - default="lambada", - choices=["lambada"], - help="Specify which task to be used for accuracy check.", - ) - -def get_action( - action_name: str, networks: List[str], parser: argparse.ArgumentParser -) -> Action: - return { - WRAPPER_LIST_ACTION: ListAction, - WRAPPER_RUN_ACTION: RunAction, - WRAPPER_BENCHMARK_ACTION: BenchmarkAction, - WRAPPER_ACCURACY_ACTION: AccuracyAction, - }[action_name](networks, parser) - -def verify_python_version(): - if sys.version_info.major < 3 or sys.version_info.minor <= 6: - raise RuntimeError("NeMo OSS Demo does not support Python <= 3.6 due to end-of-life.") - if sys.version_info.major < 3 or sys.version_info.minor < 8 or (sys.version_info.minor == 8 and sys.version_info.micro < 10): - G_LOGGER.warn("NeMo OSS Demo is not tested for Python < 3.8.10") - -def get_default_parser( - description: str = "", add_default_help=False -) -> Tuple[argparse.ArgumentParser, bool]: - """ - Returns argparser for use by main(). Allows the ability to toggle default help message with a custom help flag - so that argparser does not throw SystemExit when --help is passed in. Useful for custom --help functionality. - - Returns: - (argparse.ArgumentParser): argparser used by main() - """ - # This variable is set so that usage errors don't show up in wrapper - parser = argparse.ArgumentParser( - conflict_handler="resolve", - description=description, - add_help=add_default_help, - prog="run.py", - ) - - required_group = parser.add_argument_group("required wrapper arguments") - required_group.add_argument("action", choices=WRAPPER_ACTIONS) - return parser - -def main() -> None: - """ - Parses network folders and responsible for passing --help flags to subcommands if --network is provided. - """ - # Verify python version support - verify_python_version() - - # Get all available network scripts - networks = register_network_folders(os.getcwd()) - - # Add network folder for entry point - description = "Runs TensorRT networks that are based-off of NeMo variants." - parser = get_default_parser(description) - - # Get the general network wrapper help - known_args, _ = parser.parse_known_args() - - # Delegate parser to action specifics - action = get_action(known_args.action, networks, parser) - known_args, _ = parser.parse_known_args() - - # If bootstrap occurs, then the spawned process completes the rest of demo. - # We can exit safely. We spawn after parsing basic args to reduce loading churn on rudimentary help commands. 
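# For reference, the wrapper is typically driven along these lines (the <script> value is
# resolved from the registered network folders, so the exact names are placeholders):
#
#   python3 run.py list
#   python3 run.py run <script> ...
#   python3 run.py accuracy <script> --task lambada ...
#   python3 run.py benchmark <script> --input-seq-len 128 --output-seq-len 20 ...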
- if bootstrap_ld_library_path(): - sys.exit(0) - - return action.execute(known_args) - -if __name__ == "__main__": - main() diff --git a/demo/NeMo/transformer_engine.patch b/demo/NeMo/transformer_engine.patch deleted file mode 100644 index c4c96dea5..000000000 --- a/demo/NeMo/transformer_engine.patch +++ /dev/null @@ -1,17 +0,0 @@ ---- common/__init__.py 2023-06-22 17:22:59.046208583 +0000 -+++ common/backup.py 2023-06-22 20:53:01.154819280 +0000 -@@ -7,12 +7,13 @@ - import os - import platform - import subprocess -+import sys - - - def get_te_path(): - """Find Transformer Engine install path using pip""" - -- command = ["pip", "show", "transformer_engine"] -+ command = [sys.executable, "-m", "pip", "show", "transformer_engine"] - result = subprocess.run(command, capture_output=True, check=True, text=True) - result = result.stdout.replace("\n", ":").split(":") - return result[result.index("Location")+1].strip()
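The patch above swaps the bare "pip show" invocation for "sys.executable -m pip show" so the lookup always resolves against the interpreter that is actually running, which matters inside virtualenvs. A minimal standalone sketch of the same pattern, independent of Transformer Engine (the package name below is only an example):

import subprocess
import sys

def package_location(name: str) -> str:
    """Return the install location reported by `pip show` for the current interpreter."""
    result = subprocess.run(
        [sys.executable, "-m", "pip", "show", name],
        capture_output=True, check=True, text=True,
    )
    fields = result.stdout.replace("\n", ":").split(":")
    return fields[fields.index("Location") + 1].strip()

print(package_location("onnx"))  # e.g. .../lib/python3.10/site-packages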