
Commit

Fix mtbench test
Signed-off-by: Igor Gitman <[email protected]>
Kipok committed Nov 25, 2024
1 parent a5db4ed commit 7947d76
Showing 2 changed files with 35 additions and 24 deletions.
57 changes: 34 additions & 23 deletions nemo_skills/inference/server/serve_trt.py
@@ -391,23 +391,26 @@ def _stream(
 
 
 class TensorRTLLM:
-    def __init__(self,
-                 model_path: str,
-                 max_batch_size: Optional[int] = None,
-                 kv_cache_free_gpu_memory_fraction: Optional[float] = None,
-                 ):
+    def __init__(
+        self,
+        model_path: str,
+        max_batch_size: Optional[int] = None,
+        kv_cache_free_gpu_memory_fraction: Optional[float] = None,
+    ):
         with open(Path(model_path) / "config.json", 'r') as f:
             config = json.load(f)
         self.tokenizer, self.pad_id, self.end_id = load_tokenizer(
             tokenizer_dir=model_path, model_name=read_model_name(config)
         )
 
-        runner_kwargs = dict(engine_dir=model_path,
-                             rank=tensorrt_llm.mpi_rank(),
-                             max_batch_size=max_batch_size,
-                             kv_cache_free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction,
-                             enable_chunked_context=True,
-                             kv_cache_enable_block_reuse=True)
+        runner_kwargs = dict(
+            engine_dir=model_path,
+            rank=tensorrt_llm.mpi_rank(),
+            max_batch_size=max_batch_size,
+            kv_cache_free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction,
+            enable_chunked_context=True,
+            kv_cache_enable_block_reuse=True,
+        )
 
         self.runner = ModelRunnerCpp.from_dir(**runner_kwargs)

@@ -510,15 +513,19 @@ class GenerationResponse(BaseModel):
 
 
 class MPIWrapper:
-    def __init__(self,
-                 model_path: str,
-                 max_batch_size: Optional[int] = None,
-                 kv_cache_free_gpu_memory_fraction: Optional[float] = None):
+    def __init__(
+        self,
+        model_path: str,
+        max_batch_size: Optional[int] = None,
+        kv_cache_free_gpu_memory_fraction: Optional[float] = None,
+    ):
         self.comm = MPI.COMM_WORLD
         self.rank = self.comm.Get_rank()
-        self.model = TensorRTLLM(model_path=model_path,
-                                 max_batch_size=max_batch_size,
-                                 kv_cache_free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction)
+        self.model = TensorRTLLM(
+            model_path=model_path,
+            max_batch_size=max_batch_size,
+            kv_cache_free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction,
+        )
         self.app = None
         if self.rank == 0:
             self.app = self._create_app()
@@ -577,12 +584,16 @@ def main():
     parser.add_argument("--host", type=str, default="0.0.0.0")
     parser.add_argument("--port", type=int, default=5000)
     parser.add_argument("--max_batch_size", type=int, default=None, help="Maximum batch size")
-    parser.add_argument("--kv_cache_free_gpu_memory_fraction", type=float, default=None, help="Free GPU memory fraction for cache")
+    parser.add_argument(
+        "--kv_cache_free_gpu_memory_fraction", type=float, default=None, help="Free GPU memory fraction for cache"
+    )
     args = parser.parse_args()
 
-    wrapper = MPIWrapper(model_path=args.model_path,
-                         max_batch_size=args.max_batch_size,
-                         kv_cache_free_gpu_memory_fraction=args.kv_cache_free_gpu_memory_fraction)
+    wrapper = MPIWrapper(
+        model_path=args.model_path,
+        max_batch_size=args.max_batch_size,
+        kv_cache_free_gpu_memory_fraction=args.kv_cache_free_gpu_memory_fraction,
+    )
     wrapper.run(host=args.host, port=args.port)


@@ -596,4 +607,4 @@ def filter(self, record):
     logging.getLogger('uvicorn.access').addFilter(LogFilter())
     logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
 
-    main()
+    main()
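
Note: a minimal usage sketch of the entry point reformatted above. The import path mirrors the file touched by this commit; the engine path and tuning values are placeholders, not taken from the diff.

# Mirrors main() in serve_trt.py: build the wrapper with the new
# keyword-per-line call style, then serve (only rank 0 hosts the app).
from nemo_skills.inference.server.serve_trt import MPIWrapper  # assumed import path

wrapper = MPIWrapper(
    model_path="/models/my-trtllm-engine",  # placeholder TRT-LLM engine dir
    max_batch_size=16,                      # optional; None keeps the engine default
    kv_cache_free_gpu_memory_fraction=0.9,  # optional; None keeps the default
)
wrapper.run(host="0.0.0.0", port=5000)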
2 changes: 1 addition & 1 deletion tests/test_generation.py
@@ -78,7 +78,7 @@ def test_eval_mtbench_api():
         f"ns eval "
         f" --cluster test-local --config_dir {Path(__file__).absolute().parent / 'gpu-tests'} "
         f" --server_type=openai "
-        f" --model=meta/llama-3.1-8b-instruct "
+        f" --model=meta/llama-3.1-405b-instruct "
         f" --server_address=https://integrate.api.nvidia.com/v1 "
         f" --benchmarks=mt-bench:0 "
         f" --output_dir=/tmp/nemo-skills-tests/mtbench-api "
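Note: outside of pytest, the same evaluation can be launched directly. A sketch assuming the ns CLI is installed and NVIDIA API credentials are configured; the config_dir value is an assumption matching the test layout, not shown in the diff.

# Hypothetical manual run of the command the test assembles above.
import subprocess

subprocess.run(
    "ns eval"
    " --cluster test-local --config_dir tests/gpu-tests"  # assumed config location
    " --server_type=openai"
    " --model=meta/llama-3.1-405b-instruct"
    " --server_address=https://integrate.api.nvidia.com/v1"
    " --benchmarks=mt-bench:0"
    " --output_dir=/tmp/nemo-skills-tests/mtbench-api",
    shell=True,
    check=True,
)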
