I converted the decoder of a TTS model (with HiFi-GAN vocoder) from PyTorch to ONNX and then to a TensorRT engine. During inference, both input and output shapes are dynamic and change with each call. Currently, I'm allocating and deallocating memory on each inference run, but I'm unsure whether that is the best approach.
System Details
TensorRT: 10.5.0
CUDA: 12.1
OS: Ubuntu 20.04
GPU: A100
Problem
Dynamic shape handling: Is my approach of allocating and deallocating memory on every inference call reasonable, and is my overall inference code correct?
Output shape: My code does not handle dynamic output shapes correctly; the output always comes back with size (1,).
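My current guess at what the fix should look like (an untested sketch, using the `context` and the input arrays defined in inference.py further down; I'm assuming the output is float32):

```python
# Untested sketch: size the output buffer from the resolved shape instead of a placeholder.
# Assumes `context`, `asr`, `f0`, `n`, `ref` from inference.py below.
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda

context.set_input_shape("asr", asr.shape)
context.set_input_shape("f0", f0.shape)
context.set_input_shape("n", n.shape)
context.set_input_shape("ref", ref.shape)

# Once all input shapes are set, the context should report the resolved output shape.
out_shape = tuple(context.get_tensor_shape("output"))  # e.g. (1, 1, T_out)
# Assuming float32 here; trt.nptype(engine.get_tensor_dtype("output")) would be safer.
host_out = cuda.pagelocked_empty(trt.volume(out_shape), np.float32)
device_out = cuda.mem_alloc(host_out.nbytes)
```

Is this the intended way to size dynamic outputs?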
ONNX-to-engine conversion:
```python
import tensorrt as trt
import numpy as np
import pycuda.autoinit

# Convert ONNX to TensorRT engine
def build_engine(onnx_file_path, min_shape, opt_shape, max_shape):
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    # Explicit batch is required by the ONNX parser
    EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, logger)

    success = parser.parse_from_file(onnx_file_path)
    for idx in range(parser.num_errors):
        print(parser.get_error(idx))
    if not success:
        return None

    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1 GiB

    # Set dynamic shapes via an optimization profile
    profile = builder.create_optimization_profile()
    profile.set_shape("asr", min_shape['asr'], opt_shape['asr'], max_shape['asr'])
    profile.set_shape("f0", min_shape['f0'], opt_shape['f0'], max_shape['f0'])
    profile.set_shape("n", min_shape['n'], opt_shape['n'], max_shape['n'])
    profile.set_shape("ref", min_shape['ref'], opt_shape['ref'], max_shape['ref'])
    config.add_optimization_profile(profile)
    config.default_device_type = trt.DeviceType.GPU

    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        print("Failed to build engine")
        return None

    with open("sample.engine", "wb") as f:
        f.write(serialized_engine)
    return serialized_engine

# Main execution
def main():
    onnx_file_path = "new_decoder.onnx"

    # Define shapes
    hidden_dim, style_dim = 512, 128
    min_time_dim, max_time_dim, opt_time_dim = 28, 1106, 56
    min_shape = {
        'asr': (1, hidden_dim, min_time_dim),
        'f0': (1, min_time_dim * 2),
        'n': (1, min_time_dim * 2),
        'ref': (1, style_dim)
    }
    opt_shape = {
        'asr': (1, hidden_dim, opt_time_dim),
        'f0': (1, opt_time_dim * 2),
        'n': (1, opt_time_dim * 2),
        'ref': (1, style_dim)
    }
    max_shape = {
        'asr': (1, hidden_dim, max_time_dim),
        'f0': (1, max_time_dim * 2),
        'n': (1, max_time_dim * 2),
        'ref': (1, style_dim)
    }

    # Build TensorRT engine
    engine = build_engine(onnx_file_path, min_shape, opt_shape, max_shape)
    if engine is None:
        print("Failed to build engine")
        return

if __name__ == "__main__":
    main()
```
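As a sanity check on the built engine, I believe the min/opt/max ranges baked into sample.engine can be printed like this (untested sketch; I'm assuming `get_tensor_profile_shape` is the right call in the TensorRT 10 Python API), mainly to confirm they match the shapes I pass at inference time:

```python
# Untested sketch: print the optimization-profile ranges stored in sample.engine.
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(logger)
with open("sample.engine", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())

for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
        # Returns the (min, opt, max) shapes of profile 0 for this input
        print(name, engine.get_tensor_profile_shape(name, 0))
```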
inference.py:
```python
import numpy as np
import torch
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # initializes CUDA driver and context
import time

class HostDeviceMem(object):
    '''
    Helper class to record host-device memory pointer pairs
    '''
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

# Define constants for input dimensions
hidden_dim, style_dim = 512, 128
min_time_dim, max_time_dim, opt_time_dim = 28, 1106, 56

# Define dynamic shapes for the inputs
min_shape = {
    'asr': (1, hidden_dim, min_time_dim),
    'f0': (1, min_time_dim * 2),
    'n': (1, min_time_dim * 2),
    'ref': (1, style_dim)
}
opt_shape = {
    'asr': (1, hidden_dim, opt_time_dim),
    'f0': (1, opt_time_dim * 2),
    'n': (1, opt_time_dim * 2),
    'ref': (1, style_dim)
}
max_shape = {
    'asr': (1, hidden_dim, max_time_dim),
    'f0': (1, max_time_dim * 2),
    'n': (1, max_time_dim * 2),
    'ref': (1, style_dim)
}

# Create random example inputs matching the optimal shape
asr = torch.randn(opt_shape['asr']).numpy()
f0 = torch.randn(opt_shape['f0']).numpy()
n = torch.randn(opt_shape['n']).numpy()
ref = torch.randn(opt_shape['ref']).numpy()

logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(logger)
with open("sample.engine", "rb") as f:
    serialized_engine = f.read()
engine = runtime.deserialize_cuda_engine(serialized_engine)
context = engine.create_execution_context()

inputs, outputs, bindings = [], [], []
stream = cuda.Stream()

for i in range(engine.num_io_tensors):
    tensor_name = engine.get_tensor_name(i)
    print("Tensor:", tensor_name, "Shape:", engine.get_tensor_shape(tensor_name))

def infer(asr, f0, n, ref):
    global inputs, outputs, bindings  # so cleanup() can free what infer() allocated
    # Actual shapes of the inputs
    input_shapes = [asr.shape, f0.shape, n.shape, ref.shape]
    inputs = []
    outputs = []
    bindings = []

    context.set_input_shape("asr", asr.shape)
    context.set_input_shape("f0", f0.shape)
    context.set_input_shape("n", n.shape)
    context.set_input_shape("ref", ref.shape)

    for i in range(engine.num_io_tensors):
        tensor_name = engine.get_tensor_name(i)
        dtype = trt.nptype(engine.get_tensor_dtype(tensor_name))
        # Check if it's an input or output tensor
        if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
            shape = input_shapes.pop(0)  # Get the shape from the input shapes
            size = trt.volume(shape)
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            inputs.append(HostDeviceMem(host_mem, device_mem))
            bindings.append(int(device_mem))
            np.copyto(inputs[-1].host, locals()[tensor_name].ravel())  # assumes the function arguments are named like the tensors
        else:
            temp_shape = (1,)  # Placeholder; this is what I need to size dynamically
            size = trt.volume(temp_shape)
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            outputs.append(HostDeviceMem(host_mem, device_mem))
            bindings.append(int(device_mem))

    # Transfer inputs to device
    for i in range(len(inputs)):
        cuda.memcpy_htod_async(inputs[i].device, inputs[i].host, stream)

    # Set tensor address for each input/output
    for i in range(engine.num_io_tensors):
        context.set_tensor_address(engine.get_tensor_name(i), bindings[i])

    # Enqueue inference on the stream
    context.execute_async_v3(stream.handle)

    # Transfer predictions back
    cuda.memcpy_dtoh_async(outputs[0].host, outputs[0].device, stream)

    # Synchronize the stream
    stream.synchronize()
    return outputs[0].host

def cleanup():
    for input_mem in inputs:
        input_mem.device.free()  # Free device memory for each input
    for output_mem in outputs:
        output_mem.device.free()  # Free device memory for each output

# Run inference
start_time = time.time()
output = infer(asr, f0, n, ref)
end_time = time.time()

# Print time in milliseconds
print("Time taken:", (end_time - start_time) * 1000, "ms")
print("Output shape:", output.shape)

# Clean up memory after inference
cleanup()
```
Output:
```
Tensor: asr Shape: (1, 512, -1)
Tensor: f0 Shape: (1, -1)
Tensor: n Shape: (1, -1)
Tensor: ref Shape: (1, 128)
Tensor: output Shape: (1, 1, -1)   # all tensors report the expected shapes, with -1 marking the dynamic dimension
Time taken: 1.0862350463867188 ms
Output shape: (1,)                 # the output shape is always (1,); this is what I need to fix
```
Are there optimized methods for managing dynamic shapes more efficiently for this setup? Any help or guidance would be greatly appreciated!
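For reference, the direction I was considering for reusing buffers across calls (a rough, untested sketch assuming the `engine`, `context`, `stream`, and `max_shape` objects from inference.py above): allocate every buffer once at the profile maximum, then per call only set the actual input shapes and copy the resolved number of elements.

```python
# Rough, untested sketch of per-call buffer reuse.
# Assumes `engine`, `context`, `stream`, and `max_shape` from inference.py above.
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda

# Bind the profile-max input shapes once so the context reports the largest possible output.
for name, shp in max_shape.items():
    context.set_input_shape(name, shp)

# Allocate every IO buffer once at its maximum volume and register the addresses.
buffers = {}
for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    dtype = trt.nptype(engine.get_tensor_dtype(name))
    max_volume = trt.volume(context.get_tensor_shape(name))
    host = cuda.pagelocked_empty(max_volume, dtype)
    device = cuda.mem_alloc(host.nbytes)
    buffers[name] = (host, device)
    context.set_tensor_address(name, int(device))  # device pointers never change

def infer_reused(feeds):
    """feeds = {'asr': ..., 'f0': ..., 'n': ..., 'ref': ...} as numpy arrays."""
    for name, arr in feeds.items():
        context.set_input_shape(name, arr.shape)
        host, device = buffers[name]
        host[:arr.size] = arr.ravel()
        cuda.memcpy_htod_async(device, host, stream)  # copies the full max-size buffer for simplicity
    context.execute_async_v3(stream.handle)
    out_shape = tuple(context.get_tensor_shape("output"))  # resolved from the actual input shapes
    host_out, device_out = buffers["output"]
    cuda.memcpy_dtoh_async(host_out, device_out, stream)
    stream.synchronize()
    return host_out[:trt.volume(out_shape)].reshape(out_shape)
```

If this is roughly the intended pattern, the only per-call work is setting shapes and copying data, with no allocations; the tensor addresses are set once since the device pointers never change. Is that the recommended way, or is there a better option?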