
How to use "model.trt" in Python

I have a PyTorch model that I exported to ONNX and converted to a TensorRT engine with the following command:

trtexec --onnx=model.onnx --batch=400 --saveEngine=model.trt

All of this works, but how do I now load this model.trt in python and run the inference?

Found an answer based on this tutorial.

import numpy as np
import tensorrt as trt
import pycuda.driver as cuda

cuda.init()  # initialize the CUDA driver before creating a device/context
dev = cuda.Device(0)
ctx = dev.make_context()

try:
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    with open("model.trt", 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    with engine.create_execution_context() as context:
        # get sizes of input and output and allocate memory required for input data and for output data
        for binding in engine:
            if engine.binding_is_input(binding):  # we expect only one input
                input_shape = engine.get_binding_shape(binding)
                input_size = trt.volume(input_shape) * engine.max_batch_size * np.dtype(np.float32).itemsize  # in bytes
                device_input = cuda.mem_alloc(input_size)
            else:  # and one output
                output_shape = engine.get_binding_shape(binding)
                # create page-locked memory buffers (i.e. won't be swapped to disk)
                host_output = cuda.pagelocked_empty(trt.volume(output_shape) * engine.max_batch_size, dtype=np.float32)
                device_output = cuda.mem_alloc(host_output.nbytes)

        stream = cuda.Stream()

        # `batch` is your already-preprocessed input data (shape: [max_batch_size, *input_shape])
        host_input = np.array(batch, dtype=np.float32, order='C')
        cuda.memcpy_htod_async(device_input, host_input, stream)

        context.execute_async(batch_size=engine.max_batch_size, bindings=[int(device_input), int(device_output)], stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(host_output, device_output, stream)
        stream.synchronize()

        # postprocess results
        output_data = host_output.reshape(engine.max_batch_size, output_shape[0]).T

finally:
    ctx.pop()
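
Note that batch in the snippet above is not defined there: it stands for the already-preprocessed input data. A minimal sketch of what it could look like, assuming one float32 input of shape (3, 224, 224) and random dummy data (both the shape and the batch size are placeholders, not taken from the question):

import numpy as np

max_batch_size = 400             # must match the --batch value passed to trtexec
input_shape = (3, 224, 224)      # placeholder; use your engine's real input shape

# dummy data standing in for a real, preprocessed batch
batch = np.random.rand(max_batch_size, *input_shape).astype(np.float32)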

The official documentation has a lot of examples. The basic steps to follow are:

  • ONNX parser: takes a trained model in ONNX format as input and populates a network object in TensorRT
  • Builder: takes a network in TensorRT and generates an engine that is optimized for the target platform
  • Engine: takes input data, performs inferences and emits inference output
  • Logger: object associated with the builder and engine to capture errors, warnings and other information during the build and inference phases

An example for the engine is:

import tensorrt as trt
import pycuda.autoinit  # creates the CUDA context the builder needs
import pycuda.driver as cuda
from onnx import ModelProto

import numpy as np

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(TRT_LOGGER)

batch_size = 1
explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
#inp_shape = [batch_size, 3, 1024, 1024] # the shape I was using


def build_engine(onnx_path, shape):
    with trt.Builder(TRT_LOGGER) as builder,builder.create_builder_config() as config,\
    builder.create_network(explicit_batch) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        if builder.platform_has_fast_fp16:
            builder.fp16_mode = True
        builder.max_workspace_size = (1 << 30)
        #builder.max_workspace_size = (3072 << 20)
        #profile = builder.create_optimization_profile()
        #config.max_workspace_size = (3072 << 20)
        #config.add_optimization_profile(profile)
        print("parsing")
        with open(onnx_path, 'rb') as model:
            print("onnx found")
            if not parser.parse(model.read()):
                print("parse failed")
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
            #parser.parse(model.read())
        # If the parser did not mark any outputs, mark the last layer's output manually
        last_layer = network.get_layer(network.num_layers - 1)
        if network.num_outputs == 0:
            network.mark_output(last_layer.get_output(0))
        network.get_input(0).shape = shape

        engine = builder.build_cuda_engine(network)
        return engine


def save_engine(engine, file_name):
    buf = engine.serialize()
    with open(file_name, 'wb') as f:
        f.write(buf)


def load_engine(trt_runtime, plan_path):
    with open(plan_path, 'rb') as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine


if __name__ == "__main__":
    onnx_path = "./path/to/your/model.onnx"
    engine_name = "./path/to/engine.plan"
    
    model = ModelProto()
    with open(onnx_path, "rb") as f:
        model.ParseFromString(f.read())

    d0 = model.graph.input[0].type.tensor_type.shape.dim[1].dim_value
    d1 = model.graph.input[0].type.tensor_type.shape.dim[2].dim_value
    d2 = model.graph.input[0].type.tensor_type.shape.dim[3].dim_value
    shape = [batch_size, d0, d1, d2]
    print(shape)

    print("trying to build engine")
    engine = build_engine(onnx_path, shape)
    save_engine(engine, engine_name)


    print("finished")

Follow this page for another example and information.
