Commit a0053c5

update trt inference with PythonTorchTensorRTModule
1 parent 6610856

File tree: 1 file changed, +3 −75 lines

tests/py/dynamo/runtime/test_convert_method_to_trt_engine.py

Lines changed: 3 additions & 75 deletions
@@ -1,73 +1,11 @@
 import unittest

-import numpy as np
-import pycuda.driver as cuda
 import tensorrt as trt
 import torch
 import torch_tensorrt
+from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule
 from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity

-try:
-    import pycuda.autoprimaryctx
-except ModuleNotFoundError:
-    import pycuda.autoinit
-
-
-class HostDeviceMem(object):
-    def __init__(self, host_mem, device_mem):
-        self.host = host_mem
-        self.device = device_mem
-
-    def __str__(self):
-        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
-
-    def __repr__(self):
-        return self.__str__()
-
-
-def allocate_buffers(engine):
-    inputs = []
-    outputs = []
-    bindings = []
-    stream = cuda.Stream()
-    for binding in engine:
-        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
-        dtype = trt.nptype(engine.get_binding_dtype(binding))
-        # Allocate host and device buffers
-        host_mem = cuda.pagelocked_empty(size, dtype)
-        device_mem = cuda.mem_alloc(host_mem.nbytes)
-        # Append the device buffer to device bindings.
-        bindings.append(int(device_mem))
-        # Append to the appropriate list.
-        if engine.binding_is_input(binding):
-            inputs.append(HostDeviceMem(host_mem, device_mem))
-        else:
-            outputs.append(HostDeviceMem(host_mem, device_mem))
-    return inputs, outputs, bindings, stream
-
-
-def do_inference_v2(context, bindings, inputs, outputs, stream):
-    # Transfer input data to the GPU.
-    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
-    # Run inference.
-    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
-    # Transfer predictions back from the GPU.
-    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
-    # Synchronize the stream
-    stream.synchronize()
-    # Return only the host outputs.
-    return [out.host for out in outputs]
-
-    gt_tensor = gt_tensor.flatten().to(torch.float32)
-    pred_tensor = pred_tensor.flatten().to(torch.float32)
-    if torch.sum(gt_tensor) == 0.0 or torch.sum(pred_tensor) == 0.0:
-        if torch.allclose(gt_tensor, pred_tensor, atol=1e-4, rtol=1e-4, equal_nan=True):
-            return 1.0
-    res = torch.nn.functional.cosine_similarity(gt_tensor, pred_tensor, dim=0, eps=1e-6)
-    res = res.cpu().detach().item()
-
-    return res
-

 class TestConvertMethodToTrtEngine(unittest.TestCase):
     def test_convert_module(self):
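
(Aside: the stray block removed after do_inference_v2's return statement appears to duplicate the body of the cosine_similarity helper that this file already imports from torch_tensorrt.dynamo.utils. A minimal sketch of how the test uses the imported check; the stand-in tensors below are illustrative, not from this diff:)

    import torch
    from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity

    # Stand-ins for the PyTorch reference output and the TRT engine output.
    model_output = torch.randn(2, 2)
    trt_output = model_output.clone()

    # cosine_similarity flattens both tensors and returns a scalar similarity;
    # anything above COSINE_THRESHOLD is treated as a functional match.
    cos_sim = cosine_similarity(model_output, trt_output)
    assert cos_sim > COSINE_THRESHOLD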
@@ -91,19 +29,9 @@ def forward(self, a, b):
         with trt.Logger() as logger, trt.Runtime(logger) as runtime:
             engine = runtime.deserialize_cuda_engine(trt_engine_str)

-            # Allocate memory for inputs and outputs
-            inputs, outputs, bindings, stream = allocate_buffers(engine)
-            context = engine.create_execution_context()
-
-            # Copy input data to buffer (need .ravel() here, as the inputs[0] buffer is (4,) not (2, 2))
-            np.copyto(inputs[0].host, input_data_0.ravel())
-            np.copyto(inputs[1].host, input_data_1.ravel())
-
             # Inference on TRT Engine
-            trt_outputs = do_inference_v2(
-                context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream
-            )
-            trt_output = torch.from_numpy(trt_outputs[0])
+            py_trt_module = PythonTorchTensorRTModule(engine, ["a", "b"], ["output0"])
+            trt_output = py_trt_module(input_data_0, input_data_1).cpu()

         # Inference on PyTorch model
         model_output = model(input_data_0, input_data_1)
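
Taken together, the updated test path looks roughly like the sketch below. The binding names "a", "b", "output0" and the PythonTorchTensorRTModule wrapping come straight from the diff; the toy model, the (2, 2) input shapes, and the exact convert_method_to_trt_engine arguments (including ir="dynamo") are assumptions for illustration:

    import tensorrt as trt
    import torch
    import torch_tensorrt
    from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule


    class Add(torch.nn.Module):
        def forward(self, a, b):
            return torch.add(a, b)


    model = Add().eval().cuda()
    input_data_0 = torch.randn(2, 2).cuda()
    input_data_1 = torch.randn(2, 2).cuda()

    # Serialize the model's forward method to a TensorRT engine string.
    trt_engine_str = torch_tensorrt.convert_method_to_trt_engine(
        model, "forward", inputs=[input_data_0, input_data_1], ir="dynamo"
    )

    with trt.Logger() as logger, trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(trt_engine_str)

        # PythonTorchTensorRTModule takes over the buffer allocation and
        # host/device transfers that the removed pycuda helpers used to do.
        py_trt_module = PythonTorchTensorRTModule(engine, ["a", "b"], ["output0"])
        trt_output = py_trt_module(input_data_0, input_data_1).cpu()

Dropping the pycuda path removes the manual HostDeviceMem bookkeeping entirely: the module accepts and returns torch tensors, so the test no longer needs explicit host/device copies or stream synchronization.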
