 import unittest
 
-import numpy as np
-import pycuda.driver as cuda
 import tensorrt as trt
 import torch
 import torch_tensorrt
+from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule
 from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity
 
-try:
-    import pycuda.autoprimaryctx
-except ModuleNotFoundError:
-    import pycuda.autoinit
-
-
-class HostDeviceMem(object):
-    def __init__(self, host_mem, device_mem):
-        self.host = host_mem
-        self.device = device_mem
-
-    def __str__(self):
-        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
-
-    def __repr__(self):
-        return self.__str__()
-
-
-def allocate_buffers(engine):
-    inputs = []
-    outputs = []
-    bindings = []
-    stream = cuda.Stream()
-    for binding in engine:
-        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
-        dtype = trt.nptype(engine.get_binding_dtype(binding))
-        # Allocate host and device buffers
-        host_mem = cuda.pagelocked_empty(size, dtype)
-        device_mem = cuda.mem_alloc(host_mem.nbytes)
-        # Append the device buffer to device bindings.
-        bindings.append(int(device_mem))
-        # Append to the appropriate list.
-        if engine.binding_is_input(binding):
-            inputs.append(HostDeviceMem(host_mem, device_mem))
-        else:
-            outputs.append(HostDeviceMem(host_mem, device_mem))
-    return inputs, outputs, bindings, stream
-
-
-def do_inference_v2(context, bindings, inputs, outputs, stream):
-    # Transfer input data to the GPU.
-    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
-    # Run inference.
-    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
-    # Transfer predictions back from the GPU.
-    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
-    # Synchronize the stream
-    stream.synchronize()
-    # Return only the host outputs.
-    return [out.host for out in outputs]
-
-def cosine_similarity(gt_tensor, pred_tensor):
-    gt_tensor = gt_tensor.flatten().to(torch.float32)
-    pred_tensor = pred_tensor.flatten().to(torch.float32)
-    if torch.sum(gt_tensor) == 0.0 or torch.sum(pred_tensor) == 0.0:
-        if torch.allclose(gt_tensor, pred_tensor, atol=1e-4, rtol=1e-4, equal_nan=True):
-            return 1.0
-    res = torch.nn.functional.cosine_similarity(gt_tensor, pred_tensor, dim=0, eps=1e-6)
-    res = res.cpu().detach().item()
-
-    return res
-
 
 class TestConvertMethodToTrtEngine(unittest.TestCase):
     def test_convert_module(self):
@@ -91,19 +29,9 @@ def forward(self, a, b):
         with trt.Logger() as logger, trt.Runtime(logger) as runtime:
             engine = runtime.deserialize_cuda_engine(trt_engine_str)
 
-            # Allocate memory for inputs and outputs
-            inputs, outputs, bindings, stream = allocate_buffers(engine)
-            context = engine.create_execution_context()
-
-            # Copy input data to buffer (need .ravel() here, as the inputs[0] buffer is (4,) not (2, 2))
-            np.copyto(inputs[0].host, input_data_0.ravel())
-            np.copyto(inputs[1].host, input_data_1.ravel())
-
             # Inference on TRT Engine
-            trt_outputs = do_inference_v2(
-                context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream
-            )
-            trt_output = torch.from_numpy(trt_outputs[0])
+            py_trt_module = PythonTorchTensorRTModule(engine, ["a", "b"], ["output0"])
+            trt_output = py_trt_module(input_data_0, input_data_1).cpu()
 
             # Inference on PyTorch model
             model_output = model(input_data_0, input_data_1)
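
For context, the runtime path this diff switches to can be exercised on its own. The sketch below is illustrative, not part of the change: the helper name run_trt_engine is hypothetical, and the engine I/O names "a", "b", and "output0" are taken from the test above and assumed to match the engine's bindings. PythonTorchTensorRTModule wraps a deserialized engine and handles the device-buffer allocation and host/device copies that the deleted HostDeviceMem / allocate_buffers / do_inference_v2 pycuda plumbing did by hand.

import tensorrt as trt
import torch
from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule


def run_trt_engine(trt_engine_str, a, b):
    # Deserialize the serialized engine bytes into a TensorRT engine.
    with trt.Logger() as logger, trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(trt_engine_str)
        # Wrap the engine; input/output names must match the engine's
        # I/O tensors ("a", "b", "output0" here, as in the test above).
        module = PythonTorchTensorRTModule(engine, ["a", "b"], ["output0"])
        # Inputs are assumed to be CUDA tensors; .cuda() is a no-op if so.
        return module(a.cuda(), b.cuda()).cpu()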