Skip to content

Commit 744c1a8

Browse files
committed
chore: Functionalize inputs setup
1 parent 6b3097e commit 744c1a8

File tree

4 files changed

+135
-115
lines changed

4 files changed

+135
-115
lines changed

core/runtime/execute_engine.cpp

Lines changed: 72 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,77 @@ bool _validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngi
9191

9292
return false;
9393
}
94+
void setup_input_tensors(
95+
std::vector<at::Tensor> inputs,
96+
c10::intrusive_ptr<TRTEngine> compiled_engine,
97+
bool need_cudagraphs_record) {
98+
// this is a buffer to store shape tensor input addresses throughout the runtime scope
99+
std::list<std::vector<int64_t>> inputShapeTensorValues;
100+
std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
101+
102+
for (size_t i = 0; i < inputs.size(); i++) {
103+
std::string name = compiled_engine->in_binding_names[i];
104+
105+
TORCHTRT_CHECK(
106+
inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
107+
108+
auto expected_type =
109+
util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
110+
TORCHTRT_CHECK(
111+
inputs[i].dtype() == expected_type,
112+
"Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());
113+
114+
auto dims = core::util::toDims(inputs[i].sizes());
115+
auto shape = core::util::toVec(dims);
116+
LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
117+
118+
if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
119+
// Shape tensor inputs are cast to int64 explicitly.
120+
// Refer to
121+
// https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
122+
auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
123+
std::vector<int64_t> inputs_cpu_vec(
124+
input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
125+
inputShapeTensorValues.emplace_back(inputs_cpu_vec);
126+
TORCHTRT_CHECK(
127+
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
128+
"Error while setting the tensor address for shape inputs");
129+
130+
if (CUDAGRAPHS_MODE) {
131+
// @peri044 I don't know if this makes sense since they are supposed to be GPU buffers
132+
compiled_engine->input_buffers[i] = input_cpu;
133+
}
134+
TORCHTRT_CHECK(
135+
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
136+
"Error while setting the tensor address for shape inputs");
94137

138+
} else {
139+
at::Tensor contig_input = inputs[i].view(shape).contiguous();
140+
formatted_inputs.emplace_back(std::move(contig_input));
141+
142+
if (need_cudagraphs_record) {
143+
// Create a new persistent input buffer
144+
compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
145+
}
146+
147+
TORCHTRT_CHECK(
148+
compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
149+
150+
if (CUDAGRAPHS_MODE) {
151+
// If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
152+
compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
153+
TORCHTRT_CHECK(
154+
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()),
155+
"Error while setting the input tensor address for inputs");
156+
} else {
157+
// Otherwise use the formatted buffer directly
158+
TORCHTRT_CHECK(
159+
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()),
160+
"Error while setting the input tensor address for inputs");
161+
}
162+
}
163+
}
164+
}
95165
std::vector<at::Tensor> create_output_tensors(c10::intrusive_ptr<TRTEngine> compiled_engine) {
96166
std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
97167
for (auto output_indices : compiled_engine->out_binding_map) {
@@ -144,11 +214,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
144214
compiled_engine->cudagraph.reset();
145215
}
146216

147-
// this is a buffer to store shape tensor input addresses throughout the runtime scope
148-
std::list<std::vector<int64_t>> inputShapeTensorValues;
149-
150217
// Initialize inputs and outputs to be available throughout the succeeding scopes
151-
std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
152218
std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
153219

154220
if (MULTI_DEVICE_SAFE_MODE) {
@@ -206,68 +272,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
206272
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
207273
}
208274

209-
for (size_t i = 0; i < inputs.size(); i++) {
210-
std::string name = compiled_engine->in_binding_names[i];
211-
212-
TORCHTRT_CHECK(
213-
inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
214-
215-
auto expected_type =
216-
util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
217-
TORCHTRT_CHECK(
218-
inputs[i].dtype() == expected_type,
219-
"Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());
220-
221-
auto dims = core::util::toDims(inputs[i].sizes());
222-
auto shape = core::util::toVec(dims);
223-
LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
224-
225-
if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
226-
// Shape tensor inputs are casted to int64 explicitly.
227-
// Refer to
228-
// https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
229-
auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
230-
std::vector<int64_t> inputs_cpu_vec(
231-
input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
232-
inputShapeTensorValues.emplace_back(inputs_cpu_vec);
233-
TORCHTRT_CHECK(
234-
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
235-
"Error while setting the tensor address for shape inputs");
236-
237-
if (CUDAGRAPHS_MODE) {
238-
// @peri044 I dont know if this makes sense since they are supposed to be GPU buffers
239-
compiled_engine->input_buffers[i] = input_cpu;
240-
}
241-
TORCHTRT_CHECK(
242-
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
243-
"Error while setting the tensor address for shape inputs");
244-
245-
} else {
246-
at::Tensor contig_input = inputs[i].view(shape).contiguous();
247-
formatted_inputs.emplace_back(std::move(contig_input));
248-
249-
if (need_cudagraphs_record) {
250-
// Create a new persistent input buffer
251-
compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
252-
}
253-
254-
TORCHTRT_CHECK(
255-
compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
256-
257-
if (CUDAGRAPHS_MODE) {
258-
// If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
259-
compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
260-
TORCHTRT_CHECK(
261-
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()),
262-
"Error while setting the input tensor address for inputs");
263-
} else {
264-
// Otherwise use the formatted buffer directly
265-
TORCHTRT_CHECK(
266-
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()),
267-
"Error while setting the input tensor address for inputs");
268-
}
269-
}
270-
}
275+
setup_input_tensors(inputs, compiled_engine, need_cudagraphs_record);
271276

272277
// Check if input shapes can be inferred.
273278
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
@@ -286,7 +291,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
286291
output_profiler_guard =
287292
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
288293
}
289-
if ((false == compiled_engine->use_pre_allocated_outputs) || shape_changed) {
294+
if (!compiled_engine->use_pre_allocated_outputs || shape_changed) {
290295
outputs = create_output_tensors(compiled_engine);
291296
} else {
292297
outputs = compiled_engine->pre_allocated_outputs;

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 56 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def __init__(
110110
self.target_platform = Platform.current_platform()
111111
self.prev_cudagraphs_enabled = False
112112
self.pre_allocated_outputs: List[torch.Tensor] = []
113-
self.use_pre_allocated_outputs = True
113+
self.use_pre_allocated_outputs = False
114114

115115
if self.serialized_engine is not None and not self.settings.lazy_engine_init:
116116
self.setup_engine()
@@ -236,6 +236,57 @@ def __del__(self) -> None:
236236
if self.cudagraph:
237237
self.cudagraph.reset()
238238

239+
def setup_input_tensors(
240+
self,
241+
contiguous_inputs: List[torch.Tensor],
242+
cudagraphs_enabled: bool,
243+
need_cudagraphs_record: bool,
244+
) -> None:
245+
for i, input_name in enumerate(self.input_names):
246+
if not contiguous_inputs[i].is_cuda:
247+
logger.warning(
248+
f"Detected input {input_name} of engine {self.engine.name} is not on a cuda device. "
249+
"This tensor is being moved by the runtime but for performance considerations, "
250+
"ensure your inputs are all on GPU and open an issue here "
251+
"(https://github.com/pytorch/TensorRT/issues) if this warning persists."
252+
)
253+
contiguous_inputs = (
254+
contiguous_inputs[:i]
255+
+ [contiguous_inputs[i].cuda()]
256+
+ contiguous_inputs[i + 1 :]
257+
)
258+
259+
assert (
260+
contiguous_inputs[i].dtype == self.input_dtypes[i]
261+
), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}."
262+
263+
if need_cudagraphs_record:
264+
# If cudagraphs is enabled, this memory is reserved for future cudagraph runs
265+
# Clone is required to avoid re-using user-provided GPU memory
266+
self._input_buffers[i] = contiguous_inputs[i].clone()
267+
268+
# For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers
269+
# as per TensorRT requirements
270+
if self.engine.is_shape_inference_io(input_name):
271+
# Shape tensor inputs are cast to int64 explicitly
272+
# Currently Torch CPU pointers are not working; numpy pointers are used instead
273+
# to refer to underlying memory
274+
inputs_cpu = contiguous_inputs[i].cpu().to(torch.int64).numpy().copy()
275+
self.context.set_tensor_address(input_name, inputs_cpu.ctypes.data)
276+
else:
277+
self.context.set_input_shape(
278+
input_name, tuple(contiguous_inputs[i].shape)
279+
)
280+
if cudagraphs_enabled:
281+
self._input_buffers[i].copy_(contiguous_inputs[i])
282+
self.context.set_tensor_address(
283+
input_name, self._input_buffers[i].data_ptr()
284+
)
285+
else:
286+
self.context.set_tensor_address(
287+
input_name, contiguous_inputs[i].data_ptr()
288+
)
289+
239290
def create_output_tensors(self) -> List[torch.Tensor]:
240291
# create output tensors
241292
outputs: List[torch.Tensor] = []
@@ -273,6 +324,7 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
273324
need_cudagraphs_record = True
274325
else:
275326
need_cudagraphs_record = cudagraphs_enabled and shape_changed
327+
276328
self.prev_cudagraphs_enabled = cudagraphs_enabled
277329

278330
if need_cudagraphs_record:
@@ -328,54 +380,10 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
328380
self.input_names
329381
), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}."
330382

331-
for i, input_name in enumerate(self.input_names):
332-
if not contiguous_inputs[i].is_cuda:
333-
logger.warning(
334-
f"Detected input {input_name} of engine {self.engine.name} is not on a cuda device. "
335-
"This tensor is being moved by the runtime but for performance considerations, "
336-
"ensure your inputs are all on GPU and open an issue here "
337-
"(https://github.com/pytorch/TensorRT/issues) if this warning persists."
338-
)
339-
contiguous_inputs = (
340-
contiguous_inputs[:i]
341-
+ [contiguous_inputs[i].cuda()]
342-
+ contiguous_inputs[i + 1 :]
343-
)
344-
345-
assert (
346-
contiguous_inputs[i].dtype == self.input_dtypes[i]
347-
), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}."
383+
self.setup_input_tensors(
384+
contiguous_inputs, cudagraphs_enabled, need_cudagraphs_record
385+
)
348386

349-
if need_cudagraphs_record:
350-
# If cudagraphs is enabled, this memory is reserved for future cudagraph runs
351-
# Clone is required to avoid re-using user-provided GPU memory
352-
self._input_buffers[i] = contiguous_inputs[i].clone()
353-
354-
# For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers
355-
# as per TensorRT requirements
356-
if self.engine.is_shape_inference_io(input_name):
357-
# Shape tensor inputs are casted to int64 explicitly
358-
# Currently Torch CPU pointers are not working; numpy pointers are used instead
359-
# to refer to underlying memory
360-
inputs_cpu = (
361-
contiguous_inputs[i].cpu().to(torch.int64).numpy().copy()
362-
)
363-
self.context.set_tensor_address(
364-
input_name, inputs_cpu.ctypes.data
365-
)
366-
else:
367-
self.context.set_input_shape(
368-
input_name, tuple(contiguous_inputs[i].shape)
369-
)
370-
if cudagraphs_enabled:
371-
self._input_buffers[i].copy_(contiguous_inputs[i])
372-
self.context.set_tensor_address(
373-
input_name, self._input_buffers[i].data_ptr()
374-
)
375-
else:
376-
self.context.set_tensor_address(
377-
input_name, contiguous_inputs[i].data_ptr()
378-
)
379387
if shape_changed:
380388
# Check if input shapes can be inferred.
381389
uninferred_input_names = self.context.infer_shapes()

tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
"Torch-TensorRT runtime is not available",
1818
)
1919
class TestCudagraphsCPP(TestCase):
20+
def tearDown(self):
21+
# Reset to default cuda graph mode after each test
22+
torch_tensorrt.runtime.set_cudagraphs_mode(False)
2023

2124
def test_cudagraphs_on(self):
2225
torch_tensorrt.runtime.set_cudagraphs_mode(True)

tests/py/dynamo/runtime/test_002_cudagraphs_py.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@
1313

1414

1515
class TestCudagraphsPython(TestCase):
16+
def tearDown(self):
17+
# Reset to default cuda graph mode after each test
18+
torch_tensorrt.runtime.set_cudagraphs_mode(False)
19+
1620
def test_cudagraphs_on(self):
1721
torch_tensorrt.runtime.set_cudagraphs_mode(True)
1822
self.assertTrue(torch_tensorrt.runtime.get_cudagraphs_mode())

0 commit comments

Comments
 (0)