Skip to content

Commit f480353

Browse files
committed
chore: Functionalize inputs setup
1 parent 4a5f0d1 commit f480353

File tree

4 files changed

+135
-115
lines changed

4 files changed

+135
-115
lines changed

core/runtime/execute_engine.cpp

Lines changed: 72 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,77 @@ bool _validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngi
9191

9292
return false;
9393
}
94+
void setup_input_tensors(
95+
std::vector<at::Tensor> inputs,
96+
c10::intrusive_ptr<TRTEngine> compiled_engine,
97+
bool need_cudagraphs_record) {
98+
// this is a buffer to store shape tensor input addresses throughout the runtime scope
99+
std::list<std::vector<int64_t>> inputShapeTensorValues;
100+
std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
101+
102+
for (size_t i = 0; i < inputs.size(); i++) {
103+
std::string name = compiled_engine->in_binding_names[i];
104+
105+
TORCHTRT_CHECK(
106+
inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
107+
108+
auto expected_type =
109+
util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
110+
TORCHTRT_CHECK(
111+
inputs[i].dtype() == expected_type,
112+
"Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());
113+
114+
auto dims = core::util::toDims(inputs[i].sizes());
115+
auto shape = core::util::toVec(dims);
116+
LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
117+
118+
if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
119+
// Shape tensor inputs are cast to int64 explicitly.
120+
// Refer to
121+
// https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
122+
auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
123+
std::vector<int64_t> inputs_cpu_vec(
124+
input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
125+
inputShapeTensorValues.emplace_back(inputs_cpu_vec);
126+
TORCHTRT_CHECK(
127+
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
128+
"Error while setting the tensor address for shape inputs");
129+
130+
if (CUDAGRAPHS_MODE) {
131+
// @peri044 I don't know if this makes sense since they are supposed to be GPU buffers
132+
compiled_engine->input_buffers[i] = input_cpu;
133+
}
134+
TORCHTRT_CHECK(
135+
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
136+
"Error while setting the tensor address for shape inputs");
94137

138+
} else {
139+
at::Tensor contig_input = inputs[i].view(shape).contiguous();
140+
formatted_inputs.emplace_back(std::move(contig_input));
141+
142+
if (need_cudagraphs_record) {
143+
// Create a new persistent input buffer
144+
compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
145+
}
146+
147+
TORCHTRT_CHECK(
148+
compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
149+
150+
if (CUDAGRAPHS_MODE) {
151+
// If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
152+
compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
153+
TORCHTRT_CHECK(
154+
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()),
155+
"Error while setting the input tensor address for inputs");
156+
} else {
157+
// Otherwise use the formatted buffer directly
158+
TORCHTRT_CHECK(
159+
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()),
160+
"Error while setting the input tensor address for inputs");
161+
}
162+
}
163+
}
164+
}
95165
std::vector<at::Tensor> create_output_tensors(c10::intrusive_ptr<TRTEngine> compiled_engine) {
96166
std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
97167
for (auto output_indices : compiled_engine->out_binding_map) {
@@ -142,11 +212,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
142212
compiled_engine->cudagraph.reset();
143213
}
144214

145-
// this is a buffer to store shape tensor input addresses throughout the runtime scope
146-
std::list<std::vector<int64_t>> inputShapeTensorValues;
147-
148215
// Initialize inputs and outputs to be available throughout the succeeding scopes
149-
std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
150216
std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
151217

152218
if (MULTI_DEVICE_SAFE_MODE) {
@@ -204,68 +270,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
204270
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
205271
}
206272

207-
for (size_t i = 0; i < inputs.size(); i++) {
208-
std::string name = compiled_engine->in_binding_names[i];
209-
210-
TORCHTRT_CHECK(
211-
inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
212-
213-
auto expected_type =
214-
util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
215-
TORCHTRT_CHECK(
216-
inputs[i].dtype() == expected_type,
217-
"Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());
218-
219-
auto dims = core::util::toDims(inputs[i].sizes());
220-
auto shape = core::util::toVec(dims);
221-
LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
222-
223-
if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
224-
// Shape tensor inputs are casted to int64 explicitly.
225-
// Refer to
226-
// https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
227-
auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
228-
std::vector<int64_t> inputs_cpu_vec(
229-
input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
230-
inputShapeTensorValues.emplace_back(inputs_cpu_vec);
231-
TORCHTRT_CHECK(
232-
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
233-
"Error while setting the tensor address for shape inputs");
234-
235-
if (CUDAGRAPHS_MODE) {
236-
// @peri044 I dont know if this makes sense since they are supposed to be GPU buffers
237-
compiled_engine->input_buffers[i] = input_cpu;
238-
}
239-
TORCHTRT_CHECK(
240-
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
241-
"Error while setting the tensor address for shape inputs");
242-
243-
} else {
244-
at::Tensor contig_input = inputs[i].view(shape).contiguous();
245-
formatted_inputs.emplace_back(std::move(contig_input));
246-
247-
if (need_cudagraphs_record) {
248-
// Create a new persistent input buffer
249-
compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
250-
}
251-
252-
TORCHTRT_CHECK(
253-
compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
254-
255-
if (CUDAGRAPHS_MODE) {
256-
// If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
257-
compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
258-
TORCHTRT_CHECK(
259-
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()),
260-
"Error while setting the input tensor address for inputs");
261-
} else {
262-
// Otherwise use the formatted buffer directly
263-
TORCHTRT_CHECK(
264-
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()),
265-
"Error while setting the input tensor address for inputs");
266-
}
267-
}
268-
}
273+
setup_input_tensors(inputs, compiled_engine, need_cudagraphs_record);
269274

270275
// Check if input shapes can be inferred.
271276
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
@@ -284,7 +289,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
284289
output_profiler_guard =
285290
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
286291
}
287-
if ((false == compiled_engine->use_pre_allocated_outputs) || shape_changed) {
292+
if (!compiled_engine->use_pre_allocated_outputs || shape_changed) {
288293
outputs = create_output_tensors(compiled_engine);
289294
} else {
290295
outputs = compiled_engine->pre_allocated_outputs;

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 56 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def __init__(
109109
self.target_platform = Platform.current_platform()
110110
self.prev_cudagraphs_enabled = False
111111
self.pre_allocated_outputs: List[torch.Tensor] = []
112-
self.use_pre_allocated_outputs = True
112+
self.use_pre_allocated_outputs = False
113113

114114
if self.serialized_engine is not None and not self.settings.lazy_engine_init:
115115
self.setup_engine()
@@ -235,6 +235,57 @@ def __del__(self) -> None:
235235
if self.cudagraph:
236236
self.cudagraph.reset()
237237

238+
def setup_input_tensors(
239+
self,
240+
contiguous_inputs: List[torch.Tensor],
241+
cudagraphs_enabled: bool,
242+
need_cudagraphs_record: bool,
243+
) -> None:
244+
for i, input_name in enumerate(self.input_names):
245+
if not contiguous_inputs[i].is_cuda:
246+
logger.warning(
247+
f"Detected input {input_name} of engine {self.engine.name} is not on a cuda device. "
248+
"This tensor is being moved by the runtime but for performance considerations, "
249+
"ensure your inputs are all on GPU and open an issue here "
250+
"(https://github.com/pytorch/TensorRT/issues) if this warning persists."
251+
)
252+
contiguous_inputs = (
253+
contiguous_inputs[:i]
254+
+ [contiguous_inputs[i].cuda()]
255+
+ contiguous_inputs[i + 1 :]
256+
)
257+
258+
assert (
259+
contiguous_inputs[i].dtype == self.input_dtypes[i]
260+
), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}."
261+
262+
if need_cudagraphs_record:
263+
# If cudagraphs is enabled, this memory is reserved for future cudagraph runs
264+
# Clone is required to avoid re-using user-provided GPU memory
265+
self._input_buffers[i] = contiguous_inputs[i].clone()
266+
267+
# For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers
268+
# as per TensorRT requirements
269+
if self.engine.is_shape_inference_io(input_name):
270+
# Shape tensor inputs are cast to int64 explicitly
271+
# Currently Torch CPU pointers are not working; numpy pointers are used instead
272+
# to refer to underlying memory
273+
inputs_cpu = contiguous_inputs[i].cpu().to(torch.int64).numpy().copy()
274+
self.context.set_tensor_address(input_name, inputs_cpu.ctypes.data)
275+
else:
276+
self.context.set_input_shape(
277+
input_name, tuple(contiguous_inputs[i].shape)
278+
)
279+
if cudagraphs_enabled:
280+
self._input_buffers[i].copy_(contiguous_inputs[i])
281+
self.context.set_tensor_address(
282+
input_name, self._input_buffers[i].data_ptr()
283+
)
284+
else:
285+
self.context.set_tensor_address(
286+
input_name, contiguous_inputs[i].data_ptr()
287+
)
288+
238289
def create_output_tensors(self) -> List[torch.Tensor]:
239290
# create output tensors
240291
outputs: List[torch.Tensor] = []
@@ -272,6 +323,7 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
272323
need_cudagraphs_record = True
273324
else:
274325
need_cudagraphs_record = cudagraphs_enabled and shape_changed
326+
275327
self.prev_cudagraphs_enabled = cudagraphs_enabled
276328

277329
if need_cudagraphs_record:
@@ -327,54 +379,10 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
327379
self.input_names
328380
), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}."
329381

330-
for i, input_name in enumerate(self.input_names):
331-
if not contiguous_inputs[i].is_cuda:
332-
logger.warning(
333-
f"Detected input {input_name} of engine {self.engine.name} is not on a cuda device. "
334-
"This tensor is being moved by the runtime but for performance considerations, "
335-
"ensure your inputs are all on GPU and open an issue here "
336-
"(https://github.com/pytorch/TensorRT/issues) if this warning persists."
337-
)
338-
contiguous_inputs = (
339-
contiguous_inputs[:i]
340-
+ [contiguous_inputs[i].cuda()]
341-
+ contiguous_inputs[i + 1 :]
342-
)
343-
344-
assert (
345-
contiguous_inputs[i].dtype == self.input_dtypes[i]
346-
), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}."
382+
self.setup_input_tensors(
383+
contiguous_inputs, cudagraphs_enabled, need_cudagraphs_record
384+
)
347385

348-
if need_cudagraphs_record:
349-
# If cudagraphs is enabled, this memory is reserved for future cudagraph runs
350-
# Clone is required to avoid re-using user-provided GPU memory
351-
self._input_buffers[i] = contiguous_inputs[i].clone()
352-
353-
# For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers
354-
# as per TensorRT requirements
355-
if self.engine.is_shape_inference_io(input_name):
356-
# Shape tensor inputs are casted to int64 explicitly
357-
# Currently Torch CPU pointers are not working; numpy pointers are used instead
358-
# to refer to underlying memory
359-
inputs_cpu = (
360-
contiguous_inputs[i].cpu().to(torch.int64).numpy().copy()
361-
)
362-
self.context.set_tensor_address(
363-
input_name, inputs_cpu.ctypes.data
364-
)
365-
else:
366-
self.context.set_input_shape(
367-
input_name, tuple(contiguous_inputs[i].shape)
368-
)
369-
if cudagraphs_enabled:
370-
self._input_buffers[i].copy_(contiguous_inputs[i])
371-
self.context.set_tensor_address(
372-
input_name, self._input_buffers[i].data_ptr()
373-
)
374-
else:
375-
self.context.set_tensor_address(
376-
input_name, contiguous_inputs[i].data_ptr()
377-
)
378386
if shape_changed:
379387
# Check if input shapes can be inferred.
380388
uninferred_input_names = self.context.infer_shapes()

tests/py/dynamo/runtime/test_002_cudagraphs_cpp.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
"Torch-TensorRT runtime is not available",
1818
)
1919
class TestCudagraphsCPP(TestCase):
20+
def tearDown(self):
21+
# Reset to default cuda graph mode after each test
22+
torch_tensorrt.runtime.set_cudagraphs_mode(False)
2023

2124
def test_cudagraphs_on(self):
2225
torch_tensorrt.runtime.set_cudagraphs_mode(True)

tests/py/dynamo/runtime/test_002_cudagraphs_py.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@
1313

1414

1515
class TestCudagraphsPython(TestCase):
16+
def tearDown(self):
17+
# Reset to default cuda graph mode after each test
18+
torch_tensorrt.runtime.set_cudagraphs_mode(False)
19+
1620
def test_cudagraphs_on(self):
1721
torch_tensorrt.runtime.set_cudagraphs_mode(True)
1822
self.assertTrue(torch_tensorrt.runtime.get_cudagraphs_mode())

0 commit comments

Comments
 (0)