@@ -91,7 +91,77 @@ bool _validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngi
91
91
92
92
return false ;
93
93
}
94
+ void setup_input_tensors (
95
+ std::vector<at::Tensor> inputs,
96
+ c10::intrusive_ptr<TRTEngine> compiled_engine,
97
+ bool need_cudagraphs_record) {
98
+ // this is a buffer to store shape tensor input addresses throughout the runtime scope
99
+ std::list<std::vector<int64_t >> inputShapeTensorValues;
100
+ std::list<at::Tensor> formatted_inputs (compiled_engine->num_io .first );
101
+
102
+ for (size_t i = 0 ; i < inputs.size (); i++) {
103
+ std::string name = compiled_engine->in_binding_names [i];
104
+
105
+ TORCHTRT_CHECK (
106
+ inputs[i].is_cuda (), " Expected input tensors to have device cuda, found device " << inputs[i].device ());
107
+
108
+ auto expected_type =
109
+ util::TRTDataTypeToScalarType (compiled_engine->exec_ctx ->getEngine ().getTensorDataType (name.c_str ()));
110
+ TORCHTRT_CHECK (
111
+ inputs[i].dtype () == expected_type,
112
+ " Expected input tensors to have type " << expected_type << " , found type " << inputs[i].dtype ());
113
+
114
+ auto dims = core::util::toDims (inputs[i].sizes ());
115
+ auto shape = core::util::toVec (dims);
116
+ LOG_DEBUG (" Input Name: " << name << " Shape: " << dims);
117
+
118
+ if (compiled_engine->cuda_engine ->isShapeInferenceIO (name.c_str ())) {
119
+ // Shape tensor inputs are casted to int64 explicitly.
120
+ // Refer to
121
+ // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
122
+ auto input_cpu = inputs[i].clone ().contiguous ().cpu ().to (torch::kInt64 );
123
+ std::vector<int64_t > inputs_cpu_vec (
124
+ input_cpu.data_ptr <int64_t >(), input_cpu.data_ptr <int64_t >() + input_cpu.numel ());
125
+ inputShapeTensorValues.emplace_back (inputs_cpu_vec);
126
+ TORCHTRT_CHECK (
127
+ compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), inputShapeTensorValues.back ().data ()),
128
+ " Error while setting the tensor address for shape inputs" );
129
+
130
+ if (CUDAGRAPHS_MODE) {
131
+ // @peri044 I dont know if this makes sense since they are supposed to be GPU buffers
132
+ compiled_engine->input_buffers [i] = input_cpu;
133
+ }
134
+ TORCHTRT_CHECK (
135
+ compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), inputShapeTensorValues.back ().data ()),
136
+ " Error while setting the tensor address for shape inputs" );
94
137
138
+ } else {
139
+ at::Tensor contig_input = inputs[i].view (shape).contiguous ();
140
+ formatted_inputs.emplace_back (std::move (contig_input));
141
+
142
+ if (need_cudagraphs_record) {
143
+ // Create a new persistent input buffer
144
+ compiled_engine->input_buffers [i] = std::move (formatted_inputs.back ().clone ());
145
+ }
146
+
147
+ TORCHTRT_CHECK (
148
+ compiled_engine->exec_ctx ->setInputShape (name.c_str (), dims), " Error while setting the input shape" );
149
+
150
+ if (CUDAGRAPHS_MODE) {
151
+ // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
152
+ compiled_engine->input_buffers [i].copy_ (formatted_inputs.back (), true );
153
+ TORCHTRT_CHECK (
154
+ compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), compiled_engine->input_buffers [i].data_ptr ()),
155
+ " Error while setting the input tensor address for inputs" );
156
+ } else {
157
+ // Otherwise use the formatted buffer directly
158
+ TORCHTRT_CHECK (
159
+ compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), formatted_inputs.back ().data_ptr ()),
160
+ " Error while setting the input tensor address for inputs" );
161
+ }
162
+ }
163
+ }
164
+ }
95
165
std::vector<at::Tensor> create_output_tensors (c10::intrusive_ptr<TRTEngine> compiled_engine) {
96
166
std::vector<at::Tensor> outputs (compiled_engine->num_io .second );
97
167
for (auto output_indices : compiled_engine->out_binding_map ) {
@@ -144,11 +214,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
144
214
compiled_engine->cudagraph .reset ();
145
215
}
146
216
147
- // this is a buffer to store shape tensor input addresses throughout the runtime scope
148
- std::list<std::vector<int64_t >> inputShapeTensorValues;
149
-
150
217
// Intialize inputs and outputs to be available throughout the succeeding scopes
151
- std::list<at::Tensor> formatted_inputs (compiled_engine->num_io .first );
152
218
std::vector<at::Tensor> outputs (compiled_engine->num_io .second );
153
219
154
220
if (MULTI_DEVICE_SAFE_MODE) {
@@ -206,68 +272,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
206
272
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path );
207
273
}
208
274
209
- for (size_t i = 0 ; i < inputs.size (); i++) {
210
- std::string name = compiled_engine->in_binding_names [i];
211
-
212
- TORCHTRT_CHECK (
213
- inputs[i].is_cuda (), " Expected input tensors to have device cuda, found device " << inputs[i].device ());
214
-
215
- auto expected_type =
216
- util::TRTDataTypeToScalarType (compiled_engine->exec_ctx ->getEngine ().getTensorDataType (name.c_str ()));
217
- TORCHTRT_CHECK (
218
- inputs[i].dtype () == expected_type,
219
- " Expected input tensors to have type " << expected_type << " , found type " << inputs[i].dtype ());
220
-
221
- auto dims = core::util::toDims (inputs[i].sizes ());
222
- auto shape = core::util::toVec (dims);
223
- LOG_DEBUG (" Input Name: " << name << " Shape: " << dims);
224
-
225
- if (compiled_engine->cuda_engine ->isShapeInferenceIO (name.c_str ())) {
226
- // Shape tensor inputs are casted to int64 explicitly.
227
- // Refer to
228
- // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
229
- auto input_cpu = inputs[i].clone ().contiguous ().cpu ().to (torch::kInt64 );
230
- std::vector<int64_t > inputs_cpu_vec (
231
- input_cpu.data_ptr <int64_t >(), input_cpu.data_ptr <int64_t >() + input_cpu.numel ());
232
- inputShapeTensorValues.emplace_back (inputs_cpu_vec);
233
- TORCHTRT_CHECK (
234
- compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), inputShapeTensorValues.back ().data ()),
235
- " Error while setting the tensor address for shape inputs" );
236
-
237
- if (CUDAGRAPHS_MODE) {
238
- // @peri044 I dont know if this makes sense since they are supposed to be GPU buffers
239
- compiled_engine->input_buffers [i] = input_cpu;
240
- }
241
- TORCHTRT_CHECK (
242
- compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), inputShapeTensorValues.back ().data ()),
243
- " Error while setting the tensor address for shape inputs" );
244
-
245
- } else {
246
- at::Tensor contig_input = inputs[i].view (shape).contiguous ();
247
- formatted_inputs.emplace_back (std::move (contig_input));
248
-
249
- if (need_cudagraphs_record) {
250
- // Create a new persistent input buffer
251
- compiled_engine->input_buffers [i] = std::move (formatted_inputs.back ().clone ());
252
- }
253
-
254
- TORCHTRT_CHECK (
255
- compiled_engine->exec_ctx ->setInputShape (name.c_str (), dims), " Error while setting the input shape" );
256
-
257
- if (CUDAGRAPHS_MODE) {
258
- // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
259
- compiled_engine->input_buffers [i].copy_ (formatted_inputs.back (), true );
260
- TORCHTRT_CHECK (
261
- compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), compiled_engine->input_buffers [i].data_ptr ()),
262
- " Error while setting the input tensor address for inputs" );
263
- } else {
264
- // Otherwise use the formatted buffer directly
265
- TORCHTRT_CHECK (
266
- compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), formatted_inputs.back ().data_ptr ()),
267
- " Error while setting the input tensor address for inputs" );
268
- }
269
- }
270
- }
275
+ setup_input_tensors (inputs, compiled_engine, need_cudagraphs_record);
271
276
272
277
// Check if input shapes can be inferred.
273
278
int32_t const io_size{compiled_engine->cuda_engine ->getNbIOTensors ()};
@@ -286,7 +291,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
286
291
output_profiler_guard =
287
292
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path );
288
293
}
289
- if (( false == compiled_engine->use_pre_allocated_outputs ) || shape_changed) {
294
+ if (! compiled_engine->use_pre_allocated_outputs || shape_changed) {
290
295
outputs = create_output_tensors (compiled_engine);
291
296
} else {
292
297
outputs = compiled_engine->pre_allocated_outputs ;
0 commit comments