@@ -91,7 +91,77 @@ bool _validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngi
91
91
92
92
return false ;
93
93
}
94
+ void setup_input_tensors (
95
+ std::vector<at::Tensor> inputs,
96
+ c10::intrusive_ptr<TRTEngine> compiled_engine,
97
+ bool need_cudagraphs_record) {
98
+ // this is a buffer to store shape tensor input addresses throughout the runtime scope
99
+ std::list<std::vector<int64_t >> inputShapeTensorValues;
100
+ std::list<at::Tensor> formatted_inputs (compiled_engine->num_io .first );
101
+
102
+ for (size_t i = 0 ; i < inputs.size (); i++) {
103
+ std::string name = compiled_engine->in_binding_names [i];
104
+
105
+ TORCHTRT_CHECK (
106
+ inputs[i].is_cuda (), " Expected input tensors to have device cuda, found device " << inputs[i].device ());
107
+
108
+ auto expected_type =
109
+ util::TRTDataTypeToScalarType (compiled_engine->exec_ctx ->getEngine ().getTensorDataType (name.c_str ()));
110
+ TORCHTRT_CHECK (
111
+ inputs[i].dtype () == expected_type,
112
+ " Expected input tensors to have type " << expected_type << " , found type " << inputs[i].dtype ());
113
+
114
+ auto dims = core::util::toDims (inputs[i].sizes ());
115
+ auto shape = core::util::toVec (dims);
116
+ LOG_DEBUG (" Input Name: " << name << " Shape: " << dims);
117
+
118
+ if (compiled_engine->cuda_engine ->isShapeInferenceIO (name.c_str ())) {
119
+ // Shape tensor inputs are casted to int64 explicitly.
120
+ // Refer to
121
+ // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
122
+ auto input_cpu = inputs[i].clone ().contiguous ().cpu ().to (torch::kInt64 );
123
+ std::vector<int64_t > inputs_cpu_vec (
124
+ input_cpu.data_ptr <int64_t >(), input_cpu.data_ptr <int64_t >() + input_cpu.numel ());
125
+ inputShapeTensorValues.emplace_back (inputs_cpu_vec);
126
+ TORCHTRT_CHECK (
127
+ compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), inputShapeTensorValues.back ().data ()),
128
+ " Error while setting the tensor address for shape inputs" );
129
+
130
+ if (CUDAGRAPHS_MODE) {
131
+ // @peri044 I dont know if this makes sense since they are supposed to be GPU buffers
132
+ compiled_engine->input_buffers [i] = input_cpu;
133
+ }
134
+ TORCHTRT_CHECK (
135
+ compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), inputShapeTensorValues.back ().data ()),
136
+ " Error while setting the tensor address for shape inputs" );
94
137
138
+ } else {
139
+ at::Tensor contig_input = inputs[i].view (shape).contiguous ();
140
+ formatted_inputs.emplace_back (std::move (contig_input));
141
+
142
+ if (need_cudagraphs_record) {
143
+ // Create a new persistent input buffer
144
+ compiled_engine->input_buffers [i] = std::move (formatted_inputs.back ().clone ());
145
+ }
146
+
147
+ TORCHTRT_CHECK (
148
+ compiled_engine->exec_ctx ->setInputShape (name.c_str (), dims), " Error while setting the input shape" );
149
+
150
+ if (CUDAGRAPHS_MODE) {
151
+ // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
152
+ compiled_engine->input_buffers [i].copy_ (formatted_inputs.back (), true );
153
+ TORCHTRT_CHECK (
154
+ compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), compiled_engine->input_buffers [i].data_ptr ()),
155
+ " Error while setting the input tensor address for inputs" );
156
+ } else {
157
+ // Otherwise use the formatted buffer directly
158
+ TORCHTRT_CHECK (
159
+ compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), formatted_inputs.back ().data_ptr ()),
160
+ " Error while setting the input tensor address for inputs" );
161
+ }
162
+ }
163
+ }
164
+ }
95
165
std::vector<at::Tensor> create_output_tensors (c10::intrusive_ptr<TRTEngine> compiled_engine) {
96
166
std::vector<at::Tensor> outputs (compiled_engine->num_io .second );
97
167
for (auto output_indices : compiled_engine->out_binding_map ) {
@@ -142,11 +212,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
142
212
compiled_engine->cudagraph .reset ();
143
213
}
144
214
145
- // this is a buffer to store shape tensor input addresses throughout the runtime scope
146
- std::list<std::vector<int64_t >> inputShapeTensorValues;
147
-
148
215
// Initialize inputs and outputs to be available throughout the succeeding scopes
149
- std::list<at::Tensor> formatted_inputs (compiled_engine->num_io .first );
150
216
std::vector<at::Tensor> outputs (compiled_engine->num_io .second );
151
217
152
218
if (MULTI_DEVICE_SAFE_MODE) {
@@ -204,68 +270,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
204
270
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path );
205
271
}
206
272
207
- for (size_t i = 0 ; i < inputs.size (); i++) {
208
- std::string name = compiled_engine->in_binding_names [i];
209
-
210
- TORCHTRT_CHECK (
211
- inputs[i].is_cuda (), " Expected input tensors to have device cuda, found device " << inputs[i].device ());
212
-
213
- auto expected_type =
214
- util::TRTDataTypeToScalarType (compiled_engine->exec_ctx ->getEngine ().getTensorDataType (name.c_str ()));
215
- TORCHTRT_CHECK (
216
- inputs[i].dtype () == expected_type,
217
- " Expected input tensors to have type " << expected_type << " , found type " << inputs[i].dtype ());
218
-
219
- auto dims = core::util::toDims (inputs[i].sizes ());
220
- auto shape = core::util::toVec (dims);
221
- LOG_DEBUG (" Input Name: " << name << " Shape: " << dims);
222
-
223
- if (compiled_engine->cuda_engine ->isShapeInferenceIO (name.c_str ())) {
224
- // Shape tensor inputs are casted to int64 explicitly.
225
- // Refer to
226
- // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
227
- auto input_cpu = inputs[i].clone ().contiguous ().cpu ().to (torch::kInt64 );
228
- std::vector<int64_t > inputs_cpu_vec (
229
- input_cpu.data_ptr <int64_t >(), input_cpu.data_ptr <int64_t >() + input_cpu.numel ());
230
- inputShapeTensorValues.emplace_back (inputs_cpu_vec);
231
- TORCHTRT_CHECK (
232
- compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), inputShapeTensorValues.back ().data ()),
233
- " Error while setting the tensor address for shape inputs" );
234
-
235
- if (CUDAGRAPHS_MODE) {
236
- // @peri044 I dont know if this makes sense since they are supposed to be GPU buffers
237
- compiled_engine->input_buffers [i] = input_cpu;
238
- }
239
- TORCHTRT_CHECK (
240
- compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), inputShapeTensorValues.back ().data ()),
241
- " Error while setting the tensor address for shape inputs" );
242
-
243
- } else {
244
- at::Tensor contig_input = inputs[i].view (shape).contiguous ();
245
- formatted_inputs.emplace_back (std::move (contig_input));
246
-
247
- if (need_cudagraphs_record) {
248
- // Create a new persistent input buffer
249
- compiled_engine->input_buffers [i] = std::move (formatted_inputs.back ().clone ());
250
- }
251
-
252
- TORCHTRT_CHECK (
253
- compiled_engine->exec_ctx ->setInputShape (name.c_str (), dims), " Error while setting the input shape" );
254
-
255
- if (CUDAGRAPHS_MODE) {
256
- // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
257
- compiled_engine->input_buffers [i].copy_ (formatted_inputs.back (), true );
258
- TORCHTRT_CHECK (
259
- compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), compiled_engine->input_buffers [i].data_ptr ()),
260
- " Error while setting the input tensor address for inputs" );
261
- } else {
262
- // Otherwise use the formatted buffer directly
263
- TORCHTRT_CHECK (
264
- compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), formatted_inputs.back ().data_ptr ()),
265
- " Error while setting the input tensor address for inputs" );
266
- }
267
- }
268
- }
273
+ setup_input_tensors (inputs, compiled_engine, need_cudagraphs_record);
269
274
270
275
// Check if input shapes can be inferred.
271
276
int32_t const io_size{compiled_engine->cuda_engine ->getNbIOTensors ()};
@@ -284,7 +289,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
284
289
output_profiler_guard =
285
290
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path );
286
291
}
287
- if (( false == compiled_engine->use_pre_allocated_outputs ) || shape_changed) {
292
+ if (! compiled_engine->use_pre_allocated_outputs || shape_changed) {
288
293
outputs = create_output_tensors (compiled_engine);
289
294
} else {
290
295
outputs = compiled_engine->pre_allocated_outputs ;
0 commit comments