Skip to content

Commit 7ee35f1

Browse files
committed
minor fixes
1 parent 3ce2645 commit 7ee35f1

File tree

3 files changed

+9
-5
lines changed

3 files changed

+9
-5
lines changed

core/runtime/TRTEngine.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ void* DynamicOutputAllocator::reallocateOutputAsync(
4242
std::vector<int64_t> shape = {static_cast<int64_t>(size)};
4343
auto it = buffers.find(tensorName);
4444
if (it == buffers.end() || it->second.sizes() != shape) {
45-
buffers[tensorName] = at::empty(shape, at::TensorOptions().dtype(dtypes.at(tensorName)).device(c10::kCUDA));
45+
buffers[tensorName] = at::empty(shape, at::TensorOptions().dtype(dtypes.at(tensorName)).device(at::kCUDA));
4646
return buffers[tensorName].data_ptr();
4747
} else {
4848
return it->second.data_ptr();

core/runtime/execute_engine.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
379379
if (inputs.size() > 0) {
380380
current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
381381
} else {
382-
current_device_id = c10::cuda::current_device();
382+
current_device_id = at::cuda::current_device();
383383
}
384384

385385
compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
@@ -428,11 +428,13 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
428428
for (int i = 0; i < dims.nbDims; ++i) {
429429
prod *= dims.d[i];
430430
}
431-
std::vector<int64_t> dims_vec(dims.nbDims);
431+
std::vector<int64_t> shape(dims.nbDims);
432432
for (int i = 0; i < dims.nbDims; ++i) {
433-
dims_vec[i] = dims.d[i];
433+
shape[i] = dims.d[i];
434434
}
435-
output = output.reshape(-1).view(dtype).slice(0, 0, prod).reshape(dims_vec);
435+
// When using the OutputAllocator, the allocated buffer might be larger than the size of the output,
436+
// so we need to reshape the buffer to the output shape
437+
output = output.reshape(-1).view(dtype).slice(0, 0, prod).reshape(shape);
436438
outputs.push_back(output);
437439
}
438440

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,8 @@ def run_output_allocator() -> torch.Tensor | Tuple[torch.Tensor, ...]:
624624
.detach()
625625
)
626626
prod = int(torch.prod(torch.tensor(shape)))
627+
# When using the OutputAllocator, the allocated buffer might be larger than the size of the output,
628+
# so we need to reshape the buffer to the output shape
627629
output = output.reshape(-1).view(dtype)[:prod].reshape(shape)
628630
outputs.append(output)
629631

0 commit comments

Comments (0)