Skip to content

Commit 7ee35f1

Browse files
committed
minor fixes
1 parent 3ce2645 commit 7ee35f1

File tree

3 files changed

+9
-5
lines changed

3 files changed

+9
-5
lines changed

core/runtime/TRTEngine.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ void* DynamicOutputAllocator::reallocateOutputAsync(
4242
std::vector<int64_t> shape = {static_cast<int64_t>(size)};
4343
auto it = buffers.find(tensorName);
4444
if (it == buffers.end() || it->second.sizes() != shape) {
45-
buffers[tensorName] = at::empty(shape, at::TensorOptions().dtype(dtypes.at(tensorName)).device(c10::kCUDA));
45+
buffers[tensorName] = at::empty(shape, at::TensorOptions().dtype(dtypes.at(tensorName)).device(at::kCUDA));
4646
return buffers[tensorName].data_ptr();
4747
} else {
4848
return it->second.data_ptr();

core/runtime/execute_engine.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
379379
if (inputs.size() > 0) {
380380
current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
381381
} else {
382-
current_device_id = c10::cuda::current_device();
382+
current_device_id = at::cuda::current_device();
383383
}
384384

385385
compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
@@ -428,11 +428,13 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
428428
for (int i = 0; i < dims.nbDims; ++i) {
429429
prod *= dims.d[i];
430430
}
431-
std::vector<int64_t> dims_vec(dims.nbDims);
431+
std::vector<int64_t> shape(dims.nbDims);
432432
for (int i = 0; i < dims.nbDims; ++i) {
433-
dims_vec[i] = dims.d[i];
433+
shape[i] = dims.d[i];
434434
}
435-
output = output.reshape(-1).view(dtype).slice(0, 0, prod).reshape(dims_vec);
435+
// When using the OutputAllocator, the allocated buffer might be larger than the size of the output,
436+
// so we need to reshape the buffer to the output shape
437+
output = output.reshape(-1).view(dtype).slice(0, 0, prod).reshape(shape);
436438
outputs.push_back(output);
437439
}
438440

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,8 @@ def run_output_allocator() -> torch.Tensor | Tuple[torch.Tensor, ...]:
624624
.detach()
625625
)
626626
prod = int(torch.prod(torch.tensor(shape)))
627+
# When using the OutputAllocator, the allocated buffer might be larger than the size of the output,
628+
# so we need to reshape the buffer to the output shape
627629
output = output.reshape(-1).view(dtype)[:prod].reshape(shape)
628630
outputs.append(output)
629631

0 commit comments

Comments (0)