Commit 9f2865d

Fix decoupled gpu output error handling (#362)
* Fix decoupled gpu output error handling
* Return full error string upon exception from model
1 parent 4551e04 commit 9f2865d

2 files changed: +24 −19 lines changed

src/pb_stub.cc

Lines changed: 2 additions & 2 deletions
```diff
@@ -735,9 +735,9 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr)
             "Failed to process the request(s) for model '" + name_ +
             "', message: ") +
         error_string;
-    LOG_INFO << err_message.c_str();
+    LOG_ERROR << err_message.c_str();
     response_batch_shm_ptr->has_error = true;
-    error_string_shm = PbString::Create(shm_pool_, error_string);
+    error_string_shm = PbString::Create(shm_pool_, err_message);
     response_batch_shm_ptr->error = error_string_shm->ShmHandle();
     response_batch_shm_ptr->is_error_set = true;
   }
```
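The substance of this hunk: the stub now logs the failure at error severity and writes the fully composed message (model name plus the model's error text) into shared memory, where previously only the bare `error_string` crossed the process boundary. A minimal single-process sketch of that pattern, using a plain struct as a hypothetical stand-in for the real shared-memory `ResponseBatch`/`PbString` types:

```cpp
#include <iostream>
#include <string>

// Hypothetical stand-in for the response batch header; in the real code the
// error field is a shared-memory handle produced by PbString::Create.
struct ResponseBatch {
  bool has_error = false;
  bool is_error_set = false;
  std::string error;
};

// Record a failure so the parent process sees the full context, not just the
// raw error text returned by the model.
void RecordError(ResponseBatch& batch, const std::string& model_name,
                 const std::string& error_string) {
  std::string err_message =
      "Failed to process the request(s) for model '" + model_name +
      "', message: " + error_string;
  std::cerr << err_message << '\n';  // error severity, not info
  batch.has_error = true;
  batch.error = err_message;  // previously only error_string was stored
  batch.is_error_set = true;
}

int main() {
  ResponseBatch batch;
  RecordError(batch, "decoupled_model", "CUDA out of memory");
  std::cout << batch.error << '\n';
}
```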

src/response_sender.cc

Lines changed: 22 additions & 17 deletions
```diff
@@ -147,22 +147,38 @@ ResponseSender::Send(
   }

   if (has_gpu_output) {
+    ScopedDefer _([send_message_payload] {
+      bi::scoped_lock<bi::interprocess_mutex> guard{send_message_payload->mu};
+      send_message_payload->is_stub_turn = false;
+      send_message_payload->cv.notify_one();
+      while (!send_message_payload->is_stub_turn) {
+        // Wait for the stub process to send the response and populate error
+        // message if any.
+        send_message_payload->cv.wait(guard);
+      }
+    });
+
     AllocatedSharedMemory<GPUBuffersShm> gpu_buffers_handle =
         shm_pool_->Load<GPUBuffersShm>(
             send_message_payload->gpu_buffers_handle);
+    if (!gpu_buffers_handle.data_->success) {
+      std::unique_ptr<PbString> error = PbString::LoadFromSharedMemory(
+          shm_pool_, gpu_buffers_handle.data_->error);
+      throw PythonBackendException(
+          "Failed to load GPU buffers: " + error->String());
+    }

     AllocatedSharedMemory<bi::managed_external_buffer::handle_t>
         gpu_buffers_handle_shm =
             shm_pool_->Load<bi::managed_external_buffer::handle_t>(
                 gpu_buffers_handle.data_->buffers);
     uint64_t gpu_buffer_count = gpu_buffers_handle.data_->buffer_count;
     if (gpu_tensors.size() != gpu_buffer_count) {
-      LOG_ERROR
-          << (std::string(
-                  "GPU buffers size does not match the provided buffers: ") +
-              std::to_string(gpu_tensors.size()) +
-              " != " + std::to_string(gpu_buffer_count));
-      return;
+      throw PythonBackendException(
+          std::string(
+              "GPU buffers size does not match the provided buffers: ") +
+          std::to_string(gpu_tensors.size()) +
+          " != " + std::to_string(gpu_buffer_count));
     }

     std::vector<std::unique_ptr<PbMemory>> dst_buffers;
```
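The key move in this file is hoisting the parent/stub handshake into a `ScopedDefer` at the top of the `has_gpu_output` block, so it also runs during stack unwinding. The two new failure paths (the `!success` check and the buffer-count mismatch) can then throw `PythonBackendException` without leaving the other process blocked on the condition variable, which is what the old log-and-return path risked. A minimal sketch of a `ScopedDefer`-style RAII guard, assuming only the standard library (the real class lives in the backend's utilities and may differ):

```cpp
#include <functional>
#include <iostream>
#include <stdexcept>
#include <utility>

// RAII guard: runs the stored callable when the scope exits, whether the
// exit is a normal return or a thrown exception.
class ScopedDefer {
 public:
  explicit ScopedDefer(std::function<void()> fn) : fn_(std::move(fn)) {}
  ~ScopedDefer() { fn_(); }
  ScopedDefer(const ScopedDefer&) = delete;
  ScopedDefer& operator=(const ScopedDefer&) = delete;

 private:
  std::function<void()> fn_;
};

// Hypothetical stand-in for the GPU-output path in ResponseSender::Send.
void SendGpuOutput(bool fail) {
  ScopedDefer _([] {
    // In the real code: lock the interprocess mutex, flip is_stub_turn,
    // notify the parent, and wait for the turn to come back.
    std::cout << "handshake completed\n";
  });
  if (fail) throw std::runtime_error("GPU buffer mismatch");
  std::cout << "buffers copied\n";
}

int main() {
  try {
    SendGpuOutput(/*fail=*/true);  // the handshake still runs before the catch
  } catch (const std::exception& e) {
    std::cout << "caught: " << e.what() << '\n';
  }
}
```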
```diff
@@ -175,17 +191,6 @@ ResponseSender::Send(
       std::shared_ptr<PbTensor>& src_buffer = gpu_tensors[i];
       PbMemory::CopyBuffer(dst_buffers[i], src_buffer->Memory());
     }
-
-    {
-      bi::scoped_lock<bi::interprocess_mutex> guard{send_message_payload->mu};
-      send_message_payload->is_stub_turn = false;
-      send_message_payload->cv.notify_one();
-      while (!send_message_payload->is_stub_turn) {
-        // Wait for the stub process to send the response and populate error
-        // message if any.
-        send_message_payload->cv.wait(guard);
-      }
-    }
   }

   if (send_message_payload->has_error) {
```
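The deferred lambda reproduces the turn-taking protocol this hunk removes from the tail of the block: flip `is_stub_turn`, wake the other side, then wait until the turn comes back. A single-process sketch of that predicate-guarded handshake, with `std::mutex`/`std::condition_variable` standing in for the boost interprocess primitives used in the real code:

```cpp
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

// Hypothetical stand-in for the shared-memory message payload.
struct Payload {
  std::mutex mu;
  std::condition_variable cv;
  bool is_stub_turn = true;
};

int main() {
  Payload p;

  // "Parent" side: waits for the stub to hand over the turn, does its work,
  // then hands the turn back.
  std::thread parent([&p] {
    std::unique_lock<std::mutex> guard{p.mu};
    p.cv.wait(guard, [&p] { return !p.is_stub_turn; });
    std::cout << "parent: processing response\n";
    p.is_stub_turn = true;
    p.cv.notify_one();
  });

  // "Stub" side: mirrors the ScopedDefer body in the diff above.
  {
    std::unique_lock<std::mutex> guard{p.mu};
    p.is_stub_turn = false;
    p.cv.notify_one();
    while (!p.is_stub_turn) {  // predicate loop guards against spurious wakeups
      p.cv.wait(guard);
    }
  }
  std::cout << "stub: turn returned\n";

  parent.join();
}
```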
