
Commit dbf5c43

Fix error handling

1 parent 921916f commit dbf5c43

File tree

5 files changed: +192 −55 lines changed

src/pb_stub.cc

Lines changed: 24 additions & 26 deletions
@@ -719,6 +719,24 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr)
     ResponseBatch* response_batch_shm_ptr = reinterpret_cast<ResponseBatch*>(
         response_batch.value().data_.get() + sizeof(IPCMessageShm));

+    // Handle two special cases:
+    // 1. In default (non-decoupled) mode, the response factory should
+    // already be cleaned up with the previous response sent from the
+    // response sender, and yet the model tries to return another response
+    // from the `execute()` function. Notify the backend NOT to delete the
+    // response factory again during error handling.
+    // 2. The response sender is already closed; notify the backend NOT to
+    // delete the response factory again during error handling.
+    // std::string error_string = pb_exception.what();
+    if ((err_message.find(
+             "Non-decoupled model cannot send more than one response") !=
+         std::string::npos) ||
+        (err_message.find("Response sender has been closed") !=
+         std::string::npos)) {
+      response_batch_shm_ptr->is_response_factory_deleted = true;
+      LOG_ERROR << "=== caught error: " << err_message;
+    }
+
     response_batch_shm_ptr->has_error = true;
     error_string_shm = PbString::Create(shm_pool_, err_message);
     response_batch_shm_ptr->error = error_string_shm->ShmHandle();
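
Editor's note: the check above keys off exact substrings of the exception text. As a minimal sketch (not part of this commit), the condition could be factored into a helper; the two sentinel strings are assumed to be exactly what the response sender throws in this backend, and the helper name is hypothetical:

#include <string>

// Hypothetical helper: returns true when the exception text indicates the
// response factory has already been cleaned up by the response sender.
bool ResponseFactoryAlreadyDeleted(const std::string& err_message)
{
  return (err_message.find(
              "Non-decoupled model cannot send more than one response") !=
          std::string::npos) ||
         (err_message.find("Response sender has been closed") !=
          std::string::npos);
}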
@@ -734,6 +752,7 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr)
     }
   } else {
     if (!response_batch) {
+      // No response is returned from `execute()`.
       std::cerr << "===== response_batch is not set" << std::endl;
       response_batch = shm_pool_->Construct<char>(
           sizeof(ResponseBatch) + sizeof(IPCMessageShm));
@@ -846,31 +865,8 @@ Stub::ProcessReturnedResponses(
       }

       InferResponse* response = py_responses[i].cast<InferResponse*>();
-
-      try {
-        request->GetResponseSender()->UpdateStateAndCounters(
-            response, TRITONSERVER_RESPONSE_COMPLETE_FINAL);
-      }
-      catch (const PythonBackendException& pb_exception) {
-        // Special case for default(non-decoupled) mode, where the response
-        // factory should already be cleaned up with the previous response sent
-        // from response sender, and yet the model tries to return another
-        // response from `execute()` function. Notify the backend to NOT to
-        // delete the response factory again during error handling.
-        std::string error_string = pb_exception.what();
-        if (error_string.find(
-                "Non-decoupled model cannot send more than one response") !=
-            std::string::npos) {
-          response_batch = std::move(shm_pool_->Construct<char>(
-              sizeof(ResponseBatch) + sizeof(IPCMessageShm)));
-          ResponseBatch* response_batch_shm_ptr =
-              reinterpret_cast<ResponseBatch*>(
-                  response_batch.value().data_.get() + sizeof(IPCMessageShm));
-          response_batch_shm_ptr->is_response_factory_deleted = true;
-          LOG_ERROR << "=== caught error: " << pb_exception.what();
-        }
-        throw pb_exception;
-      }
+      request->GetResponseSender()->UpdateStateAndCounters(
+          response, TRITONSERVER_RESPONSE_COMPLETE_FINAL);
     }
   }
   // Return all the created responses using response_batch. The reason
@@ -887,16 +883,18 @@ Stub::ProcessReturnedResponses(
       reinterpret_cast<bi::managed_external_buffer::handle_t*>(
           response_batch.value().data_.get() + sizeof(ResponseBatch) +
           sizeof(IPCMessageShm));
-
+  std::cerr << "===== response_size: " << responses_size << std::endl;
   for (size_t i = 0; i < responses_size; i++) {
     // Check the return type of execute function.
     InferRequest* infer_request = py_requests[i].cast<InferRequest*>();
     InferResponse* infer_response = py_responses[i].cast<InferResponse*>();
     if (!py::isinstance<py::none>(py_responses[i])) {
+      std::cerr << "===== response is NOT None" << std::endl;
       infer_response->PruneOutputTensors(infer_request->RequestedOutputNames());
       ProcessResponse(infer_response);
       responses_shm_handle[i] = infer_response->ShmHandle();
     } else {
+      std::cerr << "===== response is None" << std::endl;
       responses_shm_handle[i] = 0;
     }
   }
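
Editor's note: a shared-memory handle value of 0 is the sentinel this diff uses for "the model returned None for this slot"; the backend side (src/python_be.cc below) skips those entries. A toy illustration of the convention, with a hypothetical constant name:

// Toy illustration (not in this commit): handle 0 marks a None response.
constexpr bi::managed_external_buffer::handle_t kNoneResponseHandle = 0;
// responses_shm_handle[i] = kNoneResponseHandle;  // model returned None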

src/python_be.cc

Lines changed: 109 additions & 24 deletions
@@ -153,6 +153,23 @@ ModelInstanceState::SetErrorForResponseSendMessage(
   }
 }

+bool
+ModelInstanceState::IsStubProcessAlive()
+{
+  boost::posix_time::ptime timeout =
+      boost::get_system_time() + boost::posix_time::seconds(1);
+  bi::scoped_lock<bi::interprocess_mutex> lock(*Stub()->HealthMutex(), timeout);
+
+  // Check if the lock has been acquired.
+  if (lock) {
+    return Stub()->IpcControl()->stub_health;
+  } else {
+    // Failure to obtain the lock means that the stub is stuck or has
+    // exited while holding the health mutex lock.
+    return false;
+  }
+}
+
 TRITONSERVER_Error*
 ModelInstanceState::SaveRequestsToSharedMemory(
     TRITONBACKEND_Request** requests, const uint32_t request_count,
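
Editor's note: IsStubProcessAlive relies on boost.interprocess timed lock acquisition: the scoped_lock constructor that takes an absolute deadline tries the lock until that time and leaves the lock unowned on timeout. A self-contained sketch of just that idiom, with standalone names that are not the backend's:

#include <boost/interprocess/sync/interprocess_mutex.hpp>
#include <boost/interprocess/sync/scoped_lock.hpp>
#include <boost/thread/thread_time.hpp>

namespace bi = boost::interprocess;

// Try to acquire an interprocess mutex for up to one second; returns
// false if the deadline passes without ownership (peer likely wedged).
bool TryTimedLock(bi::interprocess_mutex& mutex)
{
  boost::posix_time::ptime deadline =
      boost::get_system_time() + boost::posix_time::seconds(1);
  bi::scoped_lock<bi::interprocess_mutex> lock(mutex, deadline);
  return lock.owns();  // true only if the lock was acquired in time
}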
@@ -1011,11 +1028,43 @@ ModelInstanceState::ProcessModelControlRequest(
   });
 }

-void
+TRITONSERVER_Error*
 ModelInstanceState::SendMessageToStub(
     bi::managed_external_buffer::handle_t message)
 {
-  Stub()->StubMessageQueue()->Push(message);
+  // Stub()->StubMessageQueue()->Push(message);
+  bool success = false;
+  while (!success) {
+    uint64_t timeout_miliseconds = 1000;
+    {
+      boost::posix_time::ptime timeout =
+          boost::get_system_time() +
+          boost::posix_time::milliseconds(timeout_miliseconds);
+
+      bi::scoped_lock<bi::interprocess_mutex> lock(
+          *(Stub()->HealthMutex()), timeout);
+
+      // Check if the lock has been acquired.
+      if (lock) {
+        Stub()->IpcControl()->stub_health = false;
+      } else {
+        // Failure to obtain the lock means that the stub is stuck or has
+        // exited while holding the health mutex lock.
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INTERNAL, "Failed to obtain the health mutex.");
+      }
+    }
+
+    Stub()->StubMessageQueue()->Push(
+        message, timeout_miliseconds /* duration ms */, success);
+
+    if (!success && !IsStubProcessAlive()) {
+      return TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_INTERNAL, "Stub process is not healthy.");
+    }
+  }
+
+  return nullptr;  // success
 }

 void
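
Editor's note: because SendMessageToStub now returns TRITONSERVER_Error* instead of void, every caller must consume or propagate the error. A hedged usage sketch (the surrounding caller is illustrative, not taken from this diff):

// Illustrative caller: propagate the error, or report it and free it.
TRITONSERVER_Error* err = SendMessageToStub(message);
if (err != nullptr) {
  LOG_MESSAGE(TRITONSERVER_LOG_ERROR, TRITONSERVER_ErrorMessage(err));
  TRITONSERVER_ErrorDelete(err);  // the caller owns the error object
}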
@@ -1025,10 +1074,29 @@ ModelInstanceState::SendMessageAndReceiveResponse(
     std::shared_ptr<std::vector<TRITONBACKEND_Response*>>& responses,
     TRITONBACKEND_Request** requests, const uint32_t request_count)
 {
-  SendMessageToStub(message);
+  // SendMessageToStub(message);
+
+  // bi::managed_external_buffer::handle_t response_message;
+  // Stub()->ReceiveMessageFromStub(response_message);
+
+  // response = response_message;
+
+  auto error = SendMessageToStub(message);
+  if (error != nullptr) {
+    RespondErrorToAllRequests(
+        TRITONSERVER_ErrorMessage(error), responses, requests, request_count);
+
+    return;
+  }

   bi::managed_external_buffer::handle_t response_message;
-  Stub()->ReceiveMessageFromStub(response_message);
+  error = Stub()->ReceiveMessageFromStub(response_message);
+  if (error != nullptr) {
+    RespondErrorToAllRequests(
+        TRITONSERVER_ErrorMessage(error), responses, requests, request_count);
+
+    return;
+  }

   response = response_message;
 }
@@ -1061,6 +1129,7 @@ ModelInstanceState::RespondErrorToAllRequests(
   }
 }

+
 void
 ModelInstanceState::StartMonitor()
 {
@@ -1282,7 +1351,7 @@ ModelInstanceState::ProcessRequests(
   {
     Stub()->StubMessageQueue()->Push(ipc_message->ShmHandle());
     bi::managed_external_buffer::handle_t response_message;
-    Stub()->ReceiveMessageFromStub(response_message);
+    RETURN_IF_ERROR(Stub()->ReceiveMessageFromStub(response_message));
     response =
         IPCMessage::LoadFromSharedMemory(Stub()->ShmPool(), response_message);
   }
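
Editor's note: RETURN_IF_ERROR comes from the Triton backend common utilities; a minimal equivalent of what the macro does, assuming the real definition may differ in detail:

// Minimal equivalent of the macro's behavior: early-return on error.
#define RETURN_IF_ERROR(X)               \
  do {                                   \
    TRITONSERVER_Error* rie_err__ = (X); \
    if (rie_err__ != nullptr) {          \
      return rie_err__;                  \
    }                                    \
  } while (false)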
@@ -1329,26 +1398,34 @@ ModelInstanceState::ProcessRequests(
   }

   if (response_batch_shm_ptr->batch_size > 0) {
+    bi::managed_external_buffer::handle_t* response_shm_handle =
+        reinterpret_cast<bi::managed_external_buffer::handle_t*>(
+            ipc_message_shm + sizeof(ResponseBatch) + sizeof(IPCMessageShm));
+
     std::shared_ptr<std::vector<TRITONBACKEND_Response*>> responses(
         new std::vector<TRITONBACKEND_Response*>());
     responses->reserve(request_count);
     for (size_t i = 0; i < request_count; i++) {
-      TRITONBACKEND_Response* response;
-      auto err = TRITONBACKEND_ResponseNew(&response, requests[i]);
-      if (err == nullptr) {
-        responses->emplace_back(response);
-      } else {
+      // A single response batch shm may hold multiple responses, some of
+      // which are None because the response sender was used instead. Only
+      // create a TRITONBACKEND_Response object for the valid responses,
+      // and skip the None responses later.
+      if (response_shm_handle[i] == 0) {
+        std::cerr << "=== PYBE response_shm_handle is 0 ===" << std::endl;
         responses->emplace_back(nullptr);
-        LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response");
-        TRITONSERVER_ErrorDelete(err);
+      } else {
+        TRITONBACKEND_Response* response;
+        auto err = TRITONBACKEND_ResponseNew(&response, requests[i]);
+        if (err == nullptr) {
+          responses->emplace_back(response);
+        } else {
+          responses->emplace_back(nullptr);
+          LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response");
+          TRITONSERVER_ErrorDelete(err);
+        }
       }
     }
-    bi::managed_external_buffer::handle_t* response_shm_handle =
-        reinterpret_cast<bi::managed_external_buffer::handle_t*>(
-            ipc_message_shm + sizeof(ResponseBatch) + sizeof(IPCMessageShm));

-    // If the output provided by the model is in GPU, we will pass the list of
-    // buffers provided by Triton to the stub process.
     std::vector<bool> requires_deferred_callback;

     bool has_gpu_output = false;
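
Editor's note: the pointer arithmetic above assumes a packed layout in the response shared-memory block, sketched out here on the assumption that ipc_message_shm is a char* to the start of the mapped region:

// Assumed layout of the response region (offsets, not actual code):
//
//   [ IPCMessageShm ][ ResponseBatch ][ handle_0 ][ handle_1 ] ...
//   ^ ipc_message_shm
//                    ^ + sizeof(IPCMessageShm)
//                                     ^ + sizeof(ResponseBatch)
//
// so handle i of a batch lives at:
//   ipc_message_shm + sizeof(IPCMessageShm) + sizeof(ResponseBatch)
//       + i * sizeof(bi::managed_external_buffer::handle_t)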
@@ -1360,6 +1437,11 @@ ModelInstanceState::ProcessRequests(
     std::cerr << "=== PYBE request_count: " << request_count << std::endl;
     for (uint32_t r = 0; r < request_count; ++r) {
       NVTX_RANGE(nvtx_, "LoadingResponse " + Name());
+      if (response_shm_handle[r] == 0) {
+        std::cerr << "=== PYBE skip the response_shm_handle is 0 ==="
+                  << std::endl;
+        continue;
+      }
       TRITONBACKEND_Response* response = (*responses)[r];
       TRITONBACKEND_Request* request = requests[r];
       uint32_t requested_output_count = 0;
@@ -1378,13 +1460,14 @@ ModelInstanceState::ProcessRequests(
         continue;
       }

-      if (response_shm_handle[r] == 0) {
-        LOG_IF_ERROR(
-            TRITONBACKEND_ResponseDelete((*responses)[r]),
-            "failed to delete response");
-        (*responses)[r] = nullptr;
-        continue;
-      }
+      // if (response_shm_handle[r] == 0) {
+      //   std::cerr << "=== PYBE response_shm_handle is 0 ===" << std::endl;
+      //   LOG_IF_ERROR(
+      //       TRITONBACKEND_ResponseDelete((*responses)[r]),
+      //       "failed to delete response");
+      //   (*responses)[r] = nullptr;
+      //   continue;
+      // }
       {
         TRITONBACKEND_ResponseFactory* response_factory =
             reinterpret_cast<TRITONBACKEND_ResponseFactory*>(
@@ -1448,6 +1531,8 @@ ModelInstanceState::ProcessRequests(
             responses, r,
             TRITONBACKEND_RequestOutputName(request, j, &output_name));
         requested_output_names.insert(output_name);
+        std::cerr << "=== PYBE requested_output_name: " << output_name
+                  << std::endl;
       }

       bool require_deferred_callback = false;

src/python_be.h

Lines changed: 6 additions & 1 deletion
@@ -369,7 +369,12 @@ class ModelInstanceState : public BackendModelInstance {
       std::shared_ptr<std::vector<TRITONBACKEND_Response*>>& responses,
       TRITONBACKEND_Request** requests, const uint32_t request_count);

-  void SendMessageToStub(bi::managed_external_buffer::handle_t message);
+  // void SendMessageToStub(bi::managed_external_buffer::handle_t message);
+  TRITONSERVER_Error* SendMessageToStub(
+      bi::managed_external_buffer::handle_t message);
+
+  // Checks whether the stub process is alive.
+  bool IsStubProcessAlive();

   // Model instance stub
   std::unique_ptr<StubLauncher>& Stub() { return model_instance_stub_; }

src/stub_launcher.cc

Lines changed: 51 additions & 3 deletions
@@ -593,7 +593,7 @@ StubLauncher::ModelInstanceStubProcess()
   stub_message_queue_->Push(initialize_message->ShmHandle());

   bi::managed_external_buffer::handle_t message;
-  ReceiveMessageFromStub(message);
+  RETURN_IF_ERROR(ReceiveMessageFromStub(message));

   std::unique_ptr<IPCMessage> initialize_response_message =
       IPCMessage::LoadFromSharedMemory(shm_pool_, message);
@@ -724,11 +724,59 @@ StubLauncher::KillStubProcess()
 #endif
 }

-void
+TRITONSERVER_Error*
 StubLauncher::ReceiveMessageFromStub(
     bi::managed_external_buffer::handle_t& message)
 {
-  message = parent_message_queue_->Pop();
+  // message = parent_message_queue_->Pop();
+  bool success = false;
+  while (!success) {
+    uint64_t timeout_miliseconds = 1000;
+    {
+      boost::posix_time::ptime timeout =
+          boost::get_system_time() +
+          boost::posix_time::milliseconds(timeout_miliseconds);
+
+      bi::scoped_lock<bi::interprocess_mutex> lock(*health_mutex_, timeout);
+
+      // Check if the lock has been acquired.
+      if (lock) {
+        ipc_control_->stub_health = false;
+      } else {
+        // Failure to obtain the lock means that the stub is stuck or has
+        // exited while holding the health mutex lock.
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INTERNAL, "Failed to obtain the health mutex.");
+      }
+    }
+
+    message = parent_message_queue_->Pop(
+        timeout_miliseconds /* duration ms */, success);
+
+    bool is_stub_alive = false;
+    {
+      boost::posix_time::ptime timeout =
+          boost::get_system_time() + boost::posix_time::seconds(1);
+      bi::scoped_lock<bi::interprocess_mutex> lock(*health_mutex_, timeout);
+      if (lock) {
+        is_stub_alive = ipc_control_->stub_health;
+      } else {
+        // Failure to obtain the lock means that the stub is stuck or has
+        // exited while holding the health mutex lock.
+        is_stub_alive = false;
+      }
+    }
+
+    if (!success && !is_stub_alive) {
+      return TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_INTERNAL,
+          (std::string("Stub process '") + model_instance_name_ +
+           "' is not healthy.")
+              .c_str());
+    }
+  }
+
+  return nullptr;  // success
 }

 void
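
Editor's note: the loop above implements a watchdog handshake: clear stub_health under the mutex, Pop() with a timeout, then re-read the flag; only a live stub will have set it back in the meantime. A toy, single-process sketch of the same idiom using the standard library (names are illustrative, not the backend's):

#include <atomic>
#include <chrono>
#include <thread>

// Toy model of the liveness handshake: the waiter clears the flag, waits
// with a timeout, and reads it back; only a live peer re-sets it.
std::atomic<bool> peer_health{true};

bool PopWithLivenessCheck()
{
  peer_health.store(false);                              // 1. clear flag
  std::this_thread::sleep_for(std::chrono::seconds(1));  // 2. timed Pop()
  return peer_health.load();                             // 3. re-check flag
}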

src/stub_launcher.h

Lines changed: 2 additions & 1 deletion
@@ -146,7 +146,8 @@ class StubLauncher {
   void KillStubProcess();

   // Get a message from the stub process
-  void ReceiveMessageFromStub(bi::managed_external_buffer::handle_t& message);
+  TRITONSERVER_Error* ReceiveMessageFromStub(
+      bi::managed_external_buffer::handle_t& message);

   // Wait for stub process
   void WaitForStubProcess();
