triton-inference-server · kthui · Jun 6, 2024 · May 31, 2024 · May 31, 2024 · Jun 3, 2024
diff --git a/src/infer_request.cc b/src/infer_request.cc
@@ -74,7 +74,7 @@ InferRequest::InferRequest(
   pb_cancel_ =
       std::make_shared<PbCancel>(response_factory_address_, request_address_);
   response_sender_ = std::make_shared<ResponseSender>(
-      request_address_, response_factory_address_,
+      request_address_, response_factory_address_, nullptr /* is_decoupled */,
       Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_);
 #endif
 }
@@ -272,7 +272,8 @@ InferRequest::SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool)
 std::unique_ptr<InferRequest>
 InferRequest::LoadFromSharedMemory(
     std::unique_ptr<SharedMemoryManager>& shm_pool,
-    bi::managed_external_buffer::handle_t request_handle, bool open_cuda_handle)
+    bi::managed_external_buffer::handle_t request_handle, bool open_cuda_handle,
+    bool const* is_model_decoupled)
 {
   AllocatedSharedMemory<char> infer_request_shm =
       shm_pool->Load<char>(request_handle);
@@ -328,7 +329,7 @@ InferRequest::LoadFromSharedMemory(
   return std::unique_ptr<InferRequest>(new InferRequest(
       infer_request_shm, request_id_shm, correlation_id_shm,
       requested_output_names_shm, model_name_shm, input_tensors, parameters_shm,
-      infer_trace_shm));
+      infer_trace_shm, is_model_decoupled));
 }
 
 InferRequest::InferRequest(
@@ -339,7 +340,8 @@ InferRequest::InferRequest(
     std::unique_ptr<PbString>& model_name_shm,
     std::vector<std::shared_ptr<PbTensor>>& input_tensors,
     std::unique_ptr<PbString>& parameters_shm,
-    std::unique_ptr<InferenceTrace>& infer_trace_shm)
+    std::unique_ptr<InferenceTrace>& infer_trace_shm,
+    bool const* is_model_decoupled)
     : infer_request_shm_(std::move(infer_request_shm)),
       request_id_shm_(std::move(request_id_shm)),
       requested_output_names_shm_(std::move(requested_output_names_shm)),
@@ -387,7 +389,7 @@ InferRequest::InferRequest(
   pb_cancel_ =
       std::make_shared<PbCancel>(response_factory_address_, request_address_);
   response_sender_ = std::make_shared<ResponseSender>(
-      request_address_, response_factory_address_,
+      request_address_, response_factory_address_, is_model_decoupled,
       Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_);
 #endif
 }
@@ -402,13 +404,6 @@ InferRequest::IsCancelled()
 std::shared_ptr<ResponseSender>
 InferRequest::GetResponseSender()
 {
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
-  if (!stub->IsDecoupled()) {
-    throw PythonBackendException(
-        "'get_response_sender' function must be called only when the model is "
-        "using the decoupled transaction policy.");
-  }
-
   return response_sender_;
 }
 

diff --git a/src/infer_request.h b/src/infer_request.h
@@ -118,7 +118,7 @@ class InferRequest {
   static std::unique_ptr<InferRequest> LoadFromSharedMemory(
       std::unique_ptr<SharedMemoryManager>& shm_pool,
       bi::managed_external_buffer::handle_t request_handle,
-      bool open_cuda_handle);
+      bool open_cuda_handle, bool const* is_model_decoupled);
 
   /// Disallow copying the inference request object.
   DISALLOW_COPY_AND_ASSIGN(InferRequest);
@@ -135,7 +135,8 @@ class InferRequest {
       std::unique_ptr<PbString>& model_name_shm,
       std::vector<std::shared_ptr<PbTensor>>& input_tensors,
       std::unique_ptr<PbString>& parameters_shm,
-      std::unique_ptr<InferenceTrace>& infer_trace_shm);
+      std::unique_ptr<InferenceTrace>& infer_trace_shm,
+      bool const* is_model_decoupled);
 
   std::string request_id_;
   CorrelationId correlation_id_;