Commit ef56414

[XNNPACK] Share workspace across delegate instances
Differential Revision: D61251056
Pull Request resolved: #4526
1 parent 1cb97e0 commit ef56414

File tree: 5 files changed (+75, -5 lines)


backends/xnnpack/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -32,6 +32,13 @@ if(NOT PYTHON_EXECUTABLE)
   resolve_python_executable()
 endif()
 
+# NB: Enabling this will serialize execution of delegate instances
+# Keeping this OFF by default to maintain existing behavior, to be revisited.
+option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE "Enable workspace sharing across different delegate instances" OFF)
+if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE)
+  add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE)
+endif()
+
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 set(_common_compile_options -Wno-deprecated-declarations -fPIC)
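
The option above only injects a compile-time definition; behavior changes only where the sources check ENABLE_XNNPACK_SHARED_WORKSPACE. A minimal sketch of that kind of gate, under the assumption that the project was configured with -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON (the helper below is hypothetical, for illustration only):

// gate_check.cpp - hypothetical illustration, not part of this change.
#include <cstdio>

static bool workspace_sharing_enabled() {
#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
  return true;   // injected by add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE)
#else
  return false;  // default OFF: existing per-instance behavior is preserved
#endif
}

int main() {
  std::printf("XNNPACK shared workspace: %s\n",
              workspace_sharing_enabled() ? "enabled" : "disabled");
  return 0;
}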

backends/xnnpack/runtime/XNNCompiler.cpp

Lines changed: 18 additions & 2 deletions
@@ -1612,7 +1612,8 @@ __ET_NODISCARD Error XNNCompiler::compileModel(
     const void* buffer_pointer,
     size_t num_bytes,
     XNNExecutor* executor,
-    MemoryAllocator* runtime_allocator) {
+    MemoryAllocator* runtime_allocator,
+    xnn_workspace_t workspace) {
   Result<XNNHeader> header = XNNHeader::Parse(buffer_pointer, num_bytes);
   const uint8_t* flatbuffer_data = nullptr;
   const uint8_t* constant_data = nullptr;
@@ -1708,11 +1709,26 @@ __ET_NODISCARD Error XNNCompiler::compileModel(
 #endif
 
   xnn_runtime_t runtime_ptr = nullptr;
-  status = xnn_create_runtime_v2(
+
+#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
+  ET_CHECK_OR_RETURN_ERROR(
+      workspace != nullptr, Internal, "Failed to initialize XNNPACK workspace");
+  status = xnn_create_runtime_v4(
+      subgraph.get(),
+      /*weight_cache=*/nullptr, // TODO - support weight cache
+      workspace,
+      torch::executorch::threadpool::get_pthreadpool(),
+      runtime_flags,
+      &runtime_ptr);
+#else
+  status = xnn_create_runtime_v3(
       subgraph.get(),
+      /*weight_cache=*/nullptr, // TODO - support weight cache
       torch::executorch::threadpool::get_pthreadpool(),
       runtime_flags,
       &runtime_ptr);
+#endif
+
   ET_CHECK_OR_RETURN_ERROR(
       xnn_status_success == status,
       Internal,
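
For reference, the shared-workspace branch above follows the XNNPACK pattern in which every runtime created with xnn_create_runtime_v4 against the same xnn_workspace_t draws its intermediate-tensor memory from that single allocation. The sketch below is illustrative only; it assumes the subgraphs and threadpool have already been built elsewhere, and its function name is made up for this example:

#include <xnnpack.h>

// Illustrative sketch: build two runtimes that share one workspace.
// subgraph_a / subgraph_b stand in for the subgraphs that compileModel
// deserializes from the delegate payload; constructing them is omitted.
xnn_status create_shared_workspace_runtimes(
    xnn_subgraph_t subgraph_a,
    xnn_subgraph_t subgraph_b,
    pthreadpool_t threadpool,
    xnn_workspace_t* workspace_out,
    xnn_runtime_t* runtime_a_out,
    xnn_runtime_t* runtime_b_out) {
  xnn_status status = xnn_create_workspace(workspace_out);
  if (status != xnn_status_success) {
    return status;
  }
  // Both runtimes reference *workspace_out; XNNPACK sizes it to the larger
  // of the two requirements, which is why executions must not overlap.
  status = xnn_create_runtime_v4(
      subgraph_a, /*weights_cache=*/nullptr, *workspace_out, threadpool,
      /*flags=*/0, runtime_a_out);
  if (status != xnn_status_success) {
    return status;
  }
  return xnn_create_runtime_v4(
      subgraph_b, /*weights_cache=*/nullptr, *workspace_out, threadpool,
      /*flags=*/0, runtime_b_out);
}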

backends/xnnpack/runtime/XNNCompiler.h

Lines changed: 2 additions & 1 deletion
@@ -29,7 +29,8 @@ class XNNCompiler {
       const void* buffer_pointer,
       size_t num_bytes,
       XNNExecutor* executor,
-      MemoryAllocator* runtime_allocator);
+      MemoryAllocator* runtime_allocator,
+      xnn_workspace_t workspace);
 };
 
 } // namespace delegate

backends/xnnpack/runtime/XNNPACKBackend.cpp

Lines changed: 45 additions & 2 deletions
@@ -11,7 +11,9 @@
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
 #include <executorch/runtime/platform/profiler.h>
+
 #include <memory>
+#include <mutex>
 
 #pragma clang diagnostic ignored "-Wglobal-constructors"
 
@@ -22,6 +24,36 @@ class XnnpackBackend final : public PyTorchBackendInterface {
  public:
   ~XnnpackBackend() = default;
 
+  XnnpackBackend() {
+    // Initialize XNNPACK
+    xnn_status status = xnn_initialize(/*allocator=*/nullptr);
+    if (status != xnn_status_success) {
+      ET_LOG(
+          Error,
+          "Failed to initialize, XNNPACK status: 0x%x",
+          (unsigned int)status);
+      return;
+    }
+
+#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
+    // Create a workspace for the XNNExecutor to use. This workspace will be
+    // shared across all delegate instances.
+    ET_LOG(Debug, "Creating XNN workspace");
+    xnn_workspace_t workspace = nullptr;
+    status = xnn_create_workspace(&workspace);
+    if (status != xnn_status_success) {
+      ET_LOG(
+          Error,
+          "Failed to create XNN workspace, XNNPACK status: 0x%x",
+          (unsigned int)status);
+      workspace = nullptr;
+      return;
+    }
+    workspace_.reset(workspace);
+    ET_LOG(Debug, "Created XNN workspace: %p", workspace_.get());
+#endif // ENABLE_XNNPACK_SHARED_WORKSPACE
+  }
+
   bool is_available() const override {
     return xnn_status_success == xnn_initialize(/*allocator=*/nullptr);
   }
@@ -38,12 +70,12 @@ class XnnpackBackend final : public PyTorchBackendInterface {
     // new and since this type is not trivially destructible, we must call the
     // destructor manually in destroy().
     new (executor) xnnpack::delegate::XNNExecutor;
-
     Error err = xnnpack::delegate::XNNCompiler::compileModel(
         processed->data(),
         processed->size(),
         executor,
-        context.get_runtime_allocator());
+        context.get_runtime_allocator(),
+        workspace_.get());
     // This backend does not need its processed data after compiling the model.
     processed->Free();
 
@@ -65,6 +97,10 @@ class XnnpackBackend final : public PyTorchBackendInterface {
       EValue** args) const override {
     auto executor = static_cast<xnnpack::delegate::XNNExecutor*>(handle);
 
+#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
+    const std::lock_guard<std::mutex> lock(workspace_mutex_);
+#endif
+
     // Prepare Inputs/Outputs and Propagate Input Shapes
     Error err = executor->prepare_args(args);
     if (err != Error::Ok) {
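
Aside: the shared workspace is mutable scratch memory, so two delegate instances executing at the same time would overwrite each other's intermediate tensors; taking workspace_mutex_ serializes execute() across instances (this is the serialization the CMake comment warns about). A reduced, standalone sketch of the same pattern, with hypothetical names rather than the backend's API:

#include <mutex>

// All users of one shared scratch region take the same mutex, so only one
// "runtime" touches the region at a time.
struct SharedScratch {
  std::mutex mutex;
  // ... workspace memory would live here ...
};

void run_with_shared_scratch(SharedScratch& scratch /*, runtime, args */) {
  const std::lock_guard<std::mutex> lock(scratch.mutex);
  // Safe to reuse/resize the scratch memory and invoke the runtime here;
  // other delegate instances block until this call returns.
}
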
@@ -94,6 +130,13 @@ class XnnpackBackend final : public PyTorchBackendInterface {
       executor->~XNNExecutor();
     }
   }
+
+ private:
+  // This is a global workspace for all delegate instances.
+  mutable std::mutex workspace_mutex_;
+  std::unique_ptr<xnn_workspace, decltype(&xnn_release_workspace)> workspace_{
+      nullptr,
+      &xnn_release_workspace};
 };
 
 namespace {
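
The workspace handle is a C object that must be freed with xnn_release_workspace, so the new member wraps it in a std::unique_ptr with a function-pointer deleter. A reduced sketch of that ownership pattern, assuming only the XNNPACK C API (the factory function below is hypothetical):

#include <memory>
#include <xnnpack.h>

// RAII ownership of an xnn_workspace_t: when the unique_ptr is destroyed
// (e.g. the owning backend is torn down), xnn_release_workspace runs
// automatically, so no manual cleanup path is needed.
using WorkspacePtr =
    std::unique_ptr<xnn_workspace, decltype(&xnn_release_workspace)>;

WorkspacePtr make_workspace() {
  xnn_workspace_t raw = nullptr;
  if (xnn_create_workspace(&raw) != xnn_status_success) {
    return WorkspacePtr(nullptr, &xnn_release_workspace);
  }
  return WorkspacePtr(raw, &xnn_release_workspace);
}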

backends/xnnpack/targets.bzl

Lines changed: 3 additions & 0 deletions
@@ -36,7 +36,10 @@ def define_common_targets():
             "@EXECUTORCH_CLIENTS",
         ],
         preprocessor_flags = [
+            # Uncomment to enable per operator timings
             # "-DENABLE_XNNPACK_PROFILING",
+            # Uncomment to enable workspace sharing across delegates
+            # "-DENABLE_XNNPACK_SHARED_WORKSPACE"
         ],
         exported_deps = [
             "//executorch/runtime/backend:interface",
