
Commit aed32c4

SS-JIA authored and facebook-github-bot committed
Use lazy descriptor pool allocation (#2285)
Summary:
Pull Request resolved: #2285

## Context

In Vulkan, memory for Descriptor Sets (which are used to bind data to shader arguments) must be pre-allocated. Previously, the convention was to allocate a large number of descriptor sets upon creation of a Vulkan Context. While this worked well in the Lite Interpreter, where only a single global Vulkan context is used, it leads to over-allocating descriptor sets in the Vulkan Delegate, where every `ComputeGraph` has its own dedicated Context. pytorch/pytorch#121134 allows the Descriptor Set pool to be initialized in a deferred fashion, which means a `ComputeGraph` can count the total number of descriptors needed across all the compute shaders that will be encoded, and then allocate a Descriptor Set Pool of the appropriate size.

## Implementation Overview

1. When constructing a `ComputeGraph`, make sure that the descriptor pool config contains 0 for the maximum number of sets. This ensures that no descriptor pool is initialized when constructing the graph's `api::Context` instance.
2. When building the graph, `ExecuteNode` and `PrepackNode` call `graph.update_descriptor_counts(shader)` upon construction, which allows `ComputeGraph` to count the total number of descriptor sets needed.
3. There is a separate descriptor count object for prepack and execute, since they correspond to different command buffers.
4. Before encoding any command buffers, call `graph.prepare()`, which constructs a descriptor pool config from the descriptor counts.

## Notes

One interesting finding is that I had to apply a safety factor to the descriptor counts to prevent the pool from running out of memory. This was reproducible on both Linux and Android. A more robust design, e.g. as discussed [here](https://www.reddit.com/r/vulkan/comments/17v66fi/question_about_descriptor_pool_allocations/), may be to maintain a separate descriptor pool for each layout type. We should revisit this refactor at a later time.

bypass-github-export-checks

Reviewed By: jorgep31415

Differential Revision: D54603935

fbshipit-source-id: eb04403b5f0967d69b390153c778b58bd940004e
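
To make the flow concrete, here is a minimal sketch of the call sequence described above, using only the APIs that appear in the diffs below; `build_nodes_from_flatbuffer` is a hypothetical stand-in for the existing `GraphBuilder` step, and the counts in the comments are illustrative:

```cpp
#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>

using namespace at::native::vulkan;

void sketch_lazy_descriptor_pool_flow() {
  // GraphConfig() now requests 0 for descriptorPoolMaxSets, so the api::Context
  // owned by the graph allocates no descriptor pool at construction time.
  ComputeGraph graph{GraphConfig()};

  // Building the graph constructs PrepackNode / ExecuteNode objects; each node
  // constructor calls graph.update_descriptor_counts(shader, /*execute=*/...),
  // adding one descriptor set plus one descriptor per kernel_layout entry to
  // either the prepack tally or the execute tally.
  build_nodes_from_flatbuffer(graph); // hypothetical stand-in for GraphBuilder

  // prepare() merges the two tallies field-wise (std::max), scales the result
  // by descriptorPoolSafetyFactor (1.25 by default, e.g. 100 uniform buffers
  // -> ceil(100 * 1.25) = 125), and initializes the descriptor pool.
  graph.prepare();

  // Command buffers are encoded only after the pool exists.
  graph.encode_prepack();
  graph.prepack();
}
```

The same ordering appears in the VulkanBackend.cpp diff below: build the graph, then `prepare()`, then `encode_prepack()` and `prepack()`.
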
1 parent 0570294 commit aed32c4

12 files changed: +198 -88 lines


backends/vulkan/runtime/VulkanBackend.cpp

Lines changed: 3 additions & 34 deletions
@@ -62,39 +62,6 @@ api::ScalarType get_scalar_type(const vkgraph::VkDataType& vk_datatype) {
   }
 }
 
-GraphConfig generate_config() {
-  const uint32_t submit_frequency = UINT32_MAX;
-
-  const api::CommandPoolConfig cmd_config{
-      4u, // cmdPoolInitialSize
-      2u, // cmdPoolBatchSize
-  };
-
-  const api::DescriptorPoolConfig descriptor_pool_config{
-      1024u, // descriptorPoolMaxSets
-      1024u, // descriptorUniformBufferCount
-      1024u, // descriptorStorageBufferCount
-      1024u, // descriptorCombinedSamplerCount
-      1024u, // descriptorStorageImageCount
-      32u, // descriptorPileSizes
-  };
-
-  const api::QueryPoolConfig query_pool_config{};
-
-  const api::ContextConfig context_config{
-      submit_frequency, // cmdSubmitFrequency
-      cmd_config, // cmdPoolConfig
-      descriptor_pool_config, // descriptorPoolConfig
-      query_pool_config, // queryPoolConfig
-  };
-
-  const GraphConfig graph_config{
-      context_config,
-  };
-
-  return graph_config;
-}
-
 class GraphBuilder {
   ComputeGraph* compute_graph_;
   VkGraphPtr flatbuffer_;
@@ -269,6 +236,8 @@ class VulkanBackend final : public PyTorchBackendInterface {
 
   builder.build_graph();
 
+  compute_graph->prepare();
+
   compute_graph->encode_prepack();
   compute_graph->prepack();
 
@@ -284,7 +253,7 @@ class VulkanBackend final : public PyTorchBackendInterface {
   ComputeGraph* compute_graph = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(
       context.get_runtime_allocator(), ComputeGraph);
 
-  new (compute_graph) ComputeGraph(generate_config());
+  new (compute_graph) ComputeGraph(GraphConfig());
 
   Error err = compileModel(processed->data(), compute_graph);

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 65 additions & 0 deletions
@@ -18,6 +18,8 @@ namespace vulkan {
 
 ComputeGraph::ComputeGraph(GraphConfig config)
     : config_{config},
+      prepack_descriptor_counts_{},
+      execute_descriptor_counts_{},
       context_{new api::Context(
           api::runtime()->default_adapter_i(),
           config_.contextConfig)},
@@ -27,6 +29,19 @@ ComputeGraph::ComputeGraph(GraphConfig config)
       execute_nodes_{},
       inputs_{},
       outputs_{} {
+  // Ensure that descriptor counts are initialized to 0
+  prepack_descriptor_counts_.descriptorPoolMaxSets = 0;
+  prepack_descriptor_counts_.descriptorUniformBufferCount = 0;
+  prepack_descriptor_counts_.descriptorStorageBufferCount = 0;
+  prepack_descriptor_counts_.descriptorCombinedSamplerCount = 0;
+  prepack_descriptor_counts_.descriptorStorageImageCount = 0;
+
+  execute_descriptor_counts_.descriptorPoolMaxSets = 0;
+  execute_descriptor_counts_.descriptorUniformBufferCount = 0;
+  execute_descriptor_counts_.descriptorStorageBufferCount = 0;
+  execute_descriptor_counts_.descriptorCombinedSamplerCount = 0;
+  execute_descriptor_counts_.descriptorStorageImageCount = 0;
+
   context_->set_cmd(/*reusable = */ true);
 }
 
@@ -39,6 +54,33 @@ ComputeGraph::~ComputeGraph() {
   context_->flush();
 }
 
+void ComputeGraph::update_descriptor_counts(
+    const api::ShaderInfo& shader_info,
+    bool execute) {
+  api::DescriptorPoolConfig* config =
+      execute ? &execute_descriptor_counts_ : &prepack_descriptor_counts_;
+
+  config->descriptorPoolMaxSets += 1;
+  for (const VkDescriptorType arg_type : shader_info.kernel_layout) {
+    switch (arg_type) {
+      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+        config->descriptorUniformBufferCount += 1;
+        break;
+      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+        config->descriptorStorageBufferCount += 1;
+        break;
+      case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+        config->descriptorCombinedSamplerCount += 1;
+        break;
+      case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+        config->descriptorStorageImageCount += 1;
+        break;
+      default:
+        VK_THROW("Unsupported descriptor type!");
+    }
+  }
+}
+
 ValueRef ComputeGraph::add_tensor(
     const std::vector<int64_t>& sizes,
     const api::ScalarType dtype,
@@ -138,6 +180,29 @@ void ComputeGraph::copy_from_staging(
   copy_staging_to_ptr(staging, data, nbytes);
 }
 
+void ComputeGraph::prepare() {
+#define MERGE_FIELD(field)                      \
+  static_cast<uint32_t>(std::ceil(              \
+      std::max(                                 \
+          execute_descriptor_counts_.field,     \
+          prepack_descriptor_counts_.field) *   \
+      config_.descriptorPoolSafetyFactor))
+
+  api::DescriptorPoolConfig config{
+      MERGE_FIELD(descriptorPoolMaxSets),
+      MERGE_FIELD(descriptorUniformBufferCount),
+      MERGE_FIELD(descriptorStorageBufferCount),
+      MERGE_FIELD(descriptorCombinedSamplerCount),
+      MERGE_FIELD(descriptorStorageImageCount),
+      1u,
+  };
+
+  if (!context_->descriptor_pool()) {
+    context_->descriptor_pool().init(config);
+  }
+#undef MERGE_FIELD
+}
+
 void ComputeGraph::encode_prepack() {
   for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
     node->encode(this);

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 13 additions & 0 deletions
@@ -60,6 +60,9 @@ class ComputeGraph final {
 
  private:
   GraphConfig config_;
+  api::DescriptorPoolConfig prepack_descriptor_counts_;
+  api::DescriptorPoolConfig execute_descriptor_counts_;
+
   std::unique_ptr<api::Context> context_;
   std::vector<SharedObject> shared_objects_;
   std::vector<Value> values_;
@@ -87,6 +90,10 @@
     return outputs_;
   }
 
+  void update_descriptor_counts(
+      const api::ShaderInfo& shader_info,
+      bool execute);
+
   /*
    * Returns the value at a particular reference
    */
@@ -163,6 +170,12 @@
 
   SharedObject& get_shared_object(const int64_t idx);
 
+  //
+  // Graph Preparation
+  //
+
+  void prepare();
+
   //
   // Input/Output
   //

backends/vulkan/runtime/graph/GraphConfig.cpp

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/GraphConfig.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+GraphConfig::GraphConfig() {
+  // No automatic submissions
+  const uint32_t submit_frequency = UINT32_MAX;
+
+  // Only one command buffer will be encoded at a time
+  const api::CommandPoolConfig cmd_config{
+      1u, // cmdPoolInitialSize
+      1u, // cmdPoolBatchSize
+  };
+
+  // Use lazy descriptor pool initialization by default; the graph runtime will
+  // tally up the number of descriptor sets needed while building the graph and
+  // trigger descriptor pool initialization with exact sizes before encoding the
+  // command buffer.
+  const api::DescriptorPoolConfig descriptor_pool_config{
+      0u, // descriptorPoolMaxSets
+      0u, // descriptorUniformBufferCount
+      0u, // descriptorStorageBufferCount
+      0u, // descriptorCombinedSamplerCount
+      0u, // descriptorStorageImageCount
+      0u, // descriptorPileSizes
+  };
+
+  const api::QueryPoolConfig query_pool_config{};
+
+  const api::ContextConfig context_config{
+      submit_frequency, // cmdSubmitFrequency
+      cmd_config, // cmdPoolConfig
+      descriptor_pool_config, // descriptorPoolConfig
+      query_pool_config, // queryPoolConfig
+  };
+
+  contextConfig = context_config;
+
+  // Empirically selected safety factor. If descriptor pools start running out
+  // of memory, increase this safety factor.
+  descriptorPoolSafetyFactor = 1.25;
+}
+
+} // namespace vulkan
+} // namespace native
+} // namespace at

backends/vulkan/runtime/graph/GraphConfig.h

Lines changed: 10 additions & 0 deletions
@@ -18,6 +18,16 @@ namespace vulkan {
 
 struct GraphConfig final {
   api::ContextConfig contextConfig;
+
+  // Creating a descriptor pool with exactly the number of descriptors tallied
+  // by iterating through the shader layouts of shaders used in the graph risks
+  // the descriptor pool running out of memory, therefore apply a safety factor
+  // to descriptor counts when creating the descriptor pool to mitigate this
+  // risk.
+  float descriptorPoolSafetyFactor;
+
+  // Generate a default graph config with pre-configured settings
+  explicit GraphConfig();
 };
 
 } // namespace vulkan
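
Because `descriptorPoolSafetyFactor` is a public member of `GraphConfig`, a caller that still exhausts its descriptor pool could presumably raise the factor before constructing the graph. A minimal sketch with a purely illustrative value (this override is not part of the commit):

```cpp
// Hypothetical tuning: start from the defaults (lazy pool, safety factor 1.25)
// and raise the factor if the descriptor pool still runs out of memory.
GraphConfig config;
config.descriptorPoolSafetyFactor = 1.5f; // illustrative value only
ComputeGraph graph{config};
```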

backends/vulkan/runtime/graph/ops/ExecuteNode.cpp

Lines changed: 15 additions & 0 deletions
@@ -16,6 +16,21 @@ namespace at {
 namespace native {
 namespace vulkan {
 
+ExecuteNode::ExecuteNode(
+    ComputeGraph& graph,
+    const api::ShaderInfo& shader,
+    const api::utils::uvec3& global_workgroup_size,
+    const api::utils::uvec3& local_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    api::UniformParamsBuffer&& params)
+    : shader_(shader),
+      global_workgroup_size_(global_workgroup_size),
+      local_workgroup_size_(local_workgroup_size),
+      args_(args),
+      params_(std::move(params)) {
+  graph.update_descriptor_counts(shader, /*execute = */ true);
+}
+
 void ExecuteNode::encode(ComputeGraph* graph) {
   api::Context* const context = graph->context();
   api::PipelineBarrier pipeline_barrier{};

backends/vulkan/runtime/graph/ops/ExecuteNode.h

Lines changed: 2 additions & 6 deletions
@@ -50,16 +50,12 @@ class ExecuteNode final {
 
  public:
   ExecuteNode(
+      ComputeGraph& graph,
       const api::ShaderInfo& shader,
       const api::utils::uvec3& global_workgroup_size,
       const api::utils::uvec3& local_workgroup_size,
       const std::vector<ArgGroup>& args,
-      api::UniformParamsBuffer&& params)
-      : shader_(shader),
-        global_workgroup_size_(global_workgroup_size),
-        local_workgroup_size_(local_workgroup_size),
-        args_(args),
-        params_(std::move(params)) {}
+      api::UniformParamsBuffer&& params);
 
   ~ExecuteNode() = default;

backends/vulkan/runtime/graph/ops/PrepackNode.cpp

Lines changed: 17 additions & 0 deletions
@@ -17,6 +17,23 @@ namespace at {
 namespace native {
 namespace vulkan {
 
+PrepackNode::PrepackNode(
+    ComputeGraph& graph,
+    const api::ShaderInfo& shader,
+    const api::utils::uvec3& global_workgroup_size,
+    const api::utils::uvec3& local_workgroup_size,
+    const ValueRef tref,
+    const ValueRef packed,
+    api::UniformParamsBuffer&& params)
+    : shader_(shader),
+      global_workgroup_size_(global_workgroup_size),
+      local_workgroup_size_(local_workgroup_size),
+      tref_(tref),
+      packed_(packed),
+      params_(std::move(params)) {
+  graph.update_descriptor_counts(shader, /*execute = */ false);
+}
+
 void PrepackNode::encode(ComputeGraph* graph) {
   api::Context* const context = graph->context();
   api::PipelineBarrier pipeline_barrier{};

backends/vulkan/runtime/graph/ops/PrepackNode.h

Lines changed: 2 additions & 7 deletions
@@ -33,18 +33,13 @@ class PrepackNode final {
 
  public:
   PrepackNode(
+      ComputeGraph& graph,
       const api::ShaderInfo& shader,
       const api::utils::uvec3& global_workgroup_size,
       const api::utils::uvec3& local_workgroup_size,
       const ValueRef tref,
       const ValueRef packed,
-      api::UniformParamsBuffer&& params)
-      : shader_(shader),
-        global_workgroup_size_(global_workgroup_size),
-        local_workgroup_size_(local_workgroup_size),
-        tref_(tref),
-        packed_(packed),
-        params_(std::move(params)) {}
+      api::UniformParamsBuffer&& params);
 
   ~PrepackNode() = default;

backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp

Lines changed: 1 addition & 0 deletions
@@ -72,6 +72,7 @@ void add_arithmetic_node(
   api::UniformParamsBuffer params(graph.context(), block);
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
       shader,
       global_size,
       local_size,

backends/vulkan/runtime/graph/ops/impl/Staging.cpp

Lines changed: 3 additions & 1 deletion
@@ -48,6 +48,7 @@ void add_staging_to_tensor_node(
       graph.context(), create_staging_params(t_out));
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
       shader,
       global_size,
       local_size,
@@ -90,6 +91,7 @@ void add_tensor_to_staging_node(
   }
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
       shader,
       global_size,
       local_size,
@@ -112,7 +114,7 @@ ValueRef prepack(ComputeGraph& graph, const ValueRef vref) {
   api::UniformParamsBuffer params(graph.context(), sp);
 
   graph.prepack_nodes().emplace_back(new PrepackNode(
-      shader, global_size, local_size, vref, v, std::move(params)));
+      graph, shader, global_size, local_size, vref, v, std::move(params)));
 
   return v;
 }
