[ET-VK][ez] Enable dynamic shape support when using push constants (#11302)

pytorchbot · web-flow · commit 70532b055f52 · 2025-06-02T20:21:39.000-04:00
## Changes * Call `encode_execute()` upon resize in `VulkanBackend.cpp` * Minor update to `DispatchNode` to store the push constant data array as a persistent member of the class ## Motivation Passing in tensor metadata (i.e. sizes, strides) via push constants is typically more performant than passing them via a UBO (uniform buffer object). However, dynamic shapes currently do not work when push constants are used, because the tensor metadata contained in the push constants does not get updated. It appears that `vkCmdPushConstants` sets the push constants when encoding the command buffer; however, the push constants will not be updated if the command buffer is submitted for execution multiple times. Therefore, to update push constant values **the command buffer needs to be re-encoded**. ## Performance Impact This may add a small performance overhead (i.e. re-encoding the command buffer) when executing models with dynamic shapes. Models that do not trigger tensor resizing will not be impacted. However, I measured the impact on a llama 3.2 1B model and the impact of re-encoding a command buffer appears to be negligible. In any case, re-encoding the command buffer is a "necessary evil" when working with dynamic shapes; otherwise, the tensor metadata seen by shaders may never get updated. Furthermore, re-encoding the command buffer provides an opportunity to adjust global work group sizing to match current tensor sizes, which may have a huge performance impact when maximum tensor sizes far exceed what tensor sizes will realistically be during inference (one instance of this is for transformer models when the max sequence length is very long). Differential Revision: [D75686051](https://our.internmc.facebook.com/intern/diff/D75686051/)
diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -499,6 +499,8 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
     compute_graph->encode_prepack();
     compute_graph->prepack();
 
+    // TODO(ssjia): remove this once we can batch compile compute pipelines
+    // during prepare().
     compute_graph->encode_execute();
 
     return Error::Ok;
@@ -567,9 +569,14 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
       }
     }
 
+    // propagate_resize() will re-encode the command buffer so that push
+    // constants are updated and DynamicDispatchNode can update the compute
+    // shader, global workgroup size, and local workgroup size to perform the
+    // model inference.
     if (should_propagate_resize) {
       compute_graph->propagate_resize();
     }
+
     compute_graph->execute();
 
     for (size_t i = 0; i < compute_graph->outputs().size(); i++) {
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -678,11 +678,12 @@ void ComputeGraph::encode_execute() {
   }
 }
 
-void ComputeGraph::execute() const {
+void ComputeGraph::execute() {
   vkapi::VulkanFence fence = context_->fences().get_fence();
   context_->submit_cmd_to_gpu(fence.get_submit_handle());
   fence.wait();
   context_->fences().return_fence(fence);
+  execute_count_++;
 }
 
 void ComputeGraph::resize_input(
@@ -696,6 +697,7 @@ void ComputeGraph::propagate_resize() {
   for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
     node->trigger_resize(this);
   }
+  encode_execute();
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -187,6 +187,7 @@ class ComputeGraph final {
 
  protected:
   size_t values_in_use_ = 0;
+  size_t execute_count_ = 0;
 
  public:
   //
@@ -745,7 +746,7 @@ class ComputeGraph final {
   //
 
   void encode_execute();
-  void execute() const;
+  void execute();
 
   //
   // Dynamic Shape support
@@ -762,6 +763,10 @@ class ComputeGraph final {
     return context_->adapter_ptr()->supports_int16_shader_types();
   }
 
+  inline size_t execute_count() const {
+    return execute_count_;
+  }
+
   /*
    * Check whether the GPU supports 8 bit buffers.
    */
diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp
@@ -46,15 +46,7 @@ void DispatchNode::encode(ComputeGraph* graph) {
 
   std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();
 
-  std::array<uint8_t, kMaxPushConstantSize> push_constants_data;
-  uint32_t push_constants_offset = 0;
-
-  for (const auto& push_constant : push_constants_) {
-    push_constants_offset += push_constant.write(
-        push_constants_data.data(),
-        push_constants_offset,
-        kMaxPushConstantSize);
-  }
+  write_push_constant_data();
 
   context->report_shader_dispatch_start(
       shader_.kernel_name,
@@ -63,7 +55,7 @@ void DispatchNode::encode(ComputeGraph* graph) {
       node_id_);
 
   vkapi::DescriptorSet descriptor_set = context->get_descriptor_set(
-      shader_, local_workgroup_size_, spec_vars_, push_constants_offset);
+      shader_, local_workgroup_size_, spec_vars_, push_constants_offset_);
 
   uint32_t idx = 0;
   idx = bind_values_to_descriptor_set(
@@ -76,10 +68,20 @@ void DispatchNode::encode(ComputeGraph* graph) {
       pipeline_barrier,
       shader_,
       global_workgroup_size_,
-      push_constants_data.data(),
-      push_constants_offset);
+      push_constants_data_.data(),
+      push_constants_offset_);
 
   context->report_shader_dispatch_end();
 }
 
+void DispatchNode::write_push_constant_data() {
+  push_constants_offset_ = 0;
+  for (const auto& push_constant : push_constants_) {
+    push_constants_offset_ += push_constant.write(
+        push_constants_data_.data(),
+        push_constants_offset_,
+        kMaxPushConstantSize);
+  }
+}
+
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.h b/backends/vulkan/runtime/graph/ops/DispatchNode.h
@@ -50,6 +50,12 @@ class DispatchNode : public ExecuteNode {
   const vkapi::SpecVarList spec_vars_;
   const std::vector<PushConstantDataInfo> push_constants_;
 
+  // For push constants
+  std::array<uint8_t, kMaxPushConstantSize> push_constants_data_{};
+  uint32_t push_constants_offset_ = 0;
+
+  void write_push_constant_data();
+
  public:
   operator bool() const {
     return shader_;
diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h
@@ -65,7 +65,7 @@ class ExecuteNode {
     (void)graph;
   }
 
-  inline void trigger_resize(ComputeGraph* graph) {
+  virtual inline void trigger_resize(ComputeGraph* graph) {
     if (resize_fn_ != nullptr) {
       resize_fn_(graph, args_, resize_args_);
     }
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -1660,9 +1660,8 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
   for (auto& new_sizes : new_sizes_list) {
     graph.get_tensor(a.value)->virtual_resize(new_sizes);
     graph.get_tensor(b.value)->virtual_resize(new_sizes);
-    graph.get_tensor(c)->virtual_resize(new_sizes);
     graph.get_tensor(d.value)->virtual_resize(new_sizes);
-    graph.get_tensor(e)->virtual_resize(new_sizes);
+    graph.propagate_resize();
 
     float val_a = new_sizes[1] + 4.0f;
     float val_b = new_sizes[2] + 1.5f;

Original file line number	Diff line number	Diff line change
`@@ -499,6 +499,8 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {`
`499`	`499`	`compute_graph->encode_prepack();`
`500`	`500`	`compute_graph->prepack();`
`501`	`501`
	`502`	`+ // TODO(ssjia): remove this once we can batch compile compute pipelines`
	`503`	`+ // during prepare().`
`502`	`504`	`compute_graph->encode_execute();`
`503`	`505`
`504`	`506`	`return Error::Ok;`
`@@ -567,9 +569,14 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {`
`567`	`569`	`}`
`568`	`570`	`}`
`569`	`571`
	`572`	`+ // propagate_resize() will re-encode the command buffer so that push`
	`573`	`+ // constants are updated and DynamicDispatchNode can update the compute`
	`574`	`+ // shader, global workgroup size, and local workgroup size to perform the`
	`575`	`+ // model inference.`
`570`	`576`	`if (should_propagate_resize) {`
`571`	`577`	`compute_graph->propagate_resize();`
`572`	`578`	`}`
	`579`	`+`
`573`	`580`	`compute_graph->execute();`
`574`	`581`
`575`	`582`	`for (size_t i = 0; i < compute_graph->outputs().size(); i++) {`
Original file line number	Diff line number	Diff line change
`@@ -678,11 +678,12 @@ void ComputeGraph::encode_execute() {`
`678`	`678`	`}`
`679`	`679`	`}`
`680`	`680`
`681`		`-void ComputeGraph::execute() const {`
	`681`	`+void ComputeGraph::execute() {`
`682`	`682`	`vkapi::VulkanFence fence = context_->fences().get_fence();`
`683`	`683`	`context_->submit_cmd_to_gpu(fence.get_submit_handle());`
`684`	`684`	`fence.wait();`
`685`	`685`	`context_->fences().return_fence(fence);`
	`686`	`+ execute_count_++;`
`686`	`687`	`}`
`687`	`688`
`688`	`689`	`void ComputeGraph::resize_input(`
`@@ -696,6 +697,7 @@ void ComputeGraph::propagate_resize() {`
`696`	`697`	`for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {`
`697`	`698`	`node->trigger_resize(this);`
`698`	`699`	`}`
	`700`	`+ encode_execute();`
`699`	`701`	`}`
`700`	`702`
`701`	`703`	`} // namespace vkcompute`
Original file line number	Diff line number	Diff line change
`@@ -65,7 +65,7 @@ class ExecuteNode {`
`65`	`65`	`(void)graph;`
`66`	`66`	`}`
`67`	`67`
`68`		`- inline void trigger_resize(ComputeGraph* graph) {`
	`68`	`+ virtual inline void trigger_resize(ComputeGraph* graph) {`
`69`	`69`	`if (resize_fn_ != nullptr) {`
`70`	`70`	`resize_fn_(graph, args_, resize_args_);`
`71`	`71`	`}`