
Commit 7e417f4

Esteban Padilla Cerdio authored and facebook-github-bot committed
Add Warp Size metric (#4298)
Summary:
Pull Request resolved: #4298

This very simple metric runs a kernel across an increasing number of workgroups until there is a noticeable increase in latency, as seen in the following graph: {F1762497995}

The shader uses an integer division as its metric, because it is a multi-cycle operation that puts the ALU to work and stops the SM from context switching.

As with other metrics, we start by obtaining the minimum number of iterations, NITER, that can run in 1000us, so as to have a baseline for comparison and to reduce timing noise. With this number of iterations, we run the kernel with an increasing number of threads. We also use a multidimensional global workgroup with a Y size of 1024, in the hope of saturating the ALUs and obtaining a better point of reference for the latency caused by adding warps. Once we detect a jump in latency, we can assume that this is the warp size.

More information can be found [here](https://www.microsoft.com/en-us/research/uploads/prod/2022/02/mobigpu_mobicom22_camera.pdf) on page 5.

Reviewed By: jorgep31415

Differential Revision: D59920169

fbshipit-source-id: 4ac9324e10f0ab1a72433fd7ce98ad5f5ab839e9
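To make the detection procedure above concrete, here is a minimal, self-contained C++ sketch of the same calibrate-then-sweep idea. It is not the code added in this commit: MeasureFn, calibrate_niter, is_jump, and the synthetic latency model in main() are hypothetical stand-ins for benchmark_on_gpu, ensure_min_niter, and DtJumpFinder, and the jump test here is a simple moving-average ratio rather than DtJumpFinder's detector.

#include <cstdint>
#include <deque>
#include <functional>
#include <iostream>
#include <numeric>

// Returns the latency (us) of one dispatch with `nthread` threads running
// `niter` loop iterations. On the GPU this would wrap benchmark_on_gpu and
// submit_compute_job; here it is an injectable callback (assumption).
using MeasureFn = std::function<double(uint32_t nthread, uint32_t niter)>;

// Grow NITER until a single-thread dispatch takes at least `target_us`,
// mirroring the ensure_min_niter(1000, ...) calibration step.
uint32_t calibrate_niter(const MeasureFn& measure, double target_us) {
  uint32_t niter = 1;
  while (measure(1, niter) < target_us) {
    niter *= 2; // lengthen the kernel until it dominates timing noise
  }
  return niter;
}

// Simplified jump detector (a stand-in for DtJumpFinder, not its algorithm):
// flag a sample that exceeds the mean of the last `window` samples by more
// than `rel_increase` (e.g. 0.5 == 50% slower).
bool is_jump(std::deque<double>& history, double sample, size_t window,
             double rel_increase) {
  bool jump = false;
  if (history.size() >= window) {
    const double mean =
        std::accumulate(history.begin(), history.end(), 0.0) / history.size();
    jump = sample > mean * (1.0 + rel_increase);
    history.pop_front();
  }
  history.push_back(sample);
  return jump;
}

// Sweep the thread count until latency jumps; the last count before the jump
// is taken as the warp size, with `fallback` (e.g. the reported Vulkan
// subgroup size) used when no jump is found.
uint32_t detect_warp_size(const MeasureFn& measure, uint32_t nthread_logic,
                          uint32_t fallback) {
  const uint32_t niter = calibrate_niter(measure, /*target_us=*/1000.0);
  std::deque<double> history;
  for (uint32_t nthread = 1; nthread <= nthread_logic; ++nthread) {
    const double time = measure(nthread, niter);
    std::cout << "nthread=" << nthread << " (" << time << " us)" << std::endl;
    if (is_jump(history, time, /*window=*/5, /*rel_increase=*/0.5)) {
      return nthread - 1;
    }
  }
  return fallback;
}

int main() {
  // Synthetic latency model for demonstration only: every extra group of 32
  // threads adds one "warp" worth of work, so latency steps up at 33, 65, ...
  auto fake_measure = [](uint32_t nthread, uint32_t niter) {
    return 1000.0 * ((nthread + 31) / 32) * (niter / 1024.0 + 1.0);
  };
  std::cout << "WarpSize," << detect_warp_size(fake_measure, 256, 32)
            << std::endl;
  return 0;
}

Against this synthetic curve the sweep reports WarpSize,32; in the actual tool the same role is played by bench() dispatching the warp_size shader with an increasing X dimension.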
1 parent ba052a4 · commit 7e417f4

File tree

3 files changed: +112 −0 lines changed
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#version 450 core

#define PRECISION ${PRECISION}

layout(std430) buffer;

${layout_declare_buffer(0, "w", "out_buff", DTYPE)}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int NITER = 1;

void main() {
  int sum = 0;
  for (int j = 0; j < NITER; ++j) {
    // Integer division is an exemplary multi-cycle instruction that can
    // hardly be optimized, thus reducing the impact of latency hiding.
    sum += j / 3;
    barrier();
  }
  out_buff[gl_GlobalInvocationID[0]] = sum;
}
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

warp_size:
  parameter_names_with_default_values:
    DTYPE: int
    STORAGE: buffer
  shader_variants:
    - NAME: warp_size

backends/vulkan/tools/gpuinfo/src/app.cpp

Lines changed: 70 additions & 0 deletions
@@ -7,6 +7,7 @@
  */
 
 #include <executorch/backends/vulkan/runtime/api/api.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
 #include <iostream>
 
 #include "stats.h"
@@ -20,6 +21,7 @@ class App {
   uint32_t max_shared_mem_size_;
   uint32_t sm_count_;
   uint32_t nthread_logic_;
+  uint32_t subgroup_size_;
 
  public:
   App() {
@@ -34,11 +36,24 @@ class App {
     nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
     buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();
     max_shared_mem_size_ = cl_device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
+
+    VkPhysicalDeviceSubgroupProperties subgroup_props{};
+    VkPhysicalDeviceProperties2 props2{};
+
+    props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
+    props2.pNext = &subgroup_props;
+    subgroup_props.sType =
+        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
+    vkGetPhysicalDeviceProperties2(
+        context()->adapter_ptr()->physical_handle(), &props2);
+    subgroup_size_ = subgroup_props.subgroupSize;
+
     std::cout << std::endl;
     std::cout << "SM count," << sm_count_ << std::endl;
     std::cout << "Logic Thread Count," << nthread_logic_ << std::endl;
     std::cout << "Cache Size," << buf_cache_size_ << std::endl;
     std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl;
+    std::cout << "SubGroup Size," << subgroup_size_ << std::endl;
   }
 
   void reg_count() {
@@ -313,6 +328,60 @@ class App {
     const uint32_t RANGE = max_shared_mem_size_;
     _bandwidth("Shared", RANGE);
   }
+
+  void warp_size() {
+    std::cout << "\n------ Warp Size ------" << std::endl;
+    const double COMPENSATE = 0.01;
+    const double THRESHOLD = 3;
+
+    uint32_t NITER;
+
+    auto bench = [&](uint32_t nthread) {
+      StorageBuffer out_buf(context(), vkapi::kInt, nthread_logic_);
+      vkapi::PipelineBarrier pipeline_barrier{};
+
+      auto shader_name = "warp_size";
+
+      auto time = benchmark_on_gpu(shader_name, 10, [&]() {
+        context()->submit_compute_job(
+            VK_KERNEL_FROM_STR(shader_name),
+            pipeline_barrier,
+            // Large number of work groups selected to potentially saturate all
+            // ALUs and thus have a better baseline for comparison.
+            {nthread, 1024, 1},
+            {nthread, 1, 1},
+            {SV(NITER)},
+            VK_NULL_HANDLE,
+            0,
+            out_buf.buffer());
+      });
+
+      return time;
+    };
+
+    ensure_min_niter(1000, NITER, [&]() { return bench(1); });
+
+    uint32_t warp_size = subgroup_size_;
+    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+
+    // We increase the number of threads until we hit a jump in the data.
+    uint32_t nthread = 1;
+    for (; nthread <= nthread_logic_; ++nthread) {
+      double time = bench(nthread);
+      std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)"
+                << std::endl;
+      if (dj.push(time)) {
+        warp_size = nthread - 1;
+        break;
+      }
+    }
+    if (nthread >= nthread_logic_) {
+      std::cout
+          << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size"
+          << std::endl;
+    }
+    std::cout << "WarpSize," << warp_size << std::endl;
+  }
 };
 
 int main(int argc, const char** argv) {
@@ -324,6 +393,7 @@ int main(int argc, const char** argv) {
   app.buf_bandwidth();
   app.ubo_bandwidth();
   app.shared_mem_bandwidth();
+  app.warp_size();
 
   return 0;
 }
