Add buffer cacheline size metric (#4228)

Esteban Padilla Cerdio · facebook-github-bot · commit 6903715b1990 · 2024-07-15T09:59:57.000-07:00
Summary: Pull Request resolved: #4228 This diff introduces a metric to GPUInfo that calculates the cacheline size of the buffer data pathway. In this experiment, all threads read from the cache with a varying stride. Reading two values from the same cacheline is cheap because the whole line is fetched as a block, regardless of which data we actually want. By varying the separation between the addresses of these two values, there will be a point where the shader will be forced to fetch two separate cachelines, which will have an effect on latency that we can detect. [This article](https://igoro.com/archive/gallery-of-processor-cache-effects/) has more information on the topic. Each run of the shader fetches the two values from different points in memory. The shader also has a seemingly redundant variable `zero` that keeps the compiler from optimizing the for loop away. The experiment will look like this: {F1754670481} Some useful concept definitions: NITER: The number of iterations the lowest stride needs in order to run for 1000 microseconds. All experiments will then run this number of times. This is to have a timing baseline and avoid timing errors. PITCH: A number of bytes of separation between cache lines that ensures that all concurrent groups are being used, and therefore a fetch from two different cache lines is sure to have a latency increase. STRIDE: The separation between the two values each thread reads; the cache line size is obtained experimentally from it. Increasing this until it reaches the cache line size should show a latency increase, giving us the result we are looking for. Reviewed By: jorgep31415 Differential Revision: D59649561 fbshipit-source-id: 2e82250d55929868982d17d1f405270897dcf9f4
diff --git a/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.glsl b/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.glsl
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+layout(std430) buffer;
+
+
+${layout_declare_buffer(0, "r", "source", DTYPE)}
+${layout_declare_buffer(1, "w", "destination", DTYPE)}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int niter = 1; // loop trip count in main()
+layout(constant_id = 4) const int stride = 1; // element offset between the two reads
+layout(constant_id = 5) const int pitch = 1; // element offset between invocations
+
+void main() { // each invocation sums two reads of source, niter times over
+  float c = 0;
+  for (int i = 0; i < niter; ++i) {
+    const int zero = i >> 31; // 0 for every i >= 0; keeps the loop from being optimized away
+    c += source[zero + pitch * gl_GlobalInvocationID[0]]; // first read: this invocation's base element
+    c += source[zero + stride + pitch * gl_GlobalInvocationID[0]]; // second read: stride elements further
+  }
+  destination[0] = c; // every invocation writes its sum to element 0
+}
diff --git a/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.yaml b/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.yaml
@@ -0,0 +1,12 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+buf_cacheline_size:
+  parameter_names_with_default_values:
+    DTYPE: float  # passed to the shader's layout_declare_buffer() calls
+    STORAGE: buffer
+  shader_variants:
+    - NAME: buf_cacheline_size
diff --git a/backends/vulkan/tools/gpuinfo/glsl/reg_count.glsl b/backends/vulkan/tools/gpuinfo/glsl/reg_count.glsl
@@ -12,10 +12,7 @@
 
 layout(std430) buffer;
 
-layout(set = 0, binding = 0) buffer PRECISION restrict writeonly Buffer {
-  float data[];
-}
-out_buff;
+${layout_declare_buffer(0, "w", "out_buff", DTYPE)}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
@@ -35,5 +32,5 @@ void main() {
   i = i >> 31;
 
   $for k in range(int(NREG)):
-    out_buff.data[${k} * i] = reg_data${k};
+    out_buff[${k} * i] = reg_data${k};
 }
diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -47,6 +47,7 @@ class App {
     const uint32_t NREG_MAX = 512;
     const uint32_t NREG_STEP = 1;
 
+    // TODO: Make these values configurable
     const double COMPENSATE = 0.01;
     const double THRESHOLD = 3;
 
@@ -150,11 +151,76 @@ class App {
               << std::endl;
     std::cout << "Register type," << reg_ty << std::endl;
   }
+
+  // Estimates the buffer cacheline size: times the "buf_cacheline_size" shader
+  // at growing strides and prints, in bytes, the first stride whose timing
+  // DtJumpFinder::push() flags. Prints a notice if no stride is flagged.
+  void buf_cacheline_size() {
+    std::cout << std::endl;
+    std::cout << "------ Buffer Cacheline Size ------" << std::endl;
+
+    // TODO: Make these values configurable
+    const double COMPENSATE = 0.01;
+    const double THRESHOLD = 10;
+
+    // PITCH and stride are float-element offsets; the shader indexes with them.
+    const uint32_t PITCH = buf_cache_size_ / nthread_logic_;
+    const uint32_t BUF_SIZE = buf_cache_size_;
+    const uint32_t MAX_STRIDE = PITCH;
+    uint32_t NITER;
+
+    // Times the shader for one stride value.
+    auto bench = [&](int stride) {
+      size_t len = sizeof(float);
+      StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE);
+      StorageBuffer out_buf(context(), vkapi::kFloat, len);
+      vkapi::PipelineBarrier pipeline_barrier{};
+      auto shader_name = "buf_cacheline_size";
+      auto time = benchmark_on_gpu(shader_name, 100, [&]() {
+        context()->submit_compute_job(
+            VK_KERNEL_FROM_STR(shader_name),
+            pipeline_barrier,
+            {nthread_logic_, 1, 1},
+            {nthread_logic_, 1, 1},
+            {SV(NITER), SV(stride), SV(PITCH)},
+            VK_NULL_HANDLE,
+            0,
+            in_buf.buffer(),
+            out_buf.buffer());
+      });
+      return time;
+    };
+
+    ensure_min_niter(1000, NITER, [&]() { return bench(1); });
+
+    // Fallback if no jump is seen; in bytes, like the detected value below.
+    uint32_t cacheline_size = MAX_STRIDE * sizeof(float);
+    bool found = false;
+    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+    for (uint32_t stride = 1; stride <= MAX_STRIDE; ++stride) {
+      double time = bench(stride);
+      std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time
+                << std::endl;
+      if (dj.push(time)) {
+        cacheline_size = stride * sizeof(float);
+        found = true;
+        break;
+      }
+    }
+    // Use a flag, not the counter: a jump at stride == MAX_STRIDE is a result.
+    if (!found) {
+      std::cout << "Unable to conclude a top level buffer cacheline size."
+                << std::endl;
+    }
+    std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl;
+  }
 };
 
 int main(int argc, const char** argv) {
   App app;
 
+  // Runs every benchmark in order. TODO: Allow user to skip tests
   app.reg_count();
+  app.buf_cacheline_size();
+
   return 0;
 }