
Commit 7e417f4

Esteban Padilla Cerdio authored and facebook-github-bot committed
Add Warp Size metric (#4298)
Summary:
Pull Request resolved: #4298

This very simple metric runs a kernel across an increasing number of workgroups until there is a noticeable increase in latency, as seen in the following graph: {F1762497995}

The shader uses an integer division as its metric, because it is a multi-cycle operation that puts the ALU to work and stops the SM from context switching.

As with other metrics, we start by obtaining the minimum number of iterations, NITER, that can run in 1000us, so as to have a baseline for comparison and to reduce timing noise. With this number of iterations, we run the kernel with an increasing number of threads. We also use a multidimensional global workgroup with a Y size of 1024, in the hope of saturating the ALUs and obtaining a better point of reference for the latency caused by adding warps. Once we detect a jump in latency, we can assume that this is the warp size.

More information can be found [here](https://www.microsoft.com/en-us/research/uploads/prod/2022/02/mobigpu_mobicom22_camera.pdf) on page 5.

Reviewed By: jorgep31415

Differential Revision: D59920169

fbshipit-source-id: 4ac9324e10f0ab1a72433fd7ce98ad5f5ab839e9
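To make the detection procedure above concrete, here is a minimal, self-contained C++ sketch of the same calibrate-then-sweep idea. It is not the code added in this commit: MeasureFn, calibrate_niter, is_jump, and the synthetic latency model in main() are hypothetical stand-ins for benchmark_on_gpu, ensure_min_niter, and DtJumpFinder, and the jump test here is a simple moving-average ratio rather than DtJumpFinder's detector.

#include <cstdint>
#include <deque>
#include <functional>
#include <iostream>
#include <numeric>

// Returns the latency (us) of one dispatch with `nthread` threads running
// `niter` loop iterations. On the GPU this would wrap benchmark_on_gpu and
// submit_compute_job; here it is an injectable callback (assumption).
using MeasureFn = std::function<double(uint32_t nthread, uint32_t niter)>;

// Grow NITER until a single-thread dispatch takes at least `target_us`,
// mirroring the ensure_min_niter(1000, ...) calibration step.
uint32_t calibrate_niter(const MeasureFn& measure, double target_us) {
  uint32_t niter = 1;
  while (measure(1, niter) < target_us) {
    niter *= 2; // lengthen the kernel until it dominates timing noise
  }
  return niter;
}

// Simplified jump detector (a stand-in for DtJumpFinder, not its algorithm):
// flag a sample that exceeds the mean of the last `window` samples by more
// than `rel_increase` (e.g. 0.5 == 50% slower).
bool is_jump(std::deque<double>& history, double sample, size_t window,
             double rel_increase) {
  bool jump = false;
  if (history.size() >= window) {
    const double mean =
        std::accumulate(history.begin(), history.end(), 0.0) / history.size();
    jump = sample > mean * (1.0 + rel_increase);
    history.pop_front();
  }
  history.push_back(sample);
  return jump;
}

// Sweep the thread count until latency jumps; the last count before the jump
// is taken as the warp size, with `fallback` (e.g. the reported Vulkan
// subgroup size) used when no jump is found.
uint32_t detect_warp_size(const MeasureFn& measure, uint32_t nthread_logic,
                          uint32_t fallback) {
  const uint32_t niter = calibrate_niter(measure, /*target_us=*/1000.0);
  std::deque<double> history;
  for (uint32_t nthread = 1; nthread <= nthread_logic; ++nthread) {
    const double time = measure(nthread, niter);
    std::cout << "nthread=" << nthread << " (" << time << " us)" << std::endl;
    if (is_jump(history, time, /*window=*/5, /*rel_increase=*/0.5)) {
      return nthread - 1;
    }
  }
  return fallback;
}

int main() {
  // Synthetic latency model for demonstration only: every extra group of 32
  // threads adds one "warp" worth of work, so latency steps up at 33, 65, ...
  auto fake_measure = [](uint32_t nthread, uint32_t niter) {
    return 1000.0 * ((nthread + 31) / 32) * (niter / 1024.0 + 1.0);
  };
  std::cout << "WarpSize," << detect_warp_size(fake_measure, 256, 32)
            << std::endl;
  return 0;
}

Against this synthetic curve the sweep reports WarpSize,32; in the actual tool the same role is played by bench() dispatching the warp_size shader with an increasing X dimension.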
1 parent ba052a4 · commit 7e417f4

File tree

3 files changed: +112 −0 lines changed
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#version 450 core

#define PRECISION ${PRECISION}

layout(std430) buffer;

${layout_declare_buffer(0, "w", "out_buff", DTYPE)}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int NITER = 1;

void main() {
  int sum = 0;
  for (int j = 0; j < NITER; ++j) {
    // Integer division is an exemplary multi-cycle instruction that can
    // hardly be optimized, thus reducing the impact of latency hiding.
    sum += j / 3;
    barrier();
  }
  out_buff[gl_GlobalInvocationID[0]] = sum;
}
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

warp_size:
  parameter_names_with_default_values:
    DTYPE: int
    STORAGE: buffer
  shader_variants:
    - NAME: warp_size

backends/vulkan/tools/gpuinfo/src/app.cpp

Lines changed: 70 additions & 0 deletions
@@ -7,6 +7,7 @@
  */
 
 #include <executorch/backends/vulkan/runtime/api/api.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
 #include <iostream>
 
 #include "stats.h"
@@ -20,6 +21,7 @@ class App {
   uint32_t max_shared_mem_size_;
   uint32_t sm_count_;
   uint32_t nthread_logic_;
+  uint32_t subgroup_size_;
 
  public:
   App() {
@@ -34,11 +36,24 @@ class App {
     nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
     buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();
     max_shared_mem_size_ = cl_device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
+
+    VkPhysicalDeviceSubgroupProperties subgroup_props{};
+    VkPhysicalDeviceProperties2 props2{};
+
+    props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
+    props2.pNext = &subgroup_props;
+    subgroup_props.sType =
+        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
+    vkGetPhysicalDeviceProperties2(
+        context()->adapter_ptr()->physical_handle(), &props2);
+    subgroup_size_ = subgroup_props.subgroupSize;
+
     std::cout << std::endl;
     std::cout << "SM count," << sm_count_ << std::endl;
     std::cout << "Logic Thread Count," << nthread_logic_ << std::endl;
     std::cout << "Cache Size," << buf_cache_size_ << std::endl;
     std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl;
+    std::cout << "SubGroup Size," << subgroup_size_ << std::endl;
   }
 
   void reg_count() {
@@ -313,6 +328,60 @@ class App {
     const uint32_t RANGE = max_shared_mem_size_;
     _bandwidth("Shared", RANGE);
   }
+
+  void warp_size() {
+    std::cout << "\n------ Warp Size ------" << std::endl;
+    const double COMPENSATE = 0.01;
+    const double THRESHOLD = 3;
+
+    uint32_t NITER;
+
+    auto bench = [&](uint32_t nthread) {
+      StorageBuffer out_buf(context(), vkapi::kInt, nthread_logic_);
+      vkapi::PipelineBarrier pipeline_barrier{};
+
+      auto shader_name = "warp_size";
+
+      auto time = benchmark_on_gpu(shader_name, 10, [&]() {
+        context()->submit_compute_job(
+            VK_KERNEL_FROM_STR(shader_name),
+            pipeline_barrier,
+            // Large number of work groups selected to potentially saturate all
+            // ALUs and thus have a better baseline for comparison.
+            {nthread, 1024, 1},
+            {nthread, 1, 1},
+            {SV(NITER)},
+            VK_NULL_HANDLE,
+            0,
+            out_buf.buffer());
+      });
+
+      return time;
+    };
+
+    ensure_min_niter(1000, NITER, [&]() { return bench(1); });
+
+    uint32_t warp_size = subgroup_size_;
+    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+
+    // We increase the number of threads until we hit a jump in the data.
+    uint32_t nthread = 1;
+    for (; nthread <= nthread_logic_; ++nthread) {
+      double time = bench(nthread);
+      std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)"
+                << std::endl;
+      if (dj.push(time)) {
+        warp_size = nthread - 1;
+        break;
+      }
+    }
+    if (nthread >= nthread_logic_) {
+      std::cout
+          << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size"
+          << std::endl;
+    }
+    std::cout << "WarpSize," << warp_size << std::endl;
+  }
 };
 
 int main(int argc, const char** argv) {
@@ -324,6 +393,7 @@ int main(int argc, const char** argv) {
   app.buf_bandwidth();
   app.ubo_bandwidth();
   app.shared_mem_bandwidth();
+  app.warp_size();
 
   return 0;
 }
