Skip to content

Commit 6903715

Browse files
Esteban Padilla Cerdio and facebook-github-bot
authored and committed
Add buffer cacheline size metric (#4228)
Summary: Pull Request resolved: #4228 This diff introduces a metric to GPUInfo that calculates the cacheline size of the buffer data pathway. In this experiment, all threads read from the cache with a varying stride. Reading two values from the same cacheline is cheap because the whole line is fetched as a block, regardless of which data we actually want. By varying the separation between the addresses of these two values, there will be a point where the shader will be forced to fetch two separate cachelines, which will have an effect on latency that we can detect. [This article](https://igoro.com/archive/gallery-of-processor-cache-effects/) has more information on the topic. Each run of the shader fetches the two values from different points in memory. The shader also has a seemingly redundant variable `zero` that prevents the compiler from optimizing the for loop away. The experiment will look like this: {F1754670481} Some useful concept definitions: NITER: The number of iterations that the lowest stride needs in order to run for 1000 microseconds. All experiments will then run this number of iterations. This provides a timing baseline and avoids timing errors. PITCH: A number of bytes of separation between cache lines that ensures that all concurrent groups are being used, and therefore a fetch from two different cache lines is sure to have a latency increase. STRIDE: The separation between the two values read by each thread; the stride at which latency jumps is the cache line size obtained experimentally. Increasing this until it reaches the cache line size should show a latency increase, giving us the result we are looking for. Reviewed By: jorgep31415 Differential Revision: D59649561 fbshipit-source-id: 2e82250d55929868982d17d1f405270897dcf9f4
1 parent dd7fa6a commit 6903715

File tree

4 files changed

+113
-5
lines changed

4 files changed

+113
-5
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core
10+
11+
#define PRECISION ${PRECISION}
12+
13+
layout(std430) buffer;
14+
15+
16+
${layout_declare_buffer(0, "r", "source", DTYPE)}
17+
${layout_declare_buffer(1, "w", "destination", DTYPE)}
18+
19+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
20+
21+
layout(constant_id = 3) const int niter = 1;
22+
layout(constant_id = 4) const int stride = 1;
23+
layout(constant_id = 5) const int pitch = 1;
24+
25+
// Cacheline-size probe: every invocation repeatedly reads two elements of
// `source` that are `stride` elements apart and accumulates them. The base
// index of each invocation is `pitch * gl_GlobalInvocationID[0]`, so
// invocations are spaced `pitch` elements apart. `niter`, `stride` and
// `pitch` are specialization constants supplied by the host.
void main() {
26+
float c = 0;
27+
for (int i = 0; i < niter; ++i) {
28+
// `i` stays in [0, niter), so `i >> 31` evaluates to 0. Per the commit
// summary, this seemingly redundant value is there to keep the compiler
// from optimizing the loop away.
const int zero = i >> 31;
29+
// First read: the invocation's base element.
c += source[zero + pitch * gl_GlobalInvocationID[0]];
30+
// Second read: `stride` elements past the base element.
c += source[zero + stride + pitch * gl_GlobalInvocationID[0]];
31+
}
32+
// Every invocation stores its sum to the same slot; presumably this only
// exists so the loads above have an observable use.
destination[0] = c;
33+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# Shader configuration for buf_cacheline_size.
# NOTE(review): presumably consumed by the shader code generator, which would
# substitute DTYPE / STORAGE into the GLSL template - confirm against the
# generator.
buf_cacheline_size:
8+
parameter_names_with_default_values:
9+
DTYPE: float
10+
STORAGE: buffer
11+
shader_variants:
12+
- NAME: buf_cacheline_size

backends/vulkan/tools/gpuinfo/glsl/reg_count.glsl

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,7 @@
1212

1313
layout(std430) buffer;
1414

15-
layout(set = 0, binding = 0) buffer PRECISION restrict writeonly Buffer {
16-
float data[];
17-
}
18-
out_buff;
15+
${layout_declare_buffer(0, "w", "out_buff", DTYPE)}
1916

2017
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2118

@@ -35,5 +32,5 @@ void main() {
3532
i = i >> 31;
3633

3734
$for k in range(int(NREG)):
38-
out_buff.data[${k} * i] = reg_data${k};
35+
out_buff[${k} * i] = reg_data${k};
3936
}

backends/vulkan/tools/gpuinfo/src/app.cpp

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ class App {
4747
const uint32_t NREG_MAX = 512;
4848
const uint32_t NREG_STEP = 1;
4949

50+
// TODO: Make these values configurable
5051
const double COMPENSATE = 0.01;
5152
const double THRESHOLD = 3;
5253

@@ -150,11 +151,76 @@ class App {
150151
<< std::endl;
151152
std::cout << "Register type," << reg_ty << std::endl;
152153
}
154+
155+
void buf_cacheline_size() {
156+
std::cout << std::endl;
157+
std::cout << "------ Buffer Cacheline Size ------" << std::endl;
158+
159+
// TODO: Make these values configurable
160+
const double COMPENSATE = 0.01;
161+
const double THRESHOLD = 10;
162+
163+
const uint32_t PITCH = buf_cache_size_ / nthread_logic_;
164+
const uint32_t BUF_SIZE = buf_cache_size_;
165+
const uint32_t MAX_STRIDE = PITCH;
166+
167+
uint32_t NITER;
168+
169+
auto bench = [&](int stride) {
170+
size_t len = sizeof(float);
171+
StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE);
172+
StorageBuffer out_buf(context(), vkapi::kFloat, len);
173+
vkapi::PipelineBarrier pipeline_barrier{};
174+
175+
auto shader_name = "buf_cacheline_size";
176+
177+
auto time = benchmark_on_gpu(shader_name, 100, [&]() {
178+
context()->submit_compute_job(
179+
VK_KERNEL_FROM_STR(shader_name),
180+
pipeline_barrier,
181+
{nthread_logic_, 1, 1},
182+
{nthread_logic_, 1, 1},
183+
{SV(NITER), SV(stride), SV(PITCH)},
184+
VK_NULL_HANDLE,
185+
0,
186+
in_buf.buffer(),
187+
out_buf.buffer());
188+
});
189+
return time;
190+
};
191+
192+
ensure_min_niter(1000, NITER, [&]() { return bench(1); });
193+
194+
uint32_t cacheline_size;
195+
196+
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
197+
uint32_t stride = 1;
198+
for (; stride <= MAX_STRIDE; ++stride) {
199+
double time = bench(stride);
200+
std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time
201+
<< std::endl;
202+
203+
if (dj.push(time)) {
204+
cacheline_size = stride * sizeof(float);
205+
break;
206+
}
207+
}
208+
if (stride >= MAX_STRIDE) {
209+
std::cout << "Unable to conclude a top level buffer cacheline size."
210+
<< std::endl;
211+
cacheline_size = MAX_STRIDE;
212+
}
213+
214+
std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl;
215+
}
153216
};
154217

155218
int main(int argc, const char** argv) {
156219
App app;
157220

221+
// TODO: Allow user to skip tests
158222
app.reg_count();
223+
app.buf_cacheline_size();
224+
159225
return 0;
160226
}

0 commit comments

Comments
 (0)