Skip to content

Commit b04d6c7

Browse files
estebanpadillafacebook-github-bot
authored and committed
RegCount concurrency calculation (#4173)
Summary: Pull Request resolved: #4173 This project adds an internal implementation of https://github.com/microsoft/ArchProbe. This stack introduces a kernel that can be used to get the number of available registers on a mobile GPU by gradually increasing the number of accessed elements and detecting dramatic drops in performance. See [this paper](https://www.microsoft.com/en-us/research/uploads/prod/2022/02/mobigpu_mobicom22_camera.pdf), page 4, for more information. This diff finds the number of concurrency groups available for full register usage and for half register usage. On a Samsung Galaxy S22, the latency graphs for full and half register usage look like this: Full: {F1750677545} Half: {F1750679467} Differential Revision: D59497314 Reviewed By: SS-JIA
1 parent a5c722d commit b04d6c7

File tree

1 file changed

+54
-5
lines changed
  • backends/vulkan/tools/gpuinfo/src

1 file changed

+54
-5
lines changed

backends/vulkan/tools/gpuinfo/src/app.cpp

Lines changed: 54 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,13 @@ void reg_count() {
2121
const double COMPENSATE = 0.01;
2222
const double THRESHOLD = 3;
2323

24+
const uint32_t NGRP_MIN = 1;
25+
const uint32_t NGRP_MAX = 64;
26+
const uint32_t NGRP_STEP = 1;
27+
2428
uint32_t NITER;
2529

26-
auto bench = [&](uint32_t nthread, uint32_t ngrp, uint32_t nreg) {
30+
auto bench = [&](uint32_t ngrp, uint32_t nreg) {
2731
size_t len = sizeof(float);
2832
StorageBuffer buffer(context(), vkapi::kFloat, len);
2933
ParamsBuffer params(context(), int32_t(len));
@@ -35,8 +39,8 @@ void reg_count() {
3539
context()->submit_compute_job(
3640
VK_KERNEL_FROM_STR(shader_name),
3741
pipeline_barrier,
38-
{nthread, ngrp, 1},
39-
{nthread, 1, 1},
42+
{1, ngrp, 1},
43+
{1, 1, 1},
4044
{SV(NITER)},
4145
VK_NULL_HANDLE,
4246
0,
@@ -47,15 +51,15 @@ void reg_count() {
4751
};
4852

4953
std::cout << "Calculating NITER..." << std::endl;
50-
ensure_min_niter(1000, NITER, [&]() { return bench(1, 1, NREG_MIN); });
54+
ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); });
5155
std::cout << "NITER," << NITER << std::endl;
5256

5357
uint32_t nreg_max;
5458

5559
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
5660
uint32_t nreg = NREG_MIN;
5761
for (; nreg <= NREG_MAX; nreg += NREG_STEP) {
58-
double time = bench(1, 1, nreg);
62+
double time = bench(1, nreg);
5963
std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << std::endl;
6064
if (dj.push(time)) {
6165
nreg -= NREG_STEP;
@@ -69,6 +73,51 @@ void reg_count() {
6973
} else {
7074
std::cout << nreg_max << " registers are available at most" << std::endl;
7175
}
76+
77+
// Sweeps the workgroup count from NGRP_MIN to NGRP_MAX (in NGRP_STEP
// increments) for a fixed register count `nreg`, timing bench(ngrp, nreg) at
// each step and logging the result.
//
// Returns the workgroup count one step before the first sample for which
// dj.push(time) returns true, or 1 if no sample in the range triggers it.
auto find_ngrp_by_nreg = [&](const uint32_t nreg) {
78+
// A fresh finder per sweep, so samples from other sweeps are not mixed in.
// NOTE(review): DtJumpFinder is defined elsewhere; presumably push() returns
// true when the new sample is a latency jump relative to earlier ones
// (COMPENSATE / THRESHOLD tune that) -- confirm against its definition.
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
79+
for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) {
80+
// bench() dispatches {1, ngrp, 1} global / {1, 1, 1} local work sizes, i.e.
// ngrp workgroups of a single invocation each.
auto time = bench(ngrp, nreg);
81+
std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp
82+
<< ", time=" << time << " us" << std::endl;
83+
84+
if (dj.push(time)) {
85+
// Step back one increment: the value reported is the last workgroup count
// measured before the sample that triggered the finder.
ngrp -= NGRP_STEP;
86+
std::cout << "Using " << nreg << " registers can have " << ngrp
87+
<< " concurrent single-thread workgroups" << std::endl;
88+
return ngrp;
89+
}
90+
}
91+
// No sample in [NGRP_MIN, NGRP_MAX] triggered the finder.
std::cout
92+
<< "Unable to conclude a maximum number of concurrent single-thread workgroups when "
93+
<< nreg << " registers are occupied" << std::endl;
94+
// Fallback value; the cast keeps the deduced lambda return type consistent
// with the `return ngrp;` above.
return (uint32_t)1;
95+
};
96+
97+
uint32_t ngrp_full, ngrp_half;
98+
ngrp_full = find_ngrp_by_nreg(nreg_max);
99+
ngrp_half = find_ngrp_by_nreg(nreg_max / 2);
100+
101+
std::string reg_ty;
102+
103+
if (ngrp_full * 1.5 < ngrp_half) {
104+
std::cout << "All physical threads in an sm share " << nreg_max
105+
<< " registers" << std::endl;
106+
reg_ty = "Pooled";
107+
108+
} else {
109+
std::cout << "Each physical thread has " << nreg_max << " registers"
110+
<< std::endl;
111+
reg_ty = "Dedicated";
112+
}
113+
114+
std::cout << "\n\nNITER," << NITER << std::endl;
115+
std::cout << "Max registers," << nreg_max << std::endl;
116+
std::cout << "Concurrent full single thread workgroups," << ngrp_full
117+
<< std::endl;
118+
std::cout << "Concurrent half single thread workgroups," << ngrp_half
119+
<< std::endl;
120+
std::cout << "Register type," << reg_ty << std::endl;
72121
}
73122

74123
int main(int argc, const char** argv) {

0 commit comments

Comments
 (0)