Skip to content

Commit a5c722d

Browse files
estebanpadillafacebook-github-bot
authored andcommitted
RegCount max registers calculation (#4171)
Summary: Pull Request resolved: #4171 This project adds an internal implementation of https://github.com/microsoft/ArchProbe. This stack introduces a kernel that can be used to get the number of available registers on a mobile GPU by gradually increasing the number of accessed elements and detecting dramatic drops in performance. See [this paper](https://www.microsoft.com/en-us/research/uploads/prod/2022/02/mobigpu_mobicom22_camera.pdf), page 4, for more information. This diff finds the number of registers available to a single thread by increasing the number of registers used and detecting changes in latency. For a Galaxy S22, the latency graph looks like this. {F1750619092} We can easily spot the moment where there is a spill into memory. Differential Revision: D59494644 Reviewed By: SS-JIA
1 parent df26343 commit a5c722d

File tree

2 files changed

+159
-0
lines changed

2 files changed

+159
-0
lines changed
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
/*
2+
* Portions (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
/*
10+
* Code sourced from
11+
* https://github.com/microsoft/ArchProbe/blob/main/include/stats.hpp with the
12+
* following MIT license
13+
*
14+
* MIT License
15+
*
16+
* Copyright (c) Microsoft Corporation.
17+
*
18+
* Permission is hereby granted, free of charge, to any person obtaining a copy
19+
* of this software and associated documentation files (the "Software"), to
20+
* deal in the Software without restriction, including without limitation the
21+
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
22+
* sell copies of the Software, and to permit persons to whom the Software is
23+
* furnished to do so, subject to the following conditions:
24+
*
25+
* The above copyright notice and this permission notice shall be included in
26+
* all copies or substantial portions of the Software.
27+
*
28+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
33+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
34+
* IN THE SOFTWARE
35+
*/
36+
37+
#pragma once
38+
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
40+
41+
// Running arithmetic mean of every value pushed so far.
//
// T is both the accumulator type and the result type. It must be
// initializable from 0, support `+=` and `/` with itself, and be constructible
// from the uint64_t sample count.
template <typename T>
class AvgStats {
  T sum_ = 0;
  uint64_t n_ = 0;

 public:
  using value_t = T;

  // Adds `value` to the running sum and counts it as one more sample.
  void push(T value) {
    sum_ += value;
    n_ += 1;
  }

  // True once at least one sample has been pushed.
  inline bool has_value() const {
    return n_ != 0;
  }

  // Mean of all samples pushed so far.
  //
  // With no samples the (still zero) sum is returned instead of dividing by
  // zero; use has_value() to tell "no data" apart from a real mean of 0.
  operator T() const {
    if (n_ == 0) {
      return sum_;
    }
    // Convert the count to T before dividing. Dividing by the raw uint64_t
    // would promote a signed integral sum to unsigned and give a wrong mean
    // for negative sums.
    return sum_ / static_cast<T>(n_);
  }
};
60+
61+
// Moving average over the most recent NTap pushed values, kept in a
// fixed-size ring buffer.
//
// Samples are stored as double regardless of T; T is only the type accepted
// by push() and produced by the conversion operator.
template <typename T, std::size_t NTap>
class NTapAvgStats {
  static_assert(NTap > 0, "NTapAvgStats needs at least one tap");

  // All state is initialized in-class so that a default-initialized instance
  // starts out empty instead of holding indeterminate values.
  std::array<double, NTap> hist_{};
  std::size_t cur_idx_ = 0;
  bool ready_ = false;

 public:
  using value_t = T;

  // Stores `value` in the next ring-buffer slot, overwriting the oldest
  // sample once the buffer has wrapped around.
  void push(T value) {
    hist_[cur_idx_] = static_cast<double>(value);
    ++cur_idx_;
    if (cur_idx_ >= NTap) {
      cur_idx_ = 0;
      // The buffer has been filled at least once; the average is now
      // meaningful.
      ready_ = true;
    }
  }

  // True once NTap samples have been pushed, i.e. every slot holds real data.
  inline bool has_value() const {
    return ready_;
  }

  // Mean of all NTap slots. Before has_value() becomes true the unfilled
  // slots still hold their initial zeros and drag the result down, so callers
  // should check has_value() first.
  operator T() const {
    double total = 0.0;
    for (const double sample : hist_) {
      total += sample;
    }
    return static_cast<T>(total / NTap);
  }
};
89+
90+
template <uint32_t NTap>
91+
struct DtJumpFinder {
92+
private:
93+
NTapAvgStats<double, NTap> time_avg_;
94+
AvgStats<double> dtime_avg_;
95+
double compensation_;
96+
double threshold_;
97+
98+
public:
99+
// Compensation is a tiny additive to give on delta time so that the algorithm
100+
// works smoothly when a sequence of identical timing is ingested, which is
101+
// pretty common in our tests. Threshold is simply how many times the new
102+
// delta has to be to be recognized as a deviation.
103+
DtJumpFinder(double compensation = 0.01, double threshold = 10)
104+
: time_avg_(),
105+
dtime_avg_(),
106+
compensation_(compensation),
107+
threshold_(threshold) {}
108+
109+
// Returns true if the delta time regarding to the last data point seems
110+
// normal; returns false if it seems the new data point is too much away from
111+
// the historical records.
112+
bool push(double time) {
113+
if (time_avg_.has_value()) {
114+
double dtime = std::abs(time - time_avg_) + (compensation_ * time_avg_);
115+
if (dtime_avg_.has_value()) {
116+
double ddtime = std::abs(dtime - dtime_avg_);
117+
if (ddtime > threshold_ * dtime_avg_) {
118+
return true;
119+
}
120+
}
121+
dtime_avg_.push(dtime);
122+
}
123+
time_avg_.push(time);
124+
return false;
125+
}
126+
127+
double dtime_avg() const {
128+
return dtime_avg_;
129+
}
130+
double compensate_time() const {
131+
return compensation_ * time_avg_;
132+
}
133+
};

backends/vulkan/tools/gpuinfo/src/app.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,16 @@
1010
#include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
1111
#include <iostream>
1212

13+
#include "stats.h"
1314
#include "utils.h"
1415

1516
void reg_count() {
1617
const uint32_t NREG_MIN = 1;
18+
const uint32_t NREG_MAX = 512;
19+
const uint32_t NREG_STEP = 1;
20+
21+
const double COMPENSATE = 0.01;
22+
const double THRESHOLD = 3;
1723

1824
uint32_t NITER;
1925

@@ -43,6 +49,26 @@ void reg_count() {
4349
std::cout << "Calculating NITER..." << std::endl;
4450
ensure_min_niter(1000, NITER, [&]() { return bench(1, 1, NREG_MIN); });
4551
std::cout << "NITER," << NITER << std::endl;
52+
53+
uint32_t nreg_max;
54+
55+
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
56+
uint32_t nreg = NREG_MIN;
57+
for (; nreg <= NREG_MAX; nreg += NREG_STEP) {
58+
double time = bench(1, 1, nreg);
59+
std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << std::endl;
60+
if (dj.push(time)) {
61+
nreg -= NREG_STEP;
62+
nreg_max = nreg;
63+
break;
64+
}
65+
}
66+
if (nreg >= NREG_MAX) {
67+
std::cout << "Unable to conclude a maximal register count" << std::endl;
68+
nreg_max = NREG_STEP;
69+
} else {
70+
std::cout << nreg_max << " registers are available at most" << std::endl;
71+
}
4672
}
4773

4874
int main(int argc, const char** argv) {

0 commit comments

Comments
 (0)