Skip to content

Commit b034494

Browse files
Esteban Padilla Cerdio authored and facebook-github-bot committed
Try to get SM Count or request as input
Summary: The number of Streaming Multiprocessors (SM count) is an essential variable for most of ArchProbe's metric algorithms. OpenCL exposes this property directly, but Vulkan does not: it can only be obtained through the [VK_NV_shader_sm_builtins](https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VK_NV_shader_sm_builtins.html) extension. However, this extension is available only in the Nvidia ecosystem, so for other GPUs the only option is to pass the SM count as an input. Differential Revision: D59636879
1 parent fe6f236 commit b034494

File tree

5 files changed

+171
-97
lines changed

5 files changed

+171
-97
lines changed

backends/vulkan/runtime/vk_api/Adapter.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ VkDevice create_logical_device(
7373
VK_KHR_16BIT_STORAGE_EXTENSION_NAME,
7474
VK_KHR_8BIT_STORAGE_EXTENSION_NAME,
7575
VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME,
76+
VK_NV_SHADER_SM_BUILTINS_EXTENSION_NAME,
7677
};
7778

7879
std::vector<const char*> enabled_device_extensions;

backends/vulkan/runtime/vk_api/Adapter.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,10 @@ class Adapter final {
179179
return has_8bit_storage() && has_8bit_compute();
180180
}
181181

182+
inline bool has_sm_builtins() {
183+
return physical_device_.shader_sm_builtins.shaderSMBuiltins == VK_TRUE;
184+
}
185+
182186
// Command Buffer Submission
183187

184188
void

backends/vulkan/runtime/vk_api/Device.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ PhysicalDevice::PhysicalDevice(VkPhysicalDevice physical_device_handle)
2828
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES},
2929
shader_float16_int8_types{
3030
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR},
31+
shader_sm_builtins{
32+
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SM_BUILTINS_FEATURES_NV},
3133
queue_families{},
3234
num_compute_queues(0),
3335
has_unified_memory(false),
@@ -45,7 +47,8 @@ PhysicalDevice::PhysicalDevice(VkPhysicalDevice physical_device_handle)
4547
features2.pNext = &shader_16bit_storage;
4648
shader_16bit_storage.pNext = &shader_8bit_storage;
4749
shader_8bit_storage.pNext = &shader_float16_int8_types;
48-
shader_float16_int8_types.pNext = nullptr;
50+
shader_float16_int8_types.pNext = &shader_sm_builtins;
51+
shader_sm_builtins.pNext = nullptr;
4952

5053
vkGetPhysicalDeviceFeatures2(handle, &features2);
5154

backends/vulkan/runtime/vk_api/Device.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ struct PhysicalDevice final {
2929
VkPhysicalDevice16BitStorageFeatures shader_16bit_storage;
3030
VkPhysicalDevice8BitStorageFeatures shader_8bit_storage;
3131
VkPhysicalDeviceShaderFloat16Int8Features shader_float16_int8_types;
32+
VkPhysicalDeviceShaderSMBuiltinsFeaturesNV shader_sm_builtins;
3233

3334
// Available GPU queues
3435
std::vector<VkQueueFamilyProperties> queue_families;

backends/vulkan/tools/gpuinfo/src/app.cpp

Lines changed: 161 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -13,117 +13,182 @@
1313
#include "stats.h"
1414
#include "utils.h"
1515

16-
void reg_count() {
17-
const uint32_t NREG_MIN = 1;
18-
const uint32_t NREG_MAX = 512;
19-
const uint32_t NREG_STEP = 1;
20-
21-
const double COMPENSATE = 0.01;
22-
const double THRESHOLD = 3;
23-
24-
const uint32_t NGRP_MIN = 1;
25-
const uint32_t NGRP_MAX = 64;
26-
const uint32_t NGRP_STEP = 1;
27-
28-
uint32_t NITER;
29-
30-
auto bench = [&](uint32_t ngrp, uint32_t nreg) {
31-
size_t len = sizeof(float);
32-
StorageBuffer buffer(context(), vkapi::kFloat, len);
33-
ParamsBuffer params(context(), int32_t(len));
34-
vkapi::PipelineBarrier pipeline_barrier{};
35-
36-
auto shader_name = "reg_count_" + std::to_string(nreg);
37-
38-
auto time = benchmark_on_gpu(shader_name, NITER, [&]() {
39-
context()->submit_compute_job(
40-
VK_KERNEL_FROM_STR(shader_name),
41-
pipeline_barrier,
42-
{1, ngrp, 1},
43-
{1, 1, 1},
44-
{SV(NITER)},
45-
VK_NULL_HANDLE,
46-
0,
47-
buffer.buffer(),
48-
params.buffer());
49-
});
50-
return time;
51-
};
52-
53-
std::cout << "Calculating NITER..." << std::endl;
54-
ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); });
55-
std::cout << "NITER," << NITER << std::endl;
56-
57-
uint32_t nreg_max;
58-
59-
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
60-
uint32_t nreg = NREG_MIN;
61-
for (; nreg <= NREG_MAX; nreg += NREG_STEP) {
62-
double time = bench(1, nreg);
63-
std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << std::endl;
64-
if (dj.push(time)) {
65-
nreg -= NREG_STEP;
66-
nreg_max = nreg;
67-
break;
16+
using namespace vkapi;
17+
18+
class App {
19+
private:
20+
VkPhysicalDevice device_handle_;
21+
uint32_t sm_count_;
22+
23+
int get_sm_count() {
24+
if (!context()->adapter_ptr()->has_sm_builtins()) {
25+
return 0;
6826
}
27+
28+
VkPhysicalDeviceShaderSMBuiltinsPropertiesNV smBuiltinProperties = {
29+
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SM_BUILTINS_PROPERTIES_NV};
30+
31+
VkPhysicalDeviceProperties2 deviceProperties = {
32+
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2};
33+
deviceProperties.pNext = &smBuiltinProperties;
34+
vkGetPhysicalDeviceProperties2(device_handle_, &deviceProperties);
35+
36+
return smBuiltinProperties.shaderSMCount;
6937
}
70-
if (nreg >= NREG_MAX) {
71-
std::cout << "Unable to conclude a maximal register count" << std::endl;
72-
nreg_max = NREG_STEP;
73-
} else {
74-
std::cout << nreg_max << " registers are available at most" << std::endl;
38+
39+
public:
40+
bool initialize(uint32_t sm_count) {
41+
device_handle_ = context()->adapter_ptr()->physical_handle();
42+
context()->initialize_querypool();
43+
44+
std::cout << context()->adapter_ptr()->stringize() << "\n\n";
45+
46+
sm_count_ = get_sm_count();
47+
48+
if (sm_count_ > 0) {
49+
} else if (sm_count > 0) {
50+
sm_count_ = sm_count;
51+
} else {
52+
std::cout
53+
<< "SM counter is not available, please specify the SM count as an argument of this binary."
54+
<< std::endl;
55+
return false;
56+
}
57+
58+
std::cout << "\nSM count," << sm_count_ << std::endl;
59+
return true;
7560
}
7661

77-
auto find_ngrp_by_nreg = [&](const uint32_t nreg) {
78-
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
79-
for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) {
80-
auto time = bench(ngrp, nreg);
81-
std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp
82-
<< ", time=" << time << " us" << std::endl;
62+
void reg_count() {
63+
std::cout << "\n------ Register Count ------" << std::endl;
64+
const uint32_t NREG_MIN = 1;
65+
const uint32_t NREG_MAX = 512;
66+
const uint32_t NREG_STEP = 1;
67+
68+
const double COMPENSATE = 0.01;
69+
const double THRESHOLD = 3;
70+
71+
const uint32_t NGRP_MIN = 1;
72+
const uint32_t NGRP_MAX = 64;
73+
const uint32_t NGRP_STEP = 1;
74+
75+
uint32_t NITER;
76+
77+
auto bench = [&](uint32_t ngrp, uint32_t nreg) {
78+
size_t len = sizeof(float);
79+
StorageBuffer buffer(context(), vkapi::kFloat, len);
80+
ParamsBuffer params(context(), int32_t(len));
81+
vkapi::PipelineBarrier pipeline_barrier{};
82+
83+
auto shader_name = "reg_count_" + std::to_string(nreg);
84+
85+
auto time = benchmark_on_gpu(shader_name, NITER, [&]() {
86+
context()->submit_compute_job(
87+
VK_KERNEL_FROM_STR(shader_name),
88+
pipeline_barrier,
89+
{1, ngrp, 1},
90+
{1, 1, 1},
91+
{SV(NITER)},
92+
VK_NULL_HANDLE,
93+
0,
94+
buffer.buffer(),
95+
params.buffer());
96+
});
97+
return time;
98+
};
99+
100+
std::cout << "Calculating NITER..." << std::endl;
101+
ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); });
102+
std::cout << "NITER," << NITER << std::endl;
103+
104+
uint32_t nreg_max;
83105

106+
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
107+
uint32_t nreg = NREG_MIN;
108+
for (; nreg <= NREG_MAX; nreg += NREG_STEP) {
109+
double time = bench(1, nreg);
110+
std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time
111+
<< std::endl;
84112
if (dj.push(time)) {
85-
ngrp -= NGRP_STEP;
86-
std::cout << "Using " << nreg << " registers can have " << ngrp
87-
<< " concurrent single-thread workgroups" << std::endl;
88-
return ngrp;
113+
nreg -= NREG_STEP;
114+
nreg_max = nreg;
115+
break;
89116
}
90117
}
91-
std::cout
92-
<< "Unable to conclude a maximum number of concurrent single-thread workgroups when "
93-
<< nreg << " registers are occupied" << std::endl;
94-
return (uint32_t)1;
95-
};
96-
97-
uint32_t ngrp_full, ngrp_half;
98-
ngrp_full = find_ngrp_by_nreg(nreg_max);
99-
ngrp_half = find_ngrp_by_nreg(nreg_max / 2);
100-
101-
std::string reg_ty;
118+
if (nreg >= NREG_MAX) {
119+
std::cout << "Unable to conclude a maximal register count" << std::endl;
120+
nreg_max = NREG_STEP;
121+
} else {
122+
std::cout << nreg_max << " registers are available at most" << std::endl;
123+
}
102124

103-
if (ngrp_full * 1.5 < ngrp_half) {
104-
std::cout << "All physical threads in an sm share " << nreg_max
105-
<< " registers" << std::endl;
106-
reg_ty = "Pooled";
125+
auto find_ngrp_by_nreg = [&](const uint32_t nreg) {
126+
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
127+
for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) {
128+
auto time = bench(ngrp, nreg);
129+
std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp
130+
<< ", time=" << time << " us" << std::endl;
131+
132+
if (dj.push(time)) {
133+
ngrp -= NGRP_STEP;
134+
std::cout << "Using " << nreg << " registers can have " << ngrp
135+
<< " concurrent single-thread workgroups" << std::endl;
136+
return ngrp;
137+
}
138+
}
139+
std::cout
140+
<< "Unable to conclude a maximum number of concurrent single-thread workgroups when "
141+
<< nreg << " registers are occupied" << std::endl;
142+
return (uint32_t)1;
143+
};
144+
145+
uint32_t ngrp_full, ngrp_half;
146+
ngrp_full = find_ngrp_by_nreg(nreg_max);
147+
ngrp_half = find_ngrp_by_nreg(nreg_max / 2);
148+
149+
std::string reg_ty;
150+
151+
if (ngrp_full * 1.5 < ngrp_half) {
152+
std::cout << "All physical threads in an sm share " << nreg_max
153+
<< " registers" << std::endl;
154+
reg_ty = "Pooled";
155+
156+
} else {
157+
std::cout << "Each physical thread has " << nreg_max << " registers"
158+
<< std::endl;
159+
reg_ty = "Dedicated";
160+
}
107161

108-
} else {
109-
std::cout << "Each physical thread has " << nreg_max << " registers"
162+
std::cout << "\n\nNITER," << NITER << std::endl;
163+
std::cout << "Max registers," << nreg_max << std::endl;
164+
std::cout << "Concurrent full single thread workgroups," << ngrp_full
165+
<< std::endl;
166+
std::cout << "Concurrent half single thread workgroups," << ngrp_half
110167
<< std::endl;
111-
reg_ty = "Dedicated";
168+
std::cout << "Register type," << reg_ty << std::endl;
112169
}
113-
114-
std::cout << "\n\nNITER," << NITER << std::endl;
115-
std::cout << "Max registers," << nreg_max << std::endl;
116-
std::cout << "Concurrent full single thread workgroups," << ngrp_full
117-
<< std::endl;
118-
std::cout << "Concurrent half single thread workgroups," << ngrp_half
119-
<< std::endl;
120-
std::cout << "Register type," << reg_ty << std::endl;
121-
}
170+
};
122171

123172
int main(int argc, const char** argv) {
124-
context()->initialize_querypool();
173+
App app;
174+
175+
int32_t sm_count = 0;
176+
177+
if (argc == 2) {
178+
sm_count = atoi(argv[1]);
179+
if (sm_count <= 0) {
180+
std::cout << "Invalid SM count" << std::endl;
181+
return 1;
182+
}
183+
} else if (argc > 2) {
184+
std::cout << "Usage: vulkan_gpuinfo [sm count]" << std::endl;
185+
return 1;
186+
}
125187

126-
reg_count();
188+
if (!app.initialize(sm_count)) {
189+
return 1;
190+
}
127191

192+
app.reg_count();
128193
return 0;
129194
}

0 commit comments

Comments
 (0)