Skip to content

Try to get SM Count or request as input #4210

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions backends/vulkan/tools/gpuinfo/include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@

#include <executorch/backends/vulkan/runtime/api/api.h>

#define CL_TARGET_OPENCL_VERSION 200
#define CL_HPP_TARGET_OPENCL_VERSION CL_TARGET_OPENCL_VERSION
#include <CL/opencl.hpp>

#include <stdexcept>
#include <string>
#include <vector>

using namespace vkcompute;
using namespace api;

Expand Down Expand Up @@ -49,3 +53,29 @@ void ensure_min_niter(
niter = uint32_t(niter * min_time_us / t);
}
}

/**
 * Returns the first OpenCL platform reported by clGetPlatformIDs.
 *
 * @return The id of the first enumerated platform.
 * @throws std::runtime_error if either clGetPlatformIDs call fails or if no
 *         platform is reported.
 *
 * Declared inline because this definition lives in a header.
 */
inline cl_platform_id get_cl_platform_id() {
  // First call only asks how many platforms exist. Start from zero so a
  // failed call can never leave an indeterminate count behind.
  cl_uint nplatform_id = 0;
  cl_int err = clGetPlatformIDs(0, nullptr, &nplatform_id);
  if (err != CL_SUCCESS) {
    throw std::runtime_error(
        "clGetPlatformIDs failed while counting platforms, error " +
        std::to_string(err));
  }
  if (nplatform_id == 0) {
    throw std::runtime_error("No OpenCL platform available");
  }

  // Second call fills in the actual platform ids.
  std::vector<cl_platform_id> platform_ids(nplatform_id);
  err = clGetPlatformIDs(nplatform_id, platform_ids.data(), nullptr);
  if (err != CL_SUCCESS) {
    throw std::runtime_error(
        "clGetPlatformIDs failed while listing platforms, error " +
        std::to_string(err));
  }
  return platform_ids.front();
}

/**
 * Returns the first OpenCL device (of any device type) on the given platform.
 *
 * @param platform_id Platform whose devices are enumerated.
 * @return The id of the first enumerated device.
 * @throws std::runtime_error if either clGetDeviceIDs call fails or if the
 *         platform reports no device.
 *
 * Declared inline because this definition lives in a header.
 */
inline cl_device_id get_cl_dev_id(cl_platform_id platform_id) {
  // First call only asks how many devices exist. Start from zero so a failed
  // call can never leave an indeterminate count behind.
  cl_uint ndev_id = 0;
  cl_int err =
      clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 0, nullptr, &ndev_id);
  if (err != CL_SUCCESS) {
    throw std::runtime_error(
        "clGetDeviceIDs failed while counting devices, error " +
        std::to_string(err));
  }
  if (ndev_id == 0) {
    throw std::runtime_error("No OpenCL device available on the platform");
  }

  // Second call fills in the actual device ids.
  std::vector<cl_device_id> dev_ids(ndev_id);
  err = clGetDeviceIDs(
      platform_id, CL_DEVICE_TYPE_ALL, ndev_id, dev_ids.data(), nullptr);
  if (err != CL_SUCCESS) {
    throw std::runtime_error(
        "clGetDeviceIDs failed while listing devices, error " +
        std::to_string(err));
  }
  return dev_ids.front();
}

/**
 * Builds a cl::Device wrapper for the first device of the first platform, as
 * selected by get_cl_platform_id() and get_cl_dev_id().
 *
 * @return A cl::Device constructed from the selected device id.
 *
 * Declared inline because this definition lives in a header; without it, a
 * second translation unit including this header would produce a duplicate
 * definition at link time.
 */
inline cl::Device get_cl_device() {
  const cl_platform_id platform_id = get_cl_platform_id();
  const cl_device_id dev_id = get_cl_dev_id(platform_id);
  return cl::Device(dev_id);
}
225 changes: 126 additions & 99 deletions backends/vulkan/tools/gpuinfo/src/app.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,117 +13,144 @@
#include "stats.h"
#include "utils.h"

void reg_count() {
const uint32_t NREG_MIN = 1;
const uint32_t NREG_MAX = 512;
const uint32_t NREG_STEP = 1;

const double COMPENSATE = 0.01;
const double THRESHOLD = 3;

const uint32_t NGRP_MIN = 1;
const uint32_t NGRP_MAX = 64;
const uint32_t NGRP_STEP = 1;

uint32_t NITER;

auto bench = [&](uint32_t ngrp, uint32_t nreg) {
size_t len = sizeof(float);
StorageBuffer buffer(context(), vkapi::kFloat, len);
ParamsBuffer params(context(), int32_t(len));
vkapi::PipelineBarrier pipeline_barrier{};

auto shader_name = "reg_count_" + std::to_string(nreg);

auto time = benchmark_on_gpu(shader_name, 100, [&]() {
context()->submit_compute_job(
VK_KERNEL_FROM_STR(shader_name),
pipeline_barrier,
{1, ngrp, 1},
{1, 1, 1},
{SV(NITER)},
VK_NULL_HANDLE,
0,
buffer.buffer(),
params.buffer());
});
return time;
};

std::cout << "Calculating NITER..." << std::endl;
ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); });
std::cout << "NITER," << NITER << std::endl;

uint32_t nreg_max;

DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
uint32_t nreg = NREG_MIN;
for (; nreg <= NREG_MAX; nreg += NREG_STEP) {
double time = bench(1, nreg);
std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << std::endl;
if (dj.push(time)) {
nreg -= NREG_STEP;
nreg_max = nreg;
break;
}
}
if (nreg >= NREG_MAX) {
std::cout << "Unable to conclude a maximal register count" << std::endl;
nreg_max = NREG_STEP;
} else {
std::cout << nreg_max << " registers are available at most" << std::endl;
using namespace vkapi;

class App {
private:
size_t buf_cache_size_;
uint32_t sm_count_;
uint32_t nthread_logic_;

public:
App() {
context()->initialize_querypool();

std::cout << context()->adapter_ptr()->stringize() << "\n\n";

auto cl_device = get_cl_device();

sm_count_ = cl_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();

std::cout << "\nSM count," << sm_count_ << std::endl;
std::cout << "Logic Thread Count," << nthread_logic_ << std::endl;
std::cout << "Cache Size," << buf_cache_size_ << std::endl;
}

auto find_ngrp_by_nreg = [&](const uint32_t nreg) {
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) {
auto time = bench(ngrp, nreg);
std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp
<< ", time=" << time << " us" << std::endl;
void reg_count() {
std::cout << "\n------ Register Count ------" << std::endl;
const uint32_t NREG_MIN = 1;
const uint32_t NREG_MAX = 512;
const uint32_t NREG_STEP = 1;

const double COMPENSATE = 0.01;
const double THRESHOLD = 3;

const uint32_t NGRP_MIN = 1;
const uint32_t NGRP_MAX = 64;
const uint32_t NGRP_STEP = 1;

uint32_t NITER;

auto bench = [&](uint32_t ngrp, uint32_t nreg) {
size_t len = sizeof(float);
StorageBuffer buffer(context(), vkapi::kFloat, len);
ParamsBuffer params(context(), int32_t(len));
vkapi::PipelineBarrier pipeline_barrier{};

auto shader_name = "reg_count_" + std::to_string(nreg);

auto time = benchmark_on_gpu(shader_name, 100, [&]() {
context()->submit_compute_job(
VK_KERNEL_FROM_STR(shader_name),
pipeline_barrier,
{1, ngrp, 1},
{1, 1, 1},
{SV(NITER)},
VK_NULL_HANDLE,
0,
buffer.buffer(),
params.buffer());
});
return time;
};

std::cout << "Calculating NITER..." << std::endl;
ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); });
std::cout << "NITER," << NITER << std::endl;

uint32_t nreg_max;

DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
uint32_t nreg = NREG_MIN;
for (; nreg <= NREG_MAX; nreg += NREG_STEP) {
double time = bench(1, nreg);
std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time
<< std::endl;
if (dj.push(time)) {
ngrp -= NGRP_STEP;
std::cout << "Using " << nreg << " registers can have " << ngrp
<< " concurrent single-thread workgroups" << std::endl;
return ngrp;
nreg -= NREG_STEP;
nreg_max = nreg;
break;
}
}
std::cout
<< "Unable to conclude a maximum number of concurrent single-thread workgroups when "
<< nreg << " registers are occupied" << std::endl;
return (uint32_t)1;
};

uint32_t ngrp_full, ngrp_half;
ngrp_full = find_ngrp_by_nreg(nreg_max);
ngrp_half = find_ngrp_by_nreg(nreg_max / 2);

std::string reg_ty;
if (nreg >= NREG_MAX) {
std::cout << "Unable to conclude a maximal register count" << std::endl;
nreg_max = NREG_STEP;
} else {
std::cout << nreg_max << " registers are available at most" << std::endl;
}

if (ngrp_full * 1.5 < ngrp_half) {
std::cout << "All physical threads in an sm share " << nreg_max
<< " registers" << std::endl;
reg_ty = "Pooled";
auto find_ngrp_by_nreg = [&](const uint32_t nreg) {
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) {
auto time = bench(ngrp, nreg);
std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp
<< ", time=" << time << " us" << std::endl;

if (dj.push(time)) {
ngrp -= NGRP_STEP;
std::cout << "Using " << nreg << " registers can have " << ngrp
<< " concurrent single-thread workgroups" << std::endl;
return ngrp;
}
}
std::cout
<< "Unable to conclude a maximum number of concurrent single-thread workgroups when "
<< nreg << " registers are occupied" << std::endl;
return (uint32_t)1;
};

uint32_t ngrp_full, ngrp_half;
ngrp_full = find_ngrp_by_nreg(nreg_max);
ngrp_half = find_ngrp_by_nreg(nreg_max / 2);

std::string reg_ty;

if (ngrp_full * 1.5 < ngrp_half) {
std::cout << "All physical threads in an sm share " << nreg_max
<< " registers" << std::endl;
reg_ty = "Pooled";

} else {
std::cout << "Each physical thread has " << nreg_max << " registers"
<< std::endl;
reg_ty = "Dedicated";
}

} else {
std::cout << "Each physical thread has " << nreg_max << " registers"
std::cout << "\n\nNITER," << NITER << std::endl;
std::cout << "Max registers," << nreg_max << std::endl;
std::cout << "Concurrent full single thread workgroups," << ngrp_full
<< std::endl;
reg_ty = "Dedicated";
std::cout << "Concurrent half single thread workgroups," << ngrp_half
<< std::endl;
std::cout << "Register type," << reg_ty << std::endl;
}

std::cout << "\n\nNITER," << NITER << std::endl;
std::cout << "Max registers," << nreg_max << std::endl;
std::cout << "Concurrent full single thread workgroups," << ngrp_full
<< std::endl;
std::cout << "Concurrent half single thread workgroups," << ngrp_half
<< std::endl;
std::cout << "Register type," << reg_ty << std::endl;
}
};

int main(int argc, const char** argv) {
context()->initialize_querypool();

reg_count();
App app;

app.reg_count();
return 0;
}