|
13 | 13 | #include "stats.h"
|
14 | 14 | #include "utils.h"
|
15 | 15 |
|
16 |
| -void reg_count() { |
17 |
| - const uint32_t NREG_MIN = 1; |
18 |
| - const uint32_t NREG_MAX = 512; |
19 |
| - const uint32_t NREG_STEP = 1; |
20 |
| - |
21 |
| - const double COMPENSATE = 0.01; |
22 |
| - const double THRESHOLD = 3; |
23 |
| - |
24 |
| - const uint32_t NGRP_MIN = 1; |
25 |
| - const uint32_t NGRP_MAX = 64; |
26 |
| - const uint32_t NGRP_STEP = 1; |
27 |
| - |
28 |
| - uint32_t NITER; |
29 |
| - |
30 |
| - auto bench = [&](uint32_t ngrp, uint32_t nreg) { |
31 |
| - size_t len = sizeof(float); |
32 |
| - StorageBuffer buffer(context(), vkapi::kFloat, len); |
33 |
| - ParamsBuffer params(context(), int32_t(len)); |
34 |
| - vkapi::PipelineBarrier pipeline_barrier{}; |
35 |
| - |
36 |
| - auto shader_name = "reg_count_" + std::to_string(nreg); |
37 |
| - |
38 |
| - auto time = benchmark_on_gpu(shader_name, 100, [&]() { |
39 |
| - context()->submit_compute_job( |
40 |
| - VK_KERNEL_FROM_STR(shader_name), |
41 |
| - pipeline_barrier, |
42 |
| - {1, ngrp, 1}, |
43 |
| - {1, 1, 1}, |
44 |
| - {SV(NITER)}, |
45 |
| - VK_NULL_HANDLE, |
46 |
| - 0, |
47 |
| - buffer.buffer(), |
48 |
| - params.buffer()); |
49 |
| - }); |
50 |
| - return time; |
51 |
| - }; |
52 |
| - |
53 |
| - std::cout << "Calculating NITER..." << std::endl; |
54 |
| - ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); }); |
55 |
| - std::cout << "NITER," << NITER << std::endl; |
56 |
| - |
57 |
| - uint32_t nreg_max; |
58 |
| - |
59 |
| - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); |
60 |
| - uint32_t nreg = NREG_MIN; |
61 |
| - for (; nreg <= NREG_MAX; nreg += NREG_STEP) { |
62 |
| - double time = bench(1, nreg); |
63 |
| - std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << std::endl; |
64 |
| - if (dj.push(time)) { |
65 |
| - nreg -= NREG_STEP; |
66 |
| - nreg_max = nreg; |
67 |
| - break; |
68 |
| - } |
69 |
| - } |
70 |
| - if (nreg >= NREG_MAX) { |
71 |
| - std::cout << "Unable to conclude a maximal register count" << std::endl; |
72 |
| - nreg_max = NREG_STEP; |
73 |
| - } else { |
74 |
| - std::cout << nreg_max << " registers are available at most" << std::endl; |
| 16 | +using namespace vkapi; |
| 17 | + |
| 18 | +class App { |
| 19 | + private: |
| 20 | + size_t buf_cache_size_; |
| 21 | + uint32_t sm_count_; |
| 22 | + uint32_t nthread_logic_; |
| 23 | + |
| 24 | + public: |
| 25 | + App() { |
| 26 | + context()->initialize_querypool(); |
| 27 | + |
| 28 | + std::cout << context()->adapter_ptr()->stringize() << std::endl |
| 29 | + << std::endl; |
| 30 | + |
| 31 | + auto cl_device = get_cl_device(); |
| 32 | + |
| 33 | + sm_count_ = cl_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); |
| 34 | + nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); |
| 35 | + buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>(); |
| 36 | + |
| 37 | + std::cout << std::endl; |
| 38 | + std::cout << "SM count," << sm_count_ << std::endl; |
| 39 | + std::cout << "Logic Thread Count," << nthread_logic_ << std::endl; |
| 40 | + std::cout << "Cache Size," << buf_cache_size_ << std::endl; |
75 | 41 | }
|
76 | 42 |
|
77 |
| - auto find_ngrp_by_nreg = [&](const uint32_t nreg) { |
78 |
| - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); |
79 |
| - for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) { |
80 |
| - auto time = bench(ngrp, nreg); |
81 |
| - std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp |
82 |
| - << ", time=" << time << " us" << std::endl; |
| 43 | + void reg_count() { |
| 44 | + std::cout << std::endl; |
| 45 | + std::cout << "------ Register Count ------" << std::endl; |
| 46 | + const uint32_t NREG_MIN = 1; |
| 47 | + const uint32_t NREG_MAX = 512; |
| 48 | + const uint32_t NREG_STEP = 1; |
| 49 | + |
| 50 | + const double COMPENSATE = 0.01; |
| 51 | + const double THRESHOLD = 3; |
| 52 | + |
| 53 | + const uint32_t NGRP_MIN = 1; |
| 54 | + const uint32_t NGRP_MAX = 64; |
| 55 | + const uint32_t NGRP_STEP = 1; |
| 56 | + |
| 57 | + uint32_t NITER; |
| 58 | + |
| 59 | + auto bench = [&](uint32_t ngrp, uint32_t nreg) { |
| 60 | + size_t len = sizeof(float); |
| 61 | + StorageBuffer buffer(context(), vkapi::kFloat, len); |
| 62 | + ParamsBuffer params(context(), int32_t(len)); |
| 63 | + vkapi::PipelineBarrier pipeline_barrier{}; |
| 64 | + |
| 65 | + auto shader_name = "reg_count_" + std::to_string(nreg); |
| 66 | + |
| 67 | + auto time = benchmark_on_gpu(shader_name, 100, [&]() { |
| 68 | + context()->submit_compute_job( |
| 69 | + VK_KERNEL_FROM_STR(shader_name), |
| 70 | + pipeline_barrier, |
| 71 | + {1, ngrp, 1}, |
| 72 | + {1, 1, 1}, |
| 73 | + {SV(NITER)}, |
| 74 | + VK_NULL_HANDLE, |
| 75 | + 0, |
| 76 | + buffer.buffer(), |
| 77 | + params.buffer()); |
| 78 | + }); |
| 79 | + return time; |
| 80 | + }; |
| 81 | + |
| 82 | + std::cout << "Calculating NITER..." << std::endl; |
| 83 | + ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); }); |
| 84 | + std::cout << "NITER," << NITER << std::endl; |
| 85 | + |
| 86 | + uint32_t nreg_max; |
83 | 87 |
|
| 88 | + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); |
| 89 | + uint32_t nreg = NREG_MIN; |
| 90 | + for (; nreg <= NREG_MAX; nreg += NREG_STEP) { |
| 91 | + double time = bench(1, nreg); |
| 92 | + std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time |
| 93 | + << std::endl; |
84 | 94 | if (dj.push(time)) {
|
85 |
| - ngrp -= NGRP_STEP; |
86 |
| - std::cout << "Using " << nreg << " registers can have " << ngrp |
87 |
| - << " concurrent single-thread workgroups" << std::endl; |
88 |
| - return ngrp; |
| 95 | + nreg -= NREG_STEP; |
| 96 | + nreg_max = nreg; |
| 97 | + break; |
89 | 98 | }
|
90 | 99 | }
|
91 |
| - std::cout |
92 |
| - << "Unable to conclude a maximum number of concurrent single-thread workgroups when " |
93 |
| - << nreg << " registers are occupied" << std::endl; |
94 |
| - return (uint32_t)1; |
95 |
| - }; |
96 |
| - |
97 |
| - uint32_t ngrp_full, ngrp_half; |
98 |
| - ngrp_full = find_ngrp_by_nreg(nreg_max); |
99 |
| - ngrp_half = find_ngrp_by_nreg(nreg_max / 2); |
100 |
| - |
101 |
| - std::string reg_ty; |
| 100 | + if (nreg >= NREG_MAX) { |
| 101 | + std::cout << "Unable to conclude a maximal register count" << std::endl; |
| 102 | + nreg_max = NREG_STEP; |
| 103 | + } else { |
| 104 | + std::cout << nreg_max << " registers are available at most" << std::endl; |
| 105 | + } |
102 | 106 |
|
103 |
| - if (ngrp_full * 1.5 < ngrp_half) { |
104 |
| - std::cout << "All physical threads in an sm share " << nreg_max |
105 |
| - << " registers" << std::endl; |
106 |
| - reg_ty = "Pooled"; |
| 107 | + auto find_ngrp_by_nreg = [&](const uint32_t nreg) { |
| 108 | + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); |
| 109 | + for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) { |
| 110 | + auto time = bench(ngrp, nreg); |
| 111 | + std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp |
| 112 | + << ", time=" << time << " us" << std::endl; |
| 113 | + |
| 114 | + if (dj.push(time)) { |
| 115 | + ngrp -= NGRP_STEP; |
| 116 | + std::cout << "Using " << nreg << " registers can have " << ngrp |
| 117 | + << " concurrent single-thread workgroups" << std::endl; |
| 118 | + return ngrp; |
| 119 | + } |
| 120 | + } |
| 121 | + std::cout |
| 122 | + << "Unable to conclude a maximum number of concurrent single-thread workgroups when " |
| 123 | + << nreg << " registers are occupied" << std::endl; |
| 124 | + return (uint32_t)1; |
| 125 | + }; |
| 126 | + |
| 127 | + uint32_t ngrp_full, ngrp_half; |
| 128 | + ngrp_full = find_ngrp_by_nreg(nreg_max); |
| 129 | + ngrp_half = find_ngrp_by_nreg(nreg_max / 2); |
| 130 | + |
| 131 | + std::string reg_ty; |
| 132 | + |
| 133 | + if (ngrp_full * 1.5 < ngrp_half) { |
| 134 | + std::cout << "All physical threads in an sm share " << nreg_max |
| 135 | + << " registers" << std::endl; |
| 136 | + reg_ty = "Pooled"; |
| 137 | + |
| 138 | + } else { |
| 139 | + std::cout << "Each physical thread has " << nreg_max << " registers" |
| 140 | + << std::endl; |
| 141 | + reg_ty = "Dedicated"; |
| 142 | + } |
107 | 143 |
|
108 |
| - } else { |
109 |
| - std::cout << "Each physical thread has " << nreg_max << " registers" |
| 144 | + std::cout << std::endl << std::endl; |
| 145 | + std::cout << "NITER," << NITER << std::endl; |
| 146 | + std::cout << "Max registers," << nreg_max << std::endl; |
| 147 | + std::cout << "Concurrent full single thread workgroups," << ngrp_full |
| 148 | + << std::endl; |
| 149 | + std::cout << "Concurrent half single thread workgroups," << ngrp_half |
110 | 150 | << std::endl;
|
111 |
| - reg_ty = "Dedicated"; |
| 151 | + std::cout << "Register type," << reg_ty << std::endl; |
112 | 152 | }
|
113 |
| - |
114 |
| - std::cout << "\n\nNITER," << NITER << std::endl; |
115 |
| - std::cout << "Max registers," << nreg_max << std::endl; |
116 |
| - std::cout << "Concurrent full single thread workgroups," << ngrp_full |
117 |
| - << std::endl; |
118 |
| - std::cout << "Concurrent half single thread workgroups," << ngrp_half |
119 |
| - << std::endl; |
120 |
| - std::cout << "Register type," << reg_ty << std::endl; |
121 |
| -} |
| 153 | +}; |
122 | 154 |
|
123 | 155 | int main(int argc, const char** argv) {
|
124 |
| - context()->initialize_querypool(); |
125 |
| - |
126 |
| - reg_count(); |
| 156 | + App app; |
127 | 157 |
|
| 158 | + app.reg_count(); |
128 | 159 | return 0;
|
129 | 160 | }
|
0 commit comments