/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>

#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include "app.h"
#include "stats.h"
#include "utils.h"

using namespace vkapi;

namespace gpuinfo {

void reg_count(const App& app) {
  if (!app.enabled("reg_count")) {
    std::cout << "Skipped Register Count" << std::endl;
    return;
  }

  std::cout << std::endl;
  std::cout << "------ Register Count ------" << std::endl;
  const uint32_t NREG_MIN = 1;
  const uint32_t NREG_MAX = 512;
  const uint32_t NREG_STEP = 1;

  const double COMPENSATE = app.get_config("reg_count", "compensate");
  const double THRESHOLD = app.get_config("reg_count", "threshold");
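  // COMPENSATE and THRESHOLD tune the DtJumpFinder used below: a latency
  // "jump" is flagged when a new measurement rises sharply above the recent
  // trend, with THRESHOLD scaling how large the increase must be and
  // COMPENSATE absorbing small fluctuations (see stats.h for the exact rule).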

  const uint32_t NGRP_MIN = 1;
  const uint32_t NGRP_MAX = 64;
  const uint32_t NGRP_STEP = 1;

  uint32_t NITER;

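  // `bench` launches `ngrp` single-thread workgroups of the reg_count_<nreg>
  // shader and returns the measured latency in microseconds. Each shader
  // variant is assumed to keep roughly <nreg> registers live across NITER
  // iterations, so latency rises once the requested register footprint
  // exceeds what the hardware can hold in registers.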
  auto bench = [&](uint32_t ngrp, uint32_t nreg) {
    StagingBuffer buffer(context(), vkapi::kFloat, 1);
    vkapi::PipelineBarrier pipeline_barrier{};

    auto shader_name = "reg_count_" + std::to_string(nreg);

    auto time = benchmark_on_gpu(shader_name, 30, [&]() {
      context()->submit_compute_job(
          VK_KERNEL_FROM_STR(shader_name),
          pipeline_barrier,
          {1, ngrp, 1},
          {1, 1, 1},
          {SV(NITER)},
          VK_NULL_HANDLE,
          0,
          buffer.buffer());
    });
    return time;
  };

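  // Calibrate NITER before sweeping: ensure_min_niter (see utils.h) is
  // assumed to grow NITER until a single probe run is long enough that the
  // measured latencies sit well above timer resolution and noise.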
  ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); });

  uint32_t nreg_max;

  DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
  uint32_t nreg = NREG_MIN;
  for (; nreg <= NREG_MAX; nreg += NREG_STEP) {
    double time = bench(1, nreg);
    std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << "\tus"
              << std::endl;
    if (dj.push(time)) {
      nreg -= NREG_STEP;
      nreg_max = nreg;
      break;
    }
  }
  if (nreg >= NREG_MAX) {
    std::cout << "Unable to conclude a maximal register count" << std::endl;
    nreg_max = NREG_STEP;
  } else {
    std::cout << nreg_max << " registers are available at most" << std::endl;
  }
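  // At this point nreg_max holds the largest register footprint that did not
  // trigger a latency jump, i.e. this probe's estimate of the register budget
  // available to a single thread.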

  auto find_ngrp_by_nreg = [&](const uint32_t nreg) {
    DtJumpFinder<3> dj(COMPENSATE, THRESHOLD);
    for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) {
      auto time = bench(ngrp, nreg);
      std::cout << "Testing occupation (nreg=\t" << nreg << "\t); ngrp=\t"
                << ngrp << "\t, time=\t" << time << "\tus" << std::endl;

      if (dj.push(time)) {
        ngrp -= NGRP_STEP;
        std::cout << "Using " << nreg << " registers can have " << ngrp
                  << " concurrent single-thread workgroups" << std::endl;
        return ngrp;
      }
    }
    std::cout
        << "Unable to conclude a maximum number of concurrent single-thread workgroups when "
        << nreg << " registers are occupied" << std::endl;
    return (uint32_t)1;
  };
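  // find_ngrp_by_nreg sweeps the number of single-thread workgroups at a
  // fixed register footprint. While the workgroups still fit on the shader
  // core concurrently, latency stays roughly flat; the first latency jump
  // marks the occupancy limit for that register usage.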

  uint32_t ngrp_full, ngrp_half;
  ngrp_full = find_ngrp_by_nreg(nreg_max);
  ngrp_half = find_ngrp_by_nreg(nreg_max / 2);

  std::string reg_ty;

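  // Heuristic for classifying the register file: if halving the per-thread
  // register footprint allows noticeably more (here, >1.5x) concurrent
  // workgroups, the registers likely come from a pool shared by all threads
  // on the SM; if occupancy barely changes, each thread appears to own a
  // dedicated budget. For example (illustrative numbers only), ngrp_full = 8
  // and ngrp_half = 16 would be reported as "Pooled", while 8 vs 9 would be
  // reported as "Dedicated".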
  if (ngrp_full * 1.5 < ngrp_half) {
    std::cout << "All physical threads in an SM share " << nreg_max
              << " registers" << std::endl;
    reg_ty = "Pooled";
  } else {
    std::cout << "Each physical thread has " << nreg_max << " registers"
              << std::endl;
    reg_ty = "Dedicated";
  }

  std::cout << std::endl << std::endl;
  std::cout << "MaxRegisters," << nreg_max << std::endl;
  std::cout << "ConcurrentWorkgroupsFullReg," << ngrp_full << std::endl;
  std::cout << "ConcurrentWorkgroupsHalfReg," << ngrp_half << std::endl;
  std::cout << "RegisterType," << reg_ty << std::endl;
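  // The four lines above form the machine-readable summary of this probe,
  // e.g. (illustrative values only):
  //   MaxRegisters,256
  //   ConcurrentWorkgroupsFullReg,6
  //   ConcurrentWorkgroupsHalfReg,12
  //   RegisterType,Pooled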
}

// Warp size is a difficult metric to obtain because the hardware limitations
// do not always coincide with the way the SM divides the workload. For
// instance, the hardware can have a warp size of 64 threads, but an SM might
// be able to simulate concurrency of 128 threads with a single scheduler.

// Because of this, it is important to measure the warp size in different
// ways that can evidence both the physical limitations of the hardware and
// the actual behavior of the driver.

// Additionally, the SM can behave in two different ways when the assigned
// workload is smaller than the warp size.

// In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty
// threads and maintain a uniform workload.

// In Case 2, like in Adreno, the driver might decide to pack multiple
// workloads together and dispatch them at once.
void warp_size(const App& app, const bool verbose = false) {
  if (!app.enabled("warp_size")) {
    std::cout << "Skipped Warp Size" << std::endl;
    return;
  }

  std::cout << "\n------ Warp Size ------" << std::endl;

  // Method A: Stress test with a kernel that uses complex ALU operations like
  // integer division to avoid latency hiding. Increase the number of threads
  // until a jump in latency is detected.

  // This timing-based method helps us identify physical warp sizes. It also
  // helps with Case 2, when threads of multiple warps are managed by the same
  // scheduler at the same time.
  const double COMPENSATE = app.get_config("warp_size", "compensate");
  const double THRESHOLD = app.get_config("warp_size", "threshold");

  uint32_t NITER;

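  // `bench` dispatches workgroups of `nthread` invocations of the
  // warp_size_physical shader, which is assumed to loop over ALU-heavy work
  // (e.g. integer division) for NITER iterations. As long as the extra
  // invocations still fit in one physical warp, latency stays flat; once they
  // spill into a second warp, latency jumps.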
  auto bench = [&](uint32_t nthread) {
    StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic);
    vkapi::PipelineBarrier pipeline_barrier{};

    auto shader_name = "warp_size_physical";

    auto time = benchmark_on_gpu(shader_name, 10, [&]() {
      context()->submit_compute_job(
          VK_KERNEL_FROM_STR(shader_name),
          pipeline_barrier,
          // Large number of work groups selected to potentially saturate all
          // ALUs and thus have a better baseline for comparison.
          {nthread, 1024, 1},
          {nthread, 1, 1},
          {SV(NITER)},
          VK_NULL_HANDLE,
          0,
          out_buf.buffer());
    });

    return time;
  };

  ensure_min_niter(1000, NITER, [&]() { return bench(1); });

  uint32_t warp_size = app.subgroup_size;
  DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);

  // We increase the number of threads until we hit a jump in the data.
  uint32_t nthread = 1;
  for (; nthread <= app.nthread_logic; ++nthread) {
    double time = bench(nthread);
    std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)"
              << std::endl;
    if (dj.push(time)) {
      warp_size = nthread - 1;
      break;
    }
  }
  if (nthread >= app.nthread_logic) {
    std::cout
        << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size"
        << std::endl;
  }

  // Method B: Let all the threads in a warp race and atomically fetch-add
  // a counter, then store the counter values to the output buffer in the
  // scheduling order of these threads. If all the order numbers follow an
  // ascending order, then the threads are likely executing within a warp.
  // Threads in different warps are not managed by the same scheduler, so they
  // would race for the same ID out of order, unaware of each other.

  // This method evidences the actual driver behavior when running concurrent
  // threads, regardless of the physical limitations of the hardware.

  // Likewise, this method helps us identify warp sizes when the SM
  // sub-divides its ALUs into independent groups, like the three execution
  // engines in a Mali G76 core. It helps warp-probing in Case 1 because it
  // doesn't depend on kernel timing, so the extra wait time doesn't lead to
  // inaccuracy.
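  // The warp_size_scheduler shader is expected to implement the race
  // described above; conceptually (a sketch, not the actual shader source),
  // each invocation does something like
  //   out_buf[gl_LocalInvocationID.x] = atomicAdd(counter, 1);
  // so out_buf records the order in which the scheduler let the threads run.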
  auto bench_sm = [&](uint32_t nthread) {
    StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic);
    vkapi::PipelineBarrier pipeline_barrier{};

    auto shader_name = "warp_size_scheduler";

    benchmark_on_gpu(shader_name, 1, [&]() {
      context()->submit_compute_job(
          VK_KERNEL_FROM_STR(shader_name),
          pipeline_barrier,
          {nthread, 1, 1},
          {nthread, 1, 1},
          {},
          VK_NULL_HANDLE,
          0,
          out_buf.buffer());
    });

    std::vector<int32_t> data(app.nthread_logic);
    out_buf.copy_to(data.data(), out_buf.nbytes());

    if (verbose) {
      std::stringstream ss;
      for (auto j = 0; j < nthread; ++j) {
        ss << data[j] << " ";
      }
      std::cout << ss.str() << std::endl;
    }

    // Check up to which point the data is in ascending order.
    int32_t last = -1;
    int32_t j = 0;
    for (; j < nthread; ++j) {
      if (last >= data[j]) {
        break;
      }
      last = data[j];
    }

    return j;
  };
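  // bench_sm returns the length of the longest ascending prefix of the
  // recorded IDs, i.e. how many threads appear to have been scheduled
  // together as a single warp.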

  // Test increasing sizes until the data is no longer in ascending order.
  uint32_t warp_size_scheduler = warp_size;
  int i = 1;
  for (; i <= app.nthread_logic; ++i) {
    uint32_t nascend = bench_sm(i);
    if (nascend != i) {
      warp_size_scheduler = nascend;
      break;
    }
  }
  if (i > app.nthread_logic) {
    std::cout << "Unable to conclude an SM Warp Size." << std::endl;
  }

  std::cout << "PhysicalWarpSize," << warp_size << std::endl;
  std::cout << "SMWarpSize," << warp_size_scheduler << std::endl;
}
} // namespace gpuinfo