/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>

#include "app.h"
#include "stats.h"
#include "utils.h"

using namespace vkapi;

namespace gpuinfo {

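// Estimate the per-thread register budget. Shaders named "reg_count_<N>" each
// occupy N registers per thread; the register count is swept upward until a
// jump in latency indicates the budget has been exhausted. Concurrency is then
// probed at full and at half register usage to tell whether registers are
// pooled across the SM or dedicated to each physical thread.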
void reg_count(const App& app) {
  if (!app.enabled("reg_count")) {
    std::cout << "Skipped Register Count" << std::endl;
    return;
  }

  std::cout << std::endl;
  std::cout << "------ Register Count ------" << std::endl;
  const uint32_t NREG_MIN = 1;
  const uint32_t NREG_MAX = 512;
  const uint32_t NREG_STEP = 1;

  const double COMPENSATE = app.get_config("reg_count", "compensate");
  const double THRESHOLD = app.get_config("reg_count", "threshold");

  const uint32_t NGRP_MIN = 1;
  const uint32_t NGRP_MAX = 64;
  const uint32_t NGRP_STEP = 1;

  uint32_t NITER;

  auto bench = [&](uint32_t ngrp, uint32_t nreg) {
    StagingBuffer buffer(context(), vkapi::kFloat, 1);
    vkapi::PipelineBarrier pipeline_barrier{};

    auto shader_name = "reg_count_" + std::to_string(nreg);

    auto time = benchmark_on_gpu(shader_name, 30, [&]() {
      context()->submit_compute_job(
          VK_KERNEL_FROM_STR(shader_name),
          pipeline_barrier,
          {1, ngrp, 1},
          {1, 1, 1},
          {SV(NITER)},
          VK_NULL_HANDLE,
          0,
          buffer.buffer());
    });
    return time;
  };

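  // Calibrate NITER so that even the cheapest configuration runs long enough
  // to be timed reliably.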
  ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); });

  uint32_t nreg_max;

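  // Sweep the per-thread register count upward. A jump in latency suggests the
  // register budget has been exceeded, so the previous value is taken as the
  // maximum usable register count.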
  DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
  uint32_t nreg = NREG_MIN;
  for (; nreg <= NREG_MAX; nreg += NREG_STEP) {
    double time = bench(1, nreg);
    std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << "\tus"
              << std::endl;
    if (dj.push(time)) {
      nreg -= NREG_STEP;
      nreg_max = nreg;
      break;
    }
  }
  if (nreg >= NREG_MAX) {
    std::cout << "Unable to conclude a maximal register count" << std::endl;
    nreg_max = NREG_MAX;
  } else {
    std::cout << nreg_max << " registers are available at most" << std::endl;
  }

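  // For a given per-thread register usage, raise the number of single-thread
  // workgroups until latency jumps; the last value before the jump
  // approximates how many such workgroups can run concurrently.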
  auto find_ngrp_by_nreg = [&](const uint32_t nreg) {
    DtJumpFinder<3> dj(COMPENSATE, THRESHOLD);
    for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) {
      auto time = bench(ngrp, nreg);
      std::cout << "Testing occupation (nreg=\t" << nreg << "\t); ngrp=\t"
                << ngrp << "\t, time=\t" << time << "\tus" << std::endl;

      if (dj.push(time)) {
        ngrp -= NGRP_STEP;
        std::cout << "Using " << nreg << " registers can have " << ngrp
                  << " concurrent single-thread workgroups" << std::endl;
        return ngrp;
      }
    }
    std::cout
        << "Unable to conclude a maximum number of concurrent single-thread workgroups when "
        << nreg << " registers are occupied" << std::endl;
    return (uint32_t)1;
  };

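  // Probe concurrency at full and at half of the detected register usage to
  // infer how the register file is organized.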
  uint32_t ngrp_full, ngrp_half;
  ngrp_full = find_ngrp_by_nreg(nreg_max);
  ngrp_half = find_ngrp_by_nreg(nreg_max / 2);

  std::string reg_ty;

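  // If noticeably more workgroups fit once register usage is halved, the
  // threads appear to draw from a shared (pooled) register file; otherwise
  // each physical thread likely has its own dedicated registers.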
  if (ngrp_full * 1.5 < ngrp_half) {
    std::cout << "All physical threads in an SM share " << nreg_max
              << " registers" << std::endl;
    reg_ty = "Pooled";
  } else {
    std::cout << "Each physical thread has " << nreg_max << " registers"
              << std::endl;
    reg_ty = "Dedicated";
  }

  std::cout << std::endl << std::endl;
  std::cout << "MaxRegisters," << nreg_max << std::endl;
  std::cout << "ConcurrentWorkgroupsFullReg," << ngrp_full << std::endl;
  std::cout << "ConcurrentWorkgroupsHalfReg," << ngrp_half << std::endl;
  std::cout << "RegisterType," << reg_ty << std::endl;
}

// Warp size is a difficult metric to obtain because the hardware limitations
// do not always coincide with the way the SM divides the workload. For
// instance, the hardware can have a warp size of 64 threads, but an SM might
// be able to simulate concurrency of 128 threads with a single scheduler.

// Because of this, it is important to measure the warp size in different
// ways that can reveal both the physical limitations of the hardware and the
// actual behavior of the driver.

// Additionally, the SM can behave in two different ways when the assigned
// workload is smaller than the warp size.

// In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty
// threads and maintain a uniform workload.

// In Case 2, like in Adreno, the driver might decide to pack multiple
// workloads together and dispatch them at once.
void warp_size(const App& app, const bool verbose = false) {
  if (!app.enabled("warp_size")) {
    std::cout << "Skipped Warp Size" << std::endl;
    return;
  }

  std::cout << "\n------ Warp Size ------" << std::endl;

  // Method A: Stress test with a kernel that uses complex ALU operations like
  // integer division to avoid latency hiding. Increase the number of threads
  // until a jump in latency is detected.

  // This timing-based method helps us identify physical warp sizes. It also
  // helps with Case 2, when threads of multiple warps are managed by the same
  // scheduler at the same time.
  const double COMPENSATE = app.get_config("warp_size", "compensate");
  const double THRESHOLD = app.get_config("warp_size", "threshold");

  uint32_t NITER;

  auto bench = [&](uint32_t nthread) {
    StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic);
    vkapi::PipelineBarrier pipeline_barrier{};

    auto shader_name = "warp_size_physical";

    auto time = benchmark_on_gpu(shader_name, 10, [&]() {
      context()->submit_compute_job(
          VK_KERNEL_FROM_STR(shader_name),
          pipeline_barrier,
          // Large number of work groups selected to potentially saturate all
          // ALUs and thus have a better baseline for comparison.
          {nthread, 1024, 1},
          {nthread, 1, 1},
          {SV(NITER)},
          VK_NULL_HANDLE,
          0,
          out_buf.buffer());
    });

    return time;
  };

  ensure_min_niter(1000, NITER, [&]() { return bench(1); });

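  // Start from the reported subgroup size; it is used as a fallback if no
  // latency jump is detected in the sweep below.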
  uint32_t warp_size = app.subgroup_size;
  DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);

  // We increase the number of threads until we hit a jump in the data.
  uint32_t nthread = 1;
  for (; nthread <= app.nthread_logic; ++nthread) {
    double time = bench(nthread);
    std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)"
              << std::endl;
    if (dj.push(time)) {
      warp_size = nthread - 1;
      break;
    }
  }
  if (nthread >= app.nthread_logic) {
    std::cout
        << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size"
        << std::endl;
  }

  // Method B: Let all the threads in a warp race and atomically fetch-add
  // a counter, then store the counter values to the output buffer in the
  // scheduling order of these threads. If all the order numbers follow an
  // ascending order, then the threads are likely executing within a warp.
  // Threads in different warps are not managed by the same scheduler, so they
  // would race for the same ID out of order, unaware of each other.

  // This method reveals the actual driver behavior when scheduling concurrent
  // threads, regardless of the physical limitations of the hardware.

  // Likewise, this method helps us identify warp sizes when the SM
  // sub-divides its ALUs into independent groups, like the three execution
  // engines in a Mali G76 core. It helps warp-probing in Case 1 because it
  // doesn't depend on kernel timing, so the extra wait time doesn't lead to
  // inaccuracy.
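  // bench_sm dispatches a single workgroup of nthread threads and returns how
  // many leading entries in the output are in ascending order.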
  auto bench_sm = [&](uint32_t nthread) {
    StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic);
    vkapi::PipelineBarrier pipeline_barrier{};

    auto shader_name = "warp_size_scheduler";

    benchmark_on_gpu(shader_name, 1, [&]() {
      context()->submit_compute_job(
          VK_KERNEL_FROM_STR(shader_name),
          pipeline_barrier,
          {nthread, 1, 1},
          {nthread, 1, 1},
          {},
          VK_NULL_HANDLE,
          0,
          out_buf.buffer());
    });

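    // Read back the order numbers each thread recorded.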
    std::vector<int32_t> data(app.nthread_logic);
    out_buf.copy_to(data.data(), out_buf.nbytes());

    if (verbose) {
      std::stringstream ss;
      for (auto j = 0; j < nthread; ++j) {
        ss << data[j] << " ";
      }
      std::cout << ss.str() << std::endl;
    }

    // Check up to which point the data is in ascending order.
    int32_t last = -1;
    int32_t j = 0;
    for (; j < nthread; ++j) {
      if (last >= data[j]) {
        break;
      }
      last = data[j];
    }

    return j;
  };

  // Test increasing sizes until the data is no longer in ascending order.
  uint32_t warp_size_scheduler = warp_size;
  int i = 1;
  for (; i <= app.nthread_logic; ++i) {
    uint32_t nascend = bench_sm(i);
    if (nascend != i) {
      warp_size_scheduler = nascend;
      break;
    }
  }
  if (i > app.nthread_logic) {
    std::cout << "Unable to conclude an SM Warp Size." << std::endl;
  }

  std::cout << "PhysicalWarpSize," << warp_size << std::endl;
  std::cout << "SMWarpSize," << warp_size_scheduler << std::endl;
}
} // namespace gpuinfo