/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include "app.h"
#include "stats.h"
#include "utils.h"

using namespace vkapi;

namespace gpuinfo {

void buf_cacheline_size(const App& app) {
  if (!app.enabled("buf_cacheline_size")) {
    std::cout << "Skipped Buffer Cacheline Size" << std::endl;
    return;
  }

  std::cout << std::endl;
  std::cout << "------ Buffer Cacheline Size ------" << std::endl;

  const double COMPENSATE = app.get_config("buf_cacheline_size", "compensate");
  const double THRESHOLD = app.get_config("buf_cacheline_size", "threshold");

  const uint32_t PITCH = app.buf_cache_size / app.nthread_logic;
  const uint32_t BUF_SIZE = app.buf_cache_size;
  const uint32_t MAX_STRIDE = PITCH;

  uint32_t NITER;

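  // Benchmark idea: each invocation strides through its own PITCH-sized
  // region of the buffer. While the stride is smaller than a cacheline,
  // consecutive reads share cachelines; once it exceeds the cacheline size,
  // every read touches a new line and the measured time jumps.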
  auto bench = [&](int stride) {
    StagingBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE);
    StagingBuffer out_buf(context(), vkapi::kFloat, 1);
    vkapi::PipelineBarrier pipeline_barrier{};

    auto shader_name = "buf_cacheline_size";

    auto time = benchmark_on_gpu(shader_name, 100, [&]() {
      context()->submit_compute_job(
          VK_KERNEL_FROM_STR(shader_name),
          pipeline_barrier,
          {app.nthread_logic, 1, 1},
          {app.nthread_logic, 1, 1},
          {SV(NITER), SV(stride), SV(PITCH)},
          VK_NULL_HANDLE,
          0,
          in_buf.buffer(),
          out_buf.buffer());
    });
    return time;
  };

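  // Calibrate NITER so that a single run of the kernel takes long enough
  // (on the order of 1000 us) to be measured reliably above timer noise.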
  ensure_min_niter(1000, NITER, [&]() { return bench(1); });

  uint32_t cacheline_size;

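  // DtJumpFinder tracks the time-vs-stride series and flags the first
  // significant jump (sensitivity controlled by COMPENSATE and THRESHOLD);
  // the stride at that jump, converted to bytes, is reported as the
  // cacheline size.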
  DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
  uint32_t stride = 1;
  for (; stride <= MAX_STRIDE; ++stride) {
    double time = bench(stride);
    std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time
              << std::endl;

    if (dj.push(time)) {
      cacheline_size = stride * sizeof(float);
      break;
    }
  }
  if (stride >= MAX_STRIDE) {
    std::cout << "Unable to conclude a top level buffer cacheline size."
              << std::endl;
    cacheline_size = MAX_STRIDE * sizeof(float);
  }

  std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl;
}

void _bandwidth(
    const App& app,
    const std::string memtype,
    const uint32_t range) {
  auto memtype_lower = memtype;
  std::transform(
      memtype_lower.begin(),
      memtype_lower.end(),
      memtype_lower.begin(),
      [](unsigned char c) { return std::tolower(c); });

  auto test_name = memtype_lower + "_bandwidth";

  // Cache lines flushed
  const uint32_t NFLUSH = app.get_config(test_name, "nflush");
  // Number of loop unrolls. Changing this value requires an equal change in
  // buf_bandwidth.yaml
  const uint32_t NUNROLL = app.get_config(test_name, "nunroll");
  // Number of iterations. Increasing this value reduces noise in exchange for
  // higher latency.
  const uint32_t NITER = app.get_config(test_name, "niter");
  // Vector dimensions (vec4)
  const uint32_t VEC_WIDTH = 4;
  const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
  // Number of vectors that fit in the selected memory space
  const uint32_t NVEC = range / VEC_SIZE;
  // Number of memory reads per thread
  const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
  // Number of threads needed to read all vectors.
  // For shared memory, the thread count is not divided by the per-thread
  // workload because the memory region is too small.
  const uint32_t NTHREAD = memtype == "Shared" ? NVEC : NVEC / NREAD_PER_THREAD;
  // Occupy all threads
  const uint32_t local_x = app.nthread_logic;
  // Ensure that global is a multiple of local, and distribute across all SMs
  const uint32_t global_x =
      (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH;
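  // Hypothetical example: NTHREAD = 10000 with local_x = 256 rounds down to
  // 9984 invocations, which are then replicated across app.sm_count SMs and
  // NFLUSH passes so that every SM stays occupied.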

  auto bench = [&](uint32_t access_size) {
    // Number of vectors that fit in this iteration
    const uint32_t nvec_access = access_size / VEC_SIZE;

    // The address mask works as a modulo because x % 2^n == x & (2^n - 1).
    // This helps limit the accesses to a specific set of unique addresses,
    // depending on the access size we want to measure.
    const uint32_t addr_mask = nvec_access - 1;
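    // Example: access_size = 64 B with VEC_SIZE = 16 B gives nvec_access = 4
    // and addr_mask = 0b11, so vector indices wrap within the first 4 vec4s.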

    // This is to distribute the accesses to unique addresses across the
    // workgroups, once the size of the access exceeds the workgroup width.
    const uint32_t workgroup_width = local_x * NITER * NUNROLL;

    StagingBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float));
    StagingBuffer out_buf(
        context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic);
    vkapi::PipelineBarrier pipeline_barrier{};

    auto shader_name = "buf_bandwidth_" + memtype_lower;

    auto time = benchmark_on_gpu(shader_name, 10, [&]() {
      context()->submit_compute_job(
          VK_KERNEL_FROM_STR(shader_name),
          pipeline_barrier,
          {global_x, 1, 1},
          {local_x, 1, 1},
          {SV(NITER),
           SV(nvec_access),
           SV(local_x),
           SV(addr_mask),
           SV(workgroup_width)},
          VK_NULL_HANDLE,
          0,
          in_buf.buffer(),
          out_buf.buffer());
    });

    const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
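    // benchmark_on_gpu reports time in microseconds (see the print below),
    // so bytes / us scaled by 1e-3 yields GB/s.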
    auto gbps = SIZE_TRANS * 1e-3 / time;
    std::cout << memtype << " bandwidth accessing \t" << access_size
              << "\tB unique data is \t" << gbps << " \tgbps (\t" << time
              << "\tus)" << std::endl;
    return gbps;
  };

  double max_bandwidth = 0;
  double min_bandwidth = DBL_MAX;
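  // Sweep the amount of unique data accessed from one vec4 up to the full
  // range; small working sets typically stay cache-resident (max bandwidth)
  // while large ones spill to slower memory (min bandwidth).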
  for (uint32_t access_size = VEC_SIZE; access_size < range; access_size *= 2) {
    double gbps = bench(access_size);
    max_bandwidth = std::max(gbps, max_bandwidth);
    min_bandwidth = std::min(gbps, min_bandwidth);
  }

  std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth
            << std::endl;
  std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth
            << std::endl;
}

void buf_bandwidth(const App& app) {
  if (!app.enabled("buffer_bandwidth")) {
    std::cout << "Skipped Memory Bandwidth" << std::endl;
    return;
  }

  std::cout << "\n------ Memory Bandwidth ------" << std::endl;
  // Maximum memory space read - 128MB
  // For regular devices, bandwidth plateaus at less memory than this, so more
  // is not needed.
  const uint32_t RANGE = app.get_config("buffer_bandwidth", "range");
  _bandwidth(app, "Buffer", RANGE);
}

void ubo_bandwidth(const App& app) {
  if (!app.enabled("ubo_bandwidth")) {
    std::cout << "Skipped UBO Bandwidth" << std::endl;
    return;
  }

  std::cout << "\n------ UBO Bandwidth ------" << std::endl;
  const uint32_t RANGE = app.get_config("ubo_bandwidth", "range");
  _bandwidth(app, "UBO", RANGE);
}

void shared_mem_bandwidth(const App& app) {
  if (!app.enabled("shared_bandwidth")) {
    std::cout << "Skipped Shared Memory Bandwidth" << std::endl;
    return;
  }

  std::cout << "\n------ Shared Bandwidth ------" << std::endl;
  const uint32_t RANGE = app.max_shared_mem_size;
  _bandwidth(app, "Shared", RANGE);
}
} // namespace gpuinfo