/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/vulkan/test/utils/test_utils.h>

#include <executorch/runtime/core/exec_aten/exec_aten.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <numeric>
#include <random>

using namespace vkcompute;

//
// Operator Recording Functions
//

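// Records a compute dispatch that copies NCHW-ordered data from a staging
// buffer into a buffer-backed vTensor. One invocation is launched per element,
// with a fixed local workgroup size of {64, 1, 1}.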
void record_nchw_to_buffer_op(
    api::Context* const context,
    vkapi::VulkanBuffer& src_buffer,
    api::vTensor& v_dst) {
  vkapi::PipelineBarrier pipeline_barrier{};

  context->submit_compute_job(
      get_nchw_to_tensor_shader(v_dst),
      pipeline_barrier,
      {uint32_t(v_dst.numel()), 1, 1},
      {64, 1, 1},
      {},
      VK_NULL_HANDLE,
      0,
      v_dst.buffer(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      src_buffer,
      v_dst.sizes_ubo(),
      v_dst.strides_ubo(),
      v_dst.numel_ubo());
}

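// Inverse of record_nchw_to_buffer_op: copies a buffer-backed vTensor back
// into an NCHW-ordered staging buffer so the host can read the data.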
void record_buffer_to_nchw_op(
    api::Context* const context,
    api::vTensor& v_src,
    vkapi::VulkanBuffer& dst_buffer) {
  vkapi::PipelineBarrier pipeline_barrier{};
  context->submit_compute_job(
      get_tensor_to_nchw_shader(v_src),
      pipeline_barrier,
      {uint32_t(v_src.numel()), 1, 1},
      {64, 1, 1},
      {},
      VK_NULL_HANDLE,
      0,
      dst_buffer,
      v_src.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      v_src.sizes_ubo(),
      v_src.strides_ubo(),
      v_src.numel_ubo());
}

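// Staging-to-tensor copy for texture-backed tensors. The destination's hashed
// layout is passed as a specialization constant, and the dispatch is sized by
// the image's logical limits rather than by element count;
// record_image_to_nchw_op performs the reverse (texture to NCHW staging)
// transfer.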
void record_nchw_to_image_op(
    api::Context* const context,
    vkapi::VulkanBuffer& src_buffer,
    api::vTensor& v_dst) {
  vkapi::PipelineBarrier pipeline_barrier{};
  vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()};

  context->submit_compute_job(
      get_nchw_to_tensor_shader(
          v_dst, context->adapter_ptr()->has_full_int8_buffers_support()),
      pipeline_barrier,
      v_dst.logical_limits(),
      adaptive_work_group_size(v_dst.logical_limits()),
      specialization_constants,
      VK_NULL_HANDLE,
      0,
      v_dst.image(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      src_buffer,
      v_dst.sizes_ubo());
}

void record_image_to_nchw_op(
    api::Context* const context,
    api::vTensor& v_src,
    vkapi::VulkanBuffer& dst_buffer) {
  vkapi::PipelineBarrier pipeline_barrier{};
  vkapi::SpecVarList specialization_constants = {v_src.hashed_layout()};

  context->submit_compute_job(
      get_tensor_to_nchw_shader(v_src),
      pipeline_barrier,
      v_src.logical_limits(),
      adaptive_work_group_size(v_src.logical_limits()),
      specialization_constants,
      VK_NULL_HANDLE,
      0,
      dst_buffer,
      v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      v_src.sizes_ubo());
}

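// Reads an 8-bit quantized texture back to NCHW on devices without full int8
// buffer support. Four 8-bit values are packed into each 32-bit word of the
// staging buffer, hence the dispatch of numel() / 4 invocations.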
void record_bitw8_image_to_nchw_nobitw8buffer_op(
    api::Context* const context,
    api::vTensor& v_src,
    api::StagingBuffer& dst_buffer) {
  vkapi::PipelineBarrier pipeline_barrier{};
  uint32_t buffer_len = utils::safe_downcast<uint32_t>(dst_buffer.numel() / 4);
  utils::uvec3 global_wg_size = {buffer_len, 1, 1};

  std::string kernel_name = "bitw8_image_to_nchw_nobitw8buffer";
  add_storage_type_suffix(kernel_name, v_src);
  add_dtype_suffix(kernel_name, v_src);

  context->submit_compute_job(
      VK_KERNEL_FROM_STR(kernel_name),
      pipeline_barrier,
      global_wg_size,
      adaptive_work_group_size(global_wg_size),
      {v_src.hashed_layout()},
      VK_NULL_HANDLE,
      0,
      dst_buffer.buffer(),
      v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      v_src.sizes_ubo(),
      v_src.numel_ubo());
}

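// Packs conv2d / conv_transpose2d weights into the image layout expected by
// the convolution shaders. The original (unpacked) weight sizes are passed to
// the shader as a reversed ivec4 via a ParamsBuffer.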
void record_conv2d_prepack_weights_op(
    api::Context* const context,
    vkapi::VulkanBuffer& src_buffer,
    api::vTensor& v_dst,
    const std::vector<int64_t>& original_sizes,
    const bool transposed) {
  vkapi::PipelineBarrier pipeline_barrier{};

  std::string kernel_name;
  if (transposed) {
    kernel_name = "conv_transpose2d";
  } else {
    kernel_name = "conv2d";
  }
  kernel_name += "_prepack_weights";
  add_dtype_suffix(kernel_name, v_dst);
  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);

  api::ParamsBuffer original_sizes_ubo(
      context, utils::make_ivec4(original_sizes, /*reverse = */ true));

  vkapi::SpecVarList specialization_constants = {};
  context->submit_compute_job(
      shader,
      pipeline_barrier,
      v_dst.logical_limits(),
      adaptive_work_group_size(v_dst.logical_limits()),
      specialization_constants,
      VK_NULL_HANDLE,
      0,
      v_dst.image(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      src_buffer,
      v_dst.sizes_ubo(),
      original_sizes_ubo.buffer());
}

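// Records the test-only elementwise binary shader
// ("binary_<op_name>_nobroadcast__test"). No broadcasting is performed, so the
// inputs are expected to match the output's sizes, and all three tensors must
// be texture backed.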
void record_binary_op(
    api::Context* const context,
    const std::string& op_name,
    api::vTensor& v_in1,
    api::vTensor& v_in2,
    api::vTensor& v_dst) {
  std::string kernel_name = "binary_" + op_name + "_nobroadcast__test";
  add_dtype_suffix(kernel_name, v_dst);

  vkapi::PipelineBarrier pipeline_barrier{};
  vkapi::SpecVarList specialization_constants = {};
  context->submit_compute_job(
      VK_KERNEL_FROM_STR(kernel_name),
      pipeline_barrier,
      v_dst.logical_limits(),
      adaptive_work_group_size(v_dst.logical_limits()),
      specialization_constants,
      VK_NULL_HANDLE,
      0,
      v_dst.image(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      v_in1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      v_in2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      v_dst.sizes_ubo());
}

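// End-to-end convenience check: fills `a` and `b` with constant values, records
// the add shader, reads back `c`, and verifies that every element equals
// a_val + b_val. Intended usage (sketch): construct three identically sized
// float tensors a, b, c, then call execute_and_check_add(a, b, c, 2.0f, 3.0f).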
void execute_and_check_add(
    api::vTensor& a,
    api::vTensor& b,
    api::vTensor& c,
    float a_val,
    float b_val) {
  // Fill input tensors
  fill_vtensor(a, a_val);
  fill_vtensor(b, b_val);

  // a + b = c
  record_binary_op(api::context(), "add", a, b, c);

  // Extract output tensor
  std::vector<float> data_out = extract_vtensor(c);

  // Check output
  for (size_t i = 0; i < data_out.size(); ++i) {
    CHECK_VALUE(data_out, i, (a_val + b_val));
  }
}

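// Fills a buffer-backed tensor so that element i holds the value i, using one
// invocation per element. Tests use the resulting position-dependent pattern
// to verify buffer indexing.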
void record_index_fill_buffer(api::Context* context, api::vTensor& v_ten) {
  std::string kernel_name("idx_fill_buffer");
  switch (v_ten.dtype()) {
    case vkapi::kFloat:
      kernel_name += "_float";
      break;
    case vkapi::kHalf:
      kernel_name += "_half";
      break;
    case vkapi::kQInt8:
      kernel_name += "_int8";
      break;
    case vkapi::kQUInt8:
      kernel_name += "_uint8";
      break;
    default:
      throw std::runtime_error("Unsupported dtype");
      break;
  }

  api::ParamsBuffer params(api::context(), int32_t(v_ten.numel()));

  {
    vkapi::PipelineBarrier pipeline_barrier{};
    vkapi::SpecVarList specialization_constants = {};
    api::context()->submit_compute_job(
        VK_KERNEL_FROM_STR(kernel_name),
        pipeline_barrier,
        {uint32_t(v_ten.numel()), 1, 1},
        {64, 1, 1},
        specialization_constants,
        VK_NULL_HANDLE,
        0,
        // The idx_fill shader writes index values into the tensor, so request
        // write access when binding its buffer.
        v_ten.buffer(
            pipeline_barrier,
            vkapi::PipelineStage::COMPUTE,
            vkapi::MemoryAccessType::WRITE),
        params.buffer());
  }
}

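// Adds a scalar offset in place to every element of a buffer-backed tensor.
// The offset is supplied as a specialization constant (SV(offset)), so each
// distinct offset specializes its own compute pipeline.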
void record_scalar_add_buffer(
    api::Context* context,
    api::vTensor& v_ten,
    float offset) {
  vkapi::PipelineBarrier pipeline_barrier{};
  vkapi::SpecVarList specialization_constants = {SV(offset)};
  std::string kernel = "scalar_add_buffer";
  add_dtype_suffix(kernel, v_ten);
  api::context()->submit_compute_job(
      VK_KERNEL_FROM_STR(kernel),
      pipeline_barrier,
      {uint32_t(v_ten.numel()), 1, 1},
      {64, 1, 1},
      specialization_constants,
      VK_NULL_HANDLE,
      0,
      v_ten.buffer(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::READ | vkapi::MemoryAccessType::WRITE),
      v_ten.numel_ubo());
}

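// Naive buffer-based matmul used as a reference implementation. The global
// workgroup size is {N, M, 1}, i.e. one invocation per element of the M x N
// output.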
void record_reference_matmul(
    api::Context* context,
    api::vTensor& out,
    api::vTensor& mat1,
    api::vTensor& mat2) {
  vkapi::PipelineBarrier pipeline_barrier{};
  api::context()->submit_compute_job(
      VK_KERNEL(reference_matmul),
      pipeline_barrier,
      {uint32_t(out.size(1)), uint32_t(out.size(0)), 1},
      {64, 1, 1},
      {},
      VK_NULL_HANDLE,
      0,
      out.buffer(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      mat1.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      mat2.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      out.sizes_ubo(),
      out.strides_ubo(),
      mat1.sizes_ubo(),
      mat1.strides_ubo(),
      mat2.sizes_ubo(),
      mat2.strides_ubo());
}

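// Texture-backed naive matmul. The shader variant is selected by appending
// storage type and dtype suffixes to "matmul_naive", and each tensor's hashed
// layout is passed as a specialization constant.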
void record_matmul_texture3d(
    api::Context* context,
    api::vTensor& out,
    api::vTensor& mat1,
    api::vTensor& mat2) {
  std::string kernel_name = "matmul_naive";
  kernel_name.reserve(kShaderNameReserve);
  add_storage_type_suffix(kernel_name, out.storage_type());
  add_dtype_suffix(kernel_name, out.dtype());

  utils::uvec3 global_wg_size = out.logical_limits();

  vkapi::PipelineBarrier pipeline_barrier{};
  api::context()->submit_compute_job(
      VK_KERNEL_FROM_STR(kernel_name),
      pipeline_barrier,
      global_wg_size,
      {8, 8, 1},
      {out.hashed_layout(), mat1.hashed_layout(), mat2.hashed_layout()},
      VK_NULL_HANDLE,
      0,
      out.image(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      mat1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      mat2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      out.sizes_ubo(),
      out.logical_limits_ubo(),
      mat1.sizes_ubo(),
      mat2.sizes_ubo());
}

//
// Input & Output Utilities
//

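// X-macro listing the dtypes that the staging fill/extract helpers below can
// convert to and from float. Each call site defines CASE(ctype, name) and
// expands the list inside a switch on vkapi::ScalarType, yielding one case
// block per (ctype, name) pair.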
#define FORALL_SUPPORTED_TYPES(_) \
  _(uint8_t, Byte)                \
  _(int8_t, Char)                 \
  _(int32_t, Int)                 \
  _(executorch::aten::Half, Half) \
  _(float, Float)                 \
  _(int8_t, QInt8)

void fill_vtensor(api::vTensor& vten, std::vector<float>& data) {
  api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size());

#define CASE(ctype, name)                                     \
  case vkapi::ScalarType::name: {                             \
    std::vector<ctype> data_converted;                        \
    data_converted.resize(data.size());                       \
    for (int i = 0; i < data.size(); ++i) {                   \
      data_converted[i] = ctype(data[i]);                     \
    }                                                         \
    staging_buffer.copy_from(                                 \
        data_converted.data(), vten.staging_buffer_nbytes()); \
  } break;

  switch (vten.dtype()) {
    FORALL_SUPPORTED_TYPES(CASE)
    default:
      VK_THROW("Unsupported dtype");
  }

#undef CASE

  if (vten.storage_type() == utils::StorageType::BUFFER) {
    record_nchw_to_buffer_op(api::context(), staging_buffer.buffer(), vten);
  } else {
    record_nchw_to_image_op(api::context(), staging_buffer.buffer(), vten);
  }
}

void fill_vtensor(api::vTensor& vten, float val, bool iota) {
  std::vector<float> vten_data(vten.staging_buffer_numel());
  if (iota) {
    std::iota(vten_data.begin(), vten_data.end(), val);
  } else {
    std::fill(vten_data.begin(), vten_data.end(), val);
  }

  fill_vtensor(vten, vten_data);
}

std::vector<float> create_random_float_buffer(
    const size_t numel,
    const float min,
    const float max) {
  std::vector<float> data(numel);
  std::default_random_engine rng;
  std::uniform_real_distribution<float> dist(min, max);

  for (size_t i = 0; i < data.size(); ++i) {
    data[i] = dist(rng);
  }
  return data;
}

std::vector<uint8_t> create_random_uint8_buffer(
    const size_t numel,
    const uint8_t min,
    const uint8_t max) {
  std::vector<uint8_t> data(numel);
  std::default_random_engine rng;
  std::uniform_real_distribution<float> dist(min, max);

  for (size_t i = 0; i < data.size(); ++i) {
    data[i] = (uint8_t)dist(rng);
  }
  return data;
}

void fill_vtensor(
    ComputeGraph& graph,
    const IOValueRef idx,
    float val,
    bool iota) {
  vTensorPtr t = graph.get_tensor(idx.value);
  std::vector<float> data(t->numel());
  if (t->storage_type() != utils::kBuffer) {
    data.resize(t->staging_buffer_numel());
  }
  if (iota) {
    std::iota(data.begin(), data.end(), val);
  } else {
    std::fill(data.begin(), data.end(), val);
  }

  graph.copy_into_staging(idx.staging, data.data(), data.size());
}

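// Copies a vTensor back to the host: records the tensor-to-NCHW transfer into
// a staging buffer, submits the command buffer and waits on a fence, then
// converts the staged elements to float one by one.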
void extract_vtensor(api::vTensor& vten, std::vector<float>& data) {
  api::StagingBuffer staging_buffer(
      api::context(), vten.dtype(), vten.staging_buffer_numel());

  if (vten.storage_type() == utils::StorageType::BUFFER) {
    record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer());
  } else {
    record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer());
  }

  vkapi::VulkanFence fence = api::context()->fences().get_fence();
  api::context()->submit_cmd_to_gpu(fence.get_submit_handle());
  fence.wait();

#define CASE(ctype, name)                                     \
  case vkapi::ScalarType::name: {                             \
    std::vector<ctype> data_converted(data.size());           \
    staging_buffer.copy_to(                                   \
        data_converted.data(), vten.staging_buffer_nbytes()); \
    for (int i = 0; i < data.size(); ++i) {                   \
      data[i] = float(data_converted[i]);                     \
    }                                                         \
  } break;

  switch (vten.dtype()) {
    FORALL_SUPPORTED_TYPES(CASE)
    default:
      VK_THROW("Unsupported dtype");
  }

#undef CASE
}

//
// Context Management
//

void submit_to_gpu() {
  vkapi::VulkanFence fence = api::context()->fences().get_fence();
  api::context()->submit_cmd_to_gpu(fence.get_submit_handle());
  fence.wait();
}

vkapi::Allocation allocate_memory_for(const api::vTensor& vten) {
  VmaAllocationCreateInfo alloc_create_info =
      api::context()->adapter_ptr()->vma().gpuonly_resource_create_info();
  return api::context()->adapter_ptr()->vma().create_allocation(
      vten.get_memory_requirements(), alloc_create_info);
}

VmaTotalStatistics get_vma_stats() {
  return api::context()->adapter_ptr()->vma().get_memory_statistics();
}

size_t get_vma_allocation_count() {
  return get_vma_stats().total.statistics.allocationCount;
}

//
// Graph Test Utilities
//

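// Runs a prepared ComputeGraph with one constant fill value per graph input
// and checks that every element of each graph output matches the corresponding
// expected value. A minimal usage sketch (graph construction and preparation
// elided; values are illustrative):
//
//   execute_graph_and_check_output(graph, {2.0f, 3.0f}, {5.0f});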
void execute_graph_and_check_output(
    ComputeGraph& graph,
    std::vector<float> input_vals,
    std::vector<float> expected_outputs) {
  assert(input_vals.size() == graph.inputs().size());
  assert(expected_outputs.size() == graph.outputs().size());

  for (size_t i = 0; i < graph.inputs().size(); ++i) {
    fill_vtensor(graph, graph.inputs().at(i), input_vals.at(i));
  }

  graph.execute();

  for (size_t i = 0; i < graph.outputs().size(); ++i) {
    IOValueRef out_ioval = graph.outputs().at(i);
    vTensorPtr t_out = graph.get_tensor(out_ioval.value);

    std::vector<float> output_data(t_out->staging_buffer_numel());
    graph.copy_from_staging(
        out_ioval.staging, output_data.data(), output_data.size());

    for (size_t j = 0; j < t_out->numel(); ++j) {
      CHECK_VALUE(output_data, j, expected_outputs.at(i));
    }
  }
}

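// Mixed absolute/relative tolerance comparison:
//   |a - b| <= atol + rtol * max(|a|, |b|)
// Unlike comparisons that scale rtol by only one operand, this check is
// symmetric in a and b.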
bool check_close(float a, float b, float atol, float rtol) {
  float max = std::max(std::abs(a), std::abs(b));
  float diff = std::abs(a - b);
  return diff <= (atol + rtol * max);
}