/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/vulkan/test/utils/test_utils.h>

#include <executorch/runtime/core/exec_aten/exec_aten.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>

#include <algorithm>
#include <cassert>
#include <numeric>
#include <random>

using namespace vkcompute;

//
// Operator Recording Functions
//

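// Records a compute job that copies NCHW-ordered data from src_buffer into
// the buffer-backed tensor v_dst, dispatching one invocation per element.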
void record_nchw_to_buffer_op(
    api::Context* const context,
    vkapi::VulkanBuffer& src_buffer,
    api::vTensor& v_dst) {
  vkapi::PipelineBarrier pipeline_barrier{};

  context->submit_compute_job(
      get_nchw_to_tensor_shader(v_dst),
      pipeline_barrier,
      {uint32_t(v_dst.numel()), 1, 1},
      {64, 1, 1},
      {},
      VK_NULL_HANDLE,
      0,
      v_dst.buffer(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      src_buffer,
      v_dst.sizes_ubo(),
      v_dst.strides_ubo(),
      v_dst.numel_ubo());
}

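// Records the reverse copy: reads the buffer-backed tensor v_src and writes
// its contents to dst_buffer in NCHW order.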
void record_buffer_to_nchw_op(
    api::Context* const context,
    api::vTensor& v_src,
    vkapi::VulkanBuffer& dst_buffer) {
  vkapi::PipelineBarrier pipeline_barrier{};
  context->submit_compute_job(
      get_tensor_to_nchw_shader(v_src),
      pipeline_barrier,
      {uint32_t(v_src.numel()), 1, 1},
      {64, 1, 1},
      {},
      VK_NULL_HANDLE,
      0,
      dst_buffer,
      v_src.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      v_src.sizes_ubo(),
      v_src.strides_ubo(),
      v_src.numel_ubo());
}

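// Records a compute job that packs NCHW-ordered data from src_buffer into the
// texture-backed tensor v_dst, passing its hashed layout as a specialization
// constant.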
void record_nchw_to_image_op(
    api::Context* const context,
    vkapi::VulkanBuffer& src_buffer,
    api::vTensor& v_dst) {
  vkapi::PipelineBarrier pipeline_barrier{};
  vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()};

  context->submit_compute_job(
      get_nchw_to_tensor_shader(
          v_dst, context->adapter_ptr()->has_full_int8_buffers_support()),
      pipeline_barrier,
      v_dst.logical_limits(),
      adaptive_work_group_size(v_dst.logical_limits()),
      specialization_constants,
      VK_NULL_HANDLE,
      0,
      v_dst.image(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      src_buffer,
      v_dst.sizes_ubo());
}

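// Records a compute job that unpacks the texture-backed tensor v_src back
// into NCHW order in dst_buffer.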
void record_image_to_nchw_op(
    api::Context* const context,
    api::vTensor& v_src,
    vkapi::VulkanBuffer& dst_buffer) {
  vkapi::PipelineBarrier pipeline_barrier{};
  vkapi::SpecVarList specialization_constants = {v_src.hashed_layout()};

  context->submit_compute_job(
      get_tensor_to_nchw_shader(v_src),
      pipeline_barrier,
      v_src.logical_limits(),
      adaptive_work_group_size(v_src.logical_limits()),
      specialization_constants,
      VK_NULL_HANDLE,
      0,
      dst_buffer,
      v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      v_src.sizes_ubo());
}

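// Unpacks an 8-bit quantized texture-backed tensor into dst_buffer without
// relying on device support for 8-bit storage buffers; each invocation
// handles four packed values, hence the numel() / 4 workgroup count.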
void record_bitw8_image_to_nchw_nobitw8buffer_op(
    api::Context* const context,
    api::vTensor& v_src,
    api::StagingBuffer& dst_buffer) {
  vkapi::PipelineBarrier pipeline_barrier{};
  uint32_t buffer_len = utils::safe_downcast<uint32_t>(dst_buffer.numel() / 4);
  utils::uvec3 global_wg_size = {buffer_len, 1, 1};

  std::string kernel_name = "bitw8_image_to_nchw_nobitw8buffer";
  add_storage_type_suffix(kernel_name, v_src);
  add_dtype_suffix(kernel_name, v_src);

  context->submit_compute_job(
      VK_KERNEL_FROM_STR(kernel_name),
      pipeline_barrier,
      global_wg_size,
      adaptive_work_group_size(global_wg_size),
      {v_src.hashed_layout()},
      VK_NULL_HANDLE,
      0,
      dst_buffer.buffer(),
      v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      v_src.sizes_ubo(),
      v_src.numel_ubo());
}

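// Prepacks convolution (or transposed convolution) weights from src_buffer
// into the packed layout used by the conv2d compute shaders, passing the
// original weight sizes via a params UBO.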
void record_conv2d_prepack_weights_op(
    api::Context* const context,
    vkapi::VulkanBuffer& src_buffer,
    api::vTensor& v_dst,
    const std::vector<int64_t>& original_sizes,
    const bool transposed) {
  vkapi::PipelineBarrier pipeline_barrier{};

  std::string kernel_name;
  if (transposed) {
    kernel_name = "conv_transpose2d";
  } else {
    kernel_name = "conv2d";
  }
  kernel_name += "_prepack_weights";
  add_dtype_suffix(kernel_name, v_dst);
  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);

  api::ParamsBuffer original_sizes_ubo(
      context, utils::make_ivec4(original_sizes, /*reverse = */ true));

  vkapi::SpecVarList specialization_constants = {};
  context->submit_compute_job(
      shader,
      pipeline_barrier,
      v_dst.logical_limits(),
      adaptive_work_group_size(v_dst.logical_limits()),
      specialization_constants,
      VK_NULL_HANDLE,
      0,
      v_dst.image(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      src_buffer,
      v_dst.sizes_ubo(),
      original_sizes_ubo.buffer());
}

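// Records an elementwise binary op (e.g. "add") between v_in1 and v_in2 into
// v_dst using the non-broadcasting test shader.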
void record_binary_op(
    api::Context* const context,
    const std::string& op_name,
    api::vTensor& v_in1,
    api::vTensor& v_in2,
    api::vTensor& v_dst) {
  std::string kernel_name = "binary_" + op_name + "_nobroadcast__test";
  add_dtype_suffix(kernel_name, v_dst);

  vkapi::PipelineBarrier pipeline_barrier{};
  vkapi::SpecVarList specialization_constants = {};
  context->submit_compute_job(
      VK_KERNEL_FROM_STR(kernel_name),
      pipeline_barrier,
      v_dst.logical_limits(),
      adaptive_work_group_size(v_dst.logical_limits()),
      specialization_constants,
      VK_NULL_HANDLE,
      0,
      v_dst.image(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      v_in1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      v_in2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      v_dst.sizes_ubo());
}

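// Fills a and b with constant values, records a + b = c, then reads c back
// and checks every element against a_val + b_val.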
void execute_and_check_add(
    api::vTensor& a,
    api::vTensor& b,
    api::vTensor& c,
    float a_val,
    float b_val) {
  // Fill input tensors
  fill_vtensor(a, a_val);
  fill_vtensor(b, b_val);

  // a + b = c
  record_binary_op(api::context(), "add", a, b, c);

  // Extract output tensor
  std::vector<float> data_out = extract_vtensor(c);

  // Check output
  for (size_t i = 0; i < data_out.size(); ++i) {
    CHECK_VALUE(data_out, i, (a_val + b_val));
  }
}

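// Records the idx_fill_buffer kernel, which is expected to write each
// element's linear index into the buffer-backed tensor v_ten.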
void record_index_fill_buffer(api::Context* context, api::vTensor& v_ten) {
  std::string kernel_name("idx_fill_buffer");
  switch (v_ten.dtype()) {
    case vkapi::kFloat:
      kernel_name += "_float";
      break;
    case vkapi::kHalf:
      kernel_name += "_half";
      break;
    case vkapi::kQInt8:
      kernel_name += "_int8";
      break;
    case vkapi::kQUInt8:
      kernel_name += "_uint8";
      break;
    default:
      throw std::runtime_error("Unsupported dtype");
      break;
  }

  api::ParamsBuffer params(api::context(), int32_t(v_ten.numel()));

  {
    vkapi::PipelineBarrier pipeline_barrier{};
    vkapi::SpecVarList specialization_constants = {};
    api::context()->submit_compute_job(
        VK_KERNEL_FROM_STR(kernel_name),
        pipeline_barrier,
        {uint32_t(v_ten.numel()), 1, 1},
        {64, 1, 1},
        specialization_constants,
        VK_NULL_HANDLE,
        0,
        v_ten.buffer(
            pipeline_barrier,
            vkapi::PipelineStage::COMPUTE,
            vkapi::MemoryAccessType::READ),
        params.buffer());
  }
}

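// Records the scalar_add_buffer kernel, adding `offset` (passed as a
// specialization constant) to every element of the buffer-backed tensor
// v_ten in place.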
void record_scalar_add_buffer(
    api::Context* context,
    api::vTensor& v_ten,
    float offset) {
  vkapi::PipelineBarrier pipeline_barrier{};
  vkapi::SpecVarList specialization_constants = {SV(offset)};
  std::string kernel = "scalar_add_buffer";
  add_dtype_suffix(kernel, v_ten);
  api::context()->submit_compute_job(
      VK_KERNEL_FROM_STR(kernel),
      pipeline_barrier,
      {uint32_t(v_ten.numel()), 1, 1},
      {64, 1, 1},
      specialization_constants,
      VK_NULL_HANDLE,
      0,
      v_ten.buffer(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::READ | vkapi::MemoryAccessType::WRITE),
      v_ten.numel_ubo());
}

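// Records a naive reference matmul over buffer-backed tensors; the global
// workgroup dispatches one invocation per output element.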
void record_reference_matmul(
    api::Context* context,
    api::vTensor& out,
    api::vTensor& mat1,
    api::vTensor& mat2) {
  vkapi::PipelineBarrier pipeline_barrier{};
  api::context()->submit_compute_job(
      VK_KERNEL(reference_matmul),
      pipeline_barrier,
      {uint32_t(out.size(1)), uint32_t(out.size(0)), 1},
      {64, 1, 1},
      {},
      VK_NULL_HANDLE,
      0,
      out.buffer(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      mat1.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      mat2.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      out.sizes_ubo(),
      out.strides_ubo(),
      mat1.sizes_ubo(),
      mat1.strides_ubo(),
      mat2.sizes_ubo(),
      mat2.strides_ubo());
}

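// Records a naive matmul over texture-backed tensors, with the shader variant
// selected from the output's storage type and dtype.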
void record_matmul_texture3d(
    api::Context* context,
    api::vTensor& out,
    api::vTensor& mat1,
    api::vTensor& mat2) {
  std::string kernel_name = "matmul_naive";
  kernel_name.reserve(kShaderNameReserve);
  add_storage_type_suffix(kernel_name, out.storage_type());
  add_dtype_suffix(kernel_name, out.dtype());

  utils::uvec3 global_wg_size = out.logical_limits();

  vkapi::PipelineBarrier pipeline_barrier{};
  api::context()->submit_compute_job(
      VK_KERNEL_FROM_STR(kernel_name),
      pipeline_barrier,
      global_wg_size,
      {8, 8, 1},
      {out.hashed_layout(), mat1.hashed_layout(), mat2.hashed_layout()},
      VK_NULL_HANDLE,
      0,
      out.image(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      mat1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      mat2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      out.sizes_ubo(),
      out.logical_limits_ubo(),
      mat1.sizes_ubo(),
      mat2.sizes_ubo());
}

//
// Input & Output Utilities
//

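// X-macro listing every dtype supported by the staging copy helpers below,
// mapping each vkapi::ScalarType to the C++ type used to hold its data.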
#define FORALL_SUPPORTED_TYPES(_) \
  _(uint8_t, Byte)                \
  _(int8_t, Char)                 \
  _(int32_t, Int)                 \
  _(executorch::aten::Half, Half) \
  _(float, Float)                 \
  _(int8_t, QInt8)

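// Converts `data` to the tensor's dtype, copies it into a staging buffer, and
// records the appropriate nchw-to-{buffer,image} op to upload it into vten.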
void fill_vtensor(api::vTensor& vten, std::vector<float>& data) {
  api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size());

#define CASE(ctype, name)                                     \
  case vkapi::ScalarType::name: {                             \
    std::vector<ctype> data_converted;                        \
    data_converted.resize(data.size());                       \
    for (int i = 0; i < data.size(); ++i) {                   \
      data_converted[i] = ctype(data[i]);                     \
    }                                                         \
    staging_buffer.copy_from(                                 \
        data_converted.data(), vten.staging_buffer_nbytes()); \
  } break;

  switch (vten.dtype()) {
    FORALL_SUPPORTED_TYPES(CASE)
    default:
      VK_THROW("Unsupported dtype");
  }

#undef CASE

  if (vten.storage_type() == utils::StorageType::BUFFER) {
    record_nchw_to_buffer_op(api::context(), staging_buffer.buffer(), vten);
  } else {
    record_nchw_to_image_op(api::context(), staging_buffer.buffer(), vten);
  }
}

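// Fills vten either with a constant value or, when `iota` is set, with an
// increasing sequence starting at `val`.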
void fill_vtensor(api::vTensor& vten, float val, bool iota) {
  std::vector<float> vten_data(vten.staging_buffer_numel());
  if (iota) {
    std::iota(vten_data.begin(), vten_data.end(), val);
  } else {
    std::fill(vten_data.begin(), vten_data.end(), val);
  }

  fill_vtensor(vten, vten_data);
}

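// Returns `numel` floats drawn uniformly from [min, max) using a
// default-seeded engine, so results are deterministic across runs.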
std::vector<float> create_random_float_buffer(
    const size_t numel,
    const float min,
    const float max) {
  std::vector<float> data(numel);
  std::default_random_engine rng;
  std::uniform_real_distribution<float> dist(min, max);

  for (size_t i = 0; i < data.size(); ++i) {
    data[i] = dist(rng);
  }
  return data;
}

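// Same as above but for uint8_t values; note that it samples from a float
// distribution and truncates the result to uint8_t.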
std::vector<uint8_t> create_random_uint8_buffer(
    const size_t numel,
    const uint8_t min,
    const uint8_t max) {
  std::vector<uint8_t> data(numel);
  std::default_random_engine rng;
  std::uniform_real_distribution<float> dist(min, max);

  for (size_t i = 0; i < data.size(); ++i) {
    data[i] = (uint8_t)dist(rng);
  }
  return data;
}

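// Fills the staging buffer of a graph input with constant or iota data,
// resizing the data to the tensor's staging buffer size when it is
// texture-backed.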
void fill_vtensor(
    ComputeGraph& graph,
    const IOValueRef idx,
    float val,
    bool iota) {
  vTensorPtr t = graph.get_tensor(idx.value);
  std::vector<float> data(t->numel());
  if (t->storage_type() != utils::kBuffer) {
    data.resize(t->staging_buffer_numel());
  }
  if (iota) {
    std::iota(data.begin(), data.end(), val);
  } else {
    std::fill(data.begin(), data.end(), val);
  }

  graph.copy_into_staging(idx.staging, data.data(), data.size());
}

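// Reads vten back to the host: records the tensor-to-nchw copy into a staging
// buffer, waits on a fence, then converts the staged data to float.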
void extract_vtensor(api::vTensor& vten, std::vector<float>& data) {
  api::StagingBuffer staging_buffer(
      api::context(), vten.dtype(), vten.staging_buffer_numel());

  if (vten.storage_type() == utils::StorageType::BUFFER) {
    record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer());
  } else {
    record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer());
  }

  vkapi::VulkanFence fence = api::context()->fences().get_fence();
  api::context()->submit_cmd_to_gpu(fence.get_submit_handle());
  fence.wait();

#define CASE(ctype, name)                                     \
  case vkapi::ScalarType::name: {                             \
    std::vector<ctype> data_converted(data.size());           \
    staging_buffer.copy_to(                                   \
        data_converted.data(), vten.staging_buffer_nbytes()); \
    for (int i = 0; i < data.size(); ++i) {                   \
      data[i] = float(data_converted[i]);                     \
    }                                                         \
  } break;

  switch (vten.dtype()) {
    FORALL_SUPPORTED_TYPES(CASE)
    default:
      VK_THROW("Unsupported dtype");
  }

#undef CASE
}

//
// Context Management
//

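// Submits the current command buffer to the GPU and blocks until the fence
// signals.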
void submit_to_gpu() {
  vkapi::VulkanFence fence = api::context()->fences().get_fence();
  api::context()->submit_cmd_to_gpu(fence.get_submit_handle());
  fence.wait();
}

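// Creates a GPU-only VMA allocation sized to the tensor's memory
// requirements; binding it to the tensor is left to the caller.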
vkapi::Allocation allocate_memory_for(const api::vTensor& vten) {
  VmaAllocationCreateInfo alloc_create_info =
      api::context()->adapter_ptr()->vma().gpuonly_resource_create_info();
  return api::context()->adapter_ptr()->vma().create_allocation(
      vten.get_memory_requirements(), alloc_create_info);
}

VmaTotalStatistics get_vma_stats() {
  return api::context()->adapter_ptr()->vma().get_memory_statistics();
}

size_t get_vma_allocation_count() {
  return get_vma_stats().total.statistics.allocationCount;
}

//
// Graph Test Utilities
//

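// Fills every graph input with a constant, executes the graph, then checks
// that each output tensor is uniformly equal to its expected value.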
void execute_graph_and_check_output(
    ComputeGraph& graph,
    std::vector<float> input_vals,
    std::vector<float> expected_outputs) {
  assert(input_vals.size() == graph.inputs().size());
  assert(expected_outputs.size() == graph.outputs().size());

  for (size_t i = 0; i < graph.inputs().size(); ++i) {
    fill_vtensor(graph, graph.inputs().at(i), input_vals.at(i));
  }

  graph.execute();

  for (size_t i = 0; i < graph.outputs().size(); ++i) {
    IOValueRef out_ioval = graph.outputs().at(i);
    vTensorPtr t_out = graph.get_tensor(out_ioval.value);

    std::vector<float> output_data(t_out->staging_buffer_numel());
    graph.copy_from_staging(
        out_ioval.staging, output_data.data(), output_data.size());

    for (size_t j = 0; j < t_out->numel(); ++j) {
      CHECK_VALUE(output_data, j, expected_outputs.at(i));
    }
  }
}

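// Returns true when |a - b| <= atol + rtol * max(|a|, |b|).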
bool check_close(float a, float b, float atol, float rtol) {
  float max = std::max(std::abs(a), std::abs(b));
  float diff = std::abs(a - b);
  return diff <= (atol + rtol * max);
}