/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <bitset>
#include <utility>
#include <vector>

#include <executorch/runtime/core/exec_aten/exec_aten.h>

#include <executorch/backends/vulkan/runtime/api/api.h>

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h>

#include <executorch/backends/vulkan/test/utils/test_utils.h>

#include <executorch/backends/vulkan/runtime/graph/ops/DispatchNode.h>

using namespace vkcompute;
using namespace vkcompute::api;

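// CPU reference helper: transposes a row-major H x W matrix into a W x H
// matrix. Used to validate GPU results in the tests below.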
std::vector<float>
transpose_matrix(std::vector<float>& mat, const int H, const int W) {
  std::vector<float> out(W * H);
  for (int out_y = 0; out_y < H; ++out_y) {
    for (int out_x = 0; out_x < W; ++out_x) {
      out[out_x * H + out_y] = mat[out_y * W + out_x];
    }
  }
  return out;
}

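// CPU reference helper: naive O(M * K * N) matmul of an M x K and a K x N
// matrix, used as the expected output for the GPU matmul tests.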
std::vector<float> compute_reference_matmul(
    std::vector<float>& mat1,
    std::vector<float>& mat2,
    const int M,
    const int K,
    const int N) {
  std::vector<float> out(M * N);
  for (int out_y = 0; out_y < M; ++out_y) {
    for (int out_x = 0; out_x < N; ++out_x) {
      out[out_y * N + out_x] = 0;
      for (int k = 0; k < K; ++k) {
        out[out_y * N + out_x] += mat1[out_y * K + k] * mat2[k * N + out_x];
      }
    }
  }
  return out;
}

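// Assorted 2D, 3D and 4D tensor sizes exercised by several tests below.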
std::vector<std::vector<int64_t>> standard_sizes_to_test = {
    // 2D
    {7, 11},
    {13, 6},
    // 3D
    {2, 9, 7},
    {9, 15, 19},
    {7, 11, 24},
    {13, 8, 11},
    {12, 11, 19},
    // 4D
    {2, 2, 3, 5},
    {9, 13, 11, 17},
    {17, 14, 18, 20},
    {7, 13, 12, 21},
    {3, 8, 13, 17},
};

//
// Compute API Tests
//

class VulkanComputeAPITest : public ::testing::Test {
 public:
  void SetUp() override {
    // Make sure we are starting with a clean slate
    EXPECT_TRUE(get_vma_allocation_count() == 0);
  }

  void TearDown() override {
    context()->flush();

    // Make sure we are ending with a clean slate
    EXPECT_TRUE(get_vma_allocation_count() == 0);
  }
};

TEST_F(VulkanComputeAPITest, print_adapter) {
  std::cout << *(context()->adapter_ptr()) << std::endl;
}

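// Computes the expected strides for a tensor of the given sizes under each
// GPU memory layout; used to validate calculate_strides() and vTensor
// metadata. When `unsqueezed` is true the strides are padded out to 4 dims.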
std::vector<int64_t> get_reference_strides(
    const std::vector<int64_t>& sizes,
    const utils::GPUMemoryLayout layout,
    const bool unsqueezed = false) {
  int64_t C = utils::val_at(-3, sizes);
  int64_t H = utils::val_at(-2, sizes);
  int64_t W = utils::val_at(-1, sizes);

  int64_t numel = utils::multiply_integers(sizes);

  switch (layout) {
    case utils::kWidthPacked:
      switch (sizes.size()) {
        case 1:
          if (unsqueezed)
            return {numel, numel, numel, 1};
          return {1};
        case 2:
          if (unsqueezed)
            return {numel, numel, W, 1};
          return {W, 1};
        case 3:
          if (unsqueezed)
            return {numel, H * W, W, 1};
          return {H * W, W, 1};
        case 4:
          return {C * H * W, H * W, W, 1};
        default:
          return {};
      }
      break;
    case utils::kHeightPacked:
      switch (sizes.size()) {
        case 1:
          if (unsqueezed)
            return {numel, numel, numel, 1};
          return {1};
        case 2:
          if (unsqueezed)
            return {numel, numel, 1, H};
          return {1, H};
        case 3:
          if (unsqueezed)
            return {numel, H * W, 1, H};
          return {W * H, 1, H};
        case 4:
          return {C * W * H, W * H, 1, H};
        default:
          return {};
      }
    case utils::kChannelsPacked:
      switch (sizes.size()) {
        case 1:
          if (unsqueezed)
            return {numel, numel, numel, 1};
          return {1};
        case 2:
          if (unsqueezed)
            return {numel, numel, W, 1};
          return {W, 1};
        case 3:
          if (unsqueezed)
            return {numel, 1, W * C, C};
          return {1, W * C, C};
        case 4:
          return {H * W * C, 1, W * C, C};
        default:
          return {};
      }
  }
  return {};
}

TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) {
  vkapi::ShaderInfo empty_shader_info;
  EXPECT_FALSE(empty_shader_info);
  EXPECT_TRUE(empty_shader_info.src_code.bin == nullptr);
  EXPECT_TRUE(empty_shader_info.src_code.size == 0u);
}

TEST_F(VulkanComputeAPITest, calculate_dim_order_test) {
  // ndim, packed dim, expected dim order
  std::vector<std::tuple<size_t, int32_t, std::vector<int64_t>>> test_cases = {
      {1, WHCN::kWidthDim, {0}},
      {1, WHCN::kHeightDim, {0}},
      {1, WHCN::kChannelsDim, {0}},
      {2, WHCN::kWidthDim, {0, 1}},
      {2, WHCN::kHeightDim, {1, 0}},
      {2, WHCN::kChannelsDim, {0, 1}},
      {3, WHCN::kWidthDim, {0, 1, 2}},
      {3, WHCN::kHeightDim, {0, 2, 1}},
      {3, WHCN::kChannelsDim, {1, 2, 0}},
      {4, WHCN::kWidthDim, {0, 1, 2, 3}},
      {4, WHCN::kHeightDim, {0, 1, 3, 2}},
      {4, WHCN::kChannelsDim, {0, 2, 3, 1}},
  };

  for (const auto& test_case : test_cases) {
    const size_t& ndim = std::get<0>(test_case);
    const int32_t packed_dim = std::get<1>(test_case);
    const auto& expected_dim_order = std::get<2>(test_case);
    std::vector<int64_t> dim_order = calculate_dim_order(ndim, packed_dim);

    ASSERT_TRUE(dim_order == expected_dim_order);
  }
}

TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) {
  vTensor v_tensor_to_resize(
      context(),
      {25, 25, 25, 25},
      vkapi::kFloat,
      utils::kBuffer,
      utils::kWidthPacked,
      /*allocate_memory = */ false);

  for (const auto& sizes : standard_sizes_to_test) {
    if (sizes.size() < 3) {
      continue;
    }
    for (const auto& layout :
         {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) {
      {
        const int32_t packed_dim = static_cast<int32_t>(layout);
        std::vector<int64_t> dim_order =
            calculate_dim_order(sizes.size(), packed_dim);
        std::vector<int64_t> strides = calculate_strides(sizes, dim_order);
        std::vector<int64_t> ref_strides = get_reference_strides(sizes, layout);
        ASSERT_TRUE(strides == ref_strides);

        int64_t numel = utils::multiply_integers(sizes);
        std::vector<int64_t> unsqueezed_strides =
            unsqueeze_strides(strides, numel);
        std::vector<int64_t> ref_unsqueezed_strides =
            get_reference_strides(sizes, layout, true);

        ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides);

        // Create new vTensor and check that the strides are correct
        vTensor new_v_tensor(
            context(),
            sizes,
            vkapi::kFloat,
            utils::kBuffer,
            layout,
            /*allocate_memory = */ false);

        ASSERT_TRUE(new_v_tensor.strides() == ref_strides);
        ASSERT_TRUE(
            new_v_tensor.unsqueezed_strides() == ref_unsqueezed_strides);

        // Resize vtensor and check that updated metadata is correct
        v_tensor_to_resize.virtual_reconfigure(sizes, dim_order);
        ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides);
        ASSERT_TRUE(
            v_tensor_to_resize.unsqueezed_strides() == ref_unsqueezed_strides);
      }
    }
  }
}

TEST_F(VulkanComputeAPITest, virtual_transpose_test) {
  std::vector<int64_t> sizes = {7, 9, 11, 13};
  // (dim0, dim1), new_sizes, new_dim_order, new_axis_map, new_packed_dim_idx
  std::vector<std::vector<std::vector<int64_t>>> test_cases = {
      {{2, 3}, {7, 9, 13, 11}, {0, 1, 3, 2}, {1, 0, 2, 2}, {1}},
      {{2, 1}, {7, 11, 9, 13}, {0, 2, 1, 3}, {0, 2, 1, 1}, {0}},
      {{1, 3}, {7, 13, 11, 9}, {0, 3, 2, 1}, {2, 1, 0, 0}, {2}},
  };

  for (const auto& test_case : test_cases) {
    const int dim0 = test_case.at(0).at(0);
    const int dim1 = test_case.at(0).at(1);

    const auto& expected_sizes = test_case.at(1);
    const auto& expected_dim_order = test_case.at(2);
    const auto& expected_axis_map = test_case.at(3);
    const int expected_packed_dim = test_case.at(4).at(0);

    {
      vTensor a_buffer = vTensor(
          context(), sizes, vkapi::kFloat, utils::kBuffer, utils::kWidthPacked);

      a_buffer.virtual_transpose(dim0, dim1);
      EXPECT_TRUE(a_buffer.sizes() == expected_sizes);
      EXPECT_TRUE(a_buffer.dim_order() == expected_dim_order);
    }

    {
      vTensor a_texture = vTensor(
          context(),
          sizes,
          vkapi::kFloat,
          utils::kTexture3D,
          utils::kWidthPacked);
      a_texture.virtual_transpose(dim0, dim1);
      EXPECT_TRUE(a_texture.sizes() == expected_sizes);
      EXPECT_TRUE(a_texture.axis_map() == expected_axis_map);
      EXPECT_TRUE(a_texture.packed_dim() == expected_packed_dim);
    }
  }
}

TEST_F(VulkanComputeAPITest, view_of_view_test) {
  constexpr int N = 3;
  constexpr int C = 5;
  constexpr int H = 17;
  constexpr int W = 19;

  std::vector<int64_t> sizes = {N, C, H, W};

  vTensor t1 = vTensor(
      context(), sizes, vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked);

  vTensor t2 = vTensor(t1);
  EXPECT_TRUE(t2.sizes() == sizes);
  vTensor t3 = vTensor(t2);
  EXPECT_TRUE(t2.sizes() == sizes);

  t2.virtual_transpose(1, 2);
  std::vector<int64_t> expected_t2_sizes = {N, H, C, W};
  EXPECT_TRUE(t2.sizes() == expected_t2_sizes);

  // Because t3 was created before t2's metadata was updated, we need to first
  // update t3's metadata to match t2's metadata. Then the transpose will yield
  // the correct metadata.
  t3.virtual_clone(t2);
  t3.virtual_transpose(2, 3);
  std::vector<int64_t> expected_t3_sizes = {N, H, W, C};
  EXPECT_TRUE(t3.sizes() == expected_t3_sizes);
}

utils::ivec3 make_temp_ivec3(int x, int y, int z) {
  return utils::ivec3{x, y, z};
}

TEST_F(VulkanComputeAPITest, vec_test) {
  {
    utils::vec3 v3({1, 2, 3});
    ASSERT_TRUE(v3[0] == 1);
    ASSERT_TRUE(v3[1] == 2);
    ASSERT_TRUE(v3[2] == 3);
    v3 = {4, 5, 6};
    ASSERT_TRUE(v3[0] == 4);
    ASSERT_TRUE(v3[1] == 5);
    ASSERT_TRUE(v3[2] == 6);
  }

  {
    utils::uvec4 uv4({4, 3, 2, 1});
    ASSERT_TRUE(uv4[0] == 4);
    ASSERT_TRUE(uv4[1] == 3);
    ASSERT_TRUE(uv4[2] == 2);
    ASSERT_TRUE(uv4[3] == 1);
    uv4 = {11, 13, 12, 88};
    ASSERT_TRUE(uv4[0] == 11);
    ASSERT_TRUE(uv4[1] == 13);
    ASSERT_TRUE(uv4[2] == 12);
    ASSERT_TRUE(uv4[3] == 88);
  }

  // Test copy from same type
  {
    utils::ivec3 v{5, 6, 8};
    utils::ivec3 v2 = v;

    ASSERT_TRUE(v2[0] == 5);
    ASSERT_TRUE(v2[1] == 6);
    ASSERT_TRUE(v2[2] == 8);
  }

  // Test copy from different type
  {
    utils::uvec3 v{5, 6, 8};
    utils::ivec3 v2 = v;

    ASSERT_TRUE(v2[0] == 5);
    ASSERT_TRUE(v2[1] == 6);
    ASSERT_TRUE(v2[2] == 8);
  }

  // Test construction from temporary vec
  {
    utils::uvec3 v{make_temp_ivec3(4, 5, 10)};
    ASSERT_TRUE(v[0] == 4);
    ASSERT_TRUE(v[1] == 5);
    ASSERT_TRUE(v[2] == 10);
  }

  // Test initialization from temporary vec
  {
    utils::uvec3 v = make_temp_ivec3(4, 5, 10);
    ASSERT_TRUE(v[0] == 4);
    ASSERT_TRUE(v[1] == 5);
    ASSERT_TRUE(v[2] == 10);
  }
}

TEST_F(VulkanComputeAPITest, retrieve_custom_shader_test) {
  // Try to get shader from custom shader library
  const vkapi::ShaderInfo& kernel = VK_KERNEL(test_shader);

  ASSERT_TRUE(kernel.kernel_name == "test_shader");
}

TEST_F(VulkanComputeAPITest, spec_var_classes_test) {
  // Check equality operator
  ASSERT_TRUE(SV(1.5f) == SV(1.5f));
  ASSERT_FALSE(SV(15.0f) == SV(15));
  ASSERT_FALSE(SV(1u) == SV(true));

  size_t sv_size = sizeof(vkapi::SpecVar);

  vkapi::SpecVarList spec_vars = {};
  ASSERT_TRUE(spec_vars.size() == 0);
  spec_vars = {SV(1.1f), SV(32), SV(45)};
  ASSERT_TRUE(spec_vars.size() == 3);
  vkapi::SpecVarList spec_vars_other = {SV(2.6f), SV(true), SV(78u), SV(5.5f)};
  spec_vars.append(spec_vars_other);
  ASSERT_TRUE(spec_vars.size() == 7);

  // Check validity of the data
  const vkapi::SpecVar* data = spec_vars.data();
  ASSERT_TRUE(*(reinterpret_cast<const float*>(data + 3)) == 2.6f);
  ASSERT_TRUE(*(reinterpret_cast<const int32_t*>(data + 1)) == 32);
  ASSERT_TRUE(*(reinterpret_cast<const int32_t*>(data + 5)) == 78u);

  // Check validity of the map entries
  std::vector<VkSpecializationMapEntry> entries =
      spec_vars.generate_map_entries();

  for (size_t i = 0; i < spec_vars.size(); ++i) {
    ASSERT_TRUE(entries[i].constantID == i);
    ASSERT_TRUE(entries[i].offset == sv_size * i);
    if (i != 4) {
      ASSERT_TRUE(entries[i].size == 4);
    } else {
      ASSERT_TRUE(entries[i].size == 1);
    }
  }

  // Check copy
  vkapi::SpecVarList spec_vars_copy(spec_vars);
  ASSERT_TRUE(spec_vars_copy.size() == 7);

  // Check validity of the copied data
  const vkapi::SpecVar* copy_data = spec_vars_copy.data();
  ASSERT_TRUE(*(reinterpret_cast<const bool*>(copy_data + 4)) == true);
  ASSERT_TRUE(*(reinterpret_cast<const int32_t*>(copy_data + 2)) == 45);
  ASSERT_TRUE(*(reinterpret_cast<const float*>(copy_data + 6)) == 5.5f);
}

TEST_F(VulkanComputeAPITest, spec_var_shader_test) {
  size_t len = 16;
  StagingBuffer buffer(context(), vkapi::kFloat, len);

  float scale = 3.0f;
  float offset = 1.5f;

  {
    ParamsBuffer params(context(), int32_t(len));
    uint32_t len_div4 = utils::div_up(uint32_t(len), uint32_t(4));
    vkapi::PipelineBarrier pipeline_barrier{};
    context()->submit_compute_job(
        VK_KERNEL(fill_buffer),
        pipeline_barrier,
        {64, 1, 1},
        {len_div4, 1, 1},
        {SV(scale), SV(offset)},
        VK_NULL_HANDLE,
        0,
        buffer.buffer(),
        params.buffer());
  }

  submit_to_gpu();

  std::vector<float> data(len);
  buffer.copy_to(data.data(), buffer.nbytes());

  for (size_t i = 0; i < len; ++i) {
    CHECK_VALUE(data, i, scale * i + offset);
  }
}

TEST_F(VulkanComputeAPITest, update_params_between_submit) {
  context()->set_cmd(/*reusable = */ true);
  std::vector<int64_t> sizes = {4, 4, 2};
  vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);

  std::string kernel_name("fill_texture__test");
  add_dtype_suffix(kernel_name, a);

  struct Params final {
    utils::ivec3 size;
    int32_t fill;
    utils::vec4 values;
  };

  Params block{
      {2, 4, 1},
      0,
      {5.0, 5.0, 5.0, 5.0},
  };

  ParamsBuffer params(context(), block);

  {
    vkapi::PipelineBarrier pipeline_barrier{};
    vkapi::SpecVarList specialization_constants = {};
    context()->submit_compute_job(
        VK_KERNEL_FROM_STR(kernel_name),
        pipeline_barrier,
        {4, 4, 4},
        {4, 4, 4},
        specialization_constants,
        VK_NULL_HANDLE,
        0,
        a.image(
            pipeline_barrier,
            vkapi::PipelineStage::COMPUTE,
            vkapi::MemoryAccessType::WRITE),
        params.buffer());
  }

  StagingBuffer staging_buffer(
      context(), vkapi::kFloat, a.staging_buffer_numel());
  record_image_to_nchw_op(context(), a, staging_buffer.buffer());

  submit_to_gpu();
  check_staging_buffer(staging_buffer, 5.0f);

  Params new_block{
      {2, 4, 1},
      0,
      {4.0, 4.0, 4.0, 4.0},
  };

  params.update(new_block);

  submit_to_gpu();
  check_staging_buffer(staging_buffer, 4.0f);
}

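// Dispatches a compute shader that writes each element's index into a staging
// buffer of the given dtype, then copies the buffer back to the host and
// checks the values.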
template <typename T, vkapi::ScalarType dtype>
void test_storage_buffer_type(const size_t len) {
  StagingBuffer buffer(context(), dtype, len);

  std::string kernel_name("idx_fill_buffer");
  switch (dtype) {
    case vkapi::kFloat:
      kernel_name += "_float";
      break;
    case vkapi::kHalf:
      kernel_name += "_half";
      break;
    case vkapi::kQInt8:
      kernel_name += "_int8";
      break;
    case vkapi::kQUInt8:
      kernel_name += "_uint8";
      break;
    default:
      throw std::runtime_error("Unsupported dtype");
      break;
  }

  ParamsBuffer params(context(), int32_t(len));

  {
    uint32_t len_div4 = utils::div_up(uint32_t(len), uint32_t(4));
    vkapi::PipelineBarrier pipeline_barrier{};
    vkapi::SpecVarList specialization_constants = {};
    context()->submit_compute_job(
        VK_KERNEL_FROM_STR(kernel_name),
        pipeline_barrier,
        {64, 1, 1},
        {len_div4, 1, 1},
        specialization_constants,
        VK_NULL_HANDLE,
        0,
        buffer.buffer(),
        params.buffer());
  }

  submit_to_gpu();

  std::vector<T> data(len);
  buffer.copy_to(data.data(), buffer.nbytes());

  for (size_t i = 0; i < len; ++i) {
    CHECK_VALUE(data, i, T(i));
  }
}

TEST_F(VulkanComputeAPITest, test_buffer_float) {
  test_storage_buffer_type<float, vkapi::kFloat>(16);
}

TEST_F(VulkanComputeAPITest, test_buffer_float16) {
  if (!context()->adapter_ptr()->has_full_float16_buffers_support()) {
    GTEST_SKIP();
  }
  test_storage_buffer_type<executorch::aten::Half, vkapi::kHalf>(16);
}

TEST_F(VulkanComputeAPITest, test_buffer_int8) {
  if (!context()->adapter_ptr()->has_full_int8_buffers_support()) {
    GTEST_SKIP();
  }
  test_storage_buffer_type<int8_t, vkapi::kQInt8>(16);
}

TEST_F(VulkanComputeAPITest, test_zero_size_tensor) {
  // Simple test that performs a + b -> c

  std::vector<int64_t> sizes = {0, 5, 7};
  vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
  vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
  vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);

  // Fill input tensors
  fill_vtensor(a, 2.5f);
  fill_vtensor(b, 1.5f);

  // a + b -> c
  record_binary_op(context(), "add", a, b, c);

  // Extract output tensor
  std::vector<float> data_out = extract_vtensor(c);

  // Assert all tensors are empty
  ASSERT_TRUE(a.numel() == 0);
  ASSERT_TRUE(b.numel() == 0);
  ASSERT_TRUE(c.numel() == 0);
  ASSERT_TRUE(a.nbytes() == 0);
  ASSERT_TRUE(b.nbytes() == 0);
  ASSERT_TRUE(c.nbytes() == 0);

  // Check output
  for (size_t i = 0; i < data_out.size(); ++i) {
    CHECK_VALUE(data_out, i, 4.0f);
  }
}

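// Fills a buffer-backed tensor with an increasing sequence starting at 0,
// records a GPU pass that adds 2.0 to every element, and checks the result.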
template <typename T>
void run_buffer_tensor_sanity_check(vTensor& tensor) {
  fill_vtensor(tensor, 0.0f, true);

  record_scalar_add_buffer(context(), tensor, 2.0f);
  std::vector<float> data_out = extract_vtensor(tensor);

  // Check output
  for (size_t i = 0; i < tensor.numel(); ++i) {
    CHECK_VALUE(data_out, i, i + 2.0f);
  }
}

TEST_F(VulkanComputeAPITest, buffer_tensor_sanity_check) {
  for (const auto& sizes : standard_sizes_to_test) {
    for (const auto& dtype : {vkapi::kFloat, vkapi::kHalf, vkapi::kChar}) {
      if (dtype == vkapi::kHalf &&
          !context()->adapter_ptr()->has_full_float16_buffers_support()) {
        continue;
      }
      if (dtype == vkapi::kHalf && utils::multiply_integers(sizes) >= 2048) {
        continue;
      }
      if (dtype == vkapi::kChar &&
          !context()->adapter_ptr()->has_full_int8_buffers_support()) {
        continue;
      }
      if (dtype == vkapi::kChar && utils::multiply_integers(sizes) >= 128) {
        continue;
      }
      for (const auto& layout :
           {utils::kWidthPacked,
            utils::kHeightPacked,
            utils::kChannelsPacked}) {
        vTensor a = vTensor(context(), sizes, dtype, utils::kBuffer, layout);
        switch (dtype) {
          case vkapi::kFloat:
            run_buffer_tensor_sanity_check<float>(a);
            break;
          case vkapi::kHalf:
            run_buffer_tensor_sanity_check<executorch::aten::Half>(a);
            break;
          case vkapi::kChar:
            run_buffer_tensor_sanity_check<int8_t>(a);
            break;
          default:
            VK_THROW("Unsupported dtype");
        }
      }
    }
  }
}

TEST_F(VulkanComputeAPITest, texture_add_sanity_check) {
  // Simple test that performs a + b -> c

  std::vector<int64_t> sizes = {4, 4, 1};
  vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
  vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
  vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);

  // Fill input tensors
  fill_vtensor(a, 2.5f);
  fill_vtensor(b, 1.5f);

  // a + b -> c
  record_binary_op(context(), "add", a, b, c);

  // Extract output tensor
  std::vector<float> data_out = extract_vtensor(c);

  // Check output
  for (size_t i = 0; i < data_out.size(); ++i) {
    CHECK_VALUE(data_out, i, 4.0f);
  }
}

TEST_F(VulkanComputeAPITest, tensor_alias_test) {
  for (utils::StorageType storage_type : {utils::kTexture3D, utils::kBuffer}) {
    std::vector<int64_t> sizes = {9, 9};

    const size_t alloc_count_before = get_vma_allocation_count();

    vTensor original = vTensor(context(), sizes, vkapi::kFloat, storage_type);

    vTensor copy = vTensor(original);

    // Two tensors but only one additional allocation.
    EXPECT_TRUE(get_vma_allocation_count() == alloc_count_before + 1);
    EXPECT_TRUE(copy.is_view_of(original));

    // Fill original tensor with some data
    fill_vtensor(original, 2.5f, true);

    std::vector<float> data_out(copy.staging_buffer_numel());
    // Extract the copy tensor; should contain the data of the original tensor
    extract_vtensor(copy, data_out);

    for (size_t i = 0; i < original.numel(); ++i) {
      CHECK_VALUE(data_out, i, 2.5f + i);
    }
  }
}

TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) {
  constexpr int M = 11;
  constexpr int K = 23;
  constexpr int N = 17;
  std::vector<int64_t> mat1_sizes = {M, K};
  std::vector<int64_t> mat2_sizes = {N, K};
  std::vector<int64_t> out_sizes = {M, N};

  for (const auto storage_type : {utils::kTexture3D, utils::kBuffer}) {
    vTensor mat1 = vTensor(
        context(),
        mat1_sizes,
        vkapi::kFloat,
        storage_type,
        utils::kWidthPacked);
    vTensor mat2 = vTensor(
        context(),
        mat2_sizes,
        vkapi::kFloat,
        storage_type,
        utils::kWidthPacked);
    vTensor out = vTensor(
        context(), out_sizes, vkapi::kFloat, storage_type, utils::kWidthPacked);

    // Generate data
    std::vector<float> mat1_data =
        create_random_float_buffer(mat1.staging_buffer_numel());
    std::vector<float> mat2_data =
        create_random_float_buffer(mat2.staging_buffer_numel());

    // Create direct view and modify sizes and strides later
    vTensor mat2_t = vTensor(mat2);
    // Update sizes and strides of mat2_t to be that of a transposed tensor
    mat2_t.virtual_transpose(0, 1);

    EXPECT_TRUE(mat2_t.packed_dim() == WHCN::kHeightDim);

    std::vector<float> mat2_t_data = transpose_matrix(mat2_data, N, K);
    std::vector<float> ref_out =
        compute_reference_matmul(mat1_data, mat2_t_data, M, K, N);

    // Fill original tensor with some data
    fill_vtensor(mat1, mat1_data);
    fill_vtensor(mat2, mat2_data);

    if (storage_type == utils::kTexture3D) {
      record_matmul_texture3d(context(), out, mat1, mat2_t);
    } else {
      record_reference_matmul(context(), out, mat1, mat2_t);
    }

    std::vector<float> data_out(out.staging_buffer_numel());
    // Extract the copy tensor; should contain the data of the original tensor
    extract_vtensor(out, data_out);

    for (size_t i = 0; i < ref_out.size(); ++i) {
      EXPECT_TRUE(check_close(data_out[i], ref_out[i]));
    }
  }
}

TEST_F(VulkanComputeAPITest, tensor_no_copy_slice_test) {
  constexpr int L = 31;

  // S{N} refers to slice {N}
  constexpr int L_S1 = 17;
  constexpr int O_S1 = 5;

  constexpr int L_S2 = 7;
  constexpr int O_S2 = 3;

  std::vector<int64_t> dim_order = {0};

  std::vector<int64_t> t_sizes = {L};
  std::vector<int64_t> s1_sizes = {L_S1};
  std::vector<int64_t> s2_sizes = {L_S2};

  vTensor orig = CREATE_FLOAT_BUFFER(t_sizes, /*allocate_memory=*/true);

  fill_vtensor(orig, 0);

  vTensor s1 = vTensor(orig, s1_sizes, dim_order, O_S1);
  vTensor s2 = vTensor(s1, s2_sizes, dim_order, O_S2);

  record_scalar_add_buffer(api::context(), s1, 4.5f);
  record_scalar_add_buffer(api::context(), s2, 7.5f);

  std::vector<float> orig_data(orig.staging_buffer_numel());
  extract_vtensor(orig, orig_data);

  int id = 0;
  while (id < O_S1) {
    EXPECT_TRUE(orig_data[id] == 0);
    ++id;
  }
  while (id < O_S1 + O_S2) {
    EXPECT_TRUE(orig_data[id] == 4.5);
    ++id;
  }
  while (id < O_S1 + O_S2 + L_S2) {
    EXPECT_TRUE(orig_data[id] == 12);
    ++id;
  }
  while (id < O_S1 + L_S1) {
    EXPECT_TRUE(orig_data[id] == 4.5);
    ++id;
  }
  while (id < L) {
    EXPECT_TRUE(orig_data[id] == 0);
    ++id;
  }
}

TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) {
  // This test is the same as texture_add_sanity_check, except that the tensor
  // memory is allocated in a deferred fashion

  std::vector<int64_t> sizes = {4, 4, 1};
  vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
  vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
  vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);

  // No allocations made so far
  EXPECT_TRUE(get_vma_allocation_count() == 0);

  std::vector<float> data_a(a.staging_buffer_numel());
  std::fill(data_a.begin(), data_a.end(), 2.5f);
  std::vector<float> data_b(b.staging_buffer_numel());
  std::fill(data_b.begin(), data_b.end(), 1.5f);

  // Allocate memory at the last possible opportunity
  vkapi::Allocation a_mem = allocate_memory_for(a);
  a.image().bind_allocation(a_mem);
  vkapi::Allocation b_mem = allocate_memory_for(b);
  b.image().bind_allocation(b_mem);
  vkapi::Allocation c_mem = allocate_memory_for(c);
  c.image().bind_allocation(c_mem);

  // One allocation for each tensor
  EXPECT_TRUE(get_vma_allocation_count() == 3);

  fill_vtensor(a, data_a);
  fill_vtensor(b, data_b);

  record_binary_op(context(), "add", a, b, c);

  std::vector<float> data_c(c.staging_buffer_numel());
  extract_vtensor(c, data_c);

  for (size_t i = 0; i < data_c.size(); ++i) {
    CHECK_VALUE(data_c, i, 4.0f);
  }
}

TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) {
  // This test performs the following operations:
  // 1. a + b -> c
  // 2. c + d -> e
  // and shares memory between tensors whenever possible.

  std::vector<int64_t> sizes = {4, 4, 1};
  vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
  vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
  vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
  vTensor d = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
  vTensor e = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);

  // No allocations made so far
  EXPECT_TRUE(get_vma_allocation_count() == 0);

  // a and d can share the same memory allocation
  vkapi::Allocation a_d_mem = allocate_memory_for(a);
  a.image().bind_allocation(a_d_mem);
  d.image().bind_allocation(a_d_mem);
  // b and e can share the same memory allocation
  vkapi::Allocation b_e_mem = allocate_memory_for(b);
  b.image().bind_allocation(b_e_mem);
  e.image().bind_allocation(b_e_mem);
  // c must have its own memory allocation
  vkapi::Allocation c_mem = allocate_memory_for(c);
  c.image().bind_allocation(c_mem);

  // 3 allocations should be made
  EXPECT_TRUE(get_vma_allocation_count() == 3);

  // Specify input data
  std::vector<float> data_a(a.staging_buffer_numel());
  std::fill(data_a.begin(), data_a.end(), 2.5f);
  std::vector<float> data_b(b.staging_buffer_numel());
  std::fill(data_b.begin(), data_b.end(), 1.5f);
  std::vector<float> data_d(b.staging_buffer_numel());
  std::fill(data_d.begin(), data_d.end(), 1.0f);

  // First, fill a and b with data
  fill_vtensor(a, data_a);
  fill_vtensor(b, data_b);

  // a + b -> c
  record_binary_op(context(), "add", a, b, c);

  // Now d can be filled with data
  fill_vtensor(d, data_d);

  // c + d -> e
  record_binary_op(context(), "add", c, d, e);

  // Extract data from e
  std::vector<float> data_e(e.staging_buffer_numel());
  extract_vtensor(e, data_e);

  // Sanity check that the values are correct
  for (size_t i = 0; i < data_e.size(); ++i) {
    CHECK_VALUE(data_e, i, 5.0f);
  }
}

TEST_F(VulkanComputeAPITest, resource_bind_twice_fails) {
  // Check that binding a resource that already has memory associated with it
  // fails

  std::vector<int64_t> sizes = {4, 4, 1};
  vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);

  // Try to double bind a resource, which should fail
  vkapi::Allocation a_mem = allocate_memory_for(a);
  EXPECT_THROW(a.image().bind_allocation(a_mem), vkapi::Error);
}

TEST_F(VulkanComputeAPITest, resource_destructor_non_owning_memory) {
  // Check that the destructor of a vTensor that does not own its memory
  // does not free the memory

  vkapi::Allocation memory;

  // Default Allocation constructor should not allocate memory
  EXPECT_TRUE(get_vma_allocation_count() == 0);

  std::vector<int64_t> sizes = {4, 4, 1};
  {
    vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);

    memory = allocate_memory_for(a);
    EXPECT_TRUE(get_vma_allocation_count() == 1);
    a.image().bind_allocation(memory);
  }

  // Check that the memory is still allocated
  EXPECT_TRUE(get_vma_allocation_count() == 1);
}

TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) {
  // Try to encode a command buffer with a vTensor that does not have
  // memory

  std::vector<int64_t> sizes = {4, 4, 1};
  vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);

  // No allocations yet
  EXPECT_TRUE(get_vma_allocation_count() == 0);

  std::vector<float> data_a(a.staging_buffer_numel());
  std::fill(data_a.begin(), data_a.end(), 2.5f);

  // Encoding a command buffer with a vTensor without memory should throw
  EXPECT_THROW(fill_vtensor(a, data_a), vkapi::Error);
}

TEST_F(VulkanComputeAPITest, texture_virtual_resize) {
  context()->set_cmd(/*reusable = */ true);
  std::vector<int64_t> sizes = {8, 12, 12};
  vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
  vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
  vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);

  DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(a)
  DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(b)

  fill_staging(staging_buffer_a, 11.5f);
  fill_staging(staging_buffer_b, 12.5f);

  record_binary_op(context(), "add", a, b, c);

  DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(c)

  submit_to_gpu();
  check_staging_buffer(staging_buffer_c, 24.0f);

  std::vector<std::vector<int64_t>> new_sizes_list = {
      {4, 2, 4}, {4, 3, 6}, {8, 12, 12}, {8, 1, 1}, {8, 11, 10}};

  for (auto& new_sizes : new_sizes_list) {
    a.virtual_resize(new_sizes);
    b.virtual_resize(new_sizes);
    c.virtual_resize(new_sizes);

    fill_staging(
        staging_buffer_a, float(new_sizes[1] + 1.5f), a.staging_buffer_numel());
    fill_staging(
        staging_buffer_b,
        float(new_sizes[2] + 55.0f),
        b.staging_buffer_numel());

    submit_to_gpu();
    check_staging_buffer(
        staging_buffer_c,
        float(new_sizes[1] + new_sizes[2] + 56.5f),
        c.staging_buffer_numel());
  }
}

//
// Compute Graph Tests
//

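// Declares a std::vector<float> named data_<name> sized to <name>'s staging
// buffer and copies the staging data into it after graph execution.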
#define EXTRACT_TENSOR(name)                                 \
  std::vector<float> data_##name(                            \
      graph.get_tensor(name.value)->staging_buffer_numel()); \
  graph.copy_from_staging(name.staging, data_##name.data(), data_##name.size());

// The purpose of this test is simply to track the size of various classes over
// time, in the interest of making sure that they don't grow too large.
TEST_F(VulkanComputeAPITest, print_object_sizes) {
#define PRINT_SIZE(name) \
  std::cout << #name << " size: " << sizeof(name) << " B" << std::endl
  PRINT_SIZE(vTensor);
  PRINT_SIZE(Value);
  PRINT_SIZE(StagingBuffer);
  PRINT_SIZE(ComputeGraph);
  PRINT_SIZE(DispatchNode);
#undef PRINT_SIZE

  // The actual size of each object is dependent on the platform. However, we
  // can alert ourselves to any significant changes in the sizes of these
  // objects by checking each class's `sizeof()` against some loose thresholds.

  // Current known size on 64 bit system: 1040 B
  EXPECT_TRUE(sizeof(vTensor) < 1200);
  // Current known size on 64 bit system: 1056 B
  EXPECT_TRUE(sizeof(Value) < 1200);
  // Current known size on 64 bit system: 120 B
  EXPECT_TRUE(sizeof(StagingBuffer) < 500);
  // Current known size on 64 bit system: 384 B
  EXPECT_TRUE(sizeof(ComputeGraph) < 500);
  // Current known size on 64 bit system: 248 B
  EXPECT_TRUE(sizeof(DispatchNode) < 500);
}

TEST_F(VulkanComputeAPITest, test_tensor_creation_from_vulkan_image) {
  const auto w = 16;
  const auto h = 12;
  const auto d = 1;
  const utils::uvec3 image_extents = {w, h, d};

  vkapi::Adapter* adapter_ptr = context()->adapter_ptr();

  vkapi::ImageSampler::Properties sampler_props{
      VK_FILTER_NEAREST,
      VK_SAMPLER_MIPMAP_MODE_NEAREST,
      VK_SAMPLER_ADDRESS_MODE_REPEAT,
      VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK,
  };

  VkFormat image_format = VK_FORMAT_R32G32B32A32_SFLOAT;
  VkImageType image_type = VK_IMAGE_TYPE_3D;
  VkImageViewType image_view_type = VK_IMAGE_VIEW_TYPE_3D;

  VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props);

  auto image = adapter_ptr->vma().create_image(
      context()->device(),
      vkapi::create_extent3d(image_extents),
      image_format,
      image_type,
      context()->preferred_image_tiling(),
      image_view_type,
      sampler_props,
      sampler,
      /*allow_transfer = */ true,
      /*allocate_memory = */ true);

  auto tensor = vTensor(context(), image);

  const auto exp_sizes = std::vector<int64_t>{w, h, d * 4};
  EXPECT_TRUE(tensor.sizes() == exp_sizes);
  EXPECT_TRUE(tensor.packed_dim() == 2);

  const auto exp_numel = w * h * d * 4;
  EXPECT_TRUE(tensor.numel() == exp_numel);
  EXPECT_TRUE(tensor.padded_numel() == exp_numel);
}

TEST(VulkanComputeGraphTest, test_values_scalars) {
  GraphConfig config;
  ComputeGraph graph(config);

  ValueRef idx;

  idx = graph.add_scalar<int64_t>(4);
  EXPECT_TRUE(graph.get_int(idx) == 4);

  idx = graph.add_scalar<double>(5.5f);
  EXPECT_TRUE(graph.get_double(idx) == 5.5f);
}

TEST(VulkanComputeGraphTest, test_values_scalar_list_inplace_constructed) {
  GraphConfig config;
  ComputeGraph graph(config);

  ValueRef idx = graph.add_scalar_list<int64_t>({1, 2, 3, 4});
  const auto arr = graph.get_int_list(idx);
  EXPECT_TRUE(arr->size() == 4);
  for (int i = 0; i < 4; i++) {
    EXPECT_TRUE(arr->at(i) == i + 1);
  }
}

TEST(VulkanComputeGraphTest, test_values_scalar_list_outside_constructed) {
  GraphConfig config;
  ComputeGraph graph(config);

  ValueRef idx;
  {
    std::vector<double> data = {5.0, 4.0, 3.0, 2.0, 1.0};
    idx = graph.add_scalar_list(std::move(data));
  }
  const auto& arr = graph.get_double_list(idx);
  EXPECT_TRUE(arr->size() == 5);
  for (int i = 0; i < 5; i++) {
    EXPECT_TRUE(arr->at(i) == (5 - i));
  }
}

TEST(VulkanComputeGraphTest, test_values_string) {
  GraphConfig config;
  ComputeGraph graph(config);

  ValueRef idx;
  {
    std::string data = "hello, world";
    idx = graph.add_string(std::move(data));
  }
  std::string stored = graph.get_string(idx);
  EXPECT_TRUE(stored == "hello, world");
}

TEST(VulkanComputeGraphTest, empty_init_graphnode_test) {
  ExecuteNode node(nullptr, {});

  GraphConfig config;
  ComputeGraph graph(config);

  // Encode an empty ExecuteNode and check that command buffer encoding does not
  // crash.
  graph.execute_nodes().emplace_back(new ExecuteNode(nullptr, {}));
  EXPECT_NO_FATAL_FAILURE(graph.encode_execute());
}

TEST(VulkanComputeGraphTest, test_zero_dim_tensor) {
  GraphConfig config;
  ComputeGraph graph(config);

  std::vector<int64_t> size_big = {7, 3, 5};
  std::vector<int64_t> size_small = {};

  // Build graph

  IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat);
  IOValueRef b = graph.add_input_tensor(size_small, vkapi::kFloat);

  IOValueRef out = {};

  out.value = graph.add_tensor(size_big, vkapi::kFloat);

  auto addFn = VK_GET_OP_FN("aten.add.Tensor");
  addFn(graph, {a.value, b.value, kDummyValueRef, out.value});

  out.staging = graph.set_output_tensor(out.value);

  graph.prepare();
  graph.encode_execute();

  // Run graph

  for (float i = 5.0f; i < 30.0f; i += 10.0f) {
    float val_a = i + 2.0f;
    float val_b = i + 1.5f;
    float val_c = val_a + val_b;

    fill_vtensor(graph, a, val_a);
    fill_vtensor(graph, b, val_b);

    graph.execute();

    EXTRACT_TENSOR(out);

    // Sanity check that the values are correct
    for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) {
      CHECK_VALUE(data_out, i, val_c);
    }
  }
}

TEST(VulkanComputeGraphTest, test_simple_graph_with_buffer) {
  GraphConfig config;
  ComputeGraph graph(config);

  std::vector<int64_t> sizes = {7, 13, 19};

  // Build graph

  IOValueRef a = graph.add_input_tensor(sizes, vkapi::kFloat, utils::kBuffer);

  IOValueRef out = {};

  out.value = graph.add_tensor(sizes, vkapi::kFloat, utils::kBuffer);

  auto addFn = VK_GET_OP_FN("aten.abs.default");
  addFn(graph, {a.value, out.value, kDummyValueRef, kDummyValueRef});

  out.staging = graph.set_output_tensor(out.value);

  graph.prepare();
  graph.encode_execute();

  // Run graph

  for (float i = 5.0f; i < 30.0f; i += 10.0f) {
    float val = -i + 2.0f;
    float expected_val = std::abs(val);

    fill_vtensor(graph, a, val);

    graph.execute();

    EXTRACT_TENSOR(out);

    // Sanity check that the values are correct
    for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) {
      CHECK_VALUE(data_out, i, expected_val);
    }
  }
}

TEST(VulkanComputeGraphTest, test_simple_graph_with_view) {
  constexpr int W = 7;
  constexpr int H = 7;
  // slice height
  constexpr int S_H = 2;
  // slice offset
  constexpr int S_O = 3;

  GraphConfig config;
  config.set_storage_type_override(utils::kBuffer);
  ComputeGraph graph(config);

  std::vector<int64_t> dim_order = {0, 1};

  std::vector<int64_t> orig_sizes = {H, W};
  std::vector<int64_t> slice_sizes = {S_H, W};
  const int offset = S_O * W;

  // Build graph

  IOValueRef orig = graph.add_input_tensor(orig_sizes, vkapi::kFloat);
  ValueRef slice =
      graph.add_tensor_view(orig.value, slice_sizes, dim_order, offset);

  EXPECT_TRUE(graph.val_is_view_of(slice, orig.value));

  IOValueRef out = {};

  out.value = graph.add_tensor(slice_sizes, vkapi::kFloat);

  auto opFn = VK_GET_OP_FN("aten.abs.default");
  opFn(graph, {slice, out.value, kDummyValueRef, kDummyValueRef});

  out.staging = graph.set_output_tensor(out.value);

  graph.prepare();
  graph.encode_execute();

  // Run graph

  for (float i = 5.0f; i < 30.0f; i += 10.0f) {
    float start_val = -130 + i;

    fill_vtensor(graph, orig, start_val, true);

    graph.execute();

    EXTRACT_TENSOR(out);

    for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) {
      const float expected_val = std::abs(start_val) - float(offset) - i;
      CHECK_VALUE(data_out, i, expected_val);
    }
  }
}

TEST(VulkanComputeGraphTest, test_graph_view_of_view) {
  GraphConfig config;
  config.set_storage_type_override(utils::kTexture3D);
  ComputeGraph graph(config);

  constexpr int N = 3;
  constexpr int C = 5;
  constexpr int H = 17;
  constexpr int W = 19;

  std::vector<int64_t> orig_sizes = {N, C, H, W};

  // Test a common view of view usage pattern. In delegate execution, the values
  // of the graph are created first; then operators are added. As a result,
  // creating views of views is a bit tricky because metadata updates to a view
  // do not update the metadata of the view's views. Nonetheless, view
  // operators have an implicit assumption that the metadata of the output is
  // equivalent to the metadata of the input. Therefore, view operators must
  // account for unseen updates to the input view by first calling
  // `virtual_clone()` to make the output equivalent to the input before
  // modifying metadata.

  ValueRef t1 = graph.add_tensor(orig_sizes, vkapi::kFloat);
  ValueRef t2 = graph.add_tensor_view(t1);
  ValueRef t3 = graph.add_tensor_view(t2);

  ValueRef channels = graph.add_scalar<int64_t>(1);
  ValueRef height = graph.add_scalar<int64_t>(2);
  ValueRef width = graph.add_scalar<int64_t>(3);

  auto opFn = VK_GET_OP_FN("aten.transpose.int");

  opFn(graph, {t1, channels, height, t2});
  std::vector<int64_t> t2_sizes = graph.sizes_of(t2);
  std::vector<int64_t> expected_t2_sizes = {N, H, C, W};
  EXPECT_TRUE(t2_sizes == expected_t2_sizes);

  opFn(graph, {t2, height, width, t3});
  std::vector<int64_t> t3_sizes = graph.sizes_of(t3);
  std::vector<int64_t> expected_t3_sizes = {N, H, W, C};
  EXPECT_TRUE(t3_sizes == expected_t3_sizes);
}

TEST(VulkanComputeGraphTest, test_simple_graph) {
  GraphConfig config;
  ComputeGraph graph(config);

  std::vector<int64_t> size_big = {1, 8, 8};
  std::vector<int64_t> size_small = {1, 1, 8};

  // Build graph

  IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat);
  IOValueRef b = graph.add_input_tensor(size_small, vkapi::kFloat);

  IOValueRef out = {};

  out.value = graph.add_tensor(size_big, vkapi::kFloat);

  auto addFn = VK_GET_OP_FN("aten.add.Tensor");
  addFn(graph, {a.value, b.value, kDummyValueRef, out.value});

  out.staging = graph.set_output_tensor(out.value);

  graph.prepare();
  graph.encode_execute();

  // Run graph

  for (float i = 5.0f; i < 30.0f; i += 10.0f) {
    float val_a = i + 2.0f;
    float val_b = i + 1.5f;
    float val_c = val_a + val_b;

    fill_vtensor(graph, a, val_a);
    fill_vtensor(graph, b, val_b);

    graph.execute();

    EXTRACT_TENSOR(out);

    // Sanity check that the values are correct
    for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) {
      CHECK_VALUE(data_out, i, val_c);
    }
  }
}

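// Dispatches a custom shader through a DispatchNode that adds a symbolic
// integer (read from a parameter buffer) to every element of the input, and
// checks that updating the SymInt between executions is reflected in the
// output.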
TEST(VulkanComputeGraphTest, test_simple_graph_with_symint) {
  GraphConfig config;
  config.set_storage_type_override(utils::kTexture3D);
  ComputeGraph graph(config);

  std::vector<int64_t> sizes = {8, 64, 124};

  // Build graph

  ValueRef scalar = graph.add_symint(1);
  IOValueRef a = graph.add_input_tensor(sizes, vkapi::kFloat);

  IOValueRef out = {};
  out.value = a.value;

  graph.execute_nodes().emplace_back(new DispatchNode(
      graph,
      VK_KERNEL_FROM_STR("scalar_add_texture"),
      graph.create_global_wg_size(a.value),
      graph.create_local_wg_size(a.value),
      // Inputs and Outputs
      {{out.value, vkapi::MemoryAccessType::WRITE}},
      // Shader params buffers
      {graph.logical_limits_ubo(a.value),
       graph.get_or_create_int_param_buffer(scalar)},
      // Specialization Constants
      {},
      // Resizing Logic
      nullptr,
      {}));

  out.staging = graph.set_output_tensor(out.value);

  graph.prepare();
  graph.encode_execute();

  // Run graph

  for (float i = 5.0f; i < 30.0f; i += 10.0f) {
    int scalar_val = i - 3.0f;
    graph.set_symint(scalar, scalar_val);

    int32_t scalar_val_read = graph.read_symint(scalar);
    EXPECT_TRUE(scalar_val_read == scalar_val);

    float val_a = i + 2.0f;
    float val_out = val_a + scalar_val;

    fill_vtensor(graph, a, val_a);

    graph.execute();

    EXTRACT_TENSOR(out);

    // Sanity check that the values are correct
    for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) {
      CHECK_VALUE(data_out, i, val_out);
    }
  }
}

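// Creates a host-side std::vector<float> named data_<name> filled with `val`
// and registers it with the graph as a TensorRef value named <name>.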
1501 #define CREATE_WEIGHT_TENSOR(name, sizes, dtype, val)              \
1502   std::vector<float> data_##name(utils::multiply_integers(sizes)); \
1503   std::fill(data_##name.begin(), data_##name.end(), val);          \
1504   ValueRef name = graph.add_tensorref(sizes, dtype, data_##name.data());
1505 
TEST(VulkanComputeGraphTest,test_simple_prepacked_graph)1506 TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) {
1507   GraphConfig config;
1508   config.enable_querypool = true;
1509   ComputeGraph graph(config);
1510 
1511   std::vector<int64_t> size_big = {8, 73, 62};
1512   std::vector<int64_t> size_small = {8, 73, 1};
1513 
1514   CREATE_WEIGHT_TENSOR(w1, size_small, vkapi::kFloat, 3.5f);
1515   CREATE_WEIGHT_TENSOR(w2, size_small, vkapi::kFloat, 3.0f);
1516 
1517   // Build graph
1518 
1519   IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat);
1520 
1521   ValueRef c = graph.add_tensor(size_big, vkapi::kFloat);
1522   ValueRef e = graph.add_tensor(size_big, vkapi::kFloat);
1523 
1524   ValueRef w1_packed = graph.add_tensor(size_small, vkapi::kFloat);
1525   ValueRef w2_packed = graph.add_tensor(size_small, vkapi::kFloat);
1526 
1527   auto prepackFn = VK_GET_OP_FN("et_vk.prepack.default");
1528   prepackFn(graph, {w1, w1_packed});
1529   prepackFn(graph, {w2, w2_packed});
1530 
1531   auto addFn = VK_GET_OP_FN("aten.add.Tensor");
1532   addFn(graph, {a.value, w1_packed, kDummyValueRef, c});
1533 
1534   auto mulFn = VK_GET_OP_FN("aten.mul.Tensor");
1535   mulFn(graph, {c, w2_packed, e});
1536 
1537   IOValueRef out = {};
1538   out.value = e;
1539   out.staging = graph.set_output_tensor(out.value);
1540 
1541   graph.prepare();
1542 
1543   graph.encode_prepack();
1544   graph.prepack();
1545 
1546   graph.encode_execute();
1547 
1548   // Run graph
1549 
1550   for (float i = 5.0f; i < 30.0f; i += 10.0f) {
1551     float val_out = (i + 3.5f) * 3.0f;
1552 
1553     fill_vtensor(graph, a, i);
1554 
1555     // Execute graph
1556     graph.execute();
1557 
1558     EXTRACT_TENSOR(out);
1559 
1560     // Sanity check that the values are correct
1561     for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) {
1562       CHECK_VALUE(data_out, i, val_out);
1563     }
1564 
1565     if (graph.context()->querypool()) {
1566       graph.context()->querypool().extract_results();
1567       graph.context()->querypool().print_results();
1568     }
1569   }
1570 }
1571 
TEST(VulkanComputeGraphTest,test_simple_shared_objects_with_resize)1572 TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
1573   GraphConfig config;
1574   ComputeGraph graph(config);
1575   size_t expected_vma_allocation_count = 0;
1576 
1577   std::vector<int64_t> size_big = {12, 64, 64};
1578   std::vector<int64_t> size_small = {12, 64, 64};
1579 
1580   // Build graph and regularly check allocation counts
1581 
1582   IOValueRef a = graph.add_input_tensor(
1583       size_big,
1584       vkapi::kFloat,
1585       /*shared_object_idx = */ 2);
1586   IOValueRef b = graph.add_input_tensor(
1587       size_small,
1588       vkapi::kFloat,
1589       /*shared_object_idx = */ 4);
1590 
1591   // +2: t.sizes_ubo() for each staging shader
1592   // +2: staging buffer for each input tensor
1593   expected_vma_allocation_count += 4;
1594   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
1595 
1596   ValueRef c = graph.add_tensor(
1597       size_big,
1598       vkapi::kFloat,
1599       /*shared_object_idx = */ 6);
1600 
1601   auto addFn = VK_GET_OP_FN("aten.add.Tensor");
1602   addFn(graph, {a.value, b.value, kDummyValueRef, c});
1603 
1604   // +2: alpha UBO, broadcast UBO for arithmetic shader
1605   // +1: t.sizes_ubo() for arithmetic shader output c
1606   expected_vma_allocation_count += 3;
1607   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
1608 
1609   IOValueRef d = graph.add_input_tensor(
1610       size_small,
1611       vkapi::kFloat,
1612       /*shared_object_idx = */ 2);
1613 
1614   // +1: t.sizes_ubo() uniform buffer for staging shader
1615   // +1: staging buffer for the input tensor
1616   expected_vma_allocation_count += 2;
1617   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
1618 
1619   ValueRef e = graph.add_tensor(
1620       size_big,
1621       vkapi::kFloat,
1622       /*shared_object_idx = */ 4);
1623 
1624   auto mulFn = VK_GET_OP_FN("aten.mul.Tensor");
1625   mulFn(graph, {c, d.value, e});
1626 
1627   // +2: alpha UBO, broadcast UBO for arithmetic shader
1628   // +1: t.sizes_ubo() for arithmetic shader output e
1629   expected_vma_allocation_count += 3;
1630   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
1631 
1632   IOValueRef out = {};
1633   out.value = e;
1634   out.staging = graph.set_output_tensor(out.value);
1635 
1636   // +1: staging buffer for the output tensor
1637   expected_vma_allocation_count += 1;
1638   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
1639 
1640   graph.prepare();
1641   graph.encode_execute();
1642 
1643   // +3: shared memory allocations for tensors
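  // (Five tensors share three shared objects: a and d use index 2, b and e use
  // index 4, and c uses index 6, so only three backing memory allocations are
  // needed.)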
1644   expected_vma_allocation_count += 3;
1645   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
1646 
1647   // Run graph
1648 
1649   std::vector<std::vector<int64_t>> new_sizes_list = {
1650       {8, 44, 34}, {4, 13, 56}, {8, 12, 64}, {12, 55, 33}, {4, 54, 10}};
1651 
1652   for (auto& new_sizes : new_sizes_list) {
1653     graph.get_tensor(a.value)->virtual_resize(new_sizes);
1654     graph.get_tensor(b.value)->virtual_resize(new_sizes);
1655     graph.get_tensor(c)->virtual_resize(new_sizes);
1656     graph.get_tensor(d.value)->virtual_resize(new_sizes);
1657     graph.get_tensor(e)->virtual_resize(new_sizes);
1658 
1659     float val_a = new_sizes[1] + 4.0f;
1660     float val_b = new_sizes[2] + 1.5f;
1661     float val_d = new_sizes[0] + 2.0f;
1662     float val_out = (val_a + val_b) * val_d;
1663 
1664     fill_vtensor(graph, a, val_a);
1665     fill_vtensor(graph, b, val_b);
1666     fill_vtensor(graph, d, val_d);
1667 
1668     // Execute graph
1669     graph.execute();
1670 
1671     EXTRACT_TENSOR(out);
1672 
1673     // Sanity check that the values are correct
1674     for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); i++) {
1675       CHECK_VALUE(data_out, i, val_out);
1676     }
1677   }
1678 
1679   std::vector<std::vector<int64_t>> new_sizes_list_2 = {
1680       {8, 44, 34}, {4, 13, 56}, {8, 12, 64}, {12, 55, 33}, {4, 54, 10}};
1681 
1682   for (auto& new_sizes : new_sizes_list_2) {
1683     graph.resize_input(0, new_sizes);
1684     graph.resize_input(1, new_sizes);
1685     graph.resize_input(2, new_sizes);
1686     graph.propagate_resize();
1687 
1688     // Check output shape
1689     EXPECT_TRUE(graph.get_tensor(out.value)->sizes() == new_sizes);
1690 
1691     float val_a = new_sizes[1] + 6.0f;
1692     float val_b = new_sizes[2] + 2.5f;
1693     float val_d = new_sizes[0] + 4.0f;
1694     float val_out = (val_a + val_b) * val_d;
1695 
1696     fill_vtensor(graph, a, val_a);
1697     fill_vtensor(graph, b, val_b);
1698     fill_vtensor(graph, d, val_d);
1699 
1700     // Execute graph
1701     graph.execute();
1702 
1703     EXTRACT_TENSOR(out);
1704 
1705     // Sanity check that the values are correct
1706     for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); i++) {
1707       CHECK_VALUE(data_out, i, val_out);
1708     }
1709   }
1710 }
1711 
TEST(VulkanComputeGraphTest,test_simple_graph_with_tmp_tensors)1712 TEST(VulkanComputeGraphTest, test_simple_graph_with_tmp_tensors) {
1713   GraphConfig config;
1714   ComputeGraph graph(config);
1715 
1716   std::vector<int64_t> size_big = {8, 64, 124};
1717   std::vector<int64_t> size_small = {8, 1, 124};
1718 
1719   // Build graph
1720 
1721   IOValueRef a = graph.add_input_tensor(
1722       size_big, vkapi::kFloat, /*shared_object_idx = */ 0);
1723   IOValueRef b = graph.add_input_tensor(
1724       size_small, vkapi::kFloat, /*shared_object_idx = */ 1);
1725 
1726   IOValueRef out = {};
1727 
1728   out.value =
1729       graph.add_tensor(size_big, vkapi::kFloat, /*shared_object_idx = */ 2);
1730 
1731   // Perform the following compute
1732   //
1733   // a, b, out;
1734   // {
1735   //   inter;
1736   //   {
1737   //     tmp = a + b
1738   //     tmp2 = tmp + a
1739   //     inter = tmp2 + b
1740   //   }
1741   //   {
1742   //     tmp = inter + b;
1743   //     tmp2 = tmp + a
1744   //     out = tmp2 + b;
1745   //   }
1746   // }
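  // Each TmpTensor grabs a shared object slot on construction and presumably
  // releases it when it goes out of scope, which is why the second block below
  // is expected to reuse shared object indices 4 and 5.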
1747   {
1748     TmpTensor inter(&graph, size_big, vkapi::kFloat);
1749     EXPECT_TRUE(inter.sobj_idx == 3);
1750     {
1751       TmpTensor tmp(&graph, size_big, vkapi::kFloat);
1752       EXPECT_TRUE(tmp.sobj_idx == 4);
1753       VK_GET_OP_FN("aten.add.Tensor")
1754       (graph, {a, b, kDummyValueRef, tmp});
1755 
1756       TmpTensor tmp2(&graph, size_big, vkapi::kFloat);
1757       EXPECT_TRUE(tmp2.sobj_idx == 5);
1758       VK_GET_OP_FN("aten.add.Tensor")
1759       (graph, {tmp, a, kDummyValueRef, tmp2});
1760 
1761       VK_GET_OP_FN("aten.add.Tensor")
1762       (graph, {tmp2, b, kDummyValueRef, inter});
1763     }
1764     {
1765       TmpTensor tmp(&graph, size_big, vkapi::kFloat);
1766       EXPECT_TRUE(tmp.sobj_idx == 4);
1767       VK_GET_OP_FN("aten.add.Tensor")
1768       (graph, {inter, b, kDummyValueRef, tmp});
1769 
1770       TmpTensor tmp2(&graph, size_big, vkapi::kFloat);
1771       EXPECT_TRUE(tmp2.sobj_idx == 5);
1772       VK_GET_OP_FN("aten.add.Tensor")
1773       (graph, {tmp, a, kDummyValueRef, tmp2});
1774 
1775       VK_GET_OP_FN("aten.add.Tensor")
1776       (graph, {tmp2, b, kDummyValueRef, out});
1777     }
1778   }
1779 
1780   out.staging = graph.set_output_tensor(out.value);
1781 
1782   graph.prepare();
1783   graph.encode_execute();
1784 
1785   // Run graph
1786 
1787   for (float i = 5.0f; i < 30.0f; i += 10.0f) {
1788     float val_a = i + 2.0f;
1789     float val_b = i + 1.5f;
1790     float val_tmp = val_a + val_b;
1791     float val_tmp2 = val_tmp + val_a;
1792     float val_inter = val_tmp2 + val_b;
1793     float val_tmp_2 = val_inter + val_b;
1794     float val_tmp2_2 = val_tmp_2 + val_a;
1795     float val_out = val_tmp2_2 + val_b;
1796 
1797     fill_vtensor(graph, a, val_a);
1798     fill_vtensor(graph, b, val_b);
1799 
1800     graph.execute();
1801 
1802     EXTRACT_TENSOR(out);
1803 
1804     // Sanity check that the values are correct
1805     for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) {
1806       CHECK_VALUE(data_out, i, val_out);
1807     }
1808   }
1809 }
1810 
TEST(VulkanComputeGraphTest,test_large_graph)1811 TEST(VulkanComputeGraphTest, test_large_graph) {
1812   auto build_start_time = std::chrono::system_clock::now();
1813   GraphConfig config;
1814   ComputeGraph graph(config);
1815 
1816   int64_t input_w = 256;
1817   int64_t input_h = 256;
1818   int64_t input_c = 8;
1819 
1820   std::vector<int64_t> size_big = {input_c, input_h, input_w};
1821   std::vector<int64_t> size_small = {input_c, input_h, 1};
1822 
1823   std::vector<int64_t> size_big_alt = {input_c / 2, input_h / 2, input_w / 2};
1824   std::vector<int64_t> size_small_alt = {input_c / 2, input_h / 2, 1};
1825 
1826   // Build graph
1827 
1828   IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat, 2);
1829   IOValueRef b = graph.add_input_tensor(size_small, vkapi::kFloat, 4);
1830 
1831   ValueRef c = graph.add_tensor(size_big, vkapi::kFloat, 6);
1832 
1833   auto addFn = VK_GET_OP_FN("aten.add.Tensor");
1834   addFn(graph, {a.value, b.value, kDummyValueRef, c});
1835 
1836   int n = 100;
1837 
1838   for (int i = 0; i < n; i++) {
1839     addFn(graph, {c, b.value, kDummyValueRef, a.value});
1840 
1841     addFn(graph, {a.value, b.value, kDummyValueRef, c});
1842   }
1843 
1844   IOValueRef out = {};
1845   out.value = c;
1846   out.staging = graph.set_output_tensor(out.value);
1847 
1848   graph.prepare();
1849   graph.encode_execute();
1850 
1851   auto build_end_time = std::chrono::system_clock::now();
1852 
1853   auto build_time = std::chrono::duration_cast<std::chrono::microseconds>(
1854       build_end_time - build_start_time);
1855 
1856   std::stringstream ss;
1857   for (int i = 0; i < 10; i++) {
1858     auto resize_start_time = std::chrono::system_clock::now();
1859     if (i % 2 == 0) {
1860       graph.resize_input(0, size_big_alt);
1861       graph.resize_input(1, size_small_alt);
1862     } else {
1863       graph.resize_input(0, size_big);
1864       graph.resize_input(1, size_small);
1865     }
1866     graph.propagate_resize();
1867     auto resize_end_time = std::chrono::system_clock::now();
1868 
1869     auto resize_time = std::chrono::duration_cast<std::chrono::microseconds>(
1870         resize_end_time - resize_start_time);
1871 
1872     float val_a = 1.0f;
1873     float val_b = 2.0f;
1874 
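    // The graph computes c = a + b once, then each of the n loop iterations
    // adds b twice more (a = c + b, then c = a + b), so the expected output is
    // val_a + val_b * (2 * n + 1).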
1875     float val_e = val_a + val_b * (2 * n + 1);
1876 
1877     auto inference_start_time = std::chrono::system_clock::now();
1878 
1879     fill_vtensor(graph, a, val_a);
1880     fill_vtensor(graph, b, val_b);
1881 
1882     graph.execute();
1883 
1884     EXTRACT_TENSOR(out);
1885 
1886     auto inference_end_time = std::chrono::system_clock::now();
1887 
1888     auto inference_time = std::chrono::duration_cast<std::chrono::microseconds>(
1889         inference_end_time - inference_start_time);
1890 
1891     for (int i = 0; i < graph.get_tensor(out.value)->numel(); i++) {
1892       CHECK_VALUE(data_out, i, val_e);
1893     }
1894 
1895     ss << "[          ] Resize:    " << std::setw(10) << std::right
1896        << resize_time.count() << " us" << std::endl;
1897     ss << "[          ] Inference: " << std::setw(10) << std::right
1898        << inference_time.count() << " us" << std::endl;
1899   }
1900   ss << "[          ] Model Load:" << std::setw(10) << std::right
1901      << build_time.count() << " us" << std::endl;
1902   std::cout << ss.str();
1903 }
1904 
test_clone(std::vector<int64_t> sizes,utils::StorageType src_storage,utils::GPUMemoryLayout src_layout,utils::StorageType dst_storage,utils::GPUMemoryLayout dst_layout)1905 void test_clone(
1906     std::vector<int64_t> sizes,
1907     utils::StorageType src_storage,
1908     utils::GPUMemoryLayout src_layout,
1909     utils::StorageType dst_storage,
1910     utils::GPUMemoryLayout dst_layout) {
1911   GraphConfig config;
1912   ComputeGraph graph(config);
1913 
1914   IOValueRef a =
1915       graph.add_input_tensor(sizes, vkapi::kFloat, src_storage, src_layout);
1916 
1917   IOValueRef out = {};
1918   out.value = graph.add_tensor(sizes, vkapi::kFloat, dst_storage, dst_layout);
1919 
1920   auto copyFn = VK_GET_OP_FN("aten.clone.default");
1921   copyFn(graph, {a.value, kDummyValueRef, out.value});
1922 
1923   out.staging = graph.set_output_tensor(out.value);
1924 
1925   graph.prepare();
1926   graph.encode_execute();
1927 
1928   fill_vtensor(graph, a, 0.0f, /*iota = */ true);
1929 
1930   graph.propagate_resize();
1931   graph.execute();
1932 
1933   EXTRACT_TENSOR(out);
1934   EXTRACT_TENSOR(a);
1935 
1936   for (int i = 0; i < graph.numel_of(a.value); ++i) {
1937     EXPECT_TRUE(data_out[i] == data_a[i]);
1938   }
1939 }
1940 
TEST(VulkanComputeGraphTest,test_clone)1941 TEST(VulkanComputeGraphTest, test_clone) {
1942   std::vector<std::pair<utils::GPUMemoryLayout, utils::GPUMemoryLayout>> cases{
1943       {utils::kWidthPacked, utils::kWidthPacked},
1944       {utils::kWidthPacked, utils::kChannelsPacked},
1945       {utils::kChannelsPacked, utils::kChannelsPacked},
1946   };
1947 
1948   for (std::vector<int64_t> sizes : standard_sizes_to_test) {
1949     for (auto& [src_layout, dst_layout] : cases) {
1950       test_clone(
1951           sizes, utils::kTexture3D, src_layout, utils::kBuffer, dst_layout);
1952       test_clone(
1953           sizes, utils::kBuffer, src_layout, utils::kTexture3D, dst_layout);
1954       test_clone(
1955           sizes, utils::kTexture3D, src_layout, utils::kTexture3D, dst_layout);
1956     }
1957   }
1958 }
1959 
TEST(VulkanComputeGraphTest,test_etvk_copy_offset_node)1960 TEST(VulkanComputeGraphTest, test_etvk_copy_offset_node) {
1961   GraphConfig config;
1962   ComputeGraph graph(config);
1963 
1964   int64_t n = 6;
1965   int64_t c = 12;
1966   int64_t h = 4;
1967   int64_t w = 8;
1968   utils::GPUMemoryLayout memory_layout =
1969       utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
1970 
1971   std::vector<int64_t> size = {n, c, h, w};
1972 
1973   IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout);
1974 
1975   IOValueRef out = {};
1976   out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout);
1977 
1978   // Notice that copy_node operates on the texture's x, y, z dimensions. In the
1979   // comments, we provide the corresponding coordinates in nchw.
1980 
1981   // src_offset is (n=0, c=4, h=1, w=1)
1982   ValueRef src_offset_ref = graph.add_scalar_list<int64_t>({1, 1, 1});
1983 
1984   // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate
1985   // Argument is {x, y, z}.
1986   // x = 0 since w = 0
1987   // y = 2 since h = 2
1988   // z = c / 4 + 2 since
1989   //   1. there are c / 4 planes per batch; n=1 means we skip past the first batch;
1990   //   2. +2 because c = 8, which with channel packing spans two texels.
1991   ValueRef dst_offset_ref = graph.add_scalar_list<int64_t>({0, 2, c / 4 + 2});
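  // Worked example with the values above: c = 12, so each batch occupies
  // c / 4 = 3 channels-packed texel planes. Batch n=1 contributes 3 planes and
  // channel offset 8 contributes 8 / 4 = 2 texels, giving z = 3 + 2 = 5,
  // i.e. the c / 4 + 2 used above.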
1992 
1993   // range is (n=1, c=8, h=2, w=4)
1994   // Argument is {x, y, z}.
1995   // x = 4 since w = 4
1996   // y = 2 since h = 2
1997   // z = 2 since we are only copying 8 channels, hence 2 texels. n = 1 can be a
1998   // bit misleading here, since it gives the impression that we are copying the
1999   // entire channel dimension. However, remember that the copy performs
2000   // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range],
2001   // so range must be non-zero.
2002   ValueRef range_ref = graph.add_scalar_list<int64_t>({4, 2, 2});
2003 
2004   auto copyFn = VK_GET_OP_FN("etvk.copy_offset");
2005   copyFn(
2006       graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value});
2007 
2008   out.staging = graph.set_output_tensor(out.value);
2009 
2010   graph.prepare();
2011   graph.encode_execute();
2012 
2013   fill_vtensor(graph, a, 0.0f, /*iota = */ true);
2014 
2015   graph.execute();
2016 
2017   EXTRACT_TENSOR(out);
2018   EXTRACT_TENSOR(a);
2019 
2020   // We will examine the results in the destination range.
2021   // The value at the corresponding coordinate should match between the source
2022   // and destination tensors. We loop through the range, calculate both the src and
2023   // dst index using the offsets, and compare the values in the extracted
2024   // vector. They should match.
2025   int n_idx = 0;
2026   // In each nested loop, the index ranges from dst_offset to dst_offset + range.
2027 
2028   for (int c_idx = 0; c_idx < 8; c_idx++) {
2029     for (int h_idx = 0; h_idx < 2; h_idx++) {
2030       for (int w_idx = 0; w_idx < 4; w_idx++) {
2031         auto dst_idx =
2032             get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx});
2033         auto src_idx =
2034             get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1});
2035 
2036         EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]);
2037       }
2038     }
2039   }
2040 }
2041 
TEST(VulkanComputeGraphTest,test_etvk_copy_channel_offset_node)2042 TEST(VulkanComputeGraphTest, test_etvk_copy_channel_offset_node) {
2043   GraphConfig config;
2044   ComputeGraph graph(config);
2045 
2046   int64_t n = 2;
2047   int64_t c = 12;
2048   int64_t h = 4;
2049   int64_t w = 8;
2050   utils::GPUMemoryLayout memory_layout =
2051       utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
2052 
2053   std::vector<int64_t> size = {n, c, h, w};
2054 
2055   IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout);
2056 
2057   IOValueRef out = {};
2058   out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout);
2059 
2060   int64_t src_offset = 2;
2061   int64_t dst_offset = 3;
2062   int64_t range = 7;
2063 
2064   ValueRef src_offset_ref = graph.add_scalar<int64_t>(src_offset);
2065   ValueRef dst_offset_ref = graph.add_scalar<int64_t>(dst_offset);
2066   ValueRef range_ref = graph.add_scalar<int64_t>(range);
2067 
2068   auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset");
2069   copyFn(
2070       graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value});
2071 
2072   out.staging = graph.set_output_tensor(out.value);
2073 
2074   graph.prepare();
2075   graph.encode_execute();
2076 
2077   fill_vtensor(graph, a, 0.0f, true);
2078 
2079   graph.execute();
2080 
2081   EXTRACT_TENSOR(out);
2082   EXTRACT_TENSOR(a);
2083 
2084   for (int n_idx = 0; n_idx < n; n_idx++) {
2085     for (int c_idx = 0; c_idx < range; c_idx++) {
2086       for (int h_idx = 0; h_idx < h; h_idx++) {
2087         for (int w_idx = 0; w_idx < w; w_idx++) {
2088           auto src_idx =
2089               get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx});
2090           auto dst_idx = get_buf_idx(
2091               graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx});
2092           EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]);
2093         }
2094       }
2095     }
2096   }
2097 }
2098 
TEST(VulkanComputeGraphTest,test_etvk_copy_channel_offset_node_clean_boundary)2099 TEST(
2100     VulkanComputeGraphTest,
2101     test_etvk_copy_channel_offset_node_clean_boundary) {
2102   // The tricky part of channel copy is handling the boundaries across multiple
2103   // copies. For example, when we concat two [3, 1, 1] nchw-tensors along the
2104   // channel dimension, due to channel packing, elements from different source
2105   // texels will be packed into the same destination texel at the boundaries.
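  // Concretely, with channels packed four to a texel, the concatenated result's
  // first texel holds channels {0, 1, 2, 3}: channels 0-2 come from the first
  // source tensor and channel 3 from the second, so a naive copy of either
  // source would clobber the other's data in that shared texel.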
2106   GraphConfig config;
2107   ComputeGraph graph(config);
2108 
2109   int64_t n = 2;
2110   int64_t c = 12;
2111   int64_t h = 4;
2112   int64_t w = 8;
2113   utils::GPUMemoryLayout memory_layout =
2114       utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
2115 
2116   std::vector<int64_t> size = {n, c, h, w};
2117 
2118   IOValueRef zero = graph.add_input_tensor(size, vkapi::kFloat, memory_layout);
2119   IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout);
2120   IOValueRef b = graph.add_input_tensor(size, vkapi::kFloat, memory_layout);
2121 
2122   IOValueRef out = {};
2123   out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout);
2124 
2125   auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset");
2126 
2127   // Make sure the entire out tensor is zeroed. The zero tensor will be filled
2128   // with zeros later.
2129   copyFn(
2130       graph,
2131       {zero.value,
2132        graph.add_scalar<int64_t>(c),
2133        graph.add_scalar<int64_t>(0),
2134        graph.add_scalar<int64_t>(0),
2135        out.value});
2136 
2137   int64_t a_src_offset = 0;
2138   int64_t a_dst_offset = 2;
2139   int64_t a_range = 5;
2140   // a will write to channels [2, 7)
2141   copyFn(
2142       graph,
2143       {a.value,
2144        graph.add_scalar<int64_t>(a_range),
2145        graph.add_scalar<int64_t>(a_src_offset),
2146        graph.add_scalar<int64_t>(a_dst_offset),
2147        out.value});
2148 
2149   // b will write to channels [6, 11)
2150   // It is intentional for b to overwrite channel 6
2151   int64_t b_src_offset = 0;
2152   int64_t b_dst_offset = 6;
2153   int64_t b_range = 5;
2154 
2155   copyFn(
2156       graph,
2157       {b.value,
2158        graph.add_scalar<int64_t>(b_range),
2159        graph.add_scalar<int64_t>(b_src_offset),
2160        graph.add_scalar<int64_t>(b_dst_offset),
2161        out.value});
2162 
2163   out.staging = graph.set_output_tensor(out.value);
2164 
2165   graph.prepare();
2166   graph.encode_execute();
2167 
2168   float a_value = 1.0f;
2169   float b_value = 2.0f;
2170   float zero_value = 0.0f;
2171   fill_vtensor(graph, a, a_value);
2172   fill_vtensor(graph, b, b_value);
2173   fill_vtensor(graph, zero, zero_value);
2174 
2175   graph.execute();
2176 
2177   EXTRACT_TENSOR(out);
2178 
2179   for (int n_idx = 0; n_idx < n; n_idx++) {
2180     // c_idx stops before the last channel because b is expected to overwrite it
2181     for (int c_idx = a_dst_offset; c_idx < a_dst_offset + a_range - 1;
2182          c_idx++) {
2183       for (int h_idx = 0; h_idx < h; h_idx++) {
2184         for (int w_idx = 0; w_idx < w; w_idx++) {
2185           auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx});
2186           EXPECT_TRUE(data_out[dst_idx] == a_value);
2187         }
2188       }
2189     }
2190   }
2191 
2192   for (int n_idx = 0; n_idx < n; n_idx++) {
2193     for (int c_idx = b_dst_offset; c_idx < b_dst_offset + b_range; c_idx++) {
2194       for (int h_idx = 0; h_idx < h; h_idx++) {
2195         for (int w_idx = 0; w_idx < w; w_idx++) {
2196           auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx});
2197           EXPECT_TRUE(data_out[dst_idx] == b_value);
2198         }
2199       }
2200     }
2201   }
2202 
2203   // Also verify that channels before a_dst_offset and at/after b_dst_offset +
2204   // b_range remain zero.
2205   for (int n_idx = 0; n_idx < n; n_idx++) {
2206     for (int c_idx = 0; c_idx < a_dst_offset; c_idx++) {
2207       for (int h_idx = 0; h_idx < h; h_idx++) {
2208         for (int w_idx = 0; w_idx < w; w_idx++) {
2209           auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx});
2210           EXPECT_TRUE(data_out[dst_idx] == zero_value);
2211         }
2212       }
2213     }
2214   }
2215 
2216   for (int n_idx = 0; n_idx < n; n_idx++) {
2217     for (int c_idx = b_dst_offset + b_range; c_idx < c; c_idx++) {
2218       for (int h_idx = 0; h_idx < h; h_idx++) {
2219         for (int w_idx = 0; w_idx < w; w_idx++) {
2220           auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx});
2221           EXPECT_TRUE(data_out[dst_idx] == zero_value);
2222         }
2223       }
2224     }
2225   }
2226 }
2227 
TEST(VulkanComputeGraphTest,test_etvk_copy_offset_int_node)2228 TEST(VulkanComputeGraphTest, test_etvk_copy_offset_int_node) {
2229   GraphConfig config;
2230   ComputeGraph graph(config);
2231 
2232   int64_t n = 6;
2233   int64_t c = 12;
2234   int64_t h = 4;
2235   int64_t w = 8;
2236   utils::GPUMemoryLayout memory_layout =
2237       utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
2238 
2239   std::vector<int64_t> size = {n, c, h, w};
2240 
2241   IOValueRef a = graph.add_input_tensor(size, vkapi::kInt, memory_layout);
2242 
2243   IOValueRef out = {};
2244   out.value = graph.add_tensor(size, vkapi::kInt, memory_layout);
2245 
2246   // Notice that copy_node operates on the texture's x, y, z dimensions. In the
2247   // comments, we provide the corresponding coordinates in nchw.
2248 
2249   // src_offset is (n=0, c=4, h=1, w=1)
2250   ValueRef src_offset_ref = graph.add_scalar_list<int64_t>({1, 1, 1});
2251 
2252   // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate
2253   // Argument is {x, y, z}.
2254   // x = 0 since w = 0
2255   // y = 2 since h = 2
2256   // z = c / 4 + 2 since
2257   //   1. there are c / 4 planes per batch; n=1 means we skip past the first batch;
2258   //   2. +2 because c = 8, which with channel packing spans two texels.
2259   ValueRef dst_offset_ref = graph.add_scalar_list<int64_t>({0, 2, c / 4 + 2});
2260 
2261   // range is (n=1, c=8, h=2, w=4)
2262   // Argument is {x, y, z}.
2263   // x = 4 since w = 4
2264   // y = 2 since h = 2
2265   // z = 2 since we are only copying 8 channels, hence 2 texels. n = 1 can be a
2266   // bit misleading here, since it gives the impression that we are copying the
2267   // entire channel dimension. However, remember that the copy performs
2268   // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range],
2269   // so range must be non-zero.
2270   ValueRef range_ref = graph.add_scalar_list<int64_t>({4, 2, 2});
2271 
2272   auto copyFn = VK_GET_OP_FN("etvk.copy_offset");
2273   copyFn(
2274       graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value});
2275 
2276   out.staging = graph.set_output_tensor(out.value);
2277 
2278   graph.prepare();
2279   graph.encode_execute();
2280 
2281   fill_vtensor(graph, a, 0, /*iota = */ true);
2282 
2283   graph.execute();
2284 
2285   EXTRACT_TENSOR(out);
2286   EXTRACT_TENSOR(a);
2287 
2288   // We will examine the results in the destination range.
2289   // The value at the corresponding coordinate should match between the source
2290   // and destination tensors. We loop through the range, calculate both the src and
2291   // dst index using the offsets, and compare the values in the extracted
2292   // vector. They should match.
2293   int n_idx = 0;
2294   // In each nested loop, the index ranges from dst_offset to dst_offset + range.
2295 
2296   for (int c_idx = 0; c_idx < 8; c_idx++) {
2297     for (int h_idx = 0; h_idx < 2; h_idx++) {
2298       for (int w_idx = 0; w_idx < 4; w_idx++) {
2299         auto dst_idx =
2300             get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx});
2301         auto src_idx =
2302             get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1});
2303 
2304         EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]);
2305       }
2306     }
2307   }
2308 }
2309 
TEST(VulkanComputeGraphTest,test_etvk_copy_channel_offset_int_node)2310 TEST(VulkanComputeGraphTest, test_etvk_copy_channel_offset_int_node) {
2311   GraphConfig config;
2312   ComputeGraph graph(config);
2313 
2314   int64_t n = 2;
2315   int64_t c = 12;
2316   int64_t h = 4;
2317   int64_t w = 8;
2318   utils::GPUMemoryLayout memory_layout =
2319       utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
2320 
2321   std::vector<int64_t> size = {n, c, h, w};
2322 
2323   IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout);
2324 
2325   IOValueRef out = {};
2326   out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout);
2327 
2328   int64_t src_offset = 2;
2329   int64_t dst_offset = 3;
2330   int64_t range = 7;
2331 
2332   ValueRef src_offset_ref = graph.add_scalar<int64_t>(src_offset);
2333   ValueRef dst_offset_ref = graph.add_scalar<int64_t>(dst_offset);
2334   ValueRef range_ref = graph.add_scalar<int64_t>(range);
2335 
2336   auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset");
2337   copyFn(
2338       graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value});
2339 
2340   out.staging = graph.set_output_tensor(out.value);
2341 
2342   graph.prepare();
2343   graph.encode_execute();
2344 
2345   fill_vtensor(graph, a, 0.0f, true);
2346 
2347   graph.execute();
2348 
2349   EXTRACT_TENSOR(out);
2350   EXTRACT_TENSOR(a);
2351 
2352   for (int n_idx = 0; n_idx < n; n_idx++) {
2353     for (int c_idx = 0; c_idx < range; c_idx++) {
2354       for (int h_idx = 0; h_idx < h; h_idx++) {
2355         for (int w_idx = 0; w_idx < w; w_idx++) {
2356           auto src_idx =
2357               get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx});
2358           auto dst_idx = get_buf_idx(
2359               graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx});
2360           EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]);
2361         }
2362       }
2363     }
2364   }
2365 }
2366 
TEST(VulkanComputeGraphTest,test_view_change_packing)2367 TEST(VulkanComputeGraphTest, test_view_change_packing) {
2368   std::vector<std::pair<utils::GPUMemoryLayout, utils::GPUMemoryLayout>>
2369       layout_pairs = {
2370           {utils::kWidthPacked, utils::kChannelsPacked},
2371           {utils::kWidthPacked, utils::kHeightPacked},
2372           {utils::kWidthPacked, utils::kWidthPacked},
2373           {utils::kHeightPacked, utils::kChannelsPacked},
2374           {utils::kHeightPacked, utils::kHeightPacked},
2375           {utils::kHeightPacked, utils::kHeightPacked},
2376           {utils::kChannelsPacked, utils::kChannelsPacked},
2377           {utils::kChannelsPacked, utils::kHeightPacked},
2378           {utils::kChannelsPacked, utils::kHeightPacked},
2379       };
2380 
2381   int64_t n = 3;
2382   int64_t c = 2;
2383   int64_t h = 2;
2384   int64_t w = 5;
2385   std::vector<int64_t> size = {n, c, h, w};
2386 
2387   for (auto layout_pair : layout_pairs) {
2388     GraphConfig config;
2389     ComputeGraph graph(config);
2390 
2391     IOValueRef in =
2392         graph.add_input_tensor(size, vkapi::kFloat, layout_pair.first);
2393 
2394     IOValueRef out = {};
2395     out.value = graph.add_tensor(size, vkapi::kFloat, layout_pair.second);
2396 
2397     auto viewFn = VK_GET_OP_FN("aten.view_copy.default");
2398     viewFn(graph, {in.value, graph.add_none(), out.value});
2399 
2400     out.staging = graph.set_output_tensor(out.value);
2401 
2402     graph.prepare();
2403     graph.encode_execute();
2404 
2405     fill_vtensor(graph, in, 0.0, true);
2406 
2407     graph.execute();
2408 
2409     EXTRACT_TENSOR(out);
2410 
2411     // The extracted data is a flattened nchw buffer. Hence, we should expect
2412     // all elements inside the out array to match their index.
2413     for (int i = 0; i < graph.get_tensor(out.value)->numel(); i++) {
2414       CHECK_VALUE(data_out, i, i);
2415     }
2416   }
2417 }
2418 
2419 class VulkanToFromGPUShaderTest : public ::testing::Test {
2420  public:
SetUp()2421   void SetUp() override {
2422     // Make sure we are starting with a clean slate
2423     EXPECT_TRUE(get_vma_allocation_count() == 0);
2424   }
2425 
TearDown()2426   void TearDown() override {
2427     context()->flush();
2428 
2429     // Make sure we are ending with a clean slate
2430     EXPECT_TRUE(get_vma_allocation_count() == 0);
2431   }
2432 };
2433 
2434 template <typename T>
run_from_gpu_test(std::vector<int64_t> & sizes,utils::GPUMemoryLayout memory_layout=utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,vkapi::ScalarType dtype=vkapi::kFloat,utils::StorageType storage_type=utils::StorageType::TEXTURE_3D)2435 void run_from_gpu_test(
2436     std::vector<int64_t>& sizes,
2437     utils::GPUMemoryLayout memory_layout =
2438         utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,
2439     vkapi::ScalarType dtype = vkapi::kFloat,
2440     utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) {
2441   if (dtype == vkapi::kHalf &&
2442       !context()->adapter_ptr()->supports_16bit_storage_buffers()) {
2443     return;
2444   }
2445   vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout);
2446 
2447   std::string kernel_name("idx_fill_texture");
2448   add_dtype_suffix(kernel_name, vten);
2449 
2450   int32_t offset = -50;
2451 
2452   {
2453     vkapi::PipelineBarrier pipeline_barrier{};
2454     context()->submit_compute_job(
2455         VK_KERNEL_FROM_STR(kernel_name),
2456         pipeline_barrier,
2457         vten.logical_limits(),
2458         {4, 4, 4},
2459         {vten.packed_dim(), offset},
2460         VK_NULL_HANDLE,
2461         0,
2462         vten.image(
2463             pipeline_barrier,
2464             vkapi::PipelineStage::COMPUTE,
2465             vkapi::MemoryAccessType::WRITE),
2466         vten.sizes_ubo());
2467   }
2468 
2469   StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel());
2470 
2471   if (dtype == vkapi::kChar &&
2472       !context()->adapter_ptr()->has_full_int8_buffers_support()) {
2473     record_bitw8_image_to_nchw_nobitw8buffer_op(
2474         context(), vten, staging_buffer);
2475   } else {
2476     record_image_to_nchw_op(context(), vten, staging_buffer.buffer());
2477   }
2478 
2479   submit_to_gpu();
2480 
2481   std::vector<T> data_out(staging_buffer.numel());
2482   staging_buffer.copy_to(data_out.data(), staging_buffer.nbytes());
2483 
2484   for (int i = 0; i < vten.numel(); i++) {
2485     CHECK_VALUE(data_out, i, i + offset);
2486   }
2487 }
2488 
2489 template <typename T>
round_trip_test(std::vector<int64_t> & sizes,utils::GPUMemoryLayout memory_layout=utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,vkapi::ScalarType dtype=vkapi::kFloat,utils::StorageType storage_type=utils::StorageType::TEXTURE_3D)2490 void round_trip_test(
2491     std::vector<int64_t>& sizes,
2492     utils::GPUMemoryLayout memory_layout =
2493         utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,
2494     vkapi::ScalarType dtype = vkapi::kFloat,
2495     utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) {
2496   if (dtype == vkapi::kHalf &&
2497       !context()->adapter_ptr()->supports_16bit_storage_buffers()) {
2498     return;
2499   }
2500 
2501   vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout);
2502 
2503   // Create and fill input staging buffer
2504   StagingBuffer staging_buffer_in(
2505       context(), dtype, vten.staging_buffer_numel());
2506 
2507   std::vector<T> data_in(staging_buffer_in.numel());
2508   for (int i = 0; i < staging_buffer_in.numel(); i++) {
2509     data_in[i] = T(i * -1);
2510   }
2511   staging_buffer_in.copy_from(data_in.data(), vten.staging_buffer_nbytes());
2512 
2513   // Output staging buffer
2514   StagingBuffer staging_buffer_out(
2515       context(), dtype, vten.staging_buffer_numel());
2516 
2517   record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten);
2518 
2519   // Copy data in and out of the tensor
2520   if (dtype == vkapi::kChar &&
2521       !context()->adapter_ptr()->has_full_int8_buffers_support()) {
2522     record_bitw8_image_to_nchw_nobitw8buffer_op(
2523         context(), vten, staging_buffer_out);
2524   } else {
2525     record_image_to_nchw_op(context(), vten, staging_buffer_out.buffer());
2526   }
2527 
2528   // Execute command buffer
2529   submit_to_gpu();
2530 
2531   // Extract data from output staging buffer
2532   std::vector<T> data_out(staging_buffer_out.numel());
2533   staging_buffer_out.copy_to(data_out.data(), staging_buffer_out.nbytes());
2534 
2535   // All indices should be equal to the input data
2536   for (int i = 0; i < vten.numel(); i++) {
2537     CHECK_VALUE(data_out, i, data_in[i]);
2538   }
2539 }
2540 
2541 template <typename T>
compute_graph_round_trip_test(std::vector<int64_t> & sizes,utils::GPUMemoryLayout memory_layout=utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,vkapi::ScalarType dtype=vkapi::kFloat,utils::StorageType storage_type=utils::StorageType::TEXTURE_3D)2542 void compute_graph_round_trip_test(
2543     std::vector<int64_t>& sizes,
2544     utils::GPUMemoryLayout memory_layout =
2545         utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,
2546     vkapi::ScalarType dtype = vkapi::kFloat,
2547     utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) {
2548   if (dtype == vkapi::kHalf &&
2549       !context()->adapter_ptr()->supports_16bit_storage_buffers()) {
2550     return;
2551   }
2552 
2553   GraphConfig config;
2554   ComputeGraph graph(config);
2555 
2556   ValueRef r_tensor =
2557       graph.add_tensor(sizes, dtype, storage_type, memory_layout);
2558   ValueRef r_staging_in = graph.set_input_tensor(r_tensor);
2559   ValueRef r_staging_out = graph.set_output_tensor(r_tensor);
2560 
2561   graph.prepare();
2562   graph.encode_execute();
2563 
2564   vTensorPtr tensor = graph.get_tensor(r_tensor);
2565 
2566   std::vector<T> data_in(tensor->numel());
2567   for (int i = 0; i < data_in.size(); i++) {
2568     data_in[i] = T(i * -1);
2569   }
2570   graph.copy_into_staging(r_staging_in, data_in.data(), data_in.size());
2571 
2572   graph.execute();
2573 
2574   std::vector<T> data_out(tensor->staging_buffer_numel());
2575   graph.copy_from_staging(r_staging_out, data_out.data(), data_out.size());
2576 
2577   for (int i = 0; i < data_in.size(); i++) {
2578     CHECK_VALUE(data_out, i, data_in[i]);
2579   }
2580 }
2581 
TEST(VulkanToFromGPUShaderTest,round_trip_tests)2582 TEST(VulkanToFromGPUShaderTest, round_trip_tests) {
2583   // The below tests will fill each texel element with the value of the linear
2584   // buffer index that corresponds to it. The texel at position (0, 0, 0) will
2585   // be filled with the values [0, 1, 2, 3], the texel at position (1, 0, 0)
2586   // will be filled with the values [4, 5, 6, 7], and so forth. The contents of
2587   // the texture are then written back to the CPU, and to check that the
2588   // transfer has been performed correctly, the value at each index of the CPU
2589   // data buffer should be equal to the index.
2590   //
2591   // The below test cases should ensure that the total number of elements does
2592   // not exceed 2048, or else the tests will fail for FP16 textures due to
2593   // precision issues. Half precision floating point formats can only represent
2594   // integers from 2048 to 4096 using intervals of 2.
2595   std::vector<std::vector<int64_t>> to_test = {
2596       // 2D sizes
2597       {17, 21},
2598       {67, 23},
2599       {55, 33},
2600       // 3D sizes
2601       {7, 9, 13},
2602       {21, 2, 19},
2603       {17, 17, 5},
2604       // 4D sizes
2605       {7, 3, 13, 7},
2606       {11, 9, 9, 1},
2607       {3, 3, 3, 3},
2608       {3, 1, 7, 13},
2609   };
2610 
2611   // These sizes are set such that the total number of elements is less than
2612   // 128, since 127 is the largest value representable by int8.
2613   std::vector<std::vector<int64_t>> to_test_int8 = {
2614       // 2D sizes
2615       {14, 7},
2616       // 3D sizes
2617       {3, 7, 5},
2618       {4, 2, 11},
2619       // 4D sizes
2620       {3, 3, 3, 3},
2621       {7, 1, 6, 3},
2622   };
2623 
2624 #define RUN_TESTS(ctype, dtype)                                      \
2625   round_trip_test<ctype>(                                            \
2626       sizes, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, dtype); \
2627   round_trip_test<ctype>(                                            \
2628       sizes, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, dtype);    \
2629   round_trip_test<ctype>(                                            \
2630       sizes, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, dtype);   \
2631   compute_graph_round_trip_test<ctype>(                              \
2632       sizes, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, dtype); \
2633   compute_graph_round_trip_test<ctype>(                              \
2634       sizes, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, dtype);    \
2635   compute_graph_round_trip_test<ctype>(                              \
2636       sizes, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, dtype);
2637 
2638   for (auto& sizes : to_test) {
2639     RUN_TESTS(float, vkapi::kFloat)
2640     RUN_TESTS(executorch::aten::Half, vkapi::kHalf)
2641   }
2642 
2643   for (auto& sizes : to_test_int8) {
2644     RUN_TESTS(int8_t, vkapi::kChar);
2645   }
2646 
2647 #undef RUN_TESTS
2648 }
2649 
2650 //
2651 // Operator Smoke Tests
2652 //
2653 
test_binary_op(std::string op_name,std::vector<int64_t> sizes_big,std::vector<int64_t> sizes_small,vkapi::ScalarType dtype,utils::GPUMemoryLayout memory_layout)2654 void test_binary_op(
2655     std::string op_name,
2656     std::vector<int64_t> sizes_big,
2657     std::vector<int64_t> sizes_small,
2658     vkapi::ScalarType dtype,
2659     utils::GPUMemoryLayout memory_layout) {
2660   GraphConfig config;
2661   ComputeGraph graph(config);
2662 
2663   IOValueRef arg2{};
2664 
2665   // Build graph
2666 
2667   IOValueRef arg1 = graph.add_input_tensor(sizes_big, dtype, memory_layout);
2668   arg2 = graph.add_input_tensor(sizes_small, dtype, memory_layout);
2669 
2670   IOValueRef out;
2671   out.value = graph.add_tensor(sizes_big, dtype, memory_layout);
2672 
2673   std::stringstream ss;
2674   ss << "aten.";
2675   ss << op_name;
2676   ss << ".Tensor";
2677   VK_GET_OP_FN(ss.str())
2678   (graph, {arg1.value, arg2.value, kDummyValueRef, out.value});
2679 
2680   out.staging = graph.set_output_tensor(out.value);
2681 
2682   graph.prepare();
2683   graph.encode_prepack();
2684   graph.prepack();
2685   graph.encode_execute();
2686 
2687   for (int i = 1; i < 4; i++) {
2688     float val_arg1 = i + 1.5;
2689     float val_arg2 = i - 3.5;
2690 
2691     float val_out = val_arg1 + val_arg2;
2692     if (op_name == "sub") {
2693       val_out = val_arg1 - val_arg2;
2694     }
2695     if (op_name == "mul") {
2696       val_out = val_arg1 * val_arg2;
2697     }
2698     if (op_name == "div") {
2699       val_out = val_arg1 / val_arg2;
2700     }
2701 
2702     execute_graph_and_check_output(graph, {val_arg1, val_arg2}, {val_out});
2703   }
2704 }
2705 
2706 #define CALL_TEST_FN_FORALL_CONDITIONS(_)                   \
2707   _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked)  \
2708   _(vkapi::kFloat, utils::kTexture3D, utils::kHeightPacked) \
2709   _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked)
2710 
2711 #define CALL_TEST_FN_FOR_W_PACKED(_)                              \
2712   _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, false) \
2713   _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, true)  \
2714   _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, false)    \
2715   _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, true)
2716 
2717 #define CALL_TEST_FN_FOR_C_PACKED(_)                                 \
2718   _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, false) \
2719   _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, true)  \
2720   _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, false)    \
2721   _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, true)
2722 
TEST(VulkanComputeGraphOpsTest,add_smoke_test)2723 TEST(VulkanComputeGraphOpsTest, add_smoke_test) {
2724 #define RUN_TESTS(dtype, storage, layout)                         \
2725   test_binary_op("add", {17, 21}, {17, 21}, dtype, layout);       \
2726   test_binary_op("add", {17, 21}, {1, 1}, dtype, layout);         \
2727   test_binary_op("sub", {11, 22}, {11, 22}, dtype, layout);       \
2728   test_binary_op("sub", {11, 22}, {11, 1}, dtype, layout);        \
2729   test_binary_op("add", {7, 17, 17}, {7, 17, 17}, dtype, layout); \
2730   test_binary_op("add", {7, 17, 17}, {7, 1, 17}, dtype, layout);  \
2731   test_binary_op("sub", {9, 9, 7}, {9, 9, 7}, dtype, layout);     \
2732   test_binary_op("sub", {9, 9, 7}, {9, 1, 1}, dtype, layout);
2733 
2734   CALL_TEST_FN_FORALL_CONDITIONS(RUN_TESTS);
2735 
2736 #undef RUN_TESTS
2737 }
2738 
test_mm(int B,int M,int K,int N,vkapi::ScalarType dtype,utils::StorageType storage_type,utils::GPUMemoryLayout memory_layout,bool prepack=true)2739 void test_mm(
2740     int B,
2741     int M,
2742     int K,
2743     int N,
2744     vkapi::ScalarType dtype,
2745     utils::StorageType storage_type,
2746     utils::GPUMemoryLayout memory_layout,
2747     bool prepack = true) {
2748   GraphConfig config;
2749   config.set_storage_type_override(storage_type);
2750   ComputeGraph graph(config);
2751 
2752   std::vector<int64_t> mat1_size = {M, K};
2753   std::vector<int64_t> mat2_size = {K, N};
2754   std::vector<int64_t> out_size = {M, N};
2755   if (B > 1) {
2756     mat1_size.resize(3);
2757     mat1_size = {B, M, K};
2758     mat2_size.resize(3);
2759     mat2_size = {B, K, N};
2760     out_size.resize(3);
2761     out_size = {B, M, N};
2762   }
2763 
2764   IOValueRef mat2{};
2765 
2766   CREATE_WEIGHT_TENSOR(mat2_w, mat2_size, dtype, 2.0f);
2767 
2768   // Build graph
2769 
2770   IOValueRef mat1 = graph.add_input_tensor(mat1_size, dtype, memory_layout);
2771 
2772   if (prepack) {
2773     mat2.value = mat2_w;
2774   } else {
2775     mat2.value = graph.add_tensor(mat2_size, dtype, memory_layout);
2776     mat2.staging = graph.set_input_tensor(mat2.value);
2777   }
2778 
2779   IOValueRef out;
2780   out.value = graph.add_tensor(out_size, dtype, memory_layout);
2781 
2782   VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2.value, out.value});
2783 
2784   out.staging = graph.set_output_tensor(out.value);
2785 
2786   graph.prepare();
2787   graph.encode_prepack();
2788   graph.prepack();
2789   graph.encode_execute();
2790 
2791   for (int i = 1; i < 4; i++) {
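    // mat1 is filled with a constant and mat2 with a constant (2.0f when
    // prepacked), so every output element is a dot product of K identical
    // terms: out = K * val_mat1 * val_mat2.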
2792     if (prepack) {
2793       float val_mat1 = i;
2794       float val_out = K * (val_mat1 * 2.0f);
2795       execute_graph_and_check_output(graph, {val_mat1}, {val_out});
2796     } else {
2797       float val_mat1 = i;
2798       float val_mat2 = i + 1;
2799       float val_out = K * (val_mat1 * val_mat2);
2800       execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out});
2801     }
2802   }
2803 }
2804 
TEST(VulkanComputeGraphOpsTest,mm_smoke_test)2805 TEST(VulkanComputeGraphOpsTest, mm_smoke_test) {
2806 #define RUN_TESTS(dtype, storage_type, layout, prepack) \
2807   test_mm(                                              \
2808       /*B = */ 1,                                       \
2809       /*M = */ 31,                                      \
2810       /*K = */ 127,                                     \
2811       /*N = */ 23,                                      \
2812       dtype,                                            \
2813       storage_type,                                     \
2814       layout,                                           \
2815       prepack);                                         \
2816   test_mm(                                              \
2817       /*B = */ 5,                                       \
2818       /*M = */ 31,                                      \
2819       /*K = */ 127,                                     \
2820       /*N = */ 23,                                      \
2821       dtype,                                            \
2822       storage_type,                                     \
2823       layout,                                           \
2824       prepack);                                         \
2825   test_mm(                                              \
2826       /*B = */ 7,                                       \
2827       /*M = */ 13,                                      \
2828       /*K = */ 89,                                      \
2829       /*N = */ 17,                                      \
2830       dtype,                                            \
2831       storage_type,                                     \
2832       layout,                                           \
2833       prepack);                                         \
2834   test_mm(                                              \
2835       /*B = */ 1,                                       \
2836       /*M = */ 13,                                      \
2837       /*K = */ 89,                                      \
2838       /*N = */ 17,                                      \
2839       dtype,                                            \
2840       storage_type,                                     \
2841       layout,                                           \
2842       prepack);
2843 
2844   CALL_TEST_FN_FOR_W_PACKED(RUN_TESTS);
2845   CALL_TEST_FN_FOR_C_PACKED(RUN_TESTS);
2846 
2847 #undef RUN_TESTS
2848 }
2849 
test_max_pool2d(const std::vector<int64_t> & in_size,const int64_t base_val,std::vector<int64_t> & kernel)2850 void test_max_pool2d(
2851     const std::vector<int64_t>& in_size,
2852     const int64_t base_val,
2853     std::vector<int64_t>& kernel) {
2854   GraphConfig config;
2855   ComputeGraph graph(config);
2856 
2857   // Build graph
2858 
2859   std::vector<int64_t> out_size(in_size);
2860   int h = in_size.size() - 2;
2861   int w = in_size.size() - 1;
2862   out_size[h] = in_size[h] - kernel[0] + 1;
2863   out_size[w] = in_size[w] - kernel[1] + 1;
2864 
2865   IOValueRef in_ioval = graph.add_input_tensor(
2866       in_size, vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
2867   IOValueRef out_ioval;
2868   out_ioval.value = graph.add_tensor(
2869       out_size, vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
2870   IOValueRef idx_ioval;
2871   idx_ioval.value = graph.add_tensor(
2872       out_size, vkapi::kInt, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
2873   ValueRef out = graph.add_value_list({out_ioval.value, idx_ioval.value});
2874 
2875   std::vector<int64_t> kernel_copy(kernel);
2876   VK_GET_OP_FN("aten.max_pool2d_with_indices.default")
2877   (graph,
2878    {in_ioval.value,
2879     graph.add_scalar_list<int64_t>(std::move(kernel)),
2880     graph.add_scalar_list<int64_t>({1, 1}),
2881     graph.add_scalar_list<int64_t>({0, 0}),
2882     graph.add_scalar_list<int64_t>({1, 1}),
2883     graph.add_scalar(false),
2884     out});
2885 
2886   out_ioval.staging = graph.set_output_tensor(out_ioval.value);
2887   idx_ioval.staging = graph.set_output_tensor(idx_ioval.value);
2888 
2889   graph.prepare();
2890   graph.encode_prepack();
2891   graph.prepack();
2892   graph.encode_execute();
2893 
2894   // Run graph
2895 
2896   fill_vtensor(graph, graph.inputs().at(0), base_val, /*iota = */ true);
2897 
2898   vTensorPtr t_in = graph.get_tensor(in_ioval.value);
2899   std::vector<float> input_data(t_in->staging_buffer_numel());
2900   graph.copy_from_staging(
2901       in_ioval.staging, input_data.data(), input_data.size());
2902 
2903   graph.execute();
2904 
2905   vTensorPtr t_out = graph.get_tensor(out_ioval.value);
2906   std::vector<float> output_data(t_out->staging_buffer_numel());
2907   graph.copy_from_staging(
2908       out_ioval.staging, output_data.data(), output_data.size());
2909   vTensorPtr t_idx = graph.get_tensor(idx_ioval.value);
2910   std::vector<int> index_data(t_idx->staging_buffer_numel());
2911   graph.copy_from_staging(
2912       idx_ioval.staging, index_data.data(), index_data.size());
2913 
2914   // Check results
2915 
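  // The input is filled with an increasing sequence (iota), so the maximum of
  // each pooling window is its bottom-right element; the expected flat index is
  // therefore (i + kernel_copy[0] - 1) * w_in + (j + kernel_copy[1] - 1).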
2916   int h_offset = kernel_copy[0] - 1;
2917   int w_offset = kernel_copy[1] - 1;
2918   int h_out = utils::val_at(-2, t_out->sizes());
2919   int w_out = utils::val_at(-1, t_out->sizes());
2920   int w_in = utils::val_at(-1, t_in->sizes());
2921   for (size_t i = 0; i < h_out; ++i) {
2922     for (size_t j = 0; j < w_out; ++j) {
2923       size_t idx_out = i * w_out + j;
2924       size_t idx_in = (i + h_offset) * w_in + (j + w_offset);
2925       CHECK_VALUE(index_data, idx_out, idx_in);
2926       CHECK_VALUE(output_data, idx_out, input_data[idx_in]);
2927     }
2928   }
2929 }
2930 
TEST(VulkanComputeGraphOpsTest,max_pool2d_smoke_test)2931 TEST(VulkanComputeGraphOpsTest, max_pool2d_smoke_test) {
2932   std::vector<int64_t> kernel = {2, 3};
2933   test_max_pool2d(
2934       /*in_size = */ {1, 4, 6},
2935       /*base_val = */ 10.0f,
2936       kernel);
2937 }
2938 
test_conv2d(const std::vector<int64_t> & original_sizes,const std::vector<int64_t> & padded_sizes,const std::vector<int64_t> & gpu_sizes,const bool transposed,const std::vector<float> & data_out_expected)2939 void test_conv2d(
2940     const std::vector<int64_t>& original_sizes,
2941     const std::vector<int64_t>& padded_sizes,
2942     const std::vector<int64_t>& gpu_sizes,
2943     const bool transposed,
2944     const std::vector<float>& data_out_expected) {
2945   vTensor vten = vTensor(
2946       context(),
2947       gpu_sizes,
2948       vkapi::kFloat,
2949       utils::StorageType::TEXTURE_2D,
2950       utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
2951 
2952   // Create and fill input staging buffer
2953   const int64_t in_numel = utils::multiply_integers(original_sizes);
2954   StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel);
2955 
2956   std::vector<float> data_in(in_numel);
2957   for (int i = 0; i < in_numel; i++) {
2958     data_in[i] = i + 1;
2959   }
2960   staging_buffer_in.copy_from(data_in.data(), sizeof(float) * in_numel);
2961 
2962   // Output staging buffer
2963   const int64_t out_numel =
2964       padded_sizes[0] * padded_sizes[1] * original_sizes[2] * original_sizes[3];
2965   StagingBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel);
2966 
2967   // Copy data in and out of the tensor
2968   record_conv2d_prepack_weights_op(
2969       context(), staging_buffer_in.buffer(), vten, original_sizes, transposed);
2970   record_image_to_nchw_op(context(), vten, staging_buffer_out.buffer());
2971 
2972   // Execute command buffer
2973   submit_to_gpu();
2974 
2975   // Extract data from output staging buffer
2976   std::vector<float> data_out(out_numel);
2977   staging_buffer_out.copy_to(data_out.data(), sizeof(float) * out_numel);
2978 
2979   // Check data matches results copied from ATen-VK
2980   for (int i = 0; i < vten.numel(); i++) {
2981     CHECK_VALUE(data_out, i, data_out_expected[i]);
2982   }
2983 }
2984 
TEST(VulkanComputeGraphOpsTest,conv2d_prepack_test)2985 TEST(VulkanComputeGraphOpsTest, conv2d_prepack_test) {
2986   test_conv2d(
2987       /*original_sizes = */ {2, 3, 1, 2},
2988       /*padded_sizes = */ {4, 4},
2989       /*gpu_sizes = */ {4, 1, 8},
2990       /*transposed = */ false,
2991       /*data_out_expected = */ {1, 3, 5,  0,  2, 4, 6, 0, 7, 9, 11,
2992                                 0, 8, 10, 12, 0, 0, 0, 0, 0, 0, 0,
2993                                 0, 0, 0,  0,  0, 0, 0, 0, 0, 0});
2994   test_conv2d(
2995       /*original_sizes = */ {2, 3, 1, 2},
2996       /*padded_sizes = */ {4, 4},
2997       /*gpu_sizes = */ {4, 1, 8},
2998       /*transposed = */ true,
2999       /*data_out_expected = */ {2, 8, 0, 0, 1, 7, 0,  0, 4, 10, 0,
3000                                 0, 3, 9, 0, 0, 6, 12, 0, 0, 5,  11,
3001                                 0, 0, 0, 0, 0, 0, 0,  0, 0, 0});
3002 }
3003 
test_grid_priors(std::vector<int64_t> input_sizes,std::vector<int64_t> output_sizes,int stride,double offset,const std::vector<float> & data_out_expected)3004 void test_grid_priors(
3005     std::vector<int64_t> input_sizes,
3006     std::vector<int64_t> output_sizes,
3007     int stride,
3008     double offset,
3009     const std::vector<float>& data_out_expected) {
3010   GraphConfig config;
3011   ComputeGraph graph(config);
3012 
3013   // Build graph
3014   IOValueRef in = graph.add_input_tensor(
3015       input_sizes,
3016       vkapi::kFloat,
3017       utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
3018   IOValueRef out;
3019   out.value = graph.add_tensor(
3020       output_sizes,
3021       vkapi::kFloat,
3022       utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
3023 
3024   VK_GET_OP_FN("et_vk.grid_priors.default")
3025   (graph,
3026    {in.value,
3027     graph.add_scalar<int64_t>(stride),
3028     graph.add_scalar<double>(offset),
3029     out.value});
3030 
3031   out.staging = graph.set_output_tensor(out.value);
3032 
3033   graph.prepare();
3034   graph.encode_prepack();
3035   graph.prepack();
3036   graph.encode_execute();
3037 
3038   vTensorPtr t_in = graph.get_tensor(in.value);
3039   vTensorPtr t_out = graph.get_tensor(out.value);
3040   // Resize input
3041   graph.propagate_resize();
3042 
3043   // run graph
3044   graph.execute();
3045 
3046   std::vector<float> output_data(t_out->staging_buffer_numel());
3047   graph.copy_from_staging(out.staging, output_data.data(), output_data.size());
3048 
3049   // check results
3050   int h_out = utils::val_at(-2, t_out->sizes());
3051   int w_out = utils::val_at(-1, t_out->sizes());
3052   for (size_t i = 0; i < h_out; ++i) {
3053     for (size_t j = 0; j < w_out; ++j) {
3054       size_t idx_out = i * w_out + j;
3055       CHECK_VALUE(output_data, idx_out, data_out_expected[idx_out]);
3056     }
3057   }
3058 }
3059 
TEST(VulkanComputeGraphOpsTest,grid_priors_test)3060 TEST(VulkanComputeGraphOpsTest, grid_priors_test) {
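  // Each input cell (row i, col j) is expected to produce one (x, y) anchor
  // center with x = (j + offset) * stride and y = (i + offset) * stride,
  // flattened row-major into an [H * W, 2] output (inferred from the expected
  // data below).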
3061   test_grid_priors(
3062       /*input size = */ {1, 5, 2, 3},
3063       /*output size = */ {6, 2},
3064       /*stride = */ 1,
3065       /*offset = */ 0.0,
3066       /*data_out_expected = */ {0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 2, 1});
3067 
3068   test_grid_priors(
3069       /*input size = */ {1, 5, 2, 3},
3070       /*output size = */ {6, 2},
3071       /*stride = */ 8,
3072       /*offset = */ 0.5,
3073       /*data_out_expected = */ {4, 4, 12, 4, 20, 4, 4, 12, 12, 12, 20, 12});
3074 }
3075 
test_transpose_view_mm(const int B,const int M,const int K,const int N,utils::StorageType storage_type)3076 void test_transpose_view_mm(
3077     const int B,
3078     const int M,
3079     const int K,
3080     const int N,
3081     utils::StorageType storage_type) {
3082   GraphConfig config;
3083   config.set_storage_type_override(storage_type);
3084   ComputeGraph graph(config);
3085 
3086   std::vector<int64_t> mat1_size = {M, K};
3087   std::vector<int64_t> mat2_t_size = {N, K};
3088   std::vector<int64_t> out_size = {M, N};
3089 
3090   std::vector<int64_t> mat1_small_size = {M - 4, K - 3};
3091   std::vector<int64_t> mat2_t_small_size = {N - 1, K - 3};
3092 
3093   if (B > 1) {
3094     mat1_size.resize(3);
3095     mat1_size = {B, M, K};
3096     mat2_t_size.resize(3);
3097     mat2_t_size = {B, N, K};
3098     out_size.resize(3);
3099     out_size = {B, M, N};
3100 
3101     mat1_small_size.resize(3);
3102     mat1_small_size = {B, M - 4, K - 3};
3103     mat2_t_small_size.resize(3);
3104     mat2_t_small_size = {B, N - 1, K - 3};
3105   }
3106 
3107   // Build graph; use shared objects to test views of shared objects
3108 
3109   IOValueRef mat1 =
3110       graph.add_input_tensor(mat1_size, vkapi::kFloat, utils::kWidthPacked, 0);
3111   IOValueRef mat2_transpose = graph.add_input_tensor(
3112       mat2_t_size, vkapi::kFloat, utils::kWidthPacked, 1);
3113 
3114   ValueRef mat2 = graph.add_tensor_view(mat2_transpose.value);
3115 
3116   ValueRef dim0;
3117   ValueRef dim1;
3118 
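  // aten.transpose.int below swaps the last two dims of mat2_transpose:
  // dims (1, 2) for a batched (B, N, K) input, dims (0, 1) for a 2D (N, K)
  // input, yielding a (.., K, N) view for the matmul below.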
3119   if (B > 1) {
3120     dim0 = graph.add_scalar<int64_t>(1);
3121     dim1 = graph.add_scalar<int64_t>(2);
3122   } else {
3123     dim0 = graph.add_scalar<int64_t>(0);
3124     dim1 = graph.add_scalar<int64_t>(1);
3125   }
3126 
3127   IOValueRef out;
3128   out.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kWidthPacked, 2);
3129 
3130   VK_GET_OP_FN("aten.transpose.int")
3131   (graph, {mat2_transpose.value, dim0, dim1, mat2});
3132   VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2, out.value});
3133 
3134   out.staging = graph.set_output_tensor(out.value);
3135 
3136   graph.prepare();
3137   graph.encode_prepack();
3138   graph.prepack();
3139   graph.encode_execute();
3140 
3141   for (int i = 1; i < 4; i++) {
3142     float val_mat1 = i;
3143     float val_mat2 = i + 1;
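    // With mat1 filled entirely with val_mat1 and mat2 filled with val_mat2,
    // each output element is a sum of K identical products, so the expected
    // value is K * val_mat1 * val_mat2.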
3144     float val_out = K * (val_mat1 * val_mat2);
3145 
3146     // Try at full size
3147     graph.resize_input(0, mat1_size);
3148     graph.resize_input(1, mat2_t_size);
3149     graph.propagate_resize();
3150     execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out});
3151 
3152     // Try at reduced sizes
3153     val_out = (K - 3) * (val_mat1 * val_mat2);
3154     graph.resize_input(0, mat1_small_size);
3155     graph.resize_input(1, mat2_t_small_size);
3156     graph.propagate_resize();
3157     execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out});
3158   }
3159 }
3160 
3161 TEST(VulkanComputeGraphOpsTest, test_transpose_with_mm) {
3162   for (auto storage_type : {utils::kBuffer, utils::kTexture3D}) {
3163     test_transpose_view_mm(2, 7, 17, 5, storage_type);
3164   }
3165 }
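
// Illustrative sketch (not part of the original file): a CPU reference for
// what the graph above computes. The second operand is supplied transposed as
// an (N, K) matrix, so the reference reads it with swapped indices. The helper
// name is hypothetical.
std::vector<float> reference_mm_transposed_rhs(
    const std::vector<float>& mat1, // (M, K), row-major
    const std::vector<float>& mat2_t, // (N, K), row-major
    const int M,
    const int K,
    const int N) {
  std::vector<float> out(M * N, 0.0f);
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      for (int k = 0; k < K; ++k) {
        out[m * N + n] += mat1[m * K + k] * mat2_t[n * K + k];
      }
    }
  }
  return out;
}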
3166 
3167 void test_to_copy() {
3168   GraphConfig config;
3169   config.set_storage_type_override(utils::kTexture3D);
3170   ComputeGraph graph(config);
3171   int M = 8;
3172   int N = 8;
3173   int K = 8;
3174   // Build graph
3175   IOValueRef in = graph.add_input_tensor(
3176       {1, M, N, K},
3177       vkapi::kFloat,
3178       utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
3179 
3180   std::vector<float> data_in =
3181       create_random_float_buffer(M * N * K, -1024, 1024);
3182   graph.copy_into_staging(in.staging, data_in.data(), data_in.size());
3183 
3184   IOValueRef out;
3185   out.value = graph.add_tensor(
3186       {1, M, N, K},
3187       vkapi::kHalf,
3188       utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
3189 
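  // The six None arguments presumably map to _to_copy's optional parameters
  // (dtype, layout, device, pin_memory, non_blocking, memory_format); the
  // target dtype here is implied by the kHalf output tensor.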
3190   auto op = VK_GET_OP_FN("aten._to_copy.default");
3191   op(graph,
3192      {in.value,
3193       graph.add_none(),
3194       graph.add_none(),
3195       graph.add_none(),
3196       graph.add_none(),
3197       graph.add_none(),
3198       graph.add_none(),
3199       out.value});
3200 
3201   out.staging = graph.set_output_tensor(out.value);
3202 
3203   graph.prepare();
3204   graph.encode_prepack();
3205   graph.prepack();
3206   graph.encode_execute();
3207   graph.propagate_resize();
3208   graph.execute();
3209 
3210   std::vector<torch::executor::Half> output_data(graph.numel_of(out.value));
3211   graph.copy_from_staging(out.staging, output_data.data(), output_data.size());
3212 
3213   EXPECT_EQ(data_in.size(), output_data.size());
3214 
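  // Accumulate the mean squared error of both fp16 conversions relative to the
  // original fp32 input: mse_ex for the CPU torch::executor::Half cast and
  // mse_vk for the value produced by the Vulkan graph.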
3215   float mse_ex = 0.0f;
3216   float mse_vk = 0.0f;
3217 
3218   // check results
3219   for (size_t i = 0; i < output_data.size(); ++i) {
3220     float input = data_in[i];
3221     torch::executor::Half expected_output =
3222         static_cast<torch::executor::Half>(input);
3223     uint16_t* expected_bits = reinterpret_cast<uint16_t*>(&expected_output);
3224     torch::executor::Half output = output_data[i];
3225     uint16_t* output_bits = reinterpret_cast<uint16_t*>(&output);
3226 
3227     std::string msg;
3228     msg.reserve(64);
3229     msg = "input = " + std::to_string(input) + "(0b" +
3230         std::bitset<32>(*reinterpret_cast<uint32_t*>(&input)).to_string() +
3231         "), expected output = " + std::to_string(expected_output) + "(0b" +
3232         std::bitset<16>(*expected_bits).to_string() +
3233         "), received output = " + std::to_string(output) + "(0b" +
3234         std::bitset<16>(*output_bits).to_string() + ")";
3235 
3236     std::cout << msg << std::endl;
3237 
3238     // Note: Torch executor half "rounds up" when converting to fp16, whereas
3239     // most driver implementations of Vulkan's OpFConvert simply truncate the
3240     // extra mantissa bits for performance (rounding introduces a conditional).
3241     // Example:
3242     // INPUT F32 = 25.248 (sign{0b0}, exp{0b10000011},
3243     // mantissa{0b10010011111101111100111}),
3244     // TORCH HALF OUTPUT F16 = 25.25 (sign{0b0}, exp{0b10011},
3245     // mantissa{0b1001010000}),
3246     // VULKAN OUTPUT F16 = 25.2344 (sign{0b0}, exp{0b10011},
3247     // mantissa{0b1001001111})
3248     // Note:
3249     // The vulkan mantissa exactly matches the first 10
3250     // bits of the input 23 bit mantissa. But since the 11th bit is 1, the
3251     // torch half output is rounded up (essentially adding a 1).
3252     // Vulkan mantissa{0b1001001111} + 1 = Torch half mantissa{0b1001010000}
3253 
3254     EXPECT_TRUE(
3255         (*output_bits == *expected_bits) ||
3256         /*rounding error*/ ((*output_bits + 1u) == *expected_bits));
3257     mse_ex += std::pow(expected_output - input, 2);
3258     mse_vk += std::pow(output - input, 2);
3259   }
3260 
3261   mse_ex /= output_data.size();
3262   mse_vk /= output_data.size();
3263   std::cout << "========================================================="
3264             << std::endl;
3265   std::cout << "mse_ex = " << mse_ex << ", mse_vk = " << mse_vk << std::endl;
3266 }
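
// Illustrative sketch (not part of the original file): the tolerance applied
// in the loop above, factored into a standalone predicate. It accepts either
// an exact bit match or the truncate-vs-round-up case where the expected half
// is exactly one representable value above the produced one. The function name
// is hypothetical.
bool half_bits_equal_or_one_ulp_below(uint16_t produced, uint16_t expected) {
  return produced == expected || produced + 1u == expected;
}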
3267 
3268 TEST(VulkanComputeGraphOpsTest, test_to_copy) {
3269   if (context()->adapter_ptr()->supports_16bit_storage_buffers()) {
3270     test_to_copy();
3271   }
3272 }
3273