1 /*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9 #include <gtest/gtest.h>
10
11 #include <bitset>
12 #include <utility>
13 #include <vector>
14
15 #include <executorch/runtime/core/exec_aten/exec_aten.h>
16
17 #include <executorch/backends/vulkan/runtime/api/api.h>
18
19 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
20
21 #include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
22
23 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
24
25 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h>
26
27 #include <executorch/backends/vulkan/test/utils/test_utils.h>
28
29 #include <executorch/backends/vulkan/runtime/graph/ops/DispatchNode.h>
30
31 using namespace vkcompute;
32 using namespace vkcompute::api;
33
34 std::vector<float>
transpose_matrix(std::vector<float> & mat,const int H,const int W)35 transpose_matrix(std::vector<float>& mat, const int H, const int W) {
36 std::vector<float> out(W * H);
37 for (int out_y = 0; out_y < H; ++out_y) {
38 for (int out_x = 0; out_x < W; ++out_x) {
39 out[out_x * H + out_y] = mat[out_y * W + out_x];
40 }
41 }
42 return out;
43 }
44
compute_reference_matmul(std::vector<float> & mat1,std::vector<float> & mat2,const int M,const int K,const int N)45 std::vector<float> compute_reference_matmul(
46 std::vector<float>& mat1,
47 std::vector<float>& mat2,
48 const int M,
49 const int K,
50 const int N) {
51 std::vector<float> out(M * N);
52 for (int out_y = 0; out_y < M; ++out_y) {
53 for (int out_x = 0; out_x < N; ++out_x) {
54 out[out_y * N + out_x] = 0;
55 for (int k = 0; k < K; ++k) {
56 out[out_y * N + out_x] += mat1[out_y * K + k] * mat2[k * N + out_x];
57 }
58 }
59 }
60 return out;
61 }
62
63 std::vector<std::vector<int64_t>> standard_sizes_to_test = {
64 // 2D
65 {7, 11},
66 {13, 6},
67 // 3D
68 {2, 9, 7},
69 {9, 15, 19},
70 {7, 11, 24},
71 {13, 8, 11},
72 {12, 11, 19},
73 // 4D
74 {2, 2, 3, 5},
75 {9, 13, 11, 17},
76 {17, 14, 18, 20},
77 {7, 13, 12, 21},
78 {3, 8, 13, 17},
79 };
80
81 //
82 // Compute API Tests
83 //
84
85 class VulkanComputeAPITest : public ::testing::Test {
86 public:
SetUp()87 void SetUp() override {
88 // Make sure we are starting with a clean slate
89 EXPECT_TRUE(get_vma_allocation_count() == 0);
90 }
91
TearDown()92 void TearDown() override {
93 context()->flush();
94
95 // Make sure we are ending with a clean slate
96 EXPECT_TRUE(get_vma_allocation_count() == 0);
97 }
98 };
99
TEST_F(VulkanComputeAPITest,print_adapter)100 TEST_F(VulkanComputeAPITest, print_adapter) {
101 std::cout << *(context()->adapter_ptr()) << std::endl;
102 }
103
get_reference_strides(const std::vector<int64_t> & sizes,const utils::GPUMemoryLayout layout,const bool unsqueezed=false)104 std::vector<int64_t> get_reference_strides(
105 const std::vector<int64_t>& sizes,
106 const utils::GPUMemoryLayout layout,
107 const bool unsqueezed = false) {
108 int64_t C = utils::val_at(-3, sizes);
109 int64_t H = utils::val_at(-2, sizes);
110 int64_t W = utils::val_at(-1, sizes);
111
112 int64_t numel = utils::multiply_integers(sizes);
113
114 switch (layout) {
115 case utils::kWidthPacked:
116 switch (sizes.size()) {
117 case 1:
118 if (unsqueezed)
119 return {numel, numel, numel, 1};
120 return {1};
121 case 2:
122 if (unsqueezed)
123 return {numel, numel, W, 1};
124 return {W, 1};
125 case 3:
126 if (unsqueezed)
127 return {numel, H * W, W, 1};
128 return {H * W, W, 1};
129 case 4:
130 return {C * H * W, H * W, W, 1};
131 default:
132 return {};
133 }
134 break;
135 case utils::kHeightPacked:
136 switch (sizes.size()) {
137 case 1:
138 if (unsqueezed)
139 return {numel, numel, numel, 1};
140 return {1};
141 case 2:
142 if (unsqueezed)
143 return {numel, numel, 1, H};
144 return {1, H};
145 case 3:
146 if (unsqueezed)
147 return {numel, H * W, 1, H};
148 return {W * H, 1, H};
149 case 4:
150 return {C * W * H, W * H, 1, H};
151 default:
152 return {};
153 }
154 case utils::kChannelsPacked:
155 switch (sizes.size()) {
156 case 1:
157 if (unsqueezed)
158 return {numel, numel, numel, 1};
159 return {1};
160 case 2:
161 if (unsqueezed)
162 return {numel, numel, W, 1};
163 return {W, 1};
164 case 3:
165 if (unsqueezed)
166 return {numel, 1, W * C, C};
167 return {1, W * C, C};
168 case 4:
169 return {H * W * C, 1, W * C, C};
170 default:
171 return {};
172 }
173 }
174 return {};
175 }
176
TEST_F(VulkanComputeAPITest,empty_init_shader_info_test)177 TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) {
178 vkapi::ShaderInfo empty_shader_info;
179 EXPECT_FALSE(empty_shader_info);
180 EXPECT_TRUE(empty_shader_info.src_code.bin == nullptr);
181 EXPECT_TRUE(empty_shader_info.src_code.size == 0u);
182 }
183
TEST_F(VulkanComputeAPITest,calculate_dim_order_test)184 TEST_F(VulkanComputeAPITest, calculate_dim_order_test) {
185 // ndim, GPUMemoryLayout, expected dim order pairs
186 std::vector<std::tuple<size_t, int32_t, std::vector<int64_t>>> test_cases = {
187 {1, WHCN::kWidthDim, {0}},
188 {1, WHCN::kHeightDim, {0}},
189 {1, WHCN::kChannelsDim, {0}},
190 {2, WHCN::kWidthDim, {0, 1}},
191 {2, WHCN::kHeightDim, {1, 0}},
192 {2, WHCN::kChannelsDim, {0, 1}},
193 {3, WHCN::kWidthDim, {0, 1, 2}},
194 {3, WHCN::kHeightDim, {0, 2, 1}},
195 {3, WHCN::kChannelsDim, {1, 2, 0}},
196 {4, WHCN::kWidthDim, {0, 1, 2, 3}},
197 {4, WHCN::kHeightDim, {0, 1, 3, 2}},
198 {4, WHCN::kChannelsDim, {0, 2, 3, 1}},
199 };
200
201 for (const auto& test_case : test_cases) {
202 const size_t& ndim = std::get<0>(test_case);
203 const int32_t packed_dim = std::get<1>(test_case);
204 const auto& expected_dim_order = std::get<2>(test_case);
205 std::vector<int64_t> dim_order = calculate_dim_order(ndim, packed_dim);
206
207 ASSERT_TRUE(dim_order == expected_dim_order);
208 }
209 }
210
TEST_F(VulkanComputeAPITest,calculate_tensor_strides_test)211 TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) {
212 vTensor v_tensor_to_resize(
213 context(),
214 {25, 25, 25, 25},
215 vkapi::kFloat,
216 utils::kBuffer,
217 utils::kWidthPacked,
218 /*allocate_memory = */ false);
219
220 for (const auto& sizes : standard_sizes_to_test) {
221 if (sizes.size() < 3) {
222 continue;
223 }
224 for (const auto& layout :
225 {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) {
226 {
227 const int32_t packed_dim = static_cast<int32_t>(layout);
228 std::vector<int64_t> dim_order =
229 calculate_dim_order(sizes.size(), packed_dim);
230 std::vector<int64_t> strides = calculate_strides(sizes, dim_order);
231 std::vector<int64_t> ref_strides = get_reference_strides(sizes, layout);
232 ASSERT_TRUE(strides == ref_strides);
233
234 int64_t numel = utils::multiply_integers(sizes);
235 std::vector<int64_t> unsqueezed_strides =
236 unsqueeze_strides(strides, numel);
237 std::vector<int64_t> ref_unsqueezed_strides =
238 get_reference_strides(sizes, layout, true);
239
240 ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides);
241
242 // Create new vTensor and check that the strides are correct
243 vTensor new_v_tensor(
244 context(),
245 sizes,
246 vkapi::kFloat,
247 utils::kBuffer,
248 layout,
249 /*allocate_memory = */ false);
250
251 ASSERT_TRUE(new_v_tensor.strides() == ref_strides);
252 ASSERT_TRUE(
253 new_v_tensor.unsqueezed_strides() == ref_unsqueezed_strides);
254
255 // Resize vtensor and check that updated metadata is correct
256 v_tensor_to_resize.virtual_reconfigure(sizes, dim_order);
257 ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides);
258 ASSERT_TRUE(
259 v_tensor_to_resize.unsqueezed_strides() == ref_unsqueezed_strides);
260 }
261 }
262 }
263 }
264
TEST_F(VulkanComputeAPITest,virtual_transpose_test)265 TEST_F(VulkanComputeAPITest, virtual_transpose_test) {
266 std::vector<int64_t> sizes = {7, 9, 11, 13};
267 // (dim0, dim1), new_sizes, new_dim_order, new_axis_map, new_packed_dim_idx
268 std::vector<std::vector<std::vector<int64_t>>> test_cases = {
269 {{2, 3}, {7, 9, 13, 11}, {0, 1, 3, 2}, {1, 0, 2, 2}, {1}},
270 {{2, 1}, {7, 11, 9, 13}, {0, 2, 1, 3}, {0, 2, 1, 1}, {0}},
271 {{1, 3}, {7, 13, 11, 9}, {0, 3, 2, 1}, {2, 1, 0, 0}, {2}},
272 };
273
274 for (const auto& test_case : test_cases) {
275 const int dim0 = test_case.at(0).at(0);
276 const int dim1 = test_case.at(0).at(1);
277
278 const auto& expected_sizes = test_case.at(1);
279 const auto& expected_dim_order = test_case.at(2);
280 const auto& expected_axis_map = test_case.at(3);
281 const int expected_packed_dim = test_case.at(4).at(0);
282
283 {
284 vTensor a_buffer = vTensor(
285 context(), sizes, vkapi::kFloat, utils::kBuffer, utils::kWidthPacked);
286
287 a_buffer.virtual_transpose(dim0, dim1);
288 EXPECT_TRUE(a_buffer.sizes() == expected_sizes);
289 EXPECT_TRUE(a_buffer.dim_order() == expected_dim_order);
290 }
291
292 {
293 vTensor a_texture = vTensor(
294 context(),
295 sizes,
296 vkapi::kFloat,
297 utils::kTexture3D,
298 utils::kWidthPacked);
299 a_texture.virtual_transpose(dim0, dim1);
300 EXPECT_TRUE(a_texture.sizes() == expected_sizes);
301 EXPECT_TRUE(a_texture.axis_map() == expected_axis_map);
302 EXPECT_TRUE(a_texture.packed_dim() == expected_packed_dim);
303 }
304 }
305 }
306
TEST_F(VulkanComputeAPITest,view_of_view_test)307 TEST_F(VulkanComputeAPITest, view_of_view_test) {
308 constexpr int N = 3;
309 constexpr int C = 5;
310 constexpr int H = 17;
311 constexpr int W = 19;
312
313 std::vector<int64_t> sizes = {N, C, H, W};
314
315 vTensor t1 = vTensor(
316 context(), sizes, vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked);
317
318 vTensor t2 = vTensor(t1);
319 EXPECT_TRUE(t2.sizes() == sizes);
320 vTensor t3 = vTensor(t2);
321 EXPECT_TRUE(t2.sizes() == sizes);
322
323 t2.virtual_transpose(1, 2);
324 std::vector<int64_t> expected_t2_sizes = {N, H, C, W};
325 EXPECT_TRUE(t2.sizes() == expected_t2_sizes);
326
327 // Because t3 was created before t2's metadata was updated, we need to first
328 // update t3's metadata to match t2's metadata. Then the transpose will yield
329 // the correct metadata.
330 t3.virtual_clone(t2);
331 t3.virtual_transpose(2, 3);
332 std::vector<int64_t> expected_t3_sizes = {N, H, W, C};
333 EXPECT_TRUE(t3.sizes() == expected_t3_sizes);
334 }
335
make_temp_ivec3(int x,int y,int z)336 utils::ivec3 make_temp_ivec3(int x, int y, int z) {
337 return utils::ivec3{x, y, z};
338 }
339
TEST_F(VulkanComputeAPITest,vec_test)340 TEST_F(VulkanComputeAPITest, vec_test) {
341 {
342 utils::vec3 v3({1, 2, 3});
343 ASSERT_TRUE(v3[0] == 1);
344 ASSERT_TRUE(v3[1] == 2);
345 ASSERT_TRUE(v3[2] == 3);
346 v3 = {4, 5, 6};
347 ASSERT_TRUE(v3[0] == 4);
348 ASSERT_TRUE(v3[1] == 5);
349 ASSERT_TRUE(v3[2] == 6);
350 }
351
352 {
353 utils::uvec4 uv4({4, 3, 2, 1});
354 ASSERT_TRUE(uv4[0] == 4);
355 ASSERT_TRUE(uv4[1] == 3);
356 ASSERT_TRUE(uv4[2] == 2);
357 ASSERT_TRUE(uv4[3] == 1);
358 uv4 = {11, 13, 12, 88};
359 ASSERT_TRUE(uv4[0] == 11);
360 ASSERT_TRUE(uv4[1] == 13);
361 ASSERT_TRUE(uv4[2] == 12);
362 ASSERT_TRUE(uv4[3] == 88);
363 }
364
365 // Test copy from same type
366 {
367 utils::ivec3 v{5, 6, 8};
368 utils::ivec3 v2 = v;
369
370 ASSERT_TRUE(v2[0] == 5);
371 ASSERT_TRUE(v2[1] == 6);
372 ASSERT_TRUE(v2[2] == 8);
373 }
374
375 // Test copy from different type
376 {
377 utils::uvec3 v{5, 6, 8};
378 utils::ivec3 v2 = v;
379
380 ASSERT_TRUE(v2[0] == 5);
381 ASSERT_TRUE(v2[1] == 6);
382 ASSERT_TRUE(v2[2] == 8);
383 }
384
385 // Test construction from temporary vec
386 {
387 utils::uvec3 v{make_temp_ivec3(4, 5, 10)};
388 ASSERT_TRUE(v[0] == 4);
389 ASSERT_TRUE(v[1] == 5);
390 ASSERT_TRUE(v[2] == 10);
391 }
392
393 // Test initalization from temporary vec
394 {
395 utils::uvec3 v = make_temp_ivec3(4, 5, 10);
396 ASSERT_TRUE(v[0] == 4);
397 ASSERT_TRUE(v[1] == 5);
398 ASSERT_TRUE(v[2] == 10);
399 }
400 }
401
TEST_F(VulkanComputeAPITest,retrieve_custom_shader_test)402 TEST_F(VulkanComputeAPITest, retrieve_custom_shader_test) {
403 // Try to get shader from custom shader library
404 const vkapi::ShaderInfo& kernel = VK_KERNEL(test_shader);
405
406 ASSERT_TRUE(kernel.kernel_name == "test_shader");
407 }
408
TEST_F(VulkanComputeAPITest,spec_var_classes_test)409 TEST_F(VulkanComputeAPITest, spec_var_classes_test) {
410 // Check equality operator
411 ASSERT_TRUE(SV(1.5f) == SV(1.5f));
412 ASSERT_FALSE(SV(15.0f) == SV(15));
413 ASSERT_FALSE(SV(1u) == SV(true));
414
415 size_t sv_size = sizeof(vkapi::SpecVar);
416
417 vkapi::SpecVarList spec_vars = {};
418 ASSERT_TRUE(spec_vars.size() == 0);
419 spec_vars = {SV(1.1f), SV(32), SV(45)};
420 ASSERT_TRUE(spec_vars.size() == 3);
421 vkapi::SpecVarList spec_vars_other = {SV(2.6f), SV(true), SV(78u), SV(5.5f)};
422 spec_vars.append(spec_vars_other);
423 ASSERT_TRUE(spec_vars.size() == 7);
424
425 // Check validity of the data
426 const vkapi::SpecVar* data = spec_vars.data();
427 ASSERT_TRUE(*(reinterpret_cast<const float*>(data + 3)) == 2.6f);
428 ASSERT_TRUE(*(reinterpret_cast<const int32_t*>(data + 1)) == 32);
429 ASSERT_TRUE(*(reinterpret_cast<const int32_t*>(data + 5)) == 78u);
430
431 // Check validity of the map entries
432 std::vector<VkSpecializationMapEntry> entries =
433 spec_vars.generate_map_entries();
434
435 for (size_t i = 0; i < spec_vars.size(); ++i) {
436 ASSERT_TRUE(entries[i].constantID == i);
437 ASSERT_TRUE(entries[i].offset == sv_size * i);
438 if (i != 4) {
439 ASSERT_TRUE(entries[i].size == 4);
440 } else {
441 ASSERT_TRUE(entries[i].size == 1);
442 }
443 }
444
445 // Check copy
446 vkapi::SpecVarList spec_vars_copy(spec_vars);
447 ASSERT_TRUE(spec_vars_copy.size() == 7);
448
449 // Check validity of the copied data
450 const vkapi::SpecVar* copy_data = spec_vars_copy.data();
451 ASSERT_TRUE(*(reinterpret_cast<const bool*>(copy_data + 4)) == true);
452 ASSERT_TRUE(*(reinterpret_cast<const int32_t*>(copy_data + 2)) == 45);
453 ASSERT_TRUE(*(reinterpret_cast<const float*>(copy_data + 6)) == 5.5f);
454 }
455
TEST_F(VulkanComputeAPITest,spec_var_shader_test)456 TEST_F(VulkanComputeAPITest, spec_var_shader_test) {
457 size_t len = 16;
458 StagingBuffer buffer(context(), vkapi::kFloat, len);
459
460 float scale = 3.0f;
461 float offset = 1.5f;
462
463 {
464 ParamsBuffer params(context(), int32_t(len));
465 uint32_t len_div4 = utils::div_up(uint32_t(len), uint32_t(4));
466 vkapi::PipelineBarrier pipeline_barrier{};
467 context()->submit_compute_job(
468 VK_KERNEL(fill_buffer),
469 pipeline_barrier,
470 {64, 1, 1},
471 {len_div4, 1, 1},
472 {SV(scale), SV(offset)},
473 VK_NULL_HANDLE,
474 0,
475 buffer.buffer(),
476 params.buffer());
477 }
478
479 submit_to_gpu();
480
481 std::vector<float> data(len);
482 buffer.copy_to(data.data(), buffer.nbytes());
483
484 for (size_t i = 0; i < len; ++i) {
485 CHECK_VALUE(data, i, scale * i + offset);
486 }
487 }
488
TEST_F(VulkanComputeAPITest,update_params_between_submit)489 TEST_F(VulkanComputeAPITest, update_params_between_submit) {
490 context()->set_cmd(/*reusable = */ true);
491 std::vector<int64_t> sizes = {4, 4, 2};
492 vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
493
494 std::string kernel_name("fill_texture__test");
495 add_dtype_suffix(kernel_name, a);
496
497 struct Params final {
498 utils::ivec3 size;
499 int32_t fill;
500 utils::vec4 values;
501 };
502
503 Params block{
504 {2, 4, 1},
505 0,
506 {5.0, 5.0, 5.0, 5.0},
507 };
508
509 ParamsBuffer params(context(), block);
510
511 {
512 vkapi::PipelineBarrier pipeline_barrier{};
513 vkapi::SpecVarList specialization_constants = {};
514 context()->submit_compute_job(
515 VK_KERNEL_FROM_STR(kernel_name),
516 pipeline_barrier,
517 {4, 4, 4},
518 {4, 4, 4},
519 specialization_constants,
520 VK_NULL_HANDLE,
521 0,
522 a.image(
523 pipeline_barrier,
524 vkapi::PipelineStage::COMPUTE,
525 vkapi::MemoryAccessType::WRITE),
526 params.buffer());
527 }
528
529 StagingBuffer staging_buffer(
530 context(), vkapi::kFloat, a.staging_buffer_numel());
531 record_image_to_nchw_op(context(), a, staging_buffer.buffer());
532
533 submit_to_gpu();
534 check_staging_buffer(staging_buffer, 5.0f);
535
536 Params new_block{
537 {2, 4, 1},
538 0,
539 {4.0, 4.0, 4.0, 4.0},
540 };
541
542 params.update(new_block);
543
544 submit_to_gpu();
545 check_staging_buffer(staging_buffer, 4.0f);
546 }
547
548 template <typename T, vkapi::ScalarType dtype>
test_storage_buffer_type(const size_t len)549 void test_storage_buffer_type(const size_t len) {
550 StagingBuffer buffer(context(), dtype, len);
551
552 std::string kernel_name("idx_fill_buffer");
553 switch (dtype) {
554 case vkapi::kFloat:
555 kernel_name += "_float";
556 break;
557 case vkapi::kHalf:
558 kernel_name += "_half";
559 break;
560 case vkapi::kQInt8:
561 kernel_name += "_int8";
562 break;
563 case vkapi::kQUInt8:
564 kernel_name += "_uint8";
565 break;
566 default:
567 throw std::runtime_error("Unsupported dtype");
568 break;
569 }
570
571 ParamsBuffer params(context(), int32_t(len));
572
573 {
574 uint32_t len_div4 = utils::div_up(uint32_t(len), uint32_t(4));
575 vkapi::PipelineBarrier pipeline_barrier{};
576 vkapi::SpecVarList specialization_constants = {};
577 context()->submit_compute_job(
578 VK_KERNEL_FROM_STR(kernel_name),
579 pipeline_barrier,
580 {64, 1, 1},
581 {len_div4, 1, 1},
582 specialization_constants,
583 VK_NULL_HANDLE,
584 0,
585 buffer.buffer(),
586 params.buffer());
587 }
588
589 submit_to_gpu();
590
591 std::vector<T> data(len);
592 buffer.copy_to(data.data(), buffer.nbytes());
593
594 for (size_t i = 0; i < len; ++i) {
595 CHECK_VALUE(data, i, T(i));
596 }
597 }
598
TEST_F(VulkanComputeAPITest,test_buffer_float)599 TEST_F(VulkanComputeAPITest, test_buffer_float) {
600 test_storage_buffer_type<float, vkapi::kFloat>(16);
601 }
602
TEST_F(VulkanComputeAPITest,test_buffer_float16)603 TEST_F(VulkanComputeAPITest, test_buffer_float16) {
604 if (!context()->adapter_ptr()->has_full_float16_buffers_support()) {
605 GTEST_SKIP();
606 }
607 test_storage_buffer_type<executorch::aten::Half, vkapi::kHalf>(16);
608 }
609
TEST_F(VulkanComputeAPITest,test_buffer_int8)610 TEST_F(VulkanComputeAPITest, test_buffer_int8) {
611 if (!context()->adapter_ptr()->has_full_int8_buffers_support()) {
612 GTEST_SKIP();
613 }
614 test_storage_buffer_type<int8_t, vkapi::kQInt8>(16);
615 }
616
TEST_F(VulkanComputeAPITest,test_zero_size_tensor)617 TEST_F(VulkanComputeAPITest, test_zero_size_tensor) {
618 // Simple test that performs a + b -> c
619
620 std::vector<int64_t> sizes = {0, 5, 7};
621 vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
622 vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
623 vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
624
625 // Fill input tensors
626 fill_vtensor(a, 2.5f);
627 fill_vtensor(b, 1.5f);
628
629 // a + b -> c
630 record_binary_op(context(), "add", a, b, c);
631
632 // Extract output tensor
633 std::vector<float> data_out = extract_vtensor(c);
634
635 // Assert all tensors are empty
636 ASSERT_TRUE(a.numel() == 0);
637 ASSERT_TRUE(b.numel() == 0);
638 ASSERT_TRUE(c.numel() == 0);
639 ASSERT_TRUE(a.nbytes() == 0);
640 ASSERT_TRUE(b.nbytes() == 0);
641 ASSERT_TRUE(c.nbytes() == 0);
642
643 // Check output
644 for (size_t i = 0; i < data_out.size(); ++i) {
645 CHECK_VALUE(data_out, i, 4.0f);
646 }
647 }
648
649 template <typename T>
run_buffer_tensor_sanity_check(vTensor & tensor)650 void run_buffer_tensor_sanity_check(vTensor& tensor) {
651 fill_vtensor(tensor, 0.0f, true);
652
653 record_scalar_add_buffer(context(), tensor, 2.0f);
654 std::vector<float> data_out = extract_vtensor(tensor);
655
656 // Check output
657 for (size_t i = 0; i < tensor.numel(); ++i) {
658 CHECK_VALUE(data_out, i, i + 2.0f);
659 }
660 }
661
TEST_F(VulkanComputeAPITest,buffer_tensor_sanity_check)662 TEST_F(VulkanComputeAPITest, buffer_tensor_sanity_check) {
663 for (const auto& sizes : standard_sizes_to_test) {
664 for (const auto& dtype : {vkapi::kFloat, vkapi::kHalf, vkapi::kChar}) {
665 if (dtype == vkapi::kHalf &&
666 !context()->adapter_ptr()->has_full_float16_buffers_support()) {
667 continue;
668 }
669 if (dtype == vkapi::kHalf && utils::multiply_integers(sizes) >= 2048) {
670 continue;
671 }
672 if (dtype == vkapi::kChar &&
673 !context()->adapter_ptr()->has_full_int8_buffers_support()) {
674 continue;
675 }
676 if (dtype == vkapi::kChar && utils::multiply_integers(sizes) >= 128) {
677 continue;
678 }
679 for (const auto& layout :
680 {utils::kWidthPacked,
681 utils::kHeightPacked,
682 utils::kChannelsPacked}) {
683 vTensor a = vTensor(context(), sizes, dtype, utils::kBuffer, layout);
684 switch (dtype) {
685 case vkapi::kFloat:
686 run_buffer_tensor_sanity_check<float>(a);
687 break;
688 case vkapi::kHalf:
689 run_buffer_tensor_sanity_check<executorch::aten::Half>(a);
690 break;
691 case vkapi::kChar:
692 run_buffer_tensor_sanity_check<int8_t>(a);
693 break;
694 default:
695 VK_THROW("Unsupported dtype");
696 }
697 }
698 }
699 }
700 }
701
TEST_F(VulkanComputeAPITest,texture_add_sanity_check)702 TEST_F(VulkanComputeAPITest, texture_add_sanity_check) {
703 // Simple test that performs a + b -> c
704
705 std::vector<int64_t> sizes = {4, 4, 1};
706 vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
707 vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
708 vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
709
710 // Fill input tensors
711 fill_vtensor(a, 2.5f);
712 fill_vtensor(b, 1.5f);
713
714 // a + b -> c
715 record_binary_op(context(), "add", a, b, c);
716
717 // Extract output tensor
718 std::vector<float> data_out = extract_vtensor(c);
719
720 // Check output
721 for (size_t i = 0; i < data_out.size(); ++i) {
722 CHECK_VALUE(data_out, i, 4.0f);
723 }
724 }
725
TEST_F(VulkanComputeAPITest,tensor_alias_test)726 TEST_F(VulkanComputeAPITest, tensor_alias_test) {
727 for (utils::StorageType storage_type : {utils::kTexture3D, utils::kBuffer}) {
728 std::vector<int64_t> sizes = {9, 9};
729
730 const size_t alloc_count_before = get_vma_allocation_count();
731
732 vTensor original = vTensor(context(), sizes, vkapi::kFloat, storage_type);
733
734 vTensor copy = vTensor(original);
735
736 // Two tensors but only one additional allocation.
737 EXPECT_TRUE(get_vma_allocation_count() == alloc_count_before + 1);
738 EXPECT_TRUE(copy.is_view_of(original));
739
740 // Fill original tensor with some data
741 fill_vtensor(original, 2.5f, true);
742
743 std::vector<float> data_out(copy.staging_buffer_numel());
744 // Extract the copy tensor; should contain the data of the original tensor
745 extract_vtensor(copy, data_out);
746
747 for (size_t i = 0; i < original.numel(); ++i) {
748 CHECK_VALUE(data_out, i, 2.5f + i);
749 }
750 }
751 }
752
TEST_F(VulkanComputeAPITest,tensor_no_copy_transpose_test)753 TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) {
754 constexpr int M = 11;
755 constexpr int K = 23;
756 constexpr int N = 17;
757 std::vector<int64_t> mat1_sizes = {M, K};
758 std::vector<int64_t> mat2_sizes = {N, K};
759 std::vector<int64_t> out_sizes = {M, N};
760
761 for (const auto storage_type : {utils::kTexture3D, utils::kBuffer}) {
762 vTensor mat1 = vTensor(
763 context(),
764 mat1_sizes,
765 vkapi::kFloat,
766 storage_type,
767 utils::kWidthPacked);
768 vTensor mat2 = vTensor(
769 context(),
770 mat2_sizes,
771 vkapi::kFloat,
772 storage_type,
773 utils::kWidthPacked);
774 vTensor out = vTensor(
775 context(), out_sizes, vkapi::kFloat, storage_type, utils::kWidthPacked);
776
777 // Generate data
778 std::vector<float> mat1_data =
779 create_random_float_buffer(mat1.staging_buffer_numel());
780 std::vector<float> mat2_data =
781 create_random_float_buffer(mat2.staging_buffer_numel());
782
783 // Create direct view and modify sizes and strides later
784 vTensor mat2_t = vTensor(mat2);
785 // Update sizes and strides of mat2_t to be that of a transposed tensor
786 mat2_t.virtual_transpose(0, 1);
787
788 EXPECT_TRUE(mat2_t.packed_dim() == WHCN::kHeightDim);
789
790 std::vector<float> mat2_t_data = transpose_matrix(mat2_data, N, K);
791 std::vector<float> ref_out =
792 compute_reference_matmul(mat1_data, mat2_t_data, M, K, N);
793
794 // Fill original tensor with some data
795 fill_vtensor(mat1, mat1_data);
796 fill_vtensor(mat2, mat2_data);
797
798 if (storage_type == utils::kTexture3D) {
799 record_matmul_texture3d(context(), out, mat1, mat2_t);
800 } else {
801 record_reference_matmul(context(), out, mat1, mat2_t);
802 }
803
804 std::vector<float> data_out(out.staging_buffer_numel());
805 // Extract the copy tensor; should contain the data of the original tensor
806 extract_vtensor(out, data_out);
807
808 for (size_t i = 0; i < ref_out.size(); ++i) {
809 EXPECT_TRUE(check_close(data_out[i], ref_out[i]));
810 }
811 }
812 }
813
TEST_F(VulkanComputeAPITest,tensor_no_copy_slice_test)814 TEST_F(VulkanComputeAPITest, tensor_no_copy_slice_test) {
815 constexpr int L = 31;
816
817 // S{N} refers to slice {N}
818 constexpr int L_S1 = 17;
819 constexpr int O_S1 = 5;
820
821 constexpr int L_S2 = 7;
822 constexpr int O_S2 = 3;
823
824 std::vector<int64_t> dim_order = {0};
825
826 std::vector<int64_t> t_sizes = {L};
827 std::vector<int64_t> s1_sizes = {L_S1};
828 std::vector<int64_t> s2_sizes = {L_S2};
829
830 vTensor orig = CREATE_FLOAT_BUFFER(t_sizes, /*allocate_memory=*/true);
831
832 fill_vtensor(orig, 0);
833
834 vTensor s1 = vTensor(orig, s1_sizes, dim_order, O_S1);
835 vTensor s2 = vTensor(s1, s2_sizes, dim_order, O_S2);
836
837 record_scalar_add_buffer(api::context(), s1, 4.5f);
838 record_scalar_add_buffer(api::context(), s2, 7.5f);
839
840 std::vector<float> orig_data(orig.staging_buffer_numel());
841 extract_vtensor(orig, orig_data);
842
843 int id = 0;
844 while (id < O_S1) {
845 EXPECT_TRUE(orig_data[id] == 0);
846 ++id;
847 }
848 while (id < O_S1 + O_S2) {
849 EXPECT_TRUE(orig_data[id] == 4.5);
850 ++id;
851 }
852 while (id < O_S1 + O_S2 + L_S2) {
853 EXPECT_TRUE(orig_data[id] == 12);
854 ++id;
855 }
856 while (id < O_S1 + L_S1) {
857 EXPECT_TRUE(orig_data[id] == 4.5);
858 ++id;
859 }
860 while (id < L) {
861 EXPECT_TRUE(orig_data[id] == 0);
862 ++id;
863 }
864 }
865
TEST_F(VulkanComputeAPITest,texture_deferred_allocation_test)866 TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) {
867 // This test is the same as texture_add_sanity_check, except that the tensor
868 // memory is allocated in a deferred fashion
869
870 std::vector<int64_t> sizes = {4, 4, 1};
871 vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
872 vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
873 vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
874
875 // No allocations made so far
876 EXPECT_TRUE(get_vma_allocation_count() == 0);
877
878 std::vector<float> data_a(a.staging_buffer_numel());
879 std::fill(data_a.begin(), data_a.end(), 2.5f);
880 std::vector<float> data_b(b.staging_buffer_numel());
881 std::fill(data_b.begin(), data_b.end(), 1.5f);
882
883 // Allocate memory at the last possible opportunity
884 vkapi::Allocation a_mem = allocate_memory_for(a);
885 a.image().bind_allocation(a_mem);
886 vkapi::Allocation b_mem = allocate_memory_for(b);
887 b.image().bind_allocation(b_mem);
888 vkapi::Allocation c_mem = allocate_memory_for(c);
889 c.image().bind_allocation(c_mem);
890
891 // One allocation for each tensor
892 EXPECT_TRUE(get_vma_allocation_count() == 3);
893
894 fill_vtensor(a, data_a);
895 fill_vtensor(b, data_b);
896
897 record_binary_op(context(), "add", a, b, c);
898
899 std::vector<float> data_c(c.staging_buffer_numel());
900 extract_vtensor(c, data_c);
901
902 for (size_t i = 0; i < data_c.size(); ++i) {
903 CHECK_VALUE(data_c, i, 4.0f);
904 }
905 }
906
TEST_F(VulkanComputeAPITest,texture_resource_aliasing_test)907 TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) {
908 // This test performs the following operations:
909 // 1. a + b -> c
910 // 2. c + d -> e
911 // and share memory between tensors whenever possible.
912
913 std::vector<int64_t> sizes = {4, 4, 1};
914 vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
915 vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
916 vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
917 vTensor d = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
918 vTensor e = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
919
920 // No allocations made so far
921 EXPECT_TRUE(get_vma_allocation_count() == 0);
922
923 // a and d can share the same memory allocation
924 vkapi::Allocation a_d_mem = allocate_memory_for(a);
925 a.image().bind_allocation(a_d_mem);
926 d.image().bind_allocation(a_d_mem);
927 // b and e can share the same memory allocation
928 vkapi::Allocation b_e_mem = allocate_memory_for(b);
929 b.image().bind_allocation(b_e_mem);
930 e.image().bind_allocation(b_e_mem);
931 // c must have its own memory allocation
932 vkapi::Allocation c_mem = allocate_memory_for(c);
933 c.image().bind_allocation(c_mem);
934
935 // 3 allocations should be made
936 EXPECT_TRUE(get_vma_allocation_count() == 3);
937
938 // Specify input data
939 std::vector<float> data_a(a.staging_buffer_numel());
940 std::fill(data_a.begin(), data_a.end(), 2.5f);
941 std::vector<float> data_b(b.staging_buffer_numel());
942 std::fill(data_b.begin(), data_b.end(), 1.5f);
943 std::vector<float> data_d(b.staging_buffer_numel());
944 std::fill(data_d.begin(), data_d.end(), 1.0f);
945
946 // First, fill a and b with data
947 fill_vtensor(a, data_a);
948 fill_vtensor(b, data_b);
949
950 // a + b -> c
951 record_binary_op(context(), "add", a, b, c);
952
953 // Now d can be filled with data
954 fill_vtensor(d, data_d);
955
956 // c + d -> e
957 record_binary_op(context(), "add", c, d, e);
958
959 // Extract data from e
960 std::vector<float> data_e(e.staging_buffer_numel());
961 extract_vtensor(e, data_e);
962
963 // Sanity check that the values are correct
964 for (size_t i = 0; i < data_e.size(); ++i) {
965 CHECK_VALUE(data_e, i, 5.0f);
966 }
967 }
968
TEST_F(VulkanComputeAPITest,resource_bind_twice_fails)969 TEST_F(VulkanComputeAPITest, resource_bind_twice_fails) {
970 // Check that binding a resource that already has memory associated with it
971 // fails
972
973 std::vector<int64_t> sizes = {4, 4, 1};
974 vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
975
976 // Try to double bind a resource, which should fail
977 vkapi::Allocation a_mem = allocate_memory_for(a);
978 EXPECT_THROW(a.image().bind_allocation(a_mem), vkapi::Error);
979 }
980
TEST_F(VulkanComputeAPITest,resource_destructor_non_owning_memory)981 TEST_F(VulkanComputeAPITest, resource_destructor_non_owning_memory) {
982 // Check that the destructor of a vTensor that does not own its memory
983 // does not free the memory
984
985 vkapi::Allocation memory;
986
987 // Default Allocation constructor should not allocate memory
988 EXPECT_TRUE(get_vma_allocation_count() == 0);
989
990 std::vector<int64_t> sizes = {4, 4, 1};
991 {
992 vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
993
994 memory = allocate_memory_for(a);
995 EXPECT_TRUE(get_vma_allocation_count() == 1);
996 a.image().bind_allocation(memory);
997 }
998
999 // Check that the memory is still allocated
1000 EXPECT_TRUE(get_vma_allocation_count() == 1);
1001 }
1002
TEST_F(VulkanComputeAPITest,use_non_bound_textures_fails)1003 TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) {
1004 // Try to encode a command buffer with a vTensor that does not have
1005 // memory
1006
1007 std::vector<int64_t> sizes = {4, 4, 1};
1008 vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
1009
1010 // No allocations yet
1011 EXPECT_TRUE(get_vma_allocation_count() == 0);
1012
1013 std::vector<float> data_a(a.staging_buffer_numel());
1014 std::fill(data_a.begin(), data_a.end(), 2.5f);
1015
1016 // Encoding a command buffer with a vTensor without memory should throw
1017 EXPECT_THROW(fill_vtensor(a, data_a), vkapi::Error);
1018 }
1019
TEST_F(VulkanComputeAPITest,texture_virtual_resize)1020 TEST_F(VulkanComputeAPITest, texture_virtual_resize) {
1021 context()->set_cmd(/*reusable = */ true);
1022 std::vector<int64_t> sizes = {8, 12, 12};
1023 vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
1024 vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
1025 vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
1026
1027 DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(a)
1028 DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(b)
1029
1030 fill_staging(staging_buffer_a, 11.5f);
1031 fill_staging(staging_buffer_b, 12.5f);
1032
1033 record_binary_op(context(), "add", a, b, c);
1034
1035 DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(c)
1036
1037 submit_to_gpu();
1038 check_staging_buffer(staging_buffer_c, 24.0f);
1039
1040 std::vector<std::vector<int64_t>> new_sizes_list = {
1041 {4, 2, 4}, {4, 3, 6}, {8, 12, 12}, {8, 1, 1}, {8, 11, 10}};
1042
1043 for (auto& new_sizes : new_sizes_list) {
1044 a.virtual_resize(new_sizes);
1045 b.virtual_resize(new_sizes);
1046 c.virtual_resize(new_sizes);
1047
1048 fill_staging(
1049 staging_buffer_a, float(new_sizes[1] + 1.5f), a.staging_buffer_numel());
1050 fill_staging(
1051 staging_buffer_b,
1052 float(new_sizes[2] + 55.0f),
1053 b.staging_buffer_numel());
1054
1055 submit_to_gpu();
1056 check_staging_buffer(
1057 staging_buffer_c,
1058 float(new_sizes[1] + new_sizes[2] + 56.5f),
1059 c.staging_buffer_numel());
1060 }
1061 }
1062
1063 //
1064 // Compute Graph Tests
1065 //
1066
1067 #define EXTRACT_TENSOR(name) \
1068 std::vector<float> data_##name( \
1069 graph.get_tensor(name.value)->staging_buffer_numel()); \
1070 graph.copy_from_staging(name.staging, data_##name.data(), data_##name.size());
1071
1072 // The purpose of this test is simply to track the size of various classes over
1073 // time, in the interest of making sure that they doesn't grow too large.
TEST_F(VulkanComputeAPITest,print_object_sizes)1074 TEST_F(VulkanComputeAPITest, print_object_sizes) {
1075 #define PRINT_SIZE(name) \
1076 std::cout << #name << " size: " << sizeof(name) << " B" << std::endl
1077 PRINT_SIZE(vTensor);
1078 PRINT_SIZE(Value);
1079 PRINT_SIZE(StagingBuffer);
1080 PRINT_SIZE(ComputeGraph);
1081 PRINT_SIZE(DispatchNode);
1082 #undef PRINT_SIZE
1083
1084 // The actual sizes of each object is dependent on the platform. However, we
1085 // can alert ourselves to any significant changes in the sizes of these
1086 // objects by checking the `sizeof()` the class against some loose thresholds.
1087
1088 // Current known size on 64 bit system: 1040 B
1089 EXPECT_TRUE(sizeof(vTensor) < 1200);
1090 // Current known size on 64 bit system: 1056 B
1091 EXPECT_TRUE(sizeof(Value) < 1200);
1092 // Current known size on 64 bit system: 120 B
1093 EXPECT_TRUE(sizeof(StagingBuffer) < 500);
1094 // Current known size on 64 bit system: 384 B
1095 EXPECT_TRUE(sizeof(ComputeGraph) < 500);
1096 // Current known size on 64 bit system: 248 B
1097 EXPECT_TRUE(sizeof(DispatchNode) < 500);
1098 }
1099
TEST_F(VulkanComputeAPITest,test_tensor_creation_from_vulkan_image)1100 TEST_F(VulkanComputeAPITest, test_tensor_creation_from_vulkan_image) {
1101 const auto w = 16;
1102 const auto h = 12;
1103 const auto d = 1;
1104 const utils::uvec3 image_extents = {w, h, d};
1105
1106 vkapi::Adapter* adapter_ptr = context()->adapter_ptr();
1107
1108 vkapi::ImageSampler::Properties sampler_props{
1109 VK_FILTER_NEAREST,
1110 VK_SAMPLER_MIPMAP_MODE_NEAREST,
1111 VK_SAMPLER_ADDRESS_MODE_REPEAT,
1112 VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK,
1113 };
1114
1115 VkFormat image_format = VK_FORMAT_R32G32B32A32_SFLOAT;
1116 VkImageType image_type = VK_IMAGE_TYPE_3D;
1117 VkImageViewType image_view_type = VK_IMAGE_VIEW_TYPE_3D;
1118
1119 VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props);
1120
1121 auto image = adapter_ptr->vma().create_image(
1122 context()->device(),
1123 vkapi::create_extent3d(image_extents),
1124 image_format,
1125 image_type,
1126 context()->preferred_image_tiling(),
1127 image_view_type,
1128 sampler_props,
1129 sampler,
1130 /*allow_transfer = */ true,
1131 /*allocate_memory = */ true);
1132
1133 auto tensor = vTensor(context(), image);
1134
1135 const auto exp_sizes = std::vector<int64_t>{w, h, d * 4};
1136 EXPECT_TRUE(tensor.sizes() == exp_sizes);
1137 EXPECT_TRUE(tensor.packed_dim() == 2);
1138
1139 const auto exp_numel = w * h * d * 4;
1140 EXPECT_TRUE(tensor.numel() == exp_numel);
1141 EXPECT_TRUE(tensor.padded_numel() == exp_numel);
1142 }
1143
TEST(VulkanComputeGraphTest,test_values_scalars)1144 TEST(VulkanComputeGraphTest, test_values_scalars) {
1145 GraphConfig config;
1146 ComputeGraph graph(config);
1147
1148 ValueRef idx;
1149
1150 idx = graph.add_scalar<int64_t>(4);
1151 EXPECT_TRUE(graph.get_int(idx) == 4);
1152
1153 idx = graph.add_scalar<double>(5.5f);
1154 EXPECT_TRUE(graph.get_double(idx) == 5.5f);
1155 }
1156
TEST(VulkanComputeGraphTest,test_values_scalar_list_inplace_constructed)1157 TEST(VulkanComputeGraphTest, test_values_scalar_list_inplace_constructed) {
1158 GraphConfig config;
1159 ComputeGraph graph(config);
1160
1161 ValueRef idx = graph.add_scalar_list<int64_t>({1, 2, 3, 4});
1162 const auto arr = graph.get_int_list(idx);
1163 EXPECT_TRUE(arr->size() == 4);
1164 for (int i = 0; i < 4; i++) {
1165 EXPECT_TRUE(arr->at(i) == i + 1);
1166 }
1167 }
1168
TEST(VulkanComputeGraphTest,test_values_scalar_list_outside_constructed)1169 TEST(VulkanComputeGraphTest, test_values_scalar_list_outside_constructed) {
1170 GraphConfig config;
1171 ComputeGraph graph(config);
1172
1173 ValueRef idx;
1174 {
1175 std::vector<double> data = {5.0, 4.0, 3.0, 2.0, 1.0};
1176 idx = graph.add_scalar_list(std::move(data));
1177 }
1178 const auto& arr = graph.get_double_list(idx);
1179 EXPECT_TRUE(arr->size() == 5);
1180 for (int i = 0; i < 5; i++) {
1181 EXPECT_TRUE(arr->at(i) == (5 - i));
1182 }
1183 }
1184
TEST(VulkanComputeGraphTest,test_values_string)1185 TEST(VulkanComputeGraphTest, test_values_string) {
1186 GraphConfig config;
1187 ComputeGraph graph(config);
1188
1189 ValueRef idx;
1190 {
1191 std::string data = "hello, world";
1192 idx = graph.add_string(std::move(data));
1193 }
1194 std::string stored = graph.get_string(idx);
1195 EXPECT_TRUE(stored == "hello, world");
1196 }
1197
TEST(VulkanComputeGraphTest,empty_init_graphnode_test)1198 TEST(VulkanComputeGraphTest, empty_init_graphnode_test) {
1199 ExecuteNode node(nullptr, {});
1200
1201 GraphConfig config;
1202 ComputeGraph graph(config);
1203
1204 // Encode an empty ExecuteNode and check that command buffer encoding does not
1205 // crash.
1206 graph.execute_nodes().emplace_back(new ExecuteNode(nullptr, {}));
1207 EXPECT_NO_FATAL_FAILURE(graph.encode_execute());
1208 }
1209
TEST(VulkanComputeGraphTest,test_zero_dim_tensor)1210 TEST(VulkanComputeGraphTest, test_zero_dim_tensor) {
1211 GraphConfig config;
1212 ComputeGraph graph(config);
1213
1214 std::vector<int64_t> size_big = {7, 3, 5};
1215 std::vector<int64_t> size_small = {};
1216
1217 // Build graph
1218
1219 IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat);
1220 IOValueRef b = graph.add_input_tensor(size_small, vkapi::kFloat);
1221
1222 IOValueRef out = {};
1223
1224 out.value = graph.add_tensor(size_big, vkapi::kFloat);
1225
1226 auto addFn = VK_GET_OP_FN("aten.add.Tensor");
1227 addFn(graph, {a.value, b.value, kDummyValueRef, out.value});
1228
1229 out.staging = graph.set_output_tensor(out.value);
1230
1231 graph.prepare();
1232 graph.encode_execute();
1233
1234 // Run graph
1235
1236 for (float i = 5.0f; i < 30.0f; i += 10.0f) {
1237 float val_a = i + 2.0f;
1238 float val_b = i + 1.5f;
1239 float val_c = val_a + val_b;
1240
1241 fill_vtensor(graph, a, val_a);
1242 fill_vtensor(graph, b, val_b);
1243
1244 graph.execute();
1245
1246 EXTRACT_TENSOR(out);
1247
1248 // Sanity check that the values are correct
1249 for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) {
1250 CHECK_VALUE(data_out, i, val_c);
1251 }
1252 }
1253 }
1254
TEST(VulkanComputeGraphTest,test_simple_graph_with_buffer)1255 TEST(VulkanComputeGraphTest, test_simple_graph_with_buffer) {
1256 GraphConfig config;
1257 ComputeGraph graph(config);
1258
1259 std::vector<int64_t> sizes = {7, 13, 19};
1260
1261 // Build graph
1262
1263 IOValueRef a = graph.add_input_tensor(sizes, vkapi::kFloat, utils::kBuffer);
1264
1265 IOValueRef out = {};
1266
1267 out.value = graph.add_tensor(sizes, vkapi::kFloat, utils::kBuffer);
1268
1269 auto addFn = VK_GET_OP_FN("aten.abs.default");
1270 addFn(graph, {a.value, out.value, kDummyValueRef, kDummyValueRef});
1271
1272 out.staging = graph.set_output_tensor(out.value);
1273
1274 graph.prepare();
1275 graph.encode_execute();
1276
1277 // Run graph
1278
1279 for (float i = 5.0f; i < 30.0f; i += 10.0f) {
1280 float val = -i + 2.0f;
1281 float expected_val = std::abs(val);
1282
1283 fill_vtensor(graph, a, val);
1284
1285 graph.execute();
1286
1287 EXTRACT_TENSOR(out);
1288
1289 // Sanity check that the values are correct
1290 for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) {
1291 CHECK_VALUE(data_out, i, expected_val);
1292 }
1293 }
1294 }
1295
TEST(VulkanComputeGraphTest,test_simple_graph_with_view)1296 TEST(VulkanComputeGraphTest, test_simple_graph_with_view) {
1297 constexpr int W = 7;
1298 constexpr int H = 7;
1299 // slice height
1300 constexpr int S_H = 2;
1301 // slice offset
1302 constexpr int S_O = 3;
1303
1304 GraphConfig config;
1305 config.set_storage_type_override(utils::kBuffer);
1306 ComputeGraph graph(config);
1307
1308 std::vector<int64_t> dim_order = {0, 1};
1309
1310 std::vector<int64_t> orig_sizes = {H, W};
1311 std::vector<int64_t> slice_sizes = {S_H, W};
1312 const int offset = S_O * W;
1313
1314 // Build graph
1315
1316 IOValueRef orig = graph.add_input_tensor(orig_sizes, vkapi::kFloat);
1317 ValueRef slice =
1318 graph.add_tensor_view(orig.value, slice_sizes, dim_order, offset);
1319
1320 EXPECT_TRUE(graph.val_is_view_of(slice, orig.value));
1321
1322 IOValueRef out = {};
1323
1324 out.value = graph.add_tensor(slice_sizes, vkapi::kFloat);
1325
1326 auto opFn = VK_GET_OP_FN("aten.abs.default");
1327 opFn(graph, {slice, out.value, kDummyValueRef, kDummyValueRef});
1328
1329 out.staging = graph.set_output_tensor(out.value);
1330
1331 graph.prepare();
1332 graph.encode_execute();
1333
1334 // Run graph
1335
1336 for (float i = 5.0f; i < 30.0f; i += 10.0f) {
1337 float start_val = -130 + i;
1338
1339 fill_vtensor(graph, orig, start_val, true);
1340
1341 graph.execute();
1342
1343 EXTRACT_TENSOR(out);
1344
1345 for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) {
1346 const float expected_val = std::abs(start_val) - float(offset) - i;
1347 CHECK_VALUE(data_out, i, expected_val);
1348 }
1349 }
1350 }
1351
TEST(VulkanComputeGraphTest,test_graph_view_of_view)1352 TEST(VulkanComputeGraphTest, test_graph_view_of_view) {
1353 GraphConfig config;
1354 config.set_storage_type_override(utils::kTexture3D);
1355 ComputeGraph graph(config);
1356
1357 constexpr int N = 3;
1358 constexpr int C = 5;
1359 constexpr int H = 17;
1360 constexpr int W = 19;
1361
1362 std::vector<int64_t> orig_sizes = {N, C, H, W};
1363
1364 // Test a common view of view usage pattern. In delegate execution, the values
1365 // of the graph are created first; then operators are added. As a result,
1366 // creating views of views is a bit tricky because metadata updates to a view
1367 // does not update the metadata of the view's views. Nonetheless, view
1368 // operators have an implicit assumption that the metadata of the output is
1369 // equivalent to the metadata of the input. Therefore, view operators must
1370 // account for unseen updates to the input view by first calling
1371 // `virtual_clone()` to make the output equivalent to the input before.
1372 // modifying metadata.
1373
1374 ValueRef t1 = graph.add_tensor(orig_sizes, vkapi::kFloat);
1375 ValueRef t2 = graph.add_tensor_view(t1);
1376 ValueRef t3 = graph.add_tensor_view(t2);
1377
1378 ValueRef channels = graph.add_scalar<int64_t>(1);
1379 ValueRef height = graph.add_scalar<int64_t>(2);
1380 ValueRef width = graph.add_scalar<int64_t>(3);
1381
1382 auto opFn = VK_GET_OP_FN("aten.transpose.int");
1383
1384 opFn(graph, {t1, channels, height, t2});
1385 std::vector<int64_t> t2_sizes = graph.sizes_of(t2);
1386 std::vector<int64_t> expected_t2_sizes = {N, H, C, W};
1387 EXPECT_TRUE(t2_sizes == expected_t2_sizes);
1388
1389 opFn(graph, {t2, height, width, t3});
1390 std::vector<int64_t> t3_sizes = graph.sizes_of(t3);
1391 std::vector<int64_t> expected_t3_sizes = {N, H, W, C};
1392 EXPECT_TRUE(t3_sizes == expected_t3_sizes);
1393 }
1394
TEST(VulkanComputeGraphTest,test_simple_graph)1395 TEST(VulkanComputeGraphTest, test_simple_graph) {
1396 GraphConfig config;
1397 ComputeGraph graph(config);
1398
1399 std::vector<int64_t> size_big = {1, 8, 8};
1400 std::vector<int64_t> size_small = {1, 1, 8};
1401
1402 // Build graph
1403
1404 IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat);
1405 IOValueRef b = graph.add_input_tensor(size_small, vkapi::kFloat);
1406
1407 IOValueRef out = {};
1408
1409 out.value = graph.add_tensor(size_big, vkapi::kFloat);
1410
1411 auto addFn = VK_GET_OP_FN("aten.add.Tensor");
1412 addFn(graph, {a.value, b.value, kDummyValueRef, out.value});
1413
1414 out.staging = graph.set_output_tensor(out.value);
1415
1416 graph.prepare();
1417 graph.encode_execute();
1418
1419 // Run graph
1420
1421 for (float i = 5.0f; i < 30.0f; i += 10.0f) {
1422 float val_a = i + 2.0f;
1423 float val_b = i + 1.5f;
1424 float val_c = val_a + val_b;
1425
1426 fill_vtensor(graph, a, val_a);
1427 fill_vtensor(graph, b, val_b);
1428
1429 graph.execute();
1430
1431 EXTRACT_TENSOR(out);
1432
1433 // Sanity check that the values are correct
1434 for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) {
1435 CHECK_VALUE(data_out, i, val_c);
1436 }
1437 }
1438 }
1439
TEST(VulkanComputeGraphTest,test_simple_graph_with_symint)1440 TEST(VulkanComputeGraphTest, test_simple_graph_with_symint) {
1441 GraphConfig config;
1442 config.set_storage_type_override(utils::kTexture3D);
1443 ComputeGraph graph(config);
1444
1445 std::vector<int64_t> sizes = {8, 64, 124};
1446
1447 // Build graph
1448
1449 ValueRef scalar = graph.add_symint(1);
1450 IOValueRef a = graph.add_input_tensor(sizes, vkapi::kFloat);
1451
1452 IOValueRef out = {};
1453 out.value = a.value;
1454
1455 graph.execute_nodes().emplace_back(new DispatchNode(
1456 graph,
1457 VK_KERNEL_FROM_STR("scalar_add_texture"),
1458 graph.create_global_wg_size(a.value),
1459 graph.create_local_wg_size(a.value),
1460 // Inputs and Outputs
1461 {{out.value, vkapi::MemoryAccessType::WRITE}},
1462 // Shader params buffers
1463 {graph.logical_limits_ubo(a.value),
1464 graph.get_or_create_int_param_buffer(scalar)},
1465 // Specialization Constants
1466 {},
1467 // Resizing Logic
1468 nullptr,
1469 {}));
1470
1471 out.staging = graph.set_output_tensor(out.value);
1472
1473 graph.prepare();
1474 graph.encode_execute();
1475
1476 // Run graph
1477
1478 for (float i = 5.0f; i < 30.0f; i += 10.0f) {
1479 int scalar_val = i - 3.0f;
1480 graph.set_symint(scalar, scalar_val);
1481
1482 int32_t scalar_val_read = graph.read_symint(scalar);
1483 EXPECT_TRUE(scalar_val_read == scalar_val);
1484
1485 float val_a = i + 2.0f;
1486 float val_out = val_a + scalar_val;
1487
1488 fill_vtensor(graph, a, val_a);
1489
1490 graph.execute();
1491
1492 EXTRACT_TENSOR(out);
1493
1494 // Sanity check that the values are correct
1495 for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) {
1496 CHECK_VALUE(data_out, i, val_out);
1497 }
1498 }
1499 }
1500
1501 #define CREATE_WEIGHT_TENSOR(name, sizes, dtype, val) \
1502 std::vector<float> data_##name(utils::multiply_integers(sizes)); \
1503 std::fill(data_##name.begin(), data_##name.end(), val); \
1504 ValueRef name = graph.add_tensorref(sizes, dtype, data_##name.data());
1505
TEST(VulkanComputeGraphTest,test_simple_prepacked_graph)1506 TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) {
1507 GraphConfig config;
1508 config.enable_querypool = true;
1509 ComputeGraph graph(config);
1510
1511 std::vector<int64_t> size_big = {8, 73, 62};
1512 std::vector<int64_t> size_small = {8, 73, 1};
1513
1514 CREATE_WEIGHT_TENSOR(w1, size_small, vkapi::kFloat, 3.5f);
1515 CREATE_WEIGHT_TENSOR(w2, size_small, vkapi::kFloat, 3.0f);
1516
1517 // Build graph
1518
1519 IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat);
1520
1521 ValueRef c = graph.add_tensor(size_big, vkapi::kFloat);
1522 ValueRef e = graph.add_tensor(size_big, vkapi::kFloat);
1523
1524 ValueRef w1_packed = graph.add_tensor(size_small, vkapi::kFloat);
1525 ValueRef w2_packed = graph.add_tensor(size_small, vkapi::kFloat);
1526
1527 auto prepackFn = VK_GET_OP_FN("et_vk.prepack.default");
1528 prepackFn(graph, {w1, w1_packed});
1529 prepackFn(graph, {w2, w2_packed});
1530
1531 auto addFn = VK_GET_OP_FN("aten.add.Tensor");
1532 addFn(graph, {a.value, w1_packed, kDummyValueRef, c});
1533
1534 auto mulFn = VK_GET_OP_FN("aten.mul.Tensor");
1535 mulFn(graph, {c, w2_packed, e});
1536
1537 IOValueRef out = {};
1538 out.value = e;
1539 out.staging = graph.set_output_tensor(out.value);
1540
1541 graph.prepare();
1542
1543 graph.encode_prepack();
1544 graph.prepack();
1545
1546 graph.encode_execute();
1547
1548 // Run graph
1549
1550 for (float i = 5.0f; i < 30.0f; i += 10.0f) {
1551 float val_out = (i + 3.5f) * 3.0f;
1552
1553 fill_vtensor(graph, a, i);
1554
1555 // Execute graph
1556 graph.execute();
1557
1558 EXTRACT_TENSOR(out);
1559
1560 // Sanity check that the values are correct
1561 for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) {
1562 CHECK_VALUE(data_out, i, val_out);
1563 }
1564
1565 if (graph.context()->querypool()) {
1566 graph.context()->querypool().extract_results();
1567 graph.context()->querypool().print_results();
1568 }
1569 }
1570 }
1571
TEST(VulkanComputeGraphTest,test_simple_shared_objects_with_resize)1572 TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
1573 GraphConfig config;
1574 ComputeGraph graph(config);
1575 size_t expected_vma_allocation_count = 0;
1576
1577 std::vector<int64_t> size_big = {12, 64, 64};
1578 std::vector<int64_t> size_small = {12, 64, 64};
1579
1580 // Build graph and regularly check allocation counts
1581
1582 IOValueRef a = graph.add_input_tensor(
1583 size_big,
1584 vkapi::kFloat,
1585 /*shared_object_idx = */ 2);
1586 IOValueRef b = graph.add_input_tensor(
1587 size_small,
1588 vkapi::kFloat,
1589 /*shared_object_idx = */ 4);
1590
1591 // +2: t.sizes_ubo() for each staging shader
1592 // +2: staging buffer for each input tensor
1593 expected_vma_allocation_count += 4;
1594 EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
1595
1596 ValueRef c = graph.add_tensor(
1597 size_big,
1598 vkapi::kFloat,
1599 /*shared_object_idx = */ 6);
1600
1601 auto addFn = VK_GET_OP_FN("aten.add.Tensor");
1602 addFn(graph, {a.value, b.value, kDummyValueRef, c});
1603
1604 // +2: alpha UBO, broadcast UBO for arithmetic shader
1605 // +1: t.sizes_ubo() for arithmetic shader output c
1606 expected_vma_allocation_count += 3;
1607 EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
1608
1609 IOValueRef d = graph.add_input_tensor(
1610 size_small,
1611 vkapi::kFloat,
1612 /*shared_object_idx = */ 2);
1613
1614 // +1: t.sizes_ubo() uniform buffer for staging shader
1615 // +1: staging buffer for the input tensor
1616 expected_vma_allocation_count += 2;
1617 EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
1618
1619 ValueRef e = graph.add_tensor(
1620 size_big,
1621 vkapi::kFloat,
1622 /*shared_object_idx = */ 4);
1623
1624 auto mulFn = VK_GET_OP_FN("aten.mul.Tensor");
1625 mulFn(graph, {c, d.value, e});
1626
1627 // +2: alpha UBO, broadcast UBO for arithmetic shader
1628 // +1: t.sizes_ubo() for arithmetic shader output e
1629 expected_vma_allocation_count += 3;
1630 EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
1631
1632 IOValueRef out = {};
1633 out.value = e;
1634 out.staging = graph.set_output_tensor(out.value);
1635
1636 // +1: staging buffer for the output tensor
1637 expected_vma_allocation_count += 1;
1638 EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
1639
1640 graph.prepare();
1641 graph.encode_execute();
1642
1643 // +3: shared memory allocations for tensors
1644 expected_vma_allocation_count += 3;
1645 EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
1646
1647 // Run graph
1648
1649 std::vector<std::vector<int64_t>> new_sizes_list = {
1650 {8, 44, 34}, {4, 13, 56}, {8, 12, 64}, {12, 55, 33}, {4, 54, 10}};
1651
1652 for (auto& new_sizes : new_sizes_list) {
1653 graph.get_tensor(a.value)->virtual_resize(new_sizes);
1654 graph.get_tensor(b.value)->virtual_resize(new_sizes);
1655 graph.get_tensor(c)->virtual_resize(new_sizes);
1656 graph.get_tensor(d.value)->virtual_resize(new_sizes);
1657 graph.get_tensor(e)->virtual_resize(new_sizes);
1658
1659 float val_a = new_sizes[1] + 4.0f;
1660 float val_b = new_sizes[2] + 1.5f;
1661 float val_d = new_sizes[0] + 2.0f;
1662 float val_out = (val_a + val_b) * val_d;
1663
1664 fill_vtensor(graph, a, val_a);
1665 fill_vtensor(graph, b, val_b);
1666 fill_vtensor(graph, d, val_d);
1667
1668 // Execute graph
1669 graph.execute();
1670
1671 EXTRACT_TENSOR(out);
1672
1673 // Sanity check that the values are correct
1674 for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); i++) {
1675 CHECK_VALUE(data_out, i, val_out);
1676 }
1677 }
1678
1679 std::vector<std::vector<int64_t>> new_sizes_list_2 = {
1680 {8, 44, 34}, {4, 13, 56}, {8, 12, 64}, {12, 55, 33}, {4, 54, 10}};
1681
1682 for (auto& new_sizes : new_sizes_list_2) {
1683 graph.resize_input(0, new_sizes);
1684 graph.resize_input(1, new_sizes);
1685 graph.resize_input(2, new_sizes);
1686 graph.propagate_resize();
1687
1688 // Check output shape
1689 EXPECT_TRUE(graph.get_tensor(out.value)->sizes() == new_sizes);
1690
1691 float val_a = new_sizes[1] + 6.0f;
1692 float val_b = new_sizes[2] + 2.5f;
1693 float val_d = new_sizes[0] + 4.0f;
1694 float val_out = (val_a + val_b) * val_d;
1695
1696 fill_vtensor(graph, a, val_a);
1697 fill_vtensor(graph, b, val_b);
1698 fill_vtensor(graph, d, val_d);
1699
1700 // Execute graph
1701 graph.execute();
1702
1703 EXTRACT_TENSOR(out);
1704
1705 // Sanity check that the values are correct
1706 for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); i++) {
1707 CHECK_VALUE(data_out, i, val_out);
1708 }
1709 }
1710 }
1711
TEST(VulkanComputeGraphTest,test_simple_graph_with_tmp_tensors)1712 TEST(VulkanComputeGraphTest, test_simple_graph_with_tmp_tensors) {
1713 GraphConfig config;
1714 ComputeGraph graph(config);
1715
1716 std::vector<int64_t> size_big = {8, 64, 124};
1717 std::vector<int64_t> size_small = {8, 1, 124};
1718
1719 // Build graph
1720
1721 IOValueRef a = graph.add_input_tensor(
1722 size_big, vkapi::kFloat, /*shared_object_idx = */ 0);
1723 IOValueRef b = graph.add_input_tensor(
1724 size_small, vkapi::kFloat, /*shared_object_idx = */ 1);
1725
1726 IOValueRef out = {};
1727
1728 out.value =
1729 graph.add_tensor(size_big, vkapi::kFloat, /*shared_object_idx = */ 2);
1730
1731 // Perform the following compute
1732 //
1733 // a, b, out;
1734 // {
1735 // inter;
1736 // {
1737 // tmp = a + b
1738 // tmp2 = tmp + a
1739 // inter = tmp2 + b
1740 // }
1741 // {
1742 // tmp = inter + b;
1743 // tmp2 = tmp + a
1744 // out = tmp2 + b;
1745 // }
1746 // }
1747 {
1748 TmpTensor inter(&graph, size_big, vkapi::kFloat);
1749 EXPECT_TRUE(inter.sobj_idx == 3);
1750 {
1751 TmpTensor tmp(&graph, size_big, vkapi::kFloat);
1752 EXPECT_TRUE(tmp.sobj_idx == 4);
1753 VK_GET_OP_FN("aten.add.Tensor")
1754 (graph, {a, b, kDummyValueRef, tmp});
1755
1756 TmpTensor tmp2(&graph, size_big, vkapi::kFloat);
1757 EXPECT_TRUE(tmp2.sobj_idx == 5);
1758 VK_GET_OP_FN("aten.add.Tensor")
1759 (graph, {tmp, a, kDummyValueRef, tmp2});
1760
1761 VK_GET_OP_FN("aten.add.Tensor")
1762 (graph, {tmp2, b, kDummyValueRef, inter});
1763 }
1764 {
1765 TmpTensor tmp(&graph, size_big, vkapi::kFloat);
1766 EXPECT_TRUE(tmp.sobj_idx == 4);
1767 VK_GET_OP_FN("aten.add.Tensor")
1768 (graph, {inter, b, kDummyValueRef, tmp});
1769
1770 TmpTensor tmp2(&graph, size_big, vkapi::kFloat);
1771 EXPECT_TRUE(tmp2.sobj_idx == 5);
1772 VK_GET_OP_FN("aten.add.Tensor")
1773 (graph, {tmp, a, kDummyValueRef, tmp2});
1774
1775 VK_GET_OP_FN("aten.add.Tensor")
1776 (graph, {tmp2, b, kDummyValueRef, out});
1777 }
1778 }
1779
1780 out.staging = graph.set_output_tensor(out.value);
1781
1782 graph.prepare();
1783 graph.encode_execute();
1784
1785 // Run graph
1786
1787 for (float i = 5.0f; i < 30.0f; i += 10.0f) {
1788 float val_a = i + 2.0f;
1789 float val_b = i + 1.5f;
1790 float val_tmp = val_a + val_b;
1791 float val_tmp2 = val_tmp + val_a;
1792 float val_inter = val_tmp2 + val_b;
1793 float val_tmp_2 = val_inter + val_b;
1794 float val_tmp2_2 = val_tmp_2 + val_a;
1795 float val_out = val_tmp2_2 + val_b;
1796
1797 fill_vtensor(graph, a, val_a);
1798 fill_vtensor(graph, b, val_b);
1799
1800 graph.execute();
1801
1802 EXTRACT_TENSOR(out);
1803
1804 // Sanity check that the values are correct
1805 for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) {
1806 CHECK_VALUE(data_out, i, val_out);
1807 }
1808 }
1809 }
1810
TEST(VulkanComputeGraphTest,test_large_graph)1811 TEST(VulkanComputeGraphTest, test_large_graph) {
1812 auto build_start_time = std::chrono::system_clock::now();
1813 GraphConfig config;
1814 ComputeGraph graph(config);
1815
1816 int64_t input_w = 256;
1817 int64_t input_h = 256;
1818 int64_t input_c = 8;
1819
1820 std::vector<int64_t> size_big = {input_c, input_h, input_w};
1821 std::vector<int64_t> size_small = {input_c, input_h, 1};
1822
1823 std::vector<int64_t> size_big_alt = {input_c / 2, input_h / 2, input_w / 2};
1824 std::vector<int64_t> size_small_alt = {input_c / 2, input_h / 2, 1};
1825
1826 // Build graph
1827
1828 IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat, 2);
1829 IOValueRef b = graph.add_input_tensor(size_small, vkapi::kFloat, 4);
1830
1831 ValueRef c = graph.add_tensor(size_big, vkapi::kFloat, 6);
1832
1833 auto addFn = VK_GET_OP_FN("aten.add.Tensor");
1834 addFn(graph, {a.value, b.value, kDummyValueRef, c});
1835
1836 int n = 100;
1837
1838 for (int i = 0; i < n; i++) {
1839 addFn(graph, {c, b.value, kDummyValueRef, a.value});
1840
1841 addFn(graph, {a.value, b.value, kDummyValueRef, c});
1842 }
1843
1844 IOValueRef out = {};
1845 out.value = c;
1846 out.staging = graph.set_output_tensor(out.value);
1847
1848 graph.prepare();
1849 graph.encode_execute();
1850
1851 auto build_end_time = std::chrono::system_clock::now();
1852
1853 auto build_time = std::chrono::duration_cast<std::chrono::microseconds>(
1854 build_end_time - build_start_time);
1855
1856 std::stringstream ss;
1857 for (int i = 0; i < 10; i++) {
1858 auto resize_start_time = std::chrono::system_clock::now();
1859 if (i % 2 == 0) {
1860 graph.resize_input(0, size_big_alt);
1861 graph.resize_input(1, size_small_alt);
1862 } else {
1863 graph.resize_input(0, size_big);
1864 graph.resize_input(1, size_small);
1865 }
1866 graph.propagate_resize();
1867 auto resize_end_time = std::chrono::system_clock::now();
1868
1869 auto resize_time = std::chrono::duration_cast<std::chrono::microseconds>(
1870 resize_end_time - resize_start_time);
1871
1872 float val_a = 1.0f;
1873 float val_b = 2.0f;
1874
1875 float val_e = val_a + val_b * (2 * n + 1);
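    // Derivation of the expected value: the initial add contributes one val_b,
    // and each of the n loop iterations performs two adds (c + b -> a, then
    // a + b -> c), each contributing another val_b. Hence the output is
    // val_a + val_b * (2 * n + 1).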
1876
1877 auto inference_start_time = std::chrono::system_clock::now();
1878
1879 fill_vtensor(graph, a, val_a);
1880 fill_vtensor(graph, b, val_b);
1881
1882 graph.execute();
1883
1884 EXTRACT_TENSOR(out);
1885
1886 auto inference_end_time = std::chrono::system_clock::now();
1887
1888 auto inference_time = std::chrono::duration_cast<std::chrono::microseconds>(
1889 inference_end_time - inference_start_time);
1890
1891 for (int i = 0; i < graph.get_tensor(out.value)->numel(); i++) {
1892 CHECK_VALUE(data_out, i, val_e);
1893 }
1894
1895 ss << "[ ] Resize: " << std::setw(10) << std::right
1896 << resize_time.count() << " us" << std::endl;
1897 ss << "[ ] Inference: " << std::setw(10) << std::right
1898 << inference_time.count() << " us" << std::endl;
1899 }
1900 ss << "[ ] Model Load:" << std::setw(10) << std::right
1901 << build_time.count() << " us" << std::endl;
1902 std::cout << ss.str();
1903 }
1904
test_clone(std::vector<int64_t> sizes,utils::StorageType src_storage,utils::GPUMemoryLayout src_layout,utils::StorageType dst_storage,utils::GPUMemoryLayout dst_layout)1905 void test_clone(
1906 std::vector<int64_t> sizes,
1907 utils::StorageType src_storage,
1908 utils::GPUMemoryLayout src_layout,
1909 utils::StorageType dst_storage,
1910 utils::GPUMemoryLayout dst_layout) {
1911 GraphConfig config;
1912 ComputeGraph graph(config);
1913
1914 IOValueRef a =
1915 graph.add_input_tensor(sizes, vkapi::kFloat, src_storage, src_layout);
1916
1917 IOValueRef out = {};
1918 out.value = graph.add_tensor(sizes, vkapi::kFloat, dst_storage, dst_layout);
1919
1920 auto copyFn = VK_GET_OP_FN("aten.clone.default");
1921 copyFn(graph, {a.value, kDummyValueRef, out.value});
1922
1923 out.staging = graph.set_output_tensor(out.value);
1924
1925 graph.prepare();
1926 graph.encode_execute();
1927
1928 fill_vtensor(graph, a, 0.0f, /*iota = */ true);
1929
1930 graph.propagate_resize();
1931 graph.execute();
1932
1933 EXTRACT_TENSOR(out);
1934 EXTRACT_TENSOR(a);
1935
1936 for (int i = 0; i < graph.numel_of(a.value); ++i) {
1937 EXPECT_TRUE(data_out[i] == data_a[i]);
1938 }
1939 }
1940
TEST(VulkanComputeGraphTest,test_clone)1941 TEST(VulkanComputeGraphTest, test_clone) {
1942 std::vector<std::pair<utils::GPUMemoryLayout, utils::GPUMemoryLayout>> cases{
1943 {utils::kWidthPacked, utils::kWidthPacked},
1944 {utils::kWidthPacked, utils::kChannelsPacked},
1945 {utils::kChannelsPacked, utils::kChannelsPacked},
1946 };
1947
1948 for (std::vector<int64_t> sizes : standard_sizes_to_test) {
1949 for (auto& [src_layout, dst_layout] : cases) {
1950 test_clone(
1951 sizes, utils::kTexture3D, src_layout, utils::kBuffer, dst_layout);
1952 test_clone(
1953 sizes, utils::kBuffer, src_layout, utils::kTexture3D, dst_layout);
1954 test_clone(
1955 sizes, utils::kTexture3D, src_layout, utils::kTexture3D, dst_layout);
1956 }
1957 }
1958 }
1959
TEST(VulkanComputeGraphTest,test_etvk_copy_offset_node)1960 TEST(VulkanComputeGraphTest, test_etvk_copy_offset_node) {
1961 GraphConfig config;
1962 ComputeGraph graph(config);
1963
1964 int64_t n = 6;
1965 int64_t c = 12;
1966 int64_t h = 4;
1967 int64_t w = 8;
1968 utils::GPUMemoryLayout memory_layout =
1969 utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
1970
1971 std::vector<int64_t> size = {n, c, h, w};
1972
1973 IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout);
1974
1975 IOValueRef out = {};
1976 out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout);
1977
1978   // Notice that copy_node operates on the texture's x, y, z dimensions. In the
1979   // comments below, we provide the corresponding coordinates in NCHW.
1980
1981 // src_offset is (n=0, c=4, h=1, w=1)
1982 ValueRef src_offset_ref = graph.add_scalar_list<int64_t>({1, 1, 1});
1983
1984 // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate
1985 // Argument is {x, y, z}.
1986 // x = 0 since w = 0
1987 // y = 2 since h = 2
1988 // z = c / 4 + 2 since
1989   //  1. there are c/4 planes per batch, and n=1 means we skip past the first batch;
1990   //  2. +2 because c = 8, which with channel packing corresponds to two texels.
1991 ValueRef dst_offset_ref = graph.add_scalar_list<int64_t>({0, 2, c / 4 + 2});
1992
1993 // range is (n=1, c=8, h=2, w=4)
1994 // Argument is {x, y, z}.
1995 // x = 4 since w = 4
1996 // y = 2 since h = 2
1997   // z = 2 since we are only copying 8 channels, hence 2 texels. n = 1 can be a
1998   // bit misleading here, since it gives the impression that we are copying the
1999   // entire channel dimension. However, remember that the copy computes
2000   // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range],
2001   // and range must be non-zero.
2002 ValueRef range_ref = graph.add_scalar_list<int64_t>({4, 2, 2});
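  // A minimal sanity sketch (assumption: a channels-packed texture maps the
  // NCHW coordinate (n, c, h, w) to texel coordinates {x = w, y = h,
  // z = n * ceil(C / 4) + c / 4}). Checking that the hand-computed dst_offset
  // z-component above is consistent with that formula:
  {
    const int64_t planes_per_batch = (c + 3) / 4; // C = 12 -> 3 planes
    const int64_t expected_z = 1 * planes_per_batch + 8 / 4; // n = 1, c = 8
    EXPECT_TRUE(expected_z == c / 4 + 2);
  }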
2003
2004 auto copyFn = VK_GET_OP_FN("etvk.copy_offset");
2005 copyFn(
2006 graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value});
2007
2008 out.staging = graph.set_output_tensor(out.value);
2009
2010 graph.prepare();
2011 graph.encode_execute();
2012
2013 fill_vtensor(graph, a, 0.0f, /*iota = */ true);
2014
2015 graph.execute();
2016
2017 EXTRACT_TENSOR(out);
2018 EXTRACT_TENSOR(a);
2019
2020   // We will examine the results in the dst range.
2021   // The value at each corresponding coordinate should match between the source
2022   // and destination tensors. We loop through the range, calculate both the src
2023   // and dst index using the offsets, and compare the values in the extracted
2024   // vectors. They should match.
2025 int n_idx = 0;
2026   // In each nested loop, the index runs from dst_offset to dst_offset + range.
2027
2028 for (int c_idx = 0; c_idx < 8; c_idx++) {
2029 for (int h_idx = 0; h_idx < 2; h_idx++) {
2030 for (int w_idx = 0; w_idx < 4; w_idx++) {
2031 auto dst_idx =
2032 get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx});
2033 auto src_idx =
2034 get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1});
2035
2036 EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]);
2037 }
2038 }
2039 }
2040 }
2041
TEST(VulkanComputeGraphTest,test_etvk_copy_channel_offset_node)2042 TEST(VulkanComputeGraphTest, test_etvk_copy_channel_offset_node) {
2043 GraphConfig config;
2044 ComputeGraph graph(config);
2045
2046 int64_t n = 2;
2047 int64_t c = 12;
2048 int64_t h = 4;
2049 int64_t w = 8;
2050 utils::GPUMemoryLayout memory_layout =
2051 utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
2052
2053 std::vector<int64_t> size = {n, c, h, w};
2054
2055 IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout);
2056
2057 IOValueRef out = {};
2058 out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout);
2059
2060 int64_t src_offset = 2;
2061 int64_t dst_offset = 3;
2062 int64_t range = 7;
2063
2064 ValueRef src_offset_ref = graph.add_scalar<int64_t>(src_offset);
2065 ValueRef dst_offset_ref = graph.add_scalar<int64_t>(dst_offset);
2066 ValueRef range_ref = graph.add_scalar<int64_t>(range);
2067
2068 auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset");
2069 copyFn(
2070 graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value});
2071
2072 out.staging = graph.set_output_tensor(out.value);
2073
2074 graph.prepare();
2075 graph.encode_execute();
2076
2077 fill_vtensor(graph, a, 0.0f, true);
2078
2079 graph.execute();
2080
2081 EXTRACT_TENSOR(out);
2082 EXTRACT_TENSOR(a);
2083
2084 for (int n_idx = 0; n_idx < n; n_idx++) {
2085 for (int c_idx = 0; c_idx < range; c_idx++) {
2086 for (int h_idx = 0; h_idx < h; h_idx++) {
2087 for (int w_idx = 0; w_idx < w; w_idx++) {
2088 auto src_idx =
2089 get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx});
2090 auto dst_idx = get_buf_idx(
2091 graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx});
2092 EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]);
2093 }
2094 }
2095 }
2096 }
2097 }
2098
TEST(VulkanComputeGraphTest,test_etvk_copy_channel_offset_node_clean_boundary)2099 TEST(
2100 VulkanComputeGraphTest,
2101 test_etvk_copy_channel_offset_node_clean_boundary) {
2102   // The tricky part of channel copy is handling the boundary across multiple
2103   // copies. For example, when we concatenate two [3, 1, 1] NCHW tensors along
2104   // the channel dimension, channel packing causes elements from different
2105   // source texels to be packed into the same destination texel at the boundary.
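  // Concretely, with two [3, 1, 1] tensors: the first copy fills lanes 0-2 of
  // the first destination texel, and channel 0 of the second tensor lands in
  // lane 3 of that same texel, so the boundary texel holds data from both
  // copies and must not simply be overwritten by the second one.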
2106 GraphConfig config;
2107 ComputeGraph graph(config);
2108
2109 int64_t n = 2;
2110 int64_t c = 12;
2111 int64_t h = 4;
2112 int64_t w = 8;
2113 utils::GPUMemoryLayout memory_layout =
2114 utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
2115
2116 std::vector<int64_t> size = {n, c, h, w};
2117
2118 IOValueRef zero = graph.add_input_tensor(size, vkapi::kFloat, memory_layout);
2119 IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout);
2120 IOValueRef b = graph.add_input_tensor(size, vkapi::kFloat, memory_layout);
2121
2122 IOValueRef out = {};
2123 out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout);
2124
2125 auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset");
2126
2127   // Make sure the entire out tensor is zeroed. The zero tensor itself will be
2128   // filled with zeros later.
2129 copyFn(
2130 graph,
2131 {zero.value,
2132 graph.add_scalar<int64_t>(c),
2133 graph.add_scalar<int64_t>(0),
2134 graph.add_scalar<int64_t>(0),
2135 out.value});
2136
2137 int64_t a_src_offset = 0;
2138 int64_t a_dst_offset = 2;
2139 int64_t a_range = 5;
2140   // a will write to channels [2, 7)
2141 copyFn(
2142 graph,
2143 {a.value,
2144 graph.add_scalar<int64_t>(a_range),
2145 graph.add_scalar<int64_t>(a_src_offset),
2146 graph.add_scalar<int64_t>(a_dst_offset),
2147 out.value});
2148
2149   // b will write to channels [6, 11)
2150   // It is intentional for b to overwrite channel 6
2151 int64_t b_src_offset = 0;
2152 int64_t b_dst_offset = 6;
2153 int64_t b_range = 5;
2154
2155 copyFn(
2156 graph,
2157 {b.value,
2158 graph.add_scalar<int64_t>(b_range),
2159 graph.add_scalar<int64_t>(b_src_offset),
2160 graph.add_scalar<int64_t>(b_dst_offset),
2161 out.value});
2162
2163 out.staging = graph.set_output_tensor(out.value);
2164
2165 graph.prepare();
2166 graph.encode_execute();
2167
2168 float a_value = 1.0f;
2169 float b_value = 2.0f;
2170 float zero_value = 0.0f;
2171 fill_vtensor(graph, a, a_value);
2172 fill_vtensor(graph, b, b_value);
2173 fill_vtensor(graph, zero, zero_value);
2174
2175 graph.execute();
2176
2177 EXTRACT_TENSOR(out);
2178
2179 for (int n_idx = 0; n_idx < n; n_idx++) {
2180     // c_idx only goes up to a_range - 1 because of the expected overwrite by b
2181 for (int c_idx = a_dst_offset; c_idx < a_dst_offset + a_range - 1;
2182 c_idx++) {
2183 for (int h_idx = 0; h_idx < h; h_idx++) {
2184 for (int w_idx = 0; w_idx < w; w_idx++) {
2185 auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx});
2186 EXPECT_TRUE(data_out[dst_idx] == a_value);
2187 }
2188 }
2189 }
2190 }
2191
2192 for (int n_idx = 0; n_idx < n; n_idx++) {
2193 for (int c_idx = b_dst_offset; c_idx < b_dst_offset + b_range; c_idx++) {
2194 for (int h_idx = 0; h_idx < h; h_idx++) {
2195 for (int w_idx = 0; w_idx < w; w_idx++) {
2196 auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx});
2197 EXPECT_TRUE(data_out[dst_idx] == b_value);
2198 }
2199 }
2200 }
2201 }
2202
2203 // Also verify that data before a_dst_offset and after b_dst_offset + b_range
2204 // are untouched.
2205 for (int n_idx = 0; n_idx < n; n_idx++) {
2206 for (int c_idx = 0; c_idx < a_dst_offset; c_idx++) {
2207 for (int h_idx = 0; h_idx < h; h_idx++) {
2208 for (int w_idx = 0; w_idx < w; w_idx++) {
2209 auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx});
2210 EXPECT_TRUE(data_out[dst_idx] == zero_value);
2211 }
2212 }
2213 }
2214 }
2215
2216 for (int n_idx = 0; n_idx < n; n_idx++) {
2217 for (int c_idx = b_dst_offset + b_range; c_idx < c; c_idx++) {
2218 for (int h_idx = 0; h_idx < h; h_idx++) {
2219 for (int w_idx = 0; w_idx < w; w_idx++) {
2220 auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx});
2221 EXPECT_TRUE(data_out[dst_idx] == zero_value);
2222 }
2223 }
2224 }
2225 }
2226 }
2227
TEST(VulkanComputeGraphTest,test_etvk_copy_offset_int_node)2228 TEST(VulkanComputeGraphTest, test_etvk_copy_offset_int_node) {
2229 GraphConfig config;
2230 ComputeGraph graph(config);
2231
2232 int64_t n = 6;
2233 int64_t c = 12;
2234 int64_t h = 4;
2235 int64_t w = 8;
2236 utils::GPUMemoryLayout memory_layout =
2237 utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
2238
2239 std::vector<int64_t> size = {n, c, h, w};
2240
2241 IOValueRef a = graph.add_input_tensor(size, vkapi::kInt, memory_layout);
2242
2243 IOValueRef out = {};
2244 out.value = graph.add_tensor(size, vkapi::kInt, memory_layout);
2245
2246   // Notice that copy_node operates on the texture's x, y, z dimensions. In the
2247   // comments below, we provide the corresponding coordinates in NCHW.
2248
2249 // src_offset is (n=0, c=4, h=1, w=1)
2250 ValueRef src_offset_ref = graph.add_scalar_list<int64_t>({1, 1, 1});
2251
2252 // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate
2253 // Argument is {x, y, z}.
2254 // x = 0 since w = 0
2255 // y = 2 since h = 2
2256 // z = c / 4 + 2 since
2257   //  1. there are c/4 planes per batch, and n=1 means we skip past the first batch;
2258   //  2. +2 because c = 8, which with channel packing corresponds to two texels.
2259 ValueRef dst_offset_ref = graph.add_scalar_list<int64_t>({0, 2, c / 4 + 2});
2260
2261 // range is (n=1, c=8, h=2, w=4)
2262 // Argument is {x, y, z}.
2263 // x = 4 since w = 4
2264 // y = 2 since h = 2
2265   // z = 2 since we are only copying 8 channels, hence 2 texels. n = 1 can be a
2266   // bit misleading here, since it gives the impression that we are copying the
2267   // entire channel dimension. However, remember that the copy computes
2268   // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range],
2269   // and range must be non-zero.
2270 ValueRef range_ref = graph.add_scalar_list<int64_t>({4, 2, 2});
2271
2272 auto copyFn = VK_GET_OP_FN("etvk.copy_offset");
2273 copyFn(
2274 graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value});
2275
2276 out.staging = graph.set_output_tensor(out.value);
2277
2278 graph.prepare();
2279 graph.encode_execute();
2280
2281 fill_vtensor(graph, a, 0, /*iota = */ true);
2282
2283 graph.execute();
2284
2285 EXTRACT_TENSOR(out);
2286 EXTRACT_TENSOR(a);
2287
2288   // We will examine the results in the dst range.
2289   // The value at each corresponding coordinate should match between the source
2290   // and destination tensors. We loop through the range, calculate both the src
2291   // and dst index using the offsets, and compare the values in the extracted
2292   // vectors. They should match.
2293 int n_idx = 0;
2294   // In each nested loop, the index runs from dst_offset to dst_offset + range.
2295
2296 for (int c_idx = 0; c_idx < 8; c_idx++) {
2297 for (int h_idx = 0; h_idx < 2; h_idx++) {
2298 for (int w_idx = 0; w_idx < 4; w_idx++) {
2299 auto dst_idx =
2300 get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx});
2301 auto src_idx =
2302 get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1});
2303
2304 EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]);
2305 }
2306 }
2307 }
2308 }
2309
TEST(VulkanComputeGraphTest,test_etvk_copy_channel_offset_int_node)2310 TEST(VulkanComputeGraphTest, test_etvk_copy_channel_offset_int_node) {
2311 GraphConfig config;
2312 ComputeGraph graph(config);
2313
2314 int64_t n = 2;
2315 int64_t c = 12;
2316 int64_t h = 4;
2317 int64_t w = 8;
2318 utils::GPUMemoryLayout memory_layout =
2319 utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
2320
2321 std::vector<int64_t> size = {n, c, h, w};
2322
2323 IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout);
2324
2325 IOValueRef out = {};
2326 out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout);
2327
2328 int64_t src_offset = 2;
2329 int64_t dst_offset = 3;
2330 int64_t range = 7;
2331
2332 ValueRef src_offset_ref = graph.add_scalar<int64_t>(src_offset);
2333 ValueRef dst_offset_ref = graph.add_scalar<int64_t>(dst_offset);
2334 ValueRef range_ref = graph.add_scalar<int64_t>(range);
2335
2336 auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset");
2337 copyFn(
2338 graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value});
2339
2340 out.staging = graph.set_output_tensor(out.value);
2341
2342 graph.prepare();
2343 graph.encode_execute();
2344
2345 fill_vtensor(graph, a, 0.0f, true);
2346
2347 graph.execute();
2348
2349 EXTRACT_TENSOR(out);
2350 EXTRACT_TENSOR(a);
2351
2352 for (int n_idx = 0; n_idx < n; n_idx++) {
2353 for (int c_idx = 0; c_idx < range; c_idx++) {
2354 for (int h_idx = 0; h_idx < h; h_idx++) {
2355 for (int w_idx = 0; w_idx < w; w_idx++) {
2356 auto src_idx =
2357 get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx});
2358 auto dst_idx = get_buf_idx(
2359 graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx});
2360 EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]);
2361 }
2362 }
2363 }
2364 }
2365 }
2366
TEST(VulkanComputeGraphTest,test_view_change_packing)2367 TEST(VulkanComputeGraphTest, test_view_change_packing) {
2368 std::vector<std::pair<utils::GPUMemoryLayout, utils::GPUMemoryLayout>>
2369 layout_pairs = {
2370 {utils::kWidthPacked, utils::kChannelsPacked},
2371 {utils::kWidthPacked, utils::kHeightPacked},
2372 {utils::kWidthPacked, utils::kWidthPacked},
2373 {utils::kHeightPacked, utils::kChannelsPacked},
2374 {utils::kHeightPacked, utils::kHeightPacked},
2375 {utils::kHeightPacked, utils::kHeightPacked},
2376 {utils::kChannelsPacked, utils::kChannelsPacked},
2377 {utils::kChannelsPacked, utils::kHeightPacked},
2378 {utils::kChannelsPacked, utils::kHeightPacked},
2379 };
2380
2381 int64_t n = 3;
2382 int64_t c = 2;
2383 int64_t h = 2;
2384 int64_t w = 5;
2385 std::vector<int64_t> size = {n, c, h, w};
2386
2387 for (auto layout_pair : layout_pairs) {
2388 GraphConfig config;
2389 ComputeGraph graph(config);
2390
2391 IOValueRef in =
2392 graph.add_input_tensor(size, vkapi::kFloat, layout_pair.first);
2393
2394 IOValueRef out = {};
2395 out.value = graph.add_tensor(size, vkapi::kFloat, layout_pair.second);
2396
2397 auto viewFn = VK_GET_OP_FN("aten.view_copy.default");
2398 viewFn(graph, {in.value, graph.add_none(), out.value});
2399
2400 out.staging = graph.set_output_tensor(out.value);
2401
2402 graph.prepare();
2403 graph.encode_execute();
2404
2405 fill_vtensor(graph, in, 0.0, true);
2406
2407 graph.execute();
2408
2409 EXTRACT_TENSOR(out);
2410
2411     // The extracted data is a flattened NCHW buffer. Hence, we should expect
2412     // all elements inside the out array to match their index.
2413 for (int i = 0; i < graph.get_tensor(out.value)->numel(); i++) {
2414 CHECK_VALUE(data_out, i, i);
2415 }
2416 }
2417 }
2418
2419 class VulkanToFromGPUShaderTest : public ::testing::Test {
2420 public:
SetUp()2421 void SetUp() override {
2422 // Make sure we are starting with a clean slate
2423 EXPECT_TRUE(get_vma_allocation_count() == 0);
2424 }
2425
TearDown()2426 void TearDown() override {
2427 context()->flush();
2428
2429 // Make sure we are ending with a clean slate
2430 EXPECT_TRUE(get_vma_allocation_count() == 0);
2431 }
2432 };
2433
2434 template <typename T>
run_from_gpu_test(std::vector<int64_t> & sizes,utils::GPUMemoryLayout memory_layout=utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,vkapi::ScalarType dtype=vkapi::kFloat,utils::StorageType storage_type=utils::StorageType::TEXTURE_3D)2435 void run_from_gpu_test(
2436 std::vector<int64_t>& sizes,
2437 utils::GPUMemoryLayout memory_layout =
2438 utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,
2439 vkapi::ScalarType dtype = vkapi::kFloat,
2440 utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) {
2441 if (dtype == vkapi::kHalf &&
2442 !context()->adapter_ptr()->supports_16bit_storage_buffers()) {
2443 return;
2444 }
2445 vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout);
2446
2447 std::string kernel_name("idx_fill_texture");
2448 add_dtype_suffix(kernel_name, vten);
2449
2450 int32_t offset = -50;
2451
2452 {
2453 vkapi::PipelineBarrier pipeline_barrier{};
2454 context()->submit_compute_job(
2455 VK_KERNEL_FROM_STR(kernel_name),
2456 pipeline_barrier,
2457 vten.logical_limits(),
2458 {4, 4, 4},
2459 {vten.packed_dim(), offset},
2460 VK_NULL_HANDLE,
2461 0,
2462 vten.image(
2463 pipeline_barrier,
2464 vkapi::PipelineStage::COMPUTE,
2465 vkapi::MemoryAccessType::WRITE),
2466 vten.sizes_ubo());
2467 }
2468
2469 StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel());
2470
2471 if (dtype == vkapi::kChar &&
2472 !context()->adapter_ptr()->has_full_int8_buffers_support()) {
2473 record_bitw8_image_to_nchw_nobitw8buffer_op(
2474 context(), vten, staging_buffer);
2475 } else {
2476 record_image_to_nchw_op(context(), vten, staging_buffer.buffer());
2477 }
2478
2479 submit_to_gpu();
2480
2481 std::vector<T> data_out(staging_buffer.numel());
2482 staging_buffer.copy_to(data_out.data(), staging_buffer.nbytes());
2483
2484 for (int i = 0; i < vten.numel(); i++) {
2485 CHECK_VALUE(data_out, i, i + offset);
2486 }
2487 }
2488
2489 template <typename T>
round_trip_test(std::vector<int64_t> & sizes,utils::GPUMemoryLayout memory_layout=utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,vkapi::ScalarType dtype=vkapi::kFloat,utils::StorageType storage_type=utils::StorageType::TEXTURE_3D)2490 void round_trip_test(
2491 std::vector<int64_t>& sizes,
2492 utils::GPUMemoryLayout memory_layout =
2493 utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,
2494 vkapi::ScalarType dtype = vkapi::kFloat,
2495 utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) {
2496 if (dtype == vkapi::kHalf &&
2497 !context()->adapter_ptr()->supports_16bit_storage_buffers()) {
2498 return;
2499 }
2500
2501 vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout);
2502
2503 // Create and fill input staging buffer
2504 StagingBuffer staging_buffer_in(
2505 context(), dtype, vten.staging_buffer_numel());
2506
2507 std::vector<T> data_in(staging_buffer_in.numel());
2508 for (int i = 0; i < staging_buffer_in.numel(); i++) {
2509 data_in[i] = T(i * -1);
2510 }
2511 staging_buffer_in.copy_from(data_in.data(), vten.staging_buffer_nbytes());
2512
2513 // Output staging buffer
2514 StagingBuffer staging_buffer_out(
2515 context(), dtype, vten.staging_buffer_numel());
2516
2517 record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten);
2518
2519 // Copy data in and out of the tensor
2520 if (dtype == vkapi::kChar &&
2521 !context()->adapter_ptr()->has_full_int8_buffers_support()) {
2522 record_bitw8_image_to_nchw_nobitw8buffer_op(
2523 context(), vten, staging_buffer_out);
2524 } else {
2525 record_image_to_nchw_op(context(), vten, staging_buffer_out.buffer());
2526 }
2527
2528 // Execute command buffer
2529 submit_to_gpu();
2530
2531 // Extract data from output staging buffer
2532 std::vector<T> data_out(staging_buffer_out.numel());
2533 staging_buffer_out.copy_to(data_out.data(), staging_buffer_out.nbytes());
2534
2535 // All indices should be equal to the input data
2536 for (int i = 0; i < vten.numel(); i++) {
2537 CHECK_VALUE(data_out, i, data_in[i]);
2538 }
2539 }
2540
2541 template <typename T>
compute_graph_round_trip_test(std::vector<int64_t> & sizes,utils::GPUMemoryLayout memory_layout=utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,vkapi::ScalarType dtype=vkapi::kFloat,utils::StorageType storage_type=utils::StorageType::TEXTURE_3D)2542 void compute_graph_round_trip_test(
2543 std::vector<int64_t>& sizes,
2544 utils::GPUMemoryLayout memory_layout =
2545 utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,
2546 vkapi::ScalarType dtype = vkapi::kFloat,
2547 utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) {
2548 if (dtype == vkapi::kHalf &&
2549 !context()->adapter_ptr()->supports_16bit_storage_buffers()) {
2550 return;
2551 }
2552
2553 GraphConfig config;
2554 ComputeGraph graph(config);
2555
2556 ValueRef r_tensor =
2557 graph.add_tensor(sizes, dtype, storage_type, memory_layout);
2558 ValueRef r_staging_in = graph.set_input_tensor(r_tensor);
2559 ValueRef r_staging_out = graph.set_output_tensor(r_tensor);
2560
2561 graph.prepare();
2562 graph.encode_execute();
2563
2564 vTensorPtr tensor = graph.get_tensor(r_tensor);
2565
2566 std::vector<T> data_in(tensor->numel());
2567 for (int i = 0; i < data_in.size(); i++) {
2568 data_in[i] = T(i * -1);
2569 }
2570 graph.copy_into_staging(r_staging_in, data_in.data(), data_in.size());
2571
2572 graph.execute();
2573
2574 std::vector<T> data_out(tensor->staging_buffer_numel());
2575 graph.copy_from_staging(r_staging_out, data_out.data(), data_out.size());
2576
2577 for (int i = 0; i < data_in.size(); i++) {
2578 CHECK_VALUE(data_out, i, data_in[i]);
2579 }
2580 }
2581
TEST(VulkanToFromGPUShaderTest,round_trip_tests)2582 TEST(VulkanToFromGPUShaderTest, round_trip_tests) {
2583 // The below tests will fill each texel element with the value of the linear
2584 // buffer index that corresponds to it. The texel at position (0, 0, 0) will
2585 // be filled with the values [0, 1, 2, 3], the texel at position (1, 0, 0)
2586 // will be filled with the values [4, 5, 6, 7], and so forth. The contents of
2587   // the texture are then written back to the CPU, and to check that the
2588   // transfer has been performed correctly, the value at each index of the CPU
2589   // data buffer should be equal to the index.
2590 //
2591 // The below test cases should ensure that the total number of elements does
2592 // not exceed 2048, or else the tests will fail for FP16 textures due to
2593   // precision issues. Half-precision floating point formats can only represent
2594   // integers between 2048 and 4096 in intervals of 2.
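  // For example, 2049 has no exact FP16 representation; the nearest
  // representable values are 2048 and 2050, so an idx-fill test with more than
  // 2048 elements would see off-by-one mismatches in half precision.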
2595 std::vector<std::vector<int64_t>> to_test = {
2596 // 2D sizes
2597 {17, 21},
2598 {67, 23},
2599 {55, 33},
2600 // 3D sizes
2601 {7, 9, 13},
2602 {21, 2, 19},
2603 {17, 17, 5},
2604 // 4D sizes
2605 {7, 3, 13, 7},
2606 {11, 9, 9, 1},
2607 {3, 3, 3, 3},
2608 {3, 1, 7, 13},
2609 };
2610
2611   // These sizes are set such that the total number of elements is at most
2612   // 127, the maximum representable value for int8.
2613 std::vector<std::vector<int64_t>> to_test_int8 = {
2614 // 2D sizes
2615 {14, 7},
2616 // 3D sizes
2617 {3, 7, 5},
2618 {4, 2, 11},
2619 // 4D sizes
2620 {3, 3, 3, 3},
2621 {7, 1, 6, 3},
2622 };
2623
2624 #define RUN_TESTS(ctype, dtype) \
2625 round_trip_test<ctype>( \
2626 sizes, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, dtype); \
2627 round_trip_test<ctype>( \
2628 sizes, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, dtype); \
2629 round_trip_test<ctype>( \
2630 sizes, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, dtype); \
2631 compute_graph_round_trip_test<ctype>( \
2632 sizes, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, dtype); \
2633 compute_graph_round_trip_test<ctype>( \
2634 sizes, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, dtype); \
2635 compute_graph_round_trip_test<ctype>( \
2636 sizes, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, dtype);
2637
2638 for (auto& sizes : to_test) {
2639 RUN_TESTS(float, vkapi::kFloat)
2640 RUN_TESTS(executorch::aten::Half, vkapi::kHalf)
2641 }
2642
2643 for (auto& sizes : to_test_int8) {
2644 RUN_TESTS(int8_t, vkapi::kChar);
2645 }
2646
2647 #undef RUN_TESTS
2648 }
2649
2650 //
2651 // Operator Smoke Tests
2652 //
2653
test_binary_op(std::string op_name,std::vector<int64_t> sizes_big,std::vector<int64_t> sizes_small,vkapi::ScalarType dtype,utils::GPUMemoryLayout memory_layout)2654 void test_binary_op(
2655 std::string op_name,
2656 std::vector<int64_t> sizes_big,
2657 std::vector<int64_t> sizes_small,
2658 vkapi::ScalarType dtype,
2659 utils::GPUMemoryLayout memory_layout) {
2660 GraphConfig config;
2661 ComputeGraph graph(config);
2662
2663 IOValueRef arg2{};
2664
2665 // Build graph
2666
2667 IOValueRef arg1 = graph.add_input_tensor(sizes_big, dtype, memory_layout);
2668 arg2 = graph.add_input_tensor(sizes_small, dtype, memory_layout);
2669
2670 IOValueRef out;
2671 out.value = graph.add_tensor(sizes_big, dtype, memory_layout);
2672
2673 std::stringstream ss;
2674 ss << "aten.";
2675 ss << op_name;
2676 ss << ".Tensor";
2677 VK_GET_OP_FN(ss.str())
2678 (graph, {arg1.value, arg2.value, kDummyValueRef, out.value});
2679
2680 out.staging = graph.set_output_tensor(out.value);
2681
2682 graph.prepare();
2683 graph.encode_prepack();
2684 graph.prepack();
2685 graph.encode_execute();
2686
2687 for (int i = 1; i < 4; i++) {
2688 float val_arg1 = i + 1.5;
2689 float val_arg2 = i - 3.5;
2690
2691 float val_out = val_arg1 + val_arg2;
2692 if (op_name == "sub") {
2693 val_out = val_arg1 - val_arg2;
2694 }
2695 if (op_name == "mul") {
2696 val_out = val_arg1 * val_arg2;
2697 }
2698 if (op_name == "div") {
2699 val_out = val_arg1 / val_arg2;
2700 }
2701
2702 execute_graph_and_check_output(graph, {val_arg1, val_arg2}, {val_out});
2703 }
2704 }
2705
2706 #define CALL_TEST_FN_FORALL_CONDITIONS(_) \
2707 _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked) \
2708 _(vkapi::kFloat, utils::kTexture3D, utils::kHeightPacked) \
2709 _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked)
2710
2711 #define CALL_TEST_FN_FOR_W_PACKED(_) \
2712 _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, false) \
2713 _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, true) \
2714 _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, false) \
2715 _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, true)
2716
2717 #define CALL_TEST_FN_FOR_C_PACKED(_) \
2718 _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, false) \
2719 _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, true) \
2720 _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, false) \
2721 _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, true)
2722
TEST(VulkanComputeGraphOpsTest,add_smoke_test)2723 TEST(VulkanComputeGraphOpsTest, add_smoke_test) {
2724 #define RUN_TESTS(dtype, storage, layout) \
2725 test_binary_op("add", {17, 21}, {17, 21}, dtype, layout); \
2726 test_binary_op("add", {17, 21}, {1, 1}, dtype, layout); \
2727 test_binary_op("sub", {11, 22}, {11, 22}, dtype, layout); \
2728 test_binary_op("sub", {11, 22}, {11, 1}, dtype, layout); \
2729 test_binary_op("add", {7, 17, 17}, {7, 17, 17}, dtype, layout); \
2730 test_binary_op("add", {7, 17, 17}, {7, 1, 17}, dtype, layout); \
2731 test_binary_op("sub", {9, 9, 7}, {9, 9, 7}, dtype, layout); \
2732 test_binary_op("sub", {9, 9, 7}, {9, 1, 1}, dtype, layout);
2733
2734 CALL_TEST_FN_FORALL_CONDITIONS(RUN_TESTS);
2735
2736 #undef RUN_TESTS
2737 }
2738
test_mm(int B,int M,int K,int N,vkapi::ScalarType dtype,utils::StorageType storage_type,utils::GPUMemoryLayout memory_layout,bool prepack=true)2739 void test_mm(
2740 int B,
2741 int M,
2742 int K,
2743 int N,
2744 vkapi::ScalarType dtype,
2745 utils::StorageType storage_type,
2746 utils::GPUMemoryLayout memory_layout,
2747 bool prepack = true) {
2748 GraphConfig config;
2749 config.set_storage_type_override(storage_type);
2750 ComputeGraph graph(config);
2751
2752 std::vector<int64_t> mat1_size = {M, K};
2753 std::vector<int64_t> mat2_size = {K, N};
2754 std::vector<int64_t> out_size = {M, N};
2755 if (B > 1) {
2756 mat1_size.resize(3);
2757 mat1_size = {B, M, K};
2758 mat2_size.resize(3);
2759 mat2_size = {B, K, N};
2760 out_size.resize(3);
2761 out_size = {B, M, N};
2762 }
2763
2764 IOValueRef mat2{};
2765
2766 CREATE_WEIGHT_TENSOR(mat2_w, mat2_size, dtype, 2.0f);
2767
2768 // Build graph
2769
2770 IOValueRef mat1 = graph.add_input_tensor(mat1_size, dtype, memory_layout);
2771
2772 if (prepack) {
2773 mat2.value = mat2_w;
2774 } else {
2775 mat2.value = graph.add_tensor(mat2_size, dtype, memory_layout);
2776 mat2.staging = graph.set_input_tensor(mat2.value);
2777 }
2778
2779 IOValueRef out;
2780 out.value = graph.add_tensor(out_size, dtype, memory_layout);
2781
2782 VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2.value, out.value});
2783
2784 out.staging = graph.set_output_tensor(out.value);
2785
2786 graph.prepare();
2787 graph.encode_prepack();
2788 graph.prepack();
2789 graph.encode_execute();
2790
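  // Expected values below: every element of mat1 is filled with val_mat1 and
  // every element of mat2 is filled with a single constant (2.0f when
  // prepacked), so each output element is a dot product over the K dimension.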
2791 for (int i = 1; i < 4; i++) {
2792 if (prepack) {
2793 float val_mat1 = i;
2794 float val_out = K * (val_mat1 * 2.0f);
2795 execute_graph_and_check_output(graph, {val_mat1}, {val_out});
2796 } else {
2797 float val_mat1 = i;
2798 float val_mat2 = i + 1;
2799 float val_out = K * (val_mat1 * val_mat2);
2800 execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out});
2801 }
2802 }
2803 }
2804
TEST(VulkanComputeGraphOpsTest,mm_smoke_test)2805 TEST(VulkanComputeGraphOpsTest, mm_smoke_test) {
2806 #define RUN_TESTS(dtype, storage_type, layout, prepack) \
2807 test_mm( \
2808 /*B = */ 1, \
2809 /*M = */ 31, \
2810 /*K = */ 127, \
2811 /*N = */ 23, \
2812 dtype, \
2813 storage_type, \
2814 layout, \
2815 prepack); \
2816 test_mm( \
2817 /*B = */ 5, \
2818 /*M = */ 31, \
2819 /*K = */ 127, \
2820 /*N = */ 23, \
2821 dtype, \
2822 storage_type, \
2823 layout, \
2824 prepack); \
2825 test_mm( \
2826 /*B = */ 7, \
2827 /*M = */ 13, \
2828 /*K = */ 89, \
2829 /*N = */ 17, \
2830 dtype, \
2831 storage_type, \
2832 layout, \
2833 prepack); \
2834 test_mm( \
2835 /*B = */ 1, \
2836 /*M = */ 13, \
2837 /*K = */ 89, \
2838 /*N = */ 17, \
2839 dtype, \
2840 storage_type, \
2841 layout, \
2842 prepack);
2843
2844 CALL_TEST_FN_FOR_W_PACKED(RUN_TESTS);
2845 CALL_TEST_FN_FOR_C_PACKED(RUN_TESTS);
2846
2847 #undef RUN_TESTS
2848 }
2849
test_max_pool2d(const std::vector<int64_t> & in_size,const int64_t base_val,std::vector<int64_t> & kernel)2850 void test_max_pool2d(
2851 const std::vector<int64_t>& in_size,
2852 const int64_t base_val,
2853 std::vector<int64_t>& kernel) {
2854 GraphConfig config;
2855 ComputeGraph graph(config);
2856
2857 // Build graph
2858
2859 std::vector<int64_t> out_size(in_size);
2860 int h = in_size.size() - 2;
2861 int w = in_size.size() - 1;
2862 out_size[h] = in_size[h] - kernel[0] + 1;
2863 out_size[w] = in_size[w] - kernel[1] + 1;
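  // The output spatial dims assume a stride of 1, no padding, and a dilation
  // of 1, matching the arguments passed to max_pool2d_with_indices below.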
2864
2865 IOValueRef in_ioval = graph.add_input_tensor(
2866 in_size, vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
2867 IOValueRef out_ioval;
2868 out_ioval.value = graph.add_tensor(
2869 out_size, vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
2870 IOValueRef idx_ioval;
2871 idx_ioval.value = graph.add_tensor(
2872 out_size, vkapi::kInt, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
2873 ValueRef out = graph.add_value_list({out_ioval.value, idx_ioval.value});
2874
2875 std::vector<int64_t> kernel_copy(kernel);
2876 VK_GET_OP_FN("aten.max_pool2d_with_indices.default")
2877 (graph,
2878 {in_ioval.value,
2879 graph.add_scalar_list<int64_t>(std::move(kernel)),
2880 graph.add_scalar_list<int64_t>({1, 1}),
2881 graph.add_scalar_list<int64_t>({0, 0}),
2882 graph.add_scalar_list<int64_t>({1, 1}),
2883 graph.add_scalar(false),
2884 out});
2885
2886 out_ioval.staging = graph.set_output_tensor(out_ioval.value);
2887 idx_ioval.staging = graph.set_output_tensor(idx_ioval.value);
2888
2889 graph.prepare();
2890 graph.encode_prepack();
2891 graph.prepack();
2892 graph.encode_execute();
2893
2894 // Run graph
2895
2896 fill_vtensor(graph, graph.inputs().at(0), base_val, /*iota = */ true);
2897
2898 vTensorPtr t_in = graph.get_tensor(in_ioval.value);
2899 std::vector<float> input_data(t_in->staging_buffer_numel());
2900 graph.copy_from_staging(
2901 in_ioval.staging, input_data.data(), input_data.size());
2902
2903 graph.execute();
2904
2905 vTensorPtr t_out = graph.get_tensor(out_ioval.value);
2906 std::vector<float> output_data(t_out->staging_buffer_numel());
2907 graph.copy_from_staging(
2908 out_ioval.staging, output_data.data(), output_data.size());
2909 vTensorPtr t_idx = graph.get_tensor(idx_ioval.value);
2910 std::vector<int> index_data(t_idx->staging_buffer_numel());
2911 graph.copy_from_staging(
2912 idx_ioval.staging, index_data.data(), index_data.size());
2913
2914 // Check results
2915
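  // Since the input is filled with monotonically increasing values (iota), the
  // maximum of each pooling window sits at its bottom-right corner, which is
  // why the expected index is offset by (kernel - 1) in each spatial dim.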
2916 int h_offset = kernel_copy[0] - 1;
2917 int w_offset = kernel_copy[1] - 1;
2918 int h_out = utils::val_at(-2, t_out->sizes());
2919 int w_out = utils::val_at(-1, t_out->sizes());
2920 int w_in = utils::val_at(-1, t_in->sizes());
2921 for (size_t i = 0; i < h_out; ++i) {
2922 for (size_t j = 0; j < w_out; ++j) {
2923 size_t idx_out = i * w_out + j;
2924 size_t idx_in = (i + h_offset) * w_in + (j + w_offset);
2925 CHECK_VALUE(index_data, idx_out, idx_in);
2926 CHECK_VALUE(output_data, idx_out, input_data[idx_in]);
2927 }
2928 }
2929 }
2930
TEST(VulkanComputeGraphOpsTest,max_pool2d_smoke_test)2931 TEST(VulkanComputeGraphOpsTest, max_pool2d_smoke_test) {
2932 std::vector<int64_t> kernel = {2, 3};
2933 test_max_pool2d(
2934 /*in_size = */ {1, 4, 6},
2935 /*base_val = */ 10.0f,
2936 kernel);
2937 }
2938
test_conv2d(const std::vector<int64_t> & original_sizes,const std::vector<int64_t> & padded_sizes,const std::vector<int64_t> & gpu_sizes,const bool transposed,const std::vector<float> & data_out_expected)2939 void test_conv2d(
2940 const std::vector<int64_t>& original_sizes,
2941 const std::vector<int64_t>& padded_sizes,
2942 const std::vector<int64_t>& gpu_sizes,
2943 const bool transposed,
2944 const std::vector<float>& data_out_expected) {
2945 vTensor vten = vTensor(
2946 context(),
2947 gpu_sizes,
2948 vkapi::kFloat,
2949 utils::StorageType::TEXTURE_2D,
2950 utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
2951
2952 // Create and fill input staging buffer
2953 const int64_t in_numel = utils::multiply_integers(original_sizes);
2954 StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel);
2955
2956 std::vector<float> data_in(in_numel);
2957 for (int i = 0; i < in_numel; i++) {
2958 data_in[i] = i + 1;
2959 }
2960 staging_buffer_in.copy_from(data_in.data(), sizeof(float) * in_numel);
2961
2962 // Output staging buffer
2963 const int64_t out_numel =
2964 padded_sizes[0] * padded_sizes[1] * original_sizes[2] * original_sizes[3];
2965 StagingBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel);
2966
2967 // Copy data in and out of the tensor
2968 record_conv2d_prepack_weights_op(
2969 context(), staging_buffer_in.buffer(), vten, original_sizes, transposed);
2970 record_image_to_nchw_op(context(), vten, staging_buffer_out.buffer());
2971
2972 // Execute command buffer
2973 submit_to_gpu();
2974
2975 // Extract data from output staging buffer
2976 std::vector<float> data_out(out_numel);
2977 staging_buffer_out.copy_to(data_out.data(), sizeof(float) * out_numel);
2978
2979 // Check data matches results copied from ATen-VK
2980 for (int i = 0; i < vten.numel(); i++) {
2981 CHECK_VALUE(data_out, i, data_out_expected[i]);
2982 }
2983 }
2984
TEST(VulkanComputeGraphOpsTest,conv2d_prepack_test)2985 TEST(VulkanComputeGraphOpsTest, conv2d_prepack_test) {
2986 test_conv2d(
2987 /*original_sizes = */ {2, 3, 1, 2},
2988 /*padded_sizes = */ {4, 4},
2989 /*gpu_sizes = */ {4, 1, 8},
2990 /*transposed = */ false,
2991 /*data_out_expected = */ {1, 3, 5, 0, 2, 4, 6, 0, 7, 9, 11,
2992 0, 8, 10, 12, 0, 0, 0, 0, 0, 0, 0,
2993 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
2994 test_conv2d(
2995 /*original_sizes = */ {2, 3, 1, 2},
2996 /*padded_sizes = */ {4, 4},
2997 /*gpu_sizes = */ {4, 1, 8},
2998 /*transposed = */ true,
2999 /*data_out_expected = */ {2, 8, 0, 0, 1, 7, 0, 0, 4, 10, 0,
3000 0, 3, 9, 0, 0, 6, 12, 0, 0, 5, 11,
3001 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
3002 }
3003
test_grid_priors(std::vector<int64_t> input_sizes,std::vector<int64_t> output_sizes,int stride,double offset,const std::vector<float> & data_out_expected)3004 void test_grid_priors(
3005 std::vector<int64_t> input_sizes,
3006 std::vector<int64_t> output_sizes,
3007 int stride,
3008 double offset,
3009 const std::vector<float>& data_out_expected) {
3010 GraphConfig config;
3011 ComputeGraph graph(config);
3012
3013 // Build graph
3014 IOValueRef in = graph.add_input_tensor(
3015 input_sizes,
3016 vkapi::kFloat,
3017 utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
3018 IOValueRef out;
3019 out.value = graph.add_tensor(
3020 output_sizes,
3021 vkapi::kFloat,
3022 utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
3023
3024 VK_GET_OP_FN("et_vk.grid_priors.default")
3025 (graph,
3026 {in.value,
3027 graph.add_scalar<int64_t>(stride),
3028 graph.add_scalar<double>(offset),
3029 out.value});
3030
3031 out.staging = graph.set_output_tensor(out.value);
3032
3033 graph.prepare();
3034 graph.encode_prepack();
3035 graph.prepack();
3036 graph.encode_execute();
3037
3038 vTensorPtr t_in = graph.get_tensor(in.value);
3039 vTensorPtr t_out = graph.get_tensor(out.value);
3040 // Resize input
3041 graph.propagate_resize();
3042
3043 // run graph
3044 graph.execute();
3045
3046 std::vector<float> output_data(t_out->staging_buffer_numel());
3047 graph.copy_from_staging(out.staging, output_data.data(), output_data.size());
3048
3049 // check results
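  // Each row of data_out_expected holds the (x, y) center of one anchor point;
  // judging from the expected values passed in by the test cases, the centers
  // are (w_idx + offset) * stride and (h_idx + offset) * stride.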
3050 int h_out = utils::val_at(-2, t_out->sizes());
3051 int w_out = utils::val_at(-1, t_out->sizes());
3052 for (size_t i = 0; i < h_out; ++i) {
3053 for (size_t j = 0; j < w_out; ++j) {
3054 size_t idx_out = i * w_out + j;
3055 CHECK_VALUE(output_data, idx_out, data_out_expected[idx_out]);
3056 }
3057 }
3058 }
3059
TEST(VulkanComputeGraphOpsTest,grid_priors_test)3060 TEST(VulkanComputeGraphOpsTest, grid_priors_test) {
3061 test_grid_priors(
3062 /*input size = */ {1, 5, 2, 3},
3063 /*output size = */ {6, 2},
3064 /*stride = */ 1,
3065 /*offset = */ 0.0,
3066 /*data_out_expected = */ {0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 2, 1});
3067
3068 test_grid_priors(
3069 /*input size = */ {1, 5, 2, 3},
3070 /*output size = */ {6, 2},
3071 /*stride = */ 8,
3072 /*offset = */ 0.5,
3073 /*data_out_expected = */ {4, 4, 12, 4, 20, 4, 4, 12, 12, 12, 20, 12});
3074 }
3075
test_transpose_view_mm(const int B,const int M,const int K,const int N,utils::StorageType storage_type)3076 void test_transpose_view_mm(
3077 const int B,
3078 const int M,
3079 const int K,
3080 const int N,
3081 utils::StorageType storage_type) {
3082 GraphConfig config;
3083 config.set_storage_type_override(storage_type);
3084 ComputeGraph graph(config);
3085
3086 std::vector<int64_t> mat1_size = {M, K};
3087 std::vector<int64_t> mat2_t_size = {N, K};
3088 std::vector<int64_t> out_size = {M, N};
3089
3090 std::vector<int64_t> mat1_small_size = {M - 4, K - 3};
3091 std::vector<int64_t> mat2_t_small_size = {N - 1, K - 3};
3092
3093 if (B > 1) {
3094 mat1_size.resize(3);
3095 mat1_size = {B, M, K};
3096 mat2_t_size.resize(3);
3097 mat2_t_size = {B, N, K};
3098 out_size.resize(3);
3099 out_size = {B, M, N};
3100
3101 mat1_small_size.resize(3);
3102 mat1_small_size = {B, M - 4, K - 3};
3103 mat2_t_small_size.resize(3);
3104 mat2_t_small_size = {B, N - 1, K - 3};
3105 }
3106
3107 // Build graph; use shared objects to test views of shared objects
3108
3109 IOValueRef mat1 =
3110 graph.add_input_tensor(mat1_size, vkapi::kFloat, utils::kWidthPacked, 0);
3111 IOValueRef mat2_transpose = graph.add_input_tensor(
3112 mat2_t_size, vkapi::kFloat, utils::kWidthPacked, 1);
3113
3114 ValueRef mat2 = graph.add_tensor_view(mat2_transpose.value);
3115
3116 ValueRef dim0;
3117 ValueRef dim1;
3118
3119 if (B > 1) {
3120 dim0 = graph.add_scalar<int64_t>(1);
3121 dim1 = graph.add_scalar<int64_t>(2);
3122 } else {
3123 dim0 = graph.add_scalar<int64_t>(0);
3124 dim1 = graph.add_scalar<int64_t>(1);
3125 }
3126
3127 IOValueRef out;
3128 out.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kWidthPacked, 2);
3129
3130 VK_GET_OP_FN("aten.transpose.int")
3131 (graph, {mat2_transpose.value, dim0, dim1, mat2});
3132 VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2, out.value});
3133
3134 out.staging = graph.set_output_tensor(out.value);
3135
3136 graph.prepare();
3137 graph.encode_prepack();
3138 graph.prepack();
3139 graph.encode_execute();
3140
3141 for (int i = 1; i < 4; i++) {
3142 float val_mat1 = i;
3143 float val_mat2 = i + 1;
3144 float val_out = K * (val_mat1 * val_mat2);
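    // With constant-filled inputs, each output element is a dot product over
    // the inner dimension; at the reduced sizes below the inner dim shrinks to
    // K - 3, hence the adjusted expectation.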
3145
3146 // Try at full size
3147 graph.resize_input(0, mat1_size);
3148 graph.resize_input(1, mat2_t_size);
3149 graph.propagate_resize();
3150 execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out});
3151
3152 // Try at reduced sizes
3153 val_out = (K - 3) * (val_mat1 * val_mat2);
3154 graph.resize_input(0, mat1_small_size);
3155 graph.resize_input(1, mat2_t_small_size);
3156 graph.propagate_resize();
3157 execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out});
3158 }
3159 }
3160
TEST(VulkanComputeGraphOpsTest,test_transpose_with_mm)3161 TEST(VulkanComputeGraphOpsTest, test_transpose_with_mm) {
3162 for (auto storage_type : {utils::kBuffer, utils::kTexture3D}) {
3163 test_transpose_view_mm(2, 7, 17, 5, storage_type);
3164 }
3165 }
3166
test_to_copy()3167 void test_to_copy() {
3168 GraphConfig config;
3169 config.set_storage_type_override(utils::kTexture3D);
3170 ComputeGraph graph(config);
3171 int M = 8;
3172 int N = 8;
3173 int K = 8;
3174 // Build graph
3175 IOValueRef in = graph.add_input_tensor(
3176 {1, M, N, K},
3177 vkapi::kFloat,
3178 utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
3179
3180 std::vector<float> data_in =
3181 create_random_float_buffer(M * N * K, -1024, 1024);
3182 graph.copy_into_staging(in.staging, data_in.data(), data_in.size());
3183
3184 IOValueRef out;
3185 out.value = graph.add_tensor(
3186 {1, M, N, K},
3187 vkapi::kHalf,
3188 utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
3189
3190 auto op = VK_GET_OP_FN("aten._to_copy.default");
3191 op(graph,
3192 {in.value,
3193 graph.add_none(),
3194 graph.add_none(),
3195 graph.add_none(),
3196 graph.add_none(),
3197 graph.add_none(),
3198 graph.add_none(),
3199 out.value});
3200
3201 out.staging = graph.set_output_tensor(out.value);
3202
3203 graph.prepare();
3204 graph.encode_prepack();
3205 graph.prepack();
3206 graph.encode_execute();
3207 graph.propagate_resize();
3208 graph.execute();
3209
3210 std::vector<torch::executor::Half> output_data(graph.numel_of(out.value));
3211 graph.copy_from_staging(out.staging, output_data.data(), output_data.size());
3212
3213 EXPECT_EQ(data_in.size(), output_data.size());
3214
3215 float mse_ex = 0.0f;
3216 float mse_vk = 0.0f;
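  // mse_ex: mean squared error of the reference torch::executor::Half
  // conversion vs. the fp32 input; mse_vk: mean squared error of the Vulkan
  // conversion vs. the fp32 input. Both are printed for manual comparison.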
3217
3218 // check results
3219 for (size_t i = 0; i < output_data.size(); ++i) {
3220 float input = data_in[i];
3221 torch::executor::Half expected_output =
3222 static_cast<torch::executor::Half>(input);
3223 uint16_t* expected_bits = reinterpret_cast<uint16_t*>(&expected_output);
3224 torch::executor::Half output = output_data[i];
3225 uint16_t* output_bits = reinterpret_cast<uint16_t*>(&output);
3226
3227 std::string msg;
3228 msg.reserve(64);
3229 msg = "input = " + std::to_string(input) + "(0b" +
3230 std::bitset<32>(*reinterpret_cast<uint32_t*>(&input)).to_string() +
3231 "), expected output = " + std::to_string(expected_output) + "(0b" +
3232 std::bitset<16>(*expected_bits).to_string() +
3233       "), received output = " + std::to_string(output) + "(0b" +
3234 std::bitset<16>(*output_bits).to_string() + ")";
3235
3236 std::cout << msg << std::endl;
3237
3238     // Note: Torch executor half "rounds up" when converting to fp16, whereas
3239     // most driver implementations of Vulkan's opFConvert() just truncate the
3240     // extra bits for performance (rounding introduces a conditional).
3241 // Example:
3242 // INPUT F32 = 25.248 (sign{0b0}, exp{0b10000011},
3243 // mantissa{0b10010011111101111100111}),
3244 // TORCH HALF OUTPUT F16 = 25.25 (sign{0b0}, exp{0b10011},
3245 // mantissa{0b1001010000}),
3246 // VULKAN OUTPUT F16 = 25.2344 (sign{0b0}, exp{0b10011},
3247 // mantissa{0b1001001111})
3248 // Note:
3249 // The vulkan mantissa exactly matches the first 10
3250 // bits of the input 23 bit mantissa. But since the 11th bit is 1, the
3251 // torch half output is rounded up (essentially adding a 1).
3252 // Vulkan mantissa{0b1001001111} + 1 = Torch half mantissa{0b1001010000}
3253
3254 EXPECT_TRUE(
3255 (*output_bits == *expected_bits) ||
3256 /*rounding error*/ ((*output_bits + 1u) == *expected_bits));
3257 mse_ex += std::pow(expected_output - input, 2);
3258 mse_vk += std::pow(output - input, 2);
3259 }
3260
3261 mse_ex /= output_data.size();
3262 mse_vk /= output_data.size();
3263 std::cout << "========================================================="
3264 << std::endl;
3265 std::cout << "mse_ex = " << mse_ex << ", mse_vk = " << mse_vk << std::endl;
3266 }
3267
TEST(VulkanComputeGraphOpsTest,test_to_copy)3268 TEST(VulkanComputeGraphOpsTest, test_to_copy) {
3269 if (context()->adapter_ptr()->supports_16bit_storage_buffers()) {
3270 test_to_copy();
3271 }
3272 }
3273