/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"

#include "tests/CL/CLAccessor.h"
#include "tests/framework/Macros.h"
#include "tests/validation/Validation.h"
#include "tests/validation/dynamic_fusion/Utils.h"
#include "tests/validation/reference/ConvolutionLayer.h"
#include "tests/validation/reference/DepthConvertLayer.h"
#include "tests/validation/reference/ElementwiseOperations.h"
#include "tests/validation/reference/Permute.h"

using namespace arm_compute::experimental::dynamic_fusion;
using namespace arm_compute::test::validation::utils;

namespace arm_compute
{
namespace test
{
namespace validation
{
TEST_SUITE(CL)
TEST_SUITE(INTEGRATION)
TEST_SUITE(DYNAMIC_FUSION)
TEST_CASE(Conv2d, framework::DatasetMode::ALL)
{
    /* Computation:
     * out = conv2d1x1(direct_conv)(input, weights, bias)
     */
    CLScheduler::get().default_reinit();

    const auto data_type      = DataType::F32;
    const auto data_layout    = DataLayout::NHWC;
    const auto t_input_shape  = TensorShape(384, 12, 12);
    const auto t_weight_shape = TensorShape(384, 1, 1, 16);
    const auto t_dst_shape    = TensorShape(16, 12, 12);
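    // Note: with DataLayout::NHWC the TensorShape dimensions are ordered (C, W, H [, N]), so the
    // input is a 12x12 plane with 384 channels and the weights are 16 filters of size 1x1x384,
    // producing the 12x12x16 destination shape above.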

    // Create a new workload sketch
    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
    GpuWorkloadSketch sketch{ &gpu_ctx };

    // Fuse conv2d
    Conv2dAttributes conv2d_attr{};
    TensorInfo       input_info  = sketch.create_tensor_info(t_input_shape, 1, data_type, data_layout);
    TensorInfo       weight_info = sketch.create_tensor_info(TensorInfo(t_weight_shape, 1, data_type, data_layout));

    ITensorInfo *conv_out_info = GpuConv2d::create_op(sketch, &input_info, &weight_info, nullptr, conv2d_attr);

    TensorInfo dst_info = sketch.create_tensor_info();
    GpuOutput::create_op(sketch, conv_out_info, &dst_info);
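    // create_op() returns the tensor info of the fused operator's (virtual) output, which can be
    // chained into further operators. GpuOutput then binds it to dst_info, whose shape and data
    // type are deduced during fusion (dst_info was created empty above).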

    // Configure runtime
    ClWorkloadRuntime runtime;
    runtime.configure(sketch);

    // (Important) Allocate auxiliary tensor memory if there are any
    // Instead of using ACL allocated memory, the user can choose to import memory into the tensors
    for(auto &data : runtime.get_auxiliary_tensors())
    {
        CLTensor     *tensor      = std::get<0>(data);
        TensorInfo    info        = std::get<1>(data);
        AuxMemoryInfo aux_mem_req = std::get<2>(data);
        tensor->allocator()->init(info, aux_mem_req.alignment);
        tensor->allocator()->allocate(); // Use ACL allocated memory
        // auto buf = cl::Buffer();
        // tensor->allocator()->import_memory(buf);  // Or, import external memory
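        // For illustration only (not exercised by this test; the buffer flags and size are
        // assumptions): an externally allocated buffer of at least info.total_size() bytes could
        // be imported instead of calling allocate(), e.g.
        // cl::Buffer ext_buf(CLScheduler::get().context(), CL_MEM_READ_WRITE, info.total_size());
        // tensor->allocator()->import_memory(ext_buf);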
    }

    // Construct user tensors
    CLTensor t_input{};
    CLTensor t_weight{};
    CLTensor t_dst{};

    // Initialize user tensors
    t_input.allocator()->init(input_info);
    t_weight.allocator()->init(weight_info);
    t_dst.allocator()->init(dst_info);

    // Allocate and fill user tensors
    // Instead of using ACL allocator, the user can choose to import memory into the tensors
    t_input.allocator()->allocate();
    t_weight.allocator()->allocate();
    t_dst.allocator()->allocate();
    fill<float>(CLAccessor(t_input), 0, library.get());
    fill<float>(CLAccessor(t_weight), 1, library.get());

    // Run runtime
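    // The user (non-auxiliary) tensors are passed in the same order as their tensor infos were
    // created in the sketch: inputs and weights first, then outputs.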
    runtime.run({ &t_input, &t_weight, &t_dst });

    // Create reference
    SimpleTensor<float> ref_t_input{ t_input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
    SimpleTensor<float> ref_t_weight{ t_weight_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
    SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };

    // Fill reference
    fill<float>(ref_t_input, 0, library.get());
    fill<float>(ref_t_weight, 1, library.get());

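    // The reference convolution_layer operates on NCHW data, so the NHWC inputs are permuted to
    // NCHW before computing the reference; validate() accounts for the target tensor's NHWC
    // layout, which is why the comparison below is done against the NCHW reference output.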
    auto ref_t_input_nchw            = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U));
    auto ref_t_weight_nchw           = reference::permute(ref_t_weight, PermutationVector(1U, 2U, 0U));
    auto ref_t_bias_placeholder_nchw = reference::permute(ref_t_bias_placeholder, PermutationVector(1U, 2U, 0U));
    auto t_dst_shape_nchw            = t_dst_shape;
    permute(t_dst_shape_nchw, PermutationVector(1U, 2U, 0U));

    PadStrideInfo legacy_pad_stride(conv2d_attr.stride().x(), conv2d_attr.stride().y(), conv2d_attr.pad().left, conv2d_attr.pad().right, conv2d_attr.pad().top, conv2d_attr.pad().bottom,
                                    DimensionRoundingType{});
    auto       ref_t_dst_nchw = reference::convolution_layer(ref_t_input_nchw, ref_t_weight_nchw, ref_t_bias_placeholder_nchw, t_dst_shape_nchw, legacy_pad_stride, conv2d_attr.dilation());
    const auto ref_t_dst      = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U));

    RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
    validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
}
TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL)
{
    /* Computation:
     *   out_0 = in_0 + in_1
     *   out_1 = out_0 + in_2
     */
    CLScheduler::get().default_reinit();

    const auto data_type     = DataType::F32;
    const auto t_input_shape = TensorShape(33, 3, 2);

    // Create a new workload sketch
    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
    GpuWorkloadSketch sketch{ &gpu_ctx };

    TensorInfo in_0_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
    TensorInfo in_1_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
    TensorInfo in_2_info = sketch.create_tensor_info(t_input_shape, 1, data_type);

    TensorInfo out_0_info = sketch.create_tensor_info();
    TensorInfo out_1_info = sketch.create_tensor_info();

    ITensorInfo *ans_0_info = GpuAdd::create_op(sketch, &in_0_info, &in_1_info);
    GpuOutput::create_op(sketch, ans_0_info, &out_0_info);
    ITensorInfo *ans_1_info = GpuAdd::create_op(sketch, ans_0_info, &in_2_info);
    GpuOutput::create_op(sketch, ans_1_info, &out_1_info);
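    // ans_0_info is both written out through out_0_info and consumed by the second add, so the two
    // elementwise additions are fused into a single workload with two user-visible outputs.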

    // Configure runtime
    ClWorkloadRuntime runtime;
    runtime.configure(sketch);

    // (Important) Allocate auxiliary tensor memory if there are any
    // Instead of using ACL allocated memory, the user can choose to import memory into the tensors
    for(auto &data : runtime.get_auxiliary_tensors())
    {
        CLTensor     *tensor      = std::get<0>(data);
        TensorInfo    info        = std::get<1>(data);
        AuxMemoryInfo aux_mem_req = std::get<2>(data);
        tensor->allocator()->init(info, aux_mem_req.alignment);
        tensor->allocator()->allocate(); // Use ACL allocated memory
        // auto buf = cl::Buffer();
        // tensor->allocator()->import_memory(buf);  // Or, import external memory
    }

    // Construct user tensors
    CLTensor t_in_0{};
    CLTensor t_in_1{};
    CLTensor t_in_2{};

    CLTensor t_out_0{};
    CLTensor t_out_1{};

    // Initialize user tensors
    t_in_0.allocator()->init(in_0_info);
    t_in_1.allocator()->init(in_1_info);
    t_in_2.allocator()->init(in_2_info);

    t_out_0.allocator()->init(out_0_info);
    t_out_1.allocator()->init(out_1_info);

    // Allocate and fill user tensors
    // Instead of using ACL allocator, the user can choose to import memory into the tensors
    t_in_0.allocator()->allocate();
    t_in_1.allocator()->allocate();
    t_in_2.allocator()->allocate();

    t_out_0.allocator()->allocate();
    t_out_1.allocator()->allocate();

    fill<float>(CLAccessor(t_in_0), 0, library.get());
    fill<float>(CLAccessor(t_in_1), 1, library.get());
    fill<float>(CLAccessor(t_in_2), 2, library.get());

    // Run runtime
    runtime.run({ &t_in_0, &t_in_1, &t_in_2, &t_out_0, &t_out_1 });

    // Create reference
    SimpleTensor<float> ref_t_in_0{ t_input_shape, data_type, 1, QuantizationInfo() };
    SimpleTensor<float> ref_t_in_1{ t_input_shape, data_type, 1, QuantizationInfo() };
    SimpleTensor<float> ref_t_in_2{ t_input_shape, data_type, 1, QuantizationInfo() };

    SimpleTensor<float> ref_t_out_0{ t_input_shape, data_type, 1, QuantizationInfo() };
    SimpleTensor<float> ref_t_out_1{ t_input_shape, data_type, 1, QuantizationInfo() };

    // Fill reference
    fill<float>(ref_t_in_0, 0, library.get());
    fill<float>(ref_t_in_1, 1, library.get());
    fill<float>(ref_t_in_2, 2, library.get());

    reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_in_0, ref_t_in_1, ref_t_out_0, ConvertPolicy::WRAP);
    reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_out_0, ref_t_in_2, ref_t_out_1, ConvertPolicy::WRAP);

    RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
    validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_f32);
    validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_f32);
}
TEST_CASE(Add_Output_Add_Cast_Cast_Output, framework::DatasetMode::ALL)
{
    /* Computation:
     *   out_0 = in_0 + in_1
     *   out_1 = float(int32_t(out_0 + in_2))
     */
    CLScheduler::get().default_reinit();

    const auto data_type     = DataType::F32;
    const auto t_input_shape = TensorShape(3, 8, 5);

    // Create a new workload sketch
    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
    GpuWorkloadSketch sketch{ &gpu_ctx };

    TensorInfo in_0_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
    TensorInfo in_1_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
    TensorInfo in_2_info = sketch.create_tensor_info(t_input_shape, 1, data_type);

    TensorInfo out_0_info = sketch.create_tensor_info();
    TensorInfo out_1_info = sketch.create_tensor_info();

    CastAttributes cast_0_attr;
    cast_0_attr.data_type(DataType::S32).convert_policy(ConvertPolicy::SATURATE);

    CastAttributes cast_1_attr;
    cast_1_attr.data_type(DataType::F32).convert_policy(ConvertPolicy::SATURATE);
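    // The two casts round-trip the second sum through S32 (saturating F32 -> S32 -> F32), which
    // realises the float(int32_t(...)) truncation in the computation comment above; the reference
    // below mirrors this with two depth_convert calls.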

    ITensorInfo *ans_0_info = GpuAdd::create_op(sketch, &in_0_info, &in_1_info);
    GpuOutput::create_op(sketch, ans_0_info, &out_0_info);
    ITensorInfo *ans_1_info = GpuAdd::create_op(sketch, ans_0_info, &in_2_info);
    ITensorInfo *ans_2_info = GpuCast::create_op(sketch, ans_1_info, cast_0_attr);
    ITensorInfo *ans_3_info = GpuCast::create_op(sketch, ans_2_info, cast_1_attr);
    GpuOutput::create_op(sketch, ans_3_info, &out_1_info);

    // Configure runtime
    ClWorkloadRuntime runtime;
    runtime.configure(sketch);

    // (Important) Allocate auxiliary tensor memory if there are any
    // Instead of using ACL allocated memory, the user can choose to import memory into the tensors
    for(auto &data : runtime.get_auxiliary_tensors())
    {
        CLTensor     *tensor      = std::get<0>(data);
        TensorInfo    info        = std::get<1>(data);
        AuxMemoryInfo aux_mem_req = std::get<2>(data);
        tensor->allocator()->init(info, aux_mem_req.alignment);
        tensor->allocator()->allocate(); // Use ACL allocated memory
        // auto buf = cl::Buffer();
        // tensor->allocator()->import_memory(buf);  // Or, import external memory
    }

    // Construct user tensors
    CLTensor t_in_0{};
    CLTensor t_in_1{};
    CLTensor t_in_2{};

    CLTensor t_out_0{};
    CLTensor t_out_1{};

    // Initialize user tensors
    t_in_0.allocator()->init(in_0_info);
    t_in_1.allocator()->init(in_1_info);
    t_in_2.allocator()->init(in_2_info);

    t_out_0.allocator()->init(out_0_info);
    t_out_1.allocator()->init(out_1_info);

    // Allocate and fill user tensors
    // Instead of using ACL allocator, the user can choose to import memory into the tensors
    t_in_0.allocator()->allocate();
    t_in_1.allocator()->allocate();
    t_in_2.allocator()->allocate();

    t_out_0.allocator()->allocate();
    t_out_1.allocator()->allocate();

    fill<float>(CLAccessor(t_in_0), 0, library.get());
    fill<float>(CLAccessor(t_in_1), 1, library.get());
    fill<float>(CLAccessor(t_in_2), 2, library.get());

    // Run runtime
    runtime.run({ &t_in_0, &t_in_1, &t_in_2, &t_out_0, &t_out_1 });

    // Create reference
    SimpleTensor<float> ref_t_in_0{ t_input_shape, data_type, 1, QuantizationInfo() };
    SimpleTensor<float> ref_t_in_1{ t_input_shape, data_type, 1, QuantizationInfo() };
    SimpleTensor<float> ref_t_in_2{ t_input_shape, data_type, 1, QuantizationInfo() };

    SimpleTensor<float> ref_t_out_0{ t_input_shape, data_type, 1, QuantizationInfo() };
    SimpleTensor<float> ref_t_ans_1{ t_input_shape, data_type, 1, QuantizationInfo() };

    // Fill reference
    fill<float>(ref_t_in_0, 0, library.get());
    fill<float>(ref_t_in_1, 1, library.get());
    fill<float>(ref_t_in_2, 2, library.get());

    reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_in_0, ref_t_in_1, ref_t_out_0, ConvertPolicy::WRAP);
    reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_out_0, ref_t_in_2, ref_t_ans_1, ConvertPolicy::WRAP);
    const auto ref_t_ans_2 = reference::depth_convert<float, int32_t>(ref_t_ans_1, DataType::S32, ConvertPolicy::SATURATE, 0);
    const auto ref_t_out_1 = reference::depth_convert<int32_t, float>(ref_t_ans_2, DataType::F32, ConvertPolicy::SATURATE, 0);

    RelativeTolerance<float> tolerance_add_f32(0.001f);
    AbsoluteTolerance<float> tolerance_cast_f32(1.0f);
    validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_add_f32);
    validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_cast_f32);
}
TEST_SUITE(Invalid_Fusion_Should_Fail)
TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
{
    /* Computation:
     * out = conv2d(conv2d(l0_input, l0_weight), l1_weight)
     */
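    // This sketch attempts to fuse two convolutions back to back; validate_op() is expected to
    // reject the second one, as checked at the end of this test case.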
    CLScheduler::get().default_reinit();

    const auto data_type      = DataType::F32;
    const auto data_layout    = DataLayout::NHWC;
    const auto t_input_shape  = TensorShape(384, 12, 12);
    const auto t_weight_shape = TensorShape(384, 1, 1, 16);
    auto       t_input_info   = TensorInfo(t_input_shape, 1, data_type, data_layout);
    auto       t_weight_info  = TensorInfo(t_weight_shape, 1, data_type, data_layout);
    auto       t_dst_info     = TensorInfo();

    Conv2dAttributes conv2d_attr{};

    // Create a new workload sketch
    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
    GpuWorkloadSketch sketch{ &gpu_ctx };

    // Create tensor infos
    TensorInfo   input_info  = sketch.create_tensor_info(t_input_shape, 1, data_type, data_layout);
    TensorInfo   weight_info = sketch.create_tensor_info(TensorInfo(t_weight_shape, 1, data_type, data_layout));
    ITensorInfo *dst_info;

    // Fuse conv2d into the workload
    {
        // Validate operator
        const Status success = GpuConv2d::validate_op(sketch, &input_info, &weight_info, nullptr, conv2d_attr);
        ARM_COMPUTE_EXPECT(bool(success), framework::LogLevel::ERRORS);

        dst_info = GpuConv2d::create_op(sketch, &input_info, &weight_info, nullptr, conv2d_attr);
    }

    // Create tensor infos
    TensorInfo weight_info_2 = sketch.create_tensor_info(t_weight_info);

    // Fuse conv2d into the workload
    {
        // Validate operator, should fail
        const Status success            = GpuConv2d::validate_op(sketch, dst_info, &weight_info_2, nullptr, conv2d_attr);
        const auto   expected_error_str = "Operator fusion test failed. This operator cannot be fused into the workload";

        ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
        ARM_COMPUTE_EXPECT((success.error_description().find(expected_error_str) != std::string::npos), framework::LogLevel::ERRORS);
    }
}
TEST_SUITE_END() // Invalid_Fusion_Should_Fail
TEST_SUITE_END() // DYNAMIC_FUSION
TEST_SUITE_END() // INTEGRATION
TEST_SUITE_END() // CL
} // namespace validation
} // namespace test
} // namespace arm_compute