// xref: /aosp_15_r20/external/ComputeLibrary/src/gpu/cl/ClKernelLibrary.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2016-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "src/gpu/cl/ClKernelLibrary.h"
25 
26 #include "arm_compute/core/Error.h"
27 #include "arm_compute/core/Utils.h"
28 
29 #include <algorithm>
30 #include <array>
31 #include <fstream>
32 #include <utility>
33 
34 #ifdef ARM_COMPUTE_COMPRESSED_KERNELS
35 #include <zlib.h>
36 
37 namespace
38 {
/* Decoding table
 *
 * Indexed by the unsigned byte value of a symbol; yields its 6-bit base64 value.
 * Bytes outside the base64 alphabet map to 0, so the table alone cannot tell 'A' from an invalid byte.
 */
constexpr std::array<uint8_t, 256> b64_invtab =
{
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63,
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, 0, 0, 0, 0,
    0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0,
    0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};

/** Decode a base64 encoded string
 *
 * The input must use the standard alphabet (A-Z, a-z, 0-9, '+', '/'), have a length that is a
 * multiple of 4, and may end with one or two '=' padding symbols.
 *
 * @param[in] str Base64 encoded string to decode
 *
 * @return The decoded string in case of a valid, non-empty string, otherwise an empty string
 */
std::string decode_base64(const std::string &str)
{
    constexpr char pad_char = '=';

    // Reject empty input and input that is not a whole number of 4-symbol groups
    if(str.empty() || (str.length() % 4) != 0)
    {
        return {};
    }

    // Padding can only occupy the last one or two positions of the final group
    std::size_t padding = 0;
    if(str.back() == pad_char)
    {
        padding = (str[str.size() - 2] == pad_char) ? 2 : 1;
    }

    const std::size_t num_groups  = str.size() / 4;
    const std::size_t full_groups = (padding == 0) ? num_groups : num_groups - 1;

    // Translate `count` symbols starting at `pos` into 6-bit values.
    // Returns false as soon as a symbol outside the base64 alphabet (including a misplaced '=') is met.
    const auto read_group = [&str](std::size_t pos, std::size_t count, std::array<unsigned int, 4> &sextets)
    {
        for(std::size_t i = 0; i < count; ++i)
        {
            const char symbol      = str[pos + i];
            const bool in_alphabet = (symbol >= 'A' && symbol <= 'Z') || (symbol >= 'a' && symbol <= 'z')
                                     || (symbol >= '0' && symbol <= '9') || symbol == '+' || symbol == '/';
            if(!in_alphabet)
            {
                return false;
            }
            // Index through unsigned char: a plain char may be signed, and a negative value
            // would be converted to a huge index and read outside the table.
            sextets[i] = b64_invtab[static_cast<unsigned char>(symbol)];
        }
        return true;
    };

    // Keep only the low 8 bits of a recombined value
    const auto to_byte = [](unsigned int value)
    {
        return static_cast<char>(value & 0xFFu);
    };

    // Reserve memory for the decoded string
    // Note each 4 consecutive elements of 6-bit encode 3 bytes
    std::string dec_b64;
    dec_b64.reserve(num_groups * 3);

    // Decode all groups that carry no padding
    std::array<unsigned int, 4> sextets{};
    std::size_t                 pos = 0;
    for(std::size_t group = 0; group < full_groups; ++group, pos += 4)
    {
        if(!read_group(pos, 4, sextets))
        {
            return {};
        }

        dec_b64.push_back(to_byte((sextets[0] << 2) | (sextets[1] >> 4)));
        dec_b64.push_back(to_byte((sextets[1] << 4) | (sextets[2] >> 2)));
        dec_b64.push_back(to_byte((sextets[2] << 6) | sextets[3]));
    }

    // Final group with padding: 3 symbols encode 2 bytes, 2 symbols encode 1 byte
    if(padding != 0)
    {
        if(!read_group(pos, 4 - padding, sextets))
        {
            return {};
        }

        dec_b64.push_back(to_byte((sextets[0] << 2) | (sextets[1] >> 4)));
        if(padding == 1)
        {
            dec_b64.push_back(to_byte((sextets[1] << 4) | (sextets[2] >> 2)));
        }
    }

    return dec_b64;
}
127 
128 /** Decompress a zlib compressed string
129  *
130  * @param[in] str ZLib compressed string
131  *
132  * @return The decompressed string if successful, otherwise false.
133  */
decompress_zlib(const std::string & str)134 std::string decompress_zlib(const std::string &str)
135 {
136     // Create and initialize decompression stream
137     z_stream ds{};
138     if(inflateInit(&ds) != Z_OK)
139     {
140         return std::string();
141     }
142     ds.avail_in = str.size();
143     ds.next_in  = (Bytef *)str.data();
144 
145     // Roll-over the string using a buffer and decompress
146     int         status = Z_OK;
147     char        roll_buff[16384];
148     std::string inflated_str;
149     do
150     {
151         ds.avail_out = sizeof(roll_buff);
152         ds.next_out  = reinterpret_cast<Bytef *>(roll_buff);
153 
154         status = inflate(&ds, 0);
155         if(inflated_str.size() < ds.total_out)
156         {
157             inflated_str.append(roll_buff, ds.total_out - inflated_str.size());
158         }
159     }
160     while(status == Z_OK);
161 
162     // Finalize decompression stream
163     inflateEnd(&ds);
164     if(status != Z_STREAM_END)
165     {
166         return std::string();
167     }
168 
169     return inflated_str;
170 }
171 } // namespace
172 #endif /* ARM_COMPUTE_COMPRESSED_KERNELS */
173 
174 namespace arm_compute
175 {
176 namespace opencl
177 {
178 const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
179 {
180     // Common Kernels
181     { "activation_layer", "common/activation_layer.cl" },
182     { "activation_layer_quant", "common/activation_layer_quant.cl" },
183     { "activation_layer_quant_f32", "common/activation_layer_quant.cl" },
184     { "arg_min_max_x", "common/arg_min_max.cl" },
185     { "arg_min_max_y", "common/arg_min_max.cl" },
186     { "arg_min_max_z", "common/arg_min_max.cl" },
187     { "arg_min_max_w", "common/arg_min_max.cl" },
188     { "bitwise_or", "common/bitwise_op.cl" },
189     { "bitwise_and", "common/bitwise_op.cl" },
190     { "bitwise_xor", "common/bitwise_op.cl" },
191     { "bitwise_not", "common/bitwise_op.cl" },
192     { "bounding_box_transform", "common/bounding_box_transform.cl" },
193     { "bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl" },
194     { "compare_equal", "common/comparisons.cl" },
195     { "compare_equal_quantized", "common/comparisons.cl" },
196     { "compare_notequal", "common/comparisons.cl" },
197     { "compare_notequal_quantized", "common/comparisons.cl" },
198     { "compare_greater", "common/comparisons.cl" },
199     { "compare_greater_quantized", "common/comparisons.cl" },
200     { "compare_greaterequal", "common/comparisons.cl" },
201     { "compare_greaterequal_quantized", "common/comparisons.cl" },
202     { "compare_less", "common/comparisons.cl" },
203     { "compare_less_quantized", "common/comparisons.cl" },
204     { "compare_lessequal", "common/comparisons.cl" },
205     { "compare_lessequal_quantized", "common/comparisons.cl" },
206     { "concatenate", "common/concatenate.cl" },
207     { "concatenate_width", "common/concatenate.cl" },
208     { "concatenate_height", "common/concatenate.cl" },
209     { "concatenate_width_x2", "common/concatenate.cl" },
210     { "concatenate_width_x4", "common/concatenate.cl" },
211     { "col2im", "common/col2im.cl" },
212     { "cast_down", "common/cast.cl" },
213     { "cast_up", "common/cast.cl" },
214     { "convert_fc_weights", "common/convert_fc_weights.cl" },
215     { "copy_tensor", "common/copy_tensor.cl" },
216     { "crop_tensor", "common/crop_tensor.cl" },
217     { "deconvolution_reshape", "common/deconvolution_layer.cl" },
218     { "deconvolution_upsample", "common/deconvolution_layer.cl" },
219     { "dequantization_layer", "common/dequantization_layer.cl" },
220     { "elementwise_operation_ADD", "common/elementwise_operation.cl" },
221     { "elementwise_operation_SUB", "common/elementwise_operation.cl" },
222     { "elementwise_operation_MAX", "common/elementwise_operation.cl" },
223     { "elementwise_operation_MIN", "common/elementwise_operation.cl" },
224     { "elementwise_operation_DIV", "common/elementwise_operation.cl" },
225     { "elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl" },
226     { "elementwise_operation_POWER", "common/elementwise_operation.cl" },
227     { "elementwise_operation_PRELU", "common/elementwise_operation.cl" },
228     { "elementwise_operation_AND", "common/elementwise_operation.cl" },
229     { "elementwise_operation_OR", "common/elementwise_operation.cl" },
230     { "elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl" },
231     { "elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl" },
232     { "elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl" },
233     { "elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl" },
234     { "elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl" },
235     { "elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl" },
236     { "elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl" },
237     { "elementwise_unary", "common/elementwise_unary.cl" },
238     { "fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl" },
239     { "fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl" },
240     { "fft_radix_2_first_stage_axis_0", "common/fft.cl" },
241     { "fft_radix_2_first_stage_axis_1", "common/fft.cl" },
242     { "fft_radix_2_axis_0", "common/fft.cl" },
243     { "fft_radix_2_axis_1", "common/fft.cl" },
244     { "fft_radix_3_first_stage_axis_0", "common/fft.cl" },
245     { "fft_radix_3_first_stage_axis_1", "common/fft.cl" },
246     { "fft_radix_3_axis_0", "common/fft.cl" },
247     { "fft_radix_3_axis_1", "common/fft.cl" },
248     { "fft_radix_4_first_stage_axis_0", "common/fft.cl" },
249     { "fft_radix_4_first_stage_axis_1", "common/fft.cl" },
250     { "fft_radix_4_axis_0", "common/fft.cl" },
251     { "fft_radix_4_axis_1", "common/fft.cl" },
252     { "fft_radix_5_first_stage_axis_0", "common/fft.cl" },
253     { "fft_radix_5_first_stage_axis_1", "common/fft.cl" },
254     { "fft_radix_5_axis_0", "common/fft.cl" },
255     { "fft_radix_5_axis_1", "common/fft.cl" },
256     { "fft_radix_7_first_stage_axis_0", "common/fft.cl" },
257     { "fft_radix_7_first_stage_axis_1", "common/fft.cl" },
258     { "fft_radix_7_axis_0", "common/fft.cl" },
259     { "fft_radix_7_axis_1", "common/fft.cl" },
260     { "fft_radix_8_first_stage_axis_0", "common/fft.cl" },
261     { "fft_radix_8_first_stage_axis_1", "common/fft.cl" },
262     { "fft_radix_8_axis_0", "common/fft.cl" },
263     { "fft_radix_8_axis_1", "common/fft.cl" },
264     { "fft_scale_conj", "common/fft_scale.cl" },
265     { "fill_image_borders_constant", "common/fill_border.cl" },
266     { "fill_image_borders_replicate", "common/fill_border.cl" },
267     { "floor_layer", "common/floor.cl" },
268     { "fuse_batchnormalization_layer", "common/batchnormalization_layer.cl" },
269     { "gather", "common/gather.cl" },
270     { "gemm_ma_f16", "common/gemm.cl" },
271     { "gemm_ma_f32", "common/gemm.cl" },
272     { "gemm_mv", "common/gemv.cl" },
273     { "gemm_mv_quantized", "common/gemv.cl" },
274     { "gemm_mm_native", "common/gemm.cl" },
275     { "gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl" },
276     { "gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl" },
277     { "gemm_mm_native_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl" },
278     { "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" },
279     { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" },
280     { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" },
281     { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" },
282     { "gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
283     { "gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
284     { "gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
285     { "gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
286     { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" },
287     { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" },
288     { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" },
289     { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" },
290     { "gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
291     { "gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
292     { "gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
293     { "gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
294     { "gemm_lc_vm_f32", "common/gemm.cl" },
295     { "gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl" },
296     { "gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl" },
297     { "gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl" },
298     { "gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl" },
299     { "gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" },
300     { "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" },
301     { "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" },
302     { "gemmlowp_mm_native", "common/gemmlowp.cl" },
303     { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl" },
304     { "gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl" },
305     { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl" },
306     { "gemmlowp_mm_reshaped_only_rhs_mmul", "common/gemmlowp_reshaped_only_rhs_mmul.cl" },
307     { "gemmlowp_offset_contribution", "common/gemmlowp.cl" },
308     { "gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl" },
309     { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl" },
310     { "gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl" },
311     { "gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl" },
312     { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl" },
313     { "gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl" },
314     { "generate_proposals_compute_all_anchors", "common/generate_proposals.cl" },
315     { "generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl" },
316     { "instance_normalization", "common/instance_normalization.cl" },
317     { "compute_mean_var", "common/instance_normalization.cl" },
318     { "l2_normalize_x", "common/l2_normalize.cl" },
319     { "l2_normalize_y", "common/l2_normalize.cl" },
320     { "l2_normalize_z", "common/l2_normalize.cl" },
321     { "max_unpooling_layer_2", "common/unpooling_layer.cl" },
322     { "mean_stddev_normalization", "common/mean_stddev_normalization.cl" },
323     { "memset", "common/memset.cl" },
324     { "minmax_layer", "common/minmax_layer.cl" },
325     { "non_max_suppression", "common/nonmax.cl" },
326     { "pad_layer_constant", "common/pad_layer.cl" },
327     { "pad_layer_symmetric_reflect", "common/pad_layer.cl" },
328     { "permute", "common/permute.cl" },
329     { "pixelwise_mul_complex", "common/pixelwise_mul_float.cl" },
330     { "pixelwise_mul_float", "common/pixelwise_mul_float.cl" },
331     { "pixelwise_mul_int", "common/pixelwise_mul_int.cl" },
332     { "pixelwise_mul_quantized", "common/pixelwise_mul_int.cl" },
333     { "qlstm_layer_normalization", "common/qlstm_layer_normalization.cl" },
334     { "quantization_layer", "common/quantization_layer.cl" },
335     { "range", "common/range.cl" },
336     { "range_quantized", "common/range.cl" },
337     { "reduction_operation_x", "common/reduction_operation.cl" },
338     { "reduction_operation_non_parallel_x", "common/reduction_operation.cl" },
339     { "reduction_operation_y", "common/reduction_operation.cl" },
340     { "reduction_operation_z", "common/reduction_operation.cl" },
341     { "reduction_operation_w", "common/reduction_operation.cl" },
342     { "reshape_layer", "common/reshape_layer.cl" },
343     { "reshape_to_columns", "common/convolution_layer.cl" },
344     { "reverse", "common/reverse.cl" },
345     { "roi_align_layer", "common/roi_align_layer.cl" },
346     { "roi_align_layer_quantized", "common/roi_align_layer_quantized.cl" },
347     { "roi_pooling_layer", "common/roi_pooling_layer.cl" },
348     { "select_same_rank", "common/select.cl" },
349     { "select_different_rank_2", "common/select.cl" },
350     { "select_different_rank_n", "common/select.cl" },
351     { "softmax_layer_norm", "common/softmax_layer.cl" },
352     { "softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl" },
353     { "softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl" },
354     { "softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl" },
355     { "softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl" },
356     { "softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl" },
357     { "stack_layer", "common/stack_layer.cl" },
358     { "strided_slice", "common/slice_ops.cl" },
359     { "tile", "common/tile.cl" },
360     { "transpose", "common/transpose.cl" },
361 #ifdef ENABLE_NCHW_KERNELS
362     { "batch_to_space_nchw", "nchw/batch_to_space.cl" },
363     { "batch_to_space_static_nchw", "nchw/batch_to_space.cl" },
364     { "batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl" },
365     { "channel_shuffle_nchw", "nchw/channel_shuffle.cl" },
366     { "depth_to_space_nchw", "nchw/depth_to_space.cl" },
367     { "dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl" },
368     { "direct_convolution1x1", "nchw/direct_convolution1x1.cl" },
369     { "direct_convolution_nchw", "nchw/direct_convolution.cl" },
370 
371     { "im2col1x1_stridex1_nchw", "nchw/im2col.cl" },
372     { "im2col3x3_nchw", "nchw/im2col.cl" },
373     { "im2col5x5_nchw", "nchw/im2col.cl" },
374     { "im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl" },
375     { "im2col_generic_nchw", "nchw/im2col.cl" },
376     { "im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl" },
377     { "normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl" },
378     { "normalization_layer_in_map_nchw", "nchw/normalization_layer.cl" },
379     { "normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl" },
380     { "normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl" },
381     { "pooling_layer_MxN_nchw", "nchw/pooling_layer.cl" },
382     { "pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl" },
383     { "prior_box_layer_nchw", "nchw/prior_box_layer.cl" },
384     { "reorg_layer_nchw", "nchw/reorg_layer.cl" },
385     { "scale_nearest_neighbour_nchw", "nchw/scale.cl" },
386     { "scale_bilinear_nchw", "nchw/scale.cl" },
387     { "space_to_batch_nchw", "nchw/space_to_batch.cl" },
388     { "space_to_batch_static_nchw", "nchw/space_to_batch.cl" },
389     { "space_to_depth_nchw", "nchw/space_to_depth.cl" },
390     { "upsample_layer_nchw", "nchw/upsample_layer.cl" },
391     { "winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl" },
392     { "winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl" },
393     { "winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl" },
394     { "winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl" },
395     { "winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl" },
396     { "winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl" },
397     { "winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl" },
398     { "winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl" },
399     { "winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl" },
400     { "winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
401     { "winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl" },
402     { "winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" },
403     { "winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl" },
404     { "winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
405     { "winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl" },
406     { "winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
407     { "winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" },
408     { "winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
409     { "winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl" },
410     { "winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl" },
411     { "winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl" },
412     { "winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl" },
413     { "winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl" },
414     { "winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl" },
415     { "winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl" },
416     { "winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl" },
417     { "winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl" },
418     { "winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl" },
419     { "winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl" },
420     { "winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl" },
421 #endif /* ENABLE_NCHW_KERNELS */
422 #ifdef ENABLE_NHWC_KERNELS
423     { "batch_to_space_nhwc", "nhwc/batch_to_space.cl" },
424     { "batch_to_space_static_nhwc", "nhwc/batch_to_space.cl" },
425     { "batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl" },
426     { "channel_shuffle_nhwc", "nhwc/channel_shuffle.cl" },
427     { "depth_to_space_nhwc", "nhwc/depth_to_space.cl" },
428     { "dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl" },
429     { "dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl" },
430     { "dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl" },
431     { "direct_convolution_nhwc", "nhwc/direct_convolution.cl" },
432     { "direct_convolution3d_ndhwc", "nhwc/direct_convolution3d.cl" },
433     { "im2col3x3_nhwc", "nhwc/im2col.cl" },
434     { "im2col9x9_nhwc", "nhwc/im2col.cl" },
435     { "im2col_generic_nhwc", "nhwc/im2col.cl" },
436     { "indirect_convolution_nhwc", "nhwc/indirect_convolution.cl" },
437     { "indirect_convolution_address_precalculation", "nhwc/indirect_convolution.cl" },
438     { "normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl" },
439     { "normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl" },
440     { "normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl" },
441     { "normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl" },
442     { "pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl" },
443     { "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" },
444     { "pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl" },
445     { "pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl" },
446     { "pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl" },
447     { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" },
448     { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" },
449     { "scale_bilinear_nhwc", "nhwc/scale.cl" },
450     { "space_to_batch_nhwc", "nhwc/space_to_batch.cl" },
451     { "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" },
452     { "space_to_depth_nhwc", "nhwc/space_to_depth.cl" },
453     { "transposed_convolution_nhwc", "nhwc/transposed_convolution.cl" },
454     { "upsample_layer_nhwc", "nhwc/upsample_layer.cl" },
455     { "winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl" },
456     { "winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl" },
457     { "winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl" },
458     { "winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl" },
459     { "winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl" },
460     { "winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl" },
461     { "winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl" },
462     { "winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl" },
463     { "winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl" },
464     { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
465     { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
466     { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
467     { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
468     { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
469     { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
470     { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
471     { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
472     { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
473     { "winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl" },
474     { "winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl" },
475     { "winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl" },
476     { "winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl" },
477     { "winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl" },
478     { "winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl" },
479     { "winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl" },
480     { "winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl" },
481     { "winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl" },
482 #endif /* ENABLE_NHWC_KERNELS */
483 };
484 
485 const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
486 {
487 #ifdef EMBEDDED_KERNELS
488     {
489         "activation_float_helpers.h",
490 #include "./cl_kernels/activation_float_helpers.hembed"
491     },
492     {
493         "activation_quant_helpers.h",
494 #include "./cl_kernels/activation_quant_helpers.hembed"
495     },
496     {
497         "common/activation_layer.cl",
498 #include "./cl_kernels/common/activation_layer.clembed"
499     },
500     {
501         "common/activation_layer_quant.cl",
502 #include "./cl_kernels/common/activation_layer_quant.clembed"
503     },
504     {
505         "common/arg_min_max.cl",
506 #include "./cl_kernels/common/arg_min_max.clembed"
507     },
508     {
509         "common/bitwise_op.cl",
510 #include "./cl_kernels/common/bitwise_op.clembed"
511     },
512     {
513         "common/bounding_box_transform.cl",
514 #include "./cl_kernels/common/bounding_box_transform.clembed"
515     },
516     {
517         "common/bounding_box_transform_quantized.cl",
518 #include "./cl_kernels/common/bounding_box_transform_quantized.clembed"
519     },
520     {
521         "common/col2im.cl",
522 #include "./cl_kernels/common/col2im.clembed"
523     },
524     {
525         "common/comparisons.cl",
526 #include "./cl_kernels/common/comparisons.clembed"
527     },
528     {
529         "common/concatenate.cl",
530 #include "./cl_kernels/common/concatenate.clembed"
531     },
532     {
533         "common/convert_fc_weights.cl",
534 #include "./cl_kernels/common/convert_fc_weights.clembed"
535     },
536     {
537         "common/convolution_layer.cl",
538 #include "./cl_kernels/common/convolution_layer.clembed"
539     },
540     {
541         "common/copy_tensor.cl",
542 #include "./cl_kernels/common/copy_tensor.clembed"
543     },
544     {
545         "common/crop_tensor.cl",
546 #include "./cl_kernels/common/crop_tensor.clembed"
547     },
548     {
549         "common/deconvolution_layer.cl",
550 #include "./cl_kernels/common/deconvolution_layer.clembed"
551     },
552     {
553         "common/cast.cl",
554 #include "./cl_kernels/common/cast.clembed"
555     },
556     {
557         "common/dequantization_layer.cl",
558 #include "./cl_kernels/common/dequantization_layer.clembed"
559     },
560     {
561         "common/elementwise_operation.cl",
562 #include "./cl_kernels/common/elementwise_operation.clembed"
563     },
564     {
565         "common/elementwise_operation_quantized.cl",
566 #include "./cl_kernels/common/elementwise_operation_quantized.clembed"
567     },
568     {
569         "common/elementwise_unary.cl",
570 #include "./cl_kernels/common/elementwise_unary.clembed"
571     },
572     {
573         "common/fft.cl",
574 #include "./cl_kernels/common/fft.clembed"
575     },
576     {
577         "common/fft_digit_reverse.cl",
578 #include "./cl_kernels/common/fft_digit_reverse.clembed"
579     },
580     {
581         "common/fft_scale.cl",
582 #include "./cl_kernels/common/fft_scale.clembed"
583     },
584     {
585         "common/fill_border.cl",
586 #include "./cl_kernels/common/fill_border.clembed"
587     },
588     {
589         "common/floor.cl",
590 #include "./cl_kernels/common/floor.clembed"
591     },
592     {
593         "common/gather.cl",
594 #include "./cl_kernels/common/gather.clembed"
595     },
596     {
597         "common/gemm.cl",
598 #include "./cl_kernels/common/gemm.clembed"
599     },
600     {
601         "common/gemm_reshaped_only_rhs_mmul.cl",
602 #include "./cl_kernels/common/gemm_reshaped_only_rhs_mmul.clembed"
603     },
604     {
605         "common/gemm_utils.cl",
606 #include "./cl_kernels/common/gemm_utils.clembed"
607     },
608     {
609         "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h",
610 #include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.hembed"
611     },
612     {
613         "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h",
614 #include "./cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.hembed"
615     },
616     {
617         "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl",
618 #include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.clembed"
619     },
620     {
621         "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl",
622 #include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.clembed"
623     },
624     {
625         "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl",
626 #include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.clembed"
627     },
628     {
629         "common/gemmlowp.cl",
630 #include "./cl_kernels/common/gemmlowp.clembed"
631     },
632     {
633         "common/gemmlowp_reshaped_only_rhs_mmul.cl",
634 #include "./cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.clembed"
635     },
636     {
637         "common/gemv.cl",
638 #include "./cl_kernels/common/gemv.clembed"
639     },
640     {
641         "common/generate_proposals.cl",
642 #include "./cl_kernels/common/generate_proposals.clembed"
643     },
644     {
645         "common/generate_proposals_quantized.cl",
646 #include "./cl_kernels/common/generate_proposals_quantized.clembed"
647     },
648     {
649         "gemm_helpers.h",
650 #include "./cl_kernels/gemm_helpers.hembed"
651     },
652     {
653         "helpers.h",
654 #include "./cl_kernels/helpers.hembed"
655     },
656     {
657         "helpers_asymm.h",
658 #include "./cl_kernels/helpers_asymm.hembed"
659     },
660     {
661         "repeat.h",
662 #include "./cl_kernels/repeat.hembed"
663     },
664     {
665         "tile_helpers.h",
666 #include "./cl_kernels/tile_helpers.hembed"
667     },
668     {
669         "common/instance_normalization.cl",
670 #include "./cl_kernels/common/instance_normalization.clembed"
671     },
672     {
673         "common/l2_normalize.cl",
674 #include "./cl_kernels/common/l2_normalize.clembed"
675     },
676     {
677         "common/mean_stddev_normalization.cl",
678 #include "./cl_kernels/common/mean_stddev_normalization.clembed"
679     },
680     {
681         "common/memset.cl",
682 #include "./cl_kernels/common/memset.clembed"
683     },
684     {
685         "common/minmax_layer.cl",
686 #include "./cl_kernels/common/minmax_layer.clembed"
687     },
688     {
689         "common/nonmax.cl",
690 #include "./cl_kernels/common/nonmax.clembed"
691     },
692     {
693         "common/batchnormalization_layer.cl",
694 #include "./cl_kernels/common/batchnormalization_layer.clembed"
695     },
696     {
697         "common/pad_layer.cl",
698 #include "./cl_kernels/common/pad_layer.clembed"
699     },
700     {
701         "common/permute.cl",
702 #include "./cl_kernels/common/permute.clembed"
703     },
704     {
705         "common/pixelwise_mul_float.cl",
706 #include "./cl_kernels/common/pixelwise_mul_float.clembed"
707     },
708     {
709         "common/pixelwise_mul_int.cl",
710 #include "./cl_kernels/common/pixelwise_mul_int.clembed"
711     },
712     {
713         "common/qlstm_layer_normalization.cl",
714 #include "./cl_kernels/common/qlstm_layer_normalization.clembed"
715     },
716     {
717         "common/quantization_layer.cl",
718 #include "./cl_kernels/common/quantization_layer.clembed"
719     },
720     {
721         "common/range.cl",
722 #include "./cl_kernels/common/range.clembed"
723     },
724     {
725         "common/reduction_operation.cl",
726 #include "./cl_kernels/common/reduction_operation.clembed"
727     },
728     {
729         "common/reshape_layer.cl",
730 #include "./cl_kernels/common/reshape_layer.clembed"
731     },
732     {
733         "common/reverse.cl",
734 #include "./cl_kernels/common/reverse.clembed"
735     },
736     {
737         "common/roi_align_layer.cl",
738 #include "./cl_kernels/common/roi_align_layer.clembed"
739     },
740     {
741         "common/roi_align_layer_quantized.cl",
742 #include "./cl_kernels/common/roi_align_layer_quantized.clembed"
743     },
744     {
745         "common/roi_pooling_layer.cl",
746 #include "./cl_kernels/common/roi_pooling_layer.clembed"
747     },
748     {
749         "common/select.cl",
750 #include "./cl_kernels/common/select.clembed"
751     },
752     {
753         "common/softmax_layer.cl",
754 #include "./cl_kernels/common/softmax_layer.clembed"
755     },
756     {
757         "common/softmax_layer_quantized.cl",
758 #include "./cl_kernels/common/softmax_layer_quantized.clembed"
759     },
760     {
761         "common/slice_ops.cl",
762 #include "./cl_kernels/common/slice_ops.clembed"
763     },
764     {
765         "common/stack_layer.cl",
766 #include "./cl_kernels/common/stack_layer.clembed"
767     },
768     {
769         "common/tile.cl",
770 #include "./cl_kernels/common/tile.clembed"
771     },
772     {
773         "common/transpose.cl",
774 #include "./cl_kernels/common/transpose.clembed"
775     },
776     {
777         "types.h",
778 #include "./cl_kernels/types.hembed"
779     },
780     {
781         "common/unpooling_layer.cl",
782 #include "./cl_kernels/common/unpooling_layer.clembed"
783     },
784 #ifdef ENABLE_NCHW_KERNELS
785     {
786         "nchw/batch_to_space.cl",
787 #include "./cl_kernels/nchw/batch_to_space.clembed"
788     },
789     {
790         "nchw/channel_shuffle.cl",
791 #include "./cl_kernels/nchw/channel_shuffle.clembed"
792     },
793     {
794         "nchw/upsample_layer.cl",
795 #include "./cl_kernels/nchw/upsample_layer.clembed"
796     },
797     {
798         "nchw/depth_to_space.cl",
799 #include "./cl_kernels/nchw/depth_to_space.clembed"
800     },
801     {
802         "nchw/dequantization_layer.cl",
803 #include "./cl_kernels/nchw/dequantization_layer.clembed"
804     },
805     {
806         "nchw/direct_convolution.cl",
807 #include "./cl_kernels/nchw/direct_convolution.clembed"
808     },
809     {
810         "nchw/im2col.cl",
811 #include "./cl_kernels/nchw/im2col.clembed"
812     },
813     {
814         "nchw/normalization_layer.cl",
815 #include "./cl_kernels/nchw/normalization_layer.clembed"
816     },
817     {
818         "nchw/normalize_planar_yuv_layer.cl",
819 #include "./cl_kernels/nchw/normalize_planar_yuv_layer.clembed"
820     },
821     {
822         "nchw/normalize_planar_yuv_layer_quantized.cl",
823 #include "./cl_kernels/nchw/normalize_planar_yuv_layer_quantized.clembed"
824     },
825     {
826         "nchw/batchnormalization_layer.cl",
827 #include "./cl_kernels/nchw/batchnormalization_layer.clembed"
828     },
829     {
830         "nchw/pooling_layer.cl",
831 #include "./cl_kernels/nchw/pooling_layer.clembed"
832     },
833     {
834         "nchw/prior_box_layer.cl",
835 #include "./cl_kernels/nchw/prior_box_layer.clembed"
836     },
837     {
838         "nchw/reorg_layer.cl",
839 #include "./cl_kernels/nchw/reorg_layer.clembed"
840     },
841     {
842         "nchw/scale.cl",
843 #include "./cl_kernels/nchw/scale.clembed"
844     },
845     {
846         "nchw/space_to_batch.cl",
847 #include "./cl_kernels/nchw/space_to_batch.clembed"
848     },
849     {
850         "nchw/space_to_depth.cl",
851 #include "./cl_kernels/nchw/space_to_depth.clembed"
852     },
853     {
854         "nchw/winograd_filter_transform.cl",
855 #include "./cl_kernels/nchw/winograd_filter_transform.clembed"
856     },
857     {
858         "nchw/winograd_input_transform.cl",
859 #include "./cl_kernels/nchw/winograd_input_transform.clembed"
860     },
861     {
862         "nchw/winograd_output_transform.cl",
863 #include "./cl_kernels/nchw/winograd_output_transform.clembed"
864     },
865 #endif /* ENABLE_NCHW_KERNELS */
866 
867 #ifdef ENABLE_NHWC_KERNELS
868     {
869         "nhwc/batch_to_space.cl",
870 #include "./cl_kernels/nhwc/batch_to_space.clembed"
871     },
872     {
873         "nhwc/channel_shuffle.cl",
874 #include "./cl_kernels/nhwc/channel_shuffle.clembed"
875     },
876     {
877         "nhwc/upsample_layer.cl",
878 #include "./cl_kernels/nhwc/upsample_layer.clembed"
879     },
880     {
881         "nhwc/depth_to_space.cl",
882 #include "./cl_kernels/nhwc/depth_to_space.clembed"
883     },
884     {
885         "nhwc/dequantization_layer.cl",
886 #include "./cl_kernels/nhwc/dequantization_layer.clembed"
887     },
888     {
889         "nhwc/direct_convolution.cl",
890 #include "./cl_kernels/nhwc/direct_convolution.clembed"
891     },
892     {
893         "nhwc/direct_convolution3d.cl",
894 #include "./cl_kernels/nhwc/direct_convolution3d.clembed"
895     },
896     {
897         "nhwc/dwc_native_fp_nhwc.cl",
898 #include "./cl_kernels/nhwc/dwc_native_fp_nhwc.clembed"
899     },
900     {
901         "nhwc/dwc_native_quantized_nhwc.cl",
902 #include "./cl_kernels/nhwc/dwc_native_quantized_nhwc.clembed"
903     },
904     {
905         "nhwc/normalization_layer.cl",
906 #include "./cl_kernels/nhwc/normalization_layer.clembed"
907     },
908     {
909         "nhwc/normalize_planar_yuv_layer.cl",
910 #include "./cl_kernels/nhwc/normalize_planar_yuv_layer.clembed"
911     },
912     {
913         "nhwc/normalize_planar_yuv_layer_quantized.cl",
914 #include "./cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.clembed"
915     },
916     {
917         "nhwc/im2col.cl",
918 #include "./cl_kernels/nhwc/im2col.clembed"
919     },
920     {
921         "nhwc/indirect_convolution.cl",
922 #include "./cl_kernels/nhwc/indirect_convolution.clembed"
923     },
924     {
925         "nhwc/batchnormalization_layer.cl",
926 #include "./cl_kernels/nhwc/batchnormalization_layer.clembed"
927     },
928     {
929         "nhwc/pooling_layer.cl",
930 #include "./cl_kernels/nhwc/pooling_layer.clembed"
931     },
932     {
933         "nhwc/pooling_3d_layer.cl",
934 #include "./cl_kernels/nhwc/pooling_3d_layer.clembed"
935     },
936     {
937         "nhwc/pooling_3d_layer_quantized.cl",
938 #include "./cl_kernels/nhwc/pooling_3d_layer_quantized.clembed"
939     },
940     {
941         "nhwc/pooling_layer_quantized.cl",
942 #include "./cl_kernels/nhwc/pooling_layer_quantized.clembed"
943     },
944     {
945         "nhwc/reorg_layer.cl",
946 #include "./cl_kernels/nhwc/reorg_layer.clembed"
947     },
948     {
949         "nhwc/scale.cl",
950 #include "./cl_kernels/nhwc/scale.clembed"
951     },
952     {
953         "nhwc/space_to_batch.cl",
954 #include "./cl_kernels/nhwc/space_to_batch.clembed"
955     },
956     {
957         "nhwc/space_to_depth.cl",
958 #include "./cl_kernels/nhwc/space_to_depth.clembed"
959     },
960     {
961         "nhwc/transposed_convolution.cl",
962 #include "./cl_kernels/nhwc/transposed_convolution.clembed"
963     },
964     {
965         "nhwc/winograd_filter_transform.cl",
966 #include "./cl_kernels/nhwc/winograd_filter_transform.clembed"
967     },
968     {
969         "nhwc/winograd_input_transform.cl",
970 #include "./cl_kernels/nhwc/winograd_input_transform.clembed"
971     },
972     {
973         "nhwc/winograd_output_transform.cl",
974 #include "./cl_kernels/nhwc/winograd_output_transform.clembed"
975     },
976 #endif /* ENABLE_NHWC_KERNELS */
977 #endif /* EMBEDDED_KERNELS */
978 };
979 
get()980 ClKernelLibrary &ClKernelLibrary::get()
981 {
982     static ClKernelLibrary _kernel_library;
983     return _kernel_library;
984 }
985 
program_name(const std::string & kernel_name) const986 std::string ClKernelLibrary::program_name(const std::string &kernel_name) const
987 {
988     // Find which program contains the kernel
989     auto kernel_program_it = _kernel_program_map.find(kernel_name);
990 
991     if(_kernel_program_map.end() == kernel_program_it)
992     {
993         ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
994     }
995 
996     const std::string program_name = kernel_program_it->second;
997 
998     return program_name;
999 }
1000 
set_kernel_path(std::string kernel_path)1001 void ClKernelLibrary::set_kernel_path(std::string kernel_path)
1002 {
1003     _kernel_path = std::move(kernel_path);
1004     _kernel_path += "/";
1005 }
1006 
kernel_path() const1007 const std::string &ClKernelLibrary::kernel_path() const
1008 {
1009     return _kernel_path;
1010 }
1011 
program(const std::string & program_name) const1012 ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &program_name) const
1013 {
1014 #ifdef EMBEDDED_KERNELS
1015 #ifdef ARM_COMPUTE_COMPRESSED_KERNELS
1016     const auto inflatted_program_source_it = _decompressed_source_map.find(program_name);
1017     if(inflatted_program_source_it != _decompressed_source_map.end())
1018     {
1019         return ClProgramInfo{ inflatted_program_source_it->second, false };
1020     }
1021 #endif /* ARM_COMPUTE_COMPRESSED_KERNELS */
1022 
1023     const auto program_source_it = _program_source_map.find(program_name);
1024     if(program_source_it == _program_source_map.end())
1025     {
1026         ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str());
1027     }
1028     std::string program_source = program_source_it->second;
1029 
1030 #ifdef ARM_COMPUTE_COMPRESSED_KERNELS
1031     std::string decompressed_program_source = decompress_zlib(decode_base64(program_source_it->second));
1032     ARM_COMPUTE_ERROR_ON_MSG(decompressed_program_source.empty(), "Cannot de-compress requested program");
1033     _decompressed_source_map.insert(std::make_pair(program_name, decompressed_program_source));
1034     program_source = std::move(decompressed_program_source);
1035 #endif /* ARM_COMPUTE_COMPRESSED_KERNELS */
1036 
1037     return ClProgramInfo{ program_source, false };
1038 #else  /* EMBEDDED_KERNELS */
1039     // Check for binary
1040     std::string source_name = _kernel_path + program_name;
1041     std::string binary_name = source_name + "bin";
1042     std::string program_source{};
1043     bool        is_binary = false;
1044 
1045     if(std::ifstream(binary_name).is_open())
1046     {
1047         program_source = read_file(binary_name, true);
1048         is_binary      = true;
1049     }
1050     else if(std::ifstream(source_name).is_open())
1051     {
1052         program_source = read_file(source_name, false);
1053     }
1054     else
1055     {
1056         ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str());
1057     }
1058 
1059     return ClProgramInfo{ program_source, is_binary };
1060 #endif /* EMBEDDED_KERNELS */
1061 }
1062 } // namespace opencl
1063 } // namespace arm_compute
1064