xref: /aosp_15_r20/external/ComputeLibrary/src/cpu/kernels/CpuCastKernel.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1*c217d954SCole Faust /*
2*c217d954SCole Faust  * Copyright (c) 2016-2022 Arm Limited.
3*c217d954SCole Faust  *
4*c217d954SCole Faust  * SPDX-License-Identifier: MIT
5*c217d954SCole Faust  *
6*c217d954SCole Faust  * Permission is hereby granted, free of charge, to any person obtaining a copy
7*c217d954SCole Faust  * of this software and associated documentation files (the "Software"), to
8*c217d954SCole Faust  * deal in the Software without restriction, including without limitation the
9*c217d954SCole Faust  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10*c217d954SCole Faust  * sell copies of the Software, and to permit persons to whom the Software is
11*c217d954SCole Faust  * furnished to do so, subject to the following conditions:
12*c217d954SCole Faust  *
13*c217d954SCole Faust  * The above copyright notice and this permission notice shall be included in all
14*c217d954SCole Faust  * copies or substantial portions of the Software.
15*c217d954SCole Faust  *
16*c217d954SCole Faust  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17*c217d954SCole Faust  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18*c217d954SCole Faust  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19*c217d954SCole Faust  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20*c217d954SCole Faust  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21*c217d954SCole Faust  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22*c217d954SCole Faust  * SOFTWARE.
23*c217d954SCole Faust  */
24*c217d954SCole Faust #include "src/cpu/kernels/CpuCastKernel.h"
25*c217d954SCole Faust 
26*c217d954SCole Faust #include "arm_compute/core/Error.h"
27*c217d954SCole Faust #include "arm_compute/core/Helpers.h"
28*c217d954SCole Faust #include "arm_compute/core/ITensor.h"
29*c217d954SCole Faust #include "arm_compute/core/TensorInfo.h"
30*c217d954SCole Faust #include "arm_compute/core/Validate.h"
31*c217d954SCole Faust #include "src/core/CPP/Validate.h"
32*c217d954SCole Faust #include "src/core/NEON/NEFixedPoint.h"
33*c217d954SCole Faust #include "src/core/NEON/NEMath.h"
34*c217d954SCole Faust #include "src/core/NEON/wrapper/wrapper.h"
35*c217d954SCole Faust #include "src/core/common/Registrars.h"
36*c217d954SCole Faust #include "src/core/helpers/AutoConfiguration.h"
37*c217d954SCole Faust #include "src/core/helpers/WindowHelpers.h"
38*c217d954SCole Faust #include "support/SaturateCast.h"
39*c217d954SCole Faust 
40*c217d954SCole Faust #include "src/cpu/kernels/cast/list.h"
41*c217d954SCole Faust 
42*c217d954SCole Faust namespace arm_compute
43*c217d954SCole Faust {
44*c217d954SCole Faust namespace cpu
45*c217d954SCole Faust {
46*c217d954SCole Faust namespace kernels
47*c217d954SCole Faust {
48*c217d954SCole Faust namespace
49*c217d954SCole Faust {
50*c217d954SCole Faust static const std::vector<CpuCastKernel::CastKernel> available_kernels =
51*c217d954SCole Faust {
52*c217d954SCole Faust     {
53*c217d954SCole Faust         "neon_qs8_cast",
__anon2cc4acfe0202() 54*c217d954SCole Faust         [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; },
55*c217d954SCole Faust         REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast)
56*c217d954SCole Faust     },
57*c217d954SCole Faust     {
58*c217d954SCole Faust         "neon_qu8_cast",
__anon2cc4acfe0302() 59*c217d954SCole Faust         [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
60*c217d954SCole Faust         REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)
61*c217d954SCole Faust     },
62*c217d954SCole Faust     {
63*c217d954SCole Faust         "neon_u8_cast",
__anon2cc4acfe0402() 64*c217d954SCole Faust         [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
65*c217d954SCole Faust         REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)
66*c217d954SCole Faust     },
67*c217d954SCole Faust     {
68*c217d954SCole Faust         "neon_fp16_cast",
__anon2cc4acfe0502() 69*c217d954SCole Faust         [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F16 && data.isa.fp16; },
70*c217d954SCole Faust         REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast)
71*c217d954SCole Faust     },
72*c217d954SCole Faust     {
73*c217d954SCole Faust         "neon_fp32_to_fp16_cast",
__anon2cc4acfe0602() 74*c217d954SCole Faust         [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
75*c217d954SCole Faust         REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast)
76*c217d954SCole Faust     },
77*c217d954SCole Faust     {
78*c217d954SCole Faust         "neon_fp32_to_bf16_cast",
__anon2cc4acfe0702() 79*c217d954SCole Faust         [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::BFLOAT16 && data.isa.bf16; },
80*c217d954SCole Faust         REGISTER_BF16_NEON(arm_compute::cpu::neon_fp32_to_bfloat16_cast)
81*c217d954SCole Faust     },
82*c217d954SCole Faust     {
83*c217d954SCole Faust         "neon_s32_cast",
__anon2cc4acfe0802() 84*c217d954SCole Faust         [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
85*c217d954SCole Faust         REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast)
86*c217d954SCole Faust     },
87*c217d954SCole Faust     {
88*c217d954SCole Faust         "neon_bf16_cast",
__anon2cc4acfe0902() 89*c217d954SCole Faust         [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::BFLOAT16 && data.dst_dt == DataType::F32 && data.isa.bf16; },
90*c217d954SCole Faust         REGISTER_BF16_NEON(arm_compute::cpu::neon_bfloat16_to_fp32_cast)
91*c217d954SCole Faust     },
92*c217d954SCole Faust };
93*c217d954SCole Faust 
validate_arguments(const ITensorInfo * src,const ITensorInfo * dst,ConvertPolicy policy)94*c217d954SCole Faust Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
95*c217d954SCole Faust {
96*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
97*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst);
98*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(src);
99*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(dst);
100*c217d954SCole Faust     ARM_COMPUTE_UNUSED(policy);
101*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
102*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
103*c217d954SCole Faust                                                          DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
104*c217d954SCole Faust                                                          DataType::F32, DataType::S32);
105*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
106*c217d954SCole Faust                                                          DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
107*c217d954SCole Faust                                                          DataType::U32, DataType::S32, DataType::F32);
108*c217d954SCole Faust 
109*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32
110*c217d954SCole Faust                                                                                      && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
111*c217d954SCole Faust                                     "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
112*c217d954SCole Faust 
113*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
114*c217d954SCole Faust                                                                               && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
115*c217d954SCole Faust                                     "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
116*c217d954SCole Faust 
117*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
118*c217d954SCole Faust                                                                          && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
119*c217d954SCole Faust                                     "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
120*c217d954SCole Faust 
121*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32),
122*c217d954SCole Faust                                     "Only data_types supported [in] U16 ->  [out] U8, U32");
123*c217d954SCole Faust 
124*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32),
125*c217d954SCole Faust                                     "Only data_types supported [in] S16 ->  [out] U8, S32");
126*c217d954SCole Faust 
127*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::BFLOAT16 && dst->data_type() != DataType::F32,
128*c217d954SCole Faust                                     "Only data_types supported [in] BFLOAT16 ->  [out] F32");
129*c217d954SCole Faust 
130*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
131*c217d954SCole Faust                                                                           && dst->data_type() != DataType::U8
132*c217d954SCole Faust                                                                           && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32),
133*c217d954SCole Faust                                     "Only data_types supported [in] F16 ->  [out] QASYMM8, F32, S32, U8");
134*c217d954SCole Faust 
135*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
136*c217d954SCole Faust                                                                           && dst->data_type() != DataType::F16 && dst->data_type() != DataType::BFLOAT16
137*c217d954SCole Faust                                                                           && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8),
138*c217d954SCole Faust                                     "Only data_types supported [in] F32 ->  [out] QASYMM8, BFLOAT16, F16, S32, U8");
139*c217d954SCole Faust 
140*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
141*c217d954SCole Faust                                                                           && dst->data_type() != DataType::F16
142*c217d954SCole Faust                                                                           && dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8),
143*c217d954SCole Faust                                     "Only data_types supported [in] S32 ->  [out] QASYMM8, F16, F32, U8");
144*c217d954SCole Faust 
145*c217d954SCole Faust     // Validate in case of configured dst
146*c217d954SCole Faust     if(dst->total_size() > 0)
147*c217d954SCole Faust     {
148*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
149*c217d954SCole Faust     }
150*c217d954SCole Faust 
151*c217d954SCole Faust     return Status{};
152*c217d954SCole Faust }
153*c217d954SCole Faust } // namespace
154*c217d954SCole Faust 
configure(const ITensorInfo * src,ITensorInfo * dst,ConvertPolicy policy)155*c217d954SCole Faust void CpuCastKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
156*c217d954SCole Faust {
157*c217d954SCole Faust     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
158*c217d954SCole Faust 
159*c217d954SCole Faust     // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given)
160*c217d954SCole Faust     set_shape_if_empty(*dst, src->tensor_shape());
161*c217d954SCole Faust 
162*c217d954SCole Faust     _policy = policy;
163*c217d954SCole Faust 
164*c217d954SCole Faust     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy));
165*c217d954SCole Faust 
166*c217d954SCole Faust     // Configure kernel window
167*c217d954SCole Faust     Window win = calculate_max_window(*src, Steps());
168*c217d954SCole Faust 
169*c217d954SCole Faust     ICPPKernel::configure(win);
170*c217d954SCole Faust }
171*c217d954SCole Faust 
validate(const ITensorInfo * src,const ITensorInfo * dst,ConvertPolicy policy)172*c217d954SCole Faust Status CpuCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
173*c217d954SCole Faust {
174*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy));
175*c217d954SCole Faust     return Status{};
176*c217d954SCole Faust }
177*c217d954SCole Faust 
run_op(ITensorPack & tensors,const Window & window,const ThreadInfo & info)178*c217d954SCole Faust void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
179*c217d954SCole Faust {
180*c217d954SCole Faust     ARM_COMPUTE_UNUSED(info);
181*c217d954SCole Faust     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
182*c217d954SCole Faust     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
183*c217d954SCole Faust 
184*c217d954SCole Faust     const auto window_start_x = static_cast<int>(window.x().start());
185*c217d954SCole Faust     const auto window_end_x   = static_cast<int>(window.x().end());
186*c217d954SCole Faust     const int  window_step_x  = 16;
187*c217d954SCole Faust 
188*c217d954SCole Faust     const ITensor *_src = tensors.get_const_tensor(TensorType::ACL_SRC);
189*c217d954SCole Faust     ITensor       *_dst = tensors.get_tensor(TensorType::ACL_DST);
190*c217d954SCole Faust     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
191*c217d954SCole Faust     ARM_COMPUTE_ERROR_ON(_src == _dst);
192*c217d954SCole Faust 
193*c217d954SCole Faust     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
194*c217d954SCole Faust 
195*c217d954SCole Faust     Window win{ window };
196*c217d954SCole Faust     win.set(Window::DimX, Window::Dimension(0, 1, 1));
197*c217d954SCole Faust 
198*c217d954SCole Faust     Iterator src(_src, win);
199*c217d954SCole Faust     Iterator dst(_dst, win);
200*c217d954SCole Faust 
201*c217d954SCole Faust     /*ukernel runs only when using fp16/bfloat16, so we validate it isn't a nullptr only before using it */
202*c217d954SCole Faust     const auto *uk = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ _src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa() });
203*c217d954SCole Faust 
204*c217d954SCole Faust     switch(_src->info()->data_type())
205*c217d954SCole Faust     {
206*c217d954SCole Faust         case DataType::QASYMM8_SIGNED:
207*c217d954SCole Faust         {
208*c217d954SCole Faust             switch(_dst->info()->data_type())
209*c217d954SCole Faust             {
210*c217d954SCole Faust                 case DataType::S16:
211*c217d954SCole Faust                 {
212*c217d954SCole Faust                     /* Up-conversion QASYMM8_SIGNED -> S16 */
213*c217d954SCole Faust                     execute_window_loop(win, [&](const Coordinates &)
214*c217d954SCole Faust                     {
215*c217d954SCole Faust                         const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
216*c217d954SCole Faust                         const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
217*c217d954SCole Faust                         int        x       = window_start_x;
218*c217d954SCole Faust 
219*c217d954SCole Faust                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
220*c217d954SCole Faust                         {
221*c217d954SCole Faust                             const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
222*c217d954SCole Faust 
223*c217d954SCole Faust                             const int16x8x2_t texels =
224*c217d954SCole Faust                             {
225*c217d954SCole Faust                                 {
226*c217d954SCole Faust                                     vmovl_s8(vget_low_s8(texels_s8)),
227*c217d954SCole Faust                                     vmovl_s8(vget_high_s8(texels_s8))
228*c217d954SCole Faust                                 }
229*c217d954SCole Faust                             };
230*c217d954SCole Faust 
231*c217d954SCole Faust                             vst1q_s16(dst_ptr + x, texels.val[0]);
232*c217d954SCole Faust                             vst1q_s16(dst_ptr + x + 8, texels.val[1]);
233*c217d954SCole Faust                         }
234*c217d954SCole Faust 
235*c217d954SCole Faust                         // Compute left-over elements
236*c217d954SCole Faust                         for(; x < window_end_x; ++x)
237*c217d954SCole Faust                         {
238*c217d954SCole Faust                             *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
239*c217d954SCole Faust                         }
240*c217d954SCole Faust                     },
241*c217d954SCole Faust                     src, dst);
242*c217d954SCole Faust                     break;
243*c217d954SCole Faust                 }
244*c217d954SCole Faust                 case DataType::S32:
245*c217d954SCole Faust                 {
246*c217d954SCole Faust                     /* Up-conversion QASYMM8_SIGNED -> S32 */
247*c217d954SCole Faust                     execute_window_loop(win, [&](const Coordinates &)
248*c217d954SCole Faust                     {
249*c217d954SCole Faust                         const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
250*c217d954SCole Faust                         const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
251*c217d954SCole Faust                         int        x       = window_start_x;
252*c217d954SCole Faust 
253*c217d954SCole Faust                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
254*c217d954SCole Faust                         {
255*c217d954SCole Faust                             const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
256*c217d954SCole Faust 
257*c217d954SCole Faust                             const int16x8x2_t texels =
258*c217d954SCole Faust                             {
259*c217d954SCole Faust                                 {
260*c217d954SCole Faust                                     vmovl_s8(vget_low_s8(texels_s8)),
261*c217d954SCole Faust                                     vmovl_s8(vget_high_s8(texels_s8))
262*c217d954SCole Faust                                 }
263*c217d954SCole Faust                             };
264*c217d954SCole Faust 
265*c217d954SCole Faust                             vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
266*c217d954SCole Faust                             vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
267*c217d954SCole Faust                             vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
268*c217d954SCole Faust                             vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
269*c217d954SCole Faust                         }
270*c217d954SCole Faust 
271*c217d954SCole Faust                         // Compute left-over elements
272*c217d954SCole Faust                         for(; x < window_end_x; ++x)
273*c217d954SCole Faust                         {
274*c217d954SCole Faust                             *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
275*c217d954SCole Faust                         }
276*c217d954SCole Faust                     },
277*c217d954SCole Faust                     src, dst);
278*c217d954SCole Faust                     break;
279*c217d954SCole Faust                 }
280*c217d954SCole Faust                 case DataType::F32:
281*c217d954SCole Faust                 {
282*c217d954SCole Faust                     /* Up-conversion QASYMM8_SIGNED -> F32 */
283*c217d954SCole Faust                     execute_window_loop(win, [&](const Coordinates &)
284*c217d954SCole Faust                     {
285*c217d954SCole Faust                         const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
286*c217d954SCole Faust                         const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
287*c217d954SCole Faust 
288*c217d954SCole Faust                         int x = window_start_x;
289*c217d954SCole Faust                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
290*c217d954SCole Faust                         {
291*c217d954SCole Faust                             const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
292*c217d954SCole Faust 
293*c217d954SCole Faust                             const int16x8x2_t texels =
294*c217d954SCole Faust                             {
295*c217d954SCole Faust                                 {
296*c217d954SCole Faust                                     vmovl_s8(vget_low_s8(texels_s8)),
297*c217d954SCole Faust                                     vmovl_s8(vget_high_s8(texels_s8))
298*c217d954SCole Faust                                 }
299*c217d954SCole Faust                             };
300*c217d954SCole Faust                             vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
301*c217d954SCole Faust                             vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
302*c217d954SCole Faust                             vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
303*c217d954SCole Faust                             vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
304*c217d954SCole Faust                         }
305*c217d954SCole Faust 
306*c217d954SCole Faust                         // Compute left-over elements
307*c217d954SCole Faust                         for(; x < window_end_x; ++x)
308*c217d954SCole Faust                         {
309*c217d954SCole Faust                             *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
310*c217d954SCole Faust                         }
311*c217d954SCole Faust                     },
312*c217d954SCole Faust                     src, dst);
313*c217d954SCole Faust                     break;
314*c217d954SCole Faust                 }
315*c217d954SCole Faust                 case DataType::F16:
316*c217d954SCole Faust                 {
317*c217d954SCole Faust                     /* Up-conversion QASYMM8_SIGNED -> F16 */
318*c217d954SCole Faust                     ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
319*c217d954SCole Faust                     uk->ukernel(_src, _dst, info, _policy, window);
320*c217d954SCole Faust                     break;
321*c217d954SCole Faust                 }
322*c217d954SCole Faust                 default:
323*c217d954SCole Faust                     ARM_COMPUTE_ERROR("dst data type not supported");
324*c217d954SCole Faust             }
325*c217d954SCole Faust             break;
326*c217d954SCole Faust         }
327*c217d954SCole Faust 
328*c217d954SCole Faust         case DataType::QASYMM8:
329*c217d954SCole Faust         case DataType::U8:
330*c217d954SCole Faust         {
331*c217d954SCole Faust             switch(_dst->info()->data_type())
332*c217d954SCole Faust             {
333*c217d954SCole Faust                 case DataType::S16:
334*c217d954SCole Faust                 {
335*c217d954SCole Faust                     /* Up-conversion U8 -> S16 */
336*c217d954SCole Faust                     execute_window_loop(win, [&](const Coordinates &)
337*c217d954SCole Faust                     {
338*c217d954SCole Faust                         const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
339*c217d954SCole Faust                         const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
340*c217d954SCole Faust 
341*c217d954SCole Faust                         int x = window_start_x;
342*c217d954SCole Faust                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
343*c217d954SCole Faust                         {
344*c217d954SCole Faust                             const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
345*c217d954SCole Faust 
346*c217d954SCole Faust                             const int16x8x2_t texels =
347*c217d954SCole Faust                             {
348*c217d954SCole Faust                                 {
349*c217d954SCole Faust                                     vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
350*c217d954SCole Faust                                     vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
351*c217d954SCole Faust                                 }
352*c217d954SCole Faust                             };
353*c217d954SCole Faust 
354*c217d954SCole Faust                             vst1q_s16(dst_ptr + x, texels.val[0]);
355*c217d954SCole Faust                             vst1q_s16(dst_ptr + x + 8, texels.val[1]);
356*c217d954SCole Faust                         }
357*c217d954SCole Faust 
358*c217d954SCole Faust                         // Compute left-over elements
359*c217d954SCole Faust                         for(; x < window_end_x; ++x)
360*c217d954SCole Faust                         {
361*c217d954SCole Faust                             *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
362*c217d954SCole Faust                         }
363*c217d954SCole Faust                     },
364*c217d954SCole Faust                     src, dst);
365*c217d954SCole Faust                     break;
366*c217d954SCole Faust                 }
367*c217d954SCole Faust                 case DataType::S32:
368*c217d954SCole Faust                 {
369*c217d954SCole Faust                     /* Up-conversion U8 -> S32 */
370*c217d954SCole Faust                     execute_window_loop(win, [&](const Coordinates &)
371*c217d954SCole Faust                     {
372*c217d954SCole Faust                         const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
373*c217d954SCole Faust                         const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
374*c217d954SCole Faust 
375*c217d954SCole Faust                         int x = window_start_x;
376*c217d954SCole Faust                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
377*c217d954SCole Faust                         {
378*c217d954SCole Faust                             const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
379*c217d954SCole Faust 
380*c217d954SCole Faust                             const int16x8x2_t texels =
381*c217d954SCole Faust                             {
382*c217d954SCole Faust                                 {
383*c217d954SCole Faust                                     vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
384*c217d954SCole Faust                                     vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
385*c217d954SCole Faust                                 }
386*c217d954SCole Faust                             };
387*c217d954SCole Faust 
388*c217d954SCole Faust                             vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
389*c217d954SCole Faust                             vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
390*c217d954SCole Faust                             vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
391*c217d954SCole Faust                             vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
392*c217d954SCole Faust                         }
393*c217d954SCole Faust 
394*c217d954SCole Faust                         // Compute left-over elements
395*c217d954SCole Faust                         for(; x < window_end_x; ++x)
396*c217d954SCole Faust                         {
397*c217d954SCole Faust                             *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
398*c217d954SCole Faust                         }
399*c217d954SCole Faust                     },
400*c217d954SCole Faust                     src, dst);
401*c217d954SCole Faust                     break;
402*c217d954SCole Faust                 }
403*c217d954SCole Faust                 case DataType::F32:
404*c217d954SCole Faust                 {
405*c217d954SCole Faust                     /* Up-conversion U8 -> F32 */
406*c217d954SCole Faust                     execute_window_loop(win, [&](const Coordinates &)
407*c217d954SCole Faust                     {
408*c217d954SCole Faust                         const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
409*c217d954SCole Faust                         const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
410*c217d954SCole Faust 
411*c217d954SCole Faust                         int x = window_start_x;
412*c217d954SCole Faust                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
413*c217d954SCole Faust                         {
414*c217d954SCole Faust                             const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
415*c217d954SCole Faust 
416*c217d954SCole Faust                             const int16x8x2_t texels =
417*c217d954SCole Faust                             {
418*c217d954SCole Faust                                 {
419*c217d954SCole Faust                                     vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
420*c217d954SCole Faust                                     vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
421*c217d954SCole Faust                                 }
422*c217d954SCole Faust                             };
423*c217d954SCole Faust                             vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
424*c217d954SCole Faust                             vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
425*c217d954SCole Faust                             vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
426*c217d954SCole Faust                             vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
427*c217d954SCole Faust                         }
428*c217d954SCole Faust 
429*c217d954SCole Faust                         // Compute left-over elements
430*c217d954SCole Faust                         for(; x < window_end_x; ++x)
431*c217d954SCole Faust                         {
432*c217d954SCole Faust                             *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
433*c217d954SCole Faust                         }
434*c217d954SCole Faust                     },
435*c217d954SCole Faust                     src, dst);
436*c217d954SCole Faust                     break;
437*c217d954SCole Faust                 }
438*c217d954SCole Faust                 case DataType::F16:
439*c217d954SCole Faust                 {
440*c217d954SCole Faust                     /* Up-conversion U8 -> FP16 */
441*c217d954SCole Faust                     ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
442*c217d954SCole Faust                     uk->ukernel(_src, _dst, info, _policy, window);
443*c217d954SCole Faust                     break;
444*c217d954SCole Faust                 }
445*c217d954SCole Faust                 case DataType::U16:
446*c217d954SCole Faust                 {
447*c217d954SCole Faust                     /* Up-conversion U8 -> U16 */
448*c217d954SCole Faust                     execute_window_loop(win, [&](const Coordinates &)
449*c217d954SCole Faust                     {
450*c217d954SCole Faust                         const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
451*c217d954SCole Faust                         const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
452*c217d954SCole Faust 
453*c217d954SCole Faust                         int x = window_start_x;
454*c217d954SCole Faust                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
455*c217d954SCole Faust                         {
456*c217d954SCole Faust                             const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
457*c217d954SCole Faust 
458*c217d954SCole Faust                             const uint16x8x2_t texels =
459*c217d954SCole Faust                             {
460*c217d954SCole Faust                                 {
461*c217d954SCole Faust                                     vmovl_u8(vget_low_u8(texels_u8)),
462*c217d954SCole Faust                                     vmovl_u8(vget_high_u8(texels_u8))
463*c217d954SCole Faust                                 }
464*c217d954SCole Faust                             };
465*c217d954SCole Faust 
466*c217d954SCole Faust                             vst1q_u16(dst_ptr + x, texels.val[0]);
467*c217d954SCole Faust                             vst1q_u16(dst_ptr + x + 8, texels.val[1]);
468*c217d954SCole Faust                         }
469*c217d954SCole Faust 
470*c217d954SCole Faust                         // Compute left-over elements
471*c217d954SCole Faust                         for(; x < window_end_x; ++x)
472*c217d954SCole Faust                         {
473*c217d954SCole Faust                             *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
474*c217d954SCole Faust                         }
475*c217d954SCole Faust                     },
476*c217d954SCole Faust                     src, dst);
477*c217d954SCole Faust                     break;
478*c217d954SCole Faust                 }
479*c217d954SCole Faust                 default:
480*c217d954SCole Faust                     ARM_COMPUTE_ERROR("dst data type not supported");
481*c217d954SCole Faust             }
482*c217d954SCole Faust             break;
483*c217d954SCole Faust         }
484*c217d954SCole Faust         case DataType::S16:
485*c217d954SCole Faust         {
486*c217d954SCole Faust             switch(_dst->info()->data_type())
487*c217d954SCole Faust             {
488*c217d954SCole Faust                 case DataType::QASYMM8_SIGNED:
489*c217d954SCole Faust                 {
490*c217d954SCole Faust                     /* Down-conversion S16 -> QASYMM8_SIGNED */
491*c217d954SCole Faust                     if(ConvertPolicy::SATURATE == _policy)
492*c217d954SCole Faust                     {
493*c217d954SCole Faust                         execute_window_loop(win, [&](const Coordinates &)
494*c217d954SCole Faust                         {
495*c217d954SCole Faust                             const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
496*c217d954SCole Faust                             const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
497*c217d954SCole Faust 
498*c217d954SCole Faust                             int x = window_start_x;
499*c217d954SCole Faust                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
500*c217d954SCole Faust                             {
501*c217d954SCole Faust                                 const int16x8x2_t texels =
502*c217d954SCole Faust                                 {
503*c217d954SCole Faust                                     {
504*c217d954SCole Faust                                         vld1q_s16(src_ptr + x),
505*c217d954SCole Faust                                         vld1q_s16(src_ptr + x + 8)
506*c217d954SCole Faust                                     }
507*c217d954SCole Faust                                 };
508*c217d954SCole Faust 
509*c217d954SCole Faust                                 vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
510*c217d954SCole Faust                             }
511*c217d954SCole Faust 
512*c217d954SCole Faust                             // Compute left-over elements
513*c217d954SCole Faust                             for(; x < window_end_x; ++x)
514*c217d954SCole Faust                             {
515*c217d954SCole Faust                                 *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
516*c217d954SCole Faust                             }
517*c217d954SCole Faust                         },
518*c217d954SCole Faust                         src, dst);
519*c217d954SCole Faust                     }
520*c217d954SCole Faust                     else
521*c217d954SCole Faust                     {
522*c217d954SCole Faust                         execute_window_loop(win, [&](const Coordinates &)
523*c217d954SCole Faust                         {
524*c217d954SCole Faust                             const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
525*c217d954SCole Faust                             const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
526*c217d954SCole Faust 
527*c217d954SCole Faust                             int x = window_start_x;
528*c217d954SCole Faust                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
529*c217d954SCole Faust                             {
530*c217d954SCole Faust                                 const int16x8x2_t texels =
531*c217d954SCole Faust                                 {
532*c217d954SCole Faust                                     {
533*c217d954SCole Faust                                         vld1q_s16(src_ptr + x),
534*c217d954SCole Faust                                         vld1q_s16(src_ptr + x + 8)
535*c217d954SCole Faust                                     }
536*c217d954SCole Faust                                 };
537*c217d954SCole Faust 
538*c217d954SCole Faust                                 vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
539*c217d954SCole Faust                             }
540*c217d954SCole Faust 
541*c217d954SCole Faust                             // Compute left-over elements
542*c217d954SCole Faust                             for(; x < window_end_x; ++x)
543*c217d954SCole Faust                             {
544*c217d954SCole Faust                                 *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
545*c217d954SCole Faust                             }
546*c217d954SCole Faust                         },
547*c217d954SCole Faust                         src, dst);
548*c217d954SCole Faust                     }
549*c217d954SCole Faust                     break;
550*c217d954SCole Faust                 }
551*c217d954SCole Faust                 case DataType::U8:
552*c217d954SCole Faust                 {
553*c217d954SCole Faust                     /* Down-conversion S16 -> U8 */
554*c217d954SCole Faust                     if(ConvertPolicy::SATURATE == _policy)
555*c217d954SCole Faust                     {
556*c217d954SCole Faust                         execute_window_loop(win, [&](const Coordinates &)
557*c217d954SCole Faust                         {
558*c217d954SCole Faust                             const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
559*c217d954SCole Faust                             const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
560*c217d954SCole Faust 
561*c217d954SCole Faust                             int x = window_start_x;
562*c217d954SCole Faust                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
563*c217d954SCole Faust                             {
564*c217d954SCole Faust                                 const int16x8x2_t texels =
565*c217d954SCole Faust                                 {
566*c217d954SCole Faust                                     {
567*c217d954SCole Faust                                         vld1q_s16(src_ptr + x),
568*c217d954SCole Faust                                         vld1q_s16(src_ptr + x + 8)
569*c217d954SCole Faust                                     }
570*c217d954SCole Faust                                 };
571*c217d954SCole Faust 
572*c217d954SCole Faust                                 vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
573*c217d954SCole Faust                             }
574*c217d954SCole Faust 
575*c217d954SCole Faust                             // Compute left-over elements
576*c217d954SCole Faust                             for(; x < window_end_x; ++x)
577*c217d954SCole Faust                             {
578*c217d954SCole Faust                                 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
579*c217d954SCole Faust                             }
580*c217d954SCole Faust                         },
581*c217d954SCole Faust                         src, dst);
582*c217d954SCole Faust                     }
583*c217d954SCole Faust                     else
584*c217d954SCole Faust                     {
585*c217d954SCole Faust                         execute_window_loop(win, [&](const Coordinates &)
586*c217d954SCole Faust                         {
587*c217d954SCole Faust                             const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
588*c217d954SCole Faust                             const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
589*c217d954SCole Faust 
590*c217d954SCole Faust                             int x = window_start_x;
591*c217d954SCole Faust                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
592*c217d954SCole Faust                             {
593*c217d954SCole Faust                                 const int16x8x2_t texels =
594*c217d954SCole Faust                                 {
595*c217d954SCole Faust                                     {
596*c217d954SCole Faust                                         vld1q_s16(src_ptr + x),
597*c217d954SCole Faust                                         vld1q_s16(src_ptr + x + 8)
598*c217d954SCole Faust                                     }
599*c217d954SCole Faust                                 };
600*c217d954SCole Faust 
601*c217d954SCole Faust                                 vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
602*c217d954SCole Faust                                                                   vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
603*c217d954SCole Faust                             }
604*c217d954SCole Faust 
605*c217d954SCole Faust                             // Compute left-over elements
606*c217d954SCole Faust                             for(; x < window_end_x; ++x)
607*c217d954SCole Faust                             {
608*c217d954SCole Faust                                 *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
609*c217d954SCole Faust                             }
610*c217d954SCole Faust                         },
611*c217d954SCole Faust                         src, dst);
612*c217d954SCole Faust                     }
613*c217d954SCole Faust                     break;
614*c217d954SCole Faust                 }
615*c217d954SCole Faust                 case DataType::S32:
616*c217d954SCole Faust                 {
617*c217d954SCole Faust                     /* Up-conversion S16 -> S32 */
618*c217d954SCole Faust                     execute_window_loop(win, [&](const Coordinates &)
619*c217d954SCole Faust                     {
620*c217d954SCole Faust                         const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
621*c217d954SCole Faust                         const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
622*c217d954SCole Faust 
623*c217d954SCole Faust                         int x = window_start_x;
624*c217d954SCole Faust                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
625*c217d954SCole Faust                         {
626*c217d954SCole Faust                             const int16x8x2_t texels =
627*c217d954SCole Faust                             {
628*c217d954SCole Faust                                 {
629*c217d954SCole Faust                                     vld1q_s16(src_ptr + x),
630*c217d954SCole Faust                                     vld1q_s16(src_ptr + x + 8)
631*c217d954SCole Faust                                 }
632*c217d954SCole Faust                             };
633*c217d954SCole Faust 
634*c217d954SCole Faust                             const int32x4x4_t texels_s32 =
635*c217d954SCole Faust                             {
636*c217d954SCole Faust                                 {
637*c217d954SCole Faust                                     vmovl_s16(vget_low_s16(texels.val[0])),
638*c217d954SCole Faust                                     vmovl_s16(vget_high_s16(texels.val[0])),
639*c217d954SCole Faust                                     vmovl_s16(vget_low_s16(texels.val[1])),
640*c217d954SCole Faust                                     vmovl_s16(vget_high_s16(texels.val[1]))
641*c217d954SCole Faust                                 }
642*c217d954SCole Faust                             };
643*c217d954SCole Faust 
644*c217d954SCole Faust                             vst1q_s32(dst_ptr + x, texels_s32.val[0]);
645*c217d954SCole Faust                             vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
646*c217d954SCole Faust                             vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
647*c217d954SCole Faust                             vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
648*c217d954SCole Faust                         }
649*c217d954SCole Faust 
650*c217d954SCole Faust                         // Compute left-over elements
651*c217d954SCole Faust                         for(; x < window_end_x; ++x)
652*c217d954SCole Faust                         {
653*c217d954SCole Faust                             *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
654*c217d954SCole Faust                         }
655*c217d954SCole Faust                     },
656*c217d954SCole Faust                     src, dst);
657*c217d954SCole Faust                     break;
658*c217d954SCole Faust                 }
659*c217d954SCole Faust                 default:
660*c217d954SCole Faust                     ARM_COMPUTE_ERROR("dst data type not supported");
661*c217d954SCole Faust             }
662*c217d954SCole Faust             break;
663*c217d954SCole Faust         }
664*c217d954SCole Faust 
665*c217d954SCole Faust         case DataType::U16:
666*c217d954SCole Faust         {
667*c217d954SCole Faust             switch(_dst->info()->data_type())
668*c217d954SCole Faust             {
669*c217d954SCole Faust                 case DataType::U8:
670*c217d954SCole Faust                 {
671*c217d954SCole Faust                     /* Down-conversion U16 -> U8 */
672*c217d954SCole Faust                     if(ConvertPolicy::SATURATE == _policy)
673*c217d954SCole Faust                     {
674*c217d954SCole Faust                         execute_window_loop(win, [&](const Coordinates &)
675*c217d954SCole Faust                         {
676*c217d954SCole Faust                             const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
677*c217d954SCole Faust                             const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
678*c217d954SCole Faust 
679*c217d954SCole Faust                             int x = window_start_x;
680*c217d954SCole Faust                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
681*c217d954SCole Faust                             {
682*c217d954SCole Faust                                 const uint16x8x2_t texels =
683*c217d954SCole Faust                                 {
684*c217d954SCole Faust                                     {
685*c217d954SCole Faust                                         vld1q_u16(src_ptr + x),
686*c217d954SCole Faust                                         vld1q_u16(src_ptr + x + 8)
687*c217d954SCole Faust                                     }
688*c217d954SCole Faust                                 };
689*c217d954SCole Faust 
690*c217d954SCole Faust                                 vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
691*c217d954SCole Faust                             }
692*c217d954SCole Faust 
693*c217d954SCole Faust                             // Compute left-over elements
694*c217d954SCole Faust                             for(; x < window_end_x; ++x)
695*c217d954SCole Faust                             {
696*c217d954SCole Faust                                 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
697*c217d954SCole Faust                             }
698*c217d954SCole Faust                         },
699*c217d954SCole Faust                         src, dst);
700*c217d954SCole Faust                     }
701*c217d954SCole Faust                     else
702*c217d954SCole Faust                     {
703*c217d954SCole Faust                         execute_window_loop(win, [&](const Coordinates &)
704*c217d954SCole Faust                         {
705*c217d954SCole Faust                             const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
706*c217d954SCole Faust                             const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
707*c217d954SCole Faust 
708*c217d954SCole Faust                             int x = window_start_x;
709*c217d954SCole Faust                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
710*c217d954SCole Faust                             {
711*c217d954SCole Faust                                 const uint16x8x2_t texels =
712*c217d954SCole Faust                                 {
713*c217d954SCole Faust                                     {
714*c217d954SCole Faust                                         vld1q_u16(src_ptr + x),
715*c217d954SCole Faust                                         vld1q_u16(src_ptr + x + 8)
716*c217d954SCole Faust                                     }
717*c217d954SCole Faust                                 };
718*c217d954SCole Faust 
719*c217d954SCole Faust                                 vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
720*c217d954SCole Faust                             }
721*c217d954SCole Faust 
722*c217d954SCole Faust                             // Compute left-over elements
723*c217d954SCole Faust                             for(; x < window_end_x; ++x)
724*c217d954SCole Faust                             {
725*c217d954SCole Faust                                 *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
726*c217d954SCole Faust                             }
727*c217d954SCole Faust 
728*c217d954SCole Faust                         },
729*c217d954SCole Faust                         src, dst);
730*c217d954SCole Faust                     }
731*c217d954SCole Faust                     break;
732*c217d954SCole Faust                 }
733*c217d954SCole Faust                 case DataType::U32:
734*c217d954SCole Faust                 {
735*c217d954SCole Faust                     /* Up-conversion U16 -> U32 */
736*c217d954SCole Faust                     execute_window_loop(win, [&](const Coordinates &)
737*c217d954SCole Faust                     {
738*c217d954SCole Faust                         const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
739*c217d954SCole Faust                         const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
740*c217d954SCole Faust 
741*c217d954SCole Faust                         int x = window_start_x;
742*c217d954SCole Faust                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
743*c217d954SCole Faust                         {
744*c217d954SCole Faust                             const uint16x8x2_t texels =
745*c217d954SCole Faust                             {
746*c217d954SCole Faust                                 {
747*c217d954SCole Faust                                     vld1q_u16(src_ptr + x),
748*c217d954SCole Faust                                     vld1q_u16(src_ptr + x + 8)
749*c217d954SCole Faust                                 }
750*c217d954SCole Faust                             };
751*c217d954SCole Faust 
752*c217d954SCole Faust                             vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
753*c217d954SCole Faust                             vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
754*c217d954SCole Faust                             vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
755*c217d954SCole Faust                             vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
756*c217d954SCole Faust                         }
757*c217d954SCole Faust                         // Compute left-over elements
758*c217d954SCole Faust                         for(; x < window_end_x; ++x)
759*c217d954SCole Faust                         {
760*c217d954SCole Faust                             *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
761*c217d954SCole Faust                         }
762*c217d954SCole Faust 
763*c217d954SCole Faust                     },
764*c217d954SCole Faust                     src, dst);
765*c217d954SCole Faust                     break;
766*c217d954SCole Faust                 }
767*c217d954SCole Faust                 default:
768*c217d954SCole Faust                     ARM_COMPUTE_ERROR("dst data type not supported");
769*c217d954SCole Faust             }
770*c217d954SCole Faust             break;
771*c217d954SCole Faust         }
772*c217d954SCole Faust         case DataType::BFLOAT16:
773*c217d954SCole Faust         {
774*c217d954SCole Faust             /* Up-conversion BFLOAT16 -> F32 */
775*c217d954SCole Faust             ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
776*c217d954SCole Faust             uk->ukernel(_src, _dst, info, _policy, window);
777*c217d954SCole Faust             break;
778*c217d954SCole Faust         }
779*c217d954SCole Faust         case DataType::F16:
780*c217d954SCole Faust         {
781*c217d954SCole Faust             /* conversion F16 -> any data type */
782*c217d954SCole Faust             ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
783*c217d954SCole Faust             uk->ukernel(_src, _dst, info, _policy, window);
784*c217d954SCole Faust             break;
785*c217d954SCole Faust         }
786*c217d954SCole Faust         case DataType::F32:
787*c217d954SCole Faust             switch(_dst->info()->data_type())
788*c217d954SCole Faust             {
789*c217d954SCole Faust                 case DataType::F16:
790*c217d954SCole Faust                 {
791*c217d954SCole Faust                     /* Down-conversion F32 -> F16 */
792*c217d954SCole Faust                     ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
793*c217d954SCole Faust                     uk->ukernel(_src, _dst, info, _policy, window);
794*c217d954SCole Faust                     break;
795*c217d954SCole Faust                 }
796*c217d954SCole Faust                 case DataType::BFLOAT16:
797*c217d954SCole Faust                 {
798*c217d954SCole Faust                     /* Down-conversion F32 -> BFLOAT16 */
799*c217d954SCole Faust                     ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
800*c217d954SCole Faust                     uk->ukernel(_src, _dst, info, _policy, window);
801*c217d954SCole Faust                     break;
802*c217d954SCole Faust                 }
803*c217d954SCole Faust                 case DataType::S32:
804*c217d954SCole Faust                 {
805*c217d954SCole Faust                     /* Conversion F32 -> S32 */
806*c217d954SCole Faust                     execute_window_loop(win, [&](const Coordinates &)
807*c217d954SCole Faust                     {
808*c217d954SCole Faust                         const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
809*c217d954SCole Faust                         const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
810*c217d954SCole Faust 
811*c217d954SCole Faust                         int x = window_start_x;
812*c217d954SCole Faust                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
813*c217d954SCole Faust                         {
814*c217d954SCole Faust                             const float32x4x4_t texels =
815*c217d954SCole Faust                             {
816*c217d954SCole Faust                                 {
817*c217d954SCole Faust                                     vld1q_f32(src_ptr + x),
818*c217d954SCole Faust                                     vld1q_f32(src_ptr + x + 4),
819*c217d954SCole Faust                                     vld1q_f32(src_ptr + x + 8),
820*c217d954SCole Faust                                     vld1q_f32(src_ptr + x + 12),
821*c217d954SCole Faust                                 }
822*c217d954SCole Faust                             };
823*c217d954SCole Faust 
824*c217d954SCole Faust                             vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
825*c217d954SCole Faust                             vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
826*c217d954SCole Faust                             vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
827*c217d954SCole Faust                             vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
828*c217d954SCole Faust                         }
829*c217d954SCole Faust 
830*c217d954SCole Faust                         // Compute left-over elements
831*c217d954SCole Faust                         for(; x < window_end_x; ++x)
832*c217d954SCole Faust                         {
833*c217d954SCole Faust                             *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
834*c217d954SCole Faust                         }
835*c217d954SCole Faust                     },
836*c217d954SCole Faust                     src, dst);
837*c217d954SCole Faust                     break;
838*c217d954SCole Faust                 }
839*c217d954SCole Faust                 case DataType::QASYMM8:
840*c217d954SCole Faust                 case DataType::U8:
841*c217d954SCole Faust                 {
842*c217d954SCole Faust                     /* Down-conversion F32 -> U8 */
843*c217d954SCole Faust                     execute_window_loop(win, [&](const Coordinates &)
844*c217d954SCole Faust                     {
845*c217d954SCole Faust                         const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
846*c217d954SCole Faust                         const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
847*c217d954SCole Faust 
848*c217d954SCole Faust                         int x = window_start_x;
849*c217d954SCole Faust                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
850*c217d954SCole Faust                         {
851*c217d954SCole Faust                             const float32x4x4_t texels =
852*c217d954SCole Faust                             {
853*c217d954SCole Faust                                 {
854*c217d954SCole Faust                                     vld1q_f32(src_ptr + x),
855*c217d954SCole Faust                                     vld1q_f32(src_ptr + x + 4),
856*c217d954SCole Faust                                     vld1q_f32(src_ptr + x + 8),
857*c217d954SCole Faust                                     vld1q_f32(src_ptr + x + 12),
858*c217d954SCole Faust                                 }
859*c217d954SCole Faust                             };
860*c217d954SCole Faust 
861*c217d954SCole Faust                             vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
862*c217d954SCole Faust                             vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
863*c217d954SCole Faust                         }
864*c217d954SCole Faust 
865*c217d954SCole Faust                         // Compute left-over elements
866*c217d954SCole Faust                         for(; x < window_end_x; ++x)
867*c217d954SCole Faust                         {
868*c217d954SCole Faust                             *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
869*c217d954SCole Faust                         }
870*c217d954SCole Faust                     },
871*c217d954SCole Faust                     src, dst);
872*c217d954SCole Faust                     break;
873*c217d954SCole Faust                 }
874*c217d954SCole Faust                 case DataType::QASYMM8_SIGNED:
875*c217d954SCole Faust                 {
876*c217d954SCole Faust                     /* Down-conversion F32 -> QASYMM8_SIGNED */
877*c217d954SCole Faust                     execute_window_loop(win, [&](const Coordinates &)
878*c217d954SCole Faust                     {
879*c217d954SCole Faust                         const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
880*c217d954SCole Faust                         const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
881*c217d954SCole Faust 
882*c217d954SCole Faust                         int x = window_start_x;
883*c217d954SCole Faust                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
884*c217d954SCole Faust                         {
885*c217d954SCole Faust                             const float32x4x4_t texels =
886*c217d954SCole Faust                             {
887*c217d954SCole Faust                                 {
888*c217d954SCole Faust                                     vld1q_f32(src_ptr + x),
889*c217d954SCole Faust                                     vld1q_f32(src_ptr + x + 4),
890*c217d954SCole Faust                                     vld1q_f32(src_ptr + x + 8),
891*c217d954SCole Faust                                     vld1q_f32(src_ptr + x + 12),
892*c217d954SCole Faust                                 }
893*c217d954SCole Faust                             };
894*c217d954SCole Faust 
895*c217d954SCole Faust                             vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
896*c217d954SCole Faust                             vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
897*c217d954SCole Faust                         }
898*c217d954SCole Faust                         // Compute left-over elements
899*c217d954SCole Faust                         for(; x < window_end_x; ++x)
900*c217d954SCole Faust                         {
901*c217d954SCole Faust                             *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
902*c217d954SCole Faust                         }
903*c217d954SCole Faust                     },
904*c217d954SCole Faust                     src, dst);
905*c217d954SCole Faust                     break;
906*c217d954SCole Faust                 }
907*c217d954SCole Faust 
908*c217d954SCole Faust                 default:
909*c217d954SCole Faust                     ARM_COMPUTE_ERROR("dst data type not supported");
910*c217d954SCole Faust             }
911*c217d954SCole Faust             break;
912*c217d954SCole Faust 
913*c217d954SCole Faust         case DataType::S32:
914*c217d954SCole Faust             switch(_dst->info()->data_type())
915*c217d954SCole Faust             {
916*c217d954SCole Faust                 case DataType::F16:
917*c217d954SCole Faust                 {
918*c217d954SCole Faust                     /* Down-conversion S32 -> F16 */
919*c217d954SCole Faust                     ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
920*c217d954SCole Faust                     uk->ukernel(_src, _dst, info, _policy, window);
921*c217d954SCole Faust                     break;
922*c217d954SCole Faust                 }
923*c217d954SCole Faust                 case DataType::F32:
924*c217d954SCole Faust                 {
925*c217d954SCole Faust                     /* Conversion S32 -> F32 */
926*c217d954SCole Faust                     execute_window_loop(win, [&](const Coordinates &)
927*c217d954SCole Faust                     {
928*c217d954SCole Faust                         const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
929*c217d954SCole Faust                         const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
930*c217d954SCole Faust 
931*c217d954SCole Faust                         int x = window_start_x;
932*c217d954SCole Faust                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
933*c217d954SCole Faust                         {
934*c217d954SCole Faust                             const int32x4x4_t texels =
935*c217d954SCole Faust                             {
936*c217d954SCole Faust                                 {
937*c217d954SCole Faust                                     vld1q_s32(src_ptr + x),
938*c217d954SCole Faust                                     vld1q_s32(src_ptr + x + 4),
939*c217d954SCole Faust                                     vld1q_s32(src_ptr + x + 8),
940*c217d954SCole Faust                                     vld1q_s32(src_ptr + x + 12),
941*c217d954SCole Faust                                 }
942*c217d954SCole Faust                             };
943*c217d954SCole Faust 
944*c217d954SCole Faust                             vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
945*c217d954SCole Faust                             vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
946*c217d954SCole Faust                             vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
947*c217d954SCole Faust                             vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
948*c217d954SCole Faust                         }
949*c217d954SCole Faust 
950*c217d954SCole Faust                         // Compute left-over elements
951*c217d954SCole Faust                         for(; x < window_end_x; ++x)
952*c217d954SCole Faust                         {
953*c217d954SCole Faust                             *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
954*c217d954SCole Faust                         }
955*c217d954SCole Faust                     },
956*c217d954SCole Faust                     src, dst);
957*c217d954SCole Faust                     break;
958*c217d954SCole Faust                 }
959*c217d954SCole Faust                 case DataType::QASYMM8_SIGNED:
960*c217d954SCole Faust                 {
961*c217d954SCole Faust                     /* Down-conversion S32 -> QASYMM8_SIGNED */
962*c217d954SCole Faust                     if(ConvertPolicy::SATURATE == _policy)
963*c217d954SCole Faust                     {
964*c217d954SCole Faust                         execute_window_loop(win, [&](const Coordinates &)
965*c217d954SCole Faust                         {
966*c217d954SCole Faust                             const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
967*c217d954SCole Faust                             const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
968*c217d954SCole Faust 
969*c217d954SCole Faust                             int x = window_start_x;
970*c217d954SCole Faust                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
971*c217d954SCole Faust                             {
972*c217d954SCole Faust                                 const int32x4x4_t texels =
973*c217d954SCole Faust                                 {
974*c217d954SCole Faust                                     {
975*c217d954SCole Faust                                         vld1q_s32(src_ptr + x),
976*c217d954SCole Faust                                         vld1q_s32(src_ptr + x + 4),
977*c217d954SCole Faust                                         vld1q_s32(src_ptr + x + 8),
978*c217d954SCole Faust                                         vld1q_s32(src_ptr + x + 12),
979*c217d954SCole Faust                                     }
980*c217d954SCole Faust                                 };
981*c217d954SCole Faust                                 vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
982*c217d954SCole Faust                                 vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
983*c217d954SCole Faust                             }
984*c217d954SCole Faust 
985*c217d954SCole Faust                             // Compute left-over elements
986*c217d954SCole Faust                             for(; x < window_end_x; ++x)
987*c217d954SCole Faust                             {
988*c217d954SCole Faust                                 *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
989*c217d954SCole Faust                             }
990*c217d954SCole Faust                         },
991*c217d954SCole Faust                         src, dst);
992*c217d954SCole Faust                     }
993*c217d954SCole Faust                     else
994*c217d954SCole Faust                     {
995*c217d954SCole Faust                         execute_window_loop(win, [&](const Coordinates &)
996*c217d954SCole Faust                         {
997*c217d954SCole Faust                             const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
998*c217d954SCole Faust                             const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
999*c217d954SCole Faust 
1000*c217d954SCole Faust                             int x = window_start_x;
1001*c217d954SCole Faust                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
1002*c217d954SCole Faust                             {
1003*c217d954SCole Faust                                 const int32x4x4_t texels =
1004*c217d954SCole Faust                                 {
1005*c217d954SCole Faust                                     {
1006*c217d954SCole Faust                                         vld1q_s32(src_ptr + x),
1007*c217d954SCole Faust                                         vld1q_s32(src_ptr + x + 4),
1008*c217d954SCole Faust                                         vld1q_s32(src_ptr + x + 8),
1009*c217d954SCole Faust                                         vld1q_s32(src_ptr + x + 12)
1010*c217d954SCole Faust                                     }
1011*c217d954SCole Faust                                 };
1012*c217d954SCole Faust 
1013*c217d954SCole Faust                                 vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
1014*c217d954SCole Faust                                 vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
1015*c217d954SCole Faust                             }
1016*c217d954SCole Faust 
1017*c217d954SCole Faust                             // Compute left-over elements
1018*c217d954SCole Faust                             for(; x < window_end_x; ++x)
1019*c217d954SCole Faust                             {
1020*c217d954SCole Faust                                 *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
1021*c217d954SCole Faust                             }
1022*c217d954SCole Faust                         },
1023*c217d954SCole Faust                         src, dst);
1024*c217d954SCole Faust                     }
1025*c217d954SCole Faust                     break;
1026*c217d954SCole Faust                 }
1027*c217d954SCole Faust                 case DataType::QASYMM8:
1028*c217d954SCole Faust                 case DataType::U8:
1029*c217d954SCole Faust                 {
1030*c217d954SCole Faust                     /* Down-conversion S32 -> U8 */
1031*c217d954SCole Faust                     if(ConvertPolicy::SATURATE == _policy)
1032*c217d954SCole Faust                     {
1033*c217d954SCole Faust                         execute_window_loop(win, [&](const Coordinates &)
1034*c217d954SCole Faust                         {
1035*c217d954SCole Faust                             const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
1036*c217d954SCole Faust                             const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
1037*c217d954SCole Faust 
1038*c217d954SCole Faust                             int x = window_start_x;
1039*c217d954SCole Faust                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
1040*c217d954SCole Faust                             {
1041*c217d954SCole Faust                                 const int32x4x4_t texels =
1042*c217d954SCole Faust                                 {
1043*c217d954SCole Faust                                     {
1044*c217d954SCole Faust                                         vld1q_s32(src_ptr + x),
1045*c217d954SCole Faust                                         vld1q_s32(src_ptr + x + 4),
1046*c217d954SCole Faust                                         vld1q_s32(src_ptr + x + 8),
1047*c217d954SCole Faust                                         vld1q_s32(src_ptr + x + 12)
1048*c217d954SCole Faust                                     }
1049*c217d954SCole Faust                                 };
1050*c217d954SCole Faust                                 vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
1051*c217d954SCole Faust                                 vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
1052*c217d954SCole Faust                             }
1053*c217d954SCole Faust 
1054*c217d954SCole Faust                             // Compute left-over elements
1055*c217d954SCole Faust                             for(; x < window_end_x; ++x)
1056*c217d954SCole Faust                             {
1057*c217d954SCole Faust                                 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
1058*c217d954SCole Faust                             }
1059*c217d954SCole Faust                         },
1060*c217d954SCole Faust                         src, dst);
1061*c217d954SCole Faust                     }
1062*c217d954SCole Faust                     else
1063*c217d954SCole Faust                     {
1064*c217d954SCole Faust                         execute_window_loop(win, [&](const Coordinates &)
1065*c217d954SCole Faust                         {
1066*c217d954SCole Faust                             const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
1067*c217d954SCole Faust                             const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
1068*c217d954SCole Faust 
1069*c217d954SCole Faust                             int x = window_start_x;
1070*c217d954SCole Faust                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
1071*c217d954SCole Faust                             {
1072*c217d954SCole Faust                                 const int32x4x4_t texels =
1073*c217d954SCole Faust                                 {
1074*c217d954SCole Faust                                     {
1075*c217d954SCole Faust                                         vld1q_s32(src_ptr + x),
1076*c217d954SCole Faust                                         vld1q_s32(src_ptr + x + 4),
1077*c217d954SCole Faust                                         vld1q_s32(src_ptr + x + 8),
1078*c217d954SCole Faust                                         vld1q_s32(src_ptr + x + 12)
1079*c217d954SCole Faust                                     }
1080*c217d954SCole Faust                                 };
1081*c217d954SCole Faust 
1082*c217d954SCole Faust                                 vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
1083*c217d954SCole Faust                                 vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
1084*c217d954SCole Faust                             }
1085*c217d954SCole Faust 
1086*c217d954SCole Faust                             // Compute left-over elements
1087*c217d954SCole Faust                             for(; x < window_end_x; ++x)
1088*c217d954SCole Faust                             {
1089*c217d954SCole Faust                                 *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
1090*c217d954SCole Faust                             }
1091*c217d954SCole Faust                         },
1092*c217d954SCole Faust                         src, dst);
1093*c217d954SCole Faust                     }
1094*c217d954SCole Faust                     break;
1095*c217d954SCole Faust                 }
1096*c217d954SCole Faust                 default:
1097*c217d954SCole Faust                     ARM_COMPUTE_ERROR("dst data type not supported");
1098*c217d954SCole Faust             }
1099*c217d954SCole Faust             break;
1100*c217d954SCole Faust         default:
1101*c217d954SCole Faust             ARM_COMPUTE_ERROR("Not supported");
1102*c217d954SCole Faust     }
1103*c217d954SCole Faust }
1104*c217d954SCole Faust 
name() const1105*c217d954SCole Faust const char *CpuCastKernel::name() const
1106*c217d954SCole Faust {
1107*c217d954SCole Faust     return "CpuCastKernel.cpp";
1108*c217d954SCole Faust }
1109*c217d954SCole Faust 
get_available_kernels()1110*c217d954SCole Faust const std::vector<CpuCastKernel::CastKernel> &CpuCastKernel::get_available_kernels()
1111*c217d954SCole Faust {
1112*c217d954SCole Faust     return available_kernels;
1113*c217d954SCole Faust }
1114*c217d954SCole Faust 
1115*c217d954SCole Faust } // namespace kernels
1116*c217d954SCole Faust } // namespace cpu
1117*c217d954SCole Faust } // namespace arm_compute
1118