1*c217d954SCole Faust /*
2*c217d954SCole Faust * Copyright (c) 2016-2022 Arm Limited.
3*c217d954SCole Faust *
4*c217d954SCole Faust * SPDX-License-Identifier: MIT
5*c217d954SCole Faust *
6*c217d954SCole Faust * Permission is hereby granted, free of charge, to any person obtaining a copy
7*c217d954SCole Faust * of this software and associated documentation files (the "Software"), to
8*c217d954SCole Faust * deal in the Software without restriction, including without limitation the
9*c217d954SCole Faust * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10*c217d954SCole Faust * sell copies of the Software, and to permit persons to whom the Software is
11*c217d954SCole Faust * furnished to do so, subject to the following conditions:
12*c217d954SCole Faust *
13*c217d954SCole Faust * The above copyright notice and this permission notice shall be included in all
14*c217d954SCole Faust * copies or substantial portions of the Software.
15*c217d954SCole Faust *
16*c217d954SCole Faust * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17*c217d954SCole Faust * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18*c217d954SCole Faust * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19*c217d954SCole Faust * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20*c217d954SCole Faust * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21*c217d954SCole Faust * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22*c217d954SCole Faust * SOFTWARE.
23*c217d954SCole Faust */
24*c217d954SCole Faust #include "src/cpu/kernels/CpuCastKernel.h"
25*c217d954SCole Faust
26*c217d954SCole Faust #include "arm_compute/core/Error.h"
27*c217d954SCole Faust #include "arm_compute/core/Helpers.h"
28*c217d954SCole Faust #include "arm_compute/core/ITensor.h"
29*c217d954SCole Faust #include "arm_compute/core/TensorInfo.h"
30*c217d954SCole Faust #include "arm_compute/core/Validate.h"
31*c217d954SCole Faust #include "src/core/CPP/Validate.h"
32*c217d954SCole Faust #include "src/core/NEON/NEFixedPoint.h"
33*c217d954SCole Faust #include "src/core/NEON/NEMath.h"
34*c217d954SCole Faust #include "src/core/NEON/wrapper/wrapper.h"
35*c217d954SCole Faust #include "src/core/common/Registrars.h"
36*c217d954SCole Faust #include "src/core/helpers/AutoConfiguration.h"
37*c217d954SCole Faust #include "src/core/helpers/WindowHelpers.h"
38*c217d954SCole Faust #include "support/SaturateCast.h"
39*c217d954SCole Faust
40*c217d954SCole Faust #include "src/cpu/kernels/cast/list.h"
41*c217d954SCole Faust
42*c217d954SCole Faust namespace arm_compute
43*c217d954SCole Faust {
44*c217d954SCole Faust namespace cpu
45*c217d954SCole Faust {
46*c217d954SCole Faust namespace kernels
47*c217d954SCole Faust {
48*c217d954SCole Faust namespace
49*c217d954SCole Faust {
50*c217d954SCole Faust static const std::vector<CpuCastKernel::CastKernel> available_kernels =
51*c217d954SCole Faust {
52*c217d954SCole Faust {
53*c217d954SCole Faust "neon_qs8_cast",
__anon2cc4acfe0202() 54*c217d954SCole Faust [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; },
55*c217d954SCole Faust REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast)
56*c217d954SCole Faust },
57*c217d954SCole Faust {
58*c217d954SCole Faust "neon_qu8_cast",
__anon2cc4acfe0302() 59*c217d954SCole Faust [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
60*c217d954SCole Faust REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)
61*c217d954SCole Faust },
62*c217d954SCole Faust {
63*c217d954SCole Faust "neon_u8_cast",
__anon2cc4acfe0402() 64*c217d954SCole Faust [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
65*c217d954SCole Faust REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)
66*c217d954SCole Faust },
67*c217d954SCole Faust {
68*c217d954SCole Faust "neon_fp16_cast",
__anon2cc4acfe0502() 69*c217d954SCole Faust [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F16 && data.isa.fp16; },
70*c217d954SCole Faust REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast)
71*c217d954SCole Faust },
72*c217d954SCole Faust {
73*c217d954SCole Faust "neon_fp32_to_fp16_cast",
__anon2cc4acfe0602() 74*c217d954SCole Faust [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
75*c217d954SCole Faust REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast)
76*c217d954SCole Faust },
77*c217d954SCole Faust {
78*c217d954SCole Faust "neon_fp32_to_bf16_cast",
__anon2cc4acfe0702() 79*c217d954SCole Faust [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::BFLOAT16 && data.isa.bf16; },
80*c217d954SCole Faust REGISTER_BF16_NEON(arm_compute::cpu::neon_fp32_to_bfloat16_cast)
81*c217d954SCole Faust },
82*c217d954SCole Faust {
83*c217d954SCole Faust "neon_s32_cast",
__anon2cc4acfe0802() 84*c217d954SCole Faust [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
85*c217d954SCole Faust REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast)
86*c217d954SCole Faust },
87*c217d954SCole Faust {
88*c217d954SCole Faust "neon_bf16_cast",
__anon2cc4acfe0902() 89*c217d954SCole Faust [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::BFLOAT16 && data.dst_dt == DataType::F32 && data.isa.bf16; },
90*c217d954SCole Faust REGISTER_BF16_NEON(arm_compute::cpu::neon_bfloat16_to_fp32_cast)
91*c217d954SCole Faust },
92*c217d954SCole Faust };
93*c217d954SCole Faust
validate_arguments(const ITensorInfo * src,const ITensorInfo * dst,ConvertPolicy policy)94*c217d954SCole Faust Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
95*c217d954SCole Faust {
96*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
97*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst);
98*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(src);
99*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(dst);
100*c217d954SCole Faust ARM_COMPUTE_UNUSED(policy);
101*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
102*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
103*c217d954SCole Faust DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
104*c217d954SCole Faust DataType::F32, DataType::S32);
105*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
106*c217d954SCole Faust DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
107*c217d954SCole Faust DataType::U32, DataType::S32, DataType::F32);
108*c217d954SCole Faust
109*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32
110*c217d954SCole Faust && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
111*c217d954SCole Faust "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
112*c217d954SCole Faust
113*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
114*c217d954SCole Faust && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
115*c217d954SCole Faust "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
116*c217d954SCole Faust
117*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
118*c217d954SCole Faust && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
119*c217d954SCole Faust "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
120*c217d954SCole Faust
121*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32),
122*c217d954SCole Faust "Only data_types supported [in] U16 -> [out] U8, U32");
123*c217d954SCole Faust
124*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32),
125*c217d954SCole Faust "Only data_types supported [in] S16 -> [out] U8, S32");
126*c217d954SCole Faust
127*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::BFLOAT16 && dst->data_type() != DataType::F32,
128*c217d954SCole Faust "Only data_types supported [in] BFLOAT16 -> [out] F32");
129*c217d954SCole Faust
130*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
131*c217d954SCole Faust && dst->data_type() != DataType::U8
132*c217d954SCole Faust && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32),
133*c217d954SCole Faust "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8");
134*c217d954SCole Faust
135*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
136*c217d954SCole Faust && dst->data_type() != DataType::F16 && dst->data_type() != DataType::BFLOAT16
137*c217d954SCole Faust && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8),
138*c217d954SCole Faust "Only data_types supported [in] F32 -> [out] QASYMM8, BFLOAT16, F16, S32, U8");
139*c217d954SCole Faust
140*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
141*c217d954SCole Faust && dst->data_type() != DataType::F16
142*c217d954SCole Faust && dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8),
143*c217d954SCole Faust "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8");
144*c217d954SCole Faust
145*c217d954SCole Faust // Validate in case of configured dst
146*c217d954SCole Faust if(dst->total_size() > 0)
147*c217d954SCole Faust {
148*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
149*c217d954SCole Faust }
150*c217d954SCole Faust
151*c217d954SCole Faust return Status{};
152*c217d954SCole Faust }
153*c217d954SCole Faust } // namespace
154*c217d954SCole Faust
configure(const ITensorInfo * src,ITensorInfo * dst,ConvertPolicy policy)155*c217d954SCole Faust void CpuCastKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
156*c217d954SCole Faust {
157*c217d954SCole Faust ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
158*c217d954SCole Faust
159*c217d954SCole Faust // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given)
160*c217d954SCole Faust set_shape_if_empty(*dst, src->tensor_shape());
161*c217d954SCole Faust
162*c217d954SCole Faust _policy = policy;
163*c217d954SCole Faust
164*c217d954SCole Faust ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy));
165*c217d954SCole Faust
166*c217d954SCole Faust // Configure kernel window
167*c217d954SCole Faust Window win = calculate_max_window(*src, Steps());
168*c217d954SCole Faust
169*c217d954SCole Faust ICPPKernel::configure(win);
170*c217d954SCole Faust }
171*c217d954SCole Faust
validate(const ITensorInfo * src,const ITensorInfo * dst,ConvertPolicy policy)172*c217d954SCole Faust Status CpuCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
173*c217d954SCole Faust {
174*c217d954SCole Faust ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy));
175*c217d954SCole Faust return Status{};
176*c217d954SCole Faust }
177*c217d954SCole Faust
run_op(ITensorPack & tensors,const Window & window,const ThreadInfo & info)178*c217d954SCole Faust void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
179*c217d954SCole Faust {
180*c217d954SCole Faust ARM_COMPUTE_UNUSED(info);
181*c217d954SCole Faust ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
182*c217d954SCole Faust ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
183*c217d954SCole Faust
184*c217d954SCole Faust const auto window_start_x = static_cast<int>(window.x().start());
185*c217d954SCole Faust const auto window_end_x = static_cast<int>(window.x().end());
186*c217d954SCole Faust const int window_step_x = 16;
187*c217d954SCole Faust
188*c217d954SCole Faust const ITensor *_src = tensors.get_const_tensor(TensorType::ACL_SRC);
189*c217d954SCole Faust ITensor *_dst = tensors.get_tensor(TensorType::ACL_DST);
190*c217d954SCole Faust ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
191*c217d954SCole Faust ARM_COMPUTE_ERROR_ON(_src == _dst);
192*c217d954SCole Faust
193*c217d954SCole Faust ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
194*c217d954SCole Faust
195*c217d954SCole Faust Window win{ window };
196*c217d954SCole Faust win.set(Window::DimX, Window::Dimension(0, 1, 1));
197*c217d954SCole Faust
198*c217d954SCole Faust Iterator src(_src, win);
199*c217d954SCole Faust Iterator dst(_dst, win);
200*c217d954SCole Faust
201*c217d954SCole Faust /*ukernel runs only when using fp16/bfloat16, so we validate it isn't a nullptr only before using it */
202*c217d954SCole Faust const auto *uk = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ _src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa() });
203*c217d954SCole Faust
204*c217d954SCole Faust switch(_src->info()->data_type())
205*c217d954SCole Faust {
206*c217d954SCole Faust case DataType::QASYMM8_SIGNED:
207*c217d954SCole Faust {
208*c217d954SCole Faust switch(_dst->info()->data_type())
209*c217d954SCole Faust {
210*c217d954SCole Faust case DataType::S16:
211*c217d954SCole Faust {
212*c217d954SCole Faust /* Up-conversion QASYMM8_SIGNED -> S16 */
213*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
214*c217d954SCole Faust {
215*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
216*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
217*c217d954SCole Faust int x = window_start_x;
218*c217d954SCole Faust
219*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
220*c217d954SCole Faust {
221*c217d954SCole Faust const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
222*c217d954SCole Faust
223*c217d954SCole Faust const int16x8x2_t texels =
224*c217d954SCole Faust {
225*c217d954SCole Faust {
226*c217d954SCole Faust vmovl_s8(vget_low_s8(texels_s8)),
227*c217d954SCole Faust vmovl_s8(vget_high_s8(texels_s8))
228*c217d954SCole Faust }
229*c217d954SCole Faust };
230*c217d954SCole Faust
231*c217d954SCole Faust vst1q_s16(dst_ptr + x, texels.val[0]);
232*c217d954SCole Faust vst1q_s16(dst_ptr + x + 8, texels.val[1]);
233*c217d954SCole Faust }
234*c217d954SCole Faust
235*c217d954SCole Faust // Compute left-over elements
236*c217d954SCole Faust for(; x < window_end_x; ++x)
237*c217d954SCole Faust {
238*c217d954SCole Faust *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
239*c217d954SCole Faust }
240*c217d954SCole Faust },
241*c217d954SCole Faust src, dst);
242*c217d954SCole Faust break;
243*c217d954SCole Faust }
244*c217d954SCole Faust case DataType::S32:
245*c217d954SCole Faust {
246*c217d954SCole Faust /* Up-conversion QASYMM8_SIGNED -> S32 */
247*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
248*c217d954SCole Faust {
249*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
250*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
251*c217d954SCole Faust int x = window_start_x;
252*c217d954SCole Faust
253*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
254*c217d954SCole Faust {
255*c217d954SCole Faust const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
256*c217d954SCole Faust
257*c217d954SCole Faust const int16x8x2_t texels =
258*c217d954SCole Faust {
259*c217d954SCole Faust {
260*c217d954SCole Faust vmovl_s8(vget_low_s8(texels_s8)),
261*c217d954SCole Faust vmovl_s8(vget_high_s8(texels_s8))
262*c217d954SCole Faust }
263*c217d954SCole Faust };
264*c217d954SCole Faust
265*c217d954SCole Faust vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
266*c217d954SCole Faust vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
267*c217d954SCole Faust vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
268*c217d954SCole Faust vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
269*c217d954SCole Faust }
270*c217d954SCole Faust
271*c217d954SCole Faust // Compute left-over elements
272*c217d954SCole Faust for(; x < window_end_x; ++x)
273*c217d954SCole Faust {
274*c217d954SCole Faust *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
275*c217d954SCole Faust }
276*c217d954SCole Faust },
277*c217d954SCole Faust src, dst);
278*c217d954SCole Faust break;
279*c217d954SCole Faust }
280*c217d954SCole Faust case DataType::F32:
281*c217d954SCole Faust {
282*c217d954SCole Faust /* Up-conversion QASYMM8_SIGNED -> F32 */
283*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
284*c217d954SCole Faust {
285*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
286*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
287*c217d954SCole Faust
288*c217d954SCole Faust int x = window_start_x;
289*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
290*c217d954SCole Faust {
291*c217d954SCole Faust const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
292*c217d954SCole Faust
293*c217d954SCole Faust const int16x8x2_t texels =
294*c217d954SCole Faust {
295*c217d954SCole Faust {
296*c217d954SCole Faust vmovl_s8(vget_low_s8(texels_s8)),
297*c217d954SCole Faust vmovl_s8(vget_high_s8(texels_s8))
298*c217d954SCole Faust }
299*c217d954SCole Faust };
300*c217d954SCole Faust vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
301*c217d954SCole Faust vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
302*c217d954SCole Faust vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
303*c217d954SCole Faust vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
304*c217d954SCole Faust }
305*c217d954SCole Faust
306*c217d954SCole Faust // Compute left-over elements
307*c217d954SCole Faust for(; x < window_end_x; ++x)
308*c217d954SCole Faust {
309*c217d954SCole Faust *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
310*c217d954SCole Faust }
311*c217d954SCole Faust },
312*c217d954SCole Faust src, dst);
313*c217d954SCole Faust break;
314*c217d954SCole Faust }
315*c217d954SCole Faust case DataType::F16:
316*c217d954SCole Faust {
317*c217d954SCole Faust /* Up-conversion QASYMM8_SIGNED -> F16 */
318*c217d954SCole Faust ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
319*c217d954SCole Faust uk->ukernel(_src, _dst, info, _policy, window);
320*c217d954SCole Faust break;
321*c217d954SCole Faust }
322*c217d954SCole Faust default:
323*c217d954SCole Faust ARM_COMPUTE_ERROR("dst data type not supported");
324*c217d954SCole Faust }
325*c217d954SCole Faust break;
326*c217d954SCole Faust }
327*c217d954SCole Faust
328*c217d954SCole Faust case DataType::QASYMM8:
329*c217d954SCole Faust case DataType::U8:
330*c217d954SCole Faust {
331*c217d954SCole Faust switch(_dst->info()->data_type())
332*c217d954SCole Faust {
333*c217d954SCole Faust case DataType::S16:
334*c217d954SCole Faust {
335*c217d954SCole Faust /* Up-conversion U8 -> S16 */
336*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
337*c217d954SCole Faust {
338*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
339*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
340*c217d954SCole Faust
341*c217d954SCole Faust int x = window_start_x;
342*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
343*c217d954SCole Faust {
344*c217d954SCole Faust const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
345*c217d954SCole Faust
346*c217d954SCole Faust const int16x8x2_t texels =
347*c217d954SCole Faust {
348*c217d954SCole Faust {
349*c217d954SCole Faust vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
350*c217d954SCole Faust vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
351*c217d954SCole Faust }
352*c217d954SCole Faust };
353*c217d954SCole Faust
354*c217d954SCole Faust vst1q_s16(dst_ptr + x, texels.val[0]);
355*c217d954SCole Faust vst1q_s16(dst_ptr + x + 8, texels.val[1]);
356*c217d954SCole Faust }
357*c217d954SCole Faust
358*c217d954SCole Faust // Compute left-over elements
359*c217d954SCole Faust for(; x < window_end_x; ++x)
360*c217d954SCole Faust {
361*c217d954SCole Faust *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
362*c217d954SCole Faust }
363*c217d954SCole Faust },
364*c217d954SCole Faust src, dst);
365*c217d954SCole Faust break;
366*c217d954SCole Faust }
367*c217d954SCole Faust case DataType::S32:
368*c217d954SCole Faust {
369*c217d954SCole Faust /* Up-conversion U8 -> S32 */
370*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
371*c217d954SCole Faust {
372*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
373*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
374*c217d954SCole Faust
375*c217d954SCole Faust int x = window_start_x;
376*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
377*c217d954SCole Faust {
378*c217d954SCole Faust const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
379*c217d954SCole Faust
380*c217d954SCole Faust const int16x8x2_t texels =
381*c217d954SCole Faust {
382*c217d954SCole Faust {
383*c217d954SCole Faust vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
384*c217d954SCole Faust vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
385*c217d954SCole Faust }
386*c217d954SCole Faust };
387*c217d954SCole Faust
388*c217d954SCole Faust vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
389*c217d954SCole Faust vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
390*c217d954SCole Faust vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
391*c217d954SCole Faust vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
392*c217d954SCole Faust }
393*c217d954SCole Faust
394*c217d954SCole Faust // Compute left-over elements
395*c217d954SCole Faust for(; x < window_end_x; ++x)
396*c217d954SCole Faust {
397*c217d954SCole Faust *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
398*c217d954SCole Faust }
399*c217d954SCole Faust },
400*c217d954SCole Faust src, dst);
401*c217d954SCole Faust break;
402*c217d954SCole Faust }
403*c217d954SCole Faust case DataType::F32:
404*c217d954SCole Faust {
405*c217d954SCole Faust /* Up-conversion U8 -> F32 */
406*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
407*c217d954SCole Faust {
408*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
409*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
410*c217d954SCole Faust
411*c217d954SCole Faust int x = window_start_x;
412*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
413*c217d954SCole Faust {
414*c217d954SCole Faust const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
415*c217d954SCole Faust
416*c217d954SCole Faust const int16x8x2_t texels =
417*c217d954SCole Faust {
418*c217d954SCole Faust {
419*c217d954SCole Faust vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
420*c217d954SCole Faust vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
421*c217d954SCole Faust }
422*c217d954SCole Faust };
423*c217d954SCole Faust vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
424*c217d954SCole Faust vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
425*c217d954SCole Faust vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
426*c217d954SCole Faust vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
427*c217d954SCole Faust }
428*c217d954SCole Faust
429*c217d954SCole Faust // Compute left-over elements
430*c217d954SCole Faust for(; x < window_end_x; ++x)
431*c217d954SCole Faust {
432*c217d954SCole Faust *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
433*c217d954SCole Faust }
434*c217d954SCole Faust },
435*c217d954SCole Faust src, dst);
436*c217d954SCole Faust break;
437*c217d954SCole Faust }
438*c217d954SCole Faust case DataType::F16:
439*c217d954SCole Faust {
440*c217d954SCole Faust /* Up-conversion U8 -> FP16 */
441*c217d954SCole Faust ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
442*c217d954SCole Faust uk->ukernel(_src, _dst, info, _policy, window);
443*c217d954SCole Faust break;
444*c217d954SCole Faust }
445*c217d954SCole Faust case DataType::U16:
446*c217d954SCole Faust {
447*c217d954SCole Faust /* Up-conversion U8 -> U16 */
448*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
449*c217d954SCole Faust {
450*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
451*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
452*c217d954SCole Faust
453*c217d954SCole Faust int x = window_start_x;
454*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
455*c217d954SCole Faust {
456*c217d954SCole Faust const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
457*c217d954SCole Faust
458*c217d954SCole Faust const uint16x8x2_t texels =
459*c217d954SCole Faust {
460*c217d954SCole Faust {
461*c217d954SCole Faust vmovl_u8(vget_low_u8(texels_u8)),
462*c217d954SCole Faust vmovl_u8(vget_high_u8(texels_u8))
463*c217d954SCole Faust }
464*c217d954SCole Faust };
465*c217d954SCole Faust
466*c217d954SCole Faust vst1q_u16(dst_ptr + x, texels.val[0]);
467*c217d954SCole Faust vst1q_u16(dst_ptr + x + 8, texels.val[1]);
468*c217d954SCole Faust }
469*c217d954SCole Faust
470*c217d954SCole Faust // Compute left-over elements
471*c217d954SCole Faust for(; x < window_end_x; ++x)
472*c217d954SCole Faust {
473*c217d954SCole Faust *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
474*c217d954SCole Faust }
475*c217d954SCole Faust },
476*c217d954SCole Faust src, dst);
477*c217d954SCole Faust break;
478*c217d954SCole Faust }
479*c217d954SCole Faust default:
480*c217d954SCole Faust ARM_COMPUTE_ERROR("dst data type not supported");
481*c217d954SCole Faust }
482*c217d954SCole Faust break;
483*c217d954SCole Faust }
484*c217d954SCole Faust case DataType::S16:
485*c217d954SCole Faust {
486*c217d954SCole Faust switch(_dst->info()->data_type())
487*c217d954SCole Faust {
488*c217d954SCole Faust case DataType::QASYMM8_SIGNED:
489*c217d954SCole Faust {
490*c217d954SCole Faust /* Down-conversion S16 -> QASYMM8_SIGNED */
491*c217d954SCole Faust if(ConvertPolicy::SATURATE == _policy)
492*c217d954SCole Faust {
493*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
494*c217d954SCole Faust {
495*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
496*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
497*c217d954SCole Faust
498*c217d954SCole Faust int x = window_start_x;
499*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
500*c217d954SCole Faust {
501*c217d954SCole Faust const int16x8x2_t texels =
502*c217d954SCole Faust {
503*c217d954SCole Faust {
504*c217d954SCole Faust vld1q_s16(src_ptr + x),
505*c217d954SCole Faust vld1q_s16(src_ptr + x + 8)
506*c217d954SCole Faust }
507*c217d954SCole Faust };
508*c217d954SCole Faust
509*c217d954SCole Faust vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
510*c217d954SCole Faust }
511*c217d954SCole Faust
512*c217d954SCole Faust // Compute left-over elements
513*c217d954SCole Faust for(; x < window_end_x; ++x)
514*c217d954SCole Faust {
515*c217d954SCole Faust *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
516*c217d954SCole Faust }
517*c217d954SCole Faust },
518*c217d954SCole Faust src, dst);
519*c217d954SCole Faust }
520*c217d954SCole Faust else
521*c217d954SCole Faust {
522*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
523*c217d954SCole Faust {
524*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
525*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
526*c217d954SCole Faust
527*c217d954SCole Faust int x = window_start_x;
528*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
529*c217d954SCole Faust {
530*c217d954SCole Faust const int16x8x2_t texels =
531*c217d954SCole Faust {
532*c217d954SCole Faust {
533*c217d954SCole Faust vld1q_s16(src_ptr + x),
534*c217d954SCole Faust vld1q_s16(src_ptr + x + 8)
535*c217d954SCole Faust }
536*c217d954SCole Faust };
537*c217d954SCole Faust
538*c217d954SCole Faust vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
539*c217d954SCole Faust }
540*c217d954SCole Faust
541*c217d954SCole Faust // Compute left-over elements
542*c217d954SCole Faust for(; x < window_end_x; ++x)
543*c217d954SCole Faust {
544*c217d954SCole Faust *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
545*c217d954SCole Faust }
546*c217d954SCole Faust },
547*c217d954SCole Faust src, dst);
548*c217d954SCole Faust }
549*c217d954SCole Faust break;
550*c217d954SCole Faust }
551*c217d954SCole Faust case DataType::U8:
552*c217d954SCole Faust {
553*c217d954SCole Faust /* Down-conversion S16 -> U8 */
554*c217d954SCole Faust if(ConvertPolicy::SATURATE == _policy)
555*c217d954SCole Faust {
556*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
557*c217d954SCole Faust {
558*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
559*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
560*c217d954SCole Faust
561*c217d954SCole Faust int x = window_start_x;
562*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
563*c217d954SCole Faust {
564*c217d954SCole Faust const int16x8x2_t texels =
565*c217d954SCole Faust {
566*c217d954SCole Faust {
567*c217d954SCole Faust vld1q_s16(src_ptr + x),
568*c217d954SCole Faust vld1q_s16(src_ptr + x + 8)
569*c217d954SCole Faust }
570*c217d954SCole Faust };
571*c217d954SCole Faust
572*c217d954SCole Faust vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
573*c217d954SCole Faust }
574*c217d954SCole Faust
575*c217d954SCole Faust // Compute left-over elements
576*c217d954SCole Faust for(; x < window_end_x; ++x)
577*c217d954SCole Faust {
578*c217d954SCole Faust *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
579*c217d954SCole Faust }
580*c217d954SCole Faust },
581*c217d954SCole Faust src, dst);
582*c217d954SCole Faust }
583*c217d954SCole Faust else
584*c217d954SCole Faust {
585*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
586*c217d954SCole Faust {
587*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
588*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
589*c217d954SCole Faust
590*c217d954SCole Faust int x = window_start_x;
591*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
592*c217d954SCole Faust {
593*c217d954SCole Faust const int16x8x2_t texels =
594*c217d954SCole Faust {
595*c217d954SCole Faust {
596*c217d954SCole Faust vld1q_s16(src_ptr + x),
597*c217d954SCole Faust vld1q_s16(src_ptr + x + 8)
598*c217d954SCole Faust }
599*c217d954SCole Faust };
600*c217d954SCole Faust
601*c217d954SCole Faust vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
602*c217d954SCole Faust vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
603*c217d954SCole Faust }
604*c217d954SCole Faust
605*c217d954SCole Faust // Compute left-over elements
606*c217d954SCole Faust for(; x < window_end_x; ++x)
607*c217d954SCole Faust {
608*c217d954SCole Faust *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
609*c217d954SCole Faust }
610*c217d954SCole Faust },
611*c217d954SCole Faust src, dst);
612*c217d954SCole Faust }
613*c217d954SCole Faust break;
614*c217d954SCole Faust }
615*c217d954SCole Faust case DataType::S32:
616*c217d954SCole Faust {
617*c217d954SCole Faust /* Up-conversion S16 -> S32 */
618*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
619*c217d954SCole Faust {
620*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
621*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
622*c217d954SCole Faust
623*c217d954SCole Faust int x = window_start_x;
624*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
625*c217d954SCole Faust {
626*c217d954SCole Faust const int16x8x2_t texels =
627*c217d954SCole Faust {
628*c217d954SCole Faust {
629*c217d954SCole Faust vld1q_s16(src_ptr + x),
630*c217d954SCole Faust vld1q_s16(src_ptr + x + 8)
631*c217d954SCole Faust }
632*c217d954SCole Faust };
633*c217d954SCole Faust
634*c217d954SCole Faust const int32x4x4_t texels_s32 =
635*c217d954SCole Faust {
636*c217d954SCole Faust {
637*c217d954SCole Faust vmovl_s16(vget_low_s16(texels.val[0])),
638*c217d954SCole Faust vmovl_s16(vget_high_s16(texels.val[0])),
639*c217d954SCole Faust vmovl_s16(vget_low_s16(texels.val[1])),
640*c217d954SCole Faust vmovl_s16(vget_high_s16(texels.val[1]))
641*c217d954SCole Faust }
642*c217d954SCole Faust };
643*c217d954SCole Faust
644*c217d954SCole Faust vst1q_s32(dst_ptr + x, texels_s32.val[0]);
645*c217d954SCole Faust vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
646*c217d954SCole Faust vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
647*c217d954SCole Faust vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
648*c217d954SCole Faust }
649*c217d954SCole Faust
650*c217d954SCole Faust // Compute left-over elements
651*c217d954SCole Faust for(; x < window_end_x; ++x)
652*c217d954SCole Faust {
653*c217d954SCole Faust *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
654*c217d954SCole Faust }
655*c217d954SCole Faust },
656*c217d954SCole Faust src, dst);
657*c217d954SCole Faust break;
658*c217d954SCole Faust }
659*c217d954SCole Faust default:
660*c217d954SCole Faust ARM_COMPUTE_ERROR("dst data type not supported");
661*c217d954SCole Faust }
662*c217d954SCole Faust break;
663*c217d954SCole Faust }
664*c217d954SCole Faust
665*c217d954SCole Faust case DataType::U16:
666*c217d954SCole Faust {
667*c217d954SCole Faust switch(_dst->info()->data_type())
668*c217d954SCole Faust {
669*c217d954SCole Faust case DataType::U8:
670*c217d954SCole Faust {
671*c217d954SCole Faust /* Down-conversion U16 -> U8 */
672*c217d954SCole Faust if(ConvertPolicy::SATURATE == _policy)
673*c217d954SCole Faust {
674*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
675*c217d954SCole Faust {
676*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
677*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
678*c217d954SCole Faust
679*c217d954SCole Faust int x = window_start_x;
680*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
681*c217d954SCole Faust {
682*c217d954SCole Faust const uint16x8x2_t texels =
683*c217d954SCole Faust {
684*c217d954SCole Faust {
685*c217d954SCole Faust vld1q_u16(src_ptr + x),
686*c217d954SCole Faust vld1q_u16(src_ptr + x + 8)
687*c217d954SCole Faust }
688*c217d954SCole Faust };
689*c217d954SCole Faust
690*c217d954SCole Faust vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
691*c217d954SCole Faust }
692*c217d954SCole Faust
693*c217d954SCole Faust // Compute left-over elements
694*c217d954SCole Faust for(; x < window_end_x; ++x)
695*c217d954SCole Faust {
696*c217d954SCole Faust *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
697*c217d954SCole Faust }
698*c217d954SCole Faust },
699*c217d954SCole Faust src, dst);
700*c217d954SCole Faust }
701*c217d954SCole Faust else
702*c217d954SCole Faust {
703*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
704*c217d954SCole Faust {
705*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
706*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
707*c217d954SCole Faust
708*c217d954SCole Faust int x = window_start_x;
709*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
710*c217d954SCole Faust {
711*c217d954SCole Faust const uint16x8x2_t texels =
712*c217d954SCole Faust {
713*c217d954SCole Faust {
714*c217d954SCole Faust vld1q_u16(src_ptr + x),
715*c217d954SCole Faust vld1q_u16(src_ptr + x + 8)
716*c217d954SCole Faust }
717*c217d954SCole Faust };
718*c217d954SCole Faust
719*c217d954SCole Faust vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
720*c217d954SCole Faust }
721*c217d954SCole Faust
722*c217d954SCole Faust // Compute left-over elements
723*c217d954SCole Faust for(; x < window_end_x; ++x)
724*c217d954SCole Faust {
725*c217d954SCole Faust *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
726*c217d954SCole Faust }
727*c217d954SCole Faust
728*c217d954SCole Faust },
729*c217d954SCole Faust src, dst);
730*c217d954SCole Faust }
731*c217d954SCole Faust break;
732*c217d954SCole Faust }
733*c217d954SCole Faust case DataType::U32:
734*c217d954SCole Faust {
735*c217d954SCole Faust /* Up-conversion U16 -> U32 */
736*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
737*c217d954SCole Faust {
738*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
739*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
740*c217d954SCole Faust
741*c217d954SCole Faust int x = window_start_x;
742*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
743*c217d954SCole Faust {
744*c217d954SCole Faust const uint16x8x2_t texels =
745*c217d954SCole Faust {
746*c217d954SCole Faust {
747*c217d954SCole Faust vld1q_u16(src_ptr + x),
748*c217d954SCole Faust vld1q_u16(src_ptr + x + 8)
749*c217d954SCole Faust }
750*c217d954SCole Faust };
751*c217d954SCole Faust
752*c217d954SCole Faust vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
753*c217d954SCole Faust vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
754*c217d954SCole Faust vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
755*c217d954SCole Faust vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
756*c217d954SCole Faust }
757*c217d954SCole Faust // Compute left-over elements
758*c217d954SCole Faust for(; x < window_end_x; ++x)
759*c217d954SCole Faust {
760*c217d954SCole Faust *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
761*c217d954SCole Faust }
762*c217d954SCole Faust
763*c217d954SCole Faust },
764*c217d954SCole Faust src, dst);
765*c217d954SCole Faust break;
766*c217d954SCole Faust }
767*c217d954SCole Faust default:
768*c217d954SCole Faust ARM_COMPUTE_ERROR("dst data type not supported");
769*c217d954SCole Faust }
770*c217d954SCole Faust break;
771*c217d954SCole Faust }
772*c217d954SCole Faust case DataType::BFLOAT16:
773*c217d954SCole Faust {
774*c217d954SCole Faust /* Up-conversion BFLOAT16 -> F32 */
775*c217d954SCole Faust ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
776*c217d954SCole Faust uk->ukernel(_src, _dst, info, _policy, window);
777*c217d954SCole Faust break;
778*c217d954SCole Faust }
779*c217d954SCole Faust case DataType::F16:
780*c217d954SCole Faust {
781*c217d954SCole Faust /* conversion F16 -> any data type */
782*c217d954SCole Faust ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
783*c217d954SCole Faust uk->ukernel(_src, _dst, info, _policy, window);
784*c217d954SCole Faust break;
785*c217d954SCole Faust }
786*c217d954SCole Faust case DataType::F32:
787*c217d954SCole Faust switch(_dst->info()->data_type())
788*c217d954SCole Faust {
789*c217d954SCole Faust case DataType::F16:
790*c217d954SCole Faust {
791*c217d954SCole Faust /* Down-conversion F32 -> F16 */
792*c217d954SCole Faust ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
793*c217d954SCole Faust uk->ukernel(_src, _dst, info, _policy, window);
794*c217d954SCole Faust break;
795*c217d954SCole Faust }
796*c217d954SCole Faust case DataType::BFLOAT16:
797*c217d954SCole Faust {
798*c217d954SCole Faust /* Down-conversion F32 -> BFLOAT16 */
799*c217d954SCole Faust ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
800*c217d954SCole Faust uk->ukernel(_src, _dst, info, _policy, window);
801*c217d954SCole Faust break;
802*c217d954SCole Faust }
803*c217d954SCole Faust case DataType::S32:
804*c217d954SCole Faust {
805*c217d954SCole Faust /* Conversion F32 -> S32 */
806*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
807*c217d954SCole Faust {
808*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
809*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
810*c217d954SCole Faust
811*c217d954SCole Faust int x = window_start_x;
812*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
813*c217d954SCole Faust {
814*c217d954SCole Faust const float32x4x4_t texels =
815*c217d954SCole Faust {
816*c217d954SCole Faust {
817*c217d954SCole Faust vld1q_f32(src_ptr + x),
818*c217d954SCole Faust vld1q_f32(src_ptr + x + 4),
819*c217d954SCole Faust vld1q_f32(src_ptr + x + 8),
820*c217d954SCole Faust vld1q_f32(src_ptr + x + 12),
821*c217d954SCole Faust }
822*c217d954SCole Faust };
823*c217d954SCole Faust
824*c217d954SCole Faust vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
825*c217d954SCole Faust vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
826*c217d954SCole Faust vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
827*c217d954SCole Faust vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
828*c217d954SCole Faust }
829*c217d954SCole Faust
830*c217d954SCole Faust // Compute left-over elements
831*c217d954SCole Faust for(; x < window_end_x; ++x)
832*c217d954SCole Faust {
833*c217d954SCole Faust *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
834*c217d954SCole Faust }
835*c217d954SCole Faust },
836*c217d954SCole Faust src, dst);
837*c217d954SCole Faust break;
838*c217d954SCole Faust }
839*c217d954SCole Faust case DataType::QASYMM8:
840*c217d954SCole Faust case DataType::U8:
841*c217d954SCole Faust {
842*c217d954SCole Faust /* Down-conversion F32 -> U8 */
843*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
844*c217d954SCole Faust {
845*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
846*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
847*c217d954SCole Faust
848*c217d954SCole Faust int x = window_start_x;
849*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
850*c217d954SCole Faust {
851*c217d954SCole Faust const float32x4x4_t texels =
852*c217d954SCole Faust {
853*c217d954SCole Faust {
854*c217d954SCole Faust vld1q_f32(src_ptr + x),
855*c217d954SCole Faust vld1q_f32(src_ptr + x + 4),
856*c217d954SCole Faust vld1q_f32(src_ptr + x + 8),
857*c217d954SCole Faust vld1q_f32(src_ptr + x + 12),
858*c217d954SCole Faust }
859*c217d954SCole Faust };
860*c217d954SCole Faust
861*c217d954SCole Faust vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
862*c217d954SCole Faust vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
863*c217d954SCole Faust }
864*c217d954SCole Faust
865*c217d954SCole Faust // Compute left-over elements
866*c217d954SCole Faust for(; x < window_end_x; ++x)
867*c217d954SCole Faust {
868*c217d954SCole Faust *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
869*c217d954SCole Faust }
870*c217d954SCole Faust },
871*c217d954SCole Faust src, dst);
872*c217d954SCole Faust break;
873*c217d954SCole Faust }
874*c217d954SCole Faust case DataType::QASYMM8_SIGNED:
875*c217d954SCole Faust {
876*c217d954SCole Faust /* Down-conversion F32 -> QASYMM8_SIGNED */
877*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
878*c217d954SCole Faust {
879*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
880*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
881*c217d954SCole Faust
882*c217d954SCole Faust int x = window_start_x;
883*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
884*c217d954SCole Faust {
885*c217d954SCole Faust const float32x4x4_t texels =
886*c217d954SCole Faust {
887*c217d954SCole Faust {
888*c217d954SCole Faust vld1q_f32(src_ptr + x),
889*c217d954SCole Faust vld1q_f32(src_ptr + x + 4),
890*c217d954SCole Faust vld1q_f32(src_ptr + x + 8),
891*c217d954SCole Faust vld1q_f32(src_ptr + x + 12),
892*c217d954SCole Faust }
893*c217d954SCole Faust };
894*c217d954SCole Faust
895*c217d954SCole Faust vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
896*c217d954SCole Faust vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
897*c217d954SCole Faust }
898*c217d954SCole Faust // Compute left-over elements
899*c217d954SCole Faust for(; x < window_end_x; ++x)
900*c217d954SCole Faust {
901*c217d954SCole Faust *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
902*c217d954SCole Faust }
903*c217d954SCole Faust },
904*c217d954SCole Faust src, dst);
905*c217d954SCole Faust break;
906*c217d954SCole Faust }
907*c217d954SCole Faust
908*c217d954SCole Faust default:
909*c217d954SCole Faust ARM_COMPUTE_ERROR("dst data type not supported");
910*c217d954SCole Faust }
911*c217d954SCole Faust break;
912*c217d954SCole Faust
913*c217d954SCole Faust case DataType::S32:
914*c217d954SCole Faust switch(_dst->info()->data_type())
915*c217d954SCole Faust {
916*c217d954SCole Faust case DataType::F16:
917*c217d954SCole Faust {
918*c217d954SCole Faust /* Down-conversion S32 -> F16 */
919*c217d954SCole Faust ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
920*c217d954SCole Faust uk->ukernel(_src, _dst, info, _policy, window);
921*c217d954SCole Faust break;
922*c217d954SCole Faust }
923*c217d954SCole Faust case DataType::F32:
924*c217d954SCole Faust {
925*c217d954SCole Faust /* Conversion S32 -> F32 */
926*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
927*c217d954SCole Faust {
928*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
929*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
930*c217d954SCole Faust
931*c217d954SCole Faust int x = window_start_x;
932*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
933*c217d954SCole Faust {
934*c217d954SCole Faust const int32x4x4_t texels =
935*c217d954SCole Faust {
936*c217d954SCole Faust {
937*c217d954SCole Faust vld1q_s32(src_ptr + x),
938*c217d954SCole Faust vld1q_s32(src_ptr + x + 4),
939*c217d954SCole Faust vld1q_s32(src_ptr + x + 8),
940*c217d954SCole Faust vld1q_s32(src_ptr + x + 12),
941*c217d954SCole Faust }
942*c217d954SCole Faust };
943*c217d954SCole Faust
944*c217d954SCole Faust vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
945*c217d954SCole Faust vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
946*c217d954SCole Faust vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
947*c217d954SCole Faust vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
948*c217d954SCole Faust }
949*c217d954SCole Faust
950*c217d954SCole Faust // Compute left-over elements
951*c217d954SCole Faust for(; x < window_end_x; ++x)
952*c217d954SCole Faust {
953*c217d954SCole Faust *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
954*c217d954SCole Faust }
955*c217d954SCole Faust },
956*c217d954SCole Faust src, dst);
957*c217d954SCole Faust break;
958*c217d954SCole Faust }
959*c217d954SCole Faust case DataType::QASYMM8_SIGNED:
960*c217d954SCole Faust {
961*c217d954SCole Faust /* Down-conversion S32 -> QASYMM8_SIGNED */
962*c217d954SCole Faust if(ConvertPolicy::SATURATE == _policy)
963*c217d954SCole Faust {
964*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
965*c217d954SCole Faust {
966*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
967*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
968*c217d954SCole Faust
969*c217d954SCole Faust int x = window_start_x;
970*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
971*c217d954SCole Faust {
972*c217d954SCole Faust const int32x4x4_t texels =
973*c217d954SCole Faust {
974*c217d954SCole Faust {
975*c217d954SCole Faust vld1q_s32(src_ptr + x),
976*c217d954SCole Faust vld1q_s32(src_ptr + x + 4),
977*c217d954SCole Faust vld1q_s32(src_ptr + x + 8),
978*c217d954SCole Faust vld1q_s32(src_ptr + x + 12),
979*c217d954SCole Faust }
980*c217d954SCole Faust };
981*c217d954SCole Faust vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
982*c217d954SCole Faust vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
983*c217d954SCole Faust }
984*c217d954SCole Faust
985*c217d954SCole Faust // Compute left-over elements
986*c217d954SCole Faust for(; x < window_end_x; ++x)
987*c217d954SCole Faust {
988*c217d954SCole Faust *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
989*c217d954SCole Faust }
990*c217d954SCole Faust },
991*c217d954SCole Faust src, dst);
992*c217d954SCole Faust }
993*c217d954SCole Faust else
994*c217d954SCole Faust {
995*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
996*c217d954SCole Faust {
997*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
998*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
999*c217d954SCole Faust
1000*c217d954SCole Faust int x = window_start_x;
1001*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
1002*c217d954SCole Faust {
1003*c217d954SCole Faust const int32x4x4_t texels =
1004*c217d954SCole Faust {
1005*c217d954SCole Faust {
1006*c217d954SCole Faust vld1q_s32(src_ptr + x),
1007*c217d954SCole Faust vld1q_s32(src_ptr + x + 4),
1008*c217d954SCole Faust vld1q_s32(src_ptr + x + 8),
1009*c217d954SCole Faust vld1q_s32(src_ptr + x + 12)
1010*c217d954SCole Faust }
1011*c217d954SCole Faust };
1012*c217d954SCole Faust
1013*c217d954SCole Faust vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
1014*c217d954SCole Faust vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
1015*c217d954SCole Faust }
1016*c217d954SCole Faust
1017*c217d954SCole Faust // Compute left-over elements
1018*c217d954SCole Faust for(; x < window_end_x; ++x)
1019*c217d954SCole Faust {
1020*c217d954SCole Faust *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
1021*c217d954SCole Faust }
1022*c217d954SCole Faust },
1023*c217d954SCole Faust src, dst);
1024*c217d954SCole Faust }
1025*c217d954SCole Faust break;
1026*c217d954SCole Faust }
1027*c217d954SCole Faust case DataType::QASYMM8:
1028*c217d954SCole Faust case DataType::U8:
1029*c217d954SCole Faust {
1030*c217d954SCole Faust /* Down-conversion S32 -> U8 */
1031*c217d954SCole Faust if(ConvertPolicy::SATURATE == _policy)
1032*c217d954SCole Faust {
1033*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
1034*c217d954SCole Faust {
1035*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
1036*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
1037*c217d954SCole Faust
1038*c217d954SCole Faust int x = window_start_x;
1039*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
1040*c217d954SCole Faust {
1041*c217d954SCole Faust const int32x4x4_t texels =
1042*c217d954SCole Faust {
1043*c217d954SCole Faust {
1044*c217d954SCole Faust vld1q_s32(src_ptr + x),
1045*c217d954SCole Faust vld1q_s32(src_ptr + x + 4),
1046*c217d954SCole Faust vld1q_s32(src_ptr + x + 8),
1047*c217d954SCole Faust vld1q_s32(src_ptr + x + 12)
1048*c217d954SCole Faust }
1049*c217d954SCole Faust };
1050*c217d954SCole Faust vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
1051*c217d954SCole Faust vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
1052*c217d954SCole Faust }
1053*c217d954SCole Faust
1054*c217d954SCole Faust // Compute left-over elements
1055*c217d954SCole Faust for(; x < window_end_x; ++x)
1056*c217d954SCole Faust {
1057*c217d954SCole Faust *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
1058*c217d954SCole Faust }
1059*c217d954SCole Faust },
1060*c217d954SCole Faust src, dst);
1061*c217d954SCole Faust }
1062*c217d954SCole Faust else
1063*c217d954SCole Faust {
1064*c217d954SCole Faust execute_window_loop(win, [&](const Coordinates &)
1065*c217d954SCole Faust {
1066*c217d954SCole Faust const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
1067*c217d954SCole Faust const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
1068*c217d954SCole Faust
1069*c217d954SCole Faust int x = window_start_x;
1070*c217d954SCole Faust for(; x <= (window_end_x - window_step_x); x += window_step_x)
1071*c217d954SCole Faust {
1072*c217d954SCole Faust const int32x4x4_t texels =
1073*c217d954SCole Faust {
1074*c217d954SCole Faust {
1075*c217d954SCole Faust vld1q_s32(src_ptr + x),
1076*c217d954SCole Faust vld1q_s32(src_ptr + x + 4),
1077*c217d954SCole Faust vld1q_s32(src_ptr + x + 8),
1078*c217d954SCole Faust vld1q_s32(src_ptr + x + 12)
1079*c217d954SCole Faust }
1080*c217d954SCole Faust };
1081*c217d954SCole Faust
1082*c217d954SCole Faust vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
1083*c217d954SCole Faust vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
1084*c217d954SCole Faust }
1085*c217d954SCole Faust
1086*c217d954SCole Faust // Compute left-over elements
1087*c217d954SCole Faust for(; x < window_end_x; ++x)
1088*c217d954SCole Faust {
1089*c217d954SCole Faust *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
1090*c217d954SCole Faust }
1091*c217d954SCole Faust },
1092*c217d954SCole Faust src, dst);
1093*c217d954SCole Faust }
1094*c217d954SCole Faust break;
1095*c217d954SCole Faust }
1096*c217d954SCole Faust default:
1097*c217d954SCole Faust ARM_COMPUTE_ERROR("dst data type not supported");
1098*c217d954SCole Faust }
1099*c217d954SCole Faust break;
1100*c217d954SCole Faust default:
1101*c217d954SCole Faust ARM_COMPUTE_ERROR("Not supported");
1102*c217d954SCole Faust }
1103*c217d954SCole Faust }
1104*c217d954SCole Faust
name() const1105*c217d954SCole Faust const char *CpuCastKernel::name() const
1106*c217d954SCole Faust {
1107*c217d954SCole Faust return "CpuCastKernel.cpp";
1108*c217d954SCole Faust }
1109*c217d954SCole Faust
get_available_kernels()1110*c217d954SCole Faust const std::vector<CpuCastKernel::CastKernel> &CpuCastKernel::get_available_kernels()
1111*c217d954SCole Faust {
1112*c217d954SCole Faust return available_kernels;
1113*c217d954SCole Faust }
1114*c217d954SCole Faust
1115*c217d954SCole Faust } // namespace kernels
1116*c217d954SCole Faust } // namespace cpu
1117*c217d954SCole Faust } // namespace arm_compute
1118