xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/detail/NEColorConvertHelper.inl (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1/*
2 * Copyright (c) 2016-2020 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/Error.h"
25#include "arm_compute/core/Helpers.h"
26#include "arm_compute/core/IMultiImage.h"
27#include "arm_compute/core/Utils.h"
28#include "src/core/NEON/NEMath.h"
29
30#include <arm_neon.h>
31
32namespace
33{
34#ifndef DOXYGEN_SKIP_THIS
// YUV -> RGB weights (BT.709), applied to the chroma values after 128 has been subtracted.
constexpr float red_coef_bt709{ 1.5748f };     // V contribution to red
constexpr float green_coef_bt709{ -0.1873f };  // U contribution to green
constexpr float green_coef2_bt709{ -0.4681f }; // V contribution to green
constexpr float blue_coef_bt709{ 1.8556f };    // U contribution to blue

// RGB -> YUV weights (BT.709).
constexpr float rgb2yuv_bt709_kr{ 0.2126f };
constexpr float rgb2yuv_bt709_kb{ 0.0722f };
// K_g = 1 - K_r - K_b
constexpr float rgb2yuv_bt709_kg{ 0.7152f };
// C_u = 1 / (2 * (1 - K_b))
constexpr float rgb2yuv_bt709_cu{ 0.5389f };
// C_v = 1 / (2 * (1 - K_r))
constexpr float rgb2yuv_bt709_cv{ 0.6350f };

// RGB -> greyscale (U8) weights.
constexpr float rgb2u8_red_coef{ 0.2126f };
constexpr float rgb2u8_green_coef{ 0.7152f };
constexpr float rgb2u8_blue_coef{ 0.0722f };
52
53inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor,
54                                                const float rcoef, const float gcoef, const float bcoef)
55{
56    float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef);
57    greyscale             = vmlaq_n_f32(greyscale, gcolor, gcoef);
58    greyscale             = vmlaq_n_f32(greyscale, bcolor, bcoef);
59    return greyscale;
60}
61
62inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out)
63{
64    float32x4x4_t out_float32;
65
66    //Conversion from 3(RGB) 4 uint8s to 3(RGB) 4 floats
67    const float32x4x4_t r_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[0]);
68    const float32x4x4_t g_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[1]);
69    const float32x4x4_t b_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[2]);
70
71    //New grayscale image = ( (RED_COEFF * R) + (GREEN_COEFF * G) + (BLUE_COEFF * B) )
72    //Computation of 1(Greyscale) 4 uint8 using 3(RGB) 4 uint8s float
73    out_float32.val[0] = rgb_to_greyscale_calculation(r_float32.val[0], g_float32.val[0], b_float32.val[0],
74                                                      rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
75
76    out_float32.val[1] = rgb_to_greyscale_calculation(r_float32.val[1], g_float32.val[1], b_float32.val[1],
77                                                      rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
78
79    out_float32.val[2] = rgb_to_greyscale_calculation(r_float32.val[2], g_float32.val[2], b_float32.val[2],
80                                                      rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
81
82    out_float32.val[3] = rgb_to_greyscale_calculation(r_float32.val[3], g_float32.val[3], b_float32.val[3],
83                                                      rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
84
85    //Conversion from 1(Greyscale) 4 floats to 1(Greyscale) 4 uint8s
86    arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out);
87}
88
89inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
90                                   float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
91{
92    /*
93    Y'= 0.2126*R' + 0.7152*G' + 0.0722*B'
94    U'=-0.1146*R' - 0.3854*G' + 0.5000*B'
95    V'= 0.5000*R' - 0.4542*G' - 0.0458*B'
96    */
97    const auto c128 = vdupq_n_f32(128.f);
98
99    // Y = R * K_r + G * (1 - K_r - K_b) * B * K_b
100    yvec = vmulq_n_f32(rvec, rgb2yuv_bt709_kr);
101    yvec = vmlaq_n_f32(yvec, gvec, rgb2yuv_bt709_kg);
102    yvec = vmlaq_n_f32(yvec, bvec, rgb2yuv_bt709_kb);
103
104    // U = (B - Y) / (2 * (1 - K_b))
105    uvec = vsubq_f32(bvec, yvec);
106    uvec = vmlaq_n_f32(c128, uvec, rgb2yuv_bt709_cu);
107
108    // V = (R - Y) / (2 * (1 - K_r))
109    vvec = vsubq_f32(rvec, yvec);
110    vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
111}
112
113inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
114                                    float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
115{
116    float32x4x3_t rgb1, rgb2;
117
118    // Compute: cb - 128 and cr - 128;
119    const auto c128 = vdupq_n_f32(128.f);
120    uvec_val        = vsubq_f32(uvec_val, c128);
121    vvec_val        = vsubq_f32(vvec_val, c128);
122
123    // Compute:
124    // r = 0.0000f*f_u + 1.5748f*f_v;
125    // g = 0.1873f*f_u - 0.4681f*f_v;
126    // b = 1.8556f*f_u + 0.0000f*f_v;
127    const auto red   = vmulq_n_f32(vvec_val, red_coef_bt709);
128    const auto blue  = vmulq_n_f32(uvec_val, blue_coef_bt709);
129    const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
130                                 vmulq_n_f32(vvec_val, green_coef2_bt709));
131
132    // Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
133    // the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t
134    // and written back to memory using vst3 instruction
135
136    rgb1.val[0] = vaddq_f32(yvec_val, red);
137    rgb1.val[1] = vaddq_f32(yvec_val, green);
138    rgb1.val[2] = vaddq_f32(yvec_val, blue);
139
140    rgb2.val[0] = vaddq_f32(yyvec_val, red);
141    rgb2.val[1] = vaddq_f32(yyvec_val, green);
142    rgb2.val[2] = vaddq_f32(yyvec_val, blue);
143
144    uint8x8x3_t u8_rgb;
145    arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);
146
147    if(!alpha)
148    {
149        vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
150        vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
151        vst3_lane_u8(&output_ptr[6], u8_rgb, 1);
152        vst3_lane_u8(&output_ptr[9], u8_rgb, 5);
153        vst3_lane_u8(&output_ptr[12], u8_rgb, 2);
154        vst3_lane_u8(&output_ptr[15], u8_rgb, 6);
155        vst3_lane_u8(&output_ptr[18], u8_rgb, 3);
156        vst3_lane_u8(&output_ptr[21], u8_rgb, 7);
157    }
158    else
159    {
160        uint8x8x4_t u8_rgba;
161        u8_rgba.val[0] = u8_rgb.val[0];
162        u8_rgba.val[1] = u8_rgb.val[1];
163        u8_rgba.val[2] = u8_rgb.val[2];
164        u8_rgba.val[3] = vdup_n_u8(255);
165        vst4_lane_u8(&output_ptr[0], u8_rgba, 0);
166        vst4_lane_u8(&output_ptr[4], u8_rgba, 4);
167        vst4_lane_u8(&output_ptr[8], u8_rgba, 1);
168        vst4_lane_u8(&output_ptr[12], u8_rgba, 5);
169        vst4_lane_u8(&output_ptr[16], u8_rgba, 2);
170        vst4_lane_u8(&output_ptr[20], u8_rgba, 6);
171        vst4_lane_u8(&output_ptr[24], u8_rgba, 3);
172        vst4_lane_u8(&output_ptr[28], u8_rgba, 7);
173    }
174}
175
176inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha)
177{
178    uint8x16x3_t rgb;
179
180    if(alpha)
181    {
182        const auto tmp = vld4q_u8(ptr);
183        rgb.val[0]     = tmp.val[0];
184        rgb.val[1]     = tmp.val[1];
185        rgb.val[2]     = tmp.val[2];
186    }
187    else
188    {
189        rgb = vld3q_u8(ptr);
190    }
191
192    return rgb;
193}
194
195inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_bottom)
196{
197    // Convert the uint8x16_t to float32x4x4_t
198    const float32x4x4_t frvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[0]);
199    const float32x4x4_t fgvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[1]);
200    const float32x4x4_t fbvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[2]);
201
202    const float32x4x4_t frvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[0]);
203    const float32x4x4_t fgvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[1]);
204    const float32x4x4_t fbvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[2]);
205
206    float32x4x4_t fyvec_top, fuvec_top, fvvec_top;
207    float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom;
208
209    for(auto i = 0; i < 4; ++i)
210    {
211        rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i],
212                               fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]);
213        rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i],
214                               fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]);
215    }
216
217    arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]);
218    arm_compute::convert_float32x4x4_to_uint8x16(fuvec_top, vec_top.val[1]);
219    arm_compute::convert_float32x4x4_to_uint8x16(fvvec_top, vec_top.val[2]);
220    arm_compute::convert_float32x4x4_to_uint8x16(fyvec_bottom, vec_bottom.val[0]);
221    arm_compute::convert_float32x4x4_to_uint8x16(fuvec_bottom, vec_bottom.val[1]);
222    arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]);
223}
224
225inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
226                              const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
227                              unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
228                              unsigned char *const __restrict out_uv)
229{
230    uint8x16x3_t vec_top, vec_bottom;
231    vec_top.val[0]    = rvec_top;
232    vec_top.val[1]    = gvec_top;
233    vec_top.val[2]    = bvec_top;
234    vec_bottom.val[0] = rvec_bottom;
235    vec_bottom.val[1] = gvec_bottom;
236    vec_bottom.val[2] = bvec_bottom;
237
238    rgb_to_yuv_conversion(vec_top, vec_bottom);
239
240    vst1q_u8(out_y_top, vec_top.val[0]);
241    vst1q_u8(out_y_bottom, vec_bottom.val[0]);
242
243    const auto uvec = vuzpq_u8(vec_top.val[1], vec_bottom.val[1]);
244    const auto vvec = vuzpq_u8(vec_top.val[2], vec_bottom.val[2]);
245    const auto utmp = vrhaddq_u8(uvec.val[0], uvec.val[1]);
246    const auto vtmp = vrhaddq_u8(vvec.val[0], vvec.val[1]);
247
248    uint8x8x2_t uvvec;
249    uvvec.val[0] = vhadd_u8(vget_low_u8(utmp), vget_high_u8(utmp));
250    uvvec.val[1] = vhadd_u8(vget_low_u8(vtmp), vget_high_u8(vtmp));
251
252    vst2_u8(out_uv, uvvec);
253}
254
255inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
256                              const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
257                              unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
258                              unsigned char *const __restrict out_u,
259                              unsigned char *const __restrict out_v)
260{
261    uint8x16x3_t vec_top, vec_bottom;
262    vec_top.val[0]    = rvec_top;
263    vec_top.val[1]    = gvec_top;
264    vec_top.val[2]    = bvec_top;
265    vec_bottom.val[0] = rvec_bottom;
266    vec_bottom.val[1] = gvec_bottom;
267    vec_bottom.val[2] = bvec_bottom;
268
269    rgb_to_yuv_conversion(vec_top, vec_bottom);
270
271    vst1q_u8(out_y_top, vec_top.val[0]);
272    vst1q_u8(out_y_bottom, vec_bottom.val[0]);
273
274    const auto uvvec_top    = vuzpq_u8(vec_top.val[1], vec_top.val[2]);
275    const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]);
276    const auto uvvec        = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]),
277                                        vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
278
279    vst1_u8(out_u, vget_low_u8(uvvec));
280    vst1_u8(out_v, vget_high_u8(uvvec));
281}
282
283inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec,
284                              unsigned char *const __restrict out_y,
285                              unsigned char *const __restrict out_u,
286                              unsigned char *const __restrict out_v)
287{
288    // Convert the uint8x16_t to float32x4x4_t
289    const float32x4x4_t frvec = arm_compute::convert_uint8x16_to_float32x4x4(rvec);
290    const float32x4x4_t fgvec = arm_compute::convert_uint8x16_to_float32x4x4(gvec);
291    const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec);
292
293    float32x4x4_t fyvec, fuvec, fvvec;
294    for(auto i = 0; i < 4; ++i)
295    {
296        rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i],
297                               fyvec.val[i], fuvec.val[i], fvvec.val[i]);
298    }
299
300    uint8x16_t yvec, uvec, vvec;
301    arm_compute::convert_float32x4x4_to_uint8x16(fyvec, yvec);
302    arm_compute::convert_float32x4x4_to_uint8x16(fuvec, uvec);
303    arm_compute::convert_float32x4x4_to_uint8x16(fvvec, vvec);
304
305    vst1q_u8(out_y, yvec);
306    vst1q_u8(out_u, uvec);
307    vst1q_u8(out_v, vvec);
308}
309#endif /* DOXYGEN_SKIP_THIS */
310}
311
312namespace arm_compute
313{
314/** Convert RGB to RGBX.
315 *
316 * @param[in]  input  Input RGB data buffer.
317 * @param[out] output Output RGBX buffer.
318 * @param[in]  win    Window for iterating the buffers.
319 *
320 */
321void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict output, const Window &win)
322{
323    ARM_COMPUTE_ERROR_ON(nullptr == input);
324    ARM_COMPUTE_ERROR_ON(nullptr == output);
325
326    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
327    const auto output_ptr = static_cast<IImage *__restrict>(output);
328
329    Iterator in(input_ptr, win);
330    Iterator out(output_ptr, win);
331
332    execute_window_loop(win, [&](const Coordinates &)
333    {
334        const auto   ta1 = vld3q_u8(in.ptr());
335        uint8x16x4_t ta2;
336        ta2.val[0] = ta1.val[0];
337        ta2.val[1] = ta1.val[1];
338        ta2.val[2] = ta1.val[2];
339        ta2.val[3] = vdupq_n_u8(255);
340        vst4q_u8(out.ptr(), ta2);
341    },
342    in, out);
343}
344
345/** Convert RGB to U8.
346 *
347 * @param[in]  input  Input RGB data buffer.
348 * @param[out] output Output U8 buffer.
349 * @param[in]  win    Window for iterating the buffers.
350 *
351 */
352void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict output, const Window &win)
353{
354    ARM_COMPUTE_ERROR_ON(nullptr == input);
355    ARM_COMPUTE_ERROR_ON(nullptr == output);
356
357    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
358    const auto output_ptr = static_cast<IImage *__restrict>(output);
359
360    Iterator in(input_ptr, win);
361    Iterator out(output_ptr, win);
362
363    execute_window_loop(win, [&](const Coordinates &)
364    {
365        const auto ta1 = vld3q_u8(in.ptr());
366        uint8x16_t ta2;
367        rgb_to_u8_conversion(ta1, ta2);
368        vst1q_u8(out.ptr(), ta2);
369    },
370    in, out);
371}
372
373/** Convert RGBX to RGB.
374 *
375 * @param[in]  input  Input RGBX data buffer.
376 * @param[out] output Output RGB buffer.
377 * @param[in]  win    Window for iterating the buffers.
378 *
379 */
380void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win)
381{
382    ARM_COMPUTE_ERROR_ON(nullptr == input);
383    ARM_COMPUTE_ERROR_ON(nullptr == output);
384
385    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
386    const auto output_ptr = static_cast<IImage *__restrict>(output);
387
388    Iterator in(input_ptr, win);
389    Iterator out(output_ptr, win);
390
391    execute_window_loop(win, [&](const Coordinates &)
392    {
393        const auto   ta1 = vld4q_u8(in.ptr());
394        uint8x16x3_t ta2;
395        ta2.val[0] = ta1.val[0];
396        ta2.val[1] = ta1.val[1];
397        ta2.val[2] = ta1.val[2];
398        vst3q_u8(out.ptr(), ta2);
399    },
400    in, out);
401}
402
403/** Convert YUYV to RGB.
404 *
405 * @param[in]  input  Input YUYV data buffer.
406 * @param[out] output Output RGB buffer.
407 * @param[in]  win    Window for iterating the buffers.
408 *
409 */
410template <bool yuyv, bool alpha>
411void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
412{
413    ARM_COMPUTE_ERROR_ON(nullptr == input);
414    ARM_COMPUTE_ERROR_ON(nullptr == output);
415
416    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
417    const auto output_ptr = static_cast<IImage *__restrict>(output);
418
419    constexpr auto element_size = alpha ? 32 : 24;
420    constexpr auto shift        = yuyv ? 0 : 1;
421
422    Iterator in(input_ptr, win);
423    Iterator out(output_ptr, win);
424
425    execute_window_loop(win, [&](const Coordinates &)
426    {
427        const auto ta = vld4q_u8(in.ptr());
428        //ta.val[0] = Y0 Y2 Y4 Y6 ...
429        //ta.val[1] = U0 U2 U4 U6 ...
430        //ta.val[2] = Y1 Y3 Y5 Y7 ...
431        //ta.val[3] = V0 V2 V4 V7 ...
432
433        // Convert the uint8x16x4_t to float32x4x4_t
434        const float32x4x4_t yvec  = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]);
435        const float32x4x4_t uvec  = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]);
436        const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]);
437        const float32x4x4_t vvec  = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]);
438
439        yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
440        yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
441        yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
442        yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
443    },
444    in, out);
445}
446
447/** Convert NV12 to RGB.
448 *
449 * @param[in]  input  Input NV12 data buffer.
450 * @param[out] output Output RGB buffer.
451 * @param[in]  win    Window for iterating the buffers.
452 *
453 */
454template <bool uv, bool alpha>
455void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
456{
457    ARM_COMPUTE_ERROR_ON(nullptr == input);
458    ARM_COMPUTE_ERROR_ON(nullptr == output);
459    win.validate();
460
461    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
462    const auto output_ptr = static_cast<IImage *__restrict>(output);
463
464    constexpr auto element_size = alpha ? 32 : 24;
465    const auto     out_stride   = output_ptr->info()->strides_in_bytes().y();
466    constexpr auto shift        = uv ? 0 : 1;
467
468    // UV's width and height are subsampled
469    Window win_uv(win);
470    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win.x().step() / 2));
471    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
472    win_uv.validate();
473
474    Iterator in_y(input_ptr->plane(0), win);
475    Iterator in_uv(input_ptr->plane(1), win_uv);
476    Iterator out(output_ptr, win);
477
478    execute_window_loop(win, [&](const Coordinates &)
479    {
480        const auto ta_y_top    = vld2q_u8(in_y.ptr());
481        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
482        const auto ta_uv       = vld2q_u8(in_uv.ptr());
483        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
484        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
485        //ta_uv.val[0] = U0 U2 U4 U6 ...
486        //ta_uv.val[1] = V0 V2 V4 V6 ...
487
488        // Convert the uint8x16x4_t to float32x4x4_t
489        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
490        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
491        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
492        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
493        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]);
494        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]);
495
496        yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
497        yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
498        yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
499        yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
500
501        yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
502        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
503        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
504        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
505    },
506    in_y, in_uv, out);
507}
508
509/** Convert IYUV to RGB.
510 *
511 * @param[in]  input  Input IYUV data buffer.
512 * @param[out] output Output RGB buffer.
513 * @param[in]  win    Window for iterating the buffers.
514 *
515 */
516template <bool alpha>
517void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
518{
519    ARM_COMPUTE_ERROR_ON(nullptr == input);
520    ARM_COMPUTE_ERROR_ON(nullptr == output);
521    win.validate();
522
523    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
524    const auto output_ptr = static_cast<IImage *__restrict>(output);
525
526    constexpr auto element_size = alpha ? 32 : 24;
527    const auto     out_stride   = output_ptr->info()->strides_in_bytes().y();
528
529    // UV's width and height are subsampled
530    Window win_uv(win);
531    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
532    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
533    win_uv.validate();
534
535    Iterator in_y(input_ptr->plane(0), win);
536    Iterator in_u(input_ptr->plane(1), win_uv);
537    Iterator in_v(input_ptr->plane(2), win_uv);
538    Iterator out(output_ptr, win);
539
540    execute_window_loop(win, [&](const Coordinates &)
541    {
542        const auto *y_top_ptr    = in_y.ptr();
543        const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
544        const auto *u_ptr        = in_u.ptr();
545        const auto *v_ptr        = in_v.ptr();
546
547        // Work-around issue in gcc 9(>=) where vld2q might cause issues with register allocation
548#if defined(__arch64__)
549        const auto ta0_y_top    = vld1q_u8(y_top_ptr);
550        const auto ta1_y_top    = vld1q_u8(y_top_ptr + 16);
551        const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
552        const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
553        const auto ta_u         = vld1q_u8(u_ptr);
554        const auto ta_v         = vld1q_u8(v_ptr);
555
556        // Convert the uint8x16x4_t to float32x4x4_t
557        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
558        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
559        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
560        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
561        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
562        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
563#else  /* defined(__arch64__) */
564        const auto ta_y_top    = vld2q_u8(y_top_ptr);
565        const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
566        const auto ta_u        = vld1q_u8(u_ptr);
567        const auto ta_v        = vld1q_u8(v_ptr);
568        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
569        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
570        //ta_u.val[0] = U0 U2 U4 U6 ...
571        //ta_v.val[0] = V0 V2 V4 V6 ...
572
573        // Convert the uint8x16x4_t to float32x4x4_t
574        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
575        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
576        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
577        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
578        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
579        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
580#endif /* defined(__arch64__) */
581
582        yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
583        yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
584        yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
585        yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
586
587        yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
588        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
589        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
590        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
591    },
592    in_y, in_u, in_v, out);
593}
594
595/** Convert YUYV to NV12.
596 *
597 * @param[in]  input  Input YUYV data buffer.
598 * @param[out] output Output NV12 buffer.
599 * @param[in]  win    Window for iterating the buffers.
600 *
601 */
602template <bool yuyv>
603void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
604{
605    ARM_COMPUTE_ERROR_ON(nullptr == input);
606    ARM_COMPUTE_ERROR_ON(nullptr == output);
607    win.validate();
608
609    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
610    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
611
612    constexpr auto shift = yuyv ? 0 : 1;
613
614    // NV12's UV's width and height are subsampled
615    Window win_uv(win);
616    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
617    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
618    win_uv.validate();
619
620    Iterator in(input_ptr, win);
621    Iterator out_y(output_ptr->plane(0), win);
622    Iterator out_uv(output_ptr->plane(1), win_uv);
623
624    execute_window_loop(win, [&](const Coordinates &)
625    {
626        const auto ta_top    = vld4q_u8(in.ptr());
627        const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
628        //ta.val[0] = Y0 Y2 Y4 Y6 ...
629        //ta.val[1] = U0 U2 U4 U6 ...
630        //ta.val[2] = Y1 Y3 Y5 Y7 ...
631        //ta.val[3] = V0 V2 V4 V7 ...
632
633        uint8x16x2_t yvec;
634        yvec.val[0] = ta_top.val[0 + shift];
635        yvec.val[1] = ta_top.val[2 + shift];
636        vst2q_u8(out_y.ptr(), yvec);
637
638        uint8x16x2_t yyvec;
639        yyvec.val[0] = ta_bottom.val[0 + shift];
640        yyvec.val[1] = ta_bottom.val[2 + shift];
641        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
642
643        uint8x16x2_t uvvec;
644        uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
645        uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
646        vst2q_u8(out_uv.ptr(), uvvec);
647    },
648    in, out_y, out_uv);
649}
650
651/** Convert IYUV to NV12.
652 *
653 * @param[in]  input  Input IYUV data buffer.
654 * @param[out] output Output NV12 buffer.
655 * @param[in]  win    Window for iterating the buffers.
656 *
657 */
658void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
659{
660    ARM_COMPUTE_ERROR_ON(nullptr == input);
661    ARM_COMPUTE_ERROR_ON(nullptr == output);
662    win.validate();
663
664    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
665    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
666
667    // UV's width and height are subsampled
668    Window win_uv(win);
669    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
670    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
671    win_uv.validate();
672
673    Iterator in_y(input_ptr->plane(0), win);
674    Iterator in_u(input_ptr->plane(1), win_uv);
675    Iterator in_v(input_ptr->plane(2), win_uv);
676    Iterator out_y(output_ptr->plane(0), win);
677    Iterator out_uv(output_ptr->plane(1), win_uv);
678
679    execute_window_loop(win, [&](const Coordinates &)
680    {
681        const auto   ta_y_top    = vld2q_u8(in_y.ptr());
682        const auto   ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
683        uint8x16x2_t ta_uv;
684        ta_uv.val[0] = vld1q_u8(in_u.ptr());
685        ta_uv.val[1] = vld1q_u8(in_v.ptr());
686        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
687        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
688        //ta_uv.val[0] = U0 U2 U4 U6 ...
689        //ta_uv.val[1] = V0 V2 V4 V6 ...
690
691        vst2q_u8(out_y.ptr(), ta_y_top);
692        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
693        vst2q_u8(out_uv.ptr(), ta_uv);
694    },
695    in_y, in_u, in_v, out_y, out_uv);
696}
697
698/** Convert NV12 to IYUV.
699 *
700 * @param[in]  input  Input NV12 data buffer.
701 * @param[out] output Output IYUV buffer.
702 * @param[in]  win    Window for iterating the buffers.
703 *
704 */
705template <bool uv>
706void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
707{
708    ARM_COMPUTE_ERROR_ON(nullptr == input);
709    ARM_COMPUTE_ERROR_ON(nullptr == output);
710    win.validate();
711
712    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
713    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
714
715    constexpr auto shift = uv ? 0 : 1;
716
717    // UV's width and height are subsampled
718    Window win_uv(win);
719    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
720    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
721    win_uv.validate();
722
723    Iterator in_y(input_ptr->plane(0), win);
724    Iterator in_uv(input_ptr->plane(1), win_uv);
725    Iterator out_y(output_ptr->plane(0), win);
726    Iterator out_u(output_ptr->plane(1), win_uv);
727    Iterator out_v(output_ptr->plane(2), win_uv);
728
729    execute_window_loop(win, [&](const Coordinates &)
730    {
731        const auto ta_y_top    = vld2q_u8(in_y.ptr());
732        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
733        const auto ta_uv       = vld2q_u8(in_uv.ptr());
734        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
735        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
736        //ta_uv.val[0] = U0 U2 U4 U6 ...
737        //ta_uv.val[1] = V0 V2 V4 V6 ...
738
739        vst2q_u8(out_y.ptr(), ta_y_top);
740        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
741        vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
742        vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
743    },
744    in_y, in_uv, out_y, out_u, out_v);
745}
746
747/** Convert YUYV to IYUV.
748 *
749 * @param[in]  input  Input YUYV data buffer.
750 * @param[out] output Output IYUV buffer.
751 * @param[in]  win    Window for iterating the buffers.
752 *
753 */
754template <bool yuyv>
755void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
756{
757    ARM_COMPUTE_ERROR_ON(nullptr == input);
758    ARM_COMPUTE_ERROR_ON(nullptr == output);
759    win.validate();
760
761    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
762    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
763
764    constexpr auto shift = yuyv ? 0 : 1;
765
766    // Destination's UV's width and height are subsampled
767    Window win_uv(win);
768    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
769    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
770    win_uv.validate();
771
772    Iterator in(input_ptr, win);
773    Iterator out_y(output_ptr->plane(0), win);
774    Iterator out_u(output_ptr->plane(1), win_uv);
775    Iterator out_v(output_ptr->plane(2), win_uv);
776
777    execute_window_loop(win, [&](const Coordinates &)
778    {
779        const auto ta_top    = vld4q_u8(in.ptr());
780        const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
781        //ta.val[0] = Y0 Y2 Y4 Y6 ...
782        //ta.val[1] = U0 U2 U4 U6 ...
783        //ta.val[2] = Y1 Y3 Y5 Y7 ...
784        //ta.val[3] = V0 V2 V4 V7 ...
785
786        uint8x16x2_t yvec;
787        yvec.val[0] = ta_top.val[0 + shift];
788        yvec.val[1] = ta_top.val[2 + shift];
789        vst2q_u8(out_y.ptr(), yvec);
790
791        uint8x16x2_t yyvec;
792        yyvec.val[0] = ta_bottom.val[0 + shift];
793        yyvec.val[1] = ta_bottom.val[2 + shift];
794        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
795
796        uint8x16_t uvec;
797        uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
798        vst1q_u8(out_u.ptr(), uvec);
799
800        uint8x16_t vvec;
801        vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
802        vst1q_u8(out_v.ptr(), vvec);
803    },
804    in, out_y, out_u, out_v);
805}
806
807/** Convert NV12 to YUV4.
808 *
809 * @param[in]  input  Input NV12 data buffer.
810 * @param[out] output Output YUV4 buffer.
811 * @param[in]  win    Window for iterating the buffers.
812 *
813 */
814template <bool uv>
815void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
816{
817    ARM_COMPUTE_ERROR_ON(nullptr == input);
818    ARM_COMPUTE_ERROR_ON(nullptr == output);
819    win.validate();
820
821    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
822    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
823
824    constexpr auto shift = uv ? 0 : 1;
825
826    // UV's width and height are subsampled
827    Window win_uv(win);
828    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
829    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
830    win_uv.validate();
831
832    Iterator in_y(input_ptr->plane(0), win);
833    Iterator in_uv(input_ptr->plane(1), win_uv);
834    Iterator out_y(output_ptr->plane(0), win);
835    Iterator out_u(output_ptr->plane(1), win);
836    Iterator out_v(output_ptr->plane(2), win);
837
838    execute_window_loop(win, [&](const Coordinates &)
839    {
840        const auto ta_y_top    = vld2q_u8(in_y.ptr());
841        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
842        const auto ta_uv       = vld2q_u8(in_uv.ptr());
843        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
844        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
845        //ta_uv.val[0] = U0 U2 U4 U6 ...
846        //ta_uv.val[1] = V0 V2 V4 V6 ...
847
848        vst2q_u8(out_y.ptr(), ta_y_top);
849        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
850
851        uint8x16x2_t uvec;
852        uvec.val[0] = ta_uv.val[0 + shift];
853        uvec.val[1] = ta_uv.val[0 + shift];
854        vst2q_u8(out_u.ptr(), uvec);
855        vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
856
857        uint8x16x2_t vvec;
858        vvec.val[0] = ta_uv.val[1 - shift];
859        vvec.val[1] = ta_uv.val[1 - shift];
860        vst2q_u8(out_v.ptr(), vvec);
861        vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
862    },
863    in_y, in_uv, out_y, out_u, out_v);
864}
865
866/** Convert IYUV to YUV4.
867 *
868 * @param[in]  input  Input IYUV data buffer.
869 * @param[out] output Output YUV4 buffer.
870 * @param[in]  win    Window for iterating the buffers.
871 *
872 */
873void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
874{
875    ARM_COMPUTE_ERROR_ON(nullptr == input);
876    ARM_COMPUTE_ERROR_ON(nullptr == output);
877    win.validate();
878
879    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
880    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
881
882    // UV's width and height are subsampled
883    Window win_uv(win);
884    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
885    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
886    win_uv.validate();
887
888    Iterator in_y(input_ptr->plane(0), win);
889    Iterator in_u(input_ptr->plane(1), win_uv);
890    Iterator in_v(input_ptr->plane(2), win_uv);
891    Iterator out_y(output_ptr->plane(0), win);
892    Iterator out_u(output_ptr->plane(1), win);
893    Iterator out_v(output_ptr->plane(2), win);
894
895    execute_window_loop(win, [&](const Coordinates &)
896    {
897        const auto ta_y_top    = vld2q_u8(in_y.ptr());
898        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
899        const auto ta_u        = vld1q_u8(in_u.ptr());
900        const auto ta_v        = vld1q_u8(in_v.ptr());
901        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
902        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
903        //ta_u = U0 U2 U4 U6 ...
904        //ta_v = V0 V2 V4 V6 ...
905
906        vst2q_u8(out_y.ptr(), ta_y_top);
907        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
908
909        uint8x16x2_t uvec;
910        uvec.val[0] = ta_u;
911        uvec.val[1] = ta_u;
912        vst2q_u8(out_u.ptr(), uvec);
913        vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
914
915        uint8x16x2_t vvec;
916        vvec.val[0] = ta_v;
917        vvec.val[1] = ta_v;
918        vst2q_u8(out_v.ptr(), vvec);
919        vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
920    },
921    in_y, in_u, in_v, out_y, out_u, out_v);
922}
923
924/** Convert RGB to NV12.
925 *
926 * @param[in]  input  Input RGB data buffer.
927 * @param[out] output Output NV12 buffer.
928 * @param[in]  win    Window for iterating the buffers.
929 *
930 */
931template <bool alpha>
932void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
933{
934    ARM_COMPUTE_ERROR_ON(nullptr == input);
935    ARM_COMPUTE_ERROR_ON(nullptr == output);
936    win.validate();
937
938    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
939    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
940
941    // UV's width and height are subsampled
942    Window win_uv(win);
943    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
944    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
945    win_uv.validate();
946
947    Iterator in(input_ptr, win);
948    Iterator out_y(output_ptr->plane(0), win);
949    Iterator out_uv(output_ptr->plane(1), win_uv);
950
951    execute_window_loop(win, [&](const Coordinates &)
952    {
953        const auto ta_rgb_top    = load_rgb(in.ptr(), alpha);
954        const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
955        //ta_rgb.val[0] = R0 R1 R2 R3 ...
956        //ta_rgb.val[1] = G0 G1 G2 G3 ...
957        //ta_rgb.val[2] = B0 B1 B2 B3 ...
958
959        store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
960                          ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
961                          out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
962                          out_uv.ptr());
963    },
964    in, out_y, out_uv);
965}
966
967/** Convert RGB to IYUV.
968 *
969 * @param[in]  input  Input RGB data buffer.
970 * @param[out] output Output IYUV buffer.
971 * @param[in]  win    Window for iterating the buffers.
972 *
973 */
974template <bool alpha>
975void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
976{
977    ARM_COMPUTE_ERROR_ON(nullptr == input);
978    ARM_COMPUTE_ERROR_ON(nullptr == output);
979    win.validate();
980
981    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
982    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
983
984    // UV's width and height are subsampled
985    Window win_uv(win);
986    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
987    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
988    win_uv.validate();
989
990    Iterator in(input_ptr, win);
991    Iterator out_y(output_ptr->plane(0), win);
992    Iterator out_u(output_ptr->plane(1), win_uv);
993    Iterator out_v(output_ptr->plane(2), win_uv);
994
995    execute_window_loop(win, [&](const Coordinates &)
996    {
997        const auto ta_rgb_top    = load_rgb(in.ptr(), alpha);
998        const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
999        //ta_rgb.val[0] = R0 R1 R2 R3 ...
1000        //ta_rgb.val[1] = G0 G1 G2 G3 ...
1001        //ta_rgb.val[2] = B0 B1 B2 B3 ...
1002
1003        store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
1004                          ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
1005                          out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
1006                          out_u.ptr(), out_v.ptr());
1007    },
1008    in, out_y, out_u, out_v);
1009}
1010
1011/** Convert RGB to YUV4.
1012 *
1013 * @param[in]  input  Input RGB data buffer.
1014 * @param[out] output Output YUV4 buffer.
1015 * @param[in]  win    Window for iterating the buffers.
1016 *
1017 */
1018template <bool alpha>
1019void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
1020{
1021    ARM_COMPUTE_ERROR_ON(nullptr == input);
1022    ARM_COMPUTE_ERROR_ON(nullptr == output);
1023    win.validate();
1024
1025    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
1026    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
1027
1028    Iterator in(input_ptr, win);
1029    Iterator out_y(output_ptr->plane(0), win);
1030    Iterator out_u(output_ptr->plane(1), win);
1031    Iterator out_v(output_ptr->plane(2), win);
1032
1033    execute_window_loop(win, [&](const Coordinates &)
1034    {
1035        const auto ta_rgb = load_rgb(in.ptr(), alpha);
1036        //ta_rgb.val[0] = R0 R1 R2 R3 ...
1037        //ta_rgb.val[1] = G0 G1 G2 G3 ...
1038        //ta_rgb.val[2] = B0 B1 B2 B3 ...
1039
1040        store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2],
1041                          out_y.ptr(), out_u.ptr(), out_v.ptr());
1042    },
1043    in, out_y, out_u, out_v);
1044}
1045} // namespace arm_compute
1046