1/* 2 * Copyright (c) 2016-2020 Arm Limited. 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to 8 * deal in the Software without restriction, including without limitation the 9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 * sell copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in all 14 * copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24#include "arm_compute/core/Error.h" 25#include "arm_compute/core/Helpers.h" 26#include "arm_compute/core/IMultiImage.h" 27#include "arm_compute/core/Utils.h" 28#include "src/core/NEON/NEMath.h" 29 30#include <arm_neon.h> 31 32namespace 33{ 34#ifndef DOXYGEN_SKIP_THIS 35constexpr float red_coef_bt709 = 1.5748F; 36constexpr float green_coef_bt709 = -0.1873f; 37constexpr float green_coef2_bt709 = -0.4681f; 38constexpr float blue_coef_bt709 = 1.8556f; 39 40constexpr float rgb2yuv_bt709_kr = 0.2126f; 41constexpr float rgb2yuv_bt709_kb = 0.0722f; 42// K_g = 1 - K_r - K_b 43constexpr float rgb2yuv_bt709_kg = 0.7152f; 44// C_u = 1 / (2 * (1 - K_b)) 45constexpr float rgb2yuv_bt709_cu = 0.5389f; 46// C_v = 1 / (2 * (1 - K_r)) 47constexpr float rgb2yuv_bt709_cv = 0.6350f; 48 49constexpr float rgb2u8_red_coef = 0.2126f; 50constexpr float rgb2u8_green_coef = 0.7152f; 51constexpr float rgb2u8_blue_coef = 0.0722f; 52 53inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor, 54 const float rcoef, const float gcoef, const float bcoef) 55{ 56 float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef); 57 greyscale = vmlaq_n_f32(greyscale, gcolor, gcoef); 58 greyscale = vmlaq_n_f32(greyscale, bcolor, bcoef); 59 return greyscale; 60} 61 62inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out) 63{ 64 float32x4x4_t out_float32; 65 66 //Conversion from 3(RGB) 4 uint8s to 3(RGB) 4 floats 67 const float32x4x4_t r_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[0]); 68 const float32x4x4_t g_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[1]); 69 const float32x4x4_t b_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[2]); 70 71 //New grayscale image = ( (RED_COEFF * R) + (GREEN_COEFF * G) + (BLUE_COEFF * B) ) 72 //Computation of 1(Greyscale) 4 uint8 using 3(RGB) 4 uint8s float 73 out_float32.val[0] = rgb_to_greyscale_calculation(r_float32.val[0], g_float32.val[0], b_float32.val[0], 74 rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef); 75 76 out_float32.val[1] = rgb_to_greyscale_calculation(r_float32.val[1], g_float32.val[1], b_float32.val[1], 77 rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef); 78 79 out_float32.val[2] = rgb_to_greyscale_calculation(r_float32.val[2], g_float32.val[2], b_float32.val[2], 80 rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef); 81 82 out_float32.val[3] = rgb_to_greyscale_calculation(r_float32.val[3], g_float32.val[3], b_float32.val[3], 83 rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef); 84 85 //Conversion from 1(Greyscale) 4 floats to 1(Greyscale) 4 uint8s 86 arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out); 87} 88 89inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec, 90 float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec) 91{ 92 /* 93 Y'= 0.2126*R' + 0.7152*G' + 0.0722*B' 94 U'=-0.1146*R' - 0.3854*G' + 0.5000*B' 95 V'= 0.5000*R' - 0.4542*G' - 0.0458*B' 96 */ 97 const auto c128 = vdupq_n_f32(128.f); 98 99 // Y = R * K_r + G * (1 - K_r - K_b) * B * K_b 100 yvec = vmulq_n_f32(rvec, rgb2yuv_bt709_kr); 101 yvec = vmlaq_n_f32(yvec, gvec, rgb2yuv_bt709_kg); 102 yvec = vmlaq_n_f32(yvec, bvec, rgb2yuv_bt709_kb); 103 104 // U = (B - Y) / (2 * (1 - K_b)) 105 uvec = vsubq_f32(bvec, yvec); 106 uvec = vmlaq_n_f32(c128, uvec, rgb2yuv_bt709_cu); 107 108 // V = (R - Y) / (2 * (1 - K_r)) 109 vvec = vsubq_f32(rvec, yvec); 110 vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv); 111} 112 113inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val, 114 float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha) 115{ 116 float32x4x3_t rgb1, rgb2; 117 118 // Compute: cb - 128 and cr - 128; 119 const auto c128 = vdupq_n_f32(128.f); 120 uvec_val = vsubq_f32(uvec_val, c128); 121 vvec_val = vsubq_f32(vvec_val, c128); 122 123 // Compute: 124 // r = 0.0000f*f_u + 1.5748f*f_v; 125 // g = 0.1873f*f_u - 0.4681f*f_v; 126 // b = 1.8556f*f_u + 0.0000f*f_v; 127 const auto red = vmulq_n_f32(vvec_val, red_coef_bt709); 128 const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709); 129 const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), 130 vmulq_n_f32(vvec_val, green_coef2_bt709)); 131 132 // Compute the final r,g,b values using y1 for the first texel and y2 for the second one. 133 // the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t 134 // and written back to memory using vst3 instruction 135 136 rgb1.val[0] = vaddq_f32(yvec_val, red); 137 rgb1.val[1] = vaddq_f32(yvec_val, green); 138 rgb1.val[2] = vaddq_f32(yvec_val, blue); 139 140 rgb2.val[0] = vaddq_f32(yyvec_val, red); 141 rgb2.val[1] = vaddq_f32(yyvec_val, green); 142 rgb2.val[2] = vaddq_f32(yyvec_val, blue); 143 144 uint8x8x3_t u8_rgb; 145 arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb); 146 147 if(!alpha) 148 { 149 vst3_lane_u8(&output_ptr[0], u8_rgb, 0); 150 vst3_lane_u8(&output_ptr[3], u8_rgb, 4); 151 vst3_lane_u8(&output_ptr[6], u8_rgb, 1); 152 vst3_lane_u8(&output_ptr[9], u8_rgb, 5); 153 vst3_lane_u8(&output_ptr[12], u8_rgb, 2); 154 vst3_lane_u8(&output_ptr[15], u8_rgb, 6); 155 vst3_lane_u8(&output_ptr[18], u8_rgb, 3); 156 vst3_lane_u8(&output_ptr[21], u8_rgb, 7); 157 } 158 else 159 { 160 uint8x8x4_t u8_rgba; 161 u8_rgba.val[0] = u8_rgb.val[0]; 162 u8_rgba.val[1] = u8_rgb.val[1]; 163 u8_rgba.val[2] = u8_rgb.val[2]; 164 u8_rgba.val[3] = vdup_n_u8(255); 165 vst4_lane_u8(&output_ptr[0], u8_rgba, 0); 166 vst4_lane_u8(&output_ptr[4], u8_rgba, 4); 167 vst4_lane_u8(&output_ptr[8], u8_rgba, 1); 168 vst4_lane_u8(&output_ptr[12], u8_rgba, 5); 169 vst4_lane_u8(&output_ptr[16], u8_rgba, 2); 170 vst4_lane_u8(&output_ptr[20], u8_rgba, 6); 171 vst4_lane_u8(&output_ptr[24], u8_rgba, 3); 172 vst4_lane_u8(&output_ptr[28], u8_rgba, 7); 173 } 174} 175 176inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha) 177{ 178 uint8x16x3_t rgb; 179 180 if(alpha) 181 { 182 const auto tmp = vld4q_u8(ptr); 183 rgb.val[0] = tmp.val[0]; 184 rgb.val[1] = tmp.val[1]; 185 rgb.val[2] = tmp.val[2]; 186 } 187 else 188 { 189 rgb = vld3q_u8(ptr); 190 } 191 192 return rgb; 193} 194 195inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_bottom) 196{ 197 // Convert the uint8x16_t to float32x4x4_t 198 const float32x4x4_t frvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[0]); 199 const float32x4x4_t fgvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[1]); 200 const float32x4x4_t fbvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[2]); 201 202 const float32x4x4_t frvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[0]); 203 const float32x4x4_t fgvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[1]); 204 const float32x4x4_t fbvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[2]); 205 206 float32x4x4_t fyvec_top, fuvec_top, fvvec_top; 207 float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom; 208 209 for(auto i = 0; i < 4; ++i) 210 { 211 rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], 212 fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]); 213 rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], 214 fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]); 215 } 216 217 arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]); 218 arm_compute::convert_float32x4x4_to_uint8x16(fuvec_top, vec_top.val[1]); 219 arm_compute::convert_float32x4x4_to_uint8x16(fvvec_top, vec_top.val[2]); 220 arm_compute::convert_float32x4x4_to_uint8x16(fyvec_bottom, vec_bottom.val[0]); 221 arm_compute::convert_float32x4x4_to_uint8x16(fuvec_bottom, vec_bottom.val[1]); 222 arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]); 223} 224 225inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, 226 const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom, 227 unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, 228 unsigned char *const __restrict out_uv) 229{ 230 uint8x16x3_t vec_top, vec_bottom; 231 vec_top.val[0] = rvec_top; 232 vec_top.val[1] = gvec_top; 233 vec_top.val[2] = bvec_top; 234 vec_bottom.val[0] = rvec_bottom; 235 vec_bottom.val[1] = gvec_bottom; 236 vec_bottom.val[2] = bvec_bottom; 237 238 rgb_to_yuv_conversion(vec_top, vec_bottom); 239 240 vst1q_u8(out_y_top, vec_top.val[0]); 241 vst1q_u8(out_y_bottom, vec_bottom.val[0]); 242 243 const auto uvec = vuzpq_u8(vec_top.val[1], vec_bottom.val[1]); 244 const auto vvec = vuzpq_u8(vec_top.val[2], vec_bottom.val[2]); 245 const auto utmp = vrhaddq_u8(uvec.val[0], uvec.val[1]); 246 const auto vtmp = vrhaddq_u8(vvec.val[0], vvec.val[1]); 247 248 uint8x8x2_t uvvec; 249 uvvec.val[0] = vhadd_u8(vget_low_u8(utmp), vget_high_u8(utmp)); 250 uvvec.val[1] = vhadd_u8(vget_low_u8(vtmp), vget_high_u8(vtmp)); 251 252 vst2_u8(out_uv, uvvec); 253} 254 255inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, 256 const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom, 257 unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, 258 unsigned char *const __restrict out_u, 259 unsigned char *const __restrict out_v) 260{ 261 uint8x16x3_t vec_top, vec_bottom; 262 vec_top.val[0] = rvec_top; 263 vec_top.val[1] = gvec_top; 264 vec_top.val[2] = bvec_top; 265 vec_bottom.val[0] = rvec_bottom; 266 vec_bottom.val[1] = gvec_bottom; 267 vec_bottom.val[2] = bvec_bottom; 268 269 rgb_to_yuv_conversion(vec_top, vec_bottom); 270 271 vst1q_u8(out_y_top, vec_top.val[0]); 272 vst1q_u8(out_y_bottom, vec_bottom.val[0]); 273 274 const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]); 275 const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]); 276 const auto uvvec = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), 277 vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1])); 278 279 vst1_u8(out_u, vget_low_u8(uvvec)); 280 vst1_u8(out_v, vget_high_u8(uvvec)); 281} 282 283inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec, 284 unsigned char *const __restrict out_y, 285 unsigned char *const __restrict out_u, 286 unsigned char *const __restrict out_v) 287{ 288 // Convert the uint8x16_t to float32x4x4_t 289 const float32x4x4_t frvec = arm_compute::convert_uint8x16_to_float32x4x4(rvec); 290 const float32x4x4_t fgvec = arm_compute::convert_uint8x16_to_float32x4x4(gvec); 291 const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec); 292 293 float32x4x4_t fyvec, fuvec, fvvec; 294 for(auto i = 0; i < 4; ++i) 295 { 296 rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], 297 fyvec.val[i], fuvec.val[i], fvvec.val[i]); 298 } 299 300 uint8x16_t yvec, uvec, vvec; 301 arm_compute::convert_float32x4x4_to_uint8x16(fyvec, yvec); 302 arm_compute::convert_float32x4x4_to_uint8x16(fuvec, uvec); 303 arm_compute::convert_float32x4x4_to_uint8x16(fvvec, vvec); 304 305 vst1q_u8(out_y, yvec); 306 vst1q_u8(out_u, uvec); 307 vst1q_u8(out_v, vvec); 308} 309#endif /* DOXYGEN_SKIP_THIS */ 310} 311 312namespace arm_compute 313{ 314/** Convert RGB to RGBX. 315 * 316 * @param[in] input Input RGB data buffer. 317 * @param[out] output Output RGBX buffer. 318 * @param[in] win Window for iterating the buffers. 319 * 320 */ 321void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict output, const Window &win) 322{ 323 ARM_COMPUTE_ERROR_ON(nullptr == input); 324 ARM_COMPUTE_ERROR_ON(nullptr == output); 325 326 const auto input_ptr = static_cast<const IImage *__restrict>(input); 327 const auto output_ptr = static_cast<IImage *__restrict>(output); 328 329 Iterator in(input_ptr, win); 330 Iterator out(output_ptr, win); 331 332 execute_window_loop(win, [&](const Coordinates &) 333 { 334 const auto ta1 = vld3q_u8(in.ptr()); 335 uint8x16x4_t ta2; 336 ta2.val[0] = ta1.val[0]; 337 ta2.val[1] = ta1.val[1]; 338 ta2.val[2] = ta1.val[2]; 339 ta2.val[3] = vdupq_n_u8(255); 340 vst4q_u8(out.ptr(), ta2); 341 }, 342 in, out); 343} 344 345/** Convert RGB to U8. 346 * 347 * @param[in] input Input RGB data buffer. 348 * @param[out] output Output U8 buffer. 349 * @param[in] win Window for iterating the buffers. 350 * 351 */ 352void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict output, const Window &win) 353{ 354 ARM_COMPUTE_ERROR_ON(nullptr == input); 355 ARM_COMPUTE_ERROR_ON(nullptr == output); 356 357 const auto input_ptr = static_cast<const IImage *__restrict>(input); 358 const auto output_ptr = static_cast<IImage *__restrict>(output); 359 360 Iterator in(input_ptr, win); 361 Iterator out(output_ptr, win); 362 363 execute_window_loop(win, [&](const Coordinates &) 364 { 365 const auto ta1 = vld3q_u8(in.ptr()); 366 uint8x16_t ta2; 367 rgb_to_u8_conversion(ta1, ta2); 368 vst1q_u8(out.ptr(), ta2); 369 }, 370 in, out); 371} 372 373/** Convert RGBX to RGB. 374 * 375 * @param[in] input Input RGBX data buffer. 376 * @param[out] output Output RGB buffer. 377 * @param[in] win Window for iterating the buffers. 378 * 379 */ 380void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win) 381{ 382 ARM_COMPUTE_ERROR_ON(nullptr == input); 383 ARM_COMPUTE_ERROR_ON(nullptr == output); 384 385 const auto input_ptr = static_cast<const IImage *__restrict>(input); 386 const auto output_ptr = static_cast<IImage *__restrict>(output); 387 388 Iterator in(input_ptr, win); 389 Iterator out(output_ptr, win); 390 391 execute_window_loop(win, [&](const Coordinates &) 392 { 393 const auto ta1 = vld4q_u8(in.ptr()); 394 uint8x16x3_t ta2; 395 ta2.val[0] = ta1.val[0]; 396 ta2.val[1] = ta1.val[1]; 397 ta2.val[2] = ta1.val[2]; 398 vst3q_u8(out.ptr(), ta2); 399 }, 400 in, out); 401} 402 403/** Convert YUYV to RGB. 404 * 405 * @param[in] input Input YUYV data buffer. 406 * @param[out] output Output RGB buffer. 407 * @param[in] win Window for iterating the buffers. 408 * 409 */ 410template <bool yuyv, bool alpha> 411void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win) 412{ 413 ARM_COMPUTE_ERROR_ON(nullptr == input); 414 ARM_COMPUTE_ERROR_ON(nullptr == output); 415 416 const auto input_ptr = static_cast<const IImage *__restrict>(input); 417 const auto output_ptr = static_cast<IImage *__restrict>(output); 418 419 constexpr auto element_size = alpha ? 32 : 24; 420 constexpr auto shift = yuyv ? 0 : 1; 421 422 Iterator in(input_ptr, win); 423 Iterator out(output_ptr, win); 424 425 execute_window_loop(win, [&](const Coordinates &) 426 { 427 const auto ta = vld4q_u8(in.ptr()); 428 //ta.val[0] = Y0 Y2 Y4 Y6 ... 429 //ta.val[1] = U0 U2 U4 U6 ... 430 //ta.val[2] = Y1 Y3 Y5 Y7 ... 431 //ta.val[3] = V0 V2 V4 V7 ... 432 433 // Convert the uint8x16x4_t to float32x4x4_t 434 const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]); 435 const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]); 436 const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]); 437 const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]); 438 439 yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); 440 yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); 441 yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); 442 yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); 443 }, 444 in, out); 445} 446 447/** Convert NV12 to RGB. 448 * 449 * @param[in] input Input NV12 data buffer. 450 * @param[out] output Output RGB buffer. 451 * @param[in] win Window for iterating the buffers. 452 * 453 */ 454template <bool uv, bool alpha> 455void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict output, const Window &win) 456{ 457 ARM_COMPUTE_ERROR_ON(nullptr == input); 458 ARM_COMPUTE_ERROR_ON(nullptr == output); 459 win.validate(); 460 461 const auto input_ptr = static_cast<const IMultiImage *__restrict>(input); 462 const auto output_ptr = static_cast<IImage *__restrict>(output); 463 464 constexpr auto element_size = alpha ? 32 : 24; 465 const auto out_stride = output_ptr->info()->strides_in_bytes().y(); 466 constexpr auto shift = uv ? 0 : 1; 467 468 // UV's width and height are subsampled 469 Window win_uv(win); 470 win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win.x().step() / 2)); 471 win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); 472 win_uv.validate(); 473 474 Iterator in_y(input_ptr->plane(0), win); 475 Iterator in_uv(input_ptr->plane(1), win_uv); 476 Iterator out(output_ptr, win); 477 478 execute_window_loop(win, [&](const Coordinates &) 479 { 480 const auto ta_y_top = vld2q_u8(in_y.ptr()); 481 const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); 482 const auto ta_uv = vld2q_u8(in_uv.ptr()); 483 //ta_y.val[0] = Y0 Y2 Y4 Y6 ... 484 //ta_y.val[1] = Y1 Y3 Y5 Y7 ... 485 //ta_uv.val[0] = U0 U2 U4 U6 ... 486 //ta_uv.val[1] = V0 V2 V4 V6 ... 487 488 // Convert the uint8x16x4_t to float32x4x4_t 489 float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); 490 float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); 491 float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); 492 float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); 493 float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]); 494 float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]); 495 496 yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); 497 yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); 498 yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); 499 yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); 500 501 yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha); 502 yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha); 503 yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha); 504 yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha); 505 }, 506 in_y, in_uv, out); 507} 508 509/** Convert IYUV to RGB. 510 * 511 * @param[in] input Input IYUV data buffer. 512 * @param[out] output Output RGB buffer. 513 * @param[in] win Window for iterating the buffers. 514 * 515 */ 516template <bool alpha> 517void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win) 518{ 519 ARM_COMPUTE_ERROR_ON(nullptr == input); 520 ARM_COMPUTE_ERROR_ON(nullptr == output); 521 win.validate(); 522 523 const auto input_ptr = static_cast<const IMultiImage *__restrict>(input); 524 const auto output_ptr = static_cast<IImage *__restrict>(output); 525 526 constexpr auto element_size = alpha ? 32 : 24; 527 const auto out_stride = output_ptr->info()->strides_in_bytes().y(); 528 529 // UV's width and height are subsampled 530 Window win_uv(win); 531 win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); 532 win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); 533 win_uv.validate(); 534 535 Iterator in_y(input_ptr->plane(0), win); 536 Iterator in_u(input_ptr->plane(1), win_uv); 537 Iterator in_v(input_ptr->plane(2), win_uv); 538 Iterator out(output_ptr, win); 539 540 execute_window_loop(win, [&](const Coordinates &) 541 { 542 const auto *y_top_ptr = in_y.ptr(); 543 const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y(); 544 const auto *u_ptr = in_u.ptr(); 545 const auto *v_ptr = in_v.ptr(); 546 547 // Work-around issue in gcc 9(>=) where vld2q might cause issues with register allocation 548#if defined(__arch64__) 549 const auto ta0_y_top = vld1q_u8(y_top_ptr); 550 const auto ta1_y_top = vld1q_u8(y_top_ptr + 16); 551 const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr); 552 const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16); 553 const auto ta_u = vld1q_u8(u_ptr); 554 const auto ta_v = vld1q_u8(v_ptr); 555 556 // Convert the uint8x16x4_t to float32x4x4_t 557 float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top)); 558 float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top)); 559 float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom)); 560 float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom)); 561 float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); 562 float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); 563#else /* defined(__arch64__) */ 564 const auto ta_y_top = vld2q_u8(y_top_ptr); 565 const auto ta_y_bottom = vld2q_u8(y_bottom_ptr); 566 const auto ta_u = vld1q_u8(u_ptr); 567 const auto ta_v = vld1q_u8(v_ptr); 568 //ta_y.val[0] = Y0 Y2 Y4 Y6 ... 569 //ta_y.val[1] = Y1 Y3 Y5 Y7 ... 570 //ta_u.val[0] = U0 U2 U4 U6 ... 571 //ta_v.val[0] = V0 V2 V4 V6 ... 572 573 // Convert the uint8x16x4_t to float32x4x4_t 574 float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); 575 float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); 576 float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); 577 float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); 578 float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); 579 float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); 580#endif /* defined(__arch64__) */ 581 582 yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); 583 yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); 584 yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); 585 yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); 586 587 yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha); 588 yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha); 589 yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha); 590 yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha); 591 }, 592 in_y, in_u, in_v, out); 593} 594 595/** Convert YUYV to NV12. 596 * 597 * @param[in] input Input YUYV data buffer. 598 * @param[out] output Output NV12 buffer. 599 * @param[in] win Window for iterating the buffers. 600 * 601 */ 602template <bool yuyv> 603void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win) 604{ 605 ARM_COMPUTE_ERROR_ON(nullptr == input); 606 ARM_COMPUTE_ERROR_ON(nullptr == output); 607 win.validate(); 608 609 const auto input_ptr = static_cast<const IImage *__restrict>(input); 610 const auto output_ptr = static_cast<IMultiImage *__restrict>(output); 611 612 constexpr auto shift = yuyv ? 0 : 1; 613 614 // NV12's UV's width and height are subsampled 615 Window win_uv(win); 616 win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); 617 win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); 618 win_uv.validate(); 619 620 Iterator in(input_ptr, win); 621 Iterator out_y(output_ptr->plane(0), win); 622 Iterator out_uv(output_ptr->plane(1), win_uv); 623 624 execute_window_loop(win, [&](const Coordinates &) 625 { 626 const auto ta_top = vld4q_u8(in.ptr()); 627 const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); 628 //ta.val[0] = Y0 Y2 Y4 Y6 ... 629 //ta.val[1] = U0 U2 U4 U6 ... 630 //ta.val[2] = Y1 Y3 Y5 Y7 ... 631 //ta.val[3] = V0 V2 V4 V7 ... 632 633 uint8x16x2_t yvec; 634 yvec.val[0] = ta_top.val[0 + shift]; 635 yvec.val[1] = ta_top.val[2 + shift]; 636 vst2q_u8(out_y.ptr(), yvec); 637 638 uint8x16x2_t yyvec; 639 yyvec.val[0] = ta_bottom.val[0 + shift]; 640 yyvec.val[1] = ta_bottom.val[2 + shift]; 641 vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); 642 643 uint8x16x2_t uvvec; 644 uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); 645 uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); 646 vst2q_u8(out_uv.ptr(), uvvec); 647 }, 648 in, out_y, out_uv); 649} 650 651/** Convert IYUV to NV12. 652 * 653 * @param[in] input Input IYUV data buffer. 654 * @param[out] output Output NV12 buffer. 655 * @param[in] win Window for iterating the buffers. 656 * 657 */ 658void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win) 659{ 660 ARM_COMPUTE_ERROR_ON(nullptr == input); 661 ARM_COMPUTE_ERROR_ON(nullptr == output); 662 win.validate(); 663 664 const auto input_ptr = static_cast<const IMultiImage *__restrict>(input); 665 const auto output_ptr = static_cast<IMultiImage *__restrict>(output); 666 667 // UV's width and height are subsampled 668 Window win_uv(win); 669 win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); 670 win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); 671 win_uv.validate(); 672 673 Iterator in_y(input_ptr->plane(0), win); 674 Iterator in_u(input_ptr->plane(1), win_uv); 675 Iterator in_v(input_ptr->plane(2), win_uv); 676 Iterator out_y(output_ptr->plane(0), win); 677 Iterator out_uv(output_ptr->plane(1), win_uv); 678 679 execute_window_loop(win, [&](const Coordinates &) 680 { 681 const auto ta_y_top = vld2q_u8(in_y.ptr()); 682 const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); 683 uint8x16x2_t ta_uv; 684 ta_uv.val[0] = vld1q_u8(in_u.ptr()); 685 ta_uv.val[1] = vld1q_u8(in_v.ptr()); 686 //ta_y.val[0] = Y0 Y2 Y4 Y6 ... 687 //ta_y.val[1] = Y1 Y3 Y5 Y7 ... 688 //ta_uv.val[0] = U0 U2 U4 U6 ... 689 //ta_uv.val[1] = V0 V2 V4 V6 ... 690 691 vst2q_u8(out_y.ptr(), ta_y_top); 692 vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); 693 vst2q_u8(out_uv.ptr(), ta_uv); 694 }, 695 in_y, in_u, in_v, out_y, out_uv); 696} 697 698/** Convert NV12 to IYUV. 699 * 700 * @param[in] input Input NV12 data buffer. 701 * @param[out] output Output IYUV buffer. 702 * @param[in] win Window for iterating the buffers. 703 * 704 */ 705template <bool uv> 706void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win) 707{ 708 ARM_COMPUTE_ERROR_ON(nullptr == input); 709 ARM_COMPUTE_ERROR_ON(nullptr == output); 710 win.validate(); 711 712 const auto input_ptr = static_cast<const IMultiImage *__restrict>(input); 713 const auto output_ptr = static_cast<IMultiImage *__restrict>(output); 714 715 constexpr auto shift = uv ? 0 : 1; 716 717 // UV's width and height are subsampled 718 Window win_uv(win); 719 win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); 720 win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); 721 win_uv.validate(); 722 723 Iterator in_y(input_ptr->plane(0), win); 724 Iterator in_uv(input_ptr->plane(1), win_uv); 725 Iterator out_y(output_ptr->plane(0), win); 726 Iterator out_u(output_ptr->plane(1), win_uv); 727 Iterator out_v(output_ptr->plane(2), win_uv); 728 729 execute_window_loop(win, [&](const Coordinates &) 730 { 731 const auto ta_y_top = vld2q_u8(in_y.ptr()); 732 const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); 733 const auto ta_uv = vld2q_u8(in_uv.ptr()); 734 //ta_y.val[0] = Y0 Y2 Y4 Y6 ... 735 //ta_y.val[1] = Y1 Y3 Y5 Y7 ... 736 //ta_uv.val[0] = U0 U2 U4 U6 ... 737 //ta_uv.val[1] = V0 V2 V4 V6 ... 738 739 vst2q_u8(out_y.ptr(), ta_y_top); 740 vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); 741 vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]); 742 vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]); 743 }, 744 in_y, in_uv, out_y, out_u, out_v); 745} 746 747/** Convert YUYV to IYUV. 748 * 749 * @param[in] input Input YUYV data buffer. 750 * @param[out] output Output IYUV buffer. 751 * @param[in] win Window for iterating the buffers. 752 * 753 */ 754template <bool yuyv> 755void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win) 756{ 757 ARM_COMPUTE_ERROR_ON(nullptr == input); 758 ARM_COMPUTE_ERROR_ON(nullptr == output); 759 win.validate(); 760 761 const auto input_ptr = static_cast<const IImage *__restrict>(input); 762 const auto output_ptr = static_cast<IMultiImage *__restrict>(output); 763 764 constexpr auto shift = yuyv ? 0 : 1; 765 766 // Destination's UV's width and height are subsampled 767 Window win_uv(win); 768 win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); 769 win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); 770 win_uv.validate(); 771 772 Iterator in(input_ptr, win); 773 Iterator out_y(output_ptr->plane(0), win); 774 Iterator out_u(output_ptr->plane(1), win_uv); 775 Iterator out_v(output_ptr->plane(2), win_uv); 776 777 execute_window_loop(win, [&](const Coordinates &) 778 { 779 const auto ta_top = vld4q_u8(in.ptr()); 780 const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); 781 //ta.val[0] = Y0 Y2 Y4 Y6 ... 782 //ta.val[1] = U0 U2 U4 U6 ... 783 //ta.val[2] = Y1 Y3 Y5 Y7 ... 784 //ta.val[3] = V0 V2 V4 V7 ... 785 786 uint8x16x2_t yvec; 787 yvec.val[0] = ta_top.val[0 + shift]; 788 yvec.val[1] = ta_top.val[2 + shift]; 789 vst2q_u8(out_y.ptr(), yvec); 790 791 uint8x16x2_t yyvec; 792 yyvec.val[0] = ta_bottom.val[0 + shift]; 793 yyvec.val[1] = ta_bottom.val[2 + shift]; 794 vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); 795 796 uint8x16_t uvec; 797 uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); 798 vst1q_u8(out_u.ptr(), uvec); 799 800 uint8x16_t vvec; 801 vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); 802 vst1q_u8(out_v.ptr(), vvec); 803 }, 804 in, out_y, out_u, out_v); 805} 806 807/** Convert NV12 to YUV4. 808 * 809 * @param[in] input Input NV12 data buffer. 810 * @param[out] output Output YUV4 buffer. 811 * @param[in] win Window for iterating the buffers. 812 * 813 */ 814template <bool uv> 815void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win) 816{ 817 ARM_COMPUTE_ERROR_ON(nullptr == input); 818 ARM_COMPUTE_ERROR_ON(nullptr == output); 819 win.validate(); 820 821 const auto input_ptr = static_cast<const IMultiImage *__restrict>(input); 822 const auto output_ptr = static_cast<IMultiImage *__restrict>(output); 823 824 constexpr auto shift = uv ? 0 : 1; 825 826 // UV's width and height are subsampled 827 Window win_uv(win); 828 win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); 829 win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); 830 win_uv.validate(); 831 832 Iterator in_y(input_ptr->plane(0), win); 833 Iterator in_uv(input_ptr->plane(1), win_uv); 834 Iterator out_y(output_ptr->plane(0), win); 835 Iterator out_u(output_ptr->plane(1), win); 836 Iterator out_v(output_ptr->plane(2), win); 837 838 execute_window_loop(win, [&](const Coordinates &) 839 { 840 const auto ta_y_top = vld2q_u8(in_y.ptr()); 841 const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); 842 const auto ta_uv = vld2q_u8(in_uv.ptr()); 843 //ta_y.val[0] = Y0 Y2 Y4 Y6 ... 844 //ta_y.val[1] = Y1 Y3 Y5 Y7 ... 845 //ta_uv.val[0] = U0 U2 U4 U6 ... 846 //ta_uv.val[1] = V0 V2 V4 V6 ... 847 848 vst2q_u8(out_y.ptr(), ta_y_top); 849 vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); 850 851 uint8x16x2_t uvec; 852 uvec.val[0] = ta_uv.val[0 + shift]; 853 uvec.val[1] = ta_uv.val[0 + shift]; 854 vst2q_u8(out_u.ptr(), uvec); 855 vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); 856 857 uint8x16x2_t vvec; 858 vvec.val[0] = ta_uv.val[1 - shift]; 859 vvec.val[1] = ta_uv.val[1 - shift]; 860 vst2q_u8(out_v.ptr(), vvec); 861 vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); 862 }, 863 in_y, in_uv, out_y, out_u, out_v); 864} 865 866/** Convert IYUV to YUV4. 867 * 868 * @param[in] input Input IYUV data buffer. 869 * @param[out] output Output YUV4 buffer. 870 * @param[in] win Window for iterating the buffers. 871 * 872 */ 873void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win) 874{ 875 ARM_COMPUTE_ERROR_ON(nullptr == input); 876 ARM_COMPUTE_ERROR_ON(nullptr == output); 877 win.validate(); 878 879 const auto input_ptr = static_cast<const IMultiImage *__restrict>(input); 880 const auto output_ptr = static_cast<IMultiImage *__restrict>(output); 881 882 // UV's width and height are subsampled 883 Window win_uv(win); 884 win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); 885 win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); 886 win_uv.validate(); 887 888 Iterator in_y(input_ptr->plane(0), win); 889 Iterator in_u(input_ptr->plane(1), win_uv); 890 Iterator in_v(input_ptr->plane(2), win_uv); 891 Iterator out_y(output_ptr->plane(0), win); 892 Iterator out_u(output_ptr->plane(1), win); 893 Iterator out_v(output_ptr->plane(2), win); 894 895 execute_window_loop(win, [&](const Coordinates &) 896 { 897 const auto ta_y_top = vld2q_u8(in_y.ptr()); 898 const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); 899 const auto ta_u = vld1q_u8(in_u.ptr()); 900 const auto ta_v = vld1q_u8(in_v.ptr()); 901 //ta_y.val[0] = Y0 Y2 Y4 Y6 ... 902 //ta_y.val[1] = Y1 Y3 Y5 Y7 ... 903 //ta_u = U0 U2 U4 U6 ... 904 //ta_v = V0 V2 V4 V6 ... 905 906 vst2q_u8(out_y.ptr(), ta_y_top); 907 vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); 908 909 uint8x16x2_t uvec; 910 uvec.val[0] = ta_u; 911 uvec.val[1] = ta_u; 912 vst2q_u8(out_u.ptr(), uvec); 913 vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); 914 915 uint8x16x2_t vvec; 916 vvec.val[0] = ta_v; 917 vvec.val[1] = ta_v; 918 vst2q_u8(out_v.ptr(), vvec); 919 vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); 920 }, 921 in_y, in_u, in_v, out_y, out_u, out_v); 922} 923 924/** Convert RGB to NV12. 925 * 926 * @param[in] input Input RGB data buffer. 927 * @param[out] output Output NV12 buffer. 928 * @param[in] win Window for iterating the buffers. 929 * 930 */ 931template <bool alpha> 932void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict output, const Window &win) 933{ 934 ARM_COMPUTE_ERROR_ON(nullptr == input); 935 ARM_COMPUTE_ERROR_ON(nullptr == output); 936 win.validate(); 937 938 const auto input_ptr = static_cast<const IImage *__restrict>(input); 939 const auto output_ptr = static_cast<IMultiImage *__restrict>(output); 940 941 // UV's width and height are subsampled 942 Window win_uv(win); 943 win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); 944 win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); 945 win_uv.validate(); 946 947 Iterator in(input_ptr, win); 948 Iterator out_y(output_ptr->plane(0), win); 949 Iterator out_uv(output_ptr->plane(1), win_uv); 950 951 execute_window_loop(win, [&](const Coordinates &) 952 { 953 const auto ta_rgb_top = load_rgb(in.ptr(), alpha); 954 const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); 955 //ta_rgb.val[0] = R0 R1 R2 R3 ... 956 //ta_rgb.val[1] = G0 G1 G2 G3 ... 957 //ta_rgb.val[2] = B0 B1 B2 B3 ... 958 959 store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], 960 ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], 961 out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), 962 out_uv.ptr()); 963 }, 964 in, out_y, out_uv); 965} 966 967/** Convert RGB to IYUV. 968 * 969 * @param[in] input Input RGB data buffer. 970 * @param[out] output Output IYUV buffer. 971 * @param[in] win Window for iterating the buffers. 972 * 973 */ 974template <bool alpha> 975void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win) 976{ 977 ARM_COMPUTE_ERROR_ON(nullptr == input); 978 ARM_COMPUTE_ERROR_ON(nullptr == output); 979 win.validate(); 980 981 const auto input_ptr = static_cast<const IImage *__restrict>(input); 982 const auto output_ptr = static_cast<IMultiImage *__restrict>(output); 983 984 // UV's width and height are subsampled 985 Window win_uv(win); 986 win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); 987 win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); 988 win_uv.validate(); 989 990 Iterator in(input_ptr, win); 991 Iterator out_y(output_ptr->plane(0), win); 992 Iterator out_u(output_ptr->plane(1), win_uv); 993 Iterator out_v(output_ptr->plane(2), win_uv); 994 995 execute_window_loop(win, [&](const Coordinates &) 996 { 997 const auto ta_rgb_top = load_rgb(in.ptr(), alpha); 998 const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); 999 //ta_rgb.val[0] = R0 R1 R2 R3 ... 1000 //ta_rgb.val[1] = G0 G1 G2 G3 ... 1001 //ta_rgb.val[2] = B0 B1 B2 B3 ... 1002 1003 store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], 1004 ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], 1005 out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), 1006 out_u.ptr(), out_v.ptr()); 1007 }, 1008 in, out_y, out_u, out_v); 1009} 1010 1011/** Convert RGB to YUV4. 1012 * 1013 * @param[in] input Input RGB data buffer. 1014 * @param[out] output Output YUV4 buffer. 1015 * @param[in] win Window for iterating the buffers. 1016 * 1017 */ 1018template <bool alpha> 1019void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win) 1020{ 1021 ARM_COMPUTE_ERROR_ON(nullptr == input); 1022 ARM_COMPUTE_ERROR_ON(nullptr == output); 1023 win.validate(); 1024 1025 const auto input_ptr = static_cast<const IImage *__restrict>(input); 1026 const auto output_ptr = static_cast<IMultiImage *__restrict>(output); 1027 1028 Iterator in(input_ptr, win); 1029 Iterator out_y(output_ptr->plane(0), win); 1030 Iterator out_u(output_ptr->plane(1), win); 1031 Iterator out_v(output_ptr->plane(2), win); 1032 1033 execute_window_loop(win, [&](const Coordinates &) 1034 { 1035 const auto ta_rgb = load_rgb(in.ptr(), alpha); 1036 //ta_rgb.val[0] = R0 R1 R2 R3 ... 1037 //ta_rgb.val[1] = G0 G1 G2 G3 ... 1038 //ta_rgb.val[2] = B0 B1 B2 B3 ... 1039 1040 store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], 1041 out_y.ptr(), out_u.ptr(), out_v.ptr()); 1042 }, 1043 in, out_y, out_u, out_v); 1044} 1045} // namespace arm_compute 1046