xref: /aosp_15_r20/external/libultrahdr/lib/src/dsp/arm/gainmapmath_neon.cpp (revision 89a0ef05262152531a00a15832a2d3b1e3990773)
1 /*
2  * Copyright 2024 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "ultrahdr/gainmapmath.h"
18 
19 #include <arm_neon.h>
20 #include <cassert>
21 
22 #ifdef _MSC_VER
23 #define ALIGNED(x) __declspec(align(x))
24 #else
25 #define ALIGNED(x) __attribute__((aligned(x)))
26 #endif
27 
28 namespace ultrahdr {
29 
// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an off
// by one error compared to the scalar floating-point implementation.

// Removing conversion coefficients 1 and 0 from the group for each standard leaves 6 coefficients.
// Pack them into a single 128-bit vector as follows, zeroing the remaining elements:
// {Y1, Y2, U1, U2, V1, V2, 0, 0}
// Each packed entry below is the corresponding float coefficient rounded after scaling by 2^14
// (e.g. 0.101579 * 16384 ~= 1664).

// Yuv Bt709 -> Yuv Bt601
// Y' = (1.0f * Y) + ( 0.101579f * U) + ( 0.196076f * V)
// U' = (0.0f * Y) + ( 0.989854f * U) + (-0.110653f * V)
// V' = (0.0f * Y) + (-0.072453f * U) + ( 0.983398f * V)
ALIGNED(16)
const int16_t kYuv709To601_coeffs_neon[8] = {1664, 3213, 16218, -1813, -1187, 16112, 0, 0};

// Yuv Bt709 -> Yuv Bt2100
// Y' = (1.0f * Y) + (-0.016969f * U) + ( 0.096312f * V)
// U' = (0.0f * Y) + ( 0.995306f * U) + (-0.051192f * V)
// V' = (0.0f * Y) + ( 0.011507f * U) + ( 1.002637f * V)
ALIGNED(16)
const int16_t kYuv709To2100_coeffs_neon[8] = {-278, 1578, 16307, -839, 189, 16427, 0, 0};

// Yuv Bt601 -> Yuv Bt709
// Y' = (1.0f * Y) + (-0.118188f * U) + (-0.212685f * V),
// U' = (0.0f * Y) + ( 1.018640f * U) + ( 0.114618f * V),
// V' = (0.0f * Y) + ( 0.075049f * U) + ( 1.025327f * V);
ALIGNED(16)
const int16_t kYuv601To709_coeffs_neon[8] = {-1936, -3485, 16689, 1878, 1230, 16799, 0, 0};

// Yuv Bt601 -> Yuv Bt2100
// Y' = (1.0f * Y) + (-0.128245f * U) + (-0.115879f * V)
// U' = (0.0f * Y) + ( 1.010016f * U) + ( 0.061592f * V)
// V' = (0.0f * Y) + ( 0.086969f * U) + ( 1.029350f * V)
ALIGNED(16)
const int16_t kYuv601To2100_coeffs_neon[8] = {-2101, -1899, 16548, 1009, 1425, 16865, 0, 0};

// Yuv Bt2100 -> Yuv Bt709
// Y' = (1.0f * Y) + ( 0.018149f * U) + (-0.095132f * V)
// U' = (0.0f * Y) + ( 1.004123f * U) + ( 0.051267f * V)
// V' = (0.0f * Y) + (-0.011524f * U) + ( 0.996782f * V)
ALIGNED(16)
const int16_t kYuv2100To709_coeffs_neon[8] = {297, -1559, 16452, 840, -189, 16331, 0, 0};

// Yuv Bt2100 -> Yuv Bt601
// Y' = (1.0f * Y) + ( 0.117887f * U) + ( 0.105521f * V)
// U' = (0.0f * Y) + ( 0.995211f * U) + (-0.059549f * V)
// V' = (0.0f * Y) + (-0.084085f * U) + ( 0.976518f * V)
ALIGNED(16)
const int16_t kYuv2100To601_coeffs_neon[8] = {1931, 1729, 16306, -976, -1378, 15999, 0, 0};
78 
// Computes eight luma samples: Y' = Y + descale(Y1 * U + Y2 * V), where Y1/Y2
// are the first two lanes of |coeffs| (scaled by 2^14) and U/V are already
// bias-adjusted signed chroma. The result is returned widened to 16 bits.
static inline int16x8_t yConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) {
  const int16x4_t c_lo = vget_low_s16(coeffs);
  const int32x4_t acc_lo =
      vmlal_lane_s16(vmull_lane_s16(vget_low_s16(u), c_lo, 0), vget_low_s16(v), c_lo, 1);
  const int32x4_t acc_hi =
      vmlal_lane_s16(vmull_lane_s16(vget_high_s16(u), c_lo, 0), vget_high_s16(v), c_lo, 1);

  // Rounding narrow by 14 undoes the fixed-point scaling of the coefficients.
  const int16x8_t chroma_term = vcombine_s16(vqrshrn_n_s32(acc_lo, 14), vqrshrn_n_s32(acc_hi, 14));
  // Widen the 8-bit luma and add the (possibly negative) chroma contribution.
  return vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(chroma_term), y));
}
90 
// Computes eight chroma-U samples: U' = descale(U1 * U + U2 * V), where U1/U2
// are lanes 2 and 3 of |coeffs| (scaled by 2^14).
static inline int16x8_t uConversion_neon(int16x8_t u, int16x8_t v, int16x8_t coeffs) {
  const int16x4_t c_lo = vget_low_s16(coeffs);
  const int32x4_t acc_lo =
      vmlal_lane_s16(vmull_lane_s16(vget_low_s16(u), c_lo, 2), vget_low_s16(v), c_lo, 3);
  const int32x4_t acc_hi =
      vmlal_lane_s16(vmull_lane_s16(vget_high_s16(u), c_lo, 2), vget_high_s16(v), c_lo, 3);

  // Rounding narrow by 14 undoes the fixed-point scaling of the coefficients.
  return vcombine_s16(vqrshrn_n_s32(acc_lo, 14), vqrshrn_n_s32(acc_hi, 14));
}
101 
// Computes eight chroma-V samples: V' = descale(V1 * U + V2 * V), where V1/V2
// are lanes 4 and 5 of |coeffs| (scaled by 2^14).
static inline int16x8_t vConversion_neon(int16x8_t u, int16x8_t v, int16x8_t coeffs) {
  const int16x4_t c_hi = vget_high_s16(coeffs);
  const int32x4_t acc_lo =
      vmlal_lane_s16(vmull_lane_s16(vget_low_s16(u), c_hi, 0), vget_low_s16(v), c_hi, 1);
  const int32x4_t acc_hi =
      vmlal_lane_s16(vmull_lane_s16(vget_high_s16(u), c_hi, 0), vget_high_s16(v), c_hi, 1);

  // Rounding narrow by 14 undoes the fixed-point scaling of the coefficients.
  return vcombine_s16(vqrshrn_n_s32(acc_lo, 14), vqrshrn_n_s32(acc_hi, 14));
}
112 
// Converts eight pixels across all three planes in one call; the per-plane
// kernels above do the actual fixed-point math.
int16x8x3_t yuvConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) {
  int16x8x3_t result;
  result.val[0] = yConversion_neon(y, u, v, coeffs);
  result.val[1] = uConversion_neon(u, v, coeffs);
  result.val[2] = vConversion_neon(u, v, coeffs);
  return result;
}
119 
// In-place gamut conversion of a YCbCr 4:2:0 image using the packed
// fixed-point coefficients in |coeffs_ptr|. Each chroma sample is shared by a
// 2x2 luma block, so two luma rows are processed per chroma row.
void transformYuv420_neon(uhdr_raw_image_t* image, const int16_t* coeffs_ptr) {
  // Implementation assumes image buffer is multiple of 16.
  assert(image->w % 16 == 0);
  uint8_t* y_top_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_Y]);
  uint8_t* y_bot_ptr = y_top_ptr + image->stride[UHDR_PLANE_Y];
  uint8_t* u_row_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_U]);
  uint8_t* v_row_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_V]);

  const int16x8_t coeffs = vld1q_s16(coeffs_ptr);
  const uint16x8_t uv_bias = vreinterpretq_u16_s16(vdupq_n_s16(-128));
  const int16x8_t uv_offset = vdupq_n_s16(128);
  size_t row = 0;
  do {
    size_t col = 0;
    do {
      uint8x16_t y_top = vld1q_u8(y_top_ptr + col * 2);
      uint8x16_t y_bot = vld1q_u8(y_bot_ptr + col * 2);
      uint8x8_t u8 = vld1_u8(u_row_ptr + col);
      uint8x8_t v8 = vld1_u8(v_row_ptr + col);

      // 128 bias for UV given we are using libjpeg; see:
      // https://github.com/kornelski/libjpeg/blob/master/structure.doc
      int16x8_t u_wide = vreinterpretq_s16_u16(vaddw_u8(uv_bias, u8));  // -128 + u
      int16x8_t v_wide = vreinterpretq_s16_u16(vaddw_u8(uv_bias, v8));  // -128 + v

      // Duplicate each chroma sample so it lines up with both luma columns.
      const int16x8x2_t u_dup = vzipq_s16(u_wide, u_wide);
      const int16x8x2_t v_dup = vzipq_s16(v_wide, v_wide);

      const int16x8_t y_top_lo =
          yConversion_neon(vget_low_u8(y_top), u_dup.val[0], v_dup.val[0], coeffs);
      const int16x8_t y_top_hi =
          yConversion_neon(vget_high_u8(y_top), u_dup.val[1], v_dup.val[1], coeffs);
      const int16x8_t y_bot_lo =
          yConversion_neon(vget_low_u8(y_bot), u_dup.val[0], v_dup.val[0], coeffs);
      const int16x8_t y_bot_hi =
          yConversion_neon(vget_high_u8(y_bot), u_dup.val[1], v_dup.val[1], coeffs);

      const int16x8_t u_new = uConversion_neon(u_wide, v_wide, coeffs);
      const int16x8_t v_new = vConversion_neon(u_wide, v_wide, coeffs);

      // Narrow from 16-bit to 8-bit with saturation; re-apply the +128 chroma
      // bias before storing.
      vst1q_u8(y_top_ptr + col * 2, vcombine_u8(vqmovun_s16(y_top_lo), vqmovun_s16(y_top_hi)));
      vst1q_u8(y_bot_ptr + col * 2, vcombine_u8(vqmovun_s16(y_bot_lo), vqmovun_s16(y_bot_hi)));
      vst1_u8(u_row_ptr + col, vqmovun_s16(vaddq_s16(u_new, uv_offset)));
      vst1_u8(v_row_ptr + col, vqmovun_s16(vaddq_s16(v_new, uv_offset)));

      col += 8;
    } while (col < image->w / 2);
    y_top_ptr += image->stride[UHDR_PLANE_Y] * 2;
    y_bot_ptr += image->stride[UHDR_PLANE_Y] * 2;
    u_row_ptr += image->stride[UHDR_PLANE_U];
    v_row_ptr += image->stride[UHDR_PLANE_V];
  } while (++row < image->h / 2);
}
176 
// In-place gamut conversion of a YCbCr 4:4:4 image using the packed
// fixed-point coefficients in |coeffs_ptr|. Sixteen pixels are processed per
// iteration, one plane row at a time.
void transformYuv444_neon(uhdr_raw_image_t* image, const int16_t* coeffs_ptr) {
  // Implementation assumes image buffer is multiple of 16.
  assert(image->w % 16 == 0);
  uint8_t* y_row_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_Y]);
  uint8_t* u_row_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_U]);
  uint8_t* v_row_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_V]);

  const int16x8_t coeffs = vld1q_s16(coeffs_ptr);
  const uint16x8_t uv_bias = vreinterpretq_u16_s16(vdupq_n_s16(-128));
  const int16x8_t uv_offset = vdupq_n_s16(128);
  size_t row = 0;
  do {
    size_t col = 0;
    do {
      uint8x16_t y = vld1q_u8(y_row_ptr + col);
      uint8x16_t u = vld1q_u8(u_row_ptr + col);
      uint8x16_t v = vld1q_u8(v_row_ptr + col);

      // 128 bias for UV given we are using libjpeg; see:
      // https://github.com/kornelski/libjpeg/blob/master/structure.doc
      int16x8_t u_lo = vreinterpretq_s16_u16(vaddw_u8(uv_bias, vget_low_u8(u)));   // -128 + u
      int16x8_t v_lo = vreinterpretq_s16_u16(vaddw_u8(uv_bias, vget_low_u8(v)));   // -128 + v
      int16x8_t u_hi = vreinterpretq_s16_u16(vaddw_u8(uv_bias, vget_high_u8(u)));  // -128 + u
      int16x8_t v_hi = vreinterpretq_s16_u16(vaddw_u8(uv_bias, vget_high_u8(v)));  // -128 + v

      const int16x8_t y_out_lo = yConversion_neon(vget_low_u8(y), u_lo, v_lo, coeffs);
      const int16x8_t y_out_hi = yConversion_neon(vget_high_u8(y), u_hi, v_hi, coeffs);
      const int16x8_t u_out_lo = uConversion_neon(u_lo, v_lo, coeffs);
      const int16x8_t u_out_hi = uConversion_neon(u_hi, v_hi, coeffs);
      const int16x8_t v_out_lo = vConversion_neon(u_lo, v_lo, coeffs);
      const int16x8_t v_out_hi = vConversion_neon(u_hi, v_hi, coeffs);

      // Narrow from 16-bit to 8-bit with saturation; re-apply the +128 chroma
      // bias before storing.
      vst1q_u8(y_row_ptr + col, vcombine_u8(vqmovun_s16(y_out_lo), vqmovun_s16(y_out_hi)));
      vst1q_u8(u_row_ptr + col, vcombine_u8(vqmovun_s16(vaddq_s16(u_out_lo, uv_offset)),
                                            vqmovun_s16(vaddq_s16(u_out_hi, uv_offset))));
      vst1q_u8(v_row_ptr + col, vcombine_u8(vqmovun_s16(vaddq_s16(v_out_lo, uv_offset)),
                                            vqmovun_s16(vaddq_s16(v_out_hi, uv_offset))));

      col += 16;
    } while (col < image->w);
    y_row_ptr += image->stride[UHDR_PLANE_Y];
    u_row_ptr += image->stride[UHDR_PLANE_U];
    v_row_ptr += image->stride[UHDR_PLANE_V];
  } while (++row < image->h);
}
235 
// Converts the YCbCr encoding of |image| in place from |src_encoding| to
// |dst_encoding| using the fixed-point NEON kernels above. Returns g_no_error
// on success (including the no-op case src == dst); otherwise an error
// describing the unrecognized gamut or unsupported pixel format.
uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t src_encoding,
                                  uhdr_color_gamut_t dst_encoding) {
  uhdr_error_info_t status = g_no_error;

  const bool src_known = src_encoding == UHDR_CG_BT_709 || src_encoding == UHDR_CG_DISPLAY_P3 ||
                         src_encoding == UHDR_CG_BT_2100;
  if (!src_known) {
    status.error_code = UHDR_CODEC_INVALID_PARAM;
    status.has_detail = 1;
    snprintf(status.detail, sizeof status.detail, "Unrecognized src color gamut %d",
             src_encoding);
    return status;
  }

  const bool dst_known = dst_encoding == UHDR_CG_BT_709 || dst_encoding == UHDR_CG_DISPLAY_P3 ||
                         dst_encoding == UHDR_CG_BT_2100;
  if (!dst_known) {
    status.error_code = UHDR_CODEC_INVALID_PARAM;
    status.has_detail = 1;
    snprintf(status.detail, sizeof status.detail, "Unrecognized dest color gamut %d",
             dst_encoding);
    return status;
  }

  // Converting a gamut to itself is a no-op.
  if (src_encoding == dst_encoding) return status;

  // Per the table names above, Display P3 content is carried with Bt601 YCbCr
  // coefficients in this library, hence the 601 tables for P3 endpoints.
  const int16_t* coeffs;
  if (src_encoding == UHDR_CG_BT_709) {
    coeffs = dst_encoding == UHDR_CG_DISPLAY_P3 ? kYuv709To601_coeffs_neon
                                                : kYuv709To2100_coeffs_neon;
  } else if (src_encoding == UHDR_CG_DISPLAY_P3) {
    coeffs = dst_encoding == UHDR_CG_BT_709 ? kYuv601To709_coeffs_neon
                                            : kYuv601To2100_coeffs_neon;
  } else {
    coeffs = dst_encoding == UHDR_CG_BT_709 ? kYuv2100To709_coeffs_neon
                                            : kYuv2100To601_coeffs_neon;
  }

  if (image->fmt == UHDR_IMG_FMT_12bppYCbCr420) {
    transformYuv420_neon(image, coeffs);
  } else if (image->fmt == UHDR_IMG_FMT_24bppYCbCr444) {
    transformYuv444_neon(image, coeffs);
  } else {
    status.error_code = UHDR_CODEC_UNSUPPORTED_FEATURE;
    status.has_detail = 1;
    snprintf(status.detail, sizeof status.detail,
             "No implementation available for performing gamut conversion for color format %d",
             image->fmt);
    return status;
  }

  return status;
}
319 
// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an off
// by one error compared to the scalar floating-point implementation.

// In the 3x3 conversion matrix, 0.5 is duplicated. But represented as only one entry in lut leaving
// with an array size of 8 elements.
// Lane layout (magnitudes only; signs are applied at the use site via vmlal/vmlsl):
// {Y_R, Y_G, Y_B, |U_R|, |U_G|, 0.5, |V_G|, |V_B|}, each round(coefficient * 2^14).

// RGB Bt709 -> Yuv Bt709
// Y = 0.212639 * R + 0.715169 * G + 0.072192 * B
// U = -0.114592135 * R + -0.385407865 * G + 0.5 * B
// V = 0.5 * R + -0.454155718 * G + -0.045844282 * B
ALIGNED(16)
const uint16_t kRgb709ToYuv_coeffs_neon[8] = {3484, 11717, 1183, 1877, 6315, 8192, 7441, 751};

// RGB Display P3 -> Yuv Display P3
// Y = 0.2289746 * R + 0.6917385 * G + 0.0792869 * B
// U = -0.124346335 * R + -0.375653665 * G + 0.5 * B
// V = 0.5 * R + -0.448583471 * G + -0.051416529 * B
ALIGNED(16)
const uint16_t kRgbDispP3ToYuv_coeffs_neon[8] = {3752, 11333, 1299, 2037, 6155, 8192, 7350, 842};

// RGB Bt2100 -> Yuv Bt2100
// Y = 0.2627 * R + 0.677998 * G + 0.059302 * B
// U = -0.13963036 * R + -0.36036964 * G + 0.5 * B
// V = 0.5 * R + -0.459784348 * G + -0.040215652 * B
ALIGNED(16)
const uint16_t kRgb2100ToYuv_coeffs_neon[8] = {4304, 11108, 972, 2288, 5904, 8192, 7533, 659};
346 
// The core logic is taken from jsimd_rgb_ycc_convert_neon implementation in jccolext-neon.c of
// libjpeg-turbo
// Converts interleaved 32bpp RGBA to planar YCbCr 4:4:4, 16 pixels per
// iteration, using the packed fixed-point coefficients in |coeffs_ptr|
// (see the kRgb*ToYuv_coeffs_neon lane layout above). Alpha is ignored.
static void ConvertRgba8888ToYuv444_neon(uhdr_raw_image_t* src, uhdr_raw_image_t* dst,
                                         const uint16_t* coeffs_ptr) {
  // Implementation processes 16 pixel per iteration.
  assert(src->stride[UHDR_PLANE_PACKED] % 16 == 0);
  uint8_t* rgba_base_ptr = static_cast<uint8_t*>(src->planes[UHDR_PLANE_PACKED]);

  uint8_t* y_base_ptr = static_cast<uint8_t*>(dst->planes[UHDR_PLANE_Y]);
  uint8_t* u_base_ptr = static_cast<uint8_t*>(dst->planes[UHDR_PLANE_U]);
  uint8_t* v_base_ptr = static_cast<uint8_t*>(dst->planes[UHDR_PLANE_V]);

  const uint16x8_t coeffs = vld1q_u16(coeffs_ptr);
  // Chroma bias: the +128 unsigned-chroma offset in 2^14 fixed point, plus
  // 8191 (= 2^13 - 1) so the plain truncating shift used for Cb/Cr rounds —
  // the same trick libjpeg-turbo uses.
  const uint32x4_t bias = vdupq_n_u32((128 << 14) + 8191);

  unsigned int h = 0;
  do {
    unsigned int w = 0;
    // Packed stride is in pixels; each RGBA pixel is 4 bytes.
    uint8_t* rgba_ptr = rgba_base_ptr + (size_t)src->stride[UHDR_PLANE_PACKED] * 4 * h;
    uint8_t* y_ptr = y_base_ptr + (size_t)dst->stride[UHDR_PLANE_Y] * h;
    uint8_t* u_ptr = u_base_ptr + (size_t)dst->stride[UHDR_PLANE_U] * h;
    uint8_t* v_ptr = v_base_ptr + (size_t)dst->stride[UHDR_PLANE_V] * h;
    do {
      // De-interleave 16 RGBA pixels: val[0]=R, val[1]=G, val[2]=B, val[3]=A (unused).
      uint8x16x4_t rgb_pixels = vld4q_u8(rgba_ptr);

      // Widen each channel to 16 bits for the laned multiplies below.
      uint16x8_t r_l = vmovl_u8(vget_low_u8(rgb_pixels.val[0]));
      uint16x8_t g_l = vmovl_u8(vget_low_u8(rgb_pixels.val[1]));
      uint16x8_t b_l = vmovl_u8(vget_low_u8(rgb_pixels.val[2]));
      uint16x8_t r_h = vmovl_u8(vget_high_u8(rgb_pixels.val[0]));
      uint16x8_t g_h = vmovl_u8(vget_high_u8(rgb_pixels.val[1]));
      uint16x8_t b_h = vmovl_u8(vget_high_u8(rgb_pixels.val[2]));

      /* Compute Y: lanes 0..2 of coeffs are the (all-positive) R/G/B weights. */
      uint32x4_t y_ll = vmull_lane_u16(vget_low_u16(r_l), vget_low_u16(coeffs), 0);
      y_ll = vmlal_lane_u16(y_ll, vget_low_u16(g_l), vget_low_u16(coeffs), 1);
      y_ll = vmlal_lane_u16(y_ll, vget_low_u16(b_l), vget_low_u16(coeffs), 2);
      uint32x4_t y_lh = vmull_lane_u16(vget_high_u16(r_l), vget_low_u16(coeffs), 0);
      y_lh = vmlal_lane_u16(y_lh, vget_high_u16(g_l), vget_low_u16(coeffs), 1);
      y_lh = vmlal_lane_u16(y_lh, vget_high_u16(b_l), vget_low_u16(coeffs), 2);
      uint32x4_t y_hl = vmull_lane_u16(vget_low_u16(r_h), vget_low_u16(coeffs), 0);
      y_hl = vmlal_lane_u16(y_hl, vget_low_u16(g_h), vget_low_u16(coeffs), 1);
      y_hl = vmlal_lane_u16(y_hl, vget_low_u16(b_h), vget_low_u16(coeffs), 2);
      uint32x4_t y_hh = vmull_lane_u16(vget_high_u16(r_h), vget_low_u16(coeffs), 0);
      y_hh = vmlal_lane_u16(y_hh, vget_high_u16(g_h), vget_low_u16(coeffs), 1);
      y_hh = vmlal_lane_u16(y_hh, vget_high_u16(b_h), vget_low_u16(coeffs), 2);

      /* Compute Cb: bias - c3*R - c4*G + c5*B (coeffs store magnitudes; the
       * negative terms use vmlsl). */
      uint32x4_t cb_ll = bias;
      cb_ll = vmlsl_lane_u16(cb_ll, vget_low_u16(r_l), vget_low_u16(coeffs), 3);
      cb_ll = vmlsl_lane_u16(cb_ll, vget_low_u16(g_l), vget_high_u16(coeffs), 0);
      cb_ll = vmlal_lane_u16(cb_ll, vget_low_u16(b_l), vget_high_u16(coeffs), 1);
      uint32x4_t cb_lh = bias;
      cb_lh = vmlsl_lane_u16(cb_lh, vget_high_u16(r_l), vget_low_u16(coeffs), 3);
      cb_lh = vmlsl_lane_u16(cb_lh, vget_high_u16(g_l), vget_high_u16(coeffs), 0);
      cb_lh = vmlal_lane_u16(cb_lh, vget_high_u16(b_l), vget_high_u16(coeffs), 1);
      uint32x4_t cb_hl = bias;
      cb_hl = vmlsl_lane_u16(cb_hl, vget_low_u16(r_h), vget_low_u16(coeffs), 3);
      cb_hl = vmlsl_lane_u16(cb_hl, vget_low_u16(g_h), vget_high_u16(coeffs), 0);
      cb_hl = vmlal_lane_u16(cb_hl, vget_low_u16(b_h), vget_high_u16(coeffs), 1);
      uint32x4_t cb_hh = bias;
      cb_hh = vmlsl_lane_u16(cb_hh, vget_high_u16(r_h), vget_low_u16(coeffs), 3);
      cb_hh = vmlsl_lane_u16(cb_hh, vget_high_u16(g_h), vget_high_u16(coeffs), 0);
      cb_hh = vmlal_lane_u16(cb_hh, vget_high_u16(b_h), vget_high_u16(coeffs), 1);

      /* Compute Cr: bias + 0.5*R - c6*G - c7*B (lane 5 holds the shared 0.5). */
      uint32x4_t cr_ll = bias;
      cr_ll = vmlal_lane_u16(cr_ll, vget_low_u16(r_l), vget_high_u16(coeffs), 1);
      cr_ll = vmlsl_lane_u16(cr_ll, vget_low_u16(g_l), vget_high_u16(coeffs), 2);
      cr_ll = vmlsl_lane_u16(cr_ll, vget_low_u16(b_l), vget_high_u16(coeffs), 3);
      uint32x4_t cr_lh = bias;
      cr_lh = vmlal_lane_u16(cr_lh, vget_high_u16(r_l), vget_high_u16(coeffs), 1);
      cr_lh = vmlsl_lane_u16(cr_lh, vget_high_u16(g_l), vget_high_u16(coeffs), 2);
      cr_lh = vmlsl_lane_u16(cr_lh, vget_high_u16(b_l), vget_high_u16(coeffs), 3);
      uint32x4_t cr_hl = bias;
      cr_hl = vmlal_lane_u16(cr_hl, vget_low_u16(r_h), vget_high_u16(coeffs), 1);
      cr_hl = vmlsl_lane_u16(cr_hl, vget_low_u16(g_h), vget_high_u16(coeffs), 2);
      cr_hl = vmlsl_lane_u16(cr_hl, vget_low_u16(b_h), vget_high_u16(coeffs), 3);
      uint32x4_t cr_hh = bias;
      cr_hh = vmlal_lane_u16(cr_hh, vget_high_u16(r_h), vget_high_u16(coeffs), 1);
      cr_hh = vmlsl_lane_u16(cr_hh, vget_high_u16(g_h), vget_high_u16(coeffs), 2);
      cr_hh = vmlsl_lane_u16(cr_hh, vget_high_u16(b_h), vget_high_u16(coeffs), 3);

      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 14), vrshrn_n_u32(y_lh, 14));
      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 14), vrshrn_n_u32(y_hh, 14));
      /* Descale Cb values (right shift) and narrow to 16-bit. */
      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 14), vshrn_n_u32(cb_lh, 14));
      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 14), vshrn_n_u32(cb_hh, 14));
      /* Descale Cr values (right shift) and narrow to 16-bit. */
      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 14), vshrn_n_u32(cr_lh, 14));
      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 14), vshrn_n_u32(cr_hh, 14));

      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
       */
      vst1q_u8(y_ptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
      vst1q_u8(u_ptr, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
      vst1q_u8(v_ptr, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));

      /* Increment pointers. */
      rgba_ptr += (16 * 4);
      y_ptr += 16;
      u_ptr += 16;
      v_ptr += 16;

      w += 16;
    } while (w < src->w);
  } while (++h < src->h);
}
456 
convert_raw_input_to_ycbcr_neon(uhdr_raw_image_t * src)457 std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_neon(uhdr_raw_image_t* src) {
458   if (src->fmt == UHDR_IMG_FMT_32bppRGBA8888) {
459     std::unique_ptr<uhdr_raw_image_ext_t> dst = nullptr;
460     const uint16_t* coeffs_ptr = nullptr;
461 
462     if (src->cg == UHDR_CG_BT_709) {
463       coeffs_ptr = kRgb709ToYuv_coeffs_neon;
464     } else if (src->cg == UHDR_CG_BT_2100) {
465       coeffs_ptr = kRgbDispP3ToYuv_coeffs_neon;
466     } else if (src->cg == UHDR_CG_DISPLAY_P3) {
467       coeffs_ptr = kRgb2100ToYuv_coeffs_neon;
468     } else {
469       return dst;
470     }
471     dst = std::make_unique<uhdr_raw_image_ext_t>(UHDR_IMG_FMT_24bppYCbCr444, src->cg, src->ct,
472                                                  UHDR_CR_FULL_RANGE, src->w, src->h, 64);
473     ConvertRgba8888ToYuv444_neon(src, dst.get(), coeffs_ptr);
474     return dst;
475   }
476   return nullptr;
477 }
478 
479 }  // namespace ultrahdr
480