1 /*
2 * Copyright 2024 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "ultrahdr/gainmapmath.h"
18
19 #include <arm_neon.h>
20 #include <cassert>
21
// ALIGNED(x) requests x-byte alignment for the declaration that follows it, using the
// compiler-specific spelling (MSVC __declspec vs. GCC/Clang __attribute__).
#ifdef _MSC_VER
#define ALIGNED(x) __declspec(align(x))
#else
#define ALIGNED(x) __attribute__((aligned(x)))
#endif
27
28 namespace ultrahdr {
29
// Fixed-point coefficient tables for YUV -> YUV gamut conversion.
//
// Scale all coefficients by 2^14 (rounded to the nearest integer) to avoid needing floating-point
// arithmetic. This can cause an off by one error compared to the scalar floating-point
// implementation.
//
// Removing conversion coefficients 1 and 0 (the Y column of each matrix) from the group for each
// standard leaves 6 coefficients. Pack them into a single 128-bit vector as follows, zeroing the
// remaining elements:
// {Y1, Y2, U1, U2, V1, V2, 0, 0}
// For every output row, element 1 multiplies the input U and element 2 multiplies the input V.
// Lanes 0-1 are read by yConversion_neon, lanes 2-3 by uConversion_neon and lanes 4-5 by
// vConversion_neon.

// Yuv Bt709 -> Yuv Bt601
// Y' = (1.0f * Y) + ( 0.101579f * U) + ( 0.196076f * V)
// U' = (0.0f * Y) + ( 0.989854f * U) + (-0.110653f * V)
// V' = (0.0f * Y) + (-0.072453f * U) + ( 0.983398f * V)
ALIGNED(16)
const int16_t kYuv709To601_coeffs_neon[8] = {1664, 3213, 16218, -1813, -1187, 16112, 0, 0};

// Yuv Bt709 -> Yuv Bt2100
// Y' = (1.0f * Y) + (-0.016969f * U) + ( 0.096312f * V)
// U' = (0.0f * Y) + ( 0.995306f * U) + (-0.051192f * V)
// V' = (0.0f * Y) + ( 0.011507f * U) + ( 1.002637f * V)
ALIGNED(16)
const int16_t kYuv709To2100_coeffs_neon[8] = {-278, 1578, 16307, -839, 189, 16427, 0, 0};

// Yuv Bt601 -> Yuv Bt709
// Y' = (1.0f * Y) + (-0.118188f * U) + (-0.212685f * V)
// U' = (0.0f * Y) + ( 1.018640f * U) + ( 0.114618f * V)
// V' = (0.0f * Y) + ( 0.075049f * U) + ( 1.025327f * V)
ALIGNED(16)
const int16_t kYuv601To709_coeffs_neon[8] = {-1936, -3485, 16689, 1878, 1230, 16799, 0, 0};

// Yuv Bt601 -> Yuv Bt2100
// Y' = (1.0f * Y) + (-0.128245f * U) + (-0.115879f * V)
// U' = (0.0f * Y) + ( 1.010016f * U) + ( 0.061592f * V)
// V' = (0.0f * Y) + ( 0.086969f * U) + ( 1.029350f * V)
ALIGNED(16)
const int16_t kYuv601To2100_coeffs_neon[8] = {-2101, -1899, 16548, 1009, 1425, 16865, 0, 0};

// Yuv Bt2100 -> Yuv Bt709
// Y' = (1.0f * Y) + ( 0.018149f * U) + (-0.095132f * V)
// U' = (0.0f * Y) + ( 1.004123f * U) + ( 0.051267f * V)
// V' = (0.0f * Y) + (-0.011524f * U) + ( 0.996782f * V)
ALIGNED(16)
const int16_t kYuv2100To709_coeffs_neon[8] = {297, -1559, 16452, 840, -189, 16331, 0, 0};

// Yuv Bt2100 -> Yuv Bt601
// Y' = (1.0f * Y) + ( 0.117887f * U) + ( 0.105521f * V)
// U' = (0.0f * Y) + ( 0.995211f * U) + (-0.059549f * V)
// V' = (0.0f * Y) + (-0.084085f * U) + ( 0.976518f * V)
ALIGNED(16)
const int16_t kYuv2100To601_coeffs_neon[8] = {1931, 1729, 16306, -976, -1378, 15999, 0, 0};
78
// Computes the converted luma for 8 pixels: y + (Y1 * u + Y2 * v) / 2^14.
//
// y      - 8 unsigned 8-bit luma samples.
// u, v   - the matching 8 chroma samples as signed 16-bit values (bias already removed by the
//          caller).
// coeffs - packed coefficient vector; only lanes 0 (Y1) and 1 (Y2) are read here.
//
// Returns the result as signed 16-bit lanes; narrowing to 8 bits is left to the caller.
static inline int16x8_t yConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) {
  const int16x4_t y_coeffs = vget_low_s16(coeffs);

  // Widening multiply-accumulate so the products are held with 32-bit precision.
  const int32x4_t chroma_term_lo =
      vmlal_lane_s16(vmull_lane_s16(vget_low_s16(u), y_coeffs, 0), vget_low_s16(v), y_coeffs, 1);
  const int32x4_t chroma_term_hi =
      vmlal_lane_s16(vmull_lane_s16(vget_high_s16(u), y_coeffs, 0), vget_high_s16(v), y_coeffs, 1);

  // Descale result to account for coefficients being scaled by 2^14 (rounding, saturating narrow).
  const int16x8_t chroma_term =
      vcombine_s16(vqrshrn_n_s32(chroma_term_lo, 14), vqrshrn_n_s32(chroma_term_hi, 14));

  // Add the widened luma. The addition is done on the unsigned reinterpretation of the lanes and
  // the bits are reinterpreted back as signed afterwards.
  const uint16x8_t luma_sum = vaddw_u8(vreinterpretq_u16_s16(chroma_term), y);
  return vreinterpretq_s16_u16(luma_sum);
}
90
// Computes the converted U for 8 pixels: (U1 * u + U2 * v) / 2^14.
//
// u, v   - 8 chroma samples each, as signed 16-bit values (bias already removed by the caller).
// coeffs - packed coefficient vector; only lanes 2 (U1) and 3 (U2) are read here.
//
// Returns signed 16-bit lanes; the caller re-applies the bias and narrows to 8 bits.
static inline int16x8_t uConversion_neon(int16x8_t u, int16x8_t v, int16x8_t coeffs) {
  const int16x4_t coeffs_lo = vget_low_s16(coeffs);

  // Widening multiply-accumulate so the products are held with 32-bit precision.
  const int32x4_t sum_lo =
      vmlal_lane_s16(vmull_lane_s16(vget_low_s16(u), coeffs_lo, 2), vget_low_s16(v), coeffs_lo, 3);
  const int32x4_t sum_hi = vmlal_lane_s16(vmull_lane_s16(vget_high_s16(u), coeffs_lo, 2),
                                          vget_high_s16(v), coeffs_lo, 3);

  // Descale result to account for coefficients being scaled by 2^14.
  return vcombine_s16(vqrshrn_n_s32(sum_lo, 14), vqrshrn_n_s32(sum_hi, 14));
}
101
// Computes the converted V for 8 pixels: (V1 * u + V2 * v) / 2^14.
//
// u, v   - 8 chroma samples each, as signed 16-bit values (bias already removed by the caller).
// coeffs - packed coefficient vector; only lanes 4 (V1) and 5 (V2) are read here, i.e. lanes 0
//          and 1 of its high half.
//
// Returns signed 16-bit lanes; the caller re-applies the bias and narrows to 8 bits.
static inline int16x8_t vConversion_neon(int16x8_t u, int16x8_t v, int16x8_t coeffs) {
  const int16x4_t coeffs_hi = vget_high_s16(coeffs);

  // Widening multiply-accumulate so the products are held with 32-bit precision.
  const int32x4_t sum_lo =
      vmlal_lane_s16(vmull_lane_s16(vget_low_s16(u), coeffs_hi, 0), vget_low_s16(v), coeffs_hi, 1);
  const int32x4_t sum_hi = vmlal_lane_s16(vmull_lane_s16(vget_high_s16(u), coeffs_hi, 0),
                                          vget_high_s16(v), coeffs_hi, 1);

  // Descale result to account for coefficients being scaled by 2^14.
  return vcombine_s16(vqrshrn_n_s32(sum_lo, 14), vqrshrn_n_s32(sum_hi, 14));
}
112
// Converts 8 pixels of Y, U and V in one call.
//
// y      - 8 unsigned 8-bit luma samples.
// u, v   - the matching chroma samples as signed 16-bit values.
// coeffs - packed coefficient vector {Y1, Y2, U1, U2, V1, V2, 0, 0}.
//
// Returns the converted planes as signed 16-bit vectors in val[0] (Y), val[1] (U), val[2] (V).
int16x8x3_t yuvConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) {
  int16x8x3_t converted;
  converted.val[0] = yConversion_neon(y, u, v, coeffs);
  converted.val[1] = uConversion_neon(u, v, coeffs);
  converted.val[2] = vConversion_neon(u, v, coeffs);
  return converted;
}
119
// Applies a YUV -> YUV gamut conversion in place to a planar 4:2:0 image.
//
// image      - image whose Y, U and V planes are read and overwritten.
// coeffs_ptr - 8 fixed-point coefficients laid out as {Y1, Y2, U1, U2, V1, V2, 0, 0}.
//
// Each inner iteration handles a 16x2 block of luma together with its 8 U and 8 V samples.
// Both loops are do/while loops, so one block is always processed even if w or h is below 2.
void transformYuv420_neon(uhdr_raw_image_t* image, const int16_t* coeffs_ptr) {
  // Implementation assumes the image width is a multiple of 16.
  assert(image->w % 16 == 0);

  uint8_t* luma_row = static_cast<uint8_t*>(image->planes[UHDR_PLANE_Y]);
  uint8_t* cb_row = static_cast<uint8_t*>(image->planes[UHDR_PLANE_U]);
  uint8_t* cr_row = static_cast<uint8_t*>(image->planes[UHDR_PLANE_V]);

  const int16x8_t coeffs = vld1q_s16(coeffs_ptr);
  const uint16x8_t minus_128 = vreinterpretq_u16_s16(vdupq_n_s16(-128));
  const int16x8_t plus_128 = vdupq_n_s16(128);

  size_t row_pair = 0;
  do {
    // One chroma row covers two luma rows.
    uint8_t* const top = luma_row;
    uint8_t* const bottom = luma_row + image->stride[UHDR_PLANE_Y];

    size_t cx = 0;  // column index in chroma samples
    do {
      const size_t lx = cx * 2;  // matching column index in luma samples

      const uint8x16_t top_y = vld1q_u8(top + lx);
      const uint8x16_t bottom_y = vld1q_u8(bottom + lx);
      const uint8x8_t cb = vld1_u8(cb_row + cx);
      const uint8x8_t cr = vld1_u8(cr_row + cx);

      // 128 bias for UV given we are using libjpeg; see:
      // https://github.com/kornelski/libjpeg/blob/master/structure.doc
      const int16x8_t cb_s16 = vreinterpretq_s16_u16(vaddw_u8(minus_128, cb));  // -128 + u
      const int16x8_t cr_s16 = vreinterpretq_s16_u16(vaddw_u8(minus_128, cr));  // -128 + v

      // Zipping a vector with itself repeats every chroma sample twice, so each one lines up with
      // the two horizontally adjacent luma samples it belongs to.
      const int16x8x2_t cb_dup = vzipq_s16(cb_s16, cb_s16);
      const int16x8x2_t cr_dup = vzipq_s16(cr_s16, cr_s16);

      const int16x8_t top_lo =
          yConversion_neon(vget_low_u8(top_y), cb_dup.val[0], cr_dup.val[0], coeffs);
      const int16x8_t top_hi =
          yConversion_neon(vget_high_u8(top_y), cb_dup.val[1], cr_dup.val[1], coeffs);
      const int16x8_t bottom_lo =
          yConversion_neon(vget_low_u8(bottom_y), cb_dup.val[0], cr_dup.val[0], coeffs);
      const int16x8_t bottom_hi =
          yConversion_neon(vget_high_u8(bottom_y), cb_dup.val[1], cr_dup.val[1], coeffs);

      // Chroma is converted at its native resolution, then the 128 bias is restored.
      const int16x8_t cb_biased = vaddq_s16(uConversion_neon(cb_s16, cr_s16, coeffs), plus_128);
      const int16x8_t cr_biased = vaddq_s16(vConversion_neon(cb_s16, cr_s16, coeffs), plus_128);

      // Narrow from 16-bit to 8-bit with saturation and write back over the inputs.
      vst1q_u8(top + lx, vcombine_u8(vqmovun_s16(top_lo), vqmovun_s16(top_hi)));
      vst1q_u8(bottom + lx, vcombine_u8(vqmovun_s16(bottom_lo), vqmovun_s16(bottom_hi)));
      vst1_u8(cb_row + cx, vqmovun_s16(cb_biased));
      vst1_u8(cr_row + cx, vqmovun_s16(cr_biased));

      cx += 8;
    } while (cx < image->w / 2);

    luma_row += image->stride[UHDR_PLANE_Y] * 2;
    cb_row += image->stride[UHDR_PLANE_U];
    cr_row += image->stride[UHDR_PLANE_V];
  } while (++row_pair < image->h / 2);
}
176
// Applies a YUV -> YUV gamut conversion in place to a planar 4:4:4 image.
//
// image      - image whose Y, U and V planes are read and overwritten.
// coeffs_ptr - 8 fixed-point coefficients laid out as {Y1, Y2, U1, U2, V1, V2, 0, 0}.
//
// Each inner iteration handles 16 pixels of one row. Both loops are do/while loops, so one
// 16-pixel group is always processed even if w or h is 0.
void transformYuv444_neon(uhdr_raw_image_t* image, const int16_t* coeffs_ptr) {
  // Implementation assumes the image width is a multiple of 16.
  assert(image->w % 16 == 0);

  uint8_t* luma_row = static_cast<uint8_t*>(image->planes[UHDR_PLANE_Y]);
  uint8_t* cb_row = static_cast<uint8_t*>(image->planes[UHDR_PLANE_U]);
  uint8_t* cr_row = static_cast<uint8_t*>(image->planes[UHDR_PLANE_V]);

  const int16x8_t coeffs = vld1q_s16(coeffs_ptr);
  const uint16x8_t minus_128 = vreinterpretq_u16_s16(vdupq_n_s16(-128));
  const int16x8_t plus_128 = vdupq_n_s16(128);

  size_t row = 0;
  do {
    size_t x = 0;
    do {
      const uint8x16_t y_in = vld1q_u8(luma_row + x);
      const uint8x16_t cb_in = vld1q_u8(cb_row + x);
      const uint8x16_t cr_in = vld1q_u8(cr_row + x);

      // 128 bias for UV given we are using libjpeg; see:
      // https://github.com/kornelski/libjpeg/blob/master/structure.doc
      const int16x8_t cb_lo = vreinterpretq_s16_u16(vaddw_u8(minus_128, vget_low_u8(cb_in)));
      const int16x8_t cr_lo = vreinterpretq_s16_u16(vaddw_u8(minus_128, vget_low_u8(cr_in)));
      const int16x8_t cb_hi = vreinterpretq_s16_u16(vaddw_u8(minus_128, vget_high_u8(cb_in)));
      const int16x8_t cr_hi = vreinterpretq_s16_u16(vaddw_u8(minus_128, vget_high_u8(cr_in)));

      const int16x8_t y_lo = yConversion_neon(vget_low_u8(y_in), cb_lo, cr_lo, coeffs);
      const int16x8_t y_hi = yConversion_neon(vget_high_u8(y_in), cb_hi, cr_hi, coeffs);

      // Convert chroma and restore the 128 bias.
      const int16x8_t cb_out_lo = vaddq_s16(uConversion_neon(cb_lo, cr_lo, coeffs), plus_128);
      const int16x8_t cb_out_hi = vaddq_s16(uConversion_neon(cb_hi, cr_hi, coeffs), plus_128);
      const int16x8_t cr_out_lo = vaddq_s16(vConversion_neon(cb_lo, cr_lo, coeffs), plus_128);
      const int16x8_t cr_out_hi = vaddq_s16(vConversion_neon(cb_hi, cr_hi, coeffs), plus_128);

      // Narrow from 16-bit to 8-bit with saturation and write back over the inputs.
      vst1q_u8(luma_row + x, vcombine_u8(vqmovun_s16(y_lo), vqmovun_s16(y_hi)));
      vst1q_u8(cb_row + x, vcombine_u8(vqmovun_s16(cb_out_lo), vqmovun_s16(cb_out_hi)));
      vst1q_u8(cr_row + x, vcombine_u8(vqmovun_s16(cr_out_lo), vqmovun_s16(cr_out_hi)));

      x += 16;
    } while (x < image->w);

    luma_row += image->stride[UHDR_PLANE_Y];
    cb_row += image->stride[UHDR_PLANE_U];
    cr_row += image->stride[UHDR_PLANE_V];
  } while (++row < image->h);
}
235
// Converts the YUV samples of `image` in place from `src_encoding` to `dst_encoding`.
//
// Supported gamuts are UHDR_CG_BT_709, UHDR_CG_DISPLAY_P3 and UHDR_CG_BT_2100; Display P3 is
// mapped to the Bt601 coefficient tables. When source and destination are the same supported
// gamut the image is left untouched and no error is reported (the pixel format is not inspected
// in that case).
//
// Returns g_no_error on success, UHDR_CODEC_INVALID_PARAM for an unrecognized gamut (the source
// gamut is validated first) and UHDR_CODEC_UNSUPPORTED_FEATURE when the image format is neither
// UHDR_IMG_FMT_12bppYCbCr420 nor UHDR_IMG_FMT_24bppYCbCr444.
uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t src_encoding,
                                  uhdr_color_gamut_t dst_encoding) {
  // Maps a supported gamut to its row/column in the table below, or -1 if it is not supported.
  const auto table_index = [](uhdr_color_gamut_t gamut) -> int {
    switch (gamut) {
      case UHDR_CG_BT_709:
        return 0;
      case UHDR_CG_DISPLAY_P3:
        return 1;
      case UHDR_CG_BT_2100:
        return 2;
      default:
        return -1;
    }
  };

  // Indexed as [source][destination]. The diagonal is never read: identical gamuts return early.
  const int16_t* const conversion_table[3][3] = {
      {nullptr, kYuv709To601_coeffs_neon, kYuv709To2100_coeffs_neon},
      {kYuv601To709_coeffs_neon, nullptr, kYuv601To2100_coeffs_neon},
      {kYuv2100To709_coeffs_neon, kYuv2100To601_coeffs_neon, nullptr},
  };

  uhdr_error_info_t result = g_no_error;

  const int src_idx = table_index(src_encoding);
  if (src_idx < 0) {
    result.error_code = UHDR_CODEC_INVALID_PARAM;
    result.has_detail = 1;
    snprintf(result.detail, sizeof result.detail, "Unrecognized src color gamut %d",
             src_encoding);
    return result;
  }

  const int dst_idx = table_index(dst_encoding);
  if (dst_idx < 0) {
    result.error_code = UHDR_CODEC_INVALID_PARAM;
    result.has_detail = 1;
    snprintf(result.detail, sizeof result.detail, "Unrecognized dest color gamut %d",
             dst_encoding);
    return result;
  }

  // Nothing to do when the gamut does not change.
  if (src_idx == dst_idx) {
    return result;
  }

  const int16_t* const coeffs = conversion_table[src_idx][dst_idx];
  switch (image->fmt) {
    case UHDR_IMG_FMT_12bppYCbCr420:
      transformYuv420_neon(image, coeffs);
      break;
    case UHDR_IMG_FMT_24bppYCbCr444:
      transformYuv444_neon(image, coeffs);
      break;
    default:
      result.error_code = UHDR_CODEC_UNSUPPORTED_FEATURE;
      result.has_detail = 1;
      snprintf(result.detail, sizeof result.detail,
               "No implementation available for performing gamut conversion for color format %d",
               image->fmt);
      break;
  }

  return result;
}
319
// Fixed-point coefficient tables for RGB -> YUV conversion.
//
// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an off
// by one error compared to the scalar floating-point implementation.
//
// The 3x3 conversion matrix contains 0.5 twice (the B term of U and the R term of V). It is stored
// only once in the table, which leaves 8 entries. The tables are unsigned, so negative
// coefficients are stored as magnitudes; ConvertRgba8888ToYuv444_neon subtracts those terms
// instead of adding them. Layout:
// {Yr, Yg, Yb, |Ur|, |Ug|, 0.5, |Vg|, |Vb|}

// RGB Bt709 -> Yuv Bt709
// Y =  0.212639    * R +  0.715169    * G +  0.072192    * B
// U = -0.114592135 * R + -0.385407865 * G +  0.5         * B
// V =  0.5         * R + -0.454155718 * G + -0.045844282 * B
ALIGNED(16)
const uint16_t kRgb709ToYuv_coeffs_neon[8] = {3484, 11717, 1183, 1877, 6315, 8192, 7441, 751};

// RGB Display P3 -> Yuv Display P3
// Y =  0.2289746   * R +  0.6917385   * G +  0.0792869   * B
// U = -0.124346335 * R + -0.375653665 * G +  0.5         * B
// V =  0.5         * R + -0.448583471 * G + -0.051416529 * B
ALIGNED(16)
const uint16_t kRgbDispP3ToYuv_coeffs_neon[8] = {3752, 11333, 1299, 2037, 6155, 8192, 7350, 842};

// RGB Bt2100 -> Yuv Bt2100
// Y =  0.2627      * R +  0.677998    * G +  0.059302    * B
// U = -0.13963036  * R + -0.36036964  * G +  0.5         * B
// V =  0.5         * R + -0.459784348 * G + -0.040215652 * B
ALIGNED(16)
const uint16_t kRgb2100ToYuv_coeffs_neon[8] = {4304, 11108, 972, 2288, 5904, 8192, 7533, 659};
346
// Converts a packed RGBA8888 image to planar 8-bit YCbCr 4:4:4, 16 pixels per iteration.
// The core logic is taken from jsimd_rgb_ycc_convert_neon implementation in jccolext-neon.c of
// libjpeg-turbo.
//
// src        - packed source image; only the R, G and B channels are used, alpha is ignored.
// dst        - destination image whose Y, U and V planes are written.
// coeffs_ptr - 8 fixed-point coefficients laid out as {Yr, Yg, Yb, |Ur|, |Ug|, 0.5, |Vg|, |Vb|}.
//
// Every store writes a full 16-byte vector, so when src->w is not a multiple of 16 the last
// iteration of a row reads and writes past src->w; the buffers must have room for that. Both
// loops are do/while loops and therefore run at least once.
static void ConvertRgba8888ToYuv444_neon(uhdr_raw_image_t* src, uhdr_raw_image_t* dst,
                                         const uint16_t* coeffs_ptr) {
  // Implementation processes 16 pixel per iteration.
  assert(src->stride[UHDR_PLANE_PACKED] % 16 == 0);

  uint8_t* const rgba_plane = static_cast<uint8_t*>(src->planes[UHDR_PLANE_PACKED]);
  uint8_t* const y_plane = static_cast<uint8_t*>(dst->planes[UHDR_PLANE_Y]);
  uint8_t* const u_plane = static_cast<uint8_t*>(dst->planes[UHDR_PLANE_U]);
  uint8_t* const v_plane = static_cast<uint8_t*>(dst->planes[UHDR_PLANE_V]);

  const uint16x8_t coeffs = vld1q_u16(coeffs_ptr);
  const uint16x4_t k_lo = vget_low_u16(coeffs);   // {Yr, Yg, Yb, |Ur|}
  const uint16x4_t k_hi = vget_high_u16(coeffs);  // {|Ug|, 0.5, |Vg|, |Vb|}
  // Chroma offset of 128 in 2^14 fixed point, plus 8191 (2^13 - 1) so that the truncating shift
  // used for Cb/Cr below behaves as a rounding one.
  const uint32x4_t bias = vdupq_n_u32((128 << 14) + 8191);

  // Y = Yr * R + Yg * G + Yb * B
  const auto luma = [&](uint16x4_t r, uint16x4_t g, uint16x4_t b) {
    uint32x4_t acc = vmull_lane_u16(r, k_lo, 0);
    acc = vmlal_lane_u16(acc, g, k_lo, 1);
    acc = vmlal_lane_u16(acc, b, k_lo, 2);
    return acc;
  };
  // Cb = bias - |Ur| * R - |Ug| * G + 0.5 * B
  const auto chroma_blue = [&](uint16x4_t r, uint16x4_t g, uint16x4_t b) {
    uint32x4_t acc = vmlsl_lane_u16(bias, r, k_lo, 3);
    acc = vmlsl_lane_u16(acc, g, k_hi, 0);
    acc = vmlal_lane_u16(acc, b, k_hi, 1);
    return acc;
  };
  // Cr = bias + 0.5 * R - |Vg| * G - |Vb| * B
  const auto chroma_red = [&](uint16x4_t r, uint16x4_t g, uint16x4_t b) {
    uint32x4_t acc = vmlal_lane_u16(bias, r, k_hi, 1);
    acc = vmlsl_lane_u16(acc, g, k_hi, 2);
    acc = vmlsl_lane_u16(acc, b, k_hi, 3);
    return acc;
  };

  unsigned int row = 0;
  do {
    uint8_t* const rgba_row =
        rgba_plane + static_cast<size_t>(src->stride[UHDR_PLANE_PACKED]) * 4 * row;
    uint8_t* const y_row = y_plane + static_cast<size_t>(dst->stride[UHDR_PLANE_Y]) * row;
    uint8_t* const u_row = u_plane + static_cast<size_t>(dst->stride[UHDR_PLANE_U]) * row;
    uint8_t* const v_row = v_plane + static_cast<size_t>(dst->stride[UHDR_PLANE_V]) * row;

    unsigned int col = 0;
    do {
      // De-interleaving load: val[0] = R, val[1] = G, val[2] = B, val[3] = A (unused).
      const uint8x16x4_t pixels = vld4q_u8(rgba_row + static_cast<size_t>(col) * 4);

      // Widen every channel to 16 bits, eight pixels per vector.
      const uint16x8_t r_first8 = vmovl_u8(vget_low_u8(pixels.val[0]));
      const uint16x8_t g_first8 = vmovl_u8(vget_low_u8(pixels.val[1]));
      const uint16x8_t b_first8 = vmovl_u8(vget_low_u8(pixels.val[2]));
      const uint16x8_t r_last8 = vmovl_u8(vget_high_u8(pixels.val[0]));
      const uint16x8_t g_last8 = vmovl_u8(vget_high_u8(pixels.val[1]));
      const uint16x8_t b_last8 = vmovl_u8(vget_high_u8(pixels.val[2]));

      // Split into four groups of four pixels: 0-3, 4-7, 8-11, 12-15.
      const uint16x4_t r[4] = {vget_low_u16(r_first8), vget_high_u16(r_first8),
                               vget_low_u16(r_last8), vget_high_u16(r_last8)};
      const uint16x4_t g[4] = {vget_low_u16(g_first8), vget_high_u16(g_first8),
                               vget_low_u16(g_last8), vget_high_u16(g_last8)};
      const uint16x4_t b[4] = {vget_low_u16(b_first8), vget_high_u16(b_first8),
                               vget_low_u16(b_last8), vget_high_u16(b_last8)};

      uint32x4_t y_acc[4];
      uint32x4_t cb_acc[4];
      uint32x4_t cr_acc[4];
      for (int q = 0; q < 4; ++q) {
        y_acc[q] = luma(r[q], g[q], b[q]);
        cb_acc[q] = chroma_blue(r[q], g[q], b[q]);
        cr_acc[q] = chroma_red(r[q], g[q], b[q]);
      }

      // Descale Y values (rounding right shift) and narrow to 16-bit.
      const uint16x8_t y_first8 =
          vcombine_u16(vrshrn_n_u32(y_acc[0], 14), vrshrn_n_u32(y_acc[1], 14));
      const uint16x8_t y_last8 =
          vcombine_u16(vrshrn_n_u32(y_acc[2], 14), vrshrn_n_u32(y_acc[3], 14));
      // Descale Cb values (right shift) and narrow to 16-bit.
      const uint16x8_t cb_first8 =
          vcombine_u16(vshrn_n_u32(cb_acc[0], 14), vshrn_n_u32(cb_acc[1], 14));
      const uint16x8_t cb_last8 =
          vcombine_u16(vshrn_n_u32(cb_acc[2], 14), vshrn_n_u32(cb_acc[3], 14));
      // Descale Cr values (right shift) and narrow to 16-bit.
      const uint16x8_t cr_first8 =
          vcombine_u16(vshrn_n_u32(cr_acc[0], 14), vshrn_n_u32(cr_acc[1], 14));
      const uint16x8_t cr_last8 =
          vcombine_u16(vshrn_n_u32(cr_acc[2], 14), vshrn_n_u32(cr_acc[3], 14));

      // Narrow Y, Cb, and Cr values to 8-bit and store to memory.
      vst1q_u8(y_row + col, vcombine_u8(vmovn_u16(y_first8), vmovn_u16(y_last8)));
      vst1q_u8(u_row + col, vcombine_u8(vmovn_u16(cb_first8), vmovn_u16(cb_last8)));
      vst1q_u8(v_row + col, vcombine_u8(vmovn_u16(cr_first8), vmovn_u16(cr_last8)));

      col += 16;
    } while (col < src->w);
  } while (++row < src->h);
}
456
convert_raw_input_to_ycbcr_neon(uhdr_raw_image_t * src)457 std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_neon(uhdr_raw_image_t* src) {
458 if (src->fmt == UHDR_IMG_FMT_32bppRGBA8888) {
459 std::unique_ptr<uhdr_raw_image_ext_t> dst = nullptr;
460 const uint16_t* coeffs_ptr = nullptr;
461
462 if (src->cg == UHDR_CG_BT_709) {
463 coeffs_ptr = kRgb709ToYuv_coeffs_neon;
464 } else if (src->cg == UHDR_CG_BT_2100) {
465 coeffs_ptr = kRgbDispP3ToYuv_coeffs_neon;
466 } else if (src->cg == UHDR_CG_DISPLAY_P3) {
467 coeffs_ptr = kRgb2100ToYuv_coeffs_neon;
468 } else {
469 return dst;
470 }
471 dst = std::make_unique<uhdr_raw_image_ext_t>(UHDR_IMG_FMT_24bppYCbCr444, src->cg, src->ct,
472 UHDR_CR_FULL_RANGE, src->w, src->h, 64);
473 ConvertRgba8888ToYuv444_neon(src, dst.get(), coeffs_ptr);
474 return dst;
475 }
476 return nullptr;
477 }
478
479 } // namespace ultrahdr
480