/*
 * jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jccolor-neon.c */


/* RGB -> YCbCr conversion is defined by the following equations:
 *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
 *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128
 *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + 128
 *
 * Avoid floating point arithmetic by using shifted integer constants:
 *    0.29899597 = 19595 * 2^-16
 *    0.58700561 = 38470 * 2^-16
 *    0.11399841 =  7471 * 2^-16
 *    0.16874695 = 11059 * 2^-16
 *    0.33125305 = 21709 * 2^-16
 *    0.50000000 = 32768 * 2^-16
 *    0.41868592 = 27439 * 2^-16
 *    0.08131409 =  5329 * 2^-16
 * These constants are defined in jccolor-neon.c
 *
 * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
 * rounds up or down the result via integer truncation.
 */

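/* As a quick check on the constants above: the three Y coefficients sum to
 * exactly 2^16 (19595 + 38470 + 7471 = 65536), so a pure white pixel
 * (R = G = B = 255) maps to Y = (65536 * 255 + 32768) >> 16 = 255, and the
 * Cb and Cr coefficients each cancel for gray input (11059 + 21709 = 32768
 * and 27439 + 5329 = 32768), so any gray pixel (R = G = B) maps to
 * Cb = Cr = 128.  A minimal scalar sketch of the fixed-point arithmetic
 * performed by the vector code below (r, g, and b are illustrative per-pixel
 * values in the range 0..255):
 *
 *   uint32_t y  = (19595 * r + 38470 * g + 7471 * b + 32768) >> 16;
 *   uint32_t cb = ((128 << 16) + 32767 - 11059 * r - 21709 * g +
 *                  32768 * b) >> 16;
 *   uint32_t cr = ((128 << 16) + 32767 + 32768 * r - 27439 * g -
 *                  5329 * b) >> 16;
 */
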
void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
                                JSAMPIMAGE output_buf, JDIMENSION output_row,
                                int num_rows)
{
  /* Pointer to RGB(X/A) input data */
  JSAMPROW inptr;
  /* Pointers to Y, Cb, and Cr output data */
  JSAMPROW outptr0, outptr1, outptr2;
  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];

  /* Set up conversion constants. */
  const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
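  /* scaled_128_5 combines the +128 offset for Cb and Cr with the fixed-point
   * rounding term described in the header comment, so the plain truncating
   * narrowing shifts (vshrn_n_u32) used for Cb and Cr below still yield
   * rounded results.
   */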
  const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
    outptr2 = output_buf[2][output_row];
    output_row++;

    int cols_remaining = image_width;
    for (; cols_remaining >= 16; cols_remaining -= 16) {

#if RGB_PIXELSIZE == 4
      uint8x16x4_t input_pixels = vld4q_u8(inptr);
#else
      uint8x16x3_t input_pixels = vld3q_u8(inptr);
#endif
      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
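      /* The 16 pixels are widened to 16-bit in two groups of eight (suffixes
       * _l and _h); each widening multiply-accumulate below then operates on
       * four pixels at a time, producing 32-bit accumulators with suffixes
       * _ll, _lh, _hl, and _hh.
       */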

      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);

      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
      uint32x4_t cb_ll = scaled_128_5;
      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
      uint32x4_t cb_lh = scaled_128_5;
      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
      uint32x4_t cb_hl = scaled_128_5;
      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
      uint32x4_t cb_hh = scaled_128_5;
      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);

      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
      uint32x4_t cr_ll = scaled_128_5;
      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
      uint32x4_t cr_lh = scaled_128_5;
      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
      uint32x4_t cr_hl = scaled_128_5;
      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
      uint32x4_t cr_hh = scaled_128_5;
      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);

      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
                                    vrshrn_n_u32(y_lh, 16));
      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
                                    vrshrn_n_u32(y_hh, 16));
      /* Descale Cb values (right shift) and narrow to 16-bit. */
      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
                                     vshrn_n_u32(cb_lh, 16));
      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
                                     vshrn_n_u32(cb_hh, 16));
      /* Descale Cr values (right shift) and narrow to 16-bit. */
      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
                                     vshrn_n_u32(cr_lh, 16));
      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
                                     vshrn_n_u32(cr_hh, 16));
      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
       */
      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));

      /* Increment pointers. */
      inptr += (16 * RGB_PIXELSIZE);
      outptr0 += 16;
      outptr1 += 16;
      outptr2 += 16;
    }

    if (cols_remaining > 8) {
      /* To prevent buffer overread by the vector load instructions, the last
       * (image_width % 16) columns of data are first memcopied to a temporary
       * buffer large enough to accommodate the vector load.
       */
      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
      inptr = tmp_buf;

#if RGB_PIXELSIZE == 4
      uint8x16x4_t input_pixels = vld4q_u8(inptr);
#else
      uint8x16x3_t input_pixels = vld3q_u8(inptr);
#endif
      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));

      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);

      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
      uint32x4_t cb_ll = scaled_128_5;
      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
      uint32x4_t cb_lh = scaled_128_5;
      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
      uint32x4_t cb_hl = scaled_128_5;
      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
      uint32x4_t cb_hh = scaled_128_5;
      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);

      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
      uint32x4_t cr_ll = scaled_128_5;
      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
      uint32x4_t cr_lh = scaled_128_5;
      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
      uint32x4_t cr_hl = scaled_128_5;
      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
      uint32x4_t cr_hh = scaled_128_5;
      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);

      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
                                    vrshrn_n_u32(y_lh, 16));
      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
                                    vrshrn_n_u32(y_hh, 16));
      /* Descale Cb values (right shift) and narrow to 16-bit. */
      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
                                     vshrn_n_u32(cb_lh, 16));
      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
                                     vshrn_n_u32(cb_hh, 16));
      /* Descale Cr values (right shift) and narrow to 16-bit. */
      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
                                     vshrn_n_u32(cr_lh, 16));
      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
                                     vshrn_n_u32(cr_hh, 16));
      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
       */
      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));

    } else if (cols_remaining > 0) {
      /* To prevent buffer overread by the vector load instructions, the
       * remaining (at most eight) columns of data are first memcopied to a
       * temporary buffer large enough to accommodate the vector load.
       */
      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
      inptr = tmp_buf;

#if RGB_PIXELSIZE == 4
      uint8x8x4_t input_pixels = vld4_u8(inptr);
#else
      uint8x8x3_t input_pixels = vld3_u8(inptr);
#endif
      uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
      uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
      uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);

      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
      uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
      y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
      y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
      uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
      y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
      y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);

      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
      uint32x4_t cb_l = scaled_128_5;
      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
      cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
      uint32x4_t cb_h = scaled_128_5;
      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
      cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);

      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
      uint32x4_t cr_l = scaled_128_5;
      cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
      uint32x4_t cr_h = scaled_128_5;
      cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);

      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
      uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
                                      vrshrn_n_u32(y_h, 16));
      /* Descale Cb values (right shift) and narrow to 16-bit. */
      uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
                                       vshrn_n_u32(cb_h, 16));
      /* Descale Cr values (right shift) and narrow to 16-bit. */
      uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
                                       vshrn_n_u32(cr_h, 16));
      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
       */
      vst1_u8(outptr0, vmovn_u16(y_u16));
      vst1_u8(outptr1, vmovn_u16(cb_u16));
      vst1_u8(outptr2, vmovn_u16(cr_u16));
    }
  }
}