/*
 * jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jccolor-neon.c */


/* RGB -> YCbCr conversion is defined by the following equations:
 *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
 *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128
 *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + 128
 *
 * Avoid floating point arithmetic by using shifted integer constants:
 *    0.29899597 = 19595 * 2^-16
 *    0.58700561 = 38470 * 2^-16
 *    0.11399841 =  7471 * 2^-16
 *    0.16874695 = 11059 * 2^-16
 *    0.33125305 = 21709 * 2^-16
 *    0.50000000 = 32768 * 2^-16
 *    0.41868592 = 27439 * 2^-16
 *    0.08131409 =  5329 * 2^-16
 * These constants are defined in jccolor-neon.c
 *
 * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
 * rounds up or down the result via integer truncation.
 */
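/* A quick check on the constants above: the three Y coefficients sum to
 * exactly 2^16 (19595 + 38470 + 7471 = 65536), so a neutral gray pixel with
 * R = G = B = v yields Y = v exactly after the 16-bit descale.  Similarly,
 * 11059 + 21709 = 32768 and 27439 + 5329 = 32768, so for such a pixel the
 * chroma terms cancel and Cb = Cr = 128.
 */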

void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
                                JSAMPIMAGE output_buf, JDIMENSION output_row,
                                int num_rows)
{
  /* Pointer to RGB(X/A) input data */
  JSAMPROW inptr;
  /* Pointers to Y, Cb, and Cr output data */
  JSAMPROW outptr0, outptr1, outptr2;
  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];

  /* Set up conversion constants. */
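  /* Lanes 0-7 of consts hold the eight coefficients in the order they are
   * listed in the comment at the top of this file; the multiply-accumulate
   * intrinsics below select each coefficient by lane index.
   */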
  const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
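  /* scaled_128_5 combines the +128 chroma offset with (just under) 0.5 in
   * 16.16 fixed point, i.e. (128 << 16) + 32767, so that the truncating
   * right shift used to descale Cb and Cr below rounds to nearest (see the
   * note above).
   */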
  const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
    outptr2 = output_buf[2][output_row];
    output_row++;

    int cols_remaining = image_width;
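    /* Process 16 pixels per iteration of the vectorized loop; the remaining
     * (image_width % 16) pixels in the row are handled below.
     */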
    for (; cols_remaining >= 16; cols_remaining -= 16) {

#if RGB_PIXELSIZE == 4
      uint8x16x4_t input_pixels = vld4q_u8(inptr);
#else
      uint8x16x3_t input_pixels = vld3q_u8(inptr);
#endif
      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));

      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);

      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
      uint32x4_t cb_ll = scaled_128_5;
      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
      uint32x4_t cb_lh = scaled_128_5;
      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
      uint32x4_t cb_hl = scaled_128_5;
      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
      uint32x4_t cb_hh = scaled_128_5;
      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);

      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
      uint32x4_t cr_ll = scaled_128_5;
      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
      uint32x4_t cr_lh = scaled_128_5;
      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
      uint32x4_t cr_hl = scaled_128_5;
      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
      uint32x4_t cr_hh = scaled_128_5;
      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);

      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
                                    vrshrn_n_u32(y_lh, 16));
      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
                                    vrshrn_n_u32(y_hh, 16));
      /* Descale Cb values (right shift) and narrow to 16-bit. */
      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
                                     vshrn_n_u32(cb_lh, 16));
      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
                                     vshrn_n_u32(cb_hh, 16));
      /* Descale Cr values (right shift) and narrow to 16-bit. */
      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
                                     vshrn_n_u32(cr_lh, 16));
      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
                                     vshrn_n_u32(cr_hh, 16));
      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
       */
      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));

      /* Increment pointers. */
      inptr += (16 * RGB_PIXELSIZE);
      outptr0 += 16;
      outptr1 += 16;
      outptr2 += 16;
    }

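    /* Handle the tail of the row: if 9-15 pixels remain, reuse the 16-wide
     * path above on a padded copy of the input; if 1-8 pixels remain, use an
     * 8-wide path instead.
     */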
    if (cols_remaining > 8) {
      /* To prevent buffer overread by the vector load instructions, the last
       * (image_width % 16) columns of data are first memcopied to a temporary
       * buffer large enough to accommodate the vector load.
       */
      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
      inptr = tmp_buf;

#if RGB_PIXELSIZE == 4
      uint8x16x4_t input_pixels = vld4q_u8(inptr);
#else
      uint8x16x3_t input_pixels = vld3q_u8(inptr);
#endif
      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));

      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);

      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
      uint32x4_t cb_ll = scaled_128_5;
      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
      uint32x4_t cb_lh = scaled_128_5;
      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
      uint32x4_t cb_hl = scaled_128_5;
      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
      uint32x4_t cb_hh = scaled_128_5;
      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);

      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
      uint32x4_t cr_ll = scaled_128_5;
      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
      uint32x4_t cr_lh = scaled_128_5;
      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
      uint32x4_t cr_hl = scaled_128_5;
      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
      uint32x4_t cr_hh = scaled_128_5;
      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);

      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
                                    vrshrn_n_u32(y_lh, 16));
      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
                                    vrshrn_n_u32(y_hh, 16));
      /* Descale Cb values (right shift) and narrow to 16-bit. */
      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
                                     vshrn_n_u32(cb_lh, 16));
      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
                                     vshrn_n_u32(cb_hh, 16));
      /* Descale Cr values (right shift) and narrow to 16-bit. */
      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
                                     vshrn_n_u32(cr_lh, 16));
      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
                                     vshrn_n_u32(cr_hh, 16));
      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
       */
      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));

    } else if (cols_remaining > 0) {
      /* To prevent buffer overread by the vector load instructions, the
       * remaining 1-8 columns of data are first memcopied to a temporary
       * buffer large enough to accommodate the vector load.
       */
      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
      inptr = tmp_buf;

#if RGB_PIXELSIZE == 4
      uint8x8x4_t input_pixels = vld4_u8(inptr);
#else
      uint8x8x3_t input_pixels = vld3_u8(inptr);
#endif
      uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
      uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
      uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);

      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
      uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
      y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
      y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
      uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
      y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
      y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);

      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
      uint32x4_t cb_l = scaled_128_5;
      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
      cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
      uint32x4_t cb_h = scaled_128_5;
      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
      cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);

      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
      uint32x4_t cr_l = scaled_128_5;
      cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
      uint32x4_t cr_h = scaled_128_5;
      cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);

      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
      uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
                                      vrshrn_n_u32(y_h, 16));
      /* Descale Cb values (right shift) and narrow to 16-bit. */
      uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
                                       vshrn_n_u32(cb_h, 16));
      /* Descale Cr values (right shift) and narrow to 16-bit. */
      uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
                                       vshrn_n_u32(cr_h, 16));
      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
       */
      vst1_u8(outptr0, vmovn_u16(y_u16));
      vst1_u8(outptr1, vmovn_u16(cb_u16));
      vst1_u8(outptr2, vmovn_u16(cr_u16));
    }
  }
}
317