/*
 * jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jccolor-neon.c */


/* RGB -> YCbCr conversion is defined by the following equations:
 *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
 *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
 *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 *
 * Avoid floating point arithmetic by using shifted integer constants:
 *    0.29899597 = 19595 * 2^-16
 *    0.58700561 = 38470 * 2^-16
 *    0.11399841 =  7471 * 2^-16
 *    0.16874695 = 11059 * 2^-16
 *    0.33125305 = 21709 * 2^-16
 *    0.50000000 = 32768 * 2^-16
 *    0.41868592 = 27439 * 2^-16
 *    0.08131409 =  5329 * 2^-16
 * These constants are defined in jccolor-neon.c
 *
 * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
 * rounds up or down the result via integer truncation.
 */

/* Convert rows of interleaved RGB(X/A) pixels to planar Y, Cb, and Cr samples.
 *
 * image_width  number of pixels in each row
 * input_buf    array of input row pointers; one pointer is consumed per row,
 *              starting at input_buf[0]
 * output_buf   output_buf[0], output_buf[1], and output_buf[2] are the Y, Cb,
 *              and Cr planes, each indexed by row
 * output_row   index of the first row written in each output plane; it is
 *              incremented after every converted row
 * num_rows     number of rows to convert (nothing is done if <= 0)
 *
 * Each row is processed 16 pixels at a time.  A tail of 9-15 pixels reuses the
 * 16-pixel path, and a tail of 1-8 pixels uses an 8-pixel path; in both cases
 * the input is staged through a stack buffer so that the vector loads never
 * read past the end of the input row.  The vector stores always write a full
 * 16 or 8 samples, so the output rows must tolerate the overwrite described
 * in the comments at the store sites.
 */
void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
                                JSAMPIMAGE output_buf, JDIMENSION output_row,
                                int num_rows)
{
  /* Pointer to RGB(X/A) input data */
  JSAMPROW inptr;
  /* Pointers to Y, Cb, and Cr output data */
  JSAMPROW outptr0, outptr1, outptr2;
  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];

  /* Set up conversion constants.  The lanes of consts are used below as
   * follows: 0-2 for the Y terms, 3-5 for the Cb terms, and 5-7 for the Cr
   * terms (lane 5 is shared by Cb and Cr).  The table itself is defined
   * outside this file.
   */
  const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
  /* 128 scaled by 2^16, plus 32767 (just under 0.5 in the same fixed-point
   * format).  Cb and Cr accumulate onto this value, so the plain (truncating)
   * right shift applied to them later yields a rounded, offset result.
   */
  const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
    outptr2 = output_buf[2][output_row];
    output_row++;

    int cols_remaining = image_width;
    for (; cols_remaining >= 16; cols_remaining -= 16) {

      /* De-interleave 16 pixels into one vector per color channel. */
#if RGB_PIXELSIZE == 4
      uint8x16x4_t input_pixels = vld4q_u8(inptr);
#else
      uint8x16x3_t input_pixels = vld3q_u8(inptr);
#endif
      /* Widen the 8-bit samples to 16-bit: *_l holds pixels 0-7 and *_h holds
       * pixels 8-15.
       */
      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));

      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);

      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
      uint32x4_t cb_ll = scaled_128_5;
      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
      uint32x4_t cb_lh = scaled_128_5;
      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
      uint32x4_t cb_hl = scaled_128_5;
      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
      uint32x4_t cb_hh = scaled_128_5;
      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);

      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
      uint32x4_t cr_ll = scaled_128_5;
      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
      uint32x4_t cr_lh = scaled_128_5;
      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
      uint32x4_t cr_hl = scaled_128_5;
      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
      uint32x4_t cr_hh = scaled_128_5;
      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);

      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
                                    vrshrn_n_u32(y_lh, 16));
      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
                                    vrshrn_n_u32(y_hh, 16));
      /* Descale Cb values (right shift) and narrow to 16-bit. */
      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
                                     vshrn_n_u32(cb_lh, 16));
      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
                                     vshrn_n_u32(cb_hh, 16));
      /* Descale Cr values (right shift) and narrow to 16-bit. */
      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
                                     vshrn_n_u32(cr_lh, 16));
      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
                                     vshrn_n_u32(cr_hh, 16));
      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
       */
      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));

      /* Increment pointers. */
      inptr += (16 * RGB_PIXELSIZE);
      outptr0 += 16;
      outptr1 += 16;
      outptr2 += 16;
    }

    if (cols_remaining > 8) {
      /* To prevent buffer overread by the vector load instructions, the last
       * (image_width % 16) columns of data are first memcopied to a temporary
       * buffer large enough to accommodate the vector load.  Only
       * cols_remaining pixels are copied; the remaining lanes hold whatever
       * tmp_buf already contained, and the samples computed from them are
       * written into the permitted overwrite region of the output rows.
       */
      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
      inptr = tmp_buf;

#if RGB_PIXELSIZE == 4
      uint8x16x4_t input_pixels = vld4q_u8(inptr);
#else
      uint8x16x3_t input_pixels = vld3q_u8(inptr);
#endif
      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));

      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);

      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
      uint32x4_t cb_ll = scaled_128_5;
      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
      uint32x4_t cb_lh = scaled_128_5;
      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
      uint32x4_t cb_hl = scaled_128_5;
      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
      uint32x4_t cb_hh = scaled_128_5;
      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);

      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
      uint32x4_t cr_ll = scaled_128_5;
      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
      uint32x4_t cr_lh = scaled_128_5;
      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
      uint32x4_t cr_hl = scaled_128_5;
      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
      uint32x4_t cr_hh = scaled_128_5;
      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);

      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
                                    vrshrn_n_u32(y_lh, 16));
      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
                                    vrshrn_n_u32(y_hh, 16));
      /* Descale Cb values (right shift) and narrow to 16-bit. */
      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
                                     vshrn_n_u32(cb_lh, 16));
      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
                                     vshrn_n_u32(cb_hh, 16));
      /* Descale Cr values (right shift) and narrow to 16-bit. */
      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
                                     vshrn_n_u32(cr_lh, 16));
      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
                                     vshrn_n_u32(cr_hh, 16));
      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
       */
      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));

    } else if (cols_remaining > 0) {
      /* To prevent buffer overread by the vector load instructions, the last
       * (image_width % 16) columns of data (at most 8 in this branch) are
       * first memcopied to a temporary buffer large enough to accommodate the
       * vector load.  Lanes beyond cols_remaining hold whatever tmp_buf
       * already contained, and the samples computed from them are written
       * into the permitted overwrite region of the output rows.
       */
      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
      inptr = tmp_buf;

      /* De-interleave 8 pixels into one 64-bit vector per color channel. */
#if RGB_PIXELSIZE == 4
      uint8x8x4_t input_pixels = vld4_u8(inptr);
#else
      uint8x8x3_t input_pixels = vld3_u8(inptr);
#endif
      uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
      uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
      uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);

      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
      uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
      y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
      y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
      uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
      y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
      y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);

      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
      uint32x4_t cb_l = scaled_128_5;
      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
      cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
      uint32x4_t cb_h = scaled_128_5;
      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
      cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);

      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
      uint32x4_t cr_l = scaled_128_5;
      cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
      uint32x4_t cr_h = scaled_128_5;
      cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);

      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
      uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
                                      vrshrn_n_u32(y_h, 16));
      /* Descale Cb values (right shift) and narrow to 16-bit. */
      uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
                                       vshrn_n_u32(cb_h, 16));
      /* Descale Cr values (right shift) and narrow to 16-bit. */
      uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
                                       vshrn_n_u32(cr_h, 16));
      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
       */
      vst1_u8(outptr0, vmovn_u16(y_u16));
      vst1_u8(outptr1, vmovn_u16(cb_u16));
      vst1_u8(outptr2, vmovn_u16(cr_u16));
    }
  }
}