xref: /aosp_15_r20/external/libjpeg-turbo/simd/arm/jquanti-neon.c (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
/*
 * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
 *
 * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
22*dfc6aa5cSAndroid Build Coastguard Worker 
23*dfc6aa5cSAndroid Build Coastguard Worker #define JPEG_INTERNALS
24*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jinclude.h"
25*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jpeglib.h"
26*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jsimd.h"
27*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jdct.h"
28*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jsimddct.h"
29*dfc6aa5cSAndroid Build Coastguard Worker #include "../jsimd.h"
30*dfc6aa5cSAndroid Build Coastguard Worker 
31*dfc6aa5cSAndroid Build Coastguard Worker #include <arm_neon.h>
32*dfc6aa5cSAndroid Build Coastguard Worker 
33*dfc6aa5cSAndroid Build Coastguard Worker 
34*dfc6aa5cSAndroid Build Coastguard Worker /* After downsampling, the resulting sample values are in the range [0, 255],
35*dfc6aa5cSAndroid Build Coastguard Worker  * but the Discrete Cosine Transform (DCT) operates on values centered around
36*dfc6aa5cSAndroid Build Coastguard Worker  * 0.
37*dfc6aa5cSAndroid Build Coastguard Worker  *
38*dfc6aa5cSAndroid Build Coastguard Worker  * To prepare sample values for the DCT, load samples into a DCT workspace,
39*dfc6aa5cSAndroid Build Coastguard Worker  * subtracting CENTERJSAMPLE (128).  The samples, now in the range [-128, 127],
40*dfc6aa5cSAndroid Build Coastguard Worker  * are also widened from 8- to 16-bit.
41*dfc6aa5cSAndroid Build Coastguard Worker  *
42*dfc6aa5cSAndroid Build Coastguard Worker  * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
43*dfc6aa5cSAndroid Build Coastguard Worker  */
44*dfc6aa5cSAndroid Build Coastguard Worker 
jsimd_convsamp_neon(JSAMPARRAY sample_data,JDIMENSION start_col,DCTELEM * workspace)45*dfc6aa5cSAndroid Build Coastguard Worker void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
46*dfc6aa5cSAndroid Build Coastguard Worker                          DCTELEM *workspace)
47*dfc6aa5cSAndroid Build Coastguard Worker {
48*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
49*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
50*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
51*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
52*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
53*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
54*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
55*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
56*dfc6aa5cSAndroid Build Coastguard Worker 
57*dfc6aa5cSAndroid Build Coastguard Worker   int16x8_t row0 =
58*dfc6aa5cSAndroid Build Coastguard Worker     vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
59*dfc6aa5cSAndroid Build Coastguard Worker   int16x8_t row1 =
60*dfc6aa5cSAndroid Build Coastguard Worker     vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
61*dfc6aa5cSAndroid Build Coastguard Worker   int16x8_t row2 =
62*dfc6aa5cSAndroid Build Coastguard Worker     vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
63*dfc6aa5cSAndroid Build Coastguard Worker   int16x8_t row3 =
64*dfc6aa5cSAndroid Build Coastguard Worker     vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
65*dfc6aa5cSAndroid Build Coastguard Worker   int16x8_t row4 =
66*dfc6aa5cSAndroid Build Coastguard Worker     vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
67*dfc6aa5cSAndroid Build Coastguard Worker   int16x8_t row5 =
68*dfc6aa5cSAndroid Build Coastguard Worker     vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
69*dfc6aa5cSAndroid Build Coastguard Worker   int16x8_t row6 =
70*dfc6aa5cSAndroid Build Coastguard Worker     vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
71*dfc6aa5cSAndroid Build Coastguard Worker   int16x8_t row7 =
72*dfc6aa5cSAndroid Build Coastguard Worker     vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
73*dfc6aa5cSAndroid Build Coastguard Worker 
74*dfc6aa5cSAndroid Build Coastguard Worker   vst1q_s16(workspace + 0 * DCTSIZE, row0);
75*dfc6aa5cSAndroid Build Coastguard Worker   vst1q_s16(workspace + 1 * DCTSIZE, row1);
76*dfc6aa5cSAndroid Build Coastguard Worker   vst1q_s16(workspace + 2 * DCTSIZE, row2);
77*dfc6aa5cSAndroid Build Coastguard Worker   vst1q_s16(workspace + 3 * DCTSIZE, row3);
78*dfc6aa5cSAndroid Build Coastguard Worker   vst1q_s16(workspace + 4 * DCTSIZE, row4);
79*dfc6aa5cSAndroid Build Coastguard Worker   vst1q_s16(workspace + 5 * DCTSIZE, row5);
80*dfc6aa5cSAndroid Build Coastguard Worker   vst1q_s16(workspace + 6 * DCTSIZE, row6);
81*dfc6aa5cSAndroid Build Coastguard Worker   vst1q_s16(workspace + 7 * DCTSIZE, row7);
82*dfc6aa5cSAndroid Build Coastguard Worker }
83*dfc6aa5cSAndroid Build Coastguard Worker 
84*dfc6aa5cSAndroid Build Coastguard Worker 
85*dfc6aa5cSAndroid Build Coastguard Worker /* After the DCT, the resulting array of coefficient values needs to be divided
86*dfc6aa5cSAndroid Build Coastguard Worker  * by an array of quantization values.
87*dfc6aa5cSAndroid Build Coastguard Worker  *
88*dfc6aa5cSAndroid Build Coastguard Worker  * To avoid a slow division operation, the DCT coefficients are multiplied by
89*dfc6aa5cSAndroid Build Coastguard Worker  * the (scaled) reciprocals of the quantization values and then right-shifted.
90*dfc6aa5cSAndroid Build Coastguard Worker  *
91*dfc6aa5cSAndroid Build Coastguard Worker  * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
92*dfc6aa5cSAndroid Build Coastguard Worker  */
93*dfc6aa5cSAndroid Build Coastguard Worker 
jsimd_quantize_neon(JCOEFPTR coef_block,DCTELEM * divisors,DCTELEM * workspace)94*dfc6aa5cSAndroid Build Coastguard Worker void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
95*dfc6aa5cSAndroid Build Coastguard Worker                          DCTELEM *workspace)
96*dfc6aa5cSAndroid Build Coastguard Worker {
97*dfc6aa5cSAndroid Build Coastguard Worker   JCOEFPTR out_ptr = coef_block;
98*dfc6aa5cSAndroid Build Coastguard Worker   UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
99*dfc6aa5cSAndroid Build Coastguard Worker   UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
100*dfc6aa5cSAndroid Build Coastguard Worker   DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
101*dfc6aa5cSAndroid Build Coastguard Worker   int i;
102*dfc6aa5cSAndroid Build Coastguard Worker 
103*dfc6aa5cSAndroid Build Coastguard Worker #if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
104*dfc6aa5cSAndroid Build Coastguard Worker #pragma unroll
105*dfc6aa5cSAndroid Build Coastguard Worker #endif
106*dfc6aa5cSAndroid Build Coastguard Worker   for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
107*dfc6aa5cSAndroid Build Coastguard Worker     /* Load DCT coefficients. */
108*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
109*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
110*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
111*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
112*dfc6aa5cSAndroid Build Coastguard Worker     /* Load reciprocals of quantization values. */
113*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
114*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
115*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
116*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
117*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
118*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
119*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
120*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
121*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
122*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
123*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
124*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
125*dfc6aa5cSAndroid Build Coastguard Worker 
126*dfc6aa5cSAndroid Build Coastguard Worker     /* Extract sign from coefficients. */
127*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
128*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
129*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
130*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
131*dfc6aa5cSAndroid Build Coastguard Worker     /* Get absolute value of DCT coefficients. */
132*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
133*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
134*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
135*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
136*dfc6aa5cSAndroid Build Coastguard Worker     /* Add correction. */
137*dfc6aa5cSAndroid Build Coastguard Worker     abs_row0 = vaddq_u16(abs_row0, corr0);
138*dfc6aa5cSAndroid Build Coastguard Worker     abs_row1 = vaddq_u16(abs_row1, corr1);
139*dfc6aa5cSAndroid Build Coastguard Worker     abs_row2 = vaddq_u16(abs_row2, corr2);
140*dfc6aa5cSAndroid Build Coastguard Worker     abs_row3 = vaddq_u16(abs_row3, corr3);
141*dfc6aa5cSAndroid Build Coastguard Worker 
142*dfc6aa5cSAndroid Build Coastguard Worker     /* Multiply DCT coefficients by quantization reciprocals. */
143*dfc6aa5cSAndroid Build Coastguard Worker     int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
144*dfc6aa5cSAndroid Build Coastguard Worker                                                        vget_low_u16(recip0)));
145*dfc6aa5cSAndroid Build Coastguard Worker     int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
146*dfc6aa5cSAndroid Build Coastguard Worker                                                        vget_high_u16(recip0)));
147*dfc6aa5cSAndroid Build Coastguard Worker     int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
148*dfc6aa5cSAndroid Build Coastguard Worker                                                        vget_low_u16(recip1)));
149*dfc6aa5cSAndroid Build Coastguard Worker     int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
150*dfc6aa5cSAndroid Build Coastguard Worker                                                        vget_high_u16(recip1)));
151*dfc6aa5cSAndroid Build Coastguard Worker     int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
152*dfc6aa5cSAndroid Build Coastguard Worker                                                        vget_low_u16(recip2)));
153*dfc6aa5cSAndroid Build Coastguard Worker     int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
154*dfc6aa5cSAndroid Build Coastguard Worker                                                        vget_high_u16(recip2)));
155*dfc6aa5cSAndroid Build Coastguard Worker     int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
156*dfc6aa5cSAndroid Build Coastguard Worker                                                        vget_low_u16(recip3)));
157*dfc6aa5cSAndroid Build Coastguard Worker     int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
158*dfc6aa5cSAndroid Build Coastguard Worker                                                        vget_high_u16(recip3)));
159*dfc6aa5cSAndroid Build Coastguard Worker     /* Narrow back to 16-bit. */
160*dfc6aa5cSAndroid Build Coastguard Worker     row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
161*dfc6aa5cSAndroid Build Coastguard Worker     row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
162*dfc6aa5cSAndroid Build Coastguard Worker     row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
163*dfc6aa5cSAndroid Build Coastguard Worker     row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
164*dfc6aa5cSAndroid Build Coastguard Worker 
165*dfc6aa5cSAndroid Build Coastguard Worker     /* Since VSHR only supports an immediate as its second argument, negate the
166*dfc6aa5cSAndroid Build Coastguard Worker      * shift value and shift left.
167*dfc6aa5cSAndroid Build Coastguard Worker      */
168*dfc6aa5cSAndroid Build Coastguard Worker     row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
169*dfc6aa5cSAndroid Build Coastguard Worker                                            vnegq_s16(shift0)));
170*dfc6aa5cSAndroid Build Coastguard Worker     row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
171*dfc6aa5cSAndroid Build Coastguard Worker                                            vnegq_s16(shift1)));
172*dfc6aa5cSAndroid Build Coastguard Worker     row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
173*dfc6aa5cSAndroid Build Coastguard Worker                                            vnegq_s16(shift2)));
174*dfc6aa5cSAndroid Build Coastguard Worker     row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
175*dfc6aa5cSAndroid Build Coastguard Worker                                            vnegq_s16(shift3)));
176*dfc6aa5cSAndroid Build Coastguard Worker 
177*dfc6aa5cSAndroid Build Coastguard Worker     /* Restore sign to original product. */
178*dfc6aa5cSAndroid Build Coastguard Worker     row0 = veorq_s16(row0, sign_row0);
179*dfc6aa5cSAndroid Build Coastguard Worker     row0 = vsubq_s16(row0, sign_row0);
180*dfc6aa5cSAndroid Build Coastguard Worker     row1 = veorq_s16(row1, sign_row1);
181*dfc6aa5cSAndroid Build Coastguard Worker     row1 = vsubq_s16(row1, sign_row1);
182*dfc6aa5cSAndroid Build Coastguard Worker     row2 = veorq_s16(row2, sign_row2);
183*dfc6aa5cSAndroid Build Coastguard Worker     row2 = vsubq_s16(row2, sign_row2);
184*dfc6aa5cSAndroid Build Coastguard Worker     row3 = veorq_s16(row3, sign_row3);
185*dfc6aa5cSAndroid Build Coastguard Worker     row3 = vsubq_s16(row3, sign_row3);
186*dfc6aa5cSAndroid Build Coastguard Worker 
187*dfc6aa5cSAndroid Build Coastguard Worker     /* Store quantized coefficients to memory. */
188*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
189*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
190*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
191*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
192*dfc6aa5cSAndroid Build Coastguard Worker   }
193*dfc6aa5cSAndroid Build Coastguard Worker }
194