1*dfc6aa5cSAndroid Build Coastguard Worker /*
2*dfc6aa5cSAndroid Build Coastguard Worker * jidctint-neon.c - accurate integer IDCT (Arm Neon)
3*dfc6aa5cSAndroid Build Coastguard Worker *
4*dfc6aa5cSAndroid Build Coastguard Worker * Copyright (C) 2020, Arm Limited. All Rights Reserved.
5*dfc6aa5cSAndroid Build Coastguard Worker * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
6*dfc6aa5cSAndroid Build Coastguard Worker *
7*dfc6aa5cSAndroid Build Coastguard Worker * This software is provided 'as-is', without any express or implied
8*dfc6aa5cSAndroid Build Coastguard Worker * warranty. In no event will the authors be held liable for any damages
9*dfc6aa5cSAndroid Build Coastguard Worker * arising from the use of this software.
10*dfc6aa5cSAndroid Build Coastguard Worker *
11*dfc6aa5cSAndroid Build Coastguard Worker * Permission is granted to anyone to use this software for any purpose,
12*dfc6aa5cSAndroid Build Coastguard Worker * including commercial applications, and to alter it and redistribute it
13*dfc6aa5cSAndroid Build Coastguard Worker * freely, subject to the following restrictions:
14*dfc6aa5cSAndroid Build Coastguard Worker *
15*dfc6aa5cSAndroid Build Coastguard Worker * 1. The origin of this software must not be misrepresented; you must not
16*dfc6aa5cSAndroid Build Coastguard Worker * claim that you wrote the original software. If you use this software
17*dfc6aa5cSAndroid Build Coastguard Worker * in a product, an acknowledgment in the product documentation would be
18*dfc6aa5cSAndroid Build Coastguard Worker * appreciated but is not required.
19*dfc6aa5cSAndroid Build Coastguard Worker * 2. Altered source versions must be plainly marked as such, and must not be
20*dfc6aa5cSAndroid Build Coastguard Worker * misrepresented as being the original software.
21*dfc6aa5cSAndroid Build Coastguard Worker * 3. This notice may not be removed or altered from any source distribution.
22*dfc6aa5cSAndroid Build Coastguard Worker */
23*dfc6aa5cSAndroid Build Coastguard Worker
24*dfc6aa5cSAndroid Build Coastguard Worker #define JPEG_INTERNALS
25*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jinclude.h"
26*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jpeglib.h"
27*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jsimd.h"
28*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jdct.h"
29*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jsimddct.h"
30*dfc6aa5cSAndroid Build Coastguard Worker #include "../jsimd.h"
31*dfc6aa5cSAndroid Build Coastguard Worker #include "align.h"
32*dfc6aa5cSAndroid Build Coastguard Worker #include "neon-compat.h"
33*dfc6aa5cSAndroid Build Coastguard Worker
34*dfc6aa5cSAndroid Build Coastguard Worker #include <arm_neon.h>
35*dfc6aa5cSAndroid Build Coastguard Worker
36*dfc6aa5cSAndroid Build Coastguard Worker
/* Fixed-point scaling parameters.
 *
 * CONST_BITS is the number of fractional bits carried by the F_* multipliers
 * (each is a real-valued factor scaled by 2^CONST_BITS).  PASS1_BITS is the
 * additional scaling applied to first-pass results.
 */
#define CONST_BITS  13
#define PASS1_BITS  2

/* Shift amounts derived from the two parameters above (11 and 18). */
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
42*dfc6aa5cSAndroid Build Coastguard Worker
/* The inverse DCT needs a set of multipliers that are known at compile time.
 * To avoid floating-point arithmetic they are stored as scaled integers: each
 * F_x_yyy below equals the real-valued factor shown in its trailing comment
 * multiplied by 2^13 and rounded to the nearest integer.
 */

#define F_0_298  2446           /* 0.298631336 =  2446 * 2^-13 */
#define F_0_390  3196           /* 0.390180644 =  3196 * 2^-13 */
#define F_0_541  4433           /* 0.541196100 =  4433 * 2^-13 */
#define F_0_765  6270           /* 0.765366865 =  6270 * 2^-13 */
#define F_0_899  7373           /* 0.899976223 =  7373 * 2^-13 */
#define F_1_175  9633           /* 1.175875602 =  9633 * 2^-13 */
#define F_1_501  12299          /* 1.501321110 = 12299 * 2^-13 */
#define F_1_847  15137          /* 1.847759065 = 15137 * 2^-13 */
#define F_1_961  16069          /* 1.961570560 = 16069 * 2^-13 */
#define F_2_053  16819          /* 2.053119869 = 16819 * 2^-13 */
#define F_2_562  20995          /* 2.562915447 = 20995 * 2^-13 */
#define F_3_072  25172          /* 3.072711026 = 25172 * 2^-13 */

/* Sums and differences of the basic multipliers, folded at compile time. */
#define F_1_175_MINUS_1_961  (F_1_175 - F_1_961)        /*  -6436 */
#define F_1_175_MINUS_0_390  (F_1_175 - F_0_390)        /*   6437 */
#define F_0_541_MINUS_1_847  (F_0_541 - F_1_847)        /* -10704 */
#define F_3_072_MINUS_2_562  (F_3_072 - F_2_562)        /*   4177 */
#define F_0_298_MINUS_0_899  (F_0_298 - F_0_899)        /*  -4927 */
#define F_1_501_MINUS_0_899  (F_1_501 - F_0_899)        /*   4926 */
#define F_2_053_MINUS_2_562  (F_2_053 - F_2_562)        /*  -4176 */
#define F_0_541_PLUS_0_765   (F_0_541 + F_0_765)        /*  10703 */
82*dfc6aa5cSAndroid Build Coastguard Worker
83*dfc6aa5cSAndroid Build Coastguard Worker ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
84*dfc6aa5cSAndroid Build Coastguard Worker F_0_899, F_0_541,
85*dfc6aa5cSAndroid Build Coastguard Worker F_2_562, F_0_298_MINUS_0_899,
86*dfc6aa5cSAndroid Build Coastguard Worker F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
87*dfc6aa5cSAndroid Build Coastguard Worker F_0_541_PLUS_0_765, F_1_175,
88*dfc6aa5cSAndroid Build Coastguard Worker F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
89*dfc6aa5cSAndroid Build Coastguard Worker F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
90*dfc6aa5cSAndroid Build Coastguard Worker 0, 0, 0, 0
91*dfc6aa5cSAndroid Build Coastguard Worker };
92*dfc6aa5cSAndroid Build Coastguard Worker
93*dfc6aa5cSAndroid Build Coastguard Worker
94*dfc6aa5cSAndroid Build Coastguard Worker /* Forward declaration of regular and sparse IDCT helper functions */
95*dfc6aa5cSAndroid Build Coastguard Worker
96*dfc6aa5cSAndroid Build Coastguard Worker static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
97*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row1,
98*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row2,
99*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row3,
100*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row4,
101*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row5,
102*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row6,
103*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row7,
104*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row0,
105*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row1,
106*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row2,
107*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row3,
108*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row4,
109*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row5,
110*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row6,
111*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row7,
112*dfc6aa5cSAndroid Build Coastguard Worker int16_t *workspace_1,
113*dfc6aa5cSAndroid Build Coastguard Worker int16_t *workspace_2);
114*dfc6aa5cSAndroid Build Coastguard Worker
115*dfc6aa5cSAndroid Build Coastguard Worker static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
116*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row1,
117*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row2,
118*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row3,
119*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row0,
120*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row1,
121*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row2,
122*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row3,
123*dfc6aa5cSAndroid Build Coastguard Worker int16_t *workspace_1,
124*dfc6aa5cSAndroid Build Coastguard Worker int16_t *workspace_2);
125*dfc6aa5cSAndroid Build Coastguard Worker
126*dfc6aa5cSAndroid Build Coastguard Worker static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
127*dfc6aa5cSAndroid Build Coastguard Worker JSAMPARRAY output_buf,
128*dfc6aa5cSAndroid Build Coastguard Worker JDIMENSION output_col,
129*dfc6aa5cSAndroid Build Coastguard Worker unsigned buf_offset);
130*dfc6aa5cSAndroid Build Coastguard Worker
131*dfc6aa5cSAndroid Build Coastguard Worker static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
132*dfc6aa5cSAndroid Build Coastguard Worker JSAMPARRAY output_buf,
133*dfc6aa5cSAndroid Build Coastguard Worker JDIMENSION output_col,
134*dfc6aa5cSAndroid Build Coastguard Worker unsigned buf_offset);
135*dfc6aa5cSAndroid Build Coastguard Worker
136*dfc6aa5cSAndroid Build Coastguard Worker
/* Perform dequantization and inverse DCT on one block of coefficients.  For
 * reference, the C implementation (jpeg_idct_islow()) can be found in
 * jidctint.c.
 *
 * Optimization techniques used for fast data access:
 *
 * In each pass, the inverse DCT is computed for the left and right 4x8 halves
 * of the DCT block.  This avoids spilling due to register pressure, and the
 * increased granularity allows for an optimized calculation depending on the
 * values of the DCT coefficients.  Between passes, intermediate data is stored
 * in 4x8 workspace buffers.
 *
 * Transposing the 8x8 DCT block after each pass can be achieved by transposing
 * each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
 * diagram below.)  Swapping quadrants is cheap, since the second pass can just
 * swap the workspace buffer pointers.
 *
 *      +-------+-------+                   +-------+-------+
 *      |       |       |                   |       |       |
 *      |   0   |   1   |                   |   0   |   2   |
 *      |       |       |    transpose      |       |       |
 *      +-------+-------+     ------>       +-------+-------+
 *      |       |       |                   |       |       |
 *      |   2   |   3   |                   |   1   |   3   |
 *      |       |       |                   |       |       |
 *      +-------+-------+                   +-------+-------+
 *
 * Optimization techniques used to accelerate the inverse DCT calculation:
 *
 * In a DCT coefficient block, the coefficients are increasingly likely to be 0
 * as you move diagonally from top left to bottom right.  If whole rows of
 * coefficients are 0, then the inverse DCT calculation can be simplified.  On
 * the first pass of the inverse DCT, we test for three special cases before
 * defaulting to a full "regular" inverse DCT:
 *
 * 1) Coefficients in rows 4-7 are all zero.  In this case, we perform a
 *    "sparse" simplified inverse DCT on rows 0-3.
 * 2) AC coefficients (rows 1-7) are all zero.  In this case, the inverse DCT
 *    result is equal to the dequantized DC coefficients.
 * 3) AC and DC coefficients are all zero.  In this case, the inverse DCT
 *    result is all zero.  For the left 4x8 half, this is handled identically
 *    to Case 2 above.  For the right 4x8 half, we do no work and signal that
 *    the "sparse" algorithm is required for the second pass.
 *
 * In the second pass, only a single special case is tested: whether the AC and
 * DC coefficients were all zero in the right 4x8 block during the first pass
 * (refer to Case 3 above.)  If this is the case, then a "sparse" variant of
 * the second pass is performed for both the left and right halves of the DCT
 * block.  (The transposition after the first pass means that the right 4x8
 * block during the first pass becomes rows 4-7 during the second pass.)
 */
188*dfc6aa5cSAndroid Build Coastguard Worker
void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
                           JSAMPARRAY output_buf, JDIMENSION output_col)
{
  ISLOW_MULT_TYPE *quantptr = dct_table;

  /* First-pass results.  The left-half step writes at the start of both
   * buffers; the right-half step writes at offset 4 * DCTSIZE / 2.
   */
  int16_t workspace_l[8 * DCTSIZE / 2];
  int16_t workspace_r[8 * DCTSIZE / 2];
  int16_t *const workspace_l_hi = workspace_l + 4 * DCTSIZE / 2;
  int16_t *const workspace_r_hi = workspace_r + 4 * DCTSIZE / 2;

  /* Non-zero selects the "regular" second pass.  It is only overwritten (and
   * can only become zero) when the right 4x8 half has no non-zero AC
   * coefficients; it then holds the OR of all coefficients in that half.
   */
  int64_t right_half_bits = 1;

  /* ---- First pass, left 4x8 half ---- */
  {
    /* DCT coefficients, columns 0-3 of each row. */
    int16x4_t row0 = vld1_s16(&coef_block[0 * DCTSIZE]);
    int16x4_t row1 = vld1_s16(&coef_block[1 * DCTSIZE]);
    int16x4_t row2 = vld1_s16(&coef_block[2 * DCTSIZE]);
    int16x4_t row3 = vld1_s16(&coef_block[3 * DCTSIZE]);
    int16x4_t row4 = vld1_s16(&coef_block[4 * DCTSIZE]);
    int16x4_t row5 = vld1_s16(&coef_block[5 * DCTSIZE]);
    int16x4_t row6 = vld1_s16(&coef_block[6 * DCTSIZE]);
    int16x4_t row7 = vld1_s16(&coef_block[7 * DCTSIZE]);

    /* Matching quantization table entries. */
    int16x4_t quant_row0 = vld1_s16(&quantptr[0 * DCTSIZE]);
    int16x4_t quant_row1 = vld1_s16(&quantptr[1 * DCTSIZE]);
    int16x4_t quant_row2 = vld1_s16(&quantptr[2 * DCTSIZE]);
    int16x4_t quant_row3 = vld1_s16(&quantptr[3 * DCTSIZE]);
    int16x4_t quant_row4 = vld1_s16(&quantptr[4 * DCTSIZE]);
    int16x4_t quant_row5 = vld1_s16(&quantptr[5 * DCTSIZE]);
    int16x4_t quant_row6 = vld1_s16(&quantptr[6 * DCTSIZE]);
    int16x4_t quant_row7 = vld1_s16(&quantptr[7 * DCTSIZE]);

    /* OR the rows together; the 64-bit view of the result is zero only if
     * every coefficient that contributed is zero.
     */
    int16x4_t any_4567 = vorr_s16(row7, row6);
    any_4567 = vorr_s16(any_4567, row5);
    any_4567 = vorr_s16(any_4567, row4);

    if (vget_lane_s64(vreinterpret_s64_s16(any_4567), 0) != 0) {
      /* Something in rows 4-7 is non-zero: full computation. */
      jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
                                     row6, row7, quant_row0, quant_row1,
                                     quant_row2, quant_row3, quant_row4,
                                     quant_row5, quant_row6, quant_row7,
                                     workspace_l, workspace_r);
    } else {
      int16x4_t any_ac = vorr_s16(any_4567, row3);
      any_ac = vorr_s16(any_ac, row2);
      any_ac = vorr_s16(any_ac, row1);

      if (vget_lane_s64(vreinterpret_s64_s16(any_ac), 0) != 0) {
        /* Rows 4-7 are zero but rows 1-3 are not. */
        jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
                                      quant_row1, quant_row2, quant_row3,
                                      workspace_l, workspace_r);
      } else {
        /* Rows 1-7 are zero: the result is the dequantized, scaled DC row
         * replicated everywhere.
         */
        int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
        int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
        /* Store 4x4 blocks to workspace, transposing in the process. */
        vst4_s16(workspace_l, quadrant);
        vst4_s16(workspace_r, quadrant);
      }
    }
  }

  /* ---- First pass, right 4x8 half ---- */
  {
    /* DCT coefficients, columns 4-7 of each row. */
    int16x4_t row0 = vld1_s16(&coef_block[0 * DCTSIZE + 4]);
    int16x4_t row1 = vld1_s16(&coef_block[1 * DCTSIZE + 4]);
    int16x4_t row2 = vld1_s16(&coef_block[2 * DCTSIZE + 4]);
    int16x4_t row3 = vld1_s16(&coef_block[3 * DCTSIZE + 4]);
    int16x4_t row4 = vld1_s16(&coef_block[4 * DCTSIZE + 4]);
    int16x4_t row5 = vld1_s16(&coef_block[5 * DCTSIZE + 4]);
    int16x4_t row6 = vld1_s16(&coef_block[6 * DCTSIZE + 4]);
    int16x4_t row7 = vld1_s16(&coef_block[7 * DCTSIZE + 4]);

    /* Matching quantization table entries. */
    int16x4_t quant_row0 = vld1_s16(&quantptr[0 * DCTSIZE + 4]);
    int16x4_t quant_row1 = vld1_s16(&quantptr[1 * DCTSIZE + 4]);
    int16x4_t quant_row2 = vld1_s16(&quantptr[2 * DCTSIZE + 4]);
    int16x4_t quant_row3 = vld1_s16(&quantptr[3 * DCTSIZE + 4]);
    int16x4_t quant_row4 = vld1_s16(&quantptr[4 * DCTSIZE + 4]);
    int16x4_t quant_row5 = vld1_s16(&quantptr[5 * DCTSIZE + 4]);
    int16x4_t quant_row6 = vld1_s16(&quantptr[6 * DCTSIZE + 4]);
    int16x4_t quant_row7 = vld1_s16(&quantptr[7 * DCTSIZE + 4]);

    /* OR of rows 4-7, then extended to cover all AC rows (1-7). */
    int16x4_t any_4567 = vorr_s16(row7, row6);
    any_4567 = vorr_s16(any_4567, row5);
    any_4567 = vorr_s16(any_4567, row4);

    int16x4_t any_ac = vorr_s16(any_4567, row3);
    any_ac = vorr_s16(any_ac, row2);
    any_ac = vorr_s16(any_ac, row1);

    if (vget_lane_s64(vreinterpret_s64_s16(any_ac), 0) != 0) {
      if (vget_lane_s64(vreinterpret_s64_s16(any_4567), 0) != 0) {
        jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
                                       row6, row7, quant_row0, quant_row1,
                                       quant_row2, quant_row3, quant_row4,
                                       quant_row5, quant_row6, quant_row7,
                                       workspace_l_hi, workspace_r_hi);
      } else {
        jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
                                      quant_row1, quant_row2, quant_row3,
                                      workspace_l_hi, workspace_r_hi);
      }
    } else {
      /* No AC coefficients.  Fold in the DC row: if that is zero as well,
       * nothing is stored and the "sparse" second pass is selected.
       */
      int16x4_t any_coef = vorr_s16(any_ac, row0);
      right_half_bits = vget_lane_s64(vreinterpret_s64_s16(any_coef), 0);

      if (right_half_bits != 0) {
        int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
        int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
        /* Store 4x4 blocks to workspace, transposing in the process. */
        vst4_s16(workspace_l_hi, quadrant);
        vst4_s16(workspace_r_hi, quadrant);
      }
    }
  }

  /* ---- Second pass: compute IDCT on rows in workspace ---- */

  if (right_half_bits != 0) {
    jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
    jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
  } else {
    /* All coefficients in the right 4x8 block were 0. */
    jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
    jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
  }
}
323*dfc6aa5cSAndroid Build Coastguard Worker
324*dfc6aa5cSAndroid Build Coastguard Worker
/* Perform dequantization and the first pass of the accurate inverse DCT on a
 * 4x8 block of coefficients.  (To process the full 8x8 DCT block, this
 * function-- or some other optimized variant-- needs to be called for both the
 * left and right 4x8 blocks.)
 *
 * This "regular" version assumes that no optimization can be made to the IDCT
 * calculation, since no useful set of AC coefficients is all 0.
 *
 * The original C implementation of the accurate IDCT (jpeg_idct_islow()) can
 * be found in jidctint.c.  Algorithmic changes made here are documented
 * inline.
 */
336*dfc6aa5cSAndroid Build Coastguard Worker
jsimd_idct_islow_pass1_regular(int16x4_t row0,int16x4_t row1,int16x4_t row2,int16x4_t row3,int16x4_t row4,int16x4_t row5,int16x4_t row6,int16x4_t row7,int16x4_t quant_row0,int16x4_t quant_row1,int16x4_t quant_row2,int16x4_t quant_row3,int16x4_t quant_row4,int16x4_t quant_row5,int16x4_t quant_row6,int16x4_t quant_row7,int16_t * workspace_1,int16_t * workspace_2)337*dfc6aa5cSAndroid Build Coastguard Worker static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
338*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row1,
339*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row2,
340*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row3,
341*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row4,
342*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row5,
343*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row6,
344*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row7,
345*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row0,
346*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row1,
347*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row2,
348*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row3,
349*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row4,
350*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row5,
351*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row6,
352*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row7,
353*dfc6aa5cSAndroid Build Coastguard Worker int16_t *workspace_1,
354*dfc6aa5cSAndroid Build Coastguard Worker int16_t *workspace_2)
355*dfc6aa5cSAndroid Build Coastguard Worker {
356*dfc6aa5cSAndroid Build Coastguard Worker /* Load constants for IDCT computation. */
357*dfc6aa5cSAndroid Build Coastguard Worker #ifdef HAVE_VLD1_S16_X3
358*dfc6aa5cSAndroid Build Coastguard Worker const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
359*dfc6aa5cSAndroid Build Coastguard Worker #else
360*dfc6aa5cSAndroid Build Coastguard Worker const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
361*dfc6aa5cSAndroid Build Coastguard Worker const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
362*dfc6aa5cSAndroid Build Coastguard Worker const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
363*dfc6aa5cSAndroid Build Coastguard Worker const int16x4x3_t consts = { { consts1, consts2, consts3 } };
364*dfc6aa5cSAndroid Build Coastguard Worker #endif
365*dfc6aa5cSAndroid Build Coastguard Worker
366*dfc6aa5cSAndroid Build Coastguard Worker /* Even part */
367*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
368*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
369*dfc6aa5cSAndroid Build Coastguard Worker
370*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
371*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
372*dfc6aa5cSAndroid Build Coastguard Worker tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
373*dfc6aa5cSAndroid Build Coastguard Worker tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
374*dfc6aa5cSAndroid Build Coastguard Worker
375*dfc6aa5cSAndroid Build Coastguard Worker z2_s16 = vmul_s16(row0, quant_row0);
376*dfc6aa5cSAndroid Build Coastguard Worker z3_s16 = vmul_s16(row4, quant_row4);
377*dfc6aa5cSAndroid Build Coastguard Worker
378*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
379*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
380*dfc6aa5cSAndroid Build Coastguard Worker
381*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
382*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
383*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
384*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
385*dfc6aa5cSAndroid Build Coastguard Worker
386*dfc6aa5cSAndroid Build Coastguard Worker /* Odd part */
387*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
388*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
389*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
390*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
391*dfc6aa5cSAndroid Build Coastguard Worker
392*dfc6aa5cSAndroid Build Coastguard Worker z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
393*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
394*dfc6aa5cSAndroid Build Coastguard Worker
395*dfc6aa5cSAndroid Build Coastguard Worker /* Implementation as per jpeg_idct_islow() in jidctint.c:
396*dfc6aa5cSAndroid Build Coastguard Worker * z5 = (z3 + z4) * 1.175875602;
397*dfc6aa5cSAndroid Build Coastguard Worker * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
398*dfc6aa5cSAndroid Build Coastguard Worker * z3 += z5; z4 += z5;
399*dfc6aa5cSAndroid Build Coastguard Worker *
400*dfc6aa5cSAndroid Build Coastguard Worker * This implementation:
401*dfc6aa5cSAndroid Build Coastguard Worker * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
402*dfc6aa5cSAndroid Build Coastguard Worker * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
403*dfc6aa5cSAndroid Build Coastguard Worker */
404*dfc6aa5cSAndroid Build Coastguard Worker
405*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
406*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
407*dfc6aa5cSAndroid Build Coastguard Worker z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
408*dfc6aa5cSAndroid Build Coastguard Worker z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
409*dfc6aa5cSAndroid Build Coastguard Worker
410*dfc6aa5cSAndroid Build Coastguard Worker /* Implementation as per jpeg_idct_islow() in jidctint.c:
411*dfc6aa5cSAndroid Build Coastguard Worker * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
412*dfc6aa5cSAndroid Build Coastguard Worker * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
413*dfc6aa5cSAndroid Build Coastguard Worker * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
414*dfc6aa5cSAndroid Build Coastguard Worker * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
415*dfc6aa5cSAndroid Build Coastguard Worker * tmp0 += z1 + z3; tmp1 += z2 + z4;
416*dfc6aa5cSAndroid Build Coastguard Worker * tmp2 += z2 + z3; tmp3 += z1 + z4;
417*dfc6aa5cSAndroid Build Coastguard Worker *
418*dfc6aa5cSAndroid Build Coastguard Worker * This implementation:
419*dfc6aa5cSAndroid Build Coastguard Worker * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
420*dfc6aa5cSAndroid Build Coastguard Worker * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
421*dfc6aa5cSAndroid Build Coastguard Worker * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
422*dfc6aa5cSAndroid Build Coastguard Worker * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
423*dfc6aa5cSAndroid Build Coastguard Worker * tmp0 += z3; tmp1 += z4;
424*dfc6aa5cSAndroid Build Coastguard Worker * tmp2 += z3; tmp3 += z4;
425*dfc6aa5cSAndroid Build Coastguard Worker */
426*dfc6aa5cSAndroid Build Coastguard Worker
427*dfc6aa5cSAndroid Build Coastguard Worker tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
428*dfc6aa5cSAndroid Build Coastguard Worker tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
429*dfc6aa5cSAndroid Build Coastguard Worker tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
430*dfc6aa5cSAndroid Build Coastguard Worker tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
431*dfc6aa5cSAndroid Build Coastguard Worker
432*dfc6aa5cSAndroid Build Coastguard Worker tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
433*dfc6aa5cSAndroid Build Coastguard Worker tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
434*dfc6aa5cSAndroid Build Coastguard Worker tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
435*dfc6aa5cSAndroid Build Coastguard Worker tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
436*dfc6aa5cSAndroid Build Coastguard Worker
437*dfc6aa5cSAndroid Build Coastguard Worker tmp0 = vaddq_s32(tmp0, z3);
438*dfc6aa5cSAndroid Build Coastguard Worker tmp1 = vaddq_s32(tmp1, z4);
439*dfc6aa5cSAndroid Build Coastguard Worker tmp2 = vaddq_s32(tmp2, z3);
440*dfc6aa5cSAndroid Build Coastguard Worker tmp3 = vaddq_s32(tmp3, z4);
441*dfc6aa5cSAndroid Build Coastguard Worker
442*dfc6aa5cSAndroid Build Coastguard Worker /* Final output stage: descale and narrow to 16-bit. */
443*dfc6aa5cSAndroid Build Coastguard Worker int16x4x4_t rows_0123 = { {
444*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
445*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
446*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
447*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
448*dfc6aa5cSAndroid Build Coastguard Worker } };
449*dfc6aa5cSAndroid Build Coastguard Worker int16x4x4_t rows_4567 = { {
450*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
451*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
452*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
453*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
454*dfc6aa5cSAndroid Build Coastguard Worker } };
455*dfc6aa5cSAndroid Build Coastguard Worker
456*dfc6aa5cSAndroid Build Coastguard Worker /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
457*dfc6aa5cSAndroid Build Coastguard Worker * (VST4 transposes the blocks. We need to operate on rows in the next
458*dfc6aa5cSAndroid Build Coastguard Worker * pass.)
459*dfc6aa5cSAndroid Build Coastguard Worker */
460*dfc6aa5cSAndroid Build Coastguard Worker vst4_s16(workspace_1, rows_0123);
461*dfc6aa5cSAndroid Build Coastguard Worker vst4_s16(workspace_2, rows_4567);
462*dfc6aa5cSAndroid Build Coastguard Worker }
463*dfc6aa5cSAndroid Build Coastguard Worker
464*dfc6aa5cSAndroid Build Coastguard Worker
465*dfc6aa5cSAndroid Build Coastguard Worker /* Perform dequantization and the first pass of the accurate inverse DCT on a
466*dfc6aa5cSAndroid Build Coastguard Worker * 4x8 block of coefficients.
467*dfc6aa5cSAndroid Build Coastguard Worker *
468*dfc6aa5cSAndroid Build Coastguard Worker * This "sparse" version assumes that the AC coefficients in rows 4-7 are all
469*dfc6aa5cSAndroid Build Coastguard Worker * 0. This simplifies the IDCT calculation, accelerating overall performance.
470*dfc6aa5cSAndroid Build Coastguard Worker */
471*dfc6aa5cSAndroid Build Coastguard Worker
jsimd_idct_islow_pass1_sparse(int16x4_t row0,int16x4_t row1,int16x4_t row2,int16x4_t row3,int16x4_t quant_row0,int16x4_t quant_row1,int16x4_t quant_row2,int16x4_t quant_row3,int16_t * workspace_1,int16_t * workspace_2)472*dfc6aa5cSAndroid Build Coastguard Worker static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
473*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row1,
474*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row2,
475*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t row3,
476*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row0,
477*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row1,
478*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row2,
479*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t quant_row3,
480*dfc6aa5cSAndroid Build Coastguard Worker int16_t *workspace_1,
481*dfc6aa5cSAndroid Build Coastguard Worker int16_t *workspace_2)
482*dfc6aa5cSAndroid Build Coastguard Worker {
483*dfc6aa5cSAndroid Build Coastguard Worker /* Load constants for IDCT computation. */
484*dfc6aa5cSAndroid Build Coastguard Worker #ifdef HAVE_VLD1_S16_X3
485*dfc6aa5cSAndroid Build Coastguard Worker const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
486*dfc6aa5cSAndroid Build Coastguard Worker #else
487*dfc6aa5cSAndroid Build Coastguard Worker const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
488*dfc6aa5cSAndroid Build Coastguard Worker const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
489*dfc6aa5cSAndroid Build Coastguard Worker const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
490*dfc6aa5cSAndroid Build Coastguard Worker const int16x4x3_t consts = { { consts1, consts2, consts3 } };
491*dfc6aa5cSAndroid Build Coastguard Worker #endif
492*dfc6aa5cSAndroid Build Coastguard Worker
493*dfc6aa5cSAndroid Build Coastguard Worker /* Even part (z3 is all 0) */
494*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
495*dfc6aa5cSAndroid Build Coastguard Worker
496*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
497*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
498*dfc6aa5cSAndroid Build Coastguard Worker
499*dfc6aa5cSAndroid Build Coastguard Worker z2_s16 = vmul_s16(row0, quant_row0);
500*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
501*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
502*dfc6aa5cSAndroid Build Coastguard Worker
503*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
504*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
505*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
506*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
507*dfc6aa5cSAndroid Build Coastguard Worker
508*dfc6aa5cSAndroid Build Coastguard Worker /* Odd part (tmp0 and tmp1 are both all 0) */
509*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
510*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
511*dfc6aa5cSAndroid Build Coastguard Worker
512*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t z3_s16 = tmp2_s16;
513*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t z4_s16 = tmp3_s16;
514*dfc6aa5cSAndroid Build Coastguard Worker
515*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
516*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
517*dfc6aa5cSAndroid Build Coastguard Worker z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
518*dfc6aa5cSAndroid Build Coastguard Worker z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
519*dfc6aa5cSAndroid Build Coastguard Worker
520*dfc6aa5cSAndroid Build Coastguard Worker tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
521*dfc6aa5cSAndroid Build Coastguard Worker tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
522*dfc6aa5cSAndroid Build Coastguard Worker tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
523*dfc6aa5cSAndroid Build Coastguard Worker tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
524*dfc6aa5cSAndroid Build Coastguard Worker
525*dfc6aa5cSAndroid Build Coastguard Worker /* Final output stage: descale and narrow to 16-bit. */
526*dfc6aa5cSAndroid Build Coastguard Worker int16x4x4_t rows_0123 = { {
527*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
528*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
529*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
530*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
531*dfc6aa5cSAndroid Build Coastguard Worker } };
532*dfc6aa5cSAndroid Build Coastguard Worker int16x4x4_t rows_4567 = { {
533*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
534*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
535*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
536*dfc6aa5cSAndroid Build Coastguard Worker vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
537*dfc6aa5cSAndroid Build Coastguard Worker } };
538*dfc6aa5cSAndroid Build Coastguard Worker
539*dfc6aa5cSAndroid Build Coastguard Worker /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
540*dfc6aa5cSAndroid Build Coastguard Worker * (VST4 transposes the blocks. We need to operate on rows in the next
541*dfc6aa5cSAndroid Build Coastguard Worker * pass.)
542*dfc6aa5cSAndroid Build Coastguard Worker */
543*dfc6aa5cSAndroid Build Coastguard Worker vst4_s16(workspace_1, rows_0123);
544*dfc6aa5cSAndroid Build Coastguard Worker vst4_s16(workspace_2, rows_4567);
545*dfc6aa5cSAndroid Build Coastguard Worker }
546*dfc6aa5cSAndroid Build Coastguard Worker
547*dfc6aa5cSAndroid Build Coastguard Worker
548*dfc6aa5cSAndroid Build Coastguard Worker /* Perform the second pass of the accurate inverse DCT on a 4x8 block of
549*dfc6aa5cSAndroid Build Coastguard Worker * coefficients. (To process the full 8x8 DCT block, this function-- or some
550*dfc6aa5cSAndroid Build Coastguard Worker * other optimized variant-- needs to be called for both the right and left 4x8
551*dfc6aa5cSAndroid Build Coastguard Worker * blocks.)
552*dfc6aa5cSAndroid Build Coastguard Worker *
553*dfc6aa5cSAndroid Build Coastguard Worker * This "regular" version assumes that no optimization can be made to the IDCT
554*dfc6aa5cSAndroid Build Coastguard Worker * calculation, since no useful set of coefficient values are all 0 after the
555*dfc6aa5cSAndroid Build Coastguard Worker * first pass.
556*dfc6aa5cSAndroid Build Coastguard Worker *
557*dfc6aa5cSAndroid Build Coastguard Worker * Again, the original C implementation of the accurate IDCT (jpeg_idct_islow())
558*dfc6aa5cSAndroid Build Coastguard Worker * can be found in jidctint.c. Algorithmic changes made here are documented
559*dfc6aa5cSAndroid Build Coastguard Worker * inline.
560*dfc6aa5cSAndroid Build Coastguard Worker */
561*dfc6aa5cSAndroid Build Coastguard Worker
jsimd_idct_islow_pass2_regular(int16_t * workspace,JSAMPARRAY output_buf,JDIMENSION output_col,unsigned buf_offset)562*dfc6aa5cSAndroid Build Coastguard Worker static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
563*dfc6aa5cSAndroid Build Coastguard Worker JSAMPARRAY output_buf,
564*dfc6aa5cSAndroid Build Coastguard Worker JDIMENSION output_col,
565*dfc6aa5cSAndroid Build Coastguard Worker unsigned buf_offset)
566*dfc6aa5cSAndroid Build Coastguard Worker {
567*dfc6aa5cSAndroid Build Coastguard Worker /* Load constants for IDCT computation. */
568*dfc6aa5cSAndroid Build Coastguard Worker #ifdef HAVE_VLD1_S16_X3
569*dfc6aa5cSAndroid Build Coastguard Worker const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
570*dfc6aa5cSAndroid Build Coastguard Worker #else
571*dfc6aa5cSAndroid Build Coastguard Worker const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
572*dfc6aa5cSAndroid Build Coastguard Worker const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
573*dfc6aa5cSAndroid Build Coastguard Worker const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
574*dfc6aa5cSAndroid Build Coastguard Worker const int16x4x3_t consts = { { consts1, consts2, consts3 } };
575*dfc6aa5cSAndroid Build Coastguard Worker #endif
576*dfc6aa5cSAndroid Build Coastguard Worker
577*dfc6aa5cSAndroid Build Coastguard Worker /* Even part */
578*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
579*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
580*dfc6aa5cSAndroid Build Coastguard Worker
581*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
582*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
583*dfc6aa5cSAndroid Build Coastguard Worker tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
584*dfc6aa5cSAndroid Build Coastguard Worker tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
585*dfc6aa5cSAndroid Build Coastguard Worker
586*dfc6aa5cSAndroid Build Coastguard Worker z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
587*dfc6aa5cSAndroid Build Coastguard Worker z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
588*dfc6aa5cSAndroid Build Coastguard Worker
589*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
590*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
591*dfc6aa5cSAndroid Build Coastguard Worker
592*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
593*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
594*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
595*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
596*dfc6aa5cSAndroid Build Coastguard Worker
597*dfc6aa5cSAndroid Build Coastguard Worker /* Odd part */
598*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
599*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
600*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
601*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
602*dfc6aa5cSAndroid Build Coastguard Worker
603*dfc6aa5cSAndroid Build Coastguard Worker z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
604*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
605*dfc6aa5cSAndroid Build Coastguard Worker
606*dfc6aa5cSAndroid Build Coastguard Worker /* Implementation as per jpeg_idct_islow() in jidctint.c:
607*dfc6aa5cSAndroid Build Coastguard Worker * z5 = (z3 + z4) * 1.175875602;
608*dfc6aa5cSAndroid Build Coastguard Worker * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
609*dfc6aa5cSAndroid Build Coastguard Worker * z3 += z5; z4 += z5;
610*dfc6aa5cSAndroid Build Coastguard Worker *
611*dfc6aa5cSAndroid Build Coastguard Worker * This implementation:
612*dfc6aa5cSAndroid Build Coastguard Worker * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
613*dfc6aa5cSAndroid Build Coastguard Worker * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
614*dfc6aa5cSAndroid Build Coastguard Worker */
615*dfc6aa5cSAndroid Build Coastguard Worker
616*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
617*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
618*dfc6aa5cSAndroid Build Coastguard Worker z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
619*dfc6aa5cSAndroid Build Coastguard Worker z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
620*dfc6aa5cSAndroid Build Coastguard Worker
621*dfc6aa5cSAndroid Build Coastguard Worker /* Implementation as per jpeg_idct_islow() in jidctint.c:
622*dfc6aa5cSAndroid Build Coastguard Worker * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
623*dfc6aa5cSAndroid Build Coastguard Worker * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
624*dfc6aa5cSAndroid Build Coastguard Worker * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
625*dfc6aa5cSAndroid Build Coastguard Worker * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
626*dfc6aa5cSAndroid Build Coastguard Worker * tmp0 += z1 + z3; tmp1 += z2 + z4;
627*dfc6aa5cSAndroid Build Coastguard Worker * tmp2 += z2 + z3; tmp3 += z1 + z4;
628*dfc6aa5cSAndroid Build Coastguard Worker *
629*dfc6aa5cSAndroid Build Coastguard Worker * This implementation:
630*dfc6aa5cSAndroid Build Coastguard Worker * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
631*dfc6aa5cSAndroid Build Coastguard Worker * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
632*dfc6aa5cSAndroid Build Coastguard Worker * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
633*dfc6aa5cSAndroid Build Coastguard Worker * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
634*dfc6aa5cSAndroid Build Coastguard Worker * tmp0 += z3; tmp1 += z4;
635*dfc6aa5cSAndroid Build Coastguard Worker * tmp2 += z3; tmp3 += z4;
636*dfc6aa5cSAndroid Build Coastguard Worker */
637*dfc6aa5cSAndroid Build Coastguard Worker
638*dfc6aa5cSAndroid Build Coastguard Worker tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
639*dfc6aa5cSAndroid Build Coastguard Worker tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
640*dfc6aa5cSAndroid Build Coastguard Worker tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
641*dfc6aa5cSAndroid Build Coastguard Worker tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
642*dfc6aa5cSAndroid Build Coastguard Worker
643*dfc6aa5cSAndroid Build Coastguard Worker tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
644*dfc6aa5cSAndroid Build Coastguard Worker tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
645*dfc6aa5cSAndroid Build Coastguard Worker tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
646*dfc6aa5cSAndroid Build Coastguard Worker tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
647*dfc6aa5cSAndroid Build Coastguard Worker
648*dfc6aa5cSAndroid Build Coastguard Worker tmp0 = vaddq_s32(tmp0, z3);
649*dfc6aa5cSAndroid Build Coastguard Worker tmp1 = vaddq_s32(tmp1, z4);
650*dfc6aa5cSAndroid Build Coastguard Worker tmp2 = vaddq_s32(tmp2, z3);
651*dfc6aa5cSAndroid Build Coastguard Worker tmp3 = vaddq_s32(tmp3, z4);
652*dfc6aa5cSAndroid Build Coastguard Worker
653*dfc6aa5cSAndroid Build Coastguard Worker /* Final output stage: descale and narrow to 16-bit. */
654*dfc6aa5cSAndroid Build Coastguard Worker int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
655*dfc6aa5cSAndroid Build Coastguard Worker vaddhn_s32(tmp12, tmp1));
656*dfc6aa5cSAndroid Build Coastguard Worker int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
657*dfc6aa5cSAndroid Build Coastguard Worker vaddhn_s32(tmp13, tmp0));
658*dfc6aa5cSAndroid Build Coastguard Worker int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
659*dfc6aa5cSAndroid Build Coastguard Worker vsubhn_s32(tmp11, tmp2));
660*dfc6aa5cSAndroid Build Coastguard Worker int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
661*dfc6aa5cSAndroid Build Coastguard Worker vsubhn_s32(tmp10, tmp3));
662*dfc6aa5cSAndroid Build Coastguard Worker /* Descale and narrow to 8-bit. */
663*dfc6aa5cSAndroid Build Coastguard Worker int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
664*dfc6aa5cSAndroid Build Coastguard Worker int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
665*dfc6aa5cSAndroid Build Coastguard Worker int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
666*dfc6aa5cSAndroid Build Coastguard Worker int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
667*dfc6aa5cSAndroid Build Coastguard Worker /* Clamp to range [0-255]. */
668*dfc6aa5cSAndroid Build Coastguard Worker uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
669*dfc6aa5cSAndroid Build Coastguard Worker vdup_n_u8(CENTERJSAMPLE));
670*dfc6aa5cSAndroid Build Coastguard Worker uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
671*dfc6aa5cSAndroid Build Coastguard Worker vdup_n_u8(CENTERJSAMPLE));
672*dfc6aa5cSAndroid Build Coastguard Worker uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
673*dfc6aa5cSAndroid Build Coastguard Worker vdup_n_u8(CENTERJSAMPLE));
674*dfc6aa5cSAndroid Build Coastguard Worker uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
675*dfc6aa5cSAndroid Build Coastguard Worker vdup_n_u8(CENTERJSAMPLE));
676*dfc6aa5cSAndroid Build Coastguard Worker
677*dfc6aa5cSAndroid Build Coastguard Worker /* Transpose 4x8 block and store to memory. (Zipping adjacent columns
678*dfc6aa5cSAndroid Build Coastguard Worker * together allows us to store 16-bit elements.)
679*dfc6aa5cSAndroid Build Coastguard Worker */
680*dfc6aa5cSAndroid Build Coastguard Worker uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
681*dfc6aa5cSAndroid Build Coastguard Worker uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
682*dfc6aa5cSAndroid Build Coastguard Worker uint16x4x4_t cols_01_23_45_67 = { {
683*dfc6aa5cSAndroid Build Coastguard Worker vreinterpret_u16_u8(cols_01_23.val[0]),
684*dfc6aa5cSAndroid Build Coastguard Worker vreinterpret_u16_u8(cols_01_23.val[1]),
685*dfc6aa5cSAndroid Build Coastguard Worker vreinterpret_u16_u8(cols_45_67.val[0]),
686*dfc6aa5cSAndroid Build Coastguard Worker vreinterpret_u16_u8(cols_45_67.val[1])
687*dfc6aa5cSAndroid Build Coastguard Worker } };
688*dfc6aa5cSAndroid Build Coastguard Worker
689*dfc6aa5cSAndroid Build Coastguard Worker JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
690*dfc6aa5cSAndroid Build Coastguard Worker JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
691*dfc6aa5cSAndroid Build Coastguard Worker JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
692*dfc6aa5cSAndroid Build Coastguard Worker JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
693*dfc6aa5cSAndroid Build Coastguard Worker /* VST4 of 16-bit elements completes the transpose. */
694*dfc6aa5cSAndroid Build Coastguard Worker vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
695*dfc6aa5cSAndroid Build Coastguard Worker vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
696*dfc6aa5cSAndroid Build Coastguard Worker vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
697*dfc6aa5cSAndroid Build Coastguard Worker vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
698*dfc6aa5cSAndroid Build Coastguard Worker }
699*dfc6aa5cSAndroid Build Coastguard Worker
700*dfc6aa5cSAndroid Build Coastguard Worker
701*dfc6aa5cSAndroid Build Coastguard Worker /* Performs the second pass of the accurate inverse DCT on a 4x8 block
702*dfc6aa5cSAndroid Build Coastguard Worker * of coefficients.
703*dfc6aa5cSAndroid Build Coastguard Worker *
704*dfc6aa5cSAndroid Build Coastguard Worker * This "sparse" version assumes that the coefficient values (after the first
705*dfc6aa5cSAndroid Build Coastguard Worker * pass) in rows 4-7 are all 0. This simplifies the IDCT calculation,
706*dfc6aa5cSAndroid Build Coastguard Worker * accelerating overall performance.
707*dfc6aa5cSAndroid Build Coastguard Worker */
708*dfc6aa5cSAndroid Build Coastguard Worker
jsimd_idct_islow_pass2_sparse(int16_t * workspace,JSAMPARRAY output_buf,JDIMENSION output_col,unsigned buf_offset)709*dfc6aa5cSAndroid Build Coastguard Worker static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
710*dfc6aa5cSAndroid Build Coastguard Worker JSAMPARRAY output_buf,
711*dfc6aa5cSAndroid Build Coastguard Worker JDIMENSION output_col,
712*dfc6aa5cSAndroid Build Coastguard Worker unsigned buf_offset)
713*dfc6aa5cSAndroid Build Coastguard Worker {
714*dfc6aa5cSAndroid Build Coastguard Worker /* Load constants for IDCT computation. */
715*dfc6aa5cSAndroid Build Coastguard Worker #ifdef HAVE_VLD1_S16_X3
716*dfc6aa5cSAndroid Build Coastguard Worker const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
717*dfc6aa5cSAndroid Build Coastguard Worker #else
718*dfc6aa5cSAndroid Build Coastguard Worker const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
719*dfc6aa5cSAndroid Build Coastguard Worker const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
720*dfc6aa5cSAndroid Build Coastguard Worker const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
721*dfc6aa5cSAndroid Build Coastguard Worker const int16x4x3_t consts = { { consts1, consts2, consts3 } };
722*dfc6aa5cSAndroid Build Coastguard Worker #endif
723*dfc6aa5cSAndroid Build Coastguard Worker
724*dfc6aa5cSAndroid Build Coastguard Worker /* Even part (z3 is all 0) */
725*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
726*dfc6aa5cSAndroid Build Coastguard Worker
727*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
728*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
729*dfc6aa5cSAndroid Build Coastguard Worker
730*dfc6aa5cSAndroid Build Coastguard Worker z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
731*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
732*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
733*dfc6aa5cSAndroid Build Coastguard Worker
734*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
735*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
736*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
737*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
738*dfc6aa5cSAndroid Build Coastguard Worker
739*dfc6aa5cSAndroid Build Coastguard Worker /* Odd part (tmp0 and tmp1 are both all 0) */
740*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
741*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
742*dfc6aa5cSAndroid Build Coastguard Worker
743*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t z3_s16 = tmp2_s16;
744*dfc6aa5cSAndroid Build Coastguard Worker int16x4_t z4_s16 = tmp3_s16;
745*dfc6aa5cSAndroid Build Coastguard Worker
746*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
747*dfc6aa5cSAndroid Build Coastguard Worker z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
748*dfc6aa5cSAndroid Build Coastguard Worker int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
749*dfc6aa5cSAndroid Build Coastguard Worker z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
750*dfc6aa5cSAndroid Build Coastguard Worker
751*dfc6aa5cSAndroid Build Coastguard Worker tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
752*dfc6aa5cSAndroid Build Coastguard Worker tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
753*dfc6aa5cSAndroid Build Coastguard Worker tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
754*dfc6aa5cSAndroid Build Coastguard Worker tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
755*dfc6aa5cSAndroid Build Coastguard Worker
756*dfc6aa5cSAndroid Build Coastguard Worker /* Final output stage: descale and narrow to 16-bit. */
757*dfc6aa5cSAndroid Build Coastguard Worker int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
758*dfc6aa5cSAndroid Build Coastguard Worker vaddhn_s32(tmp12, tmp1));
759*dfc6aa5cSAndroid Build Coastguard Worker int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
760*dfc6aa5cSAndroid Build Coastguard Worker vaddhn_s32(tmp13, tmp0));
761*dfc6aa5cSAndroid Build Coastguard Worker int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
762*dfc6aa5cSAndroid Build Coastguard Worker vsubhn_s32(tmp11, tmp2));
763*dfc6aa5cSAndroid Build Coastguard Worker int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
764*dfc6aa5cSAndroid Build Coastguard Worker vsubhn_s32(tmp10, tmp3));
765*dfc6aa5cSAndroid Build Coastguard Worker /* Descale and narrow to 8-bit. */
766*dfc6aa5cSAndroid Build Coastguard Worker int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
767*dfc6aa5cSAndroid Build Coastguard Worker int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
768*dfc6aa5cSAndroid Build Coastguard Worker int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
769*dfc6aa5cSAndroid Build Coastguard Worker int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
770*dfc6aa5cSAndroid Build Coastguard Worker /* Clamp to range [0-255]. */
771*dfc6aa5cSAndroid Build Coastguard Worker uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
772*dfc6aa5cSAndroid Build Coastguard Worker vdup_n_u8(CENTERJSAMPLE));
773*dfc6aa5cSAndroid Build Coastguard Worker uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
774*dfc6aa5cSAndroid Build Coastguard Worker vdup_n_u8(CENTERJSAMPLE));
775*dfc6aa5cSAndroid Build Coastguard Worker uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
776*dfc6aa5cSAndroid Build Coastguard Worker vdup_n_u8(CENTERJSAMPLE));
777*dfc6aa5cSAndroid Build Coastguard Worker uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
778*dfc6aa5cSAndroid Build Coastguard Worker vdup_n_u8(CENTERJSAMPLE));
779*dfc6aa5cSAndroid Build Coastguard Worker
780*dfc6aa5cSAndroid Build Coastguard Worker /* Transpose 4x8 block and store to memory. (Zipping adjacent columns
781*dfc6aa5cSAndroid Build Coastguard Worker * together allows us to store 16-bit elements.)
782*dfc6aa5cSAndroid Build Coastguard Worker */
783*dfc6aa5cSAndroid Build Coastguard Worker uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
784*dfc6aa5cSAndroid Build Coastguard Worker uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
785*dfc6aa5cSAndroid Build Coastguard Worker uint16x4x4_t cols_01_23_45_67 = { {
786*dfc6aa5cSAndroid Build Coastguard Worker vreinterpret_u16_u8(cols_01_23.val[0]),
787*dfc6aa5cSAndroid Build Coastguard Worker vreinterpret_u16_u8(cols_01_23.val[1]),
788*dfc6aa5cSAndroid Build Coastguard Worker vreinterpret_u16_u8(cols_45_67.val[0]),
789*dfc6aa5cSAndroid Build Coastguard Worker vreinterpret_u16_u8(cols_45_67.val[1])
790*dfc6aa5cSAndroid Build Coastguard Worker } };
791*dfc6aa5cSAndroid Build Coastguard Worker
792*dfc6aa5cSAndroid Build Coastguard Worker JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
793*dfc6aa5cSAndroid Build Coastguard Worker JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
794*dfc6aa5cSAndroid Build Coastguard Worker JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
795*dfc6aa5cSAndroid Build Coastguard Worker JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
796*dfc6aa5cSAndroid Build Coastguard Worker /* VST4 of 16-bit elements completes the transpose. */
797*dfc6aa5cSAndroid Build Coastguard Worker vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
798*dfc6aa5cSAndroid Build Coastguard Worker vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
799*dfc6aa5cSAndroid Build Coastguard Worker vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
800*dfc6aa5cSAndroid Build Coastguard Worker vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
801*dfc6aa5cSAndroid Build Coastguard Worker }
802