xref: /aosp_15_r20/external/libjpeg-turbo/simd/arm/jcphuff-neon.c (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
/*
 * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
 *
 * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
 * Copyright (C) 2022, Matthieu Darbois.  All Rights Reserved.
 * Copyright (C) 2022, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
24*dfc6aa5cSAndroid Build Coastguard Worker 
25*dfc6aa5cSAndroid Build Coastguard Worker #define JPEG_INTERNALS
26*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jinclude.h"
27*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jpeglib.h"
28*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jsimd.h"
29*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jdct.h"
30*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jsimddct.h"
31*dfc6aa5cSAndroid Build Coastguard Worker #include "../jsimd.h"
32*dfc6aa5cSAndroid Build Coastguard Worker #include "neon-compat.h"
33*dfc6aa5cSAndroid Build Coastguard Worker 
34*dfc6aa5cSAndroid Build Coastguard Worker #include <arm_neon.h>
35*dfc6aa5cSAndroid Build Coastguard Worker 
36*dfc6aa5cSAndroid Build Coastguard Worker 
37*dfc6aa5cSAndroid Build Coastguard Worker /* Data preparation for encode_mcu_AC_first().
38*dfc6aa5cSAndroid Build Coastguard Worker  *
39*dfc6aa5cSAndroid Build Coastguard Worker  * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
40*dfc6aa5cSAndroid Build Coastguard Worker  * found in jcphuff.c.
41*dfc6aa5cSAndroid Build Coastguard Worker  */
42*dfc6aa5cSAndroid Build Coastguard Worker 
jsimd_encode_mcu_AC_first_prepare_neon(const JCOEF * block,const int * jpeg_natural_order_start,int Sl,int Al,UJCOEF * values,size_t * zerobits)43*dfc6aa5cSAndroid Build Coastguard Worker void jsimd_encode_mcu_AC_first_prepare_neon
44*dfc6aa5cSAndroid Build Coastguard Worker   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
45*dfc6aa5cSAndroid Build Coastguard Worker    UJCOEF *values, size_t *zerobits)
46*dfc6aa5cSAndroid Build Coastguard Worker {
47*dfc6aa5cSAndroid Build Coastguard Worker   UJCOEF *values_ptr = values;
48*dfc6aa5cSAndroid Build Coastguard Worker   UJCOEF *diff_values_ptr = values + DCTSIZE2;
49*dfc6aa5cSAndroid Build Coastguard Worker 
50*dfc6aa5cSAndroid Build Coastguard Worker   /* Rows of coefficients to zero (since they haven't been processed) */
51*dfc6aa5cSAndroid Build Coastguard Worker   int i, rows_to_zero = 8;
52*dfc6aa5cSAndroid Build Coastguard Worker 
53*dfc6aa5cSAndroid Build Coastguard Worker   for (i = 0; i < Sl / 16; i++) {
54*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
55*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
56*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
57*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
58*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
59*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
60*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
61*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
62*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
63*dfc6aa5cSAndroid Build Coastguard Worker     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
64*dfc6aa5cSAndroid Build Coastguard Worker     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
65*dfc6aa5cSAndroid Build Coastguard Worker     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
66*dfc6aa5cSAndroid Build Coastguard Worker     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
67*dfc6aa5cSAndroid Build Coastguard Worker     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
68*dfc6aa5cSAndroid Build Coastguard Worker     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
69*dfc6aa5cSAndroid Build Coastguard Worker     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
70*dfc6aa5cSAndroid Build Coastguard Worker 
71*dfc6aa5cSAndroid Build Coastguard Worker     /* Isolate sign of coefficients. */
72*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
73*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
74*dfc6aa5cSAndroid Build Coastguard Worker     /* Compute absolute value of coefficients and apply point transform Al. */
75*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
76*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
77*dfc6aa5cSAndroid Build Coastguard Worker     abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
78*dfc6aa5cSAndroid Build Coastguard Worker     abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
79*dfc6aa5cSAndroid Build Coastguard Worker 
80*dfc6aa5cSAndroid Build Coastguard Worker     /* Compute diff values. */
81*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
82*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
83*dfc6aa5cSAndroid Build Coastguard Worker 
84*dfc6aa5cSAndroid Build Coastguard Worker     /* Store transformed coefficients and diff values. */
85*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(values_ptr, abs_coefs1);
86*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
87*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(diff_values_ptr, diff1);
88*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
89*dfc6aa5cSAndroid Build Coastguard Worker     values_ptr += 16;
90*dfc6aa5cSAndroid Build Coastguard Worker     diff_values_ptr += 16;
91*dfc6aa5cSAndroid Build Coastguard Worker     jpeg_natural_order_start += 16;
92*dfc6aa5cSAndroid Build Coastguard Worker     rows_to_zero -= 2;
93*dfc6aa5cSAndroid Build Coastguard Worker   }
94*dfc6aa5cSAndroid Build Coastguard Worker 
95*dfc6aa5cSAndroid Build Coastguard Worker   /* Same operation but for remaining partial vector */
96*dfc6aa5cSAndroid Build Coastguard Worker   int remaining_coefs = Sl % 16;
97*dfc6aa5cSAndroid Build Coastguard Worker   if (remaining_coefs > 8) {
98*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
99*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
100*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
101*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
102*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
103*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
104*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
105*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
106*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t coefs2 = vdupq_n_s16(0);
107*dfc6aa5cSAndroid Build Coastguard Worker     switch (remaining_coefs) {
108*dfc6aa5cSAndroid Build Coastguard Worker     case 15:
109*dfc6aa5cSAndroid Build Coastguard Worker       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
110*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
111*dfc6aa5cSAndroid Build Coastguard Worker     case 14:
112*dfc6aa5cSAndroid Build Coastguard Worker       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
113*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
114*dfc6aa5cSAndroid Build Coastguard Worker     case 13:
115*dfc6aa5cSAndroid Build Coastguard Worker       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
116*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
117*dfc6aa5cSAndroid Build Coastguard Worker     case 12:
118*dfc6aa5cSAndroid Build Coastguard Worker       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
119*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
120*dfc6aa5cSAndroid Build Coastguard Worker     case 11:
121*dfc6aa5cSAndroid Build Coastguard Worker       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
122*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
123*dfc6aa5cSAndroid Build Coastguard Worker     case 10:
124*dfc6aa5cSAndroid Build Coastguard Worker       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
125*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
126*dfc6aa5cSAndroid Build Coastguard Worker     case 9:
127*dfc6aa5cSAndroid Build Coastguard Worker       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
128*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
129*dfc6aa5cSAndroid Build Coastguard Worker     default:
130*dfc6aa5cSAndroid Build Coastguard Worker       break;
131*dfc6aa5cSAndroid Build Coastguard Worker     }
132*dfc6aa5cSAndroid Build Coastguard Worker 
133*dfc6aa5cSAndroid Build Coastguard Worker     /* Isolate sign of coefficients. */
134*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
135*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
136*dfc6aa5cSAndroid Build Coastguard Worker     /* Compute absolute value of coefficients and apply point transform Al. */
137*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
138*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
139*dfc6aa5cSAndroid Build Coastguard Worker     abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
140*dfc6aa5cSAndroid Build Coastguard Worker     abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
141*dfc6aa5cSAndroid Build Coastguard Worker 
142*dfc6aa5cSAndroid Build Coastguard Worker     /* Compute diff values. */
143*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
144*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
145*dfc6aa5cSAndroid Build Coastguard Worker 
146*dfc6aa5cSAndroid Build Coastguard Worker     /* Store transformed coefficients and diff values. */
147*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(values_ptr, abs_coefs1);
148*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
149*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(diff_values_ptr, diff1);
150*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
151*dfc6aa5cSAndroid Build Coastguard Worker     values_ptr += 16;
152*dfc6aa5cSAndroid Build Coastguard Worker     diff_values_ptr += 16;
153*dfc6aa5cSAndroid Build Coastguard Worker     rows_to_zero -= 2;
154*dfc6aa5cSAndroid Build Coastguard Worker 
155*dfc6aa5cSAndroid Build Coastguard Worker   } else if (remaining_coefs > 0) {
156*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t coefs = vdupq_n_s16(0);
157*dfc6aa5cSAndroid Build Coastguard Worker 
158*dfc6aa5cSAndroid Build Coastguard Worker     switch (remaining_coefs) {
159*dfc6aa5cSAndroid Build Coastguard Worker     case 8:
160*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
161*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
162*dfc6aa5cSAndroid Build Coastguard Worker     case 7:
163*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
164*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
165*dfc6aa5cSAndroid Build Coastguard Worker     case 6:
166*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
167*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
168*dfc6aa5cSAndroid Build Coastguard Worker     case 5:
169*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
170*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
171*dfc6aa5cSAndroid Build Coastguard Worker     case 4:
172*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
173*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
174*dfc6aa5cSAndroid Build Coastguard Worker     case 3:
175*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
176*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
177*dfc6aa5cSAndroid Build Coastguard Worker     case 2:
178*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
179*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
180*dfc6aa5cSAndroid Build Coastguard Worker     case 1:
181*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
182*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
183*dfc6aa5cSAndroid Build Coastguard Worker     default:
184*dfc6aa5cSAndroid Build Coastguard Worker       break;
185*dfc6aa5cSAndroid Build Coastguard Worker     }
186*dfc6aa5cSAndroid Build Coastguard Worker 
187*dfc6aa5cSAndroid Build Coastguard Worker     /* Isolate sign of coefficients. */
188*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15));
189*dfc6aa5cSAndroid Build Coastguard Worker     /* Compute absolute value of coefficients and apply point transform Al. */
190*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
191*dfc6aa5cSAndroid Build Coastguard Worker     abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
192*dfc6aa5cSAndroid Build Coastguard Worker 
193*dfc6aa5cSAndroid Build Coastguard Worker     /* Compute diff values. */
194*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs);
195*dfc6aa5cSAndroid Build Coastguard Worker 
196*dfc6aa5cSAndroid Build Coastguard Worker     /* Store transformed coefficients and diff values. */
197*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(values_ptr, abs_coefs);
198*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(diff_values_ptr, diff);
199*dfc6aa5cSAndroid Build Coastguard Worker     values_ptr += 8;
200*dfc6aa5cSAndroid Build Coastguard Worker     diff_values_ptr += 8;
201*dfc6aa5cSAndroid Build Coastguard Worker     rows_to_zero--;
202*dfc6aa5cSAndroid Build Coastguard Worker   }
203*dfc6aa5cSAndroid Build Coastguard Worker 
204*dfc6aa5cSAndroid Build Coastguard Worker   /* Zero remaining memory in the values and diff_values blocks. */
205*dfc6aa5cSAndroid Build Coastguard Worker   for (i = 0; i < rows_to_zero; i++) {
206*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(values_ptr, vdupq_n_u16(0));
207*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(diff_values_ptr, vdupq_n_u16(0));
208*dfc6aa5cSAndroid Build Coastguard Worker     values_ptr += 8;
209*dfc6aa5cSAndroid Build Coastguard Worker     diff_values_ptr += 8;
210*dfc6aa5cSAndroid Build Coastguard Worker   }
211*dfc6aa5cSAndroid Build Coastguard Worker 
212*dfc6aa5cSAndroid Build Coastguard Worker   /* Construct zerobits bitmap.  A set bit means that the corresponding
213*dfc6aa5cSAndroid Build Coastguard Worker    * coefficient != 0.
214*dfc6aa5cSAndroid Build Coastguard Worker    */
215*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE);
216*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE);
217*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE);
218*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE);
219*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE);
220*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t row5 = vld1q_u16(values + 5 * DCTSIZE);
221*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE);
222*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE);
223*dfc6aa5cSAndroid Build Coastguard Worker 
224*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0)));
225*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0)));
226*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0)));
227*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0)));
228*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0)));
229*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0)));
230*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0)));
231*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0)));
232*dfc6aa5cSAndroid Build Coastguard Worker 
233*dfc6aa5cSAndroid Build Coastguard Worker   /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
234*dfc6aa5cSAndroid Build Coastguard Worker   const uint8x8_t bitmap_mask =
235*dfc6aa5cSAndroid Build Coastguard Worker     vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
236*dfc6aa5cSAndroid Build Coastguard Worker 
237*dfc6aa5cSAndroid Build Coastguard Worker   row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
238*dfc6aa5cSAndroid Build Coastguard Worker   row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
239*dfc6aa5cSAndroid Build Coastguard Worker   row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
240*dfc6aa5cSAndroid Build Coastguard Worker   row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
241*dfc6aa5cSAndroid Build Coastguard Worker   row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
242*dfc6aa5cSAndroid Build Coastguard Worker   row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
243*dfc6aa5cSAndroid Build Coastguard Worker   row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
244*dfc6aa5cSAndroid Build Coastguard Worker   row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
245*dfc6aa5cSAndroid Build Coastguard Worker 
246*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
247*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
248*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
249*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
250*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
251*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
252*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
253*dfc6aa5cSAndroid Build Coastguard Worker 
254*dfc6aa5cSAndroid Build Coastguard Worker #if defined(__aarch64__) || defined(_M_ARM64)
255*dfc6aa5cSAndroid Build Coastguard Worker   /* Move bitmap to a 64-bit scalar register. */
256*dfc6aa5cSAndroid Build Coastguard Worker   uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
257*dfc6aa5cSAndroid Build Coastguard Worker   /* Store zerobits bitmap. */
258*dfc6aa5cSAndroid Build Coastguard Worker   *zerobits = ~bitmap;
259*dfc6aa5cSAndroid Build Coastguard Worker #else
260*dfc6aa5cSAndroid Build Coastguard Worker   /* Move bitmap to two 32-bit scalar registers. */
261*dfc6aa5cSAndroid Build Coastguard Worker   uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
262*dfc6aa5cSAndroid Build Coastguard Worker   uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
263*dfc6aa5cSAndroid Build Coastguard Worker   /* Store zerobits bitmap. */
264*dfc6aa5cSAndroid Build Coastguard Worker   zerobits[0] = ~bitmap0;
265*dfc6aa5cSAndroid Build Coastguard Worker   zerobits[1] = ~bitmap1;
266*dfc6aa5cSAndroid Build Coastguard Worker #endif
267*dfc6aa5cSAndroid Build Coastguard Worker }
268*dfc6aa5cSAndroid Build Coastguard Worker 
269*dfc6aa5cSAndroid Build Coastguard Worker 
/* Data preparation for encode_mcu_AC_refine().
 *
 * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
 * found in jcphuff.c.
 */
275*dfc6aa5cSAndroid Build Coastguard Worker 
jsimd_encode_mcu_AC_refine_prepare_neon(const JCOEF * block,const int * jpeg_natural_order_start,int Sl,int Al,UJCOEF * absvalues,size_t * bits)276*dfc6aa5cSAndroid Build Coastguard Worker int jsimd_encode_mcu_AC_refine_prepare_neon
277*dfc6aa5cSAndroid Build Coastguard Worker   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
278*dfc6aa5cSAndroid Build Coastguard Worker    UJCOEF *absvalues, size_t *bits)
279*dfc6aa5cSAndroid Build Coastguard Worker {
280*dfc6aa5cSAndroid Build Coastguard Worker   /* Temporary storage buffers for data used to compute the signbits bitmap and
281*dfc6aa5cSAndroid Build Coastguard Worker    * the end-of-block (EOB) position
282*dfc6aa5cSAndroid Build Coastguard Worker    */
283*dfc6aa5cSAndroid Build Coastguard Worker   uint8_t coef_sign_bits[64];
284*dfc6aa5cSAndroid Build Coastguard Worker   uint8_t coef_eq1_bits[64];
285*dfc6aa5cSAndroid Build Coastguard Worker 
286*dfc6aa5cSAndroid Build Coastguard Worker   UJCOEF *absvalues_ptr = absvalues;
287*dfc6aa5cSAndroid Build Coastguard Worker   uint8_t *coef_sign_bits_ptr = coef_sign_bits;
288*dfc6aa5cSAndroid Build Coastguard Worker   uint8_t *eq1_bits_ptr = coef_eq1_bits;
289*dfc6aa5cSAndroid Build Coastguard Worker 
290*dfc6aa5cSAndroid Build Coastguard Worker   /* Rows of coefficients to zero (since they haven't been processed) */
291*dfc6aa5cSAndroid Build Coastguard Worker   int i, rows_to_zero = 8;
292*dfc6aa5cSAndroid Build Coastguard Worker 
293*dfc6aa5cSAndroid Build Coastguard Worker   for (i = 0; i < Sl / 16; i++) {
294*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
295*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
296*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
297*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
298*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
299*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
300*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
301*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
302*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
303*dfc6aa5cSAndroid Build Coastguard Worker     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
304*dfc6aa5cSAndroid Build Coastguard Worker     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
305*dfc6aa5cSAndroid Build Coastguard Worker     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
306*dfc6aa5cSAndroid Build Coastguard Worker     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
307*dfc6aa5cSAndroid Build Coastguard Worker     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
308*dfc6aa5cSAndroid Build Coastguard Worker     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
309*dfc6aa5cSAndroid Build Coastguard Worker     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
310*dfc6aa5cSAndroid Build Coastguard Worker 
311*dfc6aa5cSAndroid Build Coastguard Worker     /* Compute and store data for signbits bitmap. */
312*dfc6aa5cSAndroid Build Coastguard Worker     uint8x8_t sign_coefs1 =
313*dfc6aa5cSAndroid Build Coastguard Worker       vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
314*dfc6aa5cSAndroid Build Coastguard Worker     uint8x8_t sign_coefs2 =
315*dfc6aa5cSAndroid Build Coastguard Worker       vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
316*dfc6aa5cSAndroid Build Coastguard Worker     vst1_u8(coef_sign_bits_ptr, sign_coefs1);
317*dfc6aa5cSAndroid Build Coastguard Worker     vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
318*dfc6aa5cSAndroid Build Coastguard Worker 
319*dfc6aa5cSAndroid Build Coastguard Worker     /* Compute absolute value of coefficients and apply point transform Al. */
320*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
321*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
322*dfc6aa5cSAndroid Build Coastguard Worker     abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
323*dfc6aa5cSAndroid Build Coastguard Worker     abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
324*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(absvalues_ptr, abs_coefs1);
325*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
326*dfc6aa5cSAndroid Build Coastguard Worker 
327*dfc6aa5cSAndroid Build Coastguard Worker     /* Test whether transformed coefficient values == 1 (used to find EOB
328*dfc6aa5cSAndroid Build Coastguard Worker      * position.)
329*dfc6aa5cSAndroid Build Coastguard Worker      */
330*dfc6aa5cSAndroid Build Coastguard Worker     uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
331*dfc6aa5cSAndroid Build Coastguard Worker     uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
332*dfc6aa5cSAndroid Build Coastguard Worker     vst1_u8(eq1_bits_ptr, coefs_eq11);
333*dfc6aa5cSAndroid Build Coastguard Worker     vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
334*dfc6aa5cSAndroid Build Coastguard Worker 
335*dfc6aa5cSAndroid Build Coastguard Worker     absvalues_ptr += 16;
336*dfc6aa5cSAndroid Build Coastguard Worker     coef_sign_bits_ptr += 16;
337*dfc6aa5cSAndroid Build Coastguard Worker     eq1_bits_ptr += 16;
338*dfc6aa5cSAndroid Build Coastguard Worker     jpeg_natural_order_start += 16;
339*dfc6aa5cSAndroid Build Coastguard Worker     rows_to_zero -= 2;
340*dfc6aa5cSAndroid Build Coastguard Worker   }
341*dfc6aa5cSAndroid Build Coastguard Worker 
342*dfc6aa5cSAndroid Build Coastguard Worker   /* Same operation but for remaining partial vector */
343*dfc6aa5cSAndroid Build Coastguard Worker   int remaining_coefs = Sl % 16;
344*dfc6aa5cSAndroid Build Coastguard Worker   if (remaining_coefs > 8) {
345*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
346*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
347*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
348*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
349*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
350*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
351*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
352*dfc6aa5cSAndroid Build Coastguard Worker     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
353*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t coefs2 = vdupq_n_s16(0);
354*dfc6aa5cSAndroid Build Coastguard Worker     switch (remaining_coefs) {
355*dfc6aa5cSAndroid Build Coastguard Worker     case 15:
356*dfc6aa5cSAndroid Build Coastguard Worker       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
357*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
358*dfc6aa5cSAndroid Build Coastguard Worker     case 14:
359*dfc6aa5cSAndroid Build Coastguard Worker       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
360*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
361*dfc6aa5cSAndroid Build Coastguard Worker     case 13:
362*dfc6aa5cSAndroid Build Coastguard Worker       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
363*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
364*dfc6aa5cSAndroid Build Coastguard Worker     case 12:
365*dfc6aa5cSAndroid Build Coastguard Worker       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
366*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
367*dfc6aa5cSAndroid Build Coastguard Worker     case 11:
368*dfc6aa5cSAndroid Build Coastguard Worker       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
369*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
370*dfc6aa5cSAndroid Build Coastguard Worker     case 10:
371*dfc6aa5cSAndroid Build Coastguard Worker       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
372*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
373*dfc6aa5cSAndroid Build Coastguard Worker     case 9:
374*dfc6aa5cSAndroid Build Coastguard Worker       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
375*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
376*dfc6aa5cSAndroid Build Coastguard Worker     default:
377*dfc6aa5cSAndroid Build Coastguard Worker       break;
378*dfc6aa5cSAndroid Build Coastguard Worker     }
379*dfc6aa5cSAndroid Build Coastguard Worker 
380*dfc6aa5cSAndroid Build Coastguard Worker     /* Compute and store data for signbits bitmap. */
381*dfc6aa5cSAndroid Build Coastguard Worker     uint8x8_t sign_coefs1 =
382*dfc6aa5cSAndroid Build Coastguard Worker       vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
383*dfc6aa5cSAndroid Build Coastguard Worker     uint8x8_t sign_coefs2 =
384*dfc6aa5cSAndroid Build Coastguard Worker       vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
385*dfc6aa5cSAndroid Build Coastguard Worker     vst1_u8(coef_sign_bits_ptr, sign_coefs1);
386*dfc6aa5cSAndroid Build Coastguard Worker     vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
387*dfc6aa5cSAndroid Build Coastguard Worker 
388*dfc6aa5cSAndroid Build Coastguard Worker     /* Compute absolute value of coefficients and apply point transform Al. */
389*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
390*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
391*dfc6aa5cSAndroid Build Coastguard Worker     abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
392*dfc6aa5cSAndroid Build Coastguard Worker     abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
393*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(absvalues_ptr, abs_coefs1);
394*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
395*dfc6aa5cSAndroid Build Coastguard Worker 
396*dfc6aa5cSAndroid Build Coastguard Worker     /* Test whether transformed coefficient values == 1 (used to find EOB
397*dfc6aa5cSAndroid Build Coastguard Worker      * position.)
398*dfc6aa5cSAndroid Build Coastguard Worker      */
399*dfc6aa5cSAndroid Build Coastguard Worker     uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
400*dfc6aa5cSAndroid Build Coastguard Worker     uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
401*dfc6aa5cSAndroid Build Coastguard Worker     vst1_u8(eq1_bits_ptr, coefs_eq11);
402*dfc6aa5cSAndroid Build Coastguard Worker     vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
403*dfc6aa5cSAndroid Build Coastguard Worker 
404*dfc6aa5cSAndroid Build Coastguard Worker     absvalues_ptr += 16;
405*dfc6aa5cSAndroid Build Coastguard Worker     coef_sign_bits_ptr += 16;
406*dfc6aa5cSAndroid Build Coastguard Worker     eq1_bits_ptr += 16;
407*dfc6aa5cSAndroid Build Coastguard Worker     jpeg_natural_order_start += 16;
408*dfc6aa5cSAndroid Build Coastguard Worker     rows_to_zero -= 2;
409*dfc6aa5cSAndroid Build Coastguard Worker 
410*dfc6aa5cSAndroid Build Coastguard Worker   } else if (remaining_coefs > 0) {
411*dfc6aa5cSAndroid Build Coastguard Worker     int16x8_t coefs = vdupq_n_s16(0);
412*dfc6aa5cSAndroid Build Coastguard Worker 
413*dfc6aa5cSAndroid Build Coastguard Worker     switch (remaining_coefs) {
414*dfc6aa5cSAndroid Build Coastguard Worker     case 8:
415*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
416*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
417*dfc6aa5cSAndroid Build Coastguard Worker     case 7:
418*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
419*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
420*dfc6aa5cSAndroid Build Coastguard Worker     case 6:
421*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
422*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
423*dfc6aa5cSAndroid Build Coastguard Worker     case 5:
424*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
425*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
426*dfc6aa5cSAndroid Build Coastguard Worker     case 4:
427*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
428*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
429*dfc6aa5cSAndroid Build Coastguard Worker     case 3:
430*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
431*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
432*dfc6aa5cSAndroid Build Coastguard Worker     case 2:
433*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
434*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
435*dfc6aa5cSAndroid Build Coastguard Worker     case 1:
436*dfc6aa5cSAndroid Build Coastguard Worker       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
437*dfc6aa5cSAndroid Build Coastguard Worker       FALLTHROUGH               /*FALLTHROUGH*/
438*dfc6aa5cSAndroid Build Coastguard Worker     default:
439*dfc6aa5cSAndroid Build Coastguard Worker       break;
440*dfc6aa5cSAndroid Build Coastguard Worker     }
441*dfc6aa5cSAndroid Build Coastguard Worker 
442*dfc6aa5cSAndroid Build Coastguard Worker     /* Compute and store data for signbits bitmap. */
443*dfc6aa5cSAndroid Build Coastguard Worker     uint8x8_t sign_coefs =
444*dfc6aa5cSAndroid Build Coastguard Worker       vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
445*dfc6aa5cSAndroid Build Coastguard Worker     vst1_u8(coef_sign_bits_ptr, sign_coefs);
446*dfc6aa5cSAndroid Build Coastguard Worker 
447*dfc6aa5cSAndroid Build Coastguard Worker     /* Compute absolute value of coefficients and apply point transform Al. */
448*dfc6aa5cSAndroid Build Coastguard Worker     uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
449*dfc6aa5cSAndroid Build Coastguard Worker     abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
450*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(absvalues_ptr, abs_coefs);
451*dfc6aa5cSAndroid Build Coastguard Worker 
452*dfc6aa5cSAndroid Build Coastguard Worker     /* Test whether transformed coefficient values == 1 (used to find EOB
453*dfc6aa5cSAndroid Build Coastguard Worker      * position.)
454*dfc6aa5cSAndroid Build Coastguard Worker      */
455*dfc6aa5cSAndroid Build Coastguard Worker     uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1)));
456*dfc6aa5cSAndroid Build Coastguard Worker     vst1_u8(eq1_bits_ptr, coefs_eq1);
457*dfc6aa5cSAndroid Build Coastguard Worker 
458*dfc6aa5cSAndroid Build Coastguard Worker     absvalues_ptr += 8;
459*dfc6aa5cSAndroid Build Coastguard Worker     coef_sign_bits_ptr += 8;
460*dfc6aa5cSAndroid Build Coastguard Worker     eq1_bits_ptr += 8;
461*dfc6aa5cSAndroid Build Coastguard Worker     rows_to_zero--;
462*dfc6aa5cSAndroid Build Coastguard Worker   }
463*dfc6aa5cSAndroid Build Coastguard Worker 
464*dfc6aa5cSAndroid Build Coastguard Worker   /* Zero remaining memory in blocks. */
465*dfc6aa5cSAndroid Build Coastguard Worker   for (i = 0; i < rows_to_zero; i++) {
466*dfc6aa5cSAndroid Build Coastguard Worker     vst1q_u16(absvalues_ptr, vdupq_n_u16(0));
467*dfc6aa5cSAndroid Build Coastguard Worker     vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
468*dfc6aa5cSAndroid Build Coastguard Worker     vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
469*dfc6aa5cSAndroid Build Coastguard Worker     absvalues_ptr += 8;
470*dfc6aa5cSAndroid Build Coastguard Worker     coef_sign_bits_ptr += 8;
471*dfc6aa5cSAndroid Build Coastguard Worker     eq1_bits_ptr += 8;
472*dfc6aa5cSAndroid Build Coastguard Worker   }
473*dfc6aa5cSAndroid Build Coastguard Worker 
474*dfc6aa5cSAndroid Build Coastguard Worker   /* Construct zerobits bitmap. */
475*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE);
476*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE);
477*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE);
478*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE);
479*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE);
480*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE);
481*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE);
482*dfc6aa5cSAndroid Build Coastguard Worker   uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE);
483*dfc6aa5cSAndroid Build Coastguard Worker 
484*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0)));
485*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0)));
486*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0)));
487*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0)));
488*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0)));
489*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, vdupq_n_u16(0)));
490*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0)));
491*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0)));
492*dfc6aa5cSAndroid Build Coastguard Worker 
493*dfc6aa5cSAndroid Build Coastguard Worker   /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
494*dfc6aa5cSAndroid Build Coastguard Worker   const uint8x8_t bitmap_mask =
495*dfc6aa5cSAndroid Build Coastguard Worker     vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
496*dfc6aa5cSAndroid Build Coastguard Worker 
497*dfc6aa5cSAndroid Build Coastguard Worker   abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
498*dfc6aa5cSAndroid Build Coastguard Worker   abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
499*dfc6aa5cSAndroid Build Coastguard Worker   abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
500*dfc6aa5cSAndroid Build Coastguard Worker   abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
501*dfc6aa5cSAndroid Build Coastguard Worker   abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
502*dfc6aa5cSAndroid Build Coastguard Worker   abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
503*dfc6aa5cSAndroid Build Coastguard Worker   abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
504*dfc6aa5cSAndroid Build Coastguard Worker   abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
505*dfc6aa5cSAndroid Build Coastguard Worker 
506*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
507*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
508*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
509*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
510*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
511*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
512*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
513*dfc6aa5cSAndroid Build Coastguard Worker 
514*dfc6aa5cSAndroid Build Coastguard Worker #if defined(__aarch64__) || defined(_M_ARM64)
515*dfc6aa5cSAndroid Build Coastguard Worker   /* Move bitmap to a 64-bit scalar register. */
516*dfc6aa5cSAndroid Build Coastguard Worker   uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
517*dfc6aa5cSAndroid Build Coastguard Worker   /* Store zerobits bitmap. */
518*dfc6aa5cSAndroid Build Coastguard Worker   bits[0] = ~bitmap;
519*dfc6aa5cSAndroid Build Coastguard Worker #else
520*dfc6aa5cSAndroid Build Coastguard Worker   /* Move bitmap to two 32-bit scalar registers. */
521*dfc6aa5cSAndroid Build Coastguard Worker   uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
522*dfc6aa5cSAndroid Build Coastguard Worker   uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
523*dfc6aa5cSAndroid Build Coastguard Worker   /* Store zerobits bitmap. */
524*dfc6aa5cSAndroid Build Coastguard Worker   bits[0] = ~bitmap0;
525*dfc6aa5cSAndroid Build Coastguard Worker   bits[1] = ~bitmap1;
526*dfc6aa5cSAndroid Build Coastguard Worker #endif
527*dfc6aa5cSAndroid Build Coastguard Worker 
528*dfc6aa5cSAndroid Build Coastguard Worker   /* Construct signbits bitmap. */
529*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
530*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
531*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
532*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
533*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
534*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
535*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
536*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
537*dfc6aa5cSAndroid Build Coastguard Worker 
538*dfc6aa5cSAndroid Build Coastguard Worker   signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
539*dfc6aa5cSAndroid Build Coastguard Worker   signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
540*dfc6aa5cSAndroid Build Coastguard Worker   signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
541*dfc6aa5cSAndroid Build Coastguard Worker   signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
542*dfc6aa5cSAndroid Build Coastguard Worker   signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
543*dfc6aa5cSAndroid Build Coastguard Worker   signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
544*dfc6aa5cSAndroid Build Coastguard Worker   signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
545*dfc6aa5cSAndroid Build Coastguard Worker   signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
546*dfc6aa5cSAndroid Build Coastguard Worker 
547*dfc6aa5cSAndroid Build Coastguard Worker   bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
548*dfc6aa5cSAndroid Build Coastguard Worker   bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
549*dfc6aa5cSAndroid Build Coastguard Worker   bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
550*dfc6aa5cSAndroid Build Coastguard Worker   bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
551*dfc6aa5cSAndroid Build Coastguard Worker   bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
552*dfc6aa5cSAndroid Build Coastguard Worker   bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
553*dfc6aa5cSAndroid Build Coastguard Worker   bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
554*dfc6aa5cSAndroid Build Coastguard Worker 
555*dfc6aa5cSAndroid Build Coastguard Worker #if defined(__aarch64__) || defined(_M_ARM64)
556*dfc6aa5cSAndroid Build Coastguard Worker   /* Move bitmap to a 64-bit scalar register. */
557*dfc6aa5cSAndroid Build Coastguard Worker   bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
558*dfc6aa5cSAndroid Build Coastguard Worker   /* Store signbits bitmap. */
559*dfc6aa5cSAndroid Build Coastguard Worker   bits[1] = ~bitmap;
560*dfc6aa5cSAndroid Build Coastguard Worker #else
561*dfc6aa5cSAndroid Build Coastguard Worker   /* Move bitmap to two 32-bit scalar registers. */
562*dfc6aa5cSAndroid Build Coastguard Worker   bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
563*dfc6aa5cSAndroid Build Coastguard Worker   bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
564*dfc6aa5cSAndroid Build Coastguard Worker   /* Store signbits bitmap. */
565*dfc6aa5cSAndroid Build Coastguard Worker   bits[2] = ~bitmap0;
566*dfc6aa5cSAndroid Build Coastguard Worker   bits[3] = ~bitmap1;
567*dfc6aa5cSAndroid Build Coastguard Worker #endif
568*dfc6aa5cSAndroid Build Coastguard Worker 
569*dfc6aa5cSAndroid Build Coastguard Worker   /* Construct bitmap to find EOB position (the index of the last coefficient
570*dfc6aa5cSAndroid Build Coastguard Worker    * equal to 1.)
571*dfc6aa5cSAndroid Build Coastguard Worker    */
572*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
573*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
574*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
575*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
576*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
577*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
578*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
579*dfc6aa5cSAndroid Build Coastguard Worker   uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
580*dfc6aa5cSAndroid Build Coastguard Worker 
581*dfc6aa5cSAndroid Build Coastguard Worker   row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
582*dfc6aa5cSAndroid Build Coastguard Worker   row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
583*dfc6aa5cSAndroid Build Coastguard Worker   row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
584*dfc6aa5cSAndroid Build Coastguard Worker   row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
585*dfc6aa5cSAndroid Build Coastguard Worker   row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
586*dfc6aa5cSAndroid Build Coastguard Worker   row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
587*dfc6aa5cSAndroid Build Coastguard Worker   row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
588*dfc6aa5cSAndroid Build Coastguard Worker   row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
589*dfc6aa5cSAndroid Build Coastguard Worker 
590*dfc6aa5cSAndroid Build Coastguard Worker   bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
591*dfc6aa5cSAndroid Build Coastguard Worker   bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
592*dfc6aa5cSAndroid Build Coastguard Worker   bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
593*dfc6aa5cSAndroid Build Coastguard Worker   bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
594*dfc6aa5cSAndroid Build Coastguard Worker   bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
595*dfc6aa5cSAndroid Build Coastguard Worker   bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
596*dfc6aa5cSAndroid Build Coastguard Worker   bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
597*dfc6aa5cSAndroid Build Coastguard Worker 
598*dfc6aa5cSAndroid Build Coastguard Worker #if defined(__aarch64__) || defined(_M_ARM64)
599*dfc6aa5cSAndroid Build Coastguard Worker   /* Move bitmap to a 64-bit scalar register. */
600*dfc6aa5cSAndroid Build Coastguard Worker   bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
601*dfc6aa5cSAndroid Build Coastguard Worker 
602*dfc6aa5cSAndroid Build Coastguard Worker   /* Return EOB position. */
603*dfc6aa5cSAndroid Build Coastguard Worker   if (bitmap == 0) {
604*dfc6aa5cSAndroid Build Coastguard Worker     /* EOB position is defined to be 0 if all coefficients != 1. */
605*dfc6aa5cSAndroid Build Coastguard Worker     return 0;
606*dfc6aa5cSAndroid Build Coastguard Worker   } else {
607*dfc6aa5cSAndroid Build Coastguard Worker     return 63 - BUILTIN_CLZLL(bitmap);
608*dfc6aa5cSAndroid Build Coastguard Worker   }
609*dfc6aa5cSAndroid Build Coastguard Worker #else
610*dfc6aa5cSAndroid Build Coastguard Worker   /* Move bitmap to two 32-bit scalar registers. */
611*dfc6aa5cSAndroid Build Coastguard Worker   bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
612*dfc6aa5cSAndroid Build Coastguard Worker   bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
613*dfc6aa5cSAndroid Build Coastguard Worker 
614*dfc6aa5cSAndroid Build Coastguard Worker   /* Return EOB position. */
615*dfc6aa5cSAndroid Build Coastguard Worker   if (bitmap0 == 0 && bitmap1 == 0) {
616*dfc6aa5cSAndroid Build Coastguard Worker     return 0;
617*dfc6aa5cSAndroid Build Coastguard Worker   } else if (bitmap1 != 0) {
618*dfc6aa5cSAndroid Build Coastguard Worker     return 63 - BUILTIN_CLZ(bitmap1);
619*dfc6aa5cSAndroid Build Coastguard Worker   } else {
620*dfc6aa5cSAndroid Build Coastguard Worker     return 31 - BUILTIN_CLZ(bitmap0);
621*dfc6aa5cSAndroid Build Coastguard Worker   }
622*dfc6aa5cSAndroid Build Coastguard Worker #endif
623*dfc6aa5cSAndroid Build Coastguard Worker }
624