/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*fb1b10abSAndroid Build Coastguard Worker
11*fb1b10abSAndroid Build Coastguard Worker #include <arm_neon.h>
12*fb1b10abSAndroid Build Coastguard Worker #include <assert.h>
13*fb1b10abSAndroid Build Coastguard Worker
14*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_config.h"
15*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_dsp_rtcd.h"
16*fb1b10abSAndroid Build Coastguard Worker #include "vpx_dsp/arm/mem_neon.h"
17*fb1b10abSAndroid Build Coastguard Worker #include "vp9/common/vp9_scan.h"
18*fb1b10abSAndroid Build Coastguard Worker #include "vp9/encoder/vp9_block.h"
19*fb1b10abSAndroid Build Coastguard Worker
calculate_dqcoeff_and_store(const int16x8_t qcoeff,const int16x8_t dequant,tran_low_t * dqcoeff_ptr)20*fb1b10abSAndroid Build Coastguard Worker static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
21*fb1b10abSAndroid Build Coastguard Worker const int16x8_t dequant,
22*fb1b10abSAndroid Build Coastguard Worker tran_low_t *dqcoeff_ptr) {
23*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
24*fb1b10abSAndroid Build Coastguard Worker const int32x4_t dqcoeff_0 =
25*fb1b10abSAndroid Build Coastguard Worker vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
26*fb1b10abSAndroid Build Coastguard Worker const int32x4_t dqcoeff_1 =
27*fb1b10abSAndroid Build Coastguard Worker vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
28*fb1b10abSAndroid Build Coastguard Worker
29*fb1b10abSAndroid Build Coastguard Worker vst1q_s32(dqcoeff_ptr, dqcoeff_0);
30*fb1b10abSAndroid Build Coastguard Worker vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
31*fb1b10abSAndroid Build Coastguard Worker #else
32*fb1b10abSAndroid Build Coastguard Worker vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant));
33*fb1b10abSAndroid Build Coastguard Worker #endif // CONFIG_VP9_HIGHBITDEPTH
34*fb1b10abSAndroid Build Coastguard Worker }
35*fb1b10abSAndroid Build Coastguard Worker
36*fb1b10abSAndroid Build Coastguard Worker static INLINE int16x8_t
quantize_b_neon(const tran_low_t * coeff_ptr,tran_low_t * qcoeff_ptr,tran_low_t * dqcoeff_ptr,const int16x8_t zbin,const int16x8_t round,const int16x8_t quant,const int16x8_t quant_shift,const int16x8_t dequant)37*fb1b10abSAndroid Build Coastguard Worker quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
38*fb1b10abSAndroid Build Coastguard Worker tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
39*fb1b10abSAndroid Build Coastguard Worker const int16x8_t round, const int16x8_t quant,
40*fb1b10abSAndroid Build Coastguard Worker const int16x8_t quant_shift, const int16x8_t dequant) {
41*fb1b10abSAndroid Build Coastguard Worker // Load coeffs as 8 x 16-bit ints, take sign and abs values
42*fb1b10abSAndroid Build Coastguard Worker const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
43*fb1b10abSAndroid Build Coastguard Worker const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
44*fb1b10abSAndroid Build Coastguard Worker const int16x8_t coeff_abs = vabsq_s16(coeff);
45*fb1b10abSAndroid Build Coastguard Worker
46*fb1b10abSAndroid Build Coastguard Worker // Calculate mask of elements outside the bin
47*fb1b10abSAndroid Build Coastguard Worker const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
48*fb1b10abSAndroid Build Coastguard Worker
49*fb1b10abSAndroid Build Coastguard Worker // Get the rounded values
50*fb1b10abSAndroid Build Coastguard Worker const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
51*fb1b10abSAndroid Build Coastguard Worker
52*fb1b10abSAndroid Build Coastguard Worker // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
53*fb1b10abSAndroid Build Coastguard Worker int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
54*fb1b10abSAndroid Build Coastguard Worker
55*fb1b10abSAndroid Build Coastguard Worker qcoeff = vaddq_s16(qcoeff, rounded);
56*fb1b10abSAndroid Build Coastguard Worker
57*fb1b10abSAndroid Build Coastguard Worker // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
58*fb1b10abSAndroid Build Coastguard Worker qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
59*fb1b10abSAndroid Build Coastguard Worker
60*fb1b10abSAndroid Build Coastguard Worker // Restore the sign bit.
61*fb1b10abSAndroid Build Coastguard Worker qcoeff = veorq_s16(qcoeff, coeff_sign);
62*fb1b10abSAndroid Build Coastguard Worker qcoeff = vsubq_s16(qcoeff, coeff_sign);
63*fb1b10abSAndroid Build Coastguard Worker
64*fb1b10abSAndroid Build Coastguard Worker // Only keep the relevant coeffs
65*fb1b10abSAndroid Build Coastguard Worker qcoeff = vandq_s16(qcoeff, zbin_mask);
66*fb1b10abSAndroid Build Coastguard Worker store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
67*fb1b10abSAndroid Build Coastguard Worker
68*fb1b10abSAndroid Build Coastguard Worker calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
69*fb1b10abSAndroid Build Coastguard Worker
70*fb1b10abSAndroid Build Coastguard Worker return qcoeff;
71*fb1b10abSAndroid Build Coastguard Worker }
72*fb1b10abSAndroid Build Coastguard Worker
// Quantizes a block of transform coefficients, eight at a time.
//
//   coeff_ptr   - input coefficients.
//   n_coeffs    - number of coefficients to process (see note below).
//   mb_plane    - supplies the zbin, round, quant and quant_shift tables.
//   qcoeff_ptr  - receives the quantized coefficients.
//   dqcoeff_ptr - receives the dequantized coefficients.
//   dequant_ptr - dequantization factors.
//   eob_ptr     - receives the largest scan_order->iscan entry (compared as
//                 unsigned 16-bit) whose quantized coefficient is non-zero,
//                 or 0 when every quantized coefficient is zero.
//   scan_order  - supplies the iscan table, indexed like coeff_ptr.
//
// The first group of eight uses the parameter vectors exactly as loaded, so
// lane 0 (the DC entry) only affects the first coefficient. Every later group
// uses lane 1 of each vector broadcast to all lanes.
//
// Note: the trailing loop is a do/while, so a second group is always
// processed: at least 16 coefficients are read and written, and n_coeffs is
// effectively rounded up to a multiple of 8.
// NOTE(review): callers presumably always pass a multiple of 8 that is at
// least 16 - confirm.
void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const struct macroblock_plane *const mb_plane,
                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
                         const struct ScanOrder *const scan_order) {
  const int16x8_t all_bits = vdupq_n_s16(-1);
  const int16_t *const iscan = scan_order->iscan;
  uint16x8_t v_eob;
  intptr_t pos;

  // Lane 0 of each vector is the DC entry; the remaining lanes are used for
  // coefficients 1..7 of the first group.
  int16x8_t zbin = vld1q_s16(mb_plane->zbin);
  int16x8_t round = vld1q_s16(mb_plane->round);
  int16x8_t quant = vld1q_s16(mb_plane->quant);
  int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift);
  int16x8_t dequant = vld1q_s16(dequant_ptr);

  // First group of eight, which contains the DC coefficient.
  {
    const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
    const int16x8_t v_qcoeff =
        quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant,
                        quant_shift, dequant);

    // vtstq against all-ones gives an all-ones mask for non-zero lanes, which
    // selects the iscan values of the coefficients that survived.
    v_eob = vandq_u16(vtstq_s16(v_qcoeff, all_bits), v_iscan);

    __builtin_prefetch(coeff_ptr + 64);
  }

  // From here on every lane uses the lane 1 parameters.
  zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
  round = vdupq_lane_s16(vget_low_s16(round), 1);
  quant = vdupq_lane_s16(vget_low_s16(quant), 1);
  quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
  dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);

  pos = 8;
  do {
    const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan + pos));
    const int16x8_t v_qcoeff =
        quantize_b_neon(coeff_ptr + pos, qcoeff_ptr + pos, dqcoeff_ptr + pos,
                        zbin, round, quant, quant_shift, dequant);

    // Keep a per-lane running maximum of the iscan values of non-zero lanes.
    v_eob =
        vmaxq_u16(v_eob, vandq_u16(vtstq_s16(v_qcoeff, all_bits), v_iscan));

    __builtin_prefetch(coeff_ptr + pos + 64);
    pos += 8;
  } while (pos < n_coeffs);

  // Reduce the eight per-lane maxima to a single value.
#if VPX_ARCH_AARCH64
  *eob_ptr = vmaxvq_u16(v_eob);
#else
  {
    const uint16x4_t max_of_4 =
        vmax_u16(vget_low_u16(v_eob), vget_high_u16(v_eob));
    const uint16x4_t max_of_2 = vpmax_u16(max_of_4, max_of_4);
    const uint16x4_t max_of_1 = vpmax_u16(max_of_2, max_of_2);
    vst1_lane_u16(eob_ptr, max_of_1, 0);
  }
#endif  // VPX_ARCH_AARCH64
}
148*fb1b10abSAndroid Build Coastguard Worker
extract_sign_bit(int32x4_t a)149*fb1b10abSAndroid Build Coastguard Worker static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
150*fb1b10abSAndroid Build Coastguard Worker return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
151*fb1b10abSAndroid Build Coastguard Worker }
152*fb1b10abSAndroid Build Coastguard Worker
calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,const int16x8_t dequant,tran_low_t * dqcoeff_ptr)153*fb1b10abSAndroid Build Coastguard Worker static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
154*fb1b10abSAndroid Build Coastguard Worker const int16x8_t dequant,
155*fb1b10abSAndroid Build Coastguard Worker tran_low_t *dqcoeff_ptr) {
156*fb1b10abSAndroid Build Coastguard Worker int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
157*fb1b10abSAndroid Build Coastguard Worker int32x4_t dqcoeff_1 =
158*fb1b10abSAndroid Build Coastguard Worker vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
159*fb1b10abSAndroid Build Coastguard Worker
160*fb1b10abSAndroid Build Coastguard Worker // Add 1 if negative to round towards zero because the C uses division.
161*fb1b10abSAndroid Build Coastguard Worker dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
162*fb1b10abSAndroid Build Coastguard Worker dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
163*fb1b10abSAndroid Build Coastguard Worker
164*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
165*fb1b10abSAndroid Build Coastguard Worker dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
166*fb1b10abSAndroid Build Coastguard Worker dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
167*fb1b10abSAndroid Build Coastguard Worker vst1q_s32(dqcoeff_ptr, dqcoeff_0);
168*fb1b10abSAndroid Build Coastguard Worker vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
169*fb1b10abSAndroid Build Coastguard Worker #else
170*fb1b10abSAndroid Build Coastguard Worker vst1q_s16(dqcoeff_ptr,
171*fb1b10abSAndroid Build Coastguard Worker vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
172*fb1b10abSAndroid Build Coastguard Worker #endif // CONFIG_VP9_HIGHBITDEPTH
173*fb1b10abSAndroid Build Coastguard Worker }
174*fb1b10abSAndroid Build Coastguard Worker
175*fb1b10abSAndroid Build Coastguard Worker static INLINE int16x8_t
quantize_b_32x32_neon(const tran_low_t * coeff_ptr,tran_low_t * qcoeff_ptr,tran_low_t * dqcoeff_ptr,const int16x8_t zbin,const int16x8_t round,const int16x8_t quant,const int16x8_t quant_shift,const int16x8_t dequant)176*fb1b10abSAndroid Build Coastguard Worker quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
177*fb1b10abSAndroid Build Coastguard Worker tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
178*fb1b10abSAndroid Build Coastguard Worker const int16x8_t round, const int16x8_t quant,
179*fb1b10abSAndroid Build Coastguard Worker const int16x8_t quant_shift, const int16x8_t dequant) {
180*fb1b10abSAndroid Build Coastguard Worker // Load coeffs as 8 x 16-bit ints, take sign and abs values
181*fb1b10abSAndroid Build Coastguard Worker const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
182*fb1b10abSAndroid Build Coastguard Worker const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
183*fb1b10abSAndroid Build Coastguard Worker const int16x8_t coeff_abs = vabsq_s16(coeff);
184*fb1b10abSAndroid Build Coastguard Worker
185*fb1b10abSAndroid Build Coastguard Worker // Calculate mask of elements outside the bin
186*fb1b10abSAndroid Build Coastguard Worker const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
187*fb1b10abSAndroid Build Coastguard Worker
188*fb1b10abSAndroid Build Coastguard Worker // Get the rounded values
189*fb1b10abSAndroid Build Coastguard Worker const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
190*fb1b10abSAndroid Build Coastguard Worker
191*fb1b10abSAndroid Build Coastguard Worker // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
192*fb1b10abSAndroid Build Coastguard Worker int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
193*fb1b10abSAndroid Build Coastguard Worker
194*fb1b10abSAndroid Build Coastguard Worker qcoeff = vaddq_s16(qcoeff, rounded);
195*fb1b10abSAndroid Build Coastguard Worker
196*fb1b10abSAndroid Build Coastguard Worker // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
197*fb1b10abSAndroid Build Coastguard Worker qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
198*fb1b10abSAndroid Build Coastguard Worker
199*fb1b10abSAndroid Build Coastguard Worker // Restore the sign bit.
200*fb1b10abSAndroid Build Coastguard Worker qcoeff = veorq_s16(qcoeff, coeff_sign);
201*fb1b10abSAndroid Build Coastguard Worker qcoeff = vsubq_s16(qcoeff, coeff_sign);
202*fb1b10abSAndroid Build Coastguard Worker
203*fb1b10abSAndroid Build Coastguard Worker // Only keep the relevant coeffs
204*fb1b10abSAndroid Build Coastguard Worker qcoeff = vandq_s16(qcoeff, zbin_mask);
205*fb1b10abSAndroid Build Coastguard Worker store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
206*fb1b10abSAndroid Build Coastguard Worker
207*fb1b10abSAndroid Build Coastguard Worker calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
208*fb1b10abSAndroid Build Coastguard Worker
209*fb1b10abSAndroid Build Coastguard Worker return qcoeff;
210*fb1b10abSAndroid Build Coastguard Worker }
211*fb1b10abSAndroid Build Coastguard Worker
212*fb1b10abSAndroid Build Coastguard Worker // Main difference is that zbin values are halved before comparison and dqcoeff
213*fb1b10abSAndroid Build Coastguard Worker // values are divided by 2. zbin is rounded but dqcoeff is not.
vpx_quantize_b_32x32_neon(const tran_low_t * coeff_ptr,const struct macroblock_plane * mb_plane,tran_low_t * qcoeff_ptr,tran_low_t * dqcoeff_ptr,const int16_t * dequant_ptr,uint16_t * eob_ptr,const struct ScanOrder * const scan_order)214*fb1b10abSAndroid Build Coastguard Worker void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr,
215*fb1b10abSAndroid Build Coastguard Worker const struct macroblock_plane *mb_plane,
216*fb1b10abSAndroid Build Coastguard Worker tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
217*fb1b10abSAndroid Build Coastguard Worker const int16_t *dequant_ptr, uint16_t *eob_ptr,
218*fb1b10abSAndroid Build Coastguard Worker const struct ScanOrder *const scan_order) {
219*fb1b10abSAndroid Build Coastguard Worker const int16x8_t neg_one = vdupq_n_s16(-1);
220*fb1b10abSAndroid Build Coastguard Worker uint16x8_t eob_max;
221*fb1b10abSAndroid Build Coastguard Worker int i;
222*fb1b10abSAndroid Build Coastguard Worker const int16_t *iscan = scan_order->iscan;
223*fb1b10abSAndroid Build Coastguard Worker
224*fb1b10abSAndroid Build Coastguard Worker // Only the first element of each vector is DC.
225*fb1b10abSAndroid Build Coastguard Worker int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1);
226*fb1b10abSAndroid Build Coastguard Worker int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1);
227*fb1b10abSAndroid Build Coastguard Worker int16x8_t quant = vld1q_s16(mb_plane->quant);
228*fb1b10abSAndroid Build Coastguard Worker int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift);
229*fb1b10abSAndroid Build Coastguard Worker int16x8_t dequant = vld1q_s16(dequant_ptr);
230*fb1b10abSAndroid Build Coastguard Worker
231*fb1b10abSAndroid Build Coastguard Worker // Process first 8 values which include a dc component.
232*fb1b10abSAndroid Build Coastguard Worker {
233*fb1b10abSAndroid Build Coastguard Worker const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
234*fb1b10abSAndroid Build Coastguard Worker
235*fb1b10abSAndroid Build Coastguard Worker const int16x8_t qcoeff =
236*fb1b10abSAndroid Build Coastguard Worker quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
237*fb1b10abSAndroid Build Coastguard Worker quant, quant_shift, dequant);
238*fb1b10abSAndroid Build Coastguard Worker
239*fb1b10abSAndroid Build Coastguard Worker // Set non-zero elements to -1 and use that to extract values for eob.
240*fb1b10abSAndroid Build Coastguard Worker eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
241*fb1b10abSAndroid Build Coastguard Worker
242*fb1b10abSAndroid Build Coastguard Worker __builtin_prefetch(coeff_ptr + 64);
243*fb1b10abSAndroid Build Coastguard Worker coeff_ptr += 8;
244*fb1b10abSAndroid Build Coastguard Worker iscan += 8;
245*fb1b10abSAndroid Build Coastguard Worker qcoeff_ptr += 8;
246*fb1b10abSAndroid Build Coastguard Worker dqcoeff_ptr += 8;
247*fb1b10abSAndroid Build Coastguard Worker }
248*fb1b10abSAndroid Build Coastguard Worker
249*fb1b10abSAndroid Build Coastguard Worker {
250*fb1b10abSAndroid Build Coastguard Worker zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
251*fb1b10abSAndroid Build Coastguard Worker round = vdupq_lane_s16(vget_low_s16(round), 1);
252*fb1b10abSAndroid Build Coastguard Worker quant = vdupq_lane_s16(vget_low_s16(quant), 1);
253*fb1b10abSAndroid Build Coastguard Worker quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
254*fb1b10abSAndroid Build Coastguard Worker dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
255*fb1b10abSAndroid Build Coastguard Worker
256*fb1b10abSAndroid Build Coastguard Worker for (i = 1; i < 32 * 32 / 8; ++i) {
257*fb1b10abSAndroid Build Coastguard Worker const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
258*fb1b10abSAndroid Build Coastguard Worker
259*fb1b10abSAndroid Build Coastguard Worker const int16x8_t qcoeff =
260*fb1b10abSAndroid Build Coastguard Worker quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
261*fb1b10abSAndroid Build Coastguard Worker quant, quant_shift, dequant);
262*fb1b10abSAndroid Build Coastguard Worker
263*fb1b10abSAndroid Build Coastguard Worker // Set non-zero elements to -1 and use that to extract values for eob.
264*fb1b10abSAndroid Build Coastguard Worker eob_max =
265*fb1b10abSAndroid Build Coastguard Worker vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
266*fb1b10abSAndroid Build Coastguard Worker
267*fb1b10abSAndroid Build Coastguard Worker __builtin_prefetch(coeff_ptr + 64);
268*fb1b10abSAndroid Build Coastguard Worker coeff_ptr += 8;
269*fb1b10abSAndroid Build Coastguard Worker iscan += 8;
270*fb1b10abSAndroid Build Coastguard Worker qcoeff_ptr += 8;
271*fb1b10abSAndroid Build Coastguard Worker dqcoeff_ptr += 8;
272*fb1b10abSAndroid Build Coastguard Worker }
273*fb1b10abSAndroid Build Coastguard Worker }
274*fb1b10abSAndroid Build Coastguard Worker
275*fb1b10abSAndroid Build Coastguard Worker #if VPX_ARCH_AARCH64
276*fb1b10abSAndroid Build Coastguard Worker *eob_ptr = vmaxvq_u16(eob_max);
277*fb1b10abSAndroid Build Coastguard Worker #else
278*fb1b10abSAndroid Build Coastguard Worker {
279*fb1b10abSAndroid Build Coastguard Worker const uint16x4_t eob_max_0 =
280*fb1b10abSAndroid Build Coastguard Worker vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
281*fb1b10abSAndroid Build Coastguard Worker const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
282*fb1b10abSAndroid Build Coastguard Worker const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
283*fb1b10abSAndroid Build Coastguard Worker vst1_lane_u16(eob_ptr, eob_max_2, 0);
284*fb1b10abSAndroid Build Coastguard Worker }
285*fb1b10abSAndroid Build Coastguard Worker #endif // VPX_ARCH_AARCH64
286*fb1b10abSAndroid Build Coastguard Worker }
287