xref: /aosp_15_r20/external/libvpx/vpx_dsp/arm/quantize_neon.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 #include <assert.h>
13 
14 #include "./vpx_config.h"
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx_dsp/arm/mem_neon.h"
17 #include "vp9/common/vp9_scan.h"
18 #include "vp9/encoder/vp9_block.h"
19 
// Multiplies eight quantized coefficients by their dequantization factors,
// lane by lane, and writes the eight products to dqcoeff_ptr.
static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
                                               const int16x8_t dequant,
                                               tran_low_t *dqcoeff_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
  // Widening multiplies keep the full 32-bit products; the low four lanes are
  // stored first, followed by the high four.
  const int32x4_t product_lo =
      vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
  const int32x4_t product_hi =
      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));

  vst1q_s32(dqcoeff_ptr, product_lo);
  vst1q_s32(dqcoeff_ptr + 4, product_hi);
#else
  // 16-bit output: only the low 16 bits of each product are kept.
  const int16x8_t product = vmulq_s16(qcoeff, dequant);

  vst1q_s16(dqcoeff_ptr, product);
#endif  // CONFIG_VP9_HIGHBITDEPTH
}
35 
// Quantizes one group of eight coefficients.
//
// Reads eight values from coeff_ptr, writes the quantized values to
// qcoeff_ptr and the dequantized values to dqcoeff_ptr, and returns the
// quantized vector so the caller can track the end of block.
//
// Per lane, with a = |coeff| and r = sat(a + round):
//   q = (((r * quant) >> 16) + r) * quant_shift >> 16
// The sign of coeff is then re-applied, and lanes with a < zbin are zeroed.
static INLINE int16x8_t
quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
                tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
                const int16x8_t round, const int16x8_t quant,
                const int16x8_t quant_shift, const int16x8_t dequant) {
  const int16x8_t in = load_tran_low_to_s16q(coeff_ptr);
  // Arithmetic shift by 15 yields 0 for non-negative lanes, -1 for negative.
  const int16x8_t sign = vshrq_n_s16(in, 15);
  const int16x8_t magnitude = vabsq_s16(in);

  // All-ones in every lane whose magnitude reaches the zero bin threshold.
  const int16x8_t keep_mask =
      vreinterpretq_s16_u16(vcgeq_s16(magnitude, zbin));

  // Saturating add of the rounding offset.
  const int16x8_t biased = vqaddq_s16(magnitude, round);

  // vqdmulh computes (a * b * 2) >> 16; the extra >> 1 turns that into
  // (a * b) >> 16.
  const int16x8_t quant_term = vshrq_n_s16(vqdmulhq_s16(biased, quant), 1);
  const int16x8_t accumulated = vaddq_s16(quant_term, biased);
  const int16x8_t q_magnitude =
      vshrq_n_s16(vqdmulhq_s16(accumulated, quant_shift), 1);

  // (x ^ s) - s negates x where s == -1 and leaves it alone where s == 0.
  const int16x8_t q_signed = vsubq_s16(veorq_s16(q_magnitude, sign), sign);

  // Zero every lane that fell inside the zero bin.
  const int16x8_t result = vandq_s16(q_signed, keep_mask);

  store_s16q_to_tran_low(qcoeff_ptr, result);
  calculate_dqcoeff_and_store(result, dequant, dqcoeff_ptr);

  return result;
}
72 
// Quantizes a block of n_coeffs coefficients, eight per step.
//
// For every group of eight, quantize_b_neon() writes the quantized values to
// qcoeff_ptr and the dequantized values to dqcoeff_ptr. *eob_ptr receives the
// largest scan_order->iscan entry among the positions whose quantized value
// is non-zero (0 when every quantized value is zero).
//
// The zbin/round/quant/quant_shift tables of mb_plane and dequant_ptr are
// loaded as eight-lane vectors: lane 0 is applied to the DC coefficient and
// lane 1 to everything else. Note that the first group and at least one
// further group are always processed, i.e. no fewer than 16 coefficients are
// read and written.
void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const struct macroblock_plane *const mb_plane,
                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
                         const struct ScanOrder *const scan_order) {
  const int16x8_t all_ones = vdupq_n_s16(-1);
  const int16_t *iscan = scan_order->iscan;
  // Coefficients still to do once the DC-carrying group has been handled.
  intptr_t remaining = n_coeffs - 8;
  uint16x8_t max_scan_pos;

  // Only lane 0 of each of these vectors holds the DC parameter.
  int16x8_t zbin = vld1q_s16(mb_plane->zbin);
  int16x8_t round = vld1q_s16(mb_plane->round);
  int16x8_t quant = vld1q_s16(mb_plane->quant);
  int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift);
  int16x8_t dequant = vld1q_s16(dequant_ptr);

  // First group: contains the DC coefficient, so use the vectors as loaded.
  {
    const uint16x8_t scan_pos = vreinterpretq_u16_s16(vld1q_s16(iscan));
    const int16x8_t quantized =
        quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant,
                        quant_shift, dequant);

    // vtst gives all-ones for non-zero lanes; masking the scan positions with
    // it leaves only the positions of non-zero coefficients.
    max_scan_pos = vandq_u16(vtstq_s16(quantized, all_ones), scan_pos);
  }
  __builtin_prefetch(coeff_ptr + 64);
  coeff_ptr += 8;
  iscan += 8;
  qcoeff_ptr += 8;
  dqcoeff_ptr += 8;

  // From here on every lane uses the AC parameter (lane 1).
  zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
  round = vdupq_lane_s16(vget_low_s16(round), 1);
  quant = vdupq_lane_s16(vget_low_s16(quant), 1);
  quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
  dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);

  do {
    const uint16x8_t scan_pos = vreinterpretq_u16_s16(vld1q_s16(iscan));
    const int16x8_t quantized =
        quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant,
                        quant_shift, dequant);
    const uint16x8_t nonzero_pos =
        vandq_u16(vtstq_s16(quantized, all_ones), scan_pos);

    // Keep a running per-lane maximum of the non-zero scan positions.
    max_scan_pos = vmaxq_u16(max_scan_pos, nonzero_pos);

    __builtin_prefetch(coeff_ptr + 64);
    coeff_ptr += 8;
    iscan += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    remaining -= 8;
  } while (remaining > 0);

  // Reduce the eight per-lane maxima to a single value.
#if VPX_ARCH_AARCH64
  *eob_ptr = vmaxvq_u16(max_scan_pos);
#else
  {
    // 8 lanes -> 4 lanes, then two pairwise-max steps leave the overall
    // maximum in lane 0.
    const uint16x4_t max4 =
        vmax_u16(vget_low_u16(max_scan_pos), vget_high_u16(max_scan_pos));
    const uint16x4_t max2 = vpmax_u16(max4, max4);
    const uint16x4_t max1 = vpmax_u16(max2, max2);
    vst1_lane_u16(eob_ptr, max1, 0);
  }
#endif  // VPX_ARCH_AARCH64
}
148 
// Returns, for each 32-bit lane, 1 if the lane is negative and 0 otherwise
// (a logical shift right by 31 isolates the sign bit).
static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
  const uint32x4_t raw_bits = vreinterpretq_u32_s32(a);
  const uint32x4_t sign_bit = vshrq_n_u32(raw_bits, 31);
  return vreinterpretq_s32_u32(sign_bit);
}
152 
// Dequantizes eight coefficients for the 32x32 path: each lane becomes
// (qcoeff * dequant) / 2, rounded towards zero, and is written to
// dqcoeff_ptr.
static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
                                                     const int16x8_t dequant,
                                                     tran_low_t *dqcoeff_ptr) {
  // Full-precision 32-bit products for the low and high four lanes.
  const int32x4_t product_lo =
      vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
  const int32x4_t product_hi =
      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));

  // An arithmetic shift rounds towards negative infinity, whereas the C code
  // divides (rounding towards zero). Adding 1 to negative products first
  // makes the shift below match the division.
  const int32x4_t adjusted_lo =
      vaddq_s32(product_lo, extract_sign_bit(product_lo));
  const int32x4_t adjusted_hi =
      vaddq_s32(product_hi, extract_sign_bit(product_hi));

#if CONFIG_VP9_HIGHBITDEPTH
  // Halve and store as 32-bit values.
  vst1q_s32(dqcoeff_ptr, vshrq_n_s32(adjusted_lo, 1));
  vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(adjusted_hi, 1));
#else
  // Halve while narrowing each lane to 16 bits, then store all eight lanes.
  const int16x4_t halved_lo = vshrn_n_s32(adjusted_lo, 1);
  const int16x4_t halved_hi = vshrn_n_s32(adjusted_hi, 1);

  vst1q_s16(dqcoeff_ptr, vcombine_s16(halved_lo, halved_hi));
#endif  // CONFIG_VP9_HIGHBITDEPTH
}
174 
// Quantizes one group of eight coefficients for the 32x32 path.
//
// Reads eight values from coeff_ptr, writes the quantized values to
// qcoeff_ptr and the dequantized values to dqcoeff_ptr, and returns the
// quantized vector so the caller can track the end of block.
//
// Per lane, with a = |coeff| and r = sat(a + round):
//   q = (((r * quant) >> 16) + r) * quant_shift >> 15
// The sign of coeff is then re-applied, and lanes with a < zbin are zeroed.
static INLINE int16x8_t
quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
                      tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
                      const int16x8_t round, const int16x8_t quant,
                      const int16x8_t quant_shift, const int16x8_t dequant) {
  const int16x8_t in = load_tran_low_to_s16q(coeff_ptr);
  // Arithmetic shift by 15 yields 0 for non-negative lanes, -1 for negative.
  const int16x8_t sign = vshrq_n_s16(in, 15);
  const int16x8_t magnitude = vabsq_s16(in);

  // All-ones in every lane whose magnitude reaches the zero bin threshold.
  const int16x8_t keep_mask =
      vreinterpretq_s16_u16(vcgeq_s16(magnitude, zbin));

  // Saturating add of the rounding offset.
  const int16x8_t biased = vqaddq_s16(magnitude, round);

  // vqdmulh computes (a * b * 2) >> 16; the extra >> 1 turns that into
  // (a * b) >> 16.
  const int16x8_t quant_term = vshrq_n_s16(vqdmulhq_s16(biased, quant), 1);
  const int16x8_t accumulated = vaddq_s16(quant_term, biased);

  // No extra shift here: (a * b * 2) >> 16 is used directly as
  // (a * b) >> 15.
  const int16x8_t q_magnitude = vqdmulhq_s16(accumulated, quant_shift);

  // (x ^ s) - s negates x where s == -1 and leaves it alone where s == 0.
  const int16x8_t q_signed = vsubq_s16(veorq_s16(q_magnitude, sign), sign);

  // Zero every lane that fell inside the zero bin.
  const int16x8_t result = vandq_s16(q_signed, keep_mask);

  store_s16q_to_tran_low(qcoeff_ptr, result);
  calculate_dqcoeff_and_store_32x32(result, dequant, dqcoeff_ptr);

  return result;
}
211 
// Quantizes a 32x32 block (1024 coefficients), eight per step.
//
// Differs from vpx_quantize_b_neon() in that the zbin and round values are
// halved (with rounding) before use and the dequantized values are divided by
// 2, rounding towards zero.
//
// For every group of eight, quantize_b_32x32_neon() writes the quantized
// values to qcoeff_ptr and the dequantized values to dqcoeff_ptr. *eob_ptr
// receives the largest scan_order->iscan entry among the positions whose
// quantized value is non-zero (0 when every quantized value is zero).
void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr,
                               const struct macroblock_plane *mb_plane,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
                               const struct ScanOrder *const scan_order) {
  const int16x8_t all_ones = vdupq_n_s16(-1);
  const int16_t *iscan = scan_order->iscan;
  uint16x8_t max_scan_pos;
  int groups_left;

  // Only lane 0 of each of these vectors holds the DC parameter. zbin and
  // round are halved with a rounding shift.
  int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1);
  int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1);
  int16x8_t quant = vld1q_s16(mb_plane->quant);
  int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift);
  int16x8_t dequant = vld1q_s16(dequant_ptr);

  // First group: contains the DC coefficient, so use the vectors as loaded.
  {
    const uint16x8_t scan_pos = vreinterpretq_u16_s16(vld1q_s16(iscan));
    const int16x8_t quantized =
        quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
                              quant, quant_shift, dequant);

    // vtst gives all-ones for non-zero lanes; masking the scan positions with
    // it leaves only the positions of non-zero coefficients.
    max_scan_pos = vandq_u16(vtstq_s16(quantized, all_ones), scan_pos);
  }
  __builtin_prefetch(coeff_ptr + 64);
  coeff_ptr += 8;
  iscan += 8;
  qcoeff_ptr += 8;
  dqcoeff_ptr += 8;

  // From here on every lane uses the AC parameter (lane 1).
  zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
  round = vdupq_lane_s16(vget_low_s16(round), 1);
  quant = vdupq_lane_s16(vget_low_s16(quant), 1);
  quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
  dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);

  // 32 * 32 / 8 groups in total, one of which has already been processed.
  for (groups_left = 32 * 32 / 8 - 1; groups_left > 0; --groups_left) {
    const uint16x8_t scan_pos = vreinterpretq_u16_s16(vld1q_s16(iscan));
    const int16x8_t quantized =
        quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
                              quant, quant_shift, dequant);
    const uint16x8_t nonzero_pos =
        vandq_u16(vtstq_s16(quantized, all_ones), scan_pos);

    // Keep a running per-lane maximum of the non-zero scan positions.
    max_scan_pos = vmaxq_u16(max_scan_pos, nonzero_pos);

    __builtin_prefetch(coeff_ptr + 64);
    coeff_ptr += 8;
    iscan += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
  }

  // Reduce the eight per-lane maxima to a single value.
#if VPX_ARCH_AARCH64
  *eob_ptr = vmaxvq_u16(max_scan_pos);
#else
  {
    // 8 lanes -> 4 lanes, then two pairwise-max steps leave the overall
    // maximum in lane 0.
    const uint16x4_t max4 =
        vmax_u16(vget_low_u16(max_scan_pos), vget_high_u16(max_scan_pos));
    const uint16x4_t max2 = vpmax_u16(max4, max4);
    const uint16x4_t max1 = vpmax_u16(max2, max2);
    vst1_lane_u16(eob_ptr, max1, 0);
  }
#endif  // VPX_ARCH_AARCH64
}
287