/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"

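// Sign-extend the 8 int16 quantizer entries in *p to int32 and pack them into
// one 256-bit vector: the low four entries go to the lower 128-bit lane, the
// high four to the upper lane.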
static inline void init_one_qp(const __m128i *p, __m256i *qp) {
  const __m128i sign = _mm_srai_epi16(*p, 15);
  const __m128i dc = _mm_unpacklo_epi16(*p, sign);
  const __m128i ac = _mm_unpackhi_epi16(*p, sign);
  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
}

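// After the first group of eight coefficients only the AC quantizer entries
// are needed, so duplicate the upper 128-bit lane (the AC values) of each
// parameter vector into both lanes.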
static inline void update_qp(__m256i *qp) {
  int i;
  for (i = 0; i < 5; ++i) {
    qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
  }
}

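// Widen the five int16 quantizer tables (zbin, round, quant, dequant,
// quant_shift) to 8 int32 lanes each and apply the log_scale rounding to
// zbin and round.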
static inline void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr,
                           const int16_t *quant_ptr, const int16_t *dequant_ptr,
                           const int16_t *quant_shift_ptr, __m256i *qp,
                           int log_scale) {
  const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
  const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
  const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
  init_one_qp(&zbin, &qp[0]);
  init_one_qp(&round, &qp[1]);
  init_one_qp(&quant, &qp[2]);
  init_one_qp(&dequant, &qp[3]);
  init_one_qp(&quant_shift, &qp[4]);
  if (log_scale > 0) {
    const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1)));
    qp[0] = _mm256_add_epi32(qp[0], rnd);
    qp[0] = _mm256_srai_epi32(qp[0], log_scale);

    qp[1] = _mm256_add_epi32(qp[1], rnd);
    qp[1] = _mm256_srai_epi32(qp[1], log_scale);
  }
  // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
  // calculating the zbin mask.
  qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1));
}

// Note:
// Each of the 8 int32_t lanes of *x is multiplied by the corresponding lane
// of *y and the 64-bit product is shifted right by 16. The low 32 bits of
// each result are returned as 8 int32_t lanes.
static inline __m256i mm256_mul_shift_epi32(const __m256i *x,
                                            const __m256i *y) {
  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);

  prod_lo = _mm256_srli_epi64(prod_lo, 16);
  const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
  prod_lo = _mm256_and_si256(prod_lo, mask);
  prod_hi = _mm256_srli_epi64(prod_hi, 16);

  prod_hi = _mm256_slli_epi64(prod_hi, 32);
  return _mm256_or_si256(prod_lo, prod_hi);
}

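// Fold this group's nonzero positions into the running eob: nonzero lanes
// contribute iscan[] + 1 (subtracting the -1 mask adds 1), zero lanes
// contribute 0, and the result is max'd into eobmax as int16.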
static AOM_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr,
                                                 __m256i eobmax,
                                                 __m256i nz_mask) {
  const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);
  const __m256i packed_nz_mask_perm =
      _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
  const __m256i iscan =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
  const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm);
  const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm);
  return _mm256_max_epi16(eobmax, nz_iscan);
}

// Get the max eob from the lower 128 bits.
static AOM_FORCE_INLINE uint16_t get_max_eob(__m256i eob) {
  __m256i eob_s;
  eob_s = _mm256_shuffle_epi32(eob, 0xe);
  eob = _mm256_max_epi16(eob, eob_s);
  eob_s = _mm256_shufflelo_epi16(eob, 0xe);
  eob = _mm256_max_epi16(eob, eob_s);
  eob_s = _mm256_shufflelo_epi16(eob, 1);
  eob = _mm256_max_epi16(eob, eob_s);
  return (uint16_t)_mm256_extract_epi16(eob, 0);
}

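// Same as mm256_mul_shift_epi32(), but the products are shifted right by
// (16 - log_scale) instead of 16.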
static AOM_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
                                                               const __m256i *y,
                                                               int log_scale) {
  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
  prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
  const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
  prod_lo = _mm256_and_si256(prod_lo, mask);
  prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
  prod_hi = _mm256_slli_epi64(prod_hi, 32);
  return _mm256_or_si256(prod_lo, prod_hi);
}

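// Quantize eight coefficients for the larger transforms (log_scale is 1 for
// 32x32 and 2 for 64x64), writing qcoeff/dqcoeff and updating *eob.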
static AOM_FORCE_INLINE void quantize_logscale(
    const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
    tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob, int log_scale) {
  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
  const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);

  if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) {
    const __m256i zero = _mm256_setzero_si256();
    _mm256_storeu_si256((__m256i *)qcoeff, zero);
    _mm256_storeu_si256((__m256i *)dqcoeff, zero);
    return;
  }

  const __m256i tmp_rnd =
      _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
  // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
  const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0);
  const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
  // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
  //                              (16 - log_scale + AOM_QM_BITS));
  const __m256i abs_q =
      mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], log_scale);
  const __m256i abs_dq =
      _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), log_scale);
  const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
  const __m256i q = _mm256_sign_epi32(abs_q, coeff);
  const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);

  _mm256_storeu_si256((__m256i *)qcoeff, q);
  _mm256_storeu_si256((__m256i *)dqcoeff, dq);

  *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
}

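// Quantize eight coefficients for the log_scale == 0 case, writing
// qcoeff/dqcoeff and updating *eob.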
static AOM_FORCE_INLINE void quantize(const __m256i *qp,
                                      const tran_low_t *coeff_ptr,
                                      const int16_t *iscan_ptr,
                                      tran_low_t *qcoeff, tran_low_t *dqcoeff,
                                      __m256i *eob) {
  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
  const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);

  if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) {
    const __m256i zero = _mm256_setzero_si256();
    _mm256_storeu_si256((__m256i *)qcoeff, zero);
    _mm256_storeu_si256((__m256i *)dqcoeff, zero);
    return;
  }

  const __m256i tmp_rnd =
      _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
  const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]);
  const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
  const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]);
  const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]);
  const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
  const __m256i q = _mm256_sign_epi32(abs_q, coeff);
  const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);

  _mm256_storeu_si256((__m256i *)qcoeff, q);
  _mm256_storeu_si256((__m256i *)dqcoeff, dq);

  *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
}

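// Quantizes the block eight coefficients at a time. Only the first group uses
// the DC quantizer entries; update_qp() then switches every table to its AC
// values for the remaining groups.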
void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  (void)scan;
  const int step = 8;

  __m256i eob = _mm256_setzero_si256();
  __m256i qp[5];

  init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0);

  quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);

  coeff_ptr += step;
  qcoeff_ptr += step;
  dqcoeff_ptr += step;
  iscan += step;
  n_coeffs -= step;

  update_qp(qp);

  while (n_coeffs > 0) {
    quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);

    coeff_ptr += step;
    qcoeff_ptr += step;
    dqcoeff_ptr += step;
    iscan += step;
    n_coeffs -= step;
  }

  *eob_ptr = get_max_eob(eob);
}

void aom_highbd_quantize_b_32x32_avx2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  (void)scan;
  const unsigned int step = 8;

  __m256i eob = _mm256_setzero_si256();
  __m256i qp[5];
  init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1);

  quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1);

  coeff_ptr += step;
  qcoeff_ptr += step;
  dqcoeff_ptr += step;
  iscan += step;
  n_coeffs -= step;

  update_qp(qp);

  while (n_coeffs > 0) {
    quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1);

    coeff_ptr += step;
    qcoeff_ptr += step;
    dqcoeff_ptr += step;
    iscan += step;
    n_coeffs -= step;
  }

  *eob_ptr = get_max_eob(eob);
}

void aom_highbd_quantize_b_64x64_avx2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  (void)scan;
  const int step = 8;

  __m256i eob = _mm256_setzero_si256();
  __m256i qp[5];
  init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 2);

  quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 2);

  coeff_ptr += step;
  qcoeff_ptr += step;
  dqcoeff_ptr += step;
  iscan += step;
  n_coeffs -= step;

  update_qp(qp);

  while (n_coeffs > 0) {
    quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 2);

    coeff_ptr += step;
    qcoeff_ptr += step;
    dqcoeff_ptr += step;
    iscan += step;
    n_coeffs -= step;
  }

  *eob_ptr = get_max_eob(eob);
}