xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/quantize_avx.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #if defined(_MSC_VER)
13 #include <intrin.h>
14 #endif
15 #include <immintrin.h>
16 
17 #include "./vpx_dsp_rtcd.h"
18 #include "vpx/vpx_integer.h"
19 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
20 #include "vpx_dsp/x86/quantize_sse2.h"
21 #include "vpx_dsp/x86/quantize_ssse3.h"
22 #include "vp9/common/vp9_scan.h"
23 #include "vp9/encoder/vp9_block.h"
24 
// Quantizes and dequantizes n_coeffs coefficients, 16 per step.
//
//   coeff_ptr    input coefficients, read with load_tran_low().
//   n_coeffs     number of coefficients. The first 16 are always processed;
//                the remainder is walked in steps of 16.
//   mb_plane     forwarded to load_b_values() together with dequant_ptr to
//                fill the zbin/round/quant/shift/dequant vectors.
//   qcoeff_ptr   receives the quantized coefficients.
//   dqcoeff_ptr  receives the dequantized coefficients.
//   eob_ptr      receives accumulate_eob() of the running maximum produced by
//                scan_for_eob(); it is 0 when a 16-coefficient block has
//                nothing above zbin.
//   scan_order   only its iscan table is used (passed to scan_for_eob()).
//
// qcoeff_ptr and dqcoeff_ptr are written with _mm256_store_si256, which is an
// aligned store, so both must be 32-byte aligned.
void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        const struct macroblock_plane *const mb_plane,
                        tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                        const int16_t *dequant_ptr, uint16_t *eob_ptr,
                        const struct ScanOrder *const scan_order) {
  const __m128i zero = _mm_setzero_si128();
  const __m256i zero256 = _mm256_setzero_si256();
  const int16_t *const iscan = scan_order->iscan;
  __m128i zbin, round, quant, dequant, shift;
  __m128i zbin_ac, round_ac, quant_ac, dequant_ac, shift_ac;
  __m128i in_lo, in_hi;
  __m128i q_lo, q_hi;
  __m128i keep_lo, keep_hi, keep_any;
  __m128i eob = zero;
  __m128i group_eob;
  int i;

  *eob_ptr = 0;

  load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift);

  // The loaded vectors serve the first 8 coefficients (DC plus AC). Copying
  // the upper 64 bits into both halves gives the AC-only vectors used for
  // every later coefficient.
  zbin_ac = _mm_unpackhi_epi64(zbin, zbin);
  round_ac = _mm_unpackhi_epi64(round, round);
  quant_ac = _mm_unpackhi_epi64(quant, quant);
  shift_ac = _mm_unpackhi_epi64(shift, shift);
  dequant_ac = _mm_unpackhi_epi64(dequant, dequant);

  // First group: DC and the first 15 AC coefficients.
  in_lo = load_tran_low(coeff_ptr);
  in_hi = load_tran_low(coeff_ptr + 8);

  q_lo = _mm_abs_epi16(in_lo);
  q_hi = _mm_abs_epi16(in_hi);

  // A lane is kept only when its magnitude is strictly above zbin.
  keep_lo = _mm_cmpgt_epi16(q_lo, zbin);
  keep_hi = _mm_cmpgt_epi16(q_hi, zbin_ac);
  keep_any = _mm_or_si128(keep_lo, keep_hi);

  if (!_mm_test_all_zeros(keep_any, keep_any)) {
    calculate_qcoeff(&q_lo, round, quant, shift);
    calculate_qcoeff(&q_hi, round_ac, quant_ac, shift_ac);

    // Restore the input signs, then clear the lanes that failed the zbin test.
    q_lo = _mm_and_si128(_mm_sign_epi16(q_lo, in_lo), keep_lo);
    q_hi = _mm_and_si128(_mm_sign_epi16(q_hi, in_hi), keep_hi);

    store_tran_low(q_lo, qcoeff_ptr);
    store_tran_low(q_hi, qcoeff_ptr + 8);

    calculate_dqcoeff_and_store(q_lo, dequant, dqcoeff_ptr);
    calculate_dqcoeff_and_store(q_hi, dequant_ac, dqcoeff_ptr + 8);

    eob = scan_for_eob(&q_lo, &q_hi, iscan, 0, zero);
  } else {
    // Nothing survives: write zeros for the whole group.
    _mm256_store_si256((__m256i *)(qcoeff_ptr), zero256);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero256);
#if CONFIG_VP9_HIGHBITDEPTH
    // NOTE(review): presumably tran_low_t is wider in this configuration, so
    // 16 coefficients need a second 256-bit store -- confirm.
    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero256);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero256);
#endif  // CONFIG_VP9_HIGHBITDEPTH

    // *eob_ptr already holds 0, so a 16-coefficient block is finished.
    if (n_coeffs == 16) return;
  }

  // Remaining groups use the AC-only vectors.
  for (i = 16; i < n_coeffs; i += 16) {
    const tran_low_t *const src = coeff_ptr + i;
    tran_low_t *const q_out = qcoeff_ptr + i;
    tran_low_t *const dq_out = dqcoeff_ptr + i;

    in_lo = load_tran_low(src);
    in_hi = load_tran_low(src + 8);

    q_lo = _mm_abs_epi16(in_lo);
    q_hi = _mm_abs_epi16(in_hi);

    keep_lo = _mm_cmpgt_epi16(q_lo, zbin_ac);
    keep_hi = _mm_cmpgt_epi16(q_hi, zbin_ac);
    keep_any = _mm_or_si128(keep_lo, keep_hi);

    if (_mm_test_all_zeros(keep_any, keep_any)) {
      _mm256_store_si256((__m256i *)(q_out), zero256);
      _mm256_store_si256((__m256i *)(dq_out), zero256);
#if CONFIG_VP9_HIGHBITDEPTH
      _mm256_store_si256((__m256i *)(q_out + 8), zero256);
      _mm256_store_si256((__m256i *)(dq_out + 8), zero256);
#endif  // CONFIG_VP9_HIGHBITDEPTH
    } else {
      calculate_qcoeff(&q_lo, round_ac, quant_ac, shift_ac);
      calculate_qcoeff(&q_hi, round_ac, quant_ac, shift_ac);

      q_lo = _mm_and_si128(_mm_sign_epi16(q_lo, in_lo), keep_lo);
      q_hi = _mm_and_si128(_mm_sign_epi16(q_hi, in_hi), keep_hi);

      store_tran_low(q_lo, q_out);
      store_tran_low(q_hi, q_out + 8);

      calculate_dqcoeff_and_store(q_lo, dequant_ac, dq_out);
      calculate_dqcoeff_and_store(q_hi, dequant_ac, dq_out + 8);

      group_eob = scan_for_eob(&q_lo, &q_hi, iscan, i, zero);
      eob = _mm_max_epi16(eob, group_eob);
    }
  }

  *eob_ptr = accumulate_eob(eob);
}
140 
// Quantizes and dequantizes a fixed 32 * 32 block of coefficients, 16 per
// step, using the 32x32 helpers load_b_values32x32() and
// calculate_dqcoeff_and_store_32x32().
//
//   coeff_ptr    input coefficients, read with load_tran_low().
//   mb_plane     forwarded to load_b_values32x32() together with dequant_ptr
//                to fill the zbin/round/quant/shift/dequant vectors.
//   qcoeff_ptr   receives the quantized coefficients.
//   dqcoeff_ptr  receives the dequantized coefficients.
//   eob_ptr      written once, at the end, with accumulate_eob() of the
//                running maximum produced by scan_for_eob().
//   scan_order   only its iscan table is used (passed to scan_for_eob()).
//
// qcoeff_ptr and dqcoeff_ptr are written with _mm256_store_si256, which is an
// aligned store, so both must be 32-byte aligned.
void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr,
                              const struct macroblock_plane *const mb_plane,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const struct ScanOrder *const scan_order) {
  const __m128i zero = _mm_setzero_si128();
  const __m256i zero256 = _mm256_setzero_si256();
  const int16_t *const iscan = scan_order->iscan;
  const int total_coeffs = 32 * 32;
  __m128i zbin, round, quant, dequant, shift;
  __m128i zbin_ac, round_ac, quant_ac, dequant_ac, shift_ac;
  __m128i in_lo, in_hi;
  __m128i q_lo, q_hi;
  __m128i keep_lo, keep_hi, keep_any;
  __m128i eob = zero;
  __m128i group_eob;
  int i;

  load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
                     &shift);

  // The loaded vectors serve the first 8 coefficients (DC plus AC). Copying
  // the upper 64 bits into both halves gives the AC-only vectors used for
  // every later coefficient.
  zbin_ac = _mm_unpackhi_epi64(zbin, zbin);
  round_ac = _mm_unpackhi_epi64(round, round);
  quant_ac = _mm_unpackhi_epi64(quant, quant);
  shift_ac = _mm_unpackhi_epi64(shift, shift);
  dequant_ac = _mm_unpackhi_epi64(dequant, dequant);

  // First group: DC and the first 15 AC coefficients.
  in_lo = load_tran_low(coeff_ptr);
  in_hi = load_tran_low(coeff_ptr + 8);

  q_lo = _mm_abs_epi16(in_lo);
  q_hi = _mm_abs_epi16(in_hi);

  // A lane is kept only when its magnitude is strictly above zbin.
  keep_lo = _mm_cmpgt_epi16(q_lo, zbin);
  keep_hi = _mm_cmpgt_epi16(q_hi, zbin_ac);
  keep_any = _mm_or_si128(keep_lo, keep_hi);

  if (!_mm_test_all_zeros(keep_any, keep_any)) {
    calculate_qcoeff(&q_lo, round, quant, shift);
    calculate_qcoeff(&q_hi, round_ac, quant_ac, shift_ac);

    // Restore the input signs, then clear the lanes that failed the zbin test.
    q_lo = _mm_and_si128(_mm_sign_epi16(q_lo, in_lo), keep_lo);
    q_hi = _mm_and_si128(_mm_sign_epi16(q_hi, in_hi), keep_hi);

    store_tran_low(q_lo, qcoeff_ptr);
    store_tran_low(q_hi, qcoeff_ptr + 8);

    calculate_dqcoeff_and_store_32x32(q_lo, dequant, zero, dqcoeff_ptr);
    calculate_dqcoeff_and_store_32x32(q_hi, dequant_ac, zero, dqcoeff_ptr + 8);

    eob = scan_for_eob(&q_lo, &q_hi, iscan, 0, zero);
  } else {
    // Nothing survives: write zeros for the whole group.
    _mm256_store_si256((__m256i *)(qcoeff_ptr), zero256);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero256);
#if CONFIG_VP9_HIGHBITDEPTH
    // NOTE(review): presumably tran_low_t is wider in this configuration, so
    // 16 coefficients need a second 256-bit store -- confirm.
    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero256);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero256);
#endif  // CONFIG_VP9_HIGHBITDEPTH
  }

  // Remaining groups use the AC-only vectors.
  for (i = 16; i < total_coeffs; i += 16) {
    const tran_low_t *const src = coeff_ptr + i;
    tran_low_t *const q_out = qcoeff_ptr + i;
    tran_low_t *const dq_out = dqcoeff_ptr + i;

    in_lo = load_tran_low(src);
    in_hi = load_tran_low(src + 8);

    q_lo = _mm_abs_epi16(in_lo);
    q_hi = _mm_abs_epi16(in_hi);

    keep_lo = _mm_cmpgt_epi16(q_lo, zbin_ac);
    keep_hi = _mm_cmpgt_epi16(q_hi, zbin_ac);
    keep_any = _mm_or_si128(keep_lo, keep_hi);

    if (_mm_test_all_zeros(keep_any, keep_any)) {
      _mm256_store_si256((__m256i *)(q_out), zero256);
      _mm256_store_si256((__m256i *)(dq_out), zero256);
#if CONFIG_VP9_HIGHBITDEPTH
      _mm256_store_si256((__m256i *)(q_out + 8), zero256);
      _mm256_store_si256((__m256i *)(dq_out + 8), zero256);
#endif  // CONFIG_VP9_HIGHBITDEPTH
    } else {
      calculate_qcoeff(&q_lo, round_ac, quant_ac, shift_ac);
      calculate_qcoeff(&q_hi, round_ac, quant_ac, shift_ac);

      q_lo = _mm_and_si128(_mm_sign_epi16(q_lo, in_lo), keep_lo);
      q_hi = _mm_and_si128(_mm_sign_epi16(q_hi, in_hi), keep_hi);

      store_tran_low(q_lo, q_out);
      store_tran_low(q_hi, q_out + 8);

      calculate_dqcoeff_and_store_32x32(q_lo, dequant_ac, zero, dq_out);
      calculate_dqcoeff_and_store_32x32(q_hi, dequant_ac, zero, dq_out + 8);

      group_eob = scan_for_eob(&q_lo, &q_hi, iscan, i, zero);
      eob = _mm_max_epi16(eob, group_eob);
    }
  }

  *eob_ptr = accumulate_eob(eob);
}
255