1 /*
2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #if defined(_MSC_VER)
13 #include <intrin.h>
14 #endif
15 #include <immintrin.h>
16
17 #include "./vpx_dsp_rtcd.h"
18 #include "vpx/vpx_integer.h"
19 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
20 #include "vpx_dsp/x86/quantize_sse2.h"
21 #include "vpx_dsp/x86/quantize_ssse3.h"
22 #include "vp9/common/vp9_scan.h"
23 #include "vp9/encoder/vp9_block.h"
24
// Quantizes n_coeffs transform coefficients using the zbin/round/quant
// tables carried in mb_plane. Writes quantized coefficients to qcoeff_ptr,
// dequantized coefficients to dqcoeff_ptr, and the end-of-block index
// (position after the last nonzero coefficient in scan order) to eob_ptr.
// The first 16 coefficients are handled separately because the DC
// coefficient uses different zbin/round/quant/dequant values than the AC
// coefficients (the low 64 bits of each parameter register hold DC values,
// the high 64 bits AC values).
void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        const struct macroblock_plane *const mb_plane,
                        tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                        const int16_t *dequant_ptr, uint16_t *eob_ptr,
                        const struct ScanOrder *const scan_order) {
  const __m128i zero = _mm_setzero_si128();
  const __m256i big_zero = _mm256_setzero_si256();
  int index;
  const int16_t *iscan = scan_order->iscan;

  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1;
  __m128i all_zero;
  __m128i eob = zero, eob0;

  // Pre-set eob to 0 so the early return in the all-zero 16-coefficient
  // case below leaves a valid value behind.
  *eob_ptr = 0;

  load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift);

  // Do DC and first 15 AC.
  coeff0 = load_tran_low(coeff_ptr);
  coeff1 = load_tran_low(coeff_ptr + 8);

  qcoeff0 = _mm_abs_epi16(coeff0);
  qcoeff1 = _mm_abs_epi16(coeff1);

  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_test_all_zeros(all_zero, all_zero)) {
    // Nothing in the first 16 coefficients exceeds zbin: store zeros.
    _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
    // tran_low_t is 32 bits wide here, so 16 coefficients span two
    // 256-bit stores.
    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH

    if (n_coeffs == 16) return;

    // Promote the parameter registers from DC to AC values for the loop.
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    calculate_qcoeff(&qcoeff0, round, quant, shift);
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    calculate_qcoeff(&qcoeff1, round, quant, shift);

    // Reinsert signs
    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);

    // Mask out zbin threshold coeffs
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr);
    store_tran_low(qcoeff1, qcoeff_ptr + 8);

    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);

    eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
  }

  // AC only loop.
  for (index = 16; index < n_coeffs; index += 16) {
    coeff0 = load_tran_low(coeff_ptr + index);
    coeff1 = load_tran_low(coeff_ptr + index + 8);

    qcoeff0 = _mm_abs_epi16(coeff0);
    qcoeff1 = _mm_abs_epi16(coeff1);

    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_test_all_zeros(all_zero, all_zero)) {
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH
      continue;
    }

    calculate_qcoeff(&qcoeff0, round, quant, shift);
    calculate_qcoeff(&qcoeff1, round, quant, shift);

    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);

    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr + index);
    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);

    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);

    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
    eob = _mm_max_epi16(eob, eob0);
  }

  *eob_ptr = accumulate_eob(eob);
}
140
// 32x32 variant of vpx_quantize_b_avx. The coefficient count is a fixed
// 32 * 32 = 1024 (no n_coeffs parameter), and the 32x32 helper variants
// (load_b_values32x32 / calculate_dqcoeff_and_store_32x32) are used for
// parameter loading and dequantization. As above, the first 16
// coefficients are handled separately because DC uses different
// zbin/round/quant/dequant values than AC.
void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr,
                              const struct macroblock_plane *const mb_plane,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const struct ScanOrder *const scan_order) {
  const __m128i zero = _mm_setzero_si128();
  const __m256i big_zero = _mm256_setzero_si256();
  int index;
  const int16_t *iscan = scan_order->iscan;

  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1;
  __m128i all_zero;
  __m128i eob = zero, eob0;

  load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
                     &shift);

  // Do DC and first 15 AC.
  coeff0 = load_tran_low(coeff_ptr);
  coeff1 = load_tran_low(coeff_ptr + 8);

  qcoeff0 = _mm_abs_epi16(coeff0);
  qcoeff1 = _mm_abs_epi16(coeff1);

  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC.
  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_test_all_zeros(all_zero, all_zero)) {
    // Nothing in the first 16 coefficients exceeds zbin: store zeros.
    _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
    // tran_low_t is 32 bits wide here, so 16 coefficients span two
    // 256-bit stores.
    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH

    // Promote the parameter registers from DC to AC values for the loop.
    // No early return here: a 32x32 block always has more coefficients.
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    calculate_qcoeff(&qcoeff0, round, quant, shift);
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    calculate_qcoeff(&qcoeff1, round, quant, shift);

    // Reinsert signs.
    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);

    // Mask out zbin threshold coeffs.
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr);
    store_tran_low(qcoeff1, qcoeff_ptr + 8);

    calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);

    eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
  }

  // AC only loop.
  for (index = 16; index < 32 * 32; index += 16) {
    coeff0 = load_tran_low(coeff_ptr + index);
    coeff1 = load_tran_low(coeff_ptr + index + 8);

    qcoeff0 = _mm_abs_epi16(coeff0);
    qcoeff1 = _mm_abs_epi16(coeff1);

    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_test_all_zeros(all_zero, all_zero)) {
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH
      continue;
    }

    calculate_qcoeff(&qcoeff0, round, quant, shift);
    calculate_qcoeff(&qcoeff1, round, quant, shift);

    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);

    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr + index);
    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);

    calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero,
                                      dqcoeff_ptr + index);
    calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
                                      dqcoeff_ptr + index + 8);

    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
    eob = _mm_max_epi16(eob, eob0);
  }

  *eob_ptr = accumulate_eob(eob);
}
255