1 /*
2 * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <tmmintrin.h>
13
14 #include "./vp9_rtcd.h"
15 #include "vpx/vpx_integer.h"
16 #include "vpx_dsp/vpx_dsp_common.h"
17 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
18 #include "vpx_dsp/x86/quantize_sse2.h"
19 #include "vpx_dsp/x86/quantize_ssse3.h"
20 #include "vp9/common/vp9_scan.h"
21 #include "vp9/encoder/vp9_block.h"
22
// SSSE3 implementation of the VP9 "fp" quantizer.
//
// Consumes the coefficients at coeff_ptr in groups of 16, writing quantized
// values to qcoeff_ptr, dequantized values to dqcoeff_ptr and the end-of-block
// value produced by scan_for_eob() / accumulate_eob() to *eob_ptr.
//
// The first group is processed unconditionally and the loop advances 16
// entries at a time, so a count that is not a multiple of 16 is effectively
// rounded up. Presumably callers always pass buffers sized accordingly --
// TODO confirm.
//
// round/quant are taken from mb_plane and dequant from dequant_ptr through
// load_fp_values(). NOTE(review): lane 0 of each vector appears to hold the DC
// value and the upper lanes the AC value; confirm against quantize_sse2.h.
void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                           const struct macroblock_plane *const mb_plane,
                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
                           const struct ScanOrder *const scan_order) {
  const __m128i zero = _mm_setzero_si128();
  const int16_t *const iscan = scan_order->iscan;
  __m128i round, quant, dequant;
  __m128i src_lo, src_hi;
  __m128i q_lo, q_hi;
  __m128i dq_lo, dq_hi;
  __m128i eob_max, skip_thr;
  int pos;

  // Fetch the rounding, quantizer and dequantizer vectors.
  load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);

  // First group: the DC coefficient plus the first 15 AC coefficients.
  src_lo = load_tran_low(coeff_ptr);
  src_hi = load_tran_low(coeff_ptr + 8);

  // Lower eight lanes use the vectors exactly as loaded:
  // |coeff| + round (saturating), high 16 bits of the product with quant,
  // then the sign of the source coefficient is put back. _mm_sign_epi16 also
  // clears lanes whose source coefficient is zero.
  q_lo = _mm_adds_epi16(_mm_abs_epi16(src_lo), round);
  q_lo = _mm_mulhi_epi16(q_lo, quant);
  q_lo = _mm_sign_epi16(q_lo, src_lo);
  dq_lo = _mm_mullo_epi16(q_lo, dequant);

  // From here on every lane uses the values from the upper 64 bits.
  round = _mm_unpackhi_epi64(round, round);
  quant = _mm_unpackhi_epi64(quant, quant);
  dequant = _mm_unpackhi_epi64(dequant, dequant);

  q_hi = _mm_adds_epi16(_mm_abs_epi16(src_hi), round);
  q_hi = _mm_mulhi_epi16(q_hi, quant);
  q_hi = _mm_sign_epi16(q_hi, src_hi);
  dq_hi = _mm_mullo_epi16(q_hi, dequant);

  store_tran_low(q_lo, qcoeff_ptr);
  store_tran_low(q_hi, qcoeff_ptr + 8);
  store_tran_low(dq_lo, dqcoeff_ptr);
  store_tran_low(dq_hi, dqcoeff_ptr + 8);

  // The end-of-block scan runs on the dequantized values.
  eob_max = scan_for_eob(&dq_lo, &dq_hi, iscan, 0, zero);

  // A group whose magnitudes are all <= dequant / 2 is written out as zeros
  // without being quantized.
  skip_thr = _mm_srai_epi16(dequant, 1);

  // Remaining groups contain AC coefficients only.
  for (pos = 16; pos < n_coeffs; pos += 16) {
    const tran_low_t *const src = coeff_ptr + pos;
    tran_low_t *const q_dst = qcoeff_ptr + pos;
    tran_low_t *const dq_dst = dqcoeff_ptr + pos;
    __m128i eob_group;
    int above_thr;

    src_lo = load_tran_low(src);
    src_hi = load_tran_low(src + 8);

    q_lo = _mm_abs_epi16(src_lo);
    q_hi = _mm_abs_epi16(src_hi);

    above_thr = _mm_movemask_epi8(_mm_cmpgt_epi16(q_lo, skip_thr)) |
                _mm_movemask_epi8(_mm_cmpgt_epi16(q_hi, skip_thr));

    if (!above_thr) {
      // Nothing exceeds the threshold: emit zeros and leave eob untouched.
      store_zero_tran_low(q_dst);
      store_zero_tran_low(q_dst + 8);

      store_zero_tran_low(dq_dst);
      store_zero_tran_low(dq_dst + 8);
      continue;
    }

    q_lo = _mm_mulhi_epi16(_mm_adds_epi16(q_lo, round), quant);
    q_hi = _mm_mulhi_epi16(_mm_adds_epi16(q_hi, round), quant);

    // Put the source signs back.
    q_lo = _mm_sign_epi16(q_lo, src_lo);
    q_hi = _mm_sign_epi16(q_hi, src_hi);

    store_tran_low(q_lo, q_dst);
    store_tran_low(q_hi, q_dst + 8);

    dq_lo = _mm_mullo_epi16(q_lo, dequant);
    dq_hi = _mm_mullo_epi16(q_hi, dequant);

    store_tran_low(dq_lo, dq_dst);
    store_tran_low(dq_hi, dq_dst + 8);

    eob_group = scan_for_eob(&dq_lo, &dq_hi, iscan, pos, zero);
    eob_max = _mm_max_epi16(eob_max, eob_group);
  }

  *eob_ptr = accumulate_eob(eob_max);
}
121
// SSSE3 implementation of the VP9 "fp" quantizer for 32x32 transforms.
//
// Same data flow as the non-32x32 variant (quantized values to qcoeff_ptr,
// dequantized values to dqcoeff_ptr, end-of-block value to *eob_ptr), with
// these differences visible below:
//   - round is replaced by (round + 1) >> 1,
//   - quant is doubled before the high-half multiply,
//   - the dequantized magnitude is halved before the sign is restored,
//   - the skip threshold is dequant >> 2 instead of dequant >> 1.
//
// The first group of 16 is processed unconditionally and the loop advances 16
// entries at a time. Presumably callers pass buffers sized accordingly --
// TODO confirm.
void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                 const struct macroblock_plane *const mb_plane,
                                 tran_low_t *qcoeff_ptr,
                                 tran_low_t *dqcoeff_ptr,
                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                 const struct ScanOrder *const scan_order) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one_s16 = _mm_set1_epi16(1);
  const int16_t *const iscan = scan_order->iscan;
  __m128i round, quant, dequant;
  __m128i src_lo, src_hi;
  __m128i q_lo, q_hi;
  __m128i dq_lo, dq_hi;
  __m128i eob_max, skip_thr;
  int pos;

  // Fetch the rounding, quantizer and dequantizer vectors.
  load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);

  // 32x32 uses half the rounding value, rounded up.
  round = _mm_srli_epi16(_mm_add_epi16(round, one_s16), 1);

  // NOTE(review): per the original comment, the 16x16 path shifts the product
  // by 16 while the 32x32 path shifts by 15. Doubling quant lets the same
  // high-half multiply (pmulhw) be used here.
  quant = _mm_slli_epi16(quant, 1);

  // First group: the DC coefficient plus the first 15 AC coefficients.
  src_lo = load_tran_low(coeff_ptr);
  src_hi = load_tran_low(coeff_ptr + 8);

  // Lower eight lanes use the vectors as they stand now.
  q_lo = _mm_adds_epi16(_mm_abs_epi16(src_lo), round);
  q_lo = _mm_mulhi_epi16(q_lo, quant);
  q_lo = _mm_sign_epi16(q_lo, src_lo);

  // From here on every lane uses the values from the upper 64 bits.
  round = _mm_unpackhi_epi64(round, round);
  quant = _mm_unpackhi_epi64(quant, quant);

  q_hi = _mm_adds_epi16(_mm_abs_epi16(src_hi), round);
  q_hi = _mm_mulhi_epi16(q_hi, quant);
  q_hi = _mm_sign_epi16(q_hi, src_hi);

  store_tran_low(q_lo, qcoeff_ptr);
  store_tran_low(q_hi, qcoeff_ptr + 8);

  // Dequantize on magnitudes so the divide by two can be a logical shift,
  // then restore the sign of the source coefficient.
  dq_lo = _mm_mullo_epi16(_mm_abs_epi16(q_lo), dequant);
  dequant = _mm_unpackhi_epi64(dequant, dequant);
  dq_hi = _mm_mullo_epi16(_mm_abs_epi16(q_hi), dequant);

  dq_lo = _mm_sign_epi16(_mm_srli_epi16(dq_lo, 1), src_lo);
  dq_hi = _mm_sign_epi16(_mm_srli_epi16(dq_hi, 1), src_hi);

  store_tran_low(dq_lo, dqcoeff_ptr);
  store_tran_low(dq_hi, dqcoeff_ptr + 8);

  // The end-of-block scan runs on the dequantized values.
  eob_max = scan_for_eob(&dq_lo, &dq_hi, iscan, 0, zero);

  // A group whose magnitudes are all <= dequant / 4 is written out as zeros
  // without being quantized.
  skip_thr = _mm_srai_epi16(dequant, 2);

  // Remaining groups contain AC coefficients only.
  for (pos = 16; pos < n_coeffs; pos += 16) {
    const tran_low_t *const src = coeff_ptr + pos;
    tran_low_t *const q_dst = qcoeff_ptr + pos;
    tran_low_t *const dq_dst = dqcoeff_ptr + pos;
    __m128i eob_group;
    int above_thr;

    src_lo = load_tran_low(src);
    src_hi = load_tran_low(src + 8);

    q_lo = _mm_abs_epi16(src_lo);
    q_hi = _mm_abs_epi16(src_hi);

    above_thr = _mm_movemask_epi8(_mm_cmpgt_epi16(q_lo, skip_thr)) |
                _mm_movemask_epi8(_mm_cmpgt_epi16(q_hi, skip_thr));

    if (!above_thr) {
      // Nothing exceeds the threshold: emit zeros and leave eob untouched.
      store_zero_tran_low(q_dst);
      store_zero_tran_low(q_dst + 8);

      store_zero_tran_low(dq_dst);
      store_zero_tran_low(dq_dst + 8);
      continue;
    }

    q_lo = _mm_mulhi_epi16(_mm_adds_epi16(q_lo, round), quant);
    q_hi = _mm_mulhi_epi16(_mm_adds_epi16(q_hi, round), quant);

    // Put the source signs back.
    q_lo = _mm_sign_epi16(q_lo, src_lo);
    q_hi = _mm_sign_epi16(q_hi, src_hi);

    store_tran_low(q_lo, q_dst);
    store_tran_low(q_hi, q_dst + 8);

    // Magnitude * dequant, halved with a logical shift, then re-signed.
    dq_lo = _mm_mullo_epi16(_mm_abs_epi16(q_lo), dequant);
    dq_hi = _mm_mullo_epi16(_mm_abs_epi16(q_hi), dequant);

    dq_lo = _mm_sign_epi16(_mm_srli_epi16(dq_lo, 1), src_lo);
    dq_hi = _mm_sign_epi16(_mm_srli_epi16(dq_hi, 1), src_hi);

    store_tran_low(dq_lo, dq_dst);
    store_tran_low(dq_hi, dq_dst + 8);

    eob_group = scan_for_eob(&dq_lo, &dq_hi, iscan, pos, zero);
    eob_max = _mm_max_epi16(eob_max, eob_group);
  }

  *eob_ptr = accumulate_eob(eob_max);
}
253