1 /*
2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12 #include <assert.h>
13
14 #include "./vpx_config.h"
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx_dsp/arm/mem_neon.h"
17 #include "vp9/common/vp9_scan.h"
18 #include "vp9/encoder/vp9_block.h"
19
// Dequantizes eight values: each lane of |qcoeff| is multiplied by the
// matching lane of |dequant| and the eight products are written to
// |dqcoeff_ptr|.
static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
                                               const int16x8_t dequant,
                                               tran_low_t *dqcoeff_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
  // Widening multiplies: the products are stored as 32-bit lanes, four at a
  // time.
  const int16x4_t qcoeff_lo = vget_low_s16(qcoeff);
  const int16x4_t qcoeff_hi = vget_high_s16(qcoeff);
  const int16x4_t dequant_lo = vget_low_s16(dequant);
  const int16x4_t dequant_hi = vget_high_s16(dequant);

  vst1q_s32(dqcoeff_ptr, vmull_s16(qcoeff_lo, dequant_lo));
  vst1q_s32(dqcoeff_ptr + 4, vmull_s16(qcoeff_hi, dequant_hi));
#else
  // Non-widening multiply: the products are stored as 16-bit lanes.
  const int16x8_t dqcoeff = vmulq_s16(qcoeff, dequant);

  vst1q_s16(dqcoeff_ptr, dqcoeff);
#endif  // CONFIG_VP9_HIGHBITDEPTH
}
35
// Quantizes the eight coefficients at |coeff_ptr|. The quantized values are
// written to |qcoeff_ptr|, their dequantized counterparts to |dqcoeff_ptr|,
// and the quantized vector is also handed back to the caller.
static INLINE int16x8_t
quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
                tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
                const int16x8_t round, const int16x8_t quant,
                const int16x8_t quant_shift, const int16x8_t dequant) {
  // Bring in the coefficients as 8 x 16-bit lanes.
  const int16x8_t input = load_tran_low_to_s16q(coeff_ptr);
  // All ones in lanes holding a negative coefficient, zero elsewhere.
  const int16x8_t sign = vshrq_n_s16(input, 15);
  const int16x8_t magnitude = vabsq_s16(input);

  // Lanes whose magnitude reaches zbin survive; the rest are cleared below.
  const uint16x8_t keep = vcgeq_s16(magnitude, zbin);

  // Saturating add of the rounding offset.
  const int16x8_t biased = vqaddq_s16(magnitude, round);

  // vqdmulh produces (a * b * 2) >> 16, so one further shift right yields
  // (a * b) >> 16. This is done once with quant and once with quant_shift.
  const int16x8_t scaled = vshrq_n_s16(vqdmulhq_s16(biased, quant), 1);
  const int16x8_t summed = vaddq_s16(scaled, biased);
  const int16x8_t shifted = vshrq_n_s16(vqdmulhq_s16(summed, quant_shift), 1);

  // Reapply the sign: (x ^ sign) - sign negates exactly the lanes where
  // sign is all ones.
  const int16x8_t with_sign = vsubq_s16(veorq_s16(shifted, sign), sign);

  // Zero every lane that fell inside the bin.
  const int16x8_t result = vandq_s16(with_sign, vreinterpretq_s16_u16(keep));

  store_s16q_to_tran_low(qcoeff_ptr, result);
  calculate_dqcoeff_and_store(result, dequant, dqcoeff_ptr);

  return result;
}
72
// Quantizes |n_coeffs| coefficients from |coeff_ptr| in groups of eight,
// writing quantized values to |qcoeff_ptr| and dequantized values to
// |dqcoeff_ptr|. |*eob_ptr| receives the largest |scan_order->iscan| entry
// found at a position whose quantized value is nonzero, or 0 when every
// quantized value is zero.
//
// A second group of eight is always processed after the first one, and work
// proceeds in whole groups, so |n_coeffs| has to be at least 16 and a
// multiple of 8.
void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const struct macroblock_plane *const mb_plane,
                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
                         const struct ScanOrder *const scan_order) {
  const int16_t *const iscan = scan_order->iscan;
  const int16x8_t all_ones = vdupq_n_s16(-1);

  // The first group uses the eight table entries exactly as loaded; only
  // lane 0 of each vector is the DC entry.
  const int16x8_t zbin_first = vld1q_s16(mb_plane->zbin);
  const int16x8_t round_first = vld1q_s16(mb_plane->round);
  const int16x8_t quant_first = vld1q_s16(mb_plane->quant);
  const int16x8_t quant_shift_first = vld1q_s16(mb_plane->quant_shift);
  const int16x8_t dequant_first = vld1q_s16(dequant_ptr);

  // Every later group uses lane 1 of each table broadcast to all lanes.
  const int16x8_t zbin_rest = vdupq_lane_s16(vget_low_s16(zbin_first), 1);
  const int16x8_t round_rest = vdupq_lane_s16(vget_low_s16(round_first), 1);
  const int16x8_t quant_rest = vdupq_lane_s16(vget_low_s16(quant_first), 1);
  const int16x8_t quant_shift_rest =
      vdupq_lane_s16(vget_low_s16(quant_shift_first), 1);
  const int16x8_t dequant_rest =
      vdupq_lane_s16(vget_low_s16(dequant_first), 1);

  uint16x8_t eob_vec;
  intptr_t offset;

  // First eight values, which include the DC component.
  {
    const uint16x8_t scan_pos = vreinterpretq_u16_s16(vld1q_s16(iscan));
    const int16x8_t q = quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
                                        zbin_first, round_first, quant_first,
                                        quant_shift_first, dequant_first);

    // Testing against all ones gives 0xffff in every nonzero lane; masking
    // the iscan entries with it keeps only the entries of nonzero lanes.
    eob_vec = vandq_u16(vtstq_s16(q, all_ones), scan_pos);

    __builtin_prefetch(coeff_ptr + 64);
  }

  // Remaining groups. The body runs before the bound is checked.
  offset = 8;
  do {
    const uint16x8_t scan_pos =
        vreinterpretq_u16_s16(vld1q_s16(iscan + offset));
    const int16x8_t q = quantize_b_neon(
        coeff_ptr + offset, qcoeff_ptr + offset, dqcoeff_ptr + offset,
        zbin_rest, round_rest, quant_rest, quant_shift_rest, dequant_rest);

    // Running per-lane maximum of the iscan entries of nonzero lanes.
    eob_vec = vmaxq_u16(eob_vec, vandq_u16(vtstq_s16(q, all_ones), scan_pos));

    __builtin_prefetch(coeff_ptr + offset + 64);
    offset += 8;
  } while (offset < n_coeffs);

  // Reduce the eight per-lane maxima to a single value.
#if VPX_ARCH_AARCH64
  *eob_ptr = vmaxvq_u16(eob_vec);
#else
  {
    // Fold eight lanes into four, then two pairwise maxima leave the overall
    // maximum in lane 0.
    const uint16x4_t max_of_halves =
        vmax_u16(vget_low_u16(eob_vec), vget_high_u16(eob_vec));
    const uint16x4_t max_of_pairs = vpmax_u16(max_of_halves, max_of_halves);
    const uint16x4_t max_of_all = vpmax_u16(max_of_pairs, max_of_pairs);
    vst1_lane_u16(eob_ptr, max_of_all, 0);
  }
#endif  // VPX_ARCH_AARCH64
}
148
// Returns, for each 32-bit lane of |a|, 1 when the lane is negative and 0
// otherwise.
static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
  // A logical (unsigned) shift right by 31 leaves only the sign bit, in bit 0.
  const uint32x4_t raw_bits = vreinterpretq_u32_s32(a);
  const uint32x4_t sign_only = vshrq_n_u32(raw_bits, 31);
  return vreinterpretq_s32_u32(sign_only);
}
152
// Dequantizes eight values for the 32x32 path: each lane of |qcoeff| is
// multiplied by the matching lane of |dequant|, the product is halved with
// truncation toward zero, and the eight results are written to |dqcoeff_ptr|.
static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
                                                     const int16x8_t dequant,
                                                     tran_low_t *dqcoeff_ptr) {
  // Full 32-bit products of the low and high halves.
  const int32x4_t product_lo =
      vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
  const int32x4_t product_hi =
      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));

  // The C code divides by 2, which truncates toward zero. An arithmetic
  // shift rounds down instead, so negative products get 1 added first.
  const int32x4_t biased_lo = vaddq_s32(product_lo, extract_sign_bit(product_lo));
  const int32x4_t biased_hi = vaddq_s32(product_hi, extract_sign_bit(product_hi));

#if CONFIG_VP9_HIGHBITDEPTH
  vst1q_s32(dqcoeff_ptr, vshrq_n_s32(biased_lo, 1));
  vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(biased_hi, 1));
#else
  {
    // Shift and narrow to 16 bits in one step, then rejoin the two halves.
    const int16x4_t halved_lo = vshrn_n_s32(biased_lo, 1);
    const int16x4_t halved_hi = vshrn_n_s32(biased_hi, 1);
    vst1q_s16(dqcoeff_ptr, vcombine_s16(halved_lo, halved_hi));
  }
#endif  // CONFIG_VP9_HIGHBITDEPTH
}
174
// Quantizes the eight coefficients at |coeff_ptr| for the 32x32 path. The
// quantized values are written to |qcoeff_ptr|, their dequantized
// counterparts to |dqcoeff_ptr|, and the quantized vector is also handed
// back to the caller.
static INLINE int16x8_t
quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
                      tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
                      const int16x8_t round, const int16x8_t quant,
                      const int16x8_t quant_shift, const int16x8_t dequant) {
  // Bring in the coefficients as 8 x 16-bit lanes.
  const int16x8_t input = load_tran_low_to_s16q(coeff_ptr);
  // All ones in lanes holding a negative coefficient, zero elsewhere.
  const int16x8_t sign = vshrq_n_s16(input, 15);
  const int16x8_t magnitude = vabsq_s16(input);

  // Lanes whose magnitude reaches zbin survive; the rest are cleared below.
  const uint16x8_t keep = vcgeq_s16(magnitude, zbin);

  // Saturating add of the rounding offset.
  const int16x8_t biased = vqaddq_s16(magnitude, round);

  // vqdmulh produces (a * b * 2) >> 16; one further shift right yields
  // (biased * quant) >> 16.
  const int16x8_t scaled = vshrq_n_s16(vqdmulhq_s16(biased, quant), 1);
  const int16x8_t summed = vaddq_s16(scaled, biased);

  // No extra shift here: (summed * quant_shift * 2) >> 16 is the same as
  // (summed * quant_shift) >> 15.
  const int16x8_t shifted = vqdmulhq_s16(summed, quant_shift);

  // Reapply the sign: (x ^ sign) - sign negates exactly the lanes where
  // sign is all ones.
  const int16x8_t with_sign = vsubq_s16(veorq_s16(shifted, sign), sign);

  // Zero every lane that fell inside the bin.
  const int16x8_t result = vandq_s16(with_sign, vreinterpretq_s16_u16(keep));

  store_s16q_to_tran_low(qcoeff_ptr, result);
  calculate_dqcoeff_and_store_32x32(result, dequant, dqcoeff_ptr);

  return result;
}
211
// 32x32 variant of the block quantizer: always handles 32 * 32 coefficients,
// in groups of eight. Compared with the generic version, the zbin and round
// tables are halved (with rounding) before use, and the dequantized values
// are divided by 2 without rounding. |*eob_ptr| receives the largest
// |scan_order->iscan| entry found at a position whose quantized value is
// nonzero, or 0 when every quantized value is zero.
void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr,
                               const struct macroblock_plane *mb_plane,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
                               const struct ScanOrder *const scan_order) {
  const int16_t *const iscan = scan_order->iscan;
  const int16x8_t all_ones = vdupq_n_s16(-1);

  // The first group uses the eight table entries as loaded (zbin and round
  // after a rounding shift right by one); only lane 0 is the DC entry.
  const int16x8_t zbin_first = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1);
  const int16x8_t round_first = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1);
  const int16x8_t quant_first = vld1q_s16(mb_plane->quant);
  const int16x8_t quant_shift_first = vld1q_s16(mb_plane->quant_shift);
  const int16x8_t dequant_first = vld1q_s16(dequant_ptr);

  // Every later group uses lane 1 of each vector broadcast to all lanes.
  const int16x8_t zbin_rest = vdupq_lane_s16(vget_low_s16(zbin_first), 1);
  const int16x8_t round_rest = vdupq_lane_s16(vget_low_s16(round_first), 1);
  const int16x8_t quant_rest = vdupq_lane_s16(vget_low_s16(quant_first), 1);
  const int16x8_t quant_shift_rest =
      vdupq_lane_s16(vget_low_s16(quant_shift_first), 1);
  const int16x8_t dequant_rest =
      vdupq_lane_s16(vget_low_s16(dequant_first), 1);

  uint16x8_t eob_vec;
  int offset;

  // First eight values, which include the DC component.
  {
    const uint16x8_t scan_pos = vreinterpretq_u16_s16(vld1q_s16(iscan));
    const int16x8_t q = quantize_b_32x32_neon(
        coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin_first, round_first,
        quant_first, quant_shift_first, dequant_first);

    // Testing against all ones gives 0xffff in every nonzero lane; masking
    // the iscan entries with it keeps only the entries of nonzero lanes.
    eob_vec = vandq_u16(vtstq_s16(q, all_ones), scan_pos);

    __builtin_prefetch(coeff_ptr + 64);
  }

  // Remaining 32 * 32 / 8 - 1 groups.
  for (offset = 8; offset < 32 * 32; offset += 8) {
    const uint16x8_t scan_pos =
        vreinterpretq_u16_s16(vld1q_s16(iscan + offset));
    const int16x8_t q = quantize_b_32x32_neon(
        coeff_ptr + offset, qcoeff_ptr + offset, dqcoeff_ptr + offset,
        zbin_rest, round_rest, quant_rest, quant_shift_rest, dequant_rest);

    // Running per-lane maximum of the iscan entries of nonzero lanes.
    eob_vec = vmaxq_u16(eob_vec, vandq_u16(vtstq_s16(q, all_ones), scan_pos));

    __builtin_prefetch(coeff_ptr + offset + 64);
  }

  // Reduce the eight per-lane maxima to a single value.
#if VPX_ARCH_AARCH64
  *eob_ptr = vmaxvq_u16(eob_vec);
#else
  {
    // Fold eight lanes into four, then two pairwise maxima leave the overall
    // maximum in lane 0.
    const uint16x4_t max_of_halves =
        vmax_u16(vget_low_u16(eob_vec), vget_high_u16(eob_vec));
    const uint16x4_t max_of_pairs = vpmax_u16(max_of_halves, max_of_halves);
    const uint16x4_t max_of_all = vpmax_u16(max_of_pairs, max_of_pairs);
    vst1_lane_u16(eob_ptr, max_of_all, 0);
  }
#endif  // VPX_ARCH_AARCH64
}
287