1 /*
2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
13 #include "./vpx_config.h"
14 #include "./vp9_rtcd.h"
15
16 #include "vpx/vpx_integer.h"
17 #include "vp9/common/vp9_reconinter.h"
18 #include "vp9/encoder/vp9_context_tree.h"
19 #include "vp9/encoder/vp9_denoiser.h"
20 #include "vpx_mem/vpx_mem.h"
21
22 // Compute the sum of all pixel differences of this MB.
// Reduce all 16 signed byte lanes of the accumulator to one integer sum.
static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
#if VPX_ARCH_AARCH64
  // AArch64 has a single widening across-vector add.
  return vaddlvq_s8(v_sum_diff_total);
#else
  /* Pairwise widen-and-add down to two 64-bit halves, then combine. */
  const int16x8_t sum16 = vpaddlq_s8(v_sum_diff_total);
  const int32x4_t sum32 = vpaddlq_s16(sum16);
  const int64x2_t sum64 = vpaddlq_s32(sum32);
  const int64x1_t total = vqadd_s64(vget_high_s64(sum64), vget_low_s64(sum64));
  /* The true sum of 16 int8 values fits well inside 32 bits. */
  return vget_lane_s32(vreinterpret_s32_s64(total), 0);
#endif
}
36
37 // Denoise a 16x1 vector.
// Denoise a 16x1 vector: write the filtered pixels to running_avg_y and
// return the updated signed per-lane sum-of-adjustments accumulator.
static INLINE int8x16_t denoiser_16x1_neon(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold,
    const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment,
    const uint8x16_t v_delta_level_1_and_2,
    const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) {
  const uint8x16_t v_sig = vld1q_u8(sig);
  const uint8x16_t v_mc_avg = vld1q_u8(mc_running_avg_y);

  /* |sig - mc_avg| and per-lane masks selecting the adjustment's sign. */
  const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_avg);
  const uint8x16_t v_add_mask = vcltq_u8(v_sig, v_mc_avg);
  const uint8x16_t v_sub_mask = vcgtq_u8(v_sig, v_mc_avg);

  /* Which threshold levels does each lane's difference reach? */
  const uint8x16_t v_ge_level1 = vcleq_u8(v_level1_threshold, v_abs_diff);
  const uint8x16_t v_ge_level2 = vcleq_u8(v_level2_threshold, v_abs_diff);
  const uint8x16_t v_ge_level3 = vcleq_u8(v_level3_threshold, v_abs_diff);

  /* Stack the per-level deltas on top of the level-1 base adjustment. */
  const uint8x16_t v_adj_12 = vaddq_u8(
      v_level1_adjustment, vandq_u8(v_ge_level2, v_delta_level_1_and_2));
  const uint8x16_t v_adj_123 =
      vaddq_u8(v_adj_12, vandq_u8(v_ge_level3, v_delta_level_2_and_3));

  /* Lanes below level 1 keep the raw difference as their adjustment. */
  const uint8x16_t v_adj = vbslq_u8(v_ge_level1, v_adj_123, v_abs_diff);

  /* Split into the amounts added to / subtracted from the signal. Each
   * lane's adjustment is below eight, so the sum of 16 of them (7 * 16)
   * still fits in a signed char accumulator lane.
   */
  const uint8x16_t v_add = vandq_u8(v_add_mask, v_adj);
  const uint8x16_t v_sub = vandq_u8(v_sub_mask, v_adj);

  /* Apply the adjustment with saturating arithmetic and store. */
  const uint8x16_t v_out = vqsubq_u8(vqaddq_u8(v_sig, v_add), v_sub);
  vst1q_u8(running_avg_y, v_out);

  /* Fold this row's signed pixel change into the running total. */
  return vaddq_s8(v_sum_diff_total,
                  vqsubq_s8(vreinterpretq_s8_u8(v_add),
                            vreinterpretq_s8_u8(v_sub)));
}
99
// Second-pass weaker filter for a 16x1 vector: pull running_avg_y back
// toward sig by at most k_delta per pixel and update the accumulator.
static INLINE int8x16_t denoiser_adjust_16x1_neon(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const uint8x16_t k_delta, int8x16_t v_sum_diff_total) {
  const uint8x16_t v_sig = vld1q_u8(sig);
  const uint8x16_t v_mc_avg = vld1q_u8(mc_running_avg_y);
  uint8x16_t v_avg = vld1q_u8(running_avg_y);

  /* Sign masks, and |sig - mc_avg| clamped to k_delta as the adjustment. */
  const uint8x16_t v_pos_mask = vcltq_u8(v_sig, v_mc_avg);
  const uint8x16_t v_neg_mask = vcgtq_u8(v_sig, v_mc_avg);
  const uint8x16_t v_adj = vminq_u8(vabdq_u8(v_sig, v_mc_avg), k_delta);

  const uint8x16_t v_pos_adj = vandq_u8(v_pos_mask, v_adj);
  const uint8x16_t v_neg_adj = vandq_u8(v_neg_mask, v_adj);

  /* Undo part of the first pass: move the average back toward sig. */
  v_avg = vqsubq_u8(v_avg, v_pos_adj);
  v_avg = vqaddq_u8(v_avg, v_neg_adj);
  vst1q_u8(running_avg_y, v_avg);

  /* The correction has the opposite sign of the first-pass adjustment. */
  return vaddq_s8(v_sum_diff_total,
                  vqsubq_s8(vreinterpretq_s8_u8(v_neg_adj),
                            vreinterpretq_s8_u8(v_pos_adj)));
}
133
134 // Denoise 8x8 and 8x16 blocks.
// Denoise 8x8 and 8x16 blocks.
// Two 8-pixel rows are copied into each 16-byte scratch row so the 16-lane
// denoiser_16x1_neon kernel can be reused for 8-wide blocks. Returns
// FILTER_BLOCK if the denoised result should be kept, COPY_BLOCK otherwise.
static int vp9_denoiser_8xN_neon(const uint8_t *sig, int sig_stride,
                                 const uint8_t *mc_running_avg_y,
                                 int mc_avg_y_stride, uint8_t *running_avg_y,
                                 int avg_y_stride, int increase_denoising,
                                 BLOCK_SIZE bs, int motion_magnitude,
                                 int width) {
  int sum_diff_thresh, r, sum_diff = 0;
  // Bump the level-1 threshold/adjustment by 1 when stronger denoising is
  // requested and the block shows little motion.
  const int shift_inc =
      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
          ? 1
          : 0;
  // Scratch rows holding two packed 8-wide source rows each; 8 scratch rows
  // cover up to 16 block rows (the 8x16 case).
  uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];

  const uint8x16_t v_level1_adjustment = vmovq_n_u8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
  const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
  const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
  const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc);
  const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
  const uint8x16_t v_level3_threshold = vdupq_n_u8(16);

  // Halved because each loop iteration consumes two block rows.
  const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;

  int8x16_t v_sum_diff_total = vdupq_n_s8(0);

  for (r = 0; r < b_height; ++r) {
    // Pack row pairs (r*2, r*2+1) of each plane into one 16-byte scratch row.
    memcpy(sig_buffer[r], sig, width);
    memcpy(sig_buffer[r] + width, sig + sig_stride, width);
    memcpy(mc_running_buffer[r], mc_running_avg_y, width);
    memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
           width);
    memcpy(running_buffer[r], running_avg_y, width);
    memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
    v_sum_diff_total = denoiser_16x1_neon(
        sig_buffer[r], mc_running_buffer[r], running_buffer[r],
        v_level1_threshold, v_level2_threshold, v_level3_threshold,
        v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3,
        v_sum_diff_total);
    {
      // Unpack the filtered 16-byte scratch row back into the two
      // destination rows.
      const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
      const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer);
      const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer);
      vst1_u8(running_avg_y, v_running_buffer_low);
      vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
    }
    // Update pointers for next iteration (advance two rows).
    sig += (sig_stride << 1);
    mc_running_avg_y += (mc_avg_y_stride << 1);
    running_avg_y += (avg_y_stride << 1);
  }

  {
    sum_diff = horizontal_add_s8x16(v_sum_diff_total);
    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
    if (abs(sum_diff) > sum_diff_thresh) {
      // Before returning to copy the block (i.e., apply no denoising),
      // check if we can still apply some (weaker) temporal filtering to
      // this block, that would otherwise not be denoised at all. Simplest
      // is to apply an additional adjustment to running_avg_y to bring it
      // closer to sig. The adjustment is capped by a maximum delta, and
      // chosen such that in most cases the resulting sum_diff will be
      // within the acceptable range given by sum_diff_thresh.

      // The delta is set by the excess of absolute pixel diff over the
      // threshold.
      const int delta =
          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const uint8x16_t k_delta = vmovq_n_u8(delta);
        // Rewind to the top of the block for the second pass.
        running_avg_y -= avg_y_stride * (b_height << 1);
        for (r = 0; r < b_height; ++r) {
          // The scratch buffers still hold the first-pass data, so the
          // adjustment pass reuses them in place.
          v_sum_diff_total = denoiser_adjust_16x1_neon(
              sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta,
              v_sum_diff_total);
          {
            // Write the re-adjusted scratch row back to the two block rows.
            const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
            const uint8x8_t v_running_buffer_high =
                vget_high_u8(v_running_buffer);
            const uint8x8_t v_running_buffer_low =
                vget_low_u8(v_running_buffer);
            vst1_u8(running_avg_y, v_running_buffer_low);
            vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
          }
          // Update pointers for next iteration.
          running_avg_y += (avg_y_stride << 1);
        }
        sum_diff = horizontal_add_s8x16(v_sum_diff_total);
        if (abs(sum_diff) > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }

  return FILTER_BLOCK;
}
234
235 // Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks.
// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks.
// (BLOCK_16X8 is also routed here by vp9_denoiser_filter_neon; the r == 7
// special case below handles it.) Returns FILTER_BLOCK if the denoised
// result should be kept, COPY_BLOCK otherwise.
static int vp9_denoiser_NxM_neon(const uint8_t *sig, int sig_stride,
                                 const uint8_t *mc_running_avg_y,
                                 int mc_avg_y_stride, uint8_t *running_avg_y,
                                 int avg_y_stride, int increase_denoising,
                                 BLOCK_SIZE bs, int motion_magnitude) {
  // Bump the level-1 threshold/adjustment by 1 when stronger denoising is
  // requested and the block shows little motion.
  const int shift_inc =
      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
          ? 1
          : 0;
  const uint8x16_t v_level1_adjustment = vmovq_n_u8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
  const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
  const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
  const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
  const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
  const uint8x16_t v_level3_threshold = vdupq_n_u8(16);

  const int b_width = (4 << b_width_log2_lookup[bs]);
  const int b_height = (4 << b_height_log2_lookup[bs]);
  // Number of 16-pixel columns per row (1..4 for 16- to 64-wide blocks).
  const int b_width_shift4 = b_width >> 4;

  // One int8 accumulator per 16-wide column strip [c] per 16-row band
  // [r >> 4]; each lane's running sum stays within int8 range over a
  // 16-row band, and bands are flushed to sum_diff below.
  int8x16_t v_sum_diff_total[4][4];
  int r, c, sum_diff = 0;

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < b_width_shift4; ++c) {
      v_sum_diff_total[c][r] = vdupq_n_s8(0);
    }
  }

  for (r = 0; r < b_height; ++r) {
    for (c = 0; c < b_width_shift4; ++c) {
      v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon(
          sig, mc_running_avg_y, running_avg_y, v_level1_threshold,
          v_level2_threshold, v_level3_threshold, v_level1_adjustment,
          v_delta_level_1_and_2, v_delta_level_2_and_3,
          v_sum_diff_total[c][r >> 4]);

      // Update pointers for next iteration.
      sig += 16;
      mc_running_avg_y += 16;
      running_avg_y += 16;
    }

    // Flush the band's accumulators at the end of every 16-row band (or
    // after row 7 for the 8-row BLOCK_16X8 case) before they can overflow.
    if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
      for (c = 0; c < b_width_shift4; ++c) {
        sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
      }
    }

    // Update pointers for next iteration (back to column 0, next row).
    sig = sig - b_width + sig_stride;
    mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
    running_avg_y = running_avg_y - b_width + avg_y_stride;
  }

  {
    const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
    if (abs(sum_diff) > sum_diff_thresh) {
      // Total change exceeded the threshold: try a weaker second pass that
      // pulls running_avg_y back toward sig by at most delta per pixel,
      // sized by the per-pixel excess over the threshold.
      const int delta =
          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const uint8x16_t k_delta = vdupq_n_u8(delta);
        // Rewind all planes to the top-left of the block.
        sig -= sig_stride * b_height;
        mc_running_avg_y -= mc_avg_y_stride * b_height;
        running_avg_y -= avg_y_stride * b_height;
        sum_diff = 0;

        for (r = 0; r < b_height; ++r) {
          for (c = 0; c < b_width_shift4; ++c) {
            v_sum_diff_total[c][r >> 4] =
                denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y,
                                          k_delta, v_sum_diff_total[c][r >> 4]);

            // Update pointers for next iteration.
            sig += 16;
            mc_running_avg_y += 16;
            running_avg_y += 16;
          }
          // Same band-boundary flush as in the first pass.
          if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
            for (c = 0; c < b_width_shift4; ++c) {
              sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
            }
          }

          sig = sig - b_width + sig_stride;
          mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
          running_avg_y = running_avg_y - b_width + avg_y_stride;
        }

        if (abs(sum_diff) > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }
  return FILTER_BLOCK;
}
337
// Dispatch a denoise request to the NxM (>= 16 wide) or 8xN kernel based
// on block size; unsupported sizes fall through to COPY_BLOCK.
int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride,
                             const uint8_t *mc_avg, int mc_avg_stride,
                             uint8_t *avg, int avg_stride,
                             int increase_denoising, BLOCK_SIZE bs,
                             int motion_magnitude) {
  switch (bs) {
    case BLOCK_16X16:
    case BLOCK_32X32:
    case BLOCK_64X64:
    case BLOCK_16X32:
    case BLOCK_16X8:
    case BLOCK_32X16:
    case BLOCK_32X64:
    case BLOCK_64X32:
      return vp9_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
                                   avg_stride, increase_denoising, bs,
                                   motion_magnitude);
    case BLOCK_8X8:
    case BLOCK_8X16:
      return vp9_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
                                   avg_stride, increase_denoising, bs,
                                   motion_magnitude, 8);
    default: return COPY_BLOCK;
  }
}
357