/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "av1/common/cdef_block.h"
void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride,
                                        const uint8_t *src, int sstride,
                                        int width, int height) {
  do {
    const uint8_t *src_ptr = src;
    uint16_t *dst_ptr = dst;

    int w = 0;
    while (width - w >= 16) {
      uint8x16_t row = vld1q_u8(src_ptr + w);
      uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } };
      vst2q_u8((uint8_t *)(dst_ptr + w), row_u16);

      w += 16;
    }
    if (width - w >= 8) {
      uint8x8_t row = vld1_u8(src_ptr + w);
      vst1q_u16(dst_ptr + w, vmovl_u8(row));
      w += 8;
    }
    if (width - w == 4) {
      for (int i = w; i < w + 4; i++) {
        dst_ptr[i] = src_ptr[i];
      }
    }

    src += sstride;
    dst += dstride;
  } while (--height != 0);
}
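
// For reference, the loops above vectorize this scalar copy (a sketch, not
// part of the library):
//
//   for (int y = 0; y < height; y++)
//     for (int x = 0; x < width; x++)
//       dst[y * dstride + x] = src[y * sstride + x];
//
// The 16-pixel path leans on vst2q_u8 interleaving each source byte with a
// zero byte, which on the little-endian targets this code assumes is exactly
// a widening u8 -> u16 store.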

#if CONFIG_AV1_HIGHBITDEPTH
void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride,
                                         const uint16_t *src, int sstride,
                                         int width, int height) {
  do {
    const uint16_t *src_ptr = src;
    uint16_t *dst_ptr = dst;

    int w = 0;
    while (width - w >= 8) {
      uint16x8_t row = vld1q_u16(src_ptr + w);
      vst1q_u16(dst_ptr + w, row);

      w += 8;
    }
    if (width - w == 4) {
      uint16x4_t row = vld1_u16(src_ptr + w);
      vst1_u16(dst_ptr + w, row);
    }

    src += sstride;
    dst += dstride;
  } while (--height != 0);
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

// partial A is a 16-bit vector of the form:
// [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
// [0  y1 y2 y3 y4 y5 y6 y7].
// This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
// (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
// and const2.
static inline uint32x4_t fold_mul_and_sum_neon(int16x8_t partiala,
                                               int16x8_t partialb,
                                               uint32x4_t const1,
                                               uint32x4_t const2) {
  // Reverse partial B.
  // pattern = { 12 13 10 11 8 9 6 7 4 5 2 3 0 1 14 15 }.
  uint8x16_t pattern = vreinterpretq_u8_u64(
      vcombine_u64(vcreate_u64((uint64_t)0x07060908 << 32 | 0x0b0a0d0c),
                   vcreate_u64((uint64_t)0x0f0e0100 << 32 | 0x03020504)));

#if AOM_ARCH_AARCH64
  partialb =
      vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialb), pattern));
#else
  int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialb)),
                     vget_high_s8(vreinterpretq_s8_s16(partialb)) } };
  int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
  int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
  partialb = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
#endif

  // Square and add the corresponding x and y values.
  int32x4_t cost_lo = vmull_s16(vget_low_s16(partiala), vget_low_s16(partiala));
  cost_lo = vmlal_s16(cost_lo, vget_low_s16(partialb), vget_low_s16(partialb));
  int32x4_t cost_hi =
      vmull_s16(vget_high_s16(partiala), vget_high_s16(partiala));
  cost_hi =
      vmlal_s16(cost_hi, vget_high_s16(partialb), vget_high_s16(partialb));

  // Multiply by constant.
  uint32x4_t cost = vmulq_u32(vreinterpretq_u32_s32(cost_lo), const1);
  cost = vmlaq_u32(cost, vreinterpretq_u32_s32(cost_hi), const2);
  return cost;
}
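
// Scalar sketch of the fold (x, y and C are hypothetical arrays mirroring the
// lanes described above); after the caller's final horizontal add the folded
// value is:
//
//   uint32_t cost = 0;
//   for (int i = 1; i <= 8; i++)
//     cost += (x[i] * x[i] + y[i] * y[i]) * C[i];  // y[8] == 0
//
// with C1..C8 supplied through the lanes of const1 and const2.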

// This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal
// down-right, 6 is vertical).
//
// For each direction the lines are shifted so that we can perform a
// basic sum on each vector element. For example, direction 5 is "south by
// southeast", so we need to add the pixels along each line i below:
//
// 0  1 2 3 4 5 6 7
// 0  1 2 3 4 5 6 7
// 8  0 1 2 3 4 5 6
// 8  0 1 2 3 4 5 6
// 9  8 0 1 2 3 4 5
// 9  8 0 1 2 3 4 5
// 10 9 8 0 1 2 3 4
// 10 9 8 0 1 2 3 4
//
// For this to fit nicely in vectors, the lines need to be shifted like so:
//        0 1 2 3 4 5 6 7
//        0 1 2 3 4 5 6 7
//      8 0 1 2 3 4 5 6
//      8 0 1 2 3 4 5 6
//    9 8 0 1 2 3 4 5
//    9 8 0 1 2 3 4 5
// 10 9 8 0 1 2 3 4
// 10 9 8 0 1 2 3 4
//
// In this configuration we can now perform SIMD additions to get the cost
// along direction 5. Since this won't fit into a single 128-bit vector, we use
// two of them to compute each half of the new configuration, and pad the empty
// spaces with zeros. Similar shifting is done for other directions, except
// direction 6 which is straightforward as it's the vertical direction.
static inline uint32x4_t compute_vert_directions_neon(int16x8_t lines[8],
                                                      uint32_t cost[4]) {
  const int16x8_t zero = vdupq_n_s16(0);

  // Partial sums for lines 0 and 1.
  int16x8_t partial4a = vextq_s16(zero, lines[0], 1);
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[1], 2));
  int16x8_t partial4b = vextq_s16(lines[0], zero, 1);
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[1], zero, 2));
  int16x8_t tmp = vaddq_s16(lines[0], lines[1]);
  int16x8_t partial5a = vextq_s16(zero, tmp, 3);
  int16x8_t partial5b = vextq_s16(tmp, zero, 3);
  int16x8_t partial7a = vextq_s16(zero, tmp, 6);
  int16x8_t partial7b = vextq_s16(tmp, zero, 6);
  int16x8_t partial6 = tmp;

  // Partial sums for lines 2 and 3.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[2], 3));
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[3], 4));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[2], zero, 3));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[3], zero, 4));
  tmp = vaddq_s16(lines[2], lines[3]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 4));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 4));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 5));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 5));
  partial6 = vaddq_s16(partial6, tmp);

  // Partial sums for lines 4 and 5.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[4], 5));
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[5], 6));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[4], zero, 5));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[5], zero, 6));
  tmp = vaddq_s16(lines[4], lines[5]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 5));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 5));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 4));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 4));
  partial6 = vaddq_s16(partial6, tmp);

  // Partial sums for lines 6 and 7.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[6], 7));
  partial4a = vaddq_s16(partial4a, lines[7]);
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[6], zero, 7));
  tmp = vaddq_s16(lines[6], lines[7]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 6));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 6));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 3));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 3));
  partial6 = vaddq_s16(partial6, tmp);

  uint32x4_t const0 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
                   vcreate_u64((uint64_t)210 << 32 | 280)));
  uint32x4_t const1 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
                   vcreate_u64((uint64_t)105 << 32 | 120)));
  uint32x4_t const2 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64(0), vcreate_u64((uint64_t)210 << 32 | 420)));
  uint32x4_t const3 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)105 << 32 | 140),
                   vcreate_u64((uint64_t)105 << 32 | 105)));

  // Compute costs in terms of partial sums.
  int32x4_t partial6_s32 =
      vmull_s16(vget_low_s16(partial6), vget_low_s16(partial6));
  partial6_s32 =
      vmlal_s16(partial6_s32, vget_high_s16(partial6), vget_high_s16(partial6));

  uint32x4_t costs[4];
  costs[0] = fold_mul_and_sum_neon(partial4a, partial4b, const0, const1);
  costs[1] = fold_mul_and_sum_neon(partial5a, partial5b, const2, const3);
  costs[2] = vmulq_n_u32(vreinterpretq_u32_s32(partial6_s32), 105);
  costs[3] = fold_mul_and_sum_neon(partial7a, partial7b, const2, const3);

  costs[0] = horizontal_add_4d_u32x4(costs);
  vst1q_u32(cost, costs[0]);
  return costs[0];
}
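
// Note on the constants above: each lane holds a weight of the form 840 / n,
// where n is the number of pixels summed in the corresponding partial line
// (840, 420, 280, 210, 168, 140, 120, 105 for n = 1..8), matching the
// div_table of the C reference implementation. For the purely vertical
// direction 6 every column sum spans all 8 rows, so the cost reduces to the
// scalar expression
//
//   cost[6] = 105 * sum_{x=0..7} (sum_{y=0..7} lines[y][x])^2
//
// which is what costs[2] computes via vmulq_n_u32(..., 105).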

static inline uint32x4_t fold_mul_and_sum_pairwise_neon(int16x8_t partiala,
                                                        int16x8_t partialb,
                                                        int16x8_t partialc,
                                                        uint32x4_t const0) {
  // Reverse partial c.
  // pattern = { 10 11 8 9 6 7 4 5 2 3 0 1 12 13 14 15 }.
  uint8x16_t pattern = vreinterpretq_u8_u64(
      vcombine_u64(vcreate_u64((uint64_t)0x05040706 << 32 | 0x09080b0a),
                   vcreate_u64((uint64_t)0x0f0e0d0c << 32 | 0x01000302)));

#if AOM_ARCH_AARCH64
  partialc =
      vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialc), pattern));
#else
  int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialc)),
                     vget_high_s8(vreinterpretq_s8_s16(partialc)) } };
  int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
  int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
  partialc = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
#endif

  int32x4_t partiala_s32 = vpaddlq_s16(partiala);
  int32x4_t partialb_s32 = vpaddlq_s16(partialb);
  int32x4_t partialc_s32 = vpaddlq_s16(partialc);

  partiala_s32 = vmulq_s32(partiala_s32, partiala_s32);
  partialb_s32 = vmulq_s32(partialb_s32, partialb_s32);
  partialc_s32 = vmulq_s32(partialc_s32, partialc_s32);

  partiala_s32 = vaddq_s32(partiala_s32, partialc_s32);

  uint32x4_t cost = vmulq_n_u32(vreinterpretq_u32_s32(partialb_s32), 105);
  cost = vmlaq_u32(cost, vreinterpretq_u32_s32(partiala_s32), const0);
  return cost;
}
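
// A scalar sketch of the pairwise variant (illustrative only): adjacent
// 16-bit partial sums are first combined by vpaddlq_s16, then per widened
// lane i
//
//   cost[i] = 105 * b[i]^2 + (a[i]^2 + c[i]^2) * const0[i]
//
// with partialc reversed beforehand so its lanes line up with partiala's.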

// This function computes the cost along directions 0, 1, 2, 3. (0 means
// 45-degree up-right, 2 is horizontal).
//
// For directions 1 and 3 ("east northeast" and "east southeast") the shifted
// lines need three vectors instead of two. For direction 1, for example, we
// need to compute the sums along the line i below:
// 0 0 1 1 2 2 3  3
// 1 1 2 2 3 3 4  4
// 2 2 3 3 4 4 5  5
// 3 3 4 4 5 5 6  6
// 4 4 5 5 6 6 7  7
// 5 5 6 6 7 7 8  8
// 6 6 7 7 8 8 9  9
// 7 7 8 8 9 9 10 10
//
// Which means we need the following configuration:
// 0 0 1 1 2 2 3 3
//     1 1 2 2 3 3 4 4
//         2 2 3 3 4 4 5 5
//             3 3 4 4 5 5 6 6
//                 4 4 5 5 6 6 7 7
//                     5 5 6 6 7 7 8 8
//                         6 6 7 7 8 8 9 9
//                             7 7 8 8 9 9 10 10
//
// Three vectors are needed to compute this, as well as some extra pairwise
// additions.
static uint32x4_t compute_horiz_directions_neon(int16x8_t lines[8],
                                                uint32_t cost[4]) {
  const int16x8_t zero = vdupq_n_s16(0);

  // Compute diagonal directions (0, 1, 3).
  // Partial sums for lines 0 and 1.
  int16x8_t partial0a = lines[0];
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[1], 7));
  int16x8_t partial0b = vextq_s16(lines[1], zero, 7);
  int16x8_t partial1a = vaddq_s16(lines[0], vextq_s16(zero, lines[1], 6));
  int16x8_t partial1b = vextq_s16(lines[1], zero, 6);
  int16x8_t partial3a = vextq_s16(lines[0], zero, 2);
  partial3a = vaddq_s16(partial3a, vextq_s16(lines[1], zero, 4));
  int16x8_t partial3b = vextq_s16(zero, lines[0], 2);
  partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[1], 4));

  // Partial sums for lines 2 and 3.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[2], 6));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[3], 5));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[2], zero, 6));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[3], zero, 5));
  partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[2], 4));
  partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[3], 2));
  partial1b = vaddq_s16(partial1b, vextq_s16(lines[2], zero, 4));
  partial1b = vaddq_s16(partial1b, vextq_s16(lines[3], zero, 2));
  partial3a = vaddq_s16(partial3a, vextq_s16(lines[2], zero, 6));
  partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[2], 6));
  partial3b = vaddq_s16(partial3b, lines[3]);

  // Partial sums for lines 4 and 5.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[4], 4));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[5], 3));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[4], zero, 4));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[5], zero, 3));
  partial1b = vaddq_s16(partial1b, lines[4]);
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[5], 6));
  int16x8_t partial1c = vextq_s16(lines[5], zero, 6);
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[4], zero, 2));
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[5], zero, 4));
  int16x8_t partial3c = vextq_s16(zero, lines[4], 2);
  partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[5], 4));

  // Partial sums for lines 6 and 7.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[6], 2));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[7], 1));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[6], zero, 2));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[7], zero, 1));
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[6], 4));
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[7], 2));
  partial1c = vaddq_s16(partial1c, vextq_s16(lines[6], zero, 4));
  partial1c = vaddq_s16(partial1c, vextq_s16(lines[7], zero, 2));
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[6], zero, 6));
  partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[6], 6));
  partial3c = vaddq_s16(partial3c, lines[7]);

  // Special case for direction 2 as it's just a sum along each line.
  int16x8_t lines03[4] = { lines[0], lines[1], lines[2], lines[3] };
  int16x8_t lines47[4] = { lines[4], lines[5], lines[6], lines[7] };
  int32x4_t partial2a = horizontal_add_4d_s16x8(lines03);
  int32x4_t partial2b = horizontal_add_4d_s16x8(lines47);

  uint32x4_t partial2a_u32 =
      vreinterpretq_u32_s32(vmulq_s32(partial2a, partial2a));
  uint32x4_t partial2b_u32 =
      vreinterpretq_u32_s32(vmulq_s32(partial2b, partial2b));

  uint32x4_t const0 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
                   vcreate_u64((uint64_t)210 << 32 | 280)));
  uint32x4_t const1 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
                   vcreate_u64((uint64_t)105 << 32 | 120)));
  uint32x4_t const2 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)210 << 32 | 420),
                   vcreate_u64((uint64_t)105 << 32 | 140)));

  uint32x4_t costs[4];
  costs[0] = fold_mul_and_sum_neon(partial0a, partial0b, const0, const1);
  costs[1] =
      fold_mul_and_sum_pairwise_neon(partial1a, partial1b, partial1c, const2);
  costs[2] = vaddq_u32(partial2a_u32, partial2b_u32);
  costs[2] = vmulq_n_u32(costs[2], 105);
  costs[3] =
      fold_mul_and_sum_pairwise_neon(partial3c, partial3b, partial3a, const2);

  costs[0] = horizontal_add_4d_u32x4(costs);
  vst1q_u32(cost, costs[0]);
  return costs[0];
}
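
// As with direction 6 in compute_vert_directions_neon, direction 2
// (horizontal) needs no shifting: each of its lines is a full 8-pixel row, so
// its cost is simply 105 * sum over rows of (row sum)^2, assembled here from
// the two horizontal_add_4d_s16x8 results.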

int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var,
                       int coeff_shift) {
  uint32_t cost[8];
  uint32_t best_cost = 0;
  int best_dir = 0;
  int16x8_t lines[8];
  for (int i = 0; i < 8; i++) {
    uint16x8_t s = vld1q_u16(&img[i * stride]);
    lines[i] = vreinterpretq_s16_u16(
        vsubq_u16(vshlq_u16(s, vdupq_n_s16(-coeff_shift)), vdupq_n_u16(128)));
  }

  // Compute "mostly vertical" directions.
  uint32x4_t cost47 = compute_vert_directions_neon(lines, cost + 4);

  // Compute "mostly horizontal" directions.
  uint32x4_t cost03 = compute_horiz_directions_neon(lines, cost);

  // Find the max cost as well as its index to get best_dir.
  // The max cost needs to be propagated in the whole vector to find its
  // position in the original cost vectors cost03 and cost47.
  uint32x4_t cost07 = vmaxq_u32(cost03, cost47);
#if AOM_ARCH_AARCH64
  best_cost = vmaxvq_u32(cost07);
  uint32x4_t max_cost = vdupq_n_u32(best_cost);
  uint8x16x2_t costs = { { vreinterpretq_u8_u32(vceqq_u32(max_cost, cost03)),
                           vreinterpretq_u8_u32(
                               vceqq_u32(max_cost, cost47)) } };
  // idx = { 28, 24, 20, 16, 12, 8, 4, 0 };
  uint8x8_t idx = vreinterpret_u8_u64(vcreate_u64(0x0004080c1014181cULL));
  // Get the lowest 8 bits of each 32-bit element and reverse their order.
  uint8x8_t tbl = vqtbl2_u8(costs, idx);
  uint64_t a = vget_lane_u64(vreinterpret_u64_u8(tbl), 0);
  best_dir = aom_clzll(a) >> 3;
#else
  uint32x2_t cost64 = vpmax_u32(vget_low_u32(cost07), vget_high_u32(cost07));
  cost64 = vpmax_u32(cost64, cost64);
  uint32x4_t max_cost = vcombine_u32(cost64, cost64);
  best_cost = vget_lane_u32(cost64, 0);
  uint16x8_t costs = vcombine_u16(vmovn_u32(vceqq_u32(max_cost, cost03)),
                                  vmovn_u32(vceqq_u32(max_cost, cost47)));
  uint8x8_t idx =
      vand_u8(vmovn_u16(costs),
              vreinterpret_u8_u64(vcreate_u64(0x8040201008040201ULL)));
  int sum = horizontal_add_u8x8(idx);
  best_dir = get_msb(sum ^ (sum - 1));
#endif

  // Difference between the optimal variance and the variance along the
  // orthogonal direction. Again, the sum(x^2) terms cancel out.
  *var = best_cost - cost[(best_dir + 4) & 7];
  // We'd normally divide by 840, but dividing by 1024 is close enough
  // for what we're going to do with this.
  *var >>= 10;
  return best_dir;
}
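
// How best_dir falls out of the comparison masks: every lane of cost03/cost47
// equal to the maximum becomes all-ones. On AArch64 the table lookup gathers
// one byte per direction with direction 0 ending up in the most significant
// byte, so counting leading zero bytes with aom_clzll returns the smallest
// matching direction. On Armv7 each matching direction sets one bit of `sum`
// instead; e.g. if directions 2 and 5 tie, sum == 0b00100100,
// sum ^ (sum - 1) == 0b00000111, and get_msb of that yields 2, again the
// smallest matching direction.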

void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2,
                             int stride, int32_t *var_out_1st,
                             int32_t *var_out_2nd, int coeff_shift,
                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
  // Process first 8x8.
  *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);

  // Process second 8x8.
  *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
}

// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
static inline int16x8_t constrain16(uint16x8_t a, uint16x8_t b,
                                    unsigned int threshold, int adjdamp) {
  uint16x8_t diff = vabdq_u16(a, b);
  const uint16x8_t a_gt_b = vcgtq_u16(a, b);
  const uint16x8_t s = vqsubq_u16(vdupq_n_u16(threshold),
                                  vshlq_u16(diff, vdupq_n_s16(-adjdamp)));
  const int16x8_t clip = vreinterpretq_s16_u16(vminq_u16(diff, s));
  return vbslq_s16(a_gt_b, clip, vnegq_s16(clip));
}
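
// The saturating vqsubq_u16 provides the max(0, ...) term for free: an
// unsigned subtraction that would underflow clamps to zero. A scalar sketch
// of the same computation:
//
//   int diff = abs(a - b);
//   int clipped = min(diff, max(0, threshold - (diff >> adjdamp)));
//   return (a > b) ? clipped : -clipped;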

static inline void primary_filter(uint16x8_t s, uint16x8_t tap[4],
                                  const int *pri_taps, int pri_strength,
                                  int pri_damping, int16x8_t *sum) {
  // Near taps
  int16x8_t n0 = constrain16(tap[0], s, pri_strength, pri_damping);
  int16x8_t n1 = constrain16(tap[1], s, pri_strength, pri_damping);
  // sum += pri_taps[0] * (n0 + n1)
  n0 = vaddq_s16(n0, n1);
  *sum = vmlaq_n_s16(*sum, n0, pri_taps[0]);

  // Far taps
  int16x8_t f0 = constrain16(tap[2], s, pri_strength, pri_damping);
  int16x8_t f1 = constrain16(tap[3], s, pri_strength, pri_damping);
  // sum += pri_taps[1] * (f0 + f1)
  f0 = vaddq_s16(f0, f1);
  *sum = vmlaq_n_s16(*sum, f0, pri_taps[1]);
}
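
// The tap weights are defined in cdef_block.c: cdef_pri_taps selects
// { 4, 2 } or { 3, 3 } based on the low bit of the shifted primary strength,
// and cdef_sec_taps is { 2, 1 } (values as of this revision; the tables there
// are authoritative).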

static inline void secondary_filter(uint16x8_t s, uint16x8_t tap[8],
                                    const int *sec_taps, int sec_strength,
                                    int sec_damping, int16x8_t *sum) {
  // Near taps
  int16x8_t s0 = constrain16(tap[0], s, sec_strength, sec_damping);
  int16x8_t s1 = constrain16(tap[1], s, sec_strength, sec_damping);
  int16x8_t s2 = constrain16(tap[2], s, sec_strength, sec_damping);
  int16x8_t s3 = constrain16(tap[3], s, sec_strength, sec_damping);

  // sum += sec_taps[0] * (s0 + s1 + s2 + s3)
  s0 = vaddq_s16(s0, s1);
  s2 = vaddq_s16(s2, s3);
  s0 = vaddq_s16(s0, s2);
  *sum = vmlaq_n_s16(*sum, s0, sec_taps[0]);

  // Far taps
  s0 = constrain16(tap[4], s, sec_strength, sec_damping);
  s1 = constrain16(tap[5], s, sec_strength, sec_damping);
  s2 = constrain16(tap[6], s, sec_strength, sec_damping);
  s3 = constrain16(tap[7], s, sec_strength, sec_damping);

  // sum += sec_taps[1] * (s0 + s1 + s2 + s3)
  s0 = vaddq_s16(s0, s1);
  s2 = vaddq_s16(s2, s3);
  s0 = vaddq_s16(s0, s2);
  *sum = vmlaq_n_s16(*sum, s0, sec_taps[1]);
}

void cdef_filter_8_0_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  uint16x8_t max, min;
  const uint16x8_t cdef_large_value_mask =
      vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = vld1q_u16(in + po1);
      pri_src[1] = vld1q_u16(in - po1);

      // Primary far taps
      pri_src[2] = vld1q_u16(in + po2);
      pri_src[3] = vld1q_u16(in - po2);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits.  The upper 8 bits contain the "large" flag.  After the final
      // primary max has been calculated, zero out the upper 8 bits.  Use this
      // to find the "16 bit" max.
      uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]),
                                     vreinterpretq_u8_u16(pri_src[1]));
      uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]),
                                     vreinterpretq_u8_u16(pri_src[3]));
      pri_max0 = vmaxq_u8(pri_max0, pri_max1);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0),
                                     cdef_large_value_mask));

      uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min0 = vminq_u16(pri_min0, pri_min1);
      min = vminq_u16(min, pri_min0);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits.  The upper 8 bits contain the "large" flag.  After the final
      // secondary max has been calculated, zero out the upper 8 bits.  Use
      // this to find the "16 bit" max.
      uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]),
                                     vreinterpretq_u8_u16(sec_src[1]));
      uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]),
                                     vreinterpretq_u8_u16(sec_src[3]));
      uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]),
                                     vreinterpretq_u8_u16(sec_src[5]));
      uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]),
                                     vreinterpretq_u8_u16(sec_src[7]));
      sec_max0 = vmaxq_u8(sec_max0, sec_max1);
      sec_max2 = vmaxq_u8(sec_max2, sec_max3);
      sec_max0 = vmaxq_u8(sec_max0, sec_max2);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0),
                                     cdef_large_value_mask));

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)),
                          vreinterpretq_s16_u16(max));

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      vst1_u8(dst8, res_u8);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits.  The upper 8 bits contain the "large" flag.  After the final
      // primary max has been calculated, zero out the upper 8 bits.  Use this
      // to find the "16 bit" max.
      uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]),
                                     vreinterpretq_u8_u16(pri_src[1]));
      uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]),
                                     vreinterpretq_u8_u16(pri_src[3]));
      pri_max0 = vmaxq_u8(pri_max0, pri_max1);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0),
                                     cdef_large_value_mask));

      uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min1 = vminq_u16(pri_min1, pri_min2);
      min = vminq_u16(min, pri_min1);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits.  The upper 8 bits contain the "large" flag.  After the final
      // secondary max has been calculated, zero out the upper 8 bits.  Use
      // this to find the "16 bit" max.
      uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]),
                                     vreinterpretq_u8_u16(sec_src[1]));
      uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]),
                                     vreinterpretq_u8_u16(sec_src[3]));
      uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]),
                                     vreinterpretq_u8_u16(sec_src[5]));
      uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]),
                                     vreinterpretq_u8_u16(sec_src[7]));
      sec_max0 = vmaxq_u8(sec_max0, sec_max1);
      sec_max2 = vmaxq_u8(sec_max2, sec_max3);
      sec_max0 = vmaxq_u8(sec_max0, sec_max2);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0),
                                     cdef_large_value_mask));

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)),
                          vreinterpretq_s16_u16(max));

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      store_u8x4_strided_x2(dst8, dstride, res_u8);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}
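
// Note on the rounding step shared by all of these kernels: vcltq_s16 yields
// all-ones (-1) in lanes where sum is negative, so the vaddq computes
// sum - (sum < 0), and vrsraq_n_s16(s, sum, 4) then adds (sum + 8) >> 4 to
// the source pixel, giving exactly s + ((8 + sum - (sum < 0)) >> 4) as in the
// C reference.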

void cdef_filter_8_1_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)sec_strength;
  (void)sec_damping;

  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t tap[4];

      // Primary near taps
      tap[0] = vld1q_u16(in + po1);
      tap[1] = vld1q_u16(in - po1);

      // Primary far taps
      tap[2] = vld1q_u16(in + po2);
      tap[3] = vld1q_u16(in - po2);

      primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      vst1_u8(dst8, res_u8);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);

  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      store_u8x4_strided_x2(dst8, dstride, res_u8);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_8_2_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)pri_strength;
  (void)pri_damping;
  (void)coeff_shift;

  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *sec_taps = cdef_sec_taps;

  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      vst1_u8(dst8, res_u8);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      store_u8x4_strided_x2(dst8, dstride, res_u8);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_8_3_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)pri_strength;
  (void)sec_strength;
  (void)dir;
  (void)pri_damping;
  (void)sec_damping;
  (void)coeff_shift;
  (void)block_width;
  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = vld1q_u16(in);
      const uint8x8_t res = vqmovn_u16(s);
      vst1_u8(dst8, res);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      const uint8x8_t res = vqmovn_u16(s);
      store_u8x4_strided_x2(dst8, dstride, res);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}
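
// cdef_filter_8_3_neon covers the pass-through case in which neither the
// primary nor the secondary filter is applied: the padded 16-bit input is
// narrowed back to 8 bits with the saturating vqmovn_u16 and stored
// unchanged.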

void cdef_filter_16_0_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  uint16x8_t max, min;
  const uint16x8_t cdef_large_value_mask =
      vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = vld1q_u16(in + po1);
      pri_src[1] = vld1q_u16(in - po1);

      // Primary far taps
      pri_src[2] = vld1q_u16(in + po2);
      pri_src[3] = vld1q_u16(in - po2);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min0 = vminq_u16(pri_min0, pri_min1);
      min = vminq_u16(min, pri_min0);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask);
      pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask);
      pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask);
      pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask);

      uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]);
      pri_max0 = vmaxq_u16(pri_max0, pri_max1);
      max = vmaxq_u16(max, pri_max0);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask);
      sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask);
      sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask);
      sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask);
      sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask);
      sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask);
      sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask);
      sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask);

      uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]);
      sec_max0 = vmaxq_u16(sec_max0, sec_max1);
      sec_max2 = vmaxq_u16(sec_max2, sec_max3);
      sec_max0 = vmaxq_u16(sec_max0, sec_max2);
      max = vmaxq_u16(max, sec_max0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
                      vreinterpretq_s16_u16(max));

      vst1q_u16(dst16, vreinterpretq_u16_s16(res));

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min1 = vminq_u16(pri_min1, pri_min2);
      min = vminq_u16(min, pri_min1);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask);
      pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask);
      pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask);
      pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask);
      uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]);
      pri_max0 = vmaxq_u16(pri_max0, pri_max1);
      max = vmaxq_u16(max, pri_max0);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask);
      sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask);
      sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask);
      sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask);
      sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask);
      sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask);
      sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask);
      sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask);

      uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]);
      sec_max0 = vmaxq_u16(sec_max0, sec_max1);
      sec_max2 = vmaxq_u16(sec_max2, sec_max3);
      sec_max0 = vmaxq_u16(sec_max0, sec_max2);
      max = vmaxq_u16(max, sec_max0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
                      vreinterpretq_s16_u16(max));

      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}
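
// Unlike the 8-bit kernels, the 16-bit path cannot fold the "large" flag away
// with a byte-wise max, so CDEF_VERY_LARGE is cleared with an explicit AND
// against cdef_large_value_mask before each max reduction (the "Convert
// CDEF_VERY_LARGE to 0" steps above).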

void cdef_filter_16_1_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  (void)sec_strength;
  (void)sec_damping;

  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t tap[4];

      // Primary near taps
      tap[0] = vld1q_u16(in + po1);
      tap[1] = vld1q_u16(in - po1);

      // Primary far taps
      tap[2] = vld1q_u16(in + po2);
      tap[3] = vld1q_u16(in - po2);

      primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      vst1q_u16(dst16, vreinterpretq_u16_s16(res));

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_2_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  (void)pri_strength;
  (void)pri_damping;
  (void)coeff_shift;

  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *sec_taps = cdef_sec_taps;

  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      vst1q_u16(dst16, vreinterpretq_u16_s16(res));

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_3_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  (void)pri_strength;
  (void)sec_strength;
  (void)dir;
  (void)pri_damping;
  (void)sec_damping;
  (void)coeff_shift;
  (void)block_width;
  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = vld1q_u16(in);
      vst1q_u16(dst16, s);

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      store_u16x4_strided_x2(dst16, dstride, s);

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}
1358