/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "av1/common/cdef_block.h"

void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride,
                                        const uint8_t *src, int sstride,
                                        int width, int height) {
  do {
    const uint8_t *src_ptr = src;
    uint16_t *dst_ptr = dst;

    int w = 0;
    while (width - w >= 16) {
      uint8x16_t row = vld1q_u8(src_ptr + w);
      uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } };
      vst2q_u8((uint8_t *)(dst_ptr + w), row_u16);

      w += 16;
    }
    if (width - w >= 8) {
      uint8x8_t row = vld1_u8(src_ptr + w);
      vst1q_u16(dst_ptr + w, vmovl_u8(row));
      w += 8;
    }
    if (width - w == 4) {
      for (int i = w; i < w + 4; i++) {
        dst_ptr[i] = src_ptr[i];
      }
    }

    src += sstride;
    dst += dstride;
  } while (--height != 0);
}
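
// A minimal scalar model of the copy above (illustrative only, not part of
// the codec). The vst2q_u8() path relies on the target being little-endian:
// interleaving each source byte with a zero byte is then equivalent to
// widening each u8 to a u16.
static inline void copy_rect8_8bit_to_16bit_scalar(uint16_t *dst, int dstride,
                                                   const uint8_t *src,
                                                   int sstride, int width,
                                                   int height) {
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++) {
      dst[y * dstride + x] = src[y * sstride + x];
    }
  }
}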

#if CONFIG_AV1_HIGHBITDEPTH
void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride,
                                         const uint16_t *src, int sstride,
                                         int width, int height) {
  do {
    const uint16_t *src_ptr = src;
    uint16_t *dst_ptr = dst;

    int w = 0;
    while (width - w >= 8) {
      uint16x8_t row = vld1q_u16(src_ptr + w);
      vst1q_u16(dst_ptr + w, row);

      w += 8;
    }
    if (width - w == 4) {
      uint16x4_t row = vld1_u16(src_ptr + w);
      vst1_u16(dst_ptr + w, row);
    }

    src += sstride;
    dst += dstride;
  } while (--height != 0);
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

// partial A is a 16-bit vector of the form:
// [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
// [0 y1 y2 y3 y4 y5 y6 y7].
// This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... +
// (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
// and const2.
static inline uint32x4_t fold_mul_and_sum_neon(int16x8_t partiala,
                                               int16x8_t partialb,
                                               uint32x4_t const1,
                                               uint32x4_t const2) {
  // Reverse partial B.
  // pattern = { 12 13 10 11 8 9 6 7 4 5 2 3 0 1 14 15 }.
  uint8x16_t pattern = vreinterpretq_u8_u64(
      vcombine_u64(vcreate_u64((uint64_t)0x07060908 << 32 | 0x0b0a0d0c),
                   vcreate_u64((uint64_t)0x0f0e0100 << 32 | 0x03020504)));

#if AOM_ARCH_AARCH64
  partialb =
      vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialb), pattern));
#else
  int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialb)),
                     vget_high_s8(vreinterpretq_s8_s16(partialb)) } };
  int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
  int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
  partialb = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
#endif

  // Square and add the corresponding x and y values.
  int32x4_t cost_lo = vmull_s16(vget_low_s16(partiala), vget_low_s16(partiala));
  cost_lo = vmlal_s16(cost_lo, vget_low_s16(partialb), vget_low_s16(partialb));
  int32x4_t cost_hi =
      vmull_s16(vget_high_s16(partiala), vget_high_s16(partiala));
  cost_hi =
      vmlal_s16(cost_hi, vget_high_s16(partialb), vget_high_s16(partialb));

  // Multiply by constant.
  uint32x4_t cost = vmulq_u32(vreinterpretq_u32_s32(cost_lo), const1);
  cost = vmlaq_u32(cost, vreinterpretq_u32_s32(cost_hi), const2);
  return cost;
}
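
// A scalar sketch (illustrative only) of what fold_mul_and_sum_neon()
// computes once its four u32 lanes are summed by the caller: with x[] and
// y[] holding the per-line partial sums and c[] the per-line weights from
// const1/const2, the total is sum_i (x_i^2 + y_i^2) * c_i. (The partial
// sums in CDEF are small enough that the intermediate int arithmetic here
// does not overflow.)
static inline uint32_t fold_mul_and_sum_scalar(const int16_t *x,
                                               const int16_t *y,
                                               const uint32_t *c) {
  uint32_t sum = 0;
  for (int i = 0; i < 8; i++) {
    sum += (uint32_t)(x[i] * x[i] + y[i] * y[i]) * c[i];
  }
  return sum;
}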

// This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal
// down-right, 6 is vertical).
//
// For each direction the lines are shifted so that we can perform a
// basic sum on each vector element. For example, direction 5 is "south by
// southeast", so we need to add the pixels along each line i below:
//
//  0 1 2 3 4 5 6 7
//  0 1 2 3 4 5 6 7
//  8 0 1 2 3 4 5 6
//  8 0 1 2 3 4 5 6
//  9 8 0 1 2 3 4 5
//  9 8 0 1 2 3 4 5
// 10 9 8 0 1 2 3 4
// 10 9 8 0 1 2 3 4
//
// For this to fit nicely in vectors, the lines need to be shifted like so:
//        0 1 2 3 4 5 6 7
//        0 1 2 3 4 5 6 7
//      8 0 1 2 3 4 5 6
//      8 0 1 2 3 4 5 6
//    9 8 0 1 2 3 4 5
//    9 8 0 1 2 3 4 5
// 10 9 8 0 1 2 3 4
// 10 9 8 0 1 2 3 4
//
// In this configuration we can now perform SIMD additions to get the cost
// along direction 5. Since this won't fit into a single 128-bit vector, we use
// two of them to compute each half of the new configuration, and pad the empty
// spaces with zeros. Similar shifting is done for other directions, except
// direction 6 which is straightforward as it's the vertical direction.
static inline uint32x4_t compute_vert_directions_neon(int16x8_t lines[8],
                                                      uint32_t cost[4]) {
  const int16x8_t zero = vdupq_n_s16(0);

  // Partial sums for lines 0 and 1.
  int16x8_t partial4a = vextq_s16(zero, lines[0], 1);
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[1], 2));
  int16x8_t partial4b = vextq_s16(lines[0], zero, 1);
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[1], zero, 2));
  int16x8_t tmp = vaddq_s16(lines[0], lines[1]);
  int16x8_t partial5a = vextq_s16(zero, tmp, 3);
  int16x8_t partial5b = vextq_s16(tmp, zero, 3);
  int16x8_t partial7a = vextq_s16(zero, tmp, 6);
  int16x8_t partial7b = vextq_s16(tmp, zero, 6);
  int16x8_t partial6 = tmp;

  // Partial sums for lines 2 and 3.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[2], 3));
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[3], 4));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[2], zero, 3));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[3], zero, 4));
  tmp = vaddq_s16(lines[2], lines[3]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 4));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 4));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 5));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 5));
  partial6 = vaddq_s16(partial6, tmp);

  // Partial sums for lines 4 and 5.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[4], 5));
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[5], 6));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[4], zero, 5));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[5], zero, 6));
  tmp = vaddq_s16(lines[4], lines[5]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 5));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 5));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 4));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 4));
  partial6 = vaddq_s16(partial6, tmp);

  // Partial sums for lines 6 and 7.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[6], 7));
  partial4a = vaddq_s16(partial4a, lines[7]);
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[6], zero, 7));
  tmp = vaddq_s16(lines[6], lines[7]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 6));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 6));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 3));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 3));
  partial6 = vaddq_s16(partial6, tmp);

  uint32x4_t const0 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
                   vcreate_u64((uint64_t)210 << 32 | 280)));
  uint32x4_t const1 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
                   vcreate_u64((uint64_t)105 << 32 | 120)));
  uint32x4_t const2 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64(0), vcreate_u64((uint64_t)210 << 32 | 420)));
  uint32x4_t const3 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)105 << 32 | 140),
                   vcreate_u64((uint64_t)105 << 32 | 105)));

  // Compute costs in terms of partial sums.
  int32x4_t partial6_s32 =
      vmull_s16(vget_low_s16(partial6), vget_low_s16(partial6));
  partial6_s32 =
      vmlal_s16(partial6_s32, vget_high_s16(partial6), vget_high_s16(partial6));

  uint32x4_t costs[4];
  costs[0] = fold_mul_and_sum_neon(partial4a, partial4b, const0, const1);
  costs[1] = fold_mul_and_sum_neon(partial5a, partial5b, const2, const3);
  costs[2] = vmulq_n_u32(vreinterpretq_u32_s32(partial6_s32), 105);
  costs[3] = fold_mul_and_sum_neon(partial7a, partial7b, const2, const3);

  costs[0] = horizontal_add_4d_u32x4(costs);
  vst1q_u32(cost, costs[0]);
  return costs[0];
}
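
// For reference, direction 6 (vertical) reduces to a particularly simple
// scalar form, since each of the 8 columns is one line: partial6 above is
// the per-column sum and costs[2] is 105 times its square. A sketch,
// assuming px[y][x] holds the biased rows loaded into lines[]:
static inline uint32_t vert_dir6_cost_scalar(const int16_t px[8][8]) {
  uint32_t cost = 0;
  for (int x = 0; x < 8; x++) {
    int32_t col_sum = 0;
    for (int y = 0; y < 8; y++) col_sum += px[y][x];
    // Every column spans all 8 rows, so each squared sum has weight 105.
    cost += 105 * (uint32_t)(col_sum * col_sum);
  }
  return cost;
}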

static inline uint32x4_t fold_mul_and_sum_pairwise_neon(int16x8_t partiala,
                                                        int16x8_t partialb,
                                                        int16x8_t partialc,
                                                        uint32x4_t const0) {
  // Reverse partial c.
  // pattern = { 10 11 8 9 6 7 4 5 2 3 0 1 12 13 14 15 }.
  uint8x16_t pattern = vreinterpretq_u8_u64(
      vcombine_u64(vcreate_u64((uint64_t)0x05040706 << 32 | 0x09080b0a),
                   vcreate_u64((uint64_t)0x0f0e0d0c << 32 | 0x01000302)));

#if AOM_ARCH_AARCH64
  partialc =
      vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialc), pattern));
#else
  int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialc)),
                     vget_high_s8(vreinterpretq_s8_s16(partialc)) } };
  int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
  int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
  partialc = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
#endif

  int32x4_t partiala_s32 = vpaddlq_s16(partiala);
  int32x4_t partialb_s32 = vpaddlq_s16(partialb);
  int32x4_t partialc_s32 = vpaddlq_s16(partialc);

  partiala_s32 = vmulq_s32(partiala_s32, partiala_s32);
  partialb_s32 = vmulq_s32(partialb_s32, partialb_s32);
  partialc_s32 = vmulq_s32(partialc_s32, partialc_s32);

  partiala_s32 = vaddq_s32(partiala_s32, partialc_s32);

  uint32x4_t cost = vmulq_n_u32(vreinterpretq_u32_s32(partialb_s32), 105);
  cost = vmlaq_u32(cost, vreinterpretq_u32_s32(partiala_s32), const0);
  return cost;
}
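
// A scalar view (illustrative only) of the pairwise fold above: for
// directions 1 and 3 each line covers pairs of adjacent lanes, so the
// lanes are first summed pairwise (vpaddlq_s16). With a_i, b_i, c_i the
// pair sums of the three partial vectors (c reversed) and w_i the weights
// from const0, each 32-bit cost lane is
//   w_i * (a_i^2 + c_i^2) + 105 * b_i^2,
// which the caller then reduces with a horizontal add.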

// This function computes the cost along directions 0, 1, 2, 3. (0 means
// 45-degree up-right, 2 is horizontal).
//
// For direction 1 and 3 ("east northeast" and "east southeast") the shifted
// lines need three vectors instead of two. For direction 1 for example, we need
// to compute the sums along the line i below:
// 0 0 1 1 2 2 3 3
// 1 1 2 2 3 3 4 4
// 2 2 3 3 4 4 5 5
// 3 3 4 4 5 5 6 6
// 4 4 5 5 6 6 7 7
// 5 5 6 6 7 7 8 8
// 6 6 7 7 8 8 9 9
// 7 7 8 8 9 9 10 10
//
// Which means we need the following configuration:
// 0 0 1 1 2 2 3 3
//     1 1 2 2 3 3 4 4
//         2 2 3 3 4 4 5 5
//             3 3 4 4 5 5 6 6
//                 4 4 5 5 6 6 7 7
//                     5 5 6 6 7 7 8 8
//                         6 6 7 7 8 8 9 9
//                             7 7 8 8 9 9 10 10
//
// Three vectors are needed to compute this, as well as some extra pairwise
// additions.
static uint32x4_t compute_horiz_directions_neon(int16x8_t lines[8],
                                                uint32_t cost[4]) {
  const int16x8_t zero = vdupq_n_s16(0);

  // Compute diagonal directions (1, 2, 3).
  // Partial sums for lines 0 and 1.
  int16x8_t partial0a = lines[0];
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[1], 7));
  int16x8_t partial0b = vextq_s16(lines[1], zero, 7);
  int16x8_t partial1a = vaddq_s16(lines[0], vextq_s16(zero, lines[1], 6));
  int16x8_t partial1b = vextq_s16(lines[1], zero, 6);
  int16x8_t partial3a = vextq_s16(lines[0], zero, 2);
  partial3a = vaddq_s16(partial3a, vextq_s16(lines[1], zero, 4));
  int16x8_t partial3b = vextq_s16(zero, lines[0], 2);
  partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[1], 4));

  // Partial sums for lines 2 and 3.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[2], 6));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[3], 5));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[2], zero, 6));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[3], zero, 5));
  partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[2], 4));
  partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[3], 2));
  partial1b = vaddq_s16(partial1b, vextq_s16(lines[2], zero, 4));
  partial1b = vaddq_s16(partial1b, vextq_s16(lines[3], zero, 2));
  partial3a = vaddq_s16(partial3a, vextq_s16(lines[2], zero, 6));
  partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[2], 6));
  partial3b = vaddq_s16(partial3b, lines[3]);

  // Partial sums for lines 4 and 5.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[4], 4));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[5], 3));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[4], zero, 4));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[5], zero, 3));
  partial1b = vaddq_s16(partial1b, lines[4]);
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[5], 6));
  int16x8_t partial1c = vextq_s16(lines[5], zero, 6);
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[4], zero, 2));
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[5], zero, 4));
  int16x8_t partial3c = vextq_s16(zero, lines[4], 2);
  partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[5], 4));

  // Partial sums for lines 6 and 7.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[6], 2));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[7], 1));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[6], zero, 2));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[7], zero, 1));
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[6], 4));
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[7], 2));
  partial1c = vaddq_s16(partial1c, vextq_s16(lines[6], zero, 4));
  partial1c = vaddq_s16(partial1c, vextq_s16(lines[7], zero, 2));
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[6], zero, 6));
  partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[6], 6));
  partial3c = vaddq_s16(partial3c, lines[7]);

  // Special case for direction 2 as it's just a sum along each line.
  int16x8_t lines03[4] = { lines[0], lines[1], lines[2], lines[3] };
  int16x8_t lines47[4] = { lines[4], lines[5], lines[6], lines[7] };
  int32x4_t partial2a = horizontal_add_4d_s16x8(lines03);
  int32x4_t partial2b = horizontal_add_4d_s16x8(lines47);

  uint32x4_t partial2a_u32 =
      vreinterpretq_u32_s32(vmulq_s32(partial2a, partial2a));
  uint32x4_t partial2b_u32 =
      vreinterpretq_u32_s32(vmulq_s32(partial2b, partial2b));

  uint32x4_t const0 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
                   vcreate_u64((uint64_t)210 << 32 | 280)));
  uint32x4_t const1 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
                   vcreate_u64((uint64_t)105 << 32 | 120)));
  uint32x4_t const2 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)210 << 32 | 420),
                   vcreate_u64((uint64_t)105 << 32 | 140)));

  uint32x4_t costs[4];
  costs[0] = fold_mul_and_sum_neon(partial0a, partial0b, const0, const1);
  costs[1] =
      fold_mul_and_sum_pairwise_neon(partial1a, partial1b, partial1c, const2);
  costs[2] = vaddq_u32(partial2a_u32, partial2b_u32);
  costs[2] = vmulq_n_u32(costs[2], 105);
  costs[3] =
      fold_mul_and_sum_pairwise_neon(partial3c, partial3b, partial3a, const2);

  costs[0] = horizontal_add_4d_u32x4(costs);
  vst1q_u32(cost, costs[0]);
  return costs[0];
}
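
// Scalar reference (illustrative only) for the direction 1 partial sums
// pictured above: going by the diagram, pixel (i, j) of the 8x8 block
// belongs to line i + (j >> 1), giving 11 lines in total. A sketch of the
// line accumulation alone; partial[] must be zero-initialized by the
// caller:
static inline void horiz_dir1_partials_scalar(const int16_t px[8][8],
                                              int32_t partial[11]) {
  for (int i = 0; i < 8; i++) {
    for (int j = 0; j < 8; j++) {
      partial[i + (j >> 1)] += px[i][j];
    }
  }
}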

int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var,
                       int coeff_shift) {
  uint32_t cost[8];
  uint32_t best_cost = 0;
  int best_dir = 0;
  int16x8_t lines[8];
  for (int i = 0; i < 8; i++) {
    uint16x8_t s = vld1q_u16(&img[i * stride]);
    lines[i] = vreinterpretq_s16_u16(
        vsubq_u16(vshlq_u16(s, vdupq_n_s16(-coeff_shift)), vdupq_n_u16(128)));
  }

  // Compute "mostly vertical" directions.
  uint32x4_t cost47 = compute_vert_directions_neon(lines, cost + 4);

  // Compute "mostly horizontal" directions.
  uint32x4_t cost03 = compute_horiz_directions_neon(lines, cost);

  // Find max cost as well as its index to get best_dir.
  // The max cost needs to be propagated in the whole vector to find its
  // position in the original cost vectors cost03 and cost47.
  uint32x4_t cost07 = vmaxq_u32(cost03, cost47);
#if AOM_ARCH_AARCH64
  best_cost = vmaxvq_u32(cost07);
  uint32x4_t max_cost = vdupq_n_u32(best_cost);
  uint8x16x2_t costs = { { vreinterpretq_u8_u32(vceqq_u32(max_cost, cost03)),
                           vreinterpretq_u8_u32(
                               vceqq_u32(max_cost, cost47)) } };
  // idx = { 28, 24, 20, 16, 12, 8, 4, 0 };
  uint8x8_t idx = vreinterpret_u8_u64(vcreate_u64(0x0004080c1014181cULL));
  // Get the lowest 8 bits of each 32-bit element and reverse them, so that
  // counting the leading zero bytes of the resulting 64-bit value yields
  // the lowest direction index whose cost equals the maximum.
  uint8x8_t tbl = vqtbl2_u8(costs, idx);
  uint64_t a = vget_lane_u64(vreinterpret_u64_u8(tbl), 0);
  best_dir = aom_clzll(a) >> 3;
#else
  uint32x2_t cost64 = vpmax_u32(vget_low_u32(cost07), vget_high_u32(cost07));
  cost64 = vpmax_u32(cost64, cost64);
  uint32x4_t max_cost = vcombine_u32(cost64, cost64);
  best_cost = vget_lane_u32(cost64, 0);
  uint16x8_t costs = vcombine_u16(vmovn_u32(vceqq_u32(max_cost, cost03)),
                                  vmovn_u32(vceqq_u32(max_cost, cost47)));
  uint8x8_t idx =
      vand_u8(vmovn_u16(costs),
              vreinterpret_u8_u64(vcreate_u64(0x8040201008040201ULL)));
  int sum = horizontal_add_u8x8(idx);
  best_dir = get_msb(sum ^ (sum - 1));
#endif

  // Difference between the optimal variance and the variance along the
  // orthogonal direction. Again, the sum(x^2) terms cancel out.
  *var = best_cost - cost[(best_dir + 4) & 7];
  // We'd normally divide by 840, but dividing by 1024 is close enough
  // for what we're going to do with this.
  *var >>= 10;
  return best_dir;
}

void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2,
                             int stride, int32_t *var_out_1st,
                             int32_t *var_out_2nd, int coeff_shift,
                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
  // Process first 8x8.
  *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);

  // Process second 8x8.
  *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
}

// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
static inline int16x8_t constrain16(uint16x8_t a, uint16x8_t b,
                                    unsigned int threshold, int adjdamp) {
  uint16x8_t diff = vabdq_u16(a, b);
  const uint16x8_t a_gt_b = vcgtq_u16(a, b);
  const uint16x8_t s = vqsubq_u16(vdupq_n_u16(threshold),
                                  vshlq_u16(diff, vdupq_n_s16(-adjdamp)));
  const int16x8_t clip = vreinterpretq_s16_u16(vminq_u16(diff, s));
  return vbslq_s16(a_gt_b, clip, vnegq_s16(clip));
}
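
// A scalar model (illustrative only) of constrain16(), matching the formula
// in the comment above; the saturating vqsubq_u16 plays the role of the
// max(0, ...) clamp:
static inline int constrain16_scalar(int a, int b, unsigned int threshold,
                                     int adjdamp) {
  const int diff = a - b;
  const int abs_diff = diff < 0 ? -diff : diff;
  int s = (int)threshold - (abs_diff >> adjdamp);
  if (s < 0) s = 0;
  const int clip = abs_diff < s ? abs_diff : s;
  return diff < 0 ? -clip : clip;
}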

static inline void primary_filter(uint16x8_t s, uint16x8_t tap[4],
                                  const int *pri_taps, int pri_strength,
                                  int pri_damping, int16x8_t *sum) {
  // Near taps
  int16x8_t n0 = constrain16(tap[0], s, pri_strength, pri_damping);
  int16x8_t n1 = constrain16(tap[1], s, pri_strength, pri_damping);
  // sum += pri_taps[0] * (n0 + n1)
  n0 = vaddq_s16(n0, n1);
  *sum = vmlaq_n_s16(*sum, n0, pri_taps[0]);

  // Far taps
  int16x8_t f0 = constrain16(tap[2], s, pri_strength, pri_damping);
  int16x8_t f1 = constrain16(tap[3], s, pri_strength, pri_damping);
  // sum += pri_taps[1] * (f0 + f1)
  f0 = vaddq_s16(f0, f1);
  *sum = vmlaq_n_s16(*sum, f0, pri_taps[1]);
}

static inline void secondary_filter(uint16x8_t s, uint16x8_t tap[8],
                                    const int *sec_taps, int sec_strength,
                                    int sec_damping, int16x8_t *sum) {
  // Near taps
  int16x8_t s0 = constrain16(tap[0], s, sec_strength, sec_damping);
  int16x8_t s1 = constrain16(tap[1], s, sec_strength, sec_damping);
  int16x8_t s2 = constrain16(tap[2], s, sec_strength, sec_damping);
  int16x8_t s3 = constrain16(tap[3], s, sec_strength, sec_damping);

  // sum += sec_taps[0] * (s0 + s1 + s2 + s3)
  s0 = vaddq_s16(s0, s1);
  s2 = vaddq_s16(s2, s3);
  s0 = vaddq_s16(s0, s2);
  *sum = vmlaq_n_s16(*sum, s0, sec_taps[0]);

  // Far taps
  s0 = constrain16(tap[4], s, sec_strength, sec_damping);
  s1 = constrain16(tap[5], s, sec_strength, sec_damping);
  s2 = constrain16(tap[6], s, sec_strength, sec_damping);
  s3 = constrain16(tap[7], s, sec_strength, sec_damping);

  // sum += sec_taps[1] * (s0 + s1 + s2 + s3)
  s0 = vaddq_s16(s0, s1);
  s2 = vaddq_s16(s2, s3);
  s0 = vaddq_s16(s0, s2);
  *sum = vmlaq_n_s16(*sum, s0, sec_taps[1]);
}
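
// Putting the pieces together, one output pixel follows the scalar model
// below (illustrative only; constrain16_scalar() is the sketch above, and
// taps[]/offsets[] stand in for the cdef_pri_taps/cdef_sec_taps and
// cdef_directions tables used by the kernels that follow):
//
//   int sum = 0;
//   for (int k = 0; k < num_taps; k++)
//     sum += taps[k] * constrain16_scalar(in[offsets[k]], s, strength,
//                                         damping);
//   int res = s + ((8 + sum - (sum < 0)) >> 4);
//   // The _0 variants additionally clamp res to the min/max of s and the
//   // contributing taps.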

void cdef_filter_8_0_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  uint16x8_t max, min;
  const uint16x8_t cdef_large_value_mask =
      vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = vld1q_u16(in + po1);
      pri_src[1] = vld1q_u16(in - po1);

      // Primary far taps
      pri_src[2] = vld1q_u16(in + po2);
      pri_src[3] = vld1q_u16(in - po2);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // primary max has been calculated, zero out the upper 8 bits. Use this
      // to find the "16 bit" max.
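      // (CDEF_VERY_LARGE is 0x4000, so padded pixels are the only ones with
      // a non-zero upper byte; masking with ~CDEF_VERY_LARGE afterwards
      // clears that flag bit so padding never wins the max.)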
      uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]),
                                     vreinterpretq_u8_u16(pri_src[1]));
      uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]),
                                     vreinterpretq_u8_u16(pri_src[3]));
      pri_max0 = vmaxq_u8(pri_max0, pri_max1);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0),
                                     cdef_large_value_mask));

      uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min0 = vminq_u16(pri_min0, pri_min1);
      min = vminq_u16(min, pri_min0);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // primary max has been calculated, zero out the upper 8 bits. Use this
      // to find the "16 bit" max.
      uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]),
                                     vreinterpretq_u8_u16(sec_src[1]));
      uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]),
                                     vreinterpretq_u8_u16(sec_src[3]));
      uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]),
                                     vreinterpretq_u8_u16(sec_src[5]));
      uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]),
                                     vreinterpretq_u8_u16(sec_src[7]));
      sec_max0 = vmaxq_u8(sec_max0, sec_max1);
      sec_max2 = vmaxq_u8(sec_max2, sec_max3);
      sec_max0 = vmaxq_u8(sec_max0, sec_max2);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0),
                                     cdef_large_value_mask));

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
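      // (vcltq_s16 produces all-ones lanes, i.e. -1, where sum < 0, so the
      // vaddq_s16 below implements the "- (sum < 0)" term; vrsraq_n_s16 then
      // adds the rounding constant 8 and shifts right by 4 while
      // accumulating onto s.)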
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)),
                          vreinterpretq_s16_u16(max));

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      vst1_u8(dst8, res_u8);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // primary max has been calculated, zero out the upper 8 bits. Use this
      // to find the "16 bit" max.
      uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]),
                                     vreinterpretq_u8_u16(pri_src[1]));
      uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]),
                                     vreinterpretq_u8_u16(pri_src[3]));
      pri_max0 = vmaxq_u8(pri_max0, pri_max1);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0),
                                     cdef_large_value_mask));

      uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min1 = vminq_u16(pri_min1, pri_min2);
      min = vminq_u16(min, pri_min1);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // primary max has been calculated, zero out the upper 8 bits. Use this
      // to find the "16 bit" max.
      uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]),
                                     vreinterpretq_u8_u16(sec_src[1]));
      uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]),
                                     vreinterpretq_u8_u16(sec_src[3]));
      uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]),
                                     vreinterpretq_u8_u16(sec_src[5]));
      uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]),
                                     vreinterpretq_u8_u16(sec_src[7]));
      sec_max0 = vmaxq_u8(sec_max0, sec_max1);
      sec_max2 = vmaxq_u8(sec_max2, sec_max3);
      sec_max0 = vmaxq_u8(sec_max0, sec_max2);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0),
                                     cdef_large_value_mask));

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)),
                          vreinterpretq_s16_u16(max));

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      store_u8x4_strided_x2(dst8, dstride, res_u8);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}
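
// The cdef_filter_8_*_neon variants below mirror the structure above for
// the remaining strength combinations: _1 applies only the primary taps,
// _2 only the secondary taps, and _3 copies the input through unchanged.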

void cdef_filter_8_1_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)sec_strength;
  (void)sec_damping;

  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t tap[4];

      // Primary near taps
      tap[0] = vld1q_u16(in + po1);
      tap[1] = vld1q_u16(in - po1);

      // Primary far taps
      tap[2] = vld1q_u16(in + po2);
      tap[3] = vld1q_u16(in - po2);

      primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      vst1_u8(dst8, res_u8);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);

  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      store_u8x4_strided_x2(dst8, dstride, res_u8);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_8_2_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)pri_strength;
  (void)pri_damping;
  (void)coeff_shift;

  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *sec_taps = cdef_sec_taps;

  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      vst1_u8(dst8, res_u8);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      store_u8x4_strided_x2(dst8, dstride, res_u8);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_8_3_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)pri_strength;
  (void)sec_strength;
  (void)dir;
  (void)pri_damping;
  (void)sec_damping;
  (void)coeff_shift;
  (void)block_width;
  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = vld1q_u16(in);
      const uint8x8_t res = vqmovn_u16(s);
      vst1_u8(dst8, res);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      const uint8x8_t res = vqmovn_u16(s);
      store_u8x4_strided_x2(dst8, dstride, res);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_0_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  uint16x8_t max, min;
  const uint16x8_t cdef_large_value_mask =
      vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = vld1q_u16(in + po1);
      pri_src[1] = vld1q_u16(in - po1);

      // Primary far taps
      pri_src[2] = vld1q_u16(in + po2);
      pri_src[3] = vld1q_u16(in - po2);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min0 = vminq_u16(pri_min0, pri_min1);
      min = vminq_u16(min, pri_min0);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask);
      pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask);
      pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask);
      pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask);

      uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]);
      pri_max0 = vmaxq_u16(pri_max0, pri_max1);
      max = vmaxq_u16(max, pri_max0);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask);
      sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask);
      sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask);
      sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask);
      sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask);
      sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask);
      sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask);
      sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask);

      uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]);
      sec_max0 = vmaxq_u16(sec_max0, sec_max1);
      sec_max2 = vmaxq_u16(sec_max2, sec_max3);
      sec_max0 = vmaxq_u16(sec_max0, sec_max2);
      max = vmaxq_u16(max, sec_max0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
                      vreinterpretq_s16_u16(max));

      vst1q_u16(dst16, vreinterpretq_u16_s16(res));

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min1 = vminq_u16(pri_min1, pri_min2);
      min = vminq_u16(min, pri_min1);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask);
      pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask);
      pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask);
      pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask);
      uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]);
      pri_max0 = vmaxq_u16(pri_max0, pri_max1);
      max = vmaxq_u16(max, pri_max0);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask);
      sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask);
      sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask);
      sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask);
      sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask);
      sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask);
      sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask);
      sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask);

      uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]);
      sec_max0 = vmaxq_u16(sec_max0, sec_max1);
      sec_max2 = vmaxq_u16(sec_max2, sec_max3);
      sec_max0 = vmaxq_u16(sec_max0, sec_max2);
      max = vmaxq_u16(max, sec_max0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
                      vreinterpretq_s16_u16(max));

      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_1_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  (void)sec_strength;
  (void)sec_damping;

  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t tap[4];

      // Primary near taps
      tap[0] = vld1q_u16(in + po1);
      tap[1] = vld1q_u16(in - po1);

      // Primary far taps
      tap[2] = vld1q_u16(in + po2);
      tap[3] = vld1q_u16(in - po2);

      primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      vst1q_u16(dst16, vreinterpretq_u16_s16(res));

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_2_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  (void)pri_strength;
  (void)pri_damping;
  (void)coeff_shift;

  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *sec_taps = cdef_sec_taps;

  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      vst1q_u16(dst16, vreinterpretq_u16_s16(res));

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_3_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  (void)pri_strength;
  (void)sec_strength;
  (void)dir;
  (void)pri_damping;
  (void)sec_damping;
  (void)coeff_shift;
  (void)block_width;
  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = vld1q_u16(in);
      vst1q_u16(dst16, s);

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      store_u16x4_strided_x2(dst16, dstride, s);

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}