/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

const right_ext_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
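
// The buffer above implements right-edge padding with a sliding byte mask:
// loading from around (right_ext_mask - 2*w) yields 0x00 for the bytes that
// hold valid pixels (keep them) and 0xff afterwards (replace them with the
// padding pixel via BIT); the callers fold any extra element offsets into
// the base address. A hedged scalar C model of the effect (illustrative
// only, not part of the build):
//
//     // pad a 16-bit row from element w onwards with its last valid pixel
//     static void pad_right(uint16_t *row, int w, int n) {
//         for (int i = w; i < n; i++)
//             row[i] = row[w - 1];
//     }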

// void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
//                                      const pixel (*left)[4], const pixel *lpf,
//                                      const int w, int h,
//                                      const int16_t filter[2][8],
//                                      const enum LrEdgeFlags edges,
//                                      const int bitdepth_max);
function wiener_filter7_16bpc_neon, export=1
        ldr             w8,  [sp]
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-32]!
        stp             d8,  d9,  [sp, #16]
        mov             x29, sp
        ld1             {v0.8h, v1.8h},  [x6]
        tst             w7,  #4               // LR_HAVE_TOP
        sub_sp          384*2*6

        dup             v28.8h,  w8           // bitdepth_max
        clz             w8,  w8
        movi            v30.4s,  #1
        sub             w10, w8,  #38         // -(bitdepth + 6)
        sub             w11, w8,  #11         // round_bits_v
        sub             w8,  w8,  #25         // -round_bits_h
        neg             w10, w10              // bitdepth + 6
        neg             w11, w11              // -round_bits_v
        dup             v2.4s,   w10
        dup             v29.4s,  w8           // -round_bits_h
        dup             v27.4s,  w11          // -round_bits_v
        movi            v31.8h,  #0x20, lsl #8  // 1 << 13 = 8192
        ushl            v30.4s,  v30.4s,  v2.4s // 1 << (bitdepth + 6)

        zip1            v0.2d,   v0.2d,   v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1
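
        // Worked example of the constant setup above (not assembler input):
        // with bitdepth_max = 0x3ff (10 bpc), clz = 22, so
        //   bitdepth + 6  = 38 - 22 = 16  -> v30 = 1 << 16
        //   round_bits_h  = 25 - 22 = 3   -> v29 = -3 (srshl shifts right)
        //   round_bits_v  = 22 - 11 = 11  -> v27 = -11
        // and for bitdepth_max = 0xfff (12 bpc): 18, 5 and 9 respectively.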

        // x9  - t6
        // x10 - t5
        // x11 - t4
        // x12 - t3
        // x13 - t2
        // x14 - t1
        // x15 - t0
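        // t0-t6 point at rows of the intermediate buffer on the stack
        // (each row is 384 pixels = 384*2 bytes); they act as a ring of
        // horizontally filtered rows that the vertical pass consumes.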
        mov             x14, sp               // t1
        b.eq            L(no_top_7)

        mov             x16, x2               // backup left
        mov             x2,  #0
        bl              wiener_filter7_h_16bpc_neon
        add             x3,  x3,  x1          // lpf += stride
        mov             x9,  x14              // t6
        mov             x10, x14              // t5
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_16bpc_neon
        add             x3,  x3,  x1,  lsl #2
        add             x3,  x3,  x1          // lpf += stride*5
        mov             x11, x14              // t4
        add             x14, x14, #384*2      // t1 += 384*2
        mov             x2,  x16              // left
        mov             x16, x3               // backup lpf
        mov             x3,  x0               // lpf = p
        bl              wiener_filter7_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_16bpc_neon
        mov             x13, x14              // t2
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x3,  x3,  x1          // src += stride

L(main_7):
        add             x15, x14, #384*2      // t0 = t1 + 384*2
L(main_loop_7):
        bl              wiener_filter7_hv_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_loop_7)
        tst             w7,  #8 // LR_HAVE_BOTTOM
        b.eq            L(v3_7)

        mov             x3,  x16              // restore lpf
        mov             x2,  #0               // left = NULL
        bl              wiener_filter7_hv_16bpc_neon
        bl              wiener_filter7_hv_16bpc_neon
L(v1_7):
        bl              wiener_filter7_v_16bpc_neon

        mov             sp,  x29
        ldp             d8,  d9,  [sp, #16]
        ldp             x29, x30, [sp], #32
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(no_top_7):
        add             x3,  x3,  x1,  lsl #2
        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
        mov             x3,  x0               // lpf = p

        bl              wiener_filter7_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x9,  x14              // t6
        mov             x10, x14              // t5
        mov             x11, x14              // t4
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_7)
        add             x3,  x3,  x1          // src += p_stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x13, x14              // t2
        b.eq            L(v2_7)
        add             x3,  x3,  x1          // src += p_stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x3,  x3,  x1          // src += p_stride
        add             x15, x14, #384*2      // t0 = t1 + 384*2
        bl              wiener_filter7_hv_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x15, x15, #384*2*4    // t0 += 384*2*4
        bl              wiener_filter7_hv_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_7)
L(v3_7):
        bl              wiener_filter7_v_16bpc_neon
L(v2_7):
        bl              wiener_filter7_v_16bpc_neon
        b               L(v1_7)
endfunc


function wiener_filter7_h_16bpc_neon
        stp             x3,  x4,  [sp, #-32]!
        str             x14,      [sp, #16]

        // Set up the src pointer to include the left edge; for LR_HAVE_LEFT
        // with left == NULL, the left edge is read from the src buffer itself.
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #6
        ld1             {v2.8h, v3.8h}, [x3], #32
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v2.8h, v3.8h}, [x3], #32
        ld1             {v4.d}[1], [x2], #8
        // Move x3 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #6
        ext             v3.16b,  v2.16b,  v3.16b,  #10
        ext             v2.16b,  v4.16b,  v2.16b,  #10
        b               2f

1:
        ld1             {v2.8h, v3.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
        // and shift v2/v3 to have 3x the first pixel at the front.
        dup             v4.8h,  v2.h[0]
        // Move x3 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #6
        ext             v3.16b,  v2.16b,  v3.16b,  #10
        ext             v2.16b,  v4.16b,  v2.16b,  #10

2:
        ld1             {v4.8h}, [x3], #16

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #19
        b.ge            4f   // If w >= 19, all used input pixels are valid

        // 1 <= w < 19, so w+3 pixels are valid in v2-v4. For w >= 9, this
        // ends up being called again; that's not strictly needed (we already
        // pad enough here), but it keeps the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input,
        // i.e. v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
        sub             w17, w4,  #22
        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -6
        ldr             h26, [x3,  w17, sxtw #1]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v26.8h,  v26.h[0]
        ld1             {v23.16b, v24.16b, v25.16b}, [x6]

        bit             v2.16b,  v26.16b, v23.16b
        bit             v3.16b,  v26.16b, v24.16b
        bit             v4.16b,  v26.16b, v25.16b

4:      // Loop horizontally
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, so keep the mul/mla chains tightly
        // sequenced like this.
        ext             v17.16b, v2.16b,  v3.16b, #4
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v20.16b, v2.16b,  v3.16b, #10
        ext             v21.16b, v2.16b,  v3.16b, #12
        ext             v18.16b, v2.16b,  v3.16b, #6
        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v2.8h
        smull           v6.4s,   v18.4h,  v0.h[3]
        smlal           v6.4s,   v19.4h,  v0.h[2]
        smlal           v6.4s,   v20.4h,  v0.h[1]
        smlal           v6.4s,   v21.4h,  v0.h[0]
        smull2          v7.4s,   v18.8h,  v0.h[3]
        smlal2          v7.4s,   v19.8h,  v0.h[2]
        smlal2          v7.4s,   v20.8h,  v0.h[1]
        smlal2          v7.4s,   v21.8h,  v0.h[0]
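
        // The symmetric 7-tap filter pairs taps around the center before the
        // widening multiplies. A hedged scalar C sketch of the sum formed by
        // the mul/mla chain above (illustrative only; x points at the first
        // pixel of the 7-wide window, f at the horizontal taps):
        //
        //     static int32_t wiener7_h(const uint16_t *x, const int16_t *f) {
        //         return f[3] *  x[3] +
        //                f[2] * (x[2] + x[4]) +
        //                f[1] * (x[1] + x[5]) +
        //                f[0] * (x[0] + x[6]);
        //     }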

        ext             v17.16b, v3.16b,  v4.16b, #4
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v20.16b, v3.16b,  v4.16b, #10
        ext             v21.16b, v3.16b,  v4.16b, #12
        ext             v18.16b, v3.16b,  v4.16b, #6

        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v3.8h
        smull           v16.4s,  v18.4h,  v0.h[3]
        smlal           v16.4s,  v19.4h,  v0.h[2]
        smlal           v16.4s,  v20.4h,  v0.h[1]
        smlal           v16.4s,  v21.4h,  v0.h[0]
        smull2          v17.4s,  v18.8h,  v0.h[3]
        smlal2          v17.4s,  v19.8h,  v0.h[2]
        smlal2          v17.4s,  v20.8h,  v0.h[1]
        smlal2          v17.4s,  v21.8h,  v0.h[0]

        mvni            v24.8h,  #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
        add             v6.4s,   v6.4s,   v30.4s
        add             v7.4s,   v7.4s,   v30.4s
        add             v16.4s,  v16.4s,  v30.4s
        add             v17.4s,  v17.4s,  v30.4s
        srshl           v6.4s,   v6.4s,   v29.4s
        srshl           v7.4s,   v7.4s,   v29.4s
        srshl           v16.4s,  v16.4s,  v29.4s
        srshl           v17.4s,  v17.4s,  v29.4s
        sqxtun          v6.4h,   v6.4s
        sqxtun2         v6.8h,   v7.4s
        sqxtun          v7.4h,   v16.4s
        sqxtun2         v7.8h,   v17.4s
        umin            v6.8h,   v6.8h,   v24.8h
        umin            v7.8h,   v7.8h,   v24.8h
        sub             v6.8h,   v6.8h,   v31.8h
        sub             v7.8h,   v7.8h,   v31.8h
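
        // The intermediate rows are stored as signed 16-bit with a bias:
        // sqxtun clamps the narrowed value to unsigned, umin caps it at
        // 0x7fff and the sub removes the 8192 bias. Roughly, in hedged C
        // (clip(v, lo, hi) as a hypothetical clamping helper):
        //
        //     t = clip((sum + (1 << (bitdepth + 6)) + (1 << (round_bits_h - 1)))
        //              >> round_bits_h, 0, 0x7fff) - 8192;
        //
        // which keeps the vertical pass within 16x16-bit multiply range.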

        subs            w4,  w4,  #16

        st1             {v6.8h, v7.8h}, [x14], #32

        b.le            0f
        mov             v2.16b,  v4.16b
        tst             w7,  #2 // LR_HAVE_RIGHT
        ld1             {v3.8h, v4.8h}, [x3], #32
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldr             x14,      [sp, #16]
        ldp             x3,  x4,  [sp], #32
        ret
endfunc

function wiener_filter7_v_16bpc_neon
        // Backing up/restoring registers shifted, so that x9 gets the value
        // of x10, etc, afterwards.
        stp             x10, x11, [sp, #-64]!
        stp             x12, x13, [sp, #16]
        stp             x14, x14, [sp, #32]
        stp             x0,  x4,  [sp, #48]
1:
        ld1             {v16.8h, v17.8h}, [x9],  #32
        ld1             {v18.8h, v19.8h}, [x10], #32
        ld1             {v20.8h, v21.8h}, [x11], #32
        ld1             {v22.8h, v23.8h}, [x12], #32
        ld1             {v24.8h, v25.8h}, [x13], #32
        ld1             {v6.8h,  v7.8h},  [x14], #32

        smull           v2.4s,   v16.4h,  v0.h[4]
        smlal           v2.4s,   v18.4h,  v0.h[5]
        smlal           v2.4s,   v20.4h,  v0.h[6]
        smlal           v2.4s,   v22.4h,  v0.h[7]
        smlal           v2.4s,   v24.4h,  v0.h[6]
        smlal           v2.4s,   v6.4h,   v0.h[5]
        smlal           v2.4s,   v6.4h,   v0.h[4]
        smull2          v3.4s,   v16.8h,  v0.h[4]
        smlal2          v3.4s,   v18.8h,  v0.h[5]
        smlal2          v3.4s,   v20.8h,  v0.h[6]
        smlal2          v3.4s,   v22.8h,  v0.h[7]
        smlal2          v3.4s,   v24.8h,  v0.h[6]
        smlal2          v3.4s,   v6.8h,   v0.h[5]
        smlal2          v3.4s,   v6.8h,   v0.h[4]
        smull           v4.4s,   v17.4h,  v0.h[4]
        smlal           v4.4s,   v19.4h,  v0.h[5]
        smlal           v4.4s,   v21.4h,  v0.h[6]
        smlal           v4.4s,   v23.4h,  v0.h[7]
        smlal           v4.4s,   v25.4h,  v0.h[6]
        smlal           v4.4s,   v7.4h,   v0.h[5]
        smlal           v4.4s,   v7.4h,   v0.h[4]
        smull2          v5.4s,   v17.8h,  v0.h[4]
        smlal2          v5.4s,   v19.8h,  v0.h[5]
        smlal2          v5.4s,   v21.8h,  v0.h[6]
        smlal2          v5.4s,   v23.8h,  v0.h[7]
        smlal2          v5.4s,   v25.8h,  v0.h[6]
        smlal2          v5.4s,   v7.8h,   v0.h[5]
        smlal2          v5.4s,   v7.8h,   v0.h[4]
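        // Note: only six row pointers (x9-x14) are read here; the row from
        // x14 (v6/v7) is accumulated twice, with both v0.h[5] and v0.h[4],
        // which matches repeating the bottommost available row for the
        // missing seventh tap at the frame edge.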
        srshl           v2.4s,   v2.4s,   v27.4s  // -round_bits_v
        srshl           v3.4s,   v3.4s,   v27.4s
        srshl           v4.4s,   v4.4s,   v27.4s
        srshl           v5.4s,   v5.4s,   v27.4s
        sqxtun          v2.4h,   v2.4s
        sqxtun2         v2.8h,   v3.4s
        sqxtun          v3.4h,   v4.4s
        sqxtun2         v3.8h,   v5.4s
        umin            v2.8h,   v2.8h,   v28.8h  // bitdepth_max
        umin            v3.8h,   v3.8h,   v28.8h
        subs            w4,  w4,  #16
        st1             {v2.8h, v3.8h}, [x0], #32
        b.gt            1b

        ldp             x0,  x4,  [sp, #48]
        ldp             x13, x14, [sp, #32]
        ldp             x11, x12, [sp, #16]
        ldp             x9,  x10, [sp], #64

        add             x0,  x0,  x1
        ret
endfunc

function wiener_filter7_hv_16bpc_neon
        // Backing up/restoring registers shifted, so that x9 gets the value
        // of x10, etc, and x15==x9, afterwards.
        stp             x10, x11, [sp, #-80]!
        stp             x12, x13, [sp, #16]
        stp             x14, x15, [sp, #32]
        stp             x10, x0,  [sp, #48]
        stp             x3,  x4,  [sp, #64]
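
        // This function fuses one horizontal pass (written to t0 via x15)
        // with one vertical pass over the previously filtered rows (written
        // to the output row via x0), so the main loop does both per call.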

        // Set up the src pointer to include the left edge; for LR_HAVE_LEFT
        // with left == NULL, the left edge is read from the src buffer itself.
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #6
        ld1             {v2.8h, v3.8h}, [x3], #32
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v2.8h, v3.8h}, [x3], #32
        ld1             {v4.d}[1], [x2], #8
        // Move x3 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #6
        ext             v3.16b,  v2.16b,  v3.16b,  #10
        ext             v2.16b,  v4.16b,  v2.16b,  #10
        b               2f
1:
        ld1             {v2.8h, v3.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
        // and shift v2/v3 to have 3x the first pixel at the front.
        dup             v4.8h,  v2.h[0]
        // Move x3 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #6
        ext             v3.16b,  v2.16b,  v3.16b,  #10
        ext             v2.16b,  v4.16b,  v2.16b,  #10

2:
        ld1             {v4.8h}, [x3], #16

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #19
        b.ge            4f   // If w >= 19, all used input pixels are valid

        // 1 <= w < 19, so w+3 pixels are valid in v2-v4. For w >= 9, this
        // ends up being called again; that's not strictly needed (we already
        // pad enough here), but it keeps the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input,
        // i.e. v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
        sub             w17, w4,  #22
        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -6
        ldr             h26, [x3,  w17, sxtw #1]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v26.8h,  v26.h[0]
        ld1             {v23.16b, v24.16b, v25.16b}, [x6]

        bit             v2.16b,  v26.16b, v23.16b
        bit             v3.16b,  v26.16b, v24.16b
        bit             v4.16b,  v26.16b, v25.16b

4:      // Loop horizontally
        ext             v17.16b, v2.16b,  v3.16b, #4
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v20.16b, v2.16b,  v3.16b, #10
        ext             v21.16b, v2.16b,  v3.16b, #12
        ext             v18.16b, v2.16b,  v3.16b, #6
        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v2.8h
        smull           v6.4s,   v18.4h,  v0.h[3]
        smlal           v6.4s,   v19.4h,  v0.h[2]
        smlal           v6.4s,   v20.4h,  v0.h[1]
        smlal           v6.4s,   v21.4h,  v0.h[0]
        smull2          v7.4s,   v18.8h,  v0.h[3]
        smlal2          v7.4s,   v19.8h,  v0.h[2]
        smlal2          v7.4s,   v20.8h,  v0.h[1]
        smlal2          v7.4s,   v21.8h,  v0.h[0]

        ext             v17.16b, v3.16b,  v4.16b, #4
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v20.16b, v3.16b,  v4.16b, #10
        ext             v21.16b, v3.16b,  v4.16b, #12
        ext             v18.16b, v3.16b,  v4.16b, #6

        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v3.8h
        smull           v24.4s,  v18.4h,  v0.h[3]
        smlal           v24.4s,  v19.4h,  v0.h[2]
        smlal           v24.4s,  v20.4h,  v0.h[1]
        smlal           v24.4s,  v21.4h,  v0.h[0]
        smull2          v25.4s,  v18.8h,  v0.h[3]
        smlal2          v25.4s,  v19.8h,  v0.h[2]
        smlal2          v25.4s,  v20.8h,  v0.h[1]
        smlal2          v25.4s,  v21.8h,  v0.h[0]

        ld1             {v16.8h, v17.8h}, [x9],  #32

        mvni            v26.8h,  #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
        add             v6.4s,   v6.4s,   v30.4s
        add             v7.4s,   v7.4s,   v30.4s
        add             v24.4s,  v24.4s,  v30.4s
        add             v25.4s,  v25.4s,  v30.4s
        ld1             {v18.8h, v19.8h}, [x10], #32
        srshl           v6.4s,   v6.4s,   v29.4s
        srshl           v7.4s,   v7.4s,   v29.4s
        srshl           v24.4s,  v24.4s,  v29.4s
        srshl           v25.4s,  v25.4s,  v29.4s
        ld1             {v20.8h, v21.8h}, [x11], #32
        sqxtun          v6.4h,   v6.4s
        sqxtun2         v6.8h,   v7.4s
        sqxtun          v7.4h,   v24.4s
        sqxtun2         v7.8h,   v25.4s
        ld1             {v22.8h, v23.8h}, [x12], #32
        umin            v6.8h,   v6.8h,   v26.8h
        umin            v7.8h,   v7.8h,   v26.8h
        ld1             {v24.8h, v25.8h}, [x13], #32
        sub             v6.8h,   v6.8h,   v31.8h
        sub             v7.8h,   v7.8h,   v31.8h

        ld1             {v8.8h,  v9.8h},  [x14], #32

        smull           v1.4s,   v16.4h,  v0.h[4]
        smlal           v1.4s,   v18.4h,  v0.h[5]
        smlal           v1.4s,   v20.4h,  v0.h[6]
        smlal           v1.4s,   v22.4h,  v0.h[7]
        smlal           v1.4s,   v24.4h,  v0.h[6]
        smlal           v1.4s,   v8.4h,   v0.h[5]
        smlal           v1.4s,   v6.4h,   v0.h[4]
        smull2          v5.4s,   v16.8h,  v0.h[4]
        smlal2          v5.4s,   v18.8h,  v0.h[5]
        smlal2          v5.4s,   v20.8h,  v0.h[6]
        smlal2          v5.4s,   v22.8h,  v0.h[7]
        smlal2          v5.4s,   v24.8h,  v0.h[6]
        smlal2          v5.4s,   v8.8h,   v0.h[5]
        smlal2          v5.4s,   v6.8h,   v0.h[4]
        smull           v26.4s,  v17.4h,  v0.h[4]
        smlal           v26.4s,  v19.4h,  v0.h[5]
        smlal           v26.4s,  v21.4h,  v0.h[6]
        smlal           v26.4s,  v23.4h,  v0.h[7]
        smlal           v26.4s,  v25.4h,  v0.h[6]
        smlal           v26.4s,  v9.4h,   v0.h[5]
        smlal           v26.4s,  v7.4h,   v0.h[4]
        smull2          v16.4s,  v17.8h,  v0.h[4]
        smlal2          v16.4s,  v19.8h,  v0.h[5]
        smlal2          v16.4s,  v21.8h,  v0.h[6]
        smlal2          v16.4s,  v23.8h,  v0.h[7]
        smlal2          v16.4s,  v25.8h,  v0.h[6]
        smlal2          v16.4s,  v9.8h,   v0.h[5]
        smlal2          v16.4s,  v7.8h,   v0.h[4]
        srshl           v1.4s,   v1.4s,   v27.4s  // -round_bits_v
        srshl           v5.4s,   v5.4s,   v27.4s
        srshl           v26.4s,  v26.4s,  v27.4s
        srshl           v16.4s,  v16.4s,  v27.4s
        sqxtun          v18.4h,  v1.4s
        sqxtun2         v18.8h,  v5.4s
        sqxtun          v19.4h,  v26.4s
        sqxtun2         v19.8h,  v16.4s
        st1             {v6.8h, v7.8h}, [x15], #32
        umin            v18.8h,  v18.8h,  v28.8h  // bitdepth_max
        umin            v19.8h,  v19.8h,  v28.8h
        subs            w4,  w4,  #16

        st1             {v18.8h, v19.8h}, [x0], #32

        b.le            0f
        mov             v2.16b,  v4.16b
        tst             w7,  #2 // LR_HAVE_RIGHT
        ld1             {v3.8h, v4.8h}, [x3], #32
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldp             x3,  x4,  [sp, #64]
        ldp             x15, x0,  [sp, #48]
        ldp             x13, x14, [sp, #32]
        ldp             x11, x12, [sp, #16]
        ldp             x9,  x10, [sp], #80

        add             x3,  x3,  x1
        add             x0,  x0,  x1

        ret
endfunc

// void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
//                                      const pixel (*left)[4], const pixel *lpf,
//                                      const int w, int h,
//                                      const int16_t filter[2][8],
//                                      const enum LrEdgeFlags edges,
//                                      const int bitdepth_max);
function wiener_filter5_16bpc_neon, export=1
        ldr             w8,  [sp]
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-32]!
        stp             d8,  d9,  [sp, #16]
        mov             x29, sp
        ld1             {v0.8h, v1.8h},  [x6]
        tst             w7,  #4               // LR_HAVE_TOP
        sub_sp          384*2*4

        dup             v28.8h,  w8           // bitdepth_max
        clz             w8,  w8
        movi            v30.4s,  #1
        sub             w10, w8,  #38         // -(bitdepth + 6)
        sub             w11, w8,  #11         // round_bits_v
        sub             w8,  w8,  #25         // -round_bits_h
        neg             w10, w10              // bitdepth + 6
        neg             w11, w11              // -round_bits_v
        dup             v2.4s,   w10
        dup             v29.4s,  w8           // -round_bits_h
        dup             v27.4s,  w11          // -round_bits_v
        movi            v31.8h,  #0x20, lsl #8  // 1 << 13 = 8192
        ushl            v30.4s,  v30.4s,  v2.4s // 1 << (bitdepth + 6)

        zip1            v0.2d,   v0.2d,   v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1

        // x11 - t4
        // x12 - t3
        // x13 - t2
        // x14 - t1
        // x15 - t0
        mov             x14, sp               // t1
        b.eq            L(no_top_5)

        mov             x16, x2               // backup left
        mov             x2,  #0
        bl              wiener_filter5_h_16bpc_neon
        add             x3,  x3,  x1          // lpf += stride
        mov             x11, x14              // t4
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter5_h_16bpc_neon
        add             x3,  x3,  x1,  lsl #2
        add             x3,  x3,  x1          // lpf += stride*5
        mov             x12, x14              // t3
        add             x14, x14, #384*2      // t1 += 384*2
        mov             x2,  x16              // left
        mov             x16, x3               // backup lpf
        mov             x3,  x0               // lpf = p
        bl              wiener_filter5_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x13, x14              // t2
        b.eq            L(v1_5)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter5_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_5)
        add             x3,  x3,  x1          // src += stride

L(main_5):
        mov             x15, x11              // t0 = t4
L(main_loop_5):
        bl              wiener_filter5_hv_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_loop_5)
        tst             w7,  #8 // LR_HAVE_BOTTOM
        b.eq            L(v2_5)

        mov             x3,  x16              // restore lpf
        mov             x2,  #0               // left = NULL
        bl              wiener_filter5_hv_16bpc_neon
        bl              wiener_filter5_hv_16bpc_neon
L(end_5):

        mov             sp,  x29
        ldp             d8,  d9,  [sp, #16]
        ldp             x29, x30, [sp], #32
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(no_top_5):
        add             x3,  x3,  x1,  lsl #2
        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
        mov             x3,  x0               // lpf = p

        bl              wiener_filter5_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x11, x14              // t4
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_5)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter5_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_5)
        add             x3,  x3,  x1          // src += stride
        add             x15, x14, #384*2      // t0 = t1 + 384*2
        bl              wiener_filter5_hv_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_5)
        add             x15, x15, #384*2*3    // t0 += 384*2*3
        bl              wiener_filter5_hv_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_5)
L(v2_5):
        bl              wiener_filter5_v_16bpc_neon
        add             x0,  x0,  x1
        mov             x11, x12
        mov             x12, x13
        mov             x13, x14
L(v1_5):
        bl              wiener_filter5_v_16bpc_neon
        b               L(end_5)
endfunc


function wiener_filter5_h_16bpc_neon
        stp             x3,  x4,  [sp, #-32]!
        str             x14,      [sp, #16]

        // Set up the src pointer to include the left edge; for LR_HAVE_LEFT
        // with left == NULL, the left edge is read from the src buffer itself.
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #4
        ld1             {v2.8h, v3.8h}, [x3], #32
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v2.8h, v3.8h}, [x3], #32
        ld1             {v4.d}[1], [x2], #8
        // Move x3 back to account for the last 2 pixels we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #4
        ext             v3.16b,  v2.16b,  v3.16b,  #12
        ext             v2.16b,  v4.16b,  v2.16b,  #12
        b               2f

1:
        ld1             {v2.8h, v3.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
        // and shift v2/v3 to have 2x the first pixel at the front.
        dup             v4.8h,  v2.h[0]
        // Move x3 back to account for the last 2 pixels we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #4
        ext             v3.16b,  v2.16b,  v3.16b,  #12
        ext             v2.16b,  v4.16b,  v2.16b,  #12

2:
        ld1             {v4.8h}, [x3], #16

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #18
        b.ge            4f   // If w >= 18, all used input pixels are valid

        // 1 <= w < 18, so w+2 pixels are valid in v2-v4. For w >= 9, this
        // ends up being called again; that's not strictly needed (we already
        // pad enough here), but it keeps the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input,
        // i.e. v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
        sub             w17, w4,  #23
        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -4
        ldr             h26, [x3,  w17, sxtw #1]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v26.8h,  v26.h[0]
        ld1             {v23.16b, v24.16b, v25.16b}, [x6]

        bit             v2.16b,  v26.16b, v23.16b
        bit             v3.16b,  v26.16b, v24.16b
        bit             v4.16b,  v26.16b, v25.16b

4:      // Loop horizontally
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, so keep the mul/mla chains tightly
        // sequenced like this.
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v18.16b, v2.16b,  v3.16b, #6
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v17.16b, v2.16b,  v3.16b, #4
        add             v18.8h,  v18.8h,  v16.8h
        add             v19.8h,  v19.8h,  v2.8h
        smull           v6.4s,   v17.4h,  v0.h[3]
        smlal           v6.4s,   v18.4h,  v0.h[2]
        smlal           v6.4s,   v19.4h,  v0.h[1]
        smull2          v7.4s,   v17.8h,  v0.h[3]
        smlal2          v7.4s,   v18.8h,  v0.h[2]
        smlal2          v7.4s,   v19.8h,  v0.h[1]

        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v18.16b, v3.16b,  v4.16b, #6
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v17.16b, v3.16b,  v4.16b, #4
        add             v18.8h,  v18.8h,  v16.8h
        add             v19.8h,  v19.8h,  v3.8h
        smull           v16.4s,  v17.4h,  v0.h[3]
        smlal           v16.4s,  v18.4h,  v0.h[2]
        smlal           v16.4s,  v19.4h,  v0.h[1]
        smull2          v17.4s,  v17.8h,  v0.h[3]
        smlal2          v17.4s,  v18.8h,  v0.h[2]
        smlal2          v17.4s,  v19.8h,  v0.h[1]
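
        // Same pairing trick as the 7-tap case above, for the symmetric
        // 5-tap filter (taps in v0.h[1-3] here). In scalar terms:
        //   out = f[3]*x[2] + f[2]*(x[1]+x[3]) + f[1]*(x[0]+x[4])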

        mvni            v24.8h,  #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
        add             v6.4s,   v6.4s,   v30.4s
        add             v7.4s,   v7.4s,   v30.4s
        add             v16.4s,  v16.4s,  v30.4s
        add             v17.4s,  v17.4s,  v30.4s
        srshl           v6.4s,   v6.4s,   v29.4s
        srshl           v7.4s,   v7.4s,   v29.4s
        srshl           v16.4s,  v16.4s,  v29.4s
        srshl           v17.4s,  v17.4s,  v29.4s
        sqxtun          v6.4h,   v6.4s
        sqxtun2         v6.8h,   v7.4s
        sqxtun          v7.4h,   v16.4s
        sqxtun2         v7.8h,   v17.4s
        umin            v6.8h,   v6.8h,   v24.8h
        umin            v7.8h,   v7.8h,   v24.8h
        sub             v6.8h,   v6.8h,   v31.8h
        sub             v7.8h,   v7.8h,   v31.8h

        subs            w4,  w4,  #16

        st1             {v6.8h, v7.8h}, [x14], #32

        b.le            0f
        mov             v2.16b,  v4.16b
        tst             w7,  #2 // LR_HAVE_RIGHT
        ld1             {v3.8h, v4.8h}, [x3], #32
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldr             x14,      [sp, #16]
        ldp             x3,  x4,  [sp], #32
        ret
endfunc

function wiener_filter5_v_16bpc_neon
        stp             x11, x12, [sp, #-48]!
        stp             x13, x14, [sp, #16]
        stp             x0,  x4,  [sp, #32]
1:
        ld1             {v16.8h, v17.8h}, [x11], #32
        ld1             {v18.8h, v19.8h}, [x12], #32
        ld1             {v20.8h, v21.8h}, [x13], #32
        ld1             {v22.8h, v23.8h}, [x14], #32

        smull           v2.4s,   v16.4h,  v0.h[5]
        smlal           v2.4s,   v18.4h,  v0.h[6]
        smlal           v2.4s,   v20.4h,  v0.h[7]
        smlal           v2.4s,   v22.4h,  v0.h[6]
        smlal           v2.4s,   v22.4h,  v0.h[5]
        smull2          v3.4s,   v16.8h,  v0.h[5]
        smlal2          v3.4s,   v18.8h,  v0.h[6]
        smlal2          v3.4s,   v20.8h,  v0.h[7]
        smlal2          v3.4s,   v22.8h,  v0.h[6]
        smlal2          v3.4s,   v22.8h,  v0.h[5]
        smull           v4.4s,   v17.4h,  v0.h[5]
        smlal           v4.4s,   v19.4h,  v0.h[6]
        smlal           v4.4s,   v21.4h,  v0.h[7]
        smlal           v4.4s,   v23.4h,  v0.h[6]
        smlal           v4.4s,   v23.4h,  v0.h[5]
        smull2          v5.4s,   v17.8h,  v0.h[5]
        smlal2          v5.4s,   v19.8h,  v0.h[6]
        smlal2          v5.4s,   v21.8h,  v0.h[7]
        smlal2          v5.4s,   v23.8h,  v0.h[6]
        smlal2          v5.4s,   v23.8h,  v0.h[5]
        srshl           v2.4s,   v2.4s,   v27.4s  // -round_bits_v
        srshl           v3.4s,   v3.4s,   v27.4s
        srshl           v4.4s,   v4.4s,   v27.4s
        srshl           v5.4s,   v5.4s,   v27.4s
        sqxtun          v2.4h,   v2.4s
        sqxtun2         v2.8h,   v3.4s
        sqxtun          v3.4h,   v4.4s
        sqxtun2         v3.8h,   v5.4s
        umin            v2.8h,   v2.8h,   v28.8h  // bitdepth_max
        umin            v3.8h,   v3.8h,   v28.8h

        subs            w4,  w4,  #16
        st1             {v2.8h, v3.8h}, [x0], #32
        b.gt            1b

        ldp             x0,  x4,  [sp, #32]
        ldp             x13, x14, [sp, #16]
        ldp             x11, x12, [sp], #48

        ret
endfunc

function wiener_filter5_hv_16bpc_neon
        // Backing up/restoring registers shifted, so that x11 gets the value
        // of x12, etc, and x15==x11, afterwards.
        stp             x12, x13, [sp, #-64]!
        stp             x14, x15, [sp, #16]
        stp             x12, x0,  [sp, #32]
        stp             x3,  x4,  [sp, #48]

        // Set up the src pointer to include the left edge; for LR_HAVE_LEFT
        // with left == NULL, the left edge is read from the src buffer itself.
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #4
        ld1             {v2.8h, v3.8h}, [x3], #32
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v2.8h, v3.8h}, [x3], #32
        ld1             {v4.d}[1], [x2], #8
        // Move x3 back to account for the last 2 pixels we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #4
        ext             v3.16b,  v2.16b,  v3.16b,  #12
        ext             v2.16b,  v4.16b,  v2.16b,  #12
        b               2f
1:
        ld1             {v2.8h, v3.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
        // and shift v2/v3 to have 2x the first pixel at the front.
        dup             v4.8h,   v2.h[0]
        // Move x3 back to account for the last 2 pixels we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #4
        ext             v3.16b,  v2.16b,  v3.16b,  #12
        ext             v2.16b,  v4.16b,  v2.16b,  #12

2:
        ld1             {v4.8h}, [x3], #16

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #18
        b.ge            4f   // If w >= 18, all used input pixels are valid

        // 1 <= w < 18, so w+2 pixels are valid in v2-v4. For w >= 9, this
        // ends up being called again; that's not strictly needed (we already
        // pad enough here), but it keeps the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input,
        // i.e. v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
        sub             w17, w4,  #23
        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -4
        ldr             h26, [x3,  w17, sxtw #1]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v26.8h,  v26.h[0]
        ld1             {v23.16b, v24.16b, v25.16b}, [x6]

        bit             v2.16b,  v26.16b, v23.16b
        bit             v3.16b,  v26.16b, v24.16b
        bit             v4.16b,  v26.16b, v25.16b

4:      // Loop horizontally
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v18.16b, v2.16b,  v3.16b, #6
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v17.16b, v2.16b,  v3.16b, #4
        add             v18.8h,  v18.8h,  v16.8h
        add             v19.8h,  v19.8h,  v2.8h
        smull           v6.4s,   v17.4h,  v0.h[3]
        smlal           v6.4s,   v18.4h,  v0.h[2]
        smlal           v6.4s,   v19.4h,  v0.h[1]
        smull2          v7.4s,   v17.8h,  v0.h[3]
        smlal2          v7.4s,   v18.8h,  v0.h[2]
        smlal2          v7.4s,   v19.8h,  v0.h[1]

        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v18.16b, v3.16b,  v4.16b, #6
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v17.16b, v3.16b,  v4.16b, #4
        add             v18.8h,  v18.8h,  v16.8h
        add             v19.8h,  v19.8h,  v3.8h
        smull           v24.4s,  v17.4h,  v0.h[3]
        smlal           v24.4s,  v18.4h,  v0.h[2]
        smlal           v24.4s,  v19.4h,  v0.h[1]
        smull2          v25.4s,  v17.8h,  v0.h[3]
        smlal2          v25.4s,  v18.8h,  v0.h[2]
        smlal2          v25.4s,  v19.8h,  v0.h[1]

        ld1             {v16.8h, v17.8h}, [x11], #32
        mvni            v26.8h,  #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
        add             v6.4s,   v6.4s,   v30.4s
        add             v7.4s,   v7.4s,   v30.4s
        add             v24.4s,  v24.4s,  v30.4s
        add             v25.4s,  v25.4s,  v30.4s
        ld1             {v18.8h, v19.8h}, [x12], #32
        srshl           v6.4s,   v6.4s,   v29.4s
        srshl           v7.4s,   v7.4s,   v29.4s
        srshl           v24.4s,  v24.4s,  v29.4s
        srshl           v25.4s,  v25.4s,  v29.4s
        ld1             {v20.8h, v21.8h}, [x13], #32
        sqxtun          v6.4h,   v6.4s
        sqxtun2         v6.8h,   v7.4s
        sqxtun          v7.4h,   v24.4s
        sqxtun2         v7.8h,   v25.4s
        ld1             {v22.8h, v23.8h}, [x14], #32
        umin            v6.8h,   v6.8h,   v26.8h
        umin            v7.8h,   v7.8h,   v26.8h
        sub             v6.8h,   v6.8h,   v31.8h
        sub             v7.8h,   v7.8h,   v31.8h

        smull           v8.4s,   v16.4h,  v0.h[5]
        smlal           v8.4s,   v18.4h,  v0.h[6]
        smlal           v8.4s,   v20.4h,  v0.h[7]
        smlal           v8.4s,   v22.4h,  v0.h[6]
        smlal           v8.4s,   v6.4h,   v0.h[5]
        smull2          v9.4s,   v16.8h,  v0.h[5]
        smlal2          v9.4s,   v18.8h,  v0.h[6]
        smlal2          v9.4s,   v20.8h,  v0.h[7]
        smlal2          v9.4s,   v22.8h,  v0.h[6]
        smlal2          v9.4s,   v6.8h,   v0.h[5]
        smull           v1.4s,   v17.4h,  v0.h[5]
        smlal           v1.4s,   v19.4h,  v0.h[6]
        smlal           v1.4s,   v21.4h,  v0.h[7]
        smlal           v1.4s,   v23.4h,  v0.h[6]
        smlal           v1.4s,   v7.4h,   v0.h[5]
        smull2          v5.4s,   v17.8h,  v0.h[5]
        smlal2          v5.4s,   v19.8h,  v0.h[6]
        smlal2          v5.4s,   v21.8h,  v0.h[7]
        smlal2          v5.4s,   v23.8h,  v0.h[6]
        smlal2          v5.4s,   v7.8h,   v0.h[5]
        srshl           v8.4s,   v8.4s,   v27.4s  // -round_bits_v
        srshl           v9.4s,   v9.4s,   v27.4s
        srshl           v1.4s,   v1.4s,   v27.4s
        srshl           v5.4s,   v5.4s,   v27.4s
        sqxtun          v8.4h,   v8.4s
        sqxtun2         v8.8h,   v9.4s
        sqxtun          v9.4h,   v1.4s
        sqxtun2         v9.8h,   v5.4s
        st1             {v6.8h, v7.8h}, [x15], #32
        umin            v8.8h,   v8.8h,   v28.8h  // bitdepth_max
        umin            v9.8h,   v9.8h,   v28.8h

        subs            w4,  w4,  #16

        st1             {v8.8h, v9.8h}, [x0], #32

        b.le            0f
        mov             v2.16b,  v4.16b
        tst             w7,  #2 // LR_HAVE_RIGHT
        ld1             {v3.8h, v4.8h}, [x3], #32
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldp             x3,  x4,  [sp, #48]
        ldp             x15, x0,  [sp, #32]
        ldp             x13, x14, [sp, #16]
        ldp             x11, x12, [sp], #64

        add             x3,  x3,  x1
        add             x0,  x0,  x1

        ret
endfunc

#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
//                                      const pixel (*left)[4],
//                                      const pixel *src, const int w,
//                                      const enum LrEdgeFlags edges);
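//
// Per row, this computes sliding 3-tap sums of pixels and of squared pixels
// for the SGR box filter. A hedged scalar C model (illustrative only; the
// indexing into the left-extended row is simplified):
//
//     static void box3_row(int32_t *sumsq, int16_t *sum,
//                          const uint16_t *x, int w) {
//         for (int i = 0; i < w; i++) {
//             sum[i]   = x[i] + x[i + 1] + x[i + 2];
//             sumsq[i] = x[i]*x[i] + x[i+1]*x[i+1] + x[i+2]*x[i+2];
//         }
//     }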
function sgr_box3_row_h_16bpc_neon, export=1
        add             w4,  w4,  #2 // w += 2

        tst             w5,  #1 // LR_HAVE_LEFT
        b.eq            1f
        cbnz            x2,  0f

        // LR_HAVE_LEFT && left == NULL
        sub             x3,  x3,  #4
        ld1             {v0.8h, v1.8h}, [x3], #32
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v0.8h, v1.8h}, [x3], #32
        ld1             {v2.d}[1], [x2]
        // Move x3 back to account for the last 2 pixels we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #4
        ext             v1.16b, v0.16b, v1.16b, #12
        ext             v0.16b, v2.16b, v0.16b, #12
        b               2f

1:
        ld1             {v0.8h, v1.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
        // and shift v0/v1 to have 2x the first pixel at the front.
        dup             v2.8h, v0.h[0]
        // Move x3 back to account for the last 2 pixels we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #4
        ext             v1.16b, v0.16b, v1.16b, #12
        ext             v0.16b, v2.16b, v0.16b, #12

2:
        tst             w5,  #2 // LR_HAVE_RIGHT
        b.ne            4f
        // We'll need to pad the right edge; load the pixel to pad with now,
        // while it is still easy to locate relative to x3.
        sub             w13, w4, #(2 + 16 - 2 + 1)
        ldr             h30, [x3,  w13, sxtw #1]
        // Fill v30 with the right padding pixel
        dup             v30.8h,  v30.h[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #10
        b.ge            4f   // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, so w pixels are valid in v0-v1. For w = 9, this ends
        // up being called again; that's not strictly needed (we pad enough
        // here), but it keeps the code as simple as possible.

        // Insert padding in v0/1.h[w] onwards
        movrel          x13, right_ext_mask
        sub             x13, x13, w4,  uxtw #1
        ld1             {v28.16b, v29.16b}, [x13]

        bit             v0.16b,  v30.16b, v28.16b
        bit             v1.16b,  v30.16b, v29.16b

4:      // Loop horizontally
        ext             v26.16b, v0.16b,  v1.16b,  #2
        ext             v27.16b, v0.16b,  v1.16b,  #4

        add             v6.8h,   v0.8h,   v26.8h
        umull           v22.4s,  v0.4h,   v0.4h
        umlal           v22.4s,  v26.4h,  v26.4h
        umlal           v22.4s,  v27.4h,  v27.4h
        add             v6.8h,   v6.8h,   v27.8h
        umull2          v23.4s,  v0.8h,   v0.8h
        umlal2          v23.4s,  v26.8h,  v26.8h
        umlal2          v23.4s,  v27.8h,  v27.8h

        subs            w4,  w4,  #8

        st1             {v6.8h},         [x1],  #16
        st1             {v22.4s,v23.4s}, [x0],  #32

        b.le            9f
        tst             w5,  #2 // LR_HAVE_RIGHT
        mov             v0.16b,  v1.16b
        ld1             {v1.8h},  [x3],  #16

        b.ne            4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

// void dav1d_sgr_box5_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
//                                      const pixel (*left)[4],
//                                      const pixel *src, const int w,
//                                      const enum LrEdgeFlags edges);
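//
// Same pattern as box3 above, but with 5-tap sliding sums. Hedged scalar C
// model of one output element (illustrative only):
//
//     sum[i]   = x[i] + x[i+1] + x[i+2] + x[i+3] + x[i+4];
//     sumsq[i] = x[i]*x[i] + x[i+1]*x[i+1] + x[i+2]*x[i+2]
//              + x[i+3]*x[i+3] + x[i+4]*x[i+4];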
function sgr_box5_row_h_16bpc_neon, export=1
        add             w4,  w4,  #2 // w += 2

        tst             w5,  #1 // LR_HAVE_LEFT
        b.eq            1f
        cbnz            x2,  0f

        // LR_HAVE_LEFT && left == NULL
        sub             x3,  x3,  #6
        ld1             {v0.8h, v1.8h}, [x3], #32
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v0.8h, v1.8h}, [x3], #32
        ld1             {v2.d}[1], [x2], #8
        // Move x3 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #6
        ext             v1.16b,  v0.16b,  v1.16b,  #10
        ext             v0.16b,  v2.16b,  v0.16b,  #10
        b               2f

1:
        ld1             {v0.8h, v1.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
        // and shift v0/v1 to have 3x the first pixel at the front.
        dup             v2.8h,  v0.h[0]
        // Move x3 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #6
        ext             v1.16b,  v0.16b,  v1.16b,  #10
        ext             v0.16b,  v2.16b,  v0.16b,  #10

2:
        tst             w5,  #2 // LR_HAVE_RIGHT
        b.ne            4f
        // We'll need to pad the right edge; load the pixel to pad with now,
        // while it is still easy to locate relative to x3.
        sub             w13, w4, #(2 + 16 - 3 + 1)
        ldr             h30, [x3,  w13, sxtw #1]
        // Fill v30 with the right padding pixel
        dup             v30.8h,  v30.h[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #11
        b.ge            4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, so w+1 pixels are valid in v0-v1. For w = 9 or w = 10,
        // this ends up being called again; that's not strictly needed (we pad
        // enough here), but it keeps the code as simple as possible.

        // Insert padding in v0/1.h[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel          x13, right_ext_mask, -1
        sub             x13, x13, w4,  uxtw #1
        ld1             {v28.16b, v29.16b}, [x13]

        bit             v0.16b,  v30.16b, v28.16b
        bit             v1.16b,  v30.16b, v29.16b

4:      // Loop horizontally
        ext             v26.16b, v0.16b,  v1.16b,  #2
        ext             v27.16b, v0.16b,  v1.16b,  #4

        add             v6.8h,   v0.8h,   v26.8h
        umull           v22.4s,  v0.4h,   v0.4h
        umlal           v22.4s,  v26.4h,  v26.4h
        umlal           v22.4s,  v27.4h,  v27.4h
        add             v6.8h,   v6.8h,   v27.8h
        umull2          v23.4s,  v0.8h,   v0.8h
        umlal2          v23.4s,  v26.8h,  v26.8h
        umlal2          v23.4s,  v27.8h,  v27.8h

        ext             v26.16b, v0.16b,  v1.16b,  #6
        ext             v27.16b, v0.16b,  v1.16b,  #8

        add             v6.8h,   v6.8h,   v26.8h
        umlal           v22.4s,  v26.4h,  v26.4h
        umlal           v22.4s,  v27.4h,  v27.4h
        add             v6.8h,   v6.8h,   v27.8h
        umlal2          v23.4s,  v26.8h,  v26.8h
        umlal2          v23.4s,  v27.8h,  v27.8h

        subs            w4,  w4,  #8

        st1             {v6.8h},         [x1],  #16
        st1             {v22.4s,v23.4s}, [x0],  #32

        b.le            9f
        tst             w5,  #2 // LR_HAVE_RIGHT
        mov             v0.16b,  v1.16b
        ld1             {v1.8h}, [x3], #16

        b.ne            4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

// void dav1d_sgr_box35_row_h_16bpc_neon(int32_t *sumsq3, int16_t *sum3,
//                                       int32_t *sumsq5, int16_t *sum5,
//                                       const pixel (*left)[4],
//                                       const pixel *src, const int w,
//                                       const enum LrEdgeFlags edges);
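//
// Computes the 3-tap and 5-tap sums in a single pass over the row: the
// 3-tap results (over x[i+1..i+3]) are stored to sumsq3/sum3, then the two
// outer taps are accumulated on top and stored to sumsq5/sum5.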
function sgr_box35_row_h_16bpc_neon, export=1
        add             w6,  w6,  #2 // w += 2

        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        cbnz            x4,  0f

        // LR_HAVE_LEFT && left == NULL
        sub             x5,  x5,  #6
        ld1             {v0.8h, v1.8h}, [x5], #32
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v0.8h, v1.8h}, [x5], #32
        ld1             {v2.d}[1], [x4], #8
        // Move x5 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             x5,  x5,  #6
        ext             v1.16b,  v0.16b,  v1.16b,  #10
        ext             v0.16b,  v2.16b,  v0.16b,  #10
        b               2f

1:
        ld1             {v0.8h, v1.8h}, [x5], #32
        // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
        // and shift v0/v1 to have 3x the first pixel at the front.
        dup             v2.8h,  v0.h[0]
        // Move x5 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             x5,  x5,  #6
        ext             v1.16b,  v0.16b,  v1.16b,  #10
        ext             v0.16b,  v2.16b,  v0.16b,  #10

2:
        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f
        // We'll need to pad the right edge; load the pixel to pad with now,
        // while it is still easy to locate relative to x5.
        sub             w13, w6, #(2 + 16 - 3 + 1)
        ldr             h30, [x5,  w13, sxtw #1]
        // Fill v30 with the right padding pixel
        dup             v30.8h,  v30.h[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w6,  #11
        b.ge            4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, so w+1 pixels are valid in v0-v1. For w = 9 or w = 10,
        // this ends up being called again; that's not strictly needed (we pad
        // enough here), but it keeps the code as simple as possible.

        // Insert padding in v0/1.h[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel          x13, right_ext_mask, -1
        sub             x13, x13, w6,  uxtw #1
        ld1             {v28.16b, v29.16b}, [x13]

        bit             v0.16b,  v30.16b, v28.16b
        bit             v1.16b,  v30.16b, v29.16b

4:      // Loop horizontally
        ext             v16.16b, v0.16b,  v1.16b,  #2
        ext             v17.16b, v0.16b,  v1.16b,  #4
        ext             v19.16b, v0.16b,  v1.16b,  #8
        ext             v18.16b, v0.16b,  v1.16b,  #6

        add             v20.8h,  v16.8h,  v17.8h
        add             v21.8h,  v0.8h,   v19.8h
        add             v20.8h,  v20.8h,  v18.8h

        umull           v22.4s,  v16.4h,  v16.4h
        umlal           v22.4s,  v17.4h,  v17.4h
        umlal           v22.4s,  v18.4h,  v18.4h

        umull2          v23.4s,  v16.8h,  v16.8h
        umlal2          v23.4s,  v17.8h,  v17.8h
        umlal2          v23.4s,  v18.8h,  v18.8h

        add             v21.8h,  v21.8h,  v20.8h
        st1             {v20.8h},        [x1], #16
        st1             {v22.4s,v23.4s}, [x0], #32

        umlal           v22.4s,  v0.4h,   v0.4h
        umlal           v22.4s,  v19.4h,  v19.4h

        umlal2          v23.4s,  v0.8h,   v0.8h
        umlal2          v23.4s,  v19.8h,  v19.8h

        subs            w6,  w6,  #8

        st1             {v21.8h},        [x3], #16
        st1             {v22.4s,v23.4s}, [x2], #32

        b.le            9f
        tst             w7,  #2 // LR_HAVE_RIGHT
        mov             v0.16b,  v1.16b
        ld1             {v1.8h}, [x5], #16

        b.ne            4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

sgr_funcs 16