/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

const right_ext_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst

// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
//                                       const pixel *src, ptrdiff_t stride,
//                                       const int16_t fh[7], const intptr_t w,
//                                       int h, enum LrEdgeFlags edges,
//                                       const int bitdepth_max);
function wiener_filter_h_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]
        ldrd            r6,  r7,  [sp, #108]
        ldr             r8,       [sp, #116] // bitdepth_max
        vld1.16         {q0}, [r4, :128]
        clz             r8,  r8
        vmov.i32        q14, #1
        sub             r9,  r8,  #38  // -(bitdepth + 6)
        sub             r8,  r8,  #25  // -round_bits_h
        neg             r9,  r9        // bitdepth + 6
        vdup.32         q1,  r9
        vdup.32         q13, r8        // -round_bits_h
        vmov.i16        q15, #8192
        vshl.u32        q14, q14, q1   // 1 << (bitdepth + 6)
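        // Illustrative note (not part of the original source): for
        // bitdepth_max = 0x3ff (10 bpc), clz gives 22, so r9 = 22 - 38 = -16
        // is negated to bitdepth + 6 = 16 and q14 = 1 << 16, while
        // r8 = 22 - 25 = -3 gives round_bits_h = 3. For 12 bpc (0xfff) the
        // same math yields 1 << 18 and round_bits_h = 5.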
        mov             r8,  r5
        // Calculate mid_stride
        add             r10, r5,  #7
        bic             r10, r10, #7
        lsl             r10, r10, #1

        // Set up pointers for reading/writing alternate rows
        add             r12, r0,  r10
        lsl             r10, r10, #1
        add             lr,  r2,  r3
        lsl             r3,  r3,  #1

        // Subtract the aligned width from mid_stride
        add             r11, r5,  #7
        bic             r11, r11, #7
        sub             r10, r10, r11, lsl #1

        // Subtract the number of pixels read from the source stride
        add             r11, r11, #8
        sub             r3,  r3,  r11, lsl #1
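        // Illustrative summary of the bookkeeping above (not part of the
        // original source), writing aligned_w for (w + 7) & ~7:
        //   mid_stride = 2 * aligned_w                  // bytes per int16_t mid row
        //   r10        = 2*mid_stride - 2*aligned_w     // end-of-row step, 2 mid rows down
        //   r3         = 2*stride - 2*(aligned_w + 8)   // end-of-row step, 2 src rows down
        // since the horizontal loop itself already advances r0/r12 by
        // 2*aligned_w and r2/lr by 2*(aligned_w + 8) bytes per row pair.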

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r7,  #1 // LR_HAVE_LEFT
        beq             2f
        // LR_HAVE_LEFT
        cmp             r1,  #0
        bne             0f
        // left == NULL
        sub             r2,  r2,  #6
        sub             lr,  lr,  #6
        b               1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 3 pixels from the src pointer,
        // but shift it as if we had done that.
        add             r3,  r3,  #6


1:      // Loop vertically
        vld1.16         {q2, q3}, [r2]!
        vld1.16         {q4, q5}, [lr]!

        tst             r7,  #1 // LR_HAVE_LEFT
        beq             0f
        cmp             r1,  #0
        beq             2f
        // LR_HAVE_LEFT, left != NULL
        vld1.16         {d3},  [r1]!
        // Move r2/lr back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r2,  r2,  #6
        sub             lr,  lr,  #6
        vld1.16         {d13}, [r1]!
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q1,  q2,  #10
        vext.8          q5,  q4,  q5,  #10
        vext.8          q4,  q6,  q4,  #10
        b               2f
0:
        // !LR_HAVE_LEFT, fill q1 with the leftmost pixel
        // and shift q2/q3 to have 3x the first pixel at the front.
        vdup.16         q1,  d4[0]
        vdup.16         q6,  d8[0]
        // Move r2 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             r2,  r2,  #6
        sub             lr,  lr,  #6
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q1,  q2,  #10
        vext.8          q5,  q4,  q5,  #10
        vext.8          q4,  q6,  q4,  #10
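        // Illustrative note (not part of the original source): with 16-bit
        // pixels, prepending 3 pixels means shifting in 6 bytes, hence the
        // "#10" (= 16 - 6) in the vext above: e.g. the new q2 is the last
        // 3 pixels of q1 (left column or replicated edge pixel) followed by
        // the first 5 pixels of the old q2.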

2:

        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub             r9,  r5,  #14
        lsl             r9,  r9,  #1
        ldrh            r11, [r2, r9]
        ldrh            r9,  [lr, r9]
        // Fill q11/q12 with the right padding pixel
        vdup.16         q11, r11
        vdup.16         q12, r9
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r5,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel_local    r4,  right_ext_mask, -6
        sub             r4,  r4,  r5,  lsl #1
        vld1.8          {q9, q10}, [r4]

        vbit            q2,  q11, q9
        vbit            q3,  q11, q10
        vbit            q4,  q12, q9
        vbit            q5,  q12, q10
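        // Illustrative note (not part of the original source): right_ext_mask
        // is preceded by 32 zero bytes, so loading 32 bytes from
        // right_ext_mask - 6 - 2*w gives 0x00 for the first 2*(w + 3) bytes
        // and 0xff after that; vbit then replaces lanes w+3.. of the q2:q3
        // (and q4:q5) pair with the padding pixel from q11/q12.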
169
1704:      // Loop horizontally
171        vext.8          q7,  q2,  q3,  #4
172        vext.8          q8,  q2,  q3,  #8
173        vext.8          q6,  q2,  q3,  #2
174        vext.8          q9,  q2,  q3,  #10
175        vadd.i16        q8,  q8,  q7
176        vadd.i16        q9,  q9,  q6
177        vext.8          q6,  q2,  q3,  #12
178        vext.8          q7,  q2,  q3,  #6
179        vadd.i16        q2,  q2,  q6
180        vmull.s16       q6,  d14, d0[3]
181        vmlal.s16       q6,  d16, d1[0]
182        vmlal.s16       q6,  d18, d1[1]
183        vmlal.s16       q6,  d4,  d1[2]
184        vmull.s16       q7,  d15, d0[3]
185        vmlal.s16       q7,  d17, d1[0]
186        vmlal.s16       q7,  d19, d1[1]
187        vmlal.s16       q7,  d5,  d1[2]
188
189        vext.8          q8,  q4,  q5,  #4
190        vext.8          q10, q4,  q5,  #8
191        vext.8          q9,  q4,  q5,  #2
192        vext.8          q2,  q4,  q5,  #10
193        vadd.i16        q10, q10, q8
194        vadd.i16        q2,  q2,  q9
195        vext.8          q8,  q4,  q5,  #12
196        vext.8          q9,  q4,  q5,  #6
197        vadd.i16        q4,  q4,  q8
198        vmull.s16       q8,  d18, d0[3]
199        vmlal.s16       q8,  d20, d1[0]
200        vmlal.s16       q8,  d4,  d1[1]
201        vmlal.s16       q8,  d8,  d1[2]
202        vmull.s16       q9,  d19, d0[3]
203        vmlal.s16       q9,  d21, d1[0]
204        vmlal.s16       q9,  d5,  d1[1]
205        vmlal.s16       q9,  d9,  d1[2]
206
207        vmvn.i16        q10, #0x8000 // 0x7fff = (1 << 15) - 1
208        vadd.i32        q6,  q6,  q14
209        vadd.i32        q7,  q7,  q14
210        vadd.i32        q8,  q8,  q14
211        vadd.i32        q9,  q9,  q14
212        vrshl.s32       q6,  q6,  q13
213        vrshl.s32       q7,  q7,  q13
214        vrshl.s32       q8,  q8,  q13
215        vrshl.s32       q9,  q9,  q13
216        vqmovun.s32     d12, q6
217        vqmovun.s32     d13, q7
218        vqmovun.s32     d14, q8
219        vqmovun.s32     d15, q9
220        vmin.u16        q6,  q6,  q10
221        vmin.u16        q7,  q7,  q10
222        vsub.i16        q6,  q6,  q15
223        vsub.i16        q7,  q7,  q15
224        subs            r5,  r5,  #8
225        vst1.16         {q6}, [r0,  :128]!
226        vst1.16         {q7}, [r12, :128]!
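        // Rough C-style reference for the values just stored (illustrative,
        // not part of the original source; "sum" is the 7-tap horizontal
        // convolution formed above from the symmetric tap pairs):
        //   mid[x] = iclip((sum + (1 << (bitdepth + 6)) +
        //                   (1 << (round_bits_h - 1))) >> round_bits_h,
        //                  0, 0x7fff) - 8192;
        // vrshl by the negative q13 does the rounding shift, and
        // vqmovun + vmin + the q15 subtraction do the clip and offset.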

        ble             9f
        tst             r7,  #2 // LR_HAVE_RIGHT
        vmov            q2,  q3
        vmov            q4,  q5
        vld1.16         {q3}, [r2]!
        vld1.16         {q5}, [lr]!
        bne             4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        subs            r6,  r6,  #2
        ble             0f
        // Jump to the next row and loop horizontally
        add             r0,  r0,  r10
        add             r12, r12, r10
        add             r2,  r2,  r3
        add             lr,  lr,  r3
        mov             r5,  r8
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
//                                       const int16_t *mid, int w, int h,
//                                       const int16_t fv[7], enum LrEdgeFlags edges,
//                                       ptrdiff_t mid_stride, const int bitdepth_max);
function wiener_filter_v_16bpc_neon, export=1
        push            {r4-r7,lr}
        vpush           {q4-q5}
        ldrd            r4,  r5,  [sp, #52]
        ldrd            r6,  r7,  [sp, #60]
        ldr             lr,       [sp, #68] // bitdepth_max
        vld1.16         {q0},  [r5, :128]
        vdup.16         q5,  lr
        clz             lr,  lr
        sub             lr,  lr,  #11   // round_bits_v
        vdup.32         q4,  lr
        mov             lr,  r4
        vneg.s32        q4,  q4         // -round_bits_v

        // Calculate the number of rows to move back when looping vertically
        mov             r12, r4
        tst             r6,  #4 // LR_HAVE_TOP
        beq             0f
        sub             r2,  r2,  r7, lsl #1
        add             r12, r12, #2
0:
        tst             r6,  #8 // LR_HAVE_BOTTOM
        beq             1f
        add             r12, r12, #2
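        // Illustrative note (not part of the original source): r12 becomes
        // h, plus 2 if LR_HAVE_TOP and 2 if LR_HAVE_BOTTOM, i.e. the number
        // of mid rows one column slice walks down; it is used with mls at
        // the end of the slice to move r2 back up to the top row.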

1:      // Start of horizontal loop; start one vertical filter slice.
        // Load rows into q8-q11 and pad properly.
        tst             r6,  #4 // LR_HAVE_TOP
        vld1.16         {q8},  [r2, :128], r7
        beq             2f
        // LR_HAVE_TOP
        vld1.16         {q10}, [r2, :128], r7
        vmov            q9,  q8
        vld1.16         {q11}, [r2, :128], r7
        b               3f
2:      // !LR_HAVE_TOP
        vmov            q9,  q8
        vmov            q10, q8
        vmov            q11, q8

3:
        cmp             r4,  #4
        blt             5f
        // Start filtering normally; fill in q12-q14 with unique rows.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        vld1.16         {q14}, [r2, :128], r7

4:
.macro filter compare
        subs            r4,  r4,  #1
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        vmull.s16       q2,  d16, d0[0]
        vmlal.s16       q2,  d18, d0[1]
        vmlal.s16       q2,  d20, d0[2]
        vmlal.s16       q2,  d22, d0[3]
        vmlal.s16       q2,  d24, d1[0]
        vmlal.s16       q2,  d26, d1[1]
        vmlal.s16       q2,  d28, d1[2]
        vmull.s16       q3,  d17, d0[0]
        vmlal.s16       q3,  d19, d0[1]
        vmlal.s16       q3,  d21, d0[2]
        vmlal.s16       q3,  d23, d0[3]
        vmlal.s16       q3,  d25, d1[0]
        vmlal.s16       q3,  d27, d1[1]
        vmlal.s16       q3,  d29, d1[2]
        vrshl.s32       q2,  q2,  q4    // round_bits_v
        vrshl.s32       q3,  q3,  q4
        vqmovun.s32     d4,  q2
        vqmovun.s32     d5,  q3
        vmin.u16        q2,  q2,  q5    // bitdepth_max
        vst1.16         {q2}, [r0, :128], r1
.if \compare
        cmp             r4,  #4
.else
        ble             9f
.endif
        vmov            q8,  q9
        vmov            q9,  q10
        vmov            q10, q11
        vmov            q11, q12
        vmov            q12, q13
        vmov            q13, q14
.endm
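        // Rough C-style reference for one output pixel of the filter macro
        // above (illustrative, not part of the original source), with
        // m[0..6] being the seven mid rows currently held in q8-q14:
        //   sum    = m[0]*fv[0] + m[1]*fv[1] + ... + m[6]*fv[6];
        //   dst[x] = iclip((sum + (1 << (round_bits_v - 1))) >> round_bits_v,
        //                  0, bitdepth_max);
        // q4 holds -round_bits_v, so vrshl is a rounding right shift, and
        // vqmovun + vmin perform the clamp to [0, bitdepth_max].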
        filter          1
        blt             7f
        vld1.16         {q14}, [r2, :128], r7
        b               4b

5:      // Less than 4 rows in total; not all of q12-q14 are filled yet.
        tst             r6,  #8 // LR_HAVE_BOTTOM
        beq             6f
        // LR_HAVE_BOTTOM
        cmp             r4,  #2
        // We load at least 2 rows in all cases.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        bgt             53f // 3 rows in total
        beq             52f // 2 rows in total
51:     // 1 row in total, q11 already loaded, load edge into q12-q14.
        vmov            q14, q13
        b               8f
52:     // 2 rows in total, q11 already loaded, load q12 with content data
        // and 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vmov            q15, q14
        b               8f
53:
        // 3 rows in total, q11 already loaded, load q12 and q13 with content
        // and 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vld1.16         {q15}, [r2, :128], r7
        vmov            q1,  q15
        b               8f

6:
        // !LR_HAVE_BOTTOM
        cmp             r4,  #2
        bgt             63f // 3 rows in total
        beq             62f // 2 rows in total
61:     // 1 row in total, q11 already loaded, pad that into q12-q14.
        vmov            q12, q11
        vmov            q13, q11
        vmov            q14, q11
        b               8f
62:     // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
        vld1.16         {q12}, [r2, :128], r7
        vmov            q13, q12
        vmov            q14, q12
        vmov            q15, q12
        b               8f
63:
        // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        vmov            q14, q13
        vmov            q15, q13
        vmov            q1,  q13
        b               8f

7:
        // All registers up to q13 are filled already, 3 valid rows left.
        // < 4 valid rows left; fill in padding and filter the last
        // few rows.
        tst             r6,  #8 // LR_HAVE_BOTTOM
        beq             71f
        // LR_HAVE_BOTTOM; load 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vld1.16         {q15}, [r2, :128], r7
        vmov            q1,  q15
        b               8f
71:
        // !LR_HAVE_BOTTOM, pad 3 rows
        vmov            q14, q13
        vmov            q15, q13
        vmov            q1,  q13

8:      // At this point, all registers up to q14-q15,q1 are loaded with
        // edge/padding (depending on how many rows are left).
        filter          0 // This branches to 9f when done
        vmov            q14, q15
        vmov            q15, q1
        b               8b

9:      // End of one vertical slice.
        subs            r3,  r3,  #8
        ble             0f
        // Move pointers back up to the top and loop horizontally.
        mls             r0,  r1,  lr,  r0
        mls             r2,  r7,  r12, r2
        add             r0,  r0,  #16
        add             r2,  r2,  #16
        mov             r4,  lr
        b               1b
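        // Illustrative note (not part of the original source): mls computes
        // Rd = Ra - Rn*Rm, so the two mls above move r0 back up by h output
        // rows and r2 back up by r12 mid rows; the adds of #16 that follow
        // them then step both pointers right by 8 pixels to the next slice.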

0:
        vpop            {q4-q5}
        pop             {r4-r7,pc}
.purgem filter
endfunc

#define SUM_STRIDE (384+16)

#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
//                                  const pixel (*left)[4],
//                                  const pixel *src, const ptrdiff_t stride,
//                                  const int w, const int h,
//                                  const enum LrEdgeFlags edges);
function sgr_box3_h_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]
        ldrd            r6,  r7,  [sp, #108]
        add             r5,  r5,  #2 // w += 2

        // Set up pointers for reading/writing alternate rows
        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
        add             r11, r1,  #(2*SUM_STRIDE)   // sum
        add             r12, r3,  r4                // src
        lsl             r4,  r4,  #1
        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride

        // Subtract the aligned width from the output stride.
        add             lr,  r5,  #7
        bic             lr,  lr,  #7
        sub             r9,  r9,  lr, lsl #1

        // Store the width for the vertical loop
        mov             r8,  r5

        // Subtract the number of pixels read from the input from the stride
        add             lr,  lr,  #8
        sub             r4,  r4,  lr, lsl #1
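        // Illustrative summary of the bookkeeping above (not part of the
        // original source), with w already incremented by 2 and
        // aligned_w = (w + 7) & ~7:
        //   r9 = 2*2*SUM_STRIDE - 2*aligned_w    // end-of-rows step for sum (int16_t)
        //   r4 = 2*stride - 2*(aligned_w + 8)    // end-of-rows step for src
        // the sumsq pointers use r9 << 1 instead, since they hold int32_t.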

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r7,  #1 // LR_HAVE_LEFT
        beq             2f
        // LR_HAVE_LEFT
        cmp             r2,  #0
        bne             0f
        // left == NULL
        sub             r3,  r3,  #4
        sub             r12, r12, #4
        b               1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 2 pixels from the src pointer,
        // but shift it as if we had done that.
        add             r4,  r4,  #4


1:      // Loop vertically
        vld1.16         {q0, q1}, [r3]!
        vld1.16         {q4, q5}, [r12]!

        tst             r7,  #1 // LR_HAVE_LEFT
        beq             0f
        cmp             r2,  #0
        beq             2f
        // LR_HAVE_LEFT, left != NULL
        vld1.16         {d5}, [r2]!
        // Move r3/r12 back to account for the last 2 pixels we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #4
        sub             r12, r12, #4
        vld1.16         {d13}, [r2]!
        vext.8          q1,  q0,  q1,  #12
        vext.8          q0,  q2,  q0,  #12
        vext.8          q5,  q4,  q5,  #12
        vext.8          q4,  q6,  q4,  #12
        b               2f
0:
        // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
        // and shift q0 to have 2x the first pixel at the front.
        vdup.16         q2,  d0[0]
        vdup.16         q6,  d8[0]
        // Move r3 back to account for the last 2 pixels we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #4
        sub             r12, r12, #4
        vext.8          q1,  q0,  q1,  #12
        vext.8          q0,  q2,  q0,  #12
        vext.8          q5,  q4,  q5,  #12
        vext.8          q4,  q6,  q4,  #12

2:
        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r5,  #(2 + 16 - 2 + 1)
        lsl             lr,  lr,  #1
        ldrh            r11, [r3,  lr]
        ldrh            lr,  [r12, lr]
        // Fill q14/q15 with the right padding pixel
        vdup.16         q14, r11
        vdup.16         q15, lr
        // Restore r11 after using it for a temporary value
        add             r11, r1,  #(2*SUM_STRIDE)
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r5,  #10
        bge             4f   // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
        // again; it's not strictly needed in those cases (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in q0/1.h[w] onwards
        movrel_local    lr,  right_ext_mask
        sub             lr,  lr,  r5,  lsl #1
        vld1.8          {q12, q13}, [lr]

        vbit            q0,  q14, q12
        vbit            q1,  q14, q13
        vbit            q4,  q15, q12
        vbit            q5,  q15, q13
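        // Illustrative note (not part of the original source): here the mask
        // pointer is right_ext_mask - 2*w with no extra byte offset, so the
        // loaded mask is 0x00 for the first 2*w bytes and 0xff afterwards,
        // i.e. vbit replaces lanes w.. of the q0:q1 (and q4:q5) pair with
        // the padding pixel (w already includes the +2 from above).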

4:      // Loop horizontally
        vext.8          q8,  q0,  q1,  #2
        vext.8          q10, q4,  q5,  #2
        vext.8          q9,  q0,  q1,  #4
        vext.8          q11, q4,  q5,  #4
        vadd.i16        q2,  q0,  q8
        vadd.i16        q3,  q4,  q10
        vadd.i16        q2,  q2,  q9
        vadd.i16        q3,  q3,  q11

        vmull.u16       q6,  d0,  d0
        vmlal.u16       q6,  d16, d16
        vmlal.u16       q6,  d18, d18
        vmull.u16       q12, d8,  d8
        vmlal.u16       q12, d20, d20
        vmlal.u16       q12, d22, d22
        vmull.u16       q7,  d1,  d1
        vmlal.u16       q7,  d17, d17
        vmlal.u16       q7,  d19, d19
        vmull.u16       q13, d9,  d9
        vmlal.u16       q13, d21, d21
        vmlal.u16       q13, d23, d23
        subs            r5,  r5,  #8
        vst1.16         {q2},       [r1,  :128]!
        vst1.16         {q3},       [r11, :128]!
        vst1.32         {q6,  q7},  [r0,  :128]!
        vst1.32         {q12, q13}, [r10, :128]!
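        // Rough C-style reference for the 8 outputs just stored per row
        // (illustrative, not part of the original source):
        //   sum[x]   = s[x] + s[x+1] + s[x+2];
        //   sumsq[x] = s[x]*s[x] + s[x+1]*s[x+1] + s[x+2]*s[x+2];
        // i.e. plain 3-tap horizontal box sums of the pixels and of their
        // squares, for each of the two rows handled per iteration.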

        ble             9f
        tst             r7,  #2 // LR_HAVE_RIGHT
        vmov            q0,  q1
        vmov            q4,  q5
        vld1.16         {q1}, [r3]!
        vld1.16         {q5}, [r12]!

        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        subs            r6,  r6,  #2
        ble             0f
        // Jump to the next row and loop horizontally
        add             r0,  r0,  r9, lsl #1
        add             r10, r10, r9, lsl #1
        add             r1,  r1,  r9
        add             r11, r11, r9
        add             r3,  r3,  r4
        add             r12, r12, r4
        mov             r5,  r8
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
//                                  const pixel (*left)[4],
//                                  const pixel *src, const ptrdiff_t stride,
//                                  const int w, const int h,
//                                  const enum LrEdgeFlags edges);
function sgr_box5_h_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]
        ldrd            r6,  r7,  [sp, #108]
        add             r5,  r5,  #2 // w += 2

        // Set up pointers for reading/writing alternate rows
        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
        add             r11, r1,  #(2*SUM_STRIDE)   // sum
        add             r12, r3,  r4                // src
        lsl             r4,  r4,  #1
        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride

        // Subtract the aligned width from the output stride.
        add             lr,  r5,  #7
        bic             lr,  lr,  #7
        sub             r9,  r9,  lr, lsl #1
        add             lr,  lr,  #8
        sub             r4,  r4,  lr, lsl #1

        // Store the width for the vertical loop
        mov             r8,  r5

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r7,  #1 // LR_HAVE_LEFT
        beq             2f
        // LR_HAVE_LEFT
        cmp             r2,  #0
        bne             0f
        // left == NULL
        sub             r3,  r3,  #6
        sub             r12, r12, #6
        b               1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 3 pixels from the src pointer,
        // but shift it as if we had done that.
        add             r4,  r4,  #6

1:      // Loop vertically
        vld1.16         {q0, q1}, [r3]!
        vld1.16         {q4, q5}, [r12]!

        tst             r7,  #1 // LR_HAVE_LEFT
        beq             0f
        cmp             r2,  #0
        beq             2f
        // LR_HAVE_LEFT, left != NULL
        vld1.16         {d5}, [r2]!
        // Move r3/r12 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #6
        sub             r12, r12, #6
        vld1.16         {d13}, [r2]!
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10
        vext.8          q5,  q4,  q5,  #10
        vext.8          q4,  q6,  q4,  #10
        b               2f
0:
        // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
        // and shift q0 to have 3x the first pixel at the front.
        vdup.16         q2,  d0[0]
        vdup.16         q6,  d8[0]
        // Move r3 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #6
        sub             r12, r12, #6
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10
        vext.8          q5,  q4,  q5,  #10
        vext.8          q4,  q6,  q4,  #10

2:
        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r5,  #(2 + 16 - 3 + 1)
        lsl             lr,  lr,  #1
        ldrh            r11, [r3,  lr]
        ldrh            lr,  [r12, lr]
        // Fill q14/q15 with the right padding pixel
        vdup.16         q14, r11
        vdup.16         q15, lr
        // Restore r11 after using it for a temporary value
        add             r11, r1,  #(2*SUM_STRIDE)
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r5,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel_local    lr,  right_ext_mask, -2
        sub             lr,  lr,  r5,  lsl #1
        vld1.8          {q12, q13}, [lr]

        vbit            q0,  q14, q12
        vbit            q1,  q14, q13
        vbit            q4,  q15, q12
        vbit            q5,  q15, q13

4:      // Loop horizontally
        vext.8          q8,  q0,  q1,  #2
        vext.8          q10, q4,  q5,  #2
        vext.8          q9,  q0,  q1,  #4
        vext.8          q11, q4,  q5,  #4
        vadd.i16        q2,  q0,  q8
        vadd.i16        q3,  q4,  q10
        vadd.i16        q2,  q2,  q9
        vadd.i16        q3,  q3,  q11

        vmull.u16       q6,  d0,  d0
        vmlal.u16       q6,  d16, d16
        vmlal.u16       q6,  d18, d18
        vmull.u16       q12, d8,  d8
        vmlal.u16       q12, d20, d20
        vmlal.u16       q12, d22, d22
        vmull.u16       q7,  d1,  d1
        vmlal.u16       q7,  d17, d17
        vmlal.u16       q7,  d19, d19
        vmull.u16       q13, d9,  d9
        vmlal.u16       q13, d21, d21
        vmlal.u16       q13, d23, d23

        vext.8          q8,  q0,  q1,  #6
        vext.8          q10, q4,  q5,  #6
        vext.8          q9,  q0,  q1,  #8
        vext.8          q11, q4,  q5,  #8
        vadd.i16        q2,  q2,  q8
        vadd.i16        q3,  q3,  q10
        vadd.i16        q2,  q2,  q9
        vadd.i16        q3,  q3,  q11

        vmlal.u16       q6,  d16, d16
        vmlal.u16       q6,  d1,  d1
        vmlal.u16       q12, d20, d20
        vmlal.u16       q12, d9,  d9
        vmlal.u16       q7,  d17, d17
        vmlal.u16       q7,  d19, d19
        vmlal.u16       q13, d21, d21
        vmlal.u16       q13, d23, d23

        subs            r5,  r5,  #8
        vst1.16         {q2},       [r1,  :128]!
        vst1.16         {q3},       [r11, :128]!
        vst1.32         {q6,  q7},  [r0,  :128]!
        vst1.32         {q12, q13}, [r10, :128]!
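        // Rough C-style reference for the 8 outputs just stored per row
        // (illustrative, not part of the original source):
        //   sum[x]   = s[x] + s[x+1] + s[x+2] + s[x+3] + s[x+4];
        //   sumsq[x] = s[x]*s[x] + ... + s[x+4]*s[x+4];
        // d1/d9 are reused above for the s[x+4] squares of the low halves,
        // since they already contain pixels 4..7 of each row.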

        ble             9f
        tst             r7,  #2 // LR_HAVE_RIGHT
        vmov            q0,  q1
        vmov            q4,  q5
        vld1.16         {q1}, [r3]!
        vld1.16         {q5}, [r12]!
        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        subs            r6,  r6,  #2
        ble             0f
        // Jump to the next row and loop horizontally
        add             r0,  r0,  r9, lsl #1
        add             r10, r10, r9, lsl #1
        add             r1,  r1,  r9
        add             r11, r11, r9
        add             r3,  r3,  r4
        add             r12, r12, r4
        mov             r5,  r8
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

sgr_funcs 16