/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

const right_ext_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
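
// Loading from right_ext_mask minus the byte size of the valid part of the
// row yields a mask that is 0x00 for lanes that still hold valid pixels and
// 0xff for lanes past the right edge; vbit then overwrites only the
// out-of-range lanes with the padding pixel.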

// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
//                                      const pixel *src, ptrdiff_t stride,
//                                      const int16_t fh[8], intptr_t w,
//                                      int h, enum LrEdgeFlags edges);
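// Produces the 16-bit intermediate rows that wiener_filter_v below consumes;
// two input rows are filtered per pass of the vertical loop, written through
// r0 and r12 respectively.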
function wiener_filter_h_8bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]
        ldrd            r6,  r7,  [sp, #108]
        mov             r8,  r5
        vld1.16         {q0},  [r4, :128]
        movw            r9,  #(1 << 14) - (1 << 2)
        vdup.16         q14, r9
        vmov.s16        q15, #2048
        // Calculate mid_stride
        add             r10, r5,  #7
        bic             r10, r10, #7
        lsl             r10, r10, #1

        // Set up pointers for reading/writing alternate rows
        add             r12, r0,  r10
        lsl             r10, r10, #1
        add             lr,  r2,  r3
        lsl             r3,  r3,  #1

        // Subtract the aligned width from mid_stride
        add             r11, r5,  #7
        bic             r11, r11, #7
        sub             r10, r10, r11, lsl #1

        // Subtract the number of pixels read from the source stride
        add             r11, r11, #8
        sub             r3,  r3,  r11

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r7,  #1 // LR_HAVE_LEFT
        beq             2f
        // LR_HAVE_LEFT
        cmp             r1,  #0
        bne             0f
        // left == NULL
        sub             r2,  r2,  #3
        sub             lr,  lr,  #3
        b               1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 3 pixels from the src pointer,
        // but shift it as if we had done that.
        add             r3,  r3,  #3


1:      // Loop vertically
        vld1.8          {q2},  [r2]!
        vld1.8          {q9},  [lr]!

        tst             r7,  #1 // LR_HAVE_LEFT
        beq             0f
        cmp             r1,  #0
        beq             2f
        // LR_HAVE_LEFT, left != NULL
        vld1.32         {d3[1]},  [r1]!
        // Move r2/lr back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             r2,  r2,  #3
        sub             lr,  lr,  #3
        vld1.32         {d17[1]},  [r1]!
        vext.8          q2,  q1,  q2,  #13
        vext.8          q9,  q8,  q9,  #13
        b               2f
0:
        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
        // and shift q2 to have 3x the first byte at the front.
        vdup.8          q1,  d4[0]
        vdup.8          q8,  d18[0]
        // Move r2 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             r2,  r2,  #3
        sub             lr,  lr,  #3
        vext.8          q2,  q1,  q2,  #13
        vext.8          q9,  q8,  q9,  #13

2:
        vmovl.u8        q1,  d4
        vmovl.u8        q2,  d5
        vmovl.u8        q8,  d18
        vmovl.u8        q9,  d19

        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             r9,  r5,  #14
        ldrb            r11, [r2, r9]
        ldrb            r9,  [lr, r9]
        // Fill q12/q13 with the right padding pixel
        vdup.16         q12, r11
        vdup.16         q13, r9
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r5,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel_local    r4,  right_ext_mask, -6
        sub             r4,  r4,  r5,  lsl #1
        vld1.8          {q10, q11}, [r4]

        vbit            q1,  q12, q10
        vbit            q2,  q12, q11
        vbit            q8,  q13, q10
        vbit            q9,  q13, q11

4:      // Loop horizontally
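        // The 7-tap Wiener filter is symmetric (fh[0..2] mirror fh[6..4]),
        // so mirrored pixel pairs are summed first and each pair shares a
        // single coefficient.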
        vext.8          q11, q1,  q2,  #4
        vext.8          q5,  q1,  q2,  #8
        vext.8          q10, q1,  q2,  #2
        vext.8          q6,  q1,  q2,  #10
        vext.8          q7,  q1,  q2,  #12
        vext.8          q4,  q1,  q2,  #6
        vadd.i16        q5,  q5,  q11
        vadd.i16        q6,  q6,  q10
        vadd.i16        q7,  q7,  q1
        vmul.s16        q3,  q4,  d0[3]
        vmla.s16        q3,  q5,  d1[0]
        vmla.s16        q3,  q6,  d1[1]
        vmla.s16        q3,  q7,  d1[2]

        vext.8          q4,  q8,  q9,  #4
        vext.8          q6,  q8,  q9,  #8
        vext.8          q11, q8,  q9,  #2
        vext.8          q7,  q8,  q9,  #10
        vadd.i16        q6,  q6,  q4
        vext.8          q4,  q8,  q9,  #12
        vext.8          q5,  q8,  q9,  #6
        vadd.i16        q7,  q7,  q11
        vadd.i16        q4,  q4,  q8
        vmul.s16        q10, q5,  d0[3]
        vmla.s16        q10, q6,  d1[0]
        vmla.s16        q10, q7,  d1[1]
        vmla.s16        q10, q4,  d1[2]

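        // The centre pixel also carries a weight of 128 applied via the
        // shift (on top of fh[3]); bias the term into signed 16-bit range
        // for the saturating add, then shift down by 3 and re-centre around
        // 2048 to form the intermediate values.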
        vext.8          q1,  q1,  q2,  #6
        vext.8          q8,  q8,  q9,  #6
        vshl.s16        q1,  q1,  #7
        vshl.s16        q8,  q8,  #7
        vsub.s16        q1,  q1,  q14
        vsub.s16        q8,  q8,  q14
        vqadd.s16       q3,  q3,  q1
        vqadd.s16       q10, q10, q8
        vshr.s16        q3,  q3,  #3
        vshr.s16        q10, q10, #3
        vadd.s16        q3,  q3,  q15
        vadd.s16        q10, q10, q15
        subs            r5,  r5,  #8
        vst1.16         {q3},  [r0,  :128]!
        vst1.16         {q10}, [r12, :128]!

        ble             9f
        tst             r7,  #2 // LR_HAVE_RIGHT
        vmov            q1,  q2
        vmov            q8,  q9
        vld1.8          {d4},  [r2]!
        vld1.8          {d18}, [lr]!
        vmovl.u8        q2,  d4
        vmovl.u8        q9,  d18
        bne             4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        subs            r6,  r6,  #2
        ble             0f
        // Jump to the next row and loop horizontally
        add             r0,  r0,  r10
        add             r12, r12, r10
        add             r2,  r2,  r3
        add             lr,  lr,  r3
        mov             r5,  r8
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
//                                      const int16_t *mid, int w, int h,
//                                      const int16_t fv[8], enum LrEdgeFlags edges,
//                                      ptrdiff_t mid_stride);
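// Applies the vertical 7-tap pass over the intermediate rows produced by
// wiener_filter_h above, emitting 8 output columns per pass of the outer
// loop.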
function wiener_filter_v_8bpc_neon, export=1
        push            {r4-r7,lr}
        vpush           {q4-q6}
        ldrd            r4,  r5,  [sp, #68]
        ldrd            r6,  r7,  [sp, #76]
        mov             lr,  r4
        vld1.16         {q0},  [r5, :128]

        // Calculate the number of rows to move back when looping vertically
        mov             r12, r4
        tst             r6,  #4 // LR_HAVE_TOP
        beq             0f
        sub             r2,  r2,  r7,  lsl #1
        add             r12, r12, #2
0:
        tst             r6,  #8 // LR_HAVE_BOTTOM
        beq             1f
        add             r12, r12, #2

1:      // Start of horizontal loop; start one vertical filter slice.
        // Load rows into q8-q11 and pad properly.
        tst             r6,  #4 // LR_HAVE_TOP
        vld1.16         {q8},  [r2, :128], r7
        beq             2f
        // LR_HAVE_TOP
        vld1.16         {q10}, [r2, :128], r7
        vmov            q9,  q8
        vld1.16         {q11}, [r2, :128], r7
        b               3f
2:      // !LR_HAVE_TOP
        vmov            q9,  q8
        vmov            q10, q8
        vmov            q11, q8

3:
        cmp             r4,  #4
        blt             5f
        // Start filtering normally; fill in q12-q14 with unique rows.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        vld1.16         {q14}, [r2, :128], r7

4:
.macro filter compare
        subs            r4,  r4,  #1
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
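        // The vertical taps are symmetric as well; rows are paired around
        // the centre row (q11), so q10+q12, q9+q13 and q8+q14 each share
        // one coefficient.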
        vadd.i16        q4,  q10, q12
        vadd.i16        q5,  q9,  q13
        vadd.i16        q6,  q8,  q14
        vmull.s16       q2,  d22, d0[3]
        vmlal.s16       q2,  d8,  d1[0]
        vmlal.s16       q2,  d10, d1[1]
        vmlal.s16       q2,  d12, d1[2]
        vmull.s16       q3,  d23, d0[3]
        vmlal.s16       q3,  d9,  d1[0]
        vmlal.s16       q3,  d11, d1[1]
        vmlal.s16       q3,  d13, d1[2]
        vqrshrun.s32    d4,  q2,  #11
        vqrshrun.s32    d5,  q3,  #11
        vqmovun.s16     d4,  q2
        vst1.8          {d4}, [r0, :64], r1
.if \compare
        cmp             r4,  #4
.else
        ble             9f
.endif
        vmov            q8,  q9
        vmov            q9,  q10
        vmov            q10, q11
        vmov            q11, q12
        vmov            q12, q13
        vmov            q13, q14
.endm
        filter          1
        blt             7f
        vld1.16         {q14}, [r2, :128], r7
        b               4b

5:      // Less than 4 rows in total; not all of q12-q14 are filled yet.
        tst             r6,  #8 // LR_HAVE_BOTTOM
        beq             6f
        // LR_HAVE_BOTTOM
        cmp             r4,  #2
        // We load at least 2 rows in all cases.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        bgt             53f // 3 rows in total
        beq             52f // 2 rows in total
51:     // 1 row in total, q11 already loaded, load edge into q12-q14.
        vmov            q14, q13
        b               8f
52:     // 2 rows in total, q11 already loaded, load q12 with content data
        // and 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vmov            q15, q14
        b               8f
53:
        // 3 rows in total, q11 already loaded, load q12 and q13 with content
        // and 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vld1.16         {q15}, [r2, :128], r7
        vmov            q1,  q15
        b               8f

6:
        // !LR_HAVE_BOTTOM
        cmp             r4,  #2
        bgt             63f // 3 rows in total
        beq             62f // 2 rows in total
61:     // 1 row in total, q11 already loaded, pad that into q12-q14.
        vmov            q12, q11
        vmov            q13, q11
        vmov            q14, q11
        b               8f
62:     // 2 rows in total, q11 already loaded, load q12 and pad that into q13-q15.
        vld1.16         {q12}, [r2, :128], r7
        vmov            q13, q12
        vmov            q14, q12
        vmov            q15, q12
        b               8f
63:
        // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        vmov            q14, q13
        vmov            q15, q13
        vmov            q1,  q13
        b               8f

7:
        // All registers up to q13 are filled already, 3 valid rows left.
        // < 4 valid rows left; fill in padding and filter the last
        // few rows.
        tst             r6,  #8 // LR_HAVE_BOTTOM
        beq             71f
        // LR_HAVE_BOTTOM; load 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vld1.16         {q15}, [r2, :128], r7
        vmov            q1,  q15
        b               8f
71:
        // !LR_HAVE_BOTTOM, pad 3 rows
        vmov            q14, q13
        vmov            q15, q13
        vmov            q1,  q13

8:      // At this point, all registers up to q14-15,q1 are loaded with
        // edge/padding (depending on how many rows are left).
        filter          0 // This branches to 9f when done
        vmov            q14, q15
        vmov            q15, q1
        b               8b

9:      // End of one vertical slice.
        subs            r3,  r3,  #8
        ble             0f
        // Move pointers back up to the top and loop horizontally.
        mls             r0,  r1,  lr,  r0
        mls             r2,  r7,  r12, r2
        add             r0,  r0,  #8
        add             r2,  r2,  #16
        mov             r4,  lr
        b               1b

0:
        vpop            {q4-q6}
        pop             {r4-r7,pc}
.purgem filter
endfunc

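// Stride, in elements, of the intermediate sumsq (int32_t) and sum (int16_t)
// rows used by the SGR box filters below.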
#define SUM_STRIDE (384+16)

#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                 const pixel (*left)[4],
//                                 const pixel *src, const ptrdiff_t stride,
//                                 const int w, const int h,
//                                 const enum LrEdgeFlags edges);
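// For each position, computes the horizontal sum of 3 consecutive pixels
// (written to sum) and of their squares (written to sumsq); two input rows
// are processed per pass of the vertical loop.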
function sgr_box3_h_8bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]
        ldrd            r6,  r7,  [sp, #108]
        add             r5,  r5,  #2 // w += 2

        // Set up pointers for reading/writing alternate rows
        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
        add             r11, r1,  #(2*SUM_STRIDE)   // sum
        add             r12, r3,  r4                // src
        lsl             r4,  r4,  #1
        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride

        // Subtract the aligned width from the output stride.
        add             lr,  r5,  #7
        bic             lr,  lr,  #7
        sub             r9,  r9,  lr, lsl #1

        // Store the width for the vertical loop
        mov             r8,  r5

        // Subtract the number of pixels read from the input from the stride
        add             lr,  lr,  #8
        sub             r4,  r4,  lr

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r7,  #1 // LR_HAVE_LEFT
        beq             2f
        // LR_HAVE_LEFT
        cmp             r2,  #0
        bne             0f
        // left == NULL
        sub             r3,  r3,  #2
        sub             r12, r12, #2
        b               1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 2 pixels from the src pointer,
        // but shift it as if we had done that.
        add             r4,  r4,  #2


1:      // Loop vertically
        vld1.8          {q0}, [r3]!
        vld1.8          {q4}, [r12]!

        tst             r7,  #1 // LR_HAVE_LEFT
        beq             0f
        cmp             r2,  #0
        beq             2f
        // LR_HAVE_LEFT, left != NULL
        vld1.32         {d3[]}, [r2]!
        // Move r3/r12 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #2
        sub             r12, r12, #2
        vld1.32         {d11[]}, [r2]!
        vext.8          q0,  q1,  q0,  #14
        vext.8          q4,  q5,  q4,  #14
        b               2f
0:
        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
        // and shift q0 to have 2x the first byte at the front.
        vdup.8          q1,  d0[0]
        vdup.8          q5,  d8[0]
        // Move r3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #2
        sub             r12, r12, #2
        vext.8          q0,  q1,  q0,  #14
        vext.8          q4,  q5,  q4,  #14

2:
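        // Square each pixel, widening to 16 bits, for the sumsq accumulation.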
        vmull.u8        q1,  d0,  d0
        vmull.u8        q2,  d1,  d1
        vmull.u8        q5,  d8,  d8
        vmull.u8        q6,  d9,  d9

        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r5,  #(2 + 16 - 2 + 1)
        ldrb            r11, [r3,  lr]
        ldrb            lr,  [r12, lr]
        // Fill q14/q15 with the right padding pixel
        vdup.8          q14, r11
        vdup.8          q15, lr
        // Restore r11 after using it for a temporary value
        add             r11, r1,  #(2*SUM_STRIDE)
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r5,  #10
        bge             4f   // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called
        // again; it's not strictly needed in those cases (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in q0/4.b[w] onwards
        movrel_local    lr,  right_ext_mask
        sub             lr,  lr,  r5
        vld1.8          {q13}, [lr]

        vbit            q0,  q14, q13
        vbit            q4,  q15, q13

        // Update the precalculated squares
        vmull.u8        q1,  d0,  d0
        vmull.u8        q2,  d1,  d1
        vmull.u8        q5,  d8,  d8
        vmull.u8        q6,  d9,  d9

4:      // Loop horizontally
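        // sum[x] = s[x] + s[x+1] + s[x+2], and likewise for the squared
        // values into sumsq.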
        vext.8          d16, d0,  d1,  #1
        vext.8          d17, d0,  d1,  #2
        vext.8          d18, d8,  d9,  #1
        vext.8          d19, d8,  d9,  #2
        vaddl.u8        q3,  d0,  d16
        vaddw.u8        q3,  q3,  d17
        vaddl.u8        q7,  d8,  d18
        vaddw.u8        q7,  q7,  d19

        vext.8          q8,  q1,  q2,  #2
        vext.8          q9,  q1,  q2,  #4
        vext.8          q10, q5,  q6,  #2
        vext.8          q11, q5,  q6,  #4

        vaddl.u16       q12, d2,  d16
        vaddl.u16       q13, d3,  d17
        vaddw.u16       q12, q12, d18
        vaddw.u16       q13, q13, d19

        vaddl.u16       q8,  d10, d20
        vaddl.u16       q9,  d11, d21
        vaddw.u16       q8,  q8,  d22
        vaddw.u16       q9,  q9,  d23

        subs            r5,  r5,  #8
        vst1.16         {q3},       [r1,  :128]!
        vst1.16         {q7},       [r11, :128]!
        vst1.32         {q12, q13}, [r0,  :128]!
        vst1.32         {q8,  q9},  [r10, :128]!

        ble             9f
        tst             r7,  #2 // LR_HAVE_RIGHT
        vld1.8          {d6},  [r3]!
        vld1.8          {d14}, [r12]!
        vmov            q1,  q2
        vmov            q5,  q6
        vext.8          q0,  q0,  q3,  #8
        vext.8          q4,  q4,  q7,  #8
        vmull.u8        q2,  d6,  d6
        vmull.u8        q6,  d14, d14

        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        subs            r6,  r6,  #2
        ble             0f
        // Jump to the next row and loop horizontally
        add             r0,  r0,  r9, lsl #1
        add             r10, r10, r9, lsl #1
        add             r1,  r1,  r9
        add             r11, r11, r9
        add             r3,  r3,  r4
        add             r12, r12, r4
        mov             r5,  r8
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                 const pixel (*left)[4],
//                                 const pixel *src, const ptrdiff_t stride,
//                                 const int w, const int h,
//                                 const enum LrEdgeFlags edges);
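// Same as dav1d_sgr_box3_h_8bpc_neon above, but summing 5 consecutive pixels
// (and their squares) per position.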
function sgr_box5_h_8bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]
        ldrd            r6,  r7,  [sp, #108]
        add             r5,  r5,  #2 // w += 2

        // Set up pointers for reading/writing alternate rows
        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
        add             r11, r1,  #(2*SUM_STRIDE)   // sum
        add             r12, r3,  r4                // src
        lsl             r4,  r4,  #1
        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride

        // Subtract the aligned width from the output stride.
        add             lr,  r5,  #7
        bic             lr,  lr,  #7
        sub             r9,  r9,  lr, lsl #1
        add             lr,  lr,  #8
        sub             r4,  r4,  lr

        // Store the width for the vertical loop
        mov             r8,  r5

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r7,  #1 // LR_HAVE_LEFT
        beq             2f
        // LR_HAVE_LEFT
        cmp             r2,  #0
        bne             0f
        // left == NULL
        sub             r3,  r3,  #3
        sub             r12, r12, #3
        b               1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 3 pixels from the src pointer,
        // but shift it as if we had done that.
        add             r4,  r4,  #3

1:      // Loop vertically
        vld1.8          {q0}, [r3]!
        vld1.8          {q4}, [r12]!

        tst             r7,  #1 // LR_HAVE_LEFT
        beq             0f
        cmp             r2,  #0
        beq             2f
        // LR_HAVE_LEFT, left != NULL
        vld1.32         {d3[]}, [r2]!
        // Move r3/r12 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #3
        sub             r12, r12, #3
        vld1.32         {d11[]}, [r2]!
        vext.8          q0,  q1,  q0,  #13
        vext.8          q4,  q5,  q4,  #13
        b               2f
0:
        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
        // and shift q0 to have 3x the first byte at the front.
        vdup.8          q1,  d0[0]
        vdup.8          q5,  d8[0]
        // Move r3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #3
        sub             r12, r12, #3
        vext.8          q0,  q1,  q0,  #13
        vext.8          q4,  q5,  q4,  #13

2:
        vmull.u8        q1,  d0,  d0
        vmull.u8        q2,  d1,  d1
        vmull.u8        q5,  d8,  d8
        vmull.u8        q6,  d9,  d9

        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r5,  #(2 + 16 - 3 + 1)
        ldrb            r11, [r3,  lr]
        ldrb            lr,  [r12, lr]
        // Fill q14/q15 with the right padding pixel
        vdup.8          q14, r11
        vdup.8          q15, lr
        // Restore r11 after using it for a temporary value
        add             r11, r1,  #(2*SUM_STRIDE)
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r5,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q0/4.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel_local    lr,  right_ext_mask, -1
        sub             lr,  lr,  r5
        vld1.8          {q13}, [lr]

        vbit            q0,  q14, q13
        vbit            q4,  q15, q13

        // Update the precalculated squares
        vmull.u8        q1,  d0,  d0
        vmull.u8        q2,  d1,  d1
        vmull.u8        q5,  d8,  d8
        vmull.u8        q6,  d9,  d9

4:      // Loop horizontally
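        // sum[x] = s[x] + s[x+1] + s[x+2] + s[x+3] + s[x+4], and likewise
        // for the squared values into sumsq.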
        vext.8          d16, d0,  d1,  #1
        vext.8          d17, d0,  d1,  #2
        vext.8          d18, d0,  d1,  #3
        vext.8          d19, d0,  d1,  #4
        vext.8          d20, d8,  d9,  #1
        vext.8          d21, d8,  d9,  #2
        vext.8          d22, d8,  d9,  #3
        vext.8          d23, d8,  d9,  #4
        vaddl.u8        q3,  d0,  d16
        vaddl.u8        q12, d17, d18
        vaddl.u8        q7,  d8,  d20
        vaddl.u8        q13, d21, d22
        vaddw.u8        q3,  q3,  d19
        vaddw.u8        q7,  q7,  d23
        vadd.u16        q3,  q3,  q12
        vadd.u16        q7,  q7,  q13

        vext.8          q8,  q1,  q2,  #2
        vext.8          q9,  q1,  q2,  #4
        vext.8          q10, q1,  q2,  #6
        vext.8          q11, q1,  q2,  #8
        vaddl.u16       q12, d2,  d16
        vaddl.u16       q13, d3,  d17
        vaddl.u16       q8,  d18, d20
        vaddl.u16       q9,  d19, d21
        vaddw.u16       q12, q12, d22
        vaddw.u16       q13, q13, d23
        vadd.i32        q12, q12, q8
        vadd.i32        q13, q13, q9
        vext.8          q8,  q5,  q6,  #2
        vext.8          q9,  q5,  q6,  #4
        vext.8          q10, q5,  q6,  #6
        vext.8          q11, q5,  q6,  #8
        vaddl.u16       q1,  d10, d16
        vaddl.u16       q5,  d11, d17
        vaddl.u16       q8,  d18, d20
        vaddl.u16       q9,  d19, d21
        vaddw.u16       q1,  q1,  d22
        vaddw.u16       q5,  q5,  d23
        vadd.i32        q10, q1,  q8
        vadd.i32        q11, q5,  q9

        subs            r5,  r5,  #8
        vst1.16         {q3},       [r1,  :128]!
        vst1.16         {q7},       [r11, :128]!
        vst1.32         {q12, q13}, [r0,  :128]!
        vst1.32         {q10, q11}, [r10, :128]!

        ble             9f
        tst             r7,  #2 // LR_HAVE_RIGHT
        vld1.8          {d6},  [r3]!
        vld1.8          {d14}, [r12]!
        vmov            q1,  q2
        vmov            q5,  q6
        vext.8          q0,  q0,  q3,  #8
        vext.8          q4,  q4,  q7,  #8
        vmull.u8        q2,  d6,  d6
        vmull.u8        q6,  d14, d14
        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        subs            r6,  r6,  #2
        ble             0f
        // Jump to the next row and loop horizontally
        add             r0,  r0,  r9, lsl #1
        add             r10, r10, r9, lsl #1
        add             r1,  r1,  r9
        add             r11, r11, r9
        add             r3,  r3,  r4
        add             r12, r12, r4
        mov             r5,  r8
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

sgr_funcs 8