/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"

#define FILTER_OUT_STRIDE 384

.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter1_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t src_stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
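//
// Produces two rows of box-filter output at once from four rows of a/b
// coefficients. Roughly, per output pixel (an illustrative C-style sketch of
// the arithmetic below, not dav1d's reference code; b[r][c]/a[r][c] index the
// 16-bit/32-bit rows around the pixel, with [1][1] being the pixel itself):
//
//   A      = 4 * (b[1][1] + b[0][1] + b[1][0] + b[1][2] + b[2][1])
//          + 3 * (b[0][0] + b[0][2] + b[2][0] + b[2][2]);
//   B      = the same 4/3 weighting applied to the a[] rows;
//   tmp[x] = (B + A * src[x] + (1 << 8)) >> 9;
//
// The second output row repeats this with the 3x3 window shifted down by one.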
function sgr_finish_filter1_2rows_\bpc\()bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        ldp             x7,  x8,  [x3]
        ldp             x9,  x3,  [x3, #16]
        ldp             x10, x11, [x4]
        ldp             x12, x4,  [x4, #16]

        mov             x13, #FILTER_OUT_STRIDE
        cmp             w6,  #1
        add             x2,  x1,  x2 // src + stride
        csel            x2,  x1,  x2,  le // if (h <= 1) x2 = x1
        add             x13, x0,  x13, lsl #1

        movi            v30.8h, #3
        movi            v31.4s, #3
1:
        ld1             {v0.8h, v1.8h}, [x10], #32
        ld1             {v2.8h, v3.8h}, [x11], #32
        ld1             {v4.8h, v5.8h}, [x12], #32
        ld1             {v6.8h, v7.8h}, [x4],  #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x8], #48
        ld1             {v22.4s, v23.4s, v24.4s}, [x9], #48
        ld1             {v25.4s, v26.4s, v27.4s}, [x3], #48

2:
        ext             v8.16b,  v0.16b,  v1.16b, #2  // [0][1]
        ext             v9.16b,  v2.16b,  v3.16b, #2  // [1][1]
        ext             v10.16b, v4.16b,  v5.16b, #2  // [2][1]
        ext             v11.16b, v0.16b,  v1.16b, #4  // [0][2]
        ext             v12.16b, v2.16b,  v3.16b, #4  // [1][2]
        ext             v13.16b, v4.16b,  v5.16b, #4  // [2][2]

        add             v14.8h,  v2.8h,   v8.8h       // [1][0] + [0][1]
        add             v15.8h,  v9.8h,   v10.8h      // [1][1] + [2][1]

        add             v28.8h,  v0.8h,   v11.8h      // [0][0] + [0][2]
        add             v14.8h,  v14.8h,  v12.8h      // () + [1][2]
        add             v29.8h,  v4.8h,   v13.8h      // [2][0] + [2][2]

        ext             v8.16b,  v6.16b,  v7.16b, #2  // [3][1]
        ext             v11.16b, v6.16b,  v7.16b, #4  // [3][2]

        add             v14.8h,  v14.8h,  v15.8h      // mid
        add             v15.8h,  v28.8h,  v29.8h      // corners

        add             v28.8h,  v4.8h,   v9.8h       // [2][0] + [1][1]
        add             v29.8h,  v10.8h,  v8.8h       // [2][1] + [3][1]

        add             v2.8h,   v2.8h,   v12.8h      // [1][0] + [1][2]
        add             v28.8h,  v28.8h,  v13.8h      // () + [2][2]
        add             v4.8h,   v6.8h,   v11.8h      // [3][0] + [3][2]

        add             v0.8h,   v28.8h,  v29.8h      // mid
        add             v2.8h,   v2.8h,   v4.8h       // corners

        shl             v4.8h,   v14.8h,  #2
        mla             v4.8h,   v15.8h,  v30.8h      // * 3 -> a

        shl             v0.8h,   v0.8h,   #2
        mla             v0.8h,   v2.8h,   v30.8h      // * 3 -> a

        ext             v8.16b,  v16.16b, v17.16b, #4 // [0][1]
        ext             v9.16b,  v17.16b, v18.16b, #4
        ext             v10.16b, v16.16b, v17.16b, #8 // [0][2]
        ext             v11.16b, v17.16b, v18.16b, #8
        ext             v12.16b, v19.16b, v20.16b, #4 // [1][1]
        ext             v13.16b, v20.16b, v21.16b, #4
        add             v8.4s,   v8.4s,   v19.4s      // [0][1] + [1][0]
        add             v9.4s,   v9.4s,   v20.4s
        add             v16.4s,  v16.4s,  v10.4s      // [0][0] + [0][2]
        add             v17.4s,  v17.4s,  v11.4s
        ext             v14.16b, v19.16b, v20.16b, #8 // [1][2]
        ext             v15.16b, v20.16b, v21.16b, #8
        add             v16.4s,  v16.4s,  v22.4s      // () + [2][0]
        add             v17.4s,  v17.4s,  v23.4s
        add             v28.4s,  v12.4s,  v14.4s      // [1][1] + [1][2]
        add             v29.4s,  v13.4s,  v15.4s
        ext             v10.16b, v22.16b, v23.16b, #4 // [2][1]
        ext             v11.16b, v23.16b, v24.16b, #4
        add             v8.4s,   v8.4s,   v28.4s      // mid (incomplete)
        add             v9.4s,   v9.4s,   v29.4s

        add             v19.4s,  v19.4s,  v14.4s      // [1][0] + [1][2]
        add             v20.4s,  v20.4s,  v15.4s
        add             v14.4s,  v22.4s,  v12.4s      // [2][0] + [1][1]
        add             v15.4s,  v23.4s,  v13.4s

        ext             v12.16b, v22.16b, v23.16b, #8 // [2][2]
        ext             v13.16b, v23.16b, v24.16b, #8
        ext             v28.16b, v25.16b, v26.16b, #4 // [3][1]
        ext             v29.16b, v26.16b, v27.16b, #4
        add             v8.4s,   v8.4s,   v10.4s      // () + [2][1] = mid
        add             v9.4s,   v9.4s,   v11.4s
        add             v14.4s,  v14.4s,  v10.4s      // () + [2][1]
        add             v15.4s,  v15.4s,  v11.4s
        ext             v10.16b, v25.16b, v26.16b, #8 // [3][2]
        ext             v11.16b, v26.16b, v27.16b, #8
        add             v16.4s,  v16.4s,  v12.4s      // () + [2][2] = corner
        add             v17.4s,  v17.4s,  v13.4s

        add             v12.4s,  v12.4s,  v28.4s      // [2][2] + [3][1]
        add             v13.4s,  v13.4s,  v29.4s
        add             v25.4s,  v25.4s,  v10.4s      // [3][0] + [3][2]
        add             v26.4s,  v26.4s,  v11.4s

        add             v14.4s,  v14.4s,  v12.4s      // mid
        add             v15.4s,  v15.4s,  v13.4s
        add             v19.4s,  v19.4s,  v25.4s      // corner
        add             v20.4s,  v20.4s,  v26.4s

.if \bpc == 8
        ld1             {v25.8b}, [x1], #8            // src
        ld1             {v26.8b}, [x2], #8
.else
        ld1             {v25.8h}, [x1], #16           // src
        ld1             {v26.8h}, [x2], #16
.endif

        shl             v8.4s,   v8.4s,   #2
        shl             v9.4s,   v9.4s,   #2
        mla             v8.4s,   v16.4s,  v31.4s      // * 3 -> b
        mla             v9.4s,   v17.4s,  v31.4s

.if \bpc == 8
        uxtl            v25.8h,  v25.8b               // src
        uxtl            v26.8h,  v26.8b
.endif

        shl             v14.4s,  v14.4s,  #2
        shl             v15.4s,  v15.4s,  #2
        mla             v14.4s,  v19.4s,  v31.4s      // * 3 -> b
        mla             v15.4s,  v20.4s,  v31.4s

        umlal           v8.4s,   v4.4h,   v25.4h      // b + a * src
        umlal2          v9.4s,   v4.8h,   v25.8h
        umlal           v14.4s,  v0.4h,   v26.4h      // b + a * src
        umlal2          v15.4s,  v0.8h,   v26.8h
        mov             v0.16b,  v1.16b
        rshrn           v8.4h,   v8.4s,   #9
        rshrn2          v8.8h,   v9.4s,   #9
        mov             v2.16b,  v3.16b
        rshrn           v14.4h,  v14.4s,  #9
        rshrn2          v14.8h,  v15.4s,  #9
        subs            w5,  w5,  #8
        mov             v4.16b,  v5.16b
        st1             {v8.8h},  [x0],  #16
        mov             v6.16b,  v7.16b
        st1             {v14.8h}, [x13], #16

        b.le            3f
        mov             v16.16b, v18.16b
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        mov             v25.16b, v27.16b
        ld1             {v1.8h}, [x10], #16
        ld1             {v3.8h}, [x11], #16
        ld1             {v5.8h}, [x12], #16
        ld1             {v7.8h}, [x4],  #16
        ld1             {v17.4s, v18.4s}, [x7], #32
        ld1             {v20.4s, v21.4s}, [x8], #32
        ld1             {v23.4s, v24.4s}, [x9], #32
        ld1             {v26.4s, v27.4s}, [x3], #32
        b               2b

3:
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void dav1d_sgr_finish_weighted1_Xbpc_neon(pixel *dst,
//                                           const int32_t **a, const int16_t **b,
//                                           const int w, const int w1,
//                                           const int bitdepth_max);
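//
// Applies the same 4/3-weighted 3x3 box filter as sgr_finish_filter1_2rows to
// a single row, reading the pixels from dst, and blends the result back into
// dst in place with weight w1. Per pixel, approximately (illustrative sketch
// only):
//
//   t      = (B + A * dst[x] + (1 << 8)) >> 9;  // box-filtered value
//   u      = dst[x] << 4;
//   v      = (u << 7) + w1 * (t - u);
//   dst[x] = clip((v + (1 << 10)) >> 11);       // to [0, 255] or [0, bitdepth_max]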
function sgr_finish_weighted1_\bpc\()bpc_neon, export=1
        ldp             x7,  x8,  [x1]
        ldr             x1,       [x1, #16]
        ldp             x9,  x10, [x2]
        ldr             x2,       [x2, #16]

        dup             v31.8h, w4
        dup             v30.8h, w5

        movi            v6.8h,  #3
        movi            v7.4s,  #3
1:
        ld1             {v0.8h, v1.8h}, [x9],  #32
        ld1             {v2.8h, v3.8h}, [x10], #32
        ld1             {v4.8h, v5.8h}, [x2],  #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x8], #48
        ld1             {v22.4s, v23.4s, v24.4s}, [x1], #48

2:
        ext             v25.16b, v0.16b,  v1.16b, #2  // -stride
        ext             v26.16b, v2.16b,  v3.16b, #2  // 0
        ext             v27.16b, v4.16b,  v5.16b, #2  // +stride
        ext             v28.16b, v0.16b,  v1.16b, #4  // +1-stride
        ext             v29.16b, v2.16b,  v3.16b, #4  // +1
        add             v2.8h,   v2.8h,   v25.8h      // -1, -stride
        ext             v25.16b, v4.16b,  v5.16b, #4  // +1+stride
        add             v26.8h,  v26.8h,  v27.8h      // 0, +stride
        add             v0.8h,   v0.8h,   v28.8h      // -1-stride, +1-stride
        add             v2.8h,   v2.8h,   v26.8h
        add             v4.8h,   v4.8h,   v25.8h      // -1+stride, +1+stride
        add             v2.8h,   v2.8h,   v29.8h      // +1
        add             v0.8h,   v0.8h,   v4.8h

        ext             v25.16b, v16.16b, v17.16b, #4 // -stride
        ext             v26.16b, v17.16b, v18.16b, #4
        shl             v2.8h,   v2.8h,   #2
        ext             v27.16b, v16.16b, v17.16b, #8 // +1-stride
        ext             v28.16b, v17.16b, v18.16b, #8
        ext             v29.16b, v19.16b, v20.16b, #4 // 0
        ext             v4.16b,  v20.16b, v21.16b, #4
        mla             v2.8h,   v0.8h,   v6.8h       // * 3 -> a
        add             v25.4s,  v25.4s,  v19.4s      // -stride, -1
        add             v26.4s,  v26.4s,  v20.4s
        add             v16.4s,  v16.4s,  v27.4s      // -1-stride, +1-stride
        add             v17.4s,  v17.4s,  v28.4s
        ext             v27.16b, v19.16b, v20.16b, #8 // +1
        ext             v28.16b, v20.16b, v21.16b, #8
        add             v16.4s,  v16.4s,  v22.4s      // -1+stride
        add             v17.4s,  v17.4s,  v23.4s
        add             v29.4s,  v29.4s,  v27.4s      // 0, +1
        add             v4.4s,   v4.4s,   v28.4s
        add             v25.4s,  v25.4s,  v29.4s
        add             v26.4s,  v26.4s,  v4.4s
        ext             v27.16b, v22.16b, v23.16b, #4 // +stride
        ext             v28.16b, v23.16b, v24.16b, #4
        ext             v29.16b, v22.16b, v23.16b, #8 // +1+stride
        ext             v4.16b,  v23.16b, v24.16b, #8
.if \bpc == 8
        ld1             {v19.8b}, [x0]                // src
.else
        ld1             {v19.8h}, [x0]                // src
.endif
        add             v25.4s,  v25.4s,  v27.4s      // +stride
        add             v26.4s,  v26.4s,  v28.4s
        add             v16.4s,  v16.4s,  v29.4s      // +1+stride
        add             v17.4s,  v17.4s,  v4.4s
        shl             v25.4s,  v25.4s,  #2
        shl             v26.4s,  v26.4s,  #2
        mla             v25.4s,  v16.4s,  v7.4s       // * 3 -> b
        mla             v26.4s,  v17.4s,  v7.4s
.if \bpc == 8
        uxtl            v19.8h,  v19.8b               // src
.endif
        mov             v0.16b,  v1.16b
        umlal           v25.4s,  v2.4h,   v19.4h      // b + a * src
        umlal2          v26.4s,  v2.8h,   v19.8h
        mov             v2.16b,  v3.16b
        rshrn           v25.4h,  v25.4s,  #9
        rshrn2          v25.8h,  v26.4s,  #9

        subs            w3,  w3,  #8

        // weighted1
        shl             v19.8h,  v19.8h,  #4   // u
        mov             v4.16b,  v5.16b

        sub             v25.8h,  v25.8h,  v19.8h // t1 - u
        ld1             {v1.8h}, [x9],  #16
        ushll           v26.4s,  v19.4h,  #7     // u << 7
        ushll2          v27.4s,  v19.8h,  #7     // u << 7
        ld1             {v3.8h}, [x10], #16
        smlal           v26.4s,  v25.4h,  v31.4h // v
        smlal2          v27.4s,  v25.8h,  v31.8h // v
        ld1             {v5.8h}, [x2],  #16
.if \bpc == 8
        rshrn           v26.4h,  v26.4s,  #11
        rshrn2          v26.8h,  v27.4s,  #11
        mov             v16.16b, v18.16b
        sqxtun          v26.8b,  v26.8h
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        st1             {v26.8b}, [x0], #8
.else
        sqrshrun        v26.4h,  v26.4s,  #11
        sqrshrun2       v26.8h,  v27.4s,  #11
        mov             v16.16b, v18.16b
        umin            v26.8h,  v26.8h,  v30.8h
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        st1             {v26.8h}, [x0], #16
.endif

        b.le            3f
        ld1             {v17.4s, v18.4s}, [x7], #32
        ld1             {v20.4s, v21.4s}, [x8], #32
        ld1             {v23.4s, v24.4s}, [x1], #32
        b               2b

3:
        ret
endfunc

// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
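//
// Second-pass (5x5) counterpart of sgr_finish_filter1_2rows: produces two
// output rows from a single pair of a/b rows, using 5/6 weights. Per pixel,
// approximately (illustrative sketch only; b0/b1 are the two 16-bit rows,
// indexed relative to the current column, B the matching sum over the 32-bit
// a rows):
//
//   row 0: A = 5 * (b0[-1] + b0[+1] + b1[-1] + b1[+1]) + 6 * (b0[0] + b1[0]);
//          tmp[x]                     = (B + A * src[x]          + (1 << 8)) >> 9;
//   row 1: A = 5 * (b1[-1] + b1[+1]) + 6 * b1[0];
//          tmp[FILTER_OUT_STRIDE + x] = (B + A * src[stride + x] + (1 << 7)) >> 8;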
function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        ldp             x3,  x7,  [x3]
        ldp             x4,  x8,  [x4]
        mov             x10, #FILTER_OUT_STRIDE
        cmp             w6,  #1
        add             x2,  x1,  x2 // src + stride
        csel            x2,  x1,  x2,  le // if (h <= 1) x2 = x1
        add             x10, x0,  x10, lsl #1
        movi            v4.8h,  #5
        movi            v5.4s,  #5
        movi            v6.8h,  #6
        movi            v7.4s,  #6
1:
        ld1             {v0.8h, v1.8h}, [x4], #32
        ld1             {v2.8h, v3.8h}, [x8], #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x3], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48

2:
        ext             v24.16b, v0.16b,  v1.16b, #4  // +1-stride
        ext             v25.16b, v2.16b,  v3.16b, #4  // +1+stride
        ext             v22.16b, v0.16b,  v1.16b, #2  // -stride
        ext             v23.16b, v2.16b,  v3.16b, #2  // +stride
        add             v0.8h,   v0.8h,   v24.8h      // -1-stride, +1-stride
        add             v25.8h,  v2.8h,   v25.8h      // -1+stride, +1+stride
        add             v2.8h,   v22.8h,  v23.8h      // -stride, +stride
        add             v0.8h,   v0.8h,   v25.8h

        mul             v8.8h,   v25.8h,  v4.8h       // * 5
        mla             v8.8h,   v23.8h,  v6.8h       // * 6

        ext             v22.16b, v16.16b, v17.16b, #4 // -stride
        ext             v23.16b, v17.16b, v18.16b, #4
        ext             v24.16b, v19.16b, v20.16b, #4 // +stride
        ext             v25.16b, v20.16b, v21.16b, #4
        ext             v26.16b, v16.16b, v17.16b, #8 // +1-stride
        ext             v27.16b, v17.16b, v18.16b, #8
        ext             v28.16b, v19.16b, v20.16b, #8 // +1+stride
        ext             v29.16b, v20.16b, v21.16b, #8
        mul             v0.8h,   v0.8h,   v4.8h       // * 5
        mla             v0.8h,   v2.8h,   v6.8h       // * 6
.if \bpc == 8
        ld1             {v31.8b}, [x1], #8
        ld1             {v30.8b}, [x2], #8
.else
        ld1             {v31.8h}, [x1], #16
        ld1             {v30.8h}, [x2], #16
.endif
        add             v16.4s,  v16.4s,  v26.4s      // -1-stride, +1-stride
        add             v17.4s,  v17.4s,  v27.4s
        add             v19.4s,  v19.4s,  v28.4s      // -1+stride, +1+stride
        add             v20.4s,  v20.4s,  v29.4s
        add             v16.4s,  v16.4s,  v19.4s
        add             v17.4s,  v17.4s,  v20.4s

        mul             v9.4s,   v19.4s,  v5.4s       // * 5
        mla             v9.4s,   v24.4s,  v7.4s       // * 6
        mul             v10.4s,  v20.4s,  v5.4s       // * 5
        mla             v10.4s,  v25.4s,  v7.4s       // * 6

        add             v22.4s,  v22.4s,  v24.4s      // -stride, +stride
        add             v23.4s,  v23.4s,  v25.4s
        // This is, surprisingly, faster than other variants where the
        // mul+mla pairs are further apart, on Cortex A53.
        mul             v16.4s,  v16.4s,  v5.4s       // * 5
        mla             v16.4s,  v22.4s,  v7.4s       // * 6
        mul             v17.4s,  v17.4s,  v5.4s       // * 5
        mla             v17.4s,  v23.4s,  v7.4s       // * 6

.if \bpc == 8
        uxtl            v31.8h,  v31.8b
        uxtl            v30.8h,  v30.8b
.endif
        umlal           v16.4s,  v0.4h,   v31.4h      // b + a * src
        umlal2          v17.4s,  v0.8h,   v31.8h
        umlal           v9.4s,   v8.4h,   v30.4h      // b + a * src
        umlal2          v10.4s,  v8.8h,   v30.8h
        mov             v0.16b,  v1.16b
        rshrn           v16.4h,  v16.4s,  #9
        rshrn2          v16.8h,  v17.4s,  #9
        rshrn           v9.4h,   v9.4s,   #8
        rshrn2          v9.8h,   v10.4s,  #8
        subs            w5,  w5,  #8
        mov             v2.16b,  v3.16b
        st1             {v16.8h}, [x0],  #16
        st1             {v9.8h},  [x10], #16

        b.le            9f
        mov             v16.16b, v18.16b
        mov             v19.16b, v21.16b
        ld1             {v1.8h}, [x4], #16
        ld1             {v3.8h}, [x8], #16
        ld1             {v17.4s, v18.4s}, [x3], #32
        ld1             {v20.4s, v21.4s}, [x7], #32
        b               2b

9:
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void dav1d_sgr_finish_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                           const int32_t **a,
//                                           const int16_t **b,
//                                           const int w, const int h,
//                                           const int w1,
//                                           const int bitdepth_max);
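//
// Fused variant of sgr_finish_filter2_2rows: computes the same two rows of
// 5/6-weighted output, reading the pixels from dst and dst + stride, and
// blends them back in place with weight w1, i.e. per pixel (illustrative
// sketch only, t being the filtered value):
//
//   u      = dst[x] << 4;
//   v      = (u << 7) + w1 * (t - u);
//   dst[x] = clip((v + (1 << 10)) >> 11);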
function sgr_finish_weighted2_\bpc\()bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x30]!
        str             d10,      [sp, #0x10]
        stp             d14, d15, [sp, #0x20]

        dup             v14.8h, w6
        dup             v15.8h, w7

        ldp             x2,  x7,  [x2]
        ldp             x3,  x8,  [x3]
        cmp             w5,  #1
        add             x1,  x0,  x1 // src + stride
        // if (h <= 1), set the pointer to the second row to any dummy buffer
        // we can clobber (x2 in this case)
        csel            x1,  x2,  x1,  le
        movi            v4.8h,  #5
        movi            v5.4s,  #5
        movi            v6.8h,  #6
        movi            v7.4s,  #6
1:
        ld1             {v0.8h, v1.8h}, [x3], #32
        ld1             {v2.8h, v3.8h}, [x8], #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x2], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48

2:
        ext             v24.16b, v0.16b,  v1.16b, #4  // +1-stride
        ext             v25.16b, v2.16b,  v3.16b, #4  // +1+stride
        ext             v22.16b, v0.16b,  v1.16b, #2  // -stride
        ext             v23.16b, v2.16b,  v3.16b, #2  // +stride
        add             v0.8h,   v0.8h,   v24.8h      // -1-stride, +1-stride
        add             v25.8h,  v2.8h,   v25.8h      // -1+stride, +1+stride
        add             v2.8h,   v22.8h,  v23.8h      // -stride, +stride
        add             v0.8h,   v0.8h,   v25.8h

        mul             v8.8h,   v25.8h,  v4.8h       // * 5
        mla             v8.8h,   v23.8h,  v6.8h       // * 6

        ext             v22.16b, v16.16b, v17.16b, #4 // -stride
        ext             v23.16b, v17.16b, v18.16b, #4
        ext             v24.16b, v19.16b, v20.16b, #4 // +stride
        ext             v25.16b, v20.16b, v21.16b, #4
        ext             v26.16b, v16.16b, v17.16b, #8 // +1-stride
        ext             v27.16b, v17.16b, v18.16b, #8
        ext             v28.16b, v19.16b, v20.16b, #8 // +1+stride
        ext             v29.16b, v20.16b, v21.16b, #8
        mul             v0.8h,   v0.8h,   v4.8h       // * 5
        mla             v0.8h,   v2.8h,   v6.8h       // * 6
.if \bpc == 8
        ld1             {v31.8b}, [x0]
        ld1             {v30.8b}, [x1]
.else
        ld1             {v31.8h}, [x0]
        ld1             {v30.8h}, [x1]
.endif
        add             v16.4s,  v16.4s,  v26.4s      // -1-stride, +1-stride
        add             v17.4s,  v17.4s,  v27.4s
        add             v19.4s,  v19.4s,  v28.4s      // -1+stride, +1+stride
        add             v20.4s,  v20.4s,  v29.4s
        add             v16.4s,  v16.4s,  v19.4s
        add             v17.4s,  v17.4s,  v20.4s

        mul             v9.4s,   v19.4s,  v5.4s       // * 5
        mla             v9.4s,   v24.4s,  v7.4s       // * 6
        mul             v10.4s,  v20.4s,  v5.4s       // * 5
        mla             v10.4s,  v25.4s,  v7.4s       // * 6

        add             v22.4s,  v22.4s,  v24.4s      // -stride, +stride
        add             v23.4s,  v23.4s,  v25.4s
        // This is, surprisingly, faster than other variants where the
        // mul+mla pairs are further apart, on Cortex A53.
        mul             v16.4s,  v16.4s,  v5.4s       // * 5
        mla             v16.4s,  v22.4s,  v7.4s       // * 6
        mul             v17.4s,  v17.4s,  v5.4s       // * 5
        mla             v17.4s,  v23.4s,  v7.4s       // * 6

.if \bpc == 8
        uxtl            v31.8h,  v31.8b
        uxtl            v30.8h,  v30.8b
.endif
        umlal           v16.4s,  v0.4h,   v31.4h      // b + a * src
        umlal2          v17.4s,  v0.8h,   v31.8h
        umlal           v9.4s,   v8.4h,   v30.4h      // b + a * src
        umlal2          v10.4s,  v8.8h,   v30.8h
        mov             v0.16b,  v1.16b
        rshrn           v16.4h,  v16.4s,  #9
        rshrn2          v16.8h,  v17.4s,  #9
        rshrn           v9.4h,   v9.4s,   #8
        rshrn2          v9.8h,   v10.4s,  #8

        subs            w4,  w4,  #8

        // weighted1
        shl             v31.8h,  v31.8h,  #4     // u
        shl             v30.8h,  v30.8h,  #4
        mov             v2.16b,  v3.16b

        sub             v16.8h,  v16.8h,  v31.8h // t1 - u
        sub             v9.8h,   v9.8h,   v30.8h
        ld1             {v1.8h}, [x3], #16
        ushll           v22.4s,  v31.4h,  #7     // u << 7
        ushll2          v23.4s,  v31.8h,  #7
        ushll           v24.4s,  v30.4h,  #7
        ushll2          v25.4s,  v30.8h,  #7
        ld1             {v3.8h}, [x8], #16
        smlal           v22.4s,  v16.4h,  v14.4h // v
        smlal2          v23.4s,  v16.8h,  v14.8h
        mov             v16.16b, v18.16b
        smlal           v24.4s,  v9.4h,   v14.4h
        smlal2          v25.4s,  v9.8h,   v14.8h
        mov             v19.16b, v21.16b
.if \bpc == 8
        rshrn           v22.4h,  v22.4s,  #11
        rshrn2          v22.8h,  v23.4s,  #11
        rshrn           v23.4h,  v24.4s,  #11
        rshrn2          v23.8h,  v25.4s,  #11
        sqxtun          v22.8b,  v22.8h
        sqxtun          v23.8b,  v23.8h
        st1             {v22.8b}, [x0], #8
        st1             {v23.8b}, [x1], #8
.else
        sqrshrun        v22.4h,  v22.4s,  #11
        sqrshrun2       v22.8h,  v23.4s,  #11
        sqrshrun        v23.4h,  v24.4s,  #11
        sqrshrun2       v23.8h,  v25.4s,  #11
        umin            v22.8h,  v22.8h,  v15.8h
        umin            v23.8h,  v23.8h,  v15.8h
        st1             {v22.8h}, [x0], #16
        st1             {v23.8h}, [x1], #16
.endif

        b.le            3f
        ld1             {v17.4s, v18.4s}, [x2], #32
        ld1             {v20.4s, v21.4s}, [x7], #32
        b               2b

3:
        ldp             d14, d15, [sp, #0x20]
        ldr             d10,      [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x30
        ret
endfunc

// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                    const pixel *src, const ptrdiff_t src_stride,
//                                    const int16_t *t1, const int16_t *t2,
//                                    const int w, const int h,
//                                    const int16_t wt[2], const int bitdepth_max);
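//
// Blends the two intermediate filter outputs t1 and t2 (each with stride
// FILTER_OUT_STRIDE) into dst, two rows per iteration plus an optional single
// trailing row. Per pixel, approximately (illustrative sketch only):
//
//   u      = src[x] << 4;
//   v      = (u << 7) + wt[0] * (t1[x] - u) + wt[1] * (t2[x] - u);
//   dst[x] = clip((v + (1 << 10)) >> 11);   // to [0, 255] or [0, bitdepth_max]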
function sgr_weighted2_\bpc\()bpc_neon, export=1
.if \bpc == 8
        ldr             x8,  [sp]
.else
        ldp             x8,  x9,  [sp]
.endif
        cmp             w7,  #2
        add             x10, x0,  x1
        add             x11, x2,  x3
        add             x12, x4,  #2*FILTER_OUT_STRIDE
        add             x13, x5,  #2*FILTER_OUT_STRIDE
        ld2r            {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
.if \bpc == 16
        dup             v29.8h,  w9
.endif
        mov             x8,  #4*FILTER_OUT_STRIDE
        lsl             x1,  x1,  #1
        lsl             x3,  x3,  #1
        add             x9,  x6,  #7
        bic             x9,  x9,  #7 // Aligned width
.if \bpc == 8
        sub             x1,  x1,  x9
        sub             x3,  x3,  x9
.else
        sub             x1,  x1,  x9, lsl #1
        sub             x3,  x3,  x9, lsl #1
.endif
        sub             x8,  x8,  x9, lsl #1
        mov             w9,  w6
        b.lt            2f
1:
.if \bpc == 8
        ld1             {v0.8b},  [x2],  #8
        ld1             {v16.8b}, [x11], #8
.else
        ld1             {v0.8h},  [x2],  #16
        ld1             {v16.8h}, [x11], #16
.endif
        ld1             {v1.8h},  [x4],  #16
        ld1             {v17.8h}, [x12], #16
        ld1             {v2.8h},  [x5],  #16
        ld1             {v18.8h}, [x13], #16
        subs            w6,  w6,  #8
.if \bpc == 8
        ushll           v0.8h,  v0.8b,  #4     // u
        ushll           v16.8h, v16.8b, #4     // u
.else
        shl             v0.8h,  v0.8h,  #4     // u
        shl             v16.8h, v16.8h, #4     // u
.endif
        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u
        sub             v2.8h,  v2.8h,  v0.8h  // t2 - u
        sub             v17.8h, v17.8h, v16.8h // t1 - u
        sub             v18.8h, v18.8h, v16.8h // t2 - u
        ushll           v3.4s,  v0.4h,  #7     // u << 7
        ushll2          v4.4s,  v0.8h,  #7     // u << 7
        ushll           v19.4s, v16.4h, #7     // u << 7
        ushll2          v20.4s, v16.8h, #7     // u << 7
        smlal           v3.4s,  v1.4h,  v30.4h // wt[0] * (t1 - u)
        smlal           v3.4s,  v2.4h,  v31.4h // wt[1] * (t2 - u)
        smlal2          v4.4s,  v1.8h,  v30.8h // wt[0] * (t1 - u)
        smlal2          v4.4s,  v2.8h,  v31.8h // wt[1] * (t2 - u)
        smlal           v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
        smlal           v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
        smlal2          v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
        smlal2          v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
.if \bpc == 8
        rshrn           v3.4h,  v3.4s,  #11
        rshrn2          v3.8h,  v4.4s,  #11
        rshrn           v19.4h, v19.4s, #11
        rshrn2          v19.8h, v20.4s, #11
        sqxtun          v3.8b,  v3.8h
        sqxtun          v19.8b, v19.8h
        st1             {v3.8b},  [x0],  #8
        st1             {v19.8b}, [x10], #8
.else
        sqrshrun        v3.4h,  v3.4s,  #11
        sqrshrun2       v3.8h,  v4.4s,  #11
        sqrshrun        v19.4h, v19.4s, #11
        sqrshrun2       v19.8h, v20.4s, #11
        umin            v3.8h,  v3.8h,  v29.8h
        umin            v19.8h, v19.8h, v29.8h
        st1             {v3.8h},  [x0],  #16
        st1             {v19.8h}, [x10], #16
.endif
        b.gt            1b

        subs            w7,  w7,  #2
        cmp             w7,  #1
        b.lt            0f
        mov             w6,  w9
        add             x0,  x0,  x1
        add             x10, x10, x1
        add             x2,  x2,  x3
        add             x11, x11, x3
        add             x4,  x4,  x8
        add             x12, x12, x8
        add             x5,  x5,  x8
        add             x13, x13, x8
        b.eq            2f
        b               1b

2:
.if \bpc == 8
        ld1             {v0.8b}, [x2], #8
.else
        ld1             {v0.8h}, [x2], #16
.endif
        ld1             {v1.8h}, [x4], #16
        ld1             {v2.8h}, [x5], #16
        subs            w6,  w6,  #8
.if \bpc == 8
        ushll           v0.8h,  v0.8b,  #4     // u
.else
        shl             v0.8h,  v0.8h,  #4     // u
.endif
        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u
        sub             v2.8h,  v2.8h,  v0.8h  // t2 - u
        ushll           v3.4s,  v0.4h,  #7     // u << 7
        ushll2          v4.4s,  v0.8h,  #7     // u << 7
        smlal           v3.4s,  v1.4h,  v30.4h // wt[0] * (t1 - u)
        smlal           v3.4s,  v2.4h,  v31.4h // wt[1] * (t2 - u)
        smlal2          v4.4s,  v1.8h,  v30.8h // wt[0] * (t1 - u)
        smlal2          v4.4s,  v2.8h,  v31.8h // wt[1] * (t2 - u)
.if \bpc == 8
        rshrn           v3.4h,  v3.4s,  #11
        rshrn2          v3.8h,  v4.4s,  #11
        sqxtun          v3.8b,  v3.8h
        st1             {v3.8b}, [x0], #8
.else
        sqrshrun        v3.4h,  v3.4s,  #11
        sqrshrun2       v3.8h,  v4.4s,  #11
        umin            v3.8h,  v3.8h,  v29.8h
        st1             {v3.8h}, [x0], #16
.endif
        b.gt            2b
0:
        ret
endfunc
.endm