/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
//                          int bx4, int bw4, int bh4)
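//
// A rough C sketch of the behavior implemented below (an illustration for
// readers, not dav1d's reference implementation verbatim; it assumes the
// 12-byte refmvs_block layout implied by the 12-byte store pattern used in
// this function):
//
//     for (int y = 0; y < bh4; y++) {
//         refmvs_block *r = rr[y] + bx4;
//         for (int x = 0; x < bw4; x++)
//             r[x] = *rmv;              // splat one 12-byte descriptor
//     }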

function splat_mv_neon, export=1
        ld1             {v3.16b},  [x1]           // load rmv (16 bytes; only 12 are used)
        clz             w3,  w3
        movrel          x5,  splat_tbl
        sub             w3,  w3,  #26             // clz(bw4) - 26: 0..5 for bw4 = 32..1
        ext             v2.16b,  v3.16b,  v3.16b,  #12
        ldrsw           x3,  [x5, w3, uxtw #2]
        add             w2,  w2,  w2,  lsl #1     // bx4 *= 3
        ext             v0.16b,  v2.16b,  v3.16b,  #4
        add             x3,  x5,  x3
        ext             v1.16b,  v2.16b,  v3.16b,  #8
        lsl             w2,  w2,  #2              // bx4 *= 12: byte offset of block bx4
        ext             v2.16b,  v2.16b,  v3.16b,  #12
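        // v0..v2 now hold 48 bytes, i.e. the 12-byte block repeated four
        // times; w2 is the byte offset into each row and x3 points at the
        // store sequence matching this bw4.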
1:
        ldr             x1,  [x0],  #8            // r = *rr++
        subs            w4,  w4,  #1              // bh4--
        add             x1,  x1,  x2              // r += bx4 byte offset
        br              x3

10:     // bw4 == 1
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8b}, [x1]
        str             s2,  [x1, #8]
        b.gt            1b
        ret
20:     // bw4 == 2
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b}, [x1]
        str             d1,  [x1, #16]
        b.gt            1b
        ret
320:    // bw4 == 32; falls through into the narrower cases
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
160:    // bw4 == 16
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
80:     // bw4 == 8
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
40:     // bw4 == 4
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1]
        b.gt            1b
        ret
endfunc

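// Branch targets for bw4 == 32, 16, 8, 4, 2, 1, indexed by clz(bw4) - 26.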
jumptable splat_tbl
        .word 320b  - splat_tbl
        .word 160b  - splat_tbl
        .word 80b   - splat_tbl
        .word 40b   - splat_tbl
        .word 20b   - splat_tbl
        .word 10b   - splat_tbl
endjumptable

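// tbl permutation masks for save_tmvs_neon below, indexed by the per-block
// validity case in {0, 1, 2, 3} (ref_sign[ref[0]]*1 + ref_sign[ref[1]]*2,
// masked by the |mv| < 4096 checks). Each row repeats a 5-byte {mv, ref}
// byte selection across 16 bytes:
//   case 0:    all indices out of range, so tbl yields zeros (ref == 0, invalid)
//   case 1:    select mv[0]/ref[0]
//   case 2, 3: select mv[1]/ref[1] (mv[1] is preferred when both are valid)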
const mv_tbls, align=4
        .byte           255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
        .byte           0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst

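// Multipliers that fold the two ref_sign flags of a block into the 2-bit
// case index described above.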
const mask_mult, align=4
        .byte           1, 2, 1, 2, 0, 0, 0, 0
endconst

// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
//                           refmvs_block **rr, const uint8_t *ref_sign,
//                           int col_end8, int row_end8,
//                           int col_start8, int row_start8)
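//
// A rough C sketch of the logic implemented below (an illustration, not
// dav1d's reference implementation verbatim; the field layout is the one
// implied by the loads/stores: a 12-byte refmvs_block holds two 4-byte mvs,
// two ref bytes and a block size, and a 5-byte refmvs_temporal_block holds
// one mv and one ref byte; usable() and bw8_from_bs() are hypothetical
// helpers standing in for the vectorized tests and table lookup):
//
//     for (int y = row_start8; y < row_end8; y++) {
//         const refmvs_block *b = rr[(y & 15) * 2];
//         for (int x = col_start8; x < col_end8;) {
//             const refmvs_block *cand_b = &b[x*2 + 1];
//             int bw8 = bw8_from_bs(cand_b->bs); // looked up in save_tmvs_tbl
//             // usable(i): ref_sign flags cand_b->ref[i], and both
//             // components of cand_b->mv[i] are < 4096 in magnitude
//             if (usable(1))      rp[x..x+bw8) = { cand_b->mv[1], cand_b->ref[1] };
//             else if (usable(0)) rp[x..x+bw8) = { cand_b->mv[0], cand_b->ref[0] };
//             else                rp[x..x+bw8).ref = 0; // invalid
//             x += bw8;
//         }
//         rp += stride;
//     }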
function save_tmvs_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        movi            v30.8b,  #0
        ld1             {v31.8b}, [x3]
        movrel          x8,  save_tmvs_tbl
        movrel          x16, mask_mult
        movrel          x13, mv_tbls
        ld1             {v29.8b}, [x16]
        ext             v31.8b,  v30.8b,  v31.8b,  #7 // [0, ref_sign]
        mov             w15, #5                   // sizeof(refmvs_temporal_block)
        mov             w14, #12*2                // 2 * sizeof(refmvs_block)
        sxtw            x4,  w4
        sxtw            x6,  w6
        mul             w1,  w1,  w15             // stride *= 5
        sub             w5,  w5,  w7              // h = row_end8 - row_start8
        lsl             w7,  w7,  #1              // row_start8 <<= 1
1:
        mov             w15, #5                   // reload #5 (w15 is clobbered below)
        and             w9,  w7,  #30             // (y & 15) * 2
        ldr             x9,  [x2, w9, uxtw #3]    // b = rr[(y & 15) * 2]
        add             x9,  x9,  #12             // &b[... + 1]
        madd            x10, x4,  x14,  x9        // end_cand_b = &b[col_end8*2 + 1]
        madd            x9,  x6,  x14,  x9        // cand_b = &b[x*2 + 1]

        madd            x3,  x6,  x15,  x0        // &rp[x]

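        // The inner loop consumes up to two candidate blocks per iteration,
        // so the mv/ref validity tests below are done for both at once.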
2:
        ldrb            w11, [x9, #10]            // cand_b->bs
        ld1             {v0.16b}, [x9]            // cand_b->mv
        add             x11, x8,  w11, uxtw #3
        ldr             h1,  [x9, #8]             // cand_b->ref
        ldr             w12, [x11]                // bw8
        mov             x15, x8                   // dummy table pointer if there's no second block
        add             x9,  x9,  w12, uxtw #1    // cand_b += bw8*2
        cmp             x9,  x10
        mov             v2.8b,   v0.8b
        b.ge            3f

        ldrb            w15, [x9, #10]            // cand_b->bs
        add             x16, x9,  #8
        ld1             {v4.16b}, [x9]            // cand_b->mv
        add             x15, x8,  w15, uxtw #3
        ld1             {v1.h}[1], [x16]          // cand_b->ref
        ldr             w12, [x15]                // bw8
        add             x9,  x9,  w12, uxtw #1    // cand_b += bw8*2
        trn1            v2.2d,   v0.2d,   v4.2d

3:
        abs             v2.8h,   v2.8h            // abs(mv[].xy)
        tbl             v1.8b, {v31.16b}, v1.8b   // ref_sign[ref]
        ushr            v2.8h,   v2.8h,   #12     // abs(mv[].xy) >> 12
        umull           v1.8h,   v1.8b,   v29.8b  // ref_sign[ref] * {1, 2}
        cmeq            v2.4s,   v2.4s,   #0      // abs(mv[].xy) < 4096
        xtn             v2.4h,   v2.4s            // abs() condition to 16 bit
        and             v1.8b,   v1.8b,   v2.8b   // h[0-3] contains conditions for mv[0-1]
        addp            v1.4h,   v1.4h,   v1.4h   // Combine condition for [1] and [0]
        umov            w16, v1.h[0]              // Extract case for first block
        umov            w17, v1.h[1]
        ldrsw           x11, [x11, #4]            // Fetch jump table entry
        ldrsw           x15, [x15, #4]
        ldr             q1, [x13, w16, uxtw #4]   // Load permutation table based on the case
        ldr             q5, [x13, w17, uxtw #4]
        add             x11, x8,  x11             // Find jump table target
        add             x15, x8,  x15
        tbl             v0.16b, {v0.16b}, v1.16b  // Permute cand_b to output refmvs_temporal_block
        tbl             v4.16b, {v4.16b}, v5.16b
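        // w16/w17 now hold the validity case (0-3) for each candidate, and
        // v0/v4 the selected 5-byte {mv, ref} pattern repeated across the
        // vector.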

        // v1 follows on v0, with another 3 full repetitions of the pattern.
        ext             v1.16b,  v0.16b,  v0.16b,  #1
        ext             v5.16b,  v4.16b,  v4.16b,  #1
        // v2 ends with 3 complete repetitions of the pattern.
        ext             v2.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v4.16b,  v5.16b,  #4

        blr             x11
        b.ge            4f  // if (cand_b >= end)
        mov             v0.16b,  v4.16b
        mov             v1.16b,  v5.16b
        mov             v2.16b,  v6.16b
        cmp             x9,  x10
        blr             x15
        b.lt            2b  // if (cand_b < end)

4:
        subs            w5,  w5,  #1              // h--
        add             w7,  w7,  #2              // y += 2
        add             x0,  x0,  x1              // rp += stride
        b.gt            1b

        ldp             x29, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret

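        // Store handlers, reached via blr with the target chosen per block
        // size from save_tmvs_tbl. Each writes bw8 5-byte entries starting
        // at x3 and advances x3 past them; the wider ones use overlapping
        // stores since 16-byte vectors don't divide evenly into 5-byte
        // entries.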
10:     // bw8 == 1
        AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #4
        st1             {v0.s}[0], [x3]
        st1             {v0.b}[4], [x16]
        add             x3,  x3,  #5
        ret
20:     // bw8 == 2
        AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #8
        st1             {v0.d}[0], [x3]
        st1             {v0.h}[4], [x16]
        add             x3,  x3,  #2*5
        ret
40:     // bw8 == 4
        AARCH64_VALID_CALL_TARGET
        st1             {v0.16b}, [x3]
        str             s1, [x3, #16]
        add             x3,  x3,  #4*5
        ret
80:     // bw8 == 8
        AARCH64_VALID_CALL_TARGET
        // This writes 6 full entries plus 2 extra bytes
        st1             {v0.16b, v1.16b}, [x3]
        // Write the last few, overlapping with the first write.
        stur            q2, [x3, #(8*5-16)]
        add             x3,  x3,  #8*5
        ret
160:    // bw8 == 16
        AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #6*5
        add             x17, x3,  #12*5
        // This writes 6 full entries plus 2 extra bytes
        st1             {v0.16b, v1.16b}, [x3]
        // Write another 6 full entries, slightly overlapping with the first set
        st1             {v0.16b, v1.16b}, [x16]
        // Write 8 bytes (one full entry, plus the start of the next) after the first 12 entries
        st1             {v0.8b}, [x17]
        // Write the last 3 entries
        str             q2, [x3, #(16*5-16)]
        add             x3,  x3,  #16*5
        ret
endfunc

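// Table of pairs per block size, indexed by cand_b->bs * 8 above:
//   .word bw8 * sizeof(refmvs_block) (i.e. bw8 * 12)
//   .word offset of the matching store handler
// Block sizes are listed from largest to smallest.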
jumptable save_tmvs_tbl
        .word 16 * 12
        .word 160b - save_tmvs_tbl
        .word 16 * 12
        .word 160b - save_tmvs_tbl
        .word 8 * 12
        .word 80b  - save_tmvs_tbl
        .word 8 * 12
        .word 80b  - save_tmvs_tbl
        .word 8 * 12
        .word 80b  - save_tmvs_tbl
        .word 8 * 12
        .word 80b  - save_tmvs_tbl
        .word 4 * 12
        .word 40b  - save_tmvs_tbl
        .word 4 * 12
        .word 40b  - save_tmvs_tbl
        .word 4 * 12
        .word 40b  - save_tmvs_tbl
        .word 4 * 12
        .word 40b  - save_tmvs_tbl
        .word 2 * 12
        .word 20b  - save_tmvs_tbl
        .word 2 * 12
        .word 20b  - save_tmvs_tbl
        .word 2 * 12
        .word 20b  - save_tmvs_tbl
        .word 2 * 12
        .word 20b  - save_tmvs_tbl
        .word 2 * 12
        .word 20b  - save_tmvs_tbl
        .word 1 * 12
        .word 10b  - save_tmvs_tbl
        .word 1 * 12
        .word 10b  - save_tmvs_tbl
        .word 1 * 12
        .word 10b  - save_tmvs_tbl
        .word 1 * 12
        .word 10b  - save_tmvs_tbl
        .word 1 * 12
        .word 10b  - save_tmvs_tbl
        .word 1 * 12
        .word 10b  - save_tmvs_tbl
        .word 1 * 12
        .word 10b  - save_tmvs_tbl
endjumptable
295