/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
//                          int bx4, int bw4, int bh4)
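//
// For reference, a minimal C sketch of the scalar behaviour this routine
// vectorizes (assuming dav1d's 12-byte refmvs_block type from src/refmvs.h;
// illustrative, not necessarily the exact upstream C):
//
//     static void splat_mv_c(refmvs_block **rr, const refmvs_block *rmv,
//                            int bx4, int bw4, int bh4)
//     {
//         do {
//             refmvs_block *row = *rr++ + bx4;
//             for (int x = 0; x < bw4; x++)
//                 row[x] = *rmv; // copy the 12-byte block bw4 times per row
//         } while (--bh4);
//     }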

function splat_mv_neon, export=1
        push            {r4, lr}
        vld1.8          {q3},  [r1]
        ldr             r4,  [sp, #8]
        clz             r3,  r3
        adr             lr,  L(splat_tbl)
        sub             r3,  r3,  #26
        vext.8          q2,  q3,  q3,  #12
        ldr             r3,  [lr, r3, lsl #2]
        add             r2,  r2,  r2,  lsl #1
        vext.8          q0,  q2,  q3,  #4
        add             r3,  lr,  r3
        vext.8          q1,  q2,  q3,  #8
        lsl             r2,  r2,  #2
        vext.8          q2,  q2,  q3,  #12
        vmov            q3,  q0
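        // q0, q1 and q2 now hold the 12-byte block pattern rotated so that
        // storing q0..q2 back to back writes the pattern exactly 4 times
        // (48 bytes); q3 (= q0) lets the q2,q3 / q1,q2 store pairs below
        // continue the same cycle.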
1:
        ldr             r1,  [r0],  #4
        subs            r4,  r4,  #1
        add             r1,  r1,  r2
        bx              r3

        .align 2
L(splat_tbl):
        .word 320f - L(splat_tbl) + CONFIG_THUMB
        .word 160f - L(splat_tbl) + CONFIG_THUMB
        .word 80f  - L(splat_tbl) + CONFIG_THUMB
        .word 40f  - L(splat_tbl) + CONFIG_THUMB
        .word 20f  - L(splat_tbl) + CONFIG_THUMB
        .word 10f  - L(splat_tbl) + CONFIG_THUMB

10:
        vst1.8          {d0}, [r1]
        vstr            s2,  [r1, #8]
        bgt             1b
        pop             {r4, pc}
20:
        vst1.8          {q0}, [r1]
        vstr            d2,  [r1, #16]
        bgt             1b
        pop             {r4, pc}
40:
        vst1.8          {q0, q1}, [r1]!
        vst1.8          {q2},     [r1]
        bgt             1b
        pop             {r4, pc}
320:
        vst1.8          {q0, q1}, [r1]!
        vst1.8          {q2, q3}, [r1]!
        vst1.8          {q1, q2}, [r1]!
        vst1.8          {q0, q1}, [r1]!
        vst1.8          {q2, q3}, [r1]!
        vst1.8          {q1, q2}, [r1]!
160:
        vst1.8          {q0, q1}, [r1]!
        vst1.8          {q2, q3}, [r1]!
        vst1.8          {q1, q2}, [r1]!
80:
        vst1.8          {q0, q1}, [r1]!
        vst1.8          {q2, q3}, [r1]!
        vst1.8          {q1, q2}, [r1]
        bgt             1b
        pop             {r4, pc}
endfunc

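// vtbl permutation tables packing a candidate into the repeated 5-byte
// refmvs_temporal_block pattern {mv, ref}. Row 0 (all 255) yields zeros
// (no valid candidate), row 1 picks mv[0]/ref[0], and rows 2 and 3 both
// pick mv[1]/ref[1], since mv[1] is preferred when both candidates qualify.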
const mv_tbls, align=4
        .byte           255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
        .byte           0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst

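// Weights for the two per-candidate validity masks; their sum gives the
// 0-3 row index into mv_tbls above.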
const mask_mult, align=4
        .byte           1, 2, 1, 2, 0, 0, 0, 0
endconst

// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
//                           refmvs_block **rr, const uint8_t *ref_sign,
//                           int col_end8, int row_end8,
//                           int col_start8, int row_start8)
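//
// A condensed C sketch of the scalar logic, for orientation only (names
// like dav1d_block_dimensions and the refmvs_* types are dav1d's; the
// candidate selection is simplified here):
//
//     for (int y = row_start8; y < row_end8; y++) {
//         const refmvs_block *b = rr[(y & 15) * 2];
//         for (int x = col_start8; x < col_end8;) {
//             const refmvs_block *cand_b = &b[x * 2 + 1];
//             int bw8 = (dav1d_block_dimensions[cand_b->bs][0] + 1) >> 1;
//             // Prefer mv[1], else mv[0], if the candidate's ref has a
//             // nonzero ref_sign entry and |mv.x|, |mv.y| < 4096;
//             // otherwise store a zero entry.
//             for (int n = 0; n < bw8; n++, x++)
//                 rp[x] = chosen; // 5-byte {mv, ref} temporal block
//         }
//         rp += stride;
//     }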
function save_tmvs_neon, export=1
        push            {r4-r11,lr}
        ldrd            r4,  r5,  [sp, #36]
        ldrd            r6,  r7,  [sp, #44]

        vmov.i8         d30, #0
        vld1.8          {d31}, [r3]
        adr             r8,  L(save_tmvs_tbl)
        movrel_local    lr,  mask_mult
        movrel_local    r12, mv_tbls
        vld1.8          {d29}, [lr]
        vext.8          d31, d30, d31, #7         // [0, ref_sign]
        mov             r3,  #5
        mul             r1,  r1,  r3              // stride *= 5
        sub             r5,  r5,  r7              // h = row_end8 - row_start8
        lsl             r7,  r7,  #1              // row_start8 <<= 1
1:
        mov             r3,  #5
        mov             r11, #12*2
        and             r9,  r7,  #30             // (y & 15) * 2
        ldr             r9,  [r2, r9, lsl #2]     // b = rr[(y & 15) * 2]
        add             r9,  r9,  #12             // &b[... + 1]
        mla             r10, r4,  r11,  r9        // end_cand_b = &b[col_end8*2 + 1]
        mla             r9,  r6,  r11,  r9        // cand_b = &b[x*2 + 1]

        mla             r3,  r6,  r3,   r0        // &rp[x]

        push            {r2,r4,r6}

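        // The loop below handles up to two candidate blocks per iteration:
        // both are loaded and their validity cases computed in one SIMD
        // pass, then the per-size handlers (10:-160: below) are called to
        // store bw8 5-byte refmvs_temporal_block entries each.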
2:
        ldrb            r11, [r9, #10]            // cand_b->bs
        add             lr,  r9,  #8
        vld1.8          {d0, d1}, [r9]            // cand_b->mv
        add             r11, r8,  r11, lsl #3
        vld1.16         {d2[]},  [lr]             // cand_b->ref
        ldrh            lr,  [r11]                // bw8*12
        mov             r2,  r8
        add             r9,  r9,  lr,  lsl #1     // cand_b += bw8*2
        cmp             r9,  r10
        vmov            d4,  d0
        bge             3f

        ldrb            r2,  [r9, #10]            // cand_b->bs
        add             lr,  r9,  #8
        vld1.8          {d6, d7}, [r9]            // cand_b->mv
        add             r2,  r8,  r2,  lsl #3
        vld1.16         {d2[1]},  [lr]            // cand_b->ref
        ldrh            lr,  [r2]                 // bw8*12
        add             r9,  r9,  lr,  lsl #1     // cand_b += bw8*2
        vmov            d5,  d6

3:
        vabs.s16        q2,  q2                   // abs(mv[].xy)
        vtbl.8          d2,  {d31}, d2            // ref_sign[ref]
        vshr.u16        q2,  q2,  #12             // abs(mv[].xy) >> 12
        vmull.u8        q1,  d2,  d29             // ref_sign[ref] * {1, 2}
        vceq.i32        q2,  q2,  #0              // abs(mv[].xy) < 4096
        vmovn.i32       d4,  q2                   // abs() condition to 16 bit
        vand            d2,  d2,  d4              // h[0-3] contains conditions for mv[0-1]
        vpadd.i16       d2,  d2,  d2              // Combine condition for [1] and [0]
        vmov.u16        r4,  d2[0]                // Extract case for first block
        vmov.u16        r6,  d2[1]
        ldr             r11, [r11, #4]            // Fetch jump table entry
        ldr             r2,  [r2,  #4]
        add             r4,  r12,  r4,  lsl #4
        add             r6,  r12,  r6,  lsl #4
        vld1.8          {d2, d3}, [r4]            // Load permutation table based on case
        vld1.8          {d4, d5}, [r6]
        add             r11, r8,  r11             // Find jump table target
        add             r2,  r8,  r2
        vtbl.8          d16, {d0, d1}, d2         // Permute cand_b to output refmvs_temporal_block
        vtbl.8          d17, {d0, d1}, d3
        vtbl.8          d18, {d6, d7}, d4
        vtbl.8          d19, {d6, d7}, d5
        vmov            q0,  q8

        // q1 follows on q0 (q8), with another 3 full repetitions of the pattern.
        vext.8          q1,  q8,  q8,  #1
        vext.8          q10, q9,  q9,  #1
        // q2 ends with 3 complete repetitions of the pattern.
        vext.8          q2,  q8,  q1,  #4
        vext.8          q11, q9,  q10, #4

        blx             r11
        bge             4f  // if (cand_b >= end)
        vmov            q0,  q9
        vmov            q1,  q10
        vmov            q2,  q11
        cmp             r9,  r10
        blx             r2
        blt             2b  // if (cand_b < end)

4:
        pop             {r2,r4,r6}

        subs            r5,  r5,  #1              // h--
        add             r7,  r7,  #2              // y += 2
        add             r0,  r0,  r1              // rp += stride
        bgt             1b

        pop             {r4-r11,pc}

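        // Two words per block size (indexed by cand_b->bs): bw8*12, the
        // byte size of bw8 refmvs_block entries (doubled at the load site
        // to step cand_b by bw8*2 blocks), and the offset of the matching
        // store handler below.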
        .align 2
L(save_tmvs_tbl):
        .word 16 * 12
        .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 16 * 12
        .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 8 * 12
        .word 80f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 8 * 12
        .word 80f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 8 * 12
        .word 80f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 8 * 12
        .word 80f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 4 * 12
        .word 40f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 4 * 12
        .word 40f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 4 * 12
        .word 40f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 4 * 12
        .word 40f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 2 * 12
        .word 20f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 2 * 12
        .word 20f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 2 * 12
        .word 20f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 2 * 12
        .word 20f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 2 * 12
        .word 20f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 1 * 12
        .word 10f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 1 * 12
        .word 10f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 1 * 12
        .word 10f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 1 * 12
        .word 10f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 1 * 12
        .word 10f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 1 * 12
        .word 10f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 1 * 12
        .word 10f  - L(save_tmvs_tbl) + CONFIG_THUMB

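        // Store handlers: each writes bw8 refmvs_temporal_block entries
        // (5 bytes each) at r3, then returns; none of them touch the
        // flags, so the caller's cmp result survives the blx. The wider
        // variants use overlapping 16-byte stores rather than exact-width
        // ones.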
10:
        add             r4,  r3,  #4
        vst1.32         {d0[0]}, [r3]
        vst1.8          {d0[4]}, [r4]
        add             r3,  r3,  #5
        bx              lr
20:
        add             r4,  r3,  #8
        vst1.8          {d0}, [r3]
        vst1.16         {d1[0]}, [r4]
        add             r3,  r3,  #2*5
        bx              lr
40:
        add             r4,  r3,  #16
        vst1.8          {q0}, [r3]
        vst1.32         {d2[0]}, [r4]
        add             r3,  r3,  #4*5
        bx              lr
80:
        add             r4,  r3,  #(8*5-16)
        // This writes 6 full entries plus 2 extra bytes
        vst1.8          {q0, q1}, [r3]
        // Write the last few, overlapping with the first write.
        vst1.8          {q2}, [r4]
        add             r3,  r3,  #8*5
        bx              lr
160:
        add             r4,  r3,  #6*5
        add             r6,  r3,  #12*5
        // This writes 6 full entries plus 2 extra bytes
        vst1.8          {q0, q1}, [r3]
        // Write another 6 full entries, slightly overlapping with the first set
        vst1.8          {q0, q1}, [r4]
        add             r4,  r3,  #(16*5-16)
        // Write 8 bytes (one full entry) after the first 12
        vst1.8          {d0}, [r6]
        // Write the last 3 entries
        vst1.8          {q2}, [r4]
        add             r3,  r3,  #16*5
        bx              lr
endfunc