/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
//                          int bx4, int bw4, int bh4)

function splat_mv_neon, export=1
        ld1             {v3.16b}, [x1]
        clz             w3,  w3
        movrel          x5,  splat_tbl
        sub             w3,  w3,  #26
        ext             v2.16b,  v3.16b,  v3.16b,  #12
        ldrsw           x3,  [x5, w3, uxtw #2]
        add             w2,  w2,  w2,  lsl #1
        ext             v0.16b,  v2.16b,  v3.16b,  #4
        add             x3,  x5,  x3
        ext             v1.16b,  v2.16b,  v3.16b,  #8
        lsl             w2,  w2,  #2
        ext             v2.16b,  v2.16b,  v3.16b,  #12
1:
        ldr             x1,  [x0], #8
        subs            w4,  w4,  #1
        add             x1,  x1,  x2
        br              x3

10:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8b}, [x1]
        str             s2,  [x1, #8]
        b.gt            1b
        ret
20:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b}, [x1]
        str             d1,  [x1, #16]
        b.gt            1b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
160:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
80:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
40:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1]
        b.gt            1b
        ret
endfunc

jumptable splat_tbl
        .word 320b - splat_tbl
        .word 160b - splat_tbl
        .word 80b - splat_tbl
        .word 40b - splat_tbl
        .word 20b - splat_tbl
        .word 10b - splat_tbl
endjumptable

const mv_tbls, align=4
        .byte           255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
        .byte           0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst

const mask_mult, align=4
        .byte           1, 2, 1, 2, 0, 0, 0, 0
endconst

// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
//                           refmvs_block **rr, const uint8_t *ref_sign,
//                           int col_end8, int row_end8,
//                           int col_start8, int row_start8)
function save_tmvs_neon, export=1
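        // Each refmvs_temporal_block written to rp is 5 bytes (a 4-byte mv
        // plus a 1-byte ref), hence the *5 scaling of the stride and of the
        // per-row rp offsets below. Each candidate refmvs_block in rr is
        // 12 bytes, and the inner loop handles up to two candidates at a time.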
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        movi            v30.8b,  #0
        ld1             {v31.8b}, [x3]
        movrel          x8,  save_tmvs_tbl
        movrel          x16, mask_mult
        movrel          x13, mv_tbls
        ld1             {v29.8b}, [x16]
        ext             v31.8b,  v30.8b,  v31.8b,  #7 // [0, ref_sign]
        mov             w15, #5
        mov             w14, #12*2
        sxtw            x4,  w4
        sxtw            x6,  w6
        mul             w1,  w1,  w15          // stride *= 5
        sub             w5,  w5,  w7           // h = row_end8 - row_start8
        lsl             w7,  w7,  #1           // row_start8 <<= 1
1:
        mov             w15, #5
        and             w9,  w7,  #30          // (y & 15) * 2
        ldr             x9,  [x2, w9, uxtw #3] // b = rr[(y & 15) * 2]
        add             x9,  x9,  #12          // &b[... + 1]
        madd            x10, x4,  x14, x9      // end_cand_b = &b[col_end8*2 + 1]
        madd            x9,  x6,  x14, x9      // cand_b = &b[x*2 + 1]

        madd            x3,  x6,  x15, x0      // &rp[x]

2:
        ldrb            w11, [x9, #10]         // cand_b->bs
        ld1             {v0.16b}, [x9]         // cand_b->mv
        add             x11, x8,  w11, uxtw #3
        ldr             h1,  [x9, #8]          // cand_b->ref
        ldr             w12, [x11]             // bw8
        mov             x15, x8
        add             x9,  x9,  w12, uxtw #1 // cand_b += bw8*2
        cmp             x9,  x10
        mov             v2.8b,  v0.8b
        b.ge            3f

        ldrb            w15, [x9, #10]         // cand_b->bs
        add             x16, x9,  #8
        ld1             {v4.16b}, [x9]         // cand_b->mv
        add             x15, x8,  w15, uxtw #3
        ld1             {v1.h}[1], [x16]       // cand_b->ref
        ldr             w12, [x15]             // bw8
        add             x9,  x9,  w12, uxtw #1 // cand_b += bw8*2
        trn1            v2.2d,  v0.2d,  v4.2d

3:
        abs             v2.8h,  v2.8h          // abs(mv[].xy)
        tbl             v1.8b, {v31.16b}, v1.8b // ref_sign[ref]
        ushr            v2.8h,  v2.8h,  #12    // abs(mv[].xy) >> 12
        umull           v1.8h,  v1.8b,  v29.8b // ref_sign[ref] * {1, 2}
        cmeq            v2.4s,  v2.4s,  #0     // abs(mv[].xy) < 4096
        xtn             v2.4h,  v2.4s          // abs() condition to 16 bit
        and             v1.8b,  v1.8b,  v2.8b  // h[0-3] contains conditions for mv[0-1]
        addp            v1.4h,  v1.4h,  v1.4h  // Combine condition for [1] and [0]
        umov            w16, v1.h[0]           // Extract case for first block
        umov            w17, v1.h[1]
        ldrsw           x11, [x11, #4]         // Fetch jump table entry
        ldrsw           x15, [x15, #4]
        ldr             q1,  [x13, w16, uxtw #4] // Load permutation table based on case
        ldr             q5,  [x13, w17, uxtw #4]
        add             x11, x8,  x11          // Find jump table target
        add             x15, x8,  x15
        tbl             v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block
        tbl             v4.16b, {v4.16b}, v5.16b

        // v1 follows on v0, with another 3 full repetitions of the pattern.
        ext             v1.16b,  v0.16b,  v0.16b,  #1
        ext             v5.16b,  v4.16b,  v4.16b,  #1
        // v2 ends with 3 complete repetitions of the pattern.
        ext             v2.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v4.16b,  v5.16b,  #4

        blr             x11
        b.ge            4f                     // if (cand_b >= end)
        mov             v0.16b,  v4.16b
        mov             v1.16b,  v5.16b
        mov             v2.16b,  v6.16b
        cmp             x9,  x10
        blr             x15
        b.lt            2b                     // if (cand_b < end)

4:
        subs            w5,  w5,  #1           // h--
        add             w7,  w7,  #2           // y += 2
        add             x0,  x0,  x1           // rp += stride
        b.gt            1b

        ldp             x29, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret

10:
        AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #4
        st1             {v0.s}[0], [x3]
        st1             {v0.b}[4], [x16]
        add             x3,  x3,  #5
        ret
20:
        AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #8
        st1             {v0.d}[0], [x3]
        st1             {v0.h}[4], [x16]
        add             x3,  x3,  #2*5
        ret
40:
        AARCH64_VALID_CALL_TARGET
        st1             {v0.16b}, [x3]
        str             s1,  [x3, #16]
        add             x3,  x3,  #4*5
        ret
80:
        AARCH64_VALID_CALL_TARGET
        // This writes 6 full entries plus 2 extra bytes
        st1             {v0.16b, v1.16b}, [x3]
        // Write the last few, overlapping with the first write.
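        // 8 entries * 5 bytes = 40 bytes; this 16-byte store covers bytes
        // 24-39, overlapping the 32 bytes stored above by 8 bytes.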
        stur            q2,  [x3, #(8*5-16)]
        add             x3,  x3,  #8*5
        ret
160:
        AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #6*5
        add             x17, x3,  #12*5
        // This writes 6 full entries plus 2 extra bytes
        st1             {v0.16b, v1.16b}, [x3]
        // Write another 6 full entries, slightly overlapping with the first set
        st1             {v0.16b, v1.16b}, [x16]
        // Write 8 bytes (one full entry) after the first 12
        st1             {v0.8b}, [x17]
        // Write the last 3 entries
        str             q2,  [x3, #(16*5-16)]
        add             x3,  x3,  #16*5
        ret
endfunc

jumptable save_tmvs_tbl
        .word 16 * 12
        .word 160b - save_tmvs_tbl
        .word 16 * 12
        .word 160b - save_tmvs_tbl
        .word 8 * 12
        .word 80b - save_tmvs_tbl
        .word 8 * 12
        .word 80b - save_tmvs_tbl
        .word 8 * 12
        .word 80b - save_tmvs_tbl
        .word 8 * 12
        .word 80b - save_tmvs_tbl
        .word 4 * 12
        .word 40b - save_tmvs_tbl
        .word 4 * 12
        .word 40b - save_tmvs_tbl
        .word 4 * 12
        .word 40b - save_tmvs_tbl
        .word 4 * 12
        .word 40b - save_tmvs_tbl
        .word 2 * 12
        .word 20b - save_tmvs_tbl
        .word 2 * 12
        .word 20b - save_tmvs_tbl
        .word 2 * 12
        .word 20b - save_tmvs_tbl
        .word 2 * 12
        .word 20b - save_tmvs_tbl
        .word 2 * 12
        .word 20b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
endjumptable
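// Each pair of words in save_tmvs_tbl is indexed by block size (cand_b->bs):
// the first word is the candidate width in bytes, bw8 * sizeof(refmvs_block)
// (12), which the loop doubles to advance cand_b; the second is the offset of
// the matching store handler relative to save_tmvs_tbl.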