/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
//                          int bx4, int bw4, int bh4)

function splat_mv_neon, export=1
        push            {r4, lr}
        vld1.8          {q3},  [r1]
        ldr             r4,  [sp, #8]
        clz             r3,  r3
        adr             lr,  L(splat_tbl)
        sub             r3,  r3,  #26              // jump table index = clz(bw4) - 26 (bw4 in {1,2,4,8,16,32})
        vext.8          q2,  q3,  q3,  #12
        ldr             r3,  [lr, r3, lsl #2]
        add             r2,  r2,  r2,  lsl #1      // bx4 *= 3
        vext.8          q0,  q2,  q3,  #4
        add             r3,  lr,  r3
        vext.8          q1,  q2,  q3,  #8
        lsl             r2,  r2,  #2               // bx4 *= 4; offset is now bx4*12 (12 bytes per refmvs_block)
        vext.8          q2,  q2,  q3,  #12
        vmov            q3,  q0                    // q0-q2 hold the 12-byte rmv pattern repeated 4 times; q3 == q0
1:
        ldr             r1,  [r0], #4
        subs            r4,  r4,  #1
        add             r1,  r1,  r2
        bx              r3

        .align 2
L(splat_tbl):
        .word 320f - L(splat_tbl) + CONFIG_THUMB
        .word 160f - L(splat_tbl) + CONFIG_THUMB
        .word 80f  - L(splat_tbl) + CONFIG_THUMB
        .word 40f  - L(splat_tbl) + CONFIG_THUMB
        .word 20f  - L(splat_tbl) + CONFIG_THUMB
        .word 10f  - L(splat_tbl) + CONFIG_THUMB

10:
        vst1.8          {d0},      [r1]
        vstr            s2,        [r1, #8]
        bgt             1b
        pop             {r4, pc}
20:
        vst1.8          {q0},      [r1]
        vstr            d2,        [r1, #16]
        bgt             1b
        pop             {r4, pc}
40:
        vst1.8          {q0, q1},  [r1]!
        vst1.8          {q2},      [r1]
        bgt             1b
        pop             {r4, pc}
320:
        vst1.8          {q0, q1},  [r1]!
        vst1.8          {q2, q3},  [r1]!
        vst1.8          {q1, q2},  [r1]!
        vst1.8          {q0, q1},  [r1]!
        vst1.8          {q2, q3},  [r1]!
        vst1.8          {q1, q2},  [r1]!
160:
        vst1.8          {q0, q1},  [r1]!
        vst1.8          {q2, q3},  [r1]!
        vst1.8          {q1, q2},  [r1]!
80:
        vst1.8          {q0, q1},  [r1]!
        vst1.8          {q2, q3},  [r1]!
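        // The pattern in q0-q3 repeats every 48 bytes (4 blocks of 12 bytes),
        // so the consecutive {q0, q1}/{q2, q3}/{q1, q2} stores above and below
        // write one contiguous run of identical refmvs_block entries.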
        vst1.8          {q1, q2},  [r1]
        bgt             1b
        pop             {r4, pc}
endfunc

const mv_tbls, align=4
        .byte           255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
        .byte           0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst

const mask_mult, align=4
        .byte           1, 2, 1, 2, 0, 0, 0, 0
endconst

// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
//                           refmvs_block **rr, const uint8_t *ref_sign,
//                           int col_end8, int row_end8,
//                           int col_start8, int row_start8)
function save_tmvs_neon, export=1
        push            {r4-r11, lr}
        ldrd            r4,  r5,  [sp, #36]
        ldrd            r6,  r7,  [sp, #44]

        vmov.i8         d30, #0
        vld1.8          {d31}, [r3]
        adr             r8,  L(save_tmvs_tbl)
        movrel_local    lr,  mask_mult
        movrel_local    r12, mv_tbls
        vld1.8          {d29}, [lr]
        vext.8          d31, d30, d31, #7     // [0, ref_sign]
        mov             r3,  #5
        mul             r1,  r1,  r3          // stride *= 5
        sub             r5,  r5,  r7          // h = row_end8 - row_start8
        lsl             r7,  r7,  #1          // row_start8 <<= 1
1:
        mov             r3,  #5
        mov             r11, #12*2
        and             r9,  r7,  #30         // (y & 15) * 2
        ldr             r9,  [r2, r9, lsl #2] // b = rr[(y & 15) * 2]
        add             r9,  r9,  #12         // &b[... + 1]
        mla             r10, r4,  r11, r9     // end_cand_b = &b[col_end8*2 + 1]
        mla             r9,  r6,  r11, r9     // cand_b = &b[x*2 + 1]

        mla             r3,  r6,  r3,  r0     // &rp[x]

        push            {r2, r4, r6}

2:
        ldrb            r11, [r9, #10]        // cand_b->bs
        add             lr,  r9,  #8
        vld1.8          {d0, d1}, [r9]        // cand_b->mv
        add             r11, r8,  r11, lsl #3
        vld1.16         {d2[]},   [lr]        // cand_b->ref
        ldrh            lr,  [r11]            // bw8
        mov             r2,  r8
        add             r9,  r9,  lr,  lsl #1 // cand_b += bw8*2
        cmp             r9,  r10
        vmov            d4,  d0
        bge             3f

        ldrb            r2,  [r9, #10]        // cand_b->bs
        add             lr,  r9,  #8
        vld1.8          {d6, d7}, [r9]        // cand_b->mv
        add             r2,  r8,  r2,  lsl #3
        vld1.16         {d2[1]},  [lr]        // cand_b->ref
        ldrh            lr,  [r2]             // bw8
        add             r9,  r9,  lr,  lsl #1 // cand_b += bw8*2
        vmov            d5,  d6

3:
        vabs.s16        q2,  q2               // abs(mv[].xy)
        vtbl.8          d2,  {d31}, d2        // ref_sign[ref]
        vshr.u16        q2,  q2,  #12         // abs(mv[].xy) >> 12
        vmull.u8        q1,  d2,  d29         // ref_sign[ref] * {1, 2}
        vceq.i32        q2,  q2,  #0          // abs(mv[].xy) < 4096
        vmovn.i32       d4,  q2               // abs() condition to 16 bit
        vand            d2,  d2,  d4          // h[0-3] contains conditions for mv[0-1]
        vpadd.i16       d2,  d2,  d2          // Combine condition for [1] and [0]
        vmov.u16        r4,  d2[0]            // Extract case for first block
        vmov.u16        r6,  d2[1]
        ldr             r11, [r11, #4]        // Fetch jump table entry
        ldr             r2,  [r2, #4]
        add             r4,  r12, r4,  lsl #4
        add             r6,  r12, r6,  lsl #4
        vld1.8          {d2, d3}, [r4]        // Load permutation table based on case
        vld1.8          {d4, d5}, [r6]
        add             r11, r8,  r11         // Find jump table target
        add             r2,  r8,  r2
        vtbl.8          d16, {d0, d1}, d2     // Permute cand_b to output refmvs_temporal_block
        vtbl.8          d17, {d0, d1}, d3
        vtbl.8          d18, {d6, d7}, d4
        vtbl.8          d19, {d6, d7}, d5
        vmov            q0,  q8

        // q1 follows on q0 (q8), with another 3 full repetitions of the pattern.
        vext.8          q1,  q8,  q8,  #1
        vext.8          q10, q9,  q9,  #1
        // q2 ends with 3 complete repetitions of the pattern.
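        // (Each refmvs_temporal_block is 5 bytes, so one 16-byte q register
        // holds 3 complete entries plus the first byte of a fourth.)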
        vext.8          q2,  q8,  q1,  #4
        vext.8          q11, q9,  q10, #4

        blx             r11
        bge             4f                    // if (cand_b >= end)
        vmov            q0,  q9
        vmov            q1,  q10
        vmov            q2,  q11
        cmp             r9,  r10
        blx             r2
        blt             2b                    // if (cand_b < end)

4:
        pop             {r2, r4, r6}

        subs            r5,  r5,  #1          // h--
        add             r7,  r7,  #2          // y += 2
        add             r0,  r0,  r1          // rp += stride
        bgt             1b

        pop             {r4-r11, pc}

        .align 2
L(save_tmvs_tbl):
        .word 16 * 12
        .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 16 * 12
        .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 8 * 12
        .word 80f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 8 * 12
        .word 80f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 8 * 12
        .word 80f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 8 * 12
        .word 80f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 4 * 12
        .word 40f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 4 * 12
        .word 40f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 4 * 12
        .word 40f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 4 * 12
        .word 40f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 2 * 12
        .word 20f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 2 * 12
        .word 20f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 2 * 12
        .word 20f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 2 * 12
        .word 20f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 2 * 12
        .word 20f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 1 * 12
        .word 10f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 1 * 12
        .word 10f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 1 * 12
        .word 10f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 1 * 12
        .word 10f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 1 * 12
        .word 10f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 1 * 12
        .word 10f  - L(save_tmvs_tbl) + CONFIG_THUMB
        .word 1 * 12
        .word 10f  - L(save_tmvs_tbl) + CONFIG_THUMB

10:
        add             r4,  r3,  #4
        vst1.32         {d0[0]},  [r3]
        vst1.8          {d0[4]},  [r4]
        add             r3,  r3,  #5
        bx              lr
20:
        add             r4,  r3,  #8
        vst1.8          {d0},     [r3]
        vst1.16         {d1[0]},  [r4]
        add             r3,  r3,  #2*5
        bx              lr
40:
        add             r4,  r3,  #16
        vst1.8          {q0},     [r3]
        vst1.32         {d2[0]},  [r4]
        add             r3,  r3,  #4*5
        bx              lr
80:
        add             r4,  r3,  #(8*5-16)
        // This writes 6 full entries plus 2 extra bytes
        vst1.8          {q0, q1}, [r3]
        // Write the last few, overlapping with the first write.
        vst1.8          {q2},     [r4]
        add             r3,  r3,  #8*5
        bx              lr
160:
        add             r4,  r3,  #6*5
        add             r6,  r3,  #12*5
        // This writes 6 full entries plus 2 extra bytes
        vst1.8          {q0, q1}, [r3]
        // Write another 6 full entries, slightly overlapping with the first set
        vst1.8          {q0, q1}, [r4]
        add             r4,  r3,  #(16*5-16)
        // Write 8 bytes (one full entry) after the first 12
        vst1.8          {d0},     [r6]
        // Write the last 3 entries
        vst1.8          {q2},     [r4]
        add             r3,  r3,  #16*5
        bx              lr
endfunc
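
// For reference, each row handled by loop 1: above corresponds roughly to the
// scalar logic sketched below. This is an illustrative sketch only, not the
// authoritative C reference: bw8_from_bs() is a hypothetical stand-in for the
// per-block-size width that L(save_tmvs_tbl) encodes, and the layouts assumed
// are the packed 12-byte refmvs_block and 5-byte refmvs_temporal_block used
// throughout this file.
//
//     // b = rr[(y & 15) * 2]; rp points at the current output row.
//     for (int x = col_start8; x < col_end8;) {
//         const refmvs_block *const cand_b = &b[x * 2 + 1];
//         const int bw8 = bw8_from_bs(cand_b->bs);
//         int which;
//         if (cand_b->ref.ref[1] > 0 && ref_sign[cand_b->ref.ref[1] - 1] &&
//             (abs(cand_b->mv.mv[1].y) | abs(cand_b->mv.mv[1].x)) < 4096)
//             which = 1;   // second ref/mv preferred when both qualify
//         else if (cand_b->ref.ref[0] > 0 && ref_sign[cand_b->ref.ref[0] - 1] &&
//                  (abs(cand_b->mv.mv[0].y) | abs(cand_b->mv.mv[0].x)) < 4096)
//             which = 0;
//         else
//             which = -1;  // no qualifying mv: store an invalid entry
//         for (int n = 0; n < bw8; n++, x++) {
//             if (which < 0) {
//                 rp[x].ref = 0;   // (the NEON path zeroes rp[x].mv as well)
//             } else {
//                 rp[x].mv  = cand_b->mv.mv[which];
//                 rp[x].ref = cand_b->ref.ref[which];
//             }
//         }
//     }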