/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"
#include "cdef_tmpl.S"

.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
        tst             w7,  #1                      // CDEF_HAVE_LEFT
        b.eq            2f
        // CDEF_HAVE_LEFT
        sub             \s1, \s1, #2
        sub             \s2, \s2, #2
        tst             w7,  #2                      // CDEF_HAVE_RIGHT
        b.eq            1f
        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        ldr             \rn\()0, [\s1]
        ldr             s1,  [\s1, #\w]
        ldr             \rn\()2, [\s2]
        ldr             s3,  [\s2, #\w]
        uxtl            v0.8h, v0.8b
        uxtl            v1.8h, v1.8b
        uxtl            v2.8h, v2.8b
        uxtl            v3.8h, v3.8b
        str             \rw\()0, [x0]
        str             d1,  [x0, #2*\w]
        add             x0,  x0,  #2*\stride
        str             \rw\()2, [x0]
        str             d3,  [x0, #2*\w]
.if \ret
        ret
.else
        add             x0,  x0,  #2*\stride
        b               3f
.endif

1:
        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        ldr             \rn\()0, [\s1]
        ldr             h1,  [\s1, #\w]
        ldr             \rn\()2, [\s2]
        ldr             h3,  [\s2, #\w]
        uxtl            v0.8h, v0.8b
        uxtl            v1.8h, v1.8b
        uxtl            v2.8h, v2.8b
        uxtl            v3.8h, v3.8b
        str             \rw\()0, [x0]
        str             s1,  [x0, #2*\w]
        str             s31, [x0, #2*\w+4]
        add             x0,  x0,  #2*\stride
        str             \rw\()2, [x0]
        str             s3,  [x0, #2*\w]
        str             s31, [x0, #2*\w+4]
.if \ret
        ret
.else
        add             x0,  x0,  #2*\stride
        b               3f
.endif

2:
        // !CDEF_HAVE_LEFT
        tst             w7,  #2                      // CDEF_HAVE_RIGHT
        b.eq            1f
        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        ldr             \rn\()0, [\s1]
        ldr             h1,  [\s1, #\w]
        ldr             \rn\()2, [\s2]
        ldr             h3,  [\s2, #\w]
        uxtl            v0.8h, v0.8b
        uxtl            v1.8h, v1.8b
        uxtl            v2.8h, v2.8b
        uxtl            v3.8h, v3.8b
        str             s31, [x0]
        stur            \rw\()0, [x0, #4]
        str             s1,  [x0, #4+2*\w]
        add             x0,  x0,  #2*\stride
        str             s31, [x0]
        stur            \rw\()2, [x0, #4]
        str             s3,  [x0, #4+2*\w]
.if \ret
        ret
.else
        add             x0,  x0,  #2*\stride
        b               3f
.endif

1:
        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        ldr             \rn\()0, [\s1]
        ldr             \rn\()1, [\s2]
        uxtl            v0.8h, v0.8b
        uxtl            v1.8h, v1.8b
        str             s31, [x0]
        stur            \rw\()0, [x0, #4]
        str             s31, [x0, #4+2*\w]
        add             x0,  x0,  #2*\stride
        str             s31, [x0]
        stur            \rw\()1, [x0, #4]
        str             s31, [x0, #4+2*\w]
.if \ret
        ret
.else
        add             x0,  x0,  #2*\stride
.endif
3:
.endm

.macro load_n_incr dst, src, incr, w
.if \w == 4
        ld1             {\dst\().s}[0], [\src], \incr
.else
        ld1             {\dst\().8b},   [\src], \incr
.endif
.endm
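
// A note on the layout (explanatory, in rough C terms): the padding
// functions below expand a w x h block into a 16-bit temporary buffer of
// (w+4) x (h+4) used cells, with tmp pointing 2 rows and 2 columns into
// it. Any neighbor that the CdefEdgeFlags bits (1=LEFT, 2=RIGHT, 4=TOP,
// 8=BOTTOM) mark as unavailable is filled with the sentinel 0x8000,
// which no real 8-bit pixel can produce. As a hypothetical sketch (not
// dav1d's actual C code; available() and src_px() are illustrative
// helpers):
//
//   for (int y = -2; y < h + 2; y++)
//       for (int x = -2; x < w + 2; x++)
//           tmp[y * tmp_stride + x] =
//               available(x, y, w, h, edges) ? src_px(x, y) : 0x8000;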

// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
//                                    ptrdiff_t src_stride, const pixel (*left)[2],
//                                    const pixel *const top,
//                                    const pixel *const bottom, int h,
//                                    enum CdefEdgeFlags edges);

.macro padding_func w, stride, rn, rw
function cdef_padding\w\()_8bpc_neon, export=1
        cmp             w7,  #0xf                    // fully edged
        b.eq            cdef_padding\w\()_edged_8bpc_neon
        movi            v30.8h, #0x80, lsl #8
        mov             v31.16b, v30.16b
        sub             x0,  x0,  #2*(2*\stride+2)
        tst             w7,  #4                      // CDEF_HAVE_TOP
        b.ne            1f
        // !CDEF_HAVE_TOP
        st1             {v30.8h, v31.8h}, [x0], #32
.if \w == 8
        st1             {v30.8h, v31.8h}, [x0], #32
.endif
        b               3f
1:
        // CDEF_HAVE_TOP
        add             x9,  x4,  x2
        pad_top_bottom  x4,  x9,  \w, \stride, \rn, \rw, 0

        // Middle section
3:
        tst             w7,  #1                      // CDEF_HAVE_LEFT
        b.eq            2f
        // CDEF_HAVE_LEFT
        tst             w7,  #2                      // CDEF_HAVE_RIGHT
        b.eq            1f
        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
        ld1             {v0.h}[0], [x3], #2
        ldr             h2,  [x1, #\w]
        load_n_incr     v1,  x1,  x2,  \w
        subs            w6,  w6,  #1
        uxtl            v0.8h, v0.8b
        uxtl            v1.8h, v1.8b
        uxtl            v2.8h, v2.8b
        str             s0,  [x0]
        stur            \rw\()1, [x0, #4]
        str             s2,  [x0, #4+2*\w]
        add             x0,  x0,  #2*\stride
        b.gt            0b
        b               3f
1:
        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        ld1             {v0.h}[0], [x3], #2
        load_n_incr     v1,  x1,  x2,  \w
        subs            w6,  w6,  #1
        uxtl            v0.8h, v0.8b
        uxtl            v1.8h, v1.8b
        str             s0,  [x0]
        stur            \rw\()1, [x0, #4]
        str             s31, [x0, #4+2*\w]
        add             x0,  x0,  #2*\stride
        b.gt            1b
        b               3f
2:
        tst             w7,  #2                      // CDEF_HAVE_RIGHT
        b.eq            1f
        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
        ldr             h1,  [x1, #\w]
        load_n_incr     v0,  x1,  x2,  \w
        subs            w6,  w6,  #1
        uxtl            v0.8h, v0.8b
        uxtl            v1.8h, v1.8b
        str             s31, [x0]
        stur            \rw\()0, [x0, #4]
        str             s1,  [x0, #4+2*\w]
        add             x0,  x0,  #2*\stride
        b.gt            0b
        b               3f
1:
        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        load_n_incr     v0,  x1,  x2,  \w
        subs            w6,  w6,  #1
        uxtl            v0.8h, v0.8b
        str             s31, [x0]
        stur            \rw\()0, [x0, #4]
        str             s31, [x0, #4+2*\w]
        add             x0,  x0,  #2*\stride
        b.gt            1b

3:
        tst             w7,  #8                      // CDEF_HAVE_BOTTOM
        b.ne            1f
        // !CDEF_HAVE_BOTTOM
        st1             {v30.8h, v31.8h}, [x0], #32
.if \w == 8
        st1             {v30.8h, v31.8h}, [x0], #32
.endif
        ret
1:
        // CDEF_HAVE_BOTTOM
        add             x9,  x5,  x2
        pad_top_bottom  x5,  x9,  \w, \stride, \rn, \rw, 1
endfunc
.endm

padding_func 8, 16, d, q
padding_func 4, 8, s, d
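
// The two instantiations above produce dav1d_cdef_padding8_8bpc_neon
// (w=8, tmp stride 16, d/q registers for the row loads/stores) and
// dav1d_cdef_padding4_8bpc_neon (w=4, tmp stride 8, s/d registers).
//
// When all four edges are present (edges == 0xf), padding_func branches
// straight to the _edged variant below instead: with every neighbor
// available there is no need for the 0x8000 sentinel, so the buffer can
// stay uint8_t and the 8-bit filter functions further down can be used.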

// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
//                                    ptrdiff_t src_stride, const pixel (*left)[2],
//                                    const pixel *const top,
//                                    const pixel *const bottom, int h,
//                                    enum CdefEdgeFlags edges);

.macro padding_func_edged w, stride, reg
function cdef_padding\w\()_edged_8bpc_neon, export=1
        sub             x4,  x4,  #2
        sub             x5,  x5,  #2
        sub             x0,  x0,  #(2*\stride+2)

.if \w == 4
        ldr             d0,  [x4]
        ldr             d1,  [x4, x2]
        st1             {v0.8b, v1.8b}, [x0], #16
.else
        add             x9,  x4,  x2
        ldr             d0,  [x4]
        ldr             s1,  [x4, #8]
        ldr             d2,  [x9]
        ldr             s3,  [x9, #8]
        str             d0,  [x0]
        str             s1,  [x0, #8]
        str             d2,  [x0, #\stride]
        str             s3,  [x0, #\stride+8]
        add             x0,  x0,  #2*\stride
.endif

0:
        ld1             {v0.h}[0], [x3], #2
        ldr             h2,  [x1, #\w]
        load_n_incr     v1,  x1,  x2,  \w
        subs            w6,  w6,  #1
        str             h0,  [x0]
        stur            \reg\()1, [x0, #2]
        str             h2,  [x0, #2+\w]
        add             x0,  x0,  #\stride
        b.gt            0b

.if \w == 4
        ldr             d0,  [x5]
        ldr             d1,  [x5, x2]
        st1             {v0.8b, v1.8b}, [x0], #16
.else
        add             x9,  x5,  x2
        ldr             d0,  [x5]
        ldr             s1,  [x5, #8]
        ldr             d2,  [x9]
        ldr             s3,  [x9, #8]
        str             d0,  [x0]
        str             s1,  [x0, #8]
        str             d2,  [x0, #\stride]
        str             s3,  [x0, #\stride+8]
.endif
        ret
endfunc
.endm

padding_func_edged 8, 16, d
padding_func_edged 4, 8, s

tables

filter 8, 8
filter 4, 8

find_dir 8

.macro load_px_8 d1, d2, w
.if \w == 8
        add             x6,  x2,  w9, sxtb           // x + off
        sub             x9,  x2,  w9, sxtb           // x - off
        ld1             {\d1\().d}[0], [x6]          // p0
        add             x6,  x6,  #16                // += stride
        ld1             {\d2\().d}[0], [x9]          // p1
        add             x9,  x9,  #16                // += stride
        ld1             {\d1\().d}[1], [x6]          // p0
        ld1             {\d2\().d}[1], [x9]          // p1
.else
        add             x6,  x2,  w9, sxtb           // x + off
        sub             x9,  x2,  w9, sxtb           // x - off
        ld1             {\d1\().s}[0], [x6]          // p0
        add             x6,  x6,  #8                 // += stride
        ld1             {\d2\().s}[0], [x9]          // p1
        add             x9,  x9,  #8                 // += stride
        ld1             {\d1\().s}[1], [x6]          // p0
        add             x6,  x6,  #8                 // += stride
        ld1             {\d2\().s}[1], [x9]          // p1
        add             x9,  x9,  #8                 // += stride
        ld1             {\d1\().s}[2], [x6]          // p0
        add             x6,  x6,  #8                 // += stride
        ld1             {\d2\().s}[2], [x9]          // p1
        add             x9,  x9,  #8                 // += stride
        ld1             {\d1\().s}[3], [x6]          // p0
        ld1             {\d2\().s}[3], [x9]          // p1
.endif
.endm
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
.if \min
        umin            v3.16b,  v3.16b,  \s1\().16b
        umax            v4.16b,  v4.16b,  \s1\().16b
        umin            v3.16b,  v3.16b,  \s2\().16b
        umax            v4.16b,  v4.16b,  \s2\().16b
.endif
        uabd            v16.16b, v0.16b,  \s1\().16b // abs(diff)
        uabd            v20.16b, v0.16b,  \s2\().16b // abs(diff)
        ushl            v17.16b, v16.16b, \shift     // abs(diff) >> shift
        ushl            v21.16b, v20.16b, \shift     // abs(diff) >> shift
        uqsub           v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift))
        uqsub           v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift))
        cmhi            v18.16b, v0.16b,  \s1\().16b // px > p0
        cmhi            v22.16b, v0.16b,  \s2\().16b // px > p1
        umin            v17.16b, v17.16b, v16.16b    // imin(abs(diff), clip)
        umin            v21.16b, v21.16b, v20.16b    // imin(abs(diff), clip)
        dup             v19.16b, \tap                // taps[k]
        neg             v16.16b, v17.16b             // -imin()
        neg             v20.16b, v21.16b             // -imin()
        bsl             v18.16b, v16.16b, v17.16b    // constrain() = apply_sign()
        bsl             v22.16b, v20.16b, v21.16b    // constrain() = apply_sign()
        mla             v1.16b,  v18.16b, v19.16b    // sum += taps[k] * constrain()
        mla             v2.16b,  v22.16b, v19.16b    // sum += taps[k] * constrain()
.endm
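
// handle_pixel_8 above is an all-unsigned 8-bit formulation of CDEF's
// constrain(), restating the per-instruction comments as one formula:
//
//   constrain(diff, threshold, shift) =
//       apply_sign(imin(abs(diff), imax(0, threshold - (abs(diff) >> shift))),
//                  diff)
//
// uqsub provides the saturating imax(0, ...), cmhi recovers the sign of
// diff, and bsl picks imin(...) or -imin(...) per lane, so no lane ever
// needs more than 8 bits.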

// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
//                                   const uint8_t *tmp, int pri_strength,
//                                   int sec_strength, int dir, int damping,
//                                   int h);
.macro filter_func_8 w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_edged_8bpc_neon
.if \pri
        movrel          x8,  pri_taps
        and             w9,  w3,  #1
        add             x8,  x8,  w9, uxtw #1
.endif
        movrel          x9,  directions\w
        add             x5,  x9,  w5, uxtw #1
        movi            v30.8b,  #7
        dup             v28.8b,  w6                  // damping

.if \pri
        dup             v25.16b, w3                  // threshold
.endif
.if \sec
        dup             v27.16b, w4                  // threshold
.endif
        trn1            v24.8b,  v25.8b,  v27.8b
        clz             v24.8b,  v24.8b              // clz(threshold)
        sub             v24.8b,  v30.8b,  v24.8b     // ulog2(threshold)
        uqsub           v24.8b,  v28.8b,  v24.8b     // shift = imax(0, damping - ulog2(threshold))
        neg             v24.8b,  v24.8b              // -shift
.if \sec
        dup             v26.16b, v24.b[1]
.endif
.if \pri
        dup             v24.16b, v24.b[0]
.endif

1:
.if \w == 8
        add             x12, x2,  #16
        ld1             {v0.d}[0], [x2]              // px
        ld1             {v0.d}[1], [x12]             // px
.else
        add             x12, x2,  #1*8
        add             x13, x2,  #2*8
        add             x14, x2,  #3*8
        ld1             {v0.s}[0], [x2]              // px
        ld1             {v0.s}[1], [x12]             // px
        ld1             {v0.s}[2], [x13]             // px
        ld1             {v0.s}[3], [x14]             // px
.endif

        // We need 9 bits, or two 8-bit accumulators, to fit the sum.
        // Max of |sum| is 15*2*6(pri) + 4*4*3(sec) = 228.
        // Start sum at -1 instead of 0 to help handle rounding later.
        movi            v1.16b, #255                 // sum
        movi            v2.16b, #0                   // sum
.if \min
        mov             v3.16b, v0.16b               // min
        mov             v4.16b, v0.16b               // max
.endif

        // Instead of loading sec_taps 2, 1 from memory, just set it
        // to 2 initially and decrease for the second round.
        // This is also used as loop counter.
        mov             w11, #2                      // sec_taps[0]

2:
.if \pri
        ldrb            w9,  [x5]                    // off1

        load_px_8       v5,  v6,  \w
.endif

.if \sec
        add             x5,  x5,  #4                 // +2*2
        ldrb            w9,  [x5]                    // off2
        load_px_8       v28, v29, \w
.endif

.if \pri
        ldrb            w10, [x8]                    // *pri_taps

        handle_pixel_8  v5,  v6,  v25.16b, v24.16b, w10, \min
.endif

.if \sec
        add             x5,  x5,  #8                 // +2*4
        ldrb            w9,  [x5]                    // off3
        load_px_8       v5,  v6,  \w

        handle_pixel_8  v28, v29, v27.16b, v26.16b, w11, \min

        handle_pixel_8  v5,  v6,  v27.16b, v26.16b, w11, \min

        sub             x5,  x5,  #11                // x5 -= 2*(2+4); x5 += 1;
.else
        add             x5,  x5,  #1                 // x5 += 1
.endif
        subs            w11, w11, #1                 // sec_tap-- (value)
.if \pri
        add             x8,  x8,  #1                 // pri_taps++ (pointer)
.endif
        b.ne            2b

        // Perform halving adds since the value won't fit otherwise.
        // To handle the offset for negative values, use both halving w/ and w/o rounding.
        srhadd          v5.16b,  v1.16b,  v2.16b     // sum >> 1
        shadd           v6.16b,  v1.16b,  v2.16b     // (sum - 1) >> 1
        cmlt            v1.16b,  v5.16b,  #0         // sum < 0
        bsl             v1.16b,  v6.16b,  v5.16b     // (sum - (sum < 0)) >> 1

        srshr           v1.16b,  v1.16b,  #3         // (8 + sum - (sum < 0)) >> 4

        usqadd          v0.16b,  v1.16b              // px + (8 + sum ...) >> 4
.if \min
        umin            v0.16b,  v0.16b,  v4.16b
        umax            v0.16b,  v0.16b,  v3.16b     // iclip(px + .., min, max)
.endif
.if \w == 8
        st1             {v0.d}[0], [x0], x1
        add             x2,  x2,  #2*16              // tmp += 2*tmp_stride
        subs            w7,  w7,  #2                 // h -= 2
        st1             {v0.d}[1], [x0], x1
.else
        st1             {v0.s}[0], [x0], x1
        add             x2,  x2,  #4*8               // tmp += 4*tmp_stride
        st1             {v0.s}[1], [x0], x1
        subs            w7,  w7,  #4                 // h -= 4
        st1             {v0.s}[2], [x0], x1
        st1             {v0.s}[3], [x0], x1
.endif

        // Reset pri_taps and directions back to the original point
        sub             x5,  x5,  #2
.if \pri
        sub             x8,  x8,  #2
.endif

        b.gt            1b
        ret
endfunc
.endm

.macro filter_8 w
filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
.endm

filter_8 8
filter_8 4
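
// Footnote on the two-accumulator rounding in filter_func_8 (an
// explanatory worked example of the instructions above): with v1 biased
// to start at -1, v1 + v2 == sum - 1, so
//   srhadd: (v1 + v2 + 1) >> 1 == sum >> 1
//   shadd:  (v1 + v2) >> 1     == (sum - 1) >> 1
// Selecting the shadd result when sum < 0 gives (sum - (sum < 0)) >> 1,
// and the rounding srshr #3 then yields (8 + sum - (sum < 0)) >> 4, the
// offset that usqadd finally adds to px with unsigned saturation.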