/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"
#include "cdef_tmpl.S"

// n1 = s0/d0
// w1 = d0/q0
// n2 = s4/d2
// w2 = d2/q1
.macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret
        tst             r7,  #1 // CDEF_HAVE_LEFT
        beq             2f
        // CDEF_HAVE_LEFT
        tst             r7,  #2 // CDEF_HAVE_RIGHT
        beq             1f
        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        ldrh            r12, [\s1, #-2]
        vldr            \n1, [\s1]
        vdup.16         d4,  r12
        ldrh            r12, [\s1, #\w]
        vmov.16         d4[1], r12
        ldrh            r12, [\s2, #-2]
        vldr            \n2, [\s2]
        vmov.16         d4[2], r12
        ldrh            r12, [\s2, #\w]
        vmovl.u8        q0,  d0
        vmov.16         d4[3], r12
        vmovl.u8        q1,  d2
        vmovl.u8        q2,  d4
        vstr            s8,  [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s9,  [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        vstr            s10, [r0, #-4]
        vst1.16         {\w2}, [r0, :\align]
        vstr            s11, [r0, #2*\w]
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
        b               3f
.endif

1:
        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        ldrh            r12, [\s1, #-2]
        vldr            \n1, [\s1]
        vdup.16         d4,  r12
        ldrh            r12, [\s2, #-2]
        vldr            \n2, [\s2]
        vmovl.u8        q0,  d0
        vmov.16         d4[1], r12
        vmovl.u8        q1,  d2
        vmovl.u8        q2,  d4
        vstr            s8,  [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        vstr            s9,  [r0, #-4]
        vst1.16         {\w2}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
        b               3f
.endif

2:
        // !CDEF_HAVE_LEFT
        tst             r7,  #2 // CDEF_HAVE_RIGHT
        beq             1f
        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        vldr            \n1, [\s1]
        ldrh            r12, [\s1, #\w]
        vldr            \n2, [\s2]
        vdup.16         d4,  r12
        ldrh            r12, [\s2, #\w]
        vmovl.u8        q0,  d0
        vmov.16         d4[1], r12
        vmovl.u8        q1,  d2
        vmovl.u8        q2,  d4
        vstr            s12, [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s8,  [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        vstr            s12, [r0, #-4]
        vst1.16         {\w2}, [r0, :\align]
        vstr            s9,  [r0, #2*\w]
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
        b               3f
.endif

1:
        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        vldr            \n1, [\s1]
        vldr            \n2, [\s2]
        vmovl.u8        q0,  d0
        vmovl.u8        q1,  d2
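        // Neither edge column is present: both 2-pixel edge columns get
        // the 0x8000 "missing pixel" sentinel (s12 is the low word of q3,
        // which the caller keeps filled with 0x8000).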
        vstr            s12, [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        vstr            s12, [r0, #-4]
        vst1.16         {\w2}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
.endif
3:
.endm

.macro load_n_incr dst, src, incr, w
.if \w == 4
        vld1.32         {\dst\()[0]}, [\src, :32], \incr
.else
        vld1.8          {\dst\()},    [\src, :64], \incr
.endif
.endm

// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
//                                    ptrdiff_t src_stride, const pixel (*left)[2],
//                                    const pixel *const top,
//                                    const pixel *const bottom, int h,
//                                    enum CdefEdgeFlags edges);

// n1 = s0/d0
// w1 = d0/q0
// n2 = s4/d2
// w2 = d2/q1
.macro padding_func w, stride, n1, w1, n2, w2, align
function cdef_padding\w\()_8bpc_neon, export=1
        push            {r4-r8,lr}
        ldrd            r4,  r5,  [sp, #24]
        ldrd            r6,  r7,  [sp, #32]
        cmp             r7,  #0xf // fully edged
        beq             cdef_padding\w\()_edged_8bpc_neon
        vmov.i16        q3,  #0x8000
        tst             r7,  #4 // CDEF_HAVE_TOP
        bne             1f
        // !CDEF_HAVE_TOP
        sub             r12, r0,  #2*(2*\stride+2)
        vmov.i16        q2,  #0x8000
        vst1.16         {q2,q3}, [r12]!
.if \w == 8
        vst1.16         {q2,q3}, [r12]!
.endif
        b               3f
1:
        // CDEF_HAVE_TOP
        add             r8,  r4,  r2
        sub             r0,  r0,  #2*(2*\stride)
        pad_top_bottom  r4,  r8,  \w, \stride, \n1, \w1, \n2, \w2, \align, 0

        // Middle section
3:
        tst             r7,  #1 // CDEF_HAVE_LEFT
        beq             2f
        // CDEF_HAVE_LEFT
        tst             r7,  #2 // CDEF_HAVE_RIGHT
        beq             1f
        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
        vld1.16         {d2[]}, [r3, :16]!
        ldrh            r12, [r1, #\w]
        load_n_incr     d0,  r1,  r2,  \w
        subs            r6,  r6,  #1
        vmov.16         d2[1], r12
        vmovl.u8        q0,  d0
        vmovl.u8        q1,  d2
        vstr            s4,  [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s5,  [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        bgt             0b
        b               3f
1:
        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        vld1.16         {d2[]}, [r3, :16]!
        load_n_incr     d0,  r1,  r2,  \w
        subs            r6,  r6,  #1
        vmovl.u8        q0,  d0
        vmovl.u8        q1,  d2
        vstr            s4,  [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        bgt             1b
        b               3f
2:
        tst             r7,  #2 // CDEF_HAVE_RIGHT
        beq             1f
        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
        ldrh            r12, [r1, #\w]
        load_n_incr     d0,  r1,  r2,  \w
        vdup.16         d2,  r12
        subs            r6,  r6,  #1
        vmovl.u8        q0,  d0
        vmovl.u8        q1,  d2
        vstr            s12, [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s4,  [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        bgt             0b
        b               3f
1:
        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        load_n_incr     d0,  r1,  r2,  \w
        subs            r6,  r6,  #1
        vmovl.u8        q0,  d0
        vstr            s12, [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        bgt             1b

3:
        tst             r7,  #8 // CDEF_HAVE_BOTTOM
        bne             1f
        // !CDEF_HAVE_BOTTOM
        sub             r12, r0,  #4
        vmov.i16        q2,  #0x8000
        vst1.16         {q2,q3}, [r12]!
.if \w == 8
        vst1.16         {q2,q3}, [r12]!
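        // (second store: with w == 8 each padded row is 16 halfwords wide,
        // so the two bottom rows take 64 bytes in total)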
.endif
        pop             {r4-r8,pc}
1:
        // CDEF_HAVE_BOTTOM
        add             r8,  r5,  r2
        pad_top_bottom  r5,  r8,  \w, \stride, \n1, \w1, \n2, \w2, \align, 1
endfunc
.endm

padding_func 8, 16, d0, q0, d2, q1, 128
padding_func 4, 8,  s0, d0, s4, d2, 64

// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
//                                    ptrdiff_t src_stride, const pixel (*left)[2],
//                                    const pixel *const top,
//                                    const pixel *const bottom, int h,
//                                    enum CdefEdgeFlags edges);

.macro padding_func_edged w, stride, reg, align
function cdef_padding\w\()_edged_8bpc_neon
        sub             r0,  r0,  #(2*\stride)

        ldrh            r12, [r4, #-2]
        vldr            \reg, [r4]
        add             r8,  r4,  r2
        strh            r12, [r0, #-2]
        ldrh            r12, [r4, #\w]
        vstr            \reg, [r0]
        strh            r12, [r0, #\w]

        ldrh            r12, [r8, #-2]
        vldr            \reg, [r8]
        strh            r12, [r0, #\stride-2]
        ldrh            r12, [r8, #\w]
        vstr            \reg, [r0, #\stride]
        strh            r12, [r0, #\stride+\w]
        add             r0,  r0,  #2*\stride

0:
        ldrh            r12, [r3], #2
        vldr            \reg, [r1]
        str             r12, [r0, #-2]
        ldrh            r12, [r1, #\w]
        add             r1,  r1,  r2
        subs            r6,  r6,  #1
        vstr            \reg, [r0]
        str             r12, [r0, #\w]
        add             r0,  r0,  #\stride
        bgt             0b

        ldrh            r12, [r5, #-2]
        vldr            \reg, [r5]
        add             r8,  r5,  r2
        strh            r12, [r0, #-2]
        ldrh            r12, [r5, #\w]
        vstr            \reg, [r0]
        strh            r12, [r0, #\w]

        ldrh            r12, [r8, #-2]
        vldr            \reg, [r8]
        strh            r12, [r0, #\stride-2]
        ldrh            r12, [r8, #\w]
        vstr            \reg, [r0, #\stride]
        strh            r12, [r0, #\stride+\w]

        pop             {r4-r8,pc}
endfunc
.endm

padding_func_edged 8, 16, d0, 64
padding_func_edged 4, 8,  s0, 32

tables

filter 8, 8
filter 4, 8

find_dir 8

.macro load_px_8 d11, d12, d21, d22, w
.if \w == 8
        add             r6,  r2,  r9            // x + off
        sub             r9,  r2,  r9            // x - off
        vld1.8          {\d11}, [r6]            // p0
        add             r6,  r6,  #16           // += stride
        vld1.8          {\d21}, [r9]            // p1
        add             r9,  r9,  #16           // += stride
        vld1.8          {\d12}, [r6]            // p0
        vld1.8          {\d22}, [r9]            // p1
.else
        add             r6,  r2,  r9            // x + off
        sub             r9,  r2,  r9            // x - off
        vld1.32         {\d11[0]}, [r6]         // p0
        add             r6,  r6,  #8            // += stride
        vld1.32         {\d21[0]}, [r9]         // p1
        add             r9,  r9,  #8            // += stride
        vld1.32         {\d11[1]}, [r6]         // p0
        add             r6,  r6,  #8            // += stride
        vld1.32         {\d21[1]}, [r9]         // p1
        add             r9,  r9,  #8            // += stride
        vld1.32         {\d12[0]}, [r6]         // p0
        add             r6,  r6,  #8            // += stride
        vld1.32         {\d22[0]}, [r9]         // p1
        add             r9,  r9,  #8            // += stride
        vld1.32         {\d12[1]}, [r6]         // p0
        vld1.32         {\d22[1]}, [r9]         // p1
.endif
.endm
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
.if \min
        vmin.u8         q3,  q3,  \s1
        vmax.u8         q4,  q4,  \s1
        vmin.u8         q3,  q3,  \s2
        vmax.u8         q4,  q4,  \s2
.endif
        vabd.u8         q8,  q0,  \s1           // abs(diff)
        vabd.u8         q11, q0,  \s2           // abs(diff)
        vshl.u8         q9,  q8,  \shift        // abs(diff) >> shift
        vshl.u8         q12, q11, \shift        // abs(diff) >> shift
        vqsub.u8        q9,  \thresh_vec, q9    // clip = imax(0, threshold - (abs(diff) >> shift))
        vqsub.u8        q12, \thresh_vec, q12   // clip = imax(0, threshold - (abs(diff) >> shift))
        vcgt.u8         q10, q0,  \s1           // px > p0
        vcgt.u8         q13, q0,  \s2           // px > p1
        vmin.u8         q9,  q9,  q8            // imin(abs(diff), clip)
        vmin.u8         q12, q12, q11           // imin(abs(diff), clip)
        vneg.s8         q8,  q9                 // -imin()
        vneg.s8         q11, q12                // -imin()
        vbsl            q10, q8,  q9            // constrain() = imax(imin(diff, clip), -clip)
        vdup.8          d18, \tap               // taps[k]
        vbsl            q13, q11, q12           // constrain() = imax(imin(diff, clip), -clip)
        vmlal.s8        q1,  d20, d18           // sum += taps[k] * constrain()
        vmlal.s8        q1,  d26, d18           // sum += taps[k] * constrain()
        vmlal.s8        q2,  d21, d18           // sum += taps[k] * constrain()
        vmlal.s8        q2,  d27, d18           // sum += taps[k] * constrain()
.endm

// void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride,
//                              const uint8_t *tmp, int pri_strength,
//                              int sec_strength, int dir, int damping,
//                              int h, size_t edges);
.macro filter_func_8 w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_edged_neon
.if \pri
        movrel_local    r8,  pri_taps
        and             r9,  r3,  #1
        add             r8,  r8,  r9, lsl #1
.endif
        movrel_local    r9,  directions\w
        add             r5,  r9,  r5, lsl #1
        vmov.u8         d17, #7
        vdup.8          d16, r6                 // damping

        vmov.8          d8[0], r3
        vmov.8          d8[1], r4
        vclz.i8         d8,  d8                 // clz(threshold)
        vsub.i8         d8,  d17, d8            // ulog2(threshold)
        vqsub.u8        d8,  d16, d8            // shift = imax(0, damping - ulog2(threshold))
        vneg.s8         d8,  d8                 // -shift
.if \sec
        vdup.8          q6,  d8[1]
.endif
.if \pri
        vdup.8          q5,  d8[0]
.endif

1:
.if \w == 8
        add             r12, r2,  #16
        vld1.8          {d0}, [r2, :64]         // px
        vld1.8          {d1}, [r12, :64]        // px
.else
        add             r12, r2,  #8
        vld1.32         {d0[0]}, [r2, :32]      // px
        add             r9,  r2,  #2*8
        vld1.32         {d0[1]}, [r12, :32]     // px
        add             r12, r12, #2*8
        vld1.32         {d1[0]}, [r9, :32]      // px
        vld1.32         {d1[1]}, [r12, :32]     // px
.endif

        vmov.u8         q1,  #0                 // sum
        vmov.u8         q2,  #0                 // sum
.if \min
        vmov.u16        q3,  q0                 // min
        vmov.u16        q4,  q0                 // max
.endif

        // Instead of loading sec_taps 2, 1 from memory, just set it
        // to 2 initially and decrease for the second round.
        // This is also used as loop counter.
        mov             lr,  #2                 // sec_taps[0]

2:
.if \pri
        ldrsb           r9,  [r5]               // off1

        load_px_8       d28, d29, d30, d31, \w
.endif

.if \sec
        add             r5,  r5,  #4            // +2*2
        ldrsb           r9,  [r5]               // off2
.endif

.if \pri
        ldrb            r12, [r8]               // *pri_taps
        vdup.8          q7,  r3                 // threshold

        handle_pixel_8  q14, q15, q7,  q5,  r12, \min
.endif

.if \sec
        load_px_8       d28, d29, d30, d31, \w

        add             r5,  r5,  #8            // +2*4
        ldrsb           r9,  [r5]               // off3

        vdup.8          q7,  r4                 // threshold

        handle_pixel_8  q14, q15, q7,  q6,  lr, \min

        load_px_8       d28, d29, d30, d31, \w

        handle_pixel_8  q14, q15, q7,  q6,  lr, \min

        sub             r5,  r5,  #11           // r5 -= 2*(2+4); r5 += 1;
.else
        add             r5,  r5,  #1            // r5 += 1
.endif
        subs            lr,  lr,  #1            // sec_tap-- (value)
.if \pri
        add             r8,  r8,  #1            // pri_taps++ (pointer)
.endif
        bne             2b

        vshr.s16        q14, q1,  #15           // -(sum < 0)
        vshr.s16        q15, q2,  #15           // -(sum < 0)
        vadd.i16        q1,  q1,  q14           // sum - (sum < 0)
        vadd.i16        q2,  q2,  q15           // sum - (sum < 0)
        vrshr.s16       q1,  q1,  #4            // (8 + sum - (sum < 0)) >> 4
        vrshr.s16       q2,  q2,  #4            // (8 + sum - (sum < 0)) >> 4
        vaddw.u8        q1,  q1,  d0            // px + (8 + sum ...) >> 4
        vaddw.u8        q2,  q2,  d1            // px + (8 + sum ...) >> 4
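        // Narrow the widened sums back to 8-bit pixels with unsigned
        // saturation.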
        vqmovun.s16     d0,  q1
        vqmovun.s16     d1,  q2
.if \min
        vmin.u8         q0,  q0,  q4
        vmax.u8         q0,  q0,  q3            // iclip(px + .., min, max)
.endif
.if \w == 8
        vst1.8          {d0}, [r0, :64], r1
        add             r2,  r2,  #2*16         // tmp += 2*tmp_stride
        subs            r7,  r7,  #2            // h -= 2
        vst1.8          {d1}, [r0, :64], r1
.else
        vst1.32         {d0[0]}, [r0, :32], r1
        add             r2,  r2,  #4*8          // tmp += 4*tmp_stride
        vst1.32         {d0[1]}, [r0, :32], r1
        subs            r7,  r7,  #4            // h -= 4
        vst1.32         {d1[0]}, [r0, :32], r1
        vst1.32         {d1[1]}, [r0, :32], r1
.endif

        // Reset pri_taps and directions back to the original point
        sub             r5,  r5,  #2
.if \pri
        sub             r8,  r8,  #2
.endif

        bgt             1b
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm

.macro filter_8 w
filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
.endm

filter_8 8
filter_8 4
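
// For reference, handle_pixel_8 plus the rounding above implement the
// following C sketch, reconstructed from the comments in this file
// (imin/imax/apply_sign/iclip are assumed helpers, not defined here):
//
//   int constrain(int diff, int threshold, int shift) {
//       int adiff = abs(diff);
//       return apply_sign(imin(adiff, imax(0, threshold - (adiff >> shift))),
//                         diff);
//   }
//
//   // for each tap pixel p:  sum += taps[k] * constrain(p - px, threshold, shift);
//   // after all taps:        dst = iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max);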