/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

const right_ext_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst

// void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
//                                      const pixel (*left)[4], const pixel *lpf,
//                                      const int w, int h,
//                                      const int16_t filter[2][8],
//                                      const enum LrEdgeFlags edges,
//                                      const int bitdepth_max);
function wiener_filter7_16bpc_neon, export=1
        ldr     w8, [sp]
        AARCH64_SIGN_LINK_REGISTER
        stp     x29, x30, [sp, #-32]!
        stp     d8, d9, [sp, #16]
        mov     x29, sp
        ld1     {v0.8h, v1.8h}, [x6]
        tst     w7, #4 // LR_HAVE_TOP
        sub_sp  384*2*6

        dup     v28.8h, w8 // bitdepth_max
        clz     w8, w8
        movi    v30.4s, #1
        sub     w10, w8, #38 // -(bitdepth + 6)
        sub     w11, w8, #11 // round_bits_v
        sub     w8, w8, #25 // -round_bits_h
        neg     w10, w10 // bitdepth + 6
        neg     w11, w11 // -round_bits_v
        dup     v2.4s, w10
        dup     v29.4s, w8 // -round_bits_h
        dup     v27.4s, w11 // -round_bits_v
        movi    v31.8h, #0x20, lsl #8 // 1 << 13 = 8192
        ushl    v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6)

        zip1    v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1

        // x9  - t6
        // x10 - t5
        // x11 - t4
        // x12 - t3
        // x13 - t2
        // x14 - t1
        // x15 - t0
        mov     x14, sp // t1
        b.eq    L(no_top_7)

        mov     x16, x2 // backup left
        mov     x2, #0
        bl      wiener_filter7_h_16bpc_neon
        add     x3, x3, x1 // lpf += stride
        mov     x9, x14 // t6
        mov     x10, x14 // t5
        add     x14, x14, #384*2 // t1 += 384*2
        bl      wiener_filter7_h_16bpc_neon
        add     x3, x3, x1, lsl #2
        add     x3, x3, x1 // lpf += stride*5
        mov     x11, x14 // t4
        add     x14, x14, #384*2 // t1 += 384*2
        mov     x2, x16 // left
        mov     x16, x3 // backup lpf
        mov     x3, x0 // lpf = p
        bl      wiener_filter7_h_16bpc_neon
        subs    w5, w5, #1 // h--
        mov     x12, x14 // t3
        mov     x13, x14 // t2
        b.eq    L(v1_7)
        add     x3, x3, x1 // src += stride
        add     x14, x14, #384*2 // t1 += 384*2
        bl      wiener_filter7_h_16bpc_neon
        mov     x13, x14 // t2
        subs    w5, w5, #1 // h--
        b.eq    L(v2_7)
        add     x3, x3, x1 // src += stride
        add     x14, x14, #384*2 // t1 += 384*2
        bl      wiener_filter7_h_16bpc_neon
        subs    w5, w5, #1 // h--
        b.eq    L(v3_7)
        add     x3, x3, x1 // src += stride

L(main_7):
        add     x15, x14, #384*2 // t0 = t1 + 384*2
L(main_loop_7):
        bl      wiener_filter7_hv_16bpc_neon
        subs    w5, w5, #1 // h--
        b.ne    L(main_loop_7)
        tst     w7, #8 // LR_HAVE_BOTTOM
        b.eq    L(v3_7)

        mov     x3, x16 // restore lpf
        mov     x2, #0 // left = NULL
        bl      wiener_filter7_hv_16bpc_neon
        bl      wiener_filter7_hv_16bpc_neon
L(v1_7):
        bl      wiener_filter7_v_16bpc_neon

        mov     sp, x29
        ldp     d8, d9, [sp, #16]
        ldp     x29, x30, [sp], #32
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(no_top_7):
        add     x3, x3, x1, lsl #2
        add     x16, x3, x1, lsl #1 // lpf += stride*6, backup
        mov     x3, x0 // lpf = p

        bl      wiener_filter7_h_16bpc_neon
        subs    w5, w5, #1 // h--
        mov     x9, x14 // t6
        mov     x10, x14 // t5
        mov     x11, x14 // t4
        mov     x12, x14 // t3
        mov     x13, x14 // t2
        b.eq    L(v1_7)
        add     x3, x3, x1 // src += p_stride
        add     x14, x14, #384*2 // t1 += 384*2
        bl      wiener_filter7_h_16bpc_neon
        subs    w5, w5, #1 // h--
        mov     x13, x14 // t2
        b.eq    L(v2_7)
        add     x3, x3, x1 // src += p_stride
        add     x14, x14, #384*2 // t1 += 384*2
        bl      wiener_filter7_h_16bpc_neon
        subs    w5, w5, #1 // h--
        b.eq    L(v3_7)
        add     x3, x3, x1 // src += p_stride
        add     x15, x14, #384*2 // t0 = t1 + 384*2
        bl      wiener_filter7_hv_16bpc_neon
        subs    w5, w5, #1 // h--
        b.eq    L(v3_7)
        add     x15, x15, #384*2*4 // t0 += 384*2*4
        bl      wiener_filter7_hv_16bpc_neon
        subs    w5, w5, #1 // h--
        b.ne    L(main_7)
L(v3_7):
        bl      wiener_filter7_v_16bpc_neon
L(v2_7):
        bl      wiener_filter7_v_16bpc_neon
        b       L(v1_7)
endfunc


function wiener_filter7_h_16bpc_neon
        stp     x3, x4, [sp, #-32]!
        str     x14, [sp, #16]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst     w7, #1 // LR_HAVE_LEFT
        b.eq    1f
        // LR_HAVE_LEFT
        cbnz    x2, 0f
        // left == NULL
        sub     x3, x3, #6
        ld1     {v2.8h, v3.8h}, [x3], #32
        b       2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1     {v2.8h, v3.8h}, [x3], #32
        ld1     {v4.d}[1], [x2], #8
        // Move x3 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub     x3, x3, #6
        ext     v3.16b, v2.16b, v3.16b, #10
        ext     v2.16b, v4.16b, v2.16b, #10
        b       2f

1:
        ld1     {v2.8h, v3.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
        // and shift v3 to have 3x the first pixel at the front.
        dup     v4.8h, v2.h[0]
        // Move x3 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub     x3, x3, #6
        ext     v3.16b, v2.16b, v3.16b, #10
        ext     v2.16b, v4.16b, v2.16b, #10

2:
        ld1     {v4.8h}, [x3], #16

        tst     w7, #2 // LR_HAVE_RIGHT
        b.ne    4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp     w4, #19
        b.ge    4f // If w >= 19, all used input pixels are valid

        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
        sub     w17, w4, #22
        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel  x6, right_ext_mask, -6
        ldr     h26, [x3, w17, sxtw #1]
        sub     x6, x6, w4, uxtw #1
        dup     v26.8h, v26.h[0]
        ld1     {v23.16b, v24.16b, v25.16b}, [x6]

        bit     v2.16b, v26.16b, v23.16b
        bit     v3.16b, v26.16b, v24.16b
        bit     v4.16b, v26.16b, v25.16b

4:      // Loop horizontally
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
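        // The ext offsets below pick out the pixels at distances -3..+3
        // around the center tap (ext #6); as the filter is symmetric, the
        // +/-1, +/-2 and +/-3 pairs are summed first, so only four mul/mla
        // instructions are needed per half vector.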
        ext     v17.16b, v2.16b, v3.16b, #4
        ext     v19.16b, v2.16b, v3.16b, #8
        ext     v16.16b, v2.16b, v3.16b, #2
        ext     v20.16b, v2.16b, v3.16b, #10
        ext     v21.16b, v2.16b, v3.16b, #12
        ext     v18.16b, v2.16b, v3.16b, #6
        add     v19.8h, v19.8h, v17.8h
        add     v20.8h, v20.8h, v16.8h
        add     v21.8h, v21.8h, v2.8h
        smull   v6.4s, v18.4h, v0.h[3]
        smlal   v6.4s, v19.4h, v0.h[2]
        smlal   v6.4s, v20.4h, v0.h[1]
        smlal   v6.4s, v21.4h, v0.h[0]
        smull2  v7.4s, v18.8h, v0.h[3]
        smlal2  v7.4s, v19.8h, v0.h[2]
        smlal2  v7.4s, v20.8h, v0.h[1]
        smlal2  v7.4s, v21.8h, v0.h[0]

        ext     v17.16b, v3.16b, v4.16b, #4
        ext     v19.16b, v3.16b, v4.16b, #8
        ext     v16.16b, v3.16b, v4.16b, #2
        ext     v20.16b, v3.16b, v4.16b, #10
        ext     v21.16b, v3.16b, v4.16b, #12
        ext     v18.16b, v3.16b, v4.16b, #6

        add     v19.8h, v19.8h, v17.8h
        add     v20.8h, v20.8h, v16.8h
        add     v21.8h, v21.8h, v3.8h
        smull   v16.4s, v18.4h, v0.h[3]
        smlal   v16.4s, v19.4h, v0.h[2]
        smlal   v16.4s, v20.4h, v0.h[1]
        smlal   v16.4s, v21.4h, v0.h[0]
        smull2  v17.4s, v18.8h, v0.h[3]
        smlal2  v17.4s, v19.8h, v0.h[2]
        smlal2  v17.4s, v20.8h, v0.h[1]
        smlal2  v17.4s, v21.8h, v0.h[0]

        mvni    v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
        add     v6.4s, v6.4s, v30.4s
        add     v7.4s, v7.4s, v30.4s
        add     v16.4s, v16.4s, v30.4s
        add     v17.4s, v17.4s, v30.4s
        srshl   v6.4s, v6.4s, v29.4s
        srshl   v7.4s, v7.4s, v29.4s
        srshl   v16.4s, v16.4s, v29.4s
        srshl   v17.4s, v17.4s, v29.4s
        sqxtun  v6.4h, v6.4s
        sqxtun2 v6.8h, v7.4s
        sqxtun  v7.4h, v16.4s
        sqxtun2 v7.8h, v17.4s
        umin    v6.8h, v6.8h, v24.8h
        umin    v7.8h, v7.8h, v24.8h
        sub     v6.8h, v6.8h, v31.8h
        sub     v7.8h, v7.8h, v31.8h

        subs    w4, w4, #16

        st1     {v6.8h, v7.8h}, [x14], #32

        b.le    0f
        mov     v2.16b, v4.16b
        tst     w7, #2 // LR_HAVE_RIGHT
        ld1     {v3.8h, v4.8h}, [x3], #32
        b.ne    4b // If we don't need to pad, just keep filtering.
        b       3b // If we need to pad, check how many pixels we have left.

0:
        ldr     x14, [sp, #16]
        ldp     x3, x4, [sp], #32
        ret
endfunc

function wiener_filter7_v_16bpc_neon
        // Backing up/restoring registers shifted, so that x9 gets the value
        // of x10, etc, afterwards.
        stp     x10, x11, [sp, #-64]!
        stp     x12, x13, [sp, #16]
        stp     x14, x14, [sp, #32]
        stp     x0, x4, [sp, #48]
1:
        ld1     {v16.8h, v17.8h}, [x9], #32
        ld1     {v18.8h, v19.8h}, [x10], #32
        ld1     {v20.8h, v21.8h}, [x11], #32
        ld1     {v22.8h, v23.8h}, [x12], #32
        ld1     {v24.8h, v25.8h}, [x13], #32
        ld1     {v6.8h, v7.8h}, [x14], #32

        smull   v2.4s, v16.4h, v0.h[4]
        smlal   v2.4s, v18.4h, v0.h[5]
        smlal   v2.4s, v20.4h, v0.h[6]
        smlal   v2.4s, v22.4h, v0.h[7]
        smlal   v2.4s, v24.4h, v0.h[6]
        smlal   v2.4s, v6.4h, v0.h[5]
        smlal   v2.4s, v6.4h, v0.h[4]
        smull2  v3.4s, v16.8h, v0.h[4]
        smlal2  v3.4s, v18.8h, v0.h[5]
        smlal2  v3.4s, v20.8h, v0.h[6]
        smlal2  v3.4s, v22.8h, v0.h[7]
        smlal2  v3.4s, v24.8h, v0.h[6]
        smlal2  v3.4s, v6.8h, v0.h[5]
        smlal2  v3.4s, v6.8h, v0.h[4]
        smull   v4.4s, v17.4h, v0.h[4]
        smlal   v4.4s, v19.4h, v0.h[5]
        smlal   v4.4s, v21.4h, v0.h[6]
        smlal   v4.4s, v23.4h, v0.h[7]
        smlal   v4.4s, v25.4h, v0.h[6]
        smlal   v4.4s, v7.4h, v0.h[5]
        smlal   v4.4s, v7.4h, v0.h[4]
        smull2  v5.4s, v17.8h, v0.h[4]
        smlal2  v5.4s, v19.8h, v0.h[5]
        smlal2  v5.4s, v21.8h, v0.h[6]
        smlal2  v5.4s, v23.8h, v0.h[7]
        smlal2  v5.4s, v25.8h, v0.h[6]
        smlal2  v5.4s, v7.8h, v0.h[5]
        smlal2  v5.4s, v7.8h, v0.h[4]
        srshl   v2.4s, v2.4s, v27.4s // -round_bits_v
        srshl   v3.4s, v3.4s, v27.4s
        srshl   v4.4s, v4.4s, v27.4s
        srshl   v5.4s, v5.4s, v27.4s
        sqxtun  v2.4h, v2.4s
        sqxtun2 v2.8h, v3.4s
        sqxtun  v3.4h, v4.4s
        sqxtun2 v3.8h, v5.4s
        umin    v2.8h, v2.8h, v28.8h // bitdepth_max
        umin    v3.8h, v3.8h, v28.8h
        subs    w4, w4, #16
        st1     {v2.8h, v3.8h}, [x0], #32
        b.gt    1b

        ldp     x0, x4, [sp, #48]
        ldp     x13, x14, [sp, #32]
        ldp     x11, x12, [sp, #16]
        ldp     x9, x10, [sp], #64

        add     x0, x0, x1
        ret
endfunc

function wiener_filter7_hv_16bpc_neon
        // Backing up/restoring registers shifted, so that x9 gets the value
        // of x10, etc, and x15==x9, afterwards.
        stp     x10, x11, [sp, #-80]!
        stp     x12, x13, [sp, #16]
        stp     x14, x15, [sp, #32]
        stp     x10, x0, [sp, #48]
        stp     x3, x4, [sp, #64]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst     w7, #1 // LR_HAVE_LEFT
        b.eq    1f
        // LR_HAVE_LEFT
        cbnz    x2, 0f
        // left == NULL
        sub     x3, x3, #6
        ld1     {v2.8h, v3.8h}, [x3], #32
        b       2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1     {v2.8h, v3.8h}, [x3], #32
        ld1     {v4.d}[1], [x2], #8
        // Move x3 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub     x3, x3, #6
        ext     v3.16b, v2.16b, v3.16b, #10
        ext     v2.16b, v4.16b, v2.16b, #10
        b       2f
1:
        ld1     {v2.8h, v3.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
        // and shift v3 to have 3x the first pixel at the front.
        dup     v4.8h, v2.h[0]
        // Move x3 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub     x3, x3, #6
        ext     v3.16b, v2.16b, v3.16b, #10
        ext     v2.16b, v4.16b, v2.16b, #10

2:
        ld1     {v4.8h}, [x3], #16

        tst     w7, #2 // LR_HAVE_RIGHT
        b.ne    4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp     w4, #19
        b.ge    4f // If w >= 19, all used input pixels are valid

        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
        sub     w17, w4, #22
        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel  x6, right_ext_mask, -6
        ldr     h26, [x3, w17, sxtw #1]
        sub     x6, x6, w4, uxtw #1
        dup     v26.8h, v26.h[0]
        ld1     {v23.16b, v24.16b, v25.16b}, [x6]

        bit     v2.16b, v26.16b, v23.16b
        bit     v3.16b, v26.16b, v24.16b
        bit     v4.16b, v26.16b, v25.16b

4:      // Loop horizontally
        ext     v17.16b, v2.16b, v3.16b, #4
        ext     v19.16b, v2.16b, v3.16b, #8
        ext     v16.16b, v2.16b, v3.16b, #2
        ext     v20.16b, v2.16b, v3.16b, #10
        ext     v21.16b, v2.16b, v3.16b, #12
        ext     v18.16b, v2.16b, v3.16b, #6
        add     v19.8h, v19.8h, v17.8h
        add     v20.8h, v20.8h, v16.8h
        add     v21.8h, v21.8h, v2.8h
        smull   v6.4s, v18.4h, v0.h[3]
        smlal   v6.4s, v19.4h, v0.h[2]
        smlal   v6.4s, v20.4h, v0.h[1]
        smlal   v6.4s, v21.4h, v0.h[0]
        smull2  v7.4s, v18.8h, v0.h[3]
        smlal2  v7.4s, v19.8h, v0.h[2]
        smlal2  v7.4s, v20.8h, v0.h[1]
        smlal2  v7.4s, v21.8h, v0.h[0]

        ext     v17.16b, v3.16b, v4.16b, #4
        ext     v19.16b, v3.16b, v4.16b, #8
        ext     v16.16b, v3.16b, v4.16b, #2
        ext     v20.16b, v3.16b, v4.16b, #10
        ext     v21.16b, v3.16b, v4.16b, #12
        ext     v18.16b, v3.16b, v4.16b, #6

        add     v19.8h, v19.8h, v17.8h
        add     v20.8h, v20.8h, v16.8h
        add     v21.8h, v21.8h, v3.8h
        smull   v24.4s, v18.4h, v0.h[3]
        smlal   v24.4s, v19.4h, v0.h[2]
        smlal   v24.4s, v20.4h, v0.h[1]
        smlal   v24.4s, v21.4h, v0.h[0]
        smull2  v25.4s, v18.8h, v0.h[3]
        smlal2  v25.4s, v19.8h, v0.h[2]
        smlal2  v25.4s, v20.8h, v0.h[1]
        smlal2  v25.4s, v21.8h, v0.h[0]

        ld1     {v16.8h, v17.8h}, [x9], #32

        mvni    v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
        add     v6.4s, v6.4s, v30.4s
        add     v7.4s, v7.4s, v30.4s
        add     v24.4s, v24.4s, v30.4s
        add     v25.4s, v25.4s, v30.4s
        ld1     {v18.8h, v19.8h}, [x10], #32
        srshl   v6.4s, v6.4s, v29.4s
        srshl   v7.4s, v7.4s, v29.4s
        srshl   v24.4s, v24.4s, v29.4s
        srshl   v25.4s, v25.4s, v29.4s
        ld1     {v20.8h, v21.8h}, [x11], #32
        sqxtun  v6.4h, v6.4s
        sqxtun2 v6.8h, v7.4s
        sqxtun  v7.4h, v24.4s
        sqxtun2 v7.8h, v25.4s
        ld1     {v22.8h, v23.8h}, [x12], #32
        umin    v6.8h, v6.8h, v26.8h
        umin    v7.8h, v7.8h, v26.8h
        ld1     {v24.8h, v25.8h}, [x13], #32
        sub     v6.8h, v6.8h, v31.8h
        sub     v7.8h, v7.8h, v31.8h

        ld1     {v8.8h, v9.8h}, [x14], #32

        smull   v1.4s, v16.4h, v0.h[4]
        smlal   v1.4s, v18.4h, v0.h[5]
        smlal   v1.4s, v20.4h, v0.h[6]
        smlal   v1.4s, v22.4h, v0.h[7]
        smlal   v1.4s, v24.4h, v0.h[6]
        smlal   v1.4s, v8.4h, v0.h[5]
        smlal   v1.4s, v6.4h, v0.h[4]
        smull2  v5.4s, v16.8h, v0.h[4]
        smlal2  v5.4s, v18.8h, v0.h[5]
        smlal2  v5.4s, v20.8h, v0.h[6]
        smlal2  v5.4s, v22.8h, v0.h[7]
        smlal2  v5.4s, v24.8h, v0.h[6]
        smlal2  v5.4s, v8.8h, v0.h[5]
        smlal2  v5.4s, v6.8h, v0.h[4]
        smull   v26.4s, v17.4h, v0.h[4]
        smlal   v26.4s, v19.4h, v0.h[5]
        smlal   v26.4s, v21.4h, v0.h[6]
        smlal   v26.4s, v23.4h, v0.h[7]
        smlal   v26.4s, v25.4h, v0.h[6]
        smlal   v26.4s, v9.4h, v0.h[5]
        smlal   v26.4s, v7.4h, v0.h[4]
        smull2  v16.4s, v17.8h, v0.h[4]
        smlal2  v16.4s, v19.8h, v0.h[5]
        smlal2  v16.4s, v21.8h, v0.h[6]
        smlal2  v16.4s, v23.8h, v0.h[7]
        smlal2  v16.4s, v25.8h, v0.h[6]
        smlal2  v16.4s, v9.8h, v0.h[5]
        smlal2  v16.4s, v7.8h, v0.h[4]
        srshl   v1.4s, v1.4s, v27.4s // -round_bits_v
        srshl   v5.4s, v5.4s, v27.4s
        srshl   v26.4s, v26.4s, v27.4s
        srshl   v16.4s, v16.4s, v27.4s
        sqxtun  v18.4h, v1.4s
        sqxtun2 v18.8h, v5.4s
        sqxtun  v19.4h, v26.4s
        sqxtun2 v19.8h, v16.4s
        st1     {v6.8h, v7.8h}, [x15], #32
        umin    v18.8h, v18.8h, v28.8h // bitdepth_max
        umin    v19.8h, v19.8h, v28.8h
        subs    w4, w4, #16

        st1     {v18.8h, v19.8h}, [x0], #32

        b.le    0f
        mov     v2.16b, v4.16b
        tst     w7, #2 // LR_HAVE_RIGHT
        ld1     {v3.8h, v4.8h}, [x3], #32
        b.ne    4b // If we don't need to pad, just keep filtering.
        b       3b // If we need to pad, check how many pixels we have left.

0:
        ldp     x3, x4, [sp, #64]
        ldp     x15, x0, [sp, #48]
        ldp     x13, x14, [sp, #32]
        ldp     x11, x12, [sp, #16]
        ldp     x9, x10, [sp], #80

        add     x3, x3, x1
        add     x0, x0, x1

        ret
endfunc

// void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
//                                      const pixel (*left)[4], const pixel *lpf,
//                                      const int w, int h,
//                                      const int16_t filter[2][8],
//                                      const enum LrEdgeFlags edges,
//                                      const int bitdepth_max);
function wiener_filter5_16bpc_neon, export=1
        ldr     w8, [sp]
        AARCH64_SIGN_LINK_REGISTER
        stp     x29, x30, [sp, #-32]!
        stp     d8, d9, [sp, #16]
        mov     x29, sp
        ld1     {v0.8h, v1.8h}, [x6]
        tst     w7, #4 // LR_HAVE_TOP
        sub_sp  384*2*4

        dup     v28.8h, w8 // bitdepth_max
        clz     w8, w8
        movi    v30.4s, #1
        sub     w10, w8, #38 // -(bitdepth + 6)
        sub     w11, w8, #11 // round_bits_v
        sub     w8, w8, #25 // -round_bits_h
        neg     w10, w10 // bitdepth + 6
        neg     w11, w11 // -round_bits_v
        dup     v2.4s, w10
        dup     v29.4s, w8 // -round_bits_h
        dup     v27.4s, w11 // -round_bits_v
        movi    v31.8h, #0x20, lsl #8 // 1 << 13 = 8192
        ushl    v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6)

        zip1    v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1

        // x11 - t4
        // x12 - t3
        // x13 - t2
        // x14 - t1
        // x15 - t0
        mov     x14, sp // t1
        b.eq    L(no_top_5)

        mov     x16, x2 // backup left
        mov     x2, #0
        bl      wiener_filter5_h_16bpc_neon
        add     x3, x3, x1 // lpf += stride
        mov     x11, x14 // t4
        add     x14, x14, #384*2 // t1 += 384*2
        bl      wiener_filter5_h_16bpc_neon
        add     x3, x3, x1, lsl #2
        add     x3, x3, x1 // lpf += stride*5
        mov     x12, x14 // t3
        add     x14, x14, #384*2 // t1 += 384*2
        mov     x2, x16 // left
        mov     x16, x3 // backup lpf
        mov     x3, x0 // lpf = p
        bl      wiener_filter5_h_16bpc_neon
        subs    w5, w5, #1 // h--
        mov     x13, x14 // t2
        b.eq    L(v1_5)
        add     x3, x3, x1 // src += stride
        add     x14, x14, #384*2 // t1 += 384*2
        bl      wiener_filter5_h_16bpc_neon
        subs    w5, w5, #1 // h--
        b.eq    L(v2_5)
        add     x3, x3, x1 // src += stride

L(main_5):
        mov     x15, x11 // t0 = t4
L(main_loop_5):
        bl      wiener_filter5_hv_16bpc_neon
        subs    w5, w5, #1 // h--
        b.ne    L(main_loop_5)
        tst     w7, #8 // LR_HAVE_BOTTOM
        b.eq    L(v2_5)

        mov     x3, x16 // restore lpf
        mov     x2, #0 // left = NULL
        bl      wiener_filter5_hv_16bpc_neon
        bl      wiener_filter5_hv_16bpc_neon
L(end_5):

        mov     sp, x29
        ldp     d8, d9, [sp, #16]
        ldp     x29, x30, [sp], #32
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(no_top_5):
        add     x3, x3, x1, lsl #2
        add     x16, x3, x1, lsl #1 // lpf += stride*6, backup
        mov     x3, x0 // lpf = p

        bl      wiener_filter5_h_16bpc_neon
        subs    w5, w5, #1 // h--
        mov     x11, x14 // t4
        mov     x12, x14 // t3
        mov     x13, x14 // t2
        b.eq    L(v1_5)
        add     x3, x3, x1 // src += stride
        add     x14, x14, #384*2 // t1 += 384*2
        bl      wiener_filter5_h_16bpc_neon
        subs    w5, w5, #1 // h--
        b.eq    L(v2_5)
        add     x3, x3, x1 // src += stride
        add     x15, x14, #384*2 // t0 = t1 + 384*2
        bl      wiener_filter5_hv_16bpc_neon
        subs    w5, w5, #1 // h--
        b.eq    L(v2_5)
        add     x15, x15, #384*2*3 // t0 += 384*2*3
        bl      wiener_filter5_hv_16bpc_neon
        subs    w5, w5, #1 // h--
        b.ne    L(main_5)
L(v2_5):
        bl      wiener_filter5_v_16bpc_neon
        add     x0, x0, x1
        mov     x11, x12
        mov     x12, x13
        mov     x13, x14
L(v1_5):
        bl      wiener_filter5_v_16bpc_neon
        b       L(end_5)
endfunc


function wiener_filter5_h_16bpc_neon
        stp     x3, x4, [sp, #-32]!
        str     x14, [sp, #16]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst     w7, #1 // LR_HAVE_LEFT
        b.eq    1f
        // LR_HAVE_LEFT
        cbnz    x2, 0f
        // left == NULL
        sub     x3, x3, #4
        ld1     {v2.8h, v3.8h}, [x3], #32
        b       2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1     {v2.8h, v3.8h}, [x3], #32
        ld1     {v4.d}[1], [x2], #8
        // Move x3 back to account for the last 2 pixels we loaded earlier,
        // which we'll shift out.
        sub     x3, x3, #4
        ext     v3.16b, v2.16b, v3.16b, #12
        ext     v2.16b, v4.16b, v2.16b, #12
        b       2f

1:
        ld1     {v2.8h, v3.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
        // and shift v3 to have 2x the first pixel at the front.
        dup     v4.8h, v2.h[0]
        // Move x3 back to account for the last 2 pixels we loaded before,
        // which we shifted out.
        sub     x3, x3, #4
        ext     v3.16b, v2.16b, v3.16b, #12
        ext     v2.16b, v4.16b, v2.16b, #12

2:
        ld1     {v4.8h}, [x3], #16

        tst     w7, #2 // LR_HAVE_RIGHT
        b.ne    4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp     w4, #18
        b.ge    4f // If w >= 18, all used input pixels are valid

        // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
        sub     w17, w4, #23
        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
        // buffer pointer.
        movrel  x6, right_ext_mask, -4
        ldr     h26, [x3, w17, sxtw #1]
        sub     x6, x6, w4, uxtw #1
        dup     v26.8h, v26.h[0]
        ld1     {v23.16b, v24.16b, v25.16b}, [x6]

        bit     v2.16b, v26.16b, v23.16b
        bit     v3.16b, v26.16b, v24.16b
        bit     v4.16b, v26.16b, v25.16b

4:      // Loop horizontally
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
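        // Same scheme as the 7-tap horizontal filter above, reduced to five
        // taps: ext #4 is the center, and the +/-1 pair (ext #6 and #2) and
        // the +/-2 pair (ext #8 and v2 itself) are pre-added, leaving three
        // mul/mla instructions per half vector.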
        ext     v16.16b, v2.16b, v3.16b, #2
        ext     v18.16b, v2.16b, v3.16b, #6
        ext     v19.16b, v2.16b, v3.16b, #8
        ext     v17.16b, v2.16b, v3.16b, #4
        add     v18.8h, v18.8h, v16.8h
        add     v19.8h, v19.8h, v2.8h
        smull   v6.4s, v17.4h, v0.h[3]
        smlal   v6.4s, v18.4h, v0.h[2]
        smlal   v6.4s, v19.4h, v0.h[1]
        smull2  v7.4s, v17.8h, v0.h[3]
        smlal2  v7.4s, v18.8h, v0.h[2]
        smlal2  v7.4s, v19.8h, v0.h[1]

        ext     v16.16b, v3.16b, v4.16b, #2
        ext     v18.16b, v3.16b, v4.16b, #6
        ext     v19.16b, v3.16b, v4.16b, #8
        ext     v17.16b, v3.16b, v4.16b, #4
        add     v18.8h, v18.8h, v16.8h
        add     v19.8h, v19.8h, v3.8h
        smull   v16.4s, v17.4h, v0.h[3]
        smlal   v16.4s, v18.4h, v0.h[2]
        smlal   v16.4s, v19.4h, v0.h[1]
        smull2  v17.4s, v17.8h, v0.h[3]
        smlal2  v17.4s, v18.8h, v0.h[2]
        smlal2  v17.4s, v19.8h, v0.h[1]

        mvni    v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
        add     v6.4s, v6.4s, v30.4s
        add     v7.4s, v7.4s, v30.4s
        add     v16.4s, v16.4s, v30.4s
        add     v17.4s, v17.4s, v30.4s
        srshl   v6.4s, v6.4s, v29.4s
        srshl   v7.4s, v7.4s, v29.4s
        srshl   v16.4s, v16.4s, v29.4s
        srshl   v17.4s, v17.4s, v29.4s
        sqxtun  v6.4h, v6.4s
        sqxtun2 v6.8h, v7.4s
        sqxtun  v7.4h, v16.4s
        sqxtun2 v7.8h, v17.4s
        umin    v6.8h, v6.8h, v24.8h
        umin    v7.8h, v7.8h, v24.8h
        sub     v6.8h, v6.8h, v31.8h
        sub     v7.8h, v7.8h, v31.8h

        subs    w4, w4, #16

        st1     {v6.8h, v7.8h}, [x14], #32

        b.le    0f
        mov     v2.16b, v4.16b
        tst     w7, #2 // LR_HAVE_RIGHT
        ld1     {v3.8h, v4.8h}, [x3], #32
        b.ne    4b // If we don't need to pad, just keep filtering.
        b       3b // If we need to pad, check how many pixels we have left.

0:
        ldr     x14, [sp, #16]
        ldp     x3, x4, [sp], #32
        ret
endfunc

function wiener_filter5_v_16bpc_neon
        stp     x11, x12, [sp, #-48]!
        stp     x13, x14, [sp, #16]
        stp     x0, x4, [sp, #32]
1:
        ld1     {v16.8h, v17.8h}, [x11], #32
        ld1     {v18.8h, v19.8h}, [x12], #32
        ld1     {v20.8h, v21.8h}, [x13], #32
        ld1     {v22.8h, v23.8h}, [x14], #32

        smull   v2.4s, v16.4h, v0.h[5]
        smlal   v2.4s, v18.4h, v0.h[6]
        smlal   v2.4s, v20.4h, v0.h[7]
        smlal   v2.4s, v22.4h, v0.h[6]
        smlal   v2.4s, v22.4h, v0.h[5]
        smull2  v3.4s, v16.8h, v0.h[5]
        smlal2  v3.4s, v18.8h, v0.h[6]
        smlal2  v3.4s, v20.8h, v0.h[7]
        smlal2  v3.4s, v22.8h, v0.h[6]
        smlal2  v3.4s, v22.8h, v0.h[5]
        smull   v4.4s, v17.4h, v0.h[5]
        smlal   v4.4s, v19.4h, v0.h[6]
        smlal   v4.4s, v21.4h, v0.h[7]
        smlal   v4.4s, v23.4h, v0.h[6]
        smlal   v4.4s, v23.4h, v0.h[5]
        smull2  v5.4s, v17.8h, v0.h[5]
        smlal2  v5.4s, v19.8h, v0.h[6]
        smlal2  v5.4s, v21.8h, v0.h[7]
        smlal2  v5.4s, v23.8h, v0.h[6]
        smlal2  v5.4s, v23.8h, v0.h[5]
        srshl   v2.4s, v2.4s, v27.4s // -round_bits_v
        srshl   v3.4s, v3.4s, v27.4s
        srshl   v4.4s, v4.4s, v27.4s
        srshl   v5.4s, v5.4s, v27.4s
        sqxtun  v2.4h, v2.4s
        sqxtun2 v2.8h, v3.4s
        sqxtun  v3.4h, v4.4s
        sqxtun2 v3.8h, v5.4s
        umin    v2.8h, v2.8h, v28.8h // bitdepth_max
        umin    v3.8h, v3.8h, v28.8h

        subs    w4, w4, #16
        st1     {v2.8h, v3.8h}, [x0], #32
        b.gt    1b

        ldp     x0, x4, [sp, #32]
        ldp     x13, x14, [sp, #16]
        ldp     x11, x12, [sp], #48

        ret
endfunc

function wiener_filter5_hv_16bpc_neon
        // Backing up/restoring registers shifted, so that x11 gets the value
        // of x12, etc, and x15==x11, afterwards.
        stp     x12, x13, [sp, #-64]!
        stp     x14, x15, [sp, #16]
        stp     x12, x0, [sp, #32]
        stp     x3, x4, [sp, #48]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst     w7, #1 // LR_HAVE_LEFT
        b.eq    1f
        // LR_HAVE_LEFT
        cbnz    x2, 0f
        // left == NULL
        sub     x3, x3, #4
        ld1     {v2.8h, v3.8h}, [x3], #32
        b       2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1     {v2.8h, v3.8h}, [x3], #32
        ld1     {v4.d}[1], [x2], #8
        // Move x3 back to account for the last 2 pixels we loaded earlier,
        // which we'll shift out.
        sub     x3, x3, #4
        ext     v3.16b, v2.16b, v3.16b, #12
        ext     v2.16b, v4.16b, v2.16b, #12
        b       2f
1:
        ld1     {v2.8h, v3.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
        // and shift v3 to have 2x the first pixel at the front.
        dup     v4.8h, v2.h[0]
        // Move x3 back to account for the last 2 pixels we loaded before,
        // which we shifted out.
        sub     x3, x3, #4
        ext     v3.16b, v2.16b, v3.16b, #12
        ext     v2.16b, v4.16b, v2.16b, #12

2:
        ld1     {v4.8h}, [x3], #16

        tst     w7, #2 // LR_HAVE_RIGHT
        b.ne    4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp     w4, #18
        b.ge    4f // If w >= 18, all used input pixels are valid

        // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
        sub     w17, w4, #23
        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
        // buffer pointer.
        movrel  x6, right_ext_mask, -4
        ldr     h26, [x3, w17, sxtw #1]
        sub     x6, x6, w4, uxtw #1
        dup     v26.8h, v26.h[0]
        ld1     {v23.16b, v24.16b, v25.16b}, [x6]

        bit     v2.16b, v26.16b, v23.16b
        bit     v3.16b, v26.16b, v24.16b
        bit     v4.16b, v26.16b, v25.16b

4:      // Loop horizontally
        ext     v16.16b, v2.16b, v3.16b, #2
        ext     v18.16b, v2.16b, v3.16b, #6
        ext     v19.16b, v2.16b, v3.16b, #8
        ext     v17.16b, v2.16b, v3.16b, #4
        add     v18.8h, v18.8h, v16.8h
        add     v19.8h, v19.8h, v2.8h
        smull   v6.4s, v17.4h, v0.h[3]
        smlal   v6.4s, v18.4h, v0.h[2]
        smlal   v6.4s, v19.4h, v0.h[1]
        smull2  v7.4s, v17.8h, v0.h[3]
        smlal2  v7.4s, v18.8h, v0.h[2]
        smlal2  v7.4s, v19.8h, v0.h[1]

        ext     v16.16b, v3.16b, v4.16b, #2
        ext     v18.16b, v3.16b, v4.16b, #6
        ext     v19.16b, v3.16b, v4.16b, #8
        ext     v17.16b, v3.16b, v4.16b, #4
        add     v18.8h, v18.8h, v16.8h
        add     v19.8h, v19.8h, v3.8h
        smull   v24.4s, v17.4h, v0.h[3]
        smlal   v24.4s, v18.4h, v0.h[2]
        smlal   v24.4s, v19.4h, v0.h[1]
        smull2  v25.4s, v17.8h, v0.h[3]
        smlal2  v25.4s, v18.8h, v0.h[2]
        smlal2  v25.4s, v19.8h, v0.h[1]

        ld1     {v16.8h, v17.8h}, [x11], #32
        mvni    v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
        add     v6.4s, v6.4s, v30.4s
        add     v7.4s, v7.4s, v30.4s
        add     v24.4s, v24.4s, v30.4s
        add     v25.4s, v25.4s, v30.4s
        ld1     {v18.8h, v19.8h}, [x12], #32
        srshl   v6.4s, v6.4s, v29.4s
        srshl   v7.4s, v7.4s, v29.4s
        srshl   v24.4s, v24.4s, v29.4s
        srshl   v25.4s, v25.4s, v29.4s
        ld1     {v20.8h, v21.8h}, [x13], #32
        sqxtun  v6.4h, v6.4s
        sqxtun2 v6.8h, v7.4s
        sqxtun  v7.4h, v24.4s
        sqxtun2 v7.8h, v25.4s
        ld1     {v22.8h, v23.8h}, [x14], #32
        umin    v6.8h, v6.8h, v26.8h
        umin    v7.8h, v7.8h, v26.8h
        sub     v6.8h, v6.8h, v31.8h
        sub     v7.8h, v7.8h, v31.8h

        smull   v8.4s, v16.4h, v0.h[5]
        smlal   v8.4s, v18.4h, v0.h[6]
        smlal   v8.4s, v20.4h, v0.h[7]
        smlal   v8.4s, v22.4h, v0.h[6]
        smlal   v8.4s, v6.4h, v0.h[5]
        smull2  v9.4s, v16.8h, v0.h[5]
        smlal2  v9.4s, v18.8h, v0.h[6]
        smlal2  v9.4s, v20.8h, v0.h[7]
        smlal2  v9.4s, v22.8h, v0.h[6]
        smlal2  v9.4s, v6.8h, v0.h[5]
        smull   v1.4s, v17.4h, v0.h[5]
        smlal   v1.4s, v19.4h, v0.h[6]
        smlal   v1.4s, v21.4h, v0.h[7]
        smlal   v1.4s, v23.4h, v0.h[6]
        smlal   v1.4s, v7.4h, v0.h[5]
        smull2  v5.4s, v17.8h, v0.h[5]
        smlal2  v5.4s, v19.8h, v0.h[6]
        smlal2  v5.4s, v21.8h, v0.h[7]
        smlal2  v5.4s, v23.8h, v0.h[6]
        smlal2  v5.4s, v7.8h, v0.h[5]
        srshl   v8.4s, v8.4s, v27.4s // -round_bits_v
        srshl   v9.4s, v9.4s, v27.4s
        srshl   v1.4s, v1.4s, v27.4s
        srshl   v5.4s, v5.4s, v27.4s
        sqxtun  v8.4h, v8.4s
        sqxtun2 v8.8h, v9.4s
        sqxtun  v9.4h, v1.4s
        sqxtun2 v9.8h, v5.4s
        st1     {v6.8h, v7.8h}, [x15], #32
        umin    v8.8h, v8.8h, v28.8h // bitdepth_max
        umin    v9.8h, v9.8h, v28.8h

        subs    w4, w4, #16

        st1     {v8.8h, v9.8h}, [x0], #32

        b.le    0f
        mov     v2.16b, v4.16b
        tst     w7, #2 // LR_HAVE_RIGHT
        ld1     {v3.8h, v4.8h}, [x3], #32
        b.ne    4b // If we don't need to pad, just keep filtering.
        b       3b // If we need to pad, check how many pixels we have left.

0:
        ldp     x3, x4, [sp, #48]
        ldp     x15, x0, [sp, #32]
        ldp     x13, x14, [sp, #16]
        ldp     x11, x12, [sp], #64

        add     x3, x3, x1
        add     x0, x0, x1

        ret
endfunc

#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
//                                      const pixel (*left)[4],
//                                      const pixel *src, const int w,
//                                      const enum LrEdgeFlags edges);
function sgr_box3_row_h_16bpc_neon, export=1
        add     w4, w4, #2 // w += 2

        tst     w5, #1 // LR_HAVE_LEFT
        b.eq    1f
        cbnz    x2, 0f

        // LR_HAVE_LEFT && left == NULL
        sub     x3, x3, #4
        ld1     {v0.8h, v1.8h}, [x3], #32
        b       2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1     {v0.8h, v1.8h}, [x3], #32
        ld1     {v2.d}[1], [x2]
        // Move x3 back to account for the last 2 pixels we loaded earlier,
        // which we'll shift out.
        sub     x3, x3, #4
        ext     v1.16b, v0.16b, v1.16b, #12
        ext     v0.16b, v2.16b, v0.16b, #12
        b       2f

1:
        ld1     {v0.8h, v1.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
        // and shift v0/v1 to have 2x the first pixel at the front.
        dup     v2.8h, v0.h[0]
        // Move x3 back to account for the last 2 pixels we loaded before,
        // which we shifted out.
        sub     x3, x3, #4
        ext     v1.16b, v0.16b, v1.16b, #12
        ext     v0.16b, v2.16b, v0.16b, #12

2:
        tst     w5, #2 // LR_HAVE_RIGHT
        b.ne    4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub     w13, w4, #(2 + 16 - 2 + 1)
        ldr     h30, [x3, w13, sxtw #1]
        // Fill v30 with the right padding pixel
        dup     v30.8h, v30.h[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp     w4, #10
        b.ge    4f // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
        // again; it's not strictly needed in those cases (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in v0.b[w] onwards
        movrel  x13, right_ext_mask
        sub     x13, x13, w4, uxtw #1
        ld1     {v28.16b, v29.16b}, [x13]

        bit     v0.16b, v30.16b, v28.16b
        bit     v1.16b, v30.16b, v29.16b

4:      // Loop horizontally
        ext     v26.16b, v0.16b, v1.16b, #2
        ext     v27.16b, v0.16b, v1.16b, #4

        add     v6.8h, v0.8h, v26.8h
        umull   v22.4s, v0.4h, v0.4h
        umlal   v22.4s, v26.4h, v26.4h
        umlal   v22.4s, v27.4h, v27.4h
        add     v6.8h, v6.8h, v27.8h
        umull2  v23.4s, v0.8h, v0.8h
        umlal2  v23.4s, v26.8h, v26.8h
        umlal2  v23.4s, v27.8h, v27.8h

        subs    w4, w4, #8

        st1     {v6.8h}, [x1], #16
        st1     {v22.4s, v23.4s}, [x0], #32

        b.le    9f
        tst     w5, #2 // LR_HAVE_RIGHT
        mov     v0.16b, v1.16b
        ld1     {v1.8h}, [x3], #16

        b.ne    4b // If we don't need to pad, just keep summing.
        b       3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

// void dav1d_sgr_box5_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
//                                      const pixel (*left)[4],
//                                      const pixel *src, const int w,
//                                      const enum LrEdgeFlags edges);
function sgr_box5_row_h_16bpc_neon, export=1
        add     w4, w4, #2 // w += 2

        tst     w5, #1 // LR_HAVE_LEFT
        b.eq    1f
        cbnz    x2, 0f

        // LR_HAVE_LEFT && left == NULL
        sub     x3, x3, #6
        ld1     {v0.8h, v1.8h}, [x3], #32
        b       2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1     {v0.8h, v1.8h}, [x3], #32
        ld1     {v2.d}[1], [x2], #8
        // Move x3 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub     x3, x3, #6
        ext     v1.16b, v0.16b, v1.16b, #10
        ext     v0.16b, v2.16b, v0.16b, #10
        b       2f

1:
        ld1     {v0.8h, v1.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
        // and shift v0/v1 to have 3x the first pixel at the front.
        dup     v2.8h, v0.h[0]
        // Move x3 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub     x3, x3, #6
        ext     v1.16b, v0.16b, v1.16b, #10
        ext     v0.16b, v2.16b, v0.16b, #10

2:
        tst     w5, #2 // LR_HAVE_RIGHT
        b.ne    4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub     w13, w4, #(2 + 16 - 3 + 1)
        ldr     h30, [x3, w13, sxtw #1]
        // Fill v30 with the right padding pixel
        dup     v30.8h, v30.h[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp     w4, #11
        b.ge    4f // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel  x13, right_ext_mask, -1
        sub     x13, x13, w4, uxtw #1
        ld1     {v28.16b, v29.16b}, [x13]

        bit     v0.16b, v30.16b, v28.16b
        bit     v1.16b, v30.16b, v29.16b

4:      // Loop horizontally
        ext     v26.16b, v0.16b, v1.16b, #2
        ext     v27.16b, v0.16b, v1.16b, #4

        add     v6.8h, v0.8h, v26.8h
        umull   v22.4s, v0.4h, v0.4h
        umlal   v22.4s, v26.4h, v26.4h
        umlal   v22.4s, v27.4h, v27.4h
        add     v6.8h, v6.8h, v27.8h
        umull2  v23.4s, v0.8h, v0.8h
        umlal2  v23.4s, v26.8h, v26.8h
        umlal2  v23.4s, v27.8h, v27.8h

        ext     v26.16b, v0.16b, v1.16b, #6
        ext     v27.16b, v0.16b, v1.16b, #8

        add     v6.8h, v6.8h, v26.8h
        umlal   v22.4s, v26.4h, v26.4h
        umlal   v22.4s, v27.4h, v27.4h
        add     v6.8h, v6.8h, v27.8h
        umlal2  v23.4s, v26.8h, v26.8h
        umlal2  v23.4s, v27.8h, v27.8h

        subs    w4, w4, #8

        st1     {v6.8h}, [x1], #16
        st1     {v22.4s, v23.4s}, [x0], #32

        b.le    9f
        tst     w5, #2 // LR_HAVE_RIGHT
        mov     v0.16b, v1.16b
        ld1     {v1.8h}, [x3], #16

        b.ne    4b // If we don't need to pad, just keep summing.
        b       3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

// void dav1d_sgr_box35_row_h_16bpc_neon(int32_t *sumsq3, int16_t *sum3,
//                                       int32_t *sumsq5, int16_t *sum5,
//                                       const pixel (*left)[4],
//                                       const pixel *src, const int w,
//                                       const enum LrEdgeFlags edges);
function sgr_box35_row_h_16bpc_neon, export=1
        add     w6, w6, #2 // w += 2

        tst     w7, #1 // LR_HAVE_LEFT
        b.eq    1f
        cbnz    x4, 0f

        // LR_HAVE_LEFT && left == NULL
        sub     x5, x5, #6
        ld1     {v0.8h, v1.8h}, [x5], #32
        b       2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1     {v0.8h, v1.8h}, [x5], #32
        ld1     {v2.d}[1], [x4], #8
        // Move x5 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub     x5, x5, #6
        ext     v1.16b, v0.16b, v1.16b, #10
        ext     v0.16b, v2.16b, v0.16b, #10
        b       2f

1:
        ld1     {v0.8h, v1.8h}, [x5], #32
        // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
        // and shift v0/v1 to have 3x the first pixel at the front.
        dup     v2.8h, v0.h[0]
        // Move x5 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub     x5, x5, #6
        ext     v1.16b, v0.16b, v1.16b, #10
        ext     v0.16b, v2.16b, v0.16b, #10

2:
        tst     w7, #2 // LR_HAVE_RIGHT
        b.ne    4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub     w13, w6, #(2 + 16 - 3 + 1)
        ldr     h30, [x5, w13, sxtw #1]
        // Fill v30 with the right padding pixel
        dup     v30.8h, v30.h[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp     w6, #11
        b.ge    4f // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel  x13, right_ext_mask, -1
        sub     x13, x13, w6, uxtw #1
        ld1     {v28.16b, v29.16b}, [x13]

        bit     v0.16b, v30.16b, v28.16b
        bit     v1.16b, v30.16b, v29.16b

4:      // Loop horizontally
        ext     v16.16b, v0.16b, v1.16b, #2
        ext     v17.16b, v0.16b, v1.16b, #4
        ext     v19.16b, v0.16b, v1.16b, #8
        ext     v18.16b, v0.16b, v1.16b, #6

        add     v20.8h, v16.8h, v17.8h
        add     v21.8h, v0.8h, v19.8h
        add     v20.8h, v20.8h, v18.8h

        umull   v22.4s, v16.4h, v16.4h
        umlal   v22.4s, v17.4h, v17.4h
        umlal   v22.4s, v18.4h, v18.4h

        umull2  v23.4s, v16.8h, v16.8h
        umlal2  v23.4s, v17.8h, v17.8h
        umlal2  v23.4s, v18.8h, v18.8h

        add     v21.8h, v21.8h, v20.8h
        st1     {v20.8h}, [x1], #16
        st1     {v22.4s, v23.4s}, [x0], #32

        umlal   v22.4s, v0.4h, v0.4h
        umlal   v22.4s, v19.4h, v19.4h

        umlal2  v23.4s, v0.8h, v0.8h
        umlal2  v23.4s, v19.8h, v19.8h

        subs    w6, w6, #8

        st1     {v21.8h}, [x3], #16
        st1     {v22.4s, v23.4s}, [x2], #32

        b.le    9f
        tst     w7, #2 // LR_HAVE_RIGHT
        mov     v0.16b, v1.16b
        ld1     {v1.8h}, [x5], #16

        b.ne    4b // If we don't need to pad, just keep summing.
        b       3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

sgr_funcs 16