/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

const right_ext_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
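
// Reference sketch of what the horizontal pass below computes per output
// pixel (C-like pseudocode, not the dav1d C implementation; the code relies
// on the Wiener filter being symmetric, fh[k] == fh[6 - k]):
//
//     int32_t sum = 1 << (bitdepth + 6);
//     for (int k = 0; k < 7; k++)
//         sum += src[x + k] * fh[k];
//     sum = (sum + (1 << (round_bits_h - 1))) >> round_bits_h; // rounding shift
//     mid[x] = min(max(sum, 0), 0x7fff) - 8192;
//
// round_bits_h is 3 for 10 bpc and 5 for 12 bpc. The 1 << (bitdepth + 6) term
// keeps the clamped intermediate non-negative; subtracting 8192 (that same
// offset after the shift) re-centres it so that the result fits in int16_t.
// (mid[] here stands for the int16_t *dst output, which wiener_filter_v
// consumes further below.)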

// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
//                                       const pixel *src, ptrdiff_t stride,
//                                       const int16_t fh[7], const intptr_t w,
//                                       int h, enum LrEdgeFlags edges,
//                                       const int bitdepth_max);
function wiener_filter_h_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]
        ldrd            r6,  r7,  [sp, #108]
        ldr             r8,  [sp, #116]   // bitdepth_max
        vld1.16         {q0},  [r4, :128]
        clz             r8,  r8
        vmov.i32        q14, #1
        sub             r9,  r8,  #38     // -(bitdepth + 6)
        sub             r8,  r8,  #25     // -round_bits_h
        neg             r9,  r9           // bitdepth + 6
        vdup.32         q1,  r9
        vdup.32         q13, r8           // -round_bits_h
        vmov.i16        q15, #8192
        vshl.u32        q14, q14, q1      // 1 << (bitdepth + 6)
        mov             r8,  r5
        // Calculate mid_stride
        add             r10, r5,  #7
        bic             r10, r10, #7
        lsl             r10, r10, #1

        // Set up pointers for reading/writing alternate rows
        add             r12, r0,  r10
        lsl             r10, r10, #1
        add             lr,  r2,  r3
        lsl             r3,  r3,  #1

        // Subtract the aligned width from mid_stride
        add             r11, r5,  #7
        bic             r11, r11, #7
        sub             r10, r10, r11, lsl #1

        // Subtract the number of pixels read from the source stride
        add             r11, r11, #8
        sub             r3,  r3,  r11, lsl #1

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r7,  #1 // LR_HAVE_LEFT
        beq             2f
        // LR_HAVE_LEFT
        cmp             r1,  #0
        bne             0f
        // left == NULL
        sub             r2,  r2,  #6
        sub             lr,  lr,  #6
        b               1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 3 pixels from the src pointer,
        // but shift it as if we had done that.
        add             r3,  r3,  #6


1:      // Loop vertically
        vld1.16         {q2, q3},  [r2]!
        vld1.16         {q4, q5},  [lr]!

        tst             r7,  #1 // LR_HAVE_LEFT
        beq             0f
        cmp             r1,  #0
        beq             2f
        // LR_HAVE_LEFT, left != NULL
        vld1.16         {d3},  [r1]!
        // Move r2/lr back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r2,  r2,  #6
        sub             lr,  lr,  #6
        vld1.16         {d13}, [r1]!
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q1,  q2,  #10
        vext.8          q5,  q4,  q5,  #10
        vext.8          q4,  q6,  q4,  #10
        b               2f
0:
        // !LR_HAVE_LEFT, fill q1 with the leftmost pixel
        // and shift q2/q3 to have 3x the first pixel at the front.
        vdup.16         q1,  d4[0]
        vdup.16         q6,  d8[0]
        // Move r2 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             r2,  r2,  #6
        sub             lr,  lr,  #6
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q1,  q2,  #10
        vext.8          q5,  q4,  q5,  #10
        vext.8          q4,  q6,  q4,  #10

2:

        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub             r9,  r5,  #14
        lsl             r9,  r9,  #1
        ldrh            r11, [r2, r9]
        ldrh            r9,  [lr, r9]
        // Fill q11/q12 with the right padding pixel
        vdup.16         q11, r11
        vdup.16         q12, r9
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r5,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel_local    r4,  right_ext_mask, -6
        sub             r4,  r4,  r5,  lsl #1
        vld1.8          {q9, q10}, [r4]

        vbit            q2,  q11, q9
        vbit            q3,  q11, q10
        vbit            q4,  q12, q9
        vbit            q5,  q12, q10

4:      // Loop horizontally
        vext.8          q7,  q2,  q3,  #4
        vext.8          q8,  q2,  q3,  #8
        vext.8          q6,  q2,  q3,  #2
        vext.8          q9,  q2,  q3,  #10
        vadd.i16        q8,  q8,  q7
        vadd.i16        q9,  q9,  q6
        vext.8          q6,  q2,  q3,  #12
        vext.8          q7,  q2,  q3,  #6
        vadd.i16        q2,  q2,  q6
        vmull.s16       q6,  d14, d0[3]
        vmlal.s16       q6,  d16, d1[0]
        vmlal.s16       q6,  d18, d1[1]
        vmlal.s16       q6,  d4,  d1[2]
        vmull.s16       q7,  d15, d0[3]
        vmlal.s16       q7,  d17, d1[0]
        vmlal.s16       q7,  d19, d1[1]
        vmlal.s16       q7,  d5,  d1[2]

        vext.8          q8,  q4,  q5,  #4
        vext.8          q10, q4,  q5,  #8
        vext.8          q9,  q4,  q5,  #2
        vext.8          q2,  q4,  q5,  #10
        vadd.i16        q10, q10, q8
        vadd.i16        q2,  q2,  q9
        vext.8          q8,  q4,  q5,  #12
        vext.8          q9,  q4,  q5,  #6
        vadd.i16        q4,  q4,  q8
        vmull.s16       q8,  d18, d0[3]
        vmlal.s16       q8,  d20, d1[0]
        vmlal.s16       q8,  d4,  d1[1]
        vmlal.s16       q8,  d8,  d1[2]
        vmull.s16       q9,  d19, d0[3]
        vmlal.s16       q9,  d21, d1[0]
        vmlal.s16       q9,  d5,  d1[1]
        vmlal.s16       q9,  d9,  d1[2]

        vmvn.i16        q10, #0x8000 // 0x7fff = (1 << 15) - 1
        vadd.i32        q6,  q6,  q14
        vadd.i32        q7,  q7,  q14
        vadd.i32        q8,  q8,  q14
        vadd.i32        q9,  q9,  q14
        vrshl.s32       q6,  q6,  q13
        vrshl.s32       q7,  q7,  q13
        vrshl.s32       q8,  q8,  q13
        vrshl.s32       q9,  q9,  q13
        vqmovun.s32     d12, q6
        vqmovun.s32     d13, q7
        vqmovun.s32     d14, q8
        vqmovun.s32     d15, q9
        vmin.u16        q6,  q6,  q10
        vmin.u16        q7,  q7,  q10
        vsub.i16        q6,  q6,  q15
        vsub.i16        q7,  q7,  q15
        subs            r5,  r5,  #8
        vst1.16         {q6}, [r0,  :128]!
        vst1.16         {q7}, [r12, :128]!

        ble             9f
        tst             r7,  #2 // LR_HAVE_RIGHT
        vmov            q2,  q3
        vmov            q4,  q5
        vld1.16         {q3}, [r2]!
        vld1.16         {q5}, [lr]!
        bne             4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        subs            r6,  r6,  #2
        ble             0f
        // Jump to the next row and loop horizontally
        add             r0,  r0,  r10
        add             r12, r12, r10
        add             r2,  r2,  r3
        add             lr,  lr,  r3
        mov             r5,  r8
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

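// Reference sketch of the vertical pass below, per output pixel (C-like
// pseudocode, not the dav1d C implementation). mid_row[0..6] stand for the
// seven int16_t rows produced by wiener_filter_h above, centred on the output
// row, with rows beyond the available area padded by repeating the outermost
// available row:
//
//     int32_t sum = 0;
//     for (int k = 0; k < 7; k++)
//         sum += mid_row[k][x] * fv[k];
//     sum = (sum + (1 << (round_bits_v - 1))) >> round_bits_v; // rounding shift
//     dst[x] = min(max(sum, 0), bitdepth_max);
//
// round_bits_v is 11 for 10 bpc and 9 for 12 bpc.
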
// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
//                                       const int16_t *mid, int w, int h,
//                                       const int16_t fv[7], enum LrEdgeFlags edges,
//                                       ptrdiff_t mid_stride, const int bitdepth_max);
function wiener_filter_v_16bpc_neon, export=1
        push            {r4-r7,lr}
        vpush           {q4-q5}
        ldrd            r4,  r5,  [sp, #52]
        ldrd            r6,  r7,  [sp, #60]
        ldr             lr,  [sp, #68]   // bitdepth_max
        vld1.16         {q0},  [r5, :128]
        vdup.16         q5,  lr
        clz             lr,  lr
        sub             lr,  lr,  #11    // round_bits_v
        vdup.32         q4,  lr
        mov             lr,  r4
        vneg.s32        q4,  q4          // -round_bits_v

        // Calculate the number of rows to move back when looping vertically
        mov             r12, r4
        tst             r6,  #4 // LR_HAVE_TOP
        beq             0f
        sub             r2,  r2,  r7,  lsl #1
        add             r12, r12, #2
0:
        tst             r6,  #8 // LR_HAVE_BOTTOM
        beq             1f
        add             r12, r12, #2

1:      // Start of horizontal loop; start one vertical filter slice.
        // Load rows into q8-q11 and pad properly.
        tst             r6,  #4 // LR_HAVE_TOP
        vld1.16         {q8},  [r2, :128], r7
        beq             2f
        // LR_HAVE_TOP
        vld1.16         {q10}, [r2, :128], r7
        vmov            q9,  q8
        vld1.16         {q11}, [r2, :128], r7
        b               3f
2:      // !LR_HAVE_TOP
        vmov            q9,  q8
        vmov            q10, q8
        vmov            q11, q8

3:
        cmp             r4,  #4
        blt             5f
        // Start filtering normally; fill in q12-q14 with unique rows.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        vld1.16         {q14}, [r2, :128], r7

4:
.macro filter compare
        subs            r4,  r4,  #1
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        vmull.s16       q2,  d16, d0[0]
        vmlal.s16       q2,  d18, d0[1]
        vmlal.s16       q2,  d20, d0[2]
        vmlal.s16       q2,  d22, d0[3]
        vmlal.s16       q2,  d24, d1[0]
        vmlal.s16       q2,  d26, d1[1]
        vmlal.s16       q2,  d28, d1[2]
        vmull.s16       q3,  d17, d0[0]
        vmlal.s16       q3,  d19, d0[1]
        vmlal.s16       q3,  d21, d0[2]
        vmlal.s16       q3,  d23, d0[3]
        vmlal.s16       q3,  d25, d1[0]
        vmlal.s16       q3,  d27, d1[1]
        vmlal.s16       q3,  d29, d1[2]
        vrshl.s32       q2,  q2,  q4 // round_bits_v
        vrshl.s32       q3,  q3,  q4
        vqmovun.s32     d4,  q2
        vqmovun.s32     d5,  q3
        vmin.u16        q2,  q2,  q5 // bitdepth_max
        vst1.16         {q2}, [r0, :128], r1
.if \compare
        cmp             r4,  #4
.else
        ble             9f
.endif
        vmov            q8,  q9
        vmov            q9,  q10
        vmov            q10, q11
        vmov            q11, q12
        vmov            q12, q13
        vmov            q13, q14
.endm
        filter          1
        blt             7f
        vld1.16         {q14}, [r2, :128], r7
        b               4b

5:      // Less than 4 rows in total; not all of q12-q13 are filled yet.
        tst             r6,  #8 // LR_HAVE_BOTTOM
        beq             6f
        // LR_HAVE_BOTTOM
        cmp             r4,  #2
        // We load at least 2 rows in all cases.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        bgt             53f // 3 rows in total
        beq             52f // 2 rows in total
51:     // 1 row in total, q11 already loaded, load edge into q12-q14.
        vmov            q14, q13
        b               8f
52:     // 2 rows in total, q11 already loaded, load q12 with content data
        // and 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vmov            q15, q14
        b               8f
53:
        // 3 rows in total, q11 already loaded, load q12 and q13 with content
        // and 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vld1.16         {q15}, [r2, :128], r7
        vmov            q1,  q15
        b               8f

6:
        // !LR_HAVE_BOTTOM
        cmp             r4,  #2
        bgt             63f // 3 rows in total
        beq             62f // 2 rows in total
61:     // 1 row in total, q11 already loaded, pad that into q12-q14.
        vmov            q12, q11
        vmov            q13, q11
        vmov            q14, q11
        b               8f
62:     // 2 rows in total, q11 already loaded, load q12 and pad that into q13-q15.
        vld1.16         {q12}, [r2, :128], r7
        vmov            q13, q12
        vmov            q14, q12
        vmov            q15, q12
        b               8f
63:
        // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        vmov            q14, q13
        vmov            q15, q13
        vmov            q1,  q13
        b               8f

7:
        // All registers up to q13 are filled already, 3 valid rows left.
        // < 4 valid rows left; fill in padding and filter the last
        // few rows.
        tst             r6,  #8 // LR_HAVE_BOTTOM
        beq             71f
        // LR_HAVE_BOTTOM; load 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vld1.16         {q15}, [r2, :128], r7
        vmov            q1,  q15
        b               8f
71:
        // !LR_HAVE_BOTTOM, pad 3 rows
        vmov            q14, q13
        vmov            q15, q13
        vmov            q1,  q13

8:      // At this point, all registers up to q14-q15,q1 are loaded with
        // edge/padding (depending on how many rows are left).
        filter          0 // This branches to 9f when done
        vmov            q14, q15
        vmov            q15, q1
        b               8b

9:      // End of one vertical slice.
        subs            r3,  r3,  #8
        ble             0f
        // Move pointers back up to the top and loop horizontally.
        mls             r0,  r1,  lr,  r0
        mls             r2,  r7,  r12, r2
        add             r0,  r0,  #16
        add             r2,  r2,  #16
        mov             r4,  lr
        b               1b

0:
        vpop            {q4-q5}
        pop             {r4-r7,pc}
.purgem filter
endfunc

#define SUM_STRIDE (384+16)

#include "looprestoration_tmpl.S"

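// sgr_box3_h below computes the horizontal part of the 3x3 box sums used by
// SGR. Per output pixel of each of the two rows handled per iteration it
// produces (C-like pseudocode, not the dav1d C implementation):
//
//     sum[x]   = src[x] + src[x + 1] + src[x + 2];
//     sumsq[x] = src[x] * src[x] + src[x + 1] * src[x + 1]
//              + src[x + 2] * src[x + 2];
//
// written to rows of SUM_STRIDE elements; the vertical accumulation is
// handled by the shared code included from looprestoration_tmpl.S above.
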
// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
//                                  const pixel (*left)[4],
//                                  const pixel *src, const ptrdiff_t stride,
//                                  const int w, const int h,
//                                  const enum LrEdgeFlags edges);
function sgr_box3_h_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]
        ldrd            r6,  r7,  [sp, #108]
        add             r5,  r5,  #2 // w += 2

        // Set up pointers for reading/writing alternate rows
        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
        add             r11, r1,  #(2*SUM_STRIDE)   // sum
        add             r12, r3,  r4 // src
        lsl             r4,  r4,  #1
        mov             r9,  #(2*2*SUM_STRIDE) // double sum stride

        // Subtract the aligned width from the output stride.
        add             lr,  r5,  #7
        bic             lr,  lr,  #7
        sub             r9,  r9,  lr, lsl #1

        // Store the width for the vertical loop
        mov             r8,  r5

        // Subtract the number of pixels read from the input from the stride
        add             lr,  lr,  #8
        sub             r4,  r4,  lr, lsl #1

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r7,  #1 // LR_HAVE_LEFT
        beq             2f
        // LR_HAVE_LEFT
        cmp             r2,  #0
        bne             0f
        // left == NULL
        sub             r3,  r3,  #4
        sub             r12, r12, #4
        b               1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 2 pixels from the src pointer,
        // but shift it as if we had done that.
        add             r4,  r4,  #4


1:      // Loop vertically
        vld1.16         {q0, q1}, [r3]!
        vld1.16         {q4, q5}, [r12]!

        tst             r7,  #1 // LR_HAVE_LEFT
        beq             0f
        cmp             r2,  #0
        beq             2f
        // LR_HAVE_LEFT, left != NULL
        vld1.16         {d5},  [r2]!
        // Move r3/r12 back to account for the last 2 pixels we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #4
        sub             r12, r12, #4
        vld1.16         {d13}, [r2]!
        vext.8          q1,  q0,  q1,  #12
        vext.8          q0,  q2,  q0,  #12
        vext.8          q5,  q4,  q5,  #12
        vext.8          q4,  q6,  q4,  #12
        b               2f
0:
        // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
        // and shift q0 to have 2x the first pixel at the front.
        vdup.16         q2,  d0[0]
        vdup.16         q6,  d8[0]
        // Move r3 back to account for the last 2 pixels we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #4
        sub             r12, r12, #4
        vext.8          q1,  q0,  q1,  #12
        vext.8          q0,  q2,  q0,  #12
        vext.8          q5,  q4,  q5,  #12
        vext.8          q4,  q6,  q4,  #12

2:
        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r5,  #(2 + 16 - 2 + 1)
        lsl             lr,  lr,  #1
        ldrh            r11, [r3,  lr]
        ldrh            lr,  [r12, lr]
        // Fill q14/q15 with the right padding pixel
        vdup.16         q14, r11
        vdup.16         q15, lr
        // Restore r11 after using it for a temporary value
        add             r11, r1,  #(2*SUM_STRIDE)
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r5,  #10
        bge             4f   // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
        // again; it's not strictly needed in that case (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in q0/1.h[w] onwards
        movrel_local    lr,  right_ext_mask
        sub             lr,  lr,  r5,  lsl #1
        vld1.8          {q12, q13}, [lr]

        vbit            q0,  q14, q12
        vbit            q1,  q14, q13
        vbit            q4,  q15, q12
        vbit            q5,  q15, q13

4:      // Loop horizontally
        vext.8          q8,  q0,  q1,  #2
        vext.8          q10, q4,  q5,  #2
        vext.8          q9,  q0,  q1,  #4
        vext.8          q11, q4,  q5,  #4
        vadd.i16        q2,  q0,  q8
        vadd.i16        q3,  q4,  q10
        vadd.i16        q2,  q2,  q9
        vadd.i16        q3,  q3,  q11

        vmull.u16       q6,  d0,  d0
        vmlal.u16       q6,  d16, d16
        vmlal.u16       q6,  d18, d18
        vmull.u16       q12, d8,  d8
        vmlal.u16       q12, d20, d20
        vmlal.u16       q12, d22, d22
        vmull.u16       q7,  d1,  d1
        vmlal.u16       q7,  d17, d17
        vmlal.u16       q7,  d19, d19
        vmull.u16       q13, d9,  d9
        vmlal.u16       q13, d21, d21
        vmlal.u16       q13, d23, d23
        subs            r5,  r5,  #8
        vst1.16         {q2},       [r1,  :128]!
        vst1.16         {q3},       [r11, :128]!
        vst1.32         {q6, q7},   [r0,  :128]!
        vst1.32         {q12, q13}, [r10, :128]!

        ble             9f
        tst             r7,  #2 // LR_HAVE_RIGHT
        vmov            q0,  q1
        vmov            q4,  q5
        vld1.16         {q1}, [r3]!
        vld1.16         {q5}, [r12]!

        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        subs            r6,  r6,  #2
        ble             0f
        // Jump to the next row and loop horizontally
        add             r0,  r0,  r9, lsl #1
        add             r10, r10, r9, lsl #1
        add             r1,  r1,  r9
        add             r11, r11, r9
        add             r3,  r3,  r4
        add             r12, r12, r4
        mov             r5,  r8
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

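// sgr_box5_h below is the 5x5 counterpart of sgr_box3_h: per output pixel it
// produces (C-like pseudocode, not the dav1d C implementation)
//
//     sum[x]   = src[x] + src[x + 1] + src[x + 2] + src[x + 3] + src[x + 4];
//     sumsq[x] = src[x] * src[x] + src[x + 1] * src[x + 1] + ...
//              + src[x + 4] * src[x + 4];
//
// accumulated below in two steps: the first three taps (same pattern as in
// sgr_box3_h), then the remaining two.
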
// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
//                                  const pixel (*left)[4],
//                                  const pixel *src, const ptrdiff_t stride,
//                                  const int w, const int h,
//                                  const enum LrEdgeFlags edges);
function sgr_box5_h_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]
        ldrd            r6,  r7,  [sp, #108]
        add             r5,  r5,  #2 // w += 2

        // Set up pointers for reading/writing alternate rows
        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
        add             r11, r1,  #(2*SUM_STRIDE)   // sum
        add             r12, r3,  r4 // src
        lsl             r4,  r4,  #1
        mov             r9,  #(2*2*SUM_STRIDE) // double sum stride

        // Subtract the aligned width from the output stride.
        add             lr,  r5,  #7
        bic             lr,  lr,  #7
        sub             r9,  r9,  lr, lsl #1
        add             lr,  lr,  #8
        sub             r4,  r4,  lr, lsl #1

        // Store the width for the vertical loop
        mov             r8,  r5

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r7,  #1 // LR_HAVE_LEFT
        beq             2f
        // LR_HAVE_LEFT
        cmp             r2,  #0
        bne             0f
        // left == NULL
        sub             r3,  r3,  #6
        sub             r12, r12, #6
        b               1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 3 pixels from the src pointer,
        // but shift it as if we had done that.
        add             r4,  r4,  #6

1:      // Loop vertically
        vld1.16         {q0, q1}, [r3]!
        vld1.16         {q4, q5}, [r12]!

        tst             r7,  #1 // LR_HAVE_LEFT
        beq             0f
        cmp             r2,  #0
        beq             2f
        // LR_HAVE_LEFT, left != NULL
        vld1.16         {d5},  [r2]!
        // Move r3/r12 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #6
        sub             r12, r12, #6
        vld1.16         {d13}, [r2]!
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10
        vext.8          q5,  q4,  q5,  #10
        vext.8          q4,  q6,  q4,  #10
        b               2f
0:
        // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
        // and shift q0 to have 3x the first pixel at the front.
        vdup.16         q2,  d0[0]
        vdup.16         q6,  d8[0]
        // Move r3 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #6
        sub             r12, r12, #6
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10
        vext.8          q5,  q4,  q5,  #10
        vext.8          q4,  q6,  q4,  #10

2:
        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r5,  #(2 + 16 - 3 + 1)
        lsl             lr,  lr,  #1
        ldrh            r11, [r3,  lr]
        ldrh            lr,  [r12, lr]
        // Fill q14/q15 with the right padding pixel
        vdup.16         q14, r11
        vdup.16         q15, lr
        // Restore r11 after using it for a temporary value
        add             r11, r1,  #(2*SUM_STRIDE)
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r5,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel_local    lr,  right_ext_mask, -2
        sub             lr,  lr,  r5,  lsl #1
        vld1.8          {q12, q13}, [lr]

        vbit            q0,  q14, q12
        vbit            q1,  q14, q13
        vbit            q4,  q15, q12
        vbit            q5,  q15, q13

4:      // Loop horizontally
        vext.8          q8,  q0,  q1,  #2
        vext.8          q10, q4,  q5,  #2
        vext.8          q9,  q0,  q1,  #4
        vext.8          q11, q4,  q5,  #4
        vadd.i16        q2,  q0,  q8
        vadd.i16        q3,  q4,  q10
        vadd.i16        q2,  q2,  q9
        vadd.i16        q3,  q3,  q11

        vmull.u16       q6,  d0,  d0
        vmlal.u16       q6,  d16, d16
        vmlal.u16       q6,  d18, d18
        vmull.u16       q12, d8,  d8
        vmlal.u16       q12, d20, d20
        vmlal.u16       q12, d22, d22
        vmull.u16       q7,  d1,  d1
        vmlal.u16       q7,  d17, d17
        vmlal.u16       q7,  d19, d19
        vmull.u16       q13, d9,  d9
        vmlal.u16       q13, d21, d21
        vmlal.u16       q13, d23, d23

        vext.8          q8,  q0,  q1,  #6
        vext.8          q10, q4,  q5,  #6
        vext.8          q9,  q0,  q1,  #8
        vext.8          q11, q4,  q5,  #8
        vadd.i16        q2,  q2,  q8
        vadd.i16        q3,  q3,  q10
        vadd.i16        q2,  q2,  q9
        vadd.i16        q3,  q3,  q11

        vmlal.u16       q6,  d16, d16
        vmlal.u16       q6,  d1,  d1
        vmlal.u16       q12, d20, d20
        vmlal.u16       q12, d9,  d9
        vmlal.u16       q7,  d17, d17
        vmlal.u16       q7,  d19, d19
        vmlal.u16       q13, d21, d21
        vmlal.u16       q13, d23, d23

        subs            r5,  r5,  #8
        vst1.16         {q2},       [r1,  :128]!
        vst1.16         {q3},       [r11, :128]!
        vst1.32         {q6, q7},   [r0,  :128]!
        vst1.32         {q12, q13}, [r10, :128]!

        ble             9f
        tst             r7,  #2 // LR_HAVE_RIGHT
        vmov            q0,  q1
        vmov            q4,  q5
        vld1.16         {q1}, [r3]!
        vld1.16         {q5}, [r12]!
        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        subs            r6,  r6,  #2
        ble             0f
        // Jump to the next row and loop horizontally
        add             r0,  r0,  r9, lsl #1
        add             r10, r10, r9, lsl #1
        add             r1,  r1,  r9
        add             r11, r11, r9
        add             r3,  r3,  r4
        add             r12, r12, r4
        mov             r5,  r8
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

sgr_funcs 16