/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

const right_ext_mask_buf
        .byte           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst

// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
//                                      const pixel *src, ptrdiff_t stride,
//                                      const int16_t fh[8], intptr_t w,
//                                      int h, enum LrEdgeFlags edges);
function wiener_filter_h_8bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4, r5, [sp, #100]
        ldrd            r6, r7, [sp, #108]
        mov             r8, r5
        vld1.16         {q0}, [r4, :128]
        movw            r9, #(1 << 14) - (1 << 2)
        vdup.16         q14, r9
        vmov.s16        q15, #2048
        // Calculate mid_stride
        add             r10, r5, #7
        bic             r10, r10, #7
        lsl             r10, r10, #1

        // Set up pointers for reading/writing alternate rows
        add             r12, r0, r10
        lsl             r10, r10, #1
        add             lr, r2, r3
        lsl             r3, r3, #1

        // Subtract the aligned width from mid_stride
        add             r11, r5, #7
        bic             r11, r11, #7
        sub             r10, r10, r11, lsl #1

        // Subtract the number of pixels read from the source stride
        add             r11, r11, #8
        sub             r3, r3, r11

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r7, #1 // LR_HAVE_LEFT
        beq             2f
        // LR_HAVE_LEFT
        cmp             r1, #0
        bne             0f
        // left == NULL
        sub             r2, r2, #3
        sub             lr, lr, #3
        b               1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 3 pixels from the src pointer,
        // but shift it as if we had done that.
        add             r3, r3, #3


1:      // Loop vertically
        vld1.8          {q2}, [r2]!
        vld1.8          {q9}, [lr]!
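        // q2/q9 now hold 16 source pixels from each of the two rows being
        // processed; the block below fills in the pixels to the left of them,
        // either from the left[] buffer or by replicating the leftmost pixel,
        // depending on LR_HAVE_LEFT and whether left is NULL.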

        tst             r7, #1 // LR_HAVE_LEFT
        beq             0f
        cmp             r1, #0
        beq             2f
        // LR_HAVE_LEFT, left != NULL
        vld1.32         {d3[1]}, [r1]!
        // Move r2/lr back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             r2, r2, #3
        sub             lr, lr, #3
        vld1.32         {d17[1]}, [r1]!
        vext.8          q2, q1, q2, #13
        vext.8          q9, q8, q9, #13
        b               2f
0:
        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
        // and shift q2 to have 3x the first byte at the front.
        vdup.8          q1, d4[0]
        vdup.8          q8, d18[0]
        // Move r2 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             r2, r2, #3
        sub             lr, lr, #3
        vext.8          q2, q1, q2, #13
        vext.8          q9, q8, q9, #13

2:
        vmovl.u8        q1, d4
        vmovl.u8        q2, d5
        vmovl.u8        q8, d18
        vmovl.u8        q9, d19

        tst             r7, #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             r9, r5, #14
        ldrb            r11, [r2, r9]
        ldrb            r9, [lr, r9]
        // Fill q12/q13 with the right padding pixel
        vdup.16         q12, r11
        vdup.16         q13, r9
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r5, #11
        bge             4f // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel_local    r4, right_ext_mask, -6
        sub             r4, r4, r5, lsl #1
        vld1.8          {q10, q11}, [r4]

        vbit            q1, q12, q10
        vbit            q2, q12, q11
        vbit            q8, q13, q10
        vbit            q9, q13, q11

4:      // Loop horizontally
        vext.8          q11, q1, q2, #4
        vext.8          q5, q1, q2, #8
        vext.8          q10, q1, q2, #2
        vext.8          q6, q1, q2, #10
        vext.8          q7, q1, q2, #12
        vext.8          q4, q1, q2, #6
        vadd.i16        q5, q5, q11
        vadd.i16        q6, q6, q10
        vadd.i16        q7, q7, q1
        vmul.s16        q3, q4, d0[3]
        vmla.s16        q3, q5, d1[0]
        vmla.s16        q3, q6, d1[1]
        vmla.s16        q3, q7, d1[2]

        vext.8          q4, q8, q9, #4
        vext.8          q6, q8, q9, #8
        vext.8          q11, q8, q9, #2
        vext.8          q7, q8, q9, #10
        vadd.i16        q6, q6, q4
        vext.8          q4, q8, q9, #12
        vext.8          q5, q8, q9, #6
        vadd.i16        q7, q7, q11
        vadd.i16        q4, q4, q8
        vmul.s16        q10, q5, d0[3]
        vmla.s16        q10, q6, d1[0]
        vmla.s16        q10, q7, d1[1]
        vmla.s16        q10, q4, d1[2]

        vext.8          q1, q1, q2, #6
        vext.8          q8, q8, q9, #6
        vshl.s16        q1, q1, #7
        vshl.s16        q8, q8, #7
        vsub.s16        q1, q1, q14
        vsub.s16        q8, q8, q14
        vqadd.s16       q3, q3, q1
        vqadd.s16       q10, q10, q8
        vshr.s16        q3, q3, #3
        vshr.s16        q10, q10, #3
        vadd.s16        q3, q3, q15
        vadd.s16        q10, q10, q15
        subs            r5, r5, #8
        vst1.16         {q3}, [r0, :128]!
        vst1.16         {q10}, [r12, :128]!

        ble             9f
        tst             r7, #2 // LR_HAVE_RIGHT
        vmov            q1, q2
        vmov            q8, q9
        vld1.8          {d4}, [r2]!
        vld1.8          {d18}, [lr]!
        vmovl.u8        q2, d4
        vmovl.u8        q9, d18
        bne             4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.
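
        // The loop above filters two rows at a time, 8 pixels per iteration,
        // writing 16-bit intermediates for the vertical pass; 9: below
        // advances the pointers to the next pair of rows.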

9:
        subs            r6, r6, #2
        ble             0f
        // Jump to the next row and loop horizontally
        add             r0, r0, r10
        add             r12, r12, r10
        add             r2, r2, r3
        add             lr, lr, r3
        mov             r5, r8
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
//                                      const int16_t *mid, int w, int h,
//                                      const int16_t fv[8], enum LrEdgeFlags edges,
//                                      ptrdiff_t mid_stride);
function wiener_filter_v_8bpc_neon, export=1
        push            {r4-r7,lr}
        vpush           {q4-q6}
        ldrd            r4, r5, [sp, #68]
        ldrd            r6, r7, [sp, #76]
        mov             lr, r4
        vld1.16         {q0}, [r5, :128]

        // Calculate the number of rows to move back when looping vertically
        mov             r12, r4
        tst             r6, #4 // LR_HAVE_TOP
        beq             0f
        sub             r2, r2, r7, lsl #1
        add             r12, r12, #2
0:
        tst             r6, #8 // LR_HAVE_BOTTOM
        beq             1f
        add             r12, r12, #2

1:      // Start of horizontal loop; start one vertical filter slice.
        // Load rows into q8-q11 and pad properly.
        tst             r6, #4 // LR_HAVE_TOP
        vld1.16         {q8}, [r2, :128], r7
        beq             2f
        // LR_HAVE_TOP
        vld1.16         {q10}, [r2, :128], r7
        vmov            q9, q8
        vld1.16         {q11}, [r2, :128], r7
        b               3f
2:      // !LR_HAVE_TOP
        vmov            q9, q8
        vmov            q10, q8
        vmov            q11, q8

3:
        cmp             r4, #4
        blt             5f
        // Start filtering normally; fill in q12-q14 with unique rows.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        vld1.16         {q14}, [r2, :128], r7

4:
.macro filter compare
        subs            r4, r4, #1
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        vadd.i16        q4, q10, q12
        vadd.i16        q5, q9, q13
        vadd.i16        q6, q8, q14
        vmull.s16       q2, d22, d0[3]
        vmlal.s16       q2, d8, d1[0]
        vmlal.s16       q2, d10, d1[1]
        vmlal.s16       q2, d12, d1[2]
        vmull.s16       q3, d23, d0[3]
        vmlal.s16       q3, d9, d1[0]
        vmlal.s16       q3, d11, d1[1]
        vmlal.s16       q3, d13, d1[2]
        vqrshrun.s32    d4, q2, #11
        vqrshrun.s32    d5, q3, #11
        vqmovun.s16     d4, q2
        vst1.8          {d4}, [r0, :64], r1
.if \compare
        cmp             r4, #4
.else
        ble             9f
.endif
        vmov            q8, q9
        vmov            q9, q10
        vmov            q10, q11
        vmov            q11, q12
        vmov            q12, q13
        vmov            q13, q14
.endm
        filter          1
        blt             7f
        vld1.16         {q14}, [r2, :128], r7
        b               4b

5:      // Less than 4 rows in total; not all of q12-q13 are filled yet.
        tst             r6, #8 // LR_HAVE_BOTTOM
        beq             6f
        // LR_HAVE_BOTTOM
        cmp             r4, #2
        // We load at least 2 rows in all cases.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        bgt             53f // 3 rows in total
        beq             52f // 2 rows in total
51:     // 1 row in total, q11 already loaded, load edge into q12-q14.
        vmov            q13, q12
        b               8f
52:     // 2 rows in total, q11 already loaded, load q12 with content data
        // and 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vmov            q15, q14
        b               8f
53:
        // 3 rows in total, q11 already loaded, load q12 and q13 with content
        // and 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vld1.16         {q15}, [r2, :128], r7
        vmov            q1, q15
        b               8f

6:
        // !LR_HAVE_BOTTOM
        cmp             r4, #2
        bgt             63f // 3 rows in total
        beq             62f // 2 rows in total
61:     // 1 row in total, q11 already loaded, pad that into q12-q14.
        vmov            q12, q11
        vmov            q13, q11
        vmov            q14, q11
        b               8f
62:     // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
        vld1.16         {q12}, [r2, :128], r7
        vmov            q13, q12
        vmov            q14, q12
        vmov            q15, q12
        b               8f
63:
        // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        vmov            q14, q13
        vmov            q15, q13
        vmov            q1, q13
        b               8f

7:
        // All registers up to q13 are filled already, 3 valid rows left.
        // < 4 valid rows left; fill in padding and filter the last
        // few rows.
        tst             r6, #8 // LR_HAVE_BOTTOM
        beq             71f
        // LR_HAVE_BOTTOM; load 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vld1.16         {q15}, [r2, :128], r7
        vmov            q1, q15
        b               8f
71:
        // !LR_HAVE_BOTTOM, pad 3 rows
        vmov            q14, q13
        vmov            q15, q13
        vmov            q1, q13

8:      // At this point, all registers up to q14-15,q1 are loaded with
        // edge/padding (depending on how many rows are left).
        filter          0 // This branches to 9f when done
        vmov            q14, q15
        vmov            q15, q1
        b               8b

9:      // End of one vertical slice.
        subs            r3, r3, #8
        ble             0f
        // Move pointers back up to the top and loop horizontally.
        mls             r0, r1, lr, r0
        mls             r2, r7, r12, r2
        add             r0, r0, #8
        add             r2, r2, #16
        mov             r4, lr
        b               1b

0:
        vpop            {q4-q6}
        pop             {r4-r7,pc}
.purgem filter
endfunc

#define SUM_STRIDE (384+16)

#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                 const pixel (*left)[4],
//                                 const pixel *src, const ptrdiff_t stride,
//                                 const int w, const int h,
//                                 const enum LrEdgeFlags edges);
function sgr_box3_h_8bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4, r5, [sp, #100]
        ldrd            r6, r7, [sp, #108]
        add             r5, r5, #2 // w += 2

        // Set up pointers for reading/writing alternate rows
        add             r10, r0, #(4*SUM_STRIDE) // sumsq
        add             r11, r1, #(2*SUM_STRIDE) // sum
        add             r12, r3, r4 // src
        lsl             r4, r4, #1
        mov             r9, #(2*2*SUM_STRIDE) // double sum stride

        // Subtract the aligned width from the output stride.
        add             lr, r5, #7
        bic             lr, lr, #7
        sub             r9, r9, lr, lsl #1

        // Store the width for the vertical loop
        mov             r8, r5

        // Subtract the number of pixels read from the input from the stride
        add             lr, lr, #8
        sub             r4, r4, lr

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r7, #1 // LR_HAVE_LEFT
        beq             2f
        // LR_HAVE_LEFT
        cmp             r2, #0
        bne             0f
        // left == NULL
        sub             r3, r3, #2
        sub             r12, r12, #2
        b               1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 2 pixels from the src pointer,
        // but shift it as if we had done that.
        add             r4, r4, #2


1:      // Loop vertically
        vld1.8          {q0}, [r3]!
        vld1.8          {q4}, [r12]!

        tst             r7, #1 // LR_HAVE_LEFT
        beq             0f
        cmp             r2, #0
        beq             2f
        // LR_HAVE_LEFT, left != NULL
        vld1.32         {d3[]}, [r2]!
        // Move r3/r12 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub             r3, r3, #2
        sub             r12, r12, #2
        vld1.32         {d11[]}, [r2]!
        vext.8          q0, q1, q0, #14
        vext.8          q4, q5, q4, #14
        b               2f
0:
        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
        // and shift q0 to have 2x the first byte at the front.
        vdup.8          q1, d0[0]
        vdup.8          q5, d8[0]
        // Move r3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub             r3, r3, #2
        sub             r12, r12, #2
        vext.8          q0, q1, q0, #14
        vext.8          q4, q5, q4, #14

2:
        vmull.u8        q1, d0, d0
        vmull.u8        q2, d1, d1
        vmull.u8        q5, d8, d8
        vmull.u8        q6, d9, d9

        tst             r7, #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             lr, r5, #(2 + 16 - 2 + 1)
        ldrb            r11, [r3, lr]
        ldrb            lr, [r12, lr]
        // Fill q14/q15 with the right padding pixel
        vdup.8          q14, r11
        vdup.8          q15, lr
        // Restore r11 after using it for a temporary value
        add             r11, r1, #(2*SUM_STRIDE)
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r5, #10
        bge             4f // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called
        // again; it's not strictly needed in those cases (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in q0/4.b[w] onwards
        movrel_local    lr, right_ext_mask
        sub             lr, lr, r5
        vld1.8          {q13}, [lr]

        vbit            q0, q14, q13
        vbit            q4, q15, q13

        // Update the precalculated squares
        vmull.u8        q1, d0, d0
        vmull.u8        q2, d1, d1
        vmull.u8        q5, d8, d8
        vmull.u8        q6, d9, d9

4:      // Loop horizontally
        vext.8          d16, d0, d1, #1
        vext.8          d17, d0, d1, #2
        vext.8          d18, d8, d9, #1
        vext.8          d19, d8, d9, #2
        vaddl.u8        q3, d0, d16
        vaddw.u8        q3, q3, d17
        vaddl.u8        q7, d8, d18
        vaddw.u8        q7, q7, d19

        vext.8          q8, q1, q2, #2
        vext.8          q9, q1, q2, #4
        vext.8          q10, q5, q6, #2
        vext.8          q11, q5, q6, #4

        vaddl.u16       q12, d2, d16
        vaddl.u16       q13, d3, d17
        vaddw.u16       q12, q12, d18
        vaddw.u16       q13, q13, d19

        vaddl.u16       q8, d10, d20
        vaddl.u16       q9, d11, d21
        vaddw.u16       q8, q8, d22
        vaddw.u16       q9, q9, d23

        subs            r5, r5, #8
        vst1.16         {q3}, [r1, :128]!
        vst1.16         {q7}, [r11, :128]!
        vst1.32         {q12, q13}, [r0, :128]!
        vst1.32         {q8, q9}, [r10, :128]!

        ble             9f
        tst             r7, #2 // LR_HAVE_RIGHT
        vld1.8          {d6}, [r3]!
        vld1.8          {d14}, [r12]!
        vmov            q1, q2
        vmov            q5, q6
        vext.8          q0, q0, q3, #8
        vext.8          q4, q4, q7, #8
        vmull.u8        q2, d6, d6
        vmull.u8        q6, d14, d14

        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        subs            r6, r6, #2
        ble             0f
        // Jump to the next row and loop horizontally
        add             r0, r0, r9, lsl #1
        add             r10, r10, r9, lsl #1
        add             r1, r1, r9
        add             r11, r11, r9
        add             r3, r3, r4
        add             r12, r12, r4
        mov             r5, r8
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                 const pixel (*left)[4],
//                                 const pixel *src, const ptrdiff_t stride,
//                                 const int w, const int h,
//                                 const enum LrEdgeFlags edges);
function sgr_box5_h_8bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4, r5, [sp, #100]
        ldrd            r6, r7, [sp, #108]
        add             r5, r5, #2 // w += 2

        // Set up pointers for reading/writing alternate rows
        add             r10, r0, #(4*SUM_STRIDE) // sumsq
        add             r11, r1, #(2*SUM_STRIDE) // sum
        add             r12, r3, r4 // src
        lsl             r4, r4, #1
        mov             r9, #(2*2*SUM_STRIDE) // double sum stride

        // Subtract the aligned width from the output stride.
        add             lr, r5, #7
        bic             lr, lr, #7
        sub             r9, r9, lr, lsl #1
        add             lr, lr, #8
        sub             r4, r4, lr

        // Store the width for the vertical loop
        mov             r8, r5

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r7, #1 // LR_HAVE_LEFT
        beq             2f
        // LR_HAVE_LEFT
        cmp             r2, #0
        bne             0f
        // left == NULL
        sub             r3, r3, #3
        sub             r12, r12, #3
        b               1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 3 pixels from the src pointer,
        // but shift it as if we had done that.
        add             r4, r4, #3

1:      // Loop vertically
        vld1.8          {q0}, [r3]!
        vld1.8          {q4}, [r12]!

        tst             r7, #1 // LR_HAVE_LEFT
        beq             0f
        cmp             r2, #0
        beq             2f
        // LR_HAVE_LEFT, left != NULL
        vld1.32         {d3[]}, [r2]!
        // Move r3/r12 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             r3, r3, #3
        sub             r12, r12, #3
        vld1.32         {d11[]}, [r2]!
        vext.8          q0, q1, q0, #13
        vext.8          q4, q5, q4, #13
        b               2f
0:
        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
        // and shift q0 to have 3x the first byte at the front.
        vdup.8          q1, d0[0]
        vdup.8          q5, d8[0]
        // Move r3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             r3, r3, #3
        sub             r12, r12, #3
        vext.8          q0, q1, q0, #13
        vext.8          q4, q5, q4, #13

2:
        vmull.u8        q1, d0, d0
        vmull.u8        q2, d1, d1
        vmull.u8        q5, d8, d8
        vmull.u8        q6, d9, d9

        tst             r7, #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             lr, r5, #(2 + 16 - 3 + 1)
        ldrb            r11, [r3, lr]
        ldrb            lr, [r12, lr]
        // Fill q14/q15 with the right padding pixel
        vdup.8          q14, r11
        vdup.8          q15, lr
        // Restore r11 after using it for a temporary value
        add             r11, r1, #(2*SUM_STRIDE)
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r5, #11
        bge             4f // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q0/4.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel_local    lr, right_ext_mask, -1
        sub             lr, lr, r5
        vld1.8          {q13}, [lr]

        vbit            q0, q14, q13
        vbit            q4, q15, q13

        // Update the precalculated squares
        vmull.u8        q1, d0, d0
        vmull.u8        q2, d1, d1
        vmull.u8        q5, d8, d8
        vmull.u8        q6, d9, d9

4:      // Loop horizontally
        vext.8          d16, d0, d1, #1
        vext.8          d17, d0, d1, #2
        vext.8          d18, d0, d1, #3
        vext.8          d19, d0, d1, #4
        vext.8          d20, d8, d9, #1
        vext.8          d21, d8, d9, #2
        vext.8          d22, d8, d9, #3
        vext.8          d23, d8, d9, #4
        vaddl.u8        q3, d0, d16
        vaddl.u8        q12, d17, d18
        vaddl.u8        q7, d8, d20
        vaddl.u8        q13, d21, d22
        vaddw.u8        q3, q3, d19
        vaddw.u8        q7, q7, d23
        vadd.u16        q3, q3, q12
        vadd.u16        q7, q7, q13

        vext.8          q8, q1, q2, #2
        vext.8          q9, q1, q2, #4
        vext.8          q10, q1, q2, #6
        vext.8          q11, q1, q2, #8
        vaddl.u16       q12, d2, d16
        vaddl.u16       q13, d3, d17
        vaddl.u16       q8, d18, d20
        vaddl.u16       q9, d19, d21
        vaddw.u16       q12, q12, d22
        vaddw.u16       q13, q13, d23
        vadd.i32        q12, q12, q8
        vadd.i32        q13, q13, q9
        vext.8          q8, q5, q6, #2
        vext.8          q9, q5, q6, #4
        vext.8          q10, q5, q6, #6
        vext.8          q11, q5, q6, #8
        vaddl.u16       q1, d10, d16
        vaddl.u16       q5, d11, d17
        vaddl.u16       q8, d18, d20
        vaddl.u16       q9, d19, d21
        vaddw.u16       q1, q1, d22
        vaddw.u16       q5, q5, d23
        vadd.i32        q10, q1, q8
        vadd.i32        q11, q5, q9

        subs            r5, r5, #8
        vst1.16         {q3}, [r1, :128]!
        vst1.16         {q7}, [r11, :128]!
        vst1.32         {q12, q13}, [r0, :128]!
        vst1.32         {q10, q11}, [r10, :128]!

        ble             9f
        tst             r7, #2 // LR_HAVE_RIGHT
        vld1.8          {d6}, [r3]!
        vld1.8          {d14}, [r12]!
        vmov            q1, q2
        vmov            q5, q6
        vext.8          q0, q0, q3, #8
        vext.8          q4, q4, q7, #8
        vmull.u8        q2, d6, d6
        vmull.u8        q6, d14, d14
        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        subs            r6, r6, #2
        ble             0f
        // Jump to the next row and loop horizontally
        add             r0, r0, r9, lsl #1
        add             r10, r10, r9, lsl #1
        add             r1, r1, r9
        add             r11, r11, r9
        add             r3, r3, r4
        add             r12, r12, r4
        mov             r5, r8
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

sgr_funcs 8