/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"
#include "src/arm/asm-offsets.h"

#define GRAIN_WIDTH 82
#define GRAIN_HEIGHT 73

#define SUB_GRAIN_WIDTH 44
#define SUB_GRAIN_HEIGHT 38

.macro increment_seed steps, shift=1
        lsr             r11, r2,  #3
        lsr             r12, r2,  #12
        lsr             lr,  r2,  #1
        eor             r11, r2,  r11 // (r >> 0) ^ (r >> 3)
        eor             r12, r12, lr  // (r >> 12) ^ (r >> 1)
        eor             r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
.if \shift
        lsr             r2,  r2,  #\steps
.endif
        and             r11, r11, #((1 << \steps) - 1) // bit
.if \shift
        orr             r2,  r2,  r11, lsl #(16 - \steps) // *state
.else
        orr             r2,  r2,  r11, lsl #16            // *state
.endif
.endm

.macro read_rand dest, bits, age
        ubfx            \dest, r2, #16 - \bits - \age, #\bits
.endm

.macro read_shift_rand dest, bits
        ubfx            \dest, r2, #17 - \bits, #\bits
        lsr             r2,  r2,  #1
.endm

// special calling convention:
// r2 holds seed
// r3 holds dav1d_gaussian_sequence
// clobbers r11-r12
// returns in d0-d1
function get_gaussian_neon
        push            {r5-r6,lr}
        increment_seed  4
        read_rand       r5,  11,  3
        read_rand       r6,  11,  2
        add             r5,  r3,  r5,  lsl #1
        add             r6,  r3,  r6,  lsl #1
        vld1.16         {d0[0]}, [r5]
        read_rand       r5,  11,  1
        vld1.16         {d0[1]}, [r6]
        add             r5,  r3,  r5,  lsl #1
        read_rand       r6,  11,  0
        increment_seed  4
        add             r6,  r3,  r6,  lsl #1
        vld1.16         {d0[2]}, [r5]
        read_rand       r5,  11,  3
        vld1.16         {d0[3]}, [r6]
        add             r5,  r3,  r5,  lsl #1
        read_rand       r6,  11,  2
        vld1.16         {d1[0]}, [r5]
        add             r6,  r3,  r6,  lsl #1
        read_rand       r5,  11,  1
        vld1.16         {d1[1]}, [r6]
        read_rand       r6,  11,  0
        add             r5,  r3,  r5,  lsl #1
        add             r6,  r3,  r6,  lsl #1
        vld1.16         {d1[2]}, [r5]
        vld1.16         {d1[3]}, [r6]
        pop             {r5-r6,pc}
endfunc
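// The increment_seed/read_rand macros above implement the AV1 16-bit LFSR
// grain PRNG: each step feeds bit0 ^ bit1 ^ bit3 ^ bit12 back into the top
// of the state (several steps are computed at once for increment_seed), and
// pseudorandom values are read from the top bits. A rough scalar C sketch
// of the same update (identifier names here are illustrative, not dav1d's):
//
//     static unsigned lfsr_advance(unsigned *const state, const int steps) {
//         const unsigned r = *state;
//         const unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) &
//                              ((1u << steps) - 1);
//         *state = (r >> steps) | (bit << (16 - steps));
//         return *state;
//     }
//
//     static unsigned read_rand(const unsigned state,
//                               const int bits, const int age) {
//         return (state >> (16 - bits - age)) & ((1u << bits) - 1);
//     }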
.macro get_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r0, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r1, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r2, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r3, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r4, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r5, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r6, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r7, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r8, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r9, q0
        increment_seed  2
        read_rand       r11, 11,  1
        read_rand       r12, 11,  0
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d0[0]}, [r11]
        vld1.16         {d0[1]}, [r12]
        vrshl.s16       d0,  d0,  d30
        vmovn.i16       \r10, q0
.endm

.macro store_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
        vst1.16         {\r0, \r1, \r2, \r3}, [r0]!
        vst1.16         {\r4, \r5, \r6, \r7}, [r0]!
        vst1.16         {\r8, \r9}, [r0]!
        vst1.16         {\r10[0]}, [r0]!
.endm

.macro get_grain_row_44 r0, r1, r2, r3, r4, r5
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r0, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r1, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r2, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r3, q0
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        vmovn.i16       \r4, q0
        increment_seed  4
        read_rand       r11, 11,  3
        read_rand       r12, 11,  2
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d0[]},  [r11]
        read_rand       r11, 11,  1
        vld1.16         {d0[1]}, [r12]
        add             r11, r3,  r11, lsl #1
        read_rand       r12, 11,  0
        vld1.16         {d0[2]}, [r11]
        add             r12, r3,  r12, lsl #1
        vld1.16         {d0[3]}, [r12]
        vrshl.s16       d0,  d0,  d30
        vmovn.i16       \r5, q0
.endm
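// Each grain entry is a value from dav1d_gaussian_sequence, indexed by an
// 11-bit pseudorandom number and rounded down by 4 + grain_scale_shift
// (q15/d30 hold the negated shift for vrshl). A rough C sketch of one row,
// assuming the helpers sketched above and a gaussian_sequence table:
//
//     for (int x = 0; x < GRAIN_WIDTH; x++) {
//         const unsigned rnd = read_rand(lfsr_advance(&seed, 1), 11, 0);
//         row[x] = round2(gaussian_sequence[rnd], 4 + grain_scale_shift);
//     }
//
// where round2(x, n) is (x + (1 << (n - 1))) >> n; the vmovn narrowing then
// keeps the low byte, which is in range for 8 bpc grain.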
.macro store_grain_row_44 r0, r1, r2, r3, r4, r5
        vst1.16         {\r0, \r1, \r2, \r3}, [r0]!
        vst1.16         {\r4, \r5}, [r0]
        add             r0,  r0,  #GRAIN_WIDTH-32
.endm

function get_grain_2_neon
        push            {r11,lr}
        increment_seed  2
        read_rand       r11, 11,  1
        read_rand       r12, 11,  0
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d0[0]}, [r11]
        vld1.16         {d0[1]}, [r12]
        vrshl.s16       d0,  d0,  d30
        vmovn.i16       d0,  q0
        pop             {r11,pc}
endfunc

.macro get_grain_2 dst
        bl              get_grain_2_neon
.ifnc \dst, d0
        vmov            \dst, d0
.endif
.endm

// r1 holds the number of entries to produce
// r6, r8 and r10 hold the previous output entries
// q0 holds the vector of produced entries
// q1 holds the input vector of sums from above
.macro output_lag n
function output_lag\n\()_neon
        push            {r0, lr}
.if \n == 1
        mov             lr,  #-128
.else
        mov             r0,  #1
        mov             lr,  #1
        sub             r7,  r7,  #1
        sub             r9,  r9,  #1
        lsl             r0,  r0,  r7
        lsl             lr,  lr,  r9
        add             r7,  r7,  #1
        add             r9,  r9,  #1
.endif
1:
        read_shift_rand r12, 11
        vmov.32         r11, d2[0]
        lsl             r12, r12, #1
        vext.8          q0,  q0,  q0,  #1
        ldrsh           r12, [r3, r12]
.if \n == 1
        mla             r11, r6,  r4,  r11 // sum (above) + *coeff * prev output
        add             r6,  r11, r8       // 1 << (ar_coeff_shift - 1)
        add             r12, r12, r10
        asr             r6,  r6,  r7       // >> ar_coeff_shift
        asr             r12, r12, r9       // >> (4 + grain_scale_shift)
        add             r6,  r6,  r12
        cmp             r6,  r5
.elseif \n == 2
        mla             r11, r8,  r4,  r11 // sum (above) + *coeff * prev output 1
        mla             r11, r6,  r10, r11 // += *coeff * prev output 2
        mov             r8,  r6
        add             r6,  r11, r0       // 1 << (ar_coeff_shift - 1)
        add             r12, r12, lr       // 1 << (4 + grain_scale_shift - 1)
        asr             r6,  r6,  r7       // >> ar_coeff_shift
        asr             r12, r12, r9       // >> (4 + grain_scale_shift)
        add             r6,  r6,  r12
        push            {lr}
        cmp             r6,  r5
        mov             lr,  #-128
.else
        push            {r1-r3}
        sbfx            r1,  r4,  #0,  #8
        sbfx            r2,  r4,  #8,  #8
        sbfx            r3,  r4,  #16, #8
        mla             r11, r10, r1,  r11 // sum (above) + *coeff * prev output 1
        mla             r11, r8,  r2,  r11 // sum (above) + *coeff * prev output 2
        mla             r11, r6,  r3,  r11 // += *coeff * prev output 3
        pop             {r1-r3}
        mov             r10, r8
        mov             r8,  r6

        add             r6,  r11, r0       // 1 << (ar_coeff_shift - 1)
        add             r12, r12, lr       // 1 << (4 + grain_scale_shift - 1)
        asr             r6,  r6,  r7       // >> ar_coeff_shift
        asr             r12, r12, r9       // >> (4 + grain_scale_shift)
        add             r6,  r6,  r12
        push            {lr}
        cmp             r6,  r5
        mov             lr,  #-128
.endif
        it              gt
        movgt           r6,  r5
        cmp             r6,  lr
        it              lt
        movlt           r6,  lr
.if \n >= 2
        pop             {lr}
.endif
        subs            r1,  r1,  #1
        vext.8          q1,  q1,  q1,  #4
        vmov.8          d1[7], r6
        bgt             1b
        pop             {r0, pc}
endfunc
.endm

output_lag 1
output_lag 2
output_lag 3
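// output_lagN_neon turns the vector of "above" sums (q1) into actual grain:
// the horizontally preceding outputs feed back into each new entry, so this
// part is inherently serial. A rough C sketch of one entry for lag 1, with
// illustrative names (the grain min/max are -128/127 for 8 bpc, held in
// lr/r5 above):
//
//     int sum = sum_above + coeff_left * prev_out;            // mla
//     int noise = round2(gaussian_sequence[get_rand(11)],
//                        4 + grain_scale_shift);
//     int out = round2(sum, ar_coeff_shift) + noise;
//     prev_out = iclip(out, grain_min, grain_max);            // movgt/movlt
//
// Lag 2 and 3 feed back two and three previous outputs, with the extra
// coefficients unpacked from r4 via sbfx.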

function sum_lag1_above_neon
        vmull.s8        q2,  d6,  d28
        vmull.s8        q3,  d7,  d28
        vmull.s8        q4,  d0,  d27
        vmull.s8        q5,  d1,  d27

        vaddl.s16       q0,  d4,  d8
        vaddl.s16       q2,  d5,  d9
        vaddl.s16       q4,  d6,  d10
        vaddl.s16       q5,  d7,  d11

        vmull.s8        q3,  d3,  d29
        vmull.s8        q1,  d2,  d29

        vaddw.s16       q4,  q4,  d6
        vaddw.s16       q5,  q5,  d7
        vaddw.s16       q3,  q2,  d3
        vaddw.s16       q2,  q0,  d2
        bx              lr
endfunc

.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
.ifc \lag\()_\edge, lag3_left
        bl              sum_lag3_left_above_neon
.else
        bl              sum_\lag\()_above_neon
.endif
.ifc \type, uv_420
        vpush           {q6-q7}
        add             r12, r11, #GRAIN_WIDTH
        vld1.16         {q0, q1}, [r11]!
        vld1.16         {q6, q7}, [r12]!
        vpaddl.s8       q0,  q0
        vpaddl.s8       q1,  q1
        vpaddl.s8       q6,  q6
        vpaddl.s8       q7,  q7
        vadd.i16        q0,  q0,  q6
        vadd.i16        q1,  q1,  q7
        vpop            {q6-q7}
        vrshrn.s16      d0,  q0,  #2
        vrshrn.s16      d1,  q1,  #2
.endif
.ifc \type, uv_422
        vld1.8          {q0, q1}, [r11]!
        vpaddl.s8       q0,  q0
        vpaddl.s8       q1,  q1
        vrshrn.s16      d0,  q0,  #1
        vrshrn.s16      d1,  q1,  #1
.endif
.ifc \type, uv_444
        vld1.8          {q0}, [r11]!
.endif
.if \uv_layout
.ifnb \uv_coeff
        vdup.8          d13, \uv_coeff
.endif
        vmull.s8        q1,  d0,  d13
        vmull.s8        q0,  d1,  d13
        vaddw.s16       q2,  q2,  d2
        vaddw.s16       q3,  q3,  d3
        vaddw.s16       q4,  q4,  d0
        vaddw.s16       q5,  q5,  d1
.endif
.if \uv_layout && \elems == 16
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 444 && \elems == 15
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 422 && \elems == 9
        b               sum_\lag\()_uv_420_\edge\()_start
.else
sum_\lag\()_\type\()_\edge\()_start:
        push            {r11}
.ifc \edge, left
        increment_seed  4
        read_rand       r11, 11,  3
        read_rand       r12, 11,  2
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d1[1]}, [r11]
        read_rand       r11, 11,  1
        vld1.16         {d1[2]}, [r12]
        add             r11, r3,  r11, lsl #1
        vld1.16         {d1[3]}, [r11]
        lsl             r2,  r2,  #1 // shift back the state as if we'd done increment_seed with shift=0
        vrshl.s16       d1,  d1,  d30
        vmovn.i16       d1,  q0
        vext.8          q2,  q2,  q2,  #12
.ifc \lag, lag3
        vmov.s8         r10, d1[5]
.endif
.ifnc \lag, lag1
        vmov.s8         r8,  d1[6]
.endif
        vmov.s8         r6,  d1[7]

        vmov            q1,  q2
        mov             r1,  #1
        bl              output_\lag\()_neon
.else
        increment_seed  4, shift=0
        vmov            q1,  q2
        mov             r1,  #4
        bl              output_\lag\()_neon
.endif

        increment_seed  4, shift=0
        vmov            q1,  q3
        mov             r1,  #4
        bl              output_\lag\()_neon

        increment_seed  4, shift=0
        vmov            q1,  q4
.if \elems == 9
        mov             r1,  #1
        bl              output_\lag\()_neon
        lsr             r2,  r2,  #3

        read_rand       r11, 11,  2
        read_rand       r12, 11,  1
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d2[0]}, [r11]
        read_rand       r11, 11,  0
        vld1.16         {d2[1]}, [r12]
        add             r11, r3,  r11, lsl #1
        vld1.16         {d2[2]}, [r11]
        vrshl.s16       d2,  d2,  d30
        vmovn.i16       d2,  q1
        vext.8          q0,  q0,  q1,  #7
.else
        mov             r1,  #4
        bl              output_\lag\()_neon

        increment_seed  4, shift=0
        vmov            q1,  q5

.ifc \edge, right
        mov             r1,  #3
        bl              output_\lag\()_neon
        read_shift_rand r11, 11
        add             r11, r3,  r11, lsl #1
        vld1.16         {d2[0]}, [r11]
        vrshl.s16       d2,  d2,  d30
        vext.8          q0,  q0,  q1,  #1
.else
        mov             r1,  #4
        bl              output_\lag\()_neon
.endif
.endif
.if \store
        vst1.8          {q0}, [r0]!
.endif
        pop             {r11}
        pop             {r1, pc}
.endif
.endm
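// For the chroma planes the AR input includes a luma term: the uv_420 and
// uv_422 paths in the macro above first average each 2x2 (or 2x1) block of
// luma grain with vpaddl + vrshrn before multiplying by the luma
// coefficient. A rough C sketch of the 420 case, with illustrative names:
//
//     int luma = round2(luma_grain[y2][x2]     + luma_grain[y2][x2 + 1] +
//                       luma_grain[y2 + 1][x2] + luma_grain[y2 + 1][x2 + 1],
//                       2);                     // x2 = 2*x, y2 = 2*y
//     sum += luma * ar_coeff_uv_last;           // vmull.s8 with d13
//
// For 422 only the horizontal pair is averaged (round2(.., 1)); for 444 the
// luma grain sample is used as is.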

.macro sum_lag1_func type, uv_layout, edge, elems=16
function sum_\type\()_lag1_\edge\()_neon
        push            {r1, lr}
        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems, store=0
endfunc
.endm

sum_lag1_func y, 0, left
sum_lag1_func y, 0, mid
sum_lag1_func y, 0, right, 15
sum_lag1_func uv_444, 444, left
sum_lag1_func uv_444, 444, mid
sum_lag1_func uv_444, 444, right, 15
sum_lag1_func uv_422, 422, left
sum_lag1_func uv_422, 422, mid
sum_lag1_func uv_422, 422, right, 9
sum_lag1_func uv_420, 420, left
sum_lag1_func uv_420, 420, mid
sum_lag1_func uv_420, 420, right, 9

.macro sum_lag1 type, dst, left, mid, right, edge=mid
        vmov            q3,  \mid
        vext.8          q0,  \left, \mid, #15
        vext.8          q1,  \mid, \right, #1
        bl              sum_\type\()_lag1_\edge\()_neon
        vmov            \dst, q0
.endm

.macro sum_y_lag1 dst, left, mid, right, edge=mid
        sum_lag1        y, \dst, \left, \mid, \right, \edge
.endm

.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
        sum_lag1        uv_444, \dst, \left, \mid, \right, \edge
.endm

.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
        sum_lag1        uv_422, \dst, \left, \mid, \right, \edge
.endm

.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
        sum_lag1        uv_420, \dst, \left, \mid, \right, \edge
.endm


function sum_lag2_above_neon
        push            {lr}
        sub             r12, r0,  #2*GRAIN_WIDTH - 16
        sub             lr,  r0,  #1*GRAIN_WIDTH - 16
        vld1.8          {q10}, [r12] // load top right
        vld1.8          {q13}, [lr]

        vext.8          q6,  q8,  q9,  #14 // top left, top mid
        vdup.8          d14, d28[0]
        vext.8          q8,  q8,  q9,  #15
        vdup.8          d15, d28[1]

        vmull.s8        q0,  d12, d14
        vmull.s8        q1,  d13, d14
        vmull.s8        q6,  d16, d15
        vmull.s8        q8,  d17, d15

        vaddl.s16       q2,  d0,  d12
        vaddl.s16       q3,  d1,  d13
        vaddl.s16       q4,  d2,  d16
        vaddl.s16       q5,  d3,  d17

        vext.8          q6,  q9,  q10, #1 // top mid, top right
        vdup.8          d14, d28[3]
        vext.8          q8,  q9,  q10, #2
        vdup.8          d15, d28[4]

        vmull.s8        q0,  d12, d14
        vmull.s8        q1,  d13, d14
        vmull.s8        q6,  d16, d15
        vmull.s8        q8,  d17, d15

        vaddl.s16       q7,  d0,  d12
        vaddl.s16       q0,  d1,  d13
        vaddl.s16       q6,  d2,  d16
        vaddl.s16       q1,  d3,  d17

        vadd.i32        q2,  q2,  q7
        vadd.i32        q3,  q3,  q0
        vadd.i32        q4,  q4,  q6
        vadd.i32        q5,  q5,  q1

        vext.8          q6,  q11, q12, #14 // top left, top mid
        vdup.8          d14, d28[5]
        vext.8          q8,  q11, q12, #15
        vdup.8          d15, d28[6]

        vmull.s8        q0,  d12, d14
        vmull.s8        q1,  d13, d14
        vmull.s8        q6,  d16, d15
        vmull.s8        q8,  d17, d15

        vaddl.s16       q7,  d0,  d12
        vaddl.s16       q0,  d1,  d13
        vaddl.s16       q6,  d2,  d16
        vaddl.s16       q1,  d3,  d17

        vadd.i32        q2,  q2,  q7
        vadd.i32        q3,  q3,  q0
        vadd.i32        q4,  q4,  q6
        vadd.i32        q5,  q5,  q1

        vext.8          q6,  q12, q13, #1 // top mid, top right
        vdup.8          d14, d29[0]
        vext.8          q8,  q12, q13, #2
        vdup.8          d15, d29[1]

        vmull.s8        q0,  d12, d14
        vmull.s8        q1,  d13, d14
        vmull.s8        q6,  d16, d15
        vmull.s8        q8,  d17, d15

        vaddl.s16       q7,  d0,  d12
        vaddl.s16       q0,  d1,  d13
        vaddl.s16       q6,  d2,  d16
        vaddl.s16       q1,  d3,  d17

        vadd.i32        q2,  q2,  q7
        vadd.i32        q3,  q3,  q0
        vadd.i32        q4,  q4,  q6
        vadd.i32        q5,  q5,  q1

        vdup.8          d14, d28[2]
        vdup.8          d15, d28[7]

        vmull.s8        q0,  d18, d14
        vmull.s8        q1,  d19, d14
        vmull.s8        q6,  d24, d15
        vmull.s8        q8,  d25, d15

        vaddl.s16       q7,  d0,  d12
        vaddl.s16       q0,  d1,  d13
        vaddl.s16       q6,  d2,  d16
        vaddl.s16       q1,  d3,  d17

        vmov            q8,  q9
        vmov            q9,  q10

        vadd.i32        q2,  q2,  q7
        vadd.i32        q3,  q3,  q0
        vadd.i32        q4,  q4,  q6
        vadd.i32        q5,  q5,  q1

        vmov            q11, q12
        vmov            q12, q13

        pop             {pc}
endfunc

.macro sum_lag2_func type, uv_layout, edge, elems=16
function sum_\type\()_lag2_\edge\()_neon
        push            {r1, lr}
.ifc \edge, left
        sub             r12, r0,  #2*GRAIN_WIDTH
        sub             lr,  r0,  #1*GRAIN_WIDTH
        vld1.8          {q9},  [r12] // load the previous block right above
        vld1.8          {q12}, [lr]
.endif
        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[4]
endfunc
.endm

sum_lag2_func y, 0, left
sum_lag2_func y, 0, mid
sum_lag2_func y, 0, right, 15
sum_lag2_func uv_444, 444, left
sum_lag2_func uv_444, 444, mid
sum_lag2_func uv_444, 444, right, 15
sum_lag2_func uv_422, 422, left
sum_lag2_func uv_422, 422, mid
sum_lag2_func uv_422, 422, right, 9
sum_lag2_func uv_420, 420, left
sum_lag2_func uv_420, 420, mid
sum_lag2_func uv_420, 420, right, 9


function sum_lag3_left_above_neon
        // A separate codepath for the left edge, to avoid reading outside
        // of the edge of the buffer.
        sub             r12, r0,  #3*GRAIN_WIDTH
        vld1.8          {q11, q12}, [r12]
        vext.8          q12, q11, q12, #13
        vext.8          q11, q11, q11, #13
        b               sum_lag3_above_start
endfunc

function sum_lag3_above_neon
        sub             r12, r0,  #3*GRAIN_WIDTH + 3
        vld1.8          {q11, q12}, [r12]

sum_lag3_above_start:
        vdup.8          d20, d26[0]
        vext.8          q9,  q11, q12, #1
        vdup.8          d21, d26[1]

        vmull.s8        q0,  d22, d20
        vmull.s8        q1,  d23, d20
        vmull.s8        q6,  d18, d21
        vmull.s8        q7,  d19, d21

        vext.8          q8,  q11, q12, #2
        vdup.8          d20, d26[2]
        vext.8          q9,  q11, q12, #3
        vdup.8          d21, d26[3]

        vaddl.s16       q2,  d0,  d12
        vaddl.s16       q3,  d1,  d13
        vaddl.s16       q4,  d2,  d14
        vaddl.s16       q5,  d3,  d15

        vmull.s8        q0,  d16, d20
        vmull.s8        q1,  d17, d20
        vmull.s8        q6,  d18, d21
        vmull.s8        q7,  d19, d21

        vaddl.s16       q8,  d0,  d12
        vaddl.s16       q9,  d1,  d13
        vaddl.s16       q0,  d2,  d14
        vaddl.s16       q1,  d3,  d15

        vext.8          q6,  q11, q12, #4
        vdup.8          d20, d26[4]
        vext.8          q7,  q11, q12, #5
        vdup.8          d21, d26[5]

        vadd.i32        q2,  q2,  q8
        vadd.i32        q3,  q3,  q9
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d12, d20
        vmull.s8        q1,  d13, d20
        vmull.s8        q8,  d14, d21
        vmull.s8        q9,  d15, d21

        sub             r12, r0,  #2*GRAIN_WIDTH + 3

        vaddl.s16       q6,  d0,  d16
        vaddl.s16       q7,  d1,  d17
        vaddl.s16       q0,  d2,  d18
        vaddl.s16       q1,  d3,  d19

        vext.8          q8,  q11, q12, #6
        vld1.8          {q11, q12}, [r12]
        vdup.8          d20, d26[6]
        vdup.8          d21, d26[7]

        vadd.i32        q2,  q2,  q6
        vadd.i32        q3,  q3,  q7
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d16, d20
        vmull.s8        q1,  d17, d20
        vmull.s8        q6,  d22, d21
        vmull.s8        q7,  d23, d21

        vaddl.s16       q8,  d0,  d12
        vaddl.s16       q9,  d1,  d13
        vaddl.s16       q0,  d2,  d14
        vaddl.s16       q1,  d3,  d15

        vext.8          q6,  q11, q12, #1
        vdup.8          d20, d27[0]
        vext.8          q7,  q11, q12, #2
        vdup.8          d21, d27[1]

        vadd.i32        q2,  q2,  q8
        vadd.i32        q3,  q3,  q9
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d12, d20
        vmull.s8        q1,  d13, d20
        vmull.s8        q8,  d14, d21
        vmull.s8        q9,  d15, d21

        vaddl.s16       q6,  d0,  d16
        vaddl.s16       q7,  d1,  d17
        vaddl.s16       q0,  d2,  d18
        vaddl.s16       q1,  d3,  d19

        vext.8          q8,  q11, q12, #3
        vdup.8          d20, d27[2]
        vext.8          q9,  q11, q12, #4
        vdup.8          d21, d27[3]

        vadd.i32        q2,  q2,  q6
        vadd.i32        q3,  q3,  q7
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d16, d20
        vmull.s8        q1,  d17, d20
        vmull.s8        q6,  d18, d21
        vmull.s8        q7,  d19, d21

        sub             r12, r0,  #1*GRAIN_WIDTH + 3

        vaddl.s16       q8,  d0,  d12
        vaddl.s16       q9,  d1,  d13
        vaddl.s16       q0,  d2,  d14
        vaddl.s16       q1,  d3,  d15

        vext.8          q6,  q11, q12, #5
        vdup.8          d20, d27[4]
        vext.8          q7,  q11, q12, #6
        vdup.8          d21, d27[5]

        vld1.8          {q11, q12}, [r12]

        vadd.i32        q2,  q2,  q8
        vadd.i32        q3,  q3,  q9
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d12, d20
        vmull.s8        q1,  d13, d20
        vmull.s8        q8,  d14, d21
        vmull.s8        q9,  d15, d21

        vaddl.s16       q6,  d0,  d16
        vaddl.s16       q7,  d1,  d17
        vaddl.s16       q0,  d2,  d18
        vaddl.s16       q1,  d3,  d19

        vdup.8          d20, d27[6]
        vext.8          q9,  q11, q12, #1
        vdup.8          d21, d27[7]

        vadd.i32        q2,  q2,  q6
        vadd.i32        q3,  q3,  q7
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d22, d20
        vmull.s8        q1,  d23, d20
        vmull.s8        q6,  d18, d21
        vmull.s8        q7,  d19, d21

        vaddl.s16       q8,  d0,  d12
        vaddl.s16       q9,  d1,  d13
        vaddl.s16       q0,  d2,  d14
        vaddl.s16       q1,  d3,  d15

        vext.8          q6,  q11, q12, #2
        vdup.8          d20, d28[0]
        vext.8          q7,  q11, q12, #3
        vdup.8          d21, d28[1]

        vadd.i32        q2,  q2,  q8
        vadd.i32        q3,  q3,  q9
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d12, d20
        vmull.s8        q1,  d13, d20
        vmull.s8        q8,  d14, d21
        vmull.s8        q9,  d15, d21

        vaddl.s16       q6,  d0,  d16
        vaddl.s16       q7,  d1,  d17
        vaddl.s16       q0,  d2,  d18
        vaddl.s16       q1,  d3,  d19

        vext.8          q8,  q11, q12, #4
        vdup.8          d20, d28[2]
        vext.8          q9,  q11, q12, #5
        vdup.8          d21, d28[3]

        vadd.i32        q2,  q2,  q6
        vadd.i32        q3,  q3,  q7
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d16, d20
        vmull.s8        q1,  d17, d20
        vmull.s8        q6,  d18, d21
        vmull.s8        q7,  d19, d21

        vaddl.s16       q8,  d0,  d12
        vaddl.s16       q9,  d1,  d13
        vaddl.s16       q0,  d2,  d14
        vaddl.s16       q1,  d3,  d15

        vext.8          q6,  q11, q12, #6
        vdup.8          d20, d28[4]

        vadd.i32        q2,  q2,  q8
        vadd.i32        q3,  q3,  q9
        vadd.i32        q4,  q4,  q0
        vadd.i32        q5,  q5,  q1

        vmull.s8        q0,  d12, d20
        vmull.s8        q1,  d13, d20

        vaddw.s16       q2,  q2,  d0
        vaddw.s16       q3,  q3,  d1
        vaddw.s16       q4,  q4,  d2
        vaddw.s16       q5,  q5,  d3

        bx              lr
endfunc

.macro sum_lag3_func type, uv_layout, edge, elems=16
function sum_\type\()_lag3_\edge\()_neon
        push            {r1, lr}
        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[0]
endfunc
.endm

sum_lag3_func y, 0, left
sum_lag3_func y, 0, mid
sum_lag3_func y, 0, right, 15
sum_lag3_func uv_444, 444, left
sum_lag3_func uv_444, 444, mid
sum_lag3_func uv_444, 444, right, 15
sum_lag3_func uv_422, 422, left
sum_lag3_func uv_422, 422, mid
sum_lag3_func uv_422, 422, right, 9
sum_lag3_func uv_420, 420, left
sum_lag3_func uv_420, 420, mid
sum_lag3_func uv_420, 420, right, 9

function generate_grain_rows_neon
        push            {r11,lr}
1:
        get_grain_row   d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
        subs            r1,  r1,  #1
        store_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
        bgt             1b
        pop             {r11,pc}
endfunc

function generate_grain_rows_44_neon
        push            {r11,lr}
1:
        get_grain_row_44 d16, d17, d18, d19, d20, d21
        subs            r1,  r1,  #1
        store_grain_row_44 d16, d17, d18, d19, d20, d21
        bgt             1b
        pop             {r11,pc}
endfunc
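// The lag0 chroma helpers below don't run an AR filter of their own; they
// take already-generated luma grain, scale it by ar_coeffs_uv[0] and add
// fresh gaussian noise, saturating to the 8 bpc grain range. A rough C
// sketch of one sample, with illustrative names:
//
//     int noise = round2(gaussian_sequence[get_rand(11)],
//                        4 + grain_scale_shift);
//     int g = round2(luma_grain * ar_coeff_uv_0, ar_coeff_shift); // vrshl
//     uv_grain = iclip(noise + g, -128, 127);                     // vqmovn
//
// The q13/q14 masks built by the callers zero the luma term outside the
// valid part of the row, so those lanes get plain gaussian grain.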

function gen_grain_uv_444_lag0_neon
        vld1.8          {q3}, [r11]!
        push            {r11,lr}
        bl              get_gaussian_neon
        vrshl.s16       q8,  q0,  q15
        bl              get_gaussian_neon
        vrshl.s16       q9,  q0,  q15
        vqmovn.s16      d0,  q8
        vqmovn.s16      d1,  q9

        vand            q3,  q3,  q1
        vmull.s8        q2,  d6,  d22
        vmull.s8        q3,  d7,  d22
        vrshl.s16       q2,  q2,  q12
        vrshl.s16       q3,  q3,  q12
        vaddw.s8        q2,  q2,  d0
        vaddw.s8        q3,  q3,  d1
        vqmovn.s16      d4,  q2
        vqmovn.s16      d5,  q3
        vst1.8          {q2}, [r0]!
        pop             {r11,pc}
endfunc

function get_grain_row_44_neon
        push            {r11,lr}
        get_grain_row_44 d16, d17, d18, d19, d20, d21
        pop             {r11,pc}
endfunc

function add_uv_420_coeff_lag0_neon
        vld1.16         {q2, q3}, [r11]!
        vld1.16         {q4, q5}, [r12]!
        vpaddl.s8       q2,  q2
        vpaddl.s8       q3,  q3
        vpaddl.s8       q4,  q4
        vpaddl.s8       q5,  q5
        vadd.i16        q2,  q2,  q4
        vadd.i16        q3,  q3,  q5
        vrshrn.s16      d4,  q2,  #2
        vrshrn.s16      d5,  q3,  #2
        b               add_coeff_lag0_start
endfunc

function add_uv_422_coeff_lag0_neon
        vld1.16         {q2, q3}, [r11]!
        vpaddl.s8       q2,  q2
        vpaddl.s8       q3,  q3
        vrshrn.s16      d4,  q2,  #1
        vrshrn.s16      d5,  q3,  #1

add_coeff_lag0_start:
        vand            q3,  q2,  q1
        vmull.s8        q2,  d6,  d22
        vmull.s8        q3,  d7,  d22
        vrshl.s16       q2,  q2,  q12
        vrshl.s16       q3,  q3,  q12
        vaddw.s8        q2,  q2,  d0
        vaddw.s8        q3,  q3,  d1
        vqmovn.s16      d4,  q2
        vqmovn.s16      d5,  q3
        bx              lr
endfunc

.macro gen_grain_82 type
function generate_grain_\type\()_8bpc_neon, export=1
        push            {r4-r11,lr}

.ifc \type, uv_444
        mov             r12, r3
        mov             lr,  #28
        add             r11, r1,  #3*GRAIN_WIDTH
        mov             r1,  r2
        mul             r12, r12, lr
.endif
        movrel          r3,  X(gaussian_sequence)
        ldr             r2,  [r1, #FGD_SEED]
        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
.ifc \type, y
        add             r4,  r1,  #FGD_AR_COEFFS_Y
.else
        add             r4,  r1,  #FGD_AR_COEFFS_UV
.endif
        adr             r5,  L(gen_grain_\type\()_tbl)
        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
        add             r9,  r9,  #4
        ldr             r6,  [r5, r6, lsl #2]
        vdup.16         q15, r9 // 4 + data->grain_scale_shift
        add             r5,  r5,  r6
        vneg.s16        q15, q15

.ifc \type, uv_444
        cmp             r12, #0
        movw            r10, #0x49d8
        movw            lr,  #0xb524
        // Intentionally using a separate register instead of moveq with an
        // immediate constant, to avoid armv8 deprecated it instruction forms.
        it              eq
        moveq           r10, lr
        add             r4,  r4,  r12 // Add offset to ar_coeffs_uv[1]
        eor             r2,  r2,  r10
.endif

        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
        mov             r8,  #1
        mov             r10, #1
        lsl             r8,  r8,  r7  // 1 << ar_coeff_shift
        lsl             r10, r10, r9  // 1 << (4 + data->grain_scale_shift)
        lsr             r8,  r8,  #1  // 1 << (ar_coeff_shift - 1)
        lsr             r10, r10, #1  // 1 << (4 + data->grain_scale_shift - 1)

        bx              r5

        .align 2
L(gen_grain_\type\()_tbl):
        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB

L(generate_grain_\type\()_lag0):
.ifc \type, y
        mov             r1,  #GRAIN_HEIGHT
        bl              generate_grain_rows_neon
.else

        mov             r1,  #3
        bl              generate_grain_rows_neon
        mov             r1,  #GRAIN_HEIGHT-3

        vdup.16         q12, r7
        vld1.8          {d22[]}, [r4] // ar_coeffs_uv[0]
        vmov.i8         q0,  #0
        vmov.i8         q1,  #255
        vext.8          q13, q0,  q1,  #13
        vext.8          q14, q1,  q0,  #1
        vneg.s16        q12, q12

1:
        vmov            q1,  q13
        bl              gen_grain_uv_444_lag0_neon // 16
        vmov.i8         q1,  #255
        bl              gen_grain_uv_444_lag0_neon // 32
        bl              gen_grain_uv_444_lag0_neon // 48
        bl              gen_grain_uv_444_lag0_neon // 64
        vmov            q1,  q14
        bl              gen_grain_uv_444_lag0_neon // 80
        get_grain_2     d16
        subs            r1,  r1,  #1
        add             r11, r11, #2
        vst1.16         {d16[0]}, [r0]!
        bgt             1b
.endif
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag1):
        vpush           {q4-q7}
        mov             r5,  #127
        vld1.8          {d27[]}, [r4]! // ar_coeffs_y[0]
        vld1.8          {d28[]}, [r4]! // ar_coeffs_y[1]
        vld1.8          {d29[]}, [r4]  // ar_coeffs_y[2]
.ifc \type, y
        ldrsb           r4,  [r4, #1]  // ar_coeffs_y[3]
.else
        add             r4,  r4,  #2
.endif

        mov             r1,  #3
.ifc \type, uv_444
        vld1.8          {d13[]}, [r4]  // ar_coeffs_uv[4]
        ldrsb           r4,  [r4, #-1] // ar_coeffs_uv[3]
.endif
        bl              generate_grain_rows_neon

        mov             r1,  #GRAIN_HEIGHT - 3
1:
        sum_\type\()_lag1 q7,  q8,  q8,  q9,  left
        sum_\type\()_lag1 q8,  q8,  q9,  q10
        sum_\type\()_lag1 q9,  q9,  q10, q11
        sum_\type\()_lag1 q10, q10, q11, q12
        sum_\type\()_lag1 q12, q11, q12, q13, right
        get_grain_2     d26
        subs            r1,  r1,  #1
.ifc \type, uv_444
        add             r11, r11, #2
.endif
        store_grain_row d14, d15, d16, d17, d18, d19, d20, d21, d24, d25, d26
        vmov            q11, q10
        vmov            q10, q9
        vmov            q9,  q8
        vmov            q8,  q7
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag2):
        vpush           {q4-q7}
        mov             r5,  #127
        vld1.8          {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]

        vmov.s8         r4,  d29[2]
        vmov.s8         r10, d29[3]

        mov             r1,  #3
        bl              generate_grain_rows_neon

        mov             r1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag2_left_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_right_neon
        get_grain_2     d16
        subs            r1,  r1,  #1
.ifc \type, uv_444
        add             r11, r11, #2
.endif
        vst1.16         {d16[0]}, [r0]!
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag3):
        vpush           {q4-q7}
        mov             r5,  #127
        vld1.8          {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]

        vmov.u8         r4,  d28[5]
        vmov.u8         r10, d28[6]
        vmov.u8         r12, d28[7]

        orr             r4,  r4,  r10, lsl #8
        orr             r4,  r4,  r12, lsl #16

        mov             r1,  #3
        vpush           {d26}
        bl              generate_grain_rows_neon
        vpop            {d26}

        mov             r1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag3_left_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_right_neon
        get_grain_2     d16
        subs            r1,  r1,  #1
.ifc \type, uv_444
        add             r11, r11, #2
.endif
        vst1.16         {d16[0]}, [r0]!
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
.endm

gen_grain_82 y
gen_grain_82 uv_444

.macro set_height dst, type
.ifc \type, uv_420
        mov             \dst, #SUB_GRAIN_HEIGHT-3
.else
        mov             \dst, #GRAIN_HEIGHT-3
.endif
.endm

.macro increment_y_ptr reg, type
.ifc \type, uv_420
        add             \reg, \reg, #2*GRAIN_WIDTH-(3*32)
.else
        sub             \reg, \reg, #3*32-GRAIN_WIDTH
.endif
.endm

.macro gen_grain_44 type
function generate_grain_\type\()_8bpc_neon, export=1
        push            {r4-r11,lr}

        mov             r12, r3
        mov             lr,  #28
        add             r11, r1,  #3*GRAIN_WIDTH-3
        mov             r1,  r2
        mul             r12, r12, lr

        movrel          r3,  X(gaussian_sequence)
        ldr             r2,  [r1, #FGD_SEED]
        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
        add             r4,  r1,  #FGD_AR_COEFFS_UV
        adr             r5,  L(gen_grain_\type\()_tbl)
        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
        add             r9,  r9,  #4
        ldr             r6,  [r5, r6, lsl #2]
        vdup.16         q15, r9 // 4 + data->grain_scale_shift
        add             r5,  r5,  r6
        vneg.s16        q15, q15

        cmp             r12, #0
        movw            r10, #0x49d8
        movw            lr,  #0xb524
        // Intentionally using a separate register instead of moveq with an
        // immediate constant, to avoid armv8 deprecated it instruction forms.
        it              eq
        moveq           r10, lr
        add             r4,  r4,  r12 // Add offset to ar_coeffs_uv[1]
        eor             r2,  r2,  r10

        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
        mov             r8,  #1
        mov             r10, #1
        lsl             r8,  r8,  r7  // 1 << ar_coeff_shift
        lsl             r10, r10, r9  // 1 << (4 + data->grain_scale_shift)
        lsr             r8,  r8,  #1  // 1 << (ar_coeff_shift - 1)
        lsr             r10, r10, #1  // 1 << (4 + data->grain_scale_shift - 1)
        bx              r5

        .align 2
L(gen_grain_\type\()_tbl):
        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB

L(generate_grain_\type\()_lag0):
.ifc \type, uv_420
        vpush           {q4-q5}
.endif
        mov             r1,  #3
        bl              generate_grain_rows_44_neon
        set_height      r1,  \type

        vdup.16         q12, r7
        vld1.8          {d22[]}, [r4] // ar_coeffs_uv[0]
        vmov.i8         q0,  #0
        vmov.i8         q1,  #255
        vext.8          q13, q0,  q1,  #13
        vext.8          q14, q1,  q0,  #7
        vneg.s16        q12, q12

1:
        bl              get_grain_row_44_neon
.ifc \type, uv_420
        add             r12, r11, #GRAIN_WIDTH
.endif
        vmov            q1,  q13
        vmov            q0,  q8
        bl              add_\type\()_coeff_lag0_neon
        vmov.i8         q1,  #255
        vmov            q0,  q9
        vmov            q8,  q2
        bl              add_\type\()_coeff_lag0_neon
        vmov.i8         q1,  q14
        vmov            q0,  q10
        vmov            q9,  q2
        bl              add_\type\()_coeff_lag0_neon
        vmov            q10, q2
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        store_grain_row_44 d16, d17, d18, d19, d20, d21
        bgt             1b

.ifc \type, uv_420
        vpop            {q4-q5}
.endif
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag1):
        vpush           {q4-q7}
        mov             r5,  #127
        vld1.8          {d27[]}, [r4]! // ar_coeffs_uv[0]
        vld1.8          {d28[]}, [r4]! // ar_coeffs_uv[1]
        vld1.8          {d29[]}, [r4]  // ar_coeffs_uv[2]
        add             r4,  r4,  #2

        mov             r1,  #3
        vld1.8          {d13[]}, [r4]  // ar_coeffs_uv[4]
        ldrsb           r4,  [r4, #-1] // ar_coeffs_uv[3]
        bl              generate_grain_rows_44_neon

        set_height      r1,  \type
1:
        sum_\type\()_lag1 q7,  q8,  q8,  q9,  left
        sum_\type\()_lag1 q8,  q8,  q9,  q10
        sum_\type\()_lag1 q10, q9,  q10, q11, right
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        store_grain_row_44 d14, d15, d16, d17, d20, d21
        vmov            q9,  q8
        vmov            q8,  q7
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag2):
        vpush           {q4-q7}
        mov             r5,  #127
        vld1.8          {d28,d29}, [r4] // ar_coeffs_uv[0-12]

        vmov.s8         r4,  d29[2]
        vmov.s8         r10, d29[3]

        mov             r1,  #3
        bl              generate_grain_rows_44_neon

        set_height      r1,  \type
1:
        bl              sum_\type\()_lag2_left_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_right_neon
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH-48
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag3):
        vpush           {q4-q7}
        mov             r5,  #127
        vld1.8          {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]

        vmov.u8         r4,  d28[5]
        vmov.u8         r10, d28[6]
        vmov.u8         r12, d28[7]

        orr             r4,  r4,  r10, lsl #8
        orr             r4,  r4,  r12, lsl #16

        mov             r1,  #3
        bl              generate_grain_rows_44_neon

        set_height      r1,  \type
1:
        bl              sum_\type\()_lag3_left_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_right_neon
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH-48
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
.endm

gen_grain_44 uv_420
gen_grain_44 uv_422

.macro gather_interleaved dst1, dst2, src1, src2, off
        vmov.u8         r11, \src1[0+\off]
        vmov.u8         r12, \src2[0+\off]
        add             r11, r11, r3
        vmov.u8         lr,  \src1[2+\off]
        add             r12, r12, r3
        vld1.8          {\dst1[0+\off]}, [r11]
        vmov.u8         r11, \src2[2+\off]
        add             lr,  lr,  r3
        vld1.8          {\dst2[0+\off]}, [r12]
        vmov.u8         r12, \src1[4+\off]
        add             r11, r11, r3
        vld1.8          {\dst1[2+\off]}, [lr]
        vmov.u8         lr,  \src2[4+\off]
        add             r12, r12, r3
        vld1.8          {\dst2[2+\off]}, [r11]
        vmov.u8         r11, \src1[6+\off]
        add             lr,  lr,  r3
        vld1.8          {\dst1[4+\off]}, [r12]
        vmov.u8         r12, \src2[6+\off]
        add             r11, r11, r3
        vld1.8          {\dst2[4+\off]}, [lr]
        add             r12, r12, r3
        vld1.8          {\dst1[6+\off]}, [r11]
        vld1.8          {\dst2[6+\off]}, [r12]
.endm

.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4
        gather_interleaved \dst1, \dst3, \src1, \src3, 0
        gather_interleaved \dst1, \dst3, \src1, \src3, 1
        gather_interleaved \dst2, \dst4, \src2, \src4, 0
        gather_interleaved \dst2, \dst4, \src2, \src4, 1
.endm

function gather32_neon
        push            {r11-r12,lr}
        gather          d8,  d9,  d10, d11, d0, d1, d2, d3
        pop             {r11-r12,pc}
endfunc

function gather16_neon
        push            {r11-r12,lr}
        gather_interleaved d8, d9, d0, d1, 0
        gather_interleaved d8, d9, d0, d1, 1
        pop             {r11-r12,pc}
endfunc

const overlap_coeffs_0, align=4
        .byte           27, 17, 0,  0,  0,  0,  0,  0
        .byte           17, 27, 32, 32, 32, 32, 32, 32
endconst

const overlap_coeffs_1, align=4
        .byte           23, 0,  0,  0,  0,  0,  0,  0
        .byte           22, 32, 32, 32, 32, 32, 32, 32
endconst

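// The tables above hold the film grain edge overlap weights: (27, 17) and
// (17, 27) for full-resolution overlaps, (23, 22) for subsampled ones. The
// second row continues with 32s so that, after the rounded shift by 5,
// lanes beyond the overlap region pass the current grain through unchanged
// (g * 32 >> 5 == g). A rough C sketch of one overlapped sample, with
// illustrative names:
//
//     int g = round2(old_grain * 27 + cur_grain * 17, 5);
//     g = iclip(g, grain_min, grain_max); // vqrshrn.s16 #5 saturates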
.macro calc_offset offx, offy, src, sx, sy
        and             \offy, \src,  #0xF  // randval & 0xF
        lsr             \offx, \src,  #4    // randval >> 4
.if \sy == 0
        add             \offy, \offy, \offy // 2 * (randval & 0xF)
.endif
.if \sx == 0
        add             \offx, \offx, \offx // 2 * (randval >> 4)
.endif
.endm

.macro add_offset dst, offx, offy, src, stride
        mla             \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
        add             \dst, \dst,  \offx         // grain_lut += offx
.endm

// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
//                                const ptrdiff_t stride,
//                                const uint8_t scaling[SCALING_SIZE],
//                                const int scaling_shift,
//                                const entry grain_lut[][GRAIN_WIDTH],
//                                const int offsets[][2],
//                                const int h, const ptrdiff_t clip,
//                                const ptrdiff_t type);
function fgy_32x32_8bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]  // scaling_shift, grain_lut
        ldrd            r6,  r7,  [sp, #108]  // offsets, h
        ldr             r8,  [sp, #116]       // clip
        mov             r9,  #GRAIN_WIDTH     // grain_lut stride

        neg             r4,  r4
        vdup.16         q13, r4               // -scaling_shift
        cmp             r8,  #0

        movrel_local    r12, overlap_coeffs_0

        beq             1f
        // clip
        vmov.i8         q14, #16
        vmov.i8         q15, #235
        b               2f
1:
        // no clip
        vmov.i8         q14, #0
        vmov.i8         q15, #255
2:

        vld1.8          {d24, d25}, [r12, :128] // overlap_coeffs

        add             r5,  r5,  #9          // grain_lut += 9
        add             r5,  r5,  r9,  lsl #3 // grain_lut += 8 * grain_stride
        add             r5,  r5,  r9          // grain_lut += grain_stride

        ldr             r10, [r6, #8]         // offsets[1][0]
        calc_offset     r10, r4,  r10, 0, 0
        add_offset      r4,  r10, r4,  r5, r9
        ldr             r10, [r6, #4]         // offsets[0][1]
        calc_offset     r10, r11, r10, 0, 0
        add_offset      r11, r10, r11, r5, r9
        ldr             r10, [r6, #12]        // offsets[1][1]
        calc_offset     r10, r8,  r10, 0, 0
        add_offset      r8,  r10, r8,  r5, r9
        ldr             r6,  [r6]             // offsets[0][0]
        calc_offset     r6,  lr,  r6,  0, 0
        add_offset      r5,  r6,  lr,  r5, r9

        add             r4,  r4,  #32         // grain_lut += FG_BLOCK_SIZE * bx
        add             r6,  r11, r9,  lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by

        ldr             r10, [sp, #120]       // type
        adr             r11, L(fgy_loop_tbl)

        tst             r10, #1
        ldr             r10, [r11, r10, lsl #2]

        add             r8,  r8,  r9,  lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             r8,  r8,  #32         // grain_lut += FG_BLOCK_SIZE * bx

        add             r11, r11, r10

        beq             1f
        // y overlap
        vdup.8          d14, d24[0]
        vdup.8          d15, d24[1]
        mov             r10, r7               // backup actual h
        mov             r7,  #2
1:
        bx              r11
endfunc

function fgy_loop_neon
L(fgy_loop_tbl):
        .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
        .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
        .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
        .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB

.macro fgy ox, oy
L(loop_\ox\oy):
1:
.if \ox
        vld1.8          {d8},       [r4],       r9 // grain_lut old
.endif
.if \oy
        vld1.8          {q2, q3},   [r6],       r9 // grain_lut top
.endif
.if \ox && \oy
        vld1.8          {d10},      [r8],       r9 // grain_lut top old
.endif
        vld1.8          {q0, q1},   [r1, :128], r2 // src
        vld1.8          {q10, q11}, [r5],       r9 // grain_lut

.if \ox
        vmull.s8        q4,  d8,  d24
        vmlal.s8        q4,  d20, d25
.endif

.if \oy
.if \ox
        vmull.s8        q5,  d10, d24
        vmlal.s8        q5,  d4,  d25
        vqrshrn.s16     d20, q4,  #5
        vqrshrn.s16     d4,  q5,  #5
.endif

        vmull.s8        q4,  d20, d15
        vmull.s8        q5,  d21, d15
        vmull.s8        q8,  d22, d15
        vmull.s8        q9,  d23, d15
        vmlal.s8        q4,  d4,  d14
        vmlal.s8        q5,  d5,  d14
        vmlal.s8        q8,  d6,  d14
        vmlal.s8        q9,  d7,  d14
        vqrshrn.s16     d20, q4,  #5
        vqrshrn.s16     d21, q5,  #5
        vqrshrn.s16     d22, q8,  #5
        vqrshrn.s16     d23, q9,  #5
.elseif \ox
        vqrshrn.s16     d20, q4,  #5
.endif

        bl              gather32_neon

        vmovl.s8        q8,  d20 // grain
        vmovl.s8        q9,  d21
        vmovl.s8        q10, d22
        vmovl.s8        q11, d23

        vmovl.u8        q2,  d8  // scaling
        vmovl.u8        q3,  d9
        vmovl.u8        q4,  d10
        vmovl.u8        q5,  d11

        vmul.i16        q8,  q8,  q2 // scaling * grain
        vmul.i16        q9,  q9,  q3
        vmul.i16        q10, q10, q4
        vmul.i16        q11, q11, q5

        vrshl.s16       q8,  q8,  q13 // round2(scaling * grain, scaling_shift)
        vrshl.s16       q9,  q9,  q13
        vrshl.s16       q10, q10, q13
        vrshl.s16       q11, q11, q13

        vaddw.u8        q8,  q8,  d0 // *src + noise
        vaddw.u8        q9,  q9,  d1
        vaddw.u8        q10, q10, d2
        vaddw.u8        q11, q11, d3

        vqmovun.s16     d0,  q8
        vqmovun.s16     d1,  q9
        vqmovun.s16     d2,  q10
        vqmovun.s16     d3,  q11

        vmax.u8         q0,  q0,  q14
        vmax.u8         q1,  q1,  q14
        vmin.u8         q0,  q0,  q15
        vmin.u8         q1,  q1,  q15

        subs            r7,  r7,  #1
.if \oy
        vdup.8          d14, d25[0]
        vdup.8          d15, d25[1]
.endif
        vst1.8          {q0, q1}, [r0, :128], r2 // dst
        bgt             1b

.if \oy
        cmp             r10, #2
        sub             r7,  r10, #2 // restore actual remaining h
        bgt             L(loop_\ox\()0)
.endif
        vpop            {q4-q7}
        pop             {r4-r11,pc}
.endm

        fgy             0, 0
        fgy             0, 1
        fgy             1, 0
        fgy             1, 1
endfunc

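// Per pixel, the loops above implement the core film grain application:
// look up a scaling factor from the source pixel value, scale the grain,
// round, add it to the source and clamp (to [16, 235] when clipping to
// restricted range, else to the full 8-bit range). A rough C sketch, with
// illustrative names:
//
//     int noise = round2(scaling[src[x]] * grain[x], scaling_shift);
//     dst[x] = iclip(src[x] + noise, min_value, max_value);
//
// gather32_neon performs the per-lane scaling[] lookups, since NEON has no
// 256-entry table lookup instruction.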
// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
//                                     const pixel *const src,
//                                     const ptrdiff_t stride,
//                                     const uint8_t scaling[SCALING_SIZE],
//                                     const Dav1dFilmGrainData *const data,
//                                     const entry grain_lut[][GRAIN_WIDTH],
//                                     const pixel *const luma_row,
//                                     const ptrdiff_t luma_stride,
//                                     const int offsets[][2],
//                                     const ptrdiff_t h, const ptrdiff_t uv,
//                                     const ptrdiff_t is_id,
//                                     const ptrdiff_t type);
.macro fguv layout, sx, sy
function fguv_32x32_\layout\()_8bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]  // data, grain_lut
        ldrd            r6,  r7,  [sp, #108]  // luma_row, luma_stride
        ldrd            r8,  r9,  [sp, #116]  // offsets, h
        ldrd            r10, r11, [sp, #124]  // uv, is_id

        // !csfl
        add             r10, r4,  r10, lsl #2 // + 4*uv
        add             r12, r10, #FGD_UV_LUMA_MULT
        add             lr,  r10, #FGD_UV_MULT
        add             r10, r10, #FGD_UV_OFFSET
        vld1.16         {d4[]},  [r12]        // uv_luma_mult
        vld1.16         {d4[2]}, [r10]        // uv_offset
        vld1.16         {d4[1]}, [lr]         // uv_mult

        ldr             lr,  [r4, #FGD_SCALING_SHIFT]
        ldr             r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
        neg             lr,  lr               // -scaling_shift

        cmp             r12, #0
        vdup.16         q13, lr               // -scaling_shift

        beq             1f
        // clip
        cmp             r11, #0
        vmov.i8         q14, #16
        vmov.i8         q15, #240
        beq             2f
        // is_id
        vmov.i8         q15, #235
        b               2f
1:
        // no clip
        vmov.i8         q14, #0
        vmov.i8         q15, #255
2:

        mov             r10, #GRAIN_WIDTH     // grain_lut stride

        add             r5,  r5,  #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
.if \sy
        add             r5,  r5,  r10, lsl #2 // grain_lut += 4 * grain_stride
        add             r5,  r5,  r10, lsl #1 // grain_lut += 2 * grain_stride
.else
        add             r5,  r5,  r10, lsl #3 // grain_lut += 8 * grain_stride
        add             r5,  r5,  r10         // grain_lut += grain_stride
.endif

        ldr             r12, [r8, #8]         // offsets[1][0]
        calc_offset     r12, r4,  r12, \sx, \sy
        add_offset      r4,  r12, r4,  r5, r10

        ldr             r12, [r8, #4]         // offsets[0][1]
        calc_offset     r12, lr,  r12, \sx, \sy
        add_offset      lr,  r12, lr,  r5, r10

        ldr             r12, [r8, #12]        // offsets[1][1]
        calc_offset     r12, r11, r12, \sx, \sy
        add_offset      r11, r12, r11, r5, r10

        ldr             r8,  [r8]             // offsets[0][0]
        calc_offset     r8,  r12, r8,  \sx, \sy
        add_offset      r5,  r8,  r12, r5, r10

        add             r4,  r4,  #(32 >> \sx)        // grain_lut += FG_BLOCK_SIZE * bx
        add             r8,  lr,  r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             r11, r11, #(32 >> \sx)        // grain_lut += FG_BLOCK_SIZE * bx

        movrel_local    r12, overlap_coeffs_\sx
        ldr             lr,  [sp, #132]       // type

        vld1.8          {d24, d25}, [r12, :128] // overlap_coeffs

        movrel_local    r12, L(fguv_loop_sx\sx\()_tbl)
#if CONFIG_THUMB
        // This uses movrel_local instead of adr above, because the target
        // can be out of range for adr. But movrel_local leaves the thumb bit
        // set on COFF (but probably wouldn't if building for thumb on ELF),
        // thus try to clear the bit for robustness.
        bic             r12, r12, #1
#endif

        tst             lr,  #1
        ldr             lr,  [r12, lr, lsl #2]

        add             r12, r12, lr

        beq             1f
        // y overlap
        sub             lr,  r9,  #(2 >> \sy) // backup remaining h
        mov             r9,  #(2 >> \sy)

1:

.if \sy
        vmov.i8         d6,  #23
        vmov.i8         d7,  #22
.else
        vmov.i8         d6,  #27
        vmov.i8         d7,  #17
.endif

.if \sy
        add             r7,  r7,  r7          // luma_stride *= 2
.endif

        bx              r12
endfunc
.endm

fguv 420, 1, 1
fguv 422, 1, 0
fguv 444, 0, 0

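// In the loop bodies below, the chroma scaling lookup is indexed either by
// the (possibly subsampled) luma value directly (csfl, "chroma scaled from
// luma") or by a value mixed from luma and chroma. A rough C sketch of the
// !csfl index, with illustrative names (the multipliers and offset are the
// d4 vector loaded in the prologue above):
//
//     int t = (uv_luma_mult * luma + uv_mult * uv) >> 6;
//     int index = iclip_u8(t + uv_offset);
//
// with the sum saturated to 16 bit (vqadd.s16) before the shift, as in the
// code below.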
function fguv_loop_sx0_neon
L(fguv_loop_sx0_tbl):
        .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB

.macro fguv_loop_sx0 csfl, ox, oy
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
.if \oy
        mov             r12, lr
.endif
1:
.if \ox
        vld1.8          {d8},       [r4],       r10 // grain_lut old
.endif
.if \oy
        vld1.8          {q8, q9},   [r8],       r10 // grain_lut top
.endif
.if \ox && \oy
        vld1.8          {d10},      [r11],      r10 // grain_lut top old
.endif
        vld1.8          {q0, q1},   [r6, :128], r7  // luma
        vld1.8          {q10, q11}, [r5],       r10 // grain_lut

.if \ox
        vmull.s8        q4,  d8,  d24
        vmlal.s8        q4,  d20, d25
.endif

.if \oy
.if \ox
        vmull.s8        q5,  d10, d24
        vmlal.s8        q5,  d16, d25
        vqrshrn.s16     d20, q4,  #5
        vqrshrn.s16     d16, q5,  #5
.endif

        vmull.s8        q4,  d20, d7
        vmull.s8        q5,  d21, d7
        vmull.s8        q6,  d22, d7
        vmull.s8        q7,  d23, d7
        vmlal.s8        q4,  d16, d6
        vmlal.s8        q5,  d17, d6
        vmlal.s8        q6,  d18, d6
        vmlal.s8        q7,  d19, d6
        vqrshrn.s16     d20, q4,  #5
        vqrshrn.s16     d21, q5,  #5
        vqrshrn.s16     d22, q6,  #5
        vqrshrn.s16     d23, q7,  #5
.elseif \ox
        vqrshrn.s16     d20, q4,  #5
.endif
.if !\csfl
        vld1.8          {q8, q9},  [r1, :128] // src
        vmovl.u8        q4,  d0
        vmovl.u8        q5,  d1
        vmovl.u8        q6,  d2
        vmovl.u8        q7,  d3
        vmovl.u8        q0,  d16
        vmovl.u8        q1,  d17
        vmovl.u8        q8,  d18
        vmovl.u8        q9,  d19
        vmul.i16        q4,  q4,  d4[0]
        vmul.i16        q5,  q5,  d4[0]
        vmul.i16        q6,  q6,  d4[0]
        vmul.i16        q7,  q7,  d4[0]
        vmul.i16        q0,  q0,  d4[1]
        vmul.i16        q1,  q1,  d4[1]
        vmul.i16        q8,  q8,  d4[1]
        vmul.i16        q9,  q9,  d4[1]
        vqadd.s16       q4,  q4,  q0
        vqadd.s16       q5,  q5,  q1
        vqadd.s16       q6,  q6,  q8
        vqadd.s16       q7,  q7,  q9
        vdup.16         q0,  d4[2]
        vshr.s16        q4,  q4,  #6
        vshr.s16        q5,  q5,  #6
        vshr.s16        q6,  q6,  #6
        vshr.s16        q7,  q7,  #6
        vadd.i16        q4,  q4,  q0
        vadd.i16        q5,  q5,  q0
        vadd.i16        q6,  q6,  q0
        vadd.i16        q7,  q7,  q0
        vqmovun.s16     d0,  q4
        vqmovun.s16     d1,  q5
        vqmovun.s16     d2,  q6
        vqmovun.s16     d3,  q7
.endif

        bl              gather32_neon

        vld1.8          {q0, q1},   [r1, :128], r2 // src

        vmovl.s8        q8,  d20 // grain
        vmovl.s8        q9,  d21
        vmovl.s8        q10, d22
        vmovl.s8        q11, d23

        vmovl.u8        q6,  d8  // scaling
        vmovl.u8        q7,  d9
        vmovl.u8        q4,  d10
        vmovl.u8        q5,  d11

        vmul.i16        q8,  q8,  q6 // scaling * grain
        vmul.i16        q9,  q9,  q7
        vmul.i16        q10, q10, q4
        vmul.i16        q11, q11, q5

        vrshl.s16       q8,  q8,  q13 // round2(scaling * grain, scaling_shift)
        vrshl.s16       q9,  q9,  q13
        vrshl.s16       q10, q10, q13
        vrshl.s16       q11, q11, q13

        vaddw.u8        q8,  q8,  d0 // *src + noise
        vaddw.u8        q9,  q9,  d1
        vaddw.u8        q10, q10, d2
        vaddw.u8        q11, q11, d3

        vqmovun.s16     d0,  q8
        vqmovun.s16     d1,  q9
        vqmovun.s16     d2,  q10
        vqmovun.s16     d3,  q11

        vmax.u8         q0,  q0,  q14
        vmax.u8         q1,  q1,  q14
        vmin.u8         q0,  q0,  q15
        vmin.u8         q1,  q1,  q15

        subs            r9,  r9,  #1
.if \oy
        vdup.8          d6,  d25[0]
        vdup.8          d7,  d25[1]
.endif

        vst1.8          {q0, q1}, [r0, :128], r2 // dst
        bgt             1b

.if \oy
        cmp             r12, #0
        mov             r9,  r12 // restore actual remaining h
        bgt             L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
.endif
        b               9f
.endm
        fguv_loop_sx0   0, 0, 0
        fguv_loop_sx0   0, 0, 1
        fguv_loop_sx0   0, 1, 0
        fguv_loop_sx0   0, 1, 1
        fguv_loop_sx0   1, 0, 0
        fguv_loop_sx0   1, 0, 1
        fguv_loop_sx0   1, 1, 0
        fguv_loop_sx0   1, 1, 1

9:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

function fguv_loop_sx1_neon
L(fguv_loop_sx1_tbl):
        .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB

.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
.if \oy
        mov             r12, lr
.endif
1:
.if \ox
        vld1.8          {d8},  [r4],  r10       // grain_lut old
.endif
.if \oy
        vld1.8          {q8},  [r8],  r10       // grain_lut top
.endif
.if \ox && \oy
        vld1.8          {d10}, [r11], r10       // grain_lut top old
.endif
        vld1.8          {q0, q1}, [r6, :128], r7 // luma
        vld1.8          {q10}, [r5],  r10       // grain_lut
        vld1.8          {q11}, [r1, :128], r2   // src

.if \ox
        vmull.s8        q4,  d8,  d24
        vmlal.s8        q4,  d20, d25
.endif

        vpaddl.u8       q0,  q0
        vpaddl.u8       q1,  q1
.if \oy
.if \ox
        vmull.s8        q5,  d10, d24
        vmlal.s8        q5,  d16, d25
        vqrshrn.s16     d20, q4,  #5
        vqrshrn.s16     d16, q5,  #5
.endif

        vmull.s8        q4,  d20, d7
        vmull.s8        q5,  d21, d7
        vmlal.s8        q4,  d16, d6
        vmlal.s8        q5,  d17, d6
        vqrshrn.s16     d20, q4,  #5
        vqrshrn.s16     d21, q5,  #5
.elseif \ox
        vqrshrn.s16     d20, q4,  #5
.endif
.if \csfl
        vrshrn.u16      d0,  q0,  #1
        vrshrn.u16      d1,  q1,  #1
.else
        vrshr.u16       q4,  q0,  #1
        vrshr.u16       q5,  q1,  #1
        vmovl.u8        q0,  d22
        vmovl.u8        q1,  d23
        vmul.i16        q4,  q4,  d4[0]
        vmul.i16        q5,  q5,  d4[0]
        vmul.i16        q0,  q0,  d4[1]
        vmul.i16        q1,  q1,  d4[1]
        vqadd.s16       q4,  q4,  q0
        vqadd.s16       q5,  q5,  q1
        vdup.16         q0,  d4[2]
        vshr.s16        q4,  q4,  #6
        vshr.s16        q5,  q5,  #6
        vadd.i16        q4,  q4,  q0
        vadd.i16        q5,  q5,  q0
        vqmovun.s16     d0,  q4
        vqmovun.s16     d1,  q5
.endif

        bl              gather16_neon

        vmovl.s8        q8,  d20 // grain
        vmovl.s8        q9,  d21

        vmovl.u8        q6,  d8  // scaling
        vmovl.u8        q7,  d9

        vmul.i16        q8,  q8,  q6 // scaling * grain
        vmul.i16        q9,  q9,  q7

        vrshl.s16       q8,  q8,  q13 // round2(scaling * grain, scaling_shift)
        vrshl.s16       q9,  q9,  q13

        vaddw.u8        q8,  q8,  d22 // *src + noise
        vaddw.u8        q9,  q9,  d23

        vqmovun.s16     d0,  q8
        vqmovun.s16     d1,  q9

        vmax.u8         q0,  q0,  q14
        vmin.u8         q0,  q0,  q15

        subs            r9,  r9,  #1
.if \oy
        vswp            d6,  d7
.endif
        vst1.8          {q0}, [r0, :128], r2 // dst
        bgt             1b

.if \oy
        cmp             r12, #0
        mov             r9,  r12 // restore actual remaining h
        bgt             L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
.endif

        b               9f
.endm
        fguv_loop_sx1   0, 0, 0
        fguv_loop_sx1   0, 0, 1
        fguv_loop_sx1   0, 1, 0
        fguv_loop_sx1   0, 1, 1
        fguv_loop_sx1   1, 0, 0
        fguv_loop_sx1   1, 0, 1
        fguv_loop_sx1   1, 1, 0
        fguv_loop_sx1   1, 1, 1

9:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc