/*
 * Copyright (C) 2013-2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

#define BLEND_LIST(X) \
    X(0, CLEAR) \
    X(1, SRC) \
    X(2, DST) \
    X(3, SRC_OVER) \
    X(4, DST_OVER) \
    X(5, SRC_IN) \
    X(6, DST_IN) \
    X(7, SRC_OUT) \
    X(8, DST_OUT) \
    X(9, SRC_ATOP) \
    X(10, DST_ATOP) \
    X(11, XOR) \
    X(12, MULTIPLY) \
    X(13, ADD) \
    X(14, SUBTRACT)

/* This operation was not enabled in the original RenderScript.  We could
 * enable it.
 *
 * X(15, DIFFERENCE) \
 */

/* For every blend operation supported, define a macro with just the arithmetic
 * component.  The rest can be handled later on.
 *
 * At entry q0-q3 contain the RGBA data from the destination buffer, and q8-q11
 * contain the data from the source buffer.  Both have already been split out
 * into one colour component per register (if necessary).  q3 and q11 contain
 * the alpha components.
 *
 * At the same time as defining the assembly macro, define a corresponding
 * preprocessor macro indicating any other requirements.
 *    zipped=0 -- The macro does not require the RGBA components to be
 *                separated.
 *    lddst=0  -- The macro does not require data from the destination buffer.
 *    ldsrc=0  -- The macro does not require data from the source buffer.
 *    nowrap=1 -- The macro requires no wrapper at all, and should simply be
 *                inserted without any surrounding load/store or loop code.
 */
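
/* A note on the arithmetic (illustrative summary, not a formal spec): none of
 * the kernels below divides by 255 directly.  Wherever a byte product must be
 * renormalised they use the usual two-step rounding approximation, which in
 * C-like terms is roughly
 *
 *      uint16_t p = x * a;                 // umull / umull2
 *      uint16_t t = (p + 128) >> 8;        // rshrn #8 (rounding narrow)
 *      uint8_t  r = (p + t + 128) >> 8;    // uaddw, then a second rshrn #8
 *
 * which, for products in the 0..255*255 range of interest, matches p / 255
 * rounded to the nearest integer.
 */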

#define params_CLEAR zipped=0, lddst=0, ldsrc=0
.macro blend_kernel_CLEAR
        movi    v0.16b, #0
        movi    v1.16b, #0
        movi    v2.16b, #0
        movi    v3.16b, #0
.endm

#define params_SRC zipped=0, lddst=0
.macro blend_kernel_SRC
        mov     v0.16b, v8.16b
        mov     v1.16b, v9.16b
        mov     v2.16b, v10.16b
        mov     v3.16b, v11.16b
.endm

#define params_DST nowrap=1
.macro blend_kernel_DST
        /* nop */
.endm

#define params_SRC_OVER zipped=1
.macro blend_kernel_SRC_OVER
        mvn     v7.16b, v11.16b

        umull2  v12.8h, v7.16b, v0.16b
        umull   v0.8h, v7.8b, v0.8b
        umull2  v13.8h, v7.16b, v1.16b
        umull   v1.8h, v7.8b, v1.8b
        umull2  v14.8h, v7.16b, v2.16b
        umull   v2.8h, v7.8b, v2.8b
        umull2  v15.8h, v7.16b, v3.16b
        umull   v3.8h, v7.8b, v3.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8

        uqadd   v0.16b, v0.16b, v8.16b
        uqadd   v1.16b, v1.16b, v9.16b
        uqadd   v2.16b, v2.16b, v10.16b
        uqadd   v3.16b, v3.16b, v11.16b
.endm

#define params_DST_OVER zipped=1
.macro blend_kernel_DST_OVER
        mvn     v7.16b, v3.16b

        umull2  v12.8h, v7.16b, v8.16b
        umull   v8.8h, v7.8b, v8.8b
        umull2  v13.8h, v7.16b, v9.16b
        umull   v9.8h, v7.8b, v9.8b
        umull2  v14.8h, v7.16b, v10.16b
        umull   v10.8h, v7.8b, v10.8b
        umull2  v15.8h, v7.16b, v11.16b
        umull   v11.8h, v7.8b, v11.8b

        rshrn   v4.8b, v8.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v9.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v10.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v11.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v8.8h, v8.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v9.8h, v9.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v10.8h, v10.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v11.8h, v11.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v8.8b, v8.8h, #8
        rshrn2  v8.16b, v12.8h, #8
        rshrn   v9.8b, v9.8h, #8
        rshrn2  v9.16b, v13.8h, #8
        rshrn   v10.8b, v10.8h, #8
        rshrn2  v10.16b, v14.8h, #8
        rshrn   v11.8b, v11.8h, #8
        rshrn2  v11.16b, v15.8h, #8

        uqadd   v0.16b, v0.16b, v8.16b
        uqadd   v1.16b, v1.16b, v9.16b
        uqadd   v2.16b, v2.16b, v10.16b
        uqadd   v3.16b, v3.16b, v11.16b
.endm
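
/* The two OVER kernels above are the Porter-Duff OVER pair in its usual
 * premultiplied-alpha form.  Per channel, with dst in v0-v3 and src in
 * v8-v11, SRC_OVER computes approximately
 *
 *      out = src + dst * (255 - src.a) / 255;   // saturating add at the end
 *
 * and DST_OVER simply swaps the roles of the two buffers:
 *
 *      out = dst + src * (255 - dst.a) / 255;
 *
 * both using the rounded divide-by-255 sequence noted above the kernels.
 */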

#define params_SRC_IN zipped=1
.macro blend_kernel_SRC_IN
        umull2  v12.8h, v3.16b, v8.16b
        umull   v0.8h, v3.8b, v8.8b
        umull2  v13.8h, v3.16b, v9.16b
        umull   v1.8h, v3.8b, v9.8b
        umull2  v14.8h, v3.16b, v10.16b
        umull   v2.8h, v3.8b, v10.8b
        umull2  v15.8h, v3.16b, v11.16b
        umull   v3.8h, v3.8b, v11.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8
.endm

#define params_DST_IN zipped=1
.macro blend_kernel_DST_IN
        umull2  v12.8h, v0.16b, v11.16b
        umull   v0.8h, v0.8b, v11.8b
        umull2  v13.8h, v1.16b, v11.16b
        umull   v1.8h, v1.8b, v11.8b
        umull2  v14.8h, v2.16b, v11.16b
        umull   v2.8h, v2.8b, v11.8b
        umull2  v15.8h, v3.16b, v11.16b
        umull   v3.8h, v3.8b, v11.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8
.endm

#define params_SRC_OUT zipped=1
.macro blend_kernel_SRC_OUT
        mvn     v3.16b, v3.16b
        blend_kernel_SRC_IN
.endm

#define params_DST_OUT zipped=1
.macro blend_kernel_DST_OUT
        mvn     v11.16b, v11.16b
        blend_kernel_DST_IN
.endm
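
/* The four IN/OUT kernels above follow one pattern.  SRC_IN scales every
 * source channel (alpha included) by the destination alpha, roughly
 * out = src * dst.a / 255, and DST_IN scales every destination channel by the
 * source alpha, out = dst * src.a / 255.  SRC_OUT and DST_OUT reuse those
 * kernels verbatim after complementing the relevant alpha with mvn, so they
 * multiply by (255 - a) instead of a.
 */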

#define params_SRC_ATOP zipped=1
.macro blend_kernel_SRC_ATOP
        mvn     v11.16b, v11.16b

        umull2  v12.8h, v11.16b, v0.16b
        umull   v0.8h, v11.8b, v0.8b
        umull2  v13.8h, v11.16b, v1.16b
        umull   v1.8h, v11.8b, v1.8b
        umull2  v14.8h, v11.16b, v2.16b
        umull   v2.8h, v11.8b, v2.8b

        umull2  v4.8h, v3.16b, v8.16b
        umull   v8.8h, v3.8b, v8.8b
        umull2  v5.8h, v3.16b, v9.16b
        umull   v9.8h, v3.8b, v9.8b
        umull2  v6.8h, v3.16b, v10.16b
        umull   v10.8h, v3.8b, v10.8b

        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v14.8h, v14.8h, v6.8h
        uqadd   v2.8h, v2.8h, v10.8h

        urshr   v8.8h, v0.8h, #8
        urshr   v4.8h, v12.8h, #8
        urshr   v9.8h, v1.8h, #8
        urshr   v5.8h, v13.8h, #8
        urshr   v10.8h, v2.8h, #8
        urshr   v6.8h, v14.8h, #8

        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v2.8h, v2.8h, v10.8h
        uqadd   v14.8h, v14.8h, v6.8h

        uqrshrn  v0.8b, v0.8h, #8
        uqrshrn2 v0.16b, v12.8h, #8
        uqrshrn  v1.8b, v1.8h, #8
        uqrshrn2 v1.16b, v13.8h, #8
        uqrshrn  v2.8b, v2.8h, #8
        uqrshrn2 v2.16b, v14.8h, #8
.endm

#define params_DST_ATOP zipped=1
.macro blend_kernel_DST_ATOP
        mvn     v3.16b, v3.16b

        umull2  v12.8h, v11.16b, v0.16b
        umull   v0.8h, v11.8b, v0.8b
        umull2  v13.8h, v11.16b, v1.16b
        umull   v1.8h, v11.8b, v1.8b
        umull2  v14.8h, v11.16b, v2.16b
        umull   v2.8h, v11.8b, v2.8b

        umull2  v4.8h, v3.16b, v8.16b
        umull   v8.8h, v3.8b, v8.8b
        umull2  v5.8h, v3.16b, v9.16b
        umull   v9.8h, v3.8b, v9.8b
        umull2  v6.8h, v3.16b, v10.16b
        umull   v10.8h, v3.8b, v10.8b

        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v14.8h, v14.8h, v6.8h
        uqadd   v2.8h, v2.8h, v10.8h

        urshr   v8.8h, v0.8h, #8
        urshr   v4.8h, v12.8h, #8
        urshr   v9.8h, v1.8h, #8
        urshr   v5.8h, v13.8h, #8
        urshr   v10.8h, v2.8h, #8
        urshr   v6.8h, v14.8h, #8

        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v2.8h, v2.8h, v10.8h
        uqadd   v14.8h, v14.8h, v6.8h

        uqrshrn  v0.8b, v0.8h, #8
        uqrshrn2 v0.16b, v12.8h, #8
        uqrshrn  v1.8b, v1.8h, #8
        uqrshrn2 v1.16b, v13.8h, #8
        uqrshrn  v2.8b, v2.8h, #8
        uqrshrn2 v2.16b, v14.8h, #8

        mov     v3.16b, v11.16b
.endm
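
/* The ATOP kernels above combine both products per colour channel, roughly
 *
 *      SRC_ATOP: out.rgb = (src.rgb * dst.a + dst.rgb * (255 - src.a)) / 255
 *      DST_ATOP: out.rgb = (dst.rgb * src.a + src.rgb * (255 - dst.a)) / 255
 *
 * with the result alpha taken unchanged from dst.a (SRC_ATOP) or src.a
 * (DST_ATOP).  The divide-by-255 step here uses the saturating forms (uqadd
 * on the 16-bit sums, uqrshrn on the final narrow) so any overflow in the
 * intermediate sum of two products clamps rather than wrapping.
 */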

#define params_MULTIPLY zipped=0
.macro blend_kernel_MULTIPLY
        umull2  v12.8h, v0.16b, v8.16b
        umull   v0.8h, v0.8b, v8.8b
        umull2  v13.8h, v1.16b, v9.16b
        umull   v1.8h, v1.8b, v9.8b
        umull2  v14.8h, v2.16b, v10.16b
        umull   v2.8h, v2.8b, v10.8b
        umull2  v15.8h, v3.16b, v11.16b
        umull   v3.8h, v3.8b, v11.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8
.endm

#define params_ADD zipped=0
.macro blend_kernel_ADD
        uqadd   v0.16b, v0.16b, v8.16b
        uqadd   v1.16b, v1.16b, v9.16b
        uqadd   v2.16b, v2.16b, v10.16b
        uqadd   v3.16b, v3.16b, v11.16b
.endm

#define params_SUBTRACT zipped=0
.macro blend_kernel_SUBTRACT
        uqsub   v0.16b, v0.16b, v8.16b
        uqsub   v1.16b, v1.16b, v9.16b
        uqsub   v2.16b, v2.16b, v10.16b
        uqsub   v3.16b, v3.16b, v11.16b
.endm

#define params_DIFFERENCE zipped=0
.macro blend_kernel_DIFFERENCE
        uabd    v0.16b, v0.16b, v8.16b
        uabd    v1.16b, v1.16b, v9.16b
        uabd    v2.16b, v2.16b, v10.16b
        uabd    v3.16b, v3.16b, v11.16b
.endm

#define params_XOR zipped=0
.macro blend_kernel_XOR
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v9.16b
        eor     v2.16b, v2.16b, v10.16b
        eor     v3.16b, v3.16b, v11.16b
.endm


/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Various sections of assembly code are dropped or substituted for
 * simpler operations if they're not needed.
 */
.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
.if \nowrap
        \kernel
.else
        sub     x3, sp, #32
        sub     sp, sp, #64
        st1     {v8.1d - v11.1d}, [sp]
        st1     {v12.1d - v15.1d}, [x3]
        subs    x2, x2, #64
        b       2f
.align 4
1:
  .if \lddst
    .if \zipped
        ld4     {v0.16b - v3.16b}, [x0]
    .else
        ld1     {v0.16b - v3.16b}, [x0]
    .endif
  .endif
  .if \ldsrc
    .if \zipped
        ld4     {v8.16b - v11.16b}, [x1], #64
    .else
        ld1     {v8.16b - v11.16b}, [x1], #64
    .endif
  .endif
  .if \pld
#if 0 /* TODO: test this on real hardware */
    .if \lddst ; prfm PLDL1STRM, [x0, #192] ; .endif
    .if \ldsrc ; prfm PLDL1STRM, [x1, #192] ; .endif
#endif
  .endif

        \kernel

        subs    x2, x2, #64
  .if \zipped
        st4     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
  .else
        st1     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
  .endif

2:      bge     1b
        adds    x2, x2, #64
        beq     2f

        /* To handle the tail portion of the data (something less than 64
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the
         * operations don't require data to interact with its neighbours.
         */
        movi    v0.16b, #0
        movi    v1.16b, #0
        movi    v2.16b, #0
        movi    v3.16b, #0

        movi    v8.16b, #0
        movi    v9.16b, #0
        movi    v10.16b, #0
        movi    v11.16b, #0

        tbz     x2, #5, 1f
  .if \lddst ; ld1     {v2.16b,v3.16b}, [x0], #32   ; .endif
  .if \ldsrc ; ld1     {v10.16b,v11.16b}, [x1], #32 ; .endif
1:      tbz     x2, #4, 1f
  .if \lddst ; ld1     {v1.16b}, [x0], #16 ; .endif
  .if \ldsrc ; ld1     {v9.16b}, [x1], #16 ; .endif
1:      tbz     x2, #3, 1f
  .if \lddst ; ld1     {v0.d}[1], [x0], #8 ; .endif
  .if \ldsrc ; ld1     {v8.d}[1], [x1], #8 ; .endif
1:      tbz     x2, #2, 1f
  .if \lddst ; ld1     {v0.s}[1], [x0], #4 ; .endif
  .if \ldsrc ; ld1     {v8.s}[1], [x1], #4 ; .endif
1:      tbz     x2, #1, 1f
  .if \lddst ; ld1     {v0.h}[1], [x0], #2 ; .endif
  .if \ldsrc ; ld1     {v8.h}[1], [x1], #2 ; .endif
1:      tbz     x2, #0, 1f
  .if \lddst ; ld1     {v0.b}[1], [x0], #1 ; .endif
  .if \ldsrc ; ld1     {v8.b}[1], [x1], #1 ; .endif
1:
  .if \lddst ; sub     x0, x0, x2 ; .endif

.if \zipped
        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is
         * loaded linearly and unpacked manually at this point.
         */
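        /* Roughly speaking, the two uzp1/uzp2 passes below take the
         * byte-interleaved R,G,B,A,R,G,B,A,... data in v0-v3 (and v8-v11)
         * and leave one planar colour component per register, matching what
         * ld4 would have produced.  The zip1/zip2 passes after the kernel
         * are the inverse shuffle, re-interleaving the result so the plain
         * st1 stores further down write the pixels back out in their
         * original layout.
         */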
        uzp1    v4.16b, v0.16b, v1.16b
        uzp2    v5.16b, v0.16b, v1.16b
        uzp1    v6.16b, v2.16b, v3.16b
        uzp2    v7.16b, v2.16b, v3.16b
        uzp1    v0.16b, v4.16b, v6.16b
        uzp2    v2.16b, v4.16b, v6.16b
        uzp1    v1.16b, v5.16b, v7.16b
        uzp2    v3.16b, v5.16b, v7.16b

        uzp1    v4.16b, v8.16b, v9.16b
        uzp2    v5.16b, v8.16b, v9.16b
        uzp1    v6.16b, v10.16b, v11.16b
        uzp2    v7.16b, v10.16b, v11.16b
        uzp1    v8.16b, v4.16b, v6.16b
        uzp2    v10.16b, v4.16b, v6.16b
        uzp1    v9.16b, v5.16b, v7.16b
        uzp2    v11.16b, v5.16b, v7.16b

        \kernel

        zip1    v4.16b, v0.16b, v2.16b
        zip2    v6.16b, v0.16b, v2.16b
        zip1    v5.16b, v1.16b, v3.16b
        zip2    v7.16b, v1.16b, v3.16b
        zip1    v0.16b, v4.16b, v5.16b
        zip2    v1.16b, v4.16b, v5.16b
        zip1    v2.16b, v6.16b, v7.16b
        zip2    v3.16b, v6.16b, v7.16b
.else
        \kernel
.endif

        tbz     x2, #5, 1f
        st1     {v2.16b,v3.16b}, [x0], #32
1:      tbz     x2, #4, 1f
        st1     {v1.16b}, [x0], #16
1:      tbz     x2, #3, 1f
        st1     {v0.d}[1], [x0], #8
1:      tbz     x2, #2, 1f
        st1     {v0.s}[1], [x0], #4
1:      tbz     x2, #1, 1f
        st1     {v0.h}[1], [x0], #2
1:      tbz     x2, #0, 2f
        st1     {v0.b}[1], [x0], #1
2:      ld1     {v8.1d - v11.1d}, [sp], #32
        ld1     {v12.1d - v15.1d}, [sp], #32
.endif
        mov     x0, #0
        ret
.endm


/* produce list of blend_line_XX() functions; each function uses the wrap_line
 * macro, passing it the name of the operation macro it wants along with
 * optional parameters to remove unnecessary operations.
 */
#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
    BLEND_LIST(BLEND_X)
#undef BLEND_X

#define BLEND_X(d, n) .set tablesize, d+1 ;
    BLEND_LIST(BLEND_X)
#undef BLEND_X

/* int rsdIntrinsicBlend_K(
 *          uchar4 *out,        // x0
 *          uchar4 const *in,   // x1
 *          int slot,           // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
ENTRY(rsdIntrinsicBlend_K)
        adrp    x5, blendtable
        add     x5, x5, :lo12:blendtable
        cmp     w2, tablesize
        bhs     1f
        ldrsh   x6, [x5, w2, uxtw #1]
        add     x0, x0, w3, uxtw #2
        add     x1, x1, w3, uxtw #2
        sub     w2, w4, w3
        ubfiz   x2, x2, #2, #32 /* TODO: fix */
        cbz     x6, 1f
        adr     x5, 2f
        add     x6, x5, x6
2:      br      x6
1:      mov     x0, #-1
        ret
END(rsdIntrinsicBlend_K)

.rodata
.set off,0
blendtable:
#define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
        BLEND_LIST(BLEND_X)
#undef BLEND_X
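
/* Illustrative caller sketch (an assumption for documentation only;
 * blendGeneric is a hypothetical fallback, not part of this file):  slot
 * selects an entry of BLEND_LIST, xstart/xend are pixel indices into the
 * uchar4 buffers, and the function returns 0 on success or -1 when the slot
 * is out of range or has no entry in blendtable.
 *
 *      // Blend 'width' pixels of src onto dst using SRC_OVER (slot 3).
 *      if (rsdIntrinsicBlend_K(dst, src, 3, 0, width) != 0)
 *          blendGeneric(dst, src, 3, 0, width);    // hypothetical fallback
 */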