//-----------------------------------------------------------------------
// push_v_regs / pop_v_regs
//
// Matching save/restore macros used around the routine body.
//
//   Saved set : q8-q15 (full 128-bit registers) and x8-x30.
//   Stack use : 4 * 32 + 12 * 16 = 320 bytes.
//   Alignment : every adjustment is a multiple of 16, so sp keeps
//               whatever 16-byte alignment it had on entry.
//
// x29 is written twice by push_v_regs (once paired with x28, once
// paired with x30); pop_v_regs reads both slots back, and since both
// hold the same entry value the result is unaffected.
//
// The two macros must stay exact mirrors of each other: pop_v_regs
// unwinds the slots in the reverse of the order push_v_regs created
// them.
//-----------------------------------------------------------------------

.macro push_v_regs
    // SIMD registers, pushed as 128-bit pairs.
    stp     q8,  q9,  [sp, #-32]!
    stp     q10, q11, [sp, #-32]!
    stp     q12, q13, [sp, #-32]!
    stp     q14, q15, [sp, #-32]!
    // Disabled alternative kept for reference. NOTE(review): ST1
    // (multiple structures) has no pre-indexed form, which is presumably
    // why the stp sequence above is used instead.
    //st1 { v8.2d,  v9.2d, v10.2d, v11.2d}, [sp, #-64]!
    //st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp, #-64]!

    // General-purpose registers, pushed as 64-bit pairs.
    stp     x8,  x9,  [sp, #-16]!
    stp     x10, x11, [sp, #-16]!
    stp     x12, x13, [sp, #-16]!
    stp     x14, x15, [sp, #-16]!
    stp     x16, x17, [sp, #-16]!
    stp     x18, x19, [sp, #-16]!
    stp     x20, x21, [sp, #-16]!
    stp     x22, x23, [sp, #-16]!
    stp     x24, x25, [sp, #-16]!
    stp     x26, x27, [sp, #-16]!
    stp     x28, x29, [sp, #-16]!
    stp     x30, x29, [sp, #-16]!       // link register; x29 stored a second time
.endm

.macro pop_v_regs
    // General-purpose registers, most recently pushed pair first.
    ldp     x30, x29, [sp], #16
    ldp     x28, x29, [sp], #16
    ldp     x26, x27, [sp], #16
    ldp     x24, x25, [sp], #16
    ldp     x22, x23, [sp], #16
    ldp     x20, x21, [sp], #16
    ldp     x18, x19, [sp], #16
    ldp     x16, x17, [sp], #16
    ldp     x14, x15, [sp], #16
    ldp     x12, x13, [sp], #16
    ldp     x10, x11, [sp], #16
    ldp     x8,  x9,  [sp], #16
    // Disabled alternative kept for reference (mirror of the st1 pair
    // in push_v_regs).
    //ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
    //ld1 { v8.2d,  v9.2d, v10.2d, v11.2d}, [sp], #64

    // SIMD registers, restored in reverse push order.
    ldp     q14, q15, [sp], #32
    ldp     q12, q13, [sp], #32
    ldp     q10, q11, [sp], #32
    ldp     q8,  q9,  [sp], #32
.endm


.text
.p2align 2
.global ixheaacd_post_twid_overlap_add_armv8

ixheaacd_post_twid_overlap_add_armv8:

    // Save q8-q15 and x8-x30 (320 bytes of stack) before the body runs.
    // STMFD sp!, {x4-x12}
    push_v_regs
    //stp x19, x20,[sp,#-16]!
53 //VPUSH {d8 - d15} 54 55 //LDR w4, [sp, #100] 56 //sxtw x4,w4 57 //LDR w5, [sp, #104] 58 //sxtw x5,w5 59 //LDR w6, [sp, #108] 60 //sxtw x6,w6 61 MOV x16, x5 62 MOV x17, x7 63 LSL x9, x3, #2 64 ASR x9, x9, #1 65 ADD x6, x6, x9 66 SUB x6, x6, #4 67 68 MOV w8, #7500 69 sxtw x8, w8 70 ADD x2, x2, x8 71 72 73 74 movi v18.4h, #50 75 sub x20, x5, #15 76 neg x9, x20 77 movi v20.4s, #0x00, LSL #8 78 dup v16.4s,w5 79 SUB x5, x5, #16 80 //STR w5, [sp, #116] 81 MOV w25, w5 82 sxtw x25,w25 83 MOV x8, #1 84 LSL x8, x8, x9 85 //STR w8, [sp, #120] 86 MOV w26, w8 87 88 //sxtw x8,w8 89 90 91ARM_PROLOGUE: 92 93 94 LDR w8, [x1], #4 95 sxtw x8,w8 96 LDR w9, [x1], #4 97 sxtw x9,w9 98 99 LDR w10, [x2], #4 100 sxtw x10,w10 101 102 AND w19,w10,0xFFFF 103 sxth x19,w19 104 ASR w10,w10,#16 105// SMULWT x11, x8, x10 106// 107// SMULWB x12, x9, x10 108// SMULWB x5, x8, x10 109// SMLAWT x7, x9, x10, x5 110 111 SMULL x11, w8, w10 112 ASR x11,x11,#16 113 SMULL x12, w9, w19 114 ASR x12,x12,#16 115 SMULL x5, w8, w19 116 ASR x5,x5,#16 117 SMULL x7, w9, w10 118 ASR x7, x7, #16 119 ADD x7, x7, x5 120 121 SUB x8, x12, x11 122 MVN x5, x7 123 ADD x5, x5, #1 124 125 126 MOV x9, #50 127 MOV x12, #-50 128 AND w19,w9,0xFFFF 129 sxth x19,w19 130 SMULL x10, w5, w19 131 ASR x10,x10,#16 132 AND w19,w12,0xFFFF 133 sxth x19,w19 134 SMULL x11, w8, w19 135 ASR x11,x11,#16 136 137 ADD x8, x8, x10 138 ADD x5, x5, x11 139 140 //LDR w11, [sp, #104] 141 MOV w11, w16 142 sxth x11,w11 143 LDR w10, [x6], #-32 144 sxtw x10,w10 145 146 AND w19,w10,0xFFFF 147 sxth x19,w19 148 ASR w20,w10,#16 149 150 //SMULWB x7, x8, x10 151 SMULL x7, w8, w19 152 ASR x7, x7, #16 153 MVN x8, x8 154 ADD x8, x8, #1 155 //SMULWT x12, x8, x10 156 SMULL x12, w8, w20 157 ASR x12, x12, #16 158 159 CMP x11, #0 160 BLT NEXT 161 162 SUB x9, x11, #16 163 negs x9,x9 164 165 166 167 168 // LDR w8, [sp, #120] 169 //sxtw x8,w8 170 MOV v1.s[0], w26 171 MOV v2.s[0], w5 172 173 //sQADD w5, w5, w8 174 //ASR w5, w5, w9 175 176 SQADD v2.2s, v2.2s, v1.2s 177 MOV 
w5, v2.s[0] 178 ASR w5, w5, w9 179 180 SUB x9, x11, #31 181 negs x9,x9 182 ASR x20, x7, x9 183 //MOV x8, x20 184 ADDS x8, x20, #0 185 BGE NEXT2 186 CMN x8, #1 187NEXT2: 188 MOV x20, #0x80000000 189 csel x7, x20, x7,LT 190 MOV x20, #0x7fffffff 191 csel x7, x20, x7,GT 192 LSL x20, x7, x11 193 csel x7,x20,x7,EQ 194 195 SUB x9, x11, #31 196 negs x9,x9 197 ASR x20, x12, x9 198 //MOV x8, x20 199 ADDS x8, x20, #0 200 BGE NEXT3 201 CMN x8, #1 202NEXT3: 203 MOV x20, #0x80000000 204 csel x12, x20, x12,LT 205 MOV x20, #0x7fffffff 206 csel x12, x20, x12,GT 207 LSL x20, x12, x11 208 csel x12,x20,x12,EQ 209 210 B NEXT1 211NEXT: 212 MVN w11, w11 213 ADD w11, w11, #1 214 ASR w5, w5, w11 215 MOV w8, #0x8000 216 217 MOV v1.s[0], w8 218 MOV v2.s[0], w5 219 220 //QADD x5, x5, x8 221 222 SQADD v2.2s, v2.2s, v1.2s 223 MOV w5, v2.s[0] 224 225 ASR w5, w5, #16 226 ASR w7, w7, w11 227 ASR w12, w12, w11 228 229NEXT1: 230 LDR w9, [x4] 231 sxtw x9,w9 232 MOV w8, #0x8000 233 //sxtw x8,w8 234 235 STR w5, [x4], #4 236 sxtw x5,w5 237 238 239 ROR w20, w10, #16 240 //UXTH x5, x10, ROR #16 241 UXTH w5, w20 242 UXTH w10, w10 243 244 245 dup v0.2s,w9 246 dup v2.2s,w10 247 dup v3.2s,w5 248 //VZIP.32 D2, D3 249 ZIP1 v28.2s, v2.2s, v3.2s 250 ZIP2 v3.2s, v2.2s, v3.2s 251 MOV v2.8b, v28.8b 252 sMULL v0.2d, v2.2s, v0.2s 253 Sqxtn v8.2s, v0.2d 254 255 256 dup v0.2s,w12 257 dup v1.2s,w7 258 259 //VZIP.32 D0, D1 260 261 ZIP1 v28.2s, v0.2s, v1.2s 262 ZIP2 v1.2s, v0.2s, v1.2s 263 MOV v0.8b, v28.8b 264 265 SQSUB v8.2s, v0.2s , v8.2s 266 267 268 //sQshL v8.2s, v8.2s,#2 269 dup v0.2s,w8 270 //SQADD v8.2s, v8.2s , v0.2s 271 //sshR v8.2s, v8.2s,#16 272 273 274 275 MOV x7, x17 276 //sxtw x7,w7 277 LSL x10, x7, #2 278 279 ASR x5, x3, #1 280 //SMULBB x5, x10, x5 281 AND w5,w5,0xFFFF 282 sxth x5,w5 283 AND w19,w10,0xFFFF 284 sxth x19,w19 285 SMULL x5, w19, w5 286 287 ADD x5, x5, x0 288 SUB x0, x5, x10 289 MVN x9, x10 290 ADD x9, x9, #1 291 292 ST1 {V8.S}[1],[x0], x9 293 ST1 {V8.S}[0],[x5], x10 294 295 296 MOV x8, x1 297 
LSL x12, x3, #2 298 299 ADD x1, x1, x12 300 301 SUB x1, x1, #40 302 303 MOV x12, #-32 304 305 306 307PROLOGUE_NEON: 308 309 ASR x3, x3, #2 310 SUB x3, x3, #4 311 ASR x3, x3, #2 312 SUB x3, x3, #2 313 314 LD2 { v0.4s, v1.4s}, [x1] 315 MOV v2.16b, v1.16b 316 ADD x1, x1, x12 317 318 //VUZP.16 D0, D1 319 UZP1 v28.8h, v0.8h, v0.8h 320 UZP2 v29.8h, v0.8h, v0.8h 321 MOV v0.d[0], v28.d[0] 322 MOV v0.d[1], v29.d[0] 323 324 //VUZP.16 D2, D3 325 326 UZP1 v28.8h, v2.8h, v2.8h 327 UZP2 v29.8h, v2.8h, v2.8h 328 MOV v2.d[0], v28.d[0] 329 MOV v2.d[1], v29.d[0] 330 331 332 //rev64 v0.8h, v0.8h 333 rev64 v0.8h, v0.8h 334 MOV v1.d[0], v0.d[1] 335 rev64 v2.8h, v2.8h 336 MOV v3.d[0], v2.d[1] 337 LD2 {v8.4h, v9.4h}, [x2] 338 ADD x2, x2, #16 339 340 LD2 { v4.4s, v5.4s}, [x8] 341 MOV v6.16b, v5.16b 342 ADD x8, x8,#32 343 uMULL v30.4s, v0.4h, v9.4h 344 345// VUZP.16 D4, D5 346 347 UZP1 v28.8h, v4.8h, v4.8h 348 UZP2 v29.8h, v4.8h, v4.8h 349 MOV v4.d[0], v28.d[0] 350 MOV v5.d[0], v29.d[0] 351 352 uMULL v28.4s, v2.4h, v8.4h 353 354// VUZP.16 D6, D7 355 UZP1 v26.8h, v6.8h, v6.8h 356 UZP2 v27.8h, v6.8h, v6.8h 357 MOV v6.d[0], v26.d[0] 358 MOV v7.d[0], v27.d[0] 359 360 uMULL v26.4s, v0.4h, v8.4h 361 362 363 uMULL v24.4s, v2.4h, v9.4h 364 365 LD2 { v10.4s, v11.4s}, [x6] 366 MOV v12.16b, v11.16b 367 ADD x6, x6, x12 368 ushR v30.4s, v30.4s,#16 369 370 //VUZP.16 D10, D11 371 372 UZP1 v22.8h, v10.8h, v10.8h 373 UZP2 v23.8h, v10.8h, v10.8h 374 MOV v10.d[0], v22.d[0] 375 MOV v10.d[1], v23.d[0] 376 377 ushR v28.4s, v28.4s,#16 378 379 //VUZP.16 D12, D13 380 381 UZP1 v22.8h, v12.8h, v12.8h 382 UZP2 v23.8h, v12.8h, v12.8h 383 MOV v12.d[0], v22.d[0] 384 MOV v12.d[1], v23.d[0] 385 386 sMLAL v30.4s, v1.4h, v9.4h 387 388 rev64 v10.8h, v10.8h 389 MOV v11.d[0], v10.d[1] 390 sMLAL v28.4s, v3.4h, v8.4h 391 392 rev64 v12.8h, v12.8h 393 MOV v13.d[0], v12.d[1] 394 ushR v26.4s, v26.4s,#16 395 396 397 ushR v24.4s, v24.4s,#16 398 399 sMLAL v26.4s, v1.4h, v8.4h 400 sMLAL v24.4s, v3.4h, v9.4h 401 402 403 404 ADD v30.4s, 
v30.4s , v28.4s 405 NEG v30.4s, v30.4s 406 407 uMULL v22.4s, v4.4h, v8.4h 408 409 SUB v28.4s, v24.4s , v26.4s 410 411 412 mov v26.16b, v30.16b 413 mov v24.16b, v28.16b 414 415// VUZP.16 D24, D25 416 417 UZP1 v19.8h, v24.8h, v24.8h 418 UZP2 v21.8h, v24.8h, v24.8h 419 MOV v24.d[0], v19.d[0] 420 MOV v25.d[0], v21.d[0] 421 422 423// VUZP.16 D26, D27 424 425 UZP1 v19.8h, v26.8h, v26.8h 426 UZP2 v21.8h, v26.8h, v26.8h 427 MOV v26.d[0], v19.d[0] 428 MOV v27.d[0], v21.d[0] 429 430 uMULL v2.4s, v24.4h, v18.4h 431 432 uMULL v0.4s, v26.4h, v18.4h 433 434 ushR v22.4s, v22.4s,#16 435 sMLAL v22.4s, v5.4h, v8.4h 436 437 ushR v2.4s, v2.4s,#16 438 ushR v0.4s, v0.4s,#16 439 sMLAL v2.4s, v25.4h, v18.4h 440 sMLAL v0.4s, v27.4h, v18.4h 441 442 uMULL v24.4s, v4.4h, v9.4h 443 uMULL v26.4s, v6.4h, v8.4h 444 445 NEG v2.4s, v2.4s 446 ADD v28.4s, v28.4s , v0.4s 447 ADD v30.4s, v30.4s , v2.4s 448 449 uMULL v0.4s, v6.4h, v9.4h 450 sshR v24.4s, v24.4s,#16 451 sMLAL v24.4s, v5.4h, v9.4h 452 sshR v26.4s, v26.4s,#16 453 sshR v0.4s, v0.4s,#16 454 sMLAL v26.4s, v7.4h, v8.4h 455 sMLAL v0.4s, v7.4h, v9.4h 456 457 458 459 460 ADD v22.4s, v22.4s , v0.4s 461 NEG v22.4s, v22.4s 462 SUB v24.4s, v26.4s , v24.4s 463 464 465 466 //LDR w11, [sp, #120] 467 //sxtw x11,w11 468 MOV w11, w26 469 dup v14.4s,w11 470 SQADD v28.4s, v28.4s , v14.4s 471 //LDR w11, [sp, #116] 472 MOV w11, w25 473 //sxtw x11,w11 474 dup v0.4s,w11 475 sQshL v28.4s, v28.4s, v0.4s 476 477 mov v0.16b, v22.16b 478 mov v14.16b, v24.16b 479 480 481// VUZP.16 D24, D25 482 483 UZP1 v19.8h, v24.8h, v24.8h 484 UZP2 v21.8h, v24.8h, v24.8h 485 MOV v24.d[0], v19.d[0] 486 MOV v25.d[0], v21.d[0] 487 488 489// VUZP.16 D22, D23 490 491 UZP1 v19.8h, v22.8h, v22.8h 492 UZP2 v21.8h, v22.8h, v22.8h 493 MOV v22.d[0], v19.d[0] 494 MOV v23.d[0], v21.d[0] 495 496 uMULL v8.4s, v24.4h, v18.4h 497 uMULL v26.4s, v22.4h, v18.4h 498 499 NEG v2.4s, v30.4s 500// VUZP.16 D30, D31 501 502 UZP1 v19.8h, v30.8h, v30.8h 503 UZP2 v21.8h, v30.8h, v30.8h 504 MOV v30.d[0], v19.d[0] 
505 MOV v30.d[1], v21.d[0] 506 507// VUZP.16 D2, D3 508 509 UZP1 v19.8h, v2.8h, v2.8h 510 UZP2 v21.8h, v2.8h, v2.8h 511 MOV v2.d[0], v19.d[0] 512 MOV v3.d[0], v21.d[0] 513 514 uMULL v4.4s, v30.4h, v12.4h 515 516 uMULL v6.4s, v2.4h, v13.4h 517 518 ushR v8.4s, v8.4s,#16 519 ushR v26.4s, v26.4s,#16 520 521 sMLAL v8.4s, v25.4h, v18.4h 522 sMLAL v26.4s, v23.4h, v18.4h 523 524 ushR v4.4s, v4.4s,#16 525 ushR v6.4s, v6.4s,#16 526 527 MOV v19.d[0], v30.d[1] 528 529 sMLAL v4.4s, v19.4h, v12.4h 530 sMLAL v6.4s, v3.4h, v13.4h 531 532 NEG v8.4s, v8.4s 533 ADD v14.4s, v14.4s , v26.4s 534 ADD v0.4s, v0.4s , v8.4s 535 536 //LDR w11, [sp, #120] 537 //sxtw x11,w11 538 MOV w11, w26 539 dup v8.4s,w11 540 SQADD v0.4s, v0.4s , v8.4s 541 //LDR w11, [sp, #116] 542 //sxtw x11,w11 543 MOV w11, w25 544 dup v26.4s,w11 545 sQshL v0.4s, v0.4s, v26.4s 546 547 mov v26.16b, v28.16b 548 549 LD2 { v28.4s, v29.4s}, [x4] 550 MOV v30.16b, v29.16b 551 MOV v29.d[0], v28.d[1] 552 // VZIP.32 Q13, Q0 553 554 ZIP1 v19.4s, v26.4s, v0.4s 555 ZIP2 v0.4s, v26.4s, v0.4s 556 MOV v26.16b, v19.16b 557 558 ST1 { v26.4s}, [x4],#16 559 ST1 { v0.4s}, [x4],#16 560 561 movi v1.2s, #0 562 //VADDL.S16 Q0, D13, D1 563 564 SADDL v0.4s, v13.4h, v1.4h 565 MOV v1.d[0], v0.d[1] 566 sMULL v26.2d, v28.2s, v0.2s 567 Sqxtn v8.2s, v26.2d 568 sMULL v26.2d, v29.2s, v1.2s 569 Sqxtn v9.2s, v26.2d 570 MOV v8.d[1], v9.d[0] 571 movi v1.2s, #0 572// VADDL.S16 Q0, D12, D1 573 SADDL v0.4s, v12.4h, v1.4h 574 MOV v1.d[0], v0.d[1] 575 sMULL v24.2d, v28.2s, v0.2s 576 Sqxtn v26.2s, v24.2d 577 sMULL v24.2d, v29.2s, v1.2s 578 Sqxtn v27.2s, v24.2d 579 MOV v26.d[1], v27.d[0] 580 581 sQshL v4.4s, v4.4s, v16.4s 582 sQshL v6.4s, v6.4s, v16.4s 583 584 SQSUB v4.4s, v4.4s , v8.4s 585 SQSUB v6.4s, v6.4s , v26.4s 586 587 NEG v26.4s, v14.4s 588// VUZP.16 D14, D15 589 590 591 UZP1 v19.8h, v14.8h, v14.8h 592 UZP2 v21.8h, v14.8h, v14.8h 593 MOV v14.d[0], v19.d[0] 594 MOV v15.d[0], v21.d[0] 595 596// VUZP.16 D26, D27 597 598 599 UZP1 v19.8h, v26.8h, v26.8h 600 UZP2 
v21.8h, v26.8h, v26.8h 601 MOV v26.d[0], v19.d[0] 602 MOV v27.d[0], v21.d[0] 603 604 605 movi v1.2s, #0 606// VADDL.S16 Q0, D10, D1 607 SADDL v0.4s, v10.4h, v1.4h 608 MOV v1.d[0], v0.d[0] 609 sMULL v22.2d, v30.2s, v0.2s 610 Sqxtn v24.2s, v22.2d 611 sMULL2 v22.2d, v30.4s, v0.4s 612 Sqxtn v25.2s, v22.2d 613 MOV v24.d[1], v25.d[0] 614 movi v1.2s, #0 615// VADDL.S16 Q0, D11, D1 616 SADDL v0.4s, v11.4h, v1.4h 617 MOV v1.d[0], v0.d[1] 618 619 sMULL v8.2d, v30.2s, v0.2s 620 Sqxtn v22.2s, v8.2d 621 sMULL2 v8.2d, v30.4s, v0.4s 622 Sqxtn v23.2s, v8.2d 623 MOV v22.d[1], v23.d[0] 624 uMULL v8.4s, v26.4h, v11.4h 625 uMULL v30.4s, v14.4h, v10.4h 626 627 LD2 { v0.4s, v1.4s}, [x1] 628 MOV v2.16b, v1.16b 629 ADD x1, x1, x12 630 631// VUZP.16 D0, D1 632 633 UZP1 v19.8h, v0.8h, v0.8h 634 UZP2 v21.8h, v0.8h, v0.8h 635 MOV v0.d[0], v19.d[0] 636 MOV v0.d[1], v21.d[0] 637 638// VUZP.16 D2, D3 639 640 UZP1 v19.8h, v2.8h, v2.8h 641 UZP2 v21.8h, v2.8h, v2.8h 642 MOV v2.d[0], v19.d[0] 643 MOV v2.d[1], v21.d[0] 644 645 ushR v8.4s, v8.4s,#16 646 647 rev64 v0.8h, v0.8h 648 MOV v1.d[0], v0.d[1] 649 ushR v30.4s, v30.4s,#16 650 651 rev64 v2.8h, v2.8h 652 MOV v3.d[0], v2.d[1] 653 sMLAL v8.4s, v27.4h, v11.4h 654 655 sMLAL v30.4s, v15.4h, v10.4h 656 657 LD2 { v10.4s, v11.4s}, [x6] 658 ADD x6, x6, x12 659 MOV v12.16b, v11.16b 660 661 UZP1 v19.8h, v10.8h, v10.8h 662 UZP2 v21.8h, v10.8h, v10.8h 663 MOV v10.d[0], v19.d[0] 664 MOV v10.d[1], v21.d[0] 665 666 667 UZP1 v19.8h, v12.8h, v12.8h 668 UZP2 v21.8h, v12.8h, v12.8h 669 MOV v12.d[0], v19.d[0] 670 MOV v12.d[1], v21.d[0] 671 MOV V14.16B , V4.16B 672 673 rev64 v10.8h, v10.8h 674 MOV v11.d[0], v10.d[1] 675 676 677 rev64 v12.8h, v12.8h 678 MOV v13.d[0], v12.d[1] 679 680 sQshL v8.4s, v8.4s, v16.4s 681 682 MOV V31.16B, V6.16B 683 LD2 { v4.4s, v5.4s}, [x8] 684 ADD x8, x8,#32 685 686 MOV v6.16b, v5.16b 687 sQshL v30.4s, v30.4s, v16.4s 688 689// VUZP.16 D4, D5 690 691 UZP1 v19.8h, v4.8h, v4.8h 692 UZP2 v21.8h, v4.8h, v4.8h 693 MOV v4.d[0], v19.d[0] 694 MOV 
v5.d[0], v21.d[0] 695 696 SQSUB v8.4s, v8.4s , v24.4s 697 698// VUZP.16 D6, D7 699 700 UZP1 v19.8h, v6.8h, v6.8h 701 UZP2 v21.8h, v6.8h, v6.8h 702 MOV v6.d[0], v19.d[0] 703 MOV v7.d[0], v21.d[0] 704 705 SQSUB v22.4s, v30.4s , v22.4s 706 707 708 MOV V30.16B, V8.16B 709 710 LD2 {v8.4h, v9.4h}, [x2] 711 ADD x2, x2, #16 712 713 714CORE_LOOP: 715 ST1 {V14.S}[0], [x0] 716 ADD x0, x0, x9 717 ST1 {V22.S}[0], [x0] 718 ADD x0, x0, x9 719 720 721 ST1 {V14.S}[1], [x0] 722 ADD x0, x0, x9 723 724 725 ST1 {V22.S}[1], [x0] 726 ADD x0, x0, x9 727 728 729 ST1 {V14.S}[2], [x0] 730 ADD x0, x0, x9 731 732 733 ST1 {V22.S}[2], [x0] 734 ADD x0, x0, x9 735 736 737 ST1 {V14.S}[3], [x0] 738 ADD x0, x0, x9 739 740 741 ST1 {V22.S}[3], [x0] 742 ADD x0, x0, x9 743 744 745 ST1 {V31.S}[0], [x5] 746 ADD x5, x5, x10 747 748 749 ST1 {V30.S}[0], [x5] 750 ADD x5, x5, x10 751 752 753 ST1 {V31.S}[1], [x5] 754 ADD x5, x5, x10 755 756 757 ST1 {V30.S}[1], [x5] 758 ADD x5, x5, x10 759 760 761 ST1 {V31.S}[2], [x5] 762 ADD x5, x5, x10 763 764 765 ST1 {V30.S}[2], [x5] 766 ADD x5, x5, x10 767 768 769 ST1 {V31.S}[3], [x5] 770 ADD x5, x5, x10 771 772 ST1 {V30.S}[3], [x5] 773 ADD x5, x5, x10 774 775 776 uMULL v30.4s, v0.4h, v9.4h 777 uMULL v28.4s, v2.4h, v8.4h 778 uMULL v26.4s, v0.4h, v8.4h 779 uMULL v24.4s, v2.4h, v9.4h 780 ushR v30.4s, v30.4s,#16 781 ushR v28.4s, v28.4s,#16 782 sMLAL v30.4s, v1.4h, v9.4h 783 sMLAL v28.4s, v3.4h, v8.4h 784 ushR v26.4s, v26.4s,#16 785 ushR v24.4s, v24.4s,#16 786 sMLAL v26.4s, v1.4h, v8.4h 787 sMLAL v24.4s, v3.4h, v9.4h 788 ADD v30.4s, v30.4s , v28.4s 789 NEG v30.4s, v30.4s 790 SUB v28.4s, v24.4s , v26.4s 791 792 mov v26.16b, v30.16b 793 uMULL v22.4s, v4.4h, v8.4h 794 795 mov v24.16b, v28.16b 796 797// VUZP.16 D24, D25 798 799 UZP1 v19.8h, v24.8h, v24.8h 800 UZP2 v21.8h, v24.8h, v24.8h 801 MOV v24.d[0], v19.d[0] 802 MOV v25.d[0], v21.d[0] 803 804 805// VUZP.16 D26, D27 806 807 UZP1 v19.8h, v26.8h, v26.8h 808 UZP2 v21.8h, v26.8h, v26.8h 809 MOV v26.d[0], v19.d[0] 810 MOV v27.d[0], 
v21.d[0] 811 812 uMULL v2.4s, v24.4h, v18.4h 813 uMULL v0.4s, v26.4h, v18.4h 814 815 ushR v22.4s, v22.4s,#16 816 sMLAL v22.4s, v5.4h, v8.4h 817 818 ushR v2.4s, v2.4s,#16 819 ushR v0.4s, v0.4s,#16 820 sMLAL v2.4s, v25.4h, v18.4h 821 sMLAL v0.4s, v27.4h, v18.4h 822 823 uMULL v24.4s, v4.4h, v9.4h 824 uMULL v26.4s, v6.4h, v8.4h 825 826 NEG v2.4s, v2.4s 827 ADD v28.4s, v28.4s , v0.4s 828 ADD v30.4s, v30.4s , v2.4s 829 830 uMULL v0.4s, v6.4h, v9.4h 831 sshR v24.4s, v24.4s,#16 832 sMLAL v24.4s, v5.4h, v9.4h 833 sshR v26.4s, v26.4s,#16 834 sshR v0.4s, v0.4s,#16 835 sMLAL v26.4s, v7.4h, v8.4h 836 sMLAL v0.4s, v7.4h, v9.4h 837 838 839 840 ADD v22.4s, v22.4s , v0.4s 841 842 NEG v22.4s, v22.4s 843 SUB v24.4s, v26.4s , v24.4s 844 845 846 //LDR w11, [sp, #120] 847 //sxtw x11,w11 848 MOV w11, w26 849 dup v14.4s,w11 850 SQADD v28.4s, v28.4s , v14.4s 851 //LDR w11, [sp, #116] 852 //sxtw x11,w11 853 MOV w11, w25 854 dup v0.4s,w11 855 sQshL v28.4s, v28.4s, v0.4s 856 857 858 mov v0.16b, v22.16b 859 mov v14.16b, v24.16b 860 861// VUZP.16 D24, D25 862 863 UZP1 v19.8h, v24.8h, v24.8h 864 UZP2 v21.8h, v24.8h, v24.8h 865 MOV v24.d[0], v19.d[0] 866 MOV v25.d[0], v21.d[0] 867 868 869// VUZP.16 D22, D23 870 871 UZP1 v19.8h, v22.8h, v22.8h 872 UZP2 v21.8h, v22.8h, v22.8h 873 MOV v22.d[0], v19.d[0] 874 MOV v23.d[0], v21.d[0] 875 876 uMULL v8.4s, v24.4h, v18.4h 877 uMULL v26.4s, v22.4h, v18.4h 878 879 NEG v2.4s, v30.4s 880 881// VUZP.16 D30, D31 882 883 UZP1 v19.8h, v30.8h, v30.8h 884 UZP2 v21.8h, v30.8h, v30.8h 885 MOV v30.d[0], v19.d[0] 886 MOV v30.d[1], v21.d[0] 887 888 889// VUZP.16 D2, D3 890 891 UZP1 v19.8h, v2.8h, v2.8h 892 UZP2 v21.8h, v2.8h, v2.8h 893 MOV v2.d[0], v19.d[0] 894 MOV v3.d[0], v21.d[0] 895 896 uMULL v4.4s, v30.4h, v12.4h 897 uMULL v6.4s, v2.4h, v13.4h 898 899 ushR v8.4s, v8.4s,#16 900 ushR v26.4s, v26.4s,#16 901 902 sMLAL v8.4s, v25.4h, v18.4h 903 sMLAL v26.4s, v23.4h, v18.4h 904 905 ushR v4.4s, v4.4s,#16 906 ushR v6.4s, v6.4s,#16 907 908 MOV v19.d[0], v30.d[1] 909 910 
sMLAL v4.4s, v19.4h, v12.4h 911 sMLAL v6.4s, v3.4h, v13.4h 912 913 NEG v8.4s, v8.4s 914 ADD v14.4s, v14.4s , v26.4s 915 ADD v0.4s, v0.4s , v8.4s 916 917 918 919 //LDR w11, [sp, #120] 920 //sxtw x11,w11 921 MOV w11, w26 922 dup v8.4s,w11 923 SQADD v0.4s, v0.4s , v8.4s 924 //LDR w11, [sp, #116] 925 //sxtw x11,w11 926 MOV w11, w25 927 dup v26.4s,w11 928 sQshL v0.4s, v0.4s, v26.4s 929 mov v26.16b, v28.16b 930 931 LD2 { v28.4s, v29.4s}, [x4] 932 MOV v30.16b, v29.16b 933 MOV v29.d[0], v28.d[1] 934// VZIP.32 Q13, Q0 935 936 ZIP1 v19.4s, v26.4s, v0.4s 937 ZIP2 v0.4s, v26.4s, v0.4s 938 MOV v26.16b, v19.16b 939 940 ST1 { v26.4s}, [x4] 941 ADD x4, x4,#16 942 ST1 { v0.4s}, [x4] 943 ADD x4, x4,#16 944 945 movi v1.2s, #0 946// VADDL.S16 Q0, D13, D1 947 SADDL v0.4s, v13.4h, v1.4h 948 MOV v1.d[0], v0.d[1] 949 950 sMULL v26.2d, v28.2s, v0.2s 951 Sqxtn v8.2s, v26.2d 952 sMULL v26.2d, v29.2s, v1.2s 953 Sqxtn v9.2s, v26.2d 954 MOV v8.d[1], v9.d[0] 955 movi v1.2s, #0 956 //VADDL.S16 Q0, D12, D1 957 SADDL v0.4s, v12.4h, v1.4h 958 MOV v1.d[0], v0.d[1] 959 960 sMULL v24.2d, v28.2s, v0.2s 961 Sqxtn v26.2s, v24.2d 962 sMULL v24.2d, v29.2s, v1.2s 963 Sqxtn v27.2s, v24.2d 964 MOV v26.d[1], v27.d[0] 965 sQshL v4.4s, v4.4s, v16.4s 966 sQshL v6.4s, v6.4s, v16.4s 967 968 969 970 SQSUB v4.4s, v4.4s , v8.4s 971 SQSUB v6.4s, v6.4s , v26.4s 972 973 NEG v26.4s, v14.4s 974// VUZP.16 D26, D27 975 UZP1 v19.8h, v26.8h, v26.8h 976 UZP2 v21.8h, v26.8h, v26.8h 977 MOV v26.d[0], v19.d[0] 978 MOV v27.d[0], v21.d[0] 979 980 movi v1.2s, #0 981 //VADDL.S16 Q0, D10, D1 982 SADDL v0.4s, v10.4h, v1.4h 983 MOV v1.d[0], v0.d[1] 984 985 sMULL v22.2d, v30.2s, v0.2s 986 Sqxtn v24.2s, v22.2d 987 sMULL2 v22.2d, v30.4s, v0.4s 988 Sqxtn v25.2s, v22.2d 989 MOV v24.d[1], v25.d[0] 990 movi v1.2s, #0 991 //VADDL.S16 Q0, D11, D1 992 SADDL v0.4s, v11.4h, v1.4h 993 994 sMULL v8.2d, v30.2s, v0.2s 995 Sqxtn v22.2s, v8.2d 996 sMULL2 v8.2d, v30.4s, v0.4s 997 Sqxtn v23.2s, v8.2d 998 MOV v22.d[1], v23.d[0] 999 1000// VUZP.16 D14, D15 
1001 1002 UZP1 v19.8h, v14.8h, v14.8h 1003 UZP2 v21.8h, v14.8h, v14.8h 1004 MOV v14.d[0], v19.d[0] 1005 MOV v15.d[0], v21.d[0] 1006 1007 uMULL v8.4s, v26.4h, v11.4h 1008 uMULL v30.4s, v14.4h, v10.4h 1009 1010 1011 LD2 { v0.4s, v1.4s}, [x1] 1012 MOV v2.16b, v1.16b 1013 ADD X1, X1, x12 1014 1015// VUZP.16 D0, D1 1016 UZP1 v19.8h, v0.8h, v0.8h 1017 UZP2 v21.8h, v0.8h, v0.8h 1018 MOV v0.d[0], v19.d[0] 1019 MOV v0.d[1], v21.d[0] 1020 1021// VUZP.16 D2, D3 1022 1023 UZP1 v19.8h, v2.8h, v2.8h 1024 UZP2 v21.8h, v2.8h, v2.8h 1025 MOV v2.d[0], v19.d[0] 1026 MOV v2.d[1], v21.d[0] 1027 1028 ushR v8.4s, v8.4s,#16 1029 1030 rev64 v0.8h, v0.8h 1031 MOV v1.d[0], v0.d[1] 1032 ushR v30.4s, v30.4s,#16 1033 1034 rev64 v2.8h, v2.8h 1035 MOV v3.d[0], v2.d[1] 1036 sMLAL v8.4s, v27.4h, v11.4h 1037 1038 sMLAL v30.4s, v15.4h, v10.4h 1039 1040 LD2 { v10.4s, v11.4s}, [x6] 1041 add X6, x6, x12 1042 MOV v12.16b, v11.16b 1043 1044 1045 //VUZP.16 D10, D11 1046 1047 UZP1 v19.8h, v10.8h, v10.8h 1048 UZP2 v21.8h, v10.8h, v10.8h 1049 MOV v10.d[0], v19.d[0] 1050 MOV v10.d[1], v21.d[0] 1051 1052 1053 1054// VUZP.16 D12, D13 1055 1056 UZP1 v19.8h, v12.8h, v12.8h 1057 UZP2 v21.8h, v12.8h, v12.8h 1058 MOV v12.d[0], v19.d[0] 1059 MOV v12.d[1], v21.d[0] 1060 1061 1062 1063 MOV V14.16B, V4.16B 1064 1065 rev64 v10.8h, v10.8h 1066 MOV v11.d[0], v10.d[1] 1067 1068 1069 rev64 v12.8h, v12.8h 1070 MOV v13.d[0], v12.d[1] 1071 1072 sQshL v8.4s, v8.4s, v16.4s 1073 1074 LD2 { v4.4s, v5.4s}, [x8] 1075 ADD x8, x8, #32 1076 1077 MOV V31.16B, V6.16B 1078 MOV v6.16b, v5.16b 1079 1080 sQshL v30.4s, v30.4s, v16.4s 1081 1082 1083 UZP1 v19.8h, v4.8h, v4.8h 1084 UZP2 v21.8h, v4.8h, v4.8h 1085 MOV v4.d[0], v19.d[0] 1086 MOV v5.d[0], v21.d[0] 1087 1088 1089 SQSUB v8.4s, v8.4s , v24.4s 1090 1091// VUZP.16 D6, D7 1092 1093 UZP1 v19.8h, v6.8h, v6.8h 1094 UZP2 v21.8h, v6.8h, v6.8h 1095 MOV v6.d[0], v19.d[0] 1096 MOV v7.d[0], v21.d[0] 1097 1098 SQSUB v22.4s, v30.4s , v22.4s 1099 1100 MOV V30.16B , V8.16B 1101 1102 LD2 {v8.4h, v9.4h}, 
[x2] 1103 ADD x2, x2,#16 1104 1105 1106 1107 1108 SUBS x3, x3, #1 1109 BNE CORE_LOOP 1110 1111 1112 1113 1114 1115EPILOGUE: 1116 1117 ST1 {V14.S}[0],[x0] 1118 ADD x0, x0, x9 1119 1120 1121 ST1 {V22.S}[0],[x0] 1122 ADD x0, x0, x9 1123 1124 1125 ST1 {V14.S}[1],[x0] 1126 ADD x0, x0, x9 1127 1128 1129 ST1 {V22.S}[1],[x0] 1130 ADD x0, x0, x9 1131 1132 1133 ST1 {V14.S}[2],[x0] 1134 ADD x0, x0, x9 1135 1136 1137 ST1 {V22.S}[2],[x0] 1138 ADD x0, x0, x9 1139 1140 1141 ST1 {V14.S}[3],[x0] 1142 ADD x0, x0, x9 1143 1144 1145 ST1 {V22.S}[3],[x0] 1146 ADD x0, x0, x9 1147 1148 1149 ST1 {V31.S}[0],[x5] 1150 ADD x5, x5, x10 1151 1152 1153 ST1 {V30.S}[0],[x5] 1154 ADD x5, x5, x10 1155 1156 1157 ST1 {V31.S}[1],[x5] 1158 ADD x5, x5, x10 1159 1160 1161 ST1 {V30.S}[1],[x5] 1162 ADD x5, x5, x10 1163 1164 1165 ST1 {V31.S}[2],[x5] 1166 ADD x5, x5, x10 1167 1168 1169 ST1 {V30.S}[2],[x5] 1170 ADD x5, x5, x10 1171 1172 1173 ST1 {V31.S}[3],[x5] 1174 ADD x5, x5, x10 1175 1176 1177 ST1 {V30.S}[3],[x5] 1178 ADD x5, x5, x10 1179 1180 1181 uMULL v30.4s, v0.4h, v9.4h 1182 uMULL v28.4s, v2.4h, v8.4h 1183 uMULL v26.4s, v0.4h, v8.4h 1184 uMULL v24.4s, v2.4h, v9.4h 1185 ushR v30.4s, v30.4s,#16 1186 ushR v28.4s, v28.4s,#16 1187 sMLAL v30.4s, v1.4h, v9.4h 1188 sMLAL v28.4s, v3.4h, v8.4h 1189 ushR v26.4s, v26.4s,#16 1190 ushR v24.4s, v24.4s,#16 1191 sMLAL v26.4s, v1.4h, v8.4h 1192 sMLAL v24.4s, v3.4h, v9.4h 1193 ADD v30.4s, v30.4s , v28.4s 1194 NEG v30.4s, v30.4s 1195 SUB v28.4s, v24.4s , v26.4s 1196 1197 1198 uMULL v22.4s, v4.4h, v8.4h 1199 mov v26.16b, v30.16b 1200 mov v24.16b, v28.16b 1201 1202 mov v26.16b, v30.16b 1203 mov v24.16b, v28.16b 1204 1205 //VUZP.16 D26, D27 1206 1207 UZP1 v19.8h, v26.8h, v26.8h 1208 UZP2 v21.8h, v26.8h, v26.8h 1209 MOV v26.d[0], v19.d[0] 1210 MOV v27.d[0], v21.d[0] 1211 1212// VUZP.16 D24, D25 1213 1214 UZP1 v19.8h, v24.8h, v24.8h 1215 UZP2 v21.8h, v24.8h, v24.8h 1216 MOV v24.d[0], v19.d[0] 1217 MOV v25.d[0], v21.d[0] 1218 1219 uMULL v2.4s, v24.4h, v18.4h 1220 uMULL v0.4s, 
v26.4h, v18.4h 1221 1222 ushR v22.4s, v22.4s,#16 1223 sMLAL v22.4s, v5.4h, v8.4h 1224 1225 ushR v2.4s, v2.4s,#16 1226 ushR v0.4s, v0.4s,#16 1227 sMLAL v2.4s, v25.4h, v18.4h 1228 sMLAL v0.4s, v27.4h, v18.4h 1229 1230 uMULL v24.4s, v4.4h, v9.4h 1231 uMULL v26.4s, v6.4h, v8.4h 1232 1233 NEG v2.4s, v2.4s 1234 ADD v28.4s, v28.4s , v0.4s 1235 ADD v30.4s, v30.4s , v2.4s 1236 1237 uMULL v0.4s, v6.4h, v9.4h 1238 sshR v24.4s, v24.4s,#16 1239 sMLAL v24.4s, v5.4h, v9.4h 1240 sshR v26.4s, v26.4s,#16 1241 sshR v0.4s, v0.4s,#16 1242 sMLAL v26.4s, v7.4h, v8.4h 1243 sMLAL v0.4s, v7.4h, v9.4h 1244 1245 1246 1247 1248 1249 ADD v22.4s, v22.4s , v0.4s 1250 NEG v22.4s, v22.4s 1251 SUB v24.4s, v26.4s , v24.4s 1252 1253 1254 1255 1256 //LDR w11, [sp, #120] 1257 //sxtw x11,w11 1258 MOV w11, w26 1259 dup v14.4s,w11 1260 SQADD v28.4s, v28.4s , v14.4s 1261 //LDR w11, [sp, #116] 1262 //sxtw x11,w11 1263 MOV w11, w25 1264 dup v0.4s,w11 1265 sQshL v28.4s, v28.4s, v0.4s 1266 1267 1268 mov v0.16b, v22.16b 1269 mov v14.16b, v24.16b 1270 1271 1272// VUZP.16 D22, D23 1273 1274 UZP1 v19.8h, v22.8h, v22.8h 1275 UZP2 v21.8h, v22.8h, v22.8h 1276 MOV v22.d[0], v19.d[0] 1277 MOV v23.d[0], v21.d[0] 1278 1279// VUZP.16 D24, D25 1280 1281 UZP1 v19.8h, v24.8h, v24.8h 1282 UZP2 v21.8h, v24.8h, v24.8h 1283 MOV v24.d[0], v19.d[0] 1284 MOV v25.d[0], v21.d[0] 1285 1286 uMULL v8.4s, v24.4h, v18.4h 1287 uMULL v26.4s, v22.4h, v18.4h 1288 1289 NEG v2.4s, v30.4s 1290 1291// VUZP.16 D30, D31 1292 1293 UZP1 v19.8h, v30.8h, v30.8h 1294 UZP2 v21.8h, v30.8h, v30.8h 1295 MOV v30.d[0], v19.d[0] 1296 MOV v30.d[1], v21.d[0] 1297 1298// VUZP.16 D2, D3 1299 1300 UZP1 v19.8h, v2.8h, v2.8h 1301 UZP2 v21.8h, v2.8h, v2.8h 1302 MOV v2.d[0], v19.d[0] 1303 MOV v3.d[0], v21.d[0] 1304 1305 uMULL v4.4s, v30.4h, v12.4h 1306 uMULL v6.4s, v2.4h, v13.4h 1307 1308 ushR v8.4s, v8.4s,#16 1309 ushR v26.4s, v26.4s,#16 1310 1311 sMLAL v8.4s, v25.4h, v18.4h 1312 sMLAL v26.4s, v23.4h, v18.4h 1313 1314 ushR v4.4s, v4.4s,#16 1315 ushR v6.4s, v6.4s,#16 
1316 1317 MOV v19.d[0], v30.d[1] 1318 1319 sMLAL v4.4s, v19.4h, v12.4h 1320 sMLAL v6.4s, v3.4h, v13.4h 1321 1322 NEG v8.4s, v8.4s 1323 ADD v14.4s, v14.4s , v26.4s 1324 ADD v0.4s, v0.4s , v8.4s 1325 1326 //LDR w11, [sp, #120] 1327 //sxtw x11,w11 1328 MOV w11, w26 1329 dup v8.4s,w11 1330 SQADD v0.4s, v0.4s , v8.4s 1331 //LDR w11, [sp, #116] 1332 //sxtw x11,w11 1333 MOV w11, w25 1334 dup v26.4s,w11 1335 sQshL v0.4s, v0.4s, v26.4s 1336 1337 1338 mov v26.16b, v28.16b 1339 1340 LD2 { v28.4s, v29.4s}, [x4] 1341 MOV v30.16b, v29.16b 1342 MOV v29.d[0], v28.d[1] 1343// VZIP.32 Q13, Q0 1344 1345 ZIP1 v19.4s, v26.4s, v0.4s 1346 ZIP2 v0.4s, v26.4s, v0.4s 1347 MOV v26.16b, v19.16b 1348 1349 ST1 { v26.4s}, [x4],#16 1350 ST1 { v0.4s}, [x4],#16 1351 1352 movi v1.2s, #0 1353// VADDL.S16 Q0, D13, D1 1354 SADDL v0.4s, v13.4h, v1.4h 1355 MOV v1.d[0], v0.d[1] 1356 1357 sMULL v26.2d, v28.2s, v0.2s 1358 Sqxtn v8.2s, v26.2d 1359 sMULL v26.2d, v29.2s, v1.2s 1360 Sqxtn v9.2s, v26.2d 1361 MOV v8.d[1], v9.d[0] 1362 movi v1.2s, #0 1363// VADDL.S16 Q0, D12, D1 1364 SADDL v0.4s, v12.4h, v1.4h 1365 MOV v1.d[0], v0.d[1] 1366 1367 sMULL v24.2d, v28.2s, v0.2s 1368 Sqxtn v26.2s, v24.2d 1369 sMULL v24.2d, v29.2s, v1.2s 1370 Sqxtn v27.2s, v24.2d 1371 MOV v26.d[1], v27.d[0] 1372 1373 sQshL v4.4s, v4.4s, v16.4s 1374 sQshL v6.4s, v6.4s, v16.4s 1375 1376 SQSUB v4.4s, v4.4s , v8.4s 1377 SQSUB v6.4s, v6.4s , v26.4s 1378 1379 NEG v26.4s, v14.4s 1380// VUZP.16 D14, D15 1381 1382 UZP1 v19.8h, v14.8h, v14.8h 1383 UZP2 v21.8h, v14.8h, v14.8h 1384 MOV v14.d[0], v19.d[0] 1385 MOV v15.d[0], v21.d[0] 1386 1387 1388 // VUZP.16 D26, D27 1389 1390 UZP1 v19.8h, v26.8h, v26.8h 1391 UZP2 v21.8h, v26.8h, v26.8h 1392 MOV v26.d[0], v19.d[0] 1393 MOV v27.d[0], v21.d[0] 1394 1395 1396 movi v1.2s, #0 1397 //VADDL.S16 Q0, D10, D1 1398 SADDL v0.4s, v10.4h, v1.4h 1399 MOV v1.d[0], v0.d[1] 1400 1401 sMULL v22.2d, v30.2s, v0.2s 1402 Sqxtn v24.2s, v22.2d 1403 sMULL2 v22.2d, v30.4s, v0.4s 1404 Sqxtn v25.2s, v22.2d 1405 MOV v24.d[1], 
v25.d[0] 1406 movi v1.2s, #0 1407 //VADDL.S16 Q0, D11, D1 1408 SADDL v0.4s, v11.4h, v1.4h 1409 MOV v1.d[0], v0.d[1] 1410 1411 sMULL v8.2d, v30.2s, v0.2s 1412 Sqxtn v22.2s, v8.2d 1413 sMULL2 v8.2d, v30.4s, v0.4s 1414 Sqxtn v23.2s, v8.2d 1415 MOV v22.d[1], v23.d[0] 1416 1417 uMULL v8.4s, v26.4h, v11.4h 1418 uMULL v30.4s, v14.4h, v10.4h 1419 1420 ushR v8.4s, v8.4s,#16 1421 1422 ushR v30.4s, v30.4s,#16 1423 1424 sMLAL v8.4s, v27.4h, v11.4h 1425 1426 sMLAL v30.4s, v15.4h, v10.4h 1427 1428 1429 MOV V14.16B, V4.16B 1430 1431 1432 sQshL v8.4s, v8.4s, v16.4s 1433 1434 sQshL v30.4s, v30.4s, v16.4s 1435 1436 SQSUB v8.4s, v8.4s , v24.4s 1437 1438 SQSUB v22.4s, v30.4s , v22.4s 1439 1440 MOV V30.16B , V8.16B 1441 1442 1443 1444 1445 ST1 {V14.S}[0],[x0] 1446 ADD x0, x0, x9 1447 ST1 {V22.S}[0],[x0] 1448 ADD x0, x0, x9 1449 ST1 {V14.S}[1],[x0] 1450 ADD x0, x0, x9 1451 ST1 {V22.S}[1],[x0] 1452 ADD x0, x0, x9 1453 ST1 {V14.S}[2],[x0] 1454 ADD x0, x0, x9 1455 ST1 {V22.S}[2],[x0] 1456 ADD x0, x0, x9 1457 ST1 {V14.S}[3],[x0] 1458 ADD x0, x0, x9 1459 ST1 {V22.S}[3],[x0] 1460 ADD x0, x0, x9 1461 ST1 {V6.S}[0],[x5] 1462 ADD x5, x5, x10 1463 ST1 {V30.S}[0],[x5] 1464 ADD x5, x5, x10 1465 ST1 {V6.S}[1],[x5] 1466 ADD x5, x5, x10 1467 ST1 {V30.S}[1],[x5] 1468 ADD x5, x5, x10 1469 ST1 {V6.S}[2],[x5] 1470 ADD x5, x5, x10 1471 ST1 {V30.S}[2],[x5] 1472 ADD x5, x5, x10 1473 ST1 {V6.S}[3],[x5] 1474 ADD x5, x5, x10 1475 ST1 {V30.S}[3],[x5] 1476 ADD x5, x5, x10 1477 1478ARM_EPILOGUE: 1479 1480ARM_LOOP: 1481 1482 LD2 { v0.4s, v1.4s}, [x1] 1483 MOV v2.16b, v1.16b 1484 1485 //VUZP.16 D0, D1 1486 UZP1 v19.8h, v0.8h, v0.8h 1487 UZP2 v21.8h, v0.8h, v0.8h 1488 MOV v0.d[0], v19.d[0] 1489 MOV v0.d[1], v21.d[0] 1490 1491 //VUZP.16 D2, D3 1492 UZP1 v19.8h, v2.8h, v2.8h 1493 UZP2 v21.8h, v2.8h, v2.8h 1494 MOV v2.d[0], v19.d[0] 1495 MOV v2.d[1], v21.d[0] 1496 1497 1498 rev64 v0.8h, v0.8h 1499 MOV v1.d[0], v0.d[1] 1500 rev64 v2.8h, v2.8h 1501 MOV v3.d[0], v2.d[1] 1502 1503 LD2 {v8.4h, v9.4h}, [x2] 1504 ADD x2, 
x2,#16 1505 1506 LD2 {v4.2s, v5.2s}, [x8] 1507 ADD x8, x8,#16 1508 MOV v6.16b, v5.16b 1509 movi v5.2s, #0x00000000 1510 movi v7.2s, #0x00000000 1511 1512 LD1 {v5.s}[0],[x8],#4 1513 LD1 {v7.s}[0],[x8] 1514 1515 MOV x12, #16 1516 MOV v4.d[1], v5.d[0] 1517 MOV v6.d[1], v7.d[0] 1518// VUZP.16 D4, D5 1519 1520 UZP1 v19.8h, v4.8h, v4.8h 1521 UZP2 v21.8h, v4.8h, v4.8h 1522 MOV v4.d[0], v19.d[0] 1523 MOV v5.d[0], v21.d[0] 1524 1525// VUZP.16 D6, D7 1526 1527 UZP1 v19.8h, v6.8h, v6.8h 1528 UZP2 v21.8h, v6.8h, v6.8h 1529 MOV v6.d[0], v19.d[0] 1530 MOV v7.d[0], v21.d[0] 1531 1532 ADD x6, x6, #16 1533 1534 MOV x12, #-4 1535 LD2 {v11.2s, v12.2s}, [x6] 1536 ADD x6, x6, x12 1537 MOV v13.16b, v12.16b 1538 1539 1540 movi v10.2s, #0x00000000 1541 1542 LD1 {v12.s}[1],[x6] 1543 ADD x6, x6, x12 1544 LD1 {v10.s}[1],[x6] 1545 ADD x6, x6, x12 1546 LD1 {v12.s}[0],[x6] 1547 ADD x6, x6, x12 1548 1549 MOV v10.d[1], v11.d[0] 1550 MOV v12.d[1], v13.d[0] 1551 1552 //VUZP.16 D10, D11 1553 1554 UZP1 v19.8h, v10.8h, v10.8h 1555 UZP2 v21.8h, v10.8h, v10.8h 1556 MOV v10.d[0], v19.d[0] 1557 MOV v10.d[1], v21.d[0] 1558 1559 //VUZP.16 D12, D13 1560 1561 UZP1 v19.8h, v12.8h, v12.8h 1562 UZP2 v21.8h, v12.8h, v12.8h 1563 MOV v12.d[0], v19.d[0] 1564 MOV v12.d[1], v21.d[0] 1565 1566 1567 rev64 v10.8h, v10.8h 1568 MOV v11.d[0], v10.d[1] 1569 rev64 v12.8h, v12.8h 1570 MOV v13.d[0], v12.d[1] 1571 1572 uMULL v30.4s, v0.4h, v9.4h 1573 uMULL v28.4s, v2.4h, v8.4h 1574 uMULL v26.4s, v0.4h, v8.4h 1575 uMULL v24.4s, v2.4h, v9.4h 1576 1577 ushR v30.4s, v30.4s,#16 1578 ushR v28.4s, v28.4s,#16 1579 1580 sMLAL v30.4s, v1.4h, v9.4h 1581 sMLAL v28.4s, v3.4h, v8.4h 1582 1583 ushR v26.4s, v26.4s,#16 1584 ushR v24.4s, v24.4s,#16 1585 1586 sMLAL v26.4s, v1.4h, v8.4h 1587 sMLAL v24.4s, v3.4h, v9.4h 1588 1589 ADD v30.4s, v30.4s , v28.4s 1590 NEG v30.4s, v30.4s 1591 1592 uMULL v22.4s, v4.4h, v8.4h 1593 1594 SUB v28.4s, v24.4s , v26.4s 1595 1596 1597 mov v26.16b, v30.16b 1598 mov v24.16b, v28.16b 1599 1600// VUZP.16 D26, D27 1601 
1602 UZP1 v19.8h, v26.8h, v26.8h 1603 UZP2 v21.8h, v26.8h, v26.8h 1604 MOV v26.d[0], v19.d[0] 1605 MOV v27.d[0], v21.d[0] 1606 1607 //VUZP.16 D24, D25 1608 1609 UZP1 v19.8h, v24.8h, v24.8h 1610 UZP2 v21.8h, v24.8h, v24.8h 1611 MOV v24.d[0], v19.d[0] 1612 MOV v25.d[0], v21.d[0] 1613 1614 uMULL v2.4s, v24.4h, v18.4h 1615 uMULL v0.4s, v26.4h, v18.4h 1616 1617 ushR v22.4s, v22.4s,#16 1618 sMLAL v22.4s, v5.4h, v8.4h 1619 1620 ushR v2.4s, v2.4s,#16 1621 ushR v0.4s, v0.4s,#16 1622 sMLAL v2.4s, v25.4h, v18.4h 1623 sMLAL v0.4s, v27.4h, v18.4h 1624 1625 uMULL v24.4s, v4.4h, v9.4h 1626 uMULL v26.4s, v6.4h, v8.4h 1627 1628 NEG v2.4s, v2.4s 1629 ADD v28.4s, v28.4s , v0.4s 1630 ADD v30.4s, v30.4s , v2.4s 1631 1632 uMULL v0.4s, v6.4h, v9.4h 1633 sshR v24.4s, v24.4s,#16 1634 sMLAL v24.4s, v5.4h, v9.4h 1635 sshR v26.4s, v26.4s,#16 1636 sshR v0.4s, v0.4s,#16 1637 sMLAL v26.4s, v7.4h, v8.4h 1638 sMLAL v0.4s, v7.4h, v9.4h 1639 1640 ADD v22.4s, v22.4s , v0.4s 1641 NEG v22.4s, v22.4s 1642 SUB v24.4s, v26.4s , v24.4s 1643 1644 //LDR w11, [sp, #120] 1645 //sxtw x11,w11 1646 MOV w11, w26 1647 dup v14.4s,w11 1648 SQADD v28.4s, v28.4s , v14.4s 1649 //LDR w11, [sp, #116] 1650 //sxtw x11,w11 1651 MOV w11, w25 1652 dup v0.4s,w11 1653 sQshL v28.4s, v28.4s, v0.4s 1654 1655 mov v0.16b, v22.16b 1656 mov v14.16b, v24.16b 1657 1658// VUZP.16 D22, D23 1659 1660 UZP1 v19.8h, v22.8h, v22.8h 1661 UZP2 v21.8h, v22.8h, v22.8h 1662 MOV v22.d[0], v19.d[0] 1663 MOV v23.d[0], v21.d[0] 1664 1665 // VUZP.16 D24, D25 1666 1667 UZP1 v19.8h, v24.8h, v24.8h 1668 UZP2 v21.8h, v24.8h, v24.8h 1669 MOV v24.d[0], v19.d[0] 1670 MOV v25.d[0], v21.d[0] 1671 1672 uMULL v8.4s, v24.4h, v18.4h 1673 uMULL v26.4s, v22.4h, v18.4h 1674 1675 NEG v2.4s, v30.4s 1676// VUZP.16 D30, D31 1677 1678 UZP1 v19.8h, v30.8h, v30.8h 1679 UZP2 v21.8h, v30.8h, v30.8h 1680 MOV v30.d[0], v19.d[0] 1681 MOV v30.d[1], v21.d[0] 1682 1683// VUZP.16 D2, D3 1684 1685 UZP1 v19.8h, v2.8h, v2.8h 1686 UZP2 v21.8h, v2.8h, v2.8h 1687 MOV v2.d[0], v19.d[0] 1688 
MOV v3.d[0], v21.d[0] 1689 1690 uMULL v4.4s, v30.4h, v12.4h 1691 uMULL v6.4s, v2.4h, v13.4h 1692 1693 ushR v8.4s, v8.4s,#16 1694 ushR v26.4s, v26.4s,#16 1695 1696 sMLAL v8.4s, v25.4h, v18.4h 1697 sMLAL v26.4s, v23.4h, v18.4h 1698 1699 ushR v4.4s, v4.4s,#16 1700 ushR v6.4s, v6.4s,#16 1701 1702 MOV v19.d[0], v30.d[1] 1703 1704 sMLAL v4.4s, v19.4h, v12.4h 1705 sMLAL v6.4s, v3.4h, v13.4h 1706 1707 NEG v8.4s, v8.4s 1708 ADD v14.4s, v14.4s , v26.4s 1709 ADD v0.4s, v0.4s , v8.4s 1710 1711 //LDR w11, [sp, #120] 1712 //sxtw x11,w11 1713 MOV w11, w26 1714 dup v8.4s,w11 1715 SQADD v0.4s, v0.4s , v8.4s 1716 //LDR w11, [sp, #116] 1717 //sxtw x11,w11 1718 MOV w11, w25 1719 dup v26.4s,w11 1720 sQshL v0.4s, v0.4s, v26.4s 1721 1722 mov v26.16b, v28.16b 1723 1724 MOV x6, x4 1725 1726 LD1 {v28.2s, v29.2s}, [x4],#16 1727 movi v19.2s, #0x00000000 1728 LD1 {v30.s}[0],[x4],#4 1729 LD1 {v30.s}[1],[x4],#4 1730 LD1 {v19.s}[0],[x4],#4 1731 1732 MOV v28.d[1], v29.d[0] 1733 MOV v30.d[1], v19.d[0] 1734 1735 //VUZP.32 Q14, Q15 1736 1737 UZP1 v19.4s, v28.4s, v30.4s 1738 UZP2 v30.4s, v28.4s, v30.4s 1739 MOV v28.16b, v19.16b 1740 MOV v29.d[0], v28.d[1] 1741 1742 ST1 {v26.s}[0],[x6],#4 1743 ST1 {v0.s}[0],[x6],#4 1744 ST1 {v26.s}[1],[x6],#4 1745 ST1 {v0.s}[1],[x6],#4 1746 ST1 {v26.s}[2],[x6],#4 1747 ST1 {v0.s}[2],[x6],#4 1748 ST1 {v26.s}[3],[x6],#4 1749 1750 movi v1.2s, #0 1751 //VADDL.S16 Q0, D13, D1 1752 SADDL v0.4s, v13.4h, v1.4h 1753 MOV v1.d[0], v0.d[1] 1754 1755 sMULL v26.2d, v28.2s, v0.2s 1756 Sqxtn v8.2s, v26.2d 1757 sMULL v26.2d, v29.2s, v1.2s 1758 Sqxtn v9.2s, v26.2d 1759 MOV v8.d[1], v9.d[0] 1760 movi v1.2s, #0 1761 //VADDL.S16 Q0, D12, D1 1762 SADDL v0.4s, v12.4h, v1.4h 1763 MOV v1.d[0], v0.d[1] 1764 1765 sMULL v24.2d, v28.2s, v0.2s 1766 Sqxtn v26.2s, v24.2d 1767 sMULL v24.2d, v29.2s, v1.2s 1768 Sqxtn v27.2s, v24.2d 1769 MOV v26.d[1], v27.d[0] 1770 1771 sQshL v4.4s, v4.4s, v16.4s 1772 sQshL v6.4s, v6.4s, v16.4s 1773 1774 SQSUB v4.4s, v4.4s , v8.4s 1775 SQSUB v6.4s, v6.4s , v26.4s 1776 
1777 NEG v26.4s, v14.4s 1778 //VUZP.16 D14, D15 1779 1780 UZP1 v19.8h, v14.8h, v14.8h 1781 UZP2 v21.8h, v14.8h, v14.8h 1782 MOV v14.d[0], v19.d[0] 1783 MOV v15.d[0], v21.d[0] 1784 1785// VUZP.16 D26, D27 1786 1787 UZP1 v19.8h, v26.8h, v26.8h 1788 UZP2 v21.8h, v26.8h, v26.8h 1789 MOV v26.d[0], v19.d[0] 1790 MOV v27.d[0], v21.d[0] 1791 1792 1793 movi v1.2s, #0 1794 //VADDL.S16 Q0, D10, D1 1795 SADDL v0.4s, v10.4h, v1.4h 1796 MOV v1.d[0], v0.d[1] 1797 1798 sMULL v22.2d, v30.2s, v0.2s 1799 Sqxtn v24.2s, v22.2d 1800 sMULL2 v22.2d, v30.4s, v0.4s 1801 Sqxtn v25.2s, v22.2d 1802 MOV v24.d[1], v25.d[0] 1803 1804 movi v1.2s, #0 1805// VADDL.S16 Q0, D11, D1 1806 SADDL v0.4s, v11.4h, v1.4h 1807 MOV v1.d[0], v0.d[1] 1808 1809 sMULL v8.2d, v30.2s, v0.2s 1810 Sqxtn v22.2s, v8.2d 1811 sMULL2 v8.2d, v30.4s, v0.4s 1812 Sqxtn v23.2s, v8.2d 1813 MOV v22.d[1], v23.d[0] 1814 1815 uMULL v8.4s, v26.4h, v11.4h 1816 uMULL v30.4s, v14.4h, v10.4h 1817 1818 ushR v8.4s, v8.4s,#16 1819 1820 ushR v30.4s, v30.4s,#16 1821 1822 sMLAL v8.4s, v27.4h, v11.4h 1823 1824 sMLAL v30.4s, v15.4h, v10.4h 1825 1826 MOV V14.16B , V4.16B 1827 1828 //mov v15.8b, v6.8b 1829 sQshL v8.4s, v8.4s, v16.4s 1830 1831 sQshL v30.4s, v30.4s, v16.4s 1832 1833 SQSUB v8.4s, v8.4s , v24.4s 1834 1835 SQSUB v22.4s, v30.4s , v22.4s 1836 1837 MOV V30.16B, V8.16B 1838 1839 1840 1841 1842 1843 1844 1845 ST1 {V14.S}[0],[x0] 1846 ADD x0, x0, x9 1847 ST1 {V22.S}[0],[x0] 1848 ADD x0, x0, x9 1849 ST1 {V14.S}[1],[x0] 1850 ADD x0, x0, x9 1851 ST1 {V22.S}[1],[x0] 1852 ADD x0, x0, x9 1853 ST1 {V14.S}[2],[x0] 1854 ADD x0, x0, x9 1855 ST1 {V22.S}[2],[x0] 1856 ADD x0, x0, x9 1857 ST1 {V14.S}[3],[x0] 1858 ADD x0, x0, x9 1859 1860 ST1 {V6.S}[0],[x5] 1861 ADD x5, x5, x10 1862 ST1 {V30.S}[0],[x5] 1863 ADD x5, x5, x10 1864 ST1 {V6.S}[1],[x5] 1865 ADD x5, x5, x10 1866 ST1 {V30.S}[1],[x5] 1867 ADD x5, x5, x10 1868 ST1 {V6.S}[2],[x5] 1869 ADD x5, x5, x10 1870 ST1 {V30.S}[2],[x5] 1871 ADD x5, x5, x10 1872 ST1 {V6.S}[3],[x5] 1873 ADD x5, x5, x10 1874 1875 
pop_v_regs 1876 ret 1877 1878 1879