// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <ring-core/asm_base.h>

// Everything below is assembled only when assembly is enabled and the target
// is AArch64 with _WIN32 defined.
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <ring-core/arm_arch.h>




.section .rodata

// Read-only constants, 32-byte aligned.
//   Lsigma  16 bytes, the ASCII text [expand 32-byte k] stored as two 64-bit
//           words; it supplies the first four state words of every block.
//   Lone    the four 32-bit words {1,0,0,0}. It directly follows Lsigma, and
//           the NEON paths read it through the post-incremented Lsigma
//           pointer to step the block counter.
// The .byte run is the NUL-terminated ASCII text
//   ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>
.align 5
Lsigma:
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
Lone:
.long 1,0,0,0
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2

.text

// ---------------------------------------------------------------------------
// ChaCha20_ctr32
//
// XORs a ChaCha20 keystream into a buffer. Register inputs, as used below:
//   x0  output pointer (written with stp / strb)
//   x1  input pointer  (read with ldp / ldrb)
//   x2  length in bytes; zero returns immediately without touching the stack
//   x3  pointer to 32 bytes of key
//   x4  pointer to the 16-byte counter block
// NOTE(review): the C prototype is not visible here; presumably
// (out, in, in_len, key, counter) - confirm against the declaring header.
//
// Dispatch: when the length is at least 192 bytes and the ARMV7_NEON bit is
// set in OPENSSL_armcap_P, control is transferred with a plain branch (not a
// call) to ChaCha20_neon. Otherwise the scalar code at Lshort runs.
//
// Scalar path stack layout (x29 = frame base):
//   [x29,#0]          saved x29,x30
//   [x29,#16..#95]    saved x19..x28
//   [sp,#0..#63]      64 bytes of scratch below the frame; only the tail
//                     code writes it, and it is zeroed again before return
//
// Scalar path register roles:
//   x22,x23           sigma, two 32-bit words per register
//   x24..x27          key, two words per register
//   x28,x30           counter block, two words per register. x30 is used as
//                     a data register, so the return address is reloaded
//                     from the frame before ret.
//   w5..w17,w19..w21  the sixteen working state words (x18 is not used)
//   x4                round-loop counter once the counter block is loaded
//
// AARCH64_VALID_CALL_TARGET, AARCH64_SIGN_LINK_REGISTER and
// AARCH64_VALIDATE_LINK_REGISTER are macros defined outside this file
// (presumably in arm_arch.h - confirm there for their exact expansion).
// ---------------------------------------------------------------------------
.globl ChaCha20_ctr32

// COFF symbol-definition directives for ChaCha20_ctr32.
.def ChaCha20_ctr32
    .type 32
.endef
.align 5
ChaCha20_ctr32:
    AARCH64_VALID_CALL_TARGET
    cbz x2,Labort
    // The page address of OPENSSL_armcap_P is formed before the length check;
    // the HWASAN + clang >= 10 build selects the :pg_hi21_nc: relocation form.
#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
    adrp x5,:pg_hi21_nc:OPENSSL_armcap_P
#else
    adrp x5,OPENSSL_armcap_P
#endif
    cmp x2,#192
    b.lo Lshort
    ldr w17,[x5,:lo12:OPENSSL_armcap_P]
    tst w17,#ARMV7_NEON
    b.ne ChaCha20_neon

Lshort:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-96]!
    add x29,sp,#0

    adrp x5,Lsigma
    add x5,x5,:lo12:Lsigma
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp x25,x26,[sp,#64]
    stp x27,x28,[sp,#80]
    sub sp,sp,#64                   // 64-byte keystream scratch for the tail

    ldp x22,x23,[x5] // load sigma
    ldp x24,x25,[x3] // load key
    ldp x26,x27,[x3,#16]
    ldp x28,x30,[x4] // load counter
    // Big-endian builds swap the 32-bit halves of each loaded pair.
    // NOTE(review): the enclosing #if requires _WIN32, so this branch is
    // presumably never compiled in this file - confirm.
#ifdef __AARCH64EB__
    ror x24,x24,#32
    ror x25,x25,#32
    ror x26,x26,#32
    ror x27,x27,#32
    ror x28,x28,#32
    ror x30,x30,#32
#endif

// One pass of Loop_outer produces one 64-byte keystream block.
Loop_outer:
    mov w5,w22 // unpack key block
    lsr x6,x22,#32
    mov w7,w23
    lsr x8,x23,#32
    mov w9,w24
    lsr x10,x24,#32
    mov w11,w25
    lsr x12,x25,#32
    mov w13,w26
    lsr x14,x26,#32
    mov w15,w27
    lsr x16,x27,#32
    mov w17,w28
    lsr x19,x28,#32
    mov w20,w30
    lsr x21,x30,#32

    mov x4,#10                      // 10 iterations x 2 rounds = 20 rounds
    // The flags set here stay live until the b.lo Ltail / b.hi Loop_outer
    // below: nothing in between is a flag-setting instruction.
    subs x2,x2,#64
Loop:
    sub x4,x4,#1
    // Column round: quarter-rounds on (w5,w9,w13,w17) (w6,w10,w14,w19)
    // (w7,w11,w15,w20) (w8,w12,w16,w21), four abreast.
    add w5,w5,w9
    add w6,w6,w10
    add w7,w7,w11
    add w8,w8,w12
    eor w17,w17,w5
    eor w19,w19,w6
    eor w20,w20,w7
    eor w21,w21,w8
    ror w17,w17,#16                 // rotate left by 16
    ror w19,w19,#16
    ror w20,w20,#16
    ror w21,w21,#16
    add w13,w13,w17
    add w14,w14,w19
    add w15,w15,w20
    add w16,w16,w21
    eor w9,w9,w13
    eor w10,w10,w14
    eor w11,w11,w15
    eor w12,w12,w16
    ror w9,w9,#20                   // rotate left by 12
    ror w10,w10,#20
    ror w11,w11,#20
    ror w12,w12,#20
    add w5,w5,w9
    add w6,w6,w10
    add w7,w7,w11
    add w8,w8,w12
    eor w17,w17,w5
    eor w19,w19,w6
    eor w20,w20,w7
    eor w21,w21,w8
    ror w17,w17,#24                 // rotate left by 8
    ror w19,w19,#24
    ror w20,w20,#24
    ror w21,w21,#24
    add w13,w13,w17
    add w14,w14,w19
    add w15,w15,w20
    add w16,w16,w21
    eor w9,w9,w13
    eor w10,w10,w14
    eor w11,w11,w15
    eor w12,w12,w16
    ror w9,w9,#25                   // rotate left by 7
    ror w10,w10,#25
    ror w11,w11,#25
    ror w12,w12,#25
    // Diagonal round: quarter-rounds on (w5,w10,w15,w21) (w6,w11,w16,w17)
    // (w7,w12,w13,w19) (w8,w9,w14,w20), same rotation amounts as above.
    add w5,w5,w10
    add w6,w6,w11
    add w7,w7,w12
    add w8,w8,w9
    eor w21,w21,w5
    eor w17,w17,w6
    eor w19,w19,w7
    eor w20,w20,w8
    ror w21,w21,#16
    ror w17,w17,#16
    ror w19,w19,#16
    ror w20,w20,#16
    add w15,w15,w21
    add w16,w16,w17
    add w13,w13,w19
    add w14,w14,w20
    eor w10,w10,w15
    eor w11,w11,w16
    eor w12,w12,w13
    eor w9,w9,w14
    ror w10,w10,#20
    ror w11,w11,#20
    ror w12,w12,#20
    ror w9,w9,#20
    add w5,w5,w10
    add w6,w6,w11
    add w7,w7,w12
    add w8,w8,w9
    eor w21,w21,w5
    eor w17,w17,w6
    eor w19,w19,w7
    eor w20,w20,w8
    ror w21,w21,#24
    ror w17,w17,#24
    ror w19,w19,#24
    ror w20,w20,#24
    add w15,w15,w21
    add w16,w16,w17
    add w13,w13,w19
    add w14,w14,w20
    eor w10,w10,w15
    eor w11,w11,w16
    eor w12,w12,w13
    eor w9,w9,w14
    ror w10,w10,#25
    ror w11,w11,#25
    ror w12,w12,#25
    ror w9,w9,#25
    cbnz x4,Loop

    // Add the input state back in. The odd-numbered words use 64-bit adds of
    // the upper register half; a carry out of bit 31 lands in bit 32 and is
    // shifted out by the lsl#32 of the pack step.
    add w5,w5,w22 // accumulate key block
    add x6,x6,x22,lsr#32
    add w7,w7,w23
    add x8,x8,x23,lsr#32
    add w9,w9,w24
    add x10,x10,x24,lsr#32
    add w11,w11,w25
    add x12,x12,x25,lsr#32
    add w13,w13,w26
    add x14,x14,x26,lsr#32
    add w15,w15,w27
    add x16,x16,x27,lsr#32
    add w17,w17,w28
    add x19,x19,x28,lsr#32
    add w20,w20,w30
    add x21,x21,x30,lsr#32

    b.lo Ltail                      // fewer than 64 bytes were left (flags from subs)

    // Full block: pack word pairs into 64-bit registers while loading the
    // 64 input bytes into the registers that have just been consumed.
    add x5,x5,x6,lsl#32 // pack
    add x7,x7,x8,lsl#32
    ldp x6,x8,[x1,#0] // load input
    add x9,x9,x10,lsl#32
    add x11,x11,x12,lsl#32
    ldp x10,x12,[x1,#16]
    add x13,x13,x14,lsl#32
    add x15,x15,x16,lsl#32
    ldp x14,x16,[x1,#32]
    add x17,x17,x19,lsl#32
    add x20,x20,x21,lsl#32
    ldp x19,x21,[x1,#48]
    add x1,x1,#64
#ifdef __AARCH64EB__
    rev x5,x5
    rev x7,x7
    rev x9,x9
    rev x11,x11
    rev x13,x13
    rev x15,x15
    rev x17,x17
    rev x20,x20
#endif
    eor x5,x5,x6
    eor x7,x7,x8
    eor x9,x9,x10
    eor x11,x11,x12
    eor x13,x13,x14
    eor x15,x15,x16
    eor x17,x17,x19
    eor x20,x20,x21

    stp x5,x7,[x0,#0] // store output
    // 64-bit add on the packed pair: a wrap of the low 32-bit word would
    // carry into the upper word. NOTE(review): callers presumably never let
    // the 32-bit block counter wrap - confirm.
    add x28,x28,#1 // increment counter
    stp x9,x11,[x0,#16]
    stp x13,x15,[x0,#32]
    stp x17,x20,[x0,#48]
    add x0,x0,#64

    b.hi Loop_outer                 // bytes remain (flags still from subs)

    // Exactly 64 bytes were left: release the scratch area and the frame.
    ldp x19,x20,[x29,#16]
    add sp,sp,#64
    ldp x21,x22,[x29,#32]
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldp x29,x30,[sp],#96
    AARCH64_VALIDATE_LINK_REGISTER
// Zero-length calls arrive here straight from the entry check, before any
// frame was created.
Labort:
    ret

// Partial final block. Ltail restores x2 to the number of bytes left (1..63).
// Less_than_64 is also branched to from Ltail_neon in ChaCha20_neon, which
// uses the same frame layout and arrives with the scalar block unpacked in
// w5..w21 and the byte count in x2.
.align 4
Ltail:
    add x2,x2,#64
Less_than_64:
    // Bias the pointers to the end of the data and run x2 from -len up to 0.
    // x0 is biased by one less because x2 is incremented before the store.
    sub x0,x0,#1
    add x1,x1,x2
    add x0,x0,x2
    add x4,sp,x2                    // x4 = end of the keystream bytes in scratch
    neg x2,x2

    add x5,x5,x6,lsl#32 // pack
    add x7,x7,x8,lsl#32
    add x9,x9,x10,lsl#32
    add x11,x11,x12,lsl#32
    add x13,x13,x14,lsl#32
    add x15,x15,x16,lsl#32
    add x17,x17,x19,lsl#32
    add x20,x20,x21,lsl#32
#ifdef __AARCH64EB__
    rev x5,x5
    rev x7,x7
    rev x9,x9
    rev x11,x11
    rev x13,x13
    rev x15,x15
    rev x17,x17
    rev x20,x20
#endif
    // Spill the whole 64-byte keystream block to the stack scratch area.
    stp x5,x7,[sp,#0]
    stp x9,x11,[sp,#16]
    stp x13,x15,[sp,#32]
    stp x17,x20,[sp,#48]

// Byte-at-a-time XOR of input with the spilled keystream.
Loop_tail:
    ldrb w10,[x1,x2]
    ldrb w11,[x4,x2]
    add x2,x2,#1
    eor w10,w10,w11
    strb w10,[x0,x2]
    cbnz x2,Loop_tail

    // Overwrite the spilled keystream with zeros before leaving.
    stp xzr,xzr,[sp,#0]
    stp xzr,xzr,[sp,#16]
    stp xzr,xzr,[sp,#32]
    stp xzr,xzr,[sp,#48]

    ldp x19,x20,[x29,#16]
    add sp,sp,#64
    ldp x21,x22,[x29,#32]
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldp x29,x30,[sp],#96
    AARCH64_VALIDATE_LINK_REGISTER
    ret


// ---------------------------------------------------------------------------
// ChaCha20_neon
//
// File-local (no .globl). Reached by the b.ne in ChaCha20_ctr32 with x0..x4
// unchanged from the original call. It creates the same 96-byte frame as the
// scalar path.
// ---------------------------------------------------------------------------
.def ChaCha20_neon
    .type 32
.endef
.align 5
ChaCha20_neon:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-96]!
// ---------------------------------------------------------------------------
// ChaCha20_neon, continued. The link-register macro and the stp that
// allocates the 96-byte frame come immediately before this point.
//
// Register inputs (unchanged from the ChaCha20_ctr32 entry):
//   x0 output pointer, x1 input pointer, x2 length in bytes,
//   x3 pointer to 32 bytes of key, x4 pointer to the 16-byte counter block.
//
// Lengths of 512 bytes or more branch to L512_or_more_neon. Shorter lengths
// are handled here, 256 bytes per outer iteration: one block in general
// registers plus three blocks in vector registers.
//
// Register roles on this path:
//   x22,x23 / v24     sigma
//   x24..x27 / v25,v26  key
//   x28,x30           counter block for the scalar block (counter + 0)
//   v27,v28,v29       counter block with lane 0 advanced by 1, 2, 3
//   v31               {1,0,0,0} from Lone, turned into {4,0,0,0} by the shl
//   w5..w17,w19..w21  scalar working state
//   v0-v3, v4-v7, v16-v19   working state of the three vector blocks
//   v20-v23           temporaries (rotations, then input data)
//   x4                round-loop counter once the counter block is loaded
// v8-v15 are not touched on this path.
// ---------------------------------------------------------------------------
    add x29,sp,#0

    adrp x5,Lsigma
    add x5,x5,:lo12:Lsigma
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp x25,x26,[sp,#64]
    stp x27,x28,[sp,#80]
    cmp x2,#512
    b.hs L512_or_more_neon

    sub sp,sp,#64                   // 64-byte keystream scratch for the tail

    ldp x22,x23,[x5] // load sigma
    ld1 {v24.4s},[x5],#16           // post-increment leaves x5 at Lone
    ldp x24,x25,[x3] // load key
    ldp x26,x27,[x3,#16]
    ld1 {v25.4s,v26.4s},[x3]
    ldp x28,x30,[x4] // load counter
    ld1 {v27.4s},[x4]
    ld1 {v31.4s},[x5]               // {1,0,0,0}
    // Big-endian fix-ups. NOTE(review): the enclosing #if requires _WIN32, so
    // this branch is presumably never compiled in this file - confirm.
#ifdef __AARCH64EB__
    rev64 v24.4s,v24.4s
    ror x24,x24,#32
    ror x25,x25,#32
    ror x26,x26,#32
    ror x27,x27,#32
    ror x28,x28,#32
    ror x30,x30,#32
#endif
    add v27.4s,v27.4s,v31.4s // += 1
    add v28.4s,v27.4s,v31.4s        // counter + 2
    add v29.4s,v28.4s,v31.4s        // counter + 3
    shl v31.4s,v31.4s,#2 // 1 -> 4

// One pass of Loop_outer_neon produces four 64-byte keystream blocks.
Loop_outer_neon:
    mov w5,w22 // unpack key block
    lsr x6,x22,#32
    mov v0.16b,v24.16b
    mov w7,w23
    lsr x8,x23,#32
    mov v4.16b,v24.16b
    mov w9,w24
    lsr x10,x24,#32
    mov v16.16b,v24.16b
    mov w11,w25
    mov v1.16b,v25.16b
    lsr x12,x25,#32
    mov v5.16b,v25.16b
    mov w13,w26
    mov v17.16b,v25.16b
    lsr x14,x26,#32
    mov v3.16b,v27.16b
    mov w15,w27
    mov v7.16b,v28.16b
    lsr x16,x27,#32
    mov v19.16b,v29.16b
    mov w17,w28
    mov v2.16b,v26.16b
    lsr x19,x28,#32
    mov v6.16b,v26.16b
    mov w20,w30
    mov v18.16b,v26.16b
    lsr x21,x30,#32

    mov x4,#10                      // 10 iterations x 2 rounds = 20 rounds
    // The flags set here stay live until b.lo Ltail_neon and
    // b.hi Loop_outer_neon: nothing in between is a flag-setting instruction.
    subs x2,x2,#256
Loop_neon:
    sub x4,x4,#1
    // Column round, scalar and vector instructions interleaved.
    // Scalar rotations use ror by 16/20/24/25, i.e. left by 16/12/8/7.
    // Vector rotations: rev32 on halfwords is a rotate by 16; each
    // ushr #(32-n) + sli #n pair is a rotate left by n (12, 8, 7).
    add v0.4s,v0.4s,v1.4s
    add w5,w5,w9
    add v4.4s,v4.4s,v5.4s
    add w6,w6,w10
    add v16.4s,v16.4s,v17.4s
    add w7,w7,w11
    eor v3.16b,v3.16b,v0.16b
    add w8,w8,w12
    eor v7.16b,v7.16b,v4.16b
    eor w17,w17,w5
    eor v19.16b,v19.16b,v16.16b
    eor w19,w19,w6
    rev32 v3.8h,v3.8h
    eor w20,w20,w7
    rev32 v7.8h,v7.8h
    eor w21,w21,w8
    rev32 v19.8h,v19.8h
    ror w17,w17,#16
    add v2.4s,v2.4s,v3.4s
    ror w19,w19,#16
    add v6.4s,v6.4s,v7.4s
    ror w20,w20,#16
    add v18.4s,v18.4s,v19.4s
    ror w21,w21,#16
    eor v20.16b,v1.16b,v2.16b
    add w13,w13,w17
    eor v21.16b,v5.16b,v6.16b
    add w14,w14,w19
    eor v22.16b,v17.16b,v18.16b
    add w15,w15,w20
    ushr v1.4s,v20.4s,#20
    add w16,w16,w21
    ushr v5.4s,v21.4s,#20
    eor w9,w9,w13
    ushr v17.4s,v22.4s,#20
    eor w10,w10,w14
    sli v1.4s,v20.4s,#12
    eor w11,w11,w15
    sli v5.4s,v21.4s,#12
    eor w12,w12,w16
    sli v17.4s,v22.4s,#12
    ror w9,w9,#20
    add v0.4s,v0.4s,v1.4s
    ror w10,w10,#20
    add v4.4s,v4.4s,v5.4s
    ror w11,w11,#20
    add v16.4s,v16.4s,v17.4s
    ror w12,w12,#20
    eor v20.16b,v3.16b,v0.16b
    add w5,w5,w9
    eor v21.16b,v7.16b,v4.16b
    add w6,w6,w10
    eor v22.16b,v19.16b,v16.16b
    add w7,w7,w11
    ushr v3.4s,v20.4s,#24
    add w8,w8,w12
    ushr v7.4s,v21.4s,#24
    eor w17,w17,w5
    ushr v19.4s,v22.4s,#24
    eor w19,w19,w6
    sli v3.4s,v20.4s,#8
    eor w20,w20,w7
    sli v7.4s,v21.4s,#8
    eor w21,w21,w8
    sli v19.4s,v22.4s,#8
    ror w17,w17,#24
    add v2.4s,v2.4s,v3.4s
    ror w19,w19,#24
    add v6.4s,v6.4s,v7.4s
    ror w20,w20,#24
    add v18.4s,v18.4s,v19.4s
    ror w21,w21,#24
    eor v20.16b,v1.16b,v2.16b
    add w13,w13,w17
    eor v21.16b,v5.16b,v6.16b
    add w14,w14,w19
    eor v22.16b,v17.16b,v18.16b
    add w15,w15,w20
    ushr v1.4s,v20.4s,#25
    add w16,w16,w21
    ushr v5.4s,v21.4s,#25
    eor w9,w9,w13
    ushr v17.4s,v22.4s,#25
    eor w10,w10,w14
    sli v1.4s,v20.4s,#7
    eor w11,w11,w15
    sli v5.4s,v21.4s,#7
    eor w12,w12,w16
    sli v17.4s,v22.4s,#7
    ror w9,w9,#25
    // Rotate the lanes of vector rows 1, 2 and 3 by one, two and three
    // positions so the next vector quarter-rounds act on the diagonals.
    ext v2.16b,v2.16b,v2.16b,#8
    ror w10,w10,#25
    ext v6.16b,v6.16b,v6.16b,#8
    ror w11,w11,#25
    ext v18.16b,v18.16b,v18.16b,#8
    ror w12,w12,#25
    ext v3.16b,v3.16b,v3.16b,#12
    ext v7.16b,v7.16b,v7.16b,#12
    ext v19.16b,v19.16b,v19.16b,#12
    ext v1.16b,v1.16b,v1.16b,#4
    ext v5.16b,v5.16b,v5.16b,#4
    ext v17.16b,v17.16b,v17.16b,#4
    // Diagonal round. The scalar side selects the diagonals by register
    // choice: (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19)
    // (w8,w9,w14,w20).
    add v0.4s,v0.4s,v1.4s
    add w5,w5,w10
    add v4.4s,v4.4s,v5.4s
    add w6,w6,w11
    add v16.4s,v16.4s,v17.4s
    add w7,w7,w12
    eor v3.16b,v3.16b,v0.16b
    add w8,w8,w9
    eor v7.16b,v7.16b,v4.16b
    eor w21,w21,w5
    eor v19.16b,v19.16b,v16.16b
    eor w17,w17,w6
    rev32 v3.8h,v3.8h
    eor w19,w19,w7
    rev32 v7.8h,v7.8h
    eor w20,w20,w8
    rev32 v19.8h,v19.8h
    ror w21,w21,#16
    add v2.4s,v2.4s,v3.4s
    ror w17,w17,#16
    add v6.4s,v6.4s,v7.4s
    ror w19,w19,#16
    add v18.4s,v18.4s,v19.4s
    ror w20,w20,#16
    eor v20.16b,v1.16b,v2.16b
    add w15,w15,w21
    eor v21.16b,v5.16b,v6.16b
    add w16,w16,w17
    eor v22.16b,v17.16b,v18.16b
    add w13,w13,w19
    ushr v1.4s,v20.4s,#20
    add w14,w14,w20
    ushr v5.4s,v21.4s,#20
    eor w10,w10,w15
    ushr v17.4s,v22.4s,#20
    eor w11,w11,w16
    sli v1.4s,v20.4s,#12
    eor w12,w12,w13
    sli v5.4s,v21.4s,#12
    eor w9,w9,w14
    sli v17.4s,v22.4s,#12
    ror w10,w10,#20
    add v0.4s,v0.4s,v1.4s
    ror w11,w11,#20
    add v4.4s,v4.4s,v5.4s
    ror w12,w12,#20
    add v16.4s,v16.4s,v17.4s
    ror w9,w9,#20
    eor v20.16b,v3.16b,v0.16b
    add w5,w5,w10
    eor v21.16b,v7.16b,v4.16b
    add w6,w6,w11
    eor v22.16b,v19.16b,v16.16b
    add w7,w7,w12
    ushr v3.4s,v20.4s,#24
    add w8,w8,w9
    ushr v7.4s,v21.4s,#24
    eor w21,w21,w5
    ushr v19.4s,v22.4s,#24
    eor w17,w17,w6
    sli v3.4s,v20.4s,#8
    eor w19,w19,w7
    sli v7.4s,v21.4s,#8
    eor w20,w20,w8
    sli v19.4s,v22.4s,#8
    ror w21,w21,#24
    add v2.4s,v2.4s,v3.4s
    ror w17,w17,#24
    add v6.4s,v6.4s,v7.4s
    ror w19,w19,#24
    add v18.4s,v18.4s,v19.4s
    ror w20,w20,#24
    eor v20.16b,v1.16b,v2.16b
    add w15,w15,w21
    eor v21.16b,v5.16b,v6.16b
    add w16,w16,w17
    eor v22.16b,v17.16b,v18.16b
    add w13,w13,w19
    ushr v1.4s,v20.4s,#25
    add w14,w14,w20
    ushr v5.4s,v21.4s,#25
    eor w10,w10,w15
    ushr v17.4s,v22.4s,#25
    eor w11,w11,w16
    sli v1.4s,v20.4s,#7
    eor w12,w12,w13
    sli v5.4s,v21.4s,#7
    eor w9,w9,w14
    sli v17.4s,v22.4s,#7
    ror w10,w10,#25
    // Undo the lane rotation (rows 1 and 3 swap their shift amounts) so the
    // vector state is back in column order for the next iteration.
    ext v2.16b,v2.16b,v2.16b,#8
    ror w11,w11,#25
    ext v6.16b,v6.16b,v6.16b,#8
    ror w12,w12,#25
    ext v18.16b,v18.16b,v18.16b,#8
    ror w9,w9,#25
    ext v3.16b,v3.16b,v3.16b,#4
    ext v7.16b,v7.16b,v7.16b,#4
    ext v19.16b,v19.16b,v19.16b,#4
    ext v1.16b,v1.16b,v1.16b,#12
    ext v5.16b,v5.16b,v5.16b,#12
    ext v17.16b,v17.16b,v17.16b,#12
    cbnz x4,Loop_neon

    // Add the input state back into all four blocks. Each vector block adds
    // its own counter row (v27, v28, v29). On the scalar side, a carry out of
    // bit 31 in the 64-bit adds is shifted out by the lsl#32 of the pack step.
    add w5,w5,w22 // accumulate key block
    add v0.4s,v0.4s,v24.4s
    add x6,x6,x22,lsr#32
    add v4.4s,v4.4s,v24.4s
    add w7,w7,w23
    add v16.4s,v16.4s,v24.4s
    add x8,x8,x23,lsr#32
    add v2.4s,v2.4s,v26.4s
    add w9,w9,w24
    add v6.4s,v6.4s,v26.4s
    add x10,x10,x24,lsr#32
    add v18.4s,v18.4s,v26.4s
    add w11,w11,w25
    add v3.4s,v3.4s,v27.4s
    add x12,x12,x25,lsr#32
    add w13,w13,w26
    add v7.4s,v7.4s,v28.4s
    add x14,x14,x26,lsr#32
    add w15,w15,w27
    add v19.4s,v19.4s,v29.4s
    add x16,x16,x27,lsr#32
    add w17,w17,w28
    add v1.4s,v1.4s,v25.4s
    add x19,x19,x28,lsr#32
    add w20,w20,w30
    add v5.4s,v5.4s,v25.4s
    add x21,x21,x30,lsr#32
    add v17.4s,v17.4s,v25.4s

    b.lo Ltail_neon                 // fewer than 256 bytes were left (flags from subs)

    // Full 256 bytes. Output order: scalar block (counter + 0), then v0-v3,
    // v4-v7, v16-v19 (counter + 1, + 2, + 3).
    add x5,x5,x6,lsl#32 // pack
    add x7,x7,x8,lsl#32
    ldp x6,x8,[x1,#0] // load input
    add x9,x9,x10,lsl#32
    add x11,x11,x12,lsl#32
    ldp x10,x12,[x1,#16]
    add x13,x13,x14,lsl#32
    add x15,x15,x16,lsl#32
    ldp x14,x16,[x1,#32]
    add x17,x17,x19,lsl#32
    add x20,x20,x21,lsl#32
    ldp x19,x21,[x1,#48]
    add x1,x1,#64
#ifdef __AARCH64EB__
    rev x5,x5
    rev x7,x7
    rev x9,x9
    rev x11,x11
    rev x13,x13
    rev x15,x15
    rev x17,x17
    rev x20,x20
#endif
    ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
    eor x5,x5,x6
    eor x7,x7,x8
    eor x9,x9,x10
    eor x11,x11,x12
    eor x13,x13,x14
    eor v0.16b,v0.16b,v20.16b
    eor x15,x15,x16
    eor v1.16b,v1.16b,v21.16b
    eor x17,x17,x19
    eor v2.16b,v2.16b,v22.16b
    eor x20,x20,x21
    eor v3.16b,v3.16b,v23.16b
    ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

    stp x5,x7,[x0,#0] // store output
    add x28,x28,#4 // increment counter
    stp x9,x11,[x0,#16]
    add v27.4s,v27.4s,v31.4s // += 4
    stp x13,x15,[x0,#32]
    add v28.4s,v28.4s,v31.4s
    stp x17,x20,[x0,#48]
    add v29.4s,v29.4s,v31.4s
    add x0,x0,#64

    // v0-v3 are stored first and then reused as the input buffer for the
    // fourth block.
    st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
    ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64

    eor v4.16b,v4.16b,v20.16b
    eor v5.16b,v5.16b,v21.16b
    eor v6.16b,v6.16b,v22.16b
    eor v7.16b,v7.16b,v23.16b
    st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

    eor v16.16b,v16.16b,v0.16b
    eor v17.16b,v17.16b,v1.16b
    eor v18.16b,v18.16b,v2.16b
    eor v19.16b,v19.16b,v3.16b
    st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

    b.hi Loop_outer_neon            // bytes remain (flags still from subs)

    // Exactly 256 bytes were left: release the scratch area and the frame.
    ldp x19,x20,[x29,#16]
    add sp,sp,#64
    ldp x21,x22,[x29,#32]
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldp x29,x30,[sp],#96
    AARCH64_VALIDATE_LINK_REGISTER
    ret

// Fewer than 256 bytes remain. x2 is restored to the byte count (1..255) and
// the four prepared keystream blocks are consumed in order until the data
// runs out. Under 64 bytes, the scalar block alone is enough and the shared
// Less_than_64 tail in ChaCha20_ctr32 finishes the job with the same frame.
Ltail_neon:
    add x2,x2,#256
    cmp x2,#64
    b.lo Less_than_64

    add x5,x5,x6,lsl#32 // pack
    add x7,x7,x8,lsl#32
    ldp x6,x8,[x1,#0] // load input
    add x9,x9,x10,lsl#32
    add x11,x11,x12,lsl#32
    ldp x10,x12,[x1,#16]
    add x13,x13,x14,lsl#32
    add x15,x15,x16,lsl#32
    ldp x14,x16,[x1,#32]
    add x17,x17,x19,lsl#32
    add x20,x20,x21,lsl#32
    ldp x19,x21,[x1,#48]
    add x1,x1,#64
#ifdef __AARCH64EB__
    rev x5,x5
    rev x7,x7
    rev x9,x9
    rev x11,x11
    rev x13,x13
    rev x15,x15
    rev x17,x17
    rev x20,x20
#endif
    eor x5,x5,x6
    eor x7,x7,x8
    eor x9,x9,x10
    eor x11,x11,x12
    eor x13,x13,x14
    eor x15,x15,x16
    eor x17,x17,x19
    eor x20,x20,x21

    stp x5,x7,[x0,#0] // store output
    add x28,x28,#4 // increment counter
    stp x9,x11,[x0,#16]
    stp x13,x15,[x0,#32]
    stp x17,x20,[x0,#48]
    add x0,x0,#64
    // Each b.eq below tests the flags of the most recent cmp x2,#64: equal
    // means the block just written was the last one.
    b.eq Ldone_neon
    sub x2,x2,#64
    cmp x2,#64
    b.lo Less_than_128

    ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
    eor v0.16b,v0.16b,v20.16b
    eor v1.16b,v1.16b,v21.16b
    eor v2.16b,v2.16b,v22.16b
    eor v3.16b,v3.16b,v23.16b
    st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
    b.eq Ldone_neon
    sub x2,x2,#64
    cmp x2,#64
    b.lo Less_than_192

    ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
    eor v4.16b,v4.16b,v20.16b
    eor v5.16b,v5.16b,v21.16b
    eor v6.16b,v6.16b,v22.16b
    eor v7.16b,v7.16b,v23.16b
    st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
    b.eq Ldone_neon
    sub x2,x2,#64

    // In all three cases the keystream block covering the partial remainder
    // is spilled to the 64-byte stack scratch area for the byte loop.
    st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
    b Last_neon

Less_than_128:
    st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
    b Last_neon
Less_than_192:
    st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
    b Last_neon

.align 4
Last_neon:
    // Bias the pointers to the end of the data and run x2 from -len up to 0.
    // x0 is biased by one less because x2 is incremented before the store.
    sub x0,x0,#1
    add x1,x1,x2
    add x0,x0,x2
    add x4,sp,x2
    neg x2,x2

Loop_tail_neon:
    ldrb w10,[x1,x2]
    ldrb w11,[x4,x2]
    add x2,x2,#1
    eor w10,w10,w11
    strb w10,[x0,x2]
    cbnz x2,Loop_tail_neon

    // Overwrite the spilled keystream with zeros before leaving.
    stp xzr,xzr,[sp,#0]
    stp xzr,xzr,[sp,#16]
    stp xzr,xzr,[sp,#32]
    stp xzr,xzr,[sp,#48]

// The b.eq exits above land here directly; on those paths nothing was
// spilled to the scratch area, so the wipe is skipped.
Ldone_neon:
    ldp x19,x20,[x29,#16]
    add sp,sp,#64
    ldp x21,x22,[x29,#32]
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldp x29,x30,[sp],#96
    AARCH64_VALIDATE_LINK_REGISTER
    ret

// ChaCha20_512_neon: file-local (no .globl). Only its first two instructions
// fall inside this span; the rest of the routine follows.
.def ChaCha20_512_neon
    .type 32
.endef
.align 5
ChaCha20_512_neon:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-96]!
825 add x29,sp,#0 826 827 adrp x5,Lsigma 828 add x5,x5,:lo12:Lsigma 829 stp x19,x20,[sp,#16] 830 stp x21,x22,[sp,#32] 831 stp x23,x24,[sp,#48] 832 stp x25,x26,[sp,#64] 833 stp x27,x28,[sp,#80] 834 835L512_or_more_neon: 836 sub sp,sp,#128+64 837 838 ldp x22,x23,[x5] // load sigma 839 ld1 {v24.4s},[x5],#16 840 ldp x24,x25,[x3] // load key 841 ldp x26,x27,[x3,#16] 842 ld1 {v25.4s,v26.4s},[x3] 843 ldp x28,x30,[x4] // load counter 844 ld1 {v27.4s},[x4] 845 ld1 {v31.4s},[x5] 846#ifdef __AARCH64EB__ 847 rev64 v24.4s,v24.4s 848 ror x24,x24,#32 849 ror x25,x25,#32 850 ror x26,x26,#32 851 ror x27,x27,#32 852 ror x28,x28,#32 853 ror x30,x30,#32 854#endif 855 add v27.4s,v27.4s,v31.4s // += 1 856 stp q24,q25,[sp,#0] // off-load key block, invariant part 857 add v27.4s,v27.4s,v31.4s // not typo 858 str q26,[sp,#32] 859 add v28.4s,v27.4s,v31.4s 860 add v29.4s,v28.4s,v31.4s 861 add v30.4s,v29.4s,v31.4s 862 shl v31.4s,v31.4s,#2 // 1 -> 4 863 864 stp d8,d9,[sp,#128+0] // meet ABI requirements 865 stp d10,d11,[sp,#128+16] 866 stp d12,d13,[sp,#128+32] 867 stp d14,d15,[sp,#128+48] 868 869 sub x2,x2,#512 // not typo 870 871Loop_outer_512_neon: 872 mov v0.16b,v24.16b 873 mov v4.16b,v24.16b 874 mov v8.16b,v24.16b 875 mov v12.16b,v24.16b 876 mov v16.16b,v24.16b 877 mov v20.16b,v24.16b 878 mov v1.16b,v25.16b 879 mov w5,w22 // unpack key block 880 mov v5.16b,v25.16b 881 lsr x6,x22,#32 882 mov v9.16b,v25.16b 883 mov w7,w23 884 mov v13.16b,v25.16b 885 lsr x8,x23,#32 886 mov v17.16b,v25.16b 887 mov w9,w24 888 mov v21.16b,v25.16b 889 lsr x10,x24,#32 890 mov v3.16b,v27.16b 891 mov w11,w25 892 mov v7.16b,v28.16b 893 lsr x12,x25,#32 894 mov v11.16b,v29.16b 895 mov w13,w26 896 mov v15.16b,v30.16b 897 lsr x14,x26,#32 898 mov v2.16b,v26.16b 899 mov w15,w27 900 mov v6.16b,v26.16b 901 lsr x16,x27,#32 902 add v19.4s,v3.4s,v31.4s // +4 903 mov w17,w28 904 add v23.4s,v7.4s,v31.4s // +4 905 lsr x19,x28,#32 906 mov v10.16b,v26.16b 907 mov w20,w30 908 mov v14.16b,v26.16b 909 lsr x21,x30,#32 910 mov 
v18.16b,v26.16b 911 stp q27,q28,[sp,#48] // off-load key block, variable part 912 mov v22.16b,v26.16b 913 str q29,[sp,#80] 914 915 mov x4,#5 916 subs x2,x2,#512 917Loop_upper_neon: 918 sub x4,x4,#1 919 add v0.4s,v0.4s,v1.4s 920 add w5,w5,w9 921 add v4.4s,v4.4s,v5.4s 922 add w6,w6,w10 923 add v8.4s,v8.4s,v9.4s 924 add w7,w7,w11 925 add v12.4s,v12.4s,v13.4s 926 add w8,w8,w12 927 add v16.4s,v16.4s,v17.4s 928 eor w17,w17,w5 929 add v20.4s,v20.4s,v21.4s 930 eor w19,w19,w6 931 eor v3.16b,v3.16b,v0.16b 932 eor w20,w20,w7 933 eor v7.16b,v7.16b,v4.16b 934 eor w21,w21,w8 935 eor v11.16b,v11.16b,v8.16b 936 ror w17,w17,#16 937 eor v15.16b,v15.16b,v12.16b 938 ror w19,w19,#16 939 eor v19.16b,v19.16b,v16.16b 940 ror w20,w20,#16 941 eor v23.16b,v23.16b,v20.16b 942 ror w21,w21,#16 943 rev32 v3.8h,v3.8h 944 add w13,w13,w17 945 rev32 v7.8h,v7.8h 946 add w14,w14,w19 947 rev32 v11.8h,v11.8h 948 add w15,w15,w20 949 rev32 v15.8h,v15.8h 950 add w16,w16,w21 951 rev32 v19.8h,v19.8h 952 eor w9,w9,w13 953 rev32 v23.8h,v23.8h 954 eor w10,w10,w14 955 add v2.4s,v2.4s,v3.4s 956 eor w11,w11,w15 957 add v6.4s,v6.4s,v7.4s 958 eor w12,w12,w16 959 add v10.4s,v10.4s,v11.4s 960 ror w9,w9,#20 961 add v14.4s,v14.4s,v15.4s 962 ror w10,w10,#20 963 add v18.4s,v18.4s,v19.4s 964 ror w11,w11,#20 965 add v22.4s,v22.4s,v23.4s 966 ror w12,w12,#20 967 eor v24.16b,v1.16b,v2.16b 968 add w5,w5,w9 969 eor v25.16b,v5.16b,v6.16b 970 add w6,w6,w10 971 eor v26.16b,v9.16b,v10.16b 972 add w7,w7,w11 973 eor v27.16b,v13.16b,v14.16b 974 add w8,w8,w12 975 eor v28.16b,v17.16b,v18.16b 976 eor w17,w17,w5 977 eor v29.16b,v21.16b,v22.16b 978 eor w19,w19,w6 979 ushr v1.4s,v24.4s,#20 980 eor w20,w20,w7 981 ushr v5.4s,v25.4s,#20 982 eor w21,w21,w8 983 ushr v9.4s,v26.4s,#20 984 ror w17,w17,#24 985 ushr v13.4s,v27.4s,#20 986 ror w19,w19,#24 987 ushr v17.4s,v28.4s,#20 988 ror w20,w20,#24 989 ushr v21.4s,v29.4s,#20 990 ror w21,w21,#24 991 sli v1.4s,v24.4s,#12 992 add w13,w13,w17 993 sli v5.4s,v25.4s,#12 994 add w14,w14,w19 995 sli 
v9.4s,v26.4s,#12 996 add w15,w15,w20 997 sli v13.4s,v27.4s,#12 998 add w16,w16,w21 999 sli v17.4s,v28.4s,#12 1000 eor w9,w9,w13 1001 sli v21.4s,v29.4s,#12 1002 eor w10,w10,w14 1003 add v0.4s,v0.4s,v1.4s 1004 eor w11,w11,w15 1005 add v4.4s,v4.4s,v5.4s 1006 eor w12,w12,w16 1007 add v8.4s,v8.4s,v9.4s 1008 ror w9,w9,#25 1009 add v12.4s,v12.4s,v13.4s 1010 ror w10,w10,#25 1011 add v16.4s,v16.4s,v17.4s 1012 ror w11,w11,#25 1013 add v20.4s,v20.4s,v21.4s 1014 ror w12,w12,#25 1015 eor v24.16b,v3.16b,v0.16b 1016 add w5,w5,w10 1017 eor v25.16b,v7.16b,v4.16b 1018 add w6,w6,w11 1019 eor v26.16b,v11.16b,v8.16b 1020 add w7,w7,w12 1021 eor v27.16b,v15.16b,v12.16b 1022 add w8,w8,w9 1023 eor v28.16b,v19.16b,v16.16b 1024 eor w21,w21,w5 1025 eor v29.16b,v23.16b,v20.16b 1026 eor w17,w17,w6 1027 ushr v3.4s,v24.4s,#24 1028 eor w19,w19,w7 1029 ushr v7.4s,v25.4s,#24 1030 eor w20,w20,w8 1031 ushr v11.4s,v26.4s,#24 1032 ror w21,w21,#16 1033 ushr v15.4s,v27.4s,#24 1034 ror w17,w17,#16 1035 ushr v19.4s,v28.4s,#24 1036 ror w19,w19,#16 1037 ushr v23.4s,v29.4s,#24 1038 ror w20,w20,#16 1039 sli v3.4s,v24.4s,#8 1040 add w15,w15,w21 1041 sli v7.4s,v25.4s,#8 1042 add w16,w16,w17 1043 sli v11.4s,v26.4s,#8 1044 add w13,w13,w19 1045 sli v15.4s,v27.4s,#8 1046 add w14,w14,w20 1047 sli v19.4s,v28.4s,#8 1048 eor w10,w10,w15 1049 sli v23.4s,v29.4s,#8 1050 eor w11,w11,w16 1051 add v2.4s,v2.4s,v3.4s 1052 eor w12,w12,w13 1053 add v6.4s,v6.4s,v7.4s 1054 eor w9,w9,w14 1055 add v10.4s,v10.4s,v11.4s 1056 ror w10,w10,#20 1057 add v14.4s,v14.4s,v15.4s 1058 ror w11,w11,#20 1059 add v18.4s,v18.4s,v19.4s 1060 ror w12,w12,#20 1061 add v22.4s,v22.4s,v23.4s 1062 ror w9,w9,#20 1063 eor v24.16b,v1.16b,v2.16b 1064 add w5,w5,w10 1065 eor v25.16b,v5.16b,v6.16b 1066 add w6,w6,w11 1067 eor v26.16b,v9.16b,v10.16b 1068 add w7,w7,w12 1069 eor v27.16b,v13.16b,v14.16b 1070 add w8,w8,w9 1071 eor v28.16b,v17.16b,v18.16b 1072 eor w21,w21,w5 1073 eor v29.16b,v21.16b,v22.16b 1074 eor w17,w17,w6 1075 ushr v1.4s,v24.4s,#25 1076 eor w19,w19,w7 
1077 ushr v5.4s,v25.4s,#25 1078 eor w20,w20,w8 1079 ushr v9.4s,v26.4s,#25 1080 ror w21,w21,#24 1081 ushr v13.4s,v27.4s,#25 1082 ror w17,w17,#24 1083 ushr v17.4s,v28.4s,#25 1084 ror w19,w19,#24 1085 ushr v21.4s,v29.4s,#25 1086 ror w20,w20,#24 1087 sli v1.4s,v24.4s,#7 1088 add w15,w15,w21 1089 sli v5.4s,v25.4s,#7 1090 add w16,w16,w17 1091 sli v9.4s,v26.4s,#7 1092 add w13,w13,w19 1093 sli v13.4s,v27.4s,#7 1094 add w14,w14,w20 1095 sli v17.4s,v28.4s,#7 1096 eor w10,w10,w15 1097 sli v21.4s,v29.4s,#7 1098 eor w11,w11,w16 1099 ext v2.16b,v2.16b,v2.16b,#8 1100 eor w12,w12,w13 1101 ext v6.16b,v6.16b,v6.16b,#8 1102 eor w9,w9,w14 1103 ext v10.16b,v10.16b,v10.16b,#8 1104 ror w10,w10,#25 1105 ext v14.16b,v14.16b,v14.16b,#8 1106 ror w11,w11,#25 1107 ext v18.16b,v18.16b,v18.16b,#8 1108 ror w12,w12,#25 1109 ext v22.16b,v22.16b,v22.16b,#8 1110 ror w9,w9,#25 1111 ext v3.16b,v3.16b,v3.16b,#12 1112 ext v7.16b,v7.16b,v7.16b,#12 1113 ext v11.16b,v11.16b,v11.16b,#12 1114 ext v15.16b,v15.16b,v15.16b,#12 1115 ext v19.16b,v19.16b,v19.16b,#12 1116 ext v23.16b,v23.16b,v23.16b,#12 1117 ext v1.16b,v1.16b,v1.16b,#4 1118 ext v5.16b,v5.16b,v5.16b,#4 1119 ext v9.16b,v9.16b,v9.16b,#4 1120 ext v13.16b,v13.16b,v13.16b,#4 1121 ext v17.16b,v17.16b,v17.16b,#4 1122 ext v21.16b,v21.16b,v21.16b,#4 1123 add v0.4s,v0.4s,v1.4s 1124 add w5,w5,w9 1125 add v4.4s,v4.4s,v5.4s 1126 add w6,w6,w10 1127 add v8.4s,v8.4s,v9.4s 1128 add w7,w7,w11 1129 add v12.4s,v12.4s,v13.4s 1130 add w8,w8,w12 1131 add v16.4s,v16.4s,v17.4s 1132 eor w17,w17,w5 1133 add v20.4s,v20.4s,v21.4s 1134 eor w19,w19,w6 1135 eor v3.16b,v3.16b,v0.16b 1136 eor w20,w20,w7 1137 eor v7.16b,v7.16b,v4.16b 1138 eor w21,w21,w8 1139 eor v11.16b,v11.16b,v8.16b 1140 ror w17,w17,#16 1141 eor v15.16b,v15.16b,v12.16b 1142 ror w19,w19,#16 1143 eor v19.16b,v19.16b,v16.16b 1144 ror w20,w20,#16 1145 eor v23.16b,v23.16b,v20.16b 1146 ror w21,w21,#16 1147 rev32 v3.8h,v3.8h 1148 add w13,w13,w17 1149 rev32 v7.8h,v7.8h 1150 add w14,w14,w19 1151 rev32 v11.8h,v11.8h 1152 add 
w15,w15,w20 1153 rev32 v15.8h,v15.8h 1154 add w16,w16,w21 1155 rev32 v19.8h,v19.8h 1156 eor w9,w9,w13 1157 rev32 v23.8h,v23.8h 1158 eor w10,w10,w14 1159 add v2.4s,v2.4s,v3.4s 1160 eor w11,w11,w15 1161 add v6.4s,v6.4s,v7.4s 1162 eor w12,w12,w16 1163 add v10.4s,v10.4s,v11.4s 1164 ror w9,w9,#20 1165 add v14.4s,v14.4s,v15.4s 1166 ror w10,w10,#20 1167 add v18.4s,v18.4s,v19.4s 1168 ror w11,w11,#20 1169 add v22.4s,v22.4s,v23.4s 1170 ror w12,w12,#20 1171 eor v24.16b,v1.16b,v2.16b 1172 add w5,w5,w9 1173 eor v25.16b,v5.16b,v6.16b 1174 add w6,w6,w10 1175 eor v26.16b,v9.16b,v10.16b 1176 add w7,w7,w11 1177 eor v27.16b,v13.16b,v14.16b 1178 add w8,w8,w12 1179 eor v28.16b,v17.16b,v18.16b 1180 eor w17,w17,w5 1181 eor v29.16b,v21.16b,v22.16b 1182 eor w19,w19,w6 1183 ushr v1.4s,v24.4s,#20 1184 eor w20,w20,w7 1185 ushr v5.4s,v25.4s,#20 1186 eor w21,w21,w8 1187 ushr v9.4s,v26.4s,#20 1188 ror w17,w17,#24 1189 ushr v13.4s,v27.4s,#20 1190 ror w19,w19,#24 1191 ushr v17.4s,v28.4s,#20 1192 ror w20,w20,#24 1193 ushr v21.4s,v29.4s,#20 1194 ror w21,w21,#24 1195 sli v1.4s,v24.4s,#12 1196 add w13,w13,w17 1197 sli v5.4s,v25.4s,#12 1198 add w14,w14,w19 1199 sli v9.4s,v26.4s,#12 1200 add w15,w15,w20 1201 sli v13.4s,v27.4s,#12 1202 add w16,w16,w21 1203 sli v17.4s,v28.4s,#12 1204 eor w9,w9,w13 1205 sli v21.4s,v29.4s,#12 1206 eor w10,w10,w14 1207 add v0.4s,v0.4s,v1.4s 1208 eor w11,w11,w15 1209 add v4.4s,v4.4s,v5.4s 1210 eor w12,w12,w16 1211 add v8.4s,v8.4s,v9.4s 1212 ror w9,w9,#25 1213 add v12.4s,v12.4s,v13.4s 1214 ror w10,w10,#25 1215 add v16.4s,v16.4s,v17.4s 1216 ror w11,w11,#25 1217 add v20.4s,v20.4s,v21.4s 1218 ror w12,w12,#25 1219 eor v24.16b,v3.16b,v0.16b 1220 add w5,w5,w10 1221 eor v25.16b,v7.16b,v4.16b 1222 add w6,w6,w11 1223 eor v26.16b,v11.16b,v8.16b 1224 add w7,w7,w12 1225 eor v27.16b,v15.16b,v12.16b 1226 add w8,w8,w9 1227 eor v28.16b,v19.16b,v16.16b 1228 eor w21,w21,w5 1229 eor v29.16b,v23.16b,v20.16b 1230 eor w17,w17,w6 1231 ushr v3.4s,v24.4s,#24 1232 eor w19,w19,w7 1233 ushr 
v7.4s,v25.4s,#24 1234 eor w20,w20,w8 1235 ushr v11.4s,v26.4s,#24 1236 ror w21,w21,#16 1237 ushr v15.4s,v27.4s,#24 1238 ror w17,w17,#16 1239 ushr v19.4s,v28.4s,#24 1240 ror w19,w19,#16 1241 ushr v23.4s,v29.4s,#24 1242 ror w20,w20,#16 1243 sli v3.4s,v24.4s,#8 1244 add w15,w15,w21 1245 sli v7.4s,v25.4s,#8 1246 add w16,w16,w17 1247 sli v11.4s,v26.4s,#8 1248 add w13,w13,w19 1249 sli v15.4s,v27.4s,#8 1250 add w14,w14,w20 1251 sli v19.4s,v28.4s,#8 1252 eor w10,w10,w15 1253 sli v23.4s,v29.4s,#8 1254 eor w11,w11,w16 1255 add v2.4s,v2.4s,v3.4s 1256 eor w12,w12,w13 1257 add v6.4s,v6.4s,v7.4s 1258 eor w9,w9,w14 1259 add v10.4s,v10.4s,v11.4s 1260 ror w10,w10,#20 1261 add v14.4s,v14.4s,v15.4s 1262 ror w11,w11,#20 1263 add v18.4s,v18.4s,v19.4s 1264 ror w12,w12,#20 1265 add v22.4s,v22.4s,v23.4s 1266 ror w9,w9,#20 1267 eor v24.16b,v1.16b,v2.16b 1268 add w5,w5,w10 1269 eor v25.16b,v5.16b,v6.16b 1270 add w6,w6,w11 1271 eor v26.16b,v9.16b,v10.16b 1272 add w7,w7,w12 1273 eor v27.16b,v13.16b,v14.16b 1274 add w8,w8,w9 1275 eor v28.16b,v17.16b,v18.16b 1276 eor w21,w21,w5 1277 eor v29.16b,v21.16b,v22.16b 1278 eor w17,w17,w6 1279 ushr v1.4s,v24.4s,#25 1280 eor w19,w19,w7 1281 ushr v5.4s,v25.4s,#25 1282 eor w20,w20,w8 1283 ushr v9.4s,v26.4s,#25 1284 ror w21,w21,#24 1285 ushr v13.4s,v27.4s,#25 1286 ror w17,w17,#24 1287 ushr v17.4s,v28.4s,#25 1288 ror w19,w19,#24 1289 ushr v21.4s,v29.4s,#25 1290 ror w20,w20,#24 1291 sli v1.4s,v24.4s,#7 1292 add w15,w15,w21 1293 sli v5.4s,v25.4s,#7 1294 add w16,w16,w17 1295 sli v9.4s,v26.4s,#7 1296 add w13,w13,w19 1297 sli v13.4s,v27.4s,#7 1298 add w14,w14,w20 1299 sli v17.4s,v28.4s,#7 1300 eor w10,w10,w15 1301 sli v21.4s,v29.4s,#7 1302 eor w11,w11,w16 1303 ext v2.16b,v2.16b,v2.16b,#8 1304 eor w12,w12,w13 1305 ext v6.16b,v6.16b,v6.16b,#8 1306 eor w9,w9,w14 1307 ext v10.16b,v10.16b,v10.16b,#8 1308 ror w10,w10,#25 1309 ext v14.16b,v14.16b,v14.16b,#8 1310 ror w11,w11,#25 1311 ext v18.16b,v18.16b,v18.16b,#8 1312 ror w12,w12,#25 1313 ext v22.16b,v22.16b,v22.16b,#8 
1314 ror w9,w9,#25 1315 ext v3.16b,v3.16b,v3.16b,#4 1316 ext v7.16b,v7.16b,v7.16b,#4 1317 ext v11.16b,v11.16b,v11.16b,#4 1318 ext v15.16b,v15.16b,v15.16b,#4 1319 ext v19.16b,v19.16b,v19.16b,#4 1320 ext v23.16b,v23.16b,v23.16b,#4 1321 ext v1.16b,v1.16b,v1.16b,#12 1322 ext v5.16b,v5.16b,v5.16b,#12 1323 ext v9.16b,v9.16b,v9.16b,#12 1324 ext v13.16b,v13.16b,v13.16b,#12 1325 ext v17.16b,v17.16b,v17.16b,#12 1326 ext v21.16b,v21.16b,v21.16b,#12 1327 cbnz x4,Loop_upper_neon 1328 1329 add w5,w5,w22 // accumulate key block 1330 add x6,x6,x22,lsr#32 1331 add w7,w7,w23 1332 add x8,x8,x23,lsr#32 1333 add w9,w9,w24 1334 add x10,x10,x24,lsr#32 1335 add w11,w11,w25 1336 add x12,x12,x25,lsr#32 1337 add w13,w13,w26 1338 add x14,x14,x26,lsr#32 1339 add w15,w15,w27 1340 add x16,x16,x27,lsr#32 1341 add w17,w17,w28 1342 add x19,x19,x28,lsr#32 1343 add w20,w20,w30 1344 add x21,x21,x30,lsr#32 1345 1346 add x5,x5,x6,lsl#32 // pack 1347 add x7,x7,x8,lsl#32 1348 ldp x6,x8,[x1,#0] // load input 1349 add x9,x9,x10,lsl#32 1350 add x11,x11,x12,lsl#32 1351 ldp x10,x12,[x1,#16] 1352 add x13,x13,x14,lsl#32 1353 add x15,x15,x16,lsl#32 1354 ldp x14,x16,[x1,#32] 1355 add x17,x17,x19,lsl#32 1356 add x20,x20,x21,lsl#32 1357 ldp x19,x21,[x1,#48] 1358 add x1,x1,#64 1359#ifdef __AARCH64EB__ 1360 rev x5,x5 1361 rev x7,x7 1362 rev x9,x9 1363 rev x11,x11 1364 rev x13,x13 1365 rev x15,x15 1366 rev x17,x17 1367 rev x20,x20 1368#endif 1369 eor x5,x5,x6 1370 eor x7,x7,x8 1371 eor x9,x9,x10 1372 eor x11,x11,x12 1373 eor x13,x13,x14 1374 eor x15,x15,x16 1375 eor x17,x17,x19 1376 eor x20,x20,x21 1377 1378 stp x5,x7,[x0,#0] // store output 1379 add x28,x28,#1 // increment counter 1380 mov w5,w22 // unpack key block 1381 lsr x6,x22,#32 1382 stp x9,x11,[x0,#16] 1383 mov w7,w23 1384 lsr x8,x23,#32 1385 stp x13,x15,[x0,#32] 1386 mov w9,w24 1387 lsr x10,x24,#32 1388 stp x17,x20,[x0,#48] 1389 add x0,x0,#64 1390 mov w11,w25 1391 lsr x12,x25,#32 1392 mov w13,w26 1393 lsr x14,x26,#32 1394 mov w15,w27 1395 lsr x16,x27,#32 1396 
// Tail of the scalar key-block unpack: the low and high 32-bit halves of
// x28 and x30 become the last four scalar state words (w17, x19, w20, x21).
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	// x4 counts iterations of Loop_lower_neon (5 passes, see cbnz below).
	mov	x4,#5
// Loop_lower_neon: one pass interleaves two instruction streams that do not
// share registers:
//   * scalar: a 16-word state in w5-w17,w19-w21, advanced by two
//     column+diagonal round pairs per pass (rotations ror #16/#20/#24/#25,
//     i.e. left-rotations by 16/12/8/7);
//   * NEON: six states of four vectors each (v0-v3, v4-v7, v8-v11,
//     v12-v15, v16-v19, v20-v23), advanced by one column+diagonal pair
//     per pass, with v24-v29 as rotate temporaries.
// These match the ChaCha quarter-round rotation amounts used by the scalar
// Loop of ChaCha20_ctr32 at the top of the file.
// No instruction in the loop modifies the condition flags.
Loop_lower_neon:
	sub	x4,x4,#1
	// NEON: a += b ; scalar (column round): a += b
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v8.4s,v8.4s,v9.4s
	add	w7,w7,w11
	add	v12.4s,v12.4s,v13.4s
	add	w8,w8,w12
	add	v16.4s,v16.4s,v17.4s
	eor	w17,w17,w5
	add	v20.4s,v20.4s,v21.4s
	eor	w19,w19,w6
	// NEON: d ^= a
	eor	v3.16b,v3.16b,v0.16b
	eor	w20,w20,w7
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w8
	eor	v11.16b,v11.16b,v8.16b
	ror	w17,w17,#16
	eor	v15.16b,v15.16b,v12.16b
	ror	w19,w19,#16
	eor	v19.16b,v19.16b,v16.16b
	ror	w20,w20,#16
	eor	v23.16b,v23.16b,v20.16b
	ror	w21,w21,#16
	// NEON: rotate d by 16 (rev32 on halfwords swaps the two 16-bit halves)
	rev32	v3.8h,v3.8h
	add	w13,w13,w17
	rev32	v7.8h,v7.8h
	add	w14,w14,w19
	rev32	v11.8h,v11.8h
	add	w15,w15,w20
	rev32	v15.8h,v15.8h
	add	w16,w16,w21
	rev32	v19.8h,v19.8h
	eor	w9,w9,w13
	rev32	v23.8h,v23.8h
	eor	w10,w10,w14
	// NEON: c += d
	add	v2.4s,v2.4s,v3.4s
	eor	w11,w11,w15
	add	v6.4s,v6.4s,v7.4s
	eor	w12,w12,w16
	add	v10.4s,v10.4s,v11.4s
	ror	w9,w9,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w10,w10,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w11,w11,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w12,w12,#20
	// NEON: tmp = b ^ c, then b = tmp rotated left by 12 (ushr #20 + sli #12)
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w9
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w10
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w11
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w12
	eor	v28.16b,v17.16b,v18.16b
	eor	w17,w17,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w19,w19,w6
	ushr	v1.4s,v24.4s,#20
	eor	w20,w20,w7
	ushr	v5.4s,v25.4s,#20
	eor	w21,w21,w8
	ushr	v9.4s,v26.4s,#20
	ror	w17,w17,#24
	ushr	v13.4s,v27.4s,#20
	ror	w19,w19,#24
	ushr	v17.4s,v28.4s,#20
	ror	w20,w20,#24
	ushr	v21.4s,v29.4s,#20
	ror	w21,w21,#24
	sli	v1.4s,v24.4s,#12
	add	w13,w13,w17
	sli	v5.4s,v25.4s,#12
	add	w14,w14,w19
	sli	v9.4s,v26.4s,#12
	add	w15,w15,w20
	sli	v13.4s,v27.4s,#12
	add	w16,w16,w21
	sli	v17.4s,v28.4s,#12
	eor	w9,w9,w13
	sli	v21.4s,v29.4s,#12
	eor	w10,w10,w14
	// NEON: a += b
	add	v0.4s,v0.4s,v1.4s
	eor	w11,w11,w15
	add	v4.4s,v4.4s,v5.4s
	eor	w12,w12,w16
	add	v8.4s,v8.4s,v9.4s
	ror	w9,w9,#25
	add	v12.4s,v12.4s,v13.4s
	ror	w10,w10,#25
	add	v16.4s,v16.4s,v17.4s
	ror	w11,w11,#25
	add	v20.4s,v20.4s,v21.4s
	ror	w12,w12,#25
	// NEON: tmp = d ^ a, then d = tmp rotated left by 8 (ushr #24 + sli #8)
	// scalar: diagonal round starts here (a += b with b shifted by one word)
	eor	v24.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v25.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v26.16b,v11.16b,v8.16b
	add	w7,w7,w12
	eor	v27.16b,v15.16b,v12.16b
	add	w8,w8,w9
	eor	v28.16b,v19.16b,v16.16b
	eor	w21,w21,w5
	eor	v29.16b,v23.16b,v20.16b
	eor	w17,w17,w6
	ushr	v3.4s,v24.4s,#24
	eor	w19,w19,w7
	ushr	v7.4s,v25.4s,#24
	eor	w20,w20,w8
	ushr	v11.4s,v26.4s,#24
	ror	w21,w21,#16
	ushr	v15.4s,v27.4s,#24
	ror	w17,w17,#16
	ushr	v19.4s,v28.4s,#24
	ror	w19,w19,#16
	ushr	v23.4s,v29.4s,#24
	ror	w20,w20,#16
	sli	v3.4s,v24.4s,#8
	add	w15,w15,w21
	sli	v7.4s,v25.4s,#8
	add	w16,w16,w17
	sli	v11.4s,v26.4s,#8
	add	w13,w13,w19
	sli	v15.4s,v27.4s,#8
	add	w14,w14,w20
	sli	v19.4s,v28.4s,#8
	eor	w10,w10,w15
	sli	v23.4s,v29.4s,#8
	eor	w11,w11,w16
	// NEON: c += d
	add	v2.4s,v2.4s,v3.4s
	eor	w12,w12,w13
	add	v6.4s,v6.4s,v7.4s
	eor	w9,w9,w14
	add	v10.4s,v10.4s,v11.4s
	ror	w10,w10,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w11,w11,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w12,w12,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w9,w9,#20
	// NEON: tmp = b ^ c, then b = tmp rotated left by 7 (ushr #25 + sli #7)
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w10
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w11
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w12
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w9
	eor	v28.16b,v17.16b,v18.16b
	eor	w21,w21,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w17,w17,w6
	ushr	v1.4s,v24.4s,#25
	eor	w19,w19,w7
	ushr	v5.4s,v25.4s,#25
	eor	w20,w20,w8
	ushr	v9.4s,v26.4s,#25
	ror	w21,w21,#24
	ushr	v13.4s,v27.4s,#25
	ror	w17,w17,#24
	ushr	v17.4s,v28.4s,#25
	ror	w19,w19,#24
	ushr	v21.4s,v29.4s,#25
	ror	w20,w20,#24
	sli	v1.4s,v24.4s,#7
	add	w15,w15,w21
	sli	v5.4s,v25.4s,#7
	add	w16,w16,w17
	sli	v9.4s,v26.4s,#7
	add	w13,w13,w19
	sli	v13.4s,v27.4s,#7
	add	w14,w14,w20
	sli	v17.4s,v28.4s,#7
	eor	w10,w10,w15
	sli	v21.4s,v29.4s,#7
	eor	w11,w11,w16
	// NEON: rotate the 32-bit lanes of c by 8 bytes, d by 12 and b by 4 so
	// that the next NEON pass lines up the diagonals in the same lanes.
	ext	v2.16b,v2.16b,v2.16b,#8
	eor	w12,w12,w13
	ext	v6.16b,v6.16b,v6.16b,#8
	eor	w9,w9,w14
	ext	v10.16b,v10.16b,v10.16b,#8
	ror	w10,w10,#25
	ext	v14.16b,v14.16b,v14.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v22.16b,v22.16b,v22.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v11.16b,v11.16b,v11.16b,#12
	ext	v15.16b,v15.16b,v15.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v23.16b,v23.16b,v23.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v9.16b,v9.16b,v9.16b,#4
	ext	v13.16b,v13.16b,v13.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	ext	v21.16b,v21.16b,v21.16b,#4
	// Second half of the pass: the NEON stream repeats the same
	// add/eor/rotate sequence on the lane-rotated rows; the scalar stream
	// starts its second column+diagonal round pair.
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v8.4s,v8.4s,v9.4s
	add	w7,w7,w11
	add	v12.4s,v12.4s,v13.4s
	add	w8,w8,w12
	add	v16.4s,v16.4s,v17.4s
	eor	w17,w17,w5
	add	v20.4s,v20.4s,v21.4s
	eor	w19,w19,w6
	eor	v3.16b,v3.16b,v0.16b
	eor	w20,w20,w7
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w8
	eor	v11.16b,v11.16b,v8.16b
	ror	w17,w17,#16
	eor	v15.16b,v15.16b,v12.16b
	ror	w19,w19,#16
	eor	v19.16b,v19.16b,v16.16b
	ror	w20,w20,#16
	eor	v23.16b,v23.16b,v20.16b
	ror	w21,w21,#16
	rev32	v3.8h,v3.8h
	add	w13,w13,w17
	rev32	v7.8h,v7.8h
	add	w14,w14,w19
	rev32	v11.8h,v11.8h
	add	w15,w15,w20
	rev32	v15.8h,v15.8h
	add	w16,w16,w21
	rev32	v19.8h,v19.8h
	eor	w9,w9,w13
	rev32	v23.8h,v23.8h
	eor	w10,w10,w14
	add	v2.4s,v2.4s,v3.4s
	eor	w11,w11,w15
	add	v6.4s,v6.4s,v7.4s
	eor	w12,w12,w16
	add	v10.4s,v10.4s,v11.4s
	ror	w9,w9,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w10,w10,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w11,w11,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w12,w12,#20
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w9
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w10
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w11
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w12
	eor	v28.16b,v17.16b,v18.16b
	eor	w17,w17,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w19,w19,w6
	ushr	v1.4s,v24.4s,#20
	eor	w20,w20,w7
	ushr	v5.4s,v25.4s,#20
	eor	w21,w21,w8
	ushr	v9.4s,v26.4s,#20
	ror	w17,w17,#24
	ushr	v13.4s,v27.4s,#20
	ror	w19,w19,#24
	ushr	v17.4s,v28.4s,#20
	ror	w20,w20,#24
	ushr	v21.4s,v29.4s,#20
	ror	w21,w21,#24
	sli	v1.4s,v24.4s,#12
	add	w13,w13,w17
	sli	v5.4s,v25.4s,#12
	add	w14,w14,w19
	sli	v9.4s,v26.4s,#12
	add	w15,w15,w20
	sli	v13.4s,v27.4s,#12
	add	w16,w16,w21
	sli	v17.4s,v28.4s,#12
	eor	w9,w9,w13
	sli	v21.4s,v29.4s,#12
	eor	w10,w10,w14
	add	v0.4s,v0.4s,v1.4s
	eor	w11,w11,w15
	add	v4.4s,v4.4s,v5.4s
	eor	w12,w12,w16
	add	v8.4s,v8.4s,v9.4s
	ror	w9,w9,#25
	add	v12.4s,v12.4s,v13.4s
	ror	w10,w10,#25
	add	v16.4s,v16.4s,v17.4s
	ror	w11,w11,#25
	add	v20.4s,v20.4s,v21.4s
	ror	w12,w12,#25
	eor	v24.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v25.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v26.16b,v11.16b,v8.16b
	add	w7,w7,w12
	eor	v27.16b,v15.16b,v12.16b
	add	w8,w8,w9
	eor	v28.16b,v19.16b,v16.16b
	eor	w21,w21,w5
	eor	v29.16b,v23.16b,v20.16b
	eor	w17,w17,w6
	ushr	v3.4s,v24.4s,#24
	eor	w19,w19,w7
	ushr	v7.4s,v25.4s,#24
	eor	w20,w20,w8
	ushr	v11.4s,v26.4s,#24
	ror	w21,w21,#16
	ushr	v15.4s,v27.4s,#24
	ror	w17,w17,#16
	ushr	v19.4s,v28.4s,#24
	ror	w19,w19,#16
	ushr	v23.4s,v29.4s,#24
	ror	w20,w20,#16
	sli	v3.4s,v24.4s,#8
	add	w15,w15,w21
	sli	v7.4s,v25.4s,#8
	add	w16,w16,w17
	sli	v11.4s,v26.4s,#8
	add	w13,w13,w19
	sli	v15.4s,v27.4s,#8
	add	w14,w14,w20
	sli	v19.4s,v28.4s,#8
	eor	w10,w10,w15
	sli	v23.4s,v29.4s,#8
	eor	w11,w11,w16
	add	v2.4s,v2.4s,v3.4s
	eor	w12,w12,w13
	add	v6.4s,v6.4s,v7.4s
	eor	w9,w9,w14
	add	v10.4s,v10.4s,v11.4s
	ror	w10,w10,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w11,w11,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w12,w12,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w9,w9,#20
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w10
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w11
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w12
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w9
	eor	v28.16b,v17.16b,v18.16b
	eor	w21,w21,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w17,w17,w6
	ushr	v1.4s,v24.4s,#25
	eor	w19,w19,w7
	ushr	v5.4s,v25.4s,#25
	eor	w20,w20,w8
	ushr	v9.4s,v26.4s,#25
	ror	w21,w21,#24
	ushr	v13.4s,v27.4s,#25
	ror	w17,w17,#24
	ushr	v17.4s,v28.4s,#25
	ror	w19,w19,#24
	ushr	v21.4s,v29.4s,#25
	ror	w20,w20,#24
	sli	v1.4s,v24.4s,#7
	add	w15,w15,w21
	sli	v5.4s,v25.4s,#7
	add	w16,w16,w17
	sli	v9.4s,v26.4s,#7
	add	w13,w13,w19
	sli	v13.4s,v27.4s,#7
	add	w14,w14,w20
	sli	v17.4s,v28.4s,#7
	eor	w10,w10,w15
	sli	v21.4s,v29.4s,#7
	eor	w11,w11,w16
	// NEON: undo the earlier lane rotation (c by 8 bytes, d by 4, b by 12)
	// so the rows are back in their original lane order for the next pass.
	ext	v2.16b,v2.16b,v2.16b,#8
	eor	w12,w12,w13
	ext	v6.16b,v6.16b,v6.16b,#8
	eor	w9,w9,w14
	ext	v10.16b,v10.16b,v10.16b,#8
	ror	w10,w10,#25
	ext	v14.16b,v14.16b,v14.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v22.16b,v22.16b,v22.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v11.16b,v11.16b,v11.16b,#4
	ext	v15.16b,v15.16b,v15.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v23.16b,v23.16b,v23.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v9.16b,v9.16b,v9.16b,#12
	ext	v13.16b,v13.16b,v13.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	ext	v21.16b,v21.16b,v21.16b,#12
	cbnz	x4,Loop_lower_neon

	// Rounds done. Add the packed input words (x22-x28,x30) back into the
	// scalar state, and add the vectors reloaded from the stack off-load
	// area ([sp,#0..#95], stored before this chunk) plus v30 into the six
	// NEON states. Scalar and NEON work stay interleaved.
	add	w5,w5,w22		// accumulate key block
	ldp	q24,q25,[sp,#0]
	add	x6,x6,x22,lsr#32
	ldp	q26,q27,[sp,#32]
	add	w7,w7,w23
	ldp	q28,q29,[sp,#64]
	add	x8,x8,x23,lsr#32
	// v24 is added to the first row of every NEON state
	add	v0.4s,v0.4s,v24.4s
	add	w9,w9,w24
	add	v4.4s,v4.4s,v24.4s
	add	x10,x10,x24,lsr#32
	add	v8.4s,v8.4s,v24.4s
	add	w11,w11,w25
	add	v12.4s,v12.4s,v24.4s
	add	x12,x12,x25,lsr#32
	add	v16.4s,v16.4s,v24.4s
	add	w13,w13,w26
	add	v20.4s,v20.4s,v24.4s
	add	x14,x14,x26,lsr#32
	// v26 is added to the third row of every NEON state
	add	v2.4s,v2.4s,v26.4s
	add	w15,w15,w27
	add	v6.4s,v6.4s,v26.4s
	add	x16,x16,x27,lsr#32
	add	v10.4s,v10.4s,v26.4s
	add	w17,w17,w28
	add	v14.4s,v14.4s,v26.4s
	add	x19,x19,x28,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w20,w20,w30
	add	v22.4s,v22.4s,v26.4s
	add	x21,x21,x30,lsr#32
	// Fourth rows: states 0-3 take v27-v30; states 4 and 5 reuse v27/v28
	// after first being bumped by v31.
	add	v19.4s,v19.4s,v31.4s			// +4
	add	x5,x5,x6,lsl#32		// pack
	add	v23.4s,v23.4s,v31.4s			// +4
	add	x7,x7,x8,lsl#32
	add	v3.4s,v3.4s,v27.4s
	// once packed into x5/x7, x6 and x8 are free to receive input words
	ldp	x6,x8,[x1,#0]		// load input
	add	v7.4s,v7.4s,v28.4s
	add	x9,x9,x10,lsl#32
	add	v11.4s,v11.4s,v29.4s
	add	x11,x11,x12,lsl#32
	add	v15.4s,v15.4s,v30.4s
	ldp	x10,x12,[x1,#16]
	add	v19.4s,v19.4s,v27.4s
	add	x13,x13,x14,lsl#32
	add	v23.4s,v23.4s,v28.4s
	add	x15,x15,x16,lsl#32
	// v25 is added to the second row of every NEON state
	add	v1.4s,v1.4s,v25.4s
	ldp	x14,x16,[x1,#32]
	add	v5.4s,v5.4s,v25.4s
	add	x17,x17,x19,lsl#32
	add	v9.4s,v9.4s,v25.4s
	add	x20,x20,x21,lsl#32
	add	v13.4s,v13.4s,v25.4s
	ldp	x19,x21,[x1,#48]
	add	v17.4s,v17.4s,v25.4s
	add	x1,x1,#64
	add	v21.4s,v21.4s,v25.4s

#ifdef __AARCH64EB__
	// big-endian build: byte-reverse the packed scalar words
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// XOR the scalar block with the 64 input bytes loaded above and XOR the
	// NEON states with the following input, 64 bytes at a time. v24-v27
	// (and later v0-v15, once stored) serve as input buffers.
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v24.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v25.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v26.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v27.16b
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#7			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64

	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
	eor	v4.16b,v4.16b,v24.16b
	eor	v5.16b,v5.16b,v25.16b
	eor	v6.16b,v6.16b,v26.16b
	eor	v7.16b,v7.16b,v27.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v8.16b,v8.16b,v0.16b
	// v24-v27 were overwritten as input buffers; reload them from the stack
	ldp	q24,q25,[sp,#0]
	eor	v9.16b,v9.16b,v1.16b
	ldp	q26,q27,[sp,#32]
	eor	v10.16b,v10.16b,v2.16b
	eor	v11.16b,v11.16b,v3.16b
	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64

	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
	eor	v12.16b,v12.16b,v4.16b
	eor	v13.16b,v13.16b,v5.16b
	eor	v14.16b,v14.16b,v6.16b
	eor	v15.16b,v15.16b,v7.16b
	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64

	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
	eor	v16.16b,v16.16b,v8.16b
	eor	v17.16b,v17.16b,v9.16b
	eor	v18.16b,v18.16b,v10.16b
	eor	v19.16b,v19.16b,v11.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	// v0 was already stored, so it is reused for the doubled v31 constant
	shl	v0.4s,v31.4s,#1			// 4 -> 8
	eor	v20.16b,v20.16b,v12.16b
	eor	v21.16b,v21.16b,v13.16b
// Finish the XOR of the sixth NEON state (v20-v23) with the input held in
// v12-v15 and store it to the output pointer x0 (post-incremented by 64).
	eor	v22.16b,v22.16b,v14.16b
	eor	v23.16b,v23.16b,v15.16b
	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64

	// Advance the four vectors v27-v30 by v0 (v31 shifted left by one).
	add	v27.4s,v27.4s,v0.4s			// += 8
	add	v28.4s,v28.4s,v0.4s
	add	v29.4s,v29.4s,v0.4s
	add	v30.4s,v30.4s,v0.4s

	// NOTE(review): the flags tested here are set before this chunk; none of
	// the visible instructions since the loop began modifies them. Presumably
	// a subs on x2 at Loop_outer_512_neon; confirm.
	b.hs	Loop_outer_512_neon

	// Fewer than 512 bytes were left. adds restores x2 and sets Z when
	// nothing remains. The ushr/ldp/stp that follow do not touch the flags,
	// so the b.eq below still tests this adds.
	adds	x2,x2,#512
	ushr	v0.4s,v31.4s,#2			// 4 -> 1

	// d8-d15 are callee-saved under AAPCS64; restore them from [sp,#128..].
	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	d10,d11,[sp,#128+16]
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	// Overwrite the 96-byte stack off-load area with copies of q24 and q31.
	stp	q24,q31,[sp,#0]		// wipe off-load area
	stp	q24,q31,[sp,#32]
	stp	q24,q31,[sp,#64]

	b.eq	Ldone_512_neon

	// Bytes remain: with 192 or more continue at Loop_outer_neon (defined
	// outside this chunk), otherwise fall back to the scalar Loop_outer.
	// The vector sub and the sp adjustment leave the cmp flags intact.
	cmp	x2,#192
	sub	v27.4s,v27.4s,v0.4s			// -= 1
	sub	v28.4s,v28.4s,v0.4s
	sub	v29.4s,v29.4s,v0.4s
	// Release the lower 128 bytes of stack. 64 bytes stay allocated below
	// the frame, the same amount Lshort reserves before Loop_outer.
	add	sp,sp,#128
	b.hs	Loop_outer_neon

	// Scalar fallback: zero v25-v30 before leaving the NEON path.
	eor	v25.16b,v25.16b,v25.16b
	eor	v26.16b,v26.16b,v26.16b
	eor	v27.16b,v27.16b,v27.16b
	eor	v28.16b,v28.16b,v28.16b
	eor	v29.16b,v29.16b,v29.16b
	eor	v30.16b,v30.16b,v30.16b
	b	Loop_outer

// Ldone_512_neon: all input consumed. Restore x19-x28 from the frame
// addressed by x29, release the 128+64 bytes of stack below the frame, pop
// the 96-byte frame (x29/x30) and return.
Ldone_512_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)