// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
//
// NOTE(review): the explanatory comments in this file were added during
// review. Because the header above says the file is generated, they would
// have to be carried into the generator script to survive regeneration.

#include <ring-core/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <ring-core/arm_arch.h>


.hidden	OPENSSL_armcap_P

.section	.rodata

.align	5
// Sigma constants (state words 0-3). Read as little-endian bytes the two
// quads spell the ASCII text: expand 32-byte k
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Per-lane counter increment {1,0,0,0}. It sits 16 bytes after .Lsigma, which
// is how the NEON code reaches it (post-incremented x5).
.Lone:
.long	1,0,0,0
// NUL-terminated ASCII credit string:
// ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

.text

// ---------------------------------------------------------------------------
// ChaCha20_ctr32: XOR a buffer with the ChaCha20 keystream.
//
// Register roles on entry, as used by the code below:
//   x0 = output pointer                  (written with stp / strb)
//   x1 = input pointer                   (read with ldp / ldrb)
//   x2 = length in bytes                 (zero returns immediately)
//   x3 = pointer to the 32-byte key      (state words 4-11)
//   x4 = pointer to the 16-byte counter block (state words 12-15)
// NOTE(review): this presumably corresponds to a C prototype of the form
//   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t len,
//                       const uint32_t key[8], const uint32_t counter[4]);
// confirm against the header that declares it.
//
// Dispatch: lengths of 192 bytes or more are handed to ChaCha20_neon when
// the ARMV7_NEON bit is set in OPENSSL_armcap_P. Everything else takes the
// integer-only path at .Lshort.
//
// Integer path register map:
//   x22,x23          sigma, two 32-bit state words per register
//   x24-x27          key, two words per register
//   x28,x30          counter block, two words per register. x30 can hold
//                    data because the return address is saved in the frame.
//   w5-w8            working row 0
//   w9-w12           working row 1
//   w13-w16          working row 2
//   w17,w19,w20,w21  working row 3 (x18 is never touched)
//   x4               round-loop counter, reused once the counter block has
//                    been loaded through it
//
// Stack: a 96-byte frame holding x29, x30 and x19-x28, plus 64 bytes of
// scratch below it that receives one keystream block when the final block
// is partial. x29 keeps pointing at the frame so the epilogue can restore
// registers relative to it.
//
// NOTE(review): AARCH64_VALID_CALL_TARGET, AARCH64_SIGN_LINK_REGISTER and
// AARCH64_VALIDATE_LINK_REGISTER are macros defined outside this file
// (presumably in arm_arch.h, for BTI / pointer authentication) - confirm.
// ---------------------------------------------------------------------------
.globl	ChaCha20_ctr32
.hidden	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	AARCH64_VALID_CALL_TARGET
	cbz	x2,.Labort		// nothing to do for a zero length
#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
	adrp	x5,:pg_hi21_nc:OPENSSL_armcap_P
#else
	adrp	x5,OPENSSL_armcap_P
#endif
	cmp	x2,#192
	b.lo	.Lshort			// short input: integer code only
	ldr	w17,[x5,:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon		// x0-x4 are still intact here

.Lshort:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,.Lsigma
	add	x5,x5,:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64		// scratch for a partial final block

	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef __AARCH64EB__
	// Big-endian builds: swap the 32-bit halves of each register so the
	// word that comes first in memory sits in the low half, as the
	// unpack code below expects. Sigma is not swapped (see .Lsigma).
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

// One pass per 64-byte block.
.Loop_outer:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10			// 10 passes of column + diagonal round
	// The flags set here are consumed after the round loop by
	// b.lo .Ltail and b.hi .Loop_outer. Nothing in between sets flags.
	subs	x2,x2,#64
.Loop:
	sub	x4,x4,#1
	// Column round. Four quarter-rounds (a,b,c,d) run side by side:
	// (w5,w9,w13,w17) (w6,w10,w14,w19) (w7,w11,w15,w20) (w8,w12,w16,w21)
	add	w5,w5,w9		// a += b
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5		// d ^= a
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16		// d = rotate left 16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17		// c += d
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13		// b ^= c
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20		// b = rotate left 12 (right 20)
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9		// a += b
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5		// d ^= a
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24		// d = rotate left 8 (right 24)
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17		// c += d
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13		// b ^= c
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25		// b = rotate left 7 (right 25)
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Diagonal round. Same steps on the quarter-rounds
	// (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20)
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	// Add the input state back in. The odd-numbered words use a 64-bit
	// add, so a carry can land in bit 32; the lsl #32 in the pack step
	// below shifts that bit out again.
	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail			// fewer than 64 bytes were left

	// Full block: pack word pairs into 64-bit registers while loading
	// 64 input bytes into the registers that have just been freed.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	// This is a 64-bit add on the register holding state words 12 and
	// 13, so a wrap of the low word would carry into the upper word.
	// NOTE(review): callers presumably never let the 32-bit block
	// counter wrap within one call - confirm.
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	.Loop_outer		// more input remains

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
.Labort:
	ret

// Partial final block of 1 to 63 bytes.
.align	4
.Ltail:
	add	x2,x2,#64		// undo the subs: x2 = bytes remaining
// Also entered from .Ltail_neon in ChaCha20_neon with x2 below 64 and the
// keystream block still unpacked in w5-w21. That function builds the same
// 96 + 64 byte frame, so the epilogue below serves both.
.Less_than_64:
	// Set up for a byte loop indexed by a negative offset that counts up
	// to zero: x1 = end of input, x4 = sp + n, x2 = -n. x0 is biased by
	// -1 because the store in the loop uses x2 after it has been
	// incremented.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Spill the keystream block to the stack scratch area.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	ldrb	w10,[x1,x2]		// input byte
	ldrb	w11,[x4,x2]		// keystream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	// Overwrite the spilled keystream with zeros before returning.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

// ---------------------------------------------------------------------------
// ChaCha20_neon: NEON-assisted path, reached by the branch in ChaCha20_ctr32
// with x0-x4 holding the same values as on entry there.
//
// Inputs of 512 bytes or more leave right after the prologue for
// .L512_or_more_neon, which lives in ChaCha20_512_neon.
//
// Otherwise each pass of .Loop_outer_neon produces 256 bytes of keystream:
// one block in the integer registers (same register map as the integer path
// in ChaCha20_ctr32) and three blocks in vector registers, with the two
// instruction streams interleaved line by line.
//
//   v24                    sigma row
//   v25,v26                key rows
//   v27,v28,v29            counter rows of the three vector blocks, equal to
//                          the integer block counter +1, +2 and +3
//   v31                    counter step: loaded from .Lone as {1,0,0,0},
//                          then shifted left by 2 to {4,0,0,0}
//   v0-v3, v4-v7, v16-v19  working rows 0-3 of vector blocks 1, 2 and 3
//   v20-v23                temporaries: rotate scratch (v20-v22) inside the
//                          round loop, input data (v20-v23) afterwards
//
// Only v0-v7 and v16-v31 are written here, so the callee-saved v8-v15 need
// no spill in this function. The frame is the same 96 + 64 bytes as in
// ChaCha20_ctr32.
// ---------------------------------------------------------------------------
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,.Lsigma
	add	x5,x5,:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon	// large input: other code path

	sub	sp,sp,#64		// scratch for a partial final block

	ldp	x22,x23,[x5]		// load sigma
	ld1	{v24.4s},[x5],#16	// x5 now points at .Lone
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]
#ifdef __AARCH64EB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	// The integer block uses the counter as loaded; the vector blocks
	// take the next three values.
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2			// 1 -> 4

// One pass per 256 bytes. State setup for the integer block is interleaved
// with copies of the constant rows into the three vector blocks.
.Loop_outer_neon:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10			// 10 passes of column + diagonal round
	// The flags set here are consumed after the round loop by
	// b.lo .Ltail_neon and b.hi .Loop_outer_neon. Nothing in between
	// sets flags.
	subs	x2,x2,#256
// Round loop. Integer lines follow the same sequence as .Loop in
// ChaCha20_ctr32. Vector lines do the same steps on whole rows:
// rev32 on .8h lanes rotates each 32-bit lane by 16, and each ushr/sli pair
// through v20-v22 is a rotate left by 12, 8 or 7.
.Loop_neon:
	sub	x4,x4,#1
	// ---- first half: column round ----
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	// Vector side: rotate rows 2, 3 and 1 by 8, 12 and 4 bytes so the
	// second half operates on diagonals. The integer side is finishing
	// its column round with the last rotate.
	ror	w9,w9,#25
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	// ---- second half: diagonal round ----
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	// Vector side: undo the row rotation (row 3 by 4 bytes, row 1 by 12)
	// so the rows are back in column order for the next pass.
	ror	w10,w10,#25
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,.Loop_neon

	// Add the input state back into all four blocks. Each vector block
	// gets its own counter row (v27, v28, v29). On the integer side a
	// carry into bit 32 of the odd-numbered words is shifted out by the
	// lsl #32 in the pack step.
	add	w5,w5,w22		// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	b.lo	.Ltail_neon		// fewer than 256 bytes were left

	// Full 256 bytes. Output order: integer block, then vector blocks
	// 1, 2 and 3. Loads of later input chunks are started before the
	// stores of earlier output chunks.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// v0-v3 reused for input

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	b.hi	.Loop_outer_neon	// more input remains

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

// Fewer than 256 bytes remain and all four keystream blocks are computed.
// Whole 64-byte chunks are consumed in block order (integer, v0-v3, v4-v7).
// The first block that is only partly needed is spilled to the stack
// scratch area and finished by the byte loop at .Last_neon.
.Ltail_neon:
	add	x2,x2,#256		// undo the subs: x2 = bytes remaining
	cmp	x2,#64
	b.lo	.Less_than_64		// shared byte-wise tail in ChaCha20_ctr32

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	// Each b.eq below still tests the most recent cmp x2,#64: equal
	// means the chunk just written was exactly the last 64 bytes.
	b.eq	.Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64

	// Three whole chunks done: the remainder comes from vector block 3.
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
	b	.Last_neon

.Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]	// remainder from vector block 1
	b	.Last_neon
.Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]	// remainder from vector block 2
	b	.Last_neon

// Byte loop over the last x2 bytes, using the keystream block spilled at sp.
// Pointer setup matches .Less_than_64: negative index counting up to zero,
// with x0 biased by -1 because the store uses x2 after the increment.
.align	4
.Last_neon:
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

.Loop_tail_neon:
	ldrb	w10,[x1,x2]		// input byte
	ldrb	w11,[x4,x2]		// keystream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	// Overwrite the spilled keystream with zeros before returning.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

// Reached directly when the input ended on a 64-byte boundary. Nothing was
// spilled on that path, so the wipe above is skipped.
.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
// ChaCha20_512_neon: its prologue repeats the one in ChaCha20_neon, which
// enters this function at .L512_or_more_neon (just after the prologue) for
// inputs of 512 bytes or more.
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
819 add x29,sp,#0 820 821 adrp x5,.Lsigma 822 add x5,x5,:lo12:.Lsigma 823 stp x19,x20,[sp,#16] 824 stp x21,x22,[sp,#32] 825 stp x23,x24,[sp,#48] 826 stp x25,x26,[sp,#64] 827 stp x27,x28,[sp,#80] 828 829.L512_or_more_neon: 830 sub sp,sp,#128+64 831 832 ldp x22,x23,[x5] // load sigma 833 ld1 {v24.4s},[x5],#16 834 ldp x24,x25,[x3] // load key 835 ldp x26,x27,[x3,#16] 836 ld1 {v25.4s,v26.4s},[x3] 837 ldp x28,x30,[x4] // load counter 838 ld1 {v27.4s},[x4] 839 ld1 {v31.4s},[x5] 840#ifdef __AARCH64EB__ 841 rev64 v24.4s,v24.4s 842 ror x24,x24,#32 843 ror x25,x25,#32 844 ror x26,x26,#32 845 ror x27,x27,#32 846 ror x28,x28,#32 847 ror x30,x30,#32 848#endif 849 add v27.4s,v27.4s,v31.4s // += 1 850 stp q24,q25,[sp,#0] // off-load key block, invariant part 851 add v27.4s,v27.4s,v31.4s // not typo 852 str q26,[sp,#32] 853 add v28.4s,v27.4s,v31.4s 854 add v29.4s,v28.4s,v31.4s 855 add v30.4s,v29.4s,v31.4s 856 shl v31.4s,v31.4s,#2 // 1 -> 4 857 858 stp d8,d9,[sp,#128+0] // meet ABI requirements 859 stp d10,d11,[sp,#128+16] 860 stp d12,d13,[sp,#128+32] 861 stp d14,d15,[sp,#128+48] 862 863 sub x2,x2,#512 // not typo 864 865.Loop_outer_512_neon: 866 mov v0.16b,v24.16b 867 mov v4.16b,v24.16b 868 mov v8.16b,v24.16b 869 mov v12.16b,v24.16b 870 mov v16.16b,v24.16b 871 mov v20.16b,v24.16b 872 mov v1.16b,v25.16b 873 mov w5,w22 // unpack key block 874 mov v5.16b,v25.16b 875 lsr x6,x22,#32 876 mov v9.16b,v25.16b 877 mov w7,w23 878 mov v13.16b,v25.16b 879 lsr x8,x23,#32 880 mov v17.16b,v25.16b 881 mov w9,w24 882 mov v21.16b,v25.16b 883 lsr x10,x24,#32 884 mov v3.16b,v27.16b 885 mov w11,w25 886 mov v7.16b,v28.16b 887 lsr x12,x25,#32 888 mov v11.16b,v29.16b 889 mov w13,w26 890 mov v15.16b,v30.16b 891 lsr x14,x26,#32 892 mov v2.16b,v26.16b 893 mov w15,w27 894 mov v6.16b,v26.16b 895 lsr x16,x27,#32 896 add v19.4s,v3.4s,v31.4s // +4 897 mov w17,w28 898 add v23.4s,v7.4s,v31.4s // +4 899 lsr x19,x28,#32 900 mov v10.16b,v26.16b 901 mov w20,w30 902 mov v14.16b,v26.16b 903 lsr x21,x30,#32 904 mov 
v18.16b,v26.16b 905 stp q27,q28,[sp,#48] // off-load key block, variable part 906 mov v22.16b,v26.16b 907 str q29,[sp,#80] 908 909 mov x4,#5 910 subs x2,x2,#512 911.Loop_upper_neon: 912 sub x4,x4,#1 913 add v0.4s,v0.4s,v1.4s 914 add w5,w5,w9 915 add v4.4s,v4.4s,v5.4s 916 add w6,w6,w10 917 add v8.4s,v8.4s,v9.4s 918 add w7,w7,w11 919 add v12.4s,v12.4s,v13.4s 920 add w8,w8,w12 921 add v16.4s,v16.4s,v17.4s 922 eor w17,w17,w5 923 add v20.4s,v20.4s,v21.4s 924 eor w19,w19,w6 925 eor v3.16b,v3.16b,v0.16b 926 eor w20,w20,w7 927 eor v7.16b,v7.16b,v4.16b 928 eor w21,w21,w8 929 eor v11.16b,v11.16b,v8.16b 930 ror w17,w17,#16 931 eor v15.16b,v15.16b,v12.16b 932 ror w19,w19,#16 933 eor v19.16b,v19.16b,v16.16b 934 ror w20,w20,#16 935 eor v23.16b,v23.16b,v20.16b 936 ror w21,w21,#16 937 rev32 v3.8h,v3.8h 938 add w13,w13,w17 939 rev32 v7.8h,v7.8h 940 add w14,w14,w19 941 rev32 v11.8h,v11.8h 942 add w15,w15,w20 943 rev32 v15.8h,v15.8h 944 add w16,w16,w21 945 rev32 v19.8h,v19.8h 946 eor w9,w9,w13 947 rev32 v23.8h,v23.8h 948 eor w10,w10,w14 949 add v2.4s,v2.4s,v3.4s 950 eor w11,w11,w15 951 add v6.4s,v6.4s,v7.4s 952 eor w12,w12,w16 953 add v10.4s,v10.4s,v11.4s 954 ror w9,w9,#20 955 add v14.4s,v14.4s,v15.4s 956 ror w10,w10,#20 957 add v18.4s,v18.4s,v19.4s 958 ror w11,w11,#20 959 add v22.4s,v22.4s,v23.4s 960 ror w12,w12,#20 961 eor v24.16b,v1.16b,v2.16b 962 add w5,w5,w9 963 eor v25.16b,v5.16b,v6.16b 964 add w6,w6,w10 965 eor v26.16b,v9.16b,v10.16b 966 add w7,w7,w11 967 eor v27.16b,v13.16b,v14.16b 968 add w8,w8,w12 969 eor v28.16b,v17.16b,v18.16b 970 eor w17,w17,w5 971 eor v29.16b,v21.16b,v22.16b 972 eor w19,w19,w6 973 ushr v1.4s,v24.4s,#20 974 eor w20,w20,w7 975 ushr v5.4s,v25.4s,#20 976 eor w21,w21,w8 977 ushr v9.4s,v26.4s,#20 978 ror w17,w17,#24 979 ushr v13.4s,v27.4s,#20 980 ror w19,w19,#24 981 ushr v17.4s,v28.4s,#20 982 ror w20,w20,#24 983 ushr v21.4s,v29.4s,#20 984 ror w21,w21,#24 985 sli v1.4s,v24.4s,#12 986 add w13,w13,w17 987 sli v5.4s,v25.4s,#12 988 add w14,w14,w19 989 sli 
v9.4s,v26.4s,#12 990 add w15,w15,w20 991 sli v13.4s,v27.4s,#12 992 add w16,w16,w21 993 sli v17.4s,v28.4s,#12 994 eor w9,w9,w13 995 sli v21.4s,v29.4s,#12 996 eor w10,w10,w14 997 add v0.4s,v0.4s,v1.4s 998 eor w11,w11,w15 999 add v4.4s,v4.4s,v5.4s 1000 eor w12,w12,w16 1001 add v8.4s,v8.4s,v9.4s 1002 ror w9,w9,#25 1003 add v12.4s,v12.4s,v13.4s 1004 ror w10,w10,#25 1005 add v16.4s,v16.4s,v17.4s 1006 ror w11,w11,#25 1007 add v20.4s,v20.4s,v21.4s 1008 ror w12,w12,#25 1009 eor v24.16b,v3.16b,v0.16b 1010 add w5,w5,w10 1011 eor v25.16b,v7.16b,v4.16b 1012 add w6,w6,w11 1013 eor v26.16b,v11.16b,v8.16b 1014 add w7,w7,w12 1015 eor v27.16b,v15.16b,v12.16b 1016 add w8,w8,w9 1017 eor v28.16b,v19.16b,v16.16b 1018 eor w21,w21,w5 1019 eor v29.16b,v23.16b,v20.16b 1020 eor w17,w17,w6 1021 ushr v3.4s,v24.4s,#24 1022 eor w19,w19,w7 1023 ushr v7.4s,v25.4s,#24 1024 eor w20,w20,w8 1025 ushr v11.4s,v26.4s,#24 1026 ror w21,w21,#16 1027 ushr v15.4s,v27.4s,#24 1028 ror w17,w17,#16 1029 ushr v19.4s,v28.4s,#24 1030 ror w19,w19,#16 1031 ushr v23.4s,v29.4s,#24 1032 ror w20,w20,#16 1033 sli v3.4s,v24.4s,#8 1034 add w15,w15,w21 1035 sli v7.4s,v25.4s,#8 1036 add w16,w16,w17 1037 sli v11.4s,v26.4s,#8 1038 add w13,w13,w19 1039 sli v15.4s,v27.4s,#8 1040 add w14,w14,w20 1041 sli v19.4s,v28.4s,#8 1042 eor w10,w10,w15 1043 sli v23.4s,v29.4s,#8 1044 eor w11,w11,w16 1045 add v2.4s,v2.4s,v3.4s 1046 eor w12,w12,w13 1047 add v6.4s,v6.4s,v7.4s 1048 eor w9,w9,w14 1049 add v10.4s,v10.4s,v11.4s 1050 ror w10,w10,#20 1051 add v14.4s,v14.4s,v15.4s 1052 ror w11,w11,#20 1053 add v18.4s,v18.4s,v19.4s 1054 ror w12,w12,#20 1055 add v22.4s,v22.4s,v23.4s 1056 ror w9,w9,#20 1057 eor v24.16b,v1.16b,v2.16b 1058 add w5,w5,w10 1059 eor v25.16b,v5.16b,v6.16b 1060 add w6,w6,w11 1061 eor v26.16b,v9.16b,v10.16b 1062 add w7,w7,w12 1063 eor v27.16b,v13.16b,v14.16b 1064 add w8,w8,w9 1065 eor v28.16b,v17.16b,v18.16b 1066 eor w21,w21,w5 1067 eor v29.16b,v21.16b,v22.16b 1068 eor w17,w17,w6 1069 ushr v1.4s,v24.4s,#25 1070 eor w19,w19,w7 1071 
ushr v5.4s,v25.4s,#25 1072 eor w20,w20,w8 1073 ushr v9.4s,v26.4s,#25 1074 ror w21,w21,#24 1075 ushr v13.4s,v27.4s,#25 1076 ror w17,w17,#24 1077 ushr v17.4s,v28.4s,#25 1078 ror w19,w19,#24 1079 ushr v21.4s,v29.4s,#25 1080 ror w20,w20,#24 1081 sli v1.4s,v24.4s,#7 1082 add w15,w15,w21 1083 sli v5.4s,v25.4s,#7 1084 add w16,w16,w17 1085 sli v9.4s,v26.4s,#7 1086 add w13,w13,w19 1087 sli v13.4s,v27.4s,#7 1088 add w14,w14,w20 1089 sli v17.4s,v28.4s,#7 1090 eor w10,w10,w15 1091 sli v21.4s,v29.4s,#7 1092 eor w11,w11,w16 1093 ext v2.16b,v2.16b,v2.16b,#8 1094 eor w12,w12,w13 1095 ext v6.16b,v6.16b,v6.16b,#8 1096 eor w9,w9,w14 1097 ext v10.16b,v10.16b,v10.16b,#8 1098 ror w10,w10,#25 1099 ext v14.16b,v14.16b,v14.16b,#8 1100 ror w11,w11,#25 1101 ext v18.16b,v18.16b,v18.16b,#8 1102 ror w12,w12,#25 1103 ext v22.16b,v22.16b,v22.16b,#8 1104 ror w9,w9,#25 1105 ext v3.16b,v3.16b,v3.16b,#12 1106 ext v7.16b,v7.16b,v7.16b,#12 1107 ext v11.16b,v11.16b,v11.16b,#12 1108 ext v15.16b,v15.16b,v15.16b,#12 1109 ext v19.16b,v19.16b,v19.16b,#12 1110 ext v23.16b,v23.16b,v23.16b,#12 1111 ext v1.16b,v1.16b,v1.16b,#4 1112 ext v5.16b,v5.16b,v5.16b,#4 1113 ext v9.16b,v9.16b,v9.16b,#4 1114 ext v13.16b,v13.16b,v13.16b,#4 1115 ext v17.16b,v17.16b,v17.16b,#4 1116 ext v21.16b,v21.16b,v21.16b,#4 1117 add v0.4s,v0.4s,v1.4s 1118 add w5,w5,w9 1119 add v4.4s,v4.4s,v5.4s 1120 add w6,w6,w10 1121 add v8.4s,v8.4s,v9.4s 1122 add w7,w7,w11 1123 add v12.4s,v12.4s,v13.4s 1124 add w8,w8,w12 1125 add v16.4s,v16.4s,v17.4s 1126 eor w17,w17,w5 1127 add v20.4s,v20.4s,v21.4s 1128 eor w19,w19,w6 1129 eor v3.16b,v3.16b,v0.16b 1130 eor w20,w20,w7 1131 eor v7.16b,v7.16b,v4.16b 1132 eor w21,w21,w8 1133 eor v11.16b,v11.16b,v8.16b 1134 ror w17,w17,#16 1135 eor v15.16b,v15.16b,v12.16b 1136 ror w19,w19,#16 1137 eor v19.16b,v19.16b,v16.16b 1138 ror w20,w20,#16 1139 eor v23.16b,v23.16b,v20.16b 1140 ror w21,w21,#16 1141 rev32 v3.8h,v3.8h 1142 add w13,w13,w17 1143 rev32 v7.8h,v7.8h 1144 add w14,w14,w19 1145 rev32 v11.8h,v11.8h 1146 add 
w15,w15,w20 1147 rev32 v15.8h,v15.8h 1148 add w16,w16,w21 1149 rev32 v19.8h,v19.8h 1150 eor w9,w9,w13 1151 rev32 v23.8h,v23.8h 1152 eor w10,w10,w14 1153 add v2.4s,v2.4s,v3.4s 1154 eor w11,w11,w15 1155 add v6.4s,v6.4s,v7.4s 1156 eor w12,w12,w16 1157 add v10.4s,v10.4s,v11.4s 1158 ror w9,w9,#20 1159 add v14.4s,v14.4s,v15.4s 1160 ror w10,w10,#20 1161 add v18.4s,v18.4s,v19.4s 1162 ror w11,w11,#20 1163 add v22.4s,v22.4s,v23.4s 1164 ror w12,w12,#20 1165 eor v24.16b,v1.16b,v2.16b 1166 add w5,w5,w9 1167 eor v25.16b,v5.16b,v6.16b 1168 add w6,w6,w10 1169 eor v26.16b,v9.16b,v10.16b 1170 add w7,w7,w11 1171 eor v27.16b,v13.16b,v14.16b 1172 add w8,w8,w12 1173 eor v28.16b,v17.16b,v18.16b 1174 eor w17,w17,w5 1175 eor v29.16b,v21.16b,v22.16b 1176 eor w19,w19,w6 1177 ushr v1.4s,v24.4s,#20 1178 eor w20,w20,w7 1179 ushr v5.4s,v25.4s,#20 1180 eor w21,w21,w8 1181 ushr v9.4s,v26.4s,#20 1182 ror w17,w17,#24 1183 ushr v13.4s,v27.4s,#20 1184 ror w19,w19,#24 1185 ushr v17.4s,v28.4s,#20 1186 ror w20,w20,#24 1187 ushr v21.4s,v29.4s,#20 1188 ror w21,w21,#24 1189 sli v1.4s,v24.4s,#12 1190 add w13,w13,w17 1191 sli v5.4s,v25.4s,#12 1192 add w14,w14,w19 1193 sli v9.4s,v26.4s,#12 1194 add w15,w15,w20 1195 sli v13.4s,v27.4s,#12 1196 add w16,w16,w21 1197 sli v17.4s,v28.4s,#12 1198 eor w9,w9,w13 1199 sli v21.4s,v29.4s,#12 1200 eor w10,w10,w14 1201 add v0.4s,v0.4s,v1.4s 1202 eor w11,w11,w15 1203 add v4.4s,v4.4s,v5.4s 1204 eor w12,w12,w16 1205 add v8.4s,v8.4s,v9.4s 1206 ror w9,w9,#25 1207 add v12.4s,v12.4s,v13.4s 1208 ror w10,w10,#25 1209 add v16.4s,v16.4s,v17.4s 1210 ror w11,w11,#25 1211 add v20.4s,v20.4s,v21.4s 1212 ror w12,w12,#25 1213 eor v24.16b,v3.16b,v0.16b 1214 add w5,w5,w10 1215 eor v25.16b,v7.16b,v4.16b 1216 add w6,w6,w11 1217 eor v26.16b,v11.16b,v8.16b 1218 add w7,w7,w12 1219 eor v27.16b,v15.16b,v12.16b 1220 add w8,w8,w9 1221 eor v28.16b,v19.16b,v16.16b 1222 eor w21,w21,w5 1223 eor v29.16b,v23.16b,v20.16b 1224 eor w17,w17,w6 1225 ushr v3.4s,v24.4s,#24 1226 eor w19,w19,w7 1227 ushr 
v7.4s,v25.4s,#24 1228 eor w20,w20,w8 1229 ushr v11.4s,v26.4s,#24 1230 ror w21,w21,#16 1231 ushr v15.4s,v27.4s,#24 1232 ror w17,w17,#16 1233 ushr v19.4s,v28.4s,#24 1234 ror w19,w19,#16 1235 ushr v23.4s,v29.4s,#24 1236 ror w20,w20,#16 1237 sli v3.4s,v24.4s,#8 1238 add w15,w15,w21 1239 sli v7.4s,v25.4s,#8 1240 add w16,w16,w17 1241 sli v11.4s,v26.4s,#8 1242 add w13,w13,w19 1243 sli v15.4s,v27.4s,#8 1244 add w14,w14,w20 1245 sli v19.4s,v28.4s,#8 1246 eor w10,w10,w15 1247 sli v23.4s,v29.4s,#8 1248 eor w11,w11,w16 1249 add v2.4s,v2.4s,v3.4s 1250 eor w12,w12,w13 1251 add v6.4s,v6.4s,v7.4s 1252 eor w9,w9,w14 1253 add v10.4s,v10.4s,v11.4s 1254 ror w10,w10,#20 1255 add v14.4s,v14.4s,v15.4s 1256 ror w11,w11,#20 1257 add v18.4s,v18.4s,v19.4s 1258 ror w12,w12,#20 1259 add v22.4s,v22.4s,v23.4s 1260 ror w9,w9,#20 1261 eor v24.16b,v1.16b,v2.16b 1262 add w5,w5,w10 1263 eor v25.16b,v5.16b,v6.16b 1264 add w6,w6,w11 1265 eor v26.16b,v9.16b,v10.16b 1266 add w7,w7,w12 1267 eor v27.16b,v13.16b,v14.16b 1268 add w8,w8,w9 1269 eor v28.16b,v17.16b,v18.16b 1270 eor w21,w21,w5 1271 eor v29.16b,v21.16b,v22.16b 1272 eor w17,w17,w6 1273 ushr v1.4s,v24.4s,#25 1274 eor w19,w19,w7 1275 ushr v5.4s,v25.4s,#25 1276 eor w20,w20,w8 1277 ushr v9.4s,v26.4s,#25 1278 ror w21,w21,#24 1279 ushr v13.4s,v27.4s,#25 1280 ror w17,w17,#24 1281 ushr v17.4s,v28.4s,#25 1282 ror w19,w19,#24 1283 ushr v21.4s,v29.4s,#25 1284 ror w20,w20,#24 1285 sli v1.4s,v24.4s,#7 1286 add w15,w15,w21 1287 sli v5.4s,v25.4s,#7 1288 add w16,w16,w17 1289 sli v9.4s,v26.4s,#7 1290 add w13,w13,w19 1291 sli v13.4s,v27.4s,#7 1292 add w14,w14,w20 1293 sli v17.4s,v28.4s,#7 1294 eor w10,w10,w15 1295 sli v21.4s,v29.4s,#7 1296 eor w11,w11,w16 1297 ext v2.16b,v2.16b,v2.16b,#8 1298 eor w12,w12,w13 1299 ext v6.16b,v6.16b,v6.16b,#8 1300 eor w9,w9,w14 1301 ext v10.16b,v10.16b,v10.16b,#8 1302 ror w10,w10,#25 1303 ext v14.16b,v14.16b,v14.16b,#8 1304 ror w11,w11,#25 1305 ext v18.16b,v18.16b,v18.16b,#8 1306 ror w12,w12,#25 1307 ext v22.16b,v22.16b,v22.16b,#8 
1308 ror w9,w9,#25 1309 ext v3.16b,v3.16b,v3.16b,#4 1310 ext v7.16b,v7.16b,v7.16b,#4 1311 ext v11.16b,v11.16b,v11.16b,#4 1312 ext v15.16b,v15.16b,v15.16b,#4 1313 ext v19.16b,v19.16b,v19.16b,#4 1314 ext v23.16b,v23.16b,v23.16b,#4 1315 ext v1.16b,v1.16b,v1.16b,#12 1316 ext v5.16b,v5.16b,v5.16b,#12 1317 ext v9.16b,v9.16b,v9.16b,#12 1318 ext v13.16b,v13.16b,v13.16b,#12 1319 ext v17.16b,v17.16b,v17.16b,#12 1320 ext v21.16b,v21.16b,v21.16b,#12 1321 cbnz x4,.Loop_upper_neon 1322 1323 add w5,w5,w22 // accumulate key block 1324 add x6,x6,x22,lsr#32 1325 add w7,w7,w23 1326 add x8,x8,x23,lsr#32 1327 add w9,w9,w24 1328 add x10,x10,x24,lsr#32 1329 add w11,w11,w25 1330 add x12,x12,x25,lsr#32 1331 add w13,w13,w26 1332 add x14,x14,x26,lsr#32 1333 add w15,w15,w27 1334 add x16,x16,x27,lsr#32 1335 add w17,w17,w28 1336 add x19,x19,x28,lsr#32 1337 add w20,w20,w30 1338 add x21,x21,x30,lsr#32 1339 1340 add x5,x5,x6,lsl#32 // pack 1341 add x7,x7,x8,lsl#32 1342 ldp x6,x8,[x1,#0] // load input 1343 add x9,x9,x10,lsl#32 1344 add x11,x11,x12,lsl#32 1345 ldp x10,x12,[x1,#16] 1346 add x13,x13,x14,lsl#32 1347 add x15,x15,x16,lsl#32 1348 ldp x14,x16,[x1,#32] 1349 add x17,x17,x19,lsl#32 1350 add x20,x20,x21,lsl#32 1351 ldp x19,x21,[x1,#48] 1352 add x1,x1,#64 1353#ifdef __AARCH64EB__ 1354 rev x5,x5 1355 rev x7,x7 1356 rev x9,x9 1357 rev x11,x11 1358 rev x13,x13 1359 rev x15,x15 1360 rev x17,x17 1361 rev x20,x20 1362#endif 1363 eor x5,x5,x6 1364 eor x7,x7,x8 1365 eor x9,x9,x10 1366 eor x11,x11,x12 1367 eor x13,x13,x14 1368 eor x15,x15,x16 1369 eor x17,x17,x19 1370 eor x20,x20,x21 1371 1372 stp x5,x7,[x0,#0] // store output 1373 add x28,x28,#1 // increment counter 1374 mov w5,w22 // unpack key block 1375 lsr x6,x22,#32 1376 stp x9,x11,[x0,#16] 1377 mov w7,w23 1378 lsr x8,x23,#32 1379 stp x13,x15,[x0,#32] 1380 mov w9,w24 1381 lsr x10,x24,#32 1382 stp x17,x20,[x0,#48] 1383 add x0,x0,#64 1384 mov w11,w25 1385 lsr x12,x25,#32 1386 mov w13,w26 1387 lsr x14,x26,#32 1388 mov w15,w27 1389 lsr x16,x27,#32 
1390 mov w17,w28 1391 lsr x19,x28,#32 1392 mov w20,w30 1393 lsr x21,x30,#32 1394 1395 mov x4,#5 1396.Loop_lower_neon: 1397 sub x4,x4,#1 1398 add v0.4s,v0.4s,v1.4s 1399 add w5,w5,w9 1400 add v4.4s,v4.4s,v5.4s 1401 add w6,w6,w10 1402 add v8.4s,v8.4s,v9.4s 1403 add w7,w7,w11 1404 add v12.4s,v12.4s,v13.4s 1405 add w8,w8,w12 1406 add v16.4s,v16.4s,v17.4s 1407 eor w17,w17,w5 1408 add v20.4s,v20.4s,v21.4s 1409 eor w19,w19,w6 1410 eor v3.16b,v3.16b,v0.16b 1411 eor w20,w20,w7 1412 eor v7.16b,v7.16b,v4.16b 1413 eor w21,w21,w8 1414 eor v11.16b,v11.16b,v8.16b 1415 ror w17,w17,#16 1416 eor v15.16b,v15.16b,v12.16b 1417 ror w19,w19,#16 1418 eor v19.16b,v19.16b,v16.16b 1419 ror w20,w20,#16 1420 eor v23.16b,v23.16b,v20.16b 1421 ror w21,w21,#16 1422 rev32 v3.8h,v3.8h 1423 add w13,w13,w17 1424 rev32 v7.8h,v7.8h 1425 add w14,w14,w19 1426 rev32 v11.8h,v11.8h 1427 add w15,w15,w20 1428 rev32 v15.8h,v15.8h 1429 add w16,w16,w21 1430 rev32 v19.8h,v19.8h 1431 eor w9,w9,w13 1432 rev32 v23.8h,v23.8h 1433 eor w10,w10,w14 1434 add v2.4s,v2.4s,v3.4s 1435 eor w11,w11,w15 1436 add v6.4s,v6.4s,v7.4s 1437 eor w12,w12,w16 1438 add v10.4s,v10.4s,v11.4s 1439 ror w9,w9,#20 1440 add v14.4s,v14.4s,v15.4s 1441 ror w10,w10,#20 1442 add v18.4s,v18.4s,v19.4s 1443 ror w11,w11,#20 1444 add v22.4s,v22.4s,v23.4s 1445 ror w12,w12,#20 1446 eor v24.16b,v1.16b,v2.16b 1447 add w5,w5,w9 1448 eor v25.16b,v5.16b,v6.16b 1449 add w6,w6,w10 1450 eor v26.16b,v9.16b,v10.16b 1451 add w7,w7,w11 1452 eor v27.16b,v13.16b,v14.16b 1453 add w8,w8,w12 1454 eor v28.16b,v17.16b,v18.16b 1455 eor w17,w17,w5 1456 eor v29.16b,v21.16b,v22.16b 1457 eor w19,w19,w6 1458 ushr v1.4s,v24.4s,#20 1459 eor w20,w20,w7 1460 ushr v5.4s,v25.4s,#20 1461 eor w21,w21,w8 1462 ushr v9.4s,v26.4s,#20 1463 ror w17,w17,#24 1464 ushr v13.4s,v27.4s,#20 1465 ror w19,w19,#24 1466 ushr v17.4s,v28.4s,#20 1467 ror w20,w20,#24 1468 ushr v21.4s,v29.4s,#20 1469 ror w21,w21,#24 1470 sli v1.4s,v24.4s,#12 1471 add w13,w13,w17 1472 sli v5.4s,v25.4s,#12 1473 add w14,w14,w19 
1474 sli v9.4s,v26.4s,#12 1475 add w15,w15,w20 1476 sli v13.4s,v27.4s,#12 1477 add w16,w16,w21 1478 sli v17.4s,v28.4s,#12 1479 eor w9,w9,w13 1480 sli v21.4s,v29.4s,#12 1481 eor w10,w10,w14 1482 add v0.4s,v0.4s,v1.4s 1483 eor w11,w11,w15 1484 add v4.4s,v4.4s,v5.4s 1485 eor w12,w12,w16 1486 add v8.4s,v8.4s,v9.4s 1487 ror w9,w9,#25 1488 add v12.4s,v12.4s,v13.4s 1489 ror w10,w10,#25 1490 add v16.4s,v16.4s,v17.4s 1491 ror w11,w11,#25 1492 add v20.4s,v20.4s,v21.4s 1493 ror w12,w12,#25 1494 eor v24.16b,v3.16b,v0.16b 1495 add w5,w5,w10 1496 eor v25.16b,v7.16b,v4.16b 1497 add w6,w6,w11 1498 eor v26.16b,v11.16b,v8.16b 1499 add w7,w7,w12 1500 eor v27.16b,v15.16b,v12.16b 1501 add w8,w8,w9 1502 eor v28.16b,v19.16b,v16.16b 1503 eor w21,w21,w5 1504 eor v29.16b,v23.16b,v20.16b 1505 eor w17,w17,w6 1506 ushr v3.4s,v24.4s,#24 1507 eor w19,w19,w7 1508 ushr v7.4s,v25.4s,#24 1509 eor w20,w20,w8 1510 ushr v11.4s,v26.4s,#24 1511 ror w21,w21,#16 1512 ushr v15.4s,v27.4s,#24 1513 ror w17,w17,#16 1514 ushr v19.4s,v28.4s,#24 1515 ror w19,w19,#16 1516 ushr v23.4s,v29.4s,#24 1517 ror w20,w20,#16 1518 sli v3.4s,v24.4s,#8 1519 add w15,w15,w21 1520 sli v7.4s,v25.4s,#8 1521 add w16,w16,w17 1522 sli v11.4s,v26.4s,#8 1523 add w13,w13,w19 1524 sli v15.4s,v27.4s,#8 1525 add w14,w14,w20 1526 sli v19.4s,v28.4s,#8 1527 eor w10,w10,w15 1528 sli v23.4s,v29.4s,#8 1529 eor w11,w11,w16 1530 add v2.4s,v2.4s,v3.4s 1531 eor w12,w12,w13 1532 add v6.4s,v6.4s,v7.4s 1533 eor w9,w9,w14 1534 add v10.4s,v10.4s,v11.4s 1535 ror w10,w10,#20 1536 add v14.4s,v14.4s,v15.4s 1537 ror w11,w11,#20 1538 add v18.4s,v18.4s,v19.4s 1539 ror w12,w12,#20 1540 add v22.4s,v22.4s,v23.4s 1541 ror w9,w9,#20 1542 eor v24.16b,v1.16b,v2.16b 1543 add w5,w5,w10 1544 eor v25.16b,v5.16b,v6.16b 1545 add w6,w6,w11 1546 eor v26.16b,v9.16b,v10.16b 1547 add w7,w7,w12 1548 eor v27.16b,v13.16b,v14.16b 1549 add w8,w8,w9 1550 eor v28.16b,v17.16b,v18.16b 1551 eor w21,w21,w5 1552 eor v29.16b,v21.16b,v22.16b 1553 eor w17,w17,w6 1554 ushr v1.4s,v24.4s,#25 1555 
eor w19,w19,w7 1556 ushr v5.4s,v25.4s,#25 1557 eor w20,w20,w8 1558 ushr v9.4s,v26.4s,#25 1559 ror w21,w21,#24 1560 ushr v13.4s,v27.4s,#25 1561 ror w17,w17,#24 1562 ushr v17.4s,v28.4s,#25 1563 ror w19,w19,#24 1564 ushr v21.4s,v29.4s,#25 1565 ror w20,w20,#24 1566 sli v1.4s,v24.4s,#7 1567 add w15,w15,w21 1568 sli v5.4s,v25.4s,#7 1569 add w16,w16,w17 1570 sli v9.4s,v26.4s,#7 1571 add w13,w13,w19 1572 sli v13.4s,v27.4s,#7 1573 add w14,w14,w20 1574 sli v17.4s,v28.4s,#7 1575 eor w10,w10,w15 1576 sli v21.4s,v29.4s,#7 1577 eor w11,w11,w16 1578 ext v2.16b,v2.16b,v2.16b,#8 1579 eor w12,w12,w13 1580 ext v6.16b,v6.16b,v6.16b,#8 1581 eor w9,w9,w14 1582 ext v10.16b,v10.16b,v10.16b,#8 1583 ror w10,w10,#25 1584 ext v14.16b,v14.16b,v14.16b,#8 1585 ror w11,w11,#25 1586 ext v18.16b,v18.16b,v18.16b,#8 1587 ror w12,w12,#25 1588 ext v22.16b,v22.16b,v22.16b,#8 1589 ror w9,w9,#25 1590 ext v3.16b,v3.16b,v3.16b,#12 1591 ext v7.16b,v7.16b,v7.16b,#12 1592 ext v11.16b,v11.16b,v11.16b,#12 1593 ext v15.16b,v15.16b,v15.16b,#12 1594 ext v19.16b,v19.16b,v19.16b,#12 1595 ext v23.16b,v23.16b,v23.16b,#12 1596 ext v1.16b,v1.16b,v1.16b,#4 1597 ext v5.16b,v5.16b,v5.16b,#4 1598 ext v9.16b,v9.16b,v9.16b,#4 1599 ext v13.16b,v13.16b,v13.16b,#4 1600 ext v17.16b,v17.16b,v17.16b,#4 1601 ext v21.16b,v21.16b,v21.16b,#4 1602 add v0.4s,v0.4s,v1.4s 1603 add w5,w5,w9 1604 add v4.4s,v4.4s,v5.4s 1605 add w6,w6,w10 1606 add v8.4s,v8.4s,v9.4s 1607 add w7,w7,w11 1608 add v12.4s,v12.4s,v13.4s 1609 add w8,w8,w12 1610 add v16.4s,v16.4s,v17.4s 1611 eor w17,w17,w5 1612 add v20.4s,v20.4s,v21.4s 1613 eor w19,w19,w6 1614 eor v3.16b,v3.16b,v0.16b 1615 eor w20,w20,w7 1616 eor v7.16b,v7.16b,v4.16b 1617 eor w21,w21,w8 1618 eor v11.16b,v11.16b,v8.16b 1619 ror w17,w17,#16 1620 eor v15.16b,v15.16b,v12.16b 1621 ror w19,w19,#16 1622 eor v19.16b,v19.16b,v16.16b 1623 ror w20,w20,#16 1624 eor v23.16b,v23.16b,v20.16b 1625 ror w21,w21,#16 1626 rev32 v3.8h,v3.8h 1627 add w13,w13,w17 1628 rev32 v7.8h,v7.8h 1629 add w14,w14,w19 1630 rev32 
v11.8h,v11.8h 1631 add w15,w15,w20 1632 rev32 v15.8h,v15.8h 1633 add w16,w16,w21 1634 rev32 v19.8h,v19.8h 1635 eor w9,w9,w13 1636 rev32 v23.8h,v23.8h 1637 eor w10,w10,w14 1638 add v2.4s,v2.4s,v3.4s 1639 eor w11,w11,w15 1640 add v6.4s,v6.4s,v7.4s 1641 eor w12,w12,w16 1642 add v10.4s,v10.4s,v11.4s 1643 ror w9,w9,#20 1644 add v14.4s,v14.4s,v15.4s 1645 ror w10,w10,#20 1646 add v18.4s,v18.4s,v19.4s 1647 ror w11,w11,#20 1648 add v22.4s,v22.4s,v23.4s 1649 ror w12,w12,#20 1650 eor v24.16b,v1.16b,v2.16b 1651 add w5,w5,w9 1652 eor v25.16b,v5.16b,v6.16b 1653 add w6,w6,w10 1654 eor v26.16b,v9.16b,v10.16b 1655 add w7,w7,w11 1656 eor v27.16b,v13.16b,v14.16b 1657 add w8,w8,w12 1658 eor v28.16b,v17.16b,v18.16b 1659 eor w17,w17,w5 1660 eor v29.16b,v21.16b,v22.16b 1661 eor w19,w19,w6 1662 ushr v1.4s,v24.4s,#20 1663 eor w20,w20,w7 1664 ushr v5.4s,v25.4s,#20 1665 eor w21,w21,w8 1666 ushr v9.4s,v26.4s,#20 1667 ror w17,w17,#24 1668 ushr v13.4s,v27.4s,#20 1669 ror w19,w19,#24 1670 ushr v17.4s,v28.4s,#20 1671 ror w20,w20,#24 1672 ushr v21.4s,v29.4s,#20 1673 ror w21,w21,#24 1674 sli v1.4s,v24.4s,#12 1675 add w13,w13,w17 1676 sli v5.4s,v25.4s,#12 1677 add w14,w14,w19 1678 sli v9.4s,v26.4s,#12 1679 add w15,w15,w20 1680 sli v13.4s,v27.4s,#12 1681 add w16,w16,w21 1682 sli v17.4s,v28.4s,#12 1683 eor w9,w9,w13 1684 sli v21.4s,v29.4s,#12 1685 eor w10,w10,w14 1686 add v0.4s,v0.4s,v1.4s 1687 eor w11,w11,w15 1688 add v4.4s,v4.4s,v5.4s 1689 eor w12,w12,w16 1690 add v8.4s,v8.4s,v9.4s 1691 ror w9,w9,#25 1692 add v12.4s,v12.4s,v13.4s 1693 ror w10,w10,#25 1694 add v16.4s,v16.4s,v17.4s 1695 ror w11,w11,#25 1696 add v20.4s,v20.4s,v21.4s 1697 ror w12,w12,#25 1698 eor v24.16b,v3.16b,v0.16b 1699 add w5,w5,w10 1700 eor v25.16b,v7.16b,v4.16b 1701 add w6,w6,w11 1702 eor v26.16b,v11.16b,v8.16b 1703 add w7,w7,w12 1704 eor v27.16b,v15.16b,v12.16b 1705 add w8,w8,w9 1706 eor v28.16b,v19.16b,v16.16b 1707 eor w21,w21,w5 1708 eor v29.16b,v23.16b,v20.16b 1709 eor w17,w17,w6 1710 ushr v3.4s,v24.4s,#24 1711 eor w19,w19,w7 
1712 ushr v7.4s,v25.4s,#24 1713 eor w20,w20,w8 1714 ushr v11.4s,v26.4s,#24 1715 ror w21,w21,#16 1716 ushr v15.4s,v27.4s,#24 1717 ror w17,w17,#16 1718 ushr v19.4s,v28.4s,#24 1719 ror w19,w19,#16 1720 ushr v23.4s,v29.4s,#24 1721 ror w20,w20,#16 1722 sli v3.4s,v24.4s,#8 1723 add w15,w15,w21 1724 sli v7.4s,v25.4s,#8 1725 add w16,w16,w17 1726 sli v11.4s,v26.4s,#8 1727 add w13,w13,w19 1728 sli v15.4s,v27.4s,#8 1729 add w14,w14,w20 1730 sli v19.4s,v28.4s,#8 1731 eor w10,w10,w15 1732 sli v23.4s,v29.4s,#8 1733 eor w11,w11,w16 1734 add v2.4s,v2.4s,v3.4s 1735 eor w12,w12,w13 1736 add v6.4s,v6.4s,v7.4s 1737 eor w9,w9,w14 1738 add v10.4s,v10.4s,v11.4s 1739 ror w10,w10,#20 1740 add v14.4s,v14.4s,v15.4s 1741 ror w11,w11,#20 1742 add v18.4s,v18.4s,v19.4s 1743 ror w12,w12,#20 1744 add v22.4s,v22.4s,v23.4s 1745 ror w9,w9,#20 1746 eor v24.16b,v1.16b,v2.16b 1747 add w5,w5,w10 1748 eor v25.16b,v5.16b,v6.16b 1749 add w6,w6,w11 1750 eor v26.16b,v9.16b,v10.16b 1751 add w7,w7,w12 1752 eor v27.16b,v13.16b,v14.16b 1753 add w8,w8,w9 1754 eor v28.16b,v17.16b,v18.16b 1755 eor w21,w21,w5 1756 eor v29.16b,v21.16b,v22.16b 1757 eor w17,w17,w6 1758 ushr v1.4s,v24.4s,#25 1759 eor w19,w19,w7 1760 ushr v5.4s,v25.4s,#25 1761 eor w20,w20,w8 1762 ushr v9.4s,v26.4s,#25 1763 ror w21,w21,#24 1764 ushr v13.4s,v27.4s,#25 1765 ror w17,w17,#24 1766 ushr v17.4s,v28.4s,#25 1767 ror w19,w19,#24 1768 ushr v21.4s,v29.4s,#25 1769 ror w20,w20,#24 1770 sli v1.4s,v24.4s,#7 1771 add w15,w15,w21 1772 sli v5.4s,v25.4s,#7 1773 add w16,w16,w17 1774 sli v9.4s,v26.4s,#7 1775 add w13,w13,w19 1776 sli v13.4s,v27.4s,#7 1777 add w14,w14,w20 1778 sli v17.4s,v28.4s,#7 1779 eor w10,w10,w15 1780 sli v21.4s,v29.4s,#7 1781 eor w11,w11,w16 1782 ext v2.16b,v2.16b,v2.16b,#8 1783 eor w12,w12,w13 1784 ext v6.16b,v6.16b,v6.16b,#8 1785 eor w9,w9,w14 1786 ext v10.16b,v10.16b,v10.16b,#8 1787 ror w10,w10,#25 1788 ext v14.16b,v14.16b,v14.16b,#8 1789 ror w11,w11,#25 1790 ext v18.16b,v18.16b,v18.16b,#8 1791 ror w12,w12,#25 1792 ext 
v22.16b,v22.16b,v22.16b,#8 1793 ror w9,w9,#25 1794 ext v3.16b,v3.16b,v3.16b,#4 1795 ext v7.16b,v7.16b,v7.16b,#4 1796 ext v11.16b,v11.16b,v11.16b,#4 1797 ext v15.16b,v15.16b,v15.16b,#4 1798 ext v19.16b,v19.16b,v19.16b,#4 1799 ext v23.16b,v23.16b,v23.16b,#4 1800 ext v1.16b,v1.16b,v1.16b,#12 1801 ext v5.16b,v5.16b,v5.16b,#12 1802 ext v9.16b,v9.16b,v9.16b,#12 1803 ext v13.16b,v13.16b,v13.16b,#12 1804 ext v17.16b,v17.16b,v17.16b,#12 1805 ext v21.16b,v21.16b,v21.16b,#12 1806 cbnz x4,.Loop_lower_neon 1807 1808 add w5,w5,w22 // accumulate key block 1809 ldp q24,q25,[sp,#0] 1810 add x6,x6,x22,lsr#32 1811 ldp q26,q27,[sp,#32] 1812 add w7,w7,w23 1813 ldp q28,q29,[sp,#64] 1814 add x8,x8,x23,lsr#32 1815 add v0.4s,v0.4s,v24.4s 1816 add w9,w9,w24 1817 add v4.4s,v4.4s,v24.4s 1818 add x10,x10,x24,lsr#32 1819 add v8.4s,v8.4s,v24.4s 1820 add w11,w11,w25 1821 add v12.4s,v12.4s,v24.4s 1822 add x12,x12,x25,lsr#32 1823 add v16.4s,v16.4s,v24.4s 1824 add w13,w13,w26 1825 add v20.4s,v20.4s,v24.4s 1826 add x14,x14,x26,lsr#32 1827 add v2.4s,v2.4s,v26.4s 1828 add w15,w15,w27 1829 add v6.4s,v6.4s,v26.4s 1830 add x16,x16,x27,lsr#32 1831 add v10.4s,v10.4s,v26.4s 1832 add w17,w17,w28 1833 add v14.4s,v14.4s,v26.4s 1834 add x19,x19,x28,lsr#32 1835 add v18.4s,v18.4s,v26.4s 1836 add w20,w20,w30 1837 add v22.4s,v22.4s,v26.4s 1838 add x21,x21,x30,lsr#32 1839 add v19.4s,v19.4s,v31.4s // +4 1840 add x5,x5,x6,lsl#32 // pack 1841 add v23.4s,v23.4s,v31.4s // +4 1842 add x7,x7,x8,lsl#32 1843 add v3.4s,v3.4s,v27.4s 1844 ldp x6,x8,[x1,#0] // load input 1845 add v7.4s,v7.4s,v28.4s 1846 add x9,x9,x10,lsl#32 1847 add v11.4s,v11.4s,v29.4s 1848 add x11,x11,x12,lsl#32 1849 add v15.4s,v15.4s,v30.4s 1850 ldp x10,x12,[x1,#16] 1851 add v19.4s,v19.4s,v27.4s 1852 add x13,x13,x14,lsl#32 1853 add v23.4s,v23.4s,v28.4s 1854 add x15,x15,x16,lsl#32 1855 add v1.4s,v1.4s,v25.4s 1856 ldp x14,x16,[x1,#32] 1857 add v5.4s,v5.4s,v25.4s 1858 add x17,x17,x19,lsl#32 1859 add v9.4s,v9.4s,v25.4s 1860 add x20,x20,x21,lsl#32 1861 add 
v13.4s,v13.4s,v25.4s 1862 ldp x19,x21,[x1,#48] 1863 add v17.4s,v17.4s,v25.4s 1864 add x1,x1,#64 1865 add v21.4s,v21.4s,v25.4s 1866 1867#ifdef __AARCH64EB__ 1868 rev x5,x5 1869 rev x7,x7 1870 rev x9,x9 1871 rev x11,x11 1872 rev x13,x13 1873 rev x15,x15 1874 rev x17,x17 1875 rev x20,x20 1876#endif 1877 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1878 eor x5,x5,x6 1879 eor x7,x7,x8 1880 eor x9,x9,x10 1881 eor x11,x11,x12 1882 eor x13,x13,x14 1883 eor v0.16b,v0.16b,v24.16b 1884 eor x15,x15,x16 1885 eor v1.16b,v1.16b,v25.16b 1886 eor x17,x17,x19 1887 eor v2.16b,v2.16b,v26.16b 1888 eor x20,x20,x21 1889 eor v3.16b,v3.16b,v27.16b 1890 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1891 1892 stp x5,x7,[x0,#0] // store output 1893 add x28,x28,#7 // increment counter 1894 stp x9,x11,[x0,#16] 1895 stp x13,x15,[x0,#32] 1896 stp x17,x20,[x0,#48] 1897 add x0,x0,#64 1898 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1899 1900 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1901 eor v4.16b,v4.16b,v24.16b 1902 eor v5.16b,v5.16b,v25.16b 1903 eor v6.16b,v6.16b,v26.16b 1904 eor v7.16b,v7.16b,v27.16b 1905 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1906 1907 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1908 eor v8.16b,v8.16b,v0.16b 1909 ldp q24,q25,[sp,#0] 1910 eor v9.16b,v9.16b,v1.16b 1911 ldp q26,q27,[sp,#32] 1912 eor v10.16b,v10.16b,v2.16b 1913 eor v11.16b,v11.16b,v3.16b 1914 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1915 1916 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1917 eor v12.16b,v12.16b,v4.16b 1918 eor v13.16b,v13.16b,v5.16b 1919 eor v14.16b,v14.16b,v6.16b 1920 eor v15.16b,v15.16b,v7.16b 1921 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1922 1923 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1924 eor v16.16b,v16.16b,v8.16b 1925 eor v17.16b,v17.16b,v9.16b 1926 eor v18.16b,v18.16b,v10.16b 1927 eor v19.16b,v19.16b,v11.16b 1928 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1929 1930 shl v0.4s,v31.4s,#1 // 4 -> 8 1931 eor v20.16b,v20.16b,v12.16b 1932 eor v21.16b,v21.16b,v13.16b 
1933 eor v22.16b,v22.16b,v14.16b 1934 eor v23.16b,v23.16b,v15.16b 1935 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1936 1937 add v27.4s,v27.4s,v0.4s // += 8 1938 add v28.4s,v28.4s,v0.4s 1939 add v29.4s,v29.4s,v0.4s 1940 add v30.4s,v30.4s,v0.4s 1941 1942 b.hs .Loop_outer_512_neon 1943 1944 adds x2,x2,#512 1945 ushr v0.4s,v31.4s,#2 // 4 -> 1 1946 1947 ldp d8,d9,[sp,#128+0] // meet ABI requirements 1948 ldp d10,d11,[sp,#128+16] 1949 ldp d12,d13,[sp,#128+32] 1950 ldp d14,d15,[sp,#128+48] 1951 1952 stp q24,q31,[sp,#0] // wipe off-load area 1953 stp q24,q31,[sp,#32] 1954 stp q24,q31,[sp,#64] 1955 1956 b.eq .Ldone_512_neon 1957 1958 cmp x2,#192 1959 sub v27.4s,v27.4s,v0.4s // -= 1 1960 sub v28.4s,v28.4s,v0.4s 1961 sub v29.4s,v29.4s,v0.4s 1962 add sp,sp,#128 1963 b.hs .Loop_outer_neon 1964 1965 eor v25.16b,v25.16b,v25.16b 1966 eor v26.16b,v26.16b,v26.16b 1967 eor v27.16b,v27.16b,v27.16b 1968 eor v28.16b,v28.16b,v28.16b 1969 eor v29.16b,v29.16b,v29.16b 1970 eor v30.16b,v30.16b,v30.16b 1971 b .Loop_outer 1972 1973.Ldone_512_neon: 1974 ldp x19,x20,[x29,#16] 1975 add sp,sp,#128+64 1976 ldp x21,x22,[x29,#32] 1977 ldp x23,x24,[x29,#48] 1978 ldp x25,x26,[x29,#64] 1979 ldp x27,x28,[x29,#80] 1980 ldp x29,x30,[sp],#96 1981 AARCH64_VALIDATE_LINK_REGISTER 1982 ret 1983.size ChaCha20_512_neon,.-ChaCha20_512_neon 1984#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) 1985