// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
.text

.section .rodata
.align 64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
// pshufb masks: .Lrot16 rotates each dword left by 16 bits, .Lrot24 by 8 bits
// (i.e. right by 24).
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
// "expand 32-byte k"
.Lsigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.align 64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
// "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.text
// ChaCha20_ctr32_nohw: baseline scalar block function. Under the SysV AMD64
// ABI the arguments are apparently out=%rdi, in=%rsi, in_len=%rdx, key=%rcx
// and counter=%r8 (mapping inferred from the loads below).
.globl ChaCha20_ctr32_nohw
.hidden ChaCha20_ctr32_nohw
.type ChaCha20_ctr32_nohw,@function
.align 64
ChaCha20_ctr32_nohw:
.cfi_startproc
_CET_ENDBR
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	r15,-56
	subq	$64+24,%rsp
.cfi_adjust_cfa_offset	88
.Lctr32_body:

	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm4

	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp
	jmp	.Loop_outer

.align 32
.Loop_outer:
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
// (movq %xmm2,%rsi: state words x8/x9 into %esi/%edi via the shift below)
.byte	102,72,15,126,214
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi
	jmp	.Loop

// Ten double rounds (20 rounds total): a column round, then a diagonal
// round. Two of the four words x8..x11 live in %esi/%edi at any time and
// are swapped through 32-44(%rsp), since only fourteen GPRs are available.
.align 32
.Loop:
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	.Loop
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3
	movq	64+16(%rsp),%rdi

	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1

	cmpq	$64,%rbp
	jb	.Ltail

	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align 16
// Partial final block: stage the 64-byte keystream on the stack and XOR
// bytewise.
.Ltail:
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

.Loop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	.Loop_tail

.Ldone:
	leaq	64+24+48(%rsp),%rsi
	movq	-48(%rsi),%r15
.cfi_restore	r15
	movq	-40(%rsi),%r14
.cfi_restore	r14
	movq	-32(%rsi),%r13
.cfi_restore	r13
	movq	-24(%rsi),%r12
.cfi_restore	r12
	movq	-16(%rsi),%rbp
.cfi_restore	rbp
	movq	-8(%rsi),%rbx
.cfi_restore	rbx
	leaq	(%rsi),%rsp
.cfi_adjust_cfa_offset	-136
.Lno_data:
	ret
.cfi_endproc
.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
.globl ChaCha20_ctr32_ssse3
.hidden ChaCha20_ctr32_ssse3
.type ChaCha20_ctr32_ssse3,@function
.align 32
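// ChaCha20_ctr32_ssse3: one 64-byte block per iteration, state in xmm0-xmm3.
// The 16- and 8-bit rotations are byte shuffles (pshufb with .Lrot16/.Lrot24);
// the 12- and 7-bit rotations use a shift/shift/or sequence. Argument
// registers are assumed to follow the same SysV mapping as the _nohw
// variant above.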
ChaCha20_ctr32_ssse3:
.cfi_startproc
_CET_ENDBR
	movq	%rsp,%r9
.cfi_def_cfa_register	r9
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_ssse3

.align 32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	.Loop_ssse3

.align 32
.Loop_ssse3:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
// (pshufb %xmm6,%xmm3: rotate left 16)
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
// (pshufb %xmm7,%xmm3: rotate left 8)
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
// Re-diagonalize: rotate the three non-constant rows so columns become
// diagonals.
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
// ...and rotate back to column order.
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	.Loop_ssse3
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	.Ltail_ssse3

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align 16
.Ltail_ssse3:
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

.Loop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	rsp
.Lssse3_epilogue:
	ret
.cfi_endproc
.size ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3
.globl ChaCha20_ctr32_ssse3_4x
.hidden ChaCha20_ctr32_ssse3_4x
.type ChaCha20_ctr32_ssse3_4x,@function
.align 32
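// ChaCha20_ctr32_ssse3_4x: four blocks in parallel, one state word per xmm
// register splatted across all four lanes (the pshufd broadcasts below);
// the counter lane is offset 0..3 via .Linc and stepped by .Lfour per
// outer iteration.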
ChaCha20_ctr32_ssse3_4x:
.cfi_startproc
_CET_ENDBR
	movq	%rsp,%r9
.cfi_def_cfa_register	r9
	subq	$0x140+8,%rsp
	movdqa	.Lsigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	.Linc(%rip),%xmm0
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	.Loop_enter4x

.align 32
.Loop_outer4x:
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	.Lfour(%rip),%xmm0

.Loop_enter4x:
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)
	jmp	.Loop4x

// Each .byte pair below is two pshufb instructions applying the current
// rotation mask; xmm6/xmm7 double as shift temporaries, so the .Lrot16 and
// .Lrot24 masks are reloaded from (%r10)/(%r11) as needed.
.align 32
.Loop4x:
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199
.byte	102,15,56,0,207
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198
.byte	102,15,56,0,206
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215
.byte	102,15,56,0,223
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214
.byte	102,15,56,0,222
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223
.byte	102,15,56,0,199
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222
.byte	102,15,56,0,198
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207
.byte	102,15,56,0,215
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206
.byte	102,15,56,0,214
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	.Loop4x

	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

// De-interleave the four blocks: dword and qword unpacks transpose each
// group of four lane-parallel rows back into per-block order.
	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	cmpq	$256,%rdx
	jb	.Ltail4x

// Fast path: at least 256 bytes, so XOR four full blocks; stores of one
// 64-byte block are interleaved with loads of the next.
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4x

	jmp	.Ldone4x

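// Fewer than 256 bytes remain: handle the largest available 64-byte
// multiples, then XOR any final partial block bytewise from keystream
// staged on the stack.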
.Ltail4x:
	cmpq	$192,%rdx
	jae	.L192_or_more4x
	cmpq	$128,%rdx
	jae	.L128_or_more4x
	cmpq	$64,%rdx
	jae	.L64_or_more4x

	xorq	%r10,%r10

	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	.Loop_tail4x

.align 32
.L64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	.Loop_tail4x

.align 32
.L128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	.Ldone4x

	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	.Loop_tail4x

.align 32
.L192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

.Loop_tail4x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4x

.Ldone4x:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	rsp
.L4x_epilogue:
	ret
.cfi_endproc
.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x
.globl ChaCha20_ctr32_avx2
.hidden ChaCha20_ctr32_avx2
.type ChaCha20_ctr32_avx2,@function
.align 32
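// ChaCha20_ctr32_avx2: eight blocks in parallel in ymm registers, two
// 4-block lanes per register (vbroadcasti128 plus vpshufd splats below);
// counter lanes are offset via .Lincy and stepped by .Leight per outer
// iteration.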
ChaCha20_ctr32_avx2:
.cfi_startproc
_CET_ENDBR
	movq	%rsp,%r9
.cfi_def_cfa_register	r9
	subq	$0x280+8,%rsp
	andq	$-32,%rsp
	vzeroupper

	vbroadcasti128	.Lsigma(%rip),%ymm11
	vbroadcasti128	(%rcx),%ymm3
	vbroadcasti128	16(%rcx),%ymm15
	vbroadcasti128	(%r8),%ymm7
	leaq	256(%rsp),%rcx
	leaq	512(%rsp),%rax
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	vpshufd	$0x00,%ymm11,%ymm8
	vpshufd	$0x55,%ymm11,%ymm9
	vmovdqa	%ymm8,128-256(%rcx)
	vpshufd	$0xaa,%ymm11,%ymm10
	vmovdqa	%ymm9,160-256(%rcx)
	vpshufd	$0xff,%ymm11,%ymm11
	vmovdqa	%ymm10,192-256(%rcx)
	vmovdqa	%ymm11,224-256(%rcx)

	vpshufd	$0x00,%ymm3,%ymm0
	vpshufd	$0x55,%ymm3,%ymm1
	vmovdqa	%ymm0,256-256(%rcx)
	vpshufd	$0xaa,%ymm3,%ymm2
	vmovdqa	%ymm1,288-256(%rcx)
	vpshufd	$0xff,%ymm3,%ymm3
	vmovdqa	%ymm2,320-256(%rcx)
	vmovdqa	%ymm3,352-256(%rcx)

	vpshufd	$0x00,%ymm15,%ymm12
	vpshufd	$0x55,%ymm15,%ymm13
	vmovdqa	%ymm12,384-512(%rax)
	vpshufd	$0xaa,%ymm15,%ymm14
	vmovdqa	%ymm13,416-512(%rax)
	vpshufd	$0xff,%ymm15,%ymm15
	vmovdqa	%ymm14,448-512(%rax)
	vmovdqa	%ymm15,480-512(%rax)

	vpshufd	$0x00,%ymm7,%ymm4
	vpshufd	$0x55,%ymm7,%ymm5
	vpaddd	.Lincy(%rip),%ymm4,%ymm4
	vpshufd	$0xaa,%ymm7,%ymm6
	vmovdqa	%ymm5,544-512(%rax)
	vpshufd	$0xff,%ymm7,%ymm7
	vmovdqa	%ymm6,576-512(%rax)
	vmovdqa	%ymm7,608-512(%rax)

	jmp	.Loop_enter8x

.align 32
.Loop_outer8x:
	vmovdqa	128-256(%rcx),%ymm8
	vmovdqa	160-256(%rcx),%ymm9
	vmovdqa	192-256(%rcx),%ymm10
	vmovdqa	224-256(%rcx),%ymm11
	vmovdqa	256-256(%rcx),%ymm0
	vmovdqa	288-256(%rcx),%ymm1
	vmovdqa	320-256(%rcx),%ymm2
	vmovdqa	352-256(%rcx),%ymm3
	vmovdqa	384-512(%rax),%ymm12
	vmovdqa	416-512(%rax),%ymm13
	vmovdqa	448-512(%rax),%ymm14
	vmovdqa	480-512(%rax),%ymm15
	vmovdqa	512-512(%rax),%ymm4
	vmovdqa	544-512(%rax),%ymm5
	vmovdqa	576-512(%rax),%ymm6
	vmovdqa	608-512(%rax),%ymm7
	vpaddd	.Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
	vmovdqa	%ymm14,64(%rsp)
	vmovdqa	%ymm15,96(%rsp)
	vbroadcasti128	(%r10),%ymm15
	vmovdqa	%ymm4,512-512(%rax)
	movl	$10,%eax
	jmp	.Loop8x

// Same double-round schedule as the 4x path, on eight blocks; ymm14/ymm15
// double as temporaries, so the rotation masks are re-broadcast from
// (%r10)/(%r11) as needed.
.align 32
.Loop8x:
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$12,%ymm0,%ymm14
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$12,%ymm1,%ymm15
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$7,%ymm0,%ymm15
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$7,%ymm1,%ymm14
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vmovdqa	%ymm12,0(%rsp)
	vmovdqa	%ymm13,32(%rsp)
	vmovdqa	64(%rsp),%ymm12
	vmovdqa	96(%rsp),%ymm13
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$12,%ymm2,%ymm14
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$12,%ymm3,%ymm15
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$7,%ymm2,%ymm15
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$7,%ymm3,%ymm14
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$12,%ymm1,%ymm14
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$12,%ymm2,%ymm15
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$7,%ymm1,%ymm15
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$7,%ymm2,%ymm14
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vmovdqa	%ymm12,64(%rsp)
	vmovdqa	%ymm13,96(%rsp)
	vmovdqa	0(%rsp),%ymm12
	vmovdqa	32(%rsp),%ymm13
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$12,%ymm3,%ymm14
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$12,%ymm0,%ymm15
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$7,%ymm3,%ymm15
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$7,%ymm0,%ymm14
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	decl	%eax
	jnz	.Loop8x

	leaq	512(%rsp),%rax
	vpaddd	128-256(%rcx),%ymm8,%ymm8
	vpaddd	160-256(%rcx),%ymm9,%ymm9
	vpaddd	192-256(%rcx),%ymm10,%ymm10
	vpaddd	224-256(%rcx),%ymm11,%ymm11

// Transpose: dword/qword unpacks plus vperm2i128 gather each block's
// keystream bytes contiguously.
	vpunpckldq	%ymm9,%ymm8,%ymm14
	vpunpckldq	%ymm11,%ymm10,%ymm15
	vpunpckhdq	%ymm9,%ymm8,%ymm8
	vpunpckhdq	%ymm11,%ymm10,%ymm10
	vpunpcklqdq	%ymm15,%ymm14,%ymm9
	vpunpckhqdq	%ymm15,%ymm14,%ymm14
	vpunpcklqdq	%ymm10,%ymm8,%ymm11
	vpunpckhqdq	%ymm10,%ymm8,%ymm8
	vpaddd	256-256(%rcx),%ymm0,%ymm0
	vpaddd	288-256(%rcx),%ymm1,%ymm1
	vpaddd	320-256(%rcx),%ymm2,%ymm2
	vpaddd	352-256(%rcx),%ymm3,%ymm3

	vpunpckldq	%ymm1,%ymm0,%ymm10
	vpunpckldq	%ymm3,%ymm2,%ymm15
	vpunpckhdq	%ymm1,%ymm0,%ymm0
	vpunpckhdq	%ymm3,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm10,%ymm1
	vpunpckhqdq	%ymm15,%ymm10,%ymm10
	vpunpcklqdq	%ymm2,%ymm0,%ymm3
	vpunpckhqdq	%ymm2,%ymm0,%ymm0
	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
	vmovdqa	%ymm15,0(%rsp)
	vmovdqa	%ymm9,32(%rsp)
	vmovdqa	64(%rsp),%ymm15
	vmovdqa	96(%rsp),%ymm9

	vpaddd	384-512(%rax),%ymm12,%ymm12
	vpaddd	416-512(%rax),%ymm13,%ymm13
	vpaddd	448-512(%rax),%ymm15,%ymm15
	vpaddd	480-512(%rax),%ymm9,%ymm9

	vpunpckldq	%ymm13,%ymm12,%ymm2
	vpunpckldq	%ymm9,%ymm15,%ymm8
	vpunpckhdq	%ymm13,%ymm12,%ymm12
	vpunpckhdq	%ymm9,%ymm15,%ymm15
	vpunpcklqdq	%ymm8,%ymm2,%ymm13
	vpunpckhqdq	%ymm8,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm12,%ymm9
	vpunpckhqdq	%ymm15,%ymm12,%ymm12
	vpaddd	512-512(%rax),%ymm4,%ymm4
	vpaddd	544-512(%rax),%ymm5,%ymm5
	vpaddd	576-512(%rax),%ymm6,%ymm6
	vpaddd	608-512(%rax),%ymm7,%ymm7

	vpunpckldq	%ymm5,%ymm4,%ymm15
	vpunpckldq	%ymm7,%ymm6,%ymm8
	vpunpckhdq	%ymm5,%ymm4,%ymm4
	vpunpckhdq	%ymm7,%ymm6,%ymm6
	vpunpcklqdq	%ymm8,%ymm15,%ymm5
	vpunpckhqdq	%ymm8,%ymm15,%ymm15
	vpunpcklqdq	%ymm6,%ymm4,%ymm7
	vpunpckhqdq	%ymm6,%ymm4,%ymm4
	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
	vmovdqa	0(%rsp),%ymm6
	vmovdqa	32(%rsp),%ymm12

	cmpq	$512,%rdx
	jb	.Ltail8x

// Fast path: at least 512 bytes, so XOR eight full blocks in 128-byte
// chunks.
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm12,%ymm12
	vpxor	32(%rsi),%ymm13,%ymm13
	vpxor	64(%rsi),%ymm10,%ymm10
	vpxor	96(%rsi),%ymm15,%ymm15
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm12,0(%rdi)
	vmovdqu	%ymm13,32(%rdi)
	vmovdqu	%ymm10,64(%rdi)
	vmovdqu	%ymm15,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm14,%ymm14
	vpxor	32(%rsi),%ymm2,%ymm2
	vpxor	64(%rsi),%ymm3,%ymm3
	vpxor	96(%rsi),%ymm7,%ymm7
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm14,0(%rdi)
	vmovdqu	%ymm2,32(%rdi)
	vmovdqu	%ymm3,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm11,%ymm11
	vpxor	32(%rsi),%ymm9,%ymm9
	vpxor	64(%rsi),%ymm0,%ymm0
	vpxor	96(%rsi),%ymm4,%ymm4
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm11,0(%rdi)
	vmovdqu	%ymm9,32(%rdi)
	vmovdqu	%ymm0,64(%rdi)
	vmovdqu	%ymm4,96(%rdi)
	leaq	128(%rdi),%rdi

	subq	$512,%rdx
	jnz	.Loop_outer8x

	jmp	.Ldone8x

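// Fewer than 512 bytes remain: dispatch on the largest 64-byte multiple
// still available, then XOR any final partial block bytewise from keystream
// staged on the stack.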
.Ltail8x:
	cmpq	$448,%rdx
	jae	.L448_or_more8x
	cmpq	$384,%rdx
	jae	.L384_or_more8x
	cmpq	$320,%rdx
	jae	.L320_or_more8x
	cmpq	$256,%rdx
	jae	.L256_or_more8x
	cmpq	$192,%rdx
	jae	.L192_or_more8x
	cmpq	$128,%rdx
	jae	.L128_or_more8x
	cmpq	$64,%rdx
	jae	.L64_or_more8x

	xorq	%r10,%r10
	vmovdqa	%ymm6,0(%rsp)
	vmovdqa	%ymm8,32(%rsp)
	jmp	.Loop_tail8x

.align 32
.L64_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	je	.Ldone8x

	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm1,0(%rsp)
	leaq	64(%rdi),%rdi
	subq	$64,%rdx
	vmovdqa	%ymm5,32(%rsp)
	jmp	.Loop_tail8x

.align 32
.L128_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	je	.Ldone8x

	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm12,0(%rsp)
	leaq	128(%rdi),%rdi
	subq	$128,%rdx
	vmovdqa	%ymm13,32(%rsp)
	jmp	.Loop_tail8x

.align 32
.L192_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	je	.Ldone8x

	leaq	192(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm10,0(%rsp)
	leaq	192(%rdi),%rdi
	subq	$192,%rdx
	vmovdqa	%ymm15,32(%rsp)
	jmp	.Loop_tail8x

.align 32
.L256_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	je	.Ldone8x

	leaq	256(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm14,0(%rsp)
	leaq	256(%rdi),%rdi
	subq	$256,%rdx
	vmovdqa	%ymm2,32(%rsp)
	jmp	.Loop_tail8x

.align 32
.L320_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	je	.Ldone8x

	leaq	320(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm3,0(%rsp)
	leaq	320(%rdi),%rdi
	subq	$320,%rdx
	vmovdqa	%ymm7,32(%rsp)
	jmp	.Loop_tail8x

.align 32
.L384_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	je	.Ldone8x

	leaq	384(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm11,0(%rsp)
	leaq	384(%rdi),%rdi
	subq	$384,%rdx
	vmovdqa	%ymm9,32(%rsp)
	jmp	.Loop_tail8x

.align 32
.L448_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vpxor	384(%rsi),%ymm11,%ymm11
	vpxor	416(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	vmovdqu	%ymm11,384(%rdi)
	vmovdqu	%ymm9,416(%rdi)
	je	.Ldone8x

	leaq	448(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm0,0(%rsp)
	leaq	448(%rdi),%rdi
	subq	$448,%rdx
	vmovdqa	%ymm4,32(%rsp)

.Loop_tail8x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail8x

.Ldone8x:
	vzeroall
	leaq	(%r9),%rsp
.cfi_def_cfa_register	rsp
.L8x_epilogue:
	ret
.cfi_endproc
.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2
#endif