// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
.text

// ---------------------------------------------------------------------------
// Read-only constants shared by all ChaCha20 variants in this file.
// The ChaCha20 state is a 4x4 matrix of 32-bit words:
//   row 0: sigma constants, rows 1-2: 256-bit key, row 3: counter + nonce.
// ---------------------------------------------------------------------------
.section .rodata
.align 64
.Lzero:
.long 0,0,0,0
.Lone:                                  // counter increment for 1-block code paths
.long 1,0,0,0
.Linc:                                  // per-lane counter offsets 0..3 (4x path)
.long 0,1,2,3
.Lfour:                                 // counter step for the 4x path
.long 4,4,4,4
.Lincy:                                 // per-lane counter offsets, 8x/AVX2 lane order
.long 0,2,4,6,1,3,5,7
.Leight:                                // counter step for the 8x path
.long 8,8,8,8,8,8,8,8
.Lrot16:                                // pshufb mask: rotate each 32-bit lane left 16
.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:                                // pshufb mask: rotate each 32-bit lane left 8 (right 24)
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:                                // "expand 32-byte k", NUL-terminated
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.align 64
// NOTE(review): .Lzeroz/.Lfourz/.Lincz/.Lsixteen are not referenced by the
// routines visible here — presumably used by a wider-vector path elsewhere
// in the generated file; kept verbatim.
.Lzeroz:
.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
// ASCII banner: "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.text

// ---------------------------------------------------------------------------
// ChaCha20_ctr32_nohw: scalar (no SIMD beyond a few xmm temporaries) ChaCha20.
// Arguments (SysV AMD64, consistent with the loads below):
//   %rdi = out, %rsi = in, %rdx = in_len, %rcx = key[8] (32 bytes),
//   %r8  = counter block (16 bytes: 32-bit counter + 96-bit nonce).
// Processes one 64-byte block per outer iteration; XORs keystream into out.
// Stack frame: 0..63(%rsp) scratch state copy, 64..87(%rsp) saved rbp/rsi/rdi.
// ---------------------------------------------------------------------------
.globl ChaCha20_ctr32_nohw
.hidden ChaCha20_ctr32_nohw
.type ChaCha20_ctr32_nohw,@function
.align 64
ChaCha20_ctr32_nohw:
.cfi_startproc
_CET_ENDBR
	pushq %rbx                      // save all callee-saved GPRs; state uses them
.cfi_adjust_cfa_offset 8
.cfi_offset rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset r15,-56
	subq $64+24,%rsp                // 64-byte state scratch + 3 spill slots
.cfi_adjust_cfa_offset 88
.Lctr32_body:

	// Load key (rows 1-2) and counter/nonce (row 3) of the initial state.
	movdqu (%rcx),%xmm1
	movdqu 16(%rcx),%xmm2
	movdqu (%r8),%xmm3
	movdqa .Lone(%rip),%xmm4        // xmm4 = counter increment, kept live

	// Park rows 1-3 on the stack; rbp tracks remaining byte count.
	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	movq %rdx,%rbp
	jmp .Loop_outer

.align 32
.Loop_outer:
	// Rebuild the 16-word working state in registers:
	//   eax..edx = sigma row, r8d..r11d = state[4..7],
	//   r12d..r15d = state[12..15] (r12d = counter from xmm3),
	//   esi/edi (set below) = state[8]/state[9]; state[10..11] live at
	//   40..44(%rsp) and are swapped in mid-round.
	movl $0x61707865,%eax
	movl $0x3320646e,%ebx
	movl $0x79622d32,%ecx
	movl $0x6b206574,%edx
	movl 16(%rsp),%r8d
	movl 20(%rsp),%r9d
	movl 24(%rsp),%r10d
	movl 28(%rsp),%r11d
	movd %xmm3,%r12d                // current block counter
	movl 52(%rsp),%r13d
	movl 56(%rsp),%r14d
	movl 60(%rsp),%r15d

	movq %rbp,64+0(%rsp)            // spill length, in pointer, out pointer
	movl $10,%ebp                   // 10 double-rounds = 20 rounds
	movq %rsi,64+8(%rsp)
.byte 102,72,15,126,214                 // movq %xmm2,%rsi (state[8..9] into rsi)
	movq %rdi,64+16(%rsp)
	movq %rsi,%rdi
	shrq $32,%rdi                   // esi = state[8], edi = state[9]
	jmp .Loop

.align 32
.Loop:
	// Column round, quarter-rounds on columns 0 and 1
	// (a+=b; d^=a; d<<<=16; c+=d; b^=c; b<<<=12; a+=b; d^=a; d<<<=8;
	//  c+=d; b^=c; b<<<=7).
	addl %r8d,%eax
	xorl %eax,%r12d
	roll $16,%r12d
	addl %r9d,%ebx
	xorl %ebx,%r13d
	roll $16,%r13d
	addl %r12d,%esi
	xorl %esi,%r8d
	roll $12,%r8d
	addl %r13d,%edi
	xorl %edi,%r9d
	roll $12,%r9d
	addl %r8d,%eax
	xorl %eax,%r12d
	roll $8,%r12d
	addl %r9d,%ebx
	xorl %ebx,%r13d
	roll $8,%r13d
	addl %r12d,%esi
	xorl %esi,%r8d
	roll $7,%r8d
	addl %r13d,%edi
	xorl %edi,%r9d
	roll $7,%r9d
	// Swap esi/edi to state[10]/state[11] for columns 2 and 3.
	movl %esi,32(%rsp)
	movl %edi,36(%rsp)
	movl 40(%rsp),%esi
	movl 44(%rsp),%edi
	addl %r10d,%ecx
	xorl %ecx,%r14d
	roll $16,%r14d
	addl %r11d,%edx
	xorl %edx,%r15d
	roll $16,%r15d
	addl %r14d,%esi
	xorl %esi,%r10d
	roll $12,%r10d
	addl %r15d,%edi
	xorl %edi,%r11d
	roll $12,%r11d
	addl %r10d,%ecx
	xorl %ecx,%r14d
	roll $8,%r14d
	addl %r11d,%edx
	xorl %edx,%r15d
	roll $8,%r15d
	addl %r14d,%esi
	xorl %esi,%r10d
	roll $7,%r10d
	addl %r15d,%edi
	xorl %edi,%r11d
	roll $7,%r11d
	// Diagonal round, first two diagonals.
	addl %r9d,%eax
	xorl %eax,%r15d
	roll $16,%r15d
	addl %r10d,%ebx
	xorl %ebx,%r12d
	roll $16,%r12d
	addl %r15d,%esi
	xorl %esi,%r9d
	roll $12,%r9d
	addl %r12d,%edi
	xorl %edi,%r10d
	roll $12,%r10d
	addl %r9d,%eax
	xorl %eax,%r15d
	roll $8,%r15d
	addl %r10d,%ebx
	xorl %ebx,%r12d
	roll $8,%r12d
	addl %r15d,%esi
	xorl %esi,%r9d
	roll $7,%r9d
	addl %r12d,%edi
	xorl %edi,%r10d
	roll $7,%r10d
	// Swap back to state[8]/state[9] for the remaining two diagonals.
	movl %esi,40(%rsp)
	movl %edi,44(%rsp)
	movl 32(%rsp),%esi
	movl 36(%rsp),%edi
	addl %r11d,%ecx
	xorl %ecx,%r13d
	roll $16,%r13d
	addl %r8d,%edx
	xorl %edx,%r14d
	roll $16,%r14d
	addl %r13d,%esi
	xorl %esi,%r11d
	roll $12,%r11d
	addl %r14d,%edi
	xorl %edi,%r8d
	roll $12,%r8d
	addl %r11d,%ecx
	xorl %ecx,%r13d
	roll $8,%r13d
	addl %r8d,%edx
	xorl %edx,%r14d
	roll $8,%r14d
	addl %r13d,%esi
	xorl %esi,%r11d
	roll $7,%r11d
	addl %r14d,%edi
	xorl %edi,%r8d
	roll $7,%r8d
	decl %ebp
	jnz .Loop
	movl %edi,36(%rsp)
	movl %esi,32(%rsp)
	movq 64(%rsp),%rbp              // restore length / in / out
	movdqa %xmm2,%xmm1
	movq 64+8(%rsp),%rsi
	paddd %xmm4,%xmm3               // bump block counter for next iteration
	movq 64+16(%rsp),%rdi

	// Feed-forward: add the initial state back into the permuted state.
	addl $0x61707865,%eax
	addl $0x3320646e,%ebx
	addl $0x79622d32,%ecx
	addl $0x6b206574,%edx
	addl 16(%rsp),%r8d
	addl 20(%rsp),%r9d
	addl 24(%rsp),%r10d
	addl 28(%rsp),%r11d
	addl 48(%rsp),%r12d
	addl 52(%rsp),%r13d
	addl 56(%rsp),%r14d
	addl 60(%rsp),%r15d
	paddd 32(%rsp),%xmm1            // words 8..11 handled as one vector

	cmpq $64,%rbp
	jb .Ltail                       // partial final block

	// Full block: XOR keystream with input, store to output.
	xorl 0(%rsi),%eax
	xorl 4(%rsi),%ebx
	xorl 8(%rsi),%ecx
	xorl 12(%rsi),%edx
	xorl 16(%rsi),%r8d
	xorl 20(%rsi),%r9d
	xorl 24(%rsi),%r10d
	xorl 28(%rsi),%r11d
	movdqu 32(%rsi),%xmm0
	xorl 48(%rsi),%r12d
	xorl 52(%rsi),%r13d
	xorl 56(%rsi),%r14d
	xorl 60(%rsi),%r15d
	leaq 64(%rsi),%rsi
	pxor %xmm1,%xmm0

	movdqa %xmm2,32(%rsp)
	movd %xmm3,48(%rsp)             // store incremented counter for next round

	movl %eax,0(%rdi)
	movl %ebx,4(%rdi)
	movl %ecx,8(%rdi)
	movl %edx,12(%rdi)
	movl %r8d,16(%rdi)
	movl %r9d,20(%rdi)
	movl %r10d,24(%rdi)
	movl %r11d,28(%rdi)
	movdqu %xmm0,32(%rdi)
	movl %r12d,48(%rdi)
	movl %r13d,52(%rdi)
	movl %r14d,56(%rdi)
	movl %r15d,60(%rdi)
	leaq 64(%rdi),%rdi

	subq $64,%rbp
	jnz .Loop_outer

	jmp .Ldone

.align 16
.Ltail:
	// Partial block: materialize the 64-byte keystream on the stack,
	// then XOR byte-by-byte (rbx = byte index, rbp = bytes remaining).
	movl %eax,0(%rsp)
	movl %ebx,4(%rsp)
	xorq %rbx,%rbx
	movl %ecx,8(%rsp)
	movl %edx,12(%rsp)
	movl %r8d,16(%rsp)
	movl %r9d,20(%rsp)
	movl %r10d,24(%rsp)
	movl %r11d,28(%rsp)
	movdqa %xmm1,32(%rsp)
	movl %r12d,48(%rsp)
	movl %r13d,52(%rsp)
	movl %r14d,56(%rsp)
	movl %r15d,60(%rsp)

.Loop_tail:
	movzbl (%rsi,%rbx,1),%eax
	movzbl (%rsp,%rbx,1),%edx
	leaq 1(%rbx),%rbx
	xorl %edx,%eax
	movb %al,-1(%rdi,%rbx,1)
	decq %rbp
	jnz .Loop_tail

.Ldone:
	// Epilogue: pop the six saved GPRs via rsi and restore rsp (frame is
	// 88 bytes of locals + 48 bytes of pushes = 136).
	leaq 64+24+48(%rsp),%rsi
	movq -48(%rsi),%r15
.cfi_restore r15
	movq -40(%rsi),%r14
.cfi_restore r14
	movq -32(%rsi),%r13
.cfi_restore r13
	movq -24(%rsi),%r12
.cfi_restore r12
	movq -16(%rsi),%rbp
.cfi_restore rbp
	movq -8(%rsi),%rbx
.cfi_restore rbx
	leaq (%rsi),%rsp
.cfi_adjust_cfa_offset -136
.Lno_data:
	ret
.cfi_endproc
.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw

// ---------------------------------------------------------------------------
// ChaCha20_ctr32_ssse3: one 64-byte block per iteration, whole state rows in
// xmm0..xmm3; rotates by 16/8 use pshufb masks (.Lrot16/.Lrot24), rotates by
// 12/7 use shift+or. Same argument registers as the nohw variant.
// ---------------------------------------------------------------------------
.globl ChaCha20_ctr32_ssse3
.hidden ChaCha20_ctr32_ssse3
.type ChaCha20_ctr32_ssse3,@function
.align 32
ChaCha20_ctr32_ssse3:
.cfi_startproc
_CET_ENDBR
	movq %rsp,%r9                   // save original rsp for the epilogue
.cfi_def_cfa_register r9
	subq $64+8,%rsp
	movdqa .Lsigma(%rip),%xmm0      // row 0: constants
	movdqu (%rcx),%xmm1             // rows 1-2: key
	movdqu 16(%rcx),%xmm2
	movdqu (%r8),%xmm3              // row 3: counter + nonce
	movdqa .Lrot16(%rip),%xmm6
	movdqa .Lrot24(%rip),%xmm7

	movdqa %xmm0,0(%rsp)            // keep initial state for the feed-forward
	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	movq $10,%r8                    // 10 double-rounds
	jmp .Loop_ssse3

.align 32
.Loop_outer_ssse3:
	// Next block: reload saved state, increment the counter row.
	movdqa .Lone(%rip),%xmm3
	movdqa 0(%rsp),%xmm0
	movdqa 16(%rsp),%xmm1
	movdqa 32(%rsp),%xmm2
	paddd 48(%rsp),%xmm3
	movq $10,%r8
	movdqa %xmm3,48(%rsp)
	jmp .Loop_ssse3

.align 32
.Loop_ssse3:
	// Column quarter-round on all four columns at once.
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,222                   // pshufb %xmm6,%xmm3 (rotate d left 16)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $20,%xmm1
	pslld $12,%xmm4
	por %xmm4,%xmm1                 // b <<<= 12
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,223                   // pshufb %xmm7,%xmm3 (rotate d left 8)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $25,%xmm1
	pslld $7,%xmm4
	por %xmm4,%xmm1                 // b <<<= 7
	// Shuffle rows into diagonal position.
	pshufd $78,%xmm2,%xmm2
	pshufd $57,%xmm1,%xmm1
	pshufd $147,%xmm3,%xmm3
	nop
	// Diagonal quarter-round.
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,222                   // pshufb %xmm6,%xmm3
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $20,%xmm1
	pslld $12,%xmm4
	por %xmm4,%xmm1
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,223                   // pshufb %xmm7,%xmm3
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $25,%xmm1
	pslld $7,%xmm4
	por %xmm4,%xmm1
	// Undo the diagonalization.
	pshufd $78,%xmm2,%xmm2
	pshufd $147,%xmm1,%xmm1
	pshufd $57,%xmm3,%xmm3
	decq %r8
	jnz .Loop_ssse3
	// Feed-forward with the saved initial state.
	paddd 0(%rsp),%xmm0
	paddd 16(%rsp),%xmm1
	paddd 32(%rsp),%xmm2
	paddd 48(%rsp),%xmm3

	cmpq $64,%rdx
	jb .Ltail_ssse3

	// Full block: XOR 64 bytes of input with the keystream.
	movdqu 0(%rsi),%xmm4
	movdqu 16(%rsi),%xmm5
	pxor %xmm4,%xmm0
	movdqu 32(%rsi),%xmm4
	pxor %xmm5,%xmm1
	movdqu 48(%rsi),%xmm5
	leaq 64(%rsi),%rsi
	pxor %xmm4,%xmm2
	pxor %xmm5,%xmm3

	movdqu %xmm0,0(%rdi)
	movdqu %xmm1,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm3,48(%rdi)
	leaq 64(%rdi),%rdi

	subq $64,%rdx
	jnz .Loop_outer_ssse3

	jmp .Ldone_ssse3

.align 16
.Ltail_ssse3:
	// Partial block: keystream to stack, then byte-wise XOR (r8 = index).
	movdqa %xmm0,0(%rsp)
	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	xorq %r8,%r8

.Loop_tail_ssse3:
	movzbl (%rsi,%r8,1),%eax
	movzbl (%rsp,%r8,1),%ecx
	leaq 1(%r8),%r8
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r8,1)
	decq %rdx
	jnz .Loop_tail_ssse3

.Ldone_ssse3:
	leaq (%r9),%rsp                 // restore caller rsp saved in prologue
.cfi_def_cfa_register rsp
.Lssse3_epilogue:
	ret
.cfi_endproc
.size ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3

// ---------------------------------------------------------------------------
// ChaCha20_ctr32_ssse3_4x: four blocks in parallel, state transposed so each
// xmm register holds the same word of four consecutive blocks.
// ---------------------------------------------------------------------------
.globl ChaCha20_ctr32_ssse3_4x
.hidden ChaCha20_ctr32_ssse3_4x
.type ChaCha20_ctr32_ssse3_4x,@function
.align 32
ChaCha20_ctr32_ssse3_4x:
.cfi_startproc
_CET_ENDBR
	movq %rsp,%r9                   // save original rsp for the epilogue
.cfi_def_cfa_register r9
	movq %r10,%r11
	subq $0x140+8,%rsp
	// (instruction continues across the extraction-chunk boundary)
	movdqa \
.Lsigma(%rip),%xmm11
	// Load key and counter/nonce; broadcast each state word across a
	// register and park all 16 broadcast words in the 0x140-byte frame
	// (rcx = frame + 256 so that the 128-/256-offsets below fit in disp8).
	movdqu (%rcx),%xmm15
	movdqu 16(%rcx),%xmm7
	movdqu (%r8),%xmm3
	leaq 256(%rsp),%rcx
	leaq .Lrot16(%rip),%r10
	leaq .Lrot24(%rip),%r11

	// Broadcast sigma constants (state words 0..3) to 64..112(%rsp).
	pshufd $0x00,%xmm11,%xmm8
	pshufd $0x55,%xmm11,%xmm9
	movdqa %xmm8,64(%rsp)
	pshufd $0xaa,%xmm11,%xmm10
	movdqa %xmm9,80(%rsp)
	pshufd $0xff,%xmm11,%xmm11
	movdqa %xmm10,96(%rsp)
	movdqa %xmm11,112(%rsp)

	// Broadcast key words 4..7.
	pshufd $0x00,%xmm15,%xmm12
	pshufd $0x55,%xmm15,%xmm13
	movdqa %xmm12,128-256(%rcx)
	pshufd $0xaa,%xmm15,%xmm14
	movdqa %xmm13,144-256(%rcx)
	pshufd $0xff,%xmm15,%xmm15
	movdqa %xmm14,160-256(%rcx)
	movdqa %xmm15,176-256(%rcx)

	// Broadcast key words 8..11.
	pshufd $0x00,%xmm7,%xmm4
	pshufd $0x55,%xmm7,%xmm5
	movdqa %xmm4,192-256(%rcx)
	pshufd $0xaa,%xmm7,%xmm6
	movdqa %xmm5,208-256(%rcx)
	pshufd $0xff,%xmm7,%xmm7
	movdqa %xmm6,224-256(%rcx)
	movdqa %xmm7,240-256(%rcx)

	// Broadcast counter/nonce words 12..15; lane counters get offsets 0..3.
	pshufd $0x00,%xmm3,%xmm0
	pshufd $0x55,%xmm3,%xmm1
	paddd .Linc(%rip),%xmm0
	pshufd $0xaa,%xmm3,%xmm2
	movdqa %xmm1,272-256(%rcx)
	pshufd $0xff,%xmm3,%xmm3
	movdqa %xmm2,288-256(%rcx)
	movdqa %xmm3,304-256(%rcx)

	jmp .Loop_enter4x

.align 32
.Loop_outer4x:
	// Reload all 16 broadcast state words; advance counters by 4.
	movdqa 64(%rsp),%xmm8
	movdqa 80(%rsp),%xmm9
	movdqa 96(%rsp),%xmm10
	movdqa 112(%rsp),%xmm11
	movdqa 128-256(%rcx),%xmm12
	movdqa 144-256(%rcx),%xmm13
	movdqa 160-256(%rcx),%xmm14
	movdqa 176-256(%rcx),%xmm15
	movdqa 192-256(%rcx),%xmm4
	movdqa 208-256(%rcx),%xmm5
	movdqa 224-256(%rcx),%xmm6
	movdqa 240-256(%rcx),%xmm7
	movdqa 256-256(%rcx),%xmm0
	movdqa 272-256(%rcx),%xmm1
	movdqa 288-256(%rcx),%xmm2
	movdqa 304-256(%rcx),%xmm3
	paddd .Lfour(%rip),%xmm0

.Loop_enter4x:
	// Only 16 xmm regs for 16 state words + temps: words 10/11 rotate
	// through 32/48(%rsp); xmm6/xmm7 double as temps and pshufb masks.
	movdqa %xmm6,32(%rsp)
	movdqa %xmm7,48(%rsp)
	movdqa (%r10),%xmm7             // rot16 mask
	movl $10,%eax                   // 10 double-rounds
	movdqa %xmm0,256-256(%rcx)      // save current counters for feed-forward
	jmp .Loop4x

.align 32
.Loop4x:
	// Column quarter-rounds, columns 0-1 (a=8/9, b=12/13, c=4/5, d=0/1).
	paddd %xmm12,%xmm8
	paddd %xmm13,%xmm9
	pxor %xmm8,%xmm0
	pxor %xmm9,%xmm1
.byte 102,15,56,0,199                   // pshufb %xmm7,%xmm0 (<<< 16)
.byte 102,15,56,0,207                   // pshufb %xmm7,%xmm1 (<<< 16)
	paddd %xmm0,%xmm4
	paddd %xmm1,%xmm5
	pxor %xmm4,%xmm12
	pxor %xmm5,%xmm13
	movdqa %xmm12,%xmm6
	pslld $12,%xmm12
	psrld $20,%xmm6
	movdqa %xmm13,%xmm7
	pslld $12,%xmm13
	por %xmm6,%xmm12                // b <<<= 12
	psrld $20,%xmm7
	movdqa (%r11),%xmm6             // rot24 mask
	por %xmm7,%xmm13
	paddd %xmm12,%xmm8
	paddd %xmm13,%xmm9
	pxor %xmm8,%xmm0
	pxor %xmm9,%xmm1
.byte 102,15,56,0,198                   // pshufb %xmm6,%xmm0 (<<< 8)
.byte 102,15,56,0,206                   // pshufb %xmm6,%xmm1 (<<< 8)
	paddd %xmm0,%xmm4
	paddd %xmm1,%xmm5
	pxor %xmm4,%xmm12
	pxor %xmm5,%xmm13
	movdqa %xmm12,%xmm7
	pslld $7,%xmm12
	psrld $25,%xmm7
	movdqa %xmm13,%xmm6
	pslld $7,%xmm13
	por %xmm7,%xmm12                // b <<<= 7
	psrld $25,%xmm6
	movdqa (%r10),%xmm7             // rot16 mask back
	por %xmm6,%xmm13
	// Swap word pairs 8/9 <-> 10/11 through the stack, columns 2-3.
	movdqa %xmm4,0(%rsp)
	movdqa %xmm5,16(%rsp)
	movdqa 32(%rsp),%xmm4
	movdqa 48(%rsp),%xmm5
	paddd %xmm14,%xmm10
	paddd %xmm15,%xmm11
	pxor %xmm10,%xmm2
	pxor %xmm11,%xmm3
.byte 102,15,56,0,215                   // pshufb %xmm7,%xmm2
.byte 102,15,56,0,223                   // pshufb %xmm7,%xmm3
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	pxor %xmm4,%xmm14
	pxor %xmm5,%xmm15
	movdqa %xmm14,%xmm6
	pslld $12,%xmm14
	psrld $20,%xmm6
	movdqa %xmm15,%xmm7
	pslld $12,%xmm15
	por %xmm6,%xmm14
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm15
	paddd %xmm14,%xmm10
	paddd %xmm15,%xmm11
	pxor %xmm10,%xmm2
	pxor %xmm11,%xmm3
.byte 102,15,56,0,214                   // pshufb %xmm6,%xmm2
.byte 102,15,56,0,222                   // pshufb %xmm6,%xmm3
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	pxor %xmm4,%xmm14
	pxor %xmm5,%xmm15
	movdqa %xmm14,%xmm7
	pslld $7,%xmm14
	psrld $25,%xmm7
	movdqa %xmm15,%xmm6
	pslld $7,%xmm15
	por %xmm7,%xmm14
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm15
	// Diagonal quarter-rounds, first pair.
	paddd %xmm13,%xmm8
	paddd %xmm14,%xmm9
	pxor %xmm8,%xmm3
	pxor %xmm9,%xmm0
.byte 102,15,56,0,223                   // pshufb %xmm7,%xmm3
.byte 102,15,56,0,199                   // pshufb %xmm7,%xmm0
	paddd %xmm3,%xmm4
	paddd %xmm0,%xmm5
	pxor %xmm4,%xmm13
	pxor %xmm5,%xmm14
	movdqa %xmm13,%xmm6
	pslld $12,%xmm13
	psrld $20,%xmm6
	movdqa %xmm14,%xmm7
	pslld $12,%xmm14
	por %xmm6,%xmm13
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm14
	paddd %xmm13,%xmm8
	paddd %xmm14,%xmm9
	pxor %xmm8,%xmm3
	pxor %xmm9,%xmm0
.byte 102,15,56,0,222                   // pshufb %xmm6,%xmm3
.byte 102,15,56,0,198                   // pshufb %xmm6,%xmm0
	paddd %xmm3,%xmm4
	paddd %xmm0,%xmm5
	pxor %xmm4,%xmm13
	pxor %xmm5,%xmm14
	movdqa %xmm13,%xmm7
	pslld $7,%xmm13
	psrld $25,%xmm7
	movdqa %xmm14,%xmm6
	pslld $7,%xmm14
	por %xmm7,%xmm13
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm14
	// Diagonal quarter-rounds, second pair (swap 10/11 back in).
	movdqa %xmm4,32(%rsp)
	movdqa %xmm5,48(%rsp)
	movdqa 0(%rsp),%xmm4
	movdqa 16(%rsp),%xmm5
	paddd %xmm15,%xmm10
	paddd %xmm12,%xmm11
	pxor %xmm10,%xmm1
	pxor %xmm11,%xmm2
.byte 102,15,56,0,207                   // pshufb %xmm7,%xmm1
.byte 102,15,56,0,215                   // pshufb %xmm7,%xmm2
	paddd %xmm1,%xmm4
	paddd %xmm2,%xmm5
	pxor %xmm4,%xmm15
	pxor %xmm5,%xmm12
	movdqa %xmm15,%xmm6
	pslld $12,%xmm15
	psrld $20,%xmm6
	movdqa %xmm12,%xmm7
	pslld $12,%xmm12
	por %xmm6,%xmm15
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm12
	paddd %xmm15,%xmm10
	paddd %xmm12,%xmm11
	pxor %xmm10,%xmm1
	pxor %xmm11,%xmm2
.byte 102,15,56,0,206                   // pshufb %xmm6,%xmm1
.byte 102,15,56,0,214                   // pshufb %xmm6,%xmm2
	paddd %xmm1,%xmm4
	paddd %xmm2,%xmm5
	pxor %xmm4,%xmm15
	pxor %xmm5,%xmm12
	movdqa %xmm15,%xmm7
	pslld $7,%xmm15
	psrld $25,%xmm7
	movdqa %xmm12,%xmm6
	pslld $7,%xmm12
	por %xmm7,%xmm15
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm12
	decl %eax
	jnz .Loop4x

	// Feed-forward words 0..3, then transpose the four keystream blocks
	// from word-sliced back to byte order (punpck* 4x4 transposes).
	paddd 64(%rsp),%xmm8
	paddd 80(%rsp),%xmm9
	paddd 96(%rsp),%xmm10
	paddd 112(%rsp),%xmm11

	movdqa %xmm8,%xmm6
	punpckldq %xmm9,%xmm8
	movdqa %xmm10,%xmm7
	punpckldq %xmm11,%xmm10
	punpckhdq %xmm9,%xmm6
	punpckhdq %xmm11,%xmm7
	movdqa %xmm8,%xmm9
	punpcklqdq %xmm10,%xmm8
	movdqa %xmm6,%xmm11
	punpcklqdq %xmm7,%xmm6
	punpckhqdq %xmm10,%xmm9
	punpckhqdq %xmm7,%xmm11
	// Feed-forward words 4..7.
	paddd 128-256(%rcx),%xmm12
	paddd 144-256(%rcx),%xmm13
	paddd 160-256(%rcx),%xmm14
	paddd 176-256(%rcx),%xmm15

	movdqa %xmm8,0(%rsp)
	movdqa %xmm9,16(%rsp)
	movdqa 32(%rsp),%xmm8
	movdqa 48(%rsp),%xmm9

	movdqa %xmm12,%xmm10
	punpckldq %xmm13,%xmm12
	movdqa %xmm14,%xmm7
	punpckldq %xmm15,%xmm14
	punpckhdq %xmm13,%xmm10
	punpckhdq %xmm15,%xmm7
	movdqa %xmm12,%xmm13
	punpcklqdq %xmm14,%xmm12
	movdqa %xmm10,%xmm15
	punpcklqdq %xmm7,%xmm10
	punpckhqdq %xmm14,%xmm13
	punpckhqdq %xmm7,%xmm15
	// Feed-forward words 8..11.
	paddd 192-256(%rcx),%xmm4
	paddd 208-256(%rcx),%xmm5
	paddd 224-256(%rcx),%xmm8
	paddd 240-256(%rcx),%xmm9

	movdqa %xmm6,32(%rsp)
	movdqa %xmm11,48(%rsp)

	movdqa %xmm4,%xmm14
	punpckldq %xmm5,%xmm4
	movdqa %xmm8,%xmm7
	punpckldq %xmm9,%xmm8
	punpckhdq %xmm5,%xmm14
	punpckhdq %xmm9,%xmm7
	movdqa %xmm4,%xmm5
	punpcklqdq %xmm8,%xmm4
	movdqa %xmm14,%xmm9
	punpcklqdq %xmm7,%xmm14
	punpckhqdq %xmm8,%xmm5
	punpckhqdq %xmm7,%xmm9
	// Feed-forward words 12..15 (counters/nonce).
	paddd 256-256(%rcx),%xmm0
	paddd 272-256(%rcx),%xmm1
	paddd 288-256(%rcx),%xmm2
	paddd 304-256(%rcx),%xmm3

	movdqa %xmm0,%xmm8
	punpckldq %xmm1,%xmm0
	movdqa %xmm2,%xmm7
	punpckldq %xmm3,%xmm2
	punpckhdq %xmm1,%xmm8
	punpckhdq %xmm3,%xmm7
	movdqa %xmm0,%xmm1
	punpcklqdq %xmm2,%xmm0
	movdqa %xmm8,%xmm3
	punpcklqdq %xmm7,%xmm8
	punpckhqdq %xmm2,%xmm1
	punpckhqdq %xmm7,%xmm3
	cmpq $256,%rdx
	jb .Ltail4x

	// Full 256 bytes: XOR input with the four transposed keystream blocks,
	// 64 bytes at a time (block n = rows n of each 4-word group).
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7

	movdqu %xmm6,64(%rdi)
	movdqu 0(%rsi),%xmm6
	movdqu %xmm11,80(%rdi)
	movdqu 16(%rsi),%xmm11
	movdqu %xmm2,96(%rdi)
	movdqu 32(%rsi),%xmm2
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi
	movdqu 48(%rsi),%xmm7
	pxor 32(%rsp),%xmm6
	pxor %xmm10,%xmm11
	pxor %xmm14,%xmm2
	pxor %xmm8,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 48(%rsp),%xmm6
	pxor %xmm15,%xmm11
	pxor %xmm9,%xmm2
	pxor %xmm3,%xmm7
	movdqu %xmm6,64(%rdi)
	movdqu %xmm11,80(%rdi)
	movdqu %xmm2,96(%rdi)
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi

	subq $256,%rdx
	jnz .Loop_outer4x

	jmp .Ldone4x

.Ltail4x:
	// < 256 bytes left: emit as many whole 64-byte blocks as fit, then
	// stage the next block's keystream at 0(%rsp) for the byte loop.
	cmpq $192,%rdx
	jae .L192_or_more4x
	cmpq $128,%rdx
	jae .L128_or_more4x
	cmpq $64,%rdx
	jae .L64_or_more4x

	xorq %r10,%r10
	movdqa %xmm12,16(%rsp)
	movdqa %xmm4,32(%rsp)
	movdqa %xmm0,48(%rsp)
	jmp .Loop_tail4x

.align 32
.L64_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7
	movdqu %xmm6,0(%rdi)
	movdqu %xmm11,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm7,48(%rdi)
	je .Ldone4x                     // exactly 64 bytes: done

	movdqa 16(%rsp),%xmm6           // stage block 1 keystream for byte loop
	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm13,16(%rsp)
	leaq 64(%rdi),%rdi
	movdqa %xmm5,32(%rsp)
	subq $64,%rdx
	movdqa %xmm1,48(%rsp)
	jmp .Loop_tail4x

.align 32
.L128_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7
	movdqu %xmm6,64(%rdi)
	movdqu %xmm11,80(%rdi)
	movdqu %xmm2,96(%rdi)
	movdqu %xmm7,112(%rdi)
	je .Ldone4x                     // exactly 128 bytes: done

	movdqa 32(%rsp),%xmm6           // stage block 2 keystream for byte loop
	leaq 128(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm10,16(%rsp)
	leaq 128(%rdi),%rdi
	movdqa %xmm14,32(%rsp)
	subq $128,%rdx
	movdqa %xmm8,48(%rsp)
	jmp .Loop_tail4x

.align 32
.L192_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7

	movdqu %xmm6,64(%rdi)
	movdqu 0(%rsi),%xmm6
	movdqu %xmm11,80(%rdi)
	movdqu 16(%rsi),%xmm11
	movdqu %xmm2,96(%rdi)
	movdqu 32(%rsi),%xmm2
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi
	movdqu 48(%rsi),%xmm7
	pxor 32(%rsp),%xmm6
	pxor %xmm10,%xmm11
	pxor %xmm14,%xmm2
	pxor %xmm8,%xmm7
	movdqu %xmm6,0(%rdi)
	movdqu %xmm11,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm7,48(%rdi)
	je .Ldone4x                     // exactly 192 bytes: done

	movdqa 48(%rsp),%xmm6           // stage block 3 keystream for byte loop
	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm15,16(%rsp)
	leaq 64(%rdi),%rdi
	movdqa %xmm9,32(%rsp)
	subq $192,%rdx
	movdqa %xmm3,48(%rsp)

.Loop_tail4x:
	// Byte-wise XOR of the final partial block (r10 = index, rdx = count).
	movzbl (%rsi,%r10,1),%eax
	movzbl (%rsp,%r10,1),%ecx
	leaq 1(%r10),%r10
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r10,1)
992 decq %rdx 993 jnz .Loop_tail4x 994 995.Ldone4x: 996 leaq (%r9),%rsp 997.cfi_def_cfa_register rsp 998.L4x_epilogue: 999 ret 1000.cfi_endproc 1001.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x 1002.globl ChaCha20_ctr32_avx2 1003.hidden ChaCha20_ctr32_avx2 1004.type ChaCha20_ctr32_avx2,@function 1005.align 32 1006ChaCha20_ctr32_avx2: 1007.cfi_startproc 1008_CET_ENDBR 1009 movq %rsp,%r9 1010.cfi_def_cfa_register r9 1011 subq $0x280+8,%rsp 1012 andq $-32,%rsp 1013 vzeroupper 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 vbroadcasti128 .Lsigma(%rip),%ymm11 1025 vbroadcasti128 (%rcx),%ymm3 1026 vbroadcasti128 16(%rcx),%ymm15 1027 vbroadcasti128 (%r8),%ymm7 1028 leaq 256(%rsp),%rcx 1029 leaq 512(%rsp),%rax 1030 leaq .Lrot16(%rip),%r10 1031 leaq .Lrot24(%rip),%r11 1032 1033 vpshufd $0x00,%ymm11,%ymm8 1034 vpshufd $0x55,%ymm11,%ymm9 1035 vmovdqa %ymm8,128-256(%rcx) 1036 vpshufd $0xaa,%ymm11,%ymm10 1037 vmovdqa %ymm9,160-256(%rcx) 1038 vpshufd $0xff,%ymm11,%ymm11 1039 vmovdqa %ymm10,192-256(%rcx) 1040 vmovdqa %ymm11,224-256(%rcx) 1041 1042 vpshufd $0x00,%ymm3,%ymm0 1043 vpshufd $0x55,%ymm3,%ymm1 1044 vmovdqa %ymm0,256-256(%rcx) 1045 vpshufd $0xaa,%ymm3,%ymm2 1046 vmovdqa %ymm1,288-256(%rcx) 1047 vpshufd $0xff,%ymm3,%ymm3 1048 vmovdqa %ymm2,320-256(%rcx) 1049 vmovdqa %ymm3,352-256(%rcx) 1050 1051 vpshufd $0x00,%ymm15,%ymm12 1052 vpshufd $0x55,%ymm15,%ymm13 1053 vmovdqa %ymm12,384-512(%rax) 1054 vpshufd $0xaa,%ymm15,%ymm14 1055 vmovdqa %ymm13,416-512(%rax) 1056 vpshufd $0xff,%ymm15,%ymm15 1057 vmovdqa %ymm14,448-512(%rax) 1058 vmovdqa %ymm15,480-512(%rax) 1059 1060 vpshufd $0x00,%ymm7,%ymm4 1061 vpshufd $0x55,%ymm7,%ymm5 1062 vpaddd .Lincy(%rip),%ymm4,%ymm4 1063 vpshufd $0xaa,%ymm7,%ymm6 1064 vmovdqa %ymm5,544-512(%rax) 1065 vpshufd $0xff,%ymm7,%ymm7 1066 vmovdqa %ymm6,576-512(%rax) 1067 vmovdqa %ymm7,608-512(%rax) 1068 1069 jmp .Loop_enter8x 1070 1071.align 32 1072.Loop_outer8x: 1073 vmovdqa 128-256(%rcx),%ymm8 1074 vmovdqa 160-256(%rcx),%ymm9 1075 
vmovdqa 192-256(%rcx),%ymm10 1076 vmovdqa 224-256(%rcx),%ymm11 1077 vmovdqa 256-256(%rcx),%ymm0 1078 vmovdqa 288-256(%rcx),%ymm1 1079 vmovdqa 320-256(%rcx),%ymm2 1080 vmovdqa 352-256(%rcx),%ymm3 1081 vmovdqa 384-512(%rax),%ymm12 1082 vmovdqa 416-512(%rax),%ymm13 1083 vmovdqa 448-512(%rax),%ymm14 1084 vmovdqa 480-512(%rax),%ymm15 1085 vmovdqa 512-512(%rax),%ymm4 1086 vmovdqa 544-512(%rax),%ymm5 1087 vmovdqa 576-512(%rax),%ymm6 1088 vmovdqa 608-512(%rax),%ymm7 1089 vpaddd .Leight(%rip),%ymm4,%ymm4 1090 1091.Loop_enter8x: 1092 vmovdqa %ymm14,64(%rsp) 1093 vmovdqa %ymm15,96(%rsp) 1094 vbroadcasti128 (%r10),%ymm15 1095 vmovdqa %ymm4,512-512(%rax) 1096 movl $10,%eax 1097 jmp .Loop8x 1098 1099.align 32 1100.Loop8x: 1101 vpaddd %ymm0,%ymm8,%ymm8 1102 vpxor %ymm4,%ymm8,%ymm4 1103 vpshufb %ymm15,%ymm4,%ymm4 1104 vpaddd %ymm1,%ymm9,%ymm9 1105 vpxor %ymm5,%ymm9,%ymm5 1106 vpshufb %ymm15,%ymm5,%ymm5 1107 vpaddd %ymm4,%ymm12,%ymm12 1108 vpxor %ymm0,%ymm12,%ymm0 1109 vpslld $12,%ymm0,%ymm14 1110 vpsrld $20,%ymm0,%ymm0 1111 vpor %ymm0,%ymm14,%ymm0 1112 vbroadcasti128 (%r11),%ymm14 1113 vpaddd %ymm5,%ymm13,%ymm13 1114 vpxor %ymm1,%ymm13,%ymm1 1115 vpslld $12,%ymm1,%ymm15 1116 vpsrld $20,%ymm1,%ymm1 1117 vpor %ymm1,%ymm15,%ymm1 1118 vpaddd %ymm0,%ymm8,%ymm8 1119 vpxor %ymm4,%ymm8,%ymm4 1120 vpshufb %ymm14,%ymm4,%ymm4 1121 vpaddd %ymm1,%ymm9,%ymm9 1122 vpxor %ymm5,%ymm9,%ymm5 1123 vpshufb %ymm14,%ymm5,%ymm5 1124 vpaddd %ymm4,%ymm12,%ymm12 1125 vpxor %ymm0,%ymm12,%ymm0 1126 vpslld $7,%ymm0,%ymm15 1127 vpsrld $25,%ymm0,%ymm0 1128 vpor %ymm0,%ymm15,%ymm0 1129 vbroadcasti128 (%r10),%ymm15 1130 vpaddd %ymm5,%ymm13,%ymm13 1131 vpxor %ymm1,%ymm13,%ymm1 1132 vpslld $7,%ymm1,%ymm14 1133 vpsrld $25,%ymm1,%ymm1 1134 vpor %ymm1,%ymm14,%ymm1 1135 vmovdqa %ymm12,0(%rsp) 1136 vmovdqa %ymm13,32(%rsp) 1137 vmovdqa 64(%rsp),%ymm12 1138 vmovdqa 96(%rsp),%ymm13 1139 vpaddd %ymm2,%ymm10,%ymm10 1140 vpxor %ymm6,%ymm10,%ymm6 1141 vpshufb %ymm15,%ymm6,%ymm6 1142 vpaddd %ymm3,%ymm11,%ymm11 1143 vpxor 
%ymm7,%ymm11,%ymm7 1144 vpshufb %ymm15,%ymm7,%ymm7 1145 vpaddd %ymm6,%ymm12,%ymm12 1146 vpxor %ymm2,%ymm12,%ymm2 1147 vpslld $12,%ymm2,%ymm14 1148 vpsrld $20,%ymm2,%ymm2 1149 vpor %ymm2,%ymm14,%ymm2 1150 vbroadcasti128 (%r11),%ymm14 1151 vpaddd %ymm7,%ymm13,%ymm13 1152 vpxor %ymm3,%ymm13,%ymm3 1153 vpslld $12,%ymm3,%ymm15 1154 vpsrld $20,%ymm3,%ymm3 1155 vpor %ymm3,%ymm15,%ymm3 1156 vpaddd %ymm2,%ymm10,%ymm10 1157 vpxor %ymm6,%ymm10,%ymm6 1158 vpshufb %ymm14,%ymm6,%ymm6 1159 vpaddd %ymm3,%ymm11,%ymm11 1160 vpxor %ymm7,%ymm11,%ymm7 1161 vpshufb %ymm14,%ymm7,%ymm7 1162 vpaddd %ymm6,%ymm12,%ymm12 1163 vpxor %ymm2,%ymm12,%ymm2 1164 vpslld $7,%ymm2,%ymm15 1165 vpsrld $25,%ymm2,%ymm2 1166 vpor %ymm2,%ymm15,%ymm2 1167 vbroadcasti128 (%r10),%ymm15 1168 vpaddd %ymm7,%ymm13,%ymm13 1169 vpxor %ymm3,%ymm13,%ymm3 1170 vpslld $7,%ymm3,%ymm14 1171 vpsrld $25,%ymm3,%ymm3 1172 vpor %ymm3,%ymm14,%ymm3 1173 vpaddd %ymm1,%ymm8,%ymm8 1174 vpxor %ymm7,%ymm8,%ymm7 1175 vpshufb %ymm15,%ymm7,%ymm7 1176 vpaddd %ymm2,%ymm9,%ymm9 1177 vpxor %ymm4,%ymm9,%ymm4 1178 vpshufb %ymm15,%ymm4,%ymm4 1179 vpaddd %ymm7,%ymm12,%ymm12 1180 vpxor %ymm1,%ymm12,%ymm1 1181 vpslld $12,%ymm1,%ymm14 1182 vpsrld $20,%ymm1,%ymm1 1183 vpor %ymm1,%ymm14,%ymm1 1184 vbroadcasti128 (%r11),%ymm14 1185 vpaddd %ymm4,%ymm13,%ymm13 1186 vpxor %ymm2,%ymm13,%ymm2 1187 vpslld $12,%ymm2,%ymm15 1188 vpsrld $20,%ymm2,%ymm2 1189 vpor %ymm2,%ymm15,%ymm2 1190 vpaddd %ymm1,%ymm8,%ymm8 1191 vpxor %ymm7,%ymm8,%ymm7 1192 vpshufb %ymm14,%ymm7,%ymm7 1193 vpaddd %ymm2,%ymm9,%ymm9 1194 vpxor %ymm4,%ymm9,%ymm4 1195 vpshufb %ymm14,%ymm4,%ymm4 1196 vpaddd %ymm7,%ymm12,%ymm12 1197 vpxor %ymm1,%ymm12,%ymm1 1198 vpslld $7,%ymm1,%ymm15 1199 vpsrld $25,%ymm1,%ymm1 1200 vpor %ymm1,%ymm15,%ymm1 1201 vbroadcasti128 (%r10),%ymm15 1202 vpaddd %ymm4,%ymm13,%ymm13 1203 vpxor %ymm2,%ymm13,%ymm2 1204 vpslld $7,%ymm2,%ymm14 1205 vpsrld $25,%ymm2,%ymm2 1206 vpor %ymm2,%ymm14,%ymm2 1207 vmovdqa %ymm12,64(%rsp) 1208 vmovdqa %ymm13,96(%rsp) 1209 vmovdqa 
0(%rsp),%ymm12 1210 vmovdqa 32(%rsp),%ymm13 1211 vpaddd %ymm3,%ymm10,%ymm10 1212 vpxor %ymm5,%ymm10,%ymm5 1213 vpshufb %ymm15,%ymm5,%ymm5 1214 vpaddd %ymm0,%ymm11,%ymm11 1215 vpxor %ymm6,%ymm11,%ymm6 1216 vpshufb %ymm15,%ymm6,%ymm6 1217 vpaddd %ymm5,%ymm12,%ymm12 1218 vpxor %ymm3,%ymm12,%ymm3 1219 vpslld $12,%ymm3,%ymm14 1220 vpsrld $20,%ymm3,%ymm3 1221 vpor %ymm3,%ymm14,%ymm3 1222 vbroadcasti128 (%r11),%ymm14 1223 vpaddd %ymm6,%ymm13,%ymm13 1224 vpxor %ymm0,%ymm13,%ymm0 1225 vpslld $12,%ymm0,%ymm15 1226 vpsrld $20,%ymm0,%ymm0 1227 vpor %ymm0,%ymm15,%ymm0 1228 vpaddd %ymm3,%ymm10,%ymm10 1229 vpxor %ymm5,%ymm10,%ymm5 1230 vpshufb %ymm14,%ymm5,%ymm5 1231 vpaddd %ymm0,%ymm11,%ymm11 1232 vpxor %ymm6,%ymm11,%ymm6 1233 vpshufb %ymm14,%ymm6,%ymm6 1234 vpaddd %ymm5,%ymm12,%ymm12 1235 vpxor %ymm3,%ymm12,%ymm3 1236 vpslld $7,%ymm3,%ymm15 1237 vpsrld $25,%ymm3,%ymm3 1238 vpor %ymm3,%ymm15,%ymm3 1239 vbroadcasti128 (%r10),%ymm15 1240 vpaddd %ymm6,%ymm13,%ymm13 1241 vpxor %ymm0,%ymm13,%ymm0 1242 vpslld $7,%ymm0,%ymm14 1243 vpsrld $25,%ymm0,%ymm0 1244 vpor %ymm0,%ymm14,%ymm0 1245 decl %eax 1246 jnz .Loop8x 1247 1248 leaq 512(%rsp),%rax 1249 vpaddd 128-256(%rcx),%ymm8,%ymm8 1250 vpaddd 160-256(%rcx),%ymm9,%ymm9 1251 vpaddd 192-256(%rcx),%ymm10,%ymm10 1252 vpaddd 224-256(%rcx),%ymm11,%ymm11 1253 1254 vpunpckldq %ymm9,%ymm8,%ymm14 1255 vpunpckldq %ymm11,%ymm10,%ymm15 1256 vpunpckhdq %ymm9,%ymm8,%ymm8 1257 vpunpckhdq %ymm11,%ymm10,%ymm10 1258 vpunpcklqdq %ymm15,%ymm14,%ymm9 1259 vpunpckhqdq %ymm15,%ymm14,%ymm14 1260 vpunpcklqdq %ymm10,%ymm8,%ymm11 1261 vpunpckhqdq %ymm10,%ymm8,%ymm8 1262 vpaddd 256-256(%rcx),%ymm0,%ymm0 1263 vpaddd 288-256(%rcx),%ymm1,%ymm1 1264 vpaddd 320-256(%rcx),%ymm2,%ymm2 1265 vpaddd 352-256(%rcx),%ymm3,%ymm3 1266 1267 vpunpckldq %ymm1,%ymm0,%ymm10 1268 vpunpckldq %ymm3,%ymm2,%ymm15 1269 vpunpckhdq %ymm1,%ymm0,%ymm0 1270 vpunpckhdq %ymm3,%ymm2,%ymm2 1271 vpunpcklqdq %ymm15,%ymm10,%ymm1 1272 vpunpckhqdq %ymm15,%ymm10,%ymm10 1273 vpunpcklqdq %ymm2,%ymm0,%ymm3 
1274 vpunpckhqdq %ymm2,%ymm0,%ymm0 1275 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 1276 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 1277 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 1278 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 1279 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 1280 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 1281 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 1282 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 1283 vmovdqa %ymm15,0(%rsp) 1284 vmovdqa %ymm9,32(%rsp) 1285 vmovdqa 64(%rsp),%ymm15 1286 vmovdqa 96(%rsp),%ymm9 1287 1288 vpaddd 384-512(%rax),%ymm12,%ymm12 1289 vpaddd 416-512(%rax),%ymm13,%ymm13 1290 vpaddd 448-512(%rax),%ymm15,%ymm15 1291 vpaddd 480-512(%rax),%ymm9,%ymm9 1292 1293 vpunpckldq %ymm13,%ymm12,%ymm2 1294 vpunpckldq %ymm9,%ymm15,%ymm8 1295 vpunpckhdq %ymm13,%ymm12,%ymm12 1296 vpunpckhdq %ymm9,%ymm15,%ymm15 1297 vpunpcklqdq %ymm8,%ymm2,%ymm13 1298 vpunpckhqdq %ymm8,%ymm2,%ymm2 1299 vpunpcklqdq %ymm15,%ymm12,%ymm9 1300 vpunpckhqdq %ymm15,%ymm12,%ymm12 1301 vpaddd 512-512(%rax),%ymm4,%ymm4 1302 vpaddd 544-512(%rax),%ymm5,%ymm5 1303 vpaddd 576-512(%rax),%ymm6,%ymm6 1304 vpaddd 608-512(%rax),%ymm7,%ymm7 1305 1306 vpunpckldq %ymm5,%ymm4,%ymm15 1307 vpunpckldq %ymm7,%ymm6,%ymm8 1308 vpunpckhdq %ymm5,%ymm4,%ymm4 1309 vpunpckhdq %ymm7,%ymm6,%ymm6 1310 vpunpcklqdq %ymm8,%ymm15,%ymm5 1311 vpunpckhqdq %ymm8,%ymm15,%ymm15 1312 vpunpcklqdq %ymm6,%ymm4,%ymm7 1313 vpunpckhqdq %ymm6,%ymm4,%ymm4 1314 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 1315 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 1316 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 1317 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 1318 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 1319 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 1320 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 1321 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 1322 vmovdqa 0(%rsp),%ymm6 1323 vmovdqa 32(%rsp),%ymm12 1324 1325 cmpq $512,%rdx 1326 jb .Ltail8x 1327 1328 vpxor 0(%rsi),%ymm6,%ymm6 1329 vpxor 32(%rsi),%ymm8,%ymm8 1330 vpxor 64(%rsi),%ymm1,%ymm1 1331 vpxor 96(%rsi),%ymm5,%ymm5 1332 leaq 128(%rsi),%rsi 1333 vmovdqu %ymm6,0(%rdi) 1334 vmovdqu 
%ymm8,32(%rdi) 1335 vmovdqu %ymm1,64(%rdi) 1336 vmovdqu %ymm5,96(%rdi) 1337 leaq 128(%rdi),%rdi 1338 1339 vpxor 0(%rsi),%ymm12,%ymm12 1340 vpxor 32(%rsi),%ymm13,%ymm13 1341 vpxor 64(%rsi),%ymm10,%ymm10 1342 vpxor 96(%rsi),%ymm15,%ymm15 1343 leaq 128(%rsi),%rsi 1344 vmovdqu %ymm12,0(%rdi) 1345 vmovdqu %ymm13,32(%rdi) 1346 vmovdqu %ymm10,64(%rdi) 1347 vmovdqu %ymm15,96(%rdi) 1348 leaq 128(%rdi),%rdi 1349 1350 vpxor 0(%rsi),%ymm14,%ymm14 1351 vpxor 32(%rsi),%ymm2,%ymm2 1352 vpxor 64(%rsi),%ymm3,%ymm3 1353 vpxor 96(%rsi),%ymm7,%ymm7 1354 leaq 128(%rsi),%rsi 1355 vmovdqu %ymm14,0(%rdi) 1356 vmovdqu %ymm2,32(%rdi) 1357 vmovdqu %ymm3,64(%rdi) 1358 vmovdqu %ymm7,96(%rdi) 1359 leaq 128(%rdi),%rdi 1360 1361 vpxor 0(%rsi),%ymm11,%ymm11 1362 vpxor 32(%rsi),%ymm9,%ymm9 1363 vpxor 64(%rsi),%ymm0,%ymm0 1364 vpxor 96(%rsi),%ymm4,%ymm4 1365 leaq 128(%rsi),%rsi 1366 vmovdqu %ymm11,0(%rdi) 1367 vmovdqu %ymm9,32(%rdi) 1368 vmovdqu %ymm0,64(%rdi) 1369 vmovdqu %ymm4,96(%rdi) 1370 leaq 128(%rdi),%rdi 1371 1372 subq $512,%rdx 1373 jnz .Loop_outer8x 1374 1375 jmp .Ldone8x 1376 1377.Ltail8x: 1378 cmpq $448,%rdx 1379 jae .L448_or_more8x 1380 cmpq $384,%rdx 1381 jae .L384_or_more8x 1382 cmpq $320,%rdx 1383 jae .L320_or_more8x 1384 cmpq $256,%rdx 1385 jae .L256_or_more8x 1386 cmpq $192,%rdx 1387 jae .L192_or_more8x 1388 cmpq $128,%rdx 1389 jae .L128_or_more8x 1390 cmpq $64,%rdx 1391 jae .L64_or_more8x 1392 1393 xorq %r10,%r10 1394 vmovdqa %ymm6,0(%rsp) 1395 vmovdqa %ymm8,32(%rsp) 1396 jmp .Loop_tail8x 1397 1398.align 32 1399.L64_or_more8x: 1400 vpxor 0(%rsi),%ymm6,%ymm6 1401 vpxor 32(%rsi),%ymm8,%ymm8 1402 vmovdqu %ymm6,0(%rdi) 1403 vmovdqu %ymm8,32(%rdi) 1404 je .Ldone8x 1405 1406 leaq 64(%rsi),%rsi 1407 xorq %r10,%r10 1408 vmovdqa %ymm1,0(%rsp) 1409 leaq 64(%rdi),%rdi 1410 subq $64,%rdx 1411 vmovdqa %ymm5,32(%rsp) 1412 jmp .Loop_tail8x 1413 1414.align 32 1415.L128_or_more8x: 1416 vpxor 0(%rsi),%ymm6,%ymm6 1417 vpxor 32(%rsi),%ymm8,%ymm8 1418 vpxor 64(%rsi),%ymm1,%ymm1 1419 vpxor 
96(%rsi),%ymm5,%ymm5 1420 vmovdqu %ymm6,0(%rdi) 1421 vmovdqu %ymm8,32(%rdi) 1422 vmovdqu %ymm1,64(%rdi) 1423 vmovdqu %ymm5,96(%rdi) 1424 je .Ldone8x 1425 1426 leaq 128(%rsi),%rsi 1427 xorq %r10,%r10 1428 vmovdqa %ymm12,0(%rsp) 1429 leaq 128(%rdi),%rdi 1430 subq $128,%rdx 1431 vmovdqa %ymm13,32(%rsp) 1432 jmp .Loop_tail8x 1433 1434.align 32 1435.L192_or_more8x: 1436 vpxor 0(%rsi),%ymm6,%ymm6 1437 vpxor 32(%rsi),%ymm8,%ymm8 1438 vpxor 64(%rsi),%ymm1,%ymm1 1439 vpxor 96(%rsi),%ymm5,%ymm5 1440 vpxor 128(%rsi),%ymm12,%ymm12 1441 vpxor 160(%rsi),%ymm13,%ymm13 1442 vmovdqu %ymm6,0(%rdi) 1443 vmovdqu %ymm8,32(%rdi) 1444 vmovdqu %ymm1,64(%rdi) 1445 vmovdqu %ymm5,96(%rdi) 1446 vmovdqu %ymm12,128(%rdi) 1447 vmovdqu %ymm13,160(%rdi) 1448 je .Ldone8x 1449 1450 leaq 192(%rsi),%rsi 1451 xorq %r10,%r10 1452 vmovdqa %ymm10,0(%rsp) 1453 leaq 192(%rdi),%rdi 1454 subq $192,%rdx 1455 vmovdqa %ymm15,32(%rsp) 1456 jmp .Loop_tail8x 1457 1458.align 32 1459.L256_or_more8x: 1460 vpxor 0(%rsi),%ymm6,%ymm6 1461 vpxor 32(%rsi),%ymm8,%ymm8 1462 vpxor 64(%rsi),%ymm1,%ymm1 1463 vpxor 96(%rsi),%ymm5,%ymm5 1464 vpxor 128(%rsi),%ymm12,%ymm12 1465 vpxor 160(%rsi),%ymm13,%ymm13 1466 vpxor 192(%rsi),%ymm10,%ymm10 1467 vpxor 224(%rsi),%ymm15,%ymm15 1468 vmovdqu %ymm6,0(%rdi) 1469 vmovdqu %ymm8,32(%rdi) 1470 vmovdqu %ymm1,64(%rdi) 1471 vmovdqu %ymm5,96(%rdi) 1472 vmovdqu %ymm12,128(%rdi) 1473 vmovdqu %ymm13,160(%rdi) 1474 vmovdqu %ymm10,192(%rdi) 1475 vmovdqu %ymm15,224(%rdi) 1476 je .Ldone8x 1477 1478 leaq 256(%rsi),%rsi 1479 xorq %r10,%r10 1480 vmovdqa %ymm14,0(%rsp) 1481 leaq 256(%rdi),%rdi 1482 subq $256,%rdx 1483 vmovdqa %ymm2,32(%rsp) 1484 jmp .Loop_tail8x 1485 1486.align 32 1487.L320_or_more8x: 1488 vpxor 0(%rsi),%ymm6,%ymm6 1489 vpxor 32(%rsi),%ymm8,%ymm8 1490 vpxor 64(%rsi),%ymm1,%ymm1 1491 vpxor 96(%rsi),%ymm5,%ymm5 1492 vpxor 128(%rsi),%ymm12,%ymm12 1493 vpxor 160(%rsi),%ymm13,%ymm13 1494 vpxor 192(%rsi),%ymm10,%ymm10 1495 vpxor 224(%rsi),%ymm15,%ymm15 1496 vpxor 256(%rsi),%ymm14,%ymm14 1497 
vpxor 288(%rsi),%ymm2,%ymm2 1498 vmovdqu %ymm6,0(%rdi) 1499 vmovdqu %ymm8,32(%rdi) 1500 vmovdqu %ymm1,64(%rdi) 1501 vmovdqu %ymm5,96(%rdi) 1502 vmovdqu %ymm12,128(%rdi) 1503 vmovdqu %ymm13,160(%rdi) 1504 vmovdqu %ymm10,192(%rdi) 1505 vmovdqu %ymm15,224(%rdi) 1506 vmovdqu %ymm14,256(%rdi) 1507 vmovdqu %ymm2,288(%rdi) 1508 je .Ldone8x 1509 1510 leaq 320(%rsi),%rsi 1511 xorq %r10,%r10 1512 vmovdqa %ymm3,0(%rsp) 1513 leaq 320(%rdi),%rdi 1514 subq $320,%rdx 1515 vmovdqa %ymm7,32(%rsp) 1516 jmp .Loop_tail8x 1517 1518.align 32 1519.L384_or_more8x: 1520 vpxor 0(%rsi),%ymm6,%ymm6 1521 vpxor 32(%rsi),%ymm8,%ymm8 1522 vpxor 64(%rsi),%ymm1,%ymm1 1523 vpxor 96(%rsi),%ymm5,%ymm5 1524 vpxor 128(%rsi),%ymm12,%ymm12 1525 vpxor 160(%rsi),%ymm13,%ymm13 1526 vpxor 192(%rsi),%ymm10,%ymm10 1527 vpxor 224(%rsi),%ymm15,%ymm15 1528 vpxor 256(%rsi),%ymm14,%ymm14 1529 vpxor 288(%rsi),%ymm2,%ymm2 1530 vpxor 320(%rsi),%ymm3,%ymm3 1531 vpxor 352(%rsi),%ymm7,%ymm7 1532 vmovdqu %ymm6,0(%rdi) 1533 vmovdqu %ymm8,32(%rdi) 1534 vmovdqu %ymm1,64(%rdi) 1535 vmovdqu %ymm5,96(%rdi) 1536 vmovdqu %ymm12,128(%rdi) 1537 vmovdqu %ymm13,160(%rdi) 1538 vmovdqu %ymm10,192(%rdi) 1539 vmovdqu %ymm15,224(%rdi) 1540 vmovdqu %ymm14,256(%rdi) 1541 vmovdqu %ymm2,288(%rdi) 1542 vmovdqu %ymm3,320(%rdi) 1543 vmovdqu %ymm7,352(%rdi) 1544 je .Ldone8x 1545 1546 leaq 384(%rsi),%rsi 1547 xorq %r10,%r10 1548 vmovdqa %ymm11,0(%rsp) 1549 leaq 384(%rdi),%rdi 1550 subq $384,%rdx 1551 vmovdqa %ymm9,32(%rsp) 1552 jmp .Loop_tail8x 1553 1554.align 32 1555.L448_or_more8x: 1556 vpxor 0(%rsi),%ymm6,%ymm6 1557 vpxor 32(%rsi),%ymm8,%ymm8 1558 vpxor 64(%rsi),%ymm1,%ymm1 1559 vpxor 96(%rsi),%ymm5,%ymm5 1560 vpxor 128(%rsi),%ymm12,%ymm12 1561 vpxor 160(%rsi),%ymm13,%ymm13 1562 vpxor 192(%rsi),%ymm10,%ymm10 1563 vpxor 224(%rsi),%ymm15,%ymm15 1564 vpxor 256(%rsi),%ymm14,%ymm14 1565 vpxor 288(%rsi),%ymm2,%ymm2 1566 vpxor 320(%rsi),%ymm3,%ymm3 1567 vpxor 352(%rsi),%ymm7,%ymm7 1568 vpxor 384(%rsi),%ymm11,%ymm11 1569 vpxor 416(%rsi),%ymm9,%ymm9 1570 
vmovdqu %ymm6,0(%rdi) 1571 vmovdqu %ymm8,32(%rdi) 1572 vmovdqu %ymm1,64(%rdi) 1573 vmovdqu %ymm5,96(%rdi) 1574 vmovdqu %ymm12,128(%rdi) 1575 vmovdqu %ymm13,160(%rdi) 1576 vmovdqu %ymm10,192(%rdi) 1577 vmovdqu %ymm15,224(%rdi) 1578 vmovdqu %ymm14,256(%rdi) 1579 vmovdqu %ymm2,288(%rdi) 1580 vmovdqu %ymm3,320(%rdi) 1581 vmovdqu %ymm7,352(%rdi) 1582 vmovdqu %ymm11,384(%rdi) 1583 vmovdqu %ymm9,416(%rdi) 1584 je .Ldone8x 1585 1586 leaq 448(%rsi),%rsi 1587 xorq %r10,%r10 1588 vmovdqa %ymm0,0(%rsp) 1589 leaq 448(%rdi),%rdi 1590 subq $448,%rdx 1591 vmovdqa %ymm4,32(%rsp) 1592 1593.Loop_tail8x: 1594 movzbl (%rsi,%r10,1),%eax 1595 movzbl (%rsp,%r10,1),%ecx 1596 leaq 1(%r10),%r10 1597 xorl %ecx,%eax 1598 movb %al,-1(%rdi,%r10,1) 1599 decq %rdx 1600 jnz .Loop_tail8x 1601 1602.Ldone8x: 1603 vzeroall 1604 leaq (%r9),%rsp 1605.cfi_def_cfa_register rsp 1606.L8x_epilogue: 1607 ret 1608.cfi_endproc 1609.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 1610#endif 1611