// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
//
// Layout note: one statement per line, AT&T operand order (source first,
// destination last). Labels of the form L$name are file-local.
//
// Every entry point below reads its arguments from the same registers:
//   %rdi  destination buffer (written)
//   %rsi  source buffer (read, XORed with the generated keystream)
//   %rdx  number of bytes to process
//   %rcx  32 bytes loaded into state rows 1 and 2
//         (presumably the ChaCha20 key - confirm against the C prototype)
//   %r8   16 bytes loaded into state row 3; its first dword is the block
//         counter that the code increments per 64-byte block
// None of the visible code checks %rdx for zero; the tail loops are
// dec/jnz shaped, so callers are presumably required to pass a non-zero
// length - TODO confirm against the caller.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
.text

// ---------------------------------------------------------------------------
// Read-only constants.
// ---------------------------------------------------------------------------
.section __DATA,__const
.p2align 6
L$zero:
.long 0,0,0,0
// Added to row 3 to advance the counter dword by one block.
L$one:
.long 1,0,0,0
// Per-lane counter offsets for the 4-block SSSE3 path.
L$inc:
.long 0,1,2,3
// Per-iteration counter step for the 4-block SSSE3 path.
L$four:
.long 4,4,4,4
// Per-lane counter offsets for the 8-block AVX2 path.
L$incy:
.long 0,2,4,6,1,3,5,7
// Per-iteration counter step for the 8-block AVX2 path.
L$eight:
.long 8,8,8,8,8,8,8,8
// pshufb mask: rotates every 32-bit lane left by 16 bits.
L$rot16:
.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
// pshufb mask: rotates every 32-bit lane left by 8 bits (right by 24).
L$rot24:
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
// ASCII "expand 32-byte k" followed by a NUL: state row 0.
L$sigma:
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.p2align 6
// The four tables below are not referenced by any code visible in this chunk.
L$zeroz:
.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
L$fourz:
.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
L$incz:
.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
L$sixteen:
.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
// ASCII "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>" plus NUL.
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.text

// ---------------------------------------------------------------------------
// _ChaCha20_ctr32_nohw
//
// Scalar implementation: one 64-byte block per outer iteration, with the
// sixteen state words held in general-purpose registers.
//
// Saves and restores %rbx, %rbp, %r12-%r15.
// Stack frame (64+24 bytes below the six pushes):
//    0..15(%rsp)  used only by the tail path (keystream words 0-3)
//   16..47(%rsp)  state rows 1 and 2; 32..47 doubles as working storage for
//                 words 8-11 during the rounds and is restored afterwards
//   48..63(%rsp)  state row 3 (counter dword at 48)
//   64+0(%rsp)    remaining length
//   64+8(%rsp)    source pointer
//   64+16(%rsp)   destination pointer
// Register roles inside the round loop:
//   %eax %ebx %ecx %edx   words 0-3
//   %r8d-%r11d            words 4-7
//   %esi %edi             two of words 8-11 at a time (the other two live
//                         at 32..47(%rsp))
//   %r12d-%r15d           words 12-15
//   %ebp                  round-loop counter
//   %xmm2 / %xmm3 / %xmm4 row 2 / row 3 / L$one
// ---------------------------------------------------------------------------
.globl _ChaCha20_ctr32_nohw
.private_extern _ChaCha20_ctr32_nohw

.p2align 6
_ChaCha20_ctr32_nohw:

_CET_ENDBR
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15
	subq $64+24,%rsp

L$ctr32_body:

	movdqu (%rcx),%xmm1
	movdqu 16(%rcx),%xmm2
	movdqu (%r8),%xmm3
	movdqa L$one(%rip),%xmm4

	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	// %rdx is needed as state word 3, so the length moves to %rbp.
	movq %rdx,%rbp
	jmp L$oop_outer

.p2align 5
L$oop_outer:
	// Words 0-3: the same four dwords as L$sigma, as immediates.
	movl $0x61707865,%eax
	movl $0x3320646e,%ebx
	movl $0x79622d32,%ecx
	movl $0x6b206574,%edx
	movl 16(%rsp),%r8d
	movl 20(%rsp),%r9d
	movl 24(%rsp),%r10d
	movl 28(%rsp),%r11d
	// Word 12 (counter) comes from %xmm3, which already holds the value
	// for this block.
	movd %xmm3,%r12d
	movl 52(%rsp),%r13d
	movl 56(%rsp),%r14d
	movl 60(%rsp),%r15d

	// Spill length and both pointers so %rbp, %rsi and %rdi are free.
	movq %rbp,64+0(%rsp)
	movl $10,%ebp
	movq %rsi,64+8(%rsp)
.byte 102,72,15,126,214		// movq %xmm2,%rsi : words 8 and 9
	movq %rdi,64+16(%rsp)
	movq %rsi,%rdi
	shrq $32,%rdi			// %edi = word 9, %esi = word 8
	jmp L$oop

	// One iteration = quarter-rounds on the four columns followed by
	// quarter-rounds on the four diagonals (rotations 16, 12, 8, 7).
	// Ten iterations are run per block.
.p2align 5
L$oop:
	// Columns (0,4,8,12) and (1,5,9,13).
	addl %r8d,%eax
	xorl %eax,%r12d
	roll $16,%r12d
	addl %r9d,%ebx
	xorl %ebx,%r13d
	roll $16,%r13d
	addl %r12d,%esi
	xorl %esi,%r8d
	roll $12,%r8d
	addl %r13d,%edi
	xorl %edi,%r9d
	roll $12,%r9d
	addl %r8d,%eax
	xorl %eax,%r12d
	roll $8,%r12d
	addl %r9d,%ebx
	xorl %ebx,%r13d
	roll $8,%r13d
	addl %r12d,%esi
	xorl %esi,%r8d
	roll $7,%r8d
	addl %r13d,%edi
	xorl %edi,%r9d
	roll $7,%r9d
	// Swap words 8,9 out and words 10,11 in.
	movl %esi,32(%rsp)
	movl %edi,36(%rsp)
	movl 40(%rsp),%esi
	movl 44(%rsp),%edi
	// Columns (2,6,10,14) and (3,7,11,15).
	addl %r10d,%ecx
	xorl %ecx,%r14d
	roll $16,%r14d
	addl %r11d,%edx
	xorl %edx,%r15d
	roll $16,%r15d
	addl %r14d,%esi
	xorl %esi,%r10d
	roll $12,%r10d
	addl %r15d,%edi
	xorl %edi,%r11d
	roll $12,%r11d
	addl %r10d,%ecx
	xorl %ecx,%r14d
	roll $8,%r14d
	addl %r11d,%edx
	xorl %edx,%r15d
	roll $8,%r15d
	addl %r14d,%esi
	xorl %esi,%r10d
	roll $7,%r10d
	addl %r15d,%edi
	xorl %edi,%r11d
	roll $7,%r11d
	// Diagonals (0,5,10,15) and (1,6,11,12).
	addl %r9d,%eax
	xorl %eax,%r15d
	roll $16,%r15d
	addl %r10d,%ebx
	xorl %ebx,%r12d
	roll $16,%r12d
	addl %r15d,%esi
	xorl %esi,%r9d
	roll $12,%r9d
	addl %r12d,%edi
	xorl %edi,%r10d
	roll $12,%r10d
	addl %r9d,%eax
	xorl %eax,%r15d
	roll $8,%r15d
	addl %r10d,%ebx
	xorl %ebx,%r12d
	roll $8,%r12d
	addl %r15d,%esi
	xorl %esi,%r9d
	roll $7,%r9d
	addl %r12d,%edi
	xorl %edi,%r10d
	roll $7,%r10d
	// Swap words 10,11 out and words 8,9 back in.
	movl %esi,40(%rsp)
	movl %edi,44(%rsp)
	movl 32(%rsp),%esi
	movl 36(%rsp),%edi
	// Diagonals (2,7,8,13) and (3,4,9,14).
	addl %r11d,%ecx
	xorl %ecx,%r13d
	roll $16,%r13d
	addl %r8d,%edx
	xorl %edx,%r14d
	roll $16,%r14d
	addl %r13d,%esi
	xorl %esi,%r11d
	roll $12,%r11d
	addl %r14d,%edi
	xorl %edi,%r8d
	roll $12,%r8d
	addl %r11d,%ecx
	xorl %ecx,%r13d
	roll $8,%r13d
	addl %r8d,%edx
	xorl %edx,%r14d
	roll $8,%r14d
	addl %r13d,%esi
	xorl %esi,%r11d
	roll $7,%r11d
	addl %r14d,%edi
	xorl %edi,%r8d
	roll $7,%r8d
	decl %ebp
	jnz L$oop
	// Flush words 8,9, then reload length and pointers.
	movl %edi,36(%rsp)
	movl %esi,32(%rsp)
	movq 64(%rsp),%rbp
	movdqa %xmm2,%xmm1
	movq 64+8(%rsp),%rsi
	paddd %xmm4,%xmm3		// advance the counter for the next block
	movq 64+16(%rsp),%rdi

	// Feed-forward: add the input state to the permuted state.
	// 48(%rsp) still holds the counter used for this block.
	addl $0x61707865,%eax
	addl $0x3320646e,%ebx
	addl $0x79622d32,%ecx
	addl $0x6b206574,%edx
	addl 16(%rsp),%r8d
	addl 20(%rsp),%r9d
	addl 24(%rsp),%r10d
	addl 28(%rsp),%r11d
	addl 48(%rsp),%r12d
	addl 52(%rsp),%r13d
	addl 56(%rsp),%r14d
	addl 60(%rsp),%r15d
	paddd 32(%rsp),%xmm1		// row 2 = original row 2 + permuted words 8-11

	cmpq $64,%rbp
	jb L$tail

	// Full block: XOR 64 bytes of input with the keystream.
	xorl 0(%rsi),%eax
	xorl 4(%rsi),%ebx
	xorl 8(%rsi),%ecx
	xorl 12(%rsi),%edx
	xorl 16(%rsi),%r8d
	xorl 20(%rsi),%r9d
	xorl 24(%rsi),%r10d
	xorl 28(%rsi),%r11d
	movdqu 32(%rsi),%xmm0
	xorl 48(%rsi),%r12d
	xorl 52(%rsi),%r13d
	xorl 56(%rsi),%r14d
	xorl 60(%rsi),%r15d
	leaq 64(%rsi),%rsi
	pxor %xmm1,%xmm0

	// Restore row 2 in the frame and publish the incremented counter.
	movdqa %xmm2,32(%rsp)
	movd %xmm3,48(%rsp)

	movl %eax,0(%rdi)
	movl %ebx,4(%rdi)
	movl %ecx,8(%rdi)
	movl %edx,12(%rdi)
	movl %r8d,16(%rdi)
	movl %r9d,20(%rdi)
	movl %r10d,24(%rdi)
	movl %r11d,28(%rdi)
	movdqu %xmm0,32(%rdi)
	movl %r12d,48(%rdi)
	movl %r13d,52(%rdi)
	movl %r14d,56(%rdi)
	movl %r15d,60(%rdi)
	leaq 64(%rdi),%rdi

	subq $64,%rbp
	jnz L$oop_outer

	jmp L$done

	// Fewer than 64 bytes left: write the keystream block to 0..63(%rsp)
	// and XOR it into the output one byte at a time.
.p2align 4
L$tail:
	movl %eax,0(%rsp)
	movl %ebx,4(%rsp)
	xorq %rbx,%rbx			// %rbx becomes the byte index
	movl %ecx,8(%rsp)
	movl %edx,12(%rsp)
	movl %r8d,16(%rsp)
	movl %r9d,20(%rsp)
	movl %r10d,24(%rsp)
	movl %r11d,28(%rsp)
	movdqa %xmm1,32(%rsp)
	movl %r12d,48(%rsp)
	movl %r13d,52(%rsp)
	movl %r14d,56(%rsp)
	movl %r15d,60(%rsp)

L$oop_tail:
	movzbl (%rsi,%rbx,1),%eax
	movzbl (%rsp,%rbx,1),%edx
	leaq 1(%rbx),%rbx
	xorl %edx,%eax
	movb %al,-1(%rdi,%rbx,1)
	decq %rbp
	jnz L$oop_tail

L$done:
	// %rsi = address just above the six saved registers.
	leaq 64+24+48(%rsp),%rsi
	movq -48(%rsi),%r15
	movq -40(%rsi),%r14
	movq -32(%rsi),%r13
	movq -24(%rsi),%r12
	movq -16(%rsi),%rbp
	movq -8(%rsi),%rbx
	leaq (%rsi),%rsp

L$no_data:
	ret


// ---------------------------------------------------------------------------
// _ChaCha20_ctr32_ssse3
//
// One 64-byte block per outer iteration with each state row in one XMM
// register: %xmm0-%xmm3 = rows 0-3, %xmm4/%xmm5 scratch, %xmm6 = L$rot16,
// %xmm7 = L$rot24.
//
// %r9 keeps the entry %rsp for the epilogue; %r8 is reused as the round
// counter once the row-3 bytes have been loaded through it.
// Frame: 0..63(%rsp) holds the four input rows (row 3 is updated with the
// incremented counter per block); the tail path overwrites it with keystream.
// ---------------------------------------------------------------------------
.globl _ChaCha20_ctr32_ssse3
.private_extern _ChaCha20_ctr32_ssse3

.p2align 5
_ChaCha20_ctr32_ssse3:

_CET_ENDBR
	movq %rsp,%r9

	subq $64+8,%rsp
	movdqa L$sigma(%rip),%xmm0
	movdqu (%rcx),%xmm1
	movdqu 16(%rcx),%xmm2
	movdqu (%r8),%xmm3
	movdqa L$rot16(%rip),%xmm6
	movdqa L$rot24(%rip),%xmm7

	movdqa %xmm0,0(%rsp)
	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	movq $10,%r8
	jmp L$oop_ssse3

	// Subsequent blocks: reload the rows and bump the counter by L$one.
.p2align 5
L$oop_outer_ssse3:
	movdqa L$one(%rip),%xmm3
	movdqa 0(%rsp),%xmm0
	movdqa 16(%rsp),%xmm1
	movdqa 32(%rsp),%xmm2
	paddd 48(%rsp),%xmm3
	movq $10,%r8
	movdqa %xmm3,48(%rsp)
	jmp L$oop_ssse3

	// One iteration = one round on the rows as loaded, a lane rotation of
	// rows 1-3 (pshufd), a second round, and the inverse lane rotation.
.p2align 5
L$oop_ssse3:
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,222			// pshufb %xmm6,%xmm3 : rotate by 16
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $20,%xmm1
	pslld $12,%xmm4
	por %xmm4,%xmm1
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,223			// pshufb %xmm7,%xmm3 : rotate by 8
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $25,%xmm1
	pslld $7,%xmm4
	por %xmm4,%xmm1
	pshufd $78,%xmm2,%xmm2
	pshufd $57,%xmm1,%xmm1
	pshufd $147,%xmm3,%xmm3
	nop
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,222			// pshufb %xmm6,%xmm3
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $20,%xmm1
	pslld $12,%xmm4
	por %xmm4,%xmm1
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,223			// pshufb %xmm7,%xmm3
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $25,%xmm1
	pslld $7,%xmm4
	por %xmm4,%xmm1
	pshufd $78,%xmm2,%xmm2
	pshufd $147,%xmm1,%xmm1
	pshufd $57,%xmm3,%xmm3
	decq %r8
	jnz L$oop_ssse3
	// Feed-forward with the saved input rows.
	paddd 0(%rsp),%xmm0
	paddd 16(%rsp),%xmm1
	paddd 32(%rsp),%xmm2
	paddd 48(%rsp),%xmm3

	cmpq $64,%rdx
	jb L$tail_ssse3

	movdqu 0(%rsi),%xmm4
	movdqu 16(%rsi),%xmm5
	pxor %xmm4,%xmm0
	movdqu 32(%rsi),%xmm4
	pxor %xmm5,%xmm1
	movdqu 48(%rsi),%xmm5
	leaq 64(%rsi),%rsi
	pxor %xmm4,%xmm2
	pxor %xmm5,%xmm3

	movdqu %xmm0,0(%rdi)
	movdqu %xmm1,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm3,48(%rdi)
	leaq 64(%rdi),%rdi

	subq $64,%rdx
	jnz L$oop_outer_ssse3

	jmp L$done_ssse3

	// Partial final block: keystream goes to the frame, then byte-wise XOR.
.p2align 4
L$tail_ssse3:
	movdqa %xmm0,0(%rsp)
	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	xorq %r8,%r8

L$oop_tail_ssse3:
	movzbl (%rsi,%r8,1),%eax
	movzbl (%rsp,%r8,1),%ecx
	leaq 1(%r8),%r8
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r8,1)
	decq %rdx
	jnz L$oop_tail_ssse3

L$done_ssse3:
	leaq (%r9),%rsp

L$ssse3_epilogue:
	ret


// ---------------------------------------------------------------------------
// _ChaCha20_ctr32_ssse3_4x
//
// Four blocks per outer iteration. Each XMM register holds the same state
// word for four consecutive blocks (one block per 32-bit lane):
//   %xmm8-%xmm11   words 0-3      %xmm12-%xmm15  words 4-7
//   %xmm4,%xmm5    two of words 8-11 (the other two spill to the frame)
//   %xmm0-%xmm3    words 12-15    %xmm6,%xmm7    scratch / rotate masks
// %r10 -> L$rot16, %r11 -> L$rot24, %r9 = entry %rsp, %eax = round counter.
//
// Frame (0x140+8 bytes), with %rcx = %rsp+256 used as a base pointer:
//     0..63(%rsp)   scratch: spilled words 8-11, later transposed output
//    64..127(%rsp)  broadcast words 0-3
//   128..191(%rsp)  broadcast words 4-7
//   192..255(%rsp)  broadcast words 8-11
//   256..319(%rsp)  words 12-15 (256: per-lane counters)
// ---------------------------------------------------------------------------
.globl _ChaCha20_ctr32_ssse3_4x
.private_extern _ChaCha20_ctr32_ssse3_4x

.p2align 5
_ChaCha20_ctr32_ssse3_4x:

_CET_ENDBR
	movq %rsp,%r9

	// %r11 is overwritten by the leaq of L$rot24 below before it is read.
	movq %r10,%r11
	subq $0x140+8,%rsp
	movdqa L$sigma(%rip),%xmm11
	movdqu (%rcx),%xmm15
	movdqu 16(%rcx),%xmm7
	movdqu (%r8),%xmm3
	leaq 256(%rsp),%rcx
	leaq L$rot16(%rip),%r10
	leaq L$rot24(%rip),%r11

	// Broadcast each dword of every row across a full register.
	pshufd $0x00,%xmm11,%xmm8
	pshufd $0x55,%xmm11,%xmm9
	movdqa %xmm8,64(%rsp)
	pshufd $0xaa,%xmm11,%xmm10
	movdqa %xmm9,80(%rsp)
	pshufd $0xff,%xmm11,%xmm11
	movdqa %xmm10,96(%rsp)
	movdqa %xmm11,112(%rsp)

	pshufd $0x00,%xmm15,%xmm12
	pshufd $0x55,%xmm15,%xmm13
	movdqa %xmm12,128-256(%rcx)
	pshufd $0xaa,%xmm15,%xmm14
	movdqa %xmm13,144-256(%rcx)
	pshufd $0xff,%xmm15,%xmm15
	movdqa %xmm14,160-256(%rcx)
	movdqa %xmm15,176-256(%rcx)

	pshufd $0x00,%xmm7,%xmm4
	pshufd $0x55,%xmm7,%xmm5
	movdqa %xmm4,192-256(%rcx)
	pshufd $0xaa,%xmm7,%xmm6
	movdqa %xmm5,208-256(%rcx)
	pshufd $0xff,%xmm7,%xmm7
	movdqa %xmm6,224-256(%rcx)
	movdqa %xmm7,240-256(%rcx)

	pshufd $0x00,%xmm3,%xmm0
	pshufd $0x55,%xmm3,%xmm1
	paddd L$inc(%rip),%xmm0		// lane i gets counter + i
	pshufd $0xaa,%xmm3,%xmm2
	movdqa %xmm1,272-256(%rcx)
	pshufd $0xff,%xmm3,%xmm3
	movdqa %xmm2,288-256(%rcx)
	movdqa %xmm3,304-256(%rcx)

	jmp L$oop_enter4x

	// Subsequent groups: reload the broadcast state, counters += 4.
.p2align 5
L$oop_outer4x:
	movdqa 64(%rsp),%xmm8
	movdqa 80(%rsp),%xmm9
	movdqa 96(%rsp),%xmm10
	movdqa 112(%rsp),%xmm11
	movdqa 128-256(%rcx),%xmm12
	movdqa 144-256(%rcx),%xmm13
	movdqa 160-256(%rcx),%xmm14
	movdqa 176-256(%rcx),%xmm15
	movdqa 192-256(%rcx),%xmm4
	movdqa 208-256(%rcx),%xmm5
	movdqa 224-256(%rcx),%xmm6
	movdqa 240-256(%rcx),%xmm7
	movdqa 256-256(%rcx),%xmm0
	movdqa 272-256(%rcx),%xmm1
	movdqa 288-256(%rcx),%xmm2
	movdqa 304-256(%rcx),%xmm3
	paddd L$four(%rip),%xmm0

L$oop_enter4x:
	// Words 10,11 spill so %xmm6/%xmm7 can serve as scratch and masks.
	movdqa %xmm6,32(%rsp)
	movdqa %xmm7,48(%rsp)
	movdqa (%r10),%xmm7
	movl $10,%eax
	movdqa %xmm0,256-256(%rcx)	// remember the counters used by this group
	jmp L$oop4x

	// One iteration = column quarter-rounds, then diagonal quarter-rounds,
	// processed two quarter-rounds at a time. The .byte sequences are
	// hand-encoded pshufb instructions using the masks in %xmm7 / %xmm6.
.p2align 5
L$oop4x:
	// Columns (0,4,8,12) and (1,5,9,13).
	paddd %xmm12,%xmm8
	paddd %xmm13,%xmm9
	pxor %xmm8,%xmm0
	pxor %xmm9,%xmm1
.byte 102,15,56,0,199			// pshufb %xmm7,%xmm0
.byte 102,15,56,0,207			// pshufb %xmm7,%xmm1
	paddd %xmm0,%xmm4
	paddd %xmm1,%xmm5
	pxor %xmm4,%xmm12
	pxor %xmm5,%xmm13
	movdqa %xmm12,%xmm6
	pslld $12,%xmm12
	psrld $20,%xmm6
	movdqa %xmm13,%xmm7
	pslld $12,%xmm13
	por %xmm6,%xmm12
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm13
	paddd %xmm12,%xmm8
	paddd %xmm13,%xmm9
	pxor %xmm8,%xmm0
	pxor %xmm9,%xmm1
.byte 102,15,56,0,198			// pshufb %xmm6,%xmm0
.byte 102,15,56,0,206			// pshufb %xmm6,%xmm1
	paddd %xmm0,%xmm4
	paddd %xmm1,%xmm5
	pxor %xmm4,%xmm12
	pxor %xmm5,%xmm13
	movdqa %xmm12,%xmm7
	pslld $7,%xmm12
	psrld $25,%xmm7
	movdqa %xmm13,%xmm6
	pslld $7,%xmm13
	por %xmm7,%xmm12
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm13
	// Swap words 8,9 out and words 10,11 in.
	movdqa %xmm4,0(%rsp)
	movdqa %xmm5,16(%rsp)
	movdqa 32(%rsp),%xmm4
	movdqa 48(%rsp),%xmm5
	// Columns (2,6,10,14) and (3,7,11,15).
	paddd %xmm14,%xmm10
	paddd %xmm15,%xmm11
	pxor %xmm10,%xmm2
	pxor %xmm11,%xmm3
.byte 102,15,56,0,215			// pshufb %xmm7,%xmm2
.byte 102,15,56,0,223			// pshufb %xmm7,%xmm3
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	pxor %xmm4,%xmm14
	pxor %xmm5,%xmm15
	movdqa %xmm14,%xmm6
	pslld $12,%xmm14
	psrld $20,%xmm6
	movdqa %xmm15,%xmm7
	pslld $12,%xmm15
	por %xmm6,%xmm14
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm15
	paddd %xmm14,%xmm10
	paddd %xmm15,%xmm11
	pxor %xmm10,%xmm2
	pxor %xmm11,%xmm3
.byte 102,15,56,0,214			// pshufb %xmm6,%xmm2
.byte 102,15,56,0,222			// pshufb %xmm6,%xmm3
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	pxor %xmm4,%xmm14
	pxor %xmm5,%xmm15
	movdqa %xmm14,%xmm7
	pslld $7,%xmm14
	psrld $25,%xmm7
	movdqa %xmm15,%xmm6
	pslld $7,%xmm15
	por %xmm7,%xmm14
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm15
	// Diagonals (0,5,10,15) and (1,6,11,12).
	paddd %xmm13,%xmm8
	paddd %xmm14,%xmm9
	pxor %xmm8,%xmm3
	pxor %xmm9,%xmm0
.byte 102,15,56,0,223			// pshufb %xmm7,%xmm3
.byte 102,15,56,0,199			// pshufb %xmm7,%xmm0
	paddd %xmm3,%xmm4
	paddd %xmm0,%xmm5
	pxor %xmm4,%xmm13
	pxor %xmm5,%xmm14
	movdqa %xmm13,%xmm6
	pslld $12,%xmm13
	psrld $20,%xmm6
	movdqa %xmm14,%xmm7
	pslld $12,%xmm14
	por %xmm6,%xmm13
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm14
	paddd %xmm13,%xmm8
	paddd %xmm14,%xmm9
	pxor %xmm8,%xmm3
	pxor %xmm9,%xmm0
.byte 102,15,56,0,222			// pshufb %xmm6,%xmm3
.byte 102,15,56,0,198			// pshufb %xmm6,%xmm0
	paddd %xmm3,%xmm4
	paddd %xmm0,%xmm5
	pxor %xmm4,%xmm13
	pxor %xmm5,%xmm14
	movdqa %xmm13,%xmm7
	pslld $7,%xmm13
	psrld $25,%xmm7
	movdqa %xmm14,%xmm6
	pslld $7,%xmm14
	por %xmm7,%xmm13
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm14
	// Swap words 10,11 out and words 8,9 back in.
	movdqa %xmm4,32(%rsp)
	movdqa %xmm5,48(%rsp)
	movdqa 0(%rsp),%xmm4
	movdqa 16(%rsp),%xmm5
	// Diagonals (2,7,8,13) and (3,4,9,14).
	paddd %xmm15,%xmm10
	paddd %xmm12,%xmm11
	pxor %xmm10,%xmm1
	pxor %xmm11,%xmm2
.byte 102,15,56,0,207			// pshufb %xmm7,%xmm1
.byte 102,15,56,0,215			// pshufb %xmm7,%xmm2
	paddd %xmm1,%xmm4
	paddd %xmm2,%xmm5
	pxor %xmm4,%xmm15
	pxor %xmm5,%xmm12
	movdqa %xmm15,%xmm6
	pslld $12,%xmm15
	psrld $20,%xmm6
	movdqa %xmm12,%xmm7
	pslld $12,%xmm12
	por %xmm6,%xmm15
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm12
	paddd %xmm15,%xmm10
	paddd %xmm12,%xmm11
	pxor %xmm10,%xmm1
	pxor %xmm11,%xmm2
.byte 102,15,56,0,206			// pshufb %xmm6,%xmm1
.byte 102,15,56,0,214			// pshufb %xmm6,%xmm2
	paddd %xmm1,%xmm4
	paddd %xmm2,%xmm5
	pxor %xmm4,%xmm15
	pxor %xmm5,%xmm12
	movdqa %xmm15,%xmm7
	pslld $7,%xmm15
	psrld $25,%xmm7
	movdqa %xmm12,%xmm6
	pslld $7,%xmm12
	por %xmm7,%xmm15
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm12
	decl %eax
	jnz L$oop4x

	// Feed-forward and 4x4 dword transposes: convert word-per-register
	// layout into 16-byte pieces of each block's keystream.
	paddd 64(%rsp),%xmm8
	paddd 80(%rsp),%xmm9
	paddd 96(%rsp),%xmm10
	paddd 112(%rsp),%xmm11

	movdqa %xmm8,%xmm6
	punpckldq %xmm9,%xmm8
	movdqa %xmm10,%xmm7
	punpckldq %xmm11,%xmm10
	punpckhdq %xmm9,%xmm6
	punpckhdq %xmm11,%xmm7
	movdqa %xmm8,%xmm9
	punpcklqdq %xmm10,%xmm8
	movdqa %xmm6,%xmm11
	punpcklqdq %xmm7,%xmm6
	punpckhqdq %xmm10,%xmm9
	punpckhqdq %xmm7,%xmm11
	paddd 128-256(%rcx),%xmm12
	paddd 144-256(%rcx),%xmm13
	paddd 160-256(%rcx),%xmm14
	paddd 176-256(%rcx),%xmm15

	// Park row 0 of blocks 0,1 and fetch the spilled words 10,11.
	movdqa %xmm8,0(%rsp)
	movdqa %xmm9,16(%rsp)
	movdqa 32(%rsp),%xmm8
	movdqa 48(%rsp),%xmm9

	movdqa %xmm12,%xmm10
	punpckldq %xmm13,%xmm12
	movdqa %xmm14,%xmm7
	punpckldq %xmm15,%xmm14
	punpckhdq %xmm13,%xmm10
	punpckhdq %xmm15,%xmm7
	movdqa %xmm12,%xmm13
	punpcklqdq %xmm14,%xmm12
	movdqa %xmm10,%xmm15
	punpcklqdq %xmm7,%xmm10
	punpckhqdq %xmm14,%xmm13
	punpckhqdq %xmm7,%xmm15
	paddd 192-256(%rcx),%xmm4
	paddd 208-256(%rcx),%xmm5
	paddd 224-256(%rcx),%xmm8
	paddd 240-256(%rcx),%xmm9

	// Park row 0 of blocks 2,3.
	movdqa %xmm6,32(%rsp)
	movdqa %xmm11,48(%rsp)

	movdqa %xmm4,%xmm14
	punpckldq %xmm5,%xmm4
	movdqa %xmm8,%xmm7
	punpckldq %xmm9,%xmm8
	punpckhdq %xmm5,%xmm14
	punpckhdq %xmm9,%xmm7
	movdqa %xmm4,%xmm5
	punpcklqdq %xmm8,%xmm4
	movdqa %xmm14,%xmm9
	punpcklqdq %xmm7,%xmm14
	punpckhqdq %xmm8,%xmm5
	punpckhqdq %xmm7,%xmm9
	paddd 256-256(%rcx),%xmm0
	paddd 272-256(%rcx),%xmm1
	paddd 288-256(%rcx),%xmm2
	paddd 304-256(%rcx),%xmm3

	movdqa %xmm0,%xmm8
	punpckldq %xmm1,%xmm0
	movdqa %xmm2,%xmm7
	punpckldq %xmm3,%xmm2
	punpckhdq %xmm1,%xmm8
	punpckhdq %xmm3,%xmm7
	movdqa %xmm0,%xmm1
	punpcklqdq %xmm2,%xmm0
	movdqa %xmm8,%xmm3
	punpcklqdq %xmm7,%xmm8
	punpckhqdq %xmm2,%xmm1
	punpckhqdq %xmm7,%xmm3
	// Keystream now lives in (row0,row1,row2,row3) order as:
	//   block 0: 0(%rsp),  %xmm12, %xmm4,  %xmm0
	//   block 1: 16(%rsp), %xmm13, %xmm5,  %xmm1
	//   block 2: 32(%rsp), %xmm10, %xmm14, %xmm8
	//   block 3: 48(%rsp), %xmm15, %xmm9,  %xmm3
	cmpq $256,%rdx
	jb L$tail4x

	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7

	movdqu %xmm6,64(%rdi)
	movdqu 0(%rsi),%xmm6
	movdqu %xmm11,80(%rdi)
	movdqu 16(%rsi),%xmm11
	movdqu %xmm2,96(%rdi)
	movdqu 32(%rsi),%xmm2
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi
	movdqu 48(%rsi),%xmm7
	pxor 32(%rsp),%xmm6
	pxor %xmm10,%xmm11
	pxor %xmm14,%xmm2
	pxor %xmm8,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 48(%rsp),%xmm6
	pxor %xmm15,%xmm11
	pxor %xmm9,%xmm2
	pxor %xmm3,%xmm7
	movdqu %xmm6,64(%rdi)
	movdqu %xmm11,80(%rdi)
	movdqu %xmm2,96(%rdi)
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi

	subq $256,%rdx
	jnz L$oop_outer4x

	jmp L$done4x

	// Fewer than 256 bytes left. Each *_or_more path emits its whole
	// blocks; its je then tests ZF from the matching cmpq below (the SSE
	// moves and pxor in between leave the flags untouched), i.e. "length
	// was exactly that many bytes". Otherwise the next keystream block is
	// written to 0..63(%rsp) for the byte-wise loop.
L$tail4x:
	cmpq $192,%rdx
	jae L$192_or_more4x
	cmpq $128,%rdx
	jae L$128_or_more4x
	cmpq $64,%rdx
	jae L$64_or_more4x

	xorq %r10,%r10

	// 0(%rsp) already holds row 0 of block 0.
	movdqa %xmm12,16(%rsp)
	movdqa %xmm4,32(%rsp)
	movdqa %xmm0,48(%rsp)
	jmp L$oop_tail4x

.p2align 5
L$64_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7
	movdqu %xmm6,0(%rdi)
	movdqu %xmm11,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm7,48(%rdi)
	je L$done4x

	movdqa 16(%rsp),%xmm6
	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm13,16(%rsp)
	leaq 64(%rdi),%rdi
	movdqa %xmm5,32(%rsp)
	subq $64,%rdx
	movdqa %xmm1,48(%rsp)
	jmp L$oop_tail4x

.p2align 5
L$128_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7
	movdqu %xmm6,64(%rdi)
	movdqu %xmm11,80(%rdi)
	movdqu %xmm2,96(%rdi)
	movdqu %xmm7,112(%rdi)
	je L$done4x

	movdqa 32(%rsp),%xmm6
	leaq 128(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm10,16(%rsp)
	leaq 128(%rdi),%rdi
	movdqa %xmm14,32(%rsp)
	subq $128,%rdx
	movdqa %xmm8,48(%rsp)
	jmp L$oop_tail4x

.p2align 5
L$192_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7

	movdqu %xmm6,64(%rdi)
	movdqu 0(%rsi),%xmm6
	movdqu %xmm11,80(%rdi)
	movdqu 16(%rsi),%xmm11
	movdqu %xmm2,96(%rdi)
	movdqu 32(%rsi),%xmm2
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi
	movdqu 48(%rsi),%xmm7
	pxor 32(%rsp),%xmm6
	pxor %xmm10,%xmm11
	pxor %xmm14,%xmm2
	pxor %xmm8,%xmm7
	movdqu %xmm6,0(%rdi)
	movdqu %xmm11,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm7,48(%rdi)
	je L$done4x

	// Pointers were already advanced by 128 above, hence only +64 here.
	movdqa 48(%rsp),%xmm6
	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm15,16(%rsp)
	leaq 64(%rdi),%rdi
	movdqa %xmm9,32(%rsp)
	subq $192,%rdx
	movdqa %xmm3,48(%rsp)

	// Byte-wise XOR of the remaining %rdx bytes with keystream at (%rsp).
L$oop_tail4x:
	movzbl (%rsi,%r10,1),%eax
	movzbl (%rsp,%r10,1),%ecx
	leaq 1(%r10),%r10
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r10,1)
	decq %rdx
	jnz L$oop_tail4x

L$done4x:
	leaq (%r9),%rsp

L$4x_epilogue:
	ret


// ---------------------------------------------------------------------------
// _ChaCha20_ctr32_avx2
//
// Eight blocks per outer iteration. Each YMM register holds the same state
// word for eight blocks (one block per 32-bit lane):
//   %ymm8-%ymm11   words 0-3      %ymm0-%ymm3    words 4-7
//   %ymm12,%ymm13  two of words 8-11 (the other two spill to the frame)
//   %ymm4-%ymm7    words 12-15    %ymm14,%ymm15  scratch / rotate masks
// %r10 -> L$rot16, %r11 -> L$rot24, %r9 = entry %rsp.
//
// Frame (0x280+8 bytes, then %rsp rounded down to a multiple of 32), with
// %rcx = %rsp+256 and %rax = %rsp+512 used as base pointers:
//     0..127(%rsp)  scratch: spilled words 8-11, later keystream pieces
//   128..255(%rsp)  broadcast words 0-3
//   256..383(%rsp)  broadcast words 4-7
//   384..511(%rsp)  broadcast words 8-11
//   512..639(%rsp)  words 12-15 (512: per-lane counters)
//
// NOTE(review): the end of this function lies beyond the visible chunk;
// the listing stops at the L$8x_epilogue label.
// ---------------------------------------------------------------------------
.globl _ChaCha20_ctr32_avx2
.private_extern _ChaCha20_ctr32_avx2

.p2align 5
_ChaCha20_ctr32_avx2:

_CET_ENDBR
	movq %rsp,%r9

	subq $0x280+8,%rsp
	andq $-32,%rsp
	vzeroupper

	vbroadcasti128 L$sigma(%rip),%ymm11
	vbroadcasti128 (%rcx),%ymm3
	vbroadcasti128 16(%rcx),%ymm15
	vbroadcasti128 (%r8),%ymm7
	leaq 256(%rsp),%rcx
	leaq 512(%rsp),%rax
	leaq L$rot16(%rip),%r10
	leaq L$rot24(%rip),%r11

	// Broadcast each dword of every row across a full register.
	vpshufd $0x00,%ymm11,%ymm8
	vpshufd $0x55,%ymm11,%ymm9
	vmovdqa %ymm8,128-256(%rcx)
	vpshufd $0xaa,%ymm11,%ymm10
	vmovdqa %ymm9,160-256(%rcx)
	vpshufd $0xff,%ymm11,%ymm11
	vmovdqa %ymm10,192-256(%rcx)
	vmovdqa %ymm11,224-256(%rcx)

	vpshufd $0x00,%ymm3,%ymm0
	vpshufd $0x55,%ymm3,%ymm1
	vmovdqa %ymm0,256-256(%rcx)
	vpshufd $0xaa,%ymm3,%ymm2
	vmovdqa %ymm1,288-256(%rcx)
	vpshufd $0xff,%ymm3,%ymm3
	vmovdqa %ymm2,320-256(%rcx)
	vmovdqa %ymm3,352-256(%rcx)

	vpshufd $0x00,%ymm15,%ymm12
	vpshufd $0x55,%ymm15,%ymm13
	vmovdqa %ymm12,384-512(%rax)
	vpshufd $0xaa,%ymm15,%ymm14
	vmovdqa %ymm13,416-512(%rax)
	vpshufd $0xff,%ymm15,%ymm15
	vmovdqa %ymm14,448-512(%rax)
	vmovdqa %ymm15,480-512(%rax)

	vpshufd $0x00,%ymm7,%ymm4
	vpshufd $0x55,%ymm7,%ymm5
	vpaddd L$incy(%rip),%ymm4,%ymm4	// per-lane counter offsets
	vpshufd $0xaa,%ymm7,%ymm6
	vmovdqa %ymm5,544-512(%rax)
	vpshufd $0xff,%ymm7,%ymm7
	vmovdqa %ymm6,576-512(%rax)
	vmovdqa %ymm7,608-512(%rax)

	jmp L$oop_enter8x

	// Subsequent groups: reload the broadcast state, counters += 8.
.p2align 5
L$oop_outer8x:
	vmovdqa 128-256(%rcx),%ymm8
	vmovdqa 160-256(%rcx),%ymm9
	vmovdqa 192-256(%rcx),%ymm10
	vmovdqa 224-256(%rcx),%ymm11
	vmovdqa 256-256(%rcx),%ymm0
	vmovdqa 288-256(%rcx),%ymm1
	vmovdqa 320-256(%rcx),%ymm2
	vmovdqa 352-256(%rcx),%ymm3
	vmovdqa 384-512(%rax),%ymm12
	vmovdqa 416-512(%rax),%ymm13
	vmovdqa 448-512(%rax),%ymm14
	vmovdqa 480-512(%rax),%ymm15
	vmovdqa 512-512(%rax),%ymm4
	vmovdqa 544-512(%rax),%ymm5
	vmovdqa 576-512(%rax),%ymm6
	vmovdqa 608-512(%rax),%ymm7
	vpaddd L$eight(%rip),%ymm4,%ymm4

L$oop_enter8x:
	// Words 10,11 spill so %ymm14/%ymm15 can serve as scratch and masks.
	vmovdqa %ymm14,64(%rsp)
	vmovdqa %ymm15,96(%rsp)
	vbroadcasti128 (%r10),%ymm15
	vmovdqa %ymm4,512-512(%rax)	// remember the counters used by this group
	// %eax becomes the round counter; this destroys the %rax base pointer,
	// which is rebuilt with leaq right after the round loop.
	movl $10,%eax
	jmp L$oop8x

	// One iteration = column quarter-rounds, then diagonal quarter-rounds.
	// Rotations by 16 and 8 use vpshufb with the broadcast masks; rotations
	// by 12 and 7 use shift/shift/or.
.p2align 5
L$oop8x:
	// Columns (0,4,8,12) and (1,5,9,13).
	vpaddd %ymm0,%ymm8,%ymm8
	vpxor %ymm4,%ymm8,%ymm4
	vpshufb %ymm15,%ymm4,%ymm4
	vpaddd %ymm1,%ymm9,%ymm9
	vpxor %ymm5,%ymm9,%ymm5
	vpshufb %ymm15,%ymm5,%ymm5
	vpaddd %ymm4,%ymm12,%ymm12
	vpxor %ymm0,%ymm12,%ymm0
	vpslld $12,%ymm0,%ymm14
	vpsrld $20,%ymm0,%ymm0
	vpor %ymm0,%ymm14,%ymm0
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm5,%ymm13,%ymm13
	vpxor %ymm1,%ymm13,%ymm1
	vpslld $12,%ymm1,%ymm15
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm1,%ymm15,%ymm1
	vpaddd %ymm0,%ymm8,%ymm8
	vpxor %ymm4,%ymm8,%ymm4
	vpshufb %ymm14,%ymm4,%ymm4
	vpaddd %ymm1,%ymm9,%ymm9
	vpxor %ymm5,%ymm9,%ymm5
	vpshufb %ymm14,%ymm5,%ymm5
	vpaddd %ymm4,%ymm12,%ymm12
	vpxor %ymm0,%ymm12,%ymm0
	vpslld $7,%ymm0,%ymm15
	vpsrld $25,%ymm0,%ymm0
	vpor %ymm0,%ymm15,%ymm0
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm5,%ymm13,%ymm13
	vpxor %ymm1,%ymm13,%ymm1
	vpslld $7,%ymm1,%ymm14
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm1,%ymm14,%ymm1
	// Swap words 8,9 out and words 10,11 in.
	vmovdqa %ymm12,0(%rsp)
	vmovdqa %ymm13,32(%rsp)
	vmovdqa 64(%rsp),%ymm12
	vmovdqa 96(%rsp),%ymm13
	// Columns (2,6,10,14) and (3,7,11,15).
	vpaddd %ymm2,%ymm10,%ymm10
	vpxor %ymm6,%ymm10,%ymm6
	vpshufb %ymm15,%ymm6,%ymm6
	vpaddd %ymm3,%ymm11,%ymm11
	vpxor %ymm7,%ymm11,%ymm7
	vpshufb %ymm15,%ymm7,%ymm7
	vpaddd %ymm6,%ymm12,%ymm12
	vpxor %ymm2,%ymm12,%ymm2
	vpslld $12,%ymm2,%ymm14
	vpsrld $20,%ymm2,%ymm2
	vpor %ymm2,%ymm14,%ymm2
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm7,%ymm13,%ymm13
	vpxor %ymm3,%ymm13,%ymm3
	vpslld $12,%ymm3,%ymm15
	vpsrld $20,%ymm3,%ymm3
	vpor %ymm3,%ymm15,%ymm3
	vpaddd %ymm2,%ymm10,%ymm10
	vpxor %ymm6,%ymm10,%ymm6
	vpshufb %ymm14,%ymm6,%ymm6
	vpaddd %ymm3,%ymm11,%ymm11
	vpxor %ymm7,%ymm11,%ymm7
	vpshufb %ymm14,%ymm7,%ymm7
	vpaddd %ymm6,%ymm12,%ymm12
	vpxor %ymm2,%ymm12,%ymm2
	vpslld $7,%ymm2,%ymm15
	vpsrld $25,%ymm2,%ymm2
	vpor %ymm2,%ymm15,%ymm2
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm7,%ymm13,%ymm13
	vpxor %ymm3,%ymm13,%ymm3
	vpslld $7,%ymm3,%ymm14
	vpsrld $25,%ymm3,%ymm3
	vpor %ymm3,%ymm14,%ymm3
	// Diagonals (0,5,10,15) and (1,6,11,12).
	vpaddd %ymm1,%ymm8,%ymm8
	vpxor %ymm7,%ymm8,%ymm7
	vpshufb %ymm15,%ymm7,%ymm7
	vpaddd %ymm2,%ymm9,%ymm9
	vpxor %ymm4,%ymm9,%ymm4
	vpshufb %ymm15,%ymm4,%ymm4
	vpaddd %ymm7,%ymm12,%ymm12
	vpxor %ymm1,%ymm12,%ymm1
	vpslld $12,%ymm1,%ymm14
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm1,%ymm14,%ymm1
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm4,%ymm13,%ymm13
	vpxor %ymm2,%ymm13,%ymm2
	vpslld $12,%ymm2,%ymm15
	vpsrld $20,%ymm2,%ymm2
	vpor %ymm2,%ymm15,%ymm2
	vpaddd %ymm1,%ymm8,%ymm8
	vpxor %ymm7,%ymm8,%ymm7
	vpshufb %ymm14,%ymm7,%ymm7
	vpaddd %ymm2,%ymm9,%ymm9
	vpxor %ymm4,%ymm9,%ymm4
	vpshufb %ymm14,%ymm4,%ymm4
	vpaddd %ymm7,%ymm12,%ymm12
	vpxor %ymm1,%ymm12,%ymm1
	vpslld $7,%ymm1,%ymm15
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm1,%ymm15,%ymm1
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm4,%ymm13,%ymm13
	vpxor %ymm2,%ymm13,%ymm2
	vpslld $7,%ymm2,%ymm14
	vpsrld $25,%ymm2,%ymm2
	vpor %ymm2,%ymm14,%ymm2
	// Swap words 10,11 out and words 8,9 back in.
	vmovdqa %ymm12,64(%rsp)
	vmovdqa %ymm13,96(%rsp)
	vmovdqa 0(%rsp),%ymm12
	vmovdqa 32(%rsp),%ymm13
	// Diagonals (2,7,8,13) and (3,4,9,14).
	vpaddd %ymm3,%ymm10,%ymm10
	vpxor %ymm5,%ymm10,%ymm5
	vpshufb %ymm15,%ymm5,%ymm5
	vpaddd %ymm0,%ymm11,%ymm11
	vpxor %ymm6,%ymm11,%ymm6
	vpshufb %ymm15,%ymm6,%ymm6
	vpaddd %ymm5,%ymm12,%ymm12
	vpxor %ymm3,%ymm12,%ymm3
	vpslld $12,%ymm3,%ymm14
	vpsrld $20,%ymm3,%ymm3
	vpor %ymm3,%ymm14,%ymm3
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm6,%ymm13,%ymm13
	vpxor %ymm0,%ymm13,%ymm0
	vpslld $12,%ymm0,%ymm15
	vpsrld $20,%ymm0,%ymm0
	vpor %ymm0,%ymm15,%ymm0
	vpaddd %ymm3,%ymm10,%ymm10
	vpxor %ymm5,%ymm10,%ymm5
	vpshufb %ymm14,%ymm5,%ymm5
	vpaddd %ymm0,%ymm11,%ymm11
	vpxor %ymm6,%ymm11,%ymm6
	vpshufb %ymm14,%ymm6,%ymm6
	vpaddd %ymm5,%ymm12,%ymm12
	vpxor %ymm3,%ymm12,%ymm3
	vpslld $7,%ymm3,%ymm15
	vpsrld $25,%ymm3,%ymm3
	vpor %ymm3,%ymm15,%ymm3
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm6,%ymm13,%ymm13
	vpxor %ymm0,%ymm13,%ymm0
	vpslld $7,%ymm0,%ymm14
	vpsrld $25,%ymm0,%ymm0
	vpor %ymm0,%ymm14,%ymm0
	decl %eax
	jnz L$oop8x

	// Rebuild the base pointer destroyed by the round counter, then
	// feed-forward and transpose (dword/qword unpacks plus 128-bit lane
	// permutes) into 32-byte pieces of keystream.
	leaq 512(%rsp),%rax
	vpaddd 128-256(%rcx),%ymm8,%ymm8
	vpaddd 160-256(%rcx),%ymm9,%ymm9
	vpaddd 192-256(%rcx),%ymm10,%ymm10
	vpaddd 224-256(%rcx),%ymm11,%ymm11

	vpunpckldq %ymm9,%ymm8,%ymm14
	vpunpckldq %ymm11,%ymm10,%ymm15
	vpunpckhdq %ymm9,%ymm8,%ymm8
	vpunpckhdq %ymm11,%ymm10,%ymm10
	vpunpcklqdq %ymm15,%ymm14,%ymm9
	vpunpckhqdq %ymm15,%ymm14,%ymm14
	vpunpcklqdq %ymm10,%ymm8,%ymm11
	vpunpckhqdq %ymm10,%ymm8,%ymm8
	vpaddd 256-256(%rcx),%ymm0,%ymm0
	vpaddd 288-256(%rcx),%ymm1,%ymm1
	vpaddd 320-256(%rcx),%ymm2,%ymm2
	vpaddd 352-256(%rcx),%ymm3,%ymm3

	vpunpckldq %ymm1,%ymm0,%ymm10
	vpunpckldq %ymm3,%ymm2,%ymm15
	vpunpckhdq %ymm1,%ymm0,%ymm0
	vpunpckhdq %ymm3,%ymm2,%ymm2
	vpunpcklqdq %ymm15,%ymm10,%ymm1
	vpunpckhqdq %ymm15,%ymm10,%ymm10
	vpunpcklqdq %ymm2,%ymm0,%ymm3
	vpunpckhqdq %ymm2,%ymm0,%ymm0
	vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
	vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
	vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
	vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
	vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
	vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
	vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
	vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
	// Park two finished pieces and fetch the spilled words 10,11.
	vmovdqa %ymm15,0(%rsp)
	vmovdqa %ymm9,32(%rsp)
	vmovdqa 64(%rsp),%ymm15
	vmovdqa 96(%rsp),%ymm9

	vpaddd 384-512(%rax),%ymm12,%ymm12
	vpaddd 416-512(%rax),%ymm13,%ymm13
	vpaddd 448-512(%rax),%ymm15,%ymm15
	vpaddd 480-512(%rax),%ymm9,%ymm9

	vpunpckldq %ymm13,%ymm12,%ymm2
	vpunpckldq %ymm9,%ymm15,%ymm8
	vpunpckhdq %ymm13,%ymm12,%ymm12
	vpunpckhdq %ymm9,%ymm15,%ymm15
	vpunpcklqdq %ymm8,%ymm2,%ymm13
	vpunpckhqdq %ymm8,%ymm2,%ymm2
	vpunpcklqdq %ymm15,%ymm12,%ymm9
	vpunpckhqdq %ymm15,%ymm12,%ymm12
	vpaddd 512-512(%rax),%ymm4,%ymm4
	vpaddd 544-512(%rax),%ymm5,%ymm5
	vpaddd 576-512(%rax),%ymm6,%ymm6
	vpaddd 608-512(%rax),%ymm7,%ymm7

	vpunpckldq %ymm5,%ymm4,%ymm15
	vpunpckldq %ymm7,%ymm6,%ymm8
	vpunpckhdq %ymm5,%ymm4,%ymm4
	vpunpckhdq %ymm7,%ymm6,%ymm6
	vpunpcklqdq %ymm8,%ymm15,%ymm5
	vpunpckhqdq %ymm8,%ymm15,%ymm15
	vpunpcklqdq %ymm6,%ymm4,%ymm7
	vpunpckhqdq %ymm6,%ymm4,%ymm4
	vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
	vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
	vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
	vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
	vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
	vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
	vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
	vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
	vmovdqa 0(%rsp),%ymm6
	vmovdqa 32(%rsp),%ymm12

	// Keystream order, 32 bytes per register:
	//   ymm6, ymm8, ymm1, ymm5, ymm12, ymm13, ymm10, ymm15,
	//   ymm14, ymm2, ymm3, ymm7, ymm11, ymm9, ymm0, ymm4
	cmpq $512,%rdx
	jb L$tail8x

	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	leaq 128(%rsi),%rsi
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	leaq 128(%rdi),%rdi

	vpxor 0(%rsi),%ymm12,%ymm12
	vpxor 32(%rsi),%ymm13,%ymm13
	vpxor 64(%rsi),%ymm10,%ymm10
	vpxor 96(%rsi),%ymm15,%ymm15
	leaq 128(%rsi),%rsi
	vmovdqu %ymm12,0(%rdi)
	vmovdqu %ymm13,32(%rdi)
	vmovdqu %ymm10,64(%rdi)
	vmovdqu %ymm15,96(%rdi)
	leaq 128(%rdi),%rdi

	vpxor 0(%rsi),%ymm14,%ymm14
	vpxor 32(%rsi),%ymm2,%ymm2
	vpxor 64(%rsi),%ymm3,%ymm3
	vpxor 96(%rsi),%ymm7,%ymm7
	leaq 128(%rsi),%rsi
	vmovdqu %ymm14,0(%rdi)
	vmovdqu %ymm2,32(%rdi)
	vmovdqu %ymm3,64(%rdi)
	vmovdqu %ymm7,96(%rdi)
	leaq 128(%rdi),%rdi

	vpxor 0(%rsi),%ymm11,%ymm11
	vpxor 32(%rsi),%ymm9,%ymm9
	vpxor 64(%rsi),%ymm0,%ymm0
	vpxor 96(%rsi),%ymm4,%ymm4
	leaq 128(%rsi),%rsi
	vmovdqu %ymm11,0(%rdi)
	vmovdqu %ymm9,32(%rdi)
	vmovdqu %ymm0,64(%rdi)
	vmovdqu %ymm4,96(%rdi)
	leaq 128(%rdi),%rdi

	subq $512,%rdx
	jnz L$oop_outer8x

	jmp L$done8x

	// Fewer than 512 bytes left. Each *_or_more path emits its whole
	// 64-byte blocks; its je then tests ZF from the matching cmpq below
	// (the vector instructions in between leave the flags untouched).
	// Otherwise the next 64 bytes of keystream go to 0..63(%rsp) for the
	// byte-wise loop.
L$tail8x:
	cmpq $448,%rdx
	jae L$448_or_more8x
	cmpq $384,%rdx
	jae L$384_or_more8x
	cmpq $320,%rdx
	jae L$320_or_more8x
	cmpq $256,%rdx
	jae L$256_or_more8x
	cmpq $192,%rdx
	jae L$192_or_more8x
	cmpq $128,%rdx
	jae L$128_or_more8x
	cmpq $64,%rdx
	jae L$64_or_more8x

	xorq %r10,%r10
	vmovdqa %ymm6,0(%rsp)
	vmovdqa %ymm8,32(%rsp)
	jmp L$oop_tail8x

.p2align 5
L$64_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	je L$done8x

	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm1,0(%rsp)
	leaq 64(%rdi),%rdi
	subq $64,%rdx
	vmovdqa %ymm5,32(%rsp)
	jmp L$oop_tail8x

.p2align 5
L$128_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	je L$done8x

	leaq 128(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm12,0(%rsp)
	leaq 128(%rdi),%rdi
	subq $128,%rdx
	vmovdqa %ymm13,32(%rsp)
	jmp L$oop_tail8x

.p2align 5
L$192_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	je L$done8x

	leaq 192(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm10,0(%rsp)
	leaq 192(%rdi),%rdi
	subq $192,%rdx
	vmovdqa %ymm15,32(%rsp)
	jmp L$oop_tail8x

.p2align 5
L$256_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	je L$done8x

	leaq 256(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm14,0(%rsp)
	leaq 256(%rdi),%rdi
	subq $256,%rdx
	vmovdqa %ymm2,32(%rsp)
	jmp L$oop_tail8x

.p2align 5
L$320_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vpxor 256(%rsi),%ymm14,%ymm14
	vpxor 288(%rsi),%ymm2,%ymm2
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	vmovdqu %ymm14,256(%rdi)
	vmovdqu %ymm2,288(%rdi)
	je L$done8x

	leaq 320(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm3,0(%rsp)
	leaq 320(%rdi),%rdi
	subq $320,%rdx
	vmovdqa %ymm7,32(%rsp)
	jmp L$oop_tail8x

.p2align 5
L$384_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vpxor 256(%rsi),%ymm14,%ymm14
	vpxor 288(%rsi),%ymm2,%ymm2
	vpxor 320(%rsi),%ymm3,%ymm3
	vpxor 352(%rsi),%ymm7,%ymm7
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	vmovdqu %ymm14,256(%rdi)
	vmovdqu %ymm2,288(%rdi)
	vmovdqu %ymm3,320(%rdi)
	vmovdqu %ymm7,352(%rdi)
	je L$done8x

	leaq 384(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm11,0(%rsp)
	leaq 384(%rdi),%rdi
	subq $384,%rdx
	vmovdqa %ymm9,32(%rsp)
	jmp L$oop_tail8x

.p2align 5
L$448_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vpxor 256(%rsi),%ymm14,%ymm14
	vpxor 288(%rsi),%ymm2,%ymm2
	vpxor 320(%rsi),%ymm3,%ymm3
	vpxor 352(%rsi),%ymm7,%ymm7
	vpxor 384(%rsi),%ymm11,%ymm11
	vpxor 416(%rsi),%ymm9,%ymm9
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	vmovdqu %ymm14,256(%rdi)
	vmovdqu %ymm2,288(%rdi)
	vmovdqu %ymm3,320(%rdi)
	vmovdqu %ymm7,352(%rdi)
	vmovdqu %ymm11,384(%rdi)
	vmovdqu %ymm9,416(%rdi)
	je L$done8x

	leaq 448(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm0,0(%rsp)
	leaq 448(%rdi),%rdi
	subq $448,%rdx
	vmovdqa %ymm4,32(%rsp)

	// Byte-wise XOR of the remaining %rdx bytes with keystream at (%rsp).
L$oop_tail8x:
	movzbl (%rsi,%r10,1),%eax
	movzbl (%rsp,%r10,1),%ecx
	leaq 1(%r10),%r10
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r10,1)
	decq %rdx
	jnz L$oop_tail8x

L$done8x:
	// vzeroall clears every vector register before the stack is released.
	vzeroall
	leaq (%r9),%rsp

L$8x_epilogue:
ret 1602 1603 1604#endif 1605