// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <ring-core/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
.text



// ---------------------------------------------------------------------------
// Constant pool.
//
// Syntax: AT&T (source, destination), Mach-O local labels spelled L$name.
// Sequences of the form ".byte 102,15,56,0,N" in the code below are
// hand-encoded pshufb instructions (66 0F 38 00 /r) that use the L$rot16 and
// L$rot24 masks.
// ---------------------------------------------------------------------------
.section	__DATA,__const
.p2align	6
L$zero:
.long	0,0,0,0
// Added to the counter row once per 64-byte block by the one-block paths.
L$one:
.long	1,0,0,0
// Per-lane counter offsets for the four-block path.
L$inc:
.long	0,1,2,3
// Counter step between iterations of the four-block path.
L$four:
.long	4,4,4,4
// Per-lane counter offsets for the eight-block path.
L$incy:
.long	0,2,4,6,1,3,5,7
// Counter step between iterations of the eight-block path.
L$eight:
.long	8,8,8,8,8,8,8,8
// pshufb mask: rotates every 32-bit lane left by 16 bits.
L$rot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
// pshufb mask: rotates every 32-bit lane left by 8 bits (right by 24).
L$rot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
// ASCII "expand 32-byte k" followed by a NUL byte; the same four dwords appear
// as immediates (0x61707865, ...) in the scalar path.
L$sigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.p2align	6
// The next four tables are not referenced by the code visible in this part
// of the file.
L$zeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
L$fourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
L$incz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
L$sixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
// ASCII "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated.
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.text
.globl	_ChaCha20_ctr32
.private_extern _ChaCha20_ctr32

// ---------------------------------------------------------------------------
// _ChaCha20_ctr32
//
// XORs a ChaCha20 keystream into a buffer. Register use, as read by the code:
//   %rdi  destination pointer (bytes are stored here)
//   %rsi  source pointer (bytes are loaded from here)
//   %rdx  byte count; the routine returns immediately when it is zero
//   %rcx  pointer to 32 bytes that become state words 4..11
//   %r8   pointer to 16 bytes that become state words 12..15; the first dword
//         is advanced by one per 64-byte block (a 32-bit add, no carry out)
// NOTE(review): presumably matches the C prototype
//   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t len,
//                       const uint32_t key[8], const uint32_t counter[4]);
// confirm against the declaring header.
//
// Dispatch: bit 9 of the dword at _OPENSSL_ia32cap_P+4 selects the SIMD paths
// (the branch target is named L$ChaCha20_ssse3); otherwise the scalar path
// below runs.
//
// Scalar path stack frame (88 bytes below the six saved registers):
//    0..63(%rsp)   keystream block staging, used only by the tail
//   16..31(%rsp)   state words 4..7
//   32..47(%rsp)   state words 8..11; reused as spill slots inside the rounds
//   48..63(%rsp)   state words 12..15 (word 12 is the block counter)
//   64+0 (%rsp)    saved remaining length
//   64+8 (%rsp)    saved source pointer
//   64+16(%rsp)    saved destination pointer
// ---------------------------------------------------------------------------
.p2align	6
_ChaCha20_ctr32:

_CET_ENDBR
	cmpq	$0,%rdx
	je	L$no_data
	// Loads two capability dwords at once: +4 in the low half, +8 in the high half.
	movq	_OPENSSL_ia32cap_P+4(%rip),%r10
	testl	$512,%r10d
	jnz	L$ChaCha20_ssse3

	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

	// Return address + six pushes + 88 bytes leaves %rsp 16-byte aligned for movdqa.
	subq	$64+24,%rsp

L$ctr32_body:


	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	L$one(%rip),%xmm4


	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	// %rbp tracks the remaining byte count between blocks.
	movq	%rdx,%rbp
	jmp	L$oop_outer

// One iteration per 64-byte block. Working state: words 0..3 in
// %eax/%ebx/%ecx/%edx, words 4..7 in %r8d..%r11d, words 12..15 in
// %r12d..%r15d, and two of words 8..11 in %esi/%edi at any time while the
// other two sit in their slots at 32..44(%rsp).
.p2align	5
L$oop_outer:
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	// Word 12 comes from %xmm3, which carries the current block counter.
	movd	%xmm3,%r12d
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	movq	%rbp,64+0(%rsp)
	// %ebp is reused as the loop counter: 10 iterations of L$oop.
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
	// Hand-encoded movq %xmm2,%rsi: words 8 and 9 land in %rsi, then split
	// into %esi (word 8) and %edi (word 9).
.byte	102,72,15,126,214
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi
	jmp	L$oop

// Each iteration applies eight add/xor/rotate quarter-round sequences with
// rotation amounts 16, 12, 8, 7: four on the column arrangement, then four on
// the diagonal arrangement.
.p2align	5
L$oop:
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	// Swap which pair of words 8..11 lives in %esi/%edi.
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	// Diagonal half starts here.
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	// Swap the spilled pair back.
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	L$oop
	// Flush words 8 and 9 so 32..47(%rsp) holds all four final words 8..11.
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	// Advance the block counter held in %xmm3 for the next block.
	paddd	%xmm4,%xmm3
	movq	64+16(%rsp),%rdi

	// Feed-forward: add the input state to the permuted state.
	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	// 48(%rsp) still holds the counter value used for this block.
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	// %xmm1 = input words 8..11 + permuted words 8..11.
	paddd	32(%rsp),%xmm1

	cmpq	$64,%rbp
	jb	L$tail

	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	// Restore input words 8..11 (the slot was used as scratch) and store the
	// advanced counter for the next block.
	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	L$oop_outer

	jmp	L$done

// Fewer than 64 bytes remain: write the whole keystream block to 0..63(%rsp)
// and XOR it into the data one byte at a time. %rbx is the byte index.
.p2align	4
L$tail:
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

L$oop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	// -1 compensates for %rbx having already been incremented.
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	L$oop_tail

// Restore the six callee-saved registers relative to the top of the frame
// and release the stack.
L$done:
	leaq	64+24+48(%rsp),%rsi
	movq	-48(%rsi),%r15

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$no_data:
	ret



// ---------------------------------------------------------------------------
// ChaCha20_ssse3 (file-local; reached by jump from _ChaCha20_ctr32)
//
// One 64-byte block per iteration, the whole state held as four rows in
// %xmm0..%xmm3. Entered with the same argument registers as _ChaCha20_ctr32
// and with %r10 still holding the capability dwords. Inputs longer than 128
// bytes are handed on to ChaCha20_4x.
//   %r9          original %rsp, restored on exit
//   %r8          loop counter (10 iterations of L$oop_ssse3 per block)
//   %xmm6/%xmm7  L$rot16 / L$rot24 shuffle masks
//   0..63(%rsp)  the four input rows; reused as keystream staging in the tail
// ---------------------------------------------------------------------------
.p2align	5
ChaCha20_ssse3:
L$ChaCha20_ssse3:

	movq	%rsp,%r9

	cmpq	$128,%rdx
	ja	L$ChaCha20_4x

// Also entered from ChaCha20_4x when it decides to fall back to this path.
L$do_sse3_after_all:
	subq	$64+8,%rsp
	movdqa	L$sigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	L$rot16(%rip),%xmm6
	movdqa	L$rot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	L$oop_ssse3

// Next block: reload the saved rows and add L$one to the saved counter row.
.p2align	5
L$oop_outer_ssse3:
	movdqa	L$one(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	L$oop_ssse3

// Rotations by 16 and 8 use pshufb with the masks in %xmm6/%xmm7; rotations
// by 12 and 7 use a shift pair plus por. The pshufd instructions rotate the
// lanes of rows 1..3 between the two halves of each iteration and back.
.p2align	5
L$oop_ssse3:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	// pshufb %xmm6,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	// pshufb %xmm7,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	// pshufb %xmm6,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	// pshufb %xmm7,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	L$oop_ssse3
	// Feed-forward: add the saved input rows.
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	L$tail_ssse3

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	L$oop_outer_ssse3

	jmp	L$done_ssse3

// Fewer than 64 bytes remain: stage the keystream block on the stack and XOR
// byte by byte. %r8 is the byte index.
.p2align	4
L$tail_ssse3:
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

L$oop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	L$oop_tail_ssse3

L$done_ssse3:
	leaq	(%r9),%rsp

L$ssse3_epilogue:
	ret



// ---------------------------------------------------------------------------
// ChaCha20_4x (file-local; reached by jump from ChaCha20_ssse3)
//
// Four 64-byte blocks per iteration. Each xmm register holds the same state
// word for four consecutive blocks (one block per 32-bit lane).
//   %r9    original %rsp (set again here), restored on exit
//   %r10   capability dwords on entry; later the address of L$rot16, and the
//          byte index in the tail
//   %r11   address of L$rot24
//   %rcx   repointed to 256(%rsp) so frame slots use short displacements
//   %eax   loop counter (10 iterations of L$oop4x)
// Frame (0x140+8 bytes):
//     0..63(%rsp)    spill area for the word 8..11 registers; later keystream
//                    staging
//    64..127(%rsp)   words 0..3, each broadcast to all lanes
//   128..255(%rsp)   words 4..11, each broadcast to all lanes
//   256..319(%rsp)   words 12..15; word 12 carries per-lane counters
// ---------------------------------------------------------------------------
.p2align	5
ChaCha20_4x:
L$ChaCha20_4x:

	movq	%rsp,%r9

	movq	%r10,%r11
	shrq	$32,%r10
	// Bit 5 of the dword at _OPENSSL_ia32cap_P+8 selects the eight-block path.
	// NOTE(review): presumably the AVX2 feature bit -- confirm.
	testq	$32,%r10
	jnz	L$ChaCha20_8x
	cmpq	$192,%rdx
	ja	L$proceed4x

	// For inputs of at most 192 bytes, fall back to the one-block path when,
	// of bits 22 and 26 of the dword at _OPENSSL_ia32cap_P+4, only bit 22 is set.
	// NOTE(review): looks like a CPU-family heuristic -- confirm against the
	// generating Perl script.
	andq	$71303168,%r11
	cmpq	$4194304,%r11
	je	L$do_sse3_after_all

L$proceed4x:
	subq	$0x140+8,%rsp
	movdqa	L$sigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx
	leaq	L$rot16(%rip),%r10
	leaq	L$rot24(%rip),%r11

	// Broadcast each of words 0..3 across a register and save it.
	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	// Words 4..7.
	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	// Words 8..11.
	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	// Words 12..15; the counter lanes get offsets 0,1,2,3. The counter
	// register itself is stored at L$oop_enter4x.
	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	L$inc(%rip),%xmm0
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	L$oop_enter4x

// Next group of four blocks: reload the saved state and advance every counter
// lane by four.
.p2align	5
L$oop_outer4x:
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	L$four(%rip),%xmm0

L$oop_enter4x:
	// Words 10 and 11 are spilled because %xmm6/%xmm7 double as temporaries
	// and as holders of the shuffle masks inside the loop.
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)
	jmp	L$oop4x

// Two quarter-rounds are interleaved at a time. Only two of the four
// word 8..11 registers are live in %xmm4/%xmm5; the other two wait in
// 0..48(%rsp). The .byte sequences are pshufb with the mask currently in
// %xmm7 (loaded from (%r10)) or %xmm6 (loaded from (%r11)).
.p2align	5
L$oop4x:
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199
.byte	102,15,56,0,207
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198
.byte	102,15,56,0,206
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	// Swap the live pair of word 8..11 registers.
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215
.byte	102,15,56,0,223
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214
.byte	102,15,56,0,222
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	// Diagonal half starts here.
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223
.byte	102,15,56,0,199
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222
.byte	102,15,56,0,198
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	// Swap the live pair back.
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207
.byte	102,15,56,0,215
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206
.byte	102,15,56,0,214
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	L$oop4x

	// Feed-forward and 4x4 dword transposes. Afterwards each register (or
	// stack slot) holds 16 contiguous keystream bytes of a single block.
	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	// Park two transposed rows and fetch the spilled word 10/11 registers.
	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	// Keystream layout at this point (16 bytes each, in output order):
	//   block 0: 0(%rsp),  %xmm12, %xmm4,  %xmm0
	//   block 1: 16(%rsp), %xmm13, %xmm5,  %xmm1
	//   block 2: 32(%rsp), %xmm10, %xmm14, %xmm8
	//   block 3: 48(%rsp), %xmm15, %xmm9,  %xmm3
	cmpq	$256,%rdx
	jb	L$tail4x

	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	L$oop_outer4x

	jmp	L$done4x

// Fewer than 256 bytes remain. Whole 64-byte blocks are handled with vector
// XORs; the keystream block covering the final partial block is staged at
// 0..63(%rsp) for the byte loop.
L$tail4x:
	cmpq	$192,%rdx
	jae	L$192_or_more4x
	cmpq	$128,%rdx
	jae	L$128_or_more4x
	cmpq	$64,%rdx
	jae	L$64_or_more4x


	xorq	%r10,%r10

	// 0(%rsp) already holds the first 16 bytes of block 0.
	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	L$oop_tail4x

.p2align	5
L$64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	// Flags still come from "cmpq $64,%rdx" in L$tail4x; the SSE moves and
	// pxor in between do not modify them. Taken when exactly 64 bytes remained.
	je	L$done4x

	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	L$oop_tail4x

.p2align	5
L$128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	// Flags from "cmpq $128,%rdx" in L$tail4x: taken when exactly 128 bytes remained.
	je	L$done4x

	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	L$oop_tail4x

.p2align	5
L$192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	// Flags from "cmpq $192,%rdx" in L$tail4x (leaq does not modify flags):
	// taken when exactly 192 bytes remained.
	je	L$done4x

	// Pointers were already advanced by 128 above, so only 64 more is added.
	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

// Byte-wise XOR of the remaining %rdx bytes against 0..63(%rsp).
// %r10 is the byte index.
L$oop_tail4x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	L$oop_tail4x

L$done4x:
	leaq	(%r9),%rsp

L$4x_epilogue:
	ret



// ---------------------------------------------------------------------------
// ChaCha20_8x (file-local; reached by jump from ChaCha20_4x)
//
// Eight-block path using ymm registers. Only the frame setup and the outer
// loop entry are annotated here; the round loop (L$oop8x) and the output code
// follow.
//   %r9    original %rsp (set again here)
//   %rcx   repointed to 256(%rsp)
//   %rax   512(%rsp) during setup; %eax is then reused as the round counter
//   %r10   address of L$rot16
//   %r11   address of L$rot24
// ---------------------------------------------------------------------------
.p2align	5
ChaCha20_8x:
L$ChaCha20_8x:

	movq	%rsp,%r9

	subq	$0x280+8,%rsp
	// 32-byte alignment is needed for the vmovdqa accesses to the frame.
	andq	$-32,%rsp
	vzeroupper










	vbroadcasti128	L$sigma(%rip),%ymm11
	vbroadcasti128	(%rcx),%ymm3
	vbroadcasti128	16(%rcx),%ymm15
	vbroadcasti128	(%r8),%ymm7
	leaq	256(%rsp),%rcx
	leaq	512(%rsp),%rax
	leaq	L$rot16(%rip),%r10
	leaq	L$rot24(%rip),%r11

	// Broadcast each state word across a ymm register and save it, in the
	// same way as the four-block path does with xmm registers.
	vpshufd	$0x00,%ymm11,%ymm8
	vpshufd	$0x55,%ymm11,%ymm9
	vmovdqa	%ymm8,128-256(%rcx)
	vpshufd	$0xaa,%ymm11,%ymm10
	vmovdqa	%ymm9,160-256(%rcx)
	vpshufd	$0xff,%ymm11,%ymm11
	vmovdqa	%ymm10,192-256(%rcx)
	vmovdqa	%ymm11,224-256(%rcx)

	vpshufd	$0x00,%ymm3,%ymm0
	vpshufd	$0x55,%ymm3,%ymm1
	vmovdqa	%ymm0,256-256(%rcx)
	vpshufd	$0xaa,%ymm3,%ymm2
	vmovdqa	%ymm1,288-256(%rcx)
	vpshufd	$0xff,%ymm3,%ymm3
	vmovdqa	%ymm2,320-256(%rcx)
	vmovdqa	%ymm3,352-256(%rcx)

	vpshufd	$0x00,%ymm15,%ymm12
	vpshufd	$0x55,%ymm15,%ymm13
	vmovdqa	%ymm12,384-512(%rax)
	vpshufd	$0xaa,%ymm15,%ymm14
	vmovdqa	%ymm13,416-512(%rax)
	vpshufd	$0xff,%ymm15,%ymm15
	vmovdqa	%ymm14,448-512(%rax)
	vmovdqa	%ymm15,480-512(%rax)

	// Counter lanes get the offsets from L$incy; the counter register itself
	// is stored at L$oop_enter8x.
	vpshufd	$0x00,%ymm7,%ymm4
	vpshufd	$0x55,%ymm7,%ymm5
	vpaddd	L$incy(%rip),%ymm4,%ymm4
	vpshufd	$0xaa,%ymm7,%ymm6
	vmovdqa	%ymm5,544-512(%rax)
	vpshufd	$0xff,%ymm7,%ymm7
	vmovdqa	%ymm6,576-512(%rax)
	vmovdqa	%ymm7,608-512(%rax)

	jmp	L$oop_enter8x

// Next group of eight blocks: reload the saved state and advance every
// counter lane by eight.
.p2align	5
L$oop_outer8x:
	vmovdqa	128-256(%rcx),%ymm8
	vmovdqa	160-256(%rcx),%ymm9
	vmovdqa	192-256(%rcx),%ymm10
	vmovdqa	224-256(%rcx),%ymm11
	vmovdqa	256-256(%rcx),%ymm0
	vmovdqa	288-256(%rcx),%ymm1
	vmovdqa	320-256(%rcx),%ymm2
	vmovdqa	352-256(%rcx),%ymm3
	vmovdqa	384-512(%rax),%ymm12
	vmovdqa	416-512(%rax),%ymm13
	vmovdqa	448-512(%rax),%ymm14
	vmovdqa	480-512(%rax),%ymm15
	vmovdqa	512-512(%rax),%ymm4
	vmovdqa	544-512(%rax),%ymm5
	vmovdqa	576-512(%rax),%ymm6
	vmovdqa	608-512(%rax),%ymm7
	vpaddd	L$eight(%rip),%ymm4,%ymm4

L$oop_enter8x:
	vmovdqa	%ymm14,64(%rsp)
	vmovdqa	%ymm15,96(%rsp)
	vbroadcasti128	(%r10),%ymm15
	vmovdqa	%ymm4,512-512(%rax)
	// %eax becomes the round counter, overwriting the 512(%rsp) base in %rax.
	movl	$10,%eax
	jmp	L$oop8x

1110.p2align 5 1111L$oop8x: 1112 vpaddd %ymm0,%ymm8,%ymm8 1113 vpxor %ymm4,%ymm8,%ymm4 1114 vpshufb %ymm15,%ymm4,%ymm4 1115 vpaddd %ymm1,%ymm9,%ymm9 1116 vpxor %ymm5,%ymm9,%ymm5 1117 vpshufb %ymm15,%ymm5,%ymm5 1118 vpaddd %ymm4,%ymm12,%ymm12 1119 vpxor %ymm0,%ymm12,%ymm0 1120 vpslld $12,%ymm0,%ymm14 1121 vpsrld $20,%ymm0,%ymm0 1122 vpor %ymm0,%ymm14,%ymm0 1123 vbroadcasti128 (%r11),%ymm14 1124 vpaddd %ymm5,%ymm13,%ymm13 1125 vpxor %ymm1,%ymm13,%ymm1 1126 vpslld $12,%ymm1,%ymm15 1127 vpsrld $20,%ymm1,%ymm1 1128 vpor %ymm1,%ymm15,%ymm1 1129 vpaddd %ymm0,%ymm8,%ymm8 1130 vpxor %ymm4,%ymm8,%ymm4 1131 vpshufb %ymm14,%ymm4,%ymm4 1132 vpaddd %ymm1,%ymm9,%ymm9 1133 vpxor %ymm5,%ymm9,%ymm5 1134 vpshufb %ymm14,%ymm5,%ymm5 1135 vpaddd %ymm4,%ymm12,%ymm12 1136 vpxor %ymm0,%ymm12,%ymm0 1137 vpslld $7,%ymm0,%ymm15 1138 vpsrld $25,%ymm0,%ymm0 1139 vpor %ymm0,%ymm15,%ymm0 1140 vbroadcasti128 (%r10),%ymm15 1141 vpaddd %ymm5,%ymm13,%ymm13 1142 vpxor %ymm1,%ymm13,%ymm1 1143 vpslld $7,%ymm1,%ymm14 1144 vpsrld $25,%ymm1,%ymm1 1145 vpor %ymm1,%ymm14,%ymm1 1146 vmovdqa %ymm12,0(%rsp) 1147 vmovdqa %ymm13,32(%rsp) 1148 vmovdqa 64(%rsp),%ymm12 1149 vmovdqa 96(%rsp),%ymm13 1150 vpaddd %ymm2,%ymm10,%ymm10 1151 vpxor %ymm6,%ymm10,%ymm6 1152 vpshufb %ymm15,%ymm6,%ymm6 1153 vpaddd %ymm3,%ymm11,%ymm11 1154 vpxor %ymm7,%ymm11,%ymm7 1155 vpshufb %ymm15,%ymm7,%ymm7 1156 vpaddd %ymm6,%ymm12,%ymm12 1157 vpxor %ymm2,%ymm12,%ymm2 1158 vpslld $12,%ymm2,%ymm14 1159 vpsrld $20,%ymm2,%ymm2 1160 vpor %ymm2,%ymm14,%ymm2 1161 vbroadcasti128 (%r11),%ymm14 1162 vpaddd %ymm7,%ymm13,%ymm13 1163 vpxor %ymm3,%ymm13,%ymm3 1164 vpslld $12,%ymm3,%ymm15 1165 vpsrld $20,%ymm3,%ymm3 1166 vpor %ymm3,%ymm15,%ymm3 1167 vpaddd %ymm2,%ymm10,%ymm10 1168 vpxor %ymm6,%ymm10,%ymm6 1169 vpshufb %ymm14,%ymm6,%ymm6 1170 vpaddd %ymm3,%ymm11,%ymm11 1171 vpxor %ymm7,%ymm11,%ymm7 1172 vpshufb %ymm14,%ymm7,%ymm7 1173 vpaddd %ymm6,%ymm12,%ymm12 1174 vpxor %ymm2,%ymm12,%ymm2 1175 vpslld $7,%ymm2,%ymm15 1176 vpsrld $25,%ymm2,%ymm2 1177 vpor 
%ymm2,%ymm15,%ymm2 1178 vbroadcasti128 (%r10),%ymm15 1179 vpaddd %ymm7,%ymm13,%ymm13 1180 vpxor %ymm3,%ymm13,%ymm3 1181 vpslld $7,%ymm3,%ymm14 1182 vpsrld $25,%ymm3,%ymm3 1183 vpor %ymm3,%ymm14,%ymm3 1184 vpaddd %ymm1,%ymm8,%ymm8 1185 vpxor %ymm7,%ymm8,%ymm7 1186 vpshufb %ymm15,%ymm7,%ymm7 1187 vpaddd %ymm2,%ymm9,%ymm9 1188 vpxor %ymm4,%ymm9,%ymm4 1189 vpshufb %ymm15,%ymm4,%ymm4 1190 vpaddd %ymm7,%ymm12,%ymm12 1191 vpxor %ymm1,%ymm12,%ymm1 1192 vpslld $12,%ymm1,%ymm14 1193 vpsrld $20,%ymm1,%ymm1 1194 vpor %ymm1,%ymm14,%ymm1 1195 vbroadcasti128 (%r11),%ymm14 1196 vpaddd %ymm4,%ymm13,%ymm13 1197 vpxor %ymm2,%ymm13,%ymm2 1198 vpslld $12,%ymm2,%ymm15 1199 vpsrld $20,%ymm2,%ymm2 1200 vpor %ymm2,%ymm15,%ymm2 1201 vpaddd %ymm1,%ymm8,%ymm8 1202 vpxor %ymm7,%ymm8,%ymm7 1203 vpshufb %ymm14,%ymm7,%ymm7 1204 vpaddd %ymm2,%ymm9,%ymm9 1205 vpxor %ymm4,%ymm9,%ymm4 1206 vpshufb %ymm14,%ymm4,%ymm4 1207 vpaddd %ymm7,%ymm12,%ymm12 1208 vpxor %ymm1,%ymm12,%ymm1 1209 vpslld $7,%ymm1,%ymm15 1210 vpsrld $25,%ymm1,%ymm1 1211 vpor %ymm1,%ymm15,%ymm1 1212 vbroadcasti128 (%r10),%ymm15 1213 vpaddd %ymm4,%ymm13,%ymm13 1214 vpxor %ymm2,%ymm13,%ymm2 1215 vpslld $7,%ymm2,%ymm14 1216 vpsrld $25,%ymm2,%ymm2 1217 vpor %ymm2,%ymm14,%ymm2 1218 vmovdqa %ymm12,64(%rsp) 1219 vmovdqa %ymm13,96(%rsp) 1220 vmovdqa 0(%rsp),%ymm12 1221 vmovdqa 32(%rsp),%ymm13 1222 vpaddd %ymm3,%ymm10,%ymm10 1223 vpxor %ymm5,%ymm10,%ymm5 1224 vpshufb %ymm15,%ymm5,%ymm5 1225 vpaddd %ymm0,%ymm11,%ymm11 1226 vpxor %ymm6,%ymm11,%ymm6 1227 vpshufb %ymm15,%ymm6,%ymm6 1228 vpaddd %ymm5,%ymm12,%ymm12 1229 vpxor %ymm3,%ymm12,%ymm3 1230 vpslld $12,%ymm3,%ymm14 1231 vpsrld $20,%ymm3,%ymm3 1232 vpor %ymm3,%ymm14,%ymm3 1233 vbroadcasti128 (%r11),%ymm14 1234 vpaddd %ymm6,%ymm13,%ymm13 1235 vpxor %ymm0,%ymm13,%ymm0 1236 vpslld $12,%ymm0,%ymm15 1237 vpsrld $20,%ymm0,%ymm0 1238 vpor %ymm0,%ymm15,%ymm0 1239 vpaddd %ymm3,%ymm10,%ymm10 1240 vpxor %ymm5,%ymm10,%ymm5 1241 vpshufb %ymm14,%ymm5,%ymm5 1242 vpaddd %ymm0,%ymm11,%ymm11 1243 vpxor 
%ymm6,%ymm11,%ymm6 1244 vpshufb %ymm14,%ymm6,%ymm6 1245 vpaddd %ymm5,%ymm12,%ymm12 1246 vpxor %ymm3,%ymm12,%ymm3 1247 vpslld $7,%ymm3,%ymm15 1248 vpsrld $25,%ymm3,%ymm3 1249 vpor %ymm3,%ymm15,%ymm3 1250 vbroadcasti128 (%r10),%ymm15 1251 vpaddd %ymm6,%ymm13,%ymm13 1252 vpxor %ymm0,%ymm13,%ymm0 1253 vpslld $7,%ymm0,%ymm14 1254 vpsrld $25,%ymm0,%ymm0 1255 vpor %ymm0,%ymm14,%ymm0 1256 decl %eax 1257 jnz L$oop8x 1258 1259 leaq 512(%rsp),%rax 1260 vpaddd 128-256(%rcx),%ymm8,%ymm8 1261 vpaddd 160-256(%rcx),%ymm9,%ymm9 1262 vpaddd 192-256(%rcx),%ymm10,%ymm10 1263 vpaddd 224-256(%rcx),%ymm11,%ymm11 1264 1265 vpunpckldq %ymm9,%ymm8,%ymm14 1266 vpunpckldq %ymm11,%ymm10,%ymm15 1267 vpunpckhdq %ymm9,%ymm8,%ymm8 1268 vpunpckhdq %ymm11,%ymm10,%ymm10 1269 vpunpcklqdq %ymm15,%ymm14,%ymm9 1270 vpunpckhqdq %ymm15,%ymm14,%ymm14 1271 vpunpcklqdq %ymm10,%ymm8,%ymm11 1272 vpunpckhqdq %ymm10,%ymm8,%ymm8 1273 vpaddd 256-256(%rcx),%ymm0,%ymm0 1274 vpaddd 288-256(%rcx),%ymm1,%ymm1 1275 vpaddd 320-256(%rcx),%ymm2,%ymm2 1276 vpaddd 352-256(%rcx),%ymm3,%ymm3 1277 1278 vpunpckldq %ymm1,%ymm0,%ymm10 1279 vpunpckldq %ymm3,%ymm2,%ymm15 1280 vpunpckhdq %ymm1,%ymm0,%ymm0 1281 vpunpckhdq %ymm3,%ymm2,%ymm2 1282 vpunpcklqdq %ymm15,%ymm10,%ymm1 1283 vpunpckhqdq %ymm15,%ymm10,%ymm10 1284 vpunpcklqdq %ymm2,%ymm0,%ymm3 1285 vpunpckhqdq %ymm2,%ymm0,%ymm0 1286 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 1287 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 1288 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 1289 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 1290 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 1291 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 1292 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 1293 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 1294 vmovdqa %ymm15,0(%rsp) 1295 vmovdqa %ymm9,32(%rsp) 1296 vmovdqa 64(%rsp),%ymm15 1297 vmovdqa 96(%rsp),%ymm9 1298 1299 vpaddd 384-512(%rax),%ymm12,%ymm12 1300 vpaddd 416-512(%rax),%ymm13,%ymm13 1301 vpaddd 448-512(%rax),%ymm15,%ymm15 1302 vpaddd 480-512(%rax),%ymm9,%ymm9 1303 1304 vpunpckldq %ymm13,%ymm12,%ymm2 1305 vpunpckldq 
%ymm9,%ymm15,%ymm8 1306 vpunpckhdq %ymm13,%ymm12,%ymm12 1307 vpunpckhdq %ymm9,%ymm15,%ymm15 1308 vpunpcklqdq %ymm8,%ymm2,%ymm13 1309 vpunpckhqdq %ymm8,%ymm2,%ymm2 1310 vpunpcklqdq %ymm15,%ymm12,%ymm9 1311 vpunpckhqdq %ymm15,%ymm12,%ymm12 1312 vpaddd 512-512(%rax),%ymm4,%ymm4 1313 vpaddd 544-512(%rax),%ymm5,%ymm5 1314 vpaddd 576-512(%rax),%ymm6,%ymm6 1315 vpaddd 608-512(%rax),%ymm7,%ymm7 1316 1317 vpunpckldq %ymm5,%ymm4,%ymm15 1318 vpunpckldq %ymm7,%ymm6,%ymm8 1319 vpunpckhdq %ymm5,%ymm4,%ymm4 1320 vpunpckhdq %ymm7,%ymm6,%ymm6 1321 vpunpcklqdq %ymm8,%ymm15,%ymm5 1322 vpunpckhqdq %ymm8,%ymm15,%ymm15 1323 vpunpcklqdq %ymm6,%ymm4,%ymm7 1324 vpunpckhqdq %ymm6,%ymm4,%ymm4 1325 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 1326 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 1327 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 1328 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 1329 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 1330 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 1331 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 1332 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 1333 vmovdqa 0(%rsp),%ymm6 1334 vmovdqa 32(%rsp),%ymm12 1335 1336 cmpq $512,%rdx 1337 jb L$tail8x 1338 1339 vpxor 0(%rsi),%ymm6,%ymm6 1340 vpxor 32(%rsi),%ymm8,%ymm8 1341 vpxor 64(%rsi),%ymm1,%ymm1 1342 vpxor 96(%rsi),%ymm5,%ymm5 1343 leaq 128(%rsi),%rsi 1344 vmovdqu %ymm6,0(%rdi) 1345 vmovdqu %ymm8,32(%rdi) 1346 vmovdqu %ymm1,64(%rdi) 1347 vmovdqu %ymm5,96(%rdi) 1348 leaq 128(%rdi),%rdi 1349 1350 vpxor 0(%rsi),%ymm12,%ymm12 1351 vpxor 32(%rsi),%ymm13,%ymm13 1352 vpxor 64(%rsi),%ymm10,%ymm10 1353 vpxor 96(%rsi),%ymm15,%ymm15 1354 leaq 128(%rsi),%rsi 1355 vmovdqu %ymm12,0(%rdi) 1356 vmovdqu %ymm13,32(%rdi) 1357 vmovdqu %ymm10,64(%rdi) 1358 vmovdqu %ymm15,96(%rdi) 1359 leaq 128(%rdi),%rdi 1360 1361 vpxor 0(%rsi),%ymm14,%ymm14 1362 vpxor 32(%rsi),%ymm2,%ymm2 1363 vpxor 64(%rsi),%ymm3,%ymm3 1364 vpxor 96(%rsi),%ymm7,%ymm7 1365 leaq 128(%rsi),%rsi 1366 vmovdqu %ymm14,0(%rdi) 1367 vmovdqu %ymm2,32(%rdi) 1368 vmovdqu %ymm3,64(%rdi) 1369 vmovdqu %ymm7,96(%rdi) 1370 leaq 128(%rdi),%rdi 1371 
1372 vpxor 0(%rsi),%ymm11,%ymm11 1373 vpxor 32(%rsi),%ymm9,%ymm9 1374 vpxor 64(%rsi),%ymm0,%ymm0 1375 vpxor 96(%rsi),%ymm4,%ymm4 1376 leaq 128(%rsi),%rsi 1377 vmovdqu %ymm11,0(%rdi) 1378 vmovdqu %ymm9,32(%rdi) 1379 vmovdqu %ymm0,64(%rdi) 1380 vmovdqu %ymm4,96(%rdi) 1381 leaq 128(%rdi),%rdi 1382 1383 subq $512,%rdx 1384 jnz L$oop_outer8x 1385 1386 jmp L$done8x 1387 1388L$tail8x: 1389 cmpq $448,%rdx 1390 jae L$448_or_more8x 1391 cmpq $384,%rdx 1392 jae L$384_or_more8x 1393 cmpq $320,%rdx 1394 jae L$320_or_more8x 1395 cmpq $256,%rdx 1396 jae L$256_or_more8x 1397 cmpq $192,%rdx 1398 jae L$192_or_more8x 1399 cmpq $128,%rdx 1400 jae L$128_or_more8x 1401 cmpq $64,%rdx 1402 jae L$64_or_more8x 1403 1404 xorq %r10,%r10 1405 vmovdqa %ymm6,0(%rsp) 1406 vmovdqa %ymm8,32(%rsp) 1407 jmp L$oop_tail8x 1408 1409.p2align 5 1410L$64_or_more8x: 1411 vpxor 0(%rsi),%ymm6,%ymm6 1412 vpxor 32(%rsi),%ymm8,%ymm8 1413 vmovdqu %ymm6,0(%rdi) 1414 vmovdqu %ymm8,32(%rdi) 1415 je L$done8x 1416 1417 leaq 64(%rsi),%rsi 1418 xorq %r10,%r10 1419 vmovdqa %ymm1,0(%rsp) 1420 leaq 64(%rdi),%rdi 1421 subq $64,%rdx 1422 vmovdqa %ymm5,32(%rsp) 1423 jmp L$oop_tail8x 1424 1425.p2align 5 1426L$128_or_more8x: 1427 vpxor 0(%rsi),%ymm6,%ymm6 1428 vpxor 32(%rsi),%ymm8,%ymm8 1429 vpxor 64(%rsi),%ymm1,%ymm1 1430 vpxor 96(%rsi),%ymm5,%ymm5 1431 vmovdqu %ymm6,0(%rdi) 1432 vmovdqu %ymm8,32(%rdi) 1433 vmovdqu %ymm1,64(%rdi) 1434 vmovdqu %ymm5,96(%rdi) 1435 je L$done8x 1436 1437 leaq 128(%rsi),%rsi 1438 xorq %r10,%r10 1439 vmovdqa %ymm12,0(%rsp) 1440 leaq 128(%rdi),%rdi 1441 subq $128,%rdx 1442 vmovdqa %ymm13,32(%rsp) 1443 jmp L$oop_tail8x 1444 1445.p2align 5 1446L$192_or_more8x: 1447 vpxor 0(%rsi),%ymm6,%ymm6 1448 vpxor 32(%rsi),%ymm8,%ymm8 1449 vpxor 64(%rsi),%ymm1,%ymm1 1450 vpxor 96(%rsi),%ymm5,%ymm5 1451 vpxor 128(%rsi),%ymm12,%ymm12 1452 vpxor 160(%rsi),%ymm13,%ymm13 1453 vmovdqu %ymm6,0(%rdi) 1454 vmovdqu %ymm8,32(%rdi) 1455 vmovdqu %ymm1,64(%rdi) 1456 vmovdqu %ymm5,96(%rdi) 1457 vmovdqu %ymm12,128(%rdi) 1458 
vmovdqu %ymm13,160(%rdi) 1459 je L$done8x 1460 1461 leaq 192(%rsi),%rsi 1462 xorq %r10,%r10 1463 vmovdqa %ymm10,0(%rsp) 1464 leaq 192(%rdi),%rdi 1465 subq $192,%rdx 1466 vmovdqa %ymm15,32(%rsp) 1467 jmp L$oop_tail8x 1468 1469.p2align 5 1470L$256_or_more8x: 1471 vpxor 0(%rsi),%ymm6,%ymm6 1472 vpxor 32(%rsi),%ymm8,%ymm8 1473 vpxor 64(%rsi),%ymm1,%ymm1 1474 vpxor 96(%rsi),%ymm5,%ymm5 1475 vpxor 128(%rsi),%ymm12,%ymm12 1476 vpxor 160(%rsi),%ymm13,%ymm13 1477 vpxor 192(%rsi),%ymm10,%ymm10 1478 vpxor 224(%rsi),%ymm15,%ymm15 1479 vmovdqu %ymm6,0(%rdi) 1480 vmovdqu %ymm8,32(%rdi) 1481 vmovdqu %ymm1,64(%rdi) 1482 vmovdqu %ymm5,96(%rdi) 1483 vmovdqu %ymm12,128(%rdi) 1484 vmovdqu %ymm13,160(%rdi) 1485 vmovdqu %ymm10,192(%rdi) 1486 vmovdqu %ymm15,224(%rdi) 1487 je L$done8x 1488 1489 leaq 256(%rsi),%rsi 1490 xorq %r10,%r10 1491 vmovdqa %ymm14,0(%rsp) 1492 leaq 256(%rdi),%rdi 1493 subq $256,%rdx 1494 vmovdqa %ymm2,32(%rsp) 1495 jmp L$oop_tail8x 1496 1497.p2align 5 1498L$320_or_more8x: 1499 vpxor 0(%rsi),%ymm6,%ymm6 1500 vpxor 32(%rsi),%ymm8,%ymm8 1501 vpxor 64(%rsi),%ymm1,%ymm1 1502 vpxor 96(%rsi),%ymm5,%ymm5 1503 vpxor 128(%rsi),%ymm12,%ymm12 1504 vpxor 160(%rsi),%ymm13,%ymm13 1505 vpxor 192(%rsi),%ymm10,%ymm10 1506 vpxor 224(%rsi),%ymm15,%ymm15 1507 vpxor 256(%rsi),%ymm14,%ymm14 1508 vpxor 288(%rsi),%ymm2,%ymm2 1509 vmovdqu %ymm6,0(%rdi) 1510 vmovdqu %ymm8,32(%rdi) 1511 vmovdqu %ymm1,64(%rdi) 1512 vmovdqu %ymm5,96(%rdi) 1513 vmovdqu %ymm12,128(%rdi) 1514 vmovdqu %ymm13,160(%rdi) 1515 vmovdqu %ymm10,192(%rdi) 1516 vmovdqu %ymm15,224(%rdi) 1517 vmovdqu %ymm14,256(%rdi) 1518 vmovdqu %ymm2,288(%rdi) 1519 je L$done8x 1520 1521 leaq 320(%rsi),%rsi 1522 xorq %r10,%r10 1523 vmovdqa %ymm3,0(%rsp) 1524 leaq 320(%rdi),%rdi 1525 subq $320,%rdx 1526 vmovdqa %ymm7,32(%rsp) 1527 jmp L$oop_tail8x 1528 1529.p2align 5 1530L$384_or_more8x: 1531 vpxor 0(%rsi),%ymm6,%ymm6 1532 vpxor 32(%rsi),%ymm8,%ymm8 1533 vpxor 64(%rsi),%ymm1,%ymm1 1534 vpxor 96(%rsi),%ymm5,%ymm5 1535 vpxor 
128(%rsi),%ymm12,%ymm12 1536 vpxor 160(%rsi),%ymm13,%ymm13 1537 vpxor 192(%rsi),%ymm10,%ymm10 1538 vpxor 224(%rsi),%ymm15,%ymm15 1539 vpxor 256(%rsi),%ymm14,%ymm14 1540 vpxor 288(%rsi),%ymm2,%ymm2 1541 vpxor 320(%rsi),%ymm3,%ymm3 1542 vpxor 352(%rsi),%ymm7,%ymm7 1543 vmovdqu %ymm6,0(%rdi) 1544 vmovdqu %ymm8,32(%rdi) 1545 vmovdqu %ymm1,64(%rdi) 1546 vmovdqu %ymm5,96(%rdi) 1547 vmovdqu %ymm12,128(%rdi) 1548 vmovdqu %ymm13,160(%rdi) 1549 vmovdqu %ymm10,192(%rdi) 1550 vmovdqu %ymm15,224(%rdi) 1551 vmovdqu %ymm14,256(%rdi) 1552 vmovdqu %ymm2,288(%rdi) 1553 vmovdqu %ymm3,320(%rdi) 1554 vmovdqu %ymm7,352(%rdi) 1555 je L$done8x 1556 1557 leaq 384(%rsi),%rsi 1558 xorq %r10,%r10 1559 vmovdqa %ymm11,0(%rsp) 1560 leaq 384(%rdi),%rdi 1561 subq $384,%rdx 1562 vmovdqa %ymm9,32(%rsp) 1563 jmp L$oop_tail8x 1564 1565.p2align 5 1566L$448_or_more8x: 1567 vpxor 0(%rsi),%ymm6,%ymm6 1568 vpxor 32(%rsi),%ymm8,%ymm8 1569 vpxor 64(%rsi),%ymm1,%ymm1 1570 vpxor 96(%rsi),%ymm5,%ymm5 1571 vpxor 128(%rsi),%ymm12,%ymm12 1572 vpxor 160(%rsi),%ymm13,%ymm13 1573 vpxor 192(%rsi),%ymm10,%ymm10 1574 vpxor 224(%rsi),%ymm15,%ymm15 1575 vpxor 256(%rsi),%ymm14,%ymm14 1576 vpxor 288(%rsi),%ymm2,%ymm2 1577 vpxor 320(%rsi),%ymm3,%ymm3 1578 vpxor 352(%rsi),%ymm7,%ymm7 1579 vpxor 384(%rsi),%ymm11,%ymm11 1580 vpxor 416(%rsi),%ymm9,%ymm9 1581 vmovdqu %ymm6,0(%rdi) 1582 vmovdqu %ymm8,32(%rdi) 1583 vmovdqu %ymm1,64(%rdi) 1584 vmovdqu %ymm5,96(%rdi) 1585 vmovdqu %ymm12,128(%rdi) 1586 vmovdqu %ymm13,160(%rdi) 1587 vmovdqu %ymm10,192(%rdi) 1588 vmovdqu %ymm15,224(%rdi) 1589 vmovdqu %ymm14,256(%rdi) 1590 vmovdqu %ymm2,288(%rdi) 1591 vmovdqu %ymm3,320(%rdi) 1592 vmovdqu %ymm7,352(%rdi) 1593 vmovdqu %ymm11,384(%rdi) 1594 vmovdqu %ymm9,416(%rdi) 1595 je L$done8x 1596 1597 leaq 448(%rsi),%rsi 1598 xorq %r10,%r10 1599 vmovdqa %ymm0,0(%rsp) 1600 leaq 448(%rdi),%rdi 1601 subq $448,%rdx 1602 vmovdqa %ymm4,32(%rsp) 1603 1604L$oop_tail8x: 1605 movzbl (%rsi,%r10,1),%eax 1606 movzbl (%rsp,%r10,1),%ecx 1607 leaq 1(%r10),%r10 1608 
xorl %ecx,%eax 1609 movb %al,-1(%rdi,%r10,1) 1610 decq %rdx 1611 jnz L$oop_tail8x 1612 1613L$done8x: 1614 vzeroall 1615 leaq (%r9),%rsp 1616 1617L$8x_epilogue: 1618 ret 1619 1620 1621#endif 1622