// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
//
// Syntax: AT&T (GAS-style) x86-64, preprocessed by the C preprocessor.
// Target: Mach-O (symbols carry a leading underscore; data lives in
// __DATA,__const).
//
// All routines below read their arguments from the same registers:
//   %rdi  output pointer
//   %rsi  input pointer (XORed with the generated keystream)
//   %rdx  length in bytes
//   %rcx  pointer to 32 bytes that become state words 4..11
//         (presumably the 256-bit key)
//   %r8   pointer to 16 bytes that become state words 12..15; dword 0 is the
//         32-bit value that is incremented once per 64-byte block
//
// NOTE(review): none of the routines tests %rdx for zero. With %rdx == 0 the
// byte-wise tail loops would decrement through zero and keep running, so the
// caller has to guarantee a nonzero length.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
.text

.section	__DATA,__const
.p2align	6
L$zero:
.long	0,0,0,0
// Added to state words 12..15 to step the block counter (dword 0) by one.
L$one:
.long	1,0,0,0
// Per-lane counter offsets for the 4-block routine, and its per-iteration step.
L$inc:
.long	0,1,2,3
L$four:
.long	4,4,4,4
// Per-lane counter offsets and step referenced by the AVX2 routine.
L$incy:
.long	0,2,4,6,1,3,5,7
L$eight:
.long	8,8,8,8,8,8,8,8
// pshufb masks: L$rot16 rotates every dword left by 16 bits, L$rot24 rotates
// every dword left by 8 bits (that is, right by 24).
L$rot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
L$rot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
// ASCII "expand 32-byte k" plus a terminating NUL: state words 0..3.
L$sigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.p2align	6
// NOTE(review): the next four tables are not referenced by any code visible
// in this part of the file; presumably they serve a wider-vector variant.
L$zeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
L$fourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
L$incz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
L$sixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
// ASCII "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated.
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.text

//-----------------------------------------------------------------------
// _ChaCha20_ctr32_nohw
//
// Scalar implementation: one 64-byte block per outer iteration, using only
// general-purpose registers for the round function.
//
// Saves and restores: %rbx, %rbp, %r12-%r15.
// Stack frame (88 bytes below the six pushes, 16-byte aligned):
//    0..15(%rsp)  keystream words 0..3 (written only on the tail path)
//   16..31(%rsp)  state words 4..7
//   32..47(%rsp)  state words 8..11 (also used as spill space in the rounds)
//   48..63(%rsp)  state words 12..15 (dword at 48 is the running counter)
//   64..87(%rsp)  saved length, input pointer and output pointer
// Register roles inside L$oop:
//   %eax,%ebx,%ecx,%edx = words 0..3;  %r8d-%r11d = words 4..7;
//   %esi,%edi = two of words 8..11 (the other two parked at 32..47(%rsp));
//   %r12d-%r15d = words 12..15;  %ebp = remaining double rounds.
// _CET_ENDBR is a macro that is not defined in this file (presumably it
// comes from <openssl/asm_base.h>).
//-----------------------------------------------------------------------
.globl	_ChaCha20_ctr32_nohw
.private_extern _ChaCha20_ctr32_nohw

.p2align	6
_ChaCha20_ctr32_nohw:

_CET_ENDBR
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$64+24,%rsp

L$ctr32_body:

	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	L$one(%rip),%xmm4

	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp
	jmp	L$oop_outer

.p2align	5
L$oop_outer:
	// Reload the input state; the four immediates are "expand 32-byte k"
	// as little-endian dwords.
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	// Park length and both pointers so %rbp, %rsi and %rdi can be reused.
	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
.byte	102,72,15,126,214	// movq %xmm2,%rsi  (words 8 and 9)
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi
	jmp	L$oop

.p2align	5
L$oop:
	// Column round: quarter rounds on words (0,4,8,12) and (1,5,9,13).
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	// Swap words 8,9 out and words 10,11 in.
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	// Column round: quarter rounds on words (2,6,10,14) and (3,7,11,15).
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	// Diagonal round: quarter rounds on words (0,5,10,15) and (1,6,11,12).
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	// Swap words 10,11 out and words 8,9 back in.
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	// Diagonal round: quarter rounds on words (2,7,8,13) and (3,4,9,14).
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	L$oop

	// Rounds done: flush words 8,9, restore length and pointers, and bump
	// the counter held in %xmm3 for the next block.
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3
	movq	64+16(%rsp),%rdi

	// Feed-forward: add the input state to the permuted state. 48(%rsp)
	// still holds the counter of the block being produced.
	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1

	cmpq	$64,%rbp
	jb	L$tail

	// Full block: XOR 64 input bytes with the keystream.
	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	// Put back the unpermuted words 8..11 and store the advanced counter.
	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	L$oop_outer

	jmp	L$done

.p2align	4
L$tail:
	// Fewer than 64 bytes left: spill the keystream block to 0..63(%rsp)
	// and XOR it into the output one byte at a time. %rbx is the index.
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

L$oop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	L$oop_tail

L$done:
	// %rsi = stack pointer value at function entry; reload the six saved
	// registers from just below it and unwind.
	leaq	64+24+48(%rsp),%rsi
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp

// L$no_data is not the target of any branch visible in this file section.
L$no_data:
	ret


//-----------------------------------------------------------------------
// _ChaCha20_ctr32_ssse3
//
// One 64-byte block per outer iteration with the state held as four rows:
//   %xmm0 = words 0..3, %xmm1 = words 4..7, %xmm2 = words 8..11,
//   %xmm3 = words 12..15.  %xmm6/%xmm7 hold the L$rot16/L$rot24 masks,
//   %xmm4/%xmm5 are scratch, %r8 counts double rounds, %r9 keeps the entry
//   stack pointer.
// Stack: 72 bytes; 0..63(%rsp) hold the input state and, on the tail path,
// the keystream block. No callee-saved general-purpose register is written.
//-----------------------------------------------------------------------
.globl	_ChaCha20_ctr32_ssse3
.private_extern _ChaCha20_ctr32_ssse3

.p2align	5
_ChaCha20_ctr32_ssse3:

_CET_ENDBR
	movq	%rsp,%r9

	subq	$64+8,%rsp
	movdqa	L$sigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	L$rot16(%rip),%xmm6
	movdqa	L$rot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	L$oop_ssse3

.p2align	5
L$oop_outer_ssse3:
	// Next block: reload the state and add one to the counter dword.
	movdqa	L$one(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	L$oop_ssse3

.p2align	5
L$oop_ssse3:
	// Column round on all four columns at once.
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222		// pshufb %xmm6,%xmm3  (rotate left 16)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223		// pshufb %xmm7,%xmm3  (rotate left 8)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	// Rotate rows 1..3 so the diagonals line up as columns.
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	// Diagonal round.
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222		// pshufb %xmm6,%xmm3  (rotate left 16)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223		// pshufb %xmm7,%xmm3  (rotate left 8)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	// Undo the row rotation.
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	L$oop_ssse3

	// Feed-forward.
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	L$tail_ssse3

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	L$oop_outer_ssse3

	jmp	L$done_ssse3

.p2align	4
L$tail_ssse3:
	// Partial final block: spill the keystream over the saved state (it is
	// no longer needed) and XOR byte by byte; %r8 is the index.
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

L$oop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	L$oop_tail_ssse3

L$done_ssse3:
	leaq	(%r9),%rsp

L$ssse3_epilogue:
	ret


//-----------------------------------------------------------------------
// _ChaCha20_ctr32_ssse3_4x
//
// Four 64-byte blocks per outer iteration. Each XMM register holds the same
// state word for four consecutive blocks, one block per 32-bit lane:
//   %xmm8-%xmm11  = words 0..3      %xmm12-%xmm15 = words 4..7
//   %xmm4,%xmm5   = two of words 8..11 (the other two live on the stack)
//   %xmm0-%xmm3   = words 12..15    %xmm6,%xmm7 = scratch / pshufb masks
//   %r10 -> L$rot16, %r11 -> L$rot24, %eax = remaining double rounds,
//   %r9 = entry stack pointer, %rcx = %rsp+256 (base for the saved state).
// Stack (0x148 bytes):
//     0..63(%rsp)   spill area for words 8..11, later transposed keystream
//    64..127(%rsp)  broadcast words 0..3
//   128..319(%rsp)  broadcast words 4..15, addressed as N-256(%rcx);
//                   256-256(%rcx) holds the four per-lane counters.
// No callee-saved general-purpose register is written.
//-----------------------------------------------------------------------
.globl	_ChaCha20_ctr32_ssse3_4x
.private_extern _ChaCha20_ctr32_ssse3_4x

.p2align	5
_ChaCha20_ctr32_ssse3_4x:

_CET_ENDBR
	movq	%rsp,%r9

	subq	$0x140+8,%rsp
	movdqa	L$sigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx
	leaq	L$rot16(%rip),%r10
	leaq	L$rot24(%rip),%r11

	// Broadcast each state word across all four lanes and save it.
	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	// Word 12 gets a distinct counter per lane (+0,+1,+2,+3); it is stored
	// at L$oop_enter4x.
	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	L$inc(%rip),%xmm0
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	L$oop_enter4x

.p2align	5
L$oop_outer4x:
	// Reload the broadcast state and advance every lane counter by four.
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	L$four(%rip),%xmm0

L$oop_enter4x:
	// Spill words 10,11 to free %xmm6/%xmm7, preload the rot16 mask and
	// record the counters used for this batch.
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)
	jmp	L$oop4x

.p2align	5
L$oop4x:
	// Column round: quarter rounds on words (0,4,8,12) and (1,5,9,13).
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199		// pshufb %xmm7,%xmm0
.byte	102,15,56,0,207		// pshufb %xmm7,%xmm1
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198		// pshufb %xmm6,%xmm0
.byte	102,15,56,0,206		// pshufb %xmm6,%xmm1
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	// Swap words 8,9 out and words 10,11 in.
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	// Column round: quarter rounds on words (2,6,10,14) and (3,7,11,15).
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215		// pshufb %xmm7,%xmm2
.byte	102,15,56,0,223		// pshufb %xmm7,%xmm3
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214		// pshufb %xmm6,%xmm2
.byte	102,15,56,0,222		// pshufb %xmm6,%xmm3
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	// Diagonal round: quarter rounds on words (0,5,10,15) and (1,6,11,12).
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223		// pshufb %xmm7,%xmm3
.byte	102,15,56,0,199		// pshufb %xmm7,%xmm0
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222		// pshufb %xmm6,%xmm3
.byte	102,15,56,0,198		// pshufb %xmm6,%xmm0
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	// Swap words 10,11 out and words 8,9 back in.
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	// Diagonal round: quarter rounds on words (2,7,8,13) and (3,4,9,14).
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207		// pshufb %xmm7,%xmm1
.byte	102,15,56,0,215		// pshufb %xmm7,%xmm2
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206		// pshufb %xmm6,%xmm1
.byte	102,15,56,0,214		// pshufb %xmm6,%xmm2
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	L$oop4x

	// Feed-forward and 4x4 dword transposes, one group of four state words
	// at a time, so that each register ends up holding 16 contiguous
	// keystream bytes of a single block.
	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	// Park bytes 0..15 of blocks 0 and 1, then fetch words 10,11.
	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	// Park bytes 0..15 of blocks 2 and 3.
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	// Keystream layout at this point (16-byte quarters of each block):
	//   block 0: 0(%rsp),  %xmm12, %xmm4,  %xmm0
	//   block 1: 16(%rsp), %xmm13, %xmm5,  %xmm1
	//   block 2: 32(%rsp), %xmm10, %xmm14, %xmm8
	//   block 3: 48(%rsp), %xmm15, %xmm9,  %xmm3
	cmpq	$256,%rdx
	jb	L$tail4x

	// At least 256 bytes left: XOR four whole blocks.
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	L$oop_outer4x

	jmp	L$done4x

L$tail4x:
	// Fewer than 256 bytes left. Each branch below processes the whole
	// blocks it can, then stages the next keystream block in 0..63(%rsp)
	// for the byte-wise loop. The "je L$done4x" tests rely on the flags
	// of the cmpq that selected the branch; the SSE moves and pxor in
	// between do not modify the flags.
	cmpq	$192,%rdx
	jae	L$192_or_more4x
	cmpq	$128,%rdx
	jae	L$128_or_more4x
	cmpq	$64,%rdx
	jae	L$64_or_more4x

	// Under 64 bytes: block 0 is the partial block; its first 16 bytes are
	// already at 0(%rsp).
	xorq	%r10,%r10

	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	L$oop_tail4x

.p2align	5
L$64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	L$done4x

	// Stage block 1 for the remaining bytes.
	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	L$oop_tail4x

.p2align	5
L$128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	L$done4x

	// Stage block 2 for the remaining bytes.
	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	L$oop_tail4x

.p2align	5
L$192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	L$done4x

	// Stage block 3 for the remaining bytes. The pointers were already
	// advanced by 128 above, hence only +64 here.
	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

L$oop_tail4x:
	// Byte-wise XOR of the staged keystream block; %r10 is the index.
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	L$oop_tail4x

L$done4x:
	leaq	(%r9),%rsp

L$4x_epilogue:
	ret


// _ChaCha20_ctr32_avx2
// NOTE(review): only the opening of this routine is covered here; its body
// continues on the following lines of the file. Visible so far: %r9 keeps the
// entry stack pointer, the frame is 0x288 bytes realigned down to 32, and
// the state rows are broadcast into both 128-bit halves of YMM registers.
.globl	_ChaCha20_ctr32_avx2
.private_extern _ChaCha20_ctr32_avx2

.p2align	5
_ChaCha20_ctr32_avx2:

_CET_ENDBR
	movq	%rsp,%r9

	subq	$0x280+8,%rsp
	andq	$-32,%rsp
	vzeroupper

	vbroadcasti128	L$sigma(%rip),%ymm11
	vbroadcasti128	(%rcx),%ymm3
	vbroadcasti128	16(%rcx),%ymm15
	vbroadcasti128	(%r8),%ymm7
	leaq	256(%rsp),%rcx
	leaq	512(%rsp),%rax
	leaq	L$rot16(%rip),%r10
	leaq	L$rot24(%rip),%r11

	vpshufd	$0x00,%ymm11,%ymm8
	vpshufd	$0x55,%ymm11,%ymm9
	vmovdqa	%ymm8,128-256(%rcx)
	// The operands of the following mnemonic are on the next line of the file.
	vpshufd
$0xaa,%ymm11,%ymm10 1030 vmovdqa %ymm9,160-256(%rcx) 1031 vpshufd $0xff,%ymm11,%ymm11 1032 vmovdqa %ymm10,192-256(%rcx) 1033 vmovdqa %ymm11,224-256(%rcx) 1034 1035 vpshufd $0x00,%ymm3,%ymm0 1036 vpshufd $0x55,%ymm3,%ymm1 1037 vmovdqa %ymm0,256-256(%rcx) 1038 vpshufd $0xaa,%ymm3,%ymm2 1039 vmovdqa %ymm1,288-256(%rcx) 1040 vpshufd $0xff,%ymm3,%ymm3 1041 vmovdqa %ymm2,320-256(%rcx) 1042 vmovdqa %ymm3,352-256(%rcx) 1043 1044 vpshufd $0x00,%ymm15,%ymm12 1045 vpshufd $0x55,%ymm15,%ymm13 1046 vmovdqa %ymm12,384-512(%rax) 1047 vpshufd $0xaa,%ymm15,%ymm14 1048 vmovdqa %ymm13,416-512(%rax) 1049 vpshufd $0xff,%ymm15,%ymm15 1050 vmovdqa %ymm14,448-512(%rax) 1051 vmovdqa %ymm15,480-512(%rax) 1052 1053 vpshufd $0x00,%ymm7,%ymm4 1054 vpshufd $0x55,%ymm7,%ymm5 1055 vpaddd L$incy(%rip),%ymm4,%ymm4 1056 vpshufd $0xaa,%ymm7,%ymm6 1057 vmovdqa %ymm5,544-512(%rax) 1058 vpshufd $0xff,%ymm7,%ymm7 1059 vmovdqa %ymm6,576-512(%rax) 1060 vmovdqa %ymm7,608-512(%rax) 1061 1062 jmp L$oop_enter8x 1063 1064.p2align 5 1065L$oop_outer8x: 1066 vmovdqa 128-256(%rcx),%ymm8 1067 vmovdqa 160-256(%rcx),%ymm9 1068 vmovdqa 192-256(%rcx),%ymm10 1069 vmovdqa 224-256(%rcx),%ymm11 1070 vmovdqa 256-256(%rcx),%ymm0 1071 vmovdqa 288-256(%rcx),%ymm1 1072 vmovdqa 320-256(%rcx),%ymm2 1073 vmovdqa 352-256(%rcx),%ymm3 1074 vmovdqa 384-512(%rax),%ymm12 1075 vmovdqa 416-512(%rax),%ymm13 1076 vmovdqa 448-512(%rax),%ymm14 1077 vmovdqa 480-512(%rax),%ymm15 1078 vmovdqa 512-512(%rax),%ymm4 1079 vmovdqa 544-512(%rax),%ymm5 1080 vmovdqa 576-512(%rax),%ymm6 1081 vmovdqa 608-512(%rax),%ymm7 1082 vpaddd L$eight(%rip),%ymm4,%ymm4 1083 1084L$oop_enter8x: 1085 vmovdqa %ymm14,64(%rsp) 1086 vmovdqa %ymm15,96(%rsp) 1087 vbroadcasti128 (%r10),%ymm15 1088 vmovdqa %ymm4,512-512(%rax) 1089 movl $10,%eax 1090 jmp L$oop8x 1091 1092.p2align 5 1093L$oop8x: 1094 vpaddd %ymm0,%ymm8,%ymm8 1095 vpxor %ymm4,%ymm8,%ymm4 1096 vpshufb %ymm15,%ymm4,%ymm4 1097 vpaddd %ymm1,%ymm9,%ymm9 1098 vpxor %ymm5,%ymm9,%ymm5 1099 vpshufb %ymm15,%ymm5,%ymm5 1100 
vpaddd %ymm4,%ymm12,%ymm12 1101 vpxor %ymm0,%ymm12,%ymm0 1102 vpslld $12,%ymm0,%ymm14 1103 vpsrld $20,%ymm0,%ymm0 1104 vpor %ymm0,%ymm14,%ymm0 1105 vbroadcasti128 (%r11),%ymm14 1106 vpaddd %ymm5,%ymm13,%ymm13 1107 vpxor %ymm1,%ymm13,%ymm1 1108 vpslld $12,%ymm1,%ymm15 1109 vpsrld $20,%ymm1,%ymm1 1110 vpor %ymm1,%ymm15,%ymm1 1111 vpaddd %ymm0,%ymm8,%ymm8 1112 vpxor %ymm4,%ymm8,%ymm4 1113 vpshufb %ymm14,%ymm4,%ymm4 1114 vpaddd %ymm1,%ymm9,%ymm9 1115 vpxor %ymm5,%ymm9,%ymm5 1116 vpshufb %ymm14,%ymm5,%ymm5 1117 vpaddd %ymm4,%ymm12,%ymm12 1118 vpxor %ymm0,%ymm12,%ymm0 1119 vpslld $7,%ymm0,%ymm15 1120 vpsrld $25,%ymm0,%ymm0 1121 vpor %ymm0,%ymm15,%ymm0 1122 vbroadcasti128 (%r10),%ymm15 1123 vpaddd %ymm5,%ymm13,%ymm13 1124 vpxor %ymm1,%ymm13,%ymm1 1125 vpslld $7,%ymm1,%ymm14 1126 vpsrld $25,%ymm1,%ymm1 1127 vpor %ymm1,%ymm14,%ymm1 1128 vmovdqa %ymm12,0(%rsp) 1129 vmovdqa %ymm13,32(%rsp) 1130 vmovdqa 64(%rsp),%ymm12 1131 vmovdqa 96(%rsp),%ymm13 1132 vpaddd %ymm2,%ymm10,%ymm10 1133 vpxor %ymm6,%ymm10,%ymm6 1134 vpshufb %ymm15,%ymm6,%ymm6 1135 vpaddd %ymm3,%ymm11,%ymm11 1136 vpxor %ymm7,%ymm11,%ymm7 1137 vpshufb %ymm15,%ymm7,%ymm7 1138 vpaddd %ymm6,%ymm12,%ymm12 1139 vpxor %ymm2,%ymm12,%ymm2 1140 vpslld $12,%ymm2,%ymm14 1141 vpsrld $20,%ymm2,%ymm2 1142 vpor %ymm2,%ymm14,%ymm2 1143 vbroadcasti128 (%r11),%ymm14 1144 vpaddd %ymm7,%ymm13,%ymm13 1145 vpxor %ymm3,%ymm13,%ymm3 1146 vpslld $12,%ymm3,%ymm15 1147 vpsrld $20,%ymm3,%ymm3 1148 vpor %ymm3,%ymm15,%ymm3 1149 vpaddd %ymm2,%ymm10,%ymm10 1150 vpxor %ymm6,%ymm10,%ymm6 1151 vpshufb %ymm14,%ymm6,%ymm6 1152 vpaddd %ymm3,%ymm11,%ymm11 1153 vpxor %ymm7,%ymm11,%ymm7 1154 vpshufb %ymm14,%ymm7,%ymm7 1155 vpaddd %ymm6,%ymm12,%ymm12 1156 vpxor %ymm2,%ymm12,%ymm2 1157 vpslld $7,%ymm2,%ymm15 1158 vpsrld $25,%ymm2,%ymm2 1159 vpor %ymm2,%ymm15,%ymm2 1160 vbroadcasti128 (%r10),%ymm15 1161 vpaddd %ymm7,%ymm13,%ymm13 1162 vpxor %ymm3,%ymm13,%ymm3 1163 vpslld $7,%ymm3,%ymm14 1164 vpsrld $25,%ymm3,%ymm3 1165 vpor %ymm3,%ymm14,%ymm3 1166 vpaddd 
%ymm1,%ymm8,%ymm8 1167 vpxor %ymm7,%ymm8,%ymm7 1168 vpshufb %ymm15,%ymm7,%ymm7 1169 vpaddd %ymm2,%ymm9,%ymm9 1170 vpxor %ymm4,%ymm9,%ymm4 1171 vpshufb %ymm15,%ymm4,%ymm4 1172 vpaddd %ymm7,%ymm12,%ymm12 1173 vpxor %ymm1,%ymm12,%ymm1 1174 vpslld $12,%ymm1,%ymm14 1175 vpsrld $20,%ymm1,%ymm1 1176 vpor %ymm1,%ymm14,%ymm1 1177 vbroadcasti128 (%r11),%ymm14 1178 vpaddd %ymm4,%ymm13,%ymm13 1179 vpxor %ymm2,%ymm13,%ymm2 1180 vpslld $12,%ymm2,%ymm15 1181 vpsrld $20,%ymm2,%ymm2 1182 vpor %ymm2,%ymm15,%ymm2 1183 vpaddd %ymm1,%ymm8,%ymm8 1184 vpxor %ymm7,%ymm8,%ymm7 1185 vpshufb %ymm14,%ymm7,%ymm7 1186 vpaddd %ymm2,%ymm9,%ymm9 1187 vpxor %ymm4,%ymm9,%ymm4 1188 vpshufb %ymm14,%ymm4,%ymm4 1189 vpaddd %ymm7,%ymm12,%ymm12 1190 vpxor %ymm1,%ymm12,%ymm1 1191 vpslld $7,%ymm1,%ymm15 1192 vpsrld $25,%ymm1,%ymm1 1193 vpor %ymm1,%ymm15,%ymm1 1194 vbroadcasti128 (%r10),%ymm15 1195 vpaddd %ymm4,%ymm13,%ymm13 1196 vpxor %ymm2,%ymm13,%ymm2 1197 vpslld $7,%ymm2,%ymm14 1198 vpsrld $25,%ymm2,%ymm2 1199 vpor %ymm2,%ymm14,%ymm2 1200 vmovdqa %ymm12,64(%rsp) 1201 vmovdqa %ymm13,96(%rsp) 1202 vmovdqa 0(%rsp),%ymm12 1203 vmovdqa 32(%rsp),%ymm13 1204 vpaddd %ymm3,%ymm10,%ymm10 1205 vpxor %ymm5,%ymm10,%ymm5 1206 vpshufb %ymm15,%ymm5,%ymm5 1207 vpaddd %ymm0,%ymm11,%ymm11 1208 vpxor %ymm6,%ymm11,%ymm6 1209 vpshufb %ymm15,%ymm6,%ymm6 1210 vpaddd %ymm5,%ymm12,%ymm12 1211 vpxor %ymm3,%ymm12,%ymm3 1212 vpslld $12,%ymm3,%ymm14 1213 vpsrld $20,%ymm3,%ymm3 1214 vpor %ymm3,%ymm14,%ymm3 1215 vbroadcasti128 (%r11),%ymm14 1216 vpaddd %ymm6,%ymm13,%ymm13 1217 vpxor %ymm0,%ymm13,%ymm0 1218 vpslld $12,%ymm0,%ymm15 1219 vpsrld $20,%ymm0,%ymm0 1220 vpor %ymm0,%ymm15,%ymm0 1221 vpaddd %ymm3,%ymm10,%ymm10 1222 vpxor %ymm5,%ymm10,%ymm5 1223 vpshufb %ymm14,%ymm5,%ymm5 1224 vpaddd %ymm0,%ymm11,%ymm11 1225 vpxor %ymm6,%ymm11,%ymm6 1226 vpshufb %ymm14,%ymm6,%ymm6 1227 vpaddd %ymm5,%ymm12,%ymm12 1228 vpxor %ymm3,%ymm12,%ymm3 1229 vpslld $7,%ymm3,%ymm15 1230 vpsrld $25,%ymm3,%ymm3 1231 vpor %ymm3,%ymm15,%ymm3 1232 vbroadcasti128 
(%r10),%ymm15 1233 vpaddd %ymm6,%ymm13,%ymm13 1234 vpxor %ymm0,%ymm13,%ymm0 1235 vpslld $7,%ymm0,%ymm14 1236 vpsrld $25,%ymm0,%ymm0 1237 vpor %ymm0,%ymm14,%ymm0 1238 decl %eax 1239 jnz L$oop8x 1240 1241 leaq 512(%rsp),%rax 1242 vpaddd 128-256(%rcx),%ymm8,%ymm8 1243 vpaddd 160-256(%rcx),%ymm9,%ymm9 1244 vpaddd 192-256(%rcx),%ymm10,%ymm10 1245 vpaddd 224-256(%rcx),%ymm11,%ymm11 1246 1247 vpunpckldq %ymm9,%ymm8,%ymm14 1248 vpunpckldq %ymm11,%ymm10,%ymm15 1249 vpunpckhdq %ymm9,%ymm8,%ymm8 1250 vpunpckhdq %ymm11,%ymm10,%ymm10 1251 vpunpcklqdq %ymm15,%ymm14,%ymm9 1252 vpunpckhqdq %ymm15,%ymm14,%ymm14 1253 vpunpcklqdq %ymm10,%ymm8,%ymm11 1254 vpunpckhqdq %ymm10,%ymm8,%ymm8 1255 vpaddd 256-256(%rcx),%ymm0,%ymm0 1256 vpaddd 288-256(%rcx),%ymm1,%ymm1 1257 vpaddd 320-256(%rcx),%ymm2,%ymm2 1258 vpaddd 352-256(%rcx),%ymm3,%ymm3 1259 1260 vpunpckldq %ymm1,%ymm0,%ymm10 1261 vpunpckldq %ymm3,%ymm2,%ymm15 1262 vpunpckhdq %ymm1,%ymm0,%ymm0 1263 vpunpckhdq %ymm3,%ymm2,%ymm2 1264 vpunpcklqdq %ymm15,%ymm10,%ymm1 1265 vpunpckhqdq %ymm15,%ymm10,%ymm10 1266 vpunpcklqdq %ymm2,%ymm0,%ymm3 1267 vpunpckhqdq %ymm2,%ymm0,%ymm0 1268 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 1269 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 1270 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 1271 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 1272 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 1273 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 1274 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 1275 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 1276 vmovdqa %ymm15,0(%rsp) 1277 vmovdqa %ymm9,32(%rsp) 1278 vmovdqa 64(%rsp),%ymm15 1279 vmovdqa 96(%rsp),%ymm9 1280 1281 vpaddd 384-512(%rax),%ymm12,%ymm12 1282 vpaddd 416-512(%rax),%ymm13,%ymm13 1283 vpaddd 448-512(%rax),%ymm15,%ymm15 1284 vpaddd 480-512(%rax),%ymm9,%ymm9 1285 1286 vpunpckldq %ymm13,%ymm12,%ymm2 1287 vpunpckldq %ymm9,%ymm15,%ymm8 1288 vpunpckhdq %ymm13,%ymm12,%ymm12 1289 vpunpckhdq %ymm9,%ymm15,%ymm15 1290 vpunpcklqdq %ymm8,%ymm2,%ymm13 1291 vpunpckhqdq %ymm8,%ymm2,%ymm2 1292 vpunpcklqdq %ymm15,%ymm12,%ymm9 1293 vpunpckhqdq 
%ymm15,%ymm12,%ymm12 1294 vpaddd 512-512(%rax),%ymm4,%ymm4 1295 vpaddd 544-512(%rax),%ymm5,%ymm5 1296 vpaddd 576-512(%rax),%ymm6,%ymm6 1297 vpaddd 608-512(%rax),%ymm7,%ymm7 1298 1299 vpunpckldq %ymm5,%ymm4,%ymm15 1300 vpunpckldq %ymm7,%ymm6,%ymm8 1301 vpunpckhdq %ymm5,%ymm4,%ymm4 1302 vpunpckhdq %ymm7,%ymm6,%ymm6 1303 vpunpcklqdq %ymm8,%ymm15,%ymm5 1304 vpunpckhqdq %ymm8,%ymm15,%ymm15 1305 vpunpcklqdq %ymm6,%ymm4,%ymm7 1306 vpunpckhqdq %ymm6,%ymm4,%ymm4 1307 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 1308 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 1309 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 1310 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 1311 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 1312 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 1313 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 1314 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 1315 vmovdqa 0(%rsp),%ymm6 1316 vmovdqa 32(%rsp),%ymm12 1317 1318 cmpq $512,%rdx 1319 jb L$tail8x 1320 1321 vpxor 0(%rsi),%ymm6,%ymm6 1322 vpxor 32(%rsi),%ymm8,%ymm8 1323 vpxor 64(%rsi),%ymm1,%ymm1 1324 vpxor 96(%rsi),%ymm5,%ymm5 1325 leaq 128(%rsi),%rsi 1326 vmovdqu %ymm6,0(%rdi) 1327 vmovdqu %ymm8,32(%rdi) 1328 vmovdqu %ymm1,64(%rdi) 1329 vmovdqu %ymm5,96(%rdi) 1330 leaq 128(%rdi),%rdi 1331 1332 vpxor 0(%rsi),%ymm12,%ymm12 1333 vpxor 32(%rsi),%ymm13,%ymm13 1334 vpxor 64(%rsi),%ymm10,%ymm10 1335 vpxor 96(%rsi),%ymm15,%ymm15 1336 leaq 128(%rsi),%rsi 1337 vmovdqu %ymm12,0(%rdi) 1338 vmovdqu %ymm13,32(%rdi) 1339 vmovdqu %ymm10,64(%rdi) 1340 vmovdqu %ymm15,96(%rdi) 1341 leaq 128(%rdi),%rdi 1342 1343 vpxor 0(%rsi),%ymm14,%ymm14 1344 vpxor 32(%rsi),%ymm2,%ymm2 1345 vpxor 64(%rsi),%ymm3,%ymm3 1346 vpxor 96(%rsi),%ymm7,%ymm7 1347 leaq 128(%rsi),%rsi 1348 vmovdqu %ymm14,0(%rdi) 1349 vmovdqu %ymm2,32(%rdi) 1350 vmovdqu %ymm3,64(%rdi) 1351 vmovdqu %ymm7,96(%rdi) 1352 leaq 128(%rdi),%rdi 1353 1354 vpxor 0(%rsi),%ymm11,%ymm11 1355 vpxor 32(%rsi),%ymm9,%ymm9 1356 vpxor 64(%rsi),%ymm0,%ymm0 1357 vpxor 96(%rsi),%ymm4,%ymm4 1358 leaq 128(%rsi),%rsi 1359 vmovdqu %ymm11,0(%rdi) 1360 vmovdqu %ymm9,32(%rdi) 1361 
vmovdqu %ymm0,64(%rdi) 1362 vmovdqu %ymm4,96(%rdi) 1363 leaq 128(%rdi),%rdi 1364 1365 subq $512,%rdx 1366 jnz L$oop_outer8x 1367 1368 jmp L$done8x 1369 1370L$tail8x: 1371 cmpq $448,%rdx 1372 jae L$448_or_more8x 1373 cmpq $384,%rdx 1374 jae L$384_or_more8x 1375 cmpq $320,%rdx 1376 jae L$320_or_more8x 1377 cmpq $256,%rdx 1378 jae L$256_or_more8x 1379 cmpq $192,%rdx 1380 jae L$192_or_more8x 1381 cmpq $128,%rdx 1382 jae L$128_or_more8x 1383 cmpq $64,%rdx 1384 jae L$64_or_more8x 1385 1386 xorq %r10,%r10 1387 vmovdqa %ymm6,0(%rsp) 1388 vmovdqa %ymm8,32(%rsp) 1389 jmp L$oop_tail8x 1390 1391.p2align 5 1392L$64_or_more8x: 1393 vpxor 0(%rsi),%ymm6,%ymm6 1394 vpxor 32(%rsi),%ymm8,%ymm8 1395 vmovdqu %ymm6,0(%rdi) 1396 vmovdqu %ymm8,32(%rdi) 1397 je L$done8x 1398 1399 leaq 64(%rsi),%rsi 1400 xorq %r10,%r10 1401 vmovdqa %ymm1,0(%rsp) 1402 leaq 64(%rdi),%rdi 1403 subq $64,%rdx 1404 vmovdqa %ymm5,32(%rsp) 1405 jmp L$oop_tail8x 1406 1407.p2align 5 1408L$128_or_more8x: 1409 vpxor 0(%rsi),%ymm6,%ymm6 1410 vpxor 32(%rsi),%ymm8,%ymm8 1411 vpxor 64(%rsi),%ymm1,%ymm1 1412 vpxor 96(%rsi),%ymm5,%ymm5 1413 vmovdqu %ymm6,0(%rdi) 1414 vmovdqu %ymm8,32(%rdi) 1415 vmovdqu %ymm1,64(%rdi) 1416 vmovdqu %ymm5,96(%rdi) 1417 je L$done8x 1418 1419 leaq 128(%rsi),%rsi 1420 xorq %r10,%r10 1421 vmovdqa %ymm12,0(%rsp) 1422 leaq 128(%rdi),%rdi 1423 subq $128,%rdx 1424 vmovdqa %ymm13,32(%rsp) 1425 jmp L$oop_tail8x 1426 1427.p2align 5 1428L$192_or_more8x: 1429 vpxor 0(%rsi),%ymm6,%ymm6 1430 vpxor 32(%rsi),%ymm8,%ymm8 1431 vpxor 64(%rsi),%ymm1,%ymm1 1432 vpxor 96(%rsi),%ymm5,%ymm5 1433 vpxor 128(%rsi),%ymm12,%ymm12 1434 vpxor 160(%rsi),%ymm13,%ymm13 1435 vmovdqu %ymm6,0(%rdi) 1436 vmovdqu %ymm8,32(%rdi) 1437 vmovdqu %ymm1,64(%rdi) 1438 vmovdqu %ymm5,96(%rdi) 1439 vmovdqu %ymm12,128(%rdi) 1440 vmovdqu %ymm13,160(%rdi) 1441 je L$done8x 1442 1443 leaq 192(%rsi),%rsi 1444 xorq %r10,%r10 1445 vmovdqa %ymm10,0(%rsp) 1446 leaq 192(%rdi),%rdi 1447 subq $192,%rdx 1448 vmovdqa %ymm15,32(%rsp) 1449 jmp L$oop_tail8x 1450 
1451.p2align 5 1452L$256_or_more8x: 1453 vpxor 0(%rsi),%ymm6,%ymm6 1454 vpxor 32(%rsi),%ymm8,%ymm8 1455 vpxor 64(%rsi),%ymm1,%ymm1 1456 vpxor 96(%rsi),%ymm5,%ymm5 1457 vpxor 128(%rsi),%ymm12,%ymm12 1458 vpxor 160(%rsi),%ymm13,%ymm13 1459 vpxor 192(%rsi),%ymm10,%ymm10 1460 vpxor 224(%rsi),%ymm15,%ymm15 1461 vmovdqu %ymm6,0(%rdi) 1462 vmovdqu %ymm8,32(%rdi) 1463 vmovdqu %ymm1,64(%rdi) 1464 vmovdqu %ymm5,96(%rdi) 1465 vmovdqu %ymm12,128(%rdi) 1466 vmovdqu %ymm13,160(%rdi) 1467 vmovdqu %ymm10,192(%rdi) 1468 vmovdqu %ymm15,224(%rdi) 1469 je L$done8x 1470 1471 leaq 256(%rsi),%rsi 1472 xorq %r10,%r10 1473 vmovdqa %ymm14,0(%rsp) 1474 leaq 256(%rdi),%rdi 1475 subq $256,%rdx 1476 vmovdqa %ymm2,32(%rsp) 1477 jmp L$oop_tail8x 1478 1479.p2align 5 1480L$320_or_more8x: 1481 vpxor 0(%rsi),%ymm6,%ymm6 1482 vpxor 32(%rsi),%ymm8,%ymm8 1483 vpxor 64(%rsi),%ymm1,%ymm1 1484 vpxor 96(%rsi),%ymm5,%ymm5 1485 vpxor 128(%rsi),%ymm12,%ymm12 1486 vpxor 160(%rsi),%ymm13,%ymm13 1487 vpxor 192(%rsi),%ymm10,%ymm10 1488 vpxor 224(%rsi),%ymm15,%ymm15 1489 vpxor 256(%rsi),%ymm14,%ymm14 1490 vpxor 288(%rsi),%ymm2,%ymm2 1491 vmovdqu %ymm6,0(%rdi) 1492 vmovdqu %ymm8,32(%rdi) 1493 vmovdqu %ymm1,64(%rdi) 1494 vmovdqu %ymm5,96(%rdi) 1495 vmovdqu %ymm12,128(%rdi) 1496 vmovdqu %ymm13,160(%rdi) 1497 vmovdqu %ymm10,192(%rdi) 1498 vmovdqu %ymm15,224(%rdi) 1499 vmovdqu %ymm14,256(%rdi) 1500 vmovdqu %ymm2,288(%rdi) 1501 je L$done8x 1502 1503 leaq 320(%rsi),%rsi 1504 xorq %r10,%r10 1505 vmovdqa %ymm3,0(%rsp) 1506 leaq 320(%rdi),%rdi 1507 subq $320,%rdx 1508 vmovdqa %ymm7,32(%rsp) 1509 jmp L$oop_tail8x 1510 1511.p2align 5 1512L$384_or_more8x: 1513 vpxor 0(%rsi),%ymm6,%ymm6 1514 vpxor 32(%rsi),%ymm8,%ymm8 1515 vpxor 64(%rsi),%ymm1,%ymm1 1516 vpxor 96(%rsi),%ymm5,%ymm5 1517 vpxor 128(%rsi),%ymm12,%ymm12 1518 vpxor 160(%rsi),%ymm13,%ymm13 1519 vpxor 192(%rsi),%ymm10,%ymm10 1520 vpxor 224(%rsi),%ymm15,%ymm15 1521 vpxor 256(%rsi),%ymm14,%ymm14 1522 vpxor 288(%rsi),%ymm2,%ymm2 1523 vpxor 320(%rsi),%ymm3,%ymm3 1524 vpxor 
352(%rsi),%ymm7,%ymm7 1525 vmovdqu %ymm6,0(%rdi) 1526 vmovdqu %ymm8,32(%rdi) 1527 vmovdqu %ymm1,64(%rdi) 1528 vmovdqu %ymm5,96(%rdi) 1529 vmovdqu %ymm12,128(%rdi) 1530 vmovdqu %ymm13,160(%rdi) 1531 vmovdqu %ymm10,192(%rdi) 1532 vmovdqu %ymm15,224(%rdi) 1533 vmovdqu %ymm14,256(%rdi) 1534 vmovdqu %ymm2,288(%rdi) 1535 vmovdqu %ymm3,320(%rdi) 1536 vmovdqu %ymm7,352(%rdi) 1537 je L$done8x 1538 1539 leaq 384(%rsi),%rsi 1540 xorq %r10,%r10 1541 vmovdqa %ymm11,0(%rsp) 1542 leaq 384(%rdi),%rdi 1543 subq $384,%rdx 1544 vmovdqa %ymm9,32(%rsp) 1545 jmp L$oop_tail8x 1546 1547.p2align 5 1548L$448_or_more8x: 1549 vpxor 0(%rsi),%ymm6,%ymm6 1550 vpxor 32(%rsi),%ymm8,%ymm8 1551 vpxor 64(%rsi),%ymm1,%ymm1 1552 vpxor 96(%rsi),%ymm5,%ymm5 1553 vpxor 128(%rsi),%ymm12,%ymm12 1554 vpxor 160(%rsi),%ymm13,%ymm13 1555 vpxor 192(%rsi),%ymm10,%ymm10 1556 vpxor 224(%rsi),%ymm15,%ymm15 1557 vpxor 256(%rsi),%ymm14,%ymm14 1558 vpxor 288(%rsi),%ymm2,%ymm2 1559 vpxor 320(%rsi),%ymm3,%ymm3 1560 vpxor 352(%rsi),%ymm7,%ymm7 1561 vpxor 384(%rsi),%ymm11,%ymm11 1562 vpxor 416(%rsi),%ymm9,%ymm9 1563 vmovdqu %ymm6,0(%rdi) 1564 vmovdqu %ymm8,32(%rdi) 1565 vmovdqu %ymm1,64(%rdi) 1566 vmovdqu %ymm5,96(%rdi) 1567 vmovdqu %ymm12,128(%rdi) 1568 vmovdqu %ymm13,160(%rdi) 1569 vmovdqu %ymm10,192(%rdi) 1570 vmovdqu %ymm15,224(%rdi) 1571 vmovdqu %ymm14,256(%rdi) 1572 vmovdqu %ymm2,288(%rdi) 1573 vmovdqu %ymm3,320(%rdi) 1574 vmovdqu %ymm7,352(%rdi) 1575 vmovdqu %ymm11,384(%rdi) 1576 vmovdqu %ymm9,416(%rdi) 1577 je L$done8x 1578 1579 leaq 448(%rsi),%rsi 1580 xorq %r10,%r10 1581 vmovdqa %ymm0,0(%rsp) 1582 leaq 448(%rdi),%rdi 1583 subq $448,%rdx 1584 vmovdqa %ymm4,32(%rsp) 1585 1586L$oop_tail8x: 1587 movzbl (%rsi,%r10,1),%eax 1588 movzbl (%rsp,%r10,1),%ecx 1589 leaq 1(%r10),%r10 1590 xorl %ecx,%eax 1591 movb %al,-1(%rdi,%r10,1) 1592 decq %rdx 1593 jnz L$oop_tail8x 1594 1595L$done8x: 1596 vzeroall 1597 leaq (%r9),%rsp 1598 1599L$8x_epilogue: 1600 ret 1601 1602 1603#endif 1604