; This file is generated from a similarly-named Perl script in the BoringSSL
; source tree. Do not edit by hand.
;
; NOTE(review): this text was restored from a copy in which every source line
; had been fused with its line number and the newlines collapsed. Only layout
; and comments were touched; regenerate from the Perl script if in doubt.

%ifidn __OUTPUT_FORMAT__, win64
default rel
%define XMMWORD
%define YMMWORD
%define ZMMWORD
%define _CET_ENDBR

%include "ring_core_generated/prefix_symbols_nasm.inc"
section .text code align=64


EXTERN  OPENSSL_ia32cap_P

; ---------------------------------------------------------------------------
; Read-only constants shared by all code paths below.
; ---------------------------------------------------------------------------
section .rdata rdata align=8
ALIGN   64
$L$zero:
        DD      0,0,0,0
$L$one:
        DD      1,0,0,0                 ; +1 on dword 0 (block counter step)
$L$inc:
        DD      0,1,2,3                 ; per-lane counter offsets, 4 lanes
$L$four:
        DD      4,4,4,4                 ; counter step for the 4-lane path
$L$incy:
        DD      0,2,4,6,1,3,5,7         ; per-lane counter offsets, 8 lanes
$L$eight:
        DD      8,8,8,8,8,8,8,8         ; counter step for the 8-lane path
$L$rot16:                               ; pshufb mask: rotate each dword left 16
        DB      0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
$L$rot24:                               ; pshufb mask: rotate each dword left 8
        DB      0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
$L$sigma:                               ; ASCII "expand 32-byte k", NUL
        DB      101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
        DB      0
ALIGN   64
$L$zeroz:
        DD      0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
$L$fourz:
        DD      4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
$L$incz:
        DD      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
$L$sixteen:
        DD      16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
        ; ASCII "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>", NUL
        DB      67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
        DB      95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
        DB      98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
        DB      108,46,111,114,103,62,0
section .text

global  ChaCha20_ctr32

;-----------------------------------------------------------------------
; ChaCha20_ctr32 -- public entry point; scalar (integer-register) path.
;
; ABI:  Microsoft x64. The prologue spills rdi/rsi to the caller's home
;       space and remaps the five arguments to the register assignment
;       used by the body:
;         rcx -> rdi  destination pointer (written 64 bytes at a time)
;         rdx -> rsi  source pointer (XORed with the key stream)
;         r8  -> rdx  byte count; zero returns immediately
;         r9  -> rcx  pointer to 32 bytes loaded as state words 4..11
;         [40+rsp] -> r8  pointer to 16 bytes loaded as state words 12..15
; Dispatch: if bit 9 of the dword at OPENSSL_ia32cap_P+4 is set, control
;       goes to $L$ChaCha20_ssse3 with the remapped registers and with
;       r10 still holding the qword read from OPENSSL_ia32cap_P+4.
; Frame (after six pushes and sub rsp,64+24):
;         [0..63+rsp]   state words / key-stream block scratch
;         [64+rsp]      remaining length   [72+rsp] src   [80+rsp] dst
;-----------------------------------------------------------------------
ALIGN   64
ChaCha20_ctr32:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_ChaCha20_ctr32:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8
        mov     rcx,r9
        mov     r8,QWORD[40+rsp]



_CET_ENDBR
        cmp     rdx,0
        je      NEAR $L$no_data         ; nothing to do for zero length
        mov     r10,QWORD[((OPENSSL_ia32cap_P+4))]
        test    r10d,512                ; bit 9 of capability dword at +4
        jnz     NEAR $L$ChaCha20_ssse3

        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        sub     rsp,64+24

$L$ctr32_body:
        movdqu  xmm1,XMMWORD[rcx]       ; state words 4..7
        movdqu  xmm2,XMMWORD[16+rcx]    ; state words 8..11
        movdqu  xmm3,XMMWORD[r8]        ; state words 12..15 (dword 0 = counter)
        movdqa  xmm4,XMMWORD[$L$one]

        movdqa  XMMWORD[16+rsp],xmm1
        movdqa  XMMWORD[32+rsp],xmm2
        movdqa  XMMWORD[48+rsp],xmm3
        mov     rbp,rdx                 ; rbp = bytes remaining
        jmp     NEAR $L$oop_outer

; One iteration per 64-byte block. State words 0..3 are the four
; little-endian dwords of $L$sigma; words 8/9 live in esi/edi and
; words 10/11 in [40+rsp]/[44+rsp], swapped in and out inside $L$oop.
ALIGN   32
$L$oop_outer:
        mov     eax,0x61707865
        mov     ebx,0x3320646e
        mov     ecx,0x79622d32
        mov     edx,0x6b206574
        mov     r8d,DWORD[16+rsp]
        mov     r9d,DWORD[20+rsp]
        mov     r10d,DWORD[24+rsp]
        mov     r11d,DWORD[28+rsp]
        movd    r12d,xmm3               ; current block counter
        mov     r13d,DWORD[52+rsp]
        mov     r14d,DWORD[56+rsp]
        mov     r15d,DWORD[60+rsp]

        mov     QWORD[((64+0))+rsp],rbp ; spill length; ebp becomes round counter
        mov     ebp,10
        mov     QWORD[((64+8))+rsp],rsi ; spill src; esi/edi reused for words 8/9
DB      102,72,15,126,214               ; movq rsi,xmm2
        mov     QWORD[((64+16))+rsp],rdi
        mov     rdi,rsi
        shr     rdi,32                  ; edi = word 9, esi = word 8
        jmp     NEAR $L$oop

; Ten iterations, each a column round followed by a diagonal round
; (rotations by 16, 12, 8, 7).
ALIGN   32
$L$oop:
        add     eax,r8d
        xor     r12d,eax
        rol     r12d,16
        add     ebx,r9d
        xor     r13d,ebx
        rol     r13d,16
        add     esi,r12d
        xor     r8d,esi
        rol     r8d,12
        add     edi,r13d
        xor     r9d,edi
        rol     r9d,12
        add     eax,r8d
        xor     r12d,eax
        rol     r12d,8
        add     ebx,r9d
        xor     r13d,ebx
        rol     r13d,8
        add     esi,r12d
        xor     r8d,esi
        rol     r8d,7
        add     edi,r13d
        xor     r9d,edi
        rol     r9d,7
        mov     DWORD[32+rsp],esi       ; park words 8/9, fetch words 10/11
        mov     DWORD[36+rsp],edi
        mov     esi,DWORD[40+rsp]
        mov     edi,DWORD[44+rsp]
        add     ecx,r10d
        xor     r14d,ecx
        rol     r14d,16
        add     edx,r11d
        xor     r15d,edx
        rol     r15d,16
        add     esi,r14d
        xor     r10d,esi
        rol     r10d,12
        add     edi,r15d
        xor     r11d,edi
        rol     r11d,12
        add     ecx,r10d
        xor     r14d,ecx
        rol     r14d,8
        add     edx,r11d
        xor     r15d,edx
        rol     r15d,8
        add     esi,r14d
        xor     r10d,esi
        rol     r10d,7
        add     edi,r15d
        xor     r11d,edi
        rol     r11d,7
        add     eax,r9d
        xor     r15d,eax
        rol     r15d,16
        add     ebx,r10d
        xor     r12d,ebx
        rol     r12d,16
        add     esi,r15d
        xor     r9d,esi
        rol     r9d,12
        add     edi,r12d
        xor     r10d,edi
        rol     r10d,12
        add     eax,r9d
        xor     r15d,eax
        rol     r15d,8
        add     ebx,r10d
        xor     r12d,ebx
        rol     r12d,8
        add     esi,r15d
        xor     r9d,esi
        rol     r9d,7
        add     edi,r12d
        xor     r10d,edi
        rol     r10d,7
        mov     DWORD[40+rsp],esi       ; park words 10/11, fetch words 8/9
        mov     DWORD[44+rsp],edi
        mov     esi,DWORD[32+rsp]
        mov     edi,DWORD[36+rsp]
        add     ecx,r11d
        xor     r13d,ecx
        rol     r13d,16
        add     edx,r8d
        xor     r14d,edx
        rol     r14d,16
        add     esi,r13d
        xor     r11d,esi
        rol     r11d,12
        add     edi,r14d
        xor     r8d,edi
        rol     r8d,12
        add     ecx,r11d
        xor     r13d,ecx
        rol     r13d,8
        add     edx,r8d
        xor     r14d,edx
        rol     r14d,8
        add     esi,r13d
        xor     r11d,esi
        rol     r11d,7
        add     edi,r14d
        xor     r8d,edi
        rol     r8d,7
        dec     ebp
        jnz     NEAR $L$oop
        mov     DWORD[36+rsp],edi
        mov     DWORD[32+rsp],esi
        mov     rbp,QWORD[64+rsp]       ; reload bytes remaining
        movdqa  xmm1,xmm2
        mov     rsi,QWORD[((64+8))+rsp] ; reload src
        paddd   xmm3,xmm4               ; advance block counter for the next block
        mov     rdi,QWORD[((64+16))+rsp] ; reload dst

        ; feed-forward: add the input state to the permuted state
        add     eax,0x61707865
        add     ebx,0x3320646e
        add     ecx,0x79622d32
        add     edx,0x6b206574
        add     r8d,DWORD[16+rsp]
        add     r9d,DWORD[20+rsp]
        add     r10d,DWORD[24+rsp]
        add     r11d,DWORD[28+rsp]
        add     r12d,DWORD[48+rsp]
        add     r13d,DWORD[52+rsp]
        add     r14d,DWORD[56+rsp]
        add     r15d,DWORD[60+rsp]
        paddd   xmm1,XMMWORD[32+rsp]    ; words 8..11: input (xmm2 copy) + permuted

        cmp     rbp,64
        jb      NEAR $L$tail            ; partial final block

        xor     eax,DWORD[rsi]
        xor     ebx,DWORD[4+rsi]
        xor     ecx,DWORD[8+rsi]
        xor     edx,DWORD[12+rsi]
        xor     r8d,DWORD[16+rsi]
        xor     r9d,DWORD[20+rsi]
        xor     r10d,DWORD[24+rsi]
        xor     r11d,DWORD[28+rsi]
        movdqu  xmm0,XMMWORD[32+rsi]
        xor     r12d,DWORD[48+rsi]
        xor     r13d,DWORD[52+rsi]
        xor     r14d,DWORD[56+rsi]
        xor     r15d,DWORD[60+rsi]
        lea     rsi,[64+rsi]
        pxor    xmm0,xmm1

        movdqa  XMMWORD[32+rsp],xmm2    ; restore words 8..11 in the frame
        movd    DWORD[48+rsp],xmm3      ; store the advanced counter

        mov     DWORD[rdi],eax
        mov     DWORD[4+rdi],ebx
        mov     DWORD[8+rdi],ecx
        mov     DWORD[12+rdi],edx
        mov     DWORD[16+rdi],r8d
        mov     DWORD[20+rdi],r9d
        mov     DWORD[24+rdi],r10d
        mov     DWORD[28+rdi],r11d
        movdqu  XMMWORD[32+rdi],xmm0
        mov     DWORD[48+rdi],r12d
        mov     DWORD[52+rdi],r13d
        mov     DWORD[56+rdi],r14d
        mov     DWORD[60+rdi],r15d
        lea     rdi,[64+rdi]

        sub     rbp,64
        jnz     NEAR $L$oop_outer

        jmp     NEAR $L$done

; Fewer than 64 bytes remain: dump the key-stream block to [0..63+rsp]
; and XOR it into the output one byte at a time.
ALIGN   16
$L$tail:
        mov     DWORD[rsp],eax
        mov     DWORD[4+rsp],ebx
        xor     rbx,rbx                 ; rbx = byte index
        mov     DWORD[8+rsp],ecx
        mov     DWORD[12+rsp],edx
        mov     DWORD[16+rsp],r8d
        mov     DWORD[20+rsp],r9d
        mov     DWORD[24+rsp],r10d
        mov     DWORD[28+rsp],r11d
        movdqa  XMMWORD[32+rsp],xmm1
        mov     DWORD[48+rsp],r12d
        mov     DWORD[52+rsp],r13d
        mov     DWORD[56+rsp],r14d
        mov     DWORD[60+rsp],r15d

$L$oop_tail:
        movzx   eax,BYTE[rbx*1+rsi]
        movzx   edx,BYTE[rbx*1+rsp]
        lea     rbx,[1+rbx]
        xor     eax,edx
        mov     BYTE[((-1))+rbx*1+rdi],al
        dec     rbp
        jnz     NEAR $L$oop_tail

$L$done:
        lea     rsi,[((64+24+48))+rsp]  ; rsi = rsp as it was before the pushes
        mov     r15,QWORD[((-48))+rsi]
        mov     r14,QWORD[((-40))+rsi]
        mov     r13,QWORD[((-32))+rsi]
        mov     r12,QWORD[((-24))+rsi]
        mov     rbp,QWORD[((-16))+rsi]
        mov     rbx,QWORD[((-8))+rsi]
        lea     rsp,[rsi]

$L$no_data:
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        ret

$L$SEH_end_ChaCha20_ctr32:

;-----------------------------------------------------------------------
; ChaCha20_ssse3 -- one 64-byte block per iteration in xmm0..xmm3.
;
; Same Win64 argument remapping as ChaCha20_ctr32. Also entered at
; $L$ChaCha20_ssse3 from ChaCha20_ctr32 with rdi/rsi/rdx/rcx/r8 already
; remapped and r10 holding the capability qword.
; More than 128 bytes goes to $L$ChaCha20_4x instead.
; r9 keeps the entry rsp; xmm6/xmm7 are saved below it and restored on
; exit. [0..63+rsp] holds the four input state rows.
;-----------------------------------------------------------------------
ALIGN   32
ChaCha20_ssse3:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_ChaCha20_ssse3:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8
        mov     rcx,r9
        mov     r8,QWORD[40+rsp]


$L$ChaCha20_ssse3:

        mov     r9,rsp

        cmp     rdx,128
        ja      NEAR $L$ChaCha20_4x

$L$do_sse3_after_all:
        sub     rsp,64+40
        movaps  XMMWORD[(-40)+r9],xmm6
        movaps  XMMWORD[(-24)+r9],xmm7
$L$ssse3_body:
        movdqa  xmm0,XMMWORD[$L$sigma]
        movdqu  xmm1,XMMWORD[rcx]
        movdqu  xmm2,XMMWORD[16+rcx]
        movdqu  xmm3,XMMWORD[r8]
        movdqa  xmm6,XMMWORD[$L$rot16]
        movdqa  xmm7,XMMWORD[$L$rot24]

        movdqa  XMMWORD[rsp],xmm0
        movdqa  XMMWORD[16+rsp],xmm1
        movdqa  XMMWORD[32+rsp],xmm2
        movdqa  XMMWORD[48+rsp],xmm3
        mov     r8,10                   ; r8 = double-round counter
        jmp     NEAR $L$oop_ssse3

ALIGN   32
$L$oop_outer_ssse3:
        movdqa  xmm3,XMMWORD[$L$one]
        movdqa  xmm0,XMMWORD[rsp]
        movdqa  xmm1,XMMWORD[16+rsp]
        movdqa  xmm2,XMMWORD[32+rsp]
        paddd   xmm3,XMMWORD[48+rsp]    ; counter + 1
        mov     r8,10
        movdqa  XMMWORD[48+rsp],xmm3
        jmp     NEAR $L$oop_ssse3

ALIGN   32
$L$oop_ssse3:
        paddd   xmm0,xmm1
        pxor    xmm3,xmm0
DB      102,15,56,0,222                 ; pshufb xmm3,xmm6 (rot16 mask)
        paddd   xmm2,xmm3
        pxor    xmm1,xmm2
        movdqa  xmm4,xmm1
        psrld   xmm1,20
        pslld   xmm4,12
        por     xmm1,xmm4
        paddd   xmm0,xmm1
        pxor    xmm3,xmm0
DB      102,15,56,0,223                 ; pshufb xmm3,xmm7 (rot24 mask)
        paddd   xmm2,xmm3
        pxor    xmm1,xmm2
        movdqa  xmm4,xmm1
        psrld   xmm1,25
        pslld   xmm4,7
        por     xmm1,xmm4
        pshufd  xmm2,xmm2,78            ; rotate rows so diagonals become columns
        pshufd  xmm1,xmm1,57
        pshufd  xmm3,xmm3,147
        nop
        paddd   xmm0,xmm1
        pxor    xmm3,xmm0
DB      102,15,56,0,222                 ; pshufb xmm3,xmm6 (rot16 mask)
        paddd   xmm2,xmm3
        pxor    xmm1,xmm2
        movdqa  xmm4,xmm1
        psrld   xmm1,20
        pslld   xmm4,12
        por     xmm1,xmm4
        paddd   xmm0,xmm1
        pxor    xmm3,xmm0
DB      102,15,56,0,223                 ; pshufb xmm3,xmm7 (rot24 mask)
        paddd   xmm2,xmm3
        pxor    xmm1,xmm2
        movdqa  xmm4,xmm1
        psrld   xmm1,25
        pslld   xmm4,7
        por     xmm1,xmm4
        pshufd  xmm2,xmm2,78            ; undo the row rotation
        pshufd  xmm1,xmm1,147
        pshufd  xmm3,xmm3,57
        dec     r8
        jnz     NEAR $L$oop_ssse3
        paddd   xmm0,XMMWORD[rsp]       ; feed-forward
        paddd   xmm1,XMMWORD[16+rsp]
        paddd   xmm2,XMMWORD[32+rsp]
        paddd   xmm3,XMMWORD[48+rsp]

        cmp     rdx,64
        jb      NEAR $L$tail_ssse3

        movdqu  xmm4,XMMWORD[rsi]
        movdqu  xmm5,XMMWORD[16+rsi]
        pxor    xmm0,xmm4
        movdqu  xmm4,XMMWORD[32+rsi]
        pxor    xmm1,xmm5
        movdqu  xmm5,XMMWORD[48+rsi]
        lea     rsi,[64+rsi]
        pxor    xmm2,xmm4
        pxor    xmm3,xmm5

        movdqu  XMMWORD[rdi],xmm0
        movdqu  XMMWORD[16+rdi],xmm1
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  XMMWORD[48+rdi],xmm3
        lea     rdi,[64+rdi]

        sub     rdx,64
        jnz     NEAR $L$oop_outer_ssse3

        jmp     NEAR $L$done_ssse3

; Partial final block: spill the key stream and XOR byte by byte.
ALIGN   16
$L$tail_ssse3:
        movdqa  XMMWORD[rsp],xmm0
        movdqa  XMMWORD[16+rsp],xmm1
        movdqa  XMMWORD[32+rsp],xmm2
        movdqa  XMMWORD[48+rsp],xmm3
        xor     r8,r8                   ; r8 = byte index

$L$oop_tail_ssse3:
        movzx   eax,BYTE[r8*1+rsi]
        movzx   ecx,BYTE[r8*1+rsp]
        lea     r8,[1+r8]
        xor     eax,ecx
        mov     BYTE[((-1))+r8*1+rdi],al
        dec     rdx
        jnz     NEAR $L$oop_tail_ssse3

$L$done_ssse3:
        movaps  xmm6,XMMWORD[((-40))+r9]
        movaps  xmm7,XMMWORD[((-24))+r9]
        lea     rsp,[r9]

$L$ssse3_epilogue:
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        ret

$L$SEH_end_ChaCha20_ssse3:

;-----------------------------------------------------------------------
; ChaCha20_4x -- four 64-byte blocks per iteration, one block per dword
; lane, using xmm0..xmm15.
;
; Same Win64 argument remapping as above. Entered at $L$ChaCha20_4x from
; the SSSE3 path with r10 = capability qword read at OPENSSL_ia32cap_P+4.
; Dispatch on entry:
;   - bit 5 of the upper capability dword set -> $L$ChaCha20_8x
;   - length <= 192 and (caps & 0x4400000) == 0x400000
;                                              -> $L$do_sse3_after_all
;     NOTE(review): presumably a CPU-model heuristic; confirm in the
;     Perl source.
; r9 keeps the entry rsp; xmm6..xmm15 are saved below it.
; Frame: [0..63+rsp] scratch, [64..127+rsp] broadcast sigma words,
;        rcx = rsp+256 is the base for the remaining broadcast state rows
;        at (128-256)+rcx .. (304-256)+rcx.
; r10 -> $L$rot16, r11 -> $L$rot24 inside the round loop.
;-----------------------------------------------------------------------
ALIGN   32
ChaCha20_4x:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_ChaCha20_4x:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8
        mov     rcx,r9
        mov     r8,QWORD[40+rsp]


$L$ChaCha20_4x:

        mov     r9,rsp

        mov     r11,r10
        shr     r10,32
        test    r10,32                  ; bit 5 of the upper capability dword
        jnz     NEAR $L$ChaCha20_8x
        cmp     rdx,192
        ja      NEAR $L$proceed4x

        and     r11,71303168            ; 0x4400000: bits 22 and 26
        cmp     r11,4194304             ; 0x400000: bit 22 set, bit 26 clear
        je      NEAR $L$do_sse3_after_all

$L$proceed4x:
        sub     rsp,0x140+168
        movaps  XMMWORD[(-168)+r9],xmm6
        movaps  XMMWORD[(-152)+r9],xmm7
        movaps  XMMWORD[(-136)+r9],xmm8
        movaps  XMMWORD[(-120)+r9],xmm9
        movaps  XMMWORD[(-104)+r9],xmm10
        movaps  XMMWORD[(-88)+r9],xmm11
        movaps  XMMWORD[(-72)+r9],xmm12
        movaps  XMMWORD[(-56)+r9],xmm13
        movaps  XMMWORD[(-40)+r9],xmm14
        movaps  XMMWORD[(-24)+r9],xmm15
$L$4x_body:
        movdqa  xmm11,XMMWORD[$L$sigma]
        movdqu  xmm15,XMMWORD[rcx]
        movdqu  xmm7,XMMWORD[16+rcx]
        movdqu  xmm3,XMMWORD[r8]
        lea     rcx,[256+rsp]
        lea     r10,[$L$rot16]
        lea     r11,[$L$rot24]

        ; broadcast each state word across all four lanes and save it
        pshufd  xmm8,xmm11,0x00
        pshufd  xmm9,xmm11,0x55
        movdqa  XMMWORD[64+rsp],xmm8
        pshufd  xmm10,xmm11,0xaa
        movdqa  XMMWORD[80+rsp],xmm9
        pshufd  xmm11,xmm11,0xff
        movdqa  XMMWORD[96+rsp],xmm10
        movdqa  XMMWORD[112+rsp],xmm11

        pshufd  xmm12,xmm15,0x00
        pshufd  xmm13,xmm15,0x55
        movdqa  XMMWORD[(128-256)+rcx],xmm12
        pshufd  xmm14,xmm15,0xaa
        movdqa  XMMWORD[(144-256)+rcx],xmm13
        pshufd  xmm15,xmm15,0xff
        movdqa  XMMWORD[(160-256)+rcx],xmm14
        movdqa  XMMWORD[(176-256)+rcx],xmm15

        pshufd  xmm4,xmm7,0x00
        pshufd  xmm5,xmm7,0x55
        movdqa  XMMWORD[(192-256)+rcx],xmm4
        pshufd  xmm6,xmm7,0xaa
        movdqa  XMMWORD[(208-256)+rcx],xmm5
        pshufd  xmm7,xmm7,0xff
        movdqa  XMMWORD[(224-256)+rcx],xmm6
        movdqa  XMMWORD[(240-256)+rcx],xmm7

        pshufd  xmm0,xmm3,0x00
        pshufd  xmm1,xmm3,0x55
        paddd   xmm0,XMMWORD[$L$inc]    ; lane i gets counter + i
        pshufd  xmm2,xmm3,0xaa
        movdqa  XMMWORD[(272-256)+rcx],xmm1
        pshufd  xmm3,xmm3,0xff
        movdqa  XMMWORD[(288-256)+rcx],xmm2
        movdqa  XMMWORD[(304-256)+rcx],xmm3

        jmp     NEAR $L$oop_enter4x

ALIGN   32
$L$oop_outer4x:
        movdqa  xmm8,XMMWORD[64+rsp]
        movdqa  xmm9,XMMWORD[80+rsp]
        movdqa  xmm10,XMMWORD[96+rsp]
        movdqa  xmm11,XMMWORD[112+rsp]
        movdqa  xmm12,XMMWORD[((128-256))+rcx]
        movdqa  xmm13,XMMWORD[((144-256))+rcx]
        movdqa  xmm14,XMMWORD[((160-256))+rcx]
        movdqa  xmm15,XMMWORD[((176-256))+rcx]
        movdqa  xmm4,XMMWORD[((192-256))+rcx]
        movdqa  xmm5,XMMWORD[((208-256))+rcx]
        movdqa  xmm6,XMMWORD[((224-256))+rcx]
        movdqa  xmm7,XMMWORD[((240-256))+rcx]
        movdqa  xmm0,XMMWORD[((256-256))+rcx]
        movdqa  xmm1,XMMWORD[((272-256))+rcx]
        movdqa  xmm2,XMMWORD[((288-256))+rcx]
        movdqa  xmm3,XMMWORD[((304-256))+rcx]
        paddd   xmm0,XMMWORD[$L$four]   ; advance all four lane counters by 4

$L$oop_enter4x:
        movdqa  XMMWORD[32+rsp],xmm6    ; park two rows; xmm6/xmm7 become temporaries
        movdqa  XMMWORD[48+rsp],xmm7
        movdqa  xmm7,XMMWORD[r10]       ; rot16 mask
        mov     eax,10                  ; eax = double-round counter
        movdqa  XMMWORD[(256-256)+rcx],xmm0 ; save the lane counters
        jmp     NEAR $L$oop4x

ALIGN   32
$L$oop4x:
        paddd   xmm8,xmm12
        paddd   xmm9,xmm13
        pxor    xmm0,xmm8
        pxor    xmm1,xmm9
DB      102,15,56,0,199                 ; pshufb xmm0,xmm7
DB      102,15,56,0,207                 ; pshufb xmm1,xmm7
        paddd   xmm4,xmm0
        paddd   xmm5,xmm1
        pxor    xmm12,xmm4
        pxor    xmm13,xmm5
        movdqa  xmm6,xmm12
        pslld   xmm12,12
        psrld   xmm6,20
        movdqa  xmm7,xmm13
        pslld   xmm13,12
        por     xmm12,xmm6
        psrld   xmm7,20
        movdqa  xmm6,XMMWORD[r11]       ; rot24 mask
        por     xmm13,xmm7
        paddd   xmm8,xmm12
        paddd   xmm9,xmm13
        pxor    xmm0,xmm8
        pxor    xmm1,xmm9
DB      102,15,56,0,198                 ; pshufb xmm0,xmm6
DB      102,15,56,0,206                 ; pshufb xmm1,xmm6
        paddd   xmm4,xmm0
        paddd   xmm5,xmm1
        pxor    xmm12,xmm4
        pxor    xmm13,xmm5
        movdqa  xmm7,xmm12
        pslld   xmm12,7
        psrld   xmm7,25
        movdqa  xmm6,xmm13
        pslld   xmm13,7
        por     xmm12,xmm7
        psrld   xmm6,25
        movdqa  xmm7,XMMWORD[r10]       ; rot16 mask
        por     xmm13,xmm6
        movdqa  XMMWORD[rsp],xmm4       ; swap the parked row pair
        movdqa  XMMWORD[16+rsp],xmm5
        movdqa  xmm4,XMMWORD[32+rsp]
        movdqa  xmm5,XMMWORD[48+rsp]
        paddd   xmm10,xmm14
        paddd   xmm11,xmm15
        pxor    xmm2,xmm10
        pxor    xmm3,xmm11
DB      102,15,56,0,215                 ; pshufb xmm2,xmm7
DB      102,15,56,0,223                 ; pshufb xmm3,xmm7
        paddd   xmm4,xmm2
        paddd   xmm5,xmm3
        pxor    xmm14,xmm4
        pxor    xmm15,xmm5
        movdqa  xmm6,xmm14
        pslld   xmm14,12
        psrld   xmm6,20
        movdqa  xmm7,xmm15
        pslld   xmm15,12
        por     xmm14,xmm6
        psrld   xmm7,20
        movdqa  xmm6,XMMWORD[r11]       ; rot24 mask
        por     xmm15,xmm7
        paddd   xmm10,xmm14
        paddd   xmm11,xmm15
        pxor    xmm2,xmm10
        pxor    xmm3,xmm11
DB      102,15,56,0,214                 ; pshufb xmm2,xmm6
DB      102,15,56,0,222                 ; pshufb xmm3,xmm6
        paddd   xmm4,xmm2
        paddd   xmm5,xmm3
        pxor    xmm14,xmm4
        pxor    xmm15,xmm5
        movdqa  xmm7,xmm14
        pslld   xmm14,7
        psrld   xmm7,25
        movdqa  xmm6,xmm15
        pslld   xmm15,7
        por     xmm14,xmm7
        psrld   xmm6,25
        movdqa  xmm7,XMMWORD[r10]       ; rot16 mask
        por     xmm15,xmm6
        ; diagonal round
        paddd   xmm8,xmm13
        paddd   xmm9,xmm14
        pxor    xmm3,xmm8
        pxor    xmm0,xmm9
DB      102,15,56,0,223                 ; pshufb xmm3,xmm7
DB      102,15,56,0,199                 ; pshufb xmm0,xmm7
        paddd   xmm4,xmm3
        paddd   xmm5,xmm0
        pxor    xmm13,xmm4
        pxor    xmm14,xmm5
        movdqa  xmm6,xmm13
        pslld   xmm13,12
        psrld   xmm6,20
        movdqa  xmm7,xmm14
        pslld   xmm14,12
        por     xmm13,xmm6
        psrld   xmm7,20
        movdqa  xmm6,XMMWORD[r11]       ; rot24 mask
        por     xmm14,xmm7
        paddd   xmm8,xmm13
        paddd   xmm9,xmm14
        pxor    xmm3,xmm8
        pxor    xmm0,xmm9
DB      102,15,56,0,222                 ; pshufb xmm3,xmm6
DB      102,15,56,0,198                 ; pshufb xmm0,xmm6
        paddd   xmm4,xmm3
        paddd   xmm5,xmm0
        pxor    xmm13,xmm4
        pxor    xmm14,xmm5
        movdqa  xmm7,xmm13
        pslld   xmm13,7
        psrld   xmm7,25
        movdqa  xmm6,xmm14
        pslld   xmm14,7
        por     xmm13,xmm7
        psrld   xmm6,25
        movdqa  xmm7,XMMWORD[r10]       ; rot16 mask
        por     xmm14,xmm6
        movdqa  XMMWORD[32+rsp],xmm4    ; swap the parked row pair back
        movdqa  XMMWORD[48+rsp],xmm5
        movdqa  xmm4,XMMWORD[rsp]
        movdqa  xmm5,XMMWORD[16+rsp]
        paddd   xmm10,xmm15
        paddd   xmm11,xmm12
        pxor    xmm1,xmm10
        pxor    xmm2,xmm11
DB      102,15,56,0,207                 ; pshufb xmm1,xmm7
DB      102,15,56,0,215                 ; pshufb xmm2,xmm7
        paddd   xmm4,xmm1
        paddd   xmm5,xmm2
        pxor    xmm15,xmm4
        pxor    xmm12,xmm5
        movdqa  xmm6,xmm15
        pslld   xmm15,12
        psrld   xmm6,20
        movdqa  xmm7,xmm12
        pslld   xmm12,12
        por     xmm15,xmm6
        psrld   xmm7,20
        movdqa  xmm6,XMMWORD[r11]       ; rot24 mask
        por     xmm12,xmm7
        paddd   xmm10,xmm15
        paddd   xmm11,xmm12
        pxor    xmm1,xmm10
        pxor    xmm2,xmm11
DB      102,15,56,0,206                 ; pshufb xmm1,xmm6
DB      102,15,56,0,214                 ; pshufb xmm2,xmm6
        paddd   xmm4,xmm1
        paddd   xmm5,xmm2
        pxor    xmm15,xmm4
        pxor    xmm12,xmm5
        movdqa  xmm7,xmm15
        pslld   xmm15,7
        psrld   xmm7,25
        movdqa  xmm6,xmm12
        pslld   xmm12,7
        por     xmm15,xmm7
        psrld   xmm6,25
        movdqa  xmm7,XMMWORD[r10]       ; rot16 mask
        por     xmm12,xmm6
        dec     eax
        jnz     NEAR $L$oop4x

        ; Feed-forward each group of four rows, then transpose 4x4 dwords
        ; so that every register holds 16 consecutive key-stream bytes.
        paddd   xmm8,XMMWORD[64+rsp]
        paddd   xmm9,XMMWORD[80+rsp]
        paddd   xmm10,XMMWORD[96+rsp]
        paddd   xmm11,XMMWORD[112+rsp]

        movdqa  xmm6,xmm8
        punpckldq       xmm8,xmm9
        movdqa  xmm7,xmm10
        punpckldq       xmm10,xmm11
        punpckhdq       xmm6,xmm9
        punpckhdq       xmm7,xmm11
        movdqa  xmm9,xmm8
        punpcklqdq      xmm8,xmm10
        movdqa  xmm11,xmm6
        punpcklqdq      xmm6,xmm7
        punpckhqdq      xmm9,xmm10
        punpckhqdq      xmm11,xmm7
        paddd   xmm12,XMMWORD[((128-256))+rcx]
        paddd   xmm13,XMMWORD[((144-256))+rcx]
        paddd   xmm14,XMMWORD[((160-256))+rcx]
        paddd   xmm15,XMMWORD[((176-256))+rcx]

        movdqa  XMMWORD[rsp],xmm8
        movdqa  XMMWORD[16+rsp],xmm9
        movdqa  xmm8,XMMWORD[32+rsp]
        movdqa  xmm9,XMMWORD[48+rsp]

        movdqa  xmm10,xmm12
        punpckldq       xmm12,xmm13
        movdqa  xmm7,xmm14
        punpckldq       xmm14,xmm15
        punpckhdq       xmm10,xmm13
        punpckhdq       xmm7,xmm15
        movdqa  xmm13,xmm12
        punpcklqdq      xmm12,xmm14
        movdqa  xmm15,xmm10
        punpcklqdq      xmm10,xmm7
        punpckhqdq      xmm13,xmm14
        punpckhqdq      xmm15,xmm7
        paddd   xmm4,XMMWORD[((192-256))+rcx]
        paddd   xmm5,XMMWORD[((208-256))+rcx]
        paddd   xmm8,XMMWORD[((224-256))+rcx]
        paddd   xmm9,XMMWORD[((240-256))+rcx]

        movdqa  XMMWORD[32+rsp],xmm6
        movdqa  XMMWORD[48+rsp],xmm11

        movdqa  xmm14,xmm4
        punpckldq       xmm4,xmm5
        movdqa  xmm7,xmm8
        punpckldq       xmm8,xmm9
        punpckhdq       xmm14,xmm5
        punpckhdq       xmm7,xmm9
        movdqa  xmm5,xmm4
        punpcklqdq      xmm4,xmm8
        movdqa  xmm9,xmm14
        punpcklqdq      xmm14,xmm7
        punpckhqdq      xmm5,xmm8
        punpckhqdq      xmm9,xmm7
        paddd   xmm0,XMMWORD[((256-256))+rcx]
        paddd   xmm1,XMMWORD[((272-256))+rcx]
        paddd   xmm2,XMMWORD[((288-256))+rcx]
        paddd   xmm3,XMMWORD[((304-256))+rcx]

        movdqa  xmm8,xmm0
        punpckldq       xmm0,xmm1
        movdqa  xmm7,xmm2
        punpckldq       xmm2,xmm3
        punpckhdq       xmm8,xmm1
        punpckhdq       xmm7,xmm3
        movdqa  xmm1,xmm0
        punpcklqdq      xmm0,xmm2
        movdqa  xmm3,xmm8
        punpcklqdq      xmm8,xmm7
        punpckhqdq      xmm1,xmm2
        punpckhqdq      xmm3,xmm7
        cmp     rdx,64*4
        jb      NEAR $L$tail4x

        ; full 256 bytes: XOR source with key stream, store to destination
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[rsp]
        pxor    xmm11,xmm12
        pxor    xmm2,xmm4
        pxor    xmm7,xmm0

        movdqu  XMMWORD[rdi],xmm6
        movdqu  xmm6,XMMWORD[64+rsi]
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  xmm11,XMMWORD[80+rsi]
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  xmm2,XMMWORD[96+rsi]
        movdqu  XMMWORD[48+rdi],xmm7
        movdqu  xmm7,XMMWORD[112+rsi]
        lea     rsi,[128+rsi]
        pxor    xmm6,XMMWORD[16+rsp]
        pxor    xmm11,xmm13
        pxor    xmm2,xmm5
        pxor    xmm7,xmm1

        movdqu  XMMWORD[64+rdi],xmm6
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  XMMWORD[80+rdi],xmm11
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  XMMWORD[96+rdi],xmm2
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  XMMWORD[112+rdi],xmm7
        lea     rdi,[128+rdi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[32+rsp]
        pxor    xmm11,xmm10
        pxor    xmm2,xmm14
        pxor    xmm7,xmm8

        movdqu  XMMWORD[rdi],xmm6
        movdqu  xmm6,XMMWORD[64+rsi]
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  xmm11,XMMWORD[80+rsi]
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  xmm2,XMMWORD[96+rsi]
        movdqu  XMMWORD[48+rdi],xmm7
        movdqu  xmm7,XMMWORD[112+rsi]
        lea     rsi,[128+rsi]
        pxor    xmm6,XMMWORD[48+rsp]
        pxor    xmm11,xmm15
        pxor    xmm2,xmm9
        pxor    xmm7,xmm3
        movdqu  XMMWORD[64+rdi],xmm6
        movdqu  XMMWORD[80+rdi],xmm11
        movdqu  XMMWORD[96+rdi],xmm2
        movdqu  XMMWORD[112+rdi],xmm7
        lea     rdi,[128+rdi]

        sub     rdx,64*4
        jnz     NEAR $L$oop_outer4x

        jmp     NEAR $L$done4x

; Fewer than 256 bytes remain. Whole 64-byte blocks are handled by the
; N_or_more4x branches; the leftover key-stream block is placed at
; [0..63+rsp] for the byte loop.
$L$tail4x:
        cmp     rdx,192
        jae     NEAR $L$192_or_more4x
        cmp     rdx,128
        jae     NEAR $L$128_or_more4x
        cmp     rdx,64
        jae     NEAR $L$64_or_more4x


        xor     r10,r10                 ; r10 = byte index

        movdqa  XMMWORD[16+rsp],xmm12
        movdqa  XMMWORD[32+rsp],xmm4
        movdqa  XMMWORD[48+rsp],xmm0
        jmp     NEAR $L$oop_tail4x

ALIGN   32
$L$64_or_more4x:
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[rsp]
        pxor    xmm11,xmm12
        pxor    xmm2,xmm4
        pxor    xmm7,xmm0
        movdqu  XMMWORD[rdi],xmm6
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  XMMWORD[48+rdi],xmm7
        je      NEAR $L$done4x          ; ZF still from cmp rdx,64: exactly 64 bytes

        movdqa  xmm6,XMMWORD[16+rsp]
        lea     rsi,[64+rsi]
        xor     r10,r10
        movdqa  XMMWORD[rsp],xmm6
        movdqa  XMMWORD[16+rsp],xmm13
        lea     rdi,[64+rdi]
        movdqa  XMMWORD[32+rsp],xmm5
        sub     rdx,64
        movdqa  XMMWORD[48+rsp],xmm1
        jmp     NEAR $L$oop_tail4x

ALIGN   32
$L$128_or_more4x:
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[rsp]
        pxor    xmm11,xmm12
        pxor    xmm2,xmm4
        pxor    xmm7,xmm0

        movdqu  XMMWORD[rdi],xmm6
        movdqu  xmm6,XMMWORD[64+rsi]
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  xmm11,XMMWORD[80+rsi]
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  xmm2,XMMWORD[96+rsi]
        movdqu  XMMWORD[48+rdi],xmm7
        movdqu  xmm7,XMMWORD[112+rsi]
        pxor    xmm6,XMMWORD[16+rsp]
        pxor    xmm11,xmm13
        pxor    xmm2,xmm5
        pxor    xmm7,xmm1
        movdqu  XMMWORD[64+rdi],xmm6
        movdqu  XMMWORD[80+rdi],xmm11
        movdqu  XMMWORD[96+rdi],xmm2
        movdqu  XMMWORD[112+rdi],xmm7
        je      NEAR $L$done4x          ; ZF still from cmp rdx,128: exactly 128 bytes

        movdqa  xmm6,XMMWORD[32+rsp]
        lea     rsi,[128+rsi]
        xor     r10,r10
        movdqa  XMMWORD[rsp],xmm6
        movdqa  XMMWORD[16+rsp],xmm10
        lea     rdi,[128+rdi]
        movdqa  XMMWORD[32+rsp],xmm14
        sub     rdx,128
        movdqa  XMMWORD[48+rsp],xmm8
        jmp     NEAR $L$oop_tail4x

ALIGN   32
$L$192_or_more4x:
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[rsp]
        pxor    xmm11,xmm12
        pxor    xmm2,xmm4
        pxor    xmm7,xmm0

        movdqu  XMMWORD[rdi],xmm6
        movdqu  xmm6,XMMWORD[64+rsi]
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  xmm11,XMMWORD[80+rsi]
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  xmm2,XMMWORD[96+rsi]
        movdqu  XMMWORD[48+rdi],xmm7
        movdqu  xmm7,XMMWORD[112+rsi]
        lea     rsi,[128+rsi]
        pxor    xmm6,XMMWORD[16+rsp]
        pxor    xmm11,xmm13
        pxor    xmm2,xmm5
        pxor    xmm7,xmm1

        movdqu  XMMWORD[64+rdi],xmm6
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  XMMWORD[80+rdi],xmm11
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  XMMWORD[96+rdi],xmm2
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  XMMWORD[112+rdi],xmm7
        lea     rdi,[128+rdi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[32+rsp]
        pxor    xmm11,xmm10
        pxor    xmm2,xmm14
        pxor    xmm7,xmm8
        movdqu  XMMWORD[rdi],xmm6
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  XMMWORD[48+rdi],xmm7
        je      NEAR $L$done4x          ; ZF still from cmp rdx,192: exactly 192 bytes

        movdqa  xmm6,XMMWORD[48+rsp]
        lea     rsi,[64+rsi]
        xor     r10,r10
        movdqa  XMMWORD[rsp],xmm6
        movdqa  XMMWORD[16+rsp],xmm15
        lea     rdi,[64+rdi]
        movdqa  XMMWORD[32+rsp],xmm9
        sub     rdx,192
        movdqa  XMMWORD[48+rsp],xmm3

$L$oop_tail4x:
        movzx   eax,BYTE[r10*1+rsi]
        movzx   ecx,BYTE[r10*1+rsp]
        lea     r10,[1+r10]
        xor     eax,ecx
        mov     BYTE[((-1))+r10*1+rdi],al
        dec     rdx
        jnz     NEAR $L$oop_tail4x

$L$done4x:
        movaps  xmm6,XMMWORD[((-168))+r9]
        movaps  xmm7,XMMWORD[((-152))+r9]
        movaps  xmm8,XMMWORD[((-136))+r9]
        movaps  xmm9,XMMWORD[((-120))+r9]
        movaps  xmm10,XMMWORD[((-104))+r9]
        movaps  xmm11,XMMWORD[((-88))+r9]
        movaps  xmm12,XMMWORD[((-72))+r9]
        movaps  xmm13,XMMWORD[((-56))+r9]
        movaps  xmm14,XMMWORD[((-40))+r9]
        movaps  xmm15,XMMWORD[((-24))+r9]
        lea     rsp,[r9]

$L$4x_epilogue:
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        ret

$L$SEH_end_ChaCha20_4x:

;-----------------------------------------------------------------------
; ChaCha20_8x -- eight-lane path using ymm registers.
; Same Win64 argument remapping as above; entered at $L$ChaCha20_8x from
; the 4x dispatcher. r9 keeps the entry rsp, xmm6..xmm15 are saved below
; it, and rsp is rounded down to a 32-byte boundary for vmovdqa.
; rcx = rsp+256 and rax = rsp+512 are bases for the broadcast state rows.
;-----------------------------------------------------------------------
ALIGN   32
ChaCha20_8x:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_ChaCha20_8x:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8
        mov     rcx,r9
        mov     r8,QWORD[40+rsp]


$L$ChaCha20_8x:

        mov     r9,rsp

        sub     rsp,0x280+168
        and     rsp,-32
        movaps  XMMWORD[(-168)+r9],xmm6
        movaps  XMMWORD[(-152)+r9],xmm7
        movaps  XMMWORD[(-136)+r9],xmm8
        movaps  XMMWORD[(-120)+r9],xmm9
        movaps  XMMWORD[(-104)+r9],xmm10
        movaps  XMMWORD[(-88)+r9],xmm11
        movaps  XMMWORD[(-72)+r9],xmm12
        movaps  XMMWORD[(-56)+r9],xmm13
        movaps  XMMWORD[(-40)+r9],xmm14
        movaps  XMMWORD[(-24)+r9],xmm15
$L$8x_body:
        vzeroupper

        vbroadcasti128  ymm11,XMMWORD[$L$sigma]
        vbroadcasti128  ymm3,XMMWORD[rcx]
        vbroadcasti128  ymm15,XMMWORD[16+rcx]
        vbroadcasti128  ymm7,XMMWORD[r8]
        lea     rcx,[256+rsp]
        lea     rax,[512+rsp]
        lea     r10,[$L$rot16]
        lea     r11,[$L$rot24]

        vpshufd ymm8,ymm11,0x00
        vpshufd ymm9,ymm11,0x55
        vmovdqa YMMWORD[(128-256)+rcx],ymm8
        vpshufd ymm10,ymm11,0xaa
        vmovdqa YMMWORD[(160-256)+rcx],ymm9
        vpshufd ymm11,ymm11,0xff
        vmovdqa YMMWORD[(192-256)+rcx],ymm10
        vmovdqa YMMWORD[(224-256)+rcx],ymm11

        vpshufd ymm0,ymm3,0x00
        vpshufd ymm1,ymm3,0x55
        vmovdqa YMMWORD[(256-256)+rcx],ymm0
        vpshufd ymm2,ymm3,0xaa
        vmovdqa YMMWORD[(288-256)+rcx],ymm1
        vpshufd ymm3,ymm3,0xff
        vmovdqa YMMWORD[(320-256)+rcx],ymm2
; NOTE(review): the operands of the vmovdqa below are on the next
; physical line of this copy of the file, which is still unrestored.
        vmovdqa
YMMWORD[(352-256)+rcx],ymm3 1158 1159 vpshufd ymm12,ymm15,0x00 1160 vpshufd ymm13,ymm15,0x55 1161 vmovdqa YMMWORD[(384-512)+rax],ymm12 1162 vpshufd ymm14,ymm15,0xaa 1163 vmovdqa YMMWORD[(416-512)+rax],ymm13 1164 vpshufd ymm15,ymm15,0xff 1165 vmovdqa YMMWORD[(448-512)+rax],ymm14 1166 vmovdqa YMMWORD[(480-512)+rax],ymm15 1167 1168 vpshufd ymm4,ymm7,0x00 1169 vpshufd ymm5,ymm7,0x55 1170 vpaddd ymm4,ymm4,YMMWORD[$L$incy] 1171 vpshufd ymm6,ymm7,0xaa 1172 vmovdqa YMMWORD[(544-512)+rax],ymm5 1173 vpshufd ymm7,ymm7,0xff 1174 vmovdqa YMMWORD[(576-512)+rax],ymm6 1175 vmovdqa YMMWORD[(608-512)+rax],ymm7 1176 1177 jmp NEAR $L$oop_enter8x 1178 1179ALIGN 32 1180$L$oop_outer8x: 1181 vmovdqa ymm8,YMMWORD[((128-256))+rcx] 1182 vmovdqa ymm9,YMMWORD[((160-256))+rcx] 1183 vmovdqa ymm10,YMMWORD[((192-256))+rcx] 1184 vmovdqa ymm11,YMMWORD[((224-256))+rcx] 1185 vmovdqa ymm0,YMMWORD[((256-256))+rcx] 1186 vmovdqa ymm1,YMMWORD[((288-256))+rcx] 1187 vmovdqa ymm2,YMMWORD[((320-256))+rcx] 1188 vmovdqa ymm3,YMMWORD[((352-256))+rcx] 1189 vmovdqa ymm12,YMMWORD[((384-512))+rax] 1190 vmovdqa ymm13,YMMWORD[((416-512))+rax] 1191 vmovdqa ymm14,YMMWORD[((448-512))+rax] 1192 vmovdqa ymm15,YMMWORD[((480-512))+rax] 1193 vmovdqa ymm4,YMMWORD[((512-512))+rax] 1194 vmovdqa ymm5,YMMWORD[((544-512))+rax] 1195 vmovdqa ymm6,YMMWORD[((576-512))+rax] 1196 vmovdqa ymm7,YMMWORD[((608-512))+rax] 1197 vpaddd ymm4,ymm4,YMMWORD[$L$eight] 1198 1199$L$oop_enter8x: 1200 vmovdqa YMMWORD[64+rsp],ymm14 1201 vmovdqa YMMWORD[96+rsp],ymm15 1202 vbroadcasti128 ymm15,XMMWORD[r10] 1203 vmovdqa YMMWORD[(512-512)+rax],ymm4 1204 mov eax,10 1205 jmp NEAR $L$oop8x 1206 1207ALIGN 32 1208$L$oop8x: 1209 vpaddd ymm8,ymm8,ymm0 1210 vpxor ymm4,ymm8,ymm4 1211 vpshufb ymm4,ymm4,ymm15 1212 vpaddd ymm9,ymm9,ymm1 1213 vpxor ymm5,ymm9,ymm5 1214 vpshufb ymm5,ymm5,ymm15 1215 vpaddd ymm12,ymm12,ymm4 1216 vpxor ymm0,ymm12,ymm0 1217 vpslld ymm14,ymm0,12 1218 vpsrld ymm0,ymm0,20 1219 vpor ymm0,ymm14,ymm0 1220 vbroadcasti128 ymm14,XMMWORD[r11] 1221 vpaddd 
ymm13,ymm13,ymm5 1222 vpxor ymm1,ymm13,ymm1 1223 vpslld ymm15,ymm1,12 1224 vpsrld ymm1,ymm1,20 1225 vpor ymm1,ymm15,ymm1 1226 vpaddd ymm8,ymm8,ymm0 1227 vpxor ymm4,ymm8,ymm4 1228 vpshufb ymm4,ymm4,ymm14 1229 vpaddd ymm9,ymm9,ymm1 1230 vpxor ymm5,ymm9,ymm5 1231 vpshufb ymm5,ymm5,ymm14 1232 vpaddd ymm12,ymm12,ymm4 1233 vpxor ymm0,ymm12,ymm0 1234 vpslld ymm15,ymm0,7 1235 vpsrld ymm0,ymm0,25 1236 vpor ymm0,ymm15,ymm0 1237 vbroadcasti128 ymm15,XMMWORD[r10] 1238 vpaddd ymm13,ymm13,ymm5 1239 vpxor ymm1,ymm13,ymm1 1240 vpslld ymm14,ymm1,7 1241 vpsrld ymm1,ymm1,25 1242 vpor ymm1,ymm14,ymm1 1243 vmovdqa YMMWORD[rsp],ymm12 1244 vmovdqa YMMWORD[32+rsp],ymm13 1245 vmovdqa ymm12,YMMWORD[64+rsp] 1246 vmovdqa ymm13,YMMWORD[96+rsp] 1247 vpaddd ymm10,ymm10,ymm2 1248 vpxor ymm6,ymm10,ymm6 1249 vpshufb ymm6,ymm6,ymm15 1250 vpaddd ymm11,ymm11,ymm3 1251 vpxor ymm7,ymm11,ymm7 1252 vpshufb ymm7,ymm7,ymm15 1253 vpaddd ymm12,ymm12,ymm6 1254 vpxor ymm2,ymm12,ymm2 1255 vpslld ymm14,ymm2,12 1256 vpsrld ymm2,ymm2,20 1257 vpor ymm2,ymm14,ymm2 1258 vbroadcasti128 ymm14,XMMWORD[r11] 1259 vpaddd ymm13,ymm13,ymm7 1260 vpxor ymm3,ymm13,ymm3 1261 vpslld ymm15,ymm3,12 1262 vpsrld ymm3,ymm3,20 1263 vpor ymm3,ymm15,ymm3 1264 vpaddd ymm10,ymm10,ymm2 1265 vpxor ymm6,ymm10,ymm6 1266 vpshufb ymm6,ymm6,ymm14 1267 vpaddd ymm11,ymm11,ymm3 1268 vpxor ymm7,ymm11,ymm7 1269 vpshufb ymm7,ymm7,ymm14 1270 vpaddd ymm12,ymm12,ymm6 1271 vpxor ymm2,ymm12,ymm2 1272 vpslld ymm15,ymm2,7 1273 vpsrld ymm2,ymm2,25 1274 vpor ymm2,ymm15,ymm2 1275 vbroadcasti128 ymm15,XMMWORD[r10] 1276 vpaddd ymm13,ymm13,ymm7 1277 vpxor ymm3,ymm13,ymm3 1278 vpslld ymm14,ymm3,7 1279 vpsrld ymm3,ymm3,25 1280 vpor ymm3,ymm14,ymm3 1281 vpaddd ymm8,ymm8,ymm1 1282 vpxor ymm7,ymm8,ymm7 1283 vpshufb ymm7,ymm7,ymm15 1284 vpaddd ymm9,ymm9,ymm2 1285 vpxor ymm4,ymm9,ymm4 1286 vpshufb ymm4,ymm4,ymm15 1287 vpaddd ymm12,ymm12,ymm7 1288 vpxor ymm1,ymm12,ymm1 1289 vpslld ymm14,ymm1,12 1290 vpsrld ymm1,ymm1,20 1291 vpor ymm1,ymm14,ymm1 1292 vbroadcasti128 
ymm14,XMMWORD[r11] 1293 vpaddd ymm13,ymm13,ymm4 1294 vpxor ymm2,ymm13,ymm2 1295 vpslld ymm15,ymm2,12 1296 vpsrld ymm2,ymm2,20 1297 vpor ymm2,ymm15,ymm2 1298 vpaddd ymm8,ymm8,ymm1 1299 vpxor ymm7,ymm8,ymm7 1300 vpshufb ymm7,ymm7,ymm14 1301 vpaddd ymm9,ymm9,ymm2 1302 vpxor ymm4,ymm9,ymm4 1303 vpshufb ymm4,ymm4,ymm14 1304 vpaddd ymm12,ymm12,ymm7 1305 vpxor ymm1,ymm12,ymm1 1306 vpslld ymm15,ymm1,7 1307 vpsrld ymm1,ymm1,25 1308 vpor ymm1,ymm15,ymm1 1309 vbroadcasti128 ymm15,XMMWORD[r10] 1310 vpaddd ymm13,ymm13,ymm4 1311 vpxor ymm2,ymm13,ymm2 1312 vpslld ymm14,ymm2,7 1313 vpsrld ymm2,ymm2,25 1314 vpor ymm2,ymm14,ymm2 1315 vmovdqa YMMWORD[64+rsp],ymm12 1316 vmovdqa YMMWORD[96+rsp],ymm13 1317 vmovdqa ymm12,YMMWORD[rsp] 1318 vmovdqa ymm13,YMMWORD[32+rsp] 1319 vpaddd ymm10,ymm10,ymm3 1320 vpxor ymm5,ymm10,ymm5 1321 vpshufb ymm5,ymm5,ymm15 1322 vpaddd ymm11,ymm11,ymm0 1323 vpxor ymm6,ymm11,ymm6 1324 vpshufb ymm6,ymm6,ymm15 1325 vpaddd ymm12,ymm12,ymm5 1326 vpxor ymm3,ymm12,ymm3 1327 vpslld ymm14,ymm3,12 1328 vpsrld ymm3,ymm3,20 1329 vpor ymm3,ymm14,ymm3 1330 vbroadcasti128 ymm14,XMMWORD[r11] 1331 vpaddd ymm13,ymm13,ymm6 1332 vpxor ymm0,ymm13,ymm0 1333 vpslld ymm15,ymm0,12 1334 vpsrld ymm0,ymm0,20 1335 vpor ymm0,ymm15,ymm0 1336 vpaddd ymm10,ymm10,ymm3 1337 vpxor ymm5,ymm10,ymm5 1338 vpshufb ymm5,ymm5,ymm14 1339 vpaddd ymm11,ymm11,ymm0 1340 vpxor ymm6,ymm11,ymm6 1341 vpshufb ymm6,ymm6,ymm14 1342 vpaddd ymm12,ymm12,ymm5 1343 vpxor ymm3,ymm12,ymm3 1344 vpslld ymm15,ymm3,7 1345 vpsrld ymm3,ymm3,25 1346 vpor ymm3,ymm15,ymm3 1347 vbroadcasti128 ymm15,XMMWORD[r10] 1348 vpaddd ymm13,ymm13,ymm6 1349 vpxor ymm0,ymm13,ymm0 1350 vpslld ymm14,ymm0,7 1351 vpsrld ymm0,ymm0,25 1352 vpor ymm0,ymm14,ymm0 1353 dec eax 1354 jnz NEAR $L$oop8x 1355 1356 lea rax,[512+rsp] 1357 vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx] 1358 vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx] 1359 vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx] 1360 vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx] 1361 1362 vpunpckldq ymm14,ymm8,ymm9 
1363 vpunpckldq ymm15,ymm10,ymm11 1364 vpunpckhdq ymm8,ymm8,ymm9 1365 vpunpckhdq ymm10,ymm10,ymm11 1366 vpunpcklqdq ymm9,ymm14,ymm15 1367 vpunpckhqdq ymm14,ymm14,ymm15 1368 vpunpcklqdq ymm11,ymm8,ymm10 1369 vpunpckhqdq ymm8,ymm8,ymm10 1370 vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx] 1371 vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx] 1372 vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx] 1373 vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx] 1374 1375 vpunpckldq ymm10,ymm0,ymm1 1376 vpunpckldq ymm15,ymm2,ymm3 1377 vpunpckhdq ymm0,ymm0,ymm1 1378 vpunpckhdq ymm2,ymm2,ymm3 1379 vpunpcklqdq ymm1,ymm10,ymm15 1380 vpunpckhqdq ymm10,ymm10,ymm15 1381 vpunpcklqdq ymm3,ymm0,ymm2 1382 vpunpckhqdq ymm0,ymm0,ymm2 1383 vperm2i128 ymm15,ymm9,ymm1,0x20 1384 vperm2i128 ymm1,ymm9,ymm1,0x31 1385 vperm2i128 ymm9,ymm14,ymm10,0x20 1386 vperm2i128 ymm10,ymm14,ymm10,0x31 1387 vperm2i128 ymm14,ymm11,ymm3,0x20 1388 vperm2i128 ymm3,ymm11,ymm3,0x31 1389 vperm2i128 ymm11,ymm8,ymm0,0x20 1390 vperm2i128 ymm0,ymm8,ymm0,0x31 1391 vmovdqa YMMWORD[rsp],ymm15 1392 vmovdqa YMMWORD[32+rsp],ymm9 1393 vmovdqa ymm15,YMMWORD[64+rsp] 1394 vmovdqa ymm9,YMMWORD[96+rsp] 1395 1396 vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax] 1397 vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax] 1398 vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax] 1399 vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax] 1400 1401 vpunpckldq ymm2,ymm12,ymm13 1402 vpunpckldq ymm8,ymm15,ymm9 1403 vpunpckhdq ymm12,ymm12,ymm13 1404 vpunpckhdq ymm15,ymm15,ymm9 1405 vpunpcklqdq ymm13,ymm2,ymm8 1406 vpunpckhqdq ymm2,ymm2,ymm8 1407 vpunpcklqdq ymm9,ymm12,ymm15 1408 vpunpckhqdq ymm12,ymm12,ymm15 1409 vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax] 1410 vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax] 1411 vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax] 1412 vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax] 1413 1414 vpunpckldq ymm15,ymm4,ymm5 1415 vpunpckldq ymm8,ymm6,ymm7 1416 vpunpckhdq ymm4,ymm4,ymm5 1417 vpunpckhdq ymm6,ymm6,ymm7 1418 vpunpcklqdq ymm5,ymm15,ymm8 1419 vpunpckhqdq ymm15,ymm15,ymm8 1420 vpunpcklqdq 
ymm7,ymm4,ymm6 1421 vpunpckhqdq ymm4,ymm4,ymm6 1422 vperm2i128 ymm8,ymm13,ymm5,0x20 1423 vperm2i128 ymm5,ymm13,ymm5,0x31 1424 vperm2i128 ymm13,ymm2,ymm15,0x20 1425 vperm2i128 ymm15,ymm2,ymm15,0x31 1426 vperm2i128 ymm2,ymm9,ymm7,0x20 1427 vperm2i128 ymm7,ymm9,ymm7,0x31 1428 vperm2i128 ymm9,ymm12,ymm4,0x20 1429 vperm2i128 ymm4,ymm12,ymm4,0x31 1430 vmovdqa ymm6,YMMWORD[rsp] 1431 vmovdqa ymm12,YMMWORD[32+rsp] 1432 1433 cmp rdx,64*8 1434 jb NEAR $L$tail8x 1435 1436 vpxor ymm6,ymm6,YMMWORD[rsi] 1437 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1438 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1439 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1440 lea rsi,[128+rsi] 1441 vmovdqu YMMWORD[rdi],ymm6 1442 vmovdqu YMMWORD[32+rdi],ymm8 1443 vmovdqu YMMWORD[64+rdi],ymm1 1444 vmovdqu YMMWORD[96+rdi],ymm5 1445 lea rdi,[128+rdi] 1446 1447 vpxor ymm12,ymm12,YMMWORD[rsi] 1448 vpxor ymm13,ymm13,YMMWORD[32+rsi] 1449 vpxor ymm10,ymm10,YMMWORD[64+rsi] 1450 vpxor ymm15,ymm15,YMMWORD[96+rsi] 1451 lea rsi,[128+rsi] 1452 vmovdqu YMMWORD[rdi],ymm12 1453 vmovdqu YMMWORD[32+rdi],ymm13 1454 vmovdqu YMMWORD[64+rdi],ymm10 1455 vmovdqu YMMWORD[96+rdi],ymm15 1456 lea rdi,[128+rdi] 1457 1458 vpxor ymm14,ymm14,YMMWORD[rsi] 1459 vpxor ymm2,ymm2,YMMWORD[32+rsi] 1460 vpxor ymm3,ymm3,YMMWORD[64+rsi] 1461 vpxor ymm7,ymm7,YMMWORD[96+rsi] 1462 lea rsi,[128+rsi] 1463 vmovdqu YMMWORD[rdi],ymm14 1464 vmovdqu YMMWORD[32+rdi],ymm2 1465 vmovdqu YMMWORD[64+rdi],ymm3 1466 vmovdqu YMMWORD[96+rdi],ymm7 1467 lea rdi,[128+rdi] 1468 1469 vpxor ymm11,ymm11,YMMWORD[rsi] 1470 vpxor ymm9,ymm9,YMMWORD[32+rsi] 1471 vpxor ymm0,ymm0,YMMWORD[64+rsi] 1472 vpxor ymm4,ymm4,YMMWORD[96+rsi] 1473 lea rsi,[128+rsi] 1474 vmovdqu YMMWORD[rdi],ymm11 1475 vmovdqu YMMWORD[32+rdi],ymm9 1476 vmovdqu YMMWORD[64+rdi],ymm0 1477 vmovdqu YMMWORD[96+rdi],ymm4 1478 lea rdi,[128+rdi] 1479 1480 sub rdx,64*8 1481 jnz NEAR $L$oop_outer8x 1482 1483 jmp NEAR $L$done8x 1484 1485$L$tail8x: 1486 cmp rdx,448 1487 jae NEAR $L$448_or_more8x 1488 cmp rdx,384 1489 jae NEAR $L$384_or_more8x 1490 cmp 
rdx,320 1491 jae NEAR $L$320_or_more8x 1492 cmp rdx,256 1493 jae NEAR $L$256_or_more8x 1494 cmp rdx,192 1495 jae NEAR $L$192_or_more8x 1496 cmp rdx,128 1497 jae NEAR $L$128_or_more8x 1498 cmp rdx,64 1499 jae NEAR $L$64_or_more8x 1500 1501 xor r10,r10 1502 vmovdqa YMMWORD[rsp],ymm6 1503 vmovdqa YMMWORD[32+rsp],ymm8 1504 jmp NEAR $L$oop_tail8x 1505 1506ALIGN 32 1507$L$64_or_more8x: 1508 vpxor ymm6,ymm6,YMMWORD[rsi] 1509 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1510 vmovdqu YMMWORD[rdi],ymm6 1511 vmovdqu YMMWORD[32+rdi],ymm8 1512 je NEAR $L$done8x 1513 1514 lea rsi,[64+rsi] 1515 xor r10,r10 1516 vmovdqa YMMWORD[rsp],ymm1 1517 lea rdi,[64+rdi] 1518 sub rdx,64 1519 vmovdqa YMMWORD[32+rsp],ymm5 1520 jmp NEAR $L$oop_tail8x 1521 1522ALIGN 32 1523$L$128_or_more8x: 1524 vpxor ymm6,ymm6,YMMWORD[rsi] 1525 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1526 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1527 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1528 vmovdqu YMMWORD[rdi],ymm6 1529 vmovdqu YMMWORD[32+rdi],ymm8 1530 vmovdqu YMMWORD[64+rdi],ymm1 1531 vmovdqu YMMWORD[96+rdi],ymm5 1532 je NEAR $L$done8x 1533 1534 lea rsi,[128+rsi] 1535 xor r10,r10 1536 vmovdqa YMMWORD[rsp],ymm12 1537 lea rdi,[128+rdi] 1538 sub rdx,128 1539 vmovdqa YMMWORD[32+rsp],ymm13 1540 jmp NEAR $L$oop_tail8x 1541 1542ALIGN 32 1543$L$192_or_more8x: 1544 vpxor ymm6,ymm6,YMMWORD[rsi] 1545 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1546 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1547 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1548 vpxor ymm12,ymm12,YMMWORD[128+rsi] 1549 vpxor ymm13,ymm13,YMMWORD[160+rsi] 1550 vmovdqu YMMWORD[rdi],ymm6 1551 vmovdqu YMMWORD[32+rdi],ymm8 1552 vmovdqu YMMWORD[64+rdi],ymm1 1553 vmovdqu YMMWORD[96+rdi],ymm5 1554 vmovdqu YMMWORD[128+rdi],ymm12 1555 vmovdqu YMMWORD[160+rdi],ymm13 1556 je NEAR $L$done8x 1557 1558 lea rsi,[192+rsi] 1559 xor r10,r10 1560 vmovdqa YMMWORD[rsp],ymm10 1561 lea rdi,[192+rdi] 1562 sub rdx,192 1563 vmovdqa YMMWORD[32+rsp],ymm15 1564 jmp NEAR $L$oop_tail8x 1565 1566ALIGN 32 1567$L$256_or_more8x: 1568 vpxor ymm6,ymm6,YMMWORD[rsi] 1569 vpxor 
ymm8,ymm8,YMMWORD[32+rsi] 1570 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1571 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1572 vpxor ymm12,ymm12,YMMWORD[128+rsi] 1573 vpxor ymm13,ymm13,YMMWORD[160+rsi] 1574 vpxor ymm10,ymm10,YMMWORD[192+rsi] 1575 vpxor ymm15,ymm15,YMMWORD[224+rsi] 1576 vmovdqu YMMWORD[rdi],ymm6 1577 vmovdqu YMMWORD[32+rdi],ymm8 1578 vmovdqu YMMWORD[64+rdi],ymm1 1579 vmovdqu YMMWORD[96+rdi],ymm5 1580 vmovdqu YMMWORD[128+rdi],ymm12 1581 vmovdqu YMMWORD[160+rdi],ymm13 1582 vmovdqu YMMWORD[192+rdi],ymm10 1583 vmovdqu YMMWORD[224+rdi],ymm15 1584 je NEAR $L$done8x 1585 1586 lea rsi,[256+rsi] 1587 xor r10,r10 1588 vmovdqa YMMWORD[rsp],ymm14 1589 lea rdi,[256+rdi] 1590 sub rdx,256 1591 vmovdqa YMMWORD[32+rsp],ymm2 1592 jmp NEAR $L$oop_tail8x 1593 1594ALIGN 32 1595$L$320_or_more8x: 1596 vpxor ymm6,ymm6,YMMWORD[rsi] 1597 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1598 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1599 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1600 vpxor ymm12,ymm12,YMMWORD[128+rsi] 1601 vpxor ymm13,ymm13,YMMWORD[160+rsi] 1602 vpxor ymm10,ymm10,YMMWORD[192+rsi] 1603 vpxor ymm15,ymm15,YMMWORD[224+rsi] 1604 vpxor ymm14,ymm14,YMMWORD[256+rsi] 1605 vpxor ymm2,ymm2,YMMWORD[288+rsi] 1606 vmovdqu YMMWORD[rdi],ymm6 1607 vmovdqu YMMWORD[32+rdi],ymm8 1608 vmovdqu YMMWORD[64+rdi],ymm1 1609 vmovdqu YMMWORD[96+rdi],ymm5 1610 vmovdqu YMMWORD[128+rdi],ymm12 1611 vmovdqu YMMWORD[160+rdi],ymm13 1612 vmovdqu YMMWORD[192+rdi],ymm10 1613 vmovdqu YMMWORD[224+rdi],ymm15 1614 vmovdqu YMMWORD[256+rdi],ymm14 1615 vmovdqu YMMWORD[288+rdi],ymm2 1616 je NEAR $L$done8x 1617 1618 lea rsi,[320+rsi] 1619 xor r10,r10 1620 vmovdqa YMMWORD[rsp],ymm3 1621 lea rdi,[320+rdi] 1622 sub rdx,320 1623 vmovdqa YMMWORD[32+rsp],ymm7 1624 jmp NEAR $L$oop_tail8x 1625 1626ALIGN 32 1627$L$384_or_more8x: 1628 vpxor ymm6,ymm6,YMMWORD[rsi] 1629 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1630 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1631 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1632 vpxor ymm12,ymm12,YMMWORD[128+rsi] 1633 vpxor ymm13,ymm13,YMMWORD[160+rsi] 1634 vpxor 
ymm10,ymm10,YMMWORD[192+rsi] 1635 vpxor ymm15,ymm15,YMMWORD[224+rsi] 1636 vpxor ymm14,ymm14,YMMWORD[256+rsi] 1637 vpxor ymm2,ymm2,YMMWORD[288+rsi] 1638 vpxor ymm3,ymm3,YMMWORD[320+rsi] 1639 vpxor ymm7,ymm7,YMMWORD[352+rsi] 1640 vmovdqu YMMWORD[rdi],ymm6 1641 vmovdqu YMMWORD[32+rdi],ymm8 1642 vmovdqu YMMWORD[64+rdi],ymm1 1643 vmovdqu YMMWORD[96+rdi],ymm5 1644 vmovdqu YMMWORD[128+rdi],ymm12 1645 vmovdqu YMMWORD[160+rdi],ymm13 1646 vmovdqu YMMWORD[192+rdi],ymm10 1647 vmovdqu YMMWORD[224+rdi],ymm15 1648 vmovdqu YMMWORD[256+rdi],ymm14 1649 vmovdqu YMMWORD[288+rdi],ymm2 1650 vmovdqu YMMWORD[320+rdi],ymm3 1651 vmovdqu YMMWORD[352+rdi],ymm7 1652 je NEAR $L$done8x 1653 1654 lea rsi,[384+rsi] 1655 xor r10,r10 1656 vmovdqa YMMWORD[rsp],ymm11 1657 lea rdi,[384+rdi] 1658 sub rdx,384 1659 vmovdqa YMMWORD[32+rsp],ymm9 1660 jmp NEAR $L$oop_tail8x 1661 1662ALIGN 32 1663$L$448_or_more8x: 1664 vpxor ymm6,ymm6,YMMWORD[rsi] 1665 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1666 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1667 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1668 vpxor ymm12,ymm12,YMMWORD[128+rsi] 1669 vpxor ymm13,ymm13,YMMWORD[160+rsi] 1670 vpxor ymm10,ymm10,YMMWORD[192+rsi] 1671 vpxor ymm15,ymm15,YMMWORD[224+rsi] 1672 vpxor ymm14,ymm14,YMMWORD[256+rsi] 1673 vpxor ymm2,ymm2,YMMWORD[288+rsi] 1674 vpxor ymm3,ymm3,YMMWORD[320+rsi] 1675 vpxor ymm7,ymm7,YMMWORD[352+rsi] 1676 vpxor ymm11,ymm11,YMMWORD[384+rsi] 1677 vpxor ymm9,ymm9,YMMWORD[416+rsi] 1678 vmovdqu YMMWORD[rdi],ymm6 1679 vmovdqu YMMWORD[32+rdi],ymm8 1680 vmovdqu YMMWORD[64+rdi],ymm1 1681 vmovdqu YMMWORD[96+rdi],ymm5 1682 vmovdqu YMMWORD[128+rdi],ymm12 1683 vmovdqu YMMWORD[160+rdi],ymm13 1684 vmovdqu YMMWORD[192+rdi],ymm10 1685 vmovdqu YMMWORD[224+rdi],ymm15 1686 vmovdqu YMMWORD[256+rdi],ymm14 1687 vmovdqu YMMWORD[288+rdi],ymm2 1688 vmovdqu YMMWORD[320+rdi],ymm3 1689 vmovdqu YMMWORD[352+rdi],ymm7 1690 vmovdqu YMMWORD[384+rdi],ymm11 1691 vmovdqu YMMWORD[416+rdi],ymm9 1692 je NEAR $L$done8x 1693 1694 lea rsi,[448+rsi] 1695 xor r10,r10 1696 vmovdqa 
YMMWORD[rsp],ymm0	; operands of the vmovdqa mnemonic that ends the previous line
	lea	rdi,[448+rdi]
	sub	rdx,448
	vmovdqa	YMMWORD[32+rsp],ymm4

; Byte-at-a-time tail: XOR the rdx remaining input bytes at rsi with the
; 64 bytes staged at [rsp], storing to rdi.  r10 is the running byte index
; (zeroed by every path that reaches this label).
$L$oop_tail8x:
	movzx	eax,BYTE[r10*1+rsi]
	movzx	ecx,BYTE[r10*1+rsp]
	lea	r10,[1+r10]
	xor	eax,ecx
	mov	BYTE[((-1))+r10*1+rdi],al
	dec	rdx
	jnz	NEAR $L$oop_tail8x

; Common exit: clear every YMM register, reload xmm6-xmm15 from the save
; area that sits just below r9, then reset rsp to r9.
$L$done8x:
	vzeroall
	movaps	xmm6,XMMWORD[((-168))+r9]
	movaps	xmm7,XMMWORD[((-152))+r9]
	movaps	xmm8,XMMWORD[((-136))+r9]
	movaps	xmm9,XMMWORD[((-120))+r9]
	movaps	xmm10,XMMWORD[((-104))+r9]
	movaps	xmm11,XMMWORD[((-88))+r9]
	movaps	xmm12,XMMWORD[((-72))+r9]
	movaps	xmm13,XMMWORD[((-56))+r9]
	movaps	xmm14,XMMWORD[((-40))+r9]
	movaps	xmm15,XMMWORD[((-24))+r9]
	lea	rsp,[r9]

$L$8x_epilogue:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_ChaCha20_8x:
EXTERN	__imp_RtlVirtualUnwind

;-----------------------------------------------------------------------
; Win64 structured-exception handlers for the routines in this file.
;
; Each handler is entered with r8 -> CONTEXT and r9 -> DISPATCHER_CONTEXT
; (Microsoft x64 language-specific handler convention).  It patches the
; CONTEXT so that it describes the caller's frame, copies it into
; disp->ContextRecord, calls RtlVirtualUnwind and returns 1 in eax.
;
; The %defines below only name the byte offsets the handlers use; they
; expand to the same literals the generator emitted.
;-----------------------------------------------------------------------

; CONTEXT field offsets (relative to r8).
%define CTX_Rax			120
%define CTX_Rbx			144
%define CTX_Rsp			152
%define CTX_Rbp			160
%define CTX_Rsi			168
%define CTX_Rdi			176
%define CTX_R9			192
%define CTX_R12			216
%define CTX_R13			224
%define CTX_R14			232
%define CTX_R15			240
%define CTX_Rip			248
%define CTX_Xmm6		512
%define CTX_QWORDS		154	; number of qwords copied for a whole CONTEXT

; DISPATCHER_CONTEXT field offsets (relative to r9; ControlPc is at 0).
%define DISP_ImageBase		8
%define DISP_FunctionEntry	16
%define DISP_EstablisherFrame	24
%define DISP_ContextRecord	40
%define DISP_HandlerData	56

; Prologue shared by all three handlers.  Saves the non-volatile GPRs and
; flags, reserves 64 bytes (32 of shadow space plus four stack argument
; slots for RtlVirtualUnwind), then loads:
;   rax = context->Rax     rbx = context->Rip
;   rsi = disp->ImageBase  r11 = disp->HandlerData
%macro SEH_HANDLER_ENTER 0
	push	rsi
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64

	mov	rax,QWORD[r8+CTX_Rax]
	mov	rbx,QWORD[r8+CTX_Rip]

	mov	rsi,QWORD[r9+DISP_ImageBase]
	mov	r11,QWORD[r9+DISP_HandlerData]
%endmacro

; Handler for ChaCha20_ctr32 (integer-only path).
ALIGN	16
se_handler:
	SEH_HANDLER_ENTER

	; Rip below $L$ctr32_body: keep rax = context->Rax as the frame base.
	lea	r10,[$L$ctr32_body]
	cmp	rbx,r10
	jb	NEAR $L$common_seh_tail

	mov	rax,QWORD[r8+CTX_Rsp]

	; Rip at or beyond $L$no_data: use context->Rsp unchanged.
	lea	r10,[$L$no_data]
	cmp	rbx,r10
	jae	NEAR $L$common_seh_tail

	; Inside the body: step over the 64+24 bytes of locals and the six
	; pushed registers (48 bytes), then read those registers back in the
	; reverse of their push order.
	lea	rax,[rax+(64+24+48)]

	mov	rbx,QWORD[rax-8]
	mov	rbp,QWORD[rax-16]
	mov	r12,QWORD[rax-24]
	mov	r13,QWORD[rax-32]
	mov	r14,QWORD[rax-40]
	mov	r15,QWORD[rax-48]
	mov	QWORD[r8+CTX_Rbx],rbx
	mov	QWORD[r8+CTX_Rbp],rbp
	mov	QWORD[r8+CTX_R12],r12
	mov	QWORD[r8+CTX_R13],r13
	mov	QWORD[r8+CTX_R14],r14
	mov	QWORD[r8+CTX_R15],r15

; Shared tail, also reached from ssse3_handler and full_handler.
; On entry rax is the frame base; [rax+8] / [rax+16] are the slots the
; WIN64 prologue stores rdi / rsi into.
$L$common_seh_tail:
	mov	rdi,QWORD[rax+8]
	mov	rsi,QWORD[rax+16]
	mov	QWORD[r8+CTX_Rsp],rax
	mov	QWORD[r8+CTX_Rsi],rsi
	mov	QWORD[r8+CTX_Rdi],rdi

	; Copy the patched CONTEXT into disp->ContextRecord.
	mov	rdi,QWORD[r9+DISP_ContextRecord]
	mov	rsi,r8
	mov	ecx,CTX_QWORDS
	cld
	rep movsq

	; RtlVirtualUnwind(0, ImageBase, ControlPc, FunctionEntry,
	;                  ContextRecord, &HandlerData, &EstablisherFrame, NULL)
	mov	rsi,r9
	xor	rcx,rcx
	mov	rdx,QWORD[rsi+DISP_ImageBase]
	mov	r8,QWORD[rsi]
	mov	r9,QWORD[rsi+DISP_FunctionEntry]
	mov	r10,QWORD[rsi+DISP_ContextRecord]
	lea	r11,[rsi+DISP_HandlerData]
	lea	r12,[rsi+DISP_EstablisherFrame]
	mov	QWORD[32+rsp],r10
	mov	QWORD[40+rsp],r11
	mov	QWORD[48+rsp],r12
	mov	QWORD[56+rsp],rcx
	call	QWORD[__imp_RtlVirtualUnwind]

	mov	eax,1			; ExceptionContinueSearch
	add	rsp,64
	popfq
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rbp
	pop	rbx
	pop	rdi
	pop	rsi
	ret



; Handler for ChaCha20_ssse3.  HandlerData holds two image-relative
; addresses (body label, epilogue label); between them the frame base is
; context->R9 and 4 qwords starting 40 bytes below it are copied into the
; CONTEXT from Xmm6 onwards.
ALIGN	16
ssse3_handler:
	SEH_HANDLER_ENTER

	mov	r10d,DWORD[r11]		; HandlerData[0]
	lea	r10,[r10*1+rsi]		; + ImageBase
	cmp	rbx,r10
	jb	NEAR $L$common_seh_tail

	mov	rax,QWORD[r8+CTX_R9]

	mov	r10d,DWORD[4+r11]	; HandlerData[1]
	lea	r10,[r10*1+rsi]		; + ImageBase
	cmp	rbx,r10
	jae	NEAR $L$common_seh_tail

	lea	rsi,[rax-40]
	lea	rdi,[r8+CTX_Xmm6]
	mov	ecx,4
	cld
	rep movsq

	jmp	NEAR $L$common_seh_tail



; Handler for ChaCha20_4x and ChaCha20_8x.  Same shape as ssse3_handler,
; but copies 20 qwords (the ten registers xmm6-xmm15) from the save area
; that starts 168 bytes below context->R9.
ALIGN	16
full_handler:
	SEH_HANDLER_ENTER

	mov	r10d,DWORD[r11]		; HandlerData[0]
	lea	r10,[r10*1+rsi]		; + ImageBase
	cmp	rbx,r10
	jb	NEAR $L$common_seh_tail

	mov	rax,QWORD[r8+CTX_R9]

	mov	r10d,DWORD[4+r11]	; HandlerData[1]
	lea	r10,[r10*1+rsi]		; + ImageBase
	cmp	rbx,r10
	jae	NEAR $L$common_seh_tail

	lea	rsi,[rax-168]
	lea	rdi,[r8+CTX_Xmm6]
	mov	ecx,20
	cld
	rep movsq

	jmp	NEAR $L$common_seh_tail


; Function table: one (begin, end, unwind-info) triple of image-relative
; addresses per routine.
section	.pdata rdata align=4
ALIGN	4
	DD	$L$SEH_begin_ChaCha20_ctr32 wrt ..imagebase
	DD	$L$SEH_end_ChaCha20_ctr32 wrt ..imagebase
	DD	$L$SEH_info_ChaCha20_ctr32 wrt ..imagebase

	DD	$L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase
	DD	$L$SEH_end_ChaCha20_ssse3 wrt ..imagebase
	DD	$L$SEH_info_ChaCha20_ssse3 wrt ..imagebase

	DD	$L$SEH_begin_ChaCha20_4x wrt ..imagebase
	DD	$L$SEH_end_ChaCha20_4x wrt ..imagebase
	DD	$L$SEH_info_ChaCha20_4x wrt ..imagebase
	DD	$L$SEH_begin_ChaCha20_8x wrt ..imagebase
	DD	$L$SEH_end_ChaCha20_8x wrt ..imagebase
	DD	$L$SEH_info_ChaCha20_8x wrt ..imagebase

; Unwind info: a 4-byte header (9,0,0,0), the handler's image-relative
; address and, for all but ctr32, the two HandlerData entries (body label,
; epilogue label) that the handlers read through r11.
section	.xdata rdata align=8
ALIGN	8
$L$SEH_info_ChaCha20_ctr32:
	DB	9,0,0,0
	DD	se_handler wrt ..imagebase

$L$SEH_info_ChaCha20_ssse3:
	DB	9,0,0,0
	DD	ssse3_handler wrt ..imagebase
	DD	$L$ssse3_body wrt ..imagebase
	DD	$L$ssse3_epilogue wrt ..imagebase

$L$SEH_info_ChaCha20_4x:
	DB	9,0,0,0
	DD	full_handler wrt ..imagebase
	DD	$L$4x_body wrt ..imagebase
	DD	$L$4x_epilogue wrt ..imagebase
$L$SEH_info_ChaCha20_8x:
	DB	9,0,0,0
	DD	full_handler wrt ..imagebase
	DD	$L$8x_body wrt ..imagebase
	DD	$L$8x_epilogue wrt ..imagebase
%else
; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
ret
%endif