1; This file is generated from a similarly-named Perl script in the BoringSSL 2; source tree. Do not edit by hand. 3 4%ifidn __OUTPUT_FORMAT__, win64 5default rel 6%define XMMWORD 7%define YMMWORD 8%define ZMMWORD 9%define _CET_ENDBR 10 11%ifdef BORINGSSL_PREFIX 12%include "boringssl_prefix_symbols_nasm.inc" 13%endif 14section .text code align=64 15 16 17section .rdata rdata align=8 18ALIGN 64 19$L$zero: 20 DD 0,0,0,0 21$L$one: 22 DD 1,0,0,0 23$L$inc: 24 DD 0,1,2,3 25$L$four: 26 DD 4,4,4,4 27$L$incy: 28 DD 0,2,4,6,1,3,5,7 29$L$eight: 30 DD 8,8,8,8,8,8,8,8 31$L$rot16: 32 DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd 33$L$rot24: 34 DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe 35$L$sigma: 36 DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107 37 DB 0 38ALIGN 64 39$L$zeroz: 40 DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0 41$L$fourz: 42 DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0 43$L$incz: 44 DD 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 45$L$sixteen: 46 DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 47 DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 48 DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32 49 DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115 50 DB 108,46,111,114,103,62,0 51section .text 52 53global ChaCha20_ctr32_nohw 54 55ALIGN 64 56ChaCha20_ctr32_nohw: 57 mov QWORD[8+rsp],rdi ;WIN64 prologue 58 mov QWORD[16+rsp],rsi 59 mov rax,rsp 60$L$SEH_begin_ChaCha20_ctr32_nohw: 61 mov rdi,rcx 62 mov rsi,rdx 63 mov rdx,r8 64 mov rcx,r9 65 mov r8,QWORD[40+rsp] 66 67 68 69_CET_ENDBR 70 push rbx 71 72 push rbp 73 74 push r12 75 76 push r13 77 78 push r14 79 80 push r15 81 82 sub rsp,64+24 83 84$L$ctr32_body: 85 86 87 movdqu xmm1,XMMWORD[rcx] 88 movdqu xmm2,XMMWORD[16+rcx] 89 movdqu xmm3,XMMWORD[r8] 90 movdqa xmm4,XMMWORD[$L$one] 91 92 93 movdqa XMMWORD[16+rsp],xmm1 94 movdqa XMMWORD[32+rsp],xmm2 95 movdqa XMMWORD[48+rsp],xmm3 96 mov rbp,rdx 97 jmp NEAR $L$oop_outer 98 99ALIGN 32 100$L$oop_outer: 101 mov eax,0x61707865 102 mov ebx,0x3320646e 103 mov ecx,0x79622d32 104 mov edx,0x6b206574 105 mov r8d,DWORD[16+rsp] 106 mov r9d,DWORD[20+rsp] 107 mov r10d,DWORD[24+rsp] 108 mov r11d,DWORD[28+rsp] 109 movd r12d,xmm3 110 mov r13d,DWORD[52+rsp] 111 mov r14d,DWORD[56+rsp] 112 mov r15d,DWORD[60+rsp] 113 114 mov QWORD[((64+0))+rsp],rbp 115 mov ebp,10 116 mov QWORD[((64+8))+rsp],rsi 117DB 102,72,15,126,214 118 mov QWORD[((64+16))+rsp],rdi 119 mov rdi,rsi 120 shr rdi,32 121 jmp NEAR $L$oop 122 123ALIGN 32 124$L$oop: 125 add eax,r8d 126 xor r12d,eax 127 rol r12d,16 128 add ebx,r9d 129 xor r13d,ebx 130 rol r13d,16 131 add esi,r12d 132 xor r8d,esi 133 rol r8d,12 134 add edi,r13d 135 xor r9d,edi 136 rol r9d,12 137 add eax,r8d 138 xor r12d,eax 139 rol r12d,8 140 add ebx,r9d 141 xor r13d,ebx 142 rol r13d,8 143 add esi,r12d 144 xor r8d,esi 145 rol r8d,7 146 add edi,r13d 147 xor r9d,edi 148 rol r9d,7 149 mov DWORD[32+rsp],esi 150 mov DWORD[36+rsp],edi 151 mov esi,DWORD[40+rsp] 152 mov edi,DWORD[44+rsp] 153 add ecx,r10d 154 xor r14d,ecx 155 rol r14d,16 156 add edx,r11d 157 xor r15d,edx 158 rol r15d,16 159 add esi,r14d 160 xor r10d,esi 161 rol r10d,12 162 add edi,r15d 163 xor r11d,edi 164 rol r11d,12 165 add ecx,r10d 166 xor r14d,ecx 167 rol r14d,8 168 add edx,r11d 169 xor r15d,edx 170 rol r15d,8 171 add esi,r14d 172 xor r10d,esi 173 rol r10d,7 174 add edi,r15d 175 xor r11d,edi 176 rol r11d,7 177 add eax,r9d 178 xor r15d,eax 179 rol r15d,16 180 add ebx,r10d 181 xor r12d,ebx 182 rol r12d,16 183 add esi,r15d 184 xor r9d,esi 185 rol r9d,12 186 add edi,r12d 187 xor r10d,edi 188 rol r10d,12 189 add eax,r9d 190 xor r15d,eax 191 rol r15d,8 192 add ebx,r10d 193 xor r12d,ebx 194 rol r12d,8 195 add esi,r15d 196 xor r9d,esi 197 rol r9d,7 198 add edi,r12d 199 xor r10d,edi 200 rol r10d,7 201 mov DWORD[40+rsp],esi 202 mov DWORD[44+rsp],edi 203 mov esi,DWORD[32+rsp] 204 mov edi,DWORD[36+rsp] 205 add ecx,r11d 206 xor r13d,ecx 207 rol r13d,16 208 add edx,r8d 209 xor r14d,edx 210 rol r14d,16 211 add esi,r13d 212 xor r11d,esi 213 rol r11d,12 214 add edi,r14d 215 xor r8d,edi 216 rol r8d,12 217 add ecx,r11d 218 xor r13d,ecx 219 rol r13d,8 220 add edx,r8d 221 xor r14d,edx 222 rol r14d,8 223 add esi,r13d 224 xor r11d,esi 225 rol r11d,7 226 add edi,r14d 227 xor r8d,edi 228 rol r8d,7 229 dec ebp 230 jnz NEAR $L$oop 231 mov DWORD[36+rsp],edi 232 mov DWORD[32+rsp],esi 233 mov rbp,QWORD[64+rsp] 234 movdqa xmm1,xmm2 235 mov rsi,QWORD[((64+8))+rsp] 236 paddd xmm3,xmm4 237 mov rdi,QWORD[((64+16))+rsp] 238 239 add eax,0x61707865 240 add ebx,0x3320646e 241 add ecx,0x79622d32 242 add edx,0x6b206574 243 add r8d,DWORD[16+rsp] 244 add r9d,DWORD[20+rsp] 245 add r10d,DWORD[24+rsp] 246 add r11d,DWORD[28+rsp] 247 add r12d,DWORD[48+rsp] 248 add r13d,DWORD[52+rsp] 249 add r14d,DWORD[56+rsp] 250 add r15d,DWORD[60+rsp] 251 paddd xmm1,XMMWORD[32+rsp] 252 253 cmp rbp,64 254 jb NEAR $L$tail 255 256 xor eax,DWORD[rsi] 257 xor ebx,DWORD[4+rsi] 258 xor ecx,DWORD[8+rsi] 259 xor edx,DWORD[12+rsi] 260 xor r8d,DWORD[16+rsi] 261 xor r9d,DWORD[20+rsi] 262 xor r10d,DWORD[24+rsi] 263 xor r11d,DWORD[28+rsi] 264 movdqu xmm0,XMMWORD[32+rsi] 265 xor r12d,DWORD[48+rsi] 266 xor r13d,DWORD[52+rsi] 267 xor r14d,DWORD[56+rsi] 268 xor r15d,DWORD[60+rsi] 269 lea rsi,[64+rsi] 270 pxor xmm0,xmm1 271 272 movdqa XMMWORD[32+rsp],xmm2 273 movd DWORD[48+rsp],xmm3 274 275 mov DWORD[rdi],eax 276 mov DWORD[4+rdi],ebx 277 mov DWORD[8+rdi],ecx 278 mov DWORD[12+rdi],edx 279 mov DWORD[16+rdi],r8d 280 mov DWORD[20+rdi],r9d 281 mov DWORD[24+rdi],r10d 282 mov DWORD[28+rdi],r11d 283 movdqu XMMWORD[32+rdi],xmm0 284 mov DWORD[48+rdi],r12d 285 mov DWORD[52+rdi],r13d 286 mov DWORD[56+rdi],r14d 287 mov DWORD[60+rdi],r15d 288 lea rdi,[64+rdi] 289 290 sub rbp,64 291 jnz NEAR $L$oop_outer 292 293 jmp NEAR $L$done 294 295ALIGN 16 296$L$tail: 297 mov DWORD[rsp],eax 298 mov DWORD[4+rsp],ebx 299 xor rbx,rbx 300 mov DWORD[8+rsp],ecx 301 mov DWORD[12+rsp],edx 302 mov DWORD[16+rsp],r8d 303 mov DWORD[20+rsp],r9d 304 mov DWORD[24+rsp],r10d 305 mov DWORD[28+rsp],r11d 306 movdqa XMMWORD[32+rsp],xmm1 307 mov DWORD[48+rsp],r12d 308 mov DWORD[52+rsp],r13d 309 mov DWORD[56+rsp],r14d 310 mov DWORD[60+rsp],r15d 311 312$L$oop_tail: 313 movzx eax,BYTE[rbx*1+rsi] 314 movzx edx,BYTE[rbx*1+rsp] 315 lea rbx,[1+rbx] 316 xor eax,edx 317 mov BYTE[((-1))+rbx*1+rdi],al 318 dec rbp 319 jnz NEAR $L$oop_tail 320 321$L$done: 322 lea rsi,[((64+24+48))+rsp] 323 mov r15,QWORD[((-48))+rsi] 324 325 mov r14,QWORD[((-40))+rsi] 326 327 mov r13,QWORD[((-32))+rsi] 328 329 mov r12,QWORD[((-24))+rsi] 330 331 mov rbp,QWORD[((-16))+rsi] 332 333 mov rbx,QWORD[((-8))+rsi] 334 335 lea rsp,[rsi] 336 337$L$no_data: 338 mov rdi,QWORD[8+rsp] ;WIN64 epilogue 339 mov rsi,QWORD[16+rsp] 340 ret 341 342$L$SEH_end_ChaCha20_ctr32_nohw: 343global ChaCha20_ctr32_ssse3 344 345ALIGN 32 346ChaCha20_ctr32_ssse3: 347 mov QWORD[8+rsp],rdi ;WIN64 prologue 348 mov QWORD[16+rsp],rsi 349 mov rax,rsp 350$L$SEH_begin_ChaCha20_ctr32_ssse3: 351 mov rdi,rcx 352 mov rsi,rdx 353 mov rdx,r8 354 mov rcx,r9 355 mov r8,QWORD[40+rsp] 356 357 358 359_CET_ENDBR 360 mov r9,rsp 361 362 sub rsp,64+40 363 movaps XMMWORD[(-40)+r9],xmm6 364 movaps XMMWORD[(-24)+r9],xmm7 365$L$ssse3_body: 366 movdqa xmm0,XMMWORD[$L$sigma] 367 movdqu xmm1,XMMWORD[rcx] 368 movdqu xmm2,XMMWORD[16+rcx] 369 movdqu xmm3,XMMWORD[r8] 370 movdqa xmm6,XMMWORD[$L$rot16] 371 movdqa xmm7,XMMWORD[$L$rot24] 372 373 movdqa XMMWORD[rsp],xmm0 374 movdqa XMMWORD[16+rsp],xmm1 375 movdqa XMMWORD[32+rsp],xmm2 376 movdqa XMMWORD[48+rsp],xmm3 377 mov r8,10 378 jmp NEAR $L$oop_ssse3 379 380ALIGN 32 381$L$oop_outer_ssse3: 382 movdqa xmm3,XMMWORD[$L$one] 383 movdqa xmm0,XMMWORD[rsp] 384 movdqa xmm1,XMMWORD[16+rsp] 385 movdqa xmm2,XMMWORD[32+rsp] 386 paddd xmm3,XMMWORD[48+rsp] 387 mov r8,10 388 movdqa XMMWORD[48+rsp],xmm3 389 jmp NEAR $L$oop_ssse3 390 391ALIGN 32 392$L$oop_ssse3: 393 paddd xmm0,xmm1 394 pxor xmm3,xmm0 395DB 102,15,56,0,222 396 paddd xmm2,xmm3 397 pxor xmm1,xmm2 398 movdqa xmm4,xmm1 399 psrld xmm1,20 400 pslld xmm4,12 401 por xmm1,xmm4 402 paddd xmm0,xmm1 403 pxor xmm3,xmm0 404DB 102,15,56,0,223 405 paddd xmm2,xmm3 406 pxor xmm1,xmm2 407 movdqa xmm4,xmm1 408 psrld xmm1,25 409 pslld xmm4,7 410 por xmm1,xmm4 411 pshufd xmm2,xmm2,78 412 pshufd xmm1,xmm1,57 413 pshufd xmm3,xmm3,147 414 nop 415 paddd xmm0,xmm1 416 pxor xmm3,xmm0 417DB 102,15,56,0,222 418 paddd xmm2,xmm3 419 pxor xmm1,xmm2 420 movdqa xmm4,xmm1 421 psrld xmm1,20 422 pslld xmm4,12 423 por xmm1,xmm4 424 paddd xmm0,xmm1 425 pxor xmm3,xmm0 426DB 102,15,56,0,223 427 paddd xmm2,xmm3 428 pxor xmm1,xmm2 429 movdqa xmm4,xmm1 430 psrld xmm1,25 431 pslld xmm4,7 432 por xmm1,xmm4 433 pshufd xmm2,xmm2,78 434 pshufd xmm1,xmm1,147 435 pshufd xmm3,xmm3,57 436 dec r8 437 jnz NEAR $L$oop_ssse3 438 paddd xmm0,XMMWORD[rsp] 439 paddd xmm1,XMMWORD[16+rsp] 440 paddd xmm2,XMMWORD[32+rsp] 441 paddd xmm3,XMMWORD[48+rsp] 442 443 cmp rdx,64 444 jb NEAR $L$tail_ssse3 445 446 movdqu xmm4,XMMWORD[rsi] 447 movdqu xmm5,XMMWORD[16+rsi] 448 pxor xmm0,xmm4 449 movdqu xmm4,XMMWORD[32+rsi] 450 pxor xmm1,xmm5 451 movdqu xmm5,XMMWORD[48+rsi] 452 lea rsi,[64+rsi] 453 pxor xmm2,xmm4 454 pxor xmm3,xmm5 455 456 movdqu XMMWORD[rdi],xmm0 457 movdqu XMMWORD[16+rdi],xmm1 458 movdqu XMMWORD[32+rdi],xmm2 459 movdqu XMMWORD[48+rdi],xmm3 460 lea rdi,[64+rdi] 461 462 sub rdx,64 463 jnz NEAR $L$oop_outer_ssse3 464 465 jmp NEAR $L$done_ssse3 466 467ALIGN 16 468$L$tail_ssse3: 469 movdqa XMMWORD[rsp],xmm0 470 movdqa XMMWORD[16+rsp],xmm1 471 movdqa XMMWORD[32+rsp],xmm2 472 movdqa XMMWORD[48+rsp],xmm3 473 xor r8,r8 474 475$L$oop_tail_ssse3: 476 movzx eax,BYTE[r8*1+rsi] 477 movzx ecx,BYTE[r8*1+rsp] 478 lea r8,[1+r8] 479 xor eax,ecx 480 mov BYTE[((-1))+r8*1+rdi],al 481 dec rdx 482 jnz NEAR $L$oop_tail_ssse3 483 484$L$done_ssse3: 485 movaps xmm6,XMMWORD[((-40))+r9] 486 movaps xmm7,XMMWORD[((-24))+r9] 487 lea rsp,[r9] 488 489$L$ssse3_epilogue: 490 mov rdi,QWORD[8+rsp] ;WIN64 epilogue 491 mov rsi,QWORD[16+rsp] 492 ret 493 494$L$SEH_end_ChaCha20_ctr32_ssse3: 495global ChaCha20_ctr32_ssse3_4x 496 497ALIGN 32 498ChaCha20_ctr32_ssse3_4x: 499 mov QWORD[8+rsp],rdi ;WIN64 prologue 500 mov QWORD[16+rsp],rsi 501 mov rax,rsp 502$L$SEH_begin_ChaCha20_ctr32_ssse3_4x: 503 mov rdi,rcx 504 mov rsi,rdx 505 mov rdx,r8 506 mov rcx,r9 507 mov r8,QWORD[40+rsp] 508 509 510 511_CET_ENDBR 512 mov r9,rsp 513 514 sub rsp,0x140+168 515 movaps XMMWORD[(-168)+r9],xmm6 516 movaps XMMWORD[(-152)+r9],xmm7 517 movaps XMMWORD[(-136)+r9],xmm8 518 movaps XMMWORD[(-120)+r9],xmm9 519 movaps XMMWORD[(-104)+r9],xmm10 520 movaps XMMWORD[(-88)+r9],xmm11 521 movaps XMMWORD[(-72)+r9],xmm12 522 movaps XMMWORD[(-56)+r9],xmm13 523 movaps XMMWORD[(-40)+r9],xmm14 524 movaps XMMWORD[(-24)+r9],xmm15 525$L$4x_body: 526 movdqa xmm11,XMMWORD[$L$sigma] 527 movdqu xmm15,XMMWORD[rcx] 528 movdqu xmm7,XMMWORD[16+rcx] 529 movdqu xmm3,XMMWORD[r8] 530 lea rcx,[256+rsp] 531 lea r10,[$L$rot16] 532 lea r11,[$L$rot24] 533 534 pshufd xmm8,xmm11,0x00 535 pshufd xmm9,xmm11,0x55 536 movdqa XMMWORD[64+rsp],xmm8 537 pshufd xmm10,xmm11,0xaa 538 movdqa XMMWORD[80+rsp],xmm9 539 pshufd xmm11,xmm11,0xff 540 movdqa XMMWORD[96+rsp],xmm10 541 movdqa XMMWORD[112+rsp],xmm11 542 543 pshufd xmm12,xmm15,0x00 544 pshufd xmm13,xmm15,0x55 545 movdqa XMMWORD[(128-256)+rcx],xmm12 546 pshufd xmm14,xmm15,0xaa 547 movdqa XMMWORD[(144-256)+rcx],xmm13 548 pshufd xmm15,xmm15,0xff 549 movdqa XMMWORD[(160-256)+rcx],xmm14 550 movdqa XMMWORD[(176-256)+rcx],xmm15 551 552 pshufd xmm4,xmm7,0x00 553 pshufd xmm5,xmm7,0x55 554 movdqa XMMWORD[(192-256)+rcx],xmm4 555 pshufd xmm6,xmm7,0xaa 556 movdqa XMMWORD[(208-256)+rcx],xmm5 557 pshufd xmm7,xmm7,0xff 558 movdqa XMMWORD[(224-256)+rcx],xmm6 559 movdqa XMMWORD[(240-256)+rcx],xmm7 560 561 pshufd xmm0,xmm3,0x00 562 pshufd xmm1,xmm3,0x55 563 paddd xmm0,XMMWORD[$L$inc] 564 pshufd xmm2,xmm3,0xaa 565 movdqa XMMWORD[(272-256)+rcx],xmm1 566 pshufd xmm3,xmm3,0xff 567 movdqa XMMWORD[(288-256)+rcx],xmm2 568 movdqa XMMWORD[(304-256)+rcx],xmm3 569 570 jmp NEAR $L$oop_enter4x 571 572ALIGN 32 573$L$oop_outer4x: 574 movdqa xmm8,XMMWORD[64+rsp] 575 movdqa xmm9,XMMWORD[80+rsp] 576 movdqa xmm10,XMMWORD[96+rsp] 577 movdqa xmm11,XMMWORD[112+rsp] 578 movdqa xmm12,XMMWORD[((128-256))+rcx] 579 movdqa xmm13,XMMWORD[((144-256))+rcx] 580 movdqa xmm14,XMMWORD[((160-256))+rcx] 581 movdqa xmm15,XMMWORD[((176-256))+rcx] 582 movdqa xmm4,XMMWORD[((192-256))+rcx] 583 movdqa xmm5,XMMWORD[((208-256))+rcx] 584 movdqa xmm6,XMMWORD[((224-256))+rcx] 585 movdqa xmm7,XMMWORD[((240-256))+rcx] 586 movdqa xmm0,XMMWORD[((256-256))+rcx] 587 movdqa xmm1,XMMWORD[((272-256))+rcx] 588 movdqa xmm2,XMMWORD[((288-256))+rcx] 589 movdqa xmm3,XMMWORD[((304-256))+rcx] 590 paddd xmm0,XMMWORD[$L$four] 591 592$L$oop_enter4x: 593 movdqa XMMWORD[32+rsp],xmm6 594 movdqa XMMWORD[48+rsp],xmm7 595 movdqa xmm7,XMMWORD[r10] 596 mov eax,10 597 movdqa XMMWORD[(256-256)+rcx],xmm0 598 jmp NEAR $L$oop4x 599 600ALIGN 32 601$L$oop4x: 602 paddd xmm8,xmm12 603 paddd xmm9,xmm13 604 pxor xmm0,xmm8 605 pxor xmm1,xmm9 606DB 102,15,56,0,199 607DB 102,15,56,0,207 608 paddd xmm4,xmm0 609 paddd xmm5,xmm1 610 pxor xmm12,xmm4 611 pxor xmm13,xmm5 612 movdqa xmm6,xmm12 613 pslld xmm12,12 614 psrld xmm6,20 615 movdqa xmm7,xmm13 616 pslld xmm13,12 617 por xmm12,xmm6 618 psrld xmm7,20 619 movdqa xmm6,XMMWORD[r11] 620 por xmm13,xmm7 621 paddd xmm8,xmm12 622 paddd xmm9,xmm13 623 pxor xmm0,xmm8 624 pxor xmm1,xmm9 625DB 102,15,56,0,198 626DB 102,15,56,0,206 627 paddd xmm4,xmm0 628 paddd xmm5,xmm1 629 pxor xmm12,xmm4 630 pxor xmm13,xmm5 631 movdqa xmm7,xmm12 632 pslld xmm12,7 633 psrld xmm7,25 634 movdqa xmm6,xmm13 635 pslld xmm13,7 636 por xmm12,xmm7 637 psrld xmm6,25 638 movdqa xmm7,XMMWORD[r10] 639 por xmm13,xmm6 640 movdqa XMMWORD[rsp],xmm4 641 movdqa XMMWORD[16+rsp],xmm5 642 movdqa xmm4,XMMWORD[32+rsp] 643 movdqa xmm5,XMMWORD[48+rsp] 644 paddd xmm10,xmm14 645 paddd xmm11,xmm15 646 pxor xmm2,xmm10 647 pxor xmm3,xmm11 648DB 102,15,56,0,215 649DB 102,15,56,0,223 650 paddd xmm4,xmm2 651 paddd xmm5,xmm3 652 pxor xmm14,xmm4 653 pxor xmm15,xmm5 654 movdqa xmm6,xmm14 655 pslld xmm14,12 656 psrld xmm6,20 657 movdqa xmm7,xmm15 658 pslld xmm15,12 659 por xmm14,xmm6 660 psrld xmm7,20 661 movdqa xmm6,XMMWORD[r11] 662 por xmm15,xmm7 663 paddd xmm10,xmm14 664 paddd xmm11,xmm15 665 pxor xmm2,xmm10 666 pxor xmm3,xmm11 667DB 102,15,56,0,214 668DB 102,15,56,0,222 669 paddd xmm4,xmm2 670 paddd xmm5,xmm3 671 pxor xmm14,xmm4 672 pxor xmm15,xmm5 673 movdqa xmm7,xmm14 674 pslld xmm14,7 675 psrld xmm7,25 676 movdqa xmm6,xmm15 677 pslld xmm15,7 678 por xmm14,xmm7 679 psrld xmm6,25 680 movdqa xmm7,XMMWORD[r10] 681 por xmm15,xmm6 682 paddd xmm8,xmm13 683 paddd xmm9,xmm14 684 pxor xmm3,xmm8 685 pxor xmm0,xmm9 686DB 102,15,56,0,223 687DB 102,15,56,0,199 688 paddd xmm4,xmm3 689 paddd xmm5,xmm0 690 pxor xmm13,xmm4 691 pxor xmm14,xmm5 692 movdqa xmm6,xmm13 693 pslld xmm13,12 694 psrld xmm6,20 695 movdqa xmm7,xmm14 696 pslld xmm14,12 697 por xmm13,xmm6 698 psrld xmm7,20 699 movdqa xmm6,XMMWORD[r11] 700 por xmm14,xmm7 701 paddd xmm8,xmm13 702 paddd xmm9,xmm14 703 pxor xmm3,xmm8 704 pxor xmm0,xmm9 705DB 102,15,56,0,222 706DB 102,15,56,0,198 707 paddd xmm4,xmm3 708 paddd xmm5,xmm0 709 pxor xmm13,xmm4 710 pxor xmm14,xmm5 711 movdqa xmm7,xmm13 712 pslld xmm13,7 713 psrld xmm7,25 714 movdqa xmm6,xmm14 715 pslld xmm14,7 716 por xmm13,xmm7 717 psrld xmm6,25 718 movdqa xmm7,XMMWORD[r10] 719 por xmm14,xmm6 720 movdqa XMMWORD[32+rsp],xmm4 721 movdqa XMMWORD[48+rsp],xmm5 722 movdqa xmm4,XMMWORD[rsp] 723 movdqa xmm5,XMMWORD[16+rsp] 724 paddd xmm10,xmm15 725 paddd xmm11,xmm12 726 pxor xmm1,xmm10 727 pxor xmm2,xmm11 728DB 102,15,56,0,207 729DB 102,15,56,0,215 730 paddd xmm4,xmm1 731 paddd xmm5,xmm2 732 pxor xmm15,xmm4 733 pxor xmm12,xmm5 734 movdqa xmm6,xmm15 735 pslld xmm15,12 736 psrld xmm6,20 737 movdqa xmm7,xmm12 738 pslld xmm12,12 739 por xmm15,xmm6 740 psrld xmm7,20 741 movdqa xmm6,XMMWORD[r11] 742 por xmm12,xmm7 743 paddd xmm10,xmm15 744 paddd xmm11,xmm12 745 pxor xmm1,xmm10 746 pxor xmm2,xmm11 747DB 102,15,56,0,206 748DB 102,15,56,0,214 749 paddd xmm4,xmm1 750 paddd xmm5,xmm2 751 pxor xmm15,xmm4 752 pxor xmm12,xmm5 753 movdqa xmm7,xmm15 754 pslld xmm15,7 755 psrld xmm7,25 756 movdqa xmm6,xmm12 757 pslld xmm12,7 758 por xmm15,xmm7 759 psrld xmm6,25 760 movdqa xmm7,XMMWORD[r10] 761 por xmm12,xmm6 762 dec eax 763 jnz NEAR $L$oop4x 764 765 paddd xmm8,XMMWORD[64+rsp] 766 paddd xmm9,XMMWORD[80+rsp] 767 paddd xmm10,XMMWORD[96+rsp] 768 paddd xmm11,XMMWORD[112+rsp] 769 770 movdqa xmm6,xmm8 771 punpckldq xmm8,xmm9 772 movdqa xmm7,xmm10 773 punpckldq xmm10,xmm11 774 punpckhdq xmm6,xmm9 775 punpckhdq xmm7,xmm11 776 movdqa xmm9,xmm8 777 punpcklqdq xmm8,xmm10 778 movdqa xmm11,xmm6 779 punpcklqdq xmm6,xmm7 780 punpckhqdq xmm9,xmm10 781 punpckhqdq xmm11,xmm7 782 paddd xmm12,XMMWORD[((128-256))+rcx] 783 paddd xmm13,XMMWORD[((144-256))+rcx] 784 paddd xmm14,XMMWORD[((160-256))+rcx] 785 paddd xmm15,XMMWORD[((176-256))+rcx] 786 787 movdqa XMMWORD[rsp],xmm8 788 movdqa XMMWORD[16+rsp],xmm9 789 movdqa xmm8,XMMWORD[32+rsp] 790 movdqa xmm9,XMMWORD[48+rsp] 791 792 movdqa xmm10,xmm12 793 punpckldq xmm12,xmm13 794 movdqa xmm7,xmm14 795 punpckldq xmm14,xmm15 796 punpckhdq xmm10,xmm13 797 punpckhdq xmm7,xmm15 798 movdqa xmm13,xmm12 799 punpcklqdq xmm12,xmm14 800 movdqa xmm15,xmm10 801 punpcklqdq xmm10,xmm7 802 punpckhqdq xmm13,xmm14 803 punpckhqdq xmm15,xmm7 804 paddd xmm4,XMMWORD[((192-256))+rcx] 805 paddd xmm5,XMMWORD[((208-256))+rcx] 806 paddd xmm8,XMMWORD[((224-256))+rcx] 807 paddd xmm9,XMMWORD[((240-256))+rcx] 808 809 movdqa XMMWORD[32+rsp],xmm6 810 movdqa XMMWORD[48+rsp],xmm11 811 812 movdqa xmm14,xmm4 813 punpckldq xmm4,xmm5 814 movdqa xmm7,xmm8 815 punpckldq xmm8,xmm9 816 punpckhdq xmm14,xmm5 817 punpckhdq xmm7,xmm9 818 movdqa xmm5,xmm4 819 punpcklqdq xmm4,xmm8 820 movdqa xmm9,xmm14 821 punpcklqdq xmm14,xmm7 822 punpckhqdq xmm5,xmm8 823 punpckhqdq xmm9,xmm7 824 paddd xmm0,XMMWORD[((256-256))+rcx] 825 paddd xmm1,XMMWORD[((272-256))+rcx] 826 paddd xmm2,XMMWORD[((288-256))+rcx] 827 paddd xmm3,XMMWORD[((304-256))+rcx] 828 829 movdqa xmm8,xmm0 830 punpckldq xmm0,xmm1 831 movdqa xmm7,xmm2 832 punpckldq xmm2,xmm3 833 punpckhdq xmm8,xmm1 834 punpckhdq xmm7,xmm3 835 movdqa xmm1,xmm0 836 punpcklqdq xmm0,xmm2 837 movdqa xmm3,xmm8 838 punpcklqdq xmm8,xmm7 839 punpckhqdq xmm1,xmm2 840 punpckhqdq xmm3,xmm7 841 cmp rdx,64*4 842 jb NEAR $L$tail4x 843 844 movdqu xmm6,XMMWORD[rsi] 845 movdqu xmm11,XMMWORD[16+rsi] 846 movdqu xmm2,XMMWORD[32+rsi] 847 movdqu xmm7,XMMWORD[48+rsi] 848 pxor xmm6,XMMWORD[rsp] 849 pxor xmm11,xmm12 850 pxor xmm2,xmm4 851 pxor xmm7,xmm0 852 853 movdqu XMMWORD[rdi],xmm6 854 movdqu xmm6,XMMWORD[64+rsi] 855 movdqu XMMWORD[16+rdi],xmm11 856 movdqu xmm11,XMMWORD[80+rsi] 857 movdqu XMMWORD[32+rdi],xmm2 858 movdqu xmm2,XMMWORD[96+rsi] 859 movdqu XMMWORD[48+rdi],xmm7 860 movdqu xmm7,XMMWORD[112+rsi] 861 lea rsi,[128+rsi] 862 pxor xmm6,XMMWORD[16+rsp] 863 pxor xmm11,xmm13 864 pxor xmm2,xmm5 865 pxor xmm7,xmm1 866 867 movdqu XMMWORD[64+rdi],xmm6 868 movdqu xmm6,XMMWORD[rsi] 869 movdqu XMMWORD[80+rdi],xmm11 870 movdqu xmm11,XMMWORD[16+rsi] 871 movdqu XMMWORD[96+rdi],xmm2 872 movdqu xmm2,XMMWORD[32+rsi] 873 movdqu XMMWORD[112+rdi],xmm7 874 lea rdi,[128+rdi] 875 movdqu xmm7,XMMWORD[48+rsi] 876 pxor xmm6,XMMWORD[32+rsp] 877 pxor xmm11,xmm10 878 pxor xmm2,xmm14 879 pxor xmm7,xmm8 880 881 movdqu XMMWORD[rdi],xmm6 882 movdqu xmm6,XMMWORD[64+rsi] 883 movdqu XMMWORD[16+rdi],xmm11 884 movdqu xmm11,XMMWORD[80+rsi] 885 movdqu XMMWORD[32+rdi],xmm2 886 movdqu xmm2,XMMWORD[96+rsi] 887 movdqu XMMWORD[48+rdi],xmm7 888 movdqu xmm7,XMMWORD[112+rsi] 889 lea rsi,[128+rsi] 890 pxor xmm6,XMMWORD[48+rsp] 891 pxor xmm11,xmm15 892 pxor xmm2,xmm9 893 pxor xmm7,xmm3 894 movdqu XMMWORD[64+rdi],xmm6 895 movdqu XMMWORD[80+rdi],xmm11 896 movdqu XMMWORD[96+rdi],xmm2 897 movdqu XMMWORD[112+rdi],xmm7 898 lea rdi,[128+rdi] 899 900 sub rdx,64*4 901 jnz NEAR $L$oop_outer4x 902 903 jmp NEAR $L$done4x 904 905$L$tail4x: 906 cmp rdx,192 907 jae NEAR $L$192_or_more4x 908 cmp rdx,128 909 jae NEAR $L$128_or_more4x 910 cmp rdx,64 911 jae NEAR $L$64_or_more4x 912 913 914 xor r10,r10 915 916 movdqa XMMWORD[16+rsp],xmm12 917 movdqa XMMWORD[32+rsp],xmm4 918 movdqa XMMWORD[48+rsp],xmm0 919 jmp NEAR $L$oop_tail4x 920 921ALIGN 32 922$L$64_or_more4x: 923 movdqu xmm6,XMMWORD[rsi] 924 movdqu xmm11,XMMWORD[16+rsi] 925 movdqu xmm2,XMMWORD[32+rsi] 926 movdqu xmm7,XMMWORD[48+rsi] 927 pxor xmm6,XMMWORD[rsp] 928 pxor xmm11,xmm12 929 pxor xmm2,xmm4 930 pxor xmm7,xmm0 931 movdqu XMMWORD[rdi],xmm6 932 movdqu XMMWORD[16+rdi],xmm11 933 movdqu XMMWORD[32+rdi],xmm2 934 movdqu XMMWORD[48+rdi],xmm7 935 je NEAR $L$done4x 936 937 movdqa xmm6,XMMWORD[16+rsp] 938 lea rsi,[64+rsi] 939 xor r10,r10 940 movdqa XMMWORD[rsp],xmm6 941 movdqa XMMWORD[16+rsp],xmm13 942 lea rdi,[64+rdi] 943 movdqa XMMWORD[32+rsp],xmm5 944 sub rdx,64 945 movdqa XMMWORD[48+rsp],xmm1 946 jmp NEAR $L$oop_tail4x 947 948ALIGN 32 949$L$128_or_more4x: 950 movdqu xmm6,XMMWORD[rsi] 951 movdqu xmm11,XMMWORD[16+rsi] 952 movdqu xmm2,XMMWORD[32+rsi] 953 movdqu xmm7,XMMWORD[48+rsi] 954 pxor xmm6,XMMWORD[rsp] 955 pxor xmm11,xmm12 956 pxor xmm2,xmm4 957 pxor xmm7,xmm0 958 959 movdqu XMMWORD[rdi],xmm6 960 movdqu xmm6,XMMWORD[64+rsi] 961 movdqu XMMWORD[16+rdi],xmm11 962 movdqu xmm11,XMMWORD[80+rsi] 963 movdqu XMMWORD[32+rdi],xmm2 964 movdqu xmm2,XMMWORD[96+rsi] 965 movdqu XMMWORD[48+rdi],xmm7 966 movdqu xmm7,XMMWORD[112+rsi] 967 pxor xmm6,XMMWORD[16+rsp] 968 pxor xmm11,xmm13 969 pxor xmm2,xmm5 970 pxor xmm7,xmm1 971 movdqu XMMWORD[64+rdi],xmm6 972 movdqu XMMWORD[80+rdi],xmm11 973 movdqu XMMWORD[96+rdi],xmm2 974 movdqu XMMWORD[112+rdi],xmm7 975 je NEAR $L$done4x 976 977 movdqa xmm6,XMMWORD[32+rsp] 978 lea rsi,[128+rsi] 979 xor r10,r10 980 movdqa XMMWORD[rsp],xmm6 981 movdqa XMMWORD[16+rsp],xmm10 982 lea rdi,[128+rdi] 983 movdqa XMMWORD[32+rsp],xmm14 984 sub rdx,128 985 movdqa XMMWORD[48+rsp],xmm8 986 jmp NEAR $L$oop_tail4x 987 988ALIGN 32 989$L$192_or_more4x: 990 movdqu xmm6,XMMWORD[rsi] 991 movdqu xmm11,XMMWORD[16+rsi] 992 movdqu xmm2,XMMWORD[32+rsi] 993 movdqu xmm7,XMMWORD[48+rsi] 994 pxor xmm6,XMMWORD[rsp] 995 pxor xmm11,xmm12 996 pxor xmm2,xmm4 997 pxor xmm7,xmm0 998 999 movdqu XMMWORD[rdi],xmm6 1000 movdqu xmm6,XMMWORD[64+rsi] 1001 movdqu XMMWORD[16+rdi],xmm11 1002 movdqu xmm11,XMMWORD[80+rsi] 1003 movdqu XMMWORD[32+rdi],xmm2 1004 movdqu xmm2,XMMWORD[96+rsi] 1005 movdqu XMMWORD[48+rdi],xmm7 1006 movdqu xmm7,XMMWORD[112+rsi] 1007 lea rsi,[128+rsi] 1008 pxor xmm6,XMMWORD[16+rsp] 1009 pxor xmm11,xmm13 1010 pxor xmm2,xmm5 1011 pxor xmm7,xmm1 1012 1013 movdqu XMMWORD[64+rdi],xmm6 1014 movdqu xmm6,XMMWORD[rsi] 1015 movdqu XMMWORD[80+rdi],xmm11 1016 movdqu xmm11,XMMWORD[16+rsi] 1017 movdqu XMMWORD[96+rdi],xmm2 1018 movdqu xmm2,XMMWORD[32+rsi] 1019 movdqu XMMWORD[112+rdi],xmm7 1020 lea rdi,[128+rdi] 1021 movdqu xmm7,XMMWORD[48+rsi] 1022 pxor xmm6,XMMWORD[32+rsp] 1023 pxor xmm11,xmm10 1024 pxor xmm2,xmm14 1025 pxor xmm7,xmm8 1026 movdqu XMMWORD[rdi],xmm6 1027 movdqu XMMWORD[16+rdi],xmm11 1028 movdqu XMMWORD[32+rdi],xmm2 1029 movdqu XMMWORD[48+rdi],xmm7 1030 je NEAR $L$done4x 1031 1032 movdqa xmm6,XMMWORD[48+rsp] 1033 lea rsi,[64+rsi] 1034 xor r10,r10 1035 movdqa XMMWORD[rsp],xmm6 1036 movdqa XMMWORD[16+rsp],xmm15 1037 lea rdi,[64+rdi] 1038 movdqa XMMWORD[32+rsp],xmm9 1039 sub rdx,192 1040 movdqa XMMWORD[48+rsp],xmm3 1041 1042$L$oop_tail4x: 1043 movzx eax,BYTE[r10*1+rsi] 1044 movzx ecx,BYTE[r10*1+rsp] 1045 lea r10,[1+r10] 1046 xor eax,ecx 1047 mov BYTE[((-1))+r10*1+rdi],al 1048 dec rdx 1049 jnz NEAR $L$oop_tail4x 1050 1051$L$done4x: 1052 movaps xmm6,XMMWORD[((-168))+r9] 1053 movaps xmm7,XMMWORD[((-152))+r9] 1054 movaps xmm8,XMMWORD[((-136))+r9] 1055 movaps xmm9,XMMWORD[((-120))+r9] 1056 movaps xmm10,XMMWORD[((-104))+r9] 1057 movaps xmm11,XMMWORD[((-88))+r9] 1058 movaps xmm12,XMMWORD[((-72))+r9] 1059 movaps xmm13,XMMWORD[((-56))+r9] 1060 movaps xmm14,XMMWORD[((-40))+r9] 1061 movaps xmm15,XMMWORD[((-24))+r9] 1062 lea rsp,[r9] 1063 1064$L$4x_epilogue: 1065 mov rdi,QWORD[8+rsp] ;WIN64 epilogue 1066 mov rsi,QWORD[16+rsp] 1067 ret 1068 1069$L$SEH_end_ChaCha20_ctr32_ssse3_4x: 1070global ChaCha20_ctr32_avx2 1071 1072ALIGN 32 1073ChaCha20_ctr32_avx2: 1074 mov QWORD[8+rsp],rdi ;WIN64 prologue 1075 mov QWORD[16+rsp],rsi 1076 mov rax,rsp 1077$L$SEH_begin_ChaCha20_ctr32_avx2: 1078 mov rdi,rcx 1079 mov rsi,rdx 1080 mov rdx,r8 1081 mov rcx,r9 1082 mov r8,QWORD[40+rsp] 1083 1084 1085 1086_CET_ENDBR 1087 mov r9,rsp 1088 1089 sub rsp,0x280+168 1090 and rsp,-32 1091 movaps XMMWORD[(-168)+r9],xmm6 1092 movaps XMMWORD[(-152)+r9],xmm7 1093 movaps XMMWORD[(-136)+r9],xmm8 1094 movaps XMMWORD[(-120)+r9],xmm9 1095 movaps XMMWORD[(-104)+r9],xmm10 1096 movaps XMMWORD[(-88)+r9],xmm11 1097 movaps XMMWORD[(-72)+r9],xmm12 1098 movaps XMMWORD[(-56)+r9],xmm13 1099 movaps XMMWORD[(-40)+r9],xmm14 1100 movaps XMMWORD[(-24)+r9],xmm15 1101$L$8x_body: 1102 vzeroupper 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 vbroadcasti128 ymm11,XMMWORD[$L$sigma] 1114 vbroadcasti128 ymm3,XMMWORD[rcx] 1115 vbroadcasti128 ymm15,XMMWORD[16+rcx] 1116 vbroadcasti128 ymm7,XMMWORD[r8] 1117 lea rcx,[256+rsp] 1118 lea rax,[512+rsp] 1119 lea r10,[$L$rot16] 1120 lea r11,[$L$rot24] 1121 1122 vpshufd ymm8,ymm11,0x00 1123 vpshufd ymm9,ymm11,0x55 1124 vmovdqa YMMWORD[(128-256)+rcx],ymm8 1125 vpshufd ymm10,ymm11,0xaa 1126 vmovdqa YMMWORD[(160-256)+rcx],ymm9 1127 vpshufd ymm11,ymm11,0xff 1128 vmovdqa YMMWORD[(192-256)+rcx],ymm10 1129 vmovdqa YMMWORD[(224-256)+rcx],ymm11 1130 1131 vpshufd ymm0,ymm3,0x00 1132 vpshufd ymm1,ymm3,0x55 1133 vmovdqa YMMWORD[(256-256)+rcx],ymm0 1134 vpshufd ymm2,ymm3,0xaa 1135 vmovdqa YMMWORD[(288-256)+rcx],ymm1 1136 vpshufd ymm3,ymm3,0xff 1137 vmovdqa YMMWORD[(320-256)+rcx],ymm2 1138 vmovdqa YMMWORD[(352-256)+rcx],ymm3 1139 1140 vpshufd ymm12,ymm15,0x00 1141 vpshufd ymm13,ymm15,0x55 1142 vmovdqa YMMWORD[(384-512)+rax],ymm12 1143 vpshufd ymm14,ymm15,0xaa 1144 vmovdqa YMMWORD[(416-512)+rax],ymm13 1145 vpshufd ymm15,ymm15,0xff 1146 vmovdqa YMMWORD[(448-512)+rax],ymm14 1147 vmovdqa YMMWORD[(480-512)+rax],ymm15 1148 1149 vpshufd ymm4,ymm7,0x00 1150 vpshufd ymm5,ymm7,0x55 1151 vpaddd ymm4,ymm4,YMMWORD[$L$incy] 1152 vpshufd ymm6,ymm7,0xaa 1153 vmovdqa YMMWORD[(544-512)+rax],ymm5 1154 vpshufd ymm7,ymm7,0xff 1155 vmovdqa YMMWORD[(576-512)+rax],ymm6 1156 vmovdqa YMMWORD[(608-512)+rax],ymm7 1157 1158 jmp NEAR $L$oop_enter8x 1159 1160ALIGN 32 1161$L$oop_outer8x: 1162 vmovdqa ymm8,YMMWORD[((128-256))+rcx] 1163 vmovdqa ymm9,YMMWORD[((160-256))+rcx] 1164 vmovdqa ymm10,YMMWORD[((192-256))+rcx] 1165 vmovdqa ymm11,YMMWORD[((224-256))+rcx] 1166 vmovdqa ymm0,YMMWORD[((256-256))+rcx] 1167 vmovdqa ymm1,YMMWORD[((288-256))+rcx] 1168 vmovdqa ymm2,YMMWORD[((320-256))+rcx] 1169 vmovdqa ymm3,YMMWORD[((352-256))+rcx] 1170 vmovdqa ymm12,YMMWORD[((384-512))+rax] 1171 vmovdqa ymm13,YMMWORD[((416-512))+rax] 1172 vmovdqa ymm14,YMMWORD[((448-512))+rax] 1173 vmovdqa ymm15,YMMWORD[((480-512))+rax] 1174 vmovdqa ymm4,YMMWORD[((512-512))+rax] 1175 vmovdqa ymm5,YMMWORD[((544-512))+rax] 1176 vmovdqa ymm6,YMMWORD[((576-512))+rax] 1177 vmovdqa ymm7,YMMWORD[((608-512))+rax] 1178 vpaddd ymm4,ymm4,YMMWORD[$L$eight] 1179 1180$L$oop_enter8x: 1181 vmovdqa YMMWORD[64+rsp],ymm14 1182 vmovdqa YMMWORD[96+rsp],ymm15 1183 vbroadcasti128 ymm15,XMMWORD[r10] 1184 vmovdqa YMMWORD[(512-512)+rax],ymm4 1185 mov eax,10 1186 jmp NEAR $L$oop8x 1187 1188ALIGN 32 1189$L$oop8x: 1190 vpaddd ymm8,ymm8,ymm0 1191 vpxor ymm4,ymm8,ymm4 1192 vpshufb ymm4,ymm4,ymm15 1193 vpaddd ymm9,ymm9,ymm1 1194 vpxor ymm5,ymm9,ymm5 1195 vpshufb ymm5,ymm5,ymm15 1196 vpaddd ymm12,ymm12,ymm4 1197 vpxor ymm0,ymm12,ymm0 1198 vpslld ymm14,ymm0,12 1199 vpsrld ymm0,ymm0,20 1200 vpor ymm0,ymm14,ymm0 1201 vbroadcasti128 ymm14,XMMWORD[r11] 1202 vpaddd ymm13,ymm13,ymm5 1203 vpxor ymm1,ymm13,ymm1 1204 vpslld ymm15,ymm1,12 1205 vpsrld ymm1,ymm1,20 1206 vpor ymm1,ymm15,ymm1 1207 vpaddd ymm8,ymm8,ymm0 1208 vpxor ymm4,ymm8,ymm4 1209 vpshufb ymm4,ymm4,ymm14 1210 vpaddd ymm9,ymm9,ymm1 1211 vpxor ymm5,ymm9,ymm5 1212 vpshufb ymm5,ymm5,ymm14 1213 vpaddd ymm12,ymm12,ymm4 1214 vpxor ymm0,ymm12,ymm0 1215 vpslld ymm15,ymm0,7 1216 vpsrld ymm0,ymm0,25 1217 vpor ymm0,ymm15,ymm0 1218 vbroadcasti128 ymm15,XMMWORD[r10] 1219 vpaddd ymm13,ymm13,ymm5 1220 vpxor ymm1,ymm13,ymm1 1221 vpslld ymm14,ymm1,7 1222 vpsrld ymm1,ymm1,25 1223 vpor ymm1,ymm14,ymm1 1224 vmovdqa YMMWORD[rsp],ymm12 1225 vmovdqa YMMWORD[32+rsp],ymm13 1226 vmovdqa ymm12,YMMWORD[64+rsp] 1227 vmovdqa ymm13,YMMWORD[96+rsp] 1228 vpaddd ymm10,ymm10,ymm2 1229 vpxor ymm6,ymm10,ymm6 1230 vpshufb ymm6,ymm6,ymm15 1231 vpaddd ymm11,ymm11,ymm3 1232 vpxor ymm7,ymm11,ymm7 1233 vpshufb ymm7,ymm7,ymm15 1234 vpaddd ymm12,ymm12,ymm6 1235 vpxor ymm2,ymm12,ymm2 1236 vpslld ymm14,ymm2,12 1237 vpsrld ymm2,ymm2,20 1238 vpor ymm2,ymm14,ymm2 1239 vbroadcasti128 ymm14,XMMWORD[r11] 1240 vpaddd ymm13,ymm13,ymm7 1241 vpxor ymm3,ymm13,ymm3 1242 vpslld ymm15,ymm3,12 1243 vpsrld ymm3,ymm3,20 1244 vpor ymm3,ymm15,ymm3 1245 vpaddd ymm10,ymm10,ymm2 1246 vpxor ymm6,ymm10,ymm6 1247 vpshufb ymm6,ymm6,ymm14 1248 vpaddd ymm11,ymm11,ymm3 1249 vpxor ymm7,ymm11,ymm7 1250 vpshufb ymm7,ymm7,ymm14 1251 vpaddd ymm12,ymm12,ymm6 1252 vpxor ymm2,ymm12,ymm2 1253 vpslld ymm15,ymm2,7 1254 vpsrld ymm2,ymm2,25 1255 vpor ymm2,ymm15,ymm2 1256 vbroadcasti128 ymm15,XMMWORD[r10] 1257 vpaddd ymm13,ymm13,ymm7 1258 vpxor ymm3,ymm13,ymm3 1259 vpslld ymm14,ymm3,7 1260 vpsrld ymm3,ymm3,25 1261 vpor ymm3,ymm14,ymm3 1262 vpaddd ymm8,ymm8,ymm1 1263 vpxor ymm7,ymm8,ymm7 1264 vpshufb ymm7,ymm7,ymm15 1265 vpaddd ymm9,ymm9,ymm2 1266 vpxor ymm4,ymm9,ymm4 1267 vpshufb ymm4,ymm4,ymm15 1268 vpaddd ymm12,ymm12,ymm7 1269 vpxor ymm1,ymm12,ymm1 1270 vpslld ymm14,ymm1,12 1271 vpsrld ymm1,ymm1,20 1272 vpor ymm1,ymm14,ymm1 1273 vbroadcasti128 ymm14,XMMWORD[r11] 1274 vpaddd ymm13,ymm13,ymm4 1275 vpxor ymm2,ymm13,ymm2 1276 vpslld ymm15,ymm2,12 1277 vpsrld ymm2,ymm2,20 1278 vpor ymm2,ymm15,ymm2 1279 vpaddd ymm8,ymm8,ymm1 1280 vpxor ymm7,ymm8,ymm7 1281 vpshufb ymm7,ymm7,ymm14 1282 vpaddd ymm9,ymm9,ymm2 1283 vpxor ymm4,ymm9,ymm4 1284 vpshufb ymm4,ymm4,ymm14 1285 vpaddd ymm12,ymm12,ymm7 1286 vpxor ymm1,ymm12,ymm1 1287 vpslld ymm15,ymm1,7 1288 vpsrld ymm1,ymm1,25 1289 vpor ymm1,ymm15,ymm1 1290 vbroadcasti128 ymm15,XMMWORD[r10] 1291 vpaddd ymm13,ymm13,ymm4 1292 vpxor ymm2,ymm13,ymm2 1293 vpslld ymm14,ymm2,7 1294 vpsrld ymm2,ymm2,25 1295 vpor ymm2,ymm14,ymm2 1296 vmovdqa YMMWORD[64+rsp],ymm12 1297 vmovdqa YMMWORD[96+rsp],ymm13 1298 vmovdqa ymm12,YMMWORD[rsp] 1299 vmovdqa ymm13,YMMWORD[32+rsp] 1300 vpaddd ymm10,ymm10,ymm3 1301 vpxor ymm5,ymm10,ymm5 1302 vpshufb ymm5,ymm5,ymm15 1303 vpaddd ymm11,ymm11,ymm0 1304 vpxor ymm6,ymm11,ymm6 1305 vpshufb ymm6,ymm6,ymm15 1306 vpaddd ymm12,ymm12,ymm5 1307 vpxor ymm3,ymm12,ymm3 1308 vpslld ymm14,ymm3,12 1309 vpsrld ymm3,ymm3,20 1310 vpor ymm3,ymm14,ymm3 1311 vbroadcasti128 ymm14,XMMWORD[r11] 1312 vpaddd ymm13,ymm13,ymm6 1313 vpxor ymm0,ymm13,ymm0 1314 vpslld ymm15,ymm0,12 1315 vpsrld ymm0,ymm0,20 1316 vpor ymm0,ymm15,ymm0 1317 vpaddd ymm10,ymm10,ymm3 1318 vpxor ymm5,ymm10,ymm5 1319 vpshufb ymm5,ymm5,ymm14 1320 vpaddd ymm11,ymm11,ymm0 1321 vpxor ymm6,ymm11,ymm6 1322 vpshufb ymm6,ymm6,ymm14 1323 vpaddd ymm12,ymm12,ymm5 1324 vpxor ymm3,ymm12,ymm3 1325 vpslld ymm15,ymm3,7 1326 vpsrld ymm3,ymm3,25 1327 vpor ymm3,ymm15,ymm3 1328 vbroadcasti128 ymm15,XMMWORD[r10] 1329 vpaddd ymm13,ymm13,ymm6 1330 vpxor ymm0,ymm13,ymm0 1331 vpslld ymm14,ymm0,7 1332 vpsrld ymm0,ymm0,25 1333 vpor ymm0,ymm14,ymm0 1334 dec eax 1335 jnz NEAR $L$oop8x 1336 1337 lea rax,[512+rsp] 1338 vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx] 1339 vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx] 1340 vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx] 1341 vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx] 1342 1343 vpunpckldq ymm14,ymm8,ymm9 1344 vpunpckldq ymm15,ymm10,ymm11 1345 vpunpckhdq ymm8,ymm8,ymm9 1346 vpunpckhdq ymm10,ymm10,ymm11 1347 vpunpcklqdq ymm9,ymm14,ymm15 1348 vpunpckhqdq ymm14,ymm14,ymm15 1349 vpunpcklqdq ymm11,ymm8,ymm10 1350 vpunpckhqdq ymm8,ymm8,ymm10 1351 vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx] 1352 vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx] 1353 vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx] 1354 vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx] 1355 1356 vpunpckldq ymm10,ymm0,ymm1 1357 vpunpckldq ymm15,ymm2,ymm3 1358 vpunpckhdq ymm0,ymm0,ymm1 1359 vpunpckhdq ymm2,ymm2,ymm3 1360 vpunpcklqdq ymm1,ymm10,ymm15 1361 vpunpckhqdq ymm10,ymm10,ymm15 1362 vpunpcklqdq ymm3,ymm0,ymm2 1363 vpunpckhqdq ymm0,ymm0,ymm2 1364 vperm2i128 ymm15,ymm9,ymm1,0x20 1365 vperm2i128 ymm1,ymm9,ymm1,0x31 1366 vperm2i128 ymm9,ymm14,ymm10,0x20 1367 vperm2i128 ymm10,ymm14,ymm10,0x31 1368 vperm2i128 ymm14,ymm11,ymm3,0x20 1369 vperm2i128 ymm3,ymm11,ymm3,0x31 1370 vperm2i128 ymm11,ymm8,ymm0,0x20 1371 vperm2i128 ymm0,ymm8,ymm0,0x31 1372 vmovdqa YMMWORD[rsp],ymm15 1373 vmovdqa YMMWORD[32+rsp],ymm9 1374 vmovdqa ymm15,YMMWORD[64+rsp] 1375 vmovdqa ymm9,YMMWORD[96+rsp] 1376 1377 vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax] 1378 vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax] 1379 vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax] 1380 vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax] 1381 1382 vpunpckldq ymm2,ymm12,ymm13 1383 vpunpckldq ymm8,ymm15,ymm9 1384 vpunpckhdq ymm12,ymm12,ymm13 1385 vpunpckhdq ymm15,ymm15,ymm9 1386 vpunpcklqdq ymm13,ymm2,ymm8 1387 vpunpckhqdq ymm2,ymm2,ymm8 1388 vpunpcklqdq ymm9,ymm12,ymm15 1389 vpunpckhqdq ymm12,ymm12,ymm15 1390 vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax] 1391 vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax] 1392 vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax] 1393 vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax] 1394 1395 vpunpckldq ymm15,ymm4,ymm5 1396 vpunpckldq ymm8,ymm6,ymm7 1397 vpunpckhdq ymm4,ymm4,ymm5 1398 vpunpckhdq ymm6,ymm6,ymm7 1399 vpunpcklqdq ymm5,ymm15,ymm8 1400 vpunpckhqdq ymm15,ymm15,ymm8 1401 vpunpcklqdq ymm7,ymm4,ymm6 1402 vpunpckhqdq ymm4,ymm4,ymm6 1403 vperm2i128 ymm8,ymm13,ymm5,0x20 1404 vperm2i128 ymm5,ymm13,ymm5,0x31 1405 vperm2i128 ymm13,ymm2,ymm15,0x20 1406 vperm2i128 ymm15,ymm2,ymm15,0x31 1407 vperm2i128 ymm2,ymm9,ymm7,0x20 1408 vperm2i128 ymm7,ymm9,ymm7,0x31 1409 vperm2i128 ymm9,ymm12,ymm4,0x20 1410 vperm2i128 ymm4,ymm12,ymm4,0x31 1411 vmovdqa ymm6,YMMWORD[rsp] 1412 vmovdqa ymm12,YMMWORD[32+rsp] 1413 1414 cmp rdx,64*8 1415 jb NEAR $L$tail8x 1416 1417 vpxor ymm6,ymm6,YMMWORD[rsi] 1418 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1419 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1420 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1421 lea rsi,[128+rsi] 1422 vmovdqu YMMWORD[rdi],ymm6 1423 vmovdqu YMMWORD[32+rdi],ymm8 1424 vmovdqu YMMWORD[64+rdi],ymm1 1425 vmovdqu YMMWORD[96+rdi],ymm5 1426 lea rdi,[128+rdi] 1427 1428 vpxor ymm12,ymm12,YMMWORD[rsi] 1429 vpxor ymm13,ymm13,YMMWORD[32+rsi] 1430 vpxor ymm10,ymm10,YMMWORD[64+rsi] 1431 vpxor ymm15,ymm15,YMMWORD[96+rsi] 1432 lea rsi,[128+rsi] 1433 vmovdqu YMMWORD[rdi],ymm12 1434 vmovdqu YMMWORD[32+rdi],ymm13 1435 vmovdqu YMMWORD[64+rdi],ymm10 1436 vmovdqu YMMWORD[96+rdi],ymm15 1437 lea rdi,[128+rdi] 1438 1439 vpxor ymm14,ymm14,YMMWORD[rsi] 1440 vpxor ymm2,ymm2,YMMWORD[32+rsi] 1441 vpxor ymm3,ymm3,YMMWORD[64+rsi] 1442 vpxor ymm7,ymm7,YMMWORD[96+rsi] 1443 lea rsi,[128+rsi] 1444 vmovdqu YMMWORD[rdi],ymm14 1445 vmovdqu YMMWORD[32+rdi],ymm2 1446 vmovdqu YMMWORD[64+rdi],ymm3 1447 vmovdqu YMMWORD[96+rdi],ymm7 1448 lea rdi,[128+rdi] 1449 1450 vpxor ymm11,ymm11,YMMWORD[rsi] 1451 vpxor ymm9,ymm9,YMMWORD[32+rsi] 1452 vpxor ymm0,ymm0,YMMWORD[64+rsi] 1453 vpxor ymm4,ymm4,YMMWORD[96+rsi] 1454 lea rsi,[128+rsi] 1455 vmovdqu YMMWORD[rdi],ymm11 1456 vmovdqu YMMWORD[32+rdi],ymm9 1457 vmovdqu YMMWORD[64+rdi],ymm0 1458 vmovdqu YMMWORD[96+rdi],ymm4 1459 lea rdi,[128+rdi] 1460 1461 sub rdx,64*8 1462 jnz NEAR $L$oop_outer8x 1463 1464 jmp NEAR $L$done8x 1465 1466$L$tail8x: 1467 cmp rdx,448 1468 jae NEAR $L$448_or_more8x 1469 cmp rdx,384 1470 jae NEAR $L$384_or_more8x 1471 cmp rdx,320 1472 jae NEAR $L$320_or_more8x 1473 cmp rdx,256 1474 jae NEAR $L$256_or_more8x 1475 cmp rdx,192 1476 jae NEAR $L$192_or_more8x 1477 cmp rdx,128 1478 jae NEAR $L$128_or_more8x 1479 cmp rdx,64 1480 jae NEAR $L$64_or_more8x 1481 1482 xor r10,r10 1483 vmovdqa YMMWORD[rsp],ymm6 1484 vmovdqa YMMWORD[32+rsp],ymm8 1485 jmp NEAR $L$oop_tail8x 1486 1487ALIGN 32 1488$L$64_or_more8x: 1489 vpxor ymm6,ymm6,YMMWORD[rsi] 1490 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1491 vmovdqu YMMWORD[rdi],ymm6 1492 vmovdqu YMMWORD[32+rdi],ymm8 1493 je NEAR $L$done8x 1494 1495 lea rsi,[64+rsi] 1496 xor r10,r10 1497 vmovdqa YMMWORD[rsp],ymm1 1498 lea rdi,[64+rdi] 1499 sub rdx,64 1500 vmovdqa YMMWORD[32+rsp],ymm5 1501 jmp NEAR $L$oop_tail8x 1502 1503ALIGN 32 1504$L$128_or_more8x: 1505 vpxor ymm6,ymm6,YMMWORD[rsi] 1506 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1507 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1508 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1509 vmovdqu YMMWORD[rdi],ymm6 1510 vmovdqu YMMWORD[32+rdi],ymm8 1511 vmovdqu YMMWORD[64+rdi],ymm1 1512 vmovdqu YMMWORD[96+rdi],ymm5 1513 je NEAR $L$done8x 1514 1515 lea rsi,[128+rsi] 1516 xor r10,r10 1517 vmovdqa YMMWORD[rsp],ymm12 1518 lea rdi,[128+rdi] 1519 sub rdx,128 1520 vmovdqa YMMWORD[32+rsp],ymm13 1521 jmp NEAR $L$oop_tail8x 1522 1523ALIGN 32 1524$L$192_or_more8x: 1525 vpxor ymm6,ymm6,YMMWORD[rsi] 1526 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1527 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1528 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1529 vpxor ymm12,ymm12,YMMWORD[128+rsi] 1530 vpxor ymm13,ymm13,YMMWORD[160+rsi] 1531 vmovdqu YMMWORD[rdi],ymm6 1532 vmovdqu YMMWORD[32+rdi],ymm8 1533 vmovdqu YMMWORD[64+rdi],ymm1 1534 vmovdqu YMMWORD[96+rdi],ymm5 1535 vmovdqu YMMWORD[128+rdi],ymm12 1536 vmovdqu YMMWORD[160+rdi],ymm13 1537 je NEAR $L$done8x 1538 1539 lea rsi,[192+rsi] 1540 xor r10,r10 1541 vmovdqa YMMWORD[rsp],ymm10 1542 lea rdi,[192+rdi] 1543 sub rdx,192 1544 vmovdqa YMMWORD[32+rsp],ymm15 1545 jmp NEAR $L$oop_tail8x 1546 1547ALIGN 32 1548$L$256_or_more8x: 1549 vpxor ymm6,ymm6,YMMWORD[rsi] 1550 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1551 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1552 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1553 vpxor ymm12,ymm12,YMMWORD[128+rsi] 1554 vpxor ymm13,ymm13,YMMWORD[160+rsi] 1555 vpxor ymm10,ymm10,YMMWORD[192+rsi] 1556 vpxor ymm15,ymm15,YMMWORD[224+rsi] 1557 vmovdqu YMMWORD[rdi],ymm6 1558 vmovdqu YMMWORD[32+rdi],ymm8 1559 vmovdqu YMMWORD[64+rdi],ymm1 1560 vmovdqu YMMWORD[96+rdi],ymm5 1561 vmovdqu YMMWORD[128+rdi],ymm12 1562 vmovdqu YMMWORD[160+rdi],ymm13 1563 vmovdqu YMMWORD[192+rdi],ymm10 1564 vmovdqu YMMWORD[224+rdi],ymm15 1565 je NEAR $L$done8x 1566 1567 lea rsi,[256+rsi] 1568 xor r10,r10 1569 vmovdqa YMMWORD[rsp],ymm14 1570 lea rdi,[256+rdi] 1571 sub rdx,256 1572 vmovdqa YMMWORD[32+rsp],ymm2 1573 jmp NEAR $L$oop_tail8x 1574 1575ALIGN 32 1576$L$320_or_more8x: 1577 vpxor ymm6,ymm6,YMMWORD[rsi] 1578 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1579 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1580 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1581 vpxor ymm12,ymm12,YMMWORD[128+rsi] 1582 vpxor ymm13,ymm13,YMMWORD[160+rsi] 1583 vpxor ymm10,ymm10,YMMWORD[192+rsi] 1584 vpxor ymm15,ymm15,YMMWORD[224+rsi] 1585 vpxor ymm14,ymm14,YMMWORD[256+rsi] 1586 vpxor ymm2,ymm2,YMMWORD[288+rsi] 1587 vmovdqu YMMWORD[rdi],ymm6 1588 vmovdqu YMMWORD[32+rdi],ymm8 1589 vmovdqu YMMWORD[64+rdi],ymm1 1590 vmovdqu YMMWORD[96+rdi],ymm5 1591 vmovdqu YMMWORD[128+rdi],ymm12 1592 vmovdqu YMMWORD[160+rdi],ymm13 1593 vmovdqu YMMWORD[192+rdi],ymm10 1594 vmovdqu YMMWORD[224+rdi],ymm15 1595 vmovdqu YMMWORD[256+rdi],ymm14 1596 vmovdqu YMMWORD[288+rdi],ymm2 1597 je NEAR $L$done8x 1598 1599 lea rsi,[320+rsi] 1600 xor r10,r10 1601 vmovdqa YMMWORD[rsp],ymm3 1602 lea rdi,[320+rdi] 1603 sub rdx,320 1604 vmovdqa YMMWORD[32+rsp],ymm7 1605 jmp NEAR $L$oop_tail8x 1606 1607ALIGN 32 1608$L$384_or_more8x: 1609 vpxor ymm6,ymm6,YMMWORD[rsi] 1610 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1611 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1612 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1613 vpxor ymm12,ymm12,YMMWORD[128+rsi] 1614 vpxor ymm13,ymm13,YMMWORD[160+rsi] 1615 vpxor ymm10,ymm10,YMMWORD[192+rsi] 1616 vpxor ymm15,ymm15,YMMWORD[224+rsi] 1617 vpxor ymm14,ymm14,YMMWORD[256+rsi] 1618 vpxor ymm2,ymm2,YMMWORD[288+rsi] 1619 vpxor ymm3,ymm3,YMMWORD[320+rsi] 1620 vpxor ymm7,ymm7,YMMWORD[352+rsi] 1621 vmovdqu YMMWORD[rdi],ymm6 1622 vmovdqu YMMWORD[32+rdi],ymm8 1623 vmovdqu YMMWORD[64+rdi],ymm1 1624 vmovdqu YMMWORD[96+rdi],ymm5 1625 vmovdqu YMMWORD[128+rdi],ymm12 1626 vmovdqu YMMWORD[160+rdi],ymm13 1627 vmovdqu YMMWORD[192+rdi],ymm10 1628 vmovdqu YMMWORD[224+rdi],ymm15 1629 vmovdqu YMMWORD[256+rdi],ymm14 1630 vmovdqu YMMWORD[288+rdi],ymm2 1631 vmovdqu YMMWORD[320+rdi],ymm3 1632 vmovdqu YMMWORD[352+rdi],ymm7 1633 je NEAR $L$done8x 1634 1635 lea rsi,[384+rsi] 1636 xor r10,r10 1637 vmovdqa YMMWORD[rsp],ymm11 1638 lea rdi,[384+rdi] 1639 sub rdx,384 1640 vmovdqa YMMWORD[32+rsp],ymm9 1641 jmp NEAR $L$oop_tail8x 1642 1643ALIGN 32 1644$L$448_or_more8x: 1645 vpxor ymm6,ymm6,YMMWORD[rsi] 1646 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1647 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1648 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1649 vpxor ymm12,ymm12,YMMWORD[128+rsi] 1650 vpxor ymm13,ymm13,YMMWORD[160+rsi] 1651 vpxor ymm10,ymm10,YMMWORD[192+rsi] 1652 vpxor ymm15,ymm15,YMMWORD[224+rsi] 1653 vpxor ymm14,ymm14,YMMWORD[256+rsi] 1654 vpxor ymm2,ymm2,YMMWORD[288+rsi] 1655 vpxor ymm3,ymm3,YMMWORD[320+rsi] 1656 vpxor ymm7,ymm7,YMMWORD[352+rsi] 1657 vpxor ymm11,ymm11,YMMWORD[384+rsi] 1658 vpxor ymm9,ymm9,YMMWORD[416+rsi] 1659 vmovdqu YMMWORD[rdi],ymm6 1660 vmovdqu YMMWORD[32+rdi],ymm8 1661 vmovdqu YMMWORD[64+rdi],ymm1 1662 vmovdqu YMMWORD[96+rdi],ymm5 1663 vmovdqu YMMWORD[128+rdi],ymm12 1664 vmovdqu YMMWORD[160+rdi],ymm13 1665 vmovdqu YMMWORD[192+rdi],ymm10 1666 vmovdqu YMMWORD[224+rdi],ymm15 1667 vmovdqu YMMWORD[256+rdi],ymm14 1668 vmovdqu YMMWORD[288+rdi],ymm2 1669 vmovdqu YMMWORD[320+rdi],ymm3 1670 vmovdqu YMMWORD[352+rdi],ymm7 1671 vmovdqu YMMWORD[384+rdi],ymm11 1672 vmovdqu YMMWORD[416+rdi],ymm9 1673 je NEAR $L$done8x 1674 1675 lea rsi,[448+rsi] 1676 xor r10,r10 1677 vmovdqa YMMWORD[rsp],ymm0 1678 lea rdi,[448+rdi] 1679 sub rdx,448 1680 vmovdqa YMMWORD[32+rsp],ymm4 1681 1682$L$oop_tail8x: 1683 movzx eax,BYTE[r10*1+rsi] 1684 movzx ecx,BYTE[r10*1+rsp] 1685 lea r10,[1+r10] 1686 xor eax,ecx 1687 mov BYTE[((-1))+r10*1+rdi],al 1688 dec rdx 1689 jnz NEAR $L$oop_tail8x 1690 1691$L$done8x: 1692 vzeroall 1693 movaps xmm6,XMMWORD[((-168))+r9] 1694 movaps xmm7,XMMWORD[((-152))+r9] 1695 movaps xmm8,XMMWORD[((-136))+r9] 1696 movaps xmm9,XMMWORD[((-120))+r9] 1697 movaps xmm10,XMMWORD[((-104))+r9] 1698 movaps xmm11,XMMWORD[((-88))+r9] 1699 movaps xmm12,XMMWORD[((-72))+r9] 1700 movaps xmm13,XMMWORD[((-56))+r9] 1701 movaps xmm14,XMMWORD[((-40))+r9] 1702 movaps xmm15,XMMWORD[((-24))+r9] 1703 lea rsp,[r9] 1704 1705$L$8x_epilogue: 1706 mov rdi,QWORD[8+rsp] ;WIN64 epilogue 1707 mov rsi,QWORD[16+rsp] 1708 ret 1709 1710$L$SEH_end_ChaCha20_ctr32_avx2: 1711EXTERN __imp_RtlVirtualUnwind 1712 1713ALIGN 16 1714se_handler: 1715 push rsi 1716 push rdi 1717 push rbx 1718 push rbp 1719 push r12 1720 push r13 1721 push r14 1722 push r15 1723 pushfq 1724 sub rsp,64 1725 1726 mov rax,QWORD[120+r8] 1727 mov rbx,QWORD[248+r8] 1728 1729 mov rsi,QWORD[8+r9] 1730 mov r11,QWORD[56+r9] 1731 1732 lea r10,[$L$ctr32_body] 1733 cmp rbx,r10 1734 jb NEAR $L$common_seh_tail 1735 1736 mov rax,QWORD[152+r8] 1737 1738 lea r10,[$L$no_data] 1739 cmp rbx,r10 1740 jae NEAR $L$common_seh_tail 1741 1742 lea rax,[((64+24+48))+rax] 1743 1744 mov rbx,QWORD[((-8))+rax] 1745 mov rbp,QWORD[((-16))+rax] 1746 mov r12,QWORD[((-24))+rax] 1747 mov r13,QWORD[((-32))+rax] 1748 mov r14,QWORD[((-40))+rax] 1749 mov r15,QWORD[((-48))+rax] 1750 mov QWORD[144+r8],rbx 1751 mov QWORD[160+r8],rbp 1752 mov QWORD[216+r8],r12 1753 mov QWORD[224+r8],r13 1754 mov QWORD[232+r8],r14 1755 mov QWORD[240+r8],r15 1756 1757$L$common_seh_tail: 1758 mov rdi,QWORD[8+rax] 1759 mov rsi,QWORD[16+rax] 1760 mov QWORD[152+r8],rax 1761 mov QWORD[168+r8],rsi 1762 mov QWORD[176+r8],rdi 1763 1764 mov rdi,QWORD[40+r9] 1765 mov rsi,r8 1766 mov ecx,154 1767 DD 0xa548f3fc 1768 1769 mov rsi,r9 1770 xor rcx,rcx 1771 mov rdx,QWORD[8+rsi] 1772 mov r8,QWORD[rsi] 1773 mov r9,QWORD[16+rsi] 1774 mov r10,QWORD[40+rsi] 1775 lea r11,[56+rsi] 1776 lea r12,[24+rsi] 1777 mov QWORD[32+rsp],r10 1778 mov QWORD[40+rsp],r11 1779 mov QWORD[48+rsp],r12 1780 mov QWORD[56+rsp],rcx 1781 call QWORD[__imp_RtlVirtualUnwind] 1782 1783 mov eax,1 1784 add rsp,64 1785 popfq 1786 pop r15 1787 pop r14 1788 pop r13 1789 pop r12 1790 pop rbp 1791 pop rbx 1792 pop rdi 1793 pop rsi 1794 ret 1795 1796 1797 1798ALIGN 16 1799ssse3_handler: 1800 push rsi 1801 push rdi 1802 push rbx 1803 push rbp 1804 push r12 1805 push r13 1806 push r14 1807 push r15 1808 pushfq 1809 sub rsp,64 1810 1811 mov rax,QWORD[120+r8] 1812 mov rbx,QWORD[248+r8] 1813 1814 mov rsi,QWORD[8+r9] 1815 mov r11,QWORD[56+r9] 1816 1817 mov r10d,DWORD[r11] 1818 lea r10,[r10*1+rsi] 1819 cmp rbx,r10 1820 jb NEAR $L$common_seh_tail 1821 1822 mov rax,QWORD[192+r8] 1823 1824 mov r10d,DWORD[4+r11] 1825 lea r10,[r10*1+rsi] 1826 cmp rbx,r10 1827 jae NEAR $L$common_seh_tail 1828 1829 lea rsi,[((-40))+rax] 1830 lea rdi,[512+r8] 1831 mov ecx,4 1832 DD 0xa548f3fc 1833 1834 jmp NEAR $L$common_seh_tail 1835 1836 1837 1838ALIGN 16 1839full_handler: 1840 push rsi 1841 push rdi 1842 push rbx 1843 push rbp 1844 push r12 1845 push r13 1846 push r14 1847 push r15 1848 pushfq 1849 sub rsp,64 1850 1851 mov rax,QWORD[120+r8] 1852 mov rbx,QWORD[248+r8] 1853 1854 mov rsi,QWORD[8+r9] 1855 mov r11,QWORD[56+r9] 1856 1857 mov r10d,DWORD[r11] 1858 lea r10,[r10*1+rsi] 1859 cmp rbx,r10 1860 jb NEAR $L$common_seh_tail 1861 1862 mov rax,QWORD[192+r8] 1863 1864 mov r10d,DWORD[4+r11] 1865 lea r10,[r10*1+rsi] 1866 cmp rbx,r10 1867 jae NEAR $L$common_seh_tail 1868 1869 lea rsi,[((-168))+rax] 1870 lea rdi,[512+r8] 1871 mov ecx,20 1872 DD 0xa548f3fc 1873 1874 jmp NEAR $L$common_seh_tail 1875 1876 1877section .pdata rdata align=4 1878ALIGN 4 1879 DD $L$SEH_begin_ChaCha20_ctr32_nohw wrt ..imagebase 1880 DD $L$SEH_end_ChaCha20_ctr32_nohw wrt ..imagebase 1881 DD $L$SEH_info_ChaCha20_ctr32_nohw wrt ..imagebase 1882 1883 DD $L$SEH_begin_ChaCha20_ctr32_ssse3 wrt ..imagebase 1884 DD $L$SEH_end_ChaCha20_ctr32_ssse3 wrt ..imagebase 1885 DD $L$SEH_info_ChaCha20_ctr32_ssse3 wrt ..imagebase 1886 1887 DD $L$SEH_begin_ChaCha20_ctr32_ssse3_4x wrt ..imagebase 1888 DD $L$SEH_end_ChaCha20_ctr32_ssse3_4x wrt ..imagebase 1889 DD $L$SEH_info_ChaCha20_ctr32_ssse3_4x wrt ..imagebase 1890 DD $L$SEH_begin_ChaCha20_ctr32_avx2 wrt ..imagebase 1891 DD $L$SEH_end_ChaCha20_ctr32_avx2 wrt ..imagebase 1892 DD $L$SEH_info_ChaCha20_ctr32_avx2 wrt ..imagebase 1893section .xdata rdata align=8 1894ALIGN 8 1895$L$SEH_info_ChaCha20_ctr32_nohw: 1896 DB 9,0,0,0 1897 DD se_handler wrt ..imagebase 1898 1899$L$SEH_info_ChaCha20_ctr32_ssse3: 1900 DB 9,0,0,0 1901 DD ssse3_handler wrt ..imagebase 1902 DD $L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase 1903 1904$L$SEH_info_ChaCha20_ctr32_ssse3_4x: 1905 DB 9,0,0,0 1906 DD full_handler wrt ..imagebase 1907 DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase 1908$L$SEH_info_ChaCha20_ctr32_avx2: 1909 DB 9,0,0,0 1910 DD full_handler wrt ..imagebase 1911 DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase 1912%else 1913; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 1914ret 1915%endif 1916