// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <ring-core/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
.text

.globl _bn_mul_mont_gather5
.private_extern _bn_mul_mont_gather5

.p2align 6
_bn_mul_mont_gather5:

_CET_ENDBR
movl %r9d,%r9d
movq %rsp,%rax

testl $7,%r9d
jnz L$mul_enter
leaq _OPENSSL_ia32cap_P(%rip),%r11
movl 8(%r11),%r11d
jmp L$mul4x_enter

.p2align 4
L$mul_enter:
movd 8(%rsp),%xmm5
pushq %rbx

pushq %rbp

pushq %r12

pushq %r13

pushq %r14

pushq %r15

negq %r9
movq %rsp,%r11
leaq -280(%rsp,%r9,8),%r10
negq %r9
andq $-1024,%r10

subq %r10,%r11
andq $-4096,%r11
leaq (%r10,%r11,1),%rsp
movq (%rsp),%r11
cmpq %r10,%rsp
ja L$mul_page_walk
jmp L$mul_page_walk_done

L$mul_page_walk:
leaq -4096(%rsp),%rsp
movq (%rsp),%r11
cmpq %r10,%rsp
ja L$mul_page_walk
L$mul_page_walk_done:

leaq L$inc(%rip),%r10
movq %rax,8(%rsp,%r9,8)

L$mul_body:

leaq 128(%rdx),%r12
movdqa 0(%r10),%xmm0
movdqa 16(%r10),%xmm1
leaq 24-112(%rsp,%r9,8),%r10
andq $-16,%r10

pshufd $0,%xmm5,%xmm5
movdqa %xmm1,%xmm4
movdqa %xmm1,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
.byte 0x67
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,112(%r10)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,128(%r10)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,144(%r10)
movdqa %xmm4,%xmm2

paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,160(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,176(%r10)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,192(%r10)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,208(%r10)
movdqa %xmm4,%xmm2

paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,224(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,240(%r10)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,256(%r10)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,272(%r10)
movdqa %xmm4,%xmm2

paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,288(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,304(%r10)

paddd %xmm2,%xmm3
.byte 0x67
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,320(%r10)

pcmpeqd %xmm5,%xmm3
movdqa %xmm2,336(%r10)
pand 64(%r12),%xmm0

pand 80(%r12),%xmm1
pand 96(%r12),%xmm2
movdqa %xmm3,352(%r10)
pand 112(%r12),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa -128(%r12),%xmm4
movdqa -112(%r12),%xmm5
movdqa -96(%r12),%xmm2
pand 112(%r10),%xmm4
movdqa -80(%r12),%xmm3
pand 128(%r10),%xmm5
por %xmm4,%xmm0
pand 144(%r10),%xmm2
por %xmm5,%xmm1
pand 160(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa -64(%r12),%xmm4
movdqa -48(%r12),%xmm5
movdqa -32(%r12),%xmm2
pand 176(%r10),%xmm4
movdqa -16(%r12),%xmm3
pand 192(%r10),%xmm5
por %xmm4,%xmm0
pand 208(%r10),%xmm2
por %xmm5,%xmm1
pand 224(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa 0(%r12),%xmm4
movdqa 16(%r12),%xmm5
movdqa 32(%r12),%xmm2
pand 240(%r10),%xmm4
movdqa 48(%r12),%xmm3
pand 256(%r10),%xmm5
por %xmm4,%xmm0
pand 272(%r10),%xmm2
por %xmm5,%xmm1
pand 288(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
por %xmm1,%xmm0

pshufd $0x4e,%xmm0,%xmm1
por %xmm1,%xmm0
leaq 256(%r12),%r12
.byte 102,72,15,126,195

movq (%r8),%r8
movq (%rsi),%rax

xorq %r14,%r14
xorq %r15,%r15

movq %r8,%rbp
mulq %rbx
movq %rax,%r10
movq (%rcx),%rax

imulq %r10,%rbp
movq %rdx,%r11

mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
adcq $0,%rdx
movq %rdx,%r13

leaq 1(%r15),%r15
jmp L$1st_enter

.p2align 4
L$1st:
addq %rax,%r13
movq (%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r11,%r13
movq %r10,%r11
adcq $0,%rdx
movq %r13,-16(%rsp,%r15,8)
movq %rdx,%r13

L$1st_enter:
mulq %rbx
addq %rax,%r11
movq (%rcx,%r15,8),%rax
adcq $0,%rdx
leaq 1(%r15),%r15
movq %rdx,%r10

mulq %rbp
cmpq %r9,%r15
jne L$1st

addq %rax,%r13
adcq $0,%rdx
addq %r11,%r13
adcq $0,%rdx
movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13
movq %r10,%r11

xorq %rdx,%rdx
addq %r11,%r13
adcq $0,%rdx
movq %r13,-8(%rsp,%r9,8)
movq %rdx,(%rsp,%r9,8)

leaq 1(%r14),%r14
jmp L$outer
.p2align 4
L$outer:
leaq 24+128(%rsp,%r9,8),%rdx
andq $-16,%rdx
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
movdqa -128(%r12),%xmm0
movdqa -112(%r12),%xmm1
movdqa -96(%r12),%xmm2
movdqa -80(%r12),%xmm3
pand -128(%rdx),%xmm0
pand -112(%rdx),%xmm1
por %xmm0,%xmm4
pand -96(%rdx),%xmm2
por %xmm1,%xmm5
pand -80(%rdx),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa -64(%r12),%xmm0
movdqa -48(%r12),%xmm1
movdqa -32(%r12),%xmm2
movdqa -16(%r12),%xmm3
pand -64(%rdx),%xmm0
pand -48(%rdx),%xmm1
por %xmm0,%xmm4
pand -32(%rdx),%xmm2
por %xmm1,%xmm5
pand -16(%rdx),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 0(%r12),%xmm0
movdqa 16(%r12),%xmm1
movdqa 32(%r12),%xmm2
movdqa 48(%r12),%xmm3
pand 0(%rdx),%xmm0
pand 16(%rdx),%xmm1
por %xmm0,%xmm4
pand 32(%rdx),%xmm2
por %xmm1,%xmm5
pand 48(%rdx),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 64(%r12),%xmm0
movdqa 80(%r12),%xmm1
movdqa 96(%r12),%xmm2
movdqa 112(%r12),%xmm3
pand 64(%rdx),%xmm0
pand 80(%rdx),%xmm1
por %xmm0,%xmm4
pand 96(%rdx),%xmm2
por %xmm1,%xmm5
pand 112(%rdx),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
por %xmm5,%xmm4

pshufd $0x4e,%xmm4,%xmm0
por %xmm4,%xmm0
leaq 256(%r12),%r12

movq (%rsi),%rax
.byte 102,72,15,126,195

xorq %r15,%r15
movq %r8,%rbp
movq (%rsp),%r10

mulq %rbx
addq %rax,%r10
movq (%rcx),%rax
adcq $0,%rdx

imulq %r10,%rbp
movq %rdx,%r11

mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
adcq $0,%rdx
movq 8(%rsp),%r10
movq %rdx,%r13

leaq 1(%r15),%r15
jmp L$inner_enter

.p2align 4
L$inner:
addq %rax,%r13
movq (%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
movq (%rsp,%r15,8),%r10
adcq $0,%rdx
movq %r13,-16(%rsp,%r15,8)
movq %rdx,%r13

L$inner_enter:
mulq %rbx
addq %rax,%r11
movq (%rcx,%r15,8),%rax
adcq $0,%rdx
addq %r11,%r10
movq %rdx,%r11
adcq $0,%r11
leaq 1(%r15),%r15

mulq %rbp
cmpq %r9,%r15
jne L$inner

addq %rax,%r13
adcq $0,%rdx
addq %r10,%r13
movq (%rsp,%r9,8),%r10
adcq $0,%rdx
movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13

xorq %rdx,%rdx
addq %r11,%r13
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %r13,-8(%rsp,%r9,8)
movq %rdx,(%rsp,%r9,8)

leaq 1(%r14),%r14
cmpq %r9,%r14
jb L$outer

xorq %r14,%r14
movq (%rsp),%rax
leaq (%rsp),%rsi
movq %r9,%r15
jmp L$sub
.p2align 4
L$sub: sbbq (%rcx,%r14,8),%rax
movq %rax,(%rdi,%r14,8)
movq 8(%rsi,%r14,8),%rax
leaq 1(%r14),%r14
decq %r15
jnz L$sub

sbbq $0,%rax
movq $-1,%rbx
xorq %rax,%rbx
xorq %r14,%r14
movq %r9,%r15

L$copy:
movq (%rdi,%r14,8),%rcx
movq (%rsp,%r14,8),%rdx
andq %rbx,%rcx
andq %rax,%rdx
movq %r14,(%rsp,%r14,8)
orq %rcx,%rdx
movq %rdx,(%rdi,%r14,8)
leaq 1(%r14),%r14
subq $1,%r15
jnz L$copy

movq 8(%rsp,%r9,8),%rsi

movq $1,%rax

movq -48(%rsi),%r15

movq -40(%rsi),%r14

movq -32(%rsi),%r13

movq -24(%rsi),%r12

movq -16(%rsi),%rbp

movq -8(%rsi),%rbx

leaq (%rsi),%rsp

L$mul_epilogue:
ret

.p2align 5
bn_mul4x_mont_gather5:

.byte 0x67
movq %rsp,%rax

L$mul4x_enter:
andl $0x80108,%r11d
cmpl $0x80108,%r11d
je L$mulx4x_enter
pushq %rbx

pushq %rbp

pushq %r12

pushq %r13

pushq %r14

pushq %r15

L$mul4x_prologue:

.byte 0x67
shll $3,%r9d
leaq (%r9,%r9,2),%r10
negq %r9

leaq -320(%rsp,%r9,2),%r11
movq %rsp,%rbp
subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb L$mul4xsp_alt
subq %r11,%rbp
leaq -320(%rbp,%r9,2),%rbp
jmp L$mul4xsp_done

.p2align 5
L$mul4xsp_alt:
leaq 4096-320(,%r9,2),%r10
leaq -320(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
subq %r11,%rbp
L$mul4xsp_done:
andq $-64,%rbp
movq %rsp,%r11
subq %rbp,%r11
andq $-4096,%r11
leaq (%r11,%rbp,1),%rsp
movq (%rsp),%r10
cmpq %rbp,%rsp
ja L$mul4x_page_walk
jmp L$mul4x_page_walk_done

L$mul4x_page_walk:
leaq -4096(%rsp),%rsp
movq (%rsp),%r10
cmpq %rbp,%rsp
ja L$mul4x_page_walk
L$mul4x_page_walk_done:

negq %r9

movq %rax,40(%rsp)

L$mul4x_body:

call mul4x_internal

movq 40(%rsp),%rsi

movq $1,%rax

movq -48(%rsi),%r15

movq -40(%rsi),%r14

movq -32(%rsi),%r13

movq -24(%rsi),%r12

movq -16(%rsi),%rbp

movq -8(%rsi),%rbx

leaq (%rsi),%rsp

L$mul4x_epilogue:
ret

.p2align 5
mul4x_internal:

shlq $5,%r9
movd 8(%rax),%xmm5
leaq L$inc(%rip),%rax
leaq 128(%rdx,%r9,1),%r13
shrq $5,%r9
movdqa 0(%rax),%xmm0
movdqa 16(%rax),%xmm1
leaq 88-112(%rsp,%r9,1),%r10
leaq 128(%rdx),%r12

pshufd $0,%xmm5,%xmm5
movdqa %xmm1,%xmm4
.byte 0x67,0x67
movdqa %xmm1,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
.byte 0x67
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,112(%r10)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,128(%r10)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,144(%r10)
movdqa %xmm4,%xmm2

paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,160(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,176(%r10)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,192(%r10)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,208(%r10)
movdqa %xmm4,%xmm2

paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,224(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,240(%r10)
movdqa %xmm4,%xmm0

paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,256(%r10)
movdqa %xmm4,%xmm1

paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,272(%r10)
movdqa %xmm4,%xmm2

paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,288(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,304(%r10)

paddd %xmm2,%xmm3
.byte 0x67
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,320(%r10)

pcmpeqd %xmm5,%xmm3
movdqa %xmm2,336(%r10)
pand 64(%r12),%xmm0

pand 80(%r12),%xmm1
pand 96(%r12),%xmm2
movdqa %xmm3,352(%r10)
pand 112(%r12),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa -128(%r12),%xmm4
movdqa -112(%r12),%xmm5
movdqa -96(%r12),%xmm2
pand 112(%r10),%xmm4
movdqa -80(%r12),%xmm3
pand 128(%r10),%xmm5
por %xmm4,%xmm0
pand 144(%r10),%xmm2
por %xmm5,%xmm1
pand 160(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa -64(%r12),%xmm4
movdqa -48(%r12),%xmm5
movdqa -32(%r12),%xmm2
pand 176(%r10),%xmm4
movdqa -16(%r12),%xmm3
pand 192(%r10),%xmm5
por %xmm4,%xmm0
pand 208(%r10),%xmm2
por %xmm5,%xmm1
pand 224(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa 0(%r12),%xmm4
movdqa 16(%r12),%xmm5
movdqa 32(%r12),%xmm2
pand 240(%r10),%xmm4
movdqa 48(%r12),%xmm3
pand 256(%r10),%xmm5
por %xmm4,%xmm0
pand 272(%r10),%xmm2
por %xmm5,%xmm1
pand 288(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
por %xmm1,%xmm0

pshufd $0x4e,%xmm0,%xmm1
por %xmm1,%xmm0
leaq 256(%r12),%r12
.byte 102,72,15,126,195

movq %r13,16+8(%rsp)
movq %rdi,56+8(%rsp)

movq (%r8),%r8
movq (%rsi),%rax
leaq (%rsi,%r9,1),%rsi
negq %r9

movq %r8,%rbp
mulq %rbx
movq %rax,%r10
movq (%rcx),%rax

imulq %r10,%rbp
leaq 64+8(%rsp),%r14
movq %rdx,%r11

mulq %rbp
addq %rax,%r10
movq 8(%rsi,%r9,1),%rax
adcq $0,%rdx
movq %rdx,%rdi

mulq %rbx
addq %rax,%r11
movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10

mulq %rbp
addq %rax,%rdi
movq 16(%rsi,%r9,1),%rax
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%r9),%r15
leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdi,(%r14)
movq %rdx,%r13
jmp L$1st4x

.p2align 5
L$1st4x:
mulq %rbx
addq %rax,%r10
movq -16(%rcx),%rax
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11

mulq %rbp
addq %rax,%r13
movq -8(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %r13,-24(%r14)
movq %rdx,%rdi

mulq %rbx
addq %rax,%r11
movq -8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10

mulq %rbp
addq %rax,%rdi
movq (%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
773 movq %rdi,-16(%r14) 774 movq %rdx,%r13 775 776 mulq %rbx 777 addq %rax,%r10 778 movq 0(%rcx),%rax 779 adcq $0,%rdx 780 movq %rdx,%r11 781 782 mulq %rbp 783 addq %rax,%r13 784 movq 8(%rsi,%r15,1),%rax 785 adcq $0,%rdx 786 addq %r10,%r13 787 adcq $0,%rdx 788 movq %r13,-8(%r14) 789 movq %rdx,%rdi 790 791 mulq %rbx 792 addq %rax,%r11 793 movq 8(%rcx),%rax 794 adcq $0,%rdx 795 movq %rdx,%r10 796 797 mulq %rbp 798 addq %rax,%rdi 799 movq 16(%rsi,%r15,1),%rax 800 adcq $0,%rdx 801 addq %r11,%rdi 802 leaq 32(%rcx),%rcx 803 adcq $0,%rdx 804 movq %rdi,(%r14) 805 movq %rdx,%r13 806 807 addq $32,%r15 808 jnz L$1st4x 809 810 mulq %rbx 811 addq %rax,%r10 812 movq -16(%rcx),%rax 813 leaq 32(%r14),%r14 814 adcq $0,%rdx 815 movq %rdx,%r11 816 817 mulq %rbp 818 addq %rax,%r13 819 movq -8(%rsi),%rax 820 adcq $0,%rdx 821 addq %r10,%r13 822 adcq $0,%rdx 823 movq %r13,-24(%r14) 824 movq %rdx,%rdi 825 826 mulq %rbx 827 addq %rax,%r11 828 movq -8(%rcx),%rax 829 adcq $0,%rdx 830 movq %rdx,%r10 831 832 mulq %rbp 833 addq %rax,%rdi 834 movq (%rsi,%r9,1),%rax 835 adcq $0,%rdx 836 addq %r11,%rdi 837 adcq $0,%rdx 838 movq %rdi,-16(%r14) 839 movq %rdx,%r13 840 841 leaq (%rcx,%r9,1),%rcx 842 843 xorq %rdi,%rdi 844 addq %r10,%r13 845 adcq $0,%rdi 846 movq %r13,-8(%r14) 847 848 jmp L$outer4x 849 850.p2align 5 851L$outer4x: 852 leaq 16+128(%r14),%rdx 853 pxor %xmm4,%xmm4 854 pxor %xmm5,%xmm5 855 movdqa -128(%r12),%xmm0 856 movdqa -112(%r12),%xmm1 857 movdqa -96(%r12),%xmm2 858 movdqa -80(%r12),%xmm3 859 pand -128(%rdx),%xmm0 860 pand -112(%rdx),%xmm1 861 por %xmm0,%xmm4 862 pand -96(%rdx),%xmm2 863 por %xmm1,%xmm5 864 pand -80(%rdx),%xmm3 865 por %xmm2,%xmm4 866 por %xmm3,%xmm5 867 movdqa -64(%r12),%xmm0 868 movdqa -48(%r12),%xmm1 869 movdqa -32(%r12),%xmm2 870 movdqa -16(%r12),%xmm3 871 pand -64(%rdx),%xmm0 872 pand -48(%rdx),%xmm1 873 por %xmm0,%xmm4 874 pand -32(%rdx),%xmm2 875 por %xmm1,%xmm5 876 pand -16(%rdx),%xmm3 877 por %xmm2,%xmm4 878 por %xmm3,%xmm5 879 movdqa 0(%r12),%xmm0 880 movdqa 16(%r12),%xmm1 881 movdqa 32(%r12),%xmm2 882 movdqa 48(%r12),%xmm3 883 pand 0(%rdx),%xmm0 884 pand 16(%rdx),%xmm1 885 por %xmm0,%xmm4 886 pand 32(%rdx),%xmm2 887 por %xmm1,%xmm5 888 pand 48(%rdx),%xmm3 889 por %xmm2,%xmm4 890 por %xmm3,%xmm5 891 movdqa 64(%r12),%xmm0 892 movdqa 80(%r12),%xmm1 893 movdqa 96(%r12),%xmm2 894 movdqa 112(%r12),%xmm3 895 pand 64(%rdx),%xmm0 896 pand 80(%rdx),%xmm1 897 por %xmm0,%xmm4 898 pand 96(%rdx),%xmm2 899 por %xmm1,%xmm5 900 pand 112(%rdx),%xmm3 901 por %xmm2,%xmm4 902 por %xmm3,%xmm5 903 por %xmm5,%xmm4 904 905 pshufd $0x4e,%xmm4,%xmm0 906 por %xmm4,%xmm0 907 leaq 256(%r12),%r12 908.byte 102,72,15,126,195 909 910 movq (%r14,%r9,1),%r10 911 movq %r8,%rbp 912 mulq %rbx 913 addq %rax,%r10 914 movq (%rcx),%rax 915 adcq $0,%rdx 916 917 imulq %r10,%rbp 918 movq %rdx,%r11 919 movq %rdi,(%r14) 920 921 leaq (%r14,%r9,1),%r14 922 923 mulq %rbp 924 addq %rax,%r10 925 movq 8(%rsi,%r9,1),%rax 926 adcq $0,%rdx 927 movq %rdx,%rdi 928 929 mulq %rbx 930 addq %rax,%r11 931 movq 8(%rcx),%rax 932 adcq $0,%rdx 933 addq 8(%r14),%r11 934 adcq $0,%rdx 935 movq %rdx,%r10 936 937 mulq %rbp 938 addq %rax,%rdi 939 movq 16(%rsi,%r9,1),%rax 940 adcq $0,%rdx 941 addq %r11,%rdi 942 leaq 32(%r9),%r15 943 leaq 32(%rcx),%rcx 944 adcq $0,%rdx 945 movq %rdx,%r13 946 jmp L$inner4x 947 948.p2align 5 949L$inner4x: 950 mulq %rbx 951 addq %rax,%r10 952 movq -16(%rcx),%rax 953 adcq $0,%rdx 954 addq 16(%r14),%r10 955 leaq 32(%r14),%r14 956 adcq $0,%rdx 957 movq %rdx,%r11 958 959 mulq %rbp 960 addq %rax,%r13 961 movq -8(%rsi,%r15,1),%rax 
962 adcq $0,%rdx 963 addq %r10,%r13 964 adcq $0,%rdx 965 movq %rdi,-32(%r14) 966 movq %rdx,%rdi 967 968 mulq %rbx 969 addq %rax,%r11 970 movq -8(%rcx),%rax 971 adcq $0,%rdx 972 addq -8(%r14),%r11 973 adcq $0,%rdx 974 movq %rdx,%r10 975 976 mulq %rbp 977 addq %rax,%rdi 978 movq (%rsi,%r15,1),%rax 979 adcq $0,%rdx 980 addq %r11,%rdi 981 adcq $0,%rdx 982 movq %r13,-24(%r14) 983 movq %rdx,%r13 984 985 mulq %rbx 986 addq %rax,%r10 987 movq 0(%rcx),%rax 988 adcq $0,%rdx 989 addq (%r14),%r10 990 adcq $0,%rdx 991 movq %rdx,%r11 992 993 mulq %rbp 994 addq %rax,%r13 995 movq 8(%rsi,%r15,1),%rax 996 adcq $0,%rdx 997 addq %r10,%r13 998 adcq $0,%rdx 999 movq %rdi,-16(%r14) 1000 movq %rdx,%rdi 1001 1002 mulq %rbx 1003 addq %rax,%r11 1004 movq 8(%rcx),%rax 1005 adcq $0,%rdx 1006 addq 8(%r14),%r11 1007 adcq $0,%rdx 1008 movq %rdx,%r10 1009 1010 mulq %rbp 1011 addq %rax,%rdi 1012 movq 16(%rsi,%r15,1),%rax 1013 adcq $0,%rdx 1014 addq %r11,%rdi 1015 leaq 32(%rcx),%rcx 1016 adcq $0,%rdx 1017 movq %r13,-8(%r14) 1018 movq %rdx,%r13 1019 1020 addq $32,%r15 1021 jnz L$inner4x 1022 1023 mulq %rbx 1024 addq %rax,%r10 1025 movq -16(%rcx),%rax 1026 adcq $0,%rdx 1027 addq 16(%r14),%r10 1028 leaq 32(%r14),%r14 1029 adcq $0,%rdx 1030 movq %rdx,%r11 1031 1032 mulq %rbp 1033 addq %rax,%r13 1034 movq -8(%rsi),%rax 1035 adcq $0,%rdx 1036 addq %r10,%r13 1037 adcq $0,%rdx 1038 movq %rdi,-32(%r14) 1039 movq %rdx,%rdi 1040 1041 mulq %rbx 1042 addq %rax,%r11 1043 movq %rbp,%rax 1044 movq -8(%rcx),%rbp 1045 adcq $0,%rdx 1046 addq -8(%r14),%r11 1047 adcq $0,%rdx 1048 movq %rdx,%r10 1049 1050 mulq %rbp 1051 addq %rax,%rdi 1052 movq (%rsi,%r9,1),%rax 1053 adcq $0,%rdx 1054 addq %r11,%rdi 1055 adcq $0,%rdx 1056 movq %r13,-24(%r14) 1057 movq %rdx,%r13 1058 1059 movq %rdi,-16(%r14) 1060 leaq (%rcx,%r9,1),%rcx 1061 1062 xorq %rdi,%rdi 1063 addq %r10,%r13 1064 adcq $0,%rdi 1065 addq (%r14),%r13 1066 adcq $0,%rdi 1067 movq %r13,-8(%r14) 1068 1069 cmpq 16+8(%rsp),%r12 1070 jb L$outer4x 1071 xorq %rax,%rax 1072 subq %r13,%rbp 1073 adcq %r15,%r15 1074 orq %r15,%rdi 1075 subq %rdi,%rax 1076 leaq (%r14,%r9,1),%rbx 1077 movq (%rcx),%r12 1078 leaq (%rcx),%rbp 1079 movq %r9,%rcx 1080 sarq $3+2,%rcx 1081 movq 56+8(%rsp),%rdi 1082 decq %r12 1083 xorq %r10,%r10 1084 movq 8(%rbp),%r13 1085 movq 16(%rbp),%r14 1086 movq 24(%rbp),%r15 1087 jmp L$sqr4x_sub_entry 1088 1089 1090.globl _bn_power5 1091.private_extern _bn_power5 1092 1093.p2align 5 1094_bn_power5: 1095 1096_CET_ENDBR 1097 movq %rsp,%rax 1098 1099 leaq _OPENSSL_ia32cap_P(%rip),%r11 1100 movl 8(%r11),%r11d 1101 andl $0x80108,%r11d 1102 cmpl $0x80108,%r11d 1103 je L$powerx5_enter 1104 pushq %rbx 1105 1106 pushq %rbp 1107 1108 pushq %r12 1109 1110 pushq %r13 1111 1112 pushq %r14 1113 1114 pushq %r15 1115 1116L$power5_prologue: 1117 1118 shll $3,%r9d 1119 leal (%r9,%r9,2),%r10d 1120 negq %r9 1121 movq (%r8),%r8 1122 1123 1124 1125 1126 1127 1128 1129 1130 leaq -320(%rsp,%r9,2),%r11 1131 movq %rsp,%rbp 1132 subq %rdi,%r11 1133 andq $4095,%r11 1134 cmpq %r11,%r10 1135 jb L$pwr_sp_alt 1136 subq %r11,%rbp 1137 leaq -320(%rbp,%r9,2),%rbp 1138 jmp L$pwr_sp_done 1139 1140.p2align 5 1141L$pwr_sp_alt: 1142 leaq 4096-320(,%r9,2),%r10 1143 leaq -320(%rbp,%r9,2),%rbp 1144 subq %r10,%r11 1145 movq $0,%r10 1146 cmovcq %r10,%r11 1147 subq %r11,%rbp 1148L$pwr_sp_done: 1149 andq $-64,%rbp 1150 movq %rsp,%r11 1151 subq %rbp,%r11 1152 andq $-4096,%r11 1153 leaq (%r11,%rbp,1),%rsp 1154 movq (%rsp),%r10 1155 cmpq %rbp,%rsp 1156 ja L$pwr_page_walk 1157 jmp L$pwr_page_walk_done 1158 1159L$pwr_page_walk: 1160 leaq 
-4096(%rsp),%rsp 1161 movq (%rsp),%r10 1162 cmpq %rbp,%rsp 1163 ja L$pwr_page_walk 1164L$pwr_page_walk_done: 1165 1166 movq %r9,%r10 1167 negq %r9 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 movq %r8,32(%rsp) 1179 movq %rax,40(%rsp) 1180 1181L$power5_body: 1182.byte 102,72,15,110,207 1183.byte 102,72,15,110,209 1184.byte 102,73,15,110,218 1185.byte 102,72,15,110,226 1186 1187 call __bn_sqr8x_internal 1188 call __bn_post4x_internal 1189 call __bn_sqr8x_internal 1190 call __bn_post4x_internal 1191 call __bn_sqr8x_internal 1192 call __bn_post4x_internal 1193 call __bn_sqr8x_internal 1194 call __bn_post4x_internal 1195 call __bn_sqr8x_internal 1196 call __bn_post4x_internal 1197 1198.byte 102,72,15,126,209 1199.byte 102,72,15,126,226 1200 movq %rsi,%rdi 1201 movq 40(%rsp),%rax 1202 leaq 32(%rsp),%r8 1203 1204 call mul4x_internal 1205 1206 movq 40(%rsp),%rsi 1207 1208 movq $1,%rax 1209 movq -48(%rsi),%r15 1210 1211 movq -40(%rsi),%r14 1212 1213 movq -32(%rsi),%r13 1214 1215 movq -24(%rsi),%r12 1216 1217 movq -16(%rsi),%rbp 1218 1219 movq -8(%rsi),%rbx 1220 1221 leaq (%rsi),%rsp 1222 1223L$power5_epilogue: 1224 ret 1225 1226 1227 1228.globl _bn_sqr8x_internal 1229.private_extern _bn_sqr8x_internal 1230.private_extern _bn_sqr8x_internal 1231 1232.p2align 5 1233_bn_sqr8x_internal: 1234__bn_sqr8x_internal: 1235 1236_CET_ENDBR 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 leaq 32(%r10),%rbp 1311 leaq (%rsi,%r9,1),%rsi 1312 1313 movq %r9,%rcx 1314 1315 1316 movq -32(%rsi,%rbp,1),%r14 1317 leaq 48+8(%rsp,%r9,2),%rdi 1318 movq -24(%rsi,%rbp,1),%rax 1319 leaq -32(%rdi,%rbp,1),%rdi 1320 movq -16(%rsi,%rbp,1),%rbx 1321 movq %rax,%r15 1322 1323 mulq %r14 1324 movq %rax,%r10 1325 movq %rbx,%rax 1326 movq %rdx,%r11 1327 movq %r10,-24(%rdi,%rbp,1) 1328 1329 mulq %r14 1330 addq %rax,%r11 1331 movq %rbx,%rax 1332 adcq $0,%rdx 1333 movq %r11,-16(%rdi,%rbp,1) 1334 movq %rdx,%r10 1335 1336 1337 movq -8(%rsi,%rbp,1),%rbx 1338 mulq %r15 1339 movq %rax,%r12 1340 movq %rbx,%rax 1341 movq %rdx,%r13 1342 1343 leaq (%rbp),%rcx 1344 mulq %r14 1345 addq %rax,%r10 1346 movq %rbx,%rax 1347 movq %rdx,%r11 1348 adcq $0,%r11 1349 addq %r12,%r10 1350 adcq $0,%r11 1351 movq %r10,-8(%rdi,%rcx,1) 1352 jmp L$sqr4x_1st 1353 1354.p2align 5 1355L$sqr4x_1st: 1356 movq (%rsi,%rcx,1),%rbx 1357 mulq %r15 1358 addq %rax,%r13 1359 movq %rbx,%rax 1360 movq %rdx,%r12 1361 adcq $0,%r12 1362 1363 mulq %r14 1364 addq %rax,%r11 1365 movq %rbx,%rax 1366 movq 8(%rsi,%rcx,1),%rbx 1367 movq %rdx,%r10 1368 adcq $0,%r10 1369 addq %r13,%r11 1370 adcq $0,%r10 1371 1372 1373 mulq %r15 1374 addq %rax,%r12 1375 movq %rbx,%rax 1376 movq %r11,(%rdi,%rcx,1) 1377 movq %rdx,%r13 1378 adcq $0,%r13 1379 1380 mulq %r14 1381 addq %rax,%r10 1382 movq %rbx,%rax 1383 movq 16(%rsi,%rcx,1),%rbx 1384 movq %rdx,%r11 1385 adcq $0,%r11 1386 addq %r12,%r10 1387 adcq $0,%r11 1388 1389 mulq %r15 1390 addq %rax,%r13 1391 movq %rbx,%rax 1392 movq %r10,8(%rdi,%rcx,1) 1393 movq %rdx,%r12 1394 adcq $0,%r12 1395 1396 mulq %r14 1397 addq %rax,%r11 1398 movq %rbx,%rax 1399 movq 24(%rsi,%rcx,1),%rbx 1400 movq %rdx,%r10 1401 adcq $0,%r10 1402 addq %r13,%r11 1403 adcq $0,%r10 1404 1405 1406 mulq %r15 1407 addq %rax,%r12 1408 movq %rbx,%rax 1409 movq 
%r11,16(%rdi,%rcx,1) 1410 movq %rdx,%r13 1411 adcq $0,%r13 1412 leaq 32(%rcx),%rcx 1413 1414 mulq %r14 1415 addq %rax,%r10 1416 movq %rbx,%rax 1417 movq %rdx,%r11 1418 adcq $0,%r11 1419 addq %r12,%r10 1420 adcq $0,%r11 1421 movq %r10,-8(%rdi,%rcx,1) 1422 1423 cmpq $0,%rcx 1424 jne L$sqr4x_1st 1425 1426 mulq %r15 1427 addq %rax,%r13 1428 leaq 16(%rbp),%rbp 1429 adcq $0,%rdx 1430 addq %r11,%r13 1431 adcq $0,%rdx 1432 1433 movq %r13,(%rdi) 1434 movq %rdx,%r12 1435 movq %rdx,8(%rdi) 1436 jmp L$sqr4x_outer 1437 1438.p2align 5 1439L$sqr4x_outer: 1440 movq -32(%rsi,%rbp,1),%r14 1441 leaq 48+8(%rsp,%r9,2),%rdi 1442 movq -24(%rsi,%rbp,1),%rax 1443 leaq -32(%rdi,%rbp,1),%rdi 1444 movq -16(%rsi,%rbp,1),%rbx 1445 movq %rax,%r15 1446 1447 mulq %r14 1448 movq -24(%rdi,%rbp,1),%r10 1449 addq %rax,%r10 1450 movq %rbx,%rax 1451 adcq $0,%rdx 1452 movq %r10,-24(%rdi,%rbp,1) 1453 movq %rdx,%r11 1454 1455 mulq %r14 1456 addq %rax,%r11 1457 movq %rbx,%rax 1458 adcq $0,%rdx 1459 addq -16(%rdi,%rbp,1),%r11 1460 movq %rdx,%r10 1461 adcq $0,%r10 1462 movq %r11,-16(%rdi,%rbp,1) 1463 1464 xorq %r12,%r12 1465 1466 movq -8(%rsi,%rbp,1),%rbx 1467 mulq %r15 1468 addq %rax,%r12 1469 movq %rbx,%rax 1470 adcq $0,%rdx 1471 addq -8(%rdi,%rbp,1),%r12 1472 movq %rdx,%r13 1473 adcq $0,%r13 1474 1475 mulq %r14 1476 addq %rax,%r10 1477 movq %rbx,%rax 1478 adcq $0,%rdx 1479 addq %r12,%r10 1480 movq %rdx,%r11 1481 adcq $0,%r11 1482 movq %r10,-8(%rdi,%rbp,1) 1483 1484 leaq (%rbp),%rcx 1485 jmp L$sqr4x_inner 1486 1487.p2align 5 1488L$sqr4x_inner: 1489 movq (%rsi,%rcx,1),%rbx 1490 mulq %r15 1491 addq %rax,%r13 1492 movq %rbx,%rax 1493 movq %rdx,%r12 1494 adcq $0,%r12 1495 addq (%rdi,%rcx,1),%r13 1496 adcq $0,%r12 1497 1498.byte 0x67 1499 mulq %r14 1500 addq %rax,%r11 1501 movq %rbx,%rax 1502 movq 8(%rsi,%rcx,1),%rbx 1503 movq %rdx,%r10 1504 adcq $0,%r10 1505 addq %r13,%r11 1506 adcq $0,%r10 1507 1508 mulq %r15 1509 addq %rax,%r12 1510 movq %r11,(%rdi,%rcx,1) 1511 movq %rbx,%rax 1512 movq %rdx,%r13 1513 adcq $0,%r13 1514 addq 8(%rdi,%rcx,1),%r12 1515 leaq 16(%rcx),%rcx 1516 adcq $0,%r13 1517 1518 mulq %r14 1519 addq %rax,%r10 1520 movq %rbx,%rax 1521 adcq $0,%rdx 1522 addq %r12,%r10 1523 movq %rdx,%r11 1524 adcq $0,%r11 1525 movq %r10,-8(%rdi,%rcx,1) 1526 1527 cmpq $0,%rcx 1528 jne L$sqr4x_inner 1529 1530.byte 0x67 1531 mulq %r15 1532 addq %rax,%r13 1533 adcq $0,%rdx 1534 addq %r11,%r13 1535 adcq $0,%rdx 1536 1537 movq %r13,(%rdi) 1538 movq %rdx,%r12 1539 movq %rdx,8(%rdi) 1540 1541 addq $16,%rbp 1542 jnz L$sqr4x_outer 1543 1544 1545 movq -32(%rsi),%r14 1546 leaq 48+8(%rsp,%r9,2),%rdi 1547 movq -24(%rsi),%rax 1548 leaq -32(%rdi,%rbp,1),%rdi 1549 movq -16(%rsi),%rbx 1550 movq %rax,%r15 1551 1552 mulq %r14 1553 addq %rax,%r10 1554 movq %rbx,%rax 1555 movq %rdx,%r11 1556 adcq $0,%r11 1557 1558 mulq %r14 1559 addq %rax,%r11 1560 movq %rbx,%rax 1561 movq %r10,-24(%rdi) 1562 movq %rdx,%r10 1563 adcq $0,%r10 1564 addq %r13,%r11 1565 movq -8(%rsi),%rbx 1566 adcq $0,%r10 1567 1568 mulq %r15 1569 addq %rax,%r12 1570 movq %rbx,%rax 1571 movq %r11,-16(%rdi) 1572 movq %rdx,%r13 1573 adcq $0,%r13 1574 1575 mulq %r14 1576 addq %rax,%r10 1577 movq %rbx,%rax 1578 movq %rdx,%r11 1579 adcq $0,%r11 1580 addq %r12,%r10 1581 adcq $0,%r11 1582 movq %r10,-8(%rdi) 1583 1584 mulq %r15 1585 addq %rax,%r13 1586 movq -16(%rsi),%rax 1587 adcq $0,%rdx 1588 addq %r11,%r13 1589 adcq $0,%rdx 1590 1591 movq %r13,(%rdi) 1592 movq %rdx,%r12 1593 movq %rdx,8(%rdi) 1594 1595 mulq %rbx 1596 addq $16,%rbp 1597 xorq %r14,%r14 1598 subq %r9,%rbp 1599 xorq %r15,%r15 1600 1601 
addq %r12,%rax 1602 adcq $0,%rdx 1603 movq %rax,8(%rdi) 1604 movq %rdx,16(%rdi) 1605 movq %r15,24(%rdi) 1606 1607 movq -16(%rsi,%rbp,1),%rax 1608 leaq 48+8(%rsp),%rdi 1609 xorq %r10,%r10 1610 movq 8(%rdi),%r11 1611 1612 leaq (%r14,%r10,2),%r12 1613 shrq $63,%r10 1614 leaq (%rcx,%r11,2),%r13 1615 shrq $63,%r11 1616 orq %r10,%r13 1617 movq 16(%rdi),%r10 1618 movq %r11,%r14 1619 mulq %rax 1620 negq %r15 1621 movq 24(%rdi),%r11 1622 adcq %rax,%r12 1623 movq -8(%rsi,%rbp,1),%rax 1624 movq %r12,(%rdi) 1625 adcq %rdx,%r13 1626 1627 leaq (%r14,%r10,2),%rbx 1628 movq %r13,8(%rdi) 1629 sbbq %r15,%r15 1630 shrq $63,%r10 1631 leaq (%rcx,%r11,2),%r8 1632 shrq $63,%r11 1633 orq %r10,%r8 1634 movq 32(%rdi),%r10 1635 movq %r11,%r14 1636 mulq %rax 1637 negq %r15 1638 movq 40(%rdi),%r11 1639 adcq %rax,%rbx 1640 movq 0(%rsi,%rbp,1),%rax 1641 movq %rbx,16(%rdi) 1642 adcq %rdx,%r8 1643 leaq 16(%rbp),%rbp 1644 movq %r8,24(%rdi) 1645 sbbq %r15,%r15 1646 leaq 64(%rdi),%rdi 1647 jmp L$sqr4x_shift_n_add 1648 1649.p2align 5 1650L$sqr4x_shift_n_add: 1651 leaq (%r14,%r10,2),%r12 1652 shrq $63,%r10 1653 leaq (%rcx,%r11,2),%r13 1654 shrq $63,%r11 1655 orq %r10,%r13 1656 movq -16(%rdi),%r10 1657 movq %r11,%r14 1658 mulq %rax 1659 negq %r15 1660 movq -8(%rdi),%r11 1661 adcq %rax,%r12 1662 movq -8(%rsi,%rbp,1),%rax 1663 movq %r12,-32(%rdi) 1664 adcq %rdx,%r13 1665 1666 leaq (%r14,%r10,2),%rbx 1667 movq %r13,-24(%rdi) 1668 sbbq %r15,%r15 1669 shrq $63,%r10 1670 leaq (%rcx,%r11,2),%r8 1671 shrq $63,%r11 1672 orq %r10,%r8 1673 movq 0(%rdi),%r10 1674 movq %r11,%r14 1675 mulq %rax 1676 negq %r15 1677 movq 8(%rdi),%r11 1678 adcq %rax,%rbx 1679 movq 0(%rsi,%rbp,1),%rax 1680 movq %rbx,-16(%rdi) 1681 adcq %rdx,%r8 1682 1683 leaq (%r14,%r10,2),%r12 1684 movq %r8,-8(%rdi) 1685 sbbq %r15,%r15 1686 shrq $63,%r10 1687 leaq (%rcx,%r11,2),%r13 1688 shrq $63,%r11 1689 orq %r10,%r13 1690 movq 16(%rdi),%r10 1691 movq %r11,%r14 1692 mulq %rax 1693 negq %r15 1694 movq 24(%rdi),%r11 1695 adcq %rax,%r12 1696 movq 8(%rsi,%rbp,1),%rax 1697 movq %r12,0(%rdi) 1698 adcq %rdx,%r13 1699 1700 leaq (%r14,%r10,2),%rbx 1701 movq %r13,8(%rdi) 1702 sbbq %r15,%r15 1703 shrq $63,%r10 1704 leaq (%rcx,%r11,2),%r8 1705 shrq $63,%r11 1706 orq %r10,%r8 1707 movq 32(%rdi),%r10 1708 movq %r11,%r14 1709 mulq %rax 1710 negq %r15 1711 movq 40(%rdi),%r11 1712 adcq %rax,%rbx 1713 movq 16(%rsi,%rbp,1),%rax 1714 movq %rbx,16(%rdi) 1715 adcq %rdx,%r8 1716 movq %r8,24(%rdi) 1717 sbbq %r15,%r15 1718 leaq 64(%rdi),%rdi 1719 addq $32,%rbp 1720 jnz L$sqr4x_shift_n_add 1721 1722 leaq (%r14,%r10,2),%r12 1723.byte 0x67 1724 shrq $63,%r10 1725 leaq (%rcx,%r11,2),%r13 1726 shrq $63,%r11 1727 orq %r10,%r13 1728 movq -16(%rdi),%r10 1729 movq %r11,%r14 1730 mulq %rax 1731 negq %r15 1732 movq -8(%rdi),%r11 1733 adcq %rax,%r12 1734 movq -8(%rsi),%rax 1735 movq %r12,-32(%rdi) 1736 adcq %rdx,%r13 1737 1738 leaq (%r14,%r10,2),%rbx 1739 movq %r13,-24(%rdi) 1740 sbbq %r15,%r15 1741 shrq $63,%r10 1742 leaq (%rcx,%r11,2),%r8 1743 shrq $63,%r11 1744 orq %r10,%r8 1745 mulq %rax 1746 negq %r15 1747 adcq %rax,%rbx 1748 adcq %rdx,%r8 1749 movq %rbx,-16(%rdi) 1750 movq %r8,-8(%rdi) 1751.byte 102,72,15,126,213 1752__bn_sqr8x_reduction: 1753 xorq %rax,%rax 1754 leaq (%r9,%rbp,1),%rcx 1755 leaq 48+8(%rsp,%r9,2),%rdx 1756 movq %rcx,0+8(%rsp) 1757 leaq 48+8(%rsp,%r9,1),%rdi 1758 movq %rdx,8+8(%rsp) 1759 negq %r9 1760 jmp L$8x_reduction_loop 1761 1762.p2align 5 1763L$8x_reduction_loop: 1764 leaq (%rdi,%r9,1),%rdi 1765.byte 0x66 1766 movq 0(%rdi),%rbx 1767 movq 8(%rdi),%r9 1768 movq 16(%rdi),%r10 1769 movq 
24(%rdi),%r11 1770 movq 32(%rdi),%r12 1771 movq 40(%rdi),%r13 1772 movq 48(%rdi),%r14 1773 movq 56(%rdi),%r15 1774 movq %rax,(%rdx) 1775 leaq 64(%rdi),%rdi 1776 1777.byte 0x67 1778 movq %rbx,%r8 1779 imulq 32+8(%rsp),%rbx 1780 movq 0(%rbp),%rax 1781 movl $8,%ecx 1782 jmp L$8x_reduce 1783 1784.p2align 5 1785L$8x_reduce: 1786 mulq %rbx 1787 movq 8(%rbp),%rax 1788 negq %r8 1789 movq %rdx,%r8 1790 adcq $0,%r8 1791 1792 mulq %rbx 1793 addq %rax,%r9 1794 movq 16(%rbp),%rax 1795 adcq $0,%rdx 1796 addq %r9,%r8 1797 movq %rbx,48-8+8(%rsp,%rcx,8) 1798 movq %rdx,%r9 1799 adcq $0,%r9 1800 1801 mulq %rbx 1802 addq %rax,%r10 1803 movq 24(%rbp),%rax 1804 adcq $0,%rdx 1805 addq %r10,%r9 1806 movq 32+8(%rsp),%rsi 1807 movq %rdx,%r10 1808 adcq $0,%r10 1809 1810 mulq %rbx 1811 addq %rax,%r11 1812 movq 32(%rbp),%rax 1813 adcq $0,%rdx 1814 imulq %r8,%rsi 1815 addq %r11,%r10 1816 movq %rdx,%r11 1817 adcq $0,%r11 1818 1819 mulq %rbx 1820 addq %rax,%r12 1821 movq 40(%rbp),%rax 1822 adcq $0,%rdx 1823 addq %r12,%r11 1824 movq %rdx,%r12 1825 adcq $0,%r12 1826 1827 mulq %rbx 1828 addq %rax,%r13 1829 movq 48(%rbp),%rax 1830 adcq $0,%rdx 1831 addq %r13,%r12 1832 movq %rdx,%r13 1833 adcq $0,%r13 1834 1835 mulq %rbx 1836 addq %rax,%r14 1837 movq 56(%rbp),%rax 1838 adcq $0,%rdx 1839 addq %r14,%r13 1840 movq %rdx,%r14 1841 adcq $0,%r14 1842 1843 mulq %rbx 1844 movq %rsi,%rbx 1845 addq %rax,%r15 1846 movq 0(%rbp),%rax 1847 adcq $0,%rdx 1848 addq %r15,%r14 1849 movq %rdx,%r15 1850 adcq $0,%r15 1851 1852 decl %ecx 1853 jnz L$8x_reduce 1854 1855 leaq 64(%rbp),%rbp 1856 xorq %rax,%rax 1857 movq 8+8(%rsp),%rdx 1858 cmpq 0+8(%rsp),%rbp 1859 jae L$8x_no_tail 1860 1861.byte 0x66 1862 addq 0(%rdi),%r8 1863 adcq 8(%rdi),%r9 1864 adcq 16(%rdi),%r10 1865 adcq 24(%rdi),%r11 1866 adcq 32(%rdi),%r12 1867 adcq 40(%rdi),%r13 1868 adcq 48(%rdi),%r14 1869 adcq 56(%rdi),%r15 1870 sbbq %rsi,%rsi 1871 1872 movq 48+56+8(%rsp),%rbx 1873 movl $8,%ecx 1874 movq 0(%rbp),%rax 1875 jmp L$8x_tail 1876 1877.p2align 5 1878L$8x_tail: 1879 mulq %rbx 1880 addq %rax,%r8 1881 movq 8(%rbp),%rax 1882 movq %r8,(%rdi) 1883 movq %rdx,%r8 1884 adcq $0,%r8 1885 1886 mulq %rbx 1887 addq %rax,%r9 1888 movq 16(%rbp),%rax 1889 adcq $0,%rdx 1890 addq %r9,%r8 1891 leaq 8(%rdi),%rdi 1892 movq %rdx,%r9 1893 adcq $0,%r9 1894 1895 mulq %rbx 1896 addq %rax,%r10 1897 movq 24(%rbp),%rax 1898 adcq $0,%rdx 1899 addq %r10,%r9 1900 movq %rdx,%r10 1901 adcq $0,%r10 1902 1903 mulq %rbx 1904 addq %rax,%r11 1905 movq 32(%rbp),%rax 1906 adcq $0,%rdx 1907 addq %r11,%r10 1908 movq %rdx,%r11 1909 adcq $0,%r11 1910 1911 mulq %rbx 1912 addq %rax,%r12 1913 movq 40(%rbp),%rax 1914 adcq $0,%rdx 1915 addq %r12,%r11 1916 movq %rdx,%r12 1917 adcq $0,%r12 1918 1919 mulq %rbx 1920 addq %rax,%r13 1921 movq 48(%rbp),%rax 1922 adcq $0,%rdx 1923 addq %r13,%r12 1924 movq %rdx,%r13 1925 adcq $0,%r13 1926 1927 mulq %rbx 1928 addq %rax,%r14 1929 movq 56(%rbp),%rax 1930 adcq $0,%rdx 1931 addq %r14,%r13 1932 movq %rdx,%r14 1933 adcq $0,%r14 1934 1935 mulq %rbx 1936 movq 48-16+8(%rsp,%rcx,8),%rbx 1937 addq %rax,%r15 1938 adcq $0,%rdx 1939 addq %r15,%r14 1940 movq 0(%rbp),%rax 1941 movq %rdx,%r15 1942 adcq $0,%r15 1943 1944 decl %ecx 1945 jnz L$8x_tail 1946 1947 leaq 64(%rbp),%rbp 1948 movq 8+8(%rsp),%rdx 1949 cmpq 0+8(%rsp),%rbp 1950 jae L$8x_tail_done 1951 1952 movq 48+56+8(%rsp),%rbx 1953 negq %rsi 1954 movq 0(%rbp),%rax 1955 adcq 0(%rdi),%r8 1956 adcq 8(%rdi),%r9 1957 adcq 16(%rdi),%r10 1958 adcq 24(%rdi),%r11 1959 adcq 32(%rdi),%r12 1960 adcq 40(%rdi),%r13 1961 adcq 48(%rdi),%r14 1962 adcq 56(%rdi),%r15 1963 
sbbq %rsi,%rsi 1964 1965 movl $8,%ecx 1966 jmp L$8x_tail 1967 1968.p2align 5 1969L$8x_tail_done: 1970 xorq %rax,%rax 1971 addq (%rdx),%r8 1972 adcq $0,%r9 1973 adcq $0,%r10 1974 adcq $0,%r11 1975 adcq $0,%r12 1976 adcq $0,%r13 1977 adcq $0,%r14 1978 adcq $0,%r15 1979 adcq $0,%rax 1980 1981 negq %rsi 1982L$8x_no_tail: 1983 adcq 0(%rdi),%r8 1984 adcq 8(%rdi),%r9 1985 adcq 16(%rdi),%r10 1986 adcq 24(%rdi),%r11 1987 adcq 32(%rdi),%r12 1988 adcq 40(%rdi),%r13 1989 adcq 48(%rdi),%r14 1990 adcq 56(%rdi),%r15 1991 adcq $0,%rax 1992 movq -8(%rbp),%rcx 1993 xorq %rsi,%rsi 1994 1995.byte 102,72,15,126,213 1996 1997 movq %r8,0(%rdi) 1998 movq %r9,8(%rdi) 1999.byte 102,73,15,126,217 2000 movq %r10,16(%rdi) 2001 movq %r11,24(%rdi) 2002 movq %r12,32(%rdi) 2003 movq %r13,40(%rdi) 2004 movq %r14,48(%rdi) 2005 movq %r15,56(%rdi) 2006 leaq 64(%rdi),%rdi 2007 2008 cmpq %rdx,%rdi 2009 jb L$8x_reduction_loop 2010 ret 2011 2012 2013 2014.p2align 5 2015__bn_post4x_internal: 2016 2017 movq 0(%rbp),%r12 2018 leaq (%rdi,%r9,1),%rbx 2019 movq %r9,%rcx 2020.byte 102,72,15,126,207 2021 negq %rax 2022.byte 102,72,15,126,206 2023 sarq $3+2,%rcx 2024 decq %r12 2025 xorq %r10,%r10 2026 movq 8(%rbp),%r13 2027 movq 16(%rbp),%r14 2028 movq 24(%rbp),%r15 2029 jmp L$sqr4x_sub_entry 2030 2031.p2align 4 2032L$sqr4x_sub: 2033 movq 0(%rbp),%r12 2034 movq 8(%rbp),%r13 2035 movq 16(%rbp),%r14 2036 movq 24(%rbp),%r15 2037L$sqr4x_sub_entry: 2038 leaq 32(%rbp),%rbp 2039 notq %r12 2040 notq %r13 2041 notq %r14 2042 notq %r15 2043 andq %rax,%r12 2044 andq %rax,%r13 2045 andq %rax,%r14 2046 andq %rax,%r15 2047 2048 negq %r10 2049 adcq 0(%rbx),%r12 2050 adcq 8(%rbx),%r13 2051 adcq 16(%rbx),%r14 2052 adcq 24(%rbx),%r15 2053 movq %r12,0(%rdi) 2054 leaq 32(%rbx),%rbx 2055 movq %r13,8(%rdi) 2056 sbbq %r10,%r10 2057 movq %r14,16(%rdi) 2058 movq %r15,24(%rdi) 2059 leaq 32(%rdi),%rdi 2060 2061 incq %rcx 2062 jnz L$sqr4x_sub 2063 2064 movq %r9,%r10 2065 negq %r9 2066 ret 2067 2068 2069 2070.p2align 5 2071bn_mulx4x_mont_gather5: 2072 2073 movq %rsp,%rax 2074 2075L$mulx4x_enter: 2076 pushq %rbx 2077 2078 pushq %rbp 2079 2080 pushq %r12 2081 2082 pushq %r13 2083 2084 pushq %r14 2085 2086 pushq %r15 2087 2088L$mulx4x_prologue: 2089 2090 shll $3,%r9d 2091 leaq (%r9,%r9,2),%r10 2092 negq %r9 2093 movq (%r8),%r8 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 leaq -320(%rsp,%r9,2),%r11 2105 movq %rsp,%rbp 2106 subq %rdi,%r11 2107 andq $4095,%r11 2108 cmpq %r11,%r10 2109 jb L$mulx4xsp_alt 2110 subq %r11,%rbp 2111 leaq -320(%rbp,%r9,2),%rbp 2112 jmp L$mulx4xsp_done 2113 2114L$mulx4xsp_alt: 2115 leaq 4096-320(,%r9,2),%r10 2116 leaq -320(%rbp,%r9,2),%rbp 2117 subq %r10,%r11 2118 movq $0,%r10 2119 cmovcq %r10,%r11 2120 subq %r11,%rbp 2121L$mulx4xsp_done: 2122 andq $-64,%rbp 2123 movq %rsp,%r11 2124 subq %rbp,%r11 2125 andq $-4096,%r11 2126 leaq (%r11,%rbp,1),%rsp 2127 movq (%rsp),%r10 2128 cmpq %rbp,%rsp 2129 ja L$mulx4x_page_walk 2130 jmp L$mulx4x_page_walk_done 2131 2132L$mulx4x_page_walk: 2133 leaq -4096(%rsp),%rsp 2134 movq (%rsp),%r10 2135 cmpq %rbp,%rsp 2136 ja L$mulx4x_page_walk 2137L$mulx4x_page_walk_done: 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 movq %r8,32(%rsp) 2152 movq %rax,40(%rsp) 2153 2154L$mulx4x_body: 2155 call mulx4x_internal 2156 2157 movq 40(%rsp),%rsi 2158 2159 movq $1,%rax 2160 2161 movq -48(%rsi),%r15 2162 2163 movq -40(%rsi),%r14 2164 2165 movq -32(%rsi),%r13 2166 2167 movq -24(%rsi),%r12 2168 2169 movq -16(%rsi),%rbp 2170 2171 movq -8(%rsi),%rbx 2172 2173 leaq (%rsi),%rsp 2174 2175L$mulx4x_epilogue: 
2176 ret 2177 2178 2179 2180 2181.p2align 5 2182mulx4x_internal: 2183 2184 movq %r9,8(%rsp) 2185 movq %r9,%r10 2186 negq %r9 2187 shlq $5,%r9 2188 negq %r10 2189 leaq 128(%rdx,%r9,1),%r13 2190 shrq $5+5,%r9 2191 movd 8(%rax),%xmm5 2192 subq $1,%r9 2193 leaq L$inc(%rip),%rax 2194 movq %r13,16+8(%rsp) 2195 movq %r9,24+8(%rsp) 2196 movq %rdi,56+8(%rsp) 2197 movdqa 0(%rax),%xmm0 2198 movdqa 16(%rax),%xmm1 2199 leaq 88-112(%rsp,%r10,1),%r10 2200 leaq 128(%rdx),%rdi 2201 2202 pshufd $0,%xmm5,%xmm5 2203 movdqa %xmm1,%xmm4 2204.byte 0x67 2205 movdqa %xmm1,%xmm2 2206.byte 0x67 2207 paddd %xmm0,%xmm1 2208 pcmpeqd %xmm5,%xmm0 2209 movdqa %xmm4,%xmm3 2210 paddd %xmm1,%xmm2 2211 pcmpeqd %xmm5,%xmm1 2212 movdqa %xmm0,112(%r10) 2213 movdqa %xmm4,%xmm0 2214 2215 paddd %xmm2,%xmm3 2216 pcmpeqd %xmm5,%xmm2 2217 movdqa %xmm1,128(%r10) 2218 movdqa %xmm4,%xmm1 2219 2220 paddd %xmm3,%xmm0 2221 pcmpeqd %xmm5,%xmm3 2222 movdqa %xmm2,144(%r10) 2223 movdqa %xmm4,%xmm2 2224 2225 paddd %xmm0,%xmm1 2226 pcmpeqd %xmm5,%xmm0 2227 movdqa %xmm3,160(%r10) 2228 movdqa %xmm4,%xmm3 2229 paddd %xmm1,%xmm2 2230 pcmpeqd %xmm5,%xmm1 2231 movdqa %xmm0,176(%r10) 2232 movdqa %xmm4,%xmm0 2233 2234 paddd %xmm2,%xmm3 2235 pcmpeqd %xmm5,%xmm2 2236 movdqa %xmm1,192(%r10) 2237 movdqa %xmm4,%xmm1 2238 2239 paddd %xmm3,%xmm0 2240 pcmpeqd %xmm5,%xmm3 2241 movdqa %xmm2,208(%r10) 2242 movdqa %xmm4,%xmm2 2243 2244 paddd %xmm0,%xmm1 2245 pcmpeqd %xmm5,%xmm0 2246 movdqa %xmm3,224(%r10) 2247 movdqa %xmm4,%xmm3 2248 paddd %xmm1,%xmm2 2249 pcmpeqd %xmm5,%xmm1 2250 movdqa %xmm0,240(%r10) 2251 movdqa %xmm4,%xmm0 2252 2253 paddd %xmm2,%xmm3 2254 pcmpeqd %xmm5,%xmm2 2255 movdqa %xmm1,256(%r10) 2256 movdqa %xmm4,%xmm1 2257 2258 paddd %xmm3,%xmm0 2259 pcmpeqd %xmm5,%xmm3 2260 movdqa %xmm2,272(%r10) 2261 movdqa %xmm4,%xmm2 2262 2263 paddd %xmm0,%xmm1 2264 pcmpeqd %xmm5,%xmm0 2265 movdqa %xmm3,288(%r10) 2266 movdqa %xmm4,%xmm3 2267.byte 0x67 2268 paddd %xmm1,%xmm2 2269 pcmpeqd %xmm5,%xmm1 2270 movdqa %xmm0,304(%r10) 2271 2272 paddd %xmm2,%xmm3 2273 pcmpeqd %xmm5,%xmm2 2274 movdqa %xmm1,320(%r10) 2275 2276 pcmpeqd %xmm5,%xmm3 2277 movdqa %xmm2,336(%r10) 2278 2279 pand 64(%rdi),%xmm0 2280 pand 80(%rdi),%xmm1 2281 pand 96(%rdi),%xmm2 2282 movdqa %xmm3,352(%r10) 2283 pand 112(%rdi),%xmm3 2284 por %xmm2,%xmm0 2285 por %xmm3,%xmm1 2286 movdqa -128(%rdi),%xmm4 2287 movdqa -112(%rdi),%xmm5 2288 movdqa -96(%rdi),%xmm2 2289 pand 112(%r10),%xmm4 2290 movdqa -80(%rdi),%xmm3 2291 pand 128(%r10),%xmm5 2292 por %xmm4,%xmm0 2293 pand 144(%r10),%xmm2 2294 por %xmm5,%xmm1 2295 pand 160(%r10),%xmm3 2296 por %xmm2,%xmm0 2297 por %xmm3,%xmm1 2298 movdqa -64(%rdi),%xmm4 2299 movdqa -48(%rdi),%xmm5 2300 movdqa -32(%rdi),%xmm2 2301 pand 176(%r10),%xmm4 2302 movdqa -16(%rdi),%xmm3 2303 pand 192(%r10),%xmm5 2304 por %xmm4,%xmm0 2305 pand 208(%r10),%xmm2 2306 por %xmm5,%xmm1 2307 pand 224(%r10),%xmm3 2308 por %xmm2,%xmm0 2309 por %xmm3,%xmm1 2310 movdqa 0(%rdi),%xmm4 2311 movdqa 16(%rdi),%xmm5 2312 movdqa 32(%rdi),%xmm2 2313 pand 240(%r10),%xmm4 2314 movdqa 48(%rdi),%xmm3 2315 pand 256(%r10),%xmm5 2316 por %xmm4,%xmm0 2317 pand 272(%r10),%xmm2 2318 por %xmm5,%xmm1 2319 pand 288(%r10),%xmm3 2320 por %xmm2,%xmm0 2321 por %xmm3,%xmm1 2322 pxor %xmm1,%xmm0 2323 2324 pshufd $0x4e,%xmm0,%xmm1 2325 por %xmm1,%xmm0 2326 leaq 256(%rdi),%rdi 2327.byte 102,72,15,126,194 2328 leaq 64+32+8(%rsp),%rbx 2329 2330 movq %rdx,%r9 2331 mulxq 0(%rsi),%r8,%rax 2332 mulxq 8(%rsi),%r11,%r12 2333 addq %rax,%r11 2334 mulxq 16(%rsi),%rax,%r13 2335 adcq %rax,%r12 2336 adcq $0,%r13 2337 mulxq 24(%rsi),%rax,%r14 
2338 2339 movq %r8,%r15 2340 imulq 32+8(%rsp),%r8 2341 xorq %rbp,%rbp 2342 movq %r8,%rdx 2343 2344 movq %rdi,8+8(%rsp) 2345 2346 leaq 32(%rsi),%rsi 2347 adcxq %rax,%r13 2348 adcxq %rbp,%r14 2349 2350 mulxq 0(%rcx),%rax,%r10 2351 adcxq %rax,%r15 2352 adoxq %r11,%r10 2353 mulxq 8(%rcx),%rax,%r11 2354 adcxq %rax,%r10 2355 adoxq %r12,%r11 2356 mulxq 16(%rcx),%rax,%r12 2357 movq 24+8(%rsp),%rdi 2358 movq %r10,-32(%rbx) 2359 adcxq %rax,%r11 2360 adoxq %r13,%r12 2361 mulxq 24(%rcx),%rax,%r15 2362 movq %r9,%rdx 2363 movq %r11,-24(%rbx) 2364 adcxq %rax,%r12 2365 adoxq %rbp,%r15 2366 leaq 32(%rcx),%rcx 2367 movq %r12,-16(%rbx) 2368 jmp L$mulx4x_1st 2369 2370.p2align 5 2371L$mulx4x_1st: 2372 adcxq %rbp,%r15 2373 mulxq 0(%rsi),%r10,%rax 2374 adcxq %r14,%r10 2375 mulxq 8(%rsi),%r11,%r14 2376 adcxq %rax,%r11 2377 mulxq 16(%rsi),%r12,%rax 2378 adcxq %r14,%r12 2379 mulxq 24(%rsi),%r13,%r14 2380.byte 0x67,0x67 2381 movq %r8,%rdx 2382 adcxq %rax,%r13 2383 adcxq %rbp,%r14 2384 leaq 32(%rsi),%rsi 2385 leaq 32(%rbx),%rbx 2386 2387 adoxq %r15,%r10 2388 mulxq 0(%rcx),%rax,%r15 2389 adcxq %rax,%r10 2390 adoxq %r15,%r11 2391 mulxq 8(%rcx),%rax,%r15 2392 adcxq %rax,%r11 2393 adoxq %r15,%r12 2394 mulxq 16(%rcx),%rax,%r15 2395 movq %r10,-40(%rbx) 2396 adcxq %rax,%r12 2397 movq %r11,-32(%rbx) 2398 adoxq %r15,%r13 2399 mulxq 24(%rcx),%rax,%r15 2400 movq %r9,%rdx 2401 movq %r12,-24(%rbx) 2402 adcxq %rax,%r13 2403 adoxq %rbp,%r15 2404 leaq 32(%rcx),%rcx 2405 movq %r13,-16(%rbx) 2406 2407 decq %rdi 2408 jnz L$mulx4x_1st 2409 2410 movq 8(%rsp),%rax 2411 adcq %rbp,%r15 2412 leaq (%rsi,%rax,1),%rsi 2413 addq %r15,%r14 2414 movq 8+8(%rsp),%rdi 2415 adcq %rbp,%rbp 2416 movq %r14,-8(%rbx) 2417 jmp L$mulx4x_outer 2418 2419.p2align 5 2420L$mulx4x_outer: 2421 leaq 16-256(%rbx),%r10 2422 pxor %xmm4,%xmm4 2423.byte 0x67,0x67 2424 pxor %xmm5,%xmm5 2425 movdqa -128(%rdi),%xmm0 2426 movdqa -112(%rdi),%xmm1 2427 movdqa -96(%rdi),%xmm2 2428 pand 256(%r10),%xmm0 2429 movdqa -80(%rdi),%xmm3 2430 pand 272(%r10),%xmm1 2431 por %xmm0,%xmm4 2432 pand 288(%r10),%xmm2 2433 por %xmm1,%xmm5 2434 pand 304(%r10),%xmm3 2435 por %xmm2,%xmm4 2436 por %xmm3,%xmm5 2437 movdqa -64(%rdi),%xmm0 2438 movdqa -48(%rdi),%xmm1 2439 movdqa -32(%rdi),%xmm2 2440 pand 320(%r10),%xmm0 2441 movdqa -16(%rdi),%xmm3 2442 pand 336(%r10),%xmm1 2443 por %xmm0,%xmm4 2444 pand 352(%r10),%xmm2 2445 por %xmm1,%xmm5 2446 pand 368(%r10),%xmm3 2447 por %xmm2,%xmm4 2448 por %xmm3,%xmm5 2449 movdqa 0(%rdi),%xmm0 2450 movdqa 16(%rdi),%xmm1 2451 movdqa 32(%rdi),%xmm2 2452 pand 384(%r10),%xmm0 2453 movdqa 48(%rdi),%xmm3 2454 pand 400(%r10),%xmm1 2455 por %xmm0,%xmm4 2456 pand 416(%r10),%xmm2 2457 por %xmm1,%xmm5 2458 pand 432(%r10),%xmm3 2459 por %xmm2,%xmm4 2460 por %xmm3,%xmm5 2461 movdqa 64(%rdi),%xmm0 2462 movdqa 80(%rdi),%xmm1 2463 movdqa 96(%rdi),%xmm2 2464 pand 448(%r10),%xmm0 2465 movdqa 112(%rdi),%xmm3 2466 pand 464(%r10),%xmm1 2467 por %xmm0,%xmm4 2468 pand 480(%r10),%xmm2 2469 por %xmm1,%xmm5 2470 pand 496(%r10),%xmm3 2471 por %xmm2,%xmm4 2472 por %xmm3,%xmm5 2473 por %xmm5,%xmm4 2474 2475 pshufd $0x4e,%xmm4,%xmm0 2476 por %xmm4,%xmm0 2477 leaq 256(%rdi),%rdi 2478.byte 102,72,15,126,194 2479 2480 movq %rbp,(%rbx) 2481 leaq 32(%rbx,%rax,1),%rbx 2482 mulxq 0(%rsi),%r8,%r11 2483 xorq %rbp,%rbp 2484 movq %rdx,%r9 2485 mulxq 8(%rsi),%r14,%r12 2486 adoxq -32(%rbx),%r8 2487 adcxq %r14,%r11 2488 mulxq 16(%rsi),%r15,%r13 2489 adoxq -24(%rbx),%r11 2490 adcxq %r15,%r12 2491 mulxq 24(%rsi),%rdx,%r14 2492 adoxq -16(%rbx),%r12 2493 adcxq %rdx,%r13 2494 leaq (%rcx,%rax,1),%rcx 2495 leaq 
32(%rsi),%rsi 2496 adoxq -8(%rbx),%r13 2497 adcxq %rbp,%r14 2498 adoxq %rbp,%r14 2499 2500 movq %r8,%r15 2501 imulq 32+8(%rsp),%r8 2502 2503 movq %r8,%rdx 2504 xorq %rbp,%rbp 2505 movq %rdi,8+8(%rsp) 2506 2507 mulxq 0(%rcx),%rax,%r10 2508 adcxq %rax,%r15 2509 adoxq %r11,%r10 2510 mulxq 8(%rcx),%rax,%r11 2511 adcxq %rax,%r10 2512 adoxq %r12,%r11 2513 mulxq 16(%rcx),%rax,%r12 2514 adcxq %rax,%r11 2515 adoxq %r13,%r12 2516 mulxq 24(%rcx),%rax,%r15 2517 movq %r9,%rdx 2518 movq 24+8(%rsp),%rdi 2519 movq %r10,-32(%rbx) 2520 adcxq %rax,%r12 2521 movq %r11,-24(%rbx) 2522 adoxq %rbp,%r15 2523 movq %r12,-16(%rbx) 2524 leaq 32(%rcx),%rcx 2525 jmp L$mulx4x_inner 2526 2527.p2align 5 2528L$mulx4x_inner: 2529 mulxq 0(%rsi),%r10,%rax 2530 adcxq %rbp,%r15 2531 adoxq %r14,%r10 2532 mulxq 8(%rsi),%r11,%r14 2533 adcxq 0(%rbx),%r10 2534 adoxq %rax,%r11 2535 mulxq 16(%rsi),%r12,%rax 2536 adcxq 8(%rbx),%r11 2537 adoxq %r14,%r12 2538 mulxq 24(%rsi),%r13,%r14 2539 movq %r8,%rdx 2540 adcxq 16(%rbx),%r12 2541 adoxq %rax,%r13 2542 adcxq 24(%rbx),%r13 2543 adoxq %rbp,%r14 2544 leaq 32(%rsi),%rsi 2545 leaq 32(%rbx),%rbx 2546 adcxq %rbp,%r14 2547 2548 adoxq %r15,%r10 2549 mulxq 0(%rcx),%rax,%r15 2550 adcxq %rax,%r10 2551 adoxq %r15,%r11 2552 mulxq 8(%rcx),%rax,%r15 2553 adcxq %rax,%r11 2554 adoxq %r15,%r12 2555 mulxq 16(%rcx),%rax,%r15 2556 movq %r10,-40(%rbx) 2557 adcxq %rax,%r12 2558 adoxq %r15,%r13 2559 movq %r11,-32(%rbx) 2560 mulxq 24(%rcx),%rax,%r15 2561 movq %r9,%rdx 2562 leaq 32(%rcx),%rcx 2563 movq %r12,-24(%rbx) 2564 adcxq %rax,%r13 2565 adoxq %rbp,%r15 2566 movq %r13,-16(%rbx) 2567 2568 decq %rdi 2569 jnz L$mulx4x_inner 2570 2571 movq 0+8(%rsp),%rax 2572 adcq %rbp,%r15 2573 subq 0(%rbx),%rdi 2574 movq 8+8(%rsp),%rdi 2575 movq 16+8(%rsp),%r10 2576 adcq %r15,%r14 2577 leaq (%rsi,%rax,1),%rsi 2578 adcq %rbp,%rbp 2579 movq %r14,-8(%rbx) 2580 2581 cmpq %r10,%rdi 2582 jb L$mulx4x_outer 2583 2584 movq -8(%rcx),%r10 2585 movq %rbp,%r8 2586 movq (%rcx,%rax,1),%r12 2587 leaq (%rcx,%rax,1),%rbp 2588 movq %rax,%rcx 2589 leaq (%rbx,%rax,1),%rdi 2590 xorl %eax,%eax 2591 xorq %r15,%r15 2592 subq %r14,%r10 2593 adcq %r15,%r15 2594 orq %r15,%r8 2595 sarq $3+2,%rcx 2596 subq %r8,%rax 2597 movq 56+8(%rsp),%rdx 2598 decq %r12 2599 movq 8(%rbp),%r13 2600 xorq %r8,%r8 2601 movq 16(%rbp),%r14 2602 movq 24(%rbp),%r15 2603 jmp L$sqrx4x_sub_entry 2604 2605 2606 2607.p2align 5 2608bn_powerx5: 2609 2610 movq %rsp,%rax 2611 2612L$powerx5_enter: 2613 pushq %rbx 2614 2615 pushq %rbp 2616 2617 pushq %r12 2618 2619 pushq %r13 2620 2621 pushq %r14 2622 2623 pushq %r15 2624 2625L$powerx5_prologue: 2626 2627 shll $3,%r9d 2628 leaq (%r9,%r9,2),%r10 2629 negq %r9 2630 movq (%r8),%r8 2631 2632 2633 2634 2635 2636 2637 2638 2639 leaq -320(%rsp,%r9,2),%r11 2640 movq %rsp,%rbp 2641 subq %rdi,%r11 2642 andq $4095,%r11 2643 cmpq %r11,%r10 2644 jb L$pwrx_sp_alt 2645 subq %r11,%rbp 2646 leaq -320(%rbp,%r9,2),%rbp 2647 jmp L$pwrx_sp_done 2648 2649.p2align 5 2650L$pwrx_sp_alt: 2651 leaq 4096-320(,%r9,2),%r10 2652 leaq -320(%rbp,%r9,2),%rbp 2653 subq %r10,%r11 2654 movq $0,%r10 2655 cmovcq %r10,%r11 2656 subq %r11,%rbp 2657L$pwrx_sp_done: 2658 andq $-64,%rbp 2659 movq %rsp,%r11 2660 subq %rbp,%r11 2661 andq $-4096,%r11 2662 leaq (%r11,%rbp,1),%rsp 2663 movq (%rsp),%r10 2664 cmpq %rbp,%rsp 2665 ja L$pwrx_page_walk 2666 jmp L$pwrx_page_walk_done 2667 2668L$pwrx_page_walk: 2669 leaq -4096(%rsp),%rsp 2670 movq (%rsp),%r10 2671 cmpq %rbp,%rsp 2672 ja L$pwrx_page_walk 2673L$pwrx_page_walk_done: 2674 2675 movq %r9,%r10 2676 negq %r9 2677 2678 2679 2680 2681 2682 
2683 2684 2685 2686 2687 2688 2689 pxor %xmm0,%xmm0 2690.byte 102,72,15,110,207 2691.byte 102,72,15,110,209 2692.byte 102,73,15,110,218 2693.byte 102,72,15,110,226 2694 movq %r8,32(%rsp) 2695 movq %rax,40(%rsp) 2696 2697L$powerx5_body: 2698 2699 call __bn_sqrx8x_internal 2700 call __bn_postx4x_internal 2701 call __bn_sqrx8x_internal 2702 call __bn_postx4x_internal 2703 call __bn_sqrx8x_internal 2704 call __bn_postx4x_internal 2705 call __bn_sqrx8x_internal 2706 call __bn_postx4x_internal 2707 call __bn_sqrx8x_internal 2708 call __bn_postx4x_internal 2709 2710 movq %r10,%r9 2711 movq %rsi,%rdi 2712.byte 102,72,15,126,209 2713.byte 102,72,15,126,226 2714 movq 40(%rsp),%rax 2715 2716 call mulx4x_internal 2717 2718 movq 40(%rsp),%rsi 2719 2720 movq $1,%rax 2721 2722 movq -48(%rsi),%r15 2723 2724 movq -40(%rsi),%r14 2725 2726 movq -32(%rsi),%r13 2727 2728 movq -24(%rsi),%r12 2729 2730 movq -16(%rsi),%rbp 2731 2732 movq -8(%rsi),%rbx 2733 2734 leaq (%rsi),%rsp 2735 2736L$powerx5_epilogue: 2737 ret 2738 2739 2740 2741.globl _bn_sqrx8x_internal 2742.private_extern _bn_sqrx8x_internal 2743.private_extern _bn_sqrx8x_internal 2744 2745.p2align 5 2746_bn_sqrx8x_internal: 2747__bn_sqrx8x_internal: 2748 2749_CET_ENDBR 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 leaq 48+8(%rsp),%rdi 2791 leaq (%rsi,%r9,1),%rbp 2792 movq %r9,0+8(%rsp) 2793 movq %rbp,8+8(%rsp) 2794 jmp L$sqr8x_zero_start 2795 2796.p2align 5 2797.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 2798L$sqrx8x_zero: 2799.byte 0x3e 2800 movdqa %xmm0,0(%rdi) 2801 movdqa %xmm0,16(%rdi) 2802 movdqa %xmm0,32(%rdi) 2803 movdqa %xmm0,48(%rdi) 2804L$sqr8x_zero_start: 2805 movdqa %xmm0,64(%rdi) 2806 movdqa %xmm0,80(%rdi) 2807 movdqa %xmm0,96(%rdi) 2808 movdqa %xmm0,112(%rdi) 2809 leaq 128(%rdi),%rdi 2810 subq $64,%r9 2811 jnz L$sqrx8x_zero 2812 2813 movq 0(%rsi),%rdx 2814 2815 xorq %r10,%r10 2816 xorq %r11,%r11 2817 xorq %r12,%r12 2818 xorq %r13,%r13 2819 xorq %r14,%r14 2820 xorq %r15,%r15 2821 leaq 48+8(%rsp),%rdi 2822 xorq %rbp,%rbp 2823 jmp L$sqrx8x_outer_loop 2824 2825.p2align 5 2826L$sqrx8x_outer_loop: 2827 mulxq 8(%rsi),%r8,%rax 2828 adcxq %r9,%r8 2829 adoxq %rax,%r10 2830 mulxq 16(%rsi),%r9,%rax 2831 adcxq %r10,%r9 2832 adoxq %rax,%r11 2833.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 2834 adcxq %r11,%r10 2835 adoxq %rax,%r12 2836.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 2837 adcxq %r12,%r11 2838 adoxq %rax,%r13 2839 mulxq 40(%rsi),%r12,%rax 2840 adcxq %r13,%r12 2841 adoxq %rax,%r14 2842 mulxq 48(%rsi),%r13,%rax 2843 adcxq %r14,%r13 2844 adoxq %r15,%rax 2845 mulxq 56(%rsi),%r14,%r15 2846 movq 8(%rsi),%rdx 2847 adcxq %rax,%r14 2848 adoxq %rbp,%r15 2849 adcq 64(%rdi),%r15 2850 movq %r8,8(%rdi) 2851 movq %r9,16(%rdi) 2852 sbbq %rcx,%rcx 2853 xorq %rbp,%rbp 2854 2855 2856 mulxq 16(%rsi),%r8,%rbx 2857 mulxq 24(%rsi),%r9,%rax 2858 adcxq %r10,%r8 2859 adoxq %rbx,%r9 2860 mulxq 32(%rsi),%r10,%rbx 2861 adcxq %r11,%r9 2862 adoxq %rax,%r10 2863.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 2864 adcxq %r12,%r10 2865 adoxq %rbx,%r11 2866.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 2867 adcxq %r13,%r11 2868 adoxq %r14,%r12 2869.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 2870 movq 16(%rsi),%rdx 2871 adcxq %rax,%r12 2872 adoxq %rbx,%r13 2873 adcxq %r15,%r13 2874 adoxq %rbp,%r14 2875 adcxq %rbp,%r14 2876 2877 movq %r8,24(%rdi) 2878 movq %r9,32(%rdi) 2879 
2880 mulxq 24(%rsi),%r8,%rbx 2881 mulxq 32(%rsi),%r9,%rax 2882 adcxq %r10,%r8 2883 adoxq %rbx,%r9 2884 mulxq 40(%rsi),%r10,%rbx 2885 adcxq %r11,%r9 2886 adoxq %rax,%r10 2887.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 2888 adcxq %r12,%r10 2889 adoxq %r13,%r11 2890.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 2891.byte 0x3e 2892 movq 24(%rsi),%rdx 2893 adcxq %rbx,%r11 2894 adoxq %rax,%r12 2895 adcxq %r14,%r12 2896 movq %r8,40(%rdi) 2897 movq %r9,48(%rdi) 2898 mulxq 32(%rsi),%r8,%rax 2899 adoxq %rbp,%r13 2900 adcxq %rbp,%r13 2901 2902 mulxq 40(%rsi),%r9,%rbx 2903 adcxq %r10,%r8 2904 adoxq %rax,%r9 2905 mulxq 48(%rsi),%r10,%rax 2906 adcxq %r11,%r9 2907 adoxq %r12,%r10 2908 mulxq 56(%rsi),%r11,%r12 2909 movq 32(%rsi),%rdx 2910 movq 40(%rsi),%r14 2911 adcxq %rbx,%r10 2912 adoxq %rax,%r11 2913 movq 48(%rsi),%r15 2914 adcxq %r13,%r11 2915 adoxq %rbp,%r12 2916 adcxq %rbp,%r12 2917 2918 movq %r8,56(%rdi) 2919 movq %r9,64(%rdi) 2920 2921 mulxq %r14,%r9,%rax 2922 movq 56(%rsi),%r8 2923 adcxq %r10,%r9 2924 mulxq %r15,%r10,%rbx 2925 adoxq %rax,%r10 2926 adcxq %r11,%r10 2927 mulxq %r8,%r11,%rax 2928 movq %r14,%rdx 2929 adoxq %rbx,%r11 2930 adcxq %r12,%r11 2931 2932 adcxq %rbp,%rax 2933 2934 mulxq %r15,%r14,%rbx 2935 mulxq %r8,%r12,%r13 2936 movq %r15,%rdx 2937 leaq 64(%rsi),%rsi 2938 adcxq %r14,%r11 2939 adoxq %rbx,%r12 2940 adcxq %rax,%r12 2941 adoxq %rbp,%r13 2942 2943.byte 0x67,0x67 2944 mulxq %r8,%r8,%r14 2945 adcxq %r8,%r13 2946 adcxq %rbp,%r14 2947 2948 cmpq 8+8(%rsp),%rsi 2949 je L$sqrx8x_outer_break 2950 2951 negq %rcx 2952 movq $-8,%rcx 2953 movq %rbp,%r15 2954 movq 64(%rdi),%r8 2955 adcxq 72(%rdi),%r9 2956 adcxq 80(%rdi),%r10 2957 adcxq 88(%rdi),%r11 2958 adcq 96(%rdi),%r12 2959 adcq 104(%rdi),%r13 2960 adcq 112(%rdi),%r14 2961 adcq 120(%rdi),%r15 2962 leaq (%rsi),%rbp 2963 leaq 128(%rdi),%rdi 2964 sbbq %rax,%rax 2965 2966 movq -64(%rsi),%rdx 2967 movq %rax,16+8(%rsp) 2968 movq %rdi,24+8(%rsp) 2969 2970 2971 xorl %eax,%eax 2972 jmp L$sqrx8x_loop 2973 2974.p2align 5 2975L$sqrx8x_loop: 2976 movq %r8,%rbx 2977 mulxq 0(%rbp),%rax,%r8 2978 adcxq %rax,%rbx 2979 adoxq %r9,%r8 2980 2981 mulxq 8(%rbp),%rax,%r9 2982 adcxq %rax,%r8 2983 adoxq %r10,%r9 2984 2985 mulxq 16(%rbp),%rax,%r10 2986 adcxq %rax,%r9 2987 adoxq %r11,%r10 2988 2989 mulxq 24(%rbp),%rax,%r11 2990 adcxq %rax,%r10 2991 adoxq %r12,%r11 2992 2993.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 2994 adcxq %rax,%r11 2995 adoxq %r13,%r12 2996 2997 mulxq 40(%rbp),%rax,%r13 2998 adcxq %rax,%r12 2999 adoxq %r14,%r13 3000 3001 mulxq 48(%rbp),%rax,%r14 3002 movq %rbx,(%rdi,%rcx,8) 3003 movl $0,%ebx 3004 adcxq %rax,%r13 3005 adoxq %r15,%r14 3006 3007.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 3008 movq 8(%rsi,%rcx,8),%rdx 3009 adcxq %rax,%r14 3010 adoxq %rbx,%r15 3011 adcxq %rbx,%r15 3012 3013.byte 0x67 3014 incq %rcx 3015 jnz L$sqrx8x_loop 3016 3017 leaq 64(%rbp),%rbp 3018 movq $-8,%rcx 3019 cmpq 8+8(%rsp),%rbp 3020 je L$sqrx8x_break 3021 3022 subq 16+8(%rsp),%rbx 3023.byte 0x66 3024 movq -64(%rsi),%rdx 3025 adcxq 0(%rdi),%r8 3026 adcxq 8(%rdi),%r9 3027 adcq 16(%rdi),%r10 3028 adcq 24(%rdi),%r11 3029 adcq 32(%rdi),%r12 3030 adcq 40(%rdi),%r13 3031 adcq 48(%rdi),%r14 3032 adcq 56(%rdi),%r15 3033 leaq 64(%rdi),%rdi 3034.byte 0x67 3035 sbbq %rax,%rax 3036 xorl %ebx,%ebx 3037 movq %rax,16+8(%rsp) 3038 jmp L$sqrx8x_loop 3039 3040.p2align 5 3041L$sqrx8x_break: 3042 xorq %rbp,%rbp 3043 subq 16+8(%rsp),%rbx 3044 adcxq %rbp,%r8 3045 movq 24+8(%rsp),%rcx 3046 adcxq %rbp,%r9 3047 movq 0(%rsi),%rdx 3048 adcq $0,%r10 3049 movq %r8,0(%rdi) 
3050 adcq $0,%r11 3051 adcq $0,%r12 3052 adcq $0,%r13 3053 adcq $0,%r14 3054 adcq $0,%r15 3055 cmpq %rcx,%rdi 3056 je L$sqrx8x_outer_loop 3057 3058 movq %r9,8(%rdi) 3059 movq 8(%rcx),%r9 3060 movq %r10,16(%rdi) 3061 movq 16(%rcx),%r10 3062 movq %r11,24(%rdi) 3063 movq 24(%rcx),%r11 3064 movq %r12,32(%rdi) 3065 movq 32(%rcx),%r12 3066 movq %r13,40(%rdi) 3067 movq 40(%rcx),%r13 3068 movq %r14,48(%rdi) 3069 movq 48(%rcx),%r14 3070 movq %r15,56(%rdi) 3071 movq 56(%rcx),%r15 3072 movq %rcx,%rdi 3073 jmp L$sqrx8x_outer_loop 3074 3075.p2align 5 3076L$sqrx8x_outer_break: 3077 movq %r9,72(%rdi) 3078.byte 102,72,15,126,217 3079 movq %r10,80(%rdi) 3080 movq %r11,88(%rdi) 3081 movq %r12,96(%rdi) 3082 movq %r13,104(%rdi) 3083 movq %r14,112(%rdi) 3084 leaq 48+8(%rsp),%rdi 3085 movq (%rsi,%rcx,1),%rdx 3086 3087 movq 8(%rdi),%r11 3088 xorq %r10,%r10 3089 movq 0+8(%rsp),%r9 3090 adoxq %r11,%r11 3091 movq 16(%rdi),%r12 3092 movq 24(%rdi),%r13 3093 3094 3095.p2align 5 3096L$sqrx4x_shift_n_add: 3097 mulxq %rdx,%rax,%rbx 3098 adoxq %r12,%r12 3099 adcxq %r10,%rax 3100.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 3101.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 3102 adoxq %r13,%r13 3103 adcxq %r11,%rbx 3104 movq 40(%rdi),%r11 3105 movq %rax,0(%rdi) 3106 movq %rbx,8(%rdi) 3107 3108 mulxq %rdx,%rax,%rbx 3109 adoxq %r10,%r10 3110 adcxq %r12,%rax 3111 movq 16(%rsi,%rcx,1),%rdx 3112 movq 48(%rdi),%r12 3113 adoxq %r11,%r11 3114 adcxq %r13,%rbx 3115 movq 56(%rdi),%r13 3116 movq %rax,16(%rdi) 3117 movq %rbx,24(%rdi) 3118 3119 mulxq %rdx,%rax,%rbx 3120 adoxq %r12,%r12 3121 adcxq %r10,%rax 3122 movq 24(%rsi,%rcx,1),%rdx 3123 leaq 32(%rcx),%rcx 3124 movq 64(%rdi),%r10 3125 adoxq %r13,%r13 3126 adcxq %r11,%rbx 3127 movq 72(%rdi),%r11 3128 movq %rax,32(%rdi) 3129 movq %rbx,40(%rdi) 3130 3131 mulxq %rdx,%rax,%rbx 3132 adoxq %r10,%r10 3133 adcxq %r12,%rax 3134 jrcxz L$sqrx4x_shift_n_add_break 3135.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 3136 adoxq %r11,%r11 3137 adcxq %r13,%rbx 3138 movq 80(%rdi),%r12 3139 movq 88(%rdi),%r13 3140 movq %rax,48(%rdi) 3141 movq %rbx,56(%rdi) 3142 leaq 64(%rdi),%rdi 3143 nop 3144 jmp L$sqrx4x_shift_n_add 3145 3146.p2align 5 3147L$sqrx4x_shift_n_add_break: 3148 adcxq %r13,%rbx 3149 movq %rax,48(%rdi) 3150 movq %rbx,56(%rdi) 3151 leaq 64(%rdi),%rdi 3152.byte 102,72,15,126,213 3153__bn_sqrx8x_reduction: 3154 xorl %eax,%eax 3155 movq 32+8(%rsp),%rbx 3156 movq 48+8(%rsp),%rdx 3157 leaq -64(%rbp,%r9,1),%rcx 3158 3159 movq %rcx,0+8(%rsp) 3160 movq %rdi,8+8(%rsp) 3161 3162 leaq 48+8(%rsp),%rdi 3163 jmp L$sqrx8x_reduction_loop 3164 3165.p2align 5 3166L$sqrx8x_reduction_loop: 3167 movq 8(%rdi),%r9 3168 movq 16(%rdi),%r10 3169 movq 24(%rdi),%r11 3170 movq 32(%rdi),%r12 3171 movq %rdx,%r8 3172 imulq %rbx,%rdx 3173 movq 40(%rdi),%r13 3174 movq 48(%rdi),%r14 3175 movq 56(%rdi),%r15 3176 movq %rax,24+8(%rsp) 3177 3178 leaq 64(%rdi),%rdi 3179 xorq %rsi,%rsi 3180 movq $-8,%rcx 3181 jmp L$sqrx8x_reduce 3182 3183.p2align 5 3184L$sqrx8x_reduce: 3185 movq %r8,%rbx 3186 mulxq 0(%rbp),%rax,%r8 3187 adcxq %rbx,%rax 3188 adoxq %r9,%r8 3189 3190 mulxq 8(%rbp),%rbx,%r9 3191 adcxq %rbx,%r8 3192 adoxq %r10,%r9 3193 3194 mulxq 16(%rbp),%rbx,%r10 3195 adcxq %rbx,%r9 3196 adoxq %r11,%r10 3197 3198 mulxq 24(%rbp),%rbx,%r11 3199 adcxq %rbx,%r10 3200 adoxq %r12,%r11 3201 3202.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 3203 movq %rdx,%rax 3204 movq %r8,%rdx 3205 adcxq %rbx,%r11 3206 adoxq %r13,%r12 3207 3208 mulxq 32+8(%rsp),%rbx,%rdx 3209 movq %rax,%rdx 3210 movq %rax,64+48+8(%rsp,%rcx,8) 3211 3212 mulxq 40(%rbp),%rax,%r13 
	adcxq %rax,%r12
	adoxq %r14,%r13

	mulxq 48(%rbp),%rax,%r14
	adcxq %rax,%r13
	adoxq %r15,%r14

	mulxq 56(%rbp),%rax,%r15
	movq %rbx,%rdx
	adcxq %rax,%r14
	adoxq %rsi,%r15
	adcxq %rsi,%r15

.byte 0x67,0x67,0x67
	incq %rcx
	jnz L$sqrx8x_reduce

	movq %rsi,%rax
	cmpq 0+8(%rsp),%rbp
	jae L$sqrx8x_no_tail

	movq 48+8(%rsp),%rdx
	addq 0(%rdi),%r8
	leaq 64(%rbp),%rbp
	movq $-8,%rcx
	adcxq 8(%rdi),%r9
	adcxq 16(%rdi),%r10
	adcq 24(%rdi),%r11
	adcq 32(%rdi),%r12
	adcq 40(%rdi),%r13
	adcq 48(%rdi),%r14
	adcq 56(%rdi),%r15
	leaq 64(%rdi),%rdi
	sbbq %rax,%rax

	xorq %rsi,%rsi
	movq %rax,16+8(%rsp)
	jmp L$sqrx8x_tail

.p2align 5
L$sqrx8x_tail:
	movq %r8,%rbx
	mulxq 0(%rbp),%rax,%r8
	adcxq %rax,%rbx
	adoxq %r9,%r8

	mulxq 8(%rbp),%rax,%r9
	adcxq %rax,%r8
	adoxq %r10,%r9

	mulxq 16(%rbp),%rax,%r10
	adcxq %rax,%r9
	adoxq %r11,%r10

	mulxq 24(%rbp),%rax,%r11
	adcxq %rax,%r10
	adoxq %r12,%r11

.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
	adcxq %rax,%r11
	adoxq %r13,%r12

	mulxq 40(%rbp),%rax,%r13
	adcxq %rax,%r12
	adoxq %r14,%r13

	mulxq 48(%rbp),%rax,%r14
	adcxq %rax,%r13
	adoxq %r15,%r14

	mulxq 56(%rbp),%rax,%r15
	movq 72+48+8(%rsp,%rcx,8),%rdx
	adcxq %rax,%r14
	adoxq %rsi,%r15
	movq %rbx,(%rdi,%rcx,8)
	movq %r8,%rbx
	adcxq %rsi,%r15

	incq %rcx
	jnz L$sqrx8x_tail

	cmpq 0+8(%rsp),%rbp
	jae L$sqrx8x_tail_done

	subq 16+8(%rsp),%rsi
	movq 48+8(%rsp),%rdx
	leaq 64(%rbp),%rbp
	adcq 0(%rdi),%r8
	adcq 8(%rdi),%r9
	adcq 16(%rdi),%r10
	adcq 24(%rdi),%r11
	adcq 32(%rdi),%r12
	adcq 40(%rdi),%r13
	adcq 48(%rdi),%r14
	adcq 56(%rdi),%r15
	leaq 64(%rdi),%rdi
	sbbq %rax,%rax
	subq $8,%rcx

	xorq %rsi,%rsi
	movq %rax,16+8(%rsp)
	jmp L$sqrx8x_tail

.p2align 5
L$sqrx8x_tail_done:
	xorq %rax,%rax
	addq 24+8(%rsp),%r8
	adcq $0,%r9
	adcq $0,%r10
	adcq $0,%r11
	adcq $0,%r12
	adcq $0,%r13
	adcq $0,%r14
	adcq $0,%r15
	adcq $0,%rax

	subq 16+8(%rsp),%rsi
L$sqrx8x_no_tail:
	adcq 0(%rdi),%r8
.byte 102,72,15,126,217
	adcq 8(%rdi),%r9
	movq 56(%rbp),%rsi
.byte 102,72,15,126,213
	adcq 16(%rdi),%r10
	adcq 24(%rdi),%r11
	adcq 32(%rdi),%r12
	adcq 40(%rdi),%r13
	adcq 48(%rdi),%r14
	adcq 56(%rdi),%r15
	adcq $0,%rax

	movq 32+8(%rsp),%rbx
	movq 64(%rdi,%rcx,1),%rdx

	movq %r8,0(%rdi)
	leaq 64(%rdi),%r8
	movq %r9,8(%rdi)
	movq %r10,16(%rdi)
	movq %r11,24(%rdi)
	movq %r12,32(%rdi)
	movq %r13,40(%rdi)
	movq %r14,48(%rdi)
	movq %r15,56(%rdi)

	leaq 64(%rdi,%rcx,1),%rdi
	cmpq 8+8(%rsp),%r8
	jb L$sqrx8x_reduction_loop
	ret


.p2align 5

__bn_postx4x_internal:

	movq 0(%rbp),%r12
	movq %rcx,%r10
	movq %rcx,%r9
	negq %rax
	sarq $3+2,%rcx

.byte 102,72,15,126,202
.byte 102,72,15,126,206
	decq %r12
	movq 8(%rbp),%r13
	xorq %r8,%r8
	movq 16(%rbp),%r14
	movq 24(%rbp),%r15
	jmp L$sqrx4x_sub_entry

.p2align 4
L$sqrx4x_sub:
	movq 0(%rbp),%r12
	movq 8(%rbp),%r13
	movq 16(%rbp),%r14
	movq 24(%rbp),%r15
L$sqrx4x_sub_entry:
	andnq %rax,%r12,%r12
	leaq 32(%rbp),%rbp
	andnq %rax,%r13,%r13
	andnq %rax,%r14,%r14
	andnq %rax,%r15,%r15

	negq %r8
	adcq 0(%rdi),%r12
	adcq 8(%rdi),%r13
	adcq 16(%rdi),%r14
	adcq 24(%rdi),%r15
	movq %r12,0(%rdx)
	leaq 32(%rdi),%rdi
	movq %r13,8(%rdx)
	sbbq %r8,%r8
	movq %r14,16(%rdx)
	movq %r15,24(%rdx)
	leaq 32(%rdx),%rdx

	incq %rcx
	jnz L$sqrx4x_sub

	negq %r9

	ret


.globl _bn_scatter5
.private_extern _bn_scatter5

.p2align 4
_bn_scatter5:

_CET_ENDBR
	cmpl $0,%esi
	jz L$scatter_epilogue









	leaq (%rdx,%rcx,8),%rdx
L$scatter:
	movq (%rdi),%rax
	leaq 8(%rdi),%rdi
	movq %rax,(%rdx)
	leaq 256(%rdx),%rdx
	subl $1,%esi
	jnz L$scatter
L$scatter_epilogue:
	ret



.globl _bn_gather5
.private_extern _bn_gather5

.p2align 5
_bn_gather5:

L$SEH_begin_bn_gather5:
_CET_ENDBR

.byte 0x4c,0x8d,0x14,0x24

.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00
	leaq L$inc(%rip),%rax
	andq $-16,%rsp

	movd %ecx,%xmm5
	movdqa 0(%rax),%xmm0
	movdqa 16(%rax),%xmm1
	leaq 128(%rdx),%r11
	leaq 128(%rsp),%rax

	pshufd $0,%xmm5,%xmm5
	movdqa %xmm1,%xmm4
	movdqa %xmm1,%xmm2
	paddd %xmm0,%xmm1
	pcmpeqd %xmm5,%xmm0
	movdqa %xmm4,%xmm3

	paddd %xmm1,%xmm2
	pcmpeqd %xmm5,%xmm1
	movdqa %xmm0,-128(%rax)
	movdqa %xmm4,%xmm0

	paddd %xmm2,%xmm3
	pcmpeqd %xmm5,%xmm2
	movdqa %xmm1,-112(%rax)
	movdqa %xmm4,%xmm1

	paddd %xmm3,%xmm0
	pcmpeqd %xmm5,%xmm3
	movdqa %xmm2,-96(%rax)
	movdqa %xmm4,%xmm2
	paddd %xmm0,%xmm1
	pcmpeqd %xmm5,%xmm0
	movdqa %xmm3,-80(%rax)
	movdqa %xmm4,%xmm3

	paddd %xmm1,%xmm2
	pcmpeqd %xmm5,%xmm1
	movdqa %xmm0,-64(%rax)
	movdqa %xmm4,%xmm0

	paddd %xmm2,%xmm3
	pcmpeqd %xmm5,%xmm2
	movdqa %xmm1,-48(%rax)
	movdqa %xmm4,%xmm1

	paddd %xmm3,%xmm0
	pcmpeqd %xmm5,%xmm3
	movdqa %xmm2,-32(%rax)
	movdqa %xmm4,%xmm2
	paddd %xmm0,%xmm1
	pcmpeqd %xmm5,%xmm0
	movdqa %xmm3,-16(%rax)
	movdqa %xmm4,%xmm3

	paddd %xmm1,%xmm2
	pcmpeqd %xmm5,%xmm1
	movdqa %xmm0,0(%rax)
	movdqa %xmm4,%xmm0

	paddd %xmm2,%xmm3
	pcmpeqd %xmm5,%xmm2
	movdqa %xmm1,16(%rax)
	movdqa %xmm4,%xmm1

	paddd %xmm3,%xmm0
	pcmpeqd %xmm5,%xmm3
	movdqa %xmm2,32(%rax)
	movdqa %xmm4,%xmm2
	paddd %xmm0,%xmm1
	pcmpeqd %xmm5,%xmm0
	movdqa %xmm3,48(%rax)
	movdqa %xmm4,%xmm3

	paddd %xmm1,%xmm2
	pcmpeqd %xmm5,%xmm1
	movdqa %xmm0,64(%rax)
	movdqa %xmm4,%xmm0

	paddd %xmm2,%xmm3
	pcmpeqd %xmm5,%xmm2
	movdqa %xmm1,80(%rax)
	movdqa %xmm4,%xmm1

	paddd %xmm3,%xmm0
	pcmpeqd %xmm5,%xmm3
	movdqa %xmm2,96(%rax)
	movdqa %xmm4,%xmm2
	movdqa %xmm3,112(%rax)
	jmp L$gather

.p2align 5
L$gather:
	pxor %xmm4,%xmm4
	pxor %xmm5,%xmm5
	movdqa -128(%r11),%xmm0
	movdqa -112(%r11),%xmm1
	movdqa -96(%r11),%xmm2
	pand -128(%rax),%xmm0
	movdqa -80(%r11),%xmm3
	pand -112(%rax),%xmm1
	por %xmm0,%xmm4
	pand -96(%rax),%xmm2
	por %xmm1,%xmm5
	pand -80(%rax),%xmm3
	por %xmm2,%xmm4
	por %xmm3,%xmm5
	movdqa -64(%r11),%xmm0
	movdqa -48(%r11),%xmm1
	movdqa -32(%r11),%xmm2
	pand -64(%rax),%xmm0
	movdqa -16(%r11),%xmm3
	pand -48(%rax),%xmm1
	por %xmm0,%xmm4
	pand -32(%rax),%xmm2
	por %xmm1,%xmm5
	pand -16(%rax),%xmm3
	por %xmm2,%xmm4
	por %xmm3,%xmm5
	movdqa 0(%r11),%xmm0
	movdqa 16(%r11),%xmm1
	movdqa 32(%r11),%xmm2
	pand 0(%rax),%xmm0
	movdqa 48(%r11),%xmm3
	pand 16(%rax),%xmm1
	por %xmm0,%xmm4
	pand 32(%rax),%xmm2
	por %xmm1,%xmm5
	pand 48(%rax),%xmm3
	por %xmm2,%xmm4
	por %xmm3,%xmm5
	movdqa 64(%r11),%xmm0
	movdqa 80(%r11),%xmm1
	movdqa 96(%r11),%xmm2
	pand 64(%rax),%xmm0
	movdqa 112(%r11),%xmm3
	pand 80(%rax),%xmm1
	por %xmm0,%xmm4
	pand 96(%rax),%xmm2
	por %xmm1,%xmm5
	pand 112(%rax),%xmm3
	por %xmm2,%xmm4
	por %xmm3,%xmm5
	por %xmm5,%xmm4
	leaq 256(%r11),%r11

	pshufd $0x4e,%xmm4,%xmm0
	por %xmm4,%xmm0
	movq %xmm0,(%rdi)
	leaq 8(%rdi),%rdi
	subl $1,%esi
	jnz L$gather

	leaq (%r10),%rsp

	ret
L$SEH_end_bn_gather5:


.section __DATA,__const
.p2align 6
L$inc:
.long 0,0, 1,1
.long 2,2, 2,2
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.text
#endif