// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <ring-core/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
.text

// ---------------------------------------------------------------------------
// _bn_mul_mont
//
// The only exported symbol in this file. The identification string at the
// end of the file names the algorithm: Montgomery multiplication for x86_64.
// The labels bn_mul4x_mont, bn_sqr8x_mont and bn_mulx4x_mont further down are
// not exported; within this file they are reached only by jumps to their
// L$..._enter labels from the dispatcher below.
//
// Register usage on entry, as read by the code:
//   %rdi  destination array; the final result is stored here
//   %rsi  first operand array
//   %rdx  second operand array; one 64-bit word is consumed per outer pass
//   %rcx  array that is multiplied by the per-pass factor and subtracted
//         from the result at the end (presumably the modulus - confirm
//         against the C prototype)
//   %r8   pointer to one 64-bit word used to derive the per-pass factor
//         (presumably n0 - confirm against the C prototype)
//   %r9d  number of 64-bit words in each array
//
// Every path returns 1 in %rax and restores %rbx, %rbp, %r12-%r15 and %rsp
// from the values pushed / saved on entry.
// ---------------------------------------------------------------------------

.globl	_bn_mul_mont
.private_extern _bn_mul_mont

.p2align	4
_bn_mul_mont:

// _CET_ENDBR is not defined in this file; presumably a macro from asm_base.h.
_CET_ENDBR
	// Zero-extend the 32-bit word count to 64 bits.
	movl	%r9d,%r9d
	// Keep the entry stack pointer; each path stores it in its frame and
	// uses it in the epilogue to find the pushed registers.
	movq	%rsp,%rax

	// Path selection:
	//   count not a multiple of 4, or count < 8   -> L$mul_enter
	//   operands differ (%rdx != %rsi)            -> L$mul4x_enter
	//   same operand and count a multiple of 8    -> L$sqr8x_enter
	//   otherwise                                 -> L$mul4x_enter
	// %r11d is loaded with the 32-bit word at _OPENSSL_ia32cap_P+8; it is
	// examined at L$mul4x_enter.
	testl	$3,%r9d
	jnz	L$mul_enter
	cmpl	$8,%r9d
	jb	L$mul_enter
	leaq	_OPENSSL_ia32cap_P(%rip),%r11
	movl	8(%r11),%r11d
	cmpq	%rsi,%rdx
	jne	L$mul4x_enter
	testl	$7,%r9d
	jz	L$sqr8x_enter
	jmp	L$mul4x_enter

// ----- Word-at-a-time path --------------------------------------------------
.p2align	4
L$mul_enter:
	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15


	// %r10 = target stack pointer: room for count+2 words below the
	// current %rsp, rounded down to a 1024-byte boundary.
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	// Move %rsp down to %r10 in steps of at most 4096 bytes, reading one
	// word from each step so that every page in between is touched.
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul_page_walk
	jmp	L$mul_page_walk_done

.p2align	4
L$mul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul_page_walk
L$mul_page_walk_done:

	// Save the entry %rsp just above the scratch vector (index count+1).
	movq	%rax,8(%rsp,%r9,8)

L$mul_body:
	// %r12 = second operand pointer (frees %rdx for mulq results),
	// %r8  = the word that %r8 pointed to, %rbx = first word of the
	// second operand, %rax = first word of the first operand.
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	// %r14 = outer index, %r15 = inner index.
	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	// %rbp = per-pass factor = low word of the product times the %r8 word.
	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$1st_enter

// First pass: the scratch vector at (%rsp) is written, not accumulated into.
.p2align	4
L$1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

L$1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	L$1st

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	// Store the two top words of the pass (indices count-1 and count).
	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	L$outer
// Remaining passes: same shape as the first pass, but each step also adds the
// word previously stored in the scratch vector.
.p2align	4
L$outer:
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$inner_enter

.p2align	4
L$inner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

L$inner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	L$inner

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	L$outer

	// The xorq leaves CF clear, so the first sbbq below is a plain
	// subtract; leaq, movq and decq in the loop do not change CF.
	xorq	%r14,%r14
	movq	(%rsp),%rax
	movq	%r9,%r15

// Store (scratch vector - %rcx array) into the destination, word by word.
.p2align	4
L$sub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsp,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	L$sub

	// %rax = top scratch word minus the final borrow, used as a mask;
	// %rbx = its complement.
	sbbq	$0,%rax
	movq	$-1,%rbx
	xorq	%rax,%rbx
	xorq	%r14,%r14
	movq	%r9,%r15

// Branch-free select per word: destination = (difference AND %rbx) OR
// (scratch AND %rax). Each scratch word is overwritten with %r9 afterwards.
L$copy:
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r9,(%rsp,%r14,8)
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	L$copy

	// %rsi = entry %rsp saved above; the six pushed registers sit just
	// below it.
	movq	8(%rsp,%r9,8),%rsi

	movq	$1,%rax
	movq	-48(%rsi),%r15

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$mul_epilogue:
	ret



// ----- Four-words-per-iteration path ---------------------------------------
// Within this file control arrives at L$mul4x_enter with %rax = entry %rsp
// and %r11d = the capability word loaded by the dispatcher; nothing in this
// file refers to the bn_mul4x_mont label itself.
.p2align	4
bn_mul4x_mont:

	movl	%r9d,%r9d
	movq	%rsp,%rax

L$mul4x_enter:
	// If both bits of mask 0x80100 are set in the capability word, use the
	// mulx/adcx/adox path instead (presumably the BMI2 and ADX feature
	// bits - confirm against the OPENSSL_ia32cap_P layout).
	andl	$0x80100,%r11d
	cmpl	$0x80100,%r11d
	je	L$mulx4x_enter
	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15


	// Frame: count+4 words below %rsp, rounded down to 1024 bytes, then
	// the same page-by-page probe as in the word-at-a-time path.
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul4x_page_walk
	jmp	L$mul4x_page_walk_done

L$mul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul4x_page_walk
L$mul4x_page_walk_done:

	// Entry %rsp goes to scratch index count+1.
	movq	%rax,8(%rsp,%r9,8)

L$mul4x_body:
	// The destination pointer goes to scratch index count+2 because %rdi
	// is used as an accumulator below.
	movq	%rdi,16(%rsp,%r9,8)
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	L$1st4x
// First pass, four scratch words written per iteration.
.p2align	4
L$1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	L$1st4x

	// Tail of the first pass: the last two word positions plus the carry.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	leaq	1(%r14),%r14
// Remaining passes: as above, with the stored scratch words added in.
.p2align	2
L$outer4x:
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	(%rsp),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	L$inner4x
.p2align	4
L$inner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	L$inner4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	// Fold in the carry word left at index count by the previous pass.
	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	L$outer4x
	// Reload the destination pointer saved at L$mul4x_body and set up the
	// subtraction: %r15 = (count-4)/4 loop iterations, %rsi = scratch base.
	movq	16(%rsp,%r9,8),%rdi
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdx
	shrq	$2,%r15
	leaq	(%rsp),%rsi
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx

// Destination = scratch - %rcx array, four words per iteration; the borrow
// is carried across iterations in CF.
L$sub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	L$sub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	// %rax = top scratch word minus the final borrow; it is broadcast to
	// %xmm4 as the select mask, and %xmm5 = its complement.
	sbbq	$0,%rax
	movq	%rbp,24(%rdi,%r14,8)
	pxor	%xmm0,%xmm0
// Hand-encoded: movq %rax,%xmm4
.byte	102,72,15,110,224
	pcmpeqd	%xmm5,%xmm5
	pshufd	$0,%xmm4,%xmm4
	movq	%r9,%r15
	pxor	%xmm4,%xmm5
	shrq	$2,%r15
	xorl	%eax,%eax

	jmp	L$copy4x
// Branch-free select, 32 bytes per iteration: destination = (scratch AND
// %xmm4) OR (difference AND %xmm5); the scratch words are zeroed as they
// are read.
.p2align	4
L$copy4x:
	movdqa	(%rsp,%rax,1),%xmm1
	movdqu	(%rdi,%rax,1),%xmm2
	pand	%xmm4,%xmm1
	pand	%xmm5,%xmm2
	movdqa	16(%rsp,%rax,1),%xmm3
	movdqa	%xmm0,(%rsp,%rax,1)
	por	%xmm2,%xmm1
	movdqu	16(%rdi,%rax,1),%xmm2
	movdqu	%xmm1,(%rdi,%rax,1)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax,1)
	por	%xmm2,%xmm3
	movdqu	%xmm3,16(%rdi,%rax,1)
	leaq	32(%rax),%rax
	decq	%r15
	jnz	L$copy4x
	movq	8(%rsp,%r9,8),%rsi

	movq	$1,%rax
	movq	-48(%rsi),%r15

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$mul4x_epilogue:
	ret





// ----- Squaring path --------------------------------------------------------
// Reached at L$sqr8x_enter when both operand pointers are equal and the word
// count is a multiple of 8. The multiply/reduce work is done by
// _bn_sqrx8x_internal or _bn_sqr8x_internal, neither of which is defined in
// this file; the register contents they leave behind (used by the two
// fix-up sequences after the calls) cannot be verified from here.
.p2align	5
bn_sqr8x_mont:

	movq	%rsp,%rax

L$sqr8x_enter:
	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

L$sqr8x_prologue:

	// %r9 = -(count*8) bytes, %r10 = count*32.
	movl	%r9d,%r10d
	shll	$3,%r9d
	shlq	$3+2,%r10
	negq	%r9

	// Choose the frame address from the distance between the candidate
	// frame and the operand at %rsi, taken modulo 4096.
	// NOTE(review): presumably this avoids the frame and the input
	// aliasing at the same page offset - confirm with the Perl source.
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$sqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp
	jmp	L$sqr8x_sp_done

.p2align	5
L$sqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
L$sqr8x_sp_done:
	// Align the frame to 64 bytes, then probe each page on the way down.
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$sqr8x_page_walk
	jmp	L$sqr8x_page_walk_done

.p2align	4
L$sqr8x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$sqr8x_page_walk
L$sqr8x_page_walk_done:

	// %r10 = -(count*8), %r9 = count*8.
	movq	%r9,%r10
	negq	%r9

	// Frame slots: 32(%rsp) = the word loaded from (%r8),
	// 40(%rsp) = entry %rsp.
	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)

L$sqr8x_body:

// Hand-encoded: movq %rcx,%xmm2
.byte	102,72,15,110,209
	pxor	%xmm0,%xmm0
// Hand-encoded: movq %rdi,%xmm1  (destination pointer kept across the call)
.byte	102,72,15,110,207
// Hand-encoded: movq %r10,%xmm3
.byte	102,73,15,110,218
	// Same capability test as at L$mul4x_enter, choosing between the two
	// external squaring routines.
	leaq	_OPENSSL_ia32cap_P(%rip),%rax
	movl	8(%rax),%eax
	andl	$0x80100,%eax
	cmpl	$0x80100,%eax
	jne	L$sqr8x_nox

	call	_bn_sqrx8x_internal




	leaq	(%r8,%rcx,1),%rbx
	movq	%rcx,%r9
	movq	%rcx,%rdx
// Hand-encoded: movq %xmm1,%rdi  (restore the destination pointer)
.byte	102,72,15,126,207
	sarq	$3+2,%rcx
	jmp	L$sqr8x_sub

.p2align	5
L$sqr8x_nox:
	call	_bn_sqr8x_internal




	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
// Hand-encoded: movq %xmm1,%rdi  (restore the destination pointer)
.byte	102,72,15,126,207
	sarq	$3+2,%rcx
	jmp	L$sqr8x_sub

// Destination = words at %rbx minus words at %rbp, four words per iteration.
// %rcx counts up towards zero.
.p2align	5
L$sqr8x_sub:
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	L$sqr8x_sub

	// Fold the final borrow into %rax to form the select mask, then step
	// %rbx and %rdi back by %r9 to the start of their arrays.
	sbbq	$0,%rax
	leaq	(%rbx,%r9,1),%rbx
	leaq	(%rdi,%r9,1),%rdi

// Hand-encoded: movq %rax,%xmm1
.byte	102,72,15,110,200
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1
	movq	40(%rsp),%rsi

	jmp	L$sqr8x_cond_copy

// Branch-free select, 32 bytes per iteration: destination = (scratch AND
// mask) OR (difference AND NOT mask). The scratch words just read, and the
// words %rdx bytes away from them, are overwritten with zero.
.p2align	5
L$sqr8x_cond_copy:
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	L$sqr8x_cond_copy

	movq	$1,%rax
	movq	-48(%rsi),%r15

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$sqr8x_epilogue:
	ret



// ----- mulx/adcx/adox path --------------------------------------------------
// Reached at L$mulx4x_enter from L$mul4x_enter when the capability test
// there succeeds. mulx takes its implicit multiplicand from %rdx, so %rdx is
// switched between %r9 (current word of the second operand) and %r8 (the
// per-pass factor) throughout.
.p2align	5
bn_mulx4x_mont:

	movq	%rsp,%rax

L$mulx4x_enter:
	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

L$mulx4x_prologue:

	// %r9 = count*8 bytes, %r10 = -%r9; frame of 72 + count*8 bytes,
	// aligned down to 128, then probed page by page.
	shll	$3,%r9d
	xorq	%r10,%r10
	subq	%r9,%r10
	movq	(%r8),%r8
	leaq	-72(%rsp,%r10,1),%rbp
	andq	$-128,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$mulx4x_page_walk
	jmp	L$mulx4x_page_walk_done

.p2align	4
L$mulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$mulx4x_page_walk
L$mulx4x_page_walk_done:

	// %r10 = one past the end of the second operand.
	leaq	(%rdx,%r9,1),%r10

	// Frame slots written here and at L$mulx4x_body:
	//    0(%rsp)  count*8
	//    8(%rsp)  pointer to the next word of the second operand
	//   16(%rsp)  end of the second operand (outer-loop bound)
	//   24(%rsp)  the word loaded from (%r8)
	//   32(%rsp)  destination pointer
	//   40(%rsp)  entry %rsp
	//   48(%rsp)  inner-loop iteration count, (count*8 >> 5) - 1
	//   64(%rsp)  start of the scratch vector
	movq	%r9,0(%rsp)
	shrq	$5,%r9
	movq	%r10,16(%rsp)
	subq	$1,%r9
	movq	%r8,24(%rsp)
	movq	%rdi,32(%rsp)
	movq	%rax,40(%rsp)

	movq	%r9,48(%rsp)
	jmp	L$mulx4x_body

.p2align	5
L$mulx4x_body:
	leaq	8(%rdx),%rdi
	movq	(%rdx),%rdx
	leaq	64+32(%rsp),%rbx
	movq	%rdx,%r9

	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r14
	addq	%rax,%r11
	movq	%rdi,8(%rsp)
	mulxq	16(%rsi),%r12,%r13
	adcq	%r14,%r12
	adcq	$0,%r13

	// %r8 = per-pass factor; xorq also clears CF and OF for the
	// adcx/adox chains that follow.
	movq	%r8,%rdi
	imulq	24(%rsp),%r8
	xorq	%rbp,%rbp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%rdi
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
// Hand-encoded: mulxq 16(%rcx),%rax,%r12 with a 32-bit displacement
// (the same instruction appears in plain form in L$mulx4x_outer).
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
	movq	48(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	L$mulx4x_1st

// First pass, four words per iteration; %rdi counts iterations down.
.p2align	5
L$mulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
// Two address-size prefixes applied to the following movq; presumably
// padding - confirm with the Perl source.
.byte	0x67,0x67
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	L$mulx4x_1st

	// %rax = count*8, %rdi = next word of the second operand; %r15 ends
	// up as 0 or -1 according to the carry out of the top word.
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdi
	adcq	%rbp,%r15
	addq	%r15,%r14
	sbbq	%r15,%r15
	movq	%r14,-8(%rbx)
	jmp	L$mulx4x_outer

// Remaining passes: rewind %rsi and %rcx by count*8 bytes, store the carry
// word from the previous pass, and accumulate into the scratch vector.
.p2align	5
L$mulx4x_outer:
	movq	(%rdi),%rdx
	leaq	8(%rdi),%rdi
	subq	%rax,%rsi
	movq	%r15,(%rbx)
	leaq	64+32(%rsp),%rbx
	subq	%rax,%rcx

	mulxq	0(%rsi),%r8,%r11
	xorl	%ebp,%ebp
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	-16(%rbx),%r12
	adcxq	%rbp,%r13
	adoxq	%rbp,%r13

	movq	%rdi,8(%rsp)
	movq	%r8,%r15
	imulq	24(%rsp),%r8
	xorl	%ebp,%ebp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	movq	48(%rsp),%rdi
	movq	%r12,-16(%rbx)

	jmp	L$mulx4x_inner

.p2align	5
L$mulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	L$mulx4x_inner

	// %rbp is zero here, so the subq sets CF when the stored carry word
	// at 0(%rbx) is non-zero; that carry is then added into the top word.
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdi
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp
	adcq	%r15,%r14
	sbbq	%r15,%r15
	movq	%r14,-8(%rbx)

	// Loop until the second-operand pointer reaches the bound saved at
	// 16(%rsp).
	cmpq	16(%rsp),%rdi
	jne	L$mulx4x_outer

	// Set up the final subtraction: %rbx = scratch base, %rcx rewound,
	// %r15 = carry as 0/1, %rdx = count*8, %rax = count/4 iterations,
	// %rdi = destination pointer.
	leaq	64(%rsp),%rbx
	subq	%rax,%rcx
	negq	%r15
	movq	%rax,%rdx
	shrq	$3+2,%rax
	movq	32(%rsp),%rdi
	jmp	L$mulx4x_sub

// Destination = scratch - %rcx array, four words per iteration.
.p2align	5
L$mulx4x_sub:
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	L$mulx4x_sub

	// %r15 = carry minus final borrow, used as the select mask; rewind
	// %rbx and %rdi to the start of their arrays.
	sbbq	$0,%r15
	leaq	64(%rsp),%rbx
	subq	%rdx,%rdi

// Hand-encoded: movq %r15,%xmm1
.byte	102,73,15,110,207
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1
	movq	40(%rsp),%rsi

	jmp	L$mulx4x_cond_copy

// Branch-free select, 32 bytes per iteration: destination = (scratch AND
// mask) OR (difference AND NOT mask); scratch words are zeroed as they are
// read.
.p2align	5
L$mulx4x_cond_copy:
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx
	jnz	L$mulx4x_cond_copy

	// %rdx is zero when the loop exits; this clears the carry word that
	// follows the scratch vector.
	movq	%rdx,(%rbx)

	movq	$1,%rax
	movq	-48(%rsi),%r15

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$mulx4x_epilogue:
	ret


// NUL-terminated ASCII identification string:
// "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align	4
#endif