// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
//
// NOTE(review): comments below were added during review; every instruction,
// label and directive is unchanged.  This is CRYPTOGAMS-style x86-64
// Montgomery multiplication for 64-bit limb bignums (ELF flavour).  The code
// is deliberately free of secret-dependent branches and memory accesses; the
// final modular-correction steps select between values with arithmetic masks
// rather than branches (constant-time).

#include <ring-core/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
.text

.extern	OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

// ----------------------------------------------------------------------------
// bn_mul_mont(rp, ap, bp, np, n0p, num)  -- SysV AMD64 ABI
//   %rdi = rp    result, num 64-bit limbs
//   %rsi = ap    first multiplicand
//   %rdx = bp    second multiplicand
//   %rcx = np    modulus
//   %r8  = n0p   pointer to one limb n0 = -np^{-1} mod 2^64 (Montgomery
//                constant; this code loads (%r8) before use)
//   %r9d = num   number of 64-bit limbs
// Computes the Montgomery product rp = ap*bp*2^{-64*num} mod np and returns
// 1 in %rax.  The entry point only dispatches:
//   .Lmul_enter     generic 1x1 schoolbook loop (any num)
//   .Lmul4x_enter   4-way unrolled loop       (num % 4 == 0 and num >= 8)
//   .Lsqr8x_enter   dedicated squaring        (ap == bp and num % 8 == 0)
//   .Lmulx4x_enter  MULX/ADX variant, selected inside bn_mul4x_mont when the
//                   capability bits 0x80100 (see below) are present
// ----------------------------------------------------------------------------
.globl	bn_mul_mont
.hidden bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
.cfi_startproc
_CET_ENDBR
	movl	%r9d,%r9d		// zero-extend num into full %r9
	movq	%rsp,%rax		// keep caller's %rsp for the epilogue
.cfi_def_cfa_register	%rax
	testl	$3,%r9d
	jnz	.Lmul_enter		// num not a multiple of 4 -> generic path
	cmpl	$8,%r9d
	jb	.Lmul_enter		// num < 8 -> generic path
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	movl	8(%r11),%r11d		// third 32-bit capability word; consumed
					// by bn_mul4x_mont at .Lmul4x_enter
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter		// ap != bp -> 4-way multiply
	testl	$7,%r9d
	jz	.Lsqr8x_enter		// ap == bp, num % 8 == 0 -> squaring
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	pushq	%rbx			// save all callee-saved registers
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	// Carve out a (num+2)-limb scratch vector tp[] below the stack,
	// 1024-byte aligned; the extra slots hold the carry limb and the
	// saved original %rsp.
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10	// %r10 = candidate bottom of frame
	negq	%r9
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		// touch the page (stack probe)
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

// Touch every 4K page between the old and the new %rsp so the OS guard page
// is never skipped over.
.align	16
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	// tp[num+1] = caller's %rsp
// DW_CFA_def_cfa_expression: CFA = *(%rsp + num*8 + 8), i.e. the slot above.
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
	// Register roles in the generic loop:
	//   %r12 = bp, %rbx = bp[i], %rbp = m = (tp[0]+ap[0]*bp[i])*n0,
	//   %r14 = outer index i, %r15 = inner index j,
	//   %r10/%r11/%r13 = running product/carry chain, tp[] at (%rsp).
	movq	%rdx,%r12
	movq	(%r8),%r8		// %r8 = n0 value
	movq	(%r12),%rbx		// %rbx = bp[0]
	movq	(%rsi),%rax		// %rax = ap[0]

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx			// ap[0]*bp[0]
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		// m = lo(ap[0]*bp[0]) * n0
	movq	%rdx,%r11

	mulq	%rbp			// np[0]*m
	addq	%rax,%r10		// discard (becomes zero by construction)
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

// First pass (i == 0): tp[] = ap*bp[0] + np*m, shifted one limb.
.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx			// ap[j]*bp[0]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp			// np[j]*m
	cmpq	%r9,%r15
	jne	.L1st

	addq	%rax,%r13
	movq	(%rsi),%rax		// reload ap[0] for next outer pass
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	// tp[num-1]
	movq	%rdx,(%rsp,%r9,8)	// tp[num] = top carry

	leaq	1(%r14),%r14
	jmp	.Louter
// Outer loop over i = 1..num-1: tp = (tp + ap*bp[i] + np*m) >> 64.
.align	16
.Louter:
	movq	(%r12,%r14,8),%rbx	// %rbx = bp[i]
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10		// tp[0]
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		// m = (tp[0]+ap[0]*bp[i]) * n0
	movq	%rdx,%r11

	mulq	%rbp			// np[0]*m
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10		// tp[1]
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx			// ap[j]*bp[i]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp			// np[j]*m
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13		// fold in previous top carry
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	// rp = tp - np; the xor clears CF so the first sbb is a plain sub.
	xorq	%r14,%r14
	movq	(%rsp),%rax
	movq	%r9,%r15

.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)	// rp[i] = tp[i] - np[i] - borrow
	movq	8(%rsp,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax			// %rax = 0 if tp >= np, else all-ones
	movq	$-1,%rbx
	xorq	%rax,%rbx		// %rbx = ~%rax (complementary mask)
	xorq	%r14,%r14
	movq	%r9,%r15

// Constant-time select: rp[i] = (rp[i] & ~mask) | (tp[i] & mask), and wipe
// tp[] by overwriting each slot (with the junk value in %r9).
.Lcopy:
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r9,(%rsp,%r14,8)
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	// recover caller's %rsp
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			// return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont

// ----------------------------------------------------------------------------
// bn_mul4x_mont -- same contract as bn_mul_mont, 4-way unrolled.
// Only reached via bn_mul_mont's dispatcher, which guarantees num % 4 == 0
// and num >= 8 and leaves the CPU capability word in %r11d.
// ----------------------------------------------------------------------------
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.cfi_startproc
	movl	%r9d,%r9d
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	// 0x80100 = BMI2 (MULX) + ADX (ADCX/ADOX) capability bits; if both
	// are present take the faster mulx path below.
	andl	$0x80100,%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx4x_enter
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	// Scratch frame: num+2 limbs of tp[] plus a saved rp slot, aligned.
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		// stack probe
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:			// probe pages one at a time (guard page)
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	// save caller's %rsp above tp[]
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul4x_body:
	// Register roles: %r12 = bp, %rbx = bp[i], %rbp = m, %r14 = i,
	// %r15 = j (steps of 4); %rdi is reused as a carry limb inside the
	// loops, so rp is parked in the frame here.
	movq	%rdi,16(%rsp,%r9,8)	// save rp
	movq	%rdx,%r12
	movq	(%r8),%r8		// n0
	movq	(%r12),%rbx		// bp[0]
	movq	(%rsi),%rax		// ap[0]

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx			// ap[0]*bp[0]
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		// m = lo * n0
	movq	%rdx,%r11

	mulq	%rbp			// np[0]*m
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.L1st4x
// First pass, four limbs per iteration: tp[] = ap*bp[0] + np*m.
.align	16
.L1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	// Tail: last two limbs of the first pass, then store the top carry.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		// ap[0] for the next outer pass
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	// tp[num] = top carry

	leaq	1(%r14),%r14
// Outer loop over bp[i], i = 1..num-1.
.align	4
.Louter4x:
	movq	(%r12,%r14,8),%rbx	// bp[i]
	xorq	%r15,%r15
	movq	(%rsp),%r10		// tp[0]
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		// m
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11		// + tp[1]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.Linner4x
// Inner loop: tp = (tp + ap*bp[i] + np*m) >> 64, four limbs at a time.
.align	16
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	// Inner-loop tail plus carry fold-in.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14		// i++
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13	// + previous top carry tp[num]
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	.Louter4x
	// Conditional subtraction rp = tp - np, 4 limbs per iteration, with
	// loads pipelined one group ahead.
	movq	16(%rsp,%r9,8),%rdi	// restore rp
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdx
	shrq	$2,%r15			// %r15 = num/4 - 1
	leaq	(%rsp),%rsi		// %rsi = tp
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx

.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax	// %rax = tp[num] (carry limb)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	sbbq	$0,%rax			// %rax = 0 if tp >= np, else all-ones
	movq	%rbp,24(%rdi,%r14,8)
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,224		// movq %rax,%xmm4 (raw-byte encoding)
	pcmpeqd	%xmm5,%xmm5
	pshufd	$0,%xmm4,%xmm4		// broadcast mask
	movq	%r9,%r15
	pxor	%xmm4,%xmm5		// %xmm5 = ~mask
	shrq	$2,%r15
	xorl	%eax,%eax

	jmp	.Lcopy4x
// Constant-time select between rp (the difference) and tp, 32 bytes per
// iteration; tp[] is wiped with zeros as it is consumed.
.align	16
.Lcopy4x:
	movdqa	(%rsp,%rax,1),%xmm1
	movdqu	(%rdi,%rax,1),%xmm2
	pand	%xmm4,%xmm1
	pand	%xmm5,%xmm2
	movdqa	16(%rsp,%rax,1),%xmm3
	movdqa	%xmm0,(%rsp,%rax,1)
	por	%xmm2,%xmm1
	movdqu	16(%rdi,%rax,1),%xmm2
	movdqu	%xmm1,(%rdi,%rax,1)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax,1)
	por	%xmm2,%xmm3
	movdqu	%xmm3,16(%rdi,%rax,1)
	leaq	32(%rax),%rax
	decq	%r15
	jnz	.Lcopy4x
	movq	8(%rsp,%r9,8),%rsi	// recover caller's %rsp
.cfi_def_cfa	%rsi, 8
	movq	$1,%rax			// return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
.extern	bn_sqrx8x_internal
.hidden bn_sqrx8x_internal
.extern	bn_sqr8x_internal
.hidden bn_sqr8x_internal

// ----------------------------------------------------------------------------
// bn_sqr8x_mont -- squaring front end (ap == bp, num % 8 == 0, per the
// dispatcher).  Allocates the frame, calls bn_sqr8x_internal or (with
// BMI2+ADX) bn_sqrx8x_internal -- both defined elsewhere -- then performs the
// final conditional subtraction and constant-time copy here.  Register
// expectations after those calls follow the internals' contracts (external
// to this file).
// ----------------------------------------------------------------------------
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
.cfi_startproc
	movq	%rsp,%rax		// keep caller's %rsp
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lsqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d			// %r9 = num*8 (bytes)
	shlq	$3+2,%r10		// %r10 = num*32
	negq	%r9			// %r9 = -num*8

	// Choose a frame base so the scratch area's 4K offset does not
	// collide with ap's page offset (cache/TLB aliasing avoidance).
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8		// n0
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp	// room for 2*num limbs
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lsqr8x_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		// stack probe
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:			// probe pages one at a time
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	movq	%r9,%r10		// %r10 = -num*8
	negq	%r9			// %r9 = +num*8

	movq	%r8,32(%rsp)		// frame: 32 = n0, 40 = caller's %rsp
	movq	%rax,40(%rsp)
// DW_CFA_def_cfa_expression: CFA = *(%rsp + 40).
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lsqr8x_body:

.byte	102,72,15,110,209		// movq %rcx,%xmm2   (np)
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		// movq %rdi,%xmm1   (rp)
.byte	102,73,15,110,218		// movq %r10,%xmm3   (-num*8)
	leaq	OPENSSL_ia32cap_P(%rip),%rax
	movl	8(%rax),%eax
	andl	$0x80100,%eax		// BMI2 + ADX?
	cmpl	$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	// MULX/ADX squaring body (external)

	leaq	(%r8,%rcx,1),%rbx
	movq	%rcx,%r9
	movq	%rcx,%rdx
.byte	102,72,15,126,207		// movq %xmm1,%rdi   (restore rp)
	sarq	$3+2,%rcx		// iteration count for .Lsqr8x_sub
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
	call	bn_sqr8x_internal	// classic squaring body (external)

	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207		// movq %xmm1,%rdi   (restore rp)
	sarq	$3+2,%rcx
	jmp	.Lsqr8x_sub

// rp = tp - np, 4 limbs per iteration (first sbb's CF state is set up by the
// internals' tail); %rcx counts up to zero.
.align	32
.Lsqr8x_sub:
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	.Lsqr8x_sub

	sbbq	$0,%rax			// %rax = borrow mask (0 or all-ones)
	leaq	(%rbx,%r9,1),%rbx	// rewind tp
	leaq	(%rdi,%r9,1),%rdi	// rewind rp

.byte	102,72,15,110,200		// movq %rax,%xmm1   (select mask)
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		// broadcast mask
	movq	40(%rsp),%rsi		// recover caller's %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

// Constant-time select between tp and rp-np, wiping tp as it goes;
// %r9 counts from -num*8 up to zero in steps of 32 bytes.
.align	32
.Lsqr8x_cond_copy:
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)		// wipe tp
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	.Lsqr8x_cond_copy

	movq	$1,%rax			// return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont

// ----------------------------------------------------------------------------
// bn_mulx4x_mont -- MULX/ADX variant of the 4-way multiply; reached from
// .Lmul4x_enter when capability bits 0x80100 (BMI2 + ADX) are set.  Uses two
// independent carry chains (CF via adcx, OF via adox).  Frame layout:
//   0(%rsp)=num*8  8=&bp[i]  16=end of bp  24=n0  32=rp
//   40=caller's %rsp  48=inner-loop counter (num/4 - 1); tp[] at 64(%rsp).
// ----------------------------------------------------------------------------
.type	bn_mulx4x_mont,@function
.align	32
bn_mulx4x_mont:
.cfi_startproc
	movq	%rsp,%rax		// keep caller's %rsp
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d			// %r9 = num*8
	xorq	%r10,%r10
	subq	%r9,%r10		// %r10 = -num*8
	movq	(%r8),%r8		// n0
	leaq	-72(%rsp,%r10,1),%rbp	// frame header + tp[], 128-aligned
	andq	$-128,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		// stack probe
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:			// probe pages one at a time
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	leaq	(%rdx,%r9,1),%r10	// %r10 = &bp[num] (end sentinel)

	movq	%r9,0(%rsp)		// num*8
	shrq	$5,%r9
	movq	%r10,16(%rsp)		// end of bp
	subq	$1,%r9
	movq	%r8,24(%rsp)		// n0
	movq	%rdi,32(%rsp)		// rp
	movq	%rax,40(%rsp)		// caller's %rsp
// DW_CFA_def_cfa_expression: CFA = *(%rsp + 40).
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
	movq	%r9,48(%rsp)		// inner counter = num/4 - 1
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
	// First outer pass (bp[0]): %rdx is the implicit mulx multiplier,
	// %r9 shadows bp[i], %rbp is the zero/carry-dump register.
	leaq	8(%rdx),%rdi		// %rdi = &bp[1]
	movq	(%rdx),%rdx		// %rdx = bp[0]
	leaq	64+32(%rsp),%rbx	// %rbx -> tp write pointer
	movq	%rdx,%r9

	mulxq	0(%rsi),%r8,%rax	// ap[0]*bp[0]
	mulxq	8(%rsi),%r11,%r14
	addq	%rax,%r11
	movq	%rdi,8(%rsp)		// save &bp[1]
	mulxq	16(%rsi),%r12,%r13
	adcq	%r14,%r12
	adcq	$0,%r13

	movq	%r8,%rdi
	imulq	24(%rsp),%r8		// m = lo * n0
	xorq	%rbp,%rbp		// clear CF and OF, %rbp = 0

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx		// switch multiplier to m
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10	// np[0]*m
	adcxq	%rax,%rdi		// low limb cancels
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	// mulxq 16(%rcx),%rax,%r12
	movq	48(%rsp),%rdi		// load inner counter
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		// back to bp[0]
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_1st

// First pass: tp[] = ap*bp[0] + np*m, four limbs per iteration, CF and OF
// chains interleaved.
.align	32
.Lmulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67			// two addr-size prefixes: harmless
					// padding (code-alignment tweak)
	movq	%r8,%rdx		// multiplier = m
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		// multiplier = bp[0]
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	0(%rsp),%rax		// num*8
	movq	8(%rsp),%rdi		// &bp[i]
	adcq	%rbp,%r15
	addq	%r15,%r14
	sbbq	%r15,%r15		// top-of-column borrow mask
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

// Outer loop over the remaining bp[i]; accumulates into the tp[] written by
// the previous pass (the adox of -32(%rbx).. slots).
.align	32
.Lmulx4x_outer:
	movq	(%rdi),%rdx		// %rdx = bp[i]
	leaq	8(%rdi),%rdi
	subq	%rax,%rsi		// rewind ap
	movq	%r15,(%rbx)		// store top carry
	leaq	64+32(%rsp),%rbx
	subq	%rax,%rcx		// rewind np

	mulxq	0(%rsi),%r8,%r11
	xorl	%ebp,%ebp		// clear CF/OF, %rbp = 0
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8		// + tp[0]
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	-16(%rbx),%r12
	adcxq	%rbp,%r13
	adoxq	%rbp,%r13

	movq	%rdi,8(%rsp)		// save bp cursor
	movq	%r8,%r15
	imulq	24(%rsp),%r8		// m
	xorl	%ebp,%ebp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx		// multiplier = m
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15		// low limb cancels
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		// multiplier = bp[i]
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	movq	48(%rsp),%rdi		// reload inner counter
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_inner

// Inner loop: tp = tp + ap*bp[i] + np*m, four limbs per iteration.
.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx		// multiplier = m
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		// multiplier = bp[i]
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0(%rsp),%rax		// num*8
	movq	8(%rsp),%rdi		// bp cursor
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp		// pull CF from stored top carry
	adcq	%r15,%r14
	sbbq	%r15,%r15		// new top borrow mask
	movq	%r14,-8(%rbx)

	cmpq	16(%rsp),%rdi		// reached end of bp?
	jne	.Lmulx4x_outer

	// Conditional subtraction rp = tp - np; %r15 seeds the initial CF.
	leaq	64(%rsp),%rbx		// tp
	subq	%rax,%rcx		// rewind np
	negq	%r15			// set CF from the top borrow
	movq	%rax,%rdx
	shrq	$3+2,%rax		// %rax = num/4 iteration count
	movq	32(%rsp),%rdi		// rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	.Lmulx4x_sub

	sbbq	$0,%r15			// %r15 = 0 if tp >= np, else all-ones
	leaq	64(%rsp),%rbx		// rewind tp
	subq	%rdx,%rdi		// rewind rp

.byte	102,73,15,110,207		// movq %r15,%xmm1   (select mask)
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		// broadcast mask
	movq	40(%rsp),%rsi		// recover caller's %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

// Constant-time select between tp and rp-np, wiping tp; %rdx counts down
// from num*8 in steps of 32 bytes.
.align	32
.Lmulx4x_cond_copy:
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)		// wipe tp
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx
	jnz	.Lmulx4x_cond_copy

	movq	%rdx,(%rbx)		// %rdx == 0 here: wipe the last slot

	movq	$1,%rax			// return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	ret
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
// ASCII banner: "Montgomery Multiplication for x86_64, CRYPTOGAMS by
// <appro@openssl.org>\0"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16
#endif