// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <ring-core/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <ring-core/arm_arch.h>

.text

.globl	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16		// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for the removal of the first multiplication and addition
	//	instructions: the outcome of the first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or it doesn't. The
	//	question, then, is when it carries, and whether there is an
	//	alternative way to deduce that. If you follow the operations,
	//	you can see that the condition for carry is quite simple:
	//	x6 being non-zero. So the carry can be calculated
	//	by adding -1 to x6, which is what the next instruction does.
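	// Illustrative sketch of that carry identity (not from the
	// original script), assuming 64-bit limbs and n0 == -np[0]^-1
	// mod 2^64: by construction of m1, lo(np[0]*m1) == (uint64_t)(0 - x6),
	// so in C terms
	//
	//	uint64_t lo = (uint64_t)(0 - x6);		// lo(np[0]*m1)
	//	unsigned carry = (uint64_t)(lo + x6) < lo;	// 1 iff x6 != 0
	//
	// and "subs xzr,x6,#1" leaves exactly that bit in the carry flag.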
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But comparison
	// implies subtraction, so we subtract the modulus, check whether
	// the subtraction borrowed, and conditionally copy the original
	// value back.
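	// Illustrative C sketch of that idea (hypothetical helper names,
	// not part of this file); tp/np/rp are num-limb little-endian
	// arrays and top_carry is the upmost overflow bit (x19):
	//
	//	uint64_t borrow = 0;
	//	for (size_t i = 0; i < num; i++)	// rp = tp - np
	//		rp[i] = sub_with_borrow(tp[i], np[i], &borrow);
	//	int keep_tp = borrow > top_carry;	// borrowed past the top?
	//	for (size_t i = 0; i < num; i++) {
	//		rp[i] = keep_tp ? tp[i] : rp[i];	// csel below, no branch
	//		tp[i] = 0;				// wipe tp
	//	}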
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	bn_mul_mont,.-bn_mul_mont
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]		// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	// a[1]a[0]	(i)
	// a[2]a[0]
	// a[3]a[0]
	// a[4]a[0]
	// a[5]a[0]
	// a[6]a[0]
	// a[7]a[0]
	// a[2]a[1]	(ii)
	// a[3]a[1]
	// a[4]a[1]
	// a[5]a[1]
	// a[6]a[1]
	// a[7]a[1]
	// a[3]a[2]	(iii)
	// a[4]a[2]
	// a[5]a[2]
	// a[6]a[2]
	// a[7]a[2]
	// a[4]a[3]	(iv)
	// a[5]a[3]
	// a[6]a[3]
	// a[7]a[3]
	// a[5]a[4]	(v)
	// a[6]a[4]
	// a[7]a[4]
	// a[6]a[5]	(vi)
	// a[7]a[5]
	// a[7]a[6]	(vii)
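	// The schedule above lists every cross product a[j]*a[i], j > i,
	// exactly once; it accumulates the off-diagonal half of
	//
	//	a^2 = 2*sum_{i<j} a[i]*a[j]*2^(64*(i+j)) + sum_i a[i]^2*2^(128*i)
	//
	// The doubling and the a[i]^2 diagonal are folded in later, at
	// .Lsqr8x_outer_break (see .Lsqr4x_shift_n_add).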
	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1		// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5		// rewound ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	// a[8]a[0]
	// a[9]a[0]
	// a[a]a[0]
	// a[b]a[0]
	// a[c]a[0]
	// a[d]a[0]
	// a[e]a[0]
	// a[f]a[0]
	// a[8]a[1]
	// a[f]a[1]........................
	// a[8]a[2]
	// a[f]a[2]........................
	// a[8]a[3]
	// a[f]a[3]........................
	// a[8]a[4]
	// a[f]a[4]........................
	// a[8]a[5]
	// a[f]a[5]........................
	// a[8]a[6]
	// a[f]a[6]........................
	// a[8]a[7]
	// a[f]a[7]........................
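	// .Lsqr8x_mul below is an 8-limb multiply-accumulate: each pass
	// multiplies the current 8-limb window w[0..7] (x6-x13) by one
	// scalar m (x4), adds it into the accumulators, and retires one
	// limb.  A rough C sketch of one pass, with illustrative names:
	//
	//	uint64_t c = 0;				// x28
	//	for (int j = 0; j < 8; j++) {		// t += w*m
	//		unsigned __int128 p =
	//		    (unsigned __int128)w[j]*m + t[j] + c;
	//		t[j] = (uint64_t)p;
	//		c = (uint64_t)(p >> 64);
	//	}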
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
	// note that the carry flag is guaranteed
	// to be zero at this point
	cmp	x1,x3			// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
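	// Each pass of .Lsqr8x_reduction below performs eight Montgomery
	// steps: pick m = t[0]*n0 mod 2^64, add m*n[0..7] so the lowest
	// limb cancels, and slide the window down one limb.  A rough C
	// sketch of a single step, with illustrative names:
	//
	//	uint64_t m = t[0]*n0, c = 0;		// x28
	//	for (int j = 0; j < 8; j++) {		// t += m*n
	//		unsigned __int128 p =
	//		    (unsigned __int128)m*n[j] + t[j] + c;
	//		t[j] = (uint64_t)p;
	//		c = (uint64_t)(p >> 64);
	//	}
	//	// t[0] == 0 mod 2^64 now, and is shifted out.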
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr			// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1		// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
	// note that the carry flag is guaranteed
	// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1		// done yet?
	sub	x16,x3,x5		// rewound np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
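	// (x30 is 0 or 1 here, and x30-1 borrows exactly when x30 == 0,
	// so this sets the carry flag to the saved top-most carry.)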
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr		// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29			// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But comparison
	// implies subtraction, so we subtract the modulus, check whether
	// the subtraction borrowed, and conditionally copy the original
	// value back.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0			// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-x26,x28 hold result, x6-x13 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont or __bn_sqr8x_mont, which have already signed the
	// return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

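	// In the loops below x28 advances by 8 and wraps modulo 32 (the
	// "and" with #31), stepping 8,16,24,0, so "ldr x24,[x2,x28]"
	// picks up b[1..3] and then b[0] again; each cbnz on x28 ends
	// a four-limb pass.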
.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

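	// The reduction pass above stashed each t[0]*n0 multiplier at sp
	// ("put aside ... for tail processing"); the tail loop below
	// replays those saved multipliers against a[4..7] and n[4..7],
	// fetching the next one with "ldr x25,[sp,x28]".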
.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewound x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next t[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But comparison
	// implies subtraction, so we subtract the modulus, check whether
	// the subtraction borrowed, and conditionally copy the original
	// value back.
	mov	x0,x12
	mov	x27,x12			// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
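// The .byte string above decodes to the ASCII credit string
// "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>".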
.align	2
.align	4
#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)