#! /usr/bin/env perl
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. The
# multiplication instruction issue rate is limited on the processor in
# question, meaning that a dedicated squaring procedure is a must.
# In fact, all contemporary AArch64 processors seem to have a limited
# multiplication issue rate, i.e. they can't issue a multiplication
# every cycle, which explains the moderate improvement coefficients in
# comparison to compiler-generated code. Recall that the compiler is
# instructed to use umulh and therefore uses the same number of
# multiplication instructions to do the job. Assembly's edge is
# minimizing the number of "collateral" instructions and, of course,
# instruction scheduling.
#
# April 2015
#
# A squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-40-60% depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 the improvement
# is still modest for the longest key lengths, while other processors
# exhibit e.g. 50-70% improvement for RSA4096 sign. RSA2048 sign is
# ~25% faster on Cortex-A57 and ~60-100% faster on others.

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# size_t num);

$code.=<<___;
#include <openssl/arm_arch.h>

.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
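	// A 64-byte frame: x29/x30 at sp with the callee-saved x19-x24
	// above; the num-word scratch vector tp[] is carved out below sp
	// (16-byte aligned, per the ABI note below) once num is known.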
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	On the removal of the first multiplication and addition:
	//	the outcome of the first addition is guaranteed to be
	//	zero, which leaves two computationally significant
	//	outcomes: it either carries or it doesn't. So when does
	//	it carry? Is there an alternative way to deduce it? If
	//	you follow the operations, you can observe that the
	//	condition for carry is quite simple: $lo0 being non-zero.
	//	So the carry can be calculated by adding -1 to $lo0,
	//	which is what the next instruction does.
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, check whether it
	// borrowed, and conditionally copy the original value.
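	// In C terms, the tail below is roughly the following (an
	// illustrative sketch only, not the reference implementation):
	//
	//	borrow = bn_sub_words(rp, tp, np, num);	// rp[] = tp[] - np[]
	//	borrow = borrow > ovf;			// fold in top-most bit
	//	for (j = 0; j < num; j++) {
	//		rp[j] = borrow ? tp[j] : rp[j];	// csel on "lo"
	//		tp[j] = 0;			// wipe tp
	//	}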
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{
########################################################################
# The following is an ARMv8 adaptation of sqrx8x_mont from the
# x86_64-mont5 module.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

$code.=<<___;
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont
	// is jumped to only from bn_mul_mont, which has already signed the
	// return address.
	cmp	$ap,$bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
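	// A 128-byte frame for x29/x30 and x19-x28; rp and np are stashed
	// at [x29,#96] and n0 later at [x29,#112] to free their registers.
	// The scratch vector t[] allocated below sp is 2*num words, large
	// enough to hold the whole 2*num-word square before reduction.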
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$rp,$np,[sp,#96]	// offload rp and np

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	ldp	$a4,$a5,[$ap,#8*4]
	ldp	$a6,$a7,[$ap,#8*6]

	sub	$tp,sp,$num,lsl#4
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	mov	sp,$tp			// alloca
	sub	$cnt,$num,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	$cnt,$cnt,#8*8
	stp	xzr,xzr,[$tp,#8*0]
	stp	xzr,xzr,[$tp,#8*2]
	stp	xzr,xzr,[$tp,#8*4]
	stp	xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[$tp,#8*8]
	stp	xzr,xzr,[$tp,#8*10]
	stp	xzr,xzr,[$tp,#8*12]
	stp	xzr,xzr,[$tp,#8*14]
	add	$tp,$tp,#8*16
	cbnz	$cnt,.Lsqr8x_zero

	add	$ap_end,$ap,$num
	add	$ap,$ap,#8*8
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	mov	$acc4,xzr
	mov	$acc5,xzr
	mov	$acc6,xzr
	mov	$acc7,xzr
	mov	$tp,sp
	str	$n0,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	//                                                 a[1]a[0]	(i)
	//                                             a[2]a[0]
	//                                         a[3]a[0]
	//                                     a[4]a[0]
	//                                 a[5]a[0]
	//                             a[6]a[0]
	//                         a[7]a[0]
	//                                         a[2]a[1]		(ii)
	//                                     a[3]a[1]
	//                                 a[4]a[1]
	//                             a[5]a[1]
	//                         a[6]a[1]
	//                     a[7]a[1]
	//                                 a[3]a[2]			(iii)
	//                             a[4]a[2]
	//                         a[5]a[2]
	//                     a[6]a[2]
	//                 a[7]a[2]
	//                         a[4]a[3]				(iv)
	//                     a[5]a[3]
	//                 a[6]a[3]
	//             a[7]a[3]
	//                 a[5]a[4]					(v)
	//             a[6]a[4]
	//         a[7]a[4]
	//         a[6]a[5]						(vi)
	//     a[7]a[5]
	// a[7]a[6]							(vii)

	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
	mul	$t1,$a2,$a0
	mul	$t2,$a3,$a0
	mul	$t3,$a4,$a0
	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
	mul	$t0,$a5,$a0
	adcs	$acc2,$acc2,$t1
	mul	$t1,$a6,$a0
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a7,$a0
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a0
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a0
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a0
	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
	adc	$acc0,xzr,xzr		// t[8]
	adds	$acc2,$acc2,$t3		// t[2]+hi(a[1]*a[0])
	umulh	$t3,$a5,$a0
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a0
	adcs	$acc4,$acc4,$t1
	umulh	$t1,$a7,$a0
	adcs	$acc5,$acc5,$t2
	mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
	adcs	$acc6,$acc6,$t3
	mul	$t3,$a3,$a1
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a1
	adc	$acc0,$acc0,$t1

	mul	$t1,$a5,$a1
	adds	$acc3,$acc3,$t2
	mul	$t2,$a6,$a1
	adcs	$acc4,$acc4,$t3
	mul	$t3,$a7,$a1
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a1
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a1
	adcs	$acc0,$acc0,$t3
	umulh	$t3,$a5,$a1
	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
	adc	$acc1,xzr,xzr		// t[9]
	adds	$acc4,$acc4,$t0
	umulh	$t0,$a6,$a1
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a7,$a1
	adcs	$acc6,$acc6,$t2
	mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
	adcs	$acc7,$acc7,$t3
	mul	$t3,$a4,$a2
	adcs	$acc0,$acc0,$t0
	mul	$t0,$a5,$a2
	adc	$acc1,$acc1,$t1

	mul	$t1,$a6,$a2
	adds	$acc5,$acc5,$t2
	mul	$t2,$a7,$a2
	adcs	$acc6,$acc6,$t3
	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
	adcs	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a2
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a2
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a2
	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
	adc	$acc2,xzr,xzr		// t[10]
	adds	$acc6,$acc6,$t3
	umulh	$t3,$a7,$a2
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
	adcs	$acc0,$acc0,$t1
	mul	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	mul	$t2,$a6,$a3
	adc	$acc2,$acc2,$t3

	mul	$t3,$a7,$a3
	adds	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a3
	adcs	$acc2,$acc2,$t3
	umulh	$t3,$a7,$a3
	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
	adc	$acc3,xzr,xzr		// t[11]
	adds	$acc0,$acc0,$t0
	mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a6,$a4
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a7,$a4
	adc	$acc3,$acc3,$t3

	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
	adds	$acc1,$acc1,$t0
	umulh	$t0,$a6,$a4
	adcs	$acc2,$acc2,$t1
	umulh	$t1,$a7,$a4
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
	adc	$acc4,xzr,xzr		// t[12]
	adds	$acc2,$acc2,$t3
	mul	$t3,$a7,$a5
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
	adc	$acc4,$acc4,$t1

	umulh	$t1,$a7,$a5
	adds	$acc3,$acc3,$t2
	mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
	adc	$acc5,xzr,xzr		// t[13]
	adds	$acc4,$acc4,$t0
	sub	$cnt,$ap_end,$ap	// done yet?
	adc	$acc5,$acc5,$t1

	adds	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	// rewound ap
	adc	$acc6,xzr,xzr		// t[14]
	add	$acc6,$acc6,$t3

	cbz	$cnt,.Lsqr8x_outer_break

	mov	$n0,$a0
	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$rp,$ap
	adcs	$acc7,xzr,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved below
	mov	$cnt,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
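	// .Lsqr8x_mul below accumulates the products of the eight words
	// just loaded into $a0-$a7 with each of the eight outer words in
	// turn; the latter rotate through $n0, reloaded from the previous
	// window at [$rp,$cnt]. The carry out of each pass is
	// modulo-scheduled: it is consumed by the adc at the top of the
	// next pass rather than resolved immediately.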
.Lsqr8x_mul:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	$ap,$ap_end		// done yet?
	b.eq	.Lsqr8x_break

	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	ldr	$n0,[$rp,#-8*8]
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	$a0,$a1,[$rp,#8*0]
	add	$ap,$rp,#8*8
	ldp	$a2,$a3,[$rp,#8*2]
	sub	$t0,$ap_end,$ap		// is it last iteration?
	ldp	$a4,$a5,[$rp,#8*4]
	sub	$t1,$tp,$t0
	ldp	$a6,$a7,[$rp,#8*6]
	cbz	$t0,.Lsqr8x_outer_loop

	stp	$acc0,$acc1,[$tp,#8*0]
	ldp	$acc0,$acc1,[$t1,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$t1,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$t1,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$t1
	ldp	$acc6,$acc7,[$t1,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
	ldp	$t1,$t2,[sp,#8*1]
	ldp	$a5,$a7,[$t0,#8*2]
	add	$ap,$t0,#8*4
	ldp	$t3,$t0,[sp,#8*3]

	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$acc0,$a1,$a1
	stp	$acc2,$acc3,[$tp,#8*2]
	umulh	$a1,$a1,$a1
	stp	$acc4,$acc5,[$tp,#8*4]
	mul	$a2,$a3,$a3
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,sp
	umulh	$a3,$a3,$a3
	adds	$acc1,$a1,$t1,lsl#1
	extr	$t1,$t2,$t1,#63
	sub	$cnt,$num,#8*4

.Lsqr4x_shift_n_add:
	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	sub	$cnt,$cnt,#8*4
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	ldp	$a1,$a3,[$ap],#8*2
	umulh	$a5,$a5,$a5
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	extr	$t3,$t0,$t3,#63
	stp	$acc0,$acc1,[$tp,#8*0]
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	stp	$acc2,$acc3,[$tp,#8*2]
	adcs	$acc5,$a5,$t0
	ldp	$t3,$t0,[$tp,#8*7]
	extr	$t1,$t2,$t1,#63
	adcs	$acc6,$a6,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc7,$a7,$t2
	ldp	$t1,$t2,[$tp,#8*9]
	mul	$a0,$a1,$a1
	ldp	$a5,$a7,[$ap],#8*2
	umulh	$a1,$a1,$a1
	mul	$a2,$a3,$a3
	umulh	$a3,$a3,$a3
	stp	$acc4,$acc5,[$tp,#8*4]
	extr	$t3,$t0,$t3,#63
	stp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	adcs	$acc0,$a0,$t3
	extr	$t0,$t1,$t0,#63
	adcs	$acc1,$a1,$t0
	ldp	$t3,$t0,[$tp,#8*3]
	extr	$t1,$t2,$t1,#63
	cbnz	$cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	ldp	$np,$n0,[x29,#104]	// pull np and n0

	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	umulh	$a5,$a5,$a5
	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	stp	$acc2,$acc3,[$tp,#8*2]
	extr	$t3,$t0,$t3,#63
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	ldp	$acc0,$acc1,[sp,#8*0]
	adcs	$acc5,$a5,$t0
	extr	$t1,$t2,$t1,#63
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc6,$a6,$t1
	extr	$t2,xzr,$t2,#63
	ldp	$a2,$a3,[$np,#8*2]
	adc	$acc7,$a7,$t2
	ldp	$a4,$a5,[$np,#8*4]

	// Reduce by 512 bits per iteration
	mul	$na0,$n0,$acc0		// t[0]*n0
	ldp	$a6,$a7,[$np,#8*6]
	add	$np_end,$np,$num
	ldp	$acc2,$acc3,[sp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[sp,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	ldp	$acc6,$acc7,[sp,#8*6]
	add	$np,$np,#8*8
	mov	$topmost,xzr		// initial top-most carry
	mov	$tp,sp
	mov	$cnt,#8

.Lsqr8x_reduction:
	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
	mul	$t1,$a1,$na0
	sub	$cnt,$cnt,#1
	mul	$t2,$a2,$na0
	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
	mul	$t3,$a3,$na0
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	mul	$t0,$a4,$na0
	adcs	$acc0,$acc1,$t1
	mul	$t1,$a5,$na0
	adcs	$acc1,$acc2,$t2
	mul	$t2,$a6,$na0
	adcs	$acc2,$acc3,$t3
	mul	$t3,$a7,$na0
	adcs	$acc3,$acc4,$t0
	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
	adcs	$acc4,$acc5,$t1
	umulh	$t1,$a1,$na0
	adcs	$acc5,$acc6,$t2
	umulh	$t2,$a2,$na0
	adcs	$acc6,$acc7,$t3
	umulh	$t3,$a3,$na0
	adc	$acc7,xzr,xzr
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a4,$na0
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a5,$na0
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a6,$na0
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a7,$na0
	mul	$na0,$n0,$acc0		// next t[0]*n0
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adc	$acc7,$acc7,$t3
	cbnz	$cnt,.Lsqr8x_reduction

	ldp	$t0,$t1,[$tp,#8*0]
	ldp	$t2,$t3,[$tp,#8*2]
	mov	$rp,$tp
	sub	$cnt,$np_end,$np	// done yet?
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	ldp	$t0,$t1,[$tp,#8*4]
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	ldp	$t2,$t3,[$tp,#8*6]
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adcs	$acc7,$acc7,$t3
	//adc	$carry,xzr,xzr		// moved below
	cbz	$cnt,.Lsqr8x8_post_condition

	ldr	$n0,[$tp,#-8*8]
	ldp	$a0,$a1,[$np,#8*0]
	ldp	$a2,$a3,[$np,#8*2]
	ldp	$a4,$a5,[$np,#8*4]
	mov	$cnt,#-8*8
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8

.Lsqr8x_tail:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	$a0,$a1,[$tp,#8*0]
	sub	$cnt,$np_end,$np	// done yet?
	sub	$t2,$np_end,$num	// rewound np
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	cbz	$cnt,.Lsqr8x_tail_break

	ldr	$n0,[$rp,#-8*8]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$np,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$np,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	$n0,[x29,#112]		// pull n0
	add	$cnt,$tp,#8*8		// end of current t[num] window

	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
	adcs	$t0,$acc0,$a0
	adcs	$t1,$acc1,$a1
	ldp	$acc0,$acc1,[$rp,#8*0]
	adcs	$acc2,$acc2,$a2
	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$t2,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$t2,#8*4]
	adcs	$acc6,$acc6,$a6
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$t2,#8*6]
	add	$np,$t2,#8*8
	adc	$topmost,xzr,xzr	// top-most carry
	mul	$na0,$n0,$acc0
	stp	$t0,$t1,[$tp,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$rp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$rp,#8*4]
	cmp	$cnt,x29		// did we hit the bottom?
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$rp			// slide the window
	ldp	$acc6,$acc7,[$rp,#8*6]
	mov	$cnt,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, check whether it
	// borrowed, and conditionally copy the original value.
	ldr	$rp,[x29,#96]		// pull rp
	add	$tp,$tp,#8*8
	subs	$t0,$acc0,$a0
	sbcs	$t1,$acc1,$a1
	sub	$cnt,$num,#8*8
	mov	$ap_end,$rp		// $rp copy

.Lsqr8x_sub:
	sbcs	$t2,$acc2,$a2
	ldp	$a0,$a1,[$np,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$np,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$a4,$a5,[$np,#8*4]
	sbcs	$t3,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	ldp	$acc0,$acc1,[$tp,#8*0]
	sub	$cnt,$cnt,#8*8
	ldp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	stp	$t0,$t1,[$rp,#8*4]
	sbcs	$t0,$acc0,$a0
	stp	$t2,$t3,[$rp,#8*6]
	add	$rp,$rp,#8*8
	sbcs	$t1,$acc1,$a1
	cbnz	$cnt,.Lsqr8x_sub

	sbcs	$t2,$acc2,$a2
	mov	$tp,sp
	add	$ap,sp,$num
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$ap_end,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$acc0,$acc1,[$ap,#8*0]
	sbcs	$t3,$acc7,$a7
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
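	// The borrow from the subtraction above drives the "lo" selects
	// below: on borrow the pre-subtraction words in t[] are kept,
	// otherwise the subtracted words stand; t[] is wiped with zeros
	// as it is consumed, so no intermediate is left on the stack.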
	ldr	x30,[x29,#8]		// pull return address
	stp	$t0,$t1,[$rp,#8*4]
	stp	$t2,$t3,[$rp,#8*6]

	sub	$cnt,$num,#8*4
.Lsqr4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	stp	xzr,xzr,[$ap,#8*0]
	stp	xzr,xzr,[$ap,#8*2]
	cbnz	$cnt,.Lsqr4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	csel	$t3,$acc3,$a3,lo
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	$carry,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// $acc0-7,$carry hold result, $a0-7 hold modulus
	subs	$a0,$acc0,$a0
	ldr	$ap,[x29,#96]		// pull rp
	sbcs	$a1,$acc1,$a1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$a2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$a3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	$a4,$acc4,$a4
	stp	xzr,xzr,[sp,#8*6]
	sbcs	$a5,$acc5,$a5
	stp	xzr,xzr,[sp,#8*8]
	sbcs	$a6,$acc6,$a6
	stp	xzr,xzr,[sp,#8*10]
	sbcs	$a7,$acc7,$a7
	stp	xzr,xzr,[sp,#8*12]
	sbcs	$carry,$carry,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// $a0-7 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	csel	$a4,$acc4,$a4,lo
	csel	$a5,$acc5,$a5,lo
	stp	$a2,$a3,[$ap,#8*2]
	csel	$a6,$acc6,$a6,lo
	csel	$a7,$acc7,$a7,lo
	stp	$a4,$a5,[$ap,#8*4]
	stp	$a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont
# from the x86_64-mont5 module, it's different in the sense that it
# performs reduction 256 bits at a time.

my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my $bp_end=$rp;
my ($carry,$topmost) = ($rp,"x30");

$code.=<<___;
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont
	// is jumped to only from bn_mul_mont or __bn_sqr8x_mont, which have
	// already signed the return address.
	stp	x29,x30,[sp,#-128]!
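	// Same 128-byte frame as the sqr8x path; rp and &b[num] are
	// stashed at [x29,#96], and the scratch below sp is num+4 words:
	// num for t[] plus four for the put-aside t[0]*n0 values.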
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	$tp,sp,$num,lsl#3
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	sub	sp,$tp,#8*4		// alloca

	add	$t0,$bp,$num
	add	$ap_end,$ap,$num
	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]

	ldr	$bi,[$bp,#8*0]		// b[0]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$cnt,#0
	mov	$tp,sp

.Loop_mul4x_1st_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	sub	$t0,$ap_end,$ap
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_reduction

	cbz	$t0,.Lmul4x4_post_condition

	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldr	$mi,[sp]		// a[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.Loop_mul4x_1st_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	adcs	$acc4,$acc4,$carry
	umulh	$t3,$m3,$mi
	adc	$carry,xzr,xzr
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_tail

	sub	$t1,$ap_end,$num	// rewound $ap
	cbz	$t0,.Lmul4x_proceed

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	$bi,[$bp,#8*4]!		// *++b
	adc	$topmost,$carry,xzr
	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
	sub	$np,$np,$num		// rewind np
	ldp	$a2,$a3,[$t1,#8*2]
	add	$ap,$t1,#8*4

	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	ldp	$acc2,$acc3,[sp,#8*6]

	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	mov	$tp,sp
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr

.align	4
.Loop_mul4x_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0)
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_reduction

	adc	$carry,$carry,xzr
	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr

	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.align	4
.Loop_mul4x_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc4,$acc4,$carry
	ldr	$mi,[sp,$cnt]		// next a[0]*n0
	adc	$carry,xzr,xzr
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_tail

	sub	$t1,$np,$num		// rewound np
	adc	$carry,$carry,xzr
	cbz	$t0,.Loop_mul4x_break

	ldp	$t0,$t1,[$tp,#8*4]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
	adds	$acc0,$acc0,$topmost
	add	$bp,$bp,#8*4		// bp++
	adcs	$acc1,$acc1,xzr
	sub	$ap,$ap,$num		// rewind ap
	adcs	$acc2,$acc2,xzr
	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	adcs	$acc3,$acc3,xzr
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	adc	$topmost,$carry,xzr
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	cmp	$bp,$t3			// done yet?
	ldp	$acc2,$acc3,[sp,#8*6]
	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$t1,#8*2]
	add	$np,$t1,#8*4
	b.eq	.Lmul4x_post

	ldr	$bi,[$bp]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	adds	$ap,$ap,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$tp,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, check whether it
	// borrowed, and conditionally copy the original value.
	mov	$rp,$t2
	mov	$ap_end,$t2		// $rp copy
	subs	$t0,$acc0,$m0
	add	$tp,sp,#8*8
	sbcs	$t1,$acc1,$m1
	sub	$cnt,$num,#8*4

.Lmul4x_sub:
	sbcs	$t2,$acc2,$m2
	ldp	$m0,$m1,[$np,#8*0]
	sub	$cnt,$cnt,#8*4
	ldp	$acc0,$acc1,[$tp,#8*0]
	sbcs	$t3,$acc3,$m3
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	ldp	$acc2,$acc3,[$tp,#8*2]
	add	$tp,$tp,#8*4
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc0,$m0
	stp	$t2,$t3,[$rp,#8*2]
	add	$rp,$rp,#8*4
	sbcs	$t1,$acc1,$m1
	cbnz	$cnt,.Lmul4x_sub

	sbcs	$t2,$acc2,$m2
	mov	$tp,sp
	add	$ap,sp,#8*4
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$m3
	stp	$t0,$t1,[$rp,#8*0]
	ldp	$a2,$a3,[$ap_end,#8*2]
	stp	$t2,$t3,[$rp,#8*2]
	ldp	$acc0,$acc1,[$ap,#8*0]
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
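	// Same conditional-copy-and-wipe idiom as in the sqr8x tail:
	// borrow ("lo") keeps the pre-subtraction words, and the scratch
	// area is zeroed on the way out.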
	ldr	x30,[x29,#8]		// pull return address

	sub	$cnt,$num,#8*4
.Lmul4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	cbnz	$cnt,.Lmul4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*3]
	csel	$t3,$acc3,$a3,lo
	stp	xzr,xzr,[$tp,#8*4]
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	$carry,$carry,xzr
	ldr	$ap,[x29,#96]		// pull rp
	// $acc0-3,$carry hold result, $m0-3 hold modulus
	subs	$a0,$acc0,$m0
	ldr	x30,[x29,#8]		// pull return address
	sbcs	$a1,$acc1,$m1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$m2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$m3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,$carry,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// $a0-3 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	stp	$a2,$a3,[$ap,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
___
}
$code.=<<___;
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

print $code;

close STDOUT or die "error closing STDOUT: $!";