// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include "openssl/arm_arch.h"

.section .rodata
.align 5
Lpoly:
.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
LRR: // 2^512 mod P precomputed for NIST P256 polynomial
.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
Lone_mont:
.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
Lone:
.quad 1,0,0,0
Lord:
.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
LordK:
.quad 0xccd1c8aaee00bc4f
.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.text

// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//                            const BN_ULONG x2[4]);
.globl ecp_nistz256_mul_mont

.def ecp_nistz256_mul_mont
   .type 32
.endef
.align 4
ecp_nistz256_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-32]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]

	ldr x3,[x2] // bp[0]
	ldp x4,x5,[x1]
	ldp x6,x7,[x1,#16]
	adrp x13,Lpoly
	add x13,x13,:lo12:Lpoly
	ldr x12,[x13,#8]
	ldr x13,[x13,#24]

	bl __ecp_nistz256_mul_mont

	ldp x19,x20,[sp,#16]
	ldp x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_sqr_mont

.def ecp_nistz256_sqr_mont
   .type 32
.endef
.align 4
ecp_nistz256_sqr_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-32]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]

	ldp x4,x5,[x1]
	ldp x6,x7,[x1,#16]
	adrp x13,Lpoly
	add x13,x13,:lo12:Lpoly
	ldr x12,[x13,#8]
	ldr x13,[x13,#24]

	bl __ecp_nistz256_sqr_mont

	ldp x19,x20,[sp,#16]
	ldp x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_div_by_2

.def ecp_nistz256_div_by_2
   .type 32
.endef
.align 4
ecp_nistz256_div_by_2:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	ldp x14,x15,[x1]
	ldp x16,x17,[x1,#16]
	adrp x13,Lpoly
	add x13,x13,:lo12:Lpoly
	ldr x12,[x13,#8]
	ldr x13,[x13,#24]

	bl __ecp_nistz256_div_by_2

	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_mul_by_2

.def ecp_nistz256_mul_by_2
   .type 32
.endef
.align 4
ecp_nistz256_mul_by_2:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	ldp x14,x15,[x1]
	ldp x16,x17,[x1,#16]
	adrp x13,Lpoly
	add x13,x13,:lo12:Lpoly
	ldr x12,[x13,#8]
	ldr x13,[x13,#24]
	mov x8,x14
	mov x9,x15
	mov x10,x16
	mov x11,x17

	bl __ecp_nistz256_add_to // ret = a+a // 2*a

	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_mul_by_3

.def ecp_nistz256_mul_by_3
   .type 32
.endef
.align 4
ecp_nistz256_mul_by_3:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	ldp x14,x15,[x1]
	ldp x16,x17,[x1,#16]
	adrp x13,Lpoly
	add x13,x13,:lo12:Lpoly
	ldr x12,[x13,#8]
	ldr x13,[x13,#24]
	mov x8,x14
	mov x9,x15
	mov x10,x16
	mov x11,x17
	mov x4,x14
	mov x5,x15
	mov x6,x16
	mov x7,x17

	bl __ecp_nistz256_add_to // ret = a+a // 2*a

	mov x8,x4
	mov x9,x5
	mov x10,x6
	mov x11,x7

	bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a

	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
//                       const BN_ULONG x2[4]);
.globl ecp_nistz256_sub

.def ecp_nistz256_sub
   .type 32
.endef
.align 4
ecp_nistz256_sub:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	ldp x14,x15,[x1]
	ldp x16,x17,[x1,#16]
	adrp x13,Lpoly
	add x13,x13,:lo12:Lpoly
	ldr x12,[x13,#8]
	ldr x13,[x13,#24]

	bl __ecp_nistz256_sub_from

	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_neg

.def ecp_nistz256_neg
   .type 32
.endef
.align 4
ecp_nistz256_neg:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	mov x2,x1
	mov x14,xzr // a = 0
	mov x15,xzr
	mov x16,xzr
	mov x17,xzr
	adrp x13,Lpoly
	add x13,x13,:lo12:Lpoly
	ldr x12,[x13,#8]
	ldr x13,[x13,#24]

	bl __ecp_nistz256_sub_from

	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to x4-x7 and b[0] - to x3
.def __ecp_nistz256_mul_mont
   .type 32
.endef
.align 4
__ecp_nistz256_mul_mont:
	mul x14,x4,x3 // a[0]*b[0]
	umulh x8,x4,x3

	mul x15,x5,x3 // a[1]*b[0]
	umulh x9,x5,x3

	mul x16,x6,x3 // a[2]*b[0]
	umulh x10,x6,x3

	mul x17,x7,x3 // a[3]*b[0]
	umulh x11,x7,x3
	ldr x3,[x2,#8] // b[1]

	adds x15,x15,x8 // accumulate high parts of multiplication
	lsl x8,x14,#32
	adcs x16,x16,x9
	lsr x9,x14,#32
	adcs x17,x17,x10
	adc x19,xzr,x11
	mov x20,xzr
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	mul x8,x4,x3 // lo(a[0]*b[i])
	adcs x15,x16,x9
	mul x9,x5,x3 // lo(a[1]*b[i])
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	mul x10,x6,x3 // lo(a[2]*b[i])
	adcs x17,x19,x11
	mul x11,x7,x3 // lo(a[3]*b[i])
	adc x19,x20,xzr

	adds x14,x14,x8 // accumulate low parts of multiplication
	umulh x8,x4,x3 // hi(a[0]*b[i])
	adcs x15,x15,x9
	umulh x9,x5,x3 // hi(a[1]*b[i])
	adcs x16,x16,x10
	umulh x10,x6,x3 // hi(a[2]*b[i])
	adcs x17,x17,x11
	umulh x11,x7,x3 // hi(a[3]*b[i])
	adc x19,x19,xzr
	ldr x3,[x2,#8*(1+1)] // b[1+1]
	adds x15,x15,x8 // accumulate high parts of multiplication
	lsl x8,x14,#32
	adcs x16,x16,x9
	lsr x9,x14,#32
	adcs x17,x17,x10
	adcs x19,x19,x11
	adc x20,xzr,xzr
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	mul x8,x4,x3 // lo(a[0]*b[i])
	adcs x15,x16,x9
	mul x9,x5,x3 // lo(a[1]*b[i])
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	mul x10,x6,x3 // lo(a[2]*b[i])
	adcs x17,x19,x11
	mul x11,x7,x3 // lo(a[3]*b[i])
	adc x19,x20,xzr

	adds x14,x14,x8 // accumulate low parts of multiplication
	umulh x8,x4,x3 // hi(a[0]*b[i])
	adcs x15,x15,x9
	umulh x9,x5,x3 // hi(a[1]*b[i])
	adcs x16,x16,x10
	umulh x10,x6,x3 // hi(a[2]*b[i])
	adcs x17,x17,x11
	umulh x11,x7,x3 // hi(a[3]*b[i])
	adc x19,x19,xzr
	ldr x3,[x2,#8*(2+1)] // b[2+1]
	adds x15,x15,x8 // accumulate high parts of multiplication
	lsl x8,x14,#32
	adcs x16,x16,x9
	lsr x9,x14,#32
	adcs x17,x17,x10
	adcs x19,x19,x11
	adc x20,xzr,xzr
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	mul x8,x4,x3 // lo(a[0]*b[i])
	adcs x15,x16,x9
	mul x9,x5,x3 // lo(a[1]*b[i])
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	mul x10,x6,x3 // lo(a[2]*b[i])
	adcs x17,x19,x11
	mul x11,x7,x3 // lo(a[3]*b[i])
	adc x19,x20,xzr

	adds x14,x14,x8 // accumulate low parts of multiplication
	umulh x8,x4,x3 // hi(a[0]*b[i])
	adcs x15,x15,x9
	umulh x9,x5,x3 // hi(a[1]*b[i])
	adcs x16,x16,x10
	umulh x10,x6,x3 // hi(a[2]*b[i])
	adcs x17,x17,x11
	umulh x11,x7,x3 // hi(a[3]*b[i])
	adc x19,x19,xzr
	adds x15,x15,x8 // accumulate high parts of multiplication
	lsl x8,x14,#32
	adcs x16,x16,x9
	lsr x9,x14,#32
	adcs x17,x17,x10
	adcs x19,x19,x11
	adc x20,xzr,xzr
	// last reduction
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	adcs x15,x16,x9
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	adcs x17,x19,x11
	adc x19,x20,xzr

	adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
	sbcs x9,x15,x12
	sbcs x10,x16,xzr
	sbcs x11,x17,x13
	sbcs xzr,x19,xzr // did it borrow?

	csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
	csel x15,x15,x9,lo
	csel x16,x16,x10,lo
	stp x14,x15,[x0]
	csel x17,x17,x11,lo
	stp x16,x17,[x0,#16]

	ret


// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to x4-x7
.def __ecp_nistz256_sqr_mont
   .type 32
.endef
.align 4
__ecp_nistz256_sqr_mont:
	// | | | | | |a1*a0| |
	// | | | | |a2*a0| | |
	// | |a3*a2|a3*a0| | | |
	// | | | |a2*a1| | | |
	// | | |a3*a1| | | | |
	// *| | | | | | | | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	// |--+--+--+--+--+--+--+--|
	// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow
	//
	// "can't overflow" below mark carrying into high part of
	// multiplication result, which can't overflow, because it
	// can never be all ones.
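	//
	// Editor's note (added commentary, not generator output): a sketch of
	// the reduction step used throughout this file.  With
	//     p = 2^256 - 2^224 + 2^192 + 2^96 - 1
	// the low limb of p is all-ones, so -1/p mod 2^64 == 1 and the
	// per-limb Montgomery multiplier is simply acc[0].  Each step computes
	//     acc = (acc + acc[0]*p) / 2^64
	// where
	//     acc[0]*p = acc[0]*2^96 + acc[0]*0xffffffff00000001*2^192 - acc[0],
	// so the product is assembled from 32-bit shifts of acc[0] plus one
	// 128-bit subtraction rather than real multiplications.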

	mul x15,x5,x4 // a[1]*a[0]
	umulh x9,x5,x4
	mul x16,x6,x4 // a[2]*a[0]
	umulh x10,x6,x4
	mul x17,x7,x4 // a[3]*a[0]
	umulh x19,x7,x4

	adds x16,x16,x9 // accumulate high parts of multiplication
	mul x8,x6,x5 // a[2]*a[1]
	umulh x9,x6,x5
	adcs x17,x17,x10
	mul x10,x7,x5 // a[3]*a[1]
	umulh x11,x7,x5
	adc x19,x19,xzr // can't overflow

	mul x20,x7,x6 // a[3]*a[2]
	umulh x1,x7,x6

	adds x9,x9,x10 // accumulate high parts of multiplication
	mul x14,x4,x4 // a[0]*a[0]
	adc x10,x11,xzr // can't overflow

	adds x17,x17,x8 // accumulate low parts of multiplication
	umulh x4,x4,x4
	adcs x19,x19,x9
	mul x9,x5,x5 // a[1]*a[1]
	adcs x20,x20,x10
	umulh x5,x5,x5
	adc x1,x1,xzr // can't overflow

	adds x15,x15,x15 // acc[1-6]*=2
	mul x10,x6,x6 // a[2]*a[2]
	adcs x16,x16,x16
	umulh x6,x6,x6
	adcs x17,x17,x17
	mul x11,x7,x7 // a[3]*a[3]
	adcs x19,x19,x19
	umulh x7,x7,x7
	adcs x20,x20,x20
	adcs x1,x1,x1
	adc x2,xzr,xzr

	adds x15,x15,x4 // +a[i]*a[i]
	adcs x16,x16,x9
	adcs x17,x17,x5
	adcs x19,x19,x10
	adcs x20,x20,x6
	lsl x8,x14,#32
	adcs x1,x1,x11
	lsr x9,x14,#32
	adc x2,x2,x7
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	adcs x15,x16,x9
	lsl x8,x14,#32
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	lsr x9,x14,#32
	adc x17,x11,xzr // can't overflow
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	adcs x15,x16,x9
	lsl x8,x14,#32
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	lsr x9,x14,#32
	adc x17,x11,xzr // can't overflow
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	adcs x15,x16,x9
	lsl x8,x14,#32
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	lsr x9,x14,#32
	adc x17,x11,xzr // can't overflow
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	adcs x15,x16,x9
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	adc x17,x11,xzr // can't overflow

	adds x14,x14,x19 // accumulate upper half
	adcs x15,x15,x20
	adcs x16,x16,x1
	adcs x17,x17,x2
	adc x19,xzr,xzr

	adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
	sbcs x9,x15,x12
	sbcs x10,x16,xzr
	sbcs x11,x17,x13
	sbcs xzr,x19,xzr // did it borrow?

	csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
	csel x15,x15,x9,lo
	csel x16,x16,x10,lo
	stp x14,x15,[x0]
	csel x17,x17,x11,lo
	stp x16,x17,[x0,#16]

	ret


// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
// x4-x7 and x8-x11. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.def __ecp_nistz256_add_to
   .type 32
.endef
.align 4
__ecp_nistz256_add_to:
	adds x14,x14,x8 // ret = a+b
	adcs x15,x15,x9
	adcs x16,x16,x10
	adcs x17,x17,x11
	adc x1,xzr,xzr // zap x1

	adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus
	sbcs x9,x15,x12
	sbcs x10,x16,xzr
	sbcs x11,x17,x13
	sbcs xzr,x1,xzr // did subtraction borrow?

	csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
	csel x15,x15,x9,lo
	csel x16,x16,x10,lo
	stp x14,x15,[x0]
	csel x17,x17,x11,lo
	stp x16,x17,[x0,#16]

	ret


.def __ecp_nistz256_sub_from
   .type 32
.endef
.align 4
__ecp_nistz256_sub_from:
	ldp x8,x9,[x2]
	ldp x10,x11,[x2,#16]
	subs x14,x14,x8 // ret = a-b
	sbcs x15,x15,x9
	sbcs x16,x16,x10
	sbcs x17,x17,x11
	sbc x1,xzr,xzr // zap x1

	subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus
	adcs x9,x15,x12
	adcs x10,x16,xzr
	adc x11,x17,x13
	cmp x1,xzr // did subtraction borrow?

	csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
	csel x15,x15,x9,eq
	csel x16,x16,x10,eq
	stp x14,x15,[x0]
	csel x17,x17,x11,eq
	stp x16,x17,[x0,#16]

	ret


.def __ecp_nistz256_sub_morf
   .type 32
.endef
.align 4
__ecp_nistz256_sub_morf:
	ldp x8,x9,[x2]
	ldp x10,x11,[x2,#16]
	subs x14,x8,x14 // ret = b-a
	sbcs x15,x9,x15
	sbcs x16,x10,x16
	sbcs x17,x11,x17
	sbc x1,xzr,xzr // zap x1

	subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus
	adcs x9,x15,x12
	adcs x10,x16,xzr
	adc x11,x17,x13
	cmp x1,xzr // did subtraction borrow?

	csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
	csel x15,x15,x9,eq
	csel x16,x16,x10,eq
	stp x14,x15,[x0]
	csel x17,x17,x11,eq
	stp x16,x17,[x0,#16]

	ret


.def __ecp_nistz256_div_by_2
   .type 32
.endef
.align 4
__ecp_nistz256_div_by_2:
	subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus
	adcs x9,x15,x12
	adcs x10,x16,xzr
	adcs x11,x17,x13
	adc x1,xzr,xzr // zap x1
	tst x14,#1 // is a even?

	csel x14,x14,x8,eq // ret = even ? a : a+modulus
	csel x15,x15,x9,eq
	csel x16,x16,x10,eq
	csel x17,x17,x11,eq
	csel x1,xzr,x1,eq

	lsr x14,x14,#1 // ret >>= 1
	orr x14,x14,x15,lsl#63
	lsr x15,x15,#1
	orr x15,x15,x16,lsl#63
	lsr x16,x16,#1
	orr x16,x16,x17,lsl#63
	lsr x17,x17,#1
	stp x14,x15,[x0]
	orr x17,x17,x1,lsl#63
	stp x16,x17,[x0,#16]

	ret

.globl ecp_nistz256_point_double

.def ecp_nistz256_point_double
   .type 32
.endef
.align 5
ecp_nistz256_point_double:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-96]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	sub sp,sp,#32*4

Ldouble_shortcut:
	ldp x14,x15,[x1,#32]
	mov x21,x0
	ldp x16,x17,[x1,#48]
	mov x22,x1
	adrp x13,Lpoly
	add x13,x13,:lo12:Lpoly
	ldr x12,[x13,#8]
	mov x8,x14
	ldr x13,[x13,#24]
	mov x9,x15
	ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont
	mov x10,x16
	mov x11,x17
	ldp x6,x7,[x22,#64+16]
	add x0,sp,#0
	bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y);

	add x0,sp,#64
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z);

	ldp x8,x9,[x22]
	ldp x10,x11,[x22,#16]
	mov x4,x14 // put Zsqr aside for p256_sub
	mov x5,x15
	mov x6,x16
	mov x7,x17
	add x0,sp,#32
	bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x);

	add x2,x22,#0
	mov x14,x4 // restore Zsqr
	mov x15,x5
	ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
	mov x16,x6
	mov x17,x7
	ldp x6,x7,[sp,#0+16]
	add x0,sp,#64
	bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr);

	add x0,sp,#0
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S);

	ldr x3,[x22,#32]
	ldp x4,x5,[x22,#64]
	ldp x6,x7,[x22,#64+16]
	add x2,x22,#32
	add x0,sp,#96
	bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y);

	mov x8,x14
	mov x9,x15
	ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
	mov x10,x16
	mov x11,x17
	ldp x6,x7,[sp,#0+16]
	add x0,x21,#64
	bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0);

	add x0,sp,#96
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S);

	ldr x3,[sp,#64] // forward load for p256_mul_mont
	ldp x4,x5,[sp,#32]
	ldp x6,x7,[sp,#32+16]
	add x0,x21,#32
	bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0);

	add x2,sp,#64
	add x0,sp,#32
	bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr);

	mov x8,x14 // duplicate M
	mov x9,x15
	mov x10,x16
	mov x11,x17
	mov x4,x14 // put M aside
	mov x5,x15
	mov x6,x16
	mov x7,x17
	add x0,sp,#32
	bl __ecp_nistz256_add_to
	mov x8,x4 // restore M
	mov x9,x5
	ldr x3,[x22] // forward load for p256_mul_mont
	mov x10,x6
	ldp x4,x5,[sp,#0]
	mov x11,x7
	ldp x6,x7,[sp,#0+16]
	bl __ecp_nistz256_add_to // p256_mul_by_3(M, M);

	add x2,x22,#0
	add x0,sp,#0
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x);

	mov x8,x14
	mov x9,x15
	ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont
	mov x10,x16
	mov x11,x17
	ldp x6,x7,[sp,#32+16]
	add x0,sp,#96
	bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S);

	add x0,x21,#0
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M);

	add x2,sp,#96
	bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0);

	add x2,sp,#0
	add x0,sp,#0
	bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x);

	ldr x3,[sp,#32]
	mov x4,x14 // copy S
	mov x5,x15
	mov x6,x16
	mov x7,x17
	add x2,sp,#32
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M);

	add x2,x21,#32
	add x0,x21,#32
	bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y);

	add sp,x29,#0 // destroy frame
	ldp x19,x20,[x29,#16]
	ldp x21,x22,[x29,#32]
	ldp x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.globl ecp_nistz256_point_add

.def ecp_nistz256_point_add
   .type 32
.endef
.align 5
ecp_nistz256_point_add:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-96]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]
	sub sp,sp,#32*12

	ldp x4,x5,[x2,#64] // in2_z
	ldp x6,x7,[x2,#64+16]
	mov x21,x0
	mov x22,x1
	mov x23,x2
	adrp x13,Lpoly
	add x13,x13,:lo12:Lpoly
	ldr x12,[x13,#8]
	ldr x13,[x13,#24]
	orr x8,x4,x5
	orr x10,x6,x7
	orr x25,x8,x10
	cmp x25,#0
	csetm x25,ne // ~in2infty
	add x0,sp,#192
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z);

	ldp x4,x5,[x22,#64] // in1_z
	ldp x6,x7,[x22,#64+16]
	orr x8,x4,x5
	orr x10,x6,x7
	orr x24,x8,x10
	cmp x24,#0
	csetm x24,ne // ~in1infty
	add x0,sp,#128
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);

	ldr x3,[x23,#64]
	ldp x4,x5,[sp,#192]
	ldp x6,x7,[sp,#192+16]
	add x2,x23,#64
	add x0,sp,#320
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z);

	ldr x3,[x22,#64]
	ldp x4,x5,[sp,#128]
	ldp x6,x7,[sp,#128+16]
	add x2,x22,#64
	add x0,sp,#352
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);

	ldr x3,[x22,#32]
	ldp x4,x5,[sp,#320]
	ldp x6,x7,[sp,#320+16]
	add x2,x22,#32
	add x0,sp,#320
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y);

	ldr x3,[x23,#32]
	ldp x4,x5,[sp,#352]
	ldp x6,x7,[sp,#352+16]
	add x2,x23,#32
	add x0,sp,#352
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);

	add x2,sp,#320
	ldr x3,[sp,#192] // forward load for p256_mul_mont
	ldp x4,x5,[x22]
	ldp x6,x7,[x22,#16]
	add x0,sp,#160
	bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1);

	orr x14,x14,x15 // see if result is zero
	orr x16,x16,x17
	orr x26,x14,x16 // ~is_equal(S1,S2)

	add x2,sp,#192
	add x0,sp,#256
	bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr);

	ldr x3,[sp,#128]
	ldp x4,x5,[x23]
	ldp x6,x7,[x23,#16]
	add x2,sp,#128
	add x0,sp,#288
	bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr);

	add x2,sp,#256
	ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
	ldp x6,x7,[sp,#160+16]
	add x0,sp,#96
	bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1);

	orr x14,x14,x15 // see if result is zero
	orr x16,x16,x17
	orr x14,x14,x16 // ~is_equal(U1,U2)

	mvn x27,x24 // -1/0 -> 0/-1
	mvn x28,x25 // -1/0 -> 0/-1
	orr x14,x14,x27
	orr x14,x14,x28
	orr x14,x14,x26
	cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

Ladd_double:
	mov x1,x22
	mov x0,x21
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	add sp,sp,#256 // #256 is from #32*(12-4). difference in stack frames
	b Ldouble_shortcut

.align 4
Ladd_proceed:
	add x0,sp,#192
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);

	ldr x3,[x22,#64]
	ldp x4,x5,[sp,#96]
	ldp x6,x7,[sp,#96+16]
	add x2,x22,#64
	add x0,sp,#64
	bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);

	ldp x4,x5,[sp,#96]
	ldp x6,x7,[sp,#96+16]
	add x0,sp,#128
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);

	ldr x3,[x23,#64]
	ldp x4,x5,[sp,#64]
	ldp x6,x7,[sp,#64+16]
	add x2,x23,#64
	add x0,sp,#64
	bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z);

	ldr x3,[sp,#96]
	ldp x4,x5,[sp,#128]
	ldp x6,x7,[sp,#128+16]
	add x2,sp,#96
	add x0,sp,#224
	bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);

	ldr x3,[sp,#128]
	ldp x4,x5,[sp,#256]
	ldp x6,x7,[sp,#256+16]
	add x2,sp,#128
	add x0,sp,#288
	bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr);

	mov x8,x14
	mov x9,x15
	mov x10,x16
	mov x11,x17
	add x0,sp,#128
	bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);

	add x2,sp,#192
	add x0,sp,#0
	bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);

	add x2,sp,#224
	bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);

	add x2,sp,#288
	ldr x3,[sp,#224] // forward load for p256_mul_mont
	ldp x4,x5,[sp,#320]
	ldp x6,x7,[sp,#320+16]
	add x0,sp,#32
	bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);

	add x2,sp,#224
	add x0,sp,#352
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub);

	ldr x3,[sp,#160]
	ldp x4,x5,[sp,#32]
	ldp x6,x7,[sp,#32+16]
	add x2,sp,#160
	add x0,sp,#32
	bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);

	add x2,sp,#352
	bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);

	ldp x4,x5,[sp,#0] // res
	ldp x6,x7,[sp,#0+16]
	ldp x8,x9,[x23] // in2
	ldp x10,x11,[x23,#16]
	ldp x14,x15,[x22,#0] // in1
	cmp x24,#0 // ~, remember?
	ldp x16,x17,[x22,#0+16]
	csel x8,x4,x8,ne
	csel x9,x5,x9,ne
	ldp x4,x5,[sp,#0+0+32] // res
	csel x10,x6,x10,ne
	csel x11,x7,x11,ne
	cmp x25,#0 // ~, remember?
	ldp x6,x7,[sp,#0+0+48]
	csel x14,x8,x14,ne
	csel x15,x9,x15,ne
	ldp x8,x9,[x23,#0+32] // in2
	csel x16,x10,x16,ne
	csel x17,x11,x17,ne
	ldp x10,x11,[x23,#0+48]
	stp x14,x15,[x21,#0]
	stp x16,x17,[x21,#0+16]
	ldp x14,x15,[x22,#32] // in1
	cmp x24,#0 // ~, remember?
	ldp x16,x17,[x22,#32+16]
	csel x8,x4,x8,ne
	csel x9,x5,x9,ne
	ldp x4,x5,[sp,#0+32+32] // res
	csel x10,x6,x10,ne
	csel x11,x7,x11,ne
	cmp x25,#0 // ~, remember?
	ldp x6,x7,[sp,#0+32+48]
	csel x14,x8,x14,ne
	csel x15,x9,x15,ne
	ldp x8,x9,[x23,#32+32] // in2
	csel x16,x10,x16,ne
	csel x17,x11,x17,ne
	ldp x10,x11,[x23,#32+48]
	stp x14,x15,[x21,#32]
	stp x16,x17,[x21,#32+16]
	ldp x14,x15,[x22,#64] // in1
	cmp x24,#0 // ~, remember?
	ldp x16,x17,[x22,#64+16]
	csel x8,x4,x8,ne
	csel x9,x5,x9,ne
	csel x10,x6,x10,ne
	csel x11,x7,x11,ne
	cmp x25,#0 // ~, remember?
	csel x14,x8,x14,ne
	csel x15,x9,x15,ne
	csel x16,x10,x16,ne
	csel x17,x11,x17,ne
	stp x14,x15,[x21,#64]
	stp x16,x17,[x21,#64+16]

Ladd_done:
	add sp,x29,#0 // destroy frame
	ldp x19,x20,[x29,#16]
	ldp x21,x22,[x29,#32]
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldp x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.globl ecp_nistz256_point_add_affine

.def ecp_nistz256_point_add_affine
   .type 32
.endef
.align 5
ecp_nistz256_point_add_affine:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-80]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	sub sp,sp,#32*10

	mov x21,x0
	mov x22,x1
	mov x23,x2
	adrp x13,Lpoly
	add x13,x13,:lo12:Lpoly
	ldr x12,[x13,#8]
	ldr x13,[x13,#24]

	ldp x4,x5,[x1,#64] // in1_z
	ldp x6,x7,[x1,#64+16]
	orr x8,x4,x5
	orr x10,x6,x7
	orr x24,x8,x10
	cmp x24,#0
	csetm x24,ne // ~in1infty

	ldp x14,x15,[x2] // in2_x
	ldp x16,x17,[x2,#16]
	ldp x8,x9,[x2,#32] // in2_y
	ldp x10,x11,[x2,#48]
	orr x14,x14,x15
	orr x16,x16,x17
	orr x8,x8,x9
	orr x10,x10,x11
	orr x14,x14,x16
	orr x8,x8,x10
	orr x25,x14,x8
	cmp x25,#0
	csetm x25,ne // ~in2infty

	add x0,sp,#128
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);

	mov x4,x14
	mov x5,x15
	mov x6,x16
	mov x7,x17
	ldr x3,[x23]
	add x2,x23,#0
	add x0,sp,#96
	bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x);

	add x2,x22,#0
	ldr x3,[x22,#64] // forward load for p256_mul_mont
	ldp x4,x5,[sp,#128]
	ldp x6,x7,[sp,#128+16]
	add x0,sp,#160
	bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x);

	add x2,x22,#64
	add x0,sp,#128
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);

	ldr x3,[x22,#64]
	ldp x4,x5,[sp,#160]
	ldp x6,x7,[sp,#160+16]
	add x2,x22,#64
	add x0,sp,#64
	bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);

	ldr x3,[x23,#32]
	ldp x4,x5,[sp,#128]
	ldp x6,x7,[sp,#128+16]
	add x2,x23,#32
	add x0,sp,#128
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);

	add x2,x22,#32
	ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
	ldp x6,x7,[sp,#160+16]
	add x0,sp,#192
	bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y);

	add x0,sp,#224
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);

	ldp x4,x5,[sp,#192]
	ldp x6,x7,[sp,#192+16]
	add x0,sp,#288
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);

	ldr x3,[sp,#160]
	ldp x4,x5,[sp,#224]
	ldp x6,x7,[sp,#224+16]
	add x2,sp,#160
	add x0,sp,#256
	bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);

	ldr x3,[x22]
	ldp x4,x5,[sp,#224]
	ldp x6,x7,[sp,#224+16]
	add x2,x22,#0
	add x0,sp,#96
	bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr);

	mov x8,x14
	mov x9,x15
	mov x10,x16
	mov x11,x17
	add x0,sp,#224
	bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);

	add x2,sp,#288
	add x0,sp,#0
	bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);

	add x2,sp,#256
	bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);

	add x2,sp,#96
	ldr x3,[x22,#32] // forward load for p256_mul_mont
	ldp x4,x5,[sp,#256]
	ldp x6,x7,[sp,#256+16]
	add x0,sp,#32
	bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);

	add x2,x22,#32
	add x0,sp,#128
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);

	ldr x3,[sp,#192]
	ldp x4,x5,[sp,#32]
	ldp x6,x7,[sp,#32+16]
	add x2,sp,#192
	add x0,sp,#32
	bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);

	add x2,sp,#128
	bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);

	ldp x4,x5,[sp,#0] // res
	ldp x6,x7,[sp,#0+16]
	ldp x8,x9,[x23] // in2
	ldp x10,x11,[x23,#16]
	ldp x14,x15,[x22,#0] // in1
	cmp x24,#0 // ~, remember?
	ldp x16,x17,[x22,#0+16]
	csel x8,x4,x8,ne
	csel x9,x5,x9,ne
	ldp x4,x5,[sp,#0+0+32] // res
	csel x10,x6,x10,ne
	csel x11,x7,x11,ne
	cmp x25,#0 // ~, remember?
	ldp x6,x7,[sp,#0+0+48]
	csel x14,x8,x14,ne
	csel x15,x9,x15,ne
	ldp x8,x9,[x23,#0+32] // in2
	csel x16,x10,x16,ne
	csel x17,x11,x17,ne
	ldp x10,x11,[x23,#0+48]
	stp x14,x15,[x21,#0]
	stp x16,x17,[x21,#0+16]
	adrp x23,Lone_mont-64
	add x23,x23,:lo12:Lone_mont-64
	ldp x14,x15,[x22,#32] // in1
	cmp x24,#0 // ~, remember?
	ldp x16,x17,[x22,#32+16]
	csel x8,x4,x8,ne
	csel x9,x5,x9,ne
	ldp x4,x5,[sp,#0+32+32] // res
	csel x10,x6,x10,ne
	csel x11,x7,x11,ne
	cmp x25,#0 // ~, remember?
	ldp x6,x7,[sp,#0+32+48]
	csel x14,x8,x14,ne
	csel x15,x9,x15,ne
	ldp x8,x9,[x23,#32+32] // in2
	csel x16,x10,x16,ne
	csel x17,x11,x17,ne
	ldp x10,x11,[x23,#32+48]
	stp x14,x15,[x21,#32]
	stp x16,x17,[x21,#32+16]
	ldp x14,x15,[x22,#64] // in1
	cmp x24,#0 // ~, remember?
	ldp x16,x17,[x22,#64+16]
	csel x8,x4,x8,ne
	csel x9,x5,x9,ne
	csel x10,x6,x10,ne
	csel x11,x7,x11,ne
	cmp x25,#0 // ~, remember?
	csel x14,x8,x14,ne
	csel x15,x9,x15,ne
	csel x16,x10,x16,ne
	csel x17,x11,x17,ne
	stp x14,x15,[x21,#64]
	stp x16,x17,[x21,#64+16]

	add sp,x29,#0 // destroy frame
	ldp x19,x20,[x29,#16]
	ldp x21,x22,[x29,#32]
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl ecp_nistz256_ord_mul_mont

.def ecp_nistz256_ord_mul_mont
   .type 32
.endef
.align 4
ecp_nistz256_ord_mul_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp x29,x30,[sp,#-64]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]

	adrp x23,Lord
	add x23,x23,:lo12:Lord
	ldr x3,[x2] // bp[0]
	ldp x4,x5,[x1]
	ldp x6,x7,[x1,#16]

	ldp x12,x13,[x23,#0]
	ldp x21,x22,[x23,#16]
	ldr x23,[x23,#32]

	mul x14,x4,x3 // a[0]*b[0]
	umulh x8,x4,x3

	mul x15,x5,x3 // a[1]*b[0]
	umulh x9,x5,x3

	mul x16,x6,x3 // a[2]*b[0]
	umulh x10,x6,x3

	mul x17,x7,x3 // a[3]*b[0]
	umulh x19,x7,x3

	mul x24,x14,x23

	adds x15,x15,x8 // accumulate high parts of multiplication
	adcs x16,x16,x9
	adcs x17,x17,x10
	adc x19,x19,xzr
	mov x20,xzr
	ldr x3,[x2,#8*1] // b[i]

	lsl x8,x24,#32
	subs x16,x16,x24
	lsr x9,x24,#32
	sbcs x17,x17,x8
	sbcs x19,x19,x9
	sbc x20,x20,xzr

	subs xzr,x14,#1
	umulh x9,x12,x24
	mul x10,x13,x24
	umulh x11,x13,x24

	adcs x10,x10,x9
	mul x8,x4,x3
	adc x11,x11,xzr
	mul x9,x5,x3

	adds x14,x15,x10
	mul x10,x6,x3
	adcs x15,x16,x11
	mul x11,x7,x3
	adcs x16,x17,x24
	adcs x17,x19,x24
	adc x19,x20,xzr

	adds x14,x14,x8 // accumulate low parts
	umulh x8,x4,x3
	adcs x15,x15,x9
	umulh x9,x5,x3
	adcs x16,x16,x10
	umulh x10,x6,x3
	adcs x17,x17,x11
	umulh x11,x7,x3
	adc x19,x19,xzr
	mul x24,x14,x23
	adds x15,x15,x8 // accumulate high parts
	adcs x16,x16,x9
	adcs x17,x17,x10
	adcs x19,x19,x11
	adc x20,xzr,xzr
	ldr x3,[x2,#8*2] // b[i]

	lsl x8,x24,#32
	subs x16,x16,x24
	lsr x9,x24,#32
	sbcs x17,x17,x8
	sbcs x19,x19,x9
	sbc x20,x20,xzr

	subs xzr,x14,#1
	umulh x9,x12,x24
	mul x10,x13,x24
	umulh x11,x13,x24

	adcs x10,x10,x9
	mul x8,x4,x3
	adc x11,x11,xzr
	mul x9,x5,x3

	adds x14,x15,x10
	mul x10,x6,x3
	adcs x15,x16,x11
	mul x11,x7,x3
	adcs x16,x17,x24
	adcs x17,x19,x24
	adc x19,x20,xzr

	adds x14,x14,x8 // accumulate low parts
	umulh x8,x4,x3
	adcs x15,x15,x9
	umulh x9,x5,x3
	adcs x16,x16,x10
	umulh x10,x6,x3
	adcs x17,x17,x11
	umulh x11,x7,x3
	adc x19,x19,xzr
	mul x24,x14,x23
	adds x15,x15,x8 // accumulate high parts
	adcs x16,x16,x9
	adcs x17,x17,x10
	adcs x19,x19,x11
	adc x20,xzr,xzr
	ldr x3,[x2,#8*3] // b[i]

	lsl x8,x24,#32
	subs x16,x16,x24
	lsr x9,x24,#32
	sbcs x17,x17,x8
	sbcs x19,x19,x9
	sbc x20,x20,xzr

	subs xzr,x14,#1
	umulh x9,x12,x24
	mul x10,x13,x24
	umulh x11,x13,x24

	adcs x10,x10,x9
	mul x8,x4,x3
	adc x11,x11,xzr
	mul x9,x5,x3

	adds x14,x15,x10
	mul x10,x6,x3
	adcs x15,x16,x11
	mul x11,x7,x3
	adcs x16,x17,x24
	adcs x17,x19,x24
	adc x19,x20,xzr

	adds x14,x14,x8 // accumulate low parts
	umulh x8,x4,x3
	adcs x15,x15,x9
	umulh x9,x5,x3
	adcs x16,x16,x10
	umulh x10,x6,x3
	adcs x17,x17,x11
	umulh x11,x7,x3
	adc x19,x19,xzr
	mul x24,x14,x23
	adds x15,x15,x8 // accumulate high parts
	adcs x16,x16,x9
	adcs x17,x17,x10
	adcs x19,x19,x11
	adc x20,xzr,xzr
	lsl x8,x24,#32 // last reduction
	subs x16,x16,x24
	lsr x9,x24,#32
	sbcs x17,x17,x8
	sbcs x19,x19,x9
	sbc x20,x20,xzr

	subs xzr,x14,#1
	umulh x9,x12,x24
	mul x10,x13,x24
	umulh x11,x13,x24

	adcs x10,x10,x9
	adc x11,x11,xzr

	adds x14,x15,x10
	adcs x15,x16,x11
	adcs x16,x17,x24
	adcs x17,x19,x24
	adc x19,x20,xzr

	subs x8,x14,x12 // ret -= modulus
	sbcs x9,x15,x13
	sbcs x10,x16,x21
	sbcs x11,x17,x22
	sbcs xzr,x19,xzr

	csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
	csel x15,x15,x9,lo
	csel x16,x16,x10,lo
	stp x14,x15,[x0]
	csel x17,x17,x11,lo
	stp x16,x17,[x0,#16]

	ldp x19,x20,[sp,#16]
	ldp x21,x22,[sp,#32]
	ldp x23,x24,[sp,#48]
	ldr x29,[sp],#64
	ret


////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t rep);
.globl ecp_nistz256_ord_sqr_mont

.def ecp_nistz256_ord_sqr_mont
   .type 32
.endef
.align 4
ecp_nistz256_ord_sqr_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp x29,x30,[sp,#-64]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]

	adrp x23,Lord
	add x23,x23,:lo12:Lord
	ldp x4,x5,[x1]
	ldp x6,x7,[x1,#16]

	ldp x12,x13,[x23,#0]
	ldp x21,x22,[x23,#16]
	ldr x23,[x23,#32]
	b Loop_ord_sqr

.align 4
Loop_ord_sqr:
	sub x2,x2,#1
	////////////////////////////////////////////////////////////////
	// | | | | | |a1*a0| |
	// | | | | |a2*a0| | |
	// | |a3*a2|a3*a0| | | |
	// | | | |a2*a1| | | |
	// | | |a3*a1| | | | |
	// *| | | | | | | | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	// |--+--+--+--+--+--+--+--|
	// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow
	//
	// "can't overflow" below mark carrying into high part of
	// multiplication result, which can't overflow, because it
	// can never be all ones.
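	//
	// Editor's note (added commentary, not generator output): unlike the
	// field reduction earlier in this file, the group order (see Lord)
	// does not have an all-ones low limb, so the per-step Montgomery
	// multiplier is acc[0]*LordK mod 2^64, where LordK (loaded into x23)
	// is assumed to be -1/ord mod 2^64.  Only the two low limbs of the
	// order need real multiplications; the two high limbs, 2^64-1 and
	// 2^64-2^32, are folded in below with 32-bit shifts and subtractions.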

	mul x15,x5,x4 // a[1]*a[0]
	umulh x9,x5,x4
	mul x16,x6,x4 // a[2]*a[0]
	umulh x10,x6,x4
	mul x17,x7,x4 // a[3]*a[0]
	umulh x19,x7,x4

	adds x16,x16,x9 // accumulate high parts of multiplication
	mul x8,x6,x5 // a[2]*a[1]
	umulh x9,x6,x5
	adcs x17,x17,x10
	mul x10,x7,x5 // a[3]*a[1]
	umulh x11,x7,x5
	adc x19,x19,xzr // can't overflow

	mul x20,x7,x6 // a[3]*a[2]
	umulh x1,x7,x6

	adds x9,x9,x10 // accumulate high parts of multiplication
	mul x14,x4,x4 // a[0]*a[0]
	adc x10,x11,xzr // can't overflow

	adds x17,x17,x8 // accumulate low parts of multiplication
	umulh x4,x4,x4
	adcs x19,x19,x9
	mul x9,x5,x5 // a[1]*a[1]
	adcs x20,x20,x10
	umulh x5,x5,x5
	adc x1,x1,xzr // can't overflow

	adds x15,x15,x15 // acc[1-6]*=2
	mul x10,x6,x6 // a[2]*a[2]
	adcs x16,x16,x16
	umulh x6,x6,x6
	adcs x17,x17,x17
	mul x11,x7,x7 // a[3]*a[3]
	adcs x19,x19,x19
	umulh x7,x7,x7
	adcs x20,x20,x20
	adcs x1,x1,x1
	adc x3,xzr,xzr

	adds x15,x15,x4 // +a[i]*a[i]
	mul x24,x14,x23
	adcs x16,x16,x9
	adcs x17,x17,x5
	adcs x19,x19,x10
	adcs x20,x20,x6
	adcs x1,x1,x11
	adc x3,x3,x7
	subs xzr,x14,#1
	umulh x9,x12,x24
	mul x10,x13,x24
	umulh x11,x13,x24

	adcs x10,x10,x9
	adc x11,x11,xzr

	adds x14,x15,x10
	adcs x15,x16,x11
	adcs x16,x17,x24
	adc x17,xzr,x24 // can't overflow
	mul x11,x14,x23
	lsl x8,x24,#32
	subs x15,x15,x24
	lsr x9,x24,#32
	sbcs x16,x16,x8
	sbc x17,x17,x9 // can't borrow
	subs xzr,x14,#1
	umulh x9,x12,x11
	mul x10,x13,x11
	umulh x24,x13,x11

	adcs x10,x10,x9
	adc x24,x24,xzr

	adds x14,x15,x10
	adcs x15,x16,x24
	adcs x16,x17,x11
	adc x17,xzr,x11 // can't overflow
	mul x24,x14,x23
	lsl x8,x11,#32
	subs x15,x15,x11
	lsr x9,x11,#32
	sbcs x16,x16,x8
	sbc x17,x17,x9 // can't borrow
	subs xzr,x14,#1
	umulh x9,x12,x24
	mul x10,x13,x24
	umulh x11,x13,x24

	adcs x10,x10,x9
	adc x11,x11,xzr

	adds x14,x15,x10
	adcs x15,x16,x11
	adcs x16,x17,x24
	adc x17,xzr,x24 // can't overflow
	mul x11,x14,x23
	lsl x8,x24,#32
	subs x15,x15,x24
	lsr x9,x24,#32
	sbcs x16,x16,x8
	sbc x17,x17,x9 // can't borrow
	subs xzr,x14,#1
	umulh x9,x12,x11
	mul x10,x13,x11
	umulh x24,x13,x11

	adcs x10,x10,x9
	adc x24,x24,xzr

	adds x14,x15,x10
	adcs x15,x16,x24
	adcs x16,x17,x11
	adc x17,xzr,x11 // can't overflow
	lsl x8,x11,#32
	subs x15,x15,x11
	lsr x9,x11,#32
	sbcs x16,x16,x8
	sbc x17,x17,x9 // can't borrow
	adds x14,x14,x19 // accumulate upper half
	adcs x15,x15,x20
	adcs x16,x16,x1
	adcs x17,x17,x3
	adc x19,xzr,xzr

	subs x8,x14,x12 // ret -= modulus
	sbcs x9,x15,x13
	sbcs x10,x16,x21
	sbcs x11,x17,x22
	sbcs xzr,x19,xzr

	csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus
	csel x5,x15,x9,lo
	csel x6,x16,x10,lo
	csel x7,x17,x11,lo

	cbnz x2,Loop_ord_sqr

	stp x4,x5,[x0]
	stp x6,x7,[x0,#16]

	ldp x19,x20,[sp,#16]
	ldp x21,x22,[sp,#32]
	ldp x23,x24,[sp,#48]
	ldr x29,[sp],#64
	ret

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_select_w5

.def ecp_nistz256_select_w5
   .type 32
.endef
.align 4
ecp_nistz256_select_w5:
	AARCH64_VALID_CALL_TARGET

	// x10 := x0
	// w9 := 0; loop counter and incremented internal index
	mov x10, x0
	mov w9, #0

	// [v16-v21] := 0
	movi v16.16b, #0
	movi v17.16b, #0
	movi v18.16b, #0
	movi v19.16b, #0
	movi v20.16b, #0
	movi v21.16b, #0

Lselect_w5_loop:
	// Loop 16 times.

	// Increment index (loop counter); tested at the end of the loop
	add w9, w9, #1

	// [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
	// and advance x1 to point to the next entry
	ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

	// x11 := (w9 == w2)? All 1s : All 0s
	cmp w9, w2
	csetm x11, eq

	// continue loading ...
	ld1 {v26.2d, v27.2d}, [x1],#32

	// duplicate mask_64 into Mask (all 0s or all 1s)
	dup v3.2d, x11

	// [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
	// i.e., values in output registers will remain the same if w9 != w2
	bit v16.16b, v22.16b, v3.16b
	bit v17.16b, v23.16b, v3.16b

	bit v18.16b, v24.16b, v3.16b
	bit v19.16b, v25.16b, v3.16b

	bit v20.16b, v26.16b, v3.16b
	bit v21.16b, v27.16b, v3.16b

	// If bit #4 is 0 (i.e. idx_ctr < 16) loop back
	tbz w9, #4, Lselect_w5_loop

	// Write [v16-v21] to memory at the output pointer
	st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
	st1 {v20.2d, v21.2d}, [x10]

	ret



////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_select_w7

.def ecp_nistz256_select_w7
   .type 32
.endef
.align 4
ecp_nistz256_select_w7:
	AARCH64_VALID_CALL_TARGET

	// w9 := 0; loop counter and incremented internal index
	mov w9, #0

	// [v16-v19] := 0
	movi v16.16b, #0
	movi v17.16b, #0
	movi v18.16b, #0
	movi v19.16b, #0

Lselect_w7_loop:
	// Loop 64 times.

	// Increment index (loop counter); tested at the end of the loop
	add w9, w9, #1

	// [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
	// and advance x1 to point to the next entry
	ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

	// x11 := (w9 == w2)? All 1s : All 0s
	cmp w9, w2
	csetm x11, eq

	// duplicate mask_64 into Mask (all 0s or all 1s)
	dup v3.2d, x11

	// [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
	// i.e., values in output registers will remain the same if w9 != w2
	bit v16.16b, v22.16b, v3.16b
	bit v17.16b, v23.16b, v3.16b

	bit v18.16b, v24.16b, v3.16b
	bit v19.16b, v25.16b, v3.16b

	// If bit #6 is 0 (i.e. idx_ctr < 64) loop back
	tbz w9, #6, Lselect_w7_loop

	// Write [v16-v19] to memory at the output pointer
	st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0]

	ret

#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
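
// Editor's note (illustrative sketch, not generator output): the exported
// arithmetic helpers operate on 4x64-bit little-endian limbs and, for the
// *_mont entry points, on values in the Montgomery domain with R = 2^256.
// A caller-side sketch, assuming the prototypes commented above:
//
//   BN_ULONG a[4], b[4], r[4];        // inputs already in Montgomery form
//   ecp_nistz256_mul_mont(r, a, b);   // r = a*b / 2^256 mod p
//   ecp_nistz256_sqr_mont(r, a);      // r = a*a / 2^256 mod p
//   ecp_nistz256_sub(r, a, b);        // r = a-b mod p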