// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <ring-core/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include "ring-core/arm_arch.h"

.section .rodata
.align 5
Lpoly:
.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
LRR: // 2^512 mod P precomputed for NIST P256 polynomial
.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
Lone_mont:
.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
Lone:
.quad 1,0,0,0
Lord:
.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
LordK:
.quad 0xccd1c8aaee00bc4f
.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.text

// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//                            const BN_ULONG x2[4]);
.globl ecp_nistz256_mul_mont

.def ecp_nistz256_mul_mont
    .type 32
.endef
.align 4
ecp_nistz256_mul_mont:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-32]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]

    ldr x3,[x2] // bp[0]
    ldp x4,x5,[x1]
    ldp x6,x7,[x1,#16]
    adrp x13,Lpoly
    add x13,x13,:lo12:Lpoly
    ldr x12,[x13,#8]
    ldr x13,[x13,#24]

    bl __ecp_nistz256_mul_mont

    ldp x19,x20,[sp,#16]
    ldp x29,x30,[sp],#32
    AARCH64_VALIDATE_LINK_REGISTER
    ret


// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_sqr_mont

.def ecp_nistz256_sqr_mont
    .type 32
.endef
.align 4
ecp_nistz256_sqr_mont:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-32]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]

    ldp x4,x5,[x1]
    ldp x6,x7,[x1,#16]
    adrp x13,Lpoly
    add x13,x13,:lo12:Lpoly
    ldr x12,[x13,#8]
    ldr x13,[x13,#24]

    bl __ecp_nistz256_sqr_mont

    ldp x19,x20,[sp,#16]
    ldp x29,x30,[sp],#32
    AARCH64_VALIDATE_LINK_REGISTER
    ret


// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_neg

.def ecp_nistz256_neg
    .type 32
.endef
.align 4
ecp_nistz256_neg:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
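    // Negation is computed as (0 - a) mod p: the accumulator x14-x17 is
    // zeroed below and x2 is pointed at the input, so __ecp_nistz256_sub_from
    // returns p - a (and 0 when a is 0).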
    add x29,sp,#0

    mov x2,x1
    mov x14,xzr // a = 0
    mov x15,xzr
    mov x16,xzr
    mov x17,xzr
    adrp x13,Lpoly
    add x13,x13,:lo12:Lpoly
    ldr x12,[x13,#8]
    ldr x13,[x13,#24]

    bl __ecp_nistz256_sub_from

    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret


// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to x4-x7 and b[0] to x3
.def __ecp_nistz256_mul_mont
    .type 32
.endef
.align 4
__ecp_nistz256_mul_mont:
    mul x14,x4,x3 // a[0]*b[0]
    umulh x8,x4,x3

    mul x15,x5,x3 // a[1]*b[0]
    umulh x9,x5,x3

    mul x16,x6,x3 // a[2]*b[0]
    umulh x10,x6,x3

    mul x17,x7,x3 // a[3]*b[0]
    umulh x11,x7,x3
    ldr x3,[x2,#8] // b[1]

    adds x15,x15,x8 // accumulate high parts of multiplication
    lsl x8,x14,#32
    adcs x16,x16,x9
    lsr x9,x14,#32
    adcs x17,x17,x10
    adc x19,xzr,x11
    mov x20,xzr
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    mul x8,x4,x3 // lo(a[0]*b[i])
    adcs x15,x16,x9
    mul x9,x5,x3 // lo(a[1]*b[i])
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    mul x10,x6,x3 // lo(a[2]*b[i])
    adcs x17,x19,x11
    mul x11,x7,x3 // lo(a[3]*b[i])
    adc x19,x20,xzr

    adds x14,x14,x8 // accumulate low parts of multiplication
    umulh x8,x4,x3 // hi(a[0]*b[i])
    adcs x15,x15,x9
    umulh x9,x5,x3 // hi(a[1]*b[i])
    adcs x16,x16,x10
    umulh x10,x6,x3 // hi(a[2]*b[i])
    adcs x17,x17,x11
    umulh x11,x7,x3 // hi(a[3]*b[i])
    adc x19,x19,xzr
    ldr x3,[x2,#8*(1+1)] // b[1+1]
    adds x15,x15,x8 // accumulate high parts of multiplication
    lsl x8,x14,#32
    adcs x16,x16,x9
    lsr x9,x14,#32
    adcs x17,x17,x10
    adcs x19,x19,x11
    adc x20,xzr,xzr
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    mul x8,x4,x3 // lo(a[0]*b[i])
    adcs x15,x16,x9
    mul x9,x5,x3 // lo(a[1]*b[i])
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    mul x10,x6,x3 // lo(a[2]*b[i])
    adcs x17,x19,x11
    mul x11,x7,x3 // lo(a[3]*b[i])
    adc x19,x20,xzr

    adds x14,x14,x8 // accumulate low parts of multiplication
    umulh x8,x4,x3 // hi(a[0]*b[i])
    adcs x15,x15,x9
    umulh x9,x5,x3 // hi(a[1]*b[i])
    adcs x16,x16,x10
    umulh x10,x6,x3 // hi(a[2]*b[i])
    adcs x17,x17,x11
    umulh x11,x7,x3 // hi(a[3]*b[i])
    adc x19,x19,xzr
    ldr x3,[x2,#8*(2+1)] // b[2+1]
    adds x15,x15,x8 // accumulate high parts of multiplication
    lsl x8,x14,#32
    adcs x16,x16,x9
    lsr x9,x14,#32
    adcs x17,x17,x10
    adcs x19,x19,x11
    adc x20,xzr,xzr
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    mul x8,x4,x3 // lo(a[0]*b[i])
    adcs x15,x16,x9
    mul x9,x5,x3 // lo(a[1]*b[i])
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    mul x10,x6,x3 // lo(a[2]*b[i])
    adcs x17,x19,x11
    mul x11,x7,x3 // lo(a[3]*b[i])
    adc x19,x20,xzr

    adds x14,x14,x8 // accumulate low parts of multiplication
    umulh x8,x4,x3 // hi(a[0]*b[i])
    adcs x15,x15,x9
    umulh x9,x5,x3 // hi(a[1]*b[i])
    adcs x16,x16,x10
    umulh x10,x6,x3 // hi(a[2]*b[i])
    adcs x17,x17,x11
    umulh x11,x7,x3 // hi(a[3]*b[i])
    adc x19,x19,xzr
    adds x15,x15,x8 // accumulate high parts of multiplication
    lsl x8,x14,#32
    adcs x16,x16,x9
    lsr x9,x14,#32
    adcs x17,x17,x10
    adcs x19,x19,x11
    adc x20,xzr,xzr
    // last reduction
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
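    // The per-limb reduction above relies on -p256^-1 mod 2^64 == 1, so each
    // step adds acc[0]*p256 and drops the cancelled low limb.  Since
    // p256 + 1 = 2^256 - 2^224 + 2^192 + 2^96, the addend is assembled as
    // acc[0]<<96 (the lsl/lsr pair) plus acc[0]*0xffffffff00000001 in the top
    // limbs (the subs/sbc pair); the fold below consumes the last such pair.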
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    adcs x15,x16,x9
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    adcs x17,x19,x11
    adc x19,x20,xzr

    adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
    sbcs x9,x15,x12
    sbcs x10,x16,xzr
    sbcs x11,x17,x13
    sbcs xzr,x19,xzr // did it borrow?

    csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
    csel x15,x15,x9,lo
    csel x16,x16,x10,lo
    stp x14,x15,[x0]
    csel x17,x17,x11,lo
    stp x16,x17,[x0,#16]

    ret


// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to x4-x7
.def __ecp_nistz256_sqr_mont
    .type 32
.endef
.align 4
__ecp_nistz256_sqr_mont:
    // |  |  |  |  |  |a1*a0|  |
    // |  |  |  |  |a2*a0|  |  |
    // |  |a3*a2|a3*a0|  |  |  |
    // |  |  |  |a2*a1|  |  |  |
    // |  |  |a3*a1|  |  |  |  |
    // *|  |  |  |  |  |  |  | 2|
    // +|a3*a3|a2*a2|a1*a1|a0*a0|
    // |--+--+--+--+--+--+--+--|
    // |A7|A6|A5|A4|A3|A2|A1|A0|, where each Ax is a 64-bit accumulator word
    //
    // The "can't overflow" notes below mark carries into the high part of a
    // multiplication result; these cannot overflow because the high part of
    // a 64x64-bit product can never be all ones.

    mul x15,x5,x4 // a[1]*a[0]
    umulh x9,x5,x4
    mul x16,x6,x4 // a[2]*a[0]
    umulh x10,x6,x4
    mul x17,x7,x4 // a[3]*a[0]
    umulh x19,x7,x4

    adds x16,x16,x9 // accumulate high parts of multiplication
    mul x8,x6,x5 // a[2]*a[1]
    umulh x9,x6,x5
    adcs x17,x17,x10
    mul x10,x7,x5 // a[3]*a[1]
    umulh x11,x7,x5
    adc x19,x19,xzr // can't overflow

    mul x20,x7,x6 // a[3]*a[2]
    umulh x1,x7,x6

    adds x9,x9,x10 // accumulate high parts of multiplication
    mul x14,x4,x4 // a[0]*a[0]
    adc x10,x11,xzr // can't overflow

    adds x17,x17,x8 // accumulate low parts of multiplication
    umulh x4,x4,x4
    adcs x19,x19,x9
    mul x9,x5,x5 // a[1]*a[1]
    adcs x20,x20,x10
    umulh x5,x5,x5
    adc x1,x1,xzr // can't overflow

    adds x15,x15,x15 // acc[1-6]*=2
    mul x10,x6,x6 // a[2]*a[2]
    adcs x16,x16,x16
    umulh x6,x6,x6
    adcs x17,x17,x17
    mul x11,x7,x7 // a[3]*a[3]
    adcs x19,x19,x19
    umulh x7,x7,x7
    adcs x20,x20,x20
    adcs x1,x1,x1
    adc x2,xzr,xzr

    adds x15,x15,x4 // +a[i]*a[i]
    adcs x16,x16,x9
    adcs x17,x17,x5
    adcs x19,x19,x10
    adcs x20,x20,x6
    lsl x8,x14,#32
    adcs x1,x1,x11
    lsr x9,x14,#32
    adc x2,x2,x7
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    adcs x15,x16,x9
    lsl x8,x14,#32
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    lsr x9,x14,#32
    adc x17,x11,xzr // can't overflow
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    adcs x15,x16,x9
    lsl x8,x14,#32
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    lsr x9,x14,#32
    adc x17,x11,xzr // can't overflow
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    adcs x15,x16,x9
    lsl x8,x14,#32
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    lsr x9,x14,#32
    adc x17,x11,xzr // can't overflow
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    adcs x15,x16,x9
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    adc x17,x11,xzr // can't overflow

    adds x14,x14,x19 // accumulate upper half
    adcs x15,x15,x20
    adcs x16,x16,x1
    adcs x17,x17,x2
    adc x19,xzr,xzr

    adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
    sbcs x9,x15,x12
    sbcs x10,x16,xzr
    sbcs x11,x17,x13
    sbcs xzr,x19,xzr // did it borrow?

    csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
    csel x15,x15,x9,lo
    csel x16,x16,x10,lo
    stp x14,x15,[x0]
    csel x17,x17,x11,lo
    stp x16,x17,[x0,#16]

    ret


// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
// x14-x17 and x8-x11. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.def __ecp_nistz256_add_to
    .type 32
.endef
.align 4
__ecp_nistz256_add_to:
    adds x14,x14,x8 // ret = a+b
    adcs x15,x15,x9
    adcs x16,x16,x10
    adcs x17,x17,x11
    adc x1,xzr,xzr // zap x1

    adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus
    sbcs x9,x15,x12
    sbcs x10,x16,xzr
    sbcs x11,x17,x13
    sbcs xzr,x1,xzr // did subtraction borrow?

    csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
    csel x15,x15,x9,lo
    csel x16,x16,x10,lo
    stp x14,x15,[x0]
    csel x17,x17,x11,lo
    stp x16,x17,[x0,#16]

    ret


.def __ecp_nistz256_sub_from
    .type 32
.endef
.align 4
__ecp_nistz256_sub_from:
    ldp x8,x9,[x2]
    ldp x10,x11,[x2,#16]
    subs x14,x14,x8 // ret = a-b
    sbcs x15,x15,x9
    sbcs x16,x16,x10
    sbcs x17,x17,x11
    sbc x1,xzr,xzr // zap x1

    subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus
    adcs x9,x15,x12
    adcs x10,x16,xzr
    adc x11,x17,x13
    cmp x1,xzr // did subtraction borrow?

    csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
    csel x15,x15,x9,eq
    csel x16,x16,x10,eq
    stp x14,x15,[x0]
    csel x17,x17,x11,eq
    stp x16,x17,[x0,#16]

    ret


.def __ecp_nistz256_sub_morf
    .type 32
.endef
.align 4
__ecp_nistz256_sub_morf:
    ldp x8,x9,[x2]
    ldp x10,x11,[x2,#16]
    subs x14,x8,x14 // ret = b-a
    sbcs x15,x9,x15
    sbcs x16,x10,x16
    sbcs x17,x11,x17
    sbc x1,xzr,xzr // zap x1

    subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus
    adcs x9,x15,x12
    adcs x10,x16,xzr
    adc x11,x17,x13
    cmp x1,xzr // did subtraction borrow?

    csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
    csel x15,x15,x9,eq
    csel x16,x16,x10,eq
    stp x14,x15,[x0]
    csel x17,x17,x11,eq
    stp x16,x17,[x0,#16]

    ret


.def __ecp_nistz256_div_by_2
    .type 32
.endef
.align 4
__ecp_nistz256_div_by_2:
    subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus
    adcs x9,x15,x12
    adcs x10,x16,xzr
    adcs x11,x17,x13
    adc x1,xzr,xzr // zap x1
    tst x14,#1 // is a even?

    csel x14,x14,x8,eq // ret = even ? a : a+modulus
    csel x15,x15,x9,eq
    csel x16,x16,x10,eq
    csel x17,x17,x11,eq
    csel x1,xzr,x1,eq

    lsr x14,x14,#1 // ret >>= 1
    orr x14,x14,x15,lsl#63
    lsr x15,x15,#1
    orr x15,x15,x16,lsl#63
    lsr x16,x16,#1
    orr x16,x16,x17,lsl#63
    lsr x17,x17,#1
    stp x14,x15,[x0]
    orr x17,x17,x1,lsl#63
    stp x16,x17,[x0,#16]

    ret

.globl ecp_nistz256_point_double

.def ecp_nistz256_point_double
    .type 32
.endef
.align 5
ecp_nistz256_point_double:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-96]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    sub sp,sp,#32*4

Ldouble_shortcut:
    ldp x14,x15,[x1,#32]
    mov x21,x0
    ldp x16,x17,[x1,#48]
    mov x22,x1
    adrp x13,Lpoly
    add x13,x13,:lo12:Lpoly
    ldr x12,[x13,#8]
    mov x8,x14
    ldr x13,[x13,#24]
    mov x9,x15
    ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont
    mov x10,x16
    mov x11,x17
    ldp x6,x7,[x22,#64+16]
    add x0,sp,#0
    bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y);

    add x0,sp,#64
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z);

    ldp x8,x9,[x22]
    ldp x10,x11,[x22,#16]
    mov x4,x14 // put Zsqr aside for p256_sub
    mov x5,x15
    mov x6,x16
    mov x7,x17
    add x0,sp,#32
    bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x);

    add x2,x22,#0
    mov x14,x4 // restore Zsqr
    mov x15,x5
    ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
    mov x16,x6
    mov x17,x7
    ldp x6,x7,[sp,#0+16]
    add x0,sp,#64
    bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr);

    add x0,sp,#0
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S);

    ldr x3,[x22,#32]
    ldp x4,x5,[x22,#64]
    ldp x6,x7,[x22,#64+16]
    add x2,x22,#32
    add x0,sp,#96
    bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y);

    mov x8,x14
    mov x9,x15
    ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
    mov x10,x16
    mov x11,x17
    ldp x6,x7,[sp,#0+16]
    add x0,x21,#64
    bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0);

    add x0,sp,#96
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S);

    ldr x3,[sp,#64] // forward load for p256_mul_mont
    ldp x4,x5,[sp,#32]
    ldp x6,x7,[sp,#32+16]
    add x0,x21,#32
    bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0);

    add x2,sp,#64
    add x0,sp,#32
    bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr);

    mov x8,x14 // duplicate M
    mov x9,x15
    mov x10,x16
    mov x11,x17
    mov x4,x14 // put M aside
    mov x5,x15
    mov x6,x16
    mov x7,x17
    add x0,sp,#32
    bl __ecp_nistz256_add_to
    mov x8,x4 // restore M
    mov x9,x5
    ldr x3,[x22] // forward load for p256_mul_mont
    mov x10,x6
    ldp x4,x5,[sp,#0]
    mov x11,x7
    ldp x6,x7,[sp,#0+16]
    bl __ecp_nistz256_add_to // p256_mul_by_3(M, M);

    add x2,x22,#0
    add x0,sp,#0
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x);

    mov x8,x14
    mov x9,x15
    ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont
    mov x10,x16
    mov x11,x17
    ldp x6,x7,[sp,#32+16]
    add x0,sp,#96
    bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S);

    add x0,x21,#0
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M);

    add x2,sp,#96
    bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0);

    add x2,sp,#0
    add x0,sp,#0
    bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x);

    ldr x3,[sp,#32]
    mov x4,x14 // copy S
    mov x5,x15
    mov x6,x16
    mov x7,x17
    add x2,sp,#32
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M);

    add x2,x21,#32
    add x0,x21,#32
    bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y);

    add sp,x29,#0 // destroy frame
    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldp x29,x30,[sp],#96
    AARCH64_VALIDATE_LINK_REGISTER
    ret

.globl ecp_nistz256_point_add

.def ecp_nistz256_point_add
    .type 32
.endef
.align 5
ecp_nistz256_point_add:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-96]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp x25,x26,[sp,#64]
    stp x27,x28,[sp,#80]
    sub sp,sp,#32*12

    ldp x4,x5,[x2,#64] // in2_z
    ldp x6,x7,[x2,#64+16]
    mov x21,x0
    mov x22,x1
    mov x23,x2
    adrp x13,Lpoly
    add x13,x13,:lo12:Lpoly
    ldr x12,[x13,#8]
    ldr x13,[x13,#24]
    orr x8,x4,x5
    orr x10,x6,x7
    orr x25,x8,x10
    cmp x25,#0
    csetm x25,ne // ~in2infty
    add x0,sp,#192
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z);

    ldp x4,x5,[x22,#64] // in1_z
    ldp x6,x7,[x22,#64+16]
    orr x8,x4,x5
    orr x10,x6,x7
    orr x24,x8,x10
    cmp x24,#0
    csetm x24,ne // ~in1infty
    add x0,sp,#128
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);

    ldr x3,[x23,#64]
    ldp x4,x5,[sp,#192]
    ldp x6,x7,[sp,#192+16]
    add x2,x23,#64
    add x0,sp,#320
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z);

    ldr x3,[x22,#64]
    ldp x4,x5,[sp,#128]
    ldp x6,x7,[sp,#128+16]
    add x2,x22,#64
    add x0,sp,#352
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);

    ldr x3,[x22,#32]
    ldp x4,x5,[sp,#320]
    ldp x6,x7,[sp,#320+16]
    add x2,x22,#32
    add x0,sp,#320
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y);

    ldr x3,[x23,#32]
    ldp x4,x5,[sp,#352]
    ldp x6,x7,[sp,#352+16]
    add x2,x23,#32
    add x0,sp,#352
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);

    add x2,sp,#320
    ldr x3,[sp,#192] // forward load for p256_mul_mont
    ldp x4,x5,[x22]
    ldp x6,x7,[x22,#16]
    add x0,sp,#160
    bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1);

    orr x14,x14,x15 // see if result is zero
    orr x16,x16,x17
    orr x26,x14,x16 // ~is_equal(S1,S2)

    add x2,sp,#192
    add x0,sp,#256
    bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr);

    ldr x3,[sp,#128]
    ldp x4,x5,[x23]
    ldp x6,x7,[x23,#16]
    add x2,sp,#128
    add x0,sp,#288
    bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr);

    add x2,sp,#256
    ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
    ldp x6,x7,[sp,#160+16]
    add x0,sp,#96
    bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1);

    orr x14,x14,x15 // see if result is zero
    orr x16,x16,x17
    orr x14,x14,x16 // ~is_equal(U1,U2)

    mvn x27,x24 // -1/0 -> 0/-1
    mvn x28,x25 // -1/0 -> 0/-1
    orr x14,x14,x27
    orr x14,x14,x28
    orr x14,x14,x26
    cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

Ladd_double:
    mov x1,x22
    mov x0,x21
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    add sp,sp,#256 // #256 is from #32*(12-4), the difference in stack frames
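    // Both inputs are the same point (U1 == U2 and S1 == S2 with neither
    // input at infinity), so the addition formula would degenerate; trim the
    // stack down to point_double's smaller frame and double instead.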
    b Ldouble_shortcut

.align 4
Ladd_proceed:
    add x0,sp,#192
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);

    ldr x3,[x22,#64]
    ldp x4,x5,[sp,#96]
    ldp x6,x7,[sp,#96+16]
    add x2,x22,#64
    add x0,sp,#64
    bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);

    ldp x4,x5,[sp,#96]
    ldp x6,x7,[sp,#96+16]
    add x0,sp,#128
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);

    ldr x3,[x23,#64]
    ldp x4,x5,[sp,#64]
    ldp x6,x7,[sp,#64+16]
    add x2,x23,#64
    add x0,sp,#64
    bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z);

    ldr x3,[sp,#96]
    ldp x4,x5,[sp,#128]
    ldp x6,x7,[sp,#128+16]
    add x2,sp,#96
    add x0,sp,#224
    bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);

    ldr x3,[sp,#128]
    ldp x4,x5,[sp,#256]
    ldp x6,x7,[sp,#256+16]
    add x2,sp,#128
    add x0,sp,#288
    bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr);

    mov x8,x14
    mov x9,x15
    mov x10,x16
    mov x11,x17
    add x0,sp,#128
    bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);

    add x2,sp,#192
    add x0,sp,#0
    bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);

    add x2,sp,#224
    bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);

    add x2,sp,#288
    ldr x3,[sp,#224] // forward load for p256_mul_mont
    ldp x4,x5,[sp,#320]
    ldp x6,x7,[sp,#320+16]
    add x0,sp,#32
    bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);

    add x2,sp,#224
    add x0,sp,#352
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub);

    ldr x3,[sp,#160]
    ldp x4,x5,[sp,#32]
    ldp x6,x7,[sp,#32+16]
    add x2,sp,#160
    add x0,sp,#32
    bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);

    add x2,sp,#352
    bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);

    ldp x4,x5,[sp,#0] // res
    ldp x6,x7,[sp,#0+16]
    ldp x8,x9,[x23] // in2
    ldp x10,x11,[x23,#16]
    ldp x14,x15,[x22,#0] // in1
    cmp x24,#0 // ~, remember?
    ldp x16,x17,[x22,#0+16]
    csel x8,x4,x8,ne
    csel x9,x5,x9,ne
    ldp x4,x5,[sp,#0+0+32] // res
    csel x10,x6,x10,ne
    csel x11,x7,x11,ne
    cmp x25,#0 // ~, remember?
    ldp x6,x7,[sp,#0+0+48]
    csel x14,x8,x14,ne
    csel x15,x9,x15,ne
    ldp x8,x9,[x23,#0+32] // in2
    csel x16,x10,x16,ne
    csel x17,x11,x17,ne
    ldp x10,x11,[x23,#0+48]
    stp x14,x15,[x21,#0]
    stp x16,x17,[x21,#0+16]
    ldp x14,x15,[x22,#32] // in1
    cmp x24,#0 // ~, remember?
    ldp x16,x17,[x22,#32+16]
    csel x8,x4,x8,ne
    csel x9,x5,x9,ne
    ldp x4,x5,[sp,#0+32+32] // res
    csel x10,x6,x10,ne
    csel x11,x7,x11,ne
    cmp x25,#0 // ~, remember?
    ldp x6,x7,[sp,#0+32+48]
    csel x14,x8,x14,ne
    csel x15,x9,x15,ne
    ldp x8,x9,[x23,#32+32] // in2
    csel x16,x10,x16,ne
    csel x17,x11,x17,ne
    ldp x10,x11,[x23,#32+48]
    stp x14,x15,[x21,#32]
    stp x16,x17,[x21,#32+16]
    ldp x14,x15,[x22,#64] // in1
    cmp x24,#0 // ~, remember?
    ldp x16,x17,[x22,#64+16]
    csel x8,x4,x8,ne
    csel x9,x5,x9,ne
    csel x10,x6,x10,ne
    csel x11,x7,x11,ne
    cmp x25,#0 // ~, remember?
    csel x14,x8,x14,ne
    csel x15,x9,x15,ne
    csel x16,x10,x16,ne
    csel x17,x11,x17,ne
    stp x14,x15,[x21,#64]
    stp x16,x17,[x21,#64+16]

Ladd_done:
    add sp,x29,#0 // destroy frame
    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldp x29,x30,[sp],#96
    AARCH64_VALIDATE_LINK_REGISTER
    ret

.globl ecp_nistz256_point_add_affine

.def ecp_nistz256_point_add_affine
    .type 32
.endef
.align 5
ecp_nistz256_point_add_affine:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-80]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp x25,x26,[sp,#64]
    sub sp,sp,#32*10

    mov x21,x0
    mov x22,x1
    mov x23,x2
    adrp x13,Lpoly
    add x13,x13,:lo12:Lpoly
    ldr x12,[x13,#8]
    ldr x13,[x13,#24]

    ldp x4,x5,[x1,#64] // in1_z
    ldp x6,x7,[x1,#64+16]
    orr x8,x4,x5
    orr x10,x6,x7
    orr x24,x8,x10
    cmp x24,#0
    csetm x24,ne // ~in1infty

    ldp x14,x15,[x2] // in2_x
    ldp x16,x17,[x2,#16]
    ldp x8,x9,[x2,#32] // in2_y
    ldp x10,x11,[x2,#48]
    orr x14,x14,x15
    orr x16,x16,x17
    orr x8,x8,x9
    orr x10,x10,x11
    orr x14,x14,x16
    orr x8,x8,x10
    orr x25,x14,x8
    cmp x25,#0
    csetm x25,ne // ~in2infty

    add x0,sp,#128
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);

    mov x4,x14
    mov x5,x15
    mov x6,x16
    mov x7,x17
    ldr x3,[x23]
    add x2,x23,#0
    add x0,sp,#96
    bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x);

    add x2,x22,#0
    ldr x3,[x22,#64] // forward load for p256_mul_mont
    ldp x4,x5,[sp,#128]
    ldp x6,x7,[sp,#128+16]
    add x0,sp,#160
    bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x);

    add x2,x22,#64
    add x0,sp,#128
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);

    ldr x3,[x22,#64]
    ldp x4,x5,[sp,#160]
    ldp x6,x7,[sp,#160+16]
    add x2,x22,#64
    add x0,sp,#64
    bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);

    ldr x3,[x23,#32]
    ldp x4,x5,[sp,#128]
    ldp x6,x7,[sp,#128+16]
    add x2,x23,#32
    add x0,sp,#128
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);

    add x2,x22,#32
    ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
    ldp x6,x7,[sp,#160+16]
    add x0,sp,#192
    bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y);

    add x0,sp,#224
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);

    ldp x4,x5,[sp,#192]
    ldp x6,x7,[sp,#192+16]
    add x0,sp,#288
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);

    ldr x3,[sp,#160]
    ldp x4,x5,[sp,#224]
    ldp x6,x7,[sp,#224+16]
    add x2,sp,#160
    add x0,sp,#256
    bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);

    ldr x3,[x22]
    ldp x4,x5,[sp,#224]
    ldp x6,x7,[sp,#224+16]
    add x2,x22,#0
    add x0,sp,#96
    bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr);

    mov x8,x14
    mov x9,x15
    mov x10,x16
    mov x11,x17
    add x0,sp,#224
    bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);

    add x2,sp,#288
    add x0,sp,#0
    bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);

    add x2,sp,#256
    bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);

    add x2,sp,#96
    ldr x3,[x22,#32] // forward load for p256_mul_mont
    ldp x4,x5,[sp,#256]
    ldp x6,x7,[sp,#256+16]
    add x0,sp,#32
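    // Remaining steps of the mixed addition: res_y = (U2 - res_x)*R -
    // in1_y*Hcub, followed by a constant-time selection that substitutes
    // in2 (with Z = 1 in Montgomery form, via Lone_mont) or in1 when one of
    // the inputs was the point at infinity.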
    bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);

    add x2,x22,#32
    add x0,sp,#128
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);

    ldr x3,[sp,#192]
    ldp x4,x5,[sp,#32]
    ldp x6,x7,[sp,#32+16]
    add x2,sp,#192
    add x0,sp,#32
    bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);

    add x2,sp,#128
    bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);

    ldp x4,x5,[sp,#0] // res
    ldp x6,x7,[sp,#0+16]
    ldp x8,x9,[x23] // in2
    ldp x10,x11,[x23,#16]
    ldp x14,x15,[x22,#0] // in1
    cmp x24,#0 // ~, remember?
    ldp x16,x17,[x22,#0+16]
    csel x8,x4,x8,ne
    csel x9,x5,x9,ne
    ldp x4,x5,[sp,#0+0+32] // res
    csel x10,x6,x10,ne
    csel x11,x7,x11,ne
    cmp x25,#0 // ~, remember?
    ldp x6,x7,[sp,#0+0+48]
    csel x14,x8,x14,ne
    csel x15,x9,x15,ne
    ldp x8,x9,[x23,#0+32] // in2
    csel x16,x10,x16,ne
    csel x17,x11,x17,ne
    ldp x10,x11,[x23,#0+48]
    stp x14,x15,[x21,#0]
    stp x16,x17,[x21,#0+16]
    adrp x23,Lone_mont-64
    add x23,x23,:lo12:Lone_mont-64
    ldp x14,x15,[x22,#32] // in1
    cmp x24,#0 // ~, remember?
    ldp x16,x17,[x22,#32+16]
    csel x8,x4,x8,ne
    csel x9,x5,x9,ne
    ldp x4,x5,[sp,#0+32+32] // res
    csel x10,x6,x10,ne
    csel x11,x7,x11,ne
    cmp x25,#0 // ~, remember?
    ldp x6,x7,[sp,#0+32+48]
    csel x14,x8,x14,ne
    csel x15,x9,x15,ne
    ldp x8,x9,[x23,#32+32] // in2
    csel x16,x10,x16,ne
    csel x17,x11,x17,ne
    ldp x10,x11,[x23,#32+48]
    stp x14,x15,[x21,#32]
    stp x16,x17,[x21,#32+16]
    ldp x14,x15,[x22,#64] // in1
    cmp x24,#0 // ~, remember?
    ldp x16,x17,[x22,#64+16]
    csel x8,x4,x8,ne
    csel x9,x5,x9,ne
    csel x10,x6,x10,ne
    csel x11,x7,x11,ne
    cmp x25,#0 // ~, remember?
    csel x14,x8,x14,ne
    csel x15,x9,x15,ne
    csel x16,x10,x16,ne
    csel x17,x11,x17,ne
    stp x14,x15,[x21,#64]
    stp x16,x17,[x21,#64+16]

    add sp,x29,#0 // destroy frame
    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x29,x30,[sp],#80
    AARCH64_VALIDATE_LINK_REGISTER
    ret

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl ecp_nistz256_ord_mul_mont

.def ecp_nistz256_ord_mul_mont
    .type 32
.endef
.align 4
ecp_nistz256_ord_mul_mont:
    AARCH64_VALID_CALL_TARGET
    // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
    stp x29,x30,[sp,#-64]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]

    adrp x23,Lord
    add x23,x23,:lo12:Lord
    ldr x3,[x2] // bp[0]
    ldp x4,x5,[x1]
    ldp x6,x7,[x1,#16]

    ldp x12,x13,[x23,#0]
    ldp x21,x22,[x23,#16]
    ldr x23,[x23,#32]

    mul x14,x4,x3 // a[0]*b[0]
    umulh x8,x4,x3

    mul x15,x5,x3 // a[1]*b[0]
    umulh x9,x5,x3

    mul x16,x6,x3 // a[2]*b[0]
    umulh x10,x6,x3

    mul x17,x7,x3 // a[3]*b[0]
    umulh x19,x7,x3

    mul x24,x14,x23

    adds x15,x15,x8 // accumulate high parts of multiplication
    adcs x16,x16,x9
    adcs x17,x17,x10
    adc x19,x19,xzr
    mov x20,xzr
    ldr x3,[x2,#8*1] // b[i]

    lsl x8,x24,#32
    subs x16,x16,x24
    lsr x9,x24,#32
    sbcs x17,x17,x8
    sbcs x19,x19,x9
    sbc x20,x20,xzr

    subs xzr,x14,#1
    umulh x9,x12,x24
    mul x10,x13,x24
    umulh x11,x13,x24

    adcs x10,x10,x9
    mul x8,x4,x3
    adc x11,x11,xzr
    mul x9,x5,x3

    adds x14,x15,x10
    mul x10,x6,x3
    adcs x15,x16,x11
    mul x11,x7,x3
    adcs x16,x17,x24
    adcs x17,x19,x24
    adc x19,x20,xzr

    adds x14,x14,x8 // accumulate low parts
    umulh x8,x4,x3
    adcs x15,x15,x9
    umulh x9,x5,x3
    adcs x16,x16,x10
    umulh x10,x6,x3
    adcs x17,x17,x11
    umulh x11,x7,x3
    adc x19,x19,xzr
    mul x24,x14,x23
    adds x15,x15,x8 // accumulate high parts
    adcs x16,x16,x9
    adcs x17,x17,x10
    adcs x19,x19,x11
    adc x20,xzr,xzr
    ldr x3,[x2,#8*2] // b[i]

    lsl x8,x24,#32
    subs x16,x16,x24
    lsr x9,x24,#32
    sbcs x17,x17,x8
    sbcs x19,x19,x9
    sbc x20,x20,xzr

    subs xzr,x14,#1
    umulh x9,x12,x24
    mul x10,x13,x24
    umulh x11,x13,x24

    adcs x10,x10,x9
    mul x8,x4,x3
    adc x11,x11,xzr
    mul x9,x5,x3

    adds x14,x15,x10
    mul x10,x6,x3
    adcs x15,x16,x11
    mul x11,x7,x3
    adcs x16,x17,x24
    adcs x17,x19,x24
    adc x19,x20,xzr

    adds x14,x14,x8 // accumulate low parts
    umulh x8,x4,x3
    adcs x15,x15,x9
    umulh x9,x5,x3
    adcs x16,x16,x10
    umulh x10,x6,x3
    adcs x17,x17,x11
    umulh x11,x7,x3
    adc x19,x19,xzr
    mul x24,x14,x23
    adds x15,x15,x8 // accumulate high parts
    adcs x16,x16,x9
    adcs x17,x17,x10
    adcs x19,x19,x11
    adc x20,xzr,xzr
    ldr x3,[x2,#8*3] // b[i]

    lsl x8,x24,#32
    subs x16,x16,x24
    lsr x9,x24,#32
    sbcs x17,x17,x8
    sbcs x19,x19,x9
    sbc x20,x20,xzr

    subs xzr,x14,#1
    umulh x9,x12,x24
    mul x10,x13,x24
    umulh x11,x13,x24

    adcs x10,x10,x9
    mul x8,x4,x3
    adc x11,x11,xzr
    mul x9,x5,x3

    adds x14,x15,x10
    mul x10,x6,x3
    adcs x15,x16,x11
    mul x11,x7,x3
    adcs x16,x17,x24
    adcs x17,x19,x24
    adc x19,x20,xzr

    adds x14,x14,x8 // accumulate low parts
    umulh x8,x4,x3
    adcs x15,x15,x9
    umulh x9,x5,x3
    adcs x16,x16,x10
    umulh x10,x6,x3
    adcs x17,x17,x11
    umulh x11,x7,x3
    adc x19,x19,xzr
    mul x24,x14,x23
    adds x15,x15,x8 // accumulate high parts
    adcs x16,x16,x9
    adcs x17,x17,x10
    adcs x19,x19,x11
    adc x20,xzr,xzr
    lsl x8,x24,#32 // last reduction
    subs x16,x16,x24
    lsr x9,x24,#32
    sbcs x17,x17,x8
    sbcs x19,x19,x9
    sbc x20,x20,xzr

    subs xzr,x14,#1
    umulh x9,x12,x24
    mul x10,x13,x24
    umulh x11,x13,x24

    adcs x10,x10,x9
    adc x11,x11,xzr
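    // Tail of the last Montgomery step for the group order: acc += x24*ord
    // with x24 = acc[0]*ordK (ordK = -ord[0]^-1 mod 2^64, see LordK), which
    // cancels the low limb; the conditional subtraction below then brings
    // the result under ord.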

    adds x14,x15,x10
    adcs x15,x16,x11
    adcs x16,x17,x24
    adcs x17,x19,x24
    adc x19,x20,xzr

    subs x8,x14,x12 // ret -= modulus
    sbcs x9,x15,x13
    sbcs x10,x16,x21
    sbcs x11,x17,x22
    sbcs xzr,x19,xzr

    csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
    csel x15,x15,x9,lo
    csel x16,x16,x10,lo
    stp x14,x15,[x0]
    csel x17,x17,x11,lo
    stp x16,x17,[x0,#16]

    ldp x19,x20,[sp,#16]
    ldp x21,x22,[sp,#32]
    ldp x23,x24,[sp,#48]
    ldr x29,[sp],#64
    ret


////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t rep);
.globl ecp_nistz256_ord_sqr_mont

.def ecp_nistz256_ord_sqr_mont
    .type 32
.endef
.align 4
ecp_nistz256_ord_sqr_mont:
    AARCH64_VALID_CALL_TARGET
    // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
    stp x29,x30,[sp,#-64]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]

    adrp x23,Lord
    add x23,x23,:lo12:Lord
    ldp x4,x5,[x1]
    ldp x6,x7,[x1,#16]

    ldp x12,x13,[x23,#0]
    ldp x21,x22,[x23,#16]
    ldr x23,[x23,#32]
    b Loop_ord_sqr

.align 4
Loop_ord_sqr:
    sub x2,x2,#1
    ////////////////////////////////////////////////////////////////
    // |  |  |  |  |  |a1*a0|  |
    // |  |  |  |  |a2*a0|  |  |
    // |  |a3*a2|a3*a0|  |  |  |
    // |  |  |  |a2*a1|  |  |  |
    // |  |  |a3*a1|  |  |  |  |
    // *|  |  |  |  |  |  |  | 2|
    // +|a3*a3|a2*a2|a1*a1|a0*a0|
    // |--+--+--+--+--+--+--+--|
    // |A7|A6|A5|A4|A3|A2|A1|A0|, where each Ax is a 64-bit accumulator word
    //
    // The "can't overflow" notes below mark carries into the high part of a
    // multiplication result; these cannot overflow because the high part of
    // a 64x64-bit product can never be all ones.
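    //
    // In other words, with a = a3*2^192 + a2*2^128 + a1*2^64 + a0:
    //   a^2 = sum_i(ai^2 * 2^(128*i)) + 2*sum_{i<j}(ai*aj * 2^(64*(i+j)))
    // The cross products are accumulated first, doubled ("acc[1-6]*=2"
    // below), and the squares a0^2..a3^2 are then added in.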

    mul x15,x5,x4 // a[1]*a[0]
    umulh x9,x5,x4
    mul x16,x6,x4 // a[2]*a[0]
    umulh x10,x6,x4
    mul x17,x7,x4 // a[3]*a[0]
    umulh x19,x7,x4

    adds x16,x16,x9 // accumulate high parts of multiplication
    mul x8,x6,x5 // a[2]*a[1]
    umulh x9,x6,x5
    adcs x17,x17,x10
    mul x10,x7,x5 // a[3]*a[1]
    umulh x11,x7,x5
    adc x19,x19,xzr // can't overflow

    mul x20,x7,x6 // a[3]*a[2]
    umulh x1,x7,x6

    adds x9,x9,x10 // accumulate high parts of multiplication
    mul x14,x4,x4 // a[0]*a[0]
    adc x10,x11,xzr // can't overflow

    adds x17,x17,x8 // accumulate low parts of multiplication
    umulh x4,x4,x4
    adcs x19,x19,x9
    mul x9,x5,x5 // a[1]*a[1]
    adcs x20,x20,x10
    umulh x5,x5,x5
    adc x1,x1,xzr // can't overflow

    adds x15,x15,x15 // acc[1-6]*=2
    mul x10,x6,x6 // a[2]*a[2]
    adcs x16,x16,x16
    umulh x6,x6,x6
    adcs x17,x17,x17
    mul x11,x7,x7 // a[3]*a[3]
    adcs x19,x19,x19
    umulh x7,x7,x7
    adcs x20,x20,x20
    adcs x1,x1,x1
    adc x3,xzr,xzr

    adds x15,x15,x4 // +a[i]*a[i]
    mul x24,x14,x23
    adcs x16,x16,x9
    adcs x17,x17,x5
    adcs x19,x19,x10
    adcs x20,x20,x6
    adcs x1,x1,x11
    adc x3,x3,x7
    subs xzr,x14,#1
    umulh x9,x12,x24
    mul x10,x13,x24
    umulh x11,x13,x24

    adcs x10,x10,x9
    adc x11,x11,xzr

    adds x14,x15,x10
    adcs x15,x16,x11
    adcs x16,x17,x24
    adc x17,xzr,x24 // can't overflow
    mul x11,x14,x23
    lsl x8,x24,#32
    subs x15,x15,x24
    lsr x9,x24,#32
    sbcs x16,x16,x8
    sbc x17,x17,x9 // can't borrow
    subs xzr,x14,#1
    umulh x9,x12,x11
    mul x10,x13,x11
    umulh x24,x13,x11

    adcs x10,x10,x9
    adc x24,x24,xzr

    adds x14,x15,x10
    adcs x15,x16,x24
    adcs x16,x17,x11
    adc x17,xzr,x11 // can't overflow
    mul x24,x14,x23
    lsl x8,x11,#32
    subs x15,x15,x11
    lsr x9,x11,#32
    sbcs x16,x16,x8
    sbc x17,x17,x9 // can't borrow
    subs xzr,x14,#1
    umulh x9,x12,x24
    mul x10,x13,x24
    umulh x11,x13,x24

    adcs x10,x10,x9
    adc x11,x11,xzr

    adds x14,x15,x10
    adcs x15,x16,x11
    adcs x16,x17,x24
    adc x17,xzr,x24 // can't overflow
    mul x11,x14,x23
    lsl x8,x24,#32
    subs x15,x15,x24
    lsr x9,x24,#32
    sbcs x16,x16,x8
    sbc x17,x17,x9 // can't borrow
    subs xzr,x14,#1
    umulh x9,x12,x11
    mul x10,x13,x11
    umulh x24,x13,x11

    adcs x10,x10,x9
    adc x24,x24,xzr

    adds x14,x15,x10
    adcs x15,x16,x24
    adcs x16,x17,x11
    adc x17,xzr,x11 // can't overflow
    lsl x8,x11,#32
    subs x15,x15,x11
    lsr x9,x11,#32
    sbcs x16,x16,x8
    sbc x17,x17,x9 // can't borrow
    adds x14,x14,x19 // accumulate upper half
    adcs x15,x15,x20
    adcs x16,x16,x1
    adcs x17,x17,x3
    adc x19,xzr,xzr

    subs x8,x14,x12 // ret -= modulus
    sbcs x9,x15,x13
    sbcs x10,x16,x21
    sbcs x11,x17,x22
    sbcs xzr,x19,xzr

    csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus
    csel x5,x15,x9,lo
    csel x6,x16,x10,lo
    csel x7,x17,x11,lo

    cbnz x2,Loop_ord_sqr

    stp x4,x5,[x0]
    stp x6,x7,[x0,#16]

    ldp x19,x20,[sp,#16]
    ldp x21,x22,[sp,#32]
    ldp x23,x24,[sp,#48]
    ldr x29,[sp],#64
    ret

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_select_w5

.def ecp_nistz256_select_w5
    .type 32
.endef
.align 4
ecp_nistz256_select_w5:
    AARCH64_VALID_CALL_TARGET

    // x10 := x0
    // w9 := 0; loop counter and incremented internal index
    mov x10, x0
    mov w9, #0

    // [v16-v21] := 0
    movi v16.16b, #0
    movi v17.16b, #0
    movi v18.16b, #0
    movi v19.16b, #0
    movi v20.16b, #0
    movi v21.16b, #0

Lselect_w5_loop:
    // Loop 16 times.

    // Increment index (loop counter); tested at the end of the loop
    add w9, w9, #1

    // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
    // and advance x1 to point to the next entry
    ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

    // x11 := (w9 == w2)? All 1s : All 0s
    cmp w9, w2
    csetm x11, eq

    // continue loading ...
    ld1 {v26.2d, v27.2d}, [x1],#32

    // duplicate mask_64 into Mask (all 0s or all 1s)
    dup v3.2d, x11

    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
    // i.e., values in output registers will remain the same if w9 != w2
    bit v16.16b, v22.16b, v3.16b
    bit v17.16b, v23.16b, v3.16b

    bit v18.16b, v24.16b, v3.16b
    bit v19.16b, v25.16b, v3.16b

    bit v20.16b, v26.16b, v3.16b
    bit v21.16b, v27.16b, v3.16b

    // If bit #4 is 0 (i.e. idx_ctr < 16) loop back
    tbz w9, #4, Lselect_w5_loop

    // Write [v16-v21] to memory at the output pointer
    st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
    st1 {v20.2d, v21.2d}, [x10]

    ret



////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_select_w7

.def ecp_nistz256_select_w7
    .type 32
.endef
.align 4
ecp_nistz256_select_w7:
    AARCH64_VALID_CALL_TARGET

    // w9 := 0; loop counter and incremented internal index
    mov w9, #0

    // [v16-v21] := 0
    movi v16.16b, #0
    movi v17.16b, #0
    movi v18.16b, #0
    movi v19.16b, #0

Lselect_w7_loop:
    // Loop 64 times.

    // Increment index (loop counter); tested at the end of the loop
    add w9, w9, #1

    // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
    // and advance x1 to point to the next entry
    ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

    // x11 := (w9 == w2)? All 1s : All 0s
    cmp w9, w2
    csetm x11, eq

    // duplicate mask_64 into Mask (all 0s or all 1s)
    dup v3.2d, x11

    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
    // i.e., values in output registers will remain the same if w9 != w2
    bit v16.16b, v22.16b, v3.16b
    bit v17.16b, v23.16b, v3.16b

    bit v18.16b, v24.16b, v3.16b
    bit v19.16b, v25.16b, v3.16b

    // If bit #6 is 0 (i.e. idx_ctr < 64) loop back
    tbz w9, #6, Lselect_w7_loop

    // Write [v16-v19] to memory at the output pointer
    st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0]

    ret

#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)