1// This file is generated from a similarly-named Perl script in the BoringSSL 2// source tree. Do not edit by hand. 3 4#include <ring-core/asm_base.h> 5 6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) 7#include "ring-core/arm_arch.h" 8 9.section .rodata 10.align 5 11.Lpoly: 12.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 13.LRR: // 2^512 mod P precomputed for NIST P256 polynomial 14.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd 15.Lone_mont: 16.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe 17.Lone: 18.quad 1,0,0,0 19.Lord: 20.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 21.LordK: 22.quad 0xccd1c8aaee00bc4f 23.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 24.align 2 25.text 26 27// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], 28// const BN_ULONG x2[4]); 29.globl ecp_nistz256_mul_mont 30.hidden ecp_nistz256_mul_mont 31.type ecp_nistz256_mul_mont,%function 32.align 4 33ecp_nistz256_mul_mont: 34 AARCH64_SIGN_LINK_REGISTER 35 stp x29,x30,[sp,#-32]! 36 add x29,sp,#0 37 stp x19,x20,[sp,#16] 38 39 ldr x3,[x2] // bp[0] 40 ldp x4,x5,[x1] 41 ldp x6,x7,[x1,#16] 42 adrp x13,.Lpoly 43 add x13,x13,:lo12:.Lpoly 44 ldr x12,[x13,#8] 45 ldr x13,[x13,#24] 46 47 bl __ecp_nistz256_mul_mont 48 49 ldp x19,x20,[sp,#16] 50 ldp x29,x30,[sp],#32 51 AARCH64_VALIDATE_LINK_REGISTER 52 ret 53.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont 54 55// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); 56.globl ecp_nistz256_sqr_mont 57.hidden ecp_nistz256_sqr_mont 58.type ecp_nistz256_sqr_mont,%function 59.align 4 60ecp_nistz256_sqr_mont: 61 AARCH64_SIGN_LINK_REGISTER 62 stp x29,x30,[sp,#-32]! 63 add x29,sp,#0 64 stp x19,x20,[sp,#16] 65 66 ldp x4,x5,[x1] 67 ldp x6,x7,[x1,#16] 68 adrp x13,.Lpoly 69 add x13,x13,:lo12:.Lpoly 70 ldr x12,[x13,#8] 71 ldr x13,[x13,#24] 72 73 bl __ecp_nistz256_sqr_mont 74 75 ldp x19,x20,[sp,#16] 76 ldp x29,x30,[sp],#32 77 AARCH64_VALIDATE_LINK_REGISTER 78 ret 79.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont 80 81// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); 82.globl ecp_nistz256_neg 83.hidden ecp_nistz256_neg 84.type ecp_nistz256_neg,%function 85.align 4 86ecp_nistz256_neg: 87 AARCH64_SIGN_LINK_REGISTER 88 stp x29,x30,[sp,#-16]! 89 add x29,sp,#0 90 91 mov x2,x1 92 mov x14,xzr // a = 0 93 mov x15,xzr 94 mov x16,xzr 95 mov x17,xzr 96 adrp x13,.Lpoly 97 add x13,x13,:lo12:.Lpoly 98 ldr x12,[x13,#8] 99 ldr x13,[x13,#24] 100 101 bl __ecp_nistz256_sub_from 102 103 ldp x29,x30,[sp],#16 104 AARCH64_VALIDATE_LINK_REGISTER 105 ret 106.size ecp_nistz256_neg,.-ecp_nistz256_neg 107 108// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded 109// to x4-x7 and b[0] - to x3 110.type __ecp_nistz256_mul_mont,%function 111.align 4 112__ecp_nistz256_mul_mont: 113 mul x14,x4,x3 // a[0]*b[0] 114 umulh x8,x4,x3 115 116 mul x15,x5,x3 // a[1]*b[0] 117 umulh x9,x5,x3 118 119 mul x16,x6,x3 // a[2]*b[0] 120 umulh x10,x6,x3 121 122 mul x17,x7,x3 // a[3]*b[0] 123 umulh x11,x7,x3 124 ldr x3,[x2,#8] // b[1] 125 126 adds x15,x15,x8 // accumulate high parts of multiplication 127 lsl x8,x14,#32 128 adcs x16,x16,x9 129 lsr x9,x14,#32 130 adcs x17,x17,x10 131 adc x19,xzr,x11 132 mov x20,xzr 133 subs x10,x14,x8 // "*0xffff0001" 134 sbc x11,x14,x9 135 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] 136 mul x8,x4,x3 // lo(a[0]*b[i]) 137 adcs x15,x16,x9 138 mul x9,x5,x3 // lo(a[1]*b[i]) 139 adcs x16,x17,x10 // +=acc[0]*0xffff0001 140 mul x10,x6,x3 // lo(a[2]*b[i]) 141 adcs x17,x19,x11 142 mul x11,x7,x3 // lo(a[3]*b[i]) 143 adc x19,x20,xzr 144 145 adds x14,x14,x8 // accumulate low parts of multiplication 146 umulh x8,x4,x3 // hi(a[0]*b[i]) 147 adcs x15,x15,x9 148 umulh x9,x5,x3 // hi(a[1]*b[i]) 149 adcs x16,x16,x10 150 umulh x10,x6,x3 // hi(a[2]*b[i]) 151 adcs x17,x17,x11 152 umulh x11,x7,x3 // hi(a[3]*b[i]) 153 adc x19,x19,xzr 154 ldr x3,[x2,#8*(1+1)] // b[1+1] 155 adds x15,x15,x8 // accumulate high parts of multiplication 156 lsl x8,x14,#32 157 adcs x16,x16,x9 158 lsr x9,x14,#32 159 adcs x17,x17,x10 160 adcs x19,x19,x11 161 adc x20,xzr,xzr 162 subs x10,x14,x8 // "*0xffff0001" 163 sbc x11,x14,x9 164 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] 165 mul x8,x4,x3 // lo(a[0]*b[i]) 166 adcs x15,x16,x9 167 mul x9,x5,x3 // lo(a[1]*b[i]) 168 adcs x16,x17,x10 // +=acc[0]*0xffff0001 169 mul x10,x6,x3 // lo(a[2]*b[i]) 170 adcs x17,x19,x11 171 mul x11,x7,x3 // lo(a[3]*b[i]) 172 adc x19,x20,xzr 173 174 adds x14,x14,x8 // accumulate low parts of multiplication 175 umulh x8,x4,x3 // hi(a[0]*b[i]) 176 adcs x15,x15,x9 177 umulh x9,x5,x3 // hi(a[1]*b[i]) 178 adcs x16,x16,x10 179 umulh x10,x6,x3 // hi(a[2]*b[i]) 180 adcs x17,x17,x11 181 umulh x11,x7,x3 // hi(a[3]*b[i]) 182 adc x19,x19,xzr 183 ldr x3,[x2,#8*(2+1)] // b[2+1] 184 adds x15,x15,x8 // accumulate high parts of multiplication 185 lsl x8,x14,#32 186 adcs x16,x16,x9 187 lsr x9,x14,#32 188 adcs x17,x17,x10 189 adcs x19,x19,x11 190 adc x20,xzr,xzr 191 subs x10,x14,x8 // "*0xffff0001" 192 sbc x11,x14,x9 193 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] 194 mul x8,x4,x3 // lo(a[0]*b[i]) 195 adcs x15,x16,x9 196 mul x9,x5,x3 // lo(a[1]*b[i]) 197 adcs x16,x17,x10 // +=acc[0]*0xffff0001 198 mul x10,x6,x3 // lo(a[2]*b[i]) 199 adcs x17,x19,x11 200 mul x11,x7,x3 // lo(a[3]*b[i]) 201 adc x19,x20,xzr 202 203 adds x14,x14,x8 // accumulate low parts of multiplication 204 umulh x8,x4,x3 // hi(a[0]*b[i]) 205 adcs x15,x15,x9 206 umulh x9,x5,x3 // hi(a[1]*b[i]) 207 adcs x16,x16,x10 208 umulh x10,x6,x3 // hi(a[2]*b[i]) 209 adcs x17,x17,x11 210 umulh x11,x7,x3 // hi(a[3]*b[i]) 211 adc x19,x19,xzr 212 adds x15,x15,x8 // accumulate high parts of multiplication 213 lsl x8,x14,#32 214 adcs x16,x16,x9 215 lsr x9,x14,#32 216 adcs x17,x17,x10 217 adcs x19,x19,x11 218 adc x20,xzr,xzr 219 // last reduction 220 subs x10,x14,x8 // "*0xffff0001" 221 sbc x11,x14,x9 222 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] 223 adcs x15,x16,x9 224 adcs x16,x17,x10 // +=acc[0]*0xffff0001 225 adcs x17,x19,x11 226 adc x19,x20,xzr 227 228 adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus 229 sbcs x9,x15,x12 230 sbcs x10,x16,xzr 231 sbcs x11,x17,x13 232 sbcs xzr,x19,xzr // did it borrow? 233 234 csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus 235 csel x15,x15,x9,lo 236 csel x16,x16,x10,lo 237 stp x14,x15,[x0] 238 csel x17,x17,x11,lo 239 stp x16,x17,[x0,#16] 240 241 ret 242.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont 243 244// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded 245// to x4-x7 246.type __ecp_nistz256_sqr_mont,%function 247.align 4 248__ecp_nistz256_sqr_mont: 249 // | | | | | |a1*a0| | 250 // | | | | |a2*a0| | | 251 // | |a3*a2|a3*a0| | | | 252 // | | | |a2*a1| | | | 253 // | | |a3*a1| | | | | 254 // *| | | | | | | | 2| 255 // +|a3*a3|a2*a2|a1*a1|a0*a0| 256 // |--+--+--+--+--+--+--+--| 257 // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow 258 // 259 // "can't overflow" below mark carrying into high part of 260 // multiplication result, which can't overflow, because it 261 // can never be all ones. 262 263 mul x15,x5,x4 // a[1]*a[0] 264 umulh x9,x5,x4 265 mul x16,x6,x4 // a[2]*a[0] 266 umulh x10,x6,x4 267 mul x17,x7,x4 // a[3]*a[0] 268 umulh x19,x7,x4 269 270 adds x16,x16,x9 // accumulate high parts of multiplication 271 mul x8,x6,x5 // a[2]*a[1] 272 umulh x9,x6,x5 273 adcs x17,x17,x10 274 mul x10,x7,x5 // a[3]*a[1] 275 umulh x11,x7,x5 276 adc x19,x19,xzr // can't overflow 277 278 mul x20,x7,x6 // a[3]*a[2] 279 umulh x1,x7,x6 280 281 adds x9,x9,x10 // accumulate high parts of multiplication 282 mul x14,x4,x4 // a[0]*a[0] 283 adc x10,x11,xzr // can't overflow 284 285 adds x17,x17,x8 // accumulate low parts of multiplication 286 umulh x4,x4,x4 287 adcs x19,x19,x9 288 mul x9,x5,x5 // a[1]*a[1] 289 adcs x20,x20,x10 290 umulh x5,x5,x5 291 adc x1,x1,xzr // can't overflow 292 293 adds x15,x15,x15 // acc[1-6]*=2 294 mul x10,x6,x6 // a[2]*a[2] 295 adcs x16,x16,x16 296 umulh x6,x6,x6 297 adcs x17,x17,x17 298 mul x11,x7,x7 // a[3]*a[3] 299 adcs x19,x19,x19 300 umulh x7,x7,x7 301 adcs x20,x20,x20 302 adcs x1,x1,x1 303 adc x2,xzr,xzr 304 305 adds x15,x15,x4 // +a[i]*a[i] 306 adcs x16,x16,x9 307 adcs x17,x17,x5 308 adcs x19,x19,x10 309 adcs x20,x20,x6 310 lsl x8,x14,#32 311 adcs x1,x1,x11 312 lsr x9,x14,#32 313 adc x2,x2,x7 314 subs x10,x14,x8 // "*0xffff0001" 315 sbc x11,x14,x9 316 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] 317 adcs x15,x16,x9 318 lsl x8,x14,#32 319 adcs x16,x17,x10 // +=acc[0]*0xffff0001 320 lsr x9,x14,#32 321 adc x17,x11,xzr // can't overflow 322 subs x10,x14,x8 // "*0xffff0001" 323 sbc x11,x14,x9 324 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] 325 adcs x15,x16,x9 326 lsl x8,x14,#32 327 adcs x16,x17,x10 // +=acc[0]*0xffff0001 328 lsr x9,x14,#32 329 adc x17,x11,xzr // can't overflow 330 subs x10,x14,x8 // "*0xffff0001" 331 sbc x11,x14,x9 332 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] 333 adcs x15,x16,x9 334 lsl x8,x14,#32 335 adcs x16,x17,x10 // +=acc[0]*0xffff0001 336 lsr x9,x14,#32 337 adc x17,x11,xzr // can't overflow 338 subs x10,x14,x8 // "*0xffff0001" 339 sbc x11,x14,x9 340 adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] 341 adcs x15,x16,x9 342 adcs x16,x17,x10 // +=acc[0]*0xffff0001 343 adc x17,x11,xzr // can't overflow 344 345 adds x14,x14,x19 // accumulate upper half 346 adcs x15,x15,x20 347 adcs x16,x16,x1 348 adcs x17,x17,x2 349 adc x19,xzr,xzr 350 351 adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus 352 sbcs x9,x15,x12 353 sbcs x10,x16,xzr 354 sbcs x11,x17,x13 355 sbcs xzr,x19,xzr // did it borrow? 356 357 csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus 358 csel x15,x15,x9,lo 359 csel x16,x16,x10,lo 360 stp x14,x15,[x0] 361 csel x17,x17,x11,lo 362 stp x16,x17,[x0,#16] 363 364 ret 365.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont 366 367// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to 368// x4-x7 and x8-x11. This is done because it's used in multiple 369// contexts, e.g. in multiplication by 2 and 3... 370.type __ecp_nistz256_add_to,%function 371.align 4 372__ecp_nistz256_add_to: 373 adds x14,x14,x8 // ret = a+b 374 adcs x15,x15,x9 375 adcs x16,x16,x10 376 adcs x17,x17,x11 377 adc x1,xzr,xzr // zap x1 378 379 adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus 380 sbcs x9,x15,x12 381 sbcs x10,x16,xzr 382 sbcs x11,x17,x13 383 sbcs xzr,x1,xzr // did subtraction borrow? 384 385 csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus 386 csel x15,x15,x9,lo 387 csel x16,x16,x10,lo 388 stp x14,x15,[x0] 389 csel x17,x17,x11,lo 390 stp x16,x17,[x0,#16] 391 392 ret 393.size __ecp_nistz256_add_to,.-__ecp_nistz256_add_to 394 395.type __ecp_nistz256_sub_from,%function 396.align 4 397__ecp_nistz256_sub_from: 398 ldp x8,x9,[x2] 399 ldp x10,x11,[x2,#16] 400 subs x14,x14,x8 // ret = a-b 401 sbcs x15,x15,x9 402 sbcs x16,x16,x10 403 sbcs x17,x17,x11 404 sbc x1,xzr,xzr // zap x1 405 406 subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus 407 adcs x9,x15,x12 408 adcs x10,x16,xzr 409 adc x11,x17,x13 410 cmp x1,xzr // did subtraction borrow? 411 412 csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret 413 csel x15,x15,x9,eq 414 csel x16,x16,x10,eq 415 stp x14,x15,[x0] 416 csel x17,x17,x11,eq 417 stp x16,x17,[x0,#16] 418 419 ret 420.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from 421 422.type __ecp_nistz256_sub_morf,%function 423.align 4 424__ecp_nistz256_sub_morf: 425 ldp x8,x9,[x2] 426 ldp x10,x11,[x2,#16] 427 subs x14,x8,x14 // ret = b-a 428 sbcs x15,x9,x15 429 sbcs x16,x10,x16 430 sbcs x17,x11,x17 431 sbc x1,xzr,xzr // zap x1 432 433 subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus 434 adcs x9,x15,x12 435 adcs x10,x16,xzr 436 adc x11,x17,x13 437 cmp x1,xzr // did subtraction borrow? 438 439 csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret 440 csel x15,x15,x9,eq 441 csel x16,x16,x10,eq 442 stp x14,x15,[x0] 443 csel x17,x17,x11,eq 444 stp x16,x17,[x0,#16] 445 446 ret 447.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf 448 449.type __ecp_nistz256_div_by_2,%function 450.align 4 451__ecp_nistz256_div_by_2: 452 subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus 453 adcs x9,x15,x12 454 adcs x10,x16,xzr 455 adcs x11,x17,x13 456 adc x1,xzr,xzr // zap x1 457 tst x14,#1 // is a even? 458 459 csel x14,x14,x8,eq // ret = even ? a : a+modulus 460 csel x15,x15,x9,eq 461 csel x16,x16,x10,eq 462 csel x17,x17,x11,eq 463 csel x1,xzr,x1,eq 464 465 lsr x14,x14,#1 // ret >>= 1 466 orr x14,x14,x15,lsl#63 467 lsr x15,x15,#1 468 orr x15,x15,x16,lsl#63 469 lsr x16,x16,#1 470 orr x16,x16,x17,lsl#63 471 lsr x17,x17,#1 472 stp x14,x15,[x0] 473 orr x17,x17,x1,lsl#63 474 stp x16,x17,[x0,#16] 475 476 ret 477.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 478.globl ecp_nistz256_point_double 479.hidden ecp_nistz256_point_double 480.type ecp_nistz256_point_double,%function 481.align 5 482ecp_nistz256_point_double: 483 AARCH64_SIGN_LINK_REGISTER 484 stp x29,x30,[sp,#-96]! 485 add x29,sp,#0 486 stp x19,x20,[sp,#16] 487 stp x21,x22,[sp,#32] 488 sub sp,sp,#32*4 489 490.Ldouble_shortcut: 491 ldp x14,x15,[x1,#32] 492 mov x21,x0 493 ldp x16,x17,[x1,#48] 494 mov x22,x1 495 adrp x13,.Lpoly 496 add x13,x13,:lo12:.Lpoly 497 ldr x12,[x13,#8] 498 mov x8,x14 499 ldr x13,[x13,#24] 500 mov x9,x15 501 ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont 502 mov x10,x16 503 mov x11,x17 504 ldp x6,x7,[x22,#64+16] 505 add x0,sp,#0 506 bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); 507 508 add x0,sp,#64 509 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); 510 511 ldp x8,x9,[x22] 512 ldp x10,x11,[x22,#16] 513 mov x4,x14 // put Zsqr aside for p256_sub 514 mov x5,x15 515 mov x6,x16 516 mov x7,x17 517 add x0,sp,#32 518 bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); 519 520 add x2,x22,#0 521 mov x14,x4 // restore Zsqr 522 mov x15,x5 523 ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont 524 mov x16,x6 525 mov x17,x7 526 ldp x6,x7,[sp,#0+16] 527 add x0,sp,#64 528 bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); 529 530 add x0,sp,#0 531 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); 532 533 ldr x3,[x22,#32] 534 ldp x4,x5,[x22,#64] 535 ldp x6,x7,[x22,#64+16] 536 add x2,x22,#32 537 add x0,sp,#96 538 bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); 539 540 mov x8,x14 541 mov x9,x15 542 ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont 543 mov x10,x16 544 mov x11,x17 545 ldp x6,x7,[sp,#0+16] 546 add x0,x21,#64 547 bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); 548 549 add x0,sp,#96 550 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); 551 552 ldr x3,[sp,#64] // forward load for p256_mul_mont 553 ldp x4,x5,[sp,#32] 554 ldp x6,x7,[sp,#32+16] 555 add x0,x21,#32 556 bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); 557 558 add x2,sp,#64 559 add x0,sp,#32 560 bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); 561 562 mov x8,x14 // duplicate M 563 mov x9,x15 564 mov x10,x16 565 mov x11,x17 566 mov x4,x14 // put M aside 567 mov x5,x15 568 mov x6,x16 569 mov x7,x17 570 add x0,sp,#32 571 bl __ecp_nistz256_add_to 572 mov x8,x4 // restore M 573 mov x9,x5 574 ldr x3,[x22] // forward load for p256_mul_mont 575 mov x10,x6 576 ldp x4,x5,[sp,#0] 577 mov x11,x7 578 ldp x6,x7,[sp,#0+16] 579 bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); 580 581 add x2,x22,#0 582 add x0,sp,#0 583 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); 584 585 mov x8,x14 586 mov x9,x15 587 ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont 588 mov x10,x16 589 mov x11,x17 590 ldp x6,x7,[sp,#32+16] 591 add x0,sp,#96 592 bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); 593 594 add x0,x21,#0 595 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); 596 597 add x2,sp,#96 598 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); 599 600 add x2,sp,#0 601 add x0,sp,#0 602 bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); 603 604 ldr x3,[sp,#32] 605 mov x4,x14 // copy S 606 mov x5,x15 607 mov x6,x16 608 mov x7,x17 609 add x2,sp,#32 610 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); 611 612 add x2,x21,#32 613 add x0,x21,#32 614 bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); 615 616 add sp,x29,#0 // destroy frame 617 ldp x19,x20,[x29,#16] 618 ldp x21,x22,[x29,#32] 619 ldp x29,x30,[sp],#96 620 AARCH64_VALIDATE_LINK_REGISTER 621 ret 622.size ecp_nistz256_point_double,.-ecp_nistz256_point_double 623.globl ecp_nistz256_point_add 624.hidden ecp_nistz256_point_add 625.type ecp_nistz256_point_add,%function 626.align 5 627ecp_nistz256_point_add: 628 AARCH64_SIGN_LINK_REGISTER 629 stp x29,x30,[sp,#-96]! 630 add x29,sp,#0 631 stp x19,x20,[sp,#16] 632 stp x21,x22,[sp,#32] 633 stp x23,x24,[sp,#48] 634 stp x25,x26,[sp,#64] 635 stp x27,x28,[sp,#80] 636 sub sp,sp,#32*12 637 638 ldp x4,x5,[x2,#64] // in2_z 639 ldp x6,x7,[x2,#64+16] 640 mov x21,x0 641 mov x22,x1 642 mov x23,x2 643 adrp x13,.Lpoly 644 add x13,x13,:lo12:.Lpoly 645 ldr x12,[x13,#8] 646 ldr x13,[x13,#24] 647 orr x8,x4,x5 648 orr x10,x6,x7 649 orr x25,x8,x10 650 cmp x25,#0 651 csetm x25,ne // ~in2infty 652 add x0,sp,#192 653 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); 654 655 ldp x4,x5,[x22,#64] // in1_z 656 ldp x6,x7,[x22,#64+16] 657 orr x8,x4,x5 658 orr x10,x6,x7 659 orr x24,x8,x10 660 cmp x24,#0 661 csetm x24,ne // ~in1infty 662 add x0,sp,#128 663 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); 664 665 ldr x3,[x23,#64] 666 ldp x4,x5,[sp,#192] 667 ldp x6,x7,[sp,#192+16] 668 add x2,x23,#64 669 add x0,sp,#320 670 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); 671 672 ldr x3,[x22,#64] 673 ldp x4,x5,[sp,#128] 674 ldp x6,x7,[sp,#128+16] 675 add x2,x22,#64 676 add x0,sp,#352 677 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); 678 679 ldr x3,[x22,#32] 680 ldp x4,x5,[sp,#320] 681 ldp x6,x7,[sp,#320+16] 682 add x2,x22,#32 683 add x0,sp,#320 684 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); 685 686 ldr x3,[x23,#32] 687 ldp x4,x5,[sp,#352] 688 ldp x6,x7,[sp,#352+16] 689 add x2,x23,#32 690 add x0,sp,#352 691 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); 692 693 add x2,sp,#320 694 ldr x3,[sp,#192] // forward load for p256_mul_mont 695 ldp x4,x5,[x22] 696 ldp x6,x7,[x22,#16] 697 add x0,sp,#160 698 bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); 699 700 orr x14,x14,x15 // see if result is zero 701 orr x16,x16,x17 702 orr x26,x14,x16 // ~is_equal(S1,S2) 703 704 add x2,sp,#192 705 add x0,sp,#256 706 bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); 707 708 ldr x3,[sp,#128] 709 ldp x4,x5,[x23] 710 ldp x6,x7,[x23,#16] 711 add x2,sp,#128 712 add x0,sp,#288 713 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); 714 715 add x2,sp,#256 716 ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont 717 ldp x6,x7,[sp,#160+16] 718 add x0,sp,#96 719 bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); 720 721 orr x14,x14,x15 // see if result is zero 722 orr x16,x16,x17 723 orr x14,x14,x16 // ~is_equal(U1,U2) 724 725 mvn x27,x24 // -1/0 -> 0/-1 726 mvn x28,x25 // -1/0 -> 0/-1 727 orr x14,x14,x27 728 orr x14,x14,x28 729 orr x14,x14,x26 730 cbnz x14,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) 731 732.Ladd_double: 733 mov x1,x22 734 mov x0,x21 735 ldp x23,x24,[x29,#48] 736 ldp x25,x26,[x29,#64] 737 ldp x27,x28,[x29,#80] 738 add sp,sp,#256 // #256 is from #32*(12-4). difference in stack frames 739 b .Ldouble_shortcut 740 741.align 4 742.Ladd_proceed: 743 add x0,sp,#192 744 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); 745 746 ldr x3,[x22,#64] 747 ldp x4,x5,[sp,#96] 748 ldp x6,x7,[sp,#96+16] 749 add x2,x22,#64 750 add x0,sp,#64 751 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); 752 753 ldp x4,x5,[sp,#96] 754 ldp x6,x7,[sp,#96+16] 755 add x0,sp,#128 756 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); 757 758 ldr x3,[x23,#64] 759 ldp x4,x5,[sp,#64] 760 ldp x6,x7,[sp,#64+16] 761 add x2,x23,#64 762 add x0,sp,#64 763 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); 764 765 ldr x3,[sp,#96] 766 ldp x4,x5,[sp,#128] 767 ldp x6,x7,[sp,#128+16] 768 add x2,sp,#96 769 add x0,sp,#224 770 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); 771 772 ldr x3,[sp,#128] 773 ldp x4,x5,[sp,#256] 774 ldp x6,x7,[sp,#256+16] 775 add x2,sp,#128 776 add x0,sp,#288 777 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); 778 779 mov x8,x14 780 mov x9,x15 781 mov x10,x16 782 mov x11,x17 783 add x0,sp,#128 784 bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); 785 786 add x2,sp,#192 787 add x0,sp,#0 788 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); 789 790 add x2,sp,#224 791 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); 792 793 add x2,sp,#288 794 ldr x3,[sp,#224] // forward load for p256_mul_mont 795 ldp x4,x5,[sp,#320] 796 ldp x6,x7,[sp,#320+16] 797 add x0,sp,#32 798 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); 799 800 add x2,sp,#224 801 add x0,sp,#352 802 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); 803 804 ldr x3,[sp,#160] 805 ldp x4,x5,[sp,#32] 806 ldp x6,x7,[sp,#32+16] 807 add x2,sp,#160 808 add x0,sp,#32 809 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); 810 811 add x2,sp,#352 812 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); 813 814 ldp x4,x5,[sp,#0] // res 815 ldp x6,x7,[sp,#0+16] 816 ldp x8,x9,[x23] // in2 817 ldp x10,x11,[x23,#16] 818 ldp x14,x15,[x22,#0] // in1 819 cmp x24,#0 // ~, remember? 820 ldp x16,x17,[x22,#0+16] 821 csel x8,x4,x8,ne 822 csel x9,x5,x9,ne 823 ldp x4,x5,[sp,#0+0+32] // res 824 csel x10,x6,x10,ne 825 csel x11,x7,x11,ne 826 cmp x25,#0 // ~, remember? 827 ldp x6,x7,[sp,#0+0+48] 828 csel x14,x8,x14,ne 829 csel x15,x9,x15,ne 830 ldp x8,x9,[x23,#0+32] // in2 831 csel x16,x10,x16,ne 832 csel x17,x11,x17,ne 833 ldp x10,x11,[x23,#0+48] 834 stp x14,x15,[x21,#0] 835 stp x16,x17,[x21,#0+16] 836 ldp x14,x15,[x22,#32] // in1 837 cmp x24,#0 // ~, remember? 838 ldp x16,x17,[x22,#32+16] 839 csel x8,x4,x8,ne 840 csel x9,x5,x9,ne 841 ldp x4,x5,[sp,#0+32+32] // res 842 csel x10,x6,x10,ne 843 csel x11,x7,x11,ne 844 cmp x25,#0 // ~, remember? 845 ldp x6,x7,[sp,#0+32+48] 846 csel x14,x8,x14,ne 847 csel x15,x9,x15,ne 848 ldp x8,x9,[x23,#32+32] // in2 849 csel x16,x10,x16,ne 850 csel x17,x11,x17,ne 851 ldp x10,x11,[x23,#32+48] 852 stp x14,x15,[x21,#32] 853 stp x16,x17,[x21,#32+16] 854 ldp x14,x15,[x22,#64] // in1 855 cmp x24,#0 // ~, remember? 856 ldp x16,x17,[x22,#64+16] 857 csel x8,x4,x8,ne 858 csel x9,x5,x9,ne 859 csel x10,x6,x10,ne 860 csel x11,x7,x11,ne 861 cmp x25,#0 // ~, remember? 862 csel x14,x8,x14,ne 863 csel x15,x9,x15,ne 864 csel x16,x10,x16,ne 865 csel x17,x11,x17,ne 866 stp x14,x15,[x21,#64] 867 stp x16,x17,[x21,#64+16] 868 869.Ladd_done: 870 add sp,x29,#0 // destroy frame 871 ldp x19,x20,[x29,#16] 872 ldp x21,x22,[x29,#32] 873 ldp x23,x24,[x29,#48] 874 ldp x25,x26,[x29,#64] 875 ldp x27,x28,[x29,#80] 876 ldp x29,x30,[sp],#96 877 AARCH64_VALIDATE_LINK_REGISTER 878 ret 879.size ecp_nistz256_point_add,.-ecp_nistz256_point_add 880.globl ecp_nistz256_point_add_affine 881.hidden ecp_nistz256_point_add_affine 882.type ecp_nistz256_point_add_affine,%function 883.align 5 884ecp_nistz256_point_add_affine: 885 AARCH64_SIGN_LINK_REGISTER 886 stp x29,x30,[sp,#-80]! 887 add x29,sp,#0 888 stp x19,x20,[sp,#16] 889 stp x21,x22,[sp,#32] 890 stp x23,x24,[sp,#48] 891 stp x25,x26,[sp,#64] 892 sub sp,sp,#32*10 893 894 mov x21,x0 895 mov x22,x1 896 mov x23,x2 897 adrp x13,.Lpoly 898 add x13,x13,:lo12:.Lpoly 899 ldr x12,[x13,#8] 900 ldr x13,[x13,#24] 901 902 ldp x4,x5,[x1,#64] // in1_z 903 ldp x6,x7,[x1,#64+16] 904 orr x8,x4,x5 905 orr x10,x6,x7 906 orr x24,x8,x10 907 cmp x24,#0 908 csetm x24,ne // ~in1infty 909 910 ldp x14,x15,[x2] // in2_x 911 ldp x16,x17,[x2,#16] 912 ldp x8,x9,[x2,#32] // in2_y 913 ldp x10,x11,[x2,#48] 914 orr x14,x14,x15 915 orr x16,x16,x17 916 orr x8,x8,x9 917 orr x10,x10,x11 918 orr x14,x14,x16 919 orr x8,x8,x10 920 orr x25,x14,x8 921 cmp x25,#0 922 csetm x25,ne // ~in2infty 923 924 add x0,sp,#128 925 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); 926 927 mov x4,x14 928 mov x5,x15 929 mov x6,x16 930 mov x7,x17 931 ldr x3,[x23] 932 add x2,x23,#0 933 add x0,sp,#96 934 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); 935 936 add x2,x22,#0 937 ldr x3,[x22,#64] // forward load for p256_mul_mont 938 ldp x4,x5,[sp,#128] 939 ldp x6,x7,[sp,#128+16] 940 add x0,sp,#160 941 bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); 942 943 add x2,x22,#64 944 add x0,sp,#128 945 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); 946 947 ldr x3,[x22,#64] 948 ldp x4,x5,[sp,#160] 949 ldp x6,x7,[sp,#160+16] 950 add x2,x22,#64 951 add x0,sp,#64 952 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); 953 954 ldr x3,[x23,#32] 955 ldp x4,x5,[sp,#128] 956 ldp x6,x7,[sp,#128+16] 957 add x2,x23,#32 958 add x0,sp,#128 959 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); 960 961 add x2,x22,#32 962 ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont 963 ldp x6,x7,[sp,#160+16] 964 add x0,sp,#192 965 bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); 966 967 add x0,sp,#224 968 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); 969 970 ldp x4,x5,[sp,#192] 971 ldp x6,x7,[sp,#192+16] 972 add x0,sp,#288 973 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); 974 975 ldr x3,[sp,#160] 976 ldp x4,x5,[sp,#224] 977 ldp x6,x7,[sp,#224+16] 978 add x2,sp,#160 979 add x0,sp,#256 980 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); 981 982 ldr x3,[x22] 983 ldp x4,x5,[sp,#224] 984 ldp x6,x7,[sp,#224+16] 985 add x2,x22,#0 986 add x0,sp,#96 987 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); 988 989 mov x8,x14 990 mov x9,x15 991 mov x10,x16 992 mov x11,x17 993 add x0,sp,#224 994 bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); 995 996 add x2,sp,#288 997 add x0,sp,#0 998 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); 999 1000 add x2,sp,#256 1001 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); 1002 1003 add x2,sp,#96 1004 ldr x3,[x22,#32] // forward load for p256_mul_mont 1005 ldp x4,x5,[sp,#256] 1006 ldp x6,x7,[sp,#256+16] 1007 add x0,sp,#32 1008 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); 1009 1010 add x2,x22,#32 1011 add x0,sp,#128 1012 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); 1013 1014 ldr x3,[sp,#192] 1015 ldp x4,x5,[sp,#32] 1016 ldp x6,x7,[sp,#32+16] 1017 add x2,sp,#192 1018 add x0,sp,#32 1019 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); 1020 1021 add x2,sp,#128 1022 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); 1023 1024 ldp x4,x5,[sp,#0] // res 1025 ldp x6,x7,[sp,#0+16] 1026 ldp x8,x9,[x23] // in2 1027 ldp x10,x11,[x23,#16] 1028 ldp x14,x15,[x22,#0] // in1 1029 cmp x24,#0 // ~, remember? 1030 ldp x16,x17,[x22,#0+16] 1031 csel x8,x4,x8,ne 1032 csel x9,x5,x9,ne 1033 ldp x4,x5,[sp,#0+0+32] // res 1034 csel x10,x6,x10,ne 1035 csel x11,x7,x11,ne 1036 cmp x25,#0 // ~, remember? 1037 ldp x6,x7,[sp,#0+0+48] 1038 csel x14,x8,x14,ne 1039 csel x15,x9,x15,ne 1040 ldp x8,x9,[x23,#0+32] // in2 1041 csel x16,x10,x16,ne 1042 csel x17,x11,x17,ne 1043 ldp x10,x11,[x23,#0+48] 1044 stp x14,x15,[x21,#0] 1045 stp x16,x17,[x21,#0+16] 1046 adrp x23,.Lone_mont-64 1047 add x23,x23,:lo12:.Lone_mont-64 1048 ldp x14,x15,[x22,#32] // in1 1049 cmp x24,#0 // ~, remember? 1050 ldp x16,x17,[x22,#32+16] 1051 csel x8,x4,x8,ne 1052 csel x9,x5,x9,ne 1053 ldp x4,x5,[sp,#0+32+32] // res 1054 csel x10,x6,x10,ne 1055 csel x11,x7,x11,ne 1056 cmp x25,#0 // ~, remember? 1057 ldp x6,x7,[sp,#0+32+48] 1058 csel x14,x8,x14,ne 1059 csel x15,x9,x15,ne 1060 ldp x8,x9,[x23,#32+32] // in2 1061 csel x16,x10,x16,ne 1062 csel x17,x11,x17,ne 1063 ldp x10,x11,[x23,#32+48] 1064 stp x14,x15,[x21,#32] 1065 stp x16,x17,[x21,#32+16] 1066 ldp x14,x15,[x22,#64] // in1 1067 cmp x24,#0 // ~, remember? 1068 ldp x16,x17,[x22,#64+16] 1069 csel x8,x4,x8,ne 1070 csel x9,x5,x9,ne 1071 csel x10,x6,x10,ne 1072 csel x11,x7,x11,ne 1073 cmp x25,#0 // ~, remember? 1074 csel x14,x8,x14,ne 1075 csel x15,x9,x15,ne 1076 csel x16,x10,x16,ne 1077 csel x17,x11,x17,ne 1078 stp x14,x15,[x21,#64] 1079 stp x16,x17,[x21,#64+16] 1080 1081 add sp,x29,#0 // destroy frame 1082 ldp x19,x20,[x29,#16] 1083 ldp x21,x22,[x29,#32] 1084 ldp x23,x24,[x29,#48] 1085 ldp x25,x26,[x29,#64] 1086 ldp x29,x30,[sp],#80 1087 AARCH64_VALIDATE_LINK_REGISTER 1088 ret 1089.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine 1090//////////////////////////////////////////////////////////////////////// 1091// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], 1092// uint64_t b[4]); 1093.globl ecp_nistz256_ord_mul_mont 1094.hidden ecp_nistz256_ord_mul_mont 1095.type ecp_nistz256_ord_mul_mont,%function 1096.align 4 1097ecp_nistz256_ord_mul_mont: 1098 AARCH64_VALID_CALL_TARGET 1099 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 1100 stp x29,x30,[sp,#-64]! 1101 add x29,sp,#0 1102 stp x19,x20,[sp,#16] 1103 stp x21,x22,[sp,#32] 1104 stp x23,x24,[sp,#48] 1105 1106 adrp x23,.Lord 1107 add x23,x23,:lo12:.Lord 1108 ldr x3,[x2] // bp[0] 1109 ldp x4,x5,[x1] 1110 ldp x6,x7,[x1,#16] 1111 1112 ldp x12,x13,[x23,#0] 1113 ldp x21,x22,[x23,#16] 1114 ldr x23,[x23,#32] 1115 1116 mul x14,x4,x3 // a[0]*b[0] 1117 umulh x8,x4,x3 1118 1119 mul x15,x5,x3 // a[1]*b[0] 1120 umulh x9,x5,x3 1121 1122 mul x16,x6,x3 // a[2]*b[0] 1123 umulh x10,x6,x3 1124 1125 mul x17,x7,x3 // a[3]*b[0] 1126 umulh x19,x7,x3 1127 1128 mul x24,x14,x23 1129 1130 adds x15,x15,x8 // accumulate high parts of multiplication 1131 adcs x16,x16,x9 1132 adcs x17,x17,x10 1133 adc x19,x19,xzr 1134 mov x20,xzr 1135 ldr x3,[x2,#8*1] // b[i] 1136 1137 lsl x8,x24,#32 1138 subs x16,x16,x24 1139 lsr x9,x24,#32 1140 sbcs x17,x17,x8 1141 sbcs x19,x19,x9 1142 sbc x20,x20,xzr 1143 1144 subs xzr,x14,#1 1145 umulh x9,x12,x24 1146 mul x10,x13,x24 1147 umulh x11,x13,x24 1148 1149 adcs x10,x10,x9 1150 mul x8,x4,x3 1151 adc x11,x11,xzr 1152 mul x9,x5,x3 1153 1154 adds x14,x15,x10 1155 mul x10,x6,x3 1156 adcs x15,x16,x11 1157 mul x11,x7,x3 1158 adcs x16,x17,x24 1159 adcs x17,x19,x24 1160 adc x19,x20,xzr 1161 1162 adds x14,x14,x8 // accumulate low parts 1163 umulh x8,x4,x3 1164 adcs x15,x15,x9 1165 umulh x9,x5,x3 1166 adcs x16,x16,x10 1167 umulh x10,x6,x3 1168 adcs x17,x17,x11 1169 umulh x11,x7,x3 1170 adc x19,x19,xzr 1171 mul x24,x14,x23 1172 adds x15,x15,x8 // accumulate high parts 1173 adcs x16,x16,x9 1174 adcs x17,x17,x10 1175 adcs x19,x19,x11 1176 adc x20,xzr,xzr 1177 ldr x3,[x2,#8*2] // b[i] 1178 1179 lsl x8,x24,#32 1180 subs x16,x16,x24 1181 lsr x9,x24,#32 1182 sbcs x17,x17,x8 1183 sbcs x19,x19,x9 1184 sbc x20,x20,xzr 1185 1186 subs xzr,x14,#1 1187 umulh x9,x12,x24 1188 mul x10,x13,x24 1189 umulh x11,x13,x24 1190 1191 adcs x10,x10,x9 1192 mul x8,x4,x3 1193 adc x11,x11,xzr 1194 mul x9,x5,x3 1195 1196 adds x14,x15,x10 1197 mul x10,x6,x3 1198 adcs x15,x16,x11 1199 mul x11,x7,x3 1200 adcs x16,x17,x24 1201 adcs x17,x19,x24 1202 adc x19,x20,xzr 1203 1204 adds x14,x14,x8 // accumulate low parts 1205 umulh x8,x4,x3 1206 adcs x15,x15,x9 1207 umulh x9,x5,x3 1208 adcs x16,x16,x10 1209 umulh x10,x6,x3 1210 adcs x17,x17,x11 1211 umulh x11,x7,x3 1212 adc x19,x19,xzr 1213 mul x24,x14,x23 1214 adds x15,x15,x8 // accumulate high parts 1215 adcs x16,x16,x9 1216 adcs x17,x17,x10 1217 adcs x19,x19,x11 1218 adc x20,xzr,xzr 1219 ldr x3,[x2,#8*3] // b[i] 1220 1221 lsl x8,x24,#32 1222 subs x16,x16,x24 1223 lsr x9,x24,#32 1224 sbcs x17,x17,x8 1225 sbcs x19,x19,x9 1226 sbc x20,x20,xzr 1227 1228 subs xzr,x14,#1 1229 umulh x9,x12,x24 1230 mul x10,x13,x24 1231 umulh x11,x13,x24 1232 1233 adcs x10,x10,x9 1234 mul x8,x4,x3 1235 adc x11,x11,xzr 1236 mul x9,x5,x3 1237 1238 adds x14,x15,x10 1239 mul x10,x6,x3 1240 adcs x15,x16,x11 1241 mul x11,x7,x3 1242 adcs x16,x17,x24 1243 adcs x17,x19,x24 1244 adc x19,x20,xzr 1245 1246 adds x14,x14,x8 // accumulate low parts 1247 umulh x8,x4,x3 1248 adcs x15,x15,x9 1249 umulh x9,x5,x3 1250 adcs x16,x16,x10 1251 umulh x10,x6,x3 1252 adcs x17,x17,x11 1253 umulh x11,x7,x3 1254 adc x19,x19,xzr 1255 mul x24,x14,x23 1256 adds x15,x15,x8 // accumulate high parts 1257 adcs x16,x16,x9 1258 adcs x17,x17,x10 1259 adcs x19,x19,x11 1260 adc x20,xzr,xzr 1261 lsl x8,x24,#32 // last reduction 1262 subs x16,x16,x24 1263 lsr x9,x24,#32 1264 sbcs x17,x17,x8 1265 sbcs x19,x19,x9 1266 sbc x20,x20,xzr 1267 1268 subs xzr,x14,#1 1269 umulh x9,x12,x24 1270 mul x10,x13,x24 1271 umulh x11,x13,x24 1272 1273 adcs x10,x10,x9 1274 adc x11,x11,xzr 1275 1276 adds x14,x15,x10 1277 adcs x15,x16,x11 1278 adcs x16,x17,x24 1279 adcs x17,x19,x24 1280 adc x19,x20,xzr 1281 1282 subs x8,x14,x12 // ret -= modulus 1283 sbcs x9,x15,x13 1284 sbcs x10,x16,x21 1285 sbcs x11,x17,x22 1286 sbcs xzr,x19,xzr 1287 1288 csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus 1289 csel x15,x15,x9,lo 1290 csel x16,x16,x10,lo 1291 stp x14,x15,[x0] 1292 csel x17,x17,x11,lo 1293 stp x16,x17,[x0,#16] 1294 1295 ldp x19,x20,[sp,#16] 1296 ldp x21,x22,[sp,#32] 1297 ldp x23,x24,[sp,#48] 1298 ldr x29,[sp],#64 1299 ret 1300.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont 1301 1302//////////////////////////////////////////////////////////////////////// 1303// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], 1304// uint64_t rep); 1305.globl ecp_nistz256_ord_sqr_mont 1306.hidden ecp_nistz256_ord_sqr_mont 1307.type ecp_nistz256_ord_sqr_mont,%function 1308.align 4 1309ecp_nistz256_ord_sqr_mont: 1310 AARCH64_VALID_CALL_TARGET 1311 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 1312 stp x29,x30,[sp,#-64]! 1313 add x29,sp,#0 1314 stp x19,x20,[sp,#16] 1315 stp x21,x22,[sp,#32] 1316 stp x23,x24,[sp,#48] 1317 1318 adrp x23,.Lord 1319 add x23,x23,:lo12:.Lord 1320 ldp x4,x5,[x1] 1321 ldp x6,x7,[x1,#16] 1322 1323 ldp x12,x13,[x23,#0] 1324 ldp x21,x22,[x23,#16] 1325 ldr x23,[x23,#32] 1326 b .Loop_ord_sqr 1327 1328.align 4 1329.Loop_ord_sqr: 1330 sub x2,x2,#1 1331 //////////////////////////////////////////////////////////////// 1332 // | | | | | |a1*a0| | 1333 // | | | | |a2*a0| | | 1334 // | |a3*a2|a3*a0| | | | 1335 // | | | |a2*a1| | | | 1336 // | | |a3*a1| | | | | 1337 // *| | | | | | | | 2| 1338 // +|a3*a3|a2*a2|a1*a1|a0*a0| 1339 // |--+--+--+--+--+--+--+--| 1340 // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow 1341 // 1342 // "can't overflow" below mark carrying into high part of 1343 // multiplication result, which can't overflow, because it 1344 // can never be all ones. 1345 1346 mul x15,x5,x4 // a[1]*a[0] 1347 umulh x9,x5,x4 1348 mul x16,x6,x4 // a[2]*a[0] 1349 umulh x10,x6,x4 1350 mul x17,x7,x4 // a[3]*a[0] 1351 umulh x19,x7,x4 1352 1353 adds x16,x16,x9 // accumulate high parts of multiplication 1354 mul x8,x6,x5 // a[2]*a[1] 1355 umulh x9,x6,x5 1356 adcs x17,x17,x10 1357 mul x10,x7,x5 // a[3]*a[1] 1358 umulh x11,x7,x5 1359 adc x19,x19,xzr // can't overflow 1360 1361 mul x20,x7,x6 // a[3]*a[2] 1362 umulh x1,x7,x6 1363 1364 adds x9,x9,x10 // accumulate high parts of multiplication 1365 mul x14,x4,x4 // a[0]*a[0] 1366 adc x10,x11,xzr // can't overflow 1367 1368 adds x17,x17,x8 // accumulate low parts of multiplication 1369 umulh x4,x4,x4 1370 adcs x19,x19,x9 1371 mul x9,x5,x5 // a[1]*a[1] 1372 adcs x20,x20,x10 1373 umulh x5,x5,x5 1374 adc x1,x1,xzr // can't overflow 1375 1376 adds x15,x15,x15 // acc[1-6]*=2 1377 mul x10,x6,x6 // a[2]*a[2] 1378 adcs x16,x16,x16 1379 umulh x6,x6,x6 1380 adcs x17,x17,x17 1381 mul x11,x7,x7 // a[3]*a[3] 1382 adcs x19,x19,x19 1383 umulh x7,x7,x7 1384 adcs x20,x20,x20 1385 adcs x1,x1,x1 1386 adc x3,xzr,xzr 1387 1388 adds x15,x15,x4 // +a[i]*a[i] 1389 mul x24,x14,x23 1390 adcs x16,x16,x9 1391 adcs x17,x17,x5 1392 adcs x19,x19,x10 1393 adcs x20,x20,x6 1394 adcs x1,x1,x11 1395 adc x3,x3,x7 1396 subs xzr,x14,#1 1397 umulh x9,x12,x24 1398 mul x10,x13,x24 1399 umulh x11,x13,x24 1400 1401 adcs x10,x10,x9 1402 adc x11,x11,xzr 1403 1404 adds x14,x15,x10 1405 adcs x15,x16,x11 1406 adcs x16,x17,x24 1407 adc x17,xzr,x24 // can't overflow 1408 mul x11,x14,x23 1409 lsl x8,x24,#32 1410 subs x15,x15,x24 1411 lsr x9,x24,#32 1412 sbcs x16,x16,x8 1413 sbc x17,x17,x9 // can't borrow 1414 subs xzr,x14,#1 1415 umulh x9,x12,x11 1416 mul x10,x13,x11 1417 umulh x24,x13,x11 1418 1419 adcs x10,x10,x9 1420 adc x24,x24,xzr 1421 1422 adds x14,x15,x10 1423 adcs x15,x16,x24 1424 adcs x16,x17,x11 1425 adc x17,xzr,x11 // can't overflow 1426 mul x24,x14,x23 1427 lsl x8,x11,#32 1428 subs x15,x15,x11 1429 lsr x9,x11,#32 1430 sbcs x16,x16,x8 1431 sbc x17,x17,x9 // can't borrow 1432 subs xzr,x14,#1 1433 umulh x9,x12,x24 1434 mul x10,x13,x24 1435 umulh x11,x13,x24 1436 1437 adcs x10,x10,x9 1438 adc x11,x11,xzr 1439 1440 adds x14,x15,x10 1441 adcs x15,x16,x11 1442 adcs x16,x17,x24 1443 adc x17,xzr,x24 // can't overflow 1444 mul x11,x14,x23 1445 lsl x8,x24,#32 1446 subs x15,x15,x24 1447 lsr x9,x24,#32 1448 sbcs x16,x16,x8 1449 sbc x17,x17,x9 // can't borrow 1450 subs xzr,x14,#1 1451 umulh x9,x12,x11 1452 mul x10,x13,x11 1453 umulh x24,x13,x11 1454 1455 adcs x10,x10,x9 1456 adc x24,x24,xzr 1457 1458 adds x14,x15,x10 1459 adcs x15,x16,x24 1460 adcs x16,x17,x11 1461 adc x17,xzr,x11 // can't overflow 1462 lsl x8,x11,#32 1463 subs x15,x15,x11 1464 lsr x9,x11,#32 1465 sbcs x16,x16,x8 1466 sbc x17,x17,x9 // can't borrow 1467 adds x14,x14,x19 // accumulate upper half 1468 adcs x15,x15,x20 1469 adcs x16,x16,x1 1470 adcs x17,x17,x3 1471 adc x19,xzr,xzr 1472 1473 subs x8,x14,x12 // ret -= modulus 1474 sbcs x9,x15,x13 1475 sbcs x10,x16,x21 1476 sbcs x11,x17,x22 1477 sbcs xzr,x19,xzr 1478 1479 csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus 1480 csel x5,x15,x9,lo 1481 csel x6,x16,x10,lo 1482 csel x7,x17,x11,lo 1483 1484 cbnz x2,.Loop_ord_sqr 1485 1486 stp x4,x5,[x0] 1487 stp x6,x7,[x0,#16] 1488 1489 ldp x19,x20,[sp,#16] 1490 ldp x21,x22,[sp,#32] 1491 ldp x23,x24,[sp,#48] 1492 ldr x29,[sp],#64 1493 ret 1494.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont 1495//////////////////////////////////////////////////////////////////////// 1496// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); 1497.globl ecp_nistz256_select_w5 1498.hidden ecp_nistz256_select_w5 1499.type ecp_nistz256_select_w5,%function 1500.align 4 1501ecp_nistz256_select_w5: 1502 AARCH64_VALID_CALL_TARGET 1503 1504 // x10 := x0 1505 // w9 := 0; loop counter and incremented internal index 1506 mov x10, x0 1507 mov w9, #0 1508 1509 // [v16-v21] := 0 1510 movi v16.16b, #0 1511 movi v17.16b, #0 1512 movi v18.16b, #0 1513 movi v19.16b, #0 1514 movi v20.16b, #0 1515 movi v21.16b, #0 1516 1517.Lselect_w5_loop: 1518 // Loop 16 times. 1519 1520 // Increment index (loop counter); tested at the end of the loop 1521 add w9, w9, #1 1522 1523 // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 1524 // and advance x1 to point to the next entry 1525 ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 1526 1527 // x11 := (w9 == w2)? All 1s : All 0s 1528 cmp w9, w2 1529 csetm x11, eq 1530 1531 // continue loading ... 1532 ld1 {v26.2d, v27.2d}, [x1],#32 1533 1534 // duplicate mask_64 into Mask (all 0s or all 1s) 1535 dup v3.2d, x11 1536 1537 // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] 1538 // i.e., values in output registers will remain the same if w9 != w2 1539 bit v16.16b, v22.16b, v3.16b 1540 bit v17.16b, v23.16b, v3.16b 1541 1542 bit v18.16b, v24.16b, v3.16b 1543 bit v19.16b, v25.16b, v3.16b 1544 1545 bit v20.16b, v26.16b, v3.16b 1546 bit v21.16b, v27.16b, v3.16b 1547 1548 // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back 1549 tbz w9, #4, .Lselect_w5_loop 1550 1551 // Write [v16-v21] to memory at the output pointer 1552 st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 1553 st1 {v20.2d, v21.2d}, [x10] 1554 1555 ret 1556.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 1557 1558 1559//////////////////////////////////////////////////////////////////////// 1560// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); 1561.globl ecp_nistz256_select_w7 1562.hidden ecp_nistz256_select_w7 1563.type ecp_nistz256_select_w7,%function 1564.align 4 1565ecp_nistz256_select_w7: 1566 AARCH64_VALID_CALL_TARGET 1567 1568 // w9 := 0; loop counter and incremented internal index 1569 mov w9, #0 1570 1571 // [v16-v21] := 0 1572 movi v16.16b, #0 1573 movi v17.16b, #0 1574 movi v18.16b, #0 1575 movi v19.16b, #0 1576 1577.Lselect_w7_loop: 1578 // Loop 64 times. 1579 1580 // Increment index (loop counter); tested at the end of the loop 1581 add w9, w9, #1 1582 1583 // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 1584 // and advance x1 to point to the next entry 1585 ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 1586 1587 // x11 := (w9 == w2)? All 1s : All 0s 1588 cmp w9, w2 1589 csetm x11, eq 1590 1591 // duplicate mask_64 into Mask (all 0s or all 1s) 1592 dup v3.2d, x11 1593 1594 // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] 1595 // i.e., values in output registers will remain the same if w9 != w2 1596 bit v16.16b, v22.16b, v3.16b 1597 bit v17.16b, v23.16b, v3.16b 1598 1599 bit v18.16b, v24.16b, v3.16b 1600 bit v19.16b, v25.16b, v3.16b 1601 1602 // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back 1603 tbz w9, #6, .Lselect_w7_loop 1604 1605 // Write [v16-v19] to memory at the output pointer 1606 st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] 1607 1608 ret 1609.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 1610#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) 1611