// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include "openssl/arm_arch.h"

.section .rodata
.align	5
.Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.LRR:	// 2^512 mod P precomputed for NIST P256 polynomial
.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
.Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad	1,0,0,0
.Lord:
.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:
.quad	0xccd1c8aaee00bc4f
.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.text

// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//                            const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.hidden	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	x3,[x2]		// bp[0]
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.hidden	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2
.hidden	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	bl	__ecp_nistz256_div_by_2

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2
.hidden	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]
	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17

	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3
.hidden	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]
	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	mov	x4,x14
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17

	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a

	mov	x8,x4
	mov	x9,x5
	mov	x10,x6
	mov	x11,x7

	bl	__ecp_nistz256_add_to	// ret += a	// 2*a+a=3*a

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
//                       const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub
.hidden	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.hidden	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x2,x1
	mov	x14,xzr		// a = 0
	mov	x15,xzr
	mov	x16,xzr
	mov	x17,xzr
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

// Note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to x4-x7 and b[0] to x3.
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	mul	x14,x4,x3		// a[0]*b[0]
	umulh	x8,x4,x3

	mul	x15,x5,x3		// a[1]*b[0]
	umulh	x9,x5,x3

	mul	x16,x6,x3		// a[2]*b[0]
	umulh	x10,x6,x3

	mul	x17,x7,x3		// a[3]*b[0]
	umulh	x11,x7,x3
	ldr	x3,[x2,#8]		// b[1]

	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adc	x19,xzr,x11
	mov	x20,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(1+1)]	// b[1+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(2+1)]	// b[2+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	// last reduction
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adcs	x17,x19,x11
	adc	x19,x20,xzr

	adds	x8,x14,#1		// subs x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

// Note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to x4-x7.
.type	__ecp_nistz256_sqr_mont,%function
.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax denotes 64-bit word x of the
	//  512-bit accumulator
	//
	//  "can't overflow" below marks carrying into the high part of a
	//  multiplication result, which can't overflow, because it
	//  can never be all ones.

	mul	x15,x5,x4		// a[1]*a[0]
	umulh	x9,x5,x4
	mul	x16,x6,x4		// a[2]*a[0]
	umulh	x10,x6,x4
	mul	x17,x7,x4		// a[3]*a[0]
	umulh	x19,x7,x4

	adds	x16,x16,x9		// accumulate high parts of multiplication
	mul	x8,x6,x5		// a[2]*a[1]
	umulh	x9,x6,x5
	adcs	x17,x17,x10
	mul	x10,x7,x5		// a[3]*a[1]
	umulh	x11,x7,x5
	adc	x19,x19,xzr		// can't overflow

	mul	x20,x7,x6		// a[3]*a[2]
	umulh	x1,x7,x6

	adds	x9,x9,x10		// accumulate high parts of multiplication
	mul	x14,x4,x4		// a[0]*a[0]
	adc	x10,x11,xzr		// can't overflow

	adds	x17,x17,x8		// accumulate low parts of multiplication
	umulh	x4,x4,x4
	adcs	x19,x19,x9
	mul	x9,x5,x5		// a[1]*a[1]
	adcs	x20,x20,x10
	umulh	x5,x5,x5
	adc	x1,x1,xzr		// can't overflow

	adds	x15,x15,x15	// acc[1-6]*=2
	mul	x10,x6,x6		// a[2]*a[2]
	adcs	x16,x16,x16
	umulh	x6,x6,x6
	adcs	x17,x17,x17
	mul	x11,x7,x7		// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x7,x7,x7
	adcs	x20,x20,x20
	adcs	x1,x1,x1
	adc	x2,xzr,xzr

	adds	x15,x15,x4		// +a[i]*a[i]
	adcs	x16,x16,x9
	adcs	x17,x17,x5
	adcs	x19,x19,x10
	adcs	x20,x20,x6
	lsl	x8,x14,#32
	adcs	x1,x1,x11
	lsr	x9,x14,#32
	adc	x2,x2,x7
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adc	x17,x11,xzr		// can't overflow

	adds	x14,x14,x19	// accumulate upper half
	adcs	x15,x15,x20
	adcs	x16,x16,x1
	adcs	x17,x17,x2
	adc	x19,xzr,xzr

	adds	x8,x14,#1		// subs x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
// x14-x17 and x8-x11. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add_to,%function
.align	4
__ecp_nistz256_add_to:
	adds	x14,x14,x8		// ret = a+b
	adcs	x15,x15,x9
	adcs	x16,x16,x10
	adcs	x17,x17,x11
	adc	x1,xzr,xzr		// zap x1

	adds	x8,x14,#1		// subs x8,x4,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x1,xzr		// did subtraction borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_add_to,.-__ecp_nistz256_add_to

.type	__ecp_nistz256_sub_from,%function
.align	4
__ecp_nistz256_sub_from:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x14,x8		// ret = a-b
	sbcs	x15,x15,x9
	sbcs	x16,x16,x10
	sbcs	x17,x17,x11
	sbc	x1,xzr,xzr		// zap x1

	subs	x8,x14,#1		// adds x8,x4,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	4
__ecp_nistz256_sub_morf:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x8,x14		// ret = b-a
	sbcs	x15,x9,x15
	sbcs	x16,x10,x16
	sbcs	x17,x11,x17
	sbc	x1,xzr,xzr		// zap x1

	subs	x8,x14,#1		// adds x8,x4,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	subs	x8,x14,#1		// adds x8,x4,#-1 // tmp = a+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adcs	x11,x17,x13
	adc	x1,xzr,xzr		// zap x1
	tst	x14,#1			// is a even?

	csel	x14,x14,x8,eq	// ret = even ? a : a+modulus
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	csel	x17,x17,x11,eq
	csel	x1,xzr,x1,eq

	lsr	x14,x14,#1		// ret >>= 1
	orr	x14,x14,x15,lsl#63
	lsr	x15,x15,#1
	orr	x15,x15,x16,lsl#63
	lsr	x16,x16,#1
	orr	x16,x16,x17,lsl#63
	lsr	x17,x17,#1
	stp	x14,x15,[x0]
	orr	x17,x17,x1,lsl#63
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
.globl	ecp_nistz256_point_double
.hidden	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
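	// ecp_nistz256_point_double computes r = 2*a for a point in Jacobian
	// coordinates. A rough C-style sketch of the formulas the code below
	// follows (names match the body comments; this is an illustration of
	// the algebra, not the exact BoringSSL C interface):
	//
	//	S  = 4*X1*Y1^2;
	//	M  = 3*(X1 - Z1^2)*(X1 + Z1^2);
	//	X3 = M^2 - 2*S;
	//	Y3 = M*(S - X3) - 8*Y1^4;
	//	Z3 = 2*Y1*Z1;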
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

.Ldouble_shortcut:
	ldp	x14,x15,[x1,#32]
	mov	x21,x0
	ldp	x16,x17,[x1,#48]
	mov	x22,x1
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	mov	x8,x14
	ldr	x13,[x13,#24]
	mov	x9,x15
	ldp	x4,x5,[x22,#64]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[x22,#64+16]
	add	x0,sp,#0
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);

	add	x0,sp,#64
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	x8,x9,[x22]
	ldp	x10,x11,[x22,#16]
	mov	x4,x14		// put Zsqr aside for p256_sub
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);

	add	x2,x22,#0
	mov	x14,x4		// restore Zsqr
	mov	x15,x5
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x16,x6
	mov	x17,x7
	ldp	x6,x7,[sp,#0+16]
	add	x0,sp,#64
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	x0,sp,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[x22,#64]
	ldp	x6,x7,[x22,#64+16]
	add	x2,x22,#32
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#0+16]
	add	x0,x21,#64
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);

	add	x0,sp,#96
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	ldr	x3,[sp,#64]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x0,x21,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	x2,sp,#64
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	x8,x14		// duplicate M
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	mov	x4,x14		// put M aside
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add_to
	mov	x8,x4		// restore M
	mov	x9,x5
	ldr	x3,[x22]	// forward load for p256_mul_mont
	mov	x10,x6
	ldp	x4,x5,[sp,#0]
	mov	x11,x7
	ldp	x6,x7,[sp,#0+16]
	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);

	add	x2,x22,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#32]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#32+16]
	add	x0,sp,#96
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);

	add	x0,x21,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	x2,sp,#96
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	x2,sp,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	x3,[sp,#32]
	mov	x4,x14		// copy S
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x2,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	x2,x21,#32
	add	x0,x21,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
.globl	ecp_nistz256_point_add
.hidden	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
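// ecp_nistz256_point_add adds two Jacobian points, r = a + b. A rough C-style
// sketch of the formulas the code below follows (names U1, S1, H, R, etc.
// match the body comments; this is an illustration of the algebra, not the
// exact BoringSSL C interface):
//
//	Z1sqr = Z1^2;      Z2sqr = Z2^2;
//	U1 = X1*Z2sqr;     U2 = X2*Z1sqr;     H = U2 - U1;
//	S1 = Y1*Z2sqr*Z2;  S2 = Y2*Z1sqr*Z1;  R = S2 - S1;
//	X3 = R^2 - H^3 - 2*U1*H^2;
//	Y3 = R*(U1*H^2 - X3) - S1*H^3;
//	Z3 = H*Z1*Z2;
//
// When both inputs are finite and equal (H == 0 and R == 0) the code falls
// through to .Ladd_double and continues at .Ldouble_shortcut; an infinite
// input is handled by the constant-time csel copy of the other operand at
// the end.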
ecp_nistz256_point_add:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#32*12

	ldp	x4,x5,[x2,#64]	// in2_z
	ldp	x6,x7,[x2,#64+16]
	mov	x21,x0
	mov	x22,x1
	mov	x23,x2
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x25,x8,x10
	cmp	x25,#0
	csetm	x25,ne		// ~in2infty
	add	x0,sp,#192
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	x4,x5,[x22,#64]	// in1_z
	ldp	x6,x7,[x22,#64+16]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x24,x8,x10
	cmp	x24,#0
	csetm	x24,ne		// ~in1infty
	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	x3,[x23,#64]
	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x2,x23,#64
	add	x0,sp,#320
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x22,#64
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[sp,#320]
	ldp	x6,x7,[sp,#320+16]
	add	x2,x22,#32
	add	x0,sp,#320
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#352]
	ldp	x6,x7,[sp,#352+16]
	add	x2,x23,#32
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	x2,sp,#320
	ldr	x3,[sp,#192]	// forward load for p256_mul_mont
	ldp	x4,x5,[x22]
	ldp	x6,x7,[x22,#16]
	add	x0,sp,#160
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);

	orr	x14,x14,x15	// see if result is zero
	orr	x16,x16,x17
	orr	x26,x14,x16	// ~is_equal(S1,S2)

	add	x2,sp,#192
	add	x0,sp,#256
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	x3,[sp,#128]
	ldp	x4,x5,[x23]
	ldp	x6,x7,[x23,#16]
	add	x2,sp,#128
	add	x0,sp,#288
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	x2,sp,#256
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#96
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);

	orr	x14,x14,x15	// see if result is zero
	orr	x16,x16,x17
	orr	x14,x14,x16	// ~is_equal(U1,U2)

	mvn	x27,x24	// -1/0 -> 0/-1
	mvn	x28,x25	// -1/0 -> 0/-1
	orr	x14,x14,x27
	orr	x14,x14,x28
	orr	x14,x14,x26
	cbnz	x14,.Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

.Ladd_double:
	mov	x1,x22
	mov	x0,x21
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	add	sp,sp,#256	// #256 is from #32*(12-4). difference in stack frames
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	add	x0,sp,#192
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#96]
	ldp	x6,x7,[sp,#96+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldp	x4,x5,[sp,#96]
	ldp	x6,x7,[sp,#96+16]
	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldr	x3,[x23,#64]
	ldp	x4,x5,[sp,#64]
	ldp	x6,x7,[sp,#64+16]
	add	x2,x23,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	x3,[sp,#96]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,sp,#96
	add	x0,sp,#224
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[sp,#128]
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x2,sp,#128
	add	x0,sp,#288
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#128
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#192
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#224
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#288
	ldr	x3,[sp,#224]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#320]
	ldp	x6,x7,[sp,#320+16]
	add	x0,sp,#32
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	x2,sp,#224
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#160
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#352
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	x4,x5,[sp,#0]		// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]		// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]
	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]
	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

.Ladd_done:
	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
.globl	ecp_nistz256_point_add_affine
.hidden	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	x21,x0
	mov	x22,x1
	mov	x23,x2
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	ldp	x4,x5,[x1,#64]	// in1_z
	ldp	x6,x7,[x1,#64+16]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x24,x8,x10
	cmp	x24,#0
	csetm	x24,ne		// ~in1infty

	ldp	x14,x15,[x2]	// in2_x
	ldp	x16,x17,[x2,#16]
	ldp	x8,x9,[x2,#32]	// in2_y
	ldp	x10,x11,[x2,#48]
	orr	x14,x14,x15
	orr	x16,x16,x17
	orr	x8,x8,x9
	orr	x10,x10,x11
	orr	x14,x14,x16
	orr	x8,x8,x10
	orr	x25,x14,x8
	cmp	x25,#0
	csetm	x25,ne		// ~in2infty

	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	x4,x14
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	ldr	x3,[x23]
	add	x2,x23,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	x2,x22,#0
	ldr	x3,[x22,#64]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x0,sp,#160
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	x2,x22,#64
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#160]
	ldp	x6,x7,[sp,#160+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x23,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	x2,x22,#32
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#192
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	x0,sp,#224
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x0,sp,#288
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,sp,#160
	add	x0,sp,#256
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[x22]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,x22,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#224
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#288
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#256
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#96
	ldr	x3,[x22,#32]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x0,sp,#32
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	x2,x22,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	x3,[sp,#192]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#192
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#128
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	x4,x5,[sp,#0]		// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]		// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]
	adrp	x23,.Lone_mont-64
	add	x23,x23,:lo12:.Lone_mont-64
	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]
	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.hidden	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,%function
.align	4
ecp_nistz256_ord_mul_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	x23,.Lord
	add	x23,x23,:lo12:.Lord
	ldr	x3,[x2]		// bp[0]
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	ldp	x12,x13,[x23,#0]
	ldp	x21,x22,[x23,#16]
	ldr	x23,[x23,#32]

	mul	x14,x4,x3		// a[0]*b[0]
	umulh	x8,x4,x3

	mul	x15,x5,x3		// a[1]*b[0]
	umulh	x9,x5,x3

	mul	x16,x6,x3		// a[2]*b[0]
	umulh	x10,x6,x3

	mul	x17,x7,x3		// a[3]*b[0]
	umulh	x19,x7,x3

	mul	x24,x14,x23

	adds	x15,x15,x8		// accumulate high parts of multiplication
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adc	x19,x19,xzr
	mov	x20,xzr
	ldr	x3,[x2,#8*1]		// b[i]

	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	ldr	x3,[x2,#8*2]		// b[i]

	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	ldr	x3,[x2,#8*3]		// b[i]

	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	lsl	x8,x24,#32		// last reduction
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	subs	x8,x14,x12		// ret -= modulus
	sbcs	x9,x15,x13
	sbcs	x10,x16,x21
	sbcs	x11,x17,x22
	sbcs	xzr,x19,xzr

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t rep);
.globl	ecp_nistz256_ord_sqr_mont
.hidden	ecp_nistz256_ord_sqr_mont
.type	ecp_nistz256_ord_sqr_mont,%function
.align	4
ecp_nistz256_ord_sqr_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	x23,.Lord
	add	x23,x23,:lo12:.Lord
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	ldp	x12,x13,[x23,#0]
	ldp	x21,x22,[x23,#16]
	ldr	x23,[x23,#32]
	b	.Loop_ord_sqr

.align	4
.Loop_ord_sqr:
	sub	x2,x2,#1
	////////////////////////////////////////////////////////////////
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax denotes 64-bit word x of the
	//  512-bit accumulator
	//
	//  "can't overflow" below marks carrying into the high part of a
	//  multiplication result, which can't overflow, because it
	//  can never be all ones.
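	//  Rough C-style sketch of one iteration of this loop, for orientation
	//  only (ord refers to .Lord; ordK refers to .LordK, which is assumed
	//  to be the usual Montgomery constant -ord^{-1} mod 2^64):
	//
	//	// acc[0..7] = a * a;                  // 512-bit square
	//	for (int i = 0; i < 4; i++) {          // word-by-word reduction
	//		uint64_t k = acc[i] * ordK;    // acc[i] + k*ord[0] == 0 mod 2^64
	//		acc += (k * ord) << (64*i);    // 512-bit add; clears word i
	//	}
	//	a = acc >> 256;                        // keep the top four words
	//	if (a >= ord) a -= ord;                // conditional final subtraction
	//
	//  i.e. each pass computes a = a^2 * 2^-256 mod ord, and x2 counts down
	//  the "rep" argument.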

	mul	x15,x5,x4		// a[1]*a[0]
	umulh	x9,x5,x4
	mul	x16,x6,x4		// a[2]*a[0]
	umulh	x10,x6,x4
	mul	x17,x7,x4		// a[3]*a[0]
	umulh	x19,x7,x4

	adds	x16,x16,x9		// accumulate high parts of multiplication
	mul	x8,x6,x5		// a[2]*a[1]
	umulh	x9,x6,x5
	adcs	x17,x17,x10
	mul	x10,x7,x5		// a[3]*a[1]
	umulh	x11,x7,x5
	adc	x19,x19,xzr		// can't overflow

	mul	x20,x7,x6		// a[3]*a[2]
	umulh	x1,x7,x6

	adds	x9,x9,x10		// accumulate high parts of multiplication
	mul	x14,x4,x4		// a[0]*a[0]
	adc	x10,x11,xzr		// can't overflow

	adds	x17,x17,x8		// accumulate low parts of multiplication
	umulh	x4,x4,x4
	adcs	x19,x19,x9
	mul	x9,x5,x5		// a[1]*a[1]
	adcs	x20,x20,x10
	umulh	x5,x5,x5
	adc	x1,x1,xzr		// can't overflow

	adds	x15,x15,x15	// acc[1-6]*=2
	mul	x10,x6,x6		// a[2]*a[2]
	adcs	x16,x16,x16
	umulh	x6,x6,x6
	adcs	x17,x17,x17
	mul	x11,x7,x7		// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x7,x7,x7
	adcs	x20,x20,x20
	adcs	x1,x1,x1
	adc	x3,xzr,xzr

	adds	x15,x15,x4		// +a[i]*a[i]
	mul	x24,x14,x23
	adcs	x16,x16,x9
	adcs	x17,x17,x5
	adcs	x19,x19,x10
	adcs	x20,x20,x6
	adcs	x1,x1,x11
	adc	x3,x3,x7
	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adc	x17,xzr,x24		// can't overflow
	mul	x11,x14,x23
	lsl	x8,x24,#32
	subs	x15,x15,x24
	lsr	x9,x24,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	subs	xzr,x14,#1
	umulh	x9,x12,x11
	mul	x10,x13,x11
	umulh	x24,x13,x11

	adcs	x10,x10,x9
	adc	x24,x24,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x24
	adcs	x16,x17,x11
	adc	x17,xzr,x11		// can't overflow
	mul	x24,x14,x23
	lsl	x8,x11,#32
	subs	x15,x15,x11
	lsr	x9,x11,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adc	x17,xzr,x24		// can't overflow
	mul	x11,x14,x23
	lsl	x8,x24,#32
	subs	x15,x15,x24
	lsr	x9,x24,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	subs	xzr,x14,#1
	umulh	x9,x12,x11
	mul	x10,x13,x11
	umulh	x24,x13,x11

	adcs	x10,x10,x9
	adc	x24,x24,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x24
	adcs	x16,x17,x11
	adc	x17,xzr,x11		// can't overflow
	lsl	x8,x11,#32
	subs	x15,x15,x11
	lsr	x9,x11,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	adds	x14,x14,x19	// accumulate upper half
	adcs	x15,x15,x20
	adcs	x16,x16,x1
	adcs	x17,x17,x3
	adc	x19,xzr,xzr

	subs	x8,x14,x12		// ret -= modulus
	sbcs	x9,x15,x13
	sbcs	x10,x16,x21
	sbcs	x11,x17,x22
	sbcs	xzr,x19,xzr

	csel	x4,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x5,x15,x9,lo
	csel	x6,x16,x10,lo
	csel	x7,x17,x11,lo

	cbnz	x2,.Loop_ord_sqr

	stp	x4,x5,[x0]
	stp	x6,x7,[x0,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w5
.hidden	ecp_nistz256_select_w5
.type	ecp_nistz256_select_w5,%function
.align	4
ecp_nistz256_select_w5:
	AARCH64_VALID_CALL_TARGET

	// x10 := x0
	// w9 := 0; loop counter and incremented internal index
	mov	x10, x0
	mov	w9, #0

	// [v16-v21] := 0
	movi	v16.16b, #0
	movi	v17.16b, #0
	movi	v18.16b, #0
	movi	v19.16b, #0
	movi	v20.16b, #0
	movi	v21.16b, #0

.Lselect_w5_loop:
	// Loop 16 times.

	// Increment index (loop counter); tested at the end of the loop
	add	w9, w9, #1

	// [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
	// and advance x1 to point to the next entry
	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

	// x11 := (w9 == w2)? All 1s : All 0s
	cmp	w9, w2
	csetm	x11, eq

	// continue loading ...
	ld1	{v26.2d, v27.2d}, [x1],#32

	// duplicate mask_64 into Mask (all 0s or all 1s)
	dup	v3.2d, x11

	// [v16-v21] := (Mask == all 1s)? [v22-v27] : [v16-v21]
	// i.e., values in output registers will remain the same if w9 != w2
	bit	v16.16b, v22.16b, v3.16b
	bit	v17.16b, v23.16b, v3.16b

	bit	v18.16b, v24.16b, v3.16b
	bit	v19.16b, v25.16b, v3.16b

	bit	v20.16b, v26.16b, v3.16b
	bit	v21.16b, v27.16b, v3.16b

	// If bit #4 is 0 (i.e. idx_ctr < 16) loop back
	tbz	w9, #4, .Lselect_w5_loop

	// Write [v16-v21] to memory at the output pointer
	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
	st1	{v20.2d, v21.2d}, [x10]

	ret
.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5


////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w7
.hidden	ecp_nistz256_select_w7
.type	ecp_nistz256_select_w7,%function
.align	4
ecp_nistz256_select_w7:
	AARCH64_VALID_CALL_TARGET

	// w9 := 0; loop counter and incremented internal index
	mov	w9, #0

	// [v16-v19] := 0
	movi	v16.16b, #0
	movi	v17.16b, #0
	movi	v18.16b, #0
	movi	v19.16b, #0

.Lselect_w7_loop:
	// Loop 64 times.

	// Increment index (loop counter); tested at the end of the loop
	add	w9, w9, #1

	// [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
	// and advance x1 to point to the next entry
	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

	// x11 := (w9 == w2)? All 1s : All 0s
	cmp	w9, w2
	csetm	x11, eq

	// duplicate mask_64 into Mask (all 0s or all 1s)
	dup	v3.2d, x11

	// [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
	// i.e., values in output registers will remain the same if w9 != w2
	bit	v16.16b, v22.16b, v3.16b
	bit	v17.16b, v23.16b, v3.16b

	bit	v18.16b, v24.16b, v3.16b
	bit	v19.16b, v25.16b, v3.16b

	// If bit #6 is 0 (i.e. idx_ctr < 64) loop back
	tbz	w9, #6, .Lselect_w7_loop

	// Write [v16-v19] to memory at the output pointer
	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x0]

	ret
.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
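// Rough C model of the constant-time table lookups above, for orientation
// only (entries are 24 words for select_w5 and 16 words for select_w7; the
// sketch below uses select_w5's sizes and is not the library's C code):
//
//	void select_w5(uint64_t val[24], const uint64_t in_t[16*24], int index) {
//		uint64_t acc[24] = {0};
//		for (int i = 1; i <= 16; i++) {
//			uint64_t mask = (i == index) ? ~(uint64_t)0 : 0;   // csetm + dup
//			for (int j = 0; j < 24; j++)
//				acc[j] |= in_t[(i - 1)*24 + j] & mask;     // bit (bitwise insert)
//		}
//		memcpy(val, acc, sizeof(acc));   // index 0 (or out of range) yields zeros
//	}
//
// Every table entry is read regardless of index, so the memory access pattern
// does not depend on the secret index.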