1#! /usr/bin/env perl 2# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <[email protected]> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# ECP_NISTZ256 module for ARMv8. 18# 19# February 2015. 20# 21# Original ECP_NISTZ256 submission targeting x86_64 is detailed in 22# http://eprint.iacr.org/2013/816. 23# 24# with/without -DECP_NISTZ256_ASM 25# Apple A7 +190-360% 26# Cortex-A53 +190-400% 27# Cortex-A57 +190-350% 28# Denver +230-400% 29# 30# Ranges denote minimum and maximum improvement coefficients depending 31# on benchmark. Lower coefficients are for ECDSA sign, server-side 32# operation. Keep in mind that +400% means 5x improvement. 33 34# The first two arguments should always be the flavour and output file path. 35if ($#ARGV < 1) { die "Not enough arguments provided. 36 Two arguments are necessary: the flavour and the output file path."; } 37 38$flavour = shift; 39$output = shift; 40 41$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 42( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 43( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or 44die "can't locate arm-xlate.pl"; 45 46open OUT,"| \"$^X\" $xlate $flavour $output"; 47*STDOUT=*OUT; 48 49{ 50my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3, 51 $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) = 52 map("x$_",(0..17,19,20)); 53 54my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont 55 56$code.=<<___; 57#include "openssl/arm_arch.h" 58 59.section .rodata 60.align 5 61.Lpoly: 62.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 63.LRR: // 2^512 mod P precomputed for NIST P256 polynomial 64.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd 65.Lone_mont: 66.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe 67.Lone: 68.quad 1,0,0,0 69.Lord: 70.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 71.LordK: 72.quad 0xccd1c8aaee00bc4f 73.asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" 74.text 75 76// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], 77// const BN_ULONG x2[4]); 78.globl ecp_nistz256_mul_mont 79.type ecp_nistz256_mul_mont,%function 80.align 4 81ecp_nistz256_mul_mont: 82 AARCH64_SIGN_LINK_REGISTER 83 stp x29,x30,[sp,#-32]! 84 add x29,sp,#0 85 stp x19,x20,[sp,#16] 86 87 ldr $bi,[$bp] // bp[0] 88 ldp $a0,$a1,[$ap] 89 ldp $a2,$a3,[$ap,#16] 90 adrp $poly3,:pg_hi21:.Lpoly 91 add $poly3,$poly3,:lo12:.Lpoly 92 ldr $poly1,[$poly3,#8] 93 ldr $poly3,[$poly3,#24] 94 95 bl __ecp_nistz256_mul_mont 96 97 ldp x19,x20,[sp,#16] 98 ldp x29,x30,[sp],#32 99 AARCH64_VALIDATE_LINK_REGISTER 100 ret 101.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont 102 103// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); 104.globl ecp_nistz256_sqr_mont 105.type ecp_nistz256_sqr_mont,%function 106.align 4 107ecp_nistz256_sqr_mont: 108 AARCH64_SIGN_LINK_REGISTER 109 stp x29,x30,[sp,#-32]! 110 add x29,sp,#0 111 stp x19,x20,[sp,#16] 112 113 ldp $a0,$a1,[$ap] 114 ldp $a2,$a3,[$ap,#16] 115 adrp $poly3,:pg_hi21:.Lpoly 116 add $poly3,$poly3,:lo12:.Lpoly 117 ldr $poly1,[$poly3,#8] 118 ldr $poly3,[$poly3,#24] 119 120 bl __ecp_nistz256_sqr_mont 121 122 ldp x19,x20,[sp,#16] 123 ldp x29,x30,[sp],#32 124 AARCH64_VALIDATE_LINK_REGISTER 125 ret 126.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont 127 128// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); 129.globl ecp_nistz256_div_by_2 130.type ecp_nistz256_div_by_2,%function 131.align 4 132ecp_nistz256_div_by_2: 133 AARCH64_SIGN_LINK_REGISTER 134 stp x29,x30,[sp,#-16]! 135 add x29,sp,#0 136 137 ldp $acc0,$acc1,[$ap] 138 ldp $acc2,$acc3,[$ap,#16] 139 adrp $poly3,:pg_hi21:.Lpoly 140 add $poly3,$poly3,:lo12:.Lpoly 141 ldr $poly1,[$poly3,#8] 142 ldr $poly3,[$poly3,#24] 143 144 bl __ecp_nistz256_div_by_2 145 146 ldp x29,x30,[sp],#16 147 AARCH64_VALIDATE_LINK_REGISTER 148 ret 149.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 150 151// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); 152.globl ecp_nistz256_mul_by_2 153.type ecp_nistz256_mul_by_2,%function 154.align 4 155ecp_nistz256_mul_by_2: 156 AARCH64_SIGN_LINK_REGISTER 157 stp x29,x30,[sp,#-16]! 158 add x29,sp,#0 159 160 ldp $acc0,$acc1,[$ap] 161 ldp $acc2,$acc3,[$ap,#16] 162 adrp $poly3,:pg_hi21:.Lpoly 163 add $poly3,$poly3,:lo12:.Lpoly 164 ldr $poly1,[$poly3,#8] 165 ldr $poly3,[$poly3,#24] 166 mov $t0,$acc0 167 mov $t1,$acc1 168 mov $t2,$acc2 169 mov $t3,$acc3 170 171 bl __ecp_nistz256_add_to // ret = a+a // 2*a 172 173 ldp x29,x30,[sp],#16 174 AARCH64_VALIDATE_LINK_REGISTER 175 ret 176.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 177 178// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); 179.globl ecp_nistz256_mul_by_3 180.type ecp_nistz256_mul_by_3,%function 181.align 4 182ecp_nistz256_mul_by_3: 183 AARCH64_SIGN_LINK_REGISTER 184 stp x29,x30,[sp,#-16]! 185 add x29,sp,#0 186 187 ldp $acc0,$acc1,[$ap] 188 ldp $acc2,$acc3,[$ap,#16] 189 adrp $poly3,:pg_hi21:.Lpoly 190 add $poly3,$poly3,:lo12:.Lpoly 191 ldr $poly1,[$poly3,#8] 192 ldr $poly3,[$poly3,#24] 193 mov $t0,$acc0 194 mov $t1,$acc1 195 mov $t2,$acc2 196 mov $t3,$acc3 197 mov $a0,$acc0 198 mov $a1,$acc1 199 mov $a2,$acc2 200 mov $a3,$acc3 201 202 bl __ecp_nistz256_add_to // ret = a+a // 2*a 203 204 mov $t0,$a0 205 mov $t1,$a1 206 mov $t2,$a2 207 mov $t3,$a3 208 209 bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a 210 211 ldp x29,x30,[sp],#16 212 AARCH64_VALIDATE_LINK_REGISTER 213 ret 214.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 215 216// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], 217// const BN_ULONG x2[4]); 218.globl ecp_nistz256_sub 219.type ecp_nistz256_sub,%function 220.align 4 221ecp_nistz256_sub: 222 AARCH64_SIGN_LINK_REGISTER 223 stp x29,x30,[sp,#-16]! 224 add x29,sp,#0 225 226 ldp $acc0,$acc1,[$ap] 227 ldp $acc2,$acc3,[$ap,#16] 228 adrp $poly3,:pg_hi21:.Lpoly 229 add $poly3,$poly3,:lo12:.Lpoly 230 ldr $poly1,[$poly3,#8] 231 ldr $poly3,[$poly3,#24] 232 233 bl __ecp_nistz256_sub_from 234 235 ldp x29,x30,[sp],#16 236 AARCH64_VALIDATE_LINK_REGISTER 237 ret 238.size ecp_nistz256_sub,.-ecp_nistz256_sub 239 240// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); 241.globl ecp_nistz256_neg 242.type ecp_nistz256_neg,%function 243.align 4 244ecp_nistz256_neg: 245 AARCH64_SIGN_LINK_REGISTER 246 stp x29,x30,[sp,#-16]! 247 add x29,sp,#0 248 249 mov $bp,$ap 250 mov $acc0,xzr // a = 0 251 mov $acc1,xzr 252 mov $acc2,xzr 253 mov $acc3,xzr 254 adrp $poly3,:pg_hi21:.Lpoly 255 add $poly3,$poly3,:lo12:.Lpoly 256 ldr $poly1,[$poly3,#8] 257 ldr $poly3,[$poly3,#24] 258 259 bl __ecp_nistz256_sub_from 260 261 ldp x29,x30,[sp],#16 262 AARCH64_VALIDATE_LINK_REGISTER 263 ret 264.size ecp_nistz256_neg,.-ecp_nistz256_neg 265 266// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded 267// to $a0-$a3 and b[0] - to $bi 268.type __ecp_nistz256_mul_mont,%function 269.align 4 270__ecp_nistz256_mul_mont: 271 mul $acc0,$a0,$bi // a[0]*b[0] 272 umulh $t0,$a0,$bi 273 274 mul $acc1,$a1,$bi // a[1]*b[0] 275 umulh $t1,$a1,$bi 276 277 mul $acc2,$a2,$bi // a[2]*b[0] 278 umulh $t2,$a2,$bi 279 280 mul $acc3,$a3,$bi // a[3]*b[0] 281 umulh $t3,$a3,$bi 282 ldr $bi,[$bp,#8] // b[1] 283 284 adds $acc1,$acc1,$t0 // accumulate high parts of multiplication 285 lsl $t0,$acc0,#32 286 adcs $acc2,$acc2,$t1 287 lsr $t1,$acc0,#32 288 adcs $acc3,$acc3,$t2 289 adc $acc4,xzr,$t3 290 mov $acc5,xzr 291___ 292for($i=1;$i<4;$i++) { 293 # Reduction iteration is normally performed by accumulating 294 # result of multiplication of modulus by "magic" digit [and 295 # omitting least significant word, which is guaranteed to 296 # be 0], but thanks to special form of modulus and "magic" 297 # digit being equal to least significant word, it can be 298 # performed with additions and subtractions alone. Indeed: 299 # 300 # ffff0001.00000000.0000ffff.ffffffff 301 # * abcdefgh 302 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh 303 # 304 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we 305 # rewrite above as: 306 # 307 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh 308 # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 309 # - 0000abcd.efgh0000.00000000.00000000.abcdefgh 310 # 311 # or marking redundant operations: 312 # 313 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- 314 # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- 315 # - 0000abcd.efgh0000.--------.--------.-------- 316 317$code.=<<___; 318 subs $t2,$acc0,$t0 // "*0xffff0001" 319 sbc $t3,$acc0,$t1 320 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] 321 mul $t0,$a0,$bi // lo(a[0]*b[i]) 322 adcs $acc1,$acc2,$t1 323 mul $t1,$a1,$bi // lo(a[1]*b[i]) 324 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 325 mul $t2,$a2,$bi // lo(a[2]*b[i]) 326 adcs $acc3,$acc4,$t3 327 mul $t3,$a3,$bi // lo(a[3]*b[i]) 328 adc $acc4,$acc5,xzr 329 330 adds $acc0,$acc0,$t0 // accumulate low parts of multiplication 331 umulh $t0,$a0,$bi // hi(a[0]*b[i]) 332 adcs $acc1,$acc1,$t1 333 umulh $t1,$a1,$bi // hi(a[1]*b[i]) 334 adcs $acc2,$acc2,$t2 335 umulh $t2,$a2,$bi // hi(a[2]*b[i]) 336 adcs $acc3,$acc3,$t3 337 umulh $t3,$a3,$bi // hi(a[3]*b[i]) 338 adc $acc4,$acc4,xzr 339___ 340$code.=<<___ if ($i<3); 341 ldr $bi,[$bp,#8*($i+1)] // b[$i+1] 342___ 343$code.=<<___; 344 adds $acc1,$acc1,$t0 // accumulate high parts of multiplication 345 lsl $t0,$acc0,#32 346 adcs $acc2,$acc2,$t1 347 lsr $t1,$acc0,#32 348 adcs $acc3,$acc3,$t2 349 adcs $acc4,$acc4,$t3 350 adc $acc5,xzr,xzr 351___ 352} 353$code.=<<___; 354 // last reduction 355 subs $t2,$acc0,$t0 // "*0xffff0001" 356 sbc $t3,$acc0,$t1 357 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] 358 adcs $acc1,$acc2,$t1 359 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 360 adcs $acc3,$acc4,$t3 361 adc $acc4,$acc5,xzr 362 363 adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus 364 sbcs $t1,$acc1,$poly1 365 sbcs $t2,$acc2,xzr 366 sbcs $t3,$acc3,$poly3 367 sbcs xzr,$acc4,xzr // did it borrow? 368 369 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus 370 csel $acc1,$acc1,$t1,lo 371 csel $acc2,$acc2,$t2,lo 372 stp $acc0,$acc1,[$rp] 373 csel $acc3,$acc3,$t3,lo 374 stp $acc2,$acc3,[$rp,#16] 375 376 ret 377.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont 378 379// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded 380// to $a0-$a3 381.type __ecp_nistz256_sqr_mont,%function 382.align 4 383__ecp_nistz256_sqr_mont: 384 // | | | | | |a1*a0| | 385 // | | | | |a2*a0| | | 386 // | |a3*a2|a3*a0| | | | 387 // | | | |a2*a1| | | | 388 // | | |a3*a1| | | | | 389 // *| | | | | | | | 2| 390 // +|a3*a3|a2*a2|a1*a1|a0*a0| 391 // |--+--+--+--+--+--+--+--| 392 // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx 393 // 394 // "can't overflow" below mark carrying into high part of 395 // multiplication result, which can't overflow, because it 396 // can never be all ones. 397 398 mul $acc1,$a1,$a0 // a[1]*a[0] 399 umulh $t1,$a1,$a0 400 mul $acc2,$a2,$a0 // a[2]*a[0] 401 umulh $t2,$a2,$a0 402 mul $acc3,$a3,$a0 // a[3]*a[0] 403 umulh $acc4,$a3,$a0 404 405 adds $acc2,$acc2,$t1 // accumulate high parts of multiplication 406 mul $t0,$a2,$a1 // a[2]*a[1] 407 umulh $t1,$a2,$a1 408 adcs $acc3,$acc3,$t2 409 mul $t2,$a3,$a1 // a[3]*a[1] 410 umulh $t3,$a3,$a1 411 adc $acc4,$acc4,xzr // can't overflow 412 413 mul $acc5,$a3,$a2 // a[3]*a[2] 414 umulh $acc6,$a3,$a2 415 416 adds $t1,$t1,$t2 // accumulate high parts of multiplication 417 mul $acc0,$a0,$a0 // a[0]*a[0] 418 adc $t2,$t3,xzr // can't overflow 419 420 adds $acc3,$acc3,$t0 // accumulate low parts of multiplication 421 umulh $a0,$a0,$a0 422 adcs $acc4,$acc4,$t1 423 mul $t1,$a1,$a1 // a[1]*a[1] 424 adcs $acc5,$acc5,$t2 425 umulh $a1,$a1,$a1 426 adc $acc6,$acc6,xzr // can't overflow 427 428 adds $acc1,$acc1,$acc1 // acc[1-6]*=2 429 mul $t2,$a2,$a2 // a[2]*a[2] 430 adcs $acc2,$acc2,$acc2 431 umulh $a2,$a2,$a2 432 adcs $acc3,$acc3,$acc3 433 mul $t3,$a3,$a3 // a[3]*a[3] 434 adcs $acc4,$acc4,$acc4 435 umulh $a3,$a3,$a3 436 adcs $acc5,$acc5,$acc5 437 adcs $acc6,$acc6,$acc6 438 adc $acc7,xzr,xzr 439 440 adds $acc1,$acc1,$a0 // +a[i]*a[i] 441 adcs $acc2,$acc2,$t1 442 adcs $acc3,$acc3,$a1 443 adcs $acc4,$acc4,$t2 444 adcs $acc5,$acc5,$a2 445 lsl $t0,$acc0,#32 446 adcs $acc6,$acc6,$t3 447 lsr $t1,$acc0,#32 448 adc $acc7,$acc7,$a3 449___ 450for($i=0;$i<3;$i++) { # reductions, see commentary in 451 # multiplication for details 452$code.=<<___; 453 subs $t2,$acc0,$t0 // "*0xffff0001" 454 sbc $t3,$acc0,$t1 455 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] 456 adcs $acc1,$acc2,$t1 457 lsl $t0,$acc0,#32 458 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 459 lsr $t1,$acc0,#32 460 adc $acc3,$t3,xzr // can't overflow 461___ 462} 463$code.=<<___; 464 subs $t2,$acc0,$t0 // "*0xffff0001" 465 sbc $t3,$acc0,$t1 466 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] 467 adcs $acc1,$acc2,$t1 468 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 469 adc $acc3,$t3,xzr // can't overflow 470 471 adds $acc0,$acc0,$acc4 // accumulate upper half 472 adcs $acc1,$acc1,$acc5 473 adcs $acc2,$acc2,$acc6 474 adcs $acc3,$acc3,$acc7 475 adc $acc4,xzr,xzr 476 477 adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus 478 sbcs $t1,$acc1,$poly1 479 sbcs $t2,$acc2,xzr 480 sbcs $t3,$acc3,$poly3 481 sbcs xzr,$acc4,xzr // did it borrow? 482 483 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus 484 csel $acc1,$acc1,$t1,lo 485 csel $acc2,$acc2,$t2,lo 486 stp $acc0,$acc1,[$rp] 487 csel $acc3,$acc3,$t3,lo 488 stp $acc2,$acc3,[$rp,#16] 489 490 ret 491.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont 492 493// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to 494// $a0-$a3 and $t0-$t3. This is done because it's used in multiple 495// contexts, e.g. in multiplication by 2 and 3... 496.type __ecp_nistz256_add_to,%function 497.align 4 498__ecp_nistz256_add_to: 499 adds $acc0,$acc0,$t0 // ret = a+b 500 adcs $acc1,$acc1,$t1 501 adcs $acc2,$acc2,$t2 502 adcs $acc3,$acc3,$t3 503 adc $ap,xzr,xzr // zap $ap 504 505 adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus 506 sbcs $t1,$acc1,$poly1 507 sbcs $t2,$acc2,xzr 508 sbcs $t3,$acc3,$poly3 509 sbcs xzr,$ap,xzr // did subtraction borrow? 510 511 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus 512 csel $acc1,$acc1,$t1,lo 513 csel $acc2,$acc2,$t2,lo 514 stp $acc0,$acc1,[$rp] 515 csel $acc3,$acc3,$t3,lo 516 stp $acc2,$acc3,[$rp,#16] 517 518 ret 519.size __ecp_nistz256_add_to,.-__ecp_nistz256_add_to 520 521.type __ecp_nistz256_sub_from,%function 522.align 4 523__ecp_nistz256_sub_from: 524 ldp $t0,$t1,[$bp] 525 ldp $t2,$t3,[$bp,#16] 526 subs $acc0,$acc0,$t0 // ret = a-b 527 sbcs $acc1,$acc1,$t1 528 sbcs $acc2,$acc2,$t2 529 sbcs $acc3,$acc3,$t3 530 sbc $ap,xzr,xzr // zap $ap 531 532 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus 533 adcs $t1,$acc1,$poly1 534 adcs $t2,$acc2,xzr 535 adc $t3,$acc3,$poly3 536 cmp $ap,xzr // did subtraction borrow? 537 538 csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret 539 csel $acc1,$acc1,$t1,eq 540 csel $acc2,$acc2,$t2,eq 541 stp $acc0,$acc1,[$rp] 542 csel $acc3,$acc3,$t3,eq 543 stp $acc2,$acc3,[$rp,#16] 544 545 ret 546.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from 547 548.type __ecp_nistz256_sub_morf,%function 549.align 4 550__ecp_nistz256_sub_morf: 551 ldp $t0,$t1,[$bp] 552 ldp $t2,$t3,[$bp,#16] 553 subs $acc0,$t0,$acc0 // ret = b-a 554 sbcs $acc1,$t1,$acc1 555 sbcs $acc2,$t2,$acc2 556 sbcs $acc3,$t3,$acc3 557 sbc $ap,xzr,xzr // zap $ap 558 559 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus 560 adcs $t1,$acc1,$poly1 561 adcs $t2,$acc2,xzr 562 adc $t3,$acc3,$poly3 563 cmp $ap,xzr // did subtraction borrow? 564 565 csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret 566 csel $acc1,$acc1,$t1,eq 567 csel $acc2,$acc2,$t2,eq 568 stp $acc0,$acc1,[$rp] 569 csel $acc3,$acc3,$t3,eq 570 stp $acc2,$acc3,[$rp,#16] 571 572 ret 573.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf 574 575.type __ecp_nistz256_div_by_2,%function 576.align 4 577__ecp_nistz256_div_by_2: 578 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus 579 adcs $t1,$acc1,$poly1 580 adcs $t2,$acc2,xzr 581 adcs $t3,$acc3,$poly3 582 adc $ap,xzr,xzr // zap $ap 583 tst $acc0,#1 // is a even? 584 585 csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus 586 csel $acc1,$acc1,$t1,eq 587 csel $acc2,$acc2,$t2,eq 588 csel $acc3,$acc3,$t3,eq 589 csel $ap,xzr,$ap,eq 590 591 lsr $acc0,$acc0,#1 // ret >>= 1 592 orr $acc0,$acc0,$acc1,lsl#63 593 lsr $acc1,$acc1,#1 594 orr $acc1,$acc1,$acc2,lsl#63 595 lsr $acc2,$acc2,#1 596 orr $acc2,$acc2,$acc3,lsl#63 597 lsr $acc3,$acc3,#1 598 stp $acc0,$acc1,[$rp] 599 orr $acc3,$acc3,$ap,lsl#63 600 stp $acc2,$acc3,[$rp,#16] 601 602 ret 603.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 604___ 605######################################################################## 606# following subroutines are "literal" implementation of those found in 607# ecp_nistz256.c 608# 609######################################################################## 610# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); 611# 612{ 613my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); 614# above map() describes stack layout with 4 temporary 615# 256-bit vectors on top. 616my ($rp_real,$ap_real) = map("x$_",(21,22)); 617 618$code.=<<___; 619.globl ecp_nistz256_point_double 620.type ecp_nistz256_point_double,%function 621.align 5 622ecp_nistz256_point_double: 623 AARCH64_SIGN_LINK_REGISTER 624 stp x29,x30,[sp,#-96]! 625 add x29,sp,#0 626 stp x19,x20,[sp,#16] 627 stp x21,x22,[sp,#32] 628 sub sp,sp,#32*4 629 630.Ldouble_shortcut: 631 ldp $acc0,$acc1,[$ap,#32] 632 mov $rp_real,$rp 633 ldp $acc2,$acc3,[$ap,#48] 634 mov $ap_real,$ap 635 adrp $poly3,:pg_hi21:.Lpoly 636 add $poly3,$poly3,:lo12:.Lpoly 637 ldr $poly1,[$poly3,#8] 638 mov $t0,$acc0 639 ldr $poly3,[$poly3,#24] 640 mov $t1,$acc1 641 ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont 642 mov $t2,$acc2 643 mov $t3,$acc3 644 ldp $a2,$a3,[$ap_real,#64+16] 645 add $rp,sp,#$S 646 bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); 647 648 add $rp,sp,#$Zsqr 649 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); 650 651 ldp $t0,$t1,[$ap_real] 652 ldp $t2,$t3,[$ap_real,#16] 653 mov $a0,$acc0 // put Zsqr aside for p256_sub 654 mov $a1,$acc1 655 mov $a2,$acc2 656 mov $a3,$acc3 657 add $rp,sp,#$M 658 bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); 659 660 add $bp,$ap_real,#0 661 mov $acc0,$a0 // restore Zsqr 662 mov $acc1,$a1 663 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont 664 mov $acc2,$a2 665 mov $acc3,$a3 666 ldp $a2,$a3,[sp,#$S+16] 667 add $rp,sp,#$Zsqr 668 bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); 669 670 add $rp,sp,#$S 671 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); 672 673 ldr $bi,[$ap_real,#32] 674 ldp $a0,$a1,[$ap_real,#64] 675 ldp $a2,$a3,[$ap_real,#64+16] 676 add $bp,$ap_real,#32 677 add $rp,sp,#$tmp0 678 bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); 679 680 mov $t0,$acc0 681 mov $t1,$acc1 682 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont 683 mov $t2,$acc2 684 mov $t3,$acc3 685 ldp $a2,$a3,[sp,#$S+16] 686 add $rp,$rp_real,#64 687 bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); 688 689 add $rp,sp,#$tmp0 690 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); 691 692 ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont 693 ldp $a0,$a1,[sp,#$M] 694 ldp $a2,$a3,[sp,#$M+16] 695 add $rp,$rp_real,#32 696 bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); 697 698 add $bp,sp,#$Zsqr 699 add $rp,sp,#$M 700 bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); 701 702 mov $t0,$acc0 // duplicate M 703 mov $t1,$acc1 704 mov $t2,$acc2 705 mov $t3,$acc3 706 mov $a0,$acc0 // put M aside 707 mov $a1,$acc1 708 mov $a2,$acc2 709 mov $a3,$acc3 710 add $rp,sp,#$M 711 bl __ecp_nistz256_add_to 712 mov $t0,$a0 // restore M 713 mov $t1,$a1 714 ldr $bi,[$ap_real] // forward load for p256_mul_mont 715 mov $t2,$a2 716 ldp $a0,$a1,[sp,#$S] 717 mov $t3,$a3 718 ldp $a2,$a3,[sp,#$S+16] 719 bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); 720 721 add $bp,$ap_real,#0 722 add $rp,sp,#$S 723 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); 724 725 mov $t0,$acc0 726 mov $t1,$acc1 727 ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont 728 mov $t2,$acc2 729 mov $t3,$acc3 730 ldp $a2,$a3,[sp,#$M+16] 731 add $rp,sp,#$tmp0 732 bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); 733 734 add $rp,$rp_real,#0 735 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); 736 737 add $bp,sp,#$tmp0 738 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); 739 740 add $bp,sp,#$S 741 add $rp,sp,#$S 742 bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); 743 744 ldr $bi,[sp,#$M] 745 mov $a0,$acc0 // copy S 746 mov $a1,$acc1 747 mov $a2,$acc2 748 mov $a3,$acc3 749 add $bp,sp,#$M 750 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); 751 752 add $bp,$rp_real,#32 753 add $rp,$rp_real,#32 754 bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); 755 756 add sp,x29,#0 // destroy frame 757 ldp x19,x20,[x29,#16] 758 ldp x21,x22,[x29,#32] 759 ldp x29,x30,[sp],#96 760 AARCH64_VALIDATE_LINK_REGISTER 761 ret 762.size ecp_nistz256_point_double,.-ecp_nistz256_point_double 763___ 764} 765 766######################################################################## 767# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, 768# const P256_POINT *in2); 769{ 770my ($res_x,$res_y,$res_z, 771 $H,$Hsqr,$R,$Rsqr,$Hcub, 772 $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); 773my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 774# above map() describes stack layout with 12 temporary 775# 256-bit vectors on top. 776my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28)); 777 778$code.=<<___; 779.globl ecp_nistz256_point_add 780.type ecp_nistz256_point_add,%function 781.align 5 782ecp_nistz256_point_add: 783 AARCH64_SIGN_LINK_REGISTER 784 stp x29,x30,[sp,#-96]! 785 add x29,sp,#0 786 stp x19,x20,[sp,#16] 787 stp x21,x22,[sp,#32] 788 stp x23,x24,[sp,#48] 789 stp x25,x26,[sp,#64] 790 stp x27,x28,[sp,#80] 791 sub sp,sp,#32*12 792 793 ldp $a0,$a1,[$bp,#64] // in2_z 794 ldp $a2,$a3,[$bp,#64+16] 795 mov $rp_real,$rp 796 mov $ap_real,$ap 797 mov $bp_real,$bp 798 adrp $poly3,:pg_hi21:.Lpoly 799 add $poly3,$poly3,:lo12:.Lpoly 800 ldr $poly1,[$poly3,#8] 801 ldr $poly3,[$poly3,#24] 802 orr $t0,$a0,$a1 803 orr $t2,$a2,$a3 804 orr $in2infty,$t0,$t2 805 cmp $in2infty,#0 806 csetm $in2infty,ne // ~in2infty 807 add $rp,sp,#$Z2sqr 808 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); 809 810 ldp $a0,$a1,[$ap_real,#64] // in1_z 811 ldp $a2,$a3,[$ap_real,#64+16] 812 orr $t0,$a0,$a1 813 orr $t2,$a2,$a3 814 orr $in1infty,$t0,$t2 815 cmp $in1infty,#0 816 csetm $in1infty,ne // ~in1infty 817 add $rp,sp,#$Z1sqr 818 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); 819 820 ldr $bi,[$bp_real,#64] 821 ldp $a0,$a1,[sp,#$Z2sqr] 822 ldp $a2,$a3,[sp,#$Z2sqr+16] 823 add $bp,$bp_real,#64 824 add $rp,sp,#$S1 825 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); 826 827 ldr $bi,[$ap_real,#64] 828 ldp $a0,$a1,[sp,#$Z1sqr] 829 ldp $a2,$a3,[sp,#$Z1sqr+16] 830 add $bp,$ap_real,#64 831 add $rp,sp,#$S2 832 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); 833 834 ldr $bi,[$ap_real,#32] 835 ldp $a0,$a1,[sp,#$S1] 836 ldp $a2,$a3,[sp,#$S1+16] 837 add $bp,$ap_real,#32 838 add $rp,sp,#$S1 839 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); 840 841 ldr $bi,[$bp_real,#32] 842 ldp $a0,$a1,[sp,#$S2] 843 ldp $a2,$a3,[sp,#$S2+16] 844 add $bp,$bp_real,#32 845 add $rp,sp,#$S2 846 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); 847 848 add $bp,sp,#$S1 849 ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont 850 ldp $a0,$a1,[$ap_real] 851 ldp $a2,$a3,[$ap_real,#16] 852 add $rp,sp,#$R 853 bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); 854 855 orr $acc0,$acc0,$acc1 // see if result is zero 856 orr $acc2,$acc2,$acc3 857 orr $temp0,$acc0,$acc2 // ~is_equal(S1,S2) 858 859 add $bp,sp,#$Z2sqr 860 add $rp,sp,#$U1 861 bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); 862 863 ldr $bi,[sp,#$Z1sqr] 864 ldp $a0,$a1,[$bp_real] 865 ldp $a2,$a3,[$bp_real,#16] 866 add $bp,sp,#$Z1sqr 867 add $rp,sp,#$U2 868 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); 869 870 add $bp,sp,#$U1 871 ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont 872 ldp $a2,$a3,[sp,#$R+16] 873 add $rp,sp,#$H 874 bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); 875 876 orr $acc0,$acc0,$acc1 // see if result is zero 877 orr $acc2,$acc2,$acc3 878 orr $acc0,$acc0,$acc2 // ~is_equal(U1,U2) 879 880 mvn $temp1,$in1infty // -1/0 -> 0/-1 881 mvn $temp2,$in2infty // -1/0 -> 0/-1 882 orr $acc0,$acc0,$temp1 883 orr $acc0,$acc0,$temp2 884 orr $acc0,$acc0,$temp0 885 cbnz $acc0,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) 886 887.Ladd_double: 888 mov $ap,$ap_real 889 mov $rp,$rp_real 890 ldp x23,x24,[x29,#48] 891 ldp x25,x26,[x29,#64] 892 ldp x27,x28,[x29,#80] 893 add sp,sp,#256 // #256 is from #32*(12-4). difference in stack frames 894 b .Ldouble_shortcut 895 896.align 4 897.Ladd_proceed: 898 add $rp,sp,#$Rsqr 899 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); 900 901 ldr $bi,[$ap_real,#64] 902 ldp $a0,$a1,[sp,#$H] 903 ldp $a2,$a3,[sp,#$H+16] 904 add $bp,$ap_real,#64 905 add $rp,sp,#$res_z 906 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); 907 908 ldp $a0,$a1,[sp,#$H] 909 ldp $a2,$a3,[sp,#$H+16] 910 add $rp,sp,#$Hsqr 911 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); 912 913 ldr $bi,[$bp_real,#64] 914 ldp $a0,$a1,[sp,#$res_z] 915 ldp $a2,$a3,[sp,#$res_z+16] 916 add $bp,$bp_real,#64 917 add $rp,sp,#$res_z 918 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); 919 920 ldr $bi,[sp,#$H] 921 ldp $a0,$a1,[sp,#$Hsqr] 922 ldp $a2,$a3,[sp,#$Hsqr+16] 923 add $bp,sp,#$H 924 add $rp,sp,#$Hcub 925 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); 926 927 ldr $bi,[sp,#$Hsqr] 928 ldp $a0,$a1,[sp,#$U1] 929 ldp $a2,$a3,[sp,#$U1+16] 930 add $bp,sp,#$Hsqr 931 add $rp,sp,#$U2 932 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); 933 934 mov $t0,$acc0 935 mov $t1,$acc1 936 mov $t2,$acc2 937 mov $t3,$acc3 938 add $rp,sp,#$Hsqr 939 bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); 940 941 add $bp,sp,#$Rsqr 942 add $rp,sp,#$res_x 943 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); 944 945 add $bp,sp,#$Hcub 946 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); 947 948 add $bp,sp,#$U2 949 ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont 950 ldp $a0,$a1,[sp,#$S1] 951 ldp $a2,$a3,[sp,#$S1+16] 952 add $rp,sp,#$res_y 953 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); 954 955 add $bp,sp,#$Hcub 956 add $rp,sp,#$S2 957 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); 958 959 ldr $bi,[sp,#$R] 960 ldp $a0,$a1,[sp,#$res_y] 961 ldp $a2,$a3,[sp,#$res_y+16] 962 add $bp,sp,#$R 963 add $rp,sp,#$res_y 964 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); 965 966 add $bp,sp,#$S2 967 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); 968 969 ldp $a0,$a1,[sp,#$res_x] // res 970 ldp $a2,$a3,[sp,#$res_x+16] 971 ldp $t0,$t1,[$bp_real] // in2 972 ldp $t2,$t3,[$bp_real,#16] 973___ 974for($i=0;$i<64;$i+=32) { # conditional moves 975$code.=<<___; 976 ldp $acc0,$acc1,[$ap_real,#$i] // in1 977 cmp $in1infty,#0 // ~$in1intfy, remember? 978 ldp $acc2,$acc3,[$ap_real,#$i+16] 979 csel $t0,$a0,$t0,ne 980 csel $t1,$a1,$t1,ne 981 ldp $a0,$a1,[sp,#$res_x+$i+32] // res 982 csel $t2,$a2,$t2,ne 983 csel $t3,$a3,$t3,ne 984 cmp $in2infty,#0 // ~$in2intfy, remember? 985 ldp $a2,$a3,[sp,#$res_x+$i+48] 986 csel $acc0,$t0,$acc0,ne 987 csel $acc1,$t1,$acc1,ne 988 ldp $t0,$t1,[$bp_real,#$i+32] // in2 989 csel $acc2,$t2,$acc2,ne 990 csel $acc3,$t3,$acc3,ne 991 ldp $t2,$t3,[$bp_real,#$i+48] 992 stp $acc0,$acc1,[$rp_real,#$i] 993 stp $acc2,$acc3,[$rp_real,#$i+16] 994___ 995} 996$code.=<<___; 997 ldp $acc0,$acc1,[$ap_real,#$i] // in1 998 cmp $in1infty,#0 // ~$in1intfy, remember? 999 ldp $acc2,$acc3,[$ap_real,#$i+16] 1000 csel $t0,$a0,$t0,ne 1001 csel $t1,$a1,$t1,ne 1002 csel $t2,$a2,$t2,ne 1003 csel $t3,$a3,$t3,ne 1004 cmp $in2infty,#0 // ~$in2intfy, remember? 1005 csel $acc0,$t0,$acc0,ne 1006 csel $acc1,$t1,$acc1,ne 1007 csel $acc2,$t2,$acc2,ne 1008 csel $acc3,$t3,$acc3,ne 1009 stp $acc0,$acc1,[$rp_real,#$i] 1010 stp $acc2,$acc3,[$rp_real,#$i+16] 1011 1012.Ladd_done: 1013 add sp,x29,#0 // destroy frame 1014 ldp x19,x20,[x29,#16] 1015 ldp x21,x22,[x29,#32] 1016 ldp x23,x24,[x29,#48] 1017 ldp x25,x26,[x29,#64] 1018 ldp x27,x28,[x29,#80] 1019 ldp x29,x30,[sp],#96 1020 AARCH64_VALIDATE_LINK_REGISTER 1021 ret 1022.size ecp_nistz256_point_add,.-ecp_nistz256_point_add 1023___ 1024} 1025 1026######################################################################## 1027# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, 1028# const P256_POINT_AFFINE *in2); 1029{ 1030my ($res_x,$res_y,$res_z, 1031 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9)); 1032my $Z1sqr = $S2; 1033# above map() describes stack layout with 10 temporary 1034# 256-bit vectors on top. 1035my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26)); 1036 1037$code.=<<___; 1038.globl ecp_nistz256_point_add_affine 1039.type ecp_nistz256_point_add_affine,%function 1040.align 5 1041ecp_nistz256_point_add_affine: 1042 AARCH64_SIGN_LINK_REGISTER 1043 stp x29,x30,[sp,#-80]! 1044 add x29,sp,#0 1045 stp x19,x20,[sp,#16] 1046 stp x21,x22,[sp,#32] 1047 stp x23,x24,[sp,#48] 1048 stp x25,x26,[sp,#64] 1049 sub sp,sp,#32*10 1050 1051 mov $rp_real,$rp 1052 mov $ap_real,$ap 1053 mov $bp_real,$bp 1054 adrp $poly3,:pg_hi21:.Lpoly 1055 add $poly3,$poly3,:lo12:.Lpoly 1056 ldr $poly1,[$poly3,#8] 1057 ldr $poly3,[$poly3,#24] 1058 1059 ldp $a0,$a1,[$ap,#64] // in1_z 1060 ldp $a2,$a3,[$ap,#64+16] 1061 orr $t0,$a0,$a1 1062 orr $t2,$a2,$a3 1063 orr $in1infty,$t0,$t2 1064 cmp $in1infty,#0 1065 csetm $in1infty,ne // ~in1infty 1066 1067 ldp $acc0,$acc1,[$bp] // in2_x 1068 ldp $acc2,$acc3,[$bp,#16] 1069 ldp $t0,$t1,[$bp,#32] // in2_y 1070 ldp $t2,$t3,[$bp,#48] 1071 orr $acc0,$acc0,$acc1 1072 orr $acc2,$acc2,$acc3 1073 orr $t0,$t0,$t1 1074 orr $t2,$t2,$t3 1075 orr $acc0,$acc0,$acc2 1076 orr $t0,$t0,$t2 1077 orr $in2infty,$acc0,$t0 1078 cmp $in2infty,#0 1079 csetm $in2infty,ne // ~in2infty 1080 1081 add $rp,sp,#$Z1sqr 1082 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); 1083 1084 mov $a0,$acc0 1085 mov $a1,$acc1 1086 mov $a2,$acc2 1087 mov $a3,$acc3 1088 ldr $bi,[$bp_real] 1089 add $bp,$bp_real,#0 1090 add $rp,sp,#$U2 1091 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); 1092 1093 add $bp,$ap_real,#0 1094 ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont 1095 ldp $a0,$a1,[sp,#$Z1sqr] 1096 ldp $a2,$a3,[sp,#$Z1sqr+16] 1097 add $rp,sp,#$H 1098 bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); 1099 1100 add $bp,$ap_real,#64 1101 add $rp,sp,#$S2 1102 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); 1103 1104 ldr $bi,[$ap_real,#64] 1105 ldp $a0,$a1,[sp,#$H] 1106 ldp $a2,$a3,[sp,#$H+16] 1107 add $bp,$ap_real,#64 1108 add $rp,sp,#$res_z 1109 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); 1110 1111 ldr $bi,[$bp_real,#32] 1112 ldp $a0,$a1,[sp,#$S2] 1113 ldp $a2,$a3,[sp,#$S2+16] 1114 add $bp,$bp_real,#32 1115 add $rp,sp,#$S2 1116 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); 1117 1118 add $bp,$ap_real,#32 1119 ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont 1120 ldp $a2,$a3,[sp,#$H+16] 1121 add $rp,sp,#$R 1122 bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); 1123 1124 add $rp,sp,#$Hsqr 1125 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); 1126 1127 ldp $a0,$a1,[sp,#$R] 1128 ldp $a2,$a3,[sp,#$R+16] 1129 add $rp,sp,#$Rsqr 1130 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); 1131 1132 ldr $bi,[sp,#$H] 1133 ldp $a0,$a1,[sp,#$Hsqr] 1134 ldp $a2,$a3,[sp,#$Hsqr+16] 1135 add $bp,sp,#$H 1136 add $rp,sp,#$Hcub 1137 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); 1138 1139 ldr $bi,[$ap_real] 1140 ldp $a0,$a1,[sp,#$Hsqr] 1141 ldp $a2,$a3,[sp,#$Hsqr+16] 1142 add $bp,$ap_real,#0 1143 add $rp,sp,#$U2 1144 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); 1145 1146 mov $t0,$acc0 1147 mov $t1,$acc1 1148 mov $t2,$acc2 1149 mov $t3,$acc3 1150 add $rp,sp,#$Hsqr 1151 bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); 1152 1153 add $bp,sp,#$Rsqr 1154 add $rp,sp,#$res_x 1155 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); 1156 1157 add $bp,sp,#$Hcub 1158 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); 1159 1160 add $bp,sp,#$U2 1161 ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont 1162 ldp $a0,$a1,[sp,#$Hcub] 1163 ldp $a2,$a3,[sp,#$Hcub+16] 1164 add $rp,sp,#$res_y 1165 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); 1166 1167 add $bp,$ap_real,#32 1168 add $rp,sp,#$S2 1169 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); 1170 1171 ldr $bi,[sp,#$R] 1172 ldp $a0,$a1,[sp,#$res_y] 1173 ldp $a2,$a3,[sp,#$res_y+16] 1174 add $bp,sp,#$R 1175 add $rp,sp,#$res_y 1176 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); 1177 1178 add $bp,sp,#$S2 1179 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); 1180 1181 ldp $a0,$a1,[sp,#$res_x] // res 1182 ldp $a2,$a3,[sp,#$res_x+16] 1183 ldp $t0,$t1,[$bp_real] // in2 1184 ldp $t2,$t3,[$bp_real,#16] 1185___ 1186for($i=0;$i<64;$i+=32) { # conditional moves 1187$code.=<<___; 1188 ldp $acc0,$acc1,[$ap_real,#$i] // in1 1189 cmp $in1infty,#0 // ~$in1intfy, remember? 1190 ldp $acc2,$acc3,[$ap_real,#$i+16] 1191 csel $t0,$a0,$t0,ne 1192 csel $t1,$a1,$t1,ne 1193 ldp $a0,$a1,[sp,#$res_x+$i+32] // res 1194 csel $t2,$a2,$t2,ne 1195 csel $t3,$a3,$t3,ne 1196 cmp $in2infty,#0 // ~$in2intfy, remember? 1197 ldp $a2,$a3,[sp,#$res_x+$i+48] 1198 csel $acc0,$t0,$acc0,ne 1199 csel $acc1,$t1,$acc1,ne 1200 ldp $t0,$t1,[$bp_real,#$i+32] // in2 1201 csel $acc2,$t2,$acc2,ne 1202 csel $acc3,$t3,$acc3,ne 1203 ldp $t2,$t3,[$bp_real,#$i+48] 1204 stp $acc0,$acc1,[$rp_real,#$i] 1205 stp $acc2,$acc3,[$rp_real,#$i+16] 1206___ 1207$code.=<<___ if ($i == 0); 1208 adrp $bp_real,:pg_hi21:.Lone_mont-64 1209 add $bp_real,$bp_real,:lo12:.Lone_mont-64 1210___ 1211} 1212$code.=<<___; 1213 ldp $acc0,$acc1,[$ap_real,#$i] // in1 1214 cmp $in1infty,#0 // ~$in1intfy, remember? 1215 ldp $acc2,$acc3,[$ap_real,#$i+16] 1216 csel $t0,$a0,$t0,ne 1217 csel $t1,$a1,$t1,ne 1218 csel $t2,$a2,$t2,ne 1219 csel $t3,$a3,$t3,ne 1220 cmp $in2infty,#0 // ~$in2intfy, remember? 1221 csel $acc0,$t0,$acc0,ne 1222 csel $acc1,$t1,$acc1,ne 1223 csel $acc2,$t2,$acc2,ne 1224 csel $acc3,$t3,$acc3,ne 1225 stp $acc0,$acc1,[$rp_real,#$i] 1226 stp $acc2,$acc3,[$rp_real,#$i+16] 1227 1228 add sp,x29,#0 // destroy frame 1229 ldp x19,x20,[x29,#16] 1230 ldp x21,x22,[x29,#32] 1231 ldp x23,x24,[x29,#48] 1232 ldp x25,x26,[x29,#64] 1233 ldp x29,x30,[sp],#80 1234 AARCH64_VALIDATE_LINK_REGISTER 1235 ret 1236.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine 1237___ 1238} 1239if (1) { 1240my ($ord0,$ord1) = ($poly1,$poly3); 1241my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24)); 1242my $acc7 = $bi; 1243 1244$code.=<<___; 1245//////////////////////////////////////////////////////////////////////// 1246// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], 1247// uint64_t b[4]); 1248.globl ecp_nistz256_ord_mul_mont 1249.type ecp_nistz256_ord_mul_mont,%function 1250.align 4 1251ecp_nistz256_ord_mul_mont: 1252 AARCH64_VALID_CALL_TARGET 1253 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 1254 stp x29,x30,[sp,#-64]! 1255 add x29,sp,#0 1256 stp x19,x20,[sp,#16] 1257 stp x21,x22,[sp,#32] 1258 stp x23,x24,[sp,#48] 1259 1260 adrp $ordk,:pg_hi21:.Lord 1261 add $ordk,$ordk,:lo12:.Lord 1262 ldr $bi,[$bp] // bp[0] 1263 ldp $a0,$a1,[$ap] 1264 ldp $a2,$a3,[$ap,#16] 1265 1266 ldp $ord0,$ord1,[$ordk,#0] 1267 ldp $ord2,$ord3,[$ordk,#16] 1268 ldr $ordk,[$ordk,#32] 1269 1270 mul $acc0,$a0,$bi // a[0]*b[0] 1271 umulh $t0,$a0,$bi 1272 1273 mul $acc1,$a1,$bi // a[1]*b[0] 1274 umulh $t1,$a1,$bi 1275 1276 mul $acc2,$a2,$bi // a[2]*b[0] 1277 umulh $t2,$a2,$bi 1278 1279 mul $acc3,$a3,$bi // a[3]*b[0] 1280 umulh $acc4,$a3,$bi 1281 1282 mul $t4,$acc0,$ordk 1283 1284 adds $acc1,$acc1,$t0 // accumulate high parts of multiplication 1285 adcs $acc2,$acc2,$t1 1286 adcs $acc3,$acc3,$t2 1287 adc $acc4,$acc4,xzr 1288 mov $acc5,xzr 1289___ 1290for ($i=1;$i<4;$i++) { 1291 ################################################################ 1292 # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz 1293 # * abcdefgh 1294 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx 1295 # 1296 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we 1297 # rewrite above as: 1298 # 1299 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx 1300 # - 0000abcd.efgh0000.abcdefgh.00000000.00000000 1301 # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh 1302$code.=<<___; 1303 ldr $bi,[$bp,#8*$i] // b[i] 1304 1305 lsl $t0,$t4,#32 1306 subs $acc2,$acc2,$t4 1307 lsr $t1,$t4,#32 1308 sbcs $acc3,$acc3,$t0 1309 sbcs $acc4,$acc4,$t1 1310 sbc $acc5,$acc5,xzr 1311 1312 subs xzr,$acc0,#1 1313 umulh $t1,$ord0,$t4 1314 mul $t2,$ord1,$t4 1315 umulh $t3,$ord1,$t4 1316 1317 adcs $t2,$t2,$t1 1318 mul $t0,$a0,$bi 1319 adc $t3,$t3,xzr 1320 mul $t1,$a1,$bi 1321 1322 adds $acc0,$acc1,$t2 1323 mul $t2,$a2,$bi 1324 adcs $acc1,$acc2,$t3 1325 mul $t3,$a3,$bi 1326 adcs $acc2,$acc3,$t4 1327 adcs $acc3,$acc4,$t4 1328 adc $acc4,$acc5,xzr 1329 1330 adds $acc0,$acc0,$t0 // accumulate low parts 1331 umulh $t0,$a0,$bi 1332 adcs $acc1,$acc1,$t1 1333 umulh $t1,$a1,$bi 1334 adcs $acc2,$acc2,$t2 1335 umulh $t2,$a2,$bi 1336 adcs $acc3,$acc3,$t3 1337 umulh $t3,$a3,$bi 1338 adc $acc4,$acc4,xzr 1339 mul $t4,$acc0,$ordk 1340 adds $acc1,$acc1,$t0 // accumulate high parts 1341 adcs $acc2,$acc2,$t1 1342 adcs $acc3,$acc3,$t2 1343 adcs $acc4,$acc4,$t3 1344 adc $acc5,xzr,xzr 1345___ 1346} 1347$code.=<<___; 1348 lsl $t0,$t4,#32 // last reduction 1349 subs $acc2,$acc2,$t4 1350 lsr $t1,$t4,#32 1351 sbcs $acc3,$acc3,$t0 1352 sbcs $acc4,$acc4,$t1 1353 sbc $acc5,$acc5,xzr 1354 1355 subs xzr,$acc0,#1 1356 umulh $t1,$ord0,$t4 1357 mul $t2,$ord1,$t4 1358 umulh $t3,$ord1,$t4 1359 1360 adcs $t2,$t2,$t1 1361 adc $t3,$t3,xzr 1362 1363 adds $acc0,$acc1,$t2 1364 adcs $acc1,$acc2,$t3 1365 adcs $acc2,$acc3,$t4 1366 adcs $acc3,$acc4,$t4 1367 adc $acc4,$acc5,xzr 1368 1369 subs $t0,$acc0,$ord0 // ret -= modulus 1370 sbcs $t1,$acc1,$ord1 1371 sbcs $t2,$acc2,$ord2 1372 sbcs $t3,$acc3,$ord3 1373 sbcs xzr,$acc4,xzr 1374 1375 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus 1376 csel $acc1,$acc1,$t1,lo 1377 csel $acc2,$acc2,$t2,lo 1378 stp $acc0,$acc1,[$rp] 1379 csel $acc3,$acc3,$t3,lo 1380 stp $acc2,$acc3,[$rp,#16] 1381 1382 ldp x19,x20,[sp,#16] 1383 ldp x21,x22,[sp,#32] 1384 ldp x23,x24,[sp,#48] 1385 ldr x29,[sp],#64 1386 ret 1387.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont 1388 1389//////////////////////////////////////////////////////////////////////// 1390// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], 1391// uint64_t rep); 1392.globl ecp_nistz256_ord_sqr_mont 1393.type ecp_nistz256_ord_sqr_mont,%function 1394.align 4 1395ecp_nistz256_ord_sqr_mont: 1396 AARCH64_VALID_CALL_TARGET 1397 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 1398 stp x29,x30,[sp,#-64]! 1399 add x29,sp,#0 1400 stp x19,x20,[sp,#16] 1401 stp x21,x22,[sp,#32] 1402 stp x23,x24,[sp,#48] 1403 1404 adrp $ordk,:pg_hi21:.Lord 1405 add $ordk,$ordk,:lo12:.Lord 1406 ldp $a0,$a1,[$ap] 1407 ldp $a2,$a3,[$ap,#16] 1408 1409 ldp $ord0,$ord1,[$ordk,#0] 1410 ldp $ord2,$ord3,[$ordk,#16] 1411 ldr $ordk,[$ordk,#32] 1412 b .Loop_ord_sqr 1413 1414.align 4 1415.Loop_ord_sqr: 1416 sub $bp,$bp,#1 1417 //////////////////////////////////////////////////////////////// 1418 // | | | | | |a1*a0| | 1419 // | | | | |a2*a0| | | 1420 // | |a3*a2|a3*a0| | | | 1421 // | | | |a2*a1| | | | 1422 // | | |a3*a1| | | | | 1423 // *| | | | | | | | 2| 1424 // +|a3*a3|a2*a2|a1*a1|a0*a0| 1425 // |--+--+--+--+--+--+--+--| 1426 // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx 1427 // 1428 // "can't overflow" below mark carrying into high part of 1429 // multiplication result, which can't overflow, because it 1430 // can never be all ones. 1431 1432 mul $acc1,$a1,$a0 // a[1]*a[0] 1433 umulh $t1,$a1,$a0 1434 mul $acc2,$a2,$a0 // a[2]*a[0] 1435 umulh $t2,$a2,$a0 1436 mul $acc3,$a3,$a0 // a[3]*a[0] 1437 umulh $acc4,$a3,$a0 1438 1439 adds $acc2,$acc2,$t1 // accumulate high parts of multiplication 1440 mul $t0,$a2,$a1 // a[2]*a[1] 1441 umulh $t1,$a2,$a1 1442 adcs $acc3,$acc3,$t2 1443 mul $t2,$a3,$a1 // a[3]*a[1] 1444 umulh $t3,$a3,$a1 1445 adc $acc4,$acc4,xzr // can't overflow 1446 1447 mul $acc5,$a3,$a2 // a[3]*a[2] 1448 umulh $acc6,$a3,$a2 1449 1450 adds $t1,$t1,$t2 // accumulate high parts of multiplication 1451 mul $acc0,$a0,$a0 // a[0]*a[0] 1452 adc $t2,$t3,xzr // can't overflow 1453 1454 adds $acc3,$acc3,$t0 // accumulate low parts of multiplication 1455 umulh $a0,$a0,$a0 1456 adcs $acc4,$acc4,$t1 1457 mul $t1,$a1,$a1 // a[1]*a[1] 1458 adcs $acc5,$acc5,$t2 1459 umulh $a1,$a1,$a1 1460 adc $acc6,$acc6,xzr // can't overflow 1461 1462 adds $acc1,$acc1,$acc1 // acc[1-6]*=2 1463 mul $t2,$a2,$a2 // a[2]*a[2] 1464 adcs $acc2,$acc2,$acc2 1465 umulh $a2,$a2,$a2 1466 adcs $acc3,$acc3,$acc3 1467 mul $t3,$a3,$a3 // a[3]*a[3] 1468 adcs $acc4,$acc4,$acc4 1469 umulh $a3,$a3,$a3 1470 adcs $acc5,$acc5,$acc5 1471 adcs $acc6,$acc6,$acc6 1472 adc $acc7,xzr,xzr 1473 1474 adds $acc1,$acc1,$a0 // +a[i]*a[i] 1475 mul $t4,$acc0,$ordk 1476 adcs $acc2,$acc2,$t1 1477 adcs $acc3,$acc3,$a1 1478 adcs $acc4,$acc4,$t2 1479 adcs $acc5,$acc5,$a2 1480 adcs $acc6,$acc6,$t3 1481 adc $acc7,$acc7,$a3 1482___ 1483for($i=0; $i<4; $i++) { # reductions 1484$code.=<<___; 1485 subs xzr,$acc0,#1 1486 umulh $t1,$ord0,$t4 1487 mul $t2,$ord1,$t4 1488 umulh $t3,$ord1,$t4 1489 1490 adcs $t2,$t2,$t1 1491 adc $t3,$t3,xzr 1492 1493 adds $acc0,$acc1,$t2 1494 adcs $acc1,$acc2,$t3 1495 adcs $acc2,$acc3,$t4 1496 adc $acc3,xzr,$t4 // can't overflow 1497___ 1498$code.=<<___ if ($i<3); 1499 mul $t3,$acc0,$ordk 1500___ 1501$code.=<<___; 1502 lsl $t0,$t4,#32 1503 subs $acc1,$acc1,$t4 1504 lsr $t1,$t4,#32 1505 sbcs $acc2,$acc2,$t0 1506 sbc $acc3,$acc3,$t1 // can't borrow 1507___ 1508 ($t3,$t4) = ($t4,$t3); 1509} 1510$code.=<<___; 1511 adds $acc0,$acc0,$acc4 // accumulate upper half 1512 adcs $acc1,$acc1,$acc5 1513 adcs $acc2,$acc2,$acc6 1514 adcs $acc3,$acc3,$acc7 1515 adc $acc4,xzr,xzr 1516 1517 subs $t0,$acc0,$ord0 // ret -= modulus 1518 sbcs $t1,$acc1,$ord1 1519 sbcs $t2,$acc2,$ord2 1520 sbcs $t3,$acc3,$ord3 1521 sbcs xzr,$acc4,xzr 1522 1523 csel $a0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus 1524 csel $a1,$acc1,$t1,lo 1525 csel $a2,$acc2,$t2,lo 1526 csel $a3,$acc3,$t3,lo 1527 1528 cbnz $bp,.Loop_ord_sqr 1529 1530 stp $a0,$a1,[$rp] 1531 stp $a2,$a3,[$rp,#16] 1532 1533 ldp x19,x20,[sp,#16] 1534 ldp x21,x22,[sp,#32] 1535 ldp x23,x24,[sp,#48] 1536 ldr x29,[sp],#64 1537 ret 1538.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont 1539___ 1540} } 1541 1542######################################################################## 1543# select subroutines 1544# These select functions are similar to those in p256-x86_64-asm.pl 1545# They load all points in the lookup table 1546# keeping in the output only the one corresponding to the input index. 1547{ 1548my ($val,$in_t)=map("x$_",(0..1)); 1549my ($index)=("w2"); 1550my ($Idx_ctr,$Val_in, $Mask_64)=("w9", "x10", "x11"); 1551my ($Mask)=("v3"); 1552my ($Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("v$_",(16..21)); 1553my ($T0a,$T0b,$T0c,$T0d,$T0e,$T0f)=map("v$_",(22..27)); 1554$code.=<<___; 1555//////////////////////////////////////////////////////////////////////// 1556// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); 1557.globl ecp_nistz256_select_w5 1558.type ecp_nistz256_select_w5,%function 1559.align 4 1560ecp_nistz256_select_w5: 1561 AARCH64_VALID_CALL_TARGET 1562 1563 // $Val_in := $val 1564 // $Idx_ctr := 0; loop counter and incremented internal index 1565 mov $Val_in, $val 1566 mov $Idx_ctr, #0 1567 1568 // [$Ra-$Rf] := 0 1569 movi $Ra.16b, #0 1570 movi $Rb.16b, #0 1571 movi $Rc.16b, #0 1572 movi $Rd.16b, #0 1573 movi $Re.16b, #0 1574 movi $Rf.16b, #0 1575 1576.Lselect_w5_loop: 1577 // Loop 16 times. 1578 1579 // Increment index (loop counter); tested at the end of the loop 1580 add $Idx_ctr, $Idx_ctr, #1 1581 1582 // [$T0a-$T0f] := Load a (3*256-bit = 6*128-bit) table entry starting at $in_t 1583 // and advance $in_t to point to the next entry 1584 ld1 {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64 1585 1586 // $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s 1587 cmp $Idx_ctr, $index 1588 csetm $Mask_64, eq 1589 1590 // continue loading ... 1591 ld1 {$T0e.2d, $T0f.2d}, [$in_t],#32 1592 1593 // duplicate mask_64 into Mask (all 0s or all 1s) 1594 dup $Mask.2d, $Mask_64 1595 1596 // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd] 1597 // i.e., values in output registers will remain the same if $Idx_ctr != $index 1598 bit $Ra.16b, $T0a.16b, $Mask.16b 1599 bit $Rb.16b, $T0b.16b, $Mask.16b 1600 1601 bit $Rc.16b, $T0c.16b, $Mask.16b 1602 bit $Rd.16b, $T0d.16b, $Mask.16b 1603 1604 bit $Re.16b, $T0e.16b, $Mask.16b 1605 bit $Rf.16b, $T0f.16b, $Mask.16b 1606 1607 // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back 1608 tbz $Idx_ctr, #4, .Lselect_w5_loop 1609 1610 // Write [$Ra-$Rf] to memory at the output pointer 1611 st1 {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$Val_in],#64 1612 st1 {$Re.2d, $Rf.2d}, [$Val_in] 1613 1614 ret 1615.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 1616 1617 1618//////////////////////////////////////////////////////////////////////// 1619// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); 1620.globl ecp_nistz256_select_w7 1621.type ecp_nistz256_select_w7,%function 1622.align 4 1623ecp_nistz256_select_w7: 1624 AARCH64_VALID_CALL_TARGET 1625 1626 // $Idx_ctr := 0; loop counter and incremented internal index 1627 mov $Idx_ctr, #0 1628 1629 // [$Ra-$Rf] := 0 1630 movi $Ra.16b, #0 1631 movi $Rb.16b, #0 1632 movi $Rc.16b, #0 1633 movi $Rd.16b, #0 1634 1635.Lselect_w7_loop: 1636 // Loop 64 times. 1637 1638 // Increment index (loop counter); tested at the end of the loop 1639 add $Idx_ctr, $Idx_ctr, #1 1640 1641 // [$T0a-$T0d] := Load a (2*256-bit = 4*128-bit) table entry starting at $in_t 1642 // and advance $in_t to point to the next entry 1643 ld1 {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64 1644 1645 // $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s 1646 cmp $Idx_ctr, $index 1647 csetm $Mask_64, eq 1648 1649 // duplicate mask_64 into Mask (all 0s or all 1s) 1650 dup $Mask.2d, $Mask_64 1651 1652 // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd] 1653 // i.e., values in output registers will remain the same if $Idx_ctr != $index 1654 bit $Ra.16b, $T0a.16b, $Mask.16b 1655 bit $Rb.16b, $T0b.16b, $Mask.16b 1656 1657 bit $Rc.16b, $T0c.16b, $Mask.16b 1658 bit $Rd.16b, $T0d.16b, $Mask.16b 1659 1660 // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back 1661 tbz $Idx_ctr, #6, .Lselect_w7_loop 1662 1663 // Write [$Ra-$Rd] to memory at the output pointer 1664 st1 {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$val] 1665 1666 ret 1667.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 1668___ 1669} 1670 1671foreach (split("\n",$code)) { 1672 s/\`([^\`]*)\`/eval $1/ge; 1673 1674 print $_,"\n"; 1675} 1676close STDOUT or die "error closing STDOUT: $!"; # enforce flush 1677