xref: /aosp_15_r20/external/boringssl/src/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
1#! /usr/bin/env perl
2# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro\@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# ECP_NISTZ256 module for ARMv8.
18#
19# February 2015.
20#
21# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22# http://eprint.iacr.org/2013/816.
23#
24#			with/without -DECP_NISTZ256_ASM
25# Apple A7		+190-360%
26# Cortex-A53		+190-400%
27# Cortex-A57		+190-350%
28# Denver		+230-400%
29#
30# Ranges denote minimum and maximum improvement coefficients depending
31# on benchmark. Lower coefficients are for ECDSA sign, server-side
32# operation. Keep in mind that +400% means 5x improvement.
33
34# The first two arguments should always be the flavour and output file path.
35if ($#ARGV < 1) { die "Not enough arguments provided.
36  Two arguments are necessary: the flavour and the output file path."; }
37
38$flavour = shift;
39$output = shift;
40
# Derive this script's own directory so the perlasm translator can be found
# either alongside the script or in the shared perlasm directory.
41$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
43( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
44die "can't locate arm-xlate.pl";
45
# Everything printed to STDOUT from here on is piped through arm-xlate.pl,
# which adapts the generated code to the requested flavour/output file.
46open OUT,"| \"$^X\" $xlate $flavour $output";
47*STDOUT=*OUT;
48
49{
# Register map: result/operand pointers (rp/ap/bp), the current b-word (bi),
# operand limbs a0-a3, scratch t0-t3, the two non-trivial modulus limbs
# (poly1, poly3) and accumulators acc0-acc5 occupy x0-x17 plus x19,x20.
# x18 is deliberately skipped — presumably reserved as a platform register;
# TODO confirm against the arm-xlate conventions.
50my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
51    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
52    map("x$_",(0..17,19,20));
53
54my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont
55
56$code.=<<___;
57#include "openssl/arm_arch.h"
58
59.section .rodata
60.align	5
// .Lpoly: the NIST P-256 prime, least-significant 64-bit limb first.
61.Lpoly:
62.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
63.LRR:	// 2^512 mod P precomputed for NIST P256 polynomial
64.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
// .Lone_mont: 1 in Montgomery form (2^256 mod P); .Lone: plain 1.
65.Lone_mont:
66.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
67.Lone:
68.quad	1,0,0,0
// .Lord: the P-256 group order; .LordK: its per-limb Montgomery constant
// (presumably -ord^-1 mod 2^64). Neither is referenced in this chunk.
69.Lord:
70.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
71.LordK:
72.quad	0xccd1c8aaee00bc4f
73.asciz	"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
74.text
75
76// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
77//					     const BN_ULONG x2[4]);
78.globl	ecp_nistz256_mul_mont
79.type	ecp_nistz256_mul_mont,%function
80.align	4
// ABI wrapper for Montgomery multiplication: pre-loads a[0..3] into a0-a3,
// b[0] into bi and the modulus limbs into poly1/poly3, then calls the local
// core routine (which stores the result to [rp] itself).
81ecp_nistz256_mul_mont:
82	AARCH64_SIGN_LINK_REGISTER
83	stp	x29,x30,[sp,#-32]!
84	add	x29,sp,#0
85	stp	x19,x20,[sp,#16]
86
87	ldr	$bi,[$bp]		// bp[0]
88	ldp	$a0,$a1,[$ap]
89	ldp	$a2,$a3,[$ap,#16]
90	adrp	$poly3,:pg_hi21:.Lpoly
91	add	$poly3,$poly3,:lo12:.Lpoly
92	ldr	$poly1,[$poly3,#8]
93	ldr	$poly3,[$poly3,#24]
94
95	bl	__ecp_nistz256_mul_mont
96
97	ldp	x19,x20,[sp,#16]
98	ldp	x29,x30,[sp],#32
99	AARCH64_VALIDATE_LINK_REGISTER
100	ret
101.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
102
103// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
104.globl	ecp_nistz256_sqr_mont
105.type	ecp_nistz256_sqr_mont,%function
106.align	4
// ABI wrapper for Montgomery squaring. x19/x20 are saved because the core
// uses them as accumulators (see the register map at the top of the file).
107ecp_nistz256_sqr_mont:
108	AARCH64_SIGN_LINK_REGISTER
109	stp	x29,x30,[sp,#-32]!
110	add	x29,sp,#0
111	stp	x19,x20,[sp,#16]
112
113	ldp	$a0,$a1,[$ap]
114	ldp	$a2,$a3,[$ap,#16]
115	adrp	$poly3,:pg_hi21:.Lpoly
116	add	$poly3,$poly3,:lo12:.Lpoly
117	ldr	$poly1,[$poly3,#8]
118	ldr	$poly3,[$poly3,#24]
119
120	bl	__ecp_nistz256_sqr_mont
121
122	ldp	x19,x20,[sp,#16]
123	ldp	x29,x30,[sp],#32
124	AARCH64_VALIDATE_LINK_REGISTER
125	ret
126.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
127
128// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
129.globl	ecp_nistz256_div_by_2
130.type	ecp_nistz256_div_by_2,%function
131.align	4
// Halving mod P: loads the operand into acc0-acc3 (where the core expects
// it) plus the modulus limbs, then delegates to __ecp_nistz256_div_by_2.
132ecp_nistz256_div_by_2:
133	AARCH64_SIGN_LINK_REGISTER
134	stp	x29,x30,[sp,#-16]!
135	add	x29,sp,#0
136
137	ldp	$acc0,$acc1,[$ap]
138	ldp	$acc2,$acc3,[$ap,#16]
139	adrp	$poly3,:pg_hi21:.Lpoly
140	add	$poly3,$poly3,:lo12:.Lpoly
141	ldr	$poly1,[$poly3,#8]
142	ldr	$poly3,[$poly3,#24]
143
144	bl	__ecp_nistz256_div_by_2
145
146	ldp	x29,x30,[sp],#16
147	AARCH64_VALIDATE_LINK_REGISTER
148	ret
149.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
150
151// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
152.globl	ecp_nistz256_mul_by_2
153.type	ecp_nistz256_mul_by_2,%function
154.align	4
// Doubling mod P, computed as a+a: the operand is loaded into both summand
// register sets (acc0-acc3 and t0-t3) that __ecp_nistz256_add_to expects.
155ecp_nistz256_mul_by_2:
156	AARCH64_SIGN_LINK_REGISTER
157	stp	x29,x30,[sp,#-16]!
158	add	x29,sp,#0
159
160	ldp	$acc0,$acc1,[$ap]
161	ldp	$acc2,$acc3,[$ap,#16]
162	adrp	$poly3,:pg_hi21:.Lpoly
163	add	$poly3,$poly3,:lo12:.Lpoly
164	ldr	$poly1,[$poly3,#8]
165	ldr	$poly3,[$poly3,#24]
166	mov	$t0,$acc0
167	mov	$t1,$acc1
168	mov	$t2,$acc2
169	mov	$t3,$acc3
170
171	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
172
173	ldp	x29,x30,[sp],#16
174	AARCH64_VALIDATE_LINK_REGISTER
175	ret
176.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
177
178// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
179.globl	ecp_nistz256_mul_by_3
180.type	ecp_nistz256_mul_by_3,%function
181.align	4
// Tripling mod P via two modular additions (a+a, then +a). The original
// operand is parked in a0-a3 across the first call because add_to leaves
// its result in acc0-acc3 and consumes t0-t3.
182ecp_nistz256_mul_by_3:
183	AARCH64_SIGN_LINK_REGISTER
184	stp	x29,x30,[sp,#-16]!
185	add	x29,sp,#0
186
187	ldp	$acc0,$acc1,[$ap]
188	ldp	$acc2,$acc3,[$ap,#16]
189	adrp	$poly3,:pg_hi21:.Lpoly
190	add	$poly3,$poly3,:lo12:.Lpoly
191	ldr	$poly1,[$poly3,#8]
192	ldr	$poly3,[$poly3,#24]
193	mov	$t0,$acc0
194	mov	$t1,$acc1
195	mov	$t2,$acc2
196	mov	$t3,$acc3
197	mov	$a0,$acc0
198	mov	$a1,$acc1
199	mov	$a2,$acc2
200	mov	$a3,$acc3
201
202	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
203
204	mov	$t0,$a0
205	mov	$t1,$a1
206	mov	$t2,$a2
207	mov	$t3,$a3
208
209	bl	__ecp_nistz256_add_to	// ret += a	// 2*a+a=3*a
210
211	ldp	x29,x30,[sp],#16
212	AARCH64_VALIDATE_LINK_REGISTER
213	ret
214.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
215
216// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
217//				        const BN_ULONG x2[4]);
218.globl	ecp_nistz256_sub
219.type	ecp_nistz256_sub,%function
220.align	4
// Modular subtraction a-b mod P: minuend is pre-loaded into acc0-acc3;
// the subtrahend is read from [bp] inside __ecp_nistz256_sub_from.
221ecp_nistz256_sub:
222	AARCH64_SIGN_LINK_REGISTER
223	stp	x29,x30,[sp,#-16]!
224	add	x29,sp,#0
225
226	ldp	$acc0,$acc1,[$ap]
227	ldp	$acc2,$acc3,[$ap,#16]
228	adrp	$poly3,:pg_hi21:.Lpoly
229	add	$poly3,$poly3,:lo12:.Lpoly
230	ldr	$poly1,[$poly3,#8]
231	ldr	$poly3,[$poly3,#24]
232
233	bl	__ecp_nistz256_sub_from
234
235	ldp	x29,x30,[sp],#16
236	AARCH64_VALIDATE_LINK_REGISTER
237	ret
238.size	ecp_nistz256_sub,.-ecp_nistz256_sub
239
240// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
241.globl	ecp_nistz256_neg
242.type	ecp_nistz256_neg,%function
243.align	4
// Modular negation, computed as 0 - a mod P: zero the minuend registers and
// point bp at the input so __ecp_nistz256_sub_from subtracts it.
244ecp_nistz256_neg:
245	AARCH64_SIGN_LINK_REGISTER
246	stp	x29,x30,[sp,#-16]!
247	add	x29,sp,#0
248
249	mov	$bp,$ap
250	mov	$acc0,xzr		// a = 0
251	mov	$acc1,xzr
252	mov	$acc2,xzr
253	mov	$acc3,xzr
254	adrp	$poly3,:pg_hi21:.Lpoly
255	add	$poly3,$poly3,:lo12:.Lpoly
256	ldr	$poly1,[$poly3,#8]
257	ldr	$poly3,[$poly3,#24]
258
259	bl	__ecp_nistz256_sub_from
260
261	ldp	x29,x30,[sp],#16
262	AARCH64_VALIDATE_LINK_REGISTER
263	ret
264.size	ecp_nistz256_neg,.-ecp_nistz256_neg
265
266// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
267// to $a0-$a3 and b[0] - to $bi
// Schoolbook multiplication interleaved with per-word Montgomery reduction;
// the final conditional subtraction of the modulus is done branch-free with
// csel, keeping the routine constant-time.
268.type	__ecp_nistz256_mul_mont,%function
269.align	4
270__ecp_nistz256_mul_mont:
271	mul	$acc0,$a0,$bi		// a[0]*b[0]
272	umulh	$t0,$a0,$bi
273
274	mul	$acc1,$a1,$bi		// a[1]*b[0]
275	umulh	$t1,$a1,$bi
276
277	mul	$acc2,$a2,$bi		// a[2]*b[0]
278	umulh	$t2,$a2,$bi
279
280	mul	$acc3,$a3,$bi		// a[3]*b[0]
281	umulh	$t3,$a3,$bi
282	ldr	$bi,[$bp,#8]		// b[1]
283
284	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
285	 lsl	$t0,$acc0,#32
286	adcs	$acc2,$acc2,$t1
287	 lsr	$t1,$acc0,#32
288	adcs	$acc3,$acc3,$t2
289	adc	$acc4,xzr,$t3
290	mov	$acc5,xzr
291___
# Unrolled at Perl-generation time: iterations i=1..3 each fold in one more
# b-word and perform one reduction step.
292for($i=1;$i<4;$i++) {
293        # Reduction iteration is normally performed by accumulating
294        # result of multiplication of modulus by "magic" digit [and
295        # omitting least significant word, which is guaranteed to
296        # be 0], but thanks to special form of modulus and "magic"
297        # digit being equal to least significant word, it can be
298        # performed with additions and subtractions alone. Indeed:
299        #
300        #            ffff0001.00000000.0000ffff.ffffffff
301        # *                                     abcdefgh
302        # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
303        #
304        # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
305        # rewrite above as:
306        #
307        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
308        # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
309        # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
310        #
311        # or marking redundant operations:
312        #
313        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
314        # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
315        # - 0000abcd.efgh0000.--------.--------.--------
316
317$code.=<<___;
318	subs	$t2,$acc0,$t0		// "*0xffff0001"
319	sbc	$t3,$acc0,$t1
320	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
321	 mul	$t0,$a0,$bi		// lo(a[0]*b[i])
322	adcs	$acc1,$acc2,$t1
323	 mul	$t1,$a1,$bi		// lo(a[1]*b[i])
324	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
325	 mul	$t2,$a2,$bi		// lo(a[2]*b[i])
326	adcs	$acc3,$acc4,$t3
327	 mul	$t3,$a3,$bi		// lo(a[3]*b[i])
328	adc	$acc4,$acc5,xzr
329
330	adds	$acc0,$acc0,$t0		// accumulate low parts of multiplication
331	 umulh	$t0,$a0,$bi		// hi(a[0]*b[i])
332	adcs	$acc1,$acc1,$t1
333	 umulh	$t1,$a1,$bi		// hi(a[1]*b[i])
334	adcs	$acc2,$acc2,$t2
335	 umulh	$t2,$a2,$bi		// hi(a[2]*b[i])
336	adcs	$acc3,$acc3,$t3
337	 umulh	$t3,$a3,$bi		// hi(a[3]*b[i])
338	adc	$acc4,$acc4,xzr
339___
340$code.=<<___	if ($i<3);
341	ldr	$bi,[$bp,#8*($i+1)]	// b[$i+1]
342___
343$code.=<<___;
344	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
345	 lsl	$t0,$acc0,#32
346	adcs	$acc2,$acc2,$t1
347	 lsr	$t1,$acc0,#32
348	adcs	$acc3,$acc3,$t2
349	adcs	$acc4,$acc4,$t3
350	adc	$acc5,xzr,xzr
351___
352}
353$code.=<<___;
	// constant-time final reduction: unconditionally compute ret-modulus,
	// then select the unreduced value iff the subtraction borrowed
354	// last reduction
355	subs	$t2,$acc0,$t0		// "*0xffff0001"
356	sbc	$t3,$acc0,$t1
357	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
358	adcs	$acc1,$acc2,$t1
359	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
360	adcs	$acc3,$acc4,$t3
361	adc	$acc4,$acc5,xzr
362
363	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
364	sbcs	$t1,$acc1,$poly1
365	sbcs	$t2,$acc2,xzr
366	sbcs	$t3,$acc3,$poly3
367	sbcs	xzr,$acc4,xzr		// did it borrow?
368
369	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
370	csel	$acc1,$acc1,$t1,lo
371	csel	$acc2,$acc2,$t2,lo
372	stp	$acc0,$acc1,[$rp]
373	csel	$acc3,$acc3,$t3,lo
374	stp	$acc2,$acc3,[$rp,#16]
375
376	ret
377.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
378
379// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
380// to $a0-$a3
// Montgomery squaring: computes the off-diagonal products once, doubles
// them, adds the squares, then runs four reduction steps and folds in the
// upper half. acc6/acc7 alias the ap/bp registers (see register map), so
// the pointer arguments are clobbered here.
381.type	__ecp_nistz256_sqr_mont,%function
382.align	4
383__ecp_nistz256_sqr_mont:
384	//  |  |  |  |  |  |a1*a0|  |
385	//  |  |  |  |  |a2*a0|  |  |
386	//  |  |a3*a2|a3*a0|  |  |  |
387	//  |  |  |  |a2*a1|  |  |  |
388	//  |  |  |a3*a1|  |  |  |  |
389	// *|  |  |  |  |  |  |  | 2|
390	// +|a3*a3|a2*a2|a1*a1|a0*a0|
391	//  |--+--+--+--+--+--+--+--|
392	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
393	//
394	//  "can't overflow" below mark carrying into high part of
395	//  multiplication result, which can't overflow, because it
396	//  can never be all ones.
397
398	mul	$acc1,$a1,$a0		// a[1]*a[0]
399	umulh	$t1,$a1,$a0
400	mul	$acc2,$a2,$a0		// a[2]*a[0]
401	umulh	$t2,$a2,$a0
402	mul	$acc3,$a3,$a0		// a[3]*a[0]
403	umulh	$acc4,$a3,$a0
404
405	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
406	 mul	$t0,$a2,$a1		// a[2]*a[1]
407	 umulh	$t1,$a2,$a1
408	adcs	$acc3,$acc3,$t2
409	 mul	$t2,$a3,$a1		// a[3]*a[1]
410	 umulh	$t3,$a3,$a1
411	adc	$acc4,$acc4,xzr		// can't overflow
412
413	mul	$acc5,$a3,$a2		// a[3]*a[2]
414	umulh	$acc6,$a3,$a2
415
416	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
417	 mul	$acc0,$a0,$a0		// a[0]*a[0]
418	adc	$t2,$t3,xzr		// can't overflow
419
420	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
421	 umulh	$a0,$a0,$a0
422	adcs	$acc4,$acc4,$t1
423	 mul	$t1,$a1,$a1		// a[1]*a[1]
424	adcs	$acc5,$acc5,$t2
425	 umulh	$a1,$a1,$a1
426	adc	$acc6,$acc6,xzr		// can't overflow
427
428	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
429	 mul	$t2,$a2,$a2		// a[2]*a[2]
430	adcs	$acc2,$acc2,$acc2
431	 umulh	$a2,$a2,$a2
432	adcs	$acc3,$acc3,$acc3
433	 mul	$t3,$a3,$a3		// a[3]*a[3]
434	adcs	$acc4,$acc4,$acc4
435	 umulh	$a3,$a3,$a3
436	adcs	$acc5,$acc5,$acc5
437	adcs	$acc6,$acc6,$acc6
438	adc	$acc7,xzr,xzr
439
440	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
441	adcs	$acc2,$acc2,$t1
442	adcs	$acc3,$acc3,$a1
443	adcs	$acc4,$acc4,$t2
444	adcs	$acc5,$acc5,$a2
445	 lsl	$t0,$acc0,#32
446	adcs	$acc6,$acc6,$t3
447	 lsr	$t1,$acc0,#32
448	adc	$acc7,$acc7,$a3
449___
450for($i=0;$i<3;$i++) {			# reductions, see commentary in
451					# multiplication for details
452$code.=<<___;
453	subs	$t2,$acc0,$t0		// "*0xffff0001"
454	sbc	$t3,$acc0,$t1
455	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
456	adcs	$acc1,$acc2,$t1
457	 lsl	$t0,$acc0,#32
458	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
459	 lsr	$t1,$acc0,#32
460	adc	$acc3,$t3,xzr		// can't overflow
461___
462}
463$code.=<<___;
464	subs	$t2,$acc0,$t0		// "*0xffff0001"
465	sbc	$t3,$acc0,$t1
466	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
467	adcs	$acc1,$acc2,$t1
468	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
469	adc	$acc3,$t3,xzr		// can't overflow
470
471	adds	$acc0,$acc0,$acc4	// accumulate upper half
472	adcs	$acc1,$acc1,$acc5
473	adcs	$acc2,$acc2,$acc6
474	adcs	$acc3,$acc3,$acc7
475	adc	$acc4,xzr,xzr
476
	// constant-time final reduction, same pattern as in mul_mont
477	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
478	sbcs	$t1,$acc1,$poly1
479	sbcs	$t2,$acc2,xzr
480	sbcs	$t3,$acc3,$poly3
481	sbcs	xzr,$acc4,xzr		// did it borrow?
482
483	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
484	csel	$acc1,$acc1,$t1,lo
485	csel	$acc2,$acc2,$t2,lo
486	stp	$acc0,$acc1,[$rp]
487	csel	$acc3,$acc3,$t3,lo
488	stp	$acc2,$acc3,[$rp,#16]
489
490	ret
491.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
492
// Modular addition: ret = acc + t mod P, stored to [rp]. If the raw sum
// (with its carry in ap) is >= P, the modulus is subtracted — selected
// branch-free via csel, so the routine is constant-time. Clobbers ap.
493// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
494// $acc0-$acc3 and $t0-$t3. This is done because it's used in multiple
495// contexts, e.g. in multiplication by 2 and 3...
496.type	__ecp_nistz256_add_to,%function
497.align	4
498__ecp_nistz256_add_to:
499	adds	$acc0,$acc0,$t0		// ret = a+b
500	adcs	$acc1,$acc1,$t1
501	adcs	$acc2,$acc2,$t2
502	adcs	$acc3,$acc3,$t3
503	adc	$ap,xzr,xzr		// zap $ap
504
505	adds	$t0,$acc0,#1		// subs	$t0,$a0,#-1 // tmp = ret-modulus
506	sbcs	$t1,$acc1,$poly1
507	sbcs	$t2,$acc2,xzr
508	sbcs	$t3,$acc3,$poly3
509	sbcs	xzr,$ap,xzr		// did subtraction borrow?
510
511	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
512	csel	$acc1,$acc1,$t1,lo
513	csel	$acc2,$acc2,$t2,lo
514	stp	$acc0,$acc1,[$rp]
515	csel	$acc3,$acc3,$t3,lo
516	stp	$acc2,$acc3,[$rp,#16]
517
518	ret
519.size	__ecp_nistz256_add_to,.-__ecp_nistz256_add_to
520
// Modular subtraction: ret = acc - b mod P, with the minuend pre-loaded in
// acc0-acc3 and the subtrahend read from [bp]. If the raw subtraction
// borrowed (flag captured in ap), the modulus is added back; the choice is
// made branch-free with csel. Result stored to [rp]; clobbers ap.
521.type	__ecp_nistz256_sub_from,%function
522.align	4
523__ecp_nistz256_sub_from:
524	ldp	$t0,$t1,[$bp]
525	ldp	$t2,$t3,[$bp,#16]
526	subs	$acc0,$acc0,$t0		// ret = a-b
527	sbcs	$acc1,$acc1,$t1
528	sbcs	$acc2,$acc2,$t2
529	sbcs	$acc3,$acc3,$t3
530	sbc	$ap,xzr,xzr		// zap $ap
531
532	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
533	adcs	$t1,$acc1,$poly1
534	adcs	$t2,$acc2,xzr
535	adc	$t3,$acc3,$poly3
536	cmp	$ap,xzr			// did subtraction borrow?
537
538	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
539	csel	$acc1,$acc1,$t1,eq
540	csel	$acc2,$acc2,$t2,eq
541	stp	$acc0,$acc1,[$rp]
542	csel	$acc3,$acc3,$t3,eq
543	stp	$acc2,$acc3,[$rp,#16]
544
545	ret
546.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
547
// Same as __ecp_nistz256_sub_from but with the operands swapped
// ("morf" = "from" reversed): ret = b - acc mod P, b loaded from [bp].
548.type	__ecp_nistz256_sub_morf,%function
549.align	4
550__ecp_nistz256_sub_morf:
551	ldp	$t0,$t1,[$bp]
552	ldp	$t2,$t3,[$bp,#16]
553	subs	$acc0,$t0,$acc0		// ret = b-a
554	sbcs	$acc1,$t1,$acc1
555	sbcs	$acc2,$t2,$acc2
556	sbcs	$acc3,$t3,$acc3
557	sbc	$ap,xzr,xzr		// zap $ap
558
559	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
560	adcs	$t1,$acc1,$poly1
561	adcs	$t2,$acc2,xzr
562	adc	$t3,$acc3,$poly3
563	cmp	$ap,xzr			// did subtraction borrow?
564
565	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
566	csel	$acc1,$acc1,$t1,eq
567	csel	$acc2,$acc2,$t2,eq
568	stp	$acc0,$acc1,[$rp]
569	csel	$acc3,$acc3,$t3,eq
570	stp	$acc2,$acc3,[$rp,#16]
571
572	ret
573.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
574
// Halving mod P: if the operand is odd, the modulus is added first (making
// the 257-bit value in acc0-acc3/ap even), then the whole value is shifted
// right by one. Selection is branch-free via csel. Clobbers ap.
575.type	__ecp_nistz256_div_by_2,%function
576.align	4
577__ecp_nistz256_div_by_2:
578	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = a+modulus
579	adcs	$t1,$acc1,$poly1
580	adcs	$t2,$acc2,xzr
581	adcs	$t3,$acc3,$poly3
582	adc	$ap,xzr,xzr		// zap $ap
583	tst	$acc0,#1		// is a even?
584
585	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
586	csel	$acc1,$acc1,$t1,eq
587	csel	$acc2,$acc2,$t2,eq
588	csel	$acc3,$acc3,$t3,eq
589	csel	$ap,xzr,$ap,eq
590
591	lsr	$acc0,$acc0,#1		// ret >>= 1
592	orr	$acc0,$acc0,$acc1,lsl#63
593	lsr	$acc1,$acc1,#1
594	orr	$acc1,$acc1,$acc2,lsl#63
595	lsr	$acc2,$acc2,#1
596	orr	$acc2,$acc2,$acc3,lsl#63
597	lsr	$acc3,$acc3,#1
598	stp	$acc0,$acc1,[$rp]
599	orr	$acc3,$acc3,$ap,lsl#63
600	stp	$acc2,$acc3,[$rp,#16]
601
602	ret
603.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
604___
605########################################################################
606# following subroutines are "literal" implementation of those found in
607# ecp_nistz256.c
608#
609########################################################################
610# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
611#
612{
# S, M, Zsqr, tmp0 are byte offsets of four 256-bit temporaries in the
# 32*4-byte stack scratch area allocated below.
613my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
614# above map() describes stack layout with 4 temporary
615# 256-bit vectors on top.
616my ($rp_real,$ap_real) = map("x$_",(21,22));
617
618$code.=<<___;
619.globl	ecp_nistz256_point_double
620.type	ecp_nistz256_point_double,%function
621.align	5
// Jacobian point doubling. .Ldouble_shortcut is also branched to from
// ecp_nistz256_point_add when it detects P == Q (see .Ladd_double).
// Interleaved "forward load" instructions prefetch operands for the next
// field operation while the current register values are still being moved.
622ecp_nistz256_point_double:
623	AARCH64_SIGN_LINK_REGISTER
624	stp	x29,x30,[sp,#-96]!
625	add	x29,sp,#0
626	stp	x19,x20,[sp,#16]
627	stp	x21,x22,[sp,#32]
628	sub	sp,sp,#32*4
629
630.Ldouble_shortcut:
631	ldp	$acc0,$acc1,[$ap,#32]
632	 mov	$rp_real,$rp
633	ldp	$acc2,$acc3,[$ap,#48]
634	 mov	$ap_real,$ap
635	 adrp	$poly3,:pg_hi21:.Lpoly
636	 add	$poly3,$poly3,:lo12:.Lpoly
637	 ldr	$poly1,[$poly3,#8]
638	mov	$t0,$acc0
639	 ldr	$poly3,[$poly3,#24]
640	mov	$t1,$acc1
641	 ldp	$a0,$a1,[$ap_real,#64]	// forward load for p256_sqr_mont
642	mov	$t2,$acc2
643	mov	$t3,$acc3
644	 ldp	$a2,$a3,[$ap_real,#64+16]
645	add	$rp,sp,#$S
646	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);
647
648	add	$rp,sp,#$Zsqr
649	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);
650
651	ldp	$t0,$t1,[$ap_real]
652	ldp	$t2,$t3,[$ap_real,#16]
653	mov	$a0,$acc0		// put Zsqr aside for p256_sub
654	mov	$a1,$acc1
655	mov	$a2,$acc2
656	mov	$a3,$acc3
657	add	$rp,sp,#$M
658	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);
659
660	add	$bp,$ap_real,#0
661	mov	$acc0,$a0		// restore Zsqr
662	mov	$acc1,$a1
663	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
664	mov	$acc2,$a2
665	mov	$acc3,$a3
666	 ldp	$a2,$a3,[sp,#$S+16]
667	add	$rp,sp,#$Zsqr
668	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);
669
670	add	$rp,sp,#$S
671	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);
672
673	ldr	$bi,[$ap_real,#32]
674	ldp	$a0,$a1,[$ap_real,#64]
675	ldp	$a2,$a3,[$ap_real,#64+16]
676	add	$bp,$ap_real,#32
677	add	$rp,sp,#$tmp0
678	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);
679
680	mov	$t0,$acc0
681	mov	$t1,$acc1
682	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
683	mov	$t2,$acc2
684	mov	$t3,$acc3
685	 ldp	$a2,$a3,[sp,#$S+16]
686	add	$rp,$rp_real,#64
687	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);
688
689	add	$rp,sp,#$tmp0
690	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);
691
692	 ldr	$bi,[sp,#$Zsqr]		// forward load for p256_mul_mont
693	 ldp	$a0,$a1,[sp,#$M]
694	 ldp	$a2,$a3,[sp,#$M+16]
695	add	$rp,$rp_real,#32
696	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);
697
698	add	$bp,sp,#$Zsqr
699	add	$rp,sp,#$M
700	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);
701
702	mov	$t0,$acc0		// duplicate M
703	mov	$t1,$acc1
704	mov	$t2,$acc2
705	mov	$t3,$acc3
706	mov	$a0,$acc0		// put M aside
707	mov	$a1,$acc1
708	mov	$a2,$acc2
709	mov	$a3,$acc3
710	add	$rp,sp,#$M
711	bl	__ecp_nistz256_add_to
712	mov	$t0,$a0			// restore M
713	mov	$t1,$a1
714	 ldr	$bi,[$ap_real]		// forward load for p256_mul_mont
715	mov	$t2,$a2
716	 ldp	$a0,$a1,[sp,#$S]
717	mov	$t3,$a3
718	 ldp	$a2,$a3,[sp,#$S+16]
719	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);
720
721	add	$bp,$ap_real,#0
722	add	$rp,sp,#$S
723	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);
724
725	mov	$t0,$acc0
726	mov	$t1,$acc1
727	 ldp	$a0,$a1,[sp,#$M]	// forward load for p256_sqr_mont
728	mov	$t2,$acc2
729	mov	$t3,$acc3
730	 ldp	$a2,$a3,[sp,#$M+16]
731	add	$rp,sp,#$tmp0
732	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);
733
734	add	$rp,$rp_real,#0
735	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);
736
737	add	$bp,sp,#$tmp0
738	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);
739
740	add	$bp,sp,#$S
741	add	$rp,sp,#$S
742	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);
743
744	ldr	$bi,[sp,#$M]
745	mov	$a0,$acc0		// copy S
746	mov	$a1,$acc1
747	mov	$a2,$acc2
748	mov	$a3,$acc3
749	add	$bp,sp,#$M
750	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);
751
752	add	$bp,$rp_real,#32
753	add	$rp,$rp_real,#32
754	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);
755
756	add	sp,x29,#0		// destroy frame
757	ldp	x19,x20,[x29,#16]
758	ldp	x21,x22,[x29,#32]
759	ldp	x29,x30,[sp],#96
760	AARCH64_VALIDATE_LINK_REGISTER
761	ret
762.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
763___
764}
765
766########################################################################
767# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
768#			      const P256_POINT *in2);
769{
# Twelve 256-bit stack temporaries; Z1sqr/Z2sqr alias the Hsqr/Rsqr slots
# (they are only used before Hsqr/Rsqr are computed).
770my ($res_x,$res_y,$res_z,
771    $H,$Hsqr,$R,$Rsqr,$Hcub,
772    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
773my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
774# above map() describes stack layout with 12 temporary
775# 256-bit vectors on top.
776my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28));
777
778$code.=<<___;
779.globl	ecp_nistz256_point_add
780.type	ecp_nistz256_point_add,%function
781.align	5
// Jacobian point addition. Infinity of each input is detected from an
// all-zero Z coordinate (flags kept as all-ones/all-zero masks in
// in1infty/in2infty); if the inputs turn out to be the same point
// (U1==U2 and S1==S2, neither input at infinity) control transfers to
// the doubling code at .Ldouble_shortcut.
782ecp_nistz256_point_add:
783	AARCH64_SIGN_LINK_REGISTER
784	stp	x29,x30,[sp,#-96]!
785	add	x29,sp,#0
786	stp	x19,x20,[sp,#16]
787	stp	x21,x22,[sp,#32]
788	stp	x23,x24,[sp,#48]
789	stp	x25,x26,[sp,#64]
790	stp	x27,x28,[sp,#80]
791	sub	sp,sp,#32*12
792
793	ldp	$a0,$a1,[$bp,#64]	// in2_z
794	ldp	$a2,$a3,[$bp,#64+16]
795	 mov	$rp_real,$rp
796	 mov	$ap_real,$ap
797	 mov	$bp_real,$bp
798	 adrp	$poly3,:pg_hi21:.Lpoly
799	 add	$poly3,$poly3,:lo12:.Lpoly
800	 ldr	$poly1,[$poly3,#8]
801	 ldr	$poly3,[$poly3,#24]
802	orr	$t0,$a0,$a1
803	orr	$t2,$a2,$a3
804	orr	$in2infty,$t0,$t2
805	cmp	$in2infty,#0
806	csetm	$in2infty,ne		// ~in2infty
807	add	$rp,sp,#$Z2sqr
808	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);
809
810	ldp	$a0,$a1,[$ap_real,#64]	// in1_z
811	ldp	$a2,$a3,[$ap_real,#64+16]
812	orr	$t0,$a0,$a1
813	orr	$t2,$a2,$a3
814	orr	$in1infty,$t0,$t2
815	cmp	$in1infty,#0
816	csetm	$in1infty,ne		// ~in1infty
817	add	$rp,sp,#$Z1sqr
818	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
819
820	ldr	$bi,[$bp_real,#64]
821	ldp	$a0,$a1,[sp,#$Z2sqr]
822	ldp	$a2,$a3,[sp,#$Z2sqr+16]
823	add	$bp,$bp_real,#64
824	add	$rp,sp,#$S1
825	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);
826
827	ldr	$bi,[$ap_real,#64]
828	ldp	$a0,$a1,[sp,#$Z1sqr]
829	ldp	$a2,$a3,[sp,#$Z1sqr+16]
830	add	$bp,$ap_real,#64
831	add	$rp,sp,#$S2
832	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
833
834	ldr	$bi,[$ap_real,#32]
835	ldp	$a0,$a1,[sp,#$S1]
836	ldp	$a2,$a3,[sp,#$S1+16]
837	add	$bp,$ap_real,#32
838	add	$rp,sp,#$S1
839	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);
840
841	ldr	$bi,[$bp_real,#32]
842	ldp	$a0,$a1,[sp,#$S2]
843	ldp	$a2,$a3,[sp,#$S2+16]
844	add	$bp,$bp_real,#32
845	add	$rp,sp,#$S2
846	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
847
848	add	$bp,sp,#$S1
849	 ldr	$bi,[sp,#$Z2sqr]	// forward load for p256_mul_mont
850	 ldp	$a0,$a1,[$ap_real]
851	 ldp	$a2,$a3,[$ap_real,#16]
852	add	$rp,sp,#$R
853	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);
854
855	orr	$acc0,$acc0,$acc1	// see if result is zero
856	orr	$acc2,$acc2,$acc3
857	orr	$temp0,$acc0,$acc2	// ~is_equal(S1,S2)
858
859	add	$bp,sp,#$Z2sqr
860	add	$rp,sp,#$U1
861	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);
862
863	ldr	$bi,[sp,#$Z1sqr]
864	ldp	$a0,$a1,[$bp_real]
865	ldp	$a2,$a3,[$bp_real,#16]
866	add	$bp,sp,#$Z1sqr
867	add	$rp,sp,#$U2
868	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);
869
870	add	$bp,sp,#$U1
871	 ldp	$a0,$a1,[sp,#$R]	// forward load for p256_sqr_mont
872	 ldp	$a2,$a3,[sp,#$R+16]
873	add	$rp,sp,#$H
874	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);
875
876	orr	$acc0,$acc0,$acc1	// see if result is zero
877	orr	$acc2,$acc2,$acc3
878	orr	$acc0,$acc0,$acc2	// ~is_equal(U1,U2)
879
880	mvn	$temp1,$in1infty	// -1/0 -> 0/-1
881	mvn	$temp2,$in2infty	// -1/0 -> 0/-1
882	orr	$acc0,$acc0,$temp1
883	orr	$acc0,$acc0,$temp2
884	orr	$acc0,$acc0,$temp0
885	cbnz	$acc0,.Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
886
// Same finite point on both sides: unwind to the smaller point_double
// frame layout and reuse its code path.
887.Ladd_double:
888	mov	$ap,$ap_real
889	mov	$rp,$rp_real
890	ldp	x23,x24,[x29,#48]
891	ldp	x25,x26,[x29,#64]
892	ldp	x27,x28,[x29,#80]
893	add	sp,sp,#256	// #256 is from #32*(12-4). difference in stack frames
894	b	.Ldouble_shortcut
895
896.align	4
897.Ladd_proceed:
898	add	$rp,sp,#$Rsqr
899	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
900
901	ldr	$bi,[$ap_real,#64]
902	ldp	$a0,$a1,[sp,#$H]
903	ldp	$a2,$a3,[sp,#$H+16]
904	add	$bp,$ap_real,#64
905	add	$rp,sp,#$res_z
906	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
907
908	ldp	$a0,$a1,[sp,#$H]
909	ldp	$a2,$a3,[sp,#$H+16]
910	add	$rp,sp,#$Hsqr
911	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
912
913	ldr	$bi,[$bp_real,#64]
914	ldp	$a0,$a1,[sp,#$res_z]
915	ldp	$a2,$a3,[sp,#$res_z+16]
916	add	$bp,$bp_real,#64
917	add	$rp,sp,#$res_z
918	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);
919
920	ldr	$bi,[sp,#$H]
921	ldp	$a0,$a1,[sp,#$Hsqr]
922	ldp	$a2,$a3,[sp,#$Hsqr+16]
923	add	$bp,sp,#$H
924	add	$rp,sp,#$Hcub
925	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
926
927	ldr	$bi,[sp,#$Hsqr]
928	ldp	$a0,$a1,[sp,#$U1]
929	ldp	$a2,$a3,[sp,#$U1+16]
930	add	$bp,sp,#$Hsqr
931	add	$rp,sp,#$U2
932	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);
933
934	mov	$t0,$acc0
935	mov	$t1,$acc1
936	mov	$t2,$acc2
937	mov	$t3,$acc3
938	add	$rp,sp,#$Hsqr
939	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
940
941	add	$bp,sp,#$Rsqr
942	add	$rp,sp,#$res_x
943	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
944
945	add	$bp,sp,#$Hcub
946	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
947
948	add	$bp,sp,#$U2
949	 ldr	$bi,[sp,#$Hcub]		// forward load for p256_mul_mont
950	 ldp	$a0,$a1,[sp,#$S1]
951	 ldp	$a2,$a3,[sp,#$S1+16]
952	add	$rp,sp,#$res_y
953	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
954
955	add	$bp,sp,#$Hcub
956	add	$rp,sp,#$S2
957	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);
958
959	ldr	$bi,[sp,#$R]
960	ldp	$a0,$a1,[sp,#$res_y]
961	ldp	$a2,$a3,[sp,#$res_y+16]
962	add	$bp,sp,#$R
963	add	$rp,sp,#$res_y
964	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
965
966	add	$bp,sp,#$S2
967	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
968
969	ldp	$a0,$a1,[sp,#$res_x]		// res
970	ldp	$a2,$a3,[sp,#$res_x+16]
971	ldp	$t0,$t1,[$bp_real]		// in2
972	ldp	$t2,$t3,[$bp_real,#16]
973___
# Output selection, unrolled over the x and y coordinates (the z coordinate
# is handled by the tail emitted below the loop): pick computed result,
# in2 (when in1 is infinity) or in1 (when in2 is infinity) via csel.
974for($i=0;$i<64;$i+=32) {		# conditional moves
975$code.=<<___;
976	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
977	cmp	$in1infty,#0			// ~$in1intfy, remember?
978	ldp	$acc2,$acc3,[$ap_real,#$i+16]
979	csel	$t0,$a0,$t0,ne
980	csel	$t1,$a1,$t1,ne
981	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
982	csel	$t2,$a2,$t2,ne
983	csel	$t3,$a3,$t3,ne
984	cmp	$in2infty,#0			// ~$in2intfy, remember?
985	ldp	$a2,$a3,[sp,#$res_x+$i+48]
986	csel	$acc0,$t0,$acc0,ne
987	csel	$acc1,$t1,$acc1,ne
988	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
989	csel	$acc2,$t2,$acc2,ne
990	csel	$acc3,$t3,$acc3,ne
991	ldp	$t2,$t3,[$bp_real,#$i+48]
992	stp	$acc0,$acc1,[$rp_real,#$i]
993	stp	$acc2,$acc3,[$rp_real,#$i+16]
994___
995}
996$code.=<<___;
997	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
998	cmp	$in1infty,#0			// ~$in1intfy, remember?
999	ldp	$acc2,$acc3,[$ap_real,#$i+16]
1000	csel	$t0,$a0,$t0,ne
1001	csel	$t1,$a1,$t1,ne
1002	csel	$t2,$a2,$t2,ne
1003	csel	$t3,$a3,$t3,ne
1004	cmp	$in2infty,#0			// ~$in2intfy, remember?
1005	csel	$acc0,$t0,$acc0,ne
1006	csel	$acc1,$t1,$acc1,ne
1007	csel	$acc2,$t2,$acc2,ne
1008	csel	$acc3,$t3,$acc3,ne
1009	stp	$acc0,$acc1,[$rp_real,#$i]
1010	stp	$acc2,$acc3,[$rp_real,#$i+16]
1011
1012.Ladd_done:
1013	add	sp,x29,#0		// destroy frame
1014	ldp	x19,x20,[x29,#16]
1015	ldp	x21,x22,[x29,#32]
1016	ldp	x23,x24,[x29,#48]
1017	ldp	x25,x26,[x29,#64]
1018	ldp	x27,x28,[x29,#80]
1019	ldp	x29,x30,[sp],#96
1020	AARCH64_VALIDATE_LINK_REGISTER
1021	ret
1022.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
1023___
1024}
1025
1026########################################################################
1027# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1028#				     const P256_POINT_AFFINE *in2);
1029{
1030my ($res_x,$res_y,$res_z,
1031    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
1032my $Z1sqr = $S2;
1033# above map() describes stack layout with 10 temporary
1034# 256-bit vectors on top.
1035my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));
1036
1037$code.=<<___;
1038.globl	ecp_nistz256_point_add_affine
1039.type	ecp_nistz256_point_add_affine,%function
1040.align	5
1041ecp_nistz256_point_add_affine:
1042	AARCH64_SIGN_LINK_REGISTER
1043	stp	x29,x30,[sp,#-80]!
1044	add	x29,sp,#0
1045	stp	x19,x20,[sp,#16]
1046	stp	x21,x22,[sp,#32]
1047	stp	x23,x24,[sp,#48]
1048	stp	x25,x26,[sp,#64]
1049	sub	sp,sp,#32*10
1050
1051	mov	$rp_real,$rp
1052	mov	$ap_real,$ap
1053	mov	$bp_real,$bp
1054	adrp	$poly3,:pg_hi21:.Lpoly
1055	add	$poly3,$poly3,:lo12:.Lpoly
1056	ldr	$poly1,[$poly3,#8]
1057	ldr	$poly3,[$poly3,#24]
1058
1059	ldp	$a0,$a1,[$ap,#64]	// in1_z
1060	ldp	$a2,$a3,[$ap,#64+16]
1061	orr	$t0,$a0,$a1
1062	orr	$t2,$a2,$a3
1063	orr	$in1infty,$t0,$t2
1064	cmp	$in1infty,#0
1065	csetm	$in1infty,ne		// ~in1infty
1066
1067	ldp	$acc0,$acc1,[$bp]	// in2_x
1068	ldp	$acc2,$acc3,[$bp,#16]
1069	ldp	$t0,$t1,[$bp,#32]	// in2_y
1070	ldp	$t2,$t3,[$bp,#48]
1071	orr	$acc0,$acc0,$acc1
1072	orr	$acc2,$acc2,$acc3
1073	orr	$t0,$t0,$t1
1074	orr	$t2,$t2,$t3
1075	orr	$acc0,$acc0,$acc2
1076	orr	$t0,$t0,$t2
1077	orr	$in2infty,$acc0,$t0
1078	cmp	$in2infty,#0
1079	csetm	$in2infty,ne		// ~in2infty
1080
1081	add	$rp,sp,#$Z1sqr
1082	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
1083
1084	mov	$a0,$acc0
1085	mov	$a1,$acc1
1086	mov	$a2,$acc2
1087	mov	$a3,$acc3
1088	ldr	$bi,[$bp_real]
1089	add	$bp,$bp_real,#0
1090	add	$rp,sp,#$U2
1091	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);
1092
1093	add	$bp,$ap_real,#0
1094	 ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
1095	 ldp	$a0,$a1,[sp,#$Z1sqr]
1096	 ldp	$a2,$a3,[sp,#$Z1sqr+16]
1097	add	$rp,sp,#$H
1098	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);
1099
1100	add	$bp,$ap_real,#64
1101	add	$rp,sp,#$S2
1102	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
1103
1104	ldr	$bi,[$ap_real,#64]
1105	ldp	$a0,$a1,[sp,#$H]
1106	ldp	$a2,$a3,[sp,#$H+16]
1107	add	$bp,$ap_real,#64
1108	add	$rp,sp,#$res_z
1109	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
1110
1111	ldr	$bi,[$bp_real,#32]
1112	ldp	$a0,$a1,[sp,#$S2]
1113	ldp	$a2,$a3,[sp,#$S2+16]
1114	add	$bp,$bp_real,#32
1115	add	$rp,sp,#$S2
1116	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
1117
1118	add	$bp,$ap_real,#32
1119	 ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
1120	 ldp	$a2,$a3,[sp,#$H+16]
1121	add	$rp,sp,#$R
1122	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);
1123
1124	add	$rp,sp,#$Hsqr
1125	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
1126
1127	ldp	$a0,$a1,[sp,#$R]
1128	ldp	$a2,$a3,[sp,#$R+16]
1129	add	$rp,sp,#$Rsqr
1130	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
1131
1132	ldr	$bi,[sp,#$H]
1133	ldp	$a0,$a1,[sp,#$Hsqr]
1134	ldp	$a2,$a3,[sp,#$Hsqr+16]
1135	add	$bp,sp,#$H
1136	add	$rp,sp,#$Hcub
1137	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
1138
1139	ldr	$bi,[$ap_real]
1140	ldp	$a0,$a1,[sp,#$Hsqr]
1141	ldp	$a2,$a3,[sp,#$Hsqr+16]
1142	add	$bp,$ap_real,#0
1143	add	$rp,sp,#$U2
1144	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);
1145
1146	mov	$t0,$acc0
1147	mov	$t1,$acc1
1148	mov	$t2,$acc2
1149	mov	$t3,$acc3
1150	add	$rp,sp,#$Hsqr
1151	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
1152
1153	add	$bp,sp,#$Rsqr
1154	add	$rp,sp,#$res_x
1155	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
1156
1157	add	$bp,sp,#$Hcub
1158	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
1159
1160	add	$bp,sp,#$U2
1161	 ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
1162	 ldp	$a0,$a1,[sp,#$Hcub]
1163	 ldp	$a2,$a3,[sp,#$Hcub+16]
1164	add	$rp,sp,#$res_y
1165	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
1166
1167	add	$bp,$ap_real,#32
1168	add	$rp,sp,#$S2
1169	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);
1170
1171	ldr	$bi,[sp,#$R]
1172	ldp	$a0,$a1,[sp,#$res_y]
1173	ldp	$a2,$a3,[sp,#$res_y+16]
1174	add	$bp,sp,#$R
1175	add	$rp,sp,#$res_y
1176	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
1177
1178	add	$bp,sp,#$S2
1179	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
1180
1181	ldp	$a0,$a1,[sp,#$res_x]		// res
1182	ldp	$a2,$a3,[sp,#$res_x+16]
1183	ldp	$t0,$t1,[$bp_real]		// in2
1184	ldp	$t2,$t3,[$bp_real,#16]
1185___
1186for($i=0;$i<64;$i+=32) {		# conditional moves
1187$code.=<<___;
1188	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
1189	cmp	$in1infty,#0			// ~$in1intfy, remember?
1190	ldp	$acc2,$acc3,[$ap_real,#$i+16]
1191	csel	$t0,$a0,$t0,ne
1192	csel	$t1,$a1,$t1,ne
1193	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
1194	csel	$t2,$a2,$t2,ne
1195	csel	$t3,$a3,$t3,ne
1196	cmp	$in2infty,#0			// ~$in2intfy, remember?
1197	ldp	$a2,$a3,[sp,#$res_x+$i+48]
1198	csel	$acc0,$t0,$acc0,ne
1199	csel	$acc1,$t1,$acc1,ne
1200	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
1201	csel	$acc2,$t2,$acc2,ne
1202	csel	$acc3,$t3,$acc3,ne
1203	ldp	$t2,$t3,[$bp_real,#$i+48]
1204	stp	$acc0,$acc1,[$rp_real,#$i]
1205	stp	$acc2,$acc3,[$rp_real,#$i+16]
1206___
1207$code.=<<___	if ($i == 0);
1208	adrp	$bp_real,:pg_hi21:.Lone_mont-64
1209	add	$bp_real,$bp_real,:lo12:.Lone_mont-64
1210___
1211}
1212$code.=<<___;
1213	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
1214	cmp	$in1infty,#0			// ~$in1intfy, remember?
1215	ldp	$acc2,$acc3,[$ap_real,#$i+16]
1216	csel	$t0,$a0,$t0,ne
1217	csel	$t1,$a1,$t1,ne
1218	csel	$t2,$a2,$t2,ne
1219	csel	$t3,$a3,$t3,ne
1220	cmp	$in2infty,#0			// ~$in2intfy, remember?
1221	csel	$acc0,$t0,$acc0,ne
1222	csel	$acc1,$t1,$acc1,ne
1223	csel	$acc2,$t2,$acc2,ne
1224	csel	$acc3,$t3,$acc3,ne
1225	stp	$acc0,$acc1,[$rp_real,#$i]
1226	stp	$acc2,$acc3,[$rp_real,#$i+16]
1227
1228	add	sp,x29,#0		// destroy frame
1229	ldp	x19,x20,[x29,#16]
1230	ldp	x21,x22,[x29,#32]
1231	ldp	x23,x24,[x29,#48]
1232	ldp	x25,x26,[x29,#64]
1233	ldp	x29,x30,[sp],#80
1234	AARCH64_VALIDATE_LINK_REGISTER
1235	ret
1236.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1237___
1238}
if (1) {
# Montgomery arithmetic modulo the group order n (table .Lord, defined
# elsewhere in this file: four limbs of n followed by the Montgomery
# constant k0).  $poly1/$poly3 are reused here as order limbs, and $bi
# doubles as $acc7 in the squaring routine.
my ($ord0,$ord1) = ($poly1,$poly3);
my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
my $acc7 = $bi;

# ecp_nistz256_ord_mul_mont prologue plus the first multiplication
# round: acc0..acc4 := a[0..3]*b[0], and the first reduction
# multiplier t4 := acc0*k0 is computed eagerly.
$code.=<<___;
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,%function
.align	4
ecp_nistz256_ord_mul_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	$ordk,:pg_hi21:.Lord
	add	$ordk,$ordk,:lo12:.Lord
	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]

	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$acc4,$a3,$bi

	mul	$t4,$acc0,$ordk

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adc	$acc4,$acc4,xzr
	mov	$acc5,xzr
___
# Three interleaved reduce-then-multiply rounds, one per remaining
# limb b[1..3].
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
$code.=<<___;
	ldr	$bi,[$bp,#8*$i]		// b[i]

	lsl	$t0,$t4,#32
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	 mul	$t0,$a0,$bi
	adc	$t3,$t3,xzr
	 mul	$t1,$a1,$bi

	adds	$acc0,$acc1,$t2
	 mul	$t2,$a2,$bi
	adcs	$acc1,$acc2,$t3
	 mul	$t3,$a3,$bi
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts
	umulh	$t0,$a0,$bi
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,$acc4,xzr
	mul	$t4,$acc0,$ordk
	adds	$acc1,$acc1,$t0		// accumulate high parts
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
# Last reduction round, conditional final subtraction of the modulus,
# store of the result and epilogue for ord_mul_mont; then the prologue
# and the full 4x4 squaring (cross products doubled, diagonal added)
# of ord_sqr_mont, up to the point where the reduction loop takes over.
$code.=<<___;
	lsl	$t0,$t4,#32		// last reduction
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t rep);
.globl	ecp_nistz256_ord_sqr_mont
.type	ecp_nistz256_ord_sqr_mont,%function
.align	4
ecp_nistz256_ord_sqr_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	$ordk,:pg_hi21:.Lord
	add	$ordk,$ordk,:lo12:.Lord
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]
	b	.Loop_ord_sqr

.align	4
.Loop_ord_sqr:
	sub	$bp,$bp,#1
	////////////////////////////////////////////////////////////////
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	//  "can't overflow" below mark carrying into high part of
	//  multiplication result, which can't overflow, because it
	//  can never be all ones.

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	 mul	$t0,$a2,$a1		// a[2]*a[1]
	 umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	 mul	$t2,$a3,$a1		// a[3]*a[1]
	 umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	 mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	 umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	 mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	 umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	 mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	 umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	 mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	 umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	 mul	$t4,$acc0,$ordk
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	adcs	$acc6,$acc6,$t3
	adc	$acc7,$acc7,$a3
___
# Four Montgomery-reduction rounds over the low half.  $t3/$t4 are
# swapped at the bottom of each round so that the multiplier computed
# early (into $t3) is in $t4 at the top of the next round; the last
# round skips computing a new multiplier.
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adc	$acc3,xzr,$t4		// can't overflow
___
$code.=<<___	if ($i<3);
	mul	$t3,$acc0,$ordk
___
$code.=<<___;
	lsl	$t0,$t4,#32
	subs	$acc1,$acc1,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc2,$acc2,$t0
	sbc	$acc3,$acc3,$t1		// can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
# Fold in the upper half of the square, do the conditional final
# subtraction, and either loop (rep times, counted down in $bp) or
# store the result and return.
$code.=<<___;
	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$a0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$a1,$acc1,$t1,lo
	csel	$a2,$acc2,$t2,lo
	csel	$a3,$acc3,$t3,lo

	cbnz	$bp,.Loop_ord_sqr

	stp	$a0,$a1,[$rp]
	stp	$a2,$a3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
}	}
1541
1542########################################################################
1543# select subroutines
# These select functions are similar to those in p256-x86_64-asm.pl.
# They load every entry of the lookup table, keeping in the output only
# the one corresponding to the input index.
1547{
1548my ($val,$in_t)=map("x$_",(0..1));
1549my ($index)=("w2");
1550my ($Idx_ctr,$Val_in, $Mask_64)=("w9", "x10", "x11");
1551my ($Mask)=("v3");
1552my ($Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("v$_",(16..21));
1553my ($T0a,$T0b,$T0c,$T0d,$T0e,$T0f)=map("v$_",(22..27));
1554$code.=<<___;
1555////////////////////////////////////////////////////////////////////////
1556// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
1557.globl	ecp_nistz256_select_w5
1558.type	ecp_nistz256_select_w5,%function
1559.align	4
1560ecp_nistz256_select_w5:
1561    AARCH64_VALID_CALL_TARGET
1562
1563    // $Val_in := $val
1564    // $Idx_ctr := 0; loop counter and incremented internal index
1565    mov     $Val_in, $val
1566    mov     $Idx_ctr, #0
1567
1568    // [$Ra-$Rf] := 0
1569    movi    $Ra.16b, #0
1570    movi    $Rb.16b, #0
1571    movi    $Rc.16b, #0
1572    movi    $Rd.16b, #0
1573    movi    $Re.16b, #0
1574    movi    $Rf.16b, #0
1575
1576.Lselect_w5_loop:
1577    // Loop 16 times.
1578
1579    // Increment index (loop counter); tested at the end of the loop
1580    add $Idx_ctr, $Idx_ctr, #1
1581
1582    // [$T0a-$T0f] := Load a (3*256-bit = 6*128-bit) table entry starting at $in_t
1583    //  and advance $in_t to point to the next entry
1584    ld1     {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64
1585
1586    // $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s
1587    cmp     $Idx_ctr, $index
1588    csetm   $Mask_64, eq
1589
1590    // continue loading ...
1591    ld1     {$T0e.2d, $T0f.2d}, [$in_t],#32
1592
1593    // duplicate mask_64 into Mask (all 0s or all 1s)
1594    dup     $Mask.2d, $Mask_64
1595
1596    // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd]
1597    // i.e., values in output registers will remain the same if $Idx_ctr != $index
1598    bit     $Ra.16b, $T0a.16b, $Mask.16b
1599    bit     $Rb.16b, $T0b.16b, $Mask.16b
1600
1601    bit     $Rc.16b, $T0c.16b, $Mask.16b
1602    bit     $Rd.16b, $T0d.16b, $Mask.16b
1603
1604    bit     $Re.16b, $T0e.16b, $Mask.16b
1605    bit     $Rf.16b, $T0f.16b, $Mask.16b
1606
1607    // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back
1608    tbz    $Idx_ctr, #4, .Lselect_w5_loop
1609
1610    // Write [$Ra-$Rf] to memory at the output pointer
1611    st1     {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$Val_in],#64
1612    st1     {$Re.2d, $Rf.2d}, [$Val_in]
1613
1614	ret
1615.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
1616
1617
1618////////////////////////////////////////////////////////////////////////
1619// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
1620.globl	ecp_nistz256_select_w7
1621.type	ecp_nistz256_select_w7,%function
1622.align	4
1623ecp_nistz256_select_w7:
1624    AARCH64_VALID_CALL_TARGET
1625
1626    // $Idx_ctr := 0; loop counter and incremented internal index
1627    mov     $Idx_ctr, #0
1628
1629    // [$Ra-$Rf] := 0
1630    movi    $Ra.16b, #0
1631    movi    $Rb.16b, #0
1632    movi    $Rc.16b, #0
1633    movi    $Rd.16b, #0
1634
1635.Lselect_w7_loop:
1636    // Loop 64 times.
1637
1638    // Increment index (loop counter); tested at the end of the loop
1639    add $Idx_ctr, $Idx_ctr, #1
1640
1641    // [$T0a-$T0d] := Load a (2*256-bit = 4*128-bit) table entry starting at $in_t
1642    //  and advance $in_t to point to the next entry
1643    ld1     {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64
1644
1645    // $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s
1646    cmp     $Idx_ctr, $index
1647    csetm   $Mask_64, eq
1648
1649    // duplicate mask_64 into Mask (all 0s or all 1s)
1650    dup     $Mask.2d, $Mask_64
1651
1652    // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd]
1653    // i.e., values in output registers will remain the same if $Idx_ctr != $index
1654    bit     $Ra.16b, $T0a.16b, $Mask.16b
1655    bit     $Rb.16b, $T0b.16b, $Mask.16b
1656
1657    bit     $Rc.16b, $T0c.16b, $Mask.16b
1658    bit     $Rd.16b, $T0d.16b, $Mask.16b
1659
1660    // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back
1661    tbz    $Idx_ctr, #6, .Lselect_w7_loop
1662
1663    // Write [$Ra-$Rd] to memory at the output pointer
1664    st1     {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$val]
1665
1666	ret
1667.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
1668___
1669}
1670
# Emit the accumulated assembly: expand every `...` span by evaluating
# it as Perl (used by perlasm for computed constants), one line at a
# time, then flush STDOUT explicitly so buffered write errors surface.
for my $line (split /\n/, $code) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;
	print $line, "\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush
1677