xref: /aosp_15_r20/external/boringssl/src/crypto/fipsmodule/bn/asm/armv8-mont.pl (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
1#! /usr/bin/env perl
2# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# March 2015
18#
19# "Teaser" Montgomery multiplication module for ARMv8. Needs more
20# work. While it does improve RSA sign performance by 20-30% (less for
21# longer keys) on most processors, for some reason RSA2048 is not
22# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
23# instruction issue rate is limited on processor in question, meaning
24# that dedicated squaring procedure is a must. Well, actually all
25# contemporary AArch64 processors seem to have limited multiplication
26# issue rate, i.e. they can't issue multiplication every cycle, which
27# explains moderate improvement coefficients in comparison to
28# compiler-generated code. Recall that compiler is instructed to use
29# umulh and therefore uses same amount of multiplication instructions
30# to do the job. Assembly's edge is to minimize number of "collateral"
31# instructions and of course instruction scheduling.
32#
33# April 2015
34#
35# Squaring procedure that handles lengths divisible by 8 improves
36# RSA/DSA performance by 25-40-60% depending on processor and key
37# length. Overall improvement coefficients are always positive in
38# comparison to compiler-generated code. On Cortex-A57 improvement
39# is still modest on longest key lengths, while others exhibit e.g.
40# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
41# on Cortex-A57 and ~60-100% faster on others.
42
# NOTE(review): every line in this dump appears to carry a fused decimal
# line number from the xref extraction (e.g. "43$flavour") — confirm
# against the upstream armv8-mont.pl before attempting to run/build.
# Command line: perlasm "flavour" (target dialect) and output file path.
43$flavour = shift;
44$output  = shift;
45
# Locate the arm-xlate.pl translator next to this script or in perlasm/.
46$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
48( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
49die "can't locate arm-xlate.pl";
50
# Pipe STDOUT through arm-xlate.pl so every printed chunk of $code is
# translated for the requested flavour and lands in $output.
51open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
52*STDOUT=*OUT;
53
# Scratch registers for the generic Montgomery loop: x6-x17 plus the
# callee-saved x19-x24.  x18 is deliberately skipped (platform-reserved
# register on AArch64).
54($lo0,$hi0,$aj,$m0,$alo,$ahi,
55 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
56 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
57
# Argument registers, named after the C prototype in the comments below.
58# int bn_mul_mont(
59$rp="x0";	# BN_ULONG *rp,
60$ap="x1";	# const BN_ULONG *ap,
61$bp="x2";	# const BN_ULONG *bp,
62$np="x3";	# const BN_ULONG *np,
63$n0="x4";	# const BN_ULONG *n0,
64$num="x5";	# size_t num);
65
66$code.=<<___;
67#include <openssl/arm_arch.h>
68
// int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//                 const BN_ULONG *np, const BN_ULONG *n0, size_t num)
//
// Word-by-word Montgomery multiplication for arbitrary num.  At entry,
// num%8==0 is dispatched to __bn_sqr8x_mont and num%4==0 to
// __bn_mul4x_mont; everything else runs the generic .Lmul_mont path
// below, which keeps the t[num] scratch vector on the stack.
69.text
70
71.globl	bn_mul_mont
72.type	bn_mul_mont,%function
73.align	5
74bn_mul_mont:
75	AARCH64_SIGN_LINK_REGISTER
76	tst	$num,#7
77	b.eq	__bn_sqr8x_mont
78	tst	$num,#3
79	b.eq	__bn_mul4x_mont
80.Lmul_mont:
81	stp	x29,x30,[sp,#-64]!
82	add	x29,sp,#0
83	stp	x19,x20,[sp,#16]
84	stp	x21,x22,[sp,#32]
85	stp	x23,x24,[sp,#48]
86
87	ldr	$m0,[$bp],#8		// bp[0]
88	sub	$tp,sp,$num,lsl#3
89	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
90	lsl	$num,$num,#3
91	ldr	$n0,[$n0]		// *n0
92	and	$tp,$tp,#-16		// ABI says so
93	ldp	$hi1,$nj,[$np],#16	// np[0..1]
94
95	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
96	sub	$j,$num,#16		// j=num-2
97	umulh	$hi0,$hi0,$m0
98	mul	$alo,$aj,$m0		// ap[1]*bp[0]
99	umulh	$ahi,$aj,$m0
100
101	mul	$m1,$lo0,$n0		// "tp[0]"*n0
102	mov	sp,$tp			// alloca
103
104	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
105	umulh	$hi1,$hi1,$m1
106	mul	$nlo,$nj,$m1		// np[1]*m1
107	// (*)	adds	$lo1,$lo1,$lo0	// discarded
108	// (*)	As for removal of first multiplication and addition
109	//	instructions. The outcome of first addition is
110	//	guaranteed to be zero, which leaves two computationally
111	//	significant outcomes: it either carries or not. Then
112	//	question is when does it carry? Is there alternative
113	//	way to deduce it? If you follow operations, you can
114	//	observe that condition for carry is quite simple:
115	//	$lo0 being non-zero. So that carry can be calculated
116	//	by adding -1 to $lo0. That's what next instruction does.
117	subs	xzr,$lo0,#1		// (*)
118	umulh	$nhi,$nj,$m1
119	adc	$hi1,$hi1,xzr
120	cbz	$j,.L1st_skip
121
	// First outer round (bp[0]): accumulate ap[j]*bp[0] + np[j]*m1
	// into tp[], one 64-bit word per iteration; $j counts down in bytes.
122.L1st:
123	ldr	$aj,[$ap],#8
124	adds	$lo0,$alo,$hi0
125	sub	$j,$j,#8		// j--
126	adc	$hi0,$ahi,xzr
127
128	ldr	$nj,[$np],#8
129	adds	$lo1,$nlo,$hi1
130	mul	$alo,$aj,$m0		// ap[j]*bp[0]
131	adc	$hi1,$nhi,xzr
132	umulh	$ahi,$aj,$m0
133
134	adds	$lo1,$lo1,$lo0
135	mul	$nlo,$nj,$m1		// np[j]*m1
136	adc	$hi1,$hi1,xzr
137	umulh	$nhi,$nj,$m1
138	str	$lo1,[$tp],#8		// tp[j-1]
139	cbnz	$j,.L1st
140
141.L1st_skip:
142	adds	$lo0,$alo,$hi0
143	sub	$ap,$ap,$num		// rewind $ap
144	adc	$hi0,$ahi,xzr
145
146	adds	$lo1,$nlo,$hi1
147	sub	$np,$np,$num		// rewind $np
148	adc	$hi1,$nhi,xzr
149
150	adds	$lo1,$lo1,$lo0
151	sub	$i,$num,#8		// i=num-1
152	adcs	$hi1,$hi1,$hi0
153
154	adc	$ovf,xzr,xzr		// upmost overflow bit
155	stp	$lo1,$hi1,[$tp]
156
	// Remaining outer rounds over bp[i]: tp[] = tp[] + ap[]*bp[i] +
	// np[]*m1, with the implicit /2^64 coming from storing results one
	// word lower (tp[j-1]).
157.Louter:
158	ldr	$m0,[$bp],#8		// bp[i]
159	ldp	$hi0,$aj,[$ap],#16
160	ldr	$tj,[sp]		// tp[0]
161	add	$tp,sp,#8
162
163	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
164	sub	$j,$num,#16		// j=num-2
165	umulh	$hi0,$hi0,$m0
166	ldp	$hi1,$nj,[$np],#16
167	mul	$alo,$aj,$m0		// ap[1]*bp[i]
168	adds	$lo0,$lo0,$tj
169	umulh	$ahi,$aj,$m0
170	adc	$hi0,$hi0,xzr
171
172	mul	$m1,$lo0,$n0
173	sub	$i,$i,#8		// i--
174
175	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
176	umulh	$hi1,$hi1,$m1
177	mul	$nlo,$nj,$m1		// np[1]*m1
178	// (*)	adds	$lo1,$lo1,$lo0
179	subs	xzr,$lo0,#1		// (*)
180	umulh	$nhi,$nj,$m1
181	cbz	$j,.Linner_skip
182
183.Linner:
184	ldr	$aj,[$ap],#8
185	adc	$hi1,$hi1,xzr
186	ldr	$tj,[$tp],#8		// tp[j]
187	adds	$lo0,$alo,$hi0
188	sub	$j,$j,#8		// j--
189	adc	$hi0,$ahi,xzr
190
191	adds	$lo1,$nlo,$hi1
192	ldr	$nj,[$np],#8
193	adc	$hi1,$nhi,xzr
194
195	mul	$alo,$aj,$m0		// ap[j]*bp[i]
196	adds	$lo0,$lo0,$tj
197	umulh	$ahi,$aj,$m0
198	adc	$hi0,$hi0,xzr
199
200	mul	$nlo,$nj,$m1		// np[j]*m1
201	adds	$lo1,$lo1,$lo0
202	umulh	$nhi,$nj,$m1
203	str	$lo1,[$tp,#-16]		// tp[j-1]
204	cbnz	$j,.Linner
205
206.Linner_skip:
207	ldr	$tj,[$tp],#8		// tp[j]
208	adc	$hi1,$hi1,xzr
209	adds	$lo0,$alo,$hi0
210	sub	$ap,$ap,$num		// rewind $ap
211	adc	$hi0,$ahi,xzr
212
213	adds	$lo1,$nlo,$hi1
214	sub	$np,$np,$num		// rewind $np
215	adcs	$hi1,$nhi,$ovf
216	adc	$ovf,xzr,xzr
217
218	adds	$lo0,$lo0,$tj
219	adc	$hi0,$hi0,xzr
220
221	adds	$lo1,$lo1,$lo0
222	adcs	$hi1,$hi1,$hi0
223	adc	$ovf,$ovf,xzr		// upmost overflow bit
224	stp	$lo1,$hi1,[$tp,#-16]
225
226	cbnz	$i,.Louter
227
228	// Final step. We see if result is larger than modulus, and
229	// if it is, subtract the modulus. But comparison implies
230	// subtraction. So we subtract modulus, see if it borrowed,
231	// and conditionally copy original value.
232	ldr	$tj,[sp]		// tp[0]
233	add	$tp,sp,#8
234	ldr	$nj,[$np],#8		// np[0]
235	subs	$j,$num,#8		// j=num-1 and clear borrow
236	mov	$ap,$rp
237.Lsub:
238	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
239	ldr	$tj,[$tp],#8
240	sub	$j,$j,#8		// j--
241	ldr	$nj,[$np],#8
242	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
243	cbnz	$j,.Lsub
244
245	sbcs	$aj,$tj,$nj
246	sbcs	$ovf,$ovf,xzr		// did it borrow?
247	str	$aj,[$ap],#8		// rp[num-1]
248
249	ldr	$tj,[sp]		// tp[0]
250	add	$tp,sp,#8
251	ldr	$aj,[$rp],#8		// rp[0]
252	sub	$num,$num,#8		// num--
253	nop
	// Branchless copy: csel on the borrow flag picks tp[j] (lo, i.e.
	// the subtraction borrowed) or the already-stored difference in
	// rp[j]; tp[] is wiped with zeros on the way.
254.Lcond_copy:
255	sub	$num,$num,#8		// num--
256	csel	$nj,$tj,$aj,lo		// did it borrow?
257	ldr	$tj,[$tp],#8
258	ldr	$aj,[$rp],#8
259	str	xzr,[$tp,#-16]		// wipe tp
260	str	$nj,[$rp,#-16]
261	cbnz	$num,.Lcond_copy
262
263	csel	$nj,$tj,$aj,lo
264	str	xzr,[$tp,#-8]		// wipe tp
265	str	$nj,[$rp,#-8]
266
267	ldp	x19,x20,[x29,#16]
268	mov	sp,x29
269	ldp	x21,x22,[x29,#32]
270	mov	x0,#1
271	ldp	x23,x24,[x29,#48]
272	ldr	x29,[sp],#64
273	AARCH64_VALIDATE_LINK_REGISTER
274	ret
275.size	bn_mul_mont,.-bn_mul_mont
276___
277{
278########################################################################
279# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
280
# Register map for the 8x squaring path: a[0..7] in x6-x13, temporaries
# t0-t3 in x14-x17, eight accumulators in callee-saved x19-x26, and loop
# counter / carry / top-most word in x27, x28, x30.  $tp, $ap_end and
# $na0 alias registers ($bp, $np, $carry) that are free at that point.
281my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
282my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
283my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
284my ($cnt,$carry,$topmost)=("x27","x28","x30");
285my ($tp,$ap_end,$na0)=($bp,$np,$carry);
286
287$code.=<<___;
288.type	__bn_sqr8x_mont,%function
289.align	5
290__bn_sqr8x_mont:
291	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
292	// only from bn_mul_mont which has already signed the return address.
293	cmp	$ap,$bp
294	b.ne	__bn_mul4x_mont
295.Lsqr8x_mont:
296	stp	x29,x30,[sp,#-128]!
297	add	x29,sp,#0
298	stp	x19,x20,[sp,#16]
299	stp	x21,x22,[sp,#32]
300	stp	x23,x24,[sp,#48]
301	stp	x25,x26,[sp,#64]
302	stp	x27,x28,[sp,#80]
303	stp	$rp,$np,[sp,#96]	// offload rp and np
304
305	ldp	$a0,$a1,[$ap,#8*0]
306	ldp	$a2,$a3,[$ap,#8*2]
307	ldp	$a4,$a5,[$ap,#8*4]
308	ldp	$a6,$a7,[$ap,#8*6]
309
310	sub	$tp,sp,$num,lsl#4
311	lsl	$num,$num,#3
312	ldr	$n0,[$n0]		// *n0
313	mov	sp,$tp			// alloca
314	sub	$cnt,$num,#8*8
315	b	.Lsqr8x_zero_start
316
	// Zero the 2*num-word t[] scratch just allocated on the stack,
	// 16 words (8 stp) per pass.
317.Lsqr8x_zero:
318	sub	$cnt,$cnt,#8*8
319	stp	xzr,xzr,[$tp,#8*0]
320	stp	xzr,xzr,[$tp,#8*2]
321	stp	xzr,xzr,[$tp,#8*4]
322	stp	xzr,xzr,[$tp,#8*6]
323.Lsqr8x_zero_start:
324	stp	xzr,xzr,[$tp,#8*8]
325	stp	xzr,xzr,[$tp,#8*10]
326	stp	xzr,xzr,[$tp,#8*12]
327	stp	xzr,xzr,[$tp,#8*14]
328	add	$tp,$tp,#8*16
329	cbnz	$cnt,.Lsqr8x_zero
330
331	add	$ap_end,$ap,$num
332	add	$ap,$ap,#8*8
333	mov	$acc0,xzr
334	mov	$acc1,xzr
335	mov	$acc2,xzr
336	mov	$acc3,xzr
337	mov	$acc4,xzr
338	mov	$acc5,xzr
339	mov	$acc6,xzr
340	mov	$acc7,xzr
341	mov	$tp,sp
342	str	$n0,[x29,#112]		// offload n0
343
344	// Multiply everything but a[i]*a[i]
345.align	4
346.Lsqr8x_outer_loop:
347        //                                                 a[1]a[0]	(i)
348        //                                             a[2]a[0]
349        //                                         a[3]a[0]
350        //                                     a[4]a[0]
351        //                                 a[5]a[0]
352        //                             a[6]a[0]
353        //                         a[7]a[0]
354        //                                         a[2]a[1]		(ii)
355        //                                     a[3]a[1]
356        //                                 a[4]a[1]
357        //                             a[5]a[1]
358        //                         a[6]a[1]
359        //                     a[7]a[1]
360        //                                 a[3]a[2]			(iii)
361        //                             a[4]a[2]
362        //                         a[5]a[2]
363        //                     a[6]a[2]
364        //                 a[7]a[2]
365        //                         a[4]a[3]				(iv)
366        //                     a[5]a[3]
367        //                 a[6]a[3]
368        //             a[7]a[3]
369        //                 a[5]a[4]					(v)
370        //             a[6]a[4]
371        //         a[7]a[4]
372        //         a[6]a[5]						(vi)
373        //     a[7]a[5]
374        // a[7]a[6]							(vii)
375
376	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
377	mul	$t1,$a2,$a0
378	mul	$t2,$a3,$a0
379	mul	$t3,$a4,$a0
380	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
381	mul	$t0,$a5,$a0
382	adcs	$acc2,$acc2,$t1
383	mul	$t1,$a6,$a0
384	adcs	$acc3,$acc3,$t2
385	mul	$t2,$a7,$a0
386	adcs	$acc4,$acc4,$t3
387	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
388	adcs	$acc5,$acc5,$t0
389	umulh	$t0,$a2,$a0
390	adcs	$acc6,$acc6,$t1
391	umulh	$t1,$a3,$a0
392	adcs	$acc7,$acc7,$t2
393	umulh	$t2,$a4,$a0
394	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
395	adc	$acc0,xzr,xzr		// t[8]
396	adds	$acc2,$acc2,$t3		// t[2]+lo(a[1]*a[0])
397	umulh	$t3,$a5,$a0
398	adcs	$acc3,$acc3,$t0
399	umulh	$t0,$a6,$a0
400	adcs	$acc4,$acc4,$t1
401	umulh	$t1,$a7,$a0
402	adcs	$acc5,$acc5,$t2
403	 mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
404	adcs	$acc6,$acc6,$t3
405	 mul	$t3,$a3,$a1
406	adcs	$acc7,$acc7,$t0
407	 mul	$t0,$a4,$a1
408	adc	$acc0,$acc0,$t1
409
410	mul	$t1,$a5,$a1
411	adds	$acc3,$acc3,$t2
412	mul	$t2,$a6,$a1
413	adcs	$acc4,$acc4,$t3
414	mul	$t3,$a7,$a1
415	adcs	$acc5,$acc5,$t0
416	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
417	adcs	$acc6,$acc6,$t1
418	umulh	$t1,$a3,$a1
419	adcs	$acc7,$acc7,$t2
420	umulh	$t2,$a4,$a1
421	adcs	$acc0,$acc0,$t3
422	umulh	$t3,$a5,$a1
423	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
424	adc	$acc1,xzr,xzr		// t[9]
425	adds	$acc4,$acc4,$t0
426	umulh	$t0,$a6,$a1
427	adcs	$acc5,$acc5,$t1
428	umulh	$t1,$a7,$a1
429	adcs	$acc6,$acc6,$t2
430	 mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
431	adcs	$acc7,$acc7,$t3
432	 mul	$t3,$a4,$a2
433	adcs	$acc0,$acc0,$t0
434	 mul	$t0,$a5,$a2
435	adc	$acc1,$acc1,$t1
436
437	mul	$t1,$a6,$a2
438	adds	$acc5,$acc5,$t2
439	mul	$t2,$a7,$a2
440	adcs	$acc6,$acc6,$t3
441	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
442	adcs	$acc7,$acc7,$t0
443	umulh	$t0,$a4,$a2
444	adcs	$acc0,$acc0,$t1
445	umulh	$t1,$a5,$a2
446	adcs	$acc1,$acc1,$t2
447	umulh	$t2,$a6,$a2
448	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
449	adc	$acc2,xzr,xzr		// t[10]
450	adds	$acc6,$acc6,$t3
451	umulh	$t3,$a7,$a2
452	adcs	$acc7,$acc7,$t0
453	 mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
454	adcs	$acc0,$acc0,$t1
455	 mul	$t1,$a5,$a3
456	adcs	$acc1,$acc1,$t2
457	 mul	$t2,$a6,$a3
458	adc	$acc2,$acc2,$t3
459
460	mul	$t3,$a7,$a3
461	adds	$acc7,$acc7,$t0
462	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
463	adcs	$acc0,$acc0,$t1
464	umulh	$t1,$a5,$a3
465	adcs	$acc1,$acc1,$t2
466	umulh	$t2,$a6,$a3
467	adcs	$acc2,$acc2,$t3
468	umulh	$t3,$a7,$a3
469	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
470	adc	$acc3,xzr,xzr		// t[11]
471	adds	$acc0,$acc0,$t0
472	 mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
473	adcs	$acc1,$acc1,$t1
474	 mul	$t1,$a6,$a4
475	adcs	$acc2,$acc2,$t2
476	 mul	$t2,$a7,$a4
477	adc	$acc3,$acc3,$t3
478
479	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
480	adds	$acc1,$acc1,$t0
481	umulh	$t0,$a6,$a4
482	adcs	$acc2,$acc2,$t1
483	umulh	$t1,$a7,$a4
484	adcs	$acc3,$acc3,$t2
485	 mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
486	adc	$acc4,xzr,xzr		// t[12]
487	adds	$acc2,$acc2,$t3
488	 mul	$t3,$a7,$a5
489	adcs	$acc3,$acc3,$t0
490	 umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
491	adc	$acc4,$acc4,$t1
492
493	umulh	$t1,$a7,$a5
494	adds	$acc3,$acc3,$t2
495	 mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
496	adcs	$acc4,$acc4,$t3
497	 umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
498	adc	$acc5,xzr,xzr		// t[13]
499	adds	$acc4,$acc4,$t0
500	sub	$cnt,$ap_end,$ap	// done yet?
501	adc	$acc5,$acc5,$t1
502
503	adds	$acc5,$acc5,$t2
504	sub	$t0,$ap_end,$num	// rewinded ap
505	adc	$acc6,xzr,xzr		// t[14]
506	add	$acc6,$acc6,$t3
507
508	cbz	$cnt,.Lsqr8x_outer_break
509
510	mov	$n0,$a0
511	ldp	$a0,$a1,[$tp,#8*0]
512	ldp	$a2,$a3,[$tp,#8*2]
513	ldp	$a4,$a5,[$tp,#8*4]
514	ldp	$a6,$a7,[$tp,#8*6]
515	adds	$acc0,$acc0,$a0
516	adcs	$acc1,$acc1,$a1
517	ldp	$a0,$a1,[$ap,#8*0]
518	adcs	$acc2,$acc2,$a2
519	adcs	$acc3,$acc3,$a3
520	ldp	$a2,$a3,[$ap,#8*2]
521	adcs	$acc4,$acc4,$a4
522	adcs	$acc5,$acc5,$a5
523	ldp	$a4,$a5,[$ap,#8*4]
524	adcs	$acc6,$acc6,$a6
525	mov	$rp,$ap
526	adcs	$acc7,xzr,$a7
527	ldp	$a6,$a7,[$ap,#8*6]
528	add	$ap,$ap,#8*8
529	//adc	$carry,xzr,xzr		// moved below
530	mov	$cnt,#-8*8
531
532	//                                                         a[8]a[0]
533	//                                                     a[9]a[0]
534	//                                                 a[a]a[0]
535	//                                             a[b]a[0]
536	//                                         a[c]a[0]
537	//                                     a[d]a[0]
538	//                                 a[e]a[0]
539	//                             a[f]a[0]
540	//                                                     a[8]a[1]
541	//                         a[f]a[1]........................
542	//                                                 a[8]a[2]
543	//                     a[f]a[2]........................
544	//                                             a[8]a[3]
545	//                 a[f]a[3]........................
546	//                                         a[8]a[4]
547	//             a[f]a[4]........................
548	//                                     a[8]a[5]
549	//         a[f]a[5]........................
550	//                                 a[8]a[6]
551	//     a[f]a[6]........................
552	//                             a[8]a[7]
553	// a[f]a[7]........................
554.Lsqr8x_mul:
555	mul	$t0,$a0,$n0
556	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
557	mul	$t1,$a1,$n0
558	add	$cnt,$cnt,#8
559	mul	$t2,$a2,$n0
560	mul	$t3,$a3,$n0
561	adds	$acc0,$acc0,$t0
562	mul	$t0,$a4,$n0
563	adcs	$acc1,$acc1,$t1
564	mul	$t1,$a5,$n0
565	adcs	$acc2,$acc2,$t2
566	mul	$t2,$a6,$n0
567	adcs	$acc3,$acc3,$t3
568	mul	$t3,$a7,$n0
569	adcs	$acc4,$acc4,$t0
570	umulh	$t0,$a0,$n0
571	adcs	$acc5,$acc5,$t1
572	umulh	$t1,$a1,$n0
573	adcs	$acc6,$acc6,$t2
574	umulh	$t2,$a2,$n0
575	adcs	$acc7,$acc7,$t3
576	umulh	$t3,$a3,$n0
577	adc	$carry,$carry,xzr
578	str	$acc0,[$tp],#8
579	adds	$acc0,$acc1,$t0
580	umulh	$t0,$a4,$n0
581	adcs	$acc1,$acc2,$t1
582	umulh	$t1,$a5,$n0
583	adcs	$acc2,$acc3,$t2
584	umulh	$t2,$a6,$n0
585	adcs	$acc3,$acc4,$t3
586	umulh	$t3,$a7,$n0
587	ldr	$n0,[$rp,$cnt]
588	adcs	$acc4,$acc5,$t0
589	adcs	$acc5,$acc6,$t1
590	adcs	$acc6,$acc7,$t2
591	adcs	$acc7,$carry,$t3
592	//adc	$carry,xzr,xzr		// moved above
593	cbnz	$cnt,.Lsqr8x_mul
594					// note that carry flag is guaranteed
595					// to be zero at this point
596	cmp	$ap,$ap_end		// done yet?
597	b.eq	.Lsqr8x_break
598
599	ldp	$a0,$a1,[$tp,#8*0]
600	ldp	$a2,$a3,[$tp,#8*2]
601	ldp	$a4,$a5,[$tp,#8*4]
602	ldp	$a6,$a7,[$tp,#8*6]
603	adds	$acc0,$acc0,$a0
604	ldr	$n0,[$rp,#-8*8]
605	adcs	$acc1,$acc1,$a1
606	ldp	$a0,$a1,[$ap,#8*0]
607	adcs	$acc2,$acc2,$a2
608	adcs	$acc3,$acc3,$a3
609	ldp	$a2,$a3,[$ap,#8*2]
610	adcs	$acc4,$acc4,$a4
611	adcs	$acc5,$acc5,$a5
612	ldp	$a4,$a5,[$ap,#8*4]
613	adcs	$acc6,$acc6,$a6
614	mov	$cnt,#-8*8
615	adcs	$acc7,$acc7,$a7
616	ldp	$a6,$a7,[$ap,#8*6]
617	add	$ap,$ap,#8*8
618	//adc	$carry,xzr,xzr		// moved above
619	b	.Lsqr8x_mul
620
621.align	4
622.Lsqr8x_break:
623	ldp	$a0,$a1,[$rp,#8*0]
624	add	$ap,$rp,#8*8
625	ldp	$a2,$a3,[$rp,#8*2]
626	sub	$t0,$ap_end,$ap		// is it last iteration?
627	ldp	$a4,$a5,[$rp,#8*4]
628	sub	$t1,$tp,$t0
629	ldp	$a6,$a7,[$rp,#8*6]
630	cbz	$t0,.Lsqr8x_outer_loop
631
632	stp	$acc0,$acc1,[$tp,#8*0]
633	ldp	$acc0,$acc1,[$t1,#8*0]
634	stp	$acc2,$acc3,[$tp,#8*2]
635	ldp	$acc2,$acc3,[$t1,#8*2]
636	stp	$acc4,$acc5,[$tp,#8*4]
637	ldp	$acc4,$acc5,[$t1,#8*4]
638	stp	$acc6,$acc7,[$tp,#8*6]
639	mov	$tp,$t1
640	ldp	$acc6,$acc7,[$t1,#8*6]
641	b	.Lsqr8x_outer_loop
642
643.align	4
644.Lsqr8x_outer_break:
645	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
646	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
647	ldp	$t1,$t2,[sp,#8*1]
648	ldp	$a5,$a7,[$t0,#8*2]
649	add	$ap,$t0,#8*4
650	ldp	$t3,$t0,[sp,#8*3]
651
652	stp	$acc0,$acc1,[$tp,#8*0]
653	mul	$acc0,$a1,$a1
654	stp	$acc2,$acc3,[$tp,#8*2]
655	umulh	$a1,$a1,$a1
656	stp	$acc4,$acc5,[$tp,#8*4]
657	mul	$a2,$a3,$a3
658	stp	$acc6,$acc7,[$tp,#8*6]
659	mov	$tp,sp
660	umulh	$a3,$a3,$a3
661	adds	$acc1,$a1,$t1,lsl#1
662	extr	$t1,$t2,$t1,#63
663	sub	$cnt,$num,#8*4
664
	// extr x,y,x,#63 shifts each 128-bit word pair left by one bit, so
	// the cross-product sum is doubled while the a[i]^2 diagonal terms
	// (mul/umulh of a word with itself) are folded in via adcs.
665.Lsqr4x_shift_n_add:
666	adcs	$acc2,$a2,$t1
667	extr	$t2,$t3,$t2,#63
668	sub	$cnt,$cnt,#8*4
669	adcs	$acc3,$a3,$t2
670	ldp	$t1,$t2,[$tp,#8*5]
671	mul	$a4,$a5,$a5
672	ldp	$a1,$a3,[$ap],#8*2
673	umulh	$a5,$a5,$a5
674	mul	$a6,$a7,$a7
675	umulh	$a7,$a7,$a7
676	extr	$t3,$t0,$t3,#63
677	stp	$acc0,$acc1,[$tp,#8*0]
678	adcs	$acc4,$a4,$t3
679	extr	$t0,$t1,$t0,#63
680	stp	$acc2,$acc3,[$tp,#8*2]
681	adcs	$acc5,$a5,$t0
682	ldp	$t3,$t0,[$tp,#8*7]
683	extr	$t1,$t2,$t1,#63
684	adcs	$acc6,$a6,$t1
685	extr	$t2,$t3,$t2,#63
686	adcs	$acc7,$a7,$t2
687	ldp	$t1,$t2,[$tp,#8*9]
688	mul	$a0,$a1,$a1
689	ldp	$a5,$a7,[$ap],#8*2
690	umulh	$a1,$a1,$a1
691	mul	$a2,$a3,$a3
692	umulh	$a3,$a3,$a3
693	stp	$acc4,$acc5,[$tp,#8*4]
694	extr	$t3,$t0,$t3,#63
695	stp	$acc6,$acc7,[$tp,#8*6]
696	add	$tp,$tp,#8*8
697	adcs	$acc0,$a0,$t3
698	extr	$t0,$t1,$t0,#63
699	adcs	$acc1,$a1,$t0
700	ldp	$t3,$t0,[$tp,#8*3]
701	extr	$t1,$t2,$t1,#63
702	cbnz	$cnt,.Lsqr4x_shift_n_add
703___
# From here on the squaring code walks the modulus; $ap/$ap_end are
# renamed $np/$np_end for readability (they alias the same registers).
704my ($np,$np_end)=($ap,$ap_end);
705$code.=<<___;
706	 ldp	$np,$n0,[x29,#104]	// pull np and n0
707
708	adcs	$acc2,$a2,$t1
709	extr	$t2,$t3,$t2,#63
710	adcs	$acc3,$a3,$t2
711	ldp	$t1,$t2,[$tp,#8*5]
712	mul	$a4,$a5,$a5
713	umulh	$a5,$a5,$a5
714	stp	$acc0,$acc1,[$tp,#8*0]
715	mul	$a6,$a7,$a7
716	umulh	$a7,$a7,$a7
717	stp	$acc2,$acc3,[$tp,#8*2]
718	extr	$t3,$t0,$t3,#63
719	adcs	$acc4,$a4,$t3
720	extr	$t0,$t1,$t0,#63
721	 ldp	$acc0,$acc1,[sp,#8*0]
722	adcs	$acc5,$a5,$t0
723	extr	$t1,$t2,$t1,#63
724	 ldp	$a0,$a1,[$np,#8*0]
725	adcs	$acc6,$a6,$t1
726	extr	$t2,xzr,$t2,#63
727	 ldp	$a2,$a3,[$np,#8*2]
728	adc	$acc7,$a7,$t2
729	 ldp	$a4,$a5,[$np,#8*4]
730
731	// Reduce by 512 bits per iteration
732	mul	$na0,$n0,$acc0		// t[0]*n0
733	ldp	$a6,$a7,[$np,#8*6]
734	add	$np_end,$np,$num
735	ldp	$acc2,$acc3,[sp,#8*2]
736	stp	$acc4,$acc5,[$tp,#8*4]
737	ldp	$acc4,$acc5,[sp,#8*4]
738	stp	$acc6,$acc7,[$tp,#8*6]
739	ldp	$acc6,$acc7,[sp,#8*6]
740	add	$np,$np,#8*8
741	mov	$topmost,xzr		// initial top-most carry
742	mov	$tp,sp
743	mov	$cnt,#8
744
745.Lsqr8x_reduction:
746	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
747	mul	$t1,$a1,$na0
748	sub	$cnt,$cnt,#1
749	mul	$t2,$a2,$na0
750	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
751	mul	$t3,$a3,$na0
752	// (*)	adds	xzr,$acc0,$t0
753	subs	xzr,$acc0,#1		// (*)
754	mul	$t0,$a4,$na0
755	adcs	$acc0,$acc1,$t1
756	mul	$t1,$a5,$na0
757	adcs	$acc1,$acc2,$t2
758	mul	$t2,$a6,$na0
759	adcs	$acc2,$acc3,$t3
760	mul	$t3,$a7,$na0
761	adcs	$acc3,$acc4,$t0
762	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
763	adcs	$acc4,$acc5,$t1
764	umulh	$t1,$a1,$na0
765	adcs	$acc5,$acc6,$t2
766	umulh	$t2,$a2,$na0
767	adcs	$acc6,$acc7,$t3
768	umulh	$t3,$a3,$na0
769	adc	$acc7,xzr,xzr
770	adds	$acc0,$acc0,$t0
771	umulh	$t0,$a4,$na0
772	adcs	$acc1,$acc1,$t1
773	umulh	$t1,$a5,$na0
774	adcs	$acc2,$acc2,$t2
775	umulh	$t2,$a6,$na0
776	adcs	$acc3,$acc3,$t3
777	umulh	$t3,$a7,$na0
778	mul	$na0,$n0,$acc0		// next t[0]*n0
779	adcs	$acc4,$acc4,$t0
780	adcs	$acc5,$acc5,$t1
781	adcs	$acc6,$acc6,$t2
782	adc	$acc7,$acc7,$t3
783	cbnz	$cnt,.Lsqr8x_reduction
784
785	ldp	$t0,$t1,[$tp,#8*0]
786	ldp	$t2,$t3,[$tp,#8*2]
787	mov	$rp,$tp
788	sub	$cnt,$np_end,$np	// done yet?
789	adds	$acc0,$acc0,$t0
790	adcs	$acc1,$acc1,$t1
791	ldp	$t0,$t1,[$tp,#8*4]
792	adcs	$acc2,$acc2,$t2
793	adcs	$acc3,$acc3,$t3
794	ldp	$t2,$t3,[$tp,#8*6]
795	adcs	$acc4,$acc4,$t0
796	adcs	$acc5,$acc5,$t1
797	adcs	$acc6,$acc6,$t2
798	adcs	$acc7,$acc7,$t3
799	//adc	$carry,xzr,xzr		// moved below
800	cbz	$cnt,.Lsqr8x8_post_condition
801
802	ldr	$n0,[$tp,#-8*8]
803	ldp	$a0,$a1,[$np,#8*0]
804	ldp	$a2,$a3,[$np,#8*2]
805	ldp	$a4,$a5,[$np,#8*4]
806	mov	$cnt,#-8*8
807	ldp	$a6,$a7,[$np,#8*6]
808	add	$np,$np,#8*8
809
810.Lsqr8x_tail:
811	mul	$t0,$a0,$n0
812	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
813	mul	$t1,$a1,$n0
814	add	$cnt,$cnt,#8
815	mul	$t2,$a2,$n0
816	mul	$t3,$a3,$n0
817	adds	$acc0,$acc0,$t0
818	mul	$t0,$a4,$n0
819	adcs	$acc1,$acc1,$t1
820	mul	$t1,$a5,$n0
821	adcs	$acc2,$acc2,$t2
822	mul	$t2,$a6,$n0
823	adcs	$acc3,$acc3,$t3
824	mul	$t3,$a7,$n0
825	adcs	$acc4,$acc4,$t0
826	umulh	$t0,$a0,$n0
827	adcs	$acc5,$acc5,$t1
828	umulh	$t1,$a1,$n0
829	adcs	$acc6,$acc6,$t2
830	umulh	$t2,$a2,$n0
831	adcs	$acc7,$acc7,$t3
832	umulh	$t3,$a3,$n0
833	adc	$carry,$carry,xzr
834	str	$acc0,[$tp],#8
835	adds	$acc0,$acc1,$t0
836	umulh	$t0,$a4,$n0
837	adcs	$acc1,$acc2,$t1
838	umulh	$t1,$a5,$n0
839	adcs	$acc2,$acc3,$t2
840	umulh	$t2,$a6,$n0
841	adcs	$acc3,$acc4,$t3
842	umulh	$t3,$a7,$n0
843	ldr	$n0,[$rp,$cnt]
844	adcs	$acc4,$acc5,$t0
845	adcs	$acc5,$acc6,$t1
846	adcs	$acc6,$acc7,$t2
847	adcs	$acc7,$carry,$t3
848	//adc	$carry,xzr,xzr		// moved above
849	cbnz	$cnt,.Lsqr8x_tail
850					// note that carry flag is guaranteed
851					// to be zero at this point
852	ldp	$a0,$a1,[$tp,#8*0]
853	sub	$cnt,$np_end,$np	// done yet?
854	sub	$t2,$np_end,$num	// rewinded np
855	ldp	$a2,$a3,[$tp,#8*2]
856	ldp	$a4,$a5,[$tp,#8*4]
857	ldp	$a6,$a7,[$tp,#8*6]
858	cbz	$cnt,.Lsqr8x_tail_break
859
860	ldr	$n0,[$rp,#-8*8]
861	adds	$acc0,$acc0,$a0
862	adcs	$acc1,$acc1,$a1
863	ldp	$a0,$a1,[$np,#8*0]
864	adcs	$acc2,$acc2,$a2
865	adcs	$acc3,$acc3,$a3
866	ldp	$a2,$a3,[$np,#8*2]
867	adcs	$acc4,$acc4,$a4
868	adcs	$acc5,$acc5,$a5
869	ldp	$a4,$a5,[$np,#8*4]
870	adcs	$acc6,$acc6,$a6
871	mov	$cnt,#-8*8
872	adcs	$acc7,$acc7,$a7
873	ldp	$a6,$a7,[$np,#8*6]
874	add	$np,$np,#8*8
875	//adc	$carry,xzr,xzr		// moved above
876	b	.Lsqr8x_tail
877
878.align	4
879.Lsqr8x_tail_break:
880	ldr	$n0,[x29,#112]		// pull n0
881	add	$cnt,$tp,#8*8		// end of current t[num] window
882
883	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
884	adcs	$t0,$acc0,$a0
885	adcs	$t1,$acc1,$a1
886	ldp	$acc0,$acc1,[$rp,#8*0]
887	adcs	$acc2,$acc2,$a2
888	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
889	adcs	$acc3,$acc3,$a3
890	ldp	$a2,$a3,[$t2,#8*2]
891	adcs	$acc4,$acc4,$a4
892	adcs	$acc5,$acc5,$a5
893	ldp	$a4,$a5,[$t2,#8*4]
894	adcs	$acc6,$acc6,$a6
895	adcs	$acc7,$acc7,$a7
896	ldp	$a6,$a7,[$t2,#8*6]
897	add	$np,$t2,#8*8
898	adc	$topmost,xzr,xzr	// top-most carry
899	mul	$na0,$n0,$acc0
900	stp	$t0,$t1,[$tp,#8*0]
901	stp	$acc2,$acc3,[$tp,#8*2]
902	ldp	$acc2,$acc3,[$rp,#8*2]
903	stp	$acc4,$acc5,[$tp,#8*4]
904	ldp	$acc4,$acc5,[$rp,#8*4]
905	cmp	$cnt,x29		// did we hit the bottom?
906	stp	$acc6,$acc7,[$tp,#8*6]
907	mov	$tp,$rp			// slide the window
908	ldp	$acc6,$acc7,[$rp,#8*6]
909	mov	$cnt,#8
910	b.ne	.Lsqr8x_reduction
911
912	// Final step. We see if result is larger than modulus, and
913	// if it is, subtract the modulus. But comparison implies
914	// subtraction. So we subtract modulus, see if it borrowed,
915	// and conditionally copy original value.
916	ldr	$rp,[x29,#96]		// pull rp
917	add	$tp,$tp,#8*8
918	subs	$t0,$acc0,$a0
919	sbcs	$t1,$acc1,$a1
920	sub	$cnt,$num,#8*8
921	mov	$ap_end,$rp		// $rp copy
922
923.Lsqr8x_sub:
924	sbcs	$t2,$acc2,$a2
925	ldp	$a0,$a1,[$np,#8*0]
926	sbcs	$t3,$acc3,$a3
927	stp	$t0,$t1,[$rp,#8*0]
928	sbcs	$t0,$acc4,$a4
929	ldp	$a2,$a3,[$np,#8*2]
930	sbcs	$t1,$acc5,$a5
931	stp	$t2,$t3,[$rp,#8*2]
932	sbcs	$t2,$acc6,$a6
933	ldp	$a4,$a5,[$np,#8*4]
934	sbcs	$t3,$acc7,$a7
935	ldp	$a6,$a7,[$np,#8*6]
936	add	$np,$np,#8*8
937	ldp	$acc0,$acc1,[$tp,#8*0]
938	sub	$cnt,$cnt,#8*8
939	ldp	$acc2,$acc3,[$tp,#8*2]
940	ldp	$acc4,$acc5,[$tp,#8*4]
941	ldp	$acc6,$acc7,[$tp,#8*6]
942	add	$tp,$tp,#8*8
943	stp	$t0,$t1,[$rp,#8*4]
944	sbcs	$t0,$acc0,$a0
945	stp	$t2,$t3,[$rp,#8*6]
946	add	$rp,$rp,#8*8
947	sbcs	$t1,$acc1,$a1
948	cbnz	$cnt,.Lsqr8x_sub
949
950	sbcs	$t2,$acc2,$a2
951	 mov	$tp,sp
952	 add	$ap,sp,$num
953	 ldp	$a0,$a1,[$ap_end,#8*0]
954	sbcs	$t3,$acc3,$a3
955	stp	$t0,$t1,[$rp,#8*0]
956	sbcs	$t0,$acc4,$a4
957	 ldp	$a2,$a3,[$ap_end,#8*2]
958	sbcs	$t1,$acc5,$a5
959	stp	$t2,$t3,[$rp,#8*2]
960	sbcs	$t2,$acc6,$a6
961	 ldp	$acc0,$acc1,[$ap,#8*0]
962	sbcs	$t3,$acc7,$a7
963	 ldp	$acc2,$acc3,[$ap,#8*2]
964	sbcs	xzr,$topmost,xzr	// did it borrow?
965	ldr	x30,[x29,#8]		// pull return address
966	stp	$t0,$t1,[$rp,#8*4]
967	stp	$t2,$t3,[$rp,#8*6]
968
	// Branchless copy: on borrow (lo) csel keeps the original t[]
	// word, otherwise the subtracted word already in rp[] stays;
	// the stack scratch is wiped with zeros along the way.
969	sub	$cnt,$num,#8*4
970.Lsqr4x_cond_copy:
971	sub	$cnt,$cnt,#8*4
972	csel	$t0,$acc0,$a0,lo
973	 stp	xzr,xzr,[$tp,#8*0]
974	csel	$t1,$acc1,$a1,lo
975	ldp	$a0,$a1,[$ap_end,#8*4]
976	ldp	$acc0,$acc1,[$ap,#8*4]
977	csel	$t2,$acc2,$a2,lo
978	 stp	xzr,xzr,[$tp,#8*2]
979	 add	$tp,$tp,#8*4
980	csel	$t3,$acc3,$a3,lo
981	ldp	$a2,$a3,[$ap_end,#8*6]
982	ldp	$acc2,$acc3,[$ap,#8*6]
983	add	$ap,$ap,#8*4
984	stp	$t0,$t1,[$ap_end,#8*0]
985	stp	$t2,$t3,[$ap_end,#8*2]
986	add	$ap_end,$ap_end,#8*4
987	 stp	xzr,xzr,[$ap,#8*0]
988	 stp	xzr,xzr,[$ap,#8*2]
989	cbnz	$cnt,.Lsqr4x_cond_copy
990
991	csel	$t0,$acc0,$a0,lo
992	 stp	xzr,xzr,[$tp,#8*0]
993	csel	$t1,$acc1,$a1,lo
994	 stp	xzr,xzr,[$tp,#8*2]
995	csel	$t2,$acc2,$a2,lo
996	csel	$t3,$acc3,$a3,lo
997	stp	$t0,$t1,[$ap_end,#8*0]
998	stp	$t2,$t3,[$ap_end,#8*2]
999
1000	b	.Lsqr8x_done
1001
1002.align	4
1003.Lsqr8x8_post_condition:
1004	adc	$carry,xzr,xzr
1005	ldr	x30,[x29,#8]		// pull return address
1006	// $acc0-7,$carry hold result, $a0-7 hold modulus
1007	subs	$a0,$acc0,$a0
1008	ldr	$ap,[x29,#96]		// pull rp
1009	sbcs	$a1,$acc1,$a1
1010	 stp	xzr,xzr,[sp,#8*0]
1011	sbcs	$a2,$acc2,$a2
1012	 stp	xzr,xzr,[sp,#8*2]
1013	sbcs	$a3,$acc3,$a3
1014	 stp	xzr,xzr,[sp,#8*4]
1015	sbcs	$a4,$acc4,$a4
1016	 stp	xzr,xzr,[sp,#8*6]
1017	sbcs	$a5,$acc5,$a5
1018	 stp	xzr,xzr,[sp,#8*8]
1019	sbcs	$a6,$acc6,$a6
1020	 stp	xzr,xzr,[sp,#8*10]
1021	sbcs	$a7,$acc7,$a7
1022	 stp	xzr,xzr,[sp,#8*12]
1023	sbcs	$carry,$carry,xzr	// did it borrow?
1024	 stp	xzr,xzr,[sp,#8*14]
1025
1026	// $a0-7 hold result-modulus
1027	csel	$a0,$acc0,$a0,lo
1028	csel	$a1,$acc1,$a1,lo
1029	csel	$a2,$acc2,$a2,lo
1030	csel	$a3,$acc3,$a3,lo
1031	stp	$a0,$a1,[$ap,#8*0]
1032	csel	$a4,$acc4,$a4,lo
1033	csel	$a5,$acc5,$a5,lo
1034	stp	$a2,$a3,[$ap,#8*2]
1035	csel	$a6,$acc6,$a6,lo
1036	csel	$a7,$acc7,$a7,lo
1037	stp	$a4,$a5,[$ap,#8*4]
1038	stp	$a6,$a7,[$ap,#8*6]
1039
1040.Lsqr8x_done:
1041	ldp	x19,x20,[x29,#16]
1042	mov	sp,x29
1043	ldp	x21,x22,[x29,#32]
1044	mov	x0,#1
1045	ldp	x23,x24,[x29,#48]
1046	ldp	x25,x26,[x29,#64]
1047	ldp	x27,x28,[x29,#80]
1048	ldr	x29,[sp],#128
1049	// x30 is popped earlier
1050	AARCH64_VALIDATE_LINK_REGISTER
1051	ret
1052.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
1053___
1054}
1055
1056{
1057########################################################################
1058# Even though this might look as ARMv8 adaptation of mulx4x_mont from
1059# x86_64-mont5 module, it's different in sense that it performs
1060# reduction 256 bits at a time.
1061
1062my ($a0,$a1,$a2,$a3,
1063    $t0,$t1,$t2,$t3,
1064    $m0,$m1,$m2,$m3,
1065    $acc0,$acc1,$acc2,$acc3,$acc4,
1066    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
1067my  $bp_end=$rp;
1068my  ($carry,$topmost) = ($rp,"x30");
1069
$code.=<<___;
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
	// return address.
	//
	// Montgomery multiplication processed in 4-limb chunks, with the
	// reduction interleaved into the multiplication loops. Frame layout:
	// 128 bytes hold x29/x30 and callee-saved x19-x28; below that, sp is
	// lowered by num*8+32 bytes of scratch for the t[] vector ("alloca").
	// rp and &b[num] are offloaded to [x29,#96] so their registers can be
	// reused inside the loops.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	$tp,sp,$num,lsl#3
	lsl	$num,$num,#3		// num is in limbs on entry, in bytes from here on
	ldr	$n0,[$n0]		// *n0
	sub	sp,$tp,#8*4		// alloca

	add	$t0,$bp,$num
	add	$ap_end,$ap,$num
	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]

	ldr	$bi,[$bp,#8*0]		// b[0]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$cnt,#0
	mov	$tp,sp

	// Multiply a[0..3] by b[0..3] with the Montgomery reduction folded
	// in. $cnt steps 0,8,16,24 (masked with 31), so the loop runs four
	// times, once per b[i]; each t[0]*n0 factor is stashed at [$tp] for
	// the tail loops below.
.Loop_mul4x_1st_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	by Montgomery's choice of n0, lo(n[0]*t[0]*n0) == -$acc0
	//	(mod 2^64); adding it would give 0 with carry-out iff
	//	$acc0 != 0, which is exactly what "subs $acc0,#1" computes
	//	without performing the multiplication.
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	sub	$t0,$ap_end,$ap
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_reduction

	cbz	$t0,.Lmul4x4_post_condition

	// Propagate b[0..3] across the remaining a[4..] limbs, reusing the
	// t[0]*n0 values saved at [sp] by the loop above.
	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldr	$mi,[sp]		// a[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.Loop_mul4x_1st_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	adcs	$acc4,$acc4,$carry
	umulh	$t3,$m3,$mi
	adc	$carry,xzr,xzr
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_tail

	sub	$t1,$ap_end,$num	// rewound $ap
	cbz	$t0,.Lmul4x_proceed

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	// Advance to the next 4 limbs of b[] and restart from a[0]/n[0];
	// partial results live in the t[] scratch on the stack.
	ldr	$bi,[$bp,#8*4]!		// *++b
	adc	$topmost,$carry,xzr
	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
	sub	$np,$np,$num		// rewind np
	ldp	$a2,$a3,[$t1,#8*2]
	add	$ap,$t1,#8*4

	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	ldp	$acc2,$acc3,[sp,#8*6]

	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	mov	$tp,sp
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr

.align	4
.Loop_mul4x_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0)
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0	// see note in first reduction loop
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_reduction

	adc	$carry,$carry,xzr
	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr

	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.align	4
.Loop_mul4x_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc4,$acc4,$carry
	ldr	$mi,[sp,$cnt]		// next a[0]*n0
	adc	$carry,xzr,xzr
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_tail

	sub	$t1,$np,$num		// rewound np?
	adc	$carry,$carry,xzr
	cbz	$t0,.Loop_mul4x_break

	ldp	$t0,$t1,[$tp,#8*4]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	// End of one outer pass: flush the top words, decide whether all of
	// b[] has been consumed (compare $bp against &b[num] saved earlier).
	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
	adds	$acc0,$acc0,$topmost
	add	$bp,$bp,#8*4		// bp++
	adcs	$acc1,$acc1,xzr
	sub	$ap,$ap,$num		// rewind ap
	adcs	$acc2,$acc2,xzr
	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	adcs	$acc3,$acc3,xzr
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	adc	$topmost,$carry,xzr
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	cmp	$bp,$t3			// done yet?
	ldp	$acc2,$acc3,[sp,#8*6]
	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$t1,#8*2]
	add	$np,$t1,#8*4
	b.eq	.Lmul4x_post

	ldr	$bi,[$bp]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	adds	$ap,$ap,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$tp,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	$rp,$t2
	mov	$ap_end,$t2		// $rp copy
	subs	$t0,$acc0,$m0
	add	$tp,sp,#8*8
	sbcs	$t1,$acc1,$m1
	sub	$cnt,$num,#8*4

.Lmul4x_sub:
	sbcs	$t2,$acc2,$m2
	ldp	$m0,$m1,[$np,#8*0]
	sub	$cnt,$cnt,#8*4
	ldp	$acc0,$acc1,[$tp,#8*0]
	sbcs	$t3,$acc3,$m3
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	ldp	$acc2,$acc3,[$tp,#8*2]
	add	$tp,$tp,#8*4
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc0,$m0
	stp	$t2,$t3,[$rp,#8*2]
	add	$rp,$rp,#8*4
	sbcs	$t1,$acc1,$m1
	cbnz	$cnt,.Lmul4x_sub

	sbcs	$t2,$acc2,$m2
	 mov	$tp,sp
	 add	$ap,sp,#8*4
	 ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$m3
	stp	$t0,$t1,[$rp,#8*0]
	 ldp	$a2,$a3,[$ap_end,#8*2]
	stp	$t2,$t3,[$rp,#8*2]
	 ldp	$acc0,$acc1,[$ap,#8*0]
	 ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	// Branchless (csel-based) select between the subtracted value (no
	// borrow) and the original t[] (borrow), zeroing the t[] scratch on
	// the stack as it is consumed.
	sub	$cnt,$num,#8*4
.Lmul4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*2]
	 add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	cbnz	$cnt,.Lmul4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	 stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*3]
	csel	$t3,$acc3,$a3,lo
	 stp	xzr,xzr,[$tp,#8*4]
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	// Reached when $ap_end-$ap hit zero after the very first 4-limb
	// pass: the entire result still lives in $acc0-3/$carry.
	adc	$carry,$carry,xzr
	ldr	$ap,[x29,#96]		// pull rp
	// $acc0-3,$carry hold result, $m0-7 hold modulus
	subs	$a0,$acc0,$m0
	ldr	x30,[x29,#8]		// pull return address
	sbcs	$a1,$acc1,$m1
	 stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$m2
	 stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$m3
	 stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,$carry,xzr		// did it borrow?
	 stp	xzr,xzr,[sp,#8*6]

	// $a0-3 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	stp	$a2,$a3,[$ap,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
___
1515}
# Append the identification string (and trailing alignment) that
# CRYPTOGAMS modules conventionally embed in the generated object.
$code.=<<___;
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
1520
# Emit the accumulated assembly on stdout and verify the final flush
# succeeded, so a failed write cannot silently truncate the output.
print STDOUT $code;

close(STDOUT) || die "error closing STDOUT: $!";
1524