# xref: /aosp_15_r20/external/boringssl/src/crypto/fipsmodule/bn/asm/armv4-mont.pl (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to the pre-bn_mul_mont code
# base and compiler-generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. A Windows CE port would be trivial, as it's exclusively
# about decorations; ABI and instruction syntax are identical.

# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# an incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because the gain on
# others outweighs the marginal loss on Cortex-A9.

# September 2015
#
# Align Cortex-A9 performance with November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than the integer-only one on this
# processor. But this optimization further improved performance even
# on other processors: the NEON code path is ~45-180% faster than the
# original integer-only one on Cortex-A8, ~10-210% on Cortex-A15,
# ~70-450% on Snapdragon S4.

# Command line handling.  The first argument is either the output file
# (recognised by its "name.ext" shape) or a flavour; in the latter case the
# remaining arguments are scanned for the first one that looks like a file
# name.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

# Redirect STDOUT to the output.  With a real flavour the generated text is
# piped through arm-xlate.pl (looked up next to this script, then in the
# shared perlasm directory); otherwise it is written straight to $output.
# Both opens are checked so that a failure is reported where it happens
# rather than only when STDOUT is closed at the end of the script.
if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
        or die "can't call $xlate: $!";
    *STDOUT=*OUT;
} else {
    open OUT,">$output" or die "can't open $output: $!";
    *STDOUT=*OUT;
}

# Register aliases interpolated into the bn_mul_mont_nohw assembly below.
# r2 is shared: it is bp on entry, carries the current b word inside the
# loops and becomes rp for the final subtract/copy.
$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
# Counting in words from &tp[num-1]: tp[num-1], tp[num], then the ten
# registers saved by the prologue, so slot 12 is the first pushed argument.
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
# The num slot is reused to hold the end-of-bp pointer once num is in a register.
$_num="$num,#15*4";	$_bpend=$_num;

# bn_mul_mont_nohw: integer-only Montgomery multiplication.
# As used by the code below: r0=rp, r1=ap, r2=bp, r3=np, [sp]=&n0 and
# [sp,#4]=num on entry.  Returns 0 in r0 without touching anything when
# num<2, otherwise 1.  tp[] (num+1 words) is allocated on the stack and
# overwritten before returning.
$code=<<___;
#include <openssl/arm_arch.h>

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch  armv7-a

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.global	bn_mul_mont_nohw
.type	bn_mul_mont_nohw,%function

.align	5
bn_mul_mont_nohw:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
	cmp	ip,#2
	mov	$num,ip			@ load num
#ifdef	__thumb2__
	ittt	lt
#endif
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num-1]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	mov	$tj,sp
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,$tj		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num-1]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
#ifdef	__thumb2__
	itt	ne
#endif
	movne	$tj,sp
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	mov	$aj,sp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,$aj		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

.Lcopy:	ldr	$tj,[$tp]		@ conditional copy
	ldr	$aj,[$rp]
	str	sp,[$tp],#4		@ zap tp
#ifdef	__thumb2__
	it	cc
#endif
	movcc	$aj,$tj
	str	$aj,[$rp],#4
	teq	$tp,$num		@ preserve carry
	bne	.Lcopy

	mov	sp,$num
	add	sp,sp,#4		@ step past tp[num], the extra word
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont_nohw,.-bn_mul_mont_nohw
___
{
# NEON code path: bn_mul8x_mont_neon.
# Everything declared here is a generation-time alias for a NEON or core
# register; the heredocs interpolate the names into the emitted assembly.
# "qN#lo"/"qN#hi" are placeholders that the post-processing loop at the end
# of this script rewrites to the matching d register.
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));		# loaded from $aptr
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));		# loaded from $nptr
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));			# eight accumulators, rotated below
my ($Bi,$Ni,$M0)=map("d$_",(28..30));
my $zero="$Z#lo";
my $temp="$Temp#lo";

# r0-r3 arrive in registers; $n0 (a pointer, it is dereferenced with vld1)
# and $num are fetched from the caller's stack by the prologue.
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..10));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	bn_mul8x_mont_neon
.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block
	mov	ip,sp

	cmp	$num,#8
	bhi	.LNEON_8n

	@ special case for $num==8, everything is in register bank...

	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	sub		$toutptr,sp,$num,lsl#4
	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
	and		$toutptr,$toutptr,#-64
	vld1.32		{${M0}[0]}, [$n0,:32]
	mov		sp,$toutptr			@ alloca
	vzip.16		$Bi,$zero

	vmull.u32	@ACC[0],$Bi,${A0}[0]
	vmull.u32	@ACC[1],$Bi,${A0}[1]
	vmull.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmull.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor		$zero,$zero,$zero
	vmul.u32	$Ni,$Ni,$M0

	vmull.u32	@ACC[4],$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	@ACC[5],$Bi,${A2}[1]
	vmull.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmull.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	sub		$outer,$num,#1
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]

	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmov		$Temp,@ACC[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmov		@ACC[0],@ACC[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmov		@ACC[1],@ACC[2]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vmov		@ACC[2],@ACC[3]
	vmov		@ACC[3],@ACC[4]
	vshr.u64	$temp,$temp,#16
	vmov		@ACC[4],@ACC[5]
	vmov		@ACC[5],@ACC[6]
	vadd.u64	$temp,$temp,$Temp#hi
	vmov		@ACC[6],@ACC[7]
	veor		@ACC[7],@ACC[7]
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp

	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor		$zero,$zero,$zero
	subs		$outer,$outer,#1
	vmul.u32	$Ni,$Ni,$M0

	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]

	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmov		$Temp,@ACC[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmov		@ACC[0],@ACC[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmov		@ACC[1],@ACC[2]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vmov		@ACC[2],@ACC[3]
	vmov		@ACC[3],@ACC[4]
	vshr.u64	$temp,$temp,#16
	vmov		@ACC[4],@ACC[5]
	vmov		@ACC[5],@ACC[6]
	vadd.u64	$temp,$temp,$Temp#hi
	vmov		@ACC[6],@ACC[7]
	veor		@ACC[7],@ACC[7]
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	mov		$toutptr,sp
	vshr.u64	$temp,@ACC[0]#lo,#16
	mov		$inner,$num
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	add		$tinptr,sp,#96
	vshr.u64	$temp,@ACC[0]#hi,#16
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

	b	.LNEON_tail_entry

.align	4
.LNEON_8n:
	veor		@ACC[0],@ACC[0],@ACC[0]
	 sub		$toutptr,sp,#128
	veor		@ACC[1],@ACC[1],@ACC[1]
	 sub		$toutptr,$toutptr,$num,lsl#4
	veor		@ACC[2],@ACC[2],@ACC[2]
	 and		$toutptr,$toutptr,#-64
	veor		@ACC[3],@ACC[3],@ACC[3]
	 mov		sp,$toutptr			@ alloca
	veor		@ACC[4],@ACC[4],@ACC[4]
	 add		$toutptr,$toutptr,#256
	veor		@ACC[5],@ACC[5],@ACC[5]
	 sub		$inner,$num,#8
	veor		@ACC[6],@ACC[6],@ACC[6]
	veor		@ACC[7],@ACC[7],@ACC[7]

.LNEON_8n_init:
	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	subs		$inner,$inner,#8
	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64		{@ACC[6]-@ACC[7]},[$toutptr,:256]!
	bne		.LNEON_8n_init

	add		$tinptr,sp,#256
	vld1.32		{$A0-$A3},[$aptr]!
	add		$bnptr,sp,#8
	vld1.32		{${M0}[0]},[$n0,:32]
	mov		$outer,$num
	b		.LNEON_8n_outer

.align	4
.LNEON_8n_outer:
	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	add		$toutptr,sp,#128
	vld1.32		{$N0-$N3},[$nptr]!

	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	 veor		$zero,$zero,$zero
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	 vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	 vmul.u32	$Ni,$Ni,$M0
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32		{$Bi},[sp,:64]		@ put aside smashed b[8*i+0]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	 vzip.16	$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
# Seven unrolled steps.  Each step is emitted in two halves with the
# accumulator names rotated in between, so the register that was @ACC[0]
# in the first half is reloaded as @ACC[7] in the second.
for ($i=0; $i<7;) {
$code.=<<___;
	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	veor		$temp,$temp,$temp
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vzip.16		$Bi,$temp
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32		{$Ni},[$bnptr,:64]!	@ put aside smashed m[8*i+$i]
___
	# rotate the accumulator names by one; $i also feeds the comments
	push(@ACC,shift(@ACC));	$i++;
$code.=<<___;
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]!
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	 veor		$zero,$zero,$zero
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	 vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	 vmul.u32	$Ni,$Ni,$M0
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32		{$Bi},[$bnptr,:64]!	@ put aside smashed b[8*i+$i]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	 vzip.16	$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32		{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32		{$Ni},[$bnptr,:64]	@ put aside smashed m[8*i+$i]
	add		$bnptr,sp,#8		@ rewind
___
	# eighth rotation: the names are back in their initial order
	push(@ACC,shift(@ACC));
$code.=<<___;
	sub		$inner,$num,#8
	b		.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs		$inner,$inner,#8
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+0]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vld1.32		{$N0-$N3},[$nptr]!
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	it		ne
	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
# Seven more unrolled steps for the inner loop, again split around a
# rotation of the accumulator names.
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vld1.32		{$Bi},[$bnptr,:64]!	@ pull smashed b[8*i+$i]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vst1.64		{@ACC[0]},[$toutptr,:128]!
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+$i]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	it		ne
	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	it		eq
	subeq		$aptr,$aptr,$num,lsl#2	@ rewind
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vld1.32		{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	add		$bnptr,sp,#8		@ rewind
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vst1.64		{@ACC[0]},[$toutptr,:128]!
	vmlal.u32	@ACC[7],$Ni,${N3}[1]

	bne		.LNEON_8n_inner
___
	# eighth rotation of this group: names return to their initial order
	push(@ACC,shift(@ACC));
$code.=<<___;
	add		$tinptr,sp,#128
	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	veor		q2,q2,q2		@ $N0-$N1
	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	veor		q3,q3,q3		@ $N2-$N3
	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64		{@ACC[6]},[$toutptr,:128]

	subs		$outer,$outer,#8
	vld1.64		{@ACC[0]-@ACC[1]},[$tinptr,:256]!
	vld1.64		{@ACC[2]-@ACC[3]},[$tinptr,:256]!
	vld1.64		{@ACC[4]-@ACC[5]},[$tinptr,:256]!
	vld1.64		{@ACC[6]-@ACC[7]},[$tinptr,:256]!

	itt		ne
	subne		$nptr,$nptr,$num,lsl#2	@ rewind
	bne		.LNEON_8n_outer

	add		$toutptr,sp,#128
	vst1.64		{q2-q3}, [sp,:256]!	@ start wiping stack frame
	vshr.u64	$temp,@ACC[0]#lo,#16
	vst1.64		{q2-q3},[sp,:256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vst1.64		{q2-q3}, [sp,:256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vst1.64		{q2-q3}, [sp,:256]!
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

	mov		$inner,$num
	b		.LNEON_tail_entry

.align	4
.LNEON_tail:
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	vshr.u64	$temp,@ACC[0]#lo,#16
	vld1.64		{@ACC[2]-@ACC[3]}, [$tinptr, :256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vld1.64		{@ACC[4]-@ACC[5]}, [$tinptr, :256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vld1.64		{@ACC[6]-@ACC[7]}, [$tinptr, :256]!
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

.LNEON_tail_entry:
___
# Seven carry-propagation steps, each emitted for the next accumulator
# name in turn.
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vadd.u64	@ACC[1]#lo,@ACC[1]#lo,$temp
	vst1.32		{@ACC[0]#lo[0]}, [$toutptr, :32]!
	vshr.u64	$temp,@ACC[1]#lo,#16
	vadd.u64	@ACC[1]#hi,@ACC[1]#hi,$temp
	vshr.u64	$temp,@ACC[1]#hi,#16
	vzip.16		@ACC[1]#lo,@ACC[1]#hi
___
	push(@ACC,shift(@ACC));
}
	# eighth rotation: names return to their initial order
	push(@ACC,shift(@ACC));
$code.=<<___;
	vld1.64		{@ACC[0]-@ACC[1]}, [$tinptr, :256]!
	subs		$inner,$inner,#8
	vst1.32		{@ACC[7]#lo[0]},   [$toutptr, :32]!
	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
	subs	$aptr,sp,#0				@ clear carry flag
	add	$bptr,sp,$num,lsl#2

.LNEON_sub:
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [$aptr]				@ load top-most bit
	mov	r11,sp
	veor	q0,q0,q0
	sub	r11,$bptr,r11				@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11				@ rewind $rptr
	mov	$nptr,$bptr				@ second 3/4th of frame
	sbcs	r10,r10,#0				@ result is carry flag

.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr,  {r8-r11}
	it	cc
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	itt	cc
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	it	cc
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	it	cc
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
	itt	cc
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	it	cc
	movcc	r11,r7
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	mov	sp,ip
        vldmia  sp!,{d8-d15}
        ldmia   sp!,{r4-r11}
	ret						@ bx lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
___

# Post-process the generated text line by line and print it to STDOUT
# (which was redirected at the top of the script).
foreach (split("\n",$code)) {
	# evaluate any `...` expression embedded in the line
	s/\`([^\`]*)\`/eval $1/ge;

	# The three rewrites are chained with "or", so at most one of them is
	# applied to a line: qN#lo/qN#hi becomes d(2N)/d(2N+1); otherwise "ret"
	# becomes "bx lr"; otherwise a literal "bx lr" is emitted as its raw
	# encoding.
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge	or
	s/\bret\b/bx    lr/g						or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

# Closing also flushes; a failure here means the output was not written
# completely.
close STDOUT or die "error closing STDOUT: $!";
