# xref: /aosp_15_r20/external/boringssl/src/crypto/chacha/asm/chacha-armv4.pl (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.4    1xNEON      3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%    21.8        14.1
# Cortex-A8		10.5(*)/+160%   13.9        6.35
# Cortex-A9		12.9(**)/+110%  14.3        6.50
# Cortex-A15		11.0/+40%       16.0        5.00
# Snapdragon S4		11.5/+125%      13.6        4.90
#
# (*)	most "favourable" result for aligned data on little-endian
#	processor, result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%,
#	but then Snapdragon S4 and Cortex-A8 results get
#	20-25% worse;

# Command-line handling: the first argument is either the perlasm
# "flavour" (e.g. linux32, ios32) consumed by arm-xlate.pl, or the
# output file name itself; otherwise scan remaining arguments for a
# file-name-looking token.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Locate the arm-xlate.pl translator either next to this script or
    # in ../../perlasm, and pipe our output through it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Three-arg piped open, and fail loudly instead of silently writing
    # to a dead pipe.  OUT stays a bareword handle because STDOUT is
    # aliased to it below.
    open OUT,"|-","\"$^X\" \"$xlate\" $flavour \"$output\""
        or die "can't call $xlate: $!";
    *STDOUT=*OUT;
} else {
    # Three-arg open avoids mode injection via $output and reports
    # failure instead of generating code into the void.
    open OUT,">",$output or die "can't open $output: $!";
    *STDOUT=*OUT;
}
53
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
# Any call to an undefined &opcode(args...) lands here and is turned
# into one line of assembly text appended to the global $code buffer.
# Underscores in the Perl-level name become dots (e.g. &vadd_i32 ->
# "vadd.i32"), and a purely numeric final argument is prefixed with
# '#' to form an immediate operand.
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);	# numeric last arg => immediate
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
60
# Integer-unit register map: @x[i] names the register holding ChaCha
# state word i; entries marked "x" have no register and live in the
# off-load area on the stack (see ROUND below).  r12 and r14 carry
# words 12 and 14 between memory reloads.
my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
# Four scratch registers used for the off-loaded 'c' and 'd' words.
my @t=map("r$_",(8..11));
63
sub ROUND {
# Emit (as a list of "&op(...)" strings to be eval'ed by the caller)
# four interleaved integer-only quarter-rounds.  ($a0,$b0,$c0,$d0) are
# the state-word indices of the first quarter-round; the other three
# quarter-rounds' indices are derived by incrementing the low two bits
# (see the table below).  $odd selects the even-round vs odd-round
# spill/reload pattern for the memory-resident 'c' and 'd' words.
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

	# Consider order in which variables are addressed by their
	# index:
	#
	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14
	#
	# 'a', 'b' are permanently allocated in registers, @x[0..7],
	# while 'c's and pair of 'd's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end. If you observe 'd' column, you'll
	# notice that 15 and 13 are reused in next pair of rounds.
	# This is why these two are chosen for offloading to memory,
	# to make loads count more.
							push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#16')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a0],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#20')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a0],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#25')"		);
							push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')",
	"&ldr	($xd,'[sp,#4*(16+$d2)]')"		) if ($odd);
							push @ret,(
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#25')"		);
							push @ret,(
	 "&str	($xd_,'[sp,#4*(16+$d1)]')",
	 "&ldr	($xd_,'[sp,#4*(16+$d3)]')"		) if (!$odd);
							push @ret,(
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')"	);

	# Switch $xd/$xd_ to the registers holding the 'd' words of the
	# second pair of quarter-rounds (the other one stays spilled).
	$xd=@x[$d2]					if (!$odd);
	$xd_=@x[$d3]					if ($odd);
							push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#16')",
	 "&str	($xc_,'[sp,#4*(16+$c1)]')",
	 "&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a2],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#20')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a2],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#25')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#25')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')"	);

	@ret;
}
172
# Integer-only implementation: file preamble, constants (.Lsigma/.Lone),
# ChaCha20_ctr32_nohw prologue (state setup on a 4*(16+16+3)-byte stack
# frame) and the head of the 20-round loop.  NOTE: this is an
# interpolating heredoc — @x[...]/@t[...] expand to register names.
$code.=<<___;
#include <openssl/arm_arch.h>

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch  armv7-a

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0

.globl	ChaCha20_ctr32_nohw
.type	ChaCha20_ctr32_nohw,%function
.align	5
ChaCha20_ctr32_nohw:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
	adr	r14,.Lsigma
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12,  [sp,#4*(32+1)]	@ save inp
	str	r14,  [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
	# One double round: even (column) quarter-rounds, then odd
	# (diagonal) quarter-rounds; each string appends one instruction
	# to $code via the AUTOLOAD thunk.
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
# Post-round processing for the aligned (or ARMv7+, which tolerates
# misaligned word access) path: accumulate key material into the
# 512-bit result, XOR with input when len>=64 (hs), emit output, bump
# the counter and loop or fall through to the tail/unaligned handlers.
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH>=6 || !defined(__ARMEB__)
# if __ARM_ARCH<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	 add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	 add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	 strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	 strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	 addhi	@t[0],@t[0],#1		@ next counter value
	 strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	 it	ne
# endif
	 ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	 subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH<7
	ldr	@t[3],[sp,#4*(3)]
___
# Generate the byte-at-a-time unaligned/endian-neutral path, four state
# words per iteration ($i counts state words 0..15; $j folds into the
# 8 available registers since the second half is loaded at $i==8).
# When len<64 (lo) the key-material words are zeroed so the raw key
# stream is written to the stack buffer for .Ltail to consume.
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}		@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]		@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]		@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]		@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1			@ next counter value
	strhi	@t[0],[sp,#4*(12)]		@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]		@ zero or ...
	ldrhsb	@t[0],[r12],#16			@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]		@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]		@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	 strb	@x[$j+0],[r14],#16		@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	 strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]		@ load more input
	ldrhsb	@t[1],[r12,#-10]
	 strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	 strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]		@ load more input
	ldrhsb	@t[1],[r12,#-9]
	 strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	 strb	@x[$j+1],[r14,#-10]
	 strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	 strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	 strb	@x[$j+2],[r14,#-5]
	 strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}		@ load key material
___
}
# Epilogue of the unaligned path, plus the shared byte-by-byte tail
# (.Ltail XORs the partially-used key-stream block kept on the stack
# with the remaining <64 input bytes) and function epilogue.
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]		@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64			@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
___
602
{{{
# NEON (3xNEON+1xIALU) section.  Each of the three NEON "threads"
# processes one 64-byte block with the whole state in four q registers.
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

sub NEONROUND {
# Return the instruction strings for one NEON round on state rows
# ($a,$b,$c,$d) with scratch register $t.  $odd selects the diagonal
# round, which differs from the column round only in the final vext
# lane rotations that (un)shuffle the b/d rows.
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}
636
# ChaCha20_ctr32_neon prologue: save d8-d15 per AAPCS, replicate the
# state into three NEON register sets (counters +0/+1/+2) while the
# integer unit runs a fourth block (counter+3), and enter the round
# loop.  qN#lo/qN#hi tokens are mapped to d registers at print time.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.globl	ChaCha20_ctr32_neon
.type	ChaCha20_ctr32_neon,%function
.align	5
ChaCha20_ctr32_neon:
	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb		sp!,{r0-r2,r4-r11,lr}
	adr		r14,.Lsigma
	vstmdb		sp!,{d8-d15}		@ ABI spec says so
	stmdb		sp!,{r0-r3}

	vld1.32		{$b0-$c0},[r3]		@ load key
	ldmia		r3,{r4-r11}		@ load key

	sub		sp,sp,#4*(16+16)
	vld1.32		{$d0},[r12]		@ load counter and nonce
	add		r12,sp,#4*8
	ldmia		r14,{r0-r3}		@ load sigma
	vld1.32		{$a0},[r14]!		@ load sigma
	vld1.32		{$t0},[r14]		@ one
	vst1.32		{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32		{$a0-$b0},[sp]		@ copy sigma|1/2key

	str		r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str		r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr		$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr		$t1#lo,[sp,#4*(16+2)]
	vmov		$a1,$a0
	vstr		$t2#lo,[sp,#4*(16+4)]
	vmov		$a2,$a0
	vmov		$b1,$b0
	vmov		$b2,$b0
	b		.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia		sp,{r0-r9}		@ load key material
	cmp		@t[3],#64*2		@ if len<=64*2
	bls		.Lbreak_neon		@ switch to integer-only
	vmov		$a1,$a0
	str		@t[3],[sp,#4*(32+2)]	@ save len
	vmov		$a2,$a0
	str		r12,  [sp,#4*(32+1)]	@ save inp
	vmov		$b1,$b0
	str		r14,  [sp,#4*(32+0)]	@ save out
	vmov		$b2,$b0
.Loop_neon_enter:
	ldr		@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0		@ counter+1
	ldr		@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov		$c1,$c0
	ldr		@t[2], [sp,#4*(13)]
	vmov		$c2,$c0
	ldr		@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0		@ counter+2
	str		@t[3], [sp,#4*(16+15)]
	mov		@t[3],#10
	add		@x[12],@x[12],#3	@ counter+3
	b		.Loop_neon

.align	4
.Loop_neon:
	subs		@t[3],@t[3],#1
___
	# Interleave instructions of three NEON block "threads" with the
	# integer-only fourth block: for each NEON instruction of thread0,
	# one instruction from thread1, one from thread2 and three scalar
	# instructions from @thread3 are emitted (3x19 NEON vs 56/57
	# scalar strings per round keeps the streams roughly in step).
	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	# Same again for the odd (diagonal) half of the double round.
	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
# NEON post-round processing: accumulate key material into all four
# blocks, handle the full 256-byte fast path, the .Lbreak_neon frame
# switch into the integer-only loop for short remainders, the 64/128/
# 192-byte tails, the byte-wise .Loop_tail_neon, and the epilogue.
$code.=<<___;
	bne		.Loop_neon

	add		@t[3],sp,#32
	vld1.32		{$t0-$t1},[sp]		@ load key material
	vld1.32		{$t2-$t3},[@t[3]]

	ldr		@t[3],[sp,#4*(32+2)]	@ load len

	str		@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str		@t[1], [sp,#4*(16+9)]
	str		@x[12],[sp,#4*(16+12)]
	str		@t[2], [sp,#4*(16+13)]
	str		@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	ldr		r12,[sp,#4*(32+1)]	@ load inp
	ldr		r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0		@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr		$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3

	cmp		@t[3],#64*4
	blo		.Ltail_neon

	vld1.8		{$t0-$t1},[r12]!	@ load input
	 mov		@t[3],sp
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0		@ xor with input
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	 vst1.8		{$a0-$b0},[r14]!	@ store output
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
	 veor		$t0#hi,$t0#hi,$t0#hi
	 vldr		$t0#lo,[sp,#4*(16+4)]	@ four
	veor		$b2,$b2,$t1
	 vld1.32	{$c0-$d0},[@t[3]]
	veor		$c2,$c2,$t2
	 vst1.8		{$a1-$b1},[r14]!
	veor		$d2,$d2,$t3
	 vst1.8		{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	 vst1.8		{$a2-$b2},[r14]!
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
	 vst1.8		{$c2-$d2},[r14]!
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]	@ xor with input
	 add		@t[0],sp,#4*(4)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[5],@x[5],@t[1]
	ldr		@t[1],[r12,#-12]
	add		@x[6],@x[6],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[7],@x[7],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
	 add		@t[0],sp,#4*(8)
	eor		@x[5],@x[5],@t[1]
	str		@x[4],[r14],#16		@ store output
	eor		@x[6],@x[6],@t[2]
	str		@x[5],[r14,#-12]
	eor		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[6],[r14,#-8]
	 add		@x[0],sp,#4*(16+8)
	str		@x[7],[r14,#-4]

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]
	 add		@t[0],sp,#4*(12)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],@t[0],#4		@ next counter value
	add		@x[5],@x[5],@t[1]
	 str		@t[0],[sp,#4*(12)]	@ save next counter value
	ldr		@t[0],[r12],#16		@ load input
	add		@x[6],@x[6],@t[2]
	 add		@x[4],@x[4],#3		@ counter+3
	ldr		@t[1],[r12,#-12]
	add		@x[7],@x[7],@t[3]
	ldr		@t[2],[r12,#-8]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	 ldrhi		@t[0],[sp,#4*(32+2)]	@ re-load len
	eor		@x[5],@x[5],@t[1]
	eor		@x[6],@x[6],@t[2]
	str		@x[4],[r14],#16		@ store output
	eor		@x[7],@x[7],@t[3]
	str		@x[5],[r14,#-12]
	 sub		@t[3],@t[0],#64*4	@ len-=64*4
	str		@x[6],[r14,#-8]
	str		@x[7],[r14,#-4]
	bhi		.Loop_neon_outer

	b		.Ldone_neon

.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from NEON frame, but save to integer-only one; distance
	@ between the two is 4*(32+4+16-32)=4*(20).

	str		@t[3], [sp,#4*(20+32+2)]	@ save len
	 add		@t[3],sp,#4*(32+4)
	str		r12,   [sp,#4*(20+32+1)]	@ save inp
	str		r14,   [sp,#4*(20+32+0)]	@ save out

	ldr		@x[12],[sp,#4*(16+10)]
	ldr		@x[14],[sp,#4*(16+11)]
	 vldmia		@t[3],{d8-d15}			@ fulfill ABI requirement
	str		@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str		@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr		@t[3], [sp,#4*(15)]
	ldr		@x[12],[sp,#4*(12)]		@ modulo-scheduled load
	ldr		@t[2], [sp,#4*(13)]
	ldr		@x[14],[sp,#4*(14)]
	str		@t[3], [sp,#4*(20+16+15)]
	add		@t[3],sp,#4*(20)
	vst1.32		{$a0-$b0},[@t[3]]!		@ copy key
	add		sp,sp,#4*(20)			@ switch frame
	vst1.32		{$c0-$d0},[@t[3]]
	mov		@t[3],#10
	b		.Loop				@ go integer-only

.align	4
.Ltail_neon:
	cmp		@t[3],#64*3
	bhs		.L192_or_more_neon
	cmp		@t[3],#64*2
	bhs		.L128_or_more_neon
	cmp		@t[3],#64*1
	bhs		.L64_or_more_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a0-$b0},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c0-$d0},[@t[0]]
	b		.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vst1.8		{$a0-$b0},[r14]!
	vst1.8		{$c0-$d0},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a1-$b1},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c1-$d1},[@t[0]]
	sub		@t[3],@t[3],#64*1	@ len-=64*1
	b		.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	 vst1.8		{$a0-$b0},[r14]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vst1.8		{$a1-$b1},[r14]!
	vst1.8		{$c1-$d1},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a2-$b2},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c2-$d2},[@t[0]]
	sub		@t[3],@t[3],#64*2	@ len-=64*2
	b		.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$a0-$b0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vst1.8		{$c0-$d0},[r14]!
	veor		$b2,$b2,$t1
	 vst1.8		{$a1-$b1},[r14]!
	veor		$c2,$c2,$t2
	 vst1.8		{$c1-$d1},[r14]!
	veor		$d2,$d2,$t3
	vst1.8		{$a2-$b2},[r14]!
	vst1.8		{$c2-$d2},[r14]!

	beq		.Ldone_neon

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(4)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		sp,{@x[0]-@x[7]}
	 add		@x[0],sp,#4*(16+8)

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(12)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	 add		@x[4],@x[4],#3		@ counter+3
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldr		@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		@t[0],{@x[0]-@x[7]}
	 add		@t[2],sp,#4*(0)
	 sub		@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb		@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb		@t[1],[r12],#1		@ read input
	subs		@t[3],@t[3],#1
	eor		@t[0],@t[0],@t[1]
	strb		@t[0],[r14],#1		@ store output
	bne		.Loop_tail_neon

.Ldone_neon:
	add		sp,sp,#4*(32+4)
	vldmia		sp,{d8-d15}
	add		sp,sp,#4*(16+3)
	ldmia		sp!,{r4-r11,pc}
.size	ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon
#endif
___
}}}
1126
# Post-process the accumulated source: expand `...` Perl snippets,
# map qN#lo/qN#hi NEON register halves onto their d-register aliases
# (d(2N) and d(2N+1) respectively), and emit the result line by line.
for my $line (split /\n/, $code) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;
	$line =~ s{\bq([0-9]+)\#(lo|hi)}{sprintf "d%d",2*$1+($2 eq "hi")}ge;
	print $line, "\n";
}
close STDOUT or die "error closing STDOUT: $!";
1135