#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases, and it supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates
# based on AES instruction latencies and issue rates. On Cortex-A53,
# an in-order execution core, this costs up to 10-15%, which is
# partially compensated by a dedicated code path for the 128-bit CBC
# encrypt case. On Cortex-A57, parallelizable-mode performance seems
# to be limited by the sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
#
# (*)	original 3.64/1.34/1.32 results were for the r0p0 revision
#	and remain the same even for the updated module;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$prefix="aes_hw";
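# All generated entry points are prefixed with "aes_hw": this file emits
# ${prefix}_set_encrypt_key, ${prefix}_set_decrypt_key, ${prefix}_encrypt,
# ${prefix}_decrypt, ${prefix}_cbc_encrypt and ${prefix}_ctr32_encrypt_blocks.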

$code=<<___;
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is mostly 32-bit mnemonics, integer code mostly 64-bit. The goal
# is to maintain both 32- and 64-bit code within a single module and
# transliterate common code to either flavour with regex voodoo.
#
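# For example, the shared line "vld1.32 {q8-q9},[$key]" comes out of the
# 32-bit post-processing pass below as "vld1.32 {q8-q9},[r3]" and out of
# the 64-bit pass as "ld1 {v16.4s-v17.4s},[x3]" (q8-q15 map to v16-v23
# there).
#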
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# On AArch64, put the data in .rodata and use adrp + add for compatibility
# with execute-only memory. On AArch32, put it in .text and use adr.
$code.= ".section .rodata\n" if ($flavour =~ /64/);
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.text

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

___
$code.=<<___	if ($flavour =~ /64/);
	adrp	$ptr,:pg_hi21:.Lrcon
	add	$ptr,$ptr,:lo12:.Lrcon
___
$code.=<<___	if ($flavour !~ /64/);
	adr	$ptr,.Lrcon
___
$code.=<<___;
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	AARCH64_VALID_CALL_TARGET
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo

	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
	// affected by silicon errata #1742098 [0] and #1655431 [1],
	// respectively, where the second instruction of an aese/aesmc
	// instruction pair may execute twice if an interrupt is taken right
	// after the first instruction consumes an input register of which a
	// single 32-bit lane has been updated the last time it was modified.
	//
	// This function uses a counter in one 32-bit lane. The vmov.32 lines
	// could write to $dat1 and $dat2 directly, but that trips this bug.
	// We write to $ivec and copy to the final register as a workaround.
	//
	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
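	//
	// Concretely: the counter lane is inserted into ${ivec}[3] first and
	// only then copied to $dat1/$dat2 with a full-width vorr, so the
	// aese inputs are never written one lane at a time.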
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	add		$tctr1, $ctr, #1
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${ivec}[3],$tctr1
	add		$ctr, $ctr, #2
	vorr		$dat1,$ivec,$ivec
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	vmov.32		${ivec}[3],$tctr2
	sub		$len,$len,#3		// bias
	vorr		$dat2,$ivec,$ivec
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 add		$tctr0,$ctr,#1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 rev		$tctr0,$tctr0
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 // Note the logic to update $dat0, $dat1, and $dat2 is written to work
	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
	 // 32-bit mode. See the comment above.
	 veor		$in2,$in2,$rndlast
	 vmov.32	${ivec}[3], $tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vorr		$dat0,$ivec,$ivec
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	 vmov.32	${ivec}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vorr		$dat1,$ivec,$ivec
	 vmov.32	${ivec}[3], $tctr2
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vorr		$dat2,$ivec,$ivec
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };
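
    # For illustration: unaes("aese", "v0.16b,v16.16b") would yield
    # ".inst 0x4e284a00 //aese v0.16b,v16.16b". The unaes substitution
    # below is commented out; with ".arch armv8-a+crypto" in effect the
    # assembler accepts the AES mnemonics directly.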

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	# Switch preprocessor checks to aarch64 versions.
	s/__ARME([BL])__/__AARCH64E$1__/go;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, hence
	    # the explicit .byte sequence. The correct solution would be
	    # the .inst directive, but older assemblers don't implement it :-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
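
    # For illustration: unaes("aese", "q0,q8") computes 0xf3b00320 and
    # emits ".byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8".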

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }
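
    # E.g. "vtbl.8 q10,{q3},q2" becomes the d-register pair
    # "vtbl.8 d20,{q3},d4" / "vtbl.8 d21,{q3},d5".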

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
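
    # E.g. "vdup.32 q9,q3[3]" becomes "vdup.32 q9,d7[1]": lane 3 of q3
    # lives in lane 1 of its upper d register, d7.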

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }
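
    # E.g. "vmov.32 q6[3],r9" becomes "vmov.32 d13[1],r9" by the same
    # q-to-d lane mapping.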

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";