#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# The initial version was developed in close cooperation with Ard
# Biesheuvel of Linaro from bits and pieces of other assembly modules.
# Just like aesv8-armx.pl, this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# Use the AArch64 register bank to "accommodate" 4x aggregated
# reduction and improve performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
#		64-bit PMULL	32-bit PMULL	32-bit NEON(*)
# Apple A7	0.58		0.92		5.62
# Cortex-A53	0.85		1.01		8.39
# Cortex-A57	0.73		1.17		7.61
# Denver	0.51		0.65		6.02
# Mongoose	0.65		1.10		8.06
# Kryo		0.76		1.16		8.00
#
# (*)	presented for reference/comparison purposes;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$Xi="x0";	# argument block
$Htbl="x1";
$inp="x2";
$len="x3";

$inc="x12";

{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));

$code=<<___;
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
$code.=<<___				if ($flavour !~ /64/);
.fpu	neon
.code	32
#undef	__thumb2__
___

################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input:	128-bit H - secret parameter E(K,0^128)
# output:	precomputed table filled with degrees of twisted H;
#		H is twisted to handle the reversed bit order of GHASH;
#		only a few of the 16 slots of Htable[16] are used;
#		the data is opaque to the outside world (which allows the
#		code to be optimized independently);
#
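# A rough sketch of the layout this produces, derived from the stores
# below (callers must still treat the table as opaque):
#
#	Htable[0]	twisted H
#	Htable[1]	Karatsuba pre-processed halves of H and H^2, packed
#	Htable[2]	twisted H^2
#	Htable[3..5]	(AArch64 only) twisted H^3, packed halves of
#			H^3/H^4, twisted H^4
#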
$code.=<<___;
.global	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	AARCH64_VALID_CALL_TARGET
	vld1.64		{$t1},[x1]		@ load input H
	vmov.i8		$xC2,#0xe1
	vshl.i64	$xC2,$xC2,#57		@ 0xc2.0
	vext.8		$IN,$t1,$t1,#8
	vshr.u64	$t2,$xC2,#63
	vdup.32		$t1,${t1}[1]
	vext.8		$t0,$t2,$xC2,#8		@ t0=0xc2....01
	vshr.u64	$t2,$IN,#63
	vshr.s32	$t1,$t1,#31		@ broadcast carry bit
	vand		$t2,$t2,$t0
	vshl.i64	$IN,$IN,#1
	vext.8		$t2,$t2,$t2,#8
	vand		$t0,$t0,$t1
	vorr		$IN,$IN,$t2		@ H<<<=1
	veor		$H,$IN,$t0		@ twisted H
	vst1.64		{$H},[x0],#16		@ store Htable[0]

	@ calculate H^2
	vext.8		$t0,$H,$H,#8		@ Karatsuba pre-processing
	vpmull.p64	$Xl,$H,$H
	veor		$t0,$t0,$H
	vpmull2.p64	$Xh,$H,$H
	vpmull.p64	$Xm,$t0,$t0

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$H2,$Xl,$t2

	vext.8		$t1,$H2,$H2,#8		@ Karatsuba pre-processing
	veor		$t1,$t1,$H2
	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
	vst1.64		{$Hhl-$H2},[x0],#32	@ store Htable[1..2]
___
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));

$code.=<<___;
	@ calculate H^3 and H^4
	vpmull.p64	$Xl,$H, $H2
	 vpmull.p64	$Yl,$H2,$H2
	vpmull2.p64	$Xh,$H, $H2
	 vpmull2.p64	$Yh,$H2,$H2
	vpmull.p64	$Xm,$t0,$t1
	 vpmull.p64	$Ym,$t1,$t1

	vext.8		$t0,$Xl,$Xh,#8		@ Karatsuba post-processing
	 vext.8		$t1,$Yl,$Yh,#8
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t0
	 veor		$t3,$Yl,$Yh
	 veor		$Ym,$Ym,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
	 veor		$Ym,$Ym,$t3
	 vpmull.p64	$t3,$Yl,$xC2

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	 vmov		$Yh#lo,$Ym#hi
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vmov		$Ym#hi,$Yl#lo
	veor		$Xl,$Xm,$t2
	 veor		$Yl,$Ym,$t3

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	 vext.8		$t3,$Yl,$Yl,#8
	vpmull.p64	$Xl,$Xl,$xC2
	 vpmull.p64	$Yl,$Yl,$xC2
	veor		$t2,$t2,$Xh
	 veor		$t3,$t3,$Yh
	veor		$H, $Xl,$t2		@ H^3
	 veor		$H2,$Yl,$t3		@ H^4

	vext.8		$t0,$H, $H,#8		@ Karatsuba pre-processing
	 vext.8		$t1,$H2,$H2,#8
	veor		$t0,$t0,$H
	 veor		$t1,$t1,$H2
	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
	vst1.64		{$H-$H2},[x0]		@ store Htable[3..5]
___
}
$code.=<<___;
	ret
.size	gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input:	Xi - current hash value;
#		Htable - table precomputed in gcm_init_v8;
# output:	Xi - next hash value;
#
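# For orientation, a rough bit-at-a-time sketch of the multiplication
# this routine performs (essentially Algorithm 1 of NIST SP 800-38D,
# with R = 0xe1 || 0^120 and bits taken most-significant first); the
# code below computes the same product with PMULL on the bit-reflected
# ("twisted") representation, which is why the reduction constant
# appears here as 0xc2...01 rather than 0xe1...:
#
#	Z = 0, V = H
#	for i in 0..127:
#		if bit i of Xi is set:	Z ^= V
#		if lsb(V) is set:	V = (V >> 1) ^ R
#		else:			V >>= 1
#	Xi = Z
#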
$code.=<<___;
.global	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET
	vld1.64		{$t1},[$Xi]		@ load Xi
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H-$Hhl},[$Htbl]	@ load twisted H, ...
	vshl.u64	$xC2,$xC2,#57
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$IN,$t1,$t1,#8

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input:	Xi - current hash value;
#		Htable - table precomputed in gcm_init_v8;
#		inp - pointer to input data;
#		len - length of input data in bytes, must be divisible by
#		      the block size;
# output:	Xi - next hash value;
#
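# In other words, a sketch of what the bulk routine computes, one
# 16-byte block at a time (the 2x and 4x paths below merely aggregate
# the reduction across several blocks using powers of H):
#
#	for each block I[i]:
#		Xi = (Xi ^ I[i]) * H	(multiplication in GF(2^128))
#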
$code.=<<___;
.global	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET
___
$code.=<<___	if ($flavour =~ /64/);
	cmp		$len,#64
	b.hs		.Lgcm_ghash_v8_4x
___
$code.=<<___		if ($flavour !~ /64/);
	vstmdb		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
						@ "[rotated]" means that
						@ loaded value would have
						@ to be rotated in order to
						@ make it appear as in
						@ algorithm specification
	subs		$len,$len,#32		@ see if $len is 32 or larger
	mov		$inc,#16		@ $inc is used as post-
						@ increment for input pointer;
						@ as loop is modulo-scheduled
						@ $inc is zeroed just in time
						@ to preclude overstepping
						@ inp[len], which means that
						@ last block[s] are actually
						@ loaded twice, but last
						@ copy is not processed
	vld1.64		{$H-$Hhl},[$Htbl],#32	@ load twisted H, ..., H^2
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H2},[$Htbl]
	cclr		$inc,eq			@ is it time to zero $inc?
	vext.8		$Xl,$Xl,$Xl,#8		@ rotate Xi
	vld1.64		{$t0},[$inp],#16	@ load [rotated] I[0]
	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant
#ifndef __ARMEB__
	vrev64.8	$t0,$t0
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$IN,$t0,$t0,#8		@ rotate I[0]
	b.lo		.Lodd_tail_v8		@ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
	#######
	# Xi+2 = [H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
$code.=<<___;
	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[1]
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$In,$t1,$t1,#8
	veor		$IN,$IN,$Xl		@ I[i]^=Xi
	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$t1,$t1,$In		@ Karatsuba pre-processing
	vpmull2.p64	$Xhn,$H,$In
	b		.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	vext.8		$t2,$IN,$IN,#8
	subs		$len,$len,#32		@ is there more data?
	vpmull.p64	$Xl,$H2,$IN		@ H^2.lo·Xi.lo
	cclr		$inc,lo			@ is it time to zero $inc?

	 vpmull.p64	$Xmn,$Hhl,$t1
	veor		$t2,$t2,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H2,$IN		@ H^2.hi·Xi.hi
	veor		$Xl,$Xl,$Xln		@ accumulate
	vpmull2.p64	$Xm,$Hhl,$t2		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	 vld1.64	{$t0},[$inp],$inc	@ load [rotated] I[i+2]

	veor		$Xh,$Xh,$Xhn
	 cclr		$inc,eq			@ is it time to zero $inc?
	veor		$Xm,$Xm,$Xmn

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	 vld1.64	{$t1},[$inp],$inc	@ load [rotated] I[i+3]
#ifndef __ARMEB__
	 vrev64.8	$t0,$t0
#endif
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

#ifndef __ARMEB__
	 vrev64.8	$t1,$t1
#endif
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vext.8		$In,$t1,$t1,#8
	 vext.8		$IN,$t0,$t0,#8
	veor		$Xl,$Xm,$t2
	 vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$IN,$IN,$Xh		@ accumulate $IN early

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$IN,$IN,$t2
	 veor		$t1,$t1,$In		@ Karatsuba pre-processing
	veor		$IN,$IN,$Xl
	 vpmull2.p64	$Xhn,$H,$In
	b.hs		.Loop_mod2x_v8		@ there were at least 32 more bytes

	veor		$Xh,$Xh,$t2
	vext.8		$IN,$t0,$t0,#8		@ re-construct $IN
	adds		$len,$len,#32		@ re-construct $len
	veor		$Xl,$Xl,$Xh		@ re-construct $Xl
	b.eq		.Ldone_v8		@ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
	vext.8		$t2,$Xl,$Xl,#8
	veor		$IN,$IN,$Xl		@ inp^=Xi
	veor		$t1,$t0,$t2		@ $t1 is rotated inp^Xi

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

.Ldone_v8:
#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

___
$code.=<<___		if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
___

if ($flavour =~ /64/) {				# 4x subroutine
my ($I0,$j1,$j2,$j3,
    $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));

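#######
# Extending the 2x identity used above to four blocks per iteration
# (matching the per-instruction comments below):
#
# Xi+4 = [H^4*(Xi + Ii) + H^3*Ii+1 + H^2*Ii+2 + H*Ii+3] mod P
#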
$code.=<<___;
.type	gcm_ghash_v8_4x,%function
.align	4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
	vld1.64		{$H-$H2},[$Htbl],#48	@ load twisted H, ..., H^2
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H3-$H4},[$Htbl]	@ load twisted H^3, ..., H^4
	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant

	vld1.64		{$I0-$j3},[$inp],#64
#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
	vrev64.8	$j1,$j1
	vrev64.8	$j2,$j2
	vrev64.8	$j3,$j3
	vrev64.8	$I0,$I0
#endif
	vext.8		$I3,$j3,$j3,#8
	vext.8		$I2,$j2,$j2,#8
	vext.8		$I1,$j1,$j1,#8

	vpmull.p64	$Yl,$H,$I3		@ H·Ii+3
	veor		$j3,$j3,$I3
	vpmull2.p64	$Yh,$H,$I3
	vpmull.p64	$Ym,$Hhl,$j3

	vpmull.p64	$t0,$H2,$I2		@ H^2·Ii+2
	veor		$j2,$j2,$I2
	vpmull2.p64	$I2,$H2,$I2
	vpmull2.p64	$j2,$Hhl,$j2

	veor		$Yl,$Yl,$t0
	veor		$Yh,$Yh,$I2
	veor		$Ym,$Ym,$j2

	vpmull.p64	$j3,$H3,$I1		@ H^3·Ii+1
	veor		$j1,$j1,$I1
	vpmull2.p64	$I1,$H3,$I1
	vpmull.p64	$j1,$H34,$j1

	veor		$Yl,$Yl,$j3
	veor		$Yh,$Yh,$I1
	veor		$Ym,$Ym,$j1

	subs		$len,$len,#128
	b.lo		.Ltail4x

	b		.Loop4x

.align	4
.Loop4x:
	veor		$t0,$I0,$Xl
	 vld1.64	{$I0-$j3},[$inp],#64
	vext.8		$IN,$t0,$t0,#8
#ifndef __ARMEB__
	 vrev64.8	$j1,$j1
	 vrev64.8	$j2,$j2
	 vrev64.8	$j3,$j3
	 vrev64.8	$I0,$I0
#endif

	vpmull.p64	$Xl,$H4,$IN		@ H^4·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H4,$IN
	 vext.8		$I3,$j3,$j3,#8
	vpmull2.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	 vext.8		$I2,$j2,$j2,#8
	veor		$Xm,$Xm,$Ym
	 vext.8		$I1,$j1,$j1,#8

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	 vpmull.p64	$Yl,$H,$I3		@ H·Ii+3
	 veor		$j3,$j3,$I3
	veor		$Xm,$Xm,$t1
	 vpmull2.p64	$Yh,$H,$I3
	veor		$Xm,$Xm,$t2
	 vpmull.p64	$Ym,$Hhl,$j3

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vpmull.p64	$t0,$H2,$I2		@ H^2·Ii+2
	 veor		$j2,$j2,$I2
	 vpmull2.p64	$I2,$H2,$I2
	veor		$Xl,$Xm,$t2
	 vpmull2.p64	$j2,$Hhl,$j2

	 veor		$Yl,$Yl,$t0
	 veor		$Yh,$Yh,$I2
	 veor		$Ym,$Ym,$j2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	 vpmull.p64	$j3,$H3,$I1		@ H^3·Ii+1
	 veor		$j1,$j1,$I1
	veor		$t2,$t2,$Xh
	 vpmull2.p64	$I1,$H3,$I1
	 vpmull.p64	$j1,$H34,$j1

	veor		$Xl,$Xl,$t2
	 veor		$Yl,$Yl,$j3
	 veor		$Yh,$Yh,$I1
	vext.8		$Xl,$Xl,$Xl,#8
	 veor		$Ym,$Ym,$j1

	subs		$len,$len,#64
	b.hs		.Loop4x

.Ltail4x:
	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	vpmull.p64	$Xl,$H4,$IN		@ H^4·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H4,$IN
	vpmull2.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym

	adds		$len,$len,#64
	b.eq		.Ldone4x

	cmp		$len,#32
	b.lo		.Lone
	b.eq		.Ltwo
.Lthree:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	 vld1.64	{$I0-$j2},[$inp]
	veor		$Xm,$Xm,$t2
#ifndef	__ARMEB__
	 vrev64.8	$j1,$j1
	 vrev64.8	$j2,$j2
	 vrev64.8	$I0,$I0
#endif

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vext.8		$I2,$j2,$j2,#8
	 vext.8		$I1,$j1,$j1,#8
	veor		$Xl,$Xm,$t2

	 vpmull.p64	$Yl,$H,$I2		@ H·Ii+2
	 veor		$j2,$j2,$I2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	 vpmull2.p64	$Yh,$H,$I2
	 vpmull.p64	$Ym,$Hhl,$j2
	veor		$Xl,$Xl,$t2
	 vpmull.p64	$j3,$H2,$I1		@ H^2·Ii+1
	 veor		$j1,$j1,$I1
	vext.8		$Xl,$Xl,$Xl,#8

	 vpmull2.p64	$I1,$H2,$I1
	veor		$t0,$I0,$Xl
	 vpmull2.p64	$j1,$Hhl,$j1
	vext.8		$IN,$t0,$t0,#8

	 veor		$Yl,$Yl,$j3
	 veor		$Yh,$Yh,$I1
	 veor		$Ym,$Ym,$j1

	vpmull.p64	$Xl,$H3,$IN		@ H^3·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H3,$IN
	vpmull.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym
	b		.Ldone4x

.align	4
.Ltwo:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	 vld1.64	{$I0-$j1},[$inp]
	veor		$Xm,$Xm,$t2
#ifndef	__ARMEB__
	 vrev64.8	$j1,$j1
	 vrev64.8	$I0,$I0
#endif

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vext.8		$I1,$j1,$j1,#8
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

	 vpmull.p64	$Yl,$H,$I1		@ H·Ii+1
	 veor		$j1,$j1,$I1

	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	 vpmull2.p64	$Yh,$H,$I1
	 vpmull.p64	$Ym,$Hhl,$j1

	vpmull.p64	$Xl,$H2,$IN		@ H^2·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H2,$IN
	vpmull2.p64	$Xm,$Hhl,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym
	b		.Ldone4x

.align	4
.Lone:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	 vld1.64	{$I0},[$inp]
	veor		$Xm,$Xm,$t2
#ifndef	__ARMEB__
	 vrev64.8	$I0,$I0
#endif

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	vpmull.p64	$Xl,$H,$IN
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H,$IN
	vpmull.p64	$Xm,$Hhl,$t0

.Ldone4x:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vst1.64		{$Xl},[$Xi]		@ write out Xi

	ret
.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___

}
}

$code.=<<___;
.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#endif
___

if ($flavour =~ /64/) {			######## 64-bit code
    sub unvmov {
	my $arg=shift;

	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
	sprintf	"ins	v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
					     $3<8?$3:$3+8,($4 eq "lo")?0:1;
    }
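    # For example (a sketch of what the substitution produces): the
    # "vmov q2#lo,q1#hi" emitted above becomes "ins v2.d[0],v1.d[1]";
    # q8..q23 are renumbered to v16..v31 by the same $n<8?$n:$n+8 rule.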
    foreach(split("\n",$code)) {
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/vmov\.i8/movi/o		or	# fix up legacy mnemonics
	s/vmov\s+(.*)/unvmov($1)/geo	or
	s/vext\.8/ext/o			or
	s/vshr\.s/sshr\.s/o		or
	s/vshr/ushr/o			or
	s/^(\s+)v/$1/o			or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;				# old->new style commentary

	# fix up remaining legacy suffixes
	s/\.[ui]?8(\s)/$1/o;
	s/\.[uis]?32//o and s/\.16b/\.4s/go;
	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument
	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments
	s/\.[uisp]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	# Switch preprocessor checks to aarch64 versions.
	s/__ARME([BL])__/__AARCH64E$1__/go;

	print $_,"\n";
    }
} else {				######## 32-bit code
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
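    # For example, the "vdup.32 q9,q9[1]" emitted by gcm_init_v8 above
    # becomes "vdup.32 q9,d18[1]": 32-bit lane 1 of q9 lives in d18[1].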
    sub unvpmullp64 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
				 |(($2&7)<<17)|(($2&8)<<4)
				 |(($3&7)<<1) |(($3&8)<<2);
	    $word |= 0x00010001	 if ($mnemonic =~ "2");
	    # The bytes can be emitted in this fixed order because ARMv7
	    # instructions are always encoded little-endian. The correct
	    # solution would be the .inst directive, but older assemblers
	    # don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
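    # Worked example of the encoding above: "vpmull.p64 q0,q12,q3"
    # (i.e. $Xl,$H,$IN) gives $word = 0xf2a80e86, which is emitted as
    # ".byte 0x86,0x0e,0xa8,0xf2".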

    foreach(split("\n",$code)) {
	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\],#[0-9]+/]!/o;

	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo				or
	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo		or
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/^(\s+)b\./$1b/o						or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush
