#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte
# [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
    *STDOUT=*OUT;
} else {
    open OUT,">$output";
    *STDOUT=*OUT;
}
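
# The first argument selects the perlasm "flavour" (e.g. linux32 or ios32),
# which is forwarded to arm-xlate.pl; the last argument that looks like a
# file name becomes the output. A typical invocation looks something like
#   perl sha256-armv4.pl linux32 sha256-armv4.S
# while "void" (or a bare output file name) skips the translator and writes
# the raw perlasm output directly. The flavour names above are illustrative;
# the build system supplies the real ones.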

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

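# The rotate/shift counts above are the SHA-256 functions from FIPS 180-4:
#   Sigma0(x) = (x ror 2)  ^ (x ror 13) ^ (x ror 22)
#   Sigma1(x) = (x ror 6)  ^ (x ror 11) ^ (x ror 25)
#   sigma0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
#   sigma1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
#
# Each round in BODY_00_15 follows the usual SHA-256 recurrence,
#   T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#   T2 = Sigma0(a) + Maj(a,b,c)
#   h = g; g = f; f = e; e = d + T1; d = c; c = b; b = a; a = T1 + T2
# except that instead of moving values between registers the @V array is
# rotated after every round, and the Maj() addition is deferred to the start
# of the following round ("h+=Maj(a,b,c) from the past").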
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

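# BODY_16_XX extends the message schedule in place (X[0..15] lives on the
# stack as a circular buffer) using the standard recurrence
#   X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
# and then falls through to BODY_00_15 for the round computation itself.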
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
#else
# define __ARM_ARCH __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
@ instructions are manually-encoded. (See unsha256.)
.arch  armv7-a

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code   32
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
.align	5

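@ Approximate C prototype (the authoritative declaration lives in the C
@ callers): sha256_block_data_order_nohw(uint32_t state[8],
@ const uint8_t *data, size_t num), with r0 = state, r1 = data and
@ r2 = number of 64-byte blocks. The _neon and _hw entry points below take
@ the same arguments.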
.global	sha256_block_data_order_nohw
.type	sha256_block_data_order_nohw,%function
sha256_block_data_order_nohw:
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	adr	$Ktbl,K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

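# For example, a call such as
#	&vshr_u32	($T2,$T0,$sigma0[0]);
# is handled by the AUTOLOAD thunk above and appends
#	vshr.u32	q10,q8,#7
# to $code (a numeric final argument gets the '#' immediate prefix).
#
# Xupdate() below emits the NEON message-schedule update for the next four
# words of X[] and interleaves the scalar round code supplied by $body
# (body_00_15 further down), evaluating a few of its snippets between the
# vector instructions so scalar and vector work can overlap.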
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.LK256_shortcut_neon:
@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
#if defined(__thumb2__)
.word	K256-(.LK256_add_neon+4)
#else
.word	K256-(.LK256_add_neon+8)
#endif

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16

	@ K256 is just at the boundary of being easily referenced by an ADR from
	@ this function. In Arm mode, when building with __ARM_ARCH=6, it does
	@ not fit. By moving code around, we could make it fit, but this is too
	@ fragile. For simplicity, just load the offset from
	@ .LK256_shortcut_neon.
	@
	@ TODO(davidben): adrl would avoid a load, but clang's assembler does not
	@ support it. We might be able to emulate it with a macro, but Android's
	@ suggested macro did not work when I tried it.
	@ https://android.googlesource.com/platform/ndk/+/refs/heads/master/docs/ClangMigration.md#arm
	ldr	$Ktbl,.LK256_shortcut_neon
.LK256_add_neon:
	add	$Ktbl,pc,$Ktbl

	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.LK256_shortcut_hw:
@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
#if defined(__thumb2__)
.word	K256-(.LK256_add_hw+4)
#else
.word	K256-(.LK256_add_hw+8)
#endif

.global	sha256_block_data_order_hw
.type	sha256_block_data_order_hw,%function
.align	5
sha256_block_data_order_hw:
	@ K256 is too far to reference from one ADR command in Thumb mode. In
	@ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte
	@ boundary. For simplicity, just load the offset from .LK256_shortcut_hw.
	ldr	$Ktbl,.LK256_shortcut_hw
.LK256_add_hw:
	add	$Ktbl,pc,$Ktbl

	vld1.32	{$ABCD,$EFGH},[$ctx]
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
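# Twelve quad-rounds that also extend the message schedule with
# sha256su0/sha256su1, followed by the final four quad-rounds emitted below,
# which consume the last schedule words and need no further extension.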
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_hw,.-sha256_block_data_order_hw
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

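# Replay this script's own leading comment block (license and performance
# notes) into the generated file, turning '#' comments into the assembler's
# '@' comments and stopping at the first non-comment, non-blank line.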
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, so the word
	    # is emitted as explicit bytes below. The correct solution would be
	    # the .inst directive, but older assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

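# Final pass over the generated code: evaluate `...` expressions, swap the
# sha256* mnemonics for the manually encoded INST() forms produced by
# unsha256() above, rewrite 'ret' as 'bx lr', and encode a bare 'bx lr' as a
# .word so the result still assembles with -march=armv4.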
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush