1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <ring-core/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
7// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
8//
9// Licensed under the OpenSSL license (the "License").  You may not use
10// this file except in compliance with the License.  You can obtain a copy
11// in the file LICENSE in the source distribution or at
12// https://www.openssl.org/source/license.html
13
14// ====================================================================
15// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
16// project. The module is, however, dual licensed under OpenSSL and
17// CRYPTOGAMS licenses depending on where you obtain it. For further
18// details see http://www.openssl.org/~appro/cryptogams/.
19//
20// Permission to use under GPLv2 terms is granted.
21// ====================================================================
22//
23// SHA256/512 for ARMv8.
24//
25// Performance in cycles per processed byte and improvement coefficient
26// over code generated with "default" compiler:
27//
28//		SHA256-hw	SHA256(*)	SHA512
29// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
30// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
31// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
32// Denver	2.01		10.5 (+26%)	6.70 (+8%)
33// X-Gene			20.0 (+100%)	12.8 (+300%(***))
34// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
35// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
36//
37// (*)	Software SHA256 results are of lesser relevance, presented
38//	mostly for informational purposes.
39// (**)	The result is a trade-off: it's possible to improve it by
40//	10% (or by 1 cycle per round), but at the cost of 20% loss
41//	on Cortex-A53 (or by 4 cycles per round).
42// (***)	Super-impressive coefficients over gcc-generated code are
43//	indication of some compiler "pathology", most notably code
44//	generated with -mgeneral-regs-only is significantly faster
45//	and the gap is only 40-90%.
46
47#ifndef	__KERNEL__
48# include <ring-core/arm_arch.h>
49#endif
50
51.text
52
53
54
// sha512_block_data_order: scalar SHA-512 block function (entry and prologue).
//
// Registers as they are read by the code below:
//   x0  pointer to the eight 64-bit state words (loaded into x20..x27)
//   x1  input pointer
//   x2  block count; shifted left by 7 (x128) and added to x1 to form the
//       end-of-input pointer
//
// Stack frame set up here (128 bytes addressed through x29, plus 32 bytes of
// scratch below it addressed through sp):
//   [x29,#0]       saved x29,x30
//   [x29,#16..95]  saved x19..x28
//   [x29,#96]      copy of x0 (state pointer)
//   [x29,#104]     end-of-input pointer
//   [x29,#112]     input pointer, written by the code at Loop
//   [sp,#0..31]    four 8-byte slots used for message-schedule spills
55.globl	sha512_block_data_order
56
57.def sha512_block_data_order
58   .type 32
59.endef
60.align	6
61sha512_block_data_order:
62	AARCH64_VALID_CALL_TARGET
// Unless built with __KERNEL__, test the ARMV8_SHA512 bit of
// OPENSSL_armcap_P and branch to Lv8_entry when it is set; otherwise fall
// through to the scalar code. x16 is the only register touched by this check.
63#ifndef	__KERNEL__
64#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
65	adrp	x16,:pg_hi21_nc:OPENSSL_armcap_P
66#else
67	adrp	x16,OPENSSL_armcap_P
68#endif
69	ldr	w16,[x16,:lo12:OPENSSL_armcap_P]
70	tst	w16,#ARMV8_SHA512
71	b.ne	Lv8_entry
72#endif
// Allocate the 128-byte frame, point x29 at it and save x19..x28, which the
// round code uses freely.
73	AARCH64_SIGN_LINK_REGISTER
74	stp	x29,x30,[sp,#-128]!
75	add	x29,sp,#0
76
77	stp	x19,x20,[sp,#16]
78	stp	x21,x22,[sp,#32]
79	stp	x23,x24,[sp,#48]
80	stp	x25,x26,[sp,#64]
81	stp	x27,x28,[sp,#80]
// Reserve the four 8-byte spill slots below the frame.
82	sub	sp,sp,#4*8
83
// Working variables a..h start out as the eight state words in x20..x27.
// x30 (already saved above) is reused as the pointer into the LK512 table.
84	ldp	x20,x21,[x0]				// load context
85	ldp	x22,x23,[x0,#2*8]
86	ldp	x24,x25,[x0,#4*8]
87	add	x2,x1,x2,lsl#7	// end of input
88	ldp	x26,x27,[x0,#6*8]
89	adrp	x30,LK512
90	add	x30,x30,:lo12:LK512
// x0 and x2 are overwritten with message words later, so keep copies.
91	stp	x0,x2,[x29,#96]
92
// Loop: entered once per 128-byte input block; performs rounds 0..15.
//
//   x20..x27  working variables a..h. Nothing is moved between rounds;
//             instead each round addresses the registers one position
//             further along (h is x27, then x26, x25, ...).
//   x30       walks the LK512 table with post-indexed loads.
//   x19, x28  alternate between holding the next K word and the a^b value
//             that is reused as b^c by the following round.
//   x16, x17  per-round temporaries (Sigma1 and Ch, then Sigma0).
//   x3..x15, x0, x1, x2
//             receive message words X[0]..X[15]; each word is byte-swapped
//             with rev unless __AARCH64EB__ is defined.
//
// Sigma0(a) of a round is added to h at the start of the following round;
// the commented-out add instructions mark where it logically belongs.
93Loop:
94	ldp	x3,x4,[x1],#2*8
95	ldr	x19,[x30],#8			// *K++
96	eor	x28,x21,x22				// magic seed
// Save the input pointer (already advanced by 16) because x1 is later
// overwritten with X[14].
97	str	x1,[x29,#112]
98#ifndef	__AARCH64EB__
99	rev	x3,x3			// 0
100#endif
101	ror	x16,x24,#14
102	add	x27,x27,x19			// h+=K[i]
103	eor	x6,x24,x24,ror#23
104	and	x17,x25,x24
105	bic	x19,x26,x24
106	add	x27,x27,x3			// h+=X[i]
107	orr	x17,x17,x19			// Ch(e,f,g)
108	eor	x19,x20,x21			// a^b, b^c in next round
109	eor	x16,x16,x6,ror#18	// Sigma1(e)
110	ror	x6,x20,#28
111	add	x27,x27,x17			// h+=Ch(e,f,g)
112	eor	x17,x20,x20,ror#5
113	add	x27,x27,x16			// h+=Sigma1(e)
114	and	x28,x28,x19			// (b^c)&=(a^b)
115	add	x23,x23,x27			// d+=h
116	eor	x28,x28,x21			// Maj(a,b,c)
117	eor	x17,x6,x17,ror#34	// Sigma0(a)
118	add	x27,x27,x28			// h+=Maj(a,b,c)
119	ldr	x28,[x30],#8		// *K++, x19 in next round
120	//add	x27,x27,x17			// h+=Sigma0(a)
121#ifndef	__AARCH64EB__
122	rev	x4,x4			// 1
123#endif
124	ldp	x5,x6,[x1],#2*8
125	add	x27,x27,x17			// h+=Sigma0(a)
126	ror	x16,x23,#14
127	add	x26,x26,x28			// h+=K[i]
128	eor	x7,x23,x23,ror#23
129	and	x17,x24,x23
130	bic	x28,x25,x23
131	add	x26,x26,x4			// h+=X[i]
132	orr	x17,x17,x28			// Ch(e,f,g)
133	eor	x28,x27,x20			// a^b, b^c in next round
134	eor	x16,x16,x7,ror#18	// Sigma1(e)
135	ror	x7,x27,#28
136	add	x26,x26,x17			// h+=Ch(e,f,g)
137	eor	x17,x27,x27,ror#5
138	add	x26,x26,x16			// h+=Sigma1(e)
139	and	x19,x19,x28			// (b^c)&=(a^b)
140	add	x22,x22,x26			// d+=h
141	eor	x19,x19,x20			// Maj(a,b,c)
142	eor	x17,x7,x17,ror#34	// Sigma0(a)
143	add	x26,x26,x19			// h+=Maj(a,b,c)
144	ldr	x19,[x30],#8		// *K++, x28 in next round
145	//add	x26,x26,x17			// h+=Sigma0(a)
146#ifndef	__AARCH64EB__
147	rev	x5,x5			// 2
148#endif
149	add	x26,x26,x17			// h+=Sigma0(a)
150	ror	x16,x22,#14
151	add	x25,x25,x19			// h+=K[i]
152	eor	x8,x22,x22,ror#23
153	and	x17,x23,x22
154	bic	x19,x24,x22
155	add	x25,x25,x5			// h+=X[i]
156	orr	x17,x17,x19			// Ch(e,f,g)
157	eor	x19,x26,x27			// a^b, b^c in next round
158	eor	x16,x16,x8,ror#18	// Sigma1(e)
159	ror	x8,x26,#28
160	add	x25,x25,x17			// h+=Ch(e,f,g)
161	eor	x17,x26,x26,ror#5
162	add	x25,x25,x16			// h+=Sigma1(e)
163	and	x28,x28,x19			// (b^c)&=(a^b)
164	add	x21,x21,x25			// d+=h
165	eor	x28,x28,x27			// Maj(a,b,c)
166	eor	x17,x8,x17,ror#34	// Sigma0(a)
167	add	x25,x25,x28			// h+=Maj(a,b,c)
168	ldr	x28,[x30],#8		// *K++, x19 in next round
169	//add	x25,x25,x17			// h+=Sigma0(a)
170#ifndef	__AARCH64EB__
171	rev	x6,x6			// 3
172#endif
173	ldp	x7,x8,[x1],#2*8
174	add	x25,x25,x17			// h+=Sigma0(a)
175	ror	x16,x21,#14
176	add	x24,x24,x28			// h+=K[i]
177	eor	x9,x21,x21,ror#23
178	and	x17,x22,x21
179	bic	x28,x23,x21
180	add	x24,x24,x6			// h+=X[i]
181	orr	x17,x17,x28			// Ch(e,f,g)
182	eor	x28,x25,x26			// a^b, b^c in next round
183	eor	x16,x16,x9,ror#18	// Sigma1(e)
184	ror	x9,x25,#28
185	add	x24,x24,x17			// h+=Ch(e,f,g)
186	eor	x17,x25,x25,ror#5
187	add	x24,x24,x16			// h+=Sigma1(e)
188	and	x19,x19,x28			// (b^c)&=(a^b)
189	add	x20,x20,x24			// d+=h
190	eor	x19,x19,x26			// Maj(a,b,c)
191	eor	x17,x9,x17,ror#34	// Sigma0(a)
192	add	x24,x24,x19			// h+=Maj(a,b,c)
193	ldr	x19,[x30],#8		// *K++, x28 in next round
194	//add	x24,x24,x17			// h+=Sigma0(a)
195#ifndef	__AARCH64EB__
196	rev	x7,x7			// 4
197#endif
198	add	x24,x24,x17			// h+=Sigma0(a)
199	ror	x16,x20,#14
200	add	x23,x23,x19			// h+=K[i]
201	eor	x10,x20,x20,ror#23
202	and	x17,x21,x20
203	bic	x19,x22,x20
204	add	x23,x23,x7			// h+=X[i]
205	orr	x17,x17,x19			// Ch(e,f,g)
206	eor	x19,x24,x25			// a^b, b^c in next round
207	eor	x16,x16,x10,ror#18	// Sigma1(e)
208	ror	x10,x24,#28
209	add	x23,x23,x17			// h+=Ch(e,f,g)
210	eor	x17,x24,x24,ror#5
211	add	x23,x23,x16			// h+=Sigma1(e)
212	and	x28,x28,x19			// (b^c)&=(a^b)
213	add	x27,x27,x23			// d+=h
214	eor	x28,x28,x25			// Maj(a,b,c)
215	eor	x17,x10,x17,ror#34	// Sigma0(a)
216	add	x23,x23,x28			// h+=Maj(a,b,c)
217	ldr	x28,[x30],#8		// *K++, x19 in next round
218	//add	x23,x23,x17			// h+=Sigma0(a)
219#ifndef	__AARCH64EB__
220	rev	x8,x8			// 5
221#endif
222	ldp	x9,x10,[x1],#2*8
223	add	x23,x23,x17			// h+=Sigma0(a)
224	ror	x16,x27,#14
225	add	x22,x22,x28			// h+=K[i]
226	eor	x11,x27,x27,ror#23
227	and	x17,x20,x27
228	bic	x28,x21,x27
229	add	x22,x22,x8			// h+=X[i]
230	orr	x17,x17,x28			// Ch(e,f,g)
231	eor	x28,x23,x24			// a^b, b^c in next round
232	eor	x16,x16,x11,ror#18	// Sigma1(e)
233	ror	x11,x23,#28
234	add	x22,x22,x17			// h+=Ch(e,f,g)
235	eor	x17,x23,x23,ror#5
236	add	x22,x22,x16			// h+=Sigma1(e)
237	and	x19,x19,x28			// (b^c)&=(a^b)
238	add	x26,x26,x22			// d+=h
239	eor	x19,x19,x24			// Maj(a,b,c)
240	eor	x17,x11,x17,ror#34	// Sigma0(a)
241	add	x22,x22,x19			// h+=Maj(a,b,c)
242	ldr	x19,[x30],#8		// *K++, x28 in next round
243	//add	x22,x22,x17			// h+=Sigma0(a)
244#ifndef	__AARCH64EB__
245	rev	x9,x9			// 6
246#endif
247	add	x22,x22,x17			// h+=Sigma0(a)
248	ror	x16,x26,#14
249	add	x21,x21,x19			// h+=K[i]
250	eor	x12,x26,x26,ror#23
251	and	x17,x27,x26
252	bic	x19,x20,x26
253	add	x21,x21,x9			// h+=X[i]
254	orr	x17,x17,x19			// Ch(e,f,g)
255	eor	x19,x22,x23			// a^b, b^c in next round
256	eor	x16,x16,x12,ror#18	// Sigma1(e)
257	ror	x12,x22,#28
258	add	x21,x21,x17			// h+=Ch(e,f,g)
259	eor	x17,x22,x22,ror#5
260	add	x21,x21,x16			// h+=Sigma1(e)
261	and	x28,x28,x19			// (b^c)&=(a^b)
262	add	x25,x25,x21			// d+=h
263	eor	x28,x28,x23			// Maj(a,b,c)
264	eor	x17,x12,x17,ror#34	// Sigma0(a)
265	add	x21,x21,x28			// h+=Maj(a,b,c)
266	ldr	x28,[x30],#8		// *K++, x19 in next round
267	//add	x21,x21,x17			// h+=Sigma0(a)
268#ifndef	__AARCH64EB__
269	rev	x10,x10			// 7
270#endif
271	ldp	x11,x12,[x1],#2*8
272	add	x21,x21,x17			// h+=Sigma0(a)
273	ror	x16,x25,#14
274	add	x20,x20,x28			// h+=K[i]
275	eor	x13,x25,x25,ror#23
276	and	x17,x26,x25
277	bic	x28,x27,x25
278	add	x20,x20,x10			// h+=X[i]
279	orr	x17,x17,x28			// Ch(e,f,g)
280	eor	x28,x21,x22			// a^b, b^c in next round
281	eor	x16,x16,x13,ror#18	// Sigma1(e)
282	ror	x13,x21,#28
283	add	x20,x20,x17			// h+=Ch(e,f,g)
284	eor	x17,x21,x21,ror#5
285	add	x20,x20,x16			// h+=Sigma1(e)
286	and	x19,x19,x28			// (b^c)&=(a^b)
287	add	x24,x24,x20			// d+=h
288	eor	x19,x19,x22			// Maj(a,b,c)
289	eor	x17,x13,x17,ror#34	// Sigma0(a)
290	add	x20,x20,x19			// h+=Maj(a,b,c)
291	ldr	x19,[x30],#8		// *K++, x28 in next round
292	//add	x20,x20,x17			// h+=Sigma0(a)
293#ifndef	__AARCH64EB__
294	rev	x11,x11			// 8
295#endif
296	add	x20,x20,x17			// h+=Sigma0(a)
297	ror	x16,x24,#14
298	add	x27,x27,x19			// h+=K[i]
299	eor	x14,x24,x24,ror#23
300	and	x17,x25,x24
301	bic	x19,x26,x24
302	add	x27,x27,x11			// h+=X[i]
303	orr	x17,x17,x19			// Ch(e,f,g)
304	eor	x19,x20,x21			// a^b, b^c in next round
305	eor	x16,x16,x14,ror#18	// Sigma1(e)
306	ror	x14,x20,#28
307	add	x27,x27,x17			// h+=Ch(e,f,g)
308	eor	x17,x20,x20,ror#5
309	add	x27,x27,x16			// h+=Sigma1(e)
310	and	x28,x28,x19			// (b^c)&=(a^b)
311	add	x23,x23,x27			// d+=h
312	eor	x28,x28,x21			// Maj(a,b,c)
313	eor	x17,x14,x17,ror#34	// Sigma0(a)
314	add	x27,x27,x28			// h+=Maj(a,b,c)
315	ldr	x28,[x30],#8		// *K++, x19 in next round
316	//add	x27,x27,x17			// h+=Sigma0(a)
317#ifndef	__AARCH64EB__
318	rev	x12,x12			// 9
319#endif
320	ldp	x13,x14,[x1],#2*8
321	add	x27,x27,x17			// h+=Sigma0(a)
322	ror	x16,x23,#14
323	add	x26,x26,x28			// h+=K[i]
324	eor	x15,x23,x23,ror#23
325	and	x17,x24,x23
326	bic	x28,x25,x23
327	add	x26,x26,x12			// h+=X[i]
328	orr	x17,x17,x28			// Ch(e,f,g)
329	eor	x28,x27,x20			// a^b, b^c in next round
330	eor	x16,x16,x15,ror#18	// Sigma1(e)
331	ror	x15,x27,#28
332	add	x26,x26,x17			// h+=Ch(e,f,g)
333	eor	x17,x27,x27,ror#5
334	add	x26,x26,x16			// h+=Sigma1(e)
335	and	x19,x19,x28			// (b^c)&=(a^b)
336	add	x22,x22,x26			// d+=h
337	eor	x19,x19,x20			// Maj(a,b,c)
338	eor	x17,x15,x17,ror#34	// Sigma0(a)
339	add	x26,x26,x19			// h+=Maj(a,b,c)
340	ldr	x19,[x30],#8		// *K++, x28 in next round
341	//add	x26,x26,x17			// h+=Sigma0(a)
342#ifndef	__AARCH64EB__
343	rev	x13,x13			// 10
344#endif
345	add	x26,x26,x17			// h+=Sigma0(a)
346	ror	x16,x22,#14
347	add	x25,x25,x19			// h+=K[i]
348	eor	x0,x22,x22,ror#23
349	and	x17,x23,x22
350	bic	x19,x24,x22
351	add	x25,x25,x13			// h+=X[i]
352	orr	x17,x17,x19			// Ch(e,f,g)
353	eor	x19,x26,x27			// a^b, b^c in next round
354	eor	x16,x16,x0,ror#18	// Sigma1(e)
355	ror	x0,x26,#28
356	add	x25,x25,x17			// h+=Ch(e,f,g)
357	eor	x17,x26,x26,ror#5
358	add	x25,x25,x16			// h+=Sigma1(e)
359	and	x28,x28,x19			// (b^c)&=(a^b)
360	add	x21,x21,x25			// d+=h
361	eor	x28,x28,x27			// Maj(a,b,c)
362	eor	x17,x0,x17,ror#34	// Sigma0(a)
363	add	x25,x25,x28			// h+=Maj(a,b,c)
364	ldr	x28,[x30],#8		// *K++, x19 in next round
365	//add	x25,x25,x17			// h+=Sigma0(a)
366#ifndef	__AARCH64EB__
367	rev	x14,x14			// 11
368#endif
369	ldp	x15,x0,[x1],#2*8
370	add	x25,x25,x17			// h+=Sigma0(a)
// From here on no free temporaries remain, so a message word is parked on
// the stack before its register is reused as scratch: x6 holds X[3].
371	str	x6,[sp,#24]
372	ror	x16,x21,#14
373	add	x24,x24,x28			// h+=K[i]
374	eor	x6,x21,x21,ror#23
375	and	x17,x22,x21
376	bic	x28,x23,x21
377	add	x24,x24,x14			// h+=X[i]
378	orr	x17,x17,x28			// Ch(e,f,g)
379	eor	x28,x25,x26			// a^b, b^c in next round
380	eor	x16,x16,x6,ror#18	// Sigma1(e)
381	ror	x6,x25,#28
382	add	x24,x24,x17			// h+=Ch(e,f,g)
383	eor	x17,x25,x25,ror#5
384	add	x24,x24,x16			// h+=Sigma1(e)
385	and	x19,x19,x28			// (b^c)&=(a^b)
386	add	x20,x20,x24			// d+=h
387	eor	x19,x19,x26			// Maj(a,b,c)
388	eor	x17,x6,x17,ror#34	// Sigma0(a)
389	add	x24,x24,x19			// h+=Maj(a,b,c)
390	ldr	x19,[x30],#8		// *K++, x28 in next round
391	//add	x24,x24,x17			// h+=Sigma0(a)
392#ifndef	__AARCH64EB__
393	rev	x15,x15			// 12
394#endif
395	add	x24,x24,x17			// h+=Sigma0(a)
// Park X[4] (x7); x7 becomes scratch for this round.
396	str	x7,[sp,#0]
397	ror	x16,x20,#14
398	add	x23,x23,x19			// h+=K[i]
399	eor	x7,x20,x20,ror#23
400	and	x17,x21,x20
401	bic	x19,x22,x20
402	add	x23,x23,x15			// h+=X[i]
403	orr	x17,x17,x19			// Ch(e,f,g)
404	eor	x19,x24,x25			// a^b, b^c in next round
405	eor	x16,x16,x7,ror#18	// Sigma1(e)
406	ror	x7,x24,#28
407	add	x23,x23,x17			// h+=Ch(e,f,g)
408	eor	x17,x24,x24,ror#5
409	add	x23,x23,x16			// h+=Sigma1(e)
410	and	x28,x28,x19			// (b^c)&=(a^b)
411	add	x27,x27,x23			// d+=h
412	eor	x28,x28,x25			// Maj(a,b,c)
413	eor	x17,x7,x17,ror#34	// Sigma0(a)
414	add	x23,x23,x28			// h+=Maj(a,b,c)
415	ldr	x28,[x30],#8		// *K++, x19 in next round
416	//add	x23,x23,x17			// h+=Sigma0(a)
417#ifndef	__AARCH64EB__
418	rev	x0,x0			// 13
419#endif
// Final pair of the block, loaded without writeback: x1 and x2 now hold
// X[14] and X[15]. The input pointer survives only in [x29,#112].
420	ldp	x1,x2,[x1]
421	add	x23,x23,x17			// h+=Sigma0(a)
// Park X[5] (x8); x8 becomes scratch for this round.
422	str	x8,[sp,#8]
423	ror	x16,x27,#14
424	add	x22,x22,x28			// h+=K[i]
425	eor	x8,x27,x27,ror#23
426	and	x17,x20,x27
427	bic	x28,x21,x27
428	add	x22,x22,x0			// h+=X[i]
429	orr	x17,x17,x28			// Ch(e,f,g)
430	eor	x28,x23,x24			// a^b, b^c in next round
431	eor	x16,x16,x8,ror#18	// Sigma1(e)
432	ror	x8,x23,#28
433	add	x22,x22,x17			// h+=Ch(e,f,g)
434	eor	x17,x23,x23,ror#5
435	add	x22,x22,x16			// h+=Sigma1(e)
436	and	x19,x19,x28			// (b^c)&=(a^b)
437	add	x26,x26,x22			// d+=h
438	eor	x19,x19,x24			// Maj(a,b,c)
439	eor	x17,x8,x17,ror#34	// Sigma0(a)
440	add	x22,x22,x19			// h+=Maj(a,b,c)
441	ldr	x19,[x30],#8		// *K++, x28 in next round
442	//add	x22,x22,x17			// h+=Sigma0(a)
443#ifndef	__AARCH64EB__
444	rev	x1,x1			// 14
445#endif
// Bring X[3] back into x6 and park X[6] (x9) in its place on the stack.
446	ldr	x6,[sp,#24]
447	add	x22,x22,x17			// h+=Sigma0(a)
448	str	x9,[sp,#16]
449	ror	x16,x26,#14
450	add	x21,x21,x19			// h+=K[i]
451	eor	x9,x26,x26,ror#23
452	and	x17,x27,x26
453	bic	x19,x20,x26
454	add	x21,x21,x1			// h+=X[i]
455	orr	x17,x17,x19			// Ch(e,f,g)
456	eor	x19,x22,x23			// a^b, b^c in next round
457	eor	x16,x16,x9,ror#18	// Sigma1(e)
458	ror	x9,x22,#28
459	add	x21,x21,x17			// h+=Ch(e,f,g)
460	eor	x17,x22,x22,ror#5
461	add	x21,x21,x16			// h+=Sigma1(e)
462	and	x28,x28,x19			// (b^c)&=(a^b)
463	add	x25,x25,x21			// d+=h
464	eor	x28,x28,x23			// Maj(a,b,c)
465	eor	x17,x9,x17,ror#34	// Sigma0(a)
466	add	x21,x21,x28			// h+=Maj(a,b,c)
467	ldr	x28,[x30],#8		// *K++, x19 in next round
468	//add	x21,x21,x17			// h+=Sigma0(a)
469#ifndef	__AARCH64EB__
470	rev	x2,x2			// 15
471#endif
// Round 15 has the shape of the rounds at Loop_16_xx: besides the round
// itself it builds the first expanded schedule word in x3,
//   x3 = X[0] + sigma0(X[1]) + X[9] + sigma1(X[14])
// with x9 and x8 as the sigma0 and sigma1 temporaries and x10 used for
// Sigma0(a). X[4] is reloaded into x7 and X[7] (x10) is parked.
472	ldr	x7,[sp,#0]
473	add	x21,x21,x17			// h+=Sigma0(a)
474	str	x10,[sp,#24]
475	ror	x16,x25,#14
476	add	x20,x20,x28			// h+=K[i]
477	ror	x9,x4,#1
478	and	x17,x26,x25
479	ror	x8,x1,#19
480	bic	x28,x27,x25
481	ror	x10,x21,#28
482	add	x20,x20,x2			// h+=X[i]
483	eor	x16,x16,x25,ror#18
484	eor	x9,x9,x4,ror#8
485	orr	x17,x17,x28			// Ch(e,f,g)
486	eor	x28,x21,x22			// a^b, b^c in next round
487	eor	x16,x16,x25,ror#41	// Sigma1(e)
488	eor	x10,x10,x21,ror#34
489	add	x20,x20,x17			// h+=Ch(e,f,g)
490	and	x19,x19,x28			// (b^c)&=(a^b)
491	eor	x8,x8,x1,ror#61
492	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
493	add	x20,x20,x16			// h+=Sigma1(e)
494	eor	x19,x19,x22			// Maj(a,b,c)
495	eor	x17,x10,x21,ror#39	// Sigma0(a)
496	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
497	add	x3,x3,x12
498	add	x24,x24,x20			// d+=h
499	add	x20,x20,x19			// h+=Maj(a,b,c)
500	ldr	x19,[x30],#8		// *K++, x28 in next round
501	add	x3,x3,x9
502	add	x20,x20,x17			// h+=Sigma0(a)
503	add	x3,x3,x8
// Loop_16_xx: sixteen rounds per pass, each with message-schedule expansion.
//
// Every round consumes the schedule word prepared by the round before it and
// prepares the word for the round after it:
//   next = next + (word 9 ahead) + sigma0(word after next) + sigma1(current)
// matching the sigma0(X[i+1]) / sigma1(X[i+14]) comments on the instructions.
// The sixteen schedule words rotate through x3..x15, x0, x1, x2. At the top
// of each round one word is reloaded from a stack slot and another is stored
// there, which frees three registers per round as temporaries (two for the
// sigma values, one for Sigma0(a)).
//
// The K word for the following round is fetched at the end of each round.
// The pass repeats until the word fetched by the sixteenth round is zero,
// which is the terminator at the end of LK512.
504Loop_16_xx:
// pass round 0: h=x27, uses x3, prepares x4
505	ldr	x8,[sp,#8]
506	str	x11,[sp,#0]
507	ror	x16,x24,#14
508	add	x27,x27,x19			// h+=K[i]
509	ror	x10,x5,#1
510	and	x17,x25,x24
511	ror	x9,x2,#19
512	bic	x19,x26,x24
513	ror	x11,x20,#28
514	add	x27,x27,x3			// h+=X[i]
515	eor	x16,x16,x24,ror#18
516	eor	x10,x10,x5,ror#8
517	orr	x17,x17,x19			// Ch(e,f,g)
518	eor	x19,x20,x21			// a^b, b^c in next round
519	eor	x16,x16,x24,ror#41	// Sigma1(e)
520	eor	x11,x11,x20,ror#34
521	add	x27,x27,x17			// h+=Ch(e,f,g)
522	and	x28,x28,x19			// (b^c)&=(a^b)
523	eor	x9,x9,x2,ror#61
524	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
525	add	x27,x27,x16			// h+=Sigma1(e)
526	eor	x28,x28,x21			// Maj(a,b,c)
527	eor	x17,x11,x20,ror#39	// Sigma0(a)
528	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
529	add	x4,x4,x13
530	add	x23,x23,x27			// d+=h
531	add	x27,x27,x28			// h+=Maj(a,b,c)
532	ldr	x28,[x30],#8		// *K++, x19 in next round
533	add	x4,x4,x10
534	add	x27,x27,x17			// h+=Sigma0(a)
535	add	x4,x4,x9
// pass round 1: h=x26, uses x4, prepares x5
536	ldr	x9,[sp,#16]
537	str	x12,[sp,#8]
538	ror	x16,x23,#14
539	add	x26,x26,x28			// h+=K[i]
540	ror	x11,x6,#1
541	and	x17,x24,x23
542	ror	x10,x3,#19
543	bic	x28,x25,x23
544	ror	x12,x27,#28
545	add	x26,x26,x4			// h+=X[i]
546	eor	x16,x16,x23,ror#18
547	eor	x11,x11,x6,ror#8
548	orr	x17,x17,x28			// Ch(e,f,g)
549	eor	x28,x27,x20			// a^b, b^c in next round
550	eor	x16,x16,x23,ror#41	// Sigma1(e)
551	eor	x12,x12,x27,ror#34
552	add	x26,x26,x17			// h+=Ch(e,f,g)
553	and	x19,x19,x28			// (b^c)&=(a^b)
554	eor	x10,x10,x3,ror#61
555	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
556	add	x26,x26,x16			// h+=Sigma1(e)
557	eor	x19,x19,x20			// Maj(a,b,c)
558	eor	x17,x12,x27,ror#39	// Sigma0(a)
559	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
560	add	x5,x5,x14
561	add	x22,x22,x26			// d+=h
562	add	x26,x26,x19			// h+=Maj(a,b,c)
563	ldr	x19,[x30],#8		// *K++, x28 in next round
564	add	x5,x5,x11
565	add	x26,x26,x17			// h+=Sigma0(a)
566	add	x5,x5,x10
// pass round 2: h=x25, uses x5, prepares x6
567	ldr	x10,[sp,#24]
568	str	x13,[sp,#16]
569	ror	x16,x22,#14
570	add	x25,x25,x19			// h+=K[i]
571	ror	x12,x7,#1
572	and	x17,x23,x22
573	ror	x11,x4,#19
574	bic	x19,x24,x22
575	ror	x13,x26,#28
576	add	x25,x25,x5			// h+=X[i]
577	eor	x16,x16,x22,ror#18
578	eor	x12,x12,x7,ror#8
579	orr	x17,x17,x19			// Ch(e,f,g)
580	eor	x19,x26,x27			// a^b, b^c in next round
581	eor	x16,x16,x22,ror#41	// Sigma1(e)
582	eor	x13,x13,x26,ror#34
583	add	x25,x25,x17			// h+=Ch(e,f,g)
584	and	x28,x28,x19			// (b^c)&=(a^b)
585	eor	x11,x11,x4,ror#61
586	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
587	add	x25,x25,x16			// h+=Sigma1(e)
588	eor	x28,x28,x27			// Maj(a,b,c)
589	eor	x17,x13,x26,ror#39	// Sigma0(a)
590	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
591	add	x6,x6,x15
592	add	x21,x21,x25			// d+=h
593	add	x25,x25,x28			// h+=Maj(a,b,c)
594	ldr	x28,[x30],#8		// *K++, x19 in next round
595	add	x6,x6,x12
596	add	x25,x25,x17			// h+=Sigma0(a)
597	add	x6,x6,x11
// pass round 3: h=x24, uses x6, prepares x7
598	ldr	x11,[sp,#0]
599	str	x14,[sp,#24]
600	ror	x16,x21,#14
601	add	x24,x24,x28			// h+=K[i]
602	ror	x13,x8,#1
603	and	x17,x22,x21
604	ror	x12,x5,#19
605	bic	x28,x23,x21
606	ror	x14,x25,#28
607	add	x24,x24,x6			// h+=X[i]
608	eor	x16,x16,x21,ror#18
609	eor	x13,x13,x8,ror#8
610	orr	x17,x17,x28			// Ch(e,f,g)
611	eor	x28,x25,x26			// a^b, b^c in next round
612	eor	x16,x16,x21,ror#41	// Sigma1(e)
613	eor	x14,x14,x25,ror#34
614	add	x24,x24,x17			// h+=Ch(e,f,g)
615	and	x19,x19,x28			// (b^c)&=(a^b)
616	eor	x12,x12,x5,ror#61
617	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
618	add	x24,x24,x16			// h+=Sigma1(e)
619	eor	x19,x19,x26			// Maj(a,b,c)
620	eor	x17,x14,x25,ror#39	// Sigma0(a)
621	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
622	add	x7,x7,x0
623	add	x20,x20,x24			// d+=h
624	add	x24,x24,x19			// h+=Maj(a,b,c)
625	ldr	x19,[x30],#8		// *K++, x28 in next round
626	add	x7,x7,x13
627	add	x24,x24,x17			// h+=Sigma0(a)
628	add	x7,x7,x12
// pass round 4: h=x23, uses x7, prepares x8
629	ldr	x12,[sp,#8]
630	str	x15,[sp,#0]
631	ror	x16,x20,#14
632	add	x23,x23,x19			// h+=K[i]
633	ror	x14,x9,#1
634	and	x17,x21,x20
635	ror	x13,x6,#19
636	bic	x19,x22,x20
637	ror	x15,x24,#28
638	add	x23,x23,x7			// h+=X[i]
639	eor	x16,x16,x20,ror#18
640	eor	x14,x14,x9,ror#8
641	orr	x17,x17,x19			// Ch(e,f,g)
642	eor	x19,x24,x25			// a^b, b^c in next round
643	eor	x16,x16,x20,ror#41	// Sigma1(e)
644	eor	x15,x15,x24,ror#34
645	add	x23,x23,x17			// h+=Ch(e,f,g)
646	and	x28,x28,x19			// (b^c)&=(a^b)
647	eor	x13,x13,x6,ror#61
648	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
649	add	x23,x23,x16			// h+=Sigma1(e)
650	eor	x28,x28,x25			// Maj(a,b,c)
651	eor	x17,x15,x24,ror#39	// Sigma0(a)
652	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
653	add	x8,x8,x1
654	add	x27,x27,x23			// d+=h
655	add	x23,x23,x28			// h+=Maj(a,b,c)
656	ldr	x28,[x30],#8		// *K++, x19 in next round
657	add	x8,x8,x14
658	add	x23,x23,x17			// h+=Sigma0(a)
659	add	x8,x8,x13
// pass round 5: h=x22, uses x8, prepares x9
660	ldr	x13,[sp,#16]
661	str	x0,[sp,#8]
662	ror	x16,x27,#14
663	add	x22,x22,x28			// h+=K[i]
664	ror	x15,x10,#1
665	and	x17,x20,x27
666	ror	x14,x7,#19
667	bic	x28,x21,x27
668	ror	x0,x23,#28
669	add	x22,x22,x8			// h+=X[i]
670	eor	x16,x16,x27,ror#18
671	eor	x15,x15,x10,ror#8
672	orr	x17,x17,x28			// Ch(e,f,g)
673	eor	x28,x23,x24			// a^b, b^c in next round
674	eor	x16,x16,x27,ror#41	// Sigma1(e)
675	eor	x0,x0,x23,ror#34
676	add	x22,x22,x17			// h+=Ch(e,f,g)
677	and	x19,x19,x28			// (b^c)&=(a^b)
678	eor	x14,x14,x7,ror#61
679	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
680	add	x22,x22,x16			// h+=Sigma1(e)
681	eor	x19,x19,x24			// Maj(a,b,c)
682	eor	x17,x0,x23,ror#39	// Sigma0(a)
683	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
684	add	x9,x9,x2
685	add	x26,x26,x22			// d+=h
686	add	x22,x22,x19			// h+=Maj(a,b,c)
687	ldr	x19,[x30],#8		// *K++, x28 in next round
688	add	x9,x9,x15
689	add	x22,x22,x17			// h+=Sigma0(a)
690	add	x9,x9,x14
// pass round 6: h=x21, uses x9, prepares x10
691	ldr	x14,[sp,#24]
692	str	x1,[sp,#16]
693	ror	x16,x26,#14
694	add	x21,x21,x19			// h+=K[i]
695	ror	x0,x11,#1
696	and	x17,x27,x26
697	ror	x15,x8,#19
698	bic	x19,x20,x26
699	ror	x1,x22,#28
700	add	x21,x21,x9			// h+=X[i]
701	eor	x16,x16,x26,ror#18
702	eor	x0,x0,x11,ror#8
703	orr	x17,x17,x19			// Ch(e,f,g)
704	eor	x19,x22,x23			// a^b, b^c in next round
705	eor	x16,x16,x26,ror#41	// Sigma1(e)
706	eor	x1,x1,x22,ror#34
707	add	x21,x21,x17			// h+=Ch(e,f,g)
708	and	x28,x28,x19			// (b^c)&=(a^b)
709	eor	x15,x15,x8,ror#61
710	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
711	add	x21,x21,x16			// h+=Sigma1(e)
712	eor	x28,x28,x23			// Maj(a,b,c)
713	eor	x17,x1,x22,ror#39	// Sigma0(a)
714	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
715	add	x10,x10,x3
716	add	x25,x25,x21			// d+=h
717	add	x21,x21,x28			// h+=Maj(a,b,c)
718	ldr	x28,[x30],#8		// *K++, x19 in next round
719	add	x10,x10,x0
720	add	x21,x21,x17			// h+=Sigma0(a)
721	add	x10,x10,x15
// pass round 7: h=x20, uses x10, prepares x11
722	ldr	x15,[sp,#0]
723	str	x2,[sp,#24]
724	ror	x16,x25,#14
725	add	x20,x20,x28			// h+=K[i]
726	ror	x1,x12,#1
727	and	x17,x26,x25
728	ror	x0,x9,#19
729	bic	x28,x27,x25
730	ror	x2,x21,#28
731	add	x20,x20,x10			// h+=X[i]
732	eor	x16,x16,x25,ror#18
733	eor	x1,x1,x12,ror#8
734	orr	x17,x17,x28			// Ch(e,f,g)
735	eor	x28,x21,x22			// a^b, b^c in next round
736	eor	x16,x16,x25,ror#41	// Sigma1(e)
737	eor	x2,x2,x21,ror#34
738	add	x20,x20,x17			// h+=Ch(e,f,g)
739	and	x19,x19,x28			// (b^c)&=(a^b)
740	eor	x0,x0,x9,ror#61
741	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
742	add	x20,x20,x16			// h+=Sigma1(e)
743	eor	x19,x19,x22			// Maj(a,b,c)
744	eor	x17,x2,x21,ror#39	// Sigma0(a)
745	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
746	add	x11,x11,x4
747	add	x24,x24,x20			// d+=h
748	add	x20,x20,x19			// h+=Maj(a,b,c)
749	ldr	x19,[x30],#8		// *K++, x28 in next round
750	add	x11,x11,x1
751	add	x20,x20,x17			// h+=Sigma0(a)
752	add	x11,x11,x0
// pass round 8: h=x27, uses x11, prepares x12
753	ldr	x0,[sp,#8]
754	str	x3,[sp,#0]
755	ror	x16,x24,#14
756	add	x27,x27,x19			// h+=K[i]
757	ror	x2,x13,#1
758	and	x17,x25,x24
759	ror	x1,x10,#19
760	bic	x19,x26,x24
761	ror	x3,x20,#28
762	add	x27,x27,x11			// h+=X[i]
763	eor	x16,x16,x24,ror#18
764	eor	x2,x2,x13,ror#8
765	orr	x17,x17,x19			// Ch(e,f,g)
766	eor	x19,x20,x21			// a^b, b^c in next round
767	eor	x16,x16,x24,ror#41	// Sigma1(e)
768	eor	x3,x3,x20,ror#34
769	add	x27,x27,x17			// h+=Ch(e,f,g)
770	and	x28,x28,x19			// (b^c)&=(a^b)
771	eor	x1,x1,x10,ror#61
772	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
773	add	x27,x27,x16			// h+=Sigma1(e)
774	eor	x28,x28,x21			// Maj(a,b,c)
775	eor	x17,x3,x20,ror#39	// Sigma0(a)
776	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
777	add	x12,x12,x5
778	add	x23,x23,x27			// d+=h
779	add	x27,x27,x28			// h+=Maj(a,b,c)
780	ldr	x28,[x30],#8		// *K++, x19 in next round
781	add	x12,x12,x2
782	add	x27,x27,x17			// h+=Sigma0(a)
783	add	x12,x12,x1
// pass round 9: h=x26, uses x12, prepares x13
784	ldr	x1,[sp,#16]
785	str	x4,[sp,#8]
786	ror	x16,x23,#14
787	add	x26,x26,x28			// h+=K[i]
788	ror	x3,x14,#1
789	and	x17,x24,x23
790	ror	x2,x11,#19
791	bic	x28,x25,x23
792	ror	x4,x27,#28
793	add	x26,x26,x12			// h+=X[i]
794	eor	x16,x16,x23,ror#18
795	eor	x3,x3,x14,ror#8
796	orr	x17,x17,x28			// Ch(e,f,g)
797	eor	x28,x27,x20			// a^b, b^c in next round
798	eor	x16,x16,x23,ror#41	// Sigma1(e)
799	eor	x4,x4,x27,ror#34
800	add	x26,x26,x17			// h+=Ch(e,f,g)
801	and	x19,x19,x28			// (b^c)&=(a^b)
802	eor	x2,x2,x11,ror#61
803	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
804	add	x26,x26,x16			// h+=Sigma1(e)
805	eor	x19,x19,x20			// Maj(a,b,c)
806	eor	x17,x4,x27,ror#39	// Sigma0(a)
807	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
808	add	x13,x13,x6
809	add	x22,x22,x26			// d+=h
810	add	x26,x26,x19			// h+=Maj(a,b,c)
811	ldr	x19,[x30],#8		// *K++, x28 in next round
812	add	x13,x13,x3
813	add	x26,x26,x17			// h+=Sigma0(a)
814	add	x13,x13,x2
// pass round 10: h=x25, uses x13, prepares x14
815	ldr	x2,[sp,#24]
816	str	x5,[sp,#16]
817	ror	x16,x22,#14
818	add	x25,x25,x19			// h+=K[i]
819	ror	x4,x15,#1
820	and	x17,x23,x22
821	ror	x3,x12,#19
822	bic	x19,x24,x22
823	ror	x5,x26,#28
824	add	x25,x25,x13			// h+=X[i]
825	eor	x16,x16,x22,ror#18
826	eor	x4,x4,x15,ror#8
827	orr	x17,x17,x19			// Ch(e,f,g)
828	eor	x19,x26,x27			// a^b, b^c in next round
829	eor	x16,x16,x22,ror#41	// Sigma1(e)
830	eor	x5,x5,x26,ror#34
831	add	x25,x25,x17			// h+=Ch(e,f,g)
832	and	x28,x28,x19			// (b^c)&=(a^b)
833	eor	x3,x3,x12,ror#61
834	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
835	add	x25,x25,x16			// h+=Sigma1(e)
836	eor	x28,x28,x27			// Maj(a,b,c)
837	eor	x17,x5,x26,ror#39	// Sigma0(a)
838	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
839	add	x14,x14,x7
840	add	x21,x21,x25			// d+=h
841	add	x25,x25,x28			// h+=Maj(a,b,c)
842	ldr	x28,[x30],#8		// *K++, x19 in next round
843	add	x14,x14,x4
844	add	x25,x25,x17			// h+=Sigma0(a)
845	add	x14,x14,x3
// pass round 11: h=x24, uses x14, prepares x15
846	ldr	x3,[sp,#0]
847	str	x6,[sp,#24]
848	ror	x16,x21,#14
849	add	x24,x24,x28			// h+=K[i]
850	ror	x5,x0,#1
851	and	x17,x22,x21
852	ror	x4,x13,#19
853	bic	x28,x23,x21
854	ror	x6,x25,#28
855	add	x24,x24,x14			// h+=X[i]
856	eor	x16,x16,x21,ror#18
857	eor	x5,x5,x0,ror#8
858	orr	x17,x17,x28			// Ch(e,f,g)
859	eor	x28,x25,x26			// a^b, b^c in next round
860	eor	x16,x16,x21,ror#41	// Sigma1(e)
861	eor	x6,x6,x25,ror#34
862	add	x24,x24,x17			// h+=Ch(e,f,g)
863	and	x19,x19,x28			// (b^c)&=(a^b)
864	eor	x4,x4,x13,ror#61
865	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
866	add	x24,x24,x16			// h+=Sigma1(e)
867	eor	x19,x19,x26			// Maj(a,b,c)
868	eor	x17,x6,x25,ror#39	// Sigma0(a)
869	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
870	add	x15,x15,x8
871	add	x20,x20,x24			// d+=h
872	add	x24,x24,x19			// h+=Maj(a,b,c)
873	ldr	x19,[x30],#8		// *K++, x28 in next round
874	add	x15,x15,x5
875	add	x24,x24,x17			// h+=Sigma0(a)
876	add	x15,x15,x4
// pass round 12: h=x23, uses x15, prepares x0
877	ldr	x4,[sp,#8]
878	str	x7,[sp,#0]
879	ror	x16,x20,#14
880	add	x23,x23,x19			// h+=K[i]
881	ror	x6,x1,#1
882	and	x17,x21,x20
883	ror	x5,x14,#19
884	bic	x19,x22,x20
885	ror	x7,x24,#28
886	add	x23,x23,x15			// h+=X[i]
887	eor	x16,x16,x20,ror#18
888	eor	x6,x6,x1,ror#8
889	orr	x17,x17,x19			// Ch(e,f,g)
890	eor	x19,x24,x25			// a^b, b^c in next round
891	eor	x16,x16,x20,ror#41	// Sigma1(e)
892	eor	x7,x7,x24,ror#34
893	add	x23,x23,x17			// h+=Ch(e,f,g)
894	and	x28,x28,x19			// (b^c)&=(a^b)
895	eor	x5,x5,x14,ror#61
896	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
897	add	x23,x23,x16			// h+=Sigma1(e)
898	eor	x28,x28,x25			// Maj(a,b,c)
899	eor	x17,x7,x24,ror#39	// Sigma0(a)
900	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
901	add	x0,x0,x9
902	add	x27,x27,x23			// d+=h
903	add	x23,x23,x28			// h+=Maj(a,b,c)
904	ldr	x28,[x30],#8		// *K++, x19 in next round
905	add	x0,x0,x6
906	add	x23,x23,x17			// h+=Sigma0(a)
907	add	x0,x0,x5
// pass round 13: h=x22, uses x0, prepares x1
908	ldr	x5,[sp,#16]
909	str	x8,[sp,#8]
910	ror	x16,x27,#14
911	add	x22,x22,x28			// h+=K[i]
912	ror	x7,x2,#1
913	and	x17,x20,x27
914	ror	x6,x15,#19
915	bic	x28,x21,x27
916	ror	x8,x23,#28
917	add	x22,x22,x0			// h+=X[i]
918	eor	x16,x16,x27,ror#18
919	eor	x7,x7,x2,ror#8
920	orr	x17,x17,x28			// Ch(e,f,g)
921	eor	x28,x23,x24			// a^b, b^c in next round
922	eor	x16,x16,x27,ror#41	// Sigma1(e)
923	eor	x8,x8,x23,ror#34
924	add	x22,x22,x17			// h+=Ch(e,f,g)
925	and	x19,x19,x28			// (b^c)&=(a^b)
926	eor	x6,x6,x15,ror#61
927	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
928	add	x22,x22,x16			// h+=Sigma1(e)
929	eor	x19,x19,x24			// Maj(a,b,c)
930	eor	x17,x8,x23,ror#39	// Sigma0(a)
931	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
932	add	x1,x1,x10
933	add	x26,x26,x22			// d+=h
934	add	x22,x22,x19			// h+=Maj(a,b,c)
935	ldr	x19,[x30],#8		// *K++, x28 in next round
936	add	x1,x1,x7
937	add	x22,x22,x17			// h+=Sigma0(a)
938	add	x1,x1,x6
// pass round 14: h=x21, uses x1, prepares x2
939	ldr	x6,[sp,#24]
940	str	x9,[sp,#16]
941	ror	x16,x26,#14
942	add	x21,x21,x19			// h+=K[i]
943	ror	x8,x3,#1
944	and	x17,x27,x26
945	ror	x7,x0,#19
946	bic	x19,x20,x26
947	ror	x9,x22,#28
948	add	x21,x21,x1			// h+=X[i]
949	eor	x16,x16,x26,ror#18
950	eor	x8,x8,x3,ror#8
951	orr	x17,x17,x19			// Ch(e,f,g)
952	eor	x19,x22,x23			// a^b, b^c in next round
953	eor	x16,x16,x26,ror#41	// Sigma1(e)
954	eor	x9,x9,x22,ror#34
955	add	x21,x21,x17			// h+=Ch(e,f,g)
956	and	x28,x28,x19			// (b^c)&=(a^b)
957	eor	x7,x7,x0,ror#61
958	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
959	add	x21,x21,x16			// h+=Sigma1(e)
960	eor	x28,x28,x23			// Maj(a,b,c)
961	eor	x17,x9,x22,ror#39	// Sigma0(a)
962	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
963	add	x2,x2,x11
964	add	x25,x25,x21			// d+=h
965	add	x21,x21,x28			// h+=Maj(a,b,c)
966	ldr	x28,[x30],#8		// *K++, x19 in next round
967	add	x2,x2,x8
968	add	x21,x21,x17			// h+=Sigma0(a)
969	add	x2,x2,x7
// pass round 15: h=x20, uses x2, prepares x3 for the next pass
970	ldr	x7,[sp,#0]
971	str	x10,[sp,#24]
972	ror	x16,x25,#14
973	add	x20,x20,x28			// h+=K[i]
974	ror	x9,x4,#1
975	and	x17,x26,x25
976	ror	x8,x1,#19
977	bic	x28,x27,x25
978	ror	x10,x21,#28
979	add	x20,x20,x2			// h+=X[i]
980	eor	x16,x16,x25,ror#18
981	eor	x9,x9,x4,ror#8
982	orr	x17,x17,x28			// Ch(e,f,g)
983	eor	x28,x21,x22			// a^b, b^c in next round
984	eor	x16,x16,x25,ror#41	// Sigma1(e)
985	eor	x10,x10,x21,ror#34
986	add	x20,x20,x17			// h+=Ch(e,f,g)
987	and	x19,x19,x28			// (b^c)&=(a^b)
988	eor	x8,x8,x1,ror#61
989	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
990	add	x20,x20,x16			// h+=Sigma1(e)
991	eor	x19,x19,x22			// Maj(a,b,c)
992	eor	x17,x10,x21,ror#39	// Sigma0(a)
993	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
994	add	x3,x3,x12
995	add	x24,x24,x20			// d+=h
996	add	x20,x20,x19			// h+=Maj(a,b,c)
997	ldr	x19,[x30],#8		// *K++, x28 in next round
998	add	x3,x3,x9
999	add	x20,x20,x17			// h+=Sigma0(a)
1000	add	x3,x3,x8
// x19 is the K word for the next round; zero means the table terminator was
// just read, so all rounds for this block are done.
1001	cbnz	x19,Loop_16_xx
1002
// End of one block: fold the working variables back into the state, then
// either start the next block or return.
//
// Recover the state pointer (x0), end-of-input pointer (x2) and the saved
// input pointer (x1); all three registers were used for message words.
1003	ldp	x0,x2,[x29,#96]
1004	ldr	x1,[x29,#112]
// x30 has stepped over 81 eight-byte table entries (80 round constants plus
// the zero terminator), hence 648 bytes back to the start of LK512.
1005	sub	x30,x30,#648		// rewind
1006
1007	ldp	x3,x4,[x0]
1008	ldp	x5,x6,[x0,#2*8]
// The pointer was saved after the first 16 input bytes had been consumed, so
// adding the remaining 14 words moves it to the start of the next block.
1009	add	x1,x1,#14*8			// advance input pointer
1010	ldp	x7,x8,[x0,#4*8]
// state[i] += working variable i, interleaved with the loads and stores.
1011	add	x20,x20,x3
1012	ldp	x9,x10,[x0,#6*8]
1013	add	x21,x21,x4
1014	add	x22,x22,x5
1015	add	x23,x23,x6
1016	stp	x20,x21,[x0]
1017	add	x24,x24,x7
1018	add	x25,x25,x8
1019	stp	x22,x23,[x0,#2*8]
1020	add	x26,x26,x9
1021	add	x27,x27,x10
// NOTE(review): the end-of-input test happens only after a block has been
// processed, so a block count of zero is presumably excluded by callers;
// confirm against the call sites.
1022	cmp	x1,x2
1023	stp	x24,x25,[x0,#4*8]
1024	stp	x26,x27,[x0,#6*8]
1025	b.ne	Loop
1026
// Epilogue: drop the 32-byte scratch area, restore x19..x28 through x29,
// then pop the 128-byte frame together with x29 and x30.
1027	ldp	x19,x20,[x29,#16]
1028	add	sp,sp,#4*8
1029	ldp	x21,x22,[x29,#32]
1030	ldp	x23,x24,[x29,#48]
1031	ldp	x25,x26,[x29,#64]
1032	ldp	x27,x28,[x29,#80]
1033	ldp	x29,x30,[sp],#128
1034	AARCH64_VALIDATE_LINK_REGISTER
1035	ret
1036
1037
// LK512: the 80 SHA-512 round constants as 64-bit words, two per line,
// followed by one zero word. The scalar code in sha512_block_data_order
// reads the table sequentially, treats the zero word as the end-of-rounds
// marker, and then rewinds its pointer by 648 bytes (81 entries of 8 bytes),
// so the entry count and the terminator must stay exactly as they are.
1038.section	.rodata
1039.align	6
1040
1041LK512:
1042.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1043.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1044.quad	0x3956c25bf348b538,0x59f111f1b605d019
1045.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1046.quad	0xd807aa98a3030242,0x12835b0145706fbe
1047.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1048.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1049.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1050.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1051.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1052.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1053.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1054.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1055.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1056.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1057.quad	0x06ca6351e003826f,0x142929670a0e6e70
1058.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1059.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1060.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1061.quad	0x81c2c92e47edaee6,0x92722c851482353b
1062.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1063.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1064.quad	0xd192e819d6ef5218,0xd69906245565a910
1065.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1066.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1067.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1068.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1069.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1070.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1071.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1072.quad	0x90befffa23631e28,0xa4506cebde82bde9
1073.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1074.quad	0xca273eceea26619c,0xd186b8c721c0c207
1075.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1076.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1077.quad	0x113f9804bef90dae,0x1b710b35131c471b
1078.quad	0x28db77f523047d84,0x32caab7b40c72493
1079.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1080.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1081.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
1082.quad	0	// terminator
1083
// NUL-terminated ASCII credit string; the bytes spell
// SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>
1084.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1085.align	2
1086.align	2
// Back to the code section for the hardware-accelerated routine.
1087.text
1088#ifndef	__KERNEL__
// ---------------------------------------------------------------------
// sha512_block_armv8 - SHA-512 compression of one or more 128-byte blocks
// using the Armv8 SHA-512 extension instructions (sha512h, sha512h2,
// sha512su0, sha512su1).
//
// Inputs, as consumed by the code below:
//   x0 = pointer to the hash state; 64 bytes (four 2x64-bit vectors) are
//        loaded on entry and stored back on exit
//   x1 = pointer to the input; 128 bytes are consumed per block
//   x2 = number of 128-byte blocks; decremented once per Loop_hw pass
//        NOTE(review): x2 == 0 is not checked here - presumably the caller
//        guarantees at least one block; confirm at the call site.
//
// Scratch: x3 (cursor into LK512), x4, v0-v7, v16-v29 and the condition
// flags. v8-v15 are never written. x29/x30 are pushed on entry; only x29
// is reloaded on exit, because this routine never modifies x30.
//
// Register roles inside Loop_hw:
//   v0-v4   working state; the five registers rotate roles every step
//   v5-v7   temporaries built with ext for sha512h / sha512su1
//   v16-v23 message schedule, two 64-bit words per register
//   v24,v25 alternating K-constant pairs; the next pair is loaded while
//           the current one is being used
//   v26-v29 copy of the incoming state, added back after the last round
//
// Each step below consumes one 16-byte pair of K constants and one
// schedule register, i.e. two SHA-512 rounds; 40 steps make 80 rounds.
// Steps for rounds 0-63 also advance the message schedule
// (sha512su0/sha512su1); steps for rounds 64-79 load and byte-swap the
// following input block instead.
//
// The SHA-512 instructions are emitted as raw .long words with the
// intended mnemonic in the trailing comment (presumably so the file builds
// with assemblers that lack the extension).
// ---------------------------------------------------------------------
// COFF symbol record: type 32 (0x20) marks sha512_block_armv8 as a function.
1089.def sha512_block_armv8
1090   .type 32
1091.endef
1092.align	6
1093sha512_block_armv8:
// Second label on the same address.
// NOTE(review): presumably a branch target for code elsewhere in this file - confirm.
1094Lv8_entry:
1095	stp	x29,x30,[sp,#-16]!
1096	add	x29,sp,#0
1097
1098	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
1099	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1100
1101	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
1102	adrp	x3,LK512
1103	add	x3,x3,:lo12:LK512
1104
// Byte-reverse every 64-bit lane of the first block's message words.
1105	rev64	v16.16b,v16.16b
1106	rev64	v17.16b,v17.16b
1107	rev64	v18.16b,v18.16b
1108	rev64	v19.16b,v19.16b
1109	rev64	v20.16b,v20.16b
1110	rev64	v21.16b,v21.16b
1111	rev64	v22.16b,v22.16b
1112	rev64	v23.16b,v23.16b
1113	b	Loop_hw
1114
1115.align	4
// One pass per 128-byte block. On entry x1 already points past the block
// held in v16-v23 and x3 points at the start of LK512.
1116Loop_hw:
1117	ld1	{v24.2d},[x3],#16
1118	subs	x2,x2,#1		// one block fewer to go; Z is set on the last block
1119	sub	x4,x1,#128		// x4 = start of the block currently in v16-v23
1120	orr	v26.16b,v0.16b,v0.16b			// offload
1121	orr	v27.16b,v1.16b,v1.16b
1122	orr	v28.16b,v2.16b,v2.16b
1123	orr	v29.16b,v3.16b,v3.16b
// On the last block x1 is moved back by 128, so the "load next input" loads
// in rounds 64-79 re-read the current block instead of running past the end
// of the input; the data loaded that way is never used.
1124	csel	x1,x1,x4,ne			// conditional rewind
// -- rounds 0-1: K pair in v24, schedule register v16 --
1125	add	v24.2d,v24.2d,v16.2d
1126	ld1	{v25.2d},[x3],#16
1127	ext	v24.16b,v24.16b,v24.16b,#8	// swap the two 64-bit halves
1128	ext	v5.16b,v2.16b,v3.16b,#8
1129	ext	v6.16b,v1.16b,v2.16b,#8
1130	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1131.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1132	ext	v7.16b,v20.16b,v21.16b,#8
1133.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1134.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1135	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1136.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
// -- rounds 2-3: K pair in v25, schedule register v17 --
1137	add	v25.2d,v25.2d,v17.2d
1138	ld1	{v24.2d},[x3],#16
1139	ext	v25.16b,v25.16b,v25.16b,#8
1140	ext	v5.16b,v4.16b,v2.16b,#8
1141	ext	v6.16b,v0.16b,v4.16b,#8
1142	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1143.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1144	ext	v7.16b,v21.16b,v22.16b,#8
1145.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1146.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1147	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1148.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// -- rounds 4-5: K pair in v24, schedule register v18 --
1149	add	v24.2d,v24.2d,v18.2d
1150	ld1	{v25.2d},[x3],#16
1151	ext	v24.16b,v24.16b,v24.16b,#8
1152	ext	v5.16b,v1.16b,v4.16b,#8
1153	ext	v6.16b,v3.16b,v1.16b,#8
1154	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1155.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1156	ext	v7.16b,v22.16b,v23.16b,#8
1157.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1158.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1159	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1160.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
// -- rounds 6-7: K pair in v25, schedule register v19 --
1161	add	v25.2d,v25.2d,v19.2d
1162	ld1	{v24.2d},[x3],#16
1163	ext	v25.16b,v25.16b,v25.16b,#8
1164	ext	v5.16b,v0.16b,v1.16b,#8
1165	ext	v6.16b,v2.16b,v0.16b,#8
1166	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1167.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1168	ext	v7.16b,v23.16b,v16.16b,#8
1169.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1170.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1171	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1172.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// -- rounds 8-9: K pair in v24, schedule register v20 --
1173	add	v24.2d,v24.2d,v20.2d
1174	ld1	{v25.2d},[x3],#16
1175	ext	v24.16b,v24.16b,v24.16b,#8
1176	ext	v5.16b,v3.16b,v0.16b,#8
1177	ext	v6.16b,v4.16b,v3.16b,#8
1178	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1179.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1180	ext	v7.16b,v16.16b,v17.16b,#8
1181.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1182.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1183	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1184.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// -- rounds 10-11: K pair in v25, schedule register v21 --
1185	add	v25.2d,v25.2d,v21.2d
1186	ld1	{v24.2d},[x3],#16
1187	ext	v25.16b,v25.16b,v25.16b,#8
1188	ext	v5.16b,v2.16b,v3.16b,#8
1189	ext	v6.16b,v1.16b,v2.16b,#8
1190	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1191.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1192	ext	v7.16b,v17.16b,v18.16b,#8
1193.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1194.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1195	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1196.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
// -- rounds 12-13: K pair in v24, schedule register v22 --
1197	add	v24.2d,v24.2d,v22.2d
1198	ld1	{v25.2d},[x3],#16
1199	ext	v24.16b,v24.16b,v24.16b,#8
1200	ext	v5.16b,v4.16b,v2.16b,#8
1201	ext	v6.16b,v0.16b,v4.16b,#8
1202	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1203.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1204	ext	v7.16b,v18.16b,v19.16b,#8
1205.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1206.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1207	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1208.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// -- rounds 14-15: K pair in v25, schedule register v23 --
1209	add	v25.2d,v25.2d,v23.2d
1210	ld1	{v24.2d},[x3],#16
1211	ext	v25.16b,v25.16b,v25.16b,#8
1212	ext	v5.16b,v1.16b,v4.16b,#8
1213	ext	v6.16b,v3.16b,v1.16b,#8
1214	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1215.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1216	ext	v7.16b,v19.16b,v20.16b,#8
1217.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1218.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1219	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1220.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
// -- rounds 16-17: K pair in v24, schedule register v16 --
1221	add	v24.2d,v24.2d,v16.2d
1222	ld1	{v25.2d},[x3],#16
1223	ext	v24.16b,v24.16b,v24.16b,#8
1224	ext	v5.16b,v0.16b,v1.16b,#8
1225	ext	v6.16b,v2.16b,v0.16b,#8
1226	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1227.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1228	ext	v7.16b,v20.16b,v21.16b,#8
1229.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1230.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1231	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1232.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// -- rounds 18-19: K pair in v25, schedule register v17 --
1233	add	v25.2d,v25.2d,v17.2d
1234	ld1	{v24.2d},[x3],#16
1235	ext	v25.16b,v25.16b,v25.16b,#8
1236	ext	v5.16b,v3.16b,v0.16b,#8
1237	ext	v6.16b,v4.16b,v3.16b,#8
1238	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1239.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1240	ext	v7.16b,v21.16b,v22.16b,#8
1241.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1242.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1243	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1244.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// -- rounds 20-21: K pair in v24, schedule register v18 --
1245	add	v24.2d,v24.2d,v18.2d
1246	ld1	{v25.2d},[x3],#16
1247	ext	v24.16b,v24.16b,v24.16b,#8
1248	ext	v5.16b,v2.16b,v3.16b,#8
1249	ext	v6.16b,v1.16b,v2.16b,#8
1250	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1251.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1252	ext	v7.16b,v22.16b,v23.16b,#8
1253.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1254.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1255	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1256.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
// -- rounds 22-23: K pair in v25, schedule register v19 --
1257	add	v25.2d,v25.2d,v19.2d
1258	ld1	{v24.2d},[x3],#16
1259	ext	v25.16b,v25.16b,v25.16b,#8
1260	ext	v5.16b,v4.16b,v2.16b,#8
1261	ext	v6.16b,v0.16b,v4.16b,#8
1262	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1263.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1264	ext	v7.16b,v23.16b,v16.16b,#8
1265.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1266.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1267	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1268.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// -- rounds 24-25: K pair in v24, schedule register v20 --
1269	add	v24.2d,v24.2d,v20.2d
1270	ld1	{v25.2d},[x3],#16
1271	ext	v24.16b,v24.16b,v24.16b,#8
1272	ext	v5.16b,v1.16b,v4.16b,#8
1273	ext	v6.16b,v3.16b,v1.16b,#8
1274	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1275.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1276	ext	v7.16b,v16.16b,v17.16b,#8
1277.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1278.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1279	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1280.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
// -- rounds 26-27: K pair in v25, schedule register v21 --
1281	add	v25.2d,v25.2d,v21.2d
1282	ld1	{v24.2d},[x3],#16
1283	ext	v25.16b,v25.16b,v25.16b,#8
1284	ext	v5.16b,v0.16b,v1.16b,#8
1285	ext	v6.16b,v2.16b,v0.16b,#8
1286	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1287.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1288	ext	v7.16b,v17.16b,v18.16b,#8
1289.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1290.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1291	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1292.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// -- rounds 28-29: K pair in v24, schedule register v22 --
1293	add	v24.2d,v24.2d,v22.2d
1294	ld1	{v25.2d},[x3],#16
1295	ext	v24.16b,v24.16b,v24.16b,#8
1296	ext	v5.16b,v3.16b,v0.16b,#8
1297	ext	v6.16b,v4.16b,v3.16b,#8
1298	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1299.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1300	ext	v7.16b,v18.16b,v19.16b,#8
1301.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1302.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1303	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1304.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// -- rounds 30-31: K pair in v25, schedule register v23 --
1305	add	v25.2d,v25.2d,v23.2d
1306	ld1	{v24.2d},[x3],#16
1307	ext	v25.16b,v25.16b,v25.16b,#8
1308	ext	v5.16b,v2.16b,v3.16b,#8
1309	ext	v6.16b,v1.16b,v2.16b,#8
1310	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1311.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1312	ext	v7.16b,v19.16b,v20.16b,#8
1313.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1314.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1315	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1316.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
// -- rounds 32-33: K pair in v24, schedule register v16 --
1317	add	v24.2d,v24.2d,v16.2d
1318	ld1	{v25.2d},[x3],#16
1319	ext	v24.16b,v24.16b,v24.16b,#8
1320	ext	v5.16b,v4.16b,v2.16b,#8
1321	ext	v6.16b,v0.16b,v4.16b,#8
1322	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1323.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1324	ext	v7.16b,v20.16b,v21.16b,#8
1325.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1326.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1327	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1328.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// -- rounds 34-35: K pair in v25, schedule register v17 --
1329	add	v25.2d,v25.2d,v17.2d
1330	ld1	{v24.2d},[x3],#16
1331	ext	v25.16b,v25.16b,v25.16b,#8
1332	ext	v5.16b,v1.16b,v4.16b,#8
1333	ext	v6.16b,v3.16b,v1.16b,#8
1334	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1335.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1336	ext	v7.16b,v21.16b,v22.16b,#8
1337.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1338.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1339	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1340.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
// -- rounds 36-37: K pair in v24, schedule register v18 --
1341	add	v24.2d,v24.2d,v18.2d
1342	ld1	{v25.2d},[x3],#16
1343	ext	v24.16b,v24.16b,v24.16b,#8
1344	ext	v5.16b,v0.16b,v1.16b,#8
1345	ext	v6.16b,v2.16b,v0.16b,#8
1346	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1347.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1348	ext	v7.16b,v22.16b,v23.16b,#8
1349.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1350.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1351	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1352.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// -- rounds 38-39: K pair in v25, schedule register v19 --
1353	add	v25.2d,v25.2d,v19.2d
1354	ld1	{v24.2d},[x3],#16
1355	ext	v25.16b,v25.16b,v25.16b,#8
1356	ext	v5.16b,v3.16b,v0.16b,#8
1357	ext	v6.16b,v4.16b,v3.16b,#8
1358	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1359.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1360	ext	v7.16b,v23.16b,v16.16b,#8
1361.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1362.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1363	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1364.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// -- rounds 40-41: K pair in v24, schedule register v20 --
1365	add	v24.2d,v24.2d,v20.2d
1366	ld1	{v25.2d},[x3],#16
1367	ext	v24.16b,v24.16b,v24.16b,#8
1368	ext	v5.16b,v2.16b,v3.16b,#8
1369	ext	v6.16b,v1.16b,v2.16b,#8
1370	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1371.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1372	ext	v7.16b,v16.16b,v17.16b,#8
1373.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1374.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1375	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1376.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
// -- rounds 42-43: K pair in v25, schedule register v21 --
1377	add	v25.2d,v25.2d,v21.2d
1378	ld1	{v24.2d},[x3],#16
1379	ext	v25.16b,v25.16b,v25.16b,#8
1380	ext	v5.16b,v4.16b,v2.16b,#8
1381	ext	v6.16b,v0.16b,v4.16b,#8
1382	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1383.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1384	ext	v7.16b,v17.16b,v18.16b,#8
1385.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1386.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1387	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1388.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// -- rounds 44-45: K pair in v24, schedule register v22 --
1389	add	v24.2d,v24.2d,v22.2d
1390	ld1	{v25.2d},[x3],#16
1391	ext	v24.16b,v24.16b,v24.16b,#8
1392	ext	v5.16b,v1.16b,v4.16b,#8
1393	ext	v6.16b,v3.16b,v1.16b,#8
1394	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1395.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1396	ext	v7.16b,v18.16b,v19.16b,#8
1397.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1398.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1399	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1400.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
// -- rounds 46-47: K pair in v25, schedule register v23 --
1401	add	v25.2d,v25.2d,v23.2d
1402	ld1	{v24.2d},[x3],#16
1403	ext	v25.16b,v25.16b,v25.16b,#8
1404	ext	v5.16b,v0.16b,v1.16b,#8
1405	ext	v6.16b,v2.16b,v0.16b,#8
1406	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1407.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1408	ext	v7.16b,v19.16b,v20.16b,#8
1409.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1410.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1411	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1412.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// -- rounds 48-49: K pair in v24, schedule register v16 --
1413	add	v24.2d,v24.2d,v16.2d
1414	ld1	{v25.2d},[x3],#16
1415	ext	v24.16b,v24.16b,v24.16b,#8
1416	ext	v5.16b,v3.16b,v0.16b,#8
1417	ext	v6.16b,v4.16b,v3.16b,#8
1418	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1419.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1420	ext	v7.16b,v20.16b,v21.16b,#8
1421.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1422.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1423	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1424.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// -- rounds 50-51: K pair in v25, schedule register v17 --
1425	add	v25.2d,v25.2d,v17.2d
1426	ld1	{v24.2d},[x3],#16
1427	ext	v25.16b,v25.16b,v25.16b,#8
1428	ext	v5.16b,v2.16b,v3.16b,#8
1429	ext	v6.16b,v1.16b,v2.16b,#8
1430	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1431.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1432	ext	v7.16b,v21.16b,v22.16b,#8
1433.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1434.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1435	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1436.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
// -- rounds 52-53: K pair in v24, schedule register v18 --
1437	add	v24.2d,v24.2d,v18.2d
1438	ld1	{v25.2d},[x3],#16
1439	ext	v24.16b,v24.16b,v24.16b,#8
1440	ext	v5.16b,v4.16b,v2.16b,#8
1441	ext	v6.16b,v0.16b,v4.16b,#8
1442	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1443.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1444	ext	v7.16b,v22.16b,v23.16b,#8
1445.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1446.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1447	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1448.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// -- rounds 54-55: K pair in v25, schedule register v19 --
1449	add	v25.2d,v25.2d,v19.2d
1450	ld1	{v24.2d},[x3],#16
1451	ext	v25.16b,v25.16b,v25.16b,#8
1452	ext	v5.16b,v1.16b,v4.16b,#8
1453	ext	v6.16b,v3.16b,v1.16b,#8
1454	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1455.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1456	ext	v7.16b,v23.16b,v16.16b,#8
1457.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1458.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1459	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1460.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
// -- rounds 56-57: K pair in v24, schedule register v20 --
1461	add	v24.2d,v24.2d,v20.2d
1462	ld1	{v25.2d},[x3],#16
1463	ext	v24.16b,v24.16b,v24.16b,#8
1464	ext	v5.16b,v0.16b,v1.16b,#8
1465	ext	v6.16b,v2.16b,v0.16b,#8
1466	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1467.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1468	ext	v7.16b,v16.16b,v17.16b,#8
1469.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1470.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1471	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1472.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// -- rounds 58-59: K pair in v25, schedule register v21 --
1473	add	v25.2d,v25.2d,v21.2d
1474	ld1	{v24.2d},[x3],#16
1475	ext	v25.16b,v25.16b,v25.16b,#8
1476	ext	v5.16b,v3.16b,v0.16b,#8
1477	ext	v6.16b,v4.16b,v3.16b,#8
1478	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1479.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1480	ext	v7.16b,v17.16b,v18.16b,#8
1481.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1482.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1483	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1484.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// -- rounds 60-61: K pair in v24, schedule register v22 --
1485	add	v24.2d,v24.2d,v22.2d
1486	ld1	{v25.2d},[x3],#16
1487	ext	v24.16b,v24.16b,v24.16b,#8
1488	ext	v5.16b,v2.16b,v3.16b,#8
1489	ext	v6.16b,v1.16b,v2.16b,#8
1490	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1491.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1492	ext	v7.16b,v18.16b,v19.16b,#8
1493.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1494.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1495	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1496.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
// -- rounds 62-63: K pair in v25, schedule register v23 (last schedule update) --
1497	add	v25.2d,v25.2d,v23.2d
1498	ld1	{v24.2d},[x3],#16
1499	ext	v25.16b,v25.16b,v25.16b,#8
1500	ext	v5.16b,v4.16b,v2.16b,#8
1501	ext	v6.16b,v0.16b,v4.16b,#8
1502	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1503.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1504	ext	v7.16b,v19.16b,v20.16b,#8
1505.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1506.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1507	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1508.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// Rounds 64-79: no further schedule updates. As soon as a schedule register
// has been added into the K pair it is overwritten with 16 bytes of the
// next block (or of the current block again on the last pass, see the
// conditional rewind) and byte-swapped for the next Loop_hw pass.
// -- rounds 64-65: K pair in v24, schedule register v16 --
1509	ld1	{v25.2d},[x3],#16
1510	add	v24.2d,v24.2d,v16.2d
1511	ld1	{v16.16b},[x1],#16		// load next input
1512	ext	v24.16b,v24.16b,v24.16b,#8
1513	ext	v5.16b,v1.16b,v4.16b,#8
1514	ext	v6.16b,v3.16b,v1.16b,#8
1515	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1516.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1517	rev64	v16.16b,v16.16b
1518	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1519.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
// -- rounds 66-67: K pair in v25, schedule register v17 --
1520	ld1	{v24.2d},[x3],#16
1521	add	v25.2d,v25.2d,v17.2d
1522	ld1	{v17.16b},[x1],#16		// load next input
1523	ext	v25.16b,v25.16b,v25.16b,#8
1524	ext	v5.16b,v0.16b,v1.16b,#8
1525	ext	v6.16b,v2.16b,v0.16b,#8
1526	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1527.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1528	rev64	v17.16b,v17.16b
1529	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1530.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// -- rounds 68-69: K pair in v24, schedule register v18 --
1531	ld1	{v25.2d},[x3],#16
1532	add	v24.2d,v24.2d,v18.2d
1533	ld1	{v18.16b},[x1],#16		// load next input
1534	ext	v24.16b,v24.16b,v24.16b,#8
1535	ext	v5.16b,v3.16b,v0.16b,#8
1536	ext	v6.16b,v4.16b,v3.16b,#8
1537	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1538.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1539	rev64	v18.16b,v18.16b
1540	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1541.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// -- rounds 70-71: K pair in v25, schedule register v19 --
1542	ld1	{v24.2d},[x3],#16
1543	add	v25.2d,v25.2d,v19.2d
1544	ld1	{v19.16b},[x1],#16		// load next input
1545	ext	v25.16b,v25.16b,v25.16b,#8
1546	ext	v5.16b,v2.16b,v3.16b,#8
1547	ext	v6.16b,v1.16b,v2.16b,#8
1548	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1549.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1550	rev64	v19.16b,v19.16b
1551	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1552.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
// -- rounds 72-73: K pair in v24, schedule register v20 --
1553	ld1	{v25.2d},[x3],#16
1554	add	v24.2d,v24.2d,v20.2d
1555	ld1	{v20.16b},[x1],#16		// load next input
1556	ext	v24.16b,v24.16b,v24.16b,#8
1557	ext	v5.16b,v4.16b,v2.16b,#8
1558	ext	v6.16b,v0.16b,v4.16b,#8
1559	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1560.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1561	rev64	v20.16b,v20.16b
1562	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1563.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// -- rounds 74-75: K pair in v25, schedule register v21 --
1564	ld1	{v24.2d},[x3],#16
1565	add	v25.2d,v25.2d,v21.2d
1566	ld1	{v21.16b},[x1],#16		// load next input
1567	ext	v25.16b,v25.16b,v25.16b,#8
1568	ext	v5.16b,v1.16b,v4.16b,#8
1569	ext	v6.16b,v3.16b,v1.16b,#8
1570	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1571.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1572	rev64	v21.16b,v21.16b
1573	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1574.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
// -- rounds 76-77: K pair in v24, schedule register v22 --
1575	ld1	{v25.2d},[x3],#16
1576	add	v24.2d,v24.2d,v22.2d
1577	ld1	{v22.16b},[x1],#16		// load next input
1578	ext	v24.16b,v24.16b,v24.16b,#8
1579	ext	v5.16b,v0.16b,v1.16b,#8
1580	ext	v6.16b,v2.16b,v0.16b,#8
1581	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1582.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1583	rev64	v22.16b,v22.16b
1584	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1585.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// -- rounds 78-79: K pair in v25, schedule register v23 --
// All 40 K pairs (80 quads) have been loaded; point x3 back at LK512.
1586	sub	x3,x3,#80*8	// rewind
1587	add	v25.2d,v25.2d,v23.2d
1588	ld1	{v23.16b},[x1],#16		// load next input
1589	ext	v25.16b,v25.16b,v25.16b,#8
1590	ext	v5.16b,v3.16b,v0.16b,#8
1591	ext	v6.16b,v4.16b,v3.16b,#8
1592	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1593.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1594	rev64	v23.16b,v23.16b
1595	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1596.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// Add the state saved in v26-v29 at the top of the pass to the new state.
1597	add	v0.2d,v0.2d,v26.2d			// accumulate
1598	add	v1.2d,v1.2d,v27.2d
1599	add	v2.2d,v2.2d,v28.2d
1600	add	v3.2d,v3.2d,v29.2d
1601
1602	cbnz	x2,Loop_hw		// more blocks remaining
1603
1604	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
1605
1606	ldr	x29,[sp],#16		// restore x29 and pop the 16-byte frame; x30 was never changed
1607	ret
1608
1609#endif
1610#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
1611