xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/gen/bcm/sha512-armv8-win.S (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
7// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
8//
9// Licensed under the OpenSSL license (the "License").  You may not use
10// this file except in compliance with the License.  You can obtain a copy
11// in the file LICENSE in the source distribution or at
12// https://www.openssl.org/source/license.html
13
14// ====================================================================
15// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
16// project. The module is, however, dual licensed under OpenSSL and
17// CRYPTOGAMS licenses depending on where you obtain it. For further
18// details see http://www.openssl.org/~appro/cryptogams/.
19//
20// Permission to use under GPLv2 terms is granted.
21// ====================================================================
22//
23// SHA256/512 for ARMv8.
24//
25// Performance in cycles per processed byte and improvement coefficient
26// over code generated with "default" compiler:
27//
28//		SHA256-hw	SHA256(*)	SHA512
29// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
30// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
31// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
32// Denver	2.01		10.5 (+26%)	6.70 (+8%)
33// X-Gene			20.0 (+100%)	12.8 (+300%(***))
34// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
35// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
36//
37// (*)	Software SHA256 results are of lesser relevance, presented
38//	mostly for informational purposes.
39// (**)	The result is a trade-off: it's possible to improve it by
40//	10% (or by 1 cycle per round), but at the cost of 20% loss
41//	on Cortex-A53 (or by 4 cycles per round).
42// (***)	Super-impressive coefficients over gcc-generated code are
43//	indication of some compiler "pathology", most notably code
44//	generated with -mgeneral-regs-only is significantly faster
45//	and the gap is only 40-90%.
46
47#ifndef	__KERNEL__
48# include <openssl/arm_arch.h>
49#endif
50
51.text
52
// ---------------------------------------------------------------------
// sha512_block_data_order_nohw
//
// SHA-512 block function written with general-purpose integer
// instructions only (this routine touches no vector registers).
//
// Inputs, as consumed by the code below:
//   x0 - pointer to the eight 64-bit state words; they are loaded into
//        x20-x27 and written back, updated, after every block
//   x1 - pointer to the input data
//   x2 - number of 128-byte blocks (scaled with lsl#7 to form the
//        end-of-input pointer)
//
// Register roles inside the loops:
//   x20-x27        working variables a..h; the roles rotate by one
//                  register per round instead of moving the values
//   x3-x15, x0-x2  message words X[]; the registers double as per-round
//                  temporaries, so four words at a time are parked in
//                  the spill area at [sp,#0]..[sp,#24]
//   x16, x17       scratch for Sigma1(e) / Ch(e,f,g) / Sigma0(a)
//   x19, x28       alternate each round between the K[i] value and the
//                  a^b / b^c terms used to build Maj(a,b,c)
//   x30            running pointer into LK512 (the return address is
//                  kept in the frame while x30 is used this way)
//
// Frame layout (x29 = frame base, 128 bytes, plus 32 bytes below it):
//   [x29,#0]    saved x29, x30
//   [x29,#16]   saved x19..x28 (five pairs, up to offset 95)
//   [x29,#96]   state pointer (x0), [x29,#104] end-of-input pointer
//   [x29,#112]  input pointer of the current block, recorded after the
//               first 16 bytes have been consumed
//   [sp,#0..31] spill slots for message-schedule words
//
// NOTE: there is no early exit for a block count of zero; the body of
// Loop always executes at least once before x1 is compared with the end
// pointer.
// ---------------------------------------------------------------------
53.globl	sha512_block_data_order_nohw
54
// COFF symbol record for the function (see GNU as .def/.type/.endef).
55.def sha512_block_data_order_nohw
56   .type 32
57.endef
58.align	6
59sha512_block_data_order_nohw:
// Macro defined outside this file; presumably signs the return address
// when pointer authentication is enabled - confirm in arm_arch.h.
60	AARCH64_SIGN_LINK_REGISTER
61	stp	x29,x30,[sp,#-128]!
62	add	x29,sp,#0
63
// Save the callee-saved registers x19-x28 that the rounds overwrite.
64	stp	x19,x20,[sp,#16]
65	stp	x21,x22,[sp,#32]
66	stp	x23,x24,[sp,#48]
67	stp	x25,x26,[sp,#64]
68	stp	x27,x28,[sp,#80]
// Reserve 32 bytes below the frame as spill space for message words.
69	sub	sp,sp,#4*8
70
71	ldp	x20,x21,[x0]				// load context
72	ldp	x22,x23,[x0,#2*8]
73	ldp	x24,x25,[x0,#4*8]
74	add	x2,x1,x2,lsl#7	// end of input
75	ldp	x26,x27,[x0,#6*8]
// x30 = address of LK512 (page address plus low 12 bits).
76	adrp	x30,LK512
77	add	x30,x30,:lo12:LK512
// Keep the state pointer and the end pointer in the frame: x0 and x2 are
// reused as message-word registers inside the rounds.
78	stp	x0,x2,[x29,#96]
79
// ---------------------------------------------------------------------
// Outer loop, one iteration per 128-byte block.  Rounds 0..15 follow:
// the message words are loaded straight from the input (byte-reversed
// with rev unless __AARCH64EB__ is defined) and fed into the round
// function.  The "// N" tags on the rev lines number the word loaded.
// ---------------------------------------------------------------------
80Loop:
81	ldp	x3,x4,[x1],#2*8
82	ldr	x19,[x30],#8			// *K++
83	eor	x28,x21,x22				// magic seed
// Record the input pointer (already advanced by 16 bytes); x1 itself is
// overwritten with a message word later in this block.
84	str	x1,[x29,#112]
85#ifndef	__AARCH64EB__
86	rev	x3,x3			// 0
87#endif
88	ror	x16,x24,#14
89	add	x27,x27,x19			// h+=K[i]
90	eor	x6,x24,x24,ror#23
91	and	x17,x25,x24
92	bic	x19,x26,x24
93	add	x27,x27,x3			// h+=X[i]
94	orr	x17,x17,x19			// Ch(e,f,g)
95	eor	x19,x20,x21			// a^b, b^c in next round
96	eor	x16,x16,x6,ror#18	// Sigma1(e)
97	ror	x6,x20,#28
98	add	x27,x27,x17			// h+=Ch(e,f,g)
99	eor	x17,x20,x20,ror#5
100	add	x27,x27,x16			// h+=Sigma1(e)
101	and	x28,x28,x19			// (b^c)&=(a^b)
102	add	x23,x23,x27			// d+=h
103	eor	x28,x28,x21			// Maj(a,b,c)
104	eor	x17,x6,x17,ror#34	// Sigma0(a)
105	add	x27,x27,x28			// h+=Maj(a,b,c)
106	ldr	x28,[x30],#8		// *K++, x19 in next round
// The Sigma0(a) addition is deferred into the start of the next round
// (the disabled line below marks where it logically belongs).
107	//add	x27,x27,x17			// h+=Sigma0(a)
108#ifndef	__AARCH64EB__
109	rev	x4,x4			// 1
110#endif
111	ldp	x5,x6,[x1],#2*8
112	add	x27,x27,x17			// h+=Sigma0(a)
113	ror	x16,x23,#14
114	add	x26,x26,x28			// h+=K[i]
115	eor	x7,x23,x23,ror#23
116	and	x17,x24,x23
117	bic	x28,x25,x23
118	add	x26,x26,x4			// h+=X[i]
119	orr	x17,x17,x28			// Ch(e,f,g)
120	eor	x28,x27,x20			// a^b, b^c in next round
121	eor	x16,x16,x7,ror#18	// Sigma1(e)
122	ror	x7,x27,#28
123	add	x26,x26,x17			// h+=Ch(e,f,g)
124	eor	x17,x27,x27,ror#5
125	add	x26,x26,x16			// h+=Sigma1(e)
126	and	x19,x19,x28			// (b^c)&=(a^b)
127	add	x22,x22,x26			// d+=h
128	eor	x19,x19,x20			// Maj(a,b,c)
129	eor	x17,x7,x17,ror#34	// Sigma0(a)
130	add	x26,x26,x19			// h+=Maj(a,b,c)
131	ldr	x19,[x30],#8		// *K++, x28 in next round
132	//add	x26,x26,x17			// h+=Sigma0(a)
133#ifndef	__AARCH64EB__
134	rev	x5,x5			// 2
135#endif
136	add	x26,x26,x17			// h+=Sigma0(a)
137	ror	x16,x22,#14
138	add	x25,x25,x19			// h+=K[i]
139	eor	x8,x22,x22,ror#23
140	and	x17,x23,x22
141	bic	x19,x24,x22
142	add	x25,x25,x5			// h+=X[i]
143	orr	x17,x17,x19			// Ch(e,f,g)
144	eor	x19,x26,x27			// a^b, b^c in next round
145	eor	x16,x16,x8,ror#18	// Sigma1(e)
146	ror	x8,x26,#28
147	add	x25,x25,x17			// h+=Ch(e,f,g)
148	eor	x17,x26,x26,ror#5
149	add	x25,x25,x16			// h+=Sigma1(e)
150	and	x28,x28,x19			// (b^c)&=(a^b)
151	add	x21,x21,x25			// d+=h
152	eor	x28,x28,x27			// Maj(a,b,c)
153	eor	x17,x8,x17,ror#34	// Sigma0(a)
154	add	x25,x25,x28			// h+=Maj(a,b,c)
155	ldr	x28,[x30],#8		// *K++, x19 in next round
156	//add	x25,x25,x17			// h+=Sigma0(a)
157#ifndef	__AARCH64EB__
158	rev	x6,x6			// 3
159#endif
160	ldp	x7,x8,[x1],#2*8
161	add	x25,x25,x17			// h+=Sigma0(a)
162	ror	x16,x21,#14
163	add	x24,x24,x28			// h+=K[i]
164	eor	x9,x21,x21,ror#23
165	and	x17,x22,x21
166	bic	x28,x23,x21
167	add	x24,x24,x6			// h+=X[i]
168	orr	x17,x17,x28			// Ch(e,f,g)
169	eor	x28,x25,x26			// a^b, b^c in next round
170	eor	x16,x16,x9,ror#18	// Sigma1(e)
171	ror	x9,x25,#28
172	add	x24,x24,x17			// h+=Ch(e,f,g)
173	eor	x17,x25,x25,ror#5
174	add	x24,x24,x16			// h+=Sigma1(e)
175	and	x19,x19,x28			// (b^c)&=(a^b)
176	add	x20,x20,x24			// d+=h
177	eor	x19,x19,x26			// Maj(a,b,c)
178	eor	x17,x9,x17,ror#34	// Sigma0(a)
179	add	x24,x24,x19			// h+=Maj(a,b,c)
180	ldr	x19,[x30],#8		// *K++, x28 in next round
181	//add	x24,x24,x17			// h+=Sigma0(a)
182#ifndef	__AARCH64EB__
183	rev	x7,x7			// 4
184#endif
185	add	x24,x24,x17			// h+=Sigma0(a)
186	ror	x16,x20,#14
187	add	x23,x23,x19			// h+=K[i]
188	eor	x10,x20,x20,ror#23
189	and	x17,x21,x20
190	bic	x19,x22,x20
191	add	x23,x23,x7			// h+=X[i]
192	orr	x17,x17,x19			// Ch(e,f,g)
193	eor	x19,x24,x25			// a^b, b^c in next round
194	eor	x16,x16,x10,ror#18	// Sigma1(e)
195	ror	x10,x24,#28
196	add	x23,x23,x17			// h+=Ch(e,f,g)
197	eor	x17,x24,x24,ror#5
198	add	x23,x23,x16			// h+=Sigma1(e)
199	and	x28,x28,x19			// (b^c)&=(a^b)
200	add	x27,x27,x23			// d+=h
201	eor	x28,x28,x25			// Maj(a,b,c)
202	eor	x17,x10,x17,ror#34	// Sigma0(a)
203	add	x23,x23,x28			// h+=Maj(a,b,c)
204	ldr	x28,[x30],#8		// *K++, x19 in next round
205	//add	x23,x23,x17			// h+=Sigma0(a)
206#ifndef	__AARCH64EB__
207	rev	x8,x8			// 5
208#endif
209	ldp	x9,x10,[x1],#2*8
210	add	x23,x23,x17			// h+=Sigma0(a)
211	ror	x16,x27,#14
212	add	x22,x22,x28			// h+=K[i]
213	eor	x11,x27,x27,ror#23
214	and	x17,x20,x27
215	bic	x28,x21,x27
216	add	x22,x22,x8			// h+=X[i]
217	orr	x17,x17,x28			// Ch(e,f,g)
218	eor	x28,x23,x24			// a^b, b^c in next round
219	eor	x16,x16,x11,ror#18	// Sigma1(e)
220	ror	x11,x23,#28
221	add	x22,x22,x17			// h+=Ch(e,f,g)
222	eor	x17,x23,x23,ror#5
223	add	x22,x22,x16			// h+=Sigma1(e)
224	and	x19,x19,x28			// (b^c)&=(a^b)
225	add	x26,x26,x22			// d+=h
226	eor	x19,x19,x24			// Maj(a,b,c)
227	eor	x17,x11,x17,ror#34	// Sigma0(a)
228	add	x22,x22,x19			// h+=Maj(a,b,c)
229	ldr	x19,[x30],#8		// *K++, x28 in next round
230	//add	x22,x22,x17			// h+=Sigma0(a)
231#ifndef	__AARCH64EB__
232	rev	x9,x9			// 6
233#endif
234	add	x22,x22,x17			// h+=Sigma0(a)
235	ror	x16,x26,#14
236	add	x21,x21,x19			// h+=K[i]
237	eor	x12,x26,x26,ror#23
238	and	x17,x27,x26
239	bic	x19,x20,x26
240	add	x21,x21,x9			// h+=X[i]
241	orr	x17,x17,x19			// Ch(e,f,g)
242	eor	x19,x22,x23			// a^b, b^c in next round
243	eor	x16,x16,x12,ror#18	// Sigma1(e)
244	ror	x12,x22,#28
245	add	x21,x21,x17			// h+=Ch(e,f,g)
246	eor	x17,x22,x22,ror#5
247	add	x21,x21,x16			// h+=Sigma1(e)
248	and	x28,x28,x19			// (b^c)&=(a^b)
249	add	x25,x25,x21			// d+=h
250	eor	x28,x28,x23			// Maj(a,b,c)
251	eor	x17,x12,x17,ror#34	// Sigma0(a)
252	add	x21,x21,x28			// h+=Maj(a,b,c)
253	ldr	x28,[x30],#8		// *K++, x19 in next round
254	//add	x21,x21,x17			// h+=Sigma0(a)
255#ifndef	__AARCH64EB__
256	rev	x10,x10			// 7
257#endif
258	ldp	x11,x12,[x1],#2*8
259	add	x21,x21,x17			// h+=Sigma0(a)
260	ror	x16,x25,#14
261	add	x20,x20,x28			// h+=K[i]
262	eor	x13,x25,x25,ror#23
263	and	x17,x26,x25
264	bic	x28,x27,x25
265	add	x20,x20,x10			// h+=X[i]
266	orr	x17,x17,x28			// Ch(e,f,g)
267	eor	x28,x21,x22			// a^b, b^c in next round
268	eor	x16,x16,x13,ror#18	// Sigma1(e)
269	ror	x13,x21,#28
270	add	x20,x20,x17			// h+=Ch(e,f,g)
271	eor	x17,x21,x21,ror#5
272	add	x20,x20,x16			// h+=Sigma1(e)
273	and	x19,x19,x28			// (b^c)&=(a^b)
274	add	x24,x24,x20			// d+=h
275	eor	x19,x19,x22			// Maj(a,b,c)
276	eor	x17,x13,x17,ror#34	// Sigma0(a)
277	add	x20,x20,x19			// h+=Maj(a,b,c)
278	ldr	x19,[x30],#8		// *K++, x28 in next round
279	//add	x20,x20,x17			// h+=Sigma0(a)
280#ifndef	__AARCH64EB__
281	rev	x11,x11			// 8
282#endif
283	add	x20,x20,x17			// h+=Sigma0(a)
284	ror	x16,x24,#14
285	add	x27,x27,x19			// h+=K[i]
286	eor	x14,x24,x24,ror#23
287	and	x17,x25,x24
288	bic	x19,x26,x24
289	add	x27,x27,x11			// h+=X[i]
290	orr	x17,x17,x19			// Ch(e,f,g)
291	eor	x19,x20,x21			// a^b, b^c in next round
292	eor	x16,x16,x14,ror#18	// Sigma1(e)
293	ror	x14,x20,#28
294	add	x27,x27,x17			// h+=Ch(e,f,g)
295	eor	x17,x20,x20,ror#5
296	add	x27,x27,x16			// h+=Sigma1(e)
297	and	x28,x28,x19			// (b^c)&=(a^b)
298	add	x23,x23,x27			// d+=h
299	eor	x28,x28,x21			// Maj(a,b,c)
300	eor	x17,x14,x17,ror#34	// Sigma0(a)
301	add	x27,x27,x28			// h+=Maj(a,b,c)
302	ldr	x28,[x30],#8		// *K++, x19 in next round
303	//add	x27,x27,x17			// h+=Sigma0(a)
304#ifndef	__AARCH64EB__
305	rev	x12,x12			// 9
306#endif
307	ldp	x13,x14,[x1],#2*8
308	add	x27,x27,x17			// h+=Sigma0(a)
309	ror	x16,x23,#14
310	add	x26,x26,x28			// h+=K[i]
311	eor	x15,x23,x23,ror#23
312	and	x17,x24,x23
313	bic	x28,x25,x23
314	add	x26,x26,x12			// h+=X[i]
315	orr	x17,x17,x28			// Ch(e,f,g)
316	eor	x28,x27,x20			// a^b, b^c in next round
317	eor	x16,x16,x15,ror#18	// Sigma1(e)
318	ror	x15,x27,#28
319	add	x26,x26,x17			// h+=Ch(e,f,g)
320	eor	x17,x27,x27,ror#5
321	add	x26,x26,x16			// h+=Sigma1(e)
322	and	x19,x19,x28			// (b^c)&=(a^b)
323	add	x22,x22,x26			// d+=h
324	eor	x19,x19,x20			// Maj(a,b,c)
325	eor	x17,x15,x17,ror#34	// Sigma0(a)
326	add	x26,x26,x19			// h+=Maj(a,b,c)
327	ldr	x19,[x30],#8		// *K++, x28 in next round
328	//add	x26,x26,x17			// h+=Sigma0(a)
329#ifndef	__AARCH64EB__
330	rev	x13,x13			// 10
331#endif
332	add	x26,x26,x17			// h+=Sigma0(a)
333	ror	x16,x22,#14
334	add	x25,x25,x19			// h+=K[i]
// x0 is free as a temporary here: the state pointer was stored at
// [x29,#96] before Loop.
335	eor	x0,x22,x22,ror#23
336	and	x17,x23,x22
337	bic	x19,x24,x22
338	add	x25,x25,x13			// h+=X[i]
339	orr	x17,x17,x19			// Ch(e,f,g)
340	eor	x19,x26,x27			// a^b, b^c in next round
341	eor	x16,x16,x0,ror#18	// Sigma1(e)
342	ror	x0,x26,#28
343	add	x25,x25,x17			// h+=Ch(e,f,g)
344	eor	x17,x26,x26,ror#5
345	add	x25,x25,x16			// h+=Sigma1(e)
346	and	x28,x28,x19			// (b^c)&=(a^b)
347	add	x21,x21,x25			// d+=h
348	eor	x28,x28,x27			// Maj(a,b,c)
349	eor	x17,x0,x17,ror#34	// Sigma0(a)
350	add	x25,x25,x28			// h+=Maj(a,b,c)
351	ldr	x28,[x30],#8		// *K++, x19 in next round
352	//add	x25,x25,x17			// h+=Sigma0(a)
353#ifndef	__AARCH64EB__
354	rev	x14,x14			// 11
355#endif
356	ldp	x15,x0,[x1],#2*8
357	add	x25,x25,x17			// h+=Sigma0(a)
// From here on a message word is parked on the stack before its register
// is reused as the round temporary (x6 now, x7..x10 in later rounds).
358	str	x6,[sp,#24]
359	ror	x16,x21,#14
360	add	x24,x24,x28			// h+=K[i]
361	eor	x6,x21,x21,ror#23
362	and	x17,x22,x21
363	bic	x28,x23,x21
364	add	x24,x24,x14			// h+=X[i]
365	orr	x17,x17,x28			// Ch(e,f,g)
366	eor	x28,x25,x26			// a^b, b^c in next round
367	eor	x16,x16,x6,ror#18	// Sigma1(e)
368	ror	x6,x25,#28
369	add	x24,x24,x17			// h+=Ch(e,f,g)
370	eor	x17,x25,x25,ror#5
371	add	x24,x24,x16			// h+=Sigma1(e)
372	and	x19,x19,x28			// (b^c)&=(a^b)
373	add	x20,x20,x24			// d+=h
374	eor	x19,x19,x26			// Maj(a,b,c)
375	eor	x17,x6,x17,ror#34	// Sigma0(a)
376	add	x24,x24,x19			// h+=Maj(a,b,c)
377	ldr	x19,[x30],#8		// *K++, x28 in next round
378	//add	x24,x24,x17			// h+=Sigma0(a)
379#ifndef	__AARCH64EB__
380	rev	x15,x15			// 12
381#endif
382	add	x24,x24,x17			// h+=Sigma0(a)
383	str	x7,[sp,#0]
384	ror	x16,x20,#14
385	add	x23,x23,x19			// h+=K[i]
386	eor	x7,x20,x20,ror#23
387	and	x17,x21,x20
388	bic	x19,x22,x20
389	add	x23,x23,x15			// h+=X[i]
390	orr	x17,x17,x19			// Ch(e,f,g)
391	eor	x19,x24,x25			// a^b, b^c in next round
392	eor	x16,x16,x7,ror#18	// Sigma1(e)
393	ror	x7,x24,#28
394	add	x23,x23,x17			// h+=Ch(e,f,g)
395	eor	x17,x24,x24,ror#5
396	add	x23,x23,x16			// h+=Sigma1(e)
397	and	x28,x28,x19			// (b^c)&=(a^b)
398	add	x27,x27,x23			// d+=h
399	eor	x28,x28,x25			// Maj(a,b,c)
400	eor	x17,x7,x17,ror#34	// Sigma0(a)
401	add	x23,x23,x28			// h+=Maj(a,b,c)
402	ldr	x28,[x30],#8		// *K++, x19 in next round
403	//add	x23,x23,x17			// h+=Sigma0(a)
404#ifndef	__AARCH64EB__
405	rev	x0,x0			// 13
406#endif
// Last two input words of the block. No post-increment, and x1 itself is
// overwritten; the input pointer is recovered from [x29,#112] at the end
// of the block.
407	ldp	x1,x2,[x1]
408	add	x23,x23,x17			// h+=Sigma0(a)
409	str	x8,[sp,#8]
410	ror	x16,x27,#14
411	add	x22,x22,x28			// h+=K[i]
412	eor	x8,x27,x27,ror#23
413	and	x17,x20,x27
414	bic	x28,x21,x27
415	add	x22,x22,x0			// h+=X[i]
416	orr	x17,x17,x28			// Ch(e,f,g)
417	eor	x28,x23,x24			// a^b, b^c in next round
418	eor	x16,x16,x8,ror#18	// Sigma1(e)
419	ror	x8,x23,#28
420	add	x22,x22,x17			// h+=Ch(e,f,g)
421	eor	x17,x23,x23,ror#5
422	add	x22,x22,x16			// h+=Sigma1(e)
423	and	x19,x19,x28			// (b^c)&=(a^b)
424	add	x26,x26,x22			// d+=h
425	eor	x19,x19,x24			// Maj(a,b,c)
426	eor	x17,x8,x17,ror#34	// Sigma0(a)
427	add	x22,x22,x19			// h+=Maj(a,b,c)
428	ldr	x19,[x30],#8		// *K++, x28 in next round
429	//add	x22,x22,x17			// h+=Sigma0(a)
430#ifndef	__AARCH64EB__
431	rev	x1,x1			// 14
432#endif
433	ldr	x6,[sp,#24]
434	add	x22,x22,x17			// h+=Sigma0(a)
435	str	x9,[sp,#16]
436	ror	x16,x26,#14
437	add	x21,x21,x19			// h+=K[i]
438	eor	x9,x26,x26,ror#23
439	and	x17,x27,x26
440	bic	x19,x20,x26
441	add	x21,x21,x1			// h+=X[i]
442	orr	x17,x17,x19			// Ch(e,f,g)
443	eor	x19,x22,x23			// a^b, b^c in next round
444	eor	x16,x16,x9,ror#18	// Sigma1(e)
445	ror	x9,x22,#28
446	add	x21,x21,x17			// h+=Ch(e,f,g)
447	eor	x17,x22,x22,ror#5
448	add	x21,x21,x16			// h+=Sigma1(e)
449	and	x28,x28,x19			// (b^c)&=(a^b)
450	add	x25,x25,x21			// d+=h
451	eor	x28,x28,x23			// Maj(a,b,c)
452	eor	x17,x9,x17,ror#34	// Sigma0(a)
453	add	x21,x21,x28			// h+=Maj(a,b,c)
454	ldr	x28,[x30],#8		// *K++, x19 in next round
455	//add	x21,x21,x17			// h+=Sigma0(a)
456#ifndef	__AARCH64EB__
457	rev	x2,x2			// 15
458#endif
// Round 15 also begins the message schedule: alongside the round itself
// it computes sigma0 of x4 and sigma1 of x1 and accumulates them, with
// x12, into x3, which is the word consumed by the first round of
// Loop_16_xx.
459	ldr	x7,[sp,#0]
460	add	x21,x21,x17			// h+=Sigma0(a)
461	str	x10,[sp,#24]
462	ror	x16,x25,#14
463	add	x20,x20,x28			// h+=K[i]
464	ror	x9,x4,#1
465	and	x17,x26,x25
466	ror	x8,x1,#19
467	bic	x28,x27,x25
468	ror	x10,x21,#28
469	add	x20,x20,x2			// h+=X[i]
470	eor	x16,x16,x25,ror#18
471	eor	x9,x9,x4,ror#8
472	orr	x17,x17,x28			// Ch(e,f,g)
473	eor	x28,x21,x22			// a^b, b^c in next round
474	eor	x16,x16,x25,ror#41	// Sigma1(e)
475	eor	x10,x10,x21,ror#34
476	add	x20,x20,x17			// h+=Ch(e,f,g)
477	and	x19,x19,x28			// (b^c)&=(a^b)
478	eor	x8,x8,x1,ror#61
479	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
480	add	x20,x20,x16			// h+=Sigma1(e)
481	eor	x19,x19,x22			// Maj(a,b,c)
482	eor	x17,x10,x21,ror#39	// Sigma0(a)
483	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
484	add	x3,x3,x12
485	add	x24,x24,x20			// d+=h
486	add	x20,x20,x19			// h+=Maj(a,b,c)
487	ldr	x19,[x30],#8		// *K++, x28 in next round
488	add	x3,x3,x9
489	add	x20,x20,x17			// h+=Sigma0(a)
490	add	x3,x3,x8
// ---------------------------------------------------------------------
// Rounds 16..79.  Each pass through Loop_16_xx performs 16 rounds; every
// round consumes the schedule word prepared by the previous round and
// prepares the next one in place (sigma0/sigma1 plus two additions),
// reloading and spilling one message word through [sp,#0..#24].  The
// loop ends when the K value fetched for the following round is the zero
// terminator that closes LK512.
// ---------------------------------------------------------------------
491Loop_16_xx:
492	ldr	x8,[sp,#8]
493	str	x11,[sp,#0]
494	ror	x16,x24,#14
495	add	x27,x27,x19			// h+=K[i]
496	ror	x10,x5,#1
497	and	x17,x25,x24
498	ror	x9,x2,#19
499	bic	x19,x26,x24
500	ror	x11,x20,#28
501	add	x27,x27,x3			// h+=X[i]
502	eor	x16,x16,x24,ror#18
503	eor	x10,x10,x5,ror#8
504	orr	x17,x17,x19			// Ch(e,f,g)
505	eor	x19,x20,x21			// a^b, b^c in next round
506	eor	x16,x16,x24,ror#41	// Sigma1(e)
507	eor	x11,x11,x20,ror#34
508	add	x27,x27,x17			// h+=Ch(e,f,g)
509	and	x28,x28,x19			// (b^c)&=(a^b)
510	eor	x9,x9,x2,ror#61
511	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
512	add	x27,x27,x16			// h+=Sigma1(e)
513	eor	x28,x28,x21			// Maj(a,b,c)
514	eor	x17,x11,x20,ror#39	// Sigma0(a)
515	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
516	add	x4,x4,x13
517	add	x23,x23,x27			// d+=h
518	add	x27,x27,x28			// h+=Maj(a,b,c)
519	ldr	x28,[x30],#8		// *K++, x19 in next round
520	add	x4,x4,x10
521	add	x27,x27,x17			// h+=Sigma0(a)
522	add	x4,x4,x9
523	ldr	x9,[sp,#16]
524	str	x12,[sp,#8]
525	ror	x16,x23,#14
526	add	x26,x26,x28			// h+=K[i]
527	ror	x11,x6,#1
528	and	x17,x24,x23
529	ror	x10,x3,#19
530	bic	x28,x25,x23
531	ror	x12,x27,#28
532	add	x26,x26,x4			// h+=X[i]
533	eor	x16,x16,x23,ror#18
534	eor	x11,x11,x6,ror#8
535	orr	x17,x17,x28			// Ch(e,f,g)
536	eor	x28,x27,x20			// a^b, b^c in next round
537	eor	x16,x16,x23,ror#41	// Sigma1(e)
538	eor	x12,x12,x27,ror#34
539	add	x26,x26,x17			// h+=Ch(e,f,g)
540	and	x19,x19,x28			// (b^c)&=(a^b)
541	eor	x10,x10,x3,ror#61
542	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
543	add	x26,x26,x16			// h+=Sigma1(e)
544	eor	x19,x19,x20			// Maj(a,b,c)
545	eor	x17,x12,x27,ror#39	// Sigma0(a)
546	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
547	add	x5,x5,x14
548	add	x22,x22,x26			// d+=h
549	add	x26,x26,x19			// h+=Maj(a,b,c)
550	ldr	x19,[x30],#8		// *K++, x28 in next round
551	add	x5,x5,x11
552	add	x26,x26,x17			// h+=Sigma0(a)
553	add	x5,x5,x10
554	ldr	x10,[sp,#24]
555	str	x13,[sp,#16]
556	ror	x16,x22,#14
557	add	x25,x25,x19			// h+=K[i]
558	ror	x12,x7,#1
559	and	x17,x23,x22
560	ror	x11,x4,#19
561	bic	x19,x24,x22
562	ror	x13,x26,#28
563	add	x25,x25,x5			// h+=X[i]
564	eor	x16,x16,x22,ror#18
565	eor	x12,x12,x7,ror#8
566	orr	x17,x17,x19			// Ch(e,f,g)
567	eor	x19,x26,x27			// a^b, b^c in next round
568	eor	x16,x16,x22,ror#41	// Sigma1(e)
569	eor	x13,x13,x26,ror#34
570	add	x25,x25,x17			// h+=Ch(e,f,g)
571	and	x28,x28,x19			// (b^c)&=(a^b)
572	eor	x11,x11,x4,ror#61
573	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
574	add	x25,x25,x16			// h+=Sigma1(e)
575	eor	x28,x28,x27			// Maj(a,b,c)
576	eor	x17,x13,x26,ror#39	// Sigma0(a)
577	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
578	add	x6,x6,x15
579	add	x21,x21,x25			// d+=h
580	add	x25,x25,x28			// h+=Maj(a,b,c)
581	ldr	x28,[x30],#8		// *K++, x19 in next round
582	add	x6,x6,x12
583	add	x25,x25,x17			// h+=Sigma0(a)
584	add	x6,x6,x11
585	ldr	x11,[sp,#0]
586	str	x14,[sp,#24]
587	ror	x16,x21,#14
588	add	x24,x24,x28			// h+=K[i]
589	ror	x13,x8,#1
590	and	x17,x22,x21
591	ror	x12,x5,#19
592	bic	x28,x23,x21
593	ror	x14,x25,#28
594	add	x24,x24,x6			// h+=X[i]
595	eor	x16,x16,x21,ror#18
596	eor	x13,x13,x8,ror#8
597	orr	x17,x17,x28			// Ch(e,f,g)
598	eor	x28,x25,x26			// a^b, b^c in next round
599	eor	x16,x16,x21,ror#41	// Sigma1(e)
600	eor	x14,x14,x25,ror#34
601	add	x24,x24,x17			// h+=Ch(e,f,g)
602	and	x19,x19,x28			// (b^c)&=(a^b)
603	eor	x12,x12,x5,ror#61
604	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
605	add	x24,x24,x16			// h+=Sigma1(e)
606	eor	x19,x19,x26			// Maj(a,b,c)
607	eor	x17,x14,x25,ror#39	// Sigma0(a)
608	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
609	add	x7,x7,x0
610	add	x20,x20,x24			// d+=h
611	add	x24,x24,x19			// h+=Maj(a,b,c)
612	ldr	x19,[x30],#8		// *K++, x28 in next round
613	add	x7,x7,x13
614	add	x24,x24,x17			// h+=Sigma0(a)
615	add	x7,x7,x12
616	ldr	x12,[sp,#8]
617	str	x15,[sp,#0]
618	ror	x16,x20,#14
619	add	x23,x23,x19			// h+=K[i]
620	ror	x14,x9,#1
621	and	x17,x21,x20
622	ror	x13,x6,#19
623	bic	x19,x22,x20
624	ror	x15,x24,#28
625	add	x23,x23,x7			// h+=X[i]
626	eor	x16,x16,x20,ror#18
627	eor	x14,x14,x9,ror#8
628	orr	x17,x17,x19			// Ch(e,f,g)
629	eor	x19,x24,x25			// a^b, b^c in next round
630	eor	x16,x16,x20,ror#41	// Sigma1(e)
631	eor	x15,x15,x24,ror#34
632	add	x23,x23,x17			// h+=Ch(e,f,g)
633	and	x28,x28,x19			// (b^c)&=(a^b)
634	eor	x13,x13,x6,ror#61
635	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
636	add	x23,x23,x16			// h+=Sigma1(e)
637	eor	x28,x28,x25			// Maj(a,b,c)
638	eor	x17,x15,x24,ror#39	// Sigma0(a)
639	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
640	add	x8,x8,x1
641	add	x27,x27,x23			// d+=h
642	add	x23,x23,x28			// h+=Maj(a,b,c)
643	ldr	x28,[x30],#8		// *K++, x19 in next round
644	add	x8,x8,x14
645	add	x23,x23,x17			// h+=Sigma0(a)
646	add	x8,x8,x13
647	ldr	x13,[sp,#16]
648	str	x0,[sp,#8]
649	ror	x16,x27,#14
650	add	x22,x22,x28			// h+=K[i]
651	ror	x15,x10,#1
652	and	x17,x20,x27
653	ror	x14,x7,#19
654	bic	x28,x21,x27
655	ror	x0,x23,#28
656	add	x22,x22,x8			// h+=X[i]
657	eor	x16,x16,x27,ror#18
658	eor	x15,x15,x10,ror#8
659	orr	x17,x17,x28			// Ch(e,f,g)
660	eor	x28,x23,x24			// a^b, b^c in next round
661	eor	x16,x16,x27,ror#41	// Sigma1(e)
662	eor	x0,x0,x23,ror#34
663	add	x22,x22,x17			// h+=Ch(e,f,g)
664	and	x19,x19,x28			// (b^c)&=(a^b)
665	eor	x14,x14,x7,ror#61
666	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
667	add	x22,x22,x16			// h+=Sigma1(e)
668	eor	x19,x19,x24			// Maj(a,b,c)
669	eor	x17,x0,x23,ror#39	// Sigma0(a)
670	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
671	add	x9,x9,x2
672	add	x26,x26,x22			// d+=h
673	add	x22,x22,x19			// h+=Maj(a,b,c)
674	ldr	x19,[x30],#8		// *K++, x28 in next round
675	add	x9,x9,x15
676	add	x22,x22,x17			// h+=Sigma0(a)
677	add	x9,x9,x14
678	ldr	x14,[sp,#24]
679	str	x1,[sp,#16]
680	ror	x16,x26,#14
681	add	x21,x21,x19			// h+=K[i]
682	ror	x0,x11,#1
683	and	x17,x27,x26
684	ror	x15,x8,#19
685	bic	x19,x20,x26
686	ror	x1,x22,#28
687	add	x21,x21,x9			// h+=X[i]
688	eor	x16,x16,x26,ror#18
689	eor	x0,x0,x11,ror#8
690	orr	x17,x17,x19			// Ch(e,f,g)
691	eor	x19,x22,x23			// a^b, b^c in next round
692	eor	x16,x16,x26,ror#41	// Sigma1(e)
693	eor	x1,x1,x22,ror#34
694	add	x21,x21,x17			// h+=Ch(e,f,g)
695	and	x28,x28,x19			// (b^c)&=(a^b)
696	eor	x15,x15,x8,ror#61
697	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
698	add	x21,x21,x16			// h+=Sigma1(e)
699	eor	x28,x28,x23			// Maj(a,b,c)
700	eor	x17,x1,x22,ror#39	// Sigma0(a)
701	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
702	add	x10,x10,x3
703	add	x25,x25,x21			// d+=h
704	add	x21,x21,x28			// h+=Maj(a,b,c)
705	ldr	x28,[x30],#8		// *K++, x19 in next round
706	add	x10,x10,x0
707	add	x21,x21,x17			// h+=Sigma0(a)
708	add	x10,x10,x15
709	ldr	x15,[sp,#0]
710	str	x2,[sp,#24]
711	ror	x16,x25,#14
712	add	x20,x20,x28			// h+=K[i]
713	ror	x1,x12,#1
714	and	x17,x26,x25
715	ror	x0,x9,#19
716	bic	x28,x27,x25
717	ror	x2,x21,#28
718	add	x20,x20,x10			// h+=X[i]
719	eor	x16,x16,x25,ror#18
720	eor	x1,x1,x12,ror#8
721	orr	x17,x17,x28			// Ch(e,f,g)
722	eor	x28,x21,x22			// a^b, b^c in next round
723	eor	x16,x16,x25,ror#41	// Sigma1(e)
724	eor	x2,x2,x21,ror#34
725	add	x20,x20,x17			// h+=Ch(e,f,g)
726	and	x19,x19,x28			// (b^c)&=(a^b)
727	eor	x0,x0,x9,ror#61
728	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
729	add	x20,x20,x16			// h+=Sigma1(e)
730	eor	x19,x19,x22			// Maj(a,b,c)
731	eor	x17,x2,x21,ror#39	// Sigma0(a)
732	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
733	add	x11,x11,x4
734	add	x24,x24,x20			// d+=h
735	add	x20,x20,x19			// h+=Maj(a,b,c)
736	ldr	x19,[x30],#8		// *K++, x28 in next round
737	add	x11,x11,x1
738	add	x20,x20,x17			// h+=Sigma0(a)
739	add	x11,x11,x0
740	ldr	x0,[sp,#8]
741	str	x3,[sp,#0]
742	ror	x16,x24,#14
743	add	x27,x27,x19			// h+=K[i]
744	ror	x2,x13,#1
745	and	x17,x25,x24
746	ror	x1,x10,#19
747	bic	x19,x26,x24
748	ror	x3,x20,#28
749	add	x27,x27,x11			// h+=X[i]
750	eor	x16,x16,x24,ror#18
751	eor	x2,x2,x13,ror#8
752	orr	x17,x17,x19			// Ch(e,f,g)
753	eor	x19,x20,x21			// a^b, b^c in next round
754	eor	x16,x16,x24,ror#41	// Sigma1(e)
755	eor	x3,x3,x20,ror#34
756	add	x27,x27,x17			// h+=Ch(e,f,g)
757	and	x28,x28,x19			// (b^c)&=(a^b)
758	eor	x1,x1,x10,ror#61
759	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
760	add	x27,x27,x16			// h+=Sigma1(e)
761	eor	x28,x28,x21			// Maj(a,b,c)
762	eor	x17,x3,x20,ror#39	// Sigma0(a)
763	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
764	add	x12,x12,x5
765	add	x23,x23,x27			// d+=h
766	add	x27,x27,x28			// h+=Maj(a,b,c)
767	ldr	x28,[x30],#8		// *K++, x19 in next round
768	add	x12,x12,x2
769	add	x27,x27,x17			// h+=Sigma0(a)
770	add	x12,x12,x1
771	ldr	x1,[sp,#16]
772	str	x4,[sp,#8]
773	ror	x16,x23,#14
774	add	x26,x26,x28			// h+=K[i]
775	ror	x3,x14,#1
776	and	x17,x24,x23
777	ror	x2,x11,#19
778	bic	x28,x25,x23
779	ror	x4,x27,#28
780	add	x26,x26,x12			// h+=X[i]
781	eor	x16,x16,x23,ror#18
782	eor	x3,x3,x14,ror#8
783	orr	x17,x17,x28			// Ch(e,f,g)
784	eor	x28,x27,x20			// a^b, b^c in next round
785	eor	x16,x16,x23,ror#41	// Sigma1(e)
786	eor	x4,x4,x27,ror#34
787	add	x26,x26,x17			// h+=Ch(e,f,g)
788	and	x19,x19,x28			// (b^c)&=(a^b)
789	eor	x2,x2,x11,ror#61
790	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
791	add	x26,x26,x16			// h+=Sigma1(e)
792	eor	x19,x19,x20			// Maj(a,b,c)
793	eor	x17,x4,x27,ror#39	// Sigma0(a)
794	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
795	add	x13,x13,x6
796	add	x22,x22,x26			// d+=h
797	add	x26,x26,x19			// h+=Maj(a,b,c)
798	ldr	x19,[x30],#8		// *K++, x28 in next round
799	add	x13,x13,x3
800	add	x26,x26,x17			// h+=Sigma0(a)
801	add	x13,x13,x2
802	ldr	x2,[sp,#24]
803	str	x5,[sp,#16]
804	ror	x16,x22,#14
805	add	x25,x25,x19			// h+=K[i]
806	ror	x4,x15,#1
807	and	x17,x23,x22
808	ror	x3,x12,#19
809	bic	x19,x24,x22
810	ror	x5,x26,#28
811	add	x25,x25,x13			// h+=X[i]
812	eor	x16,x16,x22,ror#18
813	eor	x4,x4,x15,ror#8
814	orr	x17,x17,x19			// Ch(e,f,g)
815	eor	x19,x26,x27			// a^b, b^c in next round
816	eor	x16,x16,x22,ror#41	// Sigma1(e)
817	eor	x5,x5,x26,ror#34
818	add	x25,x25,x17			// h+=Ch(e,f,g)
819	and	x28,x28,x19			// (b^c)&=(a^b)
820	eor	x3,x3,x12,ror#61
821	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
822	add	x25,x25,x16			// h+=Sigma1(e)
823	eor	x28,x28,x27			// Maj(a,b,c)
824	eor	x17,x5,x26,ror#39	// Sigma0(a)
825	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
826	add	x14,x14,x7
827	add	x21,x21,x25			// d+=h
828	add	x25,x25,x28			// h+=Maj(a,b,c)
829	ldr	x28,[x30],#8		// *K++, x19 in next round
830	add	x14,x14,x4
831	add	x25,x25,x17			// h+=Sigma0(a)
832	add	x14,x14,x3
833	ldr	x3,[sp,#0]
834	str	x6,[sp,#24]
835	ror	x16,x21,#14
836	add	x24,x24,x28			// h+=K[i]
837	ror	x5,x0,#1
838	and	x17,x22,x21
839	ror	x4,x13,#19
840	bic	x28,x23,x21
841	ror	x6,x25,#28
842	add	x24,x24,x14			// h+=X[i]
843	eor	x16,x16,x21,ror#18
844	eor	x5,x5,x0,ror#8
845	orr	x17,x17,x28			// Ch(e,f,g)
846	eor	x28,x25,x26			// a^b, b^c in next round
847	eor	x16,x16,x21,ror#41	// Sigma1(e)
848	eor	x6,x6,x25,ror#34
849	add	x24,x24,x17			// h+=Ch(e,f,g)
850	and	x19,x19,x28			// (b^c)&=(a^b)
851	eor	x4,x4,x13,ror#61
852	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
853	add	x24,x24,x16			// h+=Sigma1(e)
854	eor	x19,x19,x26			// Maj(a,b,c)
855	eor	x17,x6,x25,ror#39	// Sigma0(a)
856	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
857	add	x15,x15,x8
858	add	x20,x20,x24			// d+=h
859	add	x24,x24,x19			// h+=Maj(a,b,c)
860	ldr	x19,[x30],#8		// *K++, x28 in next round
861	add	x15,x15,x5
862	add	x24,x24,x17			// h+=Sigma0(a)
863	add	x15,x15,x4
864	ldr	x4,[sp,#8]
865	str	x7,[sp,#0]
866	ror	x16,x20,#14
867	add	x23,x23,x19			// h+=K[i]
868	ror	x6,x1,#1
869	and	x17,x21,x20
870	ror	x5,x14,#19
871	bic	x19,x22,x20
872	ror	x7,x24,#28
873	add	x23,x23,x15			// h+=X[i]
874	eor	x16,x16,x20,ror#18
875	eor	x6,x6,x1,ror#8
876	orr	x17,x17,x19			// Ch(e,f,g)
877	eor	x19,x24,x25			// a^b, b^c in next round
878	eor	x16,x16,x20,ror#41	// Sigma1(e)
879	eor	x7,x7,x24,ror#34
880	add	x23,x23,x17			// h+=Ch(e,f,g)
881	and	x28,x28,x19			// (b^c)&=(a^b)
882	eor	x5,x5,x14,ror#61
883	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
884	add	x23,x23,x16			// h+=Sigma1(e)
885	eor	x28,x28,x25			// Maj(a,b,c)
886	eor	x17,x7,x24,ror#39	// Sigma0(a)
887	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
888	add	x0,x0,x9
889	add	x27,x27,x23			// d+=h
890	add	x23,x23,x28			// h+=Maj(a,b,c)
891	ldr	x28,[x30],#8		// *K++, x19 in next round
892	add	x0,x0,x6
893	add	x23,x23,x17			// h+=Sigma0(a)
894	add	x0,x0,x5
895	ldr	x5,[sp,#16]
896	str	x8,[sp,#8]
897	ror	x16,x27,#14
898	add	x22,x22,x28			// h+=K[i]
899	ror	x7,x2,#1
900	and	x17,x20,x27
901	ror	x6,x15,#19
902	bic	x28,x21,x27
903	ror	x8,x23,#28
904	add	x22,x22,x0			// h+=X[i]
905	eor	x16,x16,x27,ror#18
906	eor	x7,x7,x2,ror#8
907	orr	x17,x17,x28			// Ch(e,f,g)
908	eor	x28,x23,x24			// a^b, b^c in next round
909	eor	x16,x16,x27,ror#41	// Sigma1(e)
910	eor	x8,x8,x23,ror#34
911	add	x22,x22,x17			// h+=Ch(e,f,g)
912	and	x19,x19,x28			// (b^c)&=(a^b)
913	eor	x6,x6,x15,ror#61
914	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
915	add	x22,x22,x16			// h+=Sigma1(e)
916	eor	x19,x19,x24			// Maj(a,b,c)
917	eor	x17,x8,x23,ror#39	// Sigma0(a)
918	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
919	add	x1,x1,x10
920	add	x26,x26,x22			// d+=h
921	add	x22,x22,x19			// h+=Maj(a,b,c)
922	ldr	x19,[x30],#8		// *K++, x28 in next round
923	add	x1,x1,x7
924	add	x22,x22,x17			// h+=Sigma0(a)
925	add	x1,x1,x6
926	ldr	x6,[sp,#24]
927	str	x9,[sp,#16]
928	ror	x16,x26,#14
929	add	x21,x21,x19			// h+=K[i]
930	ror	x8,x3,#1
931	and	x17,x27,x26
932	ror	x7,x0,#19
933	bic	x19,x20,x26
934	ror	x9,x22,#28
935	add	x21,x21,x1			// h+=X[i]
936	eor	x16,x16,x26,ror#18
937	eor	x8,x8,x3,ror#8
938	orr	x17,x17,x19			// Ch(e,f,g)
939	eor	x19,x22,x23			// a^b, b^c in next round
940	eor	x16,x16,x26,ror#41	// Sigma1(e)
941	eor	x9,x9,x22,ror#34
942	add	x21,x21,x17			// h+=Ch(e,f,g)
943	and	x28,x28,x19			// (b^c)&=(a^b)
944	eor	x7,x7,x0,ror#61
945	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
946	add	x21,x21,x16			// h+=Sigma1(e)
947	eor	x28,x28,x23			// Maj(a,b,c)
948	eor	x17,x9,x22,ror#39	// Sigma0(a)
949	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
950	add	x2,x2,x11
951	add	x25,x25,x21			// d+=h
952	add	x21,x21,x28			// h+=Maj(a,b,c)
953	ldr	x28,[x30],#8		// *K++, x19 in next round
954	add	x2,x2,x8
955	add	x21,x21,x17			// h+=Sigma0(a)
956	add	x2,x2,x7
957	ldr	x7,[sp,#0]
958	str	x10,[sp,#24]
959	ror	x16,x25,#14
960	add	x20,x20,x28			// h+=K[i]
961	ror	x9,x4,#1
962	and	x17,x26,x25
963	ror	x8,x1,#19
964	bic	x28,x27,x25
965	ror	x10,x21,#28
966	add	x20,x20,x2			// h+=X[i]
967	eor	x16,x16,x25,ror#18
968	eor	x9,x9,x4,ror#8
969	orr	x17,x17,x28			// Ch(e,f,g)
970	eor	x28,x21,x22			// a^b, b^c in next round
971	eor	x16,x16,x25,ror#41	// Sigma1(e)
972	eor	x10,x10,x21,ror#34
973	add	x20,x20,x17			// h+=Ch(e,f,g)
974	and	x19,x19,x28			// (b^c)&=(a^b)
975	eor	x8,x8,x1,ror#61
976	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
977	add	x20,x20,x16			// h+=Sigma1(e)
978	eor	x19,x19,x22			// Maj(a,b,c)
979	eor	x17,x10,x21,ror#39	// Sigma0(a)
980	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
981	add	x3,x3,x12
982	add	x24,x24,x20			// d+=h
983	add	x20,x20,x19			// h+=Maj(a,b,c)
984	ldr	x19,[x30],#8		// *K++, x28 in next round
985	add	x3,x3,x9
986	add	x20,x20,x17			// h+=Sigma0(a)
987	add	x3,x3,x8
// x19 now holds the K word for the next round; zero means the terminator
// after the 80th constant was read, i.e. all rounds of the block are done.
988	cbnz	x19,Loop_16_xx
989
// ---------------------------------------------------------------------
// End of block: recover the state pointer, end pointer and input pointer
// from the frame, add the working variables into the stored state, and
// loop while input remains.
// ---------------------------------------------------------------------
990	ldp	x0,x2,[x29,#96]
991	ldr	x1,[x29,#112]
// 648 = 81*8: 80 constants plus the terminator were consumed via x30.
992	sub	x30,x30,#648		// rewind
993
994	ldp	x3,x4,[x0]
995	ldp	x5,x6,[x0,#2*8]
// The saved x1 was taken after the first 16 bytes of the block had been
// read, so 14*8 more bytes reach the start of the next 128-byte block.
996	add	x1,x1,#14*8			// advance input pointer
997	ldp	x7,x8,[x0,#4*8]
998	add	x20,x20,x3
999	ldp	x9,x10,[x0,#6*8]
1000	add	x21,x21,x4
1001	add	x22,x22,x5
1002	add	x23,x23,x6
1003	stp	x20,x21,[x0]
1004	add	x24,x24,x7
1005	add	x25,x25,x8
1006	stp	x22,x23,[x0,#2*8]
1007	add	x26,x26,x9
1008	add	x27,x27,x10
// Flags from this compare are consumed by b.ne below; the two stores in
// between do not modify them.
1009	cmp	x1,x2
1010	stp	x24,x25,[x0,#4*8]
1011	stp	x26,x27,[x0,#6*8]
1012	b.ne	Loop
1013
// Epilogue: restore x19-x28 relative to x29, release the 32-byte spill
// area, then pop the 128-byte frame together with x29/x30.
1014	ldp	x19,x20,[x29,#16]
1015	add	sp,sp,#4*8
1016	ldp	x21,x22,[x29,#32]
1017	ldp	x23,x24,[x29,#48]
1018	ldp	x25,x26,[x29,#64]
1019	ldp	x27,x28,[x29,#80]
1020	ldp	x29,x30,[sp],#128
// Counterpart of AARCH64_SIGN_LINK_REGISTER at entry (macro defined
// outside this file - confirm exact behaviour in arm_arch.h).
1021	AARCH64_VALIDATE_LINK_REGISTER
1022	ret
1023
1024
1025.section	.rodata
1026.align	6
1027
// LK512: read-only table of 80 64-bit round constants (two per .quad line,
// 40 lines), followed by one zero quad.
// sha512_block_data_order_hw takes the address of this table with
// adrp/add :lo12:, reads it 16 bytes at a time and rewinds by 80*8 bytes,
// so it never reads the zero quad.
1028LK512:
1029.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1030.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1031.quad	0x3956c25bf348b538,0x59f111f1b605d019
1032.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1033.quad	0xd807aa98a3030242,0x12835b0145706fbe
1034.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1035.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1036.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1037.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1038.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1039.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1040.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1041.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1042.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1043.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1044.quad	0x06ca6351e003826f,0x142929670a0e6e70
1045.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1046.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1047.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1048.quad	0x81c2c92e47edaee6,0x92722c851482353b
1049.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1050.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1051.quad	0xd192e819d6ef5218,0xd69906245565a910
1052.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1053.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1054.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1055.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1056.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1057.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1058.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1059.quad	0x90befffa23631e28,0xa4506cebde82bde9
1060.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1061.quad	0xca273eceea26619c,0xd186b8c721c0c207
1062.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1063.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1064.quad	0x113f9804bef90dae,0x1b710b35131c471b
1065.quad	0x28db77f523047d84,0x32caab7b40c72493
1066.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1067.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1068.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
// Zero entry after the 80 constants. The scalar round loop earlier in this
// file leaves its loop when the quad it loads is zero (ldr x19,[x30],#8 then
// cbnz x19) and rewinds its pointer by 648 = 81*8 bytes, which matches
// 80 constants plus this entry. NOTE(review): the setup of x30 is outside
// the lines reviewed here - presumably it points at LK512; confirm.
1069.quad	0	// terminator
1070
// The bytes below are the NUL-terminated ASCII string
// "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>".
1071.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1072.align	2
1073.align	2
1074.text
1075#ifndef	__KERNEL__
1076.globl	sha512_block_data_order_hw
1077
// COFF symbol record for the entry point; type 32 marks it as a function.
1078.def sha512_block_data_order_hw
1079   .type 32
1080.endef
1081.align	6
// sha512_block_data_order_hw
// SHA-512 block routine built on the sha512h/sha512h2/sha512su0/sha512su1
// instructions, which Loop_hw emits as raw .long words.
// Registers as used by the code in this routine:
//   x0      address of the hash state: 64 bytes are loaded into v0-v3 here
//           and stored back from v0-v3 once the loop has finished.
//   x1      input pointer: the first 128 bytes are loaded into v16-v23 here
//           and x1 is advanced past them.
//   x2      number of 128-byte blocks. Loop_hw decrements it before testing
//           it, so it has to be non-zero on entry.
//   x3, x4  scratch: x3 walks LK512, x4 holds x1 minus 128.
// Vector registers written: v0-v7 and v16-v29; v8-v15 are not touched.
1082sha512_block_data_order_hw:
1083	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1084	AARCH64_VALID_CALL_TARGET
1085	stp	x29,x30,[sp,#-16]!
1086	add	x29,sp,#0
1087
1088	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
1089	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1090
1091	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
	// x3 = address of LK512 (page address plus low 12 bits).
1092	adrp	x3,LK512
1093	add	x3,x3,:lo12:LK512
1094
	// Reverse the byte order inside every 64-bit lane of the 128 input bytes.
1095	rev64	v16.16b,v16.16b
1096	rev64	v17.16b,v17.16b
1097	rev64	v18.16b,v18.16b
1098	rev64	v19.16b,v19.16b
1099	rev64	v20.16b,v20.16b
1100	rev64	v21.16b,v21.16b
1101	rev64	v22.16b,v22.16b
1102	rev64	v23.16b,v23.16b
	// Branch over the alignment padding that precedes Loop_hw.
1103	b	Loop_hw
1104
// Loop_hw: one pass per 128-byte input block.
//   * v0-v3 hold the running state. v26-v29 keep a copy taken at the top of
//     the pass ("offload"), which is added back at the bottom ("accumulate").
//   * x2 is decremented before it is tested, so it must be non-zero when the
//     loop is entered. On the pass where it reaches zero, csel moves x1 back
//     by 128, so the "load next input" loads near the bottom re-read the
//     block just processed instead of the 128 bytes that follow it.
//   * x3 reads the constant table in 16-byte steps: 40 loads per pass,
//     undone by the "rewind" of 80*8 bytes.
//   * A pass is 40 steps, each ending in a sha512h/sha512h2 pair. The first
//     32 steps also run sha512su0/sha512su1 on v16-v23; the last 8 instead
//     reload v16-v23 from x1 and byte-swap them with rev64.
//   * ext vN,vN,vN,#8 swaps the two 64-bit halves of the constant+input sum
//     before it is added to the state.
//   * The SHA-512 instructions are emitted as .long words with the mnemonic
//     in the trailing comment (presumably so the file assembles with tools
//     that lack these mnemonics - confirm against the generating script).
1105.align	4
1106Loop_hw:
1107	ld1	{v24.2d},[x3],#16
1108	subs	x2,x2,#1
1109	sub	x4,x1,#128
1110	orr	v26.16b,v0.16b,v0.16b			// offload
1111	orr	v27.16b,v1.16b,v1.16b
1112	orr	v28.16b,v2.16b,v2.16b
1113	orr	v29.16b,v3.16b,v3.16b
1114	csel	x1,x1,x4,ne			// conditional rewind
1115	add	v24.2d,v24.2d,v16.2d
1116	ld1	{v25.2d},[x3],#16
1117	ext	v24.16b,v24.16b,v24.16b,#8
1118	ext	v5.16b,v2.16b,v3.16b,#8
1119	ext	v6.16b,v1.16b,v2.16b,#8
1120	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1121.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1122	ext	v7.16b,v20.16b,v21.16b,#8
1123.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1124.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1125	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1126.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1127	add	v25.2d,v25.2d,v17.2d
1128	ld1	{v24.2d},[x3],#16
1129	ext	v25.16b,v25.16b,v25.16b,#8
1130	ext	v5.16b,v4.16b,v2.16b,#8
1131	ext	v6.16b,v0.16b,v4.16b,#8
1132	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1133.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1134	ext	v7.16b,v21.16b,v22.16b,#8
1135.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1136.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1137	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1138.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1139	add	v24.2d,v24.2d,v18.2d
1140	ld1	{v25.2d},[x3],#16
1141	ext	v24.16b,v24.16b,v24.16b,#8
1142	ext	v5.16b,v1.16b,v4.16b,#8
1143	ext	v6.16b,v3.16b,v1.16b,#8
1144	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1145.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1146	ext	v7.16b,v22.16b,v23.16b,#8
1147.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1148.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1149	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1150.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1151	add	v25.2d,v25.2d,v19.2d
1152	ld1	{v24.2d},[x3],#16
1153	ext	v25.16b,v25.16b,v25.16b,#8
1154	ext	v5.16b,v0.16b,v1.16b,#8
1155	ext	v6.16b,v2.16b,v0.16b,#8
1156	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1157.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1158	ext	v7.16b,v23.16b,v16.16b,#8
1159.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1160.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1161	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1162.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1163	add	v24.2d,v24.2d,v20.2d
1164	ld1	{v25.2d},[x3],#16
1165	ext	v24.16b,v24.16b,v24.16b,#8
1166	ext	v5.16b,v3.16b,v0.16b,#8
1167	ext	v6.16b,v4.16b,v3.16b,#8
1168	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1169.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1170	ext	v7.16b,v16.16b,v17.16b,#8
1171.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1172.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1173	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1174.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1175	add	v25.2d,v25.2d,v21.2d
1176	ld1	{v24.2d},[x3],#16
1177	ext	v25.16b,v25.16b,v25.16b,#8
1178	ext	v5.16b,v2.16b,v3.16b,#8
1179	ext	v6.16b,v1.16b,v2.16b,#8
1180	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1181.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1182	ext	v7.16b,v17.16b,v18.16b,#8
1183.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1184.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1185	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1186.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1187	add	v24.2d,v24.2d,v22.2d
1188	ld1	{v25.2d},[x3],#16
1189	ext	v24.16b,v24.16b,v24.16b,#8
1190	ext	v5.16b,v4.16b,v2.16b,#8
1191	ext	v6.16b,v0.16b,v4.16b,#8
1192	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1193.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1194	ext	v7.16b,v18.16b,v19.16b,#8
1195.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1196.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1197	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1198.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1199	add	v25.2d,v25.2d,v23.2d
1200	ld1	{v24.2d},[x3],#16
1201	ext	v25.16b,v25.16b,v25.16b,#8
1202	ext	v5.16b,v1.16b,v4.16b,#8
1203	ext	v6.16b,v3.16b,v1.16b,#8
1204	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1205.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1206	ext	v7.16b,v19.16b,v20.16b,#8
1207.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1208.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1209	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1210.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1211	add	v24.2d,v24.2d,v16.2d
1212	ld1	{v25.2d},[x3],#16
1213	ext	v24.16b,v24.16b,v24.16b,#8
1214	ext	v5.16b,v0.16b,v1.16b,#8
1215	ext	v6.16b,v2.16b,v0.16b,#8
1216	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1217.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1218	ext	v7.16b,v20.16b,v21.16b,#8
1219.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1220.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1221	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1222.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1223	add	v25.2d,v25.2d,v17.2d
1224	ld1	{v24.2d},[x3],#16
1225	ext	v25.16b,v25.16b,v25.16b,#8
1226	ext	v5.16b,v3.16b,v0.16b,#8
1227	ext	v6.16b,v4.16b,v3.16b,#8
1228	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1229.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1230	ext	v7.16b,v21.16b,v22.16b,#8
1231.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1232.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1233	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1234.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1235	add	v24.2d,v24.2d,v18.2d
1236	ld1	{v25.2d},[x3],#16
1237	ext	v24.16b,v24.16b,v24.16b,#8
1238	ext	v5.16b,v2.16b,v3.16b,#8
1239	ext	v6.16b,v1.16b,v2.16b,#8
1240	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1241.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1242	ext	v7.16b,v22.16b,v23.16b,#8
1243.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1244.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1245	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1246.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1247	add	v25.2d,v25.2d,v19.2d
1248	ld1	{v24.2d},[x3],#16
1249	ext	v25.16b,v25.16b,v25.16b,#8
1250	ext	v5.16b,v4.16b,v2.16b,#8
1251	ext	v6.16b,v0.16b,v4.16b,#8
1252	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1253.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1254	ext	v7.16b,v23.16b,v16.16b,#8
1255.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1256.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1257	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1258.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1259	add	v24.2d,v24.2d,v20.2d
1260	ld1	{v25.2d},[x3],#16
1261	ext	v24.16b,v24.16b,v24.16b,#8
1262	ext	v5.16b,v1.16b,v4.16b,#8
1263	ext	v6.16b,v3.16b,v1.16b,#8
1264	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1265.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1266	ext	v7.16b,v16.16b,v17.16b,#8
1267.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1268.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1269	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1270.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1271	add	v25.2d,v25.2d,v21.2d
1272	ld1	{v24.2d},[x3],#16
1273	ext	v25.16b,v25.16b,v25.16b,#8
1274	ext	v5.16b,v0.16b,v1.16b,#8
1275	ext	v6.16b,v2.16b,v0.16b,#8
1276	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1277.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1278	ext	v7.16b,v17.16b,v18.16b,#8
1279.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1280.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1281	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1282.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1283	add	v24.2d,v24.2d,v22.2d
1284	ld1	{v25.2d},[x3],#16
1285	ext	v24.16b,v24.16b,v24.16b,#8
1286	ext	v5.16b,v3.16b,v0.16b,#8
1287	ext	v6.16b,v4.16b,v3.16b,#8
1288	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1289.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1290	ext	v7.16b,v18.16b,v19.16b,#8
1291.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1292.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1293	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1294.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1295	add	v25.2d,v25.2d,v23.2d
1296	ld1	{v24.2d},[x3],#16
1297	ext	v25.16b,v25.16b,v25.16b,#8
1298	ext	v5.16b,v2.16b,v3.16b,#8
1299	ext	v6.16b,v1.16b,v2.16b,#8
1300	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1301.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1302	ext	v7.16b,v19.16b,v20.16b,#8
1303.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1304.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1305	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1306.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1307	add	v24.2d,v24.2d,v16.2d
1308	ld1	{v25.2d},[x3],#16
1309	ext	v24.16b,v24.16b,v24.16b,#8
1310	ext	v5.16b,v4.16b,v2.16b,#8
1311	ext	v6.16b,v0.16b,v4.16b,#8
1312	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1313.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1314	ext	v7.16b,v20.16b,v21.16b,#8
1315.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1316.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1317	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1318.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1319	add	v25.2d,v25.2d,v17.2d
1320	ld1	{v24.2d},[x3],#16
1321	ext	v25.16b,v25.16b,v25.16b,#8
1322	ext	v5.16b,v1.16b,v4.16b,#8
1323	ext	v6.16b,v3.16b,v1.16b,#8
1324	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1325.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1326	ext	v7.16b,v21.16b,v22.16b,#8
1327.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1328.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1329	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1330.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1331	add	v24.2d,v24.2d,v18.2d
1332	ld1	{v25.2d},[x3],#16
1333	ext	v24.16b,v24.16b,v24.16b,#8
1334	ext	v5.16b,v0.16b,v1.16b,#8
1335	ext	v6.16b,v2.16b,v0.16b,#8
1336	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1337.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1338	ext	v7.16b,v22.16b,v23.16b,#8
1339.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1340.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1341	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1342.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1343	add	v25.2d,v25.2d,v19.2d
1344	ld1	{v24.2d},[x3],#16
1345	ext	v25.16b,v25.16b,v25.16b,#8
1346	ext	v5.16b,v3.16b,v0.16b,#8
1347	ext	v6.16b,v4.16b,v3.16b,#8
1348	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1349.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1350	ext	v7.16b,v23.16b,v16.16b,#8
1351.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1352.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1353	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1354.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1355	add	v24.2d,v24.2d,v20.2d
1356	ld1	{v25.2d},[x3],#16
1357	ext	v24.16b,v24.16b,v24.16b,#8
1358	ext	v5.16b,v2.16b,v3.16b,#8
1359	ext	v6.16b,v1.16b,v2.16b,#8
1360	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1361.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1362	ext	v7.16b,v16.16b,v17.16b,#8
1363.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1364.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1365	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1366.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1367	add	v25.2d,v25.2d,v21.2d
1368	ld1	{v24.2d},[x3],#16
1369	ext	v25.16b,v25.16b,v25.16b,#8
1370	ext	v5.16b,v4.16b,v2.16b,#8
1371	ext	v6.16b,v0.16b,v4.16b,#8
1372	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1373.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1374	ext	v7.16b,v17.16b,v18.16b,#8
1375.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1376.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1377	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1378.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1379	add	v24.2d,v24.2d,v22.2d
1380	ld1	{v25.2d},[x3],#16
1381	ext	v24.16b,v24.16b,v24.16b,#8
1382	ext	v5.16b,v1.16b,v4.16b,#8
1383	ext	v6.16b,v3.16b,v1.16b,#8
1384	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1385.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1386	ext	v7.16b,v18.16b,v19.16b,#8
1387.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1388.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1389	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1390.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1391	add	v25.2d,v25.2d,v23.2d
1392	ld1	{v24.2d},[x3],#16
1393	ext	v25.16b,v25.16b,v25.16b,#8
1394	ext	v5.16b,v0.16b,v1.16b,#8
1395	ext	v6.16b,v2.16b,v0.16b,#8
1396	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1397.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1398	ext	v7.16b,v19.16b,v20.16b,#8
1399.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1400.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1401	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1402.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1403	add	v24.2d,v24.2d,v16.2d
1404	ld1	{v25.2d},[x3],#16
1405	ext	v24.16b,v24.16b,v24.16b,#8
1406	ext	v5.16b,v3.16b,v0.16b,#8
1407	ext	v6.16b,v4.16b,v3.16b,#8
1408	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1409.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1410	ext	v7.16b,v20.16b,v21.16b,#8
1411.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1412.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1413	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1414.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1415	add	v25.2d,v25.2d,v17.2d
1416	ld1	{v24.2d},[x3],#16
1417	ext	v25.16b,v25.16b,v25.16b,#8
1418	ext	v5.16b,v2.16b,v3.16b,#8
1419	ext	v6.16b,v1.16b,v2.16b,#8
1420	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1421.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1422	ext	v7.16b,v21.16b,v22.16b,#8
1423.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1424.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1425	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1426.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1427	add	v24.2d,v24.2d,v18.2d
1428	ld1	{v25.2d},[x3],#16
1429	ext	v24.16b,v24.16b,v24.16b,#8
1430	ext	v5.16b,v4.16b,v2.16b,#8
1431	ext	v6.16b,v0.16b,v4.16b,#8
1432	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1433.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1434	ext	v7.16b,v22.16b,v23.16b,#8
1435.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1436.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1437	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1438.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1439	add	v25.2d,v25.2d,v19.2d
1440	ld1	{v24.2d},[x3],#16
1441	ext	v25.16b,v25.16b,v25.16b,#8
1442	ext	v5.16b,v1.16b,v4.16b,#8
1443	ext	v6.16b,v3.16b,v1.16b,#8
1444	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1445.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1446	ext	v7.16b,v23.16b,v16.16b,#8
1447.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1448.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1449	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1450.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1451	add	v24.2d,v24.2d,v20.2d
1452	ld1	{v25.2d},[x3],#16
1453	ext	v24.16b,v24.16b,v24.16b,#8
1454	ext	v5.16b,v0.16b,v1.16b,#8
1455	ext	v6.16b,v2.16b,v0.16b,#8
1456	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1457.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1458	ext	v7.16b,v16.16b,v17.16b,#8
1459.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1460.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1461	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1462.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1463	add	v25.2d,v25.2d,v21.2d
1464	ld1	{v24.2d},[x3],#16
1465	ext	v25.16b,v25.16b,v25.16b,#8
1466	ext	v5.16b,v3.16b,v0.16b,#8
1467	ext	v6.16b,v4.16b,v3.16b,#8
1468	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1469.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1470	ext	v7.16b,v17.16b,v18.16b,#8
1471.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1472.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1473	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1474.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1475	add	v24.2d,v24.2d,v22.2d
1476	ld1	{v25.2d},[x3],#16
1477	ext	v24.16b,v24.16b,v24.16b,#8
1478	ext	v5.16b,v2.16b,v3.16b,#8
1479	ext	v6.16b,v1.16b,v2.16b,#8
1480	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1481.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1482	ext	v7.16b,v18.16b,v19.16b,#8
1483.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1484.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1485	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1486.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1487	add	v25.2d,v25.2d,v23.2d
1488	ld1	{v24.2d},[x3],#16
1489	ext	v25.16b,v25.16b,v25.16b,#8
1490	ext	v5.16b,v4.16b,v2.16b,#8
1491	ext	v6.16b,v0.16b,v4.16b,#8
1492	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1493.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1494	ext	v7.16b,v19.16b,v20.16b,#8
1495.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1496.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1497	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1498.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
	// Last 8 steps: no sha512su0/sha512su1. Each step consumes one of
	// v16-v23 and then overwrites it with 16 bytes loaded from x1.
1499	ld1	{v25.2d},[x3],#16
1500	add	v24.2d,v24.2d,v16.2d
1501	ld1	{v16.16b},[x1],#16		// load next input
1502	ext	v24.16b,v24.16b,v24.16b,#8
1503	ext	v5.16b,v1.16b,v4.16b,#8
1504	ext	v6.16b,v3.16b,v1.16b,#8
1505	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1506.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1507	rev64	v16.16b,v16.16b
1508	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1509.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1510	ld1	{v24.2d},[x3],#16
1511	add	v25.2d,v25.2d,v17.2d
1512	ld1	{v17.16b},[x1],#16		// load next input
1513	ext	v25.16b,v25.16b,v25.16b,#8
1514	ext	v5.16b,v0.16b,v1.16b,#8
1515	ext	v6.16b,v2.16b,v0.16b,#8
1516	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1517.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1518	rev64	v17.16b,v17.16b
1519	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1520.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1521	ld1	{v25.2d},[x3],#16
1522	add	v24.2d,v24.2d,v18.2d
1523	ld1	{v18.16b},[x1],#16		// load next input
1524	ext	v24.16b,v24.16b,v24.16b,#8
1525	ext	v5.16b,v3.16b,v0.16b,#8
1526	ext	v6.16b,v4.16b,v3.16b,#8
1527	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1528.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1529	rev64	v18.16b,v18.16b
1530	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1531.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1532	ld1	{v24.2d},[x3],#16
1533	add	v25.2d,v25.2d,v19.2d
1534	ld1	{v19.16b},[x1],#16		// load next input
1535	ext	v25.16b,v25.16b,v25.16b,#8
1536	ext	v5.16b,v2.16b,v3.16b,#8
1537	ext	v6.16b,v1.16b,v2.16b,#8
1538	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1539.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1540	rev64	v19.16b,v19.16b
1541	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1542.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1543	ld1	{v25.2d},[x3],#16
1544	add	v24.2d,v24.2d,v20.2d
1545	ld1	{v20.16b},[x1],#16		// load next input
1546	ext	v24.16b,v24.16b,v24.16b,#8
1547	ext	v5.16b,v4.16b,v2.16b,#8
1548	ext	v6.16b,v0.16b,v4.16b,#8
1549	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1550.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1551	rev64	v20.16b,v20.16b
1552	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1553.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1554	ld1	{v24.2d},[x3],#16
1555	add	v25.2d,v25.2d,v21.2d
1556	ld1	{v21.16b},[x1],#16		// load next input
1557	ext	v25.16b,v25.16b,v25.16b,#8
1558	ext	v5.16b,v1.16b,v4.16b,#8
1559	ext	v6.16b,v3.16b,v1.16b,#8
1560	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1561.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1562	rev64	v21.16b,v21.16b
1563	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1564.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1565	ld1	{v25.2d},[x3],#16
1566	add	v24.2d,v24.2d,v22.2d
1567	ld1	{v22.16b},[x1],#16		// load next input
1568	ext	v24.16b,v24.16b,v24.16b,#8
1569	ext	v5.16b,v0.16b,v1.16b,#8
1570	ext	v6.16b,v2.16b,v0.16b,#8
1571	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1572.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1573	rev64	v22.16b,v22.16b
1574	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1575.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
	// x3 has advanced by 40 loads of 16 bytes; put it back at the table start.
1576	sub	x3,x3,#80*8	// rewind
1577	add	v25.2d,v25.2d,v23.2d
1578	ld1	{v23.16b},[x1],#16		// load next input
1579	ext	v25.16b,v25.16b,v25.16b,#8
1580	ext	v5.16b,v3.16b,v0.16b,#8
1581	ext	v6.16b,v4.16b,v3.16b,#8
1582	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1583.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1584	rev64	v23.16b,v23.16b
1585	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1586.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
	// Add the state saved in v26-v29 at the top of the pass.
1587	add	v0.2d,v0.2d,v26.2d			// accumulate
1588	add	v1.2d,v1.2d,v27.2d
1589	add	v2.2d,v2.2d,v28.2d
1590	add	v3.2d,v3.2d,v29.2d
1591
	// x2 was decremented by the subs at the top of the pass.
1592	cbnz	x2,Loop_hw
1593
1594	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
1595
	// Pop the 16-byte frame pushed at entry. Only x29 is reloaded; no
	// instruction visible in this routine writes x30.
1596	ldr	x29,[sp],#16
1597	ret
1598
1599#endif
1600#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
1601