1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <ring-core/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
7#include <ring-core/arm_arch.h>
8
9
10
11
// Read-only constants shared by the ChaCha20 routines in this file.
// Layout matters: Lone must stay directly after the 16 bytes of Lsigma,
// because the NEON code loads Lsigma with a post-increment of 16 and then
// reads Lone through the same pointer.
12.section	.rodata
13
// On AArch64 the .align operand is a power of two: 2^5 = 32-byte alignment.
14.align	5
// State words 0..3. Each 64-bit value packs two 32-bit words (low half is the
// lower-numbered word): 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574.
// These are the ASCII characters of the string "expand 32-byte k". The code
// splits every 64-bit register into its two words with mov w / lsr 32.
15Lsigma:
16.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Four 32-bit lanes {1,0,0,0}. The NEON code loads this into v31 and adds it
// to the counter row, so only lane 0 (state word 12) is incremented.
17Lone:
18.long	1,0,0,0
// NUL-terminated ASCII attribution string:
// "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
// Nothing in the visible code references these bytes.
19.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// 2^2 = 4-byte alignment after the odd-length string.
20.align	2
21
// ---------------------------------------------------------------------------
// ChaCha20_ctr32 -- XOR a buffer with ChaCha20 keystream.
//
// Register use on entry, as read by the code below:
//   x0 = output pointer (all stores go to [x0])
//   x1 = input pointer (all loads of data come from [x1])
//   x2 = length in bytes; zero returns immediately
//   x3 = pointer to 32 bytes of key (state words 4..11)
//   x4 = pointer to 16 bytes loaded as state words 12..15 (the existing
//        comments call this the counter)
//
// Dispatch:
//   len == 0                          -> Labort (plain ret, no frame built)
//   len >= 192 and ARMV7_NEON bit set
//   in OPENSSL_armcap_P               -> ChaCha20_neon (tail branch)
//   otherwise                         -> scalar code at Lshort
//
// Scalar path register map:
//   x22,x23           sigma, two 32-bit state words per register
//   x24..x27          key words 4..11, two per register
//   x28               words 12,13 (low half = word 12, the block counter)
//   x30               words 14,15 (the link register is reused as data and is
//                     reloaded from the frame in the epilogue)
//   w5..w17,w19..w21  the 16 working words of the current block (x18 is
//                     skipped)
//   x4                reused as the round-loop counter once the state is loaded
//
// Stack: 96-byte frame holding x29,x30,x19..x28, plus 64 bytes of scratch
// below it that receive one keystream block when a partial block is needed.
// The scratch is overwritten with zeros before returning.
//
// AARCH64_VALID_CALL_TARGET, AARCH64_SIGN_LINK_REGISTER and
// AARCH64_VALIDATE_LINK_REGISTER are macros defined outside this file
// (presumably the BTI / pointer-authentication hooks from arm_arch.h --
// TODO confirm).
// ---------------------------------------------------------------------------
22.text
23
24.globl	ChaCha20_ctr32
25
// COFF symbol record; type 32 marks the symbol as a function.
26.def ChaCha20_ctr32
27   .type 32
28.endef
29.align	5
30ChaCha20_ctr32:
31	AARCH64_VALID_CALL_TARGET
32	cbz	x2,Labort		// len == 0: nothing to do
// The page address of the capability word is formed before the length test;
// on the short path x5 is simply overwritten with the Lsigma address below.
33#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
34	adrp	x5,:pg_hi21_nc:OPENSSL_armcap_P
35#else
36	adrp	x5,OPENSSL_armcap_P
37#endif
38	cmp	x2,#192
39	b.lo	Lshort			// fewer than 192 bytes: always scalar
40	ldr	w17,[x5,:lo12:OPENSSL_armcap_P]
41	tst	w17,#ARMV7_NEON
42	b.ne	ChaCha20_neon		// NEON available: use the vector path
43
// Scalar path: build the frame and save every callee-saved register used.
44Lshort:
45	AARCH64_SIGN_LINK_REGISTER
46	stp	x29,x30,[sp,#-96]!
47	add	x29,sp,#0
48
49	adrp	x5,Lsigma
50	add	x5,x5,:lo12:Lsigma
51	stp	x19,x20,[sp,#16]
52	stp	x21,x22,[sp,#32]
53	stp	x23,x24,[sp,#48]
54	stp	x25,x26,[sp,#64]
55	stp	x27,x28,[sp,#80]
56	sub	sp,sp,#64		// 64-byte scratch for a partial block
57
58	ldp	x22,x23,[x5]		// load sigma
59	ldp	x24,x25,[x3]		// load key
60	ldp	x26,x27,[x3,#16]
61	ldp	x28,x30,[x4]		// load counter
// Big-endian builds: rotating by 32 swaps the two 32-bit halves so that the
// lower-numbered state word again sits in the low half of each register.
62#ifdef	__AARCH64EB__
63	ror	x24,x24,#32
64	ror	x25,x25,#32
65	ror	x26,x26,#32
66	ror	x27,x27,#32
67	ror	x28,x28,#32
68	ror	x30,x30,#32
69#endif
70
// One pass of Loop_outer produces one 64-byte keystream block.
// Working word i lives in: 0..3 = w5..w8, 4..7 = w9..w12, 8..11 = w13..w16,
// 12..15 = w17,w19,w20,w21. The lsr writes leave the upper 32 bits clear.
71Loop_outer:
72	mov	w5,w22			// unpack key block
73	lsr	x6,x22,#32
74	mov	w7,w23
75	lsr	x8,x23,#32
76	mov	w9,w24
77	lsr	x10,x24,#32
78	mov	w11,w25
79	lsr	x12,x25,#32
80	mov	w13,w26
81	lsr	x14,x26,#32
82	mov	w15,w27
83	lsr	x16,x27,#32
84	mov	w17,w28
85	lsr	x19,x28,#32
86	mov	w20,w30
87	lsr	x21,x30,#32
88
89	mov	x4,#10			// 10 iterations of two rounds each
// The flags set here are consumed much later by b.lo Ltail and b.hi
// Loop_outer. Nothing in between writes the condition flags (the round loop
// uses sub + cbnz, not subs), so do not insert flag-setting instructions.
90	subs	x2,x2,#64
// Each iteration runs four quarter-rounds on the register groups
//   (w5,w9,w13,w17) (w6,w10,w14,w19) (w7,w11,w15,w20) (w8,w12,w16,w21)
// and then four more on
//   (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20).
// With (a,b,c,d) naming the members of a group, a quarter-round is
//   a += b; d ^= a; d = ror(d,16);  c += d; b ^= c; b = ror(b,20);
//   a += b; d ^= a; d = ror(d,24);  c += d; b ^= c; b = ror(b,25);
// Right rotation by 16/20/24/25 equals left rotation by 16/12/8/7.
91Loop:
92	sub	x4,x4,#1
93	add	w5,w5,w9		// first half: a += b
94	add	w6,w6,w10
95	add	w7,w7,w11
96	add	w8,w8,w12
97	eor	w17,w17,w5		// d ^= a
98	eor	w19,w19,w6
99	eor	w20,w20,w7
100	eor	w21,w21,w8
101	ror	w17,w17,#16
102	ror	w19,w19,#16
103	ror	w20,w20,#16
104	ror	w21,w21,#16
105	add	w13,w13,w17		// c += d
106	add	w14,w14,w19
107	add	w15,w15,w20
108	add	w16,w16,w21
109	eor	w9,w9,w13		// b ^= c
110	eor	w10,w10,w14
111	eor	w11,w11,w15
112	eor	w12,w12,w16
113	ror	w9,w9,#20
114	ror	w10,w10,#20
115	ror	w11,w11,#20
116	ror	w12,w12,#20
117	add	w5,w5,w9		// a += b
118	add	w6,w6,w10
119	add	w7,w7,w11
120	add	w8,w8,w12
121	eor	w17,w17,w5		// d ^= a
122	eor	w19,w19,w6
123	eor	w20,w20,w7
124	eor	w21,w21,w8
125	ror	w17,w17,#24
126	ror	w19,w19,#24
127	ror	w20,w20,#24
128	ror	w21,w21,#24
129	add	w13,w13,w17		// c += d
130	add	w14,w14,w19
131	add	w15,w15,w20
132	add	w16,w16,w21
133	eor	w9,w9,w13		// b ^= c
134	eor	w10,w10,w14
135	eor	w11,w11,w15
136	eor	w12,w12,w16
137	ror	w9,w9,#25
138	ror	w10,w10,#25
139	ror	w11,w11,#25
140	ror	w12,w12,#25
141	add	w5,w5,w10		// second half: same steps, regrouped registers
142	add	w6,w6,w11
143	add	w7,w7,w12
144	add	w8,w8,w9
145	eor	w21,w21,w5
146	eor	w17,w17,w6
147	eor	w19,w19,w7
148	eor	w20,w20,w8
149	ror	w21,w21,#16
150	ror	w17,w17,#16
151	ror	w19,w19,#16
152	ror	w20,w20,#16
153	add	w15,w15,w21
154	add	w16,w16,w17
155	add	w13,w13,w19
156	add	w14,w14,w20
157	eor	w10,w10,w15
158	eor	w11,w11,w16
159	eor	w12,w12,w13
160	eor	w9,w9,w14
161	ror	w10,w10,#20
162	ror	w11,w11,#20
163	ror	w12,w12,#20
164	ror	w9,w9,#20
165	add	w5,w5,w10
166	add	w6,w6,w11
167	add	w7,w7,w12
168	add	w8,w8,w9
169	eor	w21,w21,w5
170	eor	w17,w17,w6
171	eor	w19,w19,w7
172	eor	w20,w20,w8
173	ror	w21,w21,#24
174	ror	w17,w17,#24
175	ror	w19,w19,#24
176	ror	w20,w20,#24
177	add	w15,w15,w21
178	add	w16,w16,w17
179	add	w13,w13,w19
180	add	w14,w14,w20
181	eor	w10,w10,w15
182	eor	w11,w11,w16
183	eor	w12,w12,w13
184	eor	w9,w9,w14
185	ror	w10,w10,#25
186	ror	w11,w11,#25
187	ror	w12,w12,#25
188	ror	w9,w9,#25
189	cbnz	x4,Loop
190
// Add the original state back in. The odd-numbered words use a 64-bit add
// with the source shifted right by 32; a carry into bit 32 is harmless
// because the later lsl 32 in the pack step pushes it out of the register.
191	add	w5,w5,w22		// accumulate key block
192	add	x6,x6,x22,lsr#32
193	add	w7,w7,w23
194	add	x8,x8,x23,lsr#32
195	add	w9,w9,w24
196	add	x10,x10,x24,lsr#32
197	add	w11,w11,w25
198	add	x12,x12,x25,lsr#32
199	add	w13,w13,w26
200	add	x14,x14,x26,lsr#32
201	add	w15,w15,w27
202	add	x16,x16,x27,lsr#32
203	add	w17,w17,w28
204	add	x19,x19,x28,lsr#32
205	add	w20,w20,w30
206	add	x21,x21,x30,lsr#32
207
// Flags are still those of subs x2,x2,64: lower means that fewer than
// 64 bytes were left, so this block is only partially consumed.
208	b.lo	Ltail
209
// Full block: merge word pairs into 64-bit registers, XOR with 64 input
// bytes and store. Input loads are interleaved with the packing.
210	add	x5,x5,x6,lsl#32	// pack
211	add	x7,x7,x8,lsl#32
212	ldp	x6,x8,[x1,#0]		// load input
213	add	x9,x9,x10,lsl#32
214	add	x11,x11,x12,lsl#32
215	ldp	x10,x12,[x1,#16]
216	add	x13,x13,x14,lsl#32
217	add	x15,x15,x16,lsl#32
218	ldp	x14,x16,[x1,#32]
219	add	x17,x17,x19,lsl#32
220	add	x20,x20,x21,lsl#32
221	ldp	x19,x21,[x1,#48]
222	add	x1,x1,#64
// Big-endian builds byte-swap the keystream before it meets the data.
223#ifdef	__AARCH64EB__
224	rev	x5,x5
225	rev	x7,x7
226	rev	x9,x9
227	rev	x11,x11
228	rev	x13,x13
229	rev	x15,x15
230	rev	x17,x17
231	rev	x20,x20
232#endif
233	eor	x5,x5,x6
234	eor	x7,x7,x8
235	eor	x9,x9,x10
236	eor	x11,x11,x12
237	eor	x13,x13,x14
238	eor	x15,x15,x16
239	eor	x17,x17,x19
240	eor	x20,x20,x21
241
// The counter increment is a 64-bit add on x28, which holds words 12 and 13
// together, so a carry out of word 12 would propagate into word 13.
242	stp	x5,x7,[x0,#0]		// store output
243	add	x28,x28,#1			// increment counter
244	stp	x9,x11,[x0,#16]
245	stp	x13,x15,[x0,#32]
246	stp	x17,x20,[x0,#48]
247	add	x0,x0,#64
248
// Still the flags from subs: higher means bytes remain after this block.
249	b.hi	Loop_outer
250
// Epilogue for an exact multiple of 64 bytes: drop the scratch area and
// restore the saved registers through the frame pointer.
251	ldp	x19,x20,[x29,#16]
252	add	sp,sp,#64
253	ldp	x21,x22,[x29,#32]
254	ldp	x23,x24,[x29,#48]
255	ldp	x25,x26,[x29,#64]
256	ldp	x27,x28,[x29,#80]
257	ldp	x29,x30,[sp],#96
258	AARCH64_VALIDATE_LINK_REGISTER
// Labort is also the target of the zero-length check at entry, where no
// frame has been built; it must therefore contain nothing but the ret.
259Labort:
260	ret
261
// Partial final block. Ltail undoes the subs so x2 is again the number of
// bytes left (1..63 on the paths visible here).
262.align	4
263Ltail:
264	add	x2,x2,#64
// Less_than_64 is also entered from Ltail_neon in ChaCha20_neon, which uses
// the same frame layout and the same w5..w21 working registers.
// Pointer setup: x1 and x4 point one past the end of the input and of the
// scratch block, x2 becomes a negative index that counts up to zero. x0 is
// biased by -1 because the store in the loop happens after x2 is incremented.
265Less_than_64:
266	sub	x0,x0,#1
267	add	x1,x1,x2
268	add	x0,x0,x2
269	add	x4,sp,x2
270	neg	x2,x2
271
272	add	x5,x5,x6,lsl#32	// pack
273	add	x7,x7,x8,lsl#32
274	add	x9,x9,x10,lsl#32
275	add	x11,x11,x12,lsl#32
276	add	x13,x13,x14,lsl#32
277	add	x15,x15,x16,lsl#32
278	add	x17,x17,x19,lsl#32
279	add	x20,x20,x21,lsl#32
280#ifdef	__AARCH64EB__
281	rev	x5,x5
282	rev	x7,x7
283	rev	x9,x9
284	rev	x11,x11
285	rev	x13,x13
286	rev	x15,x15
287	rev	x17,x17
288	rev	x20,x20
289#endif
// Spill the whole keystream block to the stack scratch area.
290	stp	x5,x7,[sp,#0]
291	stp	x9,x11,[sp,#16]
292	stp	x13,x15,[sp,#32]
293	stp	x17,x20,[sp,#48]
294
// Byte-wise XOR of the remaining input with the spilled keystream.
295Loop_tail:
296	ldrb	w10,[x1,x2]
297	ldrb	w11,[x4,x2]
298	add	x2,x2,#1
299	eor	w10,w10,w11
300	strb	w10,[x0,x2]
301	cbnz	x2,Loop_tail
302
// Overwrite the spilled keystream with zeros before the stack is released.
303	stp	xzr,xzr,[sp,#0]
304	stp	xzr,xzr,[sp,#16]
305	stp	xzr,xzr,[sp,#32]
306	stp	xzr,xzr,[sp,#48]
307
308	ldp	x19,x20,[x29,#16]
309	add	sp,sp,#64
310	ldp	x21,x22,[x29,#32]
311	ldp	x23,x24,[x29,#48]
312	ldp	x25,x26,[x29,#64]
313	ldp	x27,x28,[x29,#80]
314	ldp	x29,x30,[sp],#96
315	AARCH64_VALIDATE_LINK_REGISTER
316	ret
317
318
// ---------------------------------------------------------------------------
// ChaCha20_neon -- combined scalar + NEON path, 256 bytes per outer pass.
//
// There is no .globl for this symbol. The only entry visible in this file is
// the b.ne in ChaCha20_ctr32, taken when len >= 192 and the ARMV7_NEON
// capability bit is set; x0..x4 still hold the ChaCha20_ctr32 arguments
// (x0 = out, x1 = in, x2 = len, x3 = key pointer, x4 = counter pointer).
//
// Lengths of 512 bytes or more leave this function right after the register
// save and continue at L512_or_more_neon inside ChaCha20_512_neon, which
// expects the same 96-byte frame to be in place.
//
// Each pass of Loop_outer_neon computes four keystream blocks at once:
//   scalar w5..w21  block with counter n      (same register map as the
//                                              scalar path of ChaCha20_ctr32)
//   v0..v3          block with counter n+1
//   v4..v7          block with counter n+2
//   v16..v19        block with counter n+3
// In every vector block the four registers are the state rows: words 0..3,
// 4..7, 8..11 and 12..15.
//
// Long-lived vector registers:
//   v24        sigma row             v25,v26   key rows
//   v27..v29   counter rows for the three vector blocks
//   v31        increment vector, {1,0,0,0} at first, then {4,0,0,0}
//   v20..v22   rotate temporaries in the round loop; v20..v23 and v0..v3
//              double as input buffers afterwards
// Only v0..v7 and v16..v31 are written here, so the callee-saved v8..v15
// are left untouched.
//
// Stack use matches the scalar path: 96-byte register frame plus 64 bytes of
// scratch that hold one keystream block for a partial tail and are zeroed
// before returning.
// ---------------------------------------------------------------------------
319.def ChaCha20_neon
320   .type 32
321.endef
322.align	5
323ChaCha20_neon:
324	AARCH64_SIGN_LINK_REGISTER
325	stp	x29,x30,[sp,#-96]!
326	add	x29,sp,#0
327
328	adrp	x5,Lsigma
329	add	x5,x5,:lo12:Lsigma
330	stp	x19,x20,[sp,#16]
331	stp	x21,x22,[sp,#32]
332	stp	x23,x24,[sp,#48]
333	stp	x25,x26,[sp,#64]
334	stp	x27,x28,[sp,#80]
335	cmp	x2,#512
336	b.hs	L512_or_more_neon	// 512 bytes or more: wider code path
337
338	sub	sp,sp,#64		// 64-byte scratch for a partial block
339
// Every piece of state is loaded twice: once into general registers for the
// scalar block and once into a vector register for the NEON blocks. The
// post-increment on x5 leaves it pointing at Lone for the v31 load.
340	ldp	x22,x23,[x5]		// load sigma
341	ld1	{v24.4s},[x5],#16
342	ldp	x24,x25,[x3]		// load key
343	ldp	x26,x27,[x3,#16]
344	ld1	{v25.4s,v26.4s},[x3]
345	ldp	x28,x30,[x4]		// load counter
346	ld1	{v27.4s},[x4]
347	ld1	{v31.4s},[x5]
// Big-endian builds: rev64 swaps the 32-bit words inside each 64-bit half of
// the sigma vector, and the ror by 32 does the same for the scalar copies.
348#ifdef	__AARCH64EB__
349	rev64	v24.4s,v24.4s
350	ror	x24,x24,#32
351	ror	x25,x25,#32
352	ror	x26,x26,#32
353	ror	x27,x27,#32
354	ror	x28,x28,#32
355	ror	x30,x30,#32
356#endif
// Counter rows for the vector blocks: v27 = n+1, v28 = n+2, v29 = n+3.
// The scalar block keeps counter n in x28. v31 then becomes the per-pass
// step of 4.
357	add	v27.4s,v27.4s,v31.4s		// += 1
358	add	v28.4s,v27.4s,v31.4s
359	add	v29.4s,v28.4s,v31.4s
360	shl	v31.4s,v31.4s,#2			// 1 -> 4
361
// Start of one 256-byte pass: copy the input state into the working
// registers of all four blocks (scalar and vector moves are interleaved).
362Loop_outer_neon:
363	mov	w5,w22			// unpack key block
364	lsr	x6,x22,#32
365	mov	v0.16b,v24.16b
366	mov	w7,w23
367	lsr	x8,x23,#32
368	mov	v4.16b,v24.16b
369	mov	w9,w24
370	lsr	x10,x24,#32
371	mov	v16.16b,v24.16b
372	mov	w11,w25
373	mov	v1.16b,v25.16b
374	lsr	x12,x25,#32
375	mov	v5.16b,v25.16b
376	mov	w13,w26
377	mov	v17.16b,v25.16b
378	lsr	x14,x26,#32
379	mov	v3.16b,v27.16b
380	mov	w15,w27
381	mov	v7.16b,v28.16b
382	lsr	x16,x27,#32
383	mov	v19.16b,v29.16b
384	mov	w17,w28
385	mov	v2.16b,v26.16b
386	lsr	x19,x28,#32
387	mov	v6.16b,v26.16b
388	mov	w20,w30
389	mov	v18.16b,v26.16b
390	lsr	x21,x30,#32
391
392	mov	x4,#10			// 10 iterations of two rounds each
// The flags set here are consumed by b.lo Ltail_neon and b.hi
// Loop_outer_neon. No instruction in between writes the condition flags.
393	subs	x2,x2,#256
// Round loop. Scalar and vector instructions alternate line by line; the
// scalar stream is the same sequence as Loop in ChaCha20_ctr32. The vector
// stream performs the same steps on whole rows, with these rotate idioms:
//   rev32 on .8h lanes        rotate each word by 16
//   ushr 20 then sli 12       rotate left by 12
//   ushr 24 then sli 8        rotate left by 8
//   ushr 25 then sli 7        rotate left by 7
// The ext instructions at the end of each half rotate the lanes of rows
// 1, 2 and 3 so that the next half works on the regrouped words.
394Loop_neon:
395	sub	x4,x4,#1
396	add	v0.4s,v0.4s,v1.4s		// first half: row0 += row1
397	add	w5,w5,w9
398	add	v4.4s,v4.4s,v5.4s
399	add	w6,w6,w10
400	add	v16.4s,v16.4s,v17.4s
401	add	w7,w7,w11
402	eor	v3.16b,v3.16b,v0.16b		// row3 ^= row0
403	add	w8,w8,w12
404	eor	v7.16b,v7.16b,v4.16b
405	eor	w17,w17,w5
406	eor	v19.16b,v19.16b,v16.16b
407	eor	w19,w19,w6
408	rev32	v3.8h,v3.8h			// rotate row3 words by 16
409	eor	w20,w20,w7
410	rev32	v7.8h,v7.8h
411	eor	w21,w21,w8
412	rev32	v19.8h,v19.8h
413	ror	w17,w17,#16
414	add	v2.4s,v2.4s,v3.4s		// row2 += row3
415	ror	w19,w19,#16
416	add	v6.4s,v6.4s,v7.4s
417	ror	w20,w20,#16
418	add	v18.4s,v18.4s,v19.4s
419	ror	w21,w21,#16
420	eor	v20.16b,v1.16b,v2.16b		// temp = row1 ^ row2
421	add	w13,w13,w17
422	eor	v21.16b,v5.16b,v6.16b
423	add	w14,w14,w19
424	eor	v22.16b,v17.16b,v18.16b
425	add	w15,w15,w20
426	ushr	v1.4s,v20.4s,#20		// row1 = temp rotated left by 12
427	add	w16,w16,w21
428	ushr	v5.4s,v21.4s,#20
429	eor	w9,w9,w13
430	ushr	v17.4s,v22.4s,#20
431	eor	w10,w10,w14
432	sli	v1.4s,v20.4s,#12
433	eor	w11,w11,w15
434	sli	v5.4s,v21.4s,#12
435	eor	w12,w12,w16
436	sli	v17.4s,v22.4s,#12
437	ror	w9,w9,#20
438	add	v0.4s,v0.4s,v1.4s		// row0 += row1
439	ror	w10,w10,#20
440	add	v4.4s,v4.4s,v5.4s
441	ror	w11,w11,#20
442	add	v16.4s,v16.4s,v17.4s
443	ror	w12,w12,#20
444	eor	v20.16b,v3.16b,v0.16b		// temp = row3 ^ row0
445	add	w5,w5,w9
446	eor	v21.16b,v7.16b,v4.16b
447	add	w6,w6,w10
448	eor	v22.16b,v19.16b,v16.16b
449	add	w7,w7,w11
450	ushr	v3.4s,v20.4s,#24		// row3 = temp rotated left by 8
451	add	w8,w8,w12
452	ushr	v7.4s,v21.4s,#24
453	eor	w17,w17,w5
454	ushr	v19.4s,v22.4s,#24
455	eor	w19,w19,w6
456	sli	v3.4s,v20.4s,#8
457	eor	w20,w20,w7
458	sli	v7.4s,v21.4s,#8
459	eor	w21,w21,w8
460	sli	v19.4s,v22.4s,#8
461	ror	w17,w17,#24
462	add	v2.4s,v2.4s,v3.4s		// row2 += row3
463	ror	w19,w19,#24
464	add	v6.4s,v6.4s,v7.4s
465	ror	w20,w20,#24
466	add	v18.4s,v18.4s,v19.4s
467	ror	w21,w21,#24
468	eor	v20.16b,v1.16b,v2.16b		// temp = row1 ^ row2
469	add	w13,w13,w17
470	eor	v21.16b,v5.16b,v6.16b
471	add	w14,w14,w19
472	eor	v22.16b,v17.16b,v18.16b
473	add	w15,w15,w20
474	ushr	v1.4s,v20.4s,#25		// row1 = temp rotated left by 7
475	add	w16,w16,w21
476	ushr	v5.4s,v21.4s,#25
477	eor	w9,w9,w13
478	ushr	v17.4s,v22.4s,#25
479	eor	w10,w10,w14
480	sli	v1.4s,v20.4s,#7
481	eor	w11,w11,w15
482	sli	v5.4s,v21.4s,#7
483	eor	w12,w12,w16
484	sli	v17.4s,v22.4s,#7
485	ror	w9,w9,#25
486	ext	v2.16b,v2.16b,v2.16b,#8		// rotate row2 by two lanes
487	ror	w10,w10,#25
488	ext	v6.16b,v6.16b,v6.16b,#8
489	ror	w11,w11,#25
490	ext	v18.16b,v18.16b,v18.16b,#8
491	ror	w12,w12,#25
492	ext	v3.16b,v3.16b,v3.16b,#12	// rotate row3 lanes (12-byte ext)
493	ext	v7.16b,v7.16b,v7.16b,#12
494	ext	v19.16b,v19.16b,v19.16b,#12
495	ext	v1.16b,v1.16b,v1.16b,#4		// rotate row1 lanes (4-byte ext)
496	ext	v5.16b,v5.16b,v5.16b,#4
497	ext	v17.16b,v17.16b,v17.16b,#4
498	add	v0.4s,v0.4s,v1.4s		// second half: same steps on regrouped words
499	add	w5,w5,w10
500	add	v4.4s,v4.4s,v5.4s
501	add	w6,w6,w11
502	add	v16.4s,v16.4s,v17.4s
503	add	w7,w7,w12
504	eor	v3.16b,v3.16b,v0.16b
505	add	w8,w8,w9
506	eor	v7.16b,v7.16b,v4.16b
507	eor	w21,w21,w5
508	eor	v19.16b,v19.16b,v16.16b
509	eor	w17,w17,w6
510	rev32	v3.8h,v3.8h
511	eor	w19,w19,w7
512	rev32	v7.8h,v7.8h
513	eor	w20,w20,w8
514	rev32	v19.8h,v19.8h
515	ror	w21,w21,#16
516	add	v2.4s,v2.4s,v3.4s
517	ror	w17,w17,#16
518	add	v6.4s,v6.4s,v7.4s
519	ror	w19,w19,#16
520	add	v18.4s,v18.4s,v19.4s
521	ror	w20,w20,#16
522	eor	v20.16b,v1.16b,v2.16b
523	add	w15,w15,w21
524	eor	v21.16b,v5.16b,v6.16b
525	add	w16,w16,w17
526	eor	v22.16b,v17.16b,v18.16b
527	add	w13,w13,w19
528	ushr	v1.4s,v20.4s,#20
529	add	w14,w14,w20
530	ushr	v5.4s,v21.4s,#20
531	eor	w10,w10,w15
532	ushr	v17.4s,v22.4s,#20
533	eor	w11,w11,w16
534	sli	v1.4s,v20.4s,#12
535	eor	w12,w12,w13
536	sli	v5.4s,v21.4s,#12
537	eor	w9,w9,w14
538	sli	v17.4s,v22.4s,#12
539	ror	w10,w10,#20
540	add	v0.4s,v0.4s,v1.4s
541	ror	w11,w11,#20
542	add	v4.4s,v4.4s,v5.4s
543	ror	w12,w12,#20
544	add	v16.4s,v16.4s,v17.4s
545	ror	w9,w9,#20
546	eor	v20.16b,v3.16b,v0.16b
547	add	w5,w5,w10
548	eor	v21.16b,v7.16b,v4.16b
549	add	w6,w6,w11
550	eor	v22.16b,v19.16b,v16.16b
551	add	w7,w7,w12
552	ushr	v3.4s,v20.4s,#24
553	add	w8,w8,w9
554	ushr	v7.4s,v21.4s,#24
555	eor	w21,w21,w5
556	ushr	v19.4s,v22.4s,#24
557	eor	w17,w17,w6
558	sli	v3.4s,v20.4s,#8
559	eor	w19,w19,w7
560	sli	v7.4s,v21.4s,#8
561	eor	w20,w20,w8
562	sli	v19.4s,v22.4s,#8
563	ror	w21,w21,#24
564	add	v2.4s,v2.4s,v3.4s
565	ror	w17,w17,#24
566	add	v6.4s,v6.4s,v7.4s
567	ror	w19,w19,#24
568	add	v18.4s,v18.4s,v19.4s
569	ror	w20,w20,#24
570	eor	v20.16b,v1.16b,v2.16b
571	add	w15,w15,w21
572	eor	v21.16b,v5.16b,v6.16b
573	add	w16,w16,w17
574	eor	v22.16b,v17.16b,v18.16b
575	add	w13,w13,w19
576	ushr	v1.4s,v20.4s,#25
577	add	w14,w14,w20
578	ushr	v5.4s,v21.4s,#25
579	eor	w10,w10,w15
580	ushr	v17.4s,v22.4s,#25
581	eor	w11,w11,w16
582	sli	v1.4s,v20.4s,#7
583	eor	w12,w12,w13
584	sli	v5.4s,v21.4s,#7
585	eor	w9,w9,w14
586	sli	v17.4s,v22.4s,#7
587	ror	w10,w10,#25
588	ext	v2.16b,v2.16b,v2.16b,#8		// undo the lane rotation of row2
589	ror	w11,w11,#25
590	ext	v6.16b,v6.16b,v6.16b,#8
591	ror	w12,w12,#25
592	ext	v18.16b,v18.16b,v18.16b,#8
593	ror	w9,w9,#25
594	ext	v3.16b,v3.16b,v3.16b,#4		// undo the lane rotation of row3
595	ext	v7.16b,v7.16b,v7.16b,#4
596	ext	v19.16b,v19.16b,v19.16b,#4
597	ext	v1.16b,v1.16b,v1.16b,#12	// undo the lane rotation of row1
598	ext	v5.16b,v5.16b,v5.16b,#12
599	ext	v17.16b,v17.16b,v17.16b,#12
600	cbnz	x4,Loop_neon
601
// Add the input state back into all four blocks. Each vector block adds its
// own counter row (v27, v28, v29); sigma and key rows are shared.
602	add	w5,w5,w22		// accumulate key block
603	add	v0.4s,v0.4s,v24.4s
604	add	x6,x6,x22,lsr#32
605	add	v4.4s,v4.4s,v24.4s
606	add	w7,w7,w23
607	add	v16.4s,v16.4s,v24.4s
608	add	x8,x8,x23,lsr#32
609	add	v2.4s,v2.4s,v26.4s
610	add	w9,w9,w24
611	add	v6.4s,v6.4s,v26.4s
612	add	x10,x10,x24,lsr#32
613	add	v18.4s,v18.4s,v26.4s
614	add	w11,w11,w25
615	add	v3.4s,v3.4s,v27.4s
616	add	x12,x12,x25,lsr#32
617	add	w13,w13,w26
618	add	v7.4s,v7.4s,v28.4s
619	add	x14,x14,x26,lsr#32
620	add	w15,w15,w27
621	add	v19.4s,v19.4s,v29.4s
622	add	x16,x16,x27,lsr#32
623	add	w17,w17,w28
624	add	v1.4s,v1.4s,v25.4s
625	add	x19,x19,x28,lsr#32
626	add	w20,w20,w30
627	add	v5.4s,v5.4s,v25.4s
628	add	x21,x21,x30,lsr#32
629	add	v17.4s,v17.4s,v25.4s
630
// Flags from subs x2,x2,256: lower means fewer than 256 bytes were left.
631	b.lo	Ltail_neon
632
// Full 256 bytes. Output order is: scalar block, v0..v3, v4..v7, v16..v19.
// The first 64 input bytes are read with ldp, the rest with post-indexed ld1.
633	add	x5,x5,x6,lsl#32	// pack
634	add	x7,x7,x8,lsl#32
635	ldp	x6,x8,[x1,#0]		// load input
636	add	x9,x9,x10,lsl#32
637	add	x11,x11,x12,lsl#32
638	ldp	x10,x12,[x1,#16]
639	add	x13,x13,x14,lsl#32
640	add	x15,x15,x16,lsl#32
641	ldp	x14,x16,[x1,#32]
642	add	x17,x17,x19,lsl#32
643	add	x20,x20,x21,lsl#32
644	ldp	x19,x21,[x1,#48]
645	add	x1,x1,#64
646#ifdef	__AARCH64EB__
647	rev	x5,x5
648	rev	x7,x7
649	rev	x9,x9
650	rev	x11,x11
651	rev	x13,x13
652	rev	x15,x15
653	rev	x17,x17
654	rev	x20,x20
655#endif
656	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
657	eor	x5,x5,x6
658	eor	x7,x7,x8
659	eor	x9,x9,x10
660	eor	x11,x11,x12
661	eor	x13,x13,x14
662	eor	v0.16b,v0.16b,v20.16b
663	eor	x15,x15,x16
664	eor	v1.16b,v1.16b,v21.16b
665	eor	x17,x17,x19
666	eor	v2.16b,v2.16b,v22.16b
667	eor	x20,x20,x21
668	eor	v3.16b,v3.16b,v23.16b
669	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
670
// Advance every counter by 4 for the next pass while the stores drain.
671	stp	x5,x7,[x0,#0]		// store output
672	add	x28,x28,#4			// increment counter
673	stp	x9,x11,[x0,#16]
674	add	v27.4s,v27.4s,v31.4s		// += 4
675	stp	x13,x15,[x0,#32]
676	add	v28.4s,v28.4s,v31.4s
677	stp	x17,x20,[x0,#48]
678	add	v29.4s,v29.4s,v31.4s
679	add	x0,x0,#64
680
// v0..v3 are free once stored, so they are reused as the input buffer for
// the fourth block.
681	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
682	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
683
684	eor	v4.16b,v4.16b,v20.16b
685	eor	v5.16b,v5.16b,v21.16b
686	eor	v6.16b,v6.16b,v22.16b
687	eor	v7.16b,v7.16b,v23.16b
688	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
689
690	eor	v16.16b,v16.16b,v0.16b
691	eor	v17.16b,v17.16b,v1.16b
692	eor	v18.16b,v18.16b,v2.16b
693	eor	v19.16b,v19.16b,v3.16b
694	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
695
// Still the flags from subs: higher means more input remains.
696	b.hi	Loop_outer_neon
697
// Exact multiple of 256 bytes: release scratch, restore registers, return.
698	ldp	x19,x20,[x29,#16]
699	add	sp,sp,#64
700	ldp	x21,x22,[x29,#32]
701	ldp	x23,x24,[x29,#48]
702	ldp	x25,x26,[x29,#64]
703	ldp	x27,x28,[x29,#80]
704	ldp	x29,x30,[sp],#96
705	AARCH64_VALIDATE_LINK_REGISTER
706	ret
707
// Fewer than 256 bytes remain. x2 is restored to the true remaining count
// and the four already-computed blocks are consumed one at a time.
// Under 64 bytes: hand over to Less_than_64 in ChaCha20_ctr32, which packs
// the scalar block from w5..w21 and finishes with the shared frame layout.
708Ltail_neon:
709	add	x2,x2,#256
// The flags of this cmp are used twice: by b.lo right away and by the
// b.eq Ldone_neon after the stores. Nothing in between writes the flags.
710	cmp	x2,#64
711	b.lo	Less_than_64
712
713	add	x5,x5,x6,lsl#32	// pack
714	add	x7,x7,x8,lsl#32
715	ldp	x6,x8,[x1,#0]		// load input
716	add	x9,x9,x10,lsl#32
717	add	x11,x11,x12,lsl#32
718	ldp	x10,x12,[x1,#16]
719	add	x13,x13,x14,lsl#32
720	add	x15,x15,x16,lsl#32
721	ldp	x14,x16,[x1,#32]
722	add	x17,x17,x19,lsl#32
723	add	x20,x20,x21,lsl#32
724	ldp	x19,x21,[x1,#48]
725	add	x1,x1,#64
726#ifdef	__AARCH64EB__
727	rev	x5,x5
728	rev	x7,x7
729	rev	x9,x9
730	rev	x11,x11
731	rev	x13,x13
732	rev	x15,x15
733	rev	x17,x17
734	rev	x20,x20
735#endif
736	eor	x5,x5,x6
737	eor	x7,x7,x8
738	eor	x9,x9,x10
739	eor	x11,x11,x12
740	eor	x13,x13,x14
741	eor	x15,x15,x16
742	eor	x17,x17,x19
743	eor	x20,x20,x21
744
745	stp	x5,x7,[x0,#0]		// store output
746	add	x28,x28,#4			// increment counter
747	stp	x9,x11,[x0,#16]
748	stp	x13,x15,[x0,#32]
749	stp	x17,x20,[x0,#48]
750	add	x0,x0,#64
751	b.eq	Ldone_neon		// exactly 64 bytes were left
752	sub	x2,x2,#64
753	cmp	x2,#64
754	b.lo	Less_than_128		// partial second block, keystream in v0..v3
755
// Second whole block (v0..v3).
756	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
757	eor	v0.16b,v0.16b,v20.16b
758	eor	v1.16b,v1.16b,v21.16b
759	eor	v2.16b,v2.16b,v22.16b
760	eor	v3.16b,v3.16b,v23.16b
761	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
762	b.eq	Ldone_neon		// flags from the cmp before this block
763	sub	x2,x2,#64
764	cmp	x2,#64
765	b.lo	Less_than_192		// partial third block, keystream in v4..v7
766
// Third whole block (v4..v7).
767	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
768	eor	v4.16b,v4.16b,v20.16b
769	eor	v5.16b,v5.16b,v21.16b
770	eor	v6.16b,v6.16b,v22.16b
771	eor	v7.16b,v7.16b,v23.16b
772	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
773	b.eq	Ldone_neon
774	sub	x2,x2,#64
775
// Partial fourth block: spill v16..v19 as the keystream for the byte loop.
776	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
777	b	Last_neon
778
// Each stub spills the next unused keystream block to the stack scratch.
779Less_than_128:
780	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
781	b	Last_neon
782Less_than_192:
783	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
784	b	Last_neon
785
// Byte-wise tail, same pointer scheme as Less_than_64: x1 and x4 point one
// past the end of input and scratch, x2 is a negative index counting up to
// zero, and x0 is biased by -1 because the store follows the increment.
// x2 is between 1 and 63 on every path that reaches this label.
786.align	4
787Last_neon:
788	sub	x0,x0,#1
789	add	x1,x1,x2
790	add	x0,x0,x2
791	add	x4,sp,x2
792	neg	x2,x2
793
794Loop_tail_neon:
795	ldrb	w10,[x1,x2]
796	ldrb	w11,[x4,x2]
797	add	x2,x2,#1
798	eor	w10,w10,w11
799	strb	w10,[x0,x2]
800	cbnz	x2,Loop_tail_neon
801
// Overwrite the spilled keystream with zeros before the stack is released.
802	stp	xzr,xzr,[sp,#0]
803	stp	xzr,xzr,[sp,#16]
804	stp	xzr,xzr,[sp,#32]
805	stp	xzr,xzr,[sp,#48]
806
// Common exit. Paths that arrive here through b.eq never spilled keystream
// to the scratch area, so they skip the wipe above.
807Ldone_neon:
808	ldp	x19,x20,[x29,#16]
809	add	sp,sp,#64
810	ldp	x21,x22,[x29,#32]
811	ldp	x23,x24,[x29,#48]
812	ldp	x25,x26,[x29,#64]
813	ldp	x27,x28,[x29,#80]
814	ldp	x29,x30,[sp],#96
815	AARCH64_VALIDATE_LINK_REGISTER
816	ret
817
818.def ChaCha20_512_neon
819   .type 32
820.endef
821.align	5
822ChaCha20_512_neon:
823	AARCH64_SIGN_LINK_REGISTER
824	stp	x29,x30,[sp,#-96]!
825	add	x29,sp,#0
826
827	adrp	x5,Lsigma
828	add	x5,x5,:lo12:Lsigma
829	stp	x19,x20,[sp,#16]
830	stp	x21,x22,[sp,#32]
831	stp	x23,x24,[sp,#48]
832	stp	x25,x26,[sp,#64]
833	stp	x27,x28,[sp,#80]
834
835L512_or_more_neon:
836	sub	sp,sp,#128+64
837
838	ldp	x22,x23,[x5]		// load sigma
839	ld1	{v24.4s},[x5],#16
840	ldp	x24,x25,[x3]		// load key
841	ldp	x26,x27,[x3,#16]
842	ld1	{v25.4s,v26.4s},[x3]
843	ldp	x28,x30,[x4]		// load counter
844	ld1	{v27.4s},[x4]
845	ld1	{v31.4s},[x5]
846#ifdef	__AARCH64EB__
847	rev64	v24.4s,v24.4s
848	ror	x24,x24,#32
849	ror	x25,x25,#32
850	ror	x26,x26,#32
851	ror	x27,x27,#32
852	ror	x28,x28,#32
853	ror	x30,x30,#32
854#endif
855	add	v27.4s,v27.4s,v31.4s		// += 1
856	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
857	add	v27.4s,v27.4s,v31.4s		// not typo
858	str	q26,[sp,#32]
859	add	v28.4s,v27.4s,v31.4s
860	add	v29.4s,v28.4s,v31.4s
861	add	v30.4s,v29.4s,v31.4s
862	shl	v31.4s,v31.4s,#2			// 1 -> 4
863
864	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
865	stp	d10,d11,[sp,#128+16]
866	stp	d12,d13,[sp,#128+32]
867	stp	d14,d15,[sp,#128+48]
868
869	sub	x2,x2,#512			// not typo
870
871Loop_outer_512_neon:
872	mov	v0.16b,v24.16b
873	mov	v4.16b,v24.16b
874	mov	v8.16b,v24.16b
875	mov	v12.16b,v24.16b
876	mov	v16.16b,v24.16b
877	mov	v20.16b,v24.16b
878	mov	v1.16b,v25.16b
879	mov	w5,w22			// unpack key block
880	mov	v5.16b,v25.16b
881	lsr	x6,x22,#32
882	mov	v9.16b,v25.16b
883	mov	w7,w23
884	mov	v13.16b,v25.16b
885	lsr	x8,x23,#32
886	mov	v17.16b,v25.16b
887	mov	w9,w24
888	mov	v21.16b,v25.16b
889	lsr	x10,x24,#32
890	mov	v3.16b,v27.16b
891	mov	w11,w25
892	mov	v7.16b,v28.16b
893	lsr	x12,x25,#32
894	mov	v11.16b,v29.16b
895	mov	w13,w26
896	mov	v15.16b,v30.16b
897	lsr	x14,x26,#32
898	mov	v2.16b,v26.16b
899	mov	w15,w27
900	mov	v6.16b,v26.16b
901	lsr	x16,x27,#32
902	add	v19.4s,v3.4s,v31.4s			// +4
903	mov	w17,w28
904	add	v23.4s,v7.4s,v31.4s			// +4
905	lsr	x19,x28,#32
906	mov	v10.16b,v26.16b
907	mov	w20,w30
908	mov	v14.16b,v26.16b
909	lsr	x21,x30,#32
910	mov	v18.16b,v26.16b
911	stp	q27,q28,[sp,#48]		// off-load key block, variable part
912	mov	v22.16b,v26.16b
913	str	q29,[sp,#80]
914
915	mov	x4,#5
916	subs	x2,x2,#512
917Loop_upper_neon:
918	sub	x4,x4,#1
919	add	v0.4s,v0.4s,v1.4s
920	add	w5,w5,w9
921	add	v4.4s,v4.4s,v5.4s
922	add	w6,w6,w10
923	add	v8.4s,v8.4s,v9.4s
924	add	w7,w7,w11
925	add	v12.4s,v12.4s,v13.4s
926	add	w8,w8,w12
927	add	v16.4s,v16.4s,v17.4s
928	eor	w17,w17,w5
929	add	v20.4s,v20.4s,v21.4s
930	eor	w19,w19,w6
931	eor	v3.16b,v3.16b,v0.16b
932	eor	w20,w20,w7
933	eor	v7.16b,v7.16b,v4.16b
934	eor	w21,w21,w8
935	eor	v11.16b,v11.16b,v8.16b
936	ror	w17,w17,#16
937	eor	v15.16b,v15.16b,v12.16b
938	ror	w19,w19,#16
939	eor	v19.16b,v19.16b,v16.16b
940	ror	w20,w20,#16
941	eor	v23.16b,v23.16b,v20.16b
942	ror	w21,w21,#16
943	rev32	v3.8h,v3.8h
944	add	w13,w13,w17
945	rev32	v7.8h,v7.8h
946	add	w14,w14,w19
947	rev32	v11.8h,v11.8h
948	add	w15,w15,w20
949	rev32	v15.8h,v15.8h
950	add	w16,w16,w21
951	rev32	v19.8h,v19.8h
952	eor	w9,w9,w13
953	rev32	v23.8h,v23.8h
954	eor	w10,w10,w14
955	add	v2.4s,v2.4s,v3.4s
956	eor	w11,w11,w15
957	add	v6.4s,v6.4s,v7.4s
958	eor	w12,w12,w16
959	add	v10.4s,v10.4s,v11.4s
960	ror	w9,w9,#20
961	add	v14.4s,v14.4s,v15.4s
962	ror	w10,w10,#20
963	add	v18.4s,v18.4s,v19.4s
964	ror	w11,w11,#20
965	add	v22.4s,v22.4s,v23.4s
966	ror	w12,w12,#20
967	eor	v24.16b,v1.16b,v2.16b
968	add	w5,w5,w9
969	eor	v25.16b,v5.16b,v6.16b
970	add	w6,w6,w10
971	eor	v26.16b,v9.16b,v10.16b
972	add	w7,w7,w11
973	eor	v27.16b,v13.16b,v14.16b
974	add	w8,w8,w12
975	eor	v28.16b,v17.16b,v18.16b
976	eor	w17,w17,w5
977	eor	v29.16b,v21.16b,v22.16b
978	eor	w19,w19,w6
979	ushr	v1.4s,v24.4s,#20
980	eor	w20,w20,w7
981	ushr	v5.4s,v25.4s,#20
982	eor	w21,w21,w8
983	ushr	v9.4s,v26.4s,#20
984	ror	w17,w17,#24
985	ushr	v13.4s,v27.4s,#20
986	ror	w19,w19,#24
987	ushr	v17.4s,v28.4s,#20
988	ror	w20,w20,#24
989	ushr	v21.4s,v29.4s,#20
990	ror	w21,w21,#24
991	sli	v1.4s,v24.4s,#12
992	add	w13,w13,w17
993	sli	v5.4s,v25.4s,#12
994	add	w14,w14,w19
995	sli	v9.4s,v26.4s,#12
996	add	w15,w15,w20
997	sli	v13.4s,v27.4s,#12
998	add	w16,w16,w21
999	sli	v17.4s,v28.4s,#12
1000	eor	w9,w9,w13
1001	sli	v21.4s,v29.4s,#12
1002	eor	w10,w10,w14
1003	add	v0.4s,v0.4s,v1.4s
1004	eor	w11,w11,w15
1005	add	v4.4s,v4.4s,v5.4s
1006	eor	w12,w12,w16
1007	add	v8.4s,v8.4s,v9.4s
1008	ror	w9,w9,#25
1009	add	v12.4s,v12.4s,v13.4s
1010	ror	w10,w10,#25
1011	add	v16.4s,v16.4s,v17.4s
1012	ror	w11,w11,#25
1013	add	v20.4s,v20.4s,v21.4s
1014	ror	w12,w12,#25
1015	eor	v24.16b,v3.16b,v0.16b
1016	add	w5,w5,w10
1017	eor	v25.16b,v7.16b,v4.16b
1018	add	w6,w6,w11
1019	eor	v26.16b,v11.16b,v8.16b
1020	add	w7,w7,w12
1021	eor	v27.16b,v15.16b,v12.16b
1022	add	w8,w8,w9
1023	eor	v28.16b,v19.16b,v16.16b
1024	eor	w21,w21,w5
1025	eor	v29.16b,v23.16b,v20.16b
1026	eor	w17,w17,w6
1027	ushr	v3.4s,v24.4s,#24
1028	eor	w19,w19,w7
1029	ushr	v7.4s,v25.4s,#24
1030	eor	w20,w20,w8
1031	ushr	v11.4s,v26.4s,#24
1032	ror	w21,w21,#16
1033	ushr	v15.4s,v27.4s,#24
1034	ror	w17,w17,#16
1035	ushr	v19.4s,v28.4s,#24
1036	ror	w19,w19,#16
1037	ushr	v23.4s,v29.4s,#24
1038	ror	w20,w20,#16
1039	sli	v3.4s,v24.4s,#8
1040	add	w15,w15,w21
1041	sli	v7.4s,v25.4s,#8
1042	add	w16,w16,w17
1043	sli	v11.4s,v26.4s,#8
1044	add	w13,w13,w19
1045	sli	v15.4s,v27.4s,#8
1046	add	w14,w14,w20
1047	sli	v19.4s,v28.4s,#8
1048	eor	w10,w10,w15
1049	sli	v23.4s,v29.4s,#8
1050	eor	w11,w11,w16
1051	add	v2.4s,v2.4s,v3.4s
1052	eor	w12,w12,w13
1053	add	v6.4s,v6.4s,v7.4s
1054	eor	w9,w9,w14
1055	add	v10.4s,v10.4s,v11.4s
1056	ror	w10,w10,#20
1057	add	v14.4s,v14.4s,v15.4s
1058	ror	w11,w11,#20
1059	add	v18.4s,v18.4s,v19.4s
1060	ror	w12,w12,#20
1061	add	v22.4s,v22.4s,v23.4s
1062	ror	w9,w9,#20
1063	eor	v24.16b,v1.16b,v2.16b
1064	add	w5,w5,w10
1065	eor	v25.16b,v5.16b,v6.16b
1066	add	w6,w6,w11
1067	eor	v26.16b,v9.16b,v10.16b
1068	add	w7,w7,w12
1069	eor	v27.16b,v13.16b,v14.16b
1070	add	w8,w8,w9
1071	eor	v28.16b,v17.16b,v18.16b
1072	eor	w21,w21,w5
1073	eor	v29.16b,v21.16b,v22.16b
1074	eor	w17,w17,w6
1075	ushr	v1.4s,v24.4s,#25
1076	eor	w19,w19,w7
1077	ushr	v5.4s,v25.4s,#25
1078	eor	w20,w20,w8
1079	ushr	v9.4s,v26.4s,#25
1080	ror	w21,w21,#24
1081	ushr	v13.4s,v27.4s,#25
1082	ror	w17,w17,#24
1083	ushr	v17.4s,v28.4s,#25
1084	ror	w19,w19,#24
1085	ushr	v21.4s,v29.4s,#25
1086	ror	w20,w20,#24
1087	sli	v1.4s,v24.4s,#7
1088	add	w15,w15,w21
1089	sli	v5.4s,v25.4s,#7
1090	add	w16,w16,w17
1091	sli	v9.4s,v26.4s,#7
1092	add	w13,w13,w19
1093	sli	v13.4s,v27.4s,#7
1094	add	w14,w14,w20
1095	sli	v17.4s,v28.4s,#7
1096	eor	w10,w10,w15
1097	sli	v21.4s,v29.4s,#7
1098	eor	w11,w11,w16
1099	ext	v2.16b,v2.16b,v2.16b,#8
1100	eor	w12,w12,w13
1101	ext	v6.16b,v6.16b,v6.16b,#8
1102	eor	w9,w9,w14
1103	ext	v10.16b,v10.16b,v10.16b,#8
1104	ror	w10,w10,#25
1105	ext	v14.16b,v14.16b,v14.16b,#8
1106	ror	w11,w11,#25
1107	ext	v18.16b,v18.16b,v18.16b,#8
1108	ror	w12,w12,#25
1109	ext	v22.16b,v22.16b,v22.16b,#8
1110	ror	w9,w9,#25
1111	ext	v3.16b,v3.16b,v3.16b,#12
1112	ext	v7.16b,v7.16b,v7.16b,#12
1113	ext	v11.16b,v11.16b,v11.16b,#12
1114	ext	v15.16b,v15.16b,v15.16b,#12
1115	ext	v19.16b,v19.16b,v19.16b,#12
1116	ext	v23.16b,v23.16b,v23.16b,#12
1117	ext	v1.16b,v1.16b,v1.16b,#4
1118	ext	v5.16b,v5.16b,v5.16b,#4
1119	ext	v9.16b,v9.16b,v9.16b,#4
1120	ext	v13.16b,v13.16b,v13.16b,#4
1121	ext	v17.16b,v17.16b,v17.16b,#4
1122	ext	v21.16b,v21.16b,v21.16b,#4
1123	add	v0.4s,v0.4s,v1.4s
1124	add	w5,w5,w9
1125	add	v4.4s,v4.4s,v5.4s
1126	add	w6,w6,w10
1127	add	v8.4s,v8.4s,v9.4s
1128	add	w7,w7,w11
1129	add	v12.4s,v12.4s,v13.4s
1130	add	w8,w8,w12
1131	add	v16.4s,v16.4s,v17.4s
1132	eor	w17,w17,w5
1133	add	v20.4s,v20.4s,v21.4s
1134	eor	w19,w19,w6
1135	eor	v3.16b,v3.16b,v0.16b
1136	eor	w20,w20,w7
1137	eor	v7.16b,v7.16b,v4.16b
1138	eor	w21,w21,w8
1139	eor	v11.16b,v11.16b,v8.16b
1140	ror	w17,w17,#16
1141	eor	v15.16b,v15.16b,v12.16b
1142	ror	w19,w19,#16
1143	eor	v19.16b,v19.16b,v16.16b
1144	ror	w20,w20,#16
1145	eor	v23.16b,v23.16b,v20.16b
1146	ror	w21,w21,#16
1147	rev32	v3.8h,v3.8h
1148	add	w13,w13,w17
1149	rev32	v7.8h,v7.8h
1150	add	w14,w14,w19
1151	rev32	v11.8h,v11.8h
1152	add	w15,w15,w20
1153	rev32	v15.8h,v15.8h
1154	add	w16,w16,w21
1155	rev32	v19.8h,v19.8h
1156	eor	w9,w9,w13
1157	rev32	v23.8h,v23.8h
1158	eor	w10,w10,w14
1159	add	v2.4s,v2.4s,v3.4s
1160	eor	w11,w11,w15
1161	add	v6.4s,v6.4s,v7.4s
1162	eor	w12,w12,w16
1163	add	v10.4s,v10.4s,v11.4s
1164	ror	w9,w9,#20
1165	add	v14.4s,v14.4s,v15.4s
1166	ror	w10,w10,#20
1167	add	v18.4s,v18.4s,v19.4s
1168	ror	w11,w11,#20
1169	add	v22.4s,v22.4s,v23.4s
1170	ror	w12,w12,#20
1171	eor	v24.16b,v1.16b,v2.16b
1172	add	w5,w5,w9
1173	eor	v25.16b,v5.16b,v6.16b
1174	add	w6,w6,w10
1175	eor	v26.16b,v9.16b,v10.16b
1176	add	w7,w7,w11
1177	eor	v27.16b,v13.16b,v14.16b
1178	add	w8,w8,w12
1179	eor	v28.16b,v17.16b,v18.16b
1180	eor	w17,w17,w5
1181	eor	v29.16b,v21.16b,v22.16b
1182	eor	w19,w19,w6
1183	ushr	v1.4s,v24.4s,#20
1184	eor	w20,w20,w7
1185	ushr	v5.4s,v25.4s,#20
1186	eor	w21,w21,w8
1187	ushr	v9.4s,v26.4s,#20
1188	ror	w17,w17,#24
1189	ushr	v13.4s,v27.4s,#20
1190	ror	w19,w19,#24
1191	ushr	v17.4s,v28.4s,#20
1192	ror	w20,w20,#24
1193	ushr	v21.4s,v29.4s,#20
1194	ror	w21,w21,#24
1195	sli	v1.4s,v24.4s,#12
1196	add	w13,w13,w17
1197	sli	v5.4s,v25.4s,#12
1198	add	w14,w14,w19
1199	sli	v9.4s,v26.4s,#12
1200	add	w15,w15,w20
1201	sli	v13.4s,v27.4s,#12
1202	add	w16,w16,w21
1203	sli	v17.4s,v28.4s,#12
1204	eor	w9,w9,w13
1205	sli	v21.4s,v29.4s,#12
1206	eor	w10,w10,w14
1207	add	v0.4s,v0.4s,v1.4s
1208	eor	w11,w11,w15
1209	add	v4.4s,v4.4s,v5.4s
1210	eor	w12,w12,w16
1211	add	v8.4s,v8.4s,v9.4s
1212	ror	w9,w9,#25
1213	add	v12.4s,v12.4s,v13.4s
1214	ror	w10,w10,#25
1215	add	v16.4s,v16.4s,v17.4s
1216	ror	w11,w11,#25
1217	add	v20.4s,v20.4s,v21.4s
1218	ror	w12,w12,#25
1219	eor	v24.16b,v3.16b,v0.16b
1220	add	w5,w5,w10
1221	eor	v25.16b,v7.16b,v4.16b
1222	add	w6,w6,w11
1223	eor	v26.16b,v11.16b,v8.16b
1224	add	w7,w7,w12
1225	eor	v27.16b,v15.16b,v12.16b
1226	add	w8,w8,w9
1227	eor	v28.16b,v19.16b,v16.16b
1228	eor	w21,w21,w5
1229	eor	v29.16b,v23.16b,v20.16b
1230	eor	w17,w17,w6
1231	ushr	v3.4s,v24.4s,#24
1232	eor	w19,w19,w7
1233	ushr	v7.4s,v25.4s,#24
1234	eor	w20,w20,w8
1235	ushr	v11.4s,v26.4s,#24
1236	ror	w21,w21,#16
1237	ushr	v15.4s,v27.4s,#24
1238	ror	w17,w17,#16
1239	ushr	v19.4s,v28.4s,#24
1240	ror	w19,w19,#16
1241	ushr	v23.4s,v29.4s,#24
1242	ror	w20,w20,#16
1243	sli	v3.4s,v24.4s,#8
1244	add	w15,w15,w21
1245	sli	v7.4s,v25.4s,#8
1246	add	w16,w16,w17
1247	sli	v11.4s,v26.4s,#8
1248	add	w13,w13,w19
1249	sli	v15.4s,v27.4s,#8
1250	add	w14,w14,w20
1251	sli	v19.4s,v28.4s,#8
1252	eor	w10,w10,w15
1253	sli	v23.4s,v29.4s,#8
1254	eor	w11,w11,w16
1255	add	v2.4s,v2.4s,v3.4s
1256	eor	w12,w12,w13
1257	add	v6.4s,v6.4s,v7.4s
1258	eor	w9,w9,w14
1259	add	v10.4s,v10.4s,v11.4s
1260	ror	w10,w10,#20
1261	add	v14.4s,v14.4s,v15.4s
1262	ror	w11,w11,#20
1263	add	v18.4s,v18.4s,v19.4s
1264	ror	w12,w12,#20
1265	add	v22.4s,v22.4s,v23.4s
1266	ror	w9,w9,#20
1267	eor	v24.16b,v1.16b,v2.16b
1268	add	w5,w5,w10
1269	eor	v25.16b,v5.16b,v6.16b
1270	add	w6,w6,w11
1271	eor	v26.16b,v9.16b,v10.16b
1272	add	w7,w7,w12
1273	eor	v27.16b,v13.16b,v14.16b
1274	add	w8,w8,w9
1275	eor	v28.16b,v17.16b,v18.16b
1276	eor	w21,w21,w5
1277	eor	v29.16b,v21.16b,v22.16b
1278	eor	w17,w17,w6
1279	ushr	v1.4s,v24.4s,#25
1280	eor	w19,w19,w7
1281	ushr	v5.4s,v25.4s,#25
1282	eor	w20,w20,w8
1283	ushr	v9.4s,v26.4s,#25
1284	ror	w21,w21,#24
1285	ushr	v13.4s,v27.4s,#25
1286	ror	w17,w17,#24
1287	ushr	v17.4s,v28.4s,#25
1288	ror	w19,w19,#24
1289	ushr	v21.4s,v29.4s,#25
1290	ror	w20,w20,#24
1291	sli	v1.4s,v24.4s,#7
1292	add	w15,w15,w21
1293	sli	v5.4s,v25.4s,#7
1294	add	w16,w16,w17
1295	sli	v9.4s,v26.4s,#7
1296	add	w13,w13,w19
1297	sli	v13.4s,v27.4s,#7
1298	add	w14,w14,w20
1299	sli	v17.4s,v28.4s,#7
1300	eor	w10,w10,w15
1301	sli	v21.4s,v29.4s,#7
1302	eor	w11,w11,w16
1303	ext	v2.16b,v2.16b,v2.16b,#8
1304	eor	w12,w12,w13
1305	ext	v6.16b,v6.16b,v6.16b,#8
1306	eor	w9,w9,w14
1307	ext	v10.16b,v10.16b,v10.16b,#8
1308	ror	w10,w10,#25
1309	ext	v14.16b,v14.16b,v14.16b,#8
1310	ror	w11,w11,#25
1311	ext	v18.16b,v18.16b,v18.16b,#8
1312	ror	w12,w12,#25
1313	ext	v22.16b,v22.16b,v22.16b,#8
1314	ror	w9,w9,#25
1315	ext	v3.16b,v3.16b,v3.16b,#4
1316	ext	v7.16b,v7.16b,v7.16b,#4
1317	ext	v11.16b,v11.16b,v11.16b,#4
1318	ext	v15.16b,v15.16b,v15.16b,#4
1319	ext	v19.16b,v19.16b,v19.16b,#4
1320	ext	v23.16b,v23.16b,v23.16b,#4
1321	ext	v1.16b,v1.16b,v1.16b,#12
1322	ext	v5.16b,v5.16b,v5.16b,#12
1323	ext	v9.16b,v9.16b,v9.16b,#12
1324	ext	v13.16b,v13.16b,v13.16b,#12
1325	ext	v17.16b,v17.16b,v17.16b,#12
1326	ext	v21.16b,v21.16b,v21.16b,#12
1327	cbnz	x4,Loop_upper_neon
1328
// ---- Scalar lane: emit the 64-byte block held in w5-w17, w19-w21 ----
// x22-x28 and x30 each carry two 32-bit words of the input state (low
// word in bits 0-31, high word in bits 32-63; the mov/lsr "unpack"
// sequence below splits them the same way). Add them back word by word.
// The 64-bit adds on the odd registers may carry into bit 32; that carry
// is discarded by the "lsl#32" in the pack step that follows.
1329	add	w5,w5,w22		// accumulate key block
1330	add	x6,x6,x22,lsr#32
1331	add	w7,w7,w23
1332	add	x8,x8,x23,lsr#32
1333	add	w9,w9,w24
1334	add	x10,x10,x24,lsr#32
1335	add	w11,w11,w25
1336	add	x12,x12,x25,lsr#32
1337	add	w13,w13,w26
1338	add	x14,x14,x26,lsr#32
1339	add	w15,w15,w27
1340	add	x16,x16,x27,lsr#32
1341	add	w17,w17,w28
1342	add	x19,x19,x28,lsr#32
1343	add	w20,w20,w30
1344	add	x21,x21,x30,lsr#32
1345
// Merge each word pair into one 64-bit register (even word low, odd word
// high). As soon as an odd register has been merged it is reused to hold
// 16 bytes of input loaded from [x1].
1346	add	x5,x5,x6,lsl#32	// pack
1347	add	x7,x7,x8,lsl#32
1348	ldp	x6,x8,[x1,#0]		// load input
1349	add	x9,x9,x10,lsl#32
1350	add	x11,x11,x12,lsl#32
1351	ldp	x10,x12,[x1,#16]
1352	add	x13,x13,x14,lsl#32
1353	add	x15,x15,x16,lsl#32
1354	ldp	x14,x16,[x1,#32]
1355	add	x17,x17,x19,lsl#32
1356	add	x20,x20,x21,lsl#32
1357	ldp	x19,x21,[x1,#48]
// Input pointer moves past the 64 bytes just loaded.
1358	add	x1,x1,#64
// Big-endian builds only: byte-reverse each packed 64-bit keystream
// value before it is XORed with the loaded input.
1359#ifdef	__AARCH64EB__
1360	rev	x5,x5
1361	rev	x7,x7
1362	rev	x9,x9
1363	rev	x11,x11
1364	rev	x13,x13
1365	rev	x15,x15
1366	rev	x17,x17
1367	rev	x20,x20
1368#endif
// output = keystream XOR input, 8 bytes per register.
1369	eor	x5,x5,x6
1370	eor	x7,x7,x8
1371	eor	x9,x9,x10
1372	eor	x11,x11,x12
1373	eor	x13,x13,x14
1374	eor	x15,x15,x16
1375	eor	x17,x17,x19
1376	eor	x20,x20,x21
1377
// Write the 64 output bytes to [x0] and advance x0. Interleaved with the
// stores, the working registers are re-seeded from x22-x28/x30 for the
// next scalar block; x28 is bumped first so w17 picks up the new value.
1378	stp	x5,x7,[x0,#0]		// store output
1379	add	x28,x28,#1			// increment counter
1380	mov	w5,w22			// unpack key block
1381	lsr	x6,x22,#32
1382	stp	x9,x11,[x0,#16]
1383	mov	w7,w23
1384	lsr	x8,x23,#32
1385	stp	x13,x15,[x0,#32]
1386	mov	w9,w24
1387	lsr	x10,x24,#32
1388	stp	x17,x20,[x0,#48]
1389	add	x0,x0,#64
1390	mov	w11,w25
1391	lsr	x12,x25,#32
1392	mov	w13,w26
1393	lsr	x14,x26,#32
1394	mov	w15,w27
1395	lsr	x16,x27,#32
1396	mov	w17,w28
1397	lsr	x19,x28,#32
1398	mov	w20,w30
1399	lsr	x21,x30,#32
1400
// Iteration count for Loop_lower_neon, which decrements x4 at its top
// and loops on cbnz x4 at its bottom.
1401	mov	x4,#5
// Loop_lower_neon: round loop with scalar and NEON instruction streams
// interleaved line by line.
//
//   Scalar state : w5-w8 (row a), w9-w12 (row b), w13-w16 (row c),
//                  w17,w19,w20,w21 (row d). x18 is not used.
//   NEON state   : six 4-row states, rows (a,b,c,d) held in
//                  v0-v3, v4-v7, v8-v11, v12-v15, v16-v19, v20-v23;
//                  v24-v29 are temporaries for the two-instruction rotates.
//   x4           : remaining iterations (decremented here, tested by the
//                  cbnz at the bottom).
//
// Rotations: scalar "ror #16/#20/#24/#25" are left-rotates by 16/12/8/7.
// NEON uses rev32 on .8h for the 16-bit rotate and ushr+sli pairs
// (#20/#12, #24/#8, #25/#7) for left-rotates by 12, 8 and 7.
//
// Per iteration the scalar stream runs two full double rounds (column then
// diagonal, twice) while the NEON stream runs one double round on each of
// its six states. No instruction in the loop writes the condition flags.
1402Loop_lower_neon:
1403	sub	x4,x4,#1
// ---- First half ----
// NEON  : a+=b; d^=a; d<<<=16; c+=d; b^=c; b<<<=12; a+=b; d^=a; d<<<=8;
//         c+=d; b^=c; b<<<=7 on whole rows (four columns at once).
// Scalar: the same quarter-round on columns (w5,w9,w13,w17),
//         (w6,w10,w14,w19), (w7,w11,w15,w20), (w8,w12,w16,w21).
1404	add	v0.4s,v0.4s,v1.4s
1405	add	w5,w5,w9
1406	add	v4.4s,v4.4s,v5.4s
1407	add	w6,w6,w10
1408	add	v8.4s,v8.4s,v9.4s
1409	add	w7,w7,w11
1410	add	v12.4s,v12.4s,v13.4s
1411	add	w8,w8,w12
1412	add	v16.4s,v16.4s,v17.4s
1413	eor	w17,w17,w5
1414	add	v20.4s,v20.4s,v21.4s
1415	eor	w19,w19,w6
1416	eor	v3.16b,v3.16b,v0.16b
1417	eor	w20,w20,w7
1418	eor	v7.16b,v7.16b,v4.16b
1419	eor	w21,w21,w8
1420	eor	v11.16b,v11.16b,v8.16b
1421	ror	w17,w17,#16
1422	eor	v15.16b,v15.16b,v12.16b
1423	ror	w19,w19,#16
1424	eor	v19.16b,v19.16b,v16.16b
1425	ror	w20,w20,#16
1426	eor	v23.16b,v23.16b,v20.16b
1427	ror	w21,w21,#16
1428	rev32	v3.8h,v3.8h
1429	add	w13,w13,w17
1430	rev32	v7.8h,v7.8h
1431	add	w14,w14,w19
1432	rev32	v11.8h,v11.8h
1433	add	w15,w15,w20
1434	rev32	v15.8h,v15.8h
1435	add	w16,w16,w21
1436	rev32	v19.8h,v19.8h
1437	eor	w9,w9,w13
1438	rev32	v23.8h,v23.8h
1439	eor	w10,w10,w14
1440	add	v2.4s,v2.4s,v3.4s
1441	eor	w11,w11,w15
1442	add	v6.4s,v6.4s,v7.4s
1443	eor	w12,w12,w16
1444	add	v10.4s,v10.4s,v11.4s
1445	ror	w9,w9,#20
1446	add	v14.4s,v14.4s,v15.4s
1447	ror	w10,w10,#20
1448	add	v18.4s,v18.4s,v19.4s
1449	ror	w11,w11,#20
1450	add	v22.4s,v22.4s,v23.4s
1451	ror	w12,w12,#20
1452	eor	v24.16b,v1.16b,v2.16b
1453	add	w5,w5,w9
1454	eor	v25.16b,v5.16b,v6.16b
1455	add	w6,w6,w10
1456	eor	v26.16b,v9.16b,v10.16b
1457	add	w7,w7,w11
1458	eor	v27.16b,v13.16b,v14.16b
1459	add	w8,w8,w12
1460	eor	v28.16b,v17.16b,v18.16b
1461	eor	w17,w17,w5
1462	eor	v29.16b,v21.16b,v22.16b
1463	eor	w19,w19,w6
1464	ushr	v1.4s,v24.4s,#20
1465	eor	w20,w20,w7
1466	ushr	v5.4s,v25.4s,#20
1467	eor	w21,w21,w8
1468	ushr	v9.4s,v26.4s,#20
1469	ror	w17,w17,#24
1470	ushr	v13.4s,v27.4s,#20
1471	ror	w19,w19,#24
1472	ushr	v17.4s,v28.4s,#20
1473	ror	w20,w20,#24
1474	ushr	v21.4s,v29.4s,#20
1475	ror	w21,w21,#24
1476	sli	v1.4s,v24.4s,#12
1477	add	w13,w13,w17
1478	sli	v5.4s,v25.4s,#12
1479	add	w14,w14,w19
1480	sli	v9.4s,v26.4s,#12
1481	add	w15,w15,w20
1482	sli	v13.4s,v27.4s,#12
1483	add	w16,w16,w21
1484	sli	v17.4s,v28.4s,#12
1485	eor	w9,w9,w13
1486	sli	v21.4s,v29.4s,#12
1487	eor	w10,w10,w14
1488	add	v0.4s,v0.4s,v1.4s
1489	eor	w11,w11,w15
1490	add	v4.4s,v4.4s,v5.4s
1491	eor	w12,w12,w16
1492	add	v8.4s,v8.4s,v9.4s
1493	ror	w9,w9,#25
1494	add	v12.4s,v12.4s,v13.4s
1495	ror	w10,w10,#25
1496	add	v16.4s,v16.4s,v17.4s
1497	ror	w11,w11,#25
1498	add	v20.4s,v20.4s,v21.4s
1499	ror	w12,w12,#25
// Scalar stream now switches to the diagonals: (w5,w10,w15,w21),
// (w6,w11,w16,w17), (w7,w12,w13,w19), (w8,w9,w14,w20). The NEON stream
// is still finishing the quarter-round started above.
1500	eor	v24.16b,v3.16b,v0.16b
1501	add	w5,w5,w10
1502	eor	v25.16b,v7.16b,v4.16b
1503	add	w6,w6,w11
1504	eor	v26.16b,v11.16b,v8.16b
1505	add	w7,w7,w12
1506	eor	v27.16b,v15.16b,v12.16b
1507	add	w8,w8,w9
1508	eor	v28.16b,v19.16b,v16.16b
1509	eor	w21,w21,w5
1510	eor	v29.16b,v23.16b,v20.16b
1511	eor	w17,w17,w6
1512	ushr	v3.4s,v24.4s,#24
1513	eor	w19,w19,w7
1514	ushr	v7.4s,v25.4s,#24
1515	eor	w20,w20,w8
1516	ushr	v11.4s,v26.4s,#24
1517	ror	w21,w21,#16
1518	ushr	v15.4s,v27.4s,#24
1519	ror	w17,w17,#16
1520	ushr	v19.4s,v28.4s,#24
1521	ror	w19,w19,#16
1522	ushr	v23.4s,v29.4s,#24
1523	ror	w20,w20,#16
1524	sli	v3.4s,v24.4s,#8
1525	add	w15,w15,w21
1526	sli	v7.4s,v25.4s,#8
1527	add	w16,w16,w17
1528	sli	v11.4s,v26.4s,#8
1529	add	w13,w13,w19
1530	sli	v15.4s,v27.4s,#8
1531	add	w14,w14,w20
1532	sli	v19.4s,v28.4s,#8
1533	eor	w10,w10,w15
1534	sli	v23.4s,v29.4s,#8
1535	eor	w11,w11,w16
1536	add	v2.4s,v2.4s,v3.4s
1537	eor	w12,w12,w13
1538	add	v6.4s,v6.4s,v7.4s
1539	eor	w9,w9,w14
1540	add	v10.4s,v10.4s,v11.4s
1541	ror	w10,w10,#20
1542	add	v14.4s,v14.4s,v15.4s
1543	ror	w11,w11,#20
1544	add	v18.4s,v18.4s,v19.4s
1545	ror	w12,w12,#20
1546	add	v22.4s,v22.4s,v23.4s
1547	ror	w9,w9,#20
1548	eor	v24.16b,v1.16b,v2.16b
1549	add	w5,w5,w10
1550	eor	v25.16b,v5.16b,v6.16b
1551	add	w6,w6,w11
1552	eor	v26.16b,v9.16b,v10.16b
1553	add	w7,w7,w12
1554	eor	v27.16b,v13.16b,v14.16b
1555	add	w8,w8,w9
1556	eor	v28.16b,v17.16b,v18.16b
1557	eor	w21,w21,w5
1558	eor	v29.16b,v21.16b,v22.16b
1559	eor	w17,w17,w6
1560	ushr	v1.4s,v24.4s,#25
1561	eor	w19,w19,w7
1562	ushr	v5.4s,v25.4s,#25
1563	eor	w20,w20,w8
1564	ushr	v9.4s,v26.4s,#25
1565	ror	w21,w21,#24
1566	ushr	v13.4s,v27.4s,#25
1567	ror	w17,w17,#24
1568	ushr	v17.4s,v28.4s,#25
1569	ror	w19,w19,#24
1570	ushr	v21.4s,v29.4s,#25
1571	ror	w20,w20,#24
1572	sli	v1.4s,v24.4s,#7
1573	add	w15,w15,w21
1574	sli	v5.4s,v25.4s,#7
1575	add	w16,w16,w17
1576	sli	v9.4s,v26.4s,#7
1577	add	w13,w13,w19
1578	sli	v13.4s,v27.4s,#7
1579	add	w14,w14,w20
1580	sli	v17.4s,v28.4s,#7
1581	eor	w10,w10,w15
1582	sli	v21.4s,v29.4s,#7
1583	eor	w11,w11,w16
// NEON: rotate the lanes of rows c, d and b by 8, 12 and 4 bytes so that
// the next row-wise quarter-round operates on the diagonals.
1584	ext	v2.16b,v2.16b,v2.16b,#8
1585	eor	w12,w12,w13
1586	ext	v6.16b,v6.16b,v6.16b,#8
1587	eor	w9,w9,w14
1588	ext	v10.16b,v10.16b,v10.16b,#8
1589	ror	w10,w10,#25
1590	ext	v14.16b,v14.16b,v14.16b,#8
1591	ror	w11,w11,#25
1592	ext	v18.16b,v18.16b,v18.16b,#8
1593	ror	w12,w12,#25
1594	ext	v22.16b,v22.16b,v22.16b,#8
1595	ror	w9,w9,#25
1596	ext	v3.16b,v3.16b,v3.16b,#12
1597	ext	v7.16b,v7.16b,v7.16b,#12
1598	ext	v11.16b,v11.16b,v11.16b,#12
1599	ext	v15.16b,v15.16b,v15.16b,#12
1600	ext	v19.16b,v19.16b,v19.16b,#12
1601	ext	v23.16b,v23.16b,v23.16b,#12
1602	ext	v1.16b,v1.16b,v1.16b,#4
1603	ext	v5.16b,v5.16b,v5.16b,#4
1604	ext	v9.16b,v9.16b,v9.16b,#4
1605	ext	v13.16b,v13.16b,v13.16b,#4
1606	ext	v17.16b,v17.16b,v17.16b,#4
1607	ext	v21.16b,v21.16b,v21.16b,#4
// ---- Second half ----
// NEON  : the same row-wise quarter-round, now acting on the rotated
//         (diagonal) lanes.
// Scalar: a second double round, starting again with the columns.
1608	add	v0.4s,v0.4s,v1.4s
1609	add	w5,w5,w9
1610	add	v4.4s,v4.4s,v5.4s
1611	add	w6,w6,w10
1612	add	v8.4s,v8.4s,v9.4s
1613	add	w7,w7,w11
1614	add	v12.4s,v12.4s,v13.4s
1615	add	w8,w8,w12
1616	add	v16.4s,v16.4s,v17.4s
1617	eor	w17,w17,w5
1618	add	v20.4s,v20.4s,v21.4s
1619	eor	w19,w19,w6
1620	eor	v3.16b,v3.16b,v0.16b
1621	eor	w20,w20,w7
1622	eor	v7.16b,v7.16b,v4.16b
1623	eor	w21,w21,w8
1624	eor	v11.16b,v11.16b,v8.16b
1625	ror	w17,w17,#16
1626	eor	v15.16b,v15.16b,v12.16b
1627	ror	w19,w19,#16
1628	eor	v19.16b,v19.16b,v16.16b
1629	ror	w20,w20,#16
1630	eor	v23.16b,v23.16b,v20.16b
1631	ror	w21,w21,#16
1632	rev32	v3.8h,v3.8h
1633	add	w13,w13,w17
1634	rev32	v7.8h,v7.8h
1635	add	w14,w14,w19
1636	rev32	v11.8h,v11.8h
1637	add	w15,w15,w20
1638	rev32	v15.8h,v15.8h
1639	add	w16,w16,w21
1640	rev32	v19.8h,v19.8h
1641	eor	w9,w9,w13
1642	rev32	v23.8h,v23.8h
1643	eor	w10,w10,w14
1644	add	v2.4s,v2.4s,v3.4s
1645	eor	w11,w11,w15
1646	add	v6.4s,v6.4s,v7.4s
1647	eor	w12,w12,w16
1648	add	v10.4s,v10.4s,v11.4s
1649	ror	w9,w9,#20
1650	add	v14.4s,v14.4s,v15.4s
1651	ror	w10,w10,#20
1652	add	v18.4s,v18.4s,v19.4s
1653	ror	w11,w11,#20
1654	add	v22.4s,v22.4s,v23.4s
1655	ror	w12,w12,#20
1656	eor	v24.16b,v1.16b,v2.16b
1657	add	w5,w5,w9
1658	eor	v25.16b,v5.16b,v6.16b
1659	add	w6,w6,w10
1660	eor	v26.16b,v9.16b,v10.16b
1661	add	w7,w7,w11
1662	eor	v27.16b,v13.16b,v14.16b
1663	add	w8,w8,w12
1664	eor	v28.16b,v17.16b,v18.16b
1665	eor	w17,w17,w5
1666	eor	v29.16b,v21.16b,v22.16b
1667	eor	w19,w19,w6
1668	ushr	v1.4s,v24.4s,#20
1669	eor	w20,w20,w7
1670	ushr	v5.4s,v25.4s,#20
1671	eor	w21,w21,w8
1672	ushr	v9.4s,v26.4s,#20
1673	ror	w17,w17,#24
1674	ushr	v13.4s,v27.4s,#20
1675	ror	w19,w19,#24
1676	ushr	v17.4s,v28.4s,#20
1677	ror	w20,w20,#24
1678	ushr	v21.4s,v29.4s,#20
1679	ror	w21,w21,#24
1680	sli	v1.4s,v24.4s,#12
1681	add	w13,w13,w17
1682	sli	v5.4s,v25.4s,#12
1683	add	w14,w14,w19
1684	sli	v9.4s,v26.4s,#12
1685	add	w15,w15,w20
1686	sli	v13.4s,v27.4s,#12
1687	add	w16,w16,w21
1688	sli	v17.4s,v28.4s,#12
1689	eor	w9,w9,w13
1690	sli	v21.4s,v29.4s,#12
1691	eor	w10,w10,w14
1692	add	v0.4s,v0.4s,v1.4s
1693	eor	w11,w11,w15
1694	add	v4.4s,v4.4s,v5.4s
1695	eor	w12,w12,w16
1696	add	v8.4s,v8.4s,v9.4s
1697	ror	w9,w9,#25
1698	add	v12.4s,v12.4s,v13.4s
1699	ror	w10,w10,#25
1700	add	v16.4s,v16.4s,v17.4s
1701	ror	w11,w11,#25
1702	add	v20.4s,v20.4s,v21.4s
1703	ror	w12,w12,#25
// Scalar stream: diagonal quarter-rounds of its second double round.
1704	eor	v24.16b,v3.16b,v0.16b
1705	add	w5,w5,w10
1706	eor	v25.16b,v7.16b,v4.16b
1707	add	w6,w6,w11
1708	eor	v26.16b,v11.16b,v8.16b
1709	add	w7,w7,w12
1710	eor	v27.16b,v15.16b,v12.16b
1711	add	w8,w8,w9
1712	eor	v28.16b,v19.16b,v16.16b
1713	eor	w21,w21,w5
1714	eor	v29.16b,v23.16b,v20.16b
1715	eor	w17,w17,w6
1716	ushr	v3.4s,v24.4s,#24
1717	eor	w19,w19,w7
1718	ushr	v7.4s,v25.4s,#24
1719	eor	w20,w20,w8
1720	ushr	v11.4s,v26.4s,#24
1721	ror	w21,w21,#16
1722	ushr	v15.4s,v27.4s,#24
1723	ror	w17,w17,#16
1724	ushr	v19.4s,v28.4s,#24
1725	ror	w19,w19,#16
1726	ushr	v23.4s,v29.4s,#24
1727	ror	w20,w20,#16
1728	sli	v3.4s,v24.4s,#8
1729	add	w15,w15,w21
1730	sli	v7.4s,v25.4s,#8
1731	add	w16,w16,w17
1732	sli	v11.4s,v26.4s,#8
1733	add	w13,w13,w19
1734	sli	v15.4s,v27.4s,#8
1735	add	w14,w14,w20
1736	sli	v19.4s,v28.4s,#8
1737	eor	w10,w10,w15
1738	sli	v23.4s,v29.4s,#8
1739	eor	w11,w11,w16
1740	add	v2.4s,v2.4s,v3.4s
1741	eor	w12,w12,w13
1742	add	v6.4s,v6.4s,v7.4s
1743	eor	w9,w9,w14
1744	add	v10.4s,v10.4s,v11.4s
1745	ror	w10,w10,#20
1746	add	v14.4s,v14.4s,v15.4s
1747	ror	w11,w11,#20
1748	add	v18.4s,v18.4s,v19.4s
1749	ror	w12,w12,#20
1750	add	v22.4s,v22.4s,v23.4s
1751	ror	w9,w9,#20
1752	eor	v24.16b,v1.16b,v2.16b
1753	add	w5,w5,w10
1754	eor	v25.16b,v5.16b,v6.16b
1755	add	w6,w6,w11
1756	eor	v26.16b,v9.16b,v10.16b
1757	add	w7,w7,w12
1758	eor	v27.16b,v13.16b,v14.16b
1759	add	w8,w8,w9
1760	eor	v28.16b,v17.16b,v18.16b
1761	eor	w21,w21,w5
1762	eor	v29.16b,v21.16b,v22.16b
1763	eor	w17,w17,w6
1764	ushr	v1.4s,v24.4s,#25
1765	eor	w19,w19,w7
1766	ushr	v5.4s,v25.4s,#25
1767	eor	w20,w20,w8
1768	ushr	v9.4s,v26.4s,#25
1769	ror	w21,w21,#24
1770	ushr	v13.4s,v27.4s,#25
1771	ror	w17,w17,#24
1772	ushr	v17.4s,v28.4s,#25
1773	ror	w19,w19,#24
1774	ushr	v21.4s,v29.4s,#25
1775	ror	w20,w20,#24
1776	sli	v1.4s,v24.4s,#7
1777	add	w15,w15,w21
1778	sli	v5.4s,v25.4s,#7
1779	add	w16,w16,w17
1780	sli	v9.4s,v26.4s,#7
1781	add	w13,w13,w19
1782	sli	v13.4s,v27.4s,#7
1783	add	w14,w14,w20
1784	sli	v17.4s,v28.4s,#7
1785	eor	w10,w10,w15
1786	sli	v21.4s,v29.4s,#7
1787	eor	w11,w11,w16
// NEON: rotate rows c, d and b by 8, 4 and 12 bytes. Combined with the
// 8/12/4-byte rotation in the first half this is a full 16-byte turn,
// so the lanes are back in their original column order.
1788	ext	v2.16b,v2.16b,v2.16b,#8
1789	eor	w12,w12,w13
1790	ext	v6.16b,v6.16b,v6.16b,#8
1791	eor	w9,w9,w14
1792	ext	v10.16b,v10.16b,v10.16b,#8
1793	ror	w10,w10,#25
1794	ext	v14.16b,v14.16b,v14.16b,#8
1795	ror	w11,w11,#25
1796	ext	v18.16b,v18.16b,v18.16b,#8
1797	ror	w12,w12,#25
1798	ext	v22.16b,v22.16b,v22.16b,#8
1799	ror	w9,w9,#25
1800	ext	v3.16b,v3.16b,v3.16b,#4
1801	ext	v7.16b,v7.16b,v7.16b,#4
1802	ext	v11.16b,v11.16b,v11.16b,#4
1803	ext	v15.16b,v15.16b,v15.16b,#4
1804	ext	v19.16b,v19.16b,v19.16b,#4
1805	ext	v23.16b,v23.16b,v23.16b,#4
1806	ext	v1.16b,v1.16b,v1.16b,#12
1807	ext	v5.16b,v5.16b,v5.16b,#12
1808	ext	v9.16b,v9.16b,v9.16b,#12
1809	ext	v13.16b,v13.16b,v13.16b,#12
1810	ext	v17.16b,v17.16b,v17.16b,#12
1811	ext	v21.16b,v21.16b,v21.16b,#12
// Repeat until the iteration counter reaches zero.
1812	cbnz	x4,Loop_lower_neon
1813
// ---- Rounds finished: add the input state back, XOR with the input and
// store 7 x 64 bytes (one scalar block, then six NEON blocks) ----
//
// v24-v29 were used as rotate temporaries inside the round loop, so they
// are reloaded from the stack off-load area at [sp,#0..#95] first.
//   v24,v25,v26 : added to rows a,b,c of all six NEON states.
//   v27..v30    : added to row d of states 0..3 (v3,v7,v11,v15).
//   v27+v31, v28+v31 : added to row d of states 4 and 5 (v19,v23);
//                 v31 is the per-lane "+4" constant per the comments.
// The scalar block is accumulated from x22-x28/x30 exactly as after the
// first round loop. None of the instructions below write the flags.
1814	add	w5,w5,w22		// accumulate key block
1815	ldp	q24,q25,[sp,#0]
1816	add	x6,x6,x22,lsr#32
1817	ldp	q26,q27,[sp,#32]
1818	add	w7,w7,w23
1819	ldp	q28,q29,[sp,#64]
1820	add	x8,x8,x23,lsr#32
1821	add	v0.4s,v0.4s,v24.4s
1822	add	w9,w9,w24
1823	add	v4.4s,v4.4s,v24.4s
1824	add	x10,x10,x24,lsr#32
1825	add	v8.4s,v8.4s,v24.4s
1826	add	w11,w11,w25
1827	add	v12.4s,v12.4s,v24.4s
1828	add	x12,x12,x25,lsr#32
1829	add	v16.4s,v16.4s,v24.4s
1830	add	w13,w13,w26
1831	add	v20.4s,v20.4s,v24.4s
1832	add	x14,x14,x26,lsr#32
1833	add	v2.4s,v2.4s,v26.4s
1834	add	w15,w15,w27
1835	add	v6.4s,v6.4s,v26.4s
1836	add	x16,x16,x27,lsr#32
1837	add	v10.4s,v10.4s,v26.4s
1838	add	w17,w17,w28
1839	add	v14.4s,v14.4s,v26.4s
1840	add	x19,x19,x28,lsr#32
1841	add	v18.4s,v18.4s,v26.4s
1842	add	w20,w20,w30
1843	add	v22.4s,v22.4s,v26.4s
1844	add	x21,x21,x30,lsr#32
1845	add	v19.4s,v19.4s,v31.4s			// +4
1846	add	x5,x5,x6,lsl#32	// pack
1847	add	v23.4s,v23.4s,v31.4s			// +4
1848	add	x7,x7,x8,lsl#32
1849	add	v3.4s,v3.4s,v27.4s
1850	ldp	x6,x8,[x1,#0]		// load input
1851	add	v7.4s,v7.4s,v28.4s
1852	add	x9,x9,x10,lsl#32
1853	add	v11.4s,v11.4s,v29.4s
1854	add	x11,x11,x12,lsl#32
1855	add	v15.4s,v15.4s,v30.4s
1856	ldp	x10,x12,[x1,#16]
1857	add	v19.4s,v19.4s,v27.4s
1858	add	x13,x13,x14,lsl#32
1859	add	v23.4s,v23.4s,v28.4s
1860	add	x15,x15,x16,lsl#32
1861	add	v1.4s,v1.4s,v25.4s
1862	ldp	x14,x16,[x1,#32]
1863	add	v5.4s,v5.4s,v25.4s
1864	add	x17,x17,x19,lsl#32
1865	add	v9.4s,v9.4s,v25.4s
1866	add	x20,x20,x21,lsl#32
1867	add	v13.4s,v13.4s,v25.4s
1868	ldp	x19,x21,[x1,#48]
1869	add	v17.4s,v17.4s,v25.4s
1870	add	x1,x1,#64
1871	add	v21.4s,v21.4s,v25.4s
1872
// Big-endian builds only: byte-reverse the packed scalar keystream.
1873#ifdef	__AARCH64EB__
1874	rev	x5,x5
1875	rev	x7,x7
1876	rev	x9,x9
1877	rev	x11,x11
1878	rev	x13,x13
1879	rev	x15,x15
1880	rev	x17,x17
1881	rev	x20,x20
1882#endif
// From here on v24-v27 (and later v0-v15, as each state is stored) are
// reused as input buffers: every ld1 reads 64 input bytes and
// post-increments x1, every st1 writes 64 output bytes and
// post-increments x0. Each load is issued one block ahead of its XOR.
1883	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1884	eor	x5,x5,x6
1885	eor	x7,x7,x8
1886	eor	x9,x9,x10
1887	eor	x11,x11,x12
1888	eor	x13,x13,x14
1889	eor	v0.16b,v0.16b,v24.16b
1890	eor	x15,x15,x16
1891	eor	v1.16b,v1.16b,v25.16b
1892	eor	x17,x17,x19
1893	eor	v2.16b,v2.16b,v26.16b
1894	eor	x20,x20,x21
1895	eor	v3.16b,v3.16b,v27.16b
1896	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1897
// Scalar block out. x28 += 7 here plus the += 1 after the first scalar
// block of this pass gives 8, matching the 8 x 64 bytes written per pass
// (two scalar stores plus six st1).
1898	stp	x5,x7,[x0,#0]		// store output
1899	add	x28,x28,#7			// increment counter
1900	stp	x9,x11,[x0,#16]
1901	stp	x13,x15,[x0,#32]
1902	stp	x17,x20,[x0,#48]
1903	add	x0,x0,#64
// NEON state 0 out.
1904	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1905
// NEON state 1 out; v0-v3 now buffer the input for state 2.
1906	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1907	eor	v4.16b,v4.16b,v24.16b
1908	eor	v5.16b,v5.16b,v25.16b
1909	eor	v6.16b,v6.16b,v26.16b
1910	eor	v7.16b,v7.16b,v27.16b
1911	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1912
// NEON state 2 out. v24-v27 are no longer needed as input buffers, so
// their saved values are restored from the stack off-load area.
1913	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1914	eor	v8.16b,v8.16b,v0.16b
1915	ldp	q24,q25,[sp,#0]
1916	eor	v9.16b,v9.16b,v1.16b
1917	ldp	q26,q27,[sp,#32]
1918	eor	v10.16b,v10.16b,v2.16b
1919	eor	v11.16b,v11.16b,v3.16b
1920	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1921
// NEON state 3 out.
1922	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1923	eor	v12.16b,v12.16b,v4.16b
1924	eor	v13.16b,v13.16b,v5.16b
1925	eor	v14.16b,v14.16b,v6.16b
1926	eor	v15.16b,v15.16b,v7.16b
1927	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1928
// NEON state 4 out.
1929	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1930	eor	v16.16b,v16.16b,v8.16b
1931	eor	v17.16b,v17.16b,v9.16b
1932	eor	v18.16b,v18.16b,v10.16b
1933	eor	v19.16b,v19.16b,v11.16b
1934	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1935
// NEON state 5 out. v0 is free again and is used to build the per-lane
// increment 8 (= v31 << 1) for the next pass.
1936	shl	v0.4s,v31.4s,#1			// 4 -> 8
1937	eor	v20.16b,v20.16b,v12.16b
1938	eor	v21.16b,v21.16b,v13.16b
1939	eor	v22.16b,v22.16b,v14.16b
1940	eor	v23.16b,v23.16b,v15.16b
1941	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1942
// Advance the four row-d vectors by 8 for the next pass.
1943	add	v27.4s,v27.4s,v0.4s			// += 8
1944	add	v28.4s,v28.4s,v0.4s
1945	add	v29.4s,v29.4s,v0.4s
1946	add	v30.4s,v30.4s,v0.4s
1947
// Loop while the flags say "higher or same". They are not written anywhere
// between the round loops and here, so they come from before the round
// loops - presumably a length update on x2 at Loop_outer_512_neon
// (not shown here; confirm).
1948	b.hs	Loop_outer_512_neon
1949
// ---- Leaving the 512-byte loop ----
// x2 += 512 with flags set; Z (result == 0) is consumed by the b.eq
// below. The ushr/ldp/stp in between do not write the flags.
1950	adds	x2,x2,#512
1951	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1952
// Restore d8-d15 (low halves of v8-v15, callee-saved under AAPCS64) from
// the 64-byte save area at [sp,#128].
1953	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1954	ldp	d10,d11,[sp,#128+16]
1955	ldp	d12,d13,[sp,#128+32]
1956	ldp	d14,d15,[sp,#128+48]
1957
// Overwrite bytes 0..95 of the stack off-load area with the current
// contents of v24 and v31.
1958	stp	q24,q31,[sp,#0]		// wipe off-load area
1959	stp	q24,q31,[sp,#32]
1960	stp	q24,q31,[sp,#64]
1961
// Nothing left to process: go to the epilogue.
1962	b.eq	Ldone_512_neon
1963
// Data remains. Adjust v27-v29 by -1 per lane and release the 128-byte
// off-load area; both happen before the branch, so they apply whichever
// path is taken. If at least 192 bytes remain, continue in
// Loop_outer_neon (defined elsewhere in this file).
1964	cmp	x2,#192
1965	sub	v27.4s,v27.4s,v0.4s			// -= 1
1966	sub	v28.4s,v28.4s,v0.4s
1967	sub	v29.4s,v29.4s,v0.4s
1968	add	sp,sp,#128
1969	b.hs	Loop_outer_neon
1970
// Fewer than 192 bytes remain: zero v25-v30 (presumably so no key or
// counter material stays in vector registers - confirm) and finish in the
// scalar Loop_outer of ChaCha20_ctr32. After the add above, 64 bytes of
// this frame remain below x29, the same amount ChaCha20_ctr32 reserves
// with "sub sp,sp,#64" before entering Loop_outer.
1971	eor	v25.16b,v25.16b,v25.16b
1972	eor	v26.16b,v26.16b,v26.16b
1973	eor	v27.16b,v27.16b,v27.16b
1974	eor	v28.16b,v28.16b,v28.16b
1975	eor	v29.16b,v29.16b,v29.16b
1976	eor	v30.16b,v30.16b,v30.16b
1977	b	Loop_outer
1978
// Ldone_512_neon: epilogue for the 512-byte path, reached when no input
// remains. Reloads x19-x28 from the frame at x29+16..x29+95, releases the
// 128+64 bytes of stack scratch, pops x29/x30 together with the 96-byte
// frame, checks the link register and returns. The matching prologue is
// not shown here; the offsets agree with the stp sequence used by
// ChaCha20_ctr32 at the top of the file.
1979Ldone_512_neon:
1980	ldp	x19,x20,[x29,#16]
1981	add	sp,sp,#128+64
1982	ldp	x21,x22,[x29,#32]
1983	ldp	x23,x24,[x29,#48]
1984	ldp	x25,x26,[x29,#64]
1985	ldp	x27,x28,[x29,#80]
1986	ldp	x29,x30,[sp],#96
1987	AARCH64_VALIDATE_LINK_REGISTER
1988	ret
1989
1990#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
1991