1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <ring-core/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
7#include <ring-core/arm_arch.h>
8
9
10.hidden	OPENSSL_armcap_P
11
// Read-only constants shared by the ChaCha20 routines in this file.
12.section	.rodata
13
14.align	5
// .Lsigma: two 64-bit values that hold the ASCII text "expand 32-byte k"
// packed least-significant byte first. The routines below load them as the
// first four 32-bit words of the cipher state.
15.Lsigma:
16.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// .Lone: four 32-bit lanes {1,0,0,0}, used by the NEON code to step the
// counter lane. It has to stay directly behind .Lsigma: the NEON code loads
// .Lsigma with a post-increment of 16 and then reads this vector through
// the same pointer.
17.Lone:
18.long	1,0,0,0
// NUL-terminated ASCII banner:
// "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
19.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
20.align	2
21
22.text
23
// ChaCha20_ctr32 -- XOR a buffer with ChaCha20 keystream using only
// general-purpose registers, or hand the request to the NEON code.
//
// Register arguments, as they are used by the code below:
//   x0 = output pointer (written 64 bytes at a time, bytewise for the tail)
//   x1 = input pointer  (read in step with x0)
//   x2 = length in bytes; zero returns immediately through .Labort
//   x3 = pointer to 32 bytes of key material (two ldp loads)
//   x4 = pointer to the 16-byte counter block (one ldp load)
//
// Dispatch: when x2 >= 192 and the ARMV7_NEON bit is set in
// OPENSSL_armcap_P, control moves to ChaCha20_neon with a plain branch
// (arguments untouched; x5 and w17 are clobbered on the way). In every
// other case the scalar loop below serves the whole request.
//
// Stack: a 96-byte frame holding x29/x30 and x19-x28, plus 64 bytes of
// scratch below it that only the tail path writes (one keystream block).
//
// Register roles on the scalar path:
//   x22,x23          sigma constants, two 32-bit words per register
//   x24-x27          key, two words per register
//   x28,x30          counter block, two words per register
//   w5-w17,w19-w21   the sixteen working state words (x18 is not used)
//   x4               reused as the double-round counter once the counter
//                    block has been loaded
//
// The condition flags produced by the "subs x2,x2,#64" at the top of each
// outer iteration are consumed much later by "b.lo .Ltail" and
// "b.hi .Loop_outer"; no instruction in between writes the flags.
24.globl	ChaCha20_ctr32
25.hidden	ChaCha20_ctr32
26.type	ChaCha20_ctr32,%function
27.align	5
28ChaCha20_ctr32:
// NOTE(review): AARCH64_VALID_CALL_TARGET / AARCH64_SIGN_LINK_REGISTER /
// AARCH64_VALIDATE_LINK_REGISTER are macros defined outside this file
// (presumably BTI / pointer-authentication helpers from arm_arch.h).
29	AARCH64_VALID_CALL_TARGET
// Zero-length request: leave through the bare ret at .Labort.
30	cbz	x2,.Labort
// x5 = page address of the CPU capability word.
31#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
32	adrp	x5,:pg_hi21_nc:OPENSSL_armcap_P
33#else
34	adrp	x5,OPENSSL_armcap_P
35#endif
// Requests shorter than 192 bytes always take the scalar path; longer ones
// take the NEON path when the capability bit is set.
36	cmp	x2,#192
37	b.lo	.Lshort
38	ldr	w17,[x5,:lo12:OPENSSL_armcap_P]
39	tst	w17,#ARMV7_NEON
40	b.ne	ChaCha20_neon
41
// Scalar path: build the frame and save the callee-saved registers.
42.Lshort:
43	AARCH64_SIGN_LINK_REGISTER
44	stp	x29,x30,[sp,#-96]!
45	add	x29,sp,#0
46
// x5 = address of .Lsigma (the register stores are interleaved with it).
47	adrp	x5,.Lsigma
48	add	x5,x5,:lo12:.Lsigma
49	stp	x19,x20,[sp,#16]
50	stp	x21,x22,[sp,#32]
51	stp	x23,x24,[sp,#48]
52	stp	x25,x26,[sp,#64]
53	stp	x27,x28,[sp,#80]
// 64 bytes of scratch for one keystream block; only .Less_than_64 writes it.
54	sub	sp,sp,#64
55
// Load the 16 input state words as eight 64-bit pairs. x30 has already
// been saved in the frame, so it serves as an ordinary data register here.
56	ldp	x22,x23,[x5]		// load sigma
57	ldp	x24,x25,[x3]		// load key
58	ldp	x26,x27,[x3,#16]
59	ldp	x28,x30,[x4]		// load counter
// Big-endian builds: swap the 32-bit halves of every loaded pair so the
// lower-addressed word sits in the low half of the register.
60#ifdef	__AARCH64EB__
61	ror	x24,x24,#32
62	ror	x25,x25,#32
63	ror	x26,x26,#32
64	ror	x27,x27,#32
65	ror	x28,x28,#32
66	ror	x30,x30,#32
67#endif
68
// One iteration of .Loop_outer produces one 64-byte keystream block.
// First split every packed pair into two 32-bit working words.
69.Loop_outer:
70	mov	w5,w22			// unpack key block
71	lsr	x6,x22,#32
72	mov	w7,w23
73	lsr	x8,x23,#32
74	mov	w9,w24
75	lsr	x10,x24,#32
76	mov	w11,w25
77	lsr	x12,x25,#32
78	mov	w13,w26
79	lsr	x14,x26,#32
80	mov	w15,w27
81	lsr	x16,x27,#32
82	mov	w17,w28
83	lsr	x19,x28,#32
84	mov	w20,w30
85	lsr	x21,x30,#32
86
// x4 = 10 double rounds. The subs sets the flags for this block:
// lo -> fewer than 64 bytes were left, hi -> more data follows this block.
87	mov	x4,#10
88	subs	x2,x2,#64
// Double round. The first half updates the word groups
// (w5,w9,w13,w17) (w6,w10,w14,w19) (w7,w11,w15,w20) (w8,w12,w16,w21);
// the second half pairs the same registers diagonally. "ror #16/#20/#24/#25"
// are left-rotations by 16/12/8/7 bits.
89.Loop:
90	sub	x4,x4,#1
// First half: a += b; d ^= a; d = rotl(d,16)
91	add	w5,w5,w9
92	add	w6,w6,w10
93	add	w7,w7,w11
94	add	w8,w8,w12
95	eor	w17,w17,w5
96	eor	w19,w19,w6
97	eor	w20,w20,w7
98	eor	w21,w21,w8
99	ror	w17,w17,#16
100	ror	w19,w19,#16
101	ror	w20,w20,#16
102	ror	w21,w21,#16
// c += d; b ^= c; b = rotl(b,12)
103	add	w13,w13,w17
104	add	w14,w14,w19
105	add	w15,w15,w20
106	add	w16,w16,w21
107	eor	w9,w9,w13
108	eor	w10,w10,w14
109	eor	w11,w11,w15
110	eor	w12,w12,w16
111	ror	w9,w9,#20
112	ror	w10,w10,#20
113	ror	w11,w11,#20
114	ror	w12,w12,#20
// a += b; d ^= a; d = rotl(d,8)
115	add	w5,w5,w9
116	add	w6,w6,w10
117	add	w7,w7,w11
118	add	w8,w8,w12
119	eor	w17,w17,w5
120	eor	w19,w19,w6
121	eor	w20,w20,w7
122	eor	w21,w21,w8
123	ror	w17,w17,#24
124	ror	w19,w19,#24
125	ror	w20,w20,#24
126	ror	w21,w21,#24
// c += d; b ^= c; b = rotl(b,7)
127	add	w13,w13,w17
128	add	w14,w14,w19
129	add	w15,w15,w20
130	add	w16,w16,w21
131	eor	w9,w9,w13
132	eor	w10,w10,w14
133	eor	w11,w11,w15
134	eor	w12,w12,w16
135	ror	w9,w9,#25
136	ror	w10,w10,#25
137	ror	w11,w11,#25
138	ror	w12,w12,#25
// Second half: same four steps on the diagonal groups
// (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20).
139	add	w5,w5,w10
140	add	w6,w6,w11
141	add	w7,w7,w12
142	add	w8,w8,w9
143	eor	w21,w21,w5
144	eor	w17,w17,w6
145	eor	w19,w19,w7
146	eor	w20,w20,w8
147	ror	w21,w21,#16
148	ror	w17,w17,#16
149	ror	w19,w19,#16
150	ror	w20,w20,#16
151	add	w15,w15,w21
152	add	w16,w16,w17
153	add	w13,w13,w19
154	add	w14,w14,w20
155	eor	w10,w10,w15
156	eor	w11,w11,w16
157	eor	w12,w12,w13
158	eor	w9,w9,w14
159	ror	w10,w10,#20
160	ror	w11,w11,#20
161	ror	w12,w12,#20
162	ror	w9,w9,#20
163	add	w5,w5,w10
164	add	w6,w6,w11
165	add	w7,w7,w12
166	add	w8,w8,w9
167	eor	w21,w21,w5
168	eor	w17,w17,w6
169	eor	w19,w19,w7
170	eor	w20,w20,w8
171	ror	w21,w21,#24
172	ror	w17,w17,#24
173	ror	w19,w19,#24
174	ror	w20,w20,#24
175	add	w15,w15,w21
176	add	w16,w16,w17
177	add	w13,w13,w19
178	add	w14,w14,w20
179	eor	w10,w10,w15
180	eor	w11,w11,w16
181	eor	w12,w12,w13
182	eor	w9,w9,w14
183	ror	w10,w10,#25
184	ror	w11,w11,#25
185	ror	w12,w12,#25
186	ror	w9,w9,#25
// cbnz does not touch the flags, so the result of the subs above survives.
187	cbnz	x4,.Loop
188
// Add the input state back onto the working words. The odd-numbered words
// use a 64-bit add with the pair shifted right by 32; any carry above bit 31
// is discarded by the "lsl#32" in the packing step.
189	add	w5,w5,w22		// accumulate key block
190	add	x6,x6,x22,lsr#32
191	add	w7,w7,w23
192	add	x8,x8,x23,lsr#32
193	add	w9,w9,w24
194	add	x10,x10,x24,lsr#32
195	add	w11,w11,w25
196	add	x12,x12,x25,lsr#32
197	add	w13,w13,w26
198	add	x14,x14,x26,lsr#32
199	add	w15,w15,w27
200	add	x16,x16,x27,lsr#32
201	add	w17,w17,w28
202	add	x19,x19,x28,lsr#32
203	add	w20,w20,w30
204	add	x21,x21,x30,lsr#32
205
// Flags still come from "subs x2,x2,#64": fewer than 64 bytes were left,
// so this block is consumed bytewise by the tail code.
206	b.lo	.Ltail
207
// Full block: pack word pairs into 64-bit registers while the 64 input
// bytes are loaded into the registers that have just been freed.
208	add	x5,x5,x6,lsl#32	// pack
209	add	x7,x7,x8,lsl#32
210	ldp	x6,x8,[x1,#0]		// load input
211	add	x9,x9,x10,lsl#32
212	add	x11,x11,x12,lsl#32
213	ldp	x10,x12,[x1,#16]
214	add	x13,x13,x14,lsl#32
215	add	x15,x15,x16,lsl#32
216	ldp	x14,x16,[x1,#32]
217	add	x17,x17,x19,lsl#32
218	add	x20,x20,x21,lsl#32
219	ldp	x19,x21,[x1,#48]
220	add	x1,x1,#64
// Big-endian builds: byte-reverse each packed pair so the keystream is
// applied in little-endian byte order.
221#ifdef	__AARCH64EB__
222	rev	x5,x5
223	rev	x7,x7
224	rev	x9,x9
225	rev	x11,x11
226	rev	x13,x13
227	rev	x15,x15
228	rev	x17,x17
229	rev	x20,x20
230#endif
// output = keystream XOR input
231	eor	x5,x5,x6
232	eor	x7,x7,x8
233	eor	x9,x9,x10
234	eor	x11,x11,x12
235	eor	x13,x13,x14
236	eor	x15,x15,x16
237	eor	x17,x17,x19
238	eor	x20,x20,x21
239
// NOTE(review): the counter increment below is a 64-bit add on the packed
// pair in x28, so a carry out of the low 32-bit word would reach the upper
// word -- confirm callers never let the 32-bit counter wrap.
240	stp	x5,x7,[x0,#0]		// store output
241	add	x28,x28,#1			// increment counter
242	stp	x9,x11,[x0,#16]
243	stp	x13,x15,[x0,#32]
244	stp	x17,x20,[x0,#48]
245	add	x0,x0,#64
246
// Flags still come from "subs x2,x2,#64": loop while bytes remain.
247	b.hi	.Loop_outer
248
// Epilogue for lengths that are an exact multiple of 64: restore the
// callee-saved registers through x29 and release scratch plus frame.
249	ldp	x19,x20,[x29,#16]
250	add	sp,sp,#64
251	ldp	x21,x22,[x29,#32]
252	ldp	x23,x24,[x29,#48]
253	ldp	x25,x26,[x29,#64]
254	ldp	x27,x28,[x29,#80]
255	ldp	x29,x30,[sp],#96
256	AARCH64_VALIDATE_LINK_REGISTER
// The cbz at function entry lands here before any frame exists, so only
// the bare ret is shared with the path above.
257.Labort:
258	ret
259
260.align	4
// Tail: undo the subtraction so x2 holds the 1..63 bytes that remain.
261.Ltail:
262	add	x2,x2,#64
// Shared partial-block code; ChaCha20_neon also branches here with the
// accumulated, still unpacked scalar block in w5-w21 and x2 = bytes left.
// Pointers are advanced to the end of the data and x2 becomes a negative
// index that counts up to zero. x0 is biased by -1 because the loop
// increments x2 before it stores.
263.Less_than_64:
264	sub	x0,x0,#1
265	add	x1,x1,x2
266	add	x0,x0,x2
267	add	x4,sp,x2
268	neg	x2,x2
269
270	add	x5,x5,x6,lsl#32	// pack
271	add	x7,x7,x8,lsl#32
272	add	x9,x9,x10,lsl#32
273	add	x11,x11,x12,lsl#32
274	add	x13,x13,x14,lsl#32
275	add	x15,x15,x16,lsl#32
276	add	x17,x17,x19,lsl#32
277	add	x20,x20,x21,lsl#32
278#ifdef	__AARCH64EB__
279	rev	x5,x5
280	rev	x7,x7
281	rev	x9,x9
282	rev	x11,x11
283	rev	x13,x13
284	rev	x15,x15
285	rev	x17,x17
286	rev	x20,x20
287#endif
// Park the whole keystream block in the stack scratch area.
288	stp	x5,x7,[sp,#0]
289	stp	x9,x11,[sp,#16]
290	stp	x13,x15,[sp,#32]
291	stp	x17,x20,[sp,#48]
292
// Bytewise XOR of the remaining input with the parked keystream.
293.Loop_tail:
294	ldrb	w10,[x1,x2]
295	ldrb	w11,[x4,x2]
296	add	x2,x2,#1
297	eor	w10,w10,w11
298	strb	w10,[x0,x2]
299	cbnz	x2,.Loop_tail
300
// Overwrite the keystream copy on the stack with zeros before returning.
301	stp	xzr,xzr,[sp,#0]
302	stp	xzr,xzr,[sp,#16]
303	stp	xzr,xzr,[sp,#32]
304	stp	xzr,xzr,[sp,#48]
305
// Same epilogue as the full-block exit above.
306	ldp	x19,x20,[x29,#16]
307	add	sp,sp,#64
308	ldp	x21,x22,[x29,#32]
309	ldp	x23,x24,[x29,#48]
310	ldp	x25,x26,[x29,#64]
311	ldp	x27,x28,[x29,#80]
312	ldp	x29,x30,[sp],#96
313	AARCH64_VALIDATE_LINK_REGISTER
314	ret
315.size	ChaCha20_ctr32,.-ChaCha20_ctr32
316
// ChaCha20_neon -- file-local routine (no .globl) reached by a branch from
// ChaCha20_ctr32 with the argument registers untouched:
//   x0 = output, x1 = input, x2 = length in bytes,
//   x3 = pointer to 32 bytes of key, x4 = pointer to the 16-byte counter block.
//
// Requests of 512 bytes or more are forwarded to .L512_or_more_neon (inside
// ChaCha20_512_neon) once the frame has been set up. Everything else is
// handled here, 256 bytes per outer iteration:
//   * one block in general-purpose registers (w5-w17, w19-w21), using the
//     counter value as loaded;
//   * three blocks in NEON registers, one state row per register:
//     v0-v3, v4-v7 and v16-v19, using counter +1, +2 and +3.
// The scalar and vector instruction streams are interleaved line by line.
//
// Loop-invariant registers:
//   x22-x28,x30  packed input state for the scalar block (two words each)
//   v24          sigma row          v25,v26  key rows
//   v27,v28,v29  counter rows of the three vector blocks
//   v31          {1,0,0,0} at first, {4,0,0,0} after the shl
//   v20-v22      temporaries for the vector rotations
//
// Vector rotations: "rev32 .8h" swaps the 16-bit halves of each word
// (rotate by 16); each "ushr #(32-n)" / "sli #n" pair is a left-rotation by
// n (12, 8, 7). "ext" by 4/8/12 bytes rotates the lanes of rows 1-3 to
// switch between the column and diagonal arrangement.
//
// Stack: the same 96-byte frame as ChaCha20_ctr32 plus 64 bytes of scratch
// that only the partial-block paths write.
//
// The flags from "subs x2,x2,#256" are consumed by "b.lo .Ltail_neon" and
// "b.hi .Loop_outer_neon"; nothing in between writes the flags.
317.type	ChaCha20_neon,%function
318.align	5
319ChaCha20_neon:
320	AARCH64_SIGN_LINK_REGISTER
321	stp	x29,x30,[sp,#-96]!
322	add	x29,sp,#0
323
// x5 = address of .Lsigma; callee-saved registers go into the frame.
324	adrp	x5,.Lsigma
325	add	x5,x5,:lo12:.Lsigma
326	stp	x19,x20,[sp,#16]
327	stp	x21,x22,[sp,#32]
328	stp	x23,x24,[sp,#48]
329	stp	x25,x26,[sp,#64]
330	stp	x27,x28,[sp,#80]
// 512 bytes or more: continue in ChaCha20_512_neon, which allocates its own
// scratch area below the frame that has just been built.
331	cmp	x2,#512
332	b.hs	.L512_or_more_neon
333
// 64 bytes of scratch for one keystream block (partial-block paths only).
334	sub	sp,sp,#64
335
// Load every part of the input state twice: as packed 64-bit pairs for the
// scalar block and as 128-bit rows for the vector blocks. The post-increment
// on x5 moves it from .Lsigma to .Lone, which the last ld1 reads into v31.
336	ldp	x22,x23,[x5]		// load sigma
337	ld1	{v24.4s},[x5],#16
338	ldp	x24,x25,[x3]		// load key
339	ldp	x26,x27,[x3,#16]
340	ld1	{v25.4s,v26.4s},[x3]
341	ldp	x28,x30,[x4]		// load counter
342	ld1	{v27.4s},[x4]
343	ld1	{v31.4s},[x5]
// Big-endian builds: fix the word order of the sigma row and of the
// packed scalar pairs.
344#ifdef	__AARCH64EB__
345	rev64	v24.4s,v24.4s
346	ror	x24,x24,#32
347	ror	x25,x25,#32
348	ror	x26,x26,#32
349	ror	x27,x27,#32
350	ror	x28,x28,#32
351	ror	x30,x30,#32
352#endif
// Counter rows for the three vector blocks: +1, +2, +3 relative to the
// scalar block. Afterwards v31 becomes the per-iteration step {4,0,0,0}.
353	add	v27.4s,v27.4s,v31.4s		// += 1
354	add	v28.4s,v27.4s,v31.4s
355	add	v29.4s,v28.4s,v31.4s
356	shl	v31.4s,v31.4s,#2			// 1 -> 4
357
// One outer iteration = 256 bytes. Initialise the scalar working words and
// the three vector working states from the invariant registers.
358.Loop_outer_neon:
359	mov	w5,w22			// unpack key block
360	lsr	x6,x22,#32
361	mov	v0.16b,v24.16b
362	mov	w7,w23
363	lsr	x8,x23,#32
364	mov	v4.16b,v24.16b
365	mov	w9,w24
366	lsr	x10,x24,#32
367	mov	v16.16b,v24.16b
368	mov	w11,w25
369	mov	v1.16b,v25.16b
370	lsr	x12,x25,#32
371	mov	v5.16b,v25.16b
372	mov	w13,w26
373	mov	v17.16b,v25.16b
374	lsr	x14,x26,#32
375	mov	v3.16b,v27.16b
376	mov	w15,w27
377	mov	v7.16b,v28.16b
378	lsr	x16,x27,#32
379	mov	v19.16b,v29.16b
380	mov	w17,w28
381	mov	v2.16b,v26.16b
382	lsr	x19,x28,#32
383	mov	v6.16b,v26.16b
384	mov	w20,w30
385	mov	v18.16b,v26.16b
386	lsr	x21,x30,#32
387
// x4 = 10 double rounds (x4 is free again once the counter block is loaded).
// The subs sets the flags for this group of four blocks:
// lo -> fewer than 256 bytes were left, hi -> more data follows.
388	mov	x4,#10
389	subs	x2,x2,#256
// Double round for all four blocks. First half: column arrangement.
390.Loop_neon:
391	sub	x4,x4,#1
392	add	v0.4s,v0.4s,v1.4s
393	add	w5,w5,w9
394	add	v4.4s,v4.4s,v5.4s
395	add	w6,w6,w10
396	add	v16.4s,v16.4s,v17.4s
397	add	w7,w7,w11
398	eor	v3.16b,v3.16b,v0.16b
399	add	w8,w8,w12
400	eor	v7.16b,v7.16b,v4.16b
401	eor	w17,w17,w5
402	eor	v19.16b,v19.16b,v16.16b
403	eor	w19,w19,w6
404	rev32	v3.8h,v3.8h
405	eor	w20,w20,w7
406	rev32	v7.8h,v7.8h
407	eor	w21,w21,w8
408	rev32	v19.8h,v19.8h
409	ror	w17,w17,#16
410	add	v2.4s,v2.4s,v3.4s
411	ror	w19,w19,#16
412	add	v6.4s,v6.4s,v7.4s
413	ror	w20,w20,#16
414	add	v18.4s,v18.4s,v19.4s
415	ror	w21,w21,#16
416	eor	v20.16b,v1.16b,v2.16b
417	add	w13,w13,w17
418	eor	v21.16b,v5.16b,v6.16b
419	add	w14,w14,w19
420	eor	v22.16b,v17.16b,v18.16b
421	add	w15,w15,w20
422	ushr	v1.4s,v20.4s,#20
423	add	w16,w16,w21
424	ushr	v5.4s,v21.4s,#20
425	eor	w9,w9,w13
426	ushr	v17.4s,v22.4s,#20
427	eor	w10,w10,w14
428	sli	v1.4s,v20.4s,#12
429	eor	w11,w11,w15
430	sli	v5.4s,v21.4s,#12
431	eor	w12,w12,w16
432	sli	v17.4s,v22.4s,#12
433	ror	w9,w9,#20
434	add	v0.4s,v0.4s,v1.4s
435	ror	w10,w10,#20
436	add	v4.4s,v4.4s,v5.4s
437	ror	w11,w11,#20
438	add	v16.4s,v16.4s,v17.4s
439	ror	w12,w12,#20
440	eor	v20.16b,v3.16b,v0.16b
441	add	w5,w5,w9
442	eor	v21.16b,v7.16b,v4.16b
443	add	w6,w6,w10
444	eor	v22.16b,v19.16b,v16.16b
445	add	w7,w7,w11
446	ushr	v3.4s,v20.4s,#24
447	add	w8,w8,w12
448	ushr	v7.4s,v21.4s,#24
449	eor	w17,w17,w5
450	ushr	v19.4s,v22.4s,#24
451	eor	w19,w19,w6
452	sli	v3.4s,v20.4s,#8
453	eor	w20,w20,w7
454	sli	v7.4s,v21.4s,#8
455	eor	w21,w21,w8
456	sli	v19.4s,v22.4s,#8
457	ror	w17,w17,#24
458	add	v2.4s,v2.4s,v3.4s
459	ror	w19,w19,#24
460	add	v6.4s,v6.4s,v7.4s
461	ror	w20,w20,#24
462	add	v18.4s,v18.4s,v19.4s
463	ror	w21,w21,#24
464	eor	v20.16b,v1.16b,v2.16b
465	add	w13,w13,w17
466	eor	v21.16b,v5.16b,v6.16b
467	add	w14,w14,w19
468	eor	v22.16b,v17.16b,v18.16b
469	add	w15,w15,w20
470	ushr	v1.4s,v20.4s,#25
471	add	w16,w16,w21
472	ushr	v5.4s,v21.4s,#25
473	eor	w9,w9,w13
474	ushr	v17.4s,v22.4s,#25
475	eor	w10,w10,w14
476	sli	v1.4s,v20.4s,#7
477	eor	w11,w11,w15
478	sli	v5.4s,v21.4s,#7
479	eor	w12,w12,w16
480	sli	v17.4s,v22.4s,#7
// Vector side: rotate the lanes of rows 2, 3 and 1 by 8, 12 and 4 bytes to
// line up the diagonals (interleaved with the last scalar rotations).
481	ror	w9,w9,#25
482	ext	v2.16b,v2.16b,v2.16b,#8
483	ror	w10,w10,#25
484	ext	v6.16b,v6.16b,v6.16b,#8
485	ror	w11,w11,#25
486	ext	v18.16b,v18.16b,v18.16b,#8
487	ror	w12,w12,#25
488	ext	v3.16b,v3.16b,v3.16b,#12
489	ext	v7.16b,v7.16b,v7.16b,#12
490	ext	v19.16b,v19.16b,v19.16b,#12
491	ext	v1.16b,v1.16b,v1.16b,#4
492	ext	v5.16b,v5.16b,v5.16b,#4
493	ext	v17.16b,v17.16b,v17.16b,#4
// Second half: diagonal arrangement. The scalar side achieves it by pairing
// registers differently; the vector side relies on the ext rotations above.
494	add	v0.4s,v0.4s,v1.4s
495	add	w5,w5,w10
496	add	v4.4s,v4.4s,v5.4s
497	add	w6,w6,w11
498	add	v16.4s,v16.4s,v17.4s
499	add	w7,w7,w12
500	eor	v3.16b,v3.16b,v0.16b
501	add	w8,w8,w9
502	eor	v7.16b,v7.16b,v4.16b
503	eor	w21,w21,w5
504	eor	v19.16b,v19.16b,v16.16b
505	eor	w17,w17,w6
506	rev32	v3.8h,v3.8h
507	eor	w19,w19,w7
508	rev32	v7.8h,v7.8h
509	eor	w20,w20,w8
510	rev32	v19.8h,v19.8h
511	ror	w21,w21,#16
512	add	v2.4s,v2.4s,v3.4s
513	ror	w17,w17,#16
514	add	v6.4s,v6.4s,v7.4s
515	ror	w19,w19,#16
516	add	v18.4s,v18.4s,v19.4s
517	ror	w20,w20,#16
518	eor	v20.16b,v1.16b,v2.16b
519	add	w15,w15,w21
520	eor	v21.16b,v5.16b,v6.16b
521	add	w16,w16,w17
522	eor	v22.16b,v17.16b,v18.16b
523	add	w13,w13,w19
524	ushr	v1.4s,v20.4s,#20
525	add	w14,w14,w20
526	ushr	v5.4s,v21.4s,#20
527	eor	w10,w10,w15
528	ushr	v17.4s,v22.4s,#20
529	eor	w11,w11,w16
530	sli	v1.4s,v20.4s,#12
531	eor	w12,w12,w13
532	sli	v5.4s,v21.4s,#12
533	eor	w9,w9,w14
534	sli	v17.4s,v22.4s,#12
535	ror	w10,w10,#20
536	add	v0.4s,v0.4s,v1.4s
537	ror	w11,w11,#20
538	add	v4.4s,v4.4s,v5.4s
539	ror	w12,w12,#20
540	add	v16.4s,v16.4s,v17.4s
541	ror	w9,w9,#20
542	eor	v20.16b,v3.16b,v0.16b
543	add	w5,w5,w10
544	eor	v21.16b,v7.16b,v4.16b
545	add	w6,w6,w11
546	eor	v22.16b,v19.16b,v16.16b
547	add	w7,w7,w12
548	ushr	v3.4s,v20.4s,#24
549	add	w8,w8,w9
550	ushr	v7.4s,v21.4s,#24
551	eor	w21,w21,w5
552	ushr	v19.4s,v22.4s,#24
553	eor	w17,w17,w6
554	sli	v3.4s,v20.4s,#8
555	eor	w19,w19,w7
556	sli	v7.4s,v21.4s,#8
557	eor	w20,w20,w8
558	sli	v19.4s,v22.4s,#8
559	ror	w21,w21,#24
560	add	v2.4s,v2.4s,v3.4s
561	ror	w17,w17,#24
562	add	v6.4s,v6.4s,v7.4s
563	ror	w19,w19,#24
564	add	v18.4s,v18.4s,v19.4s
565	ror	w20,w20,#24
566	eor	v20.16b,v1.16b,v2.16b
567	add	w15,w15,w21
568	eor	v21.16b,v5.16b,v6.16b
569	add	w16,w16,w17
570	eor	v22.16b,v17.16b,v18.16b
571	add	w13,w13,w19
572	ushr	v1.4s,v20.4s,#25
573	add	w14,w14,w20
574	ushr	v5.4s,v21.4s,#25
575	eor	w10,w10,w15
576	ushr	v17.4s,v22.4s,#25
577	eor	w11,w11,w16
578	sli	v1.4s,v20.4s,#7
579	eor	w12,w12,w13
580	sli	v5.4s,v21.4s,#7
581	eor	w9,w9,w14
582	sli	v17.4s,v22.4s,#7
// Vector side: rotate the rows back (row 3 by 4 bytes, row 1 by 12, row 2
// by 8) so the next iteration starts in the column arrangement again.
583	ror	w10,w10,#25
584	ext	v2.16b,v2.16b,v2.16b,#8
585	ror	w11,w11,#25
586	ext	v6.16b,v6.16b,v6.16b,#8
587	ror	w12,w12,#25
588	ext	v18.16b,v18.16b,v18.16b,#8
589	ror	w9,w9,#25
590	ext	v3.16b,v3.16b,v3.16b,#4
591	ext	v7.16b,v7.16b,v7.16b,#4
592	ext	v19.16b,v19.16b,v19.16b,#4
593	ext	v1.16b,v1.16b,v1.16b,#12
594	ext	v5.16b,v5.16b,v5.16b,#12
595	ext	v17.16b,v17.16b,v17.16b,#12
596	cbnz	x4,.Loop_neon
597
// Add the input state back onto all four blocks. Each vector block adds
// its own counter row (v27 / v28 / v29).
598	add	w5,w5,w22		// accumulate key block
599	add	v0.4s,v0.4s,v24.4s
600	add	x6,x6,x22,lsr#32
601	add	v4.4s,v4.4s,v24.4s
602	add	w7,w7,w23
603	add	v16.4s,v16.4s,v24.4s
604	add	x8,x8,x23,lsr#32
605	add	v2.4s,v2.4s,v26.4s
606	add	w9,w9,w24
607	add	v6.4s,v6.4s,v26.4s
608	add	x10,x10,x24,lsr#32
609	add	v18.4s,v18.4s,v26.4s
610	add	w11,w11,w25
611	add	v3.4s,v3.4s,v27.4s
612	add	x12,x12,x25,lsr#32
613	add	w13,w13,w26
614	add	v7.4s,v7.4s,v28.4s
615	add	x14,x14,x26,lsr#32
616	add	w15,w15,w27
617	add	v19.4s,v19.4s,v29.4s
618	add	x16,x16,x27,lsr#32
619	add	w17,w17,w28
620	add	v1.4s,v1.4s,v25.4s
621	add	x19,x19,x28,lsr#32
622	add	w20,w20,w30
623	add	v5.4s,v5.4s,v25.4s
624	add	x21,x21,x30,lsr#32
625	add	v17.4s,v17.4s,v25.4s
626
// Flags still come from "subs x2,x2,#256": fewer than 256 bytes were left.
627	b.lo	.Ltail_neon
628
// Full 256 bytes. Scalar block first: pack, load 64 input bytes, XOR.
629	add	x5,x5,x6,lsl#32	// pack
630	add	x7,x7,x8,lsl#32
631	ldp	x6,x8,[x1,#0]		// load input
632	add	x9,x9,x10,lsl#32
633	add	x11,x11,x12,lsl#32
634	ldp	x10,x12,[x1,#16]
635	add	x13,x13,x14,lsl#32
636	add	x15,x15,x16,lsl#32
637	ldp	x14,x16,[x1,#32]
638	add	x17,x17,x19,lsl#32
639	add	x20,x20,x21,lsl#32
640	ldp	x19,x21,[x1,#48]
641	add	x1,x1,#64
642#ifdef	__AARCH64EB__
643	rev	x5,x5
644	rev	x7,x7
645	rev	x9,x9
646	rev	x11,x11
647	rev	x13,x13
648	rev	x15,x15
649	rev	x17,x17
650	rev	x20,x20
651#endif
// v20-v23 are reused as input buffers for the vector blocks; the second
// 64-byte chunk is already being loaded while the first is XORed.
652	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
653	eor	x5,x5,x6
654	eor	x7,x7,x8
655	eor	x9,x9,x10
656	eor	x11,x11,x12
657	eor	x13,x13,x14
658	eor	v0.16b,v0.16b,v20.16b
659	eor	x15,x15,x16
660	eor	v1.16b,v1.16b,v21.16b
661	eor	x17,x17,x19
662	eor	v2.16b,v2.16b,v22.16b
663	eor	x20,x20,x21
664	eor	v3.16b,v3.16b,v23.16b
665	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
666
// Store the scalar block and advance every counter by four blocks.
667	stp	x5,x7,[x0,#0]		// store output
668	add	x28,x28,#4			// increment counter
669	stp	x9,x11,[x0,#16]
670	add	v27.4s,v27.4s,v31.4s		// += 4
671	stp	x13,x15,[x0,#32]
672	add	v28.4s,v28.4s,v31.4s
673	stp	x17,x20,[x0,#48]
674	add	v29.4s,v29.4s,v31.4s
675	add	x0,x0,#64
676
// v0-v3 have been stored, so they are reused to load the input for the
// third vector block.
677	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
678	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
679
680	eor	v4.16b,v4.16b,v20.16b
681	eor	v5.16b,v5.16b,v21.16b
682	eor	v6.16b,v6.16b,v22.16b
683	eor	v7.16b,v7.16b,v23.16b
684	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
685
686	eor	v16.16b,v16.16b,v0.16b
687	eor	v17.16b,v17.16b,v1.16b
688	eor	v18.16b,v18.16b,v2.16b
689	eor	v19.16b,v19.16b,v3.16b
690	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
691
// Flags still come from "subs x2,x2,#256": loop while bytes remain.
692	b.hi	.Loop_outer_neon
693
// Epilogue for lengths that are an exact multiple of 256.
694	ldp	x19,x20,[x29,#16]
695	add	sp,sp,#64
696	ldp	x21,x22,[x29,#32]
697	ldp	x23,x24,[x29,#48]
698	ldp	x25,x26,[x29,#64]
699	ldp	x27,x28,[x29,#80]
700	ldp	x29,x30,[sp],#96
701	AARCH64_VALIDATE_LINK_REGISTER
702	ret
703
// Tail: 1..255 bytes remain and all four keystream blocks are ready (scalar
// block unpacked in w5-w21, vector blocks in v0-v3, v4-v7, v16-v19).
// Fewer than 64 bytes: reuse ChaCha20_ctr32's .Less_than_64, which packs the
// scalar block, XORs bytewise, wipes the scratch area and returns.
704.Ltail_neon:
705	add	x2,x2,#256
706	cmp	x2,#64
707	b.lo	.Less_than_64
708
// At least 64 bytes: emit the scalar block as one full 64-byte chunk.
709	add	x5,x5,x6,lsl#32	// pack
710	add	x7,x7,x8,lsl#32
711	ldp	x6,x8,[x1,#0]		// load input
712	add	x9,x9,x10,lsl#32
713	add	x11,x11,x12,lsl#32
714	ldp	x10,x12,[x1,#16]
715	add	x13,x13,x14,lsl#32
716	add	x15,x15,x16,lsl#32
717	ldp	x14,x16,[x1,#32]
718	add	x17,x17,x19,lsl#32
719	add	x20,x20,x21,lsl#32
720	ldp	x19,x21,[x1,#48]
721	add	x1,x1,#64
722#ifdef	__AARCH64EB__
723	rev	x5,x5
724	rev	x7,x7
725	rev	x9,x9
726	rev	x11,x11
727	rev	x13,x13
728	rev	x15,x15
729	rev	x17,x17
730	rev	x20,x20
731#endif
732	eor	x5,x5,x6
733	eor	x7,x7,x8
734	eor	x9,x9,x10
735	eor	x11,x11,x12
736	eor	x13,x13,x14
737	eor	x15,x15,x16
738	eor	x17,x17,x19
739	eor	x20,x20,x21
740
// x28 is not read again on this path before the epilogue restores it.
741	stp	x5,x7,[x0,#0]		// store output
742	add	x28,x28,#4			// increment counter
743	stp	x9,x11,[x0,#16]
744	stp	x13,x15,[x0,#32]
745	stp	x17,x20,[x0,#48]
746	add	x0,x0,#64
// Each b.eq below tests the most recent "cmp x2,#64": exactly 64 bytes were
// left before the chunk just written, so the request is complete.
747	b.eq	.Ldone_neon
748	sub	x2,x2,#64
749	cmp	x2,#64
750	b.lo	.Less_than_128
751
// Second full chunk: first vector block (v0-v3).
752	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
753	eor	v0.16b,v0.16b,v20.16b
754	eor	v1.16b,v1.16b,v21.16b
755	eor	v2.16b,v2.16b,v22.16b
756	eor	v3.16b,v3.16b,v23.16b
757	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
758	b.eq	.Ldone_neon
759	sub	x2,x2,#64
760	cmp	x2,#64
761	b.lo	.Less_than_192
762
// Third full chunk: second vector block (v4-v7).
763	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
764	eor	v4.16b,v4.16b,v20.16b
765	eor	v5.16b,v5.16b,v21.16b
766	eor	v6.16b,v6.16b,v22.16b
767	eor	v7.16b,v7.16b,v23.16b
768	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
769	b.eq	.Ldone_neon
770	sub	x2,x2,#64
771
// 1..63 bytes remain: park the keystream block that covers them (here the
// third vector block) in the stack scratch area for the bytewise loop.
772	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
773	b	.Last_neon
774
// Partial chunk covered by the first vector block.
775.Less_than_128:
776	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
777	b	.Last_neon
// Partial chunk covered by the second vector block.
778.Less_than_192:
779	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
780	b	.Last_neon
781
782.align	4
// Bytewise tail, same indexing scheme as .Less_than_64 in ChaCha20_ctr32:
// pointers are moved to the end of the data, x2 becomes a negative index
// counting up to zero, and x0 is biased by -1 because the loop increments
// x2 before it stores.
783.Last_neon:
784	sub	x0,x0,#1
785	add	x1,x1,x2
786	add	x0,x0,x2
787	add	x4,sp,x2
788	neg	x2,x2
789
790.Loop_tail_neon:
791	ldrb	w10,[x1,x2]
792	ldrb	w11,[x4,x2]
793	add	x2,x2,#1
794	eor	w10,w10,w11
795	strb	w10,[x0,x2]
796	cbnz	x2,.Loop_tail_neon
797
// Overwrite the keystream copy on the stack with zeros before returning.
798	stp	xzr,xzr,[sp,#0]
799	stp	xzr,xzr,[sp,#16]
800	stp	xzr,xzr,[sp,#32]
801	stp	xzr,xzr,[sp,#48]
802
// Common exit. The b.eq paths arrive here without wiping the scratch area;
// nothing in this routine has written to it on those paths.
803.Ldone_neon:
804	ldp	x19,x20,[x29,#16]
805	add	sp,sp,#64
806	ldp	x21,x22,[x29,#32]
807	ldp	x23,x24,[x29,#48]
808	ldp	x25,x26,[x29,#64]
809	ldp	x27,x28,[x29,#80]
810	ldp	x29,x30,[sp],#96
811	AARCH64_VALIDATE_LINK_REGISTER
812	ret
813.size	ChaCha20_neon,.-ChaCha20_neon
814.type	ChaCha20_512_neon,%function
815.align	5
816ChaCha20_512_neon:
817	AARCH64_SIGN_LINK_REGISTER
818	stp	x29,x30,[sp,#-96]!
819	add	x29,sp,#0
820
821	adrp	x5,.Lsigma
822	add	x5,x5,:lo12:.Lsigma
823	stp	x19,x20,[sp,#16]
824	stp	x21,x22,[sp,#32]
825	stp	x23,x24,[sp,#48]
826	stp	x25,x26,[sp,#64]
827	stp	x27,x28,[sp,#80]
828
829.L512_or_more_neon:
830	sub	sp,sp,#128+64
831
832	ldp	x22,x23,[x5]		// load sigma
833	ld1	{v24.4s},[x5],#16
834	ldp	x24,x25,[x3]		// load key
835	ldp	x26,x27,[x3,#16]
836	ld1	{v25.4s,v26.4s},[x3]
837	ldp	x28,x30,[x4]		// load counter
838	ld1	{v27.4s},[x4]
839	ld1	{v31.4s},[x5]
840#ifdef	__AARCH64EB__
841	rev64	v24.4s,v24.4s
842	ror	x24,x24,#32
843	ror	x25,x25,#32
844	ror	x26,x26,#32
845	ror	x27,x27,#32
846	ror	x28,x28,#32
847	ror	x30,x30,#32
848#endif
849	add	v27.4s,v27.4s,v31.4s		// += 1
850	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
851	add	v27.4s,v27.4s,v31.4s		// not typo
852	str	q26,[sp,#32]
853	add	v28.4s,v27.4s,v31.4s
854	add	v29.4s,v28.4s,v31.4s
855	add	v30.4s,v29.4s,v31.4s
856	shl	v31.4s,v31.4s,#2			// 1 -> 4
857
858	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
859	stp	d10,d11,[sp,#128+16]
860	stp	d12,d13,[sp,#128+32]
861	stp	d14,d15,[sp,#128+48]
862
863	sub	x2,x2,#512			// not typo
864
865.Loop_outer_512_neon:
866	mov	v0.16b,v24.16b
867	mov	v4.16b,v24.16b
868	mov	v8.16b,v24.16b
869	mov	v12.16b,v24.16b
870	mov	v16.16b,v24.16b
871	mov	v20.16b,v24.16b
872	mov	v1.16b,v25.16b
873	mov	w5,w22			// unpack key block
874	mov	v5.16b,v25.16b
875	lsr	x6,x22,#32
876	mov	v9.16b,v25.16b
877	mov	w7,w23
878	mov	v13.16b,v25.16b
879	lsr	x8,x23,#32
880	mov	v17.16b,v25.16b
881	mov	w9,w24
882	mov	v21.16b,v25.16b
883	lsr	x10,x24,#32
884	mov	v3.16b,v27.16b
885	mov	w11,w25
886	mov	v7.16b,v28.16b
887	lsr	x12,x25,#32
888	mov	v11.16b,v29.16b
889	mov	w13,w26
890	mov	v15.16b,v30.16b
891	lsr	x14,x26,#32
892	mov	v2.16b,v26.16b
893	mov	w15,w27
894	mov	v6.16b,v26.16b
895	lsr	x16,x27,#32
896	add	v19.4s,v3.4s,v31.4s			// +4
897	mov	w17,w28
898	add	v23.4s,v7.4s,v31.4s			// +4
899	lsr	x19,x28,#32
900	mov	v10.16b,v26.16b
901	mov	w20,w30
902	mov	v14.16b,v26.16b
903	lsr	x21,x30,#32
904	mov	v18.16b,v26.16b
905	stp	q27,q28,[sp,#48]		// off-load key block, variable part
906	mov	v22.16b,v26.16b
907	str	q29,[sp,#80]
908
909	mov	x4,#5
910	subs	x2,x2,#512
911.Loop_upper_neon:
912	sub	x4,x4,#1
913	add	v0.4s,v0.4s,v1.4s
914	add	w5,w5,w9
915	add	v4.4s,v4.4s,v5.4s
916	add	w6,w6,w10
917	add	v8.4s,v8.4s,v9.4s
918	add	w7,w7,w11
919	add	v12.4s,v12.4s,v13.4s
920	add	w8,w8,w12
921	add	v16.4s,v16.4s,v17.4s
922	eor	w17,w17,w5
923	add	v20.4s,v20.4s,v21.4s
924	eor	w19,w19,w6
925	eor	v3.16b,v3.16b,v0.16b
926	eor	w20,w20,w7
927	eor	v7.16b,v7.16b,v4.16b
928	eor	w21,w21,w8
929	eor	v11.16b,v11.16b,v8.16b
930	ror	w17,w17,#16
931	eor	v15.16b,v15.16b,v12.16b
932	ror	w19,w19,#16
933	eor	v19.16b,v19.16b,v16.16b
934	ror	w20,w20,#16
935	eor	v23.16b,v23.16b,v20.16b
936	ror	w21,w21,#16
937	rev32	v3.8h,v3.8h
938	add	w13,w13,w17
939	rev32	v7.8h,v7.8h
940	add	w14,w14,w19
941	rev32	v11.8h,v11.8h
942	add	w15,w15,w20
943	rev32	v15.8h,v15.8h
944	add	w16,w16,w21
945	rev32	v19.8h,v19.8h
946	eor	w9,w9,w13
947	rev32	v23.8h,v23.8h
948	eor	w10,w10,w14
949	add	v2.4s,v2.4s,v3.4s
950	eor	w11,w11,w15
951	add	v6.4s,v6.4s,v7.4s
952	eor	w12,w12,w16
953	add	v10.4s,v10.4s,v11.4s
954	ror	w9,w9,#20
955	add	v14.4s,v14.4s,v15.4s
956	ror	w10,w10,#20
957	add	v18.4s,v18.4s,v19.4s
958	ror	w11,w11,#20
959	add	v22.4s,v22.4s,v23.4s
960	ror	w12,w12,#20
961	eor	v24.16b,v1.16b,v2.16b
962	add	w5,w5,w9
963	eor	v25.16b,v5.16b,v6.16b
964	add	w6,w6,w10
965	eor	v26.16b,v9.16b,v10.16b
966	add	w7,w7,w11
967	eor	v27.16b,v13.16b,v14.16b
968	add	w8,w8,w12
969	eor	v28.16b,v17.16b,v18.16b
970	eor	w17,w17,w5
971	eor	v29.16b,v21.16b,v22.16b
972	eor	w19,w19,w6
973	ushr	v1.4s,v24.4s,#20
974	eor	w20,w20,w7
975	ushr	v5.4s,v25.4s,#20
976	eor	w21,w21,w8
977	ushr	v9.4s,v26.4s,#20
978	ror	w17,w17,#24
979	ushr	v13.4s,v27.4s,#20
980	ror	w19,w19,#24
981	ushr	v17.4s,v28.4s,#20
982	ror	w20,w20,#24
983	ushr	v21.4s,v29.4s,#20
984	ror	w21,w21,#24
985	sli	v1.4s,v24.4s,#12
986	add	w13,w13,w17
987	sli	v5.4s,v25.4s,#12
988	add	w14,w14,w19
989	sli	v9.4s,v26.4s,#12
990	add	w15,w15,w20
991	sli	v13.4s,v27.4s,#12
992	add	w16,w16,w21
993	sli	v17.4s,v28.4s,#12
994	eor	w9,w9,w13
995	sli	v21.4s,v29.4s,#12
996	eor	w10,w10,w14
997	add	v0.4s,v0.4s,v1.4s
998	eor	w11,w11,w15
999	add	v4.4s,v4.4s,v5.4s
1000	eor	w12,w12,w16
1001	add	v8.4s,v8.4s,v9.4s
1002	ror	w9,w9,#25
1003	add	v12.4s,v12.4s,v13.4s
1004	ror	w10,w10,#25
1005	add	v16.4s,v16.4s,v17.4s
1006	ror	w11,w11,#25
1007	add	v20.4s,v20.4s,v21.4s
1008	ror	w12,w12,#25
1009	eor	v24.16b,v3.16b,v0.16b
1010	add	w5,w5,w10
1011	eor	v25.16b,v7.16b,v4.16b
1012	add	w6,w6,w11
1013	eor	v26.16b,v11.16b,v8.16b
1014	add	w7,w7,w12
1015	eor	v27.16b,v15.16b,v12.16b
1016	add	w8,w8,w9
1017	eor	v28.16b,v19.16b,v16.16b
1018	eor	w21,w21,w5
1019	eor	v29.16b,v23.16b,v20.16b
1020	eor	w17,w17,w6
1021	ushr	v3.4s,v24.4s,#24
1022	eor	w19,w19,w7
1023	ushr	v7.4s,v25.4s,#24
1024	eor	w20,w20,w8
1025	ushr	v11.4s,v26.4s,#24
1026	ror	w21,w21,#16
1027	ushr	v15.4s,v27.4s,#24
1028	ror	w17,w17,#16
1029	ushr	v19.4s,v28.4s,#24
1030	ror	w19,w19,#16
1031	ushr	v23.4s,v29.4s,#24
1032	ror	w20,w20,#16
1033	sli	v3.4s,v24.4s,#8
1034	add	w15,w15,w21
1035	sli	v7.4s,v25.4s,#8
1036	add	w16,w16,w17
1037	sli	v11.4s,v26.4s,#8
1038	add	w13,w13,w19
1039	sli	v15.4s,v27.4s,#8
1040	add	w14,w14,w20
1041	sli	v19.4s,v28.4s,#8
1042	eor	w10,w10,w15
1043	sli	v23.4s,v29.4s,#8
1044	eor	w11,w11,w16
1045	add	v2.4s,v2.4s,v3.4s
1046	eor	w12,w12,w13
1047	add	v6.4s,v6.4s,v7.4s
1048	eor	w9,w9,w14
1049	add	v10.4s,v10.4s,v11.4s
1050	ror	w10,w10,#20
1051	add	v14.4s,v14.4s,v15.4s
1052	ror	w11,w11,#20
1053	add	v18.4s,v18.4s,v19.4s
1054	ror	w12,w12,#20
1055	add	v22.4s,v22.4s,v23.4s
1056	ror	w9,w9,#20
1057	eor	v24.16b,v1.16b,v2.16b
1058	add	w5,w5,w10
1059	eor	v25.16b,v5.16b,v6.16b
1060	add	w6,w6,w11
1061	eor	v26.16b,v9.16b,v10.16b
1062	add	w7,w7,w12
1063	eor	v27.16b,v13.16b,v14.16b
1064	add	w8,w8,w9
1065	eor	v28.16b,v17.16b,v18.16b
1066	eor	w21,w21,w5
1067	eor	v29.16b,v21.16b,v22.16b
1068	eor	w17,w17,w6
1069	ushr	v1.4s,v24.4s,#25
1070	eor	w19,w19,w7
1071	ushr	v5.4s,v25.4s,#25
1072	eor	w20,w20,w8
1073	ushr	v9.4s,v26.4s,#25
1074	ror	w21,w21,#24
1075	ushr	v13.4s,v27.4s,#25
1076	ror	w17,w17,#24
1077	ushr	v17.4s,v28.4s,#25
1078	ror	w19,w19,#24
1079	ushr	v21.4s,v29.4s,#25
1080	ror	w20,w20,#24
1081	sli	v1.4s,v24.4s,#7
1082	add	w15,w15,w21
1083	sli	v5.4s,v25.4s,#7
1084	add	w16,w16,w17
1085	sli	v9.4s,v26.4s,#7
1086	add	w13,w13,w19
1087	sli	v13.4s,v27.4s,#7
1088	add	w14,w14,w20
1089	sli	v17.4s,v28.4s,#7
1090	eor	w10,w10,w15
1091	sli	v21.4s,v29.4s,#7
1092	eor	w11,w11,w16
1093	ext	v2.16b,v2.16b,v2.16b,#8
1094	eor	w12,w12,w13
1095	ext	v6.16b,v6.16b,v6.16b,#8
1096	eor	w9,w9,w14
1097	ext	v10.16b,v10.16b,v10.16b,#8
1098	ror	w10,w10,#25
1099	ext	v14.16b,v14.16b,v14.16b,#8
1100	ror	w11,w11,#25
1101	ext	v18.16b,v18.16b,v18.16b,#8
1102	ror	w12,w12,#25
1103	ext	v22.16b,v22.16b,v22.16b,#8
1104	ror	w9,w9,#25
1105	ext	v3.16b,v3.16b,v3.16b,#12
1106	ext	v7.16b,v7.16b,v7.16b,#12
1107	ext	v11.16b,v11.16b,v11.16b,#12
1108	ext	v15.16b,v15.16b,v15.16b,#12
1109	ext	v19.16b,v19.16b,v19.16b,#12
1110	ext	v23.16b,v23.16b,v23.16b,#12
1111	ext	v1.16b,v1.16b,v1.16b,#4
1112	ext	v5.16b,v5.16b,v5.16b,#4
1113	ext	v9.16b,v9.16b,v9.16b,#4
1114	ext	v13.16b,v13.16b,v13.16b,#4
1115	ext	v17.16b,v17.16b,v17.16b,#4
1116	ext	v21.16b,v21.16b,v21.16b,#4
1117	add	v0.4s,v0.4s,v1.4s
1118	add	w5,w5,w9
1119	add	v4.4s,v4.4s,v5.4s
1120	add	w6,w6,w10
1121	add	v8.4s,v8.4s,v9.4s
1122	add	w7,w7,w11
1123	add	v12.4s,v12.4s,v13.4s
1124	add	w8,w8,w12
1125	add	v16.4s,v16.4s,v17.4s
1126	eor	w17,w17,w5
1127	add	v20.4s,v20.4s,v21.4s
1128	eor	w19,w19,w6
1129	eor	v3.16b,v3.16b,v0.16b
1130	eor	w20,w20,w7
1131	eor	v7.16b,v7.16b,v4.16b
1132	eor	w21,w21,w8
1133	eor	v11.16b,v11.16b,v8.16b
1134	ror	w17,w17,#16
1135	eor	v15.16b,v15.16b,v12.16b
1136	ror	w19,w19,#16
1137	eor	v19.16b,v19.16b,v16.16b
1138	ror	w20,w20,#16
1139	eor	v23.16b,v23.16b,v20.16b
1140	ror	w21,w21,#16
1141	rev32	v3.8h,v3.8h
1142	add	w13,w13,w17
1143	rev32	v7.8h,v7.8h
1144	add	w14,w14,w19
1145	rev32	v11.8h,v11.8h
1146	add	w15,w15,w20
1147	rev32	v15.8h,v15.8h
1148	add	w16,w16,w21
1149	rev32	v19.8h,v19.8h
1150	eor	w9,w9,w13
1151	rev32	v23.8h,v23.8h
1152	eor	w10,w10,w14
1153	add	v2.4s,v2.4s,v3.4s
1154	eor	w11,w11,w15
1155	add	v6.4s,v6.4s,v7.4s
1156	eor	w12,w12,w16
1157	add	v10.4s,v10.4s,v11.4s
1158	ror	w9,w9,#20
1159	add	v14.4s,v14.4s,v15.4s
1160	ror	w10,w10,#20
1161	add	v18.4s,v18.4s,v19.4s
1162	ror	w11,w11,#20
1163	add	v22.4s,v22.4s,v23.4s
1164	ror	w12,w12,#20
1165	eor	v24.16b,v1.16b,v2.16b
1166	add	w5,w5,w9
1167	eor	v25.16b,v5.16b,v6.16b
1168	add	w6,w6,w10
1169	eor	v26.16b,v9.16b,v10.16b
1170	add	w7,w7,w11
1171	eor	v27.16b,v13.16b,v14.16b
1172	add	w8,w8,w12
1173	eor	v28.16b,v17.16b,v18.16b
1174	eor	w17,w17,w5
1175	eor	v29.16b,v21.16b,v22.16b
1176	eor	w19,w19,w6
1177	ushr	v1.4s,v24.4s,#20
1178	eor	w20,w20,w7
1179	ushr	v5.4s,v25.4s,#20
1180	eor	w21,w21,w8
1181	ushr	v9.4s,v26.4s,#20
1182	ror	w17,w17,#24
1183	ushr	v13.4s,v27.4s,#20
1184	ror	w19,w19,#24
1185	ushr	v17.4s,v28.4s,#20
1186	ror	w20,w20,#24
1187	ushr	v21.4s,v29.4s,#20
1188	ror	w21,w21,#24
1189	sli	v1.4s,v24.4s,#12
1190	add	w13,w13,w17
1191	sli	v5.4s,v25.4s,#12
1192	add	w14,w14,w19
1193	sli	v9.4s,v26.4s,#12
1194	add	w15,w15,w20
1195	sli	v13.4s,v27.4s,#12
1196	add	w16,w16,w21
1197	sli	v17.4s,v28.4s,#12
1198	eor	w9,w9,w13
1199	sli	v21.4s,v29.4s,#12
1200	eor	w10,w10,w14
1201	add	v0.4s,v0.4s,v1.4s
1202	eor	w11,w11,w15
1203	add	v4.4s,v4.4s,v5.4s
1204	eor	w12,w12,w16
1205	add	v8.4s,v8.4s,v9.4s
1206	ror	w9,w9,#25
1207	add	v12.4s,v12.4s,v13.4s
1208	ror	w10,w10,#25
1209	add	v16.4s,v16.4s,v17.4s
1210	ror	w11,w11,#25
1211	add	v20.4s,v20.4s,v21.4s
1212	ror	w12,w12,#25
1213	eor	v24.16b,v3.16b,v0.16b
1214	add	w5,w5,w10
1215	eor	v25.16b,v7.16b,v4.16b
1216	add	w6,w6,w11
1217	eor	v26.16b,v11.16b,v8.16b
1218	add	w7,w7,w12
1219	eor	v27.16b,v15.16b,v12.16b
1220	add	w8,w8,w9
1221	eor	v28.16b,v19.16b,v16.16b
1222	eor	w21,w21,w5
1223	eor	v29.16b,v23.16b,v20.16b
1224	eor	w17,w17,w6
1225	ushr	v3.4s,v24.4s,#24
1226	eor	w19,w19,w7
1227	ushr	v7.4s,v25.4s,#24
1228	eor	w20,w20,w8
1229	ushr	v11.4s,v26.4s,#24
1230	ror	w21,w21,#16
1231	ushr	v15.4s,v27.4s,#24
1232	ror	w17,w17,#16
1233	ushr	v19.4s,v28.4s,#24
1234	ror	w19,w19,#16
1235	ushr	v23.4s,v29.4s,#24
1236	ror	w20,w20,#16
1237	sli	v3.4s,v24.4s,#8
1238	add	w15,w15,w21
1239	sli	v7.4s,v25.4s,#8
1240	add	w16,w16,w17
1241	sli	v11.4s,v26.4s,#8
1242	add	w13,w13,w19
1243	sli	v15.4s,v27.4s,#8
1244	add	w14,w14,w20
1245	sli	v19.4s,v28.4s,#8
1246	eor	w10,w10,w15
1247	sli	v23.4s,v29.4s,#8
1248	eor	w11,w11,w16
1249	add	v2.4s,v2.4s,v3.4s
1250	eor	w12,w12,w13
1251	add	v6.4s,v6.4s,v7.4s
1252	eor	w9,w9,w14
1253	add	v10.4s,v10.4s,v11.4s
1254	ror	w10,w10,#20
1255	add	v14.4s,v14.4s,v15.4s
1256	ror	w11,w11,#20
1257	add	v18.4s,v18.4s,v19.4s
1258	ror	w12,w12,#20
1259	add	v22.4s,v22.4s,v23.4s
1260	ror	w9,w9,#20
1261	eor	v24.16b,v1.16b,v2.16b
1262	add	w5,w5,w10
1263	eor	v25.16b,v5.16b,v6.16b
1264	add	w6,w6,w11
1265	eor	v26.16b,v9.16b,v10.16b
1266	add	w7,w7,w12
1267	eor	v27.16b,v13.16b,v14.16b
1268	add	w8,w8,w9
1269	eor	v28.16b,v17.16b,v18.16b
1270	eor	w21,w21,w5
1271	eor	v29.16b,v21.16b,v22.16b
1272	eor	w17,w17,w6
1273	ushr	v1.4s,v24.4s,#25
1274	eor	w19,w19,w7
1275	ushr	v5.4s,v25.4s,#25
1276	eor	w20,w20,w8
1277	ushr	v9.4s,v26.4s,#25
1278	ror	w21,w21,#24
1279	ushr	v13.4s,v27.4s,#25
1280	ror	w17,w17,#24
1281	ushr	v17.4s,v28.4s,#25
1282	ror	w19,w19,#24
1283	ushr	v21.4s,v29.4s,#25
1284	ror	w20,w20,#24
1285	sli	v1.4s,v24.4s,#7
1286	add	w15,w15,w21
1287	sli	v5.4s,v25.4s,#7
1288	add	w16,w16,w17
1289	sli	v9.4s,v26.4s,#7
1290	add	w13,w13,w19
1291	sli	v13.4s,v27.4s,#7
1292	add	w14,w14,w20
1293	sli	v17.4s,v28.4s,#7
1294	eor	w10,w10,w15
1295	sli	v21.4s,v29.4s,#7
1296	eor	w11,w11,w16
1297	ext	v2.16b,v2.16b,v2.16b,#8
1298	eor	w12,w12,w13
1299	ext	v6.16b,v6.16b,v6.16b,#8
1300	eor	w9,w9,w14
1301	ext	v10.16b,v10.16b,v10.16b,#8
1302	ror	w10,w10,#25
1303	ext	v14.16b,v14.16b,v14.16b,#8
1304	ror	w11,w11,#25
1305	ext	v18.16b,v18.16b,v18.16b,#8
1306	ror	w12,w12,#25
1307	ext	v22.16b,v22.16b,v22.16b,#8
1308	ror	w9,w9,#25
1309	ext	v3.16b,v3.16b,v3.16b,#4
1310	ext	v7.16b,v7.16b,v7.16b,#4
1311	ext	v11.16b,v11.16b,v11.16b,#4
1312	ext	v15.16b,v15.16b,v15.16b,#4
1313	ext	v19.16b,v19.16b,v19.16b,#4
1314	ext	v23.16b,v23.16b,v23.16b,#4
1315	ext	v1.16b,v1.16b,v1.16b,#12
1316	ext	v5.16b,v5.16b,v5.16b,#12
1317	ext	v9.16b,v9.16b,v9.16b,#12
1318	ext	v13.16b,v13.16b,v13.16b,#12
1319	ext	v17.16b,v17.16b,v17.16b,#12
1320	ext	v21.16b,v21.16b,v21.16b,#12
1321	cbnz	x4,.Loop_upper_neon
1322
// Reached when .Loop_upper_neon falls through.  Finishes one scalar 64-byte
// block: adds the key block (held two 32-bit words per register in
// x22-x28,x30) to the working words w5-w21, packs the sixteen words into
// eight 64-bit registers, XORs them with 64 bytes read from [x1], and writes
// 64 bytes to [x0].  It then bumps the counter in x28, unpacks the key block
// again for the next scalar block, and reloads the round counter x4.
// The NEON registers are not touched in this span.
1323	add	w5,w5,w22		// accumulate key block
1324	add	x6,x6,x22,lsr#32		// odd word: add the high half; bits above 31 are dropped by the lsl#32 pack below
1325	add	w7,w7,w23
1326	add	x8,x8,x23,lsr#32
1327	add	w9,w9,w24
1328	add	x10,x10,x24,lsr#32
1329	add	w11,w11,w25
1330	add	x12,x12,x25,lsr#32
1331	add	w13,w13,w26
1332	add	x14,x14,x26,lsr#32
1333	add	w15,w15,w27
1334	add	x16,x16,x27,lsr#32
1335	add	w17,w17,w28
1336	add	x19,x19,x28,lsr#32
1337	add	w20,w20,w30
1338	add	x21,x21,x30,lsr#32
1339
1340	add	x5,x5,x6,lsl#32	// pack: even word in bits 0-31 (upper half is zero after the w-form add), odd word in bits 32-63
1341	add	x7,x7,x8,lsl#32
1342	ldp	x6,x8,[x1,#0]		// load input (x6/x8 are free once packed)
1343	add	x9,x9,x10,lsl#32
1344	add	x11,x11,x12,lsl#32
1345	ldp	x10,x12,[x1,#16]
1346	add	x13,x13,x14,lsl#32
1347	add	x15,x15,x16,lsl#32
1348	ldp	x14,x16,[x1,#32]
1349	add	x17,x17,x19,lsl#32
1350	add	x20,x20,x21,lsl#32
1351	ldp	x19,x21,[x1,#48]
1352	add	x1,x1,#64		// advance input pointer past the 64 bytes just loaded
1353#ifdef	__AARCH64EB__
1354	rev	x5,x5		// big-endian builds only: byte-reverse each packed 64-bit register before the XOR
1355	rev	x7,x7
1356	rev	x9,x9
1357	rev	x11,x11
1358	rev	x13,x13
1359	rev	x15,x15
1360	rev	x17,x17
1361	rev	x20,x20
1362#endif
1363	eor	x5,x5,x6		// XOR the packed block with the loaded input
1364	eor	x7,x7,x8
1365	eor	x9,x9,x10
1366	eor	x11,x11,x12
1367	eor	x13,x13,x14
1368	eor	x15,x15,x16
1369	eor	x17,x17,x19
1370	eor	x20,x20,x21
1371
1372	stp	x5,x7,[x0,#0]		// store output
1373	add	x28,x28,#1			// increment counter
1374	mov	w5,w22			// unpack key block
1375	lsr	x6,x22,#32		// odd word = high half of the packed register
1376	stp	x9,x11,[x0,#16]
1377	mov	w7,w23
1378	lsr	x8,x23,#32
1379	stp	x13,x15,[x0,#32]
1380	mov	w9,w24
1381	lsr	x10,x24,#32
1382	stp	x17,x20,[x0,#48]
1383	add	x0,x0,#64		// advance output pointer past the 64 bytes just stored
1384	mov	w11,w25
1385	lsr	x12,x25,#32
1386	mov	w13,w26
1387	lsr	x14,x26,#32
1388	mov	w15,w27
1389	lsr	x16,x27,#32
1390	mov	w17,w28		// picks up the counter value incremented above
1391	lsr	x19,x28,#32
1392	mov	w20,w30
1393	lsr	x21,x30,#32
1394
1395	mov	x4,#5		// iteration count for .Loop_lower_neon
// Round loop, executed x4 times (x4 is loaded with 5 just before the label).
// Scalar and NEON instructions are interleaved line by line.
//
// Scalar state: sixteen 32-bit words in w5-w17,w19-w21.  Each pass runs two
// column rounds (quarter-round groups 5/9/13/17, 6/10/14/19, 7/11/15/20,
// 8/12/16/21) and two diagonal rounds (5/10/15/21, 6/11/16/17, 7/12/13/19,
// 8/9/14/20).  "ror #16/#20/#24/#25" are left-rotations by 16/12/8/7.
//
// NEON state: six 4x4 states, one row per register (v0-v3, v4-v7, v8-v11,
// v12-v15, v16-v19, v20-v23), with v24-v29 as temporaries.  Each pass runs
// one column round, rotates rows b/c/d with ext, runs one more round on the
// rotated rows (the diagonal round), then rotates the rows back.
1396.Loop_lower_neon:
1397	sub	x4,x4,#1		// count down; tested by cbnz at the bottom of the loop
1398	add	v0.4s,v0.4s,v1.4s		// NEON round 1: a += b (all six states)
1399	add	w5,w5,w9		// scalar column round 1: a += b
1400	add	v4.4s,v4.4s,v5.4s
1401	add	w6,w6,w10
1402	add	v8.4s,v8.4s,v9.4s
1403	add	w7,w7,w11
1404	add	v12.4s,v12.4s,v13.4s
1405	add	w8,w8,w12
1406	add	v16.4s,v16.4s,v17.4s
1407	eor	w17,w17,w5		// scalar: d ^= a
1408	add	v20.4s,v20.4s,v21.4s
1409	eor	w19,w19,w6
1410	eor	v3.16b,v3.16b,v0.16b		// NEON: d ^= a
1411	eor	w20,w20,w7
1412	eor	v7.16b,v7.16b,v4.16b
1413	eor	w21,w21,w8
1414	eor	v11.16b,v11.16b,v8.16b
1415	ror	w17,w17,#16		// scalar: d = rotl(d,16)
1416	eor	v15.16b,v15.16b,v12.16b
1417	ror	w19,w19,#16
1418	eor	v19.16b,v19.16b,v16.16b
1419	ror	w20,w20,#16
1420	eor	v23.16b,v23.16b,v20.16b
1421	ror	w21,w21,#16
1422	rev32	v3.8h,v3.8h		// NEON: d = rotl(d,16), done by swapping the halfwords of each 32-bit lane
1423	add	w13,w13,w17		// scalar: c += d
1424	rev32	v7.8h,v7.8h
1425	add	w14,w14,w19
1426	rev32	v11.8h,v11.8h
1427	add	w15,w15,w20
1428	rev32	v15.8h,v15.8h
1429	add	w16,w16,w21
1430	rev32	v19.8h,v19.8h
1431	eor	w9,w9,w13		// scalar: b ^= c
1432	rev32	v23.8h,v23.8h
1433	eor	w10,w10,w14
1434	add	v2.4s,v2.4s,v3.4s		// NEON: c += d
1435	eor	w11,w11,w15
1436	add	v6.4s,v6.4s,v7.4s
1437	eor	w12,w12,w16
1438	add	v10.4s,v10.4s,v11.4s
1439	ror	w9,w9,#20		// scalar: b = rotl(b,12)
1440	add	v14.4s,v14.4s,v15.4s
1441	ror	w10,w10,#20
1442	add	v18.4s,v18.4s,v19.4s
1443	ror	w11,w11,#20
1444	add	v22.4s,v22.4s,v23.4s
1445	ror	w12,w12,#20
1446	eor	v24.16b,v1.16b,v2.16b		// NEON: t = b ^ c, kept in scratch v24-v29
1447	add	w5,w5,w9		// scalar: a += b
1448	eor	v25.16b,v5.16b,v6.16b
1449	add	w6,w6,w10
1450	eor	v26.16b,v9.16b,v10.16b
1451	add	w7,w7,w11
1452	eor	v27.16b,v13.16b,v14.16b
1453	add	w8,w8,w12
1454	eor	v28.16b,v17.16b,v18.16b
1455	eor	w17,w17,w5		// scalar: d ^= a
1456	eor	v29.16b,v21.16b,v22.16b
1457	eor	w19,w19,w6
1458	ushr	v1.4s,v24.4s,#20		// NEON: b = t >> 20 ... (completed by the sli #12 below)
1459	eor	w20,w20,w7
1460	ushr	v5.4s,v25.4s,#20
1461	eor	w21,w21,w8
1462	ushr	v9.4s,v26.4s,#20
1463	ror	w17,w17,#24		// scalar: d = rotl(d,8)
1464	ushr	v13.4s,v27.4s,#20
1465	ror	w19,w19,#24
1466	ushr	v17.4s,v28.4s,#20
1467	ror	w20,w20,#24
1468	ushr	v21.4s,v29.4s,#20
1469	ror	w21,w21,#24
1470	sli	v1.4s,v24.4s,#12		// NEON: ... insert t << 12, giving b = rotl(t,12)
1471	add	w13,w13,w17		// scalar: c += d
1472	sli	v5.4s,v25.4s,#12
1473	add	w14,w14,w19
1474	sli	v9.4s,v26.4s,#12
1475	add	w15,w15,w20
1476	sli	v13.4s,v27.4s,#12
1477	add	w16,w16,w21
1478	sli	v17.4s,v28.4s,#12
1479	eor	w9,w9,w13		// scalar: b ^= c
1480	sli	v21.4s,v29.4s,#12
1481	eor	w10,w10,w14
1482	add	v0.4s,v0.4s,v1.4s		// NEON: a += b
1483	eor	w11,w11,w15
1484	add	v4.4s,v4.4s,v5.4s
1485	eor	w12,w12,w16
1486	add	v8.4s,v8.4s,v9.4s
1487	ror	w9,w9,#25		// scalar: b = rotl(b,7); column round 1 complete
1488	add	v12.4s,v12.4s,v13.4s
1489	ror	w10,w10,#25
1490	add	v16.4s,v16.4s,v17.4s
1491	ror	w11,w11,#25
1492	add	v20.4s,v20.4s,v21.4s
1493	ror	w12,w12,#25
1494	eor	v24.16b,v3.16b,v0.16b		// NEON: t = d ^ a
1495	add	w5,w5,w10		// scalar diagonal round 1: same steps on the 5/10/15/21 ... groups
1496	eor	v25.16b,v7.16b,v4.16b
1497	add	w6,w6,w11
1498	eor	v26.16b,v11.16b,v8.16b
1499	add	w7,w7,w12
1500	eor	v27.16b,v15.16b,v12.16b
1501	add	w8,w8,w9
1502	eor	v28.16b,v19.16b,v16.16b
1503	eor	w21,w21,w5
1504	eor	v29.16b,v23.16b,v20.16b
1505	eor	w17,w17,w6
1506	ushr	v3.4s,v24.4s,#24		// NEON: d = rotl(t,8) via ushr #24 + sli #8
1507	eor	w19,w19,w7
1508	ushr	v7.4s,v25.4s,#24
1509	eor	w20,w20,w8
1510	ushr	v11.4s,v26.4s,#24
1511	ror	w21,w21,#16
1512	ushr	v15.4s,v27.4s,#24
1513	ror	w17,w17,#16
1514	ushr	v19.4s,v28.4s,#24
1515	ror	w19,w19,#16
1516	ushr	v23.4s,v29.4s,#24
1517	ror	w20,w20,#16
1518	sli	v3.4s,v24.4s,#8
1519	add	w15,w15,w21
1520	sli	v7.4s,v25.4s,#8
1521	add	w16,w16,w17
1522	sli	v11.4s,v26.4s,#8
1523	add	w13,w13,w19
1524	sli	v15.4s,v27.4s,#8
1525	add	w14,w14,w20
1526	sli	v19.4s,v28.4s,#8
1527	eor	w10,w10,w15
1528	sli	v23.4s,v29.4s,#8
1529	eor	w11,w11,w16
1530	add	v2.4s,v2.4s,v3.4s		// NEON: c += d
1531	eor	w12,w12,w13
1532	add	v6.4s,v6.4s,v7.4s
1533	eor	w9,w9,w14
1534	add	v10.4s,v10.4s,v11.4s
1535	ror	w10,w10,#20
1536	add	v14.4s,v14.4s,v15.4s
1537	ror	w11,w11,#20
1538	add	v18.4s,v18.4s,v19.4s
1539	ror	w12,w12,#20
1540	add	v22.4s,v22.4s,v23.4s
1541	ror	w9,w9,#20
1542	eor	v24.16b,v1.16b,v2.16b		// NEON: t = b ^ c
1543	add	w5,w5,w10
1544	eor	v25.16b,v5.16b,v6.16b
1545	add	w6,w6,w11
1546	eor	v26.16b,v9.16b,v10.16b
1547	add	w7,w7,w12
1548	eor	v27.16b,v13.16b,v14.16b
1549	add	w8,w8,w9
1550	eor	v28.16b,v17.16b,v18.16b
1551	eor	w21,w21,w5
1552	eor	v29.16b,v21.16b,v22.16b
1553	eor	w17,w17,w6
1554	ushr	v1.4s,v24.4s,#25		// NEON: b = rotl(t,7) via ushr #25 + sli #7
1555	eor	w19,w19,w7
1556	ushr	v5.4s,v25.4s,#25
1557	eor	w20,w20,w8
1558	ushr	v9.4s,v26.4s,#25
1559	ror	w21,w21,#24
1560	ushr	v13.4s,v27.4s,#25
1561	ror	w17,w17,#24
1562	ushr	v17.4s,v28.4s,#25
1563	ror	w19,w19,#24
1564	ushr	v21.4s,v29.4s,#25
1565	ror	w20,w20,#24
1566	sli	v1.4s,v24.4s,#7
1567	add	w15,w15,w21
1568	sli	v5.4s,v25.4s,#7
1569	add	w16,w16,w17
1570	sli	v9.4s,v26.4s,#7
1571	add	w13,w13,w19
1572	sli	v13.4s,v27.4s,#7
1573	add	w14,w14,w20
1574	sli	v17.4s,v28.4s,#7
1575	eor	w10,w10,w15
1576	sli	v21.4s,v29.4s,#7
1577	eor	w11,w11,w16
1578	ext	v2.16b,v2.16b,v2.16b,#8		// NEON: rotate row c by 8 bytes (two lanes)
1579	eor	w12,w12,w13
1580	ext	v6.16b,v6.16b,v6.16b,#8
1581	eor	w9,w9,w14
1582	ext	v10.16b,v10.16b,v10.16b,#8
1583	ror	w10,w10,#25		// scalar diagonal round 1 complete after these four rotates
1584	ext	v14.16b,v14.16b,v14.16b,#8
1585	ror	w11,w11,#25
1586	ext	v18.16b,v18.16b,v18.16b,#8
1587	ror	w12,w12,#25
1588	ext	v22.16b,v22.16b,v22.16b,#8
1589	ror	w9,w9,#25
1590	ext	v3.16b,v3.16b,v3.16b,#12		// NEON: rotate row d by 12 bytes
1591	ext	v7.16b,v7.16b,v7.16b,#12
1592	ext	v11.16b,v11.16b,v11.16b,#12
1593	ext	v15.16b,v15.16b,v15.16b,#12
1594	ext	v19.16b,v19.16b,v19.16b,#12
1595	ext	v23.16b,v23.16b,v23.16b,#12
1596	ext	v1.16b,v1.16b,v1.16b,#4		// NEON: rotate row b by 4 bytes; the next round now works on diagonals
1597	ext	v5.16b,v5.16b,v5.16b,#4
1598	ext	v9.16b,v9.16b,v9.16b,#4
1599	ext	v13.16b,v13.16b,v13.16b,#4
1600	ext	v17.16b,v17.16b,v17.16b,#4
1601	ext	v21.16b,v21.16b,v21.16b,#4
1602	add	v0.4s,v0.4s,v1.4s		// NEON round 2 (on the rotated rows): a += b
1603	add	w5,w5,w9		// scalar column round 2: a += b
1604	add	v4.4s,v4.4s,v5.4s
1605	add	w6,w6,w10
1606	add	v8.4s,v8.4s,v9.4s
1607	add	w7,w7,w11
1608	add	v12.4s,v12.4s,v13.4s
1609	add	w8,w8,w12
1610	add	v16.4s,v16.4s,v17.4s
1611	eor	w17,w17,w5
1612	add	v20.4s,v20.4s,v21.4s
1613	eor	w19,w19,w6
1614	eor	v3.16b,v3.16b,v0.16b
1615	eor	w20,w20,w7
1616	eor	v7.16b,v7.16b,v4.16b
1617	eor	w21,w21,w8
1618	eor	v11.16b,v11.16b,v8.16b
1619	ror	w17,w17,#16
1620	eor	v15.16b,v15.16b,v12.16b
1621	ror	w19,w19,#16
1622	eor	v19.16b,v19.16b,v16.16b
1623	ror	w20,w20,#16
1624	eor	v23.16b,v23.16b,v20.16b
1625	ror	w21,w21,#16
1626	rev32	v3.8h,v3.8h
1627	add	w13,w13,w17
1628	rev32	v7.8h,v7.8h
1629	add	w14,w14,w19
1630	rev32	v11.8h,v11.8h
1631	add	w15,w15,w20
1632	rev32	v15.8h,v15.8h
1633	add	w16,w16,w21
1634	rev32	v19.8h,v19.8h
1635	eor	w9,w9,w13
1636	rev32	v23.8h,v23.8h
1637	eor	w10,w10,w14
1638	add	v2.4s,v2.4s,v3.4s
1639	eor	w11,w11,w15
1640	add	v6.4s,v6.4s,v7.4s
1641	eor	w12,w12,w16
1642	add	v10.4s,v10.4s,v11.4s
1643	ror	w9,w9,#20
1644	add	v14.4s,v14.4s,v15.4s
1645	ror	w10,w10,#20
1646	add	v18.4s,v18.4s,v19.4s
1647	ror	w11,w11,#20
1648	add	v22.4s,v22.4s,v23.4s
1649	ror	w12,w12,#20
1650	eor	v24.16b,v1.16b,v2.16b
1651	add	w5,w5,w9
1652	eor	v25.16b,v5.16b,v6.16b
1653	add	w6,w6,w10
1654	eor	v26.16b,v9.16b,v10.16b
1655	add	w7,w7,w11
1656	eor	v27.16b,v13.16b,v14.16b
1657	add	w8,w8,w12
1658	eor	v28.16b,v17.16b,v18.16b
1659	eor	w17,w17,w5
1660	eor	v29.16b,v21.16b,v22.16b
1661	eor	w19,w19,w6
1662	ushr	v1.4s,v24.4s,#20
1663	eor	w20,w20,w7
1664	ushr	v5.4s,v25.4s,#20
1665	eor	w21,w21,w8
1666	ushr	v9.4s,v26.4s,#20
1667	ror	w17,w17,#24
1668	ushr	v13.4s,v27.4s,#20
1669	ror	w19,w19,#24
1670	ushr	v17.4s,v28.4s,#20
1671	ror	w20,w20,#24
1672	ushr	v21.4s,v29.4s,#20
1673	ror	w21,w21,#24
1674	sli	v1.4s,v24.4s,#12
1675	add	w13,w13,w17
1676	sli	v5.4s,v25.4s,#12
1677	add	w14,w14,w19
1678	sli	v9.4s,v26.4s,#12
1679	add	w15,w15,w20
1680	sli	v13.4s,v27.4s,#12
1681	add	w16,w16,w21
1682	sli	v17.4s,v28.4s,#12
1683	eor	w9,w9,w13
1684	sli	v21.4s,v29.4s,#12
1685	eor	w10,w10,w14
1686	add	v0.4s,v0.4s,v1.4s
1687	eor	w11,w11,w15
1688	add	v4.4s,v4.4s,v5.4s
1689	eor	w12,w12,w16
1690	add	v8.4s,v8.4s,v9.4s
1691	ror	w9,w9,#25
1692	add	v12.4s,v12.4s,v13.4s
1693	ror	w10,w10,#25
1694	add	v16.4s,v16.4s,v17.4s
1695	ror	w11,w11,#25
1696	add	v20.4s,v20.4s,v21.4s
1697	ror	w12,w12,#25
1698	eor	v24.16b,v3.16b,v0.16b
1699	add	w5,w5,w10		// scalar diagonal round 2
1700	eor	v25.16b,v7.16b,v4.16b
1701	add	w6,w6,w11
1702	eor	v26.16b,v11.16b,v8.16b
1703	add	w7,w7,w12
1704	eor	v27.16b,v15.16b,v12.16b
1705	add	w8,w8,w9
1706	eor	v28.16b,v19.16b,v16.16b
1707	eor	w21,w21,w5
1708	eor	v29.16b,v23.16b,v20.16b
1709	eor	w17,w17,w6
1710	ushr	v3.4s,v24.4s,#24
1711	eor	w19,w19,w7
1712	ushr	v7.4s,v25.4s,#24
1713	eor	w20,w20,w8
1714	ushr	v11.4s,v26.4s,#24
1715	ror	w21,w21,#16
1716	ushr	v15.4s,v27.4s,#24
1717	ror	w17,w17,#16
1718	ushr	v19.4s,v28.4s,#24
1719	ror	w19,w19,#16
1720	ushr	v23.4s,v29.4s,#24
1721	ror	w20,w20,#16
1722	sli	v3.4s,v24.4s,#8
1723	add	w15,w15,w21
1724	sli	v7.4s,v25.4s,#8
1725	add	w16,w16,w17
1726	sli	v11.4s,v26.4s,#8
1727	add	w13,w13,w19
1728	sli	v15.4s,v27.4s,#8
1729	add	w14,w14,w20
1730	sli	v19.4s,v28.4s,#8
1731	eor	w10,w10,w15
1732	sli	v23.4s,v29.4s,#8
1733	eor	w11,w11,w16
1734	add	v2.4s,v2.4s,v3.4s
1735	eor	w12,w12,w13
1736	add	v6.4s,v6.4s,v7.4s
1737	eor	w9,w9,w14
1738	add	v10.4s,v10.4s,v11.4s
1739	ror	w10,w10,#20
1740	add	v14.4s,v14.4s,v15.4s
1741	ror	w11,w11,#20
1742	add	v18.4s,v18.4s,v19.4s
1743	ror	w12,w12,#20
1744	add	v22.4s,v22.4s,v23.4s
1745	ror	w9,w9,#20
1746	eor	v24.16b,v1.16b,v2.16b
1747	add	w5,w5,w10
1748	eor	v25.16b,v5.16b,v6.16b
1749	add	w6,w6,w11
1750	eor	v26.16b,v9.16b,v10.16b
1751	add	w7,w7,w12
1752	eor	v27.16b,v13.16b,v14.16b
1753	add	w8,w8,w9
1754	eor	v28.16b,v17.16b,v18.16b
1755	eor	w21,w21,w5
1756	eor	v29.16b,v21.16b,v22.16b
1757	eor	w17,w17,w6
1758	ushr	v1.4s,v24.4s,#25
1759	eor	w19,w19,w7
1760	ushr	v5.4s,v25.4s,#25
1761	eor	w20,w20,w8
1762	ushr	v9.4s,v26.4s,#25
1763	ror	w21,w21,#24
1764	ushr	v13.4s,v27.4s,#25
1765	ror	w17,w17,#24
1766	ushr	v17.4s,v28.4s,#25
1767	ror	w19,w19,#24
1768	ushr	v21.4s,v29.4s,#25
1769	ror	w20,w20,#24
1770	sli	v1.4s,v24.4s,#7
1771	add	w15,w15,w21
1772	sli	v5.4s,v25.4s,#7
1773	add	w16,w16,w17
1774	sli	v9.4s,v26.4s,#7
1775	add	w13,w13,w19
1776	sli	v13.4s,v27.4s,#7
1777	add	w14,w14,w20
1778	sli	v17.4s,v28.4s,#7
1779	eor	w10,w10,w15
1780	sli	v21.4s,v29.4s,#7
1781	eor	w11,w11,w16
1782	ext	v2.16b,v2.16b,v2.16b,#8		// NEON: rotate row c another 8 bytes (16 in total, back to its original lane order)
1783	eor	w12,w12,w13
1784	ext	v6.16b,v6.16b,v6.16b,#8
1785	eor	w9,w9,w14
1786	ext	v10.16b,v10.16b,v10.16b,#8
1787	ror	w10,w10,#25
1788	ext	v14.16b,v14.16b,v14.16b,#8
1789	ror	w11,w11,#25
1790	ext	v18.16b,v18.16b,v18.16b,#8
1791	ror	w12,w12,#25
1792	ext	v22.16b,v22.16b,v22.16b,#8
1793	ror	w9,w9,#25
1794	ext	v3.16b,v3.16b,v3.16b,#4		// NEON: row d, 12 + 4 = 16 bytes, restoring lane order
1795	ext	v7.16b,v7.16b,v7.16b,#4
1796	ext	v11.16b,v11.16b,v11.16b,#4
1797	ext	v15.16b,v15.16b,v15.16b,#4
1798	ext	v19.16b,v19.16b,v19.16b,#4
1799	ext	v23.16b,v23.16b,v23.16b,#4
1800	ext	v1.16b,v1.16b,v1.16b,#12		// NEON: row b, 4 + 12 = 16 bytes, restoring lane order
1801	ext	v5.16b,v5.16b,v5.16b,#12
1802	ext	v9.16b,v9.16b,v9.16b,#12
1803	ext	v13.16b,v13.16b,v13.16b,#12
1804	ext	v17.16b,v17.16b,v17.16b,#12
1805	ext	v21.16b,v21.16b,v21.16b,#12
1806	cbnz	x4,.Loop_lower_neon		// repeat until the pass counter reaches zero
1807
// Reached when .Loop_lower_neon falls through.  Finishes one scalar block
// (w5-w21) and six NEON blocks (v0-v3 ... v20-v23) in interleaved fashion:
//   * adds the key block back in (scalar: x22-x28,x30; NEON: rows reloaded
//     from the off-load area at [sp,#0..#95] into v24-v29, plus v30/v31),
//   * XORs with input from [x1] and stores to [x0]: 64 bytes through the
//     scalar path, then 6 x 64 bytes through ld1/st1,
//   * advances the counters (x28 by 7; vector lanes in v27-v30 by 8),
//   * then either loops back to .Loop_outer_512_neon or unwinds the extra
//     128 bytes of stack and hands the remainder to another code path.
// NOTE(review): the meaning of the individual rows (v24 = constants,
// v25/v26 = key, v27-v30 = per-block counter rows, v31 = lane constant 4) is
// inferred from how they are used here; they are set up outside this span.
1808	add	w5,w5,w22		// accumulate key block
1809	ldp	q24,q25,[sp,#0]		// reload saved rows from the off-load area on the stack
1810	add	x6,x6,x22,lsr#32		// odd word: bits above 31 are dropped by the lsl#32 pack below
1811	ldp	q26,q27,[sp,#32]
1812	add	w7,w7,w23
1813	ldp	q28,q29,[sp,#64]
1814	add	x8,x8,x23,lsr#32
1815	add	v0.4s,v0.4s,v24.4s		// NEON row 0 of all six blocks += v24
1816	add	w9,w9,w24
1817	add	v4.4s,v4.4s,v24.4s
1818	add	x10,x10,x24,lsr#32
1819	add	v8.4s,v8.4s,v24.4s
1820	add	w11,w11,w25
1821	add	v12.4s,v12.4s,v24.4s
1822	add	x12,x12,x25,lsr#32
1823	add	v16.4s,v16.4s,v24.4s
1824	add	w13,w13,w26
1825	add	v20.4s,v20.4s,v24.4s
1826	add	x14,x14,x26,lsr#32
1827	add	v2.4s,v2.4s,v26.4s		// NEON row 2 of all six blocks += v26
1828	add	w15,w15,w27
1829	add	v6.4s,v6.4s,v26.4s
1830	add	x16,x16,x27,lsr#32
1831	add	v10.4s,v10.4s,v26.4s
1832	add	w17,w17,w28
1833	add	v14.4s,v14.4s,v26.4s
1834	add	x19,x19,x28,lsr#32
1835	add	v18.4s,v18.4s,v26.4s
1836	add	w20,w20,w30
1837	add	v22.4s,v22.4s,v26.4s
1838	add	x21,x21,x30,lsr#32
1839	add	v19.4s,v19.4s,v31.4s			// +4
1840	add	x5,x5,x6,lsl#32	// pack
1841	add	v23.4s,v23.4s,v31.4s			// +4
1842	add	x7,x7,x8,lsl#32
1843	add	v3.4s,v3.4s,v27.4s		// NEON row 3: blocks 0-3 each add their own row (v27, v28, v29, v30)
1844	ldp	x6,x8,[x1,#0]		// load input
1845	add	v7.4s,v7.4s,v28.4s
1846	add	x9,x9,x10,lsl#32
1847	add	v11.4s,v11.4s,v29.4s
1848	add	x11,x11,x12,lsl#32
1849	add	v15.4s,v15.4s,v30.4s
1850	ldp	x10,x12,[x1,#16]
1851	add	v19.4s,v19.4s,v27.4s		// blocks 4 and 5 reuse v27/v28 on top of the "+4" added above
1852	add	x13,x13,x14,lsl#32
1853	add	v23.4s,v23.4s,v28.4s
1854	add	x15,x15,x16,lsl#32
1855	add	v1.4s,v1.4s,v25.4s		// NEON row 1 of all six blocks += v25
1856	ldp	x14,x16,[x1,#32]
1857	add	v5.4s,v5.4s,v25.4s
1858	add	x17,x17,x19,lsl#32
1859	add	v9.4s,v9.4s,v25.4s
1860	add	x20,x20,x21,lsl#32
1861	add	v13.4s,v13.4s,v25.4s
1862	ldp	x19,x21,[x1,#48]
1863	add	v17.4s,v17.4s,v25.4s
1864	add	x1,x1,#64		// input pointer past the scalar block's 64 bytes
1865	add	v21.4s,v21.4s,v25.4s
1866
1867#ifdef	__AARCH64EB__
1868	rev	x5,x5		// big-endian builds only: byte-reverse each packed 64-bit register before the XOR
1869	rev	x7,x7
1870	rev	x9,x9
1871	rev	x11,x11
1872	rev	x13,x13
1873	rev	x15,x15
1874	rev	x17,x17
1875	rev	x20,x20
1876#endif
1877	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64		// input for NEON block 0; v24-v27 become input buffers (reloaded from the stack further down)
1878	eor	x5,x5,x6		// scalar block ^= input
1879	eor	x7,x7,x8
1880	eor	x9,x9,x10
1881	eor	x11,x11,x12
1882	eor	x13,x13,x14
1883	eor	v0.16b,v0.16b,v24.16b		// NEON block 0 ^= input
1884	eor	x15,x15,x16
1885	eor	v1.16b,v1.16b,v25.16b
1886	eor	x17,x17,x19
1887	eor	v2.16b,v2.16b,v26.16b
1888	eor	x20,x20,x21
1889	eor	v3.16b,v3.16b,v27.16b
1890	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64		// input for NEON block 1
1891
1892	stp	x5,x7,[x0,#0]		// store output
1893	add	x28,x28,#7			// increment counter (this scalar block plus the six NEON blocks)
1894	stp	x9,x11,[x0,#16]
1895	stp	x13,x15,[x0,#32]
1896	stp	x17,x20,[x0,#48]
1897	add	x0,x0,#64		// output pointer past the scalar block
1898	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64		// store NEON block 0
1899
1900	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64		// v0-v3 are free now; reuse as input buffer for block 2
1901	eor	v4.16b,v4.16b,v24.16b
1902	eor	v5.16b,v5.16b,v25.16b
1903	eor	v6.16b,v6.16b,v26.16b
1904	eor	v7.16b,v7.16b,v27.16b
1905	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64		// store NEON block 1
1906
1907	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64		// input for block 3
1908	eor	v8.16b,v8.16b,v0.16b
1909	ldp	q24,q25,[sp,#0]		// restore v24/v25 from the off-load area (they were used as input buffers above)
1910	eor	v9.16b,v9.16b,v1.16b
1911	ldp	q26,q27,[sp,#32]		// likewise v26/v27
1912	eor	v10.16b,v10.16b,v2.16b
1913	eor	v11.16b,v11.16b,v3.16b
1914	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64		// store NEON block 2
1915
1916	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64		// input for block 4
1917	eor	v12.16b,v12.16b,v4.16b
1918	eor	v13.16b,v13.16b,v5.16b
1919	eor	v14.16b,v14.16b,v6.16b
1920	eor	v15.16b,v15.16b,v7.16b
1921	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64		// store NEON block 3
1922
1923	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64		// input for block 5
1924	eor	v16.16b,v16.16b,v8.16b
1925	eor	v17.16b,v17.16b,v9.16b
1926	eor	v18.16b,v18.16b,v10.16b
1927	eor	v19.16b,v19.16b,v11.16b
1928	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64		// store NEON block 4
1929
1930	shl	v0.4s,v31.4s,#1			// 4 -> 8
1931	eor	v20.16b,v20.16b,v12.16b
1932	eor	v21.16b,v21.16b,v13.16b
1933	eor	v22.16b,v22.16b,v14.16b
1934	eor	v23.16b,v23.16b,v15.16b
1935	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64		// store NEON block 5
1936
1937	add	v27.4s,v27.4s,v0.4s			// += 8
1938	add	v28.4s,v28.4s,v0.4s
1939	add	v29.4s,v29.4s,v0.4s
1940	add	v30.4s,v30.4s,v0.4s
1941
1942	b.hs	.Loop_outer_512_neon		// NOTE(review): no instruction in this span writes the flags; presumably they come from a subs on x2 at the top of the outer loop - confirm
1943
1944	adds	x2,x2,#512		// add 512 back to the length; Z is set when nothing remains (tested by b.eq below)
1945	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1946
1947	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1948	ldp	d10,d11,[sp,#128+16]
1949	ldp	d12,d13,[sp,#128+32]
1950	ldp	d14,d15,[sp,#128+48]
1951
1952	stp	q24,q31,[sp,#0]		// wipe off-load area
1953	stp	q24,q31,[sp,#32]
1954	stp	q24,q31,[sp,#64]
1955
1956	b.eq	.Ldone_512_neon		// uses Z from the adds above; ldp/stp/ushr leave the flags alone
1957
1958	cmp	x2,#192		// enough bytes left for the code at .Loop_outer_neon?
1959	sub	v27.4s,v27.4s,v0.4s			// -= 1
1960	sub	v28.4s,v28.4s,v0.4s
1961	sub	v29.4s,v29.4s,v0.4s
1962	add	sp,sp,#128		// release the 128-byte off-load area; the d8-d15 save area is at [sp] again
1963	b.hs	.Loop_outer_neon		// >= 192 bytes remain (flags from the cmp above)
1964
1965	eor	v25.16b,v25.16b,v25.16b		// fewer than 192 bytes remain: zero v25-v30 before the scalar code takes over
1966	eor	v26.16b,v26.16b,v26.16b
1967	eor	v27.16b,v27.16b,v27.16b
1968	eor	v28.16b,v28.16b,v28.16b
1969	eor	v29.16b,v29.16b,v29.16b
1970	eor	v30.16b,v30.16b,v30.16b
1971	b	.Loop_outer		// continue in the scalar block loop of ChaCha20_ctr32
1972
// Function epilogue, reached when no input bytes remain.  Restores the
// callee-saved general registers x19-x28 from the frame addressed by x29,
// releases the stack, restores the frame pointer and link register, and
// returns.  d8-d15 have already been reloaded before the branch to this label.
1973.Ldone_512_neon:
1974	ldp	x19,x20,[x29,#16]		// x29-relative, so these loads do not depend on the current sp
1975	add	sp,sp,#128+64		// drop the 128-byte off-load area and a further 64 bytes (the d8-d15 save area read at [sp,#128..] before the branch here)
1976	ldp	x21,x22,[x29,#32]
1977	ldp	x23,x24,[x29,#48]
1978	ldp	x25,x26,[x29,#64]
1979	ldp	x27,x28,[x29,#80]
1980	ldp	x29,x30,[sp],#96		// restore frame pointer and link register, then pop the 96-byte register save frame
1981	AARCH64_VALIDATE_LINK_REGISTER		// macro defined outside this file (presumably <ring-core/arm_arch.h>); by its name it checks x30 before the return - confirm there
1982	ret
1983.size	ChaCha20_512_neon,.-ChaCha20_512_neon
1984#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
1985