// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
.text

.section	.rodata
.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
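// (The bytes above are the ASCII string "expand 32-byte k", the ChaCha20 constant.)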
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
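// (The bytes above are the ASCII string "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>".)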
.text
.globl	ChaCha20_ctr32_nohw
.hidden ChaCha20_ctr32_nohw
.type	ChaCha20_ctr32_nohw,@function
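// Scalar one-block-at-a-time implementation. Register roles, as used below
// (SysV AMD64): %rdi = out, %rsi = in, %rdx = length in bytes, %rcx = 32-byte
// key, %r8 = 16-byte counter/nonce block.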
.align	64
ChaCha20_ctr32_nohw:
.cfi_startproc
_CET_ENDBR
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	r15,-56
	subq	$64+24,%rsp
.cfi_adjust_cfa_offset	88
.Lctr32_body:


	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm4


	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp
	jmp	.Loop_outer

.align	32
.Loop_outer:
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
.byte	102,72,15,126,214
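// (The .byte sequence above encodes movq %xmm2,%rsi, emitted as raw bytes for
// old-assembler compatibility.)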
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi
	jmp	.Loop

.align	32
.Loop:
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	.Loop
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3
	movq	64+16(%rsp),%rdi

	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1

	cmpq	$64,%rbp
	jb	.Ltail

	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

.Loop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	.Loop_tail

.Ldone:
	leaq	64+24+48(%rsp),%rsi
	movq	-48(%rsi),%r15
.cfi_restore	r15
	movq	-40(%rsi),%r14
.cfi_restore	r14
	movq	-32(%rsi),%r13
.cfi_restore	r13
	movq	-24(%rsi),%r12
.cfi_restore	r12
	movq	-16(%rsi),%rbp
.cfi_restore	rbp
	movq	-8(%rsi),%rbx
.cfi_restore	rbx
	leaq	(%rsi),%rsp
.cfi_adjust_cfa_offset	-136
.Lno_data:
	ret
.cfi_endproc
.size	ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
.globl	ChaCha20_ctr32_ssse3
.hidden ChaCha20_ctr32_ssse3
.type	ChaCha20_ctr32_ssse3,@function
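// SSSE3 one-block-at-a-time implementation: the 4x4 state lives in
// %xmm0-%xmm3, one row per register; pshufb with the .Lrot16/.Lrot24 masks
// performs the 16- and 8-bit rotates.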
.align	32
ChaCha20_ctr32_ssse3:
.cfi_startproc
_CET_ENDBR
	movq	%rsp,%r9
.cfi_def_cfa_register	r9
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
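// (The .byte sequence above encodes pshufb %xmm6,%xmm3: rotate each 32-bit
// lane left by 16.)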
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
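// (The .byte sequence above encodes pshufb %xmm7,%xmm3: rotate each 32-bit
// lane left by 8.)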
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	.Loop_ssse3
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	.Ltail_ssse3

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

.Loop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	rsp
.Lssse3_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3
.globl	ChaCha20_ctr32_ssse3_4x
.hidden ChaCha20_ctr32_ssse3_4x
.type	ChaCha20_ctr32_ssse3_4x,@function
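// SSSE3 four-block implementation: the state is kept transposed, one state
// word across four blocks per %xmm register, so each outer iteration produces
// 256 bytes of keystream.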
.align	32
ChaCha20_ctr32_ssse3_4x:
.cfi_startproc
_CET_ENDBR
	movq	%rsp,%r9
.cfi_def_cfa_register	r9
	movq	%r10,%r11
	subq	$0x140+8,%rsp
	movdqa	.Lsigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	.Linc(%rip),%xmm0
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	.Loop_enter4x

.align	32
.Loop_outer4x:
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	.Lfour(%rip),%xmm0

.Loop_enter4x:
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)
	jmp	.Loop4x

.align	32
.Loop4x:
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199
.byte	102,15,56,0,207
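// (The two .byte sequences above encode pshufb %xmm7,%xmm0 and
// pshufb %xmm7,%xmm1; the remaining .byte runs in this loop are likewise
// pshufb with the rot16/rot24 masks loaded from (%r10)/(%r11).)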
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198
.byte	102,15,56,0,206
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215
.byte	102,15,56,0,223
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214
.byte	102,15,56,0,222
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223
.byte	102,15,56,0,199
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222
.byte	102,15,56,0,198
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207
.byte	102,15,56,0,215
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206
.byte	102,15,56,0,214
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	.Loop4x

	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	cmpq	$256,%rdx
	jb	.Ltail4x

	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4x

	jmp	.Ldone4x

.Ltail4x:
	cmpq	$192,%rdx
	jae	.L192_or_more4x
	cmpq	$128,%rdx
	jae	.L128_or_more4x
	cmpq	$64,%rdx
	jae	.L64_or_more4x


	xorq	%r10,%r10

	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	.Ldone4x

	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

.Loop_tail4x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4x

.Ldone4x:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	rsp
.L4x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x
.globl	ChaCha20_ctr32_avx2
.hidden ChaCha20_ctr32_avx2
.type	ChaCha20_ctr32_avx2,@function
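// AVX2 implementation: processes eight 64-byte blocks (512 bytes) per full
// iteration, keeping the transposed state in 256-bit %ymm registers; shorter
// inputs fall through to the .Ltail8x cases.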
.align	32
ChaCha20_ctr32_avx2:
.cfi_startproc
_CET_ENDBR
	movq	%rsp,%r9
.cfi_def_cfa_register	r9
	subq	$0x280+8,%rsp
	andq	$-32,%rsp
	vzeroupper

	vbroadcasti128	.Lsigma(%rip),%ymm11
	vbroadcasti128	(%rcx),%ymm3
	vbroadcasti128	16(%rcx),%ymm15
	vbroadcasti128	(%r8),%ymm7
	leaq	256(%rsp),%rcx
	leaq	512(%rsp),%rax
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	vpshufd	$0x00,%ymm11,%ymm8
	vpshufd	$0x55,%ymm11,%ymm9
	vmovdqa	%ymm8,128-256(%rcx)
	vpshufd	$0xaa,%ymm11,%ymm10
	vmovdqa	%ymm9,160-256(%rcx)
	vpshufd	$0xff,%ymm11,%ymm11
	vmovdqa	%ymm10,192-256(%rcx)
	vmovdqa	%ymm11,224-256(%rcx)

	vpshufd	$0x00,%ymm3,%ymm0
	vpshufd	$0x55,%ymm3,%ymm1
	vmovdqa	%ymm0,256-256(%rcx)
	vpshufd	$0xaa,%ymm3,%ymm2
	vmovdqa	%ymm1,288-256(%rcx)
	vpshufd	$0xff,%ymm3,%ymm3
	vmovdqa	%ymm2,320-256(%rcx)
	vmovdqa	%ymm3,352-256(%rcx)

	vpshufd	$0x00,%ymm15,%ymm12
	vpshufd	$0x55,%ymm15,%ymm13
	vmovdqa	%ymm12,384-512(%rax)
	vpshufd	$0xaa,%ymm15,%ymm14
	vmovdqa	%ymm13,416-512(%rax)
	vpshufd	$0xff,%ymm15,%ymm15
	vmovdqa	%ymm14,448-512(%rax)
	vmovdqa	%ymm15,480-512(%rax)

	vpshufd	$0x00,%ymm7,%ymm4
	vpshufd	$0x55,%ymm7,%ymm5
	vpaddd	.Lincy(%rip),%ymm4,%ymm4
	vpshufd	$0xaa,%ymm7,%ymm6
	vmovdqa	%ymm5,544-512(%rax)
	vpshufd	$0xff,%ymm7,%ymm7
	vmovdqa	%ymm6,576-512(%rax)
	vmovdqa	%ymm7,608-512(%rax)

	jmp	.Loop_enter8x

.align	32
.Loop_outer8x:
	vmovdqa	128-256(%rcx),%ymm8
	vmovdqa	160-256(%rcx),%ymm9
	vmovdqa	192-256(%rcx),%ymm10
	vmovdqa	224-256(%rcx),%ymm11
	vmovdqa	256-256(%rcx),%ymm0
	vmovdqa	288-256(%rcx),%ymm1
	vmovdqa	320-256(%rcx),%ymm2
	vmovdqa	352-256(%rcx),%ymm3
	vmovdqa	384-512(%rax),%ymm12
	vmovdqa	416-512(%rax),%ymm13
	vmovdqa	448-512(%rax),%ymm14
	vmovdqa	480-512(%rax),%ymm15
	vmovdqa	512-512(%rax),%ymm4
	vmovdqa	544-512(%rax),%ymm5
	vmovdqa	576-512(%rax),%ymm6
	vmovdqa	608-512(%rax),%ymm7
	vpaddd	.Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
	vmovdqa	%ymm14,64(%rsp)
	vmovdqa	%ymm15,96(%rsp)
	vbroadcasti128	(%r10),%ymm15
	vmovdqa	%ymm4,512-512(%rax)
	movl	$10,%eax
	jmp	.Loop8x

.align	32
.Loop8x:
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$12,%ymm0,%ymm14
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$12,%ymm1,%ymm15
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$7,%ymm0,%ymm15
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$7,%ymm1,%ymm14
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vmovdqa	%ymm12,0(%rsp)
	vmovdqa	%ymm13,32(%rsp)
	vmovdqa	64(%rsp),%ymm12
	vmovdqa	96(%rsp),%ymm13
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$12,%ymm2,%ymm14
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$12,%ymm3,%ymm15
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$7,%ymm2,%ymm15
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$7,%ymm3,%ymm14
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$12,%ymm1,%ymm14
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$12,%ymm2,%ymm15
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$7,%ymm1,%ymm15
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$7,%ymm2,%ymm14
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vmovdqa	%ymm12,64(%rsp)
	vmovdqa	%ymm13,96(%rsp)
	vmovdqa	0(%rsp),%ymm12
	vmovdqa	32(%rsp),%ymm13
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$12,%ymm3,%ymm14
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$12,%ymm0,%ymm15
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$7,%ymm3,%ymm15
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$7,%ymm0,%ymm14
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	decl	%eax
	jnz	.Loop8x

	leaq	512(%rsp),%rax
	vpaddd	128-256(%rcx),%ymm8,%ymm8
	vpaddd	160-256(%rcx),%ymm9,%ymm9
	vpaddd	192-256(%rcx),%ymm10,%ymm10
	vpaddd	224-256(%rcx),%ymm11,%ymm11

	vpunpckldq	%ymm9,%ymm8,%ymm14
	vpunpckldq	%ymm11,%ymm10,%ymm15
	vpunpckhdq	%ymm9,%ymm8,%ymm8
	vpunpckhdq	%ymm11,%ymm10,%ymm10
	vpunpcklqdq	%ymm15,%ymm14,%ymm9
	vpunpckhqdq	%ymm15,%ymm14,%ymm14
	vpunpcklqdq	%ymm10,%ymm8,%ymm11
	vpunpckhqdq	%ymm10,%ymm8,%ymm8
	vpaddd	256-256(%rcx),%ymm0,%ymm0
	vpaddd	288-256(%rcx),%ymm1,%ymm1
	vpaddd	320-256(%rcx),%ymm2,%ymm2
	vpaddd	352-256(%rcx),%ymm3,%ymm3

	vpunpckldq	%ymm1,%ymm0,%ymm10
	vpunpckldq	%ymm3,%ymm2,%ymm15
	vpunpckhdq	%ymm1,%ymm0,%ymm0
	vpunpckhdq	%ymm3,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm10,%ymm1
	vpunpckhqdq	%ymm15,%ymm10,%ymm10
	vpunpcklqdq	%ymm2,%ymm0,%ymm3
	vpunpckhqdq	%ymm2,%ymm0,%ymm0
	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
	vmovdqa	%ymm15,0(%rsp)
	vmovdqa	%ymm9,32(%rsp)
	vmovdqa	64(%rsp),%ymm15
	vmovdqa	96(%rsp),%ymm9

	vpaddd	384-512(%rax),%ymm12,%ymm12
	vpaddd	416-512(%rax),%ymm13,%ymm13
	vpaddd	448-512(%rax),%ymm15,%ymm15
	vpaddd	480-512(%rax),%ymm9,%ymm9

	vpunpckldq	%ymm13,%ymm12,%ymm2
	vpunpckldq	%ymm9,%ymm15,%ymm8
	vpunpckhdq	%ymm13,%ymm12,%ymm12
	vpunpckhdq	%ymm9,%ymm15,%ymm15
	vpunpcklqdq	%ymm8,%ymm2,%ymm13
	vpunpckhqdq	%ymm8,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm12,%ymm9
	vpunpckhqdq	%ymm15,%ymm12,%ymm12
	vpaddd	512-512(%rax),%ymm4,%ymm4
	vpaddd	544-512(%rax),%ymm5,%ymm5
	vpaddd	576-512(%rax),%ymm6,%ymm6
	vpaddd	608-512(%rax),%ymm7,%ymm7

	vpunpckldq	%ymm5,%ymm4,%ymm15
	vpunpckldq	%ymm7,%ymm6,%ymm8
	vpunpckhdq	%ymm5,%ymm4,%ymm4
	vpunpckhdq	%ymm7,%ymm6,%ymm6
	vpunpcklqdq	%ymm8,%ymm15,%ymm5
	vpunpckhqdq	%ymm8,%ymm15,%ymm15
	vpunpcklqdq	%ymm6,%ymm4,%ymm7
	vpunpckhqdq	%ymm6,%ymm4,%ymm4
	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
	vmovdqa	0(%rsp),%ymm6
	vmovdqa	32(%rsp),%ymm12

	cmpq	$512,%rdx
	jb	.Ltail8x

	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm12,%ymm12
	vpxor	32(%rsi),%ymm13,%ymm13
	vpxor	64(%rsi),%ymm10,%ymm10
	vpxor	96(%rsi),%ymm15,%ymm15
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm12,0(%rdi)
	vmovdqu	%ymm13,32(%rdi)
	vmovdqu	%ymm10,64(%rdi)
	vmovdqu	%ymm15,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm14,%ymm14
	vpxor	32(%rsi),%ymm2,%ymm2
	vpxor	64(%rsi),%ymm3,%ymm3
	vpxor	96(%rsi),%ymm7,%ymm7
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm14,0(%rdi)
	vmovdqu	%ymm2,32(%rdi)
	vmovdqu	%ymm3,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm11,%ymm11
	vpxor	32(%rsi),%ymm9,%ymm9
	vpxor	64(%rsi),%ymm0,%ymm0
	vpxor	96(%rsi),%ymm4,%ymm4
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm11,0(%rdi)
	vmovdqu	%ymm9,32(%rdi)
	vmovdqu	%ymm0,64(%rdi)
	vmovdqu	%ymm4,96(%rdi)
	leaq	128(%rdi),%rdi

	subq	$512,%rdx
	jnz	.Loop_outer8x

	jmp	.Ldone8x

.Ltail8x:
	cmpq	$448,%rdx
	jae	.L448_or_more8x
	cmpq	$384,%rdx
	jae	.L384_or_more8x
	cmpq	$320,%rdx
	jae	.L320_or_more8x
	cmpq	$256,%rdx
	jae	.L256_or_more8x
	cmpq	$192,%rdx
	jae	.L192_or_more8x
	cmpq	$128,%rdx
	jae	.L128_or_more8x
	cmpq	$64,%rdx
	jae	.L64_or_more8x

	xorq	%r10,%r10
	vmovdqa	%ymm6,0(%rsp)
	vmovdqa	%ymm8,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L64_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	je	.Ldone8x

	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm1,0(%rsp)
	leaq	64(%rdi),%rdi
	subq	$64,%rdx
	vmovdqa	%ymm5,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L128_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	je	.Ldone8x

	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm12,0(%rsp)
	leaq	128(%rdi),%rdi
	subq	$128,%rdx
	vmovdqa	%ymm13,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L192_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	je	.Ldone8x

	leaq	192(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm10,0(%rsp)
	leaq	192(%rdi),%rdi
	subq	$192,%rdx
	vmovdqa	%ymm15,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L256_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	je	.Ldone8x

	leaq	256(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm14,0(%rsp)
	leaq	256(%rdi),%rdi
	subq	$256,%rdx
	vmovdqa	%ymm2,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L320_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	je	.Ldone8x

	leaq	320(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm3,0(%rsp)
	leaq	320(%rdi),%rdi
	subq	$320,%rdx
	vmovdqa	%ymm7,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L384_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	je	.Ldone8x

	leaq	384(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm11,0(%rsp)
	leaq	384(%rdi),%rdi
	subq	$384,%rdx
	vmovdqa	%ymm9,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L448_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vpxor	384(%rsi),%ymm11,%ymm11
	vpxor	416(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	vmovdqu	%ymm11,384(%rdi)
	vmovdqu	%ymm9,416(%rdi)
	je	.Ldone8x

	leaq	448(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm0,0(%rsp)
	leaq	448(%rdi),%rdi
	subq	$448,%rdx
	vmovdqa	%ymm4,32(%rsp)

.Loop_tail8x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail8x

.Ldone8x:
	vzeroall
	leaq	(%r9),%rsp
.cfi_def_cfa_register	rsp
.L8x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2
#endif