// xref: /aosp_15_r20/external/boringssl/src/gen/crypto/chacha-x86_64-linux.S (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
.text

.section	.rodata
.align	64
// SIMD constants shared by the ChaCha20 implementations below.
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
// pshufb masks: .Lrot16 rotates each 32-bit lane left by 16 bits;
// .Lrot24 rotates each lane left by 8 (i.e. right by 24).
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
// "expand 32-byte k" — the four ChaCha20 constant words (NUL-terminated).
.Lsigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
// Identification string: "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.text

//----------------------------------------------------------------------
// ChaCha20_ctr32_nohw — scalar ChaCha20 block function.
// C prototype (per BoringSSL's chacha sources — confirm against the repo):
//   void ChaCha20_ctr32_nohw(uint8_t *out, const uint8_t *in,
//                            size_t in_len, const uint32_t key[8],
//                            const uint32_t counter[4]);
// SysV AMD64: rdi=out, rsi=in, rdx=in_len, rcx=key, r8=counter.
//
// State word layout inside .Loop:
//   words  0-3 : eax, ebx, ecx, edx          (sigma constants)
//   words  4-7 : r8d, r9d, r10d, r11d        (key[0..3], from 16(%rsp))
//   words  8-11: esi, edi + 32..44(%rsp)     (key[4..7]; two live at a time)
//   words 12-15: r12d, r13d, r14d, r15d      (counter||nonce, from 48(%rsp))
// Stack: 0..63(%rsp) scratch keystream block; 64(%rsp)=remaining length,
// 64+8(%rsp)=in, 64+16(%rsp)=out.
//----------------------------------------------------------------------
.globl	ChaCha20_ctr32_nohw
.hidden ChaCha20_ctr32_nohw
.type	ChaCha20_ctr32_nohw,@function
.align	64
ChaCha20_ctr32_nohw:
.cfi_startproc
_CET_ENDBR
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	r15,-56
	subq	$64+24,%rsp		// 64-byte block scratch + len/in/out spills
.cfi_adjust_cfa_offset	88
.Lctr32_body:

	// Load initial state: xmm1 = key[0..3] (words 4-7),
	// xmm2 = key[4..7] (words 8-11), xmm3 = counter||nonce (words 12-15).
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm4	// block-counter increment

	// Stash the state on the stack for per-block reloads.
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp		// rbp = bytes remaining
	jmp	.Loop_outer

.align	32
.Loop_outer:
	// Refresh the 16 working state words for a new 64-byte block.
	movl	$0x61707865,%eax	// "expa"
	movl	$0x3320646e,%ebx	// "nd 3"
	movl	$0x79622d32,%ecx	// "2-by"
	movl	$0x6b206574,%edx	// "te k"
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d		// word 12 = current block counter
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	movq	%rbp,64+0(%rsp)		// spill remaining length
	movl	$10,%ebp		// 10 double-rounds
	movq	%rsi,64+8(%rsp)		// spill input pointer
.byte	102,72,15,126,214		// movq %xmm2,%rsi — words 8 and 9
	movq	%rdi,64+16(%rsp)	// spill output pointer
	movq	%rsi,%rdi
	shrq	$32,%rdi		// esi = word 8, edi = word 9
	jmp	.Loop

.align	32
.Loop:
	// Column rounds QR(0,4,8,12) and QR(1,5,9,13), interleaved.
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	// Swap words 8/9 out for words 10/11.
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	// Column rounds QR(2,6,10,14) and QR(3,7,11,15).
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	// Diagonal rounds QR(0,5,10,15) and QR(1,6,11,12).
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	// Swap words 10/11 back out for words 8/9.
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	// Diagonal rounds QR(2,7,8,13) and QR(3,4,9,14).
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	.Loop
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp		// restore remaining length
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi		// restore input pointer
	paddd	%xmm4,%xmm3		// bump the block counter
	movq	64+16(%rsp),%rdi	// restore output pointer

	// Feed-forward: add the initial state to the round output.
	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1		// xmm1 = output words 8-11

	cmpq	$64,%rbp
	jb	.Ltail			// partial final block

	// Full block: xor 64 bytes of input with the keystream.
	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,32(%rsp)		// restore stored words 8-11 for next block
	movd	%xmm3,48(%rsp)		// store updated block counter

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
	// Spill the whole keystream block, then xor byte-by-byte.
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx		// rbx = byte index
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

.Loop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	.Loop_tail

.Ldone:
	// Unwind: rsi points just above the six saved callee-saved registers.
	leaq	64+24+48(%rsp),%rsi
	movq	-48(%rsi),%r15
.cfi_restore	r15
	movq	-40(%rsi),%r14
.cfi_restore	r14
	movq	-32(%rsi),%r13
.cfi_restore	r13
	movq	-24(%rsi),%r12
.cfi_restore	r12
	movq	-16(%rsi),%rbp
.cfi_restore	rbp
	movq	-8(%rsi),%rbx
.cfi_restore	rbx
	leaq	(%rsi),%rsp
.cfi_adjust_cfa_offset	-136
.Lno_data:
	ret
.cfi_endproc
.size	ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
//----------------------------------------------------------------------
// ChaCha20_ctr32_ssse3 — one 64-byte block per iteration using SSSE3.
// Same arguments as ChaCha20_ctr32_nohw:
//   rdi=out, rsi=in, rdx=in_len, rcx=key, r8=counter.
// State rows live in xmm0..xmm3 (one 4-word row each); xmm6/xmm7 hold the
// pshufb rotate-by-16 / rotate-by-8 masks; 0..63(%rsp) keeps the initial
// state for the feed-forward addition.
//----------------------------------------------------------------------
.globl	ChaCha20_ctr32_ssse3
.hidden ChaCha20_ctr32_ssse3
.type	ChaCha20_ctr32_ssse3,@function
.align	32
ChaCha20_ctr32_ssse3:
.cfi_startproc
_CET_ENDBR
	movq	%rsp,%r9		// save caller's stack pointer
.cfi_def_cfa_register	r9
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm0	// row 0: constants
	movdqu	(%rcx),%xmm1		// row 1: key[0..3]
	movdqu	16(%rcx),%xmm2		// row 2: key[4..7]
	movdqu	(%r8),%xmm3		// row 3: counter||nonce
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8			// 10 double-rounds
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	// Next block: reload saved state and increment the counter.
	movdqa	.Lone(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
	// Column round on all four columns at once.
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222			// pshufb %xmm6,%xmm3 — rotl 16
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1		// rotl 12
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223			// pshufb %xmm7,%xmm3 — rotl 8
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1		// rotl 7
	// Diagonalize: rotate rows 1-3 so diagonals become columns.
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	// Diagonal round (same quarter-round sequence).
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222			// pshufb %xmm6,%xmm3 — rotl 16
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223			// pshufb %xmm7,%xmm3 — rotl 8
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	// Un-diagonalize.
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	.Loop_ssse3
	// Feed-forward addition of the initial state.
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	.Ltail_ssse3

	// Full block: xor 64 bytes of input with the keystream.
	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	// Partial block: spill keystream, then xor byte-by-byte.
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8			// r8 = byte index

.Loop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
	leaq	(%r9),%rsp		// restore caller's stack pointer
.cfi_def_cfa_register	rsp
.Lssse3_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3
//----------------------------------------------------------------------
// ChaCha20_ctr32_ssse3_4x — four 64-byte blocks per iteration (SSSE3).
// Same arguments: rdi=out, rsi=in, rdx=in_len, rcx=key, r8=counter.
//
// The state is held "transposed": each xmm register carries ONE state
// word for four consecutive blocks (counters n, n+1, n+2, n+3 via .Linc).
// Broadcast copies of the initial state live at 64..112(%rsp) (constants)
// and 128..304 relative to rcx = 256(%rsp) (key/counter words); two word
// rows are spilled to 0..48(%rsp) as needed since 16 words > 16 regs.
// After the rounds, 4x4 dword transposes rebuild byte-ordered blocks.
//----------------------------------------------------------------------
.globl	ChaCha20_ctr32_ssse3_4x
.hidden ChaCha20_ctr32_ssse3_4x
.type	ChaCha20_ctr32_ssse3_4x,@function
.align	32
ChaCha20_ctr32_ssse3_4x:
.cfi_startproc
_CET_ENDBR
	movq	%rsp,%r9		// save caller's stack pointer
.cfi_def_cfa_register	r9
	subq	$0x140+8,%rsp
	movdqa	.Lsigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx		// base for the broadcast state area
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	// Broadcast each of the 16 state words across a full register.
	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	.Linc(%rip),%xmm0	// per-lane counters n..n+3
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	.Loop_enter4x

.align	32
.Loop_outer4x:
	// Reload the broadcast state and advance all four counters by 4.
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	.Lfour(%rip),%xmm0

.Loop_enter4x:
	movdqa	%xmm6,32(%rsp)		// spill words 10/11; regs are scarce
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7		// rot16 pshufb mask
	movl	$10,%eax		// 10 double-rounds
	movdqa	%xmm0,256-256(%rcx)	// persist updated counters
	jmp	.Loop4x

.align	32
.Loop4x:
	// Quarter-rounds on columns 0/1 (words 0,4,8,12 and 1,5,9,13).
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199			// pshufb %xmm7,%xmm0 — rotl 16
.byte	102,15,56,0,207			// pshufb %xmm7,%xmm1 — rotl 16
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6		// rot24 (rotl 8) pshufb mask
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198			// pshufb %xmm6,%xmm0 — rotl 8
.byte	102,15,56,0,206			// pshufb %xmm6,%xmm1 — rotl 8
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	movdqa	%xmm4,0(%rsp)		// swap spilled word rows
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	// Quarter-rounds on columns 2/3 (words 2,6,10,14 and 3,7,11,15).
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215			// pshufb %xmm7,%xmm2 — rotl 16
.byte	102,15,56,0,223			// pshufb %xmm7,%xmm3 — rotl 16
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214			// pshufb %xmm6,%xmm2 — rotl 8
.byte	102,15,56,0,222			// pshufb %xmm6,%xmm3 — rotl 8
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	// Diagonal quarter-rounds (0,5,10,15) and (1,6,11,12).
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223			// pshufb %xmm7,%xmm3 — rotl 16
.byte	102,15,56,0,199			// pshufb %xmm7,%xmm0 — rotl 16
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222			// pshufb %xmm6,%xmm3 — rotl 8
.byte	102,15,56,0,198			// pshufb %xmm6,%xmm0 — rotl 8
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	movdqa	%xmm4,32(%rsp)		// swap spilled word rows back
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	// Diagonal quarter-rounds (2,7,8,13) and (3,4,9,14).
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207			// pshufb %xmm7,%xmm1 — rotl 16
.byte	102,15,56,0,215			// pshufb %xmm7,%xmm2 — rotl 16
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206			// pshufb %xmm6,%xmm1 — rotl 8
.byte	102,15,56,0,214			// pshufb %xmm6,%xmm2 — rotl 8
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	.Loop4x

	// Feed-forward for words 0-3, then transpose the 4x4 dword matrix
	// so each register again holds 16 contiguous output bytes.
	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	// Words 4-7: feed-forward and transpose.
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	// Words 8-11: feed-forward and transpose.
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	// Words 12-15: feed-forward and transpose.
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	cmpq	$256,%rdx
	jb	.Ltail4x

	// Full 256 bytes: xor input with the four keystream blocks.
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4x

	jmp	.Ldone4x

.Ltail4x:
	// Fewer than 256 bytes left: handle 64-byte chunks, then bytes.
	cmpq	$192,%rdx
	jae	.L192_or_more4x
	cmpq	$128,%rdx
	jae	.L128_or_more4x
	cmpq	$64,%rdx
	jae	.L64_or_more4x


	xorq	%r10,%r10		// r10 = byte index

	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	// Stage block 2's keystream at 0..63(%rsp) for the byte loop.
	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	.Ldone4x

	// Stage block 3's keystream for the byte loop.
	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	// Stage block 4's keystream for the byte loop.
	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

.Loop_tail4x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4x

.Ldone4x:
	leaq	(%r9),%rsp		// restore caller's stack pointer
.cfi_def_cfa_register	rsp
.L4x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x
1001.globl	ChaCha20_ctr32_avx2
1002.hidden ChaCha20_ctr32_avx2
1003.type	ChaCha20_ctr32_avx2,@function
1004.align	32
1005ChaCha20_ctr32_avx2:
1006.cfi_startproc
1007_CET_ENDBR
1008	movq	%rsp,%r9
1009.cfi_def_cfa_register	r9
1010	subq	$0x280+8,%rsp
1011	andq	$-32,%rsp
1012	vzeroupper
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023	vbroadcasti128	.Lsigma(%rip),%ymm11
1024	vbroadcasti128	(%rcx),%ymm3
1025	vbroadcasti128	16(%rcx),%ymm15
1026	vbroadcasti128	(%r8),%ymm7
1027	leaq	256(%rsp),%rcx
1028	leaq	512(%rsp),%rax
1029	leaq	.Lrot16(%rip),%r10
1030	leaq	.Lrot24(%rip),%r11
1031
1032	vpshufd	$0x00,%ymm11,%ymm8
1033	vpshufd	$0x55,%ymm11,%ymm9
1034	vmovdqa	%ymm8,128-256(%rcx)
1035	vpshufd	$0xaa,%ymm11,%ymm10
1036	vmovdqa	%ymm9,160-256(%rcx)
1037	vpshufd	$0xff,%ymm11,%ymm11
1038	vmovdqa	%ymm10,192-256(%rcx)
1039	vmovdqa	%ymm11,224-256(%rcx)
1040
1041	vpshufd	$0x00,%ymm3,%ymm0
1042	vpshufd	$0x55,%ymm3,%ymm1
1043	vmovdqa	%ymm0,256-256(%rcx)
1044	vpshufd	$0xaa,%ymm3,%ymm2
1045	vmovdqa	%ymm1,288-256(%rcx)
1046	vpshufd	$0xff,%ymm3,%ymm3
1047	vmovdqa	%ymm2,320-256(%rcx)
1048	vmovdqa	%ymm3,352-256(%rcx)
1049
1050	vpshufd	$0x00,%ymm15,%ymm12
1051	vpshufd	$0x55,%ymm15,%ymm13
1052	vmovdqa	%ymm12,384-512(%rax)
1053	vpshufd	$0xaa,%ymm15,%ymm14
1054	vmovdqa	%ymm13,416-512(%rax)
1055	vpshufd	$0xff,%ymm15,%ymm15
1056	vmovdqa	%ymm14,448-512(%rax)
1057	vmovdqa	%ymm15,480-512(%rax)
1058
1059	vpshufd	$0x00,%ymm7,%ymm4
1060	vpshufd	$0x55,%ymm7,%ymm5
1061	vpaddd	.Lincy(%rip),%ymm4,%ymm4
1062	vpshufd	$0xaa,%ymm7,%ymm6
1063	vmovdqa	%ymm5,544-512(%rax)
1064	vpshufd	$0xff,%ymm7,%ymm7
1065	vmovdqa	%ymm6,576-512(%rax)
1066	vmovdqa	%ymm7,608-512(%rax)
1067
1068	jmp	.Loop_enter8x
1069
1070.align	32
1071.Loop_outer8x:
1072	vmovdqa	128-256(%rcx),%ymm8
1073	vmovdqa	160-256(%rcx),%ymm9
1074	vmovdqa	192-256(%rcx),%ymm10
1075	vmovdqa	224-256(%rcx),%ymm11
1076	vmovdqa	256-256(%rcx),%ymm0
1077	vmovdqa	288-256(%rcx),%ymm1
1078	vmovdqa	320-256(%rcx),%ymm2
1079	vmovdqa	352-256(%rcx),%ymm3
1080	vmovdqa	384-512(%rax),%ymm12
1081	vmovdqa	416-512(%rax),%ymm13
1082	vmovdqa	448-512(%rax),%ymm14
1083	vmovdqa	480-512(%rax),%ymm15
1084	vmovdqa	512-512(%rax),%ymm4
1085	vmovdqa	544-512(%rax),%ymm5
1086	vmovdqa	576-512(%rax),%ymm6
1087	vmovdqa	608-512(%rax),%ymm7
1088	vpaddd	.Leight(%rip),%ymm4,%ymm4
1089
1090.Loop_enter8x:
1091	vmovdqa	%ymm14,64(%rsp)
1092	vmovdqa	%ymm15,96(%rsp)
1093	vbroadcasti128	(%r10),%ymm15
1094	vmovdqa	%ymm4,512-512(%rax)
1095	movl	$10,%eax
1096	jmp	.Loop8x
1097
1098.align	32
1099.Loop8x:
1100	vpaddd	%ymm0,%ymm8,%ymm8
1101	vpxor	%ymm4,%ymm8,%ymm4
1102	vpshufb	%ymm15,%ymm4,%ymm4
1103	vpaddd	%ymm1,%ymm9,%ymm9
1104	vpxor	%ymm5,%ymm9,%ymm5
1105	vpshufb	%ymm15,%ymm5,%ymm5
1106	vpaddd	%ymm4,%ymm12,%ymm12
1107	vpxor	%ymm0,%ymm12,%ymm0
1108	vpslld	$12,%ymm0,%ymm14
1109	vpsrld	$20,%ymm0,%ymm0
1110	vpor	%ymm0,%ymm14,%ymm0
1111	vbroadcasti128	(%r11),%ymm14
1112	vpaddd	%ymm5,%ymm13,%ymm13
1113	vpxor	%ymm1,%ymm13,%ymm1
1114	vpslld	$12,%ymm1,%ymm15
1115	vpsrld	$20,%ymm1,%ymm1
1116	vpor	%ymm1,%ymm15,%ymm1
1117	vpaddd	%ymm0,%ymm8,%ymm8
1118	vpxor	%ymm4,%ymm8,%ymm4
1119	vpshufb	%ymm14,%ymm4,%ymm4
1120	vpaddd	%ymm1,%ymm9,%ymm9
1121	vpxor	%ymm5,%ymm9,%ymm5
1122	vpshufb	%ymm14,%ymm5,%ymm5
1123	vpaddd	%ymm4,%ymm12,%ymm12
1124	vpxor	%ymm0,%ymm12,%ymm0
1125	vpslld	$7,%ymm0,%ymm15
1126	vpsrld	$25,%ymm0,%ymm0
1127	vpor	%ymm0,%ymm15,%ymm0
1128	vbroadcasti128	(%r10),%ymm15
1129	vpaddd	%ymm5,%ymm13,%ymm13
1130	vpxor	%ymm1,%ymm13,%ymm1
1131	vpslld	$7,%ymm1,%ymm14
1132	vpsrld	$25,%ymm1,%ymm1
1133	vpor	%ymm1,%ymm14,%ymm1
1134	vmovdqa	%ymm12,0(%rsp)
1135	vmovdqa	%ymm13,32(%rsp)
1136	vmovdqa	64(%rsp),%ymm12
1137	vmovdqa	96(%rsp),%ymm13
1138	vpaddd	%ymm2,%ymm10,%ymm10
1139	vpxor	%ymm6,%ymm10,%ymm6
1140	vpshufb	%ymm15,%ymm6,%ymm6
1141	vpaddd	%ymm3,%ymm11,%ymm11
1142	vpxor	%ymm7,%ymm11,%ymm7
1143	vpshufb	%ymm15,%ymm7,%ymm7
1144	vpaddd	%ymm6,%ymm12,%ymm12
1145	vpxor	%ymm2,%ymm12,%ymm2
1146	vpslld	$12,%ymm2,%ymm14
1147	vpsrld	$20,%ymm2,%ymm2
1148	vpor	%ymm2,%ymm14,%ymm2
1149	vbroadcasti128	(%r11),%ymm14
1150	vpaddd	%ymm7,%ymm13,%ymm13
1151	vpxor	%ymm3,%ymm13,%ymm3
1152	vpslld	$12,%ymm3,%ymm15
1153	vpsrld	$20,%ymm3,%ymm3
1154	vpor	%ymm3,%ymm15,%ymm3
1155	vpaddd	%ymm2,%ymm10,%ymm10
1156	vpxor	%ymm6,%ymm10,%ymm6
1157	vpshufb	%ymm14,%ymm6,%ymm6
1158	vpaddd	%ymm3,%ymm11,%ymm11
1159	vpxor	%ymm7,%ymm11,%ymm7
1160	vpshufb	%ymm14,%ymm7,%ymm7
1161	vpaddd	%ymm6,%ymm12,%ymm12
1162	vpxor	%ymm2,%ymm12,%ymm2
1163	vpslld	$7,%ymm2,%ymm15
1164	vpsrld	$25,%ymm2,%ymm2
1165	vpor	%ymm2,%ymm15,%ymm2
1166	vbroadcasti128	(%r10),%ymm15
1167	vpaddd	%ymm7,%ymm13,%ymm13
1168	vpxor	%ymm3,%ymm13,%ymm3
1169	vpslld	$7,%ymm3,%ymm14
1170	vpsrld	$25,%ymm3,%ymm3
1171	vpor	%ymm3,%ymm14,%ymm3
1172	vpaddd	%ymm1,%ymm8,%ymm8
1173	vpxor	%ymm7,%ymm8,%ymm7
1174	vpshufb	%ymm15,%ymm7,%ymm7
1175	vpaddd	%ymm2,%ymm9,%ymm9
1176	vpxor	%ymm4,%ymm9,%ymm4
1177	vpshufb	%ymm15,%ymm4,%ymm4
1178	vpaddd	%ymm7,%ymm12,%ymm12
1179	vpxor	%ymm1,%ymm12,%ymm1
1180	vpslld	$12,%ymm1,%ymm14
1181	vpsrld	$20,%ymm1,%ymm1
1182	vpor	%ymm1,%ymm14,%ymm1
1183	vbroadcasti128	(%r11),%ymm14
1184	vpaddd	%ymm4,%ymm13,%ymm13
1185	vpxor	%ymm2,%ymm13,%ymm2
1186	vpslld	$12,%ymm2,%ymm15
1187	vpsrld	$20,%ymm2,%ymm2
1188	vpor	%ymm2,%ymm15,%ymm2
1189	vpaddd	%ymm1,%ymm8,%ymm8
1190	vpxor	%ymm7,%ymm8,%ymm7
1191	vpshufb	%ymm14,%ymm7,%ymm7
1192	vpaddd	%ymm2,%ymm9,%ymm9
1193	vpxor	%ymm4,%ymm9,%ymm4
1194	vpshufb	%ymm14,%ymm4,%ymm4
1195	vpaddd	%ymm7,%ymm12,%ymm12
1196	vpxor	%ymm1,%ymm12,%ymm1
1197	vpslld	$7,%ymm1,%ymm15
1198	vpsrld	$25,%ymm1,%ymm1
1199	vpor	%ymm1,%ymm15,%ymm1
1200	vbroadcasti128	(%r10),%ymm15
1201	vpaddd	%ymm4,%ymm13,%ymm13
1202	vpxor	%ymm2,%ymm13,%ymm2
1203	vpslld	$7,%ymm2,%ymm14
1204	vpsrld	$25,%ymm2,%ymm2
1205	vpor	%ymm2,%ymm14,%ymm2
1206	vmovdqa	%ymm12,64(%rsp)
1207	vmovdqa	%ymm13,96(%rsp)
1208	vmovdqa	0(%rsp),%ymm12
1209	vmovdqa	32(%rsp),%ymm13
1210	vpaddd	%ymm3,%ymm10,%ymm10
1211	vpxor	%ymm5,%ymm10,%ymm5
1212	vpshufb	%ymm15,%ymm5,%ymm5
1213	vpaddd	%ymm0,%ymm11,%ymm11
1214	vpxor	%ymm6,%ymm11,%ymm6
1215	vpshufb	%ymm15,%ymm6,%ymm6
1216	vpaddd	%ymm5,%ymm12,%ymm12
1217	vpxor	%ymm3,%ymm12,%ymm3
1218	vpslld	$12,%ymm3,%ymm14
1219	vpsrld	$20,%ymm3,%ymm3
1220	vpor	%ymm3,%ymm14,%ymm3
1221	vbroadcasti128	(%r11),%ymm14
1222	vpaddd	%ymm6,%ymm13,%ymm13
1223	vpxor	%ymm0,%ymm13,%ymm0
1224	vpslld	$12,%ymm0,%ymm15
1225	vpsrld	$20,%ymm0,%ymm0
1226	vpor	%ymm0,%ymm15,%ymm0
1227	vpaddd	%ymm3,%ymm10,%ymm10
1228	vpxor	%ymm5,%ymm10,%ymm5
1229	vpshufb	%ymm14,%ymm5,%ymm5
1230	vpaddd	%ymm0,%ymm11,%ymm11
1231	vpxor	%ymm6,%ymm11,%ymm6
1232	vpshufb	%ymm14,%ymm6,%ymm6
1233	vpaddd	%ymm5,%ymm12,%ymm12
1234	vpxor	%ymm3,%ymm12,%ymm3
1235	vpslld	$7,%ymm3,%ymm15
1236	vpsrld	$25,%ymm3,%ymm3
1237	vpor	%ymm3,%ymm15,%ymm3
1238	vbroadcasti128	(%r10),%ymm15
1239	vpaddd	%ymm6,%ymm13,%ymm13
1240	vpxor	%ymm0,%ymm13,%ymm0
1241	vpslld	$7,%ymm0,%ymm14
1242	vpsrld	$25,%ymm0,%ymm0
1243	vpor	%ymm0,%ymm14,%ymm0
1244	decl	%eax
1245	jnz	.Loop8x
1246
1247	leaq	512(%rsp),%rax
1248	vpaddd	128-256(%rcx),%ymm8,%ymm8
1249	vpaddd	160-256(%rcx),%ymm9,%ymm9
1250	vpaddd	192-256(%rcx),%ymm10,%ymm10
1251	vpaddd	224-256(%rcx),%ymm11,%ymm11
1252
1253	vpunpckldq	%ymm9,%ymm8,%ymm14
1254	vpunpckldq	%ymm11,%ymm10,%ymm15
1255	vpunpckhdq	%ymm9,%ymm8,%ymm8
1256	vpunpckhdq	%ymm11,%ymm10,%ymm10
1257	vpunpcklqdq	%ymm15,%ymm14,%ymm9
1258	vpunpckhqdq	%ymm15,%ymm14,%ymm14
1259	vpunpcklqdq	%ymm10,%ymm8,%ymm11
1260	vpunpckhqdq	%ymm10,%ymm8,%ymm8
1261	vpaddd	256-256(%rcx),%ymm0,%ymm0
1262	vpaddd	288-256(%rcx),%ymm1,%ymm1
1263	vpaddd	320-256(%rcx),%ymm2,%ymm2
1264	vpaddd	352-256(%rcx),%ymm3,%ymm3
1265
1266	vpunpckldq	%ymm1,%ymm0,%ymm10
1267	vpunpckldq	%ymm3,%ymm2,%ymm15
1268	vpunpckhdq	%ymm1,%ymm0,%ymm0
1269	vpunpckhdq	%ymm3,%ymm2,%ymm2
1270	vpunpcklqdq	%ymm15,%ymm10,%ymm1
1271	vpunpckhqdq	%ymm15,%ymm10,%ymm10
1272	vpunpcklqdq	%ymm2,%ymm0,%ymm3
1273	vpunpckhqdq	%ymm2,%ymm0,%ymm0
1274	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
1275	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
1276	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
1277	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
1278	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
1279	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
1280	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
1281	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
1282	vmovdqa	%ymm15,0(%rsp)
1283	vmovdqa	%ymm9,32(%rsp)
1284	vmovdqa	64(%rsp),%ymm15
1285	vmovdqa	96(%rsp),%ymm9
1286
1287	vpaddd	384-512(%rax),%ymm12,%ymm12
1288	vpaddd	416-512(%rax),%ymm13,%ymm13
1289	vpaddd	448-512(%rax),%ymm15,%ymm15
1290	vpaddd	480-512(%rax),%ymm9,%ymm9
1291
1292	vpunpckldq	%ymm13,%ymm12,%ymm2
1293	vpunpckldq	%ymm9,%ymm15,%ymm8
1294	vpunpckhdq	%ymm13,%ymm12,%ymm12
1295	vpunpckhdq	%ymm9,%ymm15,%ymm15
1296	vpunpcklqdq	%ymm8,%ymm2,%ymm13
1297	vpunpckhqdq	%ymm8,%ymm2,%ymm2
1298	vpunpcklqdq	%ymm15,%ymm12,%ymm9
1299	vpunpckhqdq	%ymm15,%ymm12,%ymm12
1300	vpaddd	512-512(%rax),%ymm4,%ymm4
1301	vpaddd	544-512(%rax),%ymm5,%ymm5
1302	vpaddd	576-512(%rax),%ymm6,%ymm6
1303	vpaddd	608-512(%rax),%ymm7,%ymm7
1304
1305	vpunpckldq	%ymm5,%ymm4,%ymm15
1306	vpunpckldq	%ymm7,%ymm6,%ymm8
1307	vpunpckhdq	%ymm5,%ymm4,%ymm4
1308	vpunpckhdq	%ymm7,%ymm6,%ymm6
1309	vpunpcklqdq	%ymm8,%ymm15,%ymm5
1310	vpunpckhqdq	%ymm8,%ymm15,%ymm15
1311	vpunpcklqdq	%ymm6,%ymm4,%ymm7
1312	vpunpckhqdq	%ymm6,%ymm4,%ymm4
1313	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
1314	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
1315	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
1316	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
1317	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
1318	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
1319	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
1320	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
1321	vmovdqa	0(%rsp),%ymm6
1322	vmovdqa	32(%rsp),%ymm12
1323
1324	cmpq	$512,%rdx
1325	jb	.Ltail8x
1326
1327	vpxor	0(%rsi),%ymm6,%ymm6
1328	vpxor	32(%rsi),%ymm8,%ymm8
1329	vpxor	64(%rsi),%ymm1,%ymm1
1330	vpxor	96(%rsi),%ymm5,%ymm5
1331	leaq	128(%rsi),%rsi
1332	vmovdqu	%ymm6,0(%rdi)
1333	vmovdqu	%ymm8,32(%rdi)
1334	vmovdqu	%ymm1,64(%rdi)
1335	vmovdqu	%ymm5,96(%rdi)
1336	leaq	128(%rdi),%rdi
1337
1338	vpxor	0(%rsi),%ymm12,%ymm12
1339	vpxor	32(%rsi),%ymm13,%ymm13
1340	vpxor	64(%rsi),%ymm10,%ymm10
1341	vpxor	96(%rsi),%ymm15,%ymm15
1342	leaq	128(%rsi),%rsi
1343	vmovdqu	%ymm12,0(%rdi)
1344	vmovdqu	%ymm13,32(%rdi)
1345	vmovdqu	%ymm10,64(%rdi)
1346	vmovdqu	%ymm15,96(%rdi)
1347	leaq	128(%rdi),%rdi
1348
1349	vpxor	0(%rsi),%ymm14,%ymm14
1350	vpxor	32(%rsi),%ymm2,%ymm2
1351	vpxor	64(%rsi),%ymm3,%ymm3
1352	vpxor	96(%rsi),%ymm7,%ymm7
1353	leaq	128(%rsi),%rsi
1354	vmovdqu	%ymm14,0(%rdi)
1355	vmovdqu	%ymm2,32(%rdi)
1356	vmovdqu	%ymm3,64(%rdi)
1357	vmovdqu	%ymm7,96(%rdi)
1358	leaq	128(%rdi),%rdi
1359
1360	vpxor	0(%rsi),%ymm11,%ymm11
1361	vpxor	32(%rsi),%ymm9,%ymm9
1362	vpxor	64(%rsi),%ymm0,%ymm0
1363	vpxor	96(%rsi),%ymm4,%ymm4
1364	leaq	128(%rsi),%rsi
1365	vmovdqu	%ymm11,0(%rdi)
1366	vmovdqu	%ymm9,32(%rdi)
1367	vmovdqu	%ymm0,64(%rdi)
1368	vmovdqu	%ymm4,96(%rdi)
1369	leaq	128(%rdi),%rdi
1370
1371	subq	$512,%rdx
1372	jnz	.Loop_outer8x
1373
1374	jmp	.Ldone8x
1375
1376.Ltail8x:
1377	cmpq	$448,%rdx
1378	jae	.L448_or_more8x
1379	cmpq	$384,%rdx
1380	jae	.L384_or_more8x
1381	cmpq	$320,%rdx
1382	jae	.L320_or_more8x
1383	cmpq	$256,%rdx
1384	jae	.L256_or_more8x
1385	cmpq	$192,%rdx
1386	jae	.L192_or_more8x
1387	cmpq	$128,%rdx
1388	jae	.L128_or_more8x
1389	cmpq	$64,%rdx
1390	jae	.L64_or_more8x
1391
1392	xorq	%r10,%r10
1393	vmovdqa	%ymm6,0(%rsp)
1394	vmovdqa	%ymm8,32(%rsp)
1395	jmp	.Loop_tail8x
1396
1397.align	32
1398.L64_or_more8x:
1399	vpxor	0(%rsi),%ymm6,%ymm6
1400	vpxor	32(%rsi),%ymm8,%ymm8
1401	vmovdqu	%ymm6,0(%rdi)
1402	vmovdqu	%ymm8,32(%rdi)
1403	je	.Ldone8x
1404
1405	leaq	64(%rsi),%rsi
1406	xorq	%r10,%r10
1407	vmovdqa	%ymm1,0(%rsp)
1408	leaq	64(%rdi),%rdi
1409	subq	$64,%rdx
1410	vmovdqa	%ymm5,32(%rsp)
1411	jmp	.Loop_tail8x
1412
1413.align	32
1414.L128_or_more8x:
1415	vpxor	0(%rsi),%ymm6,%ymm6
1416	vpxor	32(%rsi),%ymm8,%ymm8
1417	vpxor	64(%rsi),%ymm1,%ymm1
1418	vpxor	96(%rsi),%ymm5,%ymm5
1419	vmovdqu	%ymm6,0(%rdi)
1420	vmovdqu	%ymm8,32(%rdi)
1421	vmovdqu	%ymm1,64(%rdi)
1422	vmovdqu	%ymm5,96(%rdi)
1423	je	.Ldone8x
1424
1425	leaq	128(%rsi),%rsi
1426	xorq	%r10,%r10
1427	vmovdqa	%ymm12,0(%rsp)
1428	leaq	128(%rdi),%rdi
1429	subq	$128,%rdx
1430	vmovdqa	%ymm13,32(%rsp)
1431	jmp	.Loop_tail8x
1432
1433.align	32
1434.L192_or_more8x:
1435	vpxor	0(%rsi),%ymm6,%ymm6
1436	vpxor	32(%rsi),%ymm8,%ymm8
1437	vpxor	64(%rsi),%ymm1,%ymm1
1438	vpxor	96(%rsi),%ymm5,%ymm5
1439	vpxor	128(%rsi),%ymm12,%ymm12
1440	vpxor	160(%rsi),%ymm13,%ymm13
1441	vmovdqu	%ymm6,0(%rdi)
1442	vmovdqu	%ymm8,32(%rdi)
1443	vmovdqu	%ymm1,64(%rdi)
1444	vmovdqu	%ymm5,96(%rdi)
1445	vmovdqu	%ymm12,128(%rdi)
1446	vmovdqu	%ymm13,160(%rdi)
1447	je	.Ldone8x
1448
1449	leaq	192(%rsi),%rsi
1450	xorq	%r10,%r10
1451	vmovdqa	%ymm10,0(%rsp)
1452	leaq	192(%rdi),%rdi
1453	subq	$192,%rdx
1454	vmovdqa	%ymm15,32(%rsp)
1455	jmp	.Loop_tail8x
1456
1457.align	32
1458.L256_or_more8x:
1459	vpxor	0(%rsi),%ymm6,%ymm6
1460	vpxor	32(%rsi),%ymm8,%ymm8
1461	vpxor	64(%rsi),%ymm1,%ymm1
1462	vpxor	96(%rsi),%ymm5,%ymm5
1463	vpxor	128(%rsi),%ymm12,%ymm12
1464	vpxor	160(%rsi),%ymm13,%ymm13
1465	vpxor	192(%rsi),%ymm10,%ymm10
1466	vpxor	224(%rsi),%ymm15,%ymm15
1467	vmovdqu	%ymm6,0(%rdi)
1468	vmovdqu	%ymm8,32(%rdi)
1469	vmovdqu	%ymm1,64(%rdi)
1470	vmovdqu	%ymm5,96(%rdi)
1471	vmovdqu	%ymm12,128(%rdi)
1472	vmovdqu	%ymm13,160(%rdi)
1473	vmovdqu	%ymm10,192(%rdi)
1474	vmovdqu	%ymm15,224(%rdi)
1475	je	.Ldone8x
1476
1477	leaq	256(%rsi),%rsi
1478	xorq	%r10,%r10
1479	vmovdqa	%ymm14,0(%rsp)
1480	leaq	256(%rdi),%rdi
1481	subq	$256,%rdx
1482	vmovdqa	%ymm2,32(%rsp)
1483	jmp	.Loop_tail8x
1484
1485.align	32
1486.L320_or_more8x:
1487	vpxor	0(%rsi),%ymm6,%ymm6
1488	vpxor	32(%rsi),%ymm8,%ymm8
1489	vpxor	64(%rsi),%ymm1,%ymm1
1490	vpxor	96(%rsi),%ymm5,%ymm5
1491	vpxor	128(%rsi),%ymm12,%ymm12
1492	vpxor	160(%rsi),%ymm13,%ymm13
1493	vpxor	192(%rsi),%ymm10,%ymm10
1494	vpxor	224(%rsi),%ymm15,%ymm15
1495	vpxor	256(%rsi),%ymm14,%ymm14
1496	vpxor	288(%rsi),%ymm2,%ymm2
1497	vmovdqu	%ymm6,0(%rdi)
1498	vmovdqu	%ymm8,32(%rdi)
1499	vmovdqu	%ymm1,64(%rdi)
1500	vmovdqu	%ymm5,96(%rdi)
1501	vmovdqu	%ymm12,128(%rdi)
1502	vmovdqu	%ymm13,160(%rdi)
1503	vmovdqu	%ymm10,192(%rdi)
1504	vmovdqu	%ymm15,224(%rdi)
1505	vmovdqu	%ymm14,256(%rdi)
1506	vmovdqu	%ymm2,288(%rdi)
1507	je	.Ldone8x
1508
1509	leaq	320(%rsi),%rsi
1510	xorq	%r10,%r10
1511	vmovdqa	%ymm3,0(%rsp)
1512	leaq	320(%rdi),%rdi
1513	subq	$320,%rdx
1514	vmovdqa	%ymm7,32(%rsp)
1515	jmp	.Loop_tail8x
1516
1517.align	32
1518.L384_or_more8x:
1519	vpxor	0(%rsi),%ymm6,%ymm6
1520	vpxor	32(%rsi),%ymm8,%ymm8
1521	vpxor	64(%rsi),%ymm1,%ymm1
1522	vpxor	96(%rsi),%ymm5,%ymm5
1523	vpxor	128(%rsi),%ymm12,%ymm12
1524	vpxor	160(%rsi),%ymm13,%ymm13
1525	vpxor	192(%rsi),%ymm10,%ymm10
1526	vpxor	224(%rsi),%ymm15,%ymm15
1527	vpxor	256(%rsi),%ymm14,%ymm14
1528	vpxor	288(%rsi),%ymm2,%ymm2
1529	vpxor	320(%rsi),%ymm3,%ymm3
1530	vpxor	352(%rsi),%ymm7,%ymm7
1531	vmovdqu	%ymm6,0(%rdi)
1532	vmovdqu	%ymm8,32(%rdi)
1533	vmovdqu	%ymm1,64(%rdi)
1534	vmovdqu	%ymm5,96(%rdi)
1535	vmovdqu	%ymm12,128(%rdi)
1536	vmovdqu	%ymm13,160(%rdi)
1537	vmovdqu	%ymm10,192(%rdi)
1538	vmovdqu	%ymm15,224(%rdi)
1539	vmovdqu	%ymm14,256(%rdi)
1540	vmovdqu	%ymm2,288(%rdi)
1541	vmovdqu	%ymm3,320(%rdi)
1542	vmovdqu	%ymm7,352(%rdi)
1543	je	.Ldone8x
1544
1545	leaq	384(%rsi),%rsi
1546	xorq	%r10,%r10
1547	vmovdqa	%ymm11,0(%rsp)
1548	leaq	384(%rdi),%rdi
1549	subq	$384,%rdx
1550	vmovdqa	%ymm9,32(%rsp)
1551	jmp	.Loop_tail8x
1552
1553.align	32
1554.L448_or_more8x:
1555	vpxor	0(%rsi),%ymm6,%ymm6
1556	vpxor	32(%rsi),%ymm8,%ymm8
1557	vpxor	64(%rsi),%ymm1,%ymm1
1558	vpxor	96(%rsi),%ymm5,%ymm5
1559	vpxor	128(%rsi),%ymm12,%ymm12
1560	vpxor	160(%rsi),%ymm13,%ymm13
1561	vpxor	192(%rsi),%ymm10,%ymm10
1562	vpxor	224(%rsi),%ymm15,%ymm15
1563	vpxor	256(%rsi),%ymm14,%ymm14
1564	vpxor	288(%rsi),%ymm2,%ymm2
1565	vpxor	320(%rsi),%ymm3,%ymm3
1566	vpxor	352(%rsi),%ymm7,%ymm7
1567	vpxor	384(%rsi),%ymm11,%ymm11
1568	vpxor	416(%rsi),%ymm9,%ymm9
1569	vmovdqu	%ymm6,0(%rdi)
1570	vmovdqu	%ymm8,32(%rdi)
1571	vmovdqu	%ymm1,64(%rdi)
1572	vmovdqu	%ymm5,96(%rdi)
1573	vmovdqu	%ymm12,128(%rdi)
1574	vmovdqu	%ymm13,160(%rdi)
1575	vmovdqu	%ymm10,192(%rdi)
1576	vmovdqu	%ymm15,224(%rdi)
1577	vmovdqu	%ymm14,256(%rdi)
1578	vmovdqu	%ymm2,288(%rdi)
1579	vmovdqu	%ymm3,320(%rdi)
1580	vmovdqu	%ymm7,352(%rdi)
1581	vmovdqu	%ymm11,384(%rdi)
1582	vmovdqu	%ymm9,416(%rdi)
1583	je	.Ldone8x
1584
1585	leaq	448(%rsi),%rsi
1586	xorq	%r10,%r10
1587	vmovdqa	%ymm0,0(%rsp)
1588	leaq	448(%rdi),%rdi
1589	subq	$448,%rdx
1590	vmovdqa	%ymm4,32(%rsp)
1591
1592.Loop_tail8x:
1593	movzbl	(%rsi,%r10,1),%eax
1594	movzbl	(%rsp,%r10,1),%ecx
1595	leaq	1(%r10),%r10
1596	xorl	%ecx,%eax
1597	movb	%al,-1(%rdi,%r10,1)
1598	decq	%rdx
1599	jnz	.Loop_tail8x
1600
1601.Ldone8x:
1602	vzeroall
1603	leaq	(%r9),%rsp
1604.cfi_def_cfa_register	rsp
1605.L8x_epilogue:
1606	ret
1607.cfi_endproc
1608.size	ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2
1609#endif
1610