xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/gen/crypto/chacha-x86_64-apple.S (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
7.text
8
// Read-only constant table for the ChaCha20 routines in this file.
// The table is 64-byte aligned; the vector constants are fetched with
// aligned loads (movdqa) or used as aligned memory operands.
9.section	__DATA,__const
10.p2align	6
// Four zero dwords. Not referenced by the functions shown here.
11L$zero:
12.long	0,0,0,0
// Adds 1 to the first dword only; the single-block routines use it to step
// the block counter.
13L$one:
14.long	1,0,0,0
// Per-lane counter offsets 0..3, added once to the broadcast counter in the
// 4-way routine.
15L$inc:
16.long	0,1,2,3
// Counter step of 4 in every lane, added on each outer iteration of the
// 4-way routine.
17L$four:
18.long	4,4,4,4
// Per-lane counter offsets for the 8-lane (ymm) routine. The lanes are not in
// ascending order; presumably this matches the 128-bit lane interleaving of
// that routine -- confirm against the AVX2 code.
19L$incy:
20.long	0,2,4,6,1,3,5,7
// Counter step of 8 in every lane for the 8-lane routine.
21L$eight:
22.long	8,8,8,8,8,8,8,8
// pshufb mask: within each dword, result bytes come from source bytes
// 2,3,0,1, i.e. every 32-bit lane is rotated by 16 bits.
23L$rot16:
24.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
// pshufb mask: within each dword, result bytes come from source bytes
// 3,0,1,2, i.e. every 32-bit lane is rotated left by 8 bits (which is the
// same as right by 24).
25L$rot24:
26.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
// ASCII "expand 32-byte k" followed by a NUL byte: the four constant words
// that form the first row of the state.
27L$sigma:
28.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
29.p2align	6
// The four tables below hold 16 dwords each and are not referenced by the
// functions shown here; presumably they serve a 16-lane variant -- confirm.
30L$zeroz:
31.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
32L$fourz:
33.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
34L$incz:
35.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
36L$sixteen:
37.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
// ASCII banner: "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>",
// NUL terminated.
38.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
39.text
// ChaCha20_ctr32_nohw: scalar ChaCha20 that keeps the state in general-purpose
// registers and XORs one 64-byte keystream block into the data per outer
// iteration.
//
// Registers as read on entry by the code below:
//   %rdi  destination buffer (written)
//   %rsi  source buffer (read and XORed with the keystream)
//   %rdx  byte count
//   %rcx  32 bytes of key material (two unaligned 16-byte loads)
//   %r8   16-byte counter/nonce block; its first dword is stepped by one
//         after every block
// Presumably this corresponds to a C prototype of the form
// (out, in, in_len, key, counter) -- confirm against the C declaration.
//
// NOTE(review): there is no test for a zero byte count. With %rdx == 0 the
// tail loop decrements before it tests, so callers are presumably required
// to pass a nonzero length -- confirm. L$no_data is not a branch target
// within this function.
//
// Stack frame after the prologue (88 bytes; the movdqa accesses below need
// %rsp to be 16-byte aligned at that point):
//    0..15(%rsp)  keystream words 0-3 (written on the tail path only)
//   16..31(%rsp)  key words 0-3
//   32..47(%rsp)  key words 4-7 between blocks; spill space for state words
//                 8-11 while the rounds run
//   48..63(%rsp)  counter/nonce block
//   64..87(%rsp)  saved byte count, source pointer, destination pointer
//
// Register roles inside the round loop:
//   %eax,%ebx,%ecx,%edx   state words 0-3
//   %r8d-%r11d            state words 4-7
//   %esi,%edi             two of state words 8-11 (the other two are spilled)
//   %r12d-%r15d           state words 12-15
//   %ebp                  double-round counter
//   %xmm1,%xmm2           key halves, %xmm3 counter block, %xmm4 = L$one
40.globl	_ChaCha20_ctr32_nohw
41.private_extern _ChaCha20_ctr32_nohw
42
43.p2align	6
44_ChaCha20_ctr32_nohw:
45
46_CET_ENDBR
// Save every callee-saved register that the body overwrites.
47	pushq	%rbx
48
49	pushq	%rbp
50
51	pushq	%r12
52
53	pushq	%r13
54
55	pushq	%r14
56
57	pushq	%r15
58
59	subq	$64+24,%rsp
60
61L$ctr32_body:
62
63
// Load key and counter block into xmm registers and mirror them on the stack.
64	movdqu	(%rcx),%xmm1
65	movdqu	16(%rcx),%xmm2
66	movdqu	(%r8),%xmm3
67	movdqa	L$one(%rip),%xmm4
68
69
70	movdqa	%xmm1,16(%rsp)
71	movdqa	%xmm2,32(%rsp)
72	movdqa	%xmm3,48(%rsp)
// %rbp carries the remaining byte count between blocks.
73	movq	%rdx,%rbp
74	jmp	L$oop_outer
75
76.p2align	5
// Start of one 64-byte block: build the initial state in registers.
77L$oop_outer:
// The four immediates are the little-endian words of "expand 32-byte k".
78	movl	$0x61707865,%eax
79	movl	$0x3320646e,%ebx
80	movl	$0x79622d32,%ecx
81	movl	$0x6b206574,%edx
82	movl	16(%rsp),%r8d
83	movl	20(%rsp),%r9d
84	movl	24(%rsp),%r10d
85	movl	28(%rsp),%r11d
// Word 12 is the current block counter, taken from the live copy in %xmm3.
86	movd	%xmm3,%r12d
87	movl	52(%rsp),%r13d
88	movl	56(%rsp),%r14d
89	movl	60(%rsp),%r15d
90
// Park length and both pointers: %rbp, %rsi and %rdi are needed as state
// registers during the rounds.
91	movq	%rbp,64+0(%rsp)
92	movl	$10,%ebp
93	movq	%rsi,64+8(%rsp)
// Hand-encoded movq %xmm2,%rsi: the low 64 bits of key words 4-7 give state
// word 8 in %esi and, after the shift below, state word 9 in %edi.
94.byte	102,72,15,126,214
95	movq	%rdi,64+16(%rsp)
96	movq	%rsi,%rdi
97	shrq	$32,%rdi
98	jmp	L$oop
99
100.p2align	5
// One iteration is a double round (column round then diagonal round); ten
// iterations are run. Quarter-rounds are interleaved in pairs.
101L$oop:
// Column quarter-rounds on words (0,4,8,12) and (1,5,9,13).
102	addl	%r8d,%eax
103	xorl	%eax,%r12d
104	roll	$16,%r12d
105	addl	%r9d,%ebx
106	xorl	%ebx,%r13d
107	roll	$16,%r13d
108	addl	%r12d,%esi
109	xorl	%esi,%r8d
110	roll	$12,%r8d
111	addl	%r13d,%edi
112	xorl	%edi,%r9d
113	roll	$12,%r9d
114	addl	%r8d,%eax
115	xorl	%eax,%r12d
116	roll	$8,%r12d
117	addl	%r9d,%ebx
118	xorl	%ebx,%r13d
119	roll	$8,%r13d
120	addl	%r12d,%esi
121	xorl	%esi,%r8d
122	roll	$7,%r8d
123	addl	%r13d,%edi
124	xorl	%edi,%r9d
125	roll	$7,%r9d
// Spill words 8 and 9, bring words 10 and 11 into %esi/%edi.
126	movl	%esi,32(%rsp)
127	movl	%edi,36(%rsp)
128	movl	40(%rsp),%esi
129	movl	44(%rsp),%edi
// Column quarter-rounds on words (2,6,10,14) and (3,7,11,15).
130	addl	%r10d,%ecx
131	xorl	%ecx,%r14d
132	roll	$16,%r14d
133	addl	%r11d,%edx
134	xorl	%edx,%r15d
135	roll	$16,%r15d
136	addl	%r14d,%esi
137	xorl	%esi,%r10d
138	roll	$12,%r10d
139	addl	%r15d,%edi
140	xorl	%edi,%r11d
141	roll	$12,%r11d
142	addl	%r10d,%ecx
143	xorl	%ecx,%r14d
144	roll	$8,%r14d
145	addl	%r11d,%edx
146	xorl	%edx,%r15d
147	roll	$8,%r15d
148	addl	%r14d,%esi
149	xorl	%esi,%r10d
150	roll	$7,%r10d
151	addl	%r15d,%edi
152	xorl	%edi,%r11d
153	roll	$7,%r11d
// Diagonal quarter-rounds on words (0,5,10,15) and (1,6,11,12); words 10 and
// 11 are still in %esi/%edi.
154	addl	%r9d,%eax
155	xorl	%eax,%r15d
156	roll	$16,%r15d
157	addl	%r10d,%ebx
158	xorl	%ebx,%r12d
159	roll	$16,%r12d
160	addl	%r15d,%esi
161	xorl	%esi,%r9d
162	roll	$12,%r9d
163	addl	%r12d,%edi
164	xorl	%edi,%r10d
165	roll	$12,%r10d
166	addl	%r9d,%eax
167	xorl	%eax,%r15d
168	roll	$8,%r15d
169	addl	%r10d,%ebx
170	xorl	%ebx,%r12d
171	roll	$8,%r12d
172	addl	%r15d,%esi
173	xorl	%esi,%r9d
174	roll	$7,%r9d
175	addl	%r12d,%edi
176	xorl	%edi,%r10d
177	roll	$7,%r10d
// Spill words 10 and 11, bring words 8 and 9 back.
178	movl	%esi,40(%rsp)
179	movl	%edi,44(%rsp)
180	movl	32(%rsp),%esi
181	movl	36(%rsp),%edi
// Diagonal quarter-rounds on words (2,7,8,13) and (3,4,9,14).
182	addl	%r11d,%ecx
183	xorl	%ecx,%r13d
184	roll	$16,%r13d
185	addl	%r8d,%edx
186	xorl	%edx,%r14d
187	roll	$16,%r14d
188	addl	%r13d,%esi
189	xorl	%esi,%r11d
190	roll	$12,%r11d
191	addl	%r14d,%edi
192	xorl	%edi,%r8d
193	roll	$12,%r8d
194	addl	%r11d,%ecx
195	xorl	%ecx,%r13d
196	roll	$8,%r13d
197	addl	%r8d,%edx
198	xorl	%edx,%r14d
199	roll	$8,%r14d
200	addl	%r13d,%esi
201	xorl	%esi,%r11d
202	roll	$7,%r11d
203	addl	%r14d,%edi
204	xorl	%edi,%r8d
205	roll	$7,%r8d
206	decl	%ebp
207	jnz	L$oop
// Rounds finished. Spill words 8 and 9 so that 32..47(%rsp) holds all of
// words 8-11, reload the byte count and pointers, and step the block counter
// held in %xmm3.
208	movl	%edi,36(%rsp)
209	movl	%esi,32(%rsp)
210	movq	64(%rsp),%rbp
211	movdqa	%xmm2,%xmm1
212	movq	64+8(%rsp),%rsi
213	paddd	%xmm4,%xmm3
214	movq	64+16(%rsp),%rdi
215
// Feed-forward: add the block input state to the permuted state. 48(%rsp)
// still holds the counter value used for this block.
216	addl	$0x61707865,%eax
217	addl	$0x3320646e,%ebx
218	addl	$0x79622d32,%ecx
219	addl	$0x6b206574,%edx
220	addl	16(%rsp),%r8d
221	addl	20(%rsp),%r9d
222	addl	24(%rsp),%r10d
223	addl	28(%rsp),%r11d
224	addl	48(%rsp),%r12d
225	addl	52(%rsp),%r13d
226	addl	56(%rsp),%r14d
227	addl	60(%rsp),%r15d
// %xmm1 = key words 4-7 + permuted words 8-11 = keystream words 8-11.
228	paddd	32(%rsp),%xmm1
229
// Unsigned compare: a partial final block is handled bytewise at L$tail.
230	cmpq	$64,%rbp
231	jb	L$tail
232
// Full block: XOR the keystream with 64 source bytes.
233	xorl	0(%rsi),%eax
234	xorl	4(%rsi),%ebx
235	xorl	8(%rsi),%ecx
236	xorl	12(%rsi),%edx
237	xorl	16(%rsi),%r8d
238	xorl	20(%rsi),%r9d
239	xorl	24(%rsi),%r10d
240	xorl	28(%rsi),%r11d
241	movdqu	32(%rsi),%xmm0
242	xorl	48(%rsi),%r12d
243	xorl	52(%rsi),%r13d
244	xorl	56(%rsi),%r14d
245	xorl	60(%rsi),%r15d
246	leaq	64(%rsi),%rsi
247	pxor	%xmm1,%xmm0
248
// Put key words 4-7 back into their stack slot (it was used as spill space)
// and store the stepped counter for the next block.
249	movdqa	%xmm2,32(%rsp)
250	movd	%xmm3,48(%rsp)
251
// Write the 64 result bytes.
252	movl	%eax,0(%rdi)
253	movl	%ebx,4(%rdi)
254	movl	%ecx,8(%rdi)
255	movl	%edx,12(%rdi)
256	movl	%r8d,16(%rdi)
257	movl	%r9d,20(%rdi)
258	movl	%r10d,24(%rdi)
259	movl	%r11d,28(%rdi)
260	movdqu	%xmm0,32(%rdi)
261	movl	%r12d,48(%rdi)
262	movl	%r13d,52(%rdi)
263	movl	%r14d,56(%rdi)
264	movl	%r15d,60(%rdi)
265	leaq	64(%rdi),%rdi
266
267	subq	$64,%rbp
268	jnz	L$oop_outer
269
270	jmp	L$done
271
272.p2align	4
// Fewer than 64 bytes remain: write the whole keystream block to
// 0..63(%rsp), overwriting the key and counter copies, which are no longer
// needed. %rbx becomes the byte index for the loop below.
273L$tail:
274	movl	%eax,0(%rsp)
275	movl	%ebx,4(%rsp)
276	xorq	%rbx,%rbx
277	movl	%ecx,8(%rsp)
278	movl	%edx,12(%rsp)
279	movl	%r8d,16(%rsp)
280	movl	%r9d,20(%rsp)
281	movl	%r10d,24(%rsp)
282	movl	%r11d,28(%rsp)
283	movdqa	%xmm1,32(%rsp)
284	movl	%r12d,48(%rsp)
285	movl	%r13d,52(%rsp)
286	movl	%r14d,56(%rsp)
287	movl	%r15d,60(%rsp)
288
// Bytewise XOR of the remaining %rbp bytes. The index is incremented before
// the store, hence the -1 displacement.
289L$oop_tail:
290	movzbl	(%rsi,%rbx,1),%eax
291	movzbl	(%rsp,%rbx,1),%edx
292	leaq	1(%rbx),%rbx
293	xorl	%edx,%eax
294	movb	%al,-1(%rdi,%rbx,1)
295	decq	%rbp
296	jnz	L$oop_tail
297
// Epilogue: %rsi = address just above the six saved registers; reload them
// relative to it and release the frame by setting %rsp to that address.
298L$done:
299	leaq	64+24+48(%rsp),%rsi
300	movq	-48(%rsi),%r15
301
302	movq	-40(%rsi),%r14
303
304	movq	-32(%rsi),%r13
305
306	movq	-24(%rsi),%r12
307
308	movq	-16(%rsi),%rbp
309
310	movq	-8(%rsi),%rbx
311
312	leaq	(%rsi),%rsp
313
314L$no_data:
315	ret
316
317
// ChaCha20_ctr32_ssse3: one-block-at-a-time SSE variant. The 4x4 state is
// held as four rows, one per register: %xmm0 = constants, %xmm1 = key words
// 0-3, %xmm2 = key words 4-7, %xmm3 = counter/nonce block.
//
// Registers as read on entry by the code below:
//   %rdi  destination buffer (written)
//   %rsi  source buffer (read)
//   %rdx  byte count
//   %rcx  32 bytes of key material
//   %r8   16-byte counter/nonce block
// Presumably the same C prototype as the other ChaCha20_ctr32_* entry
// points -- confirm against the C declaration.
//
// NOTE(review): no test for a zero byte count is visible; with %rdx == 0 the
// tail loop would decrement before testing. Callers presumably guarantee a
// nonzero length -- confirm.
//
// Other roles: %r9 = %rsp at entry (restored at L$done_ssse3); %r8 is reused
// as the double-round counter and later as the tail byte index; %xmm6 and
// %xmm7 hold the L$rot16 and L$rot24 shuffle masks; %xmm4 and %xmm5 are
// scratch. 0..63(%rsp) holds the input state of the current block and, on the
// tail path, the final keystream block.
318.globl	_ChaCha20_ctr32_ssse3
319.private_extern _ChaCha20_ctr32_ssse3
320
321.p2align	5
322_ChaCha20_ctr32_ssse3:
323
324_CET_ENDBR
325	movq	%rsp,%r9
326
// 72 bytes: 64 for the state copy plus 8 so that the movdqa stack accesses
// below are 16-byte aligned.
327	subq	$64+8,%rsp
328	movdqa	L$sigma(%rip),%xmm0
329	movdqu	(%rcx),%xmm1
330	movdqu	16(%rcx),%xmm2
331	movdqu	(%r8),%xmm3
332	movdqa	L$rot16(%rip),%xmm6
333	movdqa	L$rot24(%rip),%xmm7
334
// Keep a copy of the input state for the feed-forward addition.
335	movdqa	%xmm0,0(%rsp)
336	movdqa	%xmm1,16(%rsp)
337	movdqa	%xmm2,32(%rsp)
338	movdqa	%xmm3,48(%rsp)
339	movq	$10,%r8
340	jmp	L$oop_ssse3
341
342.p2align	5
// Next block: reload the saved state and step the counter dword by one,
// both in %xmm3 and in the stack copy.
343L$oop_outer_ssse3:
344	movdqa	L$one(%rip),%xmm3
345	movdqa	0(%rsp),%xmm0
346	movdqa	16(%rsp),%xmm1
347	movdqa	32(%rsp),%xmm2
348	paddd	48(%rsp),%xmm3
349	movq	$10,%r8
350	movdqa	%xmm3,48(%rsp)
351	jmp	L$oop_ssse3
352
353.p2align	5
// One iteration is a double round. Each half performs four quarter-rounds at
// once, one per dword lane. The .byte 102,15,56,0,N sequences are
// hand-encoded pshufb instructions: N=222 is pshufb %xmm6,%xmm3 (rotate by
// 16) and N=223 is pshufb %xmm7,%xmm3 (rotate left by 8). The rotations by 12
// and 7 are built from a shift pair plus por.
354L$oop_ssse3:
355	paddd	%xmm1,%xmm0
356	pxor	%xmm0,%xmm3
357.byte	102,15,56,0,222
358	paddd	%xmm3,%xmm2
359	pxor	%xmm2,%xmm1
360	movdqa	%xmm1,%xmm4
361	psrld	$20,%xmm1
362	pslld	$12,%xmm4
363	por	%xmm4,%xmm1
364	paddd	%xmm1,%xmm0
365	pxor	%xmm0,%xmm3
366.byte	102,15,56,0,223
367	paddd	%xmm3,%xmm2
368	pxor	%xmm2,%xmm1
369	movdqa	%xmm1,%xmm4
370	psrld	$25,%xmm1
371	pslld	$7,%xmm4
372	por	%xmm4,%xmm1
// Rotate rows 1, 2 and 3 by one, two and three dword positions so that the
// next half-round works on the diagonals of the state.
373	pshufd	$78,%xmm2,%xmm2
374	pshufd	$57,%xmm1,%xmm1
375	pshufd	$147,%xmm3,%xmm3
376	nop
377	paddd	%xmm1,%xmm0
378	pxor	%xmm0,%xmm3
379.byte	102,15,56,0,222
380	paddd	%xmm3,%xmm2
381	pxor	%xmm2,%xmm1
382	movdqa	%xmm1,%xmm4
383	psrld	$20,%xmm1
384	pslld	$12,%xmm4
385	por	%xmm4,%xmm1
386	paddd	%xmm1,%xmm0
387	pxor	%xmm0,%xmm3
388.byte	102,15,56,0,223
389	paddd	%xmm3,%xmm2
390	pxor	%xmm2,%xmm1
391	movdqa	%xmm1,%xmm4
392	psrld	$25,%xmm1
393	pslld	$7,%xmm4
394	por	%xmm4,%xmm1
// Undo the row rotation (the immediates for rows 1 and 3 are swapped
// relative to the rotation above).
395	pshufd	$78,%xmm2,%xmm2
396	pshufd	$147,%xmm1,%xmm1
397	pshufd	$57,%xmm3,%xmm3
398	decq	%r8
399	jnz	L$oop_ssse3
// Feed-forward: add the saved input state to obtain the keystream block.
400	paddd	0(%rsp),%xmm0
401	paddd	16(%rsp),%xmm1
402	paddd	32(%rsp),%xmm2
403	paddd	48(%rsp),%xmm3
404
// Unsigned compare: a partial final block goes to the bytewise tail.
405	cmpq	$64,%rdx
406	jb	L$tail_ssse3
407
// Full block: XOR 64 source bytes with the keystream and store them.
408	movdqu	0(%rsi),%xmm4
409	movdqu	16(%rsi),%xmm5
410	pxor	%xmm4,%xmm0
411	movdqu	32(%rsi),%xmm4
412	pxor	%xmm5,%xmm1
413	movdqu	48(%rsi),%xmm5
414	leaq	64(%rsi),%rsi
415	pxor	%xmm4,%xmm2
416	pxor	%xmm5,%xmm3
417
418	movdqu	%xmm0,0(%rdi)
419	movdqu	%xmm1,16(%rdi)
420	movdqu	%xmm2,32(%rdi)
421	movdqu	%xmm3,48(%rdi)
422	leaq	64(%rdi),%rdi
423
424	subq	$64,%rdx
425	jnz	L$oop_outer_ssse3
426
427	jmp	L$done_ssse3
428
429.p2align	4
// Fewer than 64 bytes remain: overwrite the stack copy of the state with the
// keystream block and consume it one byte at a time. %r8 is the byte index.
430L$tail_ssse3:
431	movdqa	%xmm0,0(%rsp)
432	movdqa	%xmm1,16(%rsp)
433	movdqa	%xmm2,32(%rsp)
434	movdqa	%xmm3,48(%rsp)
435	xorq	%r8,%r8
436
// The index is incremented before the store, hence the -1 displacement.
437L$oop_tail_ssse3:
438	movzbl	(%rsi,%r8,1),%eax
439	movzbl	(%rsp,%r8,1),%ecx
440	leaq	1(%r8),%r8
441	xorl	%ecx,%eax
442	movb	%al,-1(%rdi,%r8,1)
443	decq	%rdx
444	jnz	L$oop_tail_ssse3
445
// Release the frame by restoring the entry stack pointer saved in %r9.
446L$done_ssse3:
447	leaq	(%r9),%rsp
448
449L$ssse3_epilogue:
450	ret
451
452
// ChaCha20_ctr32_ssse3_4x: computes four 64-byte blocks per outer iteration.
// The state is lane-sliced: each xmm register holds ONE state word for all
// four blocks, so every SIMD instruction advances four blocks at once. After
// the rounds the result is transposed back into block order.
//
// Registers as read on entry by the code below:
//   %rdi  destination buffer (written)
//   %rsi  source buffer (read)
//   %rdx  byte count
//   %rcx  32 bytes of key material (then reused as a frame base pointer)
//   %r8   16-byte counter/nonce block
// Presumably the same C prototype as the other ChaCha20_ctr32_* entry
// points -- confirm against the C declaration.
//
// NOTE(review): no test for a zero byte count is visible; with %rdx == 0 the
// tail loop would decrement before testing. Callers presumably guarantee a
// nonzero length -- confirm.
//
// Frame (0x148 bytes; %rcx = %rsp+256 so that the upper slots can be reached
// with one-byte displacements):
//     0..63(%rsp)   scratch: spill area for state words 8-11 during the
//                   rounds, then transposed output and tail keystream
//    64..127(%rsp)  the four sigma words, each broadcast to all lanes
//   128..191(%rsp)  key words 0-3, broadcast
//   192..255(%rsp)  key words 4-7, broadcast
//   256..271(%rsp)  per-lane block counters (counter + 0,1,2,3; stepped by 4)
//   272..319(%rsp)  remaining three counter-block words, broadcast
//
// Register roles during the rounds:
//   %xmm8-%xmm11   state words 0-3
//   %xmm12-%xmm15  state words 4-7
//   %xmm4,%xmm5    two of state words 8-11 (the other two are spilled)
//   %xmm0-%xmm3    state words 12-15
//   %xmm6,%xmm7    temporaries, also used for the rot24/rot16 masks
//   %r10,%r11      addresses of L$rot16 and L$rot24
//   %eax           double-round counter, %r9 = %rsp at entry
// The .byte 102,15,56,0,N sequences are hand-encoded pshufb instructions
// whose mask register is %xmm7 (rot16) or %xmm6 (rot24).
453.globl	_ChaCha20_ctr32_ssse3_4x
454.private_extern _ChaCha20_ctr32_ssse3_4x
455
456.p2align	5
457_ChaCha20_ctr32_ssse3_4x:
458
459_CET_ENDBR
460	movq	%rsp,%r9
461
// NOTE(review): this copy has no effect on the result; %r11 is overwritten
// by the leaq of L$rot24 below before it is ever read.
462	movq	%r10,%r11
463	subq	$0x140+8,%rsp
464	movdqa	L$sigma(%rip),%xmm11
465	movdqu	(%rcx),%xmm15
466	movdqu	16(%rcx),%xmm7
467	movdqu	(%r8),%xmm3
468	leaq	256(%rsp),%rcx
469	leaq	L$rot16(%rip),%r10
470	leaq	L$rot24(%rip),%r11
471
// Broadcast each sigma word to all four lanes and save it.
472	pshufd	$0x00,%xmm11,%xmm8
473	pshufd	$0x55,%xmm11,%xmm9
474	movdqa	%xmm8,64(%rsp)
475	pshufd	$0xaa,%xmm11,%xmm10
476	movdqa	%xmm9,80(%rsp)
477	pshufd	$0xff,%xmm11,%xmm11
478	movdqa	%xmm10,96(%rsp)
479	movdqa	%xmm11,112(%rsp)
480
// Same for key words 0-3.
481	pshufd	$0x00,%xmm15,%xmm12
482	pshufd	$0x55,%xmm15,%xmm13
483	movdqa	%xmm12,128-256(%rcx)
484	pshufd	$0xaa,%xmm15,%xmm14
485	movdqa	%xmm13,144-256(%rcx)
486	pshufd	$0xff,%xmm15,%xmm15
487	movdqa	%xmm14,160-256(%rcx)
488	movdqa	%xmm15,176-256(%rcx)
489
// Same for key words 4-7.
490	pshufd	$0x00,%xmm7,%xmm4
491	pshufd	$0x55,%xmm7,%xmm5
492	movdqa	%xmm4,192-256(%rcx)
493	pshufd	$0xaa,%xmm7,%xmm6
494	movdqa	%xmm5,208-256(%rcx)
495	pshufd	$0xff,%xmm7,%xmm7
496	movdqa	%xmm6,224-256(%rcx)
497	movdqa	%xmm7,240-256(%rcx)
498
// Counter block: the first word gets per-lane offsets 0..3 so that the four
// lanes process consecutive blocks. It is stored at L$oop_enter4x.
499	pshufd	$0x00,%xmm3,%xmm0
500	pshufd	$0x55,%xmm3,%xmm1
501	paddd	L$inc(%rip),%xmm0
502	pshufd	$0xaa,%xmm3,%xmm2
503	movdqa	%xmm1,272-256(%rcx)
504	pshufd	$0xff,%xmm3,%xmm3
505	movdqa	%xmm2,288-256(%rcx)
506	movdqa	%xmm3,304-256(%rcx)
507
508	jmp	L$oop_enter4x
509
510.p2align	5
// Next group of four blocks: reload the saved input state and advance every
// lane counter by 4.
511L$oop_outer4x:
512	movdqa	64(%rsp),%xmm8
513	movdqa	80(%rsp),%xmm9
514	movdqa	96(%rsp),%xmm10
515	movdqa	112(%rsp),%xmm11
516	movdqa	128-256(%rcx),%xmm12
517	movdqa	144-256(%rcx),%xmm13
518	movdqa	160-256(%rcx),%xmm14
519	movdqa	176-256(%rcx),%xmm15
520	movdqa	192-256(%rcx),%xmm4
521	movdqa	208-256(%rcx),%xmm5
522	movdqa	224-256(%rcx),%xmm6
523	movdqa	240-256(%rcx),%xmm7
524	movdqa	256-256(%rcx),%xmm0
525	movdqa	272-256(%rcx),%xmm1
526	movdqa	288-256(%rcx),%xmm2
527	movdqa	304-256(%rcx),%xmm3
528	paddd	L$four(%rip),%xmm0
529
// Spill state words 10 and 11 to free %xmm6/%xmm7, load the rot16 mask,
// record the lane counters used for this group, and start ten double rounds.
530L$oop_enter4x:
531	movdqa	%xmm6,32(%rsp)
532	movdqa	%xmm7,48(%rsp)
533	movdqa	(%r10),%xmm7
534	movl	$10,%eax
535	movdqa	%xmm0,256-256(%rcx)
536	jmp	L$oop4x
537
538.p2align	5
// One iteration is a double round; quarter-rounds are interleaved in pairs.
539L$oop4x:
// Column quarter-rounds on words (0,4,8,12) and (1,5,9,13).
540	paddd	%xmm12,%xmm8
541	paddd	%xmm13,%xmm9
542	pxor	%xmm8,%xmm0
543	pxor	%xmm9,%xmm1
544.byte	102,15,56,0,199
545.byte	102,15,56,0,207
546	paddd	%xmm0,%xmm4
547	paddd	%xmm1,%xmm5
548	pxor	%xmm4,%xmm12
549	pxor	%xmm5,%xmm13
550	movdqa	%xmm12,%xmm6
551	pslld	$12,%xmm12
552	psrld	$20,%xmm6
553	movdqa	%xmm13,%xmm7
554	pslld	$12,%xmm13
555	por	%xmm6,%xmm12
556	psrld	$20,%xmm7
557	movdqa	(%r11),%xmm6
558	por	%xmm7,%xmm13
559	paddd	%xmm12,%xmm8
560	paddd	%xmm13,%xmm9
561	pxor	%xmm8,%xmm0
562	pxor	%xmm9,%xmm1
563.byte	102,15,56,0,198
564.byte	102,15,56,0,206
565	paddd	%xmm0,%xmm4
566	paddd	%xmm1,%xmm5
567	pxor	%xmm4,%xmm12
568	pxor	%xmm5,%xmm13
569	movdqa	%xmm12,%xmm7
570	pslld	$7,%xmm12
571	psrld	$25,%xmm7
572	movdqa	%xmm13,%xmm6
573	pslld	$7,%xmm13
574	por	%xmm7,%xmm12
575	psrld	$25,%xmm6
576	movdqa	(%r10),%xmm7
577	por	%xmm6,%xmm13
// Spill words 8 and 9, bring words 10 and 11 into %xmm4/%xmm5.
578	movdqa	%xmm4,0(%rsp)
579	movdqa	%xmm5,16(%rsp)
580	movdqa	32(%rsp),%xmm4
581	movdqa	48(%rsp),%xmm5
// Column quarter-rounds on words (2,6,10,14) and (3,7,11,15).
582	paddd	%xmm14,%xmm10
583	paddd	%xmm15,%xmm11
584	pxor	%xmm10,%xmm2
585	pxor	%xmm11,%xmm3
586.byte	102,15,56,0,215
587.byte	102,15,56,0,223
588	paddd	%xmm2,%xmm4
589	paddd	%xmm3,%xmm5
590	pxor	%xmm4,%xmm14
591	pxor	%xmm5,%xmm15
592	movdqa	%xmm14,%xmm6
593	pslld	$12,%xmm14
594	psrld	$20,%xmm6
595	movdqa	%xmm15,%xmm7
596	pslld	$12,%xmm15
597	por	%xmm6,%xmm14
598	psrld	$20,%xmm7
599	movdqa	(%r11),%xmm6
600	por	%xmm7,%xmm15
601	paddd	%xmm14,%xmm10
602	paddd	%xmm15,%xmm11
603	pxor	%xmm10,%xmm2
604	pxor	%xmm11,%xmm3
605.byte	102,15,56,0,214
606.byte	102,15,56,0,222
607	paddd	%xmm2,%xmm4
608	paddd	%xmm3,%xmm5
609	pxor	%xmm4,%xmm14
610	pxor	%xmm5,%xmm15
611	movdqa	%xmm14,%xmm7
612	pslld	$7,%xmm14
613	psrld	$25,%xmm7
614	movdqa	%xmm15,%xmm6
615	pslld	$7,%xmm15
616	por	%xmm7,%xmm14
617	psrld	$25,%xmm6
618	movdqa	(%r10),%xmm7
619	por	%xmm6,%xmm15
// Diagonal quarter-rounds on words (0,5,10,15) and (1,6,11,12); words 10 and
// 11 are still in %xmm4/%xmm5.
620	paddd	%xmm13,%xmm8
621	paddd	%xmm14,%xmm9
622	pxor	%xmm8,%xmm3
623	pxor	%xmm9,%xmm0
624.byte	102,15,56,0,223
625.byte	102,15,56,0,199
626	paddd	%xmm3,%xmm4
627	paddd	%xmm0,%xmm5
628	pxor	%xmm4,%xmm13
629	pxor	%xmm5,%xmm14
630	movdqa	%xmm13,%xmm6
631	pslld	$12,%xmm13
632	psrld	$20,%xmm6
633	movdqa	%xmm14,%xmm7
634	pslld	$12,%xmm14
635	por	%xmm6,%xmm13
636	psrld	$20,%xmm7
637	movdqa	(%r11),%xmm6
638	por	%xmm7,%xmm14
639	paddd	%xmm13,%xmm8
640	paddd	%xmm14,%xmm9
641	pxor	%xmm8,%xmm3
642	pxor	%xmm9,%xmm0
643.byte	102,15,56,0,222
644.byte	102,15,56,0,198
645	paddd	%xmm3,%xmm4
646	paddd	%xmm0,%xmm5
647	pxor	%xmm4,%xmm13
648	pxor	%xmm5,%xmm14
649	movdqa	%xmm13,%xmm7
650	pslld	$7,%xmm13
651	psrld	$25,%xmm7
652	movdqa	%xmm14,%xmm6
653	pslld	$7,%xmm14
654	por	%xmm7,%xmm13
655	psrld	$25,%xmm6
656	movdqa	(%r10),%xmm7
657	por	%xmm6,%xmm14
// Spill words 10 and 11, bring words 8 and 9 back.
658	movdqa	%xmm4,32(%rsp)
659	movdqa	%xmm5,48(%rsp)
660	movdqa	0(%rsp),%xmm4
661	movdqa	16(%rsp),%xmm5
// Diagonal quarter-rounds on words (2,7,8,13) and (3,4,9,14).
662	paddd	%xmm15,%xmm10
663	paddd	%xmm12,%xmm11
664	pxor	%xmm10,%xmm1
665	pxor	%xmm11,%xmm2
666.byte	102,15,56,0,207
667.byte	102,15,56,0,215
668	paddd	%xmm1,%xmm4
669	paddd	%xmm2,%xmm5
670	pxor	%xmm4,%xmm15
671	pxor	%xmm5,%xmm12
672	movdqa	%xmm15,%xmm6
673	pslld	$12,%xmm15
674	psrld	$20,%xmm6
675	movdqa	%xmm12,%xmm7
676	pslld	$12,%xmm12
677	por	%xmm6,%xmm15
678	psrld	$20,%xmm7
679	movdqa	(%r11),%xmm6
680	por	%xmm7,%xmm12
681	paddd	%xmm15,%xmm10
682	paddd	%xmm12,%xmm11
683	pxor	%xmm10,%xmm1
684	pxor	%xmm11,%xmm2
685.byte	102,15,56,0,206
686.byte	102,15,56,0,214
687	paddd	%xmm1,%xmm4
688	paddd	%xmm2,%xmm5
689	pxor	%xmm4,%xmm15
690	pxor	%xmm5,%xmm12
691	movdqa	%xmm15,%xmm7
692	pslld	$7,%xmm15
693	psrld	$25,%xmm7
694	movdqa	%xmm12,%xmm6
695	pslld	$7,%xmm12
696	por	%xmm7,%xmm15
697	psrld	$25,%xmm6
698	movdqa	(%r10),%xmm7
699	por	%xmm6,%xmm12
700	decl	%eax
701	jnz	L$oop4x
702
// Feed-forward for words 0-3, then a 4x4 dword transpose (punpck*dq followed
// by punpck*qdq). Afterwards %xmm8, %xmm9, %xmm6 and %xmm11 hold bytes 0-15
// of blocks 0, 1, 2 and 3 respectively.
703	paddd	64(%rsp),%xmm8
704	paddd	80(%rsp),%xmm9
705	paddd	96(%rsp),%xmm10
706	paddd	112(%rsp),%xmm11
707
708	movdqa	%xmm8,%xmm6
709	punpckldq	%xmm9,%xmm8
710	movdqa	%xmm10,%xmm7
711	punpckldq	%xmm11,%xmm10
712	punpckhdq	%xmm9,%xmm6
713	punpckhdq	%xmm11,%xmm7
714	movdqa	%xmm8,%xmm9
715	punpcklqdq	%xmm10,%xmm8
716	movdqa	%xmm6,%xmm11
717	punpcklqdq	%xmm7,%xmm6
718	punpckhqdq	%xmm10,%xmm9
719	punpckhqdq	%xmm7,%xmm11
// Feed-forward for words 4-7.
720	paddd	128-256(%rcx),%xmm12
721	paddd	144-256(%rcx),%xmm13
722	paddd	160-256(%rcx),%xmm14
723	paddd	176-256(%rcx),%xmm15
724
// Park the first 16 bytes of blocks 0 and 1 on the stack and pick up state
// words 10 and 11, which were spilled during the rounds.
725	movdqa	%xmm8,0(%rsp)
726	movdqa	%xmm9,16(%rsp)
727	movdqa	32(%rsp),%xmm8
728	movdqa	48(%rsp),%xmm9
729
// Transpose words 4-7: bytes 16-31 of blocks 0..3 end up in %xmm12, %xmm13,
// %xmm10 and %xmm15.
730	movdqa	%xmm12,%xmm10
731	punpckldq	%xmm13,%xmm12
732	movdqa	%xmm14,%xmm7
733	punpckldq	%xmm15,%xmm14
734	punpckhdq	%xmm13,%xmm10
735	punpckhdq	%xmm15,%xmm7
736	movdqa	%xmm12,%xmm13
737	punpcklqdq	%xmm14,%xmm12
738	movdqa	%xmm10,%xmm15
739	punpcklqdq	%xmm7,%xmm10
740	punpckhqdq	%xmm14,%xmm13
741	punpckhqdq	%xmm7,%xmm15
// Feed-forward for words 8-11.
742	paddd	192-256(%rcx),%xmm4
743	paddd	208-256(%rcx),%xmm5
744	paddd	224-256(%rcx),%xmm8
745	paddd	240-256(%rcx),%xmm9
746
// Park the first 16 bytes of blocks 2 and 3 as well.
747	movdqa	%xmm6,32(%rsp)
748	movdqa	%xmm11,48(%rsp)
749
// Transpose words 8-11: bytes 32-47 of blocks 0..3 end up in %xmm4, %xmm5,
// %xmm14 and %xmm9.
750	movdqa	%xmm4,%xmm14
751	punpckldq	%xmm5,%xmm4
752	movdqa	%xmm8,%xmm7
753	punpckldq	%xmm9,%xmm8
754	punpckhdq	%xmm5,%xmm14
755	punpckhdq	%xmm9,%xmm7
756	movdqa	%xmm4,%xmm5
757	punpcklqdq	%xmm8,%xmm4
758	movdqa	%xmm14,%xmm9
759	punpcklqdq	%xmm7,%xmm14
760	punpckhqdq	%xmm8,%xmm5
761	punpckhqdq	%xmm7,%xmm9
// Feed-forward for words 12-15; 256-256(%rcx) holds the lane counters that
// were used for this group.
762	paddd	256-256(%rcx),%xmm0
763	paddd	272-256(%rcx),%xmm1
764	paddd	288-256(%rcx),%xmm2
765	paddd	304-256(%rcx),%xmm3
766
// Transpose words 12-15: bytes 48-63 of blocks 0..3 end up in %xmm0, %xmm1,
// %xmm8 and %xmm3.
767	movdqa	%xmm0,%xmm8
768	punpckldq	%xmm1,%xmm0
769	movdqa	%xmm2,%xmm7
770	punpckldq	%xmm3,%xmm2
771	punpckhdq	%xmm1,%xmm8
772	punpckhdq	%xmm3,%xmm7
773	movdqa	%xmm0,%xmm1
774	punpcklqdq	%xmm2,%xmm0
775	movdqa	%xmm8,%xmm3
776	punpcklqdq	%xmm7,%xmm8
777	punpckhqdq	%xmm2,%xmm1
778	punpckhqdq	%xmm7,%xmm3
// Unsigned compare: fewer than 256 bytes left goes to the tail dispatcher.
779	cmpq	$256,%rdx
780	jb	L$tail4x
781
// Full 256 bytes. Block 0 = 0(%rsp), %xmm12, %xmm4, %xmm0.
782	movdqu	0(%rsi),%xmm6
783	movdqu	16(%rsi),%xmm11
784	movdqu	32(%rsi),%xmm2
785	movdqu	48(%rsi),%xmm7
786	pxor	0(%rsp),%xmm6
787	pxor	%xmm12,%xmm11
788	pxor	%xmm4,%xmm2
789	pxor	%xmm0,%xmm7
790
// Block 1 = 16(%rsp), %xmm13, %xmm5, %xmm1. Stores of the previous block are
// interleaved with loads for the next one.
791	movdqu	%xmm6,0(%rdi)
792	movdqu	64(%rsi),%xmm6
793	movdqu	%xmm11,16(%rdi)
794	movdqu	80(%rsi),%xmm11
795	movdqu	%xmm2,32(%rdi)
796	movdqu	96(%rsi),%xmm2
797	movdqu	%xmm7,48(%rdi)
798	movdqu	112(%rsi),%xmm7
799	leaq	128(%rsi),%rsi
800	pxor	16(%rsp),%xmm6
801	pxor	%xmm13,%xmm11
802	pxor	%xmm5,%xmm2
803	pxor	%xmm1,%xmm7
804
// Block 2 = 32(%rsp), %xmm10, %xmm14, %xmm8.
805	movdqu	%xmm6,64(%rdi)
806	movdqu	0(%rsi),%xmm6
807	movdqu	%xmm11,80(%rdi)
808	movdqu	16(%rsi),%xmm11
809	movdqu	%xmm2,96(%rdi)
810	movdqu	32(%rsi),%xmm2
811	movdqu	%xmm7,112(%rdi)
812	leaq	128(%rdi),%rdi
813	movdqu	48(%rsi),%xmm7
814	pxor	32(%rsp),%xmm6
815	pxor	%xmm10,%xmm11
816	pxor	%xmm14,%xmm2
817	pxor	%xmm8,%xmm7
818
// Block 3 = 48(%rsp), %xmm15, %xmm9, %xmm3.
819	movdqu	%xmm6,0(%rdi)
820	movdqu	64(%rsi),%xmm6
821	movdqu	%xmm11,16(%rdi)
822	movdqu	80(%rsi),%xmm11
823	movdqu	%xmm2,32(%rdi)
824	movdqu	96(%rsi),%xmm2
825	movdqu	%xmm7,48(%rdi)
826	movdqu	112(%rsi),%xmm7
827	leaq	128(%rsi),%rsi
828	pxor	48(%rsp),%xmm6
829	pxor	%xmm15,%xmm11
830	pxor	%xmm9,%xmm2
831	pxor	%xmm3,%xmm7
832	movdqu	%xmm6,64(%rdi)
833	movdqu	%xmm11,80(%rdi)
834	movdqu	%xmm2,96(%rdi)
835	movdqu	%xmm7,112(%rdi)
836	leaq	128(%rdi),%rdi
837
838	subq	$256,%rdx
839	jnz	L$oop_outer4x
840
841	jmp	L$done4x
842
// Fewer than 256 bytes remain. Dispatch on how many whole 64-byte blocks can
// still be handled with vector code. The flags of the compare that takes the
// branch are reused by the je in the selected handler.
843L$tail4x:
844	cmpq	$192,%rdx
845	jae	L$192_or_more4x
846	cmpq	$128,%rdx
847	jae	L$128_or_more4x
848	cmpq	$64,%rdx
849	jae	L$64_or_more4x
850
851
// Under 64 bytes: complete block 0 in 0..63(%rsp) for the bytewise loop.
// 0..15(%rsp) already holds its first 16 bytes. %r10 becomes the byte index.
852	xorq	%r10,%r10
853
854	movdqa	%xmm12,16(%rsp)
855	movdqa	%xmm4,32(%rsp)
856	movdqa	%xmm0,48(%rsp)
857	jmp	L$oop_tail4x
858
859.p2align	5
// 64..127 bytes: emit block 0 with vector code.
860L$64_or_more4x:
861	movdqu	0(%rsi),%xmm6
862	movdqu	16(%rsi),%xmm11
863	movdqu	32(%rsi),%xmm2
864	movdqu	48(%rsi),%xmm7
865	pxor	0(%rsp),%xmm6
866	pxor	%xmm12,%xmm11
867	pxor	%xmm4,%xmm2
868	pxor	%xmm0,%xmm7
869	movdqu	%xmm6,0(%rdi)
870	movdqu	%xmm11,16(%rdi)
871	movdqu	%xmm2,32(%rdi)
872	movdqu	%xmm7,48(%rdi)
// Flags still come from cmpq $64,%rdx in L$tail4x (none of the instructions
// in between write flags): exactly 64 bytes means nothing is left.
873	je	L$done4x
874
// Stage block 1 in 0..63(%rsp) for the bytewise loop and advance past the
// 64 bytes just processed.
875	movdqa	16(%rsp),%xmm6
876	leaq	64(%rsi),%rsi
877	xorq	%r10,%r10
878	movdqa	%xmm6,0(%rsp)
879	movdqa	%xmm13,16(%rsp)
880	leaq	64(%rdi),%rdi
881	movdqa	%xmm5,32(%rsp)
882	subq	$64,%rdx
883	movdqa	%xmm1,48(%rsp)
884	jmp	L$oop_tail4x
885
886.p2align	5
// 128..191 bytes: emit blocks 0 and 1 with vector code.
887L$128_or_more4x:
888	movdqu	0(%rsi),%xmm6
889	movdqu	16(%rsi),%xmm11
890	movdqu	32(%rsi),%xmm2
891	movdqu	48(%rsi),%xmm7
892	pxor	0(%rsp),%xmm6
893	pxor	%xmm12,%xmm11
894	pxor	%xmm4,%xmm2
895	pxor	%xmm0,%xmm7
896
897	movdqu	%xmm6,0(%rdi)
898	movdqu	64(%rsi),%xmm6
899	movdqu	%xmm11,16(%rdi)
900	movdqu	80(%rsi),%xmm11
901	movdqu	%xmm2,32(%rdi)
902	movdqu	96(%rsi),%xmm2
903	movdqu	%xmm7,48(%rdi)
904	movdqu	112(%rsi),%xmm7
905	pxor	16(%rsp),%xmm6
906	pxor	%xmm13,%xmm11
907	pxor	%xmm5,%xmm2
908	pxor	%xmm1,%xmm7
909	movdqu	%xmm6,64(%rdi)
910	movdqu	%xmm11,80(%rdi)
911	movdqu	%xmm2,96(%rdi)
912	movdqu	%xmm7,112(%rdi)
// Flags still come from cmpq $128,%rdx in L$tail4x.
913	je	L$done4x
914
// Stage block 2 in 0..63(%rsp) for the bytewise loop.
915	movdqa	32(%rsp),%xmm6
916	leaq	128(%rsi),%rsi
917	xorq	%r10,%r10
918	movdqa	%xmm6,0(%rsp)
919	movdqa	%xmm10,16(%rsp)
920	leaq	128(%rdi),%rdi
921	movdqa	%xmm14,32(%rsp)
922	subq	$128,%rdx
923	movdqa	%xmm8,48(%rsp)
924	jmp	L$oop_tail4x
925
926.p2align	5
// 192..255 bytes: emit blocks 0, 1 and 2 with vector code.
927L$192_or_more4x:
928	movdqu	0(%rsi),%xmm6
929	movdqu	16(%rsi),%xmm11
930	movdqu	32(%rsi),%xmm2
931	movdqu	48(%rsi),%xmm7
932	pxor	0(%rsp),%xmm6
933	pxor	%xmm12,%xmm11
934	pxor	%xmm4,%xmm2
935	pxor	%xmm0,%xmm7
936
937	movdqu	%xmm6,0(%rdi)
938	movdqu	64(%rsi),%xmm6
939	movdqu	%xmm11,16(%rdi)
940	movdqu	80(%rsi),%xmm11
941	movdqu	%xmm2,32(%rdi)
942	movdqu	96(%rsi),%xmm2
943	movdqu	%xmm7,48(%rdi)
944	movdqu	112(%rsi),%xmm7
945	leaq	128(%rsi),%rsi
946	pxor	16(%rsp),%xmm6
947	pxor	%xmm13,%xmm11
948	pxor	%xmm5,%xmm2
949	pxor	%xmm1,%xmm7
950
951	movdqu	%xmm6,64(%rdi)
952	movdqu	0(%rsi),%xmm6
953	movdqu	%xmm11,80(%rdi)
954	movdqu	16(%rsi),%xmm11
955	movdqu	%xmm2,96(%rdi)
956	movdqu	32(%rsi),%xmm2
957	movdqu	%xmm7,112(%rdi)
958	leaq	128(%rdi),%rdi
959	movdqu	48(%rsi),%xmm7
960	pxor	32(%rsp),%xmm6
961	pxor	%xmm10,%xmm11
962	pxor	%xmm14,%xmm2
963	pxor	%xmm8,%xmm7
964	movdqu	%xmm6,0(%rdi)
965	movdqu	%xmm11,16(%rdi)
966	movdqu	%xmm2,32(%rdi)
967	movdqu	%xmm7,48(%rdi)
// Flags still come from cmpq $192,%rdx in L$tail4x; the leaq instructions
// above do not write flags.
968	je	L$done4x
969
// Stage block 3 in 0..63(%rsp). Both pointers were already advanced by 128
// above, so only 64 more is added here.
970	movdqa	48(%rsp),%xmm6
971	leaq	64(%rsi),%rsi
972	xorq	%r10,%r10
973	movdqa	%xmm6,0(%rsp)
974	movdqa	%xmm15,16(%rsp)
975	leaq	64(%rdi),%rdi
976	movdqa	%xmm9,32(%rsp)
977	subq	$192,%rdx
978	movdqa	%xmm3,48(%rsp)
979
// Bytewise XOR of the last %rdx bytes against the keystream staged at
// 0..63(%rsp). The index is incremented before the store, hence the -1
// displacement.
980L$oop_tail4x:
981	movzbl	(%rsi,%r10,1),%eax
982	movzbl	(%rsp,%r10,1),%ecx
983	leaq	1(%r10),%r10
984	xorl	%ecx,%eax
985	movb	%al,-1(%rdi,%r10,1)
986	decq	%rdx
987	jnz	L$oop_tail4x
988
// Release the frame by restoring the entry stack pointer saved in %r9.
989L$done4x:
990	leaq	(%r9),%rsp
991
992L$4x_epilogue:
993	ret
994
995
996.globl	_ChaCha20_ctr32_avx2
997.private_extern _ChaCha20_ctr32_avx2
998
999.p2align	5
1000_ChaCha20_ctr32_avx2:
1001
1002_CET_ENDBR
1003	movq	%rsp,%r9
1004
1005	subq	$0x280+8,%rsp
1006	andq	$-32,%rsp
1007	vzeroupper
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018	vbroadcasti128	L$sigma(%rip),%ymm11
1019	vbroadcasti128	(%rcx),%ymm3
1020	vbroadcasti128	16(%rcx),%ymm15
1021	vbroadcasti128	(%r8),%ymm7
1022	leaq	256(%rsp),%rcx
1023	leaq	512(%rsp),%rax
1024	leaq	L$rot16(%rip),%r10
1025	leaq	L$rot24(%rip),%r11
1026
1027	vpshufd	$0x00,%ymm11,%ymm8
1028	vpshufd	$0x55,%ymm11,%ymm9
1029	vmovdqa	%ymm8,128-256(%rcx)
1030	vpshufd	$0xaa,%ymm11,%ymm10
1031	vmovdqa	%ymm9,160-256(%rcx)
1032	vpshufd	$0xff,%ymm11,%ymm11
1033	vmovdqa	%ymm10,192-256(%rcx)
1034	vmovdqa	%ymm11,224-256(%rcx)
1035
1036	vpshufd	$0x00,%ymm3,%ymm0
1037	vpshufd	$0x55,%ymm3,%ymm1
1038	vmovdqa	%ymm0,256-256(%rcx)
1039	vpshufd	$0xaa,%ymm3,%ymm2
1040	vmovdqa	%ymm1,288-256(%rcx)
1041	vpshufd	$0xff,%ymm3,%ymm3
1042	vmovdqa	%ymm2,320-256(%rcx)
1043	vmovdqa	%ymm3,352-256(%rcx)
1044
1045	vpshufd	$0x00,%ymm15,%ymm12
1046	vpshufd	$0x55,%ymm15,%ymm13
1047	vmovdqa	%ymm12,384-512(%rax)
1048	vpshufd	$0xaa,%ymm15,%ymm14
1049	vmovdqa	%ymm13,416-512(%rax)
1050	vpshufd	$0xff,%ymm15,%ymm15
1051	vmovdqa	%ymm14,448-512(%rax)
1052	vmovdqa	%ymm15,480-512(%rax)
1053
1054	vpshufd	$0x00,%ymm7,%ymm4
1055	vpshufd	$0x55,%ymm7,%ymm5
1056	vpaddd	L$incy(%rip),%ymm4,%ymm4
1057	vpshufd	$0xaa,%ymm7,%ymm6
1058	vmovdqa	%ymm5,544-512(%rax)
1059	vpshufd	$0xff,%ymm7,%ymm7
1060	vmovdqa	%ymm6,576-512(%rax)
1061	vmovdqa	%ymm7,608-512(%rax)
1062
1063	jmp	L$oop_enter8x
1064
1065.p2align	5
1066L$oop_outer8x:
1067	vmovdqa	128-256(%rcx),%ymm8
1068	vmovdqa	160-256(%rcx),%ymm9
1069	vmovdqa	192-256(%rcx),%ymm10
1070	vmovdqa	224-256(%rcx),%ymm11
1071	vmovdqa	256-256(%rcx),%ymm0
1072	vmovdqa	288-256(%rcx),%ymm1
1073	vmovdqa	320-256(%rcx),%ymm2
1074	vmovdqa	352-256(%rcx),%ymm3
1075	vmovdqa	384-512(%rax),%ymm12
1076	vmovdqa	416-512(%rax),%ymm13
1077	vmovdqa	448-512(%rax),%ymm14
1078	vmovdqa	480-512(%rax),%ymm15
1079	vmovdqa	512-512(%rax),%ymm4
1080	vmovdqa	544-512(%rax),%ymm5
1081	vmovdqa	576-512(%rax),%ymm6
1082	vmovdqa	608-512(%rax),%ymm7
1083	vpaddd	L$eight(%rip),%ymm4,%ymm4
1084
1085L$oop_enter8x:
1086	vmovdqa	%ymm14,64(%rsp)
1087	vmovdqa	%ymm15,96(%rsp)
1088	vbroadcasti128	(%r10),%ymm15
1089	vmovdqa	%ymm4,512-512(%rax)
1090	movl	$10,%eax
1091	jmp	L$oop8x
1092
1093.p2align	5
1094L$oop8x:
1095	vpaddd	%ymm0,%ymm8,%ymm8
1096	vpxor	%ymm4,%ymm8,%ymm4
1097	vpshufb	%ymm15,%ymm4,%ymm4
1098	vpaddd	%ymm1,%ymm9,%ymm9
1099	vpxor	%ymm5,%ymm9,%ymm5
1100	vpshufb	%ymm15,%ymm5,%ymm5
1101	vpaddd	%ymm4,%ymm12,%ymm12
1102	vpxor	%ymm0,%ymm12,%ymm0
1103	vpslld	$12,%ymm0,%ymm14
1104	vpsrld	$20,%ymm0,%ymm0
1105	vpor	%ymm0,%ymm14,%ymm0
1106	vbroadcasti128	(%r11),%ymm14
1107	vpaddd	%ymm5,%ymm13,%ymm13
1108	vpxor	%ymm1,%ymm13,%ymm1
1109	vpslld	$12,%ymm1,%ymm15
1110	vpsrld	$20,%ymm1,%ymm1
1111	vpor	%ymm1,%ymm15,%ymm1
1112	vpaddd	%ymm0,%ymm8,%ymm8
1113	vpxor	%ymm4,%ymm8,%ymm4
1114	vpshufb	%ymm14,%ymm4,%ymm4
1115	vpaddd	%ymm1,%ymm9,%ymm9
1116	vpxor	%ymm5,%ymm9,%ymm5
1117	vpshufb	%ymm14,%ymm5,%ymm5
1118	vpaddd	%ymm4,%ymm12,%ymm12
1119	vpxor	%ymm0,%ymm12,%ymm0
1120	vpslld	$7,%ymm0,%ymm15
1121	vpsrld	$25,%ymm0,%ymm0
1122	vpor	%ymm0,%ymm15,%ymm0
1123	vbroadcasti128	(%r10),%ymm15
1124	vpaddd	%ymm5,%ymm13,%ymm13
1125	vpxor	%ymm1,%ymm13,%ymm1
1126	vpslld	$7,%ymm1,%ymm14
1127	vpsrld	$25,%ymm1,%ymm1
1128	vpor	%ymm1,%ymm14,%ymm1
1129	vmovdqa	%ymm12,0(%rsp)
1130	vmovdqa	%ymm13,32(%rsp)
1131	vmovdqa	64(%rsp),%ymm12
1132	vmovdqa	96(%rsp),%ymm13
1133	vpaddd	%ymm2,%ymm10,%ymm10
1134	vpxor	%ymm6,%ymm10,%ymm6
1135	vpshufb	%ymm15,%ymm6,%ymm6
1136	vpaddd	%ymm3,%ymm11,%ymm11
1137	vpxor	%ymm7,%ymm11,%ymm7
1138	vpshufb	%ymm15,%ymm7,%ymm7
1139	vpaddd	%ymm6,%ymm12,%ymm12
1140	vpxor	%ymm2,%ymm12,%ymm2
1141	vpslld	$12,%ymm2,%ymm14
1142	vpsrld	$20,%ymm2,%ymm2
1143	vpor	%ymm2,%ymm14,%ymm2
1144	vbroadcasti128	(%r11),%ymm14
1145	vpaddd	%ymm7,%ymm13,%ymm13
1146	vpxor	%ymm3,%ymm13,%ymm3
1147	vpslld	$12,%ymm3,%ymm15
1148	vpsrld	$20,%ymm3,%ymm3
1149	vpor	%ymm3,%ymm15,%ymm3
1150	vpaddd	%ymm2,%ymm10,%ymm10
1151	vpxor	%ymm6,%ymm10,%ymm6
1152	vpshufb	%ymm14,%ymm6,%ymm6
1153	vpaddd	%ymm3,%ymm11,%ymm11
1154	vpxor	%ymm7,%ymm11,%ymm7
1155	vpshufb	%ymm14,%ymm7,%ymm7
1156	vpaddd	%ymm6,%ymm12,%ymm12
1157	vpxor	%ymm2,%ymm12,%ymm2
1158	vpslld	$7,%ymm2,%ymm15
1159	vpsrld	$25,%ymm2,%ymm2
1160	vpor	%ymm2,%ymm15,%ymm2
1161	vbroadcasti128	(%r10),%ymm15
1162	vpaddd	%ymm7,%ymm13,%ymm13
1163	vpxor	%ymm3,%ymm13,%ymm3
1164	vpslld	$7,%ymm3,%ymm14
1165	vpsrld	$25,%ymm3,%ymm3
1166	vpor	%ymm3,%ymm14,%ymm3
1167	vpaddd	%ymm1,%ymm8,%ymm8
1168	vpxor	%ymm7,%ymm8,%ymm7
1169	vpshufb	%ymm15,%ymm7,%ymm7
1170	vpaddd	%ymm2,%ymm9,%ymm9
1171	vpxor	%ymm4,%ymm9,%ymm4
1172	vpshufb	%ymm15,%ymm4,%ymm4
1173	vpaddd	%ymm7,%ymm12,%ymm12
1174	vpxor	%ymm1,%ymm12,%ymm1
1175	vpslld	$12,%ymm1,%ymm14
1176	vpsrld	$20,%ymm1,%ymm1
1177	vpor	%ymm1,%ymm14,%ymm1
1178	vbroadcasti128	(%r11),%ymm14
1179	vpaddd	%ymm4,%ymm13,%ymm13
1180	vpxor	%ymm2,%ymm13,%ymm2
1181	vpslld	$12,%ymm2,%ymm15
1182	vpsrld	$20,%ymm2,%ymm2
1183	vpor	%ymm2,%ymm15,%ymm2
1184	vpaddd	%ymm1,%ymm8,%ymm8
1185	vpxor	%ymm7,%ymm8,%ymm7
1186	vpshufb	%ymm14,%ymm7,%ymm7
1187	vpaddd	%ymm2,%ymm9,%ymm9
1188	vpxor	%ymm4,%ymm9,%ymm4
1189	vpshufb	%ymm14,%ymm4,%ymm4
1190	vpaddd	%ymm7,%ymm12,%ymm12
1191	vpxor	%ymm1,%ymm12,%ymm1
1192	vpslld	$7,%ymm1,%ymm15
1193	vpsrld	$25,%ymm1,%ymm1
1194	vpor	%ymm1,%ymm15,%ymm1
1195	vbroadcasti128	(%r10),%ymm15
1196	vpaddd	%ymm4,%ymm13,%ymm13
1197	vpxor	%ymm2,%ymm13,%ymm2
1198	vpslld	$7,%ymm2,%ymm14
1199	vpsrld	$25,%ymm2,%ymm2
1200	vpor	%ymm2,%ymm14,%ymm2
1201	vmovdqa	%ymm12,64(%rsp)
1202	vmovdqa	%ymm13,96(%rsp)
1203	vmovdqa	0(%rsp),%ymm12
1204	vmovdqa	32(%rsp),%ymm13
1205	vpaddd	%ymm3,%ymm10,%ymm10
1206	vpxor	%ymm5,%ymm10,%ymm5
1207	vpshufb	%ymm15,%ymm5,%ymm5
1208	vpaddd	%ymm0,%ymm11,%ymm11
1209	vpxor	%ymm6,%ymm11,%ymm6
1210	vpshufb	%ymm15,%ymm6,%ymm6
1211	vpaddd	%ymm5,%ymm12,%ymm12
1212	vpxor	%ymm3,%ymm12,%ymm3
1213	vpslld	$12,%ymm3,%ymm14
1214	vpsrld	$20,%ymm3,%ymm3
1215	vpor	%ymm3,%ymm14,%ymm3
1216	vbroadcasti128	(%r11),%ymm14
1217	vpaddd	%ymm6,%ymm13,%ymm13
1218	vpxor	%ymm0,%ymm13,%ymm0
1219	vpslld	$12,%ymm0,%ymm15
1220	vpsrld	$20,%ymm0,%ymm0
1221	vpor	%ymm0,%ymm15,%ymm0
1222	vpaddd	%ymm3,%ymm10,%ymm10
1223	vpxor	%ymm5,%ymm10,%ymm5
1224	vpshufb	%ymm14,%ymm5,%ymm5
1225	vpaddd	%ymm0,%ymm11,%ymm11
1226	vpxor	%ymm6,%ymm11,%ymm6
1227	vpshufb	%ymm14,%ymm6,%ymm6
1228	vpaddd	%ymm5,%ymm12,%ymm12
1229	vpxor	%ymm3,%ymm12,%ymm3
1230	vpslld	$7,%ymm3,%ymm15
1231	vpsrld	$25,%ymm3,%ymm3
1232	vpor	%ymm3,%ymm15,%ymm3
1233	vbroadcasti128	(%r10),%ymm15
1234	vpaddd	%ymm6,%ymm13,%ymm13
1235	vpxor	%ymm0,%ymm13,%ymm0
1236	vpslld	$7,%ymm0,%ymm14
1237	vpsrld	$25,%ymm0,%ymm0
1238	vpor	%ymm0,%ymm14,%ymm0
1239	decl	%eax
1240	jnz	L$oop8x
1241
1242	leaq	512(%rsp),%rax
1243	vpaddd	128-256(%rcx),%ymm8,%ymm8
1244	vpaddd	160-256(%rcx),%ymm9,%ymm9
1245	vpaddd	192-256(%rcx),%ymm10,%ymm10
1246	vpaddd	224-256(%rcx),%ymm11,%ymm11
1247
1248	vpunpckldq	%ymm9,%ymm8,%ymm14
1249	vpunpckldq	%ymm11,%ymm10,%ymm15
1250	vpunpckhdq	%ymm9,%ymm8,%ymm8
1251	vpunpckhdq	%ymm11,%ymm10,%ymm10
1252	vpunpcklqdq	%ymm15,%ymm14,%ymm9
1253	vpunpckhqdq	%ymm15,%ymm14,%ymm14
1254	vpunpcklqdq	%ymm10,%ymm8,%ymm11
1255	vpunpckhqdq	%ymm10,%ymm8,%ymm8
1256	vpaddd	256-256(%rcx),%ymm0,%ymm0
1257	vpaddd	288-256(%rcx),%ymm1,%ymm1
1258	vpaddd	320-256(%rcx),%ymm2,%ymm2
1259	vpaddd	352-256(%rcx),%ymm3,%ymm3
1260
1261	vpunpckldq	%ymm1,%ymm0,%ymm10
1262	vpunpckldq	%ymm3,%ymm2,%ymm15
1263	vpunpckhdq	%ymm1,%ymm0,%ymm0
1264	vpunpckhdq	%ymm3,%ymm2,%ymm2
1265	vpunpcklqdq	%ymm15,%ymm10,%ymm1
1266	vpunpckhqdq	%ymm15,%ymm10,%ymm10
1267	vpunpcklqdq	%ymm2,%ymm0,%ymm3
1268	vpunpckhqdq	%ymm2,%ymm0,%ymm0
1269	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
1270	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
1271	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
1272	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
1273	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
1274	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
1275	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
1276	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
1277	vmovdqa	%ymm15,0(%rsp)
1278	vmovdqa	%ymm9,32(%rsp)
1279	vmovdqa	64(%rsp),%ymm15
1280	vmovdqa	96(%rsp),%ymm9
1281
1282	vpaddd	384-512(%rax),%ymm12,%ymm12
1283	vpaddd	416-512(%rax),%ymm13,%ymm13
1284	vpaddd	448-512(%rax),%ymm15,%ymm15
1285	vpaddd	480-512(%rax),%ymm9,%ymm9
1286
1287	vpunpckldq	%ymm13,%ymm12,%ymm2
1288	vpunpckldq	%ymm9,%ymm15,%ymm8
1289	vpunpckhdq	%ymm13,%ymm12,%ymm12
1290	vpunpckhdq	%ymm9,%ymm15,%ymm15
1291	vpunpcklqdq	%ymm8,%ymm2,%ymm13
1292	vpunpckhqdq	%ymm8,%ymm2,%ymm2
1293	vpunpcklqdq	%ymm15,%ymm12,%ymm9
1294	vpunpckhqdq	%ymm15,%ymm12,%ymm12
1295	vpaddd	512-512(%rax),%ymm4,%ymm4
1296	vpaddd	544-512(%rax),%ymm5,%ymm5
1297	vpaddd	576-512(%rax),%ymm6,%ymm6
1298	vpaddd	608-512(%rax),%ymm7,%ymm7
1299
1300	vpunpckldq	%ymm5,%ymm4,%ymm15
1301	vpunpckldq	%ymm7,%ymm6,%ymm8
1302	vpunpckhdq	%ymm5,%ymm4,%ymm4
1303	vpunpckhdq	%ymm7,%ymm6,%ymm6
1304	vpunpcklqdq	%ymm8,%ymm15,%ymm5
1305	vpunpckhqdq	%ymm8,%ymm15,%ymm15
1306	vpunpcklqdq	%ymm6,%ymm4,%ymm7
1307	vpunpckhqdq	%ymm6,%ymm4,%ymm4
1308	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
1309	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
1310	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
1311	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
1312	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
1313	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
1314	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
1315	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
1316	vmovdqa	0(%rsp),%ymm6
1317	vmovdqa	32(%rsp),%ymm12
1318
1319	cmpq	$512,%rdx
1320	jb	L$tail8x
1321
1322	vpxor	0(%rsi),%ymm6,%ymm6
1323	vpxor	32(%rsi),%ymm8,%ymm8
1324	vpxor	64(%rsi),%ymm1,%ymm1
1325	vpxor	96(%rsi),%ymm5,%ymm5
1326	leaq	128(%rsi),%rsi
1327	vmovdqu	%ymm6,0(%rdi)
1328	vmovdqu	%ymm8,32(%rdi)
1329	vmovdqu	%ymm1,64(%rdi)
1330	vmovdqu	%ymm5,96(%rdi)
1331	leaq	128(%rdi),%rdi
1332
1333	vpxor	0(%rsi),%ymm12,%ymm12
1334	vpxor	32(%rsi),%ymm13,%ymm13
1335	vpxor	64(%rsi),%ymm10,%ymm10
1336	vpxor	96(%rsi),%ymm15,%ymm15
1337	leaq	128(%rsi),%rsi
1338	vmovdqu	%ymm12,0(%rdi)
1339	vmovdqu	%ymm13,32(%rdi)
1340	vmovdqu	%ymm10,64(%rdi)
1341	vmovdqu	%ymm15,96(%rdi)
1342	leaq	128(%rdi),%rdi
1343
1344	vpxor	0(%rsi),%ymm14,%ymm14
1345	vpxor	32(%rsi),%ymm2,%ymm2
1346	vpxor	64(%rsi),%ymm3,%ymm3
1347	vpxor	96(%rsi),%ymm7,%ymm7
1348	leaq	128(%rsi),%rsi
1349	vmovdqu	%ymm14,0(%rdi)
1350	vmovdqu	%ymm2,32(%rdi)
1351	vmovdqu	%ymm3,64(%rdi)
1352	vmovdqu	%ymm7,96(%rdi)
1353	leaq	128(%rdi),%rdi
1354
1355	vpxor	0(%rsi),%ymm11,%ymm11
1356	vpxor	32(%rsi),%ymm9,%ymm9
1357	vpxor	64(%rsi),%ymm0,%ymm0
1358	vpxor	96(%rsi),%ymm4,%ymm4
1359	leaq	128(%rsi),%rsi
1360	vmovdqu	%ymm11,0(%rdi)
1361	vmovdqu	%ymm9,32(%rdi)
1362	vmovdqu	%ymm0,64(%rdi)
1363	vmovdqu	%ymm4,96(%rdi)
1364	leaq	128(%rdi),%rdi
1365
1366	subq	$512,%rdx
1367	jnz	L$oop_outer8x
1368
1369	jmp	L$done8x
1370
// L$tail8x: dispatch for a final partial chunk. In the code visible in this
// chunk it is entered via the jb that follows cmpq $512,%rdx, i.e. with
// fewer than 512 bytes left in %rdx. The remaining length is compared
// (unsigned) against each multiple of 64, largest first, and control goes to
// the handler that processes that many whole 64-byte pieces. The flags of
// the cmpq that matched are still live when the handler is entered; each
// handler relies on them for its own je.
1371L$tail8x:
1372	cmpq	$448,%rdx
1373	jae	L$448_or_more8x
1374	cmpq	$384,%rdx
1375	jae	L$384_or_more8x
1376	cmpq	$320,%rdx
1377	jae	L$320_or_more8x
1378	cmpq	$256,%rdx
1379	jae	L$256_or_more8x
1380	cmpq	$192,%rdx
1381	jae	L$192_or_more8x
1382	cmpq	$128,%rdx
1383	jae	L$128_or_more8x
1384	cmpq	$64,%rdx
1385	jae	L$64_or_more8x
1386
// Fewer than 64 bytes remain: zero the byte index %r10 and spill %ymm6 and
// %ymm8 to 0(%rsp) and 32(%rsp) so that L$oop_tail8x can read them one byte
// at a time. vmovdqa needs a 32-byte-aligned address; the alignment of %rsp
// is established outside this view.
1387	xorq	%r10,%r10
1388	vmovdqa	%ymm6,0(%rsp)
1389	vmovdqa	%ymm8,32(%rsp)
1390	jmp	L$oop_tail8x
1391
1392.p2align	5
// L$64_or_more8x: reached from L$tail8x when 64 <= %rdx < 128.
// XORs the first 64 input bytes at (%rsi) with %ymm6/%ymm8 and stores the
// result at (%rdi).
1393L$64_or_more8x:
1394	vpxor	0(%rsi),%ymm6,%ymm6
1395	vpxor	32(%rsi),%ymm8,%ymm8
1396	vmovdqu	%ymm6,0(%rdi)
1397	vmovdqu	%ymm8,32(%rdi)
// The flags still come from cmpq $64,%rdx in L$tail8x (vpxor and vmovdqu do
// not modify flags), so je is taken when exactly 64 bytes remained.
1398	je	L$done8x
1399
// Otherwise advance both pointers by 64, reduce the length by 64, reset the
// byte index %r10 and spill the next 64 bytes (%ymm1, %ymm5) to the stack
// for the byte-wise loop.
1400	leaq	64(%rsi),%rsi
1401	xorq	%r10,%r10
1402	vmovdqa	%ymm1,0(%rsp)
1403	leaq	64(%rdi),%rdi
1404	subq	$64,%rdx
1405	vmovdqa	%ymm5,32(%rsp)
1406	jmp	L$oop_tail8x
1407
1408.p2align	5
// L$128_or_more8x: reached from L$tail8x when 128 <= %rdx < 192.
// XORs the first 128 input bytes at (%rsi) with %ymm6, %ymm8, %ymm1, %ymm5
// and stores the result at (%rdi).
1409L$128_or_more8x:
1410	vpxor	0(%rsi),%ymm6,%ymm6
1411	vpxor	32(%rsi),%ymm8,%ymm8
1412	vpxor	64(%rsi),%ymm1,%ymm1
1413	vpxor	96(%rsi),%ymm5,%ymm5
1414	vmovdqu	%ymm6,0(%rdi)
1415	vmovdqu	%ymm8,32(%rdi)
1416	vmovdqu	%ymm1,64(%rdi)
1417	vmovdqu	%ymm5,96(%rdi)
// The flags still come from cmpq $128,%rdx in L$tail8x (the vector
// instructions above do not modify flags): je means exactly 128 bytes.
1418	je	L$done8x
1419
// Otherwise skip the 128 bytes just processed, reset the byte index %r10 and
// spill the next 64 bytes (%ymm12, %ymm13) to the stack for the byte loop.
1420	leaq	128(%rsi),%rsi
1421	xorq	%r10,%r10
1422	vmovdqa	%ymm12,0(%rsp)
1423	leaq	128(%rdi),%rdi
1424	subq	$128,%rdx
1425	vmovdqa	%ymm13,32(%rsp)
1426	jmp	L$oop_tail8x
1427
1428.p2align	5
// L$192_or_more8x: reached from L$tail8x when 192 <= %rdx < 256.
// XORs the first 192 input bytes at (%rsi) with six ymm registers, in the
// same register order as the full-block path, and stores them at (%rdi).
1429L$192_or_more8x:
1430	vpxor	0(%rsi),%ymm6,%ymm6
1431	vpxor	32(%rsi),%ymm8,%ymm8
1432	vpxor	64(%rsi),%ymm1,%ymm1
1433	vpxor	96(%rsi),%ymm5,%ymm5
1434	vpxor	128(%rsi),%ymm12,%ymm12
1435	vpxor	160(%rsi),%ymm13,%ymm13
1436	vmovdqu	%ymm6,0(%rdi)
1437	vmovdqu	%ymm8,32(%rdi)
1438	vmovdqu	%ymm1,64(%rdi)
1439	vmovdqu	%ymm5,96(%rdi)
1440	vmovdqu	%ymm12,128(%rdi)
1441	vmovdqu	%ymm13,160(%rdi)
// The flags still come from cmpq $192,%rdx in L$tail8x (the vector
// instructions above do not modify flags): je means exactly 192 bytes.
1442	je	L$done8x
1443
// Otherwise skip the 192 bytes just processed, reset the byte index %r10 and
// spill the next 64 bytes (%ymm10, %ymm15) to the stack for the byte loop.
1444	leaq	192(%rsi),%rsi
1445	xorq	%r10,%r10
1446	vmovdqa	%ymm10,0(%rsp)
1447	leaq	192(%rdi),%rdi
1448	subq	$192,%rdx
1449	vmovdqa	%ymm15,32(%rsp)
1450	jmp	L$oop_tail8x
1451
1452.p2align	5
// L$256_or_more8x: reached from L$tail8x when 256 <= %rdx < 320.
// XORs the first 256 input bytes at (%rsi) with eight ymm registers, in the
// same register order as the full-block path, and stores them at (%rdi).
1453L$256_or_more8x:
1454	vpxor	0(%rsi),%ymm6,%ymm6
1455	vpxor	32(%rsi),%ymm8,%ymm8
1456	vpxor	64(%rsi),%ymm1,%ymm1
1457	vpxor	96(%rsi),%ymm5,%ymm5
1458	vpxor	128(%rsi),%ymm12,%ymm12
1459	vpxor	160(%rsi),%ymm13,%ymm13
1460	vpxor	192(%rsi),%ymm10,%ymm10
1461	vpxor	224(%rsi),%ymm15,%ymm15
1462	vmovdqu	%ymm6,0(%rdi)
1463	vmovdqu	%ymm8,32(%rdi)
1464	vmovdqu	%ymm1,64(%rdi)
1465	vmovdqu	%ymm5,96(%rdi)
1466	vmovdqu	%ymm12,128(%rdi)
1467	vmovdqu	%ymm13,160(%rdi)
1468	vmovdqu	%ymm10,192(%rdi)
1469	vmovdqu	%ymm15,224(%rdi)
// The flags still come from cmpq $256,%rdx in L$tail8x (the vector
// instructions above do not modify flags): je means exactly 256 bytes.
1470	je	L$done8x
1471
// Otherwise skip the 256 bytes just processed, reset the byte index %r10 and
// spill the next 64 bytes (%ymm14, %ymm2) to the stack for the byte loop.
1472	leaq	256(%rsi),%rsi
1473	xorq	%r10,%r10
1474	vmovdqa	%ymm14,0(%rsp)
1475	leaq	256(%rdi),%rdi
1476	subq	$256,%rdx
1477	vmovdqa	%ymm2,32(%rsp)
1478	jmp	L$oop_tail8x
1479
1480.p2align	5
// L$320_or_more8x: reached from L$tail8x when 320 <= %rdx < 384.
// XORs the first 320 input bytes at (%rsi) with ten ymm registers, in the
// same register order as the full-block path, and stores them at (%rdi).
1481L$320_or_more8x:
1482	vpxor	0(%rsi),%ymm6,%ymm6
1483	vpxor	32(%rsi),%ymm8,%ymm8
1484	vpxor	64(%rsi),%ymm1,%ymm1
1485	vpxor	96(%rsi),%ymm5,%ymm5
1486	vpxor	128(%rsi),%ymm12,%ymm12
1487	vpxor	160(%rsi),%ymm13,%ymm13
1488	vpxor	192(%rsi),%ymm10,%ymm10
1489	vpxor	224(%rsi),%ymm15,%ymm15
1490	vpxor	256(%rsi),%ymm14,%ymm14
1491	vpxor	288(%rsi),%ymm2,%ymm2
1492	vmovdqu	%ymm6,0(%rdi)
1493	vmovdqu	%ymm8,32(%rdi)
1494	vmovdqu	%ymm1,64(%rdi)
1495	vmovdqu	%ymm5,96(%rdi)
1496	vmovdqu	%ymm12,128(%rdi)
1497	vmovdqu	%ymm13,160(%rdi)
1498	vmovdqu	%ymm10,192(%rdi)
1499	vmovdqu	%ymm15,224(%rdi)
1500	vmovdqu	%ymm14,256(%rdi)
1501	vmovdqu	%ymm2,288(%rdi)
// The flags still come from cmpq $320,%rdx in L$tail8x (the vector
// instructions above do not modify flags): je means exactly 320 bytes.
1502	je	L$done8x
1503
// Otherwise skip the 320 bytes just processed, reset the byte index %r10 and
// spill the next 64 bytes (%ymm3, %ymm7) to the stack for the byte loop.
1504	leaq	320(%rsi),%rsi
1505	xorq	%r10,%r10
1506	vmovdqa	%ymm3,0(%rsp)
1507	leaq	320(%rdi),%rdi
1508	subq	$320,%rdx
1509	vmovdqa	%ymm7,32(%rsp)
1510	jmp	L$oop_tail8x
1511
1512.p2align	5
// L$384_or_more8x: reached from L$tail8x when 384 <= %rdx < 448.
// XORs the first 384 input bytes at (%rsi) with twelve ymm registers, in the
// same register order as the full-block path, and stores them at (%rdi).
1513L$384_or_more8x:
1514	vpxor	0(%rsi),%ymm6,%ymm6
1515	vpxor	32(%rsi),%ymm8,%ymm8
1516	vpxor	64(%rsi),%ymm1,%ymm1
1517	vpxor	96(%rsi),%ymm5,%ymm5
1518	vpxor	128(%rsi),%ymm12,%ymm12
1519	vpxor	160(%rsi),%ymm13,%ymm13
1520	vpxor	192(%rsi),%ymm10,%ymm10
1521	vpxor	224(%rsi),%ymm15,%ymm15
1522	vpxor	256(%rsi),%ymm14,%ymm14
1523	vpxor	288(%rsi),%ymm2,%ymm2
1524	vpxor	320(%rsi),%ymm3,%ymm3
1525	vpxor	352(%rsi),%ymm7,%ymm7
1526	vmovdqu	%ymm6,0(%rdi)
1527	vmovdqu	%ymm8,32(%rdi)
1528	vmovdqu	%ymm1,64(%rdi)
1529	vmovdqu	%ymm5,96(%rdi)
1530	vmovdqu	%ymm12,128(%rdi)
1531	vmovdqu	%ymm13,160(%rdi)
1532	vmovdqu	%ymm10,192(%rdi)
1533	vmovdqu	%ymm15,224(%rdi)
1534	vmovdqu	%ymm14,256(%rdi)
1535	vmovdqu	%ymm2,288(%rdi)
1536	vmovdqu	%ymm3,320(%rdi)
1537	vmovdqu	%ymm7,352(%rdi)
// The flags still come from cmpq $384,%rdx in L$tail8x (the vector
// instructions above do not modify flags): je means exactly 384 bytes.
1538	je	L$done8x
1539
// Otherwise skip the 384 bytes just processed, reset the byte index %r10 and
// spill the next 64 bytes (%ymm11, %ymm9) to the stack for the byte loop.
1540	leaq	384(%rsi),%rsi
1541	xorq	%r10,%r10
1542	vmovdqa	%ymm11,0(%rsp)
1543	leaq	384(%rdi),%rdi
1544	subq	$384,%rdx
1545	vmovdqa	%ymm9,32(%rsp)
1546	jmp	L$oop_tail8x
1547
1548.p2align	5
// L$448_or_more8x: reached from L$tail8x when %rdx >= 448 (and, given the
// jb after cmpq $512 that leads to L$tail8x in this chunk, below 512).
// XORs the first 448 input bytes at (%rsi) with fourteen ymm registers, in
// the same register order as the full-block path, and stores them at (%rdi).
1549L$448_or_more8x:
1550	vpxor	0(%rsi),%ymm6,%ymm6
1551	vpxor	32(%rsi),%ymm8,%ymm8
1552	vpxor	64(%rsi),%ymm1,%ymm1
1553	vpxor	96(%rsi),%ymm5,%ymm5
1554	vpxor	128(%rsi),%ymm12,%ymm12
1555	vpxor	160(%rsi),%ymm13,%ymm13
1556	vpxor	192(%rsi),%ymm10,%ymm10
1557	vpxor	224(%rsi),%ymm15,%ymm15
1558	vpxor	256(%rsi),%ymm14,%ymm14
1559	vpxor	288(%rsi),%ymm2,%ymm2
1560	vpxor	320(%rsi),%ymm3,%ymm3
1561	vpxor	352(%rsi),%ymm7,%ymm7
1562	vpxor	384(%rsi),%ymm11,%ymm11
1563	vpxor	416(%rsi),%ymm9,%ymm9
1564	vmovdqu	%ymm6,0(%rdi)
1565	vmovdqu	%ymm8,32(%rdi)
1566	vmovdqu	%ymm1,64(%rdi)
1567	vmovdqu	%ymm5,96(%rdi)
1568	vmovdqu	%ymm12,128(%rdi)
1569	vmovdqu	%ymm13,160(%rdi)
1570	vmovdqu	%ymm10,192(%rdi)
1571	vmovdqu	%ymm15,224(%rdi)
1572	vmovdqu	%ymm14,256(%rdi)
1573	vmovdqu	%ymm2,288(%rdi)
1574	vmovdqu	%ymm3,320(%rdi)
1575	vmovdqu	%ymm7,352(%rdi)
1576	vmovdqu	%ymm11,384(%rdi)
1577	vmovdqu	%ymm9,416(%rdi)
// The flags still come from cmpq $448,%rdx in L$tail8x (the vector
// instructions above do not modify flags): je means exactly 448 bytes.
1578	je	L$done8x
1579
// Otherwise skip the 448 bytes just processed, reset the byte index %r10 and
// spill the last 64 bytes (%ymm0, %ymm4) to the stack. There is no jmp here:
// execution falls through into L$oop_tail8x.
1580	leaq	448(%rsi),%rsi
1581	xorq	%r10,%r10
1582	vmovdqa	%ymm0,0(%rsp)
1583	leaq	448(%rdi),%rdi
1584	subq	$448,%rdx
1585	vmovdqa	%ymm4,32(%rsp)
1586
// L$oop_tail8x: byte-at-a-time loop for the final sub-64-byte remainder.
// On entry: %r10 = 0 (byte index), %rdx = bytes left, and the 64 bytes
// spilled at 0(%rsp)..63(%rsp) are the values to XOR against the input.
// Each iteration computes out[i] = in[i] ^ stack[i] with in = %rsi and
// out = %rdi. %eax and %ecx are used as scratch.
// NOTE(review): the loop tests %rdx only after decrementing it, so it must
// be entered with %rdx != 0. The N_or_more paths guarantee this through
// their je to L$done8x; the below-64 path depends on code outside this view.
1587L$oop_tail8x:
1588	movzbl	(%rsi,%r10,1),%eax
1589	movzbl	(%rsp,%r10,1),%ecx
1590	leaq	1(%r10),%r10
1591	xorl	%ecx,%eax
// %r10 was already incremented above, hence the -1 displacement.
1592	movb	%al,-1(%rdi,%r10,1)
1593	decq	%rdx
1594	jnz	L$oop_tail8x
1595
// L$done8x: common exit of the 8x path.
// vzeroall clears every ymm register; presumably this is done so that no
// cipher state is left behind in vector registers (confirm against the
// generator script). %rsp is then reloaded from %r9.
// NOTE(review): %r9 is assumed to hold the stack pointer value saved by the
// prologue, which is outside this view; confirm there.
1596L$done8x:
1597	vzeroall
1598	leaq	(%r9),%rsp
1599
1600L$8x_epilogue:
1601	ret
1602
1603
1604#endif
1605