xref: /aosp_15_r20/external/boringssl/src/gen/crypto/chacha-x86_64-apple.S (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
7.text
8
// Read-only constant tables for the ChaCha20 routines in this file.
// The block starts on a 64-byte boundary; every table before L$sigma has a
// size that is a multiple of 16 bytes, so the tables that are loaded with
// movdqa (L$one, L$rot16, L$rot24, L$sigma, ...) are 16-byte aligned.
9.section	__DATA,__const
10.p2align	6
// Four zero dwords. Not referenced in the visible part of this file.
11L$zero:
12.long	0,0,0,0
// Counter increment for the one-block-at-a-time paths: +1 in the lowest
// 32-bit lane only.
13L$one:
14.long	1,0,0,0
// Per-lane counter offsets used when the 4-block path sets up its counters.
15L$inc:
16.long	0,1,2,3
// Added to all four lane counters on each outer iteration of the 4-block path.
17L$four:
18.long	4,4,4,4
// Per-lane counter offsets used when the 8-block (AVX2) path sets up its
// counters.
19L$incy:
20.long	0,2,4,6,1,3,5,7
// Added to all eight lane counters on each outer iteration of the 8-block path.
21L$eight:
22.long	8,8,8,8,8,8,8,8
// pshufb mask: within each 32-bit lane, destination bytes 0..3 take source
// bytes 2,3,0,1, i.e. a rotation of the lane by 16 bits.
23L$rot16:
24.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
// pshufb mask: within each 32-bit lane, destination bytes 0..3 take source
// bytes 3,0,1,2, i.e. a rotation left by 8 bits (equivalently right by 24).
25L$rot24:
26.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
// ASCII "expand 32-byte k" followed by a NUL byte: the four constant words
// that form state words 0-3.
27L$sigma:
28.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
29.p2align	6
// The next four tables are not referenced in the visible part of this file
// (NOTE(review): they look like constants for a 16-lane variant -- confirm
// before relying on that).
30L$zeroz:
31.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
32L$fourz:
33.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
34L$incz:
35.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
36L$sixteen:
37.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
// NUL-terminated ASCII identification string:
// "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>".
38.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Back to the code section for the routines that follow.
39.text
// _ChaCha20_ctr32_nohw: ChaCha20 keystream XOR using only general-purpose
// registers for the rounds, producing one 64-byte block per outer iteration.
//
// Registers read on entry (System V argument order):
//   %rdi  output pointer
//   %rsi  input pointer
//   %rdx  byte count
//   %rcx  pointer to 32 bytes that become state words 4-11
//   %r8   pointer to 16 bytes that become state words 12-15; word 12 is the
//         per-block counter (it is the lane incremented via L$one)
// NOTE(review): this matches a C prototype of the form
//   (out, in, len, key, counter) -- confirm against the declaring header.
//
// Precondition: the byte count must be nonzero. There is no zero-length check,
// and with a count of 0 the byte-wise tail loop would decrement through zero.
//
// Frame after the six pushes and the subq below:
//    0..15(%rsp)  written only by the tail path (keystream words 0-3)
//   16..31(%rsp)  state words 4-7
//   32..47(%rsp)  state words 8-11; also spill space for the working copies
//                 of those words while the rounds run
//   48..63(%rsp)  state words 12-15
//   64..87(%rsp)  saved byte count, input pointer, output pointer
// The movdqa accesses to the frame need %rsp to be 16-byte aligned, which
// holds when the caller kept the usual 16-byte stack alignment at the call.
//
// Callee-saved rbx, rbp, r12-r15 are saved and restored. rax, rcx, rdx, rsi,
// rdi, r8-r11, xmm0-xmm4 and the flags are overwritten.
40.globl	_ChaCha20_ctr32_nohw
41.private_extern _ChaCha20_ctr32_nohw
42
43.p2align	6
44_ChaCha20_ctr32_nohw:
45
// _CET_ENDBR is a macro that is not defined in this file (presumably supplied
// by the included asm_base.h as a CET landing pad -- confirm).
46_CET_ENDBR
// Save every callee-saved register that the round loop uses.
47	pushq	%rbx
48
49	pushq	%rbp
50
51	pushq	%r12
52
53	pushq	%r13
54
55	pushq	%r14
56
57	pushq	%r15
58
// 64 bytes of state/keystream plus three 8-byte save slots.
59	subq	$64+24,%rsp
60
61L$ctr32_body:
62
63
// Load state words 4-15 from the caller and the counter increment (1,0,0,0).
64	movdqu	(%rcx),%xmm1
65	movdqu	16(%rcx),%xmm2
66	movdqu	(%r8),%xmm3
67	movdqa	L$one(%rip),%xmm4
68
69
// Keep a stack copy of words 4-15; %rbp carries the remaining byte count.
70	movdqa	%xmm1,16(%rsp)
71	movdqa	%xmm2,32(%rsp)
72	movdqa	%xmm3,48(%rsp)
73	movq	%rdx,%rbp
74	jmp	L$oop_outer
75
76.p2align	5
// Per-block setup. Words 0-3 are the four immediates (the little-endian
// dwords of "expand 32-byte k"), words 4-7 and 13-15 come from the stack copy,
// and word 12 (the counter) comes from the low lane of xmm3.
77L$oop_outer:
78	movl	$0x61707865,%eax
79	movl	$0x3320646e,%ebx
80	movl	$0x79622d32,%ecx
81	movl	$0x6b206574,%edx
82	movl	16(%rsp),%r8d
83	movl	20(%rsp),%r9d
84	movl	24(%rsp),%r10d
85	movl	28(%rsp),%r11d
86	movd	%xmm3,%r12d
87	movl	52(%rsp),%r13d
88	movl	56(%rsp),%r14d
89	movl	60(%rsp),%r15d
90
// Park the byte count and both pointers so that rbp, rsi and rdi can be
// reused: ebp counts 10 loop iterations, esi/edi hold working words 8 and 9.
91	movq	%rbp,64+0(%rsp)
92	movl	$10,%ebp
93	movq	%rsi,64+8(%rsp)
// The following .byte sequence encodes movq %xmm2,%rsi, which fetches words
// 8 and 9 together; the shift below splits word 9 out into edi.
94.byte	102,72,15,126,214
95	movq	%rdi,64+16(%rsp)
96	movq	%rsi,%rdi
97	shrq	$32,%rdi
98	jmp	L$oop
99
100.p2align	5
// One iteration is one double round: four column quarter rounds followed by
// four diagonal quarter rounds, interleaved two at a time. Each quarter round
// is the add / xor / rotate chain with rotation amounts 16, 12, 8 and 7.
// Only two of the words 8-11 fit in registers (esi, edi) at any time; the
// other two live at 32..47(%rsp).
101L$oop:
// Column quarter rounds on words (0,4,8,12) and (1,5,9,13).
102	addl	%r8d,%eax
103	xorl	%eax,%r12d
104	roll	$16,%r12d
105	addl	%r9d,%ebx
106	xorl	%ebx,%r13d
107	roll	$16,%r13d
108	addl	%r12d,%esi
109	xorl	%esi,%r8d
110	roll	$12,%r8d
111	addl	%r13d,%edi
112	xorl	%edi,%r9d
113	roll	$12,%r9d
114	addl	%r8d,%eax
115	xorl	%eax,%r12d
116	roll	$8,%r12d
117	addl	%r9d,%ebx
118	xorl	%ebx,%r13d
119	roll	$8,%r13d
120	addl	%r12d,%esi
121	xorl	%esi,%r8d
122	roll	$7,%r8d
123	addl	%r13d,%edi
124	xorl	%edi,%r9d
125	roll	$7,%r9d
// Spill words 8,9 and bring words 10,11 into esi/edi.
126	movl	%esi,32(%rsp)
127	movl	%edi,36(%rsp)
128	movl	40(%rsp),%esi
129	movl	44(%rsp),%edi
// Column quarter rounds on words (2,6,10,14) and (3,7,11,15).
130	addl	%r10d,%ecx
131	xorl	%ecx,%r14d
132	roll	$16,%r14d
133	addl	%r11d,%edx
134	xorl	%edx,%r15d
135	roll	$16,%r15d
136	addl	%r14d,%esi
137	xorl	%esi,%r10d
138	roll	$12,%r10d
139	addl	%r15d,%edi
140	xorl	%edi,%r11d
141	roll	$12,%r11d
142	addl	%r10d,%ecx
143	xorl	%ecx,%r14d
144	roll	$8,%r14d
145	addl	%r11d,%edx
146	xorl	%edx,%r15d
147	roll	$8,%r15d
148	addl	%r14d,%esi
149	xorl	%esi,%r10d
150	roll	$7,%r10d
151	addl	%r15d,%edi
152	xorl	%edi,%r11d
153	roll	$7,%r11d
// Diagonal quarter rounds on words (0,5,10,15) and (1,6,11,12).
154	addl	%r9d,%eax
155	xorl	%eax,%r15d
156	roll	$16,%r15d
157	addl	%r10d,%ebx
158	xorl	%ebx,%r12d
159	roll	$16,%r12d
160	addl	%r15d,%esi
161	xorl	%esi,%r9d
162	roll	$12,%r9d
163	addl	%r12d,%edi
164	xorl	%edi,%r10d
165	roll	$12,%r10d
166	addl	%r9d,%eax
167	xorl	%eax,%r15d
168	roll	$8,%r15d
169	addl	%r10d,%ebx
170	xorl	%ebx,%r12d
171	roll	$8,%r12d
172	addl	%r15d,%esi
173	xorl	%esi,%r9d
174	roll	$7,%r9d
175	addl	%r12d,%edi
176	xorl	%edi,%r10d
177	roll	$7,%r10d
// Spill words 10,11 and bring words 8,9 back into esi/edi.
178	movl	%esi,40(%rsp)
179	movl	%edi,44(%rsp)
180	movl	32(%rsp),%esi
181	movl	36(%rsp),%edi
// Diagonal quarter rounds on words (2,7,8,13) and (3,4,9,14).
182	addl	%r11d,%ecx
183	xorl	%ecx,%r13d
184	roll	$16,%r13d
185	addl	%r8d,%edx
186	xorl	%edx,%r14d
187	roll	$16,%r14d
188	addl	%r13d,%esi
189	xorl	%esi,%r11d
190	roll	$12,%r11d
191	addl	%r14d,%edi
192	xorl	%edi,%r8d
193	roll	$12,%r8d
194	addl	%r11d,%ecx
195	xorl	%ecx,%r13d
196	roll	$8,%r13d
197	addl	%r8d,%edx
198	xorl	%edx,%r14d
199	roll	$8,%r14d
200	addl	%r13d,%esi
201	xorl	%esi,%r11d
202	roll	$7,%r11d
203	addl	%r14d,%edi
204	xorl	%edi,%r8d
205	roll	$7,%r8d
206	decl	%ebp
207	jnz	L$oop
// Rounds finished. Write words 8,9 back so that 32..47(%rsp) holds the
// working words 8-11, restore the byte count and pointers, copy the initial
// words 8-11 into xmm1, and advance the counter kept in xmm3 for the next block.
208	movl	%edi,36(%rsp)
209	movl	%esi,32(%rsp)
210	movq	64(%rsp),%rbp
211	movdqa	%xmm2,%xmm1
212	movq	64+8(%rsp),%rsi
213	paddd	%xmm4,%xmm3
214	movq	64+16(%rsp),%rdi
215
// Feed-forward: add the initial state to the working state. 48(%rsp) still
// holds the counter of the block being produced (only xmm3 was advanced);
// xmm1 ends up holding keystream words 8-11.
216	addl	$0x61707865,%eax
217	addl	$0x3320646e,%ebx
218	addl	$0x79622d32,%ecx
219	addl	$0x6b206574,%edx
220	addl	16(%rsp),%r8d
221	addl	20(%rsp),%r9d
222	addl	24(%rsp),%r10d
223	addl	28(%rsp),%r11d
224	addl	48(%rsp),%r12d
225	addl	52(%rsp),%r13d
226	addl	56(%rsp),%r14d
227	addl	60(%rsp),%r15d
228	paddd	32(%rsp),%xmm1
229
// Fewer than 64 bytes left: finish byte by byte in the tail path.
230	cmpq	$64,%rbp
231	jb	L$tail
232
// Full block: XOR the keystream with 64 input bytes.
233	xorl	0(%rsi),%eax
234	xorl	4(%rsi),%ebx
235	xorl	8(%rsi),%ecx
236	xorl	12(%rsi),%edx
237	xorl	16(%rsi),%r8d
238	xorl	20(%rsi),%r9d
239	xorl	24(%rsi),%r10d
240	xorl	28(%rsi),%r11d
241	movdqu	32(%rsi),%xmm0
242	xorl	48(%rsi),%r12d
243	xorl	52(%rsi),%r13d
244	xorl	56(%rsi),%r14d
245	xorl	60(%rsi),%r15d
246	leaq	64(%rsi),%rsi
247	pxor	%xmm1,%xmm0
248
// Put the initial words 8-11 back into their stack slot (the rounds used it
// as spill space) and store the advanced counter for the next block.
249	movdqa	%xmm2,32(%rsp)
250	movd	%xmm3,48(%rsp)
251
// Write the 64 output bytes.
252	movl	%eax,0(%rdi)
253	movl	%ebx,4(%rdi)
254	movl	%ecx,8(%rdi)
255	movl	%edx,12(%rdi)
256	movl	%r8d,16(%rdi)
257	movl	%r9d,20(%rdi)
258	movl	%r10d,24(%rdi)
259	movl	%r11d,28(%rdi)
260	movdqu	%xmm0,32(%rdi)
261	movl	%r12d,48(%rdi)
262	movl	%r13d,52(%rdi)
263	movl	%r14d,56(%rdi)
264	movl	%r15d,60(%rdi)
265	leaq	64(%rdi),%rdi
266
// Loop while bytes remain; an exact multiple of 64 exits here.
267	subq	$64,%rbp
268	jnz	L$oop_outer
269
270	jmp	L$done
271
272.p2align	4
// Partial final block: lay the 64 keystream bytes out at 0..63(%rsp), which
// overwrites the stack copy of the state, then XOR the remaining %rbp bytes
// one at a time. %rbx is cleared to serve as the byte index.
273L$tail:
274	movl	%eax,0(%rsp)
275	movl	%ebx,4(%rsp)
276	xorq	%rbx,%rbx
277	movl	%ecx,8(%rsp)
278	movl	%edx,12(%rsp)
279	movl	%r8d,16(%rsp)
280	movl	%r9d,20(%rsp)
281	movl	%r10d,24(%rsp)
282	movl	%r11d,28(%rsp)
283	movdqa	%xmm1,32(%rsp)
284	movl	%r12d,48(%rsp)
285	movl	%r13d,52(%rsp)
286	movl	%r14d,56(%rsp)
287	movl	%r15d,60(%rsp)
288
// out[i] = in[i] ^ keystream[i]; the index is advanced before the store,
// hence the -1 displacement.
289L$oop_tail:
290	movzbl	(%rsi,%rbx,1),%eax
291	movzbl	(%rsp,%rbx,1),%edx
292	leaq	1(%rbx),%rbx
293	xorl	%edx,%eax
294	movb	%al,-1(%rdi,%rbx,1)
295	decq	%rbp
296	jnz	L$oop_tail
297
// Epilogue: %rsi is set to the stack pointer value from before the six
// pushes, the callee-saved registers are reloaded relative to it, and the
// frame is dropped. The frame contents are not cleared before returning.
298L$done:
299	leaq	64+24+48(%rsp),%rsi
300	movq	-48(%rsi),%r15
301
302	movq	-40(%rsi),%r14
303
304	movq	-32(%rsi),%r13
305
306	movq	-24(%rsi),%r12
307
308	movq	-16(%rsi),%rbp
309
310	movq	-8(%rsi),%rbx
311
312	leaq	(%rsi),%rsp
313
// L$no_data is not referenced in the visible part of this file.
314L$no_data:
315	ret
316
317
// _ChaCha20_ctr32_ssse3: ChaCha20 keystream XOR, one 64-byte block per outer
// iteration, with the 4x4 state held as four XMM rows:
//   xmm0 = words 0-3 (L$sigma), xmm1 = words 4-7, xmm2 = words 8-11,
//   xmm3 = words 12-15 (word 12 is the per-block counter).
//
// Registers read on entry (System V argument order):
//   %rdi  output pointer          %rsi  input pointer
//   %rdx  byte count              %rcx  pointer to 32 bytes (words 4-11)
//   %r8   pointer to 16 bytes (words 12-15)
//
// Precondition: the byte count must be nonzero. There is no zero-length check,
// and with a count of 0 the byte-wise tail loop would decrement through zero.
//
// Frame: 0..63(%rsp) holds the initial state of the current block (it is added
// back after the rounds); the tail path reuses it as the keystream buffer.
// The movdqa accesses need %rsp to be 16-byte aligned after the subq, which
// holds when the caller kept the usual 16-byte stack alignment at the call.
//
// %r9 keeps the entry value of %rsp for the epilogue. No callee-saved
// general-purpose register is written. rax, rcx, rdx, rsi, rdi, r8, r9,
// xmm0-xmm7 and the flags are overwritten.
318.globl	_ChaCha20_ctr32_ssse3
319.private_extern _ChaCha20_ctr32_ssse3
320
321.p2align	5
322_ChaCha20_ctr32_ssse3:
323
// _CET_ENDBR is a macro that is not defined in this file (presumably supplied
// by the included asm_base.h as a CET landing pad -- confirm).
324_CET_ENDBR
325	movq	%rsp,%r9
326
327	subq	$64+8,%rsp
// Load the four state rows and the two byte-shuffle masks (xmm6 rotates each
// lane by 16 bits, xmm7 rotates each lane left by 8 bits).
328	movdqa	L$sigma(%rip),%xmm0
329	movdqu	(%rcx),%xmm1
330	movdqu	16(%rcx),%xmm2
331	movdqu	(%r8),%xmm3
332	movdqa	L$rot16(%rip),%xmm6
333	movdqa	L$rot24(%rip),%xmm7
334
// Save the initial state; %r8 (pointer already consumed) becomes the counter
// for 10 loop iterations.
335	movdqa	%xmm0,0(%rsp)
336	movdqa	%xmm1,16(%rsp)
337	movdqa	%xmm2,32(%rsp)
338	movdqa	%xmm3,48(%rsp)
339	movq	$10,%r8
340	jmp	L$oop_ssse3
341
342.p2align	5
// Entered for every block after the first: reload the saved state with the
// counter lane advanced by one, and store the advanced row back.
343L$oop_outer_ssse3:
344	movdqa	L$one(%rip),%xmm3
345	movdqa	0(%rsp),%xmm0
346	movdqa	16(%rsp),%xmm1
347	movdqa	32(%rsp),%xmm2
348	paddd	48(%rsp),%xmm3
349	movq	$10,%r8
350	movdqa	%xmm3,48(%rsp)
351	jmp	L$oop_ssse3
352
353.p2align	5
// One iteration is one double round. All four quarter rounds of a half round
// run in parallel across the lanes. The .byte 102,15,56,0,222 sequence encodes
// pshufb %xmm6,%xmm3 (rotate by 16) and .byte 102,15,56,0,223 encodes
// pshufb %xmm7,%xmm3 (rotate left by 8). The rotations by 12 and 7 use a
// shift-left / shift-right / or sequence with xmm4 as the temporary.
354L$oop_ssse3:
// Column half round.
355	paddd	%xmm1,%xmm0
356	pxor	%xmm0,%xmm3
357.byte	102,15,56,0,222
358	paddd	%xmm3,%xmm2
359	pxor	%xmm2,%xmm1
360	movdqa	%xmm1,%xmm4
361	psrld	$20,%xmm1
362	pslld	$12,%xmm4
363	por	%xmm4,%xmm1
364	paddd	%xmm1,%xmm0
365	pxor	%xmm0,%xmm3
366.byte	102,15,56,0,223
367	paddd	%xmm3,%xmm2
368	pxor	%xmm2,%xmm1
369	movdqa	%xmm1,%xmm4
370	psrld	$25,%xmm1
371	pslld	$7,%xmm4
372	por	%xmm4,%xmm1
// Rotate the lanes of rows 1, 2 and 3 by one, two and three positions so that
// the diagonals line up as columns.
373	pshufd	$78,%xmm2,%xmm2
374	pshufd	$57,%xmm1,%xmm1
375	pshufd	$147,%xmm3,%xmm3
376	nop
// Diagonal half round (same sequence on the realigned rows).
377	paddd	%xmm1,%xmm0
378	pxor	%xmm0,%xmm3
379.byte	102,15,56,0,222
380	paddd	%xmm3,%xmm2
381	pxor	%xmm2,%xmm1
382	movdqa	%xmm1,%xmm4
383	psrld	$20,%xmm1
384	pslld	$12,%xmm4
385	por	%xmm4,%xmm1
386	paddd	%xmm1,%xmm0
387	pxor	%xmm0,%xmm3
388.byte	102,15,56,0,223
389	paddd	%xmm3,%xmm2
390	pxor	%xmm2,%xmm1
391	movdqa	%xmm1,%xmm4
392	psrld	$25,%xmm1
393	pslld	$7,%xmm4
394	por	%xmm4,%xmm1
// Undo the lane rotation (rows 1 and 3 use the opposite shuffles from above).
395	pshufd	$78,%xmm2,%xmm2
396	pshufd	$147,%xmm1,%xmm1
397	pshufd	$57,%xmm3,%xmm3
398	decq	%r8
399	jnz	L$oop_ssse3
// Feed-forward: add the saved initial state to obtain the keystream block.
400	paddd	0(%rsp),%xmm0
401	paddd	16(%rsp),%xmm1
402	paddd	32(%rsp),%xmm2
403	paddd	48(%rsp),%xmm3
404
// Fewer than 64 bytes left: finish byte by byte in the tail path.
405	cmpq	$64,%rdx
406	jb	L$tail_ssse3
407
// Full block: XOR 64 input bytes with the keystream and store the result.
408	movdqu	0(%rsi),%xmm4
409	movdqu	16(%rsi),%xmm5
410	pxor	%xmm4,%xmm0
411	movdqu	32(%rsi),%xmm4
412	pxor	%xmm5,%xmm1
413	movdqu	48(%rsi),%xmm5
414	leaq	64(%rsi),%rsi
415	pxor	%xmm4,%xmm2
416	pxor	%xmm5,%xmm3
417
418	movdqu	%xmm0,0(%rdi)
419	movdqu	%xmm1,16(%rdi)
420	movdqu	%xmm2,32(%rdi)
421	movdqu	%xmm3,48(%rdi)
422	leaq	64(%rdi),%rdi
423
// Loop while bytes remain; an exact multiple of 64 exits here.
424	subq	$64,%rdx
425	jnz	L$oop_outer_ssse3
426
427	jmp	L$done_ssse3
428
429.p2align	4
// Partial final block: store the keystream over the saved state and XOR the
// remaining %rdx bytes one at a time, with %r8 as the byte index.
430L$tail_ssse3:
431	movdqa	%xmm0,0(%rsp)
432	movdqa	%xmm1,16(%rsp)
433	movdqa	%xmm2,32(%rsp)
434	movdqa	%xmm3,48(%rsp)
435	xorq	%r8,%r8
436
// out[i] = in[i] ^ keystream[i]; the index is advanced before the store,
// hence the -1 displacement.
437L$oop_tail_ssse3:
438	movzbl	(%rsi,%r8,1),%eax
439	movzbl	(%rsp,%r8,1),%ecx
440	leaq	1(%r8),%r8
441	xorl	%ecx,%eax
442	movb	%al,-1(%rdi,%r8,1)
443	decq	%rdx
444	jnz	L$oop_tail_ssse3
445
// Epilogue: restore the entry stack pointer saved in %r9. The frame contents
// are not cleared before returning.
446L$done_ssse3:
447	leaq	(%r9),%rsp
448
// L$ssse3_epilogue is not referenced in the visible part of this file.
449L$ssse3_epilogue:
450	ret
451
452
// _ChaCha20_ctr32_ssse3_4x: ChaCha20 keystream XOR computing four 64-byte
// blocks per outer iteration. Each XMM register holds one state word for four
// consecutive blocks, one block per 32-bit lane:
//   xmm8-xmm11   words 0-3        xmm12-xmm15  words 4-7
//   xmm4,xmm5    two of words 8-11 (the other two are spilled to the stack)
//   xmm0-xmm3    words 12-15      xmm6,xmm7    shuffle masks / temporaries
//
// Registers read on entry (System V argument order):
//   %rdi  output pointer          %rsi  input pointer
//   %rdx  byte count              %rcx  pointer to 32 bytes (words 4-11)
//   %r8   pointer to 16 bytes (words 12-15; word 12 is the block counter)
//
// Precondition: the byte count must be nonzero. There is no zero-length check,
// and with a count of 0 the byte-wise tail loop would decrement through zero.
//
// Frame (%rcx is repointed to %rsp+256 and used as a second base register):
//     0..63(%rsp)   scratch: spilled words 8-11 during the rounds, then the
//                   first 16 bytes of each of the four blocks, then the
//                   keystream buffer of the tail path
//    64..127(%rsp)  words 0-3, each broadcast to all four lanes
//   128..191        words 4-7 broadcast      (128-256(%rcx) ...)
//   192..255        words 8-11 broadcast     (192-256(%rcx) ...)
//   256..271        word 12: the four per-block counters
//   272..319        words 13-15 broadcast
// The movdqa accesses need %rsp to be 16-byte aligned after the subq, which
// holds when the caller kept the usual 16-byte stack alignment at the call.
//
// %r9 keeps the entry value of %rsp for the epilogue. No callee-saved
// general-purpose register is written. rax, rcx, rdx, rsi, rdi, r8-r11,
// xmm0-xmm15 and the flags are overwritten.
453.globl	_ChaCha20_ctr32_ssse3_4x
454.private_extern _ChaCha20_ctr32_ssse3_4x
455
456.p2align	5
457_ChaCha20_ctr32_ssse3_4x:
458
// _CET_ENDBR is a macro that is not defined in this file (presumably supplied
// by the included asm_base.h as a CET landing pad -- confirm).
459_CET_ENDBR
460	movq	%rsp,%r9
461
462	subq	$0x140+8,%rsp
// Load the four 16-byte state rows, then set up the frame base (%rcx) and the
// pointers to the two pshufb masks (%r10 -> L$rot16, %r11 -> L$rot24).
463	movdqa	L$sigma(%rip),%xmm11
464	movdqu	(%rcx),%xmm15
465	movdqu	16(%rcx),%xmm7
466	movdqu	(%r8),%xmm3
467	leaq	256(%rsp),%rcx
468	leaq	L$rot16(%rip),%r10
469	leaq	L$rot24(%rip),%r11
470
// Broadcast each word of a row to all four lanes of its own register
// (pshufd with 0x00 / 0x55 / 0xaa / 0xff selects lane 0 / 1 / 2 / 3) and save
// the broadcast copies in the frame. Words 0-3:
471	pshufd	$0x00,%xmm11,%xmm8
472	pshufd	$0x55,%xmm11,%xmm9
473	movdqa	%xmm8,64(%rsp)
474	pshufd	$0xaa,%xmm11,%xmm10
475	movdqa	%xmm9,80(%rsp)
476	pshufd	$0xff,%xmm11,%xmm11
477	movdqa	%xmm10,96(%rsp)
478	movdqa	%xmm11,112(%rsp)
479
// Words 4-7.
480	pshufd	$0x00,%xmm15,%xmm12
481	pshufd	$0x55,%xmm15,%xmm13
482	movdqa	%xmm12,128-256(%rcx)
483	pshufd	$0xaa,%xmm15,%xmm14
484	movdqa	%xmm13,144-256(%rcx)
485	pshufd	$0xff,%xmm15,%xmm15
486	movdqa	%xmm14,160-256(%rcx)
487	movdqa	%xmm15,176-256(%rcx)
488
// Words 8-11.
489	pshufd	$0x00,%xmm7,%xmm4
490	pshufd	$0x55,%xmm7,%xmm5
491	movdqa	%xmm4,192-256(%rcx)
492	pshufd	$0xaa,%xmm7,%xmm6
493	movdqa	%xmm5,208-256(%rcx)
494	pshufd	$0xff,%xmm7,%xmm7
495	movdqa	%xmm6,224-256(%rcx)
496	movdqa	%xmm7,240-256(%rcx)
497
// Words 12-15. The counter word gets the per-lane offsets 0,1,2,3 so the four
// lanes generate four consecutive blocks; it is stored at L$oop_enter4x.
498	pshufd	$0x00,%xmm3,%xmm0
499	pshufd	$0x55,%xmm3,%xmm1
500	paddd	L$inc(%rip),%xmm0
501	pshufd	$0xaa,%xmm3,%xmm2
502	movdqa	%xmm1,272-256(%rcx)
503	pshufd	$0xff,%xmm3,%xmm3
504	movdqa	%xmm2,288-256(%rcx)
505	movdqa	%xmm3,304-256(%rcx)
506
507	jmp	L$oop_enter4x
508
509.p2align	5
// Entered for every group of four blocks after the first: reload the whole
// initial state from the frame and advance all four counters by 4.
510L$oop_outer4x:
511	movdqa	64(%rsp),%xmm8
512	movdqa	80(%rsp),%xmm9
513	movdqa	96(%rsp),%xmm10
514	movdqa	112(%rsp),%xmm11
515	movdqa	128-256(%rcx),%xmm12
516	movdqa	144-256(%rcx),%xmm13
517	movdqa	160-256(%rcx),%xmm14
518	movdqa	176-256(%rcx),%xmm15
519	movdqa	192-256(%rcx),%xmm4
520	movdqa	208-256(%rcx),%xmm5
521	movdqa	224-256(%rcx),%xmm6
522	movdqa	240-256(%rcx),%xmm7
523	movdqa	256-256(%rcx),%xmm0
524	movdqa	272-256(%rcx),%xmm1
525	movdqa	288-256(%rcx),%xmm2
526	movdqa	304-256(%rcx),%xmm3
527	paddd	L$four(%rip),%xmm0
528
// Spill words 10 and 11 (xmm6/xmm7 are needed as mask and temporaries), load
// the rotate-by-16 mask, record the counters used for this group, and set the
// iteration count to 10.
529L$oop_enter4x:
530	movdqa	%xmm6,32(%rsp)
531	movdqa	%xmm7,48(%rsp)
532	movdqa	(%r10),%xmm7
533	movl	$10,%eax
534	movdqa	%xmm0,256-256(%rcx)
535	jmp	L$oop4x
536
537.p2align	5
// One iteration is one double round, two quarter rounds at a time.
// The .byte 102,15,56,0,N sequences encode pshufb: N = 199, 207, 215, 223
// apply the mask in xmm7 to xmm0, xmm1, xmm2, xmm3 respectively, and
// N = 198, 206, 214, 222 apply the mask in xmm6 to the same registers.
// xmm7 holds the rotate-by-16 mask from (%r10) and xmm6 the rotate-left-by-8
// mask from (%r11) at the point of use; in between both serve as temporaries
// for the shift/or rotations by 12 and 7, so the masks are reloaded each time.
538L$oop4x:
// Column quarter rounds on words (0,4,8,12) and (1,5,9,13).
539	paddd	%xmm12,%xmm8
540	paddd	%xmm13,%xmm9
541	pxor	%xmm8,%xmm0
542	pxor	%xmm9,%xmm1
543.byte	102,15,56,0,199
544.byte	102,15,56,0,207
545	paddd	%xmm0,%xmm4
546	paddd	%xmm1,%xmm5
547	pxor	%xmm4,%xmm12
548	pxor	%xmm5,%xmm13
549	movdqa	%xmm12,%xmm6
550	pslld	$12,%xmm12
551	psrld	$20,%xmm6
552	movdqa	%xmm13,%xmm7
553	pslld	$12,%xmm13
554	por	%xmm6,%xmm12
555	psrld	$20,%xmm7
556	movdqa	(%r11),%xmm6
557	por	%xmm7,%xmm13
558	paddd	%xmm12,%xmm8
559	paddd	%xmm13,%xmm9
560	pxor	%xmm8,%xmm0
561	pxor	%xmm9,%xmm1
562.byte	102,15,56,0,198
563.byte	102,15,56,0,206
564	paddd	%xmm0,%xmm4
565	paddd	%xmm1,%xmm5
566	pxor	%xmm4,%xmm12
567	pxor	%xmm5,%xmm13
568	movdqa	%xmm12,%xmm7
569	pslld	$7,%xmm12
570	psrld	$25,%xmm7
571	movdqa	%xmm13,%xmm6
572	pslld	$7,%xmm13
573	por	%xmm7,%xmm12
574	psrld	$25,%xmm6
575	movdqa	(%r10),%xmm7
576	por	%xmm6,%xmm13
// Spill words 8,9 and bring words 10,11 into xmm4/xmm5.
577	movdqa	%xmm4,0(%rsp)
578	movdqa	%xmm5,16(%rsp)
579	movdqa	32(%rsp),%xmm4
580	movdqa	48(%rsp),%xmm5
// Column quarter rounds on words (2,6,10,14) and (3,7,11,15).
581	paddd	%xmm14,%xmm10
582	paddd	%xmm15,%xmm11
583	pxor	%xmm10,%xmm2
584	pxor	%xmm11,%xmm3
585.byte	102,15,56,0,215
586.byte	102,15,56,0,223
587	paddd	%xmm2,%xmm4
588	paddd	%xmm3,%xmm5
589	pxor	%xmm4,%xmm14
590	pxor	%xmm5,%xmm15
591	movdqa	%xmm14,%xmm6
592	pslld	$12,%xmm14
593	psrld	$20,%xmm6
594	movdqa	%xmm15,%xmm7
595	pslld	$12,%xmm15
596	por	%xmm6,%xmm14
597	psrld	$20,%xmm7
598	movdqa	(%r11),%xmm6
599	por	%xmm7,%xmm15
600	paddd	%xmm14,%xmm10
601	paddd	%xmm15,%xmm11
602	pxor	%xmm10,%xmm2
603	pxor	%xmm11,%xmm3
604.byte	102,15,56,0,214
605.byte	102,15,56,0,222
606	paddd	%xmm2,%xmm4
607	paddd	%xmm3,%xmm5
608	pxor	%xmm4,%xmm14
609	pxor	%xmm5,%xmm15
610	movdqa	%xmm14,%xmm7
611	pslld	$7,%xmm14
612	psrld	$25,%xmm7
613	movdqa	%xmm15,%xmm6
614	pslld	$7,%xmm15
615	por	%xmm7,%xmm14
616	psrld	$25,%xmm6
617	movdqa	(%r10),%xmm7
618	por	%xmm6,%xmm15
// Diagonal quarter rounds on words (0,5,10,15) and (1,6,11,12).
619	paddd	%xmm13,%xmm8
620	paddd	%xmm14,%xmm9
621	pxor	%xmm8,%xmm3
622	pxor	%xmm9,%xmm0
623.byte	102,15,56,0,223
624.byte	102,15,56,0,199
625	paddd	%xmm3,%xmm4
626	paddd	%xmm0,%xmm5
627	pxor	%xmm4,%xmm13
628	pxor	%xmm5,%xmm14
629	movdqa	%xmm13,%xmm6
630	pslld	$12,%xmm13
631	psrld	$20,%xmm6
632	movdqa	%xmm14,%xmm7
633	pslld	$12,%xmm14
634	por	%xmm6,%xmm13
635	psrld	$20,%xmm7
636	movdqa	(%r11),%xmm6
637	por	%xmm7,%xmm14
638	paddd	%xmm13,%xmm8
639	paddd	%xmm14,%xmm9
640	pxor	%xmm8,%xmm3
641	pxor	%xmm9,%xmm0
642.byte	102,15,56,0,222
643.byte	102,15,56,0,198
644	paddd	%xmm3,%xmm4
645	paddd	%xmm0,%xmm5
646	pxor	%xmm4,%xmm13
647	pxor	%xmm5,%xmm14
648	movdqa	%xmm13,%xmm7
649	pslld	$7,%xmm13
650	psrld	$25,%xmm7
651	movdqa	%xmm14,%xmm6
652	pslld	$7,%xmm14
653	por	%xmm7,%xmm13
654	psrld	$25,%xmm6
655	movdqa	(%r10),%xmm7
656	por	%xmm6,%xmm14
// Spill words 10,11 and bring words 8,9 back into xmm4/xmm5.
657	movdqa	%xmm4,32(%rsp)
658	movdqa	%xmm5,48(%rsp)
659	movdqa	0(%rsp),%xmm4
660	movdqa	16(%rsp),%xmm5
// Diagonal quarter rounds on words (2,7,8,13) and (3,4,9,14).
661	paddd	%xmm15,%xmm10
662	paddd	%xmm12,%xmm11
663	pxor	%xmm10,%xmm1
664	pxor	%xmm11,%xmm2
665.byte	102,15,56,0,207
666.byte	102,15,56,0,215
667	paddd	%xmm1,%xmm4
668	paddd	%xmm2,%xmm5
669	pxor	%xmm4,%xmm15
670	pxor	%xmm5,%xmm12
671	movdqa	%xmm15,%xmm6
672	pslld	$12,%xmm15
673	psrld	$20,%xmm6
674	movdqa	%xmm12,%xmm7
675	pslld	$12,%xmm12
676	por	%xmm6,%xmm15
677	psrld	$20,%xmm7
678	movdqa	(%r11),%xmm6
679	por	%xmm7,%xmm12
680	paddd	%xmm15,%xmm10
681	paddd	%xmm12,%xmm11
682	pxor	%xmm10,%xmm1
683	pxor	%xmm11,%xmm2
684.byte	102,15,56,0,206
685.byte	102,15,56,0,214
686	paddd	%xmm1,%xmm4
687	paddd	%xmm2,%xmm5
688	pxor	%xmm4,%xmm15
689	pxor	%xmm5,%xmm12
690	movdqa	%xmm15,%xmm7
691	pslld	$7,%xmm15
692	psrld	$25,%xmm7
693	movdqa	%xmm12,%xmm6
694	pslld	$7,%xmm12
695	por	%xmm7,%xmm15
696	psrld	$25,%xmm6
697	movdqa	(%r10),%xmm7
698	por	%xmm6,%xmm12
699	decl	%eax
700	jnz	L$oop4x
701
// Feed-forward for words 0-3, then transpose the 4x4 dword matrix so that each
// register holds 16 contiguous keystream bytes of a single block:
// xmm8 = block 0, xmm9 = block 1, xmm6 = block 2, xmm11 = block 3.
702	paddd	64(%rsp),%xmm8
703	paddd	80(%rsp),%xmm9
704	paddd	96(%rsp),%xmm10
705	paddd	112(%rsp),%xmm11
706
707	movdqa	%xmm8,%xmm6
708	punpckldq	%xmm9,%xmm8
709	movdqa	%xmm10,%xmm7
710	punpckldq	%xmm11,%xmm10
711	punpckhdq	%xmm9,%xmm6
712	punpckhdq	%xmm11,%xmm7
713	movdqa	%xmm8,%xmm9
714	punpcklqdq	%xmm10,%xmm8
715	movdqa	%xmm6,%xmm11
716	punpcklqdq	%xmm7,%xmm6
717	punpckhqdq	%xmm10,%xmm9
718	punpckhqdq	%xmm7,%xmm11
// Feed-forward for words 4-7.
719	paddd	128-256(%rcx),%xmm12
720	paddd	144-256(%rcx),%xmm13
721	paddd	160-256(%rcx),%xmm14
722	paddd	176-256(%rcx),%xmm15
723
// Park bytes 0-15 of blocks 0 and 1 on the stack and pick up the spilled
// working words 10 and 11 in their place.
724	movdqa	%xmm8,0(%rsp)
725	movdqa	%xmm9,16(%rsp)
726	movdqa	32(%rsp),%xmm8
727	movdqa	48(%rsp),%xmm9
728
// Transpose words 4-7: xmm12 = block 0, xmm13 = block 1, xmm10 = block 2,
// xmm15 = block 3.
729	movdqa	%xmm12,%xmm10
730	punpckldq	%xmm13,%xmm12
731	movdqa	%xmm14,%xmm7
732	punpckldq	%xmm15,%xmm14
733	punpckhdq	%xmm13,%xmm10
734	punpckhdq	%xmm15,%xmm7
735	movdqa	%xmm12,%xmm13
736	punpcklqdq	%xmm14,%xmm12
737	movdqa	%xmm10,%xmm15
738	punpcklqdq	%xmm7,%xmm10
739	punpckhqdq	%xmm14,%xmm13
740	punpckhqdq	%xmm7,%xmm15
// Feed-forward for words 8-11.
741	paddd	192-256(%rcx),%xmm4
742	paddd	208-256(%rcx),%xmm5
743	paddd	224-256(%rcx),%xmm8
744	paddd	240-256(%rcx),%xmm9
745
// Park bytes 0-15 of blocks 2 and 3 on the stack; 0..63(%rsp) now holds the
// first 16 bytes of blocks 0, 1, 2, 3 in that order.
746	movdqa	%xmm6,32(%rsp)
747	movdqa	%xmm11,48(%rsp)
748
// Transpose words 8-11: xmm4 = block 0, xmm5 = block 1, xmm14 = block 2,
// xmm9 = block 3.
749	movdqa	%xmm4,%xmm14
750	punpckldq	%xmm5,%xmm4
751	movdqa	%xmm8,%xmm7
752	punpckldq	%xmm9,%xmm8
753	punpckhdq	%xmm5,%xmm14
754	punpckhdq	%xmm9,%xmm7
755	movdqa	%xmm4,%xmm5
756	punpcklqdq	%xmm8,%xmm4
757	movdqa	%xmm14,%xmm9
758	punpcklqdq	%xmm7,%xmm14
759	punpckhqdq	%xmm8,%xmm5
760	punpckhqdq	%xmm7,%xmm9
// Feed-forward for words 12-15 (256-256(%rcx) holds the counters that were
// used for this group of blocks).
761	paddd	256-256(%rcx),%xmm0
762	paddd	272-256(%rcx),%xmm1
763	paddd	288-256(%rcx),%xmm2
764	paddd	304-256(%rcx),%xmm3
765
// Transpose words 12-15: xmm0 = block 0, xmm1 = block 1, xmm8 = block 2,
// xmm3 = block 3.
766	movdqa	%xmm0,%xmm8
767	punpckldq	%xmm1,%xmm0
768	movdqa	%xmm2,%xmm7
769	punpckldq	%xmm3,%xmm2
770	punpckhdq	%xmm1,%xmm8
771	punpckhdq	%xmm3,%xmm7
772	movdqa	%xmm0,%xmm1
773	punpcklqdq	%xmm2,%xmm0
774	movdqa	%xmm8,%xmm3
775	punpcklqdq	%xmm7,%xmm8
776	punpckhqdq	%xmm2,%xmm1
777	punpckhqdq	%xmm7,%xmm3
// Fewer than 256 bytes left: handle whole blocks plus a byte-wise tail.
778	cmpq	$256,%rdx
779	jb	L$tail4x
780
// Full group: XOR and store four blocks (256 bytes). Loads of the next input
// chunk are interleaved with stores of the previous one.
// Block 0 keystream: 0(%rsp), xmm12, xmm4, xmm0.
781	movdqu	0(%rsi),%xmm6
782	movdqu	16(%rsi),%xmm11
783	movdqu	32(%rsi),%xmm2
784	movdqu	48(%rsi),%xmm7
785	pxor	0(%rsp),%xmm6
786	pxor	%xmm12,%xmm11
787	pxor	%xmm4,%xmm2
788	pxor	%xmm0,%xmm7
789
// Block 1 keystream: 16(%rsp), xmm13, xmm5, xmm1.
790	movdqu	%xmm6,0(%rdi)
791	movdqu	64(%rsi),%xmm6
792	movdqu	%xmm11,16(%rdi)
793	movdqu	80(%rsi),%xmm11
794	movdqu	%xmm2,32(%rdi)
795	movdqu	96(%rsi),%xmm2
796	movdqu	%xmm7,48(%rdi)
797	movdqu	112(%rsi),%xmm7
798	leaq	128(%rsi),%rsi
799	pxor	16(%rsp),%xmm6
800	pxor	%xmm13,%xmm11
801	pxor	%xmm5,%xmm2
802	pxor	%xmm1,%xmm7
803
// Block 2 keystream: 32(%rsp), xmm10, xmm14, xmm8.
804	movdqu	%xmm6,64(%rdi)
805	movdqu	0(%rsi),%xmm6
806	movdqu	%xmm11,80(%rdi)
807	movdqu	16(%rsi),%xmm11
808	movdqu	%xmm2,96(%rdi)
809	movdqu	32(%rsi),%xmm2
810	movdqu	%xmm7,112(%rdi)
811	leaq	128(%rdi),%rdi
812	movdqu	48(%rsi),%xmm7
813	pxor	32(%rsp),%xmm6
814	pxor	%xmm10,%xmm11
815	pxor	%xmm14,%xmm2
816	pxor	%xmm8,%xmm7
817
// Block 3 keystream: 48(%rsp), xmm15, xmm9, xmm3.
818	movdqu	%xmm6,0(%rdi)
819	movdqu	64(%rsi),%xmm6
820	movdqu	%xmm11,16(%rdi)
821	movdqu	80(%rsi),%xmm11
822	movdqu	%xmm2,32(%rdi)
823	movdqu	96(%rsi),%xmm2
824	movdqu	%xmm7,48(%rdi)
825	movdqu	112(%rsi),%xmm7
826	leaq	128(%rsi),%rsi
827	pxor	48(%rsp),%xmm6
828	pxor	%xmm15,%xmm11
829	pxor	%xmm9,%xmm2
830	pxor	%xmm3,%xmm7
831	movdqu	%xmm6,64(%rdi)
832	movdqu	%xmm11,80(%rdi)
833	movdqu	%xmm2,96(%rdi)
834	movdqu	%xmm7,112(%rdi)
835	leaq	128(%rdi),%rdi
836
// Loop while bytes remain; an exact multiple of 256 exits here.
837	subq	$256,%rdx
838	jnz	L$oop_outer4x
839
840	jmp	L$done4x
841
// Fewer than 256 bytes remain. Dispatch on how many whole blocks are left.
// The flags of the compare that selects a branch are still intact when the
// selected branch later executes its je (the SSE moves, pxor and leaq in
// between do not modify flags), so that je means "exactly N whole blocks".
842L$tail4x:
843	cmpq	$192,%rdx
844	jae	L$192_or_more4x
845	cmpq	$128,%rdx
846	jae	L$128_or_more4x
847	cmpq	$64,%rdx
848	jae	L$64_or_more4x
849
850
// Less than one block: complete the block 0 keystream at 0..63(%rsp)
// (bytes 0-15 are already there) and go to the byte loop with index 0.
851	xorq	%r10,%r10
852
853	movdqa	%xmm12,16(%rsp)
854	movdqa	%xmm4,32(%rsp)
855	movdqa	%xmm0,48(%rsp)
856	jmp	L$oop_tail4x
857
858.p2align	5
// 64..127 bytes left: emit block 0, then stage block 1 for the byte loop.
859L$64_or_more4x:
860	movdqu	0(%rsi),%xmm6
861	movdqu	16(%rsi),%xmm11
862	movdqu	32(%rsi),%xmm2
863	movdqu	48(%rsi),%xmm7
864	pxor	0(%rsp),%xmm6
865	pxor	%xmm12,%xmm11
866	pxor	%xmm4,%xmm2
867	pxor	%xmm0,%xmm7
868	movdqu	%xmm6,0(%rdi)
869	movdqu	%xmm11,16(%rdi)
870	movdqu	%xmm2,32(%rdi)
871	movdqu	%xmm7,48(%rdi)
// Taken when the count was exactly 64 (flags from the cmpq $64 above).
872	je	L$done4x
873
// Move the block 1 keystream to 0..63(%rsp), step past the 64 bytes done.
874	movdqa	16(%rsp),%xmm6
875	leaq	64(%rsi),%rsi
876	xorq	%r10,%r10
877	movdqa	%xmm6,0(%rsp)
878	movdqa	%xmm13,16(%rsp)
879	leaq	64(%rdi),%rdi
880	movdqa	%xmm5,32(%rsp)
881	subq	$64,%rdx
882	movdqa	%xmm1,48(%rsp)
883	jmp	L$oop_tail4x
884
885.p2align	5
// 128..191 bytes left: emit blocks 0 and 1, then stage block 2.
886L$128_or_more4x:
887	movdqu	0(%rsi),%xmm6
888	movdqu	16(%rsi),%xmm11
889	movdqu	32(%rsi),%xmm2
890	movdqu	48(%rsi),%xmm7
891	pxor	0(%rsp),%xmm6
892	pxor	%xmm12,%xmm11
893	pxor	%xmm4,%xmm2
894	pxor	%xmm0,%xmm7
895
896	movdqu	%xmm6,0(%rdi)
897	movdqu	64(%rsi),%xmm6
898	movdqu	%xmm11,16(%rdi)
899	movdqu	80(%rsi),%xmm11
900	movdqu	%xmm2,32(%rdi)
901	movdqu	96(%rsi),%xmm2
902	movdqu	%xmm7,48(%rdi)
903	movdqu	112(%rsi),%xmm7
904	pxor	16(%rsp),%xmm6
905	pxor	%xmm13,%xmm11
906	pxor	%xmm5,%xmm2
907	pxor	%xmm1,%xmm7
908	movdqu	%xmm6,64(%rdi)
909	movdqu	%xmm11,80(%rdi)
910	movdqu	%xmm2,96(%rdi)
911	movdqu	%xmm7,112(%rdi)
// Taken when the count was exactly 128 (flags from the cmpq $128 above).
912	je	L$done4x
913
// Move the block 2 keystream to 0..63(%rsp), step past the 128 bytes done.
914	movdqa	32(%rsp),%xmm6
915	leaq	128(%rsi),%rsi
916	xorq	%r10,%r10
917	movdqa	%xmm6,0(%rsp)
918	movdqa	%xmm10,16(%rsp)
919	leaq	128(%rdi),%rdi
920	movdqa	%xmm14,32(%rsp)
921	subq	$128,%rdx
922	movdqa	%xmm8,48(%rsp)
923	jmp	L$oop_tail4x
924
925.p2align	5
// 192..255 bytes left: emit blocks 0, 1 and 2, then stage block 3.
926L$192_or_more4x:
927	movdqu	0(%rsi),%xmm6
928	movdqu	16(%rsi),%xmm11
929	movdqu	32(%rsi),%xmm2
930	movdqu	48(%rsi),%xmm7
931	pxor	0(%rsp),%xmm6
932	pxor	%xmm12,%xmm11
933	pxor	%xmm4,%xmm2
934	pxor	%xmm0,%xmm7
935
936	movdqu	%xmm6,0(%rdi)
937	movdqu	64(%rsi),%xmm6
938	movdqu	%xmm11,16(%rdi)
939	movdqu	80(%rsi),%xmm11
940	movdqu	%xmm2,32(%rdi)
941	movdqu	96(%rsi),%xmm2
942	movdqu	%xmm7,48(%rdi)
943	movdqu	112(%rsi),%xmm7
944	leaq	128(%rsi),%rsi
945	pxor	16(%rsp),%xmm6
946	pxor	%xmm13,%xmm11
947	pxor	%xmm5,%xmm2
948	pxor	%xmm1,%xmm7
949
950	movdqu	%xmm6,64(%rdi)
951	movdqu	0(%rsi),%xmm6
952	movdqu	%xmm11,80(%rdi)
953	movdqu	16(%rsi),%xmm11
954	movdqu	%xmm2,96(%rdi)
955	movdqu	32(%rsi),%xmm2
956	movdqu	%xmm7,112(%rdi)
957	leaq	128(%rdi),%rdi
958	movdqu	48(%rsi),%xmm7
959	pxor	32(%rsp),%xmm6
960	pxor	%xmm10,%xmm11
961	pxor	%xmm14,%xmm2
962	pxor	%xmm8,%xmm7
963	movdqu	%xmm6,0(%rdi)
964	movdqu	%xmm11,16(%rdi)
965	movdqu	%xmm2,32(%rdi)
966	movdqu	%xmm7,48(%rdi)
// Taken when the count was exactly 192 (flags from the cmpq $192 above).
967	je	L$done4x
968
// Move the block 3 keystream to 0..63(%rsp). Both pointers were already
// advanced by 128 above, so 64 more steps past the 192 bytes done.
969	movdqa	48(%rsp),%xmm6
970	leaq	64(%rsi),%rsi
971	xorq	%r10,%r10
972	movdqa	%xmm6,0(%rsp)
973	movdqa	%xmm15,16(%rsp)
974	leaq	64(%rdi),%rdi
975	movdqa	%xmm9,32(%rsp)
976	subq	$192,%rdx
977	movdqa	%xmm3,48(%rsp)
978
// Byte loop over the remaining %rdx bytes (1..63), with %r10 as the index:
// out[i] = in[i] ^ keystream[i]. The index is advanced before the store,
// hence the -1 displacement.
979L$oop_tail4x:
980	movzbl	(%rsi,%r10,1),%eax
981	movzbl	(%rsp,%r10,1),%ecx
982	leaq	1(%r10),%r10
983	xorl	%ecx,%eax
984	movb	%al,-1(%rdi,%r10,1)
985	decq	%rdx
986	jnz	L$oop_tail4x
987
// Epilogue: restore the entry stack pointer saved in %r9. The frame contents
// are not cleared before returning.
988L$done4x:
989	leaq	(%r9),%rsp
990
// L$4x_epilogue is not referenced in the visible part of this file.
991L$4x_epilogue:
992	ret
993
994
995.globl	_ChaCha20_ctr32_avx2
996.private_extern _ChaCha20_ctr32_avx2
997
998.p2align	5
999_ChaCha20_ctr32_avx2:
1000
1001_CET_ENDBR
1002	movq	%rsp,%r9
1003
1004	subq	$0x280+8,%rsp
1005	andq	$-32,%rsp
1006	vzeroupper
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017	vbroadcasti128	L$sigma(%rip),%ymm11
1018	vbroadcasti128	(%rcx),%ymm3
1019	vbroadcasti128	16(%rcx),%ymm15
1020	vbroadcasti128	(%r8),%ymm7
1021	leaq	256(%rsp),%rcx
1022	leaq	512(%rsp),%rax
1023	leaq	L$rot16(%rip),%r10
1024	leaq	L$rot24(%rip),%r11
1025
1026	vpshufd	$0x00,%ymm11,%ymm8
1027	vpshufd	$0x55,%ymm11,%ymm9
1028	vmovdqa	%ymm8,128-256(%rcx)
1029	vpshufd	$0xaa,%ymm11,%ymm10
1030	vmovdqa	%ymm9,160-256(%rcx)
1031	vpshufd	$0xff,%ymm11,%ymm11
1032	vmovdqa	%ymm10,192-256(%rcx)
1033	vmovdqa	%ymm11,224-256(%rcx)
1034
1035	vpshufd	$0x00,%ymm3,%ymm0
1036	vpshufd	$0x55,%ymm3,%ymm1
1037	vmovdqa	%ymm0,256-256(%rcx)
1038	vpshufd	$0xaa,%ymm3,%ymm2
1039	vmovdqa	%ymm1,288-256(%rcx)
1040	vpshufd	$0xff,%ymm3,%ymm3
1041	vmovdqa	%ymm2,320-256(%rcx)
1042	vmovdqa	%ymm3,352-256(%rcx)
1043
1044	vpshufd	$0x00,%ymm15,%ymm12
1045	vpshufd	$0x55,%ymm15,%ymm13
1046	vmovdqa	%ymm12,384-512(%rax)
1047	vpshufd	$0xaa,%ymm15,%ymm14
1048	vmovdqa	%ymm13,416-512(%rax)
1049	vpshufd	$0xff,%ymm15,%ymm15
1050	vmovdqa	%ymm14,448-512(%rax)
1051	vmovdqa	%ymm15,480-512(%rax)
1052
1053	vpshufd	$0x00,%ymm7,%ymm4
1054	vpshufd	$0x55,%ymm7,%ymm5
1055	vpaddd	L$incy(%rip),%ymm4,%ymm4
1056	vpshufd	$0xaa,%ymm7,%ymm6
1057	vmovdqa	%ymm5,544-512(%rax)
1058	vpshufd	$0xff,%ymm7,%ymm7
1059	vmovdqa	%ymm6,576-512(%rax)
1060	vmovdqa	%ymm7,608-512(%rax)
1061
1062	jmp	L$oop_enter8x
1063
1064.p2align	5
1065L$oop_outer8x:
1066	vmovdqa	128-256(%rcx),%ymm8
1067	vmovdqa	160-256(%rcx),%ymm9
1068	vmovdqa	192-256(%rcx),%ymm10
1069	vmovdqa	224-256(%rcx),%ymm11
1070	vmovdqa	256-256(%rcx),%ymm0
1071	vmovdqa	288-256(%rcx),%ymm1
1072	vmovdqa	320-256(%rcx),%ymm2
1073	vmovdqa	352-256(%rcx),%ymm3
1074	vmovdqa	384-512(%rax),%ymm12
1075	vmovdqa	416-512(%rax),%ymm13
1076	vmovdqa	448-512(%rax),%ymm14
1077	vmovdqa	480-512(%rax),%ymm15
1078	vmovdqa	512-512(%rax),%ymm4
1079	vmovdqa	544-512(%rax),%ymm5
1080	vmovdqa	576-512(%rax),%ymm6
1081	vmovdqa	608-512(%rax),%ymm7
1082	vpaddd	L$eight(%rip),%ymm4,%ymm4
1083
1084L$oop_enter8x:
1085	vmovdqa	%ymm14,64(%rsp)
1086	vmovdqa	%ymm15,96(%rsp)
1087	vbroadcasti128	(%r10),%ymm15
1088	vmovdqa	%ymm4,512-512(%rax)
1089	movl	$10,%eax
1090	jmp	L$oop8x
1091
1092.p2align	5
1093L$oop8x:
1094	vpaddd	%ymm0,%ymm8,%ymm8
1095	vpxor	%ymm4,%ymm8,%ymm4
1096	vpshufb	%ymm15,%ymm4,%ymm4
1097	vpaddd	%ymm1,%ymm9,%ymm9
1098	vpxor	%ymm5,%ymm9,%ymm5
1099	vpshufb	%ymm15,%ymm5,%ymm5
1100	vpaddd	%ymm4,%ymm12,%ymm12
1101	vpxor	%ymm0,%ymm12,%ymm0
1102	vpslld	$12,%ymm0,%ymm14
1103	vpsrld	$20,%ymm0,%ymm0
1104	vpor	%ymm0,%ymm14,%ymm0
1105	vbroadcasti128	(%r11),%ymm14
1106	vpaddd	%ymm5,%ymm13,%ymm13
1107	vpxor	%ymm1,%ymm13,%ymm1
1108	vpslld	$12,%ymm1,%ymm15
1109	vpsrld	$20,%ymm1,%ymm1
1110	vpor	%ymm1,%ymm15,%ymm1
1111	vpaddd	%ymm0,%ymm8,%ymm8
1112	vpxor	%ymm4,%ymm8,%ymm4
1113	vpshufb	%ymm14,%ymm4,%ymm4
1114	vpaddd	%ymm1,%ymm9,%ymm9
1115	vpxor	%ymm5,%ymm9,%ymm5
1116	vpshufb	%ymm14,%ymm5,%ymm5
1117	vpaddd	%ymm4,%ymm12,%ymm12
1118	vpxor	%ymm0,%ymm12,%ymm0
1119	vpslld	$7,%ymm0,%ymm15
1120	vpsrld	$25,%ymm0,%ymm0
1121	vpor	%ymm0,%ymm15,%ymm0
1122	vbroadcasti128	(%r10),%ymm15
1123	vpaddd	%ymm5,%ymm13,%ymm13
1124	vpxor	%ymm1,%ymm13,%ymm1
1125	vpslld	$7,%ymm1,%ymm14
1126	vpsrld	$25,%ymm1,%ymm1
1127	vpor	%ymm1,%ymm14,%ymm1
1128	vmovdqa	%ymm12,0(%rsp)
1129	vmovdqa	%ymm13,32(%rsp)
1130	vmovdqa	64(%rsp),%ymm12
1131	vmovdqa	96(%rsp),%ymm13
1132	vpaddd	%ymm2,%ymm10,%ymm10
1133	vpxor	%ymm6,%ymm10,%ymm6
1134	vpshufb	%ymm15,%ymm6,%ymm6
1135	vpaddd	%ymm3,%ymm11,%ymm11
1136	vpxor	%ymm7,%ymm11,%ymm7
1137	vpshufb	%ymm15,%ymm7,%ymm7
1138	vpaddd	%ymm6,%ymm12,%ymm12
1139	vpxor	%ymm2,%ymm12,%ymm2
1140	vpslld	$12,%ymm2,%ymm14
1141	vpsrld	$20,%ymm2,%ymm2
1142	vpor	%ymm2,%ymm14,%ymm2
1143	vbroadcasti128	(%r11),%ymm14
1144	vpaddd	%ymm7,%ymm13,%ymm13
1145	vpxor	%ymm3,%ymm13,%ymm3
1146	vpslld	$12,%ymm3,%ymm15
1147	vpsrld	$20,%ymm3,%ymm3
1148	vpor	%ymm3,%ymm15,%ymm3
1149	vpaddd	%ymm2,%ymm10,%ymm10
1150	vpxor	%ymm6,%ymm10,%ymm6
1151	vpshufb	%ymm14,%ymm6,%ymm6
1152	vpaddd	%ymm3,%ymm11,%ymm11
1153	vpxor	%ymm7,%ymm11,%ymm7
1154	vpshufb	%ymm14,%ymm7,%ymm7
1155	vpaddd	%ymm6,%ymm12,%ymm12
1156	vpxor	%ymm2,%ymm12,%ymm2
1157	vpslld	$7,%ymm2,%ymm15
1158	vpsrld	$25,%ymm2,%ymm2
1159	vpor	%ymm2,%ymm15,%ymm2
1160	vbroadcasti128	(%r10),%ymm15
1161	vpaddd	%ymm7,%ymm13,%ymm13
1162	vpxor	%ymm3,%ymm13,%ymm3
1163	vpslld	$7,%ymm3,%ymm14
1164	vpsrld	$25,%ymm3,%ymm3
1165	vpor	%ymm3,%ymm14,%ymm3
1166	vpaddd	%ymm1,%ymm8,%ymm8
1167	vpxor	%ymm7,%ymm8,%ymm7
1168	vpshufb	%ymm15,%ymm7,%ymm7
1169	vpaddd	%ymm2,%ymm9,%ymm9
1170	vpxor	%ymm4,%ymm9,%ymm4
1171	vpshufb	%ymm15,%ymm4,%ymm4
1172	vpaddd	%ymm7,%ymm12,%ymm12
1173	vpxor	%ymm1,%ymm12,%ymm1
1174	vpslld	$12,%ymm1,%ymm14
1175	vpsrld	$20,%ymm1,%ymm1
1176	vpor	%ymm1,%ymm14,%ymm1
1177	vbroadcasti128	(%r11),%ymm14
1178	vpaddd	%ymm4,%ymm13,%ymm13
1179	vpxor	%ymm2,%ymm13,%ymm2
1180	vpslld	$12,%ymm2,%ymm15
1181	vpsrld	$20,%ymm2,%ymm2
1182	vpor	%ymm2,%ymm15,%ymm2
1183	vpaddd	%ymm1,%ymm8,%ymm8
1184	vpxor	%ymm7,%ymm8,%ymm7
1185	vpshufb	%ymm14,%ymm7,%ymm7
1186	vpaddd	%ymm2,%ymm9,%ymm9
1187	vpxor	%ymm4,%ymm9,%ymm4
1188	vpshufb	%ymm14,%ymm4,%ymm4
1189	vpaddd	%ymm7,%ymm12,%ymm12
1190	vpxor	%ymm1,%ymm12,%ymm1
1191	vpslld	$7,%ymm1,%ymm15
1192	vpsrld	$25,%ymm1,%ymm1
1193	vpor	%ymm1,%ymm15,%ymm1
1194	vbroadcasti128	(%r10),%ymm15
1195	vpaddd	%ymm4,%ymm13,%ymm13
1196	vpxor	%ymm2,%ymm13,%ymm2
1197	vpslld	$7,%ymm2,%ymm14
1198	vpsrld	$25,%ymm2,%ymm2
1199	vpor	%ymm2,%ymm14,%ymm2
1200	vmovdqa	%ymm12,64(%rsp)
1201	vmovdqa	%ymm13,96(%rsp)
1202	vmovdqa	0(%rsp),%ymm12
1203	vmovdqa	32(%rsp),%ymm13
1204	vpaddd	%ymm3,%ymm10,%ymm10
1205	vpxor	%ymm5,%ymm10,%ymm5
1206	vpshufb	%ymm15,%ymm5,%ymm5
1207	vpaddd	%ymm0,%ymm11,%ymm11
1208	vpxor	%ymm6,%ymm11,%ymm6
1209	vpshufb	%ymm15,%ymm6,%ymm6
1210	vpaddd	%ymm5,%ymm12,%ymm12
1211	vpxor	%ymm3,%ymm12,%ymm3
1212	vpslld	$12,%ymm3,%ymm14
1213	vpsrld	$20,%ymm3,%ymm3
1214	vpor	%ymm3,%ymm14,%ymm3
1215	vbroadcasti128	(%r11),%ymm14
1216	vpaddd	%ymm6,%ymm13,%ymm13
1217	vpxor	%ymm0,%ymm13,%ymm0
1218	vpslld	$12,%ymm0,%ymm15
1219	vpsrld	$20,%ymm0,%ymm0
1220	vpor	%ymm0,%ymm15,%ymm0
1221	vpaddd	%ymm3,%ymm10,%ymm10
1222	vpxor	%ymm5,%ymm10,%ymm5
1223	vpshufb	%ymm14,%ymm5,%ymm5
1224	vpaddd	%ymm0,%ymm11,%ymm11
1225	vpxor	%ymm6,%ymm11,%ymm6
1226	vpshufb	%ymm14,%ymm6,%ymm6
1227	vpaddd	%ymm5,%ymm12,%ymm12
1228	vpxor	%ymm3,%ymm12,%ymm3
1229	vpslld	$7,%ymm3,%ymm15
1230	vpsrld	$25,%ymm3,%ymm3
1231	vpor	%ymm3,%ymm15,%ymm3
1232	vbroadcasti128	(%r10),%ymm15
1233	vpaddd	%ymm6,%ymm13,%ymm13
1234	vpxor	%ymm0,%ymm13,%ymm0
1235	vpslld	$7,%ymm0,%ymm14
1236	vpsrld	$25,%ymm0,%ymm0
1237	vpor	%ymm0,%ymm14,%ymm0
1238	decl	%eax
1239	jnz	L$oop8x
1240
1241	leaq	512(%rsp),%rax
1242	vpaddd	128-256(%rcx),%ymm8,%ymm8
1243	vpaddd	160-256(%rcx),%ymm9,%ymm9
1244	vpaddd	192-256(%rcx),%ymm10,%ymm10
1245	vpaddd	224-256(%rcx),%ymm11,%ymm11
1246
1247	vpunpckldq	%ymm9,%ymm8,%ymm14
1248	vpunpckldq	%ymm11,%ymm10,%ymm15
1249	vpunpckhdq	%ymm9,%ymm8,%ymm8
1250	vpunpckhdq	%ymm11,%ymm10,%ymm10
1251	vpunpcklqdq	%ymm15,%ymm14,%ymm9
1252	vpunpckhqdq	%ymm15,%ymm14,%ymm14
1253	vpunpcklqdq	%ymm10,%ymm8,%ymm11
1254	vpunpckhqdq	%ymm10,%ymm8,%ymm8
1255	vpaddd	256-256(%rcx),%ymm0,%ymm0
1256	vpaddd	288-256(%rcx),%ymm1,%ymm1
1257	vpaddd	320-256(%rcx),%ymm2,%ymm2
1258	vpaddd	352-256(%rcx),%ymm3,%ymm3
1259
1260	vpunpckldq	%ymm1,%ymm0,%ymm10
1261	vpunpckldq	%ymm3,%ymm2,%ymm15
1262	vpunpckhdq	%ymm1,%ymm0,%ymm0
1263	vpunpckhdq	%ymm3,%ymm2,%ymm2
1264	vpunpcklqdq	%ymm15,%ymm10,%ymm1
1265	vpunpckhqdq	%ymm15,%ymm10,%ymm10
1266	vpunpcklqdq	%ymm2,%ymm0,%ymm3
1267	vpunpckhqdq	%ymm2,%ymm0,%ymm0
1268	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
1269	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
1270	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
1271	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
1272	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
1273	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
1274	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
1275	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
1276	vmovdqa	%ymm15,0(%rsp)
1277	vmovdqa	%ymm9,32(%rsp)
1278	vmovdqa	64(%rsp),%ymm15
1279	vmovdqa	96(%rsp),%ymm9
1280
1281	vpaddd	384-512(%rax),%ymm12,%ymm12
1282	vpaddd	416-512(%rax),%ymm13,%ymm13
1283	vpaddd	448-512(%rax),%ymm15,%ymm15
1284	vpaddd	480-512(%rax),%ymm9,%ymm9
1285
1286	vpunpckldq	%ymm13,%ymm12,%ymm2
1287	vpunpckldq	%ymm9,%ymm15,%ymm8
1288	vpunpckhdq	%ymm13,%ymm12,%ymm12
1289	vpunpckhdq	%ymm9,%ymm15,%ymm15
1290	vpunpcklqdq	%ymm8,%ymm2,%ymm13
1291	vpunpckhqdq	%ymm8,%ymm2,%ymm2
1292	vpunpcklqdq	%ymm15,%ymm12,%ymm9
1293	vpunpckhqdq	%ymm15,%ymm12,%ymm12
1294	vpaddd	512-512(%rax),%ymm4,%ymm4
1295	vpaddd	544-512(%rax),%ymm5,%ymm5
1296	vpaddd	576-512(%rax),%ymm6,%ymm6
1297	vpaddd	608-512(%rax),%ymm7,%ymm7
1298
1299	vpunpckldq	%ymm5,%ymm4,%ymm15
1300	vpunpckldq	%ymm7,%ymm6,%ymm8
1301	vpunpckhdq	%ymm5,%ymm4,%ymm4
1302	vpunpckhdq	%ymm7,%ymm6,%ymm6
1303	vpunpcklqdq	%ymm8,%ymm15,%ymm5
1304	vpunpckhqdq	%ymm8,%ymm15,%ymm15
1305	vpunpcklqdq	%ymm6,%ymm4,%ymm7
1306	vpunpckhqdq	%ymm6,%ymm4,%ymm4
1307	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
1308	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
1309	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
1310	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
1311	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
1312	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
1313	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
1314	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
1315	vmovdqa	0(%rsp),%ymm6
1316	vmovdqa	32(%rsp),%ymm12
1317
// Full-chunk output phase: when at least 512 bytes remain in %rdx, XOR 512
// bytes read from (%rsi) with sixteen ymm registers and store them at (%rdi).
// NOTE(review): the ymm values are presumably the keystream computed earlier in
// this routine, outside this view -- confirm against the generator script.
// Unsigned compare: fewer than 512 bytes left is handled by L$tail8x.
1318	cmpq	$512,%rdx
1319	jb	L$tail8x
1320
// Bytes 0..127 come from ymm6, ymm8, ymm1, ymm5; both pointers then advance
// by 128 so the same 0/32/64/96 displacements are reused by each group.
1321	vpxor	0(%rsi),%ymm6,%ymm6
1322	vpxor	32(%rsi),%ymm8,%ymm8
1323	vpxor	64(%rsi),%ymm1,%ymm1
1324	vpxor	96(%rsi),%ymm5,%ymm5
1325	leaq	128(%rsi),%rsi
1326	vmovdqu	%ymm6,0(%rdi)
1327	vmovdqu	%ymm8,32(%rdi)
1328	vmovdqu	%ymm1,64(%rdi)
1329	vmovdqu	%ymm5,96(%rdi)
1330	leaq	128(%rdi),%rdi
1331
// Bytes 128..255 come from ymm12, ymm13, ymm10, ymm15.
1332	vpxor	0(%rsi),%ymm12,%ymm12
1333	vpxor	32(%rsi),%ymm13,%ymm13
1334	vpxor	64(%rsi),%ymm10,%ymm10
1335	vpxor	96(%rsi),%ymm15,%ymm15
1336	leaq	128(%rsi),%rsi
1337	vmovdqu	%ymm12,0(%rdi)
1338	vmovdqu	%ymm13,32(%rdi)
1339	vmovdqu	%ymm10,64(%rdi)
1340	vmovdqu	%ymm15,96(%rdi)
1341	leaq	128(%rdi),%rdi
1342
// Bytes 256..383 come from ymm14, ymm2, ymm3, ymm7.
1343	vpxor	0(%rsi),%ymm14,%ymm14
1344	vpxor	32(%rsi),%ymm2,%ymm2
1345	vpxor	64(%rsi),%ymm3,%ymm3
1346	vpxor	96(%rsi),%ymm7,%ymm7
1347	leaq	128(%rsi),%rsi
1348	vmovdqu	%ymm14,0(%rdi)
1349	vmovdqu	%ymm2,32(%rdi)
1350	vmovdqu	%ymm3,64(%rdi)
1351	vmovdqu	%ymm7,96(%rdi)
1352	leaq	128(%rdi),%rdi
1353
// Bytes 384..511 come from ymm11, ymm9, ymm0, ymm4.
1354	vpxor	0(%rsi),%ymm11,%ymm11
1355	vpxor	32(%rsi),%ymm9,%ymm9
1356	vpxor	64(%rsi),%ymm0,%ymm0
1357	vpxor	96(%rsi),%ymm4,%ymm4
1358	leaq	128(%rsi),%rsi
1359	vmovdqu	%ymm11,0(%rdi)
1360	vmovdqu	%ymm9,32(%rdi)
1361	vmovdqu	%ymm0,64(%rdi)
1362	vmovdqu	%ymm4,96(%rdi)
1363	leaq	128(%rdi),%rdi
1364
// Account for the 512 bytes just written. A non-zero remainder starts another
// iteration at L$oop_outer8x; an exact multiple of 512 goes straight to the exit.
1365	subq	$512,%rdx
1366	jnz	L$oop_outer8x
1367
1368	jmp	L$done8x
1369
// Partial-chunk dispatch. In this view it is reached only from the `jb` that
// follows `cmpq $512,%rdx`, i.e. with fewer than 512 bytes left in %rdx.
// The ladder picks the largest multiple of 64 that is <= %rdx and branches to
// the handler that XORs that many bytes using whole ymm registers.
// Each handler begins with vpxor/vmovdqu (which do not modify flags) followed
// by `je`, so the flags set by the cmpq that selected it must reach the handler
// unchanged: ZF set there means %rdx is exactly that multiple of 64.
1370L$tail8x:
1371	cmpq	$448,%rdx
1372	jae	L$448_or_more8x
1373	cmpq	$384,%rdx
1374	jae	L$384_or_more8x
1375	cmpq	$320,%rdx
1376	jae	L$320_or_more8x
1377	cmpq	$256,%rdx
1378	jae	L$256_or_more8x
1379	cmpq	$192,%rdx
1380	jae	L$192_or_more8x
1381	cmpq	$128,%rdx
1382	jae	L$128_or_more8x
1383	cmpq	$64,%rdx
1384	jae	L$64_or_more8x
1385
// Fewer than 64 bytes: nothing can be done a whole register at a time.
// Zero the byte index %r10 and spill ymm6/ymm8 (the 64 bytes that pair with
// offset 0 of the input) to 0..63(%rsp) for the byte-wise loop.
// vmovdqa needs a 32-byte-aligned address; %rsp is presumably aligned by the
// prologue, which is outside this view -- confirm.
1386	xorq	%r10,%r10
1387	vmovdqa	%ymm6,0(%rsp)
1388	vmovdqa	%ymm8,32(%rsp)
1389	jmp	L$oop_tail8x
1390
// Tail handler entered from L$tail8x when 64 <= %rdx < 128.
// XORs the first 64 input bytes with ymm6/ymm8 and stores them to the output.
1391.p2align	5
1392L$64_or_more8x:
1393	vpxor	0(%rsi),%ymm6,%ymm6
1394	vpxor	32(%rsi),%ymm8,%ymm8
1395	vmovdqu	%ymm6,0(%rdi)
1396	vmovdqu	%ymm8,32(%rdi)
// ZF still comes from `cmpq $64,%rdx` in L$tail8x (the AVX instructions above
// leave flags untouched): exactly 64 bytes means there is nothing left to do.
1397	je	L$done8x
1398
// Otherwise step both pointers past the 64 bytes handled, reset the byte index
// %r10, reduce %rdx to the leftover count, and spill the registers for the next
// 64 bytes (ymm1, ymm5) to 0..63(%rsp) for the byte-wise loop.
1399	leaq	64(%rsi),%rsi
1400	xorq	%r10,%r10
1401	vmovdqa	%ymm1,0(%rsp)
1402	leaq	64(%rdi),%rdi
1403	subq	$64,%rdx
1404	vmovdqa	%ymm5,32(%rsp)
1405	jmp	L$oop_tail8x
1406
// Tail handler entered from L$tail8x when 128 <= %rdx < 192.
// XORs the first 128 input bytes with ymm6, ymm8, ymm1, ymm5 and stores them.
1407.p2align	5
1408L$128_or_more8x:
1409	vpxor	0(%rsi),%ymm6,%ymm6
1410	vpxor	32(%rsi),%ymm8,%ymm8
1411	vpxor	64(%rsi),%ymm1,%ymm1
1412	vpxor	96(%rsi),%ymm5,%ymm5
1413	vmovdqu	%ymm6,0(%rdi)
1414	vmovdqu	%ymm8,32(%rdi)
1415	vmovdqu	%ymm1,64(%rdi)
1416	vmovdqu	%ymm5,96(%rdi)
// ZF still comes from `cmpq $128,%rdx` in L$tail8x (the AVX instructions above
// leave flags untouched): exactly 128 bytes means there is nothing left to do.
1417	je	L$done8x
1418
// Otherwise skip the 128 bytes handled, reset the byte index %r10, reduce %rdx
// to the leftover count, and spill the registers for the next 64 bytes
// (ymm12, ymm13) to 0..63(%rsp) for the byte-wise loop.
1419	leaq	128(%rsi),%rsi
1420	xorq	%r10,%r10
1421	vmovdqa	%ymm12,0(%rsp)
1422	leaq	128(%rdi),%rdi
1423	subq	$128,%rdx
1424	vmovdqa	%ymm13,32(%rsp)
1425	jmp	L$oop_tail8x
1426
// Tail handler entered from L$tail8x when 192 <= %rdx < 256.
// XORs the first 192 input bytes with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13
// and stores them to the output.
1427.p2align	5
1428L$192_or_more8x:
1429	vpxor	0(%rsi),%ymm6,%ymm6
1430	vpxor	32(%rsi),%ymm8,%ymm8
1431	vpxor	64(%rsi),%ymm1,%ymm1
1432	vpxor	96(%rsi),%ymm5,%ymm5
1433	vpxor	128(%rsi),%ymm12,%ymm12
1434	vpxor	160(%rsi),%ymm13,%ymm13
1435	vmovdqu	%ymm6,0(%rdi)
1436	vmovdqu	%ymm8,32(%rdi)
1437	vmovdqu	%ymm1,64(%rdi)
1438	vmovdqu	%ymm5,96(%rdi)
1439	vmovdqu	%ymm12,128(%rdi)
1440	vmovdqu	%ymm13,160(%rdi)
// ZF still comes from `cmpq $192,%rdx` in L$tail8x (the AVX instructions above
// leave flags untouched): exactly 192 bytes means there is nothing left to do.
1441	je	L$done8x
1442
// Otherwise skip the 192 bytes handled, reset the byte index %r10, reduce %rdx
// to the leftover count, and spill the registers for the next 64 bytes
// (ymm10, ymm15) to 0..63(%rsp) for the byte-wise loop.
1443	leaq	192(%rsi),%rsi
1444	xorq	%r10,%r10
1445	vmovdqa	%ymm10,0(%rsp)
1446	leaq	192(%rdi),%rdi
1447	subq	$192,%rdx
1448	vmovdqa	%ymm15,32(%rsp)
1449	jmp	L$oop_tail8x
1450
// Tail handler entered from L$tail8x when 256 <= %rdx < 320.
// XORs the first 256 input bytes with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13,
// ymm10, ymm15 and stores them to the output.
1451.p2align	5
1452L$256_or_more8x:
1453	vpxor	0(%rsi),%ymm6,%ymm6
1454	vpxor	32(%rsi),%ymm8,%ymm8
1455	vpxor	64(%rsi),%ymm1,%ymm1
1456	vpxor	96(%rsi),%ymm5,%ymm5
1457	vpxor	128(%rsi),%ymm12,%ymm12
1458	vpxor	160(%rsi),%ymm13,%ymm13
1459	vpxor	192(%rsi),%ymm10,%ymm10
1460	vpxor	224(%rsi),%ymm15,%ymm15
1461	vmovdqu	%ymm6,0(%rdi)
1462	vmovdqu	%ymm8,32(%rdi)
1463	vmovdqu	%ymm1,64(%rdi)
1464	vmovdqu	%ymm5,96(%rdi)
1465	vmovdqu	%ymm12,128(%rdi)
1466	vmovdqu	%ymm13,160(%rdi)
1467	vmovdqu	%ymm10,192(%rdi)
1468	vmovdqu	%ymm15,224(%rdi)
// ZF still comes from `cmpq $256,%rdx` in L$tail8x (the AVX instructions above
// leave flags untouched): exactly 256 bytes means there is nothing left to do.
1469	je	L$done8x
1470
// Otherwise skip the 256 bytes handled, reset the byte index %r10, reduce %rdx
// to the leftover count, and spill the registers for the next 64 bytes
// (ymm14, ymm2) to 0..63(%rsp) for the byte-wise loop.
1471	leaq	256(%rsi),%rsi
1472	xorq	%r10,%r10
1473	vmovdqa	%ymm14,0(%rsp)
1474	leaq	256(%rdi),%rdi
1475	subq	$256,%rdx
1476	vmovdqa	%ymm2,32(%rsp)
1477	jmp	L$oop_tail8x
1478
// Tail handler entered from L$tail8x when 320 <= %rdx < 384.
// XORs the first 320 input bytes with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13,
// ymm10, ymm15, ymm14, ymm2 and stores them to the output.
1479.p2align	5
1480L$320_or_more8x:
1481	vpxor	0(%rsi),%ymm6,%ymm6
1482	vpxor	32(%rsi),%ymm8,%ymm8
1483	vpxor	64(%rsi),%ymm1,%ymm1
1484	vpxor	96(%rsi),%ymm5,%ymm5
1485	vpxor	128(%rsi),%ymm12,%ymm12
1486	vpxor	160(%rsi),%ymm13,%ymm13
1487	vpxor	192(%rsi),%ymm10,%ymm10
1488	vpxor	224(%rsi),%ymm15,%ymm15
1489	vpxor	256(%rsi),%ymm14,%ymm14
1490	vpxor	288(%rsi),%ymm2,%ymm2
1491	vmovdqu	%ymm6,0(%rdi)
1492	vmovdqu	%ymm8,32(%rdi)
1493	vmovdqu	%ymm1,64(%rdi)
1494	vmovdqu	%ymm5,96(%rdi)
1495	vmovdqu	%ymm12,128(%rdi)
1496	vmovdqu	%ymm13,160(%rdi)
1497	vmovdqu	%ymm10,192(%rdi)
1498	vmovdqu	%ymm15,224(%rdi)
1499	vmovdqu	%ymm14,256(%rdi)
1500	vmovdqu	%ymm2,288(%rdi)
// ZF still comes from `cmpq $320,%rdx` in L$tail8x (the AVX instructions above
// leave flags untouched): exactly 320 bytes means there is nothing left to do.
1501	je	L$done8x
1502
// Otherwise skip the 320 bytes handled, reset the byte index %r10, reduce %rdx
// to the leftover count, and spill the registers for the next 64 bytes
// (ymm3, ymm7) to 0..63(%rsp) for the byte-wise loop.
1503	leaq	320(%rsi),%rsi
1504	xorq	%r10,%r10
1505	vmovdqa	%ymm3,0(%rsp)
1506	leaq	320(%rdi),%rdi
1507	subq	$320,%rdx
1508	vmovdqa	%ymm7,32(%rsp)
1509	jmp	L$oop_tail8x
1510
// Tail handler entered from L$tail8x when 384 <= %rdx < 448.
// XORs the first 384 input bytes with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13,
// ymm10, ymm15, ymm14, ymm2, ymm3, ymm7 and stores them to the output.
1511.p2align	5
1512L$384_or_more8x:
1513	vpxor	0(%rsi),%ymm6,%ymm6
1514	vpxor	32(%rsi),%ymm8,%ymm8
1515	vpxor	64(%rsi),%ymm1,%ymm1
1516	vpxor	96(%rsi),%ymm5,%ymm5
1517	vpxor	128(%rsi),%ymm12,%ymm12
1518	vpxor	160(%rsi),%ymm13,%ymm13
1519	vpxor	192(%rsi),%ymm10,%ymm10
1520	vpxor	224(%rsi),%ymm15,%ymm15
1521	vpxor	256(%rsi),%ymm14,%ymm14
1522	vpxor	288(%rsi),%ymm2,%ymm2
1523	vpxor	320(%rsi),%ymm3,%ymm3
1524	vpxor	352(%rsi),%ymm7,%ymm7
1525	vmovdqu	%ymm6,0(%rdi)
1526	vmovdqu	%ymm8,32(%rdi)
1527	vmovdqu	%ymm1,64(%rdi)
1528	vmovdqu	%ymm5,96(%rdi)
1529	vmovdqu	%ymm12,128(%rdi)
1530	vmovdqu	%ymm13,160(%rdi)
1531	vmovdqu	%ymm10,192(%rdi)
1532	vmovdqu	%ymm15,224(%rdi)
1533	vmovdqu	%ymm14,256(%rdi)
1534	vmovdqu	%ymm2,288(%rdi)
1535	vmovdqu	%ymm3,320(%rdi)
1536	vmovdqu	%ymm7,352(%rdi)
// ZF still comes from `cmpq $384,%rdx` in L$tail8x (the AVX instructions above
// leave flags untouched): exactly 384 bytes means there is nothing left to do.
1537	je	L$done8x
1538
// Otherwise skip the 384 bytes handled, reset the byte index %r10, reduce %rdx
// to the leftover count, and spill the registers for the next 64 bytes
// (ymm11, ymm9) to 0..63(%rsp) for the byte-wise loop.
1539	leaq	384(%rsi),%rsi
1540	xorq	%r10,%r10
1541	vmovdqa	%ymm11,0(%rsp)
1542	leaq	384(%rdi),%rdi
1543	subq	$384,%rdx
1544	vmovdqa	%ymm9,32(%rsp)
1545	jmp	L$oop_tail8x
1546
// Tail handler entered from L$tail8x when 448 <= %rdx < 512.
// XORs the first 448 input bytes with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13,
// ymm10, ymm15, ymm14, ymm2, ymm3, ymm7, ymm11, ymm9 and stores them.
1547.p2align	5
1548L$448_or_more8x:
1549	vpxor	0(%rsi),%ymm6,%ymm6
1550	vpxor	32(%rsi),%ymm8,%ymm8
1551	vpxor	64(%rsi),%ymm1,%ymm1
1552	vpxor	96(%rsi),%ymm5,%ymm5
1553	vpxor	128(%rsi),%ymm12,%ymm12
1554	vpxor	160(%rsi),%ymm13,%ymm13
1555	vpxor	192(%rsi),%ymm10,%ymm10
1556	vpxor	224(%rsi),%ymm15,%ymm15
1557	vpxor	256(%rsi),%ymm14,%ymm14
1558	vpxor	288(%rsi),%ymm2,%ymm2
1559	vpxor	320(%rsi),%ymm3,%ymm3
1560	vpxor	352(%rsi),%ymm7,%ymm7
1561	vpxor	384(%rsi),%ymm11,%ymm11
1562	vpxor	416(%rsi),%ymm9,%ymm9
1563	vmovdqu	%ymm6,0(%rdi)
1564	vmovdqu	%ymm8,32(%rdi)
1565	vmovdqu	%ymm1,64(%rdi)
1566	vmovdqu	%ymm5,96(%rdi)
1567	vmovdqu	%ymm12,128(%rdi)
1568	vmovdqu	%ymm13,160(%rdi)
1569	vmovdqu	%ymm10,192(%rdi)
1570	vmovdqu	%ymm15,224(%rdi)
1571	vmovdqu	%ymm14,256(%rdi)
1572	vmovdqu	%ymm2,288(%rdi)
1573	vmovdqu	%ymm3,320(%rdi)
1574	vmovdqu	%ymm7,352(%rdi)
1575	vmovdqu	%ymm11,384(%rdi)
1576	vmovdqu	%ymm9,416(%rdi)
// ZF still comes from `cmpq $448,%rdx` in L$tail8x (the AVX instructions above
// leave flags untouched): exactly 448 bytes means there is nothing left to do.
1577	je	L$done8x
1578
// Otherwise skip the 448 bytes handled, reset the byte index %r10, reduce %rdx
// to the leftover count, and spill the registers for the last 64 bytes
// (ymm0, ymm4) to 0..63(%rsp). There is no jump here: execution falls through
// into the byte-wise loop that follows this handler.
1579	leaq	448(%rsi),%rsi
1580	xorq	%r10,%r10
1581	vmovdqa	%ymm0,0(%rsp)
1582	leaq	448(%rdi),%rdi
1583	subq	$448,%rdx
1584	vmovdqa	%ymm4,32(%rsp)
1585
// Byte-wise tail: for each index %r10 starting at its entry value, store
// (%rsi)[%r10] XOR (%rsp)[%r10] to (%rdi)[%r10], repeating %rdx times.
// The paths in this view that reach the loop zero %r10 and spill 64 bytes to
// 0..63(%rsp) first. %rdx must be non-zero on entry, because the count is
// decremented before it is tested.
1586L$oop_tail8x:
1587	movzbl	(%rsi,%r10,1),%eax
1588	movzbl	(%rsp,%r10,1),%ecx
// %r10 is bumped before the store (leaq leaves flags alone), hence the -1
// displacement on the destination address below.
1589	leaq	1(%r10),%r10
1590	xorl	%ecx,%eax
1591	movb	%al,-1(%rdi,%r10,1)
1592	decq	%rdx
1593	jnz	L$oop_tail8x
1594
// Common exit for the 8x path: vzeroall clears every ymm register, then %rsp is
// reloaded from %r9 before returning.
// NOTE(review): %r9 presumably holds the stack pointer saved by the prologue,
// which is outside this view -- confirm. The 64-byte spill area at (%rsp) used
// by the tail handlers is not overwritten in the code visible here.
1595L$done8x:
1596	vzeroall
1597	leaq	(%r9),%rsp
1598
1599L$8x_epilogue:
1600	ret
1601
1602
1603#endif
1604