1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <ring-core/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
7.text
8
9
10
// Read-only constant tables used by the ChaCha20 code paths in this file.
// The tables start on a 64-byte boundary, and every table that the code
// below loads with an aligned 16-byte access (L$one, L$inc, L$four,
// L$rot16, L$rot24, L$sigma) begins at a multiple of 16 bytes from it.
11.section	__DATA,__const
12.p2align	6
// Four zero dwords. Not referenced in the visible part of this file.
13L$zero:
14.long	0,0,0,0
// Per-block increment: adds 1 to the lowest dword of the last state row
// (used by the scalar and one-block SSSE3 paths).
15L$one:
16.long	1,0,0,0
// Lane offsets 0..3 added to the broadcast low dword of the last row so
// that each of the four lanes in the 4x path works on a consecutive block.
17L$inc:
18.long	0,1,2,3
// Added to the per-lane block numbers on every outer iteration of the
// 4x path (four blocks are consumed per iteration).
19L$four:
20.long	4,4,4,4
// Eight-lane counterpart of L$inc, used by the 8x path.
21L$incy:
22.long	0,2,4,6,1,3,5,7
// Eight-lane counterpart of L$four, used by the 8x path.
23L$eight:
24.long	8,8,8,8,8,8,8,8
// pshufb mask: within each dword, swaps the two 16-bit halves, i.e. a
// rotate by 16 bits.
25L$rot16:
26.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
// pshufb mask: within each dword, moves byte i to byte i+1 (mod 4), i.e.
// a rotate left by 8 bits (equivalently right by 24). It replaces the
// "roll $8" step of the scalar path.
27L$rot24:
28.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
// ASCII "expand 32-byte k" followed by a NUL: the first state row
// (words 0..3). The scalar path uses the same 16 bytes as four
// little-endian immediates.
29L$sigma:
30.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
31.p2align	6
// The next four tables are not referenced in the visible part of this
// file. NOTE(review): presumably left over from a 16-lane path in the
// generating script; confirm before removing.
32L$zeroz:
33.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
34L$fourz:
35.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
36L$incz:
37.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
38L$sixteen:
39.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
// ASCII banner: "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated.
40.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Switch back to the code section for the functions that follow.
41.text
// _ChaCha20_ctr32: entry point and scalar (general-purpose register) path.
//
// Register use on entry, as read by the code below:
//   %rdi  output pointer (keystream XOR input is stored here)
//   %rsi  input pointer
//   %rdx  length in bytes; zero returns immediately
//   %rcx  pointer to 32 bytes loaded as state words 4..11
//   %r8   pointer to 16 bytes loaded as state words 12..15; the lowest
//         dword is incremented by one after every 64-byte block
//
// Dispatch: when bit 9 (mask 512) of the dword at _OPENSSL_ia32cap_P+4 is
// set, control is transferred to L$ChaCha20_ssse3 before anything is
// pushed. Otherwise the scalar path saves rbx, rbp and r12-r15, restores
// them before returning, and uses an 88-byte frame:
//   0..63(%rsp)   16.. = words 4..7, 32.. = words 8..11, 48.. = words 12..15;
//                 the whole 64 bytes hold the key-stream block in L$tail
//   64(%rsp)      saved remaining length
//   72(%rsp)      saved input pointer
//   80(%rsp)      saved output pointer
// Six pushes plus the 88-byte adjustment leave %rsp 16-byte aligned (given
// the usual rsp%16==8 at function entry), which the movdqa stores rely on.
42.globl	_ChaCha20_ctr32
43.private_extern _ChaCha20_ctr32
44
45.p2align	6
46_ChaCha20_ctr32:
47
48_CET_ENDBR
// Nothing to do for a zero length.
49	cmpq	$0,%rdx
50	je	L$no_data
// Loads 8 bytes of the capability vector; the upper half of %r10 is
// examined later by the 4x path.
51	movq	_OPENSSL_ia32cap_P+4(%rip),%r10
52	testl	$512,%r10d
53	jnz	L$ChaCha20_ssse3
54
55	pushq	%rbx
56
57	pushq	%rbp
58
59	pushq	%r12
60
61	pushq	%r13
62
63	pushq	%r14
64
65	pushq	%r15
66
67	subq	$64+24,%rsp
68
69L$ctr32_body:
70
71
// xmm1 = words 4..7, xmm2 = words 8..11, xmm3 = words 12..15,
// xmm4 = {1,0,0,0} used to bump the block counter.
72	movdqu	(%rcx),%xmm1
73	movdqu	16(%rcx),%xmm2
74	movdqu	(%r8),%xmm3
75	movdqa	L$one(%rip),%xmm4
76
77
78	movdqa	%xmm1,16(%rsp)
79	movdqa	%xmm2,32(%rsp)
80	movdqa	%xmm3,48(%rsp)
// %rbp carries the remaining byte count between blocks.
81	movq	%rdx,%rbp
82	jmp	L$oop_outer
83
84.p2align	5
// One iteration per 64-byte block. Working state:
//   eax,ebx,ecx,edx = words 0..3 (the "expand 32-byte k" constants)
//   r8d..r11d       = words 4..7
//   r12d..r15d      = words 12..15 (word 12 taken from the live counter in xmm3)
//   esi,edi         = two of words 8..11; the other two live at 32..47(%rsp)
85L$oop_outer:
86	movl	$0x61707865,%eax
87	movl	$0x3320646e,%ebx
88	movl	$0x79622d32,%ecx
89	movl	$0x6b206574,%edx
90	movl	16(%rsp),%r8d
91	movl	20(%rsp),%r9d
92	movl	24(%rsp),%r10d
93	movl	28(%rsp),%r11d
94	movd	%xmm3,%r12d
95	movl	52(%rsp),%r13d
96	movl	56(%rsp),%r14d
97	movl	60(%rsp),%r15d
98
// Spill length and both pointers so rbp, rsi and rdi can be reused inside
// the round loop; ebp becomes the loop counter (10 iterations of the
// eight quarter-rounds below).
99	movq	%rbp,64+0(%rsp)
100	movl	$10,%ebp
101	movq	%rsi,64+8(%rsp)
// The bytes below encode "movq %xmm2,%rsi": rsi = words 8 and 9 packed.
102.byte	102,72,15,126,214
103	movq	%rdi,64+16(%rsp)
// Split the pair: esi = word 8, edi = word 9.
104	movq	%rsi,%rdi
105	shrq	$32,%rdi
106	jmp	L$oop
107
108.p2align	5
109L$oop:
// Quarter-rounds on columns (0,4,8,12) and (1,5,9,13), interleaved.
// Each is add/xor/rotate with rotate counts 16, 12, 8, 7.
110	addl	%r8d,%eax
111	xorl	%eax,%r12d
112	roll	$16,%r12d
113	addl	%r9d,%ebx
114	xorl	%ebx,%r13d
115	roll	$16,%r13d
116	addl	%r12d,%esi
117	xorl	%esi,%r8d
118	roll	$12,%r8d
119	addl	%r13d,%edi
120	xorl	%edi,%r9d
121	roll	$12,%r9d
122	addl	%r8d,%eax
123	xorl	%eax,%r12d
124	roll	$8,%r12d
125	addl	%r9d,%ebx
126	xorl	%ebx,%r13d
127	roll	$8,%r13d
128	addl	%r12d,%esi
129	xorl	%esi,%r8d
130	roll	$7,%r8d
131	addl	%r13d,%edi
132	xorl	%edi,%r9d
133	roll	$7,%r9d
// Park words 8,9 and bring words 10,11 into esi,edi.
134	movl	%esi,32(%rsp)
135	movl	%edi,36(%rsp)
136	movl	40(%rsp),%esi
137	movl	44(%rsp),%edi
// Quarter-rounds on columns (2,6,10,14) and (3,7,11,15).
138	addl	%r10d,%ecx
139	xorl	%ecx,%r14d
140	roll	$16,%r14d
141	addl	%r11d,%edx
142	xorl	%edx,%r15d
143	roll	$16,%r15d
144	addl	%r14d,%esi
145	xorl	%esi,%r10d
146	roll	$12,%r10d
147	addl	%r15d,%edi
148	xorl	%edi,%r11d
149	roll	$12,%r11d
150	addl	%r10d,%ecx
151	xorl	%ecx,%r14d
152	roll	$8,%r14d
153	addl	%r11d,%edx
154	xorl	%edx,%r15d
155	roll	$8,%r15d
156	addl	%r14d,%esi
157	xorl	%esi,%r10d
158	roll	$7,%r10d
159	addl	%r15d,%edi
160	xorl	%edi,%r11d
161	roll	$7,%r11d
// Quarter-rounds on diagonals (0,5,10,15) and (1,6,11,12);
// esi,edi still hold words 10,11 here.
162	addl	%r9d,%eax
163	xorl	%eax,%r15d
164	roll	$16,%r15d
165	addl	%r10d,%ebx
166	xorl	%ebx,%r12d
167	roll	$16,%r12d
168	addl	%r15d,%esi
169	xorl	%esi,%r9d
170	roll	$12,%r9d
171	addl	%r12d,%edi
172	xorl	%edi,%r10d
173	roll	$12,%r10d
174	addl	%r9d,%eax
175	xorl	%eax,%r15d
176	roll	$8,%r15d
177	addl	%r10d,%ebx
178	xorl	%ebx,%r12d
179	roll	$8,%r12d
180	addl	%r15d,%esi
181	xorl	%esi,%r9d
182	roll	$7,%r9d
183	addl	%r12d,%edi
184	xorl	%edi,%r10d
185	roll	$7,%r10d
// Park words 10,11 and bring words 8,9 back into esi,edi.
186	movl	%esi,40(%rsp)
187	movl	%edi,44(%rsp)
188	movl	32(%rsp),%esi
189	movl	36(%rsp),%edi
// Quarter-rounds on diagonals (2,7,8,13) and (3,4,9,14).
190	addl	%r11d,%ecx
191	xorl	%ecx,%r13d
192	roll	$16,%r13d
193	addl	%r8d,%edx
194	xorl	%edx,%r14d
195	roll	$16,%r14d
196	addl	%r13d,%esi
197	xorl	%esi,%r11d
198	roll	$12,%r11d
199	addl	%r14d,%edi
200	xorl	%edi,%r8d
201	roll	$12,%r8d
202	addl	%r11d,%ecx
203	xorl	%ecx,%r13d
204	roll	$8,%r13d
205	addl	%r8d,%edx
206	xorl	%edx,%r14d
207	roll	$8,%r14d
208	addl	%r13d,%esi
209	xorl	%esi,%r11d
210	roll	$7,%r11d
211	addl	%r14d,%edi
212	xorl	%edi,%r8d
213	roll	$7,%r8d
214	decl	%ebp
215	jnz	L$oop
// Flush words 8,9 so 32..47(%rsp) holds the final words 8..11, then
// restore length and pointers. xmm1 is reloaded with the initial words
// 8..11 (xmm2) and xmm3 is advanced to the next block's counter.
216	movl	%edi,36(%rsp)
217	movl	%esi,32(%rsp)
218	movq	64(%rsp),%rbp
219	movdqa	%xmm2,%xmm1
220	movq	64+8(%rsp),%rsi
221	paddd	%xmm4,%xmm3
222	movq	64+16(%rsp),%rdi
223
// Feed-forward: add the initial state to the permuted state. 48(%rsp)
// still holds this block's counter because it is only updated further
// down, after the input has been consumed.
224	addl	$0x61707865,%eax
225	addl	$0x3320646e,%ebx
226	addl	$0x79622d32,%ecx
227	addl	$0x6b206574,%edx
228	addl	16(%rsp),%r8d
229	addl	20(%rsp),%r9d
230	addl	24(%rsp),%r10d
231	addl	28(%rsp),%r11d
232	addl	48(%rsp),%r12d
233	addl	52(%rsp),%r13d
234	addl	56(%rsp),%r14d
235	addl	60(%rsp),%r15d
// xmm1 = initial words 8..11 + final words 8..11.
236	paddd	32(%rsp),%xmm1
237
// Fewer than 64 bytes left: finish byte by byte.
238	cmpq	$64,%rbp
239	jb	L$tail
240
// Full block: XOR 64 input bytes with the key stream.
241	xorl	0(%rsi),%eax
242	xorl	4(%rsi),%ebx
243	xorl	8(%rsi),%ecx
244	xorl	12(%rsi),%edx
245	xorl	16(%rsi),%r8d
246	xorl	20(%rsi),%r9d
247	xorl	24(%rsi),%r10d
248	xorl	28(%rsi),%r11d
249	movdqu	32(%rsi),%xmm0
250	xorl	48(%rsi),%r12d
251	xorl	52(%rsi),%r13d
252	xorl	56(%rsi),%r14d
253	xorl	60(%rsi),%r15d
254	leaq	64(%rsi),%rsi
255	pxor	%xmm1,%xmm0
256
// Restore the initial words 8..11 in the frame (the round loop used that
// slot as scratch) and store the incremented counter dword.
257	movdqa	%xmm2,32(%rsp)
258	movd	%xmm3,48(%rsp)
259
260	movl	%eax,0(%rdi)
261	movl	%ebx,4(%rdi)
262	movl	%ecx,8(%rdi)
263	movl	%edx,12(%rdi)
264	movl	%r8d,16(%rdi)
265	movl	%r9d,20(%rdi)
266	movl	%r10d,24(%rdi)
267	movl	%r11d,28(%rdi)
268	movdqu	%xmm0,32(%rdi)
269	movl	%r12d,48(%rdi)
270	movl	%r13d,52(%rdi)
271	movl	%r14d,56(%rdi)
272	movl	%r15d,60(%rdi)
273	leaq	64(%rdi),%rdi
274
275	subq	$64,%rbp
276	jnz	L$oop_outer
277
278	jmp	L$done
279
280.p2align	4
// Partial final block: write the 64-byte key-stream block to 0..63(%rsp)
// and XOR the remaining rbp (1..63) bytes one at a time. rbx is zeroed
// part-way through the stores and becomes the byte index.
281L$tail:
282	movl	%eax,0(%rsp)
283	movl	%ebx,4(%rsp)
284	xorq	%rbx,%rbx
285	movl	%ecx,8(%rsp)
286	movl	%edx,12(%rsp)
287	movl	%r8d,16(%rsp)
288	movl	%r9d,20(%rsp)
289	movl	%r10d,24(%rsp)
290	movl	%r11d,28(%rsp)
291	movdqa	%xmm1,32(%rsp)
292	movl	%r12d,48(%rsp)
293	movl	%r13d,52(%rsp)
294	movl	%r14d,56(%rsp)
295	movl	%r15d,60(%rsp)
296
297L$oop_tail:
298	movzbl	(%rsi,%rbx,1),%eax
299	movzbl	(%rsp,%rbx,1),%edx
300	leaq	1(%rbx),%rbx
301	xorl	%edx,%eax
// rbx was already incremented, hence the -1 displacement.
302	movb	%al,-1(%rdi,%rbx,1)
303	decq	%rbp
304	jnz	L$oop_tail
305
// Epilogue: rsi = address just above the six saved registers; reload them
// in reverse push order and drop the frame.
306L$done:
307	leaq	64+24+48(%rsp),%rsi
308	movq	-48(%rsi),%r15
309
310	movq	-40(%rsi),%r14
311
312	movq	-32(%rsi),%r13
313
314	movq	-24(%rsi),%r12
315
316	movq	-16(%rsi),%rbp
317
318	movq	-8(%rsi),%rbx
319
320	leaq	(%rsi),%rsp
321
// Also the target of the zero-length early exit, which pushed nothing.
322L$no_data:
323	ret
324
325
326
// ChaCha20_ssse3: one 64-byte block per iteration using xmm registers.
//
// Reached by a jump from _ChaCha20_ctr32 (same register arguments:
// rdi = out, rsi = in, rdx = length, rcx -> 32 bytes for words 4..11,
// r8 -> 16 bytes for words 12..15), and re-entered at L$do_sse3_after_all
// from the 4x path. Lengths above 128 bytes are handed to L$ChaCha20_4x.
//
// r9 keeps the entry %rsp so the epilogue can restore it. The 72-byte
// frame leaves %rsp 16-byte aligned (given rsp%16==8 at entry) and holds
// the four initial state rows at 0, 16, 32 and 48(%rsp). No callee-saved
// register is modified on this path.
//
// Working registers: xmm0..xmm3 = state rows 0..3, xmm4/xmm5 = scratch,
// xmm6 = L$rot16 mask, xmm7 = L$rot24 mask, r8 = round-loop counter.
327.p2align	5
328ChaCha20_ssse3:
329L$ChaCha20_ssse3:
330
331	movq	%rsp,%r9
332
333	cmpq	$128,%rdx
334	ja	L$ChaCha20_4x
335
// The 4x path jumps here (with r9 already set) when it decides the
// one-block code should handle the input after all.
336L$do_sse3_after_all:
337	subq	$64+8,%rsp
338	movdqa	L$sigma(%rip),%xmm0
339	movdqu	(%rcx),%xmm1
340	movdqu	16(%rcx),%xmm2
341	movdqu	(%r8),%xmm3
342	movdqa	L$rot16(%rip),%xmm6
343	movdqa	L$rot24(%rip),%xmm7
344
// Keep a copy of the initial rows for the feed-forward addition.
345	movdqa	%xmm0,0(%rsp)
346	movdqa	%xmm1,16(%rsp)
347	movdqa	%xmm2,32(%rsp)
348	movdqa	%xmm3,48(%rsp)
349	movq	$10,%r8
350	jmp	L$oop_ssse3
351
352.p2align	5
// Start of every block after the first: reload rows 0..2 and build row 3
// as the saved row 3 plus {1,0,0,0}, writing the incremented row back.
353L$oop_outer_ssse3:
354	movdqa	L$one(%rip),%xmm3
355	movdqa	0(%rsp),%xmm0
356	movdqa	16(%rsp),%xmm1
357	movdqa	32(%rsp),%xmm2
358	paddd	48(%rsp),%xmm3
359	movq	$10,%r8
360	movdqa	%xmm3,48(%rsp)
361	jmp	L$oop_ssse3
362
363.p2align	5
// Ten iterations, each a column round followed by a diagonal round. All
// four quarter-rounds of a round run in parallel, one per dword lane.
// Rotates by 16 and 8 use pshufb; rotates by 12 and 7 use shift/shift/or.
364L$oop_ssse3:
365	paddd	%xmm1,%xmm0
366	pxor	%xmm0,%xmm3
// Encodes "pshufb %xmm6,%xmm3" (rotate each dword by 16).
367.byte	102,15,56,0,222
368	paddd	%xmm3,%xmm2
369	pxor	%xmm2,%xmm1
370	movdqa	%xmm1,%xmm4
371	psrld	$20,%xmm1
372	pslld	$12,%xmm4
373	por	%xmm4,%xmm1
374	paddd	%xmm1,%xmm0
375	pxor	%xmm0,%xmm3
// Encodes "pshufb %xmm7,%xmm3" (rotate each dword left by 8).
376.byte	102,15,56,0,223
377	paddd	%xmm3,%xmm2
378	pxor	%xmm2,%xmm1
379	movdqa	%xmm1,%xmm4
380	psrld	$25,%xmm1
381	pslld	$7,%xmm4
382	por	%xmm4,%xmm1
// Rotate the lanes of rows 1..3 by 1, 2 and 3 positions so the diagonals
// line up as columns for the second round.
383	pshufd	$78,%xmm2,%xmm2
384	pshufd	$57,%xmm1,%xmm1
385	pshufd	$147,%xmm3,%xmm3
386	nop
387	paddd	%xmm1,%xmm0
388	pxor	%xmm0,%xmm3
// Encodes "pshufb %xmm6,%xmm3".
389.byte	102,15,56,0,222
390	paddd	%xmm3,%xmm2
391	pxor	%xmm2,%xmm1
392	movdqa	%xmm1,%xmm4
393	psrld	$20,%xmm1
394	pslld	$12,%xmm4
395	por	%xmm4,%xmm1
396	paddd	%xmm1,%xmm0
397	pxor	%xmm0,%xmm3
// Encodes "pshufb %xmm7,%xmm3".
398.byte	102,15,56,0,223
399	paddd	%xmm3,%xmm2
400	pxor	%xmm2,%xmm1
401	movdqa	%xmm1,%xmm4
402	psrld	$25,%xmm1
403	pslld	$7,%xmm4
404	por	%xmm4,%xmm1
// Undo the lane rotation (rows 1 and 3 use the opposite immediates).
405	pshufd	$78,%xmm2,%xmm2
406	pshufd	$147,%xmm1,%xmm1
407	pshufd	$57,%xmm3,%xmm3
408	decq	%r8
409	jnz	L$oop_ssse3
// Feed-forward: add the initial rows.
410	paddd	0(%rsp),%xmm0
411	paddd	16(%rsp),%xmm1
412	paddd	32(%rsp),%xmm2
413	paddd	48(%rsp),%xmm3
414
415	cmpq	$64,%rdx
416	jb	L$tail_ssse3
417
// Full block: out = in XOR key stream, 64 bytes.
418	movdqu	0(%rsi),%xmm4
419	movdqu	16(%rsi),%xmm5
420	pxor	%xmm4,%xmm0
421	movdqu	32(%rsi),%xmm4
422	pxor	%xmm5,%xmm1
423	movdqu	48(%rsi),%xmm5
424	leaq	64(%rsi),%rsi
425	pxor	%xmm4,%xmm2
426	pxor	%xmm5,%xmm3
427
428	movdqu	%xmm0,0(%rdi)
429	movdqu	%xmm1,16(%rdi)
430	movdqu	%xmm2,32(%rdi)
431	movdqu	%xmm3,48(%rdi)
432	leaq	64(%rdi),%rdi
433
434	subq	$64,%rdx
435	jnz	L$oop_outer_ssse3
436
437	jmp	L$done_ssse3
438
439.p2align	4
// Partial final block: spill the key-stream block over the saved state
// (no longer needed) and XOR the remaining rdx bytes one at a time,
// with r8 as the byte index.
440L$tail_ssse3:
441	movdqa	%xmm0,0(%rsp)
442	movdqa	%xmm1,16(%rsp)
443	movdqa	%xmm2,32(%rsp)
444	movdqa	%xmm3,48(%rsp)
445	xorq	%r8,%r8
446
447L$oop_tail_ssse3:
448	movzbl	(%rsi,%r8,1),%eax
449	movzbl	(%rsp,%r8,1),%ecx
450	leaq	1(%r8),%r8
451	xorl	%ecx,%eax
// r8 was already incremented, hence the -1 displacement.
452	movb	%al,-1(%rdi,%r8,1)
453	decq	%rdx
454	jnz	L$oop_tail_ssse3
455
// Restore the stack pointer saved at entry.
456L$done_ssse3:
457	leaq	(%r9),%rsp
458
459L$ssse3_epilogue:
460	ret
461
462
463
// ChaCha20_4x: four 64-byte blocks per iteration using xmm registers.
//
// Reached by a jump from the SSSE3 path when the length exceeds 128 bytes.
// Register arguments are unchanged (rdi = out, rsi = in, rdx = length,
// rcx -> words 4..11, r8 -> words 12..15) and r10 still holds the 8 bytes
// loaded from _OPENSSL_ia32cap_P+4 by _ChaCha20_ctr32.
//
// Each xmm register holds ONE state word for four consecutive blocks (one
// block per dword lane):
//   xmm8..xmm11  = words 0..3      xmm12..xmm15 = words 4..7
//   xmm0..xmm3   = words 12..15    xmm4,xmm5    = two of words 8..11
// The other two of words 8..11 are kept at 0..31 or 32..63(%rsp) and
// swapped in halfway through each round. xmm6/xmm7 alternate between
// shift scratch and the rot24/rot16 pshufb masks, which is why the masks
// are reloaded from (%r11)/(%r10) inside the loop.
//
// Frame (0x148 bytes, 16-byte aligned given rsp%16==8 at entry), with
// rcx = rsp+256 used as a second base register:
//   0..63     scratch for words 8..11, later transposed output / tail buffer
//   64..127   words 0..3 broadcast
//   128..191  words 4..7 broadcast
//   192..255  words 8..11 broadcast
//   256..319  words 12..15 broadcast (word 12 carries the per-lane block number)
// r9 keeps the entry %rsp. No callee-saved register is modified.
464.p2align	5
465ChaCha20_4x:
466L$ChaCha20_4x:
467
468	movq	%rsp,%r9
469
// r11 keeps the full capability qword; r10 is reduced to its upper dword.
// Bit 5 of that upper dword selects the 8x path.
470	movq	%r10,%r11
471	shrq	$32,%r10
472	testq	$32,%r10
473	jnz	L$ChaCha20_8x
474	cmpq	$192,%rdx
475	ja	L$proceed4x
476
// For lengths of at most 192 bytes: if, of bits 22 and 26 of the lower
// capability dword (mask 0x4400000), only bit 22 is set, fall back to the
// one-block SSSE3 code.
// NOTE(review): presumably a CPU-model heuristic where 4x is slower for
// short inputs; confirm against the generating Perl script.
477	andq	$71303168,%r11
478	cmpq	$4194304,%r11
479	je	L$do_sse3_after_all
480
481L$proceed4x:
482	subq	$0x140+8,%rsp
483	movdqa	L$sigma(%rip),%xmm11
484	movdqu	(%rcx),%xmm15
485	movdqu	16(%rcx),%xmm7
486	movdqu	(%r8),%xmm3
// From here on rcx no longer points at the caller's data: it is rsp+256,
// and r10/r11 point at the two pshufb masks.
487	leaq	256(%rsp),%rcx
488	leaq	L$rot16(%rip),%r10
489	leaq	L$rot24(%rip),%r11
490
// Broadcast every state word across four lanes and save the broadcasts
// for the feed-forward addition and for later iterations.
491	pshufd	$0x00,%xmm11,%xmm8
492	pshufd	$0x55,%xmm11,%xmm9
493	movdqa	%xmm8,64(%rsp)
494	pshufd	$0xaa,%xmm11,%xmm10
495	movdqa	%xmm9,80(%rsp)
496	pshufd	$0xff,%xmm11,%xmm11
497	movdqa	%xmm10,96(%rsp)
498	movdqa	%xmm11,112(%rsp)
499
500	pshufd	$0x00,%xmm15,%xmm12
501	pshufd	$0x55,%xmm15,%xmm13
502	movdqa	%xmm12,128-256(%rcx)
503	pshufd	$0xaa,%xmm15,%xmm14
504	movdqa	%xmm13,144-256(%rcx)
505	pshufd	$0xff,%xmm15,%xmm15
506	movdqa	%xmm14,160-256(%rcx)
507	movdqa	%xmm15,176-256(%rcx)
508
509	pshufd	$0x00,%xmm7,%xmm4
510	pshufd	$0x55,%xmm7,%xmm5
511	movdqa	%xmm4,192-256(%rcx)
512	pshufd	$0xaa,%xmm7,%xmm6
513	movdqa	%xmm5,208-256(%rcx)
514	pshufd	$0xff,%xmm7,%xmm7
515	movdqa	%xmm6,224-256(%rcx)
516	movdqa	%xmm7,240-256(%rcx)
517
// Word 12 gets lane offsets 0,1,2,3 so the four lanes compute four
// consecutive blocks; it is stored at L$oop_enter4x.
518	pshufd	$0x00,%xmm3,%xmm0
519	pshufd	$0x55,%xmm3,%xmm1
520	paddd	L$inc(%rip),%xmm0
521	pshufd	$0xaa,%xmm3,%xmm2
522	movdqa	%xmm1,272-256(%rcx)
523	pshufd	$0xff,%xmm3,%xmm3
524	movdqa	%xmm2,288-256(%rcx)
525	movdqa	%xmm3,304-256(%rcx)
526
527	jmp	L$oop_enter4x
528
529.p2align	5
// Start of every group of four blocks after the first: reload all sixteen
// broadcast words and advance the per-lane block numbers by 4.
530L$oop_outer4x:
531	movdqa	64(%rsp),%xmm8
532	movdqa	80(%rsp),%xmm9
533	movdqa	96(%rsp),%xmm10
534	movdqa	112(%rsp),%xmm11
535	movdqa	128-256(%rcx),%xmm12
536	movdqa	144-256(%rcx),%xmm13
537	movdqa	160-256(%rcx),%xmm14
538	movdqa	176-256(%rcx),%xmm15
539	movdqa	192-256(%rcx),%xmm4
540	movdqa	208-256(%rcx),%xmm5
541	movdqa	224-256(%rcx),%xmm6
542	movdqa	240-256(%rcx),%xmm7
543	movdqa	256-256(%rcx),%xmm0
544	movdqa	272-256(%rcx),%xmm1
545	movdqa	288-256(%rcx),%xmm2
546	movdqa	304-256(%rcx),%xmm3
547	paddd	L$four(%rip),%xmm0
548
// Park words 10,11 in the scratch area, load the rot16 mask into xmm7,
// record the current per-lane block numbers, and run 10 iterations (eax).
549L$oop_enter4x:
550	movdqa	%xmm6,32(%rsp)
551	movdqa	%xmm7,48(%rsp)
552	movdqa	(%r10),%xmm7
553	movl	$10,%eax
554	movdqa	%xmm0,256-256(%rcx)
555	jmp	L$oop4x
556
557.p2align	5
558L$oop4x:
// Quarter-rounds on columns (0,4,8,12) and (1,5,9,13), interleaved.
559	paddd	%xmm12,%xmm8
560	paddd	%xmm13,%xmm9
561	pxor	%xmm8,%xmm0
562	pxor	%xmm9,%xmm1
// Encode "pshufb %xmm7,%xmm0" and "pshufb %xmm7,%xmm1" (rotate by 16).
563.byte	102,15,56,0,199
564.byte	102,15,56,0,207
565	paddd	%xmm0,%xmm4
566	paddd	%xmm1,%xmm5
567	pxor	%xmm4,%xmm12
568	pxor	%xmm5,%xmm13
569	movdqa	%xmm12,%xmm6
570	pslld	$12,%xmm12
571	psrld	$20,%xmm6
572	movdqa	%xmm13,%xmm7
573	pslld	$12,%xmm13
574	por	%xmm6,%xmm12
575	psrld	$20,%xmm7
// xmm6 = rot24 mask.
576	movdqa	(%r11),%xmm6
577	por	%xmm7,%xmm13
578	paddd	%xmm12,%xmm8
579	paddd	%xmm13,%xmm9
580	pxor	%xmm8,%xmm0
581	pxor	%xmm9,%xmm1
// Encode "pshufb %xmm6,%xmm0" and "pshufb %xmm6,%xmm1" (rotate left by 8).
582.byte	102,15,56,0,198
583.byte	102,15,56,0,206
584	paddd	%xmm0,%xmm4
585	paddd	%xmm1,%xmm5
586	pxor	%xmm4,%xmm12
587	pxor	%xmm5,%xmm13
588	movdqa	%xmm12,%xmm7
589	pslld	$7,%xmm12
590	psrld	$25,%xmm7
591	movdqa	%xmm13,%xmm6
592	pslld	$7,%xmm13
593	por	%xmm7,%xmm12
594	psrld	$25,%xmm6
// xmm7 = rot16 mask again.
595	movdqa	(%r10),%xmm7
596	por	%xmm6,%xmm13
// Park words 8,9 and bring in words 10,11.
597	movdqa	%xmm4,0(%rsp)
598	movdqa	%xmm5,16(%rsp)
599	movdqa	32(%rsp),%xmm4
600	movdqa	48(%rsp),%xmm5
// Quarter-rounds on columns (2,6,10,14) and (3,7,11,15).
601	paddd	%xmm14,%xmm10
602	paddd	%xmm15,%xmm11
603	pxor	%xmm10,%xmm2
604	pxor	%xmm11,%xmm3
// Encode "pshufb %xmm7,%xmm2" and "pshufb %xmm7,%xmm3".
605.byte	102,15,56,0,215
606.byte	102,15,56,0,223
607	paddd	%xmm2,%xmm4
608	paddd	%xmm3,%xmm5
609	pxor	%xmm4,%xmm14
610	pxor	%xmm5,%xmm15
611	movdqa	%xmm14,%xmm6
612	pslld	$12,%xmm14
613	psrld	$20,%xmm6
614	movdqa	%xmm15,%xmm7
615	pslld	$12,%xmm15
616	por	%xmm6,%xmm14
617	psrld	$20,%xmm7
618	movdqa	(%r11),%xmm6
619	por	%xmm7,%xmm15
620	paddd	%xmm14,%xmm10
621	paddd	%xmm15,%xmm11
622	pxor	%xmm10,%xmm2
623	pxor	%xmm11,%xmm3
// Encode "pshufb %xmm6,%xmm2" and "pshufb %xmm6,%xmm3".
624.byte	102,15,56,0,214
625.byte	102,15,56,0,222
626	paddd	%xmm2,%xmm4
627	paddd	%xmm3,%xmm5
628	pxor	%xmm4,%xmm14
629	pxor	%xmm5,%xmm15
630	movdqa	%xmm14,%xmm7
631	pslld	$7,%xmm14
632	psrld	$25,%xmm7
633	movdqa	%xmm15,%xmm6
634	pslld	$7,%xmm15
635	por	%xmm7,%xmm14
636	psrld	$25,%xmm6
637	movdqa	(%r10),%xmm7
638	por	%xmm6,%xmm15
// Quarter-rounds on diagonals (0,5,10,15) and (1,6,11,12);
// xmm4,xmm5 still hold words 10,11.
639	paddd	%xmm13,%xmm8
640	paddd	%xmm14,%xmm9
641	pxor	%xmm8,%xmm3
642	pxor	%xmm9,%xmm0
// Encode "pshufb %xmm7,%xmm3" and "pshufb %xmm7,%xmm0".
643.byte	102,15,56,0,223
644.byte	102,15,56,0,199
645	paddd	%xmm3,%xmm4
646	paddd	%xmm0,%xmm5
647	pxor	%xmm4,%xmm13
648	pxor	%xmm5,%xmm14
649	movdqa	%xmm13,%xmm6
650	pslld	$12,%xmm13
651	psrld	$20,%xmm6
652	movdqa	%xmm14,%xmm7
653	pslld	$12,%xmm14
654	por	%xmm6,%xmm13
655	psrld	$20,%xmm7
656	movdqa	(%r11),%xmm6
657	por	%xmm7,%xmm14
658	paddd	%xmm13,%xmm8
659	paddd	%xmm14,%xmm9
660	pxor	%xmm8,%xmm3
661	pxor	%xmm9,%xmm0
// Encode "pshufb %xmm6,%xmm3" and "pshufb %xmm6,%xmm0".
662.byte	102,15,56,0,222
663.byte	102,15,56,0,198
664	paddd	%xmm3,%xmm4
665	paddd	%xmm0,%xmm5
666	pxor	%xmm4,%xmm13
667	pxor	%xmm5,%xmm14
668	movdqa	%xmm13,%xmm7
669	pslld	$7,%xmm13
670	psrld	$25,%xmm7
671	movdqa	%xmm14,%xmm6
672	pslld	$7,%xmm14
673	por	%xmm7,%xmm13
674	psrld	$25,%xmm6
675	movdqa	(%r10),%xmm7
676	por	%xmm6,%xmm14
// Park words 10,11 and bring words 8,9 back.
677	movdqa	%xmm4,32(%rsp)
678	movdqa	%xmm5,48(%rsp)
679	movdqa	0(%rsp),%xmm4
680	movdqa	16(%rsp),%xmm5
// Quarter-rounds on diagonals (2,7,8,13) and (3,4,9,14).
681	paddd	%xmm15,%xmm10
682	paddd	%xmm12,%xmm11
683	pxor	%xmm10,%xmm1
684	pxor	%xmm11,%xmm2
// Encode "pshufb %xmm7,%xmm1" and "pshufb %xmm7,%xmm2".
685.byte	102,15,56,0,207
686.byte	102,15,56,0,215
687	paddd	%xmm1,%xmm4
688	paddd	%xmm2,%xmm5
689	pxor	%xmm4,%xmm15
690	pxor	%xmm5,%xmm12
691	movdqa	%xmm15,%xmm6
692	pslld	$12,%xmm15
693	psrld	$20,%xmm6
694	movdqa	%xmm12,%xmm7
695	pslld	$12,%xmm12
696	por	%xmm6,%xmm15
697	psrld	$20,%xmm7
698	movdqa	(%r11),%xmm6
699	por	%xmm7,%xmm12
700	paddd	%xmm15,%xmm10
701	paddd	%xmm12,%xmm11
702	pxor	%xmm10,%xmm1
703	pxor	%xmm11,%xmm2
// Encode "pshufb %xmm6,%xmm1" and "pshufb %xmm6,%xmm2".
704.byte	102,15,56,0,206
705.byte	102,15,56,0,214
706	paddd	%xmm1,%xmm4
707	paddd	%xmm2,%xmm5
708	pxor	%xmm4,%xmm15
709	pxor	%xmm5,%xmm12
710	movdqa	%xmm15,%xmm7
711	pslld	$7,%xmm15
712	psrld	$25,%xmm7
713	movdqa	%xmm12,%xmm6
714	pslld	$7,%xmm12
715	por	%xmm7,%xmm15
716	psrld	$25,%xmm6
717	movdqa	(%r10),%xmm7
718	por	%xmm6,%xmm12
719	decl	%eax
720	jnz	L$oop4x
721
// Feed-forward and transpose, four words at a time: after adding the
// saved initial words, each group of four registers (one word x four
// blocks) is turned into four registers holding four consecutive words
// of one block via the punpck{l,h}dq / punpck{l,h}qdq pairs.
722	paddd	64(%rsp),%xmm8
723	paddd	80(%rsp),%xmm9
724	paddd	96(%rsp),%xmm10
725	paddd	112(%rsp),%xmm11
726
727	movdqa	%xmm8,%xmm6
728	punpckldq	%xmm9,%xmm8
729	movdqa	%xmm10,%xmm7
730	punpckldq	%xmm11,%xmm10
731	punpckhdq	%xmm9,%xmm6
732	punpckhdq	%xmm11,%xmm7
733	movdqa	%xmm8,%xmm9
734	punpcklqdq	%xmm10,%xmm8
735	movdqa	%xmm6,%xmm11
736	punpcklqdq	%xmm7,%xmm6
737	punpckhqdq	%xmm10,%xmm9
738	punpckhqdq	%xmm7,%xmm11
739	paddd	128-256(%rcx),%xmm12
740	paddd	144-256(%rcx),%xmm13
741	paddd	160-256(%rcx),%xmm14
742	paddd	176-256(%rcx),%xmm15
743
// Bytes 0..15 of blocks 0 and 1 go to the scratch area to free xmm8/xmm9,
// which then receive words 10,11 from their parking slots.
744	movdqa	%xmm8,0(%rsp)
745	movdqa	%xmm9,16(%rsp)
746	movdqa	32(%rsp),%xmm8
747	movdqa	48(%rsp),%xmm9
748
749	movdqa	%xmm12,%xmm10
750	punpckldq	%xmm13,%xmm12
751	movdqa	%xmm14,%xmm7
752	punpckldq	%xmm15,%xmm14
753	punpckhdq	%xmm13,%xmm10
754	punpckhdq	%xmm15,%xmm7
755	movdqa	%xmm12,%xmm13
756	punpcklqdq	%xmm14,%xmm12
757	movdqa	%xmm10,%xmm15
758	punpcklqdq	%xmm7,%xmm10
759	punpckhqdq	%xmm14,%xmm13
760	punpckhqdq	%xmm7,%xmm15
761	paddd	192-256(%rcx),%xmm4
762	paddd	208-256(%rcx),%xmm5
763	paddd	224-256(%rcx),%xmm8
764	paddd	240-256(%rcx),%xmm9
765
// Bytes 0..15 of blocks 2 and 3 go to the scratch area as well.
766	movdqa	%xmm6,32(%rsp)
767	movdqa	%xmm11,48(%rsp)
768
769	movdqa	%xmm4,%xmm14
770	punpckldq	%xmm5,%xmm4
771	movdqa	%xmm8,%xmm7
772	punpckldq	%xmm9,%xmm8
773	punpckhdq	%xmm5,%xmm14
774	punpckhdq	%xmm9,%xmm7
775	movdqa	%xmm4,%xmm5
776	punpcklqdq	%xmm8,%xmm4
777	movdqa	%xmm14,%xmm9
778	punpcklqdq	%xmm7,%xmm14
779	punpckhqdq	%xmm8,%xmm5
780	punpckhqdq	%xmm7,%xmm9
781	paddd	256-256(%rcx),%xmm0
782	paddd	272-256(%rcx),%xmm1
783	paddd	288-256(%rcx),%xmm2
784	paddd	304-256(%rcx),%xmm3
785
786	movdqa	%xmm0,%xmm8
787	punpckldq	%xmm1,%xmm0
788	movdqa	%xmm2,%xmm7
789	punpckldq	%xmm3,%xmm2
790	punpckhdq	%xmm1,%xmm8
791	punpckhdq	%xmm3,%xmm7
792	movdqa	%xmm0,%xmm1
793	punpcklqdq	%xmm2,%xmm0
794	movdqa	%xmm8,%xmm3
795	punpcklqdq	%xmm7,%xmm8
796	punpckhqdq	%xmm2,%xmm1
797	punpckhqdq	%xmm7,%xmm3
// Key stream is now laid out as (16 bytes each, in output order):
//   block 0: 0(%rsp),  xmm12, xmm4,  xmm0
//   block 1: 16(%rsp), xmm13, xmm5,  xmm1
//   block 2: 32(%rsp), xmm10, xmm14, xmm8
//   block 3: 48(%rsp), xmm15, xmm9,  xmm3
798	cmpq	$256,%rdx
799	jb	L$tail4x
800
// At least 256 bytes left: XOR and store all four blocks.
801	movdqu	0(%rsi),%xmm6
802	movdqu	16(%rsi),%xmm11
803	movdqu	32(%rsi),%xmm2
804	movdqu	48(%rsi),%xmm7
805	pxor	0(%rsp),%xmm6
806	pxor	%xmm12,%xmm11
807	pxor	%xmm4,%xmm2
808	pxor	%xmm0,%xmm7
809
810	movdqu	%xmm6,0(%rdi)
811	movdqu	64(%rsi),%xmm6
812	movdqu	%xmm11,16(%rdi)
813	movdqu	80(%rsi),%xmm11
814	movdqu	%xmm2,32(%rdi)
815	movdqu	96(%rsi),%xmm2
816	movdqu	%xmm7,48(%rdi)
817	movdqu	112(%rsi),%xmm7
818	leaq	128(%rsi),%rsi
819	pxor	16(%rsp),%xmm6
820	pxor	%xmm13,%xmm11
821	pxor	%xmm5,%xmm2
822	pxor	%xmm1,%xmm7
823
824	movdqu	%xmm6,64(%rdi)
825	movdqu	0(%rsi),%xmm6
826	movdqu	%xmm11,80(%rdi)
827	movdqu	16(%rsi),%xmm11
828	movdqu	%xmm2,96(%rdi)
829	movdqu	32(%rsi),%xmm2
830	movdqu	%xmm7,112(%rdi)
831	leaq	128(%rdi),%rdi
832	movdqu	48(%rsi),%xmm7
833	pxor	32(%rsp),%xmm6
834	pxor	%xmm10,%xmm11
835	pxor	%xmm14,%xmm2
836	pxor	%xmm8,%xmm7
837
838	movdqu	%xmm6,0(%rdi)
839	movdqu	64(%rsi),%xmm6
840	movdqu	%xmm11,16(%rdi)
841	movdqu	80(%rsi),%xmm11
842	movdqu	%xmm2,32(%rdi)
843	movdqu	96(%rsi),%xmm2
844	movdqu	%xmm7,48(%rdi)
845	movdqu	112(%rsi),%xmm7
846	leaq	128(%rsi),%rsi
847	pxor	48(%rsp),%xmm6
848	pxor	%xmm15,%xmm11
849	pxor	%xmm9,%xmm2
850	pxor	%xmm3,%xmm7
851	movdqu	%xmm6,64(%rdi)
852	movdqu	%xmm11,80(%rdi)
853	movdqu	%xmm2,96(%rdi)
854	movdqu	%xmm7,112(%rdi)
855	leaq	128(%rdi),%rdi
856
857	subq	$256,%rdx
858	jnz	L$oop_outer4x
859
860	jmp	L$done4x
861
// Fewer than 256 bytes left. Emit as many whole 64-byte blocks as fit,
// then place the next unused key-stream block at 0..63(%rsp) for the
// byte-wise loop.
862L$tail4x:
863	cmpq	$192,%rdx
864	jae	L$192_or_more4x
865	cmpq	$128,%rdx
866	jae	L$128_or_more4x
867	cmpq	$64,%rdx
868	jae	L$64_or_more4x
869
870
// Under 64 bytes: block 0 is the tail block; its first 16 bytes are
// already at 0(%rsp).
871	xorq	%r10,%r10
872
873	movdqa	%xmm12,16(%rsp)
874	movdqa	%xmm4,32(%rsp)
875	movdqa	%xmm0,48(%rsp)
876	jmp	L$oop_tail4x
877
878.p2align	5
879L$64_or_more4x:
880	movdqu	0(%rsi),%xmm6
881	movdqu	16(%rsi),%xmm11
882	movdqu	32(%rsi),%xmm2
883	movdqu	48(%rsi),%xmm7
884	pxor	0(%rsp),%xmm6
885	pxor	%xmm12,%xmm11
886	pxor	%xmm4,%xmm2
887	pxor	%xmm0,%xmm7
888	movdqu	%xmm6,0(%rdi)
889	movdqu	%xmm11,16(%rdi)
890	movdqu	%xmm2,32(%rdi)
891	movdqu	%xmm7,48(%rdi)
// Flags are still those of "cmpq $64,%rdx" in L$tail4x (the SSE moves and
// xors in between do not write EFLAGS): exactly 64 bytes means done.
892	je	L$done4x
893
// Stage block 1 as the tail block and account for the 64 bytes written.
894	movdqa	16(%rsp),%xmm6
895	leaq	64(%rsi),%rsi
896	xorq	%r10,%r10
897	movdqa	%xmm6,0(%rsp)
898	movdqa	%xmm13,16(%rsp)
899	leaq	64(%rdi),%rdi
900	movdqa	%xmm5,32(%rsp)
901	subq	$64,%rdx
902	movdqa	%xmm1,48(%rsp)
903	jmp	L$oop_tail4x
904
905.p2align	5
906L$128_or_more4x:
907	movdqu	0(%rsi),%xmm6
908	movdqu	16(%rsi),%xmm11
909	movdqu	32(%rsi),%xmm2
910	movdqu	48(%rsi),%xmm7
911	pxor	0(%rsp),%xmm6
912	pxor	%xmm12,%xmm11
913	pxor	%xmm4,%xmm2
914	pxor	%xmm0,%xmm7
915
916	movdqu	%xmm6,0(%rdi)
917	movdqu	64(%rsi),%xmm6
918	movdqu	%xmm11,16(%rdi)
919	movdqu	80(%rsi),%xmm11
920	movdqu	%xmm2,32(%rdi)
921	movdqu	96(%rsi),%xmm2
922	movdqu	%xmm7,48(%rdi)
923	movdqu	112(%rsi),%xmm7
924	pxor	16(%rsp),%xmm6
925	pxor	%xmm13,%xmm11
926	pxor	%xmm5,%xmm2
927	pxor	%xmm1,%xmm7
928	movdqu	%xmm6,64(%rdi)
929	movdqu	%xmm11,80(%rdi)
930	movdqu	%xmm2,96(%rdi)
931	movdqu	%xmm7,112(%rdi)
// Flags are still those of "cmpq $128,%rdx" in L$tail4x.
932	je	L$done4x
933
// Stage block 2 as the tail block.
934	movdqa	32(%rsp),%xmm6
935	leaq	128(%rsi),%rsi
936	xorq	%r10,%r10
937	movdqa	%xmm6,0(%rsp)
938	movdqa	%xmm10,16(%rsp)
939	leaq	128(%rdi),%rdi
940	movdqa	%xmm14,32(%rsp)
941	subq	$128,%rdx
942	movdqa	%xmm8,48(%rsp)
943	jmp	L$oop_tail4x
944
945.p2align	5
946L$192_or_more4x:
947	movdqu	0(%rsi),%xmm6
948	movdqu	16(%rsi),%xmm11
949	movdqu	32(%rsi),%xmm2
950	movdqu	48(%rsi),%xmm7
951	pxor	0(%rsp),%xmm6
952	pxor	%xmm12,%xmm11
953	pxor	%xmm4,%xmm2
954	pxor	%xmm0,%xmm7
955
956	movdqu	%xmm6,0(%rdi)
957	movdqu	64(%rsi),%xmm6
958	movdqu	%xmm11,16(%rdi)
959	movdqu	80(%rsi),%xmm11
960	movdqu	%xmm2,32(%rdi)
961	movdqu	96(%rsi),%xmm2
962	movdqu	%xmm7,48(%rdi)
963	movdqu	112(%rsi),%xmm7
964	leaq	128(%rsi),%rsi
965	pxor	16(%rsp),%xmm6
966	pxor	%xmm13,%xmm11
967	pxor	%xmm5,%xmm2
968	pxor	%xmm1,%xmm7
969
970	movdqu	%xmm6,64(%rdi)
971	movdqu	0(%rsi),%xmm6
972	movdqu	%xmm11,80(%rdi)
973	movdqu	16(%rsi),%xmm11
974	movdqu	%xmm2,96(%rdi)
975	movdqu	32(%rsi),%xmm2
976	movdqu	%xmm7,112(%rdi)
977	leaq	128(%rdi),%rdi
978	movdqu	48(%rsi),%xmm7
979	pxor	32(%rsp),%xmm6
980	pxor	%xmm10,%xmm11
981	pxor	%xmm14,%xmm2
982	pxor	%xmm8,%xmm7
983	movdqu	%xmm6,0(%rdi)
984	movdqu	%xmm11,16(%rdi)
985	movdqu	%xmm2,32(%rdi)
986	movdqu	%xmm7,48(%rdi)
// Flags are still those of "cmpq $192,%rdx" in L$tail4x (lea does not
// modify flags either).
987	je	L$done4x
988
// Stage block 3 as the tail block. rsi/rdi were already advanced by 128
// above, so 64 more brings them to +192.
989	movdqa	48(%rsp),%xmm6
990	leaq	64(%rsi),%rsi
991	xorq	%r10,%r10
992	movdqa	%xmm6,0(%rsp)
993	movdqa	%xmm15,16(%rsp)
994	leaq	64(%rdi),%rdi
995	movdqa	%xmm9,32(%rsp)
996	subq	$192,%rdx
997	movdqa	%xmm3,48(%rsp)
998
// Byte-wise XOR of the remaining rdx (1..63) bytes against the staged
// key-stream block; r10 is the byte index (it no longer points at L$rot16).
999L$oop_tail4x:
1000	movzbl	(%rsi,%r10,1),%eax
1001	movzbl	(%rsp,%r10,1),%ecx
1002	leaq	1(%r10),%r10
1003	xorl	%ecx,%eax
1004	movb	%al,-1(%rdi,%r10,1)
1005	decq	%rdx
1006	jnz	L$oop_tail4x
1007
// Restore the stack pointer saved at entry.
1008L$done4x:
1009	leaq	(%r9),%rsp
1010
1011L$4x_epilogue:
1012	ret
1013
1014
1015
1016.p2align	5
1017ChaCha20_8x:
1018L$ChaCha20_8x:
1019
1020	movq	%rsp,%r9
1021
1022	subq	$0x280+8,%rsp
1023	andq	$-32,%rsp
1024	vzeroupper
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035	vbroadcasti128	L$sigma(%rip),%ymm11
1036	vbroadcasti128	(%rcx),%ymm3
1037	vbroadcasti128	16(%rcx),%ymm15
1038	vbroadcasti128	(%r8),%ymm7
1039	leaq	256(%rsp),%rcx
1040	leaq	512(%rsp),%rax
1041	leaq	L$rot16(%rip),%r10
1042	leaq	L$rot24(%rip),%r11
1043
1044	vpshufd	$0x00,%ymm11,%ymm8
1045	vpshufd	$0x55,%ymm11,%ymm9
1046	vmovdqa	%ymm8,128-256(%rcx)
1047	vpshufd	$0xaa,%ymm11,%ymm10
1048	vmovdqa	%ymm9,160-256(%rcx)
1049	vpshufd	$0xff,%ymm11,%ymm11
1050	vmovdqa	%ymm10,192-256(%rcx)
1051	vmovdqa	%ymm11,224-256(%rcx)
1052
1053	vpshufd	$0x00,%ymm3,%ymm0
1054	vpshufd	$0x55,%ymm3,%ymm1
1055	vmovdqa	%ymm0,256-256(%rcx)
1056	vpshufd	$0xaa,%ymm3,%ymm2
1057	vmovdqa	%ymm1,288-256(%rcx)
1058	vpshufd	$0xff,%ymm3,%ymm3
1059	vmovdqa	%ymm2,320-256(%rcx)
1060	vmovdqa	%ymm3,352-256(%rcx)
1061
1062	vpshufd	$0x00,%ymm15,%ymm12
1063	vpshufd	$0x55,%ymm15,%ymm13
1064	vmovdqa	%ymm12,384-512(%rax)
1065	vpshufd	$0xaa,%ymm15,%ymm14
1066	vmovdqa	%ymm13,416-512(%rax)
1067	vpshufd	$0xff,%ymm15,%ymm15
1068	vmovdqa	%ymm14,448-512(%rax)
1069	vmovdqa	%ymm15,480-512(%rax)
1070
1071	vpshufd	$0x00,%ymm7,%ymm4
1072	vpshufd	$0x55,%ymm7,%ymm5
1073	vpaddd	L$incy(%rip),%ymm4,%ymm4
1074	vpshufd	$0xaa,%ymm7,%ymm6
1075	vmovdqa	%ymm5,544-512(%rax)
1076	vpshufd	$0xff,%ymm7,%ymm7
1077	vmovdqa	%ymm6,576-512(%rax)
1078	vmovdqa	%ymm7,608-512(%rax)
1079
1080	jmp	L$oop_enter8x
1081
1082.p2align	5
1083L$oop_outer8x:
1084	vmovdqa	128-256(%rcx),%ymm8
1085	vmovdqa	160-256(%rcx),%ymm9
1086	vmovdqa	192-256(%rcx),%ymm10
1087	vmovdqa	224-256(%rcx),%ymm11
1088	vmovdqa	256-256(%rcx),%ymm0
1089	vmovdqa	288-256(%rcx),%ymm1
1090	vmovdqa	320-256(%rcx),%ymm2
1091	vmovdqa	352-256(%rcx),%ymm3
1092	vmovdqa	384-512(%rax),%ymm12
1093	vmovdqa	416-512(%rax),%ymm13
1094	vmovdqa	448-512(%rax),%ymm14
1095	vmovdqa	480-512(%rax),%ymm15
1096	vmovdqa	512-512(%rax),%ymm4
1097	vmovdqa	544-512(%rax),%ymm5
1098	vmovdqa	576-512(%rax),%ymm6
1099	vmovdqa	608-512(%rax),%ymm7
1100	vpaddd	L$eight(%rip),%ymm4,%ymm4
1101
1102L$oop_enter8x:
1103	vmovdqa	%ymm14,64(%rsp)
1104	vmovdqa	%ymm15,96(%rsp)
1105	vbroadcasti128	(%r10),%ymm15
1106	vmovdqa	%ymm4,512-512(%rax)
1107	movl	$10,%eax
1108	jmp	L$oop8x
1109
1110.p2align	5
1111L$oop8x:
1112	vpaddd	%ymm0,%ymm8,%ymm8
1113	vpxor	%ymm4,%ymm8,%ymm4
1114	vpshufb	%ymm15,%ymm4,%ymm4
1115	vpaddd	%ymm1,%ymm9,%ymm9
1116	vpxor	%ymm5,%ymm9,%ymm5
1117	vpshufb	%ymm15,%ymm5,%ymm5
1118	vpaddd	%ymm4,%ymm12,%ymm12
1119	vpxor	%ymm0,%ymm12,%ymm0
1120	vpslld	$12,%ymm0,%ymm14
1121	vpsrld	$20,%ymm0,%ymm0
1122	vpor	%ymm0,%ymm14,%ymm0
1123	vbroadcasti128	(%r11),%ymm14
1124	vpaddd	%ymm5,%ymm13,%ymm13
1125	vpxor	%ymm1,%ymm13,%ymm1
1126	vpslld	$12,%ymm1,%ymm15
1127	vpsrld	$20,%ymm1,%ymm1
1128	vpor	%ymm1,%ymm15,%ymm1
1129	vpaddd	%ymm0,%ymm8,%ymm8
1130	vpxor	%ymm4,%ymm8,%ymm4
1131	vpshufb	%ymm14,%ymm4,%ymm4
1132	vpaddd	%ymm1,%ymm9,%ymm9
1133	vpxor	%ymm5,%ymm9,%ymm5
1134	vpshufb	%ymm14,%ymm5,%ymm5
1135	vpaddd	%ymm4,%ymm12,%ymm12
1136	vpxor	%ymm0,%ymm12,%ymm0
1137	vpslld	$7,%ymm0,%ymm15
1138	vpsrld	$25,%ymm0,%ymm0
1139	vpor	%ymm0,%ymm15,%ymm0
1140	vbroadcasti128	(%r10),%ymm15
1141	vpaddd	%ymm5,%ymm13,%ymm13
1142	vpxor	%ymm1,%ymm13,%ymm1
1143	vpslld	$7,%ymm1,%ymm14
1144	vpsrld	$25,%ymm1,%ymm1
1145	vpor	%ymm1,%ymm14,%ymm1
1146	vmovdqa	%ymm12,0(%rsp)
1147	vmovdqa	%ymm13,32(%rsp)
1148	vmovdqa	64(%rsp),%ymm12
1149	vmovdqa	96(%rsp),%ymm13
1150	vpaddd	%ymm2,%ymm10,%ymm10
1151	vpxor	%ymm6,%ymm10,%ymm6
1152	vpshufb	%ymm15,%ymm6,%ymm6
1153	vpaddd	%ymm3,%ymm11,%ymm11
1154	vpxor	%ymm7,%ymm11,%ymm7
1155	vpshufb	%ymm15,%ymm7,%ymm7
1156	vpaddd	%ymm6,%ymm12,%ymm12
1157	vpxor	%ymm2,%ymm12,%ymm2
1158	vpslld	$12,%ymm2,%ymm14
1159	vpsrld	$20,%ymm2,%ymm2
1160	vpor	%ymm2,%ymm14,%ymm2
1161	vbroadcasti128	(%r11),%ymm14
1162	vpaddd	%ymm7,%ymm13,%ymm13
1163	vpxor	%ymm3,%ymm13,%ymm3
1164	vpslld	$12,%ymm3,%ymm15
1165	vpsrld	$20,%ymm3,%ymm3
1166	vpor	%ymm3,%ymm15,%ymm3
1167	vpaddd	%ymm2,%ymm10,%ymm10
1168	vpxor	%ymm6,%ymm10,%ymm6
1169	vpshufb	%ymm14,%ymm6,%ymm6
1170	vpaddd	%ymm3,%ymm11,%ymm11
1171	vpxor	%ymm7,%ymm11,%ymm7
1172	vpshufb	%ymm14,%ymm7,%ymm7
1173	vpaddd	%ymm6,%ymm12,%ymm12
1174	vpxor	%ymm2,%ymm12,%ymm2
1175	vpslld	$7,%ymm2,%ymm15
1176	vpsrld	$25,%ymm2,%ymm2
1177	vpor	%ymm2,%ymm15,%ymm2
1178	vbroadcasti128	(%r10),%ymm15
1179	vpaddd	%ymm7,%ymm13,%ymm13
1180	vpxor	%ymm3,%ymm13,%ymm3
1181	vpslld	$7,%ymm3,%ymm14
1182	vpsrld	$25,%ymm3,%ymm3
1183	vpor	%ymm3,%ymm14,%ymm3
1184	vpaddd	%ymm1,%ymm8,%ymm8
1185	vpxor	%ymm7,%ymm8,%ymm7
1186	vpshufb	%ymm15,%ymm7,%ymm7
1187	vpaddd	%ymm2,%ymm9,%ymm9
1188	vpxor	%ymm4,%ymm9,%ymm4
1189	vpshufb	%ymm15,%ymm4,%ymm4
1190	vpaddd	%ymm7,%ymm12,%ymm12
1191	vpxor	%ymm1,%ymm12,%ymm1
1192	vpslld	$12,%ymm1,%ymm14
1193	vpsrld	$20,%ymm1,%ymm1
1194	vpor	%ymm1,%ymm14,%ymm1
1195	vbroadcasti128	(%r11),%ymm14
1196	vpaddd	%ymm4,%ymm13,%ymm13
1197	vpxor	%ymm2,%ymm13,%ymm2
1198	vpslld	$12,%ymm2,%ymm15
1199	vpsrld	$20,%ymm2,%ymm2
1200	vpor	%ymm2,%ymm15,%ymm2
1201	vpaddd	%ymm1,%ymm8,%ymm8
1202	vpxor	%ymm7,%ymm8,%ymm7
1203	vpshufb	%ymm14,%ymm7,%ymm7
1204	vpaddd	%ymm2,%ymm9,%ymm9
1205	vpxor	%ymm4,%ymm9,%ymm4
1206	vpshufb	%ymm14,%ymm4,%ymm4
1207	vpaddd	%ymm7,%ymm12,%ymm12
1208	vpxor	%ymm1,%ymm12,%ymm1
1209	vpslld	$7,%ymm1,%ymm15
1210	vpsrld	$25,%ymm1,%ymm1
1211	vpor	%ymm1,%ymm15,%ymm1
1212	vbroadcasti128	(%r10),%ymm15
1213	vpaddd	%ymm4,%ymm13,%ymm13
1214	vpxor	%ymm2,%ymm13,%ymm2
1215	vpslld	$7,%ymm2,%ymm14
1216	vpsrld	$25,%ymm2,%ymm2
1217	vpor	%ymm2,%ymm14,%ymm2
1218	vmovdqa	%ymm12,64(%rsp)
1219	vmovdqa	%ymm13,96(%rsp)
1220	vmovdqa	0(%rsp),%ymm12
1221	vmovdqa	32(%rsp),%ymm13
1222	vpaddd	%ymm3,%ymm10,%ymm10
1223	vpxor	%ymm5,%ymm10,%ymm5
1224	vpshufb	%ymm15,%ymm5,%ymm5
1225	vpaddd	%ymm0,%ymm11,%ymm11
1226	vpxor	%ymm6,%ymm11,%ymm6
1227	vpshufb	%ymm15,%ymm6,%ymm6
1228	vpaddd	%ymm5,%ymm12,%ymm12
1229	vpxor	%ymm3,%ymm12,%ymm3
1230	vpslld	$12,%ymm3,%ymm14
1231	vpsrld	$20,%ymm3,%ymm3
1232	vpor	%ymm3,%ymm14,%ymm3
1233	vbroadcasti128	(%r11),%ymm14
1234	vpaddd	%ymm6,%ymm13,%ymm13
1235	vpxor	%ymm0,%ymm13,%ymm0
1236	vpslld	$12,%ymm0,%ymm15
1237	vpsrld	$20,%ymm0,%ymm0
1238	vpor	%ymm0,%ymm15,%ymm0
1239	vpaddd	%ymm3,%ymm10,%ymm10
1240	vpxor	%ymm5,%ymm10,%ymm5
1241	vpshufb	%ymm14,%ymm5,%ymm5
1242	vpaddd	%ymm0,%ymm11,%ymm11
1243	vpxor	%ymm6,%ymm11,%ymm6
1244	vpshufb	%ymm14,%ymm6,%ymm6
1245	vpaddd	%ymm5,%ymm12,%ymm12
1246	vpxor	%ymm3,%ymm12,%ymm3
1247	vpslld	$7,%ymm3,%ymm15
1248	vpsrld	$25,%ymm3,%ymm3
1249	vpor	%ymm3,%ymm15,%ymm3
1250	vbroadcasti128	(%r10),%ymm15
1251	vpaddd	%ymm6,%ymm13,%ymm13
1252	vpxor	%ymm0,%ymm13,%ymm0
1253	vpslld	$7,%ymm0,%ymm14
1254	vpsrld	$25,%ymm0,%ymm0
1255	vpor	%ymm0,%ymm14,%ymm0
1256	decl	%eax
1257	jnz	L$oop8x
1258
1259	leaq	512(%rsp),%rax
1260	vpaddd	128-256(%rcx),%ymm8,%ymm8
1261	vpaddd	160-256(%rcx),%ymm9,%ymm9
1262	vpaddd	192-256(%rcx),%ymm10,%ymm10
1263	vpaddd	224-256(%rcx),%ymm11,%ymm11
1264
1265	vpunpckldq	%ymm9,%ymm8,%ymm14
1266	vpunpckldq	%ymm11,%ymm10,%ymm15
1267	vpunpckhdq	%ymm9,%ymm8,%ymm8
1268	vpunpckhdq	%ymm11,%ymm10,%ymm10
1269	vpunpcklqdq	%ymm15,%ymm14,%ymm9
1270	vpunpckhqdq	%ymm15,%ymm14,%ymm14
1271	vpunpcklqdq	%ymm10,%ymm8,%ymm11
1272	vpunpckhqdq	%ymm10,%ymm8,%ymm8
1273	vpaddd	256-256(%rcx),%ymm0,%ymm0
1274	vpaddd	288-256(%rcx),%ymm1,%ymm1
1275	vpaddd	320-256(%rcx),%ymm2,%ymm2
1276	vpaddd	352-256(%rcx),%ymm3,%ymm3
1277
1278	vpunpckldq	%ymm1,%ymm0,%ymm10
1279	vpunpckldq	%ymm3,%ymm2,%ymm15
1280	vpunpckhdq	%ymm1,%ymm0,%ymm0
1281	vpunpckhdq	%ymm3,%ymm2,%ymm2
1282	vpunpcklqdq	%ymm15,%ymm10,%ymm1
1283	vpunpckhqdq	%ymm15,%ymm10,%ymm10
1284	vpunpcklqdq	%ymm2,%ymm0,%ymm3
1285	vpunpckhqdq	%ymm2,%ymm0,%ymm0
1286	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
1287	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
1288	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
1289	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
1290	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
1291	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
1292	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
1293	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
1294	vmovdqa	%ymm15,0(%rsp)
1295	vmovdqa	%ymm9,32(%rsp)
1296	vmovdqa	64(%rsp),%ymm15
1297	vmovdqa	96(%rsp),%ymm9
1298
1299	vpaddd	384-512(%rax),%ymm12,%ymm12
1300	vpaddd	416-512(%rax),%ymm13,%ymm13
1301	vpaddd	448-512(%rax),%ymm15,%ymm15
1302	vpaddd	480-512(%rax),%ymm9,%ymm9
1303
1304	vpunpckldq	%ymm13,%ymm12,%ymm2
1305	vpunpckldq	%ymm9,%ymm15,%ymm8
1306	vpunpckhdq	%ymm13,%ymm12,%ymm12
1307	vpunpckhdq	%ymm9,%ymm15,%ymm15
1308	vpunpcklqdq	%ymm8,%ymm2,%ymm13
1309	vpunpckhqdq	%ymm8,%ymm2,%ymm2
1310	vpunpcklqdq	%ymm15,%ymm12,%ymm9
1311	vpunpckhqdq	%ymm15,%ymm12,%ymm12
1312	vpaddd	512-512(%rax),%ymm4,%ymm4
1313	vpaddd	544-512(%rax),%ymm5,%ymm5
1314	vpaddd	576-512(%rax),%ymm6,%ymm6
1315	vpaddd	608-512(%rax),%ymm7,%ymm7
1316
1317	vpunpckldq	%ymm5,%ymm4,%ymm15
1318	vpunpckldq	%ymm7,%ymm6,%ymm8
1319	vpunpckhdq	%ymm5,%ymm4,%ymm4
1320	vpunpckhdq	%ymm7,%ymm6,%ymm6
1321	vpunpcklqdq	%ymm8,%ymm15,%ymm5
1322	vpunpckhqdq	%ymm8,%ymm15,%ymm15
1323	vpunpcklqdq	%ymm6,%ymm4,%ymm7
1324	vpunpckhqdq	%ymm6,%ymm4,%ymm4
1325	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
1326	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
1327	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
1328	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
1329	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
1330	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
1331	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
1332	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
1333	vmovdqa	0(%rsp),%ymm6
1334	vmovdqa	32(%rsp),%ymm12
1335
// Full-chunk path. %rdx is the remaining byte count; anything below 512
// (unsigned compare) is handled at L$tail8x instead.
1336	cmpq	$512,%rdx
1337	jb	L$tail8x
1338
// Bytes 0..127: XOR the input at (%rsi) with ymm6, ymm8, ymm1, ymm5, store
// the result at (%rdi), and advance both pointers by 128.
1339	vpxor	0(%rsi),%ymm6,%ymm6
1340	vpxor	32(%rsi),%ymm8,%ymm8
1341	vpxor	64(%rsi),%ymm1,%ymm1
1342	vpxor	96(%rsi),%ymm5,%ymm5
1343	leaq	128(%rsi),%rsi
1344	vmovdqu	%ymm6,0(%rdi)
1345	vmovdqu	%ymm8,32(%rdi)
1346	vmovdqu	%ymm1,64(%rdi)
1347	vmovdqu	%ymm5,96(%rdi)
1348	leaq	128(%rdi),%rdi
1349
// Bytes 128..255: same pattern with ymm12, ymm13, ymm10, ymm15.
1350	vpxor	0(%rsi),%ymm12,%ymm12
1351	vpxor	32(%rsi),%ymm13,%ymm13
1352	vpxor	64(%rsi),%ymm10,%ymm10
1353	vpxor	96(%rsi),%ymm15,%ymm15
1354	leaq	128(%rsi),%rsi
1355	vmovdqu	%ymm12,0(%rdi)
1356	vmovdqu	%ymm13,32(%rdi)
1357	vmovdqu	%ymm10,64(%rdi)
1358	vmovdqu	%ymm15,96(%rdi)
1359	leaq	128(%rdi),%rdi
1360
// Bytes 256..383: same pattern with ymm14, ymm2, ymm3, ymm7.
1361	vpxor	0(%rsi),%ymm14,%ymm14
1362	vpxor	32(%rsi),%ymm2,%ymm2
1363	vpxor	64(%rsi),%ymm3,%ymm3
1364	vpxor	96(%rsi),%ymm7,%ymm7
1365	leaq	128(%rsi),%rsi
1366	vmovdqu	%ymm14,0(%rdi)
1367	vmovdqu	%ymm2,32(%rdi)
1368	vmovdqu	%ymm3,64(%rdi)
1369	vmovdqu	%ymm7,96(%rdi)
1370	leaq	128(%rdi),%rdi
1371
// Bytes 384..511: same pattern with ymm11, ymm9, ymm0, ymm4.
1372	vpxor	0(%rsi),%ymm11,%ymm11
1373	vpxor	32(%rsi),%ymm9,%ymm9
1374	vpxor	64(%rsi),%ymm0,%ymm0
1375	vpxor	96(%rsi),%ymm4,%ymm4
1376	leaq	128(%rsi),%rsi
1377	vmovdqu	%ymm11,0(%rdi)
1378	vmovdqu	%ymm9,32(%rdi)
1379	vmovdqu	%ymm0,64(%rdi)
1380	vmovdqu	%ymm4,96(%rdi)
1381	leaq	128(%rdi),%rdi
1382
// 512 bytes consumed: go back to L$oop_outer8x while input remains,
// otherwise finish at L$done8x.
1383	subq	$512,%rdx
1384	jnz	L$oop_outer8x
1385
1386	jmp	L$done8x
1387
// Tail dispatch: reached from the full-chunk check when %rdx < 512.
// Branches to the handler for the largest multiple of 64 that does not
// exceed %rdx (unsigned compares, largest threshold first). The flags set by
// the matching cmpq stay live up to the je L$done8x inside each handler,
// because the vpxor/vmovdqu instructions in between do not write flags.
1388L$tail8x:
1389	cmpq	$448,%rdx
1390	jae	L$448_or_more8x
1391	cmpq	$384,%rdx
1392	jae	L$384_or_more8x
1393	cmpq	$320,%rdx
1394	jae	L$320_or_more8x
1395	cmpq	$256,%rdx
1396	jae	L$256_or_more8x
1397	cmpq	$192,%rdx
1398	jae	L$192_or_more8x
1399	cmpq	$128,%rdx
1400	jae	L$128_or_more8x
1401	cmpq	$64,%rdx
1402	jae	L$64_or_more8x
1403
// Fewer than 64 bytes: zero the byte index %r10, spill ymm6 and ymm8 to
// 0..63(%rsp), and let L$oop_tail8x consume them one byte at a time.
// vmovdqa faults on an address that is not 32-byte aligned; the alignment of
// %rsp is established outside this view.
1404	xorq	%r10,%r10
1405	vmovdqa	%ymm6,0(%rsp)
1406	vmovdqa	%ymm8,32(%rsp)
1407	jmp	L$oop_tail8x
1408
// Handler for 64 <= %rdx < 128 (selected by the cmpq chain at L$tail8x).
// XORs the first 64 input bytes with ymm6/ymm8 and stores them at (%rdi).
1409.p2align	5
1410L$64_or_more8x:
1411	vpxor	0(%rsi),%ymm6,%ymm6
1412	vpxor	32(%rsi),%ymm8,%ymm8
1413	vmovdqu	%ymm6,0(%rdi)
1414	vmovdqu	%ymm8,32(%rdi)
// ZF still comes from cmpq $64,%rdx at L$tail8x: exactly 64 bytes means done.
1415	je	L$done8x
1416
// A partial block remains: step both pointers past the 64 bytes just
// written, subtract 64 from %rdx, zero the byte index %r10, and spill the
// next 64 bytes (ymm1, ymm5) to 0..63(%rsp) for L$oop_tail8x.
1417	leaq	64(%rsi),%rsi
1418	xorq	%r10,%r10
1419	vmovdqa	%ymm1,0(%rsp)
1420	leaq	64(%rdi),%rdi
1421	subq	$64,%rdx
1422	vmovdqa	%ymm5,32(%rsp)
1423	jmp	L$oop_tail8x
1424
// Handler for 128 <= %rdx < 192 (selected by the cmpq chain at L$tail8x).
// XORs the first 128 input bytes with ymm6, ymm8, ymm1, ymm5 and stores
// them at (%rdi).
1425.p2align	5
1426L$128_or_more8x:
1427	vpxor	0(%rsi),%ymm6,%ymm6
1428	vpxor	32(%rsi),%ymm8,%ymm8
1429	vpxor	64(%rsi),%ymm1,%ymm1
1430	vpxor	96(%rsi),%ymm5,%ymm5
1431	vmovdqu	%ymm6,0(%rdi)
1432	vmovdqu	%ymm8,32(%rdi)
1433	vmovdqu	%ymm1,64(%rdi)
1434	vmovdqu	%ymm5,96(%rdi)
// ZF still comes from cmpq $128,%rdx at L$tail8x: exactly 128 bytes means done.
1435	je	L$done8x
1436
// A partial block remains: advance both pointers by 128, subtract 128 from
// %rdx, zero the byte index %r10, and spill the next 64 bytes (ymm12, ymm13)
// to 0..63(%rsp) for L$oop_tail8x.
1437	leaq	128(%rsi),%rsi
1438	xorq	%r10,%r10
1439	vmovdqa	%ymm12,0(%rsp)
1440	leaq	128(%rdi),%rdi
1441	subq	$128,%rdx
1442	vmovdqa	%ymm13,32(%rsp)
1443	jmp	L$oop_tail8x
1444
// Handler for 192 <= %rdx < 256 (selected by the cmpq chain at L$tail8x).
// XORs the first 192 input bytes with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13
// and stores them at (%rdi).
1445.p2align	5
1446L$192_or_more8x:
1447	vpxor	0(%rsi),%ymm6,%ymm6
1448	vpxor	32(%rsi),%ymm8,%ymm8
1449	vpxor	64(%rsi),%ymm1,%ymm1
1450	vpxor	96(%rsi),%ymm5,%ymm5
1451	vpxor	128(%rsi),%ymm12,%ymm12
1452	vpxor	160(%rsi),%ymm13,%ymm13
1453	vmovdqu	%ymm6,0(%rdi)
1454	vmovdqu	%ymm8,32(%rdi)
1455	vmovdqu	%ymm1,64(%rdi)
1456	vmovdqu	%ymm5,96(%rdi)
1457	vmovdqu	%ymm12,128(%rdi)
1458	vmovdqu	%ymm13,160(%rdi)
// ZF still comes from cmpq $192,%rdx at L$tail8x: exactly 192 bytes means done.
1459	je	L$done8x
1460
// A partial block remains: advance both pointers by 192, subtract 192 from
// %rdx, zero the byte index %r10, and spill the next 64 bytes (ymm10, ymm15)
// to 0..63(%rsp) for L$oop_tail8x.
1461	leaq	192(%rsi),%rsi
1462	xorq	%r10,%r10
1463	vmovdqa	%ymm10,0(%rsp)
1464	leaq	192(%rdi),%rdi
1465	subq	$192,%rdx
1466	vmovdqa	%ymm15,32(%rsp)
1467	jmp	L$oop_tail8x
1468
// Handler for 256 <= %rdx < 320 (selected by the cmpq chain at L$tail8x).
// XORs the first 256 input bytes with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13,
// ymm10, ymm15 and stores them at (%rdi).
1469.p2align	5
1470L$256_or_more8x:
1471	vpxor	0(%rsi),%ymm6,%ymm6
1472	vpxor	32(%rsi),%ymm8,%ymm8
1473	vpxor	64(%rsi),%ymm1,%ymm1
1474	vpxor	96(%rsi),%ymm5,%ymm5
1475	vpxor	128(%rsi),%ymm12,%ymm12
1476	vpxor	160(%rsi),%ymm13,%ymm13
1477	vpxor	192(%rsi),%ymm10,%ymm10
1478	vpxor	224(%rsi),%ymm15,%ymm15
1479	vmovdqu	%ymm6,0(%rdi)
1480	vmovdqu	%ymm8,32(%rdi)
1481	vmovdqu	%ymm1,64(%rdi)
1482	vmovdqu	%ymm5,96(%rdi)
1483	vmovdqu	%ymm12,128(%rdi)
1484	vmovdqu	%ymm13,160(%rdi)
1485	vmovdqu	%ymm10,192(%rdi)
1486	vmovdqu	%ymm15,224(%rdi)
// ZF still comes from cmpq $256,%rdx at L$tail8x: exactly 256 bytes means done.
1487	je	L$done8x
1488
// A partial block remains: advance both pointers by 256, subtract 256 from
// %rdx, zero the byte index %r10, and spill the next 64 bytes (ymm14, ymm2)
// to 0..63(%rsp) for L$oop_tail8x.
1489	leaq	256(%rsi),%rsi
1490	xorq	%r10,%r10
1491	vmovdqa	%ymm14,0(%rsp)
1492	leaq	256(%rdi),%rdi
1493	subq	$256,%rdx
1494	vmovdqa	%ymm2,32(%rsp)
1495	jmp	L$oop_tail8x
1496
// Handler for 320 <= %rdx < 384 (selected by the cmpq chain at L$tail8x).
// XORs the first 320 input bytes with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13,
// ymm10, ymm15, ymm14, ymm2 and stores them at (%rdi).
1497.p2align	5
1498L$320_or_more8x:
1499	vpxor	0(%rsi),%ymm6,%ymm6
1500	vpxor	32(%rsi),%ymm8,%ymm8
1501	vpxor	64(%rsi),%ymm1,%ymm1
1502	vpxor	96(%rsi),%ymm5,%ymm5
1503	vpxor	128(%rsi),%ymm12,%ymm12
1504	vpxor	160(%rsi),%ymm13,%ymm13
1505	vpxor	192(%rsi),%ymm10,%ymm10
1506	vpxor	224(%rsi),%ymm15,%ymm15
1507	vpxor	256(%rsi),%ymm14,%ymm14
1508	vpxor	288(%rsi),%ymm2,%ymm2
1509	vmovdqu	%ymm6,0(%rdi)
1510	vmovdqu	%ymm8,32(%rdi)
1511	vmovdqu	%ymm1,64(%rdi)
1512	vmovdqu	%ymm5,96(%rdi)
1513	vmovdqu	%ymm12,128(%rdi)
1514	vmovdqu	%ymm13,160(%rdi)
1515	vmovdqu	%ymm10,192(%rdi)
1516	vmovdqu	%ymm15,224(%rdi)
1517	vmovdqu	%ymm14,256(%rdi)
1518	vmovdqu	%ymm2,288(%rdi)
// ZF still comes from cmpq $320,%rdx at L$tail8x: exactly 320 bytes means done.
1519	je	L$done8x
1520
// A partial block remains: advance both pointers by 320, subtract 320 from
// %rdx, zero the byte index %r10, and spill the next 64 bytes (ymm3, ymm7)
// to 0..63(%rsp) for L$oop_tail8x.
1521	leaq	320(%rsi),%rsi
1522	xorq	%r10,%r10
1523	vmovdqa	%ymm3,0(%rsp)
1524	leaq	320(%rdi),%rdi
1525	subq	$320,%rdx
1526	vmovdqa	%ymm7,32(%rsp)
1527	jmp	L$oop_tail8x
1528
// Handler for 384 <= %rdx < 448 (selected by the cmpq chain at L$tail8x).
// XORs the first 384 input bytes with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13,
// ymm10, ymm15, ymm14, ymm2, ymm3, ymm7 and stores them at (%rdi).
1529.p2align	5
1530L$384_or_more8x:
1531	vpxor	0(%rsi),%ymm6,%ymm6
1532	vpxor	32(%rsi),%ymm8,%ymm8
1533	vpxor	64(%rsi),%ymm1,%ymm1
1534	vpxor	96(%rsi),%ymm5,%ymm5
1535	vpxor	128(%rsi),%ymm12,%ymm12
1536	vpxor	160(%rsi),%ymm13,%ymm13
1537	vpxor	192(%rsi),%ymm10,%ymm10
1538	vpxor	224(%rsi),%ymm15,%ymm15
1539	vpxor	256(%rsi),%ymm14,%ymm14
1540	vpxor	288(%rsi),%ymm2,%ymm2
1541	vpxor	320(%rsi),%ymm3,%ymm3
1542	vpxor	352(%rsi),%ymm7,%ymm7
1543	vmovdqu	%ymm6,0(%rdi)
1544	vmovdqu	%ymm8,32(%rdi)
1545	vmovdqu	%ymm1,64(%rdi)
1546	vmovdqu	%ymm5,96(%rdi)
1547	vmovdqu	%ymm12,128(%rdi)
1548	vmovdqu	%ymm13,160(%rdi)
1549	vmovdqu	%ymm10,192(%rdi)
1550	vmovdqu	%ymm15,224(%rdi)
1551	vmovdqu	%ymm14,256(%rdi)
1552	vmovdqu	%ymm2,288(%rdi)
1553	vmovdqu	%ymm3,320(%rdi)
1554	vmovdqu	%ymm7,352(%rdi)
// ZF still comes from cmpq $384,%rdx at L$tail8x: exactly 384 bytes means done.
1555	je	L$done8x
1556
// A partial block remains: advance both pointers by 384, subtract 384 from
// %rdx, zero the byte index %r10, and spill the next 64 bytes (ymm11, ymm9)
// to 0..63(%rsp) for L$oop_tail8x.
1557	leaq	384(%rsi),%rsi
1558	xorq	%r10,%r10
1559	vmovdqa	%ymm11,0(%rsp)
1560	leaq	384(%rdi),%rdi
1561	subq	$384,%rdx
1562	vmovdqa	%ymm9,32(%rsp)
1563	jmp	L$oop_tail8x
1564
// Handler for %rdx >= 448 (below 512 on the path visible here, since
// L$tail8x is reached via jb after cmpq $512,%rdx).
// XORs the first 448 input bytes with ymm6, ymm8, ymm1, ymm5, ymm12, ymm13,
// ymm10, ymm15, ymm14, ymm2, ymm3, ymm7, ymm11, ymm9 and stores them at (%rdi).
1565.p2align	5
1566L$448_or_more8x:
1567	vpxor	0(%rsi),%ymm6,%ymm6
1568	vpxor	32(%rsi),%ymm8,%ymm8
1569	vpxor	64(%rsi),%ymm1,%ymm1
1570	vpxor	96(%rsi),%ymm5,%ymm5
1571	vpxor	128(%rsi),%ymm12,%ymm12
1572	vpxor	160(%rsi),%ymm13,%ymm13
1573	vpxor	192(%rsi),%ymm10,%ymm10
1574	vpxor	224(%rsi),%ymm15,%ymm15
1575	vpxor	256(%rsi),%ymm14,%ymm14
1576	vpxor	288(%rsi),%ymm2,%ymm2
1577	vpxor	320(%rsi),%ymm3,%ymm3
1578	vpxor	352(%rsi),%ymm7,%ymm7
1579	vpxor	384(%rsi),%ymm11,%ymm11
1580	vpxor	416(%rsi),%ymm9,%ymm9
1581	vmovdqu	%ymm6,0(%rdi)
1582	vmovdqu	%ymm8,32(%rdi)
1583	vmovdqu	%ymm1,64(%rdi)
1584	vmovdqu	%ymm5,96(%rdi)
1585	vmovdqu	%ymm12,128(%rdi)
1586	vmovdqu	%ymm13,160(%rdi)
1587	vmovdqu	%ymm10,192(%rdi)
1588	vmovdqu	%ymm15,224(%rdi)
1589	vmovdqu	%ymm14,256(%rdi)
1590	vmovdqu	%ymm2,288(%rdi)
1591	vmovdqu	%ymm3,320(%rdi)
1592	vmovdqu	%ymm7,352(%rdi)
1593	vmovdqu	%ymm11,384(%rdi)
1594	vmovdqu	%ymm9,416(%rdi)
// ZF still comes from cmpq $448,%rdx at L$tail8x: exactly 448 bytes means done.
1595	je	L$done8x
1596
// A partial block remains: advance both pointers by 448, subtract 448 from
// %rdx, zero the byte index %r10, and spill the last 64 bytes (ymm0, ymm4)
// to 0..63(%rsp). There is no jmp here: execution falls through into
// L$oop_tail8x, which follows immediately.
1597	leaq	448(%rsi),%rsi
1598	xorq	%r10,%r10
1599	vmovdqa	%ymm0,0(%rsp)
1600	leaq	448(%rdi),%rdi
1601	subq	$448,%rdx
1602	vmovdqa	%ymm4,32(%rsp)
1603
// Byte-wise tail loop for the final partial 64-byte block.
//   %rsi = input, %rdi = output, %rsp = 64 spilled bytes to XOR with,
//   %r10 = byte index (zeroed by every path that jumps here),
//   %rdx = bytes left to process.
// %rdx is expected to be nonzero on entry: the loop tests it only after the
// decrement, so a zero count would not stop after the first byte.
1604L$oop_tail8x:
1605	movzbl	(%rsi,%r10,1),%eax
1606	movzbl	(%rsp,%r10,1),%ecx
// leaq bumps the index without touching flags; the store below uses -1 to
// address the byte that was just read.
1607	leaq	1(%r10),%r10
1608	xorl	%ecx,%eax
1609	movb	%al,-1(%rdi,%r10,1)
1610	decq	%rdx
1611	jnz	L$oop_tail8x
1612
// Common exit for the 8x path.
1613L$done8x:
// vzeroall clears every ymm register, so none of the block data computed
// above is left in vector registers on return.
1614	vzeroall
// Load %rsp from %r9. NOTE(review): %r9 presumably holds the stack pointer
// saved by the prologue before the frame was aligned; that code is outside
// this view, confirm there.
1615	leaq	(%r9),%rsp
1616
1617L$8x_epilogue:
1618	ret
1619
1620
1621#endif
1622