1; This file is generated from a similarly-named Perl script in the BoringSSL
2; source tree. Do not edit by hand.
3
; Assembler preamble. Everything that follows is conditional on NASM's output
; format being win64; the matching %else/%endif is not visible in this part
; of the file.
4%ifidn __OUTPUT_FORMAT__, win64
; Memory operands without a base register default to RIP-relative addressing.
5default	rel
; The operand-size keywords used throughout are defined to nothing, so for
; example XMMWORD[rcx] reaches the assembler as plain [rcx].
6%define XMMWORD
7%define YMMWORD
8%define ZMMWORD
; _CET_ENDBR is also defined empty, so its use at the function entry emits
; no instruction.
9%define _CET_ENDBR
10
11%include "ring_core_generated/prefix_symbols_nasm.inc"
12section	.text code align=64
13
14
; External symbol: the qword at OPENSSL_ia32cap_P+4 is read by ChaCha20_ctr32
; to choose between the integer and SIMD code paths.
15EXTERN	OPENSSL_ia32cap_P
16
; Read-only constant tables used by the ChaCha20 code paths in this file.
17section	.rdata rdata align=8
18ALIGN	64
; Four zero dwords (not referenced in the visible part of this file).
19$L$zero:
20	DD	0,0,0,0
; (1,0,0,0): added to state row 3 to advance dword 0 (the per-block counter)
; by one in the integer and SSSE3 paths.
21$L$one:
22	DD	1,0,0,0
; Per-lane counter offsets 0..3 used when the 4-block path broadcasts the
; counter dword.
23$L$inc:
24	DD	0,1,2,3
; Counter step added to every lane between outer iterations of the 4-block
; path.
25$L$four:
26	DD	4,4,4,4
; Per-lane counter offsets added to the broadcast counter in the 8-block
; (ymm) path.
27$L$incy:
28	DD	0,2,4,6,1,3,5,7
; Counter step added to every lane between outer iterations of the 8-block
; path.
29$L$eight:
30	DD	8,8,8,8,8,8,8,8
; pshufb byte-shuffle mask: rotates every dword left by 16 bits.
31$L$rot16:
32	DB	0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
; pshufb byte-shuffle mask: rotates every dword left by 8 bits
; (equivalently right by 24).
33$L$rot24:
34	DB	0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
; ASCII "expand 32-byte k" followed by a NUL byte; the first 16 bytes are
; loaded as state row 0 by the SIMD paths.
35$L$sigma:
36	DB	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
37	DB	0
38ALIGN	64
; 16-dword tables; none of the four below is referenced in the visible part
; of this file.
39$L$zeroz:
40	DD	0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
41$L$fourz:
42	DD	4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
43$L$incz:
44	DD	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
45$L$sixteen:
46	DD	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
; NUL-terminated ASCII identification string:
; "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>".
47	DB	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
48	DB	95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
49	DB	98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
50	DB	108,46,111,114,103,62,0
51section	.text
52
; ---------------------------------------------------------------------------
; ChaCha20_ctr32 (exported; entered with the Win64 calling convention)
;
; The prologue stores rdi/rsi in the caller's home slots at [rsp+8]/[rsp+16]
; and moves the Win64 arguments rcx, rdx, r8, r9, [rsp+40] into
; rdi, rsi, rdx, rcx, r8.  After that:
;   rdi = output pointer
;   rsi = input pointer (XORed with the generated keystream)
;   rdx = length in bytes (0 returns immediately)
;   rcx = pointer to 32 bytes loaded as state words 4..11
;   r8  = pointer to 16 bytes loaded as state words 12..15; dword 0 is
;         advanced by one ($L$one) for every 64-byte block
;
; Dispatch: if bit 9 (value 512) of the dword at OPENSSL_ia32cap_P+4 is set
; (presumably the SSSE3 feature flag), control jumps to $L$ChaCha20_ssse3
; with the registers above and r10 = the qword read from OPENSSL_ia32cap_P+4.
; Otherwise the integer-register implementation below runs.
;
; Frame of the integer path after "sub rsp,64+24":
;   [rsp+ 0..15]  keystream words 0..3 (written only in $L$tail)
;   [rsp+16..31]  state words 4..7
;   [rsp+32..47]  state words 8..11 (working copies during the rounds)
;   [rsp+48..63]  state words 12..15
;   [rsp+64]      remaining length, [rsp+72] input ptr, [rsp+80] output ptr
; ---------------------------------------------------------------------------
53global	ChaCha20_ctr32
54
55ALIGN	64
56ChaCha20_ctr32:
57	mov	QWORD[8+rsp],rdi	;WIN64 prologue
58	mov	QWORD[16+rsp],rsi
59	mov	rax,rsp
60$L$SEH_begin_ChaCha20_ctr32:
61	mov	rdi,rcx
62	mov	rsi,rdx
63	mov	rdx,r8
64	mov	rcx,r9
65	mov	r8,QWORD[40+rsp]
66
67
68
69_CET_ENDBR
; Zero-length request: nothing to do.
70	cmp	rdx,0
71	je	NEAR $L$no_data
; r10 = capability qword; bit 9 of its low dword selects the SSSE3 path.
72	mov	r10,QWORD[((OPENSSL_ia32cap_P+4))]
73	test	r10d,512
74	jnz	NEAR $L$ChaCha20_ssse3
75
; Integer path: save the callee-saved registers used below, then reserve the
; 88-byte frame described in the header.
76	push	rbx
77
78	push	rbp
79
80	push	r12
81
82	push	r13
83
84	push	r14
85
86	push	r15
87
88	sub	rsp,64+24
89
90$L$ctr32_body:
91
92
; xmm1/xmm2 = state words 4..7 / 8..11, xmm3 = state words 12..15,
; xmm4 = (1,0,0,0) counter increment.
93	movdqu	xmm1,XMMWORD[rcx]
94	movdqu	xmm2,XMMWORD[16+rcx]
95	movdqu	xmm3,XMMWORD[r8]
96	movdqa	xmm4,XMMWORD[$L$one]
97
98
; Spill rows 1..3 into the frame; rbp carries the remaining byte count.
99	movdqa	XMMWORD[16+rsp],xmm1
100	movdqa	XMMWORD[32+rsp],xmm2
101	movdqa	XMMWORD[48+rsp],xmm3
102	mov	rbp,rdx
103	jmp	NEAR $L$oop_outer
104
105ALIGN	32
; Per-block setup.  eax..edx = state words 0..3 (the four constants are the
; little-endian dwords of "expand 32-byte k"), r8d..r11d = words 4..7,
; r12d = word 12 taken from xmm3 (the current counter), r13d..r15d = words
; 13..15.
106$L$oop_outer:
107	mov	eax,0x61707865
108	mov	ebx,0x3320646e
109	mov	ecx,0x79622d32
110	mov	edx,0x6b206574
111	mov	r8d,DWORD[16+rsp]
112	mov	r9d,DWORD[20+rsp]
113	mov	r10d,DWORD[24+rsp]
114	mov	r11d,DWORD[28+rsp]
115	movd	r12d,xmm3
116	mov	r13d,DWORD[52+rsp]
117	mov	r14d,DWORD[56+rsp]
118	mov	r15d,DWORD[60+rsp]
119
; Park length and pointers in the frame so rbp/rsi/rdi can be reused:
; ebp = 10 loop iterations, esi = word 8, edi = word 9.
120	mov	QWORD[((64+0))+rsp],rbp
121	mov	ebp,10
122	mov	QWORD[((64+8))+rsp],rsi
; Hand-encoded instruction bytes 66 48 0F 7E D6 = movq rsi,xmm2
; (low qword of xmm2, i.e. words 8 and 9).
123DB	102,72,15,126,214
124	mov	QWORD[((64+16))+rsp],rdi
125	mov	rdi,rsi
126	shr	rdi,32
127	jmp	NEAR $L$oop
128
129ALIGN	32
; One iteration = four column quarter-rounds followed by four diagonal
; quarter-rounds; 10 iterations give 20 rounds.  Each quarter-round is
; a+=b; d^=a; d<<<=16; c+=d; b^=c; b<<<=12; a+=b; d^=a; d<<<=8; c+=d; b^=c;
; b<<<=7.  Only two of words 8..11 are kept in registers (esi/edi) at a time;
; the other two live at [rsp+32..47].
130$L$oop:
; Columns (0,4,8,12) and (1,5,9,13), interleaved.
131	add	eax,r8d
132	xor	r12d,eax
133	rol	r12d,16
134	add	ebx,r9d
135	xor	r13d,ebx
136	rol	r13d,16
137	add	esi,r12d
138	xor	r8d,esi
139	rol	r8d,12
140	add	edi,r13d
141	xor	r9d,edi
142	rol	r9d,12
143	add	eax,r8d
144	xor	r12d,eax
145	rol	r12d,8
146	add	ebx,r9d
147	xor	r13d,ebx
148	rol	r13d,8
149	add	esi,r12d
150	xor	r8d,esi
151	rol	r8d,7
152	add	edi,r13d
153	xor	r9d,edi
154	rol	r9d,7
; Swap the live pair: store words 8,9 and load words 10,11.
155	mov	DWORD[32+rsp],esi
156	mov	DWORD[36+rsp],edi
157	mov	esi,DWORD[40+rsp]
158	mov	edi,DWORD[44+rsp]
; Columns (2,6,10,14) and (3,7,11,15).
159	add	ecx,r10d
160	xor	r14d,ecx
161	rol	r14d,16
162	add	edx,r11d
163	xor	r15d,edx
164	rol	r15d,16
165	add	esi,r14d
166	xor	r10d,esi
167	rol	r10d,12
168	add	edi,r15d
169	xor	r11d,edi
170	rol	r11d,12
171	add	ecx,r10d
172	xor	r14d,ecx
173	rol	r14d,8
174	add	edx,r11d
175	xor	r15d,edx
176	rol	r15d,8
177	add	esi,r14d
178	xor	r10d,esi
179	rol	r10d,7
180	add	edi,r15d
181	xor	r11d,edi
182	rol	r11d,7
; Diagonals (0,5,10,15) and (1,6,11,12); esi/edi still hold words 10,11.
183	add	eax,r9d
184	xor	r15d,eax
185	rol	r15d,16
186	add	ebx,r10d
187	xor	r12d,ebx
188	rol	r12d,16
189	add	esi,r15d
190	xor	r9d,esi
191	rol	r9d,12
192	add	edi,r12d
193	xor	r10d,edi
194	rol	r10d,12
195	add	eax,r9d
196	xor	r15d,eax
197	rol	r15d,8
198	add	ebx,r10d
199	xor	r12d,ebx
200	rol	r12d,8
201	add	esi,r15d
202	xor	r9d,esi
203	rol	r9d,7
204	add	edi,r12d
205	xor	r10d,edi
206	rol	r10d,7
; Swap back: store words 10,11 and reload words 8,9.
207	mov	DWORD[40+rsp],esi
208	mov	DWORD[44+rsp],edi
209	mov	esi,DWORD[32+rsp]
210	mov	edi,DWORD[36+rsp]
; Diagonals (2,7,8,13) and (3,4,9,14).
211	add	ecx,r11d
212	xor	r13d,ecx
213	rol	r13d,16
214	add	edx,r8d
215	xor	r14d,edx
216	rol	r14d,16
217	add	esi,r13d
218	xor	r11d,esi
219	rol	r11d,12
220	add	edi,r14d
221	xor	r8d,edi
222	rol	r8d,12
223	add	ecx,r11d
224	xor	r13d,ecx
225	rol	r13d,8
226	add	edx,r8d
227	xor	r14d,edx
228	rol	r14d,8
229	add	esi,r13d
230	xor	r11d,esi
231	rol	r11d,7
232	add	edi,r14d
233	xor	r8d,edi
234	rol	r8d,7
235	dec	ebp
236	jnz	NEAR $L$oop
; Rounds finished: flush words 8,9 to the frame, restore length and pointers,
; and advance the counter in xmm3 for the next block.  [rsp+48] still holds
; this block's counter until it is rewritten further down.
237	mov	DWORD[36+rsp],edi
238	mov	DWORD[32+rsp],esi
239	mov	rbp,QWORD[64+rsp]
240	movdqa	xmm1,xmm2
241	mov	rsi,QWORD[((64+8))+rsp]
242	paddd	xmm3,xmm4
243	mov	rdi,QWORD[((64+16))+rsp]
244
; Feed-forward: add the block's input state to the working state.
; Words 8..11 are produced in xmm1 = original row 2 + working row 2.
245	add	eax,0x61707865
246	add	ebx,0x3320646e
247	add	ecx,0x79622d32
248	add	edx,0x6b206574
249	add	r8d,DWORD[16+rsp]
250	add	r9d,DWORD[20+rsp]
251	add	r10d,DWORD[24+rsp]
252	add	r11d,DWORD[28+rsp]
253	add	r12d,DWORD[48+rsp]
254	add	r13d,DWORD[52+rsp]
255	add	r14d,DWORD[56+rsp]
256	add	r15d,DWORD[60+rsp]
257	paddd	xmm1,XMMWORD[32+rsp]
258
; Fewer than 64 bytes left: handle them byte by byte.
259	cmp	rbp,64
260	jb	NEAR $L$tail
261
; Full block: XOR the 64 keystream bytes with the input.
262	xor	eax,DWORD[rsi]
263	xor	ebx,DWORD[4+rsi]
264	xor	ecx,DWORD[8+rsi]
265	xor	edx,DWORD[12+rsi]
266	xor	r8d,DWORD[16+rsi]
267	xor	r9d,DWORD[20+rsi]
268	xor	r10d,DWORD[24+rsi]
269	xor	r11d,DWORD[28+rsi]
270	movdqu	xmm0,XMMWORD[32+rsi]
271	xor	r12d,DWORD[48+rsi]
272	xor	r13d,DWORD[52+rsi]
273	xor	r14d,DWORD[56+rsi]
274	xor	r15d,DWORD[60+rsi]
275	lea	rsi,[64+rsi]
276	pxor	xmm0,xmm1
277
; Put the original words 8..11 back in the frame (the rounds overwrote them)
; and store the advanced counter dword for the next block.
278	movdqa	XMMWORD[32+rsp],xmm2
279	movd	DWORD[48+rsp],xmm3
280
; Write the 64 output bytes.
281	mov	DWORD[rdi],eax
282	mov	DWORD[4+rdi],ebx
283	mov	DWORD[8+rdi],ecx
284	mov	DWORD[12+rdi],edx
285	mov	DWORD[16+rdi],r8d
286	mov	DWORD[20+rdi],r9d
287	mov	DWORD[24+rdi],r10d
288	mov	DWORD[28+rdi],r11d
289	movdqu	XMMWORD[32+rdi],xmm0
290	mov	DWORD[48+rdi],r12d
291	mov	DWORD[52+rdi],r13d
292	mov	DWORD[56+rdi],r14d
293	mov	DWORD[60+rdi],r15d
294	lea	rdi,[64+rdi]
295
; Another block if any bytes remain.
296	sub	rbp,64
297	jnz	NEAR $L$oop_outer
298
299	jmp	NEAR $L$done
300
301ALIGN	16
; Partial final block (rbp is 1..63 here): dump the 64 keystream bytes to
; [rsp..rsp+63], then XOR rbp bytes one at a time.  rbx is the byte index.
302$L$tail:
303	mov	DWORD[rsp],eax
304	mov	DWORD[4+rsp],ebx
305	xor	rbx,rbx
306	mov	DWORD[8+rsp],ecx
307	mov	DWORD[12+rsp],edx
308	mov	DWORD[16+rsp],r8d
309	mov	DWORD[20+rsp],r9d
310	mov	DWORD[24+rsp],r10d
311	mov	DWORD[28+rsp],r11d
312	movdqa	XMMWORD[32+rsp],xmm1
313	mov	DWORD[48+rsp],r12d
314	mov	DWORD[52+rsp],r13d
315	mov	DWORD[56+rsp],r14d
316	mov	DWORD[60+rsp],r15d
317
318$L$oop_tail:
319	movzx	eax,BYTE[rbx*1+rsi]
320	movzx	edx,BYTE[rbx*1+rsp]
321	lea	rbx,[1+rbx]
322	xor	eax,edx
323	mov	BYTE[((-1))+rbx*1+rdi],al
324	dec	rbp
325	jnz	NEAR $L$oop_tail
326
; Epilogue of the integer path: rsi = rsp as it was before the six pushes;
; the saved registers sit at negative offsets from it.
327$L$done:
328	lea	rsi,[((64+24+48))+rsp]
329	mov	r15,QWORD[((-48))+rsi]
330
331	mov	r14,QWORD[((-40))+rsi]
332
333	mov	r13,QWORD[((-32))+rsi]
334
335	mov	r12,QWORD[((-24))+rsi]
336
337	mov	rbp,QWORD[((-16))+rsi]
338
339	mov	rbx,QWORD[((-8))+rsi]
340
341	lea	rsp,[rsi]
342
; Also reached directly for a zero-length request (rsp untouched in that
; case).  Restores rdi/rsi from the home slots written in the prologue.
343$L$no_data:
344	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
345	mov	rsi,QWORD[16+rsp]
346	ret
347
348$L$SEH_end_ChaCha20_ctr32:
349
; ---------------------------------------------------------------------------
; ChaCha20_ssse3 -- one 64-byte block per outer iteration, using xmm registers.
;
; The labelled entry ChaCha20_ssse3 performs the same Win64 prologue and
; argument remapping as ChaCha20_ctr32.  ChaCha20_ctr32 jumps straight to
; $L$ChaCha20_ssse3, i.e. past that prologue, with:
;   rdi = output, rsi = input, rdx = length (non-zero),
;   rcx = pointer to state words 4..11, r8 = pointer to state words 12..15.
; $L$do_sse3_after_all is a second internal entry used by the 4-block code;
; it expects r9 to already hold the entry rsp.
;
; Lengths above 128 bytes are forwarded to $L$ChaCha20_4x.
;
; Register use in the block loop: xmm0..xmm3 = state rows 0..3,
; xmm4/xmm5 = temporaries, xmm6 = $L$rot16 mask, xmm7 = $L$rot24 mask,
; r8 = round-loop counter, r9 = rsp at entry (frame anchor).
; Frame: [rsp+0..63] = the four input state rows (reused for the keystream
; in the tail); xmm6/xmm7 are saved at [r9-40] and [r9-24].
; ---------------------------------------------------------------------------
350ALIGN	32
351ChaCha20_ssse3:
352	mov	QWORD[8+rsp],rdi	;WIN64 prologue
353	mov	QWORD[16+rsp],rsi
354	mov	rax,rsp
355$L$SEH_begin_ChaCha20_ssse3:
356	mov	rdi,rcx
357	mov	rsi,rdx
358	mov	rdx,r8
359	mov	rcx,r9
360	mov	r8,QWORD[40+rsp]
361
362
363$L$ChaCha20_ssse3:
364
; r9 remembers the entry rsp so the epilogue can restore it.
365	mov	r9,rsp
366
; More than 128 bytes: use the multi-block code instead.
367	cmp	rdx,128
368	ja	NEAR $L$ChaCha20_4x
369
370$L$do_sse3_after_all:
; Reserve the frame and save xmm6/xmm7, which are overwritten below.
371	sub	rsp,64+40
372	movaps	XMMWORD[(-40)+r9],xmm6
373	movaps	XMMWORD[(-24)+r9],xmm7
374$L$ssse3_body:
; Load the four state rows and the two byte-rotation masks.
375	movdqa	xmm0,XMMWORD[$L$sigma]
376	movdqu	xmm1,XMMWORD[rcx]
377	movdqu	xmm2,XMMWORD[16+rcx]
378	movdqu	xmm3,XMMWORD[r8]
379	movdqa	xmm6,XMMWORD[$L$rot16]
380	movdqa	xmm7,XMMWORD[$L$rot24]
381
; Keep a copy of the input state for the feed-forward and for later blocks.
382	movdqa	XMMWORD[rsp],xmm0
383	movdqa	XMMWORD[16+rsp],xmm1
384	movdqa	XMMWORD[32+rsp],xmm2
385	movdqa	XMMWORD[48+rsp],xmm3
386	mov	r8,10
387	jmp	NEAR $L$oop_ssse3
388
389ALIGN	32
; Next block: reload rows 0..2, add (1,0,0,0) to row 3 and store it back.
390$L$oop_outer_ssse3:
391	movdqa	xmm3,XMMWORD[$L$one]
392	movdqa	xmm0,XMMWORD[rsp]
393	movdqa	xmm1,XMMWORD[16+rsp]
394	movdqa	xmm2,XMMWORD[32+rsp]
395	paddd	xmm3,XMMWORD[48+rsp]
396	mov	r8,10
397	movdqa	XMMWORD[48+rsp],xmm3
398	jmp	NEAR $L$oop_ssse3
399
400ALIGN	32
; Round loop: 10 iterations of (column round, diagonal round).
; The DB sequences are hand-encoded pshufb instructions:
;   102,15,56,0,222 = pshufb xmm3,xmm6  (rotate each dword left by 16)
;   102,15,56,0,223 = pshufb xmm3,xmm7  (rotate each dword left by 8)
; Rotations by 12 and 7 are done with a shift pair and por via xmm4.
401$L$oop_ssse3:
; Column round on all four columns at once.
402	paddd	xmm0,xmm1
403	pxor	xmm3,xmm0
404DB	102,15,56,0,222
405	paddd	xmm2,xmm3
406	pxor	xmm1,xmm2
407	movdqa	xmm4,xmm1
408	psrld	xmm1,20
409	pslld	xmm4,12
410	por	xmm1,xmm4
411	paddd	xmm0,xmm1
412	pxor	xmm3,xmm0
413DB	102,15,56,0,223
414	paddd	xmm2,xmm3
415	pxor	xmm1,xmm2
416	movdqa	xmm4,xmm1
417	psrld	xmm1,25
418	pslld	xmm4,7
419	por	xmm1,xmm4
; Rotate the lanes of rows 1..3 (by 1, 2 and 3 dwords) so that the same
; column code now operates on the diagonals.
420	pshufd	xmm2,xmm2,78
421	pshufd	xmm1,xmm1,57
422	pshufd	xmm3,xmm3,147
423	nop
; Diagonal round.
424	paddd	xmm0,xmm1
425	pxor	xmm3,xmm0
426DB	102,15,56,0,222
427	paddd	xmm2,xmm3
428	pxor	xmm1,xmm2
429	movdqa	xmm4,xmm1
430	psrld	xmm1,20
431	pslld	xmm4,12
432	por	xmm1,xmm4
433	paddd	xmm0,xmm1
434	pxor	xmm3,xmm0
435DB	102,15,56,0,223
436	paddd	xmm2,xmm3
437	pxor	xmm1,xmm2
438	movdqa	xmm4,xmm1
439	psrld	xmm1,25
440	pslld	xmm4,7
441	por	xmm1,xmm4
; Undo the lane rotation (inverse shuffles for rows 1 and 3).
442	pshufd	xmm2,xmm2,78
443	pshufd	xmm1,xmm1,147
444	pshufd	xmm3,xmm3,57
445	dec	r8
446	jnz	NEAR $L$oop_ssse3
; Feed-forward: add the saved input state to obtain the keystream block.
447	paddd	xmm0,XMMWORD[rsp]
448	paddd	xmm1,XMMWORD[16+rsp]
449	paddd	xmm2,XMMWORD[32+rsp]
450	paddd	xmm3,XMMWORD[48+rsp]
451
; Fewer than 64 bytes left: finish byte by byte.
452	cmp	rdx,64
453	jb	NEAR $L$tail_ssse3
454
; Full block: XOR 64 input bytes with the keystream and store them.
455	movdqu	xmm4,XMMWORD[rsi]
456	movdqu	xmm5,XMMWORD[16+rsi]
457	pxor	xmm0,xmm4
458	movdqu	xmm4,XMMWORD[32+rsi]
459	pxor	xmm1,xmm5
460	movdqu	xmm5,XMMWORD[48+rsi]
461	lea	rsi,[64+rsi]
462	pxor	xmm2,xmm4
463	pxor	xmm3,xmm5
464
465	movdqu	XMMWORD[rdi],xmm0
466	movdqu	XMMWORD[16+rdi],xmm1
467	movdqu	XMMWORD[32+rdi],xmm2
468	movdqu	XMMWORD[48+rdi],xmm3
469	lea	rdi,[64+rdi]
470
471	sub	rdx,64
472	jnz	NEAR $L$oop_outer_ssse3
473
474	jmp	NEAR $L$done_ssse3
475
476ALIGN	16
; Partial final block (rdx is 1..63 here): write the keystream over the
; saved state at [rsp..rsp+63] and XOR rdx bytes one at a time.
; r8 is the byte index.
477$L$tail_ssse3:
478	movdqa	XMMWORD[rsp],xmm0
479	movdqa	XMMWORD[16+rsp],xmm1
480	movdqa	XMMWORD[32+rsp],xmm2
481	movdqa	XMMWORD[48+rsp],xmm3
482	xor	r8,r8
483
484$L$oop_tail_ssse3:
485	movzx	eax,BYTE[r8*1+rsi]
486	movzx	ecx,BYTE[r8*1+rsp]
487	lea	r8,[1+r8]
488	xor	eax,ecx
489	mov	BYTE[((-1))+r8*1+rdi],al
490	dec	rdx
491	jnz	NEAR $L$oop_tail_ssse3
492
; Restore xmm6/xmm7 and the entry rsp, then rdi/rsi from the home slots.
493$L$done_ssse3:
494	movaps	xmm6,XMMWORD[((-40))+r9]
495	movaps	xmm7,XMMWORD[((-24))+r9]
496	lea	rsp,[r9]
497
498$L$ssse3_epilogue:
499	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
500	mov	rsi,QWORD[16+rsp]
501	ret
502
503$L$SEH_end_ChaCha20_ssse3:
504
; ---------------------------------------------------------------------------
; ChaCha20_4x -- four 64-byte blocks per outer iteration, using xmm registers.
;
; The labelled entry ChaCha20_4x performs the Win64 prologue and argument
; remapping.  The SSSE3 code jumps to $L$ChaCha20_4x (past that prologue)
; when the length exceeds 128 bytes, with:
;   rdi = output, rsi = input, rdx = length,
;   rcx = pointer to state words 4..11, r8 = pointer to state words 12..15,
;   r10 = the qword read from OPENSSL_ia32cap_P+4 by ChaCha20_ctr32.
;
; Dispatch at $L$ChaCha20_4x:
;   * bit 5 of the upper dword of r10 set (presumably AVX2)
;       -> $L$ChaCha20_8x
;   * length <= 192 and, in the lower dword, bit 22 set with bit 26 clear
;     (presumably MOVBE without XSAVE) -> back to $L$do_sse3_after_all
;   * otherwise -> $L$proceed4x
;
; Data layout: every xmm register holds ONE state word for FOUR blocks
; (lane i belongs to block i).  In the round loop
;   xmm8..xmm11  = words 0..3,   xmm12..xmm15 = words 4..7,
;   xmm4,xmm5    = two of words 8..11 (the other two are spilled),
;   xmm0..xmm3   = words 12..15,
;   xmm6,xmm7    = temporaries / pshufb masks reloaded from [r11] / [r10].
; Frame (rcx = rsp+256 is used as a second base register):
;   [rsp+  0.. 63]  spill slots for words 8..11, later transposed output
;   [rsp+ 64..127]  broadcast words 0..3
;   [rsp+128..255]  broadcast words 4..11
;   [rsp+256..319]  word 12 (per-lane counters) and broadcast words 13..15
;   xmm6..xmm15 are saved at [r9-168] .. [r9-24], r9 = rsp at entry.
; ---------------------------------------------------------------------------
505ALIGN	32
506ChaCha20_4x:
507	mov	QWORD[8+rsp],rdi	;WIN64 prologue
508	mov	QWORD[16+rsp],rsi
509	mov	rax,rsp
510$L$SEH_begin_ChaCha20_4x:
511	mov	rdi,rcx
512	mov	rsi,rdx
513	mov	rdx,r8
514	mov	rcx,r9
515	mov	r8,QWORD[40+rsp]
516
517
518$L$ChaCha20_4x:
519
; r9 remembers the entry rsp so the epilogue can restore it.
520	mov	r9,rsp
521
; Capability dispatch (see header): r11 keeps the full qword, r10 is
; shifted down to the upper dword.
522	mov	r11,r10
523	shr	r10,32
524	test	r10,32
525	jnz	NEAR $L$ChaCha20_8x
526	cmp	rdx,192
527	ja	NEAR $L$proceed4x
528
; 71303168 = (1<<26)|(1<<22), 4194304 = 1<<22.
529	and	r11,71303168
530	cmp	r11,4194304
531	je	NEAR $L$do_sse3_after_all
532
533$L$proceed4x:
; Reserve the frame and save xmm6..xmm15, all of which are used below.
534	sub	rsp,0x140+168
535	movaps	XMMWORD[(-168)+r9],xmm6
536	movaps	XMMWORD[(-152)+r9],xmm7
537	movaps	XMMWORD[(-136)+r9],xmm8
538	movaps	XMMWORD[(-120)+r9],xmm9
539	movaps	XMMWORD[(-104)+r9],xmm10
540	movaps	XMMWORD[(-88)+r9],xmm11
541	movaps	XMMWORD[(-72)+r9],xmm12
542	movaps	XMMWORD[(-56)+r9],xmm13
543	movaps	XMMWORD[(-40)+r9],xmm14
544	movaps	XMMWORD[(-24)+r9],xmm15
545$L$4x_body:
; Load the four state rows; rcx is repurposed as frame base rsp+256 and
; r10/r11 as pointers to the two pshufb masks.
546	movdqa	xmm11,XMMWORD[$L$sigma]
547	movdqu	xmm15,XMMWORD[rcx]
548	movdqu	xmm7,XMMWORD[16+rcx]
549	movdqu	xmm3,XMMWORD[r8]
550	lea	rcx,[256+rsp]
551	lea	r10,[$L$rot16]
552	lea	r11,[$L$rot24]
553
; Broadcast words 0..3 to all four lanes and save them.
554	pshufd	xmm8,xmm11,0x00
555	pshufd	xmm9,xmm11,0x55
556	movdqa	XMMWORD[64+rsp],xmm8
557	pshufd	xmm10,xmm11,0xaa
558	movdqa	XMMWORD[80+rsp],xmm9
559	pshufd	xmm11,xmm11,0xff
560	movdqa	XMMWORD[96+rsp],xmm10
561	movdqa	XMMWORD[112+rsp],xmm11
562
; Broadcast words 4..7.
563	pshufd	xmm12,xmm15,0x00
564	pshufd	xmm13,xmm15,0x55
565	movdqa	XMMWORD[(128-256)+rcx],xmm12
566	pshufd	xmm14,xmm15,0xaa
567	movdqa	XMMWORD[(144-256)+rcx],xmm13
568	pshufd	xmm15,xmm15,0xff
569	movdqa	XMMWORD[(160-256)+rcx],xmm14
570	movdqa	XMMWORD[(176-256)+rcx],xmm15
571
; Broadcast words 8..11.
572	pshufd	xmm4,xmm7,0x00
573	pshufd	xmm5,xmm7,0x55
574	movdqa	XMMWORD[(192-256)+rcx],xmm4
575	pshufd	xmm6,xmm7,0xaa
576	movdqa	XMMWORD[(208-256)+rcx],xmm5
577	pshufd	xmm7,xmm7,0xff
578	movdqa	XMMWORD[(224-256)+rcx],xmm6
579	movdqa	XMMWORD[(240-256)+rcx],xmm7
580
; Broadcast words 12..15; word 12 gets lane offsets 0,1,2,3 so each lane
; has its own block counter (stored to the frame at $L$oop_enter4x).
581	pshufd	xmm0,xmm3,0x00
582	pshufd	xmm1,xmm3,0x55
583	paddd	xmm0,XMMWORD[$L$inc]
584	pshufd	xmm2,xmm3,0xaa
585	movdqa	XMMWORD[(272-256)+rcx],xmm1
586	pshufd	xmm3,xmm3,0xff
587	movdqa	XMMWORD[(288-256)+rcx],xmm2
588	movdqa	XMMWORD[(304-256)+rcx],xmm3
589
590	jmp	NEAR $L$oop_enter4x
591
592ALIGN	32
; Next group of four blocks: reload the saved state and advance all four
; counters by 4.
593$L$oop_outer4x:
594	movdqa	xmm8,XMMWORD[64+rsp]
595	movdqa	xmm9,XMMWORD[80+rsp]
596	movdqa	xmm10,XMMWORD[96+rsp]
597	movdqa	xmm11,XMMWORD[112+rsp]
598	movdqa	xmm12,XMMWORD[((128-256))+rcx]
599	movdqa	xmm13,XMMWORD[((144-256))+rcx]
600	movdqa	xmm14,XMMWORD[((160-256))+rcx]
601	movdqa	xmm15,XMMWORD[((176-256))+rcx]
602	movdqa	xmm4,XMMWORD[((192-256))+rcx]
603	movdqa	xmm5,XMMWORD[((208-256))+rcx]
604	movdqa	xmm6,XMMWORD[((224-256))+rcx]
605	movdqa	xmm7,XMMWORD[((240-256))+rcx]
606	movdqa	xmm0,XMMWORD[((256-256))+rcx]
607	movdqa	xmm1,XMMWORD[((272-256))+rcx]
608	movdqa	xmm2,XMMWORD[((288-256))+rcx]
609	movdqa	xmm3,XMMWORD[((304-256))+rcx]
610	paddd	xmm0,XMMWORD[$L$four]
611
; Spill words 10,11 (xmm6/xmm7 are needed as temporaries), load the rot16
; mask, record the counters used for this group, and set eax = 10 iterations.
612$L$oop_enter4x:
613	movdqa	XMMWORD[32+rsp],xmm6
614	movdqa	XMMWORD[48+rsp],xmm7
615	movdqa	xmm7,XMMWORD[r10]
616	mov	eax,10
617	movdqa	XMMWORD[(256-256)+rcx],xmm0
618	jmp	NEAR $L$oop4x
619
620ALIGN	32
; Round loop: each iteration is a column round followed by a diagonal round,
; processing two quarter-rounds at a time.  "DB 102,15,56,0,M" is a
; hand-encoded pshufb whose destination/source come from M:
;   199/207/215/223 = pshufb xmm0/xmm1/xmm2/xmm3,xmm7  (rot16 mask)
;   198/206/214/222 = pshufb xmm0/xmm1/xmm2/xmm3,xmm6  (rot24 mask)
; Because xmm6/xmm7 also serve as shift temporaries for the 12- and 7-bit
; rotations, the masks are reloaded from [r11]/[r10] before each use.
621$L$oop4x:
; Columns (0,4,8,12) and (1,5,9,13).
622	paddd	xmm8,xmm12
623	paddd	xmm9,xmm13
624	pxor	xmm0,xmm8
625	pxor	xmm1,xmm9
626DB	102,15,56,0,199
627DB	102,15,56,0,207
628	paddd	xmm4,xmm0
629	paddd	xmm5,xmm1
630	pxor	xmm12,xmm4
631	pxor	xmm13,xmm5
632	movdqa	xmm6,xmm12
633	pslld	xmm12,12
634	psrld	xmm6,20
635	movdqa	xmm7,xmm13
636	pslld	xmm13,12
637	por	xmm12,xmm6
638	psrld	xmm7,20
639	movdqa	xmm6,XMMWORD[r11]
640	por	xmm13,xmm7
641	paddd	xmm8,xmm12
642	paddd	xmm9,xmm13
643	pxor	xmm0,xmm8
644	pxor	xmm1,xmm9
645DB	102,15,56,0,198
646DB	102,15,56,0,206
647	paddd	xmm4,xmm0
648	paddd	xmm5,xmm1
649	pxor	xmm12,xmm4
650	pxor	xmm13,xmm5
651	movdqa	xmm7,xmm12
652	pslld	xmm12,7
653	psrld	xmm7,25
654	movdqa	xmm6,xmm13
655	pslld	xmm13,7
656	por	xmm12,xmm7
657	psrld	xmm6,25
658	movdqa	xmm7,XMMWORD[r10]
659	por	xmm13,xmm6
; Swap the live pair: park words 8,9 and bring in words 10,11.
660	movdqa	XMMWORD[rsp],xmm4
661	movdqa	XMMWORD[16+rsp],xmm5
662	movdqa	xmm4,XMMWORD[32+rsp]
663	movdqa	xmm5,XMMWORD[48+rsp]
; Columns (2,6,10,14) and (3,7,11,15).
664	paddd	xmm10,xmm14
665	paddd	xmm11,xmm15
666	pxor	xmm2,xmm10
667	pxor	xmm3,xmm11
668DB	102,15,56,0,215
669DB	102,15,56,0,223
670	paddd	xmm4,xmm2
671	paddd	xmm5,xmm3
672	pxor	xmm14,xmm4
673	pxor	xmm15,xmm5
674	movdqa	xmm6,xmm14
675	pslld	xmm14,12
676	psrld	xmm6,20
677	movdqa	xmm7,xmm15
678	pslld	xmm15,12
679	por	xmm14,xmm6
680	psrld	xmm7,20
681	movdqa	xmm6,XMMWORD[r11]
682	por	xmm15,xmm7
683	paddd	xmm10,xmm14
684	paddd	xmm11,xmm15
685	pxor	xmm2,xmm10
686	pxor	xmm3,xmm11
687DB	102,15,56,0,214
688DB	102,15,56,0,222
689	paddd	xmm4,xmm2
690	paddd	xmm5,xmm3
691	pxor	xmm14,xmm4
692	pxor	xmm15,xmm5
693	movdqa	xmm7,xmm14
694	pslld	xmm14,7
695	psrld	xmm7,25
696	movdqa	xmm6,xmm15
697	pslld	xmm15,7
698	por	xmm14,xmm7
699	psrld	xmm6,25
700	movdqa	xmm7,XMMWORD[r10]
701	por	xmm15,xmm6
; Diagonals (0,5,10,15) and (1,6,11,12); xmm4/xmm5 still hold words 10,11.
702	paddd	xmm8,xmm13
703	paddd	xmm9,xmm14
704	pxor	xmm3,xmm8
705	pxor	xmm0,xmm9
706DB	102,15,56,0,223
707DB	102,15,56,0,199
708	paddd	xmm4,xmm3
709	paddd	xmm5,xmm0
710	pxor	xmm13,xmm4
711	pxor	xmm14,xmm5
712	movdqa	xmm6,xmm13
713	pslld	xmm13,12
714	psrld	xmm6,20
715	movdqa	xmm7,xmm14
716	pslld	xmm14,12
717	por	xmm13,xmm6
718	psrld	xmm7,20
719	movdqa	xmm6,XMMWORD[r11]
720	por	xmm14,xmm7
721	paddd	xmm8,xmm13
722	paddd	xmm9,xmm14
723	pxor	xmm3,xmm8
724	pxor	xmm0,xmm9
725DB	102,15,56,0,222
726DB	102,15,56,0,198
727	paddd	xmm4,xmm3
728	paddd	xmm5,xmm0
729	pxor	xmm13,xmm4
730	pxor	xmm14,xmm5
731	movdqa	xmm7,xmm13
732	pslld	xmm13,7
733	psrld	xmm7,25
734	movdqa	xmm6,xmm14
735	pslld	xmm14,7
736	por	xmm13,xmm7
737	psrld	xmm6,25
738	movdqa	xmm7,XMMWORD[r10]
739	por	xmm14,xmm6
; Swap back: park words 10,11 and bring in words 8,9.
740	movdqa	XMMWORD[32+rsp],xmm4
741	movdqa	XMMWORD[48+rsp],xmm5
742	movdqa	xmm4,XMMWORD[rsp]
743	movdqa	xmm5,XMMWORD[16+rsp]
; Diagonals (2,7,8,13) and (3,4,9,14).
744	paddd	xmm10,xmm15
745	paddd	xmm11,xmm12
746	pxor	xmm1,xmm10
747	pxor	xmm2,xmm11
748DB	102,15,56,0,207
749DB	102,15,56,0,215
750	paddd	xmm4,xmm1
751	paddd	xmm5,xmm2
752	pxor	xmm15,xmm4
753	pxor	xmm12,xmm5
754	movdqa	xmm6,xmm15
755	pslld	xmm15,12
756	psrld	xmm6,20
757	movdqa	xmm7,xmm12
758	pslld	xmm12,12
759	por	xmm15,xmm6
760	psrld	xmm7,20
761	movdqa	xmm6,XMMWORD[r11]
762	por	xmm12,xmm7
763	paddd	xmm10,xmm15
764	paddd	xmm11,xmm12
765	pxor	xmm1,xmm10
766	pxor	xmm2,xmm11
767DB	102,15,56,0,206
768DB	102,15,56,0,214
769	paddd	xmm4,xmm1
770	paddd	xmm5,xmm2
771	pxor	xmm15,xmm4
772	pxor	xmm12,xmm5
773	movdqa	xmm7,xmm15
774	pslld	xmm15,7
775	psrld	xmm7,25
776	movdqa	xmm6,xmm12
777	pslld	xmm12,7
778	por	xmm15,xmm7
779	psrld	xmm6,25
780	movdqa	xmm7,XMMWORD[r10]
781	por	xmm12,xmm6
782	dec	eax
783	jnz	NEAR $L$oop4x
784
; Feed-forward and transpose.  Each group of four word-registers gets the
; saved input state added and is then transposed (punpck*dq / punpck*qdq)
; from "one word x four blocks" into "four consecutive words of one block".
; Words 0..3:
785	paddd	xmm8,XMMWORD[64+rsp]
786	paddd	xmm9,XMMWORD[80+rsp]
787	paddd	xmm10,XMMWORD[96+rsp]
788	paddd	xmm11,XMMWORD[112+rsp]
789
790	movdqa	xmm6,xmm8
791	punpckldq	xmm8,xmm9
792	movdqa	xmm7,xmm10
793	punpckldq	xmm10,xmm11
794	punpckhdq	xmm6,xmm9
795	punpckhdq	xmm7,xmm11
796	movdqa	xmm9,xmm8
797	punpcklqdq	xmm8,xmm10
798	movdqa	xmm11,xmm6
799	punpcklqdq	xmm6,xmm7
800	punpckhqdq	xmm9,xmm10
801	punpckhqdq	xmm11,xmm7
; Words 4..7:
802	paddd	xmm12,XMMWORD[((128-256))+rcx]
803	paddd	xmm13,XMMWORD[((144-256))+rcx]
804	paddd	xmm14,XMMWORD[((160-256))+rcx]
805	paddd	xmm15,XMMWORD[((176-256))+rcx]
806
; Park the first two transposed rows of words 0..3 and fetch the spilled
; words 10,11 into xmm8/xmm9.
807	movdqa	XMMWORD[rsp],xmm8
808	movdqa	XMMWORD[16+rsp],xmm9
809	movdqa	xmm8,XMMWORD[32+rsp]
810	movdqa	xmm9,XMMWORD[48+rsp]
811
812	movdqa	xmm10,xmm12
813	punpckldq	xmm12,xmm13
814	movdqa	xmm7,xmm14
815	punpckldq	xmm14,xmm15
816	punpckhdq	xmm10,xmm13
817	punpckhdq	xmm7,xmm15
818	movdqa	xmm13,xmm12
819	punpcklqdq	xmm12,xmm14
820	movdqa	xmm15,xmm10
821	punpcklqdq	xmm10,xmm7
822	punpckhqdq	xmm13,xmm14
823	punpckhqdq	xmm15,xmm7
; Words 8..11 (xmm4, xmm5, xmm8, xmm9):
824	paddd	xmm4,XMMWORD[((192-256))+rcx]
825	paddd	xmm5,XMMWORD[((208-256))+rcx]
826	paddd	xmm8,XMMWORD[((224-256))+rcx]
827	paddd	xmm9,XMMWORD[((240-256))+rcx]
828
; Park the remaining two transposed rows of words 0..3.
829	movdqa	XMMWORD[32+rsp],xmm6
830	movdqa	XMMWORD[48+rsp],xmm11
831
832	movdqa	xmm14,xmm4
833	punpckldq	xmm4,xmm5
834	movdqa	xmm7,xmm8
835	punpckldq	xmm8,xmm9
836	punpckhdq	xmm14,xmm5
837	punpckhdq	xmm7,xmm9
838	movdqa	xmm5,xmm4
839	punpcklqdq	xmm4,xmm8
840	movdqa	xmm9,xmm14
841	punpcklqdq	xmm14,xmm7
842	punpckhqdq	xmm5,xmm8
843	punpckhqdq	xmm9,xmm7
; Words 12..15 ([rcx+0] holds the counters used for this group):
844	paddd	xmm0,XMMWORD[((256-256))+rcx]
845	paddd	xmm1,XMMWORD[((272-256))+rcx]
846	paddd	xmm2,XMMWORD[((288-256))+rcx]
847	paddd	xmm3,XMMWORD[((304-256))+rcx]
848
849	movdqa	xmm8,xmm0
850	punpckldq	xmm0,xmm1
851	movdqa	xmm7,xmm2
852	punpckldq	xmm2,xmm3
853	punpckhdq	xmm8,xmm1
854	punpckhdq	xmm7,xmm3
855	movdqa	xmm1,xmm0
856	punpcklqdq	xmm0,xmm2
857	movdqa	xmm3,xmm8
858	punpcklqdq	xmm8,xmm7
859	punpckhqdq	xmm1,xmm2
860	punpckhqdq	xmm3,xmm7
; Keystream is now laid out per block as
;   block 0: [rsp],    xmm12, xmm4,  xmm0
;   block 1: [rsp+16], xmm13, xmm5,  xmm1
;   block 2: [rsp+32], xmm10, xmm14, xmm8
;   block 3: [rsp+48], xmm15, xmm9,  xmm3
; Fewer than 256 bytes left -> partial handling in $L$tail4x.
861	cmp	rdx,64*4
862	jb	NEAR $L$tail4x
863
; Full 256 bytes: XOR and store block 0 ...
864	movdqu	xmm6,XMMWORD[rsi]
865	movdqu	xmm11,XMMWORD[16+rsi]
866	movdqu	xmm2,XMMWORD[32+rsi]
867	movdqu	xmm7,XMMWORD[48+rsi]
868	pxor	xmm6,XMMWORD[rsp]
869	pxor	xmm11,xmm12
870	pxor	xmm2,xmm4
871	pxor	xmm7,xmm0
872
; ... block 1 (loads for it are interleaved with the stores of block 0) ...
873	movdqu	XMMWORD[rdi],xmm6
874	movdqu	xmm6,XMMWORD[64+rsi]
875	movdqu	XMMWORD[16+rdi],xmm11
876	movdqu	xmm11,XMMWORD[80+rsi]
877	movdqu	XMMWORD[32+rdi],xmm2
878	movdqu	xmm2,XMMWORD[96+rsi]
879	movdqu	XMMWORD[48+rdi],xmm7
880	movdqu	xmm7,XMMWORD[112+rsi]
881	lea	rsi,[128+rsi]
882	pxor	xmm6,XMMWORD[16+rsp]
883	pxor	xmm11,xmm13
884	pxor	xmm2,xmm5
885	pxor	xmm7,xmm1
886
; ... block 2 ...
887	movdqu	XMMWORD[64+rdi],xmm6
888	movdqu	xmm6,XMMWORD[rsi]
889	movdqu	XMMWORD[80+rdi],xmm11
890	movdqu	xmm11,XMMWORD[16+rsi]
891	movdqu	XMMWORD[96+rdi],xmm2
892	movdqu	xmm2,XMMWORD[32+rsi]
893	movdqu	XMMWORD[112+rdi],xmm7
894	lea	rdi,[128+rdi]
895	movdqu	xmm7,XMMWORD[48+rsi]
896	pxor	xmm6,XMMWORD[32+rsp]
897	pxor	xmm11,xmm10
898	pxor	xmm2,xmm14
899	pxor	xmm7,xmm8
900
; ... block 3.
901	movdqu	XMMWORD[rdi],xmm6
902	movdqu	xmm6,XMMWORD[64+rsi]
903	movdqu	XMMWORD[16+rdi],xmm11
904	movdqu	xmm11,XMMWORD[80+rsi]
905	movdqu	XMMWORD[32+rdi],xmm2
906	movdqu	xmm2,XMMWORD[96+rsi]
907	movdqu	XMMWORD[48+rdi],xmm7
908	movdqu	xmm7,XMMWORD[112+rsi]
909	lea	rsi,[128+rsi]
910	pxor	xmm6,XMMWORD[48+rsp]
911	pxor	xmm11,xmm15
912	pxor	xmm2,xmm9
913	pxor	xmm7,xmm3
914	movdqu	XMMWORD[64+rdi],xmm6
915	movdqu	XMMWORD[80+rdi],xmm11
916	movdqu	XMMWORD[96+rdi],xmm2
917	movdqu	XMMWORD[112+rdi],xmm7
918	lea	rdi,[128+rdi]
919
; Another group of four blocks if any bytes remain.
920	sub	rdx,64*4
921	jnz	NEAR $L$oop_outer4x
922
923	jmp	NEAR $L$done4x
924
; Fewer than 256 bytes remain.  Process as many whole 64-byte blocks as fit,
; then leave the next block's keystream at [rsp..rsp+63] for the byte loop.
925$L$tail4x:
926	cmp	rdx,192
927	jae	NEAR $L$192_or_more4x
928	cmp	rdx,128
929	jae	NEAR $L$128_or_more4x
930	cmp	rdx,64
931	jae	NEAR $L$64_or_more4x
932
933
; Under 64 bytes: block 0's keystream ([rsp] already holds its first row).
934	xor	r10,r10
935
936	movdqa	XMMWORD[16+rsp],xmm12
937	movdqa	XMMWORD[32+rsp],xmm4
938	movdqa	XMMWORD[48+rsp],xmm0
939	jmp	NEAR $L$oop_tail4x
940
941ALIGN	32
; 64..127 bytes: one whole block, then the remainder with block 1's keystream.
942$L$64_or_more4x:
943	movdqu	xmm6,XMMWORD[rsi]
944	movdqu	xmm11,XMMWORD[16+rsi]
945	movdqu	xmm2,XMMWORD[32+rsi]
946	movdqu	xmm7,XMMWORD[48+rsi]
947	pxor	xmm6,XMMWORD[rsp]
948	pxor	xmm11,xmm12
949	pxor	xmm2,xmm4
950	pxor	xmm7,xmm0
951	movdqu	XMMWORD[rdi],xmm6
952	movdqu	XMMWORD[16+rdi],xmm11
953	movdqu	XMMWORD[32+rdi],xmm2
954	movdqu	XMMWORD[48+rdi],xmm7
; Flags are still those of "cmp rdx,64" in $L$tail4x (the SSE moves/xors in
; between do not modify them): exactly 64 bytes means we are done.
955	je	NEAR $L$done4x
956
957	movdqa	xmm6,XMMWORD[16+rsp]
958	lea	rsi,[64+rsi]
959	xor	r10,r10
960	movdqa	XMMWORD[rsp],xmm6
961	movdqa	XMMWORD[16+rsp],xmm13
962	lea	rdi,[64+rdi]
963	movdqa	XMMWORD[32+rsp],xmm5
964	sub	rdx,64
965	movdqa	XMMWORD[48+rsp],xmm1
966	jmp	NEAR $L$oop_tail4x
967
968ALIGN	32
; 128..191 bytes: two whole blocks, then the remainder with block 2's
; keystream.
969$L$128_or_more4x:
970	movdqu	xmm6,XMMWORD[rsi]
971	movdqu	xmm11,XMMWORD[16+rsi]
972	movdqu	xmm2,XMMWORD[32+rsi]
973	movdqu	xmm7,XMMWORD[48+rsi]
974	pxor	xmm6,XMMWORD[rsp]
975	pxor	xmm11,xmm12
976	pxor	xmm2,xmm4
977	pxor	xmm7,xmm0
978
979	movdqu	XMMWORD[rdi],xmm6
980	movdqu	xmm6,XMMWORD[64+rsi]
981	movdqu	XMMWORD[16+rdi],xmm11
982	movdqu	xmm11,XMMWORD[80+rsi]
983	movdqu	XMMWORD[32+rdi],xmm2
984	movdqu	xmm2,XMMWORD[96+rsi]
985	movdqu	XMMWORD[48+rdi],xmm7
986	movdqu	xmm7,XMMWORD[112+rsi]
987	pxor	xmm6,XMMWORD[16+rsp]
988	pxor	xmm11,xmm13
989	pxor	xmm2,xmm5
990	pxor	xmm7,xmm1
991	movdqu	XMMWORD[64+rdi],xmm6
992	movdqu	XMMWORD[80+rdi],xmm11
993	movdqu	XMMWORD[96+rdi],xmm2
994	movdqu	XMMWORD[112+rdi],xmm7
; Flags are still those of "cmp rdx,128" in $L$tail4x: exactly 128 bytes
; means we are done.
995	je	NEAR $L$done4x
996
997	movdqa	xmm6,XMMWORD[32+rsp]
998	lea	rsi,[128+rsi]
999	xor	r10,r10
1000	movdqa	XMMWORD[rsp],xmm6
1001	movdqa	XMMWORD[16+rsp],xmm10
1002	lea	rdi,[128+rdi]
1003	movdqa	XMMWORD[32+rsp],xmm14
1004	sub	rdx,128
1005	movdqa	XMMWORD[48+rsp],xmm8
1006	jmp	NEAR $L$oop_tail4x
1007
1008ALIGN	32
; 192..255 bytes: three whole blocks, then the remainder with block 3's
; keystream.
1009$L$192_or_more4x:
1010	movdqu	xmm6,XMMWORD[rsi]
1011	movdqu	xmm11,XMMWORD[16+rsi]
1012	movdqu	xmm2,XMMWORD[32+rsi]
1013	movdqu	xmm7,XMMWORD[48+rsi]
1014	pxor	xmm6,XMMWORD[rsp]
1015	pxor	xmm11,xmm12
1016	pxor	xmm2,xmm4
1017	pxor	xmm7,xmm0
1018
1019	movdqu	XMMWORD[rdi],xmm6
1020	movdqu	xmm6,XMMWORD[64+rsi]
1021	movdqu	XMMWORD[16+rdi],xmm11
1022	movdqu	xmm11,XMMWORD[80+rsi]
1023	movdqu	XMMWORD[32+rdi],xmm2
1024	movdqu	xmm2,XMMWORD[96+rsi]
1025	movdqu	XMMWORD[48+rdi],xmm7
1026	movdqu	xmm7,XMMWORD[112+rsi]
1027	lea	rsi,[128+rsi]
1028	pxor	xmm6,XMMWORD[16+rsp]
1029	pxor	xmm11,xmm13
1030	pxor	xmm2,xmm5
1031	pxor	xmm7,xmm1
1032
1033	movdqu	XMMWORD[64+rdi],xmm6
1034	movdqu	xmm6,XMMWORD[rsi]
1035	movdqu	XMMWORD[80+rdi],xmm11
1036	movdqu	xmm11,XMMWORD[16+rsi]
1037	movdqu	XMMWORD[96+rdi],xmm2
1038	movdqu	xmm2,XMMWORD[32+rsi]
1039	movdqu	XMMWORD[112+rdi],xmm7
1040	lea	rdi,[128+rdi]
1041	movdqu	xmm7,XMMWORD[48+rsi]
1042	pxor	xmm6,XMMWORD[32+rsp]
1043	pxor	xmm11,xmm10
1044	pxor	xmm2,xmm14
1045	pxor	xmm7,xmm8
1046	movdqu	XMMWORD[rdi],xmm6
1047	movdqu	XMMWORD[16+rdi],xmm11
1048	movdqu	XMMWORD[32+rdi],xmm2
1049	movdqu	XMMWORD[48+rdi],xmm7
; Flags are still those of "cmp rdx,192" in $L$tail4x (lea and the SSE
; instructions in between leave them untouched): exactly 192 bytes means we
; are done.
1050	je	NEAR $L$done4x
1051
1052	movdqa	xmm6,XMMWORD[48+rsp]
1053	lea	rsi,[64+rsi]
1054	xor	r10,r10
1055	movdqa	XMMWORD[rsp],xmm6
1056	movdqa	XMMWORD[16+rsp],xmm15
1057	lea	rdi,[64+rdi]
1058	movdqa	XMMWORD[32+rsp],xmm9
1059	sub	rdx,192
1060	movdqa	XMMWORD[48+rsp],xmm3
1061
; Byte loop for the last rdx (1..63) bytes: out[i] = in[i] ^ keystream[i],
; keystream at [rsp..], r10 = byte index.
1062$L$oop_tail4x:
1063	movzx	eax,BYTE[r10*1+rsi]
1064	movzx	ecx,BYTE[r10*1+rsp]
1065	lea	r10,[1+r10]
1066	xor	eax,ecx
1067	mov	BYTE[((-1))+r10*1+rdi],al
1068	dec	rdx
1069	jnz	NEAR $L$oop_tail4x
1070
; Restore xmm6..xmm15 and the entry rsp, then rdi/rsi from the home slots.
1071$L$done4x:
1072	movaps	xmm6,XMMWORD[((-168))+r9]
1073	movaps	xmm7,XMMWORD[((-152))+r9]
1074	movaps	xmm8,XMMWORD[((-136))+r9]
1075	movaps	xmm9,XMMWORD[((-120))+r9]
1076	movaps	xmm10,XMMWORD[((-104))+r9]
1077	movaps	xmm11,XMMWORD[((-88))+r9]
1078	movaps	xmm12,XMMWORD[((-72))+r9]
1079	movaps	xmm13,XMMWORD[((-56))+r9]
1080	movaps	xmm14,XMMWORD[((-40))+r9]
1081	movaps	xmm15,XMMWORD[((-24))+r9]
1082	lea	rsp,[r9]
1083
1084$L$4x_epilogue:
1085	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1086	mov	rsi,QWORD[16+rsp]
1087	ret
1088
1089$L$SEH_end_ChaCha20_4x:
1090
1091ALIGN	32
1092ChaCha20_8x:
1093	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1094	mov	QWORD[16+rsp],rsi
1095	mov	rax,rsp
1096$L$SEH_begin_ChaCha20_8x:
1097	mov	rdi,rcx
1098	mov	rsi,rdx
1099	mov	rdx,r8
1100	mov	rcx,r9
1101	mov	r8,QWORD[40+rsp]
1102
1103
1104$L$ChaCha20_8x:
1105
1106	mov	r9,rsp
1107
1108	sub	rsp,0x280+168
1109	and	rsp,-32
1110	movaps	XMMWORD[(-168)+r9],xmm6
1111	movaps	XMMWORD[(-152)+r9],xmm7
1112	movaps	XMMWORD[(-136)+r9],xmm8
1113	movaps	XMMWORD[(-120)+r9],xmm9
1114	movaps	XMMWORD[(-104)+r9],xmm10
1115	movaps	XMMWORD[(-88)+r9],xmm11
1116	movaps	XMMWORD[(-72)+r9],xmm12
1117	movaps	XMMWORD[(-56)+r9],xmm13
1118	movaps	XMMWORD[(-40)+r9],xmm14
1119	movaps	XMMWORD[(-24)+r9],xmm15
1120$L$8x_body:
1121	vzeroupper
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132	vbroadcasti128	ymm11,XMMWORD[$L$sigma]
1133	vbroadcasti128	ymm3,XMMWORD[rcx]
1134	vbroadcasti128	ymm15,XMMWORD[16+rcx]
1135	vbroadcasti128	ymm7,XMMWORD[r8]
1136	lea	rcx,[256+rsp]
1137	lea	rax,[512+rsp]
1138	lea	r10,[$L$rot16]
1139	lea	r11,[$L$rot24]
1140
1141	vpshufd	ymm8,ymm11,0x00
1142	vpshufd	ymm9,ymm11,0x55
1143	vmovdqa	YMMWORD[(128-256)+rcx],ymm8
1144	vpshufd	ymm10,ymm11,0xaa
1145	vmovdqa	YMMWORD[(160-256)+rcx],ymm9
1146	vpshufd	ymm11,ymm11,0xff
1147	vmovdqa	YMMWORD[(192-256)+rcx],ymm10
1148	vmovdqa	YMMWORD[(224-256)+rcx],ymm11
1149
1150	vpshufd	ymm0,ymm3,0x00
1151	vpshufd	ymm1,ymm3,0x55
1152	vmovdqa	YMMWORD[(256-256)+rcx],ymm0
1153	vpshufd	ymm2,ymm3,0xaa
1154	vmovdqa	YMMWORD[(288-256)+rcx],ymm1
1155	vpshufd	ymm3,ymm3,0xff
1156	vmovdqa	YMMWORD[(320-256)+rcx],ymm2
1157	vmovdqa	YMMWORD[(352-256)+rcx],ymm3
1158
1159	vpshufd	ymm12,ymm15,0x00
1160	vpshufd	ymm13,ymm15,0x55
1161	vmovdqa	YMMWORD[(384-512)+rax],ymm12
1162	vpshufd	ymm14,ymm15,0xaa
1163	vmovdqa	YMMWORD[(416-512)+rax],ymm13
1164	vpshufd	ymm15,ymm15,0xff
1165	vmovdqa	YMMWORD[(448-512)+rax],ymm14
1166	vmovdqa	YMMWORD[(480-512)+rax],ymm15
1167
1168	vpshufd	ymm4,ymm7,0x00
1169	vpshufd	ymm5,ymm7,0x55
1170	vpaddd	ymm4,ymm4,YMMWORD[$L$incy]
1171	vpshufd	ymm6,ymm7,0xaa
1172	vmovdqa	YMMWORD[(544-512)+rax],ymm5
1173	vpshufd	ymm7,ymm7,0xff
1174	vmovdqa	YMMWORD[(576-512)+rax],ymm6
1175	vmovdqa	YMMWORD[(608-512)+rax],ymm7
1176
1177	jmp	NEAR $L$oop_enter8x
1178
1179ALIGN	32
1180$L$oop_outer8x:
1181	vmovdqa	ymm8,YMMWORD[((128-256))+rcx]
1182	vmovdqa	ymm9,YMMWORD[((160-256))+rcx]
1183	vmovdqa	ymm10,YMMWORD[((192-256))+rcx]
1184	vmovdqa	ymm11,YMMWORD[((224-256))+rcx]
1185	vmovdqa	ymm0,YMMWORD[((256-256))+rcx]
1186	vmovdqa	ymm1,YMMWORD[((288-256))+rcx]
1187	vmovdqa	ymm2,YMMWORD[((320-256))+rcx]
1188	vmovdqa	ymm3,YMMWORD[((352-256))+rcx]
1189	vmovdqa	ymm12,YMMWORD[((384-512))+rax]
1190	vmovdqa	ymm13,YMMWORD[((416-512))+rax]
1191	vmovdqa	ymm14,YMMWORD[((448-512))+rax]
1192	vmovdqa	ymm15,YMMWORD[((480-512))+rax]
1193	vmovdqa	ymm4,YMMWORD[((512-512))+rax]
1194	vmovdqa	ymm5,YMMWORD[((544-512))+rax]
1195	vmovdqa	ymm6,YMMWORD[((576-512))+rax]
1196	vmovdqa	ymm7,YMMWORD[((608-512))+rax]
1197	vpaddd	ymm4,ymm4,YMMWORD[$L$eight]
1198
1199$L$oop_enter8x:
1200	vmovdqa	YMMWORD[64+rsp],ymm14
1201	vmovdqa	YMMWORD[96+rsp],ymm15
1202	vbroadcasti128	ymm15,XMMWORD[r10]
1203	vmovdqa	YMMWORD[(512-512)+rax],ymm4
1204	mov	eax,10
1205	jmp	NEAR $L$oop8x
1206
1207ALIGN	32
1208$L$oop8x:
1209	vpaddd	ymm8,ymm8,ymm0
1210	vpxor	ymm4,ymm8,ymm4
1211	vpshufb	ymm4,ymm4,ymm15
1212	vpaddd	ymm9,ymm9,ymm1
1213	vpxor	ymm5,ymm9,ymm5
1214	vpshufb	ymm5,ymm5,ymm15
1215	vpaddd	ymm12,ymm12,ymm4
1216	vpxor	ymm0,ymm12,ymm0
1217	vpslld	ymm14,ymm0,12
1218	vpsrld	ymm0,ymm0,20
1219	vpor	ymm0,ymm14,ymm0
1220	vbroadcasti128	ymm14,XMMWORD[r11]
1221	vpaddd	ymm13,ymm13,ymm5
1222	vpxor	ymm1,ymm13,ymm1
1223	vpslld	ymm15,ymm1,12
1224	vpsrld	ymm1,ymm1,20
1225	vpor	ymm1,ymm15,ymm1
1226	vpaddd	ymm8,ymm8,ymm0
1227	vpxor	ymm4,ymm8,ymm4
1228	vpshufb	ymm4,ymm4,ymm14
1229	vpaddd	ymm9,ymm9,ymm1
1230	vpxor	ymm5,ymm9,ymm5
1231	vpshufb	ymm5,ymm5,ymm14
1232	vpaddd	ymm12,ymm12,ymm4
1233	vpxor	ymm0,ymm12,ymm0
1234	vpslld	ymm15,ymm0,7
1235	vpsrld	ymm0,ymm0,25
1236	vpor	ymm0,ymm15,ymm0
1237	vbroadcasti128	ymm15,XMMWORD[r10]
1238	vpaddd	ymm13,ymm13,ymm5
1239	vpxor	ymm1,ymm13,ymm1
1240	vpslld	ymm14,ymm1,7
1241	vpsrld	ymm1,ymm1,25
1242	vpor	ymm1,ymm14,ymm1
1243	vmovdqa	YMMWORD[rsp],ymm12
1244	vmovdqa	YMMWORD[32+rsp],ymm13
1245	vmovdqa	ymm12,YMMWORD[64+rsp]
1246	vmovdqa	ymm13,YMMWORD[96+rsp]
1247	vpaddd	ymm10,ymm10,ymm2
1248	vpxor	ymm6,ymm10,ymm6
1249	vpshufb	ymm6,ymm6,ymm15
1250	vpaddd	ymm11,ymm11,ymm3
1251	vpxor	ymm7,ymm11,ymm7
1252	vpshufb	ymm7,ymm7,ymm15
1253	vpaddd	ymm12,ymm12,ymm6
1254	vpxor	ymm2,ymm12,ymm2
1255	vpslld	ymm14,ymm2,12
1256	vpsrld	ymm2,ymm2,20
1257	vpor	ymm2,ymm14,ymm2
1258	vbroadcasti128	ymm14,XMMWORD[r11]
1259	vpaddd	ymm13,ymm13,ymm7
1260	vpxor	ymm3,ymm13,ymm3
1261	vpslld	ymm15,ymm3,12
1262	vpsrld	ymm3,ymm3,20
1263	vpor	ymm3,ymm15,ymm3
1264	vpaddd	ymm10,ymm10,ymm2
1265	vpxor	ymm6,ymm10,ymm6
1266	vpshufb	ymm6,ymm6,ymm14
1267	vpaddd	ymm11,ymm11,ymm3
1268	vpxor	ymm7,ymm11,ymm7
1269	vpshufb	ymm7,ymm7,ymm14
1270	vpaddd	ymm12,ymm12,ymm6
1271	vpxor	ymm2,ymm12,ymm2
1272	vpslld	ymm15,ymm2,7
1273	vpsrld	ymm2,ymm2,25
1274	vpor	ymm2,ymm15,ymm2
1275	vbroadcasti128	ymm15,XMMWORD[r10]
1276	vpaddd	ymm13,ymm13,ymm7
1277	vpxor	ymm3,ymm13,ymm3
1278	vpslld	ymm14,ymm3,7
1279	vpsrld	ymm3,ymm3,25
1280	vpor	ymm3,ymm14,ymm3
1281	vpaddd	ymm8,ymm8,ymm1
1282	vpxor	ymm7,ymm8,ymm7
1283	vpshufb	ymm7,ymm7,ymm15
1284	vpaddd	ymm9,ymm9,ymm2
1285	vpxor	ymm4,ymm9,ymm4
1286	vpshufb	ymm4,ymm4,ymm15
1287	vpaddd	ymm12,ymm12,ymm7
1288	vpxor	ymm1,ymm12,ymm1
1289	vpslld	ymm14,ymm1,12
1290	vpsrld	ymm1,ymm1,20
1291	vpor	ymm1,ymm14,ymm1
1292	vbroadcasti128	ymm14,XMMWORD[r11]
1293	vpaddd	ymm13,ymm13,ymm4
1294	vpxor	ymm2,ymm13,ymm2
1295	vpslld	ymm15,ymm2,12
1296	vpsrld	ymm2,ymm2,20
1297	vpor	ymm2,ymm15,ymm2
1298	vpaddd	ymm8,ymm8,ymm1
1299	vpxor	ymm7,ymm8,ymm7
1300	vpshufb	ymm7,ymm7,ymm14
1301	vpaddd	ymm9,ymm9,ymm2
1302	vpxor	ymm4,ymm9,ymm4
1303	vpshufb	ymm4,ymm4,ymm14
1304	vpaddd	ymm12,ymm12,ymm7
1305	vpxor	ymm1,ymm12,ymm1
1306	vpslld	ymm15,ymm1,7
1307	vpsrld	ymm1,ymm1,25
1308	vpor	ymm1,ymm15,ymm1
1309	vbroadcasti128	ymm15,XMMWORD[r10]
1310	vpaddd	ymm13,ymm13,ymm4
1311	vpxor	ymm2,ymm13,ymm2
1312	vpslld	ymm14,ymm2,7
1313	vpsrld	ymm2,ymm2,25
1314	vpor	ymm2,ymm14,ymm2
1315	vmovdqa	YMMWORD[64+rsp],ymm12
1316	vmovdqa	YMMWORD[96+rsp],ymm13
1317	vmovdqa	ymm12,YMMWORD[rsp]
1318	vmovdqa	ymm13,YMMWORD[32+rsp]
1319	vpaddd	ymm10,ymm10,ymm3
1320	vpxor	ymm5,ymm10,ymm5
1321	vpshufb	ymm5,ymm5,ymm15
1322	vpaddd	ymm11,ymm11,ymm0
1323	vpxor	ymm6,ymm11,ymm6
1324	vpshufb	ymm6,ymm6,ymm15
1325	vpaddd	ymm12,ymm12,ymm5
1326	vpxor	ymm3,ymm12,ymm3
1327	vpslld	ymm14,ymm3,12
1328	vpsrld	ymm3,ymm3,20
1329	vpor	ymm3,ymm14,ymm3
1330	vbroadcasti128	ymm14,XMMWORD[r11]
1331	vpaddd	ymm13,ymm13,ymm6
1332	vpxor	ymm0,ymm13,ymm0
1333	vpslld	ymm15,ymm0,12
1334	vpsrld	ymm0,ymm0,20
1335	vpor	ymm0,ymm15,ymm0
1336	vpaddd	ymm10,ymm10,ymm3
1337	vpxor	ymm5,ymm10,ymm5
1338	vpshufb	ymm5,ymm5,ymm14
1339	vpaddd	ymm11,ymm11,ymm0
1340	vpxor	ymm6,ymm11,ymm6
1341	vpshufb	ymm6,ymm6,ymm14
1342	vpaddd	ymm12,ymm12,ymm5
1343	vpxor	ymm3,ymm12,ymm3
1344	vpslld	ymm15,ymm3,7
1345	vpsrld	ymm3,ymm3,25
1346	vpor	ymm3,ymm15,ymm3
1347	vbroadcasti128	ymm15,XMMWORD[r10]
1348	vpaddd	ymm13,ymm13,ymm6
1349	vpxor	ymm0,ymm13,ymm0
1350	vpslld	ymm14,ymm0,7
1351	vpsrld	ymm0,ymm0,25
1352	vpor	ymm0,ymm14,ymm0
1353	dec	eax
1354	jnz	NEAR $L$oop8x
1355
1356	lea	rax,[512+rsp]
1357	vpaddd	ymm8,ymm8,YMMWORD[((128-256))+rcx]
1358	vpaddd	ymm9,ymm9,YMMWORD[((160-256))+rcx]
1359	vpaddd	ymm10,ymm10,YMMWORD[((192-256))+rcx]
1360	vpaddd	ymm11,ymm11,YMMWORD[((224-256))+rcx]
1361
1362	vpunpckldq	ymm14,ymm8,ymm9
1363	vpunpckldq	ymm15,ymm10,ymm11
1364	vpunpckhdq	ymm8,ymm8,ymm9
1365	vpunpckhdq	ymm10,ymm10,ymm11
1366	vpunpcklqdq	ymm9,ymm14,ymm15
1367	vpunpckhqdq	ymm14,ymm14,ymm15
1368	vpunpcklqdq	ymm11,ymm8,ymm10
1369	vpunpckhqdq	ymm8,ymm8,ymm10
1370	vpaddd	ymm0,ymm0,YMMWORD[((256-256))+rcx]
1371	vpaddd	ymm1,ymm1,YMMWORD[((288-256))+rcx]
1372	vpaddd	ymm2,ymm2,YMMWORD[((320-256))+rcx]
1373	vpaddd	ymm3,ymm3,YMMWORD[((352-256))+rcx]
1374
1375	vpunpckldq	ymm10,ymm0,ymm1
1376	vpunpckldq	ymm15,ymm2,ymm3
1377	vpunpckhdq	ymm0,ymm0,ymm1
1378	vpunpckhdq	ymm2,ymm2,ymm3
1379	vpunpcklqdq	ymm1,ymm10,ymm15
1380	vpunpckhqdq	ymm10,ymm10,ymm15
1381	vpunpcklqdq	ymm3,ymm0,ymm2
1382	vpunpckhqdq	ymm0,ymm0,ymm2
1383	vperm2i128	ymm15,ymm9,ymm1,0x20
1384	vperm2i128	ymm1,ymm9,ymm1,0x31
1385	vperm2i128	ymm9,ymm14,ymm10,0x20
1386	vperm2i128	ymm10,ymm14,ymm10,0x31
1387	vperm2i128	ymm14,ymm11,ymm3,0x20
1388	vperm2i128	ymm3,ymm11,ymm3,0x31
1389	vperm2i128	ymm11,ymm8,ymm0,0x20
1390	vperm2i128	ymm0,ymm8,ymm0,0x31
1391	vmovdqa	YMMWORD[rsp],ymm15
1392	vmovdqa	YMMWORD[32+rsp],ymm9
1393	vmovdqa	ymm15,YMMWORD[64+rsp]
1394	vmovdqa	ymm9,YMMWORD[96+rsp]
1395
1396	vpaddd	ymm12,ymm12,YMMWORD[((384-512))+rax]
1397	vpaddd	ymm13,ymm13,YMMWORD[((416-512))+rax]
1398	vpaddd	ymm15,ymm15,YMMWORD[((448-512))+rax]
1399	vpaddd	ymm9,ymm9,YMMWORD[((480-512))+rax]
1400
1401	vpunpckldq	ymm2,ymm12,ymm13
1402	vpunpckldq	ymm8,ymm15,ymm9
1403	vpunpckhdq	ymm12,ymm12,ymm13
1404	vpunpckhdq	ymm15,ymm15,ymm9
1405	vpunpcklqdq	ymm13,ymm2,ymm8
1406	vpunpckhqdq	ymm2,ymm2,ymm8
1407	vpunpcklqdq	ymm9,ymm12,ymm15
1408	vpunpckhqdq	ymm12,ymm12,ymm15
1409	vpaddd	ymm4,ymm4,YMMWORD[((512-512))+rax]
1410	vpaddd	ymm5,ymm5,YMMWORD[((544-512))+rax]
1411	vpaddd	ymm6,ymm6,YMMWORD[((576-512))+rax]
1412	vpaddd	ymm7,ymm7,YMMWORD[((608-512))+rax]
1413
1414	vpunpckldq	ymm15,ymm4,ymm5
1415	vpunpckldq	ymm8,ymm6,ymm7
1416	vpunpckhdq	ymm4,ymm4,ymm5
1417	vpunpckhdq	ymm6,ymm6,ymm7
1418	vpunpcklqdq	ymm5,ymm15,ymm8
1419	vpunpckhqdq	ymm15,ymm15,ymm8
1420	vpunpcklqdq	ymm7,ymm4,ymm6
1421	vpunpckhqdq	ymm4,ymm4,ymm6
1422	vperm2i128	ymm8,ymm13,ymm5,0x20
1423	vperm2i128	ymm5,ymm13,ymm5,0x31
1424	vperm2i128	ymm13,ymm2,ymm15,0x20
1425	vperm2i128	ymm15,ymm2,ymm15,0x31
1426	vperm2i128	ymm2,ymm9,ymm7,0x20
1427	vperm2i128	ymm7,ymm9,ymm7,0x31
1428	vperm2i128	ymm9,ymm12,ymm4,0x20
1429	vperm2i128	ymm4,ymm12,ymm4,0x31
1430	vmovdqa	ymm6,YMMWORD[rsp]
1431	vmovdqa	ymm12,YMMWORD[32+rsp]
1432
1433	cmp	rdx,64*8
1434	jb	NEAR $L$tail8x
1435
1436	vpxor	ymm6,ymm6,YMMWORD[rsi]
1437	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1438	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1439	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1440	lea	rsi,[128+rsi]
1441	vmovdqu	YMMWORD[rdi],ymm6
1442	vmovdqu	YMMWORD[32+rdi],ymm8
1443	vmovdqu	YMMWORD[64+rdi],ymm1
1444	vmovdqu	YMMWORD[96+rdi],ymm5
1445	lea	rdi,[128+rdi]
1446
1447	vpxor	ymm12,ymm12,YMMWORD[rsi]
1448	vpxor	ymm13,ymm13,YMMWORD[32+rsi]
1449	vpxor	ymm10,ymm10,YMMWORD[64+rsi]
1450	vpxor	ymm15,ymm15,YMMWORD[96+rsi]
1451	lea	rsi,[128+rsi]
1452	vmovdqu	YMMWORD[rdi],ymm12
1453	vmovdqu	YMMWORD[32+rdi],ymm13
1454	vmovdqu	YMMWORD[64+rdi],ymm10
1455	vmovdqu	YMMWORD[96+rdi],ymm15
1456	lea	rdi,[128+rdi]
1457
1458	vpxor	ymm14,ymm14,YMMWORD[rsi]
1459	vpxor	ymm2,ymm2,YMMWORD[32+rsi]
1460	vpxor	ymm3,ymm3,YMMWORD[64+rsi]
1461	vpxor	ymm7,ymm7,YMMWORD[96+rsi]
1462	lea	rsi,[128+rsi]
1463	vmovdqu	YMMWORD[rdi],ymm14
1464	vmovdqu	YMMWORD[32+rdi],ymm2
1465	vmovdqu	YMMWORD[64+rdi],ymm3
1466	vmovdqu	YMMWORD[96+rdi],ymm7
1467	lea	rdi,[128+rdi]
1468
1469	vpxor	ymm11,ymm11,YMMWORD[rsi]
1470	vpxor	ymm9,ymm9,YMMWORD[32+rsi]
1471	vpxor	ymm0,ymm0,YMMWORD[64+rsi]
1472	vpxor	ymm4,ymm4,YMMWORD[96+rsi]
1473	lea	rsi,[128+rsi]
1474	vmovdqu	YMMWORD[rdi],ymm11
1475	vmovdqu	YMMWORD[32+rdi],ymm9
1476	vmovdqu	YMMWORD[64+rdi],ymm0
1477	vmovdqu	YMMWORD[96+rdi],ymm4
1478	lea	rdi,[128+rdi]
1479
1480	sub	rdx,64*8
1481	jnz	NEAR $L$oop_outer8x
1482
1483	jmp	NEAR $L$done8x
1484
1485$L$tail8x:
1486	cmp	rdx,448
1487	jae	NEAR $L$448_or_more8x
1488	cmp	rdx,384
1489	jae	NEAR $L$384_or_more8x
1490	cmp	rdx,320
1491	jae	NEAR $L$320_or_more8x
1492	cmp	rdx,256
1493	jae	NEAR $L$256_or_more8x
1494	cmp	rdx,192
1495	jae	NEAR $L$192_or_more8x
1496	cmp	rdx,128
1497	jae	NEAR $L$128_or_more8x
1498	cmp	rdx,64
1499	jae	NEAR $L$64_or_more8x
1500
1501	xor	r10,r10
1502	vmovdqa	YMMWORD[rsp],ymm6
1503	vmovdqa	YMMWORD[32+rsp],ymm8
1504	jmp	NEAR $L$oop_tail8x
1505
1506ALIGN	32
1507$L$64_or_more8x:
1508	vpxor	ymm6,ymm6,YMMWORD[rsi]
1509	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1510	vmovdqu	YMMWORD[rdi],ymm6
1511	vmovdqu	YMMWORD[32+rdi],ymm8
1512	je	NEAR $L$done8x
1513
1514	lea	rsi,[64+rsi]
1515	xor	r10,r10
1516	vmovdqa	YMMWORD[rsp],ymm1
1517	lea	rdi,[64+rdi]
1518	sub	rdx,64
1519	vmovdqa	YMMWORD[32+rsp],ymm5
1520	jmp	NEAR $L$oop_tail8x
1521
1522ALIGN	32
1523$L$128_or_more8x:
1524	vpxor	ymm6,ymm6,YMMWORD[rsi]
1525	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1526	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1527	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1528	vmovdqu	YMMWORD[rdi],ymm6
1529	vmovdqu	YMMWORD[32+rdi],ymm8
1530	vmovdqu	YMMWORD[64+rdi],ymm1
1531	vmovdqu	YMMWORD[96+rdi],ymm5
1532	je	NEAR $L$done8x
1533
1534	lea	rsi,[128+rsi]
1535	xor	r10,r10
1536	vmovdqa	YMMWORD[rsp],ymm12
1537	lea	rdi,[128+rdi]
1538	sub	rdx,128
1539	vmovdqa	YMMWORD[32+rsp],ymm13
1540	jmp	NEAR $L$oop_tail8x
1541
1542ALIGN	32
1543$L$192_or_more8x:
1544	vpxor	ymm6,ymm6,YMMWORD[rsi]
1545	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1546	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1547	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1548	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1549	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1550	vmovdqu	YMMWORD[rdi],ymm6
1551	vmovdqu	YMMWORD[32+rdi],ymm8
1552	vmovdqu	YMMWORD[64+rdi],ymm1
1553	vmovdqu	YMMWORD[96+rdi],ymm5
1554	vmovdqu	YMMWORD[128+rdi],ymm12
1555	vmovdqu	YMMWORD[160+rdi],ymm13
1556	je	NEAR $L$done8x
1557
1558	lea	rsi,[192+rsi]
1559	xor	r10,r10
1560	vmovdqa	YMMWORD[rsp],ymm10
1561	lea	rdi,[192+rdi]
1562	sub	rdx,192
1563	vmovdqa	YMMWORD[32+rsp],ymm15
1564	jmp	NEAR $L$oop_tail8x
1565
1566ALIGN	32
1567$L$256_or_more8x:
1568	vpxor	ymm6,ymm6,YMMWORD[rsi]
1569	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1570	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1571	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1572	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1573	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1574	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1575	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1576	vmovdqu	YMMWORD[rdi],ymm6
1577	vmovdqu	YMMWORD[32+rdi],ymm8
1578	vmovdqu	YMMWORD[64+rdi],ymm1
1579	vmovdqu	YMMWORD[96+rdi],ymm5
1580	vmovdqu	YMMWORD[128+rdi],ymm12
1581	vmovdqu	YMMWORD[160+rdi],ymm13
1582	vmovdqu	YMMWORD[192+rdi],ymm10
1583	vmovdqu	YMMWORD[224+rdi],ymm15
1584	je	NEAR $L$done8x
1585
1586	lea	rsi,[256+rsi]
1587	xor	r10,r10
1588	vmovdqa	YMMWORD[rsp],ymm14
1589	lea	rdi,[256+rdi]
1590	sub	rdx,256
1591	vmovdqa	YMMWORD[32+rsp],ymm2
1592	jmp	NEAR $L$oop_tail8x
1593
1594ALIGN	32
1595$L$320_or_more8x:
1596	vpxor	ymm6,ymm6,YMMWORD[rsi]
1597	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1598	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1599	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1600	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1601	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1602	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1603	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1604	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1605	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1606	vmovdqu	YMMWORD[rdi],ymm6
1607	vmovdqu	YMMWORD[32+rdi],ymm8
1608	vmovdqu	YMMWORD[64+rdi],ymm1
1609	vmovdqu	YMMWORD[96+rdi],ymm5
1610	vmovdqu	YMMWORD[128+rdi],ymm12
1611	vmovdqu	YMMWORD[160+rdi],ymm13
1612	vmovdqu	YMMWORD[192+rdi],ymm10
1613	vmovdqu	YMMWORD[224+rdi],ymm15
1614	vmovdqu	YMMWORD[256+rdi],ymm14
1615	vmovdqu	YMMWORD[288+rdi],ymm2
1616	je	NEAR $L$done8x
1617
1618	lea	rsi,[320+rsi]
1619	xor	r10,r10
1620	vmovdqa	YMMWORD[rsp],ymm3
1621	lea	rdi,[320+rdi]
1622	sub	rdx,320
1623	vmovdqa	YMMWORD[32+rsp],ymm7
1624	jmp	NEAR $L$oop_tail8x
1625
1626ALIGN	32
1627$L$384_or_more8x:
1628	vpxor	ymm6,ymm6,YMMWORD[rsi]
1629	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1630	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1631	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1632	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1633	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1634	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1635	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1636	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1637	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1638	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
1639	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
1640	vmovdqu	YMMWORD[rdi],ymm6
1641	vmovdqu	YMMWORD[32+rdi],ymm8
1642	vmovdqu	YMMWORD[64+rdi],ymm1
1643	vmovdqu	YMMWORD[96+rdi],ymm5
1644	vmovdqu	YMMWORD[128+rdi],ymm12
1645	vmovdqu	YMMWORD[160+rdi],ymm13
1646	vmovdqu	YMMWORD[192+rdi],ymm10
1647	vmovdqu	YMMWORD[224+rdi],ymm15
1648	vmovdqu	YMMWORD[256+rdi],ymm14
1649	vmovdqu	YMMWORD[288+rdi],ymm2
1650	vmovdqu	YMMWORD[320+rdi],ymm3
1651	vmovdqu	YMMWORD[352+rdi],ymm7
1652	je	NEAR $L$done8x
1653
1654	lea	rsi,[384+rsi]
1655	xor	r10,r10
1656	vmovdqa	YMMWORD[rsp],ymm11
1657	lea	rdi,[384+rdi]
1658	sub	rdx,384
1659	vmovdqa	YMMWORD[32+rsp],ymm9
1660	jmp	NEAR $L$oop_tail8x
1661
1662ALIGN	32
1663$L$448_or_more8x:
1664	vpxor	ymm6,ymm6,YMMWORD[rsi]
1665	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1666	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1667	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1668	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1669	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1670	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1671	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1672	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1673	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1674	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
1675	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
1676	vpxor	ymm11,ymm11,YMMWORD[384+rsi]
1677	vpxor	ymm9,ymm9,YMMWORD[416+rsi]
1678	vmovdqu	YMMWORD[rdi],ymm6
1679	vmovdqu	YMMWORD[32+rdi],ymm8
1680	vmovdqu	YMMWORD[64+rdi],ymm1
1681	vmovdqu	YMMWORD[96+rdi],ymm5
1682	vmovdqu	YMMWORD[128+rdi],ymm12
1683	vmovdqu	YMMWORD[160+rdi],ymm13
1684	vmovdqu	YMMWORD[192+rdi],ymm10
1685	vmovdqu	YMMWORD[224+rdi],ymm15
1686	vmovdqu	YMMWORD[256+rdi],ymm14
1687	vmovdqu	YMMWORD[288+rdi],ymm2
1688	vmovdqu	YMMWORD[320+rdi],ymm3
1689	vmovdqu	YMMWORD[352+rdi],ymm7
1690	vmovdqu	YMMWORD[384+rdi],ymm11
1691	vmovdqu	YMMWORD[416+rdi],ymm9
1692	je	NEAR $L$done8x
1693
1694	lea	rsi,[448+rsi]
1695	xor	r10,r10
1696	vmovdqa	YMMWORD[rsp],ymm0
1697	lea	rdi,[448+rdi]
1698	sub	rdx,448
1699	vmovdqa	YMMWORD[32+rsp],ymm4
1700
1701$L$oop_tail8x:
1702	movzx	eax,BYTE[r10*1+rsi]
1703	movzx	ecx,BYTE[r10*1+rsp]
1704	lea	r10,[1+r10]
1705	xor	eax,ecx
1706	mov	BYTE[((-1))+r10*1+rdi],al
1707	dec	rdx
1708	jnz	NEAR $L$oop_tail8x
1709
1710$L$done8x:
1711	vzeroall
1712	movaps	xmm6,XMMWORD[((-168))+r9]
1713	movaps	xmm7,XMMWORD[((-152))+r9]
1714	movaps	xmm8,XMMWORD[((-136))+r9]
1715	movaps	xmm9,XMMWORD[((-120))+r9]
1716	movaps	xmm10,XMMWORD[((-104))+r9]
1717	movaps	xmm11,XMMWORD[((-88))+r9]
1718	movaps	xmm12,XMMWORD[((-72))+r9]
1719	movaps	xmm13,XMMWORD[((-56))+r9]
1720	movaps	xmm14,XMMWORD[((-40))+r9]
1721	movaps	xmm15,XMMWORD[((-24))+r9]
1722	lea	rsp,[r9]
1723
1724$L$8x_epilogue:
1725	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1726	mov	rsi,QWORD[16+rsp]
1727	ret
1728
1729$L$SEH_end_ChaCha20_8x:
1730EXTERN	__imp_RtlVirtualUnwind
1731
;-----------------------------------------------------------------------
; se_handler
; Win64 SEH language-specific handler for ChaCha20_ctr32 (named by the
; .xdata record $L$SEH_info_ChaCha20_ctr32).
; ABI:    Microsoft x64.
; In:     r8 = CONTEXT *context, r9 = DISPATCHER_CONTEXT *disp.
;         rcx and rdx (the first two handler arguments) are not read.
; Out:    eax = 1 (ExceptionContinueSearch).
; Effect: patches *context so that Rsp, Rsi, Rdi and -- while the frame
;         is live -- Rbx, Rbp, R12-R15 hold the values ChaCha20_ctr32
;         was entered with, copies it to disp->ContextRecord and calls
;         RtlVirtualUnwind.
; Stack:  9 qword pushes + 64 bytes leave rsp 16-byte aligned at the
;         call; the 64 bytes are 32 of shadow space plus arguments 5-8.
; Numeric displacements off r8/r9 are field offsets in the Windows x64
; CONTEXT and DISPATCHER_CONTEXT structures.
; $L$common_seh_tail is also entered by jump from ssse3_handler and
; full_handler, which therefore use the same push sequence as this one.
;-----------------------------------------------------------------------
ALIGN	16
se_handler:
	push	rsi
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64

	mov	rax,QWORD[120+r8]	; context->Rax
	mov	rbx,QWORD[248+r8]	; context->Rip

	mov	rsi,QWORD[8+r9]	; disp->ImageBase (overwritten before use on this path)
	mov	r11,QWORD[56+r9]	; disp->HandlerData (overwritten before use on this path)

	lea	r10,[$L$ctr32_body]
	cmp	rbx,r10	; fault before $L$ctr32_body?
	jb	NEAR $L$common_seh_tail	; yes: keep context->Rax, which the prologue loaded from rsp

	mov	rax,QWORD[152+r8]	; context->Rsp

	lea	r10,[$L$no_data]
	cmp	rbx,r10	; fault at or past $L$no_data?
	jae	NEAR $L$common_seh_tail	; yes: use context->Rsp unchanged

	lea	rax,[((64+24+48))+rax]	; step over the 64+24 byte locals and the six 8-byte pushes

	mov	rbx,QWORD[((-8))+rax]	; saved registers, in the order the prologue pushed them
	mov	rbp,QWORD[((-16))+rax]
	mov	r12,QWORD[((-24))+rax]
	mov	r13,QWORD[((-32))+rax]
	mov	r14,QWORD[((-40))+rax]
	mov	r15,QWORD[((-48))+rax]
	mov	QWORD[144+r8],rbx	; context->Rbx
	mov	QWORD[160+r8],rbp	; context->Rbp
	mov	QWORD[216+r8],r12	; context->R12
	mov	QWORD[224+r8],r13	; context->R13
	mov	QWORD[232+r8],r14	; context->R14
	mov	QWORD[240+r8],r15	; context->R15

$L$common_seh_tail:
	mov	rdi,QWORD[8+rax]	; rdi/rsi as stored at 8/16 above the entry rsp by the WIN64 prologue
	mov	rsi,QWORD[16+rax]
	mov	QWORD[152+r8],rax	; context->Rsp
	mov	QWORD[168+r8],rsi	; context->Rsi
	mov	QWORD[176+r8],rdi	; context->Rdi

	mov	rdi,QWORD[40+r9]	; destination: disp->ContextRecord
	mov	rsi,r8	; source: context
	mov	ecx,154	; sizeof(CONTEXT) in qwords
	cld
	rep	movsq	; same bytes as the former DD 0xa548f3fc

	mov	rsi,r9	; disp
	xor	ecx,ecx	; arg1 = UNW_FLAG_NHANDLER (0); also zeroes all of rcx
	mov	rdx,QWORD[8+rsi]	; arg2 = disp->ImageBase
	mov	r8,QWORD[rsi]	; arg3 = disp->ControlPc
	mov	r9,QWORD[16+rsi]	; arg4 = disp->FunctionEntry
	mov	r10,QWORD[40+rsi]	; disp->ContextRecord
	lea	r11,[56+rsi]	; &disp->HandlerData
	lea	r12,[24+rsi]	; &disp->EstablisherFrame
	mov	QWORD[32+rsp],r10	; arg5
	mov	QWORD[40+rsp],r11	; arg6
	mov	QWORD[48+rsp],r12	; arg7
	mov	QWORD[56+rsp],rcx	; arg8 = NULL
	call	QWORD[__imp_RtlVirtualUnwind]

	mov	eax,1	; ExceptionContinueSearch
	add	rsp,64
	popfq
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rbp
	pop	rbx
	pop	rdi
	pop	rsi
	ret
1814
1815
1816
;-----------------------------------------------------------------------
; ssse3_handler
; Win64 SEH language-specific handler named by the .xdata record
; $L$SEH_info_ChaCha20_ssse3.
; ABI:    Microsoft x64.
; In:     r8 = CONTEXT *context, r9 = DISPATCHER_CONTEXT *disp.
; Data:   disp->HandlerData points at the two image-relative addresses
;         that follow the handler in the .xdata record:
;         { $L$ssse3_body, $L$ssse3_epilogue }.
; Effect: when Rip lies in [body, epilogue), takes the frame base from
;         context->R9 and copies 4 qwords (two XMM registers) from
;         base-40 into context->Xmm6 onward. Every path then continues
;         at $L$common_seh_tail inside se_handler, which finishes the
;         unwind, pops the registers pushed here and returns.
; NOTE(review): that ChaCha20_ssse3 keeps its frame base in r9 with
;         xmm6/xmm7 saved at -40/-24 is implied by this handler; the
;         routine's body is outside this section -- confirm there.
;-----------------------------------------------------------------------
ALIGN	16
ssse3_handler:
	push	rsi	; push sequence must match se_handler: the shared tail undoes it
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64

	mov	rax,QWORD[120+r8]	; context->Rax
	mov	rbx,QWORD[248+r8]	; context->Rip

	mov	rsi,QWORD[8+r9]	; disp->ImageBase
	mov	r11,QWORD[56+r9]	; disp->HandlerData

	mov	r10d,DWORD[r11]	; HandlerData[0]: RVA of the body label
	lea	r10,[r10*1+rsi]	; ImageBase + RVA
	cmp	rbx,r10	; fault before the body label?
	jb	NEAR $L$common_seh_tail	; yes: keep context->Rax as the frame base

	mov	rax,QWORD[192+r8]	; context->R9

	mov	r10d,DWORD[4+r11]	; HandlerData[1]: RVA of the epilogue label
	lea	r10,[r10*1+rsi]	; ImageBase + RVA
	cmp	rbx,r10	; fault at or past the epilogue label?
	jae	NEAR $L$common_seh_tail	; yes: nothing further to restore here

	lea	rsi,[((-40))+rax]	; source: XMM save area below the frame base
	lea	rdi,[512+r8]	; destination: &context->Xmm6
	mov	ecx,4	; 2 XMM registers = 4 qwords
	cld
	rep	movsq	; same bytes as the former DD 0xa548f3fc

	jmp	NEAR $L$common_seh_tail
1854
1855
1856
;-----------------------------------------------------------------------
; full_handler
; Win64 SEH language-specific handler named by the .xdata records
; $L$SEH_info_ChaCha20_4x and $L$SEH_info_ChaCha20_8x.
; ABI:    Microsoft x64.
; In:     r8 = CONTEXT *context, r9 = DISPATCHER_CONTEXT *disp.
; Data:   disp->HandlerData points at the two image-relative addresses
;         that follow the handler in the .xdata record: the routine's
;         body label and its epilogue label.
; Effect: when Rip lies in [body, epilogue), takes the frame base from
;         context->R9 and copies 20 qwords (xmm6-xmm15) from base-168
;         into context->Xmm6 onward. This mirrors the ChaCha20_8x
;         epilogue, which reloads xmm6..xmm15 from -168..-24 off r9 and
;         then sets rsp from r9. Every path then continues at
;         $L$common_seh_tail inside se_handler, which finishes the
;         unwind, pops the registers pushed here and returns.
; NOTE(review): ChaCha20_4x presumably uses the same save layout; its
;         body is outside this section -- confirm there.
;-----------------------------------------------------------------------
ALIGN	16
full_handler:
	push	rsi	; push sequence must match se_handler: the shared tail undoes it
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64

	mov	rax,QWORD[120+r8]	; context->Rax
	mov	rbx,QWORD[248+r8]	; context->Rip

	mov	rsi,QWORD[8+r9]	; disp->ImageBase
	mov	r11,QWORD[56+r9]	; disp->HandlerData

	mov	r10d,DWORD[r11]	; HandlerData[0]: RVA of the body label
	lea	r10,[r10*1+rsi]	; ImageBase + RVA
	cmp	rbx,r10	; fault before the body label?
	jb	NEAR $L$common_seh_tail	; yes: keep context->Rax as the frame base

	mov	rax,QWORD[192+r8]	; context->R9

	mov	r10d,DWORD[4+r11]	; HandlerData[1]: RVA of the epilogue label
	lea	r10,[r10*1+rsi]	; ImageBase + RVA
	cmp	rbx,r10	; fault at or past the epilogue label?
	jae	NEAR $L$common_seh_tail	; yes: nothing further to restore here

	lea	rsi,[((-168))+rax]	; source: XMM save area below the frame base
	lea	rdi,[512+r8]	; destination: &context->Xmm6
	mov	ecx,20	; 10 XMM registers = 20 qwords
	cld
	rep	movsq	; same bytes as the former DD 0xa548f3fc

	jmp	NEAR $L$common_seh_tail
1894
1895
; Win64 function table. One three-dword record per routine, every field
; an image-relative address (`wrt ..imagebase`):
;   first code address, end address, unwind-info record in .xdata.
section	.pdata rdata align=4
ALIGN	4
	; ChaCha20_ctr32
	DD	$L$SEH_begin_ChaCha20_ctr32 wrt ..imagebase
	DD	$L$SEH_end_ChaCha20_ctr32 wrt ..imagebase
	DD	$L$SEH_info_ChaCha20_ctr32 wrt ..imagebase

	; ChaCha20_ssse3
	DD	$L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase
	DD	$L$SEH_end_ChaCha20_ssse3 wrt ..imagebase
	DD	$L$SEH_info_ChaCha20_ssse3 wrt ..imagebase

	; ChaCha20_4x
	DD	$L$SEH_begin_ChaCha20_4x wrt ..imagebase
	DD	$L$SEH_end_ChaCha20_4x wrt ..imagebase
	DD	$L$SEH_info_ChaCha20_4x wrt ..imagebase
	; ChaCha20_8x
	DD	$L$SEH_begin_ChaCha20_8x wrt ..imagebase
	DD	$L$SEH_end_ChaCha20_8x wrt ..imagebase
	DD	$L$SEH_info_ChaCha20_8x wrt ..imagebase
; Unwind-info records referenced from .pdata. Each record is:
;   DB 9,0,0,0  header: 9 = version 1 with UNW_FLAG_EHANDLER set; the
;               three zero bytes are prolog size, unwind-code count and
;               frame register, so no unwind codes follow and all
;               restoration is done by the handler.
;   DD handler  image-relative address of the language-specific handler.
;   DD ...      optional handler data; ssse3_handler and full_handler
;               read it as { body label, epilogue label }.
section	.xdata rdata align=8
ALIGN	8
$L$SEH_info_ChaCha20_ctr32:
	DB	9,0,0,0
	DD	se_handler wrt ..imagebase	; uses fixed labels, so no handler data

$L$SEH_info_ChaCha20_ssse3:
	DB	9,0,0,0
	DD	ssse3_handler wrt ..imagebase
	DD	$L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase

$L$SEH_info_ChaCha20_4x:
	DB	9,0,0,0
	DD	full_handler wrt ..imagebase
	DD	$L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase
$L$SEH_info_ChaCha20_8x:
	DB	9,0,0,0
	DD	full_handler wrt ..imagebase
	DD	$L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase
%else
; Any output format other than win64 skips everything above (see the
; %ifidn at the top of the file), so a lone `ret` is emitted here rather
; than leaving the translation unit empty.
; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
ret
%endif
1935