xref: /aosp_15_r20/external/boringssl/src/gen/crypto/chacha-x86_64-win.asm (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
1; This file is generated from a similarly-named Perl script in the BoringSSL
2; source tree. Do not edit by hand.
3
; Everything that follows is assembled only when NASM's output format is win64.
4%ifidn __OUTPUT_FORMAT__, win64
; Make bare [label] memory operands RIP-relative.
5default	rel
; The size keywords and _CET_ENDBR are defined as empty macros, so an operand
; written as XMMWORD[rsp] reduces to [rsp] and _CET_ENDBR emits no code.
6%define XMMWORD
7%define YMMWORD
8%define ZMMWORD
9%define _CET_ENDBR
10
; Optional include, active only when the build defines BORINGSSL_PREFIX
; (presumably symbol renaming, judging by the include file name).
11%ifdef BORINGSSL_PREFIX
12%include "boringssl_prefix_symbols_nasm.inc"
13%endif
14section	.text code align=64
15
16
; Read-only constants shared by the ChaCha20 routines in this file.
; Labels that are not referenced in the visible part of the file are marked
; as such; they may be used further down.
17section	.rdata rdata align=8
18ALIGN	64
; Four zero dwords (not referenced in the visible code).
19$L$zero:
20	DD	0,0,0,0
; {1,0,0,0}: added to the last state row to bump the block counter by one
; in the nohw and ssse3 routines.
21$L$one:
22	DD	1,0,0,0
; {0,1,2,3}: per-lane counter offsets for the four parallel blocks (4x routine).
23$L$inc:
24	DD	0,1,2,3
; {4,4,4,4}: counter step between iterations of the 4x outer loop.
25$L$four:
26	DD	4,4,4,4
; Per-lane counter offsets and step used by the AVX2 routine.
27$L$incy:
28	DD	0,2,4,6,1,3,5,7
29$L$eight:
30	DD	8,8,8,8,8,8,8,8
; pshufb mask: rotates every 32-bit lane by 16 bits.
31$L$rot16:
32	DB	0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
; pshufb mask: rotates every 32-bit lane left by 8 bits (right by 24).
33$L$rot24:
34	DB	0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
; ASCII "expand 32-byte k" followed by a NUL: state words 0..3.
35$L$sigma:
36	DB	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
37	DB	0
38ALIGN	64
; 16-dword tables (not referenced in the visible code).
39$L$zeroz:
40	DD	0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
41$L$fourz:
42	DD	4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
43$L$incz:
44	DD	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
45$L$sixteen:
46	DD	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
; ASCII identification string:
; "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated.
47	DB	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
48	DB	95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
49	DB	98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
50	DB	108,46,111,114,103,62,0
51section	.text
51section	.text
52
;-----------------------------------------------------------------------
; ChaCha20_ctr32_nohw
; Scalar (general-purpose register) ChaCha20: generates one 64-byte
; keystream block per outer iteration and XORs it with the source.
;
; Win64 entry. The prologue moves the arguments so that, in the body:
;   rdi = destination pointer        (from rcx)
;   rsi = source pointer             (from rdx)
;   rdx = byte count                 (from r8)
;   rcx = pointer to 32 bytes loaded as state words 4..11   (from r9)
;   r8  = pointer to 16 bytes loaded as state words 12..15  (from [40+rsp]);
;         word 12 is incremented by one after every block.
; Stack frame after "sub rsp,64+24":
;   [ 0..63]+rsp  state words / keystream staging for the tail
;   [64+0 ]+rsp   saved remaining byte count
;   [64+8 ]+rsp   saved source pointer
;   [64+16]+rsp   saved destination pointer
; Saves and restores rbx, rbp, r12-r15 (pushes) and rdi, rsi (home slots).
; NOTE(review): no zero-length check is visible although a $L$no_data
; label exists; with a byte count of 0 the tail loop's dec/jnz would wrap.
; Presumably callers guarantee a non-zero length -- confirm.
;-----------------------------------------------------------------------
53global	ChaCha20_ctr32_nohw
54
55ALIGN	64
56ChaCha20_ctr32_nohw:
57	mov	QWORD[8+rsp],rdi	;WIN64 prologue
58	mov	QWORD[16+rsp],rsi
59	mov	rax,rsp
60$L$SEH_begin_ChaCha20_ctr32_nohw:
61	mov	rdi,rcx	; 1st argument -> rdi
62	mov	rsi,rdx	; 2nd argument -> rsi
63	mov	rdx,r8	; 3rd argument -> rdx
64	mov	rcx,r9	; 4th argument -> rcx
65	mov	r8,QWORD[40+rsp]	; 5th argument (stack) -> r8
66
67
68
69_CET_ENDBR
70	push	rbx
71
72	push	rbp
73
74	push	r12
75
76	push	r13
77
78	push	r14
79
80	push	r15
81
82	sub	rsp,64+24	; 64 bytes of state + three saved qwords
83
84$L$ctr32_body:
85
86
87	movdqu	xmm1,XMMWORD[rcx]	; state words 4..7
88	movdqu	xmm2,XMMWORD[16+rcx]	; state words 8..11
89	movdqu	xmm3,XMMWORD[r8]	; state words 12..15
90	movdqa	xmm4,XMMWORD[$L$one]	; counter increment {1,0,0,0}
91
92
93	movdqa	XMMWORD[16+rsp],xmm1
94	movdqa	XMMWORD[32+rsp],xmm2
95	movdqa	XMMWORD[48+rsp],xmm3
96	mov	rbp,rdx	; rbp = remaining byte count
97	jmp	NEAR $L$oop_outer
98
; Per-block setup: load the 16 working words.
;   x0..x3   -> eax, ebx, ecx, edx   (constants spelling "expand 32-byte k")
;   x4..x7   -> r8d..r11d
;   x8, x9   -> esi, edi             (x10, x11 stay at [40+rsp], [44+rsp])
;   x12..x15 -> r12d..r15d
99ALIGN	32
100$L$oop_outer:
101	mov	eax,0x61707865
102	mov	ebx,0x3320646e
103	mov	ecx,0x79622d32
104	mov	edx,0x6b206574
105	mov	r8d,DWORD[16+rsp]
106	mov	r9d,DWORD[20+rsp]
107	mov	r10d,DWORD[24+rsp]
108	mov	r11d,DWORD[28+rsp]
109	movd	r12d,xmm3	; current block counter word
110	mov	r13d,DWORD[52+rsp]
111	mov	r14d,DWORD[56+rsp]
112	mov	r15d,DWORD[60+rsp]
113
114	mov	QWORD[((64+0))+rsp],rbp	; free rbp for the round counter
115	mov	ebp,10	; 10 iterations, each a column pass plus a diagonal pass
116	mov	QWORD[((64+8))+rsp],rsi	; free rsi/rdi for x8/x9
117DB	102,72,15,126,214	; movq rsi,xmm2  (esi = x8, upper half = x9)
118	mov	QWORD[((64+16))+rsp],rdi
119	mov	rdi,rsi
120	shr	rdi,32	; edi = x9
121	jmp	NEAR $L$oop
122
123ALIGN	32
124$L$oop:
; Quarter-rounds on (x0,x4,x8,x12) and (x1,x5,x9,x13), interleaved.
125	add	eax,r8d
126	xor	r12d,eax
127	rol	r12d,16
128	add	ebx,r9d
129	xor	r13d,ebx
130	rol	r13d,16
131	add	esi,r12d
132	xor	r8d,esi
133	rol	r8d,12
134	add	edi,r13d
135	xor	r9d,edi
136	rol	r9d,12
137	add	eax,r8d
138	xor	r12d,eax
139	rol	r12d,8
140	add	ebx,r9d
141	xor	r13d,ebx
142	rol	r13d,8
143	add	esi,r12d
144	xor	r8d,esi
145	rol	r8d,7
146	add	edi,r13d
147	xor	r9d,edi
148	rol	r9d,7
; Swap the register-resident pair: spill x8/x9, load x10/x11.
149	mov	DWORD[32+rsp],esi
150	mov	DWORD[36+rsp],edi
151	mov	esi,DWORD[40+rsp]
152	mov	edi,DWORD[44+rsp]
; Quarter-rounds on (x2,x6,x10,x14) and (x3,x7,x11,x15).
153	add	ecx,r10d
154	xor	r14d,ecx
155	rol	r14d,16
156	add	edx,r11d
157	xor	r15d,edx
158	rol	r15d,16
159	add	esi,r14d
160	xor	r10d,esi
161	rol	r10d,12
162	add	edi,r15d
163	xor	r11d,edi
164	rol	r11d,12
165	add	ecx,r10d
166	xor	r14d,ecx
167	rol	r14d,8
168	add	edx,r11d
169	xor	r15d,edx
170	rol	r15d,8
171	add	esi,r14d
172	xor	r10d,esi
173	rol	r10d,7
174	add	edi,r15d
175	xor	r11d,edi
176	rol	r11d,7
; Quarter-rounds on (x0,x5,x10,x15) and (x1,x6,x11,x12).
177	add	eax,r9d
178	xor	r15d,eax
179	rol	r15d,16
180	add	ebx,r10d
181	xor	r12d,ebx
182	rol	r12d,16
183	add	esi,r15d
184	xor	r9d,esi
185	rol	r9d,12
186	add	edi,r12d
187	xor	r10d,edi
188	rol	r10d,12
189	add	eax,r9d
190	xor	r15d,eax
191	rol	r15d,8
192	add	ebx,r10d
193	xor	r12d,ebx
194	rol	r12d,8
195	add	esi,r15d
196	xor	r9d,esi
197	rol	r9d,7
198	add	edi,r12d
199	xor	r10d,edi
200	rol	r10d,7
; Swap back: spill x10/x11, reload x8/x9.
201	mov	DWORD[40+rsp],esi
202	mov	DWORD[44+rsp],edi
203	mov	esi,DWORD[32+rsp]
204	mov	edi,DWORD[36+rsp]
; Quarter-rounds on (x2,x7,x8,x13) and (x3,x4,x9,x14).
205	add	ecx,r11d
206	xor	r13d,ecx
207	rol	r13d,16
208	add	edx,r8d
209	xor	r14d,edx
210	rol	r14d,16
211	add	esi,r13d
212	xor	r11d,esi
213	rol	r11d,12
214	add	edi,r14d
215	xor	r8d,edi
216	rol	r8d,12
217	add	ecx,r11d
218	xor	r13d,ecx
219	rol	r13d,8
220	add	edx,r8d
221	xor	r14d,edx
222	rol	r14d,8
223	add	esi,r13d
224	xor	r11d,esi
225	rol	r11d,7
226	add	edi,r14d
227	xor	r8d,edi
228	rol	r8d,7
229	dec	ebp
230	jnz	NEAR $L$oop
; Rounds done: write x8/x9 back so [32..47]+rsp holds the worked x8..x11,
; restore the saved pointers/count and add the input state to the result.
231	mov	DWORD[36+rsp],edi
232	mov	DWORD[32+rsp],esi
233	mov	rbp,QWORD[64+rsp]	; remaining byte count
234	movdqa	xmm1,xmm2	; xmm1 = input words 8..11
235	mov	rsi,QWORD[((64+8))+rsp]	; source pointer
236	paddd	xmm3,xmm4	; advance the block counter for the next block
237	mov	rdi,QWORD[((64+16))+rsp]	; destination pointer
238
239	add	eax,0x61707865
240	add	ebx,0x3320646e
241	add	ecx,0x79622d32
242	add	edx,0x6b206574
243	add	r8d,DWORD[16+rsp]
244	add	r9d,DWORD[20+rsp]
245	add	r10d,DWORD[24+rsp]
246	add	r11d,DWORD[28+rsp]
247	add	r12d,DWORD[48+rsp]
248	add	r13d,DWORD[52+rsp]
249	add	r14d,DWORD[56+rsp]
250	add	r15d,DWORD[60+rsp]
251	paddd	xmm1,XMMWORD[32+rsp]	; xmm1 = final keystream words 8..11
252
253	cmp	rbp,64
254	jb	NEAR $L$tail	; fewer than 64 bytes left: byte-wise tail
255
; Full block: XOR 64 source bytes with the keystream.
256	xor	eax,DWORD[rsi]
257	xor	ebx,DWORD[4+rsi]
258	xor	ecx,DWORD[8+rsi]
259	xor	edx,DWORD[12+rsi]
260	xor	r8d,DWORD[16+rsi]
261	xor	r9d,DWORD[20+rsi]
262	xor	r10d,DWORD[24+rsi]
263	xor	r11d,DWORD[28+rsi]
264	movdqu	xmm0,XMMWORD[32+rsi]
265	xor	r12d,DWORD[48+rsi]
266	xor	r13d,DWORD[52+rsi]
267	xor	r14d,DWORD[56+rsi]
268	xor	r15d,DWORD[60+rsi]
269	lea	rsi,[64+rsi]
270	pxor	xmm0,xmm1
271
; Re-arm the stack copy for the next block: original words 8..11 and the
; updated counter word.
272	movdqa	XMMWORD[32+rsp],xmm2
273	movd	DWORD[48+rsp],xmm3
274
275	mov	DWORD[rdi],eax
276	mov	DWORD[4+rdi],ebx
277	mov	DWORD[8+rdi],ecx
278	mov	DWORD[12+rdi],edx
279	mov	DWORD[16+rdi],r8d
280	mov	DWORD[20+rdi],r9d
281	mov	DWORD[24+rdi],r10d
282	mov	DWORD[28+rdi],r11d
283	movdqu	XMMWORD[32+rdi],xmm0
284	mov	DWORD[48+rdi],r12d
285	mov	DWORD[52+rdi],r13d
286	mov	DWORD[56+rdi],r14d
287	mov	DWORD[60+rdi],r15d
288	lea	rdi,[64+rdi]
289
290	sub	rbp,64
291	jnz	NEAR $L$oop_outer
292
293	jmp	NEAR $L$done
294
; Tail: lay the 64 keystream bytes out at [0..63]+rsp, then XOR the
; remaining rbp bytes one at a time (rbx = byte index).
295ALIGN	16
296$L$tail:
297	mov	DWORD[rsp],eax
298	mov	DWORD[4+rsp],ebx
299	xor	rbx,rbx	; byte index = 0 (ebx was stored on the previous line)
300	mov	DWORD[8+rsp],ecx
301	mov	DWORD[12+rsp],edx
302	mov	DWORD[16+rsp],r8d
303	mov	DWORD[20+rsp],r9d
304	mov	DWORD[24+rsp],r10d
305	mov	DWORD[28+rsp],r11d
306	movdqa	XMMWORD[32+rsp],xmm1
307	mov	DWORD[48+rsp],r12d
308	mov	DWORD[52+rsp],r13d
309	mov	DWORD[56+rsp],r14d
310	mov	DWORD[60+rsp],r15d
311
312$L$oop_tail:
313	movzx	eax,BYTE[rbx*1+rsi]	; source byte
314	movzx	edx,BYTE[rbx*1+rsp]	; keystream byte
315	lea	rbx,[1+rbx]
316	xor	eax,edx
317	mov	BYTE[((-1))+rbx*1+rdi],al
318	dec	rbp
319	jnz	NEAR $L$oop_tail
320
; Epilogue: rsi is pointed just above the six pushed registers, which are
; reloaded by negative offsets before rsp is reset to that address.
321$L$done:
322	lea	rsi,[((64+24+48))+rsp]
323	mov	r15,QWORD[((-48))+rsi]
324
325	mov	r14,QWORD[((-40))+rsi]
326
327	mov	r13,QWORD[((-32))+rsi]
328
329	mov	r12,QWORD[((-24))+rsi]
330
331	mov	rbp,QWORD[((-16))+rsi]
332
333	mov	rbx,QWORD[((-8))+rsi]
334
335	lea	rsp,[rsi]
336
337$L$no_data:
338	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
339	mov	rsi,QWORD[16+rsp]
340	ret
341
342$L$SEH_end_ChaCha20_ctr32_nohw:
;-----------------------------------------------------------------------
; ChaCha20_ctr32_ssse3
; One 64-byte block per outer iteration. The 4x4 state is held as four
; 128-bit rows in xmm0..xmm3; the 16- and 8-bit rotations are done with
; pshufb using the $L$rot16 / $L$rot24 masks kept in xmm6 / xmm7.
;
; Win64 entry; after the prologue's register shuffle:
;   rdi = destination, rsi = source, rdx = byte count,
;   rcx = pointer to 32 bytes loaded as rows 1 and 2,
;   r8  = pointer to 16 bytes loaded as row 3 (its first dword is
;         incremented by one per block).
; r9 keeps the entry rsp. xmm6/xmm7 are saved below it at -40/-24 and
; restored in the epilogue. [0..63]+rsp holds the four input rows.
; Like the scalar routine, no zero-length check is visible here.
;-----------------------------------------------------------------------
343global	ChaCha20_ctr32_ssse3
344
345ALIGN	32
346ChaCha20_ctr32_ssse3:
347	mov	QWORD[8+rsp],rdi	;WIN64 prologue
348	mov	QWORD[16+rsp],rsi
349	mov	rax,rsp
350$L$SEH_begin_ChaCha20_ctr32_ssse3:
351	mov	rdi,rcx	; 1st argument -> rdi
352	mov	rsi,rdx	; 2nd argument -> rsi
353	mov	rdx,r8	; 3rd argument -> rdx
354	mov	rcx,r9	; 4th argument -> rcx
355	mov	r8,QWORD[40+rsp]	; 5th argument (stack) -> r8
356
357
358
359_CET_ENDBR
360	mov	r9,rsp	; r9 = entry rsp, used to restore the frame
361
362	sub	rsp,64+40
363	movaps	XMMWORD[(-40)+r9],xmm6
364	movaps	XMMWORD[(-24)+r9],xmm7
365$L$ssse3_body:
366	movdqa	xmm0,XMMWORD[$L$sigma]	; row 0: constants
367	movdqu	xmm1,XMMWORD[rcx]	; row 1
368	movdqu	xmm2,XMMWORD[16+rcx]	; row 2
369	movdqu	xmm3,XMMWORD[r8]	; row 3
370	movdqa	xmm6,XMMWORD[$L$rot16]
371	movdqa	xmm7,XMMWORD[$L$rot24]
372
373	movdqa	XMMWORD[rsp],xmm0
374	movdqa	XMMWORD[16+rsp],xmm1
375	movdqa	XMMWORD[32+rsp],xmm2
376	movdqa	XMMWORD[48+rsp],xmm3
377	mov	r8,10	; round-pair counter
378	jmp	NEAR $L$oop_ssse3
379
; Next block: reload the input rows and add {1,0,0,0} to row 3, keeping the
; updated row on the stack.
380ALIGN	32
381$L$oop_outer_ssse3:
382	movdqa	xmm3,XMMWORD[$L$one]
383	movdqa	xmm0,XMMWORD[rsp]
384	movdqa	xmm1,XMMWORD[16+rsp]
385	movdqa	xmm2,XMMWORD[32+rsp]
386	paddd	xmm3,XMMWORD[48+rsp]
387	mov	r8,10
388	movdqa	XMMWORD[48+rsp],xmm3
389	jmp	NEAR $L$oop_ssse3
390
; Each iteration applies the quarter-round to all four columns at once,
; rotates rows 1..3 by lanes, applies it again, then rotates the rows back.
391ALIGN	32
392$L$oop_ssse3:
393	paddd	xmm0,xmm1
394	pxor	xmm3,xmm0
395DB	102,15,56,0,222	; pshufb xmm3,xmm6  (rotate lanes by 16)
396	paddd	xmm2,xmm3
397	pxor	xmm1,xmm2
398	movdqa	xmm4,xmm1
399	psrld	xmm1,20
400	pslld	xmm4,12
401	por	xmm1,xmm4	; row 1 rotated left by 12
402	paddd	xmm0,xmm1
403	pxor	xmm3,xmm0
404DB	102,15,56,0,223	; pshufb xmm3,xmm7  (rotate lanes left by 8)
405	paddd	xmm2,xmm3
406	pxor	xmm1,xmm2
407	movdqa	xmm4,xmm1
408	psrld	xmm1,25
409	pslld	xmm4,7
410	por	xmm1,xmm4	; row 1 rotated left by 7
411	pshufd	xmm2,xmm2,78	; row 2: rotate by two lanes
412	pshufd	xmm1,xmm1,57	; row 1: rotate by one lane
413	pshufd	xmm3,xmm3,147	; row 3: rotate by three lanes
414	nop
415	paddd	xmm0,xmm1
416	pxor	xmm3,xmm0
417DB	102,15,56,0,222	; pshufb xmm3,xmm6
418	paddd	xmm2,xmm3
419	pxor	xmm1,xmm2
420	movdqa	xmm4,xmm1
421	psrld	xmm1,20
422	pslld	xmm4,12
423	por	xmm1,xmm4
424	paddd	xmm0,xmm1
425	pxor	xmm3,xmm0
426DB	102,15,56,0,223	; pshufb xmm3,xmm7
427	paddd	xmm2,xmm3
428	pxor	xmm1,xmm2
429	movdqa	xmm4,xmm1
430	psrld	xmm1,25
431	pslld	xmm4,7
432	por	xmm1,xmm4
433	pshufd	xmm2,xmm2,78	; undo the lane rotations
434	pshufd	xmm1,xmm1,147
435	pshufd	xmm3,xmm3,57
436	dec	r8
437	jnz	NEAR $L$oop_ssse3
; Add the input rows to obtain the keystream block.
438	paddd	xmm0,XMMWORD[rsp]
439	paddd	xmm1,XMMWORD[16+rsp]
440	paddd	xmm2,XMMWORD[32+rsp]
441	paddd	xmm3,XMMWORD[48+rsp]
442
443	cmp	rdx,64
444	jb	NEAR $L$tail_ssse3	; fewer than 64 bytes left
445
; Full block: XOR 64 source bytes and store them.
446	movdqu	xmm4,XMMWORD[rsi]
447	movdqu	xmm5,XMMWORD[16+rsi]
448	pxor	xmm0,xmm4
449	movdqu	xmm4,XMMWORD[32+rsi]
450	pxor	xmm1,xmm5
451	movdqu	xmm5,XMMWORD[48+rsi]
452	lea	rsi,[64+rsi]
453	pxor	xmm2,xmm4
454	pxor	xmm3,xmm5
455
456	movdqu	XMMWORD[rdi],xmm0
457	movdqu	XMMWORD[16+rdi],xmm1
458	movdqu	XMMWORD[32+rdi],xmm2
459	movdqu	XMMWORD[48+rdi],xmm3
460	lea	rdi,[64+rdi]
461
462	sub	rdx,64
463	jnz	NEAR $L$oop_outer_ssse3
464
465	jmp	NEAR $L$done_ssse3
466
; Tail: stage the keystream block at [0..63]+rsp (overwriting the saved
; input rows, which are no longer needed) and XOR byte by byte; r8 = index.
467ALIGN	16
468$L$tail_ssse3:
469	movdqa	XMMWORD[rsp],xmm0
470	movdqa	XMMWORD[16+rsp],xmm1
471	movdqa	XMMWORD[32+rsp],xmm2
472	movdqa	XMMWORD[48+rsp],xmm3
473	xor	r8,r8
474
475$L$oop_tail_ssse3:
476	movzx	eax,BYTE[r8*1+rsi]	; source byte
477	movzx	ecx,BYTE[r8*1+rsp]	; keystream byte
478	lea	r8,[1+r8]
479	xor	eax,ecx
480	mov	BYTE[((-1))+r8*1+rdi],al
481	dec	rdx
482	jnz	NEAR $L$oop_tail_ssse3
483
; Restore xmm6/xmm7 and the entry stack pointer.
484$L$done_ssse3:
485	movaps	xmm6,XMMWORD[((-40))+r9]
486	movaps	xmm7,XMMWORD[((-24))+r9]
487	lea	rsp,[r9]
488
489$L$ssse3_epilogue:
490	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
491	mov	rsi,QWORD[16+rsp]
492	ret
493
494$L$SEH_end_ChaCha20_ctr32_ssse3:
;-----------------------------------------------------------------------
; ChaCha20_ctr32_ssse3_4x
; Four 64-byte blocks per outer iteration. Each xmm register holds the
; same state word for four blocks (one block per 32-bit lane); the
; counter lanes differ by $L$inc = {0,1,2,3} and advance by $L$four.
;
; Win64 entry; after the prologue's register shuffle:
;   rdi = destination, rsi = source, rdx = byte count,
;   rcx = pointer to 32 bytes (state words 4..11),
;   r8  = pointer to 16 bytes (state words 12..15).
; rcx is then repurposed as rsp+256, the base for the broadcast state.
; Register use inside the round loop:
;   xmm8..xmm11  = words 0..3      xmm12..xmm15 = words 4..7
;   xmm4, xmm5   = two of words 8..11 (the other two are spilled to
;                  [0/16]+rsp or [32/48]+rsp and swapped mid-round)
;   xmm0..xmm3   = words 12..15
;   xmm6, xmm7   = temporaries, alternately shift scratch and the
;                  rot24 / rot16 pshufb masks reloaded via r11 / r10
; Stack frame (rsp after "sub rsp,0x140+168"):
;   [  0.. 63]+rsp  spill area for words 8..11, later output staging
;   [ 64..127]+rsp  broadcast words 0..3
;   [128..255]+rsp  broadcast words 4..11
;   [256..319]+rsp  broadcast words 12..15 (word 12 = per-lane counters)
; r9 keeps the entry rsp; xmm6..xmm15 are saved below it and restored.
; No zero-length check is visible in this routine.
;-----------------------------------------------------------------------
495global	ChaCha20_ctr32_ssse3_4x
496
497ALIGN	32
498ChaCha20_ctr32_ssse3_4x:
499	mov	QWORD[8+rsp],rdi	;WIN64 prologue
500	mov	QWORD[16+rsp],rsi
501	mov	rax,rsp
502$L$SEH_begin_ChaCha20_ctr32_ssse3_4x:
503	mov	rdi,rcx	; 1st argument -> rdi
504	mov	rsi,rdx	; 2nd argument -> rsi
505	mov	rdx,r8	; 3rd argument -> rdx
506	mov	rcx,r9	; 4th argument -> rcx
507	mov	r8,QWORD[40+rsp]	; 5th argument (stack) -> r8
508
509
510
511_CET_ENDBR
512	mov	r9,rsp	; r9 = entry rsp, used to restore the frame
513
514	sub	rsp,0x140+168
515	movaps	XMMWORD[(-168)+r9],xmm6
516	movaps	XMMWORD[(-152)+r9],xmm7
517	movaps	XMMWORD[(-136)+r9],xmm8
518	movaps	XMMWORD[(-120)+r9],xmm9
519	movaps	XMMWORD[(-104)+r9],xmm10
520	movaps	XMMWORD[(-88)+r9],xmm11
521	movaps	XMMWORD[(-72)+r9],xmm12
522	movaps	XMMWORD[(-56)+r9],xmm13
523	movaps	XMMWORD[(-40)+r9],xmm14
524	movaps	XMMWORD[(-24)+r9],xmm15
525$L$4x_body:
526	movdqa	xmm11,XMMWORD[$L$sigma]	; words 0..3
527	movdqu	xmm15,XMMWORD[rcx]	; words 4..7
528	movdqu	xmm7,XMMWORD[16+rcx]	; words 8..11
529	movdqu	xmm3,XMMWORD[r8]	; words 12..15
530	lea	rcx,[256+rsp]	; rcx now addresses the broadcast state
531	lea	r10,[$L$rot16]
532	lea	r11,[$L$rot24]
533
; Broadcast each state word across all four lanes and keep a copy on the stack.
534	pshufd	xmm8,xmm11,0x00
535	pshufd	xmm9,xmm11,0x55
536	movdqa	XMMWORD[64+rsp],xmm8
537	pshufd	xmm10,xmm11,0xaa
538	movdqa	XMMWORD[80+rsp],xmm9
539	pshufd	xmm11,xmm11,0xff
540	movdqa	XMMWORD[96+rsp],xmm10
541	movdqa	XMMWORD[112+rsp],xmm11
542
543	pshufd	xmm12,xmm15,0x00
544	pshufd	xmm13,xmm15,0x55
545	movdqa	XMMWORD[(128-256)+rcx],xmm12
546	pshufd	xmm14,xmm15,0xaa
547	movdqa	XMMWORD[(144-256)+rcx],xmm13
548	pshufd	xmm15,xmm15,0xff
549	movdqa	XMMWORD[(160-256)+rcx],xmm14
550	movdqa	XMMWORD[(176-256)+rcx],xmm15
551
552	pshufd	xmm4,xmm7,0x00
553	pshufd	xmm5,xmm7,0x55
554	movdqa	XMMWORD[(192-256)+rcx],xmm4
555	pshufd	xmm6,xmm7,0xaa
556	movdqa	XMMWORD[(208-256)+rcx],xmm5
557	pshufd	xmm7,xmm7,0xff
558	movdqa	XMMWORD[(224-256)+rcx],xmm6
559	movdqa	XMMWORD[(240-256)+rcx],xmm7
560
561	pshufd	xmm0,xmm3,0x00
562	pshufd	xmm1,xmm3,0x55
563	paddd	xmm0,XMMWORD[$L$inc]	; lane counters = counter + {0,1,2,3}
564	pshufd	xmm2,xmm3,0xaa
565	movdqa	XMMWORD[(272-256)+rcx],xmm1
566	pshufd	xmm3,xmm3,0xff
567	movdqa	XMMWORD[(288-256)+rcx],xmm2
568	movdqa	XMMWORD[(304-256)+rcx],xmm3
569
570	jmp	NEAR $L$oop_enter4x
571
; Next group of four blocks: reload the broadcast state and advance every
; lane counter by four.
572ALIGN	32
573$L$oop_outer4x:
574	movdqa	xmm8,XMMWORD[64+rsp]
575	movdqa	xmm9,XMMWORD[80+rsp]
576	movdqa	xmm10,XMMWORD[96+rsp]
577	movdqa	xmm11,XMMWORD[112+rsp]
578	movdqa	xmm12,XMMWORD[((128-256))+rcx]
579	movdqa	xmm13,XMMWORD[((144-256))+rcx]
580	movdqa	xmm14,XMMWORD[((160-256))+rcx]
581	movdqa	xmm15,XMMWORD[((176-256))+rcx]
582	movdqa	xmm4,XMMWORD[((192-256))+rcx]
583	movdqa	xmm5,XMMWORD[((208-256))+rcx]
584	movdqa	xmm6,XMMWORD[((224-256))+rcx]
585	movdqa	xmm7,XMMWORD[((240-256))+rcx]
586	movdqa	xmm0,XMMWORD[((256-256))+rcx]
587	movdqa	xmm1,XMMWORD[((272-256))+rcx]
588	movdqa	xmm2,XMMWORD[((288-256))+rcx]
589	movdqa	xmm3,XMMWORD[((304-256))+rcx]
590	paddd	xmm0,XMMWORD[$L$four]
591
592$L$oop_enter4x:
593	movdqa	XMMWORD[32+rsp],xmm6	; spill word 10
594	movdqa	XMMWORD[48+rsp],xmm7	; spill word 11
595	movdqa	xmm7,XMMWORD[r10]	; xmm7 = rot16 mask
596	mov	eax,10	; round-pair counter
597	movdqa	XMMWORD[(256-256)+rcx],xmm0	; remember this group's lane counters
598	jmp	NEAR $L$oop4x
599
600ALIGN	32
601$L$oop4x:
; Quarter-rounds on words (0,4,8,12) and (1,5,9,13), interleaved.
602	paddd	xmm8,xmm12
603	paddd	xmm9,xmm13
604	pxor	xmm0,xmm8
605	pxor	xmm1,xmm9
606DB	102,15,56,0,199	; pshufb xmm0,xmm7
607DB	102,15,56,0,207	; pshufb xmm1,xmm7
608	paddd	xmm4,xmm0
609	paddd	xmm5,xmm1
610	pxor	xmm12,xmm4
611	pxor	xmm13,xmm5
612	movdqa	xmm6,xmm12
613	pslld	xmm12,12
614	psrld	xmm6,20
615	movdqa	xmm7,xmm13
616	pslld	xmm13,12
617	por	xmm12,xmm6
618	psrld	xmm7,20
619	movdqa	xmm6,XMMWORD[r11]	; xmm6 = rot24 mask
620	por	xmm13,xmm7
621	paddd	xmm8,xmm12
622	paddd	xmm9,xmm13
623	pxor	xmm0,xmm8
624	pxor	xmm1,xmm9
625DB	102,15,56,0,198	; pshufb xmm0,xmm6
626DB	102,15,56,0,206	; pshufb xmm1,xmm6
627	paddd	xmm4,xmm0
628	paddd	xmm5,xmm1
629	pxor	xmm12,xmm4
630	pxor	xmm13,xmm5
631	movdqa	xmm7,xmm12
632	pslld	xmm12,7
633	psrld	xmm7,25
634	movdqa	xmm6,xmm13
635	pslld	xmm13,7
636	por	xmm12,xmm7
637	psrld	xmm6,25
638	movdqa	xmm7,XMMWORD[r10]	; xmm7 = rot16 mask again
639	por	xmm13,xmm6
; Swap the register-resident pair: spill words 8/9, load words 10/11.
640	movdqa	XMMWORD[rsp],xmm4
641	movdqa	XMMWORD[16+rsp],xmm5
642	movdqa	xmm4,XMMWORD[32+rsp]
643	movdqa	xmm5,XMMWORD[48+rsp]
; Quarter-rounds on words (2,6,10,14) and (3,7,11,15).
644	paddd	xmm10,xmm14
645	paddd	xmm11,xmm15
646	pxor	xmm2,xmm10
647	pxor	xmm3,xmm11
648DB	102,15,56,0,215	; pshufb xmm2,xmm7
649DB	102,15,56,0,223	; pshufb xmm3,xmm7
650	paddd	xmm4,xmm2
651	paddd	xmm5,xmm3
652	pxor	xmm14,xmm4
653	pxor	xmm15,xmm5
654	movdqa	xmm6,xmm14
655	pslld	xmm14,12
656	psrld	xmm6,20
657	movdqa	xmm7,xmm15
658	pslld	xmm15,12
659	por	xmm14,xmm6
660	psrld	xmm7,20
661	movdqa	xmm6,XMMWORD[r11]
662	por	xmm15,xmm7
663	paddd	xmm10,xmm14
664	paddd	xmm11,xmm15
665	pxor	xmm2,xmm10
666	pxor	xmm3,xmm11
667DB	102,15,56,0,214	; pshufb xmm2,xmm6
668DB	102,15,56,0,222	; pshufb xmm3,xmm6
669	paddd	xmm4,xmm2
670	paddd	xmm5,xmm3
671	pxor	xmm14,xmm4
672	pxor	xmm15,xmm5
673	movdqa	xmm7,xmm14
674	pslld	xmm14,7
675	psrld	xmm7,25
676	movdqa	xmm6,xmm15
677	pslld	xmm15,7
678	por	xmm14,xmm7
679	psrld	xmm6,25
680	movdqa	xmm7,XMMWORD[r10]
681	por	xmm15,xmm6
; Quarter-rounds on words (0,5,10,15) and (1,6,11,12).
682	paddd	xmm8,xmm13
683	paddd	xmm9,xmm14
684	pxor	xmm3,xmm8
685	pxor	xmm0,xmm9
686DB	102,15,56,0,223	; pshufb xmm3,xmm7
687DB	102,15,56,0,199	; pshufb xmm0,xmm7
688	paddd	xmm4,xmm3
689	paddd	xmm5,xmm0
690	pxor	xmm13,xmm4
691	pxor	xmm14,xmm5
692	movdqa	xmm6,xmm13
693	pslld	xmm13,12
694	psrld	xmm6,20
695	movdqa	xmm7,xmm14
696	pslld	xmm14,12
697	por	xmm13,xmm6
698	psrld	xmm7,20
699	movdqa	xmm6,XMMWORD[r11]
700	por	xmm14,xmm7
701	paddd	xmm8,xmm13
702	paddd	xmm9,xmm14
703	pxor	xmm3,xmm8
704	pxor	xmm0,xmm9
705DB	102,15,56,0,222	; pshufb xmm3,xmm6
706DB	102,15,56,0,198	; pshufb xmm0,xmm6
707	paddd	xmm4,xmm3
708	paddd	xmm5,xmm0
709	pxor	xmm13,xmm4
710	pxor	xmm14,xmm5
711	movdqa	xmm7,xmm13
712	pslld	xmm13,7
713	psrld	xmm7,25
714	movdqa	xmm6,xmm14
715	pslld	xmm14,7
716	por	xmm13,xmm7
717	psrld	xmm6,25
718	movdqa	xmm7,XMMWORD[r10]
719	por	xmm14,xmm6
; Swap back: spill words 10/11, reload words 8/9.
720	movdqa	XMMWORD[32+rsp],xmm4
721	movdqa	XMMWORD[48+rsp],xmm5
722	movdqa	xmm4,XMMWORD[rsp]
723	movdqa	xmm5,XMMWORD[16+rsp]
; Quarter-rounds on words (2,7,8,13) and (3,4,9,14).
724	paddd	xmm10,xmm15
725	paddd	xmm11,xmm12
726	pxor	xmm1,xmm10
727	pxor	xmm2,xmm11
728DB	102,15,56,0,207	; pshufb xmm1,xmm7
729DB	102,15,56,0,215	; pshufb xmm2,xmm7
730	paddd	xmm4,xmm1
731	paddd	xmm5,xmm2
732	pxor	xmm15,xmm4
733	pxor	xmm12,xmm5
734	movdqa	xmm6,xmm15
735	pslld	xmm15,12
736	psrld	xmm6,20
737	movdqa	xmm7,xmm12
738	pslld	xmm12,12
739	por	xmm15,xmm6
740	psrld	xmm7,20
741	movdqa	xmm6,XMMWORD[r11]
742	por	xmm12,xmm7
743	paddd	xmm10,xmm15
744	paddd	xmm11,xmm12
745	pxor	xmm1,xmm10
746	pxor	xmm2,xmm11
747DB	102,15,56,0,206	; pshufb xmm1,xmm6
748DB	102,15,56,0,214	; pshufb xmm2,xmm6
749	paddd	xmm4,xmm1
750	paddd	xmm5,xmm2
751	pxor	xmm15,xmm4
752	pxor	xmm12,xmm5
753	movdqa	xmm7,xmm15
754	pslld	xmm15,7
755	psrld	xmm7,25
756	movdqa	xmm6,xmm12
757	pslld	xmm12,7
758	por	xmm15,xmm7
759	psrld	xmm6,25
760	movdqa	xmm7,XMMWORD[r10]
761	por	xmm12,xmm6
762	dec	eax
763	jnz	NEAR $L$oop4x
764
; Rounds done. For each group of four words: add the input state, then
; transpose the 4x4 dword matrix so every register holds 16 contiguous
; keystream bytes of a single block instead of one word of four blocks.
765	paddd	xmm8,XMMWORD[64+rsp]
766	paddd	xmm9,XMMWORD[80+rsp]
767	paddd	xmm10,XMMWORD[96+rsp]
768	paddd	xmm11,XMMWORD[112+rsp]
769
770	movdqa	xmm6,xmm8
771	punpckldq	xmm8,xmm9
772	movdqa	xmm7,xmm10
773	punpckldq	xmm10,xmm11
774	punpckhdq	xmm6,xmm9
775	punpckhdq	xmm7,xmm11
776	movdqa	xmm9,xmm8
777	punpcklqdq	xmm8,xmm10
778	movdqa	xmm11,xmm6
779	punpcklqdq	xmm6,xmm7
780	punpckhqdq	xmm9,xmm10
781	punpckhqdq	xmm11,xmm7
782	paddd	xmm12,XMMWORD[((128-256))+rcx]
783	paddd	xmm13,XMMWORD[((144-256))+rcx]
784	paddd	xmm14,XMMWORD[((160-256))+rcx]
785	paddd	xmm15,XMMWORD[((176-256))+rcx]
786
; Park bytes 0..15 of blocks 0 and 1 on the stack and fetch the spilled
; words 10/11 into the freed registers.
787	movdqa	XMMWORD[rsp],xmm8
788	movdqa	XMMWORD[16+rsp],xmm9
789	movdqa	xmm8,XMMWORD[32+rsp]
790	movdqa	xmm9,XMMWORD[48+rsp]
791
792	movdqa	xmm10,xmm12
793	punpckldq	xmm12,xmm13
794	movdqa	xmm7,xmm14
795	punpckldq	xmm14,xmm15
796	punpckhdq	xmm10,xmm13
797	punpckhdq	xmm7,xmm15
798	movdqa	xmm13,xmm12
799	punpcklqdq	xmm12,xmm14
800	movdqa	xmm15,xmm10
801	punpcklqdq	xmm10,xmm7
802	punpckhqdq	xmm13,xmm14
803	punpckhqdq	xmm15,xmm7
804	paddd	xmm4,XMMWORD[((192-256))+rcx]
805	paddd	xmm5,XMMWORD[((208-256))+rcx]
806	paddd	xmm8,XMMWORD[((224-256))+rcx]
807	paddd	xmm9,XMMWORD[((240-256))+rcx]
808
; Park bytes 0..15 of blocks 2 and 3.
809	movdqa	XMMWORD[32+rsp],xmm6
810	movdqa	XMMWORD[48+rsp],xmm11
811
812	movdqa	xmm14,xmm4
813	punpckldq	xmm4,xmm5
814	movdqa	xmm7,xmm8
815	punpckldq	xmm8,xmm9
816	punpckhdq	xmm14,xmm5
817	punpckhdq	xmm7,xmm9
818	movdqa	xmm5,xmm4
819	punpcklqdq	xmm4,xmm8
820	movdqa	xmm9,xmm14
821	punpcklqdq	xmm14,xmm7
822	punpckhqdq	xmm5,xmm8
823	punpckhqdq	xmm9,xmm7
824	paddd	xmm0,XMMWORD[((256-256))+rcx]
825	paddd	xmm1,XMMWORD[((272-256))+rcx]
826	paddd	xmm2,XMMWORD[((288-256))+rcx]
827	paddd	xmm3,XMMWORD[((304-256))+rcx]
828
829	movdqa	xmm8,xmm0
830	punpckldq	xmm0,xmm1
831	movdqa	xmm7,xmm2
832	punpckldq	xmm2,xmm3
833	punpckhdq	xmm8,xmm1
834	punpckhdq	xmm7,xmm3
835	movdqa	xmm1,xmm0
836	punpcklqdq	xmm0,xmm2
837	movdqa	xmm3,xmm8
838	punpcklqdq	xmm8,xmm7
839	punpckhqdq	xmm1,xmm2
840	punpckhqdq	xmm3,xmm7
; Keystream layout at this point (16 bytes each, in output order):
;   block 0: [rsp],    xmm12, xmm4,  xmm0
;   block 1: [16+rsp], xmm13, xmm5,  xmm1
;   block 2: [32+rsp], xmm10, xmm14, xmm8
;   block 3: [48+rsp], xmm15, xmm9,  xmm3
841	cmp	rdx,64*4
842	jb	NEAR $L$tail4x	; fewer than 256 bytes left
843
; Full 256 bytes: loads, XORs and stores are interleaved; rsi/rdi advance
; by 128 twice.
844	movdqu	xmm6,XMMWORD[rsi]
845	movdqu	xmm11,XMMWORD[16+rsi]
846	movdqu	xmm2,XMMWORD[32+rsi]
847	movdqu	xmm7,XMMWORD[48+rsi]
848	pxor	xmm6,XMMWORD[rsp]
849	pxor	xmm11,xmm12
850	pxor	xmm2,xmm4
851	pxor	xmm7,xmm0
852
853	movdqu	XMMWORD[rdi],xmm6
854	movdqu	xmm6,XMMWORD[64+rsi]
855	movdqu	XMMWORD[16+rdi],xmm11
856	movdqu	xmm11,XMMWORD[80+rsi]
857	movdqu	XMMWORD[32+rdi],xmm2
858	movdqu	xmm2,XMMWORD[96+rsi]
859	movdqu	XMMWORD[48+rdi],xmm7
860	movdqu	xmm7,XMMWORD[112+rsi]
861	lea	rsi,[128+rsi]
862	pxor	xmm6,XMMWORD[16+rsp]
863	pxor	xmm11,xmm13
864	pxor	xmm2,xmm5
865	pxor	xmm7,xmm1
866
867	movdqu	XMMWORD[64+rdi],xmm6
868	movdqu	xmm6,XMMWORD[rsi]
869	movdqu	XMMWORD[80+rdi],xmm11
870	movdqu	xmm11,XMMWORD[16+rsi]
871	movdqu	XMMWORD[96+rdi],xmm2
872	movdqu	xmm2,XMMWORD[32+rsi]
873	movdqu	XMMWORD[112+rdi],xmm7
874	lea	rdi,[128+rdi]
875	movdqu	xmm7,XMMWORD[48+rsi]
876	pxor	xmm6,XMMWORD[32+rsp]
877	pxor	xmm11,xmm10
878	pxor	xmm2,xmm14
879	pxor	xmm7,xmm8
880
881	movdqu	XMMWORD[rdi],xmm6
882	movdqu	xmm6,XMMWORD[64+rsi]
883	movdqu	XMMWORD[16+rdi],xmm11
884	movdqu	xmm11,XMMWORD[80+rsi]
885	movdqu	XMMWORD[32+rdi],xmm2
886	movdqu	xmm2,XMMWORD[96+rsi]
887	movdqu	XMMWORD[48+rdi],xmm7
888	movdqu	xmm7,XMMWORD[112+rsi]
889	lea	rsi,[128+rsi]
890	pxor	xmm6,XMMWORD[48+rsp]
891	pxor	xmm11,xmm15
892	pxor	xmm2,xmm9
893	pxor	xmm7,xmm3
894	movdqu	XMMWORD[64+rdi],xmm6
895	movdqu	XMMWORD[80+rdi],xmm11
896	movdqu	XMMWORD[96+rdi],xmm2
897	movdqu	XMMWORD[112+rdi],xmm7
898	lea	rdi,[128+rdi]
899
900	sub	rdx,64*4
901	jnz	NEAR $L$oop_outer4x
902
903	jmp	NEAR $L$done4x
904
; Tail (< 256 bytes): process as many whole 64-byte blocks as remain, then
; stage the next unused keystream block at [0..63]+rsp for the byte loop.
; The flags from the final cmp in each chain survive the SSE loads/XORs/
; stores, so the later "je" tests for an exact multiple of 64.
905$L$tail4x:
906	cmp	rdx,192
907	jae	NEAR $L$192_or_more4x
908	cmp	rdx,128
909	jae	NEAR $L$128_or_more4x
910	cmp	rdx,64
911	jae	NEAR $L$64_or_more4x
912
913
; Fewer than 64 bytes: block 0 is the staging block ([rsp] already holds
; its first 16 bytes). r10 is reused as the byte index.
914	xor	r10,r10
915
916	movdqa	XMMWORD[16+rsp],xmm12
917	movdqa	XMMWORD[32+rsp],xmm4
918	movdqa	XMMWORD[48+rsp],xmm0
919	jmp	NEAR $L$oop_tail4x
920
921ALIGN	32
922$L$64_or_more4x:
923	movdqu	xmm6,XMMWORD[rsi]
924	movdqu	xmm11,XMMWORD[16+rsi]
925	movdqu	xmm2,XMMWORD[32+rsi]
926	movdqu	xmm7,XMMWORD[48+rsi]
927	pxor	xmm6,XMMWORD[rsp]
928	pxor	xmm11,xmm12
929	pxor	xmm2,xmm4
930	pxor	xmm7,xmm0
931	movdqu	XMMWORD[rdi],xmm6
932	movdqu	XMMWORD[16+rdi],xmm11
933	movdqu	XMMWORD[32+rdi],xmm2
934	movdqu	XMMWORD[48+rdi],xmm7
935	je	NEAR $L$done4x	; exactly 64 bytes remained
936
; Stage block 1 for the byte loop.
937	movdqa	xmm6,XMMWORD[16+rsp]
938	lea	rsi,[64+rsi]
939	xor	r10,r10
940	movdqa	XMMWORD[rsp],xmm6
941	movdqa	XMMWORD[16+rsp],xmm13
942	lea	rdi,[64+rdi]
943	movdqa	XMMWORD[32+rsp],xmm5
944	sub	rdx,64
945	movdqa	XMMWORD[48+rsp],xmm1
946	jmp	NEAR $L$oop_tail4x
947
948ALIGN	32
949$L$128_or_more4x:
950	movdqu	xmm6,XMMWORD[rsi]
951	movdqu	xmm11,XMMWORD[16+rsi]
952	movdqu	xmm2,XMMWORD[32+rsi]
953	movdqu	xmm7,XMMWORD[48+rsi]
954	pxor	xmm6,XMMWORD[rsp]
955	pxor	xmm11,xmm12
956	pxor	xmm2,xmm4
957	pxor	xmm7,xmm0
958
959	movdqu	XMMWORD[rdi],xmm6
960	movdqu	xmm6,XMMWORD[64+rsi]
961	movdqu	XMMWORD[16+rdi],xmm11
962	movdqu	xmm11,XMMWORD[80+rsi]
963	movdqu	XMMWORD[32+rdi],xmm2
964	movdqu	xmm2,XMMWORD[96+rsi]
965	movdqu	XMMWORD[48+rdi],xmm7
966	movdqu	xmm7,XMMWORD[112+rsi]
967	pxor	xmm6,XMMWORD[16+rsp]
968	pxor	xmm11,xmm13
969	pxor	xmm2,xmm5
970	pxor	xmm7,xmm1
971	movdqu	XMMWORD[64+rdi],xmm6
972	movdqu	XMMWORD[80+rdi],xmm11
973	movdqu	XMMWORD[96+rdi],xmm2
974	movdqu	XMMWORD[112+rdi],xmm7
975	je	NEAR $L$done4x	; exactly 128 bytes remained
976
; Stage block 2 for the byte loop.
977	movdqa	xmm6,XMMWORD[32+rsp]
978	lea	rsi,[128+rsi]
979	xor	r10,r10
980	movdqa	XMMWORD[rsp],xmm6
981	movdqa	XMMWORD[16+rsp],xmm10
982	lea	rdi,[128+rdi]
983	movdqa	XMMWORD[32+rsp],xmm14
984	sub	rdx,128
985	movdqa	XMMWORD[48+rsp],xmm8
986	jmp	NEAR $L$oop_tail4x
987
988ALIGN	32
989$L$192_or_more4x:
990	movdqu	xmm6,XMMWORD[rsi]
991	movdqu	xmm11,XMMWORD[16+rsi]
992	movdqu	xmm2,XMMWORD[32+rsi]
993	movdqu	xmm7,XMMWORD[48+rsi]
994	pxor	xmm6,XMMWORD[rsp]
995	pxor	xmm11,xmm12
996	pxor	xmm2,xmm4
997	pxor	xmm7,xmm0
998
999	movdqu	XMMWORD[rdi],xmm6
1000	movdqu	xmm6,XMMWORD[64+rsi]
1001	movdqu	XMMWORD[16+rdi],xmm11
1002	movdqu	xmm11,XMMWORD[80+rsi]
1003	movdqu	XMMWORD[32+rdi],xmm2
1004	movdqu	xmm2,XMMWORD[96+rsi]
1005	movdqu	XMMWORD[48+rdi],xmm7
1006	movdqu	xmm7,XMMWORD[112+rsi]
1007	lea	rsi,[128+rsi]
1008	pxor	xmm6,XMMWORD[16+rsp]
1009	pxor	xmm11,xmm13
1010	pxor	xmm2,xmm5
1011	pxor	xmm7,xmm1
1012
1013	movdqu	XMMWORD[64+rdi],xmm6
1014	movdqu	xmm6,XMMWORD[rsi]
1015	movdqu	XMMWORD[80+rdi],xmm11
1016	movdqu	xmm11,XMMWORD[16+rsi]
1017	movdqu	XMMWORD[96+rdi],xmm2
1018	movdqu	xmm2,XMMWORD[32+rsi]
1019	movdqu	XMMWORD[112+rdi],xmm7
1020	lea	rdi,[128+rdi]
1021	movdqu	xmm7,XMMWORD[48+rsi]
1022	pxor	xmm6,XMMWORD[32+rsp]
1023	pxor	xmm11,xmm10
1024	pxor	xmm2,xmm14
1025	pxor	xmm7,xmm8
1026	movdqu	XMMWORD[rdi],xmm6
1027	movdqu	XMMWORD[16+rdi],xmm11
1028	movdqu	XMMWORD[32+rdi],xmm2
1029	movdqu	XMMWORD[48+rdi],xmm7
1030	je	NEAR $L$done4x	; exactly 192 bytes remained
1031
; Stage block 3 for the byte loop; rsi/rdi were already advanced by 128,
; so only 64 more is added here.
1032	movdqa	xmm6,XMMWORD[48+rsp]
1033	lea	rsi,[64+rsi]
1034	xor	r10,r10
1035	movdqa	XMMWORD[rsp],xmm6
1036	movdqa	XMMWORD[16+rsp],xmm15
1037	lea	rdi,[64+rdi]
1038	movdqa	XMMWORD[32+rsp],xmm9
1039	sub	rdx,192
1040	movdqa	XMMWORD[48+rsp],xmm3
1041
; Byte loop: XOR the last rdx (< 64) bytes with the staged keystream.
1042$L$oop_tail4x:
1043	movzx	eax,BYTE[r10*1+rsi]	; source byte
1044	movzx	ecx,BYTE[r10*1+rsp]	; keystream byte
1045	lea	r10,[1+r10]
1046	xor	eax,ecx
1047	mov	BYTE[((-1))+r10*1+rdi],al
1048	dec	rdx
1049	jnz	NEAR $L$oop_tail4x
1050
; Restore xmm6..xmm15 and the entry stack pointer.
1051$L$done4x:
1052	movaps	xmm6,XMMWORD[((-168))+r9]
1053	movaps	xmm7,XMMWORD[((-152))+r9]
1054	movaps	xmm8,XMMWORD[((-136))+r9]
1055	movaps	xmm9,XMMWORD[((-120))+r9]
1056	movaps	xmm10,XMMWORD[((-104))+r9]
1057	movaps	xmm11,XMMWORD[((-88))+r9]
1058	movaps	xmm12,XMMWORD[((-72))+r9]
1059	movaps	xmm13,XMMWORD[((-56))+r9]
1060	movaps	xmm14,XMMWORD[((-40))+r9]
1061	movaps	xmm15,XMMWORD[((-24))+r9]
1062	lea	rsp,[r9]
1063
1064$L$4x_epilogue:
1065	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1066	mov	rsi,QWORD[16+rsp]
1067	ret
1068
1069$L$SEH_end_ChaCha20_ctr32_ssse3_4x:
1070global	ChaCha20_ctr32_avx2
1071
1072ALIGN	32
1073ChaCha20_ctr32_avx2:
1074	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1075	mov	QWORD[16+rsp],rsi
1076	mov	rax,rsp
1077$L$SEH_begin_ChaCha20_ctr32_avx2:
1078	mov	rdi,rcx
1079	mov	rsi,rdx
1080	mov	rdx,r8
1081	mov	rcx,r9
1082	mov	r8,QWORD[40+rsp]
1083
1084
1085
1086_CET_ENDBR
1087	mov	r9,rsp
1088
1089	sub	rsp,0x280+168
1090	and	rsp,-32
1091	movaps	XMMWORD[(-168)+r9],xmm6
1092	movaps	XMMWORD[(-152)+r9],xmm7
1093	movaps	XMMWORD[(-136)+r9],xmm8
1094	movaps	XMMWORD[(-120)+r9],xmm9
1095	movaps	XMMWORD[(-104)+r9],xmm10
1096	movaps	XMMWORD[(-88)+r9],xmm11
1097	movaps	XMMWORD[(-72)+r9],xmm12
1098	movaps	XMMWORD[(-56)+r9],xmm13
1099	movaps	XMMWORD[(-40)+r9],xmm14
1100	movaps	XMMWORD[(-24)+r9],xmm15
1101$L$8x_body:
1102	vzeroupper
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113	vbroadcasti128	ymm11,XMMWORD[$L$sigma]
1114	vbroadcasti128	ymm3,XMMWORD[rcx]
1115	vbroadcasti128	ymm15,XMMWORD[16+rcx]
1116	vbroadcasti128	ymm7,XMMWORD[r8]
1117	lea	rcx,[256+rsp]
1118	lea	rax,[512+rsp]
1119	lea	r10,[$L$rot16]
1120	lea	r11,[$L$rot24]
1121
1122	vpshufd	ymm8,ymm11,0x00
1123	vpshufd	ymm9,ymm11,0x55
1124	vmovdqa	YMMWORD[(128-256)+rcx],ymm8
1125	vpshufd	ymm10,ymm11,0xaa
1126	vmovdqa	YMMWORD[(160-256)+rcx],ymm9
1127	vpshufd	ymm11,ymm11,0xff
1128	vmovdqa	YMMWORD[(192-256)+rcx],ymm10
1129	vmovdqa	YMMWORD[(224-256)+rcx],ymm11
1130
1131	vpshufd	ymm0,ymm3,0x00
1132	vpshufd	ymm1,ymm3,0x55
1133	vmovdqa	YMMWORD[(256-256)+rcx],ymm0
1134	vpshufd	ymm2,ymm3,0xaa
1135	vmovdqa	YMMWORD[(288-256)+rcx],ymm1
1136	vpshufd	ymm3,ymm3,0xff
1137	vmovdqa	YMMWORD[(320-256)+rcx],ymm2
1138	vmovdqa	YMMWORD[(352-256)+rcx],ymm3
1139
1140	vpshufd	ymm12,ymm15,0x00
1141	vpshufd	ymm13,ymm15,0x55
1142	vmovdqa	YMMWORD[(384-512)+rax],ymm12
1143	vpshufd	ymm14,ymm15,0xaa
1144	vmovdqa	YMMWORD[(416-512)+rax],ymm13
1145	vpshufd	ymm15,ymm15,0xff
1146	vmovdqa	YMMWORD[(448-512)+rax],ymm14
1147	vmovdqa	YMMWORD[(480-512)+rax],ymm15
1148
1149	vpshufd	ymm4,ymm7,0x00
1150	vpshufd	ymm5,ymm7,0x55
1151	vpaddd	ymm4,ymm4,YMMWORD[$L$incy]
1152	vpshufd	ymm6,ymm7,0xaa
1153	vmovdqa	YMMWORD[(544-512)+rax],ymm5
1154	vpshufd	ymm7,ymm7,0xff
1155	vmovdqa	YMMWORD[(576-512)+rax],ymm6
1156	vmovdqa	YMMWORD[(608-512)+rax],ymm7
1157
1158	jmp	NEAR $L$oop_enter8x
1159
1160ALIGN	32
1161$L$oop_outer8x:
1162	vmovdqa	ymm8,YMMWORD[((128-256))+rcx]
1163	vmovdqa	ymm9,YMMWORD[((160-256))+rcx]
1164	vmovdqa	ymm10,YMMWORD[((192-256))+rcx]
1165	vmovdqa	ymm11,YMMWORD[((224-256))+rcx]
1166	vmovdqa	ymm0,YMMWORD[((256-256))+rcx]
1167	vmovdqa	ymm1,YMMWORD[((288-256))+rcx]
1168	vmovdqa	ymm2,YMMWORD[((320-256))+rcx]
1169	vmovdqa	ymm3,YMMWORD[((352-256))+rcx]
1170	vmovdqa	ymm12,YMMWORD[((384-512))+rax]
1171	vmovdqa	ymm13,YMMWORD[((416-512))+rax]
1172	vmovdqa	ymm14,YMMWORD[((448-512))+rax]
1173	vmovdqa	ymm15,YMMWORD[((480-512))+rax]
1174	vmovdqa	ymm4,YMMWORD[((512-512))+rax]
1175	vmovdqa	ymm5,YMMWORD[((544-512))+rax]
1176	vmovdqa	ymm6,YMMWORD[((576-512))+rax]
1177	vmovdqa	ymm7,YMMWORD[((608-512))+rax]
1178	vpaddd	ymm4,ymm4,YMMWORD[$L$eight]
1179
1180$L$oop_enter8x:
1181	vmovdqa	YMMWORD[64+rsp],ymm14
1182	vmovdqa	YMMWORD[96+rsp],ymm15
1183	vbroadcasti128	ymm15,XMMWORD[r10]
1184	vmovdqa	YMMWORD[(512-512)+rax],ymm4
1185	mov	eax,10
1186	jmp	NEAR $L$oop8x
1187
1188ALIGN	32
1189$L$oop8x:
1190	vpaddd	ymm8,ymm8,ymm0
1191	vpxor	ymm4,ymm8,ymm4
1192	vpshufb	ymm4,ymm4,ymm15
1193	vpaddd	ymm9,ymm9,ymm1
1194	vpxor	ymm5,ymm9,ymm5
1195	vpshufb	ymm5,ymm5,ymm15
1196	vpaddd	ymm12,ymm12,ymm4
1197	vpxor	ymm0,ymm12,ymm0
1198	vpslld	ymm14,ymm0,12
1199	vpsrld	ymm0,ymm0,20
1200	vpor	ymm0,ymm14,ymm0
1201	vbroadcasti128	ymm14,XMMWORD[r11]
1202	vpaddd	ymm13,ymm13,ymm5
1203	vpxor	ymm1,ymm13,ymm1
1204	vpslld	ymm15,ymm1,12
1205	vpsrld	ymm1,ymm1,20
1206	vpor	ymm1,ymm15,ymm1
1207	vpaddd	ymm8,ymm8,ymm0
1208	vpxor	ymm4,ymm8,ymm4
1209	vpshufb	ymm4,ymm4,ymm14
1210	vpaddd	ymm9,ymm9,ymm1
1211	vpxor	ymm5,ymm9,ymm5
1212	vpshufb	ymm5,ymm5,ymm14
1213	vpaddd	ymm12,ymm12,ymm4
1214	vpxor	ymm0,ymm12,ymm0
1215	vpslld	ymm15,ymm0,7
1216	vpsrld	ymm0,ymm0,25
1217	vpor	ymm0,ymm15,ymm0
1218	vbroadcasti128	ymm15,XMMWORD[r10]
1219	vpaddd	ymm13,ymm13,ymm5
1220	vpxor	ymm1,ymm13,ymm1
1221	vpslld	ymm14,ymm1,7
1222	vpsrld	ymm1,ymm1,25
1223	vpor	ymm1,ymm14,ymm1
1224	vmovdqa	YMMWORD[rsp],ymm12
1225	vmovdqa	YMMWORD[32+rsp],ymm13
1226	vmovdqa	ymm12,YMMWORD[64+rsp]
1227	vmovdqa	ymm13,YMMWORD[96+rsp]
1228	vpaddd	ymm10,ymm10,ymm2
1229	vpxor	ymm6,ymm10,ymm6
1230	vpshufb	ymm6,ymm6,ymm15
1231	vpaddd	ymm11,ymm11,ymm3
1232	vpxor	ymm7,ymm11,ymm7
1233	vpshufb	ymm7,ymm7,ymm15
1234	vpaddd	ymm12,ymm12,ymm6
1235	vpxor	ymm2,ymm12,ymm2
1236	vpslld	ymm14,ymm2,12
1237	vpsrld	ymm2,ymm2,20
1238	vpor	ymm2,ymm14,ymm2
1239	vbroadcasti128	ymm14,XMMWORD[r11]
1240	vpaddd	ymm13,ymm13,ymm7
1241	vpxor	ymm3,ymm13,ymm3
1242	vpslld	ymm15,ymm3,12
1243	vpsrld	ymm3,ymm3,20
1244	vpor	ymm3,ymm15,ymm3
1245	vpaddd	ymm10,ymm10,ymm2
1246	vpxor	ymm6,ymm10,ymm6
1247	vpshufb	ymm6,ymm6,ymm14
1248	vpaddd	ymm11,ymm11,ymm3
1249	vpxor	ymm7,ymm11,ymm7
1250	vpshufb	ymm7,ymm7,ymm14
1251	vpaddd	ymm12,ymm12,ymm6
1252	vpxor	ymm2,ymm12,ymm2
1253	vpslld	ymm15,ymm2,7
1254	vpsrld	ymm2,ymm2,25
1255	vpor	ymm2,ymm15,ymm2
1256	vbroadcasti128	ymm15,XMMWORD[r10]
1257	vpaddd	ymm13,ymm13,ymm7
1258	vpxor	ymm3,ymm13,ymm3
1259	vpslld	ymm14,ymm3,7
1260	vpsrld	ymm3,ymm3,25
1261	vpor	ymm3,ymm14,ymm3
1262	vpaddd	ymm8,ymm8,ymm1
1263	vpxor	ymm7,ymm8,ymm7
1264	vpshufb	ymm7,ymm7,ymm15
1265	vpaddd	ymm9,ymm9,ymm2
1266	vpxor	ymm4,ymm9,ymm4
1267	vpshufb	ymm4,ymm4,ymm15
1268	vpaddd	ymm12,ymm12,ymm7
1269	vpxor	ymm1,ymm12,ymm1
1270	vpslld	ymm14,ymm1,12
1271	vpsrld	ymm1,ymm1,20
1272	vpor	ymm1,ymm14,ymm1
1273	vbroadcasti128	ymm14,XMMWORD[r11]
1274	vpaddd	ymm13,ymm13,ymm4
1275	vpxor	ymm2,ymm13,ymm2
1276	vpslld	ymm15,ymm2,12
1277	vpsrld	ymm2,ymm2,20
1278	vpor	ymm2,ymm15,ymm2
1279	vpaddd	ymm8,ymm8,ymm1
1280	vpxor	ymm7,ymm8,ymm7
1281	vpshufb	ymm7,ymm7,ymm14
1282	vpaddd	ymm9,ymm9,ymm2
1283	vpxor	ymm4,ymm9,ymm4
1284	vpshufb	ymm4,ymm4,ymm14
1285	vpaddd	ymm12,ymm12,ymm7
1286	vpxor	ymm1,ymm12,ymm1
1287	vpslld	ymm15,ymm1,7
1288	vpsrld	ymm1,ymm1,25
1289	vpor	ymm1,ymm15,ymm1
1290	vbroadcasti128	ymm15,XMMWORD[r10]
1291	vpaddd	ymm13,ymm13,ymm4
1292	vpxor	ymm2,ymm13,ymm2
1293	vpslld	ymm14,ymm2,7
1294	vpsrld	ymm2,ymm2,25
1295	vpor	ymm2,ymm14,ymm2
1296	vmovdqa	YMMWORD[64+rsp],ymm12
1297	vmovdqa	YMMWORD[96+rsp],ymm13
1298	vmovdqa	ymm12,YMMWORD[rsp]
1299	vmovdqa	ymm13,YMMWORD[32+rsp]
1300	vpaddd	ymm10,ymm10,ymm3
1301	vpxor	ymm5,ymm10,ymm5
1302	vpshufb	ymm5,ymm5,ymm15
1303	vpaddd	ymm11,ymm11,ymm0
1304	vpxor	ymm6,ymm11,ymm6
1305	vpshufb	ymm6,ymm6,ymm15
1306	vpaddd	ymm12,ymm12,ymm5
1307	vpxor	ymm3,ymm12,ymm3
1308	vpslld	ymm14,ymm3,12
1309	vpsrld	ymm3,ymm3,20
1310	vpor	ymm3,ymm14,ymm3
1311	vbroadcasti128	ymm14,XMMWORD[r11]
1312	vpaddd	ymm13,ymm13,ymm6
1313	vpxor	ymm0,ymm13,ymm0
1314	vpslld	ymm15,ymm0,12
1315	vpsrld	ymm0,ymm0,20
1316	vpor	ymm0,ymm15,ymm0
1317	vpaddd	ymm10,ymm10,ymm3
1318	vpxor	ymm5,ymm10,ymm5
1319	vpshufb	ymm5,ymm5,ymm14
1320	vpaddd	ymm11,ymm11,ymm0
1321	vpxor	ymm6,ymm11,ymm6
1322	vpshufb	ymm6,ymm6,ymm14
1323	vpaddd	ymm12,ymm12,ymm5
1324	vpxor	ymm3,ymm12,ymm3
1325	vpslld	ymm15,ymm3,7
1326	vpsrld	ymm3,ymm3,25
1327	vpor	ymm3,ymm15,ymm3
1328	vbroadcasti128	ymm15,XMMWORD[r10]
1329	vpaddd	ymm13,ymm13,ymm6
1330	vpxor	ymm0,ymm13,ymm0
1331	vpslld	ymm14,ymm0,7
1332	vpsrld	ymm0,ymm0,25
1333	vpor	ymm0,ymm14,ymm0
1334	dec	eax
1335	jnz	NEAR $L$oop8x
1336
1337	lea	rax,[512+rsp]
1338	vpaddd	ymm8,ymm8,YMMWORD[((128-256))+rcx]
1339	vpaddd	ymm9,ymm9,YMMWORD[((160-256))+rcx]
1340	vpaddd	ymm10,ymm10,YMMWORD[((192-256))+rcx]
1341	vpaddd	ymm11,ymm11,YMMWORD[((224-256))+rcx]
1342
1343	vpunpckldq	ymm14,ymm8,ymm9
1344	vpunpckldq	ymm15,ymm10,ymm11
1345	vpunpckhdq	ymm8,ymm8,ymm9
1346	vpunpckhdq	ymm10,ymm10,ymm11
1347	vpunpcklqdq	ymm9,ymm14,ymm15
1348	vpunpckhqdq	ymm14,ymm14,ymm15
1349	vpunpcklqdq	ymm11,ymm8,ymm10
1350	vpunpckhqdq	ymm8,ymm8,ymm10
1351	vpaddd	ymm0,ymm0,YMMWORD[((256-256))+rcx]
1352	vpaddd	ymm1,ymm1,YMMWORD[((288-256))+rcx]
1353	vpaddd	ymm2,ymm2,YMMWORD[((320-256))+rcx]
1354	vpaddd	ymm3,ymm3,YMMWORD[((352-256))+rcx]
1355
1356	vpunpckldq	ymm10,ymm0,ymm1
1357	vpunpckldq	ymm15,ymm2,ymm3
1358	vpunpckhdq	ymm0,ymm0,ymm1
1359	vpunpckhdq	ymm2,ymm2,ymm3
1360	vpunpcklqdq	ymm1,ymm10,ymm15
1361	vpunpckhqdq	ymm10,ymm10,ymm15
1362	vpunpcklqdq	ymm3,ymm0,ymm2
1363	vpunpckhqdq	ymm0,ymm0,ymm2
1364	vperm2i128	ymm15,ymm9,ymm1,0x20
1365	vperm2i128	ymm1,ymm9,ymm1,0x31
1366	vperm2i128	ymm9,ymm14,ymm10,0x20
1367	vperm2i128	ymm10,ymm14,ymm10,0x31
1368	vperm2i128	ymm14,ymm11,ymm3,0x20
1369	vperm2i128	ymm3,ymm11,ymm3,0x31
1370	vperm2i128	ymm11,ymm8,ymm0,0x20
1371	vperm2i128	ymm0,ymm8,ymm0,0x31
1372	vmovdqa	YMMWORD[rsp],ymm15
1373	vmovdqa	YMMWORD[32+rsp],ymm9
1374	vmovdqa	ymm15,YMMWORD[64+rsp]
1375	vmovdqa	ymm9,YMMWORD[96+rsp]
1376
1377	vpaddd	ymm12,ymm12,YMMWORD[((384-512))+rax]
1378	vpaddd	ymm13,ymm13,YMMWORD[((416-512))+rax]
1379	vpaddd	ymm15,ymm15,YMMWORD[((448-512))+rax]
1380	vpaddd	ymm9,ymm9,YMMWORD[((480-512))+rax]
1381
1382	vpunpckldq	ymm2,ymm12,ymm13
1383	vpunpckldq	ymm8,ymm15,ymm9
1384	vpunpckhdq	ymm12,ymm12,ymm13
1385	vpunpckhdq	ymm15,ymm15,ymm9
1386	vpunpcklqdq	ymm13,ymm2,ymm8
1387	vpunpckhqdq	ymm2,ymm2,ymm8
1388	vpunpcklqdq	ymm9,ymm12,ymm15
1389	vpunpckhqdq	ymm12,ymm12,ymm15
1390	vpaddd	ymm4,ymm4,YMMWORD[((512-512))+rax]
1391	vpaddd	ymm5,ymm5,YMMWORD[((544-512))+rax]
1392	vpaddd	ymm6,ymm6,YMMWORD[((576-512))+rax]
1393	vpaddd	ymm7,ymm7,YMMWORD[((608-512))+rax]
1394
1395	vpunpckldq	ymm15,ymm4,ymm5
1396	vpunpckldq	ymm8,ymm6,ymm7
1397	vpunpckhdq	ymm4,ymm4,ymm5
1398	vpunpckhdq	ymm6,ymm6,ymm7
1399	vpunpcklqdq	ymm5,ymm15,ymm8
1400	vpunpckhqdq	ymm15,ymm15,ymm8
1401	vpunpcklqdq	ymm7,ymm4,ymm6
1402	vpunpckhqdq	ymm4,ymm4,ymm6
1403	vperm2i128	ymm8,ymm13,ymm5,0x20
1404	vperm2i128	ymm5,ymm13,ymm5,0x31
1405	vperm2i128	ymm13,ymm2,ymm15,0x20
1406	vperm2i128	ymm15,ymm2,ymm15,0x31
1407	vperm2i128	ymm2,ymm9,ymm7,0x20
1408	vperm2i128	ymm7,ymm9,ymm7,0x31
1409	vperm2i128	ymm9,ymm12,ymm4,0x20
1410	vperm2i128	ymm4,ymm12,ymm4,0x31
1411	vmovdqa	ymm6,YMMWORD[rsp]
1412	vmovdqa	ymm12,YMMWORD[32+rsp]
1413
1414	cmp	rdx,64*8
1415	jb	NEAR $L$tail8x
1416
1417	vpxor	ymm6,ymm6,YMMWORD[rsi]
1418	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1419	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1420	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1421	lea	rsi,[128+rsi]
1422	vmovdqu	YMMWORD[rdi],ymm6
1423	vmovdqu	YMMWORD[32+rdi],ymm8
1424	vmovdqu	YMMWORD[64+rdi],ymm1
1425	vmovdqu	YMMWORD[96+rdi],ymm5
1426	lea	rdi,[128+rdi]
1427
1428	vpxor	ymm12,ymm12,YMMWORD[rsi]
1429	vpxor	ymm13,ymm13,YMMWORD[32+rsi]
1430	vpxor	ymm10,ymm10,YMMWORD[64+rsi]
1431	vpxor	ymm15,ymm15,YMMWORD[96+rsi]
1432	lea	rsi,[128+rsi]
1433	vmovdqu	YMMWORD[rdi],ymm12
1434	vmovdqu	YMMWORD[32+rdi],ymm13
1435	vmovdqu	YMMWORD[64+rdi],ymm10
1436	vmovdqu	YMMWORD[96+rdi],ymm15
1437	lea	rdi,[128+rdi]
1438
1439	vpxor	ymm14,ymm14,YMMWORD[rsi]
1440	vpxor	ymm2,ymm2,YMMWORD[32+rsi]
1441	vpxor	ymm3,ymm3,YMMWORD[64+rsi]
1442	vpxor	ymm7,ymm7,YMMWORD[96+rsi]
1443	lea	rsi,[128+rsi]
1444	vmovdqu	YMMWORD[rdi],ymm14
1445	vmovdqu	YMMWORD[32+rdi],ymm2
1446	vmovdqu	YMMWORD[64+rdi],ymm3
1447	vmovdqu	YMMWORD[96+rdi],ymm7
1448	lea	rdi,[128+rdi]
1449
1450	vpxor	ymm11,ymm11,YMMWORD[rsi]
1451	vpxor	ymm9,ymm9,YMMWORD[32+rsi]
1452	vpxor	ymm0,ymm0,YMMWORD[64+rsi]
1453	vpxor	ymm4,ymm4,YMMWORD[96+rsi]
1454	lea	rsi,[128+rsi]
1455	vmovdqu	YMMWORD[rdi],ymm11
1456	vmovdqu	YMMWORD[32+rdi],ymm9
1457	vmovdqu	YMMWORD[64+rdi],ymm0
1458	vmovdqu	YMMWORD[96+rdi],ymm4
1459	lea	rdi,[128+rdi]
1460
1461	sub	rdx,64*8
1462	jnz	NEAR $L$oop_outer8x
1463
1464	jmp	NEAR $L$done8x
1465
1466$L$tail8x:
1467	cmp	rdx,448
1468	jae	NEAR $L$448_or_more8x
1469	cmp	rdx,384
1470	jae	NEAR $L$384_or_more8x
1471	cmp	rdx,320
1472	jae	NEAR $L$320_or_more8x
1473	cmp	rdx,256
1474	jae	NEAR $L$256_or_more8x
1475	cmp	rdx,192
1476	jae	NEAR $L$192_or_more8x
1477	cmp	rdx,128
1478	jae	NEAR $L$128_or_more8x
1479	cmp	rdx,64
1480	jae	NEAR $L$64_or_more8x
1481
1482	xor	r10,r10
1483	vmovdqa	YMMWORD[rsp],ymm6
1484	vmovdqa	YMMWORD[32+rsp],ymm8
1485	jmp	NEAR $L$oop_tail8x
1486
1487ALIGN	32
1488$L$64_or_more8x:
1489	vpxor	ymm6,ymm6,YMMWORD[rsi]
1490	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1491	vmovdqu	YMMWORD[rdi],ymm6
1492	vmovdqu	YMMWORD[32+rdi],ymm8
1493	je	NEAR $L$done8x
1494
1495	lea	rsi,[64+rsi]
1496	xor	r10,r10
1497	vmovdqa	YMMWORD[rsp],ymm1
1498	lea	rdi,[64+rdi]
1499	sub	rdx,64
1500	vmovdqa	YMMWORD[32+rsp],ymm5
1501	jmp	NEAR $L$oop_tail8x
1502
1503ALIGN	32
1504$L$128_or_more8x:
1505	vpxor	ymm6,ymm6,YMMWORD[rsi]
1506	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1507	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1508	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1509	vmovdqu	YMMWORD[rdi],ymm6
1510	vmovdqu	YMMWORD[32+rdi],ymm8
1511	vmovdqu	YMMWORD[64+rdi],ymm1
1512	vmovdqu	YMMWORD[96+rdi],ymm5
1513	je	NEAR $L$done8x
1514
1515	lea	rsi,[128+rsi]
1516	xor	r10,r10
1517	vmovdqa	YMMWORD[rsp],ymm12
1518	lea	rdi,[128+rdi]
1519	sub	rdx,128
1520	vmovdqa	YMMWORD[32+rsp],ymm13
1521	jmp	NEAR $L$oop_tail8x
1522
1523ALIGN	32
1524$L$192_or_more8x:
1525	vpxor	ymm6,ymm6,YMMWORD[rsi]
1526	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1527	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1528	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1529	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1530	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1531	vmovdqu	YMMWORD[rdi],ymm6
1532	vmovdqu	YMMWORD[32+rdi],ymm8
1533	vmovdqu	YMMWORD[64+rdi],ymm1
1534	vmovdqu	YMMWORD[96+rdi],ymm5
1535	vmovdqu	YMMWORD[128+rdi],ymm12
1536	vmovdqu	YMMWORD[160+rdi],ymm13
1537	je	NEAR $L$done8x
1538
1539	lea	rsi,[192+rsi]
1540	xor	r10,r10
1541	vmovdqa	YMMWORD[rsp],ymm10
1542	lea	rdi,[192+rdi]
1543	sub	rdx,192
1544	vmovdqa	YMMWORD[32+rsp],ymm15
1545	jmp	NEAR $L$oop_tail8x
1546
1547ALIGN	32
1548$L$256_or_more8x:
1549	vpxor	ymm6,ymm6,YMMWORD[rsi]
1550	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1551	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1552	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1553	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1554	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1555	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1556	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1557	vmovdqu	YMMWORD[rdi],ymm6
1558	vmovdqu	YMMWORD[32+rdi],ymm8
1559	vmovdqu	YMMWORD[64+rdi],ymm1
1560	vmovdqu	YMMWORD[96+rdi],ymm5
1561	vmovdqu	YMMWORD[128+rdi],ymm12
1562	vmovdqu	YMMWORD[160+rdi],ymm13
1563	vmovdqu	YMMWORD[192+rdi],ymm10
1564	vmovdqu	YMMWORD[224+rdi],ymm15
1565	je	NEAR $L$done8x
1566
1567	lea	rsi,[256+rsi]
1568	xor	r10,r10
1569	vmovdqa	YMMWORD[rsp],ymm14
1570	lea	rdi,[256+rdi]
1571	sub	rdx,256
1572	vmovdqa	YMMWORD[32+rsp],ymm2
1573	jmp	NEAR $L$oop_tail8x
1574
1575ALIGN	32
1576$L$320_or_more8x:
1577	vpxor	ymm6,ymm6,YMMWORD[rsi]
1578	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1579	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1580	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1581	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1582	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1583	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1584	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1585	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1586	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1587	vmovdqu	YMMWORD[rdi],ymm6
1588	vmovdqu	YMMWORD[32+rdi],ymm8
1589	vmovdqu	YMMWORD[64+rdi],ymm1
1590	vmovdqu	YMMWORD[96+rdi],ymm5
1591	vmovdqu	YMMWORD[128+rdi],ymm12
1592	vmovdqu	YMMWORD[160+rdi],ymm13
1593	vmovdqu	YMMWORD[192+rdi],ymm10
1594	vmovdqu	YMMWORD[224+rdi],ymm15
1595	vmovdqu	YMMWORD[256+rdi],ymm14
1596	vmovdqu	YMMWORD[288+rdi],ymm2
1597	je	NEAR $L$done8x
1598
1599	lea	rsi,[320+rsi]
1600	xor	r10,r10
1601	vmovdqa	YMMWORD[rsp],ymm3
1602	lea	rdi,[320+rdi]
1603	sub	rdx,320
1604	vmovdqa	YMMWORD[32+rsp],ymm7
1605	jmp	NEAR $L$oop_tail8x
1606
1607ALIGN	32
1608$L$384_or_more8x:
1609	vpxor	ymm6,ymm6,YMMWORD[rsi]
1610	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1611	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1612	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1613	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1614	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1615	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1616	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1617	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1618	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1619	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
1620	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
1621	vmovdqu	YMMWORD[rdi],ymm6
1622	vmovdqu	YMMWORD[32+rdi],ymm8
1623	vmovdqu	YMMWORD[64+rdi],ymm1
1624	vmovdqu	YMMWORD[96+rdi],ymm5
1625	vmovdqu	YMMWORD[128+rdi],ymm12
1626	vmovdqu	YMMWORD[160+rdi],ymm13
1627	vmovdqu	YMMWORD[192+rdi],ymm10
1628	vmovdqu	YMMWORD[224+rdi],ymm15
1629	vmovdqu	YMMWORD[256+rdi],ymm14
1630	vmovdqu	YMMWORD[288+rdi],ymm2
1631	vmovdqu	YMMWORD[320+rdi],ymm3
1632	vmovdqu	YMMWORD[352+rdi],ymm7
1633	je	NEAR $L$done8x
1634
1635	lea	rsi,[384+rsi]
1636	xor	r10,r10
1637	vmovdqa	YMMWORD[rsp],ymm11
1638	lea	rdi,[384+rdi]
1639	sub	rdx,384
1640	vmovdqa	YMMWORD[32+rsp],ymm9
1641	jmp	NEAR $L$oop_tail8x
1642
1643ALIGN	32
1644$L$448_or_more8x:
1645	vpxor	ymm6,ymm6,YMMWORD[rsi]
1646	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1647	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1648	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1649	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1650	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1651	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1652	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1653	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1654	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1655	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
1656	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
1657	vpxor	ymm11,ymm11,YMMWORD[384+rsi]
1658	vpxor	ymm9,ymm9,YMMWORD[416+rsi]
1659	vmovdqu	YMMWORD[rdi],ymm6
1660	vmovdqu	YMMWORD[32+rdi],ymm8
1661	vmovdqu	YMMWORD[64+rdi],ymm1
1662	vmovdqu	YMMWORD[96+rdi],ymm5
1663	vmovdqu	YMMWORD[128+rdi],ymm12
1664	vmovdqu	YMMWORD[160+rdi],ymm13
1665	vmovdqu	YMMWORD[192+rdi],ymm10
1666	vmovdqu	YMMWORD[224+rdi],ymm15
1667	vmovdqu	YMMWORD[256+rdi],ymm14
1668	vmovdqu	YMMWORD[288+rdi],ymm2
1669	vmovdqu	YMMWORD[320+rdi],ymm3
1670	vmovdqu	YMMWORD[352+rdi],ymm7
1671	vmovdqu	YMMWORD[384+rdi],ymm11
1672	vmovdqu	YMMWORD[416+rdi],ymm9
1673	je	NEAR $L$done8x
1674
1675	lea	rsi,[448+rsi]
1676	xor	r10,r10
1677	vmovdqa	YMMWORD[rsp],ymm0
1678	lea	rdi,[448+rdi]
1679	sub	rdx,448
1680	vmovdqa	YMMWORD[32+rsp],ymm4
1681
1682$L$oop_tail8x:
1683	movzx	eax,BYTE[r10*1+rsi]
1684	movzx	ecx,BYTE[r10*1+rsp]
1685	lea	r10,[1+r10]
1686	xor	eax,ecx
1687	mov	BYTE[((-1))+r10*1+rdi],al
1688	dec	rdx
1689	jnz	NEAR $L$oop_tail8x
1690
1691$L$done8x:
1692	vzeroall
1693	movaps	xmm6,XMMWORD[((-168))+r9]
1694	movaps	xmm7,XMMWORD[((-152))+r9]
1695	movaps	xmm8,XMMWORD[((-136))+r9]
1696	movaps	xmm9,XMMWORD[((-120))+r9]
1697	movaps	xmm10,XMMWORD[((-104))+r9]
1698	movaps	xmm11,XMMWORD[((-88))+r9]
1699	movaps	xmm12,XMMWORD[((-72))+r9]
1700	movaps	xmm13,XMMWORD[((-56))+r9]
1701	movaps	xmm14,XMMWORD[((-40))+r9]
1702	movaps	xmm15,XMMWORD[((-24))+r9]
1703	lea	rsp,[r9]
1704
1705$L$8x_epilogue:
1706	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1707	mov	rsi,QWORD[16+rsp]
1708	ret
1709
1710$L$SEH_end_ChaCha20_ctr32_avx2:
1711EXTERN	__imp_RtlVirtualUnwind
1712
1713ALIGN	16
; se_handler -- exception handler named by $L$SEH_info_ChaCha20_ctr32_nohw.
;
; Registers read on entry: r8 (base of a context record) and r9 (base of a
; dispatcher record). NOTE(review): the field names quoted in the comments
; below (Rax, Rip, Rsp, ImageBase, HandlerData, ...) are the Win64
; CONTEXT / DISPATCHER_CONTEXT members that normally sit at these offsets;
; this file only shows the raw numbers -- confirm against winnt.h.
;
; What the code does:
;   1. Picks the frame address of the interrupted routine into rax,
;      depending on where the faulting address (rbx) lies relative to
;      $L$ctr32_body and $L$no_data.
;   2. If the fault is inside the body, reloads rbx/rbp/r12-r15 from the
;      routine's stack frame and writes them back into the context record.
;   3. Falls into $L$common_seh_tail, which writes rsp/rsi/rdi back into the
;      context record, copies the context record, calls RtlVirtualUnwind and
;      returns eax = 1.
;
; $L$common_seh_tail is also the jump target of ssse3_handler and
; full_handler; they reach it with the same pushes and the same
; "sub rsp,64" already performed, and with rax holding the frame address.
1714se_handler:
	; Preserve every register this handler modifies, plus the flags
	; (the raw-encoded "cld" below changes the direction flag).
1715	push	rsi
1716	push	rdi
1717	push	rbx
1718	push	rbp
1719	push	r12
1720	push	r13
1721	push	r14
1722	push	r15
1723	pushfq
	; 64 bytes: 32 for the callee's shadow space + 4 stack-passed arguments
	; written at [32+rsp]..[56+rsp] before the call below.
1724	sub	rsp,64
1725
	; rax = context+120 (presumably context->Rax), rbx = context+248
	; (presumably context->Rip, the faulting address).
1726	mov	rax,QWORD[120+r8]
1727	mov	rbx,QWORD[248+r8]
1728
	; Loaded as in the other handlers (presumably disp->ImageBase and
	; disp->HandlerData); on this path both registers are overwritten
	; before being used.
1729	mov	rsi,QWORD[8+r9]
1730	mov	r11,QWORD[56+r9]
1731
	; Fault before $L$ctr32_body: nothing to restore from the frame.
	; rax keeps the value read from context+120; the nohw prologue executes
	; "mov rax,rsp" before any push, so that is the entry stack pointer.
1732	lea	r10,[$L$ctr32_body]
1733	cmp	rbx,r10
1734	jb	NEAR $L$common_seh_tail
1735
	; From here on use context+152 (presumably context->Rsp) as the frame.
1736	mov	rax,QWORD[152+r8]
1737
	; Fault at or beyond $L$no_data (label defined outside this section of
	; the file): skip the register reload and use rax as it is.
1738	lea	r10,[$L$no_data]
1739	cmp	rbx,r10
1740	jae	NEAR $L$common_seh_tail
1741
	; Step over the routine's frame: 64+24 bytes of locals ("sub rsp,64+24"
	; in the prologue) plus 48 bytes for its six pushes.
1742	lea	rax,[((64+24+48))+rax]
1743
	; Reload the six pushed registers; offsets mirror the prologue's push
	; order rbx, rbp, r12, r13, r14, r15.
1744	mov	rbx,QWORD[((-8))+rax]
1745	mov	rbp,QWORD[((-16))+rax]
1746	mov	r12,QWORD[((-24))+rax]
1747	mov	r13,QWORD[((-32))+rax]
1748	mov	r14,QWORD[((-40))+rax]
1749	mov	r15,QWORD[((-48))+rax]
	; Publish them in the context record (presumably Rbx, Rbp, R12..R15).
1750	mov	QWORD[144+r8],rbx
1751	mov	QWORD[160+r8],rbp
1752	mov	QWORD[216+r8],r12
1753	mov	QWORD[224+r8],r13
1754	mov	QWORD[232+r8],r14
1755	mov	QWORD[240+r8],r15
1756
	; Shared tail. Expects: rax = frame address whose [8+rax]/[16+rax]
	; slots hold the rdi/rsi values stored by the ";WIN64 prologue" moves,
	; r8 = context record, r9 = dispatcher record.
1757$L$common_seh_tail:
1758	mov	rdi,QWORD[8+rax]
1759	mov	rsi,QWORD[16+rax]
	; Write rsp, rsi and rdi back into the context record
	; (presumably context->Rsp, ->Rsi, ->Rdi).
1760	mov	QWORD[152+r8],rax
1761	mov	QWORD[168+r8],rsi
1762	mov	QWORD[176+r8],rdi
1763
	; Copy 154 qwords (1232 bytes) from the context record at r8 to the
	; buffer whose address is stored at [40+r9] (presumably
	; disp->ContextRecord; 1232 is presumably sizeof(CONTEXT)).
1764	mov	rdi,QWORD[40+r9]
1765	mov	rsi,r8
1766	mov	ecx,154
	; Bytes fc f3 48 a5 = "cld" followed by "rep movsq".
1767	DD	0xa548f3fc
1768
	; Set up the RtlVirtualUnwind call: four register arguments, then four
	; more in the stack slots above the 32-byte shadow space.
	; NOTE(review): the parameter meanings are those of the documented
	; RtlVirtualUnwind prototype -- confirm.
1769	mov	rsi,r9
	; First argument = 0.
1770	xor	rcx,rcx
1771	mov	rdx,QWORD[8+rsi]
1772	mov	r8,QWORD[rsi]
1773	mov	r9,QWORD[16+rsi]
1774	mov	r10,QWORD[40+rsi]
	; Arguments 6 and 7 are addresses of dispatcher-record fields, not
	; their contents.
1775	lea	r11,[56+rsi]
1776	lea	r12,[24+rsi]
1777	mov	QWORD[32+rsp],r10
1778	mov	QWORD[40+rsp],r11
1779	mov	QWORD[48+rsp],r12
	; Eighth argument = 0 (rcx is still zero).
1780	mov	QWORD[56+rsp],rcx
1781	call	QWORD[__imp_RtlVirtualUnwind]
1782
	; Return value 1 (presumably ExceptionContinueSearch -- confirm).
1783	mov	eax,1
	; Undo the frame and restore flags and registers in reverse push order.
1784	add	rsp,64
1785	popfq
1786	pop	r15
1787	pop	r14
1788	pop	r13
1789	pop	r12
1790	pop	rbp
1791	pop	rbx
1792	pop	rdi
1793	pop	rsi
1794	ret
1795
1796
1797
1798ALIGN	16
; ssse3_handler -- exception handler named by $L$SEH_info_ChaCha20_ctr32_ssse3.
;
; Registers read on entry: r8 (context record) and r9 (dispatcher record).
; NOTE(review): field names in the comments below are the Win64
; CONTEXT / DISPATCHER_CONTEXT members usually found at these offsets --
; confirm against winnt.h.
;
; The two DWORDs at [r11] and [4+r11] are image-relative addresses of the
; start and end of the protected region; the unwind info for the ssse3
; routine supplies $L$ssse3_body and $L$ssse3_epilogue there. When the
; faulting address lies inside that region, 32 bytes saved just below the
; routine's frame base are copied into the context record before control
; joins $L$common_seh_tail (inside se_handler).
1799ssse3_handler:
	; Same register saves and the same 64-byte frame as se_handler; the
	; shared tail unwinds exactly this layout.
1800	push	rsi
1801	push	rdi
1802	push	rbx
1803	push	rbp
1804	push	r12
1805	push	r13
1806	push	r14
1807	push	r15
1808	pushfq
1809	sub	rsp,64
1810
	; rax = context+120 (presumably Rax), rbx = context+248 (presumably Rip).
1811	mov	rax,QWORD[120+r8]
1812	mov	rbx,QWORD[248+r8]
1813
	; rsi = [8+r9] (presumably disp->ImageBase),
	; r11 = [56+r9] (presumably disp->HandlerData).
1814	mov	rsi,QWORD[8+r9]
1815	mov	r11,QWORD[56+r9]
1816
	; r10 = rsi + first handler-data DWORD = absolute address of the region
	; start. A fault below it goes straight to the tail with rax unchanged.
1817	mov	r10d,DWORD[r11]
1818	lea	r10,[r10*1+rsi]
1819	cmp	rbx,r10
1820	jb	NEAR $L$common_seh_tail
1821
	; rax = context+192 (presumably context->R9); used as the frame base
	; from here on. NOTE(review): the ssse3 routine is outside this view;
	; presumably it keeps its entry stack pointer in r9 -- confirm.
1822	mov	rax,QWORD[192+r8]
1823
	; r10 = rsi + second handler-data DWORD = absolute address of the region
	; end. A fault at or past it needs no further restoring.
1824	mov	r10d,DWORD[4+r11]
1825	lea	r10,[r10*1+rsi]
1826	cmp	rbx,r10
1827	jae	NEAR $L$common_seh_tail
1828
	; Copy 4 qwords (32 bytes) from [rax-40] to context+512.
	; NOTE(review): 32 bytes is room for two XMM registers, presumably
	; xmm6/xmm7 landing in context->Xmm6 -- confirm.
1829	lea	rsi,[((-40))+rax]
1830	lea	rdi,[512+r8]
1831	mov	ecx,4
	; Bytes fc f3 48 a5 = "cld" followed by "rep movsq".
1832	DD	0xa548f3fc
1833
1834	jmp	NEAR $L$common_seh_tail
1835
1836
1837
1838ALIGN	16
; full_handler -- exception handler shared by the unwind info of
; ChaCha20_ctr32_ssse3_4x and ChaCha20_ctr32_avx2.
;
; Registers read on entry: r8 (context record) and r9 (dispatcher record).
; NOTE(review): field names in the comments below are the Win64
; CONTEXT / DISPATCHER_CONTEXT members usually found at these offsets --
; confirm against winnt.h.
;
; Identical in shape to ssse3_handler, except that 160 bytes (ten 16-byte
; XMM slots) are copied into the context record instead of 32. The source
; range [rax-168, rax-8) is the same range the avx2 epilogue reloads
; xmm6..xmm15 from ([-168+r9] .. [-24+r9]).
1839full_handler:
	; Same register saves and the same 64-byte frame as se_handler; the
	; shared tail unwinds exactly this layout.
1840	push	rsi
1841	push	rdi
1842	push	rbx
1843	push	rbp
1844	push	r12
1845	push	r13
1846	push	r14
1847	push	r15
1848	pushfq
1849	sub	rsp,64
1850
	; rax = context+120 (presumably Rax), rbx = context+248 (presumably Rip).
1851	mov	rax,QWORD[120+r8]
1852	mov	rbx,QWORD[248+r8]
1853
	; rsi = [8+r9] (presumably disp->ImageBase),
	; r11 = [56+r9] (presumably disp->HandlerData).
1854	mov	rsi,QWORD[8+r9]
1855	mov	r11,QWORD[56+r9]
1856
	; r10 = rsi + first handler-data DWORD (the *_body label of the routine).
	; A fault below it goes straight to the tail with rax unchanged.
1857	mov	r10d,DWORD[r11]
1858	lea	r10,[r10*1+rsi]
1859	cmp	rbx,r10
1860	jb	NEAR $L$common_seh_tail
1861
	; rax = context+192 (presumably context->R9). The avx2 epilogue uses r9
	; as the frame base ("lea rsp,[r9]"), so this is the routine's entry
	; stack pointer.
1862	mov	rax,QWORD[192+r8]
1863
	; r10 = rsi + second handler-data DWORD (the *_epilogue label).
	; A fault at or past it means the XMM registers are already restored.
1864	mov	r10d,DWORD[4+r11]
1865	lea	r10,[r10*1+rsi]
1866	cmp	rbx,r10
1867	jae	NEAR $L$common_seh_tail
1868
	; Copy 20 qwords (160 bytes = xmm6..xmm15 save area) from [rax-168] to
	; context+512 (presumably context->Xmm6 onward).
1869	lea	rsi,[((-168))+rax]
1870	lea	rdi,[512+r8]
1871	mov	ecx,20
	; Bytes fc f3 48 a5 = "cld" followed by "rep movsq".
1872	DD	0xa548f3fc
1873
1874	jmp	NEAR $L$common_seh_tail
1875
1876
; Function table. One record of three DWORDs per routine: start label,
; end label and unwind-info label, each emitted relative to the image base
; ("wrt ..imagebase").
; NOTE(review): this matches the Win64 RUNTIME_FUNCTION layout
; (BeginAddress, EndAddress, UnwindData) -- confirm.
1877section	.pdata rdata align=4
1878ALIGN	4
	; ChaCha20_ctr32_nohw
1879	DD	$L$SEH_begin_ChaCha20_ctr32_nohw wrt ..imagebase
1880	DD	$L$SEH_end_ChaCha20_ctr32_nohw wrt ..imagebase
1881	DD	$L$SEH_info_ChaCha20_ctr32_nohw wrt ..imagebase
1882
	; ChaCha20_ctr32_ssse3
1883	DD	$L$SEH_begin_ChaCha20_ctr32_ssse3 wrt ..imagebase
1884	DD	$L$SEH_end_ChaCha20_ctr32_ssse3 wrt ..imagebase
1885	DD	$L$SEH_info_ChaCha20_ctr32_ssse3 wrt ..imagebase
1886
	; ChaCha20_ctr32_ssse3_4x
1887	DD	$L$SEH_begin_ChaCha20_ctr32_ssse3_4x wrt ..imagebase
1888	DD	$L$SEH_end_ChaCha20_ctr32_ssse3_4x wrt ..imagebase
1889	DD	$L$SEH_info_ChaCha20_ctr32_ssse3_4x wrt ..imagebase
	; ChaCha20_ctr32_avx2
1890	DD	$L$SEH_begin_ChaCha20_ctr32_avx2 wrt ..imagebase
1891	DD	$L$SEH_end_ChaCha20_ctr32_avx2 wrt ..imagebase
1892	DD	$L$SEH_info_ChaCha20_ctr32_avx2 wrt ..imagebase
; Unwind information referenced by the third DWORD of each .pdata record.
; Every entry is: a 4-byte header (9,0,0,0), the image-relative address of
; its exception handler, and -- for all but the nohw entry -- two
; image-relative label addresses. ssse3_handler and full_handler read those
; two DWORDs through r11 ([r11] and [4+r11]) as the start and end of the
; region in which saved XMM registers must be copied back.
; NOTE(review): header byte 9 is presumably UNWIND_INFO version 1 combined
; with the "has exception handler" flag (1 | 1<<3), followed by zero
; prolog size, zero unwind codes and no frame register -- confirm against
; the x64 exception-handling documentation.
1893section	.xdata rdata align=8
1894ALIGN	8
	; nohw: se_handler compares against $L$ctr32_body / $L$no_data directly,
	; so no handler data follows.
1895$L$SEH_info_ChaCha20_ctr32_nohw:
1896	DB	9,0,0,0
1897	DD	se_handler wrt ..imagebase
1898
	; ssse3: region bounds for ssse3_handler.
1899$L$SEH_info_ChaCha20_ctr32_ssse3:
1900	DB	9,0,0,0
1901	DD	ssse3_handler wrt ..imagebase
1902	DD	$L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase
1903
	; ssse3_4x: region bounds for full_handler.
1904$L$SEH_info_ChaCha20_ctr32_ssse3_4x:
1905	DB	9,0,0,0
1906	DD	full_handler wrt ..imagebase
1907	DD	$L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase
	; avx2: region bounds for full_handler.
1908$L$SEH_info_ChaCha20_ctr32_avx2:
1909	DB	9,0,0,0
1910	DD	full_handler wrt ..imagebase
1911	DD	$L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase
1912%else
1913; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
1914ret
1915%endif
1916