xref: /aosp_15_r20/external/boringssl/src/gen/crypto/chacha-x86-win.asm (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
1; This file is generated from a similarly-named Perl script in the BoringSSL
2; source tree. Do not edit by hand.
3
; Optional symbol prefixing: the include is only pulled in when
; BORINGSSL_PREFIX is defined.
4%ifdef BORINGSSL_PREFIX
5%include "boringssl_prefix_symbols_nasm.inc"
6%endif
; Everything from here down to the matching %else is assembled only when the
; output format is win32; for any other format the file reduces to the lone
; `ret` at the bottom.
7%ifidn __OUTPUT_FORMAT__, win32
; Pick the code section declaration that matches the output format.
8%ifidn __OUTPUT_FORMAT__,obj
9section	code	use32 class=code align=64
10%elifidn __OUTPUT_FORMAT__,win32
; NOTE(review): $@feat.00 = 1 is presumably the SafeSEH-compatibility marker
; consumed by the Microsoft linker - confirm against the NASM win32 docs.
11$@feat.00 equ 1
12section	.text	code align=64
13%else
14section	.text	code
15%endif
;-----------------------------------------------------------------------
; _ChaCha20_ctr32_nohw - ChaCha20 using only the integer registers.
;
; Stack arguments (offsets relative to esp right after the four pushes):
;   [20+esp]  destination pointer (written 64 bytes at a time, then bytewise)
;   [24+esp]  source pointer (XORed with the generated block)
;   [28+esp]  byte count
;   [32+esp]  pointer to 32 bytes copied to frame slots 80..108
;   [36+esp]  pointer to 16 bytes copied to frame slots 112..124
;
; Frame layout after `sub esp,132`:
;   [0..60]         working state words x0..x15 (x[i] lives at 4*i+esp when
;                   it is not held in a register)
;   [80..108]       input words 4..11 (the 32-byte block from the 4th arg)
;   [112..124]      input words 12..15 (the 16-byte block from the 5th arg);
;                   word 12 is incremented once per 64-byte block
;   [128]           spill slot for the round-loop counter
;   [152,156,160]   the caller's destination / source / count argument slots,
;                   updated in place as the routine advances
;
; Saves and restores ebp, ebx, esi, edi; eax, ecx, edx are clobbered.
; NOTE(review): there is no zero-length check in this routine; a count of 0
; would reach the tail loop with ebx == 0 - callers presumably guarantee a
; non-zero count; confirm.
;-----------------------------------------------------------------------
16global	_ChaCha20_ctr32_nohw
17align	16
18_ChaCha20_ctr32_nohw:
19L$_ChaCha20_ctr32_nohw_begin:
20	push	ebp
21	push	ebx
22	push	esi
23	push	edi
; Offsets 32 and 36 are taken before esp is lowered: 4th and 5th arguments.
24	mov	esi,DWORD [32+esp]
25	mov	edi,DWORD [36+esp]
26	sub	esp,132
; Copy the 32-byte block at esi into frame slots 80..108.
27	mov	eax,DWORD [esi]
28	mov	ebx,DWORD [4+esi]
29	mov	ecx,DWORD [8+esi]
30	mov	edx,DWORD [12+esi]
31	mov	DWORD [80+esp],eax
32	mov	DWORD [84+esp],ebx
33	mov	DWORD [88+esp],ecx
34	mov	DWORD [92+esp],edx
35	mov	eax,DWORD [16+esi]
36	mov	ebx,DWORD [20+esi]
37	mov	ecx,DWORD [24+esi]
38	mov	edx,DWORD [28+esi]
39	mov	DWORD [96+esp],eax
40	mov	DWORD [100+esp],ebx
41	mov	DWORD [104+esp],ecx
42	mov	DWORD [108+esp],edx
; Copy the 16-byte block at edi into frame slots 112..124.
43	mov	eax,DWORD [edi]
44	mov	ebx,DWORD [4+edi]
45	mov	ecx,DWORD [8+edi]
46	mov	edx,DWORD [12+edi]
; Pre-decrement word 12: the per-block setup below adds 1 before using it,
; so the first block sees the caller's original value.
47	sub	eax,1
48	mov	DWORD [112+esp],eax
49	mov	DWORD [116+esp],ebx
50	mov	DWORD [120+esp],ecx
51	mov	DWORD [124+esp],edx
52	jmp	NEAR L$000entry
53align	16
; Reached after a full 64-byte block with bytes remaining:
; ebx = advanced source, eax = advanced destination, ecx = remaining count.
54L$001outer_loop:
55	mov	DWORD [156+esp],ebx
56	mov	DWORD [152+esp],eax
57	mov	DWORD [160+esp],ecx
; Per-block setup: load the initial state. x0 (eax), x4 (ebp), x8 (ecx),
; x9 (esi) and x12 (edx) start in registers; the rest go to their slots.
58L$000entry:
; The four constants are the little-endian words of "expand 32-byte k".
59	mov	eax,1634760805
60	mov	DWORD [4+esp],857760878
61	mov	DWORD [8+esp],2036477234
62	mov	DWORD [12+esp],1797285236
63	mov	ebx,DWORD [84+esp]
64	mov	ebp,DWORD [88+esp]
65	mov	ecx,DWORD [104+esp]
66	mov	esi,DWORD [108+esp]
67	mov	edx,DWORD [116+esp]
68	mov	edi,DWORD [120+esp]
69	mov	DWORD [20+esp],ebx
70	mov	DWORD [24+esp],ebp
71	mov	DWORD [40+esp],ecx
72	mov	DWORD [44+esp],esi
73	mov	DWORD [52+esp],edx
74	mov	DWORD [56+esp],edi
75	mov	ebx,DWORD [92+esp]
76	mov	edi,DWORD [124+esp]
77	mov	edx,DWORD [112+esp]
78	mov	ebp,DWORD [80+esp]
79	mov	ecx,DWORD [96+esp]
80	mov	esi,DWORD [100+esp]
; Advance word 12 for this block and write it back for the next one.
81	add	edx,1
82	mov	DWORD [28+esp],ebx
83	mov	DWORD [60+esp],edi
84	mov	DWORD [112+esp],edx
; 10 iterations of 8 quarter-rounds each.
85	mov	ebx,10
86	jmp	NEAR L$002loop
87align	16
; One iteration = four quarter-rounds on the columns followed by four on the
; diagonals. Each quarter-round is the add/xor/rotate sequence with rotate
; counts 16, 12, 8, 7. Loads and stores for neighbouring quarter-rounds are
; interleaved, so the markers below sit on the first `add` of each one.
88L$002loop:
; Quarter-round on (x0, x4, x8, x12); ebx is spilled to free a register.
89	add	eax,ebp
90	mov	DWORD [128+esp],ebx
91	mov	ebx,ebp
92	xor	edx,eax
93	rol	edx,16
94	add	ecx,edx
95	xor	ebx,ecx
96	mov	edi,DWORD [52+esp]
97	rol	ebx,12
98	mov	ebp,DWORD [20+esp]
99	add	eax,ebx
100	xor	edx,eax
101	mov	DWORD [esp],eax
102	rol	edx,8
103	mov	eax,DWORD [4+esp]
104	add	ecx,edx
105	mov	DWORD [48+esp],edx
106	xor	ebx,ecx
; Quarter-round on (x1, x5, x9, x13).
107	add	eax,ebp
108	rol	ebx,7
109	xor	edi,eax
110	mov	DWORD [32+esp],ecx
111	rol	edi,16
112	mov	DWORD [16+esp],ebx
113	add	esi,edi
114	mov	ecx,DWORD [40+esp]
115	xor	ebp,esi
116	mov	edx,DWORD [56+esp]
117	rol	ebp,12
118	mov	ebx,DWORD [24+esp]
119	add	eax,ebp
120	xor	edi,eax
121	mov	DWORD [4+esp],eax
122	rol	edi,8
123	mov	eax,DWORD [8+esp]
124	add	esi,edi
125	mov	DWORD [52+esp],edi
126	xor	ebp,esi
; Quarter-round on (x2, x6, x10, x14).
127	add	eax,ebx
128	rol	ebp,7
129	xor	edx,eax
130	mov	DWORD [36+esp],esi
131	rol	edx,16
132	mov	DWORD [20+esp],ebp
133	add	ecx,edx
134	mov	esi,DWORD [44+esp]
135	xor	ebx,ecx
136	mov	edi,DWORD [60+esp]
137	rol	ebx,12
138	mov	ebp,DWORD [28+esp]
139	add	eax,ebx
140	xor	edx,eax
141	mov	DWORD [8+esp],eax
142	rol	edx,8
143	mov	eax,DWORD [12+esp]
144	add	ecx,edx
145	mov	DWORD [56+esp],edx
146	xor	ebx,ecx
; Quarter-round on (x3, x7, x11, x15).
147	add	eax,ebp
148	rol	ebx,7
149	xor	edi,eax
150	rol	edi,16
151	mov	DWORD [24+esp],ebx
152	add	esi,edi
153	xor	ebp,esi
154	rol	ebp,12
155	mov	ebx,DWORD [20+esp]
156	add	eax,ebp
157	xor	edi,eax
158	mov	DWORD [12+esp],eax
159	rol	edi,8
160	mov	eax,DWORD [esp]
161	add	esi,edi
; x15 stays in a register (edi -> edx) as the `d` input of the first
; diagonal quarter-round; x10 is still in ecx and x11 in esi.
162	mov	edx,edi
163	xor	ebp,esi
; Quarter-round on (x0, x5, x10, x15).
164	add	eax,ebx
165	rol	ebp,7
166	xor	edx,eax
167	rol	edx,16
168	mov	DWORD [28+esp],ebp
169	add	ecx,edx
170	xor	ebx,ecx
171	mov	edi,DWORD [48+esp]
172	rol	ebx,12
173	mov	ebp,DWORD [24+esp]
174	add	eax,ebx
175	xor	edx,eax
176	mov	DWORD [esp],eax
177	rol	edx,8
178	mov	eax,DWORD [4+esp]
179	add	ecx,edx
180	mov	DWORD [60+esp],edx
181	xor	ebx,ecx
; Quarter-round on (x1, x6, x11, x12).
182	add	eax,ebp
183	rol	ebx,7
184	xor	edi,eax
185	mov	DWORD [40+esp],ecx
186	rol	edi,16
187	mov	DWORD [20+esp],ebx
188	add	esi,edi
189	mov	ecx,DWORD [32+esp]
190	xor	ebp,esi
191	mov	edx,DWORD [52+esp]
192	rol	ebp,12
193	mov	ebx,DWORD [28+esp]
194	add	eax,ebp
195	xor	edi,eax
196	mov	DWORD [4+esp],eax
197	rol	edi,8
198	mov	eax,DWORD [8+esp]
199	add	esi,edi
200	mov	DWORD [48+esp],edi
201	xor	ebp,esi
; Quarter-round on (x2, x7, x8, x13).
202	add	eax,ebx
203	rol	ebp,7
204	xor	edx,eax
205	mov	DWORD [44+esp],esi
206	rol	edx,16
207	mov	DWORD [24+esp],ebp
208	add	ecx,edx
209	mov	esi,DWORD [36+esp]
210	xor	ebx,ecx
211	mov	edi,DWORD [56+esp]
212	rol	ebx,12
213	mov	ebp,DWORD [16+esp]
214	add	eax,ebx
215	xor	edx,eax
216	mov	DWORD [8+esp],eax
217	rol	edx,8
218	mov	eax,DWORD [12+esp]
219	add	ecx,edx
220	mov	DWORD [52+esp],edx
221	xor	ebx,ecx
; Quarter-round on (x3, x4, x9, x14). While it runs, x12 (edx), the loop
; counter (ebx) and x0 (eax) are reloaded so that the register assignment at
; the loop top (and at the exit) is eax=x0, ebp=x4, ecx=x8, esi=x9, edx=x12,
; edi=x14.
222	add	eax,ebp
223	rol	ebx,7
224	xor	edi,eax
225	rol	edi,16
226	mov	DWORD [28+esp],ebx
227	add	esi,edi
228	xor	ebp,esi
229	mov	edx,DWORD [48+esp]
230	rol	ebp,12
231	mov	ebx,DWORD [128+esp]
232	add	eax,ebp
233	xor	edi,eax
234	mov	DWORD [12+esp],eax
235	rol	edi,8
236	mov	eax,DWORD [esp]
237	add	esi,edi
238	mov	DWORD [56+esp],edi
239	xor	ebp,esi
240	rol	ebp,7
241	dec	ebx
242	jnz	NEAR L$002loop
; Rounds done. Add the initial state back in, starting with the words that
; are live in registers (x0, x4, x8, x9).
243	mov	ebx,DWORD [160+esp]
244	add	eax,1634760805
245	add	ebp,DWORD [80+esp]
246	add	ecx,DWORD [96+esp]
247	add	esi,DWORD [100+esp]
; Fewer than 64 bytes left: take the bytewise tail path (unsigned compare).
248	cmp	ebx,64
249	jb	NEAR L$003tail
; Full block: ebx = source, eax (after the reload below) = destination.
; Words are processed in three batches of five plus x0, as registers allow.
250	mov	ebx,DWORD [156+esp]
251	add	edx,DWORD [112+esp]
252	add	edi,DWORD [120+esp]
253	xor	eax,DWORD [ebx]
254	xor	ebp,DWORD [16+ebx]
; Output word 0 is parked in [esp] because eax is needed for the destination.
255	mov	DWORD [esp],eax
256	mov	eax,DWORD [152+esp]
257	xor	ecx,DWORD [32+ebx]
258	xor	esi,DWORD [36+ebx]
259	xor	edx,DWORD [48+ebx]
260	xor	edi,DWORD [56+ebx]
261	mov	DWORD [16+eax],ebp
262	mov	DWORD [32+eax],ecx
263	mov	DWORD [36+eax],esi
264	mov	DWORD [48+eax],edx
265	mov	DWORD [56+eax],edi
; Second batch: x1, x2, x3, x5, x6.
266	mov	ebp,DWORD [4+esp]
267	mov	ecx,DWORD [8+esp]
268	mov	esi,DWORD [12+esp]
269	mov	edx,DWORD [20+esp]
270	mov	edi,DWORD [24+esp]
271	add	ebp,857760878
272	add	ecx,2036477234
273	add	esi,1797285236
274	add	edx,DWORD [84+esp]
275	add	edi,DWORD [88+esp]
276	xor	ebp,DWORD [4+ebx]
277	xor	ecx,DWORD [8+ebx]
278	xor	esi,DWORD [12+ebx]
279	xor	edx,DWORD [20+ebx]
280	xor	edi,DWORD [24+ebx]
281	mov	DWORD [4+eax],ebp
282	mov	DWORD [8+eax],ecx
283	mov	DWORD [12+eax],esi
284	mov	DWORD [20+eax],edx
285	mov	DWORD [24+eax],edi
; Third batch: x7, x10, x11, x13, x15.
286	mov	ebp,DWORD [28+esp]
287	mov	ecx,DWORD [40+esp]
288	mov	esi,DWORD [44+esp]
289	mov	edx,DWORD [52+esp]
290	mov	edi,DWORD [60+esp]
291	add	ebp,DWORD [92+esp]
292	add	ecx,DWORD [104+esp]
293	add	esi,DWORD [108+esp]
294	add	edx,DWORD [116+esp]
295	add	edi,DWORD [124+esp]
296	xor	ebp,DWORD [28+ebx]
297	xor	ecx,DWORD [40+ebx]
298	xor	esi,DWORD [44+ebx]
299	xor	edx,DWORD [52+ebx]
300	xor	edi,DWORD [60+ebx]
; Advance source and destination by 64; the parked word 0 is stored last.
301	lea	ebx,[64+ebx]
302	mov	DWORD [28+eax],ebp
303	mov	ebp,DWORD [esp]
304	mov	DWORD [40+eax],ecx
305	mov	ecx,DWORD [160+esp]
306	mov	DWORD [44+eax],esi
307	mov	DWORD [52+eax],edx
308	mov	DWORD [60+eax],edi
309	mov	DWORD [eax],ebp
310	lea	eax,[64+eax]
; Remaining count -= 64; loop while any bytes are left.
311	sub	ecx,64
312	jnz	NEAR L$001outer_loop
313	jmp	NEAR L$004done
; Partial final block (ebx = remaining bytes, below 64): finish the
; state + input additions and store all 16 words to [0..60+esp] so they can
; be consumed one byte at a time.
314L$003tail:
315	add	edx,DWORD [112+esp]
316	add	edi,DWORD [120+esp]
317	mov	DWORD [esp],eax
318	mov	DWORD [16+esp],ebp
319	mov	DWORD [32+esp],ecx
320	mov	DWORD [36+esp],esi
321	mov	DWORD [48+esp],edx
322	mov	DWORD [56+esp],edi
323	mov	ebp,DWORD [4+esp]
324	mov	ecx,DWORD [8+esp]
325	mov	esi,DWORD [12+esp]
326	mov	edx,DWORD [20+esp]
327	mov	edi,DWORD [24+esp]
328	add	ebp,857760878
329	add	ecx,2036477234
330	add	esi,1797285236
331	add	edx,DWORD [84+esp]
332	add	edi,DWORD [88+esp]
333	mov	DWORD [4+esp],ebp
334	mov	DWORD [8+esp],ecx
335	mov	DWORD [12+esp],esi
336	mov	DWORD [20+esp],edx
337	mov	DWORD [24+esp],edi
338	mov	ebp,DWORD [28+esp]
339	mov	ecx,DWORD [40+esp]
340	mov	esi,DWORD [44+esp]
341	mov	edx,DWORD [52+esp]
342	mov	edi,DWORD [60+esp]
343	add	ebp,DWORD [92+esp]
344	add	ecx,DWORD [104+esp]
345	add	esi,DWORD [108+esp]
346	add	edx,DWORD [116+esp]
347	add	edi,DWORD [124+esp]
; From here: ebp = source, ecx = destination, esi = byte index (0-based).
348	mov	DWORD [28+esp],ebp
349	mov	ebp,DWORD [156+esp]
350	mov	DWORD [40+esp],ecx
351	mov	ecx,DWORD [152+esp]
352	mov	DWORD [44+esp],esi
353	xor	esi,esi
354	mov	DWORD [52+esp],edx
355	mov	DWORD [60+esp],edi
356	xor	eax,eax
357	xor	edx,edx
; dest[i] = src[i] XOR block[i] for the ebx remaining bytes. The index is
; bumped before the store, hence the -1 in the store's displacement.
358L$005tail_loop:
359	mov	al,BYTE [ebp*1+esi]
360	mov	dl,BYTE [esi*1+esp]
361	lea	esi,[1+esi]
362	xor	al,dl
363	mov	BYTE [esi*1+ecx-1],al
364	dec	ebx
365	jnz	NEAR L$005tail_loop
; Release the frame and restore the saved registers in reverse push order.
366L$004done:
367	add	esp,132
368	pop	edi
369	pop	esi
370	pop	ebx
371	pop	ebp
372	ret
;-----------------------------------------------------------------------
; _ChaCha20_ctr32_ssse3 - ChaCha20 using XMM registers and pshufb.
;
; Stack arguments (offsets relative to esp after the four pushes; the
; call/pop pair below leaves esp unchanged):
;   [20+esp] -> edi  destination pointer
;   [24+esp] -> esi  source pointer
;   [28+esp] -> ecx  byte count
;   [32+esp] -> edx  pointer to 32 bytes (state rows 1 and 2)
;   [36+esp] -> ebx  pointer to 16 bytes (state row 3; dword 0 is the one
;                    that gets incremented per block)
;
; Two code paths:
;   * count >= 256: four 64-byte blocks are computed in parallel, one state
;     word per XMM register with one block per 32-bit lane (L$007outer_loop).
;   * count < 256, or the remainder after the 4-block path: one block at a
;     time with one state row per XMM register (L$0061x onwards).
;
; Frame (esp is aligned down to a multiple of 64):
;   [0..63+esp]     1-block path: the four input rows, later the tail block
;   [0..255+esp]    4-block path: working state, addressed via ebx = 128+esp
;   [256..511+esp]  4-block path: broadcast input state, via ebp = 384+esp
;   [512+esp]       caller's esp, reloaded at L$009done
;   [516+esp]       saved edx (32-byte block pointer)
;   [520+esp]       saved ebx (16-byte block pointer)
;
; eax holds the address of L$ssse3_data throughout the block computation.
; Saves and restores ebp, ebx, esi, edi.
;-----------------------------------------------------------------------
373global	_ChaCha20_ctr32_ssse3
374align	16
375_ChaCha20_ctr32_ssse3:
376L$_ChaCha20_ctr32_ssse3_begin:
377	push	ebp
378	push	ebx
379	push	esi
380	push	edi
; call/pop yields the run-time address of L$pic_point in eax, so the data
; table can be addressed without an absolute relocation.
381	call	L$pic_point
382L$pic_point:
383	pop	eax
384	mov	edi,DWORD [20+esp]
385	mov	esi,DWORD [24+esp]
386	mov	ecx,DWORD [28+esp]
387	mov	edx,DWORD [32+esp]
388	mov	ebx,DWORD [36+esp]
; Build the 64-byte-aligned frame; the original esp is kept at [512+esp].
389	mov	ebp,esp
390	sub	esp,524
391	and	esp,-64
392	mov	DWORD [512+esp],ebp
393	lea	eax,[(L$ssse3_data-L$pic_point)+eax]
; xmm3 = state row 3 (the 16 bytes at ebx).
394	movdqu	xmm3,[ebx]
; Fewer than 256 bytes: go straight to the one-block-at-a-time path.
395	cmp	ecx,256
396	jb	NEAR L$0061x
; ---- 4-block path setup ----
; The pointers are saved because edx and ebx are reused below. ecx is biased
; by -256 so the loop can use the borrow from `sub ecx,256` as its exit test.
397	mov	DWORD [516+esp],edx
398	mov	DWORD [520+esp],ebx
399	sub	ecx,256
400	lea	ebp,[384+esp]
; Broadcast each input word to all four lanes (pshufd 0/85/170/255 select
; dword 0/1/2/3). Row i of the broadcast state lives at [ebp-128+16*i].
401	movdqu	xmm7,[edx]
402	pshufd	xmm0,xmm3,0
403	pshufd	xmm1,xmm3,85
404	pshufd	xmm2,xmm3,170
405	pshufd	xmm3,xmm3,255
; Word 12 lanes become counter+{0,1,2,3}, then 4 is subtracted because the
; outer loop adds 4 before every batch, including the first.
406	paddd	xmm0,[48+eax]
407	pshufd	xmm4,xmm7,0
408	pshufd	xmm5,xmm7,85
409	psubd	xmm0,[64+eax]
410	pshufd	xmm6,xmm7,170
411	pshufd	xmm7,xmm7,255
412	movdqa	[64+ebp],xmm0
413	movdqa	[80+ebp],xmm1
414	movdqa	[96+ebp],xmm2
415	movdqa	[112+ebp],xmm3
416	movdqu	xmm3,[16+edx]
417	movdqa	[ebp-64],xmm4
418	movdqa	[ebp-48],xmm5
419	movdqa	[ebp-32],xmm6
420	movdqa	[ebp-16],xmm7
; xmm7 = the four constant words from the table (row 0).
421	movdqa	xmm7,[32+eax]
; ebx = base of the working state; row i lives at [ebx-128+16*i].
422	lea	ebx,[128+esp]
423	pshufd	xmm0,xmm3,0
424	pshufd	xmm1,xmm3,85
425	pshufd	xmm2,xmm3,170
426	pshufd	xmm3,xmm3,255
427	pshufd	xmm4,xmm7,0
428	pshufd	xmm5,xmm7,85
429	pshufd	xmm6,xmm7,170
430	pshufd	xmm7,xmm7,255
431	movdqa	[ebp],xmm0
432	movdqa	[16+ebp],xmm1
433	movdqa	[32+ebp],xmm2
434	movdqa	[48+ebp],xmm3
435	movdqa	[ebp-128],xmm4
436	movdqa	[ebp-112],xmm5
437	movdqa	[ebp-96],xmm6
438	movdqa	[ebp-80],xmm7
; Bias both data pointers by +128; the loop addresses the four blocks of a
; batch at displacements -128, -64, 0 and +64.
439	lea	esi,[128+esi]
440	lea	edi,[128+edi]
441	jmp	NEAR L$007outer_loop
442align	16
; ---- one batch of four blocks (256 bytes) ----
; Copy the broadcast input state into the working area. Rows 0, 4, 8, 9 and
; 12 are kept in registers (xmm0, xmm3, xmm4, xmm5, xmm6) instead of being
; stored, and row 12 is advanced by 4 in both copies.
443L$007outer_loop:
444	movdqa	xmm1,[ebp-112]
445	movdqa	xmm2,[ebp-96]
446	movdqa	xmm3,[ebp-80]
447	movdqa	xmm5,[ebp-48]
448	movdqa	xmm6,[ebp-32]
449	movdqa	xmm7,[ebp-16]
450	movdqa	[ebx-112],xmm1
451	movdqa	[ebx-96],xmm2
452	movdqa	[ebx-80],xmm3
453	movdqa	[ebx-48],xmm5
454	movdqa	[ebx-32],xmm6
455	movdqa	[ebx-16],xmm7
456	movdqa	xmm2,[32+ebp]
457	movdqa	xmm3,[48+ebp]
458	movdqa	xmm4,[64+ebp]
459	movdqa	xmm5,[80+ebp]
460	movdqa	xmm6,[96+ebp]
461	movdqa	xmm7,[112+ebp]
462	paddd	xmm4,[64+eax]
463	movdqa	[32+ebx],xmm2
464	movdqa	[48+ebx],xmm3
465	movdqa	[64+ebx],xmm4
466	movdqa	[80+ebx],xmm5
467	movdqa	[96+ebx],xmm6
468	movdqa	[112+ebx],xmm7
469	movdqa	[64+ebp],xmm4
470	movdqa	xmm0,[ebp-128]
471	movdqa	xmm6,xmm4
472	movdqa	xmm3,[ebp-64]
473	movdqa	xmm4,[ebp]
474	movdqa	xmm5,[16+ebp]
; edx = 10 iterations, each one column round plus one diagonal round.
475	mov	edx,10
476	nop
477align	16
; Eight interleaved quarter-rounds per iteration, each operating on four
; blocks at once. Rotations by 16 and 8 use pshufb with the byte-shuffle
; masks at [eax] and [16+eax]; rotations by 12 and 7 use the
; pslld/psrld/por triple. The first quarter-round is on rows (0, 4, 8, 12).
478L$008loop:
479	paddd	xmm0,xmm3
480	movdqa	xmm2,xmm3
481	pxor	xmm6,xmm0
482	pshufb	xmm6,[eax]
483	paddd	xmm4,xmm6
484	pxor	xmm2,xmm4
485	movdqa	xmm3,[ebx-48]
486	movdqa	xmm1,xmm2
487	pslld	xmm2,12
488	psrld	xmm1,20
489	por	xmm2,xmm1
490	movdqa	xmm1,[ebx-112]
491	paddd	xmm0,xmm2
492	movdqa	xmm7,[80+ebx]
493	pxor	xmm6,xmm0
494	movdqa	[ebx-128],xmm0
495	pshufb	xmm6,[16+eax]
496	paddd	xmm4,xmm6
497	movdqa	[64+ebx],xmm6
498	pxor	xmm2,xmm4
499	paddd	xmm1,xmm3
500	movdqa	xmm0,xmm2
501	pslld	xmm2,7
502	psrld	xmm0,25
503	pxor	xmm7,xmm1
504	por	xmm2,xmm0
505	movdqa	[ebx],xmm4
506	pshufb	xmm7,[eax]
507	movdqa	[ebx-64],xmm2
508	paddd	xmm5,xmm7
509	movdqa	xmm4,[32+ebx]
510	pxor	xmm3,xmm5
511	movdqa	xmm2,[ebx-32]
512	movdqa	xmm0,xmm3
513	pslld	xmm3,12
514	psrld	xmm0,20
515	por	xmm3,xmm0
516	movdqa	xmm0,[ebx-96]
517	paddd	xmm1,xmm3
518	movdqa	xmm6,[96+ebx]
519	pxor	xmm7,xmm1
520	movdqa	[ebx-112],xmm1
521	pshufb	xmm7,[16+eax]
522	paddd	xmm5,xmm7
523	movdqa	[80+ebx],xmm7
524	pxor	xmm3,xmm5
525	paddd	xmm0,xmm2
526	movdqa	xmm1,xmm3
527	pslld	xmm3,7
528	psrld	xmm1,25
529	pxor	xmm6,xmm0
530	por	xmm3,xmm1
531	movdqa	[16+ebx],xmm5
532	pshufb	xmm6,[eax]
533	movdqa	[ebx-48],xmm3
534	paddd	xmm4,xmm6
535	movdqa	xmm5,[48+ebx]
536	pxor	xmm2,xmm4
537	movdqa	xmm3,[ebx-16]
538	movdqa	xmm1,xmm2
539	pslld	xmm2,12
540	psrld	xmm1,20
541	por	xmm2,xmm1
542	movdqa	xmm1,[ebx-80]
543	paddd	xmm0,xmm2
544	movdqa	xmm7,[112+ebx]
545	pxor	xmm6,xmm0
546	movdqa	[ebx-96],xmm0
547	pshufb	xmm6,[16+eax]
548	paddd	xmm4,xmm6
549	movdqa	[96+ebx],xmm6
550	pxor	xmm2,xmm4
551	paddd	xmm1,xmm3
552	movdqa	xmm0,xmm2
553	pslld	xmm2,7
554	psrld	xmm0,25
555	pxor	xmm7,xmm1
556	por	xmm2,xmm0
557	pshufb	xmm7,[eax]
558	movdqa	[ebx-32],xmm2
559	paddd	xmm5,xmm7
560	pxor	xmm3,xmm5
561	movdqa	xmm2,[ebx-48]
562	movdqa	xmm0,xmm3
563	pslld	xmm3,12
564	psrld	xmm0,20
565	por	xmm3,xmm0
566	movdqa	xmm0,[ebx-128]
567	paddd	xmm1,xmm3
568	pxor	xmm7,xmm1
569	movdqa	[ebx-80],xmm1
570	pshufb	xmm7,[16+eax]
571	paddd	xmm5,xmm7
; Diagonal half: row 15 (xmm7) is copied to xmm6 to serve as `d` of the
; quarter-round on rows (0, 5, 10, 15); row 10 is still in xmm4.
572	movdqa	xmm6,xmm7
573	pxor	xmm3,xmm5
574	paddd	xmm0,xmm2
575	movdqa	xmm1,xmm3
576	pslld	xmm3,7
577	psrld	xmm1,25
578	pxor	xmm6,xmm0
579	por	xmm3,xmm1
580	pshufb	xmm6,[eax]
581	movdqa	[ebx-16],xmm3
582	paddd	xmm4,xmm6
583	pxor	xmm2,xmm4
584	movdqa	xmm3,[ebx-32]
585	movdqa	xmm1,xmm2
586	pslld	xmm2,12
587	psrld	xmm1,20
588	por	xmm2,xmm1
589	movdqa	xmm1,[ebx-112]
590	paddd	xmm0,xmm2
591	movdqa	xmm7,[64+ebx]
592	pxor	xmm6,xmm0
593	movdqa	[ebx-128],xmm0
594	pshufb	xmm6,[16+eax]
595	paddd	xmm4,xmm6
596	movdqa	[112+ebx],xmm6
597	pxor	xmm2,xmm4
598	paddd	xmm1,xmm3
599	movdqa	xmm0,xmm2
600	pslld	xmm2,7
601	psrld	xmm0,25
602	pxor	xmm7,xmm1
603	por	xmm2,xmm0
604	movdqa	[32+ebx],xmm4
605	pshufb	xmm7,[eax]
606	movdqa	[ebx-48],xmm2
607	paddd	xmm5,xmm7
608	movdqa	xmm4,[ebx]
609	pxor	xmm3,xmm5
610	movdqa	xmm2,[ebx-16]
611	movdqa	xmm0,xmm3
612	pslld	xmm3,12
613	psrld	xmm0,20
614	por	xmm3,xmm0
615	movdqa	xmm0,[ebx-96]
616	paddd	xmm1,xmm3
617	movdqa	xmm6,[80+ebx]
618	pxor	xmm7,xmm1
619	movdqa	[ebx-112],xmm1
620	pshufb	xmm7,[16+eax]
621	paddd	xmm5,xmm7
622	movdqa	[64+ebx],xmm7
623	pxor	xmm3,xmm5
624	paddd	xmm0,xmm2
625	movdqa	xmm1,xmm3
626	pslld	xmm3,7
627	psrld	xmm1,25
628	pxor	xmm6,xmm0
629	por	xmm3,xmm1
630	movdqa	[48+ebx],xmm5
631	pshufb	xmm6,[eax]
632	movdqa	[ebx-32],xmm3
633	paddd	xmm4,xmm6
634	movdqa	xmm5,[16+ebx]
635	pxor	xmm2,xmm4
636	movdqa	xmm3,[ebx-64]
637	movdqa	xmm1,xmm2
638	pslld	xmm2,12
639	psrld	xmm1,20
640	por	xmm2,xmm1
641	movdqa	xmm1,[ebx-80]
642	paddd	xmm0,xmm2
643	movdqa	xmm7,[96+ebx]
644	pxor	xmm6,xmm0
645	movdqa	[ebx-96],xmm0
646	pshufb	xmm6,[16+eax]
647	paddd	xmm4,xmm6
648	movdqa	[80+ebx],xmm6
649	pxor	xmm2,xmm4
650	paddd	xmm1,xmm3
651	movdqa	xmm0,xmm2
652	pslld	xmm2,7
653	psrld	xmm0,25
654	pxor	xmm7,xmm1
655	por	xmm2,xmm0
656	pshufb	xmm7,[eax]
657	movdqa	[ebx-16],xmm2
658	paddd	xmm5,xmm7
659	pxor	xmm3,xmm5
660	movdqa	xmm0,xmm3
661	pslld	xmm3,12
662	psrld	xmm0,20
663	por	xmm3,xmm0
; Rows 0 and 12 are reloaded into xmm0 / xmm6 ready for the next iteration.
664	movdqa	xmm0,[ebx-128]
665	paddd	xmm1,xmm3
666	movdqa	xmm6,[64+ebx]
667	pxor	xmm7,xmm1
668	movdqa	[ebx-80],xmm1
669	pshufb	xmm7,[16+eax]
670	paddd	xmm5,xmm7
671	movdqa	[96+ebx],xmm7
672	pxor	xmm3,xmm5
673	movdqa	xmm1,xmm3
674	pslld	xmm3,7
675	psrld	xmm1,25
676	por	xmm3,xmm1
677	dec	edx
678	jnz	NEAR L$008loop
; Flush the rows still held in registers (4, 8, 9, 12, 14) to the working
; area so that all sixteen rows can be read back from memory.
679	movdqa	[ebx-64],xmm3
680	movdqa	[ebx],xmm4
681	movdqa	[16+ebx],xmm5
682	movdqa	[64+ebx],xmm6
683	movdqa	[96+ebx],xmm7
; Output, four rows at a time. For rows 0..3: add the input state, transpose
; the 4x4 dword matrix with punpck* so each register holds 16 consecutive
; bytes of a single block, then XOR with the source at -128/-64/0/+64 (the
; same 16-byte column of the four blocks) and store to the destination.
684	movdqa	xmm1,[ebx-112]
685	movdqa	xmm2,[ebx-96]
686	movdqa	xmm3,[ebx-80]
687	paddd	xmm0,[ebp-128]
688	paddd	xmm1,[ebp-112]
689	paddd	xmm2,[ebp-96]
690	paddd	xmm3,[ebp-80]
691	movdqa	xmm6,xmm0
692	punpckldq	xmm0,xmm1
693	movdqa	xmm7,xmm2
694	punpckldq	xmm2,xmm3
695	punpckhdq	xmm6,xmm1
696	punpckhdq	xmm7,xmm3
697	movdqa	xmm1,xmm0
698	punpcklqdq	xmm0,xmm2
699	movdqa	xmm3,xmm6
700	punpcklqdq	xmm6,xmm7
701	punpckhqdq	xmm1,xmm2
702	punpckhqdq	xmm3,xmm7
703	movdqu	xmm4,[esi-128]
704	movdqu	xmm5,[esi-64]
705	movdqu	xmm2,[esi]
706	movdqu	xmm7,[64+esi]
707	lea	esi,[16+esi]
708	pxor	xmm4,xmm0
709	movdqa	xmm0,[ebx-64]
710	pxor	xmm5,xmm1
711	movdqa	xmm1,[ebx-48]
712	pxor	xmm6,xmm2
713	movdqa	xmm2,[ebx-32]
714	pxor	xmm7,xmm3
715	movdqa	xmm3,[ebx-16]
716	movdqu	[edi-128],xmm4
717	movdqu	[edi-64],xmm5
718	movdqu	[edi],xmm6
719	movdqu	[64+edi],xmm7
720	lea	edi,[16+edi]
; Rows 4..7 (bytes 16..31 of each block).
721	paddd	xmm0,[ebp-64]
722	paddd	xmm1,[ebp-48]
723	paddd	xmm2,[ebp-32]
724	paddd	xmm3,[ebp-16]
725	movdqa	xmm6,xmm0
726	punpckldq	xmm0,xmm1
727	movdqa	xmm7,xmm2
728	punpckldq	xmm2,xmm3
729	punpckhdq	xmm6,xmm1
730	punpckhdq	xmm7,xmm3
731	movdqa	xmm1,xmm0
732	punpcklqdq	xmm0,xmm2
733	movdqa	xmm3,xmm6
734	punpcklqdq	xmm6,xmm7
735	punpckhqdq	xmm1,xmm2
736	punpckhqdq	xmm3,xmm7
737	movdqu	xmm4,[esi-128]
738	movdqu	xmm5,[esi-64]
739	movdqu	xmm2,[esi]
740	movdqu	xmm7,[64+esi]
741	lea	esi,[16+esi]
742	pxor	xmm4,xmm0
743	movdqa	xmm0,[ebx]
744	pxor	xmm5,xmm1
745	movdqa	xmm1,[16+ebx]
746	pxor	xmm6,xmm2
747	movdqa	xmm2,[32+ebx]
748	pxor	xmm7,xmm3
749	movdqa	xmm3,[48+ebx]
750	movdqu	[edi-128],xmm4
751	movdqu	[edi-64],xmm5
752	movdqu	[edi],xmm6
753	movdqu	[64+edi],xmm7
754	lea	edi,[16+edi]
; Rows 8..11 (bytes 32..47 of each block).
755	paddd	xmm0,[ebp]
756	paddd	xmm1,[16+ebp]
757	paddd	xmm2,[32+ebp]
758	paddd	xmm3,[48+ebp]
759	movdqa	xmm6,xmm0
760	punpckldq	xmm0,xmm1
761	movdqa	xmm7,xmm2
762	punpckldq	xmm2,xmm3
763	punpckhdq	xmm6,xmm1
764	punpckhdq	xmm7,xmm3
765	movdqa	xmm1,xmm0
766	punpcklqdq	xmm0,xmm2
767	movdqa	xmm3,xmm6
768	punpcklqdq	xmm6,xmm7
769	punpckhqdq	xmm1,xmm2
770	punpckhqdq	xmm3,xmm7
771	movdqu	xmm4,[esi-128]
772	movdqu	xmm5,[esi-64]
773	movdqu	xmm2,[esi]
774	movdqu	xmm7,[64+esi]
775	lea	esi,[16+esi]
776	pxor	xmm4,xmm0
777	movdqa	xmm0,[64+ebx]
778	pxor	xmm5,xmm1
779	movdqa	xmm1,[80+ebx]
780	pxor	xmm6,xmm2
781	movdqa	xmm2,[96+ebx]
782	pxor	xmm7,xmm3
783	movdqa	xmm3,[112+ebx]
784	movdqu	[edi-128],xmm4
785	movdqu	[edi-64],xmm5
786	movdqu	[edi],xmm6
787	movdqu	[64+edi],xmm7
788	lea	edi,[16+edi]
; Rows 12..15 (bytes 48..63 of each block).
789	paddd	xmm0,[64+ebp]
790	paddd	xmm1,[80+ebp]
791	paddd	xmm2,[96+ebp]
792	paddd	xmm3,[112+ebp]
793	movdqa	xmm6,xmm0
794	punpckldq	xmm0,xmm1
795	movdqa	xmm7,xmm2
796	punpckldq	xmm2,xmm3
797	punpckhdq	xmm6,xmm1
798	punpckhdq	xmm7,xmm3
799	movdqa	xmm1,xmm0
800	punpcklqdq	xmm0,xmm2
801	movdqa	xmm3,xmm6
802	punpcklqdq	xmm6,xmm7
803	punpckhqdq	xmm1,xmm2
804	punpckhqdq	xmm3,xmm7
805	movdqu	xmm4,[esi-128]
806	movdqu	xmm5,[esi-64]
807	movdqu	xmm2,[esi]
808	movdqu	xmm7,[64+esi]
; 3*16 + 208 = 256: the pointers end up one whole batch further on.
809	lea	esi,[208+esi]
810	pxor	xmm4,xmm0
811	pxor	xmm5,xmm1
812	pxor	xmm6,xmm2
813	pxor	xmm7,xmm3
814	movdqu	[edi-128],xmm4
815	movdqu	[edi-64],xmm5
816	movdqu	[edi],xmm6
817	movdqu	[64+edi],xmm7
818	lea	edi,[208+edi]
; ecx carries a -256 bias: no borrow means another full batch is available.
819	sub	ecx,256
820	jnc	NEAR L$007outer_loop
; Undo the bias; zero means the input was an exact multiple of 256 bytes.
821	add	ecx,256
822	jz	NEAR L$009done
; Hand over to the 1-block path for the remaining 1..255 bytes: restore the
; two input pointers, remove the +128 bias from esi/edi, and rebuild row 3
; as (lane-0 counter of the last batch + 4) merged with dwords 1..3 of the
; caller's 16-byte block (mask 0,-1,-1,-1 at [112+eax]).
823	mov	ebx,DWORD [520+esp]
824	lea	esi,[esi-128]
825	mov	edx,DWORD [516+esp]
826	lea	edi,[edi-128]
827	movd	xmm2,DWORD [64+ebp]
828	movdqu	xmm3,[ebx]
829	paddd	xmm2,[96+eax]
830	pand	xmm3,[112+eax]
831	por	xmm3,xmm2
; ---- 1-block path ----
; xmm0..xmm3 = state rows 0..3 (constants, the 32 bytes at edx, counter row);
; xmm6 / xmm7 = the rotate-by-16 / rotate-by-8 pshufb masks.
; The four input rows are also kept at [0..63+esp] for the final addition.
832L$0061x:
833	movdqa	xmm0,[32+eax]
834	movdqu	xmm1,[edx]
835	movdqu	xmm2,[16+edx]
836	movdqa	xmm6,[eax]
837	movdqa	xmm7,[16+eax]
; NOTE(review): this dword store to [48+esp] is overwritten by the
; `movdqa [48+esp],xmm3` four instructions below, before anything reads it.
838	mov	DWORD [48+esp],ebp
839	movdqa	[esp],xmm0
840	movdqa	[16+esp],xmm1
841	movdqa	[32+esp],xmm2
842	movdqa	[48+esp],xmm3
843	mov	edx,10
844	jmp	NEAR L$010loop1x
845align	16
; Next single block: reload rows 0..2 and add {1,0,0,0} to the stored row 3.
846L$011outer1x:
847	movdqa	xmm3,[80+eax]
848	movdqa	xmm0,[esp]
849	movdqa	xmm1,[16+esp]
850	movdqa	xmm2,[32+esp]
851	paddd	xmm3,[48+esp]
852	mov	edx,10
853	movdqa	[48+esp],xmm3
854	jmp	NEAR L$010loop1x
855align	16
; One iteration = column round, rotate rows 1..3 so the diagonals line up as
; columns, same round again, rotate the rows back. The `db 102,15,56,0,222`
; and `...,223` sequences are the hand-encoded bytes of
; `pshufb xmm3,xmm6` (rotate left 16) and `pshufb xmm3,xmm7` (rotate left 8).
856L$010loop1x:
857	paddd	xmm0,xmm1
858	pxor	xmm3,xmm0
859db	102,15,56,0,222
860	paddd	xmm2,xmm3
861	pxor	xmm1,xmm2
862	movdqa	xmm4,xmm1
863	psrld	xmm1,20
864	pslld	xmm4,12
865	por	xmm1,xmm4
866	paddd	xmm0,xmm1
867	pxor	xmm3,xmm0
868db	102,15,56,0,223
869	paddd	xmm2,xmm3
870	pxor	xmm1,xmm2
871	movdqa	xmm4,xmm1
872	psrld	xmm1,25
873	pslld	xmm4,7
874	por	xmm1,xmm4
; Diagonalise: pshufd 78 / 57 / 147 rotate the dwords of rows 2 / 1 / 3 by
; two, one and three positions respectively.
875	pshufd	xmm2,xmm2,78
876	pshufd	xmm1,xmm1,57
877	pshufd	xmm3,xmm3,147
878	nop
879	paddd	xmm0,xmm1
880	pxor	xmm3,xmm0
881db	102,15,56,0,222
882	paddd	xmm2,xmm3
883	pxor	xmm1,xmm2
884	movdqa	xmm4,xmm1
885	psrld	xmm1,20
886	pslld	xmm4,12
887	por	xmm1,xmm4
888	paddd	xmm0,xmm1
889	pxor	xmm3,xmm0
890db	102,15,56,0,223
891	paddd	xmm2,xmm3
892	pxor	xmm1,xmm2
893	movdqa	xmm4,xmm1
894	psrld	xmm1,25
895	pslld	xmm4,7
896	por	xmm1,xmm4
; Undo the diagonalisation (the selectors for rows 1 and 3 are swapped).
897	pshufd	xmm2,xmm2,78
898	pshufd	xmm1,xmm1,147
899	pshufd	xmm3,xmm3,57
900	dec	edx
901	jnz	NEAR L$010loop1x
; Add the input rows saved on the stack to form the output block.
902	paddd	xmm0,[esp]
903	paddd	xmm1,[16+esp]
904	paddd	xmm2,[32+esp]
905	paddd	xmm3,[48+esp]
; Fewer than 64 bytes left: finish bytewise (unsigned compare).
906	cmp	ecx,64
907	jb	NEAR L$012tail
; Full block: XOR 64 source bytes with the block and store them.
908	movdqu	xmm4,[esi]
909	movdqu	xmm5,[16+esi]
910	pxor	xmm0,xmm4
911	movdqu	xmm4,[32+esi]
912	pxor	xmm1,xmm5
913	movdqu	xmm5,[48+esi]
914	pxor	xmm2,xmm4
915	pxor	xmm3,xmm5
916	lea	esi,[64+esi]
917	movdqu	[edi],xmm0
918	movdqu	[16+edi],xmm1
919	movdqu	[32+edi],xmm2
920	movdqu	[48+edi],xmm3
921	lea	edi,[64+edi]
922	sub	ecx,64
923	jnz	NEAR L$011outer1x
924	jmp	NEAR L$009done
; Partial final block: spill the block to [0..63+esp] (the saved input rows
; are no longer needed) and XOR ecx bytes one at a time. eax and edx are
; zeroed and then only used through al / dl; ebp is the byte index.
925L$012tail:
926	movdqa	[esp],xmm0
927	movdqa	[16+esp],xmm1
928	movdqa	[32+esp],xmm2
929	movdqa	[48+esp],xmm3
930	xor	eax,eax
931	xor	edx,edx
932	xor	ebp,ebp
; The index is bumped before the store, hence the -1 displacement.
933L$013tail_loop:
934	mov	al,BYTE [ebp*1+esp]
935	mov	dl,BYTE [ebp*1+esi]
936	lea	ebp,[1+ebp]
937	xor	al,dl
938	mov	BYTE [ebp*1+edi-1],al
939	dec	ecx
940	jnz	NEAR L$013tail_loop
; Restore the caller's esp from the frame, then the saved registers.
941L$009done:
942	mov	esp,DWORD [512+esp]
943	pop	edi
944	pop	esi
945	pop	ebx
946	pop	ebp
947	ret
; Constant table for _ChaCha20_ctr32_ssse3, addressed as [offset+eax].
; It is 64-byte aligned, so every 16-byte row is usable as an aligned operand.
948align	64
949L$ssse3_data:
; +0:   pshufb mask that rotates each dword left by 16 bits.
950db	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
; +16:  pshufb mask that rotates each dword left by 8 bits.
951db	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
; +32:  state row 0; as little-endian bytes these spell "expand 32-byte k".
952dd	1634760805,857760878,2036477234,1797285236
; +48:  per-lane counter offsets for the four parallel blocks.
953dd	0,1,2,3
; +64:  counter step per 4-block batch (also subtracted once during setup).
954dd	4,4,4,4
; +80:  counter step for the one-block path.
955dd	1,0,0,0
; +96:  added to the lane-0 counter when leaving the 4-block path.
956dd	4,0,0,0
; +112: mask that clears dword 0 and keeps dwords 1..3 of the counter row.
957dd	0,-1,-1,-1
958align	64
; NUL-terminated ASCII: "ChaCha20 for x86, CRYPTOGAMS by <appro@openssl.org>".
959db	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
960db	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
961db	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
962db	114,103,62,0
; Taken when the output format is not win32: none of the code above is
; assembled and the file consists of the single `ret` below.
963%else
964; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
965ret
966%endif
967
967