1; This file is generated from a similarly-named Perl script in the BoringSSL
2; source tree. Do not edit by hand.
3
4%ifidn __OUTPUT_FORMAT__, win64
5default	rel
6%define XMMWORD
7%define YMMWORD
8%define ZMMWORD
9%define _CET_ENDBR
10
11%include "ring_core_generated/prefix_symbols_nasm.inc"
12section	.text code align=64
13
14
15EXTERN	OPENSSL_ia32cap_P
16
17global	bn_mul_mont_gather5
18; bn_mul_mont_gather5 (Win64 entry). In: rcx, rdx, r8, r9, [40+rsp], [48+rsp], DWORD[56+rsp]. Out: rax = 1. Presumably (rp, ap, table, np, n0, num, power); the C prototype is not visible here -- confirm.
19ALIGN	64
20bn_mul_mont_gather5:
21	mov	QWORD[8+rsp],rdi	;WIN64 prologue
22	mov	QWORD[16+rsp],rsi	; rdi/rsi are parked just above the return address and reloaded at $L$mul_epilogue
23	mov	rax,rsp
24$L$SEH_begin_bn_mul_mont_gather5:
25	mov	rdi,rcx	; arg1 -> rdi: result words are stored through rdi in $L$sub and $L$copy
26	mov	rsi,rdx	; arg2 -> rsi: operand "a", one qword read per inner step
27	mov	rdx,r8	; arg3 -> rdx: base of the gather table
28	mov	rcx,r9	; arg4 -> rcx: vector "n", multiplied by rbp and subtracted in $L$sub
29	mov	r8,QWORD[40+rsp]	; arg5 -> r8: pointer, dereferenced once after the first gather
30	mov	r9,QWORD[48+rsp]	; arg6 -> r9: word count, used as the loop bound
31; The roles noted above are read off from how each register is used further down in this routine.
32
33
34_CET_ENDBR
35	mov	r9d,r9d	; keep only the low 32 bits of the count (a 32-bit write zeroes the upper half)
36	mov	rax,rsp	; rax = stack pointer at entry; saved into the frame after the page walk
37
38	test	r9d,7
39	jnz	NEAR $L$mul_enter	; count is not a multiple of 8: use the one-word-at-a-time loops below
40	lea	r11,[OPENSSL_ia32cap_P]
41	mov	r11d,DWORD[8+r11]	; r11d = dword at OPENSSL_ia32cap_P+8, tested at $L$mul4x_enter
42	jmp	NEAR $L$mul4x_enter	; count is a multiple of 8: continue in the 4x code
43
44ALIGN	16
45$L$mul_enter:
46	movd	xmm5,DWORD[56+rsp]	; xmm5 = 7th argument (32-bit), read before the pushes move rsp
47	push	rbx
48
49	push	rbp
50
51	push	r12
52
53	push	r13
54
55	push	r14
56
57	push	r15
58; The six pushed registers now sit at [-8] .. [-48] relative to the entry rsp kept in rax.
59
60	neg	r9
61	mov	r11,rsp
62	lea	r10,[((-280))+r9*8+rsp]	; r10 = rsp - 8*count - 280 (r9 is negated at this point)
63	neg	r9
64	and	r10,-1024	; round the new frame base down to a multiple of 1024
65
66; rsp is moved down to r10 in steps of at most 4096 bytes and one qword is
67; read at every step, so every page between the old and the new stack
68; pointer is touched in descending order (presumably so the guard page is
69; hit in sequence -- confirm).
70
71
72
73
74	sub	r11,r10
75	and	r11,-4096
76	lea	rsp,[r11*1+r10]	; rsp = r10 + ((old rsp - r10) rounded down to a multiple of 4096)
77	mov	r11,QWORD[rsp]	; touch the page
78	cmp	rsp,r10
79	ja	NEAR $L$mul_page_walk
80	jmp	NEAR $L$mul_page_walk_done
81
82$L$mul_page_walk:
83	lea	rsp,[((-4096))+rsp]
84	mov	r11,QWORD[rsp]	; touch the page
85	cmp	rsp,r10
86	ja	NEAR $L$mul_page_walk
87$L$mul_page_walk_done:
88
89	lea	r10,[$L$inc]	; $L$inc is defined outside this view; two 16-byte vectors are loaded from it below
90	mov	QWORD[8+r9*8+rsp],rax	; save the entry rsp in the frame at [rsp + 8*count + 8]
91
92$L$mul_body:
93
94	lea	r12,[128+rdx]	; r12 = table + 128, so one 256-byte row is addressed as [-128+r12] .. [112+r12]
95	movdqa	xmm0,XMMWORD[r10]
96	movdqa	xmm1,XMMWORD[16+r10]
97	lea	r10,[((24-112))+r9*8+rsp]
98	and	r10,-16	; 16-byte aligned; the 16 masks are written to [112+r10] .. [352+r10]
99; Build 16 masks: each counter vector is compared lane-wise with the broadcast index; equal lanes become all-ones.
100	pshufd	xmm5,xmm5,0	; broadcast the index to all four dword lanes
101	movdqa	xmm4,xmm1	; xmm4 keeps the second $L$inc vector, which is added to form each following counter
102	movdqa	xmm2,xmm1
103	paddd	xmm1,xmm0
104	pcmpeqd	xmm0,xmm5
105	DB	0x67	; emits a single 0x67 byte in front of the next instruction (presumably padding -- confirm)
106	movdqa	xmm3,xmm4
107	paddd	xmm2,xmm1
108	pcmpeqd	xmm1,xmm5
109	movdqa	XMMWORD[112+r10],xmm0	; mask 0
110	movdqa	xmm0,xmm4
111
112	paddd	xmm3,xmm2
113	pcmpeqd	xmm2,xmm5
114	movdqa	XMMWORD[128+r10],xmm1	; mask 1
115	movdqa	xmm1,xmm4
116
117	paddd	xmm0,xmm3
118	pcmpeqd	xmm3,xmm5
119	movdqa	XMMWORD[144+r10],xmm2	; mask 2
120	movdqa	xmm2,xmm4
121
122	paddd	xmm1,xmm0
123	pcmpeqd	xmm0,xmm5
124	movdqa	XMMWORD[160+r10],xmm3	; mask 3
125	movdqa	xmm3,xmm4
126	paddd	xmm2,xmm1
127	pcmpeqd	xmm1,xmm5
128	movdqa	XMMWORD[176+r10],xmm0	; mask 4
129	movdqa	xmm0,xmm4
130
131	paddd	xmm3,xmm2
132	pcmpeqd	xmm2,xmm5
133	movdqa	XMMWORD[192+r10],xmm1	; mask 5
134	movdqa	xmm1,xmm4
135
136	paddd	xmm0,xmm3
137	pcmpeqd	xmm3,xmm5
138	movdqa	XMMWORD[208+r10],xmm2	; mask 6
139	movdqa	xmm2,xmm4
140
141	paddd	xmm1,xmm0
142	pcmpeqd	xmm0,xmm5
143	movdqa	XMMWORD[224+r10],xmm3	; mask 7
144	movdqa	xmm3,xmm4
145	paddd	xmm2,xmm1
146	pcmpeqd	xmm1,xmm5
147	movdqa	XMMWORD[240+r10],xmm0	; mask 8
148	movdqa	xmm0,xmm4
149
150	paddd	xmm3,xmm2
151	pcmpeqd	xmm2,xmm5
152	movdqa	XMMWORD[256+r10],xmm1	; mask 9
153	movdqa	xmm1,xmm4
154
155	paddd	xmm0,xmm3
156	pcmpeqd	xmm3,xmm5
157	movdqa	XMMWORD[272+r10],xmm2	; mask 10
158	movdqa	xmm2,xmm4
159
160	paddd	xmm1,xmm0
161	pcmpeqd	xmm0,xmm5
162	movdqa	XMMWORD[288+r10],xmm3	; mask 11
163	movdqa	xmm3,xmm4
164	paddd	xmm2,xmm1
165	pcmpeqd	xmm1,xmm5
166	movdqa	XMMWORD[304+r10],xmm0	; mask 12 (also still live in xmm0)
167
168	paddd	xmm3,xmm2
169	DB	0x67	; lone 0x67 byte, as above
170	pcmpeqd	xmm2,xmm5
171	movdqa	XMMWORD[320+r10],xmm1	; mask 13 (also still live in xmm1)
172
173	pcmpeqd	xmm3,xmm5
174	movdqa	XMMWORD[336+r10],xmm2	; mask 14 (also still live in xmm2)
175	pand	xmm0,XMMWORD[64+r12]	; first gather: AND every 16-byte slot of the row with its mask and OR the results
176; All 16 slots of the row are read whatever the index is; only the slot whose mask is all-ones contributes.
177	pand	xmm1,XMMWORD[80+r12]
178	pand	xmm2,XMMWORD[96+r12]
179	movdqa	XMMWORD[352+r10],xmm3	; mask 15
180	pand	xmm3,XMMWORD[112+r12]
181	por	xmm0,xmm2
182	por	xmm1,xmm3
183	movdqa	xmm4,XMMWORD[((-128))+r12]
184	movdqa	xmm5,XMMWORD[((-112))+r12]
185	movdqa	xmm2,XMMWORD[((-96))+r12]
186	pand	xmm4,XMMWORD[112+r10]
187	movdqa	xmm3,XMMWORD[((-80))+r12]
188	pand	xmm5,XMMWORD[128+r10]
189	por	xmm0,xmm4
190	pand	xmm2,XMMWORD[144+r10]
191	por	xmm1,xmm5
192	pand	xmm3,XMMWORD[160+r10]
193	por	xmm0,xmm2
194	por	xmm1,xmm3
195	movdqa	xmm4,XMMWORD[((-64))+r12]
196	movdqa	xmm5,XMMWORD[((-48))+r12]
197	movdqa	xmm2,XMMWORD[((-32))+r12]
198	pand	xmm4,XMMWORD[176+r10]
199	movdqa	xmm3,XMMWORD[((-16))+r12]
200	pand	xmm5,XMMWORD[192+r10]
201	por	xmm0,xmm4
202	pand	xmm2,XMMWORD[208+r10]
203	por	xmm1,xmm5
204	pand	xmm3,XMMWORD[224+r10]
205	por	xmm0,xmm2
206	por	xmm1,xmm3
207	movdqa	xmm4,XMMWORD[r12]
208	movdqa	xmm5,XMMWORD[16+r12]
209	movdqa	xmm2,XMMWORD[32+r12]
210	pand	xmm4,XMMWORD[240+r10]
211	movdqa	xmm3,XMMWORD[48+r12]
212	pand	xmm5,XMMWORD[256+r10]
213	por	xmm0,xmm4
214	pand	xmm2,XMMWORD[272+r10]
215	por	xmm1,xmm5
216	pand	xmm3,XMMWORD[288+r10]
217	por	xmm0,xmm2
218	por	xmm1,xmm3
219	por	xmm0,xmm1
220
221	pshufd	xmm1,xmm0,0x4e	; swap the two 64-bit halves
222	por	xmm0,xmm1	; low qword of xmm0 = OR of both halves = the selected 64-bit word
223	lea	r12,[256+r12]	; advance to the next 256-byte table row
224DB	102,72,15,126,195	; bytes 66 48 0F 7E C3 = movq rbx,xmm0 (rbx = gathered multiplier word "b")
225
226	mov	r8,QWORD[r8]	; r8 = qword that arg5 points to (called n0 below)
227	mov	rax,QWORD[rsi]	; rax = a[0]
228
229	xor	r14,r14	; r14 = outer (row) index i = 0
230	xor	r15,r15	; r15 = inner (word) index j = 0
231
232	mov	rbp,r8
233	mul	rbx	; rdx:rax = a[0] * b
234	mov	r10,rax
235	mov	rax,QWORD[rcx]	; rax = n[0]
236
237	imul	rbp,r10	; rbp = m = low64(a[0]*b) * n0
238	mov	r11,rdx
239
240	mul	rbp	; rdx:rax = n[0] * m
241	add	r10,rax	; only the carry out of this sum is used; r10 is overwritten before it is read again
242	mov	rax,QWORD[8+rsi]	; rax = a[1]
243	adc	rdx,0
244	mov	r13,rdx
245
246	lea	r15,[1+r15]	; j = 1
247	jmp	NEAR $L$1st_enter
248
249ALIGN	16
250$L$1st:
251	add	r13,rax	; add low64(n[j-1]*m); r15 was already incremented
252	mov	rax,QWORD[r15*8+rsi]	; rax = a[j]
253	adc	rdx,0
254	add	r13,r11	; add the a*b partial sum for the same column
255	mov	r11,r10
256	adc	rdx,0
257	mov	QWORD[((-16))+r15*8+rsp],r13	; store the finished column at tp[j-2] (tp = temporary vector at [rsp])
258	mov	r13,rdx
259
260$L$1st_enter:
261	mul	rbx	; rdx:rax = a[j] * b
262	add	r11,rax
263	mov	rax,QWORD[r15*8+rcx]	; rax = n[j]
264	adc	rdx,0
265	lea	r15,[1+r15]	; j++ (lea leaves the flags untouched)
266	mov	r10,rdx
267
268	mul	rbp	; rdx:rax = n[j] * m
269	cmp	r15,r9
270	jne	NEAR $L$1st	; loop until j == count
271
272; Tail of the first row: fold the last products and write tp[count-2], tp[count-1] and the top carry tp[count].
273	add	r13,rax
274	adc	rdx,0
275	add	r13,r11
276	adc	rdx,0
277	mov	QWORD[((-16))+r9*8+rsp],r13	; tp[count-2]
278	mov	r13,rdx
279	mov	r11,r10
280
281	xor	rdx,rdx
282	add	r13,r11
283	adc	rdx,0
284	mov	QWORD[((-8))+r9*8+rsp],r13	; tp[count-1]
285	mov	QWORD[r9*8+rsp],rdx	; tp[count] = carry out
286
287	lea	r14,[1+r14]	; i = 1
288	jmp	NEAR $L$outer
289ALIGN	16
290$L$outer:
291	lea	rdx,[((24+128))+r9*8+rsp]
292	and	rdx,-16	; rdx = mask area + 128: the 16 masks are addressed as [-128+rdx] .. [112+rdx]
293	pxor	xmm4,xmm4
294	pxor	xmm5,xmm5
295	movdqa	xmm0,XMMWORD[((-128))+r12]	; gather the next multiplier word: every slot of the row is ANDed with its mask
296	movdqa	xmm1,XMMWORD[((-112))+r12]
297	movdqa	xmm2,XMMWORD[((-96))+r12]
298	movdqa	xmm3,XMMWORD[((-80))+r12]
299	pand	xmm0,XMMWORD[((-128))+rdx]
300	pand	xmm1,XMMWORD[((-112))+rdx]
301	por	xmm4,xmm0
302	pand	xmm2,XMMWORD[((-96))+rdx]
303	por	xmm5,xmm1
304	pand	xmm3,XMMWORD[((-80))+rdx]
305	por	xmm4,xmm2
306	por	xmm5,xmm3
307	movdqa	xmm0,XMMWORD[((-64))+r12]
308	movdqa	xmm1,XMMWORD[((-48))+r12]
309	movdqa	xmm2,XMMWORD[((-32))+r12]
310	movdqa	xmm3,XMMWORD[((-16))+r12]
311	pand	xmm0,XMMWORD[((-64))+rdx]
312	pand	xmm1,XMMWORD[((-48))+rdx]
313	por	xmm4,xmm0
314	pand	xmm2,XMMWORD[((-32))+rdx]
315	por	xmm5,xmm1
316	pand	xmm3,XMMWORD[((-16))+rdx]
317	por	xmm4,xmm2
318	por	xmm5,xmm3
319	movdqa	xmm0,XMMWORD[r12]
320	movdqa	xmm1,XMMWORD[16+r12]
321	movdqa	xmm2,XMMWORD[32+r12]
322	movdqa	xmm3,XMMWORD[48+r12]
323	pand	xmm0,XMMWORD[rdx]
324	pand	xmm1,XMMWORD[16+rdx]
325	por	xmm4,xmm0
326	pand	xmm2,XMMWORD[32+rdx]
327	por	xmm5,xmm1
328	pand	xmm3,XMMWORD[48+rdx]
329	por	xmm4,xmm2
330	por	xmm5,xmm3
331	movdqa	xmm0,XMMWORD[64+r12]
332	movdqa	xmm1,XMMWORD[80+r12]
333	movdqa	xmm2,XMMWORD[96+r12]
334	movdqa	xmm3,XMMWORD[112+r12]
335	pand	xmm0,XMMWORD[64+rdx]
336	pand	xmm1,XMMWORD[80+rdx]
337	por	xmm4,xmm0
338	pand	xmm2,XMMWORD[96+rdx]
339	por	xmm5,xmm1
340	pand	xmm3,XMMWORD[112+rdx]
341	por	xmm4,xmm2
342	por	xmm5,xmm3
343	por	xmm4,xmm5
344
345	pshufd	xmm0,xmm4,0x4e	; swap the two 64-bit halves
346	por	xmm0,xmm4	; low qword = selected word
347	lea	r12,[256+r12]	; next table row
348
349	mov	rax,QWORD[rsi]	; rax = a[0]
350DB	102,72,15,126,195	; bytes 66 48 0F 7E C3 = movq rbx,xmm0 (rbx = multiplier word for this row)
351
352	xor	r15,r15	; j = 0
353	mov	rbp,r8
354	mov	r10,QWORD[rsp]	; r10 = tp[0]
355
356	mul	rbx	; rdx:rax = a[0] * b
357	add	r10,rax	; r10 = tp[0] + low64(a[0]*b)
358	mov	rax,QWORD[rcx]	; rax = n[0]
359	adc	rdx,0
360
361	imul	rbp,r10	; rbp = m = r10 * n0 for this row
362	mov	r11,rdx
363
364	mul	rbp	; rdx:rax = n[0] * m
365	add	r10,rax	; only the carry is kept; r10 is reloaded two lines down
366	mov	rax,QWORD[8+rsi]	; rax = a[1]
367	adc	rdx,0
368	mov	r10,QWORD[8+rsp]	; r10 = tp[1]
369	mov	r13,rdx
370
371	lea	r15,[1+r15]	; j = 1
372	jmp	NEAR $L$inner_enter
373
374ALIGN	16
375$L$inner:
376	add	r13,rax	; add low64(n[j-1]*m)
377	mov	rax,QWORD[r15*8+rsi]	; rax = a[j]
378	adc	rdx,0
379	add	r13,r10	; add the a*b + tp partial sum for the same column
380	mov	r10,QWORD[r15*8+rsp]	; r10 = tp[j]
381	adc	rdx,0
382	mov	QWORD[((-16))+r15*8+rsp],r13	; tp[j-2] = finished column
383	mov	r13,rdx
384
385$L$inner_enter:
386	mul	rbx	; rdx:rax = a[j] * b
387	add	r11,rax
388	mov	rax,QWORD[r15*8+rcx]	; rax = n[j]
389	adc	rdx,0
390	add	r10,r11	; r10 = tp[j] + low part of the a*b chain
391	mov	r11,rdx
392	adc	r11,0
393	lea	r15,[1+r15]	; j++
394
395	mul	rbp	; rdx:rax = n[j] * m
396	cmp	r15,r9
397	jne	NEAR $L$inner	; loop until j == count
398
399	add	r13,rax
400	adc	rdx,0
401	add	r13,r10
402	mov	r10,QWORD[r9*8+rsp]	; r10 = previous top word tp[count]
403	adc	rdx,0
404	mov	QWORD[((-16))+r9*8+rsp],r13	; tp[count-2]
405	mov	r13,rdx
406
407	xor	rdx,rdx
408	add	r13,r11
409	adc	rdx,0
410	add	r13,r10
411	adc	rdx,0
412	mov	QWORD[((-8))+r9*8+rsp],r13	; tp[count-1]
413	mov	QWORD[r9*8+rsp],rdx	; tp[count] = carry out
414
415	lea	r14,[1+r14]	; i++
416	cmp	r14,r9
417	jb	NEAR $L$outer	; one row per multiplier word, count rows in total
418; Final step: rp = tp - n with borrow, then keep either rp or tp word by word using AND masks (no branch on the borrow).
419	xor	r14,r14	; index = 0; xor also clears CF, so the first sbb starts without a borrow
420	mov	rax,QWORD[rsp]	; rax = tp[0]
421	lea	rsi,[rsp]	; rsi = tp
422	mov	r15,r9	; r15 = remaining words
423	jmp	NEAR $L$sub
424ALIGN	16
425$L$sub:	sbb	rax,QWORD[r14*8+rcx]	; rax = tp[i] - n[i] - borrow
426	mov	QWORD[r14*8+rdi],rax	; rp[i] = difference
427	mov	rax,QWORD[8+r14*8+rsi]	; rax = tp[i+1]
428	lea	r14,[1+r14]	; i++ without touching the flags
429	dec	r15	; dec leaves CF alone, so the borrow carries into the next sbb
430	jnz	NEAR $L$sub
431
432	sbb	rax,0	; rax = tp[count] - final borrow; used below as the AND mask for tp
433	mov	rbx,-1
434	xor	rbx,rax	; rbx = NOT rax, the AND mask for rp
435	xor	r14,r14
436	mov	r15,r9
437
438$L$copy:
439	mov	rcx,QWORD[r14*8+rdi]	; rcx = rp[i] (the difference)
440	mov	rdx,QWORD[r14*8+rsp]	; rdx = tp[i]
441	and	rcx,rbx
442	and	rdx,rax
443	mov	QWORD[r14*8+rsp],r14	; overwrite tp[i] with the loop index so the intermediate value does not remain on the stack
444	or	rdx,rcx	; rdx = (tp[i] AND rax) OR (rp[i] AND NOT rax)
445	mov	QWORD[r14*8+rdi],rdx
446	lea	r14,[1+r14]
447	sub	r15,1
448	jnz	NEAR $L$copy
449
450	mov	rsi,QWORD[8+r9*8+rsp]	; rsi = entry rsp saved before $L$mul_body
451
452	mov	rax,1	; return value
453
454	mov	r15,QWORD[((-48))+rsi]	; reload the six registers pushed at $L$mul_enter
455
456	mov	r14,QWORD[((-40))+rsi]
457
458	mov	r13,QWORD[((-32))+rsi]
459
460	mov	r12,QWORD[((-24))+rsi]
461
462	mov	rbp,QWORD[((-16))+rsi]
463
464	mov	rbx,QWORD[((-8))+rsi]
465
466	lea	rsp,[rsi]	; rsp = entry rsp (points at the return address again)
467
468$L$mul_epilogue:
469	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
470	mov	rsi,QWORD[16+rsp]
471	ret
472
473$L$SEH_end_bn_mul_mont_gather5:
474
475ALIGN	32
476bn_mul4x_mont_gather5:
477	mov	QWORD[8+rsp],rdi	;WIN64 prologue
478	mov	QWORD[16+rsp],rsi	; rdi/rsi are parked just above the return address and reloaded at $L$mul4x_epilogue
479	mov	rax,rsp
480$L$SEH_begin_bn_mul4x_mont_gather5:
481	mov	rdi,rcx	; Win64 arguments (rcx, rdx, r8, r9, [40+rsp], [48+rsp]) are moved into rdi, rsi, rdx, rcx, r8, r9
482	mov	rsi,rdx
483	mov	rdx,r8
484	mov	rcx,r9
485	mov	r8,QWORD[40+rsp]
486	mov	r9,QWORD[48+rsp]
487; This label is not declared global in the visible code; the 4x path sets up a frame and delegates the arithmetic to mul4x_internal.
488
489
490	DB	0x67	; emits a single 0x67 byte in front of the next instruction (presumably padding -- confirm)
491	mov	rax,rsp	; rax = stack pointer at entry
492; $L$mul4x_enter is also the jump target from bn_mul_mont_gather5, which arrives with rax = entry rsp, arguments already remapped and r11d = DWORD[8+OPENSSL_ia32cap_P]. NOTE(review): the fall-through path above does not load r11d in the visible code.
493$L$mul4x_enter:
494	and	r11d,0x80108
495	cmp	r11d,0x80108
496	je	NEAR $L$mulx4x_enter	; all three masked bits set: use the variant at $L$mulx4x_enter (outside this view; presumably the BMI1/BMI2/ADX bits -- confirm)
497	push	rbx
498
499	push	rbp
500
501	push	r12
502
503	push	r13
504
505	push	r14
506
507	push	r15
508; The six pushed registers now sit at [-8] .. [-48] relative to the entry rsp kept in rax.
509$L$mul4x_prologue:
510
511	DB	0x67	; lone 0x67 byte, as above
512	shl	r9d,3	; r9 = count * 8 = length in bytes (the 32-bit write zeroes the upper half)
513	lea	r10,[r9*2+r9]	; r10 = 3 * bytes
514	neg	r9	; r9 = -bytes
515
516; Frame placement: the candidate frame base is rsp - 2*bytes - 320. Its
517; distance from rdi modulo 4096 (r11) decides how much further the frame is
518; moved down, on one of the two paths below (presumably to keep the frame
519; from aliasing the output buffer modulo 4096 -- confirm).
520
521
522
523
524
525	lea	r11,[((-320))+r9*2+rsp]	; r11 = rsp - 2*bytes - 320
526	mov	rbp,rsp
527	sub	r11,rdi
528	and	r11,4095	; r11 = (candidate - rdi) mod 4096
529	cmp	r10,r11
530	jb	NEAR $L$mul4xsp_alt	; taken when 3*bytes is below r11 (unsigned)
531	sub	rbp,r11	; move the frame down by r11 ...
532	lea	rbp,[((-320))+r9*2+rbp]	; ... and reserve 2*bytes + 320
533	jmp	NEAR $L$mul4xsp_done
534
535ALIGN	32
536$L$mul4xsp_alt:
537	lea	r10,[((4096-320))+r9*2]	; r10 = 4096 - 320 - 2*bytes
538	lea	rbp,[((-320))+r9*2+rbp]	; reserve 2*bytes + 320
539	sub	r11,r10
540	mov	r10,0	; mov leaves the flags from the sub intact for the cmovc
541	cmovc	r11,r10	; if the subtraction borrowed, use 0 as the extra adjustment
542	sub	rbp,r11
543$L$mul4xsp_done:
544	and	rbp,-64	; align the frame base down to 64 bytes
545	mov	r11,rsp
546	sub	r11,rbp
547	and	r11,-4096
548	lea	rsp,[rbp*1+r11]	; rsp = rbp + ((old rsp - rbp) rounded down to a multiple of 4096)
549	mov	r10,QWORD[rsp]	; touch the page
550	cmp	rsp,rbp
551	ja	NEAR $L$mul4x_page_walk
552	jmp	NEAR $L$mul4x_page_walk_done
553
554$L$mul4x_page_walk:
555	lea	rsp,[((-4096))+rsp]	; step down one 4096-byte page at a time, reading a qword at each step
556	mov	r10,QWORD[rsp]
557	cmp	rsp,rbp
558	ja	NEAR $L$mul4x_page_walk
559$L$mul4x_page_walk_done:
560
561	neg	r9	; r9 = +bytes again
562
563	mov	QWORD[40+rsp],rax	; save the entry rsp in the frame
564
565$L$mul4x_body:
566
567	call	mul4x_internal	; reads the 7th argument through rax ([56+rax]) and takes rdi, rsi, rdx, rcx, r8, r9 as set up above
568
569	mov	rsi,QWORD[40+rsp]	; rsi = saved entry rsp
570
571	mov	rax,1	; return value
572
573	mov	r15,QWORD[((-48))+rsi]	; reload the six pushed registers
574
575	mov	r14,QWORD[((-40))+rsi]
576
577	mov	r13,QWORD[((-32))+rsi]
578
579	mov	r12,QWORD[((-24))+rsi]
580
581	mov	rbp,QWORD[((-16))+rsi]
582
583	mov	rbx,QWORD[((-8))+rsi]
584
585	lea	rsp,[rsi]	; rsp = entry rsp
586
587$L$mul4x_epilogue:
588	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
589	mov	rsi,QWORD[16+rsp]
590	ret
591
592$L$SEH_end_bn_mul4x_mont_gather5:
593
594
595ALIGN	32
596mul4x_internal:
597; Shared multiply body, called from bn_mul4x_mont_gather5 and bn_power5. In: rax = caller's entry rsp (index read at [56+rax]), rdx = table, rsi = a, rcx = n, r8 = pointer to n0, r9 = length in bytes, rdi = destination. Frame offsets are written as (x+8), apparently for this call's return address. Leaves by jmp to $L$sqr4x_sub_entry (outside this view).
598	shl	r9,5	; r9 = bytes * 32 = count * 256 (one 256-byte table row per word)
599	movd	xmm5,DWORD[56+rax]	; xmm5 = 32-bit index taken from the caller's argument area
600	lea	rax,[$L$inc]	; $L$inc is defined outside this view
601	lea	r13,[128+r9*1+rdx]	; r13 = table + 128 + count*256: the value r12 reaches after the last row
602	shr	r9,5	; back to bytes
603	movdqa	xmm0,XMMWORD[rax]
604	movdqa	xmm1,XMMWORD[16+rax]
605	lea	r10,[((88-112))+r9*1+rsp]	; the 16 masks are written to [112+r10] .. [352+r10], i.e. starting at rsp + 88 + bytes
606	lea	r12,[128+rdx]	; r12 = table + 128, so one row is addressed as [-128+r12] .. [112+r12]
607; Build 16 masks: each counter vector is compared lane-wise with the broadcast index; equal lanes become all-ones.
608	pshufd	xmm5,xmm5,0	; broadcast the index to all four dword lanes
609	movdqa	xmm4,xmm1	; xmm4 keeps the second $L$inc vector, which is added to form each following counter
610	DB	0x67,0x67	; two lone 0x67 bytes in front of the next instruction (presumably padding -- confirm)
611	movdqa	xmm2,xmm1
612	paddd	xmm1,xmm0
613	pcmpeqd	xmm0,xmm5
614	DB	0x67
615	movdqa	xmm3,xmm4
616	paddd	xmm2,xmm1
617	pcmpeqd	xmm1,xmm5
618	movdqa	XMMWORD[112+r10],xmm0	; mask 0
619	movdqa	xmm0,xmm4
620
621	paddd	xmm3,xmm2
622	pcmpeqd	xmm2,xmm5
623	movdqa	XMMWORD[128+r10],xmm1	; mask 1
624	movdqa	xmm1,xmm4
625
626	paddd	xmm0,xmm3
627	pcmpeqd	xmm3,xmm5
628	movdqa	XMMWORD[144+r10],xmm2	; mask 2
629	movdqa	xmm2,xmm4
630
631	paddd	xmm1,xmm0
632	pcmpeqd	xmm0,xmm5
633	movdqa	XMMWORD[160+r10],xmm3	; mask 3
634	movdqa	xmm3,xmm4
635	paddd	xmm2,xmm1
636	pcmpeqd	xmm1,xmm5
637	movdqa	XMMWORD[176+r10],xmm0	; mask 4
638	movdqa	xmm0,xmm4
639
640	paddd	xmm3,xmm2
641	pcmpeqd	xmm2,xmm5
642	movdqa	XMMWORD[192+r10],xmm1	; mask 5
643	movdqa	xmm1,xmm4
644
645	paddd	xmm0,xmm3
646	pcmpeqd	xmm3,xmm5
647	movdqa	XMMWORD[208+r10],xmm2	; mask 6
648	movdqa	xmm2,xmm4
649
650	paddd	xmm1,xmm0
651	pcmpeqd	xmm0,xmm5
652	movdqa	XMMWORD[224+r10],xmm3	; mask 7
653	movdqa	xmm3,xmm4
654	paddd	xmm2,xmm1
655	pcmpeqd	xmm1,xmm5
656	movdqa	XMMWORD[240+r10],xmm0	; mask 8
657	movdqa	xmm0,xmm4
658
659	paddd	xmm3,xmm2
660	pcmpeqd	xmm2,xmm5
661	movdqa	XMMWORD[256+r10],xmm1	; mask 9
662	movdqa	xmm1,xmm4
663
664	paddd	xmm0,xmm3
665	pcmpeqd	xmm3,xmm5
666	movdqa	XMMWORD[272+r10],xmm2	; mask 10
667	movdqa	xmm2,xmm4
668
669	paddd	xmm1,xmm0
670	pcmpeqd	xmm0,xmm5
671	movdqa	XMMWORD[288+r10],xmm3	; mask 11
672	movdqa	xmm3,xmm4
673	paddd	xmm2,xmm1
674	pcmpeqd	xmm1,xmm5
675	movdqa	XMMWORD[304+r10],xmm0	; mask 12 (also still live in xmm0)
676
677	paddd	xmm3,xmm2
678	DB	0x67
679	pcmpeqd	xmm2,xmm5
680	movdqa	XMMWORD[320+r10],xmm1	; mask 13 (also still live in xmm1)
681
682	pcmpeqd	xmm3,xmm5
683	movdqa	XMMWORD[336+r10],xmm2	; mask 14 (also still live in xmm2)
684	pand	xmm0,XMMWORD[64+r12]	; first gather: AND every 16-byte slot of the row with its mask and OR the results
685; All 16 slots of the row are read whatever the index is; only the slot whose mask is all-ones contributes.
686	pand	xmm1,XMMWORD[80+r12]
687	pand	xmm2,XMMWORD[96+r12]
688	movdqa	XMMWORD[352+r10],xmm3	; mask 15
689	pand	xmm3,XMMWORD[112+r12]
690	por	xmm0,xmm2
691	por	xmm1,xmm3
692	movdqa	xmm4,XMMWORD[((-128))+r12]
693	movdqa	xmm5,XMMWORD[((-112))+r12]
694	movdqa	xmm2,XMMWORD[((-96))+r12]
695	pand	xmm4,XMMWORD[112+r10]
696	movdqa	xmm3,XMMWORD[((-80))+r12]
697	pand	xmm5,XMMWORD[128+r10]
698	por	xmm0,xmm4
699	pand	xmm2,XMMWORD[144+r10]
700	por	xmm1,xmm5
701	pand	xmm3,XMMWORD[160+r10]
702	por	xmm0,xmm2
703	por	xmm1,xmm3
704	movdqa	xmm4,XMMWORD[((-64))+r12]
705	movdqa	xmm5,XMMWORD[((-48))+r12]
706	movdqa	xmm2,XMMWORD[((-32))+r12]
707	pand	xmm4,XMMWORD[176+r10]
708	movdqa	xmm3,XMMWORD[((-16))+r12]
709	pand	xmm5,XMMWORD[192+r10]
710	por	xmm0,xmm4
711	pand	xmm2,XMMWORD[208+r10]
712	por	xmm1,xmm5
713	pand	xmm3,XMMWORD[224+r10]
714	por	xmm0,xmm2
715	por	xmm1,xmm3
716	movdqa	xmm4,XMMWORD[r12]
717	movdqa	xmm5,XMMWORD[16+r12]
718	movdqa	xmm2,XMMWORD[32+r12]
719	pand	xmm4,XMMWORD[240+r10]
720	movdqa	xmm3,XMMWORD[48+r12]
721	pand	xmm5,XMMWORD[256+r10]
722	por	xmm0,xmm4
723	pand	xmm2,XMMWORD[272+r10]
724	por	xmm1,xmm5
725	pand	xmm3,XMMWORD[288+r10]
726	por	xmm0,xmm2
727	por	xmm1,xmm3
728	por	xmm0,xmm1
729
730	pshufd	xmm1,xmm0,0x4e	; swap the two 64-bit halves
731	por	xmm0,xmm1	; low qword of xmm0 = selected 64-bit word
732	lea	r12,[256+r12]	; next table row
733DB	102,72,15,126,195	; bytes 66 48 0F 7E C3 = movq rbx,xmm0 (rbx = gathered multiplier word "b")
734
735	mov	QWORD[((16+8))+rsp],r13	; save the end-of-table marker; compared with r12 at the bottom of $L$outer4x
736	mov	QWORD[((56+8))+rsp],rdi	; save the destination pointer; rdi is reused as an accumulator below
737
738	mov	r8,QWORD[r8]	; r8 = qword that the incoming r8 points to (called n0 below)
739	mov	rax,QWORD[rsi]	; rax = a[0]
740	lea	rsi,[r9*1+rsi]	; rsi = one past the end of a
741	neg	r9	; r9 = -bytes, so [rsi+r9] is a[0]
742
743	mov	rbp,r8
744	mul	rbx	; rdx:rax = a[0] * b
745	mov	r10,rax
746	mov	rax,QWORD[rcx]	; rax = n[0]
747
748	imul	rbp,r10	; rbp = m = low64(a[0]*b) * n0
749	lea	r14,[((64+8))+rsp]	; r14 = tp, the temporary vector in the frame
750	mov	r11,rdx
751
752	mul	rbp	; rdx:rax = n[0] * m
753	add	r10,rax	; only the carry is kept; r10 is overwritten before it is read again
754	mov	rax,QWORD[8+r9*1+rsi]	; rax = a[1]
755	adc	rdx,0
756	mov	rdi,rdx
757
758	mul	rbx	; rdx:rax = a[1] * b
759	add	r11,rax
760	mov	rax,QWORD[8+rcx]	; rax = n[1]
761	adc	rdx,0
762	mov	r10,rdx
763
764	mul	rbp	; rdx:rax = n[1] * m
765	add	rdi,rax
766	mov	rax,QWORD[16+r9*1+rsi]	; rax = a[2]
767	adc	rdx,0
768	add	rdi,r11
769	lea	r15,[32+r9]	; r15 = negative byte offset of the next group of four words; counts up to 0
770	lea	rcx,[32+rcx]	; n pointer advances 32 bytes per group
771	adc	rdx,0
772	mov	QWORD[r14],rdi	; tp[0]
773	mov	r13,rdx
774	jmp	NEAR $L$1st4x
775
776ALIGN	32
777$L$1st4x:
778	mul	rbx	; first row, four words per iteration: a[j]*b and n[j]*m chains alternate between r10/r11 and r13/rdi
779	add	r10,rax
780	mov	rax,QWORD[((-16))+rcx]
781	lea	r14,[32+r14]	; tp pointer advances 32 bytes per group
782	adc	rdx,0
783	mov	r11,rdx
784
785	mul	rbp
786	add	r13,rax
787	mov	rax,QWORD[((-8))+r15*1+rsi]
788	adc	rdx,0
789	add	r13,r10
790	adc	rdx,0
791	mov	QWORD[((-24))+r14],r13	; first result word of the group
792	mov	rdi,rdx
793
794	mul	rbx
795	add	r11,rax
796	mov	rax,QWORD[((-8))+rcx]
797	adc	rdx,0
798	mov	r10,rdx
799
800	mul	rbp
801	add	rdi,rax
802	mov	rax,QWORD[r15*1+rsi]
803	adc	rdx,0
804	add	rdi,r11
805	adc	rdx,0
806	mov	QWORD[((-16))+r14],rdi	; second result word of the group
807	mov	r13,rdx
808
809	mul	rbx
810	add	r10,rax
811	mov	rax,QWORD[rcx]
812	adc	rdx,0
813	mov	r11,rdx
814
815	mul	rbp
816	add	r13,rax
817	mov	rax,QWORD[8+r15*1+rsi]
818	adc	rdx,0
819	add	r13,r10
820	adc	rdx,0
821	mov	QWORD[((-8))+r14],r13	; third result word of the group
822	mov	rdi,rdx
823
824	mul	rbx
825	add	r11,rax
826	mov	rax,QWORD[8+rcx]
827	adc	rdx,0
828	mov	r10,rdx
829
830	mul	rbp
831	add	rdi,rax
832	mov	rax,QWORD[16+r15*1+rsi]
833	adc	rdx,0
834	add	rdi,r11
835	lea	rcx,[32+rcx]
836	adc	rdx,0
837	mov	QWORD[r14],rdi	; fourth result word of the group
838	mov	r13,rdx
839
840	add	r15,32
841	jnz	NEAR $L$1st4x	; loop until the offset reaches 0
842; Tail of the first row: the last two a words are addressed at [-8+rsi] and, for the next row, a[0] at [rsi+r9].
843	mul	rbx
844	add	r10,rax
845	mov	rax,QWORD[((-16))+rcx]
846	lea	r14,[32+r14]
847	adc	rdx,0
848	mov	r11,rdx
849
850	mul	rbp
851	add	r13,rax
852	mov	rax,QWORD[((-8))+rsi]	; rax = last word of a
853	adc	rdx,0
854	add	r13,r10
855	adc	rdx,0
856	mov	QWORD[((-24))+r14],r13
857	mov	rdi,rdx
858
859	mul	rbx
860	add	r11,rax
861	mov	rax,QWORD[((-8))+rcx]	; rax = last word of n
862	adc	rdx,0
863	mov	r10,rdx
864
865	mul	rbp
866	add	rdi,rax
867	mov	rax,QWORD[r9*1+rsi]	; rax = a[0], ready for the next row
868	adc	rdx,0
869	add	rdi,r11
870	adc	rdx,0
871	mov	QWORD[((-16))+r14],rdi
872	mov	r13,rdx
873
874	lea	rcx,[r9*1+rcx]	; add -bytes to the n pointer, undoing the per-group advances
875
876	xor	rdi,rdi
877	add	r13,r10
878	adc	rdi,0	; rdi = carry out of the row; written to [r14] at the start of the next row
879	mov	QWORD[((-8))+r14],r13
880
881	jmp	NEAR $L$outer4x
882
883ALIGN	32
884$L$outer4x:
885	lea	rdx,[((16+128))+r14]	; rdx = r14 + 144: the 16 masks are addressed as [-128+rdx] .. [112+rdx]
886	pxor	xmm4,xmm4
887	pxor	xmm5,xmm5
888	movdqa	xmm0,XMMWORD[((-128))+r12]	; gather the next multiplier word: every slot of the row is ANDed with its mask
889	movdqa	xmm1,XMMWORD[((-112))+r12]
890	movdqa	xmm2,XMMWORD[((-96))+r12]
891	movdqa	xmm3,XMMWORD[((-80))+r12]
892	pand	xmm0,XMMWORD[((-128))+rdx]
893	pand	xmm1,XMMWORD[((-112))+rdx]
894	por	xmm4,xmm0
895	pand	xmm2,XMMWORD[((-96))+rdx]
896	por	xmm5,xmm1
897	pand	xmm3,XMMWORD[((-80))+rdx]
898	por	xmm4,xmm2
899	por	xmm5,xmm3
900	movdqa	xmm0,XMMWORD[((-64))+r12]
901	movdqa	xmm1,XMMWORD[((-48))+r12]
902	movdqa	xmm2,XMMWORD[((-32))+r12]
903	movdqa	xmm3,XMMWORD[((-16))+r12]
904	pand	xmm0,XMMWORD[((-64))+rdx]
905	pand	xmm1,XMMWORD[((-48))+rdx]
906	por	xmm4,xmm0
907	pand	xmm2,XMMWORD[((-32))+rdx]
908	por	xmm5,xmm1
909	pand	xmm3,XMMWORD[((-16))+rdx]
910	por	xmm4,xmm2
911	por	xmm5,xmm3
912	movdqa	xmm0,XMMWORD[r12]
913	movdqa	xmm1,XMMWORD[16+r12]
914	movdqa	xmm2,XMMWORD[32+r12]
915	movdqa	xmm3,XMMWORD[48+r12]
916	pand	xmm0,XMMWORD[rdx]
917	pand	xmm1,XMMWORD[16+rdx]
918	por	xmm4,xmm0
919	pand	xmm2,XMMWORD[32+rdx]
920	por	xmm5,xmm1
921	pand	xmm3,XMMWORD[48+rdx]
922	por	xmm4,xmm2
923	por	xmm5,xmm3
924	movdqa	xmm0,XMMWORD[64+r12]
925	movdqa	xmm1,XMMWORD[80+r12]
926	movdqa	xmm2,XMMWORD[96+r12]
927	movdqa	xmm3,XMMWORD[112+r12]
928	pand	xmm0,XMMWORD[64+rdx]
929	pand	xmm1,XMMWORD[80+rdx]
930	por	xmm4,xmm0
931	pand	xmm2,XMMWORD[96+rdx]
932	por	xmm5,xmm1
933	pand	xmm3,XMMWORD[112+rdx]
934	por	xmm4,xmm2
935	por	xmm5,xmm3
936	por	xmm4,xmm5
937
938	pshufd	xmm0,xmm4,0x4e	; swap the two 64-bit halves
939	por	xmm0,xmm4	; low qword = selected word
940	lea	r12,[256+r12]	; next table row
941DB	102,72,15,126,195	; bytes 66 48 0F 7E C3 = movq rbx,xmm0 (rbx = multiplier word for this row)
942
943	mov	r10,QWORD[r9*1+r14]	; r10 = tp[0] (r14 is at the end of tp, r9 = -bytes)
944	mov	rbp,r8
945	mul	rbx	; rdx:rax = a[0] * b (rax was loaded with a[0] at the end of the previous row)
946	add	r10,rax
947	mov	rax,QWORD[rcx]	; rax = n[0]
948	adc	rdx,0
949
950	imul	rbp,r10	; rbp = m = (tp[0] + low64(a[0]*b)) * n0
951	mov	r11,rdx
952	mov	QWORD[r14],rdi	; store the previous row's carry as the top word of tp
953
954	lea	r14,[r9*1+r14]	; r14 = start of tp
955
956	mul	rbp	; rdx:rax = n[0] * m
957	add	r10,rax	; only the carry is kept
958	mov	rax,QWORD[8+r9*1+rsi]	; rax = a[1]
959	adc	rdx,0
960	mov	rdi,rdx
961
962	mul	rbx
963	add	r11,rax
964	mov	rax,QWORD[8+rcx]	; rax = n[1]
965	adc	rdx,0
966	add	r11,QWORD[8+r14]	; add tp[1]
967	adc	rdx,0
968	mov	r10,rdx
969
970	mul	rbp
971	add	rdi,rax
972	mov	rax,QWORD[16+r9*1+rsi]	; rax = a[2]
973	adc	rdx,0
974	add	rdi,r11
975	lea	r15,[32+r9]	; r15 = negative byte offset of the next group; counts up to 0
976	lea	rcx,[32+rcx]
977	adc	rdx,0
978	mov	r13,rdx
979	jmp	NEAR $L$inner4x
980
981ALIGN	32
982$L$inner4x:
983	mul	rbx	; same four-word group as $L$1st4x, with the previous tp words added in and each result stored one slot lower
984	add	r10,rax
985	mov	rax,QWORD[((-16))+rcx]
986	adc	rdx,0
987	add	r10,QWORD[16+r14]
988	lea	r14,[32+r14]
989	adc	rdx,0
990	mov	r11,rdx
991
992	mul	rbp
993	add	r13,rax
994	mov	rax,QWORD[((-8))+r15*1+rsi]
995	adc	rdx,0
996	add	r13,r10
997	adc	rdx,0
998	mov	QWORD[((-32))+r14],rdi
999	mov	rdi,rdx
1000
1001	mul	rbx
1002	add	r11,rax
1003	mov	rax,QWORD[((-8))+rcx]
1004	adc	rdx,0
1005	add	r11,QWORD[((-8))+r14]
1006	adc	rdx,0
1007	mov	r10,rdx
1008
1009	mul	rbp
1010	add	rdi,rax
1011	mov	rax,QWORD[r15*1+rsi]
1012	adc	rdx,0
1013	add	rdi,r11
1014	adc	rdx,0
1015	mov	QWORD[((-24))+r14],r13
1016	mov	r13,rdx
1017
1018	mul	rbx
1019	add	r10,rax
1020	mov	rax,QWORD[rcx]
1021	adc	rdx,0
1022	add	r10,QWORD[r14]
1023	adc	rdx,0
1024	mov	r11,rdx
1025
1026	mul	rbp
1027	add	r13,rax
1028	mov	rax,QWORD[8+r15*1+rsi]
1029	adc	rdx,0
1030	add	r13,r10
1031	adc	rdx,0
1032	mov	QWORD[((-16))+r14],rdi
1033	mov	rdi,rdx
1034
1035	mul	rbx
1036	add	r11,rax
1037	mov	rax,QWORD[8+rcx]
1038	adc	rdx,0
1039	add	r11,QWORD[8+r14]
1040	adc	rdx,0
1041	mov	r10,rdx
1042
1043	mul	rbp
1044	add	rdi,rax
1045	mov	rax,QWORD[16+r15*1+rsi]
1046	adc	rdx,0
1047	add	rdi,r11
1048	lea	rcx,[32+rcx]
1049	adc	rdx,0
1050	mov	QWORD[((-8))+r14],r13
1051	mov	r13,rdx
1052
1053	add	r15,32
1054	jnz	NEAR $L$inner4x	; loop until the offset reaches 0 (r15 is 0 on exit)
1055
1056	mul	rbx
1057	add	r10,rax
1058	mov	rax,QWORD[((-16))+rcx]
1059	adc	rdx,0
1060	add	r10,QWORD[16+r14]
1061	lea	r14,[32+r14]
1062	adc	rdx,0
1063	mov	r11,rdx
1064
1065	mul	rbp
1066	add	r13,rax
1067	mov	rax,QWORD[((-8))+rsi]	; rax = last word of a
1068	adc	rdx,0
1069	add	r13,r10
1070	adc	rdx,0
1071	mov	QWORD[((-32))+r14],rdi
1072	mov	rdi,rdx
1073
1074	mul	rbx
1075	add	r11,rax
1076	mov	rax,rbp	; operands swapped for the last n product: rax = m ...
1077	mov	rbp,QWORD[((-8))+rcx]	; ... and rbp = last word of n, which stays in rbp for the compare after the loop
1078	adc	rdx,0
1079	add	r11,QWORD[((-8))+r14]
1080	adc	rdx,0
1081	mov	r10,rdx
1082
1083	mul	rbp	; rdx:rax = m * last word of n
1084	add	rdi,rax
1085	mov	rax,QWORD[r9*1+rsi]	; rax = a[0], ready for the next row
1086	adc	rdx,0
1087	add	rdi,r11
1088	adc	rdx,0
1089	mov	QWORD[((-24))+r14],r13
1090	mov	r13,rdx
1091
1092	mov	QWORD[((-16))+r14],rdi
1093	lea	rcx,[r9*1+rcx]	; add -bytes to the n pointer, undoing the per-group advances
1094
1095	xor	rdi,rdi
1096	add	r13,r10
1097	adc	rdi,0
1098	add	r13,QWORD[r14]	; add the previous top word of tp
1099	adc	rdi,0	; rdi = carry out of this row
1100	mov	QWORD[((-8))+r14],r13	; r13 = highest result word below the carry
1101
1102	cmp	r12,QWORD[((16+8))+rsp]	; has the table pointer reached the saved end marker?
1103	jb	NEAR $L$outer4x
1104	xor	rax,rax	; hand-off to the subtraction code at $L$sqr4x_sub_entry starts here
1105	sub	rbp,r13	; CF = 1 when the highest result word exceeds the last word of n
1106	adc	r15,r15	; r15 was 0 on loop exit, so r15 = CF
1107	or	rdi,r15
1108	sub	rax,rdi	; rax = 0 - (row carry OR compare bit)
1109	lea	rbx,[r9*1+r14]	; rbx = start of tp
1110	mov	r12,QWORD[rcx]	; r12 = n[0]
1111	lea	rbp,[rcx]	; rbp = n
1112	mov	rcx,r9
1113	sar	rcx,3+2	; rcx = -bytes / 32 (arithmetic shift), i.e. minus the number of four-word groups
1114	mov	rdi,QWORD[((56+8))+rsp]	; rdi = destination pointer saved on entry
1115	dec	r12	; r12 = n[0] - 1; NOTE(review): how this is consumed is in $L$sqr4x_sub_entry, outside this view
1116	xor	r10,r10
1117	mov	r13,QWORD[8+rbp]	; r13, r14, r15 = n[1], n[2], n[3]
1118	mov	r14,QWORD[16+rbp]
1119	mov	r15,QWORD[24+rbp]
1120	jmp	NEAR $L$sqr4x_sub_entry	; tail jump: the ret that returns to this routine's caller is outside this view
1121
1122
1123global	bn_power5
1124; bn_power5 (Win64 entry): runs five __bn_sqr8x_internal/__bn_post4x_internal pairs, then one mul4x_internal pass. Out: rax = 1. Presumably (rp, ap, table, np, n0, num, power); the C prototype is not visible here -- confirm.
1125ALIGN	32
1126bn_power5:
1127	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1128	mov	QWORD[16+rsp],rsi	; rdi/rsi are parked just above the return address and reloaded at $L$power5_epilogue
1129	mov	rax,rsp
1130$L$SEH_begin_bn_power5:
1131	mov	rdi,rcx	; Win64 arguments (rcx, rdx, r8, r9, [40+rsp], [48+rsp]) are moved into rdi, rsi, rdx, rcx, r8, r9
1132	mov	rsi,rdx
1133	mov	rdx,r8
1134	mov	rcx,r9
1135	mov	r8,QWORD[40+rsp]
1136	mov	r9,QWORD[48+rsp]
1137
1138
1139
1140_CET_ENDBR
1141	mov	rax,rsp	; rax = stack pointer at entry; saved in the frame at [40+rsp] below
1142
1143	lea	r11,[OPENSSL_ia32cap_P]
1144	mov	r11d,DWORD[8+r11]	; r11d = dword at OPENSSL_ia32cap_P+8
1145	and	r11d,0x80108
1146	cmp	r11d,0x80108
1147	je	NEAR $L$powerx5_enter	; all three masked bits set: use the variant at $L$powerx5_enter (outside this view; presumably the BMI1/BMI2/ADX bits -- confirm)
1148	push	rbx
1149
1150	push	rbp
1151
1152	push	r12
1153
1154	push	r13
1155
1156	push	r14
1157
1158	push	r15
1159; The six pushed registers now sit at [-8] .. [-48] relative to the entry rsp kept in rax.
1160$L$power5_prologue:
1161
1162	shl	r9d,3	; r9 = count * 8 = length in bytes (the 32-bit write zeroes the upper half)
1163	lea	r10d,[r9*2+r9]	; r10 = 3 * bytes (32-bit result, zero-extended)
1164	neg	r9	; r9 = -bytes
1165	mov	r8,QWORD[r8]	; r8 = qword that arg5 points to (n0); saved at [32+rsp] below
1166
1167; Frame placement: the candidate frame base is rsp - 2*bytes - 320. Its
1168; distance from rdi modulo 4096 (r11) decides how much further the frame is
1169; moved down, on one of the two paths below (presumably to keep the frame
1170; from aliasing the output buffer modulo 4096 -- confirm).
1171
1172
1173
1174	lea	r11,[((-320))+r9*2+rsp]	; r11 = rsp - 2*bytes - 320
1175	mov	rbp,rsp
1176	sub	r11,rdi
1177	and	r11,4095	; r11 = (candidate - rdi) mod 4096
1178	cmp	r10,r11
1179	jb	NEAR $L$pwr_sp_alt	; taken when 3*bytes is below r11 (unsigned)
1180	sub	rbp,r11	; move the frame down by r11 ...
1181	lea	rbp,[((-320))+r9*2+rbp]	; ... and reserve 2*bytes + 320
1182	jmp	NEAR $L$pwr_sp_done
1183
1184ALIGN	32
1185$L$pwr_sp_alt:
1186	lea	r10,[((4096-320))+r9*2]	; r10 = 4096 - 320 - 2*bytes
1187	lea	rbp,[((-320))+r9*2+rbp]	; reserve 2*bytes + 320
1188	sub	r11,r10
1189	mov	r10,0	; mov leaves the flags from the sub intact for the cmovc
1190	cmovc	r11,r10	; if the subtraction borrowed, use 0 as the extra adjustment
1191	sub	rbp,r11
1192$L$pwr_sp_done:
1193	and	rbp,-64	; align the frame base down to 64 bytes
1194	mov	r11,rsp
1195	sub	r11,rbp
1196	and	r11,-4096
1197	lea	rsp,[rbp*1+r11]	; rsp = rbp + ((old rsp - rbp) rounded down to a multiple of 4096)
1198	mov	r10,QWORD[rsp]	; touch the page
1199	cmp	rsp,rbp
1200	ja	NEAR $L$pwr_page_walk
1201	jmp	NEAR $L$pwr_page_walk_done
1202
1203$L$pwr_page_walk:
1204	lea	rsp,[((-4096))+rsp]	; step down one 4096-byte page at a time, reading a qword at each step
1205	mov	r10,QWORD[rsp]
1206	cmp	rsp,rbp
1207	ja	NEAR $L$pwr_page_walk
1208$L$pwr_page_walk_done:
1209
1210	mov	r10,r9	; r10 = -bytes; copied into xmm3 below
1211	neg	r9	; r9 = +bytes
1212
1213; Frame slots written here: [32+rsp] = n0 value, [40+rsp] = entry rsp.
1214
1215
1216
1217
1218
1219
1220
1221
1222	mov	QWORD[32+rsp],r8
1223	mov	QWORD[40+rsp],rax
1224
1225$L$power5_body:
1226DB	102,72,15,110,207	; bytes 66 48 0F 6E CF = movq xmm1,rdi
1227DB	102,72,15,110,209	; bytes 66 48 0F 6E D1 = movq xmm2,rcx
1228DB	102,73,15,110,218	; bytes 66 49 0F 6E DA = movq xmm3,r10
1229DB	102,72,15,110,226	; bytes 66 48 0F 6E E2 = movq xmm4,rdx
1230; rdi, rcx, r10 and rdx are parked in xmm1..xmm4 across the calls below; rcx and rdx are read back afterwards.
1231	call	__bn_sqr8x_internal	; five identical call pairs follow
1232	call	__bn_post4x_internal
1233	call	__bn_sqr8x_internal
1234	call	__bn_post4x_internal
1235	call	__bn_sqr8x_internal
1236	call	__bn_post4x_internal
1237	call	__bn_sqr8x_internal
1238	call	__bn_post4x_internal
1239	call	__bn_sqr8x_internal
1240	call	__bn_post4x_internal
1241
1242DB	102,72,15,126,209	; bytes 66 48 0F 7E D1 = movq rcx,xmm2
1243DB	102,72,15,126,226	; bytes 66 48 0F 7E E2 = movq rdx,xmm4
1244	mov	rdi,rsi	; destination for the multiply = current rsi; NOTE(review): rsi's value here is set by the helpers above, part of which is outside this view
1245	mov	rax,QWORD[40+rsp]	; rax = entry rsp; mul4x_internal reads the index at [56+rax]
1246	lea	r8,[32+rsp]	; r8 = address of the saved n0 value (mul4x_internal dereferences r8)
1247
1248	call	mul4x_internal
1249
1250	mov	rsi,QWORD[40+rsp]	; rsi = saved entry rsp
1251
1252	mov	rax,1	; return value
1253	mov	r15,QWORD[((-48))+rsi]	; reload the six pushed registers
1254
1255	mov	r14,QWORD[((-40))+rsi]
1256
1257	mov	r13,QWORD[((-32))+rsi]
1258
1259	mov	r12,QWORD[((-24))+rsi]
1260
1261	mov	rbp,QWORD[((-16))+rsi]
1262
1263	mov	rbx,QWORD[((-8))+rsi]
1264
1265	lea	rsp,[rsi]	; rsp = entry rsp
1266
1267$L$power5_epilogue:
1268	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1269	mov	rsi,QWORD[16+rsp]
1270	ret
1271
1272$L$SEH_end_bn_power5:
1273
1274global	bn_sqr8x_internal
1275
1276
1277ALIGN	32
1278bn_sqr8x_internal:
1279__bn_sqr8x_internal:
1280
1281_CET_ENDBR
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355	lea	rbp,[32+r10]
1356	lea	rsi,[r9*1+rsi]
1357
1358	mov	rcx,r9
1359
1360
1361	mov	r14,QWORD[((-32))+rbp*1+rsi]
1362	lea	rdi,[((48+8))+r9*2+rsp]
1363	mov	rax,QWORD[((-24))+rbp*1+rsi]
1364	lea	rdi,[((-32))+rbp*1+rdi]
1365	mov	rbx,QWORD[((-16))+rbp*1+rsi]
1366	mov	r15,rax
1367
1368	mul	r14
1369	mov	r10,rax
1370	mov	rax,rbx
1371	mov	r11,rdx
1372	mov	QWORD[((-24))+rbp*1+rdi],r10
1373
1374	mul	r14
1375	add	r11,rax
1376	mov	rax,rbx
1377	adc	rdx,0
1378	mov	QWORD[((-16))+rbp*1+rdi],r11
1379	mov	r10,rdx
1380
1381
1382	mov	rbx,QWORD[((-8))+rbp*1+rsi]
1383	mul	r15
1384	mov	r12,rax
1385	mov	rax,rbx
1386	mov	r13,rdx
1387
1388	lea	rcx,[rbp]
1389	mul	r14
1390	add	r10,rax
1391	mov	rax,rbx
1392	mov	r11,rdx
1393	adc	r11,0
1394	add	r10,r12
1395	adc	r11,0
1396	mov	QWORD[((-8))+rcx*1+rdi],r10
1397	jmp	NEAR $L$sqr4x_1st
1398
1399ALIGN	32
1400$L$sqr4x_1st:
1401	mov	rbx,QWORD[rcx*1+rsi]
1402	mul	r15
1403	add	r13,rax
1404	mov	rax,rbx
1405	mov	r12,rdx
1406	adc	r12,0
1407
1408	mul	r14
1409	add	r11,rax
1410	mov	rax,rbx
1411	mov	rbx,QWORD[8+rcx*1+rsi]
1412	mov	r10,rdx
1413	adc	r10,0
1414	add	r11,r13
1415	adc	r10,0
1416
1417
1418	mul	r15
1419	add	r12,rax
1420	mov	rax,rbx
1421	mov	QWORD[rcx*1+rdi],r11
1422	mov	r13,rdx
1423	adc	r13,0
1424
1425	mul	r14
1426	add	r10,rax
1427	mov	rax,rbx
1428	mov	rbx,QWORD[16+rcx*1+rsi]
1429	mov	r11,rdx
1430	adc	r11,0
1431	add	r10,r12
1432	adc	r11,0
1433
1434	mul	r15
1435	add	r13,rax
1436	mov	rax,rbx
1437	mov	QWORD[8+rcx*1+rdi],r10
1438	mov	r12,rdx
1439	adc	r12,0
1440
1441	mul	r14
1442	add	r11,rax
1443	mov	rax,rbx
1444	mov	rbx,QWORD[24+rcx*1+rsi]
1445	mov	r10,rdx
1446	adc	r10,0
1447	add	r11,r13
1448	adc	r10,0
1449
1450
1451	mul	r15
1452	add	r12,rax
1453	mov	rax,rbx
1454	mov	QWORD[16+rcx*1+rdi],r11
1455	mov	r13,rdx
1456	adc	r13,0
1457	lea	rcx,[32+rcx]
1458
1459	mul	r14
1460	add	r10,rax
1461	mov	rax,rbx
1462	mov	r11,rdx
1463	adc	r11,0
1464	add	r10,r12
1465	adc	r11,0
1466	mov	QWORD[((-8))+rcx*1+rdi],r10
1467
1468	cmp	rcx,0
1469	jne	NEAR $L$sqr4x_1st
1470
1471	mul	r15
1472	add	r13,rax
1473	lea	rbp,[16+rbp]
1474	adc	rdx,0
1475	add	r13,r11
1476	adc	rdx,0
1477
1478	mov	QWORD[rdi],r13
1479	mov	r12,rdx
1480	mov	QWORD[8+rdi],rdx
1481	jmp	NEAR $L$sqr4x_outer
1482
1483ALIGN	32
1484$L$sqr4x_outer:
1485	mov	r14,QWORD[((-32))+rbp*1+rsi]
1486	lea	rdi,[((48+8))+r9*2+rsp]
1487	mov	rax,QWORD[((-24))+rbp*1+rsi]
1488	lea	rdi,[((-32))+rbp*1+rdi]
1489	mov	rbx,QWORD[((-16))+rbp*1+rsi]
1490	mov	r15,rax
1491
1492	mul	r14
1493	mov	r10,QWORD[((-24))+rbp*1+rdi]
1494	add	r10,rax
1495	mov	rax,rbx
1496	adc	rdx,0
1497	mov	QWORD[((-24))+rbp*1+rdi],r10
1498	mov	r11,rdx
1499
1500	mul	r14
1501	add	r11,rax
1502	mov	rax,rbx
1503	adc	rdx,0
1504	add	r11,QWORD[((-16))+rbp*1+rdi]
1505	mov	r10,rdx
1506	adc	r10,0
1507	mov	QWORD[((-16))+rbp*1+rdi],r11
1508
1509	xor	r12,r12
1510
1511	mov	rbx,QWORD[((-8))+rbp*1+rsi]
1512	mul	r15
1513	add	r12,rax
1514	mov	rax,rbx
1515	adc	rdx,0
1516	add	r12,QWORD[((-8))+rbp*1+rdi]
1517	mov	r13,rdx
1518	adc	r13,0
1519
1520	mul	r14
1521	add	r10,rax
1522	mov	rax,rbx
1523	adc	rdx,0
1524	add	r10,r12
1525	mov	r11,rdx
1526	adc	r11,0
1527	mov	QWORD[((-8))+rbp*1+rdi],r10
1528
1529	lea	rcx,[rbp]
1530	jmp	NEAR $L$sqr4x_inner
1531
1532ALIGN	32
1533$L$sqr4x_inner:
1534	mov	rbx,QWORD[rcx*1+rsi]
1535	mul	r15
1536	add	r13,rax
1537	mov	rax,rbx
1538	mov	r12,rdx
1539	adc	r12,0
1540	add	r13,QWORD[rcx*1+rdi]
1541	adc	r12,0
1542
1543	DB	0x67
1544	mul	r14
1545	add	r11,rax
1546	mov	rax,rbx
1547	mov	rbx,QWORD[8+rcx*1+rsi]
1548	mov	r10,rdx
1549	adc	r10,0
1550	add	r11,r13
1551	adc	r10,0
1552
1553	mul	r15
1554	add	r12,rax
1555	mov	QWORD[rcx*1+rdi],r11
1556	mov	rax,rbx
1557	mov	r13,rdx
1558	adc	r13,0
1559	add	r12,QWORD[8+rcx*1+rdi]
1560	lea	rcx,[16+rcx]
1561	adc	r13,0
1562
1563	mul	r14
1564	add	r10,rax
1565	mov	rax,rbx
1566	adc	rdx,0
1567	add	r10,r12
1568	mov	r11,rdx
1569	adc	r11,0
1570	mov	QWORD[((-8))+rcx*1+rdi],r10
1571
1572	cmp	rcx,0
1573	jne	NEAR $L$sqr4x_inner
1574
1575	DB	0x67
1576	mul	r15
1577	add	r13,rax
1578	adc	rdx,0
1579	add	r13,r11
1580	adc	rdx,0
1581
1582	mov	QWORD[rdi],r13
1583	mov	r12,rdx
1584	mov	QWORD[8+rdi],rdx
1585
1586	add	rbp,16
1587	jnz	NEAR $L$sqr4x_outer
1588
1589
1590	mov	r14,QWORD[((-32))+rsi]
1591	lea	rdi,[((48+8))+r9*2+rsp]
1592	mov	rax,QWORD[((-24))+rsi]
1593	lea	rdi,[((-32))+rbp*1+rdi]
1594	mov	rbx,QWORD[((-16))+rsi]
1595	mov	r15,rax
1596
1597	mul	r14
1598	add	r10,rax
1599	mov	rax,rbx
1600	mov	r11,rdx
1601	adc	r11,0
1602
1603	mul	r14
1604	add	r11,rax
1605	mov	rax,rbx
1606	mov	QWORD[((-24))+rdi],r10
1607	mov	r10,rdx
1608	adc	r10,0
1609	add	r11,r13
1610	mov	rbx,QWORD[((-8))+rsi]
1611	adc	r10,0
1612
1613	mul	r15
1614	add	r12,rax
1615	mov	rax,rbx
1616	mov	QWORD[((-16))+rdi],r11
1617	mov	r13,rdx
1618	adc	r13,0
1619
1620	mul	r14
1621	add	r10,rax
1622	mov	rax,rbx
1623	mov	r11,rdx
1624	adc	r11,0
1625	add	r10,r12
1626	adc	r11,0
1627	mov	QWORD[((-8))+rdi],r10
1628
1629	mul	r15
1630	add	r13,rax
1631	mov	rax,QWORD[((-16))+rsi]
1632	adc	rdx,0
1633	add	r13,r11
1634	adc	rdx,0
1635
1636	mov	QWORD[rdi],r13
1637	mov	r12,rdx
1638	mov	QWORD[8+rdi],rdx
1639
1640	mul	rbx
1641	add	rbp,16
1642	xor	r14,r14
1643	sub	rbp,r9
1644	xor	r15,r15
1645
1646	add	rax,r12
1647	adc	rdx,0
1648	mov	QWORD[8+rdi],rax
1649	mov	QWORD[16+rdi],rdx
1650	mov	QWORD[24+rdi],r15
1651
1652	mov	rax,QWORD[((-16))+rbp*1+rsi]
1653	lea	rdi,[((48+8))+rsp]
1654	xor	r10,r10
1655	mov	r11,QWORD[8+rdi]
1656
1657	lea	r12,[r10*2+r14]
1658	shr	r10,63
1659	lea	r13,[r11*2+rcx]
1660	shr	r11,63
1661	or	r13,r10
1662	mov	r10,QWORD[16+rdi]
1663	mov	r14,r11
1664	mul	rax
1665	neg	r15
1666	mov	r11,QWORD[24+rdi]
1667	adc	r12,rax
1668	mov	rax,QWORD[((-8))+rbp*1+rsi]
1669	mov	QWORD[rdi],r12
1670	adc	r13,rdx
1671
1672	lea	rbx,[r10*2+r14]
1673	mov	QWORD[8+rdi],r13
1674	sbb	r15,r15
1675	shr	r10,63
1676	lea	r8,[r11*2+rcx]
1677	shr	r11,63
1678	or	r8,r10
1679	mov	r10,QWORD[32+rdi]
1680	mov	r14,r11
1681	mul	rax
1682	neg	r15
1683	mov	r11,QWORD[40+rdi]
1684	adc	rbx,rax
1685	mov	rax,QWORD[rbp*1+rsi]
1686	mov	QWORD[16+rdi],rbx
1687	adc	r8,rdx
1688	lea	rbp,[16+rbp]
1689	mov	QWORD[24+rdi],r8
1690	sbb	r15,r15
1691	lea	rdi,[64+rdi]
1692	jmp	NEAR $L$sqr4x_shift_n_add
1693
1694ALIGN	32
1695$L$sqr4x_shift_n_add:
1696	lea	r12,[r10*2+r14]
1697	shr	r10,63
1698	lea	r13,[r11*2+rcx]
1699	shr	r11,63
1700	or	r13,r10
1701	mov	r10,QWORD[((-16))+rdi]
1702	mov	r14,r11
1703	mul	rax
1704	neg	r15
1705	mov	r11,QWORD[((-8))+rdi]
1706	adc	r12,rax
1707	mov	rax,QWORD[((-8))+rbp*1+rsi]
1708	mov	QWORD[((-32))+rdi],r12
1709	adc	r13,rdx
1710
1711	lea	rbx,[r10*2+r14]
1712	mov	QWORD[((-24))+rdi],r13
1713	sbb	r15,r15
1714	shr	r10,63
1715	lea	r8,[r11*2+rcx]
1716	shr	r11,63
1717	or	r8,r10
1718	mov	r10,QWORD[rdi]
1719	mov	r14,r11
1720	mul	rax
1721	neg	r15
1722	mov	r11,QWORD[8+rdi]
1723	adc	rbx,rax
1724	mov	rax,QWORD[rbp*1+rsi]
1725	mov	QWORD[((-16))+rdi],rbx
1726	adc	r8,rdx
1727
1728	lea	r12,[r10*2+r14]
1729	mov	QWORD[((-8))+rdi],r8
1730	sbb	r15,r15
1731	shr	r10,63
1732	lea	r13,[r11*2+rcx]
1733	shr	r11,63
1734	or	r13,r10
1735	mov	r10,QWORD[16+rdi]
1736	mov	r14,r11
1737	mul	rax
1738	neg	r15
1739	mov	r11,QWORD[24+rdi]
1740	adc	r12,rax
1741	mov	rax,QWORD[8+rbp*1+rsi]
1742	mov	QWORD[rdi],r12
1743	adc	r13,rdx
1744
1745	lea	rbx,[r10*2+r14]
1746	mov	QWORD[8+rdi],r13
1747	sbb	r15,r15
1748	shr	r10,63
1749	lea	r8,[r11*2+rcx]
1750	shr	r11,63
1751	or	r8,r10
1752	mov	r10,QWORD[32+rdi]
1753	mov	r14,r11
1754	mul	rax
1755	neg	r15
1756	mov	r11,QWORD[40+rdi]
1757	adc	rbx,rax
1758	mov	rax,QWORD[16+rbp*1+rsi]
1759	mov	QWORD[16+rdi],rbx
1760	adc	r8,rdx
1761	mov	QWORD[24+rdi],r8
1762	sbb	r15,r15
1763	lea	rdi,[64+rdi]
1764	add	rbp,32
1765	jnz	NEAR $L$sqr4x_shift_n_add
1766
1767	lea	r12,[r10*2+r14]
1768	DB	0x67
1769	shr	r10,63
1770	lea	r13,[r11*2+rcx]
1771	shr	r11,63
1772	or	r13,r10
1773	mov	r10,QWORD[((-16))+rdi]
1774	mov	r14,r11
1775	mul	rax
1776	neg	r15
1777	mov	r11,QWORD[((-8))+rdi]
1778	adc	r12,rax
1779	mov	rax,QWORD[((-8))+rsi]
1780	mov	QWORD[((-32))+rdi],r12
1781	adc	r13,rdx
1782
1783	lea	rbx,[r10*2+r14]
1784	mov	QWORD[((-24))+rdi],r13
1785	sbb	r15,r15
1786	shr	r10,63
1787	lea	r8,[r11*2+rcx]
1788	shr	r11,63
1789	or	r8,r10
1790	mul	rax
1791	neg	r15
1792	adc	rbx,rax
1793	adc	r8,rdx
1794	mov	QWORD[((-16))+rdi],rbx
1795	mov	QWORD[((-8))+rdi],r8
1796DB	102,72,15,126,213
;-----------------------------------------------------------------------
; __bn_sqr8x_reduction
;
; Internal label; also entered by fall-through from the code directly
; above.  Walks the scratch buffer that starts at [48+8+rsp] in 64-byte
; (eight-word) windows.  For each window:
;   1. $L$8x_reduce runs eight rounds.  Each round multiplies the eight
;      words at [rbp] by a factor rbx = (current low word) *
;      QWORD[32+8+rsp], accumulates into r8..r15 and saves the factor on
;      the stack.
;   2. $L$8x_tail (skipped when the rbp walk is already at its end)
;      replays the eight saved factors against each further 64-byte
;      group at [rbp], adding into the words at [rdi].
; NOTE(review): this has the shape of a word-serial Montgomery reduction
; with rbp = modulus and QWORD[32+8+rsp] = n0 -- confirm against the
; generating Perl script.
;
; In:    rbp = first word of the multiplicand (reloaded from xmm2 after
;              every window)
;        r9  = byte length (reloaded from xmm3 after every window;
;              presumably xmm3 holds the negated length -- confirm)
;        QWORD[32+8+rsp] = per-round multiplier constant
; Out:   rax = carry out of the last window; returns once rdi has
;              reached the end address kept in rdx
; Stack: QWORD[0+8+rsp] = end address of the rbp walk (rbp + r9)
;        QWORD[8+8+rsp] = end address of the buffer (rsp + 56 + 2*r9)
; Clobb: rax, rbx, rcx, rdx, rsi, rdi, rbp, r8-r15, flags
; Note:  the lone `DB 0x66` / `DB 0x67` bytes are instruction prefixes
;        placed in front of the next instruction; presumably padding.
;-----------------------------------------------------------------------
__bn_sqr8x_reduction:
	xor	rax,rax	; no carry into the first window
	lea	rcx,[rbp*1+r9]	; rcx = rbp + r9
	lea	rdx,[((48+8))+r9*2+rsp]	; rdx = rsp + 56 + 2*r9
	mov	QWORD[((0+8))+rsp],rcx
	lea	rdi,[((48+8))+r9*1+rsp]	; loop head adds the negated r9 back
	mov	QWORD[((8+8))+rsp],rdx
	neg	r9
	jmp	NEAR $L$8x_reduction_loop

ALIGN	32
; One iteration per 64-byte window of the buffer.
$L$8x_reduction_loop:
	lea	rdi,[r9*1+rdi]
	DB	0x66
	mov	rbx,QWORD[rdi]
	mov	r9,QWORD[8+rdi]
	mov	r10,QWORD[16+rdi]
	mov	r11,QWORD[24+rdi]
	mov	r12,QWORD[32+rdi]
	mov	r13,QWORD[40+rdi]
	mov	r14,QWORD[48+rdi]
	mov	r15,QWORD[56+rdi]
	mov	QWORD[rdx],rax	; park the incoming carry at the end-of-buffer slot
	lea	rdi,[64+rdi]

	DB	0x67
	mov	r8,rbx	; r8 = low word of the window
	imul	rbx,QWORD[((32+8))+rsp]	; rbx = factor for the first round
	mov	rax,QWORD[rbp]
	mov	ecx,8	; eight rounds
	jmp	NEAR $L$8x_reduce

ALIGN	32
; One round: (r8..r15) = ((r8..r15) + rbx * [rbp..rbp+56]) shifted down
; by one word.  The low half of the first product is never added (rax is
; reloaded before use); only the carry it would produce, CF = (r8 != 0)
; from `neg r8`, is folded into the high half.
$L$8x_reduce:
	mul	rbx
	mov	rax,QWORD[8+rbp]
	neg	r8
	mov	r8,rdx
	adc	r8,0

	mul	rbx
	add	r9,rax
	mov	rax,QWORD[16+rbp]
	adc	rdx,0
	add	r8,r9
	mov	QWORD[((48-8+8))+rcx*8+rsp],rbx	; save this round's factor for $L$8x_tail
	mov	r9,rdx
	adc	r9,0

	mul	rbx
	add	r10,rax
	mov	rax,QWORD[24+rbp]
	adc	rdx,0
	add	r9,r10
	mov	rsi,QWORD[((32+8))+rsp]	; start computing the next round's factor
	mov	r10,rdx
	adc	r10,0

	mul	rbx
	add	r11,rax
	mov	rax,QWORD[32+rbp]
	adc	rdx,0
	imul	rsi,r8	; rsi = new low word * constant
	add	r10,r11
	mov	r11,rdx
	adc	r11,0

	mul	rbx
	add	r12,rax
	mov	rax,QWORD[40+rbp]
	adc	rdx,0
	add	r11,r12
	mov	r12,rdx
	adc	r12,0

	mul	rbx
	add	r13,rax
	mov	rax,QWORD[48+rbp]
	adc	rdx,0
	add	r12,r13
	mov	r13,rdx
	adc	r13,0

	mul	rbx
	add	r14,rax
	mov	rax,QWORD[56+rbp]
	adc	rdx,0
	add	r13,r14
	mov	r14,rdx
	adc	r14,0

	mul	rbx
	mov	rbx,rsi	; switch to the next round's factor
	add	r15,rax
	mov	rax,QWORD[rbp]
	adc	rdx,0
	add	r14,r15
	mov	r15,rdx
	adc	r15,0

	dec	ecx
	jnz	NEAR $L$8x_reduce

	lea	rbp,[64+rbp]
	xor	rax,rax
	mov	rdx,QWORD[((8+8))+rsp]	; rdx = end of buffer
	cmp	rbp,QWORD[((0+8))+rsp]
	jae	NEAR $L$8x_no_tail	; rbp walk finished; CF is clear on this path

	DB	0x66
	add	r8,QWORD[rdi]
	adc	r9,QWORD[8+rdi]
	adc	r10,QWORD[16+rdi]
	adc	r11,QWORD[24+rdi]
	adc	r12,QWORD[32+rdi]
	adc	r13,QWORD[40+rdi]
	adc	r14,QWORD[48+rdi]
	adc	r15,QWORD[56+rdi]
	sbb	rsi,rsi	; rsi = -carry, restored later with `neg rsi`

	mov	rbx,QWORD[((48+56+8))+rsp]	; first saved factor (stored when ecx was 8)
	mov	ecx,8
	mov	rax,QWORD[rbp]
	jmp	NEAR $L$8x_tail

ALIGN	32
; Replays one saved factor per iteration against the eight words at
; [rbp]; the completed low word is written to [rdi] and rdi advances by 8.
$L$8x_tail:
	mul	rbx
	add	r8,rax
	mov	rax,QWORD[8+rbp]
	mov	QWORD[rdi],r8
	mov	r8,rdx
	adc	r8,0

	mul	rbx
	add	r9,rax
	mov	rax,QWORD[16+rbp]
	adc	rdx,0
	add	r8,r9
	lea	rdi,[8+rdi]
	mov	r9,rdx
	adc	r9,0

	mul	rbx
	add	r10,rax
	mov	rax,QWORD[24+rbp]
	adc	rdx,0
	add	r9,r10
	mov	r10,rdx
	adc	r10,0

	mul	rbx
	add	r11,rax
	mov	rax,QWORD[32+rbp]
	adc	rdx,0
	add	r10,r11
	mov	r11,rdx
	adc	r11,0

	mul	rbx
	add	r12,rax
	mov	rax,QWORD[40+rbp]
	adc	rdx,0
	add	r11,r12
	mov	r12,rdx
	adc	r12,0

	mul	rbx
	add	r13,rax
	mov	rax,QWORD[48+rbp]
	adc	rdx,0
	add	r12,r13
	mov	r13,rdx
	adc	r13,0

	mul	rbx
	add	r14,rax
	mov	rax,QWORD[56+rbp]
	adc	rdx,0
	add	r13,r14
	mov	r14,rdx
	adc	r14,0

	mul	rbx
	mov	rbx,QWORD[((48-16+8))+rcx*8+rsp]	; next saved factor
	add	r15,rax
	adc	rdx,0
	add	r14,r15
	mov	rax,QWORD[rbp]
	mov	r15,rdx
	adc	r15,0

	dec	ecx
	jnz	NEAR $L$8x_tail

	lea	rbp,[64+rbp]
	mov	rdx,QWORD[((8+8))+rsp]
	cmp	rbp,QWORD[((0+8))+rsp]
	jae	NEAR $L$8x_tail_done

	mov	rbx,QWORD[((48+56+8))+rsp]
	neg	rsi	; CF = carry saved by the previous `sbb rsi,rsi`
	mov	rax,QWORD[rbp]
	adc	r8,QWORD[rdi]
	adc	r9,QWORD[8+rdi]
	adc	r10,QWORD[16+rdi]
	adc	r11,QWORD[24+rdi]
	adc	r12,QWORD[32+rdi]
	adc	r13,QWORD[40+rdi]
	adc	r14,QWORD[48+rdi]
	adc	r15,QWORD[56+rdi]
	sbb	rsi,rsi

	mov	ecx,8
	jmp	NEAR $L$8x_tail

ALIGN	32
$L$8x_tail_done:
	xor	rax,rax
	add	r8,QWORD[rdx]	; add the carry parked at the end-of-buffer slot
	adc	r9,0
	adc	r10,0
	adc	r11,0
	adc	r12,0
	adc	r13,0
	adc	r14,0
	adc	r15,0
	adc	rax,0

	neg	rsi	; CF = carry saved by `sbb rsi,rsi`
$L$8x_no_tail:
	adc	r8,QWORD[rdi]
	adc	r9,QWORD[8+rdi]
	adc	r10,QWORD[16+rdi]
	adc	r11,QWORD[24+rdi]
	adc	r12,QWORD[32+rdi]
	adc	r13,QWORD[40+rdi]
	adc	r14,QWORD[48+rdi]
	adc	r15,QWORD[56+rdi]
	adc	rax,0	; rax = carry into the next window
	mov	rcx,QWORD[((-8))+rbp]
	xor	rsi,rsi

DB	102,72,15,126,213	; encodes movq rbp,xmm2

	mov	QWORD[rdi],r8
	mov	QWORD[8+rdi],r9
DB	102,73,15,126,217	; encodes movq r9,xmm3
	mov	QWORD[16+rdi],r10
	mov	QWORD[24+rdi],r11
	mov	QWORD[32+rdi],r12
	mov	QWORD[40+rdi],r13
	mov	QWORD[48+rdi],r14
	mov	QWORD[56+rdi],r15
	lea	rdi,[64+rdi]

	cmp	rdi,rdx
	jb	NEAR $L$8x_reduction_loop	; more windows left
	ret
2056
2057
2058
2059ALIGN	32
;-----------------------------------------------------------------------
; __bn_post4x_internal
;
; Four words per iteration, computes
;     [rdi][i] = [rbx][i] + (~[rbp][i] & mask) + carry
; where mask = -rax and word 0 of [rbp] is decremented before being
; complemented.  With mask = all-ones this adds the two's complement of
; the multi-word value at rbp (i.e. subtracts it); with mask = 0 it
; copies [rbx] unchanged.  Both cases execute the same instructions.
; NOTE(review): the "subtract" reading assumes word 0 at rbp is non-zero
; (so `dec r12` does not borrow) and that rax is 0 or 1 on entry --
; confirm with the callers.
;
; In:    rbp  = pointer to the value to subtract
;        rdi  = pointer such that rdi + r9 is the minuend
;        r9   = negative byte count (the loop counts rcx = r9 >> 5 up to 0)
;        rax  = selector, negated here to form the mask
;        xmm1 = output pointer (copied into both rdi and rsi)
; Out:   r10  = r9 as on entry; r9 = -r9; rsi = output pointer
; Clobb: rax, rbx, rcx, rdi, rbp, r10, r12-r15, flags
;-----------------------------------------------------------------------
__bn_post4x_internal:

	mov	r12,QWORD[rbp]
	lea	rbx,[r9*1+rdi]	; rbx = minuend pointer
	mov	rcx,r9
DB	102,72,15,126,207	; encodes movq rdi,xmm1
	neg	rax	; rax = mask
DB	102,72,15,126,206	; encodes movq rsi,xmm1
	sar	rcx,3+2	; rcx = r9 / 32: one count per 4-word group
	dec	r12	; ~(x - 1) == -x for the lowest word
	xor	r10,r10	; no borrow into the first group
	mov	r13,QWORD[8+rbp]
	mov	r14,QWORD[16+rbp]
	mov	r15,QWORD[24+rbp]
	jmp	NEAR $L$sqr4x_sub_entry

ALIGN	16
$L$sqr4x_sub:
	mov	r12,QWORD[rbp]
	mov	r13,QWORD[8+rbp]
	mov	r14,QWORD[16+rbp]
	mov	r15,QWORD[24+rbp]
$L$sqr4x_sub_entry:
	lea	rbp,[32+rbp]
	not	r12
	not	r13
	not	r14
	not	r15
	and	r12,rax
	and	r13,rax
	and	r14,rax
	and	r15,rax

	neg	r10	; CF = carry saved from the previous group
	adc	r12,QWORD[rbx]
	adc	r13,QWORD[8+rbx]
	adc	r14,QWORD[16+rbx]
	adc	r15,QWORD[24+rbx]
	mov	QWORD[rdi],r12
	lea	rbx,[32+rbx]
	mov	QWORD[8+rdi],r13
	sbb	r10,r10	; save carry; the `inc` below leaves CF alone but the loop re-derives it
	mov	QWORD[16+rdi],r14
	mov	QWORD[24+rdi],r15
	lea	rdi,[32+rdi]

	inc	rcx
	jnz	NEAR $L$sqr4x_sub

	mov	r10,r9
	neg	r9
	ret
2112
2113
2114
2115ALIGN	32
;-----------------------------------------------------------------------
; bn_mulx4x_mont_gather5
;
; Win64 wrapper around mulx4x_internal.  Moves the Win64 arguments
; (rcx, rdx, r8, r9, [40+rsp], [48+rsp]) into rdi, rsi, rdx, rcx, r8, r9,
; saves the callee-saved registers, carves an aligned scratch frame out
; of the stack, calls mulx4x_internal and returns 1 in rax.
; NOTE(review): presumably the C prototype is
;   int bn_mulx4x_mont_gather5(rp, ap, table, np, n0, num, power)
; -- confirm against the header; only the register traffic is visible
; here.  The seventh stack argument is read by mulx4x_internal through
; rax (DWORD[56+rax]).
;
; Frame: QWORD[32+rsp] = *r8 (the word r8 pointed to)
;        QWORD[40+rsp] = rsp as it was at function entry
; Also entered at $L$mulx4x_enter with arguments already in rdi..r9 and
; rax = entry rsp.
;-----------------------------------------------------------------------
bn_mulx4x_mont_gather5:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_bn_mulx4x_mont_gather5:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]



	mov	rax,rsp	; rax = entry rsp, used to find saved registers and stack arguments

$L$mulx4x_enter:
	push	rbx

	push	rbp

	push	r12

	push	r13

	push	r14

	push	r15

$L$mulx4x_prologue:

	shl	r9d,3	; r9 = 8 * r9d (32-bit op zero-extends)
	lea	r10,[r9*2+r9]	; r10 = 3 * r9
	neg	r9	; r9 = negated byte length from here on
	mov	r8,QWORD[r8]	; load the word r8 points at










; Pick the frame address in rbp.  r11 = (rsp - 320 - 2*len - rdi) mod 4096
; is compared with 3*len to choose between two placements.
; NOTE(review): presumably this keeps the frame from aliasing the output
; buffer modulo 4096 -- confirm against the Perl source comments.
	lea	r11,[((-320))+r9*2+rsp]
	mov	rbp,rsp
	sub	r11,rdi
	and	r11,4095
	cmp	r10,r11
	jb	NEAR $L$mulx4xsp_alt
	sub	rbp,r11
	lea	rbp,[((-320))+r9*2+rbp]
	jmp	NEAR $L$mulx4xsp_done

$L$mulx4xsp_alt:
	lea	r10,[((4096-320))+r9*2]
	lea	rbp,[((-320))+r9*2+rbp]
	sub	r11,r10
	mov	r10,0	; mov, not xor: CF from the sub must survive for cmovc
	cmovc	r11,r10	; clamp a negative difference to 0
	sub	rbp,r11
$L$mulx4xsp_done:
	and	rbp,-64	; 64-byte align the frame
; Move rsp down to rbp one 4096-byte page at a time, loading from each
; page on the way (the loaded value is discarded).
; NOTE(review): presumably a stack-guard-page probe.
	mov	r11,rsp
	sub	r11,rbp
	and	r11,-4096
	lea	rsp,[rbp*1+r11]
	mov	r10,QWORD[rsp]
	cmp	rsp,rbp
	ja	NEAR $L$mulx4x_page_walk
	jmp	NEAR $L$mulx4x_page_walk_done

$L$mulx4x_page_walk:
	lea	rsp,[((-4096))+rsp]
	mov	r10,QWORD[rsp]
	cmp	rsp,rbp
	ja	NEAR $L$mulx4x_page_walk
$L$mulx4x_page_walk_done:













	mov	QWORD[32+rsp],r8
	mov	QWORD[40+rsp],rax	; remember entry rsp for the epilogue

$L$mulx4x_body:
	call	mulx4x_internal

	mov	rsi,QWORD[40+rsp]	; rsi = entry rsp

	mov	rax,1	; return value

; Reload the six registers pushed in the prologue.
	mov	r15,QWORD[((-48))+rsi]

	mov	r14,QWORD[((-40))+rsi]

	mov	r13,QWORD[((-32))+rsi]

	mov	r12,QWORD[((-24))+rsi]

	mov	rbp,QWORD[((-16))+rsi]

	mov	rbx,QWORD[((-8))+rsi]

	lea	rsp,[rsi]

$L$mulx4x_epilogue:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_bn_mulx4x_mont_gather5:
2238
2239
2240ALIGN	32
;-----------------------------------------------------------------------
; mulx4x_internal
;
; Shared multiply body, `call`ed by bn_mulx4x_mont_gather5 and
; bn_powerx5.  Uses MULX with the two independent carry chains of
; ADCX (CF) and ADOX (OF).  Because it is entered by `call`, every
; [N+8+rsp] here is the caller's [N+rsp].
;
; Structure:
;   1. Build sixteen 16-byte masks at [112+r10]..[352+r10] by comparing
;      running counters (seeded from $L$inc) with the broadcast index.
;   2. Select one 64-bit word from each 256-byte table row by AND-ing
;      all sixteen 16-byte chunks with their masks and OR-ing the
;      results; every chunk is read regardless of the index.
;   3. $L$mulx4x_1st: first pass, four words per iteration.
;   4. $L$mulx4x_outer / $L$mulx4x_inner: one pass per remaining table
;      row, accumulating into the buffer at rbx.
;   5. Tail-jump to $L$sqrx4x_sub_entry (in __bn_postx4x_internal) with a
;      mask in rax for the final masked subtraction.
; NOTE(review): this is presumably Montgomery multiplication of [rsi] by
; a table entry modulo [rcx], with QWORD[32+8+rsp] = n0 -- confirm
; against the Perl source.
;
; In:    rdi = output pointer (saved, passed on in rdx at the end)
;        rsi = multiplicand pointer
;        rdx = table pointer
;        rcx = pointer to the words multiplied by the per-pass factor
;        r9  = negated byte length
;        rax = caller's entry rsp; DWORD[56+rax] is the table index
;        QWORD[32+8+rsp] = constant used to derive the per-pass factor
; Stack: QWORD[0+8+rsp]  = negated byte length
;        QWORD[8+8+rsp]  = current table row pointer
;        QWORD[16+8+rsp] = table end pointer (128 + rdx + 32*len)
;        QWORD[24+8+rsp] = inner-loop trip count (len/32 - 1)
;        QWORD[56+8+rsp] = output pointer
; Clobb: all general-purpose registers except rsp, xmm0-xmm5, flags
; Note:  `DB 0x67` bytes are prefixes on the next instruction;
;        presumably padding.
;-----------------------------------------------------------------------
mulx4x_internal:

	mov	QWORD[8+rsp],r9
	mov	r10,r9
	neg	r9	; r9 = byte length
	shl	r9,5	; r9 = 32 * length = table size in bytes
	neg	r10	; r10 = byte length
	lea	r13,[128+r9*1+rdx]	; r13 = biased end of table
	shr	r9,5+5	; r9 = length / 32
	movd	xmm5,DWORD[56+rax]	; table index from the caller's stack
	sub	r9,1
	lea	rax,[$L$inc]
	mov	QWORD[((16+8))+rsp],r13
	mov	QWORD[((24+8))+rsp],r9
	mov	QWORD[((56+8))+rsp],rdi
	movdqa	xmm0,XMMWORD[rax]
	movdqa	xmm1,XMMWORD[16+rax]
	lea	r10,[((88-112))+r10*1+rsp]	; r10 = mask array base (biased by -112)
	lea	rdi,[128+rdx]	; rdi = table pointer biased by +128

; Build the masks: each pcmpeqd compares a counter vector with the
; broadcast index; paddd steps the counters by the increment in xmm4.
	pshufd	xmm5,xmm5,0	; broadcast the index to all four dwords
	movdqa	xmm4,xmm1
	DB	0x67
	movdqa	xmm2,xmm1
	DB	0x67
	paddd	xmm1,xmm0
	pcmpeqd	xmm0,xmm5
	movdqa	xmm3,xmm4
	paddd	xmm2,xmm1
	pcmpeqd	xmm1,xmm5
	movdqa	XMMWORD[112+r10],xmm0
	movdqa	xmm0,xmm4

	paddd	xmm3,xmm2
	pcmpeqd	xmm2,xmm5
	movdqa	XMMWORD[128+r10],xmm1
	movdqa	xmm1,xmm4

	paddd	xmm0,xmm3
	pcmpeqd	xmm3,xmm5
	movdqa	XMMWORD[144+r10],xmm2
	movdqa	xmm2,xmm4

	paddd	xmm1,xmm0
	pcmpeqd	xmm0,xmm5
	movdqa	XMMWORD[160+r10],xmm3
	movdqa	xmm3,xmm4
	paddd	xmm2,xmm1
	pcmpeqd	xmm1,xmm5
	movdqa	XMMWORD[176+r10],xmm0
	movdqa	xmm0,xmm4

	paddd	xmm3,xmm2
	pcmpeqd	xmm2,xmm5
	movdqa	XMMWORD[192+r10],xmm1
	movdqa	xmm1,xmm4

	paddd	xmm0,xmm3
	pcmpeqd	xmm3,xmm5
	movdqa	XMMWORD[208+r10],xmm2
	movdqa	xmm2,xmm4

	paddd	xmm1,xmm0
	pcmpeqd	xmm0,xmm5
	movdqa	XMMWORD[224+r10],xmm3
	movdqa	xmm3,xmm4
	paddd	xmm2,xmm1
	pcmpeqd	xmm1,xmm5
	movdqa	XMMWORD[240+r10],xmm0
	movdqa	xmm0,xmm4

	paddd	xmm3,xmm2
	pcmpeqd	xmm2,xmm5
	movdqa	XMMWORD[256+r10],xmm1
	movdqa	xmm1,xmm4

	paddd	xmm0,xmm3
	pcmpeqd	xmm3,xmm5
	movdqa	XMMWORD[272+r10],xmm2
	movdqa	xmm2,xmm4

	paddd	xmm1,xmm0
	pcmpeqd	xmm0,xmm5
	movdqa	XMMWORD[288+r10],xmm3
	movdqa	xmm3,xmm4
	DB	0x67
	paddd	xmm2,xmm1
	pcmpeqd	xmm1,xmm5
	movdqa	XMMWORD[304+r10],xmm0

	paddd	xmm3,xmm2
	pcmpeqd	xmm2,xmm5
	movdqa	XMMWORD[320+r10],xmm1

	pcmpeqd	xmm3,xmm5
	movdqa	XMMWORD[336+r10],xmm2

; Gather the first table word: the last four masks are still live in
; xmm0-xmm3, the other twelve are read back from the stack.
	pand	xmm0,XMMWORD[64+rdi]
	pand	xmm1,XMMWORD[80+rdi]
	pand	xmm2,XMMWORD[96+rdi]
	movdqa	XMMWORD[352+r10],xmm3
	pand	xmm3,XMMWORD[112+rdi]
	por	xmm0,xmm2
	por	xmm1,xmm3
	movdqa	xmm4,XMMWORD[((-128))+rdi]
	movdqa	xmm5,XMMWORD[((-112))+rdi]
	movdqa	xmm2,XMMWORD[((-96))+rdi]
	pand	xmm4,XMMWORD[112+r10]
	movdqa	xmm3,XMMWORD[((-80))+rdi]
	pand	xmm5,XMMWORD[128+r10]
	por	xmm0,xmm4
	pand	xmm2,XMMWORD[144+r10]
	por	xmm1,xmm5
	pand	xmm3,XMMWORD[160+r10]
	por	xmm0,xmm2
	por	xmm1,xmm3
	movdqa	xmm4,XMMWORD[((-64))+rdi]
	movdqa	xmm5,XMMWORD[((-48))+rdi]
	movdqa	xmm2,XMMWORD[((-32))+rdi]
	pand	xmm4,XMMWORD[176+r10]
	movdqa	xmm3,XMMWORD[((-16))+rdi]
	pand	xmm5,XMMWORD[192+r10]
	por	xmm0,xmm4
	pand	xmm2,XMMWORD[208+r10]
	por	xmm1,xmm5
	pand	xmm3,XMMWORD[224+r10]
	por	xmm0,xmm2
	por	xmm1,xmm3
	movdqa	xmm4,XMMWORD[rdi]
	movdqa	xmm5,XMMWORD[16+rdi]
	movdqa	xmm2,XMMWORD[32+rdi]
	pand	xmm4,XMMWORD[240+r10]
	movdqa	xmm3,XMMWORD[48+rdi]
	pand	xmm5,XMMWORD[256+r10]
	por	xmm0,xmm4
	pand	xmm2,XMMWORD[272+r10]
	por	xmm1,xmm5
	pand	xmm3,XMMWORD[288+r10]
	por	xmm0,xmm2
	por	xmm1,xmm3
	pxor	xmm0,xmm1

	pshufd	xmm1,xmm0,0x4e	; swap the two 64-bit halves
	por	xmm0,xmm1	; fold the high half into the low half
	lea	rdi,[256+rdi]	; advance to the next table row
DB	102,72,15,126,194	; encodes movq rdx,xmm0 (selected word becomes the MULX multiplier)
	lea	rbx,[((64+32+8))+rsp]	; rbx = accumulator write pointer

; First pass.  r9 keeps the selected word, r8 the derived factor; rdx
; alternates between the two as MULX's implicit operand.
	mov	r9,rdx
	mulx	rax,r8,QWORD[rsi]
	mulx	r12,r11,QWORD[8+rsi]
	add	r11,rax
	mulx	r13,rax,QWORD[16+rsi]
	adc	r12,rax
	adc	r13,0
	mulx	r14,rax,QWORD[24+rsi]

	mov	r15,r8
	imul	r8,QWORD[((32+8))+rsp]	; r8 = low word * constant
	xor	rbp,rbp	; rbp = 0, also clears CF and OF
	mov	rdx,r8

	mov	QWORD[((8+8))+rsp],rdi	; rdi is about to become the loop counter

	lea	rsi,[32+rsi]
	adcx	r13,rax
	adcx	r14,rbp

	mulx	r10,rax,QWORD[rcx]
	adcx	r15,rax
	adox	r10,r11
	mulx	r11,rax,QWORD[8+rcx]
	adcx	r10,rax
	adox	r11,r12
	mulx	r12,rax,QWORD[16+rcx]
	mov	rdi,QWORD[((24+8))+rsp]	; rdi = inner-loop trip count
	mov	QWORD[((-32))+rbx],r10
	adcx	r11,rax
	adox	r12,r13
	mulx	r15,rax,QWORD[24+rcx]
	mov	rdx,r9
	mov	QWORD[((-24))+rbx],r11
	adcx	r12,rax
	adox	r15,rbp
	lea	rcx,[32+rcx]
	mov	QWORD[((-16))+rbx],r12
	jmp	NEAR $L$mulx4x_1st

ALIGN	32
; Four more words per iteration: multiply [rsi] by r9, then [rcx] by r8,
; and store the combined result one word behind rbx.
$L$mulx4x_1st:
	adcx	r15,rbp
	mulx	rax,r10,QWORD[rsi]
	adcx	r10,r14
	mulx	r14,r11,QWORD[8+rsi]
	adcx	r11,rax
	mulx	rax,r12,QWORD[16+rsi]
	adcx	r12,r14
	mulx	r14,r13,QWORD[24+rsi]
	DB	0x67,0x67
	mov	rdx,r8
	adcx	r13,rax
	adcx	r14,rbp
	lea	rsi,[32+rsi]
	lea	rbx,[32+rbx]

	adox	r10,r15
	mulx	r15,rax,QWORD[rcx]
	adcx	r10,rax
	adox	r11,r15
	mulx	r15,rax,QWORD[8+rcx]
	adcx	r11,rax
	adox	r12,r15
	mulx	r15,rax,QWORD[16+rcx]
	mov	QWORD[((-40))+rbx],r10
	adcx	r12,rax
	mov	QWORD[((-32))+rbx],r11
	adox	r13,r15
	mulx	r15,rax,QWORD[24+rcx]
	mov	rdx,r9
	mov	QWORD[((-24))+rbx],r12
	adcx	r13,rax
	adox	r15,rbp
	lea	rcx,[32+rcx]
	mov	QWORD[((-16))+rbx],r13

	dec	rdi	; dec leaves CF untouched for the adc below
	jnz	NEAR $L$mulx4x_1st

	mov	rax,QWORD[8+rsp]	; rax = negated byte length
	adc	r15,rbp
	lea	rsi,[rax*1+rsi]	; rewind rsi to the first word
	add	r14,r15
	mov	rdi,QWORD[((8+8))+rsp]	; rdi = table row pointer again
	adc	rbp,rbp	; rbp = top carry of this pass
	mov	QWORD[((-8))+rbx],r14
	jmp	NEAR $L$mulx4x_outer

ALIGN	32
; One pass per remaining table row.
$L$mulx4x_outer:
	lea	r10,[((16-256))+rbx]	; re-derive the mask array base from rbx
	pxor	xmm4,xmm4
	DB	0x67,0x67
	pxor	xmm5,xmm5
	movdqa	xmm0,XMMWORD[((-128))+rdi]
	movdqa	xmm1,XMMWORD[((-112))+rdi]
	movdqa	xmm2,XMMWORD[((-96))+rdi]
	pand	xmm0,XMMWORD[256+r10]
	movdqa	xmm3,XMMWORD[((-80))+rdi]
	pand	xmm1,XMMWORD[272+r10]
	por	xmm4,xmm0
	pand	xmm2,XMMWORD[288+r10]
	por	xmm5,xmm1
	pand	xmm3,XMMWORD[304+r10]
	por	xmm4,xmm2
	por	xmm5,xmm3
	movdqa	xmm0,XMMWORD[((-64))+rdi]
	movdqa	xmm1,XMMWORD[((-48))+rdi]
	movdqa	xmm2,XMMWORD[((-32))+rdi]
	pand	xmm0,XMMWORD[320+r10]
	movdqa	xmm3,XMMWORD[((-16))+rdi]
	pand	xmm1,XMMWORD[336+r10]
	por	xmm4,xmm0
	pand	xmm2,XMMWORD[352+r10]
	por	xmm5,xmm1
	pand	xmm3,XMMWORD[368+r10]
	por	xmm4,xmm2
	por	xmm5,xmm3
	movdqa	xmm0,XMMWORD[rdi]
	movdqa	xmm1,XMMWORD[16+rdi]
	movdqa	xmm2,XMMWORD[32+rdi]
	pand	xmm0,XMMWORD[384+r10]
	movdqa	xmm3,XMMWORD[48+rdi]
	pand	xmm1,XMMWORD[400+r10]
	por	xmm4,xmm0
	pand	xmm2,XMMWORD[416+r10]
	por	xmm5,xmm1
	pand	xmm3,XMMWORD[432+r10]
	por	xmm4,xmm2
	por	xmm5,xmm3
	movdqa	xmm0,XMMWORD[64+rdi]
	movdqa	xmm1,XMMWORD[80+rdi]
	movdqa	xmm2,XMMWORD[96+rdi]
	pand	xmm0,XMMWORD[448+r10]
	movdqa	xmm3,XMMWORD[112+rdi]
	pand	xmm1,XMMWORD[464+r10]
	por	xmm4,xmm0
	pand	xmm2,XMMWORD[480+r10]
	por	xmm5,xmm1
	pand	xmm3,XMMWORD[496+r10]
	por	xmm4,xmm2
	por	xmm5,xmm3
	por	xmm4,xmm5

	pshufd	xmm0,xmm4,0x4e
	por	xmm0,xmm4
	lea	rdi,[256+rdi]
DB	102,72,15,126,194	; encodes movq rdx,xmm0

	mov	QWORD[rbx],rbp	; store the previous pass's top carry
	lea	rbx,[32+rax*1+rbx]	; rewind rbx (rax = negated byte length)
	mulx	r11,r8,QWORD[rsi]
	xor	rbp,rbp	; rbp = 0, clears CF and OF
	mov	r9,rdx
	mulx	r12,r14,QWORD[8+rsi]
	adox	r8,QWORD[((-32))+rbx]
	adcx	r11,r14
	mulx	r13,r15,QWORD[16+rsi]
	adox	r11,QWORD[((-24))+rbx]
	adcx	r12,r15
	mulx	r14,rdx,QWORD[24+rsi]
	adox	r12,QWORD[((-16))+rbx]
	adcx	r13,rdx
	lea	rcx,[rax*1+rcx]	; rewind rcx
	lea	rsi,[32+rsi]
	adox	r13,QWORD[((-8))+rbx]
	adcx	r14,rbp
	adox	r14,rbp

	mov	r15,r8
	imul	r8,QWORD[((32+8))+rsp]	; factor for this pass

	mov	rdx,r8
	xor	rbp,rbp
	mov	QWORD[((8+8))+rsp],rdi

	mulx	r10,rax,QWORD[rcx]
	adcx	r15,rax
	adox	r10,r11
	mulx	r11,rax,QWORD[8+rcx]
	adcx	r10,rax
	adox	r11,r12
	mulx	r12,rax,QWORD[16+rcx]
	adcx	r11,rax
	adox	r12,r13
	mulx	r15,rax,QWORD[24+rcx]
	mov	rdx,r9
	mov	rdi,QWORD[((24+8))+rsp]	; rdi = inner-loop trip count
	mov	QWORD[((-32))+rbx],r10
	adcx	r12,rax
	mov	QWORD[((-24))+rbx],r11
	adox	r15,rbp
	mov	QWORD[((-16))+rbx],r12
	lea	rcx,[32+rcx]
	jmp	NEAR $L$mulx4x_inner

ALIGN	32
; Same shape as $L$mulx4x_1st, but also adds the words already at [rbx].
$L$mulx4x_inner:
	mulx	rax,r10,QWORD[rsi]
	adcx	r15,rbp
	adox	r10,r14
	mulx	r14,r11,QWORD[8+rsi]
	adcx	r10,QWORD[rbx]
	adox	r11,rax
	mulx	rax,r12,QWORD[16+rsi]
	adcx	r11,QWORD[8+rbx]
	adox	r12,r14
	mulx	r14,r13,QWORD[24+rsi]
	mov	rdx,r8
	adcx	r12,QWORD[16+rbx]
	adox	r13,rax
	adcx	r13,QWORD[24+rbx]
	adox	r14,rbp
	lea	rsi,[32+rsi]
	lea	rbx,[32+rbx]
	adcx	r14,rbp

	adox	r10,r15
	mulx	r15,rax,QWORD[rcx]
	adcx	r10,rax
	adox	r11,r15
	mulx	r15,rax,QWORD[8+rcx]
	adcx	r11,rax
	adox	r12,r15
	mulx	r15,rax,QWORD[16+rcx]
	mov	QWORD[((-40))+rbx],r10
	adcx	r12,rax
	adox	r13,r15
	mov	QWORD[((-32))+rbx],r11
	mulx	r15,rax,QWORD[24+rcx]
	mov	rdx,r9
	lea	rcx,[32+rcx]
	mov	QWORD[((-24))+rbx],r12
	adcx	r13,rax
	adox	r15,rbp
	mov	QWORD[((-16))+rbx],r13

	dec	rdi
	jnz	NEAR $L$mulx4x_inner

	mov	rax,QWORD[((0+8))+rsp]	; rax = negated byte length
	adc	r15,rbp
	sub	rdi,QWORD[rbx]	; rdi is 0 here, so CF = ([rbx] != 0): re-inject the stored top carry
	mov	rdi,QWORD[((8+8))+rsp]
	mov	r10,QWORD[((16+8))+rsp]	; r10 = table end pointer
	adc	r14,r15
	lea	rsi,[rax*1+rsi]
	adc	rbp,rbp
	mov	QWORD[((-8))+rbx],r14

	cmp	rdi,r10
	jb	NEAR $L$mulx4x_outer	; more table rows to process

; Set up the masked subtraction performed at $L$sqrx4x_sub_entry.
	mov	r10,QWORD[((-8))+rcx]	; last word at rcx
	mov	r8,rbp	; r8 = top carry
	mov	r12,QWORD[rax*1+rcx]
	lea	rbp,[rax*1+rcx]	; rbp = rewound rcx
	mov	rcx,rax
	lea	rdi,[rax*1+rbx]	; rdi = rewound accumulator pointer
	xor	eax,eax
	xor	r15,r15
	sub	r10,r14	; CF = (last word at rcx) < (top accumulator word)
	adc	r15,r15
	or	r8,r15
	sar	rcx,3+2	; rcx = negated count of 4-word groups
	sub	rax,r8	; rax = -(carry | borrow): the subtraction mask
	mov	rdx,QWORD[((56+8))+rsp]	; rdx = saved output pointer
	dec	r12
	mov	r13,QWORD[8+rbp]
	xor	r8,r8
	mov	r14,QWORD[16+rbp]
	mov	r15,QWORD[24+rbp]
	jmp	NEAR $L$sqrx4x_sub_entry
2663
2664
2665
2666ALIGN	32
;-----------------------------------------------------------------------
; bn_powerx5
;
; Win64 wrapper: moves the Win64 arguments into rdi, rsi, rdx, rcx, r8,
; r9, saves callee-saved registers, carves an aligned scratch frame,
; then runs five (__bn_sqrx8x_internal, __bn_postx4x_internal) pairs
; followed by one mulx4x_internal call, and returns 1 in rax.
; NOTE(review): presumably this computes five modular squarings followed
; by one multiplication by a gathered table entry -- confirm against the
; Perl source and the C prototype.
;
; Values parked in XMM registers across the calls:
;   xmm0 = 0, xmm1 = rdi, xmm2 = rcx, xmm3 = negated byte length,
;   xmm4 = rdx
; Frame: QWORD[32+rsp] = *r8 (the word r8 pointed to)
;        QWORD[40+rsp] = rsp as it was at function entry
; Also entered at $L$powerx5_enter with arguments already in rdi..r9 and
; rax = entry rsp.
;-----------------------------------------------------------------------
bn_powerx5:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_bn_powerx5:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]



	mov	rax,rsp	; rax = entry rsp

$L$powerx5_enter:
	push	rbx

	push	rbp

	push	r12

	push	r13

	push	r14

	push	r15

$L$powerx5_prologue:

	shl	r9d,3	; r9 = 8 * r9d (32-bit op zero-extends)
	lea	r10,[r9*2+r9]	; r10 = 3 * r9
	neg	r9
	mov	r8,QWORD[r8]	; load the word r8 points at








; Frame placement: same arithmetic as in bn_mulx4x_mont_gather5.
; NOTE(review): presumably avoids aliasing the output buffer modulo 4096.
	lea	r11,[((-320))+r9*2+rsp]
	mov	rbp,rsp
	sub	r11,rdi
	and	r11,4095
	cmp	r10,r11
	jb	NEAR $L$pwrx_sp_alt
	sub	rbp,r11
	lea	rbp,[((-320))+r9*2+rbp]
	jmp	NEAR $L$pwrx_sp_done

ALIGN	32
$L$pwrx_sp_alt:
	lea	r10,[((4096-320))+r9*2]
	lea	rbp,[((-320))+r9*2+rbp]
	sub	r11,r10
	mov	r10,0	; mov, not xor: CF from the sub must survive for cmovc
	cmovc	r11,r10
	sub	rbp,r11
$L$pwrx_sp_done:
	and	rbp,-64	; 64-byte align the frame
; Lower rsp to rbp one 4096-byte page at a time, loading from each page
; (value discarded).  NOTE(review): presumably a guard-page probe.
	mov	r11,rsp
	sub	r11,rbp
	and	r11,-4096
	lea	rsp,[rbp*1+r11]
	mov	r10,QWORD[rsp]
	cmp	rsp,rbp
	ja	NEAR $L$pwrx_page_walk
	jmp	NEAR $L$pwrx_page_walk_done

$L$pwrx_page_walk:
	lea	rsp,[((-4096))+rsp]
	mov	r10,QWORD[rsp]
	cmp	rsp,rbp
	ja	NEAR $L$pwrx_page_walk
$L$pwrx_page_walk_done:

	mov	r10,r9	; r10 = negated byte length
	neg	r9	; r9 = byte length












	pxor	xmm0,xmm0
DB	102,72,15,110,207	; encodes movq xmm1,rdi
DB	102,72,15,110,209	; encodes movq xmm2,rcx
DB	102,73,15,110,218	; encodes movq xmm3,r10
DB	102,72,15,110,226	; encodes movq xmm4,rdx
	mov	QWORD[32+rsp],r8
	mov	QWORD[40+rsp],rax	; remember entry rsp for the epilogue

$L$powerx5_body:

	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

; Re-establish mulx4x_internal's inputs from the parked values.
	mov	r9,r10
	mov	rdi,rsi
DB	102,72,15,126,209	; encodes movq rcx,xmm2
DB	102,72,15,126,226	; encodes movq rdx,xmm4
	mov	rax,QWORD[40+rsp]	; entry rsp, used to reach DWORD[56+rax]

	call	mulx4x_internal

	mov	rsi,QWORD[40+rsp]	; rsi = entry rsp

	mov	rax,1	; return value

; Reload the six registers pushed in the prologue.
	mov	r15,QWORD[((-48))+rsi]

	mov	r14,QWORD[((-40))+rsi]

	mov	r13,QWORD[((-32))+rsi]

	mov	r12,QWORD[((-24))+rsi]

	mov	rbp,QWORD[((-16))+rsi]

	mov	rbx,QWORD[((-8))+rsi]

	lea	rsp,[rsi]

$L$powerx5_epilogue:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_bn_powerx5:
2813
2814global	bn_sqrx8x_internal
2815
2816
2817ALIGN	32
;-----------------------------------------------------------------------
; bn_sqrx8x_internal / __bn_sqrx8x_internal
;
; MULX/ADCX/ADOX squaring kernel followed by an in-place reduction.  It
; is entered by `call`, so every [N+8+rsp] here is the caller's [N+rsp].
;
; Phases, in order:
;   1. $L$sqrx8x_zero        : clear the scratch buffer at [48+8+rsp]
;                              with xmm0 (the first 64 bytes are skipped).
;   2. $L$sqrx8x_outer_loop /
;      $L$sqrx8x_loop        : accumulate the cross products of the
;                              words at rsi, eight words at a time.
;   3. $L$sqrx4x_shift_n_add : double every accumulated word (adox r,r)
;                              and add the square of each input word
;                              (mulx rbx,rax,rdx with rdx = that word).
;   4. __bn_sqrx8x_reduction : eight-word windows; $L$sqrx8x_reduce
;                              derives and saves eight factors,
;                              $L$sqrx8x_tail replays them.
; NOTE(review): phase 4 has the shape of a Montgomery reduction with
; xmm2 = modulus pointer and QWORD[32+8+rsp] = n0 -- confirm against the
; Perl source.
;
; In:    rsi  = input pointer
;        r9   = byte length
;        xmm0 = 0 (used as the zero source in phase 1)
;        xmm2 = pointer loaded into rbp before phase 4
;        xmm3 = index loaded into rcx for phase 3 and at the end of each
;               reduction window (presumably the negated byte length)
;        QWORD[32+8+rsp] = constant used to derive the reduction factors
; Out:   rax  = carry out of the last reduction window
; Stack: QWORD[0+8+rsp]  = byte length, later the end of the rbp walk
;        QWORD[8+8+rsp]  = end of input, later the end of the buffer
;        QWORD[16+8+rsp] = saved carry (0 or -1)
;        QWORD[24+8+rsp] = saved rdi, later the top carry
; Clobb: all general-purpose registers except rsp, flags
; Note:  `DB 0xc4,...` lines are hand-encoded MULX instructions (decoded
;        in the trailing comments); lone `DB 0x3e/0x66/0x67` bytes are
;        prefixes on the next instruction, presumably padding.
;-----------------------------------------------------------------------
bn_sqrx8x_internal:
__bn_sqrx8x_internal:

_CET_ENDBR








































	lea	rdi,[((48+8))+rsp]	; rdi = scratch buffer
	lea	rbp,[r9*1+rsi]	; rbp = end of input
	mov	QWORD[((0+8))+rsp],r9
	mov	QWORD[((8+8))+rsp],rbp
	jmp	NEAR $L$sqr8x_zero_start

ALIGN	32
	DB	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00	; multi-byte NOP used as padding
$L$sqrx8x_zero:
	DB	0x3e
	movdqa	XMMWORD[rdi],xmm0
	movdqa	XMMWORD[16+rdi],xmm0
	movdqa	XMMWORD[32+rdi],xmm0
	movdqa	XMMWORD[48+rdi],xmm0
$L$sqr8x_zero_start:
	movdqa	XMMWORD[64+rdi],xmm0
	movdqa	XMMWORD[80+rdi],xmm0
	movdqa	XMMWORD[96+rdi],xmm0
	movdqa	XMMWORD[112+rdi],xmm0
	lea	rdi,[128+rdi]
	sub	r9,64
	jnz	NEAR $L$sqrx8x_zero

	mov	rdx,QWORD[rsi]	; rdx = first input word (MULX multiplier)

	xor	r10,r10
	xor	r11,r11
	xor	r12,r12
	xor	r13,r13
	xor	r14,r14
	xor	r15,r15
	lea	rdi,[((48+8))+rsp]
	xor	rbp,rbp	; rbp = 0, also clears CF and OF for the chains below
	jmp	NEAR $L$sqrx8x_outer_loop

ALIGN	32
; Cross products inside one eight-word block at rsi.
$L$sqrx8x_outer_loop:
	mulx	rax,r8,QWORD[8+rsi]
	adcx	r8,r9
	adox	r10,rax
	mulx	rax,r9,QWORD[16+rsi]
	adcx	r9,r10
	adox	r11,rax
	DB	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	; mulx rax,r10,QWORD[24+rsi]
	adcx	r10,r11
	adox	r12,rax
	DB	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	; mulx rax,r11,QWORD[32+rsi]
	adcx	r11,r12
	adox	r13,rax
	mulx	rax,r12,QWORD[40+rsi]
	adcx	r12,r13
	adox	r14,rax
	mulx	rax,r13,QWORD[48+rsi]
	adcx	r13,r14
	adox	rax,r15
	mulx	r15,r14,QWORD[56+rsi]
	mov	rdx,QWORD[8+rsi]
	adcx	r14,rax
	adox	r15,rbp
	adc	r15,QWORD[64+rdi]
	mov	QWORD[8+rdi],r8
	mov	QWORD[16+rdi],r9
	sbb	rcx,rcx	; rcx = -carry, restored by `neg rcx` further down
	xor	rbp,rbp


	mulx	rbx,r8,QWORD[16+rsi]
	mulx	rax,r9,QWORD[24+rsi]
	adcx	r8,r10
	adox	r9,rbx
	mulx	rbx,r10,QWORD[32+rsi]
	adcx	r9,r11
	adox	r10,rax
	DB	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	; mulx rax,r11,QWORD[40+rsi]
	adcx	r10,r12
	adox	r11,rbx
	DB	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	; mulx rbx,r12,QWORD[48+rsi]
	adcx	r11,r13
	adox	r12,r14
	DB	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	; mulx r14,r13,QWORD[56+rsi]
	mov	rdx,QWORD[16+rsi]
	adcx	r12,rax
	adox	r13,rbx
	adcx	r13,r15
	adox	r14,rbp
	adcx	r14,rbp

	mov	QWORD[24+rdi],r8
	mov	QWORD[32+rdi],r9

	mulx	rbx,r8,QWORD[24+rsi]
	mulx	rax,r9,QWORD[32+rsi]
	adcx	r8,r10
	adox	r9,rbx
	mulx	rbx,r10,QWORD[40+rsi]
	adcx	r9,r11
	adox	r10,rax
	DB	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	; mulx rax,r11,QWORD[48+rsi]
	adcx	r10,r12
	adox	r11,r13
	DB	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	; mulx r13,r12,QWORD[56+rsi]
	DB	0x3e
	mov	rdx,QWORD[24+rsi]
	adcx	r11,rbx
	adox	r12,rax
	adcx	r12,r14
	mov	QWORD[40+rdi],r8
	mov	QWORD[48+rdi],r9
	mulx	rax,r8,QWORD[32+rsi]
	adox	r13,rbp
	adcx	r13,rbp

	mulx	rbx,r9,QWORD[40+rsi]
	adcx	r8,r10
	adox	r9,rax
	mulx	rax,r10,QWORD[48+rsi]
	adcx	r9,r11
	adox	r10,r12
	mulx	r12,r11,QWORD[56+rsi]
	mov	rdx,QWORD[32+rsi]
	mov	r14,QWORD[40+rsi]
	adcx	r10,rbx
	adox	r11,rax
	mov	r15,QWORD[48+rsi]
	adcx	r11,r13
	adox	r12,rbp
	adcx	r12,rbp

	mov	QWORD[56+rdi],r8
	mov	QWORD[64+rdi],r9

	mulx	rax,r9,r14
	mov	r8,QWORD[56+rsi]
	adcx	r9,r10
	mulx	rbx,r10,r15
	adox	r10,rax
	adcx	r10,r11
	mulx	rax,r11,r8
	mov	rdx,r14
	adox	r11,rbx
	adcx	r11,r12

	adcx	rax,rbp

	mulx	rbx,r14,r15
	mulx	r13,r12,r8
	mov	rdx,r15
	lea	rsi,[64+rsi]	; advance to the next eight-word block
	adcx	r11,r14
	adox	r12,rbx
	adcx	r12,rax
	adox	r13,rbp

	DB	0x67,0x67
	mulx	r14,r8,r8
	adcx	r13,r8
	adcx	r14,rbp

	cmp	rsi,QWORD[((8+8))+rsp]
	je	NEAR $L$sqrx8x_outer_break	; that was the last block

	neg	rcx	; CF = carry saved by `sbb rcx,rcx`
	mov	rcx,-8	; loop index; mov leaves CF intact
	mov	r15,rbp
	mov	r8,QWORD[64+rdi]
	adcx	r9,QWORD[72+rdi]
	adcx	r10,QWORD[80+rdi]
	adcx	r11,QWORD[88+rdi]
	adc	r12,QWORD[96+rdi]
	adc	r13,QWORD[104+rdi]
	adc	r14,QWORD[112+rdi]
	adc	r15,QWORD[120+rdi]
	lea	rbp,[rsi]	; rbp walks the remaining input blocks
	lea	rdi,[128+rdi]
	sbb	rax,rax

	mov	rdx,QWORD[((-64))+rsi]
	mov	QWORD[((16+8))+rsp],rax	; saved carry
	mov	QWORD[((24+8))+rsp],rdi	; where the next outer iteration resumes


	xor	eax,eax	; clears CF and OF
	jmp	NEAR $L$sqrx8x_loop

ALIGN	32
; Multiplies the eight words at rbp by one word of the current block
; per iteration (rcx runs -8..-1), storing one finished word each time.
$L$sqrx8x_loop:
	mov	rbx,r8
	mulx	r8,rax,QWORD[rbp]
	adcx	rbx,rax
	adox	r8,r9

	mulx	r9,rax,QWORD[8+rbp]
	adcx	r8,rax
	adox	r9,r10

	mulx	r10,rax,QWORD[16+rbp]
	adcx	r9,rax
	adox	r10,r11

	mulx	r11,rax,QWORD[24+rbp]
	adcx	r10,rax
	adox	r11,r12

	DB	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	; mulx r12,rax,QWORD[32+rbp]
	adcx	r11,rax
	adox	r12,r13

	mulx	r13,rax,QWORD[40+rbp]
	adcx	r12,rax
	adox	r13,r14

	mulx	r14,rax,QWORD[48+rbp]
	mov	QWORD[rcx*8+rdi],rbx
	mov	ebx,0	; zero via mov so the live CF/OF chains are not disturbed
	adcx	r13,rax
	adox	r14,r15

	DB	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	; mulx r15,rax,QWORD[56+rbp]
	mov	rdx,QWORD[8+rcx*8+rsi]	; next multiplier word
	adcx	r14,rax
	adox	r15,rbx
	adcx	r15,rbx

	DB	0x67
	inc	rcx
	jnz	NEAR $L$sqrx8x_loop

	lea	rbp,[64+rbp]
	mov	rcx,-8
	cmp	rbp,QWORD[((8+8))+rsp]
	je	NEAR $L$sqrx8x_break

	sub	rbx,QWORD[((16+8))+rsp]	; rbx is 0: CF = saved carry
	DB	0x66
	mov	rdx,QWORD[((-64))+rsi]
	adcx	r8,QWORD[rdi]
	adcx	r9,QWORD[8+rdi]
	adc	r10,QWORD[16+rdi]
	adc	r11,QWORD[24+rdi]
	adc	r12,QWORD[32+rdi]
	adc	r13,QWORD[40+rdi]
	adc	r14,QWORD[48+rdi]
	adc	r15,QWORD[56+rdi]
	lea	rdi,[64+rdi]
	DB	0x67
	sbb	rax,rax
	xor	ebx,ebx
	mov	QWORD[((16+8))+rsp],rax
	jmp	NEAR $L$sqrx8x_loop

ALIGN	32
$L$sqrx8x_break:
	xor	rbp,rbp
	sub	rbx,QWORD[((16+8))+rsp]	; CF = saved carry
	adcx	r8,rbp
	mov	rcx,QWORD[((24+8))+rsp]	; rcx = resume position saved above
	adcx	r9,rbp
	mov	rdx,QWORD[rsi]
	adc	r10,0
	mov	QWORD[rdi],r8
	adc	r11,0
	adc	r12,0
	adc	r13,0
	adc	r14,0
	adc	r15,0
	cmp	rdi,rcx
	je	NEAR $L$sqrx8x_outer_loop

; Spill the current window and reload the one at the resume position.
	mov	QWORD[8+rdi],r9
	mov	r9,QWORD[8+rcx]
	mov	QWORD[16+rdi],r10
	mov	r10,QWORD[16+rcx]
	mov	QWORD[24+rdi],r11
	mov	r11,QWORD[24+rcx]
	mov	QWORD[32+rdi],r12
	mov	r12,QWORD[32+rcx]
	mov	QWORD[40+rdi],r13
	mov	r13,QWORD[40+rcx]
	mov	QWORD[48+rdi],r14
	mov	r14,QWORD[48+rcx]
	mov	QWORD[56+rdi],r15
	mov	r15,QWORD[56+rcx]
	mov	rdi,rcx
	jmp	NEAR $L$sqrx8x_outer_loop

ALIGN	32
$L$sqrx8x_outer_break:
	mov	QWORD[72+rdi],r9
DB	102,72,15,126,217	; encodes movq rcx,xmm3
	mov	QWORD[80+rdi],r10
	mov	QWORD[88+rdi],r11
	mov	QWORD[96+rdi],r12
	mov	QWORD[104+rdi],r13
	mov	QWORD[112+rdi],r14
	lea	rdi,[((48+8))+rsp]	; back to the start of the buffer
	mov	rdx,QWORD[rcx*1+rsi]	; rsi is at the end of the input; rcx indexes back from it

	mov	r11,QWORD[8+rdi]
	xor	r10,r10	; clears CF and OF
	mov	r9,QWORD[((0+8))+rsp]	; r9 = byte length
	adox	r11,r11
	mov	r12,QWORD[16+rdi]
	mov	r13,QWORD[24+rdi]


ALIGN	32
; Per input word: double two accumulated words on the OF chain and add
; the word's square on the CF chain.  Four input words per iteration.
$L$sqrx4x_shift_n_add:
	mulx	rbx,rax,rdx
	adox	r12,r12
	adcx	rax,r10
	DB	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	; mov rdx,QWORD[8+rcx*1+rsi]
	DB	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	; mov r10,QWORD[32+rdi]
	adox	r13,r13
	adcx	rbx,r11
	mov	r11,QWORD[40+rdi]
	mov	QWORD[rdi],rax
	mov	QWORD[8+rdi],rbx

	mulx	rbx,rax,rdx
	adox	r10,r10
	adcx	rax,r12
	mov	rdx,QWORD[16+rcx*1+rsi]
	mov	r12,QWORD[48+rdi]
	adox	r11,r11
	adcx	rbx,r13
	mov	r13,QWORD[56+rdi]
	mov	QWORD[16+rdi],rax
	mov	QWORD[24+rdi],rbx

	mulx	rbx,rax,rdx
	adox	r12,r12
	adcx	rax,r10
	mov	rdx,QWORD[24+rcx*1+rsi]
	lea	rcx,[32+rcx]
	mov	r10,QWORD[64+rdi]
	adox	r13,r13
	adcx	rbx,r11
	mov	r11,QWORD[72+rdi]
	mov	QWORD[32+rdi],rax
	mov	QWORD[40+rdi],rbx

	mulx	rbx,rax,rdx
	adox	r10,r10
	adcx	rax,r12
	jrcxz	$L$sqrx4x_shift_n_add_break	; exit test that does not touch the live flags
	DB	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	; mov rdx,QWORD[0+rcx*1+rsi] (disp32 form)
	adox	r11,r11
	adcx	rbx,r13
	mov	r12,QWORD[80+rdi]
	mov	r13,QWORD[88+rdi]
	mov	QWORD[48+rdi],rax
	mov	QWORD[56+rdi],rbx
	lea	rdi,[64+rdi]
	nop
	jmp	NEAR $L$sqrx4x_shift_n_add

ALIGN	32
$L$sqrx4x_shift_n_add_break:
	adcx	rbx,r13
	mov	QWORD[48+rdi],rax
	mov	QWORD[56+rdi],rbx
	lea	rdi,[64+rdi]
DB	102,72,15,126,213	; encodes movq rbp,xmm2
; Reduction phase; also a named label of its own.
__bn_sqrx8x_reduction:
	xor	eax,eax	; no carry into the first window
	mov	rbx,QWORD[((32+8))+rsp]	; rbx = factor-derivation constant
	mov	rdx,QWORD[((48+8))+rsp]	; rdx = first buffer word
	lea	rcx,[((-64))+r9*1+rbp]	; rcx = rbp + r9 - 64

	mov	QWORD[((0+8))+rsp],rcx	; end marker for the rbp walk
	mov	QWORD[((8+8))+rsp],rdi	; end of the buffer

	lea	rdi,[((48+8))+rsp]
	jmp	NEAR $L$sqrx8x_reduction_loop

ALIGN	32
; One iteration per 64-byte window of the buffer.
$L$sqrx8x_reduction_loop:
	mov	r9,QWORD[8+rdi]
	mov	r10,QWORD[16+rdi]
	mov	r11,QWORD[24+rdi]
	mov	r12,QWORD[32+rdi]
	mov	r8,rdx	; r8 = low word of the window
	imul	rdx,rbx	; rdx = first factor
	mov	r13,QWORD[40+rdi]
	mov	r14,QWORD[48+rdi]
	mov	r15,QWORD[56+rdi]
	mov	QWORD[((24+8))+rsp],rax	; park the incoming carry

	lea	rdi,[64+rdi]
	xor	rsi,rsi	; rsi = 0, clears CF and OF
	mov	rcx,-8
	jmp	NEAR $L$sqrx8x_reduce

ALIGN	32
; One round: add factor * [rbp..rbp+56] to r8..r15 and shift down one
; word; the next factor is computed mid-round from the new low word.
$L$sqrx8x_reduce:
	mov	rbx,r8
	mulx	r8,rax,QWORD[rbp]
	adcx	rax,rbx
	adox	r8,r9

	mulx	r9,rbx,QWORD[8+rbp]
	adcx	r8,rbx
	adox	r9,r10

	mulx	r10,rbx,QWORD[16+rbp]
	adcx	r9,rbx
	adox	r10,r11

	mulx	r11,rbx,QWORD[24+rbp]
	adcx	r10,rbx
	adox	r11,r12

	DB	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	; mulx r12,rbx,QWORD[32+rbp]
	mov	rax,rdx	; stash the current factor
	mov	rdx,r8
	adcx	r11,rbx
	adox	r12,r13

	mulx	rdx,rbx,QWORD[((32+8))+rsp]	; rbx = next factor (low half); flags untouched, rdx result unused
	mov	rdx,rax
	mov	QWORD[((64+48+8))+rcx*8+rsp],rax	; save the current factor for $L$sqrx8x_tail

	mulx	r13,rax,QWORD[40+rbp]
	adcx	r12,rax
	adox	r13,r14

	mulx	r14,rax,QWORD[48+rbp]
	adcx	r13,rax
	adox	r14,r15

	mulx	r15,rax,QWORD[56+rbp]
	mov	rdx,rbx	; switch to the next factor
	adcx	r14,rax
	adox	r15,rsi
	adcx	r15,rsi

	DB	0x67,0x67,0x67
	inc	rcx
	jnz	NEAR $L$sqrx8x_reduce

	mov	rax,rsi	; rax = 0
	cmp	rbp,QWORD[((0+8))+rsp]
	jae	NEAR $L$sqrx8x_no_tail	; CF is clear on this path

	mov	rdx,QWORD[((48+8))+rsp]	; first saved factor
	add	r8,QWORD[rdi]
	lea	rbp,[64+rbp]
	mov	rcx,-8
	adcx	r9,QWORD[8+rdi]
	adcx	r10,QWORD[16+rdi]
	adc	r11,QWORD[24+rdi]
	adc	r12,QWORD[32+rdi]
	adc	r13,QWORD[40+rdi]
	adc	r14,QWORD[48+rdi]
	adc	r15,QWORD[56+rdi]
	lea	rdi,[64+rdi]
	sbb	rax,rax

	xor	rsi,rsi
	mov	QWORD[((16+8))+rsp],rax	; saved carry
	jmp	NEAR $L$sqrx8x_tail

ALIGN	32
; Replays one saved factor per iteration against the eight words at rbp.
$L$sqrx8x_tail:
	mov	rbx,r8
	mulx	r8,rax,QWORD[rbp]
	adcx	rbx,rax
	adox	r8,r9

	mulx	r9,rax,QWORD[8+rbp]
	adcx	r8,rax
	adox	r9,r10

	mulx	r10,rax,QWORD[16+rbp]
	adcx	r9,rax
	adox	r10,r11

	mulx	r11,rax,QWORD[24+rbp]
	adcx	r10,rax
	adox	r11,r12

	DB	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	; mulx r12,rax,QWORD[32+rbp]
	adcx	r11,rax
	adox	r12,r13

	mulx	r13,rax,QWORD[40+rbp]
	adcx	r12,rax
	adox	r13,r14

	mulx	r14,rax,QWORD[48+rbp]
	adcx	r13,rax
	adox	r14,r15

	mulx	r15,rax,QWORD[56+rbp]
	mov	rdx,QWORD[((72+48+8))+rcx*8+rsp]	; next saved factor
	adcx	r14,rax
	adox	r15,rsi
	mov	QWORD[rcx*8+rdi],rbx	; store the finished word
	mov	rbx,r8
	adcx	r15,rsi

	inc	rcx
	jnz	NEAR $L$sqrx8x_tail

	cmp	rbp,QWORD[((0+8))+rsp]
	jae	NEAR $L$sqrx8x_tail_done

	sub	rsi,QWORD[((16+8))+rsp]	; rsi is 0: CF = saved carry
	mov	rdx,QWORD[((48+8))+rsp]
	lea	rbp,[64+rbp]
	adc	r8,QWORD[rdi]
	adc	r9,QWORD[8+rdi]
	adc	r10,QWORD[16+rdi]
	adc	r11,QWORD[24+rdi]
	adc	r12,QWORD[32+rdi]
	adc	r13,QWORD[40+rdi]
	adc	r14,QWORD[48+rdi]
	adc	r15,QWORD[56+rdi]
	lea	rdi,[64+rdi]
	sbb	rax,rax
	sub	rcx,8	; rcx was 0 after the loop: restart the index at -8

	xor	rsi,rsi
	mov	QWORD[((16+8))+rsp],rax
	jmp	NEAR $L$sqrx8x_tail

ALIGN	32
$L$sqrx8x_tail_done:
	xor	rax,rax
	add	r8,QWORD[((24+8))+rsp]	; add the carry parked at the top of the window loop
	adc	r9,0
	adc	r10,0
	adc	r11,0
	adc	r12,0
	adc	r13,0
	adc	r14,0
	adc	r15,0
	adc	rax,0

	sub	rsi,QWORD[((16+8))+rsp]	; CF = saved carry
$L$sqrx8x_no_tail:
	adc	r8,QWORD[rdi]
DB	102,72,15,126,217	; encodes movq rcx,xmm3
	adc	r9,QWORD[8+rdi]
	mov	rsi,QWORD[56+rbp]
DB	102,72,15,126,213	; encodes movq rbp,xmm2
	adc	r10,QWORD[16+rdi]
	adc	r11,QWORD[24+rdi]
	adc	r12,QWORD[32+rdi]
	adc	r13,QWORD[40+rdi]
	adc	r14,QWORD[48+rdi]
	adc	r15,QWORD[56+rdi]
	adc	rax,0	; rax = carry into the next window

	mov	rbx,QWORD[((32+8))+rsp]	; reload the constant for the next window
	mov	rdx,QWORD[64+rcx*1+rdi]	; low word of the next window

	mov	QWORD[rdi],r8
	lea	r8,[64+rdi]	; r8 = position used for the end-of-buffer test
	mov	QWORD[8+rdi],r9
	mov	QWORD[16+rdi],r10
	mov	QWORD[24+rdi],r11
	mov	QWORD[32+rdi],r12
	mov	QWORD[40+rdi],r13
	mov	QWORD[48+rdi],r14
	mov	QWORD[56+rdi],r15

	lea	rdi,[64+rcx*1+rdi]	; rdi = start of the next window
	cmp	r8,QWORD[((8+8))+rsp]
	jb	NEAR $L$sqrx8x_reduction_loop	; more windows left
	ret
3433
3434
3435ALIGN	32
3436
; __bn_postx4x_internal -- branch-free masked subtraction, four qwords per pass.
;
; Each pass stores  [rdx] = [rdi] + (~[rbp] & mask) + carry  for four words.
; The first word taken from [rbp] is decremented before it is inverted, so
; (assuming that word is non-zero) the inverted vector is the two's-complement
; negation of the [rbp] number: the result is [rdi] - [rbp] when the mask is
; all-ones and [rdi] unchanged when the mask is zero.
;
; Register use, as read from the code below:
;   rbp   in : pointer to the words that are inverted and masked (advanced)
;   rdi   in : pointer to the words that are added (advanced)
;   rax   in : negated to form the mask -- presumably 0 or 1 on entry; the
;              caller is outside this view, confirm there
;   rcx   in : arithmetic-shifted right by 5 and counted UP to zero, so it must
;              be a negative multiple of 32 (looks like -size in bytes)
;   xmm1  in : low qword is copied into rdx (store pointer) and into rsi
;   r9   out : negated copy of the entry rcx
;   r10  out : copy of the entry rcx
; Clobbers rax, rcx, rdx, rsi, r8, r12-r15 and the flags.
3437__bn_postx4x_internal:
3438
3439	mov	r12,QWORD[rbp]	; first word of the [rbp] operand
3440	mov	r10,rcx	; keep the entry count
3441	mov	r9,rcx	; second copy, negated just before returning
3442	neg	rax	; 0 stays 0, 1 becomes all-ones: the AND mask
3443	sar	rcx,3+2	; one loop pass per 32 bytes (4 qwords)
3444
3445DB	102,72,15,126,202	; 66 48 0f 7e ca = movq rdx,xmm1
3446DB	102,72,15,126,206	; 66 48 0f 7e ce = movq rsi,xmm1
3447	dec	r12	; ~(w0-1) == -w0, supplies the "+1" of the negation
3448	mov	r13,QWORD[8+rbp]
3449	xor	r8,r8	; r8 = saved carry (0 or -1); none on the first pass
3450	mov	r14,QWORD[16+rbp]
3451	mov	r15,QWORD[24+rbp]
3452	jmp	NEAR $L$sqrx4x_sub_entry	; first four words are already loaded
3453
3454ALIGN	16
3455$L$sqrx4x_sub:
3456	mov	r12,QWORD[rbp]
3457	mov	r13,QWORD[8+rbp]
3458	mov	r14,QWORD[16+rbp]
3459	mov	r15,QWORD[24+rbp]
3460$L$sqrx4x_sub_entry:
3461	andn	r12,r12,rax	; r12 = ~r12 & mask
3462	lea	rbp,[32+rbp]
3463	andn	r13,r13,rax
3464	andn	r14,r14,rax
3465	andn	r15,r15,rax
3466
3467	neg	r8	; CF = (r8 != 0): reload the carry of the previous pass
3468	adc	r12,QWORD[rdi]
3469	adc	r13,QWORD[8+rdi]
3470	adc	r14,QWORD[16+rdi]
3471	adc	r15,QWORD[24+rdi]
3472	mov	QWORD[rdx],r12
3473	lea	rdi,[32+rdi]	; lea/mov leave CF untouched
3474	mov	QWORD[8+rdx],r13
3475	sbb	r8,r8	; park CF in r8 as 0 / -1 for the next pass
3476	mov	QWORD[16+rdx],r14
3477	mov	QWORD[24+rdx],r15
3478	lea	rdx,[32+rdx]
3479
3480	inc	rcx	; negative counter climbs towards zero
3481	jnz	NEAR $L$sqrx4x_sub
3482
3483	neg	r9	; hand back the entry count with its sign flipped
3484
3485	ret
3486
3487
; bn_scatter5 -- copy a vector of qwords into one column of a strided table.
;
; Register use as read from the code (these are the Win64 argument registers):
;   rcx = source pointer, consumed 8 bytes at a time
;   edx = number of qwords to copy; the routine returns at once when it is 0
;   r8  = table base
;   r9  = column index (scaled by 8 to form the byte offset)
; Source word i is written to r8 + r9*8 + i*256, i.e. consecutive words of one
; entry sit 256 bytes (32 qword slots) apart.
; Clobbers rax, rcx, rdx, r8 and the flags.
3488global	bn_scatter5
3489
3490ALIGN	16
3491bn_scatter5:
3492
3493_CET_ENDBR
3494	cmp	edx,0
3495	jz	NEAR $L$scatter_epilogue	; nothing to copy
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505	lea	r8,[r9*8+r8]	; r8 -> slot of the chosen column in row 0
3506$L$scatter:
3507	mov	rax,QWORD[rcx]
3508	lea	rcx,[8+rcx]	; next source word
3509	mov	QWORD[r8],rax
3510	lea	r8,[256+r8]	; same column, next 256-byte row
3511	sub	edx,1
3512	jnz	NEAR $L$scatter
3513$L$scatter_epilogue:
3514	ret
3515
3516
3517
; bn_gather5 -- read one column out of the 256-byte-row table without using the
; index as a memory address: every row is loaded in full and the wanted qword
; is selected with AND/OR masks.
;
; Register use as read from the code (these are the Win64 argument registers):
;   rcx = destination pointer, one qword written per row
;   edx = number of rows to process (the loop is do-while shaped: the count is
;         tested only after the first pass, so 0 is not handled)
;   r8  = table base; rows are read with movdqa, so it must be 16-byte aligned
;   r9d = column index; the 16 masks built below cover the values 0..31
; Stack: 0x108 bytes are reserved and rsp is rounded down to 16; the first 256
; bytes hold the 16 mask vectors. r10 keeps the entry rsp for the epilogue.
; Uses xmm0-xmm5 only.
;
; The first 11 bytes are emitted with DB because their exact encoding and
; length are described by the unwind codes at $L$SEH_info_bn_gather5.
3518global	bn_gather5
3519
3520ALIGN	32
3521bn_gather5:
3522
3523$L$SEH_begin_bn_gather5:
3524_CET_ENDBR
3525
3526	DB	0x4c,0x8d,0x14,0x24	; lea r10,[rsp]
3527
3528	DB	0x48,0x81,0xec,0x08,0x01,0x00,0x00	; sub rsp,0x108
3529	lea	rax,[$L$inc]
3530	and	rsp,-16	; movdqa stores below need 16-byte alignment
3531
3532	movd	xmm5,r9d
3533	movdqa	xmm0,XMMWORD[rax]	; {0,0,1,1}
3534	movdqa	xmm1,XMMWORD[16+rax]	; {2,2,2,2}
3535	lea	r11,[128+r8]	; bias the table pointer so disp8 offsets reach a whole row
3536	lea	rax,[128+rsp]	; same bias for the mask area
3537
; Mask construction: a counter vector {2k,2k,2k+1,2k+1} is compared with the
; broadcast index, then advanced by {2,2,2,2} (kept in xmm4). Both dwords of a
; qword lane are compared with the same value, so each qword of a mask is
; either all-ones or zero. xmm0-xmm3 rotate through the counter/mask roles.
3538	pshufd	xmm5,xmm5,0	; broadcast the index to all four dwords
3539	movdqa	xmm4,xmm1
3540	movdqa	xmm2,xmm1
3541	paddd	xmm1,xmm0
3542	pcmpeqd	xmm0,xmm5
3543	movdqa	xmm3,xmm4
3544
3545	paddd	xmm2,xmm1
3546	pcmpeqd	xmm1,xmm5
3547	movdqa	XMMWORD[(-128)+rax],xmm0	; mask for columns 0,1
3548	movdqa	xmm0,xmm4
3549
3550	paddd	xmm3,xmm2
3551	pcmpeqd	xmm2,xmm5
3552	movdqa	XMMWORD[(-112)+rax],xmm1
3553	movdqa	xmm1,xmm4
3554
3555	paddd	xmm0,xmm3
3556	pcmpeqd	xmm3,xmm5
3557	movdqa	XMMWORD[(-96)+rax],xmm2
3558	movdqa	xmm2,xmm4
3559	paddd	xmm1,xmm0
3560	pcmpeqd	xmm0,xmm5
3561	movdqa	XMMWORD[(-80)+rax],xmm3
3562	movdqa	xmm3,xmm4
3563
3564	paddd	xmm2,xmm1
3565	pcmpeqd	xmm1,xmm5
3566	movdqa	XMMWORD[(-64)+rax],xmm0
3567	movdqa	xmm0,xmm4
3568
3569	paddd	xmm3,xmm2
3570	pcmpeqd	xmm2,xmm5
3571	movdqa	XMMWORD[(-48)+rax],xmm1
3572	movdqa	xmm1,xmm4
3573
3574	paddd	xmm0,xmm3
3575	pcmpeqd	xmm3,xmm5
3576	movdqa	XMMWORD[(-32)+rax],xmm2
3577	movdqa	xmm2,xmm4
3578	paddd	xmm1,xmm0
3579	pcmpeqd	xmm0,xmm5
3580	movdqa	XMMWORD[(-16)+rax],xmm3
3581	movdqa	xmm3,xmm4
3582
3583	paddd	xmm2,xmm1
3584	pcmpeqd	xmm1,xmm5
3585	movdqa	XMMWORD[rax],xmm0
3586	movdqa	xmm0,xmm4
3587
3588	paddd	xmm3,xmm2
3589	pcmpeqd	xmm2,xmm5
3590	movdqa	XMMWORD[16+rax],xmm1
3591	movdqa	xmm1,xmm4
3592
3593	paddd	xmm0,xmm3
3594	pcmpeqd	xmm3,xmm5
3595	movdqa	XMMWORD[32+rax],xmm2
3596	movdqa	xmm2,xmm4
3597	paddd	xmm1,xmm0
3598	pcmpeqd	xmm0,xmm5
3599	movdqa	XMMWORD[48+rax],xmm3
3600	movdqa	xmm3,xmm4
3601
3602	paddd	xmm2,xmm1
3603	pcmpeqd	xmm1,xmm5
3604	movdqa	XMMWORD[64+rax],xmm0
3605	movdqa	xmm0,xmm4
3606
3607	paddd	xmm3,xmm2
3608	pcmpeqd	xmm2,xmm5
3609	movdqa	XMMWORD[80+rax],xmm1
3610	movdqa	xmm1,xmm4
3611
3612	paddd	xmm0,xmm3
3613	pcmpeqd	xmm3,xmm5
3614	movdqa	XMMWORD[96+rax],xmm2
3615	movdqa	xmm2,xmm4	; value is not used again before xmm2 is reloaded in the loop
3616	movdqa	XMMWORD[112+rax],xmm3	; mask for columns 30,31
3617	jmp	NEAR $L$gather
3618
3619ALIGN	32
; One pass per table row: AND all sixteen 16-byte chunks of the row with their
; masks and OR the results into two accumulators (xmm4, xmm5).
3620$L$gather:
3621	pxor	xmm4,xmm4
3622	pxor	xmm5,xmm5
3623	movdqa	xmm0,XMMWORD[((-128))+r11]
3624	movdqa	xmm1,XMMWORD[((-112))+r11]
3625	movdqa	xmm2,XMMWORD[((-96))+r11]
3626	pand	xmm0,XMMWORD[((-128))+rax]
3627	movdqa	xmm3,XMMWORD[((-80))+r11]
3628	pand	xmm1,XMMWORD[((-112))+rax]
3629	por	xmm4,xmm0
3630	pand	xmm2,XMMWORD[((-96))+rax]
3631	por	xmm5,xmm1
3632	pand	xmm3,XMMWORD[((-80))+rax]
3633	por	xmm4,xmm2
3634	por	xmm5,xmm3
3635	movdqa	xmm0,XMMWORD[((-64))+r11]
3636	movdqa	xmm1,XMMWORD[((-48))+r11]
3637	movdqa	xmm2,XMMWORD[((-32))+r11]
3638	pand	xmm0,XMMWORD[((-64))+rax]
3639	movdqa	xmm3,XMMWORD[((-16))+r11]
3640	pand	xmm1,XMMWORD[((-48))+rax]
3641	por	xmm4,xmm0
3642	pand	xmm2,XMMWORD[((-32))+rax]
3643	por	xmm5,xmm1
3644	pand	xmm3,XMMWORD[((-16))+rax]
3645	por	xmm4,xmm2
3646	por	xmm5,xmm3
3647	movdqa	xmm0,XMMWORD[r11]
3648	movdqa	xmm1,XMMWORD[16+r11]
3649	movdqa	xmm2,XMMWORD[32+r11]
3650	pand	xmm0,XMMWORD[rax]
3651	movdqa	xmm3,XMMWORD[48+r11]
3652	pand	xmm1,XMMWORD[16+rax]
3653	por	xmm4,xmm0
3654	pand	xmm2,XMMWORD[32+rax]
3655	por	xmm5,xmm1
3656	pand	xmm3,XMMWORD[48+rax]
3657	por	xmm4,xmm2
3658	por	xmm5,xmm3
3659	movdqa	xmm0,XMMWORD[64+r11]
3660	movdqa	xmm1,XMMWORD[80+r11]
3661	movdqa	xmm2,XMMWORD[96+r11]
3662	pand	xmm0,XMMWORD[64+rax]
3663	movdqa	xmm3,XMMWORD[112+r11]
3664	pand	xmm1,XMMWORD[80+rax]
3665	por	xmm4,xmm0
3666	pand	xmm2,XMMWORD[96+rax]
3667	por	xmm5,xmm1
3668	pand	xmm3,XMMWORD[112+rax]
3669	por	xmm4,xmm2
3670	por	xmm5,xmm3
3671	por	xmm4,xmm5	; merge the two accumulators
3672	lea	r11,[256+r11]	; next table row
3673
3674	pshufd	xmm0,xmm4,0x4e	; swap the two qwords
3675	por	xmm0,xmm4	; low qword = whichever half held the selected word
3676	movq	QWORD[rcx],xmm0
3677	lea	rcx,[8+rcx]
3678	sub	edx,1
3679	jnz	NEAR $L$gather
3680
3681	lea	rsp,[r10]	; restore the entry stack pointer saved in the prologue
3682
3683	ret
3684$L$SEH_end_bn_gather5:
3685
3686
; Read-only constants.
; $L$inc holds the seed {0,0,1,1} and the step {2,2,2,2} from which the
; gather/select masks are generated (loaded with movdqa at $L$inc and
; $L$inc+16).
; NOTE(review): the section is declared align=8 while the table is ALIGN 64 and
; is read with movdqa (needs 16-byte alignment); this relies on the assembler
; raising the section alignment for ALIGN -- confirm for the NASM version used.
3687section	.rdata rdata align=8
3688ALIGN	64
3689$L$inc:
3690	DD	0,0,1,1
3691	DD	2,2,2,2
; NUL-terminated ASCII: "Montgomery Multiplication with scatter/gather for
; x86_64, CRYPTOGAMS by <appro@openssl.org>"
3692	DB	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
3693	DB	112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115
3694	DB	99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111
3695	DB	114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79
3696	DB	71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111
3697	DB	112,101,110,115,115,108,46,111,114,103,62,0
3698section	.text
3699
3700EXTERN	__imp_RtlVirtualUnwind
3701
3702ALIGN	16
; mul_handler -- Win64 language-specific exception handler shared by the
; functions whose unwind info (section .xdata) names it.
;
; In (Win64 handler signature): r8 = CONTEXT*, r9 = DISPATCHER_CONTEXT*.
; The offsets used below follow the Windows x64 layouts:
;   CONTEXT: 120 Rax, 144 Rbx, 152 Rsp, 160 Rbp, 168 Rsi, 176 Rdi, 192 R9,
;            216..240 R12..R15, 248 Rip
;   DISPATCHER_CONTEXT: 0 ControlPc, 8 ImageBase, 16 FunctionEntry,
;            24 EstablisherFrame, 40 ContextRecord, 56 HandlerData
; HandlerData points at three RVAs (see .xdata); the code compares Rip against
; them in order and treats [0] and [1] as the limits of the register-saving
; prologue and [2] as the start of the epilogue.
;
; The handler reconstructs the caller-visible rsp (kept in rax), patches the
; non-volatile registers in the CONTEXT, copies the CONTEXT to
; disp->ContextRecord, calls RtlVirtualUnwind and returns 1
; (ExceptionContinueSearch).
3703mul_handler:
3704	push	rsi
3705	push	rdi
3706	push	rbx
3707	push	rbp
3708	push	r12
3709	push	r13
3710	push	r14
3711	push	r15
3712	pushfq
3713	sub	rsp,64	; 32 bytes shadow space + 4 stack arguments for the call below
3714
3715	mov	rax,QWORD[120+r8]	; context->Rax (functions copy rsp into rax on entry)
3716	mov	rbx,QWORD[248+r8]	; context->Rip
3717
3718	mov	rsi,QWORD[8+r9]	; disp->ImageBase
3719	mov	r11,QWORD[56+r9]	; disp->HandlerData
3720
3721	mov	r10d,DWORD[r11]	; HandlerData[0]
3722	lea	r10,[r10*1+rsi]
3723	cmp	rbx,r10
3724	jb	NEAR $L$common_seh_tail	; Rip below the first label: nothing pushed yet
3725
3726	mov	r10d,DWORD[4+r11]	; HandlerData[1]
3727	lea	r10,[r10*1+rsi]
3728	cmp	rbx,r10
3729	jb	NEAR $L$common_pop_regs	; registers pushed, rax still holds the entry rsp
3730
3731	mov	rax,QWORD[152+r8]	; context->Rsp
3732
3733	mov	r10d,DWORD[8+r11]	; HandlerData[2]
3734	lea	r10,[r10*1+rsi]
3735	cmp	rbx,r10
3736	jae	NEAR $L$common_seh_tail	; at or past the epilogue label
3737
3738	lea	r10,[$L$mul_epilogue]
3739	cmp	rbx,r10
3740	ja	NEAR $L$body_40	; Rip lies beyond $L$mul_epilogue
3741
3742	mov	r10,QWORD[192+r8]	; context->R9
3743	mov	rax,QWORD[8+r10*8+rax]	; saved stack pointer at 8+r9*8+rsp, as stored before $L$mul_body
3744
3745	jmp	NEAR $L$common_pop_regs
3746
3747$L$body_40:
3748	mov	rax,QWORD[40+rax]	; saved stack pointer kept at 40(rsp)
3749$L$common_pop_regs:
3750	mov	rbx,QWORD[((-8))+rax]	; the six pushed registers sit just below the entry rsp
3751	mov	rbp,QWORD[((-16))+rax]
3752	mov	r12,QWORD[((-24))+rax]
3753	mov	r13,QWORD[((-32))+rax]
3754	mov	r14,QWORD[((-40))+rax]
3755	mov	r15,QWORD[((-48))+rax]
3756	mov	QWORD[144+r8],rbx	; context->Rbx
3757	mov	QWORD[160+r8],rbp	; context->Rbp
3758	mov	QWORD[216+r8],r12	; context->R12
3759	mov	QWORD[224+r8],r13	; context->R13
3760	mov	QWORD[232+r8],r14	; context->R14
3761	mov	QWORD[240+r8],r15	; context->R15
3762
3763$L$common_seh_tail:
3764	mov	rdi,QWORD[8+rax]	; rdi/rsi were spilled to 8(rsp)/16(rsp) by the WIN64 prologue
3765	mov	rsi,QWORD[16+rax]
3766	mov	QWORD[152+r8],rax	; context->Rsp
3767	mov	QWORD[168+r8],rsi	; context->Rsi
3768	mov	QWORD[176+r8],rdi	; context->Rdi
3769
3770	mov	rdi,QWORD[40+r9]	; destination: disp->ContextRecord
3771	mov	rsi,r8	; source: the patched context
3772	mov	ecx,154	; 154 qwords = 1232 bytes
3773	DD	0xa548f3fc	; bytes fc f3 48 a5 = cld, rep movsq
3774
3775	mov	rsi,r9
3776	xor	rcx,rcx	; arg1 = 0
3777	mov	rdx,QWORD[8+rsi]	; arg2 = disp->ImageBase
3778	mov	r8,QWORD[rsi]	; arg3 = disp->ControlPc
3779	mov	r9,QWORD[16+rsi]	; arg4 = disp->FunctionEntry
3780	mov	r10,QWORD[40+rsi]	; disp->ContextRecord
3781	lea	r11,[56+rsi]	; &disp->HandlerData
3782	lea	r12,[24+rsi]	; &disp->EstablisherFrame
3783	mov	QWORD[32+rsp],r10	; arg5
3784	mov	QWORD[40+rsp],r11	; arg6
3785	mov	QWORD[48+rsp],r12	; arg7
3786	mov	QWORD[56+rsp],rcx	; arg8 = NULL
3787	call	QWORD[__imp_RtlVirtualUnwind]
3788
3789	mov	eax,1	; ExceptionContinueSearch
3790	add	rsp,64
3791	popfq
3792	pop	r15
3793	pop	r14
3794	pop	r13
3795	pop	r12
3796	pop	rbp
3797	pop	rbx
3798	pop	rdi
3799	pop	rsi
3800	ret
3801
3802
; Win64 function table (.pdata): one three-dword record per function, each
; field an image-relative address -- function begin, function end, and the
; matching unwind-info record in .xdata.
3803section	.pdata rdata align=4
3804ALIGN	4
3805	DD	$L$SEH_begin_bn_mul_mont_gather5 wrt ..imagebase
3806	DD	$L$SEH_end_bn_mul_mont_gather5 wrt ..imagebase
3807	DD	$L$SEH_info_bn_mul_mont_gather5 wrt ..imagebase
3808
3809	DD	$L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase
3810	DD	$L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase
3811	DD	$L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase
3812
3813	DD	$L$SEH_begin_bn_power5 wrt ..imagebase
3814	DD	$L$SEH_end_bn_power5 wrt ..imagebase
3815	DD	$L$SEH_info_bn_power5 wrt ..imagebase
3816	DD	$L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase
3817	DD	$L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase
3818	DD	$L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase
3819
3820	DD	$L$SEH_begin_bn_powerx5 wrt ..imagebase
3821	DD	$L$SEH_end_bn_powerx5 wrt ..imagebase
3822	DD	$L$SEH_info_bn_powerx5 wrt ..imagebase
3823	DD	$L$SEH_begin_bn_gather5 wrt ..imagebase
3824	DD	$L$SEH_end_bn_gather5 wrt ..imagebase
3825	DD	$L$SEH_info_bn_gather5 wrt ..imagebase
3826
; Win64 unwind information (.xdata).
; The first five records share one shape: header byte 9 (version 1 with the
; exception-handler flag set), zero prologue size and zero unwind codes,
; followed by the handler address and three image-relative labels that
; mul_handler reads as HandlerData[0..2].
3827section	.xdata rdata align=8
3828ALIGN	8
3829$L$SEH_info_bn_mul_mont_gather5:
3830	DB	9,0,0,0
3831	DD	mul_handler wrt ..imagebase
3832	DD	$L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase	; first two entries are the same label
3833ALIGN	8
3834$L$SEH_info_bn_mul4x_mont_gather5:
3835	DB	9,0,0,0
3836	DD	mul_handler wrt ..imagebase
3837	DD	$L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
3838ALIGN	8
3839$L$SEH_info_bn_power5:
3840	DB	9,0,0,0
3841	DD	mul_handler wrt ..imagebase
3842	DD	$L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase
3843ALIGN	8
3844$L$SEH_info_bn_mulx4x_mont_gather5:
3845	DB	9,0,0,0
3846	DD	mul_handler wrt ..imagebase
3847	DD	$L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase
3848ALIGN	8
; bn_gather5 has no handler; its prologue is described with unwind codes that
; must stay in step with the DB-encoded instructions at the top of bn_gather5.
3849$L$SEH_info_bn_powerx5:
3850	DB	9,0,0,0
3851	DD	mul_handler wrt ..imagebase
3852	DD	$L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase
3853ALIGN	8
3854$L$SEH_info_bn_gather5:
3855	DB	0x01,0x0b,0x03,0x0a	; version 1, prologue 11 bytes, 3 code slots, frame register r10
3856	DB	0x0b,0x01,0x21,0x00	; at offset 11: large allocation, 0x21*8 = 0x108 bytes (sub rsp,0x108)
3857	DB	0x04,0xa3,0x00,0x00	; at offset 4: set frame register (lea r10,[rsp])
3858ALIGN	8
3859%else
3860; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
3861ret
3862%endif
3863