xref: /aosp_15_r20/external/boringssl/src/gen/bcm/x86_64-mont5-win.asm (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
1; This file is generated from a similarly-named Perl script in the BoringSSL
2; source tree. Do not edit by hand.
3
4%ifidn __OUTPUT_FORMAT__, win64
5default	rel
6%define XMMWORD
7%define YMMWORD
8%define ZMMWORD
9%define _CET_ENDBR
10
11%ifdef BORINGSSL_PREFIX
12%include "boringssl_prefix_symbols_nasm.inc"
13%endif
14section	.text code align=64
15
16
; ---------------------------------------------------------------------
; bn_mul_mont_gather5_nohw -- Win64 entry point; returns rax = 1.
;
; In (Microsoft x64): rcx, rdx, r8, r9 = args 1-4, [40+rsp] / [48+rsp] =
; args 5-6, dword [56+rsp] = arg 7.  The prologue moves them into
; rdi, rsi, rdx, rcx, r8, r9 and xmm5 respectively.
; Roles as used below:
;   rdi = destination words (written in $L$sub / $L$copy)
;   rsi = source words multiplied by the gathered word
;   rdx = table base; sixteen 16-byte slots per 256-byte row are read
;   rcx = words multiplied by the per-iteration factor rbp
;   r8  = pointer, replaced by the qword it points to
;   r9  = word count (zero-extended from r9d)
;   xmm5 = arg 7, compared against the $L$inc counters to build masks
; NOTE(review): presumably (rp, ap, table, np, n0, num, power) of a
; Montgomery multiplication -- confirm against the C prototype.
; Callee-saved rbx, rbp, r12-r15, rdi, rsi are restored before return.
; ---------------------------------------------------------------------
17global	bn_mul_mont_gather5_nohw
18
19ALIGN	64
20bn_mul_mont_gather5_nohw:
; rdi/rsi are callee-saved on Win64: spill them to the caller-provided
; slots at [8+rsp] / [16+rsp]; the epilogue reloads them from there.
21	mov	QWORD[8+rsp],rdi	;WIN64 prologue
22	mov	QWORD[16+rsp],rsi
23	mov	rax,rsp
24$L$SEH_begin_bn_mul_mont_gather5_nohw:
25	mov	rdi,rcx
26	mov	rsi,rdx
27	mov	rdx,r8
28	mov	rcx,r9
29	mov	r8,QWORD[40+rsp]
30	mov	r9,QWORD[48+rsp]
31
32
33
34_CET_ENDBR
35
36
; Writing r9d clears the upper 32 bits of r9.  rax keeps the entry rsp;
; it is stored in the frame below and used by the epilogue.
37	mov	r9d,r9d
38	mov	rax,rsp
39
; Arg 7 is fetched before the pushes change rsp.
40	movd	xmm5,DWORD[56+rsp]
41	push	rbx
42
43	push	rbp
44
45	push	r12
46
47	push	r13
48
49	push	r14
50
51	push	r15
52
53
; r10 = (rsp - 280 - 8*r9) rounded down to a 1024-byte boundary: the new
; frame bottom.  r9 is negated only for the lea and restored right after.
54	neg	r9
55	mov	r11,rsp
56	lea	r10,[((-280))+r9*8+rsp]
57	neg	r9
58	and	r10,-1024
59
60
61
62
63
64
65
66
67
; Page walk: rsp is first set to r10 plus a multiple of 4096, then lowered
; 4096 bytes at a time until it reaches r10, reading one qword per step so
; each intermediate page is touched in descending order.
68	sub	r11,r10
69	and	r11,-4096
70	lea	rsp,[r11*1+r10]
71	mov	r11,QWORD[rsp]
72	cmp	rsp,r10
73	ja	NEAR $L$mul_page_walk
74	jmp	NEAR $L$mul_page_walk_done
75
76$L$mul_page_walk:
77	lea	rsp,[((-4096))+rsp]
78	mov	r11,QWORD[rsp]
79	cmp	rsp,r10
80	ja	NEAR $L$mul_page_walk
81$L$mul_page_walk_done:
82
; r10 -> $L$inc constants (defined outside this view).  The entry rsp is
; saved at [8+r9*8+rsp], just above the r9+1 scratch words at [rsp].
83	lea	r10,[$L$inc]
84	mov	QWORD[8+r9*8+rsp],rax
85
86$L$mul_body:
87
; r12 = table + 128, so one row spans [r12-128, r12+128).
; xmm0/xmm1 = the two 16-byte constants at $L$inc.
; r10 is re-pointed so that [112+r10] is a 16-byte aligned scratch area
; above the scratch words; sixteen masks go to [112+r10] .. [352+r10].
88	lea	r12,[128+rdx]
89	movdqa	xmm0,XMMWORD[r10]
90	movdqa	xmm1,XMMWORD[16+r10]
91	lea	r10,[((24-112))+r9*8+rsp]
92	and	r10,-16
93
; Broadcast arg 7 to all four dwords of xmm5, then produce sixteen compare
; masks: a running counter vector is advanced with paddd and each value is
; compared (pcmpeqd) with xmm5.  xmm4 holds the step constant.
; NOTE(review): the DB 0x67 bytes are address-size prefixes on instructions
; without memory operands; presumably padding for code alignment.
94	pshufd	xmm5,xmm5,0
95	movdqa	xmm4,xmm1
96	movdqa	xmm2,xmm1
97	paddd	xmm1,xmm0
98	pcmpeqd	xmm0,xmm5
99	DB	0x67
100	movdqa	xmm3,xmm4
101	paddd	xmm2,xmm1
102	pcmpeqd	xmm1,xmm5
103	movdqa	XMMWORD[112+r10],xmm0
104	movdqa	xmm0,xmm4
105
106	paddd	xmm3,xmm2
107	pcmpeqd	xmm2,xmm5
108	movdqa	XMMWORD[128+r10],xmm1
109	movdqa	xmm1,xmm4
110
111	paddd	xmm0,xmm3
112	pcmpeqd	xmm3,xmm5
113	movdqa	XMMWORD[144+r10],xmm2
114	movdqa	xmm2,xmm4
115
116	paddd	xmm1,xmm0
117	pcmpeqd	xmm0,xmm5
118	movdqa	XMMWORD[160+r10],xmm3
119	movdqa	xmm3,xmm4
120	paddd	xmm2,xmm1
121	pcmpeqd	xmm1,xmm5
122	movdqa	XMMWORD[176+r10],xmm0
123	movdqa	xmm0,xmm4
124
125	paddd	xmm3,xmm2
126	pcmpeqd	xmm2,xmm5
127	movdqa	XMMWORD[192+r10],xmm1
128	movdqa	xmm1,xmm4
129
130	paddd	xmm0,xmm3
131	pcmpeqd	xmm3,xmm5
132	movdqa	XMMWORD[208+r10],xmm2
133	movdqa	xmm2,xmm4
134
135	paddd	xmm1,xmm0
136	pcmpeqd	xmm0,xmm5
137	movdqa	XMMWORD[224+r10],xmm3
138	movdqa	xmm3,xmm4
139	paddd	xmm2,xmm1
140	pcmpeqd	xmm1,xmm5
141	movdqa	XMMWORD[240+r10],xmm0
142	movdqa	xmm0,xmm4
143
144	paddd	xmm3,xmm2
145	pcmpeqd	xmm2,xmm5
146	movdqa	XMMWORD[256+r10],xmm1
147	movdqa	xmm1,xmm4
148
149	paddd	xmm0,xmm3
150	pcmpeqd	xmm3,xmm5
151	movdqa	XMMWORD[272+r10],xmm2
152	movdqa	xmm2,xmm4
153
154	paddd	xmm1,xmm0
155	pcmpeqd	xmm0,xmm5
156	movdqa	XMMWORD[288+r10],xmm3
157	movdqa	xmm3,xmm4
158	paddd	xmm2,xmm1
159	pcmpeqd	xmm1,xmm5
160	movdqa	XMMWORD[304+r10],xmm0
161
162	paddd	xmm3,xmm2
163	DB	0x67
164	pcmpeqd	xmm2,xmm5
165	movdqa	XMMWORD[320+r10],xmm1
166
167	pcmpeqd	xmm3,xmm5
168	movdqa	XMMWORD[336+r10],xmm2
; Masked gather of the first row: every one of the sixteen 16-byte slots in
; [r12-128, r12+128) is loaded, ANDed with its mask and ORed into xmm0/xmm1,
; so the set of addresses read does not depend on arg 7.  The last four
; masks are still live in xmm0-xmm3 and are used directly.
169	pand	xmm0,XMMWORD[64+r12]
170
171	pand	xmm1,XMMWORD[80+r12]
172	pand	xmm2,XMMWORD[96+r12]
173	movdqa	XMMWORD[352+r10],xmm3
174	pand	xmm3,XMMWORD[112+r12]
175	por	xmm0,xmm2
176	por	xmm1,xmm3
177	movdqa	xmm4,XMMWORD[((-128))+r12]
178	movdqa	xmm5,XMMWORD[((-112))+r12]
179	movdqa	xmm2,XMMWORD[((-96))+r12]
180	pand	xmm4,XMMWORD[112+r10]
181	movdqa	xmm3,XMMWORD[((-80))+r12]
182	pand	xmm5,XMMWORD[128+r10]
183	por	xmm0,xmm4
184	pand	xmm2,XMMWORD[144+r10]
185	por	xmm1,xmm5
186	pand	xmm3,XMMWORD[160+r10]
187	por	xmm0,xmm2
188	por	xmm1,xmm3
189	movdqa	xmm4,XMMWORD[((-64))+r12]
190	movdqa	xmm5,XMMWORD[((-48))+r12]
191	movdqa	xmm2,XMMWORD[((-32))+r12]
192	pand	xmm4,XMMWORD[176+r10]
193	movdqa	xmm3,XMMWORD[((-16))+r12]
194	pand	xmm5,XMMWORD[192+r10]
195	por	xmm0,xmm4
196	pand	xmm2,XMMWORD[208+r10]
197	por	xmm1,xmm5
198	pand	xmm3,XMMWORD[224+r10]
199	por	xmm0,xmm2
200	por	xmm1,xmm3
201	movdqa	xmm4,XMMWORD[r12]
202	movdqa	xmm5,XMMWORD[16+r12]
203	movdqa	xmm2,XMMWORD[32+r12]
204	pand	xmm4,XMMWORD[240+r10]
205	movdqa	xmm3,XMMWORD[48+r12]
206	pand	xmm5,XMMWORD[256+r10]
207	por	xmm0,xmm4
208	pand	xmm2,XMMWORD[272+r10]
209	por	xmm1,xmm5
210	pand	xmm3,XMMWORD[288+r10]
211	por	xmm0,xmm2
212	por	xmm1,xmm3
213	por	xmm0,xmm1
214
; OR the high qword onto the low qword (pshufd 0x4e swaps the halves),
; advance r12 to the next 256-byte row, and move the gathered word to rbx:
; DB 102,72,15,126,195 is the encoding of `movq rbx,xmm0`.
215	pshufd	xmm1,xmm0,0x4e
216	por	xmm0,xmm1
217	lea	r12,[256+r12]
218DB	102,72,15,126,195
219
; First pass.  r8 = qword at [r8]; r14 = outer counter, r15 = inner word
; index.  rbp = r8 * low word of (rsi[0]*rbx); each step then accumulates
; rsi[j]*rbx and rcx[j]*rbp and stores the running sum one word lower
; ([-16+j*8+rsp]) in the scratch vector at [rsp].
220	mov	r8,QWORD[r8]
221	mov	rax,QWORD[rsi]
222
223	xor	r14,r14
224	xor	r15,r15
225
226	mov	rbp,r8
227	mul	rbx
228	mov	r10,rax
229	mov	rax,QWORD[rcx]
230
231	imul	rbp,r10
232	mov	r11,rdx
233
234	mul	rbp
235	add	r10,rax
236	mov	rax,QWORD[8+rsi]
237	adc	rdx,0
238	mov	r13,rdx
239
240	lea	r15,[1+r15]
241	jmp	NEAR $L$1st_enter
242
243ALIGN	16
244$L$1st:
245	add	r13,rax
246	mov	rax,QWORD[r15*8+rsi]
247	adc	rdx,0
248	add	r13,r11
249	mov	r11,r10
250	adc	rdx,0
251	mov	QWORD[((-16))+r15*8+rsp],r13
252	mov	r13,rdx
253
254$L$1st_enter:
255	mul	rbx
256	add	r11,rax
257	mov	rax,QWORD[r15*8+rcx]
258	adc	rdx,0
259	lea	r15,[1+r15]
260	mov	r10,rdx
261
262	mul	rbp
263	cmp	r15,r9
264	jne	NEAR $L$1st
265
266
; Loop tail: fold the last products and write the top two words plus the
; final carry at [-16+r9*8+rsp], [-8+r9*8+rsp] and [r9*8+rsp].
267	add	r13,rax
268	adc	rdx,0
269	add	r13,r11
270	adc	rdx,0
271	mov	QWORD[((-16))+r9*8+rsp],r13
272	mov	r13,rdx
273	mov	r11,r10
274
275	xor	rdx,rdx
276	add	r13,r11
277	adc	rdx,0
278	mov	QWORD[((-8))+r9*8+rsp],r13
279	mov	QWORD[r9*8+rsp],rdx
280
281	lea	r14,[1+r14]
282	jmp	NEAR $L$outer
283ALIGN	16
; Outer loop, one iteration per remaining row (r14 = 1 .. r9-1).
; rdx = the 16-byte aligned mask area + 128, so the sixteen masks stored
; above are addressed as [rdx-128] .. [rdx+112].  The same read-everything
; masked gather is repeated for the row at r12.
284$L$outer:
285	lea	rdx,[((24+128))+r9*8+rsp]
286	and	rdx,-16
287	pxor	xmm4,xmm4
288	pxor	xmm5,xmm5
289	movdqa	xmm0,XMMWORD[((-128))+r12]
290	movdqa	xmm1,XMMWORD[((-112))+r12]
291	movdqa	xmm2,XMMWORD[((-96))+r12]
292	movdqa	xmm3,XMMWORD[((-80))+r12]
293	pand	xmm0,XMMWORD[((-128))+rdx]
294	pand	xmm1,XMMWORD[((-112))+rdx]
295	por	xmm4,xmm0
296	pand	xmm2,XMMWORD[((-96))+rdx]
297	por	xmm5,xmm1
298	pand	xmm3,XMMWORD[((-80))+rdx]
299	por	xmm4,xmm2
300	por	xmm5,xmm3
301	movdqa	xmm0,XMMWORD[((-64))+r12]
302	movdqa	xmm1,XMMWORD[((-48))+r12]
303	movdqa	xmm2,XMMWORD[((-32))+r12]
304	movdqa	xmm3,XMMWORD[((-16))+r12]
305	pand	xmm0,XMMWORD[((-64))+rdx]
306	pand	xmm1,XMMWORD[((-48))+rdx]
307	por	xmm4,xmm0
308	pand	xmm2,XMMWORD[((-32))+rdx]
309	por	xmm5,xmm1
310	pand	xmm3,XMMWORD[((-16))+rdx]
311	por	xmm4,xmm2
312	por	xmm5,xmm3
313	movdqa	xmm0,XMMWORD[r12]
314	movdqa	xmm1,XMMWORD[16+r12]
315	movdqa	xmm2,XMMWORD[32+r12]
316	movdqa	xmm3,XMMWORD[48+r12]
317	pand	xmm0,XMMWORD[rdx]
318	pand	xmm1,XMMWORD[16+rdx]
319	por	xmm4,xmm0
320	pand	xmm2,XMMWORD[32+rdx]
321	por	xmm5,xmm1
322	pand	xmm3,XMMWORD[48+rdx]
323	por	xmm4,xmm2
324	por	xmm5,xmm3
325	movdqa	xmm0,XMMWORD[64+r12]
326	movdqa	xmm1,XMMWORD[80+r12]
327	movdqa	xmm2,XMMWORD[96+r12]
328	movdqa	xmm3,XMMWORD[112+r12]
329	pand	xmm0,XMMWORD[64+rdx]
330	pand	xmm1,XMMWORD[80+rdx]
331	por	xmm4,xmm0
332	pand	xmm2,XMMWORD[96+rdx]
333	por	xmm5,xmm1
334	pand	xmm3,XMMWORD[112+rdx]
335	por	xmm4,xmm2
336	por	xmm5,xmm3
337	por	xmm4,xmm5
338
339	pshufd	xmm0,xmm4,0x4e
340	por	xmm0,xmm4
341	lea	r12,[256+r12]
342
; rbx = gathered word for this row (`movq rbx,xmm0` again).
343	mov	rax,QWORD[rsi]
344DB	102,72,15,126,195
345
; Same multiply-accumulate as the first pass, except that the previous
; contents of the scratch vector (r10 = [j*8+rsp]) are added in as well.
346	xor	r15,r15
347	mov	rbp,r8
348	mov	r10,QWORD[rsp]
349
350	mul	rbx
351	add	r10,rax
352	mov	rax,QWORD[rcx]
353	adc	rdx,0
354
355	imul	rbp,r10
356	mov	r11,rdx
357
358	mul	rbp
359	add	r10,rax
360	mov	rax,QWORD[8+rsi]
361	adc	rdx,0
362	mov	r10,QWORD[8+rsp]
363	mov	r13,rdx
364
365	lea	r15,[1+r15]
366	jmp	NEAR $L$inner_enter
367
368ALIGN	16
369$L$inner:
370	add	r13,rax
371	mov	rax,QWORD[r15*8+rsi]
372	adc	rdx,0
373	add	r13,r10
374	mov	r10,QWORD[r15*8+rsp]
375	adc	rdx,0
376	mov	QWORD[((-16))+r15*8+rsp],r13
377	mov	r13,rdx
378
379$L$inner_enter:
380	mul	rbx
381	add	r11,rax
382	mov	rax,QWORD[r15*8+rcx]
383	adc	rdx,0
384	add	r10,r11
385	mov	r11,rdx
386	adc	r11,0
387	lea	r15,[1+r15]
388
389	mul	rbp
390	cmp	r15,r9
391	jne	NEAR $L$inner
392
393	add	r13,rax
394	adc	rdx,0
395	add	r13,r10
396	mov	r10,QWORD[r9*8+rsp]
397	adc	rdx,0
398	mov	QWORD[((-16))+r9*8+rsp],r13
399	mov	r13,rdx
400
401	xor	rdx,rdx
402	add	r13,r11
403	adc	rdx,0
404	add	r13,r10
405	adc	rdx,0
406	mov	QWORD[((-8))+r9*8+rsp],r13
407	mov	QWORD[r9*8+rsp],rdx
408
409	lea	r14,[1+r14]
410	cmp	r14,r9
411	jb	NEAR $L$outer
412
; Subtract rcx[] from the scratch vector word by word into rdi[].
; `xor r14,r14` clears CF and nothing up to the first sbb modifies flags;
; inside the loop lea/mov/dec leave CF untouched, so the borrow chains
; from one iteration to the next.
413	xor	r14,r14
414	mov	rax,QWORD[rsp]
415	lea	rsi,[rsp]
416	mov	r15,r9
417	jmp	NEAR $L$sub
418ALIGN	16
419$L$sub:	sbb	rax,QWORD[r14*8+rcx]
420	mov	QWORD[r14*8+rdi],rax
421	mov	rax,QWORD[8+r14*8+rsi]
422	lea	r14,[1+r14]
423	dec	r15
424	jnz	NEAR $L$sub
425
; rax = (top scratch word) - borrow, rbx = NOT rax.  These are used as a
; pair of complementary select masks below.
; NOTE(review): this relies on rax being 0 or -1 here -- confirm.
426	sbb	rax,0
427	mov	rbx,-1
428	xor	rbx,rax
429	xor	r14,r14
430	mov	r15,r9
431
; Branch-free select per word: rdi[i] = (rdi[i] AND rbx) OR (scratch[i] AND
; rax).  Each scratch word is overwritten with the loop index r14 after it
; has been read, so the intermediate value does not remain on the stack.
432$L$copy:
433	mov	rcx,QWORD[r14*8+rdi]
434	mov	rdx,QWORD[r14*8+rsp]
435	and	rcx,rbx
436	and	rdx,rax
437	mov	QWORD[r14*8+rsp],r14
438	or	rdx,rcx
439	mov	QWORD[r14*8+rdi],rdx
440	lea	r14,[1+r14]
441	sub	r15,1
442	jnz	NEAR $L$copy
443
; Epilogue: rsi = entry rsp saved by the prologue; return value is 1.
; The six pushed registers sit immediately below the entry rsp.
444	mov	rsi,QWORD[8+r9*8+rsp]
445
446	mov	rax,1
447
448	mov	r15,QWORD[((-48))+rsi]
449
450	mov	r14,QWORD[((-40))+rsi]
451
452	mov	r13,QWORD[((-32))+rsi]
453
454	mov	r12,QWORD[((-24))+rsi]
455
456	mov	rbp,QWORD[((-16))+rsi]
457
458	mov	rbx,QWORD[((-8))+rsi]
459
460	lea	rsp,[rsi]
461
462$L$mul_epilogue:
463	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
464	mov	rsi,QWORD[16+rsp]
465	ret
466
467$L$SEH_end_bn_mul_mont_gather5_nohw:
; ---------------------------------------------------------------------
; bn_mul4x_mont_gather5 -- Win64 entry point; returns rax = 1.
;
; Same argument shuffle as bn_mul_mont_gather5_nohw: rcx, rdx, r8, r9,
; [40+rsp], [48+rsp] are moved into rdi, rsi, rdx, rcx, r8, r9.  Arg 7 is
; not read here; mul4x_internal fetches it from [56+rax], where rax is the
; entry rsp.  This routine only builds the stack frame, calls
; mul4x_internal, and unwinds.
; ---------------------------------------------------------------------
468global	bn_mul4x_mont_gather5
469
470ALIGN	32
471bn_mul4x_mont_gather5:
; rdi/rsi are callee-saved on Win64; spill them for the epilogue.
472	mov	QWORD[8+rsp],rdi	;WIN64 prologue
473	mov	QWORD[16+rsp],rsi
474	mov	rax,rsp
475$L$SEH_begin_bn_mul4x_mont_gather5:
476	mov	rdi,rcx
477	mov	rsi,rdx
478	mov	rdx,r8
479	mov	rcx,r9
480	mov	r8,QWORD[40+rsp]
481	mov	r9,QWORD[48+rsp]
482
483
484
485_CET_ENDBR
; NOTE(review): the DB 0x67 bytes are address-size prefixes applied to
; register-only instructions; presumably padding.
; rax = entry rsp, kept for mul4x_internal and the epilogue.
486	DB	0x67
487	mov	rax,rsp
488
489	push	rbx
490
491	push	rbp
492
493	push	r12
494
495	push	r13
496
497	push	r14
498
499	push	r15
500
501$L$mul4x_prologue:
502
503	DB	0x67
504
505
506
; r9 = 8*r9d (zero-extended), r10 = 3*r9, then r9 is negated so the lea
; forms below subtract 2*r9 from rsp.
507	shl	r9d,3
508	lea	r10,[r9*2+r9]
509	neg	r9
510
511
512
513
514
515
516
517
518
519
; Choose the frame bottom rbp.  r11 = (rsp - 320 - 2*r9 - rdi) mod 4096 is
; the distance of the candidate frame from rdi within a page; depending on
; how it compares with r10 the frame is lowered by r11 or by a clamped
; amount.
; NOTE(review): presumably this keeps the frame from aliasing the buffer at
; rdi modulo 4096 -- confirm against the generator script.
520	lea	r11,[((-320))+r9*2+rsp]
521	mov	rbp,rsp
522	sub	r11,rdi
523	and	r11,4095
524	cmp	r10,r11
525	jb	NEAR $L$mul4xsp_alt
526	sub	rbp,r11
527	lea	rbp,[((-320))+r9*2+rbp]
528	jmp	NEAR $L$mul4xsp_done
529
530ALIGN	32
531$L$mul4xsp_alt:
; r11 = max(r11 - (4096 - 320 + 2*r9), 0): cmovc zeroes it when the
; subtraction borrows.
532	lea	r10,[((4096-320))+r9*2]
533	lea	rbp,[((-320))+r9*2+rbp]
534	sub	r11,r10
535	mov	r10,0
536	cmovc	r11,r10
537	sub	rbp,r11
538$L$mul4xsp_done:
; Align the frame to 64 bytes, then walk rsp down to it one 4096-byte page
; at a time, reading a qword per page so each page is touched in order.
539	and	rbp,-64
540	mov	r11,rsp
541	sub	r11,rbp
542	and	r11,-4096
543	lea	rsp,[rbp*1+r11]
544	mov	r10,QWORD[rsp]
545	cmp	rsp,rbp
546	ja	NEAR $L$mul4x_page_walk
547	jmp	NEAR $L$mul4x_page_walk_done
548
549$L$mul4x_page_walk:
550	lea	rsp,[((-4096))+rsp]
551	mov	r10,QWORD[rsp]
552	cmp	rsp,rbp
553	ja	NEAR $L$mul4x_page_walk
554$L$mul4x_page_walk_done:
555
; r9 is positive again (size in bytes).  The entry rsp is saved at
; [40+rsp] and reloaded from the same slot after the call.
556	neg	r9
557
558	mov	QWORD[40+rsp],rax
559
560$L$mul4x_body:
561
562	call	mul4x_internal
563
; Epilogue: rsi = entry rsp; restore the six pushed registers from just
; below it, reset rsp, then reload rdi/rsi from the prologue spill slots.
564	mov	rsi,QWORD[40+rsp]
565
566	mov	rax,1
567
568	mov	r15,QWORD[((-48))+rsi]
569
570	mov	r14,QWORD[((-40))+rsi]
571
572	mov	r13,QWORD[((-32))+rsi]
573
574	mov	r12,QWORD[((-24))+rsi]
575
576	mov	rbp,QWORD[((-16))+rsi]
577
578	mov	rbx,QWORD[((-8))+rsi]
579
580	lea	rsp,[rsi]
581
582$L$mul4x_epilogue:
583	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
584	mov	rsi,QWORD[16+rsp]
585	ret
586
587$L$SEH_end_bn_mul4x_mont_gather5:
588
589
; ---------------------------------------------------------------------
; mul4x_internal -- file-local helper (no `global`), reached by `call`
; from bn_mul4x_mont_gather5 and bn_power5_nohw.
;
; In:  rax = caller's entry rsp (arg 7 is read from dword [56+rax])
;      rdx = table base, r9 = size in bytes (positive)
;      rsi, rcx = multiplicand words and the words multiplied by rbp
;      r8  = pointer, replaced by the qword it points to
;      rdi = destination, parked at [(56+8)+rsp] and reloaded at the end
; Out: does not return here; it finishes with a jmp to $L$sqr4x_sub_entry,
;      which is defined outside this view.
; NOTE(review): the "+8" in the rsp-relative offsets presumably accounts
; for the return address pushed by `call` -- confirm.
; ---------------------------------------------------------------------
590ALIGN	32
591mul4x_internal:
592
; r13 = rdx + 128 + 32*r9: end-of-table marker, compared against r12 after
; every outer iteration.  r9 is shifted back afterwards.
; r10 is set so that the sixteen masks land at [112+r10] .. [352+r10];
; r12 = table + 128 so one row spans [r12-128, r12+128).
593	shl	r9,5
594	movd	xmm5,DWORD[56+rax]
595	lea	rax,[$L$inc]
596	lea	r13,[128+r9*1+rdx]
597	shr	r9,5
598	movdqa	xmm0,XMMWORD[rax]
599	movdqa	xmm1,XMMWORD[16+rax]
600	lea	r10,[((88-112))+r9*1+rsp]
601	lea	r12,[128+rdx]
602
; Broadcast arg 7 across xmm5 and build sixteen pcmpeqd masks from a running
; counter (constants come from $L$inc, defined outside this view).
; NOTE(review): the DB 0x67 bytes are address-size prefixes on
; register-only instructions; presumably padding.
603	pshufd	xmm5,xmm5,0
604	movdqa	xmm4,xmm1
605	DB	0x67,0x67
606	movdqa	xmm2,xmm1
607	paddd	xmm1,xmm0
608	pcmpeqd	xmm0,xmm5
609	DB	0x67
610	movdqa	xmm3,xmm4
611	paddd	xmm2,xmm1
612	pcmpeqd	xmm1,xmm5
613	movdqa	XMMWORD[112+r10],xmm0
614	movdqa	xmm0,xmm4
615
616	paddd	xmm3,xmm2
617	pcmpeqd	xmm2,xmm5
618	movdqa	XMMWORD[128+r10],xmm1
619	movdqa	xmm1,xmm4
620
621	paddd	xmm0,xmm3
622	pcmpeqd	xmm3,xmm5
623	movdqa	XMMWORD[144+r10],xmm2
624	movdqa	xmm2,xmm4
625
626	paddd	xmm1,xmm0
627	pcmpeqd	xmm0,xmm5
628	movdqa	XMMWORD[160+r10],xmm3
629	movdqa	xmm3,xmm4
630	paddd	xmm2,xmm1
631	pcmpeqd	xmm1,xmm5
632	movdqa	XMMWORD[176+r10],xmm0
633	movdqa	xmm0,xmm4
634
635	paddd	xmm3,xmm2
636	pcmpeqd	xmm2,xmm5
637	movdqa	XMMWORD[192+r10],xmm1
638	movdqa	xmm1,xmm4
639
640	paddd	xmm0,xmm3
641	pcmpeqd	xmm3,xmm5
642	movdqa	XMMWORD[208+r10],xmm2
643	movdqa	xmm2,xmm4
644
645	paddd	xmm1,xmm0
646	pcmpeqd	xmm0,xmm5
647	movdqa	XMMWORD[224+r10],xmm3
648	movdqa	xmm3,xmm4
649	paddd	xmm2,xmm1
650	pcmpeqd	xmm1,xmm5
651	movdqa	XMMWORD[240+r10],xmm0
652	movdqa	xmm0,xmm4
653
654	paddd	xmm3,xmm2
655	pcmpeqd	xmm2,xmm5
656	movdqa	XMMWORD[256+r10],xmm1
657	movdqa	xmm1,xmm4
658
659	paddd	xmm0,xmm3
660	pcmpeqd	xmm3,xmm5
661	movdqa	XMMWORD[272+r10],xmm2
662	movdqa	xmm2,xmm4
663
664	paddd	xmm1,xmm0
665	pcmpeqd	xmm0,xmm5
666	movdqa	XMMWORD[288+r10],xmm3
667	movdqa	xmm3,xmm4
668	paddd	xmm2,xmm1
669	pcmpeqd	xmm1,xmm5
670	movdqa	XMMWORD[304+r10],xmm0
671
672	paddd	xmm3,xmm2
673	DB	0x67
674	pcmpeqd	xmm2,xmm5
675	movdqa	XMMWORD[320+r10],xmm1
676
677	pcmpeqd	xmm3,xmm5
678	movdqa	XMMWORD[336+r10],xmm2
; Masked gather of the first row: all sixteen 16-byte slots are read, ANDed
; with their masks and ORed together, so the addresses touched do not
; depend on arg 7.
679	pand	xmm0,XMMWORD[64+r12]
680
681	pand	xmm1,XMMWORD[80+r12]
682	pand	xmm2,XMMWORD[96+r12]
683	movdqa	XMMWORD[352+r10],xmm3
684	pand	xmm3,XMMWORD[112+r12]
685	por	xmm0,xmm2
686	por	xmm1,xmm3
687	movdqa	xmm4,XMMWORD[((-128))+r12]
688	movdqa	xmm5,XMMWORD[((-112))+r12]
689	movdqa	xmm2,XMMWORD[((-96))+r12]
690	pand	xmm4,XMMWORD[112+r10]
691	movdqa	xmm3,XMMWORD[((-80))+r12]
692	pand	xmm5,XMMWORD[128+r10]
693	por	xmm0,xmm4
694	pand	xmm2,XMMWORD[144+r10]
695	por	xmm1,xmm5
696	pand	xmm3,XMMWORD[160+r10]
697	por	xmm0,xmm2
698	por	xmm1,xmm3
699	movdqa	xmm4,XMMWORD[((-64))+r12]
700	movdqa	xmm5,XMMWORD[((-48))+r12]
701	movdqa	xmm2,XMMWORD[((-32))+r12]
702	pand	xmm4,XMMWORD[176+r10]
703	movdqa	xmm3,XMMWORD[((-16))+r12]
704	pand	xmm5,XMMWORD[192+r10]
705	por	xmm0,xmm4
706	pand	xmm2,XMMWORD[208+r10]
707	por	xmm1,xmm5
708	pand	xmm3,XMMWORD[224+r10]
709	por	xmm0,xmm2
710	por	xmm1,xmm3
711	movdqa	xmm4,XMMWORD[r12]
712	movdqa	xmm5,XMMWORD[16+r12]
713	movdqa	xmm2,XMMWORD[32+r12]
714	pand	xmm4,XMMWORD[240+r10]
715	movdqa	xmm3,XMMWORD[48+r12]
716	pand	xmm5,XMMWORD[256+r10]
717	por	xmm0,xmm4
718	pand	xmm2,XMMWORD[272+r10]
719	por	xmm1,xmm5
720	pand	xmm3,XMMWORD[288+r10]
721	por	xmm0,xmm2
722	por	xmm1,xmm3
723	por	xmm0,xmm1
724
; Fold the two qword halves, advance r12 to the next row, and move the
; gathered word to rbx (DB 102,72,15,126,195 = `movq rbx,xmm0`).
725	pshufd	xmm1,xmm0,0x4e
726	por	xmm0,xmm1
727	lea	r12,[256+r12]
728DB	102,72,15,126,195
729
; Park the end-of-table marker and the destination pointer in the frame;
; rdi is reused as an accumulator below.
730	mov	QWORD[((16+8))+rsp],r13
731	mov	QWORD[((56+8))+rsp],rdi
732
; rsi is advanced by r9 and r9 negated, so [r9+rsi] addresses word 0 and
; r15 can count from 32+r9 up to zero.  r14 -> scratch vector at
; [(64+8)+rsp].  rbp = r8 * low word of (first word * rbx).
733	mov	r8,QWORD[r8]
734	mov	rax,QWORD[rsi]
735	lea	rsi,[r9*1+rsi]
736	neg	r9
737
738	mov	rbp,r8
739	mul	rbx
740	mov	r10,rax
741	mov	rax,QWORD[rcx]
742
743	imul	rbp,r10
744	lea	r14,[((64+8))+rsp]
745	mov	r11,rdx
746
747	mul	rbp
748	add	r10,rax
749	mov	rax,QWORD[8+r9*1+rsi]
750	adc	rdx,0
751	mov	rdi,rdx
752
753	mul	rbx
754	add	r11,rax
755	mov	rax,QWORD[8+rcx]
756	adc	rdx,0
757	mov	r10,rdx
758
759	mul	rbp
760	add	rdi,rax
761	mov	rax,QWORD[16+r9*1+rsi]
762	adc	rdx,0
763	add	rdi,r11
764	lea	r15,[32+r9]
765	lea	rcx,[32+rcx]
766	adc	rdx,0
767	mov	QWORD[r14],rdi
768	mov	r13,rdx
769	jmp	NEAR $L$1st4x
770
771ALIGN	32
; First pass, four words per iteration: r10/r11 carry the rsi[]*rbx chain,
; r13/rdi carry the rcx[]*rbp chain; results go to [r14-24] .. [r14].
; `add r15,32` both advances the index and sets ZF for the loop exit.
772$L$1st4x:
773	mul	rbx
774	add	r10,rax
775	mov	rax,QWORD[((-16))+rcx]
776	lea	r14,[32+r14]
777	adc	rdx,0
778	mov	r11,rdx
779
780	mul	rbp
781	add	r13,rax
782	mov	rax,QWORD[((-8))+r15*1+rsi]
783	adc	rdx,0
784	add	r13,r10
785	adc	rdx,0
786	mov	QWORD[((-24))+r14],r13
787	mov	rdi,rdx
788
789	mul	rbx
790	add	r11,rax
791	mov	rax,QWORD[((-8))+rcx]
792	adc	rdx,0
793	mov	r10,rdx
794
795	mul	rbp
796	add	rdi,rax
797	mov	rax,QWORD[r15*1+rsi]
798	adc	rdx,0
799	add	rdi,r11
800	adc	rdx,0
801	mov	QWORD[((-16))+r14],rdi
802	mov	r13,rdx
803
804	mul	rbx
805	add	r10,rax
806	mov	rax,QWORD[rcx]
807	adc	rdx,0
808	mov	r11,rdx
809
810	mul	rbp
811	add	r13,rax
812	mov	rax,QWORD[8+r15*1+rsi]
813	adc	rdx,0
814	add	r13,r10
815	adc	rdx,0
816	mov	QWORD[((-8))+r14],r13
817	mov	rdi,rdx
818
819	mul	rbx
820	add	r11,rax
821	mov	rax,QWORD[8+rcx]
822	adc	rdx,0
823	mov	r10,rdx
824
825	mul	rbp
826	add	rdi,rax
827	mov	rax,QWORD[16+r15*1+rsi]
828	adc	rdx,0
829	add	rdi,r11
830	lea	rcx,[32+rcx]
831	adc	rdx,0
832	mov	QWORD[r14],rdi
833	mov	r13,rdx
834
835	add	r15,32
836	jnz	NEAR $L$1st4x
837
; Tail of the first pass: the last two words, then rax is reloaded with
; word 0 ([r9+rsi]) for the next row, rcx is rewound by r9 (negative), and
; the final carry is left in rdi.
838	mul	rbx
839	add	r10,rax
840	mov	rax,QWORD[((-16))+rcx]
841	lea	r14,[32+r14]
842	adc	rdx,0
843	mov	r11,rdx
844
845	mul	rbp
846	add	r13,rax
847	mov	rax,QWORD[((-8))+rsi]
848	adc	rdx,0
849	add	r13,r10
850	adc	rdx,0
851	mov	QWORD[((-24))+r14],r13
852	mov	rdi,rdx
853
854	mul	rbx
855	add	r11,rax
856	mov	rax,QWORD[((-8))+rcx]
857	adc	rdx,0
858	mov	r10,rdx
859
860	mul	rbp
861	add	rdi,rax
862	mov	rax,QWORD[r9*1+rsi]
863	adc	rdx,0
864	add	rdi,r11
865	adc	rdx,0
866	mov	QWORD[((-16))+r14],rdi
867	mov	r13,rdx
868
869	lea	rcx,[r9*1+rcx]
870
871	xor	rdi,rdi
872	add	r13,r10
873	adc	rdi,0
874	mov	QWORD[((-8))+r14],r13
875
876	jmp	NEAR $L$outer4x
877
878ALIGN	32
; Outer loop, one iteration per remaining table row.  rdx = r14 + 16 + 128
; addresses the stored masks as [rdx-128] .. [rdx+112]; the same
; read-everything masked gather is applied to the row at r12.
879$L$outer4x:
880	lea	rdx,[((16+128))+r14]
881	pxor	xmm4,xmm4
882	pxor	xmm5,xmm5
883	movdqa	xmm0,XMMWORD[((-128))+r12]
884	movdqa	xmm1,XMMWORD[((-112))+r12]
885	movdqa	xmm2,XMMWORD[((-96))+r12]
886	movdqa	xmm3,XMMWORD[((-80))+r12]
887	pand	xmm0,XMMWORD[((-128))+rdx]
888	pand	xmm1,XMMWORD[((-112))+rdx]
889	por	xmm4,xmm0
890	pand	xmm2,XMMWORD[((-96))+rdx]
891	por	xmm5,xmm1
892	pand	xmm3,XMMWORD[((-80))+rdx]
893	por	xmm4,xmm2
894	por	xmm5,xmm3
895	movdqa	xmm0,XMMWORD[((-64))+r12]
896	movdqa	xmm1,XMMWORD[((-48))+r12]
897	movdqa	xmm2,XMMWORD[((-32))+r12]
898	movdqa	xmm3,XMMWORD[((-16))+r12]
899	pand	xmm0,XMMWORD[((-64))+rdx]
900	pand	xmm1,XMMWORD[((-48))+rdx]
901	por	xmm4,xmm0
902	pand	xmm2,XMMWORD[((-32))+rdx]
903	por	xmm5,xmm1
904	pand	xmm3,XMMWORD[((-16))+rdx]
905	por	xmm4,xmm2
906	por	xmm5,xmm3
907	movdqa	xmm0,XMMWORD[r12]
908	movdqa	xmm1,XMMWORD[16+r12]
909	movdqa	xmm2,XMMWORD[32+r12]
910	movdqa	xmm3,XMMWORD[48+r12]
911	pand	xmm0,XMMWORD[rdx]
912	pand	xmm1,XMMWORD[16+rdx]
913	por	xmm4,xmm0
914	pand	xmm2,XMMWORD[32+rdx]
915	por	xmm5,xmm1
916	pand	xmm3,XMMWORD[48+rdx]
917	por	xmm4,xmm2
918	por	xmm5,xmm3
919	movdqa	xmm0,XMMWORD[64+r12]
920	movdqa	xmm1,XMMWORD[80+r12]
921	movdqa	xmm2,XMMWORD[96+r12]
922	movdqa	xmm3,XMMWORD[112+r12]
923	pand	xmm0,XMMWORD[64+rdx]
924	pand	xmm1,XMMWORD[80+rdx]
925	por	xmm4,xmm0
926	pand	xmm2,XMMWORD[96+rdx]
927	por	xmm5,xmm1
928	pand	xmm3,XMMWORD[112+rdx]
929	por	xmm4,xmm2
930	por	xmm5,xmm3
931	por	xmm4,xmm5
932
933	pshufd	xmm0,xmm4,0x4e
934	por	xmm0,xmm4
935	lea	r12,[256+r12]
; rbx = gathered word for this row (`movq rbx,xmm0`).
936DB	102,72,15,126,195
937
; r10 = scratch word 0 ([r9+r14], r9 negative).  The carry from the
; previous row (rdi) is stored at [r14] before r14 is rewound to the start
; of the scratch vector.
938	mov	r10,QWORD[r9*1+r14]
939	mov	rbp,r8
940	mul	rbx
941	add	r10,rax
942	mov	rax,QWORD[rcx]
943	adc	rdx,0
944
945	imul	rbp,r10
946	mov	r11,rdx
947	mov	QWORD[r14],rdi
948
949	lea	r14,[r9*1+r14]
950
951	mul	rbp
952	add	r10,rax
953	mov	rax,QWORD[8+r9*1+rsi]
954	adc	rdx,0
955	mov	rdi,rdx
956
957	mul	rbx
958	add	r11,rax
959	mov	rax,QWORD[8+rcx]
960	adc	rdx,0
961	add	r11,QWORD[8+r14]
962	adc	rdx,0
963	mov	r10,rdx
964
965	mul	rbp
966	add	rdi,rax
967	mov	rax,QWORD[16+r9*1+rsi]
968	adc	rdx,0
969	add	rdi,r11
970	lea	r15,[32+r9]
971	lea	rcx,[32+rcx]
972	adc	rdx,0
973	mov	r13,rdx
974	jmp	NEAR $L$inner4x
975
976ALIGN	32
; Inner loop, four words per iteration.  Same two carry chains as the first
; pass, with the existing scratch words added in; each result is stored one
; word lower than the word that was read.
977$L$inner4x:
978	mul	rbx
979	add	r10,rax
980	mov	rax,QWORD[((-16))+rcx]
981	adc	rdx,0
982	add	r10,QWORD[16+r14]
983	lea	r14,[32+r14]
984	adc	rdx,0
985	mov	r11,rdx
986
987	mul	rbp
988	add	r13,rax
989	mov	rax,QWORD[((-8))+r15*1+rsi]
990	adc	rdx,0
991	add	r13,r10
992	adc	rdx,0
993	mov	QWORD[((-32))+r14],rdi
994	mov	rdi,rdx
995
996	mul	rbx
997	add	r11,rax
998	mov	rax,QWORD[((-8))+rcx]
999	adc	rdx,0
1000	add	r11,QWORD[((-8))+r14]
1001	adc	rdx,0
1002	mov	r10,rdx
1003
1004	mul	rbp
1005	add	rdi,rax
1006	mov	rax,QWORD[r15*1+rsi]
1007	adc	rdx,0
1008	add	rdi,r11
1009	adc	rdx,0
1010	mov	QWORD[((-24))+r14],r13
1011	mov	r13,rdx
1012
1013	mul	rbx
1014	add	r10,rax
1015	mov	rax,QWORD[rcx]
1016	adc	rdx,0
1017	add	r10,QWORD[r14]
1018	adc	rdx,0
1019	mov	r11,rdx
1020
1021	mul	rbp
1022	add	r13,rax
1023	mov	rax,QWORD[8+r15*1+rsi]
1024	adc	rdx,0
1025	add	r13,r10
1026	adc	rdx,0
1027	mov	QWORD[((-16))+r14],rdi
1028	mov	rdi,rdx
1029
1030	mul	rbx
1031	add	r11,rax
1032	mov	rax,QWORD[8+rcx]
1033	adc	rdx,0
1034	add	r11,QWORD[8+r14]
1035	adc	rdx,0
1036	mov	r10,rdx
1037
1038	mul	rbp
1039	add	rdi,rax
1040	mov	rax,QWORD[16+r15*1+rsi]
1041	adc	rdx,0
1042	add	rdi,r11
1043	lea	rcx,[32+rcx]
1044	adc	rdx,0
1045	mov	QWORD[((-8))+r14],r13
1046	mov	r13,rdx
1047
1048	add	r15,32
1049	jnz	NEAR $L$inner4x
1050
; Tail of the inner loop.  Note the swap below: the multiplier moves from
; rbp into rax and rbp is reloaded with the word at [rcx-8], so `mul rbp`
; computes the same product while leaving that word in rbp for the compare
; after the outer loop.
1051	mul	rbx
1052	add	r10,rax
1053	mov	rax,QWORD[((-16))+rcx]
1054	adc	rdx,0
1055	add	r10,QWORD[16+r14]
1056	lea	r14,[32+r14]
1057	adc	rdx,0
1058	mov	r11,rdx
1059
1060	mul	rbp
1061	add	r13,rax
1062	mov	rax,QWORD[((-8))+rsi]
1063	adc	rdx,0
1064	add	r13,r10
1065	adc	rdx,0
1066	mov	QWORD[((-32))+r14],rdi
1067	mov	rdi,rdx
1068
1069	mul	rbx
1070	add	r11,rax
1071	mov	rax,rbp
1072	mov	rbp,QWORD[((-8))+rcx]
1073	adc	rdx,0
1074	add	r11,QWORD[((-8))+r14]
1075	adc	rdx,0
1076	mov	r10,rdx
1077
1078	mul	rbp
1079	add	rdi,rax
1080	mov	rax,QWORD[r9*1+rsi]
1081	adc	rdx,0
1082	add	rdi,r11
1083	adc	rdx,0
1084	mov	QWORD[((-24))+r14],r13
1085	mov	r13,rdx
1086
1087	mov	QWORD[((-16))+r14],rdi
1088	lea	rcx,[r9*1+rcx]
1089
1090	xor	rdi,rdi
1091	add	r13,r10
1092	adc	rdi,0
1093	add	r13,QWORD[r14]
1094	adc	rdi,0
1095	mov	QWORD[((-8))+r14],r13
1096
; Repeat until r12 reaches the end-of-table marker saved on entry.
1097	cmp	r12,QWORD[((16+8))+rsp]
1098	jb	NEAR $L$outer4x
; Build rax = -(rdi OR CF), where CF is the borrow from rbp - r13 (rbp
; holds the word loaded from [rcx-8] above, r13 the top result word).
; r15 is zero at loop exit, so `adc r15,r15` leaves just that borrow in it.
; Then load the registers used at $L$sqr4x_sub_entry: rbx -> start of the
; scratch vector, rbp -> rcx words, rcx = r9 >> 5 (arithmetic), rdi =
; saved destination, r12-r15 = first four words at rbp (r12 decremented).
; NOTE(review): the meaning of these registers at $L$sqr4x_sub_entry cannot
; be verified from this view.
1099	xor	rax,rax
1100	sub	rbp,r13
1101	adc	r15,r15
1102	or	rdi,r15
1103	sub	rax,rdi
1104	lea	rbx,[r9*1+r14]
1105	mov	r12,QWORD[rcx]
1106	lea	rbp,[rcx]
1107	mov	rcx,r9
1108	sar	rcx,3+2
1109	mov	rdi,QWORD[((56+8))+rsp]
1110	dec	r12
1111	xor	r10,r10
1112	mov	r13,QWORD[8+rbp]
1113	mov	r14,QWORD[16+rbp]
1114	mov	r15,QWORD[24+rbp]
1115	jmp	NEAR $L$sqr4x_sub_entry
1116
1117
; ---------------------------------------------------------------------
; bn_power5_nohw -- Win64 entry point; returns rax = 1.
;
; Same argument shuffle as the other entry points: rcx, rdx, r8, r9,
; [40+rsp], [48+rsp] are moved into rdi, rsi, rdx, rcx, r8, r9.  Arg 7 is
; read later by mul4x_internal from [56+rax].
; Body: five back-to-back calls of __bn_sqr8x_internal followed by
; __bn_post4x_internal, then one call of mul4x_internal.
; NOTE(review): presumably five Montgomery squarings followed by one
; multiplication by a gathered table entry -- confirm.
; ---------------------------------------------------------------------
1118global	bn_power5_nohw
1119
1120ALIGN	32
1121bn_power5_nohw:
; rdi/rsi are callee-saved on Win64; spill them for the epilogue.
1122	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1123	mov	QWORD[16+rsp],rsi
1124	mov	rax,rsp
1125$L$SEH_begin_bn_power5_nohw:
1126	mov	rdi,rcx
1127	mov	rsi,rdx
1128	mov	rdx,r8
1129	mov	rcx,r9
1130	mov	r8,QWORD[40+rsp]
1131	mov	r9,QWORD[48+rsp]
1132
1133
1134
1135_CET_ENDBR
; rax = entry rsp, kept for the epilogue and for mul4x_internal.
1136	mov	rax,rsp
1137
1138	push	rbx
1139
1140	push	rbp
1141
1142	push	r12
1143
1144	push	r13
1145
1146	push	r14
1147
1148	push	r15
1149
1150$L$power5_prologue:
1151
1152
1153
1154
; r9 = 8*r9d (zero-extended), r10d = 3*r9, r9 negated for the lea forms
; below; r8 is replaced by the qword it points to.
1155	shl	r9d,3
1156	lea	r10d,[r9*2+r9]
1157	neg	r9
1158	mov	r8,QWORD[r8]
1159
1160
1161
1162
1163
1164
1165
1166
; Frame placement, identical in shape to bn_mul4x_mont_gather5: r11 is the
; candidate frame's distance from rdi modulo 4096 and decides which of the
; two adjustments is applied to rbp.
; NOTE(review): presumably avoids aliasing with the buffer at rdi modulo
; 4096 -- confirm against the generator script.
1167	lea	r11,[((-320))+r9*2+rsp]
1168	mov	rbp,rsp
1169	sub	r11,rdi
1170	and	r11,4095
1171	cmp	r10,r11
1172	jb	NEAR $L$pwr_sp_alt
1173	sub	rbp,r11
1174	lea	rbp,[((-320))+r9*2+rbp]
1175	jmp	NEAR $L$pwr_sp_done
1176
1177ALIGN	32
1178$L$pwr_sp_alt:
; r11 = max(r11 - (4096 - 320 + 2*r9), 0) via cmovc on the borrow.
1179	lea	r10,[((4096-320))+r9*2]
1180	lea	rbp,[((-320))+r9*2+rbp]
1181	sub	r11,r10
1182	mov	r10,0
1183	cmovc	r11,r10
1184	sub	rbp,r11
1185$L$pwr_sp_done:
; 64-byte align the frame, then lower rsp to it one 4096-byte page at a
; time, reading a qword per page so each page is touched in order.
1186	and	rbp,-64
1187	mov	r11,rsp
1188	sub	r11,rbp
1189	and	r11,-4096
1190	lea	rsp,[rbp*1+r11]
1191	mov	r10,QWORD[rsp]
1192	cmp	rsp,rbp
1193	ja	NEAR $L$pwr_page_walk
1194	jmp	NEAR $L$pwr_page_walk_done
1195
1196$L$pwr_page_walk:
1197	lea	rsp,[((-4096))+rsp]
1198	mov	r10,QWORD[rsp]
1199	cmp	rsp,rbp
1200	ja	NEAR $L$pwr_page_walk
1201$L$pwr_page_walk_done:
1202
; r10 = negative byte size, r9 = positive byte size.
1203	mov	r10,r9
1204	neg	r9
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
; Frame slots: [32+rsp] = value loaded from [r8], [40+rsp] = entry rsp.
1215	mov	QWORD[32+rsp],r8
1216	mov	QWORD[40+rsp],rax
1217
1218$L$power5_body:
; Raw `movq xmm,r64` encodings that stash live registers in xmm1-xmm4:
;   102,72,15,110,207 = movq xmm1,rdi
;   102,72,15,110,209 = movq xmm2,rcx
;   102,73,15,110,218 = movq xmm3,r10
;   102,72,15,110,226 = movq xmm4,rdx
1219DB	102,72,15,110,207
1220DB	102,72,15,110,209
1221DB	102,73,15,110,218
1222DB	102,72,15,110,226
1223
; Five identical sqr8x/post4x rounds.  __bn_post4x_internal is defined
; outside this view.
1224	call	__bn_sqr8x_internal
1225	call	__bn_post4x_internal
1226	call	__bn_sqr8x_internal
1227	call	__bn_post4x_internal
1228	call	__bn_sqr8x_internal
1229	call	__bn_post4x_internal
1230	call	__bn_sqr8x_internal
1231	call	__bn_post4x_internal
1232	call	__bn_sqr8x_internal
1233	call	__bn_post4x_internal
1234
; Restore rcx and rdx from xmm2 / xmm4 (`movq rcx,xmm2`, `movq rdx,xmm4`),
; then set up mul4x_internal: rdi = rsi, rax = saved entry rsp, and r8
; points at the frame slot [32+rsp] because mul4x_internal dereferences r8.
1235DB	102,72,15,126,209
1236DB	102,72,15,126,226
1237	mov	rdi,rsi
1238	mov	rax,QWORD[40+rsp]
1239	lea	r8,[32+rsp]
1240
1241	call	mul4x_internal
1242
; Epilogue: rsi = entry rsp; restore the six pushed registers from just
; below it, reset rsp, then reload rdi/rsi from the prologue spill slots.
1243	mov	rsi,QWORD[40+rsp]
1244
1245	mov	rax,1
1246	mov	r15,QWORD[((-48))+rsi]
1247
1248	mov	r14,QWORD[((-40))+rsi]
1249
1250	mov	r13,QWORD[((-32))+rsi]
1251
1252	mov	r12,QWORD[((-24))+rsi]
1253
1254	mov	rbp,QWORD[((-16))+rsi]
1255
1256	mov	rbx,QWORD[((-8))+rsi]
1257
1258	lea	rsp,[rsi]
1259
1260$L$power5_epilogue:
1261	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1262	mov	rsi,QWORD[16+rsp]
1263	ret
1264
1265$L$SEH_end_bn_power5_nohw:
1266
1267global	bn_sqr8x_internal
1268
1269
1270ALIGN	32
1271bn_sqr8x_internal:
1272__bn_sqr8x_internal:
1273
1274_CET_ENDBR
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348	lea	rbp,[32+r10]
1349	lea	rsi,[r9*1+rsi]
1350
1351	mov	rcx,r9
1352
1353
1354	mov	r14,QWORD[((-32))+rbp*1+rsi]
1355	lea	rdi,[((48+8))+r9*2+rsp]
1356	mov	rax,QWORD[((-24))+rbp*1+rsi]
1357	lea	rdi,[((-32))+rbp*1+rdi]
1358	mov	rbx,QWORD[((-16))+rbp*1+rsi]
1359	mov	r15,rax
1360
1361	mul	r14
1362	mov	r10,rax
1363	mov	rax,rbx
1364	mov	r11,rdx
1365	mov	QWORD[((-24))+rbp*1+rdi],r10
1366
1367	mul	r14
1368	add	r11,rax
1369	mov	rax,rbx
1370	adc	rdx,0
1371	mov	QWORD[((-16))+rbp*1+rdi],r11
1372	mov	r10,rdx
1373
1374
1375	mov	rbx,QWORD[((-8))+rbp*1+rsi]
1376	mul	r15
1377	mov	r12,rax
1378	mov	rax,rbx
1379	mov	r13,rdx
1380
1381	lea	rcx,[rbp]
1382	mul	r14
1383	add	r10,rax
1384	mov	rax,rbx
1385	mov	r11,rdx
1386	adc	r11,0
1387	add	r10,r12
1388	adc	r11,0
1389	mov	QWORD[((-8))+rcx*1+rdi],r10
1390	jmp	NEAR $L$sqr4x_1st
1391
1392ALIGN	32
1393$L$sqr4x_1st:
1394	mov	rbx,QWORD[rcx*1+rsi]
1395	mul	r15
1396	add	r13,rax
1397	mov	rax,rbx
1398	mov	r12,rdx
1399	adc	r12,0
1400
1401	mul	r14
1402	add	r11,rax
1403	mov	rax,rbx
1404	mov	rbx,QWORD[8+rcx*1+rsi]
1405	mov	r10,rdx
1406	adc	r10,0
1407	add	r11,r13
1408	adc	r10,0
1409
1410
1411	mul	r15
1412	add	r12,rax
1413	mov	rax,rbx
1414	mov	QWORD[rcx*1+rdi],r11
1415	mov	r13,rdx
1416	adc	r13,0
1417
1418	mul	r14
1419	add	r10,rax
1420	mov	rax,rbx
1421	mov	rbx,QWORD[16+rcx*1+rsi]
1422	mov	r11,rdx
1423	adc	r11,0
1424	add	r10,r12
1425	adc	r11,0
1426
1427	mul	r15
1428	add	r13,rax
1429	mov	rax,rbx
1430	mov	QWORD[8+rcx*1+rdi],r10
1431	mov	r12,rdx
1432	adc	r12,0
1433
1434	mul	r14
1435	add	r11,rax
1436	mov	rax,rbx
1437	mov	rbx,QWORD[24+rcx*1+rsi]
1438	mov	r10,rdx
1439	adc	r10,0
1440	add	r11,r13
1441	adc	r10,0
1442
1443
1444	mul	r15
1445	add	r12,rax
1446	mov	rax,rbx
1447	mov	QWORD[16+rcx*1+rdi],r11
1448	mov	r13,rdx
1449	adc	r13,0
1450	lea	rcx,[32+rcx]
1451
1452	mul	r14
1453	add	r10,rax
1454	mov	rax,rbx
1455	mov	r11,rdx
1456	adc	r11,0
1457	add	r10,r12
1458	adc	r11,0
1459	mov	QWORD[((-8))+rcx*1+rdi],r10
1460
1461	cmp	rcx,0
1462	jne	NEAR $L$sqr4x_1st
1463
1464	mul	r15
1465	add	r13,rax
1466	lea	rbp,[16+rbp]
1467	adc	rdx,0
1468	add	r13,r11
1469	adc	rdx,0
1470
1471	mov	QWORD[rdi],r13
1472	mov	r12,rdx
1473	mov	QWORD[8+rdi],rdx
1474	jmp	NEAR $L$sqr4x_outer
1475
1476ALIGN	32
1477$L$sqr4x_outer:
1478	mov	r14,QWORD[((-32))+rbp*1+rsi]
1479	lea	rdi,[((48+8))+r9*2+rsp]
1480	mov	rax,QWORD[((-24))+rbp*1+rsi]
1481	lea	rdi,[((-32))+rbp*1+rdi]
1482	mov	rbx,QWORD[((-16))+rbp*1+rsi]
1483	mov	r15,rax
1484
1485	mul	r14
1486	mov	r10,QWORD[((-24))+rbp*1+rdi]
1487	add	r10,rax
1488	mov	rax,rbx
1489	adc	rdx,0
1490	mov	QWORD[((-24))+rbp*1+rdi],r10
1491	mov	r11,rdx
1492
1493	mul	r14
1494	add	r11,rax
1495	mov	rax,rbx
1496	adc	rdx,0
1497	add	r11,QWORD[((-16))+rbp*1+rdi]
1498	mov	r10,rdx
1499	adc	r10,0
1500	mov	QWORD[((-16))+rbp*1+rdi],r11
1501
1502	xor	r12,r12
1503
1504	mov	rbx,QWORD[((-8))+rbp*1+rsi]
1505	mul	r15
1506	add	r12,rax
1507	mov	rax,rbx
1508	adc	rdx,0
1509	add	r12,QWORD[((-8))+rbp*1+rdi]
1510	mov	r13,rdx
1511	adc	r13,0
1512
1513	mul	r14
1514	add	r10,rax
1515	mov	rax,rbx
1516	adc	rdx,0
1517	add	r10,r12
1518	mov	r11,rdx
1519	adc	r11,0
1520	mov	QWORD[((-8))+rbp*1+rdi],r10
1521
1522	lea	rcx,[rbp]
1523	jmp	NEAR $L$sqr4x_inner
1524
1525ALIGN	32
1526$L$sqr4x_inner:
1527	mov	rbx,QWORD[rcx*1+rsi]
1528	mul	r15
1529	add	r13,rax
1530	mov	rax,rbx
1531	mov	r12,rdx
1532	adc	r12,0
1533	add	r13,QWORD[rcx*1+rdi]
1534	adc	r12,0
1535
1536	DB	0x67
1537	mul	r14
1538	add	r11,rax
1539	mov	rax,rbx
1540	mov	rbx,QWORD[8+rcx*1+rsi]
1541	mov	r10,rdx
1542	adc	r10,0
1543	add	r11,r13
1544	adc	r10,0
1545
1546	mul	r15
1547	add	r12,rax
1548	mov	QWORD[rcx*1+rdi],r11
1549	mov	rax,rbx
1550	mov	r13,rdx
1551	adc	r13,0
1552	add	r12,QWORD[8+rcx*1+rdi]
1553	lea	rcx,[16+rcx]
1554	adc	r13,0
1555
1556	mul	r14
1557	add	r10,rax
1558	mov	rax,rbx
1559	adc	rdx,0
1560	add	r10,r12
1561	mov	r11,rdx
1562	adc	r11,0
1563	mov	QWORD[((-8))+rcx*1+rdi],r10
1564
1565	cmp	rcx,0
1566	jne	NEAR $L$sqr4x_inner
1567
1568	DB	0x67
1569	mul	r15
1570	add	r13,rax
1571	adc	rdx,0
1572	add	r13,r11
1573	adc	rdx,0
1574
1575	mov	QWORD[rdi],r13
1576	mov	r12,rdx
1577	mov	QWORD[8+rdi],rdx
1578
1579	add	rbp,16
1580	jnz	NEAR $L$sqr4x_outer
1581
1582
1583	mov	r14,QWORD[((-32))+rsi]
1584	lea	rdi,[((48+8))+r9*2+rsp]
1585	mov	rax,QWORD[((-24))+rsi]
1586	lea	rdi,[((-32))+rbp*1+rdi]
1587	mov	rbx,QWORD[((-16))+rsi]
1588	mov	r15,rax
1589
1590	mul	r14
1591	add	r10,rax
1592	mov	rax,rbx
1593	mov	r11,rdx
1594	adc	r11,0
1595
1596	mul	r14
1597	add	r11,rax
1598	mov	rax,rbx
1599	mov	QWORD[((-24))+rdi],r10
1600	mov	r10,rdx
1601	adc	r10,0
1602	add	r11,r13
1603	mov	rbx,QWORD[((-8))+rsi]
1604	adc	r10,0
1605
1606	mul	r15
1607	add	r12,rax
1608	mov	rax,rbx
1609	mov	QWORD[((-16))+rdi],r11
1610	mov	r13,rdx
1611	adc	r13,0
1612
1613	mul	r14
1614	add	r10,rax
1615	mov	rax,rbx
1616	mov	r11,rdx
1617	adc	r11,0
1618	add	r10,r12
1619	adc	r11,0
1620	mov	QWORD[((-8))+rdi],r10
1621
1622	mul	r15
1623	add	r13,rax
1624	mov	rax,QWORD[((-16))+rsi]
1625	adc	rdx,0
1626	add	r13,r11
1627	adc	rdx,0
1628
1629	mov	QWORD[rdi],r13
1630	mov	r12,rdx
1631	mov	QWORD[8+rdi],rdx
1632
1633	mul	rbx
1634	add	rbp,16
1635	xor	r14,r14
1636	sub	rbp,r9
1637	xor	r15,r15
1638
1639	add	rax,r12
1640	adc	rdx,0
1641	mov	QWORD[8+rdi],rax
1642	mov	QWORD[16+rdi],rdx
1643	mov	QWORD[24+rdi],r15
1644
1645	mov	rax,QWORD[((-16))+rbp*1+rsi]
1646	lea	rdi,[((48+8))+rsp]
1647	xor	r10,r10
1648	mov	r11,QWORD[8+rdi]
1649
1650	lea	r12,[r10*2+r14]
1651	shr	r10,63
1652	lea	r13,[r11*2+rcx]
1653	shr	r11,63
1654	or	r13,r10
1655	mov	r10,QWORD[16+rdi]
1656	mov	r14,r11
1657	mul	rax
1658	neg	r15
1659	mov	r11,QWORD[24+rdi]
1660	adc	r12,rax
1661	mov	rax,QWORD[((-8))+rbp*1+rsi]
1662	mov	QWORD[rdi],r12
1663	adc	r13,rdx
1664
1665	lea	rbx,[r10*2+r14]
1666	mov	QWORD[8+rdi],r13
1667	sbb	r15,r15
1668	shr	r10,63
1669	lea	r8,[r11*2+rcx]
1670	shr	r11,63
1671	or	r8,r10
1672	mov	r10,QWORD[32+rdi]
1673	mov	r14,r11
1674	mul	rax
1675	neg	r15
1676	mov	r11,QWORD[40+rdi]
1677	adc	rbx,rax
1678	mov	rax,QWORD[rbp*1+rsi]
1679	mov	QWORD[16+rdi],rbx
1680	adc	r8,rdx
1681	lea	rbp,[16+rbp]
1682	mov	QWORD[24+rdi],r8
1683	sbb	r15,r15
1684	lea	rdi,[64+rdi]
1685	jmp	NEAR $L$sqr4x_shift_n_add
1686
1687ALIGN	32
1688$L$sqr4x_shift_n_add:
1689	lea	r12,[r10*2+r14]
1690	shr	r10,63
1691	lea	r13,[r11*2+rcx]
1692	shr	r11,63
1693	or	r13,r10
1694	mov	r10,QWORD[((-16))+rdi]
1695	mov	r14,r11
1696	mul	rax
1697	neg	r15
1698	mov	r11,QWORD[((-8))+rdi]
1699	adc	r12,rax
1700	mov	rax,QWORD[((-8))+rbp*1+rsi]
1701	mov	QWORD[((-32))+rdi],r12
1702	adc	r13,rdx
1703
1704	lea	rbx,[r10*2+r14]
1705	mov	QWORD[((-24))+rdi],r13
1706	sbb	r15,r15
1707	shr	r10,63
1708	lea	r8,[r11*2+rcx]
1709	shr	r11,63
1710	or	r8,r10
1711	mov	r10,QWORD[rdi]
1712	mov	r14,r11
1713	mul	rax
1714	neg	r15
1715	mov	r11,QWORD[8+rdi]
1716	adc	rbx,rax
1717	mov	rax,QWORD[rbp*1+rsi]
1718	mov	QWORD[((-16))+rdi],rbx
1719	adc	r8,rdx
1720
1721	lea	r12,[r10*2+r14]
1722	mov	QWORD[((-8))+rdi],r8
1723	sbb	r15,r15
1724	shr	r10,63
1725	lea	r13,[r11*2+rcx]
1726	shr	r11,63
1727	or	r13,r10
1728	mov	r10,QWORD[16+rdi]
1729	mov	r14,r11
1730	mul	rax
1731	neg	r15
1732	mov	r11,QWORD[24+rdi]
1733	adc	r12,rax
1734	mov	rax,QWORD[8+rbp*1+rsi]
1735	mov	QWORD[rdi],r12
1736	adc	r13,rdx
1737
1738	lea	rbx,[r10*2+r14]
1739	mov	QWORD[8+rdi],r13
1740	sbb	r15,r15
1741	shr	r10,63
1742	lea	r8,[r11*2+rcx]
1743	shr	r11,63
1744	or	r8,r10
1745	mov	r10,QWORD[32+rdi]
1746	mov	r14,r11
1747	mul	rax
1748	neg	r15
1749	mov	r11,QWORD[40+rdi]
1750	adc	rbx,rax
1751	mov	rax,QWORD[16+rbp*1+rsi]
1752	mov	QWORD[16+rdi],rbx
1753	adc	r8,rdx
1754	mov	QWORD[24+rdi],r8
1755	sbb	r15,r15
1756	lea	rdi,[64+rdi]
1757	add	rbp,32
1758	jnz	NEAR $L$sqr4x_shift_n_add
1759
1760	lea	r12,[r10*2+r14]
1761	DB	0x67
1762	shr	r10,63
1763	lea	r13,[r11*2+rcx]
1764	shr	r11,63
1765	or	r13,r10
1766	mov	r10,QWORD[((-16))+rdi]
1767	mov	r14,r11
1768	mul	rax
1769	neg	r15
1770	mov	r11,QWORD[((-8))+rdi]
1771	adc	r12,rax
1772	mov	rax,QWORD[((-8))+rsi]
1773	mov	QWORD[((-32))+rdi],r12
1774	adc	r13,rdx
1775
1776	lea	rbx,[r10*2+r14]
1777	mov	QWORD[((-24))+rdi],r13
1778	sbb	r15,r15
1779	shr	r10,63
1780	lea	r8,[r11*2+rcx]
1781	shr	r11,63
1782	or	r8,r10
1783	mul	rax
1784	neg	r15
1785	adc	rbx,rax
1786	adc	r8,rdx
1787	mov	QWORD[((-16))+rdi],rbx
1788	mov	QWORD[((-8))+rdi],r8
1789DB	102,72,15,126,213
; __bn_sqr8x_reduction -- internal helper, entered by fall-through from the
; code above or through its label; returns with `ret`.
; NOTE(review): looks like a word-by-word Montgomery reduction of the vector
; at rsp+56, 8 words per pass -- confirm against the generating Perl script.
; Reads on entry:
;   rbp      pointer to the words multiplied by rbx below (presumably the modulus)
;   r9       byte length used to size the working vector
;   [40+rsp] constant multiplied into the low word (presumably n0)
;   xmm2, xmm3  values moved back into rbp and r9 at the end of every pass
; Stack slots written:
;   [8+rsp]  = rbp + r9          (end pointer compared against rbp)
;   [16+rsp] = rsp + 56 + 2*r9   (end pointer compared against rdi)
;   [56+rsp]..[112+rsp]          the eight multipliers produced by $L$8x_reduce
1790__bn_sqr8x_reduction:
1791	xor	rax,rax
1792	lea	rcx,[rbp*1+r9]
1793	lea	rdx,[((48+8))+r9*2+rsp]
1794	mov	QWORD[((0+8))+rsp],rcx
1795	lea	rdi,[((48+8))+r9*1+rsp]
1796	mov	QWORD[((8+8))+rsp],rdx
1797	neg	r9
1798	jmp	NEAR $L$8x_reduction_loop
1799
1800ALIGN	32
1801$L$8x_reduction_loop:
1802	lea	rdi,[r9*1+rdi]	; r9 is negative here, so this steps rdi back
1803	DB	0x66
1804	mov	rbx,QWORD[rdi]
1805	mov	r9,QWORD[8+rdi]
1806	mov	r10,QWORD[16+rdi]
1807	mov	r11,QWORD[24+rdi]
1808	mov	r12,QWORD[32+rdi]
1809	mov	r13,QWORD[40+rdi]
1810	mov	r14,QWORD[48+rdi]
1811	mov	r15,QWORD[56+rdi]
1812	mov	QWORD[rdx],rax	; park the carry word from the previous pass at [rdx]
1813	lea	rdi,[64+rdi]
1814
1815	DB	0x67
1816	mov	r8,rbx
1817	imul	rbx,QWORD[((32+8))+rsp]	; first multiplier = low word * [40+rsp]
1818	mov	rax,QWORD[rbp]
1819	mov	ecx,8	; eight iterations of $L$8x_reduce
1820	jmp	NEAR $L$8x_reduce
1821
1822ALIGN	32
1823$L$8x_reduce:
1824	mul	rbx
1825	mov	rax,QWORD[8+rbp]
1826	neg	r8	; only the resulting CF (r8 != 0) is used; r8 is overwritten next
1827	mov	r8,rdx
1828	adc	r8,0
1829
1830	mul	rbx
1831	add	r9,rax
1832	mov	rax,QWORD[16+rbp]
1833	adc	rdx,0
1834	add	r8,r9
1835	mov	QWORD[((48-8+8))+rcx*8+rsp],rbx	; save this iteration's multiplier (slot index follows ecx)
1836	mov	r9,rdx
1837	adc	r9,0
1838
1839	mul	rbx
1840	add	r10,rax
1841	mov	rax,QWORD[24+rbp]
1842	adc	rdx,0
1843	add	r9,r10
1844	mov	rsi,QWORD[((32+8))+rsp]
1845	mov	r10,rdx
1846	adc	r10,0
1847
1848	mul	rbx
1849	add	r11,rax
1850	mov	rax,QWORD[32+rbp]
1851	adc	rdx,0
1852	imul	rsi,r8	; next multiplier = new low word * [40+rsp]
1853	add	r10,r11
1854	mov	r11,rdx
1855	adc	r11,0
1856
1857	mul	rbx
1858	add	r12,rax
1859	mov	rax,QWORD[40+rbp]
1860	adc	rdx,0
1861	add	r11,r12
1862	mov	r12,rdx
1863	adc	r12,0
1864
1865	mul	rbx
1866	add	r13,rax
1867	mov	rax,QWORD[48+rbp]
1868	adc	rdx,0
1869	add	r12,r13
1870	mov	r13,rdx
1871	adc	r13,0
1872
1873	mul	rbx
1874	add	r14,rax
1875	mov	rax,QWORD[56+rbp]
1876	adc	rdx,0
1877	add	r13,r14
1878	mov	r14,rdx
1879	adc	r14,0
1880
1881	mul	rbx
1882	mov	rbx,rsi	; switch to the multiplier computed above
1883	add	r15,rax
1884	mov	rax,QWORD[rbp]
1885	adc	rdx,0
1886	add	r14,r15
1887	mov	r15,rdx
1888	adc	r15,0
1889
1890	dec	ecx
1891	jnz	NEAR $L$8x_reduce
1892
1893	lea	rbp,[64+rbp]
1894	xor	rax,rax
1895	mov	rdx,QWORD[((8+8))+rsp]
1896	cmp	rbp,QWORD[((0+8))+rsp]
1897	jae	NEAR $L$8x_no_tail	; no further words at rbp: skip the tail loop
1898
1899	DB	0x66
1900	add	r8,QWORD[rdi]
1901	adc	r9,QWORD[8+rdi]
1902	adc	r10,QWORD[16+rdi]
1903	adc	r11,QWORD[24+rdi]
1904	adc	r12,QWORD[32+rdi]
1905	adc	r13,QWORD[40+rdi]
1906	adc	r14,QWORD[48+rdi]
1907	adc	r15,QWORD[56+rdi]
1908	sbb	rsi,rsi	; rsi = 0 or -1: remembers the carry out of the chain above
1909
1910	mov	rbx,QWORD[((48+56+8))+rsp]	; reload the first saved multiplier
1911	mov	ecx,8
1912	mov	rax,QWORD[rbp]
1913	jmp	NEAR $L$8x_tail
1914
1915ALIGN	32
1916$L$8x_tail:
1917	mul	rbx
1918	add	r8,rax
1919	mov	rax,QWORD[8+rbp]
1920	mov	QWORD[rdi],r8
1921	mov	r8,rdx
1922	adc	r8,0
1923
1924	mul	rbx
1925	add	r9,rax
1926	mov	rax,QWORD[16+rbp]
1927	adc	rdx,0
1928	add	r8,r9
1929	lea	rdi,[8+rdi]	; lea leaves the flags untouched
1930	mov	r9,rdx
1931	adc	r9,0
1932
1933	mul	rbx
1934	add	r10,rax
1935	mov	rax,QWORD[24+rbp]
1936	adc	rdx,0
1937	add	r9,r10
1938	mov	r10,rdx
1939	adc	r10,0
1940
1941	mul	rbx
1942	add	r11,rax
1943	mov	rax,QWORD[32+rbp]
1944	adc	rdx,0
1945	add	r10,r11
1946	mov	r11,rdx
1947	adc	r11,0
1948
1949	mul	rbx
1950	add	r12,rax
1951	mov	rax,QWORD[40+rbp]
1952	adc	rdx,0
1953	add	r11,r12
1954	mov	r12,rdx
1955	adc	r12,0
1956
1957	mul	rbx
1958	add	r13,rax
1959	mov	rax,QWORD[48+rbp]
1960	adc	rdx,0
1961	add	r12,r13
1962	mov	r13,rdx
1963	adc	r13,0
1964
1965	mul	rbx
1966	add	r14,rax
1967	mov	rax,QWORD[56+rbp]
1968	adc	rdx,0
1969	add	r13,r14
1970	mov	r14,rdx
1971	adc	r14,0
1972
1973	mul	rbx
1974	mov	rbx,QWORD[((48-16+8))+rcx*8+rsp]	; fetch the next saved multiplier
1975	add	r15,rax
1976	adc	rdx,0
1977	add	r14,r15
1978	mov	rax,QWORD[rbp]
1979	mov	r15,rdx
1980	adc	r15,0
1981
1982	dec	ecx
1983	jnz	NEAR $L$8x_tail
1984
1985	lea	rbp,[64+rbp]
1986	mov	rdx,QWORD[((8+8))+rsp]
1987	cmp	rbp,QWORD[((0+8))+rsp]
1988	jae	NEAR $L$8x_tail_done
1989
1990	mov	rbx,QWORD[((48+56+8))+rsp]
1991	neg	rsi	; turn the saved 0/-1 back into CF for the adc chain
1992	mov	rax,QWORD[rbp]
1993	adc	r8,QWORD[rdi]
1994	adc	r9,QWORD[8+rdi]
1995	adc	r10,QWORD[16+rdi]
1996	adc	r11,QWORD[24+rdi]
1997	adc	r12,QWORD[32+rdi]
1998	adc	r13,QWORD[40+rdi]
1999	adc	r14,QWORD[48+rdi]
2000	adc	r15,QWORD[56+rdi]
2001	sbb	rsi,rsi
2002
2003	mov	ecx,8
2004	jmp	NEAR $L$8x_tail
2005
2006ALIGN	32
2007$L$8x_tail_done:
2008	xor	rax,rax
2009	add	r8,QWORD[rdx]	; add the carry word parked at [rdx] at the top of the pass
2010	adc	r9,0
2011	adc	r10,0
2012	adc	r11,0
2013	adc	r12,0
2014	adc	r13,0
2015	adc	r14,0
2016	adc	r15,0
2017	adc	rax,0
2018
2019	neg	rsi
2020$L$8x_no_tail:
2021	adc	r8,QWORD[rdi]
2022	adc	r9,QWORD[8+rdi]
2023	adc	r10,QWORD[16+rdi]
2024	adc	r11,QWORD[24+rdi]
2025	adc	r12,QWORD[32+rdi]
2026	adc	r13,QWORD[40+rdi]
2027	adc	r14,QWORD[48+rdi]
2028	adc	r15,QWORD[56+rdi]
2029	adc	rax,0	; rax = carry word handed to the next pass
2030	mov	rcx,QWORD[((-8))+rbp]
2031	xor	rsi,rsi
2032
2033DB	102,72,15,126,213	; encodes: movq rbp,xmm2
2034
2035	mov	QWORD[rdi],r8
2036	mov	QWORD[8+rdi],r9
2037DB	102,73,15,126,217	; encodes: movq r9,xmm3
2038	mov	QWORD[16+rdi],r10
2039	mov	QWORD[24+rdi],r11
2040	mov	QWORD[32+rdi],r12
2041	mov	QWORD[40+rdi],r13
2042	mov	QWORD[48+rdi],r14
2043	mov	QWORD[56+rdi],r15
2044	lea	rdi,[64+rdi]
2045
2046	cmp	rdi,rdx
2047	jb	NEAR $L$8x_reduction_loop	; repeat until rdi reaches the end pointer in rdx
2048	ret
2049
2050
2051
2052ALIGN	32
; __bn_post4x_internal -- masked final correction, four qwords per iteration.
; For every word: dst = src + (~operand & mask) + carry, where
;   src     is read through rbx (= rdi + r9 on entry),
;   operand is read through rbp,
;   dst     is written through rdi, which is reloaded from xmm1,
;   mask    is -rax.
; The first operand word is decremented before being complemented
; (~(x-1) == -x), so with an all-ones mask the loop computes src - operand;
; with a zero mask it copies src unchanged.
; NOTE(review): assumes rax is 0 or 1 on entry and that the first operand
; word is non-zero -- confirm against the callers.
; The loop counter is r9 >> 5 and counts up to zero, so r9 must be a negative
; multiple of 32 on entry. On return r10 holds the entry value of r9, r9 is
; negated, and rsi holds the same value as the destination pointer taken
; from xmm1.
2053__bn_post4x_internal:
2054
2055	mov	r12,QWORD[rbp]
2056	lea	rbx,[r9*1+rdi]
2057	mov	rcx,r9
2058DB	102,72,15,126,207	; encodes: movq rdi,xmm1
2059	neg	rax	; rax becomes the AND mask
2060DB	102,72,15,126,206	; encodes: movq rsi,xmm1
2061	sar	rcx,3+2	; count of 32-byte groups (negative)
2062	dec	r12	; so that the NOT below yields the two's-complement negation
2063	xor	r10,r10	; no incoming carry
2064	mov	r13,QWORD[8+rbp]
2065	mov	r14,QWORD[16+rbp]
2066	mov	r15,QWORD[24+rbp]
2067	jmp	NEAR $L$sqr4x_sub_entry
2068
2069ALIGN	16
2070$L$sqr4x_sub:
2071	mov	r12,QWORD[rbp]
2072	mov	r13,QWORD[8+rbp]
2073	mov	r14,QWORD[16+rbp]
2074	mov	r15,QWORD[24+rbp]
2075$L$sqr4x_sub_entry:
2076	lea	rbp,[32+rbp]
2077	not	r12
2078	not	r13
2079	not	r14
2080	not	r15
2081	and	r12,rax
2082	and	r13,rax
2083	and	r14,rax
2084	and	r15,rax
2085
2086	neg	r10	; restore CF from the 0/-1 saved by the sbb below
2087	adc	r12,QWORD[rbx]
2088	adc	r13,QWORD[8+rbx]
2089	adc	r14,QWORD[16+rbx]
2090	adc	r15,QWORD[24+rbx]
2091	mov	QWORD[rdi],r12
2092	lea	rbx,[32+rbx]
2093	mov	QWORD[8+rdi],r13
2094	sbb	r10,r10	; keep the carry across the flag-clobbering inc
2095	mov	QWORD[16+rdi],r14
2096	mov	QWORD[24+rdi],r15
2097	lea	rdi,[32+rdi]
2098
2099	inc	rcx
2100	jnz	NEAR $L$sqr4x_sub
2101
2102	mov	r10,r9
2103	neg	r9
2104	ret
2105
2106
; bn_mulx4x_mont_gather5 -- exported Win64 entry point wrapping mulx4x_internal.
; The prologue stores rdi/rsi in the caller-provided home slots and moves the
; Win64 arguments (rcx, rdx, r8, r9, [40+rsp], [48+rsp]) into
; rdi, rsi, rdx, rcx, r8, r9. The seventh argument is not read here;
; mulx4x_internal fetches it from [56+rax], rax being the entry stack pointer.
; The sixth argument is scaled by 8 (shl r9d,3) and negated, and the fifth
; argument is dereferenced once (r8 = [r8]).
; Frame slots after allocation: [32+rsp] = r8, [40+rsp] = entry rsp.
; Returns rax = 1 with rbx, rbp, r12-r15, rdi and rsi restored.
2107global	bn_mulx4x_mont_gather5
2108
2109ALIGN	32
2110bn_mulx4x_mont_gather5:
2111	mov	QWORD[8+rsp],rdi	;WIN64 prologue
2112	mov	QWORD[16+rsp],rsi
2113	mov	rax,rsp
2114$L$SEH_begin_bn_mulx4x_mont_gather5:
2115	mov	rdi,rcx
2116	mov	rsi,rdx
2117	mov	rdx,r8
2118	mov	rcx,r9
2119	mov	r8,QWORD[40+rsp]
2120	mov	r9,QWORD[48+rsp]
2121
2122
2123
2124_CET_ENDBR
2125	mov	rax,rsp	; rax keeps the entry stack pointer for the epilogue
2126
2127	push	rbx
2128
2129	push	rbp
2130
2131	push	r12
2132
2133	push	r13
2134
2135	push	r14
2136
2137	push	r15
2138
2139$L$mulx4x_prologue:
2140
2141
2142
2143
2144	shl	r9d,3	; 32-bit shift also zero-extends into r9
2145	lea	r10,[r9*2+r9]	; r10 = 3 * r9
2146	neg	r9
2147	mov	r8,QWORD[r8]
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
; Choose the frame base in rbp from the low 12 bits of (candidate - rdi).
; NOTE(review): presumably keeps the frame from aliasing the output buffer
; modulo 4096 -- confirm against the generating script.
2158	lea	r11,[((-320))+r9*2+rsp]
2159	mov	rbp,rsp
2160	sub	r11,rdi
2161	and	r11,4095
2162	cmp	r10,r11
2163	jb	NEAR $L$mulx4xsp_alt
2164	sub	rbp,r11
2165	lea	rbp,[((-320))+r9*2+rbp]
2166	jmp	NEAR $L$mulx4xsp_done
2167
2168$L$mulx4xsp_alt:
2169	lea	r10,[((4096-320))+r9*2]
2170	lea	rbp,[((-320))+r9*2+rbp]
2171	sub	r11,r10
2172	mov	r10,0	; mov rather than xor: the flags from sub must survive for cmovc
2173	cmovc	r11,r10
2174	sub	rbp,r11
2175$L$mulx4xsp_done:
2176	and	rbp,-64	; 64-byte align the frame base
2177	mov	r11,rsp
2178	sub	r11,rbp
2179	and	r11,-4096
2180	lea	rsp,[rbp*1+r11]
2181	mov	r10,QWORD[rsp]	; touch the new top of stack
2182	cmp	rsp,rbp
2183	ja	NEAR $L$mulx4x_page_walk
2184	jmp	NEAR $L$mulx4x_page_walk_done
2185
; Step rsp down 4096 bytes at a time, reading one qword per step, until it
; reaches rbp.
2186$L$mulx4x_page_walk:
2187	lea	rsp,[((-4096))+rsp]
2188	mov	r10,QWORD[rsp]
2189	cmp	rsp,rbp
2190	ja	NEAR $L$mulx4x_page_walk
2191$L$mulx4x_page_walk_done:
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205	mov	QWORD[32+rsp],r8
2206	mov	QWORD[40+rsp],rax
2207
2208$L$mulx4x_body:
2209	call	mulx4x_internal
2210
2211	mov	rsi,QWORD[40+rsp]	; rsi = entry stack pointer saved above
2212
2213	mov	rax,1	; return value
2214
2215	mov	r15,QWORD[((-48))+rsi]
2216
2217	mov	r14,QWORD[((-40))+rsi]
2218
2219	mov	r13,QWORD[((-32))+rsi]
2220
2221	mov	r12,QWORD[((-24))+rsi]
2222
2223	mov	rbp,QWORD[((-16))+rsi]
2224
2225	mov	rbx,QWORD[((-8))+rsi]
2226
2227	lea	rsp,[rsi]
2228
2229$L$mulx4x_epilogue:
2230	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
2231	mov	rsi,QWORD[16+rsp]
2232	ret
2233
2234$L$SEH_end_bn_mulx4x_mont_gather5:
2235
2236
2237ALIGN	32
; mulx4x_internal -- shared worker called by bn_mulx4x_mont_gather5 and
; bn_powerx5. It does not return itself: it ends by jumping to
; $L$sqrx4x_sub_entry, whose code returns to the caller.
; Register inputs read below:
;   rdi  saved to [64+rsp] and reloaded into rdx just before the final jump
;   rsi  pointer to the words multiplied by the gathered operand
;   rdx  base of the 16-slot table the operand is gathered from
;   rcx  pointer to the words multiplied by the per-row factor in r8
;   r9   byte count (stored at [8+rsp])
;   rax  caller's entry stack pointer; the dword at [56+rax] is the index
;        broadcast into xmm5 and compared against the $L$inc counters
;   [40+rsp]  constant multiplied into the low word of every row
; Stack slots written: [8+rsp] byte count, [16+rsp] current table position,
; [24+rsp] table end (128 + 32*r9 + rdx), [32+rsp] inner-loop trip count.
; Each gather reads all 256 bytes of a table row and combines them under the
; sixteen compare masks stored at [112+r10]..[352+r10], so the addresses
; accessed do not depend on the index.
; NOTE(review): the structure suggests a Montgomery multiplication, four
; words per step, with rcx the modulus and [40+rsp] n0 -- confirm against
; the generating Perl script.
2238mulx4x_internal:
2239
2240	mov	QWORD[8+rsp],r9
2241	mov	r10,r9
2242	neg	r9
2243	shl	r9,5
2244	neg	r10
2245	lea	r13,[128+r9*1+rdx]
2246	shr	r9,5+5
2247	movd	xmm5,DWORD[56+rax]
2248	sub	r9,1
2249	lea	rax,[$L$inc]
2250	mov	QWORD[((16+8))+rsp],r13
2251	mov	QWORD[((24+8))+rsp],r9
2252	mov	QWORD[((56+8))+rsp],rdi
2253	movdqa	xmm0,XMMWORD[rax]
2254	movdqa	xmm1,XMMWORD[16+rax]
2255	lea	r10,[((88-112))+r10*1+rsp]
2256	lea	rdi,[128+rdx]
2257
; Build the sixteen masks: counters are advanced with paddd and compared with
; the broadcast index by pcmpeqd, then stored at [112+r10] onward.
2258	pshufd	xmm5,xmm5,0
2259	movdqa	xmm4,xmm1
2260	DB	0x67
2261	movdqa	xmm2,xmm1
2262	DB	0x67
2263	paddd	xmm1,xmm0
2264	pcmpeqd	xmm0,xmm5
2265	movdqa	xmm3,xmm4
2266	paddd	xmm2,xmm1
2267	pcmpeqd	xmm1,xmm5
2268	movdqa	XMMWORD[112+r10],xmm0
2269	movdqa	xmm0,xmm4
2270
2271	paddd	xmm3,xmm2
2272	pcmpeqd	xmm2,xmm5
2273	movdqa	XMMWORD[128+r10],xmm1
2274	movdqa	xmm1,xmm4
2275
2276	paddd	xmm0,xmm3
2277	pcmpeqd	xmm3,xmm5
2278	movdqa	XMMWORD[144+r10],xmm2
2279	movdqa	xmm2,xmm4
2280
2281	paddd	xmm1,xmm0
2282	pcmpeqd	xmm0,xmm5
2283	movdqa	XMMWORD[160+r10],xmm3
2284	movdqa	xmm3,xmm4
2285	paddd	xmm2,xmm1
2286	pcmpeqd	xmm1,xmm5
2287	movdqa	XMMWORD[176+r10],xmm0
2288	movdqa	xmm0,xmm4
2289
2290	paddd	xmm3,xmm2
2291	pcmpeqd	xmm2,xmm5
2292	movdqa	XMMWORD[192+r10],xmm1
2293	movdqa	xmm1,xmm4
2294
2295	paddd	xmm0,xmm3
2296	pcmpeqd	xmm3,xmm5
2297	movdqa	XMMWORD[208+r10],xmm2
2298	movdqa	xmm2,xmm4
2299
2300	paddd	xmm1,xmm0
2301	pcmpeqd	xmm0,xmm5
2302	movdqa	XMMWORD[224+r10],xmm3
2303	movdqa	xmm3,xmm4
2304	paddd	xmm2,xmm1
2305	pcmpeqd	xmm1,xmm5
2306	movdqa	XMMWORD[240+r10],xmm0
2307	movdqa	xmm0,xmm4
2308
2309	paddd	xmm3,xmm2
2310	pcmpeqd	xmm2,xmm5
2311	movdqa	XMMWORD[256+r10],xmm1
2312	movdqa	xmm1,xmm4
2313
2314	paddd	xmm0,xmm3
2315	pcmpeqd	xmm3,xmm5
2316	movdqa	XMMWORD[272+r10],xmm2
2317	movdqa	xmm2,xmm4
2318
2319	paddd	xmm1,xmm0
2320	pcmpeqd	xmm0,xmm5
2321	movdqa	XMMWORD[288+r10],xmm3
2322	movdqa	xmm3,xmm4
2323	DB	0x67
2324	paddd	xmm2,xmm1
2325	pcmpeqd	xmm1,xmm5
2326	movdqa	XMMWORD[304+r10],xmm0
2327
2328	paddd	xmm3,xmm2
2329	pcmpeqd	xmm2,xmm5
2330	movdqa	XMMWORD[320+r10],xmm1
2331
2332	pcmpeqd	xmm3,xmm5
2333	movdqa	XMMWORD[336+r10],xmm2
2334
; First gather: the last four masks are still in xmm0-xmm3, the other twelve
; are read back from the stack.
2335	pand	xmm0,XMMWORD[64+rdi]
2336	pand	xmm1,XMMWORD[80+rdi]
2337	pand	xmm2,XMMWORD[96+rdi]
2338	movdqa	XMMWORD[352+r10],xmm3
2339	pand	xmm3,XMMWORD[112+rdi]
2340	por	xmm0,xmm2
2341	por	xmm1,xmm3
2342	movdqa	xmm4,XMMWORD[((-128))+rdi]
2343	movdqa	xmm5,XMMWORD[((-112))+rdi]
2344	movdqa	xmm2,XMMWORD[((-96))+rdi]
2345	pand	xmm4,XMMWORD[112+r10]
2346	movdqa	xmm3,XMMWORD[((-80))+rdi]
2347	pand	xmm5,XMMWORD[128+r10]
2348	por	xmm0,xmm4
2349	pand	xmm2,XMMWORD[144+r10]
2350	por	xmm1,xmm5
2351	pand	xmm3,XMMWORD[160+r10]
2352	por	xmm0,xmm2
2353	por	xmm1,xmm3
2354	movdqa	xmm4,XMMWORD[((-64))+rdi]
2355	movdqa	xmm5,XMMWORD[((-48))+rdi]
2356	movdqa	xmm2,XMMWORD[((-32))+rdi]
2357	pand	xmm4,XMMWORD[176+r10]
2358	movdqa	xmm3,XMMWORD[((-16))+rdi]
2359	pand	xmm5,XMMWORD[192+r10]
2360	por	xmm0,xmm4
2361	pand	xmm2,XMMWORD[208+r10]
2362	por	xmm1,xmm5
2363	pand	xmm3,XMMWORD[224+r10]
2364	por	xmm0,xmm2
2365	por	xmm1,xmm3
2366	movdqa	xmm4,XMMWORD[rdi]
2367	movdqa	xmm5,XMMWORD[16+rdi]
2368	movdqa	xmm2,XMMWORD[32+rdi]
2369	pand	xmm4,XMMWORD[240+r10]
2370	movdqa	xmm3,XMMWORD[48+rdi]
2371	pand	xmm5,XMMWORD[256+r10]
2372	por	xmm0,xmm4
2373	pand	xmm2,XMMWORD[272+r10]
2374	por	xmm1,xmm5
2375	pand	xmm3,XMMWORD[288+r10]
2376	por	xmm0,xmm2
2377	por	xmm1,xmm3
2378	pxor	xmm0,xmm1
2379
2380	pshufd	xmm1,xmm0,0x4e	; swap the two 64-bit halves
2381	por	xmm0,xmm1
2382	lea	rdi,[256+rdi]	; advance to the next table row
2383DB	102,72,15,126,194	; encodes: movq rdx,xmm0 (gathered operand word)
2384	lea	rbx,[((64+32+8))+rsp]
2385
2386	mov	r9,rdx	; keep the operand word; rdx alternates between it and r8
2387	mulx	rax,r8,QWORD[rsi]
2388	mulx	r12,r11,QWORD[8+rsi]
2389	add	r11,rax
2390	mulx	r13,rax,QWORD[16+rsi]
2391	adc	r12,rax
2392	adc	r13,0
2393	mulx	r14,rax,QWORD[24+rsi]
2394
2395	mov	r15,r8
2396	imul	r8,QWORD[((32+8))+rsp]	; per-row factor = low word * [40+rsp]
2397	xor	rbp,rbp	; rbp = 0 and clears CF/OF for the adcx/adox chains
2398	mov	rdx,r8
2399
2400	mov	QWORD[((8+8))+rsp],rdi
2401
2402	lea	rsi,[32+rsi]
2403	adcx	r13,rax
2404	adcx	r14,rbp
2405
2406	mulx	r10,rax,QWORD[rcx]
2407	adcx	r15,rax
2408	adox	r10,r11
2409	mulx	r11,rax,QWORD[8+rcx]
2410	adcx	r10,rax
2411	adox	r11,r12
2412	mulx	r12,rax,QWORD[16+rcx]
2413	mov	rdi,QWORD[((24+8))+rsp]	; inner-loop trip count
2414	mov	QWORD[((-32))+rbx],r10
2415	adcx	r11,rax
2416	adox	r12,r13
2417	mulx	r15,rax,QWORD[24+rcx]
2418	mov	rdx,r9
2419	mov	QWORD[((-24))+rbx],r11
2420	adcx	r12,rax
2421	adox	r15,rbp
2422	lea	rcx,[32+rcx]
2423	mov	QWORD[((-16))+rbx],r12
2424	jmp	NEAR $L$mulx4x_1st
2425
2426ALIGN	32
; First row: four words per iteration; adcx carries through CF and adox
; through OF, so two addition chains run interleaved.
2427$L$mulx4x_1st:
2428	adcx	r15,rbp
2429	mulx	rax,r10,QWORD[rsi]
2430	adcx	r10,r14
2431	mulx	r14,r11,QWORD[8+rsi]
2432	adcx	r11,rax
2433	mulx	rax,r12,QWORD[16+rsi]
2434	adcx	r12,r14
2435	mulx	r14,r13,QWORD[24+rsi]
2436	DB	0x67,0x67
2437	mov	rdx,r8
2438	adcx	r13,rax
2439	adcx	r14,rbp
2440	lea	rsi,[32+rsi]
2441	lea	rbx,[32+rbx]
2442
2443	adox	r10,r15
2444	mulx	r15,rax,QWORD[rcx]
2445	adcx	r10,rax
2446	adox	r11,r15
2447	mulx	r15,rax,QWORD[8+rcx]
2448	adcx	r11,rax
2449	adox	r12,r15
2450	mulx	r15,rax,QWORD[16+rcx]
2451	mov	QWORD[((-40))+rbx],r10
2452	adcx	r12,rax
2453	mov	QWORD[((-32))+rbx],r11
2454	adox	r13,r15
2455	mulx	r15,rax,QWORD[24+rcx]
2456	mov	rdx,r9
2457	mov	QWORD[((-24))+rbx],r12
2458	adcx	r13,rax
2459	adox	r15,rbp
2460	lea	rcx,[32+rcx]
2461	mov	QWORD[((-16))+rbx],r13
2462
2463	dec	rdi	; dec leaves CF intact for the adc after the loop
2464	jnz	NEAR $L$mulx4x_1st
2465
2466	mov	rax,QWORD[8+rsp]	; saved byte count (negative here, given the rewinds below)
2467	adc	r15,rbp
2468	lea	rsi,[rax*1+rsi]	; rewind rsi to the first word
2469	add	r14,r15
2470	mov	rdi,QWORD[((8+8))+rsp]
2471	adc	rbp,rbp	; rbp = top carry of this row
2472	mov	QWORD[((-8))+rbx],r14
2473	jmp	NEAR $L$mulx4x_outer
2474
2475ALIGN	32
; Outer loop: gather the next operand word from the table row at rdi, then
; accumulate another row onto the words stored through rbx.
2476$L$mulx4x_outer:
2477	lea	r10,[((16-256))+rbx]
2478	pxor	xmm4,xmm4
2479	DB	0x67,0x67
2480	pxor	xmm5,xmm5
2481	movdqa	xmm0,XMMWORD[((-128))+rdi]
2482	movdqa	xmm1,XMMWORD[((-112))+rdi]
2483	movdqa	xmm2,XMMWORD[((-96))+rdi]
2484	pand	xmm0,XMMWORD[256+r10]
2485	movdqa	xmm3,XMMWORD[((-80))+rdi]
2486	pand	xmm1,XMMWORD[272+r10]
2487	por	xmm4,xmm0
2488	pand	xmm2,XMMWORD[288+r10]
2489	por	xmm5,xmm1
2490	pand	xmm3,XMMWORD[304+r10]
2491	por	xmm4,xmm2
2492	por	xmm5,xmm3
2493	movdqa	xmm0,XMMWORD[((-64))+rdi]
2494	movdqa	xmm1,XMMWORD[((-48))+rdi]
2495	movdqa	xmm2,XMMWORD[((-32))+rdi]
2496	pand	xmm0,XMMWORD[320+r10]
2497	movdqa	xmm3,XMMWORD[((-16))+rdi]
2498	pand	xmm1,XMMWORD[336+r10]
2499	por	xmm4,xmm0
2500	pand	xmm2,XMMWORD[352+r10]
2501	por	xmm5,xmm1
2502	pand	xmm3,XMMWORD[368+r10]
2503	por	xmm4,xmm2
2504	por	xmm5,xmm3
2505	movdqa	xmm0,XMMWORD[rdi]
2506	movdqa	xmm1,XMMWORD[16+rdi]
2507	movdqa	xmm2,XMMWORD[32+rdi]
2508	pand	xmm0,XMMWORD[384+r10]
2509	movdqa	xmm3,XMMWORD[48+rdi]
2510	pand	xmm1,XMMWORD[400+r10]
2511	por	xmm4,xmm0
2512	pand	xmm2,XMMWORD[416+r10]
2513	por	xmm5,xmm1
2514	pand	xmm3,XMMWORD[432+r10]
2515	por	xmm4,xmm2
2516	por	xmm5,xmm3
2517	movdqa	xmm0,XMMWORD[64+rdi]
2518	movdqa	xmm1,XMMWORD[80+rdi]
2519	movdqa	xmm2,XMMWORD[96+rdi]
2520	pand	xmm0,XMMWORD[448+r10]
2521	movdqa	xmm3,XMMWORD[112+rdi]
2522	pand	xmm1,XMMWORD[464+r10]
2523	por	xmm4,xmm0
2524	pand	xmm2,XMMWORD[480+r10]
2525	por	xmm5,xmm1
2526	pand	xmm3,XMMWORD[496+r10]
2527	por	xmm4,xmm2
2528	por	xmm5,xmm3
2529	por	xmm4,xmm5
2530
2531	pshufd	xmm0,xmm4,0x4e
2532	por	xmm0,xmm4
2533	lea	rdi,[256+rdi]
2534DB	102,72,15,126,194	; encodes: movq rdx,xmm0 (gathered operand word)
2535
2536	mov	QWORD[rbx],rbp	; store the previous row's top carry
2537	lea	rbx,[32+rax*1+rbx]	; rewind rbx to the start of the accumulator
2538	mulx	r11,r8,QWORD[rsi]
2539	xor	rbp,rbp
2540	mov	r9,rdx
2541	mulx	r12,r14,QWORD[8+rsi]
2542	adox	r8,QWORD[((-32))+rbx]
2543	adcx	r11,r14
2544	mulx	r13,r15,QWORD[16+rsi]
2545	adox	r11,QWORD[((-24))+rbx]
2546	adcx	r12,r15
2547	mulx	r14,rdx,QWORD[24+rsi]
2548	adox	r12,QWORD[((-16))+rbx]
2549	adcx	r13,rdx
2550	lea	rcx,[rax*1+rcx]	; rewind rcx to the first word
2551	lea	rsi,[32+rsi]
2552	adox	r13,QWORD[((-8))+rbx]
2553	adcx	r14,rbp
2554	adox	r14,rbp
2555
2556	mov	r15,r8
2557	imul	r8,QWORD[((32+8))+rsp]
2558
2559	mov	rdx,r8
2560	xor	rbp,rbp
2561	mov	QWORD[((8+8))+rsp],rdi
2562
2563	mulx	r10,rax,QWORD[rcx]
2564	adcx	r15,rax
2565	adox	r10,r11
2566	mulx	r11,rax,QWORD[8+rcx]
2567	adcx	r10,rax
2568	adox	r11,r12
2569	mulx	r12,rax,QWORD[16+rcx]
2570	adcx	r11,rax
2571	adox	r12,r13
2572	mulx	r15,rax,QWORD[24+rcx]
2573	mov	rdx,r9
2574	mov	rdi,QWORD[((24+8))+rsp]
2575	mov	QWORD[((-32))+rbx],r10
2576	adcx	r12,rax
2577	mov	QWORD[((-24))+rbx],r11
2578	adox	r15,rbp
2579	mov	QWORD[((-16))+rbx],r12
2580	lea	rcx,[32+rcx]
2581	jmp	NEAR $L$mulx4x_inner
2582
2583ALIGN	32
2584$L$mulx4x_inner:
2585	mulx	rax,r10,QWORD[rsi]
2586	adcx	r15,rbp
2587	adox	r10,r14
2588	mulx	r14,r11,QWORD[8+rsi]
2589	adcx	r10,QWORD[rbx]
2590	adox	r11,rax
2591	mulx	rax,r12,QWORD[16+rsi]
2592	adcx	r11,QWORD[8+rbx]
2593	adox	r12,r14
2594	mulx	r14,r13,QWORD[24+rsi]
2595	mov	rdx,r8
2596	adcx	r12,QWORD[16+rbx]
2597	adox	r13,rax
2598	adcx	r13,QWORD[24+rbx]
2599	adox	r14,rbp
2600	lea	rsi,[32+rsi]
2601	lea	rbx,[32+rbx]
2602	adcx	r14,rbp
2603
2604	adox	r10,r15
2605	mulx	r15,rax,QWORD[rcx]
2606	adcx	r10,rax
2607	adox	r11,r15
2608	mulx	r15,rax,QWORD[8+rcx]
2609	adcx	r11,rax
2610	adox	r12,r15
2611	mulx	r15,rax,QWORD[16+rcx]
2612	mov	QWORD[((-40))+rbx],r10
2613	adcx	r12,rax
2614	adox	r13,r15
2615	mov	QWORD[((-32))+rbx],r11
2616	mulx	r15,rax,QWORD[24+rcx]
2617	mov	rdx,r9
2618	lea	rcx,[32+rcx]
2619	mov	QWORD[((-24))+rbx],r12
2620	adcx	r13,rax
2621	adox	r15,rbp
2622	mov	QWORD[((-16))+rbx],r13
2623
2624	dec	rdi
2625	jnz	NEAR $L$mulx4x_inner
2626
2627	mov	rax,QWORD[((0+8))+rsp]
2628	adc	r15,rbp
2629	sub	rdi,QWORD[rbx]	; rdi is 0 here, so CF = ([rbx] != 0); rdi is reloaded next
2630	mov	rdi,QWORD[((8+8))+rsp]
2631	mov	r10,QWORD[((16+8))+rsp]
2632	adc	r14,r15
2633	lea	rsi,[rax*1+rsi]
2634	adc	rbp,rbp
2635	mov	QWORD[((-8))+rbx],r14
2636
2637	cmp	rdi,r10
2638	jb	NEAR $L$mulx4x_outer	; loop until the table pointer reaches its end
2639
; Set up the registers expected at $L$sqrx4x_sub_entry: rbp = start of the
; rcx operand, rdi = start of the accumulator, rdx = pointer saved at
; [64+rsp], rax = mask source, rcx = group count, r12-r15 = first four words.
2640	mov	r10,QWORD[((-8))+rcx]
2641	mov	r8,rbp
2642	mov	r12,QWORD[rax*1+rcx]
2643	lea	rbp,[rax*1+rcx]
2644	mov	rcx,rax
2645	lea	rdi,[rax*1+rbx]
2646	xor	eax,eax
2647	xor	r15,r15
2648	sub	r10,r14	; borrow set when the top result word exceeds [-8+rcx]
2649	adc	r15,r15
2650	or	r8,r15
2651	sar	rcx,3+2
2652	sub	rax,r8	; rax = -(top carry | borrow)
2653	mov	rdx,QWORD[((56+8))+rsp]
2654	dec	r12
2655	mov	r13,QWORD[8+rbp]
2656	xor	r8,r8
2657	mov	r14,QWORD[16+rbp]
2658	mov	r15,QWORD[24+rbp]
2659	jmp	NEAR $L$sqrx4x_sub_entry
2660
2661
; bn_powerx5 -- exported Win64 entry point.
; After the same argument shuffle and stack-frame set-up as
; bn_mulx4x_mont_gather5, it calls __bn_sqrx8x_internal followed by
; __bn_postx4x_internal five times, then calls mulx4x_internal once with
; rdi = rsi.
; NOTE(review): the name and call sequence suggest "raise to the 32nd power,
; then multiply by a gathered table entry" -- confirm against the generating
; script.
; Values parked in XMM registers across the internal calls:
;   xmm0 = 0, xmm1 = rdi, xmm2 = rcx, xmm3 = negated scaled length, xmm4 = rdx
; Frame slots: [32+rsp] = r8 (loaded from [r8]), [40+rsp] = entry rsp.
; Returns rax = 1 with rbx, rbp, r12-r15, rdi and rsi restored.
2662global	bn_powerx5
2663
2664ALIGN	32
2665bn_powerx5:
2666	mov	QWORD[8+rsp],rdi	;WIN64 prologue
2667	mov	QWORD[16+rsp],rsi
2668	mov	rax,rsp
2669$L$SEH_begin_bn_powerx5:
2670	mov	rdi,rcx
2671	mov	rsi,rdx
2672	mov	rdx,r8
2673	mov	rcx,r9
2674	mov	r8,QWORD[40+rsp]
2675	mov	r9,QWORD[48+rsp]
2676
2677
2678
2679_CET_ENDBR
2680	mov	rax,rsp	; rax keeps the entry stack pointer for the epilogue
2681
2682	push	rbx
2683
2684	push	rbp
2685
2686	push	r12
2687
2688	push	r13
2689
2690	push	r14
2691
2692	push	r15
2693
2694$L$powerx5_prologue:
2695
2696
2697
2698
2699	shl	r9d,3	; 32-bit shift also zero-extends into r9
2700	lea	r10,[r9*2+r9]	; r10 = 3 * r9
2701	neg	r9
2702	mov	r8,QWORD[r8]
2703
2704
2705
2706
2707
2708
2709
2710
; Choose the frame base in rbp from the low 12 bits of (candidate - rdi).
; NOTE(review): presumably keeps the frame from aliasing the output buffer
; modulo 4096 -- confirm against the generating script.
2711	lea	r11,[((-320))+r9*2+rsp]
2712	mov	rbp,rsp
2713	sub	r11,rdi
2714	and	r11,4095
2715	cmp	r10,r11
2716	jb	NEAR $L$pwrx_sp_alt
2717	sub	rbp,r11
2718	lea	rbp,[((-320))+r9*2+rbp]
2719	jmp	NEAR $L$pwrx_sp_done
2720
2721ALIGN	32
2722$L$pwrx_sp_alt:
2723	lea	r10,[((4096-320))+r9*2]
2724	lea	rbp,[((-320))+r9*2+rbp]
2725	sub	r11,r10
2726	mov	r10,0	; mov rather than xor: the flags from sub must survive for cmovc
2727	cmovc	r11,r10
2728	sub	rbp,r11
2729$L$pwrx_sp_done:
2730	and	rbp,-64	; 64-byte align the frame base
2731	mov	r11,rsp
2732	sub	r11,rbp
2733	and	r11,-4096
2734	lea	rsp,[rbp*1+r11]
2735	mov	r10,QWORD[rsp]	; touch the new top of stack
2736	cmp	rsp,rbp
2737	ja	NEAR $L$pwrx_page_walk
2738	jmp	NEAR $L$pwrx_page_walk_done
2739
; Step rsp down 4096 bytes at a time, reading one qword per step, until it
; reaches rbp.
2740$L$pwrx_page_walk:
2741	lea	rsp,[((-4096))+rsp]
2742	mov	r10,QWORD[rsp]
2743	cmp	rsp,rbp
2744	ja	NEAR $L$pwrx_page_walk
2745$L$pwrx_page_walk_done:
2746
2747	mov	r10,r9	; r10 = negated scaled length
2748	neg	r9	; r9 = positive scaled length
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761	pxor	xmm0,xmm0
2762DB	102,72,15,110,207	; encodes: movq xmm1,rdi
2763DB	102,72,15,110,209	; encodes: movq xmm2,rcx
2764DB	102,73,15,110,218	; encodes: movq xmm3,r10
2765DB	102,72,15,110,226	; encodes: movq xmm4,rdx
2766	mov	QWORD[32+rsp],r8
2767	mov	QWORD[40+rsp],rax
2768
2769$L$powerx5_body:
2770
2771	call	__bn_sqrx8x_internal
2772	call	__bn_postx4x_internal
2773	call	__bn_sqrx8x_internal
2774	call	__bn_postx4x_internal
2775	call	__bn_sqrx8x_internal
2776	call	__bn_postx4x_internal
2777	call	__bn_sqrx8x_internal
2778	call	__bn_postx4x_internal
2779	call	__bn_sqrx8x_internal
2780	call	__bn_postx4x_internal
2781
2782	mov	r9,r10
2783	mov	rdi,rsi
2784DB	102,72,15,126,209	; encodes: movq rcx,xmm2
2785DB	102,72,15,126,226	; encodes: movq rdx,xmm4
2786	mov	rax,QWORD[40+rsp]	; entry rsp: mulx4x_internal reads [56+rax]
2787
2788	call	mulx4x_internal
2789
2790	mov	rsi,QWORD[40+rsp]	; rsi = entry stack pointer saved above
2791
2792	mov	rax,1	; return value
2793
2794	mov	r15,QWORD[((-48))+rsi]
2795
2796	mov	r14,QWORD[((-40))+rsi]
2797
2798	mov	r13,QWORD[((-32))+rsi]
2799
2800	mov	r12,QWORD[((-24))+rsi]
2801
2802	mov	rbp,QWORD[((-16))+rsi]
2803
2804	mov	rbx,QWORD[((-8))+rsi]
2805
2806	lea	rsp,[rsi]
2807
2808$L$powerx5_epilogue:
2809	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
2810	mov	rsi,QWORD[16+rsp]
2811	ret
2812
2813$L$SEH_end_bn_powerx5:
2814
; bn_sqrx8x_internal / __bn_sqrx8x_internal -- MULX/ADCX/ADOX worker that
; builds a double-length product in the stack area starting at rsp+56 and
; then falls through into __bn_sqrx8x_reduction.
; NOTE(review): the structure (cross products, then doubling while adding
; the `mulx rdx,rdx` squares) indicates a squaring of the vector at rsi --
; confirm against the generating Perl script.
; Reads on entry:
;   rsi   pointer to the input words
;   r9    byte length; the zeroing loop consumes it in steps of 64
;   xmm0  stored by the zeroing loop (bn_powerx5 clears it before calling)
;   xmm2, xmm3  moved into rbp and rcx near the end of this block
; Stack slots: [8+rsp] = r9, [16+rsp] = rsi + r9 (end of input),
;   [24+rsp] = saved carry (0/-1), [32+rsp] = saved output position.
2815global	bn_sqrx8x_internal
2816
2817
2818ALIGN	32
2819bn_sqrx8x_internal:
2820__bn_sqrx8x_internal:
2821
2822_CET_ENDBR
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863	lea	rdi,[((48+8))+rsp]
2864	lea	rbp,[r9*1+rsi]
2865	mov	QWORD[((0+8))+rsp],r9
2866	mov	QWORD[((8+8))+rsp],rbp
2867	jmp	NEAR $L$sqr8x_zero_start	; the first pass skips the first 64 bytes
2868
2869ALIGN	32
2870	DB	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00	; 12-byte NOP used as padding
; Store xmm0 over the work area: 128 bytes per iteration, r9 -= 64.
2871$L$sqrx8x_zero:
2872	DB	0x3e
2873	movdqa	XMMWORD[rdi],xmm0
2874	movdqa	XMMWORD[16+rdi],xmm0
2875	movdqa	XMMWORD[32+rdi],xmm0
2876	movdqa	XMMWORD[48+rdi],xmm0
2877$L$sqr8x_zero_start:
2878	movdqa	XMMWORD[64+rdi],xmm0
2879	movdqa	XMMWORD[80+rdi],xmm0
2880	movdqa	XMMWORD[96+rdi],xmm0
2881	movdqa	XMMWORD[112+rdi],xmm0
2882	lea	rdi,[128+rdi]
2883	sub	r9,64
2884	jnz	NEAR $L$sqrx8x_zero
2885
2886	mov	rdx,QWORD[rsi]
2887
2888	xor	r10,r10
2889	xor	r11,r11
2890	xor	r12,r12
2891	xor	r13,r13
2892	xor	r14,r14
2893	xor	r15,r15
2894	lea	rdi,[((48+8))+rsp]
2895	xor	rbp,rbp	; rbp = 0; also clears CF and OF for the chains below
2896	jmp	NEAR $L$sqrx8x_outer_loop
2897
2898ALIGN	32
; Products among the eight input words at rsi: each stage loads one word
; into rdx and multiplies it by the words after it; the last stage below
; multiplies the final word by itself. adcx chains through CF and adox
; through OF.
2899$L$sqrx8x_outer_loop:
2900	mulx	rax,r8,QWORD[8+rsi]
2901	adcx	r8,r9
2902	adox	r10,rax
2903	mulx	rax,r9,QWORD[16+rsi]
2904	adcx	r9,r10
2905	adox	r11,rax
2906	DB	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	; encodes: mulx rax,r10,QWORD[24+rsi]
2907	adcx	r10,r11
2908	adox	r12,rax
2909	DB	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	; encodes: mulx rax,r11,QWORD[32+rsi]
2910	adcx	r11,r12
2911	adox	r13,rax
2912	mulx	rax,r12,QWORD[40+rsi]
2913	adcx	r12,r13
2914	adox	r14,rax
2915	mulx	rax,r13,QWORD[48+rsi]
2916	adcx	r13,r14
2917	adox	rax,r15
2918	mulx	r15,r14,QWORD[56+rsi]
2919	mov	rdx,QWORD[8+rsi]
2920	adcx	r14,rax
2921	adox	r15,rbp
2922	adc	r15,QWORD[64+rdi]
2923	mov	QWORD[8+rdi],r8
2924	mov	QWORD[16+rdi],r9
2925	sbb	rcx,rcx	; rcx = 0 or -1: carry kept for the `neg rcx` further down
2926	xor	rbp,rbp
2927
2928
2929	mulx	rbx,r8,QWORD[16+rsi]
2930	mulx	rax,r9,QWORD[24+rsi]
2931	adcx	r8,r10
2932	adox	r9,rbx
2933	mulx	rbx,r10,QWORD[32+rsi]
2934	adcx	r9,r11
2935	adox	r10,rax
2936	DB	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	; encodes: mulx rax,r11,QWORD[40+rsi]
2937	adcx	r10,r12
2938	adox	r11,rbx
2939	DB	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	; encodes: mulx rbx,r12,QWORD[48+rsi]
2940	adcx	r11,r13
2941	adox	r12,r14
2942	DB	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	; encodes: mulx r14,r13,QWORD[56+rsi]
2943	mov	rdx,QWORD[16+rsi]
2944	adcx	r12,rax
2945	adox	r13,rbx
2946	adcx	r13,r15
2947	adox	r14,rbp
2948	adcx	r14,rbp
2949
2950	mov	QWORD[24+rdi],r8
2951	mov	QWORD[32+rdi],r9
2952
2953	mulx	rbx,r8,QWORD[24+rsi]
2954	mulx	rax,r9,QWORD[32+rsi]
2955	adcx	r8,r10
2956	adox	r9,rbx
2957	mulx	rbx,r10,QWORD[40+rsi]
2958	adcx	r9,r11
2959	adox	r10,rax
2960	DB	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	; encodes: mulx rax,r11,QWORD[48+rsi]
2961	adcx	r10,r12
2962	adox	r11,r13
2963	DB	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	; encodes: mulx r13,r12,QWORD[56+rsi]
2964	DB	0x3e
2965	mov	rdx,QWORD[24+rsi]
2966	adcx	r11,rbx
2967	adox	r12,rax
2968	adcx	r12,r14
2969	mov	QWORD[40+rdi],r8
2970	mov	QWORD[48+rdi],r9
2971	mulx	rax,r8,QWORD[32+rsi]
2972	adox	r13,rbp
2973	adcx	r13,rbp
2974
2975	mulx	rbx,r9,QWORD[40+rsi]
2976	adcx	r8,r10
2977	adox	r9,rax
2978	mulx	rax,r10,QWORD[48+rsi]
2979	adcx	r9,r11
2980	adox	r10,r12
2981	mulx	r12,r11,QWORD[56+rsi]
2982	mov	rdx,QWORD[32+rsi]
2983	mov	r14,QWORD[40+rsi]
2984	adcx	r10,rbx
2985	adox	r11,rax
2986	mov	r15,QWORD[48+rsi]
2987	adcx	r11,r13
2988	adox	r12,rbp
2989	adcx	r12,rbp
2990
2991	mov	QWORD[56+rdi],r8
2992	mov	QWORD[64+rdi],r9
2993
2994	mulx	rax,r9,r14
2995	mov	r8,QWORD[56+rsi]
2996	adcx	r9,r10
2997	mulx	rbx,r10,r15
2998	adox	r10,rax
2999	adcx	r10,r11
3000	mulx	rax,r11,r8
3001	mov	rdx,r14
3002	adox	r11,rbx
3003	adcx	r11,r12
3004
3005	adcx	rax,rbp
3006
3007	mulx	rbx,r14,r15
3008	mulx	r13,r12,r8
3009	mov	rdx,r15
3010	lea	rsi,[64+rsi]
3011	adcx	r11,r14
3012	adox	r12,rbx
3013	adcx	r12,rax
3014	adox	r13,rbp
3015
3016	DB	0x67,0x67
3017	mulx	r14,r8,r8
3018	adcx	r13,r8
3019	adcx	r14,rbp
3020
3021	cmp	rsi,QWORD[((8+8))+rsp]
3022	je	NEAR $L$sqrx8x_outer_break	; no input words left after this group
3023
3024	neg	rcx	; restore CF from the 0/-1 saved by `sbb rcx,rcx`
3025	mov	rcx,-8	; mov leaves the flags intact; rcx becomes the loop index
3026	mov	r15,rbp
3027	mov	r8,QWORD[64+rdi]
3028	adcx	r9,QWORD[72+rdi]
3029	adcx	r10,QWORD[80+rdi]
3030	adcx	r11,QWORD[88+rdi]
3031	adc	r12,QWORD[96+rdi]
3032	adc	r13,QWORD[104+rdi]
3033	adc	r14,QWORD[112+rdi]
3034	adc	r15,QWORD[120+rdi]
3035	lea	rbp,[rsi]
3036	lea	rdi,[128+rdi]
3037	sbb	rax,rax
3038
3039	mov	rdx,QWORD[((-64))+rsi]
3040	mov	QWORD[((16+8))+rsp],rax	; saved carry (0/-1)
3041	mov	QWORD[((24+8))+rsp],rdi	; saved output position
3042
3043
3044	xor	eax,eax
3045	jmp	NEAR $L$sqrx8x_loop
3046
3047ALIGN	32
; Multiply the current group's word (in rdx) by eight words at rbp,
; accumulating into r8..r15; rcx runs from -8 up to 0.
3048$L$sqrx8x_loop:
3049	mov	rbx,r8
3050	mulx	r8,rax,QWORD[rbp]
3051	adcx	rbx,rax
3052	adox	r8,r9
3053
3054	mulx	r9,rax,QWORD[8+rbp]
3055	adcx	r8,rax
3056	adox	r9,r10
3057
3058	mulx	r10,rax,QWORD[16+rbp]
3059	adcx	r9,rax
3060	adox	r10,r11
3061
3062	mulx	r11,rax,QWORD[24+rbp]
3063	adcx	r10,rax
3064	adox	r11,r12
3065
3066	DB	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	; encodes: mulx r12,rax,QWORD[32+rbp]
3067	adcx	r11,rax
3068	adox	r12,r13
3069
3070	mulx	r13,rax,QWORD[40+rbp]
3071	adcx	r12,rax
3072	adox	r13,r14
3073
3074	mulx	r14,rax,QWORD[48+rbp]
3075	mov	QWORD[rcx*8+rdi],rbx
3076	mov	ebx,0	; mov rather than xor: CF and OF are live here
3077	adcx	r13,rax
3078	adox	r14,r15
3079
3080	DB	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	; encodes: mulx r15,rax,QWORD[56+rbp]
3081	mov	rdx,QWORD[8+rcx*8+rsi]	; next multiplier word
3082	adcx	r14,rax
3083	adox	r15,rbx
3084	adcx	r15,rbx
3085
3086	DB	0x67
3087	inc	rcx
3088	jnz	NEAR $L$sqrx8x_loop
3089
3090	lea	rbp,[64+rbp]
3091	mov	rcx,-8
3092	cmp	rbp,QWORD[((8+8))+rsp]
3093	je	NEAR $L$sqrx8x_break
3094
3095	sub	rbx,QWORD[((16+8))+rsp]	; rbx is 0, so this turns the saved 0/-1 into CF
3096	DB	0x66
3097	mov	rdx,QWORD[((-64))+rsi]
3098	adcx	r8,QWORD[rdi]
3099	adcx	r9,QWORD[8+rdi]
3100	adc	r10,QWORD[16+rdi]
3101	adc	r11,QWORD[24+rdi]
3102	adc	r12,QWORD[32+rdi]
3103	adc	r13,QWORD[40+rdi]
3104	adc	r14,QWORD[48+rdi]
3105	adc	r15,QWORD[56+rdi]
3106	lea	rdi,[64+rdi]
3107	DB	0x67
3108	sbb	rax,rax
3109	xor	ebx,ebx
3110	mov	QWORD[((16+8))+rsp],rax
3111	jmp	NEAR $L$sqrx8x_loop
3112
3113ALIGN	32
3114$L$sqrx8x_break:
3115	xor	rbp,rbp
3116	sub	rbx,QWORD[((16+8))+rsp]	; recover the saved carry into CF
3117	adcx	r8,rbp
3118	mov	rcx,QWORD[((24+8))+rsp]	; output position saved before the loop
3119	adcx	r9,rbp
3120	mov	rdx,QWORD[rsi]
3121	adc	r10,0
3122	mov	QWORD[rdi],r8
3123	adc	r11,0
3124	adc	r12,0
3125	adc	r13,0
3126	adc	r14,0
3127	adc	r15,0
3128	cmp	rdi,rcx
3129	je	NEAR $L$sqrx8x_outer_loop
3130
; Store the current accumulators and reload r9..r15 from the saved position.
3131	mov	QWORD[8+rdi],r9
3132	mov	r9,QWORD[8+rcx]
3133	mov	QWORD[16+rdi],r10
3134	mov	r10,QWORD[16+rcx]
3135	mov	QWORD[24+rdi],r11
3136	mov	r11,QWORD[24+rcx]
3137	mov	QWORD[32+rdi],r12
3138	mov	r12,QWORD[32+rcx]
3139	mov	QWORD[40+rdi],r13
3140	mov	r13,QWORD[40+rcx]
3141	mov	QWORD[48+rdi],r14
3142	mov	r14,QWORD[48+rcx]
3143	mov	QWORD[56+rdi],r15
3144	mov	r15,QWORD[56+rcx]
3145	mov	rdi,rcx
3146	jmp	NEAR $L$sqrx8x_outer_loop
3147
3148ALIGN	32
3149$L$sqrx8x_outer_break:
3150	mov	QWORD[72+rdi],r9
3151DB	102,72,15,126,217	; encodes: movq rcx,xmm3
3152	mov	QWORD[80+rdi],r10
3153	mov	QWORD[88+rdi],r11
3154	mov	QWORD[96+rdi],r12
3155	mov	QWORD[104+rdi],r13
3156	mov	QWORD[112+rdi],r14
3157	lea	rdi,[((48+8))+rsp]
3158	mov	rdx,QWORD[rcx*1+rsi]
3159
3160	mov	r11,QWORD[8+rdi]
3161	xor	r10,r10	; r10 = 0; clears CF and OF
3162	mov	r9,QWORD[((0+8))+rsp]
3163	adox	r11,r11
3164	mov	r12,QWORD[16+rdi]
3165	mov	r13,QWORD[24+rdi]
3166
3167
3168ALIGN	32
; Double the stored words (adox x,x on the OF chain) while adding the
; squares produced by `mulx rbx,rax,rdx` (adcx on the CF chain). rcx counts
; up to zero; jrcxz tests it without touching the flags.
3169$L$sqrx4x_shift_n_add:
3170	mulx	rbx,rax,rdx
3171	adox	r12,r12
3172	adcx	rax,r10
3173	DB	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	; encodes: mov rdx,QWORD[8+rcx*1+rsi]
3174	DB	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	; encodes: mov r10,QWORD[32+rdi]
3175	adox	r13,r13
3176	adcx	rbx,r11
3177	mov	r11,QWORD[40+rdi]
3178	mov	QWORD[rdi],rax
3179	mov	QWORD[8+rdi],rbx
3180
3181	mulx	rbx,rax,rdx
3182	adox	r10,r10
3183	adcx	rax,r12
3184	mov	rdx,QWORD[16+rcx*1+rsi]
3185	mov	r12,QWORD[48+rdi]
3186	adox	r11,r11
3187	adcx	rbx,r13
3188	mov	r13,QWORD[56+rdi]
3189	mov	QWORD[16+rdi],rax
3190	mov	QWORD[24+rdi],rbx
3191
3192	mulx	rbx,rax,rdx
3193	adox	r12,r12
3194	adcx	rax,r10
3195	mov	rdx,QWORD[24+rcx*1+rsi]
3196	lea	rcx,[32+rcx]
3197	mov	r10,QWORD[64+rdi]
3198	adox	r13,r13
3199	adcx	rbx,r11
3200	mov	r11,QWORD[72+rdi]
3201	mov	QWORD[32+rdi],rax
3202	mov	QWORD[40+rdi],rbx
3203
3204	mulx	rbx,rax,rdx
3205	adox	r10,r10
3206	adcx	rax,r12
3207	jrcxz	$L$sqrx4x_shift_n_add_break
3208	DB	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	; encodes: mov rdx,QWORD[0+rcx*1+rsi]
3209	adox	r11,r11
3210	adcx	rbx,r13
3211	mov	r12,QWORD[80+rdi]
3212	mov	r13,QWORD[88+rdi]
3213	mov	QWORD[48+rdi],rax
3214	mov	QWORD[56+rdi],rbx
3215	lea	rdi,[64+rdi]
3216	nop
3217	jmp	NEAR $L$sqrx4x_shift_n_add
3218
3219ALIGN	32
3220$L$sqrx4x_shift_n_add_break:
3221	adcx	rbx,r13
3222	mov	QWORD[48+rdi],rax
3223	mov	QWORD[56+rdi],rbx
3224	lea	rdi,[64+rdi]
3225DB	102,72,15,126,213	; encodes: movq rbp,xmm2
; __bn_sqrx8x_reduction -- MULX/ADCX/ADOX reduction pass over the vector at
; rsp+56, eight words per pass; entered by fall-through from the code above
; or through its label, and ends with `ret`.
; NOTE(review): looks like the Montgomery reduction step, with rbp pointing
; at the modulus and [40+rsp] holding n0 -- confirm against the generating
; Perl script.
; Reads on entry:
;   rbp      pointer to the words multiplied by rdx below
;   r9       byte length (end pointer = rbp + r9 - 64)
;   rdi      end of the vector, saved at [16+rsp]
;   [40+rsp] constant used to derive each multiplier
;   xmm2, xmm3  moved back into rbp and rcx at the end of every pass
; Stack slots written: [8+rsp] end pointer for rbp, [16+rsp] end pointer for
;   the output, [24+rsp] saved carry (0/-1), [32+rsp] carry word from the
;   previous pass, [56+rsp]..[112+rsp] the eight multipliers of this pass.
3226__bn_sqrx8x_reduction:
3227	xor	eax,eax
3228	mov	rbx,QWORD[((32+8))+rsp]
3229	mov	rdx,QWORD[((48+8))+rsp]
3230	lea	rcx,[((-64))+r9*1+rbp]
3231
3232	mov	QWORD[((0+8))+rsp],rcx
3233	mov	QWORD[((8+8))+rsp],rdi
3234
3235	lea	rdi,[((48+8))+rsp]
3236	jmp	NEAR $L$sqrx8x_reduction_loop
3237
3238ALIGN	32
3239$L$sqrx8x_reduction_loop:
3240	mov	r9,QWORD[8+rdi]
3241	mov	r10,QWORD[16+rdi]
3242	mov	r11,QWORD[24+rdi]
3243	mov	r12,QWORD[32+rdi]
3244	mov	r8,rdx
3245	imul	rdx,rbx	; first multiplier = low word * [40+rsp]
3246	mov	r13,QWORD[40+rdi]
3247	mov	r14,QWORD[48+rdi]
3248	mov	r15,QWORD[56+rdi]
3249	mov	QWORD[((24+8))+rsp],rax	; carry word from the previous pass
3250
3251	lea	rdi,[64+rdi]
3252	xor	rsi,rsi	; rsi = 0; clears CF and OF
3253	mov	rcx,-8
3254	jmp	NEAR $L$sqrx8x_reduce
3255
3256ALIGN	32
3257$L$sqrx8x_reduce:
3258	mov	rbx,r8
3259	mulx	r8,rax,QWORD[rbp]
3260	adcx	rax,rbx	; low word of the sum is discarded; only its carry is kept
3261	adox	r8,r9
3262
3263	mulx	r9,rbx,QWORD[8+rbp]
3264	adcx	r8,rbx
3265	adox	r9,r10
3266
3267	mulx	r10,rbx,QWORD[16+rbp]
3268	adcx	r9,rbx
3269	adox	r10,r11
3270
3271	mulx	r11,rbx,QWORD[24+rbp]
3272	adcx	r10,rbx
3273	adox	r11,r12
3274
3275	DB	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	; encodes: mulx r12,rbx,QWORD[32+rbp]
3276	mov	rax,rdx
3277	mov	rdx,r8
3278	adcx	r11,rbx
3279	adox	r12,r13
3280
3281	mulx	rdx,rbx,QWORD[((32+8))+rsp]	; rbx = next multiplier (low half of r8 * [40+rsp])
3282	mov	rdx,rax	; back to the current multiplier
3283	mov	QWORD[((64+48+8))+rcx*8+rsp],rax	; record it for the tail loop
3284
3285	mulx	r13,rax,QWORD[40+rbp]
3286	adcx	r12,rax
3287	adox	r13,r14
3288
3289	mulx	r14,rax,QWORD[48+rbp]
3290	adcx	r13,rax
3291	adox	r14,r15
3292
3293	mulx	r15,rax,QWORD[56+rbp]
3294	mov	rdx,rbx
3295	adcx	r14,rax
3296	adox	r15,rsi
3297	adcx	r15,rsi
3298
3299	DB	0x67,0x67,0x67
3300	inc	rcx
3301	jnz	NEAR $L$sqrx8x_reduce
3302
3303	mov	rax,rsi
3304	cmp	rbp,QWORD[((0+8))+rsp]
3305	jae	NEAR $L$sqrx8x_no_tail	; no further words at rbp: skip the tail loop
3306
3307	mov	rdx,QWORD[((48+8))+rsp]	; first recorded multiplier
3308	add	r8,QWORD[rdi]
3309	lea	rbp,[64+rbp]
3310	mov	rcx,-8
3311	adcx	r9,QWORD[8+rdi]
3312	adcx	r10,QWORD[16+rdi]
3313	adc	r11,QWORD[24+rdi]
3314	adc	r12,QWORD[32+rdi]
3315	adc	r13,QWORD[40+rdi]
3316	adc	r14,QWORD[48+rdi]
3317	adc	r15,QWORD[56+rdi]
3318	lea	rdi,[64+rdi]
3319	sbb	rax,rax
3320
3321	xor	rsi,rsi
3322	mov	QWORD[((16+8))+rsp],rax	; saved carry (0/-1)
3323	jmp	NEAR $L$sqrx8x_tail
3324
3325ALIGN	32
3326$L$sqrx8x_tail:
3327	mov	rbx,r8
3328	mulx	r8,rax,QWORD[rbp]
3329	adcx	rbx,rax
3330	adox	r8,r9
3331
3332	mulx	r9,rax,QWORD[8+rbp]
3333	adcx	r8,rax
3334	adox	r9,r10
3335
3336	mulx	r10,rax,QWORD[16+rbp]
3337	adcx	r9,rax
3338	adox	r10,r11
3339
3340	mulx	r11,rax,QWORD[24+rbp]
3341	adcx	r10,rax
3342	adox	r11,r12
3343
3344	DB	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	; encodes: mulx r12,rax,QWORD[32+rbp]
3345	adcx	r11,rax
3346	adox	r12,r13
3347
3348	mulx	r13,rax,QWORD[40+rbp]
3349	adcx	r12,rax
3350	adox	r13,r14
3351
3352	mulx	r14,rax,QWORD[48+rbp]
3353	adcx	r13,rax
3354	adox	r14,r15
3355
3356	mulx	r15,rax,QWORD[56+rbp]
3357	mov	rdx,QWORD[((72+48+8))+rcx*8+rsp]	; next recorded multiplier
3358	adcx	r14,rax
3359	adox	r15,rsi
3360	mov	QWORD[rcx*8+rdi],rbx
3361	mov	rbx,r8
3362	adcx	r15,rsi
3363
3364	inc	rcx
3365	jnz	NEAR $L$sqrx8x_tail
3366
3367	cmp	rbp,QWORD[((0+8))+rsp]
3368	jae	NEAR $L$sqrx8x_tail_done
3369
3370	sub	rsi,QWORD[((16+8))+rsp]	; rsi is 0, so this turns the saved 0/-1 into CF
3371	mov	rdx,QWORD[((48+8))+rsp]
3372	lea	rbp,[64+rbp]
3373	adc	r8,QWORD[rdi]
3374	adc	r9,QWORD[8+rdi]
3375	adc	r10,QWORD[16+rdi]
3376	adc	r11,QWORD[24+rdi]
3377	adc	r12,QWORD[32+rdi]
3378	adc	r13,QWORD[40+rdi]
3379	adc	r14,QWORD[48+rdi]
3380	adc	r15,QWORD[56+rdi]
3381	lea	rdi,[64+rdi]
3382	sbb	rax,rax
3383	sub	rcx,8
3384
3385	xor	rsi,rsi
3386	mov	QWORD[((16+8))+rsp],rax
3387	jmp	NEAR $L$sqrx8x_tail
3388
3389ALIGN	32
3390$L$sqrx8x_tail_done:
3391	xor	rax,rax
3392	add	r8,QWORD[((24+8))+rsp]	; add the carry word saved at the top of the pass
3393	adc	r9,0
3394	adc	r10,0
3395	adc	r11,0
3396	adc	r12,0
3397	adc	r13,0
3398	adc	r14,0
3399	adc	r15,0
3400	adc	rax,0
3401
3402	sub	rsi,QWORD[((16+8))+rsp]	; recover the saved carry into CF
3403$L$sqrx8x_no_tail:
3404	adc	r8,QWORD[rdi]
3405DB	102,72,15,126,217	; encodes: movq rcx,xmm3
3406	adc	r9,QWORD[8+rdi]
3407	mov	rsi,QWORD[56+rbp]
3408DB	102,72,15,126,213	; encodes: movq rbp,xmm2
3409	adc	r10,QWORD[16+rdi]
3410	adc	r11,QWORD[24+rdi]
3411	adc	r12,QWORD[32+rdi]
3412	adc	r13,QWORD[40+rdi]
3413	adc	r14,QWORD[48+rdi]
3414	adc	r15,QWORD[56+rdi]
3415	adc	rax,0	; rax = carry word handed to the next pass
3416
3417	mov	rbx,QWORD[((32+8))+rsp]
3418	mov	rdx,QWORD[64+rcx*1+rdi]	; low word for the next pass
3419
3420	mov	QWORD[rdi],r8
3421	lea	r8,[64+rdi]
3422	mov	QWORD[8+rdi],r9
3423	mov	QWORD[16+rdi],r10
3424	mov	QWORD[24+rdi],r11
3425	mov	QWORD[32+rdi],r12
3426	mov	QWORD[40+rdi],r13
3427	mov	QWORD[48+rdi],r14
3428	mov	QWORD[56+rdi],r15
3429
3430	lea	rdi,[64+rcx*1+rdi]
3431	cmp	r8,QWORD[((8+8))+rsp]
3432	jb	NEAR $L$sqrx8x_reduction_loop	; repeat until the output position reaches [16+rsp]
3433	ret
3434
3435
; ---------------------------------------------------------------------------
; __bn_postx4x_internal -- file-internal helper (no `global` declaration).
; It takes its inputs in fixed registers rather than via the Win64 argument
; convention. As read by the code below:
;   rbp   -> qwords that are complemented and masked (consumed 32 bytes/iter)
;   rdi   -> qwords that the masked values are added to (32 bytes/iter)
;   rax   -> negated on entry, then used as the AND mask for every word
;   rcx   -> count; shifted right arithmetically by 5 and then incremented
;            until it reaches zero, so it is expected to be negative on entry
;            (presumably -(byte length) -- TODO confirm against callers)
;   xmm1  -> low qword is the output pointer (copied into rdx and rsi)
; Each output qword is  [rdi] + (~[rbp] & mask) + carry.  The very first
; [rbp] word is decremented before being complemented, i.e. ~(w-1) = -w, which
; supplies the "+1" of a two's-complement negation; with an all-ones mask the
; chain therefore looks like a multi-word subtraction [rdi] - [rbp]
; (NOTE(review): relies on the first word being non-zero -- confirm).
; On return: r10 = rcx as passed in, r9 = -(rcx as passed in); rdx, rdi and
; rbp have been advanced; rax, rcx, rsi, r8, r12-r15 are overwritten.
; ---------------------------------------------------------------------------
3436ALIGN	32
3437
3438__bn_postx4x_internal:
3439
3440	mov	r12,QWORD[rbp]	; first word; adjusted by the `dec` below
3441	mov	r10,rcx	; keep the incoming count in r10
3442	mov	r9,rcx	; second copy, negated just before `ret`
3443	neg	rax	; 0 stays 0; 1 becomes all-ones
3444	sar	rcx,3+2	; count / 32: one loop iteration per 4 qwords
3445
3446DB	102,72,15,126,202	; 66 48 0F 7E CA = movq rdx,xmm1
3447DB	102,72,15,126,206	; 66 48 0F 7E CE = movq rsi,xmm1
3448	dec	r12	; ~(w-1) == -w once `andn` complements it
3449	mov	r13,QWORD[8+rbp]
3450	xor	r8,r8	; r8 carries CF between iterations; start with none
3451	mov	r14,QWORD[16+rbp]
3452	mov	r15,QWORD[24+rbp]
3453	jmp	NEAR $L$sqrx4x_sub_entry	; first iteration already has its four words loaded
3454
3455ALIGN	16
3456$L$sqrx4x_sub:
3457	mov	r12,QWORD[rbp]
3458	mov	r13,QWORD[8+rbp]
3459	mov	r14,QWORD[16+rbp]
3460	mov	r15,QWORD[24+rbp]
3461$L$sqrx4x_sub_entry:
3462	andn	r12,r12,rax	; r12 = ~r12 & rax
3463	lea	rbp,[32+rbp]	; lea advances the pointer without touching flags
3464	andn	r13,r13,rax
3465	andn	r14,r14,rax
3466	andn	r15,r15,rax
3467
3468	neg	r8	; CF = (r8 != 0): reload the carry saved by `sbb` below
3469	adc	r12,QWORD[rdi]
3470	adc	r13,QWORD[8+rdi]
3471	adc	r14,QWORD[16+rdi]
3472	adc	r15,QWORD[24+rdi]
3473	mov	QWORD[rdx],r12
3474	lea	rdi,[32+rdi]
3475	mov	QWORD[8+rdx],r13
3476	sbb	r8,r8	; r8 = CF ? -1 : 0 (save carry; `inc` below does not alter CF but other flags change)
3477	mov	QWORD[16+rdx],r14
3478	mov	QWORD[24+rdx],r15
3479	lea	rdx,[32+rdx]
3480
3481	inc	rcx	; counts up towards zero
3482	jnz	NEAR $L$sqrx4x_sub
3483
3484	neg	r9	; r9 = -(count passed in rcx)
3485
3486	ret
3487
3488
; ---------------------------------------------------------------------------
; bn_scatter5 -- exported. Uses the Win64 argument registers directly:
;   rcx  = source pointer (read one qword at a time, advanced by 8)
;   edx  = number of qwords to copy (only the low 32 bits are examined)
;   r8   = destination table base
;   r9   = slot index; the first store goes to r8 + r9*8
; Copies edx qwords from [rcx] to the table, placing consecutive source words
; 256 bytes apart. Returns immediately when edx is zero.
; Only volatile registers (rax, rcx, rdx, r8) are modified and the stack is
; not used, which is consistent with there being no SEH begin/end labels here.
; ---------------------------------------------------------------------------
3489global	bn_scatter5
3490
3491ALIGN	16
3492bn_scatter5:
3493
3494_CET_ENDBR
3495	cmp	edx,0	; nothing to copy?
3496	jz	NEAR $L$scatter_epilogue
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506	lea	r8,[r9*8+r8]	; r8 = &table[index]
3507$L$scatter:
3508	mov	rax,QWORD[rcx]
3509	lea	rcx,[8+rcx]	; next source word
3510	mov	QWORD[r8],rax
3511	lea	r8,[256+r8]	; same slot, 256 bytes further on
3512	sub	edx,1
3513	jnz	NEAR $L$scatter
3514$L$scatter_epilogue:
3515	ret
3516
3517
3518
; ---------------------------------------------------------------------------
; bn_gather5 -- exported. Uses the Win64 argument registers directly:
;   rcx  = output pointer (one qword written per iteration, advanced by 8)
;   edx  = number of qwords to produce; the loop is bottom-tested, so the
;          code expects edx != 0
;   r8   = table base; rows are 256 bytes (32 qword slots) and are read with
;          movdqa, so the table must be 16-byte aligned
;   r9d  = index of the slot to extract
; The function first builds sixteen 16-byte compare masks on the stack
; (one per pair of slots; all-ones where the slot number equals r9d), then for
; every row loads all 256 bytes, ANDs them with the masks and ORs the results
; together. Every slot of a row is read regardless of the index, so the
; addresses touched do not depend on r9d.
; Stack: 0x108 bytes are reserved and rsp is realigned to 16; the entry rsp is
; kept in r10 and restored before `ret`. Only xmm0-xmm5 are used.
; The two prologue instructions are emitted as raw bytes; their sizes
; (4 and 7 bytes) are what the offsets in $L$SEH_info_bn_gather5 refer to.
; ---------------------------------------------------------------------------
3519global	bn_gather5
3520
3521ALIGN	32
3522bn_gather5:
3523
3524$L$SEH_begin_bn_gather5:
3525_CET_ENDBR
3526
3527	DB	0x4c,0x8d,0x14,0x24	; lea r10,[rsp]  -- remember the entry rsp
3528
3529	DB	0x48,0x81,0xec,0x08,0x01,0x00,0x00	; sub rsp,0x108
3530	lea	rax,[$L$inc]
3531	and	rsp,-16	; movdqa stores below need 16-byte alignment
3532
3533	movd	xmm5,r9d	; index
3534	movdqa	xmm0,XMMWORD[rax]	; {0,0,1,1}
3535	movdqa	xmm1,XMMWORD[16+rax]	; {2,2,2,2}
3536	lea	r11,[128+r8]	; bias both pointers by 128 so all 256 bytes of a row
3537	lea	rax,[128+rsp]	; and of the mask area are reachable as -128..+112
3538
3539	pshufd	xmm5,xmm5,0	; broadcast index to all four dwords
3540	movdqa	xmm4,xmm1	; xmm4 = constant step {2,2,2,2}
3541	movdqa	xmm2,xmm1
3542	paddd	xmm1,xmm0	; {2,2,3,3}
3543	pcmpeqd	xmm0,xmm5	; mask for slots 0/1
3544	movdqa	xmm3,xmm4
3545
3546	paddd	xmm2,xmm1	; {4,4,5,5}
3547	pcmpeqd	xmm1,xmm5	; mask for slots 2/3
3548	movdqa	XMMWORD[(-128)+rax],xmm0
3549	movdqa	xmm0,xmm4
3550
3551	paddd	xmm3,xmm2	; {6,6,7,7}
3552	pcmpeqd	xmm2,xmm5	; mask for slots 4/5
3553	movdqa	XMMWORD[(-112)+rax],xmm1
3554	movdqa	xmm1,xmm4
3555
3556	paddd	xmm0,xmm3	; {8,8,9,9} -- the same four-register rotation repeats
3557	pcmpeqd	xmm3,xmm5	; mask for slots 6/7
3558	movdqa	XMMWORD[(-96)+rax],xmm2
3559	movdqa	xmm2,xmm4
3560	paddd	xmm1,xmm0
3561	pcmpeqd	xmm0,xmm5
3562	movdqa	XMMWORD[(-80)+rax],xmm3
3563	movdqa	xmm3,xmm4
3564
3565	paddd	xmm2,xmm1
3566	pcmpeqd	xmm1,xmm5
3567	movdqa	XMMWORD[(-64)+rax],xmm0
3568	movdqa	xmm0,xmm4
3569
3570	paddd	xmm3,xmm2
3571	pcmpeqd	xmm2,xmm5
3572	movdqa	XMMWORD[(-48)+rax],xmm1
3573	movdqa	xmm1,xmm4
3574
3575	paddd	xmm0,xmm3
3576	pcmpeqd	xmm3,xmm5
3577	movdqa	XMMWORD[(-32)+rax],xmm2
3578	movdqa	xmm2,xmm4
3579	paddd	xmm1,xmm0
3580	pcmpeqd	xmm0,xmm5
3581	movdqa	XMMWORD[(-16)+rax],xmm3
3582	movdqa	xmm3,xmm4
3583
3584	paddd	xmm2,xmm1
3585	pcmpeqd	xmm1,xmm5
3586	movdqa	XMMWORD[rax],xmm0
3587	movdqa	xmm0,xmm4
3588
3589	paddd	xmm3,xmm2
3590	pcmpeqd	xmm2,xmm5
3591	movdqa	XMMWORD[16+rax],xmm1
3592	movdqa	xmm1,xmm4
3593
3594	paddd	xmm0,xmm3
3595	pcmpeqd	xmm3,xmm5
3596	movdqa	XMMWORD[32+rax],xmm2
3597	movdqa	xmm2,xmm4
3598	paddd	xmm1,xmm0
3599	pcmpeqd	xmm0,xmm5
3600	movdqa	XMMWORD[48+rax],xmm3
3601	movdqa	xmm3,xmm4
3602
3603	paddd	xmm2,xmm1
3604	pcmpeqd	xmm1,xmm5
3605	movdqa	XMMWORD[64+rax],xmm0
3606	movdqa	xmm0,xmm4
3607
3608	paddd	xmm3,xmm2
3609	pcmpeqd	xmm2,xmm5
3610	movdqa	XMMWORD[80+rax],xmm1
3611	movdqa	xmm1,xmm4
3612
3613	paddd	xmm0,xmm3	; tail of the unrolled pattern; this sum is not used afterwards
3614	pcmpeqd	xmm3,xmm5	; mask for slots 30/31
3615	movdqa	XMMWORD[96+rax],xmm2
3616	movdqa	xmm2,xmm4	; likewise overwritten in the loop before being read
3617	movdqa	XMMWORD[112+rax],xmm3	; sixteenth and last mask
3618	jmp	NEAR $L$gather
3619
3620ALIGN	32
; One iteration per output qword: r11 walks the table row by row, rax stays
; fixed on the mask area, xmm4/xmm5 are two independent OR accumulators.
3621$L$gather:
3622	pxor	xmm4,xmm4
3623	pxor	xmm5,xmm5	; the broadcast index is no longer needed; the masks hold it
3624	movdqa	xmm0,XMMWORD[((-128))+r11]
3625	movdqa	xmm1,XMMWORD[((-112))+r11]
3626	movdqa	xmm2,XMMWORD[((-96))+r11]
3627	pand	xmm0,XMMWORD[((-128))+rax]
3628	movdqa	xmm3,XMMWORD[((-80))+r11]
3629	pand	xmm1,XMMWORD[((-112))+rax]
3630	por	xmm4,xmm0
3631	pand	xmm2,XMMWORD[((-96))+rax]
3632	por	xmm5,xmm1
3633	pand	xmm3,XMMWORD[((-80))+rax]
3634	por	xmm4,xmm2
3635	por	xmm5,xmm3
3636	movdqa	xmm0,XMMWORD[((-64))+r11]
3637	movdqa	xmm1,XMMWORD[((-48))+r11]
3638	movdqa	xmm2,XMMWORD[((-32))+r11]
3639	pand	xmm0,XMMWORD[((-64))+rax]
3640	movdqa	xmm3,XMMWORD[((-16))+r11]
3641	pand	xmm1,XMMWORD[((-48))+rax]
3642	por	xmm4,xmm0
3643	pand	xmm2,XMMWORD[((-32))+rax]
3644	por	xmm5,xmm1
3645	pand	xmm3,XMMWORD[((-16))+rax]
3646	por	xmm4,xmm2
3647	por	xmm5,xmm3
3648	movdqa	xmm0,XMMWORD[r11]
3649	movdqa	xmm1,XMMWORD[16+r11]
3650	movdqa	xmm2,XMMWORD[32+r11]
3651	pand	xmm0,XMMWORD[rax]
3652	movdqa	xmm3,XMMWORD[48+r11]
3653	pand	xmm1,XMMWORD[16+rax]
3654	por	xmm4,xmm0
3655	pand	xmm2,XMMWORD[32+rax]
3656	por	xmm5,xmm1
3657	pand	xmm3,XMMWORD[48+rax]
3658	por	xmm4,xmm2
3659	por	xmm5,xmm3
3660	movdqa	xmm0,XMMWORD[64+r11]
3661	movdqa	xmm1,XMMWORD[80+r11]
3662	movdqa	xmm2,XMMWORD[96+r11]
3663	pand	xmm0,XMMWORD[64+rax]
3664	movdqa	xmm3,XMMWORD[112+r11]
3665	pand	xmm1,XMMWORD[80+rax]
3666	por	xmm4,xmm0
3667	pand	xmm2,XMMWORD[96+rax]
3668	por	xmm5,xmm1
3669	pand	xmm3,XMMWORD[112+rax]
3670	por	xmm4,xmm2
3671	por	xmm5,xmm3
3672	por	xmm4,xmm5	; merge the two accumulators
3673	lea	r11,[256+r11]	; next table row
3674
3675	pshufd	xmm0,xmm4,0x4e	; swap the two qword halves...
3676	por	xmm0,xmm4	; ...so the selected qword lands in the low half whichever slot parity matched
3677	movq	QWORD[rcx],xmm0
3678	lea	rcx,[8+rcx]
3679	sub	edx,1
3680	jnz	NEAR $L$gather
3681
3682	lea	rsp,[r10]	; restore the entry rsp saved by the prologue
3683
3684	ret
3685$L$SEH_end_bn_gather5:
3686
3687
; Read-only data. $L$inc holds the two vectors used to build the slot-number
; compare masks: the first is the starting counter, the second the per-step
; increment. The DB bytes that follow are the NUL-terminated ASCII string
; "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by
; <appro@openssl.org>".
3688section	.rdata rdata align=8
3689ALIGN	64
3690$L$inc:
3691	DD	0,0,1,1	; counters for slots 0 and 1 (each duplicated to fill a qword)
3692	DD	2,2,2,2	; step added to obtain the next pair of slot numbers
3693	DB	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
3694	DB	112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115
3695	DB	99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111
3696	DB	114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79
3697	DB	71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111
3698	DB	112,101,110,115,115,108,46,111,114,103,62,0
3699section	.text
3700
3701EXTERN	__imp_RtlVirtualUnwind
3702
; ---------------------------------------------------------------------------
; mul_handler -- Win64 language-specific exception handler referenced by the
; five UNW_FLAG_EHANDLER entries in .xdata. Arguments follow the handler
; convention: rcx = EXCEPTION_RECORD*, rdx = establisher frame,
; r8 = CONTEXT*, r9 = DISPATCHER_CONTEXT*.
; Offsets used below follow the Windows SDK layouts:
;   CONTEXT:            120 Rax, 144 Rbx, 152 Rsp, 160 Rbp, 168 Rsi, 176 Rdi,
;                       192 R9, 216..240 R12..R15, 248 Rip
;   DISPATCHER_CONTEXT: 0 ControlPc, 8 ImageBase, 16 FunctionEntry,
;                       24 EstablisherFrame, 40 ContextRecord, 56 HandlerData
; HandlerData points at the three image-relative labels stored after the
; handler RVA in .xdata. The faulting Rip is compared against them to decide
; how much of the frame exists and where the caller's rsp can be found; the
; handler then patches the saved registers into CONTEXT, copies CONTEXT to
; DISPATCHER_CONTEXT.ContextRecord, calls RtlVirtualUnwind and returns 1
; (ExceptionContinueSearch).
; ---------------------------------------------------------------------------
3703ALIGN	16
3704mul_handler:
3705	push	rsi
3706	push	rdi
3707	push	rbx
3708	push	rbp
3709	push	r12
3710	push	r13
3711	push	r14
3712	push	r15
3713	pushfq
3714	sub	rsp,64	; 32 bytes shadow space + 4 stack arguments for RtlVirtualUnwind
3715
3716	mov	rax,QWORD[120+r8]	; context->Rax
3717	mov	rbx,QWORD[248+r8]	; context->Rip
3718
3719	mov	rsi,QWORD[8+r9]	; disp->ImageBase
3720	mov	r11,QWORD[56+r9]	; disp->HandlerData
3721
3722	mov	r10d,DWORD[r11]	; HandlerData[0]
3723	lea	r10,[r10*1+rsi]
3724	cmp	rbx,r10	; Rip before the first label?
3725	jb	NEAR $L$common_seh_tail	; then only rdi/rsi are recovered, using context->Rax as the frame base
3726
3727	mov	r10d,DWORD[4+r11]	; HandlerData[1]
3728	lea	r10,[r10*1+rsi]
3729	cmp	rbx,r10	; Rip before the second label?
3730	jb	NEAR $L$common_pop_regs	; then the pushed registers are read relative to context->Rax
3731
3732	mov	rax,QWORD[152+r8]	; context->Rsp
3733
3734	mov	r10d,DWORD[8+r11]	; HandlerData[2]
3735	lea	r10,[r10*1+rsi]
3736	cmp	rbx,r10	; Rip at or past the third label?
3737	jae	NEAR $L$common_seh_tail	; then context->Rsp is used as the frame base as-is
3738
3739	lea	r10,[$L$mul_epilogue]
3740	cmp	rbx,r10	; faulting code located after $L$mul_epilogue?
3741	ja	NEAR $L$body_40	; NOTE(review): presumably selects the routines that keep the saved rsp at 40[rsp] -- confirm
3742
3743	mov	r10,QWORD[192+r8]	; context->R9
3744	mov	rax,QWORD[8+r10*8+rax]	; saved entry rsp, same slot the body writes with `mov QWORD[8+r9*8+rsp],rax`
3745
3746	jmp	NEAR $L$common_pop_regs
3747
3748$L$body_40:
3749	mov	rax,QWORD[40+rax]	; saved entry rsp kept at a fixed offset
3750$L$common_pop_regs:
3751	mov	rbx,QWORD[((-8))+rax]	; values pushed in the order rbx,rbp,r12..r15
3752	mov	rbp,QWORD[((-16))+rax]
3753	mov	r12,QWORD[((-24))+rax]
3754	mov	r13,QWORD[((-32))+rax]
3755	mov	r14,QWORD[((-40))+rax]
3756	mov	r15,QWORD[((-48))+rax]
3757	mov	QWORD[144+r8],rbx	; context->Rbx
3758	mov	QWORD[160+r8],rbp	; context->Rbp
3759	mov	QWORD[216+r8],r12	; context->R12
3760	mov	QWORD[224+r8],r13	; context->R13
3761	mov	QWORD[232+r8],r14	; context->R14
3762	mov	QWORD[240+r8],r15	; context->R15
3763
3764$L$common_seh_tail:
3765	mov	rdi,QWORD[8+rax]	; rdi/rsi were stored in the caller's home space by the WIN64 prologue
3766	mov	rsi,QWORD[16+rax]
3767	mov	QWORD[152+r8],rax	; context->Rsp
3768	mov	QWORD[168+r8],rsi	; context->Rsi
3769	mov	QWORD[176+r8],rdi	; context->Rdi
3770
3771	mov	rdi,QWORD[40+r9]	; disp->ContextRecord (copy destination)
3772	mov	rsi,r8	; patched CONTEXT (copy source)
3773	mov	ecx,154	; 154 qwords = 1232 bytes
3774	DD	0xa548f3fc	; bytes FC F3 48 A5 = cld; rep movsq
3775
3776	mov	rsi,r9	; rsi = DISPATCHER_CONTEXT*
3777	xor	rcx,rcx	; arg1: HandlerType = 0
3778	mov	rdx,QWORD[8+rsi]	; arg2: ImageBase
3779	mov	r8,QWORD[rsi]	; arg3: ControlPc
3780	mov	r9,QWORD[16+rsi]	; arg4: FunctionEntry
3781	mov	r10,QWORD[40+rsi]
3782	lea	r11,[56+rsi]
3783	lea	r12,[24+rsi]
3784	mov	QWORD[32+rsp],r10	; arg5: ContextRecord
3785	mov	QWORD[40+rsp],r11	; arg6: &disp->HandlerData
3786	mov	QWORD[48+rsp],r12	; arg7: &disp->EstablisherFrame
3787	mov	QWORD[56+rsp],rcx	; arg8: NULL
3788	call	QWORD[__imp_RtlVirtualUnwind]
3789
3790	mov	eax,1	; return value 1
3791	add	rsp,64
3792	popfq
3793	pop	r15
3794	pop	r14
3795	pop	r13
3796	pop	r12
3797	pop	rbp
3798	pop	rbx
3799	pop	rdi
3800	pop	rsi
3801	ret
3802
3803
; Function table. Each group of three image-relative DDs is one entry:
; start of the function, end of the function, and its unwind-info record in
; .xdata (the RUNTIME_FUNCTION layout of the Win64 exception-handling ABI).
3804section	.pdata rdata align=4
3805ALIGN	4
3806	DD	$L$SEH_begin_bn_mul_mont_gather5_nohw wrt ..imagebase
3807	DD	$L$SEH_end_bn_mul_mont_gather5_nohw wrt ..imagebase
3808	DD	$L$SEH_info_bn_mul_mont_gather5_nohw wrt ..imagebase
3809
3810	DD	$L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase
3811	DD	$L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase
3812	DD	$L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase
3813
3814	DD	$L$SEH_begin_bn_power5_nohw wrt ..imagebase
3815	DD	$L$SEH_end_bn_power5_nohw wrt ..imagebase
3816	DD	$L$SEH_info_bn_power5_nohw wrt ..imagebase
3817	DD	$L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase
3818	DD	$L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase
3819	DD	$L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase
3820
3821	DD	$L$SEH_begin_bn_powerx5 wrt ..imagebase
3822	DD	$L$SEH_end_bn_powerx5 wrt ..imagebase
3823	DD	$L$SEH_info_bn_powerx5 wrt ..imagebase
3824	DD	$L$SEH_begin_bn_gather5 wrt ..imagebase
3825	DD	$L$SEH_end_bn_gather5 wrt ..imagebase
3826	DD	$L$SEH_info_bn_gather5 wrt ..imagebase
3827
; Unwind-info records referenced from .pdata.
; The first five share one shape: header bytes 9,0,0,0 (version 1 with the
; exception-handler flag set, zero-length prologue, no unwind codes), the RVA
; of mul_handler, and three label RVAs that mul_handler reads as
; HandlerData[0..2] and compares against the faulting Rip.
; The bn_gather5 record has no handler and instead describes its two-
; instruction prologue with unwind codes.
3828section	.xdata rdata align=8
3829ALIGN	8
3830$L$SEH_info_bn_mul_mont_gather5_nohw:
3831	DB	9,0,0,0
3832	DD	mul_handler wrt ..imagebase
3833	DD	$L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase	; first two labels are identical here
3834ALIGN	8
3835$L$SEH_info_bn_mul4x_mont_gather5:
3836	DB	9,0,0,0
3837	DD	mul_handler wrt ..imagebase
3838	DD	$L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
3839ALIGN	8
3840$L$SEH_info_bn_power5_nohw:
3841	DB	9,0,0,0
3842	DD	mul_handler wrt ..imagebase
3843	DD	$L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase
3844ALIGN	8
3845$L$SEH_info_bn_mulx4x_mont_gather5:
3846	DB	9,0,0,0
3847	DD	mul_handler wrt ..imagebase
3848	DD	$L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase
3849ALIGN	8
3850$L$SEH_info_bn_powerx5:
3851	DB	9,0,0,0
3852	DD	mul_handler wrt ..imagebase
3853	DD	$L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase
3854ALIGN	8
3855$L$SEH_info_bn_gather5:
3856	DB	0x01,0x0b,0x03,0x0a	; version 1, no flags; prologue = 0x0b bytes; 3 code slots; frame register = 10 (r10), offset 0
3857	DB	0x0b,0x01,0x21,0x00	; at prologue offset 0x0b: large stack allocation, 0x21*8 = 0x108 bytes (the `sub rsp,0x108`)
3858	DB	0x04,0xa3,0x00,0x00	; at prologue offset 0x04: frame register established (the `lea r10,[rsp]`); last two bytes pad to an even slot count
3859ALIGN	8
3860%else
3861; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
3862ret
3863%endif
3864