1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <ring-core/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
7.text
8
9
10
// -----------------------------------------------------------------------
// _bn_mul_mont: entry point and generic (one word per step) Montgomery
// multiplication path.
//
// Register use on entry, as read by the code below:
//   rdi = destination word array
//   rsi = first operand word array
//   rdx = second operand word array
//   rcx = modulus word array
//   r8  = pointer to a single word (loaded once and used as the
//         per-pass reduction factor)
//   r9d = word count (zero-extended to 64 bits here)
// NOTE(review): this presumably matches
//   int bn_mul_mont(rp, ap, bp, np, n0, num)
// -- confirm against the C declaration.
//
// Returns 1 in rax. rbx, rbp and r12-r15 are saved and restored.
// The temporary vector tp[] (count+2 words) lives on the stack and is
// overwritten before returning.
// -----------------------------------------------------------------------
11.globl	_bn_mul_mont
12.private_extern _bn_mul_mont
13
14.p2align	4
15_bn_mul_mont:
16
17_CET_ENDBR
// Zero-extend the 32-bit count and remember the entry stack pointer in
// rax; every path stores it in its frame and restores rsp from it.
18	movl	%r9d,%r9d
19	movq	%rsp,%rax
20
// Dispatch:
//   count not a multiple of 4, or count < 8   -> generic path below
//   otherwise r11d = word at offset 8 of OPENSSL_ia32cap_P, then
//     rdx != rsi                              -> L$mul4x_enter
//     rdx == rsi and count multiple of 8      -> L$sqr8x_enter
//     rdx == rsi otherwise                    -> L$mul4x_enter
21	testl	$3,%r9d
22	jnz	L$mul_enter
23	cmpl	$8,%r9d
24	jb	L$mul_enter
25	leaq	_OPENSSL_ia32cap_P(%rip),%r11
26	movl	8(%r11),%r11d
27	cmpq	%rsi,%rdx
28	jne	L$mul4x_enter
29	testl	$7,%r9d
30	jz	L$sqr8x_enter
31	jmp	L$mul4x_enter
32
33.p2align	4
34L$mul_enter:
// Save callee-saved registers; the epilogue reloads them relative to the
// saved entry rsp (offsets -8 .. -48).
35	pushq	%rbx
36
37	pushq	%rbp
38
39	pushq	%r12
40
41	pushq	%r13
42
43	pushq	%r14
44
45	pushq	%r15
46
47
// r10 = (rsp - 8*(count+2)) rounded down to a 1024-byte boundary: the
// base of tp[]. r9 is negated only to form the address and then restored.
48	negq	%r9
49	movq	%rsp,%r11
50	leaq	-16(%rsp,%r9,8),%r10
51	negq	%r9
52	andq	$-1024,%r10
53
54
55
56
57
58
59
60
61
// Lower rsp to r10 in steps of at most 4096 bytes, reading one word at
// each step. NOTE(review): presumably this touches every page so that a
// stack guard page is hit in order -- confirm with the generator script.
62	subq	%r10,%r11
63	andq	$-4096,%r11
64	leaq	(%r10,%r11,1),%rsp
65	movq	(%rsp),%r11
66	cmpq	%r10,%rsp
67	ja	L$mul_page_walk
68	jmp	L$mul_page_walk_done
69
70.p2align	4
71L$mul_page_walk:
72	leaq	-4096(%rsp),%rsp
73	movq	(%rsp),%r11
74	cmpq	%r10,%rsp
75	ja	L$mul_page_walk
76L$mul_page_walk_done:
77
// tp[count+1] = entry rsp.
78	movq	%rax,8(%rsp,%r9,8)
79
80L$mul_body:
// r12 = second operand pointer (rdx is clobbered by mulq), r8 = word
// loaded through r8, rbx = second operand word 0, rax = first operand
// word 0. r14 = outer index i, r15 = inner index j.
81	movq	%rdx,%r12
82	movq	(%r8),%r8
83	movq	(%r12),%rbx
84	movq	(%rsi),%rax
85
86	xorq	%r14,%r14
87	xorq	%r15,%r15
88
// rbp = r8 * low64(a[0]*b[0]): the multiplier applied to the modulus in
// this pass. The low word of the sum formed at the addq below is not
// stored; only the carry out of it (rdx -> r13) is kept.
89	movq	%r8,%rbp
90	mulq	%rbx
91	movq	%rax,%r10
92	movq	(%rcx),%rax
93
94	imulq	%r10,%rbp
95	movq	%rdx,%r11
96
97	mulq	%rbp
98	addq	%rax,%r10
99	movq	8(%rsi),%rax
100	adcq	$0,%rdx
101	movq	%rdx,%r13
102
103	leaq	1(%r15),%r15
104	jmp	L$1st_enter
105
// First pass (i = 0). Each iteration adds a[j]*rbx and n[j]*rbp plus the
// running carries and stores the result word one position lower in tp[].
106.p2align	4
107L$1st:
108	addq	%rax,%r13
109	movq	(%rsi,%r15,8),%rax
110	adcq	$0,%rdx
111	addq	%r11,%r13
112	movq	%r10,%r11
113	adcq	$0,%rdx
114	movq	%r13,-16(%rsp,%r15,8)
115	movq	%rdx,%r13
116
117L$1st_enter:
118	mulq	%rbx
119	addq	%rax,%r11
120	movq	(%rcx,%r15,8),%rax
121	adcq	$0,%rdx
122	leaq	1(%r15),%r15
123	movq	%rdx,%r10
124
125	mulq	%rbp
126	cmpq	%r9,%r15
127	jne	L$1st
128
// Loop tail: finish the last column, reload a[0] for the next pass, and
// store the two top words at tp[count-1] and tp[count].
129	addq	%rax,%r13
130	movq	(%rsi),%rax
131	adcq	$0,%rdx
132	addq	%r11,%r13
133	adcq	$0,%rdx
134	movq	%r13,-16(%rsp,%r15,8)
135	movq	%rdx,%r13
136	movq	%r10,%r11
137
138	xorq	%rdx,%rdx
139	addq	%r11,%r13
140	adcq	$0,%rdx
141	movq	%r13,-8(%rsp,%r9,8)
142	movq	%rdx,(%rsp,%r9,8)
143
144	leaq	1(%r14),%r14
145	jmp	L$outer
// Outer loop, i = 1 .. count-1: same as the first pass, but the previous
// contents of tp[] are added in as well.
146.p2align	4
147L$outer:
148	movq	(%r12,%r14,8),%rbx
149	xorq	%r15,%r15
150	movq	%r8,%rbp
151	movq	(%rsp),%r10
152	mulq	%rbx
153	addq	%rax,%r10
154	movq	(%rcx),%rax
155	adcq	$0,%rdx
156
157	imulq	%r10,%rbp
158	movq	%rdx,%r11
159
160	mulq	%rbp
161	addq	%rax,%r10
162	movq	8(%rsi),%rax
163	adcq	$0,%rdx
164	movq	8(%rsp),%r10
165	movq	%rdx,%r13
166
167	leaq	1(%r15),%r15
168	jmp	L$inner_enter
169
170.p2align	4
171L$inner:
172	addq	%rax,%r13
173	movq	(%rsi,%r15,8),%rax
174	adcq	$0,%rdx
175	addq	%r10,%r13
176	movq	(%rsp,%r15,8),%r10
177	adcq	$0,%rdx
178	movq	%r13,-16(%rsp,%r15,8)
179	movq	%rdx,%r13
180
181L$inner_enter:
182	mulq	%rbx
183	addq	%rax,%r11
184	movq	(%rcx,%r15,8),%rax
185	adcq	$0,%rdx
186	addq	%r11,%r10
187	movq	%rdx,%r11
188	adcq	$0,%r11
189	leaq	1(%r15),%r15
190
191	mulq	%rbp
192	cmpq	%r9,%r15
193	jne	L$inner
194
// Inner loop tail: r10 picks up the old top word tp[count] so it is
// folded into the new top two words.
195	addq	%rax,%r13
196	movq	(%rsi),%rax
197	adcq	$0,%rdx
198	addq	%r10,%r13
199	movq	(%rsp,%r15,8),%r10
200	adcq	$0,%rdx
201	movq	%r13,-16(%rsp,%r15,8)
202	movq	%rdx,%r13
203
204	xorq	%rdx,%rdx
205	addq	%r11,%r13
206	adcq	$0,%rdx
207	addq	%r10,%r13
208	adcq	$0,%rdx
209	movq	%r13,-8(%rsp,%r9,8)
210	movq	%rdx,(%rsp,%r9,8)
211
212	leaq	1(%r14),%r14
213	cmpq	%r9,%r14
214	jb	L$outer
215
// Final subtraction: dest[] = tp[] - modulus[] with a borrow chain.
// The xorq clears CF for the first sbbq; the movq, leaq and decq inside
// the loop leave CF untouched, so the borrow carries across iterations.
216	xorq	%r14,%r14
217	movq	(%rsp),%rax
218	movq	%r9,%r15
219
220.p2align	4
221L$sub:	sbbq	(%rcx,%r14,8),%rax
222	movq	%rax,(%rdi,%r14,8)
223	movq	8(%rsp,%r14,8),%rax
224	leaq	1(%r14),%r14
225	decq	%r15
226	jnz	L$sub
227
// rax = tp[count] - borrow, used as a select mask (expected to be 0 or
// all-ones); rbx = ~rax.
228	sbbq	$0,%rax
229	movq	$-1,%rbx
230	xorq	%rax,%rbx
231	xorq	%r14,%r14
232	movq	%r9,%r15
233
// Branch-free select per word:
//   dest[i] = (dest[i] & ~mask) | (tp[i] & mask)
// and tp[i] is overwritten with the count value so the intermediate
// result does not remain on the stack.
234L$copy:
235	movq	(%rdi,%r14,8),%rcx
236	movq	(%rsp,%r14,8),%rdx
237	andq	%rbx,%rcx
238	andq	%rax,%rdx
239	movq	%r9,(%rsp,%r14,8)
240	orq	%rcx,%rdx
241	movq	%rdx,(%rdi,%r14,8)
242	leaq	1(%r14),%r14
243	subq	$1,%r15
244	jnz	L$copy
245
// Epilogue: rsi = entry rsp saved in tp[count+1]; return value 1;
// reload the callee-saved registers pushed in the prologue.
246	movq	8(%rsp,%r9,8),%rsi
247
248	movq	$1,%rax
249	movq	-48(%rsi),%r15
250
251	movq	-40(%rsi),%r14
252
253	movq	-32(%rsi),%r13
254
255	movq	-24(%rsi),%r12
256
257	movq	-16(%rsi),%rbp
258
259	movq	-8(%rsi),%rbx
260
261	leaq	(%rsi),%rsp
262
263L$mul_epilogue:
264	ret
265
266
267
// -----------------------------------------------------------------------
// bn_mul4x_mont: Montgomery multiplication with the inner loops unrolled
// four words per iteration, using plain mulq.
//
// In this file it is reached through L$mul4x_enter from _bn_mul_mont,
// with the same argument registers (rdi, rsi, rdx, rcx, r8, r9), rax
// holding the entry rsp and r11d holding the word loaded from offset 8
// of OPENSSL_ia32cap_P. The dispatcher only sends counts here that are
// multiples of 4 and at least 8.
//
// Frame: tp[] at rsp (count+1 words), entry rsp at tp[count+1], the
// destination pointer at tp[count+2] (rdi is reused as an accumulator).
// Returns 1 in rax.
// -----------------------------------------------------------------------
268.p2align	4
269bn_mul4x_mont:
270
271	movl	%r9d,%r9d
272	movq	%rsp,%rax
273
274L$mul4x_enter:
// If both bits of mask 0x80100 are set in r11d, use the mulx path.
// NOTE(review): presumably these are the BMI2 and ADX feature bits --
// confirm against the OPENSSL_ia32cap_P layout.
275	andl	$0x80100,%r11d
276	cmpl	$0x80100,%r11d
277	je	L$mulx4x_enter
278	pushq	%rbx
279
280	pushq	%rbp
281
282	pushq	%r12
283
284	pushq	%r13
285
286	pushq	%r14
287
288	pushq	%r15
289
290
// r10 = (rsp - 8*(count+4)) rounded down to 1024 bytes: base of tp[].
291	negq	%r9
292	movq	%rsp,%r11
293	leaq	-32(%rsp,%r9,8),%r10
294	negq	%r9
295	andq	$-1024,%r10
296
// Lower rsp to r10 in steps of at most 4096 bytes, reading one word at
// each step.
297	subq	%r10,%r11
298	andq	$-4096,%r11
299	leaq	(%r10,%r11,1),%rsp
300	movq	(%rsp),%r11
301	cmpq	%r10,%rsp
302	ja	L$mul4x_page_walk
303	jmp	L$mul4x_page_walk_done
304
305L$mul4x_page_walk:
306	leaq	-4096(%rsp),%rsp
307	movq	(%rsp),%r11
308	cmpq	%r10,%rsp
309	ja	L$mul4x_page_walk
310L$mul4x_page_walk_done:
311
// tp[count+1] = entry rsp.
312	movq	%rax,8(%rsp,%r9,8)
313
314L$mul4x_body:
// tp[count+2] = destination pointer. r12 = second operand pointer,
// r8 = word loaded through r8, rbx = b[0], rax = a[0],
// r14 = outer index i, r15 = inner index j.
315	movq	%rdi,16(%rsp,%r9,8)
316	movq	%rdx,%r12
317	movq	(%r8),%r8
318	movq	(%r12),%rbx
319	movq	(%rsi),%rax
320
321	xorq	%r14,%r14
322	xorq	%r15,%r15
323
// rbp = r8 * low64(a[0]*b[0]): multiplier for the modulus in this pass.
324	movq	%r8,%rbp
325	mulq	%rbx
326	movq	%rax,%r10
327	movq	(%rcx),%rax
328
329	imulq	%r10,%rbp
330	movq	%rdx,%r11
331
332	mulq	%rbp
333	addq	%rax,%r10
334	movq	8(%rsi),%rax
335	adcq	$0,%rdx
336	movq	%rdx,%rdi
337
338	mulq	%rbx
339	addq	%rax,%r11
340	movq	8(%rcx),%rax
341	adcq	$0,%rdx
342	movq	%rdx,%r10
343
344	mulq	%rbp
345	addq	%rax,%rdi
346	movq	16(%rsi),%rax
347	adcq	$0,%rdx
348	addq	%r11,%rdi
349	leaq	4(%r15),%r15
350	adcq	$0,%rdx
351	movq	%rdi,(%rsp)
352	movq	%rdx,%r13
353	jmp	L$1st4x
// First pass (i = 0), four columns per iteration. r10/r11 alternate as
// the carry of the a[j]*b[0] products, r13/rdi as the carry of the
// n[j]*rbp products.
354.p2align	4
355L$1st4x:
356	mulq	%rbx
357	addq	%rax,%r10
358	movq	-16(%rcx,%r15,8),%rax
359	adcq	$0,%rdx
360	movq	%rdx,%r11
361
362	mulq	%rbp
363	addq	%rax,%r13
364	movq	-8(%rsi,%r15,8),%rax
365	adcq	$0,%rdx
366	addq	%r10,%r13
367	adcq	$0,%rdx
368	movq	%r13,-24(%rsp,%r15,8)
369	movq	%rdx,%rdi
370
371	mulq	%rbx
372	addq	%rax,%r11
373	movq	-8(%rcx,%r15,8),%rax
374	adcq	$0,%rdx
375	movq	%rdx,%r10
376
377	mulq	%rbp
378	addq	%rax,%rdi
379	movq	(%rsi,%r15,8),%rax
380	adcq	$0,%rdx
381	addq	%r11,%rdi
382	adcq	$0,%rdx
383	movq	%rdi,-16(%rsp,%r15,8)
384	movq	%rdx,%r13
385
386	mulq	%rbx
387	addq	%rax,%r10
388	movq	(%rcx,%r15,8),%rax
389	adcq	$0,%rdx
390	movq	%rdx,%r11
391
392	mulq	%rbp
393	addq	%rax,%r13
394	movq	8(%rsi,%r15,8),%rax
395	adcq	$0,%rdx
396	addq	%r10,%r13
397	adcq	$0,%rdx
398	movq	%r13,-8(%rsp,%r15,8)
399	movq	%rdx,%rdi
400
401	mulq	%rbx
402	addq	%rax,%r11
403	movq	8(%rcx,%r15,8),%rax
404	adcq	$0,%rdx
405	leaq	4(%r15),%r15
406	movq	%rdx,%r10
407
408	mulq	%rbp
409	addq	%rax,%rdi
410	movq	-16(%rsi,%r15,8),%rax
411	adcq	$0,%rdx
412	addq	%r11,%rdi
413	adcq	$0,%rdx
414	movq	%rdi,-32(%rsp,%r15,8)
415	movq	%rdx,%r13
416	cmpq	%r9,%r15
417	jb	L$1st4x
418
// Tail of the first pass: the last two columns, then the two top words
// tp[count-1] and tp[count]. rax is reloaded with a[0] for the next pass.
419	mulq	%rbx
420	addq	%rax,%r10
421	movq	-16(%rcx,%r15,8),%rax
422	adcq	$0,%rdx
423	movq	%rdx,%r11
424
425	mulq	%rbp
426	addq	%rax,%r13
427	movq	-8(%rsi,%r15,8),%rax
428	adcq	$0,%rdx
429	addq	%r10,%r13
430	adcq	$0,%rdx
431	movq	%r13,-24(%rsp,%r15,8)
432	movq	%rdx,%rdi
433
434	mulq	%rbx
435	addq	%rax,%r11
436	movq	-8(%rcx,%r15,8),%rax
437	adcq	$0,%rdx
438	movq	%rdx,%r10
439
440	mulq	%rbp
441	addq	%rax,%rdi
442	movq	(%rsi),%rax
443	adcq	$0,%rdx
444	addq	%r11,%rdi
445	adcq	$0,%rdx
446	movq	%rdi,-16(%rsp,%r15,8)
447	movq	%rdx,%r13
448
449	xorq	%rdi,%rdi
450	addq	%r10,%r13
451	adcq	$0,%rdi
452	movq	%r13,-8(%rsp,%r15,8)
453	movq	%rdi,(%rsp,%r15,8)
454
455	leaq	1(%r14),%r14
// Outer loop, i = 1 .. count-1: as above, but the previous tp[] words
// are added into each column.
456.p2align	2
457L$outer4x:
458	movq	(%r12,%r14,8),%rbx
459	xorq	%r15,%r15
460	movq	(%rsp),%r10
461	movq	%r8,%rbp
462	mulq	%rbx
463	addq	%rax,%r10
464	movq	(%rcx),%rax
465	adcq	$0,%rdx
466
467	imulq	%r10,%rbp
468	movq	%rdx,%r11
469
470	mulq	%rbp
471	addq	%rax,%r10
472	movq	8(%rsi),%rax
473	adcq	$0,%rdx
474	movq	%rdx,%rdi
475
476	mulq	%rbx
477	addq	%rax,%r11
478	movq	8(%rcx),%rax
479	adcq	$0,%rdx
480	addq	8(%rsp),%r11
481	adcq	$0,%rdx
482	movq	%rdx,%r10
483
484	mulq	%rbp
485	addq	%rax,%rdi
486	movq	16(%rsi),%rax
487	adcq	$0,%rdx
488	addq	%r11,%rdi
489	leaq	4(%r15),%r15
490	adcq	$0,%rdx
491	movq	%rdi,(%rsp)
492	movq	%rdx,%r13
493	jmp	L$inner4x
494.p2align	4
495L$inner4x:
496	mulq	%rbx
497	addq	%rax,%r10
498	movq	-16(%rcx,%r15,8),%rax
499	adcq	$0,%rdx
500	addq	-16(%rsp,%r15,8),%r10
501	adcq	$0,%rdx
502	movq	%rdx,%r11
503
504	mulq	%rbp
505	addq	%rax,%r13
506	movq	-8(%rsi,%r15,8),%rax
507	adcq	$0,%rdx
508	addq	%r10,%r13
509	adcq	$0,%rdx
510	movq	%r13,-24(%rsp,%r15,8)
511	movq	%rdx,%rdi
512
513	mulq	%rbx
514	addq	%rax,%r11
515	movq	-8(%rcx,%r15,8),%rax
516	adcq	$0,%rdx
517	addq	-8(%rsp,%r15,8),%r11
518	adcq	$0,%rdx
519	movq	%rdx,%r10
520
521	mulq	%rbp
522	addq	%rax,%rdi
523	movq	(%rsi,%r15,8),%rax
524	adcq	$0,%rdx
525	addq	%r11,%rdi
526	adcq	$0,%rdx
527	movq	%rdi,-16(%rsp,%r15,8)
528	movq	%rdx,%r13
529
530	mulq	%rbx
531	addq	%rax,%r10
532	movq	(%rcx,%r15,8),%rax
533	adcq	$0,%rdx
534	addq	(%rsp,%r15,8),%r10
535	adcq	$0,%rdx
536	movq	%rdx,%r11
537
538	mulq	%rbp
539	addq	%rax,%r13
540	movq	8(%rsi,%r15,8),%rax
541	adcq	$0,%rdx
542	addq	%r10,%r13
543	adcq	$0,%rdx
544	movq	%r13,-8(%rsp,%r15,8)
545	movq	%rdx,%rdi
546
547	mulq	%rbx
548	addq	%rax,%r11
549	movq	8(%rcx,%r15,8),%rax
550	adcq	$0,%rdx
551	addq	8(%rsp,%r15,8),%r11
552	adcq	$0,%rdx
553	leaq	4(%r15),%r15
554	movq	%rdx,%r10
555
556	mulq	%rbp
557	addq	%rax,%rdi
558	movq	-16(%rsi,%r15,8),%rax
559	adcq	$0,%rdx
560	addq	%r11,%rdi
561	adcq	$0,%rdx
562	movq	%rdi,-32(%rsp,%r15,8)
563	movq	%rdx,%r13
564	cmpq	%r9,%r15
565	jb	L$inner4x
566
// Tail of an outer pass: last two columns, advance i, then fold the old
// top word tp[count] into the new top two words.
567	mulq	%rbx
568	addq	%rax,%r10
569	movq	-16(%rcx,%r15,8),%rax
570	adcq	$0,%rdx
571	addq	-16(%rsp,%r15,8),%r10
572	adcq	$0,%rdx
573	movq	%rdx,%r11
574
575	mulq	%rbp
576	addq	%rax,%r13
577	movq	-8(%rsi,%r15,8),%rax
578	adcq	$0,%rdx
579	addq	%r10,%r13
580	adcq	$0,%rdx
581	movq	%r13,-24(%rsp,%r15,8)
582	movq	%rdx,%rdi
583
584	mulq	%rbx
585	addq	%rax,%r11
586	movq	-8(%rcx,%r15,8),%rax
587	adcq	$0,%rdx
588	addq	-8(%rsp,%r15,8),%r11
589	adcq	$0,%rdx
590	leaq	1(%r14),%r14
591	movq	%rdx,%r10
592
593	mulq	%rbp
594	addq	%rax,%rdi
595	movq	(%rsi),%rax
596	adcq	$0,%rdx
597	addq	%r11,%rdi
598	adcq	$0,%rdx
599	movq	%rdi,-16(%rsp,%r15,8)
600	movq	%rdx,%r13
601
602	xorq	%rdi,%rdi
603	addq	%r10,%r13
604	adcq	$0,%rdi
605	addq	(%rsp,%r9,8),%r13
606	adcq	$0,%rdi
607	movq	%r13,-8(%rsp,%r15,8)
608	movq	%rdi,(%rsp,%r15,8)
609
610	cmpq	%r9,%r14
611	jb	L$outer4x
// Final subtraction dest[] = tp[] - modulus[], four words per iteration.
// rdi = destination pointer reloaded from the frame, r15 = count/4 - 1
// loop iterations, rsi = tp base, r14 = word index. The subq starts the
// borrow chain; only movq, leaq and decq sit between the sbbq steps, so
// CF survives across iterations.
612	movq	16(%rsp,%r9,8),%rdi
613	leaq	-4(%r9),%r15
614	movq	0(%rsp),%rax
615	movq	8(%rsp),%rdx
616	shrq	$2,%r15
617	leaq	(%rsp),%rsi
618	xorq	%r14,%r14
619
620	subq	0(%rcx),%rax
621	movq	16(%rsi),%rbx
622	movq	24(%rsi),%rbp
623	sbbq	8(%rcx),%rdx
624
625L$sub4x:
626	movq	%rax,0(%rdi,%r14,8)
627	movq	%rdx,8(%rdi,%r14,8)
628	sbbq	16(%rcx,%r14,8),%rbx
629	movq	32(%rsi,%r14,8),%rax
630	movq	40(%rsi,%r14,8),%rdx
631	sbbq	24(%rcx,%r14,8),%rbp
632	movq	%rbx,16(%rdi,%r14,8)
633	movq	%rbp,24(%rdi,%r14,8)
634	sbbq	32(%rcx,%r14,8),%rax
635	movq	48(%rsi,%r14,8),%rbx
636	movq	56(%rsi,%r14,8),%rbp
637	sbbq	40(%rcx,%r14,8),%rdx
638	leaq	4(%r14),%r14
639	decq	%r15
640	jnz	L$sub4x
641
// Last group of four; rax is then loaded with the top word tp[count].
642	movq	%rax,0(%rdi,%r14,8)
643	movq	32(%rsi,%r14,8),%rax
644	sbbq	16(%rcx,%r14,8),%rbx
645	movq	%rdx,8(%rdi,%r14,8)
646	sbbq	24(%rcx,%r14,8),%rbp
647	movq	%rbx,16(%rdi,%r14,8)
648
// rax = tp[count] - borrow: the select mask. The .byte sequence encodes
// movq %rax,%xmm4; pshufd broadcasts its low dword to all lanes, so
// xmm4 = mask and xmm5 = ~mask. xmm0 = 0 is used to wipe tp[].
// rax is then reused as a byte offset and r15 = count/4 iterations.
649	sbbq	$0,%rax
650	movq	%rbp,24(%rdi,%r14,8)
651	pxor	%xmm0,%xmm0
652.byte	102,72,15,110,224
653	pcmpeqd	%xmm5,%xmm5
654	pshufd	$0,%xmm4,%xmm4
655	movq	%r9,%r15
656	pxor	%xmm4,%xmm5
657	shrq	$2,%r15
658	xorl	%eax,%eax
659
660	jmp	L$copy4x
// Branch-free select, 32 bytes per iteration:
//   dest = (tp & mask) | (dest & ~mask), and tp is zeroed.
// tp is accessed with aligned moves (the frame base is 1024-aligned);
// the destination uses unaligned moves.
661.p2align	4
662L$copy4x:
663	movdqa	(%rsp,%rax,1),%xmm1
664	movdqu	(%rdi,%rax,1),%xmm2
665	pand	%xmm4,%xmm1
666	pand	%xmm5,%xmm2
667	movdqa	16(%rsp,%rax,1),%xmm3
668	movdqa	%xmm0,(%rsp,%rax,1)
669	por	%xmm2,%xmm1
670	movdqu	16(%rdi,%rax,1),%xmm2
671	movdqu	%xmm1,(%rdi,%rax,1)
672	pand	%xmm4,%xmm3
673	pand	%xmm5,%xmm2
674	movdqa	%xmm0,16(%rsp,%rax,1)
675	por	%xmm2,%xmm3
676	movdqu	%xmm3,16(%rdi,%rax,1)
677	leaq	32(%rax),%rax
678	decq	%r15
679	jnz	L$copy4x
// Epilogue: rsi = entry rsp from tp[count+1]; return 1; restore the
// callee-saved registers and the stack pointer.
680	movq	8(%rsp,%r9,8),%rsi
681
682	movq	$1,%rax
683	movq	-48(%rsi),%r15
684
685	movq	-40(%rsi),%r14
686
687	movq	-32(%rsi),%r13
688
689	movq	-24(%rsi),%r12
690
691	movq	-16(%rsi),%rbp
692
693	movq	-8(%rsi),%rbx
694
695	leaq	(%rsi),%rsp
696
697L$mul4x_epilogue:
698	ret
699
700
701
702
703
704
// -----------------------------------------------------------------------
// bn_sqr8x_mont: squaring path. In this file it is reached through
// L$sqr8x_enter from _bn_mul_mont when rdx == rsi and the count is a
// multiple of 8, with rax holding the entry rsp.
//
// It builds a stack frame, calls _bn_sqrx8x_internal or
// _bn_sqr8x_internal (both defined outside this chunk), then performs the
// final subtraction and a masked copy into the destination.
// Returns 1 in rax.
// -----------------------------------------------------------------------
705.p2align	5
706bn_sqr8x_mont:
707
708	movq	%rsp,%rax
709
710L$sqr8x_enter:
711	pushq	%rbx
712
713	pushq	%rbp
714
715	pushq	%r12
716
717	pushq	%r13
718
719	pushq	%r14
720
721	pushq	%r15
722
723L$sqr8x_prologue:
724
// r10 = count*32, r9 = -(count*8), i.e. minus the operand size in bytes.
725	movl	%r9d,%r10d
726	shll	$3,%r9d
727	shlq	$3+2,%r10
728	negq	%r9
729
730
731
732
733
734
// Choose the frame base rbp: at least 64 + 2*count*8 bytes below rsp,
// adjusted by the distance to the input buffer (rsi) modulo 4096.
// NOTE(review): presumably this keeps the frame from aliasing the input
// at 4 KiB granularity -- confirm with the generator script.
735	leaq	-64(%rsp,%r9,2),%r11
736	movq	%rsp,%rbp
737	movq	(%r8),%r8
738	subq	%rsi,%r11
739	andq	$4095,%r11
740	cmpq	%r11,%r10
741	jb	L$sqr8x_sp_alt
742	subq	%r11,%rbp
743	leaq	-64(%rbp,%r9,2),%rbp
744	jmp	L$sqr8x_sp_done
745
746.p2align	5
747L$sqr8x_sp_alt:
// movq $0 (rather than xor) keeps CF from the subq intact for cmovc:
// r11 is clamped to 0 when the subtraction borrowed.
748	leaq	4096-64(,%r9,2),%r10
749	leaq	-64(%rbp,%r9,2),%rbp
750	subq	%r10,%r11
751	movq	$0,%r10
752	cmovcq	%r10,%r11
753	subq	%r11,%rbp
754L$sqr8x_sp_done:
// Align the frame to 64 bytes, then lower rsp to it in steps of at most
// 4096 bytes, reading one word at each step.
755	andq	$-64,%rbp
756	movq	%rsp,%r11
757	subq	%rbp,%r11
758	andq	$-4096,%r11
759	leaq	(%r11,%rbp,1),%rsp
760	movq	(%rsp),%r10
761	cmpq	%rbp,%rsp
762	ja	L$sqr8x_page_walk
763	jmp	L$sqr8x_page_walk_done
764
765.p2align	4
766L$sqr8x_page_walk:
767	leaq	-4096(%rsp),%rsp
768	movq	(%rsp),%r10
769	cmpq	%rbp,%rsp
770	ja	L$sqr8x_page_walk
771L$sqr8x_page_walk_done:
772
// r10 = -(count*8), r9 = +(count*8).
773	movq	%r9,%r10
774	negq	%r9
775
// Frame slots: 32(%rsp) = word loaded through r8, 40(%rsp) = entry rsp.
776	movq	%r8,32(%rsp)
777	movq	%rax,40(%rsp)
778
779L$sqr8x_body:
780
// The .byte sequences encode movq from a general register to an xmm
// register: xmm2 = rcx (modulus pointer), xmm1 = rdi (destination
// pointer), xmm3 = r10 (negative byte count). xmm0 is zeroed.
781.byte	102,72,15,110,209
782	pxor	%xmm0,%xmm0
783.byte	102,72,15,110,207
784.byte	102,73,15,110,218
// Select the helper: both bits of mask 0x80100 set in the word at offset
// 8 of OPENSSL_ia32cap_P -> _bn_sqrx8x_internal, else _bn_sqr8x_internal.
785	leaq	_OPENSSL_ia32cap_P(%rip),%rax
786	movl	8(%rax),%eax
787	andl	$0x80100,%eax
788	cmpl	$0x80100,%eax
789	jne	L$sqr8x_nox
790
791	call	_bn_sqrx8x_internal
792
793
794
795
// NOTE(review): the values of r8, rcx, rbp and rax used from here on are
// whatever the helper leaves in them; its register contract is not
// visible in this chunk. The .byte sequence encodes movq %xmm1,%rdi
// (destination pointer restored). rcx becomes a negative count of
// 32-byte groups for the loop below.
796	leaq	(%r8,%rcx,1),%rbx
797	movq	%rcx,%r9
798	movq	%rcx,%rdx
799.byte	102,72,15,126,207
800	sarq	$3+2,%rcx
801	jmp	L$sqr8x_sub
802
803.p2align	5
804L$sqr8x_nox:
805	call	_bn_sqr8x_internal
806
807
808
809
// Same setup as above, using the registers left by the non-mulx helper
// (rdi and r9 here) -- NOTE(review): confirm against that helper.
810	leaq	(%rdi,%r9,1),%rbx
811	movq	%r9,%rcx
812	movq	%r9,%rdx
813.byte	102,72,15,126,207
814	sarq	$3+2,%rcx
815	jmp	L$sqr8x_sub
816
// Final subtraction, four words per iteration: [rdi] = [rbx] - [rbp]
// with a borrow chain. rcx counts up to zero; leaq, movq and incq leave
// CF untouched between the sbbq groups.
// NOTE(review): CF on entry comes from the sarq above; it is clear as
// long as the low five bits of the shifted value are zero -- confirm.
817.p2align	5
818L$sqr8x_sub:
819	movq	0(%rbx),%r12
820	movq	8(%rbx),%r13
821	movq	16(%rbx),%r14
822	movq	24(%rbx),%r15
823	leaq	32(%rbx),%rbx
824	sbbq	0(%rbp),%r12
825	sbbq	8(%rbp),%r13
826	sbbq	16(%rbp),%r14
827	sbbq	24(%rbp),%r15
828	leaq	32(%rbp),%rbp
829	movq	%r12,0(%rdi)
830	movq	%r13,8(%rdi)
831	movq	%r14,16(%rdi)
832	movq	%r15,24(%rdi)
833	leaq	32(%rdi),%rdi
834	incq	%rcx
835	jnz	L$sqr8x_sub
836
// rax -= final borrow, giving the select mask (rax itself comes from the
// helper -- presumably the top carry word; confirm). rbx and rdi are
// rewound by r9 (negative byte count) to the start of their arrays.
837	sbbq	$0,%rax
838	leaq	(%rbx,%r9,1),%rbx
839	leaq	(%rdi,%r9,1),%rdi
840
// The .byte sequence encodes movq %rax,%xmm1; pshufd broadcasts the low
// dword so xmm1 = mask in every lane. rsi = entry rsp from the frame.
841.byte	102,72,15,110,200
842	pxor	%xmm0,%xmm0
843	pshufd	$0,%xmm1,%xmm1
844	movq	40(%rsp),%rsi
845
846	jmp	L$sqr8x_cond_copy
847
// Masked copy, 32 bytes per iteration:
//   dest = (temp & mask) | (dest & ~mask)
// The temp words at rbx and at rbx+rdx are overwritten with zero.
// pcmpeqd against zero turns xmm0 into ~mask (all-ones lanes where the
// mask lane is zero); xmm0 is cleared again before the next iteration.
// r9 counts up by 32 to zero.
848.p2align	5
849L$sqr8x_cond_copy:
850	movdqa	0(%rbx),%xmm2
851	movdqa	16(%rbx),%xmm3
852	leaq	32(%rbx),%rbx
853	movdqu	0(%rdi),%xmm4
854	movdqu	16(%rdi),%xmm5
855	leaq	32(%rdi),%rdi
856	movdqa	%xmm0,-32(%rbx)
857	movdqa	%xmm0,-16(%rbx)
858	movdqa	%xmm0,-32(%rbx,%rdx,1)
859	movdqa	%xmm0,-16(%rbx,%rdx,1)
860	pcmpeqd	%xmm1,%xmm0
861	pand	%xmm1,%xmm2
862	pand	%xmm1,%xmm3
863	pand	%xmm0,%xmm4
864	pand	%xmm0,%xmm5
865	pxor	%xmm0,%xmm0
866	por	%xmm2,%xmm4
867	por	%xmm3,%xmm5
868	movdqu	%xmm4,-32(%rdi)
869	movdqu	%xmm5,-16(%rdi)
870	addq	$32,%r9
871	jnz	L$sqr8x_cond_copy
872
// Epilogue: return 1; restore callee-saved registers relative to the
// entry rsp held in rsi, then restore rsp.
873	movq	$1,%rax
874	movq	-48(%rsi),%r15
875
876	movq	-40(%rsi),%r14
877
878	movq	-32(%rsi),%r13
879
880	movq	-24(%rsi),%r12
881
882	movq	-16(%rsi),%rbp
883
884	movq	-8(%rsi),%rbx
885
886	leaq	(%rsi),%rsp
887
888L$sqr8x_epilogue:
889	ret
890
891
892
// -----------------------------------------------------------------------
// bn_mulx4x_mont: Montgomery multiplication using mulx with two
// independent carry chains (adcx uses CF, adox uses OF), four words per
// iteration. In this file it is reached through L$mulx4x_enter from
// L$mul4x_enter when both bits of mask 0x80100 are set in the capability
// word; rax holds the entry rsp and the argument registers are as for
// _bn_mul_mont.
//
// Frame layout (offsets from rsp):
//    0  count*8 (operand size in bytes)
//    8  pointer to the next word of the second operand
//   16  end of the second operand (rdx + count*8)
//   24  word loaded through r8
//   32  destination pointer
//   40  entry rsp
//   48  count/4 - 1 (inner loop iterations)
//   64  tp[] temporary vector
// Returns 1 in rax.
// -----------------------------------------------------------------------
893.p2align	5
894bn_mulx4x_mont:
895
896	movq	%rsp,%rax
897
898L$mulx4x_enter:
899	pushq	%rbx
900
901	pushq	%rbp
902
903	pushq	%r12
904
905	pushq	%r13
906
907	pushq	%r14
908
909	pushq	%r15
910
911L$mulx4x_prologue:
912
// r9 = count*8; rbp = (rsp - 72 - count*8) rounded down to 128 bytes.
913	shll	$3,%r9d
914	xorq	%r10,%r10
915	subq	%r9,%r10
916	movq	(%r8),%r8
917	leaq	-72(%rsp,%r10,1),%rbp
918	andq	$-128,%rbp
// Lower rsp to rbp in steps of at most 4096 bytes, reading one word at
// each step.
919	movq	%rsp,%r11
920	subq	%rbp,%r11
921	andq	$-4096,%r11
922	leaq	(%r11,%rbp,1),%rsp
923	movq	(%rsp),%r10
924	cmpq	%rbp,%rsp
925	ja	L$mulx4x_page_walk
926	jmp	L$mulx4x_page_walk_done
927
928.p2align	4
929L$mulx4x_page_walk:
930	leaq	-4096(%rsp),%rsp
931	movq	(%rsp),%r10
932	cmpq	%rbp,%rsp
933	ja	L$mulx4x_page_walk
934L$mulx4x_page_walk_done:
935
// r10 = end of the second operand array.
936	leaq	(%rdx,%r9,1),%r10
937
938
939
940
941
942
943
944
945
946
947
948
// Fill the frame slots listed in the header; r9 becomes count/4 - 1.
949	movq	%r9,0(%rsp)
950	shrq	$5,%r9
951	movq	%r10,16(%rsp)
952	subq	$1,%r9
953	movq	%r8,24(%rsp)
954	movq	%rdi,32(%rsp)
955	movq	%rax,40(%rsp)
956
957	movq	%r9,48(%rsp)
958	jmp	L$mulx4x_body
959
960.p2align	5
961L$mulx4x_body:
// First pass. rdx = b[0] (implicit mulx multiplicand), r9 keeps a copy,
// rdi = pointer to b[1] (saved in the frame), rbx = tp + 32 bytes.
962	leaq	8(%rdx),%rdi
963	movq	(%rdx),%rdx
964	leaq	64+32(%rsp),%rbx
965	movq	%rdx,%r9
966
967	mulxq	0(%rsi),%r8,%rax
968	mulxq	8(%rsi),%r11,%r14
969	addq	%rax,%r11
970	movq	%rdi,8(%rsp)
971	mulxq	16(%rsi),%r12,%r13
972	adcq	%r14,%r12
973	adcq	$0,%r13
974
// r8 = low product word * frame slot 24: the multiplier for the modulus.
// The xorq zeroes rbp and clears CF and OF, starting both carry chains.
975	movq	%r8,%rdi
976	imulq	24(%rsp),%r8
977	xorq	%rbp,%rbp
978
979	mulxq	24(%rsi),%rax,%r14
980	movq	%r8,%rdx
981	leaq	32(%rsi),%rsi
982	adcxq	%rax,%r13
983	adcxq	%rbp,%r14
984
985	mulxq	0(%rcx),%rax,%r10
986	adcxq	%rax,%rdi
987	adoxq	%r11,%r10
988	mulxq	8(%rcx),%rax,%r11
989	adcxq	%rax,%r10
990	adoxq	%r12,%r11
// The .byte sequence encodes mulxq 16(%rcx),%rax,%r12 with a 32-bit
// displacement. rdi is then reloaded with the inner loop count.
991.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
992	movq	48(%rsp),%rdi
993	movq	%r10,-32(%rbx)
994	adcxq	%rax,%r11
995	adoxq	%r13,%r12
996	mulxq	24(%rcx),%rax,%r15
997	movq	%r9,%rdx
998	movq	%r11,-24(%rbx)
999	adcxq	%rax,%r12
1000	adoxq	%rbp,%r15
1001	leaq	32(%rcx),%rcx
1002	movq	%r12,-16(%rbx)
1003
1004	jmp	L$mulx4x_1st
1005
// First-pass inner loop: four words of a[]*b[0] followed by four words of
// n[]*r8 per iteration. rdx alternates between r9 (the b word) and r8
// (the modulus multiplier). rbp stays zero and only absorbs carries.
1006.p2align	5
1007L$mulx4x_1st:
1008	adcxq	%rbp,%r15
1009	mulxq	0(%rsi),%r10,%rax
1010	adcxq	%r14,%r10
1011	mulxq	8(%rsi),%r11,%r14
1012	adcxq	%rax,%r11
1013	mulxq	16(%rsi),%r12,%rax
1014	adcxq	%r14,%r12
1015	mulxq	24(%rsi),%r13,%r14
// Two 0x67 prefix bytes in front of the next instruction (padding).
1016.byte	0x67,0x67
1017	movq	%r8,%rdx
1018	adcxq	%rax,%r13
1019	adcxq	%rbp,%r14
1020	leaq	32(%rsi),%rsi
1021	leaq	32(%rbx),%rbx
1022
1023	adoxq	%r15,%r10
1024	mulxq	0(%rcx),%rax,%r15
1025	adcxq	%rax,%r10
1026	adoxq	%r15,%r11
1027	mulxq	8(%rcx),%rax,%r15
1028	adcxq	%rax,%r11
1029	adoxq	%r15,%r12
1030	mulxq	16(%rcx),%rax,%r15
1031	movq	%r10,-40(%rbx)
1032	adcxq	%rax,%r12
1033	movq	%r11,-32(%rbx)
1034	adoxq	%r15,%r13
1035	mulxq	24(%rcx),%rax,%r15
1036	movq	%r9,%rdx
1037	movq	%r12,-24(%rbx)
1038	adcxq	%rax,%r13
1039	adoxq	%rbp,%r15
1040	leaq	32(%rcx),%rcx
1041	movq	%r13,-16(%rbx)
1042
1043	decq	%rdi
1044	jnz	L$mulx4x_1st
1045
// End of first pass: rax = count*8, rdi = next b pointer. The top word
// goes to -8(%rbx); r15 = 0 or -1 records the carry out of it.
1046	movq	0(%rsp),%rax
1047	movq	8(%rsp),%rdi
1048	adcq	%rbp,%r15
1049	addq	%r15,%r14
1050	sbbq	%r15,%r15
1051	movq	%r14,-8(%rbx)
1052	jmp	L$mulx4x_outer
1053
// Outer loop: one pass per remaining word of the second operand.
// rsi and rcx are rewound by count*8 bytes, the carry word r15 is stored
// just past the top of tp[], and rbx is reset to tp + 32 bytes.
// xorl %ebp,%ebp zeroes rbp and clears CF and OF for the carry chains.
1054.p2align	5
1055L$mulx4x_outer:
1056	movq	(%rdi),%rdx
1057	leaq	8(%rdi),%rdi
1058	subq	%rax,%rsi
1059	movq	%r15,(%rbx)
1060	leaq	64+32(%rsp),%rbx
1061	subq	%rax,%rcx
1062
1063	mulxq	0(%rsi),%r8,%r11
1064	xorl	%ebp,%ebp
1065	movq	%rdx,%r9
1066	mulxq	8(%rsi),%r14,%r12
1067	adoxq	-32(%rbx),%r8
1068	adcxq	%r14,%r11
1069	mulxq	16(%rsi),%r15,%r13
1070	adoxq	-24(%rbx),%r11
1071	adcxq	%r15,%r12
1072	adoxq	-16(%rbx),%r12
1073	adcxq	%rbp,%r13
1074	adoxq	%rbp,%r13
1075
// Save the advanced b pointer; r8 = low word * frame slot 24 (the
// modulus multiplier for this pass); restart both carry chains.
1076	movq	%rdi,8(%rsp)
1077	movq	%r8,%r15
1078	imulq	24(%rsp),%r8
1079	xorl	%ebp,%ebp
1080
1081	mulxq	24(%rsi),%rax,%r14
1082	movq	%r8,%rdx
1083	adcxq	%rax,%r13
1084	adoxq	-8(%rbx),%r13
1085	adcxq	%rbp,%r14
1086	leaq	32(%rsi),%rsi
1087	adoxq	%rbp,%r14
1088
1089	mulxq	0(%rcx),%rax,%r10
1090	adcxq	%rax,%r15
1091	adoxq	%r11,%r10
1092	mulxq	8(%rcx),%rax,%r11
1093	adcxq	%rax,%r10
1094	adoxq	%r12,%r11
1095	mulxq	16(%rcx),%rax,%r12
1096	movq	%r10,-32(%rbx)
1097	adcxq	%rax,%r11
1098	adoxq	%r13,%r12
1099	mulxq	24(%rcx),%rax,%r15
1100	movq	%r9,%rdx
1101	movq	%r11,-24(%rbx)
1102	leaq	32(%rcx),%rcx
1103	adcxq	%rax,%r12
1104	adoxq	%rbp,%r15
1105	movq	48(%rsp),%rdi
1106	movq	%r12,-16(%rbx)
1107
1108	jmp	L$mulx4x_inner
1109
// Inner loop of an outer pass: like the first-pass loop, but the previous
// tp[] words at 0..24(%rbx) are added in through the CF chain.
1110.p2align	5
1111L$mulx4x_inner:
1112	mulxq	0(%rsi),%r10,%rax
1113	adcxq	%rbp,%r15
1114	adoxq	%r14,%r10
1115	mulxq	8(%rsi),%r11,%r14
1116	adcxq	0(%rbx),%r10
1117	adoxq	%rax,%r11
1118	mulxq	16(%rsi),%r12,%rax
1119	adcxq	8(%rbx),%r11
1120	adoxq	%r14,%r12
1121	mulxq	24(%rsi),%r13,%r14
1122	movq	%r8,%rdx
1123	adcxq	16(%rbx),%r12
1124	adoxq	%rax,%r13
1125	adcxq	24(%rbx),%r13
1126	adoxq	%rbp,%r14
1127	leaq	32(%rsi),%rsi
1128	leaq	32(%rbx),%rbx
1129	adcxq	%rbp,%r14
1130
1131	adoxq	%r15,%r10
1132	mulxq	0(%rcx),%rax,%r15
1133	adcxq	%rax,%r10
1134	adoxq	%r15,%r11
1135	mulxq	8(%rcx),%rax,%r15
1136	adcxq	%rax,%r11
1137	adoxq	%r15,%r12
1138	mulxq	16(%rcx),%rax,%r15
1139	movq	%r10,-40(%rbx)
1140	adcxq	%rax,%r12
1141	adoxq	%r15,%r13
1142	mulxq	24(%rcx),%rax,%r15
1143	movq	%r9,%rdx
1144	movq	%r11,-32(%rbx)
1145	movq	%r12,-24(%rbx)
1146	adcxq	%rax,%r13
1147	adoxq	%rbp,%r15
1148	leaq	32(%rcx),%rcx
1149	movq	%r13,-16(%rbx)
1150
1151	decq	%rdi
1152	jnz	L$mulx4x_inner
1153
// End of an outer pass. subq computes 0 - (saved carry word), which sets
// CF exactly when that word is nonzero; the adcq then folds it into the
// top word, and r15 again becomes 0 or -1.
1154	movq	0(%rsp),%rax
1155	movq	8(%rsp),%rdi
1156	adcq	%rbp,%r15
1157	subq	0(%rbx),%rbp
1158	adcq	%r15,%r14
1159	sbbq	%r15,%r15
1160	movq	%r14,-8(%rbx)
1161
// Loop until the b pointer reaches the end recorded in frame slot 16.
1162	cmpq	16(%rsp),%rdi
1163	jne	L$mulx4x_outer
1164
// Set up the final subtraction: rbx = tp, rcx rewound to the start of the
// modulus, r15 = 0 or 1 (top carry), rdx = count*8, rax = count/4,
// rdi = destination pointer from the frame. The shrq shifts out zero
// bits (count*8 is a multiple of 32 on this path), so CF is clear when
// the first sbbq runs.
1165	leaq	64(%rsp),%rbx
1166	subq	%rax,%rcx
1167	negq	%r15
1168	movq	%rax,%rdx
1169	shrq	$3+2,%rax
1170	movq	32(%rsp),%rdi
1171	jmp	L$mulx4x_sub
1172
// dest[] = tp[] - modulus[], four words per iteration, borrow carried in
// CF across iterations (movq, leaq and decq do not modify CF).
1173.p2align	5
1174L$mulx4x_sub:
1175	movq	0(%rbx),%r11
1176	movq	8(%rbx),%r12
1177	movq	16(%rbx),%r13
1178	movq	24(%rbx),%r14
1179	leaq	32(%rbx),%rbx
1180	sbbq	0(%rcx),%r11
1181	sbbq	8(%rcx),%r12
1182	sbbq	16(%rcx),%r13
1183	sbbq	24(%rcx),%r14
1184	leaq	32(%rcx),%rcx
1185	movq	%r11,0(%rdi)
1186	movq	%r12,8(%rdi)
1187	movq	%r13,16(%rdi)
1188	movq	%r14,24(%rdi)
1189	leaq	32(%rdi),%rdi
1190	decq	%rax
1191	jnz	L$mulx4x_sub
1192
// r15 = top carry - borrow: the select mask. rbx and rdi are reset to
// the start of tp[] and of the destination.
1193	sbbq	$0,%r15
1194	leaq	64(%rsp),%rbx
1195	subq	%rdx,%rdi
1196
// The .byte sequence encodes movq %r15,%xmm1; pshufd broadcasts the low
// dword so xmm1 = mask in every lane. rsi = entry rsp from the frame.
1197.byte	102,73,15,110,207
1198	pxor	%xmm0,%xmm0
1199	pshufd	$0,%xmm1,%xmm1
1200	movq	40(%rsp),%rsi
1201
1202	jmp	L$mulx4x_cond_copy
1203
// Masked copy, 32 bytes per iteration:
//   dest = (tp & mask) | (dest & ~mask), and tp is zeroed.
// pcmpeqd against zero turns xmm0 into ~mask; it is cleared again before
// the next iteration. rdx counts down by 32 to zero.
1204.p2align	5
1205L$mulx4x_cond_copy:
1206	movdqa	0(%rbx),%xmm2
1207	movdqa	16(%rbx),%xmm3
1208	leaq	32(%rbx),%rbx
1209	movdqu	0(%rdi),%xmm4
1210	movdqu	16(%rdi),%xmm5
1211	leaq	32(%rdi),%rdi
1212	movdqa	%xmm0,-32(%rbx)
1213	movdqa	%xmm0,-16(%rbx)
1214	pcmpeqd	%xmm1,%xmm0
1215	pand	%xmm1,%xmm2
1216	pand	%xmm1,%xmm3
1217	pand	%xmm0,%xmm4
1218	pand	%xmm0,%xmm5
1219	pxor	%xmm0,%xmm0
1220	por	%xmm2,%xmm4
1221	por	%xmm3,%xmm5
1222	movdqu	%xmm4,-32(%rdi)
1223	movdqu	%xmm5,-16(%rdi)
1224	subq	$32,%rdx
1225	jnz	L$mulx4x_cond_copy
1226
// rdx is zero here: clear the carry slot just past the top of tp[].
1227	movq	%rdx,(%rbx)
1228
// Epilogue: return 1; restore callee-saved registers relative to the
// entry rsp held in rsi, then restore rsp.
1229	movq	$1,%rax
1230	movq	-48(%rsi),%r15
1231
1232	movq	-40(%rsi),%r14
1233
1234	movq	-32(%rsi),%r13
1235
1236	movq	-24(%rsi),%r12
1237
1238	movq	-16(%rsi),%rbp
1239
1240	movq	-8(%rsi),%rbx
1241
1242	leaq	(%rsi),%rsp
1243
1244L$mulx4x_epilogue:
1245	ret
1246
1247
// NUL-terminated ASCII identification string placed in the text section:
// "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
// It is followed by padding to a 16-byte boundary.
1248.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1249.p2align	4
1250#endif
1251