1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <ring-core/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
7.text
8
9.extern	OPENSSL_ia32cap_P
10.hidden OPENSSL_ia32cap_P
11
// bn_mul_mont(rp=%rdi, ap=%rsi, bp=%rdx, np=%rcx, n0p=%r8, num=%r9)
//
// Word-serial Montgomery multiplication: rp[] = ap[]*bp[] / 2^(64*num) mod np[].
// Register roles as read from the code below: %rsi and %r12(=%rdx) are the two
// input vectors, %rcx the modulus, (%r8) the 64-bit Montgomery constant n0,
// %r9 the word count, %rdi the destination.  (Argument names follow the usual
// OpenSSL bn_mul_mont contract — NOTE(review): confirm against the generating
// Perl script.)  This entry point also dispatches to the unrolled and the
// squaring code paths for suitable values of num.
.globl	bn_mul_mont
.hidden bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
.cfi_startproc
_CET_ENDBR
	movl	%r9d,%r9d		// zero-extend num (32-bit write clears the upper half)
	movq	%rsp,%rax		// keep the original %rsp for the epilogue
.cfi_def_cfa_register	%rax
// Dispatch: num not a multiple of 4, or num < 8, takes the scalar loop below;
// otherwise one of the unrolled implementations is used.
	testl	$3,%r9d
	jnz	.Lmul_enter
	cmpl	$8,%r9d
	jb	.Lmul_enter
// Load the second OPENSSL_ia32cap_P word; .Lmul4x_enter tests it for 0x80100
// (presumably the BMI2+ADX feature bits gating the MULX path — confirm).
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	movl	8(%r11),%r11d
	cmpq	%rsi,%rdx		// ap == bp and num % 8 == 0: use the squaring path
	jne	.Lmul4x_enter
	testl	$7,%r9d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

// Allocate num+2 words of scratch below the current stack, 1024-byte aligned.
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

// Move %rsp down to the allocation one page at a time, touching each page
// ("stack probing") so any guard page is hit in order rather than skipped.
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	// stash the original %rsp above the num-word scratch
// DWARF: CFA = *(%rsp + 8 + num*8) + 8, i.e. recovered through the saved-%rsp slot.
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
	movq	%rdx,%r12		// %r12 = bp (mulq clobbers %rdx)
	movq	(%r8),%r8		// %r8 = n0
	movq	(%r12),%rbx		// %rbx = bp[0]
	movq	(%rsi),%rax

	xorq	%r14,%r14		// %r14 = outer index i
	xorq	%r15,%r15		// %r15 = inner index j

// First outer iteration: tmp = ap * bp[0], reduced on the fly with
// m (%rbp) = tmp[0] * n0 mod 2^64.
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

// Inner loop of the first iteration: tmp[j-1] = ap[j]*bp[0] + m*np[j] + carries.
.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

// Tail of the first iteration: fold the last partial products and write the
// top word plus the carry word at (%rsp, num*8).
	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	.Louter
// Remaining outer iterations: tmp += ap*bp[i] + m*np, where
// m = (tmp[0] + ap[0]*bp[i]) * n0 mod 2^64; result shifted down one word.
.align	16
.Louter:
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

// Conditional final subtraction: rp[] = tmp[] - np[], remembering the borrow.
// (CF is clear here: the last adcq above left it 0, so the first sbbq is exact.)
	xorq	%r14,%r14
	movq	(%rsp),%rax
	movq	%r9,%r15

.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsp,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

// Build branch-free select masks: after subtracting the final borrow from the
// carry word, %rax is all-ones iff the subtraction underflowed (tmp < np),
// and %rbx = ~%rax.
	sbbq	$0,%rax
	movq	$-1,%rbx
	xorq	%rax,%rbx
	xorq	%r14,%r14
	movq	%r9,%r15

// Constant-time copy: rp[i] = (rp[i] & ~mask) | (tmp[i] & mask), while also
// scrubbing the stack scratch by overwriting each word with %r9.
.Lcopy:
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r9,(%rsp,%r14,8)
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

// Epilogue: reload the original %rsp and the callee-saved registers that were
// pushed just below it, return 1.
	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
// bn_mul4x_mont — 4-way unrolled variant of bn_mul_mont; same arguments
// (rp=%rdi, ap=%rsi, bp=%rdx, np=%rcx, n0p=%r8, num=%r9).  Normally reached
// via .Lmul4x_enter from bn_mul_mont with %r11d holding the second
// OPENSSL_ia32cap_P word; if the 0x80100 bits are set it tail-dispatches to
// the MULX/ADX implementation instead.  %rdi is reused as a scratch register
// inside the loops, so rp is parked on the stack at 16(%rsp,num*8).
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.cfi_startproc
	movl	%r9d,%r9d		// zero-extend num
	movq	%rsp,%rax		// keep the original %rsp for the epilogue
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	andl	$0x80100,%r11d		// capability bits loaded by bn_mul_mont
	cmpl	$0x80100,%r11d
	je	.Lmulx4x_enter
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

// Allocate num+4 words of 1024-byte-aligned scratch, probing pages on the
// way down exactly as in bn_mul_mont.
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	// save original %rsp
// DWARF: CFA = *(%rsp + 8 + num*8) + 8.
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	// park rp; %rdi becomes a scratch register
	movq	%rdx,%r12		// %r12 = bp
	movq	(%r8),%r8		// %r8 = n0
	movq	(%r12),%rbx		// %rbx = bp[0]
	movq	(%rsi),%rax

	xorq	%r14,%r14		// outer index i
	xorq	%r15,%r15		// inner index j

// First outer iteration, head: start tmp = ap*bp[0] with on-the-fly
// reduction, m (%rbp) = tmp[0]*n0 mod 2^64.
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.L1st4x
// First-iteration inner loop, four words per pass.
.align	16
.L1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

// First-iteration tail: last two word pairs plus the carry word.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	leaq	1(%r14),%r14
// Remaining outer iterations: tmp += ap*bp[i] + m*np, shifted down one word.
.align	4
.Louter4x:
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	(%rsp),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

// Outer-iteration tail: fold the previous carry word at (%rsp,num*8).
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	.Louter4x
// Conditional final subtraction, four words per pass: rp[] = tmp[] - np[],
// keeping the borrow in CF across the loop.
	movq	16(%rsp,%r9,8),%rdi	// restore rp
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdx
	shrq	$2,%r15			// %r15 = num/4 - 1 trips
	leaq	(%rsp),%rsi
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx

.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax	// %rax = carry word
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

// Build SIMD select masks from the final borrow: %rax is all-ones iff
// tmp < np; xmm4 = broadcast(%rax), xmm5 = ~xmm4.
	sbbq	$0,%rax
	movq	%rbp,24(%rdi,%r14,8)
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,224		// movq %rax,%xmm4 (hand-encoded)
	pcmpeqd	%xmm5,%xmm5
	pshufd	$0,%xmm4,%xmm4
	movq	%r9,%r15
	pxor	%xmm4,%xmm5
	shrq	$2,%r15			// four words (32 bytes) per pass
	xorl	%eax,%eax

	jmp	.Lcopy4x
// Constant-time select between the unreduced value (stack) and the
// subtracted value (rp), scrubbing the stack scratch with zeroes.
.align	16
.Lcopy4x:
	movdqa	(%rsp,%rax,1),%xmm1
	movdqu	(%rdi,%rax,1),%xmm2
	pand	%xmm4,%xmm1
	pand	%xmm5,%xmm2
	movdqa	16(%rsp,%rax,1),%xmm3
	movdqa	%xmm0,(%rsp,%rax,1)
	por	%xmm2,%xmm1
	movdqu	16(%rdi,%rax,1),%xmm2
	movdqu	%xmm1,(%rdi,%rax,1)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax,1)
	por	%xmm2,%xmm3
	movdqu	%xmm3,16(%rdi,%rax,1)
	leaq	32(%rax),%rax
	decq	%r15
	jnz	.Lcopy4x
// Epilogue: restore callee-saved registers from above the saved %rsp, return 1.
	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi, 8
	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
702.extern	bn_sqrx8x_internal
703.hidden bn_sqrx8x_internal
704.extern	bn_sqr8x_internal
705.hidden bn_sqr8x_internal
706
// bn_sqr8x_mont — squaring path, reached from bn_mul_mont via .Lsqr8x_enter
// when ap == bp and num is a multiple of 8.  Sets up the frame and scratch,
// then defers the heavy lifting to bn_sqr8x_internal (or bn_sqrx8x_internal
// when the MULX/ADX capability bits are present) and performs the final
// conditional subtraction and constant-time copy here.
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
.cfi_startproc
	movq	%rsp,%rax		// keep the original %rsp for the epilogue
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lsqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d			// %r9 = num*8 (byte length)
	shlq	$3+2,%r10		// %r10 = num*32 (scratch size in bytes)
	negq	%r9

// Pick a scratch area of 2*num words + 64 bytes below %rsp, adjusting its
// placement by the distance to ap modulo 4 KiB — presumably to avoid cache
// aliasing between the scratch and ap; NOTE(review): heuristic inherited from
// the generating Perl source, confirm there.
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8		// %r8 = n0
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lsqr8x_sp_done:
	andq	$-64,%rbp		// 64-byte align the scratch base
// Walk %rsp down to the scratch one page at a time (stack probing).
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	movq	%r9,%r10		// %r10 = num*8
	negq	%r9			// %r9 = -num*8

	movq	%r8,32(%rsp)		// frame: 32(%rsp) = n0
	movq	%rax,40(%rsp)		// frame: 40(%rsp) = original %rsp
// DWARF: CFA = *(%rsp + 40) + 8.
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lsqr8x_body:

.byte	102,72,15,110,209		// movq %rcx,%xmm2 (save np, hand-encoded)
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		// movq %rdi,%xmm1 (save rp)
.byte	102,73,15,110,218		// movq %r10,%xmm3 (save num*8)
// Choose between the MULX-based and classic internal squaring routine.
	leaq	OPENSSL_ia32cap_P(%rip),%rax
	movl	8(%rax),%eax
	andl	$0x80100,%eax
	cmpl	$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal

// NOTE(review): %r8/%rcx here are outputs of bn_sqrx8x_internal (defined
// elsewhere); %rbx ends up pointing at the top half of the squared result.
	leaq	(%r8,%rcx,1),%rbx
	movq	%rcx,%r9
	movq	%rcx,%rdx
.byte	102,72,15,126,207		// movq %xmm1,%rdi (restore rp)
	sarq	$3+2,%rcx		// %rcx = -(num/4), loop trip count below
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
	call	bn_sqr8x_internal

	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207		// movq %xmm1,%rdi (restore rp)
	sarq	$3+2,%rcx
	jmp	.Lsqr8x_sub

// Conditional final subtraction: rp[] = top_half(tmp) - np[], four words
// per pass, borrow carried in CF.
.align	32
.Lsqr8x_sub:
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	.Lsqr8x_sub

// %rax becomes the select mask source (all-ones iff the subtraction
// borrowed); rewind %rbx/%rdi to the start of their ranges.
	sbbq	$0,%rax
	leaq	(%rbx,%r9,1),%rbx
	leaq	(%rdi,%r9,1),%rdi

.byte	102,72,15,110,200		// movq %rax,%xmm1 (mask source)
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		// broadcast mask to all lanes
	movq	40(%rsp),%rsi		// %rsi = original %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

// Constant-time select between the unreduced top half (%rbx) and the
// subtracted copy (%rdi), wiping the scratch (both halves) with zeroes.
.align	32
.Lsqr8x_cond_copy:
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	.Lsqr8x_cond_copy

// Epilogue: restore callee-saved registers, return 1.
	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
// bn_mulx4x_mont — MULX/ADCX/ADOX (BMI2+ADX) variant of bn_mul4x_mont,
// reached from .Lmul4x_enter when the 0x80100 capability bits are set.
// Same arguments as bn_mul_mont.  The two carry chains (CF via ADCX, OF via
// ADOX) run the multiply and the reduction in parallel.
.type	bn_mulx4x_mont,@function
.align	32
bn_mulx4x_mont:
.cfi_startproc
	movq	%rsp,%rax		// keep the original %rsp for the epilogue
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

// Allocate a 128-byte-aligned scratch of num*8 + 72 bytes, probing pages on
// the way down as in the other entry points.
	shll	$3,%r9d			// %r9 = num*8
	xorq	%r10,%r10
	subq	%r9,%r10		// %r10 = -num*8
	movq	(%r8),%r8		// %r8 = n0
	leaq	-72(%rsp,%r10,1),%rbp
	andq	$-128,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	leaq	(%rdx,%r9,1),%r10	// %r10 = &bp[num] (end sentinel)

// Frame layout (all offsets from the new %rsp):
//   0: num*8   8: bp cursor   16: &bp[num]   24: n0   32: rp
//  40: original %rsp   48: inner-loop trip count (num/4 - 1)
//  64+: the num-word product/reduction scratch.
	movq	%r9,0(%rsp)
	shrq	$5,%r9
	movq	%r10,16(%rsp)
	subq	$1,%r9
	movq	%r8,24(%rsp)
	movq	%rdi,32(%rsp)
	movq	%rax,40(%rsp)
// DWARF: CFA = *(%rsp + 40) + 8.
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
	movq	%r9,48(%rsp)
	jmp	.Lmulx4x_body

// First outer iteration: tmp = ap*bp[0] with on-the-fly reduction by
// m (%r8) = tmp[0]*n0; %r9 holds bp[0] so %rdx can alternate between
// multiplier values for MULX.
.align	32
.Lmulx4x_body:
	leaq	8(%rdx),%rdi		// %rdi = bp cursor
	movq	(%rdx),%rdx		// %rdx = bp[0] (implicit MULX operand)
	leaq	64+32(%rsp),%rbx	// %rbx = scratch write pointer
	movq	%rdx,%r9

	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r14
	addq	%rax,%r11
	movq	%rdi,8(%rsp)
	mulxq	16(%rsi),%r12,%r13
	adcq	%r14,%r12
	adcq	$0,%r13

	movq	%r8,%rdi
	imulq	24(%rsp),%r8		// %r8 = m = tmp[0]*n0
	xorq	%rbp,%rbp		// %rbp = constant 0 / clears CF+OF

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%rdi		// annihilates tmp[0] (result discarded)
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	// mulxq 16(%rcx),%rax,%r12 (hand-encoded)
	movq	48(%rsp),%rdi		// %rdi = inner trip count
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_1st

// First-iteration inner loop: four words of ap*bp[0] (ADCX chain) and
// m*np (ADOX chain) per pass.
.align	32
.Lmulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67			// two address-size prefixes: alignment padding
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

// First-iteration tail: flush the top word and a borrow-mask seed (%r15).
	movq	0(%rsp),%rax		// %rax = num*8
	movq	8(%rsp),%rdi		// %rdi = bp cursor
	adcq	%rbp,%r15
	addq	%r15,%r14
	sbbq	%r15,%r15		// %r15 = -carry
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

// Remaining outer iterations: tmp += ap*bp[i] + m*np, one bp word per pass.
.align	32
.Lmulx4x_outer:
	movq	(%rdi),%rdx		// %rdx = bp[i]
	leaq	8(%rdi),%rdi
	subq	%rax,%rsi		// rewind ap
	movq	%r15,(%rbx)		// store previous top carry
	leaq	64+32(%rsp),%rbx
	subq	%rax,%rcx		// rewind np

	mulxq	0(%rsi),%r8,%r11
	xorl	%ebp,%ebp		// %rbp = 0, clears CF+OF
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	-16(%rbx),%r12
	adcxq	%rbp,%r13
	adoxq	%rbp,%r13

	movq	%rdi,8(%rsp)
	movq	%r8,%r15
	imulq	24(%rsp),%r8		// %r8 = m = tmp[0]*n0
	xorl	%ebp,%ebp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15		// annihilates tmp[0]
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	movq	48(%rsp),%rdi		// %rdi = inner trip count
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

// Outer-iteration tail: fold the previous top carry and keep the running
// borrow mask in %r15; loop until the bp cursor reaches &bp[num].
	movq	0(%rsp),%rax		// %rax = num*8
	movq	8(%rsp),%rdi
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp		// sets CF from the stored carry word
	adcq	%r15,%r14
	sbbq	%r15,%r15		// %r15 = -carry
	movq	%r14,-8(%rbx)

	cmpq	16(%rsp),%rdi
	jne	.Lmulx4x_outer

// Conditional final subtraction: rp[] = tmp[] - np[], four words per pass.
	leaq	64(%rsp),%rbx
	subq	%rax,%rcx		// rewind np
	negq	%r15			// seed CF with the top carry
	movq	%rax,%rdx		// %rdx = num*8 (byte count, reused below)
	shrq	$3+2,%rax		// %rax = num/4 trips
	movq	32(%rsp),%rdi		// %rdi = rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	.Lmulx4x_sub

// %r15 becomes the select mask (all-ones iff tmp < np); broadcast it to xmm1.
	sbbq	$0,%r15
	leaq	64(%rsp),%rbx
	subq	%rdx,%rdi		// rewind rp

.byte	102,73,15,110,207		// movq %r15,%xmm1 (hand-encoded)
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1
	movq	40(%rsp),%rsi		// %rsi = original %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

// Constant-time select between the unreduced value (scratch) and the
// subtracted copy (rp), zero-scrubbing the scratch as it goes.
.align	32
.Lmulx4x_cond_copy:
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx
	jnz	.Lmulx4x_cond_copy

	movq	%rdx,(%rbx)		// %rdx == 0 here: scrub one more scratch word

// Epilogue: restore callee-saved registers, return 1.
	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	ret
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
1251.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1252.align	16
1253#endif
1254