xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/gen/bcm/x86_64-mont-linux.S (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
7.text
8
// bn_mul_mont_nohw — Montgomery multiplication, generic scalar (mulq) path.
// Presumed C signature (standard OpenSSL/BoringSSL bn_mul_mont ABI — confirm
// against the generating Perl script / bn headers):
//   int bn_mul_mont_nohw(BN_ULONG *rp /*rdi*/, const BN_ULONG *ap /*rsi*/,
//                        const BN_ULONG *bp /*rdx*/, const BN_ULONG *np /*rcx*/,
//                        const BN_ULONG *n0 /*r8*/, int num /*r9d*/);
// Computes rp = ap*bp / 2^(64*num) mod np, writing the result in constant
// time (the final subtraction is performed unconditionally and the result is
// selected with a branch-free masked copy in .Lcopy). Returns 1 in %rax.
// SysV AMD64; all six callee-saved GPRs are pushed and later restored via a
// saved frame pointer kept at 8(%rsp,%r9,8).
9.globl	bn_mul_mont_nohw
10.hidden bn_mul_mont_nohw
11.type	bn_mul_mont_nohw,@function
12.align	16
13bn_mul_mont_nohw:
14.cfi_startproc
15_CET_ENDBR
// Zero-extend num; 32-bit write clears the upper half of %r9.
16	movl	%r9d,%r9d
// %rax keeps the original stack pointer; the CFA is retargeted to it so the
// unwinder stays correct across the variable-size allocation below.
17	movq	%rsp,%rax
18.cfi_def_cfa_register	%rax
19	pushq	%rbx
20.cfi_offset	%rbx,-16
21	pushq	%rbp
22.cfi_offset	%rbp,-24
23	pushq	%r12
24.cfi_offset	%r12,-32
25	pushq	%r13
26.cfi_offset	%r13,-40
27	pushq	%r14
28.cfi_offset	%r14,-48
29	pushq	%r15
30.cfi_offset	%r15,-56
31
// Reserve num+2 qwords of scratch below %rsp, rounded to a 1 KiB boundary.
32	negq	%r9
33	movq	%rsp,%r11
34	leaq	-16(%rsp,%r9,8),%r10
35	negq	%r9
36	andq	$-1024,%r10
37
38
39
40
41
42
43
44
45
// Drop %rsp to the new frame; the .Lmul_page_walk loop below touches one
// qword per 4 KiB page so a guard page cannot be skipped (stack probing).
46	subq	%r10,%r11
47	andq	$-4096,%r11
48	leaq	(%r10,%r11,1),%rsp
49	movq	(%rsp),%r11
50	cmpq	%r10,%rsp
51	ja	.Lmul_page_walk
52	jmp	.Lmul_page_walk_done
53
54.align	16
55.Lmul_page_walk:
56	leaq	-4096(%rsp),%rsp
57	movq	(%rsp),%r11
58	cmpq	%r10,%rsp
59	ja	.Lmul_page_walk
60.Lmul_page_walk_done:
61
// Stash the original %rsp (still in %rax) just above the num-qword tp[]
// scratch area; the cfi_escape expression lets the unwinder recover it.
62	movq	%rax,8(%rsp,%r9,8)
63.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
64.Lmul_body:
// Register roles from here on: %r12=bp, %rbx=bp[i], %r8=n0 word, %rbp=m (the
// per-iteration Montgomery multiplier), %r14=outer index i, %r15=inner index j.
65	movq	%rdx,%r12
66	movq	(%r8),%r8
67	movq	(%r12),%rbx
68	movq	(%rsi),%rax
69
70	xorq	%r14,%r14
71	xorq	%r15,%r15
72
// First outer iteration (i=0): tp[] starts implicitly at zero, so this is a
// plain multiply-and-reduce pass; m = tp[0]*n0 mod 2^64.
73	movq	%r8,%rbp
74	mulq	%rbx
75	movq	%rax,%r10
76	movq	(%rcx),%rax
77
78	imulq	%r10,%rbp
79	movq	%rdx,%r11
80
81	mulq	%rbp
82	addq	%rax,%r10
83	movq	8(%rsi),%rax
84	adcq	$0,%rdx
85	movq	%rdx,%r13
86
87	leaq	1(%r15),%r15
88	jmp	.L1st_enter
89
// .L1st: j-loop of the first iteration; interleaves ap[j]*bp[0] and np[j]*m,
// storing tp[j-1]. %r11/%r10 carry the ap-product chain, %r13 the np chain.
90.align	16
91.L1st:
92	addq	%rax,%r13
93	movq	(%rsi,%r15,8),%rax
94	adcq	$0,%rdx
95	addq	%r11,%r13
96	movq	%r10,%r11
97	adcq	$0,%rdx
98	movq	%r13,-16(%rsp,%r15,8)
99	movq	%rdx,%r13
100
101.L1st_enter:
102	mulq	%rbx
103	addq	%rax,%r11
104	movq	(%rcx,%r15,8),%rax
105	adcq	$0,%rdx
106	leaq	1(%r15),%r15
107	movq	%rdx,%r10
108
109	mulq	%rbp
110	cmpq	%r9,%r15
111	jne	.L1st
112
// Tail of the first iteration: fold in the last products and record the
// top carry word at (%rsp,%r9,8) (tp[num]).
113	addq	%rax,%r13
114	movq	(%rsi),%rax
115	adcq	$0,%rdx
116	addq	%r11,%r13
117	adcq	$0,%rdx
118	movq	%r13,-16(%rsp,%r15,8)
119	movq	%rdx,%r13
120	movq	%r10,%r11
121
122	xorq	%rdx,%rdx
123	addq	%r11,%r13
124	adcq	$0,%rdx
125	movq	%r13,-8(%rsp,%r9,8)
126	movq	%rdx,(%rsp,%r9,8)
127
128	leaq	1(%r14),%r14
129	jmp	.Louter
// .Louter: iterations i=1..num-1; same as .L1st but accumulates into the
// existing tp[] (loaded via %r10 from (%rsp,%r15,8)).
130.align	16
131.Louter:
132	movq	(%r12,%r14,8),%rbx
133	xorq	%r15,%r15
134	movq	%r8,%rbp
135	movq	(%rsp),%r10
136	mulq	%rbx
137	addq	%rax,%r10
138	movq	(%rcx),%rax
139	adcq	$0,%rdx
140
141	imulq	%r10,%rbp
142	movq	%rdx,%r11
143
144	mulq	%rbp
145	addq	%rax,%r10
146	movq	8(%rsi),%rax
147	adcq	$0,%rdx
148	movq	8(%rsp),%r10
149	movq	%rdx,%r13
150
151	leaq	1(%r15),%r15
152	jmp	.Linner_enter
153
154.align	16
155.Linner:
156	addq	%rax,%r13
157	movq	(%rsi,%r15,8),%rax
158	adcq	$0,%rdx
159	addq	%r10,%r13
160	movq	(%rsp,%r15,8),%r10
161	adcq	$0,%rdx
162	movq	%r13,-16(%rsp,%r15,8)
163	movq	%rdx,%r13
164
165.Linner_enter:
166	mulq	%rbx
167	addq	%rax,%r11
168	movq	(%rcx,%r15,8),%rax
169	adcq	$0,%rdx
170	addq	%r11,%r10
171	movq	%rdx,%r11
172	adcq	$0,%r11
173	leaq	1(%r15),%r15
174
175	mulq	%rbp
176	cmpq	%r9,%r15
177	jne	.Linner
178
179	addq	%rax,%r13
180	movq	(%rsi),%rax
181	adcq	$0,%rdx
182	addq	%r10,%r13
183	movq	(%rsp,%r15,8),%r10
184	adcq	$0,%rdx
185	movq	%r13,-16(%rsp,%r15,8)
186	movq	%rdx,%r13
187
188	xorq	%rdx,%rdx
189	addq	%r11,%r13
190	adcq	$0,%rdx
191	addq	%r10,%r13
192	adcq	$0,%rdx
193	movq	%r13,-8(%rsp,%r9,8)
194	movq	%rdx,(%rsp,%r9,8)
195
196	leaq	1(%r14),%r14
197	cmpq	%r9,%r14
198	jb	.Louter
199
// Unconditional subtraction: rp = tp - np. The xorq clears CF, and only
// flag-preserving movs/leas sit between it and the first sbbq, so the first
// sbbq acts as a plain sub.
200	xorq	%r14,%r14
201	movq	(%rsp),%rax
202	movq	%r9,%r15
203
204.align	16
205.Lsub:	sbbq	(%rcx,%r14,8),%rax
206	movq	%rax,(%rdi,%r14,8)
207	movq	8(%rsp,%r14,8),%rax
208	leaq	1(%r14),%r14
209	decq	%r15
210	jnz	.Lsub
211
// Fold the tp[num] carry into the final borrow: %rax becomes an all-ones
// mask when the subtraction did NOT underflow (keep rp), %rbx its complement
// (keep tp). Branch-free select keeps this constant-time.
212	sbbq	$0,%rax
213	movq	$-1,%rbx
214	xorq	%rax,%rbx
215	xorq	%r14,%r14
216	movq	%r9,%r15
217
// .Lcopy: rp[j] = (rp[j] & ~mask) | (tp[j] & mask); tp[j] is wiped with %r9
// (a non-secret value) as it is consumed.
218.Lcopy:
219	movq	(%rdi,%r14,8),%rcx
220	movq	(%rsp,%r14,8),%rdx
221	andq	%rbx,%rcx
222	andq	%rax,%rdx
223	movq	%r9,(%rsp,%r14,8)
224	orq	%rcx,%rdx
225	movq	%rdx,(%rdi,%r14,8)
226	leaq	1(%r14),%r14
227	subq	$1,%r15
228	jnz	.Lcopy
229
// Epilogue: recover the original %rsp saved at .Lmul_body, restore the six
// callee-saved registers from just below it, return 1.
230	movq	8(%rsp,%r9,8),%rsi
231.cfi_def_cfa	%rsi,8
232	movq	$1,%rax
233	movq	-48(%rsi),%r15
234.cfi_restore	%r15
235	movq	-40(%rsi),%r14
236.cfi_restore	%r14
237	movq	-32(%rsi),%r13
238.cfi_restore	%r13
239	movq	-24(%rsi),%r12
240.cfi_restore	%r12
241	movq	-16(%rsi),%rbp
242.cfi_restore	%rbp
243	movq	-8(%rsi),%rbx
244.cfi_restore	%rbx
245	leaq	(%rsi),%rsp
246.cfi_def_cfa_register	%rsp
247.Lmul_epilogue:
248	ret
249.cfi_endproc
250.size	bn_mul_mont_nohw,.-bn_mul_mont_nohw
// bn_mul4x_mont — Montgomery multiplication with the inner loop unrolled
// 4 limbs per pass (presumably requires num to be a multiple of 4 — the loop
// structure assumes it; confirm against the caller's dispatch logic).
// Same presumed contract as bn_mul_mont_nohw: rdi=rp, rsi=ap, rdx=bp,
// rcx=np, r8=&n0, r9d=num; returns 1. Within the loops %rdi is repurposed as
// an accumulator, so the original rp is parked at 16(%rsp,%r9,8).
251.globl	bn_mul4x_mont
252.hidden bn_mul4x_mont
253.type	bn_mul4x_mont,@function
254.align	16
255bn_mul4x_mont:
256.cfi_startproc
257_CET_ENDBR
258	movl	%r9d,%r9d
259	movq	%rsp,%rax
260.cfi_def_cfa_register	%rax
261	pushq	%rbx
262.cfi_offset	%rbx,-16
263	pushq	%rbp
264.cfi_offset	%rbp,-24
265	pushq	%r12
266.cfi_offset	%r12,-32
267	pushq	%r13
268.cfi_offset	%r13,-40
269	pushq	%r14
270.cfi_offset	%r14,-48
271	pushq	%r15
272.cfi_offset	%r15,-56
273
// Allocate num+4 qwords of scratch (note -32 vs -16 in the nohw version:
// one extra slot holds the saved rp), 1 KiB-aligned, then probe each page.
274	negq	%r9
275	movq	%rsp,%r11
276	leaq	-32(%rsp,%r9,8),%r10
277	negq	%r9
278	andq	$-1024,%r10
279
280	subq	%r10,%r11
281	andq	$-4096,%r11
282	leaq	(%r10,%r11,1),%rsp
283	movq	(%rsp),%r11
284	cmpq	%r10,%rsp
285	ja	.Lmul4x_page_walk
286	jmp	.Lmul4x_page_walk_done
287
288.Lmul4x_page_walk:
289	leaq	-4096(%rsp),%rsp
290	movq	(%rsp),%r11
291	cmpq	%r10,%rsp
292	ja	.Lmul4x_page_walk
293.Lmul4x_page_walk_done:
294
// Save original %rsp and rp above the tp[] scratch area.
295	movq	%rax,8(%rsp,%r9,8)
296.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
297.Lmul4x_body:
298	movq	%rdi,16(%rsp,%r9,8)
// Roles: %r12=bp, %rbx=bp[i], %r8=n0 word, %rbp=m, %r14=i, %r15=j (steps by
// 4), %r10/%r11 = ap-product chain, %r13/%rdi = np-product chain.
299	movq	%rdx,%r12
300	movq	(%r8),%r8
301	movq	(%r12),%rbx
302	movq	(%rsi),%rax
303
304	xorq	%r14,%r14
305	xorq	%r15,%r15
306
// First outer iteration (i=0): compute m and the first two limbs before
// entering the 4-way unrolled .L1st4x loop.
307	movq	%r8,%rbp
308	mulq	%rbx
309	movq	%rax,%r10
310	movq	(%rcx),%rax
311
312	imulq	%r10,%rbp
313	movq	%rdx,%r11
314
315	mulq	%rbp
316	addq	%rax,%r10
317	movq	8(%rsi),%rax
318	adcq	$0,%rdx
319	movq	%rdx,%rdi
320
321	mulq	%rbx
322	addq	%rax,%r11
323	movq	8(%rcx),%rax
324	adcq	$0,%rdx
325	movq	%rdx,%r10
326
327	mulq	%rbp
328	addq	%rax,%rdi
329	movq	16(%rsi),%rax
330	adcq	$0,%rdx
331	addq	%r11,%rdi
332	leaq	4(%r15),%r15
333	adcq	$0,%rdx
334	movq	%rdi,(%rsp)
335	movq	%rdx,%r13
336	jmp	.L1st4x
// .L1st4x: four interleaved mulq pairs (ap[j]*bp[0], np[j]*m) per pass,
// storing tp[j-3..j] relative to the post-increment %r15.
337.align	16
338.L1st4x:
339	mulq	%rbx
340	addq	%rax,%r10
341	movq	-16(%rcx,%r15,8),%rax
342	adcq	$0,%rdx
343	movq	%rdx,%r11
344
345	mulq	%rbp
346	addq	%rax,%r13
347	movq	-8(%rsi,%r15,8),%rax
348	adcq	$0,%rdx
349	addq	%r10,%r13
350	adcq	$0,%rdx
351	movq	%r13,-24(%rsp,%r15,8)
352	movq	%rdx,%rdi
353
354	mulq	%rbx
355	addq	%rax,%r11
356	movq	-8(%rcx,%r15,8),%rax
357	adcq	$0,%rdx
358	movq	%rdx,%r10
359
360	mulq	%rbp
361	addq	%rax,%rdi
362	movq	(%rsi,%r15,8),%rax
363	adcq	$0,%rdx
364	addq	%r11,%rdi
365	adcq	$0,%rdx
366	movq	%rdi,-16(%rsp,%r15,8)
367	movq	%rdx,%r13
368
369	mulq	%rbx
370	addq	%rax,%r10
371	movq	(%rcx,%r15,8),%rax
372	adcq	$0,%rdx
373	movq	%rdx,%r11
374
375	mulq	%rbp
376	addq	%rax,%r13
377	movq	8(%rsi,%r15,8),%rax
378	adcq	$0,%rdx
379	addq	%r10,%r13
380	adcq	$0,%rdx
381	movq	%r13,-8(%rsp,%r15,8)
382	movq	%rdx,%rdi
383
384	mulq	%rbx
385	addq	%rax,%r11
386	movq	8(%rcx,%r15,8),%rax
387	adcq	$0,%rdx
388	leaq	4(%r15),%r15
389	movq	%rdx,%r10
390
391	mulq	%rbp
392	addq	%rax,%rdi
393	movq	-16(%rsi,%r15,8),%rax
394	adcq	$0,%rdx
395	addq	%r11,%rdi
396	adcq	$0,%rdx
397	movq	%rdi,-32(%rsp,%r15,8)
398	movq	%rdx,%r13
399	cmpq	%r9,%r15
400	jb	.L1st4x
401
// Tail of the first iteration: last two limb pairs plus the carry word
// stored at (%rsp,%r15,8) (== tp[num]).
402	mulq	%rbx
403	addq	%rax,%r10
404	movq	-16(%rcx,%r15,8),%rax
405	adcq	$0,%rdx
406	movq	%rdx,%r11
407
408	mulq	%rbp
409	addq	%rax,%r13
410	movq	-8(%rsi,%r15,8),%rax
411	adcq	$0,%rdx
412	addq	%r10,%r13
413	adcq	$0,%rdx
414	movq	%r13,-24(%rsp,%r15,8)
415	movq	%rdx,%rdi
416
417	mulq	%rbx
418	addq	%rax,%r11
419	movq	-8(%rcx,%r15,8),%rax
420	adcq	$0,%rdx
421	movq	%rdx,%r10
422
423	mulq	%rbp
424	addq	%rax,%rdi
425	movq	(%rsi),%rax
426	adcq	$0,%rdx
427	addq	%r11,%rdi
428	adcq	$0,%rdx
429	movq	%rdi,-16(%rsp,%r15,8)
430	movq	%rdx,%r13
431
432	xorq	%rdi,%rdi
433	addq	%r10,%r13
434	adcq	$0,%rdi
435	movq	%r13,-8(%rsp,%r15,8)
436	movq	%rdi,(%rsp,%r15,8)
437
438	leaq	1(%r14),%r14
// .Louter4x: iterations i=1..num-1, accumulating into the existing tp[]
// (the extra addq ...(%rsp,%r15,8) loads vs .L1st4x).
439.align	4
440.Louter4x:
441	movq	(%r12,%r14,8),%rbx
442	xorq	%r15,%r15
443	movq	(%rsp),%r10
444	movq	%r8,%rbp
445	mulq	%rbx
446	addq	%rax,%r10
447	movq	(%rcx),%rax
448	adcq	$0,%rdx
449
450	imulq	%r10,%rbp
451	movq	%rdx,%r11
452
453	mulq	%rbp
454	addq	%rax,%r10
455	movq	8(%rsi),%rax
456	adcq	$0,%rdx
457	movq	%rdx,%rdi
458
459	mulq	%rbx
460	addq	%rax,%r11
461	movq	8(%rcx),%rax
462	adcq	$0,%rdx
463	addq	8(%rsp),%r11
464	adcq	$0,%rdx
465	movq	%rdx,%r10
466
467	mulq	%rbp
468	addq	%rax,%rdi
469	movq	16(%rsi),%rax
470	adcq	$0,%rdx
471	addq	%r11,%rdi
472	leaq	4(%r15),%r15
473	adcq	$0,%rdx
474	movq	%rdi,(%rsp)
475	movq	%rdx,%r13
476	jmp	.Linner4x
477.align	16
478.Linner4x:
479	mulq	%rbx
480	addq	%rax,%r10
481	movq	-16(%rcx,%r15,8),%rax
482	adcq	$0,%rdx
483	addq	-16(%rsp,%r15,8),%r10
484	adcq	$0,%rdx
485	movq	%rdx,%r11
486
487	mulq	%rbp
488	addq	%rax,%r13
489	movq	-8(%rsi,%r15,8),%rax
490	adcq	$0,%rdx
491	addq	%r10,%r13
492	adcq	$0,%rdx
493	movq	%r13,-24(%rsp,%r15,8)
494	movq	%rdx,%rdi
495
496	mulq	%rbx
497	addq	%rax,%r11
498	movq	-8(%rcx,%r15,8),%rax
499	adcq	$0,%rdx
500	addq	-8(%rsp,%r15,8),%r11
501	adcq	$0,%rdx
502	movq	%rdx,%r10
503
504	mulq	%rbp
505	addq	%rax,%rdi
506	movq	(%rsi,%r15,8),%rax
507	adcq	$0,%rdx
508	addq	%r11,%rdi
509	adcq	$0,%rdx
510	movq	%rdi,-16(%rsp,%r15,8)
511	movq	%rdx,%r13
512
513	mulq	%rbx
514	addq	%rax,%r10
515	movq	(%rcx,%r15,8),%rax
516	adcq	$0,%rdx
517	addq	(%rsp,%r15,8),%r10
518	adcq	$0,%rdx
519	movq	%rdx,%r11
520
521	mulq	%rbp
522	addq	%rax,%r13
523	movq	8(%rsi,%r15,8),%rax
524	adcq	$0,%rdx
525	addq	%r10,%r13
526	adcq	$0,%rdx
527	movq	%r13,-8(%rsp,%r15,8)
528	movq	%rdx,%rdi
529
530	mulq	%rbx
531	addq	%rax,%r11
532	movq	8(%rcx,%r15,8),%rax
533	adcq	$0,%rdx
534	addq	8(%rsp,%r15,8),%r11
535	adcq	$0,%rdx
536	leaq	4(%r15),%r15
537	movq	%rdx,%r10
538
539	mulq	%rbp
540	addq	%rax,%rdi
541	movq	-16(%rsi,%r15,8),%rax
542	adcq	$0,%rdx
543	addq	%r11,%rdi
544	adcq	$0,%rdx
545	movq	%rdi,-32(%rsp,%r15,8)
546	movq	%rdx,%r13
547	cmpq	%r9,%r15
548	jb	.Linner4x
549
550	mulq	%rbx
551	addq	%rax,%r10
552	movq	-16(%rcx,%r15,8),%rax
553	adcq	$0,%rdx
554	addq	-16(%rsp,%r15,8),%r10
555	adcq	$0,%rdx
556	movq	%rdx,%r11
557
558	mulq	%rbp
559	addq	%rax,%r13
560	movq	-8(%rsi,%r15,8),%rax
561	adcq	$0,%rdx
562	addq	%r10,%r13
563	adcq	$0,%rdx
564	movq	%r13,-24(%rsp,%r15,8)
565	movq	%rdx,%rdi
566
567	mulq	%rbx
568	addq	%rax,%r11
569	movq	-8(%rcx,%r15,8),%rax
570	adcq	$0,%rdx
571	addq	-8(%rsp,%r15,8),%r11
572	adcq	$0,%rdx
573	leaq	1(%r14),%r14
574	movq	%rdx,%r10
575
576	mulq	%rbp
577	addq	%rax,%rdi
578	movq	(%rsi),%rax
579	adcq	$0,%rdx
580	addq	%r11,%rdi
581	adcq	$0,%rdx
582	movq	%rdi,-16(%rsp,%r15,8)
583	movq	%rdx,%r13
584
585	xorq	%rdi,%rdi
586	addq	%r10,%r13
587	adcq	$0,%rdi
588	addq	(%rsp,%r9,8),%r13
589	adcq	$0,%rdi
590	movq	%r13,-8(%rsp,%r15,8)
591	movq	%rdi,(%rsp,%r15,8)
592
593	cmpq	%r9,%r14
594	jb	.Louter4x
// Reduction done. Recover rp into %rdi and subtract np from tp four limbs
// per pass; the leading subq/sbbq pair seeds the borrow chain.
595	movq	16(%rsp,%r9,8),%rdi
596	leaq	-4(%r9),%r15
597	movq	0(%rsp),%rax
598	movq	8(%rsp),%rdx
599	shrq	$2,%r15
600	leaq	(%rsp),%rsi
601	xorq	%r14,%r14
602
603	subq	0(%rcx),%rax
604	movq	16(%rsi),%rbx
605	movq	24(%rsi),%rbp
606	sbbq	8(%rcx),%rdx
607
608.Lsub4x:
609	movq	%rax,0(%rdi,%r14,8)
610	movq	%rdx,8(%rdi,%r14,8)
611	sbbq	16(%rcx,%r14,8),%rbx
612	movq	32(%rsi,%r14,8),%rax
613	movq	40(%rsi,%r14,8),%rdx
614	sbbq	24(%rcx,%r14,8),%rbp
615	movq	%rbx,16(%rdi,%r14,8)
616	movq	%rbp,24(%rdi,%r14,8)
617	sbbq	32(%rcx,%r14,8),%rax
618	movq	48(%rsi,%r14,8),%rbx
619	movq	56(%rsi,%r14,8),%rbp
620	sbbq	40(%rcx,%r14,8),%rdx
621	leaq	4(%r14),%r14
622	decq	%r15
623	jnz	.Lsub4x
624
625	movq	%rax,0(%rdi,%r14,8)
626	movq	32(%rsi,%r14,8),%rax
627	sbbq	16(%rcx,%r14,8),%rbx
628	movq	%rdx,8(%rdi,%r14,8)
629	sbbq	24(%rcx,%r14,8),%rbp
630	movq	%rbx,16(%rdi,%r14,8)
631
// Fold tp[num] into the borrow; build SSE select masks. The .byte sequence
// decodes as movq %rax,%xmm4 (hand-encoded, presumably for old-assembler
// compatibility — confirm against the Perl source); xmm4 = borrow mask,
// xmm5 = its complement.
632	sbbq	$0,%rax
633	movq	%rbp,24(%rdi,%r14,8)
634	pxor	%xmm0,%xmm0
635.byte	102,72,15,110,224
636	pcmpeqd	%xmm5,%xmm5
637	pshufd	$0,%xmm4,%xmm4
638	movq	%r9,%r15
639	pxor	%xmm4,%xmm5
640	shrq	$2,%r15
641	xorl	%eax,%eax
642
643	jmp	.Lcopy4x
// .Lcopy4x: branch-free select of rp vs tp, 32 bytes per pass, wiping the
// tp[] scratch with zeros (%xmm0) as it goes.
644.align	16
645.Lcopy4x:
646	movdqa	(%rsp,%rax,1),%xmm1
647	movdqu	(%rdi,%rax,1),%xmm2
648	pand	%xmm4,%xmm1
649	pand	%xmm5,%xmm2
650	movdqa	16(%rsp,%rax,1),%xmm3
651	movdqa	%xmm0,(%rsp,%rax,1)
652	por	%xmm2,%xmm1
653	movdqu	16(%rdi,%rax,1),%xmm2
654	movdqu	%xmm1,(%rdi,%rax,1)
655	pand	%xmm4,%xmm3
656	pand	%xmm5,%xmm2
657	movdqa	%xmm0,16(%rsp,%rax,1)
658	por	%xmm2,%xmm3
659	movdqu	%xmm3,16(%rdi,%rax,1)
660	leaq	32(%rax),%rax
661	decq	%r15
662	jnz	.Lcopy4x
// Epilogue: restore saved %rsp and callee-saved registers, return 1.
663	movq	8(%rsp,%r9,8),%rsi
664.cfi_def_cfa	%rsi, 8
665	movq	$1,%rax
666	movq	-48(%rsi),%r15
667.cfi_restore	%r15
668	movq	-40(%rsi),%r14
669.cfi_restore	%r14
670	movq	-32(%rsi),%r13
671.cfi_restore	%r13
672	movq	-24(%rsi),%r12
673.cfi_restore	%r12
674	movq	-16(%rsi),%rbp
675.cfi_restore	%rbp
676	movq	-8(%rsi),%rbx
677.cfi_restore	%rbx
678	leaq	(%rsi),%rsp
679.cfi_def_cfa_register	%rsp
680.Lmul4x_epilogue:
681	ret
682.cfi_endproc
683.size	bn_mul4x_mont,.-bn_mul4x_mont
684.extern	bn_sqrx8x_internal
685.hidden bn_sqrx8x_internal
686.extern	bn_sqr8x_internal
687.hidden bn_sqr8x_internal
688
// bn_sqr8x_mont — Montgomery squaring wrapper. Sets up a 2*num-qword frame
// and dispatches to bn_sqr8x_internal or (when the flag passed in %rdx is
// non-zero, presumably "have BMI2/ADX" — confirm at call sites) to
// bn_sqrx8x_internal, both defined elsewhere; then performs the final
// conditional reduction here. Same presumed argument layout as the other
// entry points (rdi=rp, rsi=ap, rcx=np, r8=&n0, r9d=num). Returns 1.
689.globl	bn_sqr8x_mont
690.hidden bn_sqr8x_mont
691.type	bn_sqr8x_mont,@function
692.align	32
693bn_sqr8x_mont:
694.cfi_startproc
695_CET_ENDBR
696	movl	%r9d,%r9d
697	movq	%rsp,%rax
698.cfi_def_cfa_register	%rax
699	pushq	%rbx
700.cfi_offset	%rbx,-16
701	pushq	%rbp
702.cfi_offset	%rbp,-24
703	pushq	%r12
704.cfi_offset	%r12,-32
705	pushq	%r13
706.cfi_offset	%r13,-40
707	pushq	%r14
708.cfi_offset	%r14,-48
709	pushq	%r15
710.cfi_offset	%r15,-56
711.Lsqr8x_prologue:
712
// %r9 = num*8 (bytes), %r10 = num*32; the frame needs 2*num qwords + 64.
713	movl	%r9d,%r10d
714	shll	$3,%r9d
715	shlq	$3+2,%r10
716	negq	%r9
717
718
719
720
721
722
// Choose a frame base in %rbp. The %r11 arithmetic nudges the frame so its
// 4 KiB offset differs from ap's (%rsi), avoiding cache-bank/page aliasing
// between the source and the scratch area; .Lsqr8x_sp_alt is the fallback
// when that adjustment would not leave enough room.
723	leaq	-64(%rsp,%r9,2),%r11
724	movq	%rsp,%rbp
725	movq	(%r8),%r8
726	subq	%rsi,%r11
727	andq	$4095,%r11
728	cmpq	%r11,%r10
729	jb	.Lsqr8x_sp_alt
730	subq	%r11,%rbp
731	leaq	-64(%rbp,%r9,2),%rbp
732	jmp	.Lsqr8x_sp_done
733
734.align	32
735.Lsqr8x_sp_alt:
736	leaq	4096-64(,%r9,2),%r10
737	leaq	-64(%rbp,%r9,2),%rbp
738	subq	%r10,%r11
739	movq	$0,%r10
740	cmovcq	%r10,%r11
741	subq	%r11,%rbp
742.Lsqr8x_sp_done:
// Align the frame to 64 bytes and walk %rsp down one page at a time,
// touching each page (stack probe), until it reaches the frame base.
743	andq	$-64,%rbp
744	movq	%rsp,%r11
745	subq	%rbp,%r11
746	andq	$-4096,%r11
747	leaq	(%r11,%rbp,1),%rsp
748	movq	(%rsp),%r10
749	cmpq	%rbp,%rsp
750	ja	.Lsqr8x_page_walk
751	jmp	.Lsqr8x_page_walk_done
752
753.align	16
754.Lsqr8x_page_walk:
755	leaq	-4096(%rsp),%rsp
756	movq	(%rsp),%r10
757	cmpq	%rbp,%rsp
758	ja	.Lsqr8x_page_walk
759.Lsqr8x_page_walk_done:
760
761	movq	%r9,%r10
762	negq	%r9
763
// Frame slots: 32(%rsp)=n0 word, 40(%rsp)=original %rsp (for unwind/return).
764	movq	%r8,32(%rsp)
765	movq	%rax,40(%rsp)
766.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
767.Lsqr8x_body:
768
// Hand-encoded GPR->XMM moves (old-assembler compat): the .byte lines decode
// as movq %rcx,%xmm2 / movq %rdi,%xmm1 / movq %r10,%xmm3, parking np, rp and
// num*8 across the internal call — confirm against the Perl source.
769.byte	102,72,15,110,209
770	pxor	%xmm0,%xmm0
771.byte	102,72,15,110,207
772.byte	102,73,15,110,218
773	testq	%rdx,%rdx
774	jz	.Lsqr8x_nox
775
776	call	bn_sqrx8x_internal
777
778
779
780
// After the mulx-based internal: %r8/%rcx hold values established by the
// callee (defined elsewhere, not visible here); set up %rbx (top half of the
// result), %r9/%rdx, recover rp from %xmm1, and loop count = num/4 in %rcx.
781	leaq	(%r8,%rcx,1),%rbx
782	movq	%rcx,%r9
783	movq	%rcx,%rdx
784.byte	102,72,15,126,207
785	sarq	$3+2,%rcx
786	jmp	.Lsqr8x_sub
787
788.align	32
789.Lsqr8x_nox:
790	call	bn_sqr8x_internal
791
792
793
794
795	leaq	(%rdi,%r9,1),%rbx
796	movq	%r9,%rcx
797	movq	%r9,%rdx
798.byte	102,72,15,126,207
799	sarq	$3+2,%rcx
800	jmp	.Lsqr8x_sub
801
// .Lsqr8x_sub: rp = (top half of square) - np, four limbs per pass; %rcx is
// the negative pass count (incq until zero). The borrow chain relies on the
// callee leaving CF clear — behavior of the internal routines, not visible
// in this file.
802.align	32
803.Lsqr8x_sub:
804	movq	0(%rbx),%r12
805	movq	8(%rbx),%r13
806	movq	16(%rbx),%r14
807	movq	24(%rbx),%r15
808	leaq	32(%rbx),%rbx
809	sbbq	0(%rbp),%r12
810	sbbq	8(%rbp),%r13
811	sbbq	16(%rbp),%r14
812	sbbq	24(%rbp),%r15
813	leaq	32(%rbp),%rbp
814	movq	%r12,0(%rdi)
815	movq	%r13,8(%rdi)
816	movq	%r14,16(%rdi)
817	movq	%r15,24(%rdi)
818	leaq	32(%rdi),%rdi
819	incq	%rcx
820	jnz	.Lsqr8x_sub
821
// Turn the final borrow into a lane-replicated mask in %xmm1 (via the
// hand-encoded movq %rax,%xmm1 at .byte 102,72,15,110,200), rewind %rbx/%rdi
// to the start of their ranges, and recover the original %rsp.
822	sbbq	$0,%rax
823	leaq	(%rbx,%r9,1),%rbx
824	leaq	(%rdi,%r9,1),%rdi
825
826.byte	102,72,15,110,200
827	pxor	%xmm0,%xmm0
828	pshufd	$0,%xmm1,%xmm1
829	movq	40(%rsp),%rsi
830.cfi_def_cfa	%rsi,8
831	jmp	.Lsqr8x_cond_copy
832
// .Lsqr8x_cond_copy: branch-free select between the unreduced (%rbx) and
// reduced (%rdi) halves, 32 bytes per pass, zeroing both scratch copies;
// %r9 counts up from -num*8 to 0.
833.align	32
834.Lsqr8x_cond_copy:
835	movdqa	0(%rbx),%xmm2
836	movdqa	16(%rbx),%xmm3
837	leaq	32(%rbx),%rbx
838	movdqu	0(%rdi),%xmm4
839	movdqu	16(%rdi),%xmm5
840	leaq	32(%rdi),%rdi
841	movdqa	%xmm0,-32(%rbx)
842	movdqa	%xmm0,-16(%rbx)
843	movdqa	%xmm0,-32(%rbx,%rdx,1)
844	movdqa	%xmm0,-16(%rbx,%rdx,1)
845	pcmpeqd	%xmm1,%xmm0
846	pand	%xmm1,%xmm2
847	pand	%xmm1,%xmm3
848	pand	%xmm0,%xmm4
849	pand	%xmm0,%xmm5
850	pxor	%xmm0,%xmm0
851	por	%xmm2,%xmm4
852	por	%xmm3,%xmm5
853	movdqu	%xmm4,-32(%rdi)
854	movdqu	%xmm5,-16(%rdi)
855	addq	$32,%r9
856	jnz	.Lsqr8x_cond_copy
857
// Epilogue: restore callee-saved registers from below the saved %rsp,
// return 1.
858	movq	$1,%rax
859	movq	-48(%rsi),%r15
860.cfi_restore	%r15
861	movq	-40(%rsi),%r14
862.cfi_restore	%r14
863	movq	-32(%rsi),%r13
864.cfi_restore	%r13
865	movq	-24(%rsi),%r12
866.cfi_restore	%r12
867	movq	-16(%rsi),%rbp
868.cfi_restore	%rbp
869	movq	-8(%rsi),%rbx
870.cfi_restore	%rbx
871	leaq	(%rsi),%rsp
872.cfi_def_cfa_register	%rsp
873.Lsqr8x_epilogue:
874	ret
875.cfi_endproc
876.size	bn_sqr8x_mont,.-bn_sqr8x_mont
// bn_mulx4x_mont — Montgomery multiplication using BMI2 mulx plus ADX
// adcx/adox, which lets the ap-product and np-product carry chains run on
// the CF and OF flags simultaneously. Same presumed contract as the other
// entry points (rdi=rp, rsi=ap, rdx=bp, rcx=np, r8=&n0, r9d=num); caller is
// presumed to have verified BMI2+ADX support. Returns 1.
877.globl	bn_mulx4x_mont
878.hidden bn_mulx4x_mont
879.type	bn_mulx4x_mont,@function
880.align	32
881bn_mulx4x_mont:
882.cfi_startproc
883_CET_ENDBR
884	movq	%rsp,%rax
885.cfi_def_cfa_register	%rax
886	pushq	%rbx
887.cfi_offset	%rbx,-16
888	pushq	%rbp
889.cfi_offset	%rbp,-24
890	pushq	%r12
891.cfi_offset	%r12,-32
892	pushq	%r13
893.cfi_offset	%r13,-40
894	pushq	%r14
895.cfi_offset	%r14,-48
896	pushq	%r15
897.cfi_offset	%r15,-56
898.Lmulx4x_prologue:
899
// Frame: num*8 bytes of tp[] plus a 72-byte header, 128-byte aligned, then
// page-by-page stack probing as in the other variants.
900	shll	$3,%r9d
901	xorq	%r10,%r10
902	subq	%r9,%r10
903	movq	(%r8),%r8
904	leaq	-72(%rsp,%r10,1),%rbp
905	andq	$-128,%rbp
906	movq	%rsp,%r11
907	subq	%rbp,%r11
908	andq	$-4096,%r11
909	leaq	(%r11,%rbp,1),%rsp
910	movq	(%rsp),%r10
911	cmpq	%rbp,%rsp
912	ja	.Lmulx4x_page_walk
913	jmp	.Lmulx4x_page_walk_done
914
915.align	16
916.Lmulx4x_page_walk:
917	leaq	-4096(%rsp),%rsp
918	movq	(%rsp),%r10
919	cmpq	%rbp,%rsp
920	ja	.Lmulx4x_page_walk
921.Lmulx4x_page_walk_done:
922
923	leaq	(%rdx,%r9,1),%r10
924
925
926
927
928
929
930
931
932
933
934
935
// Frame header layout (offsets from %rsp):
//   0: num*8   8: current bp pointer   16: end of bp (&bp[num])
//  24: n0 word 32: rp                  40: original %rsp
//  48: inner-loop trip count (num/32 - 1, i.e. (num/4 - 1) 4-limb passes)
936	movq	%r9,0(%rsp)
937	shrq	$5,%r9
938	movq	%r10,16(%rsp)
939	subq	$1,%r9
940	movq	%r8,24(%rsp)
941	movq	%rdi,32(%rsp)
942	movq	%rax,40(%rsp)
943.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
944	movq	%r9,48(%rsp)
945	jmp	.Lmulx4x_body
946
// First outer iteration. Roles: %rdx is the implicit mulx multiplicand
// (bp[0], then m), %r9 caches bp[i], %r8 = m = tp[0]*n0, %rbx points into
// tp[], %rbp is a persistent zero used to close carry chains.
947.align	32
948.Lmulx4x_body:
949	leaq	8(%rdx),%rdi
950	movq	(%rdx),%rdx
951	leaq	64+32(%rsp),%rbx
952	movq	%rdx,%r9
953
954	mulxq	0(%rsi),%r8,%rax
955	mulxq	8(%rsi),%r11,%r14
956	addq	%rax,%r11
957	movq	%rdi,8(%rsp)
958	mulxq	16(%rsi),%r12,%r13
959	adcq	%r14,%r12
960	adcq	$0,%r13
961
962	movq	%r8,%rdi
963	imulq	24(%rsp),%r8
964	xorq	%rbp,%rbp
965
966	mulxq	24(%rsi),%rax,%r14
967	movq	%r8,%rdx
968	leaq	32(%rsi),%rsi
969	adcxq	%rax,%r13
970	adcxq	%rbp,%r14
971
972	mulxq	0(%rcx),%rax,%r10
973	adcxq	%rax,%rdi
974	adoxq	%r11,%r10
975	mulxq	8(%rcx),%rax,%r11
976	adcxq	%rax,%r10
977	adoxq	%r12,%r11
// Hand-encoded mulx (presumably mulxq 16(%rcx),%rax,%r12 — the VEX form is
// emitted as .byte for old-assembler compatibility; confirm against the
// Perl source).
978.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
979	movq	48(%rsp),%rdi
980	movq	%r10,-32(%rbx)
981	adcxq	%rax,%r11
982	adoxq	%r13,%r12
983	mulxq	24(%rcx),%rax,%r15
984	movq	%r9,%rdx
985	movq	%r11,-24(%rbx)
986	adcxq	%rax,%r12
987	adoxq	%rbp,%r15
988	leaq	32(%rcx),%rcx
989	movq	%r12,-16(%rbx)
990
991	jmp	.Lmulx4x_1st
992
// .Lmulx4x_1st: four limbs per pass; CF chain (adcx) folds the ap products,
// OF chain (adox) folds the np products; %rdi counts passes down to zero.
993.align	32
994.Lmulx4x_1st:
995	adcxq	%rbp,%r15
996	mulxq	0(%rsi),%r10,%rax
997	adcxq	%r14,%r10
998	mulxq	8(%rsi),%r11,%r14
999	adcxq	%rax,%r11
1000	mulxq	16(%rsi),%r12,%rax
1001	adcxq	%r14,%r12
1002	mulxq	24(%rsi),%r13,%r14
1003.byte	0x67,0x67
1004	movq	%r8,%rdx
1005	adcxq	%rax,%r13
1006	adcxq	%rbp,%r14
1007	leaq	32(%rsi),%rsi
1008	leaq	32(%rbx),%rbx
1009
1010	adoxq	%r15,%r10
1011	mulxq	0(%rcx),%rax,%r15
1012	adcxq	%rax,%r10
1013	adoxq	%r15,%r11
1014	mulxq	8(%rcx),%rax,%r15
1015	adcxq	%rax,%r11
1016	adoxq	%r15,%r12
1017	mulxq	16(%rcx),%rax,%r15
1018	movq	%r10,-40(%rbx)
1019	adcxq	%rax,%r12
1020	movq	%r11,-32(%rbx)
1021	adoxq	%r15,%r13
1022	mulxq	24(%rcx),%rax,%r15
1023	movq	%r9,%rdx
1024	movq	%r12,-24(%rbx)
1025	adcxq	%rax,%r13
1026	adoxq	%rbp,%r15
1027	leaq	32(%rcx),%rcx
1028	movq	%r13,-16(%rbx)
1029
1030	decq	%rdi
1031	jnz	.Lmulx4x_1st
1032
// End of first iteration: capture the top carry in %r15 (as a full-width
// 0/-1 via sbbq) and store the final limb.
1033	movq	0(%rsp),%rax
1034	movq	8(%rsp),%rdi
1035	adcq	%rbp,%r15
1036	addq	%r15,%r14
1037	sbbq	%r15,%r15
1038	movq	%r14,-8(%rbx)
1039	jmp	.Lmulx4x_outer
1040
// .Lmulx4x_outer: iterations i=1..num-1. %rsi/%rcx are rewound by num*8
// (in %rax) each pass; tp[] values are folded in via the adcx/adox loads.
1041.align	32
1042.Lmulx4x_outer:
1043	movq	(%rdi),%rdx
1044	leaq	8(%rdi),%rdi
1045	subq	%rax,%rsi
1046	movq	%r15,(%rbx)
1047	leaq	64+32(%rsp),%rbx
1048	subq	%rax,%rcx
1049
1050	mulxq	0(%rsi),%r8,%r11
1051	xorl	%ebp,%ebp
1052	movq	%rdx,%r9
1053	mulxq	8(%rsi),%r14,%r12
1054	adoxq	-32(%rbx),%r8
1055	adcxq	%r14,%r11
1056	mulxq	16(%rsi),%r15,%r13
1057	adoxq	-24(%rbx),%r11
1058	adcxq	%r15,%r12
1059	adoxq	-16(%rbx),%r12
1060	adcxq	%rbp,%r13
1061	adoxq	%rbp,%r13
1062
1063	movq	%rdi,8(%rsp)
1064	movq	%r8,%r15
1065	imulq	24(%rsp),%r8
// xorl %ebp,%ebp zeroes %rbp and clears both CF and OF, resetting the two
// carry chains for the reduction half.
1066	xorl	%ebp,%ebp
1067
1068	mulxq	24(%rsi),%rax,%r14
1069	movq	%r8,%rdx
1070	adcxq	%rax,%r13
1071	adoxq	-8(%rbx),%r13
1072	adcxq	%rbp,%r14
1073	leaq	32(%rsi),%rsi
1074	adoxq	%rbp,%r14
1075
1076	mulxq	0(%rcx),%rax,%r10
1077	adcxq	%rax,%r15
1078	adoxq	%r11,%r10
1079	mulxq	8(%rcx),%rax,%r11
1080	adcxq	%rax,%r10
1081	adoxq	%r12,%r11
1082	mulxq	16(%rcx),%rax,%r12
1083	movq	%r10,-32(%rbx)
1084	adcxq	%rax,%r11
1085	adoxq	%r13,%r12
1086	mulxq	24(%rcx),%rax,%r15
1087	movq	%r9,%rdx
1088	movq	%r11,-24(%rbx)
1089	leaq	32(%rcx),%rcx
1090	adcxq	%rax,%r12
1091	adoxq	%rbp,%r15
1092	movq	48(%rsp),%rdi
1093	movq	%r12,-16(%rbx)
1094
1095	jmp	.Lmulx4x_inner
1096
// .Lmulx4x_inner: same dual-chain 4-limb pass as .Lmulx4x_1st, plus the
// adcx loads from tp[] (0/8/16/24(%rbx)) to accumulate the previous row.
1097.align	32
1098.Lmulx4x_inner:
1099	mulxq	0(%rsi),%r10,%rax
1100	adcxq	%rbp,%r15
1101	adoxq	%r14,%r10
1102	mulxq	8(%rsi),%r11,%r14
1103	adcxq	0(%rbx),%r10
1104	adoxq	%rax,%r11
1105	mulxq	16(%rsi),%r12,%rax
1106	adcxq	8(%rbx),%r11
1107	adoxq	%r14,%r12
1108	mulxq	24(%rsi),%r13,%r14
1109	movq	%r8,%rdx
1110	adcxq	16(%rbx),%r12
1111	adoxq	%rax,%r13
1112	adcxq	24(%rbx),%r13
1113	adoxq	%rbp,%r14
1114	leaq	32(%rsi),%rsi
1115	leaq	32(%rbx),%rbx
1116	adcxq	%rbp,%r14
1117
1118	adoxq	%r15,%r10
1119	mulxq	0(%rcx),%rax,%r15
1120	adcxq	%rax,%r10
1121	adoxq	%r15,%r11
1122	mulxq	8(%rcx),%rax,%r15
1123	adcxq	%rax,%r11
1124	adoxq	%r15,%r12
1125	mulxq	16(%rcx),%rax,%r15
1126	movq	%r10,-40(%rbx)
1127	adcxq	%rax,%r12
1128	adoxq	%r15,%r13
1129	mulxq	24(%rcx),%rax,%r15
1130	movq	%r9,%rdx
1131	movq	%r11,-32(%rbx)
1132	movq	%r12,-24(%rbx)
1133	adcxq	%rax,%r13
1134	adoxq	%rbp,%r15
1135	leaq	32(%rcx),%rcx
1136	movq	%r13,-16(%rbx)
1137
1138	decq	%rdi
1139	jnz	.Lmulx4x_inner
1140
// End of outer pass: fold the previous top word (at (%rbx)) and the new
// carry into %r14/%r15; loop until the bp cursor reaches 16(%rsp) = &bp[num].
1141	movq	0(%rsp),%rax
1142	movq	8(%rsp),%rdi
1143	adcq	%rbp,%r15
1144	subq	0(%rbx),%rbp
1145	adcq	%r15,%r14
1146	sbbq	%r15,%r15
1147	movq	%r14,-8(%rbx)
1148
1149	cmpq	16(%rsp),%rdi
1150	jne	.Lmulx4x_outer
1151
// Final reduction: tp - np into rp, four limbs per pass; %r15 holds the
// incoming top borrow (negated so the sbbq chain starts correctly).
1152	leaq	64(%rsp),%rbx
1153	subq	%rax,%rcx
1154	negq	%r15
1155	movq	%rax,%rdx
1156	shrq	$3+2,%rax
1157	movq	32(%rsp),%rdi
1158	jmp	.Lmulx4x_sub
1159
1160.align	32
1161.Lmulx4x_sub:
1162	movq	0(%rbx),%r11
1163	movq	8(%rbx),%r12
1164	movq	16(%rbx),%r13
1165	movq	24(%rbx),%r14
1166	leaq	32(%rbx),%rbx
1167	sbbq	0(%rcx),%r11
1168	sbbq	8(%rcx),%r12
1169	sbbq	16(%rcx),%r13
1170	sbbq	24(%rcx),%r14
1171	leaq	32(%rcx),%rcx
1172	movq	%r11,0(%rdi)
1173	movq	%r12,8(%rdi)
1174	movq	%r13,16(%rdi)
1175	movq	%r14,24(%rdi)
1176	leaq	32(%rdi),%rdi
1177	decq	%rax
1178	jnz	.Lmulx4x_sub
1179
// Build the select mask from the final borrow (hand-encoded
// movq %r15,%xmm1 at .byte 102,73,15,110,207), rewind rp, recover the
// original %rsp, then do the branch-free conditional copy.
1180	sbbq	$0,%r15
1181	leaq	64(%rsp),%rbx
1182	subq	%rdx,%rdi
1183
1184.byte	102,73,15,110,207
1185	pxor	%xmm0,%xmm0
1186	pshufd	$0,%xmm1,%xmm1
1187	movq	40(%rsp),%rsi
1188.cfi_def_cfa	%rsi,8
1189	jmp	.Lmulx4x_cond_copy
1190
// .Lmulx4x_cond_copy: select tp vs rp 32 bytes per pass, wiping tp[] with
// zeros; %rdx counts down from num*8 to 0.
1191.align	32
1192.Lmulx4x_cond_copy:
1193	movdqa	0(%rbx),%xmm2
1194	movdqa	16(%rbx),%xmm3
1195	leaq	32(%rbx),%rbx
1196	movdqu	0(%rdi),%xmm4
1197	movdqu	16(%rdi),%xmm5
1198	leaq	32(%rdi),%rdi
1199	movdqa	%xmm0,-32(%rbx)
1200	movdqa	%xmm0,-16(%rbx)
1201	pcmpeqd	%xmm1,%xmm0
1202	pand	%xmm1,%xmm2
1203	pand	%xmm1,%xmm3
1204	pand	%xmm0,%xmm4
1205	pand	%xmm0,%xmm5
1206	pxor	%xmm0,%xmm0
1207	por	%xmm2,%xmm4
1208	por	%xmm3,%xmm5
1209	movdqu	%xmm4,-32(%rdi)
1210	movdqu	%xmm5,-16(%rdi)
1211	subq	$32,%rdx
1212	jnz	.Lmulx4x_cond_copy
1213
// %rdx is zero here; this wipes the last scratch word.
1214	movq	%rdx,(%rbx)
1215
// Epilogue: restore callee-saved registers, return 1.
1216	movq	$1,%rax
1217	movq	-48(%rsi),%r15
1218.cfi_restore	%r15
1219	movq	-40(%rsi),%r14
1220.cfi_restore	%r14
1221	movq	-32(%rsi),%r13
1222.cfi_restore	%r13
1223	movq	-24(%rsi),%r12
1224.cfi_restore	%r12
1225	movq	-16(%rsi),%rbp
1226.cfi_restore	%rbp
1227	movq	-8(%rsi),%rbx
1228.cfi_restore	%rbx
1229	leaq	(%rsi),%rsp
1230.cfi_def_cfa_register	%rsp
1231.Lmulx4x_epilogue:
1232	ret
1233.cfi_endproc
1234.size	bn_mulx4x_mont,.-bn_mulx4x_mont
1235.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1236.align	16
1237#endif
1238