xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/gen/bcm/x86_64-mont-apple.S (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
7.text
8
// bn_mul_mont_nohw - Montgomery multiplication processed one 64-bit word per
// inner-loop step, using only MUL/IMUL/ADD/ADC/SBB.
//
// Registers on entry, as the code below uses them:
//   %rdi  rp  - result words are stored here
//   %rsi  a   - first operand words
//   %rdx  b   - second operand words (copied to %r12)
//   %rcx  n   - words that are multiplied by the per-pass factor m
//   %r8       - pointer to one 64-bit constant, called n0 below
//   %r9d  num - number of 64-bit words (zero-extended into %r9)
// NOTE(review): the C prototype is not visible in this file; the names above
// are inferred from how each register is used - confirm against the
// declaration.
//
// tp[] below is the (num+1)-word scratch vector at (%rsp). The routine always
// returns 1 in %rax; %rbx, %rbp and %r12-%r15 are saved and restored.
9.globl	_bn_mul_mont_nohw
10.private_extern _bn_mul_mont_nohw
11
12.p2align	4
13_bn_mul_mont_nohw:
14
15_CET_ENDBR
// Zero-extend num and remember the incoming %rsp in %rax.
16	movl	%r9d,%r9d
17	movq	%rsp,%rax
18
19	pushq	%rbx
20
21	pushq	%rbp
22
23	pushq	%r12
24
25	pushq	%r13
26
27	pushq	%r14
28
29	pushq	%r15
30
31
// %r10 = %rsp - 8*num - 16, rounded down to a multiple of 1024: the new
// stack pointer. %r9 is negated only for this address computation.
32	negq	%r9
33	movq	%rsp,%r11
34	leaq	-16(%rsp,%r9,8),%r10
35	negq	%r9
36	andq	$-1024,%r10
37
38
39
40
41
42
43
44
45
// Lower %rsp to %r10 in steps of at most 4096 bytes, reading one word at
// each step (the value read is discarded).
// NOTE(review): presumably a stack probe so that every page is touched in
// order - confirm.
46	subq	%r10,%r11
47	andq	$-4096,%r11
48	leaq	(%r10,%r11,1),%rsp
49	movq	(%rsp),%r11
50	cmpq	%r10,%rsp
51	ja	L$mul_page_walk
52	jmp	L$mul_page_walk_done
53
54.p2align	4
55L$mul_page_walk:
56	leaq	-4096(%rsp),%rsp
57	movq	(%rsp),%r11
58	cmpq	%r10,%rsp
59	ja	L$mul_page_walk
60L$mul_page_walk_done:
61
// Keep the caller's %rsp in the word just above tp[num].
62	movq	%rax,8(%rsp,%r9,8)
63
64L$mul_body:
// Pass 0. %r12 = b, %r8 = n0, %rbx = b[0], %r14 = outer index, %r15 = inner
// index. m = a[0]*b[0]*n0 (low 64 bits) is kept in %rbp. The low word of
// a[0]*b[0] + n[0]*m is dropped; only its carry (%r13) is carried forward.
65	movq	%rdx,%r12
66	movq	(%r8),%r8
67	movq	(%r12),%rbx
68	movq	(%rsi),%rax
69
70	xorq	%r14,%r14
71	xorq	%r15,%r15
72
73	movq	%r8,%rbp
74	mulq	%rbx
75	movq	%rax,%r10
76	movq	(%rcx),%rax
77
78	imulq	%r10,%rbp
79	movq	%rdx,%r11
80
81	mulq	%rbp
82	addq	%rax,%r10
83	movq	8(%rsi),%rax
84	adcq	$0,%rdx
85	movq	%rdx,%r13
86
87	leaq	1(%r15),%r15
88	jmp	L$1st_enter
89
// For j = 1..num-1 (%r15): add a[j]*b[0] and n[j]*m into the running sum and
// store each finished word one slot lower, at tp[j-1].
90.p2align	4
91L$1st:
92	addq	%rax,%r13
93	movq	(%rsi,%r15,8),%rax
94	adcq	$0,%rdx
95	addq	%r11,%r13
96	movq	%r10,%r11
97	adcq	$0,%rdx
98	movq	%r13,-16(%rsp,%r15,8)
99	movq	%rdx,%r13
100
101L$1st_enter:
102	mulq	%rbx
103	addq	%rax,%r11
104	movq	(%rcx,%r15,8),%rax
105	adcq	$0,%rdx
106	leaq	1(%r15),%r15
107	movq	%rdx,%r10
108
109	mulq	%rbp
110	cmpq	%r9,%r15
111	jne	L$1st
112
// Finish the last column (tp[num-2]) and reload %rax = a[0] for the next
// pass.
113	addq	%rax,%r13
114	movq	(%rsi),%rax
115	adcq	$0,%rdx
116	addq	%r11,%r13
117	adcq	$0,%rdx
118	movq	%r13,-16(%rsp,%r15,8)
119	movq	%rdx,%r13
120	movq	%r10,%r11
121
// Top of the sum: tp[num-1] and the carry-out word tp[num].
122	xorq	%rdx,%rdx
123	addq	%r11,%r13
124	adcq	$0,%rdx
125	movq	%r13,-8(%rsp,%r9,8)
126	movq	%rdx,(%rsp,%r9,8)
127
// %r14 = i, the index of the next word of b.
128	leaq	1(%r14),%r14
129	jmp	L$outer
// Pass i (1 <= i < num): same shape as pass 0, but every column also adds
// the previous tp[]. m = (tp[0] + a[0]*b[i]) * n0 (low 64 bits).
130.p2align	4
131L$outer:
132	movq	(%r12,%r14,8),%rbx
133	xorq	%r15,%r15
134	movq	%r8,%rbp
135	movq	(%rsp),%r10
136	mulq	%rbx
137	addq	%rax,%r10
138	movq	(%rcx),%rax
139	adcq	$0,%rdx
140
141	imulq	%r10,%rbp
142	movq	%rdx,%r11
143
144	mulq	%rbp
145	addq	%rax,%r10
146	movq	8(%rsi),%rax
147	adcq	$0,%rdx
148	movq	8(%rsp),%r10
149	movq	%rdx,%r13
150
151	leaq	1(%r15),%r15
152	jmp	L$inner_enter
153
// For j = 1..num-1: tp[j-1] = low word of tp[j] + a[j]*b[i] + n[j]*m plus
// the carries from the previous column (%r11 and %r13).
154.p2align	4
155L$inner:
156	addq	%rax,%r13
157	movq	(%rsi,%r15,8),%rax
158	adcq	$0,%rdx
159	addq	%r10,%r13
160	movq	(%rsp,%r15,8),%r10
161	adcq	$0,%rdx
162	movq	%r13,-16(%rsp,%r15,8)
163	movq	%rdx,%r13
164
165L$inner_enter:
166	mulq	%rbx
167	addq	%rax,%r11
168	movq	(%rcx,%r15,8),%rax
169	adcq	$0,%rdx
170	addq	%r11,%r10
171	movq	%rdx,%r11
172	adcq	$0,%r11
173	leaq	1(%r15),%r15
174
175	mulq	%rbp
176	cmpq	%r9,%r15
177	jne	L$inner
178
// Last column of this pass; %r10 is loaded with the previous tp[num].
179	addq	%rax,%r13
180	movq	(%rsi),%rax
181	adcq	$0,%rdx
182	addq	%r10,%r13
183	movq	(%rsp,%r15,8),%r10
184	adcq	$0,%rdx
185	movq	%r13,-16(%rsp,%r15,8)
186	movq	%rdx,%r13
187
// New tp[num-1] and tp[num], including the previous tp[num].
188	xorq	%rdx,%rdx
189	addq	%r11,%r13
190	adcq	$0,%rdx
191	addq	%r10,%r13
192	adcq	$0,%rdx
193	movq	%r13,-8(%rsp,%r9,8)
194	movq	%rdx,(%rsp,%r9,8)
195
196	leaq	1(%r14),%r14
197	cmpq	%r9,%r14
198	jb	L$outer
199
// Final step: rp = tp - n, word by word. The xorq also clears CF so the
// first sbbq starts without a borrow; leaq and decq leave CF untouched, so
// the borrow chain survives the loop control.
200	xorq	%r14,%r14
201	movq	(%rsp),%rax
202	movq	%r9,%r15
203
204.p2align	4
205L$sub:	sbbq	(%rcx,%r14,8),%rax
206	movq	%rax,(%rdi,%r14,8)
207	movq	8(%rsp,%r14,8),%rax
208	leaq	1(%r14),%r14
209	decq	%r15
210	jnz	L$sub
211
// %rax = tp[num] minus the final borrow; it is used as a select mask with
// %rbx = ~%rax.
// NOTE(review): the select below is only meaningful when this value is 0 or
// all ones, which depends on operand ranges that cannot be checked here.
212	sbbq	$0,%rax
213	movq	$-1,%rbx
214	xorq	%rax,%rbx
215	xorq	%r14,%r14
216	movq	%r9,%r15
217
// rp[j] = (rp[j] & %rbx) | (tp[j] & %rax), with no data-dependent branch.
// Each tp[j] is overwritten (with the value of num) after it has been read.
218L$copy:
219	movq	(%rdi,%r14,8),%rcx
220	movq	(%rsp,%r14,8),%rdx
221	andq	%rbx,%rcx
222	andq	%rax,%rdx
223	movq	%r9,(%rsp,%r14,8)
224	orq	%rcx,%rdx
225	movq	%rdx,(%rdi,%r14,8)
226	leaq	1(%r14),%r14
227	subq	$1,%r15
228	jnz	L$copy
229
// Epilogue: reload the caller's %rsp, set the return value to 1, restore the
// six callee-saved registers from just below that address, and switch back
// to the caller's stack.
230	movq	8(%rsp,%r9,8),%rsi
231
232	movq	$1,%rax
233	movq	-48(%rsi),%r15
234
235	movq	-40(%rsi),%r14
236
237	movq	-32(%rsi),%r13
238
239	movq	-24(%rsi),%r12
240
241	movq	-16(%rsi),%rbp
242
243	movq	-8(%rsi),%rbx
244
245	leaq	(%rsi),%rsp
246
247L$mul_epilogue:
248	ret
249
250
// bn_mul4x_mont - Montgomery multiplication with the inner loops unrolled to
// four 64-bit words per iteration, using only MUL/IMUL/ADD/ADC/SBB plus SSE2
// for the final masked copy.
//
// Registers on entry, as the code below uses them:
//   %rdi  rp  - result words (saved to the frame; %rdi is reused as an
//               accumulator)
//   %rsi  a   - first operand words
//   %rdx  b   - second operand words (copied to %r12)
//   %rcx  n   - words that are multiplied by the per-pass factor m
//   %r8       - pointer to one 64-bit constant, called n0 below
//   %r9d  num - number of 64-bit words (zero-extended into %r9)
// NOTE(review): the C prototype is not visible in this file; the names above
// are inferred from usage - confirm against the declaration.
// NOTE(review): as written, each pass handles 2 + 4*k + 2 columns with
// k >= 1, i.e. num must be a multiple of 4 and at least 8 - confirm that the
// caller guarantees this.
//
// tp[] below is the (num+1)-word scratch vector at (%rsp). The routine always
// returns 1 in %rax; %rbx, %rbp and %r12-%r15 are saved and restored.
251.globl	_bn_mul4x_mont
252.private_extern _bn_mul4x_mont
253
254.p2align	4
255_bn_mul4x_mont:
256
257_CET_ENDBR
// Zero-extend num and remember the incoming %rsp in %rax.
258	movl	%r9d,%r9d
259	movq	%rsp,%rax
260
261	pushq	%rbx
262
263	pushq	%rbp
264
265	pushq	%r12
266
267	pushq	%r13
268
269	pushq	%r14
270
271	pushq	%r15
272
273
// %r10 = %rsp - 8*num - 32, rounded down to a multiple of 1024: the new
// stack pointer.
274	negq	%r9
275	movq	%rsp,%r11
276	leaq	-32(%rsp,%r9,8),%r10
277	negq	%r9
278	andq	$-1024,%r10
279
// Lower %rsp to %r10 in steps of at most 4096 bytes, reading one word at
// each step (the value read is discarded).
// NOTE(review): presumably a stack probe - confirm.
280	subq	%r10,%r11
281	andq	$-4096,%r11
282	leaq	(%r10,%r11,1),%rsp
283	movq	(%rsp),%r11
284	cmpq	%r10,%rsp
285	ja	L$mul4x_page_walk
286	jmp	L$mul4x_page_walk_done
287
288L$mul4x_page_walk:
289	leaq	-4096(%rsp),%rsp
290	movq	(%rsp),%r11
291	cmpq	%r10,%rsp
292	ja	L$mul4x_page_walk
293L$mul4x_page_walk_done:
294
// Caller's %rsp goes in the word above tp[num]; rp goes in the word after.
295	movq	%rax,8(%rsp,%r9,8)
296
297L$mul4x_body:
298	movq	%rdi,16(%rsp,%r9,8)
// Pass 0. %r12 = b, %r8 = n0, %rbx = b[0]. m = a[0]*b[0]*n0 (low 64 bits) is
// kept in %rbp. Columns 0 and 1 are handled here: the low word of column 0
// is dropped, tp[0] receives column 1, and %r15 is set to 4.
299	movq	%rdx,%r12
300	movq	(%r8),%r8
301	movq	(%r12),%rbx
302	movq	(%rsi),%rax
303
304	xorq	%r14,%r14
305	xorq	%r15,%r15
306
307	movq	%r8,%rbp
308	mulq	%rbx
309	movq	%rax,%r10
310	movq	(%rcx),%rax
311
312	imulq	%r10,%rbp
313	movq	%rdx,%r11
314
315	mulq	%rbp
316	addq	%rax,%r10
317	movq	8(%rsi),%rax
318	adcq	$0,%rdx
319	movq	%rdx,%rdi
320
321	mulq	%rbx
322	addq	%rax,%r11
323	movq	8(%rcx),%rax
324	adcq	$0,%rdx
325	movq	%rdx,%r10
326
327	mulq	%rbp
328	addq	%rax,%rdi
329	movq	16(%rsi),%rax
330	adcq	$0,%rdx
331	addq	%r11,%rdi
332	leaq	4(%r15),%r15
333	adcq	$0,%rdx
334	movq	%rdi,(%rsp)
335	movq	%rdx,%r13
336	jmp	L$1st4x
// Four columns per iteration. Each pair of mulq instructions adds one
// a[j]*b[0] product (carries alternate between %r10 and %r11) and one n[j]*m
// product (carries alternate between %r13 and %rdi), then stores one word of
// tp[]. %r15 advances by 4 per iteration.
337.p2align	4
338L$1st4x:
339	mulq	%rbx
340	addq	%rax,%r10
341	movq	-16(%rcx,%r15,8),%rax
342	adcq	$0,%rdx
343	movq	%rdx,%r11
344
345	mulq	%rbp
346	addq	%rax,%r13
347	movq	-8(%rsi,%r15,8),%rax
348	adcq	$0,%rdx
349	addq	%r10,%r13
350	adcq	$0,%rdx
351	movq	%r13,-24(%rsp,%r15,8)
352	movq	%rdx,%rdi
353
354	mulq	%rbx
355	addq	%rax,%r11
356	movq	-8(%rcx,%r15,8),%rax
357	adcq	$0,%rdx
358	movq	%rdx,%r10
359
360	mulq	%rbp
361	addq	%rax,%rdi
362	movq	(%rsi,%r15,8),%rax
363	adcq	$0,%rdx
364	addq	%r11,%rdi
365	adcq	$0,%rdx
366	movq	%rdi,-16(%rsp,%r15,8)
367	movq	%rdx,%r13
368
369	mulq	%rbx
370	addq	%rax,%r10
371	movq	(%rcx,%r15,8),%rax
372	adcq	$0,%rdx
373	movq	%rdx,%r11
374
375	mulq	%rbp
376	addq	%rax,%r13
377	movq	8(%rsi,%r15,8),%rax
378	adcq	$0,%rdx
379	addq	%r10,%r13
380	adcq	$0,%rdx
381	movq	%r13,-8(%rsp,%r15,8)
382	movq	%rdx,%rdi
383
384	mulq	%rbx
385	addq	%rax,%r11
386	movq	8(%rcx,%r15,8),%rax
387	adcq	$0,%rdx
388	leaq	4(%r15),%r15
389	movq	%rdx,%r10
390
391	mulq	%rbp
392	addq	%rax,%rdi
393	movq	-16(%rsi,%r15,8),%rax
394	adcq	$0,%rdx
395	addq	%r11,%rdi
396	adcq	$0,%rdx
397	movq	%rdi,-32(%rsp,%r15,8)
398	movq	%rdx,%r13
399	cmpq	%r9,%r15
400	jb	L$1st4x
401
// Tail of pass 0 (%r15 == num here): the last two columns, then %rax is
// reloaded with a[0] for the next pass.
402	mulq	%rbx
403	addq	%rax,%r10
404	movq	-16(%rcx,%r15,8),%rax
405	adcq	$0,%rdx
406	movq	%rdx,%r11
407
408	mulq	%rbp
409	addq	%rax,%r13
410	movq	-8(%rsi,%r15,8),%rax
411	adcq	$0,%rdx
412	addq	%r10,%r13
413	adcq	$0,%rdx
414	movq	%r13,-24(%rsp,%r15,8)
415	movq	%rdx,%rdi
416
417	mulq	%rbx
418	addq	%rax,%r11
419	movq	-8(%rcx,%r15,8),%rax
420	adcq	$0,%rdx
421	movq	%rdx,%r10
422
423	mulq	%rbp
424	addq	%rax,%rdi
425	movq	(%rsi),%rax
426	adcq	$0,%rdx
427	addq	%r11,%rdi
428	adcq	$0,%rdx
429	movq	%rdi,-16(%rsp,%r15,8)
430	movq	%rdx,%r13
431
// Top of the sum: tp[num-1] and the carry-out word tp[num].
432	xorq	%rdi,%rdi
433	addq	%r10,%r13
434	adcq	$0,%rdi
435	movq	%r13,-8(%rsp,%r15,8)
436	movq	%rdi,(%rsp,%r15,8)
437
// %r14 = i, the index of the next word of b.
438	leaq	1(%r14),%r14
// Pass i (1 <= i < num): same shape as pass 0, but every column also adds
// the previous tp[]. m = (tp[0] + a[0]*b[i]) * n0 (low 64 bits).
439.p2align	2
440L$outer4x:
441	movq	(%r12,%r14,8),%rbx
442	xorq	%r15,%r15
443	movq	(%rsp),%r10
444	movq	%r8,%rbp
445	mulq	%rbx
446	addq	%rax,%r10
447	movq	(%rcx),%rax
448	adcq	$0,%rdx
449
450	imulq	%r10,%rbp
451	movq	%rdx,%r11
452
453	mulq	%rbp
454	addq	%rax,%r10
455	movq	8(%rsi),%rax
456	adcq	$0,%rdx
457	movq	%rdx,%rdi
458
459	mulq	%rbx
460	addq	%rax,%r11
461	movq	8(%rcx),%rax
462	adcq	$0,%rdx
463	addq	8(%rsp),%r11
464	adcq	$0,%rdx
465	movq	%rdx,%r10
466
467	mulq	%rbp
468	addq	%rax,%rdi
469	movq	16(%rsi),%rax
470	adcq	$0,%rdx
471	addq	%r11,%rdi
472	leaq	4(%r15),%r15
473	adcq	$0,%rdx
474	movq	%rdi,(%rsp)
475	movq	%rdx,%r13
476	jmp	L$inner4x
// Four columns per iteration, as in L$1st4x, with the matching previous
// tp[] word added to each a[j]*b[i] product before it is combined.
477.p2align	4
478L$inner4x:
479	mulq	%rbx
480	addq	%rax,%r10
481	movq	-16(%rcx,%r15,8),%rax
482	adcq	$0,%rdx
483	addq	-16(%rsp,%r15,8),%r10
484	adcq	$0,%rdx
485	movq	%rdx,%r11
486
487	mulq	%rbp
488	addq	%rax,%r13
489	movq	-8(%rsi,%r15,8),%rax
490	adcq	$0,%rdx
491	addq	%r10,%r13
492	adcq	$0,%rdx
493	movq	%r13,-24(%rsp,%r15,8)
494	movq	%rdx,%rdi
495
496	mulq	%rbx
497	addq	%rax,%r11
498	movq	-8(%rcx,%r15,8),%rax
499	adcq	$0,%rdx
500	addq	-8(%rsp,%r15,8),%r11
501	adcq	$0,%rdx
502	movq	%rdx,%r10
503
504	mulq	%rbp
505	addq	%rax,%rdi
506	movq	(%rsi,%r15,8),%rax
507	adcq	$0,%rdx
508	addq	%r11,%rdi
509	adcq	$0,%rdx
510	movq	%rdi,-16(%rsp,%r15,8)
511	movq	%rdx,%r13
512
513	mulq	%rbx
514	addq	%rax,%r10
515	movq	(%rcx,%r15,8),%rax
516	adcq	$0,%rdx
517	addq	(%rsp,%r15,8),%r10
518	adcq	$0,%rdx
519	movq	%rdx,%r11
520
521	mulq	%rbp
522	addq	%rax,%r13
523	movq	8(%rsi,%r15,8),%rax
524	adcq	$0,%rdx
525	addq	%r10,%r13
526	adcq	$0,%rdx
527	movq	%r13,-8(%rsp,%r15,8)
528	movq	%rdx,%rdi
529
530	mulq	%rbx
531	addq	%rax,%r11
532	movq	8(%rcx,%r15,8),%rax
533	adcq	$0,%rdx
534	addq	8(%rsp,%r15,8),%r11
535	adcq	$0,%rdx
536	leaq	4(%r15),%r15
537	movq	%rdx,%r10
538
539	mulq	%rbp
540	addq	%rax,%rdi
541	movq	-16(%rsi,%r15,8),%rax
542	adcq	$0,%rdx
543	addq	%r11,%rdi
544	adcq	$0,%rdx
545	movq	%rdi,-32(%rsp,%r15,8)
546	movq	%rdx,%r13
547	cmpq	%r9,%r15
548	jb	L$inner4x
549
// Tail of pass i: the last two columns. The outer index %r14 is advanced in
// the middle of this sequence and %rax is reloaded with a[0].
550	mulq	%rbx
551	addq	%rax,%r10
552	movq	-16(%rcx,%r15,8),%rax
553	adcq	$0,%rdx
554	addq	-16(%rsp,%r15,8),%r10
555	adcq	$0,%rdx
556	movq	%rdx,%r11
557
558	mulq	%rbp
559	addq	%rax,%r13
560	movq	-8(%rsi,%r15,8),%rax
561	adcq	$0,%rdx
562	addq	%r10,%r13
563	adcq	$0,%rdx
564	movq	%r13,-24(%rsp,%r15,8)
565	movq	%rdx,%rdi
566
567	mulq	%rbx
568	addq	%rax,%r11
569	movq	-8(%rcx,%r15,8),%rax
570	adcq	$0,%rdx
571	addq	-8(%rsp,%r15,8),%r11
572	adcq	$0,%rdx
573	leaq	1(%r14),%r14
574	movq	%rdx,%r10
575
576	mulq	%rbp
577	addq	%rax,%rdi
578	movq	(%rsi),%rax
579	adcq	$0,%rdx
580	addq	%r11,%rdi
581	adcq	$0,%rdx
582	movq	%rdi,-16(%rsp,%r15,8)
583	movq	%rdx,%r13
584
// New tp[num-1] and tp[num], including the previous tp[num].
585	xorq	%rdi,%rdi
586	addq	%r10,%r13
587	adcq	$0,%rdi
588	addq	(%rsp,%r9,8),%r13
589	adcq	$0,%rdi
590	movq	%r13,-8(%rsp,%r15,8)
591	movq	%rdi,(%rsp,%r15,8)
592
593	cmpq	%r9,%r14
594	jb	L$outer4x
// Final step: rp = tp - n. %rdi = saved rp, %rsi = tp, %r15 = (num-4)/4 loop
// iterations, %r14 = word index. The first two words are subtracted before
// the loop (subq starts the borrow chain); the loop is software-pipelined,
// storing results from the previous step while subtracting the next words.
// leaq and decq do not change CF, so the borrow chain survives loop control.
595	movq	16(%rsp,%r9,8),%rdi
596	leaq	-4(%r9),%r15
597	movq	0(%rsp),%rax
598	movq	8(%rsp),%rdx
599	shrq	$2,%r15
600	leaq	(%rsp),%rsi
601	xorq	%r14,%r14
602
603	subq	0(%rcx),%rax
604	movq	16(%rsi),%rbx
605	movq	24(%rsi),%rbp
606	sbbq	8(%rcx),%rdx
607
608L$sub4x:
609	movq	%rax,0(%rdi,%r14,8)
610	movq	%rdx,8(%rdi,%r14,8)
611	sbbq	16(%rcx,%r14,8),%rbx
612	movq	32(%rsi,%r14,8),%rax
613	movq	40(%rsi,%r14,8),%rdx
614	sbbq	24(%rcx,%r14,8),%rbp
615	movq	%rbx,16(%rdi,%r14,8)
616	movq	%rbp,24(%rdi,%r14,8)
617	sbbq	32(%rcx,%r14,8),%rax
618	movq	48(%rsi,%r14,8),%rbx
619	movq	56(%rsi,%r14,8),%rbp
620	sbbq	40(%rcx,%r14,8),%rdx
621	leaq	4(%r14),%r14
622	decq	%r15
623	jnz	L$sub4x
624
// Drain the pipeline: store the last four difference words. %rax is loaded
// with tp[num] (%r14 == num-4 here).
625	movq	%rax,0(%rdi,%r14,8)
626	movq	32(%rsi,%r14,8),%rax
627	sbbq	16(%rcx,%r14,8),%rbx
628	movq	%rdx,8(%rdi,%r14,8)
629	sbbq	24(%rcx,%r14,8),%rbp
630	movq	%rbx,16(%rdi,%r14,8)
631
// %rax = tp[num] minus the final borrow, used as the select mask. The .byte
// sequence encodes movq %rax,%xmm4; pshufd $0 copies its low 32 bits to all
// four lanes, and %xmm5 becomes the complement. %r15 = num/4 iterations,
// %rax is then reused as a byte offset starting at 0.
// NOTE(review): the select is only meaningful when the mask is 0 or all
// ones, which depends on operand ranges that cannot be checked here.
632	sbbq	$0,%rax
633	movq	%rbp,24(%rdi,%r14,8)
634	pxor	%xmm0,%xmm0
635.byte	102,72,15,110,224
636	pcmpeqd	%xmm5,%xmm5
637	pshufd	$0,%xmm4,%xmm4
638	movq	%r9,%r15
639	pxor	%xmm4,%xmm5
640	shrq	$2,%r15
641	xorl	%eax,%eax
642
643	jmp	L$copy4x
// 32 bytes per iteration: rp = (tp & %xmm4) | (rp & %xmm5), with no
// data-dependent branch. Each tp chunk is zeroed (from %xmm0) after it has
// been read. tp is accessed with aligned loads/stores, rp with unaligned.
644.p2align	4
645L$copy4x:
646	movdqa	(%rsp,%rax,1),%xmm1
647	movdqu	(%rdi,%rax,1),%xmm2
648	pand	%xmm4,%xmm1
649	pand	%xmm5,%xmm2
650	movdqa	16(%rsp,%rax,1),%xmm3
651	movdqa	%xmm0,(%rsp,%rax,1)
652	por	%xmm2,%xmm1
653	movdqu	16(%rdi,%rax,1),%xmm2
654	movdqu	%xmm1,(%rdi,%rax,1)
655	pand	%xmm4,%xmm3
656	pand	%xmm5,%xmm2
657	movdqa	%xmm0,16(%rsp,%rax,1)
658	por	%xmm2,%xmm3
659	movdqu	%xmm3,16(%rdi,%rax,1)
660	leaq	32(%rax),%rax
661	decq	%r15
662	jnz	L$copy4x
// Epilogue: reload the caller's %rsp, set the return value to 1, restore the
// six callee-saved registers from just below that address, and switch back
// to the caller's stack.
663	movq	8(%rsp,%r9,8),%rsi
664
665	movq	$1,%rax
666	movq	-48(%rsi),%r15
667
668	movq	-40(%rsi),%r14
669
670	movq	-32(%rsi),%r13
671
672	movq	-24(%rsi),%r12
673
674	movq	-16(%rsi),%rbp
675
676	movq	-8(%rsi),%rbx
677
678	leaq	(%rsi),%rsp
679
680L$mul4x_epilogue:
681	ret
682
683
684
685
686
// bn_sqr8x_mont - entry point that builds a scratch frame, calls one of two
// internal squaring routines, then does the final subtraction and a masked
// copy into the output.
//
// Registers on entry, as the code below uses them:
//   %rdi  output words (stashed in %xmm1 across the call)
//   %rsi  input words (this routine only uses the address; the helper
//         presumably reads the data - TODO confirm)
//   %rdx  selector: nonzero calls _bn_sqrx8x_internal, zero calls
//         _bn_sqr8x_internal
//   %rcx  pointer stashed in %xmm2 for later use
//   %r8   pointer to one 64-bit constant, stored at 32(%rsp)
//   %r9d  number of 64-bit words, called num below
// NOTE(review): both helpers are defined outside this view. After the call
// this code reads %rax, %rbp, %rcx or %r9, %r8 or %rdi as helper outputs;
// their exact meaning cannot be verified here and is described only as far
// as the visible instructions show.
//
// Always returns 1 in %rax; %rbx, %rbp and %r12-%r15 are saved and restored.
687.globl	_bn_sqr8x_mont
688.private_extern _bn_sqr8x_mont
689
690.p2align	5
691_bn_sqr8x_mont:
692
693_CET_ENDBR
// Zero-extend num and remember the incoming %rsp in %rax.
694	movl	%r9d,%r9d
695	movq	%rsp,%rax
696
697	pushq	%rbx
698
699	pushq	%rbp
700
701	pushq	%r12
702
703	pushq	%r13
704
705	pushq	%r14
706
707	pushq	%r15
708
709L$sqr8x_prologue:
710
// %r10 = 32*num, %r9 = -(8*num).
711	movl	%r9d,%r10d
712	shll	$3,%r9d
713	shlq	$3+2,%r10
714	negq	%r9
715
716
717
718
719
720
// Choose the frame address %rbp. The starting candidate is
// %rsp - 64 - 16*num; %r11 is its distance from %rsi modulo 4096, and the
// frame is moved further down by an amount derived from %r11 (two cases,
// selected by comparing %r11 with 32*num).
// NOTE(review): presumably this keeps the frame and the input at %rsi from
// sharing the same offsets within a 4096-byte page - confirm.
721	leaq	-64(%rsp,%r9,2),%r11
722	movq	%rsp,%rbp
723	movq	(%r8),%r8
724	subq	%rsi,%r11
725	andq	$4095,%r11
726	cmpq	%r11,%r10
727	jb	L$sqr8x_sp_alt
728	subq	%r11,%rbp
729	leaq	-64(%rbp,%r9,2),%rbp
730	jmp	L$sqr8x_sp_done
731
732.p2align	5
733L$sqr8x_sp_alt:
734	leaq	4096-64(,%r9,2),%r10
735	leaq	-64(%rbp,%r9,2),%rbp
736	subq	%r10,%r11
737	movq	$0,%r10
738	cmovcq	%r10,%r11
739	subq	%r11,%rbp
740L$sqr8x_sp_done:
// Align the frame to 64 bytes, then lower %rsp to it in steps of at most
// 4096 bytes, reading one word at each step (the value read is discarded).
// NOTE(review): presumably a stack probe - confirm.
741	andq	$-64,%rbp
742	movq	%rsp,%r11
743	subq	%rbp,%r11
744	andq	$-4096,%r11
745	leaq	(%r11,%rbp,1),%rsp
746	movq	(%rsp),%r10
747	cmpq	%rbp,%rsp
748	ja	L$sqr8x_page_walk
749	jmp	L$sqr8x_page_walk_done
750
751.p2align	4
752L$sqr8x_page_walk:
753	leaq	-4096(%rsp),%rsp
754	movq	(%rsp),%r10
755	cmpq	%rbp,%rsp
756	ja	L$sqr8x_page_walk
757L$sqr8x_page_walk_done:
758
// %r10 = -(8*num), %r9 = 8*num. Frame slots: 32(%rsp) = the constant loaded
// from (%r8), 40(%rsp) = caller's %rsp.
759	movq	%r9,%r10
760	negq	%r9
761
762	movq	%r8,32(%rsp)
763	movq	%rax,40(%rsp)
764
765L$sqr8x_body:
766
// The three .byte sequences encode movq %rcx,%xmm2 / movq %rdi,%xmm1 /
// movq %r10,%xmm3, parking those values in SSE registers across the call.
767.byte	102,72,15,110,209
768	pxor	%xmm0,%xmm0
769.byte	102,72,15,110,207
770.byte	102,73,15,110,218
771	testq	%rdx,%rdx
772	jz	L$sqr8x_nox
773
774	call	_bn_sqrx8x_internal
775
776
777
778
// After _bn_sqrx8x_internal: %rbx = %r8 + %rcx, and %r9 = %rdx = %rcx as
// left by the helper. The .byte sequence encodes movq %xmm1,%rdi (restores
// the output pointer). %rcx is arithmetically shifted right by 5 and then
// counted up to zero by the subtraction loop.
779	leaq	(%r8,%rcx,1),%rbx
780	movq	%rcx,%r9
781	movq	%rcx,%rdx
782.byte	102,72,15,126,207
783	sarq	$3+2,%rcx
784	jmp	L$sqr8x_sub
785
786.p2align	5
787L$sqr8x_nox:
788	call	_bn_sqr8x_internal
789
790
791
792
// After _bn_sqr8x_internal: %rbx = %rdi + %r9, and %rcx = %rdx = %r9 as left
// by the helper. The rest matches the other path.
793	leaq	(%rdi,%r9,1),%rbx
794	movq	%r9,%rcx
795	movq	%r9,%rdx
796.byte	102,72,15,126,207
797	sarq	$3+2,%rcx
798	jmp	L$sqr8x_sub
799
// Four words per iteration: store [%rbx] - [%rbp] - borrow to [%rdi]. The
// first sbbq uses the carry flag left by sarq (the last bit shifted out);
// leaq, movq and incq do not change CF, so the borrow chain spans iterations.
800.p2align	5
801L$sqr8x_sub:
802	movq	0(%rbx),%r12
803	movq	8(%rbx),%r13
804	movq	16(%rbx),%r14
805	movq	24(%rbx),%r15
806	leaq	32(%rbx),%rbx
807	sbbq	0(%rbp),%r12
808	sbbq	8(%rbp),%r13
809	sbbq	16(%rbp),%r14
810	sbbq	24(%rbp),%r15
811	leaq	32(%rbp),%rbp
812	movq	%r12,0(%rdi)
813	movq	%r13,8(%rdi)
814	movq	%r14,16(%rdi)
815	movq	%r15,24(%rdi)
816	leaq	32(%rdi),%rdi
817	incq	%rcx
818	jnz	L$sqr8x_sub
819
// Fold the final borrow into %rax (a value left by the helper) to form the
// select mask, and step %rbx and %rdi back by adding %r9. The .byte sequence
// encodes movq %rax,%xmm1; pshufd $0 copies its low 32 bits to all four
// lanes. %rsi = caller's %rsp for the epilogue.
// NOTE(review): the select is only meaningful when the mask is 0 or all
// ones - this depends on the helper's %rax, which is not visible here.
820	sbbq	$0,%rax
821	leaq	(%rbx,%r9,1),%rbx
822	leaq	(%rdi,%r9,1),%rdi
823
824.byte	102,72,15,110,200
825	pxor	%xmm0,%xmm0
826	pshufd	$0,%xmm1,%xmm1
827	movq	40(%rsp),%rsi
828
829	jmp	L$sqr8x_cond_copy
830
// 32 bytes per iteration: [%rdi] = ([%rbx] & mask) | ([%rdi] & ~mask), with
// no data-dependent branch. ~mask is built in %xmm0 by comparing the zero
// register with the mask, and %xmm0 is cleared again afterwards. The 32
// bytes just read at [%rbx] and the 32 bytes at the same position offset by
// %rdx are overwritten with zeros. %r9 counts up by 32 to zero.
831.p2align	5
832L$sqr8x_cond_copy:
833	movdqa	0(%rbx),%xmm2
834	movdqa	16(%rbx),%xmm3
835	leaq	32(%rbx),%rbx
836	movdqu	0(%rdi),%xmm4
837	movdqu	16(%rdi),%xmm5
838	leaq	32(%rdi),%rdi
839	movdqa	%xmm0,-32(%rbx)
840	movdqa	%xmm0,-16(%rbx)
841	movdqa	%xmm0,-32(%rbx,%rdx,1)
842	movdqa	%xmm0,-16(%rbx,%rdx,1)
843	pcmpeqd	%xmm1,%xmm0
844	pand	%xmm1,%xmm2
845	pand	%xmm1,%xmm3
846	pand	%xmm0,%xmm4
847	pand	%xmm0,%xmm5
848	pxor	%xmm0,%xmm0
849	por	%xmm2,%xmm4
850	por	%xmm3,%xmm5
851	movdqu	%xmm4,-32(%rdi)
852	movdqu	%xmm5,-16(%rdi)
853	addq	$32,%r9
854	jnz	L$sqr8x_cond_copy
855
// Epilogue: set the return value to 1, restore the six callee-saved
// registers from just below the caller's %rsp (in %rsi), and switch back to
// the caller's stack.
856	movq	$1,%rax
857	movq	-48(%rsi),%r15
858
859	movq	-40(%rsi),%r14
860
861	movq	-32(%rsi),%r13
862
863	movq	-24(%rsi),%r12
864
865	movq	-16(%rsi),%rbp
866
867	movq	-8(%rsi),%rbx
868
869	leaq	(%rsi),%rsp
870
871L$sqr8x_epilogue:
872	ret
873
874
// bn_mulx4x_mont - Montgomery multiplication, four 64-bit words per loop
// iteration, built on MULX with two independent carry chains (ADCX uses CF,
// ADOX uses OF).
//
// Registers on entry, as the code below uses them:
//   %rdi  rp  - result words (saved at 32(%rsp))
//   %rsi  a   - first operand words
//   %rdx  b   - second operand words
//   %rcx  n   - words that are multiplied by the per-pass factor m
//   %r8       - pointer to one 64-bit constant, called n0 below
//   %r9d  num - number of 64-bit words
// NOTE(review): the C prototype is not visible in this file; the names above
// are inferred from usage - confirm against the declaration.
// NOTE(review): the loops process four words at a time and the inner loops
// run num/4 - 1 times with a decrement-then-test, so num is presumably a
// multiple of 4 and at least 8 - confirm that the caller guarantees this.
//
// Frame layout after the prologue:
//    0(%rsp)  8*num               8(%rsp)  pointer to the next word of b
//   16(%rsp)  b + 8*num (end)    24(%rsp)  n0
//   32(%rsp)  rp                 40(%rsp)  caller's %rsp
//   48(%rsp)  num/4 - 1          64(%rsp)  start of the scratch vector tp[]
//
// Always returns 1 in %rax; %rbx, %rbp and %r12-%r15 are saved and restored.
875.globl	_bn_mulx4x_mont
876.private_extern _bn_mulx4x_mont
877
878.p2align	5
879_bn_mulx4x_mont:
880
881_CET_ENDBR
// Remember the incoming %rsp in %rax.
882	movq	%rsp,%rax
883
884	pushq	%rbx
885
886	pushq	%rbp
887
888	pushq	%r12
889
890	pushq	%r13
891
892	pushq	%r14
893
894	pushq	%r15
895
896L$mulx4x_prologue:
897
// %r9 = 8*num (the 32-bit shift also clears the upper half), %r10 = -%r9,
// %r8 = n0. %rbp = %rsp - 72 - 8*num rounded down to a multiple of 128: the
// new stack pointer.
898	shll	$3,%r9d
899	xorq	%r10,%r10
900	subq	%r9,%r10
901	movq	(%r8),%r8
902	leaq	-72(%rsp,%r10,1),%rbp
903	andq	$-128,%rbp
// Lower %rsp to %rbp in steps of at most 4096 bytes, reading one word at
// each step (the value read is discarded).
// NOTE(review): presumably a stack probe - confirm.
904	movq	%rsp,%r11
905	subq	%rbp,%r11
906	andq	$-4096,%r11
907	leaq	(%r11,%rbp,1),%rsp
908	movq	(%rsp),%r10
909	cmpq	%rbp,%rsp
910	ja	L$mulx4x_page_walk
911	jmp	L$mulx4x_page_walk_done
912
913.p2align	4
914L$mulx4x_page_walk:
915	leaq	-4096(%rsp),%rsp
916	movq	(%rsp),%r10
917	cmpq	%rbp,%rsp
918	ja	L$mulx4x_page_walk
919L$mulx4x_page_walk_done:
920
// %r10 = b + 8*num, the end-of-b pointer used to terminate the outer loop.
921	leaq	(%rdx,%r9,1),%r10
922
923
924
925
926
927
928
929
930
931
932
933
// Fill the frame slots listed in the header. %r9 is turned into the inner
// loop count num/4 - 1 on the way.
934	movq	%r9,0(%rsp)
935	shrq	$5,%r9
936	movq	%r10,16(%rsp)
937	subq	$1,%r9
938	movq	%r8,24(%rsp)
939	movq	%rdi,32(%rsp)
940	movq	%rax,40(%rsp)
941
942	movq	%r9,48(%rsp)
943	jmp	L$mulx4x_body
944
945.p2align	5
946L$mulx4x_body:
// Pass 0, first four columns. %rdx = b[0] is MULX's implicit multiplicand
// (copy kept in %r9); %rdi = address of b[1], saved to 8(%rsp); %rbx points
// 32 bytes past the start of tp[].
947	leaq	8(%rdx),%rdi
948	movq	(%rdx),%rdx
949	leaq	64+32(%rsp),%rbx
950	movq	%rdx,%r9
951
952	mulxq	0(%rsi),%r8,%rax
953	mulxq	8(%rsi),%r11,%r14
954	addq	%rax,%r11
955	movq	%rdi,8(%rsp)
956	mulxq	16(%rsi),%r12,%r13
957	adcq	%r14,%r12
958	adcq	$0,%r13
959
// %rdi = low word of a[0]*b[0]; %r8 = m = that word * n0 (low 64 bits).
// xorq clears %rbp and both CF and OF, starting the two carry chains.
960	movq	%r8,%rdi
961	imulq	24(%rsp),%r8
962	xorq	%rbp,%rbp
963
964	mulxq	24(%rsi),%rax,%r14
965	movq	%r8,%rdx
966	leaq	32(%rsi),%rsi
967	adcxq	%rax,%r13
968	adcxq	%rbp,%r14
969
// Add n[0..3]*m. The sum into %rdi is discarded (only its carry continues);
// the next three words are stored to tp[0..2]. The .byte sequence encodes
// mulxq 16(%rcx),%rax,%r12 with a 32-bit displacement. %rdi is reloaded with
// the inner loop count.
970	mulxq	0(%rcx),%rax,%r10
971	adcxq	%rax,%rdi
972	adoxq	%r11,%r10
973	mulxq	8(%rcx),%rax,%r11
974	adcxq	%rax,%r10
975	adoxq	%r12,%r11
976.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
977	movq	48(%rsp),%rdi
978	movq	%r10,-32(%rbx)
979	adcxq	%rax,%r11
980	adoxq	%r13,%r12
981	mulxq	24(%rcx),%rax,%r15
982	movq	%r9,%rdx
983	movq	%r11,-24(%rbx)
984	adcxq	%rax,%r12
985	adoxq	%rbp,%r15
986	leaq	32(%rcx),%rcx
987	movq	%r12,-16(%rbx)
988
989	jmp	L$mulx4x_1st
990
// Pass 0, remaining columns, four per iteration: first four a[j]*b[0]
// products (%rdx = b[0]), then four n[j]*m products (%rdx = m, from %r8),
// storing four words of tp[]. %rsi, %rcx and %rbx advance by 32 bytes.
// The .byte 0x67,0x67 line puts two address-size prefixes in front of the
// following movq (presumably padding for alignment - confirm).
991.p2align	5
992L$mulx4x_1st:
993	adcxq	%rbp,%r15
994	mulxq	0(%rsi),%r10,%rax
995	adcxq	%r14,%r10
996	mulxq	8(%rsi),%r11,%r14
997	adcxq	%rax,%r11
998	mulxq	16(%rsi),%r12,%rax
999	adcxq	%r14,%r12
1000	mulxq	24(%rsi),%r13,%r14
1001.byte	0x67,0x67
1002	movq	%r8,%rdx
1003	adcxq	%rax,%r13
1004	adcxq	%rbp,%r14
1005	leaq	32(%rsi),%rsi
1006	leaq	32(%rbx),%rbx
1007
1008	adoxq	%r15,%r10
1009	mulxq	0(%rcx),%rax,%r15
1010	adcxq	%rax,%r10
1011	adoxq	%r15,%r11
1012	mulxq	8(%rcx),%rax,%r15
1013	adcxq	%rax,%r11
1014	adoxq	%r15,%r12
1015	mulxq	16(%rcx),%rax,%r15
1016	movq	%r10,-40(%rbx)
1017	adcxq	%rax,%r12
1018	movq	%r11,-32(%rbx)
1019	adoxq	%r15,%r13
1020	mulxq	24(%rcx),%rax,%r15
1021	movq	%r9,%rdx
1022	movq	%r12,-24(%rbx)
1023	adcxq	%rax,%r13
1024	adoxq	%rbp,%r15
1025	leaq	32(%rcx),%rcx
1026	movq	%r13,-16(%rbx)
1027
1028	decq	%rdi
1029	jnz	L$mulx4x_1st
1030
// End of pass 0: %rax = 8*num, %rdi = pointer to the next word of b. Fold
// the pending carries into the top word, store it, and leave the carry out
// of that addition in %r15 as 0 or all ones (sbbq of a register with itself).
1031	movq	0(%rsp),%rax
1032	movq	8(%rsp),%rdi
1033	adcq	%rbp,%r15
1034	addq	%r15,%r14
1035	sbbq	%r15,%r15
1036	movq	%r14,-8(%rbx)
1037	jmp	L$mulx4x_outer
1038
// Pass i: %rdx = next word of b. %rsi and %rcx are rewound by 8*num to the
// start of a and n, the carry word in %r15 is stored just past the words
// written so far, and %rbx is reset. The first four columns add tp[0..3] on
// the OF chain while the products accumulate on the CF chain.
1039.p2align	5
1040L$mulx4x_outer:
1041	movq	(%rdi),%rdx
1042	leaq	8(%rdi),%rdi
1043	subq	%rax,%rsi
1044	movq	%r15,(%rbx)
1045	leaq	64+32(%rsp),%rbx
1046	subq	%rax,%rcx
1047
1048	mulxq	0(%rsi),%r8,%r11
1049	xorl	%ebp,%ebp
1050	movq	%rdx,%r9
1051	mulxq	8(%rsi),%r14,%r12
1052	adoxq	-32(%rbx),%r8
1053	adcxq	%r14,%r11
1054	mulxq	16(%rsi),%r15,%r13
1055	adoxq	-24(%rbx),%r11
1056	adcxq	%r15,%r12
1057	adoxq	-16(%rbx),%r12
1058	adcxq	%rbp,%r13
1059	adoxq	%rbp,%r13
1060
// Save the advanced b pointer. %r15 = low word of the column-0 sum;
// %r8 = m = that word * n0 (low 64 bits). xorl resets both carry chains.
1061	movq	%rdi,8(%rsp)
1062	movq	%r8,%r15
1063	imulq	24(%rsp),%r8
1064	xorl	%ebp,%ebp
1065
1066	mulxq	24(%rsi),%rax,%r14
1067	movq	%r8,%rdx
1068	adcxq	%rax,%r13
1069	adoxq	-8(%rbx),%r13
1070	adcxq	%rbp,%r14
1071	leaq	32(%rsi),%rsi
1072	adoxq	%rbp,%r14
1073
// Add n[0..3]*m; the sum into %r15 is discarded (only its carry continues)
// and the next three words go to tp[0..2]. %rdi = inner loop count.
1074	mulxq	0(%rcx),%rax,%r10
1075	adcxq	%rax,%r15
1076	adoxq	%r11,%r10
1077	mulxq	8(%rcx),%rax,%r11
1078	adcxq	%rax,%r10
1079	adoxq	%r12,%r11
1080	mulxq	16(%rcx),%rax,%r12
1081	movq	%r10,-32(%rbx)
1082	adcxq	%rax,%r11
1083	adoxq	%r13,%r12
1084	mulxq	24(%rcx),%rax,%r15
1085	movq	%r9,%rdx
1086	movq	%r11,-24(%rbx)
1087	leaq	32(%rcx),%rcx
1088	adcxq	%rax,%r12
1089	adoxq	%rbp,%r15
1090	movq	48(%rsp),%rdi
1091	movq	%r12,-16(%rbx)
1092
1093	jmp	L$mulx4x_inner
1094
// Pass i, remaining columns, four per iteration: a[j]*b[i] products plus the
// existing tp[] words at 0..24(%rbx), then n[j]*m products, storing four
// words of tp[].
1095.p2align	5
1096L$mulx4x_inner:
1097	mulxq	0(%rsi),%r10,%rax
1098	adcxq	%rbp,%r15
1099	adoxq	%r14,%r10
1100	mulxq	8(%rsi),%r11,%r14
1101	adcxq	0(%rbx),%r10
1102	adoxq	%rax,%r11
1103	mulxq	16(%rsi),%r12,%rax
1104	adcxq	8(%rbx),%r11
1105	adoxq	%r14,%r12
1106	mulxq	24(%rsi),%r13,%r14
1107	movq	%r8,%rdx
1108	adcxq	16(%rbx),%r12
1109	adoxq	%rax,%r13
1110	adcxq	24(%rbx),%r13
1111	adoxq	%rbp,%r14
1112	leaq	32(%rsi),%rsi
1113	leaq	32(%rbx),%rbx
1114	adcxq	%rbp,%r14
1115
1116	adoxq	%r15,%r10
1117	mulxq	0(%rcx),%rax,%r15
1118	adcxq	%rax,%r10
1119	adoxq	%r15,%r11
1120	mulxq	8(%rcx),%rax,%r15
1121	adcxq	%rax,%r11
1122	adoxq	%r15,%r12
1123	mulxq	16(%rcx),%rax,%r15
1124	movq	%r10,-40(%rbx)
1125	adcxq	%rax,%r12
1126	adoxq	%r15,%r13
1127	mulxq	24(%rcx),%rax,%r15
1128	movq	%r9,%rdx
1129	movq	%r11,-32(%rbx)
1130	movq	%r12,-24(%rbx)
1131	adcxq	%rax,%r13
1132	adoxq	%rbp,%r15
1133	leaq	32(%rcx),%rcx
1134	movq	%r13,-16(%rbx)
1135
1136	decq	%rdi
1137	jnz	L$mulx4x_inner
1138
// End of pass i. subq 0(%rbx),%rbp subtracts the carry word stored by the
// previous pass from zero, so CF is set exactly when that word is nonzero;
// the following adcq adds it into the top word. %r15 again ends up as 0 or
// all ones.
1139	movq	0(%rsp),%rax
1140	movq	8(%rsp),%rdi
1141	adcq	%rbp,%r15
1142	subq	0(%rbx),%rbp
1143	adcq	%r15,%r14
1144	sbbq	%r15,%r15
1145	movq	%r14,-8(%rbx)
1146
// Loop until the b pointer reaches the end pointer saved at 16(%rsp).
1147	cmpq	16(%rsp),%rdi
1148	jne	L$mulx4x_outer
1149
// Final step setup: %rbx = tp, %rcx rewound to n, %r15 negated (0 or 1),
// %rdx = 8*num, %rax = num/4 iterations, %rdi = rp.
1150	leaq	64(%rsp),%rbx
1151	subq	%rax,%rcx
1152	negq	%r15
1153	movq	%rax,%rdx
1154	shrq	$3+2,%rax
1155	movq	32(%rsp),%rdi
1156	jmp	L$mulx4x_sub
1157
// Four words per iteration: rp = tp - n. The first sbbq uses the carry flag
// left by shrq (the last bit shifted out); leaq, movq and decq do not change
// CF, so the borrow chain spans iterations.
1158.p2align	5
1159L$mulx4x_sub:
1160	movq	0(%rbx),%r11
1161	movq	8(%rbx),%r12
1162	movq	16(%rbx),%r13
1163	movq	24(%rbx),%r14
1164	leaq	32(%rbx),%rbx
1165	sbbq	0(%rcx),%r11
1166	sbbq	8(%rcx),%r12
1167	sbbq	16(%rcx),%r13
1168	sbbq	24(%rcx),%r14
1169	leaq	32(%rcx),%rcx
1170	movq	%r11,0(%rdi)
1171	movq	%r12,8(%rdi)
1172	movq	%r13,16(%rdi)
1173	movq	%r14,24(%rdi)
1174	leaq	32(%rdi),%rdi
1175	decq	%rax
1176	jnz	L$mulx4x_sub
1177
// %r15 = top carry minus the final borrow, used as the select mask. %rbx and
// %rdi are reset to the start of tp and rp. The .byte sequence encodes
// movq %r15,%xmm1; pshufd $0 copies its low 32 bits to all four lanes.
// %rsi = caller's %rsp for the epilogue.
// NOTE(review): the select is only meaningful when the mask is 0 or all
// ones, which depends on operand ranges that cannot be checked here.
1178	sbbq	$0,%r15
1179	leaq	64(%rsp),%rbx
1180	subq	%rdx,%rdi
1181
1182.byte	102,73,15,110,207
1183	pxor	%xmm0,%xmm0
1184	pshufd	$0,%xmm1,%xmm1
1185	movq	40(%rsp),%rsi
1186
1187	jmp	L$mulx4x_cond_copy
1188
// 32 bytes per iteration: rp = (tp & mask) | (rp & ~mask), with no
// data-dependent branch. ~mask is built in %xmm0 by comparing the zero
// register with the mask, and %xmm0 is cleared again afterwards. Each tp
// chunk is zeroed after it has been read. %rdx counts down by 32 to zero.
1189.p2align	5
1190L$mulx4x_cond_copy:
1191	movdqa	0(%rbx),%xmm2
1192	movdqa	16(%rbx),%xmm3
1193	leaq	32(%rbx),%rbx
1194	movdqu	0(%rdi),%xmm4
1195	movdqu	16(%rdi),%xmm5
1196	leaq	32(%rdi),%rdi
1197	movdqa	%xmm0,-32(%rbx)
1198	movdqa	%xmm0,-16(%rbx)
1199	pcmpeqd	%xmm1,%xmm0
1200	pand	%xmm1,%xmm2
1201	pand	%xmm1,%xmm3
1202	pand	%xmm0,%xmm4
1203	pand	%xmm0,%xmm5
1204	pxor	%xmm0,%xmm0
1205	por	%xmm2,%xmm4
1206	por	%xmm3,%xmm5
1207	movdqu	%xmm4,-32(%rdi)
1208	movdqu	%xmm5,-16(%rdi)
1209	subq	$32,%rdx
1210	jnz	L$mulx4x_cond_copy
1211
// %rdx is zero here: also clear the word just past the copied scratch words.
1212	movq	%rdx,(%rbx)
1213
// Epilogue: set the return value to 1, restore the six callee-saved
// registers from just below the caller's %rsp (in %rsi), and switch back to
// the caller's stack.
1214	movq	$1,%rax
1215	movq	-48(%rsi),%r15
1216
1217	movq	-40(%rsi),%r14
1218
1219	movq	-32(%rsi),%r13
1220
1221	movq	-24(%rsi),%r12
1222
1223	movq	-16(%rsi),%rbp
1224
1225	movq	-8(%rsi),%rbx
1226
1227	leaq	(%rsi),%rsp
1228
1229L$mulx4x_epilogue:
1230	ret
1231
1232
1233.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1234.p2align	4
1235#endif
1236