// xref: /aosp_15_r20/external/boringssl/src/gen/bcm/x86_64-mont5-linux.S (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
.text

// bn_mul_mont_gather5_nohw — scalar Montgomery multiplication with a
// constant-time gather of the multiplicand from a power table.
//
// ABI: System V AMD64.  Register roles (inferred from usage below — the
// C prototype lives in the BoringSSL headers; NOTE(review): confirm there):
//   %rdi = rp (result), %rsi = ap, %rdx = bp power table, %rcx = np (modulus),
//   %r8  = &n0 (Montgomery constant), %r9d = num (words),
//   8(%rsp) = table index ("power"), loaded into %xmm5 before any push.
// The table entry is selected by building 32 all-ones/all-zero SSE2 masks
// (one per possible index) and AND/OR-combining all entries, so no
// secret-dependent memory address is ever formed.
.globl	bn_mul_mont_gather5_nohw
.hidden bn_mul_mont_gather5_nohw
.type	bn_mul_mont_gather5_nohw,@function
.align	64
bn_mul_mont_gather5_nohw:
.cfi_startproc
_CET_ENDBR


	movl	%r9d,%r9d		// zero-extend num to 64 bits
	movq	%rsp,%rax		// %rax = original %rsp, saved for the epilogue
.cfi_def_cfa_register	%rax
	movd	8(%rsp),%xmm5		// grab the stack argument (gather index) now,
					// before pushes move it
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	negq	%r9
	movq	%rsp,%r11
	leaq	-280(%rsp,%r9,8),%r10	// num*8 bytes of tp + mask scratch
	negq	%r9
	andq	$-1024,%r10		// minimize TLB usage: 1KB-align the frame

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		// probe the page we landed on
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

// Touch the newly claimed stack one page at a time so the kernel's guard
// page is always hit in order (required for large num).
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	leaq	.Linc(%rip),%r10	// .Linc = {0,1,2,3} increments for mask gen
	movq	%rax,8(%rsp,%r9,8)	// stash original %rsp in the frame
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:

// Build the 32 selection masks at 112..352(%r10):
// mask[i] = (i == power) ? ~0 : 0, computed with paddd/pcmpeqd chains.
	leaq	128(%rdx),%r12		// %r12 = bp table, biased by 128
	movdqa	0(%r10),%xmm0
	movdqa	16(%r10),%xmm1
	leaq	24-112(%rsp,%r9,8),%r10
	andq	$-16,%r10

	pshufd	$0,%xmm5,%xmm5		// broadcast the index to all 4 dwords
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67				// addr-size prefix, used as padding
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

// First gather: AND every table entry with its mask and OR them together;
// exactly one mask is all-ones so the result is the selected b value.
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0

	pshufd	$0x4e,%xmm0,%xmm1	// fold high qword into low
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12		// advance to the next table "page"
.byte	102,72,15,126,195		// movq %xmm0,%rbx  (selected b word)

	movq	(%r8),%r8		// %r8 = n0 value
	movq	(%rsi),%rax		// %rax = ap[0]

	xorq	%r14,%r14		// %r14 = outer loop counter i
	xorq	%r15,%r15		// %r15 = inner loop counter j

// ---- first iteration: tp = ap * b[power] + m*np, m = tp[0]*n0 ----
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		// m = lo(ap[0]*b) * n0
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10		// discard: lo is zero by construction of m
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st


	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)	// top carry word of tp

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
// ---- remaining num-1 iterations: tp = (tp + ap*b[i] + m*np) / 2^64 ----
.Louter:
	leaq	24+128(%rsp,%r9,8),%rdx	// %rdx = mask area (same masks as above)
	andq	$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4

	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12

	movq	(%rsi),%rax
.byte	102,72,15,126,195		// movq %xmm0,%rbx  (next gathered b word)

	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		// m = tp[0]*n0
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r9,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

// ---- final reduction: rp = tp - np if tp >= np, else tp (constant time) ----
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax	// rp = tp - np, keeping borrow chain
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax			// %rax = 0 (no borrow) or -1 (borrow)
	movq	$-1,%rbx
	xorq	%rax,%rbx		// %rbx = ~%rax; select masks for copy
	xorq	%r14,%r14
	movq	%r9,%r15

// Constant-time select: rp[i] = borrow ? tp[i] : (tp-np)[i]; also wipes tp.
.Lcopy:
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r14,(%rsp,%r14,8)	// zap the temporary vector
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	// reload original %rsp
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			// return 1 (success)

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	bn_mul_mont_gather5_nohw,.-bn_mul_mont_gather5_nohw
// bn_mul4x_mont_gather5 — public wrapper for the 4-way unrolled Montgomery
// multiply-with-gather.  Saves callee-saved registers, carves out an
// appropriately sized and aligned stack frame (avoiding a 4KB aliasing
// conflict with the output buffer at %rdi), then tail-calls mul4x_internal
// which does the actual work.  Same argument registers as
// bn_mul_mont_gather5_nohw; %r9d = num (words).
.globl	bn_mul4x_mont_gather5
.hidden bn_mul4x_mont_gather5
.type	bn_mul4x_mont_gather5,@function
.align	32
bn_mul4x_mont_gather5:
.cfi_startproc
_CET_ENDBR
.byte	0x67				// addr-size prefix, used as padding
	movq	%rsp,%rax		// keep original stack pointer
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmul4x_prologue:

.byte	0x67

	shll	$3,%r9d			// num in bytes
	leaq	(%r9,%r9,2),%r10	// %r10 = 3*num bytes = frame demand
	negq	%r9

// Choose a frame base %rbp so the frame does not alias %rdi modulo 4KB
// (avoids false store-forwarding/page conflicts with the output buffer).
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmul4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmul4xsp_done:
	andq	$-64,%rbp		// 64-byte align the frame
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

// Probe one page at a time down to the new stack bottom (guard-page order).
.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	negq	%r9

	movq	%rax,40(%rsp)		// saved original %rsp for unwinding/return
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmul4x_body:

	call	mul4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			// return 1 (success)

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	ret
.cfi_endproc
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
// mul4x_internal — core of the 4-way unrolled Montgomery multiplication.
// Called with the frame already set up by bn_mul4x_mont_gather5 /
// bn_power5_nohw; %rax still holds the caller's original %rsp, so the
// "power" stack argument is read via 8(%rax).  Uses the same SSE2
// mask-based constant-time gather as the scalar path, then runs a
// 4-word-unrolled CIOS loop.  Falls through into .Lsqr4x_sub_entry
// (defined elsewhere in this file) for the final conditional subtraction.
.type	mul4x_internal,@function
.align	32
mul4x_internal:
.cfi_startproc
	shlq	$5,%r9			// num*32 (bytes, *4 unroll)
	movd	8(%rax),%xmm5		// gather index from caller's stack frame
	leaq	.Linc(%rip),%rax
	leaq	128(%rdx,%r9,1),%r13	// end-of-table sentinel for .Louter4x
	shrq	$5,%r9
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r9,1),%r10	// mask scratch area
	leaq	128(%rdx),%r12		// %r12 = bp table, biased by 128

// Build the 32 selection masks: mask[i] = (i == power) ? ~0 : 0.
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67,0x67			// padding prefixes
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

// First constant-time gather of b[power] (AND entries with masks, OR all).
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0

	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195		// movq %xmm0,%rbx  (gathered b word)

	movq	%r13,16+8(%rsp)		// save table end sentinel
	movq	%rdi,56+8(%rsp)		// save rp for the tail code

	movq	(%r8),%r8		// %r8 = n0
	movq	(%rsi),%rax		// %rax = ap[0]
	leaq	(%rsi,%r9,1),%rsi	// bias ap so negative %r9 indexes it
	negq	%r9

// ---- first pass: tp = ap * b + m*np, 4 words per iteration ----
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		// m = lo(ap[0]*b) * n0
	leaq	64+8(%rsp),%r14		// %r14 = tp write pointer
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi		// %rdi doubles as a carry word here

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15		// %r15 = loop index (negative, in bytes)
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13
	jmp	.L1st4x

.align	32
.L1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.L1st4x

// Tail of the first pass (last 2 words + top carry).
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	leaq	(%rcx,%r9,1),%rcx	// rewind np

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi			// %rdi = running top carry
	movq	%r13,-8(%r14)

	jmp	.Louter4x

.align	32
// ---- outer loop: one gathered b word per pass until %r12 hits the sentinel ----
.Louter4x:
	leaq	16+128(%r14),%rdx	// masks live relative to current tp tail
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4

	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195		// movq %xmm0,%rbx  (next gathered b word)

	movq	(%r14,%r9,1),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		// m = tp[0]*n0
	movq	%rdx,%r11
	movq	%rdi,(%r14)		// flush previous pass's top carry

	leaq	(%r14,%r9,1),%r14	// rewind tp pointer

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	.Linner4x

.align	32
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	addq	(%r14),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.Linner4x

// Tail of the inner pass.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	%rbp,%rax		// swap m into %rax / np word into %rbp
	movq	-8(%rcx),%rbp
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	movq	%rdi,-16(%r14)
	leaq	(%rcx,%r9,1),%rcx	// rewind np

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%r14),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	cmpq	16+8(%rsp),%r12		// consumed the whole bp table?
	jb	.Louter4x

// Set up for the shared conditional-subtraction tail (.Lsqr4x_sub_entry,
// defined elsewhere in this file): %rax = 0/-1 select mask, %rbx = tp,
// %rbp = np, %rcx = word count / 4, %rdi = rp.
	xorq	%rax,%rax
	subq	%r13,%rbp
	adcq	%r15,%r15
	orq	%r15,%rdi
	subq	%rdi,%rax
	leaq	(%r14,%r9,1),%rbx
	movq	(%rcx),%r12
	leaq	(%rcx),%rbp
	movq	%r9,%rcx
	sarq	$3+2,%rcx
	movq	56+8(%rsp),%rdi
	decq	%r12
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry
.cfi_endproc
.size	mul4x_internal,.-mul4x_internal
// bn_power5_nohw — computes a fixed window exponentiation step:
// five back-to-back Montgomery squarings (__bn_sqr8x_internal +
// __bn_post4x_internal each time) followed by one Montgomery
// multiplication by a gathered table entry (mul4x_internal).
// Frame setup mirrors bn_mul4x_mont_gather5: pick an aligned frame that
// does not alias %rdi modulo 4KB, then page-walk down to it.
// Arguments arrive in the standard SysV registers; %r9d = num (words),
// %r8 = &n0 (dereferenced and stashed at 32(%rsp) for the callees).
.globl	bn_power5_nohw
.hidden bn_power5_nohw
.type	bn_power5_nohw,@function
.align	32
bn_power5_nohw:
.cfi_startproc
_CET_ENDBR
	movq	%rsp,%rax		// keep original stack pointer
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpower5_prologue:

	shll	$3,%r9d			// num in bytes
	leal	(%r9,%r9,2),%r10d	// 3*num bytes = frame demand
	negq	%r9
	movq	(%r8),%r8		// %r8 = n0 value

// Choose frame base %rbp avoiding a 4KB-aliasing conflict with %rdi.
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwr_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwr_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk
	jmp	.Lpwr_page_walk_done

// Probe the new stack region page by page (guard-page order).
.Lpwr_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk
.Lpwr_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)		// n0, read by the internal routines
	movq	%rax,40(%rsp)		// saved original %rsp
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpower5_body:
// Park arguments in xmm registers across the calls (callee clobbers GPRs):
.byte	102,72,15,110,207		// movq %rdi,%xmm1  (rp)
.byte	102,72,15,110,209		// movq %rcx,%xmm2  (np)
.byte	102,73,15,110,218		// movq %r10,%xmm3  (num, bytes)
.byte	102,72,15,110,226		// movq %rdx,%xmm4  (bp table)

	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal

.byte	102,72,15,126,209		// movq %xmm2,%rcx  (restore np)
.byte	102,72,15,126,226		// movq %xmm4,%rdx  (restore bp)
	movq	%rsi,%rdi
	movq	40(%rsp),%rax		// mul4x_internal reads "power" via 8(%rax)
	leaq	32(%rsp),%r8

	call	mul4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			// return 1 (success)
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpower5_epilogue:
	ret
.cfi_endproc
.size	bn_power5_nohw,.-bn_power5_nohw
1220.globl	bn_sqr8x_internal
1221.hidden bn_sqr8x_internal
1222.hidden	bn_sqr8x_internal
1223.type	bn_sqr8x_internal,@function
1224.align	32
1225bn_sqr8x_internal:
1226__bn_sqr8x_internal:
1227.cfi_startproc
1228_CET_ENDBR
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302	leaq	32(%r10),%rbp
1303	leaq	(%rsi,%r9,1),%rsi
1304
1305	movq	%r9,%rcx
1306
1307
1308	movq	-32(%rsi,%rbp,1),%r14
1309	leaq	48+8(%rsp,%r9,2),%rdi
1310	movq	-24(%rsi,%rbp,1),%rax
1311	leaq	-32(%rdi,%rbp,1),%rdi
1312	movq	-16(%rsi,%rbp,1),%rbx
1313	movq	%rax,%r15
1314
1315	mulq	%r14
1316	movq	%rax,%r10
1317	movq	%rbx,%rax
1318	movq	%rdx,%r11
1319	movq	%r10,-24(%rdi,%rbp,1)
1320
1321	mulq	%r14
1322	addq	%rax,%r11
1323	movq	%rbx,%rax
1324	adcq	$0,%rdx
1325	movq	%r11,-16(%rdi,%rbp,1)
1326	movq	%rdx,%r10
1327
1328
1329	movq	-8(%rsi,%rbp,1),%rbx
1330	mulq	%r15
1331	movq	%rax,%r12
1332	movq	%rbx,%rax
1333	movq	%rdx,%r13
1334
1335	leaq	(%rbp),%rcx
1336	mulq	%r14
1337	addq	%rax,%r10
1338	movq	%rbx,%rax
1339	movq	%rdx,%r11
1340	adcq	$0,%r11
1341	addq	%r12,%r10
1342	adcq	$0,%r11
1343	movq	%r10,-8(%rdi,%rcx,1)
1344	jmp	.Lsqr4x_1st
1345
1346.align	32
1347.Lsqr4x_1st:
1348	movq	(%rsi,%rcx,1),%rbx
1349	mulq	%r15
1350	addq	%rax,%r13
1351	movq	%rbx,%rax
1352	movq	%rdx,%r12
1353	adcq	$0,%r12
1354
1355	mulq	%r14
1356	addq	%rax,%r11
1357	movq	%rbx,%rax
1358	movq	8(%rsi,%rcx,1),%rbx
1359	movq	%rdx,%r10
1360	adcq	$0,%r10
1361	addq	%r13,%r11
1362	adcq	$0,%r10
1363
1364
1365	mulq	%r15
1366	addq	%rax,%r12
1367	movq	%rbx,%rax
1368	movq	%r11,(%rdi,%rcx,1)
1369	movq	%rdx,%r13
1370	adcq	$0,%r13
1371
1372	mulq	%r14
1373	addq	%rax,%r10
1374	movq	%rbx,%rax
1375	movq	16(%rsi,%rcx,1),%rbx
1376	movq	%rdx,%r11
1377	adcq	$0,%r11
1378	addq	%r12,%r10
1379	adcq	$0,%r11
1380
1381	mulq	%r15
1382	addq	%rax,%r13
1383	movq	%rbx,%rax
1384	movq	%r10,8(%rdi,%rcx,1)
1385	movq	%rdx,%r12
1386	adcq	$0,%r12
1387
1388	mulq	%r14
1389	addq	%rax,%r11
1390	movq	%rbx,%rax
1391	movq	24(%rsi,%rcx,1),%rbx
1392	movq	%rdx,%r10
1393	adcq	$0,%r10
1394	addq	%r13,%r11
1395	adcq	$0,%r10
1396
1397
1398	mulq	%r15
1399	addq	%rax,%r12
1400	movq	%rbx,%rax
1401	movq	%r11,16(%rdi,%rcx,1)
1402	movq	%rdx,%r13
1403	adcq	$0,%r13
1404	leaq	32(%rcx),%rcx
1405
1406	mulq	%r14
1407	addq	%rax,%r10
1408	movq	%rbx,%rax
1409	movq	%rdx,%r11
1410	adcq	$0,%r11
1411	addq	%r12,%r10
1412	adcq	$0,%r11
1413	movq	%r10,-8(%rdi,%rcx,1)
1414
1415	cmpq	$0,%rcx
1416	jne	.Lsqr4x_1st
1417
1418	mulq	%r15
1419	addq	%rax,%r13
1420	leaq	16(%rbp),%rbp
1421	adcq	$0,%rdx
1422	addq	%r11,%r13
1423	adcq	$0,%rdx
1424
1425	movq	%r13,(%rdi)
1426	movq	%rdx,%r12
1427	movq	%rdx,8(%rdi)
1428	jmp	.Lsqr4x_outer
1429
1430.align	32
1431.Lsqr4x_outer:
1432	movq	-32(%rsi,%rbp,1),%r14
1433	leaq	48+8(%rsp,%r9,2),%rdi
1434	movq	-24(%rsi,%rbp,1),%rax
1435	leaq	-32(%rdi,%rbp,1),%rdi
1436	movq	-16(%rsi,%rbp,1),%rbx
1437	movq	%rax,%r15
1438
1439	mulq	%r14
1440	movq	-24(%rdi,%rbp,1),%r10
1441	addq	%rax,%r10
1442	movq	%rbx,%rax
1443	adcq	$0,%rdx
1444	movq	%r10,-24(%rdi,%rbp,1)
1445	movq	%rdx,%r11
1446
1447	mulq	%r14
1448	addq	%rax,%r11
1449	movq	%rbx,%rax
1450	adcq	$0,%rdx
1451	addq	-16(%rdi,%rbp,1),%r11
1452	movq	%rdx,%r10
1453	adcq	$0,%r10
1454	movq	%r11,-16(%rdi,%rbp,1)
1455
1456	xorq	%r12,%r12
1457
1458	movq	-8(%rsi,%rbp,1),%rbx
1459	mulq	%r15
1460	addq	%rax,%r12
1461	movq	%rbx,%rax
1462	adcq	$0,%rdx
1463	addq	-8(%rdi,%rbp,1),%r12
1464	movq	%rdx,%r13
1465	adcq	$0,%r13
1466
1467	mulq	%r14
1468	addq	%rax,%r10
1469	movq	%rbx,%rax
1470	adcq	$0,%rdx
1471	addq	%r12,%r10
1472	movq	%rdx,%r11
1473	adcq	$0,%r11
1474	movq	%r10,-8(%rdi,%rbp,1)
1475
1476	leaq	(%rbp),%rcx
1477	jmp	.Lsqr4x_inner
1478
1479.align	32
1480.Lsqr4x_inner:
1481	movq	(%rsi,%rcx,1),%rbx
1482	mulq	%r15
1483	addq	%rax,%r13
1484	movq	%rbx,%rax
1485	movq	%rdx,%r12
1486	adcq	$0,%r12
1487	addq	(%rdi,%rcx,1),%r13
1488	adcq	$0,%r12
1489
1490.byte	0x67
1491	mulq	%r14
1492	addq	%rax,%r11
1493	movq	%rbx,%rax
1494	movq	8(%rsi,%rcx,1),%rbx
1495	movq	%rdx,%r10
1496	adcq	$0,%r10
1497	addq	%r13,%r11
1498	adcq	$0,%r10
1499
1500	mulq	%r15
1501	addq	%rax,%r12
1502	movq	%r11,(%rdi,%rcx,1)
1503	movq	%rbx,%rax
1504	movq	%rdx,%r13
1505	adcq	$0,%r13
1506	addq	8(%rdi,%rcx,1),%r12
1507	leaq	16(%rcx),%rcx
1508	adcq	$0,%r13
1509
1510	mulq	%r14
1511	addq	%rax,%r10
1512	movq	%rbx,%rax
1513	adcq	$0,%rdx
1514	addq	%r12,%r10
1515	movq	%rdx,%r11
1516	adcq	$0,%r11
1517	movq	%r10,-8(%rdi,%rcx,1)
1518
1519	cmpq	$0,%rcx
1520	jne	.Lsqr4x_inner
1521
1522.byte	0x67
1523	mulq	%r15
1524	addq	%rax,%r13
1525	adcq	$0,%rdx
1526	addq	%r11,%r13
1527	adcq	$0,%rdx
1528
1529	movq	%r13,(%rdi)
1530	movq	%rdx,%r12
1531	movq	%rdx,8(%rdi)
1532
1533	addq	$16,%rbp
1534	jnz	.Lsqr4x_outer
1535
1536
1537	movq	-32(%rsi),%r14
1538	leaq	48+8(%rsp,%r9,2),%rdi
1539	movq	-24(%rsi),%rax
1540	leaq	-32(%rdi,%rbp,1),%rdi
1541	movq	-16(%rsi),%rbx
1542	movq	%rax,%r15
1543
1544	mulq	%r14
1545	addq	%rax,%r10
1546	movq	%rbx,%rax
1547	movq	%rdx,%r11
1548	adcq	$0,%r11
1549
1550	mulq	%r14
1551	addq	%rax,%r11
1552	movq	%rbx,%rax
1553	movq	%r10,-24(%rdi)
1554	movq	%rdx,%r10
1555	adcq	$0,%r10
1556	addq	%r13,%r11
1557	movq	-8(%rsi),%rbx
1558	adcq	$0,%r10
1559
1560	mulq	%r15
1561	addq	%rax,%r12
1562	movq	%rbx,%rax
1563	movq	%r11,-16(%rdi)
1564	movq	%rdx,%r13
1565	adcq	$0,%r13
1566
1567	mulq	%r14
1568	addq	%rax,%r10
1569	movq	%rbx,%rax
1570	movq	%rdx,%r11
1571	adcq	$0,%r11
1572	addq	%r12,%r10
1573	adcq	$0,%r11
1574	movq	%r10,-8(%rdi)
1575
1576	mulq	%r15
1577	addq	%rax,%r13
1578	movq	-16(%rsi),%rax
1579	adcq	$0,%rdx
1580	addq	%r11,%r13
1581	adcq	$0,%rdx
1582
1583	movq	%r13,(%rdi)
1584	movq	%rdx,%r12
1585	movq	%rdx,8(%rdi)
1586
1587	mulq	%rbx
1588	addq	$16,%rbp
1589	xorq	%r14,%r14
1590	subq	%r9,%rbp
1591	xorq	%r15,%r15
1592
1593	addq	%r12,%rax
1594	adcq	$0,%rdx
1595	movq	%rax,8(%rdi)
1596	movq	%rdx,16(%rdi)
1597	movq	%r15,24(%rdi)
1598
1599	movq	-16(%rsi,%rbp,1),%rax
1600	leaq	48+8(%rsp),%rdi
1601	xorq	%r10,%r10
1602	movq	8(%rdi),%r11
1603
1604	leaq	(%r14,%r10,2),%r12
1605	shrq	$63,%r10
1606	leaq	(%rcx,%r11,2),%r13
1607	shrq	$63,%r11
1608	orq	%r10,%r13
1609	movq	16(%rdi),%r10
1610	movq	%r11,%r14
1611	mulq	%rax
1612	negq	%r15
1613	movq	24(%rdi),%r11
1614	adcq	%rax,%r12
1615	movq	-8(%rsi,%rbp,1),%rax
1616	movq	%r12,(%rdi)
1617	adcq	%rdx,%r13
1618
1619	leaq	(%r14,%r10,2),%rbx
1620	movq	%r13,8(%rdi)
1621	sbbq	%r15,%r15
1622	shrq	$63,%r10
1623	leaq	(%rcx,%r11,2),%r8
1624	shrq	$63,%r11
1625	orq	%r10,%r8
1626	movq	32(%rdi),%r10
1627	movq	%r11,%r14
1628	mulq	%rax
1629	negq	%r15
1630	movq	40(%rdi),%r11
1631	adcq	%rax,%rbx
1632	movq	0(%rsi,%rbp,1),%rax
1633	movq	%rbx,16(%rdi)
1634	adcq	%rdx,%r8
1635	leaq	16(%rbp),%rbp
1636	movq	%r8,24(%rdi)
1637	sbbq	%r15,%r15
1638	leaq	64(%rdi),%rdi
1639	jmp	.Lsqr4x_shift_n_add
1640
1641.align	32
1642.Lsqr4x_shift_n_add:
1643	leaq	(%r14,%r10,2),%r12
1644	shrq	$63,%r10
1645	leaq	(%rcx,%r11,2),%r13
1646	shrq	$63,%r11
1647	orq	%r10,%r13
1648	movq	-16(%rdi),%r10
1649	movq	%r11,%r14
1650	mulq	%rax
1651	negq	%r15
1652	movq	-8(%rdi),%r11
1653	adcq	%rax,%r12
1654	movq	-8(%rsi,%rbp,1),%rax
1655	movq	%r12,-32(%rdi)
1656	adcq	%rdx,%r13
1657
1658	leaq	(%r14,%r10,2),%rbx
1659	movq	%r13,-24(%rdi)
1660	sbbq	%r15,%r15
1661	shrq	$63,%r10
1662	leaq	(%rcx,%r11,2),%r8
1663	shrq	$63,%r11
1664	orq	%r10,%r8
1665	movq	0(%rdi),%r10
1666	movq	%r11,%r14
1667	mulq	%rax
1668	negq	%r15
1669	movq	8(%rdi),%r11
1670	adcq	%rax,%rbx
1671	movq	0(%rsi,%rbp,1),%rax
1672	movq	%rbx,-16(%rdi)
1673	adcq	%rdx,%r8
1674
1675	leaq	(%r14,%r10,2),%r12
1676	movq	%r8,-8(%rdi)
1677	sbbq	%r15,%r15
1678	shrq	$63,%r10
1679	leaq	(%rcx,%r11,2),%r13
1680	shrq	$63,%r11
1681	orq	%r10,%r13
1682	movq	16(%rdi),%r10
1683	movq	%r11,%r14
1684	mulq	%rax
1685	negq	%r15
1686	movq	24(%rdi),%r11
1687	adcq	%rax,%r12
1688	movq	8(%rsi,%rbp,1),%rax
1689	movq	%r12,0(%rdi)
1690	adcq	%rdx,%r13
1691
1692	leaq	(%r14,%r10,2),%rbx
1693	movq	%r13,8(%rdi)
1694	sbbq	%r15,%r15
1695	shrq	$63,%r10
1696	leaq	(%rcx,%r11,2),%r8
1697	shrq	$63,%r11
1698	orq	%r10,%r8
1699	movq	32(%rdi),%r10
1700	movq	%r11,%r14
1701	mulq	%rax
1702	negq	%r15
1703	movq	40(%rdi),%r11
1704	adcq	%rax,%rbx
1705	movq	16(%rsi,%rbp,1),%rax
1706	movq	%rbx,16(%rdi)
1707	adcq	%rdx,%r8
1708	movq	%r8,24(%rdi)
1709	sbbq	%r15,%r15
1710	leaq	64(%rdi),%rdi
1711	addq	$32,%rbp
1712	jnz	.Lsqr4x_shift_n_add
1713
1714	leaq	(%r14,%r10,2),%r12
1715.byte	0x67
1716	shrq	$63,%r10
1717	leaq	(%rcx,%r11,2),%r13
1718	shrq	$63,%r11
1719	orq	%r10,%r13
1720	movq	-16(%rdi),%r10
1721	movq	%r11,%r14
1722	mulq	%rax
1723	negq	%r15
1724	movq	-8(%rdi),%r11
1725	adcq	%rax,%r12
1726	movq	-8(%rsi),%rax
1727	movq	%r12,-32(%rdi)
1728	adcq	%rdx,%r13
1729
1730	leaq	(%r14,%r10,2),%rbx
1731	movq	%r13,-24(%rdi)
1732	sbbq	%r15,%r15
1733	shrq	$63,%r10
1734	leaq	(%rcx,%r11,2),%r8
1735	shrq	$63,%r11
1736	orq	%r10,%r8
1737	mulq	%rax
1738	negq	%r15
1739	adcq	%rax,%rbx
1740	adcq	%rdx,%r8
1741	movq	%rbx,-16(%rdi)
1742	movq	%r8,-8(%rdi)
1743.byte	102,72,15,126,213
1744__bn_sqr8x_reduction:
1745	xorq	%rax,%rax
1746	leaq	(%r9,%rbp,1),%rcx
1747	leaq	48+8(%rsp,%r9,2),%rdx
1748	movq	%rcx,0+8(%rsp)
1749	leaq	48+8(%rsp,%r9,1),%rdi
1750	movq	%rdx,8+8(%rsp)
1751	negq	%r9
1752	jmp	.L8x_reduction_loop
1753
1754.align	32
1755.L8x_reduction_loop:
1756	leaq	(%rdi,%r9,1),%rdi
1757.byte	0x66
1758	movq	0(%rdi),%rbx
1759	movq	8(%rdi),%r9
1760	movq	16(%rdi),%r10
1761	movq	24(%rdi),%r11
1762	movq	32(%rdi),%r12
1763	movq	40(%rdi),%r13
1764	movq	48(%rdi),%r14
1765	movq	56(%rdi),%r15
1766	movq	%rax,(%rdx)
1767	leaq	64(%rdi),%rdi
1768
1769.byte	0x67
1770	movq	%rbx,%r8
1771	imulq	32+8(%rsp),%rbx
1772	movq	0(%rbp),%rax
1773	movl	$8,%ecx
1774	jmp	.L8x_reduce
1775
1776.align	32
1777.L8x_reduce:
1778	mulq	%rbx
1779	movq	8(%rbp),%rax
1780	negq	%r8
1781	movq	%rdx,%r8
1782	adcq	$0,%r8
1783
1784	mulq	%rbx
1785	addq	%rax,%r9
1786	movq	16(%rbp),%rax
1787	adcq	$0,%rdx
1788	addq	%r9,%r8
1789	movq	%rbx,48-8+8(%rsp,%rcx,8)
1790	movq	%rdx,%r9
1791	adcq	$0,%r9
1792
1793	mulq	%rbx
1794	addq	%rax,%r10
1795	movq	24(%rbp),%rax
1796	adcq	$0,%rdx
1797	addq	%r10,%r9
1798	movq	32+8(%rsp),%rsi
1799	movq	%rdx,%r10
1800	adcq	$0,%r10
1801
1802	mulq	%rbx
1803	addq	%rax,%r11
1804	movq	32(%rbp),%rax
1805	adcq	$0,%rdx
1806	imulq	%r8,%rsi
1807	addq	%r11,%r10
1808	movq	%rdx,%r11
1809	adcq	$0,%r11
1810
1811	mulq	%rbx
1812	addq	%rax,%r12
1813	movq	40(%rbp),%rax
1814	adcq	$0,%rdx
1815	addq	%r12,%r11
1816	movq	%rdx,%r12
1817	adcq	$0,%r12
1818
1819	mulq	%rbx
1820	addq	%rax,%r13
1821	movq	48(%rbp),%rax
1822	adcq	$0,%rdx
1823	addq	%r13,%r12
1824	movq	%rdx,%r13
1825	adcq	$0,%r13
1826
1827	mulq	%rbx
1828	addq	%rax,%r14
1829	movq	56(%rbp),%rax
1830	adcq	$0,%rdx
1831	addq	%r14,%r13
1832	movq	%rdx,%r14
1833	adcq	$0,%r14
1834
1835	mulq	%rbx
1836	movq	%rsi,%rbx
1837	addq	%rax,%r15
1838	movq	0(%rbp),%rax
1839	adcq	$0,%rdx
1840	addq	%r15,%r14
1841	movq	%rdx,%r15
1842	adcq	$0,%r15
1843
1844	decl	%ecx
1845	jnz	.L8x_reduce
1846
1847	leaq	64(%rbp),%rbp
1848	xorq	%rax,%rax
1849	movq	8+8(%rsp),%rdx
1850	cmpq	0+8(%rsp),%rbp
1851	jae	.L8x_no_tail
1852
1853.byte	0x66
1854	addq	0(%rdi),%r8
1855	adcq	8(%rdi),%r9
1856	adcq	16(%rdi),%r10
1857	adcq	24(%rdi),%r11
1858	adcq	32(%rdi),%r12
1859	adcq	40(%rdi),%r13
1860	adcq	48(%rdi),%r14
1861	adcq	56(%rdi),%r15
1862	sbbq	%rsi,%rsi
1863
1864	movq	48+56+8(%rsp),%rbx
1865	movl	$8,%ecx
1866	movq	0(%rbp),%rax
1867	jmp	.L8x_tail
1868
1869.align	32
1870.L8x_tail:
1871	mulq	%rbx
1872	addq	%rax,%r8
1873	movq	8(%rbp),%rax
1874	movq	%r8,(%rdi)
1875	movq	%rdx,%r8
1876	adcq	$0,%r8
1877
1878	mulq	%rbx
1879	addq	%rax,%r9
1880	movq	16(%rbp),%rax
1881	adcq	$0,%rdx
1882	addq	%r9,%r8
1883	leaq	8(%rdi),%rdi
1884	movq	%rdx,%r9
1885	adcq	$0,%r9
1886
1887	mulq	%rbx
1888	addq	%rax,%r10
1889	movq	24(%rbp),%rax
1890	adcq	$0,%rdx
1891	addq	%r10,%r9
1892	movq	%rdx,%r10
1893	adcq	$0,%r10
1894
1895	mulq	%rbx
1896	addq	%rax,%r11
1897	movq	32(%rbp),%rax
1898	adcq	$0,%rdx
1899	addq	%r11,%r10
1900	movq	%rdx,%r11
1901	adcq	$0,%r11
1902
1903	mulq	%rbx
1904	addq	%rax,%r12
1905	movq	40(%rbp),%rax
1906	adcq	$0,%rdx
1907	addq	%r12,%r11
1908	movq	%rdx,%r12
1909	adcq	$0,%r12
1910
1911	mulq	%rbx
1912	addq	%rax,%r13
1913	movq	48(%rbp),%rax
1914	adcq	$0,%rdx
1915	addq	%r13,%r12
1916	movq	%rdx,%r13
1917	adcq	$0,%r13
1918
1919	mulq	%rbx
1920	addq	%rax,%r14
1921	movq	56(%rbp),%rax
1922	adcq	$0,%rdx
1923	addq	%r14,%r13
1924	movq	%rdx,%r14
1925	adcq	$0,%r14
1926
1927	mulq	%rbx
1928	movq	48-16+8(%rsp,%rcx,8),%rbx
1929	addq	%rax,%r15
1930	adcq	$0,%rdx
1931	addq	%r15,%r14
1932	movq	0(%rbp),%rax
1933	movq	%rdx,%r15
1934	adcq	$0,%r15
1935
1936	decl	%ecx
1937	jnz	.L8x_tail
1938
1939	leaq	64(%rbp),%rbp
1940	movq	8+8(%rsp),%rdx
1941	cmpq	0+8(%rsp),%rbp
1942	jae	.L8x_tail_done
1943
1944	movq	48+56+8(%rsp),%rbx
1945	negq	%rsi
1946	movq	0(%rbp),%rax
1947	adcq	0(%rdi),%r8
1948	adcq	8(%rdi),%r9
1949	adcq	16(%rdi),%r10
1950	adcq	24(%rdi),%r11
1951	adcq	32(%rdi),%r12
1952	adcq	40(%rdi),%r13
1953	adcq	48(%rdi),%r14
1954	adcq	56(%rdi),%r15
1955	sbbq	%rsi,%rsi
1956
1957	movl	$8,%ecx
1958	jmp	.L8x_tail
1959
1960.align	32
1961.L8x_tail_done:
1962	xorq	%rax,%rax
1963	addq	(%rdx),%r8
1964	adcq	$0,%r9
1965	adcq	$0,%r10
1966	adcq	$0,%r11
1967	adcq	$0,%r12
1968	adcq	$0,%r13
1969	adcq	$0,%r14
1970	adcq	$0,%r15
1971	adcq	$0,%rax
1972
1973	negq	%rsi
1974.L8x_no_tail:
1975	adcq	0(%rdi),%r8
1976	adcq	8(%rdi),%r9
1977	adcq	16(%rdi),%r10
1978	adcq	24(%rdi),%r11
1979	adcq	32(%rdi),%r12
1980	adcq	40(%rdi),%r13
1981	adcq	48(%rdi),%r14
1982	adcq	56(%rdi),%r15
1983	adcq	$0,%rax
1984	movq	-8(%rbp),%rcx
1985	xorq	%rsi,%rsi
1986
1987.byte	102,72,15,126,213
1988
1989	movq	%r8,0(%rdi)
1990	movq	%r9,8(%rdi)
1991.byte	102,73,15,126,217
1992	movq	%r10,16(%rdi)
1993	movq	%r11,24(%rdi)
1994	movq	%r12,32(%rdi)
1995	movq	%r13,40(%rdi)
1996	movq	%r14,48(%rdi)
1997	movq	%r15,56(%rdi)
1998	leaq	64(%rdi),%rdi
1999
2000	cmpq	%rdx,%rdi
2001	jb	.L8x_reduction_loop
2002	ret
2003.cfi_endproc
2004.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal — final conditional subtraction after a Montgomery
 * operation: writes rp[i] = tp[i] + (~n[i] & mask) with an adc carry chain,
 * i.e. subtracts the modulus n exactly when the borrow mask in %rax is
 * all-ones, four 64-bit words per iteration.
 *
 * In (register convention, as used by the callers in this file):
 *   %rbp = modulus n, %rbx advanced over tp (set below from %rdi+%r9),
 *   %rax = 0 or ~0 select mask (negated on entry), %r9 = length in bytes
 *   (negative; restored to negative on exit), %xmm1 = saved rp.
 * Out: result stored at rp (%rdi); %r10 = %r9 (pre-negation), %r9 negated.
 * NOTE(review): mask/length sign conventions inferred from the visible
 * negq/sarq usage — confirm against the perlasm source if modifying callers.
 */
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
.cfi_startproc
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
.byte	102,72,15,126,207	# movq %xmm1,%rdi (restore output pointer rp)
	negq	%rax
.byte	102,72,15,126,206	# movq %xmm1,%rsi
	sarq	$3+2,%rcx		# byte count -> number of 4-word (32-byte) groups
	decq	%r12			# n[0]-1: prepare ~n[0] via the notq below? no —
					# pre-decrement feeds the shared not/and path below
	xorq	%r10,%r10		# clear running borrow
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
/* Main loop: load next 4 modulus words, then fall into the shared entry. */
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12		# mask ? ~n[i] : 0  (constant-time select)
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10			# reload borrow into CF
	adcq	0(%rbx),%r12		# tp[i] + (~n[i]&mask) + carry == tp[i]-n[i] when masked
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10		# save borrow for next group
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx			# %rcx is negative group count; stop at 0
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9			# restore %r9 to its negative-length convention
	ret
.cfi_endproc
.size	__bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_mulx4x_mont_gather5 — MULX/ADX entry point for Montgomery
 * multiplication with the constant-time "gather5" lookup of the b operand.
 * SysV AMD64: %rdi=rp, %rsi=ap, %rdx=bp(table), %rcx=np, %r8=&n0, %r9d=num
 * (words); the table index arrives on the caller's stack (read later by
 * mulx4x_internal via the saved %rax). This wrapper only builds the scratch
 * frame and delegates all arithmetic to mulx4x_internal.
 */
.globl	bn_mulx4x_mont_gather5
.hidden bn_mulx4x_mont_gather5
.type	bn_mulx4x_mont_gather5,@function
.align	32
bn_mulx4x_mont_gather5:
.cfi_startproc
_CET_ENDBR
	movq	%rsp,%rax		# keep original %rsp for the epilogue
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d			# num: words -> bytes
	leaq	(%r9,%r9,2),%r10	# %r10 = 3*num bytes
	negq	%r9
	movq	(%r8),%r8		# load the n0 word itself

	/*
	 * Frame placement: carve ~2*num+320 bytes below %rsp, adjusted so the
	 * frame does not alias rp (%rdi) modulo 4096 — a cache/page-conflict
	 * avoidance heuristic inherited from the perlasm script.
	 */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmulx4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmulx4xsp_done:
	andq	$-64,%rbp		# 64-byte align the new stack
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

/* Touch each 4K page of the new frame so the guard page is grown safely. */
.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	/*
	 * Frame slots used by mulx4x_internal:
	 *   32(%rsp) = n0, 40(%rsp) = saved original %rsp (also how the
	 *   stack-passed table index is reached, at 8 off the saved %rsp).
	 */
	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi		# recover original %rsp
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			# return 1 (success)

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	ret
.cfi_endproc
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2176
/*
 * mulx4x_internal — core Montgomery multiplication, 4 limbs per inner
 * iteration, using MULX plus the dual ADCX/ADOX carry chains.
 * Expects the frame laid out by its callers (bn_mulx4x_mont_gather5 /
 * bn_powerx5): 32+8(%rsp)=n0, %rax=caller's original %rsp (the gather index
 * is at 8(%rax)), %rdi=rp, %rsi=ap, %rdx=bp table base, %rcx=np, %r9=num
 * in bytes. The b[i] operands are fetched from a 32-entry interleaved table
 * with SSE mask/or selection so the access pattern is index-independent
 * (constant time). Ends by preparing the conditional-subtract state and
 * jumping into .Lsqrx4x_sub_entry (defined in __bn_postx4x_internal,
 * outside this excerpt).
 */
.type	mulx4x_internal,@function
.align	32
mulx4x_internal:
.cfi_startproc
	movq	%r9,8(%rsp)		# save num (bytes)
	movq	%r9,%r10
	negq	%r9
	shlq	$5,%r9
	negq	%r10
	leaq	128(%rdx,%r9,1),%r13	# end of gather table
	shrq	$5+5,%r9
	movd	8(%rax),%xmm5		# gather index from the caller's stack
	subq	$1,%r9
	leaq	.Linc(%rip),%rax
	movq	%r13,16+8(%rsp)
	movq	%r9,24+8(%rsp)		# inner-loop trip count
	movq	%rdi,56+8(%rsp)		# save rp for the final reduction
	movdqa	0(%rax),%xmm0		# .Linc: {0,1} then {2,2} increments
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r10,1),%r10	# scratch for the 16 compare masks
	leaq	128(%rdx),%rdi		# %rdi = bp+128 (table midpoint)

	/*
	 * Build sixteen 128-bit equality masks: mask[k] = (k == index) ? ~0 : 0,
	 * by incrementing a counter vector and pcmpeqd against the broadcast
	 * index in %xmm5. Stored at 112..352(%r10).
	 */
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67
	movdqa	%xmm1,%xmm2
.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	/*
	 * Constant-time gather of b[0]: AND every table entry with its mask
	 * and OR everything together; only the selected entry survives.
	 */
	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0

	pshufd	$0x4e,%xmm0,%xmm1	# fold high qword into low
	por	%xmm1,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194	# movq %xmm0,%rdx — %rdx = gathered b[0]
	leaq	64+32+8(%rsp),%rbx	# %rbx = tp write cursor

	/* First 4 limbs of a[]*b[0], then fold in the n0 reduction word. */
	movq	%rdx,%r9
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		# m = lo * n0 (Montgomery quotient digit)
	xorq	%rbp,%rbp		# %rbp = 0; also clears CF/OF for adcx/adox
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10	# add m*n[0..3], dropping the low limb
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	.Lmulx4x_1st

.align	32
/* Inner loop of the first outer iteration: a[]*b[0] + m*n[], 4 limbs/pass. */
.Lmulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	8(%rsp),%rax		# num (bytes)
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi	# rewind ap
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi
	adcq	%rbp,%rbp		# capture top carry
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
/*
 * Outer loop: constant-time gather of the next b[i] (same mask/or scheme,
 * masks addressed relative to the moving tp cursor), then one pass of
 * tp += a[]*b[i] + m*n[] via the .Lmulx4x_inner loop.
 */
.Lmulx4x_outer:
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4

	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194	# movq %xmm0,%rdx — %rdx = gathered b[i]

	movq	%rbp,(%rbx)		# store carried-out top word
	leaq	32(%rbx,%rax,1),%rbx	# rewind tp cursor
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp		# zero + clear CF/OF
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8		# fold in previous tp limbs
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx	# rewind np
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		# next Montgomery quotient digit

	movq	%r8,%rdx
	xorq	%rbp,%rbp
	movq	%rdi,8+8(%rsp)

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	.Lmulx4x_inner

.align	32
/* Inner loop of subsequent outer iterations: tp += a[]*b[i] + m*n[]. */
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0+8(%rsp),%rax		# num (bytes)
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi		# %rdi was 0; generates borrow for carry fold
	movq	8+8(%rsp),%rdi
	movq	16+8(%rsp),%r10		# end-of-table sentinel
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi	# rewind ap
	adcq	%rbp,%rbp
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi
	jb	.Lmulx4x_outer

	/*
	 * All b[i] consumed: set up the final conditional subtraction
	 * (mask from top-word comparison + carry) and tail into
	 * .Lsqrx4x_sub_entry of __bn_postx4x_internal (outside this excerpt).
	 */
	movq	-8(%rcx),%r10		# top modulus word
	movq	%rbp,%r8
	movq	(%rcx,%rax,1),%r12
	leaq	(%rcx,%rax,1),%rbp	# %rbp = start of modulus
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10		# compare n[last] with tp top word
	adcq	%r15,%r15
	orq	%r15,%r8
	sarq	$3+2,%rcx		# bytes -> negative 32-byte group count
	subq	%r8,%rax		# %rax = 0 or ~0 subtraction mask
	movq	56+8(%rsp),%rdx		# rp
	decq	%r12
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry
.cfi_endproc
.size	mulx4x_internal,.-mulx4x_internal
/*
 * bn_powerx5 — MULX/ADX variant of bn_power5: performs five Montgomery
 * squarings (__bn_sqrx8x_internal + __bn_postx4x_internal each time) and
 * then one Montgomery multiplication (mulx4x_internal), i.e. one window
 * step of fixed-window modular exponentiation.
 * SysV AMD64: %rdi=rp, %rsi=ap, %rdx=bp(table), %rcx=np, %r8=&n0, %r9d=num
 * (words); the gather index is on the caller's stack, reached through the
 * saved original %rsp. Frame setup mirrors bn_mulx4x_mont_gather5.
 */
.globl	bn_powerx5
.hidden bn_powerx5
.type	bn_powerx5,@function
.align	32
bn_powerx5:
.cfi_startproc
_CET_ENDBR
	movq	%rsp,%rax		# keep original %rsp for the epilogue
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpowerx5_prologue:

	shll	$3,%r9d			# num: words -> bytes
	leaq	(%r9,%r9,2),%r10	# 3*num
	negq	%r9
	movq	(%r8),%r8		# load the n0 word itself

	/* Frame placement avoiding aliasing with rp mod 4096 (see mulx entry). */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwrx_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwrx_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

/* Probe each 4K page of the new frame (guard-page growth). */
.Lpwrx_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	/* Stash pointers in xmm regs so the internal calls may clobber GPRs. */
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207	# movq %rdi,%xmm1 (rp)
.byte	102,72,15,110,209	# movq %rcx,%xmm2 (np)
.byte	102,73,15,110,218	# movq %r10,%xmm3
.byte	102,72,15,110,226	# movq %rdx,%xmm4 (bp)
	movq	%r8,32(%rsp)		# n0
	movq	%rax,40(%rsp)		# saved original %rsp
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpowerx5_body:

	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

	movq	%r10,%r9
	movq	%rsi,%rdi
.byte	102,72,15,126,209	# movq %xmm2,%rcx — restore np
.byte	102,72,15,126,226	# movq %xmm4,%rdx — restore bp
	movq	40(%rsp),%rax

	call	mulx4x_internal

	movq	40(%rsp),%rsi		# recover original %rsp
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			# return 1 (success)

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpowerx5_epilogue:
	ret
.cfi_endproc
.size	bn_powerx5,.-bn_powerx5
2742
/*
 * bn_sqrx8x_internal / __bn_sqrx8x_internal — MULX/ADCX/ADOX Montgomery
 * squaring core, 8 limbs per pass.  Phases:
 *   1. zero the 2*num-limb scratch area;
 *   2. accumulate the off-diagonal products a[i]*a[j], i<j
 *      (.Lsqrx8x_outer_loop / .Lsqrx8x_loop);
 *   3. double them and add the diagonal squares a[i]^2
 *      (.Lsqrx4x_shift_n_add);
 *   4. Montgomery-reduce the double-width result
 *      (__bn_sqrx8x_reduction and its reduce/tail loops).
 * Register convention (as used by bn_powerx5 above): %rsi=ap, %r9=num in
 * bytes, frame slots 32+8(%rsp)=n0, pointers stashed in %xmm1-%xmm4 by the
 * caller; scratch tp lives at 48+8(%rsp).
 */
.globl	bn_sqrx8x_internal
.hidden bn_sqrx8x_internal
.hidden	bn_sqrx8x_internal
.type	bn_sqrx8x_internal,@function
.align	32
bn_sqrx8x_internal:
__bn_sqrx8x_internal:
.cfi_startproc
_CET_ENDBR

	leaq	48+8(%rsp),%rdi
	leaq	(%rsi,%r9,1),%rbp	# %rbp = end of ap
	movq	%r9,0+8(%rsp)
	movq	%rbp,8+8(%rsp)
	jmp	.Lsqr8x_zero_start

.align	32
.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
/* Zero the 2*num-limb scratch area, 128 bytes per iteration. */
.Lsqrx8x_zero:
.byte	0x3e
	movdqa	%xmm0,0(%rdi)
	movdqa	%xmm0,16(%rdi)
	movdqa	%xmm0,32(%rdi)
	movdqa	%xmm0,48(%rdi)
.Lsqr8x_zero_start:
	movdqa	%xmm0,64(%rdi)
	movdqa	%xmm0,80(%rdi)
	movdqa	%xmm0,96(%rdi)
	movdqa	%xmm0,112(%rdi)
	leaq	128(%rdi),%rdi
	subq	$64,%r9
	jnz	.Lsqrx8x_zero

	movq	0(%rsi),%rdx		# a[0] for the first mulx column

	xorq	%r10,%r10
	xorq	%r11,%r11
	xorq	%r12,%r12
	xorq	%r13,%r13
	xorq	%r14,%r14
	xorq	%r15,%r15
	leaq	48+8(%rsp),%rdi
	xorq	%rbp,%rbp		# %rbp = constant 0 for adcx/adox
	jmp	.Lsqrx8x_outer_loop

.align	32
/*
 * Off-diagonal accumulation for one 8-limb strip: products a[i]*a[j]
 * within the strip (raw .byte sequences are perlasm-emitted mulx
 * encodings, kept byte-for-byte).
 */
.Lsqrx8x_outer_loop:
	mulxq	8(%rsi),%r8,%rax
	adcxq	%r9,%r8
	adoxq	%rax,%r10
	mulxq	16(%rsi),%r9,%rax
	adcxq	%r10,%r9
	adoxq	%rax,%r11
.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx encoding
	adcxq	%r11,%r10
	adoxq	%rax,%r12
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx encoding
	adcxq	%r12,%r11
	adoxq	%rax,%r13
	mulxq	40(%rsi),%r12,%rax
	adcxq	%r13,%r12
	adoxq	%rax,%r14
	mulxq	48(%rsi),%r13,%rax
	adcxq	%r14,%r13
	adoxq	%r15,%rax
	mulxq	56(%rsi),%r14,%r15
	movq	8(%rsi),%rdx
	adcxq	%rax,%r14
	adoxq	%rbp,%r15
	adcq	64(%rdi),%r15
	movq	%r8,8(%rdi)
	movq	%r9,16(%rdi)
	sbbq	%rcx,%rcx		# save carry
	xorq	%rbp,%rbp		# clear CF/OF for next chain


	mulxq	16(%rsi),%r8,%rbx
	mulxq	24(%rsi),%r9,%rax
	adcxq	%r10,%r8
	adoxq	%rbx,%r9
	mulxq	32(%rsi),%r10,%rbx
	adcxq	%r11,%r9
	adoxq	%rax,%r10
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx encoding
	adcxq	%r12,%r10
	adoxq	%rbx,%r11
.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx encoding
	adcxq	%r13,%r11
	adoxq	%r14,%r12
.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx encoding
	movq	16(%rsi),%rdx
	adcxq	%rax,%r12
	adoxq	%rbx,%r13
	adcxq	%r15,%r13
	adoxq	%rbp,%r14
	adcxq	%rbp,%r14

	movq	%r8,24(%rdi)
	movq	%r9,32(%rdi)

	mulxq	24(%rsi),%r8,%rbx
	mulxq	32(%rsi),%r9,%rax
	adcxq	%r10,%r8
	adoxq	%rbx,%r9
	mulxq	40(%rsi),%r10,%rbx
	adcxq	%r11,%r9
	adoxq	%rax,%r10
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx encoding
	adcxq	%r12,%r10
	adoxq	%r13,%r11
.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx encoding
.byte	0x3e
	movq	24(%rsi),%rdx
	adcxq	%rbx,%r11
	adoxq	%rax,%r12
	adcxq	%r14,%r12
	movq	%r8,40(%rdi)
	movq	%r9,48(%rdi)
	mulxq	32(%rsi),%r8,%rax
	adoxq	%rbp,%r13
	adcxq	%rbp,%r13

	mulxq	40(%rsi),%r9,%rbx
	adcxq	%r10,%r8
	adoxq	%rax,%r9
	mulxq	48(%rsi),%r10,%rax
	adcxq	%r11,%r9
	adoxq	%r12,%r10
	mulxq	56(%rsi),%r11,%r12
	movq	32(%rsi),%rdx
	movq	40(%rsi),%r14
	adcxq	%rbx,%r10
	adoxq	%rax,%r11
	movq	48(%rsi),%r15
	adcxq	%r13,%r11
	adoxq	%rbp,%r12
	adcxq	%rbp,%r12

	movq	%r8,56(%rdi)
	movq	%r9,64(%rdi)

	mulxq	%r14,%r9,%rax
	movq	56(%rsi),%r8
	adcxq	%r10,%r9
	mulxq	%r15,%r10,%rbx
	adoxq	%rax,%r10
	adcxq	%r11,%r10
	mulxq	%r8,%r11,%rax
	movq	%r14,%rdx
	adoxq	%rbx,%r11
	adcxq	%r12,%r11

	adcxq	%rbp,%rax

	mulxq	%r15,%r14,%rbx
	mulxq	%r8,%r12,%r13
	movq	%r15,%rdx
	leaq	64(%rsi),%rsi
	adcxq	%r14,%r11
	adoxq	%rbx,%r12
	adcxq	%rax,%r12
	adoxq	%rbp,%r13

.byte	0x67,0x67
	mulxq	%r8,%r8,%r14
	adcxq	%r8,%r13
	adcxq	%rbp,%r14

	cmpq	8+8(%rsp),%rsi		# reached end of ap?
	je	.Lsqrx8x_outer_break

	negq	%rcx
	movq	$-8,%rcx
	movq	%rbp,%r15
	movq	64(%rdi),%r8
	adcxq	72(%rdi),%r9		# fold in previously stored partials
	adcxq	80(%rdi),%r10
	adcxq	88(%rdi),%r11
	adcq	96(%rdi),%r12
	adcq	104(%rdi),%r13
	adcq	112(%rdi),%r14
	adcq	120(%rdi),%r15
	leaq	(%rsi),%rbp
	leaq	128(%rdi),%rdi
	sbbq	%rax,%rax		# save top carry

	movq	-64(%rsi),%rdx
	movq	%rax,16+8(%rsp)
	movq	%rdi,24+8(%rsp)


	xorl	%eax,%eax		# clear CF/OF
	jmp	.Lsqrx8x_loop

.align	32
/* Cross products of the current 8-limb strip against the rest of ap. */
.Lsqrx8x_loop:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx encoding
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	movq	%rbx,(%rdi,%rcx,8)
	movl	$0,%ebx
	adcxq	%rax,%r13
	adoxq	%r15,%r14

.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx encoding
	movq	8(%rsi,%rcx,8),%rdx
	adcxq	%rax,%r14
	adoxq	%rbx,%r15
	adcxq	%rbx,%r15

.byte	0x67
	incq	%rcx			# negative index toward zero
	jnz	.Lsqrx8x_loop

	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	cmpq	8+8(%rsp),%rbp
	je	.Lsqrx8x_break

	subq	16+8(%rsp),%rbx		# reload saved carry into CF
.byte	0x66
	movq	-64(%rsi),%rdx
	adcxq	0(%rdi),%r8
	adcxq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
.byte	0x67
	sbbq	%rax,%rax
	xorl	%ebx,%ebx
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_loop

.align	32
/* Strip finished: propagate the final carry and restart the outer loop. */
.Lsqrx8x_break:
	xorq	%rbp,%rbp
	subq	16+8(%rsp),%rbx		# reload saved carry into CF
	adcxq	%rbp,%r8
	movq	24+8(%rsp),%rcx
	adcxq	%rbp,%r9
	movq	0(%rsi),%rdx
	adcq	$0,%r10
	movq	%r8,0(%rdi)
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	cmpq	%rcx,%rdi
	je	.Lsqrx8x_outer_loop

	movq	%r9,8(%rdi)		# spill window, reload at the saved cursor
	movq	8(%rcx),%r9
	movq	%r10,16(%rdi)
	movq	16(%rcx),%r10
	movq	%r11,24(%rdi)
	movq	24(%rcx),%r11
	movq	%r12,32(%rdi)
	movq	32(%rcx),%r12
	movq	%r13,40(%rdi)
	movq	40(%rcx),%r13
	movq	%r14,48(%rdi)
	movq	48(%rcx),%r14
	movq	%r15,56(%rdi)
	movq	56(%rcx),%r15
	movq	%rcx,%rdi
	jmp	.Lsqrx8x_outer_loop

.align	32
/*
 * All off-diagonal products accumulated: store the top window, then double
 * everything and add the diagonal squares a[i]^2 (mulx %rdx,%rdx squares
 * the limb currently in %rdx).
 */
.Lsqrx8x_outer_break:
	movq	%r9,72(%rdi)
.byte	102,72,15,126,217	# movq %xmm3,%rcx
	movq	%r10,80(%rdi)
	movq	%r11,88(%rdi)
	movq	%r12,96(%rdi)
	movq	%r13,104(%rdi)
	movq	%r14,112(%rdi)
	leaq	48+8(%rsp),%rdi
	movq	(%rsi,%rcx,1),%rdx

	movq	8(%rdi),%r11
	xorq	%r10,%r10
	movq	0+8(%rsp),%r9		# num (bytes)
	adoxq	%r11,%r11		# adox reg,reg doubles (shift-in of OF)
	movq	16(%rdi),%r12
	movq	24(%rdi),%r13


.align	32
.Lsqrx4x_shift_n_add:
	mulxq	%rdx,%rax,%rbx		# a[i]^2
	adoxq	%r12,%r12
	adcxq	%r10,%rax
.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# movq 8(%rsi,%rcx,1),%rdx
.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# movq 32(%rdi),%r10
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	40(%rdi),%r11
	movq	%rax,0(%rdi)
	movq	%rbx,8(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	movq	16(%rsi,%rcx,1),%rdx
	movq	48(%rdi),%r12
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	56(%rdi),%r13
	movq	%rax,16(%rdi)
	movq	%rbx,24(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r12,%r12
	adcxq	%r10,%rax
	movq	24(%rsi,%rcx,1),%rdx
	leaq	32(%rcx),%rcx
	movq	64(%rdi),%r10
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	72(%rdi),%r11
	movq	%rax,32(%rdi)
	movq	%rbx,40(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	jrcxz	.Lsqrx4x_shift_n_add_break	# %rcx hit zero: last group
.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# movq 0(%rsi,%rcx,1),%rdx
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	80(%rdi),%r12
	movq	88(%rdi),%r13
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
	nop
	jmp	.Lsqrx4x_shift_n_add

.align	32
.Lsqrx4x_shift_n_add_break:
	adcxq	%r13,%rbx
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
.byte	102,72,15,126,213	# movq %xmm2,%rbp — restore np
/*
 * Montgomery reduction of the 2*num-limb square: for each 8-limb window,
 * compute quotient digits m = tp[i]*n0 and subtract m*n via the dual
 * carry chains; tail loops handle the upper half.
 */
__bn_sqrx8x_reduction:
	xorl	%eax,%eax		# clear top carry and CF/OF
	movq	32+8(%rsp),%rbx		# n0
	movq	48+8(%rsp),%rdx		# tp[0]
	leaq	-64(%rbp,%r9,1),%rcx	# end of modulus

	movq	%rcx,0+8(%rsp)
	movq	%rdi,8+8(%rsp)

	leaq	48+8(%rsp),%rdi
	jmp	.Lsqrx8x_reduction_loop

.align	32
.Lsqrx8x_reduction_loop:
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	%rdx,%r8
	imulq	%rbx,%rdx		# m = tp[0] * n0
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,24+8(%rsp)		# stash top-most carry

	leaq	64(%rdi),%rdi
	xorq	%rsi,%rsi		# %rsi = 0; clears CF/OF
	movq	$-8,%rcx
	jmp	.Lsqrx8x_reduce

.align	32
/* One quotient digit per iteration: accumulate m*n[0..7] into the window. */
.Lsqrx8x_reduce:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rbx,%rax		# low limb cancels by construction
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rbx,%r9
	adcxq	%rbx,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx encoding
	movq	%rdx,%rax
	movq	%r8,%rdx
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	32+8(%rsp),%rbx,%rdx	# next m = new window low * n0
	movq	%rax,%rdx
	movq	%rax,64+48+8(%rsp,%rcx,8)	# record m for the tail pass

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	%rbx,%rdx
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	adcxq	%rsi,%r15

.byte	0x67,0x67,0x67
	incq	%rcx
	jnz	.Lsqrx8x_reduce

	movq	%rsi,%rax
	cmpq	0+8(%rsp),%rbp		# any modulus limbs left?
	jae	.Lsqrx8x_no_tail

	movq	48+8(%rsp),%rdx
	addq	0(%rdi),%r8		# fold next tp window in
	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	adcxq	8(%rdi),%r9
	adcxq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax		# save borrow/carry

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
/* Tail: multiply the recorded m digits against the remaining n limbs. */
.Lsqrx8x_tail:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx encoding
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	72+48+8(%rsp,%rcx,8),%rdx	# next recorded m digit
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	movq	%rbx,(%rdi,%rcx,8)
	movq	%r8,%rbx
	adcxq	%rsi,%r15

	incq	%rcx
	jnz	.Lsqrx8x_tail

	cmpq	0+8(%rsp),%rbp
	jae	.Lsqrx8x_tail_done

	subq	16+8(%rsp),%rsi		# reload saved carry into CF
	movq	48+8(%rsp),%rdx
	leaq	64(%rbp),%rbp
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax
	subq	$8,%rcx

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail_done:
	xorq	%rax,%rax
	addq	24+8(%rsp),%r8		# add back the stashed top carry
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	subq	16+8(%rsp),%rsi		# reload saved carry into CF
.Lsqrx8x_no_tail:
	adcq	0(%rdi),%r8
.byte	102,72,15,126,217	# movq %xmm3,%rcx
	adcq	8(%rdi),%r9
	movq	56(%rbp),%rsi
.byte	102,72,15,126,213	# movq %xmm2,%rbp — restore np
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax			# %rax = overall top carry

	movq	32+8(%rsp),%rbx		# n0
	movq	64(%rdi,%rcx,1),%rdx	# next window's tp[0]

	movq	%r8,0(%rdi)		# write back the reduced window
	leaq	64(%rdi),%r8
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	64(%rdi,%rcx,1),%rdi
	cmpq	8+8(%rsp),%r8
	jb	.Lsqrx8x_reduction_loop
	ret
.cfi_endproc
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
# __bn_postx4x_internal: final conditional subtraction for the MULX/ADX
# Montgomery paths.  Computes out = t - n if the reduction carried
# (all-ones mask in %rax after the neg below), else out = t, in constant
# time: each modulus word is complemented via ANDN against the mask and
# added with carry, which is a branch-free masked subtraction.
# NOTE(review): roles inferred from this excerpt -- confirm vs perlasm:
#   %rbp = modulus n[], %rdi = temporary t[], %rax = 0/1 borrow in,
#   %rcx = (negative) size, %xmm1 = output pointer.
.align	32
.type	__bn_postx4x_internal,@function
__bn_postx4x_internal:
.cfi_startproc
	movq	0(%rbp),%r12
	movq	%rcx,%r10		# preserve the raw count
	movq	%rcx,%r9
	negq	%rax			# 0/1 -> 0/~0 subtract-the-modulus mask
	sarq	$3+2,%rcx		# count of 4-qword groups (sign-preserving)
.byte	102,72,15,126,202		# hand-encoded: movq %xmm1,%rdx (output pointer)
.byte	102,72,15,126,206		# hand-encoded: movq %xmm1,%rsi
	decq	%r12			# n[0]-1, so ANDN below yields (-n[0] & mask)
	movq	8(%rbp),%r13
	xorq	%r8,%r8			# initial borrow = 0
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	movq	0(%rbp),%r12		# next group of 4 modulus words
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqrx4x_sub_entry:
	andnq	%rax,%r12,%r12		# r12 = ~n[i] & mask (0 when mask is 0)
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13
	andnq	%rax,%r14,%r14
	andnq	%rax,%r15,%r15

	negq	%r8			# reload the saved borrow into CF
	adcq	0(%rdi),%r12		# t[i] + (~n[i] & mask) + carry == masked t[i]-n[i]
	adcq	8(%rdi),%r13
	adcq	16(%rdi),%r14
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8			# save borrow for the next group
	movq	%r14,16(%rdx)
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx
	jnz	.Lsqrx4x_sub

	negq	%r9			# return the positive word count in %r9

	ret
.cfi_endproc
.size	__bn_postx4x_internal,.-__bn_postx4x_internal
.globl	bn_scatter5
.hidden bn_scatter5
.type	bn_scatter5,@function
.align	16
# bn_scatter5(inp=%rdi, num=%esi, table=%rdx, power=%rcx)
# (argument names inferred from register use -- confirm against the BN
# header).  Copies %esi qwords from inp into the table, starting at
# qword slot 'power', with a 256-byte stride between consecutive words;
# the matching constant-time bn_gather5 below reads the table back with
# a uniform access pattern.
bn_scatter5:
.cfi_startproc
_CET_ENDBR
	cmpl	$0,%esi
	jz	.Lscatter_epilogue	# num == 0: nothing to copy









	leaq	(%rdx,%rcx,8),%rdx	# start at table + power*8
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		# next word lands 256 bytes (32 slots) away
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	ret
.cfi_endproc
.size	bn_scatter5,.-bn_scatter5
3448
.globl	bn_gather5
.hidden bn_gather5
.type	bn_gather5,@function
.align	32
# bn_gather5(out=%rdi, num=%esi, table=%rdx, power=%ecx)
# (argument names inferred from register use -- confirm against the BN
# header).  Constant-time gather of the 'power'-th scattered entry:
# builds 16 SSE2 compare masks on the stack (one 128-bit row per pair of
# the 32 possible indices), then for every output word ANDs all 16 table
# rows with their masks and ORs them together, so the memory access
# pattern is independent of the secret index.
bn_gather5:
.cfi_startproc
.LSEH_begin_bn_gather5:
_CET_ENDBR

.byte	0x4c,0x8d,0x14,0x24		# hand-encoded: leaq (%rsp),%r10 (save original stack pointer)
.cfi_def_cfa_register	%r10
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	# hand-encoded: subq $0x108,%rsp (16 masks * 16B + slack)
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp		# movdqa below needs 16-byte alignment

	movd	%ecx,%xmm5
	movdqa	0(%rax),%xmm0		# {0,0,1,1}: first pair of indices
	movdqa	16(%rax),%xmm1		# {2,2,2,2}: per-row index increment
	leaq	128(%rdx),%r11		# bias pointers by 128 so offsets fit in a byte
	leaq	128(%rsp),%rax

	pshufd	$0,%xmm5,%xmm5		# broadcast the secret index to all 4 dwords
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# mask row 0: dword lanes == power ? ~0 : 0
	movdqa	%xmm4,%xmm3

	# The pattern below repeats 16 times: advance the index vector by 2
	# (paddd %xmm4), compare against the broadcast index, and store the
	# resulting mask row to the stack at -128(%rax)..112(%rax).
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)		# last of the 16 mask rows
	jmp	.Lgather

# One output qword per iteration: read ALL 16 table rows (256 bytes),
# AND each with its mask row, and OR everything into %xmm4.
.align	32
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	leaq	256(%r11),%r11		# advance to the next scattered word

	pshufd	$0x4e,%xmm4,%xmm0	# swap qwords and OR: selected qword ends up in the low half
	por	%xmm4,%xmm0
	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp		# restore the saved stack pointer
.cfi_def_cfa_register	%rsp
	ret
.LSEH_end_bn_gather5:
.cfi_endproc
.size	bn_gather5,.-bn_gather5
.section	.rodata
.align	64
# .Linc: SSE2 index constants for the mask generation in bn_gather5 and
# the mul_mont gather above: the first vector seeds the dword index
# pairs {0,0,1,1}, the second {2,2,2,2} is the per-row increment.
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
# ASCII banner: "Montgomery Multiplication with scatter/gather for
# x86_64, CRYPTOGAMS by <appro@openssl.org>" (NUL-terminated).
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3625.text
3626#endif
3627