// xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/gen/bcm/x86_64-mont5-linux.S (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
.text

.extern	OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

12.globl	bn_mul_mont_gather5
13.hidden bn_mul_mont_gather5
14.type	bn_mul_mont_gather5,@function
15.align	64
16bn_mul_mont_gather5:
17.cfi_startproc
18_CET_ENDBR
19	movl	%r9d,%r9d
20	movq	%rsp,%rax
21.cfi_def_cfa_register	%rax
22	testl	$7,%r9d
23	jnz	.Lmul_enter
24	leaq	OPENSSL_ia32cap_P(%rip),%r11
25	movl	8(%r11),%r11d
26	jmp	.Lmul4x_enter
27
28.align	16
29.Lmul_enter:
30	movd	8(%rsp),%xmm5
31	pushq	%rbx
32.cfi_offset	%rbx,-16
33	pushq	%rbp
34.cfi_offset	%rbp,-24
35	pushq	%r12
36.cfi_offset	%r12,-32
37	pushq	%r13
38.cfi_offset	%r13,-40
39	pushq	%r14
40.cfi_offset	%r14,-48
41	pushq	%r15
42.cfi_offset	%r15,-56
43
44	negq	%r9
45	movq	%rsp,%r11
46	leaq	-280(%rsp,%r9,8),%r10
47	negq	%r9
48	andq	$-1024,%r10
49
50
51
52
53
54
55
56
57
58	subq	%r10,%r11
59	andq	$-4096,%r11
60	leaq	(%r10,%r11,1),%rsp
61	movq	(%rsp),%r11
62	cmpq	%r10,%rsp
63	ja	.Lmul_page_walk
64	jmp	.Lmul_page_walk_done
65
66.Lmul_page_walk:
67	leaq	-4096(%rsp),%rsp
68	movq	(%rsp),%r11
69	cmpq	%r10,%rsp
70	ja	.Lmul_page_walk
71.Lmul_page_walk_done:
72
73	leaq	.Linc(%rip),%r10
74	movq	%rax,8(%rsp,%r9,8)
75.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
76.Lmul_body:
77
78	leaq	128(%rdx),%r12
79	movdqa	0(%r10),%xmm0
80	movdqa	16(%r10),%xmm1
81	leaq	24-112(%rsp,%r9,8),%r10
82	andq	$-16,%r10
83
84	pshufd	$0,%xmm5,%xmm5
85	movdqa	%xmm1,%xmm4
86	movdqa	%xmm1,%xmm2
87	paddd	%xmm0,%xmm1
88	pcmpeqd	%xmm5,%xmm0
89.byte	0x67
90	movdqa	%xmm4,%xmm3
91	paddd	%xmm1,%xmm2
92	pcmpeqd	%xmm5,%xmm1
93	movdqa	%xmm0,112(%r10)
94	movdqa	%xmm4,%xmm0
95
96	paddd	%xmm2,%xmm3
97	pcmpeqd	%xmm5,%xmm2
98	movdqa	%xmm1,128(%r10)
99	movdqa	%xmm4,%xmm1
100
101	paddd	%xmm3,%xmm0
102	pcmpeqd	%xmm5,%xmm3
103	movdqa	%xmm2,144(%r10)
104	movdqa	%xmm4,%xmm2
105
106	paddd	%xmm0,%xmm1
107	pcmpeqd	%xmm5,%xmm0
108	movdqa	%xmm3,160(%r10)
109	movdqa	%xmm4,%xmm3
110	paddd	%xmm1,%xmm2
111	pcmpeqd	%xmm5,%xmm1
112	movdqa	%xmm0,176(%r10)
113	movdqa	%xmm4,%xmm0
114
115	paddd	%xmm2,%xmm3
116	pcmpeqd	%xmm5,%xmm2
117	movdqa	%xmm1,192(%r10)
118	movdqa	%xmm4,%xmm1
119
120	paddd	%xmm3,%xmm0
121	pcmpeqd	%xmm5,%xmm3
122	movdqa	%xmm2,208(%r10)
123	movdqa	%xmm4,%xmm2
124
125	paddd	%xmm0,%xmm1
126	pcmpeqd	%xmm5,%xmm0
127	movdqa	%xmm3,224(%r10)
128	movdqa	%xmm4,%xmm3
129	paddd	%xmm1,%xmm2
130	pcmpeqd	%xmm5,%xmm1
131	movdqa	%xmm0,240(%r10)
132	movdqa	%xmm4,%xmm0
133
134	paddd	%xmm2,%xmm3
135	pcmpeqd	%xmm5,%xmm2
136	movdqa	%xmm1,256(%r10)
137	movdqa	%xmm4,%xmm1
138
139	paddd	%xmm3,%xmm0
140	pcmpeqd	%xmm5,%xmm3
141	movdqa	%xmm2,272(%r10)
142	movdqa	%xmm4,%xmm2
143
144	paddd	%xmm0,%xmm1
145	pcmpeqd	%xmm5,%xmm0
146	movdqa	%xmm3,288(%r10)
147	movdqa	%xmm4,%xmm3
148	paddd	%xmm1,%xmm2
149	pcmpeqd	%xmm5,%xmm1
150	movdqa	%xmm0,304(%r10)
151
152	paddd	%xmm2,%xmm3
153.byte	0x67
154	pcmpeqd	%xmm5,%xmm2
155	movdqa	%xmm1,320(%r10)
156
157	pcmpeqd	%xmm5,%xmm3
158	movdqa	%xmm2,336(%r10)
159	pand	64(%r12),%xmm0
160
161	pand	80(%r12),%xmm1
162	pand	96(%r12),%xmm2
163	movdqa	%xmm3,352(%r10)
164	pand	112(%r12),%xmm3
165	por	%xmm2,%xmm0
166	por	%xmm3,%xmm1
167	movdqa	-128(%r12),%xmm4
168	movdqa	-112(%r12),%xmm5
169	movdqa	-96(%r12),%xmm2
170	pand	112(%r10),%xmm4
171	movdqa	-80(%r12),%xmm3
172	pand	128(%r10),%xmm5
173	por	%xmm4,%xmm0
174	pand	144(%r10),%xmm2
175	por	%xmm5,%xmm1
176	pand	160(%r10),%xmm3
177	por	%xmm2,%xmm0
178	por	%xmm3,%xmm1
179	movdqa	-64(%r12),%xmm4
180	movdqa	-48(%r12),%xmm5
181	movdqa	-32(%r12),%xmm2
182	pand	176(%r10),%xmm4
183	movdqa	-16(%r12),%xmm3
184	pand	192(%r10),%xmm5
185	por	%xmm4,%xmm0
186	pand	208(%r10),%xmm2
187	por	%xmm5,%xmm1
188	pand	224(%r10),%xmm3
189	por	%xmm2,%xmm0
190	por	%xmm3,%xmm1
191	movdqa	0(%r12),%xmm4
192	movdqa	16(%r12),%xmm5
193	movdqa	32(%r12),%xmm2
194	pand	240(%r10),%xmm4
195	movdqa	48(%r12),%xmm3
196	pand	256(%r10),%xmm5
197	por	%xmm4,%xmm0
198	pand	272(%r10),%xmm2
199	por	%xmm5,%xmm1
200	pand	288(%r10),%xmm3
201	por	%xmm2,%xmm0
202	por	%xmm3,%xmm1
203	por	%xmm1,%xmm0
204
205	pshufd	$0x4e,%xmm0,%xmm1
206	por	%xmm1,%xmm0
207	leaq	256(%r12),%r12
208.byte	102,72,15,126,195
209
210	movq	(%r8),%r8
211	movq	(%rsi),%rax
212
213	xorq	%r14,%r14
214	xorq	%r15,%r15
215
216	movq	%r8,%rbp
217	mulq	%rbx
218	movq	%rax,%r10
219	movq	(%rcx),%rax
220
221	imulq	%r10,%rbp
222	movq	%rdx,%r11
223
224	mulq	%rbp
225	addq	%rax,%r10
226	movq	8(%rsi),%rax
227	adcq	$0,%rdx
228	movq	%rdx,%r13
229
230	leaq	1(%r15),%r15
231	jmp	.L1st_enter
232
233.align	16
234.L1st:
235	addq	%rax,%r13
236	movq	(%rsi,%r15,8),%rax
237	adcq	$0,%rdx
238	addq	%r11,%r13
239	movq	%r10,%r11
240	adcq	$0,%rdx
241	movq	%r13,-16(%rsp,%r15,8)
242	movq	%rdx,%r13
243
244.L1st_enter:
245	mulq	%rbx
246	addq	%rax,%r11
247	movq	(%rcx,%r15,8),%rax
248	adcq	$0,%rdx
249	leaq	1(%r15),%r15
250	movq	%rdx,%r10
251
252	mulq	%rbp
253	cmpq	%r9,%r15
254	jne	.L1st
255
256
257	addq	%rax,%r13
258	adcq	$0,%rdx
259	addq	%r11,%r13
260	adcq	$0,%rdx
261	movq	%r13,-16(%rsp,%r9,8)
262	movq	%rdx,%r13
263	movq	%r10,%r11
264
265	xorq	%rdx,%rdx
266	addq	%r11,%r13
267	adcq	$0,%rdx
268	movq	%r13,-8(%rsp,%r9,8)
269	movq	%rdx,(%rsp,%r9,8)
270
271	leaq	1(%r14),%r14
272	jmp	.Louter
273.align	16
274.Louter:
275	leaq	24+128(%rsp,%r9,8),%rdx
276	andq	$-16,%rdx
277	pxor	%xmm4,%xmm4
278	pxor	%xmm5,%xmm5
279	movdqa	-128(%r12),%xmm0
280	movdqa	-112(%r12),%xmm1
281	movdqa	-96(%r12),%xmm2
282	movdqa	-80(%r12),%xmm3
283	pand	-128(%rdx),%xmm0
284	pand	-112(%rdx),%xmm1
285	por	%xmm0,%xmm4
286	pand	-96(%rdx),%xmm2
287	por	%xmm1,%xmm5
288	pand	-80(%rdx),%xmm3
289	por	%xmm2,%xmm4
290	por	%xmm3,%xmm5
291	movdqa	-64(%r12),%xmm0
292	movdqa	-48(%r12),%xmm1
293	movdqa	-32(%r12),%xmm2
294	movdqa	-16(%r12),%xmm3
295	pand	-64(%rdx),%xmm0
296	pand	-48(%rdx),%xmm1
297	por	%xmm0,%xmm4
298	pand	-32(%rdx),%xmm2
299	por	%xmm1,%xmm5
300	pand	-16(%rdx),%xmm3
301	por	%xmm2,%xmm4
302	por	%xmm3,%xmm5
303	movdqa	0(%r12),%xmm0
304	movdqa	16(%r12),%xmm1
305	movdqa	32(%r12),%xmm2
306	movdqa	48(%r12),%xmm3
307	pand	0(%rdx),%xmm0
308	pand	16(%rdx),%xmm1
309	por	%xmm0,%xmm4
310	pand	32(%rdx),%xmm2
311	por	%xmm1,%xmm5
312	pand	48(%rdx),%xmm3
313	por	%xmm2,%xmm4
314	por	%xmm3,%xmm5
315	movdqa	64(%r12),%xmm0
316	movdqa	80(%r12),%xmm1
317	movdqa	96(%r12),%xmm2
318	movdqa	112(%r12),%xmm3
319	pand	64(%rdx),%xmm0
320	pand	80(%rdx),%xmm1
321	por	%xmm0,%xmm4
322	pand	96(%rdx),%xmm2
323	por	%xmm1,%xmm5
324	pand	112(%rdx),%xmm3
325	por	%xmm2,%xmm4
326	por	%xmm3,%xmm5
327	por	%xmm5,%xmm4
328
329	pshufd	$0x4e,%xmm4,%xmm0
330	por	%xmm4,%xmm0
331	leaq	256(%r12),%r12
332
333	movq	(%rsi),%rax
334.byte	102,72,15,126,195
335
336	xorq	%r15,%r15
337	movq	%r8,%rbp
338	movq	(%rsp),%r10
339
340	mulq	%rbx
341	addq	%rax,%r10
342	movq	(%rcx),%rax
343	adcq	$0,%rdx
344
345	imulq	%r10,%rbp
346	movq	%rdx,%r11
347
348	mulq	%rbp
349	addq	%rax,%r10
350	movq	8(%rsi),%rax
351	adcq	$0,%rdx
352	movq	8(%rsp),%r10
353	movq	%rdx,%r13
354
355	leaq	1(%r15),%r15
356	jmp	.Linner_enter
357
358.align	16
359.Linner:
360	addq	%rax,%r13
361	movq	(%rsi,%r15,8),%rax
362	adcq	$0,%rdx
363	addq	%r10,%r13
364	movq	(%rsp,%r15,8),%r10
365	adcq	$0,%rdx
366	movq	%r13,-16(%rsp,%r15,8)
367	movq	%rdx,%r13
368
369.Linner_enter:
370	mulq	%rbx
371	addq	%rax,%r11
372	movq	(%rcx,%r15,8),%rax
373	adcq	$0,%rdx
374	addq	%r11,%r10
375	movq	%rdx,%r11
376	adcq	$0,%r11
377	leaq	1(%r15),%r15
378
379	mulq	%rbp
380	cmpq	%r9,%r15
381	jne	.Linner
382
383	addq	%rax,%r13
384	adcq	$0,%rdx
385	addq	%r10,%r13
386	movq	(%rsp,%r9,8),%r10
387	adcq	$0,%rdx
388	movq	%r13,-16(%rsp,%r9,8)
389	movq	%rdx,%r13
390
391	xorq	%rdx,%rdx
392	addq	%r11,%r13
393	adcq	$0,%rdx
394	addq	%r10,%r13
395	adcq	$0,%rdx
396	movq	%r13,-8(%rsp,%r9,8)
397	movq	%rdx,(%rsp,%r9,8)
398
399	leaq	1(%r14),%r14
400	cmpq	%r9,%r14
401	jb	.Louter
402
403	xorq	%r14,%r14
404	movq	(%rsp),%rax
405	leaq	(%rsp),%rsi
406	movq	%r9,%r15
407	jmp	.Lsub
408.align	16
409.Lsub:	sbbq	(%rcx,%r14,8),%rax
410	movq	%rax,(%rdi,%r14,8)
411	movq	8(%rsi,%r14,8),%rax
412	leaq	1(%r14),%r14
413	decq	%r15
414	jnz	.Lsub
415
416	sbbq	$0,%rax
417	movq	$-1,%rbx
418	xorq	%rax,%rbx
419	xorq	%r14,%r14
420	movq	%r9,%r15
421
422.Lcopy:
423	movq	(%rdi,%r14,8),%rcx
424	movq	(%rsp,%r14,8),%rdx
425	andq	%rbx,%rcx
426	andq	%rax,%rdx
427	movq	%r14,(%rsp,%r14,8)
428	orq	%rcx,%rdx
429	movq	%rdx,(%rdi,%r14,8)
430	leaq	1(%r14),%r14
431	subq	$1,%r15
432	jnz	.Lcopy
433
434	movq	8(%rsp,%r9,8),%rsi
435.cfi_def_cfa	%rsi,8
436	movq	$1,%rax
437
438	movq	-48(%rsi),%r15
439.cfi_restore	%r15
440	movq	-40(%rsi),%r14
441.cfi_restore	%r14
442	movq	-32(%rsi),%r13
443.cfi_restore	%r13
444	movq	-24(%rsi),%r12
445.cfi_restore	%r12
446	movq	-16(%rsi),%rbp
447.cfi_restore	%rbp
448	movq	-8(%rsi),%rbx
449.cfi_restore	%rbx
450	leaq	(%rsi),%rsp
451.cfi_def_cfa_register	%rsp
452.Lmul_epilogue:
453	ret
454.cfi_endproc
455.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
456.type	bn_mul4x_mont_gather5,@function
457.align	32
458bn_mul4x_mont_gather5:
459.cfi_startproc
460.byte	0x67
461	movq	%rsp,%rax
462.cfi_def_cfa_register	%rax
463.Lmul4x_enter:
464	andl	$0x80108,%r11d
465	cmpl	$0x80108,%r11d
466	je	.Lmulx4x_enter
467	pushq	%rbx
468.cfi_offset	%rbx,-16
469	pushq	%rbp
470.cfi_offset	%rbp,-24
471	pushq	%r12
472.cfi_offset	%r12,-32
473	pushq	%r13
474.cfi_offset	%r13,-40
475	pushq	%r14
476.cfi_offset	%r14,-48
477	pushq	%r15
478.cfi_offset	%r15,-56
479.Lmul4x_prologue:
480
481.byte	0x67
482	shll	$3,%r9d
483	leaq	(%r9,%r9,2),%r10
484	negq	%r9
485
486
487
488
489
490
491
492
493
494
495	leaq	-320(%rsp,%r9,2),%r11
496	movq	%rsp,%rbp
497	subq	%rdi,%r11
498	andq	$4095,%r11
499	cmpq	%r11,%r10
500	jb	.Lmul4xsp_alt
501	subq	%r11,%rbp
502	leaq	-320(%rbp,%r9,2),%rbp
503	jmp	.Lmul4xsp_done
504
505.align	32
506.Lmul4xsp_alt:
507	leaq	4096-320(,%r9,2),%r10
508	leaq	-320(%rbp,%r9,2),%rbp
509	subq	%r10,%r11
510	movq	$0,%r10
511	cmovcq	%r10,%r11
512	subq	%r11,%rbp
513.Lmul4xsp_done:
514	andq	$-64,%rbp
515	movq	%rsp,%r11
516	subq	%rbp,%r11
517	andq	$-4096,%r11
518	leaq	(%r11,%rbp,1),%rsp
519	movq	(%rsp),%r10
520	cmpq	%rbp,%rsp
521	ja	.Lmul4x_page_walk
522	jmp	.Lmul4x_page_walk_done
523
524.Lmul4x_page_walk:
525	leaq	-4096(%rsp),%rsp
526	movq	(%rsp),%r10
527	cmpq	%rbp,%rsp
528	ja	.Lmul4x_page_walk
529.Lmul4x_page_walk_done:
530
531	negq	%r9
532
533	movq	%rax,40(%rsp)
534.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
535.Lmul4x_body:
536
537	call	mul4x_internal
538
539	movq	40(%rsp),%rsi
540.cfi_def_cfa	%rsi,8
541	movq	$1,%rax
542
543	movq	-48(%rsi),%r15
544.cfi_restore	%r15
545	movq	-40(%rsi),%r14
546.cfi_restore	%r14
547	movq	-32(%rsi),%r13
548.cfi_restore	%r13
549	movq	-24(%rsi),%r12
550.cfi_restore	%r12
551	movq	-16(%rsi),%rbp
552.cfi_restore	%rbp
553	movq	-8(%rsi),%rbx
554.cfi_restore	%rbx
555	leaq	(%rsi),%rsp
556.cfi_def_cfa_register	%rsp
557.Lmul4x_epilogue:
558	ret
559.cfi_endproc
560.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
561
562.type	mul4x_internal,@function
563.align	32
564mul4x_internal:
565.cfi_startproc
566	shlq	$5,%r9
567	movd	8(%rax),%xmm5
568	leaq	.Linc(%rip),%rax
569	leaq	128(%rdx,%r9,1),%r13
570	shrq	$5,%r9
571	movdqa	0(%rax),%xmm0
572	movdqa	16(%rax),%xmm1
573	leaq	88-112(%rsp,%r9,1),%r10
574	leaq	128(%rdx),%r12
575
576	pshufd	$0,%xmm5,%xmm5
577	movdqa	%xmm1,%xmm4
578.byte	0x67,0x67
579	movdqa	%xmm1,%xmm2
580	paddd	%xmm0,%xmm1
581	pcmpeqd	%xmm5,%xmm0
582.byte	0x67
583	movdqa	%xmm4,%xmm3
584	paddd	%xmm1,%xmm2
585	pcmpeqd	%xmm5,%xmm1
586	movdqa	%xmm0,112(%r10)
587	movdqa	%xmm4,%xmm0
588
589	paddd	%xmm2,%xmm3
590	pcmpeqd	%xmm5,%xmm2
591	movdqa	%xmm1,128(%r10)
592	movdqa	%xmm4,%xmm1
593
594	paddd	%xmm3,%xmm0
595	pcmpeqd	%xmm5,%xmm3
596	movdqa	%xmm2,144(%r10)
597	movdqa	%xmm4,%xmm2
598
599	paddd	%xmm0,%xmm1
600	pcmpeqd	%xmm5,%xmm0
601	movdqa	%xmm3,160(%r10)
602	movdqa	%xmm4,%xmm3
603	paddd	%xmm1,%xmm2
604	pcmpeqd	%xmm5,%xmm1
605	movdqa	%xmm0,176(%r10)
606	movdqa	%xmm4,%xmm0
607
608	paddd	%xmm2,%xmm3
609	pcmpeqd	%xmm5,%xmm2
610	movdqa	%xmm1,192(%r10)
611	movdqa	%xmm4,%xmm1
612
613	paddd	%xmm3,%xmm0
614	pcmpeqd	%xmm5,%xmm3
615	movdqa	%xmm2,208(%r10)
616	movdqa	%xmm4,%xmm2
617
618	paddd	%xmm0,%xmm1
619	pcmpeqd	%xmm5,%xmm0
620	movdqa	%xmm3,224(%r10)
621	movdqa	%xmm4,%xmm3
622	paddd	%xmm1,%xmm2
623	pcmpeqd	%xmm5,%xmm1
624	movdqa	%xmm0,240(%r10)
625	movdqa	%xmm4,%xmm0
626
627	paddd	%xmm2,%xmm3
628	pcmpeqd	%xmm5,%xmm2
629	movdqa	%xmm1,256(%r10)
630	movdqa	%xmm4,%xmm1
631
632	paddd	%xmm3,%xmm0
633	pcmpeqd	%xmm5,%xmm3
634	movdqa	%xmm2,272(%r10)
635	movdqa	%xmm4,%xmm2
636
637	paddd	%xmm0,%xmm1
638	pcmpeqd	%xmm5,%xmm0
639	movdqa	%xmm3,288(%r10)
640	movdqa	%xmm4,%xmm3
641	paddd	%xmm1,%xmm2
642	pcmpeqd	%xmm5,%xmm1
643	movdqa	%xmm0,304(%r10)
644
645	paddd	%xmm2,%xmm3
646.byte	0x67
647	pcmpeqd	%xmm5,%xmm2
648	movdqa	%xmm1,320(%r10)
649
650	pcmpeqd	%xmm5,%xmm3
651	movdqa	%xmm2,336(%r10)
652	pand	64(%r12),%xmm0
653
654	pand	80(%r12),%xmm1
655	pand	96(%r12),%xmm2
656	movdqa	%xmm3,352(%r10)
657	pand	112(%r12),%xmm3
658	por	%xmm2,%xmm0
659	por	%xmm3,%xmm1
660	movdqa	-128(%r12),%xmm4
661	movdqa	-112(%r12),%xmm5
662	movdqa	-96(%r12),%xmm2
663	pand	112(%r10),%xmm4
664	movdqa	-80(%r12),%xmm3
665	pand	128(%r10),%xmm5
666	por	%xmm4,%xmm0
667	pand	144(%r10),%xmm2
668	por	%xmm5,%xmm1
669	pand	160(%r10),%xmm3
670	por	%xmm2,%xmm0
671	por	%xmm3,%xmm1
672	movdqa	-64(%r12),%xmm4
673	movdqa	-48(%r12),%xmm5
674	movdqa	-32(%r12),%xmm2
675	pand	176(%r10),%xmm4
676	movdqa	-16(%r12),%xmm3
677	pand	192(%r10),%xmm5
678	por	%xmm4,%xmm0
679	pand	208(%r10),%xmm2
680	por	%xmm5,%xmm1
681	pand	224(%r10),%xmm3
682	por	%xmm2,%xmm0
683	por	%xmm3,%xmm1
684	movdqa	0(%r12),%xmm4
685	movdqa	16(%r12),%xmm5
686	movdqa	32(%r12),%xmm2
687	pand	240(%r10),%xmm4
688	movdqa	48(%r12),%xmm3
689	pand	256(%r10),%xmm5
690	por	%xmm4,%xmm0
691	pand	272(%r10),%xmm2
692	por	%xmm5,%xmm1
693	pand	288(%r10),%xmm3
694	por	%xmm2,%xmm0
695	por	%xmm3,%xmm1
696	por	%xmm1,%xmm0
697
698	pshufd	$0x4e,%xmm0,%xmm1
699	por	%xmm1,%xmm0
700	leaq	256(%r12),%r12
701.byte	102,72,15,126,195
702
703	movq	%r13,16+8(%rsp)
704	movq	%rdi,56+8(%rsp)
705
706	movq	(%r8),%r8
707	movq	(%rsi),%rax
708	leaq	(%rsi,%r9,1),%rsi
709	negq	%r9
710
711	movq	%r8,%rbp
712	mulq	%rbx
713	movq	%rax,%r10
714	movq	(%rcx),%rax
715
716	imulq	%r10,%rbp
717	leaq	64+8(%rsp),%r14
718	movq	%rdx,%r11
719
720	mulq	%rbp
721	addq	%rax,%r10
722	movq	8(%rsi,%r9,1),%rax
723	adcq	$0,%rdx
724	movq	%rdx,%rdi
725
726	mulq	%rbx
727	addq	%rax,%r11
728	movq	8(%rcx),%rax
729	adcq	$0,%rdx
730	movq	%rdx,%r10
731
732	mulq	%rbp
733	addq	%rax,%rdi
734	movq	16(%rsi,%r9,1),%rax
735	adcq	$0,%rdx
736	addq	%r11,%rdi
737	leaq	32(%r9),%r15
738	leaq	32(%rcx),%rcx
739	adcq	$0,%rdx
740	movq	%rdi,(%r14)
741	movq	%rdx,%r13
742	jmp	.L1st4x
743
744.align	32
745.L1st4x:
746	mulq	%rbx
747	addq	%rax,%r10
748	movq	-16(%rcx),%rax
749	leaq	32(%r14),%r14
750	adcq	$0,%rdx
751	movq	%rdx,%r11
752
753	mulq	%rbp
754	addq	%rax,%r13
755	movq	-8(%rsi,%r15,1),%rax
756	adcq	$0,%rdx
757	addq	%r10,%r13
758	adcq	$0,%rdx
759	movq	%r13,-24(%r14)
760	movq	%rdx,%rdi
761
762	mulq	%rbx
763	addq	%rax,%r11
764	movq	-8(%rcx),%rax
765	adcq	$0,%rdx
766	movq	%rdx,%r10
767
768	mulq	%rbp
769	addq	%rax,%rdi
770	movq	(%rsi,%r15,1),%rax
771	adcq	$0,%rdx
772	addq	%r11,%rdi
773	adcq	$0,%rdx
774	movq	%rdi,-16(%r14)
775	movq	%rdx,%r13
776
777	mulq	%rbx
778	addq	%rax,%r10
779	movq	0(%rcx),%rax
780	adcq	$0,%rdx
781	movq	%rdx,%r11
782
783	mulq	%rbp
784	addq	%rax,%r13
785	movq	8(%rsi,%r15,1),%rax
786	adcq	$0,%rdx
787	addq	%r10,%r13
788	adcq	$0,%rdx
789	movq	%r13,-8(%r14)
790	movq	%rdx,%rdi
791
792	mulq	%rbx
793	addq	%rax,%r11
794	movq	8(%rcx),%rax
795	adcq	$0,%rdx
796	movq	%rdx,%r10
797
798	mulq	%rbp
799	addq	%rax,%rdi
800	movq	16(%rsi,%r15,1),%rax
801	adcq	$0,%rdx
802	addq	%r11,%rdi
803	leaq	32(%rcx),%rcx
804	adcq	$0,%rdx
805	movq	%rdi,(%r14)
806	movq	%rdx,%r13
807
808	addq	$32,%r15
809	jnz	.L1st4x
810
811	mulq	%rbx
812	addq	%rax,%r10
813	movq	-16(%rcx),%rax
814	leaq	32(%r14),%r14
815	adcq	$0,%rdx
816	movq	%rdx,%r11
817
818	mulq	%rbp
819	addq	%rax,%r13
820	movq	-8(%rsi),%rax
821	adcq	$0,%rdx
822	addq	%r10,%r13
823	adcq	$0,%rdx
824	movq	%r13,-24(%r14)
825	movq	%rdx,%rdi
826
827	mulq	%rbx
828	addq	%rax,%r11
829	movq	-8(%rcx),%rax
830	adcq	$0,%rdx
831	movq	%rdx,%r10
832
833	mulq	%rbp
834	addq	%rax,%rdi
835	movq	(%rsi,%r9,1),%rax
836	adcq	$0,%rdx
837	addq	%r11,%rdi
838	adcq	$0,%rdx
839	movq	%rdi,-16(%r14)
840	movq	%rdx,%r13
841
842	leaq	(%rcx,%r9,1),%rcx
843
844	xorq	%rdi,%rdi
845	addq	%r10,%r13
846	adcq	$0,%rdi
847	movq	%r13,-8(%r14)
848
849	jmp	.Louter4x
850
851.align	32
852.Louter4x:
853	leaq	16+128(%r14),%rdx
854	pxor	%xmm4,%xmm4
855	pxor	%xmm5,%xmm5
856	movdqa	-128(%r12),%xmm0
857	movdqa	-112(%r12),%xmm1
858	movdqa	-96(%r12),%xmm2
859	movdqa	-80(%r12),%xmm3
860	pand	-128(%rdx),%xmm0
861	pand	-112(%rdx),%xmm1
862	por	%xmm0,%xmm4
863	pand	-96(%rdx),%xmm2
864	por	%xmm1,%xmm5
865	pand	-80(%rdx),%xmm3
866	por	%xmm2,%xmm4
867	por	%xmm3,%xmm5
868	movdqa	-64(%r12),%xmm0
869	movdqa	-48(%r12),%xmm1
870	movdqa	-32(%r12),%xmm2
871	movdqa	-16(%r12),%xmm3
872	pand	-64(%rdx),%xmm0
873	pand	-48(%rdx),%xmm1
874	por	%xmm0,%xmm4
875	pand	-32(%rdx),%xmm2
876	por	%xmm1,%xmm5
877	pand	-16(%rdx),%xmm3
878	por	%xmm2,%xmm4
879	por	%xmm3,%xmm5
880	movdqa	0(%r12),%xmm0
881	movdqa	16(%r12),%xmm1
882	movdqa	32(%r12),%xmm2
883	movdqa	48(%r12),%xmm3
884	pand	0(%rdx),%xmm0
885	pand	16(%rdx),%xmm1
886	por	%xmm0,%xmm4
887	pand	32(%rdx),%xmm2
888	por	%xmm1,%xmm5
889	pand	48(%rdx),%xmm3
890	por	%xmm2,%xmm4
891	por	%xmm3,%xmm5
892	movdqa	64(%r12),%xmm0
893	movdqa	80(%r12),%xmm1
894	movdqa	96(%r12),%xmm2
895	movdqa	112(%r12),%xmm3
896	pand	64(%rdx),%xmm0
897	pand	80(%rdx),%xmm1
898	por	%xmm0,%xmm4
899	pand	96(%rdx),%xmm2
900	por	%xmm1,%xmm5
901	pand	112(%rdx),%xmm3
902	por	%xmm2,%xmm4
903	por	%xmm3,%xmm5
904	por	%xmm5,%xmm4
905
906	pshufd	$0x4e,%xmm4,%xmm0
907	por	%xmm4,%xmm0
908	leaq	256(%r12),%r12
909.byte	102,72,15,126,195
910
911	movq	(%r14,%r9,1),%r10
912	movq	%r8,%rbp
913	mulq	%rbx
914	addq	%rax,%r10
915	movq	(%rcx),%rax
916	adcq	$0,%rdx
917
918	imulq	%r10,%rbp
919	movq	%rdx,%r11
920	movq	%rdi,(%r14)
921
922	leaq	(%r14,%r9,1),%r14
923
924	mulq	%rbp
925	addq	%rax,%r10
926	movq	8(%rsi,%r9,1),%rax
927	adcq	$0,%rdx
928	movq	%rdx,%rdi
929
930	mulq	%rbx
931	addq	%rax,%r11
932	movq	8(%rcx),%rax
933	adcq	$0,%rdx
934	addq	8(%r14),%r11
935	adcq	$0,%rdx
936	movq	%rdx,%r10
937
938	mulq	%rbp
939	addq	%rax,%rdi
940	movq	16(%rsi,%r9,1),%rax
941	adcq	$0,%rdx
942	addq	%r11,%rdi
943	leaq	32(%r9),%r15
944	leaq	32(%rcx),%rcx
945	adcq	$0,%rdx
946	movq	%rdx,%r13
947	jmp	.Linner4x
948
949.align	32
950.Linner4x:
951	mulq	%rbx
952	addq	%rax,%r10
953	movq	-16(%rcx),%rax
954	adcq	$0,%rdx
955	addq	16(%r14),%r10
956	leaq	32(%r14),%r14
957	adcq	$0,%rdx
958	movq	%rdx,%r11
959
960	mulq	%rbp
961	addq	%rax,%r13
962	movq	-8(%rsi,%r15,1),%rax
963	adcq	$0,%rdx
964	addq	%r10,%r13
965	adcq	$0,%rdx
966	movq	%rdi,-32(%r14)
967	movq	%rdx,%rdi
968
969	mulq	%rbx
970	addq	%rax,%r11
971	movq	-8(%rcx),%rax
972	adcq	$0,%rdx
973	addq	-8(%r14),%r11
974	adcq	$0,%rdx
975	movq	%rdx,%r10
976
977	mulq	%rbp
978	addq	%rax,%rdi
979	movq	(%rsi,%r15,1),%rax
980	adcq	$0,%rdx
981	addq	%r11,%rdi
982	adcq	$0,%rdx
983	movq	%r13,-24(%r14)
984	movq	%rdx,%r13
985
986	mulq	%rbx
987	addq	%rax,%r10
988	movq	0(%rcx),%rax
989	adcq	$0,%rdx
990	addq	(%r14),%r10
991	adcq	$0,%rdx
992	movq	%rdx,%r11
993
994	mulq	%rbp
995	addq	%rax,%r13
996	movq	8(%rsi,%r15,1),%rax
997	adcq	$0,%rdx
998	addq	%r10,%r13
999	adcq	$0,%rdx
1000	movq	%rdi,-16(%r14)
1001	movq	%rdx,%rdi
1002
1003	mulq	%rbx
1004	addq	%rax,%r11
1005	movq	8(%rcx),%rax
1006	adcq	$0,%rdx
1007	addq	8(%r14),%r11
1008	adcq	$0,%rdx
1009	movq	%rdx,%r10
1010
1011	mulq	%rbp
1012	addq	%rax,%rdi
1013	movq	16(%rsi,%r15,1),%rax
1014	adcq	$0,%rdx
1015	addq	%r11,%rdi
1016	leaq	32(%rcx),%rcx
1017	adcq	$0,%rdx
1018	movq	%r13,-8(%r14)
1019	movq	%rdx,%r13
1020
1021	addq	$32,%r15
1022	jnz	.Linner4x
1023
1024	mulq	%rbx
1025	addq	%rax,%r10
1026	movq	-16(%rcx),%rax
1027	adcq	$0,%rdx
1028	addq	16(%r14),%r10
1029	leaq	32(%r14),%r14
1030	adcq	$0,%rdx
1031	movq	%rdx,%r11
1032
1033	mulq	%rbp
1034	addq	%rax,%r13
1035	movq	-8(%rsi),%rax
1036	adcq	$0,%rdx
1037	addq	%r10,%r13
1038	adcq	$0,%rdx
1039	movq	%rdi,-32(%r14)
1040	movq	%rdx,%rdi
1041
1042	mulq	%rbx
1043	addq	%rax,%r11
1044	movq	%rbp,%rax
1045	movq	-8(%rcx),%rbp
1046	adcq	$0,%rdx
1047	addq	-8(%r14),%r11
1048	adcq	$0,%rdx
1049	movq	%rdx,%r10
1050
1051	mulq	%rbp
1052	addq	%rax,%rdi
1053	movq	(%rsi,%r9,1),%rax
1054	adcq	$0,%rdx
1055	addq	%r11,%rdi
1056	adcq	$0,%rdx
1057	movq	%r13,-24(%r14)
1058	movq	%rdx,%r13
1059
1060	movq	%rdi,-16(%r14)
1061	leaq	(%rcx,%r9,1),%rcx
1062
1063	xorq	%rdi,%rdi
1064	addq	%r10,%r13
1065	adcq	$0,%rdi
1066	addq	(%r14),%r13
1067	adcq	$0,%rdi
1068	movq	%r13,-8(%r14)
1069
1070	cmpq	16+8(%rsp),%r12
1071	jb	.Louter4x
1072	xorq	%rax,%rax
1073	subq	%r13,%rbp
1074	adcq	%r15,%r15
1075	orq	%r15,%rdi
1076	subq	%rdi,%rax
1077	leaq	(%r14,%r9,1),%rbx
1078	movq	(%rcx),%r12
1079	leaq	(%rcx),%rbp
1080	movq	%r9,%rcx
1081	sarq	$3+2,%rcx
1082	movq	56+8(%rsp),%rdi
1083	decq	%r12
1084	xorq	%r10,%r10
1085	movq	8(%rbp),%r13
1086	movq	16(%rbp),%r14
1087	movq	24(%rbp),%r15
1088	jmp	.Lsqr4x_sub_entry
1089.cfi_endproc
1090.size	mul4x_internal,.-mul4x_internal
1091.globl	bn_power5
1092.hidden bn_power5
1093.type	bn_power5,@function
1094.align	32
1095bn_power5:
1096.cfi_startproc
1097_CET_ENDBR
1098	movq	%rsp,%rax
1099.cfi_def_cfa_register	%rax
1100	leaq	OPENSSL_ia32cap_P(%rip),%r11
1101	movl	8(%r11),%r11d
1102	andl	$0x80108,%r11d
1103	cmpl	$0x80108,%r11d
1104	je	.Lpowerx5_enter
1105	pushq	%rbx
1106.cfi_offset	%rbx,-16
1107	pushq	%rbp
1108.cfi_offset	%rbp,-24
1109	pushq	%r12
1110.cfi_offset	%r12,-32
1111	pushq	%r13
1112.cfi_offset	%r13,-40
1113	pushq	%r14
1114.cfi_offset	%r14,-48
1115	pushq	%r15
1116.cfi_offset	%r15,-56
1117.Lpower5_prologue:
1118
1119	shll	$3,%r9d
1120	leal	(%r9,%r9,2),%r10d
1121	negq	%r9
1122	movq	(%r8),%r8
1123
1124
1125
1126
1127
1128
1129
1130
1131	leaq	-320(%rsp,%r9,2),%r11
1132	movq	%rsp,%rbp
1133	subq	%rdi,%r11
1134	andq	$4095,%r11
1135	cmpq	%r11,%r10
1136	jb	.Lpwr_sp_alt
1137	subq	%r11,%rbp
1138	leaq	-320(%rbp,%r9,2),%rbp
1139	jmp	.Lpwr_sp_done
1140
1141.align	32
1142.Lpwr_sp_alt:
1143	leaq	4096-320(,%r9,2),%r10
1144	leaq	-320(%rbp,%r9,2),%rbp
1145	subq	%r10,%r11
1146	movq	$0,%r10
1147	cmovcq	%r10,%r11
1148	subq	%r11,%rbp
1149.Lpwr_sp_done:
1150	andq	$-64,%rbp
1151	movq	%rsp,%r11
1152	subq	%rbp,%r11
1153	andq	$-4096,%r11
1154	leaq	(%r11,%rbp,1),%rsp
1155	movq	(%rsp),%r10
1156	cmpq	%rbp,%rsp
1157	ja	.Lpwr_page_walk
1158	jmp	.Lpwr_page_walk_done
1159
1160.Lpwr_page_walk:
1161	leaq	-4096(%rsp),%rsp
1162	movq	(%rsp),%r10
1163	cmpq	%rbp,%rsp
1164	ja	.Lpwr_page_walk
1165.Lpwr_page_walk_done:
1166
1167	movq	%r9,%r10
1168	negq	%r9
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179	movq	%r8,32(%rsp)
1180	movq	%rax,40(%rsp)
1181.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
1182.Lpower5_body:
1183.byte	102,72,15,110,207
1184.byte	102,72,15,110,209
1185.byte	102,73,15,110,218
1186.byte	102,72,15,110,226
1187
1188	call	__bn_sqr8x_internal
1189	call	__bn_post4x_internal
1190	call	__bn_sqr8x_internal
1191	call	__bn_post4x_internal
1192	call	__bn_sqr8x_internal
1193	call	__bn_post4x_internal
1194	call	__bn_sqr8x_internal
1195	call	__bn_post4x_internal
1196	call	__bn_sqr8x_internal
1197	call	__bn_post4x_internal
1198
1199.byte	102,72,15,126,209
1200.byte	102,72,15,126,226
1201	movq	%rsi,%rdi
1202	movq	40(%rsp),%rax
1203	leaq	32(%rsp),%r8
1204
1205	call	mul4x_internal
1206
1207	movq	40(%rsp),%rsi
1208.cfi_def_cfa	%rsi,8
1209	movq	$1,%rax
1210	movq	-48(%rsi),%r15
1211.cfi_restore	%r15
1212	movq	-40(%rsi),%r14
1213.cfi_restore	%r14
1214	movq	-32(%rsi),%r13
1215.cfi_restore	%r13
1216	movq	-24(%rsi),%r12
1217.cfi_restore	%r12
1218	movq	-16(%rsi),%rbp
1219.cfi_restore	%rbp
1220	movq	-8(%rsi),%rbx
1221.cfi_restore	%rbx
1222	leaq	(%rsi),%rsp
1223.cfi_def_cfa_register	%rsp
1224.Lpower5_epilogue:
1225	ret
1226.cfi_endproc
1227.size	bn_power5,.-bn_power5
1228
1229.globl	bn_sqr8x_internal
1230.hidden bn_sqr8x_internal
1231.hidden	bn_sqr8x_internal
1232.type	bn_sqr8x_internal,@function
1233.align	32
1234bn_sqr8x_internal:
1235__bn_sqr8x_internal:
1236.cfi_startproc
1237_CET_ENDBR
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311	leaq	32(%r10),%rbp
1312	leaq	(%rsi,%r9,1),%rsi
1313
1314	movq	%r9,%rcx
1315
1316
1317	movq	-32(%rsi,%rbp,1),%r14
1318	leaq	48+8(%rsp,%r9,2),%rdi
1319	movq	-24(%rsi,%rbp,1),%rax
1320	leaq	-32(%rdi,%rbp,1),%rdi
1321	movq	-16(%rsi,%rbp,1),%rbx
1322	movq	%rax,%r15
1323
1324	mulq	%r14
1325	movq	%rax,%r10
1326	movq	%rbx,%rax
1327	movq	%rdx,%r11
1328	movq	%r10,-24(%rdi,%rbp,1)
1329
1330	mulq	%r14
1331	addq	%rax,%r11
1332	movq	%rbx,%rax
1333	adcq	$0,%rdx
1334	movq	%r11,-16(%rdi,%rbp,1)
1335	movq	%rdx,%r10
1336
1337
1338	movq	-8(%rsi,%rbp,1),%rbx
1339	mulq	%r15
1340	movq	%rax,%r12
1341	movq	%rbx,%rax
1342	movq	%rdx,%r13
1343
1344	leaq	(%rbp),%rcx
1345	mulq	%r14
1346	addq	%rax,%r10
1347	movq	%rbx,%rax
1348	movq	%rdx,%r11
1349	adcq	$0,%r11
1350	addq	%r12,%r10
1351	adcq	$0,%r11
1352	movq	%r10,-8(%rdi,%rcx,1)
1353	jmp	.Lsqr4x_1st
1354
1355.align	32
1356.Lsqr4x_1st:
1357	movq	(%rsi,%rcx,1),%rbx
1358	mulq	%r15
1359	addq	%rax,%r13
1360	movq	%rbx,%rax
1361	movq	%rdx,%r12
1362	adcq	$0,%r12
1363
1364	mulq	%r14
1365	addq	%rax,%r11
1366	movq	%rbx,%rax
1367	movq	8(%rsi,%rcx,1),%rbx
1368	movq	%rdx,%r10
1369	adcq	$0,%r10
1370	addq	%r13,%r11
1371	adcq	$0,%r10
1372
1373
1374	mulq	%r15
1375	addq	%rax,%r12
1376	movq	%rbx,%rax
1377	movq	%r11,(%rdi,%rcx,1)
1378	movq	%rdx,%r13
1379	adcq	$0,%r13
1380
1381	mulq	%r14
1382	addq	%rax,%r10
1383	movq	%rbx,%rax
1384	movq	16(%rsi,%rcx,1),%rbx
1385	movq	%rdx,%r11
1386	adcq	$0,%r11
1387	addq	%r12,%r10
1388	adcq	$0,%r11
1389
1390	mulq	%r15
1391	addq	%rax,%r13
1392	movq	%rbx,%rax
1393	movq	%r10,8(%rdi,%rcx,1)
1394	movq	%rdx,%r12
1395	adcq	$0,%r12
1396
1397	mulq	%r14
1398	addq	%rax,%r11
1399	movq	%rbx,%rax
1400	movq	24(%rsi,%rcx,1),%rbx
1401	movq	%rdx,%r10
1402	adcq	$0,%r10
1403	addq	%r13,%r11
1404	adcq	$0,%r10
1405
1406
1407	mulq	%r15
1408	addq	%rax,%r12
1409	movq	%rbx,%rax
1410	movq	%r11,16(%rdi,%rcx,1)
1411	movq	%rdx,%r13
1412	adcq	$0,%r13
1413	leaq	32(%rcx),%rcx
1414
1415	mulq	%r14
1416	addq	%rax,%r10
1417	movq	%rbx,%rax
1418	movq	%rdx,%r11
1419	adcq	$0,%r11
1420	addq	%r12,%r10
1421	adcq	$0,%r11
1422	movq	%r10,-8(%rdi,%rcx,1)
1423
1424	cmpq	$0,%rcx
1425	jne	.Lsqr4x_1st
1426
1427	mulq	%r15
1428	addq	%rax,%r13
1429	leaq	16(%rbp),%rbp
1430	adcq	$0,%rdx
1431	addq	%r11,%r13
1432	adcq	$0,%rdx
1433
1434	movq	%r13,(%rdi)
1435	movq	%rdx,%r12
1436	movq	%rdx,8(%rdi)
1437	jmp	.Lsqr4x_outer
1438
1439.align	32
1440.Lsqr4x_outer:
1441	movq	-32(%rsi,%rbp,1),%r14
1442	leaq	48+8(%rsp,%r9,2),%rdi
1443	movq	-24(%rsi,%rbp,1),%rax
1444	leaq	-32(%rdi,%rbp,1),%rdi
1445	movq	-16(%rsi,%rbp,1),%rbx
1446	movq	%rax,%r15
1447
1448	mulq	%r14
1449	movq	-24(%rdi,%rbp,1),%r10
1450	addq	%rax,%r10
1451	movq	%rbx,%rax
1452	adcq	$0,%rdx
1453	movq	%r10,-24(%rdi,%rbp,1)
1454	movq	%rdx,%r11
1455
1456	mulq	%r14
1457	addq	%rax,%r11
1458	movq	%rbx,%rax
1459	adcq	$0,%rdx
1460	addq	-16(%rdi,%rbp,1),%r11
1461	movq	%rdx,%r10
1462	adcq	$0,%r10
1463	movq	%r11,-16(%rdi,%rbp,1)
1464
1465	xorq	%r12,%r12
1466
1467	movq	-8(%rsi,%rbp,1),%rbx
1468	mulq	%r15
1469	addq	%rax,%r12
1470	movq	%rbx,%rax
1471	adcq	$0,%rdx
1472	addq	-8(%rdi,%rbp,1),%r12
1473	movq	%rdx,%r13
1474	adcq	$0,%r13
1475
1476	mulq	%r14
1477	addq	%rax,%r10
1478	movq	%rbx,%rax
1479	adcq	$0,%rdx
1480	addq	%r12,%r10
1481	movq	%rdx,%r11
1482	adcq	$0,%r11
1483	movq	%r10,-8(%rdi,%rbp,1)
1484
1485	leaq	(%rbp),%rcx
1486	jmp	.Lsqr4x_inner
1487
1488.align	32
1489.Lsqr4x_inner:
1490	movq	(%rsi,%rcx,1),%rbx
1491	mulq	%r15
1492	addq	%rax,%r13
1493	movq	%rbx,%rax
1494	movq	%rdx,%r12
1495	adcq	$0,%r12
1496	addq	(%rdi,%rcx,1),%r13
1497	adcq	$0,%r12
1498
1499.byte	0x67
1500	mulq	%r14
1501	addq	%rax,%r11
1502	movq	%rbx,%rax
1503	movq	8(%rsi,%rcx,1),%rbx
1504	movq	%rdx,%r10
1505	adcq	$0,%r10
1506	addq	%r13,%r11
1507	adcq	$0,%r10
1508
1509	mulq	%r15
1510	addq	%rax,%r12
1511	movq	%r11,(%rdi,%rcx,1)
1512	movq	%rbx,%rax
1513	movq	%rdx,%r13
1514	adcq	$0,%r13
1515	addq	8(%rdi,%rcx,1),%r12
1516	leaq	16(%rcx),%rcx
1517	adcq	$0,%r13
1518
1519	mulq	%r14
1520	addq	%rax,%r10
1521	movq	%rbx,%rax
1522	adcq	$0,%rdx
1523	addq	%r12,%r10
1524	movq	%rdx,%r11
1525	adcq	$0,%r11
1526	movq	%r10,-8(%rdi,%rcx,1)
1527
1528	cmpq	$0,%rcx
1529	jne	.Lsqr4x_inner
1530
1531.byte	0x67
1532	mulq	%r15
1533	addq	%rax,%r13
1534	adcq	$0,%rdx
1535	addq	%r11,%r13
1536	adcq	$0,%rdx
1537
1538	movq	%r13,(%rdi)
1539	movq	%rdx,%r12
1540	movq	%rdx,8(%rdi)
1541
1542	addq	$16,%rbp
1543	jnz	.Lsqr4x_outer
1544
1545
1546	movq	-32(%rsi),%r14
1547	leaq	48+8(%rsp,%r9,2),%rdi
1548	movq	-24(%rsi),%rax
1549	leaq	-32(%rdi,%rbp,1),%rdi
1550	movq	-16(%rsi),%rbx
1551	movq	%rax,%r15
1552
1553	mulq	%r14
1554	addq	%rax,%r10
1555	movq	%rbx,%rax
1556	movq	%rdx,%r11
1557	adcq	$0,%r11
1558
1559	mulq	%r14
1560	addq	%rax,%r11
1561	movq	%rbx,%rax
1562	movq	%r10,-24(%rdi)
1563	movq	%rdx,%r10
1564	adcq	$0,%r10
1565	addq	%r13,%r11
1566	movq	-8(%rsi),%rbx
1567	adcq	$0,%r10
1568
1569	mulq	%r15
1570	addq	%rax,%r12
1571	movq	%rbx,%rax
1572	movq	%r11,-16(%rdi)
1573	movq	%rdx,%r13
1574	adcq	$0,%r13
1575
1576	mulq	%r14
1577	addq	%rax,%r10
1578	movq	%rbx,%rax
1579	movq	%rdx,%r11
1580	adcq	$0,%r11
1581	addq	%r12,%r10
1582	adcq	$0,%r11
1583	movq	%r10,-8(%rdi)
1584
1585	mulq	%r15
1586	addq	%rax,%r13
1587	movq	-16(%rsi),%rax
1588	adcq	$0,%rdx
1589	addq	%r11,%r13
1590	adcq	$0,%rdx
1591
1592	movq	%r13,(%rdi)
1593	movq	%rdx,%r12
1594	movq	%rdx,8(%rdi)
1595
1596	mulq	%rbx
1597	addq	$16,%rbp
1598	xorq	%r14,%r14
1599	subq	%r9,%rbp
1600	xorq	%r15,%r15
1601
1602	addq	%r12,%rax
1603	adcq	$0,%rdx
1604	movq	%rax,8(%rdi)
1605	movq	%rdx,16(%rdi)
1606	movq	%r15,24(%rdi)
1607
1608	movq	-16(%rsi,%rbp,1),%rax
1609	leaq	48+8(%rsp),%rdi
1610	xorq	%r10,%r10
1611	movq	8(%rdi),%r11
1612
1613	leaq	(%r14,%r10,2),%r12
1614	shrq	$63,%r10
1615	leaq	(%rcx,%r11,2),%r13
1616	shrq	$63,%r11
1617	orq	%r10,%r13
1618	movq	16(%rdi),%r10
1619	movq	%r11,%r14
1620	mulq	%rax
1621	negq	%r15
1622	movq	24(%rdi),%r11
1623	adcq	%rax,%r12
1624	movq	-8(%rsi,%rbp,1),%rax
1625	movq	%r12,(%rdi)
1626	adcq	%rdx,%r13
1627
1628	leaq	(%r14,%r10,2),%rbx
1629	movq	%r13,8(%rdi)
1630	sbbq	%r15,%r15
1631	shrq	$63,%r10
1632	leaq	(%rcx,%r11,2),%r8
1633	shrq	$63,%r11
1634	orq	%r10,%r8
1635	movq	32(%rdi),%r10
1636	movq	%r11,%r14
1637	mulq	%rax
1638	negq	%r15
1639	movq	40(%rdi),%r11
1640	adcq	%rax,%rbx
1641	movq	0(%rsi,%rbp,1),%rax
1642	movq	%rbx,16(%rdi)
1643	adcq	%rdx,%r8
1644	leaq	16(%rbp),%rbp
1645	movq	%r8,24(%rdi)
1646	sbbq	%r15,%r15
1647	leaq	64(%rdi),%rdi
1648	jmp	.Lsqr4x_shift_n_add
1649
1650.align	32
1651.Lsqr4x_shift_n_add:
1652	leaq	(%r14,%r10,2),%r12
1653	shrq	$63,%r10
1654	leaq	(%rcx,%r11,2),%r13
1655	shrq	$63,%r11
1656	orq	%r10,%r13
1657	movq	-16(%rdi),%r10
1658	movq	%r11,%r14
1659	mulq	%rax
1660	negq	%r15
1661	movq	-8(%rdi),%r11
1662	adcq	%rax,%r12
1663	movq	-8(%rsi,%rbp,1),%rax
1664	movq	%r12,-32(%rdi)
1665	adcq	%rdx,%r13
1666
1667	leaq	(%r14,%r10,2),%rbx
1668	movq	%r13,-24(%rdi)
1669	sbbq	%r15,%r15
1670	shrq	$63,%r10
1671	leaq	(%rcx,%r11,2),%r8
1672	shrq	$63,%r11
1673	orq	%r10,%r8
1674	movq	0(%rdi),%r10
1675	movq	%r11,%r14
1676	mulq	%rax
1677	negq	%r15
1678	movq	8(%rdi),%r11
1679	adcq	%rax,%rbx
1680	movq	0(%rsi,%rbp,1),%rax
1681	movq	%rbx,-16(%rdi)
1682	adcq	%rdx,%r8
1683
1684	leaq	(%r14,%r10,2),%r12
1685	movq	%r8,-8(%rdi)
1686	sbbq	%r15,%r15
1687	shrq	$63,%r10
1688	leaq	(%rcx,%r11,2),%r13
1689	shrq	$63,%r11
1690	orq	%r10,%r13
1691	movq	16(%rdi),%r10
1692	movq	%r11,%r14
1693	mulq	%rax
1694	negq	%r15
1695	movq	24(%rdi),%r11
1696	adcq	%rax,%r12
1697	movq	8(%rsi,%rbp,1),%rax
1698	movq	%r12,0(%rdi)
1699	adcq	%rdx,%r13
1700
1701	leaq	(%r14,%r10,2),%rbx
1702	movq	%r13,8(%rdi)
1703	sbbq	%r15,%r15
1704	shrq	$63,%r10
1705	leaq	(%rcx,%r11,2),%r8
1706	shrq	$63,%r11
1707	orq	%r10,%r8
1708	movq	32(%rdi),%r10
1709	movq	%r11,%r14
1710	mulq	%rax
1711	negq	%r15
1712	movq	40(%rdi),%r11
1713	adcq	%rax,%rbx
1714	movq	16(%rsi,%rbp,1),%rax
1715	movq	%rbx,16(%rdi)
1716	adcq	%rdx,%r8
1717	movq	%r8,24(%rdi)
1718	sbbq	%r15,%r15
1719	leaq	64(%rdi),%rdi
1720	addq	$32,%rbp
1721	jnz	.Lsqr4x_shift_n_add
1722
1723	leaq	(%r14,%r10,2),%r12
1724.byte	0x67
1725	shrq	$63,%r10
1726	leaq	(%rcx,%r11,2),%r13
1727	shrq	$63,%r11
1728	orq	%r10,%r13
1729	movq	-16(%rdi),%r10
1730	movq	%r11,%r14
1731	mulq	%rax
1732	negq	%r15
1733	movq	-8(%rdi),%r11
1734	adcq	%rax,%r12
1735	movq	-8(%rsi),%rax
1736	movq	%r12,-32(%rdi)
1737	adcq	%rdx,%r13
1738
1739	leaq	(%r14,%r10,2),%rbx
1740	movq	%r13,-24(%rdi)
1741	sbbq	%r15,%r15
1742	shrq	$63,%r10
1743	leaq	(%rcx,%r11,2),%r8
1744	shrq	$63,%r11
1745	orq	%r10,%r8
1746	mulq	%rax
1747	negq	%r15
1748	adcq	%rax,%rbx
1749	adcq	%rdx,%r8
1750	movq	%rbx,-16(%rdi)
1751	movq	%r8,-8(%rdi)
1752.byte	102,72,15,126,213
1753__bn_sqr8x_reduction:
1754	xorq	%rax,%rax
1755	leaq	(%r9,%rbp,1),%rcx
1756	leaq	48+8(%rsp,%r9,2),%rdx
1757	movq	%rcx,0+8(%rsp)
1758	leaq	48+8(%rsp,%r9,1),%rdi
1759	movq	%rdx,8+8(%rsp)
1760	negq	%r9
1761	jmp	.L8x_reduction_loop
1762
1763.align	32
1764.L8x_reduction_loop:
1765	leaq	(%rdi,%r9,1),%rdi
1766.byte	0x66
1767	movq	0(%rdi),%rbx
1768	movq	8(%rdi),%r9
1769	movq	16(%rdi),%r10
1770	movq	24(%rdi),%r11
1771	movq	32(%rdi),%r12
1772	movq	40(%rdi),%r13
1773	movq	48(%rdi),%r14
1774	movq	56(%rdi),%r15
1775	movq	%rax,(%rdx)
1776	leaq	64(%rdi),%rdi
1777
1778.byte	0x67
1779	movq	%rbx,%r8
1780	imulq	32+8(%rsp),%rbx
1781	movq	0(%rbp),%rax
1782	movl	$8,%ecx
1783	jmp	.L8x_reduce
1784
1785.align	32
1786.L8x_reduce:
1787	mulq	%rbx
1788	movq	8(%rbp),%rax
1789	negq	%r8
1790	movq	%rdx,%r8
1791	adcq	$0,%r8
1792
1793	mulq	%rbx
1794	addq	%rax,%r9
1795	movq	16(%rbp),%rax
1796	adcq	$0,%rdx
1797	addq	%r9,%r8
1798	movq	%rbx,48-8+8(%rsp,%rcx,8)
1799	movq	%rdx,%r9
1800	adcq	$0,%r9
1801
1802	mulq	%rbx
1803	addq	%rax,%r10
1804	movq	24(%rbp),%rax
1805	adcq	$0,%rdx
1806	addq	%r10,%r9
1807	movq	32+8(%rsp),%rsi
1808	movq	%rdx,%r10
1809	adcq	$0,%r10
1810
1811	mulq	%rbx
1812	addq	%rax,%r11
1813	movq	32(%rbp),%rax
1814	adcq	$0,%rdx
1815	imulq	%r8,%rsi
1816	addq	%r11,%r10
1817	movq	%rdx,%r11
1818	adcq	$0,%r11
1819
1820	mulq	%rbx
1821	addq	%rax,%r12
1822	movq	40(%rbp),%rax
1823	adcq	$0,%rdx
1824	addq	%r12,%r11
1825	movq	%rdx,%r12
1826	adcq	$0,%r12
1827
1828	mulq	%rbx
1829	addq	%rax,%r13
1830	movq	48(%rbp),%rax
1831	adcq	$0,%rdx
1832	addq	%r13,%r12
1833	movq	%rdx,%r13
1834	adcq	$0,%r13
1835
1836	mulq	%rbx
1837	addq	%rax,%r14
1838	movq	56(%rbp),%rax
1839	adcq	$0,%rdx
1840	addq	%r14,%r13
1841	movq	%rdx,%r14
1842	adcq	$0,%r14
1843
1844	mulq	%rbx
1845	movq	%rsi,%rbx
1846	addq	%rax,%r15
1847	movq	0(%rbp),%rax
1848	adcq	$0,%rdx
1849	addq	%r15,%r14
1850	movq	%rdx,%r15
1851	adcq	$0,%r15
1852
1853	decl	%ecx
1854	jnz	.L8x_reduce
1855
1856	leaq	64(%rbp),%rbp
1857	xorq	%rax,%rax
1858	movq	8+8(%rsp),%rdx
1859	cmpq	0+8(%rsp),%rbp
1860	jae	.L8x_no_tail
1861
1862.byte	0x66
1863	addq	0(%rdi),%r8
1864	adcq	8(%rdi),%r9
1865	adcq	16(%rdi),%r10
1866	adcq	24(%rdi),%r11
1867	adcq	32(%rdi),%r12
1868	adcq	40(%rdi),%r13
1869	adcq	48(%rdi),%r14
1870	adcq	56(%rdi),%r15
1871	sbbq	%rsi,%rsi
1872
1873	movq	48+56+8(%rsp),%rbx
1874	movl	$8,%ecx
1875	movq	0(%rbp),%rax
1876	jmp	.L8x_tail
1877
1878.align	32
1879.L8x_tail:
1880	mulq	%rbx
1881	addq	%rax,%r8
1882	movq	8(%rbp),%rax
1883	movq	%r8,(%rdi)
1884	movq	%rdx,%r8
1885	adcq	$0,%r8
1886
1887	mulq	%rbx
1888	addq	%rax,%r9
1889	movq	16(%rbp),%rax
1890	adcq	$0,%rdx
1891	addq	%r9,%r8
1892	leaq	8(%rdi),%rdi
1893	movq	%rdx,%r9
1894	adcq	$0,%r9
1895
1896	mulq	%rbx
1897	addq	%rax,%r10
1898	movq	24(%rbp),%rax
1899	adcq	$0,%rdx
1900	addq	%r10,%r9
1901	movq	%rdx,%r10
1902	adcq	$0,%r10
1903
1904	mulq	%rbx
1905	addq	%rax,%r11
1906	movq	32(%rbp),%rax
1907	adcq	$0,%rdx
1908	addq	%r11,%r10
1909	movq	%rdx,%r11
1910	adcq	$0,%r11
1911
1912	mulq	%rbx
1913	addq	%rax,%r12
1914	movq	40(%rbp),%rax
1915	adcq	$0,%rdx
1916	addq	%r12,%r11
1917	movq	%rdx,%r12
1918	adcq	$0,%r12
1919
1920	mulq	%rbx
1921	addq	%rax,%r13
1922	movq	48(%rbp),%rax
1923	adcq	$0,%rdx
1924	addq	%r13,%r12
1925	movq	%rdx,%r13
1926	adcq	$0,%r13
1927
1928	mulq	%rbx
1929	addq	%rax,%r14
1930	movq	56(%rbp),%rax
1931	adcq	$0,%rdx
1932	addq	%r14,%r13
1933	movq	%rdx,%r14
1934	adcq	$0,%r14
1935
1936	mulq	%rbx
1937	movq	48-16+8(%rsp,%rcx,8),%rbx
1938	addq	%rax,%r15
1939	adcq	$0,%rdx
1940	addq	%r15,%r14
1941	movq	0(%rbp),%rax
1942	movq	%rdx,%r15
1943	adcq	$0,%r15
1944
1945	decl	%ecx
1946	jnz	.L8x_tail
1947
1948	leaq	64(%rbp),%rbp
1949	movq	8+8(%rsp),%rdx
1950	cmpq	0+8(%rsp),%rbp
1951	jae	.L8x_tail_done
1952
1953	movq	48+56+8(%rsp),%rbx
1954	negq	%rsi
1955	movq	0(%rbp),%rax
1956	adcq	0(%rdi),%r8
1957	adcq	8(%rdi),%r9
1958	adcq	16(%rdi),%r10
1959	adcq	24(%rdi),%r11
1960	adcq	32(%rdi),%r12
1961	adcq	40(%rdi),%r13
1962	adcq	48(%rdi),%r14
1963	adcq	56(%rdi),%r15
1964	sbbq	%rsi,%rsi
1965
1966	movl	$8,%ecx
1967	jmp	.L8x_tail
1968
1969.align	32
1970.L8x_tail_done:
1971	xorq	%rax,%rax
1972	addq	(%rdx),%r8
1973	adcq	$0,%r9
1974	adcq	$0,%r10
1975	adcq	$0,%r11
1976	adcq	$0,%r12
1977	adcq	$0,%r13
1978	adcq	$0,%r14
1979	adcq	$0,%r15
1980	adcq	$0,%rax
1981
1982	negq	%rsi
1983.L8x_no_tail:
1984	adcq	0(%rdi),%r8
1985	adcq	8(%rdi),%r9
1986	adcq	16(%rdi),%r10
1987	adcq	24(%rdi),%r11
1988	adcq	32(%rdi),%r12
1989	adcq	40(%rdi),%r13
1990	adcq	48(%rdi),%r14
1991	adcq	56(%rdi),%r15
1992	adcq	$0,%rax
1993	movq	-8(%rbp),%rcx
1994	xorq	%rsi,%rsi
1995
1996.byte	102,72,15,126,213
1997
1998	movq	%r8,0(%rdi)
1999	movq	%r9,8(%rdi)
2000.byte	102,73,15,126,217
2001	movq	%r10,16(%rdi)
2002	movq	%r11,24(%rdi)
2003	movq	%r12,32(%rdi)
2004	movq	%r13,40(%rdi)
2005	movq	%r14,48(%rdi)
2006	movq	%r15,56(%rdi)
2007	leaq	64(%rdi),%rdi
2008
2009	cmpq	%rdx,%rdi
2010	jb	.L8x_reduction_loop
2011	ret
2012.cfi_endproc
2013.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal
 *
 * Conditional-subtraction tail: for each group of 4 limbs it computes
 *   out[i] = t[i] + ((~n[i]) & mask) + carry
 * which, when mask == -1, subtracts the modulus n from the vector at
 * %rbx, and when mask == 0 copies it unchanged -- a constant-time
 * select between t and t-n.
 *
 * NOTE(review): register roles below are inferred from the visible code
 * and callers in this file; confirm against the generating perlasm:
 *   %rbp  modulus pointer (limbs read at 0/8/16/24(%rbp))
 *   %rax  selection mask; negated on entry so it becomes 0 or -1
 *   %r9   vector length in bytes; %rcx = %r9 >> 5 gives the (negative)
 *         4-limb group counter for the loop
 *   %xmm1 holds a pointer that is unpacked into both %rdi (destination)
 *         and %rsi via the hard-coded MOVQ encodings below
 *   %r10  running borrow between 4-limb groups (0 or -1 via sbb)
 */
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
.cfi_startproc
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
/* movq %xmm1,%rdi (66 REX.W 0F 7E /r, hard-coded encoding) */
.byte	102,72,15,126,207
	negq	%rax
/* movq %xmm1,%rsi (66 REX.W 0F 7E /r, hard-coded encoding) */
.byte	102,72,15,126,206
	sarq	$3+2,%rcx
/* decq before notq: ~(n0-1) == -n0, which folds in the missing
   initial +1 carry of the two's-complement subtraction (CF starts 0). */
	decq	%r12
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
/* Build (~n) & mask for the 4 limbs of this group. */
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

/* negq reloads CF from the saved inter-group borrow in %r10. */
	negq	%r10
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub

/* Restore %r9 (length) sign convention expected by the caller. */
	movq	%r9,%r10
	negq	%r9
	ret
.cfi_endproc
.size	__bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_mulx4x_mont_gather5
 *
 * MULX/ADX entry point of the Montgomery multiply-with-gather family:
 * sets up a page-probed, 64-byte-aligned stack frame, stashes n0 and the
 * saved stack pointer, and delegates all arithmetic to mulx4x_internal.
 * Returns 1 in %rax.
 *
 * NOTE(review): SysV argument roles (rdi=rp, rsi=ap, rdx=bp table,
 * rcx=np, r8=&n0, r9=num words) are inferred from the
 * bn_mul_mont_gather5 contract at the top of this file -- confirm
 * against the generating perlasm.
 */
.type	bn_mulx4x_mont_gather5,@function
.align	32
bn_mulx4x_mont_gather5:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

/* %r9 = num*8 bytes; %r10 = 3*num*8; load n0 from (%r8). */
	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8

/* Choose a frame base whose distance to rp (%rdi) modulo 4096 avoids
   cache-bank/page aliasing between the frame and the output. */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmulx4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmulx4xsp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
/* Touch each new stack page in turn so the OS guard page is grown
   safely (stack-probe walk). */
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

/* Frame layout: 32(%rsp) = n0, 40(%rsp) = original %rsp. */
	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax

/* Restore callee-saved registers from below the saved %rsp. */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	ret
.cfi_endproc
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2180
/*
 * mulx4x_internal
 *
 * Core MULX/ADCX/ADOX Montgomery multiplication worker, processing the
 * operand 4 limbs at a time with two independent carry chains (CF via
 * adcx, OF via adox).  The multiplier limbs are fetched from a 32-entry
 * power table via a constant-time gather: equality masks against the
 * .Linc counter table are precomputed on the stack, then every table
 * entry is ANDed with its mask and ORed together, so memory access
 * pattern is independent of the secret index.
 *
 * NOTE(review): on-frame slots used here (inferred from the stores
 * below -- confirm against the perlasm source):
 *   8(%rsp)        num*8 (negative)
 *   16+8(%rsp)     end-of-table sentinel for the outer loop
 *   24+8(%rsp)     inner-loop trip count
 *   32+8(%rsp)     n0 (stored by the caller)
 *   56+8(%rsp)     rp (result pointer, consumed by the sub tail)
 *   8+8(%rsp)      current table pointer across iterations
 * Falls through into .Lsqrx4x_sub_entry (defined later in this file)
 * for the final conditional subtraction.
 */
.type	mulx4x_internal,@function
.align	32
mulx4x_internal:
.cfi_startproc
	movq	%r9,8(%rsp)
	movq	%r9,%r10
	negq	%r9
	shlq	$5,%r9
	negq	%r10
	leaq	128(%rdx,%r9,1),%r13
	shrq	$5+5,%r9
/* %xmm5 = requested power/index, broadcast to all lanes below. */
	movd	8(%rax),%xmm5
	subq	$1,%r9
	leaq	.Linc(%rip),%rax
	movq	%r13,16+8(%rsp)
	movq	%r9,24+8(%rsp)
	movq	%rdi,56+8(%rsp)
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r10,1),%r10
	leaq	128(%rdx),%rdi

/* Build 16 equality-mask vectors (index == 0..31 in dword lanes) and
   park them at 112..352(%r10); %xmm0/1 run the counter, %xmm4 is the
   increment.  The stray 0x67 bytes are benign address-size prefixes
   used for code alignment. */
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67
	movdqa	%xmm1,%xmm2
.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

/* Constant-time gather: AND every table vector with its mask, OR all
   products together; only the selected entry survives. */
	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0

/* Fold the two 64-bit halves; result limb lands in the low qword. */
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%rdi),%rdi
/* movq %xmm0,%rdx (hard-coded encoding): gathered b-limb -> mulx src. */
.byte	102,72,15,126,194
	leaq	64+32+8(%rsp),%rbx

/* First 4-limb column: a[0..3] * b[0], then fold in the Montgomery
   multiple of n computed from n0 (two carry chains: adcx/adox). */
	movq	%rdx,%r9
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8
	xorq	%rbp,%rbp
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	.Lmulx4x_1st

/* First-pass loop over the remaining a/n limbs for multiplier b[0]. */
.align	32
.Lmulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

/* Wind pointers back for the next multiplier limb; %rbp keeps the
   propagated top carry. */
	movq	8(%rsp),%rax
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi
	adcq	%rbp,%rbp
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

/* Outer loop: gather the next b-limb (same constant-time mask scheme,
   masks read back from 256..496(%r10)), then multiply-accumulate into
   the partial result already stored at (%rbx). */
.align	32
.Lmulx4x_outer:
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4

	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
/* movq %xmm0,%rdx (hard-coded encoding): next gathered b-limb. */
.byte	102,72,15,126,194

	movq	%rbp,(%rbx)
	leaq	32(%rbx,%rax,1),%rbx
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8

	movq	%r8,%rdx
	xorq	%rbp,%rbp
	movq	%rdi,8+8(%rsp)

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	.Lmulx4x_inner

/* Inner loop: like .Lmulx4x_1st but also accumulates the previously
   stored partial sums read back from (%rbx). */
.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0+8(%rsp),%rax
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi
	movq	8+8(%rsp),%rdi
	movq	16+8(%rsp),%r10
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi
	adcq	%rbp,%rbp
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi
	jb	.Lmulx4x_outer

/* Epilogue: derive the constant-time selection mask in %rax from the
   top carry (%rbp) and a trial compare of the top limb against n, then
   fall through to the shared conditional-subtraction tail. */
	movq	-8(%rcx),%r10
	movq	%rbp,%r8
	movq	(%rcx,%rax,1),%r12
	leaq	(%rcx,%rax,1),%rbp
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10
	adcq	%r15,%r15
	orq	%r15,%r8
	sarq	$3+2,%rcx
	subq	%r8,%rax
	movq	56+8(%rsp),%rdx
	decq	%r12
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
/* .Lsqrx4x_sub_entry is defined later in this file (outside this
   function's visible span). */
	jmp	.Lsqrx4x_sub_entry
.cfi_endproc
.size	mulx4x_internal,.-mulx4x_internal
/*
 * bn_powerx5
 *
 * MULX/ADX path for raising to the 32nd power with one table multiply:
 * performs five consecutive squarings (__bn_sqrx8x_internal followed by
 * __bn_postx4x_internal each time) and then one Montgomery
 * multiplication via mulx4x_internal.  Stack setup mirrors
 * bn_mulx4x_mont_gather5: page-probed, 64-byte aligned, frame offset
 * chosen relative to rp to avoid page aliasing.
 *
 * NOTE(review): argument registers are stashed in %xmm1-%xmm4 across
 * the squarings via hard-coded MOVQ encodings (rdi->xmm1, rcx->xmm2,
 * r10->xmm3, rdx->xmm4) and partially restored before mulx4x_internal.
 */
.type	bn_powerx5,@function
.align	32
bn_powerx5:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lpowerx5_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpowerx5_prologue:

/* %r9 = num*8 bytes; %r10 = 3*num*8; load n0 from (%r8). */
	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8

/* Pick a frame base decorrelated (mod 4096) from the output pointer. */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwrx_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwrx_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
/* Probe each newly exposed stack page (guard-page walk). */
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

.Lpwrx_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	movq	%r9,%r10
	negq	%r9

/* Preserve arguments in xmm registers across the squaring calls:
   movq %rdi,%xmm1 / %rcx,%xmm2 / %r10,%xmm3 / %rdx,%xmm4
   (66 [REX.W] 0F 6E /r hard-coded encodings). */
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207
.byte	102,72,15,110,209
.byte	102,73,15,110,218
.byte	102,72,15,110,226
/* Frame layout: 32(%rsp) = n0, 40(%rsp) = original %rsp. */
	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpowerx5_body:

/* Five squarings: a^2, a^4, a^8, a^16, a^32 (each squaring followed by
   the conditional-subtraction tail). */
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

/* Restore num, np, bp (movq %xmm2,%rcx / %xmm4,%rdx) and multiply. */
	movq	%r10,%r9
	movq	%rsi,%rdi
.byte	102,72,15,126,209
.byte	102,72,15,126,226
	movq	40(%rsp),%rax

	call	mulx4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax

/* Restore callee-saved registers and the caller's stack pointer. */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpowerx5_epilogue:
	ret
.cfi_endproc
.size	bn_powerx5,.-bn_powerx5
2741
2742.globl	bn_sqrx8x_internal
2743.hidden bn_sqrx8x_internal
2744.hidden	bn_sqrx8x_internal
2745.type	bn_sqrx8x_internal,@function
2746.align	32
2747bn_sqrx8x_internal:
2748__bn_sqrx8x_internal:
2749.cfi_startproc
2750_CET_ENDBR
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791	leaq	48+8(%rsp),%rdi
2792	leaq	(%rsi,%r9,1),%rbp
2793	movq	%r9,0+8(%rsp)
2794	movq	%rbp,8+8(%rsp)
2795	jmp	.Lsqr8x_zero_start
2796
2797.align	32
2798.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2799.Lsqrx8x_zero:
2800.byte	0x3e
2801	movdqa	%xmm0,0(%rdi)
2802	movdqa	%xmm0,16(%rdi)
2803	movdqa	%xmm0,32(%rdi)
2804	movdqa	%xmm0,48(%rdi)
2805.Lsqr8x_zero_start:
2806	movdqa	%xmm0,64(%rdi)
2807	movdqa	%xmm0,80(%rdi)
2808	movdqa	%xmm0,96(%rdi)
2809	movdqa	%xmm0,112(%rdi)
2810	leaq	128(%rdi),%rdi
2811	subq	$64,%r9
2812	jnz	.Lsqrx8x_zero
2813
2814	movq	0(%rsi),%rdx
2815
2816	xorq	%r10,%r10
2817	xorq	%r11,%r11
2818	xorq	%r12,%r12
2819	xorq	%r13,%r13
2820	xorq	%r14,%r14
2821	xorq	%r15,%r15
2822	leaq	48+8(%rsp),%rdi
2823	xorq	%rbp,%rbp
2824	jmp	.Lsqrx8x_outer_loop
2825
2826.align	32
2827.Lsqrx8x_outer_loop:
2828	mulxq	8(%rsi),%r8,%rax
2829	adcxq	%r9,%r8
2830	adoxq	%rax,%r10
2831	mulxq	16(%rsi),%r9,%rax
2832	adcxq	%r10,%r9
2833	adoxq	%rax,%r11
2834.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2835	adcxq	%r11,%r10
2836	adoxq	%rax,%r12
2837.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2838	adcxq	%r12,%r11
2839	adoxq	%rax,%r13
2840	mulxq	40(%rsi),%r12,%rax
2841	adcxq	%r13,%r12
2842	adoxq	%rax,%r14
2843	mulxq	48(%rsi),%r13,%rax
2844	adcxq	%r14,%r13
2845	adoxq	%r15,%rax
2846	mulxq	56(%rsi),%r14,%r15
2847	movq	8(%rsi),%rdx
2848	adcxq	%rax,%r14
2849	adoxq	%rbp,%r15
2850	adcq	64(%rdi),%r15
2851	movq	%r8,8(%rdi)
2852	movq	%r9,16(%rdi)
2853	sbbq	%rcx,%rcx
2854	xorq	%rbp,%rbp
2855
2856
2857	mulxq	16(%rsi),%r8,%rbx
2858	mulxq	24(%rsi),%r9,%rax
2859	adcxq	%r10,%r8
2860	adoxq	%rbx,%r9
2861	mulxq	32(%rsi),%r10,%rbx
2862	adcxq	%r11,%r9
2863	adoxq	%rax,%r10
2864.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2865	adcxq	%r12,%r10
2866	adoxq	%rbx,%r11
2867.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2868	adcxq	%r13,%r11
2869	adoxq	%r14,%r12
2870.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2871	movq	16(%rsi),%rdx
2872	adcxq	%rax,%r12
2873	adoxq	%rbx,%r13
2874	adcxq	%r15,%r13
2875	adoxq	%rbp,%r14
2876	adcxq	%rbp,%r14
2877
2878	movq	%r8,24(%rdi)
2879	movq	%r9,32(%rdi)
2880
2881	mulxq	24(%rsi),%r8,%rbx
2882	mulxq	32(%rsi),%r9,%rax
2883	adcxq	%r10,%r8
2884	adoxq	%rbx,%r9
2885	mulxq	40(%rsi),%r10,%rbx
2886	adcxq	%r11,%r9
2887	adoxq	%rax,%r10
2888.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2889	adcxq	%r12,%r10
2890	adoxq	%r13,%r11
2891.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
2892.byte	0x3e
2893	movq	24(%rsi),%rdx
2894	adcxq	%rbx,%r11
2895	adoxq	%rax,%r12
2896	adcxq	%r14,%r12
2897	movq	%r8,40(%rdi)
2898	movq	%r9,48(%rdi)
2899	mulxq	32(%rsi),%r8,%rax
2900	adoxq	%rbp,%r13
2901	adcxq	%rbp,%r13
2902
2903	mulxq	40(%rsi),%r9,%rbx
2904	adcxq	%r10,%r8
2905	adoxq	%rax,%r9
2906	mulxq	48(%rsi),%r10,%rax
2907	adcxq	%r11,%r9
2908	adoxq	%r12,%r10
2909	mulxq	56(%rsi),%r11,%r12
2910	movq	32(%rsi),%rdx
2911	movq	40(%rsi),%r14
2912	adcxq	%rbx,%r10
2913	adoxq	%rax,%r11
2914	movq	48(%rsi),%r15
2915	adcxq	%r13,%r11
2916	adoxq	%rbp,%r12
2917	adcxq	%rbp,%r12
2918
2919	movq	%r8,56(%rdi)
2920	movq	%r9,64(%rdi)
2921
2922	mulxq	%r14,%r9,%rax
2923	movq	56(%rsi),%r8
2924	adcxq	%r10,%r9
2925	mulxq	%r15,%r10,%rbx
2926	adoxq	%rax,%r10
2927	adcxq	%r11,%r10
2928	mulxq	%r8,%r11,%rax
2929	movq	%r14,%rdx
2930	adoxq	%rbx,%r11
2931	adcxq	%r12,%r11
2932
2933	adcxq	%rbp,%rax
2934
2935	mulxq	%r15,%r14,%rbx
2936	mulxq	%r8,%r12,%r13
2937	movq	%r15,%rdx
2938	leaq	64(%rsi),%rsi
2939	adcxq	%r14,%r11
2940	adoxq	%rbx,%r12
2941	adcxq	%rax,%r12
2942	adoxq	%rbp,%r13
2943
2944.byte	0x67,0x67
2945	mulxq	%r8,%r8,%r14
2946	adcxq	%r8,%r13
2947	adcxq	%rbp,%r14
2948
2949	cmpq	8+8(%rsp),%rsi
2950	je	.Lsqrx8x_outer_break
2951
2952	negq	%rcx
2953	movq	$-8,%rcx
2954	movq	%rbp,%r15
2955	movq	64(%rdi),%r8
2956	adcxq	72(%rdi),%r9
2957	adcxq	80(%rdi),%r10
2958	adcxq	88(%rdi),%r11
2959	adcq	96(%rdi),%r12
2960	adcq	104(%rdi),%r13
2961	adcq	112(%rdi),%r14
2962	adcq	120(%rdi),%r15
2963	leaq	(%rsi),%rbp
2964	leaq	128(%rdi),%rdi
2965	sbbq	%rax,%rax
2966
2967	movq	-64(%rsi),%rdx
2968	movq	%rax,16+8(%rsp)
2969	movq	%rdi,24+8(%rsp)
2970
2971
2972	xorl	%eax,%eax
2973	jmp	.Lsqrx8x_loop
2974
2975.align	32
2976.Lsqrx8x_loop:
2977	movq	%r8,%rbx
2978	mulxq	0(%rbp),%rax,%r8
2979	adcxq	%rax,%rbx
2980	adoxq	%r9,%r8
2981
2982	mulxq	8(%rbp),%rax,%r9
2983	adcxq	%rax,%r8
2984	adoxq	%r10,%r9
2985
2986	mulxq	16(%rbp),%rax,%r10
2987	adcxq	%rax,%r9
2988	adoxq	%r11,%r10
2989
2990	mulxq	24(%rbp),%rax,%r11
2991	adcxq	%rax,%r10
2992	adoxq	%r12,%r11
2993
2994.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
2995	adcxq	%rax,%r11
2996	adoxq	%r13,%r12
2997
2998	mulxq	40(%rbp),%rax,%r13
2999	adcxq	%rax,%r12
3000	adoxq	%r14,%r13
3001
3002	mulxq	48(%rbp),%rax,%r14
3003	movq	%rbx,(%rdi,%rcx,8)
3004	movl	$0,%ebx
3005	adcxq	%rax,%r13
3006	adoxq	%r15,%r14
3007
3008.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3009	movq	8(%rsi,%rcx,8),%rdx
3010	adcxq	%rax,%r14
3011	adoxq	%rbx,%r15
3012	adcxq	%rbx,%r15
3013
3014.byte	0x67
3015	incq	%rcx
3016	jnz	.Lsqrx8x_loop
3017
3018	leaq	64(%rbp),%rbp
3019	movq	$-8,%rcx
3020	cmpq	8+8(%rsp),%rbp
3021	je	.Lsqrx8x_break
3022
3023	subq	16+8(%rsp),%rbx
3024.byte	0x66
3025	movq	-64(%rsi),%rdx
3026	adcxq	0(%rdi),%r8
3027	adcxq	8(%rdi),%r9
3028	adcq	16(%rdi),%r10
3029	adcq	24(%rdi),%r11
3030	adcq	32(%rdi),%r12
3031	adcq	40(%rdi),%r13
3032	adcq	48(%rdi),%r14
3033	adcq	56(%rdi),%r15
3034	leaq	64(%rdi),%rdi
3035.byte	0x67
3036	sbbq	%rax,%rax
3037	xorl	%ebx,%ebx
3038	movq	%rax,16+8(%rsp)
3039	jmp	.Lsqrx8x_loop
3040
3041.align	32
3042.Lsqrx8x_break:
3043	xorq	%rbp,%rbp
3044	subq	16+8(%rsp),%rbx
3045	adcxq	%rbp,%r8
3046	movq	24+8(%rsp),%rcx
3047	adcxq	%rbp,%r9
3048	movq	0(%rsi),%rdx
3049	adcq	$0,%r10
3050	movq	%r8,0(%rdi)
3051	adcq	$0,%r11
3052	adcq	$0,%r12
3053	adcq	$0,%r13
3054	adcq	$0,%r14
3055	adcq	$0,%r15
3056	cmpq	%rcx,%rdi
3057	je	.Lsqrx8x_outer_loop
3058
3059	movq	%r9,8(%rdi)
3060	movq	8(%rcx),%r9
3061	movq	%r10,16(%rdi)
3062	movq	16(%rcx),%r10
3063	movq	%r11,24(%rdi)
3064	movq	24(%rcx),%r11
3065	movq	%r12,32(%rdi)
3066	movq	32(%rcx),%r12
3067	movq	%r13,40(%rdi)
3068	movq	40(%rcx),%r13
3069	movq	%r14,48(%rdi)
3070	movq	48(%rcx),%r14
3071	movq	%r15,56(%rdi)
3072	movq	56(%rcx),%r15
3073	movq	%rcx,%rdi
3074	jmp	.Lsqrx8x_outer_loop
3075
3076.align	32
3077.Lsqrx8x_outer_break:
3078	movq	%r9,72(%rdi)
3079.byte	102,72,15,126,217
3080	movq	%r10,80(%rdi)
3081	movq	%r11,88(%rdi)
3082	movq	%r12,96(%rdi)
3083	movq	%r13,104(%rdi)
3084	movq	%r14,112(%rdi)
3085	leaq	48+8(%rsp),%rdi
3086	movq	(%rsi,%rcx,1),%rdx
3087
3088	movq	8(%rdi),%r11
3089	xorq	%r10,%r10
3090	movq	0+8(%rsp),%r9
3091	adoxq	%r11,%r11
3092	movq	16(%rdi),%r12
3093	movq	24(%rdi),%r13
3094
3095
3096.align	32
3097.Lsqrx4x_shift_n_add:
3098	mulxq	%rdx,%rax,%rbx
3099	adoxq	%r12,%r12
3100	adcxq	%r10,%rax
3101.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3102.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3103	adoxq	%r13,%r13
3104	adcxq	%r11,%rbx
3105	movq	40(%rdi),%r11
3106	movq	%rax,0(%rdi)
3107	movq	%rbx,8(%rdi)
3108
3109	mulxq	%rdx,%rax,%rbx
3110	adoxq	%r10,%r10
3111	adcxq	%r12,%rax
3112	movq	16(%rsi,%rcx,1),%rdx
3113	movq	48(%rdi),%r12
3114	adoxq	%r11,%r11
3115	adcxq	%r13,%rbx
3116	movq	56(%rdi),%r13
3117	movq	%rax,16(%rdi)
3118	movq	%rbx,24(%rdi)
3119
3120	mulxq	%rdx,%rax,%rbx
3121	adoxq	%r12,%r12
3122	adcxq	%r10,%rax
3123	movq	24(%rsi,%rcx,1),%rdx
3124	leaq	32(%rcx),%rcx
3125	movq	64(%rdi),%r10
3126	adoxq	%r13,%r13
3127	adcxq	%r11,%rbx
3128	movq	72(%rdi),%r11
3129	movq	%rax,32(%rdi)
3130	movq	%rbx,40(%rdi)
3131
3132	mulxq	%rdx,%rax,%rbx
3133	adoxq	%r10,%r10
3134	adcxq	%r12,%rax
3135	jrcxz	.Lsqrx4x_shift_n_add_break
3136.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3137	adoxq	%r11,%r11
3138	adcxq	%r13,%rbx
3139	movq	80(%rdi),%r12
3140	movq	88(%rdi),%r13
3141	movq	%rax,48(%rdi)
3142	movq	%rbx,56(%rdi)
3143	leaq	64(%rdi),%rdi
3144	nop
3145	jmp	.Lsqrx4x_shift_n_add
3146
3147.align	32
3148.Lsqrx4x_shift_n_add_break:
3149	adcxq	%r13,%rbx
3150	movq	%rax,48(%rdi)
3151	movq	%rbx,56(%rdi)
3152	leaq	64(%rdi),%rdi
3153.byte	102,72,15,126,213
3154__bn_sqrx8x_reduction:
3155	xorl	%eax,%eax
3156	movq	32+8(%rsp),%rbx
3157	movq	48+8(%rsp),%rdx
3158	leaq	-64(%rbp,%r9,1),%rcx
3159
3160	movq	%rcx,0+8(%rsp)
3161	movq	%rdi,8+8(%rsp)
3162
3163	leaq	48+8(%rsp),%rdi
3164	jmp	.Lsqrx8x_reduction_loop
3165
3166.align	32
3167.Lsqrx8x_reduction_loop:
3168	movq	8(%rdi),%r9
3169	movq	16(%rdi),%r10
3170	movq	24(%rdi),%r11
3171	movq	32(%rdi),%r12
3172	movq	%rdx,%r8
3173	imulq	%rbx,%rdx
3174	movq	40(%rdi),%r13
3175	movq	48(%rdi),%r14
3176	movq	56(%rdi),%r15
3177	movq	%rax,24+8(%rsp)
3178
3179	leaq	64(%rdi),%rdi
3180	xorq	%rsi,%rsi
3181	movq	$-8,%rcx
3182	jmp	.Lsqrx8x_reduce
3183
3184.align	32
3185.Lsqrx8x_reduce:
3186	movq	%r8,%rbx
3187	mulxq	0(%rbp),%rax,%r8
3188	adcxq	%rbx,%rax
3189	adoxq	%r9,%r8
3190
3191	mulxq	8(%rbp),%rbx,%r9
3192	adcxq	%rbx,%r8
3193	adoxq	%r10,%r9
3194
3195	mulxq	16(%rbp),%rbx,%r10
3196	adcxq	%rbx,%r9
3197	adoxq	%r11,%r10
3198
3199	mulxq	24(%rbp),%rbx,%r11
3200	adcxq	%rbx,%r10
3201	adoxq	%r12,%r11
3202
3203.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
3204	movq	%rdx,%rax
3205	movq	%r8,%rdx
3206	adcxq	%rbx,%r11
3207	adoxq	%r13,%r12
3208
3209	mulxq	32+8(%rsp),%rbx,%rdx
3210	movq	%rax,%rdx
3211	movq	%rax,64+48+8(%rsp,%rcx,8)
3212
3213	mulxq	40(%rbp),%rax,%r13
3214	adcxq	%rax,%r12
3215	adoxq	%r14,%r13
3216
3217	mulxq	48(%rbp),%rax,%r14
3218	adcxq	%rax,%r13
3219	adoxq	%r15,%r14
3220
3221	mulxq	56(%rbp),%rax,%r15
3222	movq	%rbx,%rdx
3223	adcxq	%rax,%r14
3224	adoxq	%rsi,%r15
3225	adcxq	%rsi,%r15
3226
3227.byte	0x67,0x67,0x67
3228	incq	%rcx
3229	jnz	.Lsqrx8x_reduce
3230
3231	movq	%rsi,%rax
3232	cmpq	0+8(%rsp),%rbp
3233	jae	.Lsqrx8x_no_tail
3234
3235	movq	48+8(%rsp),%rdx
3236	addq	0(%rdi),%r8
3237	leaq	64(%rbp),%rbp
3238	movq	$-8,%rcx
3239	adcxq	8(%rdi),%r9
3240	adcxq	16(%rdi),%r10
3241	adcq	24(%rdi),%r11
3242	adcq	32(%rdi),%r12
3243	adcq	40(%rdi),%r13
3244	adcq	48(%rdi),%r14
3245	adcq	56(%rdi),%r15
3246	leaq	64(%rdi),%rdi
3247	sbbq	%rax,%rax
3248
3249	xorq	%rsi,%rsi
3250	movq	%rax,16+8(%rsp)
3251	jmp	.Lsqrx8x_tail
3252
3253.align	32
3254.Lsqrx8x_tail:
3255	movq	%r8,%rbx
3256	mulxq	0(%rbp),%rax,%r8
3257	adcxq	%rax,%rbx
3258	adoxq	%r9,%r8
3259
3260	mulxq	8(%rbp),%rax,%r9
3261	adcxq	%rax,%r8
3262	adoxq	%r10,%r9
3263
3264	mulxq	16(%rbp),%rax,%r10
3265	adcxq	%rax,%r9
3266	adoxq	%r11,%r10
3267
3268	mulxq	24(%rbp),%rax,%r11
3269	adcxq	%rax,%r10
3270	adoxq	%r12,%r11
3271
3272.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3273	adcxq	%rax,%r11
3274	adoxq	%r13,%r12
3275
3276	mulxq	40(%rbp),%rax,%r13
3277	adcxq	%rax,%r12
3278	adoxq	%r14,%r13
3279
3280	mulxq	48(%rbp),%rax,%r14
3281	adcxq	%rax,%r13
3282	adoxq	%r15,%r14
3283
3284	mulxq	56(%rbp),%rax,%r15
3285	movq	72+48+8(%rsp,%rcx,8),%rdx
3286	adcxq	%rax,%r14
3287	adoxq	%rsi,%r15
3288	movq	%rbx,(%rdi,%rcx,8)
3289	movq	%r8,%rbx
3290	adcxq	%rsi,%r15
3291
3292	incq	%rcx
3293	jnz	.Lsqrx8x_tail
3294
3295	cmpq	0+8(%rsp),%rbp
3296	jae	.Lsqrx8x_tail_done
3297
3298	subq	16+8(%rsp),%rsi
3299	movq	48+8(%rsp),%rdx
3300	leaq	64(%rbp),%rbp
3301	adcq	0(%rdi),%r8
3302	adcq	8(%rdi),%r9
3303	adcq	16(%rdi),%r10
3304	adcq	24(%rdi),%r11
3305	adcq	32(%rdi),%r12
3306	adcq	40(%rdi),%r13
3307	adcq	48(%rdi),%r14
3308	adcq	56(%rdi),%r15
3309	leaq	64(%rdi),%rdi
3310	sbbq	%rax,%rax
3311	subq	$8,%rcx
3312
3313	xorq	%rsi,%rsi
3314	movq	%rax,16+8(%rsp)
3315	jmp	.Lsqrx8x_tail
3316
3317.align	32
3318.Lsqrx8x_tail_done:
3319	xorq	%rax,%rax
3320	addq	24+8(%rsp),%r8
3321	adcq	$0,%r9
3322	adcq	$0,%r10
3323	adcq	$0,%r11
3324	adcq	$0,%r12
3325	adcq	$0,%r13
3326	adcq	$0,%r14
3327	adcq	$0,%r15
3328	adcq	$0,%rax
3329
3330	subq	16+8(%rsp),%rsi
3331.Lsqrx8x_no_tail:
3332	adcq	0(%rdi),%r8
3333.byte	102,72,15,126,217
3334	adcq	8(%rdi),%r9
3335	movq	56(%rbp),%rsi
3336.byte	102,72,15,126,213
3337	adcq	16(%rdi),%r10
3338	adcq	24(%rdi),%r11
3339	adcq	32(%rdi),%r12
3340	adcq	40(%rdi),%r13
3341	adcq	48(%rdi),%r14
3342	adcq	56(%rdi),%r15
3343	adcq	$0,%rax
3344
3345	movq	32+8(%rsp),%rbx
3346	movq	64(%rdi,%rcx,1),%rdx
3347
3348	movq	%r8,0(%rdi)
3349	leaq	64(%rdi),%r8
3350	movq	%r9,8(%rdi)
3351	movq	%r10,16(%rdi)
3352	movq	%r11,24(%rdi)
3353	movq	%r12,32(%rdi)
3354	movq	%r13,40(%rdi)
3355	movq	%r14,48(%rdi)
3356	movq	%r15,56(%rdi)
3357
3358	leaq	64(%rdi,%rcx,1),%rdi
3359	cmpq	8+8(%rsp),%r8
3360	jb	.Lsqrx8x_reduction_loop
3361	ret
3362.cfi_endproc
3363.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
// __bn_postx4x_internal: conditional final subtraction for the MULX/ADX
// (sqrx8x) code path.  On entry: %rbp -> modulus n, %rdi -> temporary
// result, %rax = top-most carry word, %rcx = -num (negated word count),
// and %xmm1 holds the output pointer saved earlier.  Computes
// out = tmp - (mask & n), where mask is all-ones iff the carry demands a
// reduction, using a borrow chain carried across iterations in %r8.
// NOTE(review): register contract inferred from the visible code and the
// sqrx8x caller conventions in this file — confirm against the generator.
3364.align	32
3365.type	__bn_postx4x_internal,@function
3366__bn_postx4x_internal:
3367.cfi_startproc
	// load n[0]; it is pre-decremented below so that ANDN(~x & mask)
	// yields -n[0] when the mask is all-ones
3368	movq	0(%rbp),%r12
	// stash -num twice; %r9 is restored (re-negated) before returning
3369	movq	%rcx,%r10
3370	movq	%rcx,%r9
	// turn the top-most carry into a 0 / all-ones selection mask
3371	negq	%rax
	// %rcx = -num >> 5: iteration count for the 4-qword loop below
	// (sar also leaves CF clear)
3372	sarq	$3+2,%rcx
3373
	// encoded: movq %xmm1,%rdx ; movq %xmm1,%rsi — restore the output
	// pointer (kept live in %xmm1 across the squaring) into both regs
3374.byte	102,72,15,126,202
3375.byte	102,72,15,126,206
	// n[0]-1, so ~(n[0]-1) == -n[0] after the masked ANDN
3376	decq	%r12
3377	movq	8(%rbp),%r13
	// clear the running borrow
3378	xorq	%r8,%r8
3379	movq	16(%rbp),%r14
3380	movq	24(%rbp),%r15
3381	jmp	.Lsqrx4x_sub_entry

3383.align	16
3384.Lsqrx4x_sub:
	// fetch the next four modulus words
3385	movq	0(%rbp),%r12
3386	movq	8(%rbp),%r13
3387	movq	16(%rbp),%r14
3388	movq	24(%rbp),%r15
3389.Lsqrx4x_sub_entry:
	// rN = ~rN & mask: selects the complemented modulus word when
	// reducing, zero otherwise — branch-free, constant-time selection
3390	andnq	%rax,%r12,%r12
3391	leaq	32(%rbp),%rbp
3392	andnq	%rax,%r13,%r13
3393	andnq	%rax,%r14,%r14
3394	andnq	%rax,%r15,%r15
3395
	// reload CF from the saved borrow (%r8 is 0 or -1), then add the
	// temporary words with carry; with rN = ~n[i] this is tmp - n
3396	negq	%r8
3397	adcq	0(%rdi),%r12
3398	adcq	8(%rdi),%r13
3399	adcq	16(%rdi),%r14
3400	adcq	24(%rdi),%r15
	// store four result words and advance both pointers
3401	movq	%r12,0(%rdx)
3402	leaq	32(%rdi),%rdi
3403	movq	%r13,8(%rdx)
	// capture the borrow (0 or -1) for the next iteration
3404	sbbq	%r8,%r8
3405	movq	%r14,16(%rdx)
3406	movq	%r15,24(%rdx)
3407	leaq	32(%rdx),%rdx

3409	incq	%rcx
3410	jnz	.Lsqrx4x_sub

	// restore %r9 = num for the caller
3412	negq	%r9

3414	ret
3415.cfi_endproc
3416.size	__bn_postx4x_internal,.-__bn_postx4x_internal
// void bn_scatter5(const BN_ULONG *inp, size_t num, void *tbl, size_t idx)
//   %rdi = inp, %esi = num (word count), %rdx = tbl, %rcx = idx (0..31)
// Stores num qwords of inp into "column" idx of an interleaved table:
// word j of entry idx lands at tbl + j*256 + idx*8 (stride 256 bytes =
// 32 entries x 8 bytes, matching the 5-bit window used by bn_gather5).
3417.globl	bn_scatter5
3418.hidden bn_scatter5
3419.type	bn_scatter5,@function
3420.align	16
3421bn_scatter5:
3422.cfi_startproc
3423_CET_ENDBR
	// nothing to do for num == 0
3424	cmpl	$0,%esi
3425	jz	.Lscatter_epilogue

	// %rdx = &tbl[idx] — start of this entry's column
3435	leaq	(%rdx,%rcx,8),%rdx
3436.Lscatter:
3437	movq	(%rdi),%rax
3438	leaq	8(%rdi),%rdi
	// store word, then step to the same column of the next word row
3439	movq	%rax,(%rdx)
3440	leaq	256(%rdx),%rdx
3441	subl	$1,%esi
3442	jnz	.Lscatter
3443.Lscatter_epilogue:
3444	ret
3445.cfi_endproc
3446.size	bn_scatter5,.-bn_scatter5
3447
// void bn_gather5(BN_ULONG *out, size_t num, void *tbl, size_t power)
//   %rdi = out, %esi = num (word count), %rdx = tbl, %ecx = power (0..31)
// Cache-timing-safe gather: reads ALL 32 table entries for every output
// word and masks out all but entry `power`, so the memory access pattern
// is independent of the secret index.  Table layout matches bn_scatter5
// (word j of entry i at tbl + j*256 + i*8).
3448.globl	bn_gather5
3449.hidden bn_gather5
3450.type	bn_gather5,@function
3451.align	32
3452bn_gather5:
3453.cfi_startproc
3454.LSEH_begin_bn_gather5:
3455_CET_ENDBR

	// encoded: lea (%rsp),%r10 — save original stack pointer
3457.byte	0x4c,0x8d,0x14,0x24
3458.cfi_def_cfa_register	%r10
	// encoded: sub $0x108,%rsp — room for 16 16-byte masks (+ padding)
3459.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
3460	leaq	.Linc(%rip),%rax
	// movdqa below requires a 16-byte-aligned scratch area
3461	andq	$-16,%rsp

	// broadcast the selection index into all four dwords of %xmm5
3462	movd	%ecx,%xmm5
	// %xmm0 = {0,0,1,1}, %xmm1 = {2,2,2,2} (increments from .Linc)
3463	movdqa	0(%rax),%xmm0
3464	movdqa	16(%rax),%xmm1
	// bias both pointers by 128 so all mask/table offsets fit in
	// one-byte displacements (-128..+112)
3465	leaq	128(%rdx),%r11
3466	leaq	128(%rsp),%rax

	// Build 16 comparison masks on the stack: mask k (16 bytes) is
	// all-ones in the dword pair matching index 2k or 2k+1, else zero.
	// A running counter pair is incremented by 2 and pcmpeqd'ed against
	// the broadcast index, pipelined three-deep through xmm0-xmm3.
3468	pshufd	$0,%xmm5,%xmm5
3469	movdqa	%xmm1,%xmm4
3470	movdqa	%xmm1,%xmm2
3471	paddd	%xmm0,%xmm1
3472	pcmpeqd	%xmm5,%xmm0
3473	movdqa	%xmm4,%xmm3

3475	paddd	%xmm1,%xmm2
3476	pcmpeqd	%xmm5,%xmm1
3477	movdqa	%xmm0,-128(%rax)
3478	movdqa	%xmm4,%xmm0

3480	paddd	%xmm2,%xmm3
3481	pcmpeqd	%xmm5,%xmm2
3482	movdqa	%xmm1,-112(%rax)
3483	movdqa	%xmm4,%xmm1

3485	paddd	%xmm3,%xmm0
3486	pcmpeqd	%xmm5,%xmm3
3487	movdqa	%xmm2,-96(%rax)
3488	movdqa	%xmm4,%xmm2
3489	paddd	%xmm0,%xmm1
3490	pcmpeqd	%xmm5,%xmm0
3491	movdqa	%xmm3,-80(%rax)
3492	movdqa	%xmm4,%xmm3

3494	paddd	%xmm1,%xmm2
3495	pcmpeqd	%xmm5,%xmm1
3496	movdqa	%xmm0,-64(%rax)
3497	movdqa	%xmm4,%xmm0

3499	paddd	%xmm2,%xmm3
3500	pcmpeqd	%xmm5,%xmm2
3501	movdqa	%xmm1,-48(%rax)
3502	movdqa	%xmm4,%xmm1

3504	paddd	%xmm3,%xmm0
3505	pcmpeqd	%xmm5,%xmm3
3506	movdqa	%xmm2,-32(%rax)
3507	movdqa	%xmm4,%xmm2
3508	paddd	%xmm0,%xmm1
3509	pcmpeqd	%xmm5,%xmm0
3510	movdqa	%xmm3,-16(%rax)
3511	movdqa	%xmm4,%xmm3

3513	paddd	%xmm1,%xmm2
3514	pcmpeqd	%xmm5,%xmm1
3515	movdqa	%xmm0,0(%rax)
3516	movdqa	%xmm4,%xmm0

3518	paddd	%xmm2,%xmm3
3519	pcmpeqd	%xmm5,%xmm2
3520	movdqa	%xmm1,16(%rax)
3521	movdqa	%xmm4,%xmm1

3523	paddd	%xmm3,%xmm0
3524	pcmpeqd	%xmm5,%xmm3
3525	movdqa	%xmm2,32(%rax)
3526	movdqa	%xmm4,%xmm2
3527	paddd	%xmm0,%xmm1
3528	pcmpeqd	%xmm5,%xmm0
3529	movdqa	%xmm3,48(%rax)
3530	movdqa	%xmm4,%xmm3

3532	paddd	%xmm1,%xmm2
3533	pcmpeqd	%xmm5,%xmm1
3534	movdqa	%xmm0,64(%rax)
3535	movdqa	%xmm4,%xmm0

3537	paddd	%xmm2,%xmm3
3538	pcmpeqd	%xmm5,%xmm2
3539	movdqa	%xmm1,80(%rax)
3540	movdqa	%xmm4,%xmm1

3542	paddd	%xmm3,%xmm0
3543	pcmpeqd	%xmm5,%xmm3
3544	movdqa	%xmm2,96(%rax)
3545	movdqa	%xmm4,%xmm2
3546	movdqa	%xmm3,112(%rax)
3547	jmp	.Lgather

3550.align	32
3551.Lgather:
	// One output word per iteration: AND each of the 16 table chunks
	// (256 bytes = one word row across all 32 entries) with its
	// precomputed mask and OR everything into the xmm4/xmm5 pair —
	// every chunk is touched regardless of the index.
3552	pxor	%xmm4,%xmm4
3553	pxor	%xmm5,%xmm5
3554	movdqa	-128(%r11),%xmm0
3555	movdqa	-112(%r11),%xmm1
3556	movdqa	-96(%r11),%xmm2
3557	pand	-128(%rax),%xmm0
3558	movdqa	-80(%r11),%xmm3
3559	pand	-112(%rax),%xmm1
3560	por	%xmm0,%xmm4
3561	pand	-96(%rax),%xmm2
3562	por	%xmm1,%xmm5
3563	pand	-80(%rax),%xmm3
3564	por	%xmm2,%xmm4
3565	por	%xmm3,%xmm5
3566	movdqa	-64(%r11),%xmm0
3567	movdqa	-48(%r11),%xmm1
3568	movdqa	-32(%r11),%xmm2
3569	pand	-64(%rax),%xmm0
3570	movdqa	-16(%r11),%xmm3
3571	pand	-48(%rax),%xmm1
3572	por	%xmm0,%xmm4
3573	pand	-32(%rax),%xmm2
3574	por	%xmm1,%xmm5
3575	pand	-16(%rax),%xmm3
3576	por	%xmm2,%xmm4
3577	por	%xmm3,%xmm5
3578	movdqa	0(%r11),%xmm0
3579	movdqa	16(%r11),%xmm1
3580	movdqa	32(%r11),%xmm2
3581	pand	0(%rax),%xmm0
3582	movdqa	48(%r11),%xmm3
3583	pand	16(%rax),%xmm1
3584	por	%xmm0,%xmm4
3585	pand	32(%rax),%xmm2
3586	por	%xmm1,%xmm5
3587	pand	48(%rax),%xmm3
3588	por	%xmm2,%xmm4
3589	por	%xmm3,%xmm5
3590	movdqa	64(%r11),%xmm0
3591	movdqa	80(%r11),%xmm1
3592	movdqa	96(%r11),%xmm2
3593	pand	64(%rax),%xmm0
3594	movdqa	112(%r11),%xmm3
3595	pand	80(%rax),%xmm1
3596	por	%xmm0,%xmm4
3597	pand	96(%rax),%xmm2
3598	por	%xmm1,%xmm5
3599	pand	112(%rax),%xmm3
3600	por	%xmm2,%xmm4
3601	por	%xmm3,%xmm5
	// fold the two accumulators; advance to the next word row
3602	por	%xmm5,%xmm4
3603	leaq	256(%r11),%r11

	// the selected qword may sit in either half of %xmm4:
	// swap halves and OR so the low 64 bits hold it either way
3605	pshufd	$0x4e,%xmm4,%xmm0
3606	por	%xmm4,%xmm0
3607	movq	%xmm0,(%rdi)
3608	leaq	8(%rdi),%rdi
3609	subl	$1,%esi
3610	jnz	.Lgather

	// restore the original stack pointer saved in %r10
3612	leaq	(%r10),%rsp
3613.cfi_def_cfa_register	%rsp
3614	ret
3615.LSEH_end_bn_gather5:
3616.cfi_endproc
3617.size	bn_gather5,.-bn_gather5
3618.section	.rodata
3619.align	64
// Increment constants used by bn_mul_mont_gather5/bn_gather5 to build the
// per-index comparison masks: first vector seeds the counter pair {0,0,1,1},
// second is the per-step increment {2,2,2,2}.
3620.Linc:
3621.long	0,0, 1,1
3622.long	2,2, 2,2
// ASCII banner: "Montgomery Multiplication with scatter/gather for x86_64,
// CRYPTOGAMS by <appro@openssl.org>", NUL-terminated.
3623.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3624.text
3625#endif
3626