1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <ring-core/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
7.text
8
9
10
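// bn_mul_mont_gather5: Montgomery multiplication where the second operand is
// fetched from a table of precomputed powers.  Under the SysV ABI the
// arguments appear to be rp (%rdi), ap (%rsi), table (%rdx), np (%rcx),
// &n0 (%r8), num (%r9d) and the table index `power` at 8(%rsp), i.e. roughly
// the upstream C prototype
//   void bn_mul_mont_gather5(rp, ap, table, np, n0, num, power);
// The table entry is gathered with SSE2 masks so that the memory access
// pattern is independent of `power` (a cache-timing defense).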
11.globl	_bn_mul_mont_gather5
12.private_extern _bn_mul_mont_gather5
13
14.p2align	6
15_bn_mul_mont_gather5:
16
17_CET_ENDBR
18	movl	%r9d,%r9d
19	movq	%rsp,%rax
20
21	testl	$7,%r9d
22	jnz	L$mul_enter
23	leaq	_OPENSSL_ia32cap_P(%rip),%r11
24	movl	8(%r11),%r11d
25	jmp	L$mul4x_enter
26
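// L$mul_enter: generic one-word-at-a-time path, taken when num is not a
// multiple of 8 (the testl $7,%r9d check above); multiples of 8 go through
// L$mul4x_enter instead.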
27.p2align	4
28L$mul_enter:
29	movd	8(%rsp),%xmm5
30	pushq	%rbx
31
32	pushq	%rbp
33
34	pushq	%r12
35
36	pushq	%r13
37
38	pushq	%r14
39
40	pushq	%r15
41
42
43	negq	%r9
44	movq	%rsp,%r11
45	leaq	-280(%rsp,%r9,8),%r10
46	negq	%r9
47	andq	$-1024,%r10
48
49
50
51
52
53
54
55
56
57	subq	%r10,%r11
58	andq	$-4096,%r11
59	leaq	(%r10,%r11,1),%rsp
60	movq	(%rsp),%r11
61	cmpq	%r10,%rsp
62	ja	L$mul_page_walk
63	jmp	L$mul_page_walk_done
64
65L$mul_page_walk:
66	leaq	-4096(%rsp),%rsp
67	movq	(%rsp),%r11
68	cmpq	%r10,%rsp
69	ja	L$mul_page_walk
70L$mul_page_walk_done:
71
72	leaq	L$inc(%rip),%r10
73	movq	%rax,8(%rsp,%r9,8)
74
75L$mul_body:
76
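// Constant-time gather of the first word of table[power]: the index broadcast
// into %xmm5 is compared (pcmpeqd) against a running counter built from
// L$inc, producing all-ones/all-zero masks on the stack; every candidate
// entry is then read, masked with pand and merged with por, so only the
// selected entry survives while the access pattern stays index-independent.
// The .byte 102,72,15,126,195 sequence encodes movq %xmm0,%rbx, moving the
// gathered 64-bit word into %rbx.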
77	leaq	128(%rdx),%r12
78	movdqa	0(%r10),%xmm0
79	movdqa	16(%r10),%xmm1
80	leaq	24-112(%rsp,%r9,8),%r10
81	andq	$-16,%r10
82
83	pshufd	$0,%xmm5,%xmm5
84	movdqa	%xmm1,%xmm4
85	movdqa	%xmm1,%xmm2
86	paddd	%xmm0,%xmm1
87	pcmpeqd	%xmm5,%xmm0
88.byte	0x67
89	movdqa	%xmm4,%xmm3
90	paddd	%xmm1,%xmm2
91	pcmpeqd	%xmm5,%xmm1
92	movdqa	%xmm0,112(%r10)
93	movdqa	%xmm4,%xmm0
94
95	paddd	%xmm2,%xmm3
96	pcmpeqd	%xmm5,%xmm2
97	movdqa	%xmm1,128(%r10)
98	movdqa	%xmm4,%xmm1
99
100	paddd	%xmm3,%xmm0
101	pcmpeqd	%xmm5,%xmm3
102	movdqa	%xmm2,144(%r10)
103	movdqa	%xmm4,%xmm2
104
105	paddd	%xmm0,%xmm1
106	pcmpeqd	%xmm5,%xmm0
107	movdqa	%xmm3,160(%r10)
108	movdqa	%xmm4,%xmm3
109	paddd	%xmm1,%xmm2
110	pcmpeqd	%xmm5,%xmm1
111	movdqa	%xmm0,176(%r10)
112	movdqa	%xmm4,%xmm0
113
114	paddd	%xmm2,%xmm3
115	pcmpeqd	%xmm5,%xmm2
116	movdqa	%xmm1,192(%r10)
117	movdqa	%xmm4,%xmm1
118
119	paddd	%xmm3,%xmm0
120	pcmpeqd	%xmm5,%xmm3
121	movdqa	%xmm2,208(%r10)
122	movdqa	%xmm4,%xmm2
123
124	paddd	%xmm0,%xmm1
125	pcmpeqd	%xmm5,%xmm0
126	movdqa	%xmm3,224(%r10)
127	movdqa	%xmm4,%xmm3
128	paddd	%xmm1,%xmm2
129	pcmpeqd	%xmm5,%xmm1
130	movdqa	%xmm0,240(%r10)
131	movdqa	%xmm4,%xmm0
132
133	paddd	%xmm2,%xmm3
134	pcmpeqd	%xmm5,%xmm2
135	movdqa	%xmm1,256(%r10)
136	movdqa	%xmm4,%xmm1
137
138	paddd	%xmm3,%xmm0
139	pcmpeqd	%xmm5,%xmm3
140	movdqa	%xmm2,272(%r10)
141	movdqa	%xmm4,%xmm2
142
143	paddd	%xmm0,%xmm1
144	pcmpeqd	%xmm5,%xmm0
145	movdqa	%xmm3,288(%r10)
146	movdqa	%xmm4,%xmm3
147	paddd	%xmm1,%xmm2
148	pcmpeqd	%xmm5,%xmm1
149	movdqa	%xmm0,304(%r10)
150
151	paddd	%xmm2,%xmm3
152.byte	0x67
153	pcmpeqd	%xmm5,%xmm2
154	movdqa	%xmm1,320(%r10)
155
156	pcmpeqd	%xmm5,%xmm3
157	movdqa	%xmm2,336(%r10)
158	pand	64(%r12),%xmm0
159
160	pand	80(%r12),%xmm1
161	pand	96(%r12),%xmm2
162	movdqa	%xmm3,352(%r10)
163	pand	112(%r12),%xmm3
164	por	%xmm2,%xmm0
165	por	%xmm3,%xmm1
166	movdqa	-128(%r12),%xmm4
167	movdqa	-112(%r12),%xmm5
168	movdqa	-96(%r12),%xmm2
169	pand	112(%r10),%xmm4
170	movdqa	-80(%r12),%xmm3
171	pand	128(%r10),%xmm5
172	por	%xmm4,%xmm0
173	pand	144(%r10),%xmm2
174	por	%xmm5,%xmm1
175	pand	160(%r10),%xmm3
176	por	%xmm2,%xmm0
177	por	%xmm3,%xmm1
178	movdqa	-64(%r12),%xmm4
179	movdqa	-48(%r12),%xmm5
180	movdqa	-32(%r12),%xmm2
181	pand	176(%r10),%xmm4
182	movdqa	-16(%r12),%xmm3
183	pand	192(%r10),%xmm5
184	por	%xmm4,%xmm0
185	pand	208(%r10),%xmm2
186	por	%xmm5,%xmm1
187	pand	224(%r10),%xmm3
188	por	%xmm2,%xmm0
189	por	%xmm3,%xmm1
190	movdqa	0(%r12),%xmm4
191	movdqa	16(%r12),%xmm5
192	movdqa	32(%r12),%xmm2
193	pand	240(%r10),%xmm4
194	movdqa	48(%r12),%xmm3
195	pand	256(%r10),%xmm5
196	por	%xmm4,%xmm0
197	pand	272(%r10),%xmm2
198	por	%xmm5,%xmm1
199	pand	288(%r10),%xmm3
200	por	%xmm2,%xmm0
201	por	%xmm3,%xmm1
202	por	%xmm1,%xmm0
203
204	pshufd	$0x4e,%xmm0,%xmm1
205	por	%xmm1,%xmm0
206	leaq	256(%r12),%r12
207.byte	102,72,15,126,195
208
209	movq	(%r8),%r8
210	movq	(%rsi),%rax
211
212	xorq	%r14,%r14
213	xorq	%r15,%r15
214
215	movq	%r8,%rbp
216	mulq	%rbx
217	movq	%rax,%r10
218	movq	(%rcx),%rax
219
220	imulq	%r10,%rbp
221	movq	%rdx,%r11
222
223	mulq	%rbp
224	addq	%rax,%r10
225	movq	8(%rsi),%rax
226	adcq	$0,%rdx
227	movq	%rdx,%r13
228
229	leaq	1(%r15),%r15
230	jmp	L$1st_enter
231
232.p2align	4
233L$1st:
234	addq	%rax,%r13
235	movq	(%rsi,%r15,8),%rax
236	adcq	$0,%rdx
237	addq	%r11,%r13
238	movq	%r10,%r11
239	adcq	$0,%rdx
240	movq	%r13,-16(%rsp,%r15,8)
241	movq	%rdx,%r13
242
243L$1st_enter:
244	mulq	%rbx
245	addq	%rax,%r11
246	movq	(%rcx,%r15,8),%rax
247	adcq	$0,%rdx
248	leaq	1(%r15),%r15
249	movq	%rdx,%r10
250
251	mulq	%rbp
252	cmpq	%r9,%r15
253	jne	L$1st
254
255
256	addq	%rax,%r13
257	adcq	$0,%rdx
258	addq	%r11,%r13
259	adcq	$0,%rdx
260	movq	%r13,-16(%rsp,%r9,8)
261	movq	%rdx,%r13
262	movq	%r10,%r11
263
264	xorq	%rdx,%rdx
265	addq	%r11,%r13
266	adcq	$0,%rdx
267	movq	%r13,-8(%rsp,%r9,8)
268	movq	%rdx,(%rsp,%r9,8)
269
270	leaq	1(%r14),%r14
271	jmp	L$outer
272.p2align	4
273L$outer:
274	leaq	24+128(%rsp,%r9,8),%rdx
275	andq	$-16,%rdx
276	pxor	%xmm4,%xmm4
277	pxor	%xmm5,%xmm5
278	movdqa	-128(%r12),%xmm0
279	movdqa	-112(%r12),%xmm1
280	movdqa	-96(%r12),%xmm2
281	movdqa	-80(%r12),%xmm3
282	pand	-128(%rdx),%xmm0
283	pand	-112(%rdx),%xmm1
284	por	%xmm0,%xmm4
285	pand	-96(%rdx),%xmm2
286	por	%xmm1,%xmm5
287	pand	-80(%rdx),%xmm3
288	por	%xmm2,%xmm4
289	por	%xmm3,%xmm5
290	movdqa	-64(%r12),%xmm0
291	movdqa	-48(%r12),%xmm1
292	movdqa	-32(%r12),%xmm2
293	movdqa	-16(%r12),%xmm3
294	pand	-64(%rdx),%xmm0
295	pand	-48(%rdx),%xmm1
296	por	%xmm0,%xmm4
297	pand	-32(%rdx),%xmm2
298	por	%xmm1,%xmm5
299	pand	-16(%rdx),%xmm3
300	por	%xmm2,%xmm4
301	por	%xmm3,%xmm5
302	movdqa	0(%r12),%xmm0
303	movdqa	16(%r12),%xmm1
304	movdqa	32(%r12),%xmm2
305	movdqa	48(%r12),%xmm3
306	pand	0(%rdx),%xmm0
307	pand	16(%rdx),%xmm1
308	por	%xmm0,%xmm4
309	pand	32(%rdx),%xmm2
310	por	%xmm1,%xmm5
311	pand	48(%rdx),%xmm3
312	por	%xmm2,%xmm4
313	por	%xmm3,%xmm5
314	movdqa	64(%r12),%xmm0
315	movdqa	80(%r12),%xmm1
316	movdqa	96(%r12),%xmm2
317	movdqa	112(%r12),%xmm3
318	pand	64(%rdx),%xmm0
319	pand	80(%rdx),%xmm1
320	por	%xmm0,%xmm4
321	pand	96(%rdx),%xmm2
322	por	%xmm1,%xmm5
323	pand	112(%rdx),%xmm3
324	por	%xmm2,%xmm4
325	por	%xmm3,%xmm5
326	por	%xmm5,%xmm4
327
328	pshufd	$0x4e,%xmm4,%xmm0
329	por	%xmm4,%xmm0
330	leaq	256(%r12),%r12
331
332	movq	(%rsi),%rax
333.byte	102,72,15,126,195
334
335	xorq	%r15,%r15
336	movq	%r8,%rbp
337	movq	(%rsp),%r10
338
339	mulq	%rbx
340	addq	%rax,%r10
341	movq	(%rcx),%rax
342	adcq	$0,%rdx
343
344	imulq	%r10,%rbp
345	movq	%rdx,%r11
346
347	mulq	%rbp
348	addq	%rax,%r10
349	movq	8(%rsi),%rax
350	adcq	$0,%rdx
351	movq	8(%rsp),%r10
352	movq	%rdx,%r13
353
354	leaq	1(%r15),%r15
355	jmp	L$inner_enter
356
357.p2align	4
358L$inner:
359	addq	%rax,%r13
360	movq	(%rsi,%r15,8),%rax
361	adcq	$0,%rdx
362	addq	%r10,%r13
363	movq	(%rsp,%r15,8),%r10
364	adcq	$0,%rdx
365	movq	%r13,-16(%rsp,%r15,8)
366	movq	%rdx,%r13
367
368L$inner_enter:
369	mulq	%rbx
370	addq	%rax,%r11
371	movq	(%rcx,%r15,8),%rax
372	adcq	$0,%rdx
373	addq	%r11,%r10
374	movq	%rdx,%r11
375	adcq	$0,%r11
376	leaq	1(%r15),%r15
377
378	mulq	%rbp
379	cmpq	%r9,%r15
380	jne	L$inner
381
382	addq	%rax,%r13
383	adcq	$0,%rdx
384	addq	%r10,%r13
385	movq	(%rsp,%r9,8),%r10
386	adcq	$0,%rdx
387	movq	%r13,-16(%rsp,%r9,8)
388	movq	%rdx,%r13
389
390	xorq	%rdx,%rdx
391	addq	%r11,%r13
392	adcq	$0,%rdx
393	addq	%r10,%r13
394	adcq	$0,%rdx
395	movq	%r13,-8(%rsp,%r9,8)
396	movq	%rdx,(%rsp,%r9,8)
397
398	leaq	1(%r14),%r14
399	cmpq	%r9,%r14
400	jb	L$outer
401
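// Final step: subtract the modulus once (L$sub), then use the borrow as a
// mask in L$copy to select, in constant time, between the reduced and
// unreduced values, overwriting the temporary on the stack as it is copied
// out to rp.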
402	xorq	%r14,%r14
403	movq	(%rsp),%rax
404	leaq	(%rsp),%rsi
405	movq	%r9,%r15
406	jmp	L$sub
407.p2align	4
408L$sub:	sbbq	(%rcx,%r14,8),%rax
409	movq	%rax,(%rdi,%r14,8)
410	movq	8(%rsi,%r14,8),%rax
411	leaq	1(%r14),%r14
412	decq	%r15
413	jnz	L$sub
414
415	sbbq	$0,%rax
416	movq	$-1,%rbx
417	xorq	%rax,%rbx
418	xorq	%r14,%r14
419	movq	%r9,%r15
420
421L$copy:
422	movq	(%rdi,%r14,8),%rcx
423	movq	(%rsp,%r14,8),%rdx
424	andq	%rbx,%rcx
425	andq	%rax,%rdx
426	movq	%r14,(%rsp,%r14,8)
427	orq	%rcx,%rdx
428	movq	%rdx,(%rdi,%r14,8)
429	leaq	1(%r14),%r14
430	subq	$1,%r15
431	jnz	L$copy
432
433	movq	8(%rsp,%r9,8),%rsi
434
435	movq	$1,%rax
436
437	movq	-48(%rsi),%r15
438
439	movq	-40(%rsi),%r14
440
441	movq	-32(%rsi),%r13
442
443	movq	-24(%rsi),%r12
444
445	movq	-16(%rsi),%rbp
446
447	movq	-8(%rsi),%rbx
448
449	leaq	(%rsi),%rsp
450
451L$mul_epilogue:
452	ret
453
454
455
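// bn_mul4x_mont_gather5: 4-way unrolled variant, used when num is a multiple
// of 8.  L$mul4x_enter additionally checks the capability word against
// 0x80108 (which appears to cover the BMI1/BMI2/ADX bits) and, if all are
// present, branches to the MULX-based L$mulx4x_enter path instead.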
456.p2align	5
457bn_mul4x_mont_gather5:
458
459.byte	0x67
460	movq	%rsp,%rax
461
462L$mul4x_enter:
463	andl	$0x80108,%r11d
464	cmpl	$0x80108,%r11d
465	je	L$mulx4x_enter
466	pushq	%rbx
467
468	pushq	%rbp
469
470	pushq	%r12
471
472	pushq	%r13
473
474	pushq	%r14
475
476	pushq	%r15
477
478L$mul4x_prologue:
479
480.byte	0x67
481	shll	$3,%r9d
482	leaq	(%r9,%r9,2),%r10
483	negq	%r9
484
485
486
487
488
489
490
491
492
493
494	leaq	-320(%rsp,%r9,2),%r11
495	movq	%rsp,%rbp
496	subq	%rdi,%r11
497	andq	$4095,%r11
498	cmpq	%r11,%r10
499	jb	L$mul4xsp_alt
500	subq	%r11,%rbp
501	leaq	-320(%rbp,%r9,2),%rbp
502	jmp	L$mul4xsp_done
503
504.p2align	5
505L$mul4xsp_alt:
506	leaq	4096-320(,%r9,2),%r10
507	leaq	-320(%rbp,%r9,2),%rbp
508	subq	%r10,%r11
509	movq	$0,%r10
510	cmovcq	%r10,%r11
511	subq	%r11,%rbp
512L$mul4xsp_done:
513	andq	$-64,%rbp
514	movq	%rsp,%r11
515	subq	%rbp,%r11
516	andq	$-4096,%r11
517	leaq	(%r11,%rbp,1),%rsp
518	movq	(%rsp),%r10
519	cmpq	%rbp,%rsp
520	ja	L$mul4x_page_walk
521	jmp	L$mul4x_page_walk_done
522
523L$mul4x_page_walk:
524	leaq	-4096(%rsp),%rsp
525	movq	(%rsp),%r10
526	cmpq	%rbp,%rsp
527	ja	L$mul4x_page_walk
528L$mul4x_page_walk_done:
529
530	negq	%r9
531
532	movq	%rax,40(%rsp)
533
534L$mul4x_body:
535
536	call	mul4x_internal
537
538	movq	40(%rsp),%rsi
539
540	movq	$1,%rax
541
542	movq	-48(%rsi),%r15
543
544	movq	-40(%rsi),%r14
545
546	movq	-32(%rsi),%r13
547
548	movq	-24(%rsi),%r12
549
550	movq	-16(%rsi),%rbp
551
552	movq	-8(%rsi),%rbx
553
554	leaq	(%rsi),%rsp
555
556L$mul4x_epilogue:
557	ret
558
559
560
561
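// mul4x_internal: the shared 4-way unrolled Montgomery multiply-and-reduce
// core, called from L$mul4x_body above and from bn_power5 below.  It repeats
// the same constant-time gather of table[power] as the scalar path.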
562.p2align	5
563mul4x_internal:
564
565	shlq	$5,%r9
566	movd	8(%rax),%xmm5
567	leaq	L$inc(%rip),%rax
568	leaq	128(%rdx,%r9,1),%r13
569	shrq	$5,%r9
570	movdqa	0(%rax),%xmm0
571	movdqa	16(%rax),%xmm1
572	leaq	88-112(%rsp,%r9,1),%r10
573	leaq	128(%rdx),%r12
574
575	pshufd	$0,%xmm5,%xmm5
576	movdqa	%xmm1,%xmm4
577.byte	0x67,0x67
578	movdqa	%xmm1,%xmm2
579	paddd	%xmm0,%xmm1
580	pcmpeqd	%xmm5,%xmm0
581.byte	0x67
582	movdqa	%xmm4,%xmm3
583	paddd	%xmm1,%xmm2
584	pcmpeqd	%xmm5,%xmm1
585	movdqa	%xmm0,112(%r10)
586	movdqa	%xmm4,%xmm0
587
588	paddd	%xmm2,%xmm3
589	pcmpeqd	%xmm5,%xmm2
590	movdqa	%xmm1,128(%r10)
591	movdqa	%xmm4,%xmm1
592
593	paddd	%xmm3,%xmm0
594	pcmpeqd	%xmm5,%xmm3
595	movdqa	%xmm2,144(%r10)
596	movdqa	%xmm4,%xmm2
597
598	paddd	%xmm0,%xmm1
599	pcmpeqd	%xmm5,%xmm0
600	movdqa	%xmm3,160(%r10)
601	movdqa	%xmm4,%xmm3
602	paddd	%xmm1,%xmm2
603	pcmpeqd	%xmm5,%xmm1
604	movdqa	%xmm0,176(%r10)
605	movdqa	%xmm4,%xmm0
606
607	paddd	%xmm2,%xmm3
608	pcmpeqd	%xmm5,%xmm2
609	movdqa	%xmm1,192(%r10)
610	movdqa	%xmm4,%xmm1
611
612	paddd	%xmm3,%xmm0
613	pcmpeqd	%xmm5,%xmm3
614	movdqa	%xmm2,208(%r10)
615	movdqa	%xmm4,%xmm2
616
617	paddd	%xmm0,%xmm1
618	pcmpeqd	%xmm5,%xmm0
619	movdqa	%xmm3,224(%r10)
620	movdqa	%xmm4,%xmm3
621	paddd	%xmm1,%xmm2
622	pcmpeqd	%xmm5,%xmm1
623	movdqa	%xmm0,240(%r10)
624	movdqa	%xmm4,%xmm0
625
626	paddd	%xmm2,%xmm3
627	pcmpeqd	%xmm5,%xmm2
628	movdqa	%xmm1,256(%r10)
629	movdqa	%xmm4,%xmm1
630
631	paddd	%xmm3,%xmm0
632	pcmpeqd	%xmm5,%xmm3
633	movdqa	%xmm2,272(%r10)
634	movdqa	%xmm4,%xmm2
635
636	paddd	%xmm0,%xmm1
637	pcmpeqd	%xmm5,%xmm0
638	movdqa	%xmm3,288(%r10)
639	movdqa	%xmm4,%xmm3
640	paddd	%xmm1,%xmm2
641	pcmpeqd	%xmm5,%xmm1
642	movdqa	%xmm0,304(%r10)
643
644	paddd	%xmm2,%xmm3
645.byte	0x67
646	pcmpeqd	%xmm5,%xmm2
647	movdqa	%xmm1,320(%r10)
648
649	pcmpeqd	%xmm5,%xmm3
650	movdqa	%xmm2,336(%r10)
651	pand	64(%r12),%xmm0
652
653	pand	80(%r12),%xmm1
654	pand	96(%r12),%xmm2
655	movdqa	%xmm3,352(%r10)
656	pand	112(%r12),%xmm3
657	por	%xmm2,%xmm0
658	por	%xmm3,%xmm1
659	movdqa	-128(%r12),%xmm4
660	movdqa	-112(%r12),%xmm5
661	movdqa	-96(%r12),%xmm2
662	pand	112(%r10),%xmm4
663	movdqa	-80(%r12),%xmm3
664	pand	128(%r10),%xmm5
665	por	%xmm4,%xmm0
666	pand	144(%r10),%xmm2
667	por	%xmm5,%xmm1
668	pand	160(%r10),%xmm3
669	por	%xmm2,%xmm0
670	por	%xmm3,%xmm1
671	movdqa	-64(%r12),%xmm4
672	movdqa	-48(%r12),%xmm5
673	movdqa	-32(%r12),%xmm2
674	pand	176(%r10),%xmm4
675	movdqa	-16(%r12),%xmm3
676	pand	192(%r10),%xmm5
677	por	%xmm4,%xmm0
678	pand	208(%r10),%xmm2
679	por	%xmm5,%xmm1
680	pand	224(%r10),%xmm3
681	por	%xmm2,%xmm0
682	por	%xmm3,%xmm1
683	movdqa	0(%r12),%xmm4
684	movdqa	16(%r12),%xmm5
685	movdqa	32(%r12),%xmm2
686	pand	240(%r10),%xmm4
687	movdqa	48(%r12),%xmm3
688	pand	256(%r10),%xmm5
689	por	%xmm4,%xmm0
690	pand	272(%r10),%xmm2
691	por	%xmm5,%xmm1
692	pand	288(%r10),%xmm3
693	por	%xmm2,%xmm0
694	por	%xmm3,%xmm1
695	por	%xmm1,%xmm0
696
697	pshufd	$0x4e,%xmm0,%xmm1
698	por	%xmm1,%xmm0
699	leaq	256(%r12),%r12
700.byte	102,72,15,126,195
701
702	movq	%r13,16+8(%rsp)
703	movq	%rdi,56+8(%rsp)
704
705	movq	(%r8),%r8
706	movq	(%rsi),%rax
707	leaq	(%rsi,%r9,1),%rsi
708	negq	%r9
709
710	movq	%r8,%rbp
711	mulq	%rbx
712	movq	%rax,%r10
713	movq	(%rcx),%rax
714
715	imulq	%r10,%rbp
716	leaq	64+8(%rsp),%r14
717	movq	%rdx,%r11
718
719	mulq	%rbp
720	addq	%rax,%r10
721	movq	8(%rsi,%r9,1),%rax
722	adcq	$0,%rdx
723	movq	%rdx,%rdi
724
725	mulq	%rbx
726	addq	%rax,%r11
727	movq	8(%rcx),%rax
728	adcq	$0,%rdx
729	movq	%rdx,%r10
730
731	mulq	%rbp
732	addq	%rax,%rdi
733	movq	16(%rsi,%r9,1),%rax
734	adcq	$0,%rdx
735	addq	%r11,%rdi
736	leaq	32(%r9),%r15
737	leaq	32(%rcx),%rcx
738	adcq	$0,%rdx
739	movq	%rdi,(%r14)
740	movq	%rdx,%r13
741	jmp	L$1st4x
742
743.p2align	5
744L$1st4x:
745	mulq	%rbx
746	addq	%rax,%r10
747	movq	-16(%rcx),%rax
748	leaq	32(%r14),%r14
749	adcq	$0,%rdx
750	movq	%rdx,%r11
751
752	mulq	%rbp
753	addq	%rax,%r13
754	movq	-8(%rsi,%r15,1),%rax
755	adcq	$0,%rdx
756	addq	%r10,%r13
757	adcq	$0,%rdx
758	movq	%r13,-24(%r14)
759	movq	%rdx,%rdi
760
761	mulq	%rbx
762	addq	%rax,%r11
763	movq	-8(%rcx),%rax
764	adcq	$0,%rdx
765	movq	%rdx,%r10
766
767	mulq	%rbp
768	addq	%rax,%rdi
769	movq	(%rsi,%r15,1),%rax
770	adcq	$0,%rdx
771	addq	%r11,%rdi
772	adcq	$0,%rdx
773	movq	%rdi,-16(%r14)
774	movq	%rdx,%r13
775
776	mulq	%rbx
777	addq	%rax,%r10
778	movq	0(%rcx),%rax
779	adcq	$0,%rdx
780	movq	%rdx,%r11
781
782	mulq	%rbp
783	addq	%rax,%r13
784	movq	8(%rsi,%r15,1),%rax
785	adcq	$0,%rdx
786	addq	%r10,%r13
787	adcq	$0,%rdx
788	movq	%r13,-8(%r14)
789	movq	%rdx,%rdi
790
791	mulq	%rbx
792	addq	%rax,%r11
793	movq	8(%rcx),%rax
794	adcq	$0,%rdx
795	movq	%rdx,%r10
796
797	mulq	%rbp
798	addq	%rax,%rdi
799	movq	16(%rsi,%r15,1),%rax
800	adcq	$0,%rdx
801	addq	%r11,%rdi
802	leaq	32(%rcx),%rcx
803	adcq	$0,%rdx
804	movq	%rdi,(%r14)
805	movq	%rdx,%r13
806
807	addq	$32,%r15
808	jnz	L$1st4x
809
810	mulq	%rbx
811	addq	%rax,%r10
812	movq	-16(%rcx),%rax
813	leaq	32(%r14),%r14
814	adcq	$0,%rdx
815	movq	%rdx,%r11
816
817	mulq	%rbp
818	addq	%rax,%r13
819	movq	-8(%rsi),%rax
820	adcq	$0,%rdx
821	addq	%r10,%r13
822	adcq	$0,%rdx
823	movq	%r13,-24(%r14)
824	movq	%rdx,%rdi
825
826	mulq	%rbx
827	addq	%rax,%r11
828	movq	-8(%rcx),%rax
829	adcq	$0,%rdx
830	movq	%rdx,%r10
831
832	mulq	%rbp
833	addq	%rax,%rdi
834	movq	(%rsi,%r9,1),%rax
835	adcq	$0,%rdx
836	addq	%r11,%rdi
837	adcq	$0,%rdx
838	movq	%rdi,-16(%r14)
839	movq	%rdx,%r13
840
841	leaq	(%rcx,%r9,1),%rcx
842
843	xorq	%rdi,%rdi
844	addq	%r10,%r13
845	adcq	$0,%rdi
846	movq	%r13,-8(%r14)
847
848	jmp	L$outer4x
849
850.p2align	5
851L$outer4x:
852	leaq	16+128(%r14),%rdx
853	pxor	%xmm4,%xmm4
854	pxor	%xmm5,%xmm5
855	movdqa	-128(%r12),%xmm0
856	movdqa	-112(%r12),%xmm1
857	movdqa	-96(%r12),%xmm2
858	movdqa	-80(%r12),%xmm3
859	pand	-128(%rdx),%xmm0
860	pand	-112(%rdx),%xmm1
861	por	%xmm0,%xmm4
862	pand	-96(%rdx),%xmm2
863	por	%xmm1,%xmm5
864	pand	-80(%rdx),%xmm3
865	por	%xmm2,%xmm4
866	por	%xmm3,%xmm5
867	movdqa	-64(%r12),%xmm0
868	movdqa	-48(%r12),%xmm1
869	movdqa	-32(%r12),%xmm2
870	movdqa	-16(%r12),%xmm3
871	pand	-64(%rdx),%xmm0
872	pand	-48(%rdx),%xmm1
873	por	%xmm0,%xmm4
874	pand	-32(%rdx),%xmm2
875	por	%xmm1,%xmm5
876	pand	-16(%rdx),%xmm3
877	por	%xmm2,%xmm4
878	por	%xmm3,%xmm5
879	movdqa	0(%r12),%xmm0
880	movdqa	16(%r12),%xmm1
881	movdqa	32(%r12),%xmm2
882	movdqa	48(%r12),%xmm3
883	pand	0(%rdx),%xmm0
884	pand	16(%rdx),%xmm1
885	por	%xmm0,%xmm4
886	pand	32(%rdx),%xmm2
887	por	%xmm1,%xmm5
888	pand	48(%rdx),%xmm3
889	por	%xmm2,%xmm4
890	por	%xmm3,%xmm5
891	movdqa	64(%r12),%xmm0
892	movdqa	80(%r12),%xmm1
893	movdqa	96(%r12),%xmm2
894	movdqa	112(%r12),%xmm3
895	pand	64(%rdx),%xmm0
896	pand	80(%rdx),%xmm1
897	por	%xmm0,%xmm4
898	pand	96(%rdx),%xmm2
899	por	%xmm1,%xmm5
900	pand	112(%rdx),%xmm3
901	por	%xmm2,%xmm4
902	por	%xmm3,%xmm5
903	por	%xmm5,%xmm4
904
905	pshufd	$0x4e,%xmm4,%xmm0
906	por	%xmm4,%xmm0
907	leaq	256(%r12),%r12
908.byte	102,72,15,126,195
909
910	movq	(%r14,%r9,1),%r10
911	movq	%r8,%rbp
912	mulq	%rbx
913	addq	%rax,%r10
914	movq	(%rcx),%rax
915	adcq	$0,%rdx
916
917	imulq	%r10,%rbp
918	movq	%rdx,%r11
919	movq	%rdi,(%r14)
920
921	leaq	(%r14,%r9,1),%r14
922
923	mulq	%rbp
924	addq	%rax,%r10
925	movq	8(%rsi,%r9,1),%rax
926	adcq	$0,%rdx
927	movq	%rdx,%rdi
928
929	mulq	%rbx
930	addq	%rax,%r11
931	movq	8(%rcx),%rax
932	adcq	$0,%rdx
933	addq	8(%r14),%r11
934	adcq	$0,%rdx
935	movq	%rdx,%r10
936
937	mulq	%rbp
938	addq	%rax,%rdi
939	movq	16(%rsi,%r9,1),%rax
940	adcq	$0,%rdx
941	addq	%r11,%rdi
942	leaq	32(%r9),%r15
943	leaq	32(%rcx),%rcx
944	adcq	$0,%rdx
945	movq	%rdx,%r13
946	jmp	L$inner4x
947
948.p2align	5
949L$inner4x:
950	mulq	%rbx
951	addq	%rax,%r10
952	movq	-16(%rcx),%rax
953	adcq	$0,%rdx
954	addq	16(%r14),%r10
955	leaq	32(%r14),%r14
956	adcq	$0,%rdx
957	movq	%rdx,%r11
958
959	mulq	%rbp
960	addq	%rax,%r13
961	movq	-8(%rsi,%r15,1),%rax
962	adcq	$0,%rdx
963	addq	%r10,%r13
964	adcq	$0,%rdx
965	movq	%rdi,-32(%r14)
966	movq	%rdx,%rdi
967
968	mulq	%rbx
969	addq	%rax,%r11
970	movq	-8(%rcx),%rax
971	adcq	$0,%rdx
972	addq	-8(%r14),%r11
973	adcq	$0,%rdx
974	movq	%rdx,%r10
975
976	mulq	%rbp
977	addq	%rax,%rdi
978	movq	(%rsi,%r15,1),%rax
979	adcq	$0,%rdx
980	addq	%r11,%rdi
981	adcq	$0,%rdx
982	movq	%r13,-24(%r14)
983	movq	%rdx,%r13
984
985	mulq	%rbx
986	addq	%rax,%r10
987	movq	0(%rcx),%rax
988	adcq	$0,%rdx
989	addq	(%r14),%r10
990	adcq	$0,%rdx
991	movq	%rdx,%r11
992
993	mulq	%rbp
994	addq	%rax,%r13
995	movq	8(%rsi,%r15,1),%rax
996	adcq	$0,%rdx
997	addq	%r10,%r13
998	adcq	$0,%rdx
999	movq	%rdi,-16(%r14)
1000	movq	%rdx,%rdi
1001
1002	mulq	%rbx
1003	addq	%rax,%r11
1004	movq	8(%rcx),%rax
1005	adcq	$0,%rdx
1006	addq	8(%r14),%r11
1007	adcq	$0,%rdx
1008	movq	%rdx,%r10
1009
1010	mulq	%rbp
1011	addq	%rax,%rdi
1012	movq	16(%rsi,%r15,1),%rax
1013	adcq	$0,%rdx
1014	addq	%r11,%rdi
1015	leaq	32(%rcx),%rcx
1016	adcq	$0,%rdx
1017	movq	%r13,-8(%r14)
1018	movq	%rdx,%r13
1019
1020	addq	$32,%r15
1021	jnz	L$inner4x
1022
1023	mulq	%rbx
1024	addq	%rax,%r10
1025	movq	-16(%rcx),%rax
1026	adcq	$0,%rdx
1027	addq	16(%r14),%r10
1028	leaq	32(%r14),%r14
1029	adcq	$0,%rdx
1030	movq	%rdx,%r11
1031
1032	mulq	%rbp
1033	addq	%rax,%r13
1034	movq	-8(%rsi),%rax
1035	adcq	$0,%rdx
1036	addq	%r10,%r13
1037	adcq	$0,%rdx
1038	movq	%rdi,-32(%r14)
1039	movq	%rdx,%rdi
1040
1041	mulq	%rbx
1042	addq	%rax,%r11
1043	movq	%rbp,%rax
1044	movq	-8(%rcx),%rbp
1045	adcq	$0,%rdx
1046	addq	-8(%r14),%r11
1047	adcq	$0,%rdx
1048	movq	%rdx,%r10
1049
1050	mulq	%rbp
1051	addq	%rax,%rdi
1052	movq	(%rsi,%r9,1),%rax
1053	adcq	$0,%rdx
1054	addq	%r11,%rdi
1055	adcq	$0,%rdx
1056	movq	%r13,-24(%r14)
1057	movq	%rdx,%r13
1058
1059	movq	%rdi,-16(%r14)
1060	leaq	(%rcx,%r9,1),%rcx
1061
1062	xorq	%rdi,%rdi
1063	addq	%r10,%r13
1064	adcq	$0,%rdi
1065	addq	(%r14),%r13
1066	adcq	$0,%rdi
1067	movq	%r13,-8(%r14)
1068
1069	cmpq	16+8(%rsp),%r12
1070	jb	L$outer4x
1071	xorq	%rax,%rax
1072	subq	%r13,%rbp
1073	adcq	%r15,%r15
1074	orq	%r15,%rdi
1075	subq	%rdi,%rax
1076	leaq	(%r14,%r9,1),%rbx
1077	movq	(%rcx),%r12
1078	leaq	(%rcx),%rbp
1079	movq	%r9,%rcx
1080	sarq	$3+2,%rcx
1081	movq	56+8(%rsp),%rdi
1082	decq	%r12
1083	xorq	%r10,%r10
1084	movq	8(%rbp),%r13
1085	movq	16(%rbp),%r14
1086	movq	24(%rbp),%r15
1087	jmp	L$sqr4x_sub_entry
1088
1089
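// bn_power5 appears to implement one fixed-window step of constant-time
// modular exponentiation: five consecutive Montgomery squarings of ap
// (__bn_sqr8x_internal + __bn_post4x_internal) followed by one Montgomery
// multiplication by the table entry selected in constant time by `power`,
// i.e. roughly rp = ap^(2^5) * table[power] mod np.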
1090.globl	_bn_power5
1091.private_extern _bn_power5
1092
1093.p2align	5
1094_bn_power5:
1095
1096_CET_ENDBR
1097	movq	%rsp,%rax
1098
1099	leaq	_OPENSSL_ia32cap_P(%rip),%r11
1100	movl	8(%r11),%r11d
1101	andl	$0x80108,%r11d
1102	cmpl	$0x80108,%r11d
1103	je	L$powerx5_enter
1104	pushq	%rbx
1105
1106	pushq	%rbp
1107
1108	pushq	%r12
1109
1110	pushq	%r13
1111
1112	pushq	%r14
1113
1114	pushq	%r15
1115
1116L$power5_prologue:
1117
1118	shll	$3,%r9d
1119	leal	(%r9,%r9,2),%r10d
1120	negq	%r9
1121	movq	(%r8),%r8
1122
1123
1124
1125
1126
1127
1128
1129
1130	leaq	-320(%rsp,%r9,2),%r11
1131	movq	%rsp,%rbp
1132	subq	%rdi,%r11
1133	andq	$4095,%r11
1134	cmpq	%r11,%r10
1135	jb	L$pwr_sp_alt
1136	subq	%r11,%rbp
1137	leaq	-320(%rbp,%r9,2),%rbp
1138	jmp	L$pwr_sp_done
1139
1140.p2align	5
1141L$pwr_sp_alt:
1142	leaq	4096-320(,%r9,2),%r10
1143	leaq	-320(%rbp,%r9,2),%rbp
1144	subq	%r10,%r11
1145	movq	$0,%r10
1146	cmovcq	%r10,%r11
1147	subq	%r11,%rbp
1148L$pwr_sp_done:
1149	andq	$-64,%rbp
1150	movq	%rsp,%r11
1151	subq	%rbp,%r11
1152	andq	$-4096,%r11
1153	leaq	(%r11,%rbp,1),%rsp
1154	movq	(%rsp),%r10
1155	cmpq	%rbp,%rsp
1156	ja	L$pwr_page_walk
1157	jmp	L$pwr_page_walk_done
1158
1159L$pwr_page_walk:
1160	leaq	-4096(%rsp),%rsp
1161	movq	(%rsp),%r10
1162	cmpq	%rbp,%rsp
1163	ja	L$pwr_page_walk
1164L$pwr_page_walk_done:
1165
1166	movq	%r9,%r10
1167	negq	%r9
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178	movq	%r8,32(%rsp)
1179	movq	%rax,40(%rsp)
1180
1181L$power5_body:
1182.byte	102,72,15,110,207
1183.byte	102,72,15,110,209
1184.byte	102,73,15,110,218
1185.byte	102,72,15,110,226
1186
1187	call	__bn_sqr8x_internal
1188	call	__bn_post4x_internal
1189	call	__bn_sqr8x_internal
1190	call	__bn_post4x_internal
1191	call	__bn_sqr8x_internal
1192	call	__bn_post4x_internal
1193	call	__bn_sqr8x_internal
1194	call	__bn_post4x_internal
1195	call	__bn_sqr8x_internal
1196	call	__bn_post4x_internal
1197
1198.byte	102,72,15,126,209
1199.byte	102,72,15,126,226
1200	movq	%rsi,%rdi
1201	movq	40(%rsp),%rax
1202	leaq	32(%rsp),%r8
1203
1204	call	mul4x_internal
1205
1206	movq	40(%rsp),%rsi
1207
1208	movq	$1,%rax
1209	movq	-48(%rsi),%r15
1210
1211	movq	-40(%rsi),%r14
1212
1213	movq	-32(%rsi),%r13
1214
1215	movq	-24(%rsi),%r12
1216
1217	movq	-16(%rsi),%rbp
1218
1219	movq	-8(%rsi),%rbx
1220
1221	leaq	(%rsi),%rsp
1222
1223L$power5_epilogue:
1224	ret
1225
1226
1227
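// bn_sqr8x_internal / __bn_sqr8x_internal: Montgomery squaring core.  It
// accumulates the cross products (L$sqr4x_1st / L$sqr4x_outer / L$sqr4x_inner),
// doubles them while folding in the squared diagonal terms
// (L$sqr4x_shift_n_add), and then falls through into __bn_sqr8x_reduction.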
1228.globl	_bn_sqr8x_internal
1229.private_extern _bn_sqr8x_internal
1231
1232.p2align	5
1233_bn_sqr8x_internal:
1234__bn_sqr8x_internal:
1235
1236_CET_ENDBR
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310	leaq	32(%r10),%rbp
1311	leaq	(%rsi,%r9,1),%rsi
1312
1313	movq	%r9,%rcx
1314
1315
1316	movq	-32(%rsi,%rbp,1),%r14
1317	leaq	48+8(%rsp,%r9,2),%rdi
1318	movq	-24(%rsi,%rbp,1),%rax
1319	leaq	-32(%rdi,%rbp,1),%rdi
1320	movq	-16(%rsi,%rbp,1),%rbx
1321	movq	%rax,%r15
1322
1323	mulq	%r14
1324	movq	%rax,%r10
1325	movq	%rbx,%rax
1326	movq	%rdx,%r11
1327	movq	%r10,-24(%rdi,%rbp,1)
1328
1329	mulq	%r14
1330	addq	%rax,%r11
1331	movq	%rbx,%rax
1332	adcq	$0,%rdx
1333	movq	%r11,-16(%rdi,%rbp,1)
1334	movq	%rdx,%r10
1335
1336
1337	movq	-8(%rsi,%rbp,1),%rbx
1338	mulq	%r15
1339	movq	%rax,%r12
1340	movq	%rbx,%rax
1341	movq	%rdx,%r13
1342
1343	leaq	(%rbp),%rcx
1344	mulq	%r14
1345	addq	%rax,%r10
1346	movq	%rbx,%rax
1347	movq	%rdx,%r11
1348	adcq	$0,%r11
1349	addq	%r12,%r10
1350	adcq	$0,%r11
1351	movq	%r10,-8(%rdi,%rcx,1)
1352	jmp	L$sqr4x_1st
1353
1354.p2align	5
1355L$sqr4x_1st:
1356	movq	(%rsi,%rcx,1),%rbx
1357	mulq	%r15
1358	addq	%rax,%r13
1359	movq	%rbx,%rax
1360	movq	%rdx,%r12
1361	adcq	$0,%r12
1362
1363	mulq	%r14
1364	addq	%rax,%r11
1365	movq	%rbx,%rax
1366	movq	8(%rsi,%rcx,1),%rbx
1367	movq	%rdx,%r10
1368	adcq	$0,%r10
1369	addq	%r13,%r11
1370	adcq	$0,%r10
1371
1372
1373	mulq	%r15
1374	addq	%rax,%r12
1375	movq	%rbx,%rax
1376	movq	%r11,(%rdi,%rcx,1)
1377	movq	%rdx,%r13
1378	adcq	$0,%r13
1379
1380	mulq	%r14
1381	addq	%rax,%r10
1382	movq	%rbx,%rax
1383	movq	16(%rsi,%rcx,1),%rbx
1384	movq	%rdx,%r11
1385	adcq	$0,%r11
1386	addq	%r12,%r10
1387	adcq	$0,%r11
1388
1389	mulq	%r15
1390	addq	%rax,%r13
1391	movq	%rbx,%rax
1392	movq	%r10,8(%rdi,%rcx,1)
1393	movq	%rdx,%r12
1394	adcq	$0,%r12
1395
1396	mulq	%r14
1397	addq	%rax,%r11
1398	movq	%rbx,%rax
1399	movq	24(%rsi,%rcx,1),%rbx
1400	movq	%rdx,%r10
1401	adcq	$0,%r10
1402	addq	%r13,%r11
1403	adcq	$0,%r10
1404
1405
1406	mulq	%r15
1407	addq	%rax,%r12
1408	movq	%rbx,%rax
1409	movq	%r11,16(%rdi,%rcx,1)
1410	movq	%rdx,%r13
1411	adcq	$0,%r13
1412	leaq	32(%rcx),%rcx
1413
1414	mulq	%r14
1415	addq	%rax,%r10
1416	movq	%rbx,%rax
1417	movq	%rdx,%r11
1418	adcq	$0,%r11
1419	addq	%r12,%r10
1420	adcq	$0,%r11
1421	movq	%r10,-8(%rdi,%rcx,1)
1422
1423	cmpq	$0,%rcx
1424	jne	L$sqr4x_1st
1425
1426	mulq	%r15
1427	addq	%rax,%r13
1428	leaq	16(%rbp),%rbp
1429	adcq	$0,%rdx
1430	addq	%r11,%r13
1431	adcq	$0,%rdx
1432
1433	movq	%r13,(%rdi)
1434	movq	%rdx,%r12
1435	movq	%rdx,8(%rdi)
1436	jmp	L$sqr4x_outer
1437
1438.p2align	5
1439L$sqr4x_outer:
1440	movq	-32(%rsi,%rbp,1),%r14
1441	leaq	48+8(%rsp,%r9,2),%rdi
1442	movq	-24(%rsi,%rbp,1),%rax
1443	leaq	-32(%rdi,%rbp,1),%rdi
1444	movq	-16(%rsi,%rbp,1),%rbx
1445	movq	%rax,%r15
1446
1447	mulq	%r14
1448	movq	-24(%rdi,%rbp,1),%r10
1449	addq	%rax,%r10
1450	movq	%rbx,%rax
1451	adcq	$0,%rdx
1452	movq	%r10,-24(%rdi,%rbp,1)
1453	movq	%rdx,%r11
1454
1455	mulq	%r14
1456	addq	%rax,%r11
1457	movq	%rbx,%rax
1458	adcq	$0,%rdx
1459	addq	-16(%rdi,%rbp,1),%r11
1460	movq	%rdx,%r10
1461	adcq	$0,%r10
1462	movq	%r11,-16(%rdi,%rbp,1)
1463
1464	xorq	%r12,%r12
1465
1466	movq	-8(%rsi,%rbp,1),%rbx
1467	mulq	%r15
1468	addq	%rax,%r12
1469	movq	%rbx,%rax
1470	adcq	$0,%rdx
1471	addq	-8(%rdi,%rbp,1),%r12
1472	movq	%rdx,%r13
1473	adcq	$0,%r13
1474
1475	mulq	%r14
1476	addq	%rax,%r10
1477	movq	%rbx,%rax
1478	adcq	$0,%rdx
1479	addq	%r12,%r10
1480	movq	%rdx,%r11
1481	adcq	$0,%r11
1482	movq	%r10,-8(%rdi,%rbp,1)
1483
1484	leaq	(%rbp),%rcx
1485	jmp	L$sqr4x_inner
1486
1487.p2align	5
1488L$sqr4x_inner:
1489	movq	(%rsi,%rcx,1),%rbx
1490	mulq	%r15
1491	addq	%rax,%r13
1492	movq	%rbx,%rax
1493	movq	%rdx,%r12
1494	adcq	$0,%r12
1495	addq	(%rdi,%rcx,1),%r13
1496	adcq	$0,%r12
1497
1498.byte	0x67
1499	mulq	%r14
1500	addq	%rax,%r11
1501	movq	%rbx,%rax
1502	movq	8(%rsi,%rcx,1),%rbx
1503	movq	%rdx,%r10
1504	adcq	$0,%r10
1505	addq	%r13,%r11
1506	adcq	$0,%r10
1507
1508	mulq	%r15
1509	addq	%rax,%r12
1510	movq	%r11,(%rdi,%rcx,1)
1511	movq	%rbx,%rax
1512	movq	%rdx,%r13
1513	adcq	$0,%r13
1514	addq	8(%rdi,%rcx,1),%r12
1515	leaq	16(%rcx),%rcx
1516	adcq	$0,%r13
1517
1518	mulq	%r14
1519	addq	%rax,%r10
1520	movq	%rbx,%rax
1521	adcq	$0,%rdx
1522	addq	%r12,%r10
1523	movq	%rdx,%r11
1524	adcq	$0,%r11
1525	movq	%r10,-8(%rdi,%rcx,1)
1526
1527	cmpq	$0,%rcx
1528	jne	L$sqr4x_inner
1529
1530.byte	0x67
1531	mulq	%r15
1532	addq	%rax,%r13
1533	adcq	$0,%rdx
1534	addq	%r11,%r13
1535	adcq	$0,%rdx
1536
1537	movq	%r13,(%rdi)
1538	movq	%rdx,%r12
1539	movq	%rdx,8(%rdi)
1540
1541	addq	$16,%rbp
1542	jnz	L$sqr4x_outer
1543
1544
1545	movq	-32(%rsi),%r14
1546	leaq	48+8(%rsp,%r9,2),%rdi
1547	movq	-24(%rsi),%rax
1548	leaq	-32(%rdi,%rbp,1),%rdi
1549	movq	-16(%rsi),%rbx
1550	movq	%rax,%r15
1551
1552	mulq	%r14
1553	addq	%rax,%r10
1554	movq	%rbx,%rax
1555	movq	%rdx,%r11
1556	adcq	$0,%r11
1557
1558	mulq	%r14
1559	addq	%rax,%r11
1560	movq	%rbx,%rax
1561	movq	%r10,-24(%rdi)
1562	movq	%rdx,%r10
1563	adcq	$0,%r10
1564	addq	%r13,%r11
1565	movq	-8(%rsi),%rbx
1566	adcq	$0,%r10
1567
1568	mulq	%r15
1569	addq	%rax,%r12
1570	movq	%rbx,%rax
1571	movq	%r11,-16(%rdi)
1572	movq	%rdx,%r13
1573	adcq	$0,%r13
1574
1575	mulq	%r14
1576	addq	%rax,%r10
1577	movq	%rbx,%rax
1578	movq	%rdx,%r11
1579	adcq	$0,%r11
1580	addq	%r12,%r10
1581	adcq	$0,%r11
1582	movq	%r10,-8(%rdi)
1583
1584	mulq	%r15
1585	addq	%rax,%r13
1586	movq	-16(%rsi),%rax
1587	adcq	$0,%rdx
1588	addq	%r11,%r13
1589	adcq	$0,%rdx
1590
1591	movq	%r13,(%rdi)
1592	movq	%rdx,%r12
1593	movq	%rdx,8(%rdi)
1594
1595	mulq	%rbx
1596	addq	$16,%rbp
1597	xorq	%r14,%r14
1598	subq	%r9,%rbp
1599	xorq	%r15,%r15
1600
1601	addq	%r12,%rax
1602	adcq	$0,%rdx
1603	movq	%rax,8(%rdi)
1604	movq	%rdx,16(%rdi)
1605	movq	%r15,24(%rdi)
1606
1607	movq	-16(%rsi,%rbp,1),%rax
1608	leaq	48+8(%rsp),%rdi
1609	xorq	%r10,%r10
1610	movq	8(%rdi),%r11
1611
1612	leaq	(%r14,%r10,2),%r12
1613	shrq	$63,%r10
1614	leaq	(%rcx,%r11,2),%r13
1615	shrq	$63,%r11
1616	orq	%r10,%r13
1617	movq	16(%rdi),%r10
1618	movq	%r11,%r14
1619	mulq	%rax
1620	negq	%r15
1621	movq	24(%rdi),%r11
1622	adcq	%rax,%r12
1623	movq	-8(%rsi,%rbp,1),%rax
1624	movq	%r12,(%rdi)
1625	adcq	%rdx,%r13
1626
1627	leaq	(%r14,%r10,2),%rbx
1628	movq	%r13,8(%rdi)
1629	sbbq	%r15,%r15
1630	shrq	$63,%r10
1631	leaq	(%rcx,%r11,2),%r8
1632	shrq	$63,%r11
1633	orq	%r10,%r8
1634	movq	32(%rdi),%r10
1635	movq	%r11,%r14
1636	mulq	%rax
1637	negq	%r15
1638	movq	40(%rdi),%r11
1639	adcq	%rax,%rbx
1640	movq	0(%rsi,%rbp,1),%rax
1641	movq	%rbx,16(%rdi)
1642	adcq	%rdx,%r8
1643	leaq	16(%rbp),%rbp
1644	movq	%r8,24(%rdi)
1645	sbbq	%r15,%r15
1646	leaq	64(%rdi),%rdi
1647	jmp	L$sqr4x_shift_n_add
1648
1649.p2align	5
1650L$sqr4x_shift_n_add:
1651	leaq	(%r14,%r10,2),%r12
1652	shrq	$63,%r10
1653	leaq	(%rcx,%r11,2),%r13
1654	shrq	$63,%r11
1655	orq	%r10,%r13
1656	movq	-16(%rdi),%r10
1657	movq	%r11,%r14
1658	mulq	%rax
1659	negq	%r15
1660	movq	-8(%rdi),%r11
1661	adcq	%rax,%r12
1662	movq	-8(%rsi,%rbp,1),%rax
1663	movq	%r12,-32(%rdi)
1664	adcq	%rdx,%r13
1665
1666	leaq	(%r14,%r10,2),%rbx
1667	movq	%r13,-24(%rdi)
1668	sbbq	%r15,%r15
1669	shrq	$63,%r10
1670	leaq	(%rcx,%r11,2),%r8
1671	shrq	$63,%r11
1672	orq	%r10,%r8
1673	movq	0(%rdi),%r10
1674	movq	%r11,%r14
1675	mulq	%rax
1676	negq	%r15
1677	movq	8(%rdi),%r11
1678	adcq	%rax,%rbx
1679	movq	0(%rsi,%rbp,1),%rax
1680	movq	%rbx,-16(%rdi)
1681	adcq	%rdx,%r8
1682
1683	leaq	(%r14,%r10,2),%r12
1684	movq	%r8,-8(%rdi)
1685	sbbq	%r15,%r15
1686	shrq	$63,%r10
1687	leaq	(%rcx,%r11,2),%r13
1688	shrq	$63,%r11
1689	orq	%r10,%r13
1690	movq	16(%rdi),%r10
1691	movq	%r11,%r14
1692	mulq	%rax
1693	negq	%r15
1694	movq	24(%rdi),%r11
1695	adcq	%rax,%r12
1696	movq	8(%rsi,%rbp,1),%rax
1697	movq	%r12,0(%rdi)
1698	adcq	%rdx,%r13
1699
1700	leaq	(%r14,%r10,2),%rbx
1701	movq	%r13,8(%rdi)
1702	sbbq	%r15,%r15
1703	shrq	$63,%r10
1704	leaq	(%rcx,%r11,2),%r8
1705	shrq	$63,%r11
1706	orq	%r10,%r8
1707	movq	32(%rdi),%r10
1708	movq	%r11,%r14
1709	mulq	%rax
1710	negq	%r15
1711	movq	40(%rdi),%r11
1712	adcq	%rax,%rbx
1713	movq	16(%rsi,%rbp,1),%rax
1714	movq	%rbx,16(%rdi)
1715	adcq	%rdx,%r8
1716	movq	%r8,24(%rdi)
1717	sbbq	%r15,%r15
1718	leaq	64(%rdi),%rdi
1719	addq	$32,%rbp
1720	jnz	L$sqr4x_shift_n_add
1721
1722	leaq	(%r14,%r10,2),%r12
1723.byte	0x67
1724	shrq	$63,%r10
1725	leaq	(%rcx,%r11,2),%r13
1726	shrq	$63,%r11
1727	orq	%r10,%r13
1728	movq	-16(%rdi),%r10
1729	movq	%r11,%r14
1730	mulq	%rax
1731	negq	%r15
1732	movq	-8(%rdi),%r11
1733	adcq	%rax,%r12
1734	movq	-8(%rsi),%rax
1735	movq	%r12,-32(%rdi)
1736	adcq	%rdx,%r13
1737
1738	leaq	(%r14,%r10,2),%rbx
1739	movq	%r13,-24(%rdi)
1740	sbbq	%r15,%r15
1741	shrq	$63,%r10
1742	leaq	(%rcx,%r11,2),%r8
1743	shrq	$63,%r11
1744	orq	%r10,%r8
1745	mulq	%rax
1746	negq	%r15
1747	adcq	%rax,%rbx
1748	adcq	%rdx,%r8
1749	movq	%rbx,-16(%rdi)
1750	movq	%r8,-8(%rdi)
1751.byte	102,72,15,126,213
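// __bn_sqr8x_reduction: Montgomery-reduce the double-width square eight words
// at a time; each round's multiplier is derived from n0, which the callers
// keep at 32(%rsp) in their frame (read here as 32+8(%rsp)).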
1752__bn_sqr8x_reduction:
1753	xorq	%rax,%rax
1754	leaq	(%r9,%rbp,1),%rcx
1755	leaq	48+8(%rsp,%r9,2),%rdx
1756	movq	%rcx,0+8(%rsp)
1757	leaq	48+8(%rsp,%r9,1),%rdi
1758	movq	%rdx,8+8(%rsp)
1759	negq	%r9
1760	jmp	L$8x_reduction_loop
1761
1762.p2align	5
1763L$8x_reduction_loop:
1764	leaq	(%rdi,%r9,1),%rdi
1765.byte	0x66
1766	movq	0(%rdi),%rbx
1767	movq	8(%rdi),%r9
1768	movq	16(%rdi),%r10
1769	movq	24(%rdi),%r11
1770	movq	32(%rdi),%r12
1771	movq	40(%rdi),%r13
1772	movq	48(%rdi),%r14
1773	movq	56(%rdi),%r15
1774	movq	%rax,(%rdx)
1775	leaq	64(%rdi),%rdi
1776
1777.byte	0x67
1778	movq	%rbx,%r8
1779	imulq	32+8(%rsp),%rbx
1780	movq	0(%rbp),%rax
1781	movl	$8,%ecx
1782	jmp	L$8x_reduce
1783
1784.p2align	5
1785L$8x_reduce:
1786	mulq	%rbx
1787	movq	8(%rbp),%rax
1788	negq	%r8
1789	movq	%rdx,%r8
1790	adcq	$0,%r8
1791
1792	mulq	%rbx
1793	addq	%rax,%r9
1794	movq	16(%rbp),%rax
1795	adcq	$0,%rdx
1796	addq	%r9,%r8
1797	movq	%rbx,48-8+8(%rsp,%rcx,8)
1798	movq	%rdx,%r9
1799	adcq	$0,%r9
1800
1801	mulq	%rbx
1802	addq	%rax,%r10
1803	movq	24(%rbp),%rax
1804	adcq	$0,%rdx
1805	addq	%r10,%r9
1806	movq	32+8(%rsp),%rsi
1807	movq	%rdx,%r10
1808	adcq	$0,%r10
1809
1810	mulq	%rbx
1811	addq	%rax,%r11
1812	movq	32(%rbp),%rax
1813	adcq	$0,%rdx
1814	imulq	%r8,%rsi
1815	addq	%r11,%r10
1816	movq	%rdx,%r11
1817	adcq	$0,%r11
1818
1819	mulq	%rbx
1820	addq	%rax,%r12
1821	movq	40(%rbp),%rax
1822	adcq	$0,%rdx
1823	addq	%r12,%r11
1824	movq	%rdx,%r12
1825	adcq	$0,%r12
1826
1827	mulq	%rbx
1828	addq	%rax,%r13
1829	movq	48(%rbp),%rax
1830	adcq	$0,%rdx
1831	addq	%r13,%r12
1832	movq	%rdx,%r13
1833	adcq	$0,%r13
1834
1835	mulq	%rbx
1836	addq	%rax,%r14
1837	movq	56(%rbp),%rax
1838	adcq	$0,%rdx
1839	addq	%r14,%r13
1840	movq	%rdx,%r14
1841	adcq	$0,%r14
1842
1843	mulq	%rbx
1844	movq	%rsi,%rbx
1845	addq	%rax,%r15
1846	movq	0(%rbp),%rax
1847	adcq	$0,%rdx
1848	addq	%r15,%r14
1849	movq	%rdx,%r15
1850	adcq	$0,%r15
1851
1852	decl	%ecx
1853	jnz	L$8x_reduce
1854
1855	leaq	64(%rbp),%rbp
1856	xorq	%rax,%rax
1857	movq	8+8(%rsp),%rdx
1858	cmpq	0+8(%rsp),%rbp
1859	jae	L$8x_no_tail
1860
1861.byte	0x66
1862	addq	0(%rdi),%r8
1863	adcq	8(%rdi),%r9
1864	adcq	16(%rdi),%r10
1865	adcq	24(%rdi),%r11
1866	adcq	32(%rdi),%r12
1867	adcq	40(%rdi),%r13
1868	adcq	48(%rdi),%r14
1869	adcq	56(%rdi),%r15
1870	sbbq	%rsi,%rsi
1871
1872	movq	48+56+8(%rsp),%rbx
1873	movl	$8,%ecx
1874	movq	0(%rbp),%rax
1875	jmp	L$8x_tail
1876
1877.p2align	5
1878L$8x_tail:
1879	mulq	%rbx
1880	addq	%rax,%r8
1881	movq	8(%rbp),%rax
1882	movq	%r8,(%rdi)
1883	movq	%rdx,%r8
1884	adcq	$0,%r8
1885
1886	mulq	%rbx
1887	addq	%rax,%r9
1888	movq	16(%rbp),%rax
1889	adcq	$0,%rdx
1890	addq	%r9,%r8
1891	leaq	8(%rdi),%rdi
1892	movq	%rdx,%r9
1893	adcq	$0,%r9
1894
1895	mulq	%rbx
1896	addq	%rax,%r10
1897	movq	24(%rbp),%rax
1898	adcq	$0,%rdx
1899	addq	%r10,%r9
1900	movq	%rdx,%r10
1901	adcq	$0,%r10
1902
1903	mulq	%rbx
1904	addq	%rax,%r11
1905	movq	32(%rbp),%rax
1906	adcq	$0,%rdx
1907	addq	%r11,%r10
1908	movq	%rdx,%r11
1909	adcq	$0,%r11
1910
1911	mulq	%rbx
1912	addq	%rax,%r12
1913	movq	40(%rbp),%rax
1914	adcq	$0,%rdx
1915	addq	%r12,%r11
1916	movq	%rdx,%r12
1917	adcq	$0,%r12
1918
1919	mulq	%rbx
1920	addq	%rax,%r13
1921	movq	48(%rbp),%rax
1922	adcq	$0,%rdx
1923	addq	%r13,%r12
1924	movq	%rdx,%r13
1925	adcq	$0,%r13
1926
1927	mulq	%rbx
1928	addq	%rax,%r14
1929	movq	56(%rbp),%rax
1930	adcq	$0,%rdx
1931	addq	%r14,%r13
1932	movq	%rdx,%r14
1933	adcq	$0,%r14
1934
1935	mulq	%rbx
1936	movq	48-16+8(%rsp,%rcx,8),%rbx
1937	addq	%rax,%r15
1938	adcq	$0,%rdx
1939	addq	%r15,%r14
1940	movq	0(%rbp),%rax
1941	movq	%rdx,%r15
1942	adcq	$0,%r15
1943
1944	decl	%ecx
1945	jnz	L$8x_tail
1946
1947	leaq	64(%rbp),%rbp
1948	movq	8+8(%rsp),%rdx
1949	cmpq	0+8(%rsp),%rbp
1950	jae	L$8x_tail_done
1951
1952	movq	48+56+8(%rsp),%rbx
1953	negq	%rsi
1954	movq	0(%rbp),%rax
1955	adcq	0(%rdi),%r8
1956	adcq	8(%rdi),%r9
1957	adcq	16(%rdi),%r10
1958	adcq	24(%rdi),%r11
1959	adcq	32(%rdi),%r12
1960	adcq	40(%rdi),%r13
1961	adcq	48(%rdi),%r14
1962	adcq	56(%rdi),%r15
1963	sbbq	%rsi,%rsi
1964
1965	movl	$8,%ecx
1966	jmp	L$8x_tail
1967
1968.p2align	5
1969L$8x_tail_done:
1970	xorq	%rax,%rax
1971	addq	(%rdx),%r8
1972	adcq	$0,%r9
1973	adcq	$0,%r10
1974	adcq	$0,%r11
1975	adcq	$0,%r12
1976	adcq	$0,%r13
1977	adcq	$0,%r14
1978	adcq	$0,%r15
1979	adcq	$0,%rax
1980
1981	negq	%rsi
1982L$8x_no_tail:
1983	adcq	0(%rdi),%r8
1984	adcq	8(%rdi),%r9
1985	adcq	16(%rdi),%r10
1986	adcq	24(%rdi),%r11
1987	adcq	32(%rdi),%r12
1988	adcq	40(%rdi),%r13
1989	adcq	48(%rdi),%r14
1990	adcq	56(%rdi),%r15
1991	adcq	$0,%rax
1992	movq	-8(%rbp),%rcx
1993	xorq	%rsi,%rsi
1994
1995.byte	102,72,15,126,213
1996
1997	movq	%r8,0(%rdi)
1998	movq	%r9,8(%rdi)
1999.byte	102,73,15,126,217
2000	movq	%r10,16(%rdi)
2001	movq	%r11,24(%rdi)
2002	movq	%r12,32(%rdi)
2003	movq	%r13,40(%rdi)
2004	movq	%r14,48(%rdi)
2005	movq	%r15,56(%rdi)
2006	leaq	64(%rdi),%rdi
2007
2008	cmpq	%rdx,%rdi
2009	jb	L$8x_reduction_loop
2010	ret
2011
2012
2013
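// __bn_post4x_internal: after the 8x reduction, conditionally subtract the
// modulus (L$sqr4x_sub) using the carry/borrow as a mask, writing the final
// result out in constant time.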
2014.p2align	5
2015__bn_post4x_internal:
2016
2017	movq	0(%rbp),%r12
2018	leaq	(%rdi,%r9,1),%rbx
2019	movq	%r9,%rcx
2020.byte	102,72,15,126,207
2021	negq	%rax
2022.byte	102,72,15,126,206
2023	sarq	$3+2,%rcx
2024	decq	%r12
2025	xorq	%r10,%r10
2026	movq	8(%rbp),%r13
2027	movq	16(%rbp),%r14
2028	movq	24(%rbp),%r15
2029	jmp	L$sqr4x_sub_entry
2030
2031.p2align	4
2032L$sqr4x_sub:
2033	movq	0(%rbp),%r12
2034	movq	8(%rbp),%r13
2035	movq	16(%rbp),%r14
2036	movq	24(%rbp),%r15
2037L$sqr4x_sub_entry:
2038	leaq	32(%rbp),%rbp
2039	notq	%r12
2040	notq	%r13
2041	notq	%r14
2042	notq	%r15
2043	andq	%rax,%r12
2044	andq	%rax,%r13
2045	andq	%rax,%r14
2046	andq	%rax,%r15
2047
2048	negq	%r10
2049	adcq	0(%rbx),%r12
2050	adcq	8(%rbx),%r13
2051	adcq	16(%rbx),%r14
2052	adcq	24(%rbx),%r15
2053	movq	%r12,0(%rdi)
2054	leaq	32(%rbx),%rbx
2055	movq	%r13,8(%rdi)
2056	sbbq	%r10,%r10
2057	movq	%r14,16(%rdi)
2058	movq	%r15,24(%rdi)
2059	leaq	32(%rdi),%rdi
2060
2061	incq	%rcx
2062	jnz	L$sqr4x_sub
2063
2064	movq	%r9,%r10
2065	negq	%r9
2066	ret
2067
2068
2069
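// bn_mulx4x_mont_gather5: BMI2/ADX variant of the 4x path, reached via
// L$mulx4x_enter.  Same interface and the same constant-time gather, but
// built around mulx/adcx/adox so the two carry chains can be interleaved.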
2070.p2align	5
2071bn_mulx4x_mont_gather5:
2072
2073	movq	%rsp,%rax
2074
2075L$mulx4x_enter:
2076	pushq	%rbx
2077
2078	pushq	%rbp
2079
2080	pushq	%r12
2081
2082	pushq	%r13
2083
2084	pushq	%r14
2085
2086	pushq	%r15
2087
2088L$mulx4x_prologue:
2089
2090	shll	$3,%r9d
2091	leaq	(%r9,%r9,2),%r10
2092	negq	%r9
2093	movq	(%r8),%r8
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104	leaq	-320(%rsp,%r9,2),%r11
2105	movq	%rsp,%rbp
2106	subq	%rdi,%r11
2107	andq	$4095,%r11
2108	cmpq	%r11,%r10
2109	jb	L$mulx4xsp_alt
2110	subq	%r11,%rbp
2111	leaq	-320(%rbp,%r9,2),%rbp
2112	jmp	L$mulx4xsp_done
2113
2114L$mulx4xsp_alt:
2115	leaq	4096-320(,%r9,2),%r10
2116	leaq	-320(%rbp,%r9,2),%rbp
2117	subq	%r10,%r11
2118	movq	$0,%r10
2119	cmovcq	%r10,%r11
2120	subq	%r11,%rbp
2121L$mulx4xsp_done:
2122	andq	$-64,%rbp
2123	movq	%rsp,%r11
2124	subq	%rbp,%r11
2125	andq	$-4096,%r11
2126	leaq	(%r11,%rbp,1),%rsp
2127	movq	(%rsp),%r10
2128	cmpq	%rbp,%rsp
2129	ja	L$mulx4x_page_walk
2130	jmp	L$mulx4x_page_walk_done
2131
2132L$mulx4x_page_walk:
2133	leaq	-4096(%rsp),%rsp
2134	movq	(%rsp),%r10
2135	cmpq	%rbp,%rsp
2136	ja	L$mulx4x_page_walk
2137L$mulx4x_page_walk_done:
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151	movq	%r8,32(%rsp)
2152	movq	%rax,40(%rsp)
2153
2154L$mulx4x_body:
2155	call	mulx4x_internal
2156
2157	movq	40(%rsp),%rsi
2158
2159	movq	$1,%rax
2160
2161	movq	-48(%rsi),%r15
2162
2163	movq	-40(%rsi),%r14
2164
2165	movq	-32(%rsi),%r13
2166
2167	movq	-24(%rsi),%r12
2168
2169	movq	-16(%rsi),%rbp
2170
2171	movq	-8(%rsi),%rbx
2172
2173	leaq	(%rsi),%rsp
2174
2175L$mulx4x_epilogue:
2176	ret
2177
2178
2179
2180
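// mulx4x_internal: MULX-based counterpart of mul4x_internal, called from
// L$mulx4x_body above and from bn_powerx5 below.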
2181.p2align	5
2182mulx4x_internal:
2183
2184	movq	%r9,8(%rsp)
2185	movq	%r9,%r10
2186	negq	%r9
2187	shlq	$5,%r9
2188	negq	%r10
2189	leaq	128(%rdx,%r9,1),%r13
2190	shrq	$5+5,%r9
2191	movd	8(%rax),%xmm5
2192	subq	$1,%r9
2193	leaq	L$inc(%rip),%rax
2194	movq	%r13,16+8(%rsp)
2195	movq	%r9,24+8(%rsp)
2196	movq	%rdi,56+8(%rsp)
2197	movdqa	0(%rax),%xmm0
2198	movdqa	16(%rax),%xmm1
2199	leaq	88-112(%rsp,%r10,1),%r10
2200	leaq	128(%rdx),%rdi
2201
2202	pshufd	$0,%xmm5,%xmm5
2203	movdqa	%xmm1,%xmm4
2204.byte	0x67
2205	movdqa	%xmm1,%xmm2
2206.byte	0x67
2207	paddd	%xmm0,%xmm1
2208	pcmpeqd	%xmm5,%xmm0
2209	movdqa	%xmm4,%xmm3
2210	paddd	%xmm1,%xmm2
2211	pcmpeqd	%xmm5,%xmm1
2212	movdqa	%xmm0,112(%r10)
2213	movdqa	%xmm4,%xmm0
2214
2215	paddd	%xmm2,%xmm3
2216	pcmpeqd	%xmm5,%xmm2
2217	movdqa	%xmm1,128(%r10)
2218	movdqa	%xmm4,%xmm1
2219
2220	paddd	%xmm3,%xmm0
2221	pcmpeqd	%xmm5,%xmm3
2222	movdqa	%xmm2,144(%r10)
2223	movdqa	%xmm4,%xmm2
2224
2225	paddd	%xmm0,%xmm1
2226	pcmpeqd	%xmm5,%xmm0
2227	movdqa	%xmm3,160(%r10)
2228	movdqa	%xmm4,%xmm3
2229	paddd	%xmm1,%xmm2
2230	pcmpeqd	%xmm5,%xmm1
2231	movdqa	%xmm0,176(%r10)
2232	movdqa	%xmm4,%xmm0
2233
2234	paddd	%xmm2,%xmm3
2235	pcmpeqd	%xmm5,%xmm2
2236	movdqa	%xmm1,192(%r10)
2237	movdqa	%xmm4,%xmm1
2238
2239	paddd	%xmm3,%xmm0
2240	pcmpeqd	%xmm5,%xmm3
2241	movdqa	%xmm2,208(%r10)
2242	movdqa	%xmm4,%xmm2
2243
2244	paddd	%xmm0,%xmm1
2245	pcmpeqd	%xmm5,%xmm0
2246	movdqa	%xmm3,224(%r10)
2247	movdqa	%xmm4,%xmm3
2248	paddd	%xmm1,%xmm2
2249	pcmpeqd	%xmm5,%xmm1
2250	movdqa	%xmm0,240(%r10)
2251	movdqa	%xmm4,%xmm0
2252
2253	paddd	%xmm2,%xmm3
2254	pcmpeqd	%xmm5,%xmm2
2255	movdqa	%xmm1,256(%r10)
2256	movdqa	%xmm4,%xmm1
2257
2258	paddd	%xmm3,%xmm0
2259	pcmpeqd	%xmm5,%xmm3
2260	movdqa	%xmm2,272(%r10)
2261	movdqa	%xmm4,%xmm2
2262
2263	paddd	%xmm0,%xmm1
2264	pcmpeqd	%xmm5,%xmm0
2265	movdqa	%xmm3,288(%r10)
2266	movdqa	%xmm4,%xmm3
2267.byte	0x67
2268	paddd	%xmm1,%xmm2
2269	pcmpeqd	%xmm5,%xmm1
2270	movdqa	%xmm0,304(%r10)
2271
2272	paddd	%xmm2,%xmm3
2273	pcmpeqd	%xmm5,%xmm2
2274	movdqa	%xmm1,320(%r10)
2275
2276	pcmpeqd	%xmm5,%xmm3
2277	movdqa	%xmm2,336(%r10)
2278
2279	pand	64(%rdi),%xmm0
2280	pand	80(%rdi),%xmm1
2281	pand	96(%rdi),%xmm2
2282	movdqa	%xmm3,352(%r10)
2283	pand	112(%rdi),%xmm3
2284	por	%xmm2,%xmm0
2285	por	%xmm3,%xmm1
2286	movdqa	-128(%rdi),%xmm4
2287	movdqa	-112(%rdi),%xmm5
2288	movdqa	-96(%rdi),%xmm2
2289	pand	112(%r10),%xmm4
2290	movdqa	-80(%rdi),%xmm3
2291	pand	128(%r10),%xmm5
2292	por	%xmm4,%xmm0
2293	pand	144(%r10),%xmm2
2294	por	%xmm5,%xmm1
2295	pand	160(%r10),%xmm3
2296	por	%xmm2,%xmm0
2297	por	%xmm3,%xmm1
2298	movdqa	-64(%rdi),%xmm4
2299	movdqa	-48(%rdi),%xmm5
2300	movdqa	-32(%rdi),%xmm2
2301	pand	176(%r10),%xmm4
2302	movdqa	-16(%rdi),%xmm3
2303	pand	192(%r10),%xmm5
2304	por	%xmm4,%xmm0
2305	pand	208(%r10),%xmm2
2306	por	%xmm5,%xmm1
2307	pand	224(%r10),%xmm3
2308	por	%xmm2,%xmm0
2309	por	%xmm3,%xmm1
2310	movdqa	0(%rdi),%xmm4
2311	movdqa	16(%rdi),%xmm5
2312	movdqa	32(%rdi),%xmm2
2313	pand	240(%r10),%xmm4
2314	movdqa	48(%rdi),%xmm3
2315	pand	256(%r10),%xmm5
2316	por	%xmm4,%xmm0
2317	pand	272(%r10),%xmm2
2318	por	%xmm5,%xmm1
2319	pand	288(%r10),%xmm3
2320	por	%xmm2,%xmm0
2321	por	%xmm3,%xmm1
2322	pxor	%xmm1,%xmm0
2323
2324	pshufd	$0x4e,%xmm0,%xmm1
2325	por	%xmm1,%xmm0
2326	leaq	256(%rdi),%rdi
2327.byte	102,72,15,126,194
2328	leaq	64+32+8(%rsp),%rbx
2329
2330	movq	%rdx,%r9
2331	mulxq	0(%rsi),%r8,%rax
2332	mulxq	8(%rsi),%r11,%r12
2333	addq	%rax,%r11
2334	mulxq	16(%rsi),%rax,%r13
2335	adcq	%rax,%r12
2336	adcq	$0,%r13
2337	mulxq	24(%rsi),%rax,%r14
2338
2339	movq	%r8,%r15
2340	imulq	32+8(%rsp),%r8
2341	xorq	%rbp,%rbp
2342	movq	%r8,%rdx
2343
2344	movq	%rdi,8+8(%rsp)
2345
2346	leaq	32(%rsi),%rsi
2347	adcxq	%rax,%r13
2348	adcxq	%rbp,%r14
2349
2350	mulxq	0(%rcx),%rax,%r10
2351	adcxq	%rax,%r15
2352	adoxq	%r11,%r10
2353	mulxq	8(%rcx),%rax,%r11
2354	adcxq	%rax,%r10
2355	adoxq	%r12,%r11
2356	mulxq	16(%rcx),%rax,%r12
2357	movq	24+8(%rsp),%rdi
2358	movq	%r10,-32(%rbx)
2359	adcxq	%rax,%r11
2360	adoxq	%r13,%r12
2361	mulxq	24(%rcx),%rax,%r15
2362	movq	%r9,%rdx
2363	movq	%r11,-24(%rbx)
2364	adcxq	%rax,%r12
2365	adoxq	%rbp,%r15
2366	leaq	32(%rcx),%rcx
2367	movq	%r12,-16(%rbx)
2368	jmp	L$mulx4x_1st
2369
2370.p2align	5
2371L$mulx4x_1st:
2372	adcxq	%rbp,%r15
2373	mulxq	0(%rsi),%r10,%rax
2374	adcxq	%r14,%r10
2375	mulxq	8(%rsi),%r11,%r14
2376	adcxq	%rax,%r11
2377	mulxq	16(%rsi),%r12,%rax
2378	adcxq	%r14,%r12
2379	mulxq	24(%rsi),%r13,%r14
2380.byte	0x67,0x67
2381	movq	%r8,%rdx
2382	adcxq	%rax,%r13
2383	adcxq	%rbp,%r14
2384	leaq	32(%rsi),%rsi
2385	leaq	32(%rbx),%rbx
2386
2387	adoxq	%r15,%r10
2388	mulxq	0(%rcx),%rax,%r15
2389	adcxq	%rax,%r10
2390	adoxq	%r15,%r11
2391	mulxq	8(%rcx),%rax,%r15
2392	adcxq	%rax,%r11
2393	adoxq	%r15,%r12
2394	mulxq	16(%rcx),%rax,%r15
2395	movq	%r10,-40(%rbx)
2396	adcxq	%rax,%r12
2397	movq	%r11,-32(%rbx)
2398	adoxq	%r15,%r13
2399	mulxq	24(%rcx),%rax,%r15
2400	movq	%r9,%rdx
2401	movq	%r12,-24(%rbx)
2402	adcxq	%rax,%r13
2403	adoxq	%rbp,%r15
2404	leaq	32(%rcx),%rcx
2405	movq	%r13,-16(%rbx)
2406
2407	decq	%rdi
2408	jnz	L$mulx4x_1st
2409
2410	movq	8(%rsp),%rax
2411	adcq	%rbp,%r15
2412	leaq	(%rsi,%rax,1),%rsi
2413	addq	%r15,%r14
2414	movq	8+8(%rsp),%rdi
2415	adcq	%rbp,%rbp
2416	movq	%r14,-8(%rbx)
2417	jmp	L$mulx4x_outer
2418
2419.p2align	5
2420L$mulx4x_outer:
2421	leaq	16-256(%rbx),%r10
2422	pxor	%xmm4,%xmm4
2423.byte	0x67,0x67
2424	pxor	%xmm5,%xmm5
2425	movdqa	-128(%rdi),%xmm0
2426	movdqa	-112(%rdi),%xmm1
2427	movdqa	-96(%rdi),%xmm2
2428	pand	256(%r10),%xmm0
2429	movdqa	-80(%rdi),%xmm3
2430	pand	272(%r10),%xmm1
2431	por	%xmm0,%xmm4
2432	pand	288(%r10),%xmm2
2433	por	%xmm1,%xmm5
2434	pand	304(%r10),%xmm3
2435	por	%xmm2,%xmm4
2436	por	%xmm3,%xmm5
2437	movdqa	-64(%rdi),%xmm0
2438	movdqa	-48(%rdi),%xmm1
2439	movdqa	-32(%rdi),%xmm2
2440	pand	320(%r10),%xmm0
2441	movdqa	-16(%rdi),%xmm3
2442	pand	336(%r10),%xmm1
2443	por	%xmm0,%xmm4
2444	pand	352(%r10),%xmm2
2445	por	%xmm1,%xmm5
2446	pand	368(%r10),%xmm3
2447	por	%xmm2,%xmm4
2448	por	%xmm3,%xmm5
2449	movdqa	0(%rdi),%xmm0
2450	movdqa	16(%rdi),%xmm1
2451	movdqa	32(%rdi),%xmm2
2452	pand	384(%r10),%xmm0
2453	movdqa	48(%rdi),%xmm3
2454	pand	400(%r10),%xmm1
2455	por	%xmm0,%xmm4
2456	pand	416(%r10),%xmm2
2457	por	%xmm1,%xmm5
2458	pand	432(%r10),%xmm3
2459	por	%xmm2,%xmm4
2460	por	%xmm3,%xmm5
2461	movdqa	64(%rdi),%xmm0
2462	movdqa	80(%rdi),%xmm1
2463	movdqa	96(%rdi),%xmm2
2464	pand	448(%r10),%xmm0
2465	movdqa	112(%rdi),%xmm3
2466	pand	464(%r10),%xmm1
2467	por	%xmm0,%xmm4
2468	pand	480(%r10),%xmm2
2469	por	%xmm1,%xmm5
2470	pand	496(%r10),%xmm3
2471	por	%xmm2,%xmm4
2472	por	%xmm3,%xmm5
2473	por	%xmm5,%xmm4
2474
2475	pshufd	$0x4e,%xmm4,%xmm0
2476	por	%xmm4,%xmm0
2477	leaq	256(%rdi),%rdi
2478.byte	102,72,15,126,194
2479
2480	movq	%rbp,(%rbx)
2481	leaq	32(%rbx,%rax,1),%rbx
2482	mulxq	0(%rsi),%r8,%r11
2483	xorq	%rbp,%rbp
2484	movq	%rdx,%r9
2485	mulxq	8(%rsi),%r14,%r12
2486	adoxq	-32(%rbx),%r8
2487	adcxq	%r14,%r11
2488	mulxq	16(%rsi),%r15,%r13
2489	adoxq	-24(%rbx),%r11
2490	adcxq	%r15,%r12
2491	mulxq	24(%rsi),%rdx,%r14
2492	adoxq	-16(%rbx),%r12
2493	adcxq	%rdx,%r13
2494	leaq	(%rcx,%rax,1),%rcx
2495	leaq	32(%rsi),%rsi
2496	adoxq	-8(%rbx),%r13
2497	adcxq	%rbp,%r14
2498	adoxq	%rbp,%r14
2499
2500	movq	%r8,%r15
2501	imulq	32+8(%rsp),%r8
2502
2503	movq	%r8,%rdx
2504	xorq	%rbp,%rbp
2505	movq	%rdi,8+8(%rsp)
2506
2507	mulxq	0(%rcx),%rax,%r10
2508	adcxq	%rax,%r15
2509	adoxq	%r11,%r10
2510	mulxq	8(%rcx),%rax,%r11
2511	adcxq	%rax,%r10
2512	adoxq	%r12,%r11
2513	mulxq	16(%rcx),%rax,%r12
2514	adcxq	%rax,%r11
2515	adoxq	%r13,%r12
2516	mulxq	24(%rcx),%rax,%r15
2517	movq	%r9,%rdx
2518	movq	24+8(%rsp),%rdi
2519	movq	%r10,-32(%rbx)
2520	adcxq	%rax,%r12
2521	movq	%r11,-24(%rbx)
2522	adoxq	%rbp,%r15
2523	movq	%r12,-16(%rbx)
2524	leaq	32(%rcx),%rcx
2525	jmp	L$mulx4x_inner
2526
2527.p2align	5
2528L$mulx4x_inner:
2529	mulxq	0(%rsi),%r10,%rax
2530	adcxq	%rbp,%r15
2531	adoxq	%r14,%r10
2532	mulxq	8(%rsi),%r11,%r14
2533	adcxq	0(%rbx),%r10
2534	adoxq	%rax,%r11
2535	mulxq	16(%rsi),%r12,%rax
2536	adcxq	8(%rbx),%r11
2537	adoxq	%r14,%r12
2538	mulxq	24(%rsi),%r13,%r14
2539	movq	%r8,%rdx
2540	adcxq	16(%rbx),%r12
2541	adoxq	%rax,%r13
2542	adcxq	24(%rbx),%r13
2543	adoxq	%rbp,%r14
2544	leaq	32(%rsi),%rsi
2545	leaq	32(%rbx),%rbx
2546	adcxq	%rbp,%r14
2547
2548	adoxq	%r15,%r10
2549	mulxq	0(%rcx),%rax,%r15
2550	adcxq	%rax,%r10
2551	adoxq	%r15,%r11
2552	mulxq	8(%rcx),%rax,%r15
2553	adcxq	%rax,%r11
2554	adoxq	%r15,%r12
2555	mulxq	16(%rcx),%rax,%r15
2556	movq	%r10,-40(%rbx)
2557	adcxq	%rax,%r12
2558	adoxq	%r15,%r13
2559	movq	%r11,-32(%rbx)
2560	mulxq	24(%rcx),%rax,%r15
2561	movq	%r9,%rdx
2562	leaq	32(%rcx),%rcx
2563	movq	%r12,-24(%rbx)
2564	adcxq	%rax,%r13
2565	adoxq	%rbp,%r15
2566	movq	%r13,-16(%rbx)
2567
2568	decq	%rdi
2569	jnz	L$mulx4x_inner
2570
2571	movq	0+8(%rsp),%rax
2572	adcq	%rbp,%r15
2573	subq	0(%rbx),%rdi
2574	movq	8+8(%rsp),%rdi
2575	movq	16+8(%rsp),%r10
2576	adcq	%r15,%r14
2577	leaq	(%rsi,%rax,1),%rsi
2578	adcq	%rbp,%rbp
2579	movq	%r14,-8(%rbx)
2580
2581	cmpq	%r10,%rdi
2582	jb	L$mulx4x_outer
2583
2584	movq	-8(%rcx),%r10
2585	movq	%rbp,%r8
2586	movq	(%rcx,%rax,1),%r12
2587	leaq	(%rcx,%rax,1),%rbp
2588	movq	%rax,%rcx
2589	leaq	(%rbx,%rax,1),%rdi
2590	xorl	%eax,%eax
2591	xorq	%r15,%r15
2592	subq	%r14,%r10
2593	adcq	%r15,%r15
2594	orq	%r15,%r8
2595	sarq	$3+2,%rcx
2596	subq	%r8,%rax
2597	movq	56+8(%rsp),%rdx
2598	decq	%r12
2599	movq	8(%rbp),%r13
2600	xorq	%r8,%r8
2601	movq	16(%rbp),%r14
2602	movq	24(%rbp),%r15
2603	jmp	L$sqrx4x_sub_entry
2604
2605
2606
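// bn_powerx5: BMI2/ADX variant of bn_power5 (five squarings via
// __bn_sqrx8x_internal/__bn_postx4x_internal, then one gathered
// multiplication via mulx4x_internal).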
2607.p2align	5
2608bn_powerx5:
2609
2610	movq	%rsp,%rax
2611
2612L$powerx5_enter:
2613	pushq	%rbx
2614
2615	pushq	%rbp
2616
2617	pushq	%r12
2618
2619	pushq	%r13
2620
2621	pushq	%r14
2622
2623	pushq	%r15
2624
2625L$powerx5_prologue:
2626
2627	shll	$3,%r9d
2628	leaq	(%r9,%r9,2),%r10
2629	negq	%r9
2630	movq	(%r8),%r8
2631
2632
2633
2634
2635
2636
2637
2638
2639	leaq	-320(%rsp,%r9,2),%r11
2640	movq	%rsp,%rbp
2641	subq	%rdi,%r11
2642	andq	$4095,%r11
2643	cmpq	%r11,%r10
2644	jb	L$pwrx_sp_alt
2645	subq	%r11,%rbp
2646	leaq	-320(%rbp,%r9,2),%rbp
2647	jmp	L$pwrx_sp_done
2648
2649.p2align	5
2650L$pwrx_sp_alt:
2651	leaq	4096-320(,%r9,2),%r10
2652	leaq	-320(%rbp,%r9,2),%rbp
2653	subq	%r10,%r11
2654	movq	$0,%r10
2655	cmovcq	%r10,%r11
2656	subq	%r11,%rbp
2657L$pwrx_sp_done:
2658	andq	$-64,%rbp
2659	movq	%rsp,%r11
2660	subq	%rbp,%r11
2661	andq	$-4096,%r11
2662	leaq	(%r11,%rbp,1),%rsp
2663	movq	(%rsp),%r10
2664	cmpq	%rbp,%rsp
2665	ja	L$pwrx_page_walk
2666	jmp	L$pwrx_page_walk_done
2667
2668L$pwrx_page_walk:
2669	leaq	-4096(%rsp),%rsp
2670	movq	(%rsp),%r10
2671	cmpq	%rbp,%rsp
2672	ja	L$pwrx_page_walk
2673L$pwrx_page_walk_done:
2674
2675	movq	%r9,%r10
2676	negq	%r9
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689	pxor	%xmm0,%xmm0
2690.byte	102,72,15,110,207
2691.byte	102,72,15,110,209
2692.byte	102,73,15,110,218
2693.byte	102,72,15,110,226
2694	movq	%r8,32(%rsp)
2695	movq	%rax,40(%rsp)
2696
2697L$powerx5_body:
2698
2699	call	__bn_sqrx8x_internal
2700	call	__bn_postx4x_internal
2701	call	__bn_sqrx8x_internal
2702	call	__bn_postx4x_internal
2703	call	__bn_sqrx8x_internal
2704	call	__bn_postx4x_internal
2705	call	__bn_sqrx8x_internal
2706	call	__bn_postx4x_internal
2707	call	__bn_sqrx8x_internal
2708	call	__bn_postx4x_internal
2709
2710	movq	%r10,%r9
2711	movq	%rsi,%rdi
2712.byte	102,72,15,126,209
2713.byte	102,72,15,126,226
2714	movq	40(%rsp),%rax
2715
2716	call	mulx4x_internal
2717
2718	movq	40(%rsp),%rsi
2719
2720	movq	$1,%rax
2721
2722	movq	-48(%rsi),%r15
2723
2724	movq	-40(%rsi),%r14
2725
2726	movq	-32(%rsi),%r13
2727
2728	movq	-24(%rsi),%r12
2729
2730	movq	-16(%rsi),%rbp
2731
2732	movq	-8(%rsi),%rbx
2733
2734	leaq	(%rsi),%rsp
2735
2736L$powerx5_epilogue:
2737	ret
2738
2739
2740
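// bn_sqrx8x_internal / __bn_sqrx8x_internal: MULX-based Montgomery squaring
// core, the BMI2/ADX counterpart of __bn_sqr8x_internal; it likewise falls
// through into __bn_sqrx8x_reduction.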
2741.globl	_bn_sqrx8x_internal
2742.private_extern _bn_sqrx8x_internal
2744
2745.p2align	5
2746_bn_sqrx8x_internal:
2747__bn_sqrx8x_internal:
2748
2749_CET_ENDBR
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790	leaq	48+8(%rsp),%rdi
2791	leaq	(%rsi,%r9,1),%rbp
2792	movq	%r9,0+8(%rsp)
2793	movq	%rbp,8+8(%rsp)
2794	jmp	L$sqr8x_zero_start
2795
2796.p2align	5
2797.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2798L$sqrx8x_zero:
2799.byte	0x3e
2800	movdqa	%xmm0,0(%rdi)
2801	movdqa	%xmm0,16(%rdi)
2802	movdqa	%xmm0,32(%rdi)
2803	movdqa	%xmm0,48(%rdi)
2804L$sqr8x_zero_start:
2805	movdqa	%xmm0,64(%rdi)
2806	movdqa	%xmm0,80(%rdi)
2807	movdqa	%xmm0,96(%rdi)
2808	movdqa	%xmm0,112(%rdi)
2809	leaq	128(%rdi),%rdi
2810	subq	$64,%r9
2811	jnz	L$sqrx8x_zero
2812
2813	movq	0(%rsi),%rdx
2814
2815	xorq	%r10,%r10
2816	xorq	%r11,%r11
2817	xorq	%r12,%r12
2818	xorq	%r13,%r13
2819	xorq	%r14,%r14
2820	xorq	%r15,%r15
2821	leaq	48+8(%rsp),%rdi
2822	xorq	%rbp,%rbp
2823	jmp	L$sqrx8x_outer_loop
2824
2825.p2align	5
2826L$sqrx8x_outer_loop:
2827	mulxq	8(%rsi),%r8,%rax
2828	adcxq	%r9,%r8
2829	adoxq	%rax,%r10
2830	mulxq	16(%rsi),%r9,%rax
2831	adcxq	%r10,%r9
2832	adoxq	%rax,%r11
2833.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2834	adcxq	%r11,%r10
2835	adoxq	%rax,%r12
2836.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2837	adcxq	%r12,%r11
2838	adoxq	%rax,%r13
2839	mulxq	40(%rsi),%r12,%rax
2840	adcxq	%r13,%r12
2841	adoxq	%rax,%r14
2842	mulxq	48(%rsi),%r13,%rax
2843	adcxq	%r14,%r13
2844	adoxq	%r15,%rax
2845	mulxq	56(%rsi),%r14,%r15
2846	movq	8(%rsi),%rdx
2847	adcxq	%rax,%r14
2848	adoxq	%rbp,%r15
2849	adcq	64(%rdi),%r15
2850	movq	%r8,8(%rdi)
2851	movq	%r9,16(%rdi)
2852	sbbq	%rcx,%rcx
2853	xorq	%rbp,%rbp
2854
2855
2856	mulxq	16(%rsi),%r8,%rbx
2857	mulxq	24(%rsi),%r9,%rax
2858	adcxq	%r10,%r8
2859	adoxq	%rbx,%r9
2860	mulxq	32(%rsi),%r10,%rbx
2861	adcxq	%r11,%r9
2862	adoxq	%rax,%r10
2863.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2864	adcxq	%r12,%r10
2865	adoxq	%rbx,%r11
2866.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2867	adcxq	%r13,%r11
2868	adoxq	%r14,%r12
2869.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2870	movq	16(%rsi),%rdx
2871	adcxq	%rax,%r12
2872	adoxq	%rbx,%r13
2873	adcxq	%r15,%r13
2874	adoxq	%rbp,%r14
2875	adcxq	%rbp,%r14
2876
2877	movq	%r8,24(%rdi)
2878	movq	%r9,32(%rdi)
2879
2880	mulxq	24(%rsi),%r8,%rbx
2881	mulxq	32(%rsi),%r9,%rax
2882	adcxq	%r10,%r8
2883	adoxq	%rbx,%r9
2884	mulxq	40(%rsi),%r10,%rbx
2885	adcxq	%r11,%r9
2886	adoxq	%rax,%r10
2887.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2888	adcxq	%r12,%r10
2889	adoxq	%r13,%r11
2890.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
2891.byte	0x3e
2892	movq	24(%rsi),%rdx
2893	adcxq	%rbx,%r11
2894	adoxq	%rax,%r12
2895	adcxq	%r14,%r12
2896	movq	%r8,40(%rdi)
2897	movq	%r9,48(%rdi)
2898	mulxq	32(%rsi),%r8,%rax
2899	adoxq	%rbp,%r13
2900	adcxq	%rbp,%r13
2901
2902	mulxq	40(%rsi),%r9,%rbx
2903	adcxq	%r10,%r8
2904	adoxq	%rax,%r9
2905	mulxq	48(%rsi),%r10,%rax
2906	adcxq	%r11,%r9
2907	adoxq	%r12,%r10
2908	mulxq	56(%rsi),%r11,%r12
2909	movq	32(%rsi),%rdx
2910	movq	40(%rsi),%r14
2911	adcxq	%rbx,%r10
2912	adoxq	%rax,%r11
2913	movq	48(%rsi),%r15
2914	adcxq	%r13,%r11
2915	adoxq	%rbp,%r12
2916	adcxq	%rbp,%r12
2917
2918	movq	%r8,56(%rdi)
2919	movq	%r9,64(%rdi)
2920
2921	mulxq	%r14,%r9,%rax
2922	movq	56(%rsi),%r8
2923	adcxq	%r10,%r9
2924	mulxq	%r15,%r10,%rbx
2925	adoxq	%rax,%r10
2926	adcxq	%r11,%r10
2927	mulxq	%r8,%r11,%rax
2928	movq	%r14,%rdx
2929	adoxq	%rbx,%r11
2930	adcxq	%r12,%r11
2931
2932	adcxq	%rbp,%rax
2933
2934	mulxq	%r15,%r14,%rbx
2935	mulxq	%r8,%r12,%r13
2936	movq	%r15,%rdx
2937	leaq	64(%rsi),%rsi
2938	adcxq	%r14,%r11
2939	adoxq	%rbx,%r12
2940	adcxq	%rax,%r12
2941	adoxq	%rbp,%r13
2942
2943.byte	0x67,0x67
2944	mulxq	%r8,%r8,%r14
2945	adcxq	%r8,%r13
2946	adcxq	%rbp,%r14
2947
2948	cmpq	8+8(%rsp),%rsi
2949	je	L$sqrx8x_outer_break
2950
2951	negq	%rcx
2952	movq	$-8,%rcx
2953	movq	%rbp,%r15
2954	movq	64(%rdi),%r8
2955	adcxq	72(%rdi),%r9
2956	adcxq	80(%rdi),%r10
2957	adcxq	88(%rdi),%r11
2958	adcq	96(%rdi),%r12
2959	adcq	104(%rdi),%r13
2960	adcq	112(%rdi),%r14
2961	adcq	120(%rdi),%r15
2962	leaq	(%rsi),%rbp
2963	leaq	128(%rdi),%rdi
2964	sbbq	%rax,%rax
2965
2966	movq	-64(%rsi),%rdx
2967	movq	%rax,16+8(%rsp)
2968	movq	%rdi,24+8(%rsp)
2969
2970
2971	xorl	%eax,%eax
2972	jmp	L$sqrx8x_loop
2973
.p2align	5
L$sqrx8x_loop:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	movq	%rbx,(%rdi,%rcx,8)
	movl	$0,%ebx
	adcxq	%rax,%r13
	adoxq	%r15,%r14

.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
	movq	8(%rsi,%rcx,8),%rdx
	adcxq	%rax,%r14
	adoxq	%rbx,%r15
	adcxq	%rbx,%r15

.byte	0x67
	incq	%rcx
	jnz	L$sqrx8x_loop

	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	cmpq	8+8(%rsp),%rbp
	je	L$sqrx8x_break

	subq	16+8(%rsp),%rbx
.byte	0x66
	movq	-64(%rsi),%rdx
	adcxq	0(%rdi),%r8
	adcxq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
.byte	0x67
	sbbq	%rax,%rax
	xorl	%ebx,%ebx
	movq	%rax,16+8(%rsp)
	jmp	L$sqrx8x_loop

.p2align	5
L$sqrx8x_break:
	xorq	%rbp,%rbp
	subq	16+8(%rsp),%rbx
	adcxq	%rbp,%r8
	movq	24+8(%rsp),%rcx
	adcxq	%rbp,%r9
	movq	0(%rsi),%rdx
	adcq	$0,%r10
	movq	%r8,0(%rdi)
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	cmpq	%rcx,%rdi
	je	L$sqrx8x_outer_loop

	movq	%r9,8(%rdi)
	movq	8(%rcx),%r9
	movq	%r10,16(%rdi)
	movq	16(%rcx),%r10
	movq	%r11,24(%rdi)
	movq	24(%rcx),%r11
	movq	%r12,32(%rdi)
	movq	32(%rcx),%r12
	movq	%r13,40(%rdi)
	movq	40(%rcx),%r13
	movq	%r14,48(%rdi)
	movq	48(%rcx),%r14
	movq	%r15,56(%rdi)
	movq	56(%rcx),%r15
	movq	%rcx,%rdi
	jmp	L$sqrx8x_outer_loop

.p2align	5
L$sqrx8x_outer_break:
	movq	%r9,72(%rdi)
.byte	102,72,15,126,217
	movq	%r10,80(%rdi)
	movq	%r11,88(%rdi)
	movq	%r12,96(%rdi)
	movq	%r13,104(%rdi)
	movq	%r14,112(%rdi)
	leaq	48+8(%rsp),%rdi
	movq	(%rsi,%rcx,1),%rdx

	movq	8(%rdi),%r11
	xorq	%r10,%r10
	movq	0+8(%rsp),%r9
	adoxq	%r11,%r11
	movq	16(%rdi),%r12
	movq	24(%rdi),%r13


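// Shift-and-add pass: adox reg,reg doubles the accumulated cross products
// (a one-bit left shift carried across limbs via the OF chain), while
// mulx %rdx,%rax,%rbx supplies the a[i]^2 diagonal terms added via adcx.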
.p2align	5
L$sqrx4x_shift_n_add:
	mulxq	%rdx,%rax,%rbx
	adoxq	%r12,%r12
	adcxq	%r10,%rax
.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	40(%rdi),%r11
	movq	%rax,0(%rdi)
	movq	%rbx,8(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	movq	16(%rsi,%rcx,1),%rdx
	movq	48(%rdi),%r12
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	56(%rdi),%r13
	movq	%rax,16(%rdi)
	movq	%rbx,24(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r12,%r12
	adcxq	%r10,%rax
	movq	24(%rsi,%rcx,1),%rdx
	leaq	32(%rcx),%rcx
	movq	64(%rdi),%r10
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	72(%rdi),%r11
	movq	%rax,32(%rdi)
	movq	%rbx,40(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	jrcxz	L$sqrx4x_shift_n_add_break
.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	80(%rdi),%r12
	movq	88(%rdi),%r13
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
	nop
	jmp	L$sqrx4x_shift_n_add

.p2align	5
L$sqrx4x_shift_n_add_break:
	adcxq	%r13,%rbx
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
.byte	102,72,15,126,213
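// Montgomery reduction of the double-width square held at 48+8(%rsp):
// %rbx is loaded with the word at 32+8(%rsp) (the n0 constant), the modulus
// limbs are read through %rbp, and each outer iteration reduces one
// eight-limb block of the temporary result in place.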
__bn_sqrx8x_reduction:
	xorl	%eax,%eax
	movq	32+8(%rsp),%rbx
	movq	48+8(%rsp),%rdx
	leaq	-64(%rbp,%r9,1),%rcx

	movq	%rcx,0+8(%rsp)
	movq	%rdi,8+8(%rsp)

	leaq	48+8(%rsp),%rdi
	jmp	L$sqrx8x_reduction_loop

.p2align	5
L$sqrx8x_reduction_loop:
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	%rdx,%r8
	imulq	%rbx,%rdx
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,24+8(%rsp)

	leaq	64(%rdi),%rdi
	xorq	%rsi,%rsi
	movq	$-8,%rcx
	jmp	L$sqrx8x_reduce

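// One reduction pass: %rdx carries the current multiplier (t[i]*n0 mod 2^64);
// it is multiplied by all eight modulus limbs and accumulated, and each
// multiplier is saved at 64+48+8(%rsp,%rcx,8) for the tail passes below.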
.p2align	5
L$sqrx8x_reduce:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rbx,%rax
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rbx,%r9
	adcxq	%rbx,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
	movq	%rdx,%rax
	movq	%r8,%rdx
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	32+8(%rsp),%rbx,%rdx
	movq	%rax,%rdx
	movq	%rax,64+48+8(%rsp,%rcx,8)

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	%rbx,%rdx
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	adcxq	%rsi,%r15

.byte	0x67,0x67,0x67
	incq	%rcx
	jnz	L$sqrx8x_reduce

	movq	%rsi,%rax
	cmpq	0+8(%rsp),%rbp
	jae	L$sqrx8x_no_tail

	movq	48+8(%rsp),%rdx
	addq	0(%rdi),%r8
	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	adcxq	8(%rdi),%r9
	adcxq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	L$sqrx8x_tail

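// Tail passes: replay the saved multipliers against the remaining limbs of
// the modulus, carrying the partial sums through the temporary result.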
.p2align	5
L$sqrx8x_tail:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	72+48+8(%rsp,%rcx,8),%rdx
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	movq	%rbx,(%rdi,%rcx,8)
	movq	%r8,%rbx
	adcxq	%rsi,%r15

	incq	%rcx
	jnz	L$sqrx8x_tail

	cmpq	0+8(%rsp),%rbp
	jae	L$sqrx8x_tail_done

	subq	16+8(%rsp),%rsi
	movq	48+8(%rsp),%rdx
	leaq	64(%rbp),%rbp
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax
	subq	$8,%rcx

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	L$sqrx8x_tail

.p2align	5
L$sqrx8x_tail_done:
	xorq	%rax,%rax
	addq	24+8(%rsp),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	subq	16+8(%rsp),%rsi
L$sqrx8x_no_tail:
	adcq	0(%rdi),%r8
.byte	102,72,15,126,217
	adcq	8(%rdi),%r9
	movq	56(%rbp),%rsi
.byte	102,72,15,126,213
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax

	movq	32+8(%rsp),%rbx
	movq	64(%rdi,%rcx,1),%rdx

	movq	%r8,0(%rdi)
	leaq	64(%rdi),%r8
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	64(%rdi,%rcx,1),%rdi
	cmpq	8+8(%rsp),%r8
	jb	L$sqrx8x_reduction_loop
	ret


.p2align	5

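// Final constant-time conditional subtraction, four limbs per iteration:
// %rax is negated into an all-ones or all-zero mask, andn combines it with
// the modulus limbs, and the adc chain then writes either t or t minus the
// modulus to the destination in %rdx, with no data-dependent branch.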
__bn_postx4x_internal:

	movq	0(%rbp),%r12
	movq	%rcx,%r10
	movq	%rcx,%r9
	negq	%rax
	sarq	$3+2,%rcx

.byte	102,72,15,126,202
.byte	102,72,15,126,206
	decq	%r12
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	L$sqrx4x_sub_entry

.p2align	4
L$sqrx4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
L$sqrx4x_sub_entry:
	andnq	%rax,%r12,%r12
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13
	andnq	%rax,%r14,%r14
	andnq	%rax,%r15,%r15

	negq	%r8
	adcq	0(%rdi),%r12
	adcq	8(%rdi),%r13
	adcq	16(%rdi),%r14
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8
	movq	%r14,16(%rdx)
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx
	jnz	L$sqrx4x_sub

	negq	%r9

	ret


.globl	_bn_scatter5
.private_extern _bn_scatter5

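// bn_scatter5: copy %esi limbs from %rdi into the power table at %rdx,
// starting at offset %rcx*8 and stepping 256 bytes per limb, so one value
// occupies a single 8-byte column of each 256-byte row (no-op if the count
// is zero).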
.p2align	4
_bn_scatter5:

_CET_ENDBR
	cmpl	$0,%esi
	jz	L$scatter_epilogue








	leaq	(%rdx,%rcx,8),%rdx
L$scatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx
	subl	$1,%esi
	jnz	L$scatter
L$scatter_epilogue:
	ret



.globl	_bn_gather5
.private_extern _bn_gather5

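// bn_gather5: constant-time lookup of entry %ecx from the table at %rdx.
// Sixteen 128-bit equality masks (two 8-byte columns each, 32 columns in
// total) are built on the stack from L$inc, then every row of the table is
// read and masked, so the memory access pattern does not depend on the index.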
.p2align	5
_bn_gather5:

L$SEH_begin_bn_gather5:
_CET_ENDBR

.byte	0x4c,0x8d,0x14,0x24

.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
	leaq	L$inc(%rip),%rax
	andq	$-16,%rsp

	movd	%ecx,%xmm5
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	128(%rdx),%r11
	leaq	128(%rsp),%rax

	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)
	jmp	L$gather

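// One iteration per output limb: AND all 32 candidate limbs in the current
// 256-byte row against their masks, OR them together, and store the single
// surviving 64-bit value to %rdi.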
.p2align	5
L$gather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	leaq	256(%r11),%r11

	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	L$gather

	leaq	(%r10),%rsp

	ret
L$SEH_end_bn_gather5:


.section	__DATA,__const
.p2align	6
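// Counter seed and increment used to generate the per-column index values
// that are compared against the requested index when building the
// selection masks in bn_mul_mont_gather5 and bn_gather5.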
L$inc:
.long	0,0, 1,1
.long	2,2, 2,2
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.text
#endif
