1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <ring-core/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
7#include <ring-core/arm_arch.h>
8
9.text
10
.globl	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
// int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//                 const BN_ULONG *np, const BN_ULONG *n0p, int num);
// Montgomery multiplication: rp[] = ap[]*bp[] / 2^(64*num) mod np[].
// AAPCS64 arguments: x0=rp, x1=ap, x2=bp, x3=np (modulus), x4=&n0
// (n0 = -np^-1 mod 2^64), x5=num (limb count). Returns 1 in x0.
// A num-limb temporary tp[] is alloca'd below sp. Dispatches to the
// 8x (squaring-capable) or 4x code paths when num is a multiple of
// 8 or 4; otherwise runs the generic one-limb-at-a-time loop below.
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	tst	x5,#7
	b.eq	__bn_sqr8x_mont		// num%8==0: 8x path (handles ap==bp too)
	tst	x5,#3
	b.eq	__bn_mul4x_mont		// num%4==0: 4x path
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!	// frame + save callee-saved x19-x24
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3		// reserve num limbs for tp[]
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3		// from here on x5 = num in bytes
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

// First outer iteration (i=0): walk remaining limbs j, accumulating
// ap[j]*bp[0] and the reduction term np[j]*m1 into tp[].
.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7		// propagate a-side carry chain
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13		// propagate n-side carry chain
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6		// merge the two chains
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7		// flush last a-side product
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13		// flush last n-side product
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]		// tp[num-2], tp[num-1]

// Outer loop over the remaining b-limbs (i=1..num-1); each pass adds
// ap[]*bp[i] to tp[] and folds in one Montgomery reduction step.
.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23		// + tp[0]
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4		// m1 = tp[0]*n0 for this pass
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*) same carry trick as above
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23		// + tp[j]
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19		// fold in previous top carry
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14		// last limb
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	// Constant-time select between tp (if the subtraction borrowed)
	// and tp-np (already stored in rp), wiping tp[] as we go.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo		// last limb
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	// Epilogue: restore callee-saved regs, pop frame, return 1.
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	bn_mul_mont,.-bn_mul_mont
.type	__bn_sqr8x_mont,%function
.align	5
// Squaring-optimized Montgomery path, entered when num%8==0. If ap!=bp
// it tail-branches to __bn_mul4x_mont. Strategy: compute off-diagonal
// products a[i]*a[j] (i<j) once into a 2*num-limb temporary, double the
// sum by shifting left one bit, add the diagonal squares a[i]^2, then
// Montgomery-reduce 8 limbs (512 bits) per iteration.
// Same register arguments as bn_mul_mont (x0..x5).
__bn_sqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	cmp	x1,x2			// squaring only when ap==bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!	// frame + save x19-x28
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]		// preload a[0..7]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4		// 2*num limbs of temporary t[]
	lsl	x5,x5,#3		// x5 = num in bytes from here on
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

// Zero the 2*num-limb temporary, 16 limbs per iteration.
.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5		// x3 = &a[num] (end sentinel)
	add	x1,x1,#8*8
	mov	x19,xzr			// x19-x26: 8-limb accumulator window
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1	// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5	// rewinded ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	// More than 8 limbs: cross-multiply this 8-limb column of a[]
	// against all higher limbs before advancing the outer window.
	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]		// next multiplier limb
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3		// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]		// reload next 8-limb a-window
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]	// spill accumulator, slide window back
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7		// a[0]^2
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9		// a[1]^2
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1	// 2*t + squares, with extr carrying
	extr	x15,x16,x15,#63		// the shifted-out bit between limbs
	sub	x27,x5,#8*4

// Shift the off-diagonal sum left by one bit and add the diagonal
// squares a[i]^2, four a-limbs per iteration.
.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adds	x21,x8,x15		// tail of the shift-and-add
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5		// &n[num] (end sentinel)
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*) same carry trick as in bn_mul_mont
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]	// add next 8 t[] limbs into accumulator
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1	// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

// Multiply the saved t[0]*n0 values by the higher n[] limbs and
// accumulate — the "tail" of the 8-limb reduction.
.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1	// done yet?
	sub	x16,x3,x5	// rewinded np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr	// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29		// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0		// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	// Constant-time select between t (borrowed) and t-n (in rp),
	// wiping the stack temporary as we go.
	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo		// last four limbs
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

// Special case for num==8: the whole result is still in registers,
// so finish the conditional subtraction without touching memory t[].
.align	4
.Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-7,x28 hold result, x6-7 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]	// wipe stack temporary
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-7 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]	// restore callee-saved regs
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
.type	__bn_mul4x_mont,%function
.align	5
// Four-limbs-at-a-time Montgomery multiplication, entered when num%4==0
// (and num%8!=0, or when the 8x path sees ap!=bp). Same arguments as
// bn_mul_mont: x0=rp, x1=ap, x2=bp, x3=np, x4=&n0, x5=num.
// Processes b[] four limbs per outer pass, interleaving multiplication
// with Montgomery reduction; the aside buffer at sp holds the t[0]*n0
// values for tail processing.
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont or __bn_sqr8x_mont which have already signed the
	// return address.
	stp	x29,x30,[sp,#-128]!	// frame + save x19-x28
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3		// num limbs of temporary
	lsl	x5,x5,#3		// x5 = num in bytes from here on
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5		// x27 = &a[num] (end sentinel)
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr			// x19-x22: 4-limb accumulator
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr			// modulo-scheduled top carry
	mov	x28,#0			// b-index, cycles 0,8,16,24 (mod 32)
	mov	x26,sp

// First pass over a[0..3]/n[0..3] for b[0..3], computing each m1=t[0]*n0
// and stashing it at [x26] for the tail loops.
.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) same carry trick as in bn_mul_mont
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1		// remaining a-bytes (0 when num==4)
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition	// num==4: finish in registers

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

// Tail of the first pass: extend each of the four partial products over
// the remaining a/n limbs using the stashed t[0]*n0 values.
.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewinded x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]		// next four a-limbs and n-limbs
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

// Subsequent outer passes: advance to the next four b-limbs, rewind the
// a/n pointers, and reload the accumulator from the temporary.
.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr		// x30 carries the top limb between passes
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) same carry trick as in bn_mul_mont
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]	// next window of t, a and n
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30		// fold in carry from previous pass
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	// Constant-time select between t (borrowed) and t-n (in rp),
	// wiping the stack temporary as we go.
	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo		// last four limbs
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

// Special case for num==4: result still fits in registers, finish the
// conditional subtraction without a memory pass.
.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]	// wipe stack temporary
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]	// restore callee-saved regs
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
// Embedded ASCII identification string:
// "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>\0"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4
1425#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
1426