1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <ring-core/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
7#include "ring-core/arm_arch.h"
8
.section	.rodata
.align	5
// The NIST P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
// stored as four little-endian 64-bit limbs.
.Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.LRR:	//	2^512 mod P precomputed for NIST P256 polynomial
.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
.Lone_mont:	// 1 in Montgomery domain, i.e. 2^256 mod P
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:	// plain (non-Montgomery) 1
.quad	1,0,0,0
.Lord:	// the P-256 group order n, little-endian limbs
.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:	// per-limb Montgomery factor for .Lord (multiplied in to cancel acc[0])
.quad	0xccd1c8aaee00bc4f
// ASCII identification string: "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro@openssl.org>\0"
.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.text
26
// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
//
// Public wrapper: loads the operands and the two non-trivial prime limbs
// into the registers __ecp_nistz256_mul_mont expects, then tail-work is
// done by that helper (x19/x20 are saved because the helper clobbers them).
.globl	ecp_nistz256_mul_mont
.hidden	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]	// callee-saved; helper uses them as carry scratch

	ldr	x3,[x2]		// bp[0]
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]		// x12 = .Lpoly[1] = 0x00000000ffffffff
	ldr	x13,[x13,#24]		// x13 = .Lpoly[3] = 0xffffffff00000001

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
54
// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
//
// Public wrapper: loads a[0..3] and the sparse prime limbs, then calls the
// dedicated squaring helper (faster than mul_mont because cross products
// are computed once and doubled).
.globl	ecp_nistz256_sqr_mont
.hidden	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]	// callee-saved; helper uses them as scratch

	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]		// x12 = .Lpoly[1] = 0x00000000ffffffff
	ldr	x13,[x13,#24]		// x13 = .Lpoly[3] = 0xffffffff00000001

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
80
// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
//
// Computes res = -a mod p by running the generic modular subtraction
// helper with a zero minuend (x14-x17) and a as the subtrahend pointer (x2).
.globl	ecp_nistz256_neg
.hidden	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x2,x1			// helper reads the subtrahend from [x2]
	mov	x14,xzr		// a = 0
	mov	x15,xzr
	mov	x16,xzr
	mov	x17,xzr
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]		// x12 = .Lpoly[1]
	ldr	x13,[x13,#24]		// x13 = .Lpoly[3]

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg
107
// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to x4-x7 and b[0] - to x3
//
// In:    x0 = result pointer, x2 = b pointer (b[1..3] are loaded in-line),
//        x3 = b[0], x4-x7 = a[0..3],
//        x12 = .Lpoly[1], x13 = .Lpoly[3] (the non-trivial prime limbs)
// Out:   [x0] = a*b*2^-256 mod p; result also left in x14-x17
// Clob:  x3, x8-x11, x14-x17, x19, x20, flags
//
// Montgomery reduction is interleaved with the multiply-by-b[i] steps.
// Because p = 2^256 - 2^224 + 2^192 + 2^96 - 1, the multiple of p that
// cancels acc[0] is formed with shifts only: acc[0]*p folds in as
// acc[0]<<96 (lsl/lsr pair) and acc[0]*0xffffffff00000001 (subs/sbc pair).
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	mul	x14,x4,x3		// a[0]*b[0]
	umulh	x8,x4,x3

	mul	x15,x5,x3		// a[1]*b[0]
	umulh	x9,x5,x3

	mul	x16,x6,x3		// a[2]*b[0]
	umulh	x10,x6,x3

	mul	x17,x7,x3		// a[3]*b[0]
	umulh	x11,x7,x3
	ldr	x3,[x2,#8]		// b[1]

	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adc	x19,xzr,x11
	mov	x20,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(1+1)]	// b[1+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(2+1)]	// b[2+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	// last reduction
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adcs	x17,x19,x11
	adc	x19,x20,xzr

	// Constant-time final subtraction: compute ret-p and select.
	// p[0] is all-ones, so "-p[0]" is done as "+1" with carry out.
	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
243
// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to x4-x7
//
// In:    x0 = result pointer, x4-x7 = a[0..3],
//        x12 = .Lpoly[1], x13 = .Lpoly[3]
// Out:   [x0] = a^2*2^-256 mod p; result also left in x14-x17
// Clob:  x1, x2, x4-x11, x14-x17, x19, x20, flags
//        (note: x1/x2 are reused as scratch — callers reload them)
.type	__ecp_nistz256_sqr_mont,%function
.align	4
__ecp_nistz256_sqr_mont:
	// Cross products are computed once, doubled, then the squares
	// a[i]^2 are added on the diagonal:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is acc[x]
	//
	//  "can't overflow" below mark carrying into high part of
	//  multiplication result, which can't overflow, because it
	//  can never be all ones.

	mul	x15,x5,x4		// a[1]*a[0]
	umulh	x9,x5,x4
	mul	x16,x6,x4		// a[2]*a[0]
	umulh	x10,x6,x4
	mul	x17,x7,x4		// a[3]*a[0]
	umulh	x19,x7,x4

	adds	x16,x16,x9		// accumulate high parts of multiplication
	mul	x8,x6,x5		// a[2]*a[1]
	umulh	x9,x6,x5
	adcs	x17,x17,x10
	mul	x10,x7,x5		// a[3]*a[1]
	umulh	x11,x7,x5
	adc	x19,x19,xzr		// can't overflow

	mul	x20,x7,x6		// a[3]*a[2]
	umulh	x1,x7,x6

	adds	x9,x9,x10		// accumulate high parts of multiplication
	mul	x14,x4,x4		// a[0]*a[0]
	adc	x10,x11,xzr		// can't overflow

	adds	x17,x17,x8		// accumulate low parts of multiplication
	umulh	x4,x4,x4
	adcs	x19,x19,x9
	mul	x9,x5,x5		// a[1]*a[1]
	adcs	x20,x20,x10
	umulh	x5,x5,x5
	adc	x1,x1,xzr		// can't overflow

	adds	x15,x15,x15	// acc[1-6]*=2
	mul	x10,x6,x6		// a[2]*a[2]
	adcs	x16,x16,x16
	umulh	x6,x6,x6
	adcs	x17,x17,x17
	mul	x11,x7,x7		// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x7,x7,x7
	adcs	x20,x20,x20
	adcs	x1,x1,x1
	adc	x2,xzr,xzr

	adds	x15,x15,x4		// +a[i]*a[i]
	adcs	x16,x16,x9
	adcs	x17,x17,x5
	adcs	x19,x19,x10
	adcs	x20,x20,x6
	lsl	x8,x14,#32
	adcs	x1,x1,x11
	lsr	x9,x14,#32
	adc	x2,x2,x7
	// Four Montgomery reduction steps, one per low limb; each folds
	// acc[0]*p in via shifts (see __ecp_nistz256_mul_mont).
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adc	x17,x11,xzr		// can't overflow

	adds	x14,x14,x19	// accumulate upper half
	adcs	x15,x15,x20
	adcs	x16,x16,x1
	adcs	x17,x17,x2
	adc	x19,xzr,xzr

	// Constant-time final subtraction (see __ecp_nistz256_mul_mont).
	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
366
// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
// x4-x7 and x8-x11. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
//
// In:    x0 = result pointer, x14-x17 = a, x8-x11 = b,
//        x12 = .Lpoly[1], x13 = .Lpoly[3]
// Out:   [x0] = a+b mod p; result also left in x14-x17
// Clob:  x1, x8-x11, x14-x17, flags
.type	__ecp_nistz256_add_to,%function
.align	4
__ecp_nistz256_add_to:
	adds	x14,x14,x8		// ret = a+b
	adcs	x15,x15,x9
	adcs	x16,x16,x10
	adcs	x17,x17,x11
	adc	x1,xzr,xzr		// zap x1

	// Constant-time reduce: tmp = ret-p ("+1" stands in for "-p[0]",
	// since p[0] is all-ones), then select on the borrow.
	adds	x8,x14,#1		// subs	x8,x4,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x1,xzr		// did subtraction borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_add_to,.-__ecp_nistz256_add_to
394
// Modular subtraction: ret = a - b mod p, where a is pre-loaded in
// x14-x17 and b is read from [x2].
//
// In:    x0 = result pointer, x2 = b pointer, x14-x17 = a,
//        x12 = .Lpoly[1], x13 = .Lpoly[3]
// Out:   [x0] = a-b mod p; result also left in x14-x17
// Clob:  x1, x8-x11, x14-x17, flags
.type	__ecp_nistz256_sub_from,%function
.align	4
__ecp_nistz256_sub_from:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x14,x8		// ret = a-b
	sbcs	x15,x15,x9
	sbcs	x16,x16,x10
	sbcs	x17,x17,x11
	sbc	x1,xzr,xzr		// zap x1

	// tmp = ret+p ("-1" stands in for "+p[0]", since p[0] is all-ones);
	// select it only if the subtraction above borrowed (x1 != 0).
	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
421
// Reversed-operand modular subtraction ("morf" = "from" reversed):
// ret = b - a mod p, where a is pre-loaded in x14-x17 and b is read
// from [x2]. Otherwise identical to __ecp_nistz256_sub_from.
//
// In:    x0 = result pointer, x2 = b pointer, x14-x17 = a,
//        x12 = .Lpoly[1], x13 = .Lpoly[3]
// Out:   [x0] = b-a mod p; result also left in x14-x17
// Clob:  x1, x8-x11, x14-x17, flags
.type	__ecp_nistz256_sub_morf,%function
.align	4
__ecp_nistz256_sub_morf:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x8,x14		// ret = b-a
	sbcs	x15,x9,x15
	sbcs	x16,x10,x16
	sbcs	x17,x11,x17
	sbc	x1,xzr,xzr		// zap x1

	// tmp = ret+p; selected only if the subtraction borrowed.
	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
448
// Modular halving: ret = a/2 mod p, with a pre-loaded in x14-x17.
// If a is odd, p is added first so the value becomes even; x1 carries
// bit 256 of that sum into the right shift.
//
// In:    x0 = result pointer, x14-x17 = a,
//        x12 = .Lpoly[1], x13 = .Lpoly[3]
// Out:   [x0] = a/2 mod p; result also left in x14-x17
// Clob:  x1, x8-x11, x14-x17, flags
.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = a+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adcs	x11,x17,x13
	adc	x1,xzr,xzr		// zap x1
	tst	x14,#1		// is a even?

	csel	x14,x14,x8,eq	// ret = even ? a : a+modulus
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	csel	x17,x17,x11,eq
	csel	x1,xzr,x1,eq	// bit 256 of the selected value

	lsr	x14,x14,#1		// ret >>= 1
	orr	x14,x14,x15,lsl#63
	lsr	x15,x15,#1
	orr	x15,x15,x16,lsl#63
	lsr	x16,x16,#1
	orr	x16,x16,x17,lsl#63
	lsr	x17,x17,#1
	stp	x14,x15,[x0]
	orr	x17,x17,x1,lsl#63	// shift the carry bit back in at the top
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
// void	ecp_nistz256_point_double(uint64_t r[12], const uint64_t a[12]);
//
// Jacobian point doubling. Point layout: X at +0, Y at +32, Z at +64.
// Register roles: x21 = result pointer, x22 = input pointer.
// Stack frame (after sub sp): S@sp+0, M@sp+32, Zsqr@sp+64, tmp0@sp+96.
// ecp_nistz256_point_add branches to .Ldouble_shortcut with an
// equivalent frame (see .Ladd_double).
.globl	ecp_nistz256_point_double
.hidden	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4		// four 256-bit temporaries

.Ldouble_shortcut:
	ldp	x14,x15,[x1,#32]
	mov	x21,x0
	ldp	x16,x17,[x1,#48]
	mov	x22,x1
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	mov	x8,x14
	ldr	x13,[x13,#24]
	mov	x9,x15
	ldp	x4,x5,[x22,#64]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[x22,#64+16]
	add	x0,sp,#0
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);

	add	x0,sp,#64
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	x8,x9,[x22]
	ldp	x10,x11,[x22,#16]
	mov	x4,x14		// put Zsqr aside for p256_sub
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);

	add	x2,x22,#0
	mov	x14,x4		// restore Zsqr
	mov	x15,x5
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x16,x6
	mov	x17,x7
	ldp	x6,x7,[sp,#0+16]
	add	x0,sp,#64
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	x0,sp,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[x22,#64]
	ldp	x6,x7,[x22,#64+16]
	add	x2,x22,#32
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#0+16]
	add	x0,x21,#64
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);

	add	x0,sp,#96
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	ldr	x3,[sp,#64]		// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x0,x21,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	x2,sp,#64
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	x8,x14		// duplicate M
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	mov	x4,x14		// put M aside
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add_to
	mov	x8,x4			// restore M
	mov	x9,x5
	ldr	x3,[x22]		// forward load for p256_mul_mont
	mov	x10,x6
	ldp	x4,x5,[sp,#0]
	mov	x11,x7
	ldp	x6,x7,[sp,#0+16]
	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);

	add	x2,x22,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#32]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#32+16]
	add	x0,sp,#96
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);

	add	x0,x21,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	x2,sp,#96
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	x2,sp,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	x3,[sp,#32]
	mov	x4,x14		// copy S
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x2,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	x2,x21,#32
	add	x0,x21,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
// void	ecp_nistz256_point_add(uint64_t r[12], const uint64_t a[12],
//				       const uint64_t b[12]);
//
// Jacobian point addition. Point layout: X at +0, Y at +32, Z at +64.
// Register roles: x21 = result ptr, x22 = a ("in1"), x23 = b ("in2");
// x24 = ~in1infty mask, x25 = ~in2infty mask (all-ones when Z != 0),
// x26 = ~is_equal(S1,S2).
// Stack frame: res_x@0, res_y@32, res_z@64, H@96, Z1sqr/Hsqr@128,
// R@160, Z2sqr/Rsqr@192, Hcub@224, U1@256, U2@288, S1@320, S2@352.
.globl	ecp_nistz256_point_add
.hidden	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#32*12		// twelve 256-bit temporaries

	ldp	x4,x5,[x2,#64]	// in2_z
	ldp	x6,x7,[x2,#64+16]
	mov	x21,x0
	mov	x22,x1
	mov	x23,x2
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x25,x8,x10		// zero iff in2_z == 0
	cmp	x25,#0
	csetm	x25,ne		// ~in2infty
	add	x0,sp,#192
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	x4,x5,[x22,#64]	// in1_z
	ldp	x6,x7,[x22,#64+16]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x24,x8,x10		// zero iff in1_z == 0
	cmp	x24,#0
	csetm	x24,ne		// ~in1infty
	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	x3,[x23,#64]
	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x2,x23,#64
	add	x0,sp,#320
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x22,#64
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[sp,#320]
	ldp	x6,x7,[sp,#320+16]
	add	x2,x22,#32
	add	x0,sp,#320
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#352]
	ldp	x6,x7,[sp,#352+16]
	add	x2,x23,#32
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	x2,sp,#320
	ldr	x3,[sp,#192]	// forward load for p256_mul_mont
	ldp	x4,x5,[x22]
	ldp	x6,x7,[x22,#16]
	add	x0,sp,#160
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);

	orr	x14,x14,x15	// see if result is zero
	orr	x16,x16,x17
	orr	x26,x14,x16	// ~is_equal(S1,S2)

	add	x2,sp,#192
	add	x0,sp,#256
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	x3,[sp,#128]
	ldp	x4,x5,[x23]
	ldp	x6,x7,[x23,#16]
	add	x2,sp,#128
	add	x0,sp,#288
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	x2,sp,#256
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#96
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);

	orr	x14,x14,x15	// see if result is zero
	orr	x16,x16,x17
	orr	x14,x14,x16	// ~is_equal(U1,U2)

	// Take the doubling shortcut only when both inputs are finite,
	// U1 == U2 and S1 == S2 (i.e. the two points are equal).
	mvn	x27,x24	// -1/0 -> 0/-1
	mvn	x28,x25	// -1/0 -> 0/-1
	orr	x14,x14,x27
	orr	x14,x14,x28
	orr	x14,x14,x26
	cbnz	x14,.Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

.Ladd_double:
	mov	x1,x22
	mov	x0,x21
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	add	sp,sp,#256	// #256 is from #32*(12-4). difference in stack frames
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	add	x0,sp,#192
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#96]
	ldp	x6,x7,[sp,#96+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldp	x4,x5,[sp,#96]
	ldp	x6,x7,[sp,#96+16]
	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldr	x3,[x23,#64]
	ldp	x4,x5,[sp,#64]
	ldp	x6,x7,[sp,#64+16]
	add	x2,x23,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	x3,[sp,#96]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,sp,#96
	add	x0,sp,#224
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[sp,#128]
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x2,sp,#128
	add	x0,sp,#288
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#128
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#192
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#224
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#288
	ldr	x3,[sp,#224]		// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#320]
	ldp	x6,x7,[sp,#320+16]
	add	x0,sp,#32
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	x2,sp,#224
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#160
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#352
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	// Constant-time output selection, word by word:
	// if in2 is at infinity return in1; else if in1 is at infinity
	// return in2; else return the computed sum.
	ldp	x4,x5,[sp,#0]		// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]		// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]
	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]
	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

.Ladd_done:
	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
// void	ecp_nistz256_point_add_affine(uint64_t r[12], const uint64_t a[12],
//					      const uint64_t b[8]);
//
// Adds a Jacobian point a and an affine point b (x,y only; implicit Z=1).
// Register roles: x21 = result ptr, x22 = a ("in1"), x23 = b ("in2");
// x24 = ~in1infty mask, x25 = ~in2infty mask (in2 is "infinity" when all
// eight of its words are zero).
// Stack frame: res_x@0, res_y@32, res_z@64, U2@96, Z1sqr/S2@128, H@160,
// R@192, Hsqr@224, Hcub@256, Rsqr@288.
.globl	ecp_nistz256_point_add_affine
.hidden	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10		// ten 256-bit temporaries

	mov	x21,x0
	mov	x22,x1
	mov	x23,x2
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	ldp	x4,x5,[x1,#64]	// in1_z
	ldp	x6,x7,[x1,#64+16]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x24,x8,x10		// zero iff in1_z == 0
	cmp	x24,#0
	csetm	x24,ne		// ~in1infty

	ldp	x14,x15,[x2]	// in2_x
	ldp	x16,x17,[x2,#16]
	ldp	x8,x9,[x2,#32]	// in2_y
	ldp	x10,x11,[x2,#48]
	orr	x14,x14,x15
	orr	x16,x16,x17
	orr	x8,x8,x9
	orr	x10,x10,x11
	orr	x14,x14,x16
	orr	x8,x8,x10
	orr	x25,x14,x8		// zero iff in2_x and in2_y are all-zero
	cmp	x25,#0
	csetm	x25,ne		// ~in2infty

	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	x4,x14
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	ldr	x3,[x23]
	add	x2,x23,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	x2,x22,#0
	ldr	x3,[x22,#64]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x0,sp,#160
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	x2,x22,#64
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#160]
	ldp	x6,x7,[sp,#160+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x23,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	x2,x22,#32
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#192
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	x0,sp,#224
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x0,sp,#288
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,sp,#160
	add	x0,sp,#256
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[x22]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,x22,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#224
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#288
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#256
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#96
	ldr	x3,[x22,#32]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x0,sp,#32
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	x2,x22,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	x3,[sp,#192]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#192
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#128
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	// Constant-time output selection (see ecp_nistz256_point_add):
	// if in2 is "infinity" return in1; else if in1 is at infinity
	// return in2 (with Z = 1 in Montgomery form); else the sum.
	ldp	x4,x5,[sp,#0]		// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]		// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]
	// Rebase x23 to .Lone_mont-64 so the "in2" loads at offsets #64..#95
	// below fetch 1 in Montgomery form — the affine input's implicit Z.
	adrp	x23,.Lone_mont-64
	add	x23,x23,:lo12:.Lone_mont-64
	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]
	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
//
// Montgomery multiplication modulo the group order .Lord (not the prime).
// After setup: x12,x13,x21,x22 = .Lord[0..3], x23 = .LordK; each of the
// four rounds multiplies acc[0] by .LordK and folds that multiple of the
// order in so the low limb cancels. No calls are made, so x30 stays live
// across the whole body.
.globl	ecp_nistz256_ord_mul_mont
.hidden	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,%function
.align	4
ecp_nistz256_ord_mul_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	x23,.Lord
	add	x23,x23,:lo12:.Lord
	ldr	x3,[x2]		// bp[0]
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	ldp	x12,x13,[x23,#0]	// ord[0], ord[1]
	ldp	x21,x22,[x23,#16]	// ord[2], ord[3]
	ldr	x23,[x23,#32]		// ordK

	mul	x14,x4,x3		// a[0]*b[0]
	umulh	x8,x4,x3

	mul	x15,x5,x3		// a[1]*b[0]
	umulh	x9,x5,x3

	mul	x16,x6,x3		// a[2]*b[0]
	umulh	x10,x6,x3

	mul	x17,x7,x3		// a[3]*b[0]
	umulh	x19,x7,x3

	mul	x24,x14,x23		// t = acc[0]*ordK

	adds	x15,x15,x8		// accumulate high parts of multiplication
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adc	x19,x19,xzr
	mov	x20,xzr
	ldr	x3,[x2,#8*1]		// b[i]

	// Fold t*ord in; ord[2]/ord[3] contributions use shifts of t.
	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1		// cancels acc[0]; sets carry for adcs below
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23		// t = acc[0]*ordK
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	ldr	x3,[x2,#8*2]		// b[i]

	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23		// t = acc[0]*ordK
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	ldr	x3,[x2,#8*3]		// b[i]

	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23		// t = acc[0]*ordK
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	lsl	x8,x24,#32		// last reduction
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	// Constant-time final subtraction against the full order.
	subs	x8,x14,x12		// ret -= modulus
	sbcs	x9,x15,x13
	sbcs	x10,x16,x21
	sbcs	x11,x17,x22
	sbcs	xzr,x19,xzr

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64		// x30 was never clobbered (no calls), so only x29 is reloaded
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
1301
1302////////////////////////////////////////////////////////////////////////
1303// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
1304//                                uint64_t rep);
1305.globl	ecp_nistz256_ord_sqr_mont
1306.hidden	ecp_nistz256_ord_sqr_mont
1307.type	ecp_nistz256_ord_sqr_mont,%function
1308.align	4
1309ecp_nistz256_ord_sqr_mont:
1310	AARCH64_VALID_CALL_TARGET
1311	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1312	stp	x29,x30,[sp,#-64]!
1313	add	x29,sp,#0
1314	stp	x19,x20,[sp,#16]
1315	stp	x21,x22,[sp,#32]
1316	stp	x23,x24,[sp,#48]
1317
1318	adrp	x23,.Lord
1319	add	x23,x23,:lo12:.Lord
1320	ldp	x4,x5,[x1]
1321	ldp	x6,x7,[x1,#16]
1322
1323	ldp	x12,x13,[x23,#0]
1324	ldp	x21,x22,[x23,#16]
1325	ldr	x23,[x23,#32]
1326	b	.Loop_ord_sqr
1327
1328.align	4
1329.Loop_ord_sqr:
1330	sub	x2,x2,#1
1331	////////////////////////////////////////////////////////////////
1332	//  |  |  |  |  |  |a1*a0|  |
1333	//  |  |  |  |  |a2*a0|  |  |
1334	//  |  |a3*a2|a3*a0|  |  |  |
1335	//  |  |  |  |a2*a1|  |  |  |
1336	//  |  |  |a3*a1|  |  |  |  |
1337	// *|  |  |  |  |  |  |  | 2|
1338	// +|a3*a3|a2*a2|a1*a1|a0*a0|
1339	//  |--+--+--+--+--+--+--+--|
1340	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow
1341	//
1342	//  "can't overflow" below mark carrying into high part of
1343	//  multiplication result, which can't overflow, because it
1344	//  can never be all ones.
1345
1346	mul	x15,x5,x4		// a[1]*a[0]
1347	umulh	x9,x5,x4
1348	mul	x16,x6,x4		// a[2]*a[0]
1349	umulh	x10,x6,x4
1350	mul	x17,x7,x4		// a[3]*a[0]
1351	umulh	x19,x7,x4
1352
1353	adds	x16,x16,x9		// accumulate high parts of multiplication
1354	mul	x8,x6,x5		// a[2]*a[1]
1355	umulh	x9,x6,x5
1356	adcs	x17,x17,x10
1357	mul	x10,x7,x5		// a[3]*a[1]
1358	umulh	x11,x7,x5
1359	adc	x19,x19,xzr		// can't overflow
1360
1361	mul	x20,x7,x6		// a[3]*a[2]
1362	umulh	x1,x7,x6
1363
1364	adds	x9,x9,x10		// accumulate high parts of multiplication
1365	mul	x14,x4,x4		// a[0]*a[0]
1366	adc	x10,x11,xzr		// can't overflow
1367
1368	adds	x17,x17,x8		// accumulate low parts of multiplication
1369	umulh	x4,x4,x4
1370	adcs	x19,x19,x9
1371	mul	x9,x5,x5		// a[1]*a[1]
1372	adcs	x20,x20,x10
1373	umulh	x5,x5,x5
1374	adc	x1,x1,xzr		// can't overflow
1375
1376	adds	x15,x15,x15	// acc[1-6]*=2
1377	mul	x10,x6,x6		// a[2]*a[2]
1378	adcs	x16,x16,x16
1379	umulh	x6,x6,x6
1380	adcs	x17,x17,x17
1381	mul	x11,x7,x7		// a[3]*a[3]
1382	adcs	x19,x19,x19
1383	umulh	x7,x7,x7
1384	adcs	x20,x20,x20
1385	adcs	x1,x1,x1
1386	adc	x3,xzr,xzr
1387
1388	adds	x15,x15,x4		// +a[i]*a[i]
1389	mul	x24,x14,x23
1390	adcs	x16,x16,x9
1391	adcs	x17,x17,x5
1392	adcs	x19,x19,x10
1393	adcs	x20,x20,x6
1394	adcs	x1,x1,x11
1395	adc	x3,x3,x7
1396	subs	xzr,x14,#1
1397	umulh	x9,x12,x24
1398	mul	x10,x13,x24
1399	umulh	x11,x13,x24
1400
1401	adcs	x10,x10,x9
1402	adc	x11,x11,xzr
1403
1404	adds	x14,x15,x10
1405	adcs	x15,x16,x11
1406	adcs	x16,x17,x24
1407	adc	x17,xzr,x24		// can't overflow
1408	mul	x11,x14,x23
1409	lsl	x8,x24,#32
1410	subs	x15,x15,x24
1411	lsr	x9,x24,#32
1412	sbcs	x16,x16,x8
1413	sbc	x17,x17,x9		// can't borrow
1414	subs	xzr,x14,#1
1415	umulh	x9,x12,x11
1416	mul	x10,x13,x11
1417	umulh	x24,x13,x11
1418
1419	adcs	x10,x10,x9
1420	adc	x24,x24,xzr
1421
1422	adds	x14,x15,x10
1423	adcs	x15,x16,x24
1424	adcs	x16,x17,x11
1425	adc	x17,xzr,x11		// can't overflow
1426	mul	x24,x14,x23
1427	lsl	x8,x11,#32
1428	subs	x15,x15,x11
1429	lsr	x9,x11,#32
1430	sbcs	x16,x16,x8
1431	sbc	x17,x17,x9		// can't borrow
1432	subs	xzr,x14,#1
1433	umulh	x9,x12,x24
1434	mul	x10,x13,x24
1435	umulh	x11,x13,x24
1436
1437	adcs	x10,x10,x9
1438	adc	x11,x11,xzr
1439
1440	adds	x14,x15,x10
1441	adcs	x15,x16,x11
1442	adcs	x16,x17,x24
1443	adc	x17,xzr,x24		// can't overflow
1444	mul	x11,x14,x23
1445	lsl	x8,x24,#32
1446	subs	x15,x15,x24
1447	lsr	x9,x24,#32
1448	sbcs	x16,x16,x8
1449	sbc	x17,x17,x9		// can't borrow
1450	subs	xzr,x14,#1
1451	umulh	x9,x12,x11
1452	mul	x10,x13,x11
1453	umulh	x24,x13,x11
1454
1455	adcs	x10,x10,x9
1456	adc	x24,x24,xzr
1457
1458	adds	x14,x15,x10
1459	adcs	x15,x16,x24
1460	adcs	x16,x17,x11
1461	adc	x17,xzr,x11		// can't overflow
1462	lsl	x8,x11,#32
1463	subs	x15,x15,x11
1464	lsr	x9,x11,#32
1465	sbcs	x16,x16,x8
1466	sbc	x17,x17,x9		// can't borrow
1467	adds	x14,x14,x19	// accumulate upper half
1468	adcs	x15,x15,x20
1469	adcs	x16,x16,x1
1470	adcs	x17,x17,x3
1471	adc	x19,xzr,xzr
1472
1473	subs	x8,x14,x12		// ret -= modulus
1474	sbcs	x9,x15,x13
1475	sbcs	x10,x16,x21
1476	sbcs	x11,x17,x22
1477	sbcs	xzr,x19,xzr
1478
1479	csel	x4,x14,x8,lo	// ret = borrow ? ret : ret-modulus
1480	csel	x5,x15,x9,lo
1481	csel	x6,x16,x10,lo
1482	csel	x7,x17,x11,lo
1483
1484	cbnz	x2,.Loop_ord_sqr
1485
1486	stp	x4,x5,[x0]
1487	stp	x6,x7,[x0,#16]
1488
1489	ldp	x19,x20,[sp,#16]
1490	ldp	x21,x22,[sp,#32]
1491	ldp	x23,x24,[sp,#48]
1492	ldr	x29,[sp],#64
1493	ret
1494.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
1495////////////////////////////////////////////////////////////////////////
1496// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
1497.globl	ecp_nistz256_select_w5
1498.hidden	ecp_nistz256_select_w5
1499.type	ecp_nistz256_select_w5,%function
1500.align	4
1501ecp_nistz256_select_w5:
1502	AARCH64_VALID_CALL_TARGET
1503
1504    // x10 := x0
1505    // w9 := 0; loop counter and incremented internal index
1506	mov	x10, x0
1507	mov	w9, #0
1508
1509    // [v16-v21] := 0
1510	movi	v16.16b, #0
1511	movi	v17.16b, #0
1512	movi	v18.16b, #0
1513	movi	v19.16b, #0
1514	movi	v20.16b, #0
1515	movi	v21.16b, #0
1516
1517.Lselect_w5_loop:
1518    // Loop 16 times.
1519
1520    // Increment index (loop counter); tested at the end of the loop
1521	add	w9, w9, #1
1522
1523    // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
1524    //  and advance x1 to point to the next entry
1525	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
1526
1527    // x11 := (w9 == w2)? All 1s : All 0s
1528	cmp	w9, w2
1529	csetm	x11, eq
1530
1531    // continue loading ...
1532	ld1	{v26.2d, v27.2d}, [x1],#32
1533
1534    // duplicate mask_64 into Mask (all 0s or all 1s)
1535	dup	v3.2d, x11
1536
1537    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
1538    // i.e., values in output registers will remain the same if w9 != w2
1539	bit	v16.16b, v22.16b, v3.16b
1540	bit	v17.16b, v23.16b, v3.16b
1541
1542	bit	v18.16b, v24.16b, v3.16b
1543	bit	v19.16b, v25.16b, v3.16b
1544
1545	bit	v20.16b, v26.16b, v3.16b
1546	bit	v21.16b, v27.16b, v3.16b
1547
1548    // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back
1549	tbz	w9, #4, .Lselect_w5_loop
1550
1551    // Write [v16-v21] to memory at the output pointer
1552	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
1553	st1	{v20.2d, v21.2d}, [x10]
1554
1555	ret
1556.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
1557
1558
1559////////////////////////////////////////////////////////////////////////
1560// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
1561.globl	ecp_nistz256_select_w7
1562.hidden	ecp_nistz256_select_w7
1563.type	ecp_nistz256_select_w7,%function
1564.align	4
1565ecp_nistz256_select_w7:
1566	AARCH64_VALID_CALL_TARGET
1567
1568    // w9 := 0; loop counter and incremented internal index
1569	mov	w9, #0
1570
1571    // [v16-v21] := 0
1572	movi	v16.16b, #0
1573	movi	v17.16b, #0
1574	movi	v18.16b, #0
1575	movi	v19.16b, #0
1576
1577.Lselect_w7_loop:
1578    // Loop 64 times.
1579
1580    // Increment index (loop counter); tested at the end of the loop
1581	add	w9, w9, #1
1582
1583    // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
1584    //  and advance x1 to point to the next entry
1585	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
1586
1587    // x11 := (w9 == w2)? All 1s : All 0s
1588	cmp	w9, w2
1589	csetm	x11, eq
1590
1591    // duplicate mask_64 into Mask (all 0s or all 1s)
1592	dup	v3.2d, x11
1593
1594    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
1595    // i.e., values in output registers will remain the same if w9 != w2
1596	bit	v16.16b, v22.16b, v3.16b
1597	bit	v17.16b, v23.16b, v3.16b
1598
1599	bit	v18.16b, v24.16b, v3.16b
1600	bit	v19.16b, v25.16b, v3.16b
1601
1602    // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back
1603	tbz	w9, #6, .Lselect_w7_loop
1604
1605    // Write [v16-v19] to memory at the output pointer
1606	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x0]
1607
1608	ret
1609.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
1610#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
1611