// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <ring-core/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include "ring-core/arm_arch.h"

.section	.rodata
.align	5
Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
LRR:	//	2^512 mod P precomputed for NIST P256 polynomial
.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
Lone:
.quad	1,0,0,0
Lord:
.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
LordK:
.quad	0xccd1c8aaee00bc4f
.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.text

// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
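//
// Operands and result are in the Montgomery domain: this computes
// res = a * b * 2^-256 mod p, where p = 2^256 - 2^224 + 2^192 + 2^96 - 1 is
// the NIST P-256 prime stored at Lpoly above.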
.globl	ecp_nistz256_mul_mont

.def ecp_nistz256_mul_mont
   .type 32
.endef
.align	4
ecp_nistz256_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	x3,[x2]		// bp[0]
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
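//
// As above, in the Montgomery domain: res = a * a * 2^-256 mod p.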
.globl	ecp_nistz256_sqr_mont

.def ecp_nistz256_sqr_mont
   .type 32
.endef
.align	4
ecp_nistz256_sqr_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
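//
// Computes res = -a mod p by running the shared subtraction helper with a
// zero minuend, i.e. res = 0 - a, reduced back into [0, p).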
.globl	ecp_nistz256_neg

.def ecp_nistz256_neg
   .type 32
.endef
.align	4
ecp_nistz256_neg:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x2,x1
	mov	x14,xzr		// a = 0
	mov	x15,xzr
	mov	x16,xzr
	mov	x17,xzr
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to x4-x7 and b[0] - to x3
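//
// Each of the four iterations below adds a[0..3]*b[i] into the accumulator
// and then folds the lowest accumulator word, in effect
//	acc = (acc + acc[0]*p) >> 64
// which works because the low word of p is all ones, so the reduction
// multiplier is acc[0] itself.  Thanks to the sparse form of p, acc[0]*p is
// assembled from shifts and subtractions (the lsl/lsr #32 and "*0xffff0001"
// steps) instead of real multiplications.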
.def __ecp_nistz256_mul_mont
   .type 32
.endef
.align	4
__ecp_nistz256_mul_mont:
	mul	x14,x4,x3		// a[0]*b[0]
	umulh	x8,x4,x3

	mul	x15,x5,x3		// a[1]*b[0]
	umulh	x9,x5,x3

	mul	x16,x6,x3		// a[2]*b[0]
	umulh	x10,x6,x3

	mul	x17,x7,x3		// a[3]*b[0]
	umulh	x11,x7,x3
	ldr	x3,[x2,#8]		// b[1]

	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adc	x19,xzr,x11
	mov	x20,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(1+1)]	// b[1+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(2+1)]	// b[2+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	// last reduction
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adcs	x17,x19,x11
	adc	x19,x20,xzr

	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret


// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to x4-x7
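//
// The squaring first forms all cross products a[i]*a[j] (i > j), doubles
// them, and adds the squares a[i]^2 (see the diagram below), then performs
// four of the same acc[0]-folding reduction steps as __ecp_nistz256_mul_mont
// before adding the upper half and doing the final conditional subtraction
// of p.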
.def __ecp_nistz256_sqr_mont
   .type 32
.endef
.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where each Ax is a 64-bit accumulator word
	//
	//  The "can't overflow" notes below mark carries into the high part of a
	//  multiplication result; the high half of a 64x64-bit product can never
	//  be all ones, so adding a single carry to it cannot overflow.

	mul	x15,x5,x4		// a[1]*a[0]
	umulh	x9,x5,x4
	mul	x16,x6,x4		// a[2]*a[0]
	umulh	x10,x6,x4
	mul	x17,x7,x4		// a[3]*a[0]
	umulh	x19,x7,x4

	adds	x16,x16,x9		// accumulate high parts of multiplication
	mul	x8,x6,x5		// a[2]*a[1]
	umulh	x9,x6,x5
	adcs	x17,x17,x10
	mul	x10,x7,x5		// a[3]*a[1]
	umulh	x11,x7,x5
	adc	x19,x19,xzr		// can't overflow

	mul	x20,x7,x6		// a[3]*a[2]
	umulh	x1,x7,x6

	adds	x9,x9,x10		// accumulate high parts of multiplication
	mul	x14,x4,x4		// a[0]*a[0]
	adc	x10,x11,xzr		// can't overflow

	adds	x17,x17,x8		// accumulate low parts of multiplication
	umulh	x4,x4,x4
	adcs	x19,x19,x9
	mul	x9,x5,x5		// a[1]*a[1]
	adcs	x20,x20,x10
	umulh	x5,x5,x5
	adc	x1,x1,xzr		// can't overflow

	adds	x15,x15,x15	// acc[1-6]*=2
	mul	x10,x6,x6		// a[2]*a[2]
	adcs	x16,x16,x16
	umulh	x6,x6,x6
	adcs	x17,x17,x17
	mul	x11,x7,x7		// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x7,x7,x7
	adcs	x20,x20,x20
	adcs	x1,x1,x1
	adc	x2,xzr,xzr

	adds	x15,x15,x4		// +a[i]*a[i]
	adcs	x16,x16,x9
	adcs	x17,x17,x5
	adcs	x19,x19,x10
	adcs	x20,x20,x6
	lsl	x8,x14,#32
	adcs	x1,x1,x11
	lsr	x9,x14,#32
	adc	x2,x2,x7
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adc	x17,x11,xzr		// can't overflow

	adds	x14,x14,x19	// accumulate upper half
	adcs	x15,x15,x20
	adcs	x16,x16,x1
	adcs	x17,x17,x2
	adc	x19,xzr,xzr

	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret


// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded,
// to x14-x17 and x8-x11. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
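//
// A rough C sketch of the "add, then conditionally subtract the modulus"
// pattern used here (illustrative only, not part of the generated code;
// p is the four-limb little-endian modulus and inputs are < p):
//
//	void add_mod_p(uint64_t r[4], const uint64_t a[4],
//	               const uint64_t b[4], const uint64_t p[4]) {
//		uint64_t sum[4], red[4], carry = 0, borrow = 0;
//		for (int i = 0; i < 4; i++) {          // sum = a + b, carry out
//			unsigned __int128 t = (unsigned __int128)a[i] + b[i] + carry;
//			sum[i] = (uint64_t)t;
//			carry = (uint64_t)(t >> 64);
//		}
//		for (int i = 0; i < 4; i++) {          // red = sum - p, borrow out
//			unsigned __int128 t = (unsigned __int128)sum[i] - p[i] - borrow;
//			red[i] = (uint64_t)t;
//			borrow = (uint64_t)(t >> 64) & 1;
//		}
//		int keep_sum = borrow > carry;         // a + b < p: keep the raw sum
//		for (int i = 0; i < 4; i++)
//			r[i] = keep_sum ? sum[i] : red[i]; // the asm uses csel here
//	}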
.def __ecp_nistz256_add_to
   .type 32
.endef
.align	4
__ecp_nistz256_add_to:
	adds	x14,x14,x8		// ret = a+b
	adcs	x15,x15,x9
	adcs	x16,x16,x10
	adcs	x17,x17,x11
	adc	x1,xzr,xzr		// zap x1

	adds	x8,x14,#1		// subs	x8,x4,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x1,xzr		// did subtraction borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret


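// __ecp_nistz256_sub_from computes ret = a - b and __ecp_nistz256_sub_morf
// computes ret = b - a ("morf" apparently being "from" spelled backwards);
// both take a from x14-x17, load b from [x2], and add the modulus back in
// when the subtraction borrows.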
.def __ecp_nistz256_sub_from
   .type 32
.endef
.align	4
__ecp_nistz256_sub_from:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x14,x8		// ret = a-b
	sbcs	x15,x15,x9
	sbcs	x16,x16,x10
	sbcs	x17,x17,x11
	sbc	x1,xzr,xzr		// zap x1

	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret


.def __ecp_nistz256_sub_morf
   .type 32
.endef
.align	4
__ecp_nistz256_sub_morf:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x8,x14		// ret = b-a
	sbcs	x15,x9,x15
	sbcs	x16,x10,x16
	sbcs	x17,x11,x17
	sbc	x1,xzr,xzr		// zap x1

	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret


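// Halving mod p: if the value is odd, first add p (which does not change it
// mod p but makes the 257-bit sum even), then shift right by one.  Roughly:
//
//	// ret = a/2 mod p  (illustrative only)
//	// if (a & 1) a += p;    // 257-bit intermediate, top bit kept in x1
//	// ret = a >> 1;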
.def __ecp_nistz256_div_by_2
   .type 32
.endef
.align	4
__ecp_nistz256_div_by_2:
	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = a+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adcs	x11,x17,x13
	adc	x1,xzr,xzr		// zap x1
	tst	x14,#1		// is a even?

	csel	x14,x14,x8,eq	// ret = even ? a : a+modulus
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	csel	x17,x17,x11,eq
	csel	x1,xzr,x1,eq

	lsr	x14,x14,#1		// ret >>= 1
	orr	x14,x14,x15,lsl#63
	lsr	x15,x15,#1
	orr	x15,x15,x16,lsl#63
	lsr	x16,x16,#1
	orr	x16,x16,x17,lsl#63
	lsr	x17,x17,#1
	stp	x14,x15,[x0]
	orr	x17,x17,x1,lsl#63
	stp	x16,x17,[x0,#16]

	ret

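// Point doubling in Jacobian coordinates.  Following the p256_* comments on
// the calls below, the net effect for an input point (in_x, in_y, in_z) is:
//	Zsqr  = in_z^2
//	M     = 3*(in_x + Zsqr)*(in_x - Zsqr)
//	S     = 4*in_x*in_y^2
//	res_x = M^2 - 2*S
//	res_y = M*(S - res_x) - 8*in_y^4
//	res_z = 2*in_y*in_z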
.globl	ecp_nistz256_point_double

.def ecp_nistz256_point_double
   .type 32
.endef
.align	5
ecp_nistz256_point_double:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

Ldouble_shortcut:
	ldp	x14,x15,[x1,#32]
	mov	x21,x0
	ldp	x16,x17,[x1,#48]
	mov	x22,x1
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]
	mov	x8,x14
	ldr	x13,[x13,#24]
	mov	x9,x15
	ldp	x4,x5,[x22,#64]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[x22,#64+16]
	add	x0,sp,#0
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);

	add	x0,sp,#64
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	x8,x9,[x22]
	ldp	x10,x11,[x22,#16]
	mov	x4,x14		// put Zsqr aside for p256_sub
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);

	add	x2,x22,#0
	mov	x14,x4		// restore Zsqr
	mov	x15,x5
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x16,x6
	mov	x17,x7
	ldp	x6,x7,[sp,#0+16]
	add	x0,sp,#64
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	x0,sp,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[x22,#64]
	ldp	x6,x7,[x22,#64+16]
	add	x2,x22,#32
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#0+16]
	add	x0,x21,#64
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);

	add	x0,sp,#96
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	ldr	x3,[sp,#64]		// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x0,x21,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	x2,sp,#64
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	x8,x14		// duplicate M
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	mov	x4,x14		// put M aside
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add_to
	mov	x8,x4			// restore M
	mov	x9,x5
	ldr	x3,[x22]		// forward load for p256_mul_mont
	mov	x10,x6
	ldp	x4,x5,[sp,#0]
	mov	x11,x7
	ldp	x6,x7,[sp,#0+16]
	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);

	add	x2,x22,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#32]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#32+16]
	add	x0,sp,#96
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);

	add	x0,x21,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	x2,sp,#96
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	x2,sp,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	x3,[sp,#32]
	mov	x4,x14		// copy S
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x2,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	x2,x21,#32
	add	x0,x21,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

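// Full Jacobian point addition.  As the p256_* comments below trace, it forms
//	U1 = in1_x*Z2^2, U2 = in2_x*Z1^2, S1 = in1_y*Z2^3, S2 = in2_y*Z1^3,
//	H = U2 - U1, R = S2 - S1
// and then
//	res_x = R^2 - H^3 - 2*U1*H^2
//	res_y = R*(U1*H^2 - res_x) - S1*H^3
//	res_z = in1_z*in2_z*H
// If either input is the point at infinity the other input is copied to the
// result by the csel sequence at the end; if both inputs are finite and
// equal, the code branches to Ldouble_shortcut and doubles instead.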
.globl	ecp_nistz256_point_add

.def ecp_nistz256_point_add
   .type 32
.endef
.align	5
ecp_nistz256_point_add:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#32*12

	ldp	x4,x5,[x2,#64]	// in2_z
	ldp	x6,x7,[x2,#64+16]
	mov	x21,x0
	mov	x22,x1
	mov	x23,x2
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x25,x8,x10
	cmp	x25,#0
	csetm	x25,ne		// ~in2infty
	add	x0,sp,#192
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	x4,x5,[x22,#64]	// in1_z
	ldp	x6,x7,[x22,#64+16]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x24,x8,x10
	cmp	x24,#0
	csetm	x24,ne		// ~in1infty
	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	x3,[x23,#64]
	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x2,x23,#64
	add	x0,sp,#320
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x22,#64
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[sp,#320]
	ldp	x6,x7,[sp,#320+16]
	add	x2,x22,#32
	add	x0,sp,#320
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#352]
	ldp	x6,x7,[sp,#352+16]
	add	x2,x23,#32
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	x2,sp,#320
	ldr	x3,[sp,#192]	// forward load for p256_mul_mont
	ldp	x4,x5,[x22]
	ldp	x6,x7,[x22,#16]
	add	x0,sp,#160
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);

	orr	x14,x14,x15	// see if result is zero
	orr	x16,x16,x17
	orr	x26,x14,x16	// ~is_equal(S1,S2)

	add	x2,sp,#192
	add	x0,sp,#256
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	x3,[sp,#128]
	ldp	x4,x5,[x23]
	ldp	x6,x7,[x23,#16]
	add	x2,sp,#128
	add	x0,sp,#288
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	x2,sp,#256
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#96
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);

	orr	x14,x14,x15	// see if result is zero
	orr	x16,x16,x17
	orr	x14,x14,x16	// ~is_equal(U1,U2)

	mvn	x27,x24	// -1/0 -> 0/-1
	mvn	x28,x25	// -1/0 -> 0/-1
	orr	x14,x14,x27
	orr	x14,x14,x28
	orr	x14,x14,x26
	cbnz	x14,Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

Ladd_double:
	mov	x1,x22
	mov	x0,x21
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	add	sp,sp,#256	// #256 is from #32*(12-4). difference in stack frames
	b	Ldouble_shortcut

.align	4
Ladd_proceed:
	add	x0,sp,#192
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#96]
	ldp	x6,x7,[sp,#96+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldp	x4,x5,[sp,#96]
	ldp	x6,x7,[sp,#96+16]
	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldr	x3,[x23,#64]
	ldp	x4,x5,[sp,#64]
	ldp	x6,x7,[sp,#64+16]
	add	x2,x23,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	x3,[sp,#96]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,sp,#96
	add	x0,sp,#224
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[sp,#128]
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x2,sp,#128
	add	x0,sp,#288
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#128
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#192
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#224
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#288
	ldr	x3,[sp,#224]		// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#320]
	ldp	x6,x7,[sp,#320+16]
	add	x0,sp,#32
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	x2,sp,#224
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#160
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#352
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	x4,x5,[sp,#0]		// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]		// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]
	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]
	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

Ladd_done:
	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

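// Mixed addition: the second input is an affine point (implicit Z2 = 1), so
// the Z2^2 and Z2^3 scalings are skipped.  When in1 is the point at infinity,
// the Montgomery-form constant 1 (Lone_mont) is selected as the result's Z
// coordinate via the repointed x23 below.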
.globl	ecp_nistz256_point_add_affine

.def ecp_nistz256_point_add_affine
   .type 32
.endef
.align	5
ecp_nistz256_point_add_affine:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	x21,x0
	mov	x22,x1
	mov	x23,x2
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	ldp	x4,x5,[x1,#64]	// in1_z
	ldp	x6,x7,[x1,#64+16]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x24,x8,x10
	cmp	x24,#0
	csetm	x24,ne		// ~in1infty

	ldp	x14,x15,[x2]	// in2_x
	ldp	x16,x17,[x2,#16]
	ldp	x8,x9,[x2,#32]	// in2_y
	ldp	x10,x11,[x2,#48]
	orr	x14,x14,x15
	orr	x16,x16,x17
	orr	x8,x8,x9
	orr	x10,x10,x11
	orr	x14,x14,x16
	orr	x8,x8,x10
	orr	x25,x14,x8
	cmp	x25,#0
	csetm	x25,ne		// ~in2infty

	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	x4,x14
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	ldr	x3,[x23]
	add	x2,x23,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	x2,x22,#0
	ldr	x3,[x22,#64]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x0,sp,#160
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	x2,x22,#64
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#160]
	ldp	x6,x7,[sp,#160+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x23,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	x2,x22,#32
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#192
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	x0,sp,#224
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x0,sp,#288
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,sp,#160
	add	x0,sp,#256
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[x22]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,x22,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#224
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#288
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#256
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#96
	ldr	x3,[x22,#32]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x0,sp,#32
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	x2,x22,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	x3,[sp,#192]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#192
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#128
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	x4,x5,[sp,#0]		// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]		// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]
	adrp	x23,Lone_mont-64
	add	x23,x23,:lo12:Lone_mont-64
	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]
	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
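//
// Same Montgomery pattern as ecp_nistz256_mul_mont, but modulo the group
// order n (Lord) rather than p: res = a * b * 2^-256 mod n.  LordK holds the
// per-word Montgomery reduction constant for n, used to derive each
// reduction multiplier (mul x24,x14,x23 below).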
.globl	ecp_nistz256_ord_mul_mont

.def ecp_nistz256_ord_mul_mont
   .type 32
.endef
.align	4
ecp_nistz256_ord_mul_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	x23,Lord
	add	x23,x23,:lo12:Lord
	ldr	x3,[x2]		// bp[0]
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	ldp	x12,x13,[x23,#0]
	ldp	x21,x22,[x23,#16]
	ldr	x23,[x23,#32]

	mul	x14,x4,x3		// a[0]*b[0]
	umulh	x8,x4,x3

	mul	x15,x5,x3		// a[1]*b[0]
	umulh	x9,x5,x3

	mul	x16,x6,x3		// a[2]*b[0]
	umulh	x10,x6,x3

	mul	x17,x7,x3		// a[3]*b[0]
	umulh	x19,x7,x3

	mul	x24,x14,x23

	adds	x15,x15,x8		// accumulate high parts of multiplication
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adc	x19,x19,xzr
	mov	x20,xzr
	ldr	x3,[x2,#8*1]		// b[i]

	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	ldr	x3,[x2,#8*2]		// b[i]

	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	ldr	x3,[x2,#8*3]		// b[i]

	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	lsl	x8,x24,#32		// last reduction
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	subs	x8,x14,x12		// ret -= modulus
	sbcs	x9,x15,x13
	sbcs	x10,x16,x21
	sbcs	x11,x17,x22
	sbcs	xzr,x19,xzr

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret


////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t rep);
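//
// Montgomery-squares a modulo the group order n, rep times in a row
// (Loop_ord_sqr decrements the count held in x2), storing only the final
// result.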
.globl	ecp_nistz256_ord_sqr_mont

.def ecp_nistz256_ord_sqr_mont
   .type 32
.endef
.align	4
ecp_nistz256_ord_sqr_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	x23,Lord
	add	x23,x23,:lo12:Lord
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	ldp	x12,x13,[x23,#0]
	ldp	x21,x22,[x23,#16]
	ldr	x23,[x23,#32]
	b	Loop_ord_sqr

.align	4
Loop_ord_sqr:
	sub	x2,x2,#1
	////////////////////////////////////////////////////////////////
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where each Ax is a 64-bit accumulator word
	//
	//  The "can't overflow" notes below mark carries into the high part of a
	//  multiplication result; the high half of a 64x64-bit product can never
	//  be all ones, so adding a single carry to it cannot overflow.

	mul	x15,x5,x4		// a[1]*a[0]
	umulh	x9,x5,x4
	mul	x16,x6,x4		// a[2]*a[0]
	umulh	x10,x6,x4
	mul	x17,x7,x4		// a[3]*a[0]
	umulh	x19,x7,x4

	adds	x16,x16,x9		// accumulate high parts of multiplication
	mul	x8,x6,x5		// a[2]*a[1]
	umulh	x9,x6,x5
	adcs	x17,x17,x10
	mul	x10,x7,x5		// a[3]*a[1]
	umulh	x11,x7,x5
	adc	x19,x19,xzr		// can't overflow

	mul	x20,x7,x6		// a[3]*a[2]
	umulh	x1,x7,x6

	adds	x9,x9,x10		// accumulate high parts of multiplication
	mul	x14,x4,x4		// a[0]*a[0]
	adc	x10,x11,xzr		// can't overflow

	adds	x17,x17,x8		// accumulate low parts of multiplication
	umulh	x4,x4,x4
	adcs	x19,x19,x9
	mul	x9,x5,x5		// a[1]*a[1]
	adcs	x20,x20,x10
	umulh	x5,x5,x5
	adc	x1,x1,xzr		// can't overflow

	adds	x15,x15,x15	// acc[1-6]*=2
	mul	x10,x6,x6		// a[2]*a[2]
	adcs	x16,x16,x16
	umulh	x6,x6,x6
	adcs	x17,x17,x17
	mul	x11,x7,x7		// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x7,x7,x7
	adcs	x20,x20,x20
	adcs	x1,x1,x1
	adc	x3,xzr,xzr

	adds	x15,x15,x4		// +a[i]*a[i]
	mul	x24,x14,x23
	adcs	x16,x16,x9
	adcs	x17,x17,x5
	adcs	x19,x19,x10
	adcs	x20,x20,x6
	adcs	x1,x1,x11
	adc	x3,x3,x7
	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adc	x17,xzr,x24		// can't overflow
	mul	x11,x14,x23
	lsl	x8,x24,#32
	subs	x15,x15,x24
	lsr	x9,x24,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	subs	xzr,x14,#1
	umulh	x9,x12,x11
	mul	x10,x13,x11
	umulh	x24,x13,x11

	adcs	x10,x10,x9
	adc	x24,x24,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x24
	adcs	x16,x17,x11
	adc	x17,xzr,x11		// can't overflow
	mul	x24,x14,x23
	lsl	x8,x11,#32
	subs	x15,x15,x11
	lsr	x9,x11,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adc	x17,xzr,x24		// can't overflow
	mul	x11,x14,x23
	lsl	x8,x24,#32
	subs	x15,x15,x24
	lsr	x9,x24,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	subs	xzr,x14,#1
	umulh	x9,x12,x11
	mul	x10,x13,x11
	umulh	x24,x13,x11

	adcs	x10,x10,x9
	adc	x24,x24,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x24
	adcs	x16,x17,x11
	adc	x17,xzr,x11		// can't overflow
	lsl	x8,x11,#32
	subs	x15,x15,x11
	lsr	x9,x11,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	adds	x14,x14,x19	// accumulate upper half
	adcs	x15,x15,x20
	adcs	x16,x16,x1
	adcs	x17,x17,x3
	adc	x19,xzr,xzr

	subs	x8,x14,x12		// ret -= modulus
	sbcs	x9,x15,x13
	sbcs	x10,x16,x21
	sbcs	x11,x17,x22
	sbcs	xzr,x19,xzr

	csel	x4,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x5,x15,x9,lo
	csel	x6,x16,x10,lo
	csel	x7,x17,x11,lo

	cbnz	x2,Loop_ord_sqr

	stp	x4,x5,[x0]
	stp	x6,x7,[x0,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
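//
// Constant-time lookup: all 16 table entries of 96 bytes (a Jacobian point,
// 3*256 bits) are read, and the one whose position matches index is folded
// into the output with vector masks, so the memory access pattern does not
// depend on index.  An index of 0 selects nothing and returns all zeros.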
.globl	ecp_nistz256_select_w5

.def ecp_nistz256_select_w5
   .type 32
.endef
.align	4
ecp_nistz256_select_w5:
	AARCH64_VALID_CALL_TARGET

    // x10 := x0
    // w9 := 0; loop counter and incremented internal index
	mov	x10, x0
	mov	w9, #0

    // [v16-v21] := 0
	movi	v16.16b, #0
	movi	v17.16b, #0
	movi	v18.16b, #0
	movi	v19.16b, #0
	movi	v20.16b, #0
	movi	v21.16b, #0

Lselect_w5_loop:
    // Loop 16 times.

    // Increment index (loop counter); tested at the end of the loop
	add	w9, w9, #1

    // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
    //  and advance x1 to point to the next entry
	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

    // x11 := (w9 == w2)? All 1s : All 0s
	cmp	w9, w2
	csetm	x11, eq

    // continue loading ...
	ld1	{v26.2d, v27.2d}, [x1],#32

    // duplicate mask_64 into Mask (all 0s or all 1s)
	dup	v3.2d, x11

    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
    // i.e., values in output registers will remain the same if w9 != w2
	bit	v16.16b, v22.16b, v3.16b
	bit	v17.16b, v23.16b, v3.16b

	bit	v18.16b, v24.16b, v3.16b
	bit	v19.16b, v25.16b, v3.16b

	bit	v20.16b, v26.16b, v3.16b
	bit	v21.16b, v27.16b, v3.16b

    // Loop back while bit #4 of the counter is still 0 (i.e. idx_ctr < 16)
	tbz	w9, #4, Lselect_w5_loop

    // Write [v16-v21] to memory at the output pointer
	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
	st1	{v20.2d, v21.2d}, [x10]

	ret



////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
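//
// Constant-time lookup over 64 affine entries of 64 bytes each; the structure
// mirrors ecp_nistz256_select_w5 above.  A rough C equivalent of the selection
// idea (illustrative only, not part of the generated code; names are made up):
//
//	#include <stdint.h>
//	#include <string.h>
//	// copy table[index-1] into val without an index-dependent branch or load
//	static void select_w7_ref(uint64_t val[8],
//	                          const uint64_t table[64][8], uint32_t index) {
//		memset(val, 0, 8 * sizeof(uint64_t));
//		for (uint32_t i = 1; i <= 64; i++) {
//			uint64_t mask = (uint64_t)0 - (uint64_t)(i == index);
//			for (int j = 0; j < 8; j++)
//				val[j] |= table[i - 1][j] & mask;  // bit-select, like "bit"
//		}
//	}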
.globl	ecp_nistz256_select_w7

.def ecp_nistz256_select_w7
   .type 32
.endef
.align	4
ecp_nistz256_select_w7:
	AARCH64_VALID_CALL_TARGET

    // w9 := 0; loop counter and incremented internal index
	mov	w9, #0

    // [v16-v19] := 0
	movi	v16.16b, #0
	movi	v17.16b, #0
	movi	v18.16b, #0
	movi	v19.16b, #0

Lselect_w7_loop:
    // Loop 64 times.

    // Increment index (loop counter); tested at the end of the loop
	add	w9, w9, #1

    // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
    //  and advance x1 to point to the next entry
	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

    // x11 := (w9 == w2)? All 1s : All 0s
	cmp	w9, w2
	csetm	x11, eq

    // duplicate mask_64 into Mask (all 0s or all 1s)
	dup	v3.2d, x11

    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
    // i.e., values in output registers will remain the same if w9 != w2
	bit	v16.16b, v22.16b, v3.16b
	bit	v17.16b, v23.16b, v3.16b

	bit	v18.16b, v24.16b, v3.16b
	bit	v19.16b, v25.16b, v3.16b

    // Loop back while bit #6 of the counter is still 0 (i.e. idx_ctr < 64)
	tbz	w9, #6, Lselect_w7_loop

    // Write [v16-v19] to memory at the output pointer
	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x0]

	ret

#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
