// xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/gen/bcm/p256-armv8-asm-win.S (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
7#include "openssl/arm_arch.h"
8
// Read-only constants for the NIST P-256 (secp256r1) field/group arithmetic.
.section	.rodata
.align	5
// Field modulus p, little-endian 64-bit limbs:
// p = 0xffffffff00000001_00000000_00000000ffffffff_ffffffffffffffff.
Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
LRR:	//	2^512 mod P precomputed for NIST P256 polynomial
.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
// 1 in the Montgomery domain (i.e. 2^256 mod p).
Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
// Plain integer 1.
Lone:
.quad	1,0,0,0
// Group order n, little-endian 64-bit limbs.
Lord:
.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
// Montgomery constant for reductions modulo the order (used as the per-limb
// multiplier in ecp_nistz256_ord_mul_mont; presumably -n^-1 mod 2^64 — confirm
// against the generating Perl script).
LordK:
.quad	0xccd1c8aaee00bc4f
// ASCII credit string:
// "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro@openssl.org>\0"
.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.text
26
// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
// Montgomery multiplication in the P-256 field: res = a*b*2^-256 mod p.
// Thin ABI wrapper: preloads operands and the needed modulus limbs, then
// calls the shared inner routine, which also stores the result at [x0].
.globl	ecp_nistz256_mul_mont

.def ecp_nistz256_mul_mont
   .type 32
.endef
.align	4
ecp_nistz256_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!	// frame: FP/LR + slot for x19/x20
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]	// callee-saved scratch used by the core

	ldr	x3,[x2]		// bp[0]
	ldp	x4,x5,[x1]		// a[0..1]
	ldp	x6,x7,[x1,#16]	// a[2..3]
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]	// poly[1]
	ldr	x13,[x13,#24]	// poly[3]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
55
56
// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
// Montgomery squaring in the P-256 field: res = a^2*2^-256 mod p.
// ABI wrapper around __ecp_nistz256_sqr_mont.
.globl	ecp_nistz256_sqr_mont

.def ecp_nistz256_sqr_mont
   .type 32
.endef
.align	4
ecp_nistz256_sqr_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]	// callee-saved scratch used by the core

	ldp	x4,x5,[x1]		// a[0..1]
	ldp	x6,x7,[x1,#16]	// a[2..3]
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]	// poly[1]
	ldr	x13,[x13,#24]	// poly[3]

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
83
84
// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
// res = a/2 mod p.  The inner routine conditionally adds p first so the
// shift is exact for odd inputs.
.globl	ecp_nistz256_div_by_2

.def ecp_nistz256_div_by_2
   .type 32
.endef
.align	4
ecp_nistz256_div_by_2:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]	// a[0..1]
	ldp	x16,x17,[x1,#16]	// a[2..3]
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]	// poly[1]
	ldr	x13,[x13,#24]	// poly[3]

	bl	__ecp_nistz256_div_by_2

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
109
110
// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
// res = 2*a mod p, computed as a+a by the shared modular-add routine
// (operands duplicated into both register quads it expects).
.globl	ecp_nistz256_mul_by_2

.def ecp_nistz256_mul_by_2
   .type 32
.endef
.align	4
ecp_nistz256_mul_by_2:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]	// a[0..1] (first addend)
	ldp	x16,x17,[x1,#16]	// a[2..3]
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]	// poly[1]
	ldr	x13,[x13,#24]	// poly[3]
	mov	x8,x14		// second addend = same value
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17

	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
139
140
// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
// res = 3*a mod p, computed as (a+a)+a.  A spare copy of a is parked in
// x4-x7 across the first __ecp_nistz256_add_to call (which preserves them).
.globl	ecp_nistz256_mul_by_3

.def ecp_nistz256_mul_by_3
   .type 32
.endef
.align	4
ecp_nistz256_mul_by_3:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]	// a[0..1]
	ldp	x16,x17,[x1,#16]	// a[2..3]
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]	// poly[1]
	ldr	x13,[x13,#24]	// poly[3]
	mov	x8,x14		// second addend = a
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	mov	x4,x14		// stash a for the second addition
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17

	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a

	mov	x8,x4		// addend = stashed a
	mov	x9,x5
	mov	x10,x6
	mov	x11,x7

	bl	__ecp_nistz256_add_to	// ret += a	// 2*a+a=3*a

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
180
181
// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
//				        const BN_ULONG x2[4]);
// res = a - b mod p.  b is read from [x2] inside __ecp_nistz256_sub_from.
.globl	ecp_nistz256_sub

.def ecp_nistz256_sub
   .type 32
.endef
.align	4
ecp_nistz256_sub:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]	// a[0..1]
	ldp	x16,x17,[x1,#16]	// a[2..3]
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]	// poly[1]
	ldr	x13,[x13,#24]	// poly[3]

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
207
208
// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
// res = -a mod p, computed as 0 - a: the minuend registers are zeroed and
// the input pointer is moved to x2 so __ecp_nistz256_sub_from subtracts it.
.globl	ecp_nistz256_neg

.def ecp_nistz256_neg
   .type 32
.endef
.align	4
ecp_nistz256_neg:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x2,x1		// subtrahend pointer = input
	mov	x14,xzr		// a = 0
	mov	x15,xzr
	mov	x16,xzr
	mov	x17,xzr
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]	// poly[1]
	ldr	x13,[x13,#24]	// poly[3]

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
236
237
// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to x4-x7 and b[0] - to x3
//
// Core Montgomery multiplication: interleaves four rounds of schoolbook
// multiply-accumulate (one per limb of b) with a reduction step that uses
// the special form of the P-256 prime — multiplying the dropped limb by p
// reduces to shifts and adds/subs (the "*0xffff0001" lines below).
// In:       x2 -> b, x3 = b[0], x4-x7 = a[0..3], x12 = poly[1], x13 = poly[3]
// Out:      reduced result stored at [x0] and left in x14-x17
// Clobbers: x3, x8-x11, x14-x17, x19, x20, flags
.def __ecp_nistz256_mul_mont
   .type 32
.endef
.align	4
__ecp_nistz256_mul_mont:
	mul	x14,x4,x3		// a[0]*b[0]
	umulh	x8,x4,x3

	mul	x15,x5,x3		// a[1]*b[0]
	umulh	x9,x5,x3

	mul	x16,x6,x3		// a[2]*b[0]
	umulh	x10,x6,x3

	mul	x17,x7,x3		// a[3]*b[0]
	umulh	x11,x7,x3
	ldr	x3,[x2,#8]		// b[1]

	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adc	x19,xzr,x11
	mov	x20,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(1+1)]	// b[1+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(2+1)]	// b[2+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	// last reduction
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adcs	x17,x19,x11
	adc	x19,x20,xzr

	// Final conditional subtraction of p (p[0] = 2^64-1, so "-p[0]" is "+1").
	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret
374
375
// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to x4-x7
//
// Core Montgomery squaring: computes the 512-bit square using the
// doubled-cross-products trick, then performs four shift-based reduction
// rounds followed by a conditional subtraction of p.
// In:       x4-x7 = a[0..3], x12 = poly[1], x13 = poly[3]
// Out:      reduced result stored at [x0] and left in x14-x17
// Clobbers: x1, x2, x4-x11, x14-x17, x19, x20, flags
.def __ecp_nistz256_sqr_mont
   .type 32
.endef
.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax are the 64-bit words of the sum
	//
	//  "can't overflow" below mark carrying into high part of
	//  multiplication result, which can't overflow, because it
	//  can never be all ones.

	mul	x15,x5,x4		// a[1]*a[0]
	umulh	x9,x5,x4
	mul	x16,x6,x4		// a[2]*a[0]
	umulh	x10,x6,x4
	mul	x17,x7,x4		// a[3]*a[0]
	umulh	x19,x7,x4

	adds	x16,x16,x9		// accumulate high parts of multiplication
	mul	x8,x6,x5		// a[2]*a[1]
	umulh	x9,x6,x5
	adcs	x17,x17,x10
	mul	x10,x7,x5		// a[3]*a[1]
	umulh	x11,x7,x5
	adc	x19,x19,xzr		// can't overflow

	mul	x20,x7,x6		// a[3]*a[2]
	umulh	x1,x7,x6

	adds	x9,x9,x10		// accumulate high parts of multiplication
	mul	x14,x4,x4		// a[0]*a[0]
	adc	x10,x11,xzr		// can't overflow

	adds	x17,x17,x8		// accumulate low parts of multiplication
	umulh	x4,x4,x4
	adcs	x19,x19,x9
	mul	x9,x5,x5		// a[1]*a[1]
	adcs	x20,x20,x10
	umulh	x5,x5,x5
	adc	x1,x1,xzr		// can't overflow

	adds	x15,x15,x15	// acc[1-6]*=2
	mul	x10,x6,x6		// a[2]*a[2]
	adcs	x16,x16,x16
	umulh	x6,x6,x6
	adcs	x17,x17,x17
	mul	x11,x7,x7		// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x7,x7,x7
	adcs	x20,x20,x20
	adcs	x1,x1,x1
	adc	x2,xzr,xzr

	adds	x15,x15,x4		// +a[i]*a[i]
	adcs	x16,x16,x9
	adcs	x17,x17,x5
	adcs	x19,x19,x10
	adcs	x20,x20,x6
	lsl	x8,x14,#32
	adcs	x1,x1,x11
	lsr	x9,x14,#32
	adc	x2,x2,x7
	// Four reduction rounds over the low half (one per dropped limb).
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adc	x17,x11,xzr		// can't overflow

	adds	x14,x14,x19	// accumulate upper half
	adcs	x15,x15,x20
	adcs	x16,x16,x1
	adcs	x17,x17,x2
	adc	x19,xzr,xzr

	// Final conditional subtraction of p (p[0] = 2^64-1, so "-p[0]" is "+1").
	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret
499
500
// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
// x4-x7 and x8-x11. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
//
// res = a + b mod p: a in x14-x17, b in x8-x11.  The reduced sum is left
// in x14-x17 and stored at [x0].  x1 is clobbered as the carry word;
// x12/x13 must hold poly[1]/poly[3].  x4-x7 are preserved.
.def __ecp_nistz256_add_to
   .type 32
.endef
.align	4
__ecp_nistz256_add_to:
	adds	x14,x14,x8		// ret = a+b
	adcs	x15,x15,x9
	adcs	x16,x16,x10
	adcs	x17,x17,x11
	adc	x1,xzr,xzr		// zap x1

	// Conditional subtraction of p (p[0] = 2^64-1, so "-p[0]" is "+1").
	adds	x8,x14,#1		// subs	x8,x4,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x1,xzr		// did subtraction borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret
529
530
// res = a - b mod p: a in x14-x17, b loaded from [x2].  If the raw
// subtraction borrows, p is added back.  Result is left in x14-x17 and
// stored at [x0]; x1 is clobbered as the borrow word.
.def __ecp_nistz256_sub_from
   .type 32
.endef
.align	4
__ecp_nistz256_sub_from:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x14,x8		// ret = a-b
	sbcs	x15,x15,x9
	sbcs	x16,x16,x10
	sbcs	x17,x17,x11
	sbc	x1,xzr,xzr		// zap x1

	// Conditional addition of p (p[0] = 2^64-1, so "+p[0]" is "-1").
	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret
558
559
// res = b - a mod p ("morf" = "from" reversed: same as sub_from but with
// the operands swapped).  a in x14-x17, b loaded from [x2]; result left
// in x14-x17 and stored at [x0]; x1 clobbered as the borrow word.
.def __ecp_nistz256_sub_morf
   .type 32
.endef
.align	4
__ecp_nistz256_sub_morf:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x8,x14		// ret = b-a
	sbcs	x15,x9,x15
	sbcs	x16,x10,x16
	sbcs	x17,x11,x17
	sbc	x1,xzr,xzr		// zap x1

	// Conditional addition of p (p[0] = 2^64-1, so "+p[0]" is "-1").
	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret
587
588
// res = a/2 mod p: a in x14-x17.  Computes a+p, then selects a (if even)
// or a+p (if odd) and shifts right by one, with x1 holding the carry out
// of the 4-limb value that becomes the top bit after the shift.
// Result stored at [x0]; x1 and x8-x11 clobbered.
.def __ecp_nistz256_div_by_2
   .type 32
.endef
.align	4
__ecp_nistz256_div_by_2:
	// a + p (p[0] = 2^64-1, so "+p[0]" is "-1").
	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = a+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adcs	x11,x17,x13
	adc	x1,xzr,xzr		// zap x1
	tst	x14,#1		// is a even?

	csel	x14,x14,x8,eq	// ret = even ? a : a+modulus
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	csel	x17,x17,x11,eq
	csel	x1,xzr,x1,eq	// carry only matters in the odd case

	lsr	x14,x14,#1		// ret >>= 1
	orr	x14,x14,x15,lsl#63
	lsr	x15,x15,#1
	orr	x15,x15,x16,lsl#63
	lsr	x16,x16,#1
	orr	x16,x16,x17,lsl#63
	lsr	x17,x17,#1
	stp	x14,x15,[x0]
	orr	x17,x17,x1,lsl#63	// shift the 257th bit back in
	stp	x16,x17,[x0,#16]

	ret
619
// Point doubling in Jacobian coordinates.  x0 -> output point, x1 -> input
// point; points are laid out as three consecutive 32-byte field elements
// (x at +0, y at +32, z at +64 — inferred from the offsets used below;
// confirm against the C-side P256_POINT layout).
// Stack temporaries (32 bytes each): S at sp+0, M at sp+32, Zsqr at sp+64,
// tmp0 at sp+96 (names from the p256_* call comments).
// Ldouble_shortcut is entered from ecp_nistz256_point_add's Ladd_double
// path with an equivalent register/frame state.
.globl	ecp_nistz256_point_double

.def ecp_nistz256_point_double
   .type 32
.endef
.align	5
ecp_nistz256_point_double:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4		// four 32-byte field-element temporaries

Ldouble_shortcut:
	ldp	x14,x15,[x1,#32]	// in_y
	mov	x21,x0		// x21 = result pointer
	ldp	x16,x17,[x1,#48]
	mov	x22,x1		// x22 = input pointer
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]	// poly[1]
	mov	x8,x14
	ldr	x13,[x13,#24]	// poly[3]
	mov	x9,x15
	ldp	x4,x5,[x22,#64]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[x22,#64+16]
	add	x0,sp,#0
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);

	add	x0,sp,#64
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	x8,x9,[x22]
	ldp	x10,x11,[x22,#16]
	mov	x4,x14		// put Zsqr aside for p256_sub
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);

	add	x2,x22,#0
	mov	x14,x4		// restore Zsqr
	mov	x15,x5
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x16,x6
	mov	x17,x7
	ldp	x6,x7,[sp,#0+16]
	add	x0,sp,#64
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	x0,sp,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[x22,#64]
	ldp	x6,x7,[x22,#64+16]
	add	x2,x22,#32
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#0+16]
	add	x0,x21,#64
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);

	add	x0,sp,#96
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	ldr	x3,[sp,#64]		// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x0,x21,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	x2,sp,#64
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	x8,x14		// duplicate M
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	mov	x4,x14		// put M aside
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add_to
	mov	x8,x4			// restore M
	mov	x9,x5
	ldr	x3,[x22]		// forward load for p256_mul_mont
	mov	x10,x6
	ldp	x4,x5,[sp,#0]
	mov	x11,x7
	ldp	x6,x7,[sp,#0+16]
	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);

	add	x2,x22,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#32]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#32+16]
	add	x0,sp,#96
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);

	add	x0,x21,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	x2,sp,#96
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	x2,sp,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	x3,[sp,#32]
	mov	x4,x14		// copy S
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x2,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	x2,x21,#32
	add	x0,x21,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
766
// Full Jacobian point addition: out = in1 + in2 (x0, x1, x2 respectively;
// points are (x,y,z) triples of 32-byte field elements at offsets 0/32/64).
// Stack temporaries (from the p256_* call comments): res_x sp+0,
// res_y sp+32, res_z sp+64, H sp+96, Z1sqr/Hsqr sp+128, R sp+160,
// Z2sqr/Rsqr sp+192, Hcub sp+224, U1 sp+256, U2 sp+288, S1 sp+320,
// S2 sp+352.  x24/x25 hold all-ones masks for ~in1infty/~in2infty,
// used by the branchless final selection between res/in1/in2.
// If the inputs are equal (and neither is infinity) control transfers to
// Ldouble_shortcut in ecp_nistz256_point_double.
.globl	ecp_nistz256_point_add

.def ecp_nistz256_point_add
   .type 32
.endef
.align	5
ecp_nistz256_point_add:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#32*12		// twelve 32-byte field-element temporaries

	ldp	x4,x5,[x2,#64]	// in2_z
	ldp	x6,x7,[x2,#64+16]
	mov	x21,x0		// x21 = result pointer
	mov	x22,x1		// x22 = in1 pointer
	mov	x23,x2		// x23 = in2 pointer
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]	// poly[1]
	ldr	x13,[x13,#24]	// poly[3]
	orr	x8,x4,x5		// is in2_z zero (point at infinity)?
	orr	x10,x6,x7
	orr	x25,x8,x10
	cmp	x25,#0
	csetm	x25,ne		// ~in2infty
	add	x0,sp,#192
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	x4,x5,[x22,#64]	// in1_z
	ldp	x6,x7,[x22,#64+16]
	orr	x8,x4,x5		// is in1_z zero (point at infinity)?
	orr	x10,x6,x7
	orr	x24,x8,x10
	cmp	x24,#0
	csetm	x24,ne		// ~in1infty
	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	x3,[x23,#64]
	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x2,x23,#64
	add	x0,sp,#320
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x22,#64
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[sp,#320]
	ldp	x6,x7,[sp,#320+16]
	add	x2,x22,#32
	add	x0,sp,#320
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#352]
	ldp	x6,x7,[sp,#352+16]
	add	x2,x23,#32
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	x2,sp,#320
	ldr	x3,[sp,#192]	// forward load for p256_mul_mont
	ldp	x4,x5,[x22]
	ldp	x6,x7,[x22,#16]
	add	x0,sp,#160
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);

	orr	x14,x14,x15	// see if result is zero
	orr	x16,x16,x17
	orr	x26,x14,x16	// ~is_equal(S1,S2)

	add	x2,sp,#192
	add	x0,sp,#256
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	x3,[sp,#128]
	ldp	x4,x5,[x23]
	ldp	x6,x7,[x23,#16]
	add	x2,sp,#128
	add	x0,sp,#288
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	x2,sp,#256
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#96
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);

	orr	x14,x14,x15	// see if result is zero
	orr	x16,x16,x17
	orr	x14,x14,x16	// ~is_equal(U1,U2)

	mvn	x27,x24	// -1/0 -> 0/-1
	mvn	x28,x25	// -1/0 -> 0/-1
	orr	x14,x14,x27
	orr	x14,x14,x28
	orr	x14,x14,x26
	cbnz	x14,Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

Ladd_double:
	// Inputs are the same finite point: fall through to doubling, reusing
	// the saved-register area of this frame (x19-x22 slots line up).
	mov	x1,x22
	mov	x0,x21
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	add	sp,sp,#256	// #256 is from #32*(12-4). difference in stack frames
	b	Ldouble_shortcut

.align	4
Ladd_proceed:
	add	x0,sp,#192
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#96]
	ldp	x6,x7,[sp,#96+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldp	x4,x5,[sp,#96]
	ldp	x6,x7,[sp,#96+16]
	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldr	x3,[x23,#64]
	ldp	x4,x5,[sp,#64]
	ldp	x6,x7,[sp,#64+16]
	add	x2,x23,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	x3,[sp,#96]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,sp,#96
	add	x0,sp,#224
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[sp,#128]
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x2,sp,#128
	add	x0,sp,#288
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#128
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#192
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#224
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#288
	ldr	x3,[sp,#224]		// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#320]
	ldp	x6,x7,[sp,#320+16]
	add	x0,sp,#32
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	x2,sp,#224
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#160
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#352
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	// Branchless output selection: out = in1infty ? in2 : (in2infty ? in1 : res),
	// done 32 bytes at a time using the x24/x25 masks.
	ldp	x4,x5,[sp,#0]		// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]		// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]
	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]
	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

Ladd_done:
	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
1025
// Mixed Jacobian + affine point addition: out = in1 (Jacobian, x1) +
// in2 (affine x,y pair, x2).  Cheaper than the full add since in2_z == 1
// implicitly.  Stack temporaries (from the p256_* call comments):
// res_x sp+0, res_y sp+32, res_z sp+64, U2 sp+96, Z1sqr/S2 sp+128,
// H sp+160, R sp+192, Hsqr sp+224, Hcub sp+256, Rsqr sp+288.
// x24/x25 hold all-ones masks for ~in1infty/~in2infty used in the
// branchless output selection.
.globl	ecp_nistz256_point_add_affine

.def ecp_nistz256_point_add_affine
   .type 32
.endef
.align	5
ecp_nistz256_point_add_affine:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10		// ten 32-byte field-element temporaries

	mov	x21,x0		// x21 = result pointer
	mov	x22,x1		// x22 = in1 pointer
	mov	x23,x2		// x23 = in2 pointer
	adrp	x13,Lpoly
	add	x13,x13,:lo12:Lpoly
	ldr	x12,[x13,#8]	// poly[1]
	ldr	x13,[x13,#24]	// poly[3]

	ldp	x4,x5,[x1,#64]	// in1_z
	ldp	x6,x7,[x1,#64+16]
	orr	x8,x4,x5		// is in1_z zero (point at infinity)?
	orr	x10,x6,x7
	orr	x24,x8,x10
	cmp	x24,#0
	csetm	x24,ne		// ~in1infty

	ldp	x14,x15,[x2]	// in2_x
	ldp	x16,x17,[x2,#16]
	ldp	x8,x9,[x2,#32]	// in2_y
	ldp	x10,x11,[x2,#48]
	orr	x14,x14,x15	// affine infinity is encoded as all-zero x,y
	orr	x16,x16,x17
	orr	x8,x8,x9
	orr	x10,x10,x11
	orr	x14,x14,x16
	orr	x8,x8,x10
	orr	x25,x14,x8
	cmp	x25,#0
	csetm	x25,ne		// ~in2infty

	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	x4,x14
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	ldr	x3,[x23]
	add	x2,x23,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	x2,x22,#0
	ldr	x3,[x22,#64]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x0,sp,#160
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	x2,x22,#64
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#160]
	ldp	x6,x7,[sp,#160+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x23,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	x2,x22,#32
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#192
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	x0,sp,#224
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x0,sp,#288
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,sp,#160
	add	x0,sp,#256
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[x22]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,x22,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#224
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#288
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#256
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#96
	ldr	x3,[x22,#32]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x0,sp,#32
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	x2,x22,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	x3,[sp,#192]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#192
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#128
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	// Branchless output selection, as in point_add, but the in2 "z"
	// coordinate is synthesized: x23 is repointed 64 bytes below
	// Lone_mont so the [x23,#32+32]/[x23,#32+48] loads below fetch 1 in
	// Montgomery form (an affine point has implicit z = 1).
	ldp	x4,x5,[sp,#0]		// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]		// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]
	adrp	x23,Lone_mont-64
	add	x23,x23,:lo12:Lone_mont-64
	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]
	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
1237
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
//
// Montgomery multiplication modulo ord, the order of the NIST P-256
// group (the Lord constant in .rodata):
//     res = a * b * 2^-256 mod ord
// The four word-by-word Montgomery iterations are fully unrolled, and
// each reduction step is interleaved with the partial products of the
// next multiplier word b[i] to hide multiplier latency.
//
// Register use:
//   x3                  current multiplier word b[i]
//   x4-x7               a[0..3]
//   x12,x13,x21,x22     ord[0..3]
//   x23                 ordK = -ord^-1 mod 2^64 (LordK, stored at Lord+32)
//   x14-x17,x19,x20     accumulator
//   x24                 per-iteration Montgomery factor t = acc[0]*ordK
.globl	ecp_nistz256_ord_mul_mont

.def ecp_nistz256_ord_mul_mont
   .type 32
.endef
.align	4
ecp_nistz256_ord_mul_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	x23,Lord
	add	x23,x23,:lo12:Lord
	ldr	x3,[x2]		// bp[0]
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	ldp	x12,x13,[x23,#0]	// ord[0..1]
	ldp	x21,x22,[x23,#16]	// ord[2..3]
	ldr	x23,[x23,#32]		// ordK

	// acc = a * b[0]
	mul	x14,x4,x3		// a[0]*b[0]
	umulh	x8,x4,x3

	mul	x15,x5,x3		// a[1]*b[0]
	umulh	x9,x5,x3

	mul	x16,x6,x3		// a[2]*b[0]
	umulh	x10,x6,x3

	mul	x17,x7,x3		// a[3]*b[0]
	umulh	x19,x7,x3

	mul	x24,x14,x23		// t = acc[0]*ordK mod 2^64

	adds	x15,x15,x8		// accumulate high parts of multiplication
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adc	x19,x19,xzr
	mov	x20,xzr
	ldr	x3,[x2,#8*1]		// b[i]

	// acc += t*ord for the upper words: ord[2] = 2^64-1 and
	// ord[3] = 2^64-2^32 (see Lord), so t*ord[2..3] is formed with
	// shifts and subtractions instead of multiplications.
	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	// By choice of t, acc[0] + lo(t*ord[0]) == 0 mod 2^64; its carry-out
	// is exactly (acc[0] != 0), which this compare produces without
	// computing the multiply.  t*ord[0..1] high parts follow.
	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3		// interleaved: low products of a[]*b[1]
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10		// reduction shifts accumulator down one word
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23		// next Montgomery factor
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	ldr	x3,[x2,#8*2]		// b[i]

	// Same reduce-then-multiply pattern, now interleaved with b[2].
	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	ldr	x3,[x2,#8*3]		// b[i]

	// Same pattern once more, interleaved with b[3].
	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	lsl	x8,x24,#32		// last reduction
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	// Final conditional subtraction keeps the result in [0, ord).
	subs	x8,x14,x12		// ret -= modulus
	sbcs	x9,x15,x13
	sbcs	x10,x16,x21
	sbcs	x11,x17,x22
	sbcs	xzr,x19,xzr

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64		// x30 deliberately not reloaded; see PAuth note above
	ret
1450
1451
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t rep);
//
// Repeated Montgomery squaring modulo ord, the order of the NIST P-256
// group: performs rep consecutive iterations of a = a*a*2^-256 mod ord
// and stores the final value to res.  The loop is do-while shaped
// (decrement at the top, cbnz at the bottom), so it assumes rep >= 1;
// NOTE(review): rep == 0 would underflow the counter — confirm callers
// always pass a positive count.
//
// Register use:
//   x2                  remaining repetition count
//   x4-x7               current value; also receives the per-iteration result
//   x12,x13,x21,x22     ord[0..3]
//   x23                 ordK = -ord^-1 mod 2^64 (LordK, at Lord+32)
//   x1                  input pointer, reused as scratch once a[] is loaded
.globl	ecp_nistz256_ord_sqr_mont

.def ecp_nistz256_ord_sqr_mont
   .type 32
.endef
.align	4
ecp_nistz256_ord_sqr_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	x23,Lord
	add	x23,x23,:lo12:Lord
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	ldp	x12,x13,[x23,#0]
	ldp	x21,x22,[x23,#16]
	ldr	x23,[x23,#32]
	b	Loop_ord_sqr

.align	4
Loop_ord_sqr:
	sub	x2,x2,#1
	////////////////////////////////////////////////////////////////
	// Squaring layout: cross products are summed, doubled, then the
	// diagonal squares are added in.
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax denotes the x-th 64-bit
	//  accumulator word of the 512-bit square.
	//
	//  "can't overflow" below mark carrying into high part of
	//  multiplication result, which can't overflow, because it
	//  can never be all ones.

	mul	x15,x5,x4		// a[1]*a[0]
	umulh	x9,x5,x4
	mul	x16,x6,x4		// a[2]*a[0]
	umulh	x10,x6,x4
	mul	x17,x7,x4		// a[3]*a[0]
	umulh	x19,x7,x4

	adds	x16,x16,x9		// accumulate high parts of multiplication
	mul	x8,x6,x5		// a[2]*a[1]
	umulh	x9,x6,x5
	adcs	x17,x17,x10
	mul	x10,x7,x5		// a[3]*a[1]
	umulh	x11,x7,x5
	adc	x19,x19,xzr		// can't overflow

	mul	x20,x7,x6		// a[3]*a[2]
	umulh	x1,x7,x6		// x1 (input ptr) reused as scratch from here

	adds	x9,x9,x10		// accumulate high parts of multiplication
	mul	x14,x4,x4		// a[0]*a[0]
	adc	x10,x11,xzr		// can't overflow

	adds	x17,x17,x8		// accumulate low parts of multiplication
	umulh	x4,x4,x4
	adcs	x19,x19,x9
	mul	x9,x5,x5		// a[1]*a[1]
	adcs	x20,x20,x10
	umulh	x5,x5,x5
	adc	x1,x1,xzr		// can't overflow

	adds	x15,x15,x15	// acc[1-6]*=2
	mul	x10,x6,x6		// a[2]*a[2]
	adcs	x16,x16,x16
	umulh	x6,x6,x6
	adcs	x17,x17,x17
	mul	x11,x7,x7		// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x7,x7,x7
	adcs	x20,x20,x20
	adcs	x1,x1,x1
	adc	x3,xzr,xzr

	adds	x15,x15,x4		// +a[i]*a[i]
	mul	x24,x14,x23		// t = acc[0]*ordK, Montgomery factor
	adcs	x16,x16,x9
	adcs	x17,x17,x5
	adcs	x19,x19,x10
	adcs	x20,x20,x6
	adcs	x1,x1,x11
	adc	x3,x3,x7
	// Four Montgomery reduction steps on the low half (x14-x17).
	// By choice of t, acc[0] + lo(t*ord[0]) == 0 mod 2^64; the compare
	// below produces its carry-out, (acc[0] != 0), without the multiply.
	// ord[2] = 2^64-1 and ord[3] = 2^64-2^32, so t*ord[2..3] is formed
	// with shifts and subtractions.  x24 and x11 alternate as the
	// current/next Montgomery factor.
	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adc	x17,xzr,x24		// can't overflow
	mul	x11,x14,x23
	lsl	x8,x24,#32
	subs	x15,x15,x24
	lsr	x9,x24,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	subs	xzr,x14,#1
	umulh	x9,x12,x11
	mul	x10,x13,x11
	umulh	x24,x13,x11

	adcs	x10,x10,x9
	adc	x24,x24,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x24
	adcs	x16,x17,x11
	adc	x17,xzr,x11		// can't overflow
	mul	x24,x14,x23
	lsl	x8,x11,#32
	subs	x15,x15,x11
	lsr	x9,x11,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adc	x17,xzr,x24		// can't overflow
	mul	x11,x14,x23
	lsl	x8,x24,#32
	subs	x15,x15,x24
	lsr	x9,x24,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	subs	xzr,x14,#1
	umulh	x9,x12,x11
	mul	x10,x13,x11
	umulh	x24,x13,x11

	adcs	x10,x10,x9
	adc	x24,x24,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x24
	adcs	x16,x17,x11
	adc	x17,xzr,x11		// can't overflow
	lsl	x8,x11,#32
	subs	x15,x15,x11
	lsr	x9,x11,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	adds	x14,x14,x19	// accumulate upper half
	adcs	x15,x15,x20
	adcs	x16,x16,x1
	adcs	x17,x17,x3
	adc	x19,xzr,xzr

	// Conditional subtraction keeps the result in [0, ord).
	subs	x8,x14,x12		// ret -= modulus
	sbcs	x9,x15,x13
	sbcs	x10,x16,x21
	sbcs	x11,x17,x22
	sbcs	xzr,x19,xzr

	csel	x4,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x5,x15,x9,lo
	csel	x6,x16,x10,lo
	csel	x7,x17,x11,lo

	cbnz	x2,Loop_ord_sqr

	stp	x4,x5,[x0]
	stp	x6,x7,[x0,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64		// x30 deliberately not reloaded; see PAuth note above
	ret
1646
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
//
// Constant-time table lookup: copies entry number `index` (1-based)
// from a table of 16 entries of 3*256 bits each (a projective point)
// into val.  All 16 entries are read unconditionally and combined under
// an all-ones/all-zeros mask, so the memory access pattern is
// independent of the secret index.  An index of 0 (or out of range)
// matches no entry and leaves val all-zero.
.globl	ecp_nistz256_select_w5

.def ecp_nistz256_select_w5
   .type 32
.endef
.align	4
ecp_nistz256_select_w5:
	AARCH64_VALID_CALL_TARGET

    // x10 := x0 (output pointer; x0 preserved)
    // w9 := 0; loop counter and incremented internal index
	mov	x10, x0
	mov	w9, #0

    // [v16-v21] := 0 (accumulated selection)
	movi	v16.16b, #0
	movi	v17.16b, #0
	movi	v18.16b, #0
	movi	v19.16b, #0
	movi	v20.16b, #0
	movi	v21.16b, #0

Lselect_w5_loop:
    // Loop 16 times.

    // Increment index (loop counter); tested at the end of the loop
	add	w9, w9, #1

    // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
    //  and advance x1 to point to the next entry
	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

    // x11 := (w9 == w2)? All 1s : All 0s
	cmp	w9, w2
	csetm	x11, eq

    // continue loading ...
	ld1	{v26.2d, v27.2d}, [x1],#32

    // duplicate mask_64 into Mask (all 0s or all 1s)
	dup	v3.2d, x11

    // [v16-v21] := (Mask == all 1s)? [v22-v27] : [v16-v21]
    // i.e., values in output registers will remain the same if w9 != w2
	bit	v16.16b, v22.16b, v3.16b
	bit	v17.16b, v23.16b, v3.16b

	bit	v18.16b, v24.16b, v3.16b
	bit	v19.16b, v25.16b, v3.16b

	bit	v20.16b, v26.16b, v3.16b
	bit	v21.16b, v27.16b, v3.16b

    // Loop back while bit #4 of the counter is still 0, i.e. while
    // idx_ctr < 16 (tbz branches when the tested bit is zero)
	tbz	w9, #4, Lselect_w5_loop

    // Write [v16-v21] to memory at the output pointer
	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
	st1	{v20.2d, v21.2d}, [x10]

	ret
1710
1711
1712
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
//
// Constant-time table lookup: copies entry number `index` (1-based)
// from a table of 64 entries of 2*256 bits each (an affine point) into
// val.  All 64 entries are read unconditionally and combined under an
// all-ones/all-zeros mask, so the memory access pattern is independent
// of the secret index.  An index of 0 (or out of range) matches no
// entry and leaves val all-zero.
.globl	ecp_nistz256_select_w7

.def ecp_nistz256_select_w7
   .type 32
.endef
.align	4
ecp_nistz256_select_w7:
	AARCH64_VALID_CALL_TARGET

    // w9 := 0; loop counter and incremented internal index
	mov	w9, #0

    // [v16-v19] := 0 (accumulated selection)
	movi	v16.16b, #0
	movi	v17.16b, #0
	movi	v18.16b, #0
	movi	v19.16b, #0

Lselect_w7_loop:
    // Loop 64 times.

    // Increment index (loop counter); tested at the end of the loop
	add	w9, w9, #1

    // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
    //  and advance x1 to point to the next entry
	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

    // x11 := (w9 == w2)? All 1s : All 0s
	cmp	w9, w2
	csetm	x11, eq

    // duplicate mask_64 into Mask (all 0s or all 1s)
	dup	v3.2d, x11

    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
    // i.e., values in output registers will remain the same if w9 != w2
	bit	v16.16b, v22.16b, v3.16b
	bit	v17.16b, v23.16b, v3.16b

	bit	v18.16b, v24.16b, v3.16b
	bit	v19.16b, v25.16b, v3.16b

    // Loop back while bit #6 of the counter is still 0, i.e. while
    // idx_ctr < 64 (tbz branches when the tested bit is zero)
	tbz	w9, #6, Lselect_w7_loop

    // Write [v16-v19] to memory at the output pointer
	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x0]

	ret
1765
1766#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
1767