// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include "openssl/arm_arch.h"

.section	.rodata
.align	5
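// The NIST P-256 prime, p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
// as four little-endian 64-bit limbs.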
.Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.LRR:	//	2^512 mod P precomputed for NIST P256 polynomial
.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
.Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad	1,0,0,0
.Lord:
.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:
.quad	0xccd1c8aaee00bc4f
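// .Lord is the P-256 group order n; .LordK is the Montgomery constant
// -n^(-1) mod 2^64 used to derive the per-limb reduction multiplier in
// the ord_* routines below.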
.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.text

// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
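// Montgomery product: computes x0 = x1*x2*2^-256 mod p, so for inputs in
// the Montgomery domain (a*2^256 mod p) the result stays in that domain.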
.globl	ecp_nistz256_mul_mont
.hidden	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	x3,[x2]		// bp[0]
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
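// Montgomery squaring: x0 = x1^2*2^-256 mod p. Cheaper than the general
// product because each cross term a[i]*a[j] is computed once and doubled.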
.globl	ecp_nistz256_sqr_mont
.hidden	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
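// x0 = x1/2 mod p: when x1 is odd, the odd modulus p is added first so
// the low bit clears, then the 256-bit value is shifted right by one.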
.globl	ecp_nistz256_div_by_2
.hidden	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	bl	__ecp_nistz256_div_by_2

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
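// x0 = 2*x1 mod p, computed as a single modular addition a+a.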
.globl	ecp_nistz256_mul_by_2
.hidden	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]
	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17

	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
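// x0 = 3*x1 mod p, computed as two modular additions: (a+a)+a.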
.globl	ecp_nistz256_mul_by_3
.hidden	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]
	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	mov	x4,x14
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17

	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a

	mov	x8,x4
	mov	x9,x5
	mov	x10,x6
	mov	x11,x7

	bl	__ecp_nistz256_add_to	// ret += a	// 2*a+a=3*a

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
//				        const BN_ULONG x2[4]);
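// x0 = x1-x2 mod p; a borrow out of the raw subtraction is repaired by
// conditionally adding p back (see __ecp_nistz256_sub_from below).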
.globl	ecp_nistz256_sub
.hidden	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
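// x0 = -x1 mod p, computed as the modular subtraction 0 - x1.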
.globl	ecp_nistz256_neg
.hidden	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x2,x1
	mov	x14,xzr		// a = 0
	mov	x15,xzr
	mov	x16,xzr
	mov	x17,xzr
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

// Note that __ecp_nistz256_mul_mont expects the a[0-3] input pre-loaded
// to x4-x7 and b[0] to x3.
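//
// Each reduction step below relies on p[0] == 2^64-1: this makes
// -p^(-1) mod 2^64 == 1, so the Montgomery multiplier is acc[0] itself
// and acc[0]*p can be folded in with shifts instead of multiplies.
// A rough C-style sketch of one step (illustrative only):
//
//	m = acc[0];				// multiplier
//	acc += m << 96;				// x8/x9 below
//	acc += (m * 0xffffffff00000001) << 192;	// x10/x11, "*0xffff0001"
//	acc -= m;		// low limb is now 0 and is dropped,
//				// i.e. acc is shifted right by 64 bits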
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	mul	x14,x4,x3		// a[0]*b[0]
	umulh	x8,x4,x3

	mul	x15,x5,x3		// a[1]*b[0]
	umulh	x9,x5,x3

	mul	x16,x6,x3		// a[2]*b[0]
	umulh	x10,x6,x3

	mul	x17,x7,x3		// a[3]*b[0]
	umulh	x11,x7,x3
	ldr	x3,[x2,#8]		// b[1]

	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adc	x19,xzr,x11
	mov	x20,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(1+1)]	// b[1+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(2+1)]	// b[2+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	// last reduction
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adcs	x17,x19,x11
	adc	x19,x20,xzr

	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

// Note that __ecp_nistz256_sqr_mont expects the a[0-3] input pre-loaded
// to x4-x7.
.type	__ecp_nistz256_sqr_mont,%function
.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is acc[x], i.e. the
	//  accumulator registers
	//
	//  "can't overflow" below marks carrying into the high part of a
	//  multiplication result, which can't overflow because it
	//  can never be all ones.

	mul	x15,x5,x4		// a[1]*a[0]
	umulh	x9,x5,x4
	mul	x16,x6,x4		// a[2]*a[0]
	umulh	x10,x6,x4
	mul	x17,x7,x4		// a[3]*a[0]
	umulh	x19,x7,x4

	adds	x16,x16,x9		// accumulate high parts of multiplication
	mul	x8,x6,x5		// a[2]*a[1]
	umulh	x9,x6,x5
	adcs	x17,x17,x10
	mul	x10,x7,x5		// a[3]*a[1]
	umulh	x11,x7,x5
	adc	x19,x19,xzr		// can't overflow

	mul	x20,x7,x6		// a[3]*a[2]
	umulh	x1,x7,x6

	adds	x9,x9,x10		// accumulate high parts of multiplication
	mul	x14,x4,x4		// a[0]*a[0]
	adc	x10,x11,xzr		// can't overflow

	adds	x17,x17,x8		// accumulate low parts of multiplication
	umulh	x4,x4,x4
	adcs	x19,x19,x9
	mul	x9,x5,x5		// a[1]*a[1]
	adcs	x20,x20,x10
	umulh	x5,x5,x5
	adc	x1,x1,xzr		// can't overflow

	adds	x15,x15,x15	// acc[1-6]*=2
	mul	x10,x6,x6		// a[2]*a[2]
	adcs	x16,x16,x16
	umulh	x6,x6,x6
	adcs	x17,x17,x17
	mul	x11,x7,x7		// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x7,x7,x7
	adcs	x20,x20,x20
	adcs	x1,x1,x1
	adc	x2,xzr,xzr

	adds	x15,x15,x4		// +a[i]*a[i]
	adcs	x16,x16,x9
	adcs	x17,x17,x5
	adcs	x19,x19,x10
	adcs	x20,x20,x6
	lsl	x8,x14,#32
	adcs	x1,x1,x11
	lsr	x9,x14,#32
	adc	x2,x2,x7
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adc	x17,x11,xzr		// can't overflow

	adds	x14,x14,x19	// accumulate upper half
	adcs	x15,x15,x20
	adcs	x16,x16,x1
	adcs	x17,x17,x2
	adc	x19,xzr,xzr

	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
// x14-x17 and x8-x11. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
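//
// __ecp_nistz256_add_to ends with a branch-free conditional reduction
// into [0,p). A C-style sketch of the pattern (illustrative only):
//
//	carry:ret = a + b;			// 5-limb sum
//	borrow    = (carry:ret) < p;		// trial subtraction of p
//	ret       = borrow ? ret : ret - p;	// csel on the flags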
.type	__ecp_nistz256_add_to,%function
.align	4
__ecp_nistz256_add_to:
	adds	x14,x14,x8		// ret = a+b
	adcs	x15,x15,x9
	adcs	x16,x16,x10
	adcs	x17,x17,x11
	adc	x1,xzr,xzr		// zap x1

	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x1,xzr		// did subtraction borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_add_to,.-__ecp_nistz256_add_to

.type	__ecp_nistz256_sub_from,%function
.align	4
__ecp_nistz256_sub_from:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x14,x8		// ret = a-b
	sbcs	x15,x15,x9
	sbcs	x16,x16,x10
	sbcs	x17,x17,x11
	sbc	x1,xzr,xzr		// zap x1

	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	4
__ecp_nistz256_sub_morf:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x8,x14		// ret = b-a
	sbcs	x15,x9,x15
	sbcs	x16,x10,x16
	sbcs	x17,x11,x17
	sbc	x1,xzr,xzr		// zap x1

	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = a+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adcs	x11,x17,x13
	adc	x1,xzr,xzr		// zap x1
	tst	x14,#1		// is a even?

	csel	x14,x14,x8,eq	// ret = even ? a : a+modulus
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	csel	x17,x17,x11,eq
	csel	x1,xzr,x1,eq

	lsr	x14,x14,#1		// ret >>= 1
	orr	x14,x14,x15,lsl#63
	lsr	x15,x15,#1
	orr	x15,x15,x16,lsl#63
	lsr	x16,x16,#1
	orr	x16,x16,x17,lsl#63
	lsr	x17,x17,#1
	stp	x14,x15,[x0]
	orr	x17,x17,x1,lsl#63
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
.globl	ecp_nistz256_point_double
.hidden	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

.Ldouble_shortcut:
	ldp	x14,x15,[x1,#32]
	mov	x21,x0
	ldp	x16,x17,[x1,#48]
	mov	x22,x1
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	mov	x8,x14
	ldr	x13,[x13,#24]
	mov	x9,x15
	ldp	x4,x5,[x22,#64]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[x22,#64+16]
	add	x0,sp,#0
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);

	add	x0,sp,#64
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	x8,x9,[x22]
	ldp	x10,x11,[x22,#16]
	mov	x4,x14		// put Zsqr aside for p256_sub
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);

	add	x2,x22,#0
	mov	x14,x4		// restore Zsqr
	mov	x15,x5
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x16,x6
	mov	x17,x7
	ldp	x6,x7,[sp,#0+16]
	add	x0,sp,#64
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	x0,sp,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[x22,#64]
	ldp	x6,x7,[x22,#64+16]
	add	x2,x22,#32
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#0+16]
	add	x0,x21,#64
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);

	add	x0,sp,#96
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	ldr	x3,[sp,#64]		// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x0,x21,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	x2,sp,#64
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	x8,x14		// duplicate M
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	mov	x4,x14		// put M aside
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add_to
	mov	x8,x4			// restore M
	mov	x9,x5
	ldr	x3,[x22]		// forward load for p256_mul_mont
	mov	x10,x6
	ldp	x4,x5,[sp,#0]
	mov	x11,x7
	ldp	x6,x7,[sp,#0+16]
	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);

	add	x2,x22,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#32]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#32+16]
	add	x0,sp,#96
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);

	add	x0,x21,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	x2,sp,#96
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	x2,sp,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	x3,[sp,#32]
	mov	x4,x14		// copy S
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x2,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	x2,x21,#32
	add	x0,x21,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
.globl	ecp_nistz256_point_add
.hidden	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#32*12

	ldp	x4,x5,[x2,#64]	// in2_z
	ldp	x6,x7,[x2,#64+16]
	mov	x21,x0
	mov	x22,x1
	mov	x23,x2
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x25,x8,x10
	cmp	x25,#0
	csetm	x25,ne		// ~in2infty
	add	x0,sp,#192
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	x4,x5,[x22,#64]	// in1_z
	ldp	x6,x7,[x22,#64+16]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x24,x8,x10
	cmp	x24,#0
	csetm	x24,ne		// ~in1infty
	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	x3,[x23,#64]
	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x2,x23,#64
	add	x0,sp,#320
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x22,#64
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[sp,#320]
	ldp	x6,x7,[sp,#320+16]
	add	x2,x22,#32
	add	x0,sp,#320
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#352]
	ldp	x6,x7,[sp,#352+16]
	add	x2,x23,#32
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	x2,sp,#320
	ldr	x3,[sp,#192]	// forward load for p256_mul_mont
	ldp	x4,x5,[x22]
	ldp	x6,x7,[x22,#16]
	add	x0,sp,#160
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);

	orr	x14,x14,x15	// see if result is zero
	orr	x16,x16,x17
	orr	x26,x14,x16	// ~is_equal(S1,S2)

	add	x2,sp,#192
	add	x0,sp,#256
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	x3,[sp,#128]
	ldp	x4,x5,[x23]
	ldp	x6,x7,[x23,#16]
	add	x2,sp,#128
	add	x0,sp,#288
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	x2,sp,#256
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#96
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);

	orr	x14,x14,x15	// see if result is zero
	orr	x16,x16,x17
	orr	x14,x14,x16	// ~is_equal(U1,U2)

	mvn	x27,x24	// -1/0 -> 0/-1
	mvn	x28,x25	// -1/0 -> 0/-1
	orr	x14,x14,x27
	orr	x14,x14,x28
	orr	x14,x14,x26
	cbnz	x14,.Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

.Ladd_double:
	mov	x1,x22
	mov	x0,x21
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	add	sp,sp,#256	// #256 is from #32*(12-4). difference in stack frames
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	add	x0,sp,#192
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#96]
	ldp	x6,x7,[sp,#96+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldp	x4,x5,[sp,#96]
	ldp	x6,x7,[sp,#96+16]
	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldr	x3,[x23,#64]
	ldp	x4,x5,[sp,#64]
	ldp	x6,x7,[sp,#64+16]
	add	x2,x23,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	x3,[sp,#96]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,sp,#96
	add	x0,sp,#224
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[sp,#128]
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x2,sp,#128
	add	x0,sp,#288
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#128
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#192
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#224
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#288
	ldr	x3,[sp,#224]		// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#320]
	ldp	x6,x7,[sp,#320+16]
	add	x0,sp,#32
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	x2,sp,#224
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#160
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#352
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	x4,x5,[sp,#0]		// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]		// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]
	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]
	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

.Ladd_done:
	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
.globl	ecp_nistz256_point_add_affine
.hidden	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	x21,x0
	mov	x22,x1
	mov	x23,x2
	adrp	x13,.Lpoly
	add	x13,x13,:lo12:.Lpoly
	ldr	x12,[x13,#8]
	ldr	x13,[x13,#24]

	ldp	x4,x5,[x1,#64]	// in1_z
	ldp	x6,x7,[x1,#64+16]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x24,x8,x10
	cmp	x24,#0
	csetm	x24,ne		// ~in1infty

	ldp	x14,x15,[x2]	// in2_x
	ldp	x16,x17,[x2,#16]
	ldp	x8,x9,[x2,#32]	// in2_y
	ldp	x10,x11,[x2,#48]
	orr	x14,x14,x15
	orr	x16,x16,x17
	orr	x8,x8,x9
	orr	x10,x10,x11
	orr	x14,x14,x16
	orr	x8,x8,x10
	orr	x25,x14,x8
	cmp	x25,#0
	csetm	x25,ne		// ~in2infty

	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	x4,x14
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	ldr	x3,[x23]
	add	x2,x23,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	x2,x22,#0
	ldr	x3,[x22,#64]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x0,sp,#160
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	x2,x22,#64
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#160]
	ldp	x6,x7,[sp,#160+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x23,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	x2,x22,#32
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#192
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	x0,sp,#224
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x0,sp,#288
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,sp,#160
	add	x0,sp,#256
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[x22]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,x22,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#224
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#288
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#256
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#96
	ldr	x3,[x22,#32]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x0,sp,#32
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	x2,x22,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	x3,[sp,#192]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#192
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#128
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	x4,x5,[sp,#0]		// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]		// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]
	adrp	x23,.Lone_mont-64
	add	x23,x23,:lo12:.Lone_mont-64
	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]
	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
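// Montgomery multiplication modulo the group order n (.Lord):
// res = a*b*2^-256 mod n. Unlike p, n has no special low limb, so each
// reduction multiplier is computed as m = acc[0]*.LordK mod 2^64.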
.globl	ecp_nistz256_ord_mul_mont
.hidden	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,%function
.align	4
ecp_nistz256_ord_mul_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	x23,.Lord
	add	x23,x23,:lo12:.Lord
	ldr	x3,[x2]		// bp[0]
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	ldp	x12,x13,[x23,#0]
	ldp	x21,x22,[x23,#16]
	ldr	x23,[x23,#32]

	mul	x14,x4,x3		// a[0]*b[0]
	umulh	x8,x4,x3

	mul	x15,x5,x3		// a[1]*b[0]
	umulh	x9,x5,x3

	mul	x16,x6,x3		// a[2]*b[0]
	umulh	x10,x6,x3

	mul	x17,x7,x3		// a[3]*b[0]
	umulh	x19,x7,x3

	mul	x24,x14,x23

	adds	x15,x15,x8		// accumulate high parts of multiplication
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adc	x19,x19,xzr
	mov	x20,xzr
	ldr	x3,[x2,#8*1]		// b[i]

	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	ldr	x3,[x2,#8*2]		// b[i]

	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	ldr	x3,[x2,#8*3]		// b[i]

	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	lsl	x8,x24,#32		// last reduction
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	subs	x8,x14,x12		// ret -= modulus
	sbcs	x9,x15,x13
	sbcs	x10,x16,x21
	sbcs	x11,x17,x22
	sbcs	xzr,x19,xzr

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t rep);
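// Squares the input rep times in a row: for a in the Montgomery domain
// this yields a^(2^rep) mod n, still in the Montgomery domain. Fixed
// squaring chains of this kind dominate constant-time scalar inversion.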
.globl	ecp_nistz256_ord_sqr_mont
.hidden	ecp_nistz256_ord_sqr_mont
.type	ecp_nistz256_ord_sqr_mont,%function
.align	4
ecp_nistz256_ord_sqr_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	x23,.Lord
	add	x23,x23,:lo12:.Lord
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	ldp	x12,x13,[x23,#0]
	ldp	x21,x22,[x23,#16]
	ldr	x23,[x23,#32]
	b	.Loop_ord_sqr

.align	4
.Loop_ord_sqr:
	sub	x2,x2,#1
	////////////////////////////////////////////////////////////////
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is acc[x], i.e. the
	//  accumulator registers
	//
	//  "can't overflow" below marks carrying into the high part of a
	//  multiplication result, which can't overflow because it
	//  can never be all ones.

	mul	x15,x5,x4		// a[1]*a[0]
	umulh	x9,x5,x4
	mul	x16,x6,x4		// a[2]*a[0]
	umulh	x10,x6,x4
	mul	x17,x7,x4		// a[3]*a[0]
	umulh	x19,x7,x4

	adds	x16,x16,x9		// accumulate high parts of multiplication
	mul	x8,x6,x5		// a[2]*a[1]
	umulh	x9,x6,x5
	adcs	x17,x17,x10
	mul	x10,x7,x5		// a[3]*a[1]
	umulh	x11,x7,x5
	adc	x19,x19,xzr		// can't overflow

	mul	x20,x7,x6		// a[3]*a[2]
	umulh	x1,x7,x6

	adds	x9,x9,x10		// accumulate high parts of multiplication
	mul	x14,x4,x4		// a[0]*a[0]
	adc	x10,x11,xzr		// can't overflow

	adds	x17,x17,x8		// accumulate low parts of multiplication
	umulh	x4,x4,x4
	adcs	x19,x19,x9
	mul	x9,x5,x5		// a[1]*a[1]
	adcs	x20,x20,x10
	umulh	x5,x5,x5
	adc	x1,x1,xzr		// can't overflow

	adds	x15,x15,x15	// acc[1-6]*=2
	mul	x10,x6,x6		// a[2]*a[2]
	adcs	x16,x16,x16
	umulh	x6,x6,x6
	adcs	x17,x17,x17
	mul	x11,x7,x7		// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x7,x7,x7
	adcs	x20,x20,x20
	adcs	x1,x1,x1
	adc	x3,xzr,xzr

	adds	x15,x15,x4		// +a[i]*a[i]
	mul	x24,x14,x23
	adcs	x16,x16,x9
	adcs	x17,x17,x5
	adcs	x19,x19,x10
	adcs	x20,x20,x6
	adcs	x1,x1,x11
	adc	x3,x3,x7
	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adc	x17,xzr,x24		// can't overflow
	mul	x11,x14,x23
	lsl	x8,x24,#32
	subs	x15,x15,x24
	lsr	x9,x24,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	subs	xzr,x14,#1
	umulh	x9,x12,x11
	mul	x10,x13,x11
	umulh	x24,x13,x11

	adcs	x10,x10,x9
	adc	x24,x24,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x24
	adcs	x16,x17,x11
	adc	x17,xzr,x11		// can't overflow
	mul	x24,x14,x23
	lsl	x8,x11,#32
	subs	x15,x15,x11
	lsr	x9,x11,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adc	x17,xzr,x24		// can't overflow
	mul	x11,x14,x23
	lsl	x8,x24,#32
	subs	x15,x15,x24
	lsr	x9,x24,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	subs	xzr,x14,#1
	umulh	x9,x12,x11
	mul	x10,x13,x11
	umulh	x24,x13,x11

	adcs	x10,x10,x9
	adc	x24,x24,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x24
	adcs	x16,x17,x11
	adc	x17,xzr,x11		// can't overflow
	lsl	x8,x11,#32
	subs	x15,x15,x11
	lsr	x9,x11,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	adds	x14,x14,x19	// accumulate upper half
	adcs	x15,x15,x20
	adcs	x16,x16,x1
	adcs	x17,x17,x3
	adc	x19,xzr,xzr

	subs	x8,x14,x12		// ret -= modulus
	sbcs	x9,x15,x13
	sbcs	x10,x16,x21
	sbcs	x11,x17,x22
	sbcs	xzr,x19,xzr

	csel	x4,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x5,x15,x9,lo
	csel	x6,x16,x10,lo
	csel	x7,x17,x11,lo

	cbnz	x2,.Loop_ord_sqr

	stp	x4,x5,[x0]
	stp	x6,x7,[x0,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
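// Constant-time lookup for the w=5 window: copies entry number index
// (1-based) out of a table of 16 projective points (96 bytes each).
// Every entry is read and masked so the memory access pattern is
// independent of index; index 0 matches nothing and leaves val all-zero.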
.globl	ecp_nistz256_select_w5
.hidden	ecp_nistz256_select_w5
.type	ecp_nistz256_select_w5,%function
.align	4
ecp_nistz256_select_w5:
	AARCH64_VALID_CALL_TARGET

    // x10 := x0
    // w9 := 0; loop counter and incremented internal index
	mov	x10, x0
	mov	w9, #0

    // [v16-v21] := 0
	movi	v16.16b, #0
	movi	v17.16b, #0
	movi	v18.16b, #0
	movi	v19.16b, #0
	movi	v20.16b, #0
	movi	v21.16b, #0

.Lselect_w5_loop:
    // Loop 16 times.

    // Increment index (loop counter); tested at the end of the loop
	add	w9, w9, #1

    // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
    //  and advance x1 to point to the next entry
	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

    // x11 := (w9 == w2)? All 1s : All 0s
	cmp	w9, w2
	csetm	x11, eq

    // continue loading ...
	ld1	{v26.2d, v27.2d}, [x1],#32

    // duplicate mask_64 into Mask (all 0s or all 1s)
	dup	v3.2d, x11

    // [v16-v21] := (Mask == all 1s)? [v22-v27] : [v16-v21]
    // i.e., values in output registers will remain the same if w9 != w2
	bit	v16.16b, v22.16b, v3.16b
	bit	v17.16b, v23.16b, v3.16b

	bit	v18.16b, v24.16b, v3.16b
	bit	v19.16b, v25.16b, v3.16b

	bit	v20.16b, v26.16b, v3.16b
	bit	v21.16b, v27.16b, v3.16b

    // If bit #4 is 0 (i.e. idx_ctr < 16) loop back
	tbz	w9, #4, .Lselect_w5_loop

    // Write [v16-v21] to memory at the output pointer
	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
	st1	{v20.2d, v21.2d}, [x10]

	ret
.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5


////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
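// Same constant-time pattern for the w=7 window: scans all 64 affine
// entries (64 bytes each) and keeps the one whose position matches
// index, again without any index-dependent memory access.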
.globl	ecp_nistz256_select_w7
.hidden	ecp_nistz256_select_w7
.type	ecp_nistz256_select_w7,%function
.align	4
ecp_nistz256_select_w7:
	AARCH64_VALID_CALL_TARGET

    // w9 := 0; loop counter and incremented internal index
	mov	w9, #0

    // [v16-v19] := 0
	movi	v16.16b, #0
	movi	v17.16b, #0
	movi	v18.16b, #0
	movi	v19.16b, #0

.Lselect_w7_loop:
    // Loop 64 times.

    // Increment index (loop counter); tested at the end of the loop
	add	w9, w9, #1

    // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
    //  and advance x1 to point to the next entry
	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

    // x11 := (w9 == w2)? All 1s : All 0s
	cmp	w9, w2
	csetm	x11, eq

    // duplicate mask_64 into Mask (all 0s or all 1s)
	dup	v3.2d, x11

    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
    // i.e., values in output registers will remain the same if w9 != w2
	bit	v16.16b, v22.16b, v3.16b
	bit	v17.16b, v23.16b, v3.16b

	bit	v18.16b, v24.16b, v3.16b
	bit	v19.16b, v25.16b, v3.16b

    // If bit #6 is 0 (i.e. idx_ctr < 64) loop back
	tbz	w9, #6, .Lselect_w7_loop

    // Write [v16-v19] to memory at the output pointer
	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x0]

	ret
.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)