xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/gen/bcm/p256_beeu-armv8-asm-win.S (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
7#include "openssl/arm_arch.h"
8
9.text
10.globl	beeu_mod_inverse_vartime
11
12
13.align	4
14beeu_mod_inverse_vartime:
15    // Reserve enough space for 14 8-byte registers on the stack
16    // in the first stp call for x29, x30.
17    // Then store the remaining callee-saved registers.
18    //
19    //    | x29 | x30 | x19 | x20 | ... | x27 | x28 |  x0 |  x2 |
20    //    ^                                                     ^
21    //    sp  <------------------- 112 bytes ----------------> old sp
22    //   x29 (FP)
23    //
24	AARCH64_SIGN_LINK_REGISTER
25	stp	x29,x30,[sp,#-112]!
26	add	x29,sp,#0
27	stp	x19,x20,[sp,#16]
28	stp	x21,x22,[sp,#32]
29	stp	x23,x24,[sp,#48]
30	stp	x25,x26,[sp,#64]
31	stp	x27,x28,[sp,#80]
32	stp	x0,x2,[sp,#96]
33
34    // B = b3..b0 := a
35	ldp	x25,x26,[x1]
36	ldp	x27,x28,[x1,#16]
37
38    // n3..n0 := n
39    // Note: the value of input params are changed in the following.
40	ldp	x0,x1,[x2]
41	ldp	x2,x30,[x2,#16]
42
43    // A = a3..a0 := n
44	mov	x21, x0
45	mov	x22, x1
46	mov	x23, x2
47	mov	x24, x30
48
49    // X = x4..x0 := 1
50	mov	x3, #1
51	eor	x4, x4, x4
52	eor	x5, x5, x5
53	eor	x6, x6, x6
54	eor	x7, x7, x7
55
56    // Y = y4..y0 := 0
57	eor	x8, x8, x8
58	eor	x9, x9, x9
59	eor	x10, x10, x10
60	eor	x11, x11, x11
61	eor	x12, x12, x12
62
63Lbeeu_loop:
64    // if B == 0, jump to .Lbeeu_loop_end
65	orr	x14, x25, x26
66	orr	x14, x14, x27
67
68    // reverse the bit order of x25. This is needed for clz after this macro
69	rbit	x15, x25
70
71	orr	x14, x14, x28
72	cbz	x14,Lbeeu_loop_end
73
74
75    // 0 < B < |n|,
76    // 0 < A <= |n|,
77    // (1)      X*a  ==  B   (mod |n|),
78    // (2) (-1)*Y*a  ==  A   (mod |n|)
79
80    // Now divide B by the maximum possible power of two in the
81    // integers, and divide X by the same value mod |n|.
82    // When we're done, (1) still holds.
83
84    // shift := number of trailing 0s in x25
85    // (      = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
86	clz	x13, x15
87
88    // If there is no shift, goto shift_A_Y
89	cbz	x13, Lbeeu_shift_A_Y
90
91    // Shift B right by "x13" bits
92	neg	x14, x13
93	lsr	x25, x25, x13
94	lsl	x15, x26, x14
95
96	lsr	x26, x26, x13
97	lsl	x19, x27, x14
98
99	orr	x25, x25, x15
100
101	lsr	x27, x27, x13
102	lsl	x20, x28, x14
103
104	orr	x26, x26, x19
105
106	lsr	x28, x28, x13
107
108	orr	x27, x27, x20
109
110
111    // Shift X right by "x13" bits, adding n whenever X becomes odd.
112    // x13--;
113    // x14 := 0; needed in the addition to the most significant word in SHIFT1
114	eor	x14, x14, x14
115Lbeeu_shift_loop_X:
116	tbz	x3, #0, Lshift1_0
117	adds	x3, x3, x0
118	adcs	x4, x4, x1
119	adcs	x5, x5, x2
120	adcs	x6, x6, x30
121	adc	x7, x7, x14
122Lshift1_0:
123    // var0 := [var1|var0]<64..1>;
124    // i.e. concatenate var1 and var0,
125    //      extract bits <64..1> from the resulting 128-bit value
126    //      and put them in var0
127	extr	x3, x4, x3, #1
128	extr	x4, x5, x4, #1
129	extr	x5, x6, x5, #1
130	extr	x6, x7, x6, #1
131	lsr	x7, x7, #1
132
133	subs	x13, x13, #1
134	bne	Lbeeu_shift_loop_X
135
136    // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
137    // with the following differences:
138    // - "x13" is set directly to the number of trailing 0s in B
139    //   (using rbit and clz instructions)
140    // - The loop is only used to call SHIFT1(X)
141    //   and x13 is decreased while executing the X loop.
142    // - SHIFT256(B, x13) is performed before right-shifting X; they are independent
143
144Lbeeu_shift_A_Y:
145    // Same for A and Y.
146    // Afterwards, (2) still holds.
147    // Reverse the bit order of x21
148    // x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
149	rbit	x15, x21
150	clz	x13, x15
151
152    // If there is no shift, goto |B-A|, X+Y update
153	cbz	x13, Lbeeu_update_B_X_or_A_Y
154
155    // Shift A right by "x13" bits
156	neg	x14, x13
157	lsr	x21, x21, x13
158	lsl	x15, x22, x14
159
160	lsr	x22, x22, x13
161	lsl	x19, x23, x14
162
163	orr	x21, x21, x15
164
165	lsr	x23, x23, x13
166	lsl	x20, x24, x14
167
168	orr	x22, x22, x19
169
170	lsr	x24, x24, x13
171
172	orr	x23, x23, x20
173
174
175    // Shift Y right by "x13" bits, adding n whenever Y becomes odd.
176    // x13--;
177    // x14 := 0; needed in the addition to the most significant word in SHIFT1
178	eor	x14, x14, x14
179Lbeeu_shift_loop_Y:
180	tbz	x8, #0, Lshift1_1
181	adds	x8, x8, x0
182	adcs	x9, x9, x1
183	adcs	x10, x10, x2
184	adcs	x11, x11, x30
185	adc	x12, x12, x14
186Lshift1_1:
187    // var0 := [var1|var0]<64..1>;
188    // i.e. concatenate var1 and var0,
189    //      extract bits <64..1> from the resulting 128-bit value
190    //      and put them in var0
191	extr	x8, x9, x8, #1
192	extr	x9, x10, x9, #1
193	extr	x10, x11, x10, #1
194	extr	x11, x12, x11, #1
195	lsr	x12, x12, #1
196
197	subs	x13, x13, #1
198	bne	Lbeeu_shift_loop_Y
199
200Lbeeu_update_B_X_or_A_Y:
201    // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
202    // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
203    //       without taking a sign bit if generated. The lack of a carry would
204    //       indicate a negative result. See, for example,
205    //       https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
206	subs	x14, x25, x21
207	sbcs	x15, x26, x22
208	sbcs	x19, x27, x23
209	sbcs	x20, x28, x24
210	bcs	Lbeeu_B_greater_than_A
211
212    // Else A > B =>
213    // A := A - B; Y := Y + X; goto beginning of the loop
214	subs	x21, x21, x25
215	sbcs	x22, x22, x26
216	sbcs	x23, x23, x27
217	sbcs	x24, x24, x28
218
219	adds	x8, x8, x3
220	adcs	x9, x9, x4
221	adcs	x10, x10, x5
222	adcs	x11, x11, x6
223	adc	x12, x12, x7
224	b	Lbeeu_loop
225
226Lbeeu_B_greater_than_A:
227    // Continue with B > A =>
228    // B := B - A; X := X + Y; goto beginning of the loop
229	mov	x25, x14
230	mov	x26, x15
231	mov	x27, x19
232	mov	x28, x20
233
234	adds	x3, x3, x8
235	adcs	x4, x4, x9
236	adcs	x5, x5, x10
237	adcs	x6, x6, x11
238	adc	x7, x7, x12
239	b	Lbeeu_loop
240
241Lbeeu_loop_end:
242    // The Euclid's algorithm loop ends when A == gcd(a,n);
243    // this would be 1, when a and n are co-prime (i.e. do not have a common factor).
244    // Since (-1)*Y*a == A (mod |n|), Y>0
245    // then out = -Y mod n
246
247    // Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
248    // Is A-1 == 0?
249    // If not, fail.
250	sub	x14, x21, #1
251	orr	x14, x14, x22
252	orr	x14, x14, x23
253	orr	x14, x14, x24
254	cbnz	x14, Lbeeu_err
255
256    // If Y>n ==> Y:=Y-n
257Lbeeu_reduction_loop:
258    // x_i := y_i - n_i (X is no longer needed, use it as temp)
259    // (x14 = 0 from above)
260	subs	x3, x8, x0
261	sbcs	x4, x9, x1
262	sbcs	x5, x10, x2
263	sbcs	x6, x11, x30
264	sbcs	x7, x12, x14
265
266    // If result is non-negative (i.e., cs = carry set = no borrow),
267    // y_i := x_i; goto reduce again
268    // else
269    // y_i := y_i; continue
270	csel	x8, x3, x8, cs
271	csel	x9, x4, x9, cs
272	csel	x10, x5, x10, cs
273	csel	x11, x6, x11, cs
274	csel	x12, x7, x12, cs
275	bcs	Lbeeu_reduction_loop
276
277    // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
278    // out = -Y = n-Y
279	subs	x8, x0, x8
280	sbcs	x9, x1, x9
281	sbcs	x10, x2, x10
282	sbcs	x11, x30, x11
283
284    // Save Y in output (out (x0) was saved on the stack)
285	ldr	x3, [sp,#96]
286	stp	x8, x9, [x3]
287	stp	x10, x11, [x3,#16]
288    // return 1 (success)
289	mov	x0, #1
290	b	Lbeeu_finish
291
292Lbeeu_err:
293    // return 0 (error)
294	eor	x0, x0, x0
295
296Lbeeu_finish:
297    // Restore callee-saved registers, except x0, x2
298	add	sp,x29,#0
299	ldp	x19,x20,[sp,#16]
300	ldp	x21,x22,[sp,#32]
301	ldp	x23,x24,[sp,#48]
302	ldp	x25,x26,[sp,#64]
303	ldp	x27,x28,[sp,#80]
304	ldp	x29,x30,[sp],#112
305
306	AARCH64_VALIDATE_LINK_REGISTER
307	ret
308
309#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
310