1// This file is generated from a similarly-named Perl script in the BoringSSL 2// source tree. Do not edit by hand. 3 4#include <openssl/asm_base.h> 5 6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) 7#include "openssl/arm_arch.h" 8 9.text 10.globl beeu_mod_inverse_vartime 11 12 13.align 4 14beeu_mod_inverse_vartime: 15 // Reserve enough space for 14 8-byte registers on the stack 16 // in the first stp call for x29, x30. 17 // Then store the remaining callee-saved registers. 18 // 19 // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 | 20 // ^ ^ 21 // sp <------------------- 112 bytes ----------------> old sp 22 // x29 (FP) 23 // 24 AARCH64_SIGN_LINK_REGISTER 25 stp x29,x30,[sp,#-112]! 26 add x29,sp,#0 27 stp x19,x20,[sp,#16] 28 stp x21,x22,[sp,#32] 29 stp x23,x24,[sp,#48] 30 stp x25,x26,[sp,#64] 31 stp x27,x28,[sp,#80] 32 stp x0,x2,[sp,#96] 33 34 // B = b3..b0 := a 35 ldp x25,x26,[x1] 36 ldp x27,x28,[x1,#16] 37 38 // n3..n0 := n 39 // Note: the value of input params are changed in the following. 40 ldp x0,x1,[x2] 41 ldp x2,x30,[x2,#16] 42 43 // A = a3..a0 := n 44 mov x21, x0 45 mov x22, x1 46 mov x23, x2 47 mov x24, x30 48 49 // X = x4..x0 := 1 50 mov x3, #1 51 eor x4, x4, x4 52 eor x5, x5, x5 53 eor x6, x6, x6 54 eor x7, x7, x7 55 56 // Y = y4..y0 := 0 57 eor x8, x8, x8 58 eor x9, x9, x9 59 eor x10, x10, x10 60 eor x11, x11, x11 61 eor x12, x12, x12 62 63Lbeeu_loop: 64 // if B == 0, jump to .Lbeeu_loop_end 65 orr x14, x25, x26 66 orr x14, x14, x27 67 68 // reverse the bit order of x25. This is needed for clz after this macro 69 rbit x15, x25 70 71 orr x14, x14, x28 72 cbz x14,Lbeeu_loop_end 73 74 75 // 0 < B < |n|, 76 // 0 < A <= |n|, 77 // (1) X*a == B (mod |n|), 78 // (2) (-1)*Y*a == A (mod |n|) 79 80 // Now divide B by the maximum possible power of two in the 81 // integers, and divide X by the same value mod |n|. 82 // When we're done, (1) still holds. 83 84 // shift := number of trailing 0s in x25 85 // ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO) 86 clz x13, x15 87 88 // If there is no shift, goto shift_A_Y 89 cbz x13, Lbeeu_shift_A_Y 90 91 // Shift B right by "x13" bits 92 neg x14, x13 93 lsr x25, x25, x13 94 lsl x15, x26, x14 95 96 lsr x26, x26, x13 97 lsl x19, x27, x14 98 99 orr x25, x25, x15 100 101 lsr x27, x27, x13 102 lsl x20, x28, x14 103 104 orr x26, x26, x19 105 106 lsr x28, x28, x13 107 108 orr x27, x27, x20 109 110 111 // Shift X right by "x13" bits, adding n whenever X becomes odd. 112 // x13--; 113 // x14 := 0; needed in the addition to the most significant word in SHIFT1 114 eor x14, x14, x14 115Lbeeu_shift_loop_X: 116 tbz x3, #0, Lshift1_0 117 adds x3, x3, x0 118 adcs x4, x4, x1 119 adcs x5, x5, x2 120 adcs x6, x6, x30 121 adc x7, x7, x14 122Lshift1_0: 123 // var0 := [var1|var0]<64..1>; 124 // i.e. concatenate var1 and var0, 125 // extract bits <64..1> from the resulting 128-bit value 126 // and put them in var0 127 extr x3, x4, x3, #1 128 extr x4, x5, x4, #1 129 extr x5, x6, x5, #1 130 extr x6, x7, x6, #1 131 lsr x7, x7, #1 132 133 subs x13, x13, #1 134 bne Lbeeu_shift_loop_X 135 136 // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl 137 // with the following differences: 138 // - "x13" is set directly to the number of trailing 0s in B 139 // (using rbit and clz instructions) 140 // - The loop is only used to call SHIFT1(X) 141 // and x13 is decreased while executing the X loop. 142 // - SHIFT256(B, x13) is performed before right-shifting X; they are independent 143 144Lbeeu_shift_A_Y: 145 // Same for A and Y. 146 // Afterwards, (2) still holds. 147 // Reverse the bit order of x21 148 // x13 := number of trailing 0s in x21 (= number of leading 0s in x15) 149 rbit x15, x21 150 clz x13, x15 151 152 // If there is no shift, goto |B-A|, X+Y update 153 cbz x13, Lbeeu_update_B_X_or_A_Y 154 155 // Shift A right by "x13" bits 156 neg x14, x13 157 lsr x21, x21, x13 158 lsl x15, x22, x14 159 160 lsr x22, x22, x13 161 lsl x19, x23, x14 162 163 orr x21, x21, x15 164 165 lsr x23, x23, x13 166 lsl x20, x24, x14 167 168 orr x22, x22, x19 169 170 lsr x24, x24, x13 171 172 orr x23, x23, x20 173 174 175 // Shift Y right by "x13" bits, adding n whenever Y becomes odd. 176 // x13--; 177 // x14 := 0; needed in the addition to the most significant word in SHIFT1 178 eor x14, x14, x14 179Lbeeu_shift_loop_Y: 180 tbz x8, #0, Lshift1_1 181 adds x8, x8, x0 182 adcs x9, x9, x1 183 adcs x10, x10, x2 184 adcs x11, x11, x30 185 adc x12, x12, x14 186Lshift1_1: 187 // var0 := [var1|var0]<64..1>; 188 // i.e. concatenate var1 and var0, 189 // extract bits <64..1> from the resulting 128-bit value 190 // and put them in var0 191 extr x8, x9, x8, #1 192 extr x9, x10, x9, #1 193 extr x10, x11, x10, #1 194 extr x11, x12, x11, #1 195 lsr x12, x12, #1 196 197 subs x13, x13, #1 198 bne Lbeeu_shift_loop_Y 199 200Lbeeu_update_B_X_or_A_Y: 201 // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow) 202 // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words 203 // without taking a sign bit if generated. The lack of a carry would 204 // indicate a negative result. See, for example, 205 // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes 206 subs x14, x25, x21 207 sbcs x15, x26, x22 208 sbcs x19, x27, x23 209 sbcs x20, x28, x24 210 bcs Lbeeu_B_greater_than_A 211 212 // Else A > B => 213 // A := A - B; Y := Y + X; goto beginning of the loop 214 subs x21, x21, x25 215 sbcs x22, x22, x26 216 sbcs x23, x23, x27 217 sbcs x24, x24, x28 218 219 adds x8, x8, x3 220 adcs x9, x9, x4 221 adcs x10, x10, x5 222 adcs x11, x11, x6 223 adc x12, x12, x7 224 b Lbeeu_loop 225 226Lbeeu_B_greater_than_A: 227 // Continue with B > A => 228 // B := B - A; X := X + Y; goto beginning of the loop 229 mov x25, x14 230 mov x26, x15 231 mov x27, x19 232 mov x28, x20 233 234 adds x3, x3, x8 235 adcs x4, x4, x9 236 adcs x5, x5, x10 237 adcs x6, x6, x11 238 adc x7, x7, x12 239 b Lbeeu_loop 240 241Lbeeu_loop_end: 242 // The Euclid's algorithm loop ends when A == gcd(a,n); 243 // this would be 1, when a and n are co-prime (i.e. do not have a common factor). 244 // Since (-1)*Y*a == A (mod |n|), Y>0 245 // then out = -Y mod n 246 247 // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|) 248 // Is A-1 == 0? 249 // If not, fail. 250 sub x14, x21, #1 251 orr x14, x14, x22 252 orr x14, x14, x23 253 orr x14, x14, x24 254 cbnz x14, Lbeeu_err 255 256 // If Y>n ==> Y:=Y-n 257Lbeeu_reduction_loop: 258 // x_i := y_i - n_i (X is no longer needed, use it as temp) 259 // (x14 = 0 from above) 260 subs x3, x8, x0 261 sbcs x4, x9, x1 262 sbcs x5, x10, x2 263 sbcs x6, x11, x30 264 sbcs x7, x12, x14 265 266 // If result is non-negative (i.e., cs = carry set = no borrow), 267 // y_i := x_i; goto reduce again 268 // else 269 // y_i := y_i; continue 270 csel x8, x3, x8, cs 271 csel x9, x4, x9, cs 272 csel x10, x5, x10, cs 273 csel x11, x6, x11, cs 274 csel x12, x7, x12, cs 275 bcs Lbeeu_reduction_loop 276 277 // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0) 278 // out = -Y = n-Y 279 subs x8, x0, x8 280 sbcs x9, x1, x9 281 sbcs x10, x2, x10 282 sbcs x11, x30, x11 283 284 // Save Y in output (out (x0) was saved on the stack) 285 ldr x3, [sp,#96] 286 stp x8, x9, [x3] 287 stp x10, x11, [x3,#16] 288 // return 1 (success) 289 mov x0, #1 290 b Lbeeu_finish 291 292Lbeeu_err: 293 // return 0 (error) 294 eor x0, x0, x0 295 296Lbeeu_finish: 297 // Restore callee-saved registers, except x0, x2 298 add sp,x29,#0 299 ldp x19,x20,[sp,#16] 300 ldp x21,x22,[sp,#32] 301 ldp x23,x24,[sp,#48] 302 ldp x25,x26,[sp,#64] 303 ldp x27,x28,[sp,#80] 304 ldp x29,x30,[sp],#112 305 306 AARCH64_VALIDATE_LINK_REGISTER 307 ret 308 309#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) 310