xref: /aosp_15_r20/external/boringssl/src/crypto/fipsmodule/ec/p256.c (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
1 /* Copyright (c) 2020, Google Inc.
2  *
3  * Permission to use, copy, modify, and/or distribute this software for any
4  * purpose with or without fee is hereby granted, provided that the above
5  * copyright notice and this permission notice appear in all copies.
6  *
7  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10  * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14 
15 // An implementation of the NIST P-256 elliptic curve point multiplication.
16 // 256-bit Montgomery form for 64 and 32-bit. Field operations are generated by
17 // Fiat, which lives in //third_party/fiat.
18 
19 #include <openssl/base.h>
20 
21 #include <openssl/bn.h>
22 #include <openssl/ec.h>
23 #include <openssl/err.h>
24 #include <openssl/mem.h>
25 
26 #include <assert.h>
27 #include <string.h>
28 
29 #include "../../internal.h"
30 #include "../delocate.h"
31 #include "./internal.h"
32 
33 #if defined(BORINGSSL_HAS_UINT128)
34 #include "../../../third_party/fiat/p256_64.h"
35 #elif defined(OPENSSL_64_BIT)
36 #include "../../../third_party/fiat/p256_64_msvc.h"
37 #else
38 #include "../../../third_party/fiat/p256_32.h"
39 #endif
40 
41 
42 // utility functions, handwritten
43 
44 #if defined(OPENSSL_64_BIT)
45 #define FIAT_P256_NLIMBS 4
46 typedef uint64_t fiat_p256_limb_t;
47 typedef uint64_t fiat_p256_felem[FIAT_P256_NLIMBS];
48 static const fiat_p256_felem fiat_p256_one = {0x1, 0xffffffff00000000,
49                                               0xffffffffffffffff, 0xfffffffe};
50 #else  // 64BIT; else 32BIT
51 #define FIAT_P256_NLIMBS 8
52 typedef uint32_t fiat_p256_limb_t;
53 typedef uint32_t fiat_p256_felem[FIAT_P256_NLIMBS];
54 static const fiat_p256_felem fiat_p256_one = {
55     0x1, 0x0, 0x0, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0x0};
56 #endif  // 64BIT
57 
58 
fiat_p256_nz(const fiat_p256_limb_t in1[FIAT_P256_NLIMBS])59 static fiat_p256_limb_t fiat_p256_nz(
60     const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) {
61   fiat_p256_limb_t ret;
62   fiat_p256_nonzero(&ret, in1);
63   return ret;
64 }
65 
fiat_p256_copy(fiat_p256_limb_t out[FIAT_P256_NLIMBS],const fiat_p256_limb_t in1[FIAT_P256_NLIMBS])66 static void fiat_p256_copy(fiat_p256_limb_t out[FIAT_P256_NLIMBS],
67                            const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) {
68   for (size_t i = 0; i < FIAT_P256_NLIMBS; i++) {
69     out[i] = in1[i];
70   }
71 }
72 
fiat_p256_cmovznz(fiat_p256_limb_t out[FIAT_P256_NLIMBS],fiat_p256_limb_t t,const fiat_p256_limb_t z[FIAT_P256_NLIMBS],const fiat_p256_limb_t nz[FIAT_P256_NLIMBS])73 static void fiat_p256_cmovznz(fiat_p256_limb_t out[FIAT_P256_NLIMBS],
74                               fiat_p256_limb_t t,
75                               const fiat_p256_limb_t z[FIAT_P256_NLIMBS],
76                               const fiat_p256_limb_t nz[FIAT_P256_NLIMBS]) {
77   fiat_p256_selectznz(out, !!t, z, nz);
78 }
79 
fiat_p256_from_words(fiat_p256_felem out,const BN_ULONG in[32/sizeof (BN_ULONG)])80 static void fiat_p256_from_words(fiat_p256_felem out,
81                                  const BN_ULONG in[32 / sizeof(BN_ULONG)]) {
82   // Typically, |BN_ULONG| and |fiat_p256_limb_t| will be the same type, but on
83   // 64-bit platforms without |uint128_t|, they are different. However, on
84   // little-endian systems, |uint64_t[4]| and |uint32_t[8]| have the same
85   // layout.
86   OPENSSL_memcpy(out, in, 32);
87 }
88 
fiat_p256_from_generic(fiat_p256_felem out,const EC_FELEM * in)89 static void fiat_p256_from_generic(fiat_p256_felem out, const EC_FELEM *in) {
90   fiat_p256_from_words(out, in->words);
91 }
92 
fiat_p256_to_generic(EC_FELEM * out,const fiat_p256_felem in)93 static void fiat_p256_to_generic(EC_FELEM *out, const fiat_p256_felem in) {
94   // See |fiat_p256_from_words|.
95   OPENSSL_memcpy(out->words, in, 32);
96 }
97 
98 // fiat_p256_inv_square calculates |out| = |in|^{-2}
99 //
100 // Based on Fermat's Little Theorem:
101 //   a^p = a (mod p)
102 //   a^{p-1} = 1 (mod p)
103 //   a^{p-3} = a^{-2} (mod p)
fiat_p256_inv_square(fiat_p256_felem out,const fiat_p256_felem in)104 static void fiat_p256_inv_square(fiat_p256_felem out,
105                                  const fiat_p256_felem in) {
106   // This implements the addition chain described in
107   // https://briansmith.org/ecc-inversion-addition-chains-01#p256_field_inversion
108   fiat_p256_felem x2, x3, x6, x12, x15, x30, x32;
109   fiat_p256_square(x2, in);   // 2^2 - 2^1
110   fiat_p256_mul(x2, x2, in);  // 2^2 - 2^0
111 
112   fiat_p256_square(x3, x2);   // 2^3 - 2^1
113   fiat_p256_mul(x3, x3, in);  // 2^3 - 2^0
114 
115   fiat_p256_square(x6, x3);
116   for (int i = 1; i < 3; i++) {
117     fiat_p256_square(x6, x6);
118   }                           // 2^6 - 2^3
119   fiat_p256_mul(x6, x6, x3);  // 2^6 - 2^0
120 
121   fiat_p256_square(x12, x6);
122   for (int i = 1; i < 6; i++) {
123     fiat_p256_square(x12, x12);
124   }                             // 2^12 - 2^6
125   fiat_p256_mul(x12, x12, x6);  // 2^12 - 2^0
126 
127   fiat_p256_square(x15, x12);
128   for (int i = 1; i < 3; i++) {
129     fiat_p256_square(x15, x15);
130   }                             // 2^15 - 2^3
131   fiat_p256_mul(x15, x15, x3);  // 2^15 - 2^0
132 
133   fiat_p256_square(x30, x15);
134   for (int i = 1; i < 15; i++) {
135     fiat_p256_square(x30, x30);
136   }                              // 2^30 - 2^15
137   fiat_p256_mul(x30, x30, x15);  // 2^30 - 2^0
138 
139   fiat_p256_square(x32, x30);
140   fiat_p256_square(x32, x32);   // 2^32 - 2^2
141   fiat_p256_mul(x32, x32, x2);  // 2^32 - 2^0
142 
143   fiat_p256_felem ret;
144   fiat_p256_square(ret, x32);
145   for (int i = 1; i < 31 + 1; i++) {
146     fiat_p256_square(ret, ret);
147   }                             // 2^64 - 2^32
148   fiat_p256_mul(ret, ret, in);  // 2^64 - 2^32 + 2^0
149 
150   for (int i = 0; i < 96 + 32; i++) {
151     fiat_p256_square(ret, ret);
152   }                              // 2^192 - 2^160 + 2^128
153   fiat_p256_mul(ret, ret, x32);  // 2^192 - 2^160 + 2^128 + 2^32 - 2^0
154 
155   for (int i = 0; i < 32; i++) {
156     fiat_p256_square(ret, ret);
157   }                              // 2^224 - 2^192 + 2^160 + 2^64 - 2^32
158   fiat_p256_mul(ret, ret, x32);  // 2^224 - 2^192 + 2^160 + 2^64 - 2^0
159 
160   for (int i = 0; i < 30; i++) {
161     fiat_p256_square(ret, ret);
162   }                              // 2^254 - 2^222 + 2^190 + 2^94 - 2^30
163   fiat_p256_mul(ret, ret, x30);  // 2^254 - 2^222 + 2^190 + 2^94 - 2^0
164 
165   fiat_p256_square(ret, ret);
166   fiat_p256_square(out, ret);  // 2^256 - 2^224 + 2^192 + 2^96 - 2^2
167 }
168 
169 // Group operations
170 // ----------------
171 //
172 // Building on top of the field operations we have the operations on the
173 // elliptic curve group itself. Points on the curve are represented in Jacobian
174 // coordinates.
175 //
176 // Both operations were transcribed to Coq and proven to correspond to naive
177 // implementations using Affine coordinates, for all suitable fields.  In the
178 // Coq proofs, issues of constant-time execution and memory layout (aliasing)
179 // conventions were not considered. Specification of affine coordinates:
180 // <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Spec/WeierstrassCurve.v#L28>
181 // As a sanity check, a proof that these points form a commutative group:
182 // <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/AffineProofs.v#L33>
183 
184 // fiat_p256_point_double calculates 2*(x_in, y_in, z_in)
185 //
186 // The method is taken from:
187 //   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
188 //
189 // Coq transcription and correctness proof:
190 // <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L93>
191 // <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L201>
192 //
193 // Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
194 // while x_out == y_in is not (maybe this works, but it's not tested).
fiat_p256_point_double(fiat_p256_felem x_out,fiat_p256_felem y_out,fiat_p256_felem z_out,const fiat_p256_felem x_in,const fiat_p256_felem y_in,const fiat_p256_felem z_in)195 static void fiat_p256_point_double(fiat_p256_felem x_out, fiat_p256_felem y_out,
196                                    fiat_p256_felem z_out,
197                                    const fiat_p256_felem x_in,
198                                    const fiat_p256_felem y_in,
199                                    const fiat_p256_felem z_in) {
200   fiat_p256_felem delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta;
201   // delta = z^2
202   fiat_p256_square(delta, z_in);
203   // gamma = y^2
204   fiat_p256_square(gamma, y_in);
205   // beta = x*gamma
206   fiat_p256_mul(beta, x_in, gamma);
207 
208   // alpha = 3*(x-delta)*(x+delta)
209   fiat_p256_sub(ftmp, x_in, delta);
210   fiat_p256_add(ftmp2, x_in, delta);
211 
212   fiat_p256_add(tmptmp, ftmp2, ftmp2);
213   fiat_p256_add(ftmp2, ftmp2, tmptmp);
214   fiat_p256_mul(alpha, ftmp, ftmp2);
215 
216   // x' = alpha^2 - 8*beta
217   fiat_p256_square(x_out, alpha);
218   fiat_p256_add(fourbeta, beta, beta);
219   fiat_p256_add(fourbeta, fourbeta, fourbeta);
220   fiat_p256_add(tmptmp, fourbeta, fourbeta);
221   fiat_p256_sub(x_out, x_out, tmptmp);
222 
223   // z' = (y + z)^2 - gamma - delta
224   fiat_p256_add(delta, gamma, delta);
225   fiat_p256_add(ftmp, y_in, z_in);
226   fiat_p256_square(z_out, ftmp);
227   fiat_p256_sub(z_out, z_out, delta);
228 
229   // y' = alpha*(4*beta - x') - 8*gamma^2
230   fiat_p256_sub(y_out, fourbeta, x_out);
231   fiat_p256_add(gamma, gamma, gamma);
232   fiat_p256_square(gamma, gamma);
233   fiat_p256_mul(y_out, alpha, y_out);
234   fiat_p256_add(gamma, gamma, gamma);
235   fiat_p256_sub(y_out, y_out, gamma);
236 }
237 
238 // fiat_p256_point_add calculates (x1, y1, z1) + (x2, y2, z2)
239 //
240 // The method is taken from:
241 //   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
242 // adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
243 //
244 // Coq transcription and correctness proof:
245 // <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L135>
246 // <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L205>
247 //
248 // This function includes a branch for checking whether the two input points
249 // are equal, (while not equal to the point at infinity). This case never
250 // happens during single point multiplication, so there is no timing leak for
251 // ECDH or ECDSA signing.
fiat_p256_point_add(fiat_p256_felem x3,fiat_p256_felem y3,fiat_p256_felem z3,const fiat_p256_felem x1,const fiat_p256_felem y1,const fiat_p256_felem z1,const int mixed,const fiat_p256_felem x2,const fiat_p256_felem y2,const fiat_p256_felem z2)252 static void fiat_p256_point_add(fiat_p256_felem x3, fiat_p256_felem y3,
253                                 fiat_p256_felem z3, const fiat_p256_felem x1,
254                                 const fiat_p256_felem y1,
255                                 const fiat_p256_felem z1, const int mixed,
256                                 const fiat_p256_felem x2,
257                                 const fiat_p256_felem y2,
258                                 const fiat_p256_felem z2) {
259   fiat_p256_felem x_out, y_out, z_out;
260   fiat_p256_limb_t z1nz = fiat_p256_nz(z1);
261   fiat_p256_limb_t z2nz = fiat_p256_nz(z2);
262 
263   // z1z1 = z1z1 = z1**2
264   fiat_p256_felem z1z1;
265   fiat_p256_square(z1z1, z1);
266 
267   fiat_p256_felem u1, s1, two_z1z2;
268   if (!mixed) {
269     // z2z2 = z2**2
270     fiat_p256_felem z2z2;
271     fiat_p256_square(z2z2, z2);
272 
273     // u1 = x1*z2z2
274     fiat_p256_mul(u1, x1, z2z2);
275 
276     // two_z1z2 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2
277     fiat_p256_add(two_z1z2, z1, z2);
278     fiat_p256_square(two_z1z2, two_z1z2);
279     fiat_p256_sub(two_z1z2, two_z1z2, z1z1);
280     fiat_p256_sub(two_z1z2, two_z1z2, z2z2);
281 
282     // s1 = y1 * z2**3
283     fiat_p256_mul(s1, z2, z2z2);
284     fiat_p256_mul(s1, s1, y1);
285   } else {
286     // We'll assume z2 = 1 (special case z2 = 0 is handled later).
287 
288     // u1 = x1*z2z2
289     fiat_p256_copy(u1, x1);
290     // two_z1z2 = 2z1z2
291     fiat_p256_add(two_z1z2, z1, z1);
292     // s1 = y1 * z2**3
293     fiat_p256_copy(s1, y1);
294   }
295 
296   // u2 = x2*z1z1
297   fiat_p256_felem u2;
298   fiat_p256_mul(u2, x2, z1z1);
299 
300   // h = u2 - u1
301   fiat_p256_felem h;
302   fiat_p256_sub(h, u2, u1);
303 
304   fiat_p256_limb_t xneq = fiat_p256_nz(h);
305 
306   // z_out = two_z1z2 * h
307   fiat_p256_mul(z_out, h, two_z1z2);
308 
309   // z1z1z1 = z1 * z1z1
310   fiat_p256_felem z1z1z1;
311   fiat_p256_mul(z1z1z1, z1, z1z1);
312 
313   // s2 = y2 * z1**3
314   fiat_p256_felem s2;
315   fiat_p256_mul(s2, y2, z1z1z1);
316 
317   // r = (s2 - s1)*2
318   fiat_p256_felem r;
319   fiat_p256_sub(r, s2, s1);
320   fiat_p256_add(r, r, r);
321 
322   fiat_p256_limb_t yneq = fiat_p256_nz(r);
323 
324   fiat_p256_limb_t is_nontrivial_double = constant_time_is_zero_w(xneq | yneq) &
325                                           ~constant_time_is_zero_w(z1nz) &
326                                           ~constant_time_is_zero_w(z2nz);
327   if (constant_time_declassify_w(is_nontrivial_double)) {
328     fiat_p256_point_double(x3, y3, z3, x1, y1, z1);
329     return;
330   }
331 
332   // I = (2h)**2
333   fiat_p256_felem i;
334   fiat_p256_add(i, h, h);
335   fiat_p256_square(i, i);
336 
337   // J = h * I
338   fiat_p256_felem j;
339   fiat_p256_mul(j, h, i);
340 
341   // V = U1 * I
342   fiat_p256_felem v;
343   fiat_p256_mul(v, u1, i);
344 
345   // x_out = r**2 - J - 2V
346   fiat_p256_square(x_out, r);
347   fiat_p256_sub(x_out, x_out, j);
348   fiat_p256_sub(x_out, x_out, v);
349   fiat_p256_sub(x_out, x_out, v);
350 
351   // y_out = r(V-x_out) - 2 * s1 * J
352   fiat_p256_sub(y_out, v, x_out);
353   fiat_p256_mul(y_out, y_out, r);
354   fiat_p256_felem s1j;
355   fiat_p256_mul(s1j, s1, j);
356   fiat_p256_sub(y_out, y_out, s1j);
357   fiat_p256_sub(y_out, y_out, s1j);
358 
359   fiat_p256_cmovznz(x_out, z1nz, x2, x_out);
360   fiat_p256_cmovznz(x3, z2nz, x1, x_out);
361   fiat_p256_cmovznz(y_out, z1nz, y2, y_out);
362   fiat_p256_cmovznz(y3, z2nz, y1, y_out);
363   fiat_p256_cmovznz(z_out, z1nz, z2, z_out);
364   fiat_p256_cmovznz(z3, z2nz, z1, z_out);
365 }
366 
367 #include "./p256_table.h"
368 
369 // fiat_p256_select_point_affine selects the |idx-1|th point from a
370 // precomputation table and copies it to out. If |idx| is zero, the output is
371 // the point at infinity.
fiat_p256_select_point_affine(const fiat_p256_limb_t idx,size_t size,const fiat_p256_felem pre_comp[][2],fiat_p256_felem out[3])372 static void fiat_p256_select_point_affine(
373     const fiat_p256_limb_t idx, size_t size,
374     const fiat_p256_felem pre_comp[/*size*/][2], fiat_p256_felem out[3]) {
375   OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3);
376   for (size_t i = 0; i < size; i++) {
377     fiat_p256_limb_t mismatch = i ^ (idx - 1);
378     fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]);
379     fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]);
380   }
381   fiat_p256_cmovznz(out[2], idx, out[2], fiat_p256_one);
382 }
383 
384 // fiat_p256_select_point selects the |idx|th point from a precomputation table
385 // and copies it to out.
fiat_p256_select_point(const fiat_p256_limb_t idx,size_t size,const fiat_p256_felem pre_comp[][3],fiat_p256_felem out[3])386 static void fiat_p256_select_point(const fiat_p256_limb_t idx, size_t size,
387                                    const fiat_p256_felem pre_comp[/*size*/][3],
388                                    fiat_p256_felem out[3]) {
389   OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3);
390   for (size_t i = 0; i < size; i++) {
391     fiat_p256_limb_t mismatch = i ^ idx;
392     fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]);
393     fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]);
394     fiat_p256_cmovznz(out[2], mismatch, pre_comp[i][2], out[2]);
395   }
396 }
397 
398 // fiat_p256_get_bit returns the |i|th bit in |in|.
fiat_p256_get_bit(const EC_SCALAR * in,int i)399 static crypto_word_t fiat_p256_get_bit(const EC_SCALAR *in, int i) {
400   if (i < 0 || i >= 256) {
401     return 0;
402   }
403 #if defined(OPENSSL_64_BIT)
404   static_assert(sizeof(BN_ULONG) == 8, "BN_ULONG was not 64-bit");
405   return (in->words[i >> 6] >> (i & 63)) & 1;
406 #else
407   static_assert(sizeof(BN_ULONG) == 4, "BN_ULONG was not 32-bit");
408   return (in->words[i >> 5] >> (i & 31)) & 1;
409 #endif
410 }
411 
412 // OPENSSL EC_METHOD FUNCTIONS
413 
414 // Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
415 // (X/Z^2, Y/Z^3).
ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP * group,const EC_JACOBIAN * point,EC_FELEM * x_out,EC_FELEM * y_out)416 static int ec_GFp_nistp256_point_get_affine_coordinates(
417     const EC_GROUP *group, const EC_JACOBIAN *point, EC_FELEM *x_out,
418     EC_FELEM *y_out) {
419   if (constant_time_declassify_int(
420           ec_GFp_simple_is_at_infinity(group, point))) {
421     OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
422     return 0;
423   }
424 
425   fiat_p256_felem z1, z2;
426   fiat_p256_from_generic(z1, &point->Z);
427   fiat_p256_inv_square(z2, z1);
428 
429   if (x_out != NULL) {
430     fiat_p256_felem x;
431     fiat_p256_from_generic(x, &point->X);
432     fiat_p256_mul(x, x, z2);
433     fiat_p256_to_generic(x_out, x);
434   }
435 
436   if (y_out != NULL) {
437     fiat_p256_felem y;
438     fiat_p256_from_generic(y, &point->Y);
439     fiat_p256_square(z2, z2);  // z^-4
440     fiat_p256_mul(y, y, z1);   // y * z
441     fiat_p256_mul(y, y, z2);   // y * z^-3
442     fiat_p256_to_generic(y_out, y);
443   }
444 
445   return 1;
446 }
447 
ec_GFp_nistp256_add(const EC_GROUP * group,EC_JACOBIAN * r,const EC_JACOBIAN * a,const EC_JACOBIAN * b)448 static void ec_GFp_nistp256_add(const EC_GROUP *group, EC_JACOBIAN *r,
449                                 const EC_JACOBIAN *a, const EC_JACOBIAN *b) {
450   fiat_p256_felem x1, y1, z1, x2, y2, z2;
451   fiat_p256_from_generic(x1, &a->X);
452   fiat_p256_from_generic(y1, &a->Y);
453   fiat_p256_from_generic(z1, &a->Z);
454   fiat_p256_from_generic(x2, &b->X);
455   fiat_p256_from_generic(y2, &b->Y);
456   fiat_p256_from_generic(z2, &b->Z);
457   fiat_p256_point_add(x1, y1, z1, x1, y1, z1, 0 /* both Jacobian */, x2, y2,
458                       z2);
459   fiat_p256_to_generic(&r->X, x1);
460   fiat_p256_to_generic(&r->Y, y1);
461   fiat_p256_to_generic(&r->Z, z1);
462 }
463 
ec_GFp_nistp256_dbl(const EC_GROUP * group,EC_JACOBIAN * r,const EC_JACOBIAN * a)464 static void ec_GFp_nistp256_dbl(const EC_GROUP *group, EC_JACOBIAN *r,
465                                 const EC_JACOBIAN *a) {
466   fiat_p256_felem x, y, z;
467   fiat_p256_from_generic(x, &a->X);
468   fiat_p256_from_generic(y, &a->Y);
469   fiat_p256_from_generic(z, &a->Z);
470   fiat_p256_point_double(x, y, z, x, y, z);
471   fiat_p256_to_generic(&r->X, x);
472   fiat_p256_to_generic(&r->Y, y);
473   fiat_p256_to_generic(&r->Z, z);
474 }
475 
ec_GFp_nistp256_point_mul(const EC_GROUP * group,EC_JACOBIAN * r,const EC_JACOBIAN * p,const EC_SCALAR * scalar)476 static void ec_GFp_nistp256_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
477                                       const EC_JACOBIAN *p,
478                                       const EC_SCALAR *scalar) {
479   fiat_p256_felem p_pre_comp[17][3];
480   OPENSSL_memset(&p_pre_comp, 0, sizeof(p_pre_comp));
481   // Precompute multiples.
482   fiat_p256_from_generic(p_pre_comp[1][0], &p->X);
483   fiat_p256_from_generic(p_pre_comp[1][1], &p->Y);
484   fiat_p256_from_generic(p_pre_comp[1][2], &p->Z);
485   for (size_t j = 2; j <= 16; ++j) {
486     if (j & 1) {
487       fiat_p256_point_add(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2],
488                           p_pre_comp[1][0], p_pre_comp[1][1], p_pre_comp[1][2],
489                           0, p_pre_comp[j - 1][0], p_pre_comp[j - 1][1],
490                           p_pre_comp[j - 1][2]);
491     } else {
492       fiat_p256_point_double(p_pre_comp[j][0], p_pre_comp[j][1],
493                              p_pre_comp[j][2], p_pre_comp[j / 2][0],
494                              p_pre_comp[j / 2][1], p_pre_comp[j / 2][2]);
495     }
496   }
497 
498   // Set nq to the point at infinity.
499   fiat_p256_felem nq[3] = {{0}, {0}, {0}}, ftmp, tmp[3];
500 
501   // Loop over |scalar| msb-to-lsb, incorporating |p_pre_comp| every 5th round.
502   int skip = 1;  // Save two point operations in the first round.
503   for (size_t i = 255; i < 256; i--) {
504     // double
505     if (!skip) {
506       fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
507     }
508 
509     // do other additions every 5 doublings
510     if (i % 5 == 0) {
511       crypto_word_t bits = fiat_p256_get_bit(scalar, i + 4) << 5;
512       bits |= fiat_p256_get_bit(scalar, i + 3) << 4;
513       bits |= fiat_p256_get_bit(scalar, i + 2) << 3;
514       bits |= fiat_p256_get_bit(scalar, i + 1) << 2;
515       bits |= fiat_p256_get_bit(scalar, i) << 1;
516       bits |= fiat_p256_get_bit(scalar, i - 1);
517       crypto_word_t sign, digit;
518       ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
519 
520       // select the point to add or subtract, in constant time.
521       fiat_p256_select_point((fiat_p256_limb_t)digit, 17,
522                              (const fiat_p256_felem(*)[3])p_pre_comp, tmp);
523       fiat_p256_opp(ftmp, tmp[1]);  // (X, -Y, Z) is the negative point.
524       fiat_p256_cmovznz(tmp[1], (fiat_p256_limb_t)sign, tmp[1], ftmp);
525 
526       if (!skip) {
527         fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2],
528                             0 /* mixed */, tmp[0], tmp[1], tmp[2]);
529       } else {
530         fiat_p256_copy(nq[0], tmp[0]);
531         fiat_p256_copy(nq[1], tmp[1]);
532         fiat_p256_copy(nq[2], tmp[2]);
533         skip = 0;
534       }
535     }
536   }
537 
538   fiat_p256_to_generic(&r->X, nq[0]);
539   fiat_p256_to_generic(&r->Y, nq[1]);
540   fiat_p256_to_generic(&r->Z, nq[2]);
541 }
542 
ec_GFp_nistp256_point_mul_base(const EC_GROUP * group,EC_JACOBIAN * r,const EC_SCALAR * scalar)543 static void ec_GFp_nistp256_point_mul_base(const EC_GROUP *group,
544                                            EC_JACOBIAN *r,
545                                            const EC_SCALAR *scalar) {
546   // Set nq to the point at infinity.
547   fiat_p256_felem nq[3] = {{0}, {0}, {0}}, tmp[3];
548 
549   int skip = 1;  // Save two point operations in the first round.
550   for (size_t i = 31; i < 32; i--) {
551     if (!skip) {
552       fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
553     }
554 
555     // First, look 32 bits upwards.
556     crypto_word_t bits = fiat_p256_get_bit(scalar, i + 224) << 3;
557     bits |= fiat_p256_get_bit(scalar, i + 160) << 2;
558     bits |= fiat_p256_get_bit(scalar, i + 96) << 1;
559     bits |= fiat_p256_get_bit(scalar, i + 32);
560     // Select the point to add, in constant time.
561     fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15,
562                                   fiat_p256_g_pre_comp[1], tmp);
563 
564     if (!skip) {
565       fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2],
566                           1 /* mixed */, tmp[0], tmp[1], tmp[2]);
567     } else {
568       fiat_p256_copy(nq[0], tmp[0]);
569       fiat_p256_copy(nq[1], tmp[1]);
570       fiat_p256_copy(nq[2], tmp[2]);
571       skip = 0;
572     }
573 
574     // Second, look at the current position.
575     bits = fiat_p256_get_bit(scalar, i + 192) << 3;
576     bits |= fiat_p256_get_bit(scalar, i + 128) << 2;
577     bits |= fiat_p256_get_bit(scalar, i + 64) << 1;
578     bits |= fiat_p256_get_bit(scalar, i);
579     // Select the point to add, in constant time.
580     fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15,
581                                   fiat_p256_g_pre_comp[0], tmp);
582     fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
583                         tmp[0], tmp[1], tmp[2]);
584   }
585 
586   fiat_p256_to_generic(&r->X, nq[0]);
587   fiat_p256_to_generic(&r->Y, nq[1]);
588   fiat_p256_to_generic(&r->Z, nq[2]);
589 }
590 
ec_GFp_nistp256_point_mul_public(const EC_GROUP * group,EC_JACOBIAN * r,const EC_SCALAR * g_scalar,const EC_JACOBIAN * p,const EC_SCALAR * p_scalar)591 static void ec_GFp_nistp256_point_mul_public(const EC_GROUP *group,
592                                              EC_JACOBIAN *r,
593                                              const EC_SCALAR *g_scalar,
594                                              const EC_JACOBIAN *p,
595                                              const EC_SCALAR *p_scalar) {
596 #define P256_WSIZE_PUBLIC 4
597   // Precompute multiples of |p|. p_pre_comp[i] is (2*i+1) * |p|.
598   fiat_p256_felem p_pre_comp[1 << (P256_WSIZE_PUBLIC - 1)][3];
599   fiat_p256_from_generic(p_pre_comp[0][0], &p->X);
600   fiat_p256_from_generic(p_pre_comp[0][1], &p->Y);
601   fiat_p256_from_generic(p_pre_comp[0][2], &p->Z);
602   fiat_p256_felem p2[3];
603   fiat_p256_point_double(p2[0], p2[1], p2[2], p_pre_comp[0][0],
604                          p_pre_comp[0][1], p_pre_comp[0][2]);
605   for (size_t i = 1; i < OPENSSL_ARRAY_SIZE(p_pre_comp); i++) {
606     fiat_p256_point_add(p_pre_comp[i][0], p_pre_comp[i][1], p_pre_comp[i][2],
607                         p_pre_comp[i - 1][0], p_pre_comp[i - 1][1],
608                         p_pre_comp[i - 1][2], 0 /* not mixed */, p2[0], p2[1],
609                         p2[2]);
610   }
611 
612   // Set up the coefficients for |p_scalar|.
613   int8_t p_wNAF[257];
614   ec_compute_wNAF(group, p_wNAF, p_scalar, 256, P256_WSIZE_PUBLIC);
615 
616   // Set |ret| to the point at infinity.
617   int skip = 1;  // Save some point operations.
618   fiat_p256_felem ret[3] = {{0}, {0}, {0}};
619   for (int i = 256; i >= 0; i--) {
620     if (!skip) {
621       fiat_p256_point_double(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2]);
622     }
623 
624     // For the |g_scalar|, we use the precomputed table without the
625     // constant-time lookup.
626     if (i <= 31) {
627       // First, look 32 bits upwards.
628       crypto_word_t bits = fiat_p256_get_bit(g_scalar, i + 224) << 3;
629       bits |= fiat_p256_get_bit(g_scalar, i + 160) << 2;
630       bits |= fiat_p256_get_bit(g_scalar, i + 96) << 1;
631       bits |= fiat_p256_get_bit(g_scalar, i + 32);
632       if (bits != 0) {
633         size_t index = (size_t)(bits - 1);
634         fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2],
635                             1 /* mixed */, fiat_p256_g_pre_comp[1][index][0],
636                             fiat_p256_g_pre_comp[1][index][1],
637                             fiat_p256_one);
638         skip = 0;
639       }
640 
641       // Second, look at the current position.
642       bits = fiat_p256_get_bit(g_scalar, i + 192) << 3;
643       bits |= fiat_p256_get_bit(g_scalar, i + 128) << 2;
644       bits |= fiat_p256_get_bit(g_scalar, i + 64) << 1;
645       bits |= fiat_p256_get_bit(g_scalar, i);
646       if (bits != 0) {
647         size_t index = (size_t)(bits - 1);
648         fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2],
649                             1 /* mixed */, fiat_p256_g_pre_comp[0][index][0],
650                             fiat_p256_g_pre_comp[0][index][1],
651                             fiat_p256_one);
652         skip = 0;
653       }
654     }
655 
656     int digit = p_wNAF[i];
657     if (digit != 0) {
658       assert(digit & 1);
659       size_t idx = (size_t)(digit < 0 ? (-digit) >> 1 : digit >> 1);
660       fiat_p256_felem *y = &p_pre_comp[idx][1], tmp;
661       if (digit < 0) {
662         fiat_p256_opp(tmp, p_pre_comp[idx][1]);
663         y = &tmp;
664       }
665       if (!skip) {
666         fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2],
667                             0 /* not mixed */, p_pre_comp[idx][0], *y,
668                             p_pre_comp[idx][2]);
669       } else {
670         fiat_p256_copy(ret[0], p_pre_comp[idx][0]);
671         fiat_p256_copy(ret[1], *y);
672         fiat_p256_copy(ret[2], p_pre_comp[idx][2]);
673         skip = 0;
674       }
675     }
676   }
677 
678   fiat_p256_to_generic(&r->X, ret[0]);
679   fiat_p256_to_generic(&r->Y, ret[1]);
680   fiat_p256_to_generic(&r->Z, ret[2]);
681 }
682 
ec_GFp_nistp256_cmp_x_coordinate(const EC_GROUP * group,const EC_JACOBIAN * p,const EC_SCALAR * r)683 static int ec_GFp_nistp256_cmp_x_coordinate(const EC_GROUP *group,
684                                             const EC_JACOBIAN *p,
685                                             const EC_SCALAR *r) {
686   if (ec_GFp_simple_is_at_infinity(group, p)) {
687     return 0;
688   }
689 
690   // We wish to compare X/Z^2 with r. This is equivalent to comparing X with
691   // r*Z^2. Note that X and Z are represented in Montgomery form, while r is
692   // not.
693   fiat_p256_felem Z2_mont;
694   fiat_p256_from_generic(Z2_mont, &p->Z);
695   fiat_p256_mul(Z2_mont, Z2_mont, Z2_mont);
696 
697   fiat_p256_felem r_Z2;
698   fiat_p256_from_words(r_Z2, r->words);  // r < order < p, so this is valid.
699   fiat_p256_mul(r_Z2, r_Z2, Z2_mont);
700 
701   fiat_p256_felem X;
702   fiat_p256_from_generic(X, &p->X);
703   fiat_p256_from_montgomery(X, X);
704 
705   if (OPENSSL_memcmp(&r_Z2, &X, sizeof(r_Z2)) == 0) {
706     return 1;
707   }
708 
709   // During signing the x coefficient is reduced modulo the group order.
710   // Therefore there is a small possibility, less than 1/2^128, that group_order
711   // < p.x < P. in that case we need not only to compare against |r| but also to
712   // compare against r+group_order.
713   assert(group->field.N.width == group->order.N.width);
714   EC_FELEM tmp;
715   BN_ULONG carry =
716       bn_add_words(tmp.words, r->words, group->order.N.d, group->field.N.width);
717   if (carry == 0 &&
718       bn_less_than_words(tmp.words, group->field.N.d, group->field.N.width)) {
719     fiat_p256_from_generic(r_Z2, &tmp);
720     fiat_p256_mul(r_Z2, r_Z2, Z2_mont);
721     if (OPENSSL_memcmp(&r_Z2, &X, sizeof(r_Z2)) == 0) {
722       return 1;
723     }
724   }
725 
726   return 0;
727 }
728 
DEFINE_METHOD_FUNCTION(EC_METHOD,EC_GFp_nistp256_method)729 DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistp256_method) {
730   out->point_get_affine_coordinates =
731       ec_GFp_nistp256_point_get_affine_coordinates;
732   out->add = ec_GFp_nistp256_add;
733   out->dbl = ec_GFp_nistp256_dbl;
734   out->mul = ec_GFp_nistp256_point_mul;
735   out->mul_base = ec_GFp_nistp256_point_mul_base;
736   out->mul_public = ec_GFp_nistp256_point_mul_public;
737   out->felem_mul = ec_GFp_mont_felem_mul;
738   out->felem_sqr = ec_GFp_mont_felem_sqr;
739   out->felem_to_bytes = ec_GFp_mont_felem_to_bytes;
740   out->felem_from_bytes = ec_GFp_mont_felem_from_bytes;
741   out->felem_reduce = ec_GFp_mont_felem_reduce;
742   // TODO(davidben): This should use the specialized field arithmetic
743   // implementation, rather than the generic one.
744   out->felem_exp = ec_GFp_mont_felem_exp;
745   out->scalar_inv0_montgomery = ec_simple_scalar_inv0_montgomery;
746   out->scalar_to_montgomery_inv_vartime =
747       ec_simple_scalar_to_montgomery_inv_vartime;
748   out->cmp_x_coordinate = ec_GFp_nistp256_cmp_x_coordinate;
749 }
750