1 /* Copyright (c) 2020, Google Inc.
2  *
3  * Permission to use, copy, modify, and/or distribute this software for any
4  * purpose with or without fee is hereby granted, provided that the above
5  * copyright notice and this permission notice appear in all copies.
6  *
7  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10  * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14 
15 // Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP
16 // 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as
17 // public domain. Other parts have been replaced to call into code generated by
18 // Fiat (https://github.com/mit-plv/fiat-crypto) in //third_party/fiat.
19 //
20 // The field functions are shared by Ed25519 and X25519 where possible.
21 
22 #include <ring-core/mem.h>
23 
24 #include "internal.h"
25 #include "../internal.h"
26 
27 #if defined(_MSC_VER) && !defined(__clang__)
28 // '=': conversion from 'int64_t' to 'int32_t', possible loss of data
29 #pragma warning(disable: 4242)
30 // '=': conversion from 'int32_t' to 'uint8_t', possible loss of data
31 #pragma warning(disable: 4244)
32 #endif
33 
34 #if defined(__GNUC__) || defined(__clang__)
35 #pragma GCC diagnostic ignored "-Wconversion"
36 #pragma GCC diagnostic ignored "-Wsign-conversion"
37 #endif
38 
39 #if defined(__GNUC__) && !defined(__clang__)
40 #pragma GCC diagnostic ignored "-Winline"
41 #endif
42 
43 // Various pre-computed constants.
44 #include "./curve25519_tables.h"
45 
46 #if defined(BORINGSSL_HAS_UINT128)
47 #if defined(__GNUC__)
48 #pragma GCC diagnostic ignored "-Wpedantic"
49 #endif
50 #include "../../third_party/fiat/curve25519_64.h"
51 #elif defined(OPENSSL_64_BIT)
52 #include "../../third_party/fiat/curve25519_64_msvc.h"
53 #else
54 #include "../../third_party/fiat/curve25519_32.h"
55 #endif
56 
57 
58 // Low-level intrinsic operations
59 
// load_3 reads a 24-bit little-endian integer from the three bytes at |in|.
static uint64_t load_3(const uint8_t *in) {
  return (uint64_t)in[0] | ((uint64_t)in[1] << 8) | ((uint64_t)in[2] << 16);
}
67 
// load_4 reads a 32-bit little-endian integer from the four bytes at |in|.
static uint64_t load_4(const uint8_t *in) {
  return (uint64_t)in[0] | ((uint64_t)in[1] << 8) | ((uint64_t)in[2] << 16) |
         ((uint64_t)in[3] << 24);
}
76 
77 
78 // Field operations.
79 
#if defined(OPENSSL_64_BIT)

// assert_fe asserts that |f| satisfies bounds:
//
//  [[0x0 ~> 0x8cccccccccccc],
//   [0x0 ~> 0x8cccccccccccc],
//   [0x0 ~> 0x8cccccccccccc],
//   [0x0 ~> 0x8cccccccccccc],
//   [0x0 ~> 0x8cccccccccccc]]
//
// See comments in curve25519_64.h for which functions use these bounds for
// inputs or outputs.
//
// NOTE(review): |dev_assert_secret| is presumably a development-only check
// that compiles away in release builds — confirm against internal.h.
#define assert_fe(f)                                                    \
  do {                                                                  \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
      dev_assert_secret(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc));  \
    }                                                                   \
  } while (0)

// assert_fe_loose asserts that |f| satisfies bounds:
//
//  [[0x0 ~> 0x1a666666666664],
//   [0x0 ~> 0x1a666666666664],
//   [0x0 ~> 0x1a666666666664],
//   [0x0 ~> 0x1a666666666664],
//   [0x0 ~> 0x1a666666666664]]
//
// See comments in curve25519_64.h for which functions use these bounds for
// inputs or outputs.
#define assert_fe_loose(f)                                              \
  do {                                                                  \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
      dev_assert_secret(f[_assert_fe_i] <= UINT64_C(0x1a666666666664)); \
    }                                                                   \
  } while (0)

#else

// assert_fe asserts that |f| satisfies bounds:
//
//  [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
//
// See comments in curve25519_32.h for which functions use these bounds for
// inputs or outputs.
#define assert_fe(f)                                                     \
  do {                                                                   \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
      dev_assert_secret(f[_assert_fe_i] <=                               \
             ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u));            \
    }                                                                    \
  } while (0)

// assert_fe_loose asserts that |f| satisfies bounds:
//
//  [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
//
// See comments in curve25519_32.h for which functions use these bounds for
// inputs or outputs.
#define assert_fe_loose(f)                                               \
  do {                                                                   \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
      dev_assert_secret(f[_assert_fe_i] <=                               \
             ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u));            \
    }                                                                    \
  } while (0)

#endif  // OPENSSL_64_BIT

// |fe.v| is passed directly to the generated fiat_25519_* functions below, so
// |fe| must be exactly an array of FE_NUM_LIMBS limbs with no padding.
OPENSSL_STATIC_ASSERT(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS,
                      "fe_limb_t[FE_NUM_LIMBS] is inconsistent with fe");
158 
// fe_frombytes_strict deserializes the 32 little-endian bytes |s| into |h|.
// The caller must have already cleared the top bit of |s|;
// |fiat_25519_from_bytes| requires it.
static void fe_frombytes_strict(fe *h, const uint8_t s[32]) {
  dev_assert_secret((s[31] & 0x80) == 0);
  fiat_25519_from_bytes(h->v, s);
  assert_fe(h->v);
}
165 
// fe_frombytes deserializes |s| into |h|, ignoring the top bit of |s|.
static void fe_frombytes(fe *h, const uint8_t s[32]) {
  // Mask off the top bit so |fe_frombytes_strict|'s precondition holds.
  uint8_t masked[32];
  OPENSSL_memcpy(masked, s, 32);
  masked[31] &= 0x7f;
  fe_frombytes_strict(h, masked);
}
172 
fe_tobytes(uint8_t s[32],const fe * f)173 static void fe_tobytes(uint8_t s[32], const fe *f) {
174   assert_fe(f->v);
175   fiat_25519_to_bytes(s, f->v);
176 }
177 
178 // h = 0
fe_0(fe * h)179 static void fe_0(fe *h) {
180   OPENSSL_memset(h, 0, sizeof(fe));
181 }
182 
#if defined(OPENSSL_SMALL)

// fe_loose_0 sets the loose element |h| to zero.
static void fe_loose_0(fe_loose *h) {
  OPENSSL_memset(h, 0, sizeof(fe_loose));
}

#endif
190 
191 // h = 1
fe_1(fe * h)192 static void fe_1(fe *h) {
193   OPENSSL_memset(h, 0, sizeof(fe));
194   h->v[0] = 1;
195 }
196 
#if defined(OPENSSL_SMALL)

// fe_loose_1 sets the loose element |h| to one.
static void fe_loose_1(fe_loose *h) {
  OPENSSL_memset(h, 0, sizeof(fe_loose));
  h->v[0] = 1;
}

#endif
205 
206 // h = f + g
207 // Can overlap h with f or g.
fe_add(fe_loose * h,const fe * f,const fe * g)208 static void fe_add(fe_loose *h, const fe *f, const fe *g) {
209   assert_fe(f->v);
210   assert_fe(g->v);
211   fiat_25519_add(h->v, f->v, g->v);
212   assert_fe_loose(h->v);
213 }
214 
215 // h = f - g
216 // Can overlap h with f or g.
fe_sub(fe_loose * h,const fe * f,const fe * g)217 static void fe_sub(fe_loose *h, const fe *f, const fe *g) {
218   assert_fe(f->v);
219   assert_fe(g->v);
220   fiat_25519_sub(h->v, f->v, g->v);
221   assert_fe_loose(h->v);
222 }
223 
// fe_carry reduces the loosely-bounded element |f| into the tightly-bounded
// element |h|.
static void fe_carry(fe *h, const fe_loose *f) {
  assert_fe_loose(f->v);
  fiat_25519_carry(h->v, f->v);
  assert_fe(h->v);
}
229 
// fe_mul_impl multiplies two limb vectors. Both inputs may be loose; the
// output |out| is tight.
static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS],
                        const fe_limb_t in1[FE_NUM_LIMBS],
                        const fe_limb_t in2[FE_NUM_LIMBS]) {
  assert_fe_loose(in1);
  assert_fe_loose(in2);
  fiat_25519_carry_mul(out, in1, in2);
  assert_fe(out);
}
238 
// fe_mul_ltt sets |h| = |f| * |g|. The suffix records the bound kinds of
// (output, input1, input2): l = loose, t = tight.
static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) {
  fe_mul_impl(h->v, f->v, g->v);
}
242 
243 // static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) was
244 // removed. This comment is here to make diffs vs. BoringSSL easier to read.
245 
246 
// fe_mul_ttt sets |h| = |f| * |g| with tight output and tight inputs.
static void fe_mul_ttt(fe *h, const fe *f, const fe *g) {
  fe_mul_impl(h->v, f->v, g->v);
}
250 
// fe_mul_tlt sets |h| = |f| * |g| with tight output, loose |f|, tight |g|.
static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) {
  fe_mul_impl(h->v, f->v, g->v);
}
254 
// fe_mul_ttl sets |h| = |f| * |g| with tight output, tight |f|, loose |g|.
static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) {
  fe_mul_impl(h->v, f->v, g->v);
}
258 
// fe_mul_tll sets |h| = |f| * |g| with tight output and loose inputs.
static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) {
  fe_mul_impl(h->v, f->v, g->v);
}
262 
// fe_sq_tl sets |h| = |f|^2 with tight output and loose input.
static void fe_sq_tl(fe *h, const fe_loose *f) {
  assert_fe_loose(f->v);
  fiat_25519_carry_square(h->v, f->v);
  assert_fe(h->v);
}
268 
// fe_sq_tt sets |h| = |f|^2 with tight output and tight input. (A tight
// input also satisfies the loose bound asserted here.)
static void fe_sq_tt(fe *h, const fe *f) {
  assert_fe_loose(f->v);
  fiat_25519_carry_square(h->v, f->v);
  assert_fe(h->v);
}
274 
275 // Replace (f,g) with (g,f) if b == 1;
276 // replace (f,g) with (f,g) if b == 0.
277 //
278 // Preconditions: b in {0,1}.
fe_cswap(fe * f,fe * g,fe_limb_t b)279 static void fe_cswap(fe *f, fe *g, fe_limb_t b) {
280   b = 0-b;
281   for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
282     fe_limb_t x = f->v[i] ^ g->v[i];
283     x &= b;
284     f->v[i] ^= x;
285     g->v[i] ^= x;
286   }
287 }
288 
// fe_mul121666 sets |h| = 121666 * |f| (the scalar constant used by the
// X25519 Montgomery ladder).
static void fe_mul121666(fe *h, const fe_loose *f) {
  assert_fe_loose(f->v);
  fiat_25519_carry_scmul_121666(h->v, f->v);
  assert_fe(h->v);
}
294 
295 // h = -f
fe_neg(fe_loose * h,const fe * f)296 static void fe_neg(fe_loose *h, const fe *f) {
297   assert_fe(f->v);
298   fiat_25519_opp(h->v, f->v);
299   assert_fe_loose(h->v);
300 }
301 
302 // Replace (f,g) with (g,g) if b == 1;
303 // replace (f,g) with (f,g) if b == 0.
304 //
305 // Preconditions: b in {0,1}.
fe_cmov(fe_loose * f,const fe_loose * g,fe_limb_t b)306 static void fe_cmov(fe_loose *f, const fe_loose *g, fe_limb_t b) {
307   // Silence an unused function warning. |fiat_25519_selectznz| isn't quite the
308   // calling convention the rest of this code wants, so implement it by hand.
309   //
310   // TODO(davidben): Switch to fiat's calling convention, or ask fiat to emit a
311   // different one.
312   (void)fiat_25519_selectznz;
313 
314   b = 0-b;
315   for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
316     fe_limb_t x = f->v[i] ^ g->v[i];
317     x &= b;
318     f->v[i] ^= x;
319   }
320 }
321 
322 // h = f
fe_copy(fe * h,const fe * f)323 static void fe_copy(fe *h, const fe *f) {
324   fe_limbs_copy(h->v, f->v);
325 }
326 
// fe_copy_lt copies the tight element |f| into the loose element |h|; tight
// bounds always satisfy loose bounds, so no reduction is needed.
static void fe_copy_lt(fe_loose *h, const fe *f) {
  fe_limbs_copy(h->v, f->v);
}
330 
// fe_loose_invert sets |out| = |z|^-1, computed as z^(p-2) for
// p = 2^255 - 19 via the standard ref10 square-and-multiply addition chain.
// The chain below is order-sensitive; do not reorder.
static void fe_loose_invert(fe *out, const fe_loose *z) {
  fe t0, t1, t2, t3;
  int i;

  fe_sq_tl(&t0, z);
  fe_sq_tt(&t1, &t0);
  for (i = 1; i < 2; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_tlt(&t1, z, &t1);
  fe_mul_ttt(&t0, &t0, &t1);
  fe_sq_tt(&t2, &t0);
  fe_mul_ttt(&t1, &t1, &t2);
  fe_sq_tt(&t2, &t1);
  for (i = 1; i < 5; ++i) {
    fe_sq_tt(&t2, &t2);
  }
  fe_mul_ttt(&t1, &t2, &t1);
  fe_sq_tt(&t2, &t1);
  for (i = 1; i < 10; ++i) {
    fe_sq_tt(&t2, &t2);
  }
  fe_mul_ttt(&t2, &t2, &t1);
  fe_sq_tt(&t3, &t2);
  for (i = 1; i < 20; ++i) {
    fe_sq_tt(&t3, &t3);
  }
  fe_mul_ttt(&t2, &t3, &t2);
  fe_sq_tt(&t2, &t2);
  for (i = 1; i < 10; ++i) {
    fe_sq_tt(&t2, &t2);
  }
  fe_mul_ttt(&t1, &t2, &t1);
  fe_sq_tt(&t2, &t1);
  for (i = 1; i < 50; ++i) {
    fe_sq_tt(&t2, &t2);
  }
  fe_mul_ttt(&t2, &t2, &t1);
  fe_sq_tt(&t3, &t2);
  for (i = 1; i < 100; ++i) {
    fe_sq_tt(&t3, &t3);
  }
  fe_mul_ttt(&t2, &t3, &t2);
  fe_sq_tt(&t2, &t2);
  for (i = 1; i < 50; ++i) {
    fe_sq_tt(&t2, &t2);
  }
  fe_mul_ttt(&t1, &t2, &t1);
  fe_sq_tt(&t1, &t1);
  for (i = 1; i < 5; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_ttt(out, &t1, &t0);
}
388 
// fe_invert sets |out| = |z|^-1 for a tight input.
static void fe_invert(fe *out, const fe *z) {
  fe_loose zl;
  fe_copy_lt(&zl, z);
  fe_loose_invert(out, &zl);
}
394 
395 // return 0 if f == 0
396 // return 1 if f != 0
fe_isnonzero(const fe_loose * f)397 static int fe_isnonzero(const fe_loose *f) {
398   fe tight;
399   fe_carry(&tight, f);
400   uint8_t s[32];
401   fe_tobytes(s, &tight);
402 
403   static const uint8_t zero[32] = {0};
404   return OPENSSL_memcmp(s, zero, sizeof(zero)) != 0;
405 }
406 
407 // return 1 if f is in {1,3,5,...,q-2}
408 // return 0 if f is in {0,2,4,...,q-1}
fe_isnegative(const fe * f)409 static int fe_isnegative(const fe *f) {
410   uint8_t s[32];
411   fe_tobytes(s, f);
412   return s[0] & 1;
413 }
414 
// fe_sq2_tt sets |h| = 2 * |f|^2.
static void fe_sq2_tt(fe *h, const fe *f) {
  fe_sq_tt(h, f);  // h = f^2

  fe_loose doubled;
  fe_add(&doubled, h, h);  // 2 * f^2, loose
  fe_carry(h, &doubled);   // reduce back to tight bounds
}
424 
// fe_pow22523 sets |out| = |z|^((p-5)/8) = z^(2^252 - 3), used below to
// compute square roots during point decompression. The chain is
// order-sensitive; do not reorder.
static void fe_pow22523(fe *out, const fe *z) {
  fe t0, t1, t2;
  int i;

  fe_sq_tt(&t0, z);
  fe_sq_tt(&t1, &t0);
  for (i = 1; i < 2; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_ttt(&t1, z, &t1);
  fe_mul_ttt(&t0, &t0, &t1);
  fe_sq_tt(&t0, &t0);
  fe_mul_ttt(&t0, &t1, &t0);
  fe_sq_tt(&t1, &t0);
  for (i = 1; i < 5; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_ttt(&t0, &t1, &t0);
  fe_sq_tt(&t1, &t0);
  for (i = 1; i < 10; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_ttt(&t1, &t1, &t0);
  fe_sq_tt(&t2, &t1);
  for (i = 1; i < 20; ++i) {
    fe_sq_tt(&t2, &t2);
  }
  fe_mul_ttt(&t1, &t2, &t1);
  fe_sq_tt(&t1, &t1);
  for (i = 1; i < 10; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_ttt(&t0, &t1, &t0);
  fe_sq_tt(&t1, &t0);
  for (i = 1; i < 50; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_ttt(&t1, &t1, &t0);
  fe_sq_tt(&t2, &t1);
  for (i = 1; i < 100; ++i) {
    fe_sq_tt(&t2, &t2);
  }
  fe_mul_ttt(&t1, &t2, &t1);
  fe_sq_tt(&t1, &t1);
  for (i = 1; i < 50; ++i) {
    fe_sq_tt(&t1, &t1);
  }
  fe_mul_ttt(&t0, &t1, &t0);
  fe_sq_tt(&t0, &t0);
  for (i = 1; i < 2; ++i) {
    fe_sq_tt(&t0, &t0);
  }
  fe_mul_ttt(out, &t0, z);
}
481 
482 
483 // Group operations.
484 
// x25519_ge_frombytes_vartime decompresses the 32-byte encoding |s| into the
// point |h|. Returns 1 on success and 0 if |s| does not encode a curve
// point. The top bit of |s| carries the sign of x. Not constant-time.
int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) {
  fe u;
  fe_loose v;
  fe w;
  fe vxx;
  fe_loose check;

  fe_frombytes(&h->Y, s);
  fe_1(&h->Z);
  fe_sq_tt(&w, &h->Y);       // w = y^2
  fe_mul_ttt(&vxx, &w, &d);  // d*y^2
  fe_sub(&v, &w, &h->Z);     // u = y^2-1
  fe_carry(&u, &v);
  fe_add(&v, &vxx, &h->Z);  // v = dy^2+1

  // Candidate square root of u/v: x = u * (u*v)^((q-5)/8).
  fe_mul_ttl(&w, &u, &v);  // w = u*v
  fe_pow22523(&h->X, &w);  // x = w^((q-5)/8)
  fe_mul_ttt(&h->X, &h->X, &u);  // x = u*w^((q-5)/8)

  // Verify v*x^2 == ±u; if v*x^2 == -u, multiply x by sqrt(-1).
  fe_sq_tt(&vxx, &h->X);
  fe_mul_ttl(&vxx, &vxx, &v);
  fe_sub(&check, &vxx, &u);
  if (fe_isnonzero(&check)) {
    fe_add(&check, &vxx, &u);
    if (fe_isnonzero(&check)) {
      return 0;  // u/v has no square root; |s| is not on the curve
    }
    fe_mul_ttt(&h->X, &h->X, &sqrtm1);
  }

  // Pick the root whose parity matches the encoded sign bit.
  if (fe_isnegative(&h->X) != (s[31] >> 7)) {
    fe_loose t;
    fe_neg(&t, &h->X);
    fe_carry(&h->X, &t);
  }

  fe_mul_ttt(&h->T, &h->X, &h->Y);
  return 1;
}
524 
// ge_p2_0 sets |h| to the identity element (0 : 1 : 1).
static void ge_p2_0(ge_p2 *h) {
  fe_0(&h->X);
  fe_1(&h->Y);
  fe_1(&h->Z);
}
530 
// ge_p3_0 sets |h| to the identity element (0 : 1 : 1 : 0).
static void ge_p3_0(ge_p3 *h) {
  fe_0(&h->X);
  fe_1(&h->Y);
  fe_1(&h->Z);
  fe_0(&h->T);
}
537 
#if defined(OPENSSL_SMALL)

// ge_precomp_0 sets |h| to the identity in precomputed form
// (y+x = 1, y-x = 1, 2dxy = 0).
static void ge_precomp_0(ge_precomp *h) {
  fe_loose_1(&h->yplusx);
  fe_loose_1(&h->yminusx);
  fe_loose_0(&h->xy2d);
}

#endif
547 
548 // r = p
ge_p3_to_p2(ge_p2 * r,const ge_p3 * p)549 static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
550   fe_copy(&r->X, &p->X);
551   fe_copy(&r->Y, &p->Y);
552   fe_copy(&r->Z, &p->Z);
553 }
554 
555 // r = p
x25519_ge_p3_to_cached(ge_cached * r,const ge_p3 * p)556 static void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
557   fe_add(&r->YplusX, &p->Y, &p->X);
558   fe_sub(&r->YminusX, &p->Y, &p->X);
559   fe_copy_lt(&r->Z, &p->Z);
560   fe_mul_ltt(&r->T2d, &p->T, &d2);
561 }
562 
563 // r = p
x25519_ge_p1p1_to_p2(ge_p2 * r,const ge_p1p1 * p)564 static void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) {
565   fe_mul_tll(&r->X, &p->X, &p->T);
566   fe_mul_tll(&r->Y, &p->Y, &p->Z);
567   fe_mul_tll(&r->Z, &p->Z, &p->T);
568 }
569 
570 // r = p
x25519_ge_p1p1_to_p3(ge_p3 * r,const ge_p1p1 * p)571 static void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
572   fe_mul_tll(&r->X, &p->X, &p->T);
573   fe_mul_tll(&r->Y, &p->Y, &p->Z);
574   fe_mul_tll(&r->Z, &p->Z, &p->T);
575   fe_mul_tll(&r->T, &p->X, &p->Y);
576 }
577 
578 // r = 2 * p
ge_p2_dbl(ge_p1p1 * r,const ge_p2 * p)579 static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
580   fe trX, trZ, trT;
581   fe t0;
582 
583   fe_sq_tt(&trX, &p->X);
584   fe_sq_tt(&trZ, &p->Y);
585   fe_sq2_tt(&trT, &p->Z);
586   fe_add(&r->Y, &p->X, &p->Y);
587   fe_sq_tl(&t0, &r->Y);
588 
589   fe_add(&r->Y, &trZ, &trX);
590   fe_sub(&r->Z, &trZ, &trX);
591   fe_carry(&trZ, &r->Y);
592   fe_sub(&r->X, &t0, &trZ);
593   fe_carry(&trZ, &r->Z);
594   fe_sub(&r->T, &trT, &trZ);
595 }
596 
597 // r = 2 * p
ge_p3_dbl(ge_p1p1 * r,const ge_p3 * p)598 static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) {
599   ge_p2 q;
600   ge_p3_to_p2(&q, p);
601   ge_p2_dbl(r, &q);
602 }
603 
604 // r = p + q
ge_madd(ge_p1p1 * r,const ge_p3 * p,const ge_precomp * q)605 static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
606   fe trY, trZ, trT;
607 
608   fe_add(&r->X, &p->Y, &p->X);
609   fe_sub(&r->Y, &p->Y, &p->X);
610   fe_mul_tll(&trZ, &r->X, &q->yplusx);
611   fe_mul_tll(&trY, &r->Y, &q->yminusx);
612   fe_mul_tlt(&trT, &q->xy2d, &p->T);
613   fe_add(&r->T, &p->Z, &p->Z);
614   fe_sub(&r->X, &trZ, &trY);
615   fe_add(&r->Y, &trZ, &trY);
616   fe_carry(&trZ, &r->T);
617   fe_add(&r->Z, &trZ, &trT);
618   fe_sub(&r->T, &trZ, &trT);
619 }
620 
621 // r = p - q
ge_msub(ge_p1p1 * r,const ge_p3 * p,const ge_precomp * q)622 static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
623   fe trY, trZ, trT;
624 
625   fe_add(&r->X, &p->Y, &p->X);
626   fe_sub(&r->Y, &p->Y, &p->X);
627   fe_mul_tll(&trZ, &r->X, &q->yminusx);
628   fe_mul_tll(&trY, &r->Y, &q->yplusx);
629   fe_mul_tlt(&trT, &q->xy2d, &p->T);
630   fe_add(&r->T, &p->Z, &p->Z);
631   fe_sub(&r->X, &trZ, &trY);
632   fe_add(&r->Y, &trZ, &trY);
633   fe_carry(&trZ, &r->T);
634   fe_sub(&r->Z, &trZ, &trT);
635   fe_add(&r->T, &trZ, &trT);
636 }
637 
638 // r = p + q
x25519_ge_add(ge_p1p1 * r,const ge_p3 * p,const ge_cached * q)639 static void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
640   fe trX, trY, trZ, trT;
641 
642   fe_add(&r->X, &p->Y, &p->X);
643   fe_sub(&r->Y, &p->Y, &p->X);
644   fe_mul_tll(&trZ, &r->X, &q->YplusX);
645   fe_mul_tll(&trY, &r->Y, &q->YminusX);
646   fe_mul_tlt(&trT, &q->T2d, &p->T);
647   fe_mul_ttl(&trX, &p->Z, &q->Z);
648   fe_add(&r->T, &trX, &trX);
649   fe_sub(&r->X, &trZ, &trY);
650   fe_add(&r->Y, &trZ, &trY);
651   fe_carry(&trZ, &r->T);
652   fe_add(&r->Z, &trZ, &trT);
653   fe_sub(&r->T, &trZ, &trT);
654 }
655 
656 // r = p - q
x25519_ge_sub(ge_p1p1 * r,const ge_p3 * p,const ge_cached * q)657 static void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
658   fe trX, trY, trZ, trT;
659 
660   fe_add(&r->X, &p->Y, &p->X);
661   fe_sub(&r->Y, &p->Y, &p->X);
662   fe_mul_tll(&trZ, &r->X, &q->YminusX);
663   fe_mul_tll(&trY, &r->Y, &q->YplusX);
664   fe_mul_tlt(&trT, &q->T2d, &p->T);
665   fe_mul_ttl(&trX, &p->Z, &q->Z);
666   fe_add(&r->T, &trX, &trX);
667   fe_sub(&r->X, &trZ, &trY);
668   fe_add(&r->Y, &trZ, &trY);
669   fe_carry(&trZ, &r->T);
670   fe_sub(&r->Z, &trZ, &trT);
671   fe_add(&r->T, &trZ, &trT);
672 }
673 
// cmov conditionally copies |u| into |t| in constant time: |t| becomes |u| if
// b == 1 and is unchanged if b == 0. Preconditions: b in {0,1}.
static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {
  fe_cmov(&t->yplusx, &u->yplusx, b);
  fe_cmov(&t->yminusx, &u->yminusx, b);
  fe_cmov(&t->xy2d, &u->xy2d, b);
}
679 
680 #if defined(OPENSSL_SMALL)
681 
// x25519_ge_scalarmult_small_precomp computes |h| = |a| * B using the small
// precomputed table |precomp_table| (15 entries of two serialized field
// elements each).
static void x25519_ge_scalarmult_small_precomp(
    ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) {
  // Expand the serialized table into |ge_precomp| elements.
  ge_precomp multiples[15];

  unsigned i;
  for (i = 0; i < 15; i++) {
    // The precomputed table is assumed to already clear the top bit, so
    // |fe_frombytes_strict| may be used directly.
    const uint8_t *bytes = &precomp_table[i * (2 * 32)];
    fe x, y;
    fe_frombytes_strict(&x, bytes);
    fe_frombytes_strict(&y, bytes + 32);

    ge_precomp *out = &multiples[i];
    fe_add(&out->yplusx, &y, &x);
    fe_sub(&out->yminusx, &y, &x);
    fe_mul_ltt(&out->xy2d, &x, &y);
    // NOTE(review): |fe_mul_llt| is not defined in this file (see the comment
    // near |fe_mul_ltt| saying it was removed), so this OPENSSL_SMALL path
    // appears not to be built here — confirm before enabling OPENSSL_SMALL.
    fe_mul_llt(&out->xy2d, &out->xy2d, &d2);
  }

  // See the comment above |k25519SmallPrecomp| about the structure of the
  // precomputed elements. This loop does 64 additions and 64 doublings to
  // calculate the result.
  ge_p3_0(h);

  // |i| is unsigned: the loop runs i = 63, 62, ..., 0 and exits on wraparound.
  for (i = 63; i < 64; i--) {
    unsigned j;
    signed char index = 0;

    // Gather one bit from each of the four 64-bit quarters of |a|.
    for (j = 0; j < 4; j++) {
      const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7));
      index |= (bit << j);
    }

    // Constant-time lookup: e = multiples[index-1], or the identity if
    // index == 0.
    ge_precomp e;
    ge_precomp_0(&e);

    for (j = 1; j < 16; j++) {
      cmov(&e, &multiples[j - 1], 1 & constant_time_eq_w(index, j));
    }

    // h = 2*h + e.
    ge_cached cached;
    ge_p1p1 r;
    x25519_ge_p3_to_cached(&cached, h);
    x25519_ge_add(&r, h, &cached);
    x25519_ge_p1p1_to_p3(h, &r);

    ge_madd(&r, h, &e);
    x25519_ge_p1p1_to_p3(h, &r);
  }
}
735 
// x25519_ge_scalarmult_base computes |h| = |a| * B. In the OPENSSL_SMALL
// build the ADX path is unavailable, so |use_adx| is ignored.
void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32], int use_adx) {
  (void)use_adx;
  x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp);
}
740 
741 #else
742 
// table_select sets |t| to |b| * (the base-point multiple at table position
// |pos|) in constant time, where |b| is a signed digit in [-8, 8].
static void table_select(ge_precomp *t, const int pos, const signed char b) {
  uint8_t bnegative = constant_time_msb_w(b);
  uint8_t babs = b - ((bnegative & b) << 1);  // |b| without branching

  // Start from the serialized identity when b == 0 (leading 1 bytes in the
  // first two rows); otherwise start from zero and XOR in the table entry.
  uint8_t t_bytes[3][32] = {
      {constant_time_is_zero_w(b) & 1}, {constant_time_is_zero_w(b) & 1}, {0}};
#if defined(__clang__) // materialize for vectorization, 6% speedup
  __asm__("" : "+m" (t_bytes) : /*no inputs*/);
#endif
  OPENSSL_STATIC_ASSERT(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]), "");
  // Constant-time scan: only the entry with babs == 1 + i is XORed in.
  for (int i = 0; i < 8; i++) {
    constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i],
                                     sizeof(t_bytes),
                                     constant_time_eq_w(babs, 1 + i));
  }

  fe yplusx, yminusx, xy2d;
  fe_frombytes_strict(&yplusx, t_bytes[0]);
  fe_frombytes_strict(&yminusx, t_bytes[1]);
  fe_frombytes_strict(&xy2d, t_bytes[2]);

  fe_copy_lt(&t->yplusx, &yplusx);
  fe_copy_lt(&t->yminusx, &yminusx);
  fe_copy_lt(&t->xy2d, &xy2d);

  // Negating a precomputed point swaps y+x with y-x and negates 2dxy.
  ge_precomp minust;
  fe_copy_lt(&minust.yplusx, &yminusx);
  fe_copy_lt(&minust.yminusx, &yplusx);
  fe_neg(&minust.xy2d, &xy2d);
  cmov(t, &minust, bnegative>>7);
}
774 
775 // h = a * B
776 // where a = a[0]+256*a[1]+...+256^31 a[31]
777 // B is the Ed25519 base point (x,4/5) with x positive.
778 //
779 // Preconditions:
780 //   a[31] <= 127
x25519_ge_scalarmult_base(ge_p3 * h,const uint8_t a[32],int use_adx)781 void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32], int use_adx) {
782 #if defined(BORINGSSL_FE25519_ADX)
783   if (use_adx) {
784     uint8_t t[4][32];
785     x25519_ge_scalarmult_base_adx(t, a);
786     fiat_25519_from_bytes(h->X.v, t[0]);
787     fiat_25519_from_bytes(h->Y.v, t[1]);
788     fiat_25519_from_bytes(h->Z.v, t[2]);
789     fiat_25519_from_bytes(h->T.v, t[3]);
790     return;
791   }
792 #else
793   (void)use_adx;
794 #endif
795   signed char e[64];
796   signed char carry;
797   ge_p1p1 r;
798   ge_p2 s;
799   ge_precomp t;
800   int i;
801 
802   for (i = 0; i < 32; ++i) {
803     e[2 * i + 0] = (a[i] >> 0) & 15;
804     e[2 * i + 1] = (a[i] >> 4) & 15;
805   }
806   // each e[i] is between 0 and 15
807   // e[63] is between 0 and 7
808 
809   carry = 0;
810   for (i = 0; i < 63; ++i) {
811     e[i] += carry;
812     carry = e[i] + 8;
813     carry >>= 4;
814     e[i] -= carry << 4;
815   }
816   e[63] += carry;
817   // each e[i] is between -8 and 8
818 
819   ge_p3_0(h);
820   for (i = 1; i < 64; i += 2) {
821     table_select(&t, i / 2, e[i]);
822     ge_madd(&r, h, &t);
823     x25519_ge_p1p1_to_p3(h, &r);
824   }
825 
826   ge_p3_dbl(&r, h);
827   x25519_ge_p1p1_to_p2(&s, &r);
828   ge_p2_dbl(&r, &s);
829   x25519_ge_p1p1_to_p2(&s, &r);
830   ge_p2_dbl(&r, &s);
831   x25519_ge_p1p1_to_p2(&s, &r);
832   ge_p2_dbl(&r, &s);
833   x25519_ge_p1p1_to_p3(h, &r);
834 
835   for (i = 0; i < 64; i += 2) {
836     table_select(&t, i / 2, e[i]);
837     ge_madd(&r, h, &t);
838     x25519_ge_p1p1_to_p3(h, &r);
839   }
840 }
841 
842 #endif
843 
// slide converts the 256-bit little-endian integer |a| into a sliding-window
// form |r|: 256 signed digits with a = sum(r[i] * 2^i), where each nonzero
// digit is odd and |r[i]| <= 15, and nonzero digits are spaced out so that
// few table additions are needed.
static void slide(signed char *r, const uint8_t *a) {
  int i;
  int b;
  int k;

  // Start from the plain bit representation.
  for (i = 0; i < 256; ++i) {
    r[i] = 1 & (a[i >> 3] >> (i & 7));
  }

  // Greedily merge each set bit with up to 6 following bits.
  for (i = 0; i < 256; ++i) {
    if (r[i]) {
      for (b = 1; b <= 6 && i + b < 256; ++b) {
        if (r[i + b]) {
          if (r[i] + (r[i + b] << b) <= 15) {
            r[i] += r[i + b] << b;
            r[i + b] = 0;
          } else if (r[i] - (r[i + b] << b) >= -15) {
            // Subtract instead, then propagate the borrow upward.
            r[i] -= r[i + b] << b;
            for (k = i + b; k < 256; ++k) {
              if (!r[k]) {
                r[k] = 1;
                break;
              }
              r[k] = 0;
            }
          } else {
            break;
          }
        }
      }
    }
  }
}
877 
878 // r = a * A + b * B
879 // where a = a[0]+256*a[1]+...+256^31 a[31].
880 // and b = b[0]+256*b[1]+...+256^31 b[31].
881 // B is the Ed25519 base point (x,4/5) with x positive.
ge_double_scalarmult_vartime(ge_p2 * r,const uint8_t * a,const ge_p3 * A,const uint8_t * b)882 static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
883                                          const ge_p3 *A, const uint8_t *b) {
884   signed char aslide[256];
885   signed char bslide[256];
886   ge_cached Ai[8];  // A,3A,5A,7A,9A,11A,13A,15A
887   ge_p1p1 t;
888   ge_p3 u;
889   ge_p3 A2;
890   int i;
891 
892   slide(aslide, a);
893   slide(bslide, b);
894 
895   x25519_ge_p3_to_cached(&Ai[0], A);
896   ge_p3_dbl(&t, A);
897   x25519_ge_p1p1_to_p3(&A2, &t);
898   x25519_ge_add(&t, &A2, &Ai[0]);
899   x25519_ge_p1p1_to_p3(&u, &t);
900   x25519_ge_p3_to_cached(&Ai[1], &u);
901   x25519_ge_add(&t, &A2, &Ai[1]);
902   x25519_ge_p1p1_to_p3(&u, &t);
903   x25519_ge_p3_to_cached(&Ai[2], &u);
904   x25519_ge_add(&t, &A2, &Ai[2]);
905   x25519_ge_p1p1_to_p3(&u, &t);
906   x25519_ge_p3_to_cached(&Ai[3], &u);
907   x25519_ge_add(&t, &A2, &Ai[3]);
908   x25519_ge_p1p1_to_p3(&u, &t);
909   x25519_ge_p3_to_cached(&Ai[4], &u);
910   x25519_ge_add(&t, &A2, &Ai[4]);
911   x25519_ge_p1p1_to_p3(&u, &t);
912   x25519_ge_p3_to_cached(&Ai[5], &u);
913   x25519_ge_add(&t, &A2, &Ai[5]);
914   x25519_ge_p1p1_to_p3(&u, &t);
915   x25519_ge_p3_to_cached(&Ai[6], &u);
916   x25519_ge_add(&t, &A2, &Ai[6]);
917   x25519_ge_p1p1_to_p3(&u, &t);
918   x25519_ge_p3_to_cached(&Ai[7], &u);
919 
920   ge_p2_0(r);
921 
922   for (i = 255; i >= 0; --i) {
923     if (aslide[i] || bslide[i]) {
924       break;
925     }
926   }
927 
928   for (; i >= 0; --i) {
929     ge_p2_dbl(&t, r);
930 
931     if (aslide[i] > 0) {
932       x25519_ge_p1p1_to_p3(&u, &t);
933       x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]);
934     } else if (aslide[i] < 0) {
935       x25519_ge_p1p1_to_p3(&u, &t);
936       x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
937     }
938 
939     if (bslide[i] > 0) {
940       x25519_ge_p1p1_to_p3(&u, &t);
941       ge_madd(&t, &u, &Bi[bslide[i] / 2]);
942     } else if (bslide[i] < 0) {
943       x25519_ge_p1p1_to_p3(&u, &t);
944       ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]);
945     }
946 
947     x25519_ge_p1p1_to_p2(r, &t);
948   }
949 }
950 
// int64_lshift21 returns |a| shifted left by 21 bits. Left-shifting a signed
// value into the sign bit is undefined in C, so the shift is performed on the
// unsigned representation and converted back.
static inline int64_t int64_lshift21(int64_t a) {
  return (int64_t)((uint64_t)a << 21);
}
956 
// The set of scalars is \Z/l
// where l = 2^252 + 27742317777372353535851937790883648493.

// Input:
//   s[0]+256*s[1]+...+256^63*s[63] = s
//
// Output:
//   s[0]+256*s[1]+...+256^31*s[31] = s mod l
//   where l = 2^252 + 27742317777372353535851937790883648493.
//   Overwrites s in place.
//
// NOTE(review): this is the ref10 sc_reduce routine. The carry/fold steps
// below are order-sensitive; do not reorder them.
void x25519_sc_reduce(uint8_t s[64]) {
  // Decompose the 512-bit little-endian input into 24 limbs of 21 bits each
  // (s23 takes the remaining high bits without masking).
  int64_t s0 = 2097151 & load_3(s);
  int64_t s1 = 2097151 & (load_4(s + 2) >> 5);
  int64_t s2 = 2097151 & (load_3(s + 5) >> 2);
  int64_t s3 = 2097151 & (load_4(s + 7) >> 7);
  int64_t s4 = 2097151 & (load_4(s + 10) >> 4);
  int64_t s5 = 2097151 & (load_3(s + 13) >> 1);
  int64_t s6 = 2097151 & (load_4(s + 15) >> 6);
  int64_t s7 = 2097151 & (load_3(s + 18) >> 3);
  int64_t s8 = 2097151 & load_3(s + 21);
  int64_t s9 = 2097151 & (load_4(s + 23) >> 5);
  int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
  int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
  int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
  int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
  int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
  int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
  int64_t s16 = 2097151 & load_3(s + 42);
  int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
  int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
  int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
  int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
  int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
  int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
  int64_t s23 = (load_4(s + 60) >> 3);
  int64_t carry0;
  int64_t carry1;
  int64_t carry2;
  int64_t carry3;
  int64_t carry4;
  int64_t carry5;
  int64_t carry6;
  int64_t carry7;
  int64_t carry8;
  int64_t carry9;
  int64_t carry10;
  int64_t carry11;
  int64_t carry12;
  int64_t carry13;
  int64_t carry14;
  int64_t carry15;
  int64_t carry16;

  // Fold the top limbs s23..s18 down into lower limbs, using the identity
  // 2^252 = -27742317777372353535851937790883648493 (mod l); the constants
  // 666643, 470296, 654183, -997805, 136657, -683901 are ref10's signed
  // 21-bit limb decomposition of that value.
  s11 += s23 * 666643;
  s12 += s23 * 470296;
  s13 += s23 * 654183;
  s14 -= s23 * 997805;
  s15 += s23 * 136657;
  s16 -= s23 * 683901;
  s23 = 0;

  s10 += s22 * 666643;
  s11 += s22 * 470296;
  s12 += s22 * 654183;
  s13 -= s22 * 997805;
  s14 += s22 * 136657;
  s15 -= s22 * 683901;
  s22 = 0;

  s9 += s21 * 666643;
  s10 += s21 * 470296;
  s11 += s21 * 654183;
  s12 -= s21 * 997805;
  s13 += s21 * 136657;
  s14 -= s21 * 683901;
  s21 = 0;

  s8 += s20 * 666643;
  s9 += s20 * 470296;
  s10 += s20 * 654183;
  s11 -= s20 * 997805;
  s12 += s20 * 136657;
  s13 -= s20 * 683901;
  s20 = 0;

  s7 += s19 * 666643;
  s8 += s19 * 470296;
  s9 += s19 * 654183;
  s10 -= s19 * 997805;
  s11 += s19 * 136657;
  s12 -= s19 * 683901;
  s19 = 0;

  s6 += s18 * 666643;
  s7 += s18 * 470296;
  s8 += s18 * 654183;
  s9 -= s18 * 997805;
  s10 += s18 * 136657;
  s11 -= s18 * 683901;
  s18 = 0;

  // Centered carry pass (even limbs, then odd) to keep magnitudes bounded
  // before the next round of folds. (x + 2^20) >> 21 rounds x/2^21 to
  // nearest, so each limb ends up in [-2^20, 2^20).
  carry6 = (s6 + (1 << 20)) >> 21;
  s7 += carry6;
  s6 -= int64_lshift21(carry6);
  carry8 = (s8 + (1 << 20)) >> 21;
  s9 += carry8;
  s8 -= int64_lshift21(carry8);
  carry10 = (s10 + (1 << 20)) >> 21;
  s11 += carry10;
  s10 -= int64_lshift21(carry10);
  carry12 = (s12 + (1 << 20)) >> 21;
  s13 += carry12;
  s12 -= int64_lshift21(carry12);
  carry14 = (s14 + (1 << 20)) >> 21;
  s15 += carry14;
  s14 -= int64_lshift21(carry14);
  carry16 = (s16 + (1 << 20)) >> 21;
  s17 += carry16;
  s16 -= int64_lshift21(carry16);

  carry7 = (s7 + (1 << 20)) >> 21;
  s8 += carry7;
  s7 -= int64_lshift21(carry7);
  carry9 = (s9 + (1 << 20)) >> 21;
  s10 += carry9;
  s9 -= int64_lshift21(carry9);
  carry11 = (s11 + (1 << 20)) >> 21;
  s12 += carry11;
  s11 -= int64_lshift21(carry11);
  carry13 = (s13 + (1 << 20)) >> 21;
  s14 += carry13;
  s13 -= int64_lshift21(carry13);
  carry15 = (s15 + (1 << 20)) >> 21;
  s16 += carry15;
  s15 -= int64_lshift21(carry15);

  // Fold the next band of high limbs, s17..s12, down likewise.
  s5 += s17 * 666643;
  s6 += s17 * 470296;
  s7 += s17 * 654183;
  s8 -= s17 * 997805;
  s9 += s17 * 136657;
  s10 -= s17 * 683901;
  s17 = 0;

  s4 += s16 * 666643;
  s5 += s16 * 470296;
  s6 += s16 * 654183;
  s7 -= s16 * 997805;
  s8 += s16 * 136657;
  s9 -= s16 * 683901;
  s16 = 0;

  s3 += s15 * 666643;
  s4 += s15 * 470296;
  s5 += s15 * 654183;
  s6 -= s15 * 997805;
  s7 += s15 * 136657;
  s8 -= s15 * 683901;
  s15 = 0;

  s2 += s14 * 666643;
  s3 += s14 * 470296;
  s4 += s14 * 654183;
  s5 -= s14 * 997805;
  s6 += s14 * 136657;
  s7 -= s14 * 683901;
  s14 = 0;

  s1 += s13 * 666643;
  s2 += s13 * 470296;
  s3 += s13 * 654183;
  s4 -= s13 * 997805;
  s5 += s13 * 136657;
  s6 -= s13 * 683901;
  s13 = 0;

  s0 += s12 * 666643;
  s1 += s12 * 470296;
  s2 += s12 * 654183;
  s3 -= s12 * 997805;
  s4 += s12 * 136657;
  s5 -= s12 * 683901;
  s12 = 0;

  // Centered carry pass over the low 12 limbs; carry11 regenerates s12.
  carry0 = (s0 + (1 << 20)) >> 21;
  s1 += carry0;
  s0 -= int64_lshift21(carry0);
  carry2 = (s2 + (1 << 20)) >> 21;
  s3 += carry2;
  s2 -= int64_lshift21(carry2);
  carry4 = (s4 + (1 << 20)) >> 21;
  s5 += carry4;
  s4 -= int64_lshift21(carry4);
  carry6 = (s6 + (1 << 20)) >> 21;
  s7 += carry6;
  s6 -= int64_lshift21(carry6);
  carry8 = (s8 + (1 << 20)) >> 21;
  s9 += carry8;
  s8 -= int64_lshift21(carry8);
  carry10 = (s10 + (1 << 20)) >> 21;
  s11 += carry10;
  s10 -= int64_lshift21(carry10);

  carry1 = (s1 + (1 << 20)) >> 21;
  s2 += carry1;
  s1 -= int64_lshift21(carry1);
  carry3 = (s3 + (1 << 20)) >> 21;
  s4 += carry3;
  s3 -= int64_lshift21(carry3);
  carry5 = (s5 + (1 << 20)) >> 21;
  s6 += carry5;
  s5 -= int64_lshift21(carry5);
  carry7 = (s7 + (1 << 20)) >> 21;
  s8 += carry7;
  s7 -= int64_lshift21(carry7);
  carry9 = (s9 + (1 << 20)) >> 21;
  s10 += carry9;
  s9 -= int64_lshift21(carry9);
  carry11 = (s11 + (1 << 20)) >> 21;
  s12 += carry11;
  s11 -= int64_lshift21(carry11);

  // Fold the regenerated s12 back in, then run a full sequential carry
  // chain (floor division this time, no rounding bias).
  s0 += s12 * 666643;
  s1 += s12 * 470296;
  s2 += s12 * 654183;
  s3 -= s12 * 997805;
  s4 += s12 * 136657;
  s5 -= s12 * 683901;
  s12 = 0;

  carry0 = s0 >> 21;
  s1 += carry0;
  s0 -= int64_lshift21(carry0);
  carry1 = s1 >> 21;
  s2 += carry1;
  s1 -= int64_lshift21(carry1);
  carry2 = s2 >> 21;
  s3 += carry2;
  s2 -= int64_lshift21(carry2);
  carry3 = s3 >> 21;
  s4 += carry3;
  s3 -= int64_lshift21(carry3);
  carry4 = s4 >> 21;
  s5 += carry4;
  s4 -= int64_lshift21(carry4);
  carry5 = s5 >> 21;
  s6 += carry5;
  s5 -= int64_lshift21(carry5);
  carry6 = s6 >> 21;
  s7 += carry6;
  s6 -= int64_lshift21(carry6);
  carry7 = s7 >> 21;
  s8 += carry7;
  s7 -= int64_lshift21(carry7);
  carry8 = s8 >> 21;
  s9 += carry8;
  s8 -= int64_lshift21(carry8);
  carry9 = s9 >> 21;
  s10 += carry9;
  s9 -= int64_lshift21(carry9);
  carry10 = s10 >> 21;
  s11 += carry10;
  s10 -= int64_lshift21(carry10);
  carry11 = s11 >> 21;
  s12 += carry11;
  s11 -= int64_lshift21(carry11);

  // One final fold of s12 and carry chain leaves every limb in [0, 2^21)
  // and the value fully reduced mod l.
  s0 += s12 * 666643;
  s1 += s12 * 470296;
  s2 += s12 * 654183;
  s3 -= s12 * 997805;
  s4 += s12 * 136657;
  s5 -= s12 * 683901;
  s12 = 0;

  carry0 = s0 >> 21;
  s1 += carry0;
  s0 -= int64_lshift21(carry0);
  carry1 = s1 >> 21;
  s2 += carry1;
  s1 -= int64_lshift21(carry1);
  carry2 = s2 >> 21;
  s3 += carry2;
  s2 -= int64_lshift21(carry2);
  carry3 = s3 >> 21;
  s4 += carry3;
  s3 -= int64_lshift21(carry3);
  carry4 = s4 >> 21;
  s5 += carry4;
  s4 -= int64_lshift21(carry4);
  carry5 = s5 >> 21;
  s6 += carry5;
  s5 -= int64_lshift21(carry5);
  carry6 = s6 >> 21;
  s7 += carry6;
  s6 -= int64_lshift21(carry6);
  carry7 = s7 >> 21;
  s8 += carry7;
  s7 -= int64_lshift21(carry7);
  carry8 = s8 >> 21;
  s9 += carry8;
  s8 -= int64_lshift21(carry8);
  carry9 = s9 >> 21;
  s10 += carry9;
  s9 -= int64_lshift21(carry9);
  carry10 = s10 >> 21;
  s11 += carry10;
  s10 -= int64_lshift21(carry10);

  // Re-encode the 12 remaining 21-bit limbs as 32 little-endian bytes.
  s[0] = s0 >> 0;
  s[1] = s0 >> 8;
  s[2] = (s0 >> 16) | (s1 << 5);
  s[3] = s1 >> 3;
  s[4] = s1 >> 11;
  s[5] = (s1 >> 19) | (s2 << 2);
  s[6] = s2 >> 6;
  s[7] = (s2 >> 14) | (s3 << 7);
  s[8] = s3 >> 1;
  s[9] = s3 >> 9;
  s[10] = (s3 >> 17) | (s4 << 4);
  s[11] = s4 >> 4;
  s[12] = s4 >> 12;
  s[13] = (s4 >> 20) | (s5 << 1);
  s[14] = s5 >> 7;
  s[15] = (s5 >> 15) | (s6 << 6);
  s[16] = s6 >> 2;
  s[17] = s6 >> 10;
  s[18] = (s6 >> 18) | (s7 << 3);
  s[19] = s7 >> 5;
  s[20] = s7 >> 13;
  s[21] = s8 >> 0;
  s[22] = s8 >> 8;
  s[23] = (s8 >> 16) | (s9 << 5);
  s[24] = s9 >> 3;
  s[25] = s9 >> 11;
  s[26] = (s9 >> 19) | (s10 << 2);
  s[27] = s10 >> 6;
  s[28] = (s10 >> 14) | (s11 << 7);
  s[29] = s11 >> 1;
  s[30] = s11 >> 9;
  s[31] = s11 >> 17;
}
1299 
// Input:
//   a[0]+256*a[1]+...+256^31*a[31] = a
//   b[0]+256*b[1]+...+256^31*b[31] = b
//   c[0]+256*c[1]+...+256^31*c[31] = c
//
// Output:
//   s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
//   where l = 2^252 + 27742317777372353535851937790883648493.
//
// NOTE(review): ref10 sc_muladd — schoolbook multiply on 21-bit limbs
// followed by the same reduction as x25519_sc_reduce. The carry/fold steps
// are order-sensitive; do not reorder them.
static void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b,
                      const uint8_t *c) {
  // Decompose each 256-bit input into 12 limbs of 21 bits (the last limb of
  // each takes the remaining high bits without masking).
  int64_t a0 = 2097151 & load_3(a);
  int64_t a1 = 2097151 & (load_4(a + 2) >> 5);
  int64_t a2 = 2097151 & (load_3(a + 5) >> 2);
  int64_t a3 = 2097151 & (load_4(a + 7) >> 7);
  int64_t a4 = 2097151 & (load_4(a + 10) >> 4);
  int64_t a5 = 2097151 & (load_3(a + 13) >> 1);
  int64_t a6 = 2097151 & (load_4(a + 15) >> 6);
  int64_t a7 = 2097151 & (load_3(a + 18) >> 3);
  int64_t a8 = 2097151 & load_3(a + 21);
  int64_t a9 = 2097151 & (load_4(a + 23) >> 5);
  int64_t a10 = 2097151 & (load_3(a + 26) >> 2);
  int64_t a11 = (load_4(a + 28) >> 7);
  int64_t b0 = 2097151 & load_3(b);
  int64_t b1 = 2097151 & (load_4(b + 2) >> 5);
  int64_t b2 = 2097151 & (load_3(b + 5) >> 2);
  int64_t b3 = 2097151 & (load_4(b + 7) >> 7);
  int64_t b4 = 2097151 & (load_4(b + 10) >> 4);
  int64_t b5 = 2097151 & (load_3(b + 13) >> 1);
  int64_t b6 = 2097151 & (load_4(b + 15) >> 6);
  int64_t b7 = 2097151 & (load_3(b + 18) >> 3);
  int64_t b8 = 2097151 & load_3(b + 21);
  int64_t b9 = 2097151 & (load_4(b + 23) >> 5);
  int64_t b10 = 2097151 & (load_3(b + 26) >> 2);
  int64_t b11 = (load_4(b + 28) >> 7);
  int64_t c0 = 2097151 & load_3(c);
  int64_t c1 = 2097151 & (load_4(c + 2) >> 5);
  int64_t c2 = 2097151 & (load_3(c + 5) >> 2);
  int64_t c3 = 2097151 & (load_4(c + 7) >> 7);
  int64_t c4 = 2097151 & (load_4(c + 10) >> 4);
  int64_t c5 = 2097151 & (load_3(c + 13) >> 1);
  int64_t c6 = 2097151 & (load_4(c + 15) >> 6);
  int64_t c7 = 2097151 & (load_3(c + 18) >> 3);
  int64_t c8 = 2097151 & load_3(c + 21);
  int64_t c9 = 2097151 & (load_4(c + 23) >> 5);
  int64_t c10 = 2097151 & (load_3(c + 26) >> 2);
  int64_t c11 = (load_4(c + 28) >> 7);
  int64_t s0;
  int64_t s1;
  int64_t s2;
  int64_t s3;
  int64_t s4;
  int64_t s5;
  int64_t s6;
  int64_t s7;
  int64_t s8;
  int64_t s9;
  int64_t s10;
  int64_t s11;
  int64_t s12;
  int64_t s13;
  int64_t s14;
  int64_t s15;
  int64_t s16;
  int64_t s17;
  int64_t s18;
  int64_t s19;
  int64_t s20;
  int64_t s21;
  int64_t s22;
  int64_t s23;
  int64_t carry0;
  int64_t carry1;
  int64_t carry2;
  int64_t carry3;
  int64_t carry4;
  int64_t carry5;
  int64_t carry6;
  int64_t carry7;
  int64_t carry8;
  int64_t carry9;
  int64_t carry10;
  int64_t carry11;
  int64_t carry12;
  int64_t carry13;
  int64_t carry14;
  int64_t carry15;
  int64_t carry16;
  int64_t carry17;
  int64_t carry18;
  int64_t carry19;
  int64_t carry20;
  int64_t carry21;
  int64_t carry22;

  // Schoolbook product a*b plus addend c, as 24 (unreduced) limbs. Each
  // s_k sums the a_i*b_j with i+j == k; products of 21-bit limbs fit well
  // within int64_t.
  s0 = c0 + a0 * b0;
  s1 = c1 + a0 * b1 + a1 * b0;
  s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0;
  s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0;
  s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0;
  s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0;
  s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0;
  s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 +
       a6 * b1 + a7 * b0;
  s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 +
       a6 * b2 + a7 * b1 + a8 * b0;
  s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 +
       a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0;
  s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 +
        a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0;
  s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 +
        a6 * b5 + a7 * b4 + a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0;
  s12 = a1 * b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 + a7 * b5 +
        a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1;
  s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 +
        a9 * b4 + a10 * b3 + a11 * b2;
  s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5 +
        a10 * b4 + a11 * b3;
  s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 + a10 * b5 +
        a11 * b4;
  s16 = a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5;
  s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6;
  s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7;
  s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8;
  s20 = a9 * b11 + a10 * b10 + a11 * b9;
  s21 = a10 * b11 + a11 * b10;
  s22 = a11 * b11;
  s23 = 0;

  // Centered carry pass (even limbs, then odd): (x + 2^20) >> 21 rounds
  // x/2^21 to nearest, bounding each limb before reduction.
  carry0 = (s0 + (1 << 20)) >> 21;
  s1 += carry0;
  s0 -= int64_lshift21(carry0);
  carry2 = (s2 + (1 << 20)) >> 21;
  s3 += carry2;
  s2 -= int64_lshift21(carry2);
  carry4 = (s4 + (1 << 20)) >> 21;
  s5 += carry4;
  s4 -= int64_lshift21(carry4);
  carry6 = (s6 + (1 << 20)) >> 21;
  s7 += carry6;
  s6 -= int64_lshift21(carry6);
  carry8 = (s8 + (1 << 20)) >> 21;
  s9 += carry8;
  s8 -= int64_lshift21(carry8);
  carry10 = (s10 + (1 << 20)) >> 21;
  s11 += carry10;
  s10 -= int64_lshift21(carry10);
  carry12 = (s12 + (1 << 20)) >> 21;
  s13 += carry12;
  s12 -= int64_lshift21(carry12);
  carry14 = (s14 + (1 << 20)) >> 21;
  s15 += carry14;
  s14 -= int64_lshift21(carry14);
  carry16 = (s16 + (1 << 20)) >> 21;
  s17 += carry16;
  s16 -= int64_lshift21(carry16);
  carry18 = (s18 + (1 << 20)) >> 21;
  s19 += carry18;
  s18 -= int64_lshift21(carry18);
  carry20 = (s20 + (1 << 20)) >> 21;
  s21 += carry20;
  s20 -= int64_lshift21(carry20);
  carry22 = (s22 + (1 << 20)) >> 21;
  s23 += carry22;
  s22 -= int64_lshift21(carry22);

  carry1 = (s1 + (1 << 20)) >> 21;
  s2 += carry1;
  s1 -= int64_lshift21(carry1);
  carry3 = (s3 + (1 << 20)) >> 21;
  s4 += carry3;
  s3 -= int64_lshift21(carry3);
  carry5 = (s5 + (1 << 20)) >> 21;
  s6 += carry5;
  s5 -= int64_lshift21(carry5);
  carry7 = (s7 + (1 << 20)) >> 21;
  s8 += carry7;
  s7 -= int64_lshift21(carry7);
  carry9 = (s9 + (1 << 20)) >> 21;
  s10 += carry9;
  s9 -= int64_lshift21(carry9);
  carry11 = (s11 + (1 << 20)) >> 21;
  s12 += carry11;
  s11 -= int64_lshift21(carry11);
  carry13 = (s13 + (1 << 20)) >> 21;
  s14 += carry13;
  s13 -= int64_lshift21(carry13);
  carry15 = (s15 + (1 << 20)) >> 21;
  s16 += carry15;
  s15 -= int64_lshift21(carry15);
  carry17 = (s17 + (1 << 20)) >> 21;
  s18 += carry17;
  s17 -= int64_lshift21(carry17);
  carry19 = (s19 + (1 << 20)) >> 21;
  s20 += carry19;
  s19 -= int64_lshift21(carry19);
  carry21 = (s21 + (1 << 20)) >> 21;
  s22 += carry21;
  s21 -= int64_lshift21(carry21);

  // From here on this is the same reduction as x25519_sc_reduce: fold the
  // top limbs down using 2^252 = -27742317777372353535851937790883648493
  // (mod l), whose signed 21-bit limbs are the constants below.
  s11 += s23 * 666643;
  s12 += s23 * 470296;
  s13 += s23 * 654183;
  s14 -= s23 * 997805;
  s15 += s23 * 136657;
  s16 -= s23 * 683901;
  s23 = 0;

  s10 += s22 * 666643;
  s11 += s22 * 470296;
  s12 += s22 * 654183;
  s13 -= s22 * 997805;
  s14 += s22 * 136657;
  s15 -= s22 * 683901;
  s22 = 0;

  s9 += s21 * 666643;
  s10 += s21 * 470296;
  s11 += s21 * 654183;
  s12 -= s21 * 997805;
  s13 += s21 * 136657;
  s14 -= s21 * 683901;
  s21 = 0;

  s8 += s20 * 666643;
  s9 += s20 * 470296;
  s10 += s20 * 654183;
  s11 -= s20 * 997805;
  s12 += s20 * 136657;
  s13 -= s20 * 683901;
  s20 = 0;

  s7 += s19 * 666643;
  s8 += s19 * 470296;
  s9 += s19 * 654183;
  s10 -= s19 * 997805;
  s11 += s19 * 136657;
  s12 -= s19 * 683901;
  s19 = 0;

  s6 += s18 * 666643;
  s7 += s18 * 470296;
  s8 += s18 * 654183;
  s9 -= s18 * 997805;
  s10 += s18 * 136657;
  s11 -= s18 * 683901;
  s18 = 0;

  // Interleaved centered carries between folds keep limbs bounded.
  carry6 = (s6 + (1 << 20)) >> 21;
  s7 += carry6;
  s6 -= int64_lshift21(carry6);
  carry8 = (s8 + (1 << 20)) >> 21;
  s9 += carry8;
  s8 -= int64_lshift21(carry8);
  carry10 = (s10 + (1 << 20)) >> 21;
  s11 += carry10;
  s10 -= int64_lshift21(carry10);
  carry12 = (s12 + (1 << 20)) >> 21;
  s13 += carry12;
  s12 -= int64_lshift21(carry12);
  carry14 = (s14 + (1 << 20)) >> 21;
  s15 += carry14;
  s14 -= int64_lshift21(carry14);
  carry16 = (s16 + (1 << 20)) >> 21;
  s17 += carry16;
  s16 -= int64_lshift21(carry16);

  carry7 = (s7 + (1 << 20)) >> 21;
  s8 += carry7;
  s7 -= int64_lshift21(carry7);
  carry9 = (s9 + (1 << 20)) >> 21;
  s10 += carry9;
  s9 -= int64_lshift21(carry9);
  carry11 = (s11 + (1 << 20)) >> 21;
  s12 += carry11;
  s11 -= int64_lshift21(carry11);
  carry13 = (s13 + (1 << 20)) >> 21;
  s14 += carry13;
  s13 -= int64_lshift21(carry13);
  carry15 = (s15 + (1 << 20)) >> 21;
  s16 += carry15;
  s15 -= int64_lshift21(carry15);

  // Fold the next band of high limbs, s17..s12, down likewise.
  s5 += s17 * 666643;
  s6 += s17 * 470296;
  s7 += s17 * 654183;
  s8 -= s17 * 997805;
  s9 += s17 * 136657;
  s10 -= s17 * 683901;
  s17 = 0;

  s4 += s16 * 666643;
  s5 += s16 * 470296;
  s6 += s16 * 654183;
  s7 -= s16 * 997805;
  s8 += s16 * 136657;
  s9 -= s16 * 683901;
  s16 = 0;

  s3 += s15 * 666643;
  s4 += s15 * 470296;
  s5 += s15 * 654183;
  s6 -= s15 * 997805;
  s7 += s15 * 136657;
  s8 -= s15 * 683901;
  s15 = 0;

  s2 += s14 * 666643;
  s3 += s14 * 470296;
  s4 += s14 * 654183;
  s5 -= s14 * 997805;
  s6 += s14 * 136657;
  s7 -= s14 * 683901;
  s14 = 0;

  s1 += s13 * 666643;
  s2 += s13 * 470296;
  s3 += s13 * 654183;
  s4 -= s13 * 997805;
  s5 += s13 * 136657;
  s6 -= s13 * 683901;
  s13 = 0;

  s0 += s12 * 666643;
  s1 += s12 * 470296;
  s2 += s12 * 654183;
  s3 -= s12 * 997805;
  s4 += s12 * 136657;
  s5 -= s12 * 683901;
  s12 = 0;

  // Centered carry pass over the low 12 limbs; carry11 regenerates s12.
  carry0 = (s0 + (1 << 20)) >> 21;
  s1 += carry0;
  s0 -= int64_lshift21(carry0);
  carry2 = (s2 + (1 << 20)) >> 21;
  s3 += carry2;
  s2 -= int64_lshift21(carry2);
  carry4 = (s4 + (1 << 20)) >> 21;
  s5 += carry4;
  s4 -= int64_lshift21(carry4);
  carry6 = (s6 + (1 << 20)) >> 21;
  s7 += carry6;
  s6 -= int64_lshift21(carry6);
  carry8 = (s8 + (1 << 20)) >> 21;
  s9 += carry8;
  s8 -= int64_lshift21(carry8);
  carry10 = (s10 + (1 << 20)) >> 21;
  s11 += carry10;
  s10 -= int64_lshift21(carry10);

  carry1 = (s1 + (1 << 20)) >> 21;
  s2 += carry1;
  s1 -= int64_lshift21(carry1);
  carry3 = (s3 + (1 << 20)) >> 21;
  s4 += carry3;
  s3 -= int64_lshift21(carry3);
  carry5 = (s5 + (1 << 20)) >> 21;
  s6 += carry5;
  s5 -= int64_lshift21(carry5);
  carry7 = (s7 + (1 << 20)) >> 21;
  s8 += carry7;
  s7 -= int64_lshift21(carry7);
  carry9 = (s9 + (1 << 20)) >> 21;
  s10 += carry9;
  s9 -= int64_lshift21(carry9);
  carry11 = (s11 + (1 << 20)) >> 21;
  s12 += carry11;
  s11 -= int64_lshift21(carry11);

  // Fold the regenerated s12 back in, then run a full sequential carry
  // chain (floor division this time, no rounding bias).
  s0 += s12 * 666643;
  s1 += s12 * 470296;
  s2 += s12 * 654183;
  s3 -= s12 * 997805;
  s4 += s12 * 136657;
  s5 -= s12 * 683901;
  s12 = 0;

  carry0 = s0 >> 21;
  s1 += carry0;
  s0 -= int64_lshift21(carry0);
  carry1 = s1 >> 21;
  s2 += carry1;
  s1 -= int64_lshift21(carry1);
  carry2 = s2 >> 21;
  s3 += carry2;
  s2 -= int64_lshift21(carry2);
  carry3 = s3 >> 21;
  s4 += carry3;
  s3 -= int64_lshift21(carry3);
  carry4 = s4 >> 21;
  s5 += carry4;
  s4 -= int64_lshift21(carry4);
  carry5 = s5 >> 21;
  s6 += carry5;
  s5 -= int64_lshift21(carry5);
  carry6 = s6 >> 21;
  s7 += carry6;
  s6 -= int64_lshift21(carry6);
  carry7 = s7 >> 21;
  s8 += carry7;
  s7 -= int64_lshift21(carry7);
  carry8 = s8 >> 21;
  s9 += carry8;
  s8 -= int64_lshift21(carry8);
  carry9 = s9 >> 21;
  s10 += carry9;
  s9 -= int64_lshift21(carry9);
  carry10 = s10 >> 21;
  s11 += carry10;
  s10 -= int64_lshift21(carry10);
  carry11 = s11 >> 21;
  s12 += carry11;
  s11 -= int64_lshift21(carry11);

  // One final fold of s12 and carry chain leaves every limb in [0, 2^21)
  // and the value fully reduced mod l.
  s0 += s12 * 666643;
  s1 += s12 * 470296;
  s2 += s12 * 654183;
  s3 -= s12 * 997805;
  s4 += s12 * 136657;
  s5 -= s12 * 683901;
  s12 = 0;

  carry0 = s0 >> 21;
  s1 += carry0;
  s0 -= int64_lshift21(carry0);
  carry1 = s1 >> 21;
  s2 += carry1;
  s1 -= int64_lshift21(carry1);
  carry2 = s2 >> 21;
  s3 += carry2;
  s2 -= int64_lshift21(carry2);
  carry3 = s3 >> 21;
  s4 += carry3;
  s3 -= int64_lshift21(carry3);
  carry4 = s4 >> 21;
  s5 += carry4;
  s4 -= int64_lshift21(carry4);
  carry5 = s5 >> 21;
  s6 += carry5;
  s5 -= int64_lshift21(carry5);
  carry6 = s6 >> 21;
  s7 += carry6;
  s6 -= int64_lshift21(carry6);
  carry7 = s7 >> 21;
  s8 += carry7;
  s7 -= int64_lshift21(carry7);
  carry8 = s8 >> 21;
  s9 += carry8;
  s8 -= int64_lshift21(carry8);
  carry9 = s9 >> 21;
  s10 += carry9;
  s9 -= int64_lshift21(carry9);
  carry10 = s10 >> 21;
  s11 += carry10;
  s10 -= int64_lshift21(carry10);

  // Re-encode the 12 remaining 21-bit limbs as 32 little-endian bytes.
  s[0] = s0 >> 0;
  s[1] = s0 >> 8;
  s[2] = (s0 >> 16) | (s1 << 5);
  s[3] = s1 >> 3;
  s[4] = s1 >> 11;
  s[5] = (s1 >> 19) | (s2 << 2);
  s[6] = s2 >> 6;
  s[7] = (s2 >> 14) | (s3 << 7);
  s[8] = s3 >> 1;
  s[9] = s3 >> 9;
  s[10] = (s3 >> 17) | (s4 << 4);
  s[11] = s4 >> 4;
  s[12] = s4 >> 12;
  s[13] = (s4 >> 20) | (s5 << 1);
  s[14] = s5 >> 7;
  s[15] = (s5 >> 15) | (s6 << 6);
  s[16] = s6 >> 2;
  s[17] = s6 >> 10;
  s[18] = (s6 >> 18) | (s7 << 3);
  s[19] = s7 >> 5;
  s[20] = s7 >> 13;
  s[21] = s8 >> 0;
  s[22] = s8 >> 8;
  s[23] = (s8 >> 16) | (s9 << 5);
  s[24] = s9 >> 3;
  s[25] = s9 >> 11;
  s[26] = (s9 >> 19) | (s10 << 2);
  s[27] = s10 >> 6;
  s[28] = (s10 >> 14) | (s11 << 7);
  s[29] = s11 >> 1;
  s[30] = s11 >> 9;
  s[31] = s11 >> 17;
}
1788 
1789 
// x25519_scalar_mult_generic_masked computes the X25519 function: it writes
// to |out| the u-coordinate of |scalar_masked| * P, where P is the point
// whose u-coordinate is |point|. |scalar_masked| is presumably already
// masked (clamped) by the caller — TODO(review) confirm against callers.
// Implemented as a constant-time Montgomery ladder over bits 254..0.
void x25519_scalar_mult_generic_masked(uint8_t out[32],
                                           const uint8_t scalar_masked[32],
                                           const uint8_t point[32]) {
  fe x1, x2, z2, x3, z3, tmp0, tmp1;
  fe_loose x2l, z2l, x3l, tmp0l, tmp1l;

  uint8_t e[32];
  OPENSSL_memcpy(e, scalar_masked, 32);
  // The following implementation was transcribed to Coq and proven to
  // correspond to unary scalar multiplication in affine coordinates given that
  // x1 != 0 is the x coordinate of some point on the curve. It was also checked
  // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2
  // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the
  // underlying field, so it applies to Curve25519 itself and the quadratic
  // twist of Curve25519. It was not proven in Coq that prime-field arithmetic
  // correctly simulates extension-field arithmetic on prime-field values.
  // The decoding of the byte array representation of e was not considered.
  // Specification of Montgomery curves in affine coordinates:
  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
  // Proof that these form a group that is isomorphic to a Weierstrass curve:
  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
  // Coq transcription and correctness proof of the loop (where scalarbits=255):
  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
  // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0
  fe_frombytes(&x1, point);
  fe_1(&x2);
  fe_0(&z2);
  fe_copy(&x3, &x1);
  fe_1(&z3);

  unsigned swap = 0;
  int pos;
  for (pos = 254; pos >= 0; --pos) {
    // loop invariant as of right before the test, for the case where x1 != 0:
    //   pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero
    //   let r := e >> (pos+1) in the following equalities of projective points:
    //   to_xz (r*P)     === if swap then (x3, z3) else (x2, z2)
    //   to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
    //   x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P)
    // Conditional swap keyed on the current scalar bit; fe_cswap is
    // constant-time, so the access pattern does not depend on the scalar.
    unsigned b = 1 & (e[pos / 8] >> (pos & 7));
    swap ^= b;
    fe_cswap(&x2, &x3, swap);
    fe_cswap(&z2, &z3, swap);
    swap = b;
    // Coq transcription of ladderstep formula (called from transcribed loop):
    // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
    // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
    // x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
    // x1  = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
    fe_sub(&tmp0l, &x3, &z3);
    fe_sub(&tmp1l, &x2, &z2);
    fe_add(&x2l, &x2, &z2);
    fe_add(&z2l, &x3, &z3);
    fe_mul_tll(&z3, &tmp0l, &x2l);
    fe_mul_tll(&z2, &z2l, &tmp1l);
    fe_sq_tl(&tmp0, &tmp1l);
    fe_sq_tl(&tmp1, &x2l);
    fe_add(&x3l, &z3, &z2);
    fe_sub(&z2l, &z3, &z2);
    fe_mul_ttt(&x2, &tmp1, &tmp0);
    fe_sub(&tmp1l, &tmp1, &tmp0);
    fe_sq_tl(&z2, &z2l);
    fe_mul121666(&z3, &tmp1l);
    fe_sq_tl(&x3, &x3l);
    fe_add(&tmp0l, &tmp0, &z3);
    fe_mul_ttt(&z3, &x1, &z2);
    fe_mul_tll(&z2, &tmp1l, &tmp0l);
  }
  // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2)
  fe_cswap(&x2, &x3, swap);
  fe_cswap(&z2, &z3, swap);

  // Convert the projective result (x2 : z2) to affine: out = x2/z2.
  // Per the precondition above, fe_invert(0) = 0, so z2 = 0 yields out = 0.
  fe_invert(&z2, &z2);
  fe_mul_ttt(&x2, &x2, &z2);
  fe_tobytes(out, &x2);
}
1867 
// x25519_public_from_private_generic_masked writes to |out_public_value| the
// X25519 public value for |private_key_masked| (presumably already masked via
// x25519_sc_mask — confirm against callers). |use_adx| is forwarded to the
// base-point multiplication. The result is declassified for constant-time
// analysis, as public keys are public.
void x25519_public_from_private_generic_masked(uint8_t out_public_value[32],
                                               const uint8_t private_key_masked[32],
                                               int use_adx) {
  uint8_t scalar[32];
  OPENSSL_memcpy(scalar, private_key_masked, 32);

  // Compute scalar * B on the Edwards curve.
  ge_p3 point;
  x25519_ge_scalarmult_base(&point, scalar, use_adx);

  // Only the Montgomery u-coordinate is needed. The birational map is
  // u = (y+1)/(1-y); with y = Y/Z this becomes u = (Z+Y)/(Z-Y).
  fe_loose numerator, denominator;
  fe u;
  fe_sub(&denominator, &point.Z, &point.Y);
  fe_add(&numerator, &point.Z, &point.Y);
  fe_loose_invert(&u, &denominator);
  fe_mul_tlt(&u, &numerator, &u);
  fe_tobytes(out_public_value, &u);
  CONSTTIME_DECLASSIFY(out_public_value, 32);
}
1888 
// x25519_fe_invert is an exported wrapper over the file-local fe_invert,
// computing |out| from |z|.
void x25519_fe_invert(fe *out, const fe *z) {
  fe_invert(out, z);
}
1892 
// x25519_fe_isnegative is an exported wrapper over the file-local
// fe_isnegative, narrowing its result to uint8_t.
uint8_t x25519_fe_isnegative(const fe *f) {
  return (uint8_t)fe_isnegative(f);
}
1896 
// x25519_fe_mul_ttt is an exported wrapper over the file-local fe_mul_ttt,
// computing |h| from |f| and |g|.
void x25519_fe_mul_ttt(fe *h, const fe *f, const fe *g) {
  fe_mul_ttt(h, f, g);
}
1900 
// x25519_fe_neg replaces |f| with its negation. fe_neg produces an
// unreduced (loose) result, so fe_carry is used to write a reduced field
// element back into |f|.
void x25519_fe_neg(fe *f) {
  fe_loose negated;
  fe_neg(&negated, f);
  fe_carry(f, &negated);
}
1906 
// x25519_fe_tobytes is an exported wrapper over the file-local fe_tobytes,
// serializing |h| into the 32-byte buffer |s|.
void x25519_fe_tobytes(uint8_t s[32], const fe *h) {
  fe_tobytes(s, h);
}
1910 
// x25519_ge_double_scalarmult_vartime is an exported wrapper over the
// file-local ge_double_scalarmult_vartime. As the name indicates, it is
// variable-time and must only be used with public inputs.
void x25519_ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
                                             const ge_p3 *A, const uint8_t *b) {
  ge_double_scalarmult_vartime(r, a, A, b);
}
1915 
// x25519_sc_mask clamps the 32-byte little-endian scalar |a| in place, per
// RFC 7748: clear the low three bits, clear the top bit, and set bit 254.
void x25519_sc_mask(uint8_t a[32]) {
  a[0] &= 0xf8;                                // clear bits 0..2
  a[31] = (uint8_t)((a[31] & 0x7f) | 0x40);    // clear bit 255, set bit 254
}
1921 
// x25519_sc_muladd is an exported wrapper over the file-local sc_muladd:
// per sc_muladd's contract, it writes (a*b + c) mod l to |s|, where all
// operands are 32-byte little-endian scalars and
// l = 2^252 + 27742317777372353535851937790883648493.
void x25519_sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b,
                          const uint8_t *c) {
  sc_muladd(s, a, b, c);
}
1926