//! A simple big-integer type for slow path algorithms.
//!
//! This includes a minimal stack vector for use in big-integer arithmetic.

#![doc(hidden)]

#[cfg(feature = "alloc")]
use crate::heapvec::HeapVec;
use crate::num::Float;
#[cfg(not(feature = "alloc"))]
use crate::stackvec::StackVec;
#[cfg(not(feature = "compact"))]
use crate::table::{LARGE_POW5, LARGE_POW5_STEP};
use core::{cmp, ops, ptr};

/// Number of bits in a Bigint.
///
/// This needs to be at least the number of bits required to store
/// a Bigint, which is `log2(radix**digits)`.
/// ≅ 3600 for base-10, rounded-up.
pub const BIGINT_BITS: usize = 4000;

/// The number of limbs for the bigint.
pub const BIGINT_LIMBS: usize = BIGINT_BITS / LIMB_BITS;
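
// Illustrative sanity check (a sketch added for exposition, not part of the
// original module): the largest decimal significand stored here is roughly
// 10^1091, which needs about 1091 * log2(10) ≈ 3625 bits, comfortably under
// `BIGINT_BITS`. The fixed-point constant 33220 approximates log2(10) * 10^4.
#[cfg(test)]
#[test]
fn bigint_bits_sanity_sketch() {
    // ceil(1091 * log2(10)), computed with integer arithmetic only.
    let required_bits: usize = (1091 * 33220 + 9999) / 10000;
    assert!(required_bits <= BIGINT_BITS);
    assert_eq!(BIGINT_LIMBS, BIGINT_BITS / LIMB_BITS);
}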

#[cfg(feature = "alloc")]
pub type VecType = HeapVec;

#[cfg(not(feature = "alloc"))]
pub type VecType = StackVec;

/// Storage for a big integer type.
///
/// This is used for algorithms when we have a finite number of digits.
/// Specifically, it stores all the significant digits scaled to the
/// proper exponent, as an integral type, and then directly compares
/// these digits.
///
/// This requires us to store the number of significant bits, plus the
/// number of exponent bits (required) since we scale everything
/// to the same exponent.
#[derive(Clone, PartialEq, Eq)]
pub struct Bigint {
    /// Significant digits for the float, stored in a big integer in LE order.
    ///
    /// This is pretty much the same number of digits for any radix, since the
    /// significant digits balance out the zeros from the exponent:
    ///     1. Decimal is 1091 digits, 767 mantissa digits + 324 exponent zeros.
    ///     2. Base 6 is 1097 digits, or 680 mantissa digits + 417 exponent zeros.
    ///     3. Base 36 is 1086 digits, or 877 mantissa digits + 209 exponent zeros.
    ///
    /// However, the number of bytes required is larger for large radixes:
    /// for decimal, we need `log2(10**1091) ≅ 3600`, while for base 36
    /// we need `log2(36**1086) ≅ 5600`. Since we use uninitialized data,
    /// we avoid a major performance hit from the large buffer size.
    pub data: VecType,
}

#[allow(clippy::new_without_default)]
impl Bigint {
    /// Construct a bigint representing 0.
    #[inline(always)]
    pub fn new() -> Self {
        Self {
            data: VecType::new(),
        }
    }

    /// Construct a bigint from an integer.
    #[inline(always)]
    pub fn from_u64(value: u64) -> Self {
        Self {
            data: VecType::from_u64(value),
        }
    }

    /// Get the high 64 bits from the vector, and if any lower bits were truncated.
    #[inline(always)]
    pub fn hi64(&self) -> (u64, bool) {
        self.data.hi64()
    }

    /// Multiply and assign as if by exponentiation by a power.
    #[inline]
    pub fn pow(&mut self, base: u32, exp: u32) -> Option<()> {
        debug_assert!(base == 2 || base == 5 || base == 10);
        if base % 5 == 0 {
            pow(&mut self.data, exp)?;
        }
        if base % 2 == 0 {
            shl(&mut self.data, exp as usize)?;
        }
        Some(())
    }

    /// Calculate the bit-length of the big-integer.
    #[inline]
    pub fn bit_length(&self) -> u32 {
        bit_length(&self.data)
    }
}
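
// Illustrative usage sketch (added for exposition, not part of the original
// module): build a small value, scale it by a power of 10, and inspect the
// result via `bit_length` and `hi64`.
#[cfg(test)]
#[test]
fn bigint_usage_sketch() {
    let mut x = Bigint::from_u64(1);
    x.pow(10, 2).unwrap();
    // 100 has a 7-bit representation, and its high 64 bits are 100 shifted
    // so the most-significant bit is set, with no truncated bits.
    assert_eq!(x.bit_length(), 7);
    assert_eq!(x.hi64(), (100u64 << 57, false));
}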

impl ops::MulAssign<&Bigint> for Bigint {
    fn mul_assign(&mut self, rhs: &Bigint) {
        self.data *= &rhs.data;
    }
}

// REVERSE VIEW
// ------------

/// Reverse, immutable view of a sequence.
pub struct ReverseView<'a, T: 'a> {
    inner: &'a [T],
}

impl<'a, T> ops::Index<usize> for ReverseView<'a, T> {
    type Output = T;

    #[inline]
    fn index(&self, index: usize) -> &T {
        let len = self.inner.len();
        &(*self.inner)[len - index - 1]
    }
}

/// Create a reverse view of the vector for indexing.
#[inline]
pub fn rview(x: &[Limb]) -> ReverseView<Limb> {
    ReverseView {
        inner: x,
    }
}
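
// Quick sketch (added for exposition) of the indexing convention:
// `rview(x)[0]` is the most-significant limb of a little-endian buffer.
#[cfg(test)]
#[test]
fn rview_sketch() {
    let x: [Limb; 3] = [10, 20, 30];
    let rx = rview(&x);
    assert_eq!(rx[0], 30);
    assert_eq!(rx[2], 10);
}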

// COMPARE
// -------

/// Compare `x` to `y`, in little-endian order.
#[inline]
pub fn compare(x: &[Limb], y: &[Limb]) -> cmp::Ordering {
    match x.len().cmp(&y.len()) {
        cmp::Ordering::Equal => {
            let iter = x.iter().rev().zip(y.iter().rev());
            for (&xi, yi) in iter {
                match xi.cmp(yi) {
                    cmp::Ordering::Equal => (),
                    ord => return ord,
                }
            }
            // Equal case.
            cmp::Ordering::Equal
        },
        ord => ord,
    }
}
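
// Sketch (added for exposition): limbs are little-endian, so `[1, 2]`
// means `1 + 2 * 2^LIMB_BITS` and compares greater than `[2, 1]`.
#[cfg(test)]
#[test]
fn compare_sketch() {
    assert_eq!(compare(&[1, 2], &[2, 1]), cmp::Ordering::Greater);
    assert_eq!(compare(&[1], &[1, 1]), cmp::Ordering::Less);
    assert_eq!(compare(&[5, 6], &[5, 6]), cmp::Ordering::Equal);
}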

// NORMALIZE
// ---------

/// Normalize the integer, so any leading zero values are removed.
#[inline]
pub fn normalize(x: &mut VecType) {
    // We don't care if this wraps: the index is bounds-checked.
    while let Some(&value) = x.get(x.len().wrapping_sub(1)) {
        if value == 0 {
            unsafe { x.set_len(x.len() - 1) };
        } else {
            break;
        }
    }
}

/// Get if the big integer is normalized.
#[inline]
#[allow(clippy::match_like_matches_macro)]
pub fn is_normalized(x: &[Limb]) -> bool {
    // We don't care if this wraps: the index is bounds-checked.
    match x.get(x.len().wrapping_sub(1)) {
        Some(&0) => false,
        _ => true,
    }
}
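
// Sketch (added for exposition): a trailing zero limb is the only thing
// that makes a buffer non-normalized, and `normalize` strips it.
#[cfg(test)]
#[test]
fn normalize_sketch() {
    let mut x = VecType::from_u64(1);
    x.try_push(0).unwrap();
    assert!(!is_normalized(&x));
    normalize(&mut x);
    assert!(is_normalized(&x));
    assert_eq!(x.len(), 1);
}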

// FROM
// ----

/// Create a `VecType` from a `u64` value.
#[inline(always)]
#[allow(clippy::branches_sharing_code)]
pub fn from_u64(x: u64) -> VecType {
    let mut vec = VecType::new();
    debug_assert!(vec.capacity() >= 2);
    if LIMB_BITS == 32 {
        vec.try_push(x as Limb).unwrap();
        vec.try_push((x >> 32) as Limb).unwrap();
    } else {
        vec.try_push(x as Limb).unwrap();
    }
    vec.normalize();
    vec
}

// HI
// --

/// Check if any of the remaining bits are non-zero.
///
/// # Safety
///
/// Safe as long as `rindex <= x.len()`.
#[inline]
pub fn nonzero(x: &[Limb], rindex: usize) -> bool {
    debug_assert!(rindex <= x.len());

    let len = x.len();
    let slc = &x[..len - rindex];
    slc.iter().rev().any(|&x| x != 0)
}

// These return the high X bits and if the bits were truncated.

/// Shift 32-bit integer to high 64-bits.
#[inline]
pub fn u32_to_hi64_1(r0: u32) -> (u64, bool) {
    u64_to_hi64_1(r0 as u64)
}

/// Shift 2 32-bit integers to high 64-bits.
#[inline]
pub fn u32_to_hi64_2(r0: u32, r1: u32) -> (u64, bool) {
    let r0 = (r0 as u64) << 32;
    let r1 = r1 as u64;
    u64_to_hi64_1(r0 | r1)
}

/// Shift 3 32-bit integers to high 64-bits.
#[inline]
pub fn u32_to_hi64_3(r0: u32, r1: u32, r2: u32) -> (u64, bool) {
    let r0 = r0 as u64;
    let r1 = (r1 as u64) << 32;
    let r2 = r2 as u64;
    u64_to_hi64_2(r0, r1 | r2)
}

/// Shift 64-bit integer to high 64-bits.
#[inline]
pub fn u64_to_hi64_1(r0: u64) -> (u64, bool) {
    let ls = r0.leading_zeros();
    (r0 << ls, false)
}

/// Shift 2 64-bit integers to high 64-bits.
#[inline]
pub fn u64_to_hi64_2(r0: u64, r1: u64) -> (u64, bool) {
    let ls = r0.leading_zeros();
    let rs = 64 - ls;
    let v = match ls {
        0 => r0,
        _ => (r0 << ls) | (r1 >> rs),
    };
    let n = r1 << ls != 0;
    (v, n)
}

/// Extract the hi bits from the buffer.
macro_rules! hi {
    // # Safety
    //
    // Safe as long as the `stackvec.len() >= 1`.
    (@1 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        $fn($rview[0] as $t)
    }};

    // # Safety
    //
    // Safe as long as the `stackvec.len() >= 2`.
    (@2 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let r0 = $rview[0] as $t;
        let r1 = $rview[1] as $t;
        $fn(r0, r1)
    }};

    // # Safety
    //
    // Safe as long as the `stackvec.len() >= 2`.
    (@nonzero2 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let (v, n) = hi!(@2 $self, $rview, $t, $fn);
        (v, n || nonzero($self, 2))
    }};

    // # Safety
    //
    // Safe as long as the `stackvec.len() >= 3`.
    (@3 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let r0 = $rview[0] as $t;
        let r1 = $rview[1] as $t;
        let r2 = $rview[2] as $t;
        $fn(r0, r1, r2)
    }};

    // # Safety
    //
    // Safe as long as the `stackvec.len() >= 3`.
    (@nonzero3 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let (v, n) = hi!(@3 $self, $rview, $t, $fn);
        (v, n || nonzero($self, 3))
    }};
}

/// Get the high 64 bits from the vector.
#[inline(always)]
pub fn hi64(x: &[Limb]) -> (u64, bool) {
    let rslc = rview(x);
    // SAFETY: safe, since each match arm only reads limbs guaranteed by `x.len()`.
    match x.len() {
        0 => (0, false),
        1 if LIMB_BITS == 32 => hi!(@1 x, rslc, u32, u32_to_hi64_1),
        1 => hi!(@1 x, rslc, u64, u64_to_hi64_1),
        2 if LIMB_BITS == 32 => hi!(@2 x, rslc, u32, u32_to_hi64_2),
        2 => hi!(@2 x, rslc, u64, u64_to_hi64_2),
        _ if LIMB_BITS == 32 => hi!(@nonzero3 x, rslc, u32, u32_to_hi64_3),
        _ => hi!(@nonzero2 x, rslc, u64, u64_to_hi64_2),
    }
}
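
// Sketch (added for exposition): `hi64` packs the most-significant bits into
// a normalized 64-bit value, reporting whether any lower set bits were
// shifted out.
#[cfg(test)]
#[test]
fn hi64_sketch() {
    // 2^64 + 1: the trailing 1 is truncated out of the top 64 bits.
    let mut x = from_u64(1);
    shl(&mut x, 64).unwrap();
    small_add(&mut x, 1).unwrap();
    let (hi, truncated) = hi64(&x);
    assert_eq!(hi, 1u64 << 63);
    assert!(truncated);
}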

// POWERS
// ------

/// MulAssign by a power of 5.
///
/// Theoretically...
///
/// Use an exponentiation by squaring method, since it reduces the time
/// complexity of the multiplication to ~`O(log(n))` for the squaring,
/// and `O(n*m)` for the result. Since `m` is typically a lower-order
/// factor, this significantly reduces the number of multiplications
/// we need to do. Iteratively multiplying by small powers follows
/// the nth triangular number series, which scales as `O(p^2)`, but
/// where `p` is `n+m`. In short, it scales very poorly.
///
/// Practically....
///
/// Exponentiation by Squaring:
///     running 2 tests
///     test bigcomp_f32_lexical ... bench:       1,018 ns/iter (+/- 78)
///     test bigcomp_f64_lexical ... bench:       3,639 ns/iter (+/- 1,007)
///
/// Exponentiation by Iterative Small Powers:
///     running 2 tests
///     test bigcomp_f32_lexical ... bench:         518 ns/iter (+/- 31)
///     test bigcomp_f64_lexical ... bench:         583 ns/iter (+/- 47)
///
/// Exponentiation by Iterative Large Powers (of 2):
///     running 2 tests
///     test bigcomp_f32_lexical ... bench:         671 ns/iter (+/- 31)
///     test bigcomp_f64_lexical ... bench:       1,394 ns/iter (+/- 47)
///
/// The following benchmarks were run on `1 * 5^300`, using native `pow`,
/// a version with only small powers, and one with pre-computed powers
/// of `5^(3 * max_exp)`, rather than `5^(5 * max_exp)`.
///
/// However, using large powers is crucial for good performance for higher
/// powers.
///     pow/default             time:   [426.20 ns 427.96 ns 429.89 ns]
///     pow/small               time:   [2.9270 us 2.9411 us 2.9565 us]
///     pow/large:3             time:   [838.51 ns 842.21 ns 846.27 ns]
///
/// Even using worst-case scenarios, exponentiation by squaring is
/// significantly slower for our workloads. Just multiply by small powers,
/// in simple cases, and use precalculated large powers in other cases.
///
/// Furthermore, using sufficiently big large powers is also crucial for
/// performance. This is a tradeoff of binary size and performance, and
/// using a single value at ~`5^(5 * max_exp)` seems optimal.
pub fn pow(x: &mut VecType, mut exp: u32) -> Option<()> {
    // Minimize the number of iterations for large exponents: just
    // do a few steps with large powers.
    #[cfg(not(feature = "compact"))]
    {
        while exp >= LARGE_POW5_STEP {
            large_mul(x, &LARGE_POW5)?;
            exp -= LARGE_POW5_STEP;
        }
    }

    // Now use our pre-computed small powers iteratively.
    // This is calculated as `⌊log(2^BITS - 1, 5)⌋`.
    let small_step = if LIMB_BITS == 32 {
        13
    } else {
        27
    };
    let max_native = (5 as Limb).pow(small_step);
    while exp >= small_step {
        small_mul(x, max_native)?;
        exp -= small_step;
    }
    if exp != 0 {
        // SAFETY: safe, since `exp < small_step`.
        let small_power = unsafe { f64::int_pow_fast_path(exp as usize, 5) };
        small_mul(x, small_power as Limb)?;
    }
    Some(())
}
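
// Sketch (added for exposition): scaling by `5^exp` and then left-shifting
// by `exp` bits is how `Bigint::pow` multiplies by `10^exp`.
#[cfg(test)]
#[test]
fn pow_sketch() {
    let mut x = from_u64(3);
    pow(&mut x, 4).unwrap();
    shl(&mut x, 4).unwrap();
    // 3 * 5^4 * 2^4 == 3 * 10^4 == 30000.
    assert_eq!(compare(&x, &from_u64(30000)), cmp::Ordering::Equal);
}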

// SCALAR
// ------

/// Add two small integers and return the resulting value and if overflow happens.
#[inline(always)]
pub fn scalar_add(x: Limb, y: Limb) -> (Limb, bool) {
    x.overflowing_add(y)
}

/// Multiply two small integers (with carry), returning the overflow contribution.
///
/// Returns the (low, high) components.
#[inline(always)]
pub fn scalar_mul(x: Limb, y: Limb, carry: Limb) -> (Limb, Limb) {
    // Cannot overflow, as long as wide is 2x as wide. This is because
    // the following is always true:
    // `Wide::MAX - (Narrow::MAX * Narrow::MAX) >= Narrow::MAX`
    let z: Wide = (x as Wide) * (y as Wide) + (carry as Wide);
    (z as Limb, (z >> LIMB_BITS) as Limb)
}
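
// Sketch (added for exposition) of the worst case the comment above refers
// to: `Limb::MAX * Limb::MAX + Limb::MAX` still fits in `Wide`, and splits
// cleanly into (low, high) halves.
#[cfg(test)]
#[test]
fn scalar_mul_sketch() {
    let (lo, hi) = scalar_mul(Limb::MAX, Limb::MAX, Limb::MAX);
    // MAX * MAX + MAX == MAX * (MAX + 1), i.e. low half 0, high half MAX.
    assert_eq!(lo, 0);
    assert_eq!(hi, Limb::MAX);
}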

// SMALL
// -----

/// Add small integer to bigint starting from offset.
#[inline]
pub fn small_add_from(x: &mut VecType, y: Limb, start: usize) -> Option<()> {
    let mut index = start;
    let mut carry = y;
    while carry != 0 && index < x.len() {
        let result = scalar_add(x[index], carry);
        x[index] = result.0;
        carry = result.1 as Limb;
        index += 1;
    }
    // If we carried past all the elements, add to the end of the buffer.
    if carry != 0 {
        x.try_push(carry)?;
    }
    Some(())
}

/// Add small integer to bigint.
#[inline(always)]
pub fn small_add(x: &mut VecType, y: Limb) -> Option<()> {
    small_add_from(x, y, 0)
}

/// Multiply bigint by small integer.
#[inline]
pub fn small_mul(x: &mut VecType, y: Limb) -> Option<()> {
    let mut carry = 0;
    for xi in x.iter_mut() {
        let result = scalar_mul(*xi, y, carry);
        *xi = result.0;
        carry = result.1;
    }
    // If we carried past all the elements, add to the end of the buffer.
    if carry != 0 {
        x.try_push(carry)?;
    }
    Some(())
}
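
// Sketch (added for exposition): when a small addition or multiplication
// carries past the last limb, the carry becomes a new most-significant limb.
#[cfg(test)]
#[test]
fn small_ops_sketch() {
    let mut x = from_u64(Limb::MAX as u64);
    small_add(&mut x, 1).unwrap();
    // Limb::MAX + 1 overflows the low limb and pushes a carry limb of 1.
    assert_eq!(hi64(&x), (1u64 << 63, false));

    let mut y = from_u64(3);
    small_mul(&mut y, 7).unwrap();
    assert_eq!(compare(&y, &from_u64(21)), cmp::Ordering::Equal);
}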

// LARGE
// -----

/// Add bigint to bigint starting from offset.
pub fn large_add_from(x: &mut VecType, y: &[Limb], start: usize) -> Option<()> {
    // The effective x buffer is from `start..x.len()`, so we need to treat
    // that as the current range. If the effective y buffer is longer, we
    // need to resize x to `y.len() + start`.
    if y.len() > x.len().saturating_sub(start) {
        // If we can't extend the buffer, return `None` here: this avoids
        // any out-of-bounds access afterwards.
        x.try_resize(y.len() + start, 0)?;
    }

    // Iteratively add elements from y to x.
    let mut carry = false;
    for (index, &yi) in y.iter().enumerate() {
        // The `try_resize` above guarantees this index is within bounds.
        let xi = x.get_mut(start + index).unwrap();

        // Only one of the two ops can overflow, since we added at max
        // Limb::max_value() + Limb::max_value(). Add the previous carry,
        // and store the current carry for the next.
        let result = scalar_add(*xi, yi);
        *xi = result.0;
        let mut tmp = result.1;
        if carry {
            let result = scalar_add(*xi, 1);
            *xi = result.0;
            tmp |= result.1;
        }
        carry = tmp;
    }

    // Handle overflow.
    if carry {
        small_add_from(x, 1, y.len() + start)?;
    }
    Some(())
}

/// Add bigint to bigint.
#[inline(always)]
pub fn large_add(x: &mut VecType, y: &[Limb]) -> Option<()> {
    large_add_from(x, y, 0)
}
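
// Sketch (added for exposition): adding `1` to `Limb::MAX` as a bigint
// carries into a brand-new limb, analogous to the small-add case above.
#[cfg(test)]
#[test]
fn large_add_sketch() {
    let mut x = from_u64(Limb::MAX as u64);
    let y = from_u64(1);
    large_add(&mut x, &y).unwrap();
    assert_eq!(compare(&x, &[0, 1]), cmp::Ordering::Equal);
}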

/// Grade-school multiplication algorithm.
///
/// Slow, naive algorithm, using limb-bit bases and just shifting left for
/// each iteration. This could be optimized with numerous other algorithms,
/// but it's extremely simple, and works in O(n*m) time, which is fine
/// by me. Each iteration, of which there are `m` iterations, requires
/// `n` multiplications, and `n` additions, or grade-school multiplication.
///
/// Don't use Karatsuba multiplication, since our implementation seems to
/// be slower in practice, which is likely just due to the small sizes
/// we deal with here. For example, running on the following data:
///
/// ```text
/// const SMALL_X: &[u32] = &[
///     766857581, 3588187092, 1583923090, 2204542082, 1564708913, 2695310100, 3676050286,
///     1022770393, 468044626, 446028186
/// ];
/// const SMALL_Y: &[u32] = &[
///     3945492125, 3250752032, 1282554898, 1708742809, 1131807209, 3171663979, 1353276095,
///     1678845844, 2373924447, 3640713171
/// ];
/// const LARGE_X: &[u32] = &[
///     3647536243, 2836434412, 2154401029, 1297917894, 137240595, 790694805, 2260404854,
///     3872698172, 690585094, 99641546, 3510774932, 1672049983, 2313458559, 2017623719,
///     638180197, 1140936565, 1787190494, 1797420655, 14113450, 2350476485, 3052941684,
///     1993594787, 2901001571, 4156930025, 1248016552, 848099908, 2660577483, 4030871206,
///     692169593, 2835966319, 1781364505, 4266390061, 1813581655, 4210899844, 2137005290,
///     2346701569, 3715571980, 3386325356, 1251725092, 2267270902, 474686922, 2712200426,
///     197581715, 3087636290, 1379224439, 1258285015, 3230794403, 2759309199, 1494932094,
///     326310242
/// ];
/// const LARGE_Y: &[u32] = &[
///     1574249566, 868970575, 76716509, 3198027972, 1541766986, 1095120699, 3891610505,
///     2322545818, 1677345138, 865101357, 2650232883, 2831881215, 3985005565, 2294283760,
///     3468161605, 393539559, 3665153349, 1494067812, 106699483, 2596454134, 797235106,
///     705031740, 1209732933, 2732145769, 4122429072, 141002534, 790195010, 4014829800,
///     1303930792, 3649568494, 308065964, 1233648836, 2807326116, 79326486, 1262500691,
///     621809229, 2258109428, 3819258501, 171115668, 1139491184, 2979680603, 1333372297,
///     1657496603, 2790845317, 4090236532, 4220374789, 601876604, 1828177209, 2372228171,
///     2247372529
/// ];
/// ```
///
/// We get the following results:
///
/// ```text
/// mul/small:long          time:   [220.23 ns 221.47 ns 222.81 ns]
/// Found 4 outliers among 100 measurements (4.00%)
///   2 (2.00%) high mild
///   2 (2.00%) high severe
/// mul/small:karatsuba     time:   [233.88 ns 234.63 ns 235.44 ns]
/// Found 11 outliers among 100 measurements (11.00%)
///   8 (8.00%) high mild
///   3 (3.00%) high severe
/// mul/large:long          time:   [1.9365 us 1.9455 us 1.9558 us]
/// Found 12 outliers among 100 measurements (12.00%)
///   7 (7.00%) high mild
///   5 (5.00%) high severe
/// mul/large:karatsuba     time:   [4.4250 us 4.4515 us 4.4812 us]
/// ```
///
/// In short, Karatsuba multiplication is never worthwhile for our use-case.
pub fn long_mul(x: &[Limb], y: &[Limb]) -> Option<VecType> {
    // Using the immutable value, multiply by all the scalars in y, using
    // the algorithm defined above. Use a single buffer to avoid
    // frequent reallocations. Handle the first case to avoid a redundant
    // addition, since we know y.len() >= 1.
    let mut z = VecType::try_from(x)?;
    if !y.is_empty() {
        let y0 = y[0];
        small_mul(&mut z, y0)?;

        for (index, &yi) in y.iter().enumerate().skip(1) {
            if yi != 0 {
                let mut zi = VecType::try_from(x)?;
                small_mul(&mut zi, yi)?;
                large_add_from(&mut z, &zi, index)?;
            }
        }
    }

    z.normalize();
    Some(z)
}
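
// Sketch (added for exposition): (B - 1)^2 == B^2 - 2B + 1, so squaring a
// single max-valued limb yields the two limbs `[1, B - 2]` in LE order.
#[cfg(test)]
#[test]
fn long_mul_sketch() {
    let z = long_mul(&[Limb::MAX], &[Limb::MAX]).unwrap();
    assert_eq!(&z[..], &[1, Limb::MAX - 1][..]);
}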

/// Multiply bigint by bigint using grade-school multiplication algorithm.
#[inline(always)]
pub fn large_mul(x: &mut VecType, y: &[Limb]) -> Option<()> {
    // Karatsuba multiplication never makes sense, so just use grade school
    // multiplication.
    if y.len() == 1 {
        // SAFETY: safe since `y.len() == 1`.
        small_mul(x, y[0])?;
    } else {
        *x = long_mul(y, x)?;
    }
    Some(())
}

// SHIFT
// -----

/// Shift-left `n` bits inside a buffer.
#[inline]
pub fn shl_bits(x: &mut VecType, n: usize) -> Option<()> {
    debug_assert!(n != 0);

    // Internally, for each item, we shift left by n, and add the previous
    // right shifted limb-bits.
    // For example, we transform (for u8) shifted left 2, to:
    //      b10100100 b01000010
    //      b10 b10010001 b00001000
    debug_assert!(n < LIMB_BITS);
    let rshift = LIMB_BITS - n;
    let lshift = n;
    let mut prev: Limb = 0;
    for xi in x.iter_mut() {
        let tmp = *xi;
        *xi <<= lshift;
        *xi |= prev >> rshift;
        prev = tmp;
    }

    // Always push the carry, even if it creates a non-normal result.
    let carry = prev >> rshift;
    if carry != 0 {
        x.try_push(carry)?;
    }

    Some(())
}

/// Shift-left `n` limbs inside a buffer.
#[inline]
pub fn shl_limbs(x: &mut VecType, n: usize) -> Option<()> {
    debug_assert!(n != 0);
    if n + x.len() > x.capacity() {
        None
    } else if !x.is_empty() {
        let len = n + x.len();
        // SAFETY: safe since x is not empty, and `x.len() + n <= x.capacity()`.
        unsafe {
            // Move the elements.
            let src = x.as_ptr();
            let dst = x.as_mut_ptr().add(n);
            ptr::copy(src, dst, x.len());
            // Write our 0s.
            ptr::write_bytes(x.as_mut_ptr(), 0, n);
            x.set_len(len);
        }
        Some(())
    } else {
        Some(())
    }
}

/// Shift-left buffer by n bits.
#[inline]
pub fn shl(x: &mut VecType, n: usize) -> Option<()> {
    let rem = n % LIMB_BITS;
    let div = n / LIMB_BITS;
    if rem != 0 {
        shl_bits(x, rem)?;
    }
    if div != 0 {
        shl_limbs(x, div)?;
    }
    Some(())
}
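
// Sketch (added for exposition): a shift of `LIMB_BITS + 1` splits into a
// 1-bit `shl_bits` plus a whole-limb `shl_limbs`, growing the value to
// `2 * 2^LIMB_BITS`.
#[cfg(test)]
#[test]
fn shl_sketch() {
    let mut x = from_u64(1);
    shl(&mut x, LIMB_BITS + 1).unwrap();
    assert_eq!(bit_length(&x), LIMB_BITS as u32 + 2);
    assert_eq!(compare(&x, &[0, 2]), cmp::Ordering::Equal);
}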

/// Get number of leading zero bits in the storage.
#[inline]
pub fn leading_zeros(x: &[Limb]) -> u32 {
    let length = x.len();
    // wrapping_sub is fine, since it'll just return None.
    if let Some(&value) = x.get(length.wrapping_sub(1)) {
        value.leading_zeros()
    } else {
        0
    }
}

/// Calculate the bit-length of the big-integer.
#[inline]
pub fn bit_length(x: &[Limb]) -> u32 {
    let nlz = leading_zeros(x);
    LIMB_BITS as u32 * x.len() as u32 - nlz
}

// LIMB
// ----

//  Type for a single limb of the big integer.
//
//  A limb is analogous to a digit in base10, except, it stores 32-bit
//  or 64-bit numbers instead. We want types where 64-bit multiplication
//  is well-supported by the architecture, rather than emulated in 3
//  instructions. The quickest way to check this support is using a
//  cross-compiler for numerous architectures, along with the following
//  source file and command:
//
//  Compile with `gcc main.c -c -S -O3 -masm=intel`
//
//  And the source code is:
//  ```text
//  #include <stdint.h>
//
//  struct i128 {
//      uint64_t hi;
//      uint64_t lo;
//  };
//
//  // Type your code here, or load an example.
//  struct i128 square(uint64_t x, uint64_t y) {
//      __int128 prod = (__int128)x * (__int128)y;
//      struct i128 z;
//      z.hi = (uint64_t)(prod >> 64);
//      z.lo = (uint64_t)prod;
//      return z;
//  }
//  ```
//
//  If the result contains `call __multi3`, then the multiplication
//  is emulated by the compiler. Otherwise, it's natively supported.
//
//  This covers all known 64-bit platforms supported by Rust.
//      https://forge.rust-lang.org/platform-support.html
//
//  # Supported
//
//  Platforms where native 128-bit multiplication is explicitly supported:
//      - x86_64 (Supported via `MUL`).
//      - mips64 (Supported via `DMULTU`, from which `HI` and `LO` can be read).
//      - s390x (Supported via `MLGR`).
//
//  # Efficient
//
//  Platforms where native 64-bit multiplication is supported and
//  you can extract hi-lo for 64-bit multiplications.
//      - aarch64 (Requires `UMULH` and `MUL` to capture high and low bits).
//      - powerpc64 (Requires `MULHDU` and `MULLD` to capture high and low bits).
//      - riscv64 (Requires `MUL` and `MULH` to capture high and low bits).
//
//  # Unsupported
//
//  Platforms where native 128-bit multiplication is not supported,
//  requiring software emulation.
//      sparc64 (`UMUL` only supports double-word arguments).
//      sparcv9 (Same as sparc64).
//
//  These tests are run via `xcross`, my own library for C cross-compiling,
//  which supports numerous targets (far in excess of Rust's tier 1 support,
//  or rust-embedded/cross's list). xcross may be found here:
//      https://github.com/Alexhuszagh/xcross
//
//  To compile for the given target, run:
//      `xcross gcc main.c -c -S -O3 --target $target`
//
//  All 32-bit architectures inherently do not have support. That means
//  we can essentially look for 64-bit architectures that are not SPARC.

#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))]
pub type Limb = u64;
#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))]
pub type Wide = u128;
#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))]
pub const LIMB_BITS: usize = 64;

#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))]
pub type Limb = u32;
#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))]
pub type Wide = u64;
#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))]
pub const LIMB_BITS: usize = 32;
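
// Rust analogue (a sketch, added for exposition) of the C probe above: the
// widening 64x64 -> 128-bit multiply is exactly what `scalar_mul` relies on
// when `Limb = u64` and `Wide = u128`.
#[cfg(test)]
#[test]
fn wide_mul_sketch() {
    fn mul_wide(x: u64, y: u64) -> (u64, u64) {
        let prod = (x as u128) * (y as u128);
        ((prod >> 64) as u64, prod as u64)
    }
    // (2^64 - 1)^2 == 2^128 - 2^65 + 1: high half 2^64 - 2, low half 1.
    assert_eq!(mul_wide(u64::MAX, u64::MAX), (u64::MAX - 1, 1));
}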