1 // Copyright (c) 2019, Google Inc.
2 // Portions Copyright 2020 Brian Smith.
3 //
4 // Permission to use, copy, modify, and/or distribute this software for any
5 // purpose with or without fee is hereby granted, provided that the above
6 // copyright notice and this permission notice appear in all copies.
7 //
8 // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
11 // SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
13 // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
14 // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15
16 // This file is based on BoringSSL's gcm_nohw.c.
17
18 // This file contains a constant-time implementation of GHASH based on the notes
19 // in https://bearssl.org/constanttime.html#ghash-for-gcm and the reduction
20 // algorithm described in
21 // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf.
22 //
23 // Unlike the BearSSL notes, we use u128 in the 64-bit implementation.
24
25 use super::{Block, Xi, BLOCK_LEN};
26 use crate::polyfill::ChunksFixed;
27
#[cfg(target_pointer_width = "64")]
fn gcm_mul64_nohw(a: u64, b: u64) -> (u64, u64) {
    // Carry-less (GF(2)[x]) 64x64 -> 128-bit multiplication of |a| and |b|,
    // returned as (low half, high half), built from ordinary integer
    // multiplications only so the running time is independent of the
    // (secret) inputs. See
    // https://bearssl.org/constanttime.html#ghash-for-gcm.

    #[inline(always)]
    fn low_half(v: u128) -> u64 {
        v as u64
    }

    #[inline(always)]
    fn high_half(v: u128) -> u64 {
        (v >> 64) as u64
    }

    #[inline(always)]
    fn widening_mul(x: u64, y: u64) -> u128 {
        u128::from(x) * u128::from(y)
    }

    // Split each operand into four sparse limbs, one retained term every four
    // bits, so that integer-multiply carries cannot corrupt a retained bit
    // position. One term every four bits makes the largest term 64/4 = 16,
    // which barely overflows into the next term; masking off the bottom four
    // bits of |a| caps it at 60/4 = 15 instead. Those four low bits of |a|
    // are applied separately below. (One term every five bits would avoid
    // the overflow but costs 25 multiplications instead of 16.)
    let a_limbs = [
        a & 0x1111111111111110,
        a & 0x2222222222222220,
        a & 0x4444444444444440,
        a & 0x8888888888888880,
    ];
    let b_limbs = [
        b & 0x1111111111111111,
        b & 0x2222222222222222,
        b & 0x4444444444444444,
        b & 0x8888888888888888,
    ];

    // Bits of a_limbs[i] sit at positions ≡ i (mod 4) and bits of b_limbs[j]
    // at positions ≡ j (mod 4), so their product carries valid bits only at
    // positions ≡ i + j (mod 4). Accumulate each residue class with XOR
    // (GF(2) addition).
    let mut acc = [0u128; 4];
    for i in 0..4 {
        for j in 0..4 {
            acc[(i + j) & 3] ^= widening_mul(a_limbs[i], b_limbs[j]);
        }
    }

    // Multiply the bottom four bits of |a| with |b|, using an all-ones or
    // all-zeros mask per bit so there is no data-dependent branch.
    let mut extra = 0u128;
    for shift in 0..4 {
        let bit_mask = 0u64.wrapping_sub((a >> shift) & 1);
        extra ^= u128::from(bit_mask & b) << shift;
    }

    // Keep only the valid bit positions of each residue class and combine.
    const MASKS: [u64; 4] = [
        0x1111111111111111,
        0x2222222222222222,
        0x4444444444444444,
        0x8888888888888888,
    ];
    let mut lo = low_half(extra);
    let mut hi = high_half(extra);
    for k in 0..4 {
        lo ^= low_half(acc[k]) & MASKS[k];
        hi ^= high_half(acc[k]) & MASKS[k];
    }
    (lo, hi)
}
87
#[cfg(not(target_pointer_width = "64"))]
fn gcm_mul32_nohw(a: u32, b: u32) -> u64 {
    // Carry-less (GF(2)[x]) 32x32 -> 64-bit multiplication of |a| and |b|,
    // built from ordinary integer multiplications only so the running time
    // does not depend on the (secret) inputs. See
    // https://bearssl.org/constanttime.html#ghash-for-gcm.
    #[inline(always)]
    fn mul(a: u32, b: u32) -> u64 {
        u64::from(a) * u64::from(b)
    }

    // Split each operand into four sparse limbs so that integer-multiply
    // carries cannot corrupt a retained bit position.
    // One term every four bits means the largest term is 32/4 = 8, which does not
    // overflow into the next term.
    let a0 = a & 0x11111111;
    let a1 = a & 0x22222222;
    let a2 = a & 0x44444444;
    let a3 = a & 0x88888888;

    let b0 = b & 0x11111111;
    let b1 = b & 0x22222222;
    let b2 = b & 0x44444444;
    let b3 = b & 0x88888888;

    // |c_k| accumulates (via XOR, i.e. GF(2) addition) the products whose
    // valid bits land at positions congruent to k (mod 4): a_i * b_j only
    // contributes valid bits at positions congruent to i + j (mod 4).
    let c0 = mul(a0, b0) ^ mul(a1, b3) ^ mul(a2, b2) ^ mul(a3, b1);
    let c1 = mul(a0, b1) ^ mul(a1, b0) ^ mul(a2, b3) ^ mul(a3, b2);
    let c2 = mul(a0, b2) ^ mul(a1, b1) ^ mul(a2, b0) ^ mul(a3, b3);
    let c3 = mul(a0, b3) ^ mul(a1, b2) ^ mul(a2, b1) ^ mul(a3, b0);

    // Keep only the valid bit positions of each accumulator and combine.
    (c0 & 0x1111111111111111)
        | (c1 & 0x2222222222222222)
        | (c2 & 0x4444444444444444)
        | (c3 & 0x8888888888888888)
}
117
#[cfg(not(target_pointer_width = "64"))]
fn gcm_mul64_nohw(a: u64, b: u64) -> (u64, u64) {
    // Carry-less 64x64 -> 128-bit multiplication, returned as (low half,
    // high half), assembled from three 32-bit carry-less multiplications via
    // Karatsuba. This is the fallback used when u128 arithmetic is expected
    // to be slow (non-64-bit targets); the u128-based version is used
    // otherwise.
    #[inline(always)]
    fn lo(a: u64) -> u32 {
        a as u32
    }
    #[inline(always)]
    fn hi(a: u64) -> u32 {
        lo(a >> 32)
    }

    let a0 = lo(a);
    let a1 = hi(a);
    let b0 = lo(b);
    let b1 = hi(b);
    // Karatsuba multiplication.
    let lo = gcm_mul32_nohw(a0, b0);
    let hi = gcm_mul32_nohw(a1, b1);
    // Middle Karatsuba term: (a0 + a1)(b0 + b1) - a0*b0 - a1*b1. In GF(2)
    // both addition and subtraction are XOR.
    let mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1) ^ lo ^ hi;
    // The middle term straddles the 64-bit boundary of the 128-bit result.
    (lo ^ (mid << 32), hi ^ (mid >> 32))
}
139
init(xi: [u64; 2]) -> super::u128140 pub(super) fn init(xi: [u64; 2]) -> super::u128 {
141 // We implement GHASH in terms of POLYVAL, as described in RFC 8452. This
142 // avoids a shift by 1 in the multiplication, needed to account for bit
143 // reversal losing a bit after multiplication, that is,
144 // rev128(X) * rev128(Y) = rev255(X*Y).
145 //
146 // Per Appendix A, we run mulX_POLYVAL. Note this is the same transformation
147 // applied by |gcm_init_clmul|, etc. Note |Xi| has already been byteswapped.
148 //
149 // See also slide 16 of
150 // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf
151 let mut lo = xi[1];
152 let mut hi = xi[0];
153
154 let mut carry = hi >> 63;
155 carry = 0u64.wrapping_sub(carry);
156
157 hi <<= 1;
158 hi |= lo >> 63;
159 lo <<= 1;
160
161 // The irreducible polynomial is 1 + x^121 + x^126 + x^127 + x^128, so we
162 // conditionally add 0xc200...0001.
163 lo ^= carry & 1;
164 hi ^= carry & 0xc200000000000000;
165
166 // This implementation does not use the rest of |Htable|.
167 super::u128 { hi, lo }
168 }
169
fn gcm_polyval_nohw(xi: &mut [u64; 2], h: super::u128) {
    // In-place POLYVAL multiplication of |xi| by |h|: the 128-bit product is
    // formed, multiplied by x^-128, and reduced (see the comment below).
    // |xi[0]| is paired with |h.lo| and |xi[1]| with |h.hi|.
    //
    // Karatsuba multiplication. The product of |Xi| and |H| is stored in |r0|
    // through |r3|. Note there is no byte or bit reversal because we are
    // evaluating POLYVAL.
    let (r0, mut r1) = gcm_mul64_nohw(xi[0], h.lo);
    let (mut r2, mut r3) = gcm_mul64_nohw(xi[1], h.hi);
    // Middle Karatsuba term; in GF(2) the subtractions are XORs.
    let (mut mid0, mut mid1) = gcm_mul64_nohw(xi[0] ^ xi[1], h.hi ^ h.lo);
    mid0 ^= r0 ^ r2;
    mid1 ^= r1 ^ r3;
    r2 ^= mid1;
    r1 ^= mid0;

    // Now we multiply our 256-bit result by x^-128 and reduce. |r2| and
    // |r3| shifts into position and we must multiply |r0| and |r1| by x^-128. We
    // have:
    //
    //        1 = x^121 + x^126 + x^127 + x^128
    //   x^-128 = x^-7 + x^-2 + x^-1 + 1
    //
    // This is the GHASH reduction step, but with bits flowing in reverse.

    // The x^-7, x^-2, and x^-1 terms shift bits past x^0, which would require
    // another reduction steps. Instead, we gather the excess bits, incorporate
    // them into |r0| and |r1| and reduce once. See slides 17-19
    // of https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf.
    //
    // NOTE: |r1| must absorb the shifted |r0| excess bits here, BEFORE the
    // x^-1 / x^-2 / x^-7 steps below read |r1| — the statement order is
    // load-bearing.
    r1 ^= (r0 << 63) ^ (r0 << 62) ^ (r0 << 57);

    // 1
    r2 ^= r0;
    r3 ^= r1;

    // x^-1
    r2 ^= r0 >> 1;
    r2 ^= r1 << 63;
    r3 ^= r1 >> 1;

    // x^-2
    r2 ^= r0 >> 2;
    r2 ^= r1 << 62;
    r3 ^= r1 >> 2;

    // x^-7
    r2 ^= r0 >> 7;
    r2 ^= r1 << 57;
    r3 ^= r1 >> 7;

    // The reduced result now lives in (r2, r3).
    *xi = [r2, r3];
}
218
gmult(xi: &mut Xi, h: super::u128)219 pub(super) fn gmult(xi: &mut Xi, h: super::u128) {
220 with_swapped_xi(xi, |swapped| {
221 gcm_polyval_nohw(swapped, h);
222 })
223 }
224
ghash(xi: &mut Xi, h: super::u128, input: &[[u8; BLOCK_LEN]])225 pub(super) fn ghash(xi: &mut Xi, h: super::u128, input: &[[u8; BLOCK_LEN]]) {
226 with_swapped_xi(xi, |swapped| {
227 input.iter().for_each(|input| {
228 let input: &[[u8; 8]; 2] = input.chunks_fixed();
229 swapped[0] ^= u64::from_be_bytes(input[1]);
230 swapped[1] ^= u64::from_be_bytes(input[0]);
231 gcm_polyval_nohw(swapped, h);
232 });
233 });
234 }
235
236 #[inline]
with_swapped_xi(Xi(xi): &mut Xi, f: impl FnOnce(&mut [u64; 2]))237 fn with_swapped_xi(Xi(xi): &mut Xi, f: impl FnOnce(&mut [u64; 2])) {
238 let unswapped: [u64; 2] = (*xi).into();
239 let mut swapped: [u64; 2] = [unswapped[1], unswapped[0]];
240 f(&mut swapped);
241 *xi = Block::from([swapped[1], swapped[0]])
242 }
243