1 /* Copyright (c) 2019, Google Inc.
2 *
3 * Permission to use, copy, modify, and/or distribute this software for any
4 * purpose with or without fee is hereby granted, provided that the above
5 * copyright notice and this permission notice appear in all copies.
6 *
7 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14
15 #include <openssl/aes.h>
16
17 #include <assert.h>
18 #include <string.h>
19
20 #include "../../internal.h"
21 #include "internal.h"
22
23 #if defined(OPENSSL_SSE2)
24 #include <emmintrin.h>
25 #endif
26
27
28 // This file contains a constant-time implementation of AES, bitsliced with
29 // 32-bit, 64-bit, or 128-bit words, operating on two-, four-, and eight-block
30 // batches, respectively. The 128-bit implementation requires SSE2 intrinsics.
31 //
32 // This implementation is based on the algorithms described in the following
33 // references:
34 // - https://bearssl.org/constanttime.html#aes
35 // - https://eprint.iacr.org/2009/129.pdf
36 // - https://eprint.iacr.org/2009/191.pdf
37
38
39 // Word operations.
40 //
41 // An aes_word_t is the word used for this AES implementation. Throughout this
42 // file, bits and bytes are ordered little-endian, though "left" and "right"
43 // shifts match the operations themselves, which makes them reversed in a
44 // little-endian, left-to-right reading.
45 //
46 // Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an
47 // |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE|
48 // bits each, each corresponding to a byte in an AES block in column-major
49 // order (AES's byte order). We refer to these as "logical bytes". Note, in the
50 // 32-bit and 64-bit implementations, they are smaller than a byte. (The
51 // contents of a logical byte will be described later.)
52 //
53 // MSVC does not support C bit operators on |__m128i|, so the wrapper functions
54 // |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and
55 // |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift
56 // value ranges from 0 to 15 independent of |aes_word_t| and
57 // |AES_NOHW_BATCH_SIZE|.
58 //
59 // This ordering is different from https://eprint.iacr.org/2009/129.pdf, which
60 // uses row-major order. Matching the AES order was easier to reason about, and
61 // we do not have PSHUFB available to arbitrarily permute bytes.
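//
// For example (added illustration), in the 64-bit implementation
// |AES_NOHW_BATCH_SIZE| is 4, so a logical byte is 4 bits wide and a 64-bit
// word holds 16 logical bytes, one per byte position of an AES block. A shift
// of one logical byte is then a 4-bit shift, i.e. aes_nohw_shift_left(a, 1) is
// a << 4, whereas in the SSE2 implementation the same call shifts the vector
// by a full byte.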
62
63 #if defined(OPENSSL_SSE2)
64 typedef __m128i aes_word_t;
65 // AES_NOHW_WORD_SIZE is sizeof(aes_word_t). alignas(sizeof(T)) does not work in
66 // MSVC, so we define a constant.
67 #define AES_NOHW_WORD_SIZE 16
68 #define AES_NOHW_BATCH_SIZE 8
69 #define AES_NOHW_ROW0_MASK \
70 _mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff)
71 #define AES_NOHW_ROW1_MASK \
72 _mm_set_epi32(0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00)
73 #define AES_NOHW_ROW2_MASK \
74 _mm_set_epi32(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000)
75 #define AES_NOHW_ROW3_MASK \
76 _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000)
77 #define AES_NOHW_COL01_MASK \
78 _mm_set_epi32(0x00000000, 0x00000000, 0xffffffff, 0xffffffff)
79 #define AES_NOHW_COL2_MASK \
80 _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0x00000000)
81 #define AES_NOHW_COL3_MASK \
82 _mm_set_epi32(0xffffffff, 0x00000000, 0x00000000, 0x00000000)
83
84 static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
85 return _mm_and_si128(a, b);
86 }
87
88 static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
89 return _mm_or_si128(a, b);
90 }
91
92 static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
93 return _mm_xor_si128(a, b);
94 }
95
96 static inline aes_word_t aes_nohw_not(aes_word_t a) {
97 return _mm_xor_si128(
98 a, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff));
99 }
100
101 // These are macros because parameters to |_mm_slli_si128| and |_mm_srli_si128|
102 // must be constants.
103 #define aes_nohw_shift_left(/* aes_word_t */ a, /* const */ i) \
104 _mm_slli_si128((a), (i))
105 #define aes_nohw_shift_right(/* aes_word_t */ a, /* const */ i) \
106 _mm_srli_si128((a), (i))
107 #else // !OPENSSL_SSE2
108 #if defined(OPENSSL_64_BIT)
109 typedef uint64_t aes_word_t;
110 #define AES_NOHW_WORD_SIZE 8
111 #define AES_NOHW_BATCH_SIZE 4
112 #define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f)
113 #define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0)
114 #define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00)
115 #define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000)
116 #define AES_NOHW_COL01_MASK UINT64_C(0x00000000ffffffff)
117 #define AES_NOHW_COL2_MASK UINT64_C(0x0000ffff00000000)
118 #define AES_NOHW_COL3_MASK UINT64_C(0xffff000000000000)
119 #else // !OPENSSL_64_BIT
120 typedef uint32_t aes_word_t;
121 #define AES_NOHW_WORD_SIZE 4
122 #define AES_NOHW_BATCH_SIZE 2
123 #define AES_NOHW_ROW0_MASK 0x03030303
124 #define AES_NOHW_ROW1_MASK 0x0c0c0c0c
125 #define AES_NOHW_ROW2_MASK 0x30303030
126 #define AES_NOHW_ROW3_MASK 0xc0c0c0c0
127 #define AES_NOHW_COL01_MASK 0x0000ffff
128 #define AES_NOHW_COL2_MASK 0x00ff0000
129 #define AES_NOHW_COL3_MASK 0xff000000
130 #endif // OPENSSL_64_BIT
131
132 static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
133 return a & b;
134 }
135
136 static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
137 return a | b;
138 }
139
140 static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
141 return a ^ b;
142 }
143
144 static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; }
145
146 static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) {
147 return a << (i * AES_NOHW_BATCH_SIZE);
148 }
149
150 static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) {
151 return a >> (i * AES_NOHW_BATCH_SIZE);
152 }
153 #endif // OPENSSL_SSE2
154
155 static_assert(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t),
156 "batch size does not match word size");
157 static_assert(AES_NOHW_WORD_SIZE == sizeof(aes_word_t),
158 "AES_NOHW_WORD_SIZE is incorrect");
159
160
161 // Block representations.
162 //
163 // This implementation uses three representations for AES blocks. First, the
164 // public API represents blocks as uint8_t[16] in the usual way. Second, most
165 // AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|.
166 // This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words
167 // containing bitsliced blocks a, b, c, d, this would be as follows (vertical
168 // bars divide logical bytes):
169 //
170 // batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ...
171 // batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ...
172 // batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
173 // batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
174 // ...
175 //
176 // Finally, an individual block may be stored as an intermediate form in an
177 // aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each
178 // block, so that block[0]'s ith logical byte contains least-significant
179 // |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of
180 // |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as
181 // "compacting" the block. Note this is no-op with 128-bit words because then
182 // |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit
183 // words, one block would be stored in two words:
184 //
185 // block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ...
186 // block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ...
187 //
188 // Observe that the distances between corresponding bits in bitsliced and
189 // compact bit orders match. If we line up corresponding words of each block,
190 // the bitsliced and compact representations may be converted by transposing bits
191 // in corresponding logical bytes. Continuing the 64-bit example:
192 //
193 // block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ...
194 // block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ...
195 // block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ...
196 // block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ...
197 //
198 // batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ...
199 // batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ...
200 // batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
201 // batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
202 //
203 // Note also that bitwise operations and (logical) byte permutations on an
204 // |aes_word_t| work equally for the bitsliced and compact words.
205 //
206 // We use the compact form in the |AES_KEY| representation to save work
207 // inflating round keys into |AES_NOHW_BATCH|. The compact form also exists
208 // temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately
209 // before or after |aes_nohw_transpose|.
210
211 #define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t))
212
213 // An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise
214 // specified, it is in bitsliced form.
215 typedef struct {
216 aes_word_t w[8];
217 } AES_NOHW_BATCH;
218
219 // An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is
220 // suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH_SIZE|
221 // |AES_KEY|s so it should not be used as a long-term key representation.
222 typedef struct {
223 // keys is an array of batches, one for each round key. Each batch stores
224 // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form.
225 AES_NOHW_BATCH keys[AES_MAXNR + 1];
226 } AES_NOHW_SCHEDULE;
227
228 // aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in
229 // compact form.
230 static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch,
231 const aes_word_t in[AES_NOHW_BLOCK_WORDS],
232 size_t i) {
233 // Note the words are interleaved. The order comes from |aes_nohw_transpose|.
234 // If |i| is zero and this is the 64-bit implementation, in[0] contains bits
235 // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at
236 // w[4] so that bits 0 and 4 are in the correct position. (In general, bits
237 // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares
238 // will be correctly placed.)
239 assert(i < AES_NOHW_BATCH_SIZE);
240 #if defined(OPENSSL_SSE2)
241 batch->w[i] = in[0];
242 #elif defined(OPENSSL_64_BIT)
243 batch->w[i] = in[0];
244 batch->w[i + 4] = in[1];
245 #else
246 batch->w[i] = in[0];
247 batch->w[i + 2] = in[1];
248 batch->w[i + 4] = in[2];
249 batch->w[i + 6] = in[3];
250 #endif
251 }
252
253 // aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in
254 // compact form.
255 static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch,
256 aes_word_t out[AES_NOHW_BLOCK_WORDS],
257 size_t i) {
258 assert(i < AES_NOHW_BATCH_SIZE);
259 #if defined(OPENSSL_SSE2)
260 out[0] = batch->w[i];
261 #elif defined(OPENSSL_64_BIT)
262 out[0] = batch->w[i];
263 out[1] = batch->w[i + 4];
264 #else
265 out[0] = batch->w[i];
266 out[1] = batch->w[i + 2];
267 out[2] = batch->w[i + 4];
268 out[3] = batch->w[i + 6];
269 #endif
270 }
271
272 #if !defined(OPENSSL_SSE2)
273 // aes_nohw_delta_swap returns |a| with bits |a & mask| and
274 // |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap.
275 static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask,
276 aes_word_t shift) {
277 // See
278 // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
279 aes_word_t b = (a ^ (a >> shift)) & mask;
280 return a ^ b ^ (b << shift);
281 }
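
// As a worked example of the delta swap above (added for illustration): with
// a = 0xab, mask = 0x0f, and shift = 4, we get b = (0xab ^ 0x0a) & 0x0f =
// 0x01, so the result is 0xab ^ 0x01 ^ 0x10 = 0xba. The two nibbles selected
// by |mask| and |mask << shift| have traded places.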
282
283 // In the 32-bit and 64-bit implementations, a block spans multiple words.
284 // |aes_nohw_compact_block| must permute bits across different words. First we
285 // implement |aes_nohw_compact_word| which performs a smaller version of the
286 // transformation which stays within a single word.
287 //
288 // These transformations are generalizations of the output of
289 // http://programming.sirrida.de/calcperm.php on smaller inputs.
290 #if defined(OPENSSL_64_BIT)
291 static inline uint64_t aes_nohw_compact_word(uint64_t a) {
292 // Numbering the 64/4 = 16 4-bit chunks, least to most significant, we swap
293 // quartets of those chunks:
294 // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
295 // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15
296 a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
297 // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
298 // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 =>
299 // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15
300 a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
301 // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
302 // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 =>
303 // 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15
304 a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
305 return a;
306 }
307
308 static inline uint64_t aes_nohw_uncompact_word(uint64_t a) {
309 // Reverse the steps of |aes_nohw_compact_word|.
310 a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
311 a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
312 a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
313 return a;
314 }
315 #else // !OPENSSL_64_BIT
316 static inline uint32_t aes_nohw_compact_word(uint32_t a) {
317 // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
318 // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
319 // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15
320 // Note: 0x00cc = 0b0000_0000_1100_1100
321 // 0x00cc << 6 = 0b0011_0011_0000_0000
322 a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
323 // Now we swap groups of four bits (still numbering by pairs):
324 // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 =>
325 // 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15
326 // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
327 a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
328 return a;
329 }
330
331 static inline uint32_t aes_nohw_uncompact_word(uint32_t a) {
332 // Reverse the steps of |aes_nohw_compact_word|.
333 a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
334 a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
335 return a;
336 }
337
338 static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1,
339 uint8_t a2, uint8_t a3) {
340 return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) |
341 ((uint32_t)a3 << 24);
342 }
343 #endif // OPENSSL_64_BIT
344 #endif // !OPENSSL_SSE2
345
346 static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
347 const uint8_t in[16]) {
348 memcpy(out, in, 16);
349 #if defined(OPENSSL_SSE2)
350 // No conversions needed.
351 #elif defined(OPENSSL_64_BIT)
352 uint64_t a0 = aes_nohw_compact_word(out[0]);
353 uint64_t a1 = aes_nohw_compact_word(out[1]);
354 out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32);
355 out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32);
356 #else
357 uint32_t a0 = aes_nohw_compact_word(out[0]);
358 uint32_t a1 = aes_nohw_compact_word(out[1]);
359 uint32_t a2 = aes_nohw_compact_word(out[2]);
360 uint32_t a3 = aes_nohw_compact_word(out[3]);
361 // Note clang, when building for ARM Thumb2, will sometimes miscompile
362 // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
363 // without optimizations. This bug was introduced in
364 // https://reviews.llvm.org/rL340261 and fixed in
365 // https://reviews.llvm.org/rL351310. The following is written to avoid this.
366 out[0] = aes_nohw_word_from_bytes(a0, a1, a2, a3);
367 out[1] = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8);
368 out[2] = aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16);
369 out[3] = aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24);
370 #endif
371 }
372
373 static inline void aes_nohw_uncompact_block(
374 uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
375 #if defined(OPENSSL_SSE2)
376 memcpy(out, in, 16); // No conversions needed.
377 #elif defined(OPENSSL_64_BIT)
378 uint64_t a0 = in[0];
379 uint64_t a1 = in[1];
380 uint64_t b0 =
381 aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32));
382 uint64_t b1 =
383 aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32));
384 memcpy(out, &b0, 8);
385 memcpy(out + 8, &b1, 8);
386 #else
387 uint32_t a0 = in[0];
388 uint32_t a1 = in[1];
389 uint32_t a2 = in[2];
390 uint32_t a3 = in[3];
391 // Note clang, when building for ARM Thumb2, will sometimes miscompile
392 // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
393 // without optimizations. This bug was introduced in
394 // https://reviews.llvm.org/rL340261 and fixed in
395 // https://reviews.llvm.org/rL351310. The following is written to avoid this.
396 uint32_t b0 = aes_nohw_word_from_bytes(a0, a1, a2, a3);
397 uint32_t b1 = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8);
398 uint32_t b2 =
399 aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16);
400 uint32_t b3 =
401 aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24);
402 b0 = aes_nohw_uncompact_word(b0);
403 b1 = aes_nohw_uncompact_word(b1);
404 b2 = aes_nohw_uncompact_word(b2);
405 b3 = aes_nohw_uncompact_word(b3);
406 memcpy(out, &b0, 4);
407 memcpy(out + 4, &b1, 4);
408 memcpy(out + 8, &b2, 4);
409 memcpy(out + 12, &b3, 4);
410 #endif
411 }
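
// |aes_nohw_compact_block| and |aes_nohw_uncompact_block| are inverses, so a
// round trip must return the original bytes. A minimal self-check sketch
// (illustrative only, not part of this file's tests):
//
//   uint8_t in[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
//   uint8_t out[16];
//   aes_word_t tmp[AES_NOHW_BLOCK_WORDS];
//   aes_nohw_compact_block(tmp, in);
//   aes_nohw_uncompact_block(out, tmp);
//   assert(memcmp(in, out, 16) == 0);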
412
413 // aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in
414 // |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and
415 // |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it
416 // is repeated to the full width of |aes_word_t|.
417 #if defined(OPENSSL_SSE2)
418 // This must be a macro because |_mm_srli_epi32| and |_mm_slli_epi32| require
419 // constant shift values.
420 #define aes_nohw_swap_bits(/*__m128i* */ a, /*__m128i* */ b, \
421 /* uint32_t */ mask, /* const */ shift) \
422 do { \
423 __m128i swap = \
424 _mm_and_si128(_mm_xor_si128(_mm_srli_epi32(*(a), (shift)), *(b)), \
425 _mm_set_epi32((mask), (mask), (mask), (mask))); \
426 *(a) = _mm_xor_si128(*(a), _mm_slli_epi32(swap, (shift))); \
427 *(b) = _mm_xor_si128(*(b), swap); \
428 \
429 } while (0)
430 #else
431 static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b,
432 uint32_t mask, aes_word_t shift) {
433 #if defined(OPENSSL_64_BIT)
434 aes_word_t mask_w = (((uint64_t)mask) << 32) | mask;
435 #else
436 aes_word_t mask_w = mask;
437 #endif
438 // This is a variation on a delta swap.
439 aes_word_t swap = ((*a >> shift) ^ *b) & mask_w;
440 *a ^= swap << shift;
441 *b ^= swap;
442 }
443 #endif // OPENSSL_SSE2
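
// As an added illustration of the swap above: in the 32-bit implementation,
// aes_nohw_swap_bits(&a, &b, 0x55555555, 1) trades each odd-indexed bit of |a|
// with the even-indexed bit of |b| one position below it. With a = 0x00000002
// and b = 0x00000000, |swap| is 0x00000001, leaving a = 0 and b = 0x00000001.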
444
445 // aes_nohw_transpose converts |batch| to and from bitsliced form. It divides
446 // the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares
447 // and transposes each square.
448 static void aes_nohw_transpose(AES_NOHW_BATCH *batch) {
449 // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101).
450 aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1);
451 aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1);
452 aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1);
453 aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1);
454
455 #if AES_NOHW_BATCH_SIZE >= 4
456 // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011).
457 aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2);
458 aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2);
459 aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2);
460 aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2);
461 #endif
462
463 #if AES_NOHW_BATCH_SIZE >= 8
464 // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111).
465 aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4);
466 aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4);
467 aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4);
468 aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4);
469 #endif
470 }
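
// Note that a transpose is its own inverse, so applying |aes_nohw_transpose|
// twice returns the original batch. This is why the same function is used both
// to enter and to leave bitsliced form.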
471
472 // aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|.
473 // |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
474 static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in,
475 size_t num_blocks) {
476 // Don't leave unused blocks uninitialized.
477 memset(out, 0, sizeof(AES_NOHW_BATCH));
478 assert(num_blocks <= AES_NOHW_BATCH_SIZE);
479 for (size_t i = 0; i < num_blocks; i++) {
480 aes_word_t block[AES_NOHW_BLOCK_WORDS];
481 aes_nohw_compact_block(block, in + 16 * i);
482 aes_nohw_batch_set(out, block, i);
483 }
484
485 aes_nohw_transpose(out);
486 }
487
488 // aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to |out|.
489 // |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
490 static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks,
491 const AES_NOHW_BATCH *batch) {
492 AES_NOHW_BATCH copy = *batch;
493 aes_nohw_transpose(&copy);
494
495 assert(num_blocks <= AES_NOHW_BATCH_SIZE);
496 for (size_t i = 0; i < num_blocks; i++) {
497 aes_word_t block[AES_NOHW_BLOCK_WORDS];
498 aes_nohw_batch_get(&copy, block, i);
499 aes_nohw_uncompact_block(out + 16 * i, block);
500 }
501 }
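
// Together, |aes_nohw_to_batch| and |aes_nohw_from_batch| round-trip blocks
// unchanged. A minimal sketch of that property (illustrative only):
//
//   uint8_t in[AES_NOHW_BATCH_SIZE * 16] = {0};
//   uint8_t out[AES_NOHW_BATCH_SIZE * 16];
//   AES_NOHW_BATCH batch;
//   aes_nohw_to_batch(&batch, in, AES_NOHW_BATCH_SIZE);
//   aes_nohw_from_batch(out, AES_NOHW_BATCH_SIZE, &batch);
//   assert(memcmp(in, out, sizeof(in)) == 0);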
502
503
504 // AES round steps.
505
506 static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch,
507 const AES_NOHW_BATCH *key) {
508 for (size_t i = 0; i < 8; i++) {
509 batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]);
510 }
511 }
512
513 static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) {
514 // See https://eprint.iacr.org/2009/191.pdf, Appendix C.
515 aes_word_t x0 = batch->w[7];
516 aes_word_t x1 = batch->w[6];
517 aes_word_t x2 = batch->w[5];
518 aes_word_t x3 = batch->w[4];
519 aes_word_t x4 = batch->w[3];
520 aes_word_t x5 = batch->w[2];
521 aes_word_t x6 = batch->w[1];
522 aes_word_t x7 = batch->w[0];
523
524 // Figure 2, the top linear transformation.
525 aes_word_t y14 = aes_nohw_xor(x3, x5);
526 aes_word_t y13 = aes_nohw_xor(x0, x6);
527 aes_word_t y9 = aes_nohw_xor(x0, x3);
528 aes_word_t y8 = aes_nohw_xor(x0, x5);
529 aes_word_t t0 = aes_nohw_xor(x1, x2);
530 aes_word_t y1 = aes_nohw_xor(t0, x7);
531 aes_word_t y4 = aes_nohw_xor(y1, x3);
532 aes_word_t y12 = aes_nohw_xor(y13, y14);
533 aes_word_t y2 = aes_nohw_xor(y1, x0);
534 aes_word_t y5 = aes_nohw_xor(y1, x6);
535 aes_word_t y3 = aes_nohw_xor(y5, y8);
536 aes_word_t t1 = aes_nohw_xor(x4, y12);
537 aes_word_t y15 = aes_nohw_xor(t1, x5);
538 aes_word_t y20 = aes_nohw_xor(t1, x1);
539 aes_word_t y6 = aes_nohw_xor(y15, x7);
540 aes_word_t y10 = aes_nohw_xor(y15, t0);
541 aes_word_t y11 = aes_nohw_xor(y20, y9);
542 aes_word_t y7 = aes_nohw_xor(x7, y11);
543 aes_word_t y17 = aes_nohw_xor(y10, y11);
544 aes_word_t y19 = aes_nohw_xor(y10, y8);
545 aes_word_t y16 = aes_nohw_xor(t0, y11);
546 aes_word_t y21 = aes_nohw_xor(y13, y16);
547 aes_word_t y18 = aes_nohw_xor(x0, y16);
548
549 // Figure 3, the middle non-linear section.
550 aes_word_t t2 = aes_nohw_and(y12, y15);
551 aes_word_t t3 = aes_nohw_and(y3, y6);
552 aes_word_t t4 = aes_nohw_xor(t3, t2);
553 aes_word_t t5 = aes_nohw_and(y4, x7);
554 aes_word_t t6 = aes_nohw_xor(t5, t2);
555 aes_word_t t7 = aes_nohw_and(y13, y16);
556 aes_word_t t8 = aes_nohw_and(y5, y1);
557 aes_word_t t9 = aes_nohw_xor(t8, t7);
558 aes_word_t t10 = aes_nohw_and(y2, y7);
559 aes_word_t t11 = aes_nohw_xor(t10, t7);
560 aes_word_t t12 = aes_nohw_and(y9, y11);
561 aes_word_t t13 = aes_nohw_and(y14, y17);
562 aes_word_t t14 = aes_nohw_xor(t13, t12);
563 aes_word_t t15 = aes_nohw_and(y8, y10);
564 aes_word_t t16 = aes_nohw_xor(t15, t12);
565 aes_word_t t17 = aes_nohw_xor(t4, t14);
566 aes_word_t t18 = aes_nohw_xor(t6, t16);
567 aes_word_t t19 = aes_nohw_xor(t9, t14);
568 aes_word_t t20 = aes_nohw_xor(t11, t16);
569 aes_word_t t21 = aes_nohw_xor(t17, y20);
570 aes_word_t t22 = aes_nohw_xor(t18, y19);
571 aes_word_t t23 = aes_nohw_xor(t19, y21);
572 aes_word_t t24 = aes_nohw_xor(t20, y18);
573 aes_word_t t25 = aes_nohw_xor(t21, t22);
574 aes_word_t t26 = aes_nohw_and(t21, t23);
575 aes_word_t t27 = aes_nohw_xor(t24, t26);
576 aes_word_t t28 = aes_nohw_and(t25, t27);
577 aes_word_t t29 = aes_nohw_xor(t28, t22);
578 aes_word_t t30 = aes_nohw_xor(t23, t24);
579 aes_word_t t31 = aes_nohw_xor(t22, t26);
580 aes_word_t t32 = aes_nohw_and(t31, t30);
581 aes_word_t t33 = aes_nohw_xor(t32, t24);
582 aes_word_t t34 = aes_nohw_xor(t23, t33);
583 aes_word_t t35 = aes_nohw_xor(t27, t33);
584 aes_word_t t36 = aes_nohw_and(t24, t35);
585 aes_word_t t37 = aes_nohw_xor(t36, t34);
586 aes_word_t t38 = aes_nohw_xor(t27, t36);
587 aes_word_t t39 = aes_nohw_and(t29, t38);
588 aes_word_t t40 = aes_nohw_xor(t25, t39);
589 aes_word_t t41 = aes_nohw_xor(t40, t37);
590 aes_word_t t42 = aes_nohw_xor(t29, t33);
591 aes_word_t t43 = aes_nohw_xor(t29, t40);
592 aes_word_t t44 = aes_nohw_xor(t33, t37);
593 aes_word_t t45 = aes_nohw_xor(t42, t41);
594 aes_word_t z0 = aes_nohw_and(t44, y15);
595 aes_word_t z1 = aes_nohw_and(t37, y6);
596 aes_word_t z2 = aes_nohw_and(t33, x7);
597 aes_word_t z3 = aes_nohw_and(t43, y16);
598 aes_word_t z4 = aes_nohw_and(t40, y1);
599 aes_word_t z5 = aes_nohw_and(t29, y7);
600 aes_word_t z6 = aes_nohw_and(t42, y11);
601 aes_word_t z7 = aes_nohw_and(t45, y17);
602 aes_word_t z8 = aes_nohw_and(t41, y10);
603 aes_word_t z9 = aes_nohw_and(t44, y12);
604 aes_word_t z10 = aes_nohw_and(t37, y3);
605 aes_word_t z11 = aes_nohw_and(t33, y4);
606 aes_word_t z12 = aes_nohw_and(t43, y13);
607 aes_word_t z13 = aes_nohw_and(t40, y5);
608 aes_word_t z14 = aes_nohw_and(t29, y2);
609 aes_word_t z15 = aes_nohw_and(t42, y9);
610 aes_word_t z16 = aes_nohw_and(t45, y14);
611 aes_word_t z17 = aes_nohw_and(t41, y8);
612
613 // Figure 4, bottom linear transformation.
614 aes_word_t t46 = aes_nohw_xor(z15, z16);
615 aes_word_t t47 = aes_nohw_xor(z10, z11);
616 aes_word_t t48 = aes_nohw_xor(z5, z13);
617 aes_word_t t49 = aes_nohw_xor(z9, z10);
618 aes_word_t t50 = aes_nohw_xor(z2, z12);
619 aes_word_t t51 = aes_nohw_xor(z2, z5);
620 aes_word_t t52 = aes_nohw_xor(z7, z8);
621 aes_word_t t53 = aes_nohw_xor(z0, z3);
622 aes_word_t t54 = aes_nohw_xor(z6, z7);
623 aes_word_t t55 = aes_nohw_xor(z16, z17);
624 aes_word_t t56 = aes_nohw_xor(z12, t48);
625 aes_word_t t57 = aes_nohw_xor(t50, t53);
626 aes_word_t t58 = aes_nohw_xor(z4, t46);
627 aes_word_t t59 = aes_nohw_xor(z3, t54);
628 aes_word_t t60 = aes_nohw_xor(t46, t57);
629 aes_word_t t61 = aes_nohw_xor(z14, t57);
630 aes_word_t t62 = aes_nohw_xor(t52, t58);
631 aes_word_t t63 = aes_nohw_xor(t49, t58);
632 aes_word_t t64 = aes_nohw_xor(z4, t59);
633 aes_word_t t65 = aes_nohw_xor(t61, t62);
634 aes_word_t t66 = aes_nohw_xor(z1, t63);
635 aes_word_t s0 = aes_nohw_xor(t59, t63);
636 aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62));
637 aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60));
638 aes_word_t t67 = aes_nohw_xor(t64, t65);
639 aes_word_t s3 = aes_nohw_xor(t53, t66);
640 aes_word_t s4 = aes_nohw_xor(t51, t66);
641 aes_word_t s5 = aes_nohw_xor(t47, t65);
642 aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3));
643 aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67));
644
645 batch->w[0] = s7;
646 batch->w[1] = s6;
647 batch->w[2] = s5;
648 batch->w[3] = s4;
649 batch->w[4] = s3;
650 batch->w[5] = s2;
651 batch->w[6] = s1;
652 batch->w[7] = s0;
653 }
654
655 // aes_nohw_sub_bytes_inv_affine inverts the affine transform portion of the AES
656 // S-box, defined in FIPS PUB 197, section 5.1.1, step 2.
657 static void aes_nohw_sub_bytes_inv_affine(AES_NOHW_BATCH *batch) {
658 aes_word_t a0 = batch->w[0];
659 aes_word_t a1 = batch->w[1];
660 aes_word_t a2 = batch->w[2];
661 aes_word_t a3 = batch->w[3];
662 aes_word_t a4 = batch->w[4];
663 aes_word_t a5 = batch->w[5];
664 aes_word_t a6 = batch->w[6];
665 aes_word_t a7 = batch->w[7];
666
667 // Apply the circulant [0 0 1 0 0 1 0 1]. This is the inverse of the circulant
668 // [1 0 0 0 1 1 1 1].
669 aes_word_t b0 = aes_nohw_xor(a2, aes_nohw_xor(a5, a7));
670 aes_word_t b1 = aes_nohw_xor(a3, aes_nohw_xor(a6, a0));
671 aes_word_t b2 = aes_nohw_xor(a4, aes_nohw_xor(a7, a1));
672 aes_word_t b3 = aes_nohw_xor(a5, aes_nohw_xor(a0, a2));
673 aes_word_t b4 = aes_nohw_xor(a6, aes_nohw_xor(a1, a3));
674 aes_word_t b5 = aes_nohw_xor(a7, aes_nohw_xor(a2, a4));
675 aes_word_t b6 = aes_nohw_xor(a0, aes_nohw_xor(a3, a5));
676 aes_word_t b7 = aes_nohw_xor(a1, aes_nohw_xor(a4, a6));
677
678 // XOR 0x05. Equivalently, we could XOR 0x63 before applying the circulant,
679 // but 0x05 has lower Hamming weight. (0x05 is the circulant applied to 0x63.)
680 batch->w[0] = aes_nohw_not(b0);
681 batch->w[1] = b1;
682 batch->w[2] = aes_nohw_not(b2);
683 batch->w[3] = b3;
684 batch->w[4] = b4;
685 batch->w[5] = b5;
686 batch->w[6] = b6;
687 batch->w[7] = b7;
688 }
689
690 static void aes_nohw_inv_sub_bytes(AES_NOHW_BATCH *batch) {
691 // We implement the inverse S-box using the forwards implementation with the
692 // technique described in https://www.bearssl.org/constanttime.html#aes.
693 //
694 // The forwards S-box inverts its input and applies an affine transformation:
695 // S(x) = A(Inv(x)). Thus Inv(x) = InvA(S(x)). The inverse S-box is then:
696 //
697 // InvS(x) = Inv(InvA(x)).
698 // = InvA(S(InvA(x)))
699 aes_nohw_sub_bytes_inv_affine(batch);
700 aes_nohw_sub_bytes(batch);
701 aes_nohw_sub_bytes_inv_affine(batch);
702 }
703
704 // aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated
705 // to the right by |n|. This is a macro because |aes_nohw_shift_*| require
706 // constant shift counts in the SSE2 implementation.
707 #define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \
708 (aes_nohw_or(aes_nohw_shift_right((v), (n)*4), \
709 aes_nohw_shift_left((v), 16 - (n)*4)))
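
// For example (added illustration): logical bytes follow AES's column-major
// order, so a byte's index is 4*col + row. Rotating the row-1 bytes right by
// one column, aes_nohw_rotate_cols_right(row1, 1), moves the logical byte at
// index 5 (row 1, column 1) to index 1 (row 1, column 0), which is the
// one-position rotation ShiftRows applies to row 1.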
710
711 static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) {
712 for (size_t i = 0; i < 8; i++) {
713 aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK);
714 aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK);
715 aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK);
716 aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK);
717 row1 = aes_nohw_rotate_cols_right(row1, 1);
718 row2 = aes_nohw_rotate_cols_right(row2, 2);
719 row3 = aes_nohw_rotate_cols_right(row3, 3);
720 batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3));
721 }
722 }
723
724 static void aes_nohw_inv_shift_rows(AES_NOHW_BATCH *batch) {
725 for (size_t i = 0; i < 8; i++) {
726 aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK);
727 aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK);
728 aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK);
729 aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK);
730 row1 = aes_nohw_rotate_cols_right(row1, 3);
731 row2 = aes_nohw_rotate_cols_right(row2, 2);
732 row3 = aes_nohw_rotate_cols_right(row3, 1);
733 batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3));
734 }
735 }
736
737 // aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated
738 // down by one.
739 static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) {
740 #if defined(OPENSSL_SSE2)
741 return _mm_or_si128(_mm_srli_epi32(v, 8), _mm_slli_epi32(v, 24));
742 #elif defined(OPENSSL_64_BIT)
743 return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) |
744 ((v << 12) & UINT64_C(0xf000f000f000f000));
745 #else
746 return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0);
747 #endif
748 }
749
750 // aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated
751 // by two.
752 static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) {
753 #if defined(OPENSSL_SSE2)
754 return _mm_or_si128(_mm_srli_epi32(v, 16), _mm_slli_epi32(v, 16));
755 #elif defined(OPENSSL_64_BIT)
756 return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) |
757 ((v << 8) & UINT64_C(0xff00ff00ff00ff00));
758 #else
759 return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0);
760 #endif
761 }
762
763 static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) {
764 // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A.
765 aes_word_t a0 = batch->w[0];
766 aes_word_t a1 = batch->w[1];
767 aes_word_t a2 = batch->w[2];
768 aes_word_t a3 = batch->w[3];
769 aes_word_t a4 = batch->w[4];
770 aes_word_t a5 = batch->w[5];
771 aes_word_t a6 = batch->w[6];
772 aes_word_t a7 = batch->w[7];
773
774 aes_word_t r0 = aes_nohw_rotate_rows_down(a0);
775 aes_word_t a0_r0 = aes_nohw_xor(a0, r0);
776 aes_word_t r1 = aes_nohw_rotate_rows_down(a1);
777 aes_word_t a1_r1 = aes_nohw_xor(a1, r1);
778 aes_word_t r2 = aes_nohw_rotate_rows_down(a2);
779 aes_word_t a2_r2 = aes_nohw_xor(a2, r2);
780 aes_word_t r3 = aes_nohw_rotate_rows_down(a3);
781 aes_word_t a3_r3 = aes_nohw_xor(a3, r3);
782 aes_word_t r4 = aes_nohw_rotate_rows_down(a4);
783 aes_word_t a4_r4 = aes_nohw_xor(a4, r4);
784 aes_word_t r5 = aes_nohw_rotate_rows_down(a5);
785 aes_word_t a5_r5 = aes_nohw_xor(a5, r5);
786 aes_word_t r6 = aes_nohw_rotate_rows_down(a6);
787 aes_word_t a6_r6 = aes_nohw_xor(a6, r6);
788 aes_word_t r7 = aes_nohw_rotate_rows_down(a7);
789 aes_word_t a7_r7 = aes_nohw_xor(a7, r7);
790
791 batch->w[0] =
792 aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0));
793 batch->w[1] =
794 aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7),
795 aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1)));
796 batch->w[2] =
797 aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2));
798 batch->w[3] =
799 aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7),
800 aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3)));
801 batch->w[4] =
802 aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7),
803 aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4)));
804 batch->w[5] =
805 aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5));
806 batch->w[6] =
807 aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6));
808 batch->w[7] =
809 aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7));
810 }
811
812 static void aes_nohw_inv_mix_columns(AES_NOHW_BATCH *batch) {
813 aes_word_t a0 = batch->w[0];
814 aes_word_t a1 = batch->w[1];
815 aes_word_t a2 = batch->w[2];
816 aes_word_t a3 = batch->w[3];
817 aes_word_t a4 = batch->w[4];
818 aes_word_t a5 = batch->w[5];
819 aes_word_t a6 = batch->w[6];
820 aes_word_t a7 = batch->w[7];
821
822 // bsaes-x86_64.pl describes the following decomposition of the inverse
823 // MixColumns matrix, credited to Jussi Kivilinna. This gives a much simpler
824 // multiplication.
825 //
826 // | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
827 // | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
828 // | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
829 // | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
830 //
831 // First, apply the [5 0 4 0] matrix. Multiplying by 4 in F_(2^8) is described
832 // by the following bit equations:
833 //
834 // b0 = a6
835 // b1 = a6 ^ a7
836 // b2 = a0 ^ a7
837 // b3 = a1 ^ a6
838 // b4 = a2 ^ a6 ^ a7
839 // b5 = a3 ^ a7
840 // b6 = a4
841 // b7 = a5
842 //
843 // Each coefficient is given by:
844 //
845 // b_ij = 05·a_ij ⊕ 04·a_i(j+2) = 04·(a_ij ⊕ a_i(j+2)) ⊕ a_ij
846 //
847 // We combine the two equations below. Note a_i(j+2) is a row rotation.
848 aes_word_t a0_r0 = aes_nohw_xor(a0, aes_nohw_rotate_rows_twice(a0));
849 aes_word_t a1_r1 = aes_nohw_xor(a1, aes_nohw_rotate_rows_twice(a1));
850 aes_word_t a2_r2 = aes_nohw_xor(a2, aes_nohw_rotate_rows_twice(a2));
851 aes_word_t a3_r3 = aes_nohw_xor(a3, aes_nohw_rotate_rows_twice(a3));
852 aes_word_t a4_r4 = aes_nohw_xor(a4, aes_nohw_rotate_rows_twice(a4));
853 aes_word_t a5_r5 = aes_nohw_xor(a5, aes_nohw_rotate_rows_twice(a5));
854 aes_word_t a6_r6 = aes_nohw_xor(a6, aes_nohw_rotate_rows_twice(a6));
855 aes_word_t a7_r7 = aes_nohw_xor(a7, aes_nohw_rotate_rows_twice(a7));
856
857 batch->w[0] = aes_nohw_xor(a0, a6_r6);
858 batch->w[1] = aes_nohw_xor(a1, aes_nohw_xor(a6_r6, a7_r7));
859 batch->w[2] = aes_nohw_xor(a2, aes_nohw_xor(a0_r0, a7_r7));
860 batch->w[3] = aes_nohw_xor(a3, aes_nohw_xor(a1_r1, a6_r6));
861 batch->w[4] =
862 aes_nohw_xor(aes_nohw_xor(a4, a2_r2), aes_nohw_xor(a6_r6, a7_r7));
863 batch->w[5] = aes_nohw_xor(a5, aes_nohw_xor(a3_r3, a7_r7));
864 batch->w[6] = aes_nohw_xor(a6, a4_r4);
865 batch->w[7] = aes_nohw_xor(a7, a5_r5);
866
867 // Apply the [02 03 01 01] matrix, which is just MixColumns.
868 aes_nohw_mix_columns(batch);
869 }
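
// As a spot check on the factorization above (added comment): the top-left
// entry of the matrix product is 02·05 ⊕ 03·00 ⊕ 01·04 ⊕ 01·00 = 0a ⊕ 04 = 0e
// in GF(2^8), matching the inverse MixColumns matrix.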
870
871 static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key,
872 size_t num_rounds, AES_NOHW_BATCH *batch) {
873 aes_nohw_add_round_key(batch, &key->keys[0]);
874 for (size_t i = 1; i < num_rounds; i++) {
875 aes_nohw_sub_bytes(batch);
876 aes_nohw_shift_rows(batch);
877 aes_nohw_mix_columns(batch);
878 aes_nohw_add_round_key(batch, &key->keys[i]);
879 }
880 aes_nohw_sub_bytes(batch);
881 aes_nohw_shift_rows(batch);
882 aes_nohw_add_round_key(batch, &key->keys[num_rounds]);
883 }
884
885 static void aes_nohw_decrypt_batch(const AES_NOHW_SCHEDULE *key,
886 size_t num_rounds, AES_NOHW_BATCH *batch) {
887 aes_nohw_add_round_key(batch, &key->keys[num_rounds]);
888 aes_nohw_inv_shift_rows(batch);
889 aes_nohw_inv_sub_bytes(batch);
890 for (size_t i = num_rounds - 1; i > 0; i--) {
891 aes_nohw_add_round_key(batch, &key->keys[i]);
892 aes_nohw_inv_mix_columns(batch);
893 aes_nohw_inv_shift_rows(batch);
894 aes_nohw_inv_sub_bytes(batch);
895 }
896 aes_nohw_add_round_key(batch, &key->keys[0]);
897 }
898
899
900 // Key schedule.
901
902 static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out,
903 const AES_KEY *key) {
904 for (size_t i = 0; i <= key->rounds; i++) {
905 // Copy the round key into each block in the batch.
906 for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) {
907 aes_word_t tmp[AES_NOHW_BLOCK_WORDS];
908 memcpy(tmp, key->rd_key + 4 * i, 16);
909 aes_nohw_batch_set(&out->keys[i], tmp, j);
910 }
911 aes_nohw_transpose(&out->keys[i]);
912 }
913 }
914
915 static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10,
916 0x20, 0x40, 0x80, 0x1b, 0x36};
917
918 // aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in
919 // |rcon|, stored in an |aes_word_t|.
920 static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) {
921 rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1);
922 #if defined(OPENSSL_SSE2)
923 return _mm_set_epi32(0, 0, 0, rcon);
924 #else
925 return ((aes_word_t)rcon);
926 #endif
927 }
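
// For example (added illustration): in the 32-bit implementation
// |AES_NOHW_BATCH_SIZE| is 2, so aes_nohw_rcon_slice(0x1b, i) extracts bit
// pairs of 0x1b = 0b00011011. Slice 0 is 0b11, slice 1 is 0b10, slice 2 is
// 0b01, and slice 3 is 0b00, one slice per word of a compacted block.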
928
929 static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
930 const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
931 AES_NOHW_BATCH batch;
932 memset(&batch, 0, sizeof(batch));
933 aes_nohw_batch_set(&batch, in, 0);
934 aes_nohw_transpose(&batch);
935 aes_nohw_sub_bytes(&batch);
936 aes_nohw_transpose(&batch);
937 aes_nohw_batch_get(&batch, out, 0);
938 }
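
// A quick sanity check for |aes_nohw_sub_block| (illustrative sketch only):
// the AES S-box maps 0x00 to 0x63, so substituting an all-zero block must
// produce sixteen 0x63 bytes.
//
//   uint8_t zero[16] = {0}, out[16];
//   aes_word_t in[AES_NOHW_BLOCK_WORDS], sub[AES_NOHW_BLOCK_WORDS];
//   aes_nohw_compact_block(in, zero);
//   aes_nohw_sub_block(sub, in);
//   aes_nohw_uncompact_block(out, sub);
//   // Every byte of |out| is now 0x63.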
939
940 static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) {
941 key->rounds = 10;
942
943 aes_word_t block[AES_NOHW_BLOCK_WORDS];
944 aes_nohw_compact_block(block, in);
945 memcpy(key->rd_key, block, 16);
946
947 for (size_t i = 1; i <= 10; i++) {
948 aes_word_t sub[AES_NOHW_BLOCK_WORDS];
949 aes_nohw_sub_block(sub, block);
950 uint8_t rcon = aes_nohw_rcon[i - 1];
951 for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
952 // Incorporate |rcon| and the transformed word into the first word.
953 block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j));
954 block[j] = aes_nohw_xor(
955 block[j],
956 aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
957 // Propagate to the remaining words. Note this is reordered from the usual
958 // formulation to avoid needing masks.
959 aes_word_t v = block[j];
960 block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4));
961 block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8));
962 block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12));
963 }
964 memcpy(key->rd_key + 4 * i, block, 16);
965 }
966 }
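
// As a reference point (added comment; values from FIPS-197, Appendix A.1):
// for the key 2b7e151628aed2a6abf7158809cf4f3c, the first expanded round key
// is a0fafe17 88542cb1 23a33939 2a6c7605. Note |key->rd_key| stores each round
// key in the compact bit ordering described above, so the bytes only match
// after passing them through |aes_nohw_uncompact_block|.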
967
968 static void aes_nohw_setup_key_192(AES_KEY *key, const uint8_t in[24]) {
969 key->rounds = 12;
970
971 aes_word_t storage1[AES_NOHW_BLOCK_WORDS], storage2[AES_NOHW_BLOCK_WORDS];
972 aes_word_t *block1 = storage1, *block2 = storage2;
973
974 // AES-192's key schedule is complex because each key schedule iteration
975 // produces six words, but we compute on blocks and each block is four words.
976 // We maintain a sliding window of two blocks, filled to 1.5 blocks at a time.
977 // We loop below every three blocks or two key schedule iterations.
978 //
979 // On entry to the loop, |block1| and the first half of |block2| contain the
980 // previous key schedule iteration. |block1| has been written to |key|, but
981 // |block2| has not as it is incomplete.
982 aes_nohw_compact_block(block1, in);
983 memcpy(key->rd_key, block1, 16);
984
985 uint8_t half_block[16] = {0};
986 memcpy(half_block, in + 16, 8);
987 aes_nohw_compact_block(block2, half_block);
988
989 for (size_t i = 0; i < 4; i++) {
990 aes_word_t sub[AES_NOHW_BLOCK_WORDS];
991 aes_nohw_sub_block(sub, block2);
992 uint8_t rcon = aes_nohw_rcon[2 * i];
993 for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
994 // Compute the first two words of the next key schedule iteration, which
995 // go in the second half of |block2|. The first two words of the previous
996 // iteration are in the first half of |block1|. Apply |rcon| here too
997 // because the shifts match.
998 block2[j] = aes_nohw_or(
999 block2[j],
1000 aes_nohw_shift_left(
1001 aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)), 8));
1002 // Incorporate the transformed word and propagate. Note the last word of
1003 // the previous iteration corresponds to the second word of |sub|. This
1004 // is incorporated into the first word of the next iteration, or the third
1005 // word of |block2|.
1006 block2[j] = aes_nohw_xor(
1007 block2[j], aes_nohw_and(aes_nohw_shift_left(
1008 aes_nohw_rotate_rows_down(sub[j]), 4),
1009 AES_NOHW_COL2_MASK));
1010 block2[j] = aes_nohw_xor(
1011 block2[j],
1012 aes_nohw_and(aes_nohw_shift_left(block2[j], 4), AES_NOHW_COL3_MASK));
1013
1014 // Compute the remaining four words, which fill |block1|. Begin by moving
1015 // the corresponding words of the previous iteration: the second half of
1016 // |block1| and the first half of |block2|.
1017 block1[j] = aes_nohw_shift_right(block1[j], 8);
1018 block1[j] = aes_nohw_or(block1[j], aes_nohw_shift_left(block2[j], 8));
1019 // Incorporate the second word, computed previously in |block2|, and
1020 // propagate.
1021 block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12));
1022 aes_word_t v = block1[j];
1023 block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4));
1024 block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8));
1025 block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12));
1026 }
1027
1028 // This completes two round keys. Note half of |block2| was computed in the
1029 // previous loop iteration but was not yet output.
1030 memcpy(key->rd_key + 4 * (3 * i + 1), block2, 16);
1031 memcpy(key->rd_key + 4 * (3 * i + 2), block1, 16);
1032
1033 aes_nohw_sub_block(sub, block1);
1034 rcon = aes_nohw_rcon[2 * i + 1];
1035 for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
1036 // Compute the first four words of the next key schedule iteration in
1037 // |block2|. Begin by moving the corresponding words of the previous
1038 // iteration: the second half of |block2| and the first half of |block1|.
1039 block2[j] = aes_nohw_shift_right(block2[j], 8);
1040 block2[j] = aes_nohw_or(block2[j], aes_nohw_shift_left(block1[j], 8));
1041 // Incorporate rcon and the transformed word. Note the last word of the
1042 // previous iteration corresponds to the last word of |sub|.
1043 block2[j] = aes_nohw_xor(block2[j], aes_nohw_rcon_slice(rcon, j));
1044 block2[j] = aes_nohw_xor(
1045 block2[j],
1046 aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
1047 // Propagate to the remaining words.
1048 aes_word_t v = block2[j];
1049 block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4));
1050 block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8));
1051 block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12));
1052
1053 // Compute the last two words, which go in the first half of |block1|. The
1054 // last two words of the previous iteration are in the second half of
1055 // |block1|.
1056 block1[j] = aes_nohw_shift_right(block1[j], 8);
1057 // Propagate blocks and mask off the excess.
1058 block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12));
1059 block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(block1[j], 4));
1060 block1[j] = aes_nohw_and(block1[j], AES_NOHW_COL01_MASK);
1061 }
1062
1063 // |block2| has a complete round key. |block1| will be completed in the next
1064 // iteration.
1065 memcpy(key->rd_key + 4 * (3 * i + 3), block2, 16);
1066
1067 // Swap blocks to restore the invariant.
1068 aes_word_t *tmp = block1;
1069 block1 = block2;
1070 block2 = tmp;
1071 }
1072 }
1073
1074 static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) {
1075 key->rounds = 14;
1076
1077 // Each key schedule iteration produces two round keys.
1078 aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS];
1079 aes_nohw_compact_block(block1, in);
1080 memcpy(key->rd_key, block1, 16);
1081
1082 aes_nohw_compact_block(block2, in + 16);
1083 memcpy(key->rd_key + 4, block2, 16);
1084
1085 for (size_t i = 2; i <= 14; i += 2) {
1086 aes_word_t sub[AES_NOHW_BLOCK_WORDS];
1087 aes_nohw_sub_block(sub, block2);
1088 uint8_t rcon = aes_nohw_rcon[i / 2 - 1];
1089 for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
1090 // Incorporate |rcon| and the transformed word into the first word.
1091 block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j));
1092 block1[j] = aes_nohw_xor(
1093 block1[j],
1094 aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
1095 // Propagate to the remaining words.
1096 aes_word_t v = block1[j];
1097 block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4));
1098 block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8));
1099 block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12));
1100 }
1101 memcpy(key->rd_key + 4 * i, block1, 16);
1102
1103 if (i == 14) {
1104 break;
1105 }
1106
1107 aes_nohw_sub_block(sub, block1);
1108 for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
1109 // Incorporate the transformed word into the first word.
1110 block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12));
1111 // Propagate to the remaining words.
1112 aes_word_t v = block2[j];
1113 block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4));
1114 block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8));
1115 block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12));
1116 }
1117 memcpy(key->rd_key + 4 * (i + 1), block2, 16);
1118 }
1119 }
1120
1121
1122 // External API.
1123
1124 int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
1125 AES_KEY *aeskey) {
1126 switch (bits) {
1127 case 128:
1128 aes_nohw_setup_key_128(aeskey, key);
1129 return 0;
1130 case 192:
1131 aes_nohw_setup_key_192(aeskey, key);
1132 return 0;
1133 case 256:
1134 aes_nohw_setup_key_256(aeskey, key);
1135 return 0;
1136 }
1137 return 1;
1138 }
1139
1140 int aes_nohw_set_decrypt_key(const uint8_t *key, unsigned bits,
1141 AES_KEY *aeskey) {
1142 return aes_nohw_set_encrypt_key(key, bits, aeskey);
1143 }
1144
1145 void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
1146 AES_NOHW_SCHEDULE sched;
1147 aes_nohw_expand_round_keys(&sched, key);
1148 AES_NOHW_BATCH batch;
1149 aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
1150 aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
1151 aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
1152 }
1153
1154 void aes_nohw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
1155 AES_NOHW_SCHEDULE sched;
1156 aes_nohw_expand_round_keys(&sched, key);
1157 AES_NOHW_BATCH batch;
1158 aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
1159 aes_nohw_decrypt_batch(&sched, key->rounds, &batch);
1160 aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
1161 }
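
// A minimal usage sketch of the functions above (illustrative only), using the
// single-block example from FIPS-197, Appendix B:
//
//   AES_KEY key;
//   static const uint8_t kKey[16] = {0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2,
//                                    0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf,
//                                    0x4f, 0x3c};
//   static const uint8_t kPlain[16] = {0x32, 0x43, 0xf6, 0xa8, 0x88, 0x5a,
//                                      0x30, 0x8d, 0x31, 0x31, 0x98, 0xa2,
//                                      0xe0, 0x37, 0x07, 0x34};
//   uint8_t cipher[16], plain[16];
//   aes_nohw_set_encrypt_key(kKey, 128, &key);
//   aes_nohw_encrypt(kPlain, cipher, &key);
//   // |cipher| should be 3925841d02dc09fbdc118597196a0b32.
//   aes_nohw_decrypt(cipher, plain, &key);
//   // |plain| should equal kPlain again.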
1162
1163 static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16],
1164 const uint8_t b[16]) {
1165 for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) {
1166 aes_word_t x, y;
1167 memcpy(&x, a + i, sizeof(aes_word_t));
1168 memcpy(&y, b + i, sizeof(aes_word_t));
1169 x = aes_nohw_xor(x, y);
1170 memcpy(out + i, &x, sizeof(aes_word_t));
1171 }
1172 }
1173
1174 void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
1175 size_t blocks, const AES_KEY *key,
1176 const uint8_t ivec[16]) {
1177 if (blocks == 0) {
1178 return;
1179 }
1180
1181 AES_NOHW_SCHEDULE sched;
1182 aes_nohw_expand_round_keys(&sched, key);
1183
1184 // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|.
1185 alignas(AES_NOHW_WORD_SIZE) uint8_t ivs[AES_NOHW_BATCH_SIZE * 16];
1186 alignas(AES_NOHW_WORD_SIZE) uint8_t enc_ivs[AES_NOHW_BATCH_SIZE * 16];
1187 for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
1188 memcpy(ivs + 16 * i, ivec, 16);
1189 }
1190
1191 uint32_t ctr = CRYPTO_load_u32_be(ivs + 12);
1192 for (;;) {
1193 // Update counters.
1194 for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
1195 CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + (uint32_t)i);
1196 }
1197
1198 size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
1199 AES_NOHW_BATCH batch;
1200 aes_nohw_to_batch(&batch, ivs, todo);
1201 aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
1202 aes_nohw_from_batch(enc_ivs, todo, &batch);
1203
1204 for (size_t i = 0; i < todo; i++) {
1205 aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs + 16 * i);
1206 }
1207
1208 blocks -= todo;
1209 if (blocks == 0) {
1210 break;
1211 }
1212
1213 in += 16 * AES_NOHW_BATCH_SIZE;
1214 out += 16 * AES_NOHW_BATCH_SIZE;
1215 ctr += AES_NOHW_BATCH_SIZE;
1216 }
1217 }
1218
1219 void aes_nohw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
1220 const AES_KEY *key, uint8_t *ivec, const int enc) {
1221 assert(len % 16 == 0);
1222 size_t blocks = len / 16;
1223 if (blocks == 0) {
1224 return;
1225 }
1226
1227 AES_NOHW_SCHEDULE sched;
1228 aes_nohw_expand_round_keys(&sched, key);
1229 alignas(AES_NOHW_WORD_SIZE) uint8_t iv[16];
1230 memcpy(iv, ivec, 16);
1231
1232 if (enc) {
1233 // CBC encryption is not parallelizable.
1234 while (blocks > 0) {
1235 aes_nohw_xor_block(iv, iv, in);
1236
1237 AES_NOHW_BATCH batch;
1238 aes_nohw_to_batch(&batch, iv, /*num_blocks=*/1);
1239 aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
1240 aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
1241
1242 memcpy(iv, out, 16);
1243
1244 in += 16;
1245 out += 16;
1246 blocks--;
1247 }
1248 memcpy(ivec, iv, 16);
1249 return;
1250 }
1251
1252 for (;;) {
1253 size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
1254 // Make a copy of the input so we can decrypt in-place.
1255 alignas(AES_NOHW_WORD_SIZE) uint8_t copy[AES_NOHW_BATCH_SIZE * 16];
1256 memcpy(copy, in, todo * 16);
1257
1258 AES_NOHW_BATCH batch;
1259 aes_nohw_to_batch(&batch, in, todo);
1260 aes_nohw_decrypt_batch(&sched, key->rounds, &batch);
1261 aes_nohw_from_batch(out, todo, &batch);
1262
1263 aes_nohw_xor_block(out, out, iv);
1264 for (size_t i = 1; i < todo; i++) {
1265 aes_nohw_xor_block(out + 16 * i, out + 16 * i, copy + 16 * (i - 1));
1266 }
1267
1268 // Save the last block as the IV.
1269 memcpy(iv, copy + 16 * (todo - 1), 16);
1270
1271 blocks -= todo;
1272 if (blocks == 0) {
1273 break;
1274 }
1275
1276 in += 16 * AES_NOHW_BATCH_SIZE;
1277 out += 16 * AES_NOHW_BATCH_SIZE;
1278 }
1279
1280 memcpy(ivec, iv, 16);
1281 }
1282