/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

// This implementation of poly1305 is by Andrew Moon
// (https://github.com/floodyberry/poly1305-donna) and released as public
// domain. It implements SIMD vectorization based on the algorithm described in
// http://cr.yp.to/papers.html#neoncrypto. It is unrolled to 2 powers, i.e. a
// 64-byte block size.
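//
// A minimal usage sketch of the three-call interface implemented below
// (illustrative only; |key| must be a fresh 32-byte one-time key, and the
// data/data_len names are placeholders):
//
//   poly1305_state st;
//   uint8_t mac[16];
//   CRYPTO_poly1305_init(&st, key);
//   CRYPTO_poly1305_update(&st, data, data_len);
//   CRYPTO_poly1305_finish(&st, mac);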

#include <openssl/poly1305.h>

#include <assert.h>

#include "../internal.h"


#if defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_X86_64)

#include <emmintrin.h>

typedef __m128i xmmi;

static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = {
    (1 << 24), 0, (1 << 24), 0};

static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; }

static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; }

static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) {
  return (uint128_t)a * b;
}

static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; }

static inline uint64_t shr128(uint128_t v, const int shift) {
  return (uint64_t)(v >> shift);
}

static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
}

typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;

typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5];  // 80 bytes
    uint64_t HH[10];
  };
  // uint64_t r0, r1, r2;    [24 bytes]
  // uint64_t pad0, pad1;    [16 bytes]
  uint64_t started;    // 8 bytes
  uint64_t leftover;   // 8 bytes
  uint8_t buffer[64];  // 64 bytes
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */

static_assert(sizeof(struct poly1305_state_internal_t) + 63 <=
                  sizeof(poly1305_state),
              "poly1305_state isn't large enough to hold aligned "
              "poly1305_state_internal_t");

static inline poly1305_state_internal *poly1305_aligned_state(
    poly1305_state *state) {
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}

static inline size_t poly1305_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}

void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *p;
  uint64_t r0, r1, r2;
  uint64_t t0, t1;

  // clamp key
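  // (Poly1305 clamp: r &= 0x0ffffffc0ffffffc0ffffffc0fffffff.)  The clamped
  // r is then split into three limbs r0, r1, r2 holding bits 0-43, 44-87 and
  // 88-127, i.e. a 44-bit radix, which is what the masks below implement.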
  t0 = CRYPTO_load_u64_le(key + 0);
  t1 = CRYPTO_load_u64_le(key + 8);
  r0 = t0 & 0xffc0fffffff;
  t0 >>= 44;
  t0 |= t1 << 20;
  r1 = t0 & 0xfffffc0ffff;
  t1 >>= 24;
  r2 = t1 & 0x00ffffffc0f;

  // store r in unused space of st->P[1]
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);

  // store pad
  p->R23.d[1] = CRYPTO_load_u32_le(key + 16);
  p->R23.d[3] = CRYPTO_load_u32_le(key + 20);
  p->R24.d[1] = CRYPTO_load_u32_le(key + 24);
  p->R24.d[3] = CRYPTO_load_u32_le(key + 28);

  // H = 0
  st->H[0] = _mm_setzero_si128();
  st->H[1] = _mm_setzero_si128();
  st->H[2] = _mm_setzero_si128();
  st->H[3] = _mm_setzero_si128();
  st->H[4] = _mm_setzero_si128();

  st->started = 0;
  st->leftover = 0;
}

static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  // pull out stored info
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

  // compute powers r^2,r^4
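  // Each pass of the loop below squares (r20, r21, r22): after the first
  // pass they hold r^2 (written into st->P[1]), after the second, r^4
  // (written into st->P[0]).  Each power is re-split into five 26-bit limbs,
  // broadcast into both 64-bit lanes of the R2x vectors, and the matching
  // S2x = 5*R2x values are precomputed for the 2^130 = 5 (mod p) wraparound.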
  r20 = r0;
  r21 = r1;
  r22 = r2;
  for (i = 0; i < 2; i++) {
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
    p--;
  }

  // put saved info back
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  // H = [Mx,My]
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                          _mm_loadl_epi64((const xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                          _mm_loadl_epi64((const xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}

static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  while (bytes >= 64) {
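    // Each iteration absorbs 64 bytes (four 16-byte blocks): the running
    // two-lane accumulator H is scaled by [r^4,r^4], the first pair of new
    // blocks is multiplied by [r^2,r^2], and the second pair is added as-is.
    // The two lanes are later weighted by r^2 and r and summed in
    // poly1305_combine.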
    // H *= [r^4,r^4]
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]*[r^2,r^2]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
                            _mm_loadl_epi64((const xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
                            _mm_loadl_epi64((const xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
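    // Lazy carry reduction mod 2^130 - 5: two interleaved carry chains
    // propagate the 26-bit limb overflows (0 -> 1 -> 2 -> 3 -> 4 and
    // 3 -> 4 -> 0 -> 1), with the carry out of the top limb multiplied by 5
    // and folded back into limb 0, since 2^130 = 5 (mod p).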
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}

static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  // p = [r^2,r^2]
  p = &st->P[1];

  if (bytes >= 32) {
    // H *= [r^2,r^2]
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

  // finalize, H *= [r^2,r]
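  // Reload the original r (kept in the d[1]/d[3] slots of st->P[1]) and
  // write its 26-bit limbs into lane 1 (the d[2] slots) while lane 0 still
  // holds r^2.  The multiply below therefore weights the two accumulators by
  // r^2 and r respectively before they are summed.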
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

  // H *= [r^2,r]
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  // H = H[0]+H[1]
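  // A shift/add horizontal sum collapses the two 64-bit lanes of each limb
  // vector into a single accumulator; the scalar code below then carries the
  // five 26-bit limbs and repacks them as three 44/44/42-bit limbs in st->HH
  // for the scalar finish in CRYPTO_poly1305_finish.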
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);

  return consumed;
}

void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
                            size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t want;

  // Work around a C language bug. See https://crbug.com/1019588.
  if (bytes == 0) {
    return;
  }

  // need at least 32 initial bytes to start the accelerated branch
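  // The SIMD pipeline processes two 16-byte blocks per lane, so the first
  // call must see a full 32-byte pair before poly1305_first_block can seed
  // H; until then, input is accumulated in st->buffer.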
  if (!st->started) {
    if ((st->leftover == 0) && (bytes > 32)) {
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      want = poly1305_min(32 - st->leftover, bytes);
      OPENSSL_memcpy(st->buffer + st->leftover, m, want);
      bytes -= want;
      m += want;
      st->leftover += want;
      if ((st->leftover < 32) || (bytes == 0)) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }

  // handle leftover
  if (st->leftover) {
    want = poly1305_min(64 - st->leftover, bytes);
    OPENSSL_memcpy(st->buffer + st->leftover, m, want);
    bytes -= want;
    m += want;
    st->leftover += want;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  // process 64 byte blocks
  if (bytes >= 64) {
    want = (bytes & ~63);
    poly1305_blocks(st, m, want);
    m += want;
    bytes -= want;
  }

  if (bytes) {
    OPENSSL_memcpy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}

void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t leftover = st->leftover;
  uint8_t *m = st->buffer;
  uint128_t d[3];
  uint64_t h0, h1, h2;
  uint64_t t0, t1;
  uint64_t g0, g1, g2, c, nc;
  uint64_t r0, r1, r2, s1, s2;
  poly1305_power *p;

  if (st->started) {
    size_t consumed = poly1305_combine(st, m, leftover);
    leftover -= consumed;
    m += consumed;
  }

  // st->HH will either be 0 or have the combined result
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);

  if (leftover < 16) {
    goto poly1305_donna_atmost15bytes;
  }

poly1305_donna_atleast16bytes:
  t0 = CRYPTO_load_u64_le(m + 0);
  t1 = CRYPTO_load_u64_le(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);

poly1305_donna_mul:
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
  h0 = lo128(d[0]) & 0xfffffffffff;
  c = shr128(d[0], 44);
  d[1] = add128_64(d[1], c);
  h1 = lo128(d[1]) & 0xfffffffffff;
  c = shr128(d[1], 44);
  d[2] = add128_64(d[2], c);
  h2 = lo128(d[2]) & 0x3ffffffffff;
  c = shr128(d[2], 42);
  h0 += c * 5;

  m += 16;
  leftover -= 16;
  if (leftover >= 16) {
    goto poly1305_donna_atleast16bytes;
  }

  // final bytes
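  // Any remaining partial block is padded with a 0x01 byte and zeros up to
  // 16 bytes.  Unlike the full blocks above, the padded block does not get
  // the extra 2^128 bit (the 1 << 40 term at this radix) added to h2.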
poly1305_donna_atmost15bytes:
  if (!leftover) {
    goto poly1305_donna_finish;
  }

  m[leftover++] = 1;
  OPENSSL_memset(m + leftover, 0, 16 - leftover);
  leftover = 16;

  t0 = CRYPTO_load_u64_le(m + 0);
  t1 = CRYPTO_load_u64_le(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24);

  goto poly1305_donna_mul;

poly1305_donna_finish:
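  // Fully carry h, then compute g = h + 5 - 2^130.  If that subtraction did
  // not borrow, h >= p and g = h - p is the canonical result, so it is
  // selected; otherwise h is kept.  The mask c is derived from g2's sign bit
  // so the selection is constant-time.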
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  h1 += c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  h2 += c;
  c = (h2 >> 42);
  h2 &= 0x3ffffffffff;
  h0 += c * 5;

  g0 = h0 + 5;
  c = (g0 >> 44);
  g0 &= 0xfffffffffff;
  g1 = h1 + c;
  c = (g1 >> 44);
  g1 &= 0xfffffffffff;
  g2 = h2 + c - ((uint64_t)1 << 42);

  c = (g2 >> 63) - 1;
  nc = ~c;
  h0 = (h0 & nc) | (g0 & c);
  h1 = (h1 & nc) | (g1 & c);
  h2 = (h2 & nc) | (g2 & c);

  // pad
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
  h0 += (t0 & 0xfffffffffff);
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += (t0 & 0xfffffffffff) + c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  t1 = (t1 >> 24);
  h2 += t1 + c;

  CRYPTO_store_u64_le(mac + 0, ((h0) | (h1 << 44)));
  CRYPTO_store_u64_le(mac + 8, ((h1 >> 20) | (h2 << 24)));
}

#endif  // BORINGSSL_HAS_UINT128 && OPENSSL_X86_64