/* crc32 for POWER8 using VSX instructions
 * Copyright (C) 2021 IBM Corporation
 *
 * Author: Rogerio Alves <[email protected]>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Calculate the checksum of data that is 16-byte aligned and a multiple of
 * 16 bytes long.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in these 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * http://en.wikipedia.org/wiki/Barrett_reduction
 *
 * This code uses gcc vector builtins instead of using assembly directly.
 */

#include <altivec.h>
#include "zendian.h"
#include "zbuild.h"

#include "crc32_constants.h"
#include "crc32_braid_tbl.h"

#if defined (__clang__)
#include "fallback_builtins.h"
#endif

#define MAX_SIZE 32768
#define VMX_ALIGN 16
#define VMX_ALIGN_MASK (VMX_ALIGN-1)

static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
    while (len--)
        crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
    return crc;
}

static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);

Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, uint64_t _len) {
    unsigned int prealign;
    unsigned int tail;

    unsigned long len = (unsigned long) _len;

    if (p == (const unsigned char *) 0x0)
        return 0;

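    /* CRC-32 pre/post conditioning: start from the inverted CRC and invert again on return. */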
    crc ^= 0xffffffff;

    if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
        crc = crc32_align(crc, p, len);
        goto out;
    }

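    /* Handle unaligned leading bytes one at a time until p is 16-byte aligned. */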
    if ((unsigned long)p & VMX_ALIGN_MASK) {
        prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
        crc = crc32_align(crc, p, prealign);
        len -= prealign;
        p += prealign;
    }

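    /* The vector routine consumes the aligned portion that is a multiple of 16 bytes;
     * any remaining tail bytes are finished with the byte-wise table CRC below. */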
    crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);

    tail = len & VMX_ALIGN_MASK;
    if (tail) {
        p += len & ~VMX_ALIGN_MASK;
        crc = crc32_align(crc, p, tail);
    }

out:
    crc ^= 0xffffffff;

    return crc;
}

/* When a load and a store in a single-dispatch group have overlapping addresses
 * such that forwarding is not allowed (load-hit-store), the group must be flushed.
 * A group-ending NOP prevents the flush.
 */
#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")
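/* ("ori 2,2,0" simply ORs r2 with zero, so it is architecturally a no-op.) */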

#if BYTE_ORDER == BIG_ENDIAN
#define BYTESWAP_DATA
#endif

#ifdef BYTESWAP_DATA
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
#if BYTE_ORDER == LITTLE_ENDIAN
/* Byte reverse permute constant LE. */
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
#else
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0x0706050403020100UL };
#endif
#else
#define VEC_PERM(vr, va, vb, vc)
#endif

static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {

    const __vector unsigned long long vzero = {0,0};
    const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};

    const __vector unsigned long long vmask_32bit =
        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);

    const __vector unsigned long long vmask_64bit =
        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);

    __vector unsigned long long vcrc;

    __vector unsigned long long vconst1, vconst2;

    /* vdata0-vdata7 will contain our data (p). */
    __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;

    /* v0-v7 will contain our checksums */
    __vector unsigned long long v0 = {0,0};
    __vector unsigned long long v1 = {0,0};
    __vector unsigned long long v2 = {0,0};
    __vector unsigned long long v3 = {0,0};
    __vector unsigned long long v4 = {0,0};
    __vector unsigned long long v5 = {0,0};
    __vector unsigned long long v6 = {0,0};
    __vector unsigned long long v7 = {0,0};


    /* Vector auxiliary variables. */
    __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;

    unsigned int offset; /* Constant table offset. */

    unsigned long i; /* Counter. */
    unsigned long chunks;

    unsigned long block_size;
    int next_block = 0;

    /* Round the length down to a multiple of 128 bytes; the remaining tail is processed at the end. */
    unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;

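    /* Bring the initial CRC into the vector domain: pack the 32-bit crc, alongside
     * zeros, into a 128-bit vector register. */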
    vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);

    /* Short version. */
    if (len < 256) {
        /* Calculate where in the constant table we need to start. */
        offset = 256 - len;

        vconst1 = vec_ld(offset, vcrc_short_const);
        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
        VEC_PERM(vdata0, vdata0, vconst1, vperm_const);

        /* xor initial value */
        vdata0 = vec_xor(vdata0, vcrc);

        vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
        v0 = vec_xor(v0, vdata0);

        for (i = 16; i < len; i += 16) {
            vconst1 = vec_ld(offset + i, vcrc_short_const);
            vdata0 = vec_ld(i, (__vector unsigned long long*) p);
            VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
            vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
                (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
            v0 = vec_xor(v0, vdata0);
        }
    } else {

        /* Load initial values. */
        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
        vdata1 = vec_ld(16, (__vector unsigned long long*) p);

        VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
        VEC_PERM(vdata1, vdata1, vdata1, vperm_const);

        vdata2 = vec_ld(32, (__vector unsigned long long*) p);
        vdata3 = vec_ld(48, (__vector unsigned long long*) p);

        VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
        VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

        vdata4 = vec_ld(64, (__vector unsigned long long*) p);
        vdata5 = vec_ld(80, (__vector unsigned long long*) p);

        VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
        VEC_PERM(vdata5, vdata5, vdata5, vperm_const);

        vdata6 = vec_ld(96, (__vector unsigned long long*) p);
        vdata7 = vec_ld(112, (__vector unsigned long long*) p);

        VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
        VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

        /* xor in initial value */
        vdata0 = vec_xor(vdata0, vcrc);

        p = (char *)p + 128;

        do {
            /* Checksum in blocks of MAX_SIZE. */
            block_size = length;
            if (block_size > MAX_SIZE) {
                block_size = MAX_SIZE;
            }

            length = length - block_size;

            /*
             * Work out the offset into the constants table to start at. Each
             * constant is 16 bytes, and it is used against 128 bytes of input
             * data - 128 / 16 = 8
             */
            offset = (MAX_SIZE/8) - (block_size/8);
            /* We reduce our final 128 bytes in a separate step */
            chunks = (block_size/128)-1;

            vconst1 = vec_ld(offset, vcrc_const);

            va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                (__vector unsigned long long)vconst1);
            va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                (__vector unsigned long long)vconst1);
            va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                (__vector unsigned long long)vconst1);
            va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                (__vector unsigned long long)vconst1);
            va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                (__vector unsigned long long)vconst1);
            va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                (__vector unsigned long long)vconst1);
            va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                (__vector unsigned long long)vconst1);
            va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                (__vector unsigned long long)vconst1);

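            /* More than one chunk remains: preload the next 128 bytes and run the
             * pipelined main loop below. Otherwise the products in va0-va7 fall
             * through to the second cool down. */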
            if (chunks > 1) {
                offset += 16;
                vconst2 = vec_ld(offset, vcrc_const);
                GROUP_ENDING_NOP;

                vdata0 = vec_ld(0, (__vector unsigned long long*) p);
                VEC_PERM(vdata0, vdata0, vdata0, vperm_const);

                vdata1 = vec_ld(16, (__vector unsigned long long*) p);
                VEC_PERM(vdata1, vdata1, vdata1, vperm_const);

                vdata2 = vec_ld(32, (__vector unsigned long long*) p);
                VEC_PERM(vdata2, vdata2, vdata2, vperm_const);

                vdata3 = vec_ld(48, (__vector unsigned long long*) p);
                VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

                vdata4 = vec_ld(64, (__vector unsigned long long*) p);
                VEC_PERM(vdata4, vdata4, vdata4, vperm_const);

                vdata5 = vec_ld(80, (__vector unsigned long long*) p);
                VEC_PERM(vdata5, vdata5, vdata5, vperm_const);

                vdata6 = vec_ld(96, (__vector unsigned long long*) p);
                VEC_PERM(vdata6, vdata6, vdata6, vperm_const);

                vdata7 = vec_ld(112, (__vector unsigned long long*) p);
                VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

                p = (char *)p + 128;

                /*
                 * main loop. Each iteration calculates the CRC for a 128-byte
                 * block.
                 */
                for (i = 0; i < chunks-2; i++) {
                    vconst1 = vec_ld(offset, vcrc_const);
                    offset += 16;
                    GROUP_ENDING_NOP;

                    v0 = vec_xor(v0, va0);
                    va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                        (__vector unsigned long long)vconst2);
                    vdata0 = vec_ld(0, (__vector unsigned long long*) p);
                    VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
                    GROUP_ENDING_NOP;

                    v1 = vec_xor(v1, va1);
                    va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                        (__vector unsigned long long)vconst2);
                    vdata1 = vec_ld(16, (__vector unsigned long long*) p);
                    VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
                    GROUP_ENDING_NOP;

                    v2 = vec_xor(v2, va2);
                    va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                        (__vector unsigned long long)vconst2);
                    vdata2 = vec_ld(32, (__vector unsigned long long*) p);
                    VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
                    GROUP_ENDING_NOP;

                    v3 = vec_xor(v3, va3);
                    va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                        (__vector unsigned long long)vconst2);
                    vdata3 = vec_ld(48, (__vector unsigned long long*) p);
                    VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

                    vconst2 = vec_ld(offset, vcrc_const);
                    GROUP_ENDING_NOP;

                    v4 = vec_xor(v4, va4);
                    va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                        (__vector unsigned long long)vconst1);
                    vdata4 = vec_ld(64, (__vector unsigned long long*) p);
                    VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
                    GROUP_ENDING_NOP;

                    v5 = vec_xor(v5, va5);
                    va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                        (__vector unsigned long long)vconst1);
                    vdata5 = vec_ld(80, (__vector unsigned long long*) p);
                    VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
                    GROUP_ENDING_NOP;

                    v6 = vec_xor(v6, va6);
                    va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                        (__vector unsigned long long)vconst1);
                    vdata6 = vec_ld(96, (__vector unsigned long long*) p);
                    VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
                    GROUP_ENDING_NOP;

                    v7 = vec_xor(v7, va7);
                    va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                        (__vector unsigned long long)vconst1);
                    vdata7 = vec_ld(112, (__vector unsigned long long*) p);
                    VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

                    p = (char *)p + 128;
                }

                /* First cool down */
                vconst1 = vec_ld(offset, vcrc_const);
                offset += 16;

                v0 = vec_xor(v0, va0);
                va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                    (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v1 = vec_xor(v1, va1);
                va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                    (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v2 = vec_xor(v2, va2);
                va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                    (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v3 = vec_xor(v3, va3);
                va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                    (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v4 = vec_xor(v4, va4);
                va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                    (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v5 = vec_xor(v5, va5);
                va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                    (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v6 = vec_xor(v6, va6);
                va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                    (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v7 = vec_xor(v7, va7);
                va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                    (__vector unsigned long long)vconst1);
            } /* if (chunks > 1) */

            /* Second cool down. */
            v0 = vec_xor(v0, va0);
            v1 = vec_xor(v1, va1);
            v2 = vec_xor(v2, va2);
            v3 = vec_xor(v3, va3);
            v4 = vec_xor(v4, va4);
            v5 = vec_xor(v5, va5);
            v6 = vec_xor(v6, va6);
            v7 = vec_xor(v7, va7);

            /*
             * vpmsumd produces a 96 bit result in the least significant bits
             * of the register. Since we are bit reflected we have to shift it
             * left 32 bits so it occupies the least significant bits in the
             * bit reflected domain.
             */
            v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
                (__vector unsigned char)vzero, 4);
            v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
                (__vector unsigned char)vzero, 4);
            v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
                (__vector unsigned char)vzero, 4);
            v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
                (__vector unsigned char)vzero, 4);
            v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
                (__vector unsigned char)vzero, 4);
            v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
                (__vector unsigned char)vzero, 4);
            v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
                (__vector unsigned char)vzero, 4);
            v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
                (__vector unsigned char)vzero, 4);

            /* xor with the last 1024 bits. */
            va0 = vec_ld(0, (__vector unsigned long long*) p);
            VEC_PERM(va0, va0, va0, vperm_const);

            va1 = vec_ld(16, (__vector unsigned long long*) p);
            VEC_PERM(va1, va1, va1, vperm_const);

            va2 = vec_ld(32, (__vector unsigned long long*) p);
            VEC_PERM(va2, va2, va2, vperm_const);

            va3 = vec_ld(48, (__vector unsigned long long*) p);
            VEC_PERM(va3, va3, va3, vperm_const);

            va4 = vec_ld(64, (__vector unsigned long long*) p);
            VEC_PERM(va4, va4, va4, vperm_const);

            va5 = vec_ld(80, (__vector unsigned long long*) p);
            VEC_PERM(va5, va5, va5, vperm_const);

            va6 = vec_ld(96, (__vector unsigned long long*) p);
            VEC_PERM(va6, va6, va6, vperm_const);

            va7 = vec_ld(112, (__vector unsigned long long*) p);
            VEC_PERM(va7, va7, va7, vperm_const);

            p = (char *)p + 128;

            vdata0 = vec_xor(v0, va0);
            vdata1 = vec_xor(v1, va1);
            vdata2 = vec_xor(v2, va2);
            vdata3 = vec_xor(v3, va3);
            vdata4 = vec_xor(v4, va4);
            vdata5 = vec_xor(v5, va5);
            vdata6 = vec_xor(v6, va6);
            vdata7 = vec_xor(v7, va7);

            /* Check if we have more blocks to process. */
            next_block = 0;
            if (length != 0) {
                next_block = 1;

                /* zero v0-v7 */
                v0 = vec_xor(v0, v0);
                v1 = vec_xor(v1, v1);
                v2 = vec_xor(v2, v2);
                v3 = vec_xor(v3, v3);
                v4 = vec_xor(v4, v4);
                v5 = vec_xor(v5, v5);
                v6 = vec_xor(v6, v6);
                v7 = vec_xor(v7, v7);
            }
            length = length + 128;

        } while (next_block);

        /* Calculate how many bytes we have left. */
        length = (len & 127);

        /* Calculate where in the (short) constant table we need to start. */
        offset = 128 - length;

        v0 = vec_ld(offset, vcrc_short_const);
        v1 = vec_ld(offset + 16, vcrc_short_const);
        v2 = vec_ld(offset + 32, vcrc_short_const);
        v3 = vec_ld(offset + 48, vcrc_short_const);
        v4 = vec_ld(offset + 64, vcrc_short_const);
        v5 = vec_ld(offset + 80, vcrc_short_const);
        v6 = vec_ld(offset + 96, vcrc_short_const);
        v7 = vec_ld(offset + 112, vcrc_short_const);

        offset += 128;

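        /* Fold the final 128 bytes held in vdata0-vdata7: multiply each 16-byte
         * chunk by its constant from the short table. */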
        v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata0, (__vector unsigned int)v0);
        v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata1, (__vector unsigned int)v1);
        v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata2, (__vector unsigned int)v2);
        v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata3, (__vector unsigned int)v3);
        v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata4, (__vector unsigned int)v4);
        v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata5, (__vector unsigned int)v5);
        v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata6, (__vector unsigned int)v6);
        v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata7, (__vector unsigned int)v7);

        /* Now reduce the tail (0-112 bytes). */
        for (i = 0; i < length; i += 16) {
            vdata0 = vec_ld(i, (__vector unsigned long long*)p);
            VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
            va0 = vec_ld(offset + i, vcrc_short_const);
            va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
                (__vector unsigned int)vdata0, (__vector unsigned int)va0);
            v0 = vec_xor(v0, va0);
        }

        /* xor all parallel chunks together. */
        v0 = vec_xor(v0, v1);
        v2 = vec_xor(v2, v3);
        v4 = vec_xor(v4, v5);
        v6 = vec_xor(v6, v7);

        v0 = vec_xor(v0, v2);
        v4 = vec_xor(v4, v6);

        v0 = vec_xor(v0, v4);
    }

    /* Barrett Reduction */
    vconst1 = vec_ld(0, v_Barrett_const);
    vconst2 = vec_ld(16, v_Barrett_const);

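    /* Fold the two 64-bit halves of v0 together into a single 64-bit value. */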
    v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
        (__vector unsigned char)v0, 8);
    v0 = vec_xor(v1, v0);

    /* shift left one bit */
    __vector unsigned char vsht_splat = vec_splat_u8(1);
    v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);

    v0 = vec_and(v0, vmask_64bit);

    /*
     * The reflected version of Barrett reduction. Instead of bit
     * reflecting our data (which is expensive to do), we bit reflect our
     * constants and our algorithm, which means the intermediate data in
     * our vector registers goes from 0-63 instead of 63-0. We can reflect
     * the algorithm because we don't carry in mod 2 arithmetic.
     */
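    /*
     * In outline, with every multiply carry-less (GF(2)) and the two constants
     * loaded from v_Barrett_const above:
     *   q = low32(m * low32(a))    (m is vconst1)
     *   result = a xor (q * n)     (n is vconst2)
     */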

    /* bottom 32 bits of a */
    v1 = vec_and(v0, vmask_32bit);

    /* ma */
    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
        (__vector unsigned long long)vconst1);

    /* bottom 32 bits of ma */
    v1 = vec_and(v1, vmask_32bit);
    /* qn */
    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
        (__vector unsigned long long)vconst2);
    /* a - qn, subtraction is xor in GF(2) */
    v0 = vec_xor(v0, v1);

    /*
     * Since we are bit reflected, the result (i.e. the low 32 bits) is in
     * the high 32 bits. We just need to shift it left 4 bytes:
     * V0 [ 0 1 X 3 ]
     * V0 [ 0 X 2 3 ]
     */

    /* Shift the result into the top 64 bits. */
    v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
        (__vector unsigned char)vzero, 4);

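    /* The reduced 32-bit CRC now sits in one 64-bit element of v0; which element
     * holds it depends on the target's vector element ordering. */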
#if BYTE_ORDER == BIG_ENDIAN
    return v0[0];
#else
    return v0[1];
#endif
}