/* crc32 for POWER8 using VSX instructions
 * Copyright (C) 2021 IBM Corporation
 *
 * Author: Rogerio Alves <[email protected]>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in these 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * http://en.wikipedia.org/wiki/Barrett_reduction
 *
 * This code uses gcc vector builtins instead of using assembly directly.
 */
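
/*
 * Illustrative sketch only (kept as a comment, not compiled): the reflected
 * CRC-32 that this file accelerates, written as a plain bitwise loop over the
 * zlib polynomial 0xEDB88320. crc32_align() below computes the same result one
 * byte at a time through crc_table; the vector code computes it 128 bytes at a
 * time. The helper name crc32_bitwise is hypothetical.
 *
 *   static uint32_t crc32_bitwise(uint32_t crc, const unsigned char *p, unsigned long len) {
 *       crc = ~crc;                      // same pre-inversion as crc32_power8()
 *       while (len--) {
 *           crc ^= *p++;
 *           for (int k = 0; k < 8; k++)  // one message bit per step
 *               crc = (crc >> 1) ^ (0xEDB88320U & (0U - (crc & 1U)));
 *       }
 *       return ~crc;                     // same post-inversion as crc32_power8()
 *   }
 */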

#include <altivec.h>
#include "zendian.h"
#include "zbuild.h"

#include "crc32_constants.h"
#include "crc32_braid_tbl.h"

#if defined (__clang__)
#include "fallback_builtins.h"
#endif

#define MAX_SIZE    32768
#define VMX_ALIGN   16
#define VMX_ALIGN_MASK  (VMX_ALIGN-1)

static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
    while (len--)
        crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
    return crc;
}

static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);

Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, uint64_t _len) {
    unsigned int prealign;
    unsigned int tail;

    unsigned long len = (unsigned long) _len;

    if (p == (const unsigned char *) 0x0)
        return 0;

    crc ^= 0xffffffff;

    if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
        crc = crc32_align(crc, p, len);
        goto out;
    }

    if ((unsigned long)p & VMX_ALIGN_MASK) {
        prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
        crc = crc32_align(crc, p, prealign);
        len -= prealign;
        p += prealign;
    }

    crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);

    tail = len & VMX_ALIGN_MASK;
    if (tail) {
        p += len & ~VMX_ALIGN_MASK;
        crc = crc32_align(crc, p, tail);
    }

out:
    crc ^= 0xffffffff;

    return crc;
}

/* When a load and a store in the same dispatch group overlap in address such
 * that store forwarding is not allowed (load-hit-store), the group must be
 * flushed. A group-ending NOP prevents the flush.
 */
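/* "ori 2,2,0" assembles to a no-op that the POWER8 dispatcher recognizes as
 * ending the current dispatch group; the "memory" clobber also makes the
 * statement act as a compiler barrier.
 */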
#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")

#if BYTE_ORDER == BIG_ENDIAN
#define BYTESWAP_DATA
#endif

#ifdef BYTESWAP_DATA
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
#if BYTE_ORDER == LITTLE_ENDIAN
/* Byte reverse permute constant LE. */
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
#else
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0x0706050403020100UL };
#endif
#else
#define VEC_PERM(vr, va, vb, vc)
#endif
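
/*
 * On big-endian builds vperm_const above makes VEC_PERM reverse the 16 bytes
 * of each vector (result byte i = input byte 15 - i), so the data is handed to
 * the vpmsum builtins in the same byte order as on little-endian. On
 * little-endian builds BYTESWAP_DATA is not defined and VEC_PERM expands to
 * nothing.
 */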

static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {

    const __vector unsigned long long vzero = {0,0};
    const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};

    const __vector unsigned long long vmask_32bit =
        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);

    const __vector unsigned long long vmask_64bit =
        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);

    __vector unsigned long long vcrc;

    __vector unsigned long long vconst1, vconst2;

    /* vdata0-vdata7 will contain our data (p). */
    __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;

    /* v0-v7 will contain our checksums */
    __vector unsigned long long v0 = {0,0};
    __vector unsigned long long v1 = {0,0};
    __vector unsigned long long v2 = {0,0};
    __vector unsigned long long v3 = {0,0};
    __vector unsigned long long v4 = {0,0};
    __vector unsigned long long v5 = {0,0};
    __vector unsigned long long v6 = {0,0};
    __vector unsigned long long v7 = {0,0};


    /* Vector auxiliary variables. */
    __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;

    unsigned int offset; /* Constant table offset. */

    unsigned long i; /* Counter. */
    unsigned long chunks;

    unsigned long block_size;
    int next_block = 0;

    /* Round the length down to a multiple of 128 bytes (1024 bits). The last
     * 128-byte block will be processed at the end. */
    unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;

    vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);

    /* Short version. */
    if (len < 256) {
        /* Calculate where in the constant table we need to start. */
        offset = 256 - len;
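        /* Worked example: len == 32 gives offset == 224, so the code below
         * reads the constants at byte offsets 224 and 240 of vcrc_short_const. */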

        vconst1 = vec_ld(offset, vcrc_short_const);
        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
        VEC_PERM(vdata0, vdata0, vconst1, vperm_const);

        /* xor initial value */
        vdata0 = vec_xor(vdata0, vcrc);

        vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
        v0 = vec_xor(v0, vdata0);

        for (i = 16; i < len; i += 16) {
            vconst1 = vec_ld(offset + i, vcrc_short_const);
            vdata0 = vec_ld(i, (__vector unsigned long long*) p);
            VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
            vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
                (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
            v0 = vec_xor(v0, vdata0);
        }
    } else {

        /* Load initial values. */
        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
        vdata1 = vec_ld(16, (__vector unsigned long long*) p);

        VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
        VEC_PERM(vdata1, vdata1, vdata1, vperm_const);

        vdata2 = vec_ld(32, (__vector unsigned long long*) p);
        vdata3 = vec_ld(48, (__vector unsigned long long*) p);

        VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
        VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

        vdata4 = vec_ld(64, (__vector unsigned long long*) p);
        vdata5 = vec_ld(80, (__vector unsigned long long*) p);

        VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
        VEC_PERM(vdata5, vdata5, vdata5, vperm_const);

        vdata6 = vec_ld(96, (__vector unsigned long long*) p);
        vdata7 = vec_ld(112, (__vector unsigned long long*) p);

        VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
        VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

        /* xor in initial value */
        vdata0 = vec_xor(vdata0, vcrc);

        p = (char *)p + 128;

        do {
            /* Checksum in blocks of MAX_SIZE. */
            block_size = length;
            if (block_size > MAX_SIZE) {
                block_size = MAX_SIZE;
            }

            length = length - block_size;

            /*
             * Work out the offset into the constants table to start at. Each
             * 16-byte constant covers 128 bytes of input data (128 / 16 = 8),
             * so the starting offset for a block_size-byte block is
             * (MAX_SIZE - block_size) / 8.
             */
            offset = (MAX_SIZE/8) - (block_size/8);
            /* We reduce our final 128 bytes in a separate step */
            chunks = (block_size/128)-1;
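            /* Worked example: a full 32 kB block has block_size == MAX_SIZE,
             * so offset == 0 and chunks == 255; a final 256-byte block gives
             * offset == 4064 and chunks == 1. */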

            vconst1 = vec_ld(offset, vcrc_const);

            va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                                           (__vector unsigned long long)vconst1);
            va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                                           (__vector unsigned long long)vconst1);
            va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                                           (__vector unsigned long long)vconst1);
            va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                                           (__vector unsigned long long)vconst1);
            va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                                           (__vector unsigned long long)vconst1);
            va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                                           (__vector unsigned long long)vconst1);
            va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                                           (__vector unsigned long long)vconst1);
            va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                                           (__vector unsigned long long)vconst1);

            if (chunks > 1) {
                offset += 16;
                vconst2 = vec_ld(offset, vcrc_const);
                GROUP_ENDING_NOP;

                vdata0 = vec_ld(0, (__vector unsigned long long*) p);
                VEC_PERM(vdata0, vdata0, vdata0, vperm_const);

                vdata1 = vec_ld(16, (__vector unsigned long long*) p);
                VEC_PERM(vdata1, vdata1, vdata1, vperm_const);

                vdata2 = vec_ld(32, (__vector unsigned long long*) p);
                VEC_PERM(vdata2, vdata2, vdata2, vperm_const);

                vdata3 = vec_ld(48, (__vector unsigned long long*) p);
                VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

                vdata4 = vec_ld(64, (__vector unsigned long long*) p);
                VEC_PERM(vdata4, vdata4, vdata4, vperm_const);

                vdata5 = vec_ld(80, (__vector unsigned long long*) p);
                VEC_PERM(vdata5, vdata5, vdata5, vperm_const);

                vdata6 = vec_ld(96, (__vector unsigned long long*) p);
                VEC_PERM(vdata6, vdata6, vdata6, vperm_const);

                vdata7 = vec_ld(112, (__vector unsigned long long*) p);
                VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

                p = (char *)p + 128;

                /*
                 * main loop. Each iteration calculates the CRC for a 128-byte
                 * block.
                 */
                for (i = 0; i < chunks-2; i++) {
                    vconst1 = vec_ld(offset, vcrc_const);
                    offset += 16;
                    GROUP_ENDING_NOP;

                    v0 = vec_xor(v0, va0);
                    va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                                                   (__vector unsigned long long)vconst2);
                    vdata0 = vec_ld(0, (__vector unsigned long long*) p);
                    VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
                    GROUP_ENDING_NOP;

                    v1 = vec_xor(v1, va1);
                    va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                                                   (__vector unsigned long long)vconst2);
                    vdata1 = vec_ld(16, (__vector unsigned long long*) p);
                    VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
                    GROUP_ENDING_NOP;

                    v2 = vec_xor(v2, va2);
                    va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                                                   (__vector unsigned long long)vconst2);
                    vdata2 = vec_ld(32, (__vector unsigned long long*) p);
                    VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
                    GROUP_ENDING_NOP;

                    v3 = vec_xor(v3, va3);
                    va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                                                   (__vector unsigned long long)vconst2);
                    vdata3 = vec_ld(48, (__vector unsigned long long*) p);
                    VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

                    vconst2 = vec_ld(offset, vcrc_const);
                    GROUP_ENDING_NOP;

                    v4 = vec_xor(v4, va4);
                    va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                                                   (__vector unsigned long long)vconst1);
                    vdata4 = vec_ld(64, (__vector unsigned long long*) p);
                    VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
                    GROUP_ENDING_NOP;

                    v5 = vec_xor(v5, va5);
                    va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                                                   (__vector unsigned long long)vconst1);
                    vdata5 = vec_ld(80, (__vector unsigned long long*) p);
                    VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
                    GROUP_ENDING_NOP;

                    v6 = vec_xor(v6, va6);
                    va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                                                   (__vector unsigned long long)vconst1);
                    vdata6 = vec_ld(96, (__vector unsigned long long*) p);
                    VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
                    GROUP_ENDING_NOP;

                    v7 = vec_xor(v7, va7);
                    va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                                                   (__vector unsigned long long)vconst1);
                    vdata7 = vec_ld(112, (__vector unsigned long long*) p);
                    VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

                    p = (char *)p + 128;
                }

                /* First cool down */
                vconst1 = vec_ld(offset, vcrc_const);
                offset += 16;

                v0 = vec_xor(v0, va0);
                va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v1 = vec_xor(v1, va1);
                va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v2 = vec_xor(v2, va2);
                va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v3 = vec_xor(v3, va3);
                va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v4 = vec_xor(v4, va4);
                va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v5 = vec_xor(v5, va5);
                va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v6 = vec_xor(v6, va6);
                va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v7 = vec_xor(v7, va7);
                va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                                               (__vector unsigned long long)vconst1);
            }/* else */

            /* Second cool down. */
            v0 = vec_xor(v0, va0);
            v1 = vec_xor(v1, va1);
            v2 = vec_xor(v2, va2);
            v3 = vec_xor(v3, va3);
            v4 = vec_xor(v4, va4);
            v5 = vec_xor(v5, va5);
            v6 = vec_xor(v6, va6);
            v7 = vec_xor(v7, va7);

            /*
             * vpmsumd produces a 96 bit result in the least significant bits
             * of the register. Since we are bit reflected we have to shift it
             * left 32 bits so it occupies the least significant bits in the
             * bit reflected domain.
             */
            v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
                                                      (__vector unsigned char)vzero, 4);
            v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
                                                      (__vector unsigned char)vzero, 4);
            v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
                                                      (__vector unsigned char)vzero, 4);
            v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
                                                      (__vector unsigned char)vzero, 4);
            v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
                                                      (__vector unsigned char)vzero, 4);
            v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
                                                      (__vector unsigned char)vzero, 4);
            v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
                                                      (__vector unsigned char)vzero, 4);
            v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
                                                      (__vector unsigned char)vzero, 4);

            /* xor with the last 1024 bits. */
            va0 = vec_ld(0, (__vector unsigned long long*) p);
            VEC_PERM(va0, va0, va0, vperm_const);

            va1 = vec_ld(16, (__vector unsigned long long*) p);
            VEC_PERM(va1, va1, va1, vperm_const);

            va2 = vec_ld(32, (__vector unsigned long long*) p);
            VEC_PERM(va2, va2, va2, vperm_const);

            va3 = vec_ld(48, (__vector unsigned long long*) p);
            VEC_PERM(va3, va3, va3, vperm_const);

            va4 = vec_ld(64, (__vector unsigned long long*) p);
            VEC_PERM(va4, va4, va4, vperm_const);

            va5 = vec_ld(80, (__vector unsigned long long*) p);
            VEC_PERM(va5, va5, va5, vperm_const);

            va6 = vec_ld(96, (__vector unsigned long long*) p);
            VEC_PERM(va6, va6, va6, vperm_const);

            va7 = vec_ld(112, (__vector unsigned long long*) p);
            VEC_PERM(va7, va7, va7, vperm_const);

            p = (char *)p + 128;

            vdata0 = vec_xor(v0, va0);
            vdata1 = vec_xor(v1, va1);
            vdata2 = vec_xor(v2, va2);
            vdata3 = vec_xor(v3, va3);
            vdata4 = vec_xor(v4, va4);
            vdata5 = vec_xor(v5, va5);
            vdata6 = vec_xor(v6, va6);
            vdata7 = vec_xor(v7, va7);

            /* Check if we have more blocks to process */
            next_block = 0;
            if (length != 0) {
                next_block = 1;

                /* zero v0-v7 */
                v0 = vec_xor(v0, v0);
                v1 = vec_xor(v1, v1);
                v2 = vec_xor(v2, v2);
                v3 = vec_xor(v3, v3);
                v4 = vec_xor(v4, v4);
                v5 = vec_xor(v5, v5);
                v6 = vec_xor(v6, v6);
                v7 = vec_xor(v7, v7);
            }
            length = length + 128;

        } while (next_block);

        /* Calculate how many bytes we have left. */
        length = (len & 127);

        /* Calculate where in the (short) constant table we need to start. */
        offset = 128 - length;

        v0 = vec_ld(offset, vcrc_short_const);
        v1 = vec_ld(offset + 16, vcrc_short_const);
        v2 = vec_ld(offset + 32, vcrc_short_const);
        v3 = vec_ld(offset + 48, vcrc_short_const);
        v4 = vec_ld(offset + 64, vcrc_short_const);
        v5 = vec_ld(offset + 80, vcrc_short_const);
        v6 = vec_ld(offset + 96, vcrc_short_const);
        v7 = vec_ld(offset + 112, vcrc_short_const);

        offset += 128;

        v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata0, (__vector unsigned int)v0);
        v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata1, (__vector unsigned int)v1);
        v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata2, (__vector unsigned int)v2);
        v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata3, (__vector unsigned int)v3);
        v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata4, (__vector unsigned int)v4);
        v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata5, (__vector unsigned int)v5);
        v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata6, (__vector unsigned int)v6);
        v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata7, (__vector unsigned int)v7);

        /* Now reduce the tail (0-112 bytes). */
        for (i = 0; i < length; i += 16) {
            vdata0 = vec_ld(i, (__vector unsigned long long*)p);
            VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
            va0 = vec_ld(offset + i, vcrc_short_const);
            va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
                (__vector unsigned int)vdata0, (__vector unsigned int)va0);
            v0 = vec_xor(v0, va0);
        }

        /* xor all parallel chunks together. */
        v0 = vec_xor(v0, v1);
        v2 = vec_xor(v2, v3);
        v4 = vec_xor(v4, v5);
        v6 = vec_xor(v6, v7);

        v0 = vec_xor(v0, v2);
        v4 = vec_xor(v4, v6);

        v0 = vec_xor(v0, v4);
    }

    /* Barrett Reduction */
    vconst1 = vec_ld(0, v_Barrett_const);
    vconst2 = vec_ld(16, v_Barrett_const);

    v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
                                              (__vector unsigned char)v0, 8);
    v0 = vec_xor(v1, v0);

    /* shift left one bit */
    __vector unsigned char vsht_splat = vec_splat_u8(1);
    v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);

    v0 = vec_and(v0, vmask_64bit);

    /*
     * The reflected version of Barrett reduction. Instead of bit
     * reflecting our data (which is expensive to do), we bit reflect our
     * constants and our algorithm, which means the intermediate data in
     * our vector registers goes from 0-63 instead of 63-0. We can reflect
     * the algorithm because we don't carry in mod 2 arithmetic.
     */
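    /* Concretely, with the 64-bit value a in v0, the Barrett constant m in
     * vconst1 and the (reflected) CRC polynomial n in vconst2, the steps below
     * compute, in carry-less GF(2) arithmetic:
     *   ma = low32(a) * m,  qn = low32(ma) * n,  a = a xor qn
     * matching the "ma", "qn" and "a - qn" annotations that follow.
     */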

    /* bottom 32 bits of a */
    v1 = vec_and(v0, vmask_32bit);

    /* ma */
    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
                                  (__vector unsigned long long)vconst1);

    /* bottom 32 bits of ma */
    v1 = vec_and(v1, vmask_32bit);
    /* qn */
    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
                                  (__vector unsigned long long)vconst2);
    /* a - qn, subtraction is xor in GF(2) */
    v0 = vec_xor(v0, v1);

    /*
     * Since we are bit reflected, the result (ie the low 32 bits) is in
     * the high 32 bits. We just need to shift it left 4 bytes:
     * V0 [ 0 1 X 3 ]
     * V0 [ 0 X 2 3 ]
     */

    /* shift result into top 64 bits */
    v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
                                              (__vector unsigned char)vzero, 4);

#if BYTE_ORDER == BIG_ENDIAN
    return v0[0];
#else
    return v0[1];
#endif
}
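
/*
 * Usage sketch (illustrative, assuming a direct call rather than dispatch
 * through the library's function table): a fresh checksum starts from crc == 0
 * and the pre/post xor with 0xffffffff is handled inside crc32_power8(), e.g.
 *
 *   uint32_t crc = crc32_power8(0, buf, buf_len);
 *
 * where buf and buf_len are a caller-supplied buffer and its length.
 */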