1 /* crc32_braid.c -- compute the CRC-32 of a data stream
2  * Copyright (C) 1995-2006, 2010, 2011, 2012, 2016, 2018 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  *
5  * This interleaved implementation of a CRC makes use of pipelined multiple
6  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8  */
9 
10 #include "zbuild.h"
11 #include "zutil.h"
12 #include "functable.h"
13 #include "crc32_braid_p.h"
14 #include "crc32_braid_tbl.h"
15 
16 /* ========================================================================= */
17 
PREFIX(get_crc_table)18 const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) {
19     return (const uint32_t *)crc_table;
20 }
21 
22 #ifdef ZLIB_COMPAT
PREFIX(crc32_z)23 unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) {
24     if (buf == NULL) return 0;
25 
26     return (unsigned long)functable.crc32((uint32_t)crc, buf, len);
27 }
28 #else
PREFIX(crc32_z)29 uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) {
30     if (buf == NULL) return 0;
31 
32     return functable.crc32(crc, buf, len);
33 }
34 #endif
35 
36 #ifdef ZLIB_COMPAT
PREFIX(crc32)37 unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) {
38     return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len);
39 }
40 #else
PREFIX(crc32)41 uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) {
42     return PREFIX(crc32_z)(crc, buf, len);
43 }
44 #endif
45 
46 /* ========================================================================= */
47 
48 /*
49   A CRC of a message is computed on N braids of words in the message, where
50   each word consists of W bytes (4 or 8). If N is 3, for example, then three
51   running sparse CRCs are calculated respectively on each braid, at these
52   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
53   This is done starting at a word boundary, and continues until as many blocks
54   of N * W bytes as are available have been processed. The results are combined
55   into a single CRC at the end. For this code, N must be in the range 1..6 and
56   W must be 4 or 8. The upper limit on N can be increased if desired by adding
57   more #if blocks, extending the patterns apparent in the code. In addition,
58   crc32 tables would need to be regenerated, if the maximum N value is increased.
59 
60   N and W are chosen empirically by benchmarking the execution time on a given
61   processor. The choices for N and W below were based on testing on Intel Kaby
62   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
63   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
64   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
65   They were all tested with either gcc or clang, all using the -O3 optimization
66   level. Your mileage may vary.
67 */
68 
69 /* ========================================================================= */
70 
71 #if BYTE_ORDER == LITTLE_ENDIAN
72 #  define ZSWAPWORD(word) (word)
73 #  define BRAID_TABLE crc_braid_table
74 #elif BYTE_ORDER == BIG_ENDIAN
75 #  if W == 8
76 #    define ZSWAPWORD(word) ZSWAP64(word)
77 #  elif W == 4
78 #    define ZSWAPWORD(word) ZSWAP32(word)
79 #  endif
80 #  define BRAID_TABLE crc_braid_big_table
81 #else
82 #  error "No endian defined"
83 #endif
84 #define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
85 #define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
86 
87 /* ========================================================================= */
88 #ifdef W
89 /*
90   Return the CRC of the W bytes in the word_t data, taking the
91   least-significant byte of the word as the first byte of data, without any pre
92   or post conditioning. This is used to combine the CRCs of each braid.
93  */
94 #if BYTE_ORDER == LITTLE_ENDIAN
crc_word(z_word_t data)95 static uint32_t crc_word(z_word_t data) {
96     int k;
97     for (k = 0; k < W; k++)
98         data = (data >> 8) ^ crc_table[data & 0xff];
99     return (uint32_t)data;
100 }
101 #elif BYTE_ORDER == BIG_ENDIAN
crc_word(z_word_t data)102 static z_word_t crc_word(z_word_t data) {
103     int k;
104     for (k = 0; k < W; k++)
105         data = (data << 8) ^
106             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
107     return data;
108 }
109 #endif /* BYTE_ORDER */
110 
111 #endif /* W */
112 
113 /* ========================================================================= */
crc32_braid(uint32_t crc,const unsigned char * buf,uint64_t len)114 Z_INTERNAL uint32_t crc32_braid(uint32_t crc, const unsigned char *buf, uint64_t len) {
115     Z_REGISTER uint32_t c;
116 
117     /* Pre-condition the CRC */
118     c = (~crc) & 0xffffffff;
119 
120 #ifdef W
121     /* If provided enough bytes, do a braided CRC calculation. */
122     if (len >= N * W + W - 1) {
123         uint64_t blks;
124         z_word_t const *words;
125         int k;
126 
127         /* Compute the CRC up to a z_word_t boundary. */
128         while (len && ((z_size_t)buf & (W - 1)) != 0) {
129             len--;
130             DO1;
131         }
132 
133         /* Compute the CRC on as many N z_word_t blocks as are available. */
134         blks = len / (N * W);
135         len -= blks * N * W;
136         words = (z_word_t const *)buf;
137 
138         z_word_t crc0, word0, comb;
139 #if N > 1
140         z_word_t crc1, word1;
141 #if N > 2
142         z_word_t crc2, word2;
143 #if N > 3
144         z_word_t crc3, word3;
145 #if N > 4
146         z_word_t crc4, word4;
147 #if N > 5
148         z_word_t crc5, word5;
149 #endif
150 #endif
151 #endif
152 #endif
153 #endif
154         /* Initialize the CRC for each braid. */
155         crc0 = ZSWAPWORD(c);
156 #if N > 1
157         crc1 = 0;
158 #if N > 2
159         crc2 = 0;
160 #if N > 3
161         crc3 = 0;
162 #if N > 4
163         crc4 = 0;
164 #if N > 5
165         crc5 = 0;
166 #endif
167 #endif
168 #endif
169 #endif
170 #endif
171         /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
172         while (--blks) {
173             /* Load the word for each braid into registers. */
174             word0 = crc0 ^ words[0];
175 #if N > 1
176             word1 = crc1 ^ words[1];
177 #if N > 2
178             word2 = crc2 ^ words[2];
179 #if N > 3
180             word3 = crc3 ^ words[3];
181 #if N > 4
182             word4 = crc4 ^ words[4];
183 #if N > 5
184             word5 = crc5 ^ words[5];
185 #endif
186 #endif
187 #endif
188 #endif
189 #endif
190             words += N;
191 
192             /* Compute and update the CRC for each word. The loop should get unrolled. */
193             crc0 = BRAID_TABLE[0][word0 & 0xff];
194 #if N > 1
195             crc1 = BRAID_TABLE[0][word1 & 0xff];
196 #if N > 2
197             crc2 = BRAID_TABLE[0][word2 & 0xff];
198 #if N > 3
199             crc3 = BRAID_TABLE[0][word3 & 0xff];
200 #if N > 4
201             crc4 = BRAID_TABLE[0][word4 & 0xff];
202 #if N > 5
203             crc5 = BRAID_TABLE[0][word5 & 0xff];
204 #endif
205 #endif
206 #endif
207 #endif
208 #endif
209             for (k = 1; k < W; k++) {
210                 crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
211 #if N > 1
212                 crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
213 #if N > 2
214                 crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
215 #if N > 3
216                 crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
217 #if N > 4
218                 crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
219 #if N > 5
220                 crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
221 #endif
222 #endif
223 #endif
224 #endif
225 #endif
226             }
227         }
228 
229         /* Process the last block, combining the CRCs of the N braids at the same time. */
230         comb = crc_word(crc0 ^ words[0]);
231 #if N > 1
232         comb = crc_word(crc1 ^ words[1] ^ comb);
233 #if N > 2
234         comb = crc_word(crc2 ^ words[2] ^ comb);
235 #if N > 3
236         comb = crc_word(crc3 ^ words[3] ^ comb);
237 #if N > 4
238         comb = crc_word(crc4 ^ words[4] ^ comb);
239 #if N > 5
240         comb = crc_word(crc5 ^ words[5] ^ comb);
241 #endif
242 #endif
243 #endif
244 #endif
245 #endif
246         words += N;
247         c = ZSWAPWORD(comb);
248 
249         /* Update the pointer to the remaining bytes to process. */
250         buf = (const unsigned char *)words;
251     }
252 
253 #endif /* W */
254 
255     /* Complete the computation of the CRC on any remaining bytes. */
256     while (len >= 8) {
257         len -= 8;
258         DO8;
259     }
260     while (len) {
261         len--;
262         DO1;
263     }
264 
265     /* Return the CRC, post-conditioned. */
266     return c ^ 0xffffffff;
267 }
268