1 /* crc32_braid.c -- compute the CRC-32 of a data stream
2 * Copyright (C) 1995-2006, 2010, 2011, 2012, 2016, 2018 Mark Adler
3 * For conditions of distribution and use, see copyright notice in zlib.h
4 *
5 * This interleaved implementation of a CRC makes use of pipelined multiple
6 * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8 */
9
10 #include "zbuild.h"
11 #include "zutil.h"
12 #include "functable.h"
13 #include "crc32_braid_p.h"
14 #include "crc32_braid_tbl.h"
15
16 /* ========================================================================= */
17
PREFIX(get_crc_table)18 const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) {
19 return (const uint32_t *)crc_table;
20 }
21
22 #ifdef ZLIB_COMPAT
PREFIX(crc32_z)23 unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) {
24 if (buf == NULL) return 0;
25
26 return (unsigned long)functable.crc32((uint32_t)crc, buf, len);
27 }
28 #else
PREFIX(crc32_z)29 uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) {
30 if (buf == NULL) return 0;
31
32 return functable.crc32(crc, buf, len);
33 }
34 #endif
35
36 #ifdef ZLIB_COMPAT
PREFIX(crc32)37 unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) {
38 return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len);
39 }
40 #else
PREFIX(crc32)41 uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) {
42 return PREFIX(crc32_z)(crc, buf, len);
43 }
44 #endif
45
46 /* ========================================================================= */
47
48 /*
49 A CRC of a message is computed on N braids of words in the message, where
50 each word consists of W bytes (4 or 8). If N is 3, for example, then three
51 running sparse CRCs are calculated respectively on each braid, at these
52 indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
53 This is done starting at a word boundary, and continues until as many blocks
54 of N * W bytes as are available have been processed. The results are combined
55 into a single CRC at the end. For this code, N must be in the range 1..6 and
56 W must be 4 or 8. The upper limit on N can be increased if desired by adding
57 more #if blocks, extending the patterns apparent in the code. In addition,
58 crc32 tables would need to be regenerated, if the maximum N value is increased.
59
60 N and W are chosen empirically by benchmarking the execution time on a given
61 processor. The choices for N and W below were based on testing on Intel Kaby
62 Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
63 Octeon II processors. The Intel, AMD, and ARM processors were all fastest
64 with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
65 They were all tested with either gcc or clang, all using the -O3 optimization
66 level. Your mileage may vary.
67 */
68
69 /* ========================================================================= */
70
71 #if BYTE_ORDER == LITTLE_ENDIAN
72 # define ZSWAPWORD(word) (word)
73 # define BRAID_TABLE crc_braid_table
74 #elif BYTE_ORDER == BIG_ENDIAN
75 # if W == 8
76 # define ZSWAPWORD(word) ZSWAP64(word)
77 # elif W == 4
78 # define ZSWAPWORD(word) ZSWAP32(word)
79 # endif
80 # define BRAID_TABLE crc_braid_big_table
81 #else
82 # error "No endian defined"
83 #endif
84 #define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
85 #define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
86
87 /* ========================================================================= */
88 #ifdef W
89 /*
90 Return the CRC of the W bytes in the word_t data, taking the
91 least-significant byte of the word as the first byte of data, without any pre
92 or post conditioning. This is used to combine the CRCs of each braid.
93 */
94 #if BYTE_ORDER == LITTLE_ENDIAN
crc_word(z_word_t data)95 static uint32_t crc_word(z_word_t data) {
96 int k;
97 for (k = 0; k < W; k++)
98 data = (data >> 8) ^ crc_table[data & 0xff];
99 return (uint32_t)data;
100 }
101 #elif BYTE_ORDER == BIG_ENDIAN
crc_word(z_word_t data)102 static z_word_t crc_word(z_word_t data) {
103 int k;
104 for (k = 0; k < W; k++)
105 data = (data << 8) ^
106 crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
107 return data;
108 }
109 #endif /* BYTE_ORDER */
110
111 #endif /* W */
112
113 /* ========================================================================= */
crc32_braid(uint32_t crc,const unsigned char * buf,uint64_t len)114 Z_INTERNAL uint32_t crc32_braid(uint32_t crc, const unsigned char *buf, uint64_t len) {
115 Z_REGISTER uint32_t c;
116
117 /* Pre-condition the CRC */
118 c = (~crc) & 0xffffffff;
119
120 #ifdef W
121 /* If provided enough bytes, do a braided CRC calculation. */
122 if (len >= N * W + W - 1) {
123 uint64_t blks;
124 z_word_t const *words;
125 int k;
126
127 /* Compute the CRC up to a z_word_t boundary. */
128 while (len && ((z_size_t)buf & (W - 1)) != 0) {
129 len--;
130 DO1;
131 }
132
133 /* Compute the CRC on as many N z_word_t blocks as are available. */
134 blks = len / (N * W);
135 len -= blks * N * W;
136 words = (z_word_t const *)buf;
137
138 z_word_t crc0, word0, comb;
139 #if N > 1
140 z_word_t crc1, word1;
141 #if N > 2
142 z_word_t crc2, word2;
143 #if N > 3
144 z_word_t crc3, word3;
145 #if N > 4
146 z_word_t crc4, word4;
147 #if N > 5
148 z_word_t crc5, word5;
149 #endif
150 #endif
151 #endif
152 #endif
153 #endif
154 /* Initialize the CRC for each braid. */
155 crc0 = ZSWAPWORD(c);
156 #if N > 1
157 crc1 = 0;
158 #if N > 2
159 crc2 = 0;
160 #if N > 3
161 crc3 = 0;
162 #if N > 4
163 crc4 = 0;
164 #if N > 5
165 crc5 = 0;
166 #endif
167 #endif
168 #endif
169 #endif
170 #endif
171 /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
172 while (--blks) {
173 /* Load the word for each braid into registers. */
174 word0 = crc0 ^ words[0];
175 #if N > 1
176 word1 = crc1 ^ words[1];
177 #if N > 2
178 word2 = crc2 ^ words[2];
179 #if N > 3
180 word3 = crc3 ^ words[3];
181 #if N > 4
182 word4 = crc4 ^ words[4];
183 #if N > 5
184 word5 = crc5 ^ words[5];
185 #endif
186 #endif
187 #endif
188 #endif
189 #endif
190 words += N;
191
192 /* Compute and update the CRC for each word. The loop should get unrolled. */
193 crc0 = BRAID_TABLE[0][word0 & 0xff];
194 #if N > 1
195 crc1 = BRAID_TABLE[0][word1 & 0xff];
196 #if N > 2
197 crc2 = BRAID_TABLE[0][word2 & 0xff];
198 #if N > 3
199 crc3 = BRAID_TABLE[0][word3 & 0xff];
200 #if N > 4
201 crc4 = BRAID_TABLE[0][word4 & 0xff];
202 #if N > 5
203 crc5 = BRAID_TABLE[0][word5 & 0xff];
204 #endif
205 #endif
206 #endif
207 #endif
208 #endif
209 for (k = 1; k < W; k++) {
210 crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
211 #if N > 1
212 crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
213 #if N > 2
214 crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
215 #if N > 3
216 crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
217 #if N > 4
218 crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
219 #if N > 5
220 crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
221 #endif
222 #endif
223 #endif
224 #endif
225 #endif
226 }
227 }
228
229 /* Process the last block, combining the CRCs of the N braids at the same time. */
230 comb = crc_word(crc0 ^ words[0]);
231 #if N > 1
232 comb = crc_word(crc1 ^ words[1] ^ comb);
233 #if N > 2
234 comb = crc_word(crc2 ^ words[2] ^ comb);
235 #if N > 3
236 comb = crc_word(crc3 ^ words[3] ^ comb);
237 #if N > 4
238 comb = crc_word(crc4 ^ words[4] ^ comb);
239 #if N > 5
240 comb = crc_word(crc5 ^ words[5] ^ comb);
241 #endif
242 #endif
243 #endif
244 #endif
245 #endif
246 words += N;
247 c = ZSWAPWORD(comb);
248
249 /* Update the pointer to the remaining bytes to process. */
250 buf = (const unsigned char *)words;
251 }
252
253 #endif /* W */
254
255 /* Complete the computation of the CRC on any remaining bytes. */
256 while (len >= 8) {
257 len -= 8;
258 DO8;
259 }
260 while (len) {
261 len--;
262 DO1;
263 }
264
265 /* Return the CRC, post-conditioned. */
266 return c ^ 0xffffffff;
267 }
268