xref: /aosp_15_r20/external/zlib/crc32.c (revision 86ee64e75fa5f8bce2c8c356138035642429cd05)
1 /* crc32.c -- compute the CRC-32 of a data stream
2  * Copyright (C) 1995-2022 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  *
5  * This interleaved implementation of a CRC makes use of pipelined multiple
6  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8  */
9 
10 /* @(#) $Id$ */
11 
12 /*
13   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
14   protection on the static variables used to control the first-use generation
15   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
16   first call get_crc_table() to initialize the tables before allowing more than
17   one thread to use crc32().
18 
19   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
20   produced, so that this one source file can be compiled to an executable.
21  */
22 
23 #ifdef MAKECRCH
24 #  include <stdio.h>
25 #  ifndef DYNAMIC_CRC_TABLE
26 #    define DYNAMIC_CRC_TABLE
27 #  endif /* !DYNAMIC_CRC_TABLE */
28 #endif /* MAKECRCH */
29 
30 #include "deflate.h"
31 #include "cpu_features.h"
32 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
33 
34 #if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
35 #include "crc32_simd.h"
36 #endif
37 
38  /*
39   A CRC of a message is computed on N braids of words in the message, where
40   each word consists of W bytes (4 or 8). If N is 3, for example, then three
41   running sparse CRCs are calculated respectively on each braid, at these
42   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
43   This is done starting at a word boundary, and continues until as many blocks
44   of N * W bytes as are available have been processed. The results are combined
45   into a single CRC at the end. For this code, N must be in the range 1..6 and
46   W must be 4 or 8. The upper limit on N can be increased if desired by adding
47   more #if blocks, extending the patterns apparent in the code. In addition,
48   crc32.h would need to be regenerated, if the maximum N value is increased.
49 
50   N and W are chosen empirically by benchmarking the execution time on a given
51   processor. The choices for N and W below were based on testing on Intel Kaby
52   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
53   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
54   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
55   They were all tested with either gcc or clang, all using the -O3 optimization
56   level. Your mileage may vary.
57  */
58 
59 /* Define N */
60 #ifdef Z_TESTN
61 #  define N Z_TESTN
62 #else
63 #  define N 5
64 #endif
65 #if N < 1 || N > 6
66 #  error N must be in 1..6
67 #endif
68 
69 /*
70   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
71   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
72   that bytes are eight bits.
73  */
74 
75 /*
76   Define W and the associated z_word_t type. If W is not defined, then a
77   braided calculation is not used, and the associated tables and code are not
78   compiled.
79  */
80 #ifdef Z_TESTW
81 #  if Z_TESTW-1 != -1
82 #    define W Z_TESTW
83 #  endif
84 #else
85 #  ifdef MAKECRCH
86 #    define W 8         /* required for MAKECRCH */
87 #  else
88 #    if defined(__x86_64__) || defined(__aarch64__)
89 #      define W 8
90 #    else
91 #      define W 4
92 #    endif
93 #  endif
94 #endif
95 #ifdef W
96 #  if W == 8 && defined(Z_U8)
97      typedef Z_U8 z_word_t;
98 #  elif defined(Z_U4)
99 #    undef W
100 #    define W 4
101      typedef Z_U4 z_word_t;
102 #  else
103 #    undef W
104 #  endif
105 #endif
106 
107 /* If available, use the ARM processor CRC32 instruction. */
108 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8 \
109     && defined(USE_CANONICAL_ARMV8_CRC32)
110 #  define ARMCRC32_CANONICAL_ZLIB
111 #endif
112 
113 #if defined(W) && (!defined(ARMCRC32_CANONICAL_ZLIB) || defined(DYNAMIC_CRC_TABLE))
114 /*
115   Swap the bytes in a z_word_t to convert between little and big endian. Any
116   self-respecting compiler will optimize this to a single machine byte-swap
117   instruction, if one is available. This assumes that word_t is either 32 bits
118   or 64 bits.
119  */
byte_swap(z_word_t word)120 local z_word_t byte_swap(z_word_t word) {
121 #  if W == 8
122     return
123         (word & 0xff00000000000000) >> 56 |
124         (word & 0xff000000000000) >> 40 |
125         (word & 0xff0000000000) >> 24 |
126         (word & 0xff00000000) >> 8 |
127         (word & 0xff000000) << 8 |
128         (word & 0xff0000) << 24 |
129         (word & 0xff00) << 40 |
130         (word & 0xff) << 56;
131 #  else   /* W == 4 */
132     return
133         (word & 0xff000000) >> 24 |
134         (word & 0xff0000) >> 8 |
135         (word & 0xff00) << 8 |
136         (word & 0xff) << 24;
137 #  endif
138 }
139 #endif
140 
141 #ifdef DYNAMIC_CRC_TABLE
142 /* =========================================================================
143  * Table of powers of x for combining CRC-32s, filled in by make_crc_table()
144  * below.
145  */
146    local z_crc_t FAR x2n_table[32];
147 #else
148 /* =========================================================================
149  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
150  * of x for combining CRC-32s, all made by make_crc_table().
151  */
152 #  include "crc32.h"
153 #endif
154 
155 /* CRC polynomial. */
156 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
157 
158 /*
159   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
160   reflected. For speed, this requires that a not be zero.
161  */
multmodp(z_crc_t a,z_crc_t b)162 local z_crc_t multmodp(z_crc_t a, z_crc_t b) {
163     z_crc_t m, p;
164 
165     m = (z_crc_t)1 << 31;
166     p = 0;
167     for (;;) {
168         if (a & m) {
169             p ^= b;
170             if ((a & (m - 1)) == 0)
171                 break;
172         }
173         m >>= 1;
174         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
175     }
176     return p;
177 }
178 
179 /*
180   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
181   initialized.
182  */
x2nmodp(z_off64_t n,unsigned k)183 local z_crc_t x2nmodp(z_off64_t n, unsigned k) {
184     z_crc_t p;
185 
186     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
187     while (n) {
188         if (n & 1)
189             p = multmodp(x2n_table[k & 31], p);
190         n >>= 1;
191         k++;
192     }
193     return p;
194 }
195 
196 #ifdef DYNAMIC_CRC_TABLE
197 /* =========================================================================
198  * Build the tables for byte-wise and braided CRC-32 calculations, and a table
199  * of powers of x for combining CRC-32s.
200  */
201 local z_crc_t FAR crc_table[256];
202 #ifdef W
203    local z_word_t FAR crc_big_table[256];
204    local z_crc_t FAR crc_braid_table[W][256];
205    local z_word_t FAR crc_braid_big_table[W][256];
206    local void braid(z_crc_t [][256], z_word_t [][256], int, int);
207 #endif
208 #ifdef MAKECRCH
209    local void write_table(FILE *, const z_crc_t FAR *, int);
210    local void write_table32hi(FILE *, const z_word_t FAR *, int);
211    local void write_table64(FILE *, const z_word_t FAR *, int);
212 #endif /* MAKECRCH */
213 
214 /*
215   Define a once() function depending on the availability of atomics. If this is
216   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
217   multiple threads, and if atomics are not available, then get_crc_table() must
218   be called to initialize the tables and must return before any threads are
219   allowed to compute or combine CRCs.
220  */
221 
222 /* Definition of once functionality. */
223 typedef struct once_s once_t;
224 
225 /* Check for the availability of atomics. */
226 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
227     !defined(__STDC_NO_ATOMICS__)
228 
229 #include <stdatomic.h>
230 
231 /* Structure for once(), which must be initialized with ONCE_INIT. */
232 struct once_s {
233     atomic_flag begun;
234     atomic_int done;
235 };
236 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
237 
238 /*
239   Run the provided init() function exactly once, even if multiple threads
240   invoke once() at the same time. The state must be a once_t initialized with
241   ONCE_INIT.
242  */
once(once_t * state,void (* init)(void))243 local void once(once_t *state, void (*init)(void)) {
244     if (!atomic_load(&state->done)) {
245         if (atomic_flag_test_and_set(&state->begun))
246             while (!atomic_load(&state->done))
247                 ;
248         else {
249             init();
250             atomic_store(&state->done, 1);
251         }
252     }
253 }
254 
255 #else   /* no atomics */
256 
257 /* Structure for once(), which must be initialized with ONCE_INIT. */
258 struct once_s {
259     volatile int begun;
260     volatile int done;
261 };
262 #define ONCE_INIT {0, 0}
263 
264 /* Test and set. Alas, not atomic, but tries to minimize the period of
265    vulnerability. */
test_and_set(int volatile * flag)266 local int test_and_set(int volatile *flag) {
267     int was;
268 
269     was = *flag;
270     *flag = 1;
271     return was;
272 }
273 
274 /* Run the provided init() function once. This is not thread-safe. */
once(once_t * state,void (* init)(void))275 local void once(once_t *state, void (*init)(void)) {
276     if (!state->done) {
277         if (test_and_set(&state->begun))
278             while (!state->done)
279                 ;
280         else {
281             init();
282             state->done = 1;
283         }
284     }
285 }
286 
287 #endif
288 
289 /* State for once(). */
290 local once_t made = ONCE_INIT;
291 
292 /*
293   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
294   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
295 
296   Polynomials over GF(2) are represented in binary, one bit per coefficient,
297   with the lowest powers in the most significant bit. Then adding polynomials
298   is just exclusive-or, and multiplying a polynomial by x is a right shift by
299   one. If we call the above polynomial p, and represent a byte as the
300   polynomial q, also with the lowest power in the most significant bit (so the
301   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
302   where a mod b means the remainder after dividing a by b.
303 
304   This calculation is done using the shift-register method of multiplying and
305   taking the remainder. The register is initialized to zero, and for each
306   incoming bit, x^32 is added mod p to the register if the bit is a one (where
307   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
308   (which is shifting right by one and adding x^32 mod p if the bit shifted out
309   is a one). We start with the highest power (least significant bit) of q and
310   repeat for all eight bits of q.
311 
312   The table is simply the CRC of all possible eight bit values. This is all the
313   information needed to generate CRCs on data a byte at a time for all
314   combinations of CRC register values and incoming bytes.
315  */
make_crc_table(void)316 local void make_crc_table(void)
317 {
318     unsigned i, j, n;
319     z_crc_t p;
320 
321     /* initialize the CRC of bytes tables */
322     for (i = 0; i < 256; i++) {
323         p = i;
324         for (j = 0; j < 8; j++)
325             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
326         crc_table[i] = p;
327 #ifdef W
328         crc_big_table[i] = byte_swap(p);
329 #endif
330     }
331 
332     /* initialize the x^2^n mod p(x) table */
333     p = (z_crc_t)1 << 30;         /* x^1 */
334     x2n_table[0] = p;
335     for (n = 1; n < 32; n++)
336         x2n_table[n] = p = multmodp(p, p);
337 
338 #ifdef W
339     /* initialize the braiding tables -- needs x2n_table[] */
340     braid(crc_braid_table, crc_braid_big_table, N, W);
341 #endif
342 
343 #ifdef MAKECRCH
344     {
345         /*
346           The crc32.h header file contains tables for both 32-bit and 64-bit
347           z_word_t's, and so requires a 64-bit type be available. In that case,
348           z_word_t must be defined to be 64-bits. This code then also generates
349           and writes out the tables for the case that z_word_t is 32 bits.
350          */
351 #if !defined(W) || W != 8
352 #  error Need a 64-bit integer type in order to generate crc32.h.
353 #endif
354         FILE *out;
355         int k, n;
356         z_crc_t ltl[8][256];
357         z_word_t big[8][256];
358 
359         out = fopen("crc32.h", "w");
360         if (out == NULL) return;
361 
362         /* write out little-endian CRC table to crc32.h */
363         fprintf(out,
364             "/* crc32.h -- tables for rapid CRC calculation\n"
365             " * Generated automatically by crc32.c\n */\n"
366             "\n"
367             "local const z_crc_t FAR crc_table[] = {\n"
368             "    ");
369         write_table(out, crc_table, 256);
370         fprintf(out,
371             "};\n");
372 
373         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
374         fprintf(out,
375             "\n"
376             "#ifdef W\n"
377             "\n"
378             "#if W == 8\n"
379             "\n"
380             "local const z_word_t FAR crc_big_table[] = {\n"
381             "    ");
382         write_table64(out, crc_big_table, 256);
383         fprintf(out,
384             "};\n");
385 
386         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
387         fprintf(out,
388             "\n"
389             "#else /* W == 4 */\n"
390             "\n"
391             "local const z_word_t FAR crc_big_table[] = {\n"
392             "    ");
393         write_table32hi(out, crc_big_table, 256);
394         fprintf(out,
395             "};\n"
396             "\n"
397             "#endif\n");
398 
399         /* write out braid tables for each value of N */
400         for (n = 1; n <= 6; n++) {
401             fprintf(out,
402             "\n"
403             "#if N == %d\n", n);
404 
405             /* compute braid tables for this N and 64-bit word_t */
406             braid(ltl, big, n, 8);
407 
408             /* write out braid tables for 64-bit z_word_t to crc32.h */
409             fprintf(out,
410             "\n"
411             "#if W == 8\n"
412             "\n"
413             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
414             for (k = 0; k < 8; k++) {
415                 fprintf(out, "   {");
416                 write_table(out, ltl[k], 256);
417                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
418             }
419             fprintf(out,
420             "};\n"
421             "\n"
422             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
423             for (k = 0; k < 8; k++) {
424                 fprintf(out, "   {");
425                 write_table64(out, big[k], 256);
426                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
427             }
428             fprintf(out,
429             "};\n");
430 
431             /* compute braid tables for this N and 32-bit word_t */
432             braid(ltl, big, n, 4);
433 
434             /* write out braid tables for 32-bit z_word_t to crc32.h */
435             fprintf(out,
436             "\n"
437             "#else /* W == 4 */\n"
438             "\n"
439             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
440             for (k = 0; k < 4; k++) {
441                 fprintf(out, "   {");
442                 write_table(out, ltl[k], 256);
443                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
444             }
445             fprintf(out,
446             "};\n"
447             "\n"
448             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
449             for (k = 0; k < 4; k++) {
450                 fprintf(out, "   {");
451                 write_table32hi(out, big[k], 256);
452                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
453             }
454             fprintf(out,
455             "};\n"
456             "\n"
457             "#endif\n"
458             "\n"
459             "#endif\n");
460         }
461         fprintf(out,
462             "\n"
463             "#endif\n");
464 
465         /* write out zeros operator table to crc32.h */
466         fprintf(out,
467             "\n"
468             "local const z_crc_t FAR x2n_table[] = {\n"
469             "    ");
470         write_table(out, x2n_table, 32);
471         fprintf(out,
472             "};\n");
473         fclose(out);
474     }
475 #endif /* MAKECRCH */
476 }
477 
478 #ifdef MAKECRCH
479 
480 /*
481    Write the 32-bit values in table[0..k-1] to out, five per line in
482    hexadecimal separated by commas.
483  */
write_table(FILE * out,const z_crc_t FAR * table,int k)484 local void write_table(FILE *out, const z_crc_t FAR *table, int k) {
485     int n;
486 
487     for (n = 0; n < k; n++)
488         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
489                 (unsigned long)(table[n]),
490                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
491 }
492 
493 /*
494    Write the high 32-bits of each value in table[0..k-1] to out, five per line
495    in hexadecimal separated by commas.
496  */
write_table32hi(FILE * out,const z_word_t FAR * table,int k)497 local void write_table32hi(FILE *out, const z_word_t FAR *table, int k) {
498     int n;
499 
500     for (n = 0; n < k; n++)
501         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
502                 (unsigned long)(table[n] >> 32),
503                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
504 }
505 
506 /*
507   Write the 64-bit values in table[0..k-1] to out, three per line in
508   hexadecimal separated by commas. This assumes that if there is a 64-bit
509   type, then there is also a long long integer type, and it is at least 64
510   bits. If not, then the type cast and format string can be adjusted
511   accordingly.
512  */
write_table64(FILE * out,const z_word_t FAR * table,int k)513 local void write_table64(FILE *out, const z_word_t FAR *table, int k) {
514     int n;
515 
516     for (n = 0; n < k; n++)
517         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
518                 (unsigned long long)(table[n]),
519                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
520 }
521 
522 /* Actually do the deed. */
main(void)523 int main(void) {
524     make_crc_table();
525     return 0;
526 }
527 
528 #endif /* MAKECRCH */
529 
530 #ifdef W
531 /*
532   Generate the little and big-endian braid tables for the given n and z_word_t
533   size w. Each array must have room for w blocks of 256 elements.
534  */
braid(z_crc_t ltl[][256],z_word_t big[][256],int n,int w)535 local void braid(z_crc_t ltl[][256], z_word_t big[][256], int n, int w) {
536     int k;
537     z_crc_t i, p, q;
538     for (k = 0; k < w; k++) {
539         p = x2nmodp((n * w + 3 - k) << 3, 0);
540         ltl[k][0] = 0;
541         big[w - 1 - k][0] = 0;
542         for (i = 1; i < 256; i++) {
543             ltl[k][i] = q = multmodp(i << 24, p);
544             big[w - 1 - k][i] = byte_swap(q);
545         }
546     }
547 }
548 #endif
549 
550 #endif /* DYNAMIC_CRC_TABLE */
551 
552 /* =========================================================================
553  * This function can be used by asm versions of crc32(), and to force the
554  * generation of the CRC tables in a threaded application.
555  */
get_crc_table(void)556 const z_crc_t FAR * ZEXPORT get_crc_table(void) {
557 #ifdef DYNAMIC_CRC_TABLE
558     once(&made, make_crc_table);
559 #endif /* DYNAMIC_CRC_TABLE */
560     return (const z_crc_t FAR *)crc_table;
561 }
562 
563 /* =========================================================================
564  * Use ARM machine instructions if available. This will compute the CRC about
565  * ten times faster than the braided calculation. This code does not check for
566  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
567  * only be defined if the compilation specifies an ARM processor architecture
568  * that has the instructions. For example, compiling with -march=armv8.1-a or
569  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
570  * instructions.
571  */
572 #if ARMCRC32_CANONICAL_ZLIB
573 
574 /*
575    Constants empirically determined to maximize speed. These values are from
576    measurements on a Cortex-A57. Your mileage may vary.
577  */
578 #define Z_BATCH 3990                /* number of words in a batch */
579 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
580 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
581 
crc32_z(unsigned long crc,const unsigned char FAR * buf,z_size_t len)582 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
583                               z_size_t len) {
584     z_crc_t val;
585     z_word_t crc1, crc2;
586     const z_word_t *word;
587     z_word_t val0, val1, val2;
588     z_size_t last, last2, i;
589     z_size_t num;
590 
591     /* Return initial CRC, if requested. */
592     if (buf == Z_NULL) return 0;
593 
594 #ifdef DYNAMIC_CRC_TABLE
595     once(&made, make_crc_table);
596 #endif /* DYNAMIC_CRC_TABLE */
597 
598     /* Pre-condition the CRC */
599     crc = (~crc) & 0xffffffff;
600 
601     /* Compute the CRC up to a word boundary. */
602     while (len && ((z_size_t)buf & 7) != 0) {
603         len--;
604         val = *buf++;
605         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
606     }
607 
608     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
609     word = (z_word_t const *)buf;
610     num = len >> 3;
611     len &= 7;
612 
613     /* Do three interleaved CRCs to realize the throughput of one crc32x
614        instruction per cycle. Each CRC is calculated on Z_BATCH words. The
615        three CRCs are combined into a single CRC after each set of batches. */
616     while (num >= 3 * Z_BATCH) {
617         crc1 = 0;
618         crc2 = 0;
619         for (i = 0; i < Z_BATCH; i++) {
620             val0 = word[i];
621             val1 = word[i + Z_BATCH];
622             val2 = word[i + 2 * Z_BATCH];
623             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
624             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
625             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
626         }
627         word += 3 * Z_BATCH;
628         num -= 3 * Z_BATCH;
629         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
630         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
631     }
632 
633     /* Do one last smaller batch with the remaining words, if there are enough
634        to pay for the combination of CRCs. */
635     last = num / 3;
636     if (last >= Z_BATCH_MIN) {
637         last2 = last << 1;
638         crc1 = 0;
639         crc2 = 0;
640         for (i = 0; i < last; i++) {
641             val0 = word[i];
642             val1 = word[i + last];
643             val2 = word[i + last2];
644             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
645             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
646             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
647         }
648         word += 3 * last;
649         num -= 3 * last;
650         val = x2nmodp(last, 6);
651         crc = multmodp(val, crc) ^ crc1;
652         crc = multmodp(val, crc) ^ crc2;
653     }
654 
655     /* Compute the CRC on any remaining words. */
656     for (i = 0; i < num; i++) {
657         val0 = word[i];
658         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
659     }
660     word += num;
661 
662     /* Complete the CRC on any remaining bytes. */
663     buf = (const unsigned char FAR *)word;
664     while (len) {
665         len--;
666         val = *buf++;
667         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
668     }
669 
670     /* Return the CRC, post-conditioned. */
671     return crc ^ 0xffffffff;
672 }
673 
674 #else
675 
676 #ifdef W
677 
678 /*
679   Return the CRC of the W bytes in the word_t data, taking the
680   least-significant byte of the word as the first byte of data, without any pre
681   or post conditioning. This is used to combine the CRCs of each braid.
682  */
crc_word(z_word_t data)683 local z_crc_t crc_word(z_word_t data) {
684     int k;
685     for (k = 0; k < W; k++)
686         data = (data >> 8) ^ crc_table[data & 0xff];
687     return (z_crc_t)data;
688 }
689 
crc_word_big(z_word_t data)690 local z_word_t crc_word_big(z_word_t data) {
691     int k;
692     for (k = 0; k < W; k++)
693         data = (data << 8) ^
694             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
695     return data;
696 }
697 
698 #endif
699 
700 /* ========================================================================= */
crc32_z(unsigned long crc,const unsigned char FAR * buf,z_size_t len)701 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
702                               z_size_t len) {
703     /*
704      * zlib convention is to call crc32(0, NULL, 0); before making
705      * calls to crc32(). So this is a good, early (and infrequent)
706      * place to cache CPU features if needed for those later, more
707      * interesting crc32() calls.
708      */
709 #if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) \
710     || defined(RISCV_RVV)
711     /*
712      * Since this routine can be freely used, check CPU features here.
713      */
714     if (buf == Z_NULL) {
715         if (!len) /* Assume user is calling crc32(0, NULL, 0); */
716             cpu_check_features();
717         return 0UL;
718     }
719 
720 #endif
721 #if defined(CRC32_SIMD_AVX512_PCLMUL)
722     if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) {
723         /* crc32 64-byte chunks */
724         z_size_t chunk_size = len & ~Z_CRC32_AVX512_CHUNKSIZE_MASK;
725         crc = ~crc32_avx512_simd_(buf, chunk_size, ~(uint32_t)crc);
726         /* check remaining data */
727         len -= chunk_size;
728         if (!len)
729             return crc;
730         /* Fall into the default crc32 for the remaining data. */
731         buf += chunk_size;
732     }
733 #elif defined(CRC32_SIMD_SSE42_PCLMUL)
734     if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
735         /* crc32 16-byte chunks */
736         z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
737         crc = ~crc32_sse42_simd_(buf, chunk_size, ~(uint32_t)crc);
738         /* check remaining data */
739         len -= chunk_size;
740         if (!len)
741             return crc;
742         /* Fall into the default crc32 for the remaining data. */
743         buf += chunk_size;
744     }
745 #elif defined(CRC32_ARMV8_CRC32)
746     if (arm_cpu_enable_crc32) {
747 #if defined(__aarch64__)
748         /* PMULL is 64bit only, plus code needs at least a 64 bytes buffer. */
749         if (arm_cpu_enable_pmull && (len > Z_CRC32_PMULL_MINIMUM_LENGTH)) {
750             const size_t chunk_size = len & ~Z_CRC32_PMULL_CHUNKSIZE_MASK;
751             crc = ~armv8_crc32_pmull_little(buf, chunk_size, ~(uint32_t)crc);
752             /* Check remaining data. */
753             len -= chunk_size;
754             if (!len)
755                 return crc;
756 
757             /* Fall through for the remaining data. */
758             buf += chunk_size;
759         }
760 #endif
761         return armv8_crc32_little(buf, len, crc); /* Armv8@32bit or tail. */
762     }
763 #else
764     if (buf == Z_NULL) {
765         return 0UL;
766     }
767 #endif /* CRC32_SIMD */
768 
769 #ifdef DYNAMIC_CRC_TABLE
770     once(&made, make_crc_table);
771 #endif /* DYNAMIC_CRC_TABLE */
772     /* Pre-condition the CRC */
773     crc = (~crc) & 0xffffffff;
774 
775 #ifdef W
776 
777     /* If provided enough bytes, do a braided CRC calculation. */
778     if (len >= N * W + W - 1) {
779         z_size_t blks;
780         z_word_t const *words;
781         unsigned endian;
782         int k;
783 
784         /* Compute the CRC up to a z_word_t boundary. */
785         while (len && ((z_size_t)buf & (W - 1)) != 0) {
786             len--;
787             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
788         }
789 
790         /* Compute the CRC on as many N z_word_t blocks as are available. */
791         blks = len / (N * W);
792         len -= blks * N * W;
793         words = (z_word_t const *)buf;
794 
795         /* Do endian check at execution time instead of compile time, since ARM
796            processors can change the endianness at execution time. If the
797            compiler knows what the endianness will be, it can optimize out the
798            check and the unused branch. */
799         endian = 1;
800         if (*(unsigned char *)&endian) {
801             /* Little endian. */
802 
803             z_crc_t crc0;
804             z_word_t word0;
805 #if N > 1
806             z_crc_t crc1;
807             z_word_t word1;
808 #if N > 2
809             z_crc_t crc2;
810             z_word_t word2;
811 #if N > 3
812             z_crc_t crc3;
813             z_word_t word3;
814 #if N > 4
815             z_crc_t crc4;
816             z_word_t word4;
817 #if N > 5
818             z_crc_t crc5;
819             z_word_t word5;
820 #endif
821 #endif
822 #endif
823 #endif
824 #endif
825 
826             /* Initialize the CRC for each braid. */
827             crc0 = crc;
828 #if N > 1
829             crc1 = 0;
830 #if N > 2
831             crc2 = 0;
832 #if N > 3
833             crc3 = 0;
834 #if N > 4
835             crc4 = 0;
836 #if N > 5
837             crc5 = 0;
838 #endif
839 #endif
840 #endif
841 #endif
842 #endif
843 
844             /*
845               Process the first blks-1 blocks, computing the CRCs on each braid
846               independently.
847              */
848             while (--blks) {
849                 /* Load the word for each braid into registers. */
850                 word0 = crc0 ^ words[0];
851 #if N > 1
852                 word1 = crc1 ^ words[1];
853 #if N > 2
854                 word2 = crc2 ^ words[2];
855 #if N > 3
856                 word3 = crc3 ^ words[3];
857 #if N > 4
858                 word4 = crc4 ^ words[4];
859 #if N > 5
860                 word5 = crc5 ^ words[5];
861 #endif
862 #endif
863 #endif
864 #endif
865 #endif
866                 words += N;
867 
868                 /* Compute and update the CRC for each word. The loop should
869                    get unrolled. */
870                 crc0 = crc_braid_table[0][word0 & 0xff];
871 #if N > 1
872                 crc1 = crc_braid_table[0][word1 & 0xff];
873 #if N > 2
874                 crc2 = crc_braid_table[0][word2 & 0xff];
875 #if N > 3
876                 crc3 = crc_braid_table[0][word3 & 0xff];
877 #if N > 4
878                 crc4 = crc_braid_table[0][word4 & 0xff];
879 #if N > 5
880                 crc5 = crc_braid_table[0][word5 & 0xff];
881 #endif
882 #endif
883 #endif
884 #endif
885 #endif
886                 for (k = 1; k < W; k++) {
887                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
888 #if N > 1
889                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
890 #if N > 2
891                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
892 #if N > 3
893                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
894 #if N > 4
895                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
896 #if N > 5
897                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
898 #endif
899 #endif
900 #endif
901 #endif
902 #endif
903                 }
904             }
905 
906             /*
907               Process the last block, combining the CRCs of the N braids at the
908               same time.
909              */
910             crc = crc_word(crc0 ^ words[0]);
911 #if N > 1
912             crc = crc_word(crc1 ^ words[1] ^ crc);
913 #if N > 2
914             crc = crc_word(crc2 ^ words[2] ^ crc);
915 #if N > 3
916             crc = crc_word(crc3 ^ words[3] ^ crc);
917 #if N > 4
918             crc = crc_word(crc4 ^ words[4] ^ crc);
919 #if N > 5
920             crc = crc_word(crc5 ^ words[5] ^ crc);
921 #endif
922 #endif
923 #endif
924 #endif
925 #endif
926             words += N;
927         }
928         else {
929             /* Big endian. */
930 
931             z_word_t crc0, word0, comb;
932 #if N > 1
933             z_word_t crc1, word1;
934 #if N > 2
935             z_word_t crc2, word2;
936 #if N > 3
937             z_word_t crc3, word3;
938 #if N > 4
939             z_word_t crc4, word4;
940 #if N > 5
941             z_word_t crc5, word5;
942 #endif
943 #endif
944 #endif
945 #endif
946 #endif
947 
948             /* Initialize the CRC for each braid. */
949             crc0 = byte_swap(crc);
950 #if N > 1
951             crc1 = 0;
952 #if N > 2
953             crc2 = 0;
954 #if N > 3
955             crc3 = 0;
956 #if N > 4
957             crc4 = 0;
958 #if N > 5
959             crc5 = 0;
960 #endif
961 #endif
962 #endif
963 #endif
964 #endif
965 
966             /*
967               Process the first blks-1 blocks, computing the CRCs on each braid
968               independently.
969              */
970             while (--blks) {
971                 /* Load the word for each braid into registers. */
972                 word0 = crc0 ^ words[0];
973 #if N > 1
974                 word1 = crc1 ^ words[1];
975 #if N > 2
976                 word2 = crc2 ^ words[2];
977 #if N > 3
978                 word3 = crc3 ^ words[3];
979 #if N > 4
980                 word4 = crc4 ^ words[4];
981 #if N > 5
982                 word5 = crc5 ^ words[5];
983 #endif
984 #endif
985 #endif
986 #endif
987 #endif
988                 words += N;
989 
990                 /* Compute and update the CRC for each word. The loop should
991                    get unrolled. */
992                 crc0 = crc_braid_big_table[0][word0 & 0xff];
993 #if N > 1
994                 crc1 = crc_braid_big_table[0][word1 & 0xff];
995 #if N > 2
996                 crc2 = crc_braid_big_table[0][word2 & 0xff];
997 #if N > 3
998                 crc3 = crc_braid_big_table[0][word3 & 0xff];
999 #if N > 4
1000                 crc4 = crc_braid_big_table[0][word4 & 0xff];
1001 #if N > 5
1002                 crc5 = crc_braid_big_table[0][word5 & 0xff];
1003 #endif
1004 #endif
1005 #endif
1006 #endif
1007 #endif
1008                 for (k = 1; k < W; k++) {
1009                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
1010 #if N > 1
1011                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
1012 #if N > 2
1013                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
1014 #if N > 3
1015                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
1016 #if N > 4
1017                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
1018 #if N > 5
1019                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
1020 #endif
1021 #endif
1022 #endif
1023 #endif
1024 #endif
1025                 }
1026             }
1027 
1028             /*
1029               Process the last block, combining the CRCs of the N braids at the
1030               same time.
1031              */
1032             comb = crc_word_big(crc0 ^ words[0]);
1033 #if N > 1
1034             comb = crc_word_big(crc1 ^ words[1] ^ comb);
1035 #if N > 2
1036             comb = crc_word_big(crc2 ^ words[2] ^ comb);
1037 #if N > 3
1038             comb = crc_word_big(crc3 ^ words[3] ^ comb);
1039 #if N > 4
1040             comb = crc_word_big(crc4 ^ words[4] ^ comb);
1041 #if N > 5
1042             comb = crc_word_big(crc5 ^ words[5] ^ comb);
1043 #endif
1044 #endif
1045 #endif
1046 #endif
1047 #endif
1048             words += N;
1049             crc = byte_swap(comb);
1050         }
1051 
1052         /*
1053           Update the pointer to the remaining bytes to process.
1054          */
1055         buf = (unsigned char const *)words;
1056     }
1057 
1058 #endif /* W */
1059 
1060     /* Complete the computation of the CRC on any remaining bytes. */
1061     while (len >= 8) {
1062         len -= 8;
1063         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1064         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1065         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1066         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1067         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1068         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1069         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1070         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1071     }
1072     while (len) {
1073         len--;
1074         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1075     }
1076 
1077     /* Return the CRC, post-conditioned. */
1078     return crc ^ 0xffffffff;
1079 }
1080 
1081 #endif
1082 
1083 /* ========================================================================= */
crc32(unsigned long crc,const unsigned char FAR * buf,uInt len)1084 unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
1085                             uInt len) {
1086     /* Some bots compile with optimizations disabled, others will emulate
1087      * ARM on x86 and other weird combinations.
1088      */
1089 #if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) \
1090     || defined(RISCV_RVV)
1091     /* We got to verify CPU features, so exploit the common usage pattern
1092      * of calling this function with Z_NULL for an initial valid crc value.
1093      * This allows to cache the result of the feature check and avoid extraneous
1094      * function calls.
1095      */
1096     if (buf == Z_NULL) {
1097         if (!len) /* Assume user is calling crc32(0, NULL, 0); */
1098             cpu_check_features();
1099         return 0UL;
1100     }
1101 #endif
1102 
1103 #if defined(CRC32_ARMV8_CRC32)
1104     if (arm_cpu_enable_crc32) {
1105 #if defined(__aarch64__)
1106         /* PMULL is 64bit only, plus code needs at least a 64 bytes buffer. */
1107         if (arm_cpu_enable_pmull && (len > Z_CRC32_PMULL_MINIMUM_LENGTH)) {
1108             const size_t chunk_size = len & ~Z_CRC32_PMULL_CHUNKSIZE_MASK;
1109             crc = ~armv8_crc32_pmull_little(buf, chunk_size, ~(uint32_t)crc);
1110             /* Check remaining data. */
1111             len -= chunk_size;
1112             if (!len)
1113                 return crc;
1114 
1115             /* Fall through for the remaining data. */
1116             buf += chunk_size;
1117         }
1118 #endif
1119         return armv8_crc32_little(buf, len, crc); /* Armv8@32bit or tail. */
1120     }
1121 #endif
1122     return crc32_z(crc, buf, len); /* Armv7 or Armv8 w/o crypto extensions. */
1123 }
1124 
1125 /* ========================================================================= */
crc32_combine64(uLong crc1,uLong crc2,z_off64_t len2)1126 uLong ZEXPORT crc32_combine64(uLong crc1, uLong crc2, z_off64_t len2) {
1127 #ifdef DYNAMIC_CRC_TABLE
1128     once(&made, make_crc_table);
1129 #endif /* DYNAMIC_CRC_TABLE */
1130     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1131 }
1132 
1133 /* ========================================================================= */
crc32_combine(uLong crc1,uLong crc2,z_off_t len2)1134 uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2) {
1135     return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1136 }
1137 /* ========================================================================= */
crc32_combine_gen64(z_off64_t len2)1138 uLong ZEXPORT crc32_combine_gen64(z_off64_t len2) {
1139 #ifdef DYNAMIC_CRC_TABLE
1140     once(&made, make_crc_table);
1141 #endif /* DYNAMIC_CRC_TABLE */
1142     return x2nmodp(len2, 3);
1143 }
1144 
1145 /* ========================================================================= */
crc32_combine_gen(z_off_t len2)1146 uLong ZEXPORT crc32_combine_gen(z_off_t len2) {
1147     return crc32_combine_gen64((z_off64_t)len2);
1148 }
1149 
1150 /* ========================================================================= */
crc32_combine_op(uLong crc1,uLong crc2,uLong op)1151 uLong ZEXPORT crc32_combine_op(uLong crc1, uLong crc2, uLong op) {
1152     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1153 }
1154 
crc_reset(deflate_state * const s)1155 ZLIB_INTERNAL void crc_reset(deflate_state *const s)
1156 {
1157 #ifdef CRC32_SIMD_SSE42_PCLMUL
1158     if (x86_cpu_enable_simd) {
1159         crc_fold_init(s);
1160         return;
1161     }
1162 #endif
1163     s->strm->adler = crc32(0L, Z_NULL, 0);
1164 }
1165 
crc_finalize(deflate_state * const s)1166 ZLIB_INTERNAL void crc_finalize(deflate_state *const s)
1167 {
1168 #ifdef CRC32_SIMD_SSE42_PCLMUL
1169     if (x86_cpu_enable_simd)
1170         s->strm->adler = crc_fold_512to32(s);
1171 #endif
1172 }
1173 
copy_with_crc(z_streamp strm,Bytef * dst,long size)1174 ZLIB_INTERNAL void copy_with_crc(z_streamp strm, Bytef *dst, long size)
1175 {
1176 #ifdef CRC32_SIMD_SSE42_PCLMUL
1177     if (x86_cpu_enable_simd) {
1178         crc_fold_copy(strm->state, dst, strm->next_in, size);
1179         return;
1180     }
1181 #endif
1182     zmemcpy(dst, strm->next_in, size);
1183     strm->adler = crc32(strm->adler, dst, size);
1184 }
1185