xref: /illumos-gate/usr/src/contrib/zlib/crc32.c (revision 7a6d80f1660abd4755c68cbd094d4a914681d26e)
1 /* crc32.c -- compute the CRC-32 of a data stream
2  * Copyright (C) 1995-2022 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  *
5  * This interleaved implementation of a CRC makes use of pipelined multiple
6  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8  */
9 
10 /*
11   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
12   protection on the static variables used to control the first-use generation
13   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
14   first call get_crc_table() to initialize the tables before allowing more than
15   one thread to use crc32().
16 
17   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
18   produced, so that this one source file can be compiled to an executable.
19  */
20 
21 #ifdef MAKECRCH
22 #  include <stdio.h>
23 #  ifndef DYNAMIC_CRC_TABLE
24 #    define DYNAMIC_CRC_TABLE
25 #  endif /* !DYNAMIC_CRC_TABLE */
26 #endif /* MAKECRCH */
27 
28 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
29 
30  /*
31   A CRC of a message is computed on N braids of words in the message, where
32   each word consists of W bytes (4 or 8). If N is 3, for example, then three
33   running sparse CRCs are calculated respectively on each braid, at these
34   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
35   This is done starting at a word boundary, and continues until as many blocks
36   of N * W bytes as are available have been processed. The results are combined
37   into a single CRC at the end. For this code, N must be in the range 1..6 and
38   W must be 4 or 8. The upper limit on N can be increased if desired by adding
39   more #if blocks, extending the patterns apparent in the code. In addition,
40   crc32.h would need to be regenerated, if the maximum N value is increased.
41 
42   N and W are chosen empirically by benchmarking the execution time on a given
43   processor. The choices for N and W below were based on testing on Intel Kaby
44   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
45   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
46   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
47   They were all tested with either gcc or clang, all using the -O3 optimization
48   level. Your mileage may vary.
49  */
50 
51 /* Define N */
52 #ifdef Z_TESTN
53 #  define N Z_TESTN
54 #else
55 #  define N 5
56 #endif
57 #if N < 1 || N > 6
58 #  error N must be in 1..6
59 #endif
60 
61 /*
62   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
63   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
64   that bytes are eight bits.
65  */
66 
67 /*
68   Define W and the associated z_word_t type. If W is not defined, then a
69   braided calculation is not used, and the associated tables and code are not
70   compiled.
71  */
72 #ifdef Z_TESTW
73 #  if Z_TESTW-1 != -1
74 #    define W Z_TESTW
75 #  endif
76 #else
77 #  ifdef MAKECRCH
78 #    define W 8         /* required for MAKECRCH */
79 #  else
80 #    if defined(__x86_64__) || defined(__aarch64__)
81 #      define W 8
82 #    else
83 #      define W 4
84 #    endif
85 #  endif
86 #endif
87 #ifdef W
88 #  if W == 8 && defined(Z_U8)
89      typedef Z_U8 z_word_t;
90 #  elif defined(Z_U4)
91 #    undef W
92 #    define W 4
93      typedef Z_U4 z_word_t;
94 #  else
95 #    undef W
96 #  endif
97 #endif
98 
99 /* Local functions. */
100 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
101 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
102 
103 /* If available, use the ARM processor CRC32 instruction. */
104 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
105 #  define ARMCRC32
106 #endif
107 
108 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
109 /*
110   Swap the bytes in a z_word_t to convert between little and big endian. Any
111   self-respecting compiler will optimize this to a single machine byte-swap
112   instruction, if one is available. This assumes that word_t is either 32 bits
113   or 64 bits.
114  */
115 local z_word_t byte_swap(z_word_t word)
116 {
117 #  if W == 8
118     return
119         (word & 0xff00000000000000) >> 56 |
120         (word & 0xff000000000000) >> 40 |
121         (word & 0xff0000000000) >> 24 |
122         (word & 0xff00000000) >> 8 |
123         (word & 0xff000000) << 8 |
124         (word & 0xff0000) << 24 |
125         (word & 0xff00) << 40 |
126         (word & 0xff) << 56;
127 #  else   /* W == 4 */
128     return
129         (word & 0xff000000) >> 24 |
130         (word & 0xff0000) >> 8 |
131         (word & 0xff00) << 8 |
132         (word & 0xff) << 24;
133 #  endif
134 }
135 #endif
136 
137 /* CRC polynomial. */
138 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
139 
140 #ifdef DYNAMIC_CRC_TABLE
141 
142 local z_crc_t FAR crc_table[256];
143 local z_crc_t FAR x2n_table[32];
144 local void make_crc_table OF((void));
145 #ifdef W
146    local z_word_t FAR crc_big_table[256];
147    local z_crc_t FAR crc_braid_table[W][256];
148    local z_word_t FAR crc_braid_big_table[W][256];
149    local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
150 #endif
151 #ifdef MAKECRCH
152    local void write_table OF((FILE *, const z_crc_t FAR *, int));
153    local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
154    local void write_table64 OF((FILE *, const z_word_t FAR *, int));
155 #endif /* MAKECRCH */
156 
157 /*
158   Define a once() function depending on the availability of atomics. If this is
159   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
160   multiple threads, and if atomics are not available, then get_crc_table() must
161   be called to initialize the tables and must return before any threads are
162   allowed to compute or combine CRCs.
163  */
164 
165 /* Definition of once functionality. */
166 typedef struct once_s once_t;
167 local void once OF((once_t *, void (*)(void)));
168 
169 /* Check for the availability of atomics. */
170 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
171     !defined(__STDC_NO_ATOMICS__)
172 
173 #include <stdatomic.h>
174 
175 /* Structure for once(), which must be initialized with ONCE_INIT. */
176 struct once_s {
177     atomic_flag begun;
178     atomic_int done;
179 };
180 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
181 
182 /*
183   Run the provided init() function exactly once, even if multiple threads
184   invoke once() at the same time. The state must be a once_t initialized with
185   ONCE_INIT.
186  */
187 local void once(state, init)
188     once_t *state;
189     void (*init)(void);
190 {
191     if (!atomic_load(&state->done)) {
192         if (atomic_flag_test_and_set(&state->begun))
193             while (!atomic_load(&state->done))
194                 ;
195         else {
196             init();
197             atomic_store(&state->done, 1);
198         }
199     }
200 }
201 
202 #else   /* no atomics */
203 
204 /* Structure for once(), which must be initialized with ONCE_INIT. */
205 struct once_s {
206     volatile int begun;
207     volatile int done;
208 };
209 #define ONCE_INIT {0, 0}
210 
211 /* Test and set. Alas, not atomic, but tries to minimize the period of
212    vulnerability. */
213 local int test_and_set OF((int volatile *));
214 local int test_and_set(flag)
215     int volatile *flag;
216 {
217     int was;
218 
219     was = *flag;
220     *flag = 1;
221     return was;
222 }
223 
224 /* Run the provided init() function once. This is not thread-safe. */
225 local void once(state, init)
226     once_t *state;
227     void (*init)(void);
228 {
229     if (!state->done) {
230         if (test_and_set(&state->begun))
231             while (!state->done)
232                 ;
233         else {
234             init();
235             state->done = 1;
236         }
237     }
238 }
239 
240 #endif
241 
242 /* State for once(). */
243 local once_t made = ONCE_INIT;
244 
245 /*
246   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
247   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
248 
249   Polynomials over GF(2) are represented in binary, one bit per coefficient,
250   with the lowest powers in the most significant bit. Then adding polynomials
251   is just exclusive-or, and multiplying a polynomial by x is a right shift by
252   one. If we call the above polynomial p, and represent a byte as the
253   polynomial q, also with the lowest power in the most significant bit (so the
254   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
255   where a mod b means the remainder after dividing a by b.
256 
257   This calculation is done using the shift-register method of multiplying and
258   taking the remainder. The register is initialized to zero, and for each
259   incoming bit, x^32 is added mod p to the register if the bit is a one (where
260   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
261   (which is shifting right by one and adding x^32 mod p if the bit shifted out
262   is a one). We start with the highest power (least significant bit) of q and
263   repeat for all eight bits of q.
264 
265   The table is simply the CRC of all possible eight bit values. This is all the
266   information needed to generate CRCs on data a byte at a time for all
267   combinations of CRC register values and incoming bytes.
268  */
269 
270 local void make_crc_table()
271 {
272     unsigned i, j, n;
273     z_crc_t p;
274 
275     /* initialize the CRC of bytes tables */
276     for (i = 0; i < 256; i++) {
277         p = i;
278         for (j = 0; j < 8; j++)
279             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
280         crc_table[i] = p;
281 #ifdef W
282         crc_big_table[i] = byte_swap(p);
283 #endif
284     }
285 
286     /* initialize the x^2^n mod p(x) table */
287     p = (z_crc_t)1 << 30;         /* x^1 */
288     x2n_table[0] = p;
289     for (n = 1; n < 32; n++)
290         x2n_table[n] = p = multmodp(p, p);
291 
292 #ifdef W
293     /* initialize the braiding tables -- needs x2n_table[] */
294     braid(crc_braid_table, crc_braid_big_table, N, W);
295 #endif
296 
297 #ifdef MAKECRCH
298     {
299         /*
300           The crc32.h header file contains tables for both 32-bit and 64-bit
301           z_word_t's, and so requires a 64-bit type be available. In that case,
302           z_word_t must be defined to be 64-bits. This code then also generates
303           and writes out the tables for the case that z_word_t is 32 bits.
304          */
305 #if !defined(W) || W != 8
306 #  error Need a 64-bit integer type in order to generate crc32.h.
307 #endif
308         FILE *out;
309         int k, n;
310         z_crc_t ltl[8][256];
311         z_word_t big[8][256];
312 
313         out = fopen("crc32.h", "w");
314         if (out == NULL) return;
315 
316         /* write out little-endian CRC table to crc32.h */
317         fprintf(out,
318             "/* crc32.h -- tables for rapid CRC calculation\n"
319             " * Generated automatically by crc32.c\n */\n"
320             "\n"
321             "local const z_crc_t FAR crc_table[] = {\n"
322             "    ");
323         write_table(out, crc_table, 256);
324         fprintf(out,
325             "};\n");
326 
327         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
328         fprintf(out,
329             "\n"
330             "#ifdef W\n"
331             "\n"
332             "#if W == 8\n"
333             "\n"
334             "local const z_word_t FAR crc_big_table[] = {\n"
335             "    ");
336         write_table64(out, crc_big_table, 256);
337         fprintf(out,
338             "};\n");
339 
340         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
341         fprintf(out,
342             "\n"
343             "#else /* W == 4 */\n"
344             "\n"
345             "local const z_word_t FAR crc_big_table[] = {\n"
346             "    ");
347         write_table32hi(out, crc_big_table, 256);
348         fprintf(out,
349             "};\n"
350             "\n"
351             "#endif\n");
352 
353         /* write out braid tables for each value of N */
354         for (n = 1; n <= 6; n++) {
355             fprintf(out,
356             "\n"
357             "#if N == %d\n", n);
358 
359             /* compute braid tables for this N and 64-bit word_t */
360             braid(ltl, big, n, 8);
361 
362             /* write out braid tables for 64-bit z_word_t to crc32.h */
363             fprintf(out,
364             "\n"
365             "#if W == 8\n"
366             "\n"
367             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
368             for (k = 0; k < 8; k++) {
369                 fprintf(out, "   {");
370                 write_table(out, ltl[k], 256);
371                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
372             }
373             fprintf(out,
374             "};\n"
375             "\n"
376             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
377             for (k = 0; k < 8; k++) {
378                 fprintf(out, "   {");
379                 write_table64(out, big[k], 256);
380                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
381             }
382             fprintf(out,
383             "};\n");
384 
385             /* compute braid tables for this N and 32-bit word_t */
386             braid(ltl, big, n, 4);
387 
388             /* write out braid tables for 32-bit z_word_t to crc32.h */
389             fprintf(out,
390             "\n"
391             "#else /* W == 4 */\n"
392             "\n"
393             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
394             for (k = 0; k < 4; k++) {
395                 fprintf(out, "   {");
396                 write_table(out, ltl[k], 256);
397                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
398             }
399             fprintf(out,
400             "};\n"
401             "\n"
402             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
403             for (k = 0; k < 4; k++) {
404                 fprintf(out, "   {");
405                 write_table32hi(out, big[k], 256);
406                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
407             }
408             fprintf(out,
409             "};\n"
410             "\n"
411             "#endif\n"
412             "\n"
413             "#endif\n");
414         }
415         fprintf(out,
416             "\n"
417             "#endif\n");
418 
419         /* write out zeros operator table to crc32.h */
420         fprintf(out,
421             "\n"
422             "local const z_crc_t FAR x2n_table[] = {\n"
423             "    ");
424         write_table(out, x2n_table, 32);
425         fprintf(out,
426             "};\n");
427         fclose(out);
428     }
429 #endif /* MAKECRCH */
430 }
431 
432 #ifdef MAKECRCH
433 
434 /*
435    Write the 32-bit values in table[0..k-1] to out, five per line in
436    hexadecimal separated by commas.
437  */
438 local void write_table(out, table, k)
439     FILE *out;
440     const z_crc_t FAR *table;
441     int k;
442 {
443     int n;
444 
445     for (n = 0; n < k; n++)
446         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
447                 (unsigned long)(table[n]),
448                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
449 }
450 
451 /*
452    Write the high 32-bits of each value in table[0..k-1] to out, five per line
453    in hexadecimal separated by commas.
454  */
455 local void write_table32hi(out, table, k)
456 FILE *out;
457 const z_word_t FAR *table;
458 int k;
459 {
460     int n;
461 
462     for (n = 0; n < k; n++)
463         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
464                 (unsigned long)(table[n] >> 32),
465                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
466 }
467 
468 /*
469   Write the 64-bit values in table[0..k-1] to out, three per line in
470   hexadecimal separated by commas. This assumes that if there is a 64-bit
471   type, then there is also a long long integer type, and it is at least 64
472   bits. If not, then the type cast and format string can be adjusted
473   accordingly.
474  */
475 local void write_table64(out, table, k)
476     FILE *out;
477     const z_word_t FAR *table;
478     int k;
479 {
480     int n;
481 
482     for (n = 0; n < k; n++)
483         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
484                 (unsigned long long)(table[n]),
485                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
486 }
487 
488 /* Actually do the deed. */
489 int main()
490 {
491     make_crc_table();
492     return 0;
493 }
494 
495 #endif /* MAKECRCH */
496 
497 #ifdef W
498 /*
499   Generate the little and big-endian braid tables for the given n and z_word_t
500   size w. Each array must have room for w blocks of 256 elements.
501  */
502 local void braid(ltl, big, n, w)
503     z_crc_t ltl[][256];
504     z_word_t big[][256];
505     int n;
506     int w;
507 {
508     int k;
509     z_crc_t i, p, q;
510     for (k = 0; k < w; k++) {
511         p = x2nmodp((n * w + 3 - k) << 3, 0);
512         ltl[k][0] = 0;
513         big[w - 1 - k][0] = 0;
514         for (i = 1; i < 256; i++) {
515             ltl[k][i] = q = multmodp(i << 24, p);
516             big[w - 1 - k][i] = byte_swap(q);
517         }
518     }
519 }
520 #endif
521 
522 #else /* !DYNAMIC_CRC_TABLE */
523 /* ========================================================================
524  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
525  * of x for combining CRC-32s, all made by make_crc_table().
526  */
527 #include "crc32.h"
528 #endif /* DYNAMIC_CRC_TABLE */
529 
530 /* ========================================================================
531  * Routines used for CRC calculation. Some are also required for the table
532  * generation above.
533  */
534 
535 /*
536   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
537   reflected. For speed, this requires that a not be zero.
538  */
539 local z_crc_t multmodp(z_crc_t a, z_crc_t b)
540 {
541     z_crc_t m, p;
542 
543     m = (z_crc_t)1 << 31;
544     p = 0;
545     for (;;) {
546         if (a & m) {
547             p ^= b;
548             if ((a & (m - 1)) == 0)
549                 break;
550         }
551         m >>= 1;
552         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
553     }
554     return p;
555 }
556 
557 /*
558   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
559   initialized.
560  */
561 local z_crc_t x2nmodp(z_off64_t n, unsigned k)
562 {
563     z_crc_t p;
564 
565     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
566     while (n) {
567         if (n & 1)
568             p = multmodp(x2n_table[k & 31], p);
569         n >>= 1;
570         k++;
571     }
572     return p;
573 }
574 
575 /* =========================================================================
576  * This function can be used by asm versions of crc32(), and to force the
577  * generation of the CRC tables in a threaded application.
578  */
579 const z_crc_t FAR * ZEXPORT get_crc_table(void)
580 {
581 #ifdef DYNAMIC_CRC_TABLE
582     once(&made, make_crc_table);
583 #endif /* DYNAMIC_CRC_TABLE */
584     return (const z_crc_t FAR *)crc_table;
585 }
586 
587 /* =========================================================================
588  * Use ARM machine instructions if available. This will compute the CRC about
589  * ten times faster than the braided calculation. This code does not check for
590  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
591  * only be defined if the compilation specifies an ARM processor architecture
592  * that has the instructions. For example, compiling with -march=armv8.1-a or
593  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
594  * instructions.
595  */
596 #ifdef ARMCRC32
597 
598 /*
599    Constants empirically determined to maximize speed. These values are from
600    measurements on a Cortex-A57. Your mileage may vary.
601  */
602 #define Z_BATCH 3990                /* number of words in a batch */
603 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
604 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
605 
606 unsigned long ZEXPORT crc32_z(crc, buf, len)
607     unsigned long crc;
608     const unsigned char FAR *buf;
609     z_size_t len;
610 {
611     z_crc_t val;
612     z_word_t crc1, crc2;
613     const z_word_t *word;
614     z_word_t val0, val1, val2;
615     z_size_t last, last2, i;
616     z_size_t num;
617 
618     /* Return initial CRC, if requested. */
619     if (buf == Z_NULL) return 0;
620 
621 #ifdef DYNAMIC_CRC_TABLE
622     once(&made, make_crc_table);
623 #endif /* DYNAMIC_CRC_TABLE */
624 
625     /* Pre-condition the CRC */
626     crc = (~crc) & 0xffffffff;
627 
628     /* Compute the CRC up to a word boundary. */
629     while (len && ((z_size_t)buf & 7) != 0) {
630         len--;
631         val = *buf++;
632         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
633     }
634 
635     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
636     word = (z_word_t const *)buf;
637     num = len >> 3;
638     len &= 7;
639 
640     /* Do three interleaved CRCs to realize the throughput of one crc32x
641        instruction per cycle. Each CRC is calcuated on Z_BATCH words. The three
642        CRCs are combined into a single CRC after each set of batches. */
643     while (num >= 3 * Z_BATCH) {
644         crc1 = 0;
645         crc2 = 0;
646         for (i = 0; i < Z_BATCH; i++) {
647             val0 = word[i];
648             val1 = word[i + Z_BATCH];
649             val2 = word[i + 2 * Z_BATCH];
650             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
651             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
652             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
653         }
654         word += 3 * Z_BATCH;
655         num -= 3 * Z_BATCH;
656         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
657         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
658     }
659 
660     /* Do one last smaller batch with the remaining words, if there are enough
661        to pay for the combination of CRCs. */
662     last = num / 3;
663     if (last >= Z_BATCH_MIN) {
664         last2 = last << 1;
665         crc1 = 0;
666         crc2 = 0;
667         for (i = 0; i < last; i++) {
668             val0 = word[i];
669             val1 = word[i + last];
670             val2 = word[i + last2];
671             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
672             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
673             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
674         }
675         word += 3 * last;
676         num -= 3 * last;
677         val = x2nmodp(last, 6);
678         crc = multmodp(val, crc) ^ crc1;
679         crc = multmodp(val, crc) ^ crc2;
680     }
681 
682     /* Compute the CRC on any remaining words. */
683     for (i = 0; i < num; i++) {
684         val0 = word[i];
685         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
686     }
687     word += num;
688 
689     /* Complete the CRC on any remaining bytes. */
690     buf = (const unsigned char FAR *)word;
691     while (len) {
692         len--;
693         val = *buf++;
694         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
695     }
696 
697     /* Return the CRC, post-conditioned. */
698     return crc ^ 0xffffffff;
699 }
700 
701 #else
702 
703 #ifdef W
704 
705 /*
706   Return the CRC of the W bytes in the word_t data, taking the
707   least-significant byte of the word as the first byte of data, without any pre
708   or post conditioning. This is used to combine the CRCs of each braid.
709  */
710 local z_crc_t crc_word(z_word_t data)
711 {
712     int k;
713     for (k = 0; k < W; k++)
714         data = (data >> 8) ^ crc_table[data & 0xff];
715     return (z_crc_t)data;
716 }
717 
718 local z_word_t crc_word_big(z_word_t data)
719 {
720     int k;
721     for (k = 0; k < W; k++)
722         data = (data << 8) ^
723             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
724     return data;
725 }
726 
727 #endif
728 
729 /* ========================================================================= */
730 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
731     z_size_t len)
732 {
733     /* Return initial CRC, if requested. */
734     if (buf == Z_NULL) return 0;
735 
736 #ifdef DYNAMIC_CRC_TABLE
737     once(&made, make_crc_table);
738 #endif /* DYNAMIC_CRC_TABLE */
739 
740     /* Pre-condition the CRC */
741     crc = (~crc) & 0xffffffff;
742 
743 #ifdef W
744 
745     /* If provided enough bytes, do a braided CRC calculation. */
746     if (len >= N * W + W - 1) {
747         z_size_t blks;
748         z_word_t const *words;
749         unsigned endian;
750         int k;
751 
752         /* Compute the CRC up to a z_word_t boundary. */
753         while (len && ((z_size_t)buf & (W - 1)) != 0) {
754             len--;
755             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
756         }
757 
758         /* Compute the CRC on as many N z_word_t blocks as are available. */
759         blks = len / (N * W);
760         len -= blks * N * W;
761         words = (z_word_t const *)buf;
762 
763         /* Do endian check at execution time instead of compile time, since ARM
764            processors can change the endianess at execution time. If the
765            compiler knows what the endianess will be, it can optimize out the
766            check and the unused branch. */
767         endian = 1;
768         if (*(unsigned char *)&endian) {
769             /* Little endian. */
770 
771             z_crc_t crc0;
772             z_word_t word0;
773 #if N > 1
774             z_crc_t crc1;
775             z_word_t word1;
776 #if N > 2
777             z_crc_t crc2;
778             z_word_t word2;
779 #if N > 3
780             z_crc_t crc3;
781             z_word_t word3;
782 #if N > 4
783             z_crc_t crc4;
784             z_word_t word4;
785 #if N > 5
786             z_crc_t crc5;
787             z_word_t word5;
788 #endif
789 #endif
790 #endif
791 #endif
792 #endif
793 
794             /* Initialize the CRC for each braid. */
795             crc0 = crc;
796 #if N > 1
797             crc1 = 0;
798 #if N > 2
799             crc2 = 0;
800 #if N > 3
801             crc3 = 0;
802 #if N > 4
803             crc4 = 0;
804 #if N > 5
805             crc5 = 0;
806 #endif
807 #endif
808 #endif
809 #endif
810 #endif
811 
812             /*
813               Process the first blks-1 blocks, computing the CRCs on each braid
814               independently.
815              */
816             while (--blks) {
817                 /* Load the word for each braid into registers. */
818                 word0 = crc0 ^ words[0];
819 #if N > 1
820                 word1 = crc1 ^ words[1];
821 #if N > 2
822                 word2 = crc2 ^ words[2];
823 #if N > 3
824                 word3 = crc3 ^ words[3];
825 #if N > 4
826                 word4 = crc4 ^ words[4];
827 #if N > 5
828                 word5 = crc5 ^ words[5];
829 #endif
830 #endif
831 #endif
832 #endif
833 #endif
834                 words += N;
835 
836                 /* Compute and update the CRC for each word. The loop should
837                    get unrolled. */
838                 crc0 = crc_braid_table[0][word0 & 0xff];
839 #if N > 1
840                 crc1 = crc_braid_table[0][word1 & 0xff];
841 #if N > 2
842                 crc2 = crc_braid_table[0][word2 & 0xff];
843 #if N > 3
844                 crc3 = crc_braid_table[0][word3 & 0xff];
845 #if N > 4
846                 crc4 = crc_braid_table[0][word4 & 0xff];
847 #if N > 5
848                 crc5 = crc_braid_table[0][word5 & 0xff];
849 #endif
850 #endif
851 #endif
852 #endif
853 #endif
854                 for (k = 1; k < W; k++) {
855                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
856 #if N > 1
857                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
858 #if N > 2
859                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
860 #if N > 3
861                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
862 #if N > 4
863                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
864 #if N > 5
865                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
866 #endif
867 #endif
868 #endif
869 #endif
870 #endif
871                 }
872             }
873 
874             /*
875               Process the last block, combining the CRCs of the N braids at the
876               same time.
877              */
878             crc = crc_word(crc0 ^ words[0]);
879 #if N > 1
880             crc = crc_word(crc1 ^ words[1] ^ crc);
881 #if N > 2
882             crc = crc_word(crc2 ^ words[2] ^ crc);
883 #if N > 3
884             crc = crc_word(crc3 ^ words[3] ^ crc);
885 #if N > 4
886             crc = crc_word(crc4 ^ words[4] ^ crc);
887 #if N > 5
888             crc = crc_word(crc5 ^ words[5] ^ crc);
889 #endif
890 #endif
891 #endif
892 #endif
893 #endif
894             words += N;
895         }
896         else {
897             /* Big endian. */
898 
899             z_word_t crc0, word0, comb;
900 #if N > 1
901             z_word_t crc1, word1;
902 #if N > 2
903             z_word_t crc2, word2;
904 #if N > 3
905             z_word_t crc3, word3;
906 #if N > 4
907             z_word_t crc4, word4;
908 #if N > 5
909             z_word_t crc5, word5;
910 #endif
911 #endif
912 #endif
913 #endif
914 #endif
915 
916             /* Initialize the CRC for each braid. */
917             crc0 = byte_swap(crc);
918 #if N > 1
919             crc1 = 0;
920 #if N > 2
921             crc2 = 0;
922 #if N > 3
923             crc3 = 0;
924 #if N > 4
925             crc4 = 0;
926 #if N > 5
927             crc5 = 0;
928 #endif
929 #endif
930 #endif
931 #endif
932 #endif
933 
934             /*
935               Process the first blks-1 blocks, computing the CRCs on each braid
936               independently.
937              */
938             while (--blks) {
939                 /* Load the word for each braid into registers. */
940                 word0 = crc0 ^ words[0];
941 #if N > 1
942                 word1 = crc1 ^ words[1];
943 #if N > 2
944                 word2 = crc2 ^ words[2];
945 #if N > 3
946                 word3 = crc3 ^ words[3];
947 #if N > 4
948                 word4 = crc4 ^ words[4];
949 #if N > 5
950                 word5 = crc5 ^ words[5];
951 #endif
952 #endif
953 #endif
954 #endif
955 #endif
956                 words += N;
957 
958                 /* Compute and update the CRC for each word. The loop should
959                    get unrolled. */
960                 crc0 = crc_braid_big_table[0][word0 & 0xff];
961 #if N > 1
962                 crc1 = crc_braid_big_table[0][word1 & 0xff];
963 #if N > 2
964                 crc2 = crc_braid_big_table[0][word2 & 0xff];
965 #if N > 3
966                 crc3 = crc_braid_big_table[0][word3 & 0xff];
967 #if N > 4
968                 crc4 = crc_braid_big_table[0][word4 & 0xff];
969 #if N > 5
970                 crc5 = crc_braid_big_table[0][word5 & 0xff];
971 #endif
972 #endif
973 #endif
974 #endif
975 #endif
976                 for (k = 1; k < W; k++) {
977                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
978 #if N > 1
979                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
980 #if N > 2
981                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
982 #if N > 3
983                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
984 #if N > 4
985                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
986 #if N > 5
987                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
988 #endif
989 #endif
990 #endif
991 #endif
992 #endif
993                 }
994             }
995 
996             /*
997               Process the last block, combining the CRCs of the N braids at the
998               same time.
999              */
1000             comb = crc_word_big(crc0 ^ words[0]);
1001 #if N > 1
1002             comb = crc_word_big(crc1 ^ words[1] ^ comb);
1003 #if N > 2
1004             comb = crc_word_big(crc2 ^ words[2] ^ comb);
1005 #if N > 3
1006             comb = crc_word_big(crc3 ^ words[3] ^ comb);
1007 #if N > 4
1008             comb = crc_word_big(crc4 ^ words[4] ^ comb);
1009 #if N > 5
1010             comb = crc_word_big(crc5 ^ words[5] ^ comb);
1011 #endif
1012 #endif
1013 #endif
1014 #endif
1015 #endif
1016             words += N;
1017             crc = byte_swap(comb);
1018         }
1019 
1020         /*
1021           Update the pointer to the remaining bytes to process.
1022          */
1023         buf = (unsigned char const *)words;
1024     }
1025 
1026 #endif /* W */
1027 
1028     /* Complete the computation of the CRC on any remaining bytes. */
1029     while (len >= 8) {
1030         len -= 8;
1031         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1032         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1033         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1034         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1035         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1036         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1037         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1038         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1039     }
1040     while (len) {
1041         len--;
1042         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1043     }
1044 
1045     /* Return the CRC, post-conditioned. */
1046     return crc ^ 0xffffffff;
1047 }
1048 
1049 #endif
1050 
1051 /* ========================================================================= */
1052 unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
1053     uInt len)
1054 {
1055     return crc32_z(crc, buf, len);
1056 }
1057 
1058 /* ========================================================================= */
1059 uLong ZEXPORT crc32_combine64(uLong crc1, uLong crc2, z_off64_t len2)
1060 {
1061 #ifdef DYNAMIC_CRC_TABLE
1062     once(&made, make_crc_table);
1063 #endif /* DYNAMIC_CRC_TABLE */
1064     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1065 }
1066 
1067 /* ========================================================================= */
1068 uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2)
1069 {
1070     return crc32_combine64(crc1, crc2, len2);
1071 }
1072 
1073 /* ========================================================================= */
1074 uLong ZEXPORT crc32_combine_gen64(z_off64_t len2)
1075 {
1076 #ifdef DYNAMIC_CRC_TABLE
1077     once(&made, make_crc_table);
1078 #endif /* DYNAMIC_CRC_TABLE */
1079     return x2nmodp(len2, 3);
1080 }
1081 
1082 /* ========================================================================= */
1083 uLong ZEXPORT crc32_combine_gen(z_off_t len2)
1084 {
1085     return crc32_combine_gen64(len2);
1086 }
1087 
1088 /* ========================================================================= */
1089 uLong crc32_combine_op(uLong crc1, uLong crc2, uLong op)
1090 {
1091     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1092 }
1093