xref: /freebsd/sys/contrib/zlib/crc32.c (revision 7543a9c0280a0f4262489671936a6e03b9b2c563)
1 /* crc32.c -- compute the CRC-32 of a data stream
2  * Copyright (C) 1995-2022 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  *
5  * This interleaved implementation of a CRC makes use of pipelined multiple
6  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8  */
9 
10 /* @(#) $Id$ */
11 
12 /*
13   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
14   protection on the static variables used to control the first-use generation
15   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
16   first call get_crc_table() to initialize the tables before allowing more than
17   one thread to use crc32().
18 
19   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
20   produced, so that this one source file can be compiled to an executable.
21  */
22 
23 #ifdef MAKECRCH
24 #  include <stdio.h>
25 #  ifndef DYNAMIC_CRC_TABLE
26 #    define DYNAMIC_CRC_TABLE
27 #  endif /* !DYNAMIC_CRC_TABLE */
28 #endif /* MAKECRCH */
29 
30 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
31 
32  /*
33   A CRC of a message is computed on N braids of words in the message, where
34   each word consists of W bytes (4 or 8). If N is 3, for example, then three
35   running sparse CRCs are calculated respectively on each braid, at these
36   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
37   This is done starting at a word boundary, and continues until as many blocks
38   of N * W bytes as are available have been processed. The results are combined
39   into a single CRC at the end. For this code, N must be in the range 1..6 and
40   W must be 4 or 8. The upper limit on N can be increased if desired by adding
41   more #if blocks, extending the patterns apparent in the code. In addition,
42   crc32.h would need to be regenerated, if the maximum N value is increased.
43 
44   N and W are chosen empirically by benchmarking the execution time on a given
45   processor. The choices for N and W below were based on testing on Intel Kaby
46   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
47   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
48   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
49   They were all tested with either gcc or clang, all using the -O3 optimization
50   level. Your mileage may vary.
51  */
52 
53 /* Define N */
54 #ifdef Z_TESTN
55 #  define N Z_TESTN
56 #else
57 #  define N 5
58 #endif
59 #if N < 1 || N > 6
60 #  error N must be in 1..6
61 #endif
62 
63 /*
64   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
65   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
66   that bytes are eight bits.
67  */
68 
69 /*
70   Define W and the associated z_word_t type. If W is not defined, then a
71   braided calculation is not used, and the associated tables and code are not
72   compiled.
73  */
74 #ifdef Z_TESTW
75 #  if Z_TESTW-1 != -1
76 #    define W Z_TESTW
77 #  endif
78 #else
79 #  ifdef MAKECRCH
80 #    define W 8         /* required for MAKECRCH */
81 #  else
82 #    if defined(__x86_64__) || defined(__aarch64__)
83 #      define W 8
84 #    else
85 #      define W 4
86 #    endif
87 #  endif
88 #endif
89 #ifdef W
90 #  if W == 8 && defined(Z_U8)
91      typedef Z_U8 z_word_t;
92 #  elif defined(Z_U4)
93 #    undef W
94 #    define W 4
95      typedef Z_U4 z_word_t;
96 #  else
97 #    undef W
98 #  endif
99 #endif
100 
101 /* Local functions. */
102 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
103 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
104 
105 /* If available, use the ARM processor CRC32 instruction. */
106 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
107 #  define ARMCRC32
108 #endif
109 
110 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
111 /*
112   Swap the bytes in a z_word_t to convert between little and big endian. Any
113   self-respecting compiler will optimize this to a single machine byte-swap
114   instruction, if one is available. This assumes that word_t is either 32 bits
115   or 64 bits.
116  */
117 local z_word_t byte_swap OF((z_word_t word));
118 
119 local z_word_t byte_swap(word)
120     z_word_t word;
121 {
122 #  if W == 8
123     return
124         (word & 0xff00000000000000) >> 56 |
125         (word & 0xff000000000000) >> 40 |
126         (word & 0xff0000000000) >> 24 |
127         (word & 0xff00000000) >> 8 |
128         (word & 0xff000000) << 8 |
129         (word & 0xff0000) << 24 |
130         (word & 0xff00) << 40 |
131         (word & 0xff) << 56;
132 #  else   /* W == 4 */
133     return
134         (word & 0xff000000) >> 24 |
135         (word & 0xff0000) >> 8 |
136         (word & 0xff00) << 8 |
137         (word & 0xff) << 24;
138 #  endif
139 }
140 #endif
141 
142 /* CRC polynomial. */
143 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
144 
145 #ifdef DYNAMIC_CRC_TABLE
146 
147 local z_crc_t FAR crc_table[256];
148 local z_crc_t FAR x2n_table[32];
149 local void make_crc_table OF((void));
150 #ifdef W
151    local z_word_t FAR crc_big_table[256];
152    local z_crc_t FAR crc_braid_table[W][256];
153    local z_word_t FAR crc_braid_big_table[W][256];
154    local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
155 #endif
156 #ifdef MAKECRCH
157    local void write_table OF((FILE *, const z_crc_t FAR *, int));
158    local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
159    local void write_table64 OF((FILE *, const z_word_t FAR *, int));
160 #endif /* MAKECRCH */
161 
162 /*
163   Define a once() function depending on the availability of atomics. If this is
164   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
165   multiple threads, and if atomics are not available, then get_crc_table() must
166   be called to initialize the tables and must return before any threads are
167   allowed to compute or combine CRCs.
168  */
169 
170 /* Definition of once functionality. */
171 typedef struct once_s once_t;
172 local void once OF((once_t *, void (*)(void)));
173 
174 /* Check for the availability of atomics. */
175 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
176     !defined(__STDC_NO_ATOMICS__)
177 
178 #include <stdatomic.h>
179 
180 /* Structure for once(), which must be initialized with ONCE_INIT. */
181 struct once_s {
182     atomic_flag begun;
183     atomic_int done;
184 };
185 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
186 
187 /*
188   Run the provided init() function exactly once, even if multiple threads
189   invoke once() at the same time. The state must be a once_t initialized with
190   ONCE_INIT.
191  */
192 local void once(state, init)
193     once_t *state;
194     void (*init)(void);
195 {
196     if (!atomic_load(&state->done)) {
197         if (atomic_flag_test_and_set(&state->begun))
198             while (!atomic_load(&state->done))
199                 ;
200         else {
201             init();
202             atomic_store(&state->done, 1);
203         }
204     }
205 }
206 
207 #else   /* no atomics */
208 
209 /* Structure for once(), which must be initialized with ONCE_INIT. */
210 struct once_s {
211     volatile int begun;
212     volatile int done;
213 };
214 #define ONCE_INIT {0, 0}
215 
216 /* Test and set. Alas, not atomic, but tries to minimize the period of
217    vulnerability. */
218 local int test_and_set OF((int volatile *));
219 local int test_and_set(flag)
220     int volatile *flag;
221 {
222     int was;
223 
224     was = *flag;
225     *flag = 1;
226     return was;
227 }
228 
229 /* Run the provided init() function once. This is not thread-safe. */
230 local void once(state, init)
231     once_t *state;
232     void (*init)(void);
233 {
234     if (!state->done) {
235         if (test_and_set(&state->begun))
236             while (!state->done)
237                 ;
238         else {
239             init();
240             state->done = 1;
241         }
242     }
243 }
244 
245 #endif
246 
247 /* State for once(). */
248 local once_t made = ONCE_INIT;
249 
250 /*
251   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
252   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
253 
254   Polynomials over GF(2) are represented in binary, one bit per coefficient,
255   with the lowest powers in the most significant bit. Then adding polynomials
256   is just exclusive-or, and multiplying a polynomial by x is a right shift by
257   one. If we call the above polynomial p, and represent a byte as the
258   polynomial q, also with the lowest power in the most significant bit (so the
259   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
260   where a mod b means the remainder after dividing a by b.
261 
262   This calculation is done using the shift-register method of multiplying and
263   taking the remainder. The register is initialized to zero, and for each
264   incoming bit, x^32 is added mod p to the register if the bit is a one (where
265   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
266   (which is shifting right by one and adding x^32 mod p if the bit shifted out
267   is a one). We start with the highest power (least significant bit) of q and
268   repeat for all eight bits of q.
269 
270   The table is simply the CRC of all possible eight bit values. This is all the
271   information needed to generate CRCs on data a byte at a time for all
272   combinations of CRC register values and incoming bytes.
273  */
274 
275 local void make_crc_table()
276 {
277     unsigned i, j, n;
278     z_crc_t p;
279 
280     /* initialize the CRC of bytes tables */
281     for (i = 0; i < 256; i++) {
282         p = i;
283         for (j = 0; j < 8; j++)
284             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
285         crc_table[i] = p;
286 #ifdef W
287         crc_big_table[i] = byte_swap(p);
288 #endif
289     }
290 
291     /* initialize the x^2^n mod p(x) table */
292     p = (z_crc_t)1 << 30;         /* x^1 */
293     x2n_table[0] = p;
294     for (n = 1; n < 32; n++)
295         x2n_table[n] = p = multmodp(p, p);
296 
297 #ifdef W
298     /* initialize the braiding tables -- needs x2n_table[] */
299     braid(crc_braid_table, crc_braid_big_table, N, W);
300 #endif
301 
302 #ifdef MAKECRCH
303     {
304         /*
305           The crc32.h header file contains tables for both 32-bit and 64-bit
306           z_word_t's, and so requires a 64-bit type be available. In that case,
307           z_word_t must be defined to be 64-bits. This code then also generates
308           and writes out the tables for the case that z_word_t is 32 bits.
309          */
310 #if !defined(W) || W != 8
311 #  error Need a 64-bit integer type in order to generate crc32.h.
312 #endif
313         FILE *out;
314         int k, n;
315         z_crc_t ltl[8][256];
316         z_word_t big[8][256];
317 
318         out = fopen("crc32.h", "w");
319         if (out == NULL) return;
320 
321         /* write out little-endian CRC table to crc32.h */
322         fprintf(out,
323             "/* crc32.h -- tables for rapid CRC calculation\n"
324             " * Generated automatically by crc32.c\n */\n"
325             "\n"
326             "local const z_crc_t FAR crc_table[] = {\n"
327             "    ");
328         write_table(out, crc_table, 256);
329         fprintf(out,
330             "};\n");
331 
332         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
333         fprintf(out,
334             "\n"
335             "#ifdef W\n"
336             "\n"
337             "#if W == 8\n"
338             "\n"
339             "local const z_word_t FAR crc_big_table[] = {\n"
340             "    ");
341         write_table64(out, crc_big_table, 256);
342         fprintf(out,
343             "};\n");
344 
345         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
346         fprintf(out,
347             "\n"
348             "#else /* W == 4 */\n"
349             "\n"
350             "local const z_word_t FAR crc_big_table[] = {\n"
351             "    ");
352         write_table32hi(out, crc_big_table, 256);
353         fprintf(out,
354             "};\n"
355             "\n"
356             "#endif\n");
357 
358         /* write out braid tables for each value of N */
359         for (n = 1; n <= 6; n++) {
360             fprintf(out,
361             "\n"
362             "#if N == %d\n", n);
363 
364             /* compute braid tables for this N and 64-bit word_t */
365             braid(ltl, big, n, 8);
366 
367             /* write out braid tables for 64-bit z_word_t to crc32.h */
368             fprintf(out,
369             "\n"
370             "#if W == 8\n"
371             "\n"
372             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
373             for (k = 0; k < 8; k++) {
374                 fprintf(out, "   {");
375                 write_table(out, ltl[k], 256);
376                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
377             }
378             fprintf(out,
379             "};\n"
380             "\n"
381             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
382             for (k = 0; k < 8; k++) {
383                 fprintf(out, "   {");
384                 write_table64(out, big[k], 256);
385                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
386             }
387             fprintf(out,
388             "};\n");
389 
390             /* compute braid tables for this N and 32-bit word_t */
391             braid(ltl, big, n, 4);
392 
393             /* write out braid tables for 32-bit z_word_t to crc32.h */
394             fprintf(out,
395             "\n"
396             "#else /* W == 4 */\n"
397             "\n"
398             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
399             for (k = 0; k < 4; k++) {
400                 fprintf(out, "   {");
401                 write_table(out, ltl[k], 256);
402                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
403             }
404             fprintf(out,
405             "};\n"
406             "\n"
407             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
408             for (k = 0; k < 4; k++) {
409                 fprintf(out, "   {");
410                 write_table32hi(out, big[k], 256);
411                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
412             }
413             fprintf(out,
414             "};\n"
415             "\n"
416             "#endif\n"
417             "\n"
418             "#endif\n");
419         }
420         fprintf(out,
421             "\n"
422             "#endif\n");
423 
424         /* write out zeros operator table to crc32.h */
425         fprintf(out,
426             "\n"
427             "local const z_crc_t FAR x2n_table[] = {\n"
428             "    ");
429         write_table(out, x2n_table, 32);
430         fprintf(out,
431             "};\n");
432         fclose(out);
433     }
434 #endif /* MAKECRCH */
435 }
436 
437 #ifdef MAKECRCH
438 
439 /*
440    Write the 32-bit values in table[0..k-1] to out, five per line in
441    hexadecimal separated by commas.
442  */
443 local void write_table(out, table, k)
444     FILE *out;
445     const z_crc_t FAR *table;
446     int k;
447 {
448     int n;
449 
450     for (n = 0; n < k; n++)
451         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
452                 (unsigned long)(table[n]),
453                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
454 }
455 
456 /*
457    Write the high 32-bits of each value in table[0..k-1] to out, five per line
458    in hexadecimal separated by commas.
459  */
460 local void write_table32hi(out, table, k)
461 FILE *out;
462 const z_word_t FAR *table;
463 int k;
464 {
465     int n;
466 
467     for (n = 0; n < k; n++)
468         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
469                 (unsigned long)(table[n] >> 32),
470                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
471 }
472 
473 /*
474   Write the 64-bit values in table[0..k-1] to out, three per line in
475   hexadecimal separated by commas. This assumes that if there is a 64-bit
476   type, then there is also a long long integer type, and it is at least 64
477   bits. If not, then the type cast and format string can be adjusted
478   accordingly.
479  */
480 local void write_table64(out, table, k)
481     FILE *out;
482     const z_word_t FAR *table;
483     int k;
484 {
485     int n;
486 
487     for (n = 0; n < k; n++)
488         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
489                 (unsigned long long)(table[n]),
490                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
491 }
492 
493 /* Actually do the deed. */
494 int main()
495 {
496     make_crc_table();
497     return 0;
498 }
499 
500 #endif /* MAKECRCH */
501 
502 #ifdef W
503 /*
504   Generate the little and big-endian braid tables for the given n and z_word_t
505   size w. Each array must have room for w blocks of 256 elements.
506  */
507 local void braid(ltl, big, n, w)
508     z_crc_t ltl[][256];
509     z_word_t big[][256];
510     int n;
511     int w;
512 {
513     int k;
514     z_crc_t i, p, q;
515     for (k = 0; k < w; k++) {
516         p = x2nmodp((n * w + 3 - k) << 3, 0);
517         ltl[k][0] = 0;
518         big[w - 1 - k][0] = 0;
519         for (i = 1; i < 256; i++) {
520             ltl[k][i] = q = multmodp(i << 24, p);
521             big[w - 1 - k][i] = byte_swap(q);
522         }
523     }
524 }
525 #endif
526 
527 #else /* !DYNAMIC_CRC_TABLE */
528 /* ========================================================================
529  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
530  * of x for combining CRC-32s, all made by make_crc_table().
531  */
532 #include "crc32.h"
533 #endif /* DYNAMIC_CRC_TABLE */
534 
535 /* ========================================================================
536  * Routines used for CRC calculation. Some are also required for the table
537  * generation above.
538  */
539 
540 /*
541   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
542   reflected. For speed, this requires that a not be zero.
543  */
544 local z_crc_t multmodp(a, b)
545     z_crc_t a;
546     z_crc_t b;
547 {
548     z_crc_t m, p;
549 
550     m = (z_crc_t)1 << 31;
551     p = 0;
552     for (;;) {
553         if (a & m) {
554             p ^= b;
555             if ((a & (m - 1)) == 0)
556                 break;
557         }
558         m >>= 1;
559         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
560     }
561     return p;
562 }
563 
564 /*
565   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
566   initialized.
567  */
568 local z_crc_t x2nmodp(n, k)
569     z_off64_t n;
570     unsigned k;
571 {
572     z_crc_t p;
573 
574     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
575     while (n) {
576         if (n & 1)
577             p = multmodp(x2n_table[k & 31], p);
578         n >>= 1;
579         k++;
580     }
581     return p;
582 }
583 
584 /* =========================================================================
585  * This function can be used by asm versions of crc32(), and to force the
586  * generation of the CRC tables in a threaded application.
587  */
588 const z_crc_t FAR * ZEXPORT get_crc_table()
589 {
590 #ifdef DYNAMIC_CRC_TABLE
591     once(&made, make_crc_table);
592 #endif /* DYNAMIC_CRC_TABLE */
593     return (const z_crc_t FAR *)crc_table;
594 }
595 
596 /* =========================================================================
597  * Use ARM machine instructions if available. This will compute the CRC about
598  * ten times faster than the braided calculation. This code does not check for
599  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
600  * only be defined if the compilation specifies an ARM processor architecture
601  * that has the instructions. For example, compiling with -march=armv8.1-a or
602  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
603  * instructions.
604  */
605 #ifdef ARMCRC32
606 
607 /*
608    Constants empirically determined to maximize speed. These values are from
609    measurements on a Cortex-A57. Your mileage may vary.
610  */
611 #define Z_BATCH 3990                /* number of words in a batch */
612 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
613 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
614 
615 unsigned long ZEXPORT crc32_z(crc, buf, len)
616     unsigned long crc;
617     const unsigned char FAR *buf;
618     z_size_t len;
619 {
620     z_crc_t val;
621     z_word_t crc1, crc2;
622     const z_word_t *word;
623     z_word_t val0, val1, val2;
624     z_size_t last, last2, i;
625     z_size_t num;
626 
627     /* Return initial CRC, if requested. */
628     if (buf == Z_NULL) return 0;
629 
630 #ifdef DYNAMIC_CRC_TABLE
631     once(&made, make_crc_table);
632 #endif /* DYNAMIC_CRC_TABLE */
633 
634     /* Pre-condition the CRC */
635     crc = (~crc) & 0xffffffff;
636 
637     /* Compute the CRC up to a word boundary. */
638     while (len && ((z_size_t)buf & 7) != 0) {
639         len--;
640         val = *buf++;
641         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
642     }
643 
644     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
645     word = (z_word_t const *)buf;
646     num = len >> 3;
647     len &= 7;
648 
649     /* Do three interleaved CRCs to realize the throughput of one crc32x
650        instruction per cycle. Each CRC is calcuated on Z_BATCH words. The three
651        CRCs are combined into a single CRC after each set of batches. */
652     while (num >= 3 * Z_BATCH) {
653         crc1 = 0;
654         crc2 = 0;
655         for (i = 0; i < Z_BATCH; i++) {
656             val0 = word[i];
657             val1 = word[i + Z_BATCH];
658             val2 = word[i + 2 * Z_BATCH];
659             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
660             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
661             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
662         }
663         word += 3 * Z_BATCH;
664         num -= 3 * Z_BATCH;
665         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
666         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
667     }
668 
669     /* Do one last smaller batch with the remaining words, if there are enough
670        to pay for the combination of CRCs. */
671     last = num / 3;
672     if (last >= Z_BATCH_MIN) {
673         last2 = last << 1;
674         crc1 = 0;
675         crc2 = 0;
676         for (i = 0; i < last; i++) {
677             val0 = word[i];
678             val1 = word[i + last];
679             val2 = word[i + last2];
680             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
681             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
682             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
683         }
684         word += 3 * last;
685         num -= 3 * last;
686         val = x2nmodp(last, 6);
687         crc = multmodp(val, crc) ^ crc1;
688         crc = multmodp(val, crc) ^ crc2;
689     }
690 
691     /* Compute the CRC on any remaining words. */
692     for (i = 0; i < num; i++) {
693         val0 = word[i];
694         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
695     }
696     word += num;
697 
698     /* Complete the CRC on any remaining bytes. */
699     buf = (const unsigned char FAR *)word;
700     while (len) {
701         len--;
702         val = *buf++;
703         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
704     }
705 
706     /* Return the CRC, post-conditioned. */
707     return crc ^ 0xffffffff;
708 }
709 
710 #else
711 
712 #ifdef W
713 
714 /*
715   Return the CRC of the W bytes in the word_t data, taking the
716   least-significant byte of the word as the first byte of data, without any pre
717   or post conditioning. This is used to combine the CRCs of each braid.
718  */
719 local z_crc_t crc_word OF((z_word_t data));
720 
721 local z_crc_t crc_word(data)
722     z_word_t data;
723 {
724     int k;
725     for (k = 0; k < W; k++)
726         data = (data >> 8) ^ crc_table[data & 0xff];
727     return (z_crc_t)data;
728 }
729 
730 local z_word_t crc_word_big OF((z_word_t data));
731 
732 local z_word_t crc_word_big(data)
733     z_word_t data;
734 {
735     int k;
736     for (k = 0; k < W; k++)
737         data = (data << 8) ^
738             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
739     return data;
740 }
741 
742 #endif
743 
744 /* ========================================================================= */
745 unsigned long ZEXPORT crc32_z(crc, buf, len)
746     unsigned long crc;
747     const unsigned char FAR *buf;
748     z_size_t len;
749 {
750     /* Return initial CRC, if requested. */
751     if (buf == Z_NULL) return 0;
752 
753 #ifdef DYNAMIC_CRC_TABLE
754     once(&made, make_crc_table);
755 #endif /* DYNAMIC_CRC_TABLE */
756 
757     /* Pre-condition the CRC */
758     crc = (~crc) & 0xffffffff;
759 
760 #ifdef W
761 
762     /* If provided enough bytes, do a braided CRC calculation. */
763     if (len >= N * W + W - 1) {
764         z_size_t blks;
765         z_word_t const *words;
766         unsigned endian;
767         int k;
768 
769         /* Compute the CRC up to a z_word_t boundary. */
770         while (len && ((z_size_t)buf & (W - 1)) != 0) {
771             len--;
772             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
773         }
774 
775         /* Compute the CRC on as many N z_word_t blocks as are available. */
776         blks = len / (N * W);
777         len -= blks * N * W;
778         words = (z_word_t const *)buf;
779 
780         /* Do endian check at execution time instead of compile time, since ARM
781            processors can change the endianess at execution time. If the
782            compiler knows what the endianess will be, it can optimize out the
783            check and the unused branch. */
784         endian = 1;
785         if (*(unsigned char *)&endian) {
786             /* Little endian. */
787 
788             z_crc_t crc0;
789             z_word_t word0;
790 #if N > 1
791             z_crc_t crc1;
792             z_word_t word1;
793 #if N > 2
794             z_crc_t crc2;
795             z_word_t word2;
796 #if N > 3
797             z_crc_t crc3;
798             z_word_t word3;
799 #if N > 4
800             z_crc_t crc4;
801             z_word_t word4;
802 #if N > 5
803             z_crc_t crc5;
804             z_word_t word5;
805 #endif
806 #endif
807 #endif
808 #endif
809 #endif
810 
811             /* Initialize the CRC for each braid. */
812             crc0 = crc;
813 #if N > 1
814             crc1 = 0;
815 #if N > 2
816             crc2 = 0;
817 #if N > 3
818             crc3 = 0;
819 #if N > 4
820             crc4 = 0;
821 #if N > 5
822             crc5 = 0;
823 #endif
824 #endif
825 #endif
826 #endif
827 #endif
828 
829             /*
830               Process the first blks-1 blocks, computing the CRCs on each braid
831               independently.
832              */
833             while (--blks) {
834                 /* Load the word for each braid into registers. */
835                 word0 = crc0 ^ words[0];
836 #if N > 1
837                 word1 = crc1 ^ words[1];
838 #if N > 2
839                 word2 = crc2 ^ words[2];
840 #if N > 3
841                 word3 = crc3 ^ words[3];
842 #if N > 4
843                 word4 = crc4 ^ words[4];
844 #if N > 5
845                 word5 = crc5 ^ words[5];
846 #endif
847 #endif
848 #endif
849 #endif
850 #endif
851                 words += N;
852 
853                 /* Compute and update the CRC for each word. The loop should
854                    get unrolled. */
855                 crc0 = crc_braid_table[0][word0 & 0xff];
856 #if N > 1
857                 crc1 = crc_braid_table[0][word1 & 0xff];
858 #if N > 2
859                 crc2 = crc_braid_table[0][word2 & 0xff];
860 #if N > 3
861                 crc3 = crc_braid_table[0][word3 & 0xff];
862 #if N > 4
863                 crc4 = crc_braid_table[0][word4 & 0xff];
864 #if N > 5
865                 crc5 = crc_braid_table[0][word5 & 0xff];
866 #endif
867 #endif
868 #endif
869 #endif
870 #endif
871                 for (k = 1; k < W; k++) {
872                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
873 #if N > 1
874                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
875 #if N > 2
876                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
877 #if N > 3
878                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
879 #if N > 4
880                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
881 #if N > 5
882                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
883 #endif
884 #endif
885 #endif
886 #endif
887 #endif
888                 }
889             }
890 
891             /*
892               Process the last block, combining the CRCs of the N braids at the
893               same time.
894              */
895             crc = crc_word(crc0 ^ words[0]);
896 #if N > 1
897             crc = crc_word(crc1 ^ words[1] ^ crc);
898 #if N > 2
899             crc = crc_word(crc2 ^ words[2] ^ crc);
900 #if N > 3
901             crc = crc_word(crc3 ^ words[3] ^ crc);
902 #if N > 4
903             crc = crc_word(crc4 ^ words[4] ^ crc);
904 #if N > 5
905             crc = crc_word(crc5 ^ words[5] ^ crc);
906 #endif
907 #endif
908 #endif
909 #endif
910 #endif
911             words += N;
912         }
913         else {
914             /* Big endian. */
915 
916             z_word_t crc0, word0, comb;
917 #if N > 1
918             z_word_t crc1, word1;
919 #if N > 2
920             z_word_t crc2, word2;
921 #if N > 3
922             z_word_t crc3, word3;
923 #if N > 4
924             z_word_t crc4, word4;
925 #if N > 5
926             z_word_t crc5, word5;
927 #endif
928 #endif
929 #endif
930 #endif
931 #endif
932 
933             /* Initialize the CRC for each braid. */
934             crc0 = byte_swap(crc);
935 #if N > 1
936             crc1 = 0;
937 #if N > 2
938             crc2 = 0;
939 #if N > 3
940             crc3 = 0;
941 #if N > 4
942             crc4 = 0;
943 #if N > 5
944             crc5 = 0;
945 #endif
946 #endif
947 #endif
948 #endif
949 #endif
950 
951             /*
952               Process the first blks-1 blocks, computing the CRCs on each braid
953               independently.
954              */
955             while (--blks) {
956                 /* Load the word for each braid into registers. */
957                 word0 = crc0 ^ words[0];
958 #if N > 1
959                 word1 = crc1 ^ words[1];
960 #if N > 2
961                 word2 = crc2 ^ words[2];
962 #if N > 3
963                 word3 = crc3 ^ words[3];
964 #if N > 4
965                 word4 = crc4 ^ words[4];
966 #if N > 5
967                 word5 = crc5 ^ words[5];
968 #endif
969 #endif
970 #endif
971 #endif
972 #endif
973                 words += N;
974 
975                 /* Compute and update the CRC for each word. The loop should
976                    get unrolled. */
977                 crc0 = crc_braid_big_table[0][word0 & 0xff];
978 #if N > 1
979                 crc1 = crc_braid_big_table[0][word1 & 0xff];
980 #if N > 2
981                 crc2 = crc_braid_big_table[0][word2 & 0xff];
982 #if N > 3
983                 crc3 = crc_braid_big_table[0][word3 & 0xff];
984 #if N > 4
985                 crc4 = crc_braid_big_table[0][word4 & 0xff];
986 #if N > 5
987                 crc5 = crc_braid_big_table[0][word5 & 0xff];
988 #endif
989 #endif
990 #endif
991 #endif
992 #endif
993                 for (k = 1; k < W; k++) {
994                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
995 #if N > 1
996                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
997 #if N > 2
998                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
999 #if N > 3
1000                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
1001 #if N > 4
1002                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
1003 #if N > 5
1004                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
1005 #endif
1006 #endif
1007 #endif
1008 #endif
1009 #endif
1010                 }
1011             }
1012 
1013             /*
1014               Process the last block, combining the CRCs of the N braids at the
1015               same time.
1016              */
1017             comb = crc_word_big(crc0 ^ words[0]);
1018 #if N > 1
1019             comb = crc_word_big(crc1 ^ words[1] ^ comb);
1020 #if N > 2
1021             comb = crc_word_big(crc2 ^ words[2] ^ comb);
1022 #if N > 3
1023             comb = crc_word_big(crc3 ^ words[3] ^ comb);
1024 #if N > 4
1025             comb = crc_word_big(crc4 ^ words[4] ^ comb);
1026 #if N > 5
1027             comb = crc_word_big(crc5 ^ words[5] ^ comb);
1028 #endif
1029 #endif
1030 #endif
1031 #endif
1032 #endif
1033             words += N;
1034             crc = byte_swap(comb);
1035         }
1036 
1037         /*
1038           Update the pointer to the remaining bytes to process.
1039          */
1040         buf = (unsigned char const *)words;
1041     }
1042 
1043 #endif /* W */
1044 
1045     /* Complete the computation of the CRC on any remaining bytes. */
1046     while (len >= 8) {
1047         len -= 8;
1048         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1049         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1050         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1051         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1052         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1053         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1054         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1055         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1056     }
1057     while (len) {
1058         len--;
1059         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1060     }
1061 
1062     /* Return the CRC, post-conditioned. */
1063     return crc ^ 0xffffffff;
1064 }
1065 
1066 #endif
1067 
1068 /* ========================================================================= */
1069 unsigned long ZEXPORT crc32(crc, buf, len)
1070     unsigned long crc;
1071     const unsigned char FAR *buf;
1072     uInt len;
1073 {
1074     return crc32_z(crc, buf, len);
1075 }
1076 
1077 /* ========================================================================= */
1078 uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
1079     uLong crc1;
1080     uLong crc2;
1081     z_off64_t len2;
1082 {
1083 #ifdef DYNAMIC_CRC_TABLE
1084     once(&made, make_crc_table);
1085 #endif /* DYNAMIC_CRC_TABLE */
1086     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1087 }
1088 
1089 /* ========================================================================= */
1090 uLong ZEXPORT crc32_combine(crc1, crc2, len2)
1091     uLong crc1;
1092     uLong crc2;
1093     z_off_t len2;
1094 {
1095     return crc32_combine64(crc1, crc2, len2);
1096 }
1097 
1098 /* ========================================================================= */
1099 uLong ZEXPORT crc32_combine_gen64(len2)
1100     z_off64_t len2;
1101 {
1102 #ifdef DYNAMIC_CRC_TABLE
1103     once(&made, make_crc_table);
1104 #endif /* DYNAMIC_CRC_TABLE */
1105     return x2nmodp(len2, 3);
1106 }
1107 
1108 /* ========================================================================= */
1109 uLong ZEXPORT crc32_combine_gen(len2)
1110     z_off_t len2;
1111 {
1112     return crc32_combine_gen64(len2);
1113 }
1114 
1115 /* ========================================================================= */
1116 uLong crc32_combine_op(crc1, crc2, op)
1117     uLong crc1;
1118     uLong crc2;
1119     uLong op;
1120 {
1121     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1122 }
1123