xref: /freebsd/sys/contrib/zlib/crc32.c (revision 656d68a711952ac2b92ed258502978c5ba1dbc73)
1 /* crc32.c -- compute the CRC-32 of a data stream
2  * Copyright (C) 1995-2022 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  *
5  * This interleaved implementation of a CRC makes use of pipelined multiple
6  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8  */
9 
10 /* @(#) $Id$ */
11 
12 /*
13   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
14   protection on the static variables used to control the first-use generation
15   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
16   first call get_crc_table() to initialize the tables before allowing more than
17   one thread to use crc32().
18 
19   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
20   produced, so that this one source file can be compiled to an executable.
21  */
22 
23 #ifdef MAKECRCH
24 #  include <stdio.h>
25 #  ifndef DYNAMIC_CRC_TABLE
26 #    define DYNAMIC_CRC_TABLE
27 #  endif /* !DYNAMIC_CRC_TABLE */
28 #endif /* MAKECRCH */
29 
30 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
31 
32  /*
33   A CRC of a message is computed on N braids of words in the message, where
34   each word consists of W bytes (4 or 8). If N is 3, for example, then three
35   running sparse CRCs are calculated respectively on each braid, at these
36   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
37   This is done starting at a word boundary, and continues until as many blocks
38   of N * W bytes as are available have been processed. The results are combined
39   into a single CRC at the end. For this code, N must be in the range 1..6 and
40   W must be 4 or 8. The upper limit on N can be increased if desired by adding
41   more #if blocks, extending the patterns apparent in the code. In addition,
42   crc32.h would need to be regenerated, if the maximum N value is increased.
43 
44   N and W are chosen empirically by benchmarking the execution time on a given
45   processor. The choices for N and W below were based on testing on Intel Kaby
46   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
47   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
48   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
49   They were all tested with either gcc or clang, all using the -O3 optimization
50   level. Your mileage may vary.
51  */
52 
53 /* Define N */
54 #ifdef Z_TESTN
55 #  define N Z_TESTN
56 #else
57 #  define N 5
58 #endif
59 #if N < 1 || N > 6
60 #  error N must be in 1..6
61 #endif
62 
63 /*
64   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
65   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
66   that bytes are eight bits.
67  */
68 
69 /*
70   Define W and the associated z_word_t type. If W is not defined, then a
71   braided calculation is not used, and the associated tables and code are not
72   compiled.
73  */
74 #ifdef Z_TESTW
75 #  if Z_TESTW-1 != -1
76 #    define W Z_TESTW
77 #  endif
78 #else
79 #  ifdef MAKECRCH
80 #    define W 8         /* required for MAKECRCH */
81 #  else
82 #    if defined(__x86_64__) || defined(__aarch64__)
83 #      define W 8
84 #    else
85 #      define W 4
86 #    endif
87 #  endif
88 #endif
89 #ifdef W
90 #  if W == 8 && defined(Z_U8)
91      typedef Z_U8 z_word_t;
92 #  elif defined(Z_U4)
93 #    undef W
94 #    define W 4
95      typedef Z_U4 z_word_t;
96 #  else
97 #    undef W
98 #  endif
99 #endif
100 
101 /* Local functions. */
102 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
103 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
104 
105 /* If available, use the ARM processor CRC32 instruction. */
106 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
107 #  define ARMCRC32
108 #endif
109 
110 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
111 /*
112   Swap the bytes in a z_word_t to convert between little and big endian. Any
113   self-respecting compiler will optimize this to a single machine byte-swap
114   instruction, if one is available. This assumes that word_t is either 32 bits
115   or 64 bits.
116  */
117 local z_word_t byte_swap(word)
118     z_word_t word;
119 {
120 #  if W == 8
121     return
122         (word & 0xff00000000000000) >> 56 |
123         (word & 0xff000000000000) >> 40 |
124         (word & 0xff0000000000) >> 24 |
125         (word & 0xff00000000) >> 8 |
126         (word & 0xff000000) << 8 |
127         (word & 0xff0000) << 24 |
128         (word & 0xff00) << 40 |
129         (word & 0xff) << 56;
130 #  else   /* W == 4 */
131     return
132         (word & 0xff000000) >> 24 |
133         (word & 0xff0000) >> 8 |
134         (word & 0xff00) << 8 |
135         (word & 0xff) << 24;
136 #  endif
137 }
138 #endif
139 
140 /* CRC polynomial. */
141 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
142 
143 #ifdef DYNAMIC_CRC_TABLE
144 
145 local z_crc_t FAR crc_table[256];
146 local z_crc_t FAR x2n_table[32];
147 local void make_crc_table OF((void));
148 #ifdef W
149    local z_word_t FAR crc_big_table[256];
150    local z_crc_t FAR crc_braid_table[W][256];
151    local z_word_t FAR crc_braid_big_table[W][256];
152    local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
153 #endif
154 #ifdef MAKECRCH
155    local void write_table OF((FILE *, const z_crc_t FAR *, int));
156    local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
157    local void write_table64 OF((FILE *, const z_word_t FAR *, int));
158 #endif /* MAKECRCH */
159 
160 /*
161   Define a once() function depending on the availability of atomics. If this is
162   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
163   multiple threads, and if atomics are not available, then get_crc_table() must
164   be called to initialize the tables and must return before any threads are
165   allowed to compute or combine CRCs.
166  */
167 
168 /* Definition of once functionality. */
169 typedef struct once_s once_t;
170 local void once OF((once_t *, void (*)(void)));
171 
172 /* Check for the availability of atomics. */
173 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
174     !defined(__STDC_NO_ATOMICS__)
175 
176 #include <stdatomic.h>
177 
178 /* Structure for once(), which must be initialized with ONCE_INIT. */
179 struct once_s {
180     atomic_flag begun;
181     atomic_int done;
182 };
183 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
184 
185 /*
186   Run the provided init() function exactly once, even if multiple threads
187   invoke once() at the same time. The state must be a once_t initialized with
188   ONCE_INIT.
189  */
190 local void once(state, init)
191     once_t *state;
192     void (*init)(void);
193 {
194     if (!atomic_load(&state->done)) {
195         if (atomic_flag_test_and_set(&state->begun))
196             while (!atomic_load(&state->done))
197                 ;
198         else {
199             init();
200             atomic_store(&state->done, 1);
201         }
202     }
203 }
204 
205 #else   /* no atomics */
206 
207 /* Structure for once(), which must be initialized with ONCE_INIT. */
208 struct once_s {
209     volatile int begun;
210     volatile int done;
211 };
212 #define ONCE_INIT {0, 0}
213 
214 /* Test and set. Alas, not atomic, but tries to minimize the period of
215    vulnerability. */
216 local int test_and_set OF((int volatile *));
217 local int test_and_set(flag)
218     int volatile *flag;
219 {
220     int was;
221 
222     was = *flag;
223     *flag = 1;
224     return was;
225 }
226 
227 /* Run the provided init() function once. This is not thread-safe. */
228 local void once(state, init)
229     once_t *state;
230     void (*init)(void);
231 {
232     if (!state->done) {
233         if (test_and_set(&state->begun))
234             while (!state->done)
235                 ;
236         else {
237             init();
238             state->done = 1;
239         }
240     }
241 }
242 
243 #endif
244 
245 /* State for once(). */
246 local once_t made = ONCE_INIT;
247 
248 /*
249   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
250   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
251 
252   Polynomials over GF(2) are represented in binary, one bit per coefficient,
253   with the lowest powers in the most significant bit. Then adding polynomials
254   is just exclusive-or, and multiplying a polynomial by x is a right shift by
255   one. If we call the above polynomial p, and represent a byte as the
256   polynomial q, also with the lowest power in the most significant bit (so the
257   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
258   where a mod b means the remainder after dividing a by b.
259 
260   This calculation is done using the shift-register method of multiplying and
261   taking the remainder. The register is initialized to zero, and for each
262   incoming bit, x^32 is added mod p to the register if the bit is a one (where
263   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
264   (which is shifting right by one and adding x^32 mod p if the bit shifted out
265   is a one). We start with the highest power (least significant bit) of q and
266   repeat for all eight bits of q.
267 
268   The table is simply the CRC of all possible eight bit values. This is all the
269   information needed to generate CRCs on data a byte at a time for all
270   combinations of CRC register values and incoming bytes.
271  */
272 
273 local void make_crc_table()
274 {
275     unsigned i, j, n;
276     z_crc_t p;
277 
278     /* initialize the CRC of bytes tables */
279     for (i = 0; i < 256; i++) {
280         p = i;
281         for (j = 0; j < 8; j++)
282             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
283         crc_table[i] = p;
284 #ifdef W
285         crc_big_table[i] = byte_swap(p);
286 #endif
287     }
288 
289     /* initialize the x^2^n mod p(x) table */
290     p = (z_crc_t)1 << 30;         /* x^1 */
291     x2n_table[0] = p;
292     for (n = 1; n < 32; n++)
293         x2n_table[n] = p = multmodp(p, p);
294 
295 #ifdef W
296     /* initialize the braiding tables -- needs x2n_table[] */
297     braid(crc_braid_table, crc_braid_big_table, N, W);
298 #endif
299 
300 #ifdef MAKECRCH
301     {
302         /*
303           The crc32.h header file contains tables for both 32-bit and 64-bit
304           z_word_t's, and so requires a 64-bit type be available. In that case,
305           z_word_t must be defined to be 64-bits. This code then also generates
306           and writes out the tables for the case that z_word_t is 32 bits.
307          */
308 #if !defined(W) || W != 8
309 #  error Need a 64-bit integer type in order to generate crc32.h.
310 #endif
311         FILE *out;
312         int k, n;
313         z_crc_t ltl[8][256];
314         z_word_t big[8][256];
315 
316         out = fopen("crc32.h", "w");
317         if (out == NULL) return;
318 
319         /* write out little-endian CRC table to crc32.h */
320         fprintf(out,
321             "/* crc32.h -- tables for rapid CRC calculation\n"
322             " * Generated automatically by crc32.c\n */\n"
323             "\n"
324             "local const z_crc_t FAR crc_table[] = {\n"
325             "    ");
326         write_table(out, crc_table, 256);
327         fprintf(out,
328             "};\n");
329 
330         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
331         fprintf(out,
332             "\n"
333             "#ifdef W\n"
334             "\n"
335             "#if W == 8\n"
336             "\n"
337             "local const z_word_t FAR crc_big_table[] = {\n"
338             "    ");
339         write_table64(out, crc_big_table, 256);
340         fprintf(out,
341             "};\n");
342 
343         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
344         fprintf(out,
345             "\n"
346             "#else /* W == 4 */\n"
347             "\n"
348             "local const z_word_t FAR crc_big_table[] = {\n"
349             "    ");
350         write_table32hi(out, crc_big_table, 256);
351         fprintf(out,
352             "};\n"
353             "\n"
354             "#endif\n");
355 
356         /* write out braid tables for each value of N */
357         for (n = 1; n <= 6; n++) {
358             fprintf(out,
359             "\n"
360             "#if N == %d\n", n);
361 
362             /* compute braid tables for this N and 64-bit word_t */
363             braid(ltl, big, n, 8);
364 
365             /* write out braid tables for 64-bit z_word_t to crc32.h */
366             fprintf(out,
367             "\n"
368             "#if W == 8\n"
369             "\n"
370             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
371             for (k = 0; k < 8; k++) {
372                 fprintf(out, "   {");
373                 write_table(out, ltl[k], 256);
374                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
375             }
376             fprintf(out,
377             "};\n"
378             "\n"
379             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
380             for (k = 0; k < 8; k++) {
381                 fprintf(out, "   {");
382                 write_table64(out, big[k], 256);
383                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
384             }
385             fprintf(out,
386             "};\n");
387 
388             /* compute braid tables for this N and 32-bit word_t */
389             braid(ltl, big, n, 4);
390 
391             /* write out braid tables for 32-bit z_word_t to crc32.h */
392             fprintf(out,
393             "\n"
394             "#else /* W == 4 */\n"
395             "\n"
396             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
397             for (k = 0; k < 4; k++) {
398                 fprintf(out, "   {");
399                 write_table(out, ltl[k], 256);
400                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
401             }
402             fprintf(out,
403             "};\n"
404             "\n"
405             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
406             for (k = 0; k < 4; k++) {
407                 fprintf(out, "   {");
408                 write_table32hi(out, big[k], 256);
409                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
410             }
411             fprintf(out,
412             "};\n"
413             "\n"
414             "#endif\n"
415             "\n"
416             "#endif\n");
417         }
418         fprintf(out,
419             "\n"
420             "#endif\n");
421 
422         /* write out zeros operator table to crc32.h */
423         fprintf(out,
424             "\n"
425             "local const z_crc_t FAR x2n_table[] = {\n"
426             "    ");
427         write_table(out, x2n_table, 32);
428         fprintf(out,
429             "};\n");
430         fclose(out);
431     }
432 #endif /* MAKECRCH */
433 }
434 
435 #ifdef MAKECRCH
436 
437 /*
438    Write the 32-bit values in table[0..k-1] to out, five per line in
439    hexadecimal separated by commas.
440  */
441 local void write_table(out, table, k)
442     FILE *out;
443     const z_crc_t FAR *table;
444     int k;
445 {
446     int n;
447 
448     for (n = 0; n < k; n++)
449         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
450                 (unsigned long)(table[n]),
451                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
452 }
453 
454 /*
455    Write the high 32-bits of each value in table[0..k-1] to out, five per line
456    in hexadecimal separated by commas.
457  */
458 local void write_table32hi(out, table, k)
459 FILE *out;
460 const z_word_t FAR *table;
461 int k;
462 {
463     int n;
464 
465     for (n = 0; n < k; n++)
466         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
467                 (unsigned long)(table[n] >> 32),
468                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
469 }
470 
471 /*
472   Write the 64-bit values in table[0..k-1] to out, three per line in
473   hexadecimal separated by commas. This assumes that if there is a 64-bit
474   type, then there is also a long long integer type, and it is at least 64
475   bits. If not, then the type cast and format string can be adjusted
476   accordingly.
477  */
478 local void write_table64(out, table, k)
479     FILE *out;
480     const z_word_t FAR *table;
481     int k;
482 {
483     int n;
484 
485     for (n = 0; n < k; n++)
486         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
487                 (unsigned long long)(table[n]),
488                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
489 }
490 
491 /* Actually do the deed. */
492 int main()
493 {
494     make_crc_table();
495     return 0;
496 }
497 
498 #endif /* MAKECRCH */
499 
500 #ifdef W
501 /*
502   Generate the little and big-endian braid tables for the given n and z_word_t
503   size w. Each array must have room for w blocks of 256 elements.
504  */
505 local void braid(ltl, big, n, w)
506     z_crc_t ltl[][256];
507     z_word_t big[][256];
508     int n;
509     int w;
510 {
511     int k;
512     z_crc_t i, p, q;
513     for (k = 0; k < w; k++) {
514         p = x2nmodp((n * w + 3 - k) << 3, 0);
515         ltl[k][0] = 0;
516         big[w - 1 - k][0] = 0;
517         for (i = 1; i < 256; i++) {
518             ltl[k][i] = q = multmodp(i << 24, p);
519             big[w - 1 - k][i] = byte_swap(q);
520         }
521     }
522 }
523 #endif
524 
525 #else /* !DYNAMIC_CRC_TABLE */
526 /* ========================================================================
527  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
528  * of x for combining CRC-32s, all made by make_crc_table().
529  */
530 #include "crc32.h"
531 #endif /* DYNAMIC_CRC_TABLE */
532 
533 /* ========================================================================
534  * Routines used for CRC calculation. Some are also required for the table
535  * generation above.
536  */
537 
538 /*
539   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
540   reflected. For speed, this requires that a not be zero.
541  */
542 local z_crc_t multmodp(a, b)
543     z_crc_t a;
544     z_crc_t b;
545 {
546     z_crc_t m, p;
547 
548     m = (z_crc_t)1 << 31;
549     p = 0;
550     for (;;) {
551         if (a & m) {
552             p ^= b;
553             if ((a & (m - 1)) == 0)
554                 break;
555         }
556         m >>= 1;
557         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
558     }
559     return p;
560 }
561 
562 /*
563   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
564   initialized.
565  */
566 local z_crc_t x2nmodp(n, k)
567     z_off64_t n;
568     unsigned k;
569 {
570     z_crc_t p;
571 
572     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
573     while (n) {
574         if (n & 1)
575             p = multmodp(x2n_table[k & 31], p);
576         n >>= 1;
577         k++;
578     }
579     return p;
580 }
581 
582 /* =========================================================================
583  * This function can be used by asm versions of crc32(), and to force the
584  * generation of the CRC tables in a threaded application.
585  */
586 const z_crc_t FAR * ZEXPORT get_crc_table()
587 {
588 #ifdef DYNAMIC_CRC_TABLE
589     once(&made, make_crc_table);
590 #endif /* DYNAMIC_CRC_TABLE */
591     return (const z_crc_t FAR *)crc_table;
592 }
593 
594 /* =========================================================================
595  * Use ARM machine instructions if available. This will compute the CRC about
596  * ten times faster than the braided calculation. This code does not check for
597  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
598  * only be defined if the compilation specifies an ARM processor architecture
599  * that has the instructions. For example, compiling with -march=armv8.1-a or
600  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
601  * instructions.
602  */
603 #ifdef ARMCRC32
604 
605 /*
606    Constants empirically determined to maximize speed. These values are from
607    measurements on a Cortex-A57. Your mileage may vary.
608  */
609 #define Z_BATCH 3990                /* number of words in a batch */
610 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
611 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
612 
613 unsigned long ZEXPORT crc32_z(crc, buf, len)
614     unsigned long crc;
615     const unsigned char FAR *buf;
616     z_size_t len;
617 {
618     z_crc_t val;
619     z_word_t crc1, crc2;
620     const z_word_t *word;
621     z_word_t val0, val1, val2;
622     z_size_t last, last2, i;
623     z_size_t num;
624 
625     /* Return initial CRC, if requested. */
626     if (buf == Z_NULL) return 0;
627 
628 #ifdef DYNAMIC_CRC_TABLE
629     once(&made, make_crc_table);
630 #endif /* DYNAMIC_CRC_TABLE */
631 
632     /* Pre-condition the CRC */
633     crc = (~crc) & 0xffffffff;
634 
635     /* Compute the CRC up to a word boundary. */
636     while (len && ((z_size_t)buf & 7) != 0) {
637         len--;
638         val = *buf++;
639         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
640     }
641 
642     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
643     word = (z_word_t const *)buf;
644     num = len >> 3;
645     len &= 7;
646 
647     /* Do three interleaved CRCs to realize the throughput of one crc32x
648        instruction per cycle. Each CRC is calcuated on Z_BATCH words. The three
649        CRCs are combined into a single CRC after each set of batches. */
650     while (num >= 3 * Z_BATCH) {
651         crc1 = 0;
652         crc2 = 0;
653         for (i = 0; i < Z_BATCH; i++) {
654             val0 = word[i];
655             val1 = word[i + Z_BATCH];
656             val2 = word[i + 2 * Z_BATCH];
657             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
658             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
659             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
660         }
661         word += 3 * Z_BATCH;
662         num -= 3 * Z_BATCH;
663         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
664         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
665     }
666 
667     /* Do one last smaller batch with the remaining words, if there are enough
668        to pay for the combination of CRCs. */
669     last = num / 3;
670     if (last >= Z_BATCH_MIN) {
671         last2 = last << 1;
672         crc1 = 0;
673         crc2 = 0;
674         for (i = 0; i < last; i++) {
675             val0 = word[i];
676             val1 = word[i + last];
677             val2 = word[i + last2];
678             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
679             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
680             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
681         }
682         word += 3 * last;
683         num -= 3 * last;
684         val = x2nmodp(last, 6);
685         crc = multmodp(val, crc) ^ crc1;
686         crc = multmodp(val, crc) ^ crc2;
687     }
688 
689     /* Compute the CRC on any remaining words. */
690     for (i = 0; i < num; i++) {
691         val0 = word[i];
692         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
693     }
694     word += num;
695 
696     /* Complete the CRC on any remaining bytes. */
697     buf = (const unsigned char FAR *)word;
698     while (len) {
699         len--;
700         val = *buf++;
701         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
702     }
703 
704     /* Return the CRC, post-conditioned. */
705     return crc ^ 0xffffffff;
706 }
707 
708 #else
709 
710 #ifdef W
711 
712 /*
713   Return the CRC of the W bytes in the word_t data, taking the
714   least-significant byte of the word as the first byte of data, without any pre
715   or post conditioning. This is used to combine the CRCs of each braid.
716  */
717 local z_crc_t crc_word(data)
718     z_word_t data;
719 {
720     int k;
721     for (k = 0; k < W; k++)
722         data = (data >> 8) ^ crc_table[data & 0xff];
723     return (z_crc_t)data;
724 }
725 
726 local z_word_t crc_word_big(data)
727     z_word_t data;
728 {
729     int k;
730     for (k = 0; k < W; k++)
731         data = (data << 8) ^
732             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
733     return data;
734 }
735 
736 #endif
737 
738 /* ========================================================================= */
739 unsigned long ZEXPORT crc32_z(crc, buf, len)
740     unsigned long crc;
741     const unsigned char FAR *buf;
742     z_size_t len;
743 {
744     /* Return initial CRC, if requested. */
745     if (buf == Z_NULL) return 0;
746 
747 #ifdef DYNAMIC_CRC_TABLE
748     once(&made, make_crc_table);
749 #endif /* DYNAMIC_CRC_TABLE */
750 
751     /* Pre-condition the CRC */
752     crc = (~crc) & 0xffffffff;
753 
754 #ifdef W
755 
756     /* If provided enough bytes, do a braided CRC calculation. */
757     if (len >= N * W + W - 1) {
758         z_size_t blks;
759         z_word_t const *words;
760         unsigned endian;
761         int k;
762 
763         /* Compute the CRC up to a z_word_t boundary. */
764         while (len && ((z_size_t)buf & (W - 1)) != 0) {
765             len--;
766             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
767         }
768 
769         /* Compute the CRC on as many N z_word_t blocks as are available. */
770         blks = len / (N * W);
771         len -= blks * N * W;
772         words = (z_word_t const *)buf;
773 
774         /* Do endian check at execution time instead of compile time, since ARM
775            processors can change the endianess at execution time. If the
776            compiler knows what the endianess will be, it can optimize out the
777            check and the unused branch. */
778         endian = 1;
779         if (*(unsigned char *)&endian) {
780             /* Little endian. */
781 
782             z_crc_t crc0;
783             z_word_t word0;
784 #if N > 1
785             z_crc_t crc1;
786             z_word_t word1;
787 #if N > 2
788             z_crc_t crc2;
789             z_word_t word2;
790 #if N > 3
791             z_crc_t crc3;
792             z_word_t word3;
793 #if N > 4
794             z_crc_t crc4;
795             z_word_t word4;
796 #if N > 5
797             z_crc_t crc5;
798             z_word_t word5;
799 #endif
800 #endif
801 #endif
802 #endif
803 #endif
804 
805             /* Initialize the CRC for each braid. */
806             crc0 = crc;
807 #if N > 1
808             crc1 = 0;
809 #if N > 2
810             crc2 = 0;
811 #if N > 3
812             crc3 = 0;
813 #if N > 4
814             crc4 = 0;
815 #if N > 5
816             crc5 = 0;
817 #endif
818 #endif
819 #endif
820 #endif
821 #endif
822 
823             /*
824               Process the first blks-1 blocks, computing the CRCs on each braid
825               independently.
826              */
827             while (--blks) {
828                 /* Load the word for each braid into registers. */
829                 word0 = crc0 ^ words[0];
830 #if N > 1
831                 word1 = crc1 ^ words[1];
832 #if N > 2
833                 word2 = crc2 ^ words[2];
834 #if N > 3
835                 word3 = crc3 ^ words[3];
836 #if N > 4
837                 word4 = crc4 ^ words[4];
838 #if N > 5
839                 word5 = crc5 ^ words[5];
840 #endif
841 #endif
842 #endif
843 #endif
844 #endif
845                 words += N;
846 
847                 /* Compute and update the CRC for each word. The loop should
848                    get unrolled. */
849                 crc0 = crc_braid_table[0][word0 & 0xff];
850 #if N > 1
851                 crc1 = crc_braid_table[0][word1 & 0xff];
852 #if N > 2
853                 crc2 = crc_braid_table[0][word2 & 0xff];
854 #if N > 3
855                 crc3 = crc_braid_table[0][word3 & 0xff];
856 #if N > 4
857                 crc4 = crc_braid_table[0][word4 & 0xff];
858 #if N > 5
859                 crc5 = crc_braid_table[0][word5 & 0xff];
860 #endif
861 #endif
862 #endif
863 #endif
864 #endif
865                 for (k = 1; k < W; k++) {
866                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
867 #if N > 1
868                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
869 #if N > 2
870                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
871 #if N > 3
872                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
873 #if N > 4
874                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
875 #if N > 5
876                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
877 #endif
878 #endif
879 #endif
880 #endif
881 #endif
882                 }
883             }
884 
885             /*
886               Process the last block, combining the CRCs of the N braids at the
887               same time.
888              */
889             crc = crc_word(crc0 ^ words[0]);
890 #if N > 1
891             crc = crc_word(crc1 ^ words[1] ^ crc);
892 #if N > 2
893             crc = crc_word(crc2 ^ words[2] ^ crc);
894 #if N > 3
895             crc = crc_word(crc3 ^ words[3] ^ crc);
896 #if N > 4
897             crc = crc_word(crc4 ^ words[4] ^ crc);
898 #if N > 5
899             crc = crc_word(crc5 ^ words[5] ^ crc);
900 #endif
901 #endif
902 #endif
903 #endif
904 #endif
905             words += N;
906         }
907         else {
908             /* Big endian. */
909 
910             z_word_t crc0, word0, comb;
911 #if N > 1
912             z_word_t crc1, word1;
913 #if N > 2
914             z_word_t crc2, word2;
915 #if N > 3
916             z_word_t crc3, word3;
917 #if N > 4
918             z_word_t crc4, word4;
919 #if N > 5
920             z_word_t crc5, word5;
921 #endif
922 #endif
923 #endif
924 #endif
925 #endif
926 
927             /* Initialize the CRC for each braid. */
928             crc0 = byte_swap(crc);
929 #if N > 1
930             crc1 = 0;
931 #if N > 2
932             crc2 = 0;
933 #if N > 3
934             crc3 = 0;
935 #if N > 4
936             crc4 = 0;
937 #if N > 5
938             crc5 = 0;
939 #endif
940 #endif
941 #endif
942 #endif
943 #endif
944 
945             /*
946               Process the first blks-1 blocks, computing the CRCs on each braid
947               independently.
948              */
949             while (--blks) {
950                 /* Load the word for each braid into registers. */
951                 word0 = crc0 ^ words[0];
952 #if N > 1
953                 word1 = crc1 ^ words[1];
954 #if N > 2
955                 word2 = crc2 ^ words[2];
956 #if N > 3
957                 word3 = crc3 ^ words[3];
958 #if N > 4
959                 word4 = crc4 ^ words[4];
960 #if N > 5
961                 word5 = crc5 ^ words[5];
962 #endif
963 #endif
964 #endif
965 #endif
966 #endif
967                 words += N;
968 
969                 /* Compute and update the CRC for each word. The loop should
970                    get unrolled. */
971                 crc0 = crc_braid_big_table[0][word0 & 0xff];
972 #if N > 1
973                 crc1 = crc_braid_big_table[0][word1 & 0xff];
974 #if N > 2
975                 crc2 = crc_braid_big_table[0][word2 & 0xff];
976 #if N > 3
977                 crc3 = crc_braid_big_table[0][word3 & 0xff];
978 #if N > 4
979                 crc4 = crc_braid_big_table[0][word4 & 0xff];
980 #if N > 5
981                 crc5 = crc_braid_big_table[0][word5 & 0xff];
982 #endif
983 #endif
984 #endif
985 #endif
986 #endif
987                 for (k = 1; k < W; k++) {
988                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
989 #if N > 1
990                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
991 #if N > 2
992                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
993 #if N > 3
994                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
995 #if N > 4
996                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
997 #if N > 5
998                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
999 #endif
1000 #endif
1001 #endif
1002 #endif
1003 #endif
1004                 }
1005             }
1006 
1007             /*
1008               Process the last block, combining the CRCs of the N braids at the
1009               same time.
1010              */
1011             comb = crc_word_big(crc0 ^ words[0]);
1012 #if N > 1
1013             comb = crc_word_big(crc1 ^ words[1] ^ comb);
1014 #if N > 2
1015             comb = crc_word_big(crc2 ^ words[2] ^ comb);
1016 #if N > 3
1017             comb = crc_word_big(crc3 ^ words[3] ^ comb);
1018 #if N > 4
1019             comb = crc_word_big(crc4 ^ words[4] ^ comb);
1020 #if N > 5
1021             comb = crc_word_big(crc5 ^ words[5] ^ comb);
1022 #endif
1023 #endif
1024 #endif
1025 #endif
1026 #endif
1027             words += N;
1028             crc = byte_swap(comb);
1029         }
1030 
1031         /*
1032           Update the pointer to the remaining bytes to process.
1033          */
1034         buf = (unsigned char const *)words;
1035     }
1036 
1037 #endif /* W */
1038 
1039     /* Complete the computation of the CRC on any remaining bytes. */
1040     while (len >= 8) {
1041         len -= 8;
1042         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1043         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1044         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1045         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1046         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1047         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1048         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1049         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1050     }
1051     while (len) {
1052         len--;
1053         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1054     }
1055 
1056     /* Return the CRC, post-conditioned. */
1057     return crc ^ 0xffffffff;
1058 }
1059 
1060 #endif
1061 
1062 /* ========================================================================= */
1063 unsigned long ZEXPORT crc32(crc, buf, len)
1064     unsigned long crc;
1065     const unsigned char FAR *buf;
1066     uInt len;
1067 {
1068     return crc32_z(crc, buf, len);
1069 }
1070 
1071 /* ========================================================================= */
1072 uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
1073     uLong crc1;
1074     uLong crc2;
1075     z_off64_t len2;
1076 {
1077 #ifdef DYNAMIC_CRC_TABLE
1078     once(&made, make_crc_table);
1079 #endif /* DYNAMIC_CRC_TABLE */
1080     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1081 }
1082 
1083 /* ========================================================================= */
1084 uLong ZEXPORT crc32_combine(crc1, crc2, len2)
1085     uLong crc1;
1086     uLong crc2;
1087     z_off_t len2;
1088 {
1089     return crc32_combine64(crc1, crc2, len2);
1090 }
1091 
1092 /* ========================================================================= */
1093 uLong ZEXPORT crc32_combine_gen64(len2)
1094     z_off64_t len2;
1095 {
1096 #ifdef DYNAMIC_CRC_TABLE
1097     once(&made, make_crc_table);
1098 #endif /* DYNAMIC_CRC_TABLE */
1099     return x2nmodp(len2, 3);
1100 }
1101 
1102 /* ========================================================================= */
1103 uLong ZEXPORT crc32_combine_gen(len2)
1104     z_off_t len2;
1105 {
1106     return crc32_combine_gen64(len2);
1107 }
1108 
1109 /* ========================================================================= */
1110 uLong crc32_combine_op(crc1, crc2, op)
1111     uLong crc1;
1112     uLong crc2;
1113     uLong op;
1114 {
1115     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1116 }
1117