xref: /freebsd/sys/contrib/zlib/crc32.c (revision e32fecd0c2c3ee37c47ee100f169e7eb0282a873)
1 /* crc32.c -- compute the CRC-32 of a data stream
2  * Copyright (C) 1995-2022 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  *
5  * This interleaved implementation of a CRC makes use of pipelined multiple
6  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8  */
9 
10 /* @(#) $Id$ */
11 
12 /*
13   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
14   protection on the static variables used to control the first-use generation
15   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
16   first call get_crc_table() to initialize the tables before allowing more than
17   one thread to use crc32().
18 
19   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
20   produced, so that this one source file can be compiled to an executable.
21  */
22 
23 #ifdef MAKECRCH
24 #  include <stdio.h>
25 #  ifndef DYNAMIC_CRC_TABLE
26 #    define DYNAMIC_CRC_TABLE
27 #  endif /* !DYNAMIC_CRC_TABLE */
28 #endif /* MAKECRCH */
29 
30 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
31 
32  /*
33   A CRC of a message is computed on N braids of words in the message, where
34   each word consists of W bytes (4 or 8). If N is 3, for example, then three
35   running sparse CRCs are calculated respectively on each braid, at these
36   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
37   This is done starting at a word boundary, and continues until as many blocks
38   of N * W bytes as are available have been processed. The results are combined
39   into a single CRC at the end. For this code, N must be in the range 1..6 and
40   W must be 4 or 8. The upper limit on N can be increased if desired by adding
41   more #if blocks, extending the patterns apparent in the code. In addition,
42   crc32.h would need to be regenerated, if the maximum N value is increased.
43 
44   N and W are chosen empirically by benchmarking the execution time on a given
45   processor. The choices for N and W below were based on testing on Intel Kaby
46   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
47   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
48   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
49   They were all tested with either gcc or clang, all using the -O3 optimization
50   level. Your mileage may vary.
51  */
52 
53 /* Define N */
54 #ifdef Z_TESTN
55 #  define N Z_TESTN
56 #else
57 #  define N 5
58 #endif
59 #if N < 1 || N > 6
60 #  error N must be in 1..6
61 #endif
62 
63 /*
64   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
65   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
66   that bytes are eight bits.
67  */
68 
69 /*
70   Define W and the associated z_word_t type. If W is not defined, then a
71   braided calculation is not used, and the associated tables and code are not
72   compiled.
73  */
74 #ifdef Z_TESTW
75 #  if Z_TESTW-1 != -1
76 #    define W Z_TESTW
77 #  endif
78 #else
79 #  ifdef MAKECRCH
80 #    define W 8         /* required for MAKECRCH */
81 #  else
82 #    if defined(__x86_64__) || defined(__aarch64__)
83 #      define W 8
84 #    else
85 #      define W 4
86 #    endif
87 #  endif
88 #endif
89 #ifdef W
90 #  if W == 8 && defined(Z_U8)
91      typedef Z_U8 z_word_t;
92 #  elif defined(Z_U4)
93 #    undef W
94 #    define W 4
95      typedef Z_U4 z_word_t;
96 #  else
97 #    undef W
98 #  endif
99 #endif
100 
101 /* If available, use the ARM processor CRC32 instruction. */
102 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
103 #  define ARMCRC32
104 #endif
105 
106 /* Local functions. */
107 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
108 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
109 
110 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
111     local z_word_t byte_swap OF((z_word_t word));
112 #endif
113 
114 #if defined(W) && !defined(ARMCRC32)
115     local z_crc_t crc_word OF((z_word_t data));
116     local z_word_t crc_word_big OF((z_word_t data));
117 #endif
118 
119 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
120 /*
121   Swap the bytes in a z_word_t to convert between little and big endian. Any
122   self-respecting compiler will optimize this to a single machine byte-swap
123   instruction, if one is available. This assumes that word_t is either 32 bits
124   or 64 bits.
125  */
126 local z_word_t byte_swap OF((z_word_t word));
127 
128 local z_word_t byte_swap(word)
129     z_word_t word;
130 {
131 #  if W == 8
132     return
133         (word & 0xff00000000000000) >> 56 |
134         (word & 0xff000000000000) >> 40 |
135         (word & 0xff0000000000) >> 24 |
136         (word & 0xff00000000) >> 8 |
137         (word & 0xff000000) << 8 |
138         (word & 0xff0000) << 24 |
139         (word & 0xff00) << 40 |
140         (word & 0xff) << 56;
141 #  else   /* W == 4 */
142     return
143         (word & 0xff000000) >> 24 |
144         (word & 0xff0000) >> 8 |
145         (word & 0xff00) << 8 |
146         (word & 0xff) << 24;
147 #  endif
148 }
149 #endif
150 
151 /* CRC polynomial. */
152 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
153 
154 #ifdef DYNAMIC_CRC_TABLE
155 
156 local z_crc_t FAR crc_table[256];
157 local z_crc_t FAR x2n_table[32];
158 local void make_crc_table OF((void));
159 #ifdef W
160    local z_word_t FAR crc_big_table[256];
161    local z_crc_t FAR crc_braid_table[W][256];
162    local z_word_t FAR crc_braid_big_table[W][256];
163    local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
164 #endif
165 #ifdef MAKECRCH
166    local void write_table OF((FILE *, const z_crc_t FAR *, int));
167    local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
168    local void write_table64 OF((FILE *, const z_word_t FAR *, int));
169 #endif /* MAKECRCH */
170 
171 /*
172   Define a once() function depending on the availability of atomics. If this is
173   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
174   multiple threads, and if atomics are not available, then get_crc_table() must
175   be called to initialize the tables and must return before any threads are
176   allowed to compute or combine CRCs.
177  */
178 
179 /* Definition of once functionality. */
180 typedef struct once_s once_t;
181 local void once OF((once_t *, void (*)(void)));
182 
183 /* Check for the availability of atomics. */
184 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
185     !defined(__STDC_NO_ATOMICS__)
186 
187 #include <stdatomic.h>
188 
189 /* Structure for once(), which must be initialized with ONCE_INIT. */
190 struct once_s {
191     atomic_flag begun;
192     atomic_int done;
193 };
194 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
195 
196 /*
197   Run the provided init() function exactly once, even if multiple threads
198   invoke once() at the same time. The state must be a once_t initialized with
199   ONCE_INIT.
200  */
201 local void once(state, init)
202     once_t *state;
203     void (*init)(void);
204 {
205     if (!atomic_load(&state->done)) {
206         if (atomic_flag_test_and_set(&state->begun))
207             while (!atomic_load(&state->done))
208                 ;
209         else {
210             init();
211             atomic_store(&state->done, 1);
212         }
213     }
214 }
215 
216 #else   /* no atomics */
217 
218 /* Structure for once(), which must be initialized with ONCE_INIT. */
219 struct once_s {
220     volatile int begun;
221     volatile int done;
222 };
223 #define ONCE_INIT {0, 0}
224 
225 /* Test and set. Alas, not atomic, but tries to minimize the period of
226    vulnerability. */
227 local int test_and_set OF((int volatile *));
228 local int test_and_set(flag)
229     int volatile *flag;
230 {
231     int was;
232 
233     was = *flag;
234     *flag = 1;
235     return was;
236 }
237 
238 /* Run the provided init() function once. This is not thread-safe. */
239 local void once(state, init)
240     once_t *state;
241     void (*init)(void);
242 {
243     if (!state->done) {
244         if (test_and_set(&state->begun))
245             while (!state->done)
246                 ;
247         else {
248             init();
249             state->done = 1;
250         }
251     }
252 }
253 
254 #endif
255 
256 /* State for once(). */
257 local once_t made = ONCE_INIT;
258 
259 /*
260   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
261   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
262 
263   Polynomials over GF(2) are represented in binary, one bit per coefficient,
264   with the lowest powers in the most significant bit. Then adding polynomials
265   is just exclusive-or, and multiplying a polynomial by x is a right shift by
266   one. If we call the above polynomial p, and represent a byte as the
267   polynomial q, also with the lowest power in the most significant bit (so the
268   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
269   where a mod b means the remainder after dividing a by b.
270 
271   This calculation is done using the shift-register method of multiplying and
272   taking the remainder. The register is initialized to zero, and for each
273   incoming bit, x^32 is added mod p to the register if the bit is a one (where
274   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
275   (which is shifting right by one and adding x^32 mod p if the bit shifted out
276   is a one). We start with the highest power (least significant bit) of q and
277   repeat for all eight bits of q.
278 
279   The table is simply the CRC of all possible eight bit values. This is all the
280   information needed to generate CRCs on data a byte at a time for all
281   combinations of CRC register values and incoming bytes.
282  */
283 
284 local void make_crc_table()
285 {
286     unsigned i, j, n;
287     z_crc_t p;
288 
289     /* initialize the CRC of bytes tables */
290     for (i = 0; i < 256; i++) {
291         p = i;
292         for (j = 0; j < 8; j++)
293             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
294         crc_table[i] = p;
295 #ifdef W
296         crc_big_table[i] = byte_swap(p);
297 #endif
298     }
299 
300     /* initialize the x^2^n mod p(x) table */
301     p = (z_crc_t)1 << 30;         /* x^1 */
302     x2n_table[0] = p;
303     for (n = 1; n < 32; n++)
304         x2n_table[n] = p = multmodp(p, p);
305 
306 #ifdef W
307     /* initialize the braiding tables -- needs x2n_table[] */
308     braid(crc_braid_table, crc_braid_big_table, N, W);
309 #endif
310 
311 #ifdef MAKECRCH
312     {
313         /*
314           The crc32.h header file contains tables for both 32-bit and 64-bit
315           z_word_t's, and so requires a 64-bit type be available. In that case,
316           z_word_t must be defined to be 64-bits. This code then also generates
317           and writes out the tables for the case that z_word_t is 32 bits.
318          */
319 #if !defined(W) || W != 8
320 #  error Need a 64-bit integer type in order to generate crc32.h.
321 #endif
322         FILE *out;
323         int k, n;
324         z_crc_t ltl[8][256];
325         z_word_t big[8][256];
326 
327         out = fopen("crc32.h", "w");
328         if (out == NULL) return;
329 
330         /* write out little-endian CRC table to crc32.h */
331         fprintf(out,
332             "/* crc32.h -- tables for rapid CRC calculation\n"
333             " * Generated automatically by crc32.c\n */\n"
334             "\n"
335             "local const z_crc_t FAR crc_table[] = {\n"
336             "    ");
337         write_table(out, crc_table, 256);
338         fprintf(out,
339             "};\n");
340 
341         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
342         fprintf(out,
343             "\n"
344             "#ifdef W\n"
345             "\n"
346             "#if W == 8\n"
347             "\n"
348             "local const z_word_t FAR crc_big_table[] = {\n"
349             "    ");
350         write_table64(out, crc_big_table, 256);
351         fprintf(out,
352             "};\n");
353 
354         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
355         fprintf(out,
356             "\n"
357             "#else /* W == 4 */\n"
358             "\n"
359             "local const z_word_t FAR crc_big_table[] = {\n"
360             "    ");
361         write_table32hi(out, crc_big_table, 256);
362         fprintf(out,
363             "};\n"
364             "\n"
365             "#endif\n");
366 
367         /* write out braid tables for each value of N */
368         for (n = 1; n <= 6; n++) {
369             fprintf(out,
370             "\n"
371             "#if N == %d\n", n);
372 
373             /* compute braid tables for this N and 64-bit word_t */
374             braid(ltl, big, n, 8);
375 
376             /* write out braid tables for 64-bit z_word_t to crc32.h */
377             fprintf(out,
378             "\n"
379             "#if W == 8\n"
380             "\n"
381             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
382             for (k = 0; k < 8; k++) {
383                 fprintf(out, "   {");
384                 write_table(out, ltl[k], 256);
385                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
386             }
387             fprintf(out,
388             "};\n"
389             "\n"
390             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
391             for (k = 0; k < 8; k++) {
392                 fprintf(out, "   {");
393                 write_table64(out, big[k], 256);
394                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
395             }
396             fprintf(out,
397             "};\n");
398 
399             /* compute braid tables for this N and 32-bit word_t */
400             braid(ltl, big, n, 4);
401 
402             /* write out braid tables for 32-bit z_word_t to crc32.h */
403             fprintf(out,
404             "\n"
405             "#else /* W == 4 */\n"
406             "\n"
407             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
408             for (k = 0; k < 4; k++) {
409                 fprintf(out, "   {");
410                 write_table(out, ltl[k], 256);
411                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
412             }
413             fprintf(out,
414             "};\n"
415             "\n"
416             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
417             for (k = 0; k < 4; k++) {
418                 fprintf(out, "   {");
419                 write_table32hi(out, big[k], 256);
420                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
421             }
422             fprintf(out,
423             "};\n"
424             "\n"
425             "#endif\n"
426             "\n"
427             "#endif\n");
428         }
429         fprintf(out,
430             "\n"
431             "#endif\n");
432 
433         /* write out zeros operator table to crc32.h */
434         fprintf(out,
435             "\n"
436             "local const z_crc_t FAR x2n_table[] = {\n"
437             "    ");
438         write_table(out, x2n_table, 32);
439         fprintf(out,
440             "};\n");
441         fclose(out);
442     }
443 #endif /* MAKECRCH */
444 }
445 
446 #ifdef MAKECRCH
447 
448 /*
449    Write the 32-bit values in table[0..k-1] to out, five per line in
450    hexadecimal separated by commas.
451  */
452 local void write_table(out, table, k)
453     FILE *out;
454     const z_crc_t FAR *table;
455     int k;
456 {
457     int n;
458 
459     for (n = 0; n < k; n++)
460         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
461                 (unsigned long)(table[n]),
462                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
463 }
464 
465 /*
466    Write the high 32-bits of each value in table[0..k-1] to out, five per line
467    in hexadecimal separated by commas.
468  */
469 local void write_table32hi(out, table, k)
470 FILE *out;
471 const z_word_t FAR *table;
472 int k;
473 {
474     int n;
475 
476     for (n = 0; n < k; n++)
477         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
478                 (unsigned long)(table[n] >> 32),
479                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
480 }
481 
482 /*
483   Write the 64-bit values in table[0..k-1] to out, three per line in
484   hexadecimal separated by commas. This assumes that if there is a 64-bit
485   type, then there is also a long long integer type, and it is at least 64
486   bits. If not, then the type cast and format string can be adjusted
487   accordingly.
488  */
489 local void write_table64(out, table, k)
490     FILE *out;
491     const z_word_t FAR *table;
492     int k;
493 {
494     int n;
495 
496     for (n = 0; n < k; n++)
497         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
498                 (unsigned long long)(table[n]),
499                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
500 }
501 
502 /* Actually do the deed. */
503 int main()
504 {
505     make_crc_table();
506     return 0;
507 }
508 
509 #endif /* MAKECRCH */
510 
511 #ifdef W
512 /*
513   Generate the little and big-endian braid tables for the given n and z_word_t
514   size w. Each array must have room for w blocks of 256 elements.
515  */
516 local void braid(ltl, big, n, w)
517     z_crc_t ltl[][256];
518     z_word_t big[][256];
519     int n;
520     int w;
521 {
522     int k;
523     z_crc_t i, p, q;
524     for (k = 0; k < w; k++) {
525         p = x2nmodp((n * w + 3 - k) << 3, 0);
526         ltl[k][0] = 0;
527         big[w - 1 - k][0] = 0;
528         for (i = 1; i < 256; i++) {
529             ltl[k][i] = q = multmodp(i << 24, p);
530             big[w - 1 - k][i] = byte_swap(q);
531         }
532     }
533 }
534 #endif
535 
536 #else /* !DYNAMIC_CRC_TABLE */
537 /* ========================================================================
538  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
539  * of x for combining CRC-32s, all made by make_crc_table().
540  */
541 #include "crc32.h"
542 #endif /* DYNAMIC_CRC_TABLE */
543 
544 /* ========================================================================
545  * Routines used for CRC calculation. Some are also required for the table
546  * generation above.
547  */
548 
549 /*
550   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
551   reflected. For speed, this requires that a not be zero.
552  */
553 local z_crc_t multmodp(a, b)
554     z_crc_t a;
555     z_crc_t b;
556 {
557     z_crc_t m, p;
558 
559     m = (z_crc_t)1 << 31;
560     p = 0;
561     for (;;) {
562         if (a & m) {
563             p ^= b;
564             if ((a & (m - 1)) == 0)
565                 break;
566         }
567         m >>= 1;
568         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
569     }
570     return p;
571 }
572 
573 /*
574   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
575   initialized.
576  */
577 local z_crc_t x2nmodp(n, k)
578     z_off64_t n;
579     unsigned k;
580 {
581     z_crc_t p;
582 
583     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
584     while (n) {
585         if (n & 1)
586             p = multmodp(x2n_table[k & 31], p);
587         n >>= 1;
588         k++;
589     }
590     return p;
591 }
592 
593 /* =========================================================================
594  * This function can be used by asm versions of crc32(), and to force the
595  * generation of the CRC tables in a threaded application.
596  */
597 const z_crc_t FAR * ZEXPORT get_crc_table()
598 {
599 #ifdef DYNAMIC_CRC_TABLE
600     once(&made, make_crc_table);
601 #endif /* DYNAMIC_CRC_TABLE */
602     return (const z_crc_t FAR *)crc_table;
603 }
604 
605 /* =========================================================================
606  * Use ARM machine instructions if available. This will compute the CRC about
607  * ten times faster than the braided calculation. This code does not check for
608  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
609  * only be defined if the compilation specifies an ARM processor architecture
610  * that has the instructions. For example, compiling with -march=armv8.1-a or
611  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
612  * instructions.
613  */
614 #ifdef ARMCRC32
615 
616 /*
617    Constants empirically determined to maximize speed. These values are from
618    measurements on a Cortex-A57. Your mileage may vary.
619  */
620 #define Z_BATCH 3990                /* number of words in a batch */
621 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
622 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
623 
624 unsigned long ZEXPORT crc32_z(crc, buf, len)
625     unsigned long crc;
626     const unsigned char FAR *buf;
627     z_size_t len;
628 {
629     z_crc_t val;
630     z_word_t crc1, crc2;
631     const z_word_t *word;
632     z_word_t val0, val1, val2;
633     z_size_t last, last2, i;
634     z_size_t num;
635 
636     /* Return initial CRC, if requested. */
637     if (buf == Z_NULL) return 0;
638 
639 #ifdef DYNAMIC_CRC_TABLE
640     once(&made, make_crc_table);
641 #endif /* DYNAMIC_CRC_TABLE */
642 
643     /* Pre-condition the CRC */
644     crc = (~crc) & 0xffffffff;
645 
646     /* Compute the CRC up to a word boundary. */
647     while (len && ((z_size_t)buf & 7) != 0) {
648         len--;
649         val = *buf++;
650         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
651     }
652 
653     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
654     word = (z_word_t const *)buf;
655     num = len >> 3;
656     len &= 7;
657 
658     /* Do three interleaved CRCs to realize the throughput of one crc32x
659        instruction per cycle. Each CRC is calculated on Z_BATCH words. The
660        three CRCs are combined into a single CRC after each set of batches. */
661     while (num >= 3 * Z_BATCH) {
662         crc1 = 0;
663         crc2 = 0;
664         for (i = 0; i < Z_BATCH; i++) {
665             val0 = word[i];
666             val1 = word[i + Z_BATCH];
667             val2 = word[i + 2 * Z_BATCH];
668             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
669             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
670             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
671         }
672         word += 3 * Z_BATCH;
673         num -= 3 * Z_BATCH;
674         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
675         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
676     }
677 
678     /* Do one last smaller batch with the remaining words, if there are enough
679        to pay for the combination of CRCs. */
680     last = num / 3;
681     if (last >= Z_BATCH_MIN) {
682         last2 = last << 1;
683         crc1 = 0;
684         crc2 = 0;
685         for (i = 0; i < last; i++) {
686             val0 = word[i];
687             val1 = word[i + last];
688             val2 = word[i + last2];
689             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
690             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
691             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
692         }
693         word += 3 * last;
694         num -= 3 * last;
695         val = x2nmodp(last, 6);
696         crc = multmodp(val, crc) ^ crc1;
697         crc = multmodp(val, crc) ^ crc2;
698     }
699 
700     /* Compute the CRC on any remaining words. */
701     for (i = 0; i < num; i++) {
702         val0 = word[i];
703         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
704     }
705     word += num;
706 
707     /* Complete the CRC on any remaining bytes. */
708     buf = (const unsigned char FAR *)word;
709     while (len) {
710         len--;
711         val = *buf++;
712         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
713     }
714 
715     /* Return the CRC, post-conditioned. */
716     return crc ^ 0xffffffff;
717 }
718 
719 #else
720 
721 #ifdef W
722 
723 /*
724   Return the CRC of the W bytes in the word_t data, taking the
725   least-significant byte of the word as the first byte of data, without any pre
726   or post conditioning. This is used to combine the CRCs of each braid.
727  */
728 local z_crc_t crc_word OF((z_word_t data));
729 
730 local z_crc_t crc_word(data)
731     z_word_t data;
732 {
733     int k;
734     for (k = 0; k < W; k++)
735         data = (data >> 8) ^ crc_table[data & 0xff];
736     return (z_crc_t)data;
737 }
738 
739 local z_word_t crc_word_big OF((z_word_t data));
740 
741 local z_word_t crc_word_big(data)
742     z_word_t data;
743 {
744     int k;
745     for (k = 0; k < W; k++)
746         data = (data << 8) ^
747             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
748     return data;
749 }
750 
751 #endif
752 
753 /* ========================================================================= */
754 unsigned long ZEXPORT crc32_z(crc, buf, len)
755     unsigned long crc;
756     const unsigned char FAR *buf;
757     z_size_t len;
758 {
759     /* Return initial CRC, if requested. */
760     if (buf == Z_NULL) return 0;
761 
762 #ifdef DYNAMIC_CRC_TABLE
763     once(&made, make_crc_table);
764 #endif /* DYNAMIC_CRC_TABLE */
765 
766     /* Pre-condition the CRC */
767     crc = (~crc) & 0xffffffff;
768 
769 #ifdef W
770 
771     /* If provided enough bytes, do a braided CRC calculation. */
772     if (len >= N * W + W - 1) {
773         z_size_t blks;
774         z_word_t const *words;
775         unsigned endian;
776         int k;
777 
778         /* Compute the CRC up to a z_word_t boundary. */
779         while (len && ((z_size_t)buf & (W - 1)) != 0) {
780             len--;
781             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
782         }
783 
784         /* Compute the CRC on as many N z_word_t blocks as are available. */
785         blks = len / (N * W);
786         len -= blks * N * W;
787         words = (z_word_t const *)buf;
788 
789         /* Do endian check at execution time instead of compile time, since ARM
790            processors can change the endianess at execution time. If the
791            compiler knows what the endianess will be, it can optimize out the
792            check and the unused branch. */
793         endian = 1;
794         if (*(unsigned char *)&endian) {
795             /* Little endian. */
796 
797             z_crc_t crc0;
798             z_word_t word0;
799 #if N > 1
800             z_crc_t crc1;
801             z_word_t word1;
802 #if N > 2
803             z_crc_t crc2;
804             z_word_t word2;
805 #if N > 3
806             z_crc_t crc3;
807             z_word_t word3;
808 #if N > 4
809             z_crc_t crc4;
810             z_word_t word4;
811 #if N > 5
812             z_crc_t crc5;
813             z_word_t word5;
814 #endif
815 #endif
816 #endif
817 #endif
818 #endif
819 
820             /* Initialize the CRC for each braid. */
821             crc0 = crc;
822 #if N > 1
823             crc1 = 0;
824 #if N > 2
825             crc2 = 0;
826 #if N > 3
827             crc3 = 0;
828 #if N > 4
829             crc4 = 0;
830 #if N > 5
831             crc5 = 0;
832 #endif
833 #endif
834 #endif
835 #endif
836 #endif
837 
838             /*
839               Process the first blks-1 blocks, computing the CRCs on each braid
840               independently.
841              */
842             while (--blks) {
843                 /* Load the word for each braid into registers. */
844                 word0 = crc0 ^ words[0];
845 #if N > 1
846                 word1 = crc1 ^ words[1];
847 #if N > 2
848                 word2 = crc2 ^ words[2];
849 #if N > 3
850                 word3 = crc3 ^ words[3];
851 #if N > 4
852                 word4 = crc4 ^ words[4];
853 #if N > 5
854                 word5 = crc5 ^ words[5];
855 #endif
856 #endif
857 #endif
858 #endif
859 #endif
860                 words += N;
861 
862                 /* Compute and update the CRC for each word. The loop should
863                    get unrolled. */
864                 crc0 = crc_braid_table[0][word0 & 0xff];
865 #if N > 1
866                 crc1 = crc_braid_table[0][word1 & 0xff];
867 #if N > 2
868                 crc2 = crc_braid_table[0][word2 & 0xff];
869 #if N > 3
870                 crc3 = crc_braid_table[0][word3 & 0xff];
871 #if N > 4
872                 crc4 = crc_braid_table[0][word4 & 0xff];
873 #if N > 5
874                 crc5 = crc_braid_table[0][word5 & 0xff];
875 #endif
876 #endif
877 #endif
878 #endif
879 #endif
880                 for (k = 1; k < W; k++) {
881                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
882 #if N > 1
883                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
884 #if N > 2
885                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
886 #if N > 3
887                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
888 #if N > 4
889                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
890 #if N > 5
891                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
892 #endif
893 #endif
894 #endif
895 #endif
896 #endif
897                 }
898             }
899 
900             /*
901               Process the last block, combining the CRCs of the N braids at the
902               same time.
903              */
904             crc = crc_word(crc0 ^ words[0]);
905 #if N > 1
906             crc = crc_word(crc1 ^ words[1] ^ crc);
907 #if N > 2
908             crc = crc_word(crc2 ^ words[2] ^ crc);
909 #if N > 3
910             crc = crc_word(crc3 ^ words[3] ^ crc);
911 #if N > 4
912             crc = crc_word(crc4 ^ words[4] ^ crc);
913 #if N > 5
914             crc = crc_word(crc5 ^ words[5] ^ crc);
915 #endif
916 #endif
917 #endif
918 #endif
919 #endif
920             words += N;
921         }
922         else {
923             /* Big endian. */
924 
925             z_word_t crc0, word0, comb;
926 #if N > 1
927             z_word_t crc1, word1;
928 #if N > 2
929             z_word_t crc2, word2;
930 #if N > 3
931             z_word_t crc3, word3;
932 #if N > 4
933             z_word_t crc4, word4;
934 #if N > 5
935             z_word_t crc5, word5;
936 #endif
937 #endif
938 #endif
939 #endif
940 #endif
941 
942             /* Initialize the CRC for each braid. */
943             crc0 = byte_swap(crc);
944 #if N > 1
945             crc1 = 0;
946 #if N > 2
947             crc2 = 0;
948 #if N > 3
949             crc3 = 0;
950 #if N > 4
951             crc4 = 0;
952 #if N > 5
953             crc5 = 0;
954 #endif
955 #endif
956 #endif
957 #endif
958 #endif
959 
960             /*
961               Process the first blks-1 blocks, computing the CRCs on each braid
962               independently.
963              */
964             while (--blks) {
965                 /* Load the word for each braid into registers. */
966                 word0 = crc0 ^ words[0];
967 #if N > 1
968                 word1 = crc1 ^ words[1];
969 #if N > 2
970                 word2 = crc2 ^ words[2];
971 #if N > 3
972                 word3 = crc3 ^ words[3];
973 #if N > 4
974                 word4 = crc4 ^ words[4];
975 #if N > 5
976                 word5 = crc5 ^ words[5];
977 #endif
978 #endif
979 #endif
980 #endif
981 #endif
982                 words += N;
983 
984                 /* Compute and update the CRC for each word. The loop should
985                    get unrolled. */
986                 crc0 = crc_braid_big_table[0][word0 & 0xff];
987 #if N > 1
988                 crc1 = crc_braid_big_table[0][word1 & 0xff];
989 #if N > 2
990                 crc2 = crc_braid_big_table[0][word2 & 0xff];
991 #if N > 3
992                 crc3 = crc_braid_big_table[0][word3 & 0xff];
993 #if N > 4
994                 crc4 = crc_braid_big_table[0][word4 & 0xff];
995 #if N > 5
996                 crc5 = crc_braid_big_table[0][word5 & 0xff];
997 #endif
998 #endif
999 #endif
1000 #endif
1001 #endif
1002                 for (k = 1; k < W; k++) {
1003                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
1004 #if N > 1
1005                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
1006 #if N > 2
1007                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
1008 #if N > 3
1009                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
1010 #if N > 4
1011                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
1012 #if N > 5
1013                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
1014 #endif
1015 #endif
1016 #endif
1017 #endif
1018 #endif
1019                 }
1020             }
1021 
1022             /*
1023               Process the last block, combining the CRCs of the N braids at the
1024               same time.
1025              */
1026             comb = crc_word_big(crc0 ^ words[0]);
1027 #if N > 1
1028             comb = crc_word_big(crc1 ^ words[1] ^ comb);
1029 #if N > 2
1030             comb = crc_word_big(crc2 ^ words[2] ^ comb);
1031 #if N > 3
1032             comb = crc_word_big(crc3 ^ words[3] ^ comb);
1033 #if N > 4
1034             comb = crc_word_big(crc4 ^ words[4] ^ comb);
1035 #if N > 5
1036             comb = crc_word_big(crc5 ^ words[5] ^ comb);
1037 #endif
1038 #endif
1039 #endif
1040 #endif
1041 #endif
1042             words += N;
1043             crc = byte_swap(comb);
1044         }
1045 
1046         /*
1047           Update the pointer to the remaining bytes to process.
1048          */
1049         buf = (unsigned char const *)words;
1050     }
1051 
1052 #endif /* W */
1053 
1054     /* Complete the computation of the CRC on any remaining bytes. */
1055     while (len >= 8) {
1056         len -= 8;
1057         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1058         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1059         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1060         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1061         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1062         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1063         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1064         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1065     }
1066     while (len) {
1067         len--;
1068         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1069     }
1070 
1071     /* Return the CRC, post-conditioned. */
1072     return crc ^ 0xffffffff;
1073 }
1074 
1075 #endif
1076 
1077 /* ========================================================================= */
1078 unsigned long ZEXPORT crc32(crc, buf, len)
1079     unsigned long crc;
1080     const unsigned char FAR *buf;
1081     uInt len;
1082 {
1083     return crc32_z(crc, buf, len);
1084 }
1085 
1086 /* ========================================================================= */
1087 uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
1088     uLong crc1;
1089     uLong crc2;
1090     z_off64_t len2;
1091 {
1092 #ifdef DYNAMIC_CRC_TABLE
1093     once(&made, make_crc_table);
1094 #endif /* DYNAMIC_CRC_TABLE */
1095     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1096 }
1097 
1098 /* ========================================================================= */
1099 uLong ZEXPORT crc32_combine(crc1, crc2, len2)
1100     uLong crc1;
1101     uLong crc2;
1102     z_off_t len2;
1103 {
1104     return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1105 }
1106 
1107 /* ========================================================================= */
1108 uLong ZEXPORT crc32_combine_gen64(len2)
1109     z_off64_t len2;
1110 {
1111 #ifdef DYNAMIC_CRC_TABLE
1112     once(&made, make_crc_table);
1113 #endif /* DYNAMIC_CRC_TABLE */
1114     return x2nmodp(len2, 3);
1115 }
1116 
1117 /* ========================================================================= */
1118 uLong ZEXPORT crc32_combine_gen(len2)
1119     z_off_t len2;
1120 {
1121     return crc32_combine_gen64((z_off64_t)len2);
1122 }
1123 
1124 /* ========================================================================= */
1125 uLong ZEXPORT crc32_combine_op(crc1, crc2, op)
1126     uLong crc1;
1127     uLong crc2;
1128     uLong op;
1129 {
1130     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1131 }
1132