1 // SPDX-License-Identifier: 0BSD
2
3 ///////////////////////////////////////////////////////////////////////////////
4 //
5 /// \file crc_clmul_consts_gen.c
6 /// \brief Generate constants for CLMUL CRC code
7 ///
8 /// Compiling: gcc -std=c99 -o crc_clmul_consts_gen crc_clmul_consts_gen.c
9 ///
10 /// This is for CRCs that use reversed bit order (bit reflection).
11 /// The same CLMUL CRC code can be used with CRC64 and smaller ones like
12 /// CRC32 apart from one special case: CRC64 needs an extra step in the
13 /// Barrett reduction to handle the 65th bit; the smaller ones don't.
14 /// Otherwise it's enough to just change the polynomial and the derived
15 /// constants and use the same code.
16 ///
17 /// See the Intel white paper "Fast CRC Computation for Generic Polynomials
18 /// Using PCLMULQDQ Instruction" from 2009.
19 //
20 // Author: Lasse Collin
21 //
22 ///////////////////////////////////////////////////////////////////////////////
23
24 #include <inttypes.h>
25 #include <stdio.h>
26
27
28 /// CRC32 (Ethernet) polynomial in reversed representation
29 static const uint64_t p32 = 0xedb88320;
30
31 // CRC64 (ECMA-182) polynomial in reversed representation
32 static const uint64_t p64 = 0xc96c5795d7870f42;
33
34
35 /// Calculates floor(x^128 / p) where p is a CRC64 polynomial in
36 /// reversed representation. The result is in reversed representation too.
37 static uint64_t
calc_cldiv(uint64_t p)38 calc_cldiv(uint64_t p)
39 {
40 // Quotient
41 uint64_t q = 0;
42
43 // Align the x^64 term with the x^128 (the implied high bits of the
44 // divisor and the dividend) and do the first step of polynomial long
45 // division, calculating the first remainder. The variable q remains
46 // zero because the highest bit of the quotient is an implied bit 1
47 // (we kind of set q = 1 << -1).
48 uint64_t r = p;
49
50 // Then process the remaining 64 terms. Note that r has no implied
51 // high bit, only q and p do. (And remember that a high bit in the
52 // polynomial is stored at a low bit in the variable due to the
53 // reversed bit order.)
54 for (unsigned i = 0; i < 64; ++i) {
55 q |= (r & 1) << i;
56 r = (r >> 1) ^ (r & 1 ? p : 0);
57 }
58
59 return q;
60 }
61
62
63 /// Calculate the remainder of carryless division:
64 ///
65 /// x^(bits + n - 1) % p, where n=64 (for CRC64)
66 ///
67 /// p must be in reversed representation which omits the bit of
68 /// the highest term of the polynomial. Instead, it is an implied bit
69 /// at kind of like "1 << -1" position, as if it had just been shifted out.
70 ///
71 /// The return value is in the reversed bit order. (There are no implied bits.)
72 static uint64_t
calc_clrem(uint64_t p,unsigned bits)73 calc_clrem(uint64_t p, unsigned bits)
74 {
75 // Do the first step of polynomial long division.
76 uint64_t r = p;
77
78 // Then process the remaining terms. Start with i = 1 instead of i = 0
79 // to account for the -1 in x^(bits + n - 1). This -1 is convenient
80 // with the reversed bit order. See the "Bit-Reflection" section in
81 // the Intel white paper.
82 for (unsigned i = 1; i < bits; ++i)
83 r = (r >> 1) ^ (r & 1 ? p : 0);
84
85 return r;
86 }
87
88
89 extern int
main(void)90 main(void)
91 {
92 puts("// CRC64");
93
94 // The order of the two 64-bit constants in a vector don't matter.
95 // It feels logical to put them in this order as it matches the
96 // order in which the input bytes are read.
97 printf("const __m128i fold512 = _mm_set_epi64x("
98 "0x%016" PRIx64 ", 0x%016" PRIx64 ");\n",
99 calc_clrem(p64, 4 * 128 - 64),
100 calc_clrem(p64, 4 * 128));
101
102 printf("const __m128i fold128 = _mm_set_epi64x("
103 "0x%016" PRIx64 ", 0x%016" PRIx64 ");\n",
104 calc_clrem(p64, 128 - 64),
105 calc_clrem(p64, 128));
106
107 // When we multiply by mu, we care about the high bits of the result
108 // (in reversed bit order!). It doesn't matter that the low bit gets
109 // shifted out because the affected output bits will be ignored.
110 // Below we add the implied high bit with "| 1" after the shifting
111 // so that the high bits of the multiplication will be correct.
112 //
113 // p64 is shifted left by one so that the final multiplication
114 // in Barrett reduction won't be misaligned by one bit. We could
115 // use "(p64 << 1) | 1" instead of "p64 << 1" too but it makes
116 // no difference as that bit won't affect the relevant output bits
117 // (we only care about the lowest 64 bits of the result, that is,
118 // lowest in the reversed bit order).
119 //
120 // NOTE: The 65rd bit of p64 gets shifted out. It needs to be
121 // compensated with 64-bit shift and xor in the CRC64 code.
122 printf("const __m128i mu_p = _mm_set_epi64x("
123 "0x%016" PRIx64 ", 0x%016" PRIx64 ");\n",
124 (calc_cldiv(p64) << 1) | 1,
125 p64 << 1);
126
127 puts("");
128
129 puts("// CRC32");
130
131 printf("const __m128i fold512 = _mm_set_epi64x("
132 "0x%08" PRIx64 ", 0x%08" PRIx64 ");\n",
133 calc_clrem(p32, 4 * 128 - 64),
134 calc_clrem(p32, 4 * 128));
135
136 printf("const __m128i fold128 = _mm_set_epi64x("
137 "0x%08" PRIx64 ", 0x%08" PRIx64 ");\n",
138 calc_clrem(p32, 128 - 64),
139 calc_clrem(p32, 128));
140
141 // CRC32 calculation is done by modulus scaling it to a CRC64.
142 // Since the CRC is in reversed representation, only the mu
143 // constant changes with the modulus scaling. This method avoids
144 // one additional constant and one additional clmul in the final
145 // reduction steps, making the code both simpler and faster.
146 //
147 // p32 is shifted left by one so that the final multiplication
148 // in Barrett reduction won't be misaligned by one bit. We could
149 // use "(p32 << 1) | 1" instead of "p32 << 1" too but it makes
150 // no difference as that bit won't affect the relevant output bits.
151 //
152 // NOTE: The 33-bit value fits in 64 bits so, unlike with CRC64,
153 // there is no need to compensate for any missing bits in the code.
154 printf("const __m128i mu_p = _mm_set_epi64x("
155 "0x%016" PRIx64 ", 0x%" PRIx64 ");\n",
156 (calc_cldiv(p32) << 1) | 1,
157 p32 << 1);
158
159 return 0;
160 }
161