xref: /freebsd/contrib/xz/src/liblzma/check/crc_clmul_consts_gen.c (revision 128836d304d93f2d00eb14069c27089ab46c38d4)
1 // SPDX-License-Identifier: 0BSD
2 
3 ///////////////////////////////////////////////////////////////////////////////
4 //
5 /// \file       crc_clmul_consts_gen.c
6 /// \brief      Generate constants for CLMUL CRC code
7 ///
8 /// Compiling: gcc -std=c99 -o crc_clmul_consts_gen crc_clmul_consts_gen.c
9 ///
10 /// This is for CRCs that use reversed bit order (bit reflection).
11 /// The same CLMUL CRC code can be used with CRC64 and smaller ones like
12 /// CRC32 apart from one special case: CRC64 needs an extra step in the
13 /// Barrett reduction to handle the 65th bit; the smaller ones don't.
14 /// Otherwise it's enough to just change the polynomial and the derived
15 /// constants and use the same code.
16 ///
17 /// See the Intel white paper "Fast CRC Computation for Generic Polynomials
18 /// Using PCLMULQDQ Instruction" from 2009.
19 //
20 //  Author:     Lasse Collin
21 //
22 ///////////////////////////////////////////////////////////////////////////////
23 
24 #include <inttypes.h>
25 #include <stdio.h>
26 
27 
28 /// CRC32 (Ethernet) polynomial in reversed representation
29 static const uint64_t p32 = 0xedb88320;
30 
31 // CRC64 (ECMA-182) polynomial in reversed representation
32 static const uint64_t p64 = 0xc96c5795d7870f42;
33 
34 
35 /// Calculates floor(x^128 / p) where p is a CRC64 polynomial in
36 /// reversed representation. The result is in reversed representation too.
37 static uint64_t
calc_cldiv(uint64_t p)38 calc_cldiv(uint64_t p)
39 {
40 	// Quotient
41 	uint64_t q = 0;
42 
43 	// Align the x^64 term with the x^128 (the implied high bits of the
44 	// divisor and the dividend) and do the first step of polynomial long
45 	// division, calculating the first remainder. The variable q remains
46 	// zero because the highest bit of the quotient is an implied bit 1
47 	// (we kind of set q = 1 << -1).
48 	uint64_t r = p;
49 
50 	// Then process the remaining 64 terms. Note that r has no implied
51 	// high bit, only q and p do. (And remember that a high bit in the
52 	// polynomial is stored at a low bit in the variable due to the
53 	// reversed bit order.)
54 	for (unsigned i = 0; i < 64; ++i) {
55 		q |= (r & 1) << i;
56 		r = (r >> 1) ^ (r & 1 ? p : 0);
57 	}
58 
59 	return q;
60 }
61 
62 
63 /// Calculate the remainder of carryless division:
64 ///
65 ///     x^(bits + n - 1) % p, where n=64 (for CRC64)
66 ///
67 /// p must be in reversed representation which omits the bit of
68 /// the highest term of the polynomial. Instead, it is an implied bit
69 /// at kind of like "1 << -1" position, as if it had just been shifted out.
70 ///
71 /// The return value is in the reversed bit order. (There are no implied bits.)
72 static uint64_t
calc_clrem(uint64_t p,unsigned bits)73 calc_clrem(uint64_t p, unsigned bits)
74 {
75 	// Do the first step of polynomial long division.
76 	uint64_t r = p;
77 
78 	// Then process the remaining terms. Start with i = 1 instead of i = 0
79 	// to account for the -1 in x^(bits + n - 1). This -1 is convenient
80 	// with the reversed bit order. See the "Bit-Reflection" section in
81 	// the Intel white paper.
82 	for (unsigned i = 1; i < bits; ++i)
83 		r = (r >> 1) ^ (r & 1 ? p : 0);
84 
85 	return r;
86 }
87 
88 
89 extern int
main(void)90 main(void)
91 {
92 	puts("// CRC64");
93 
94 	// The order of the two 64-bit constants in a vector don't matter.
95 	// It feels logical to put them in this order as it matches the
96 	// order in which the input bytes are read.
97 	printf("const __m128i fold512 = _mm_set_epi64x("
98 		"0x%016" PRIx64 ", 0x%016" PRIx64 ");\n",
99 		calc_clrem(p64, 4 * 128 - 64),
100 		calc_clrem(p64, 4 * 128));
101 
102 	printf("const __m128i fold128 = _mm_set_epi64x("
103 		"0x%016" PRIx64 ", 0x%016" PRIx64 ");\n",
104 		calc_clrem(p64, 128 - 64),
105 		calc_clrem(p64, 128));
106 
107 	// When we multiply by mu, we care about the high bits of the result
108 	// (in reversed bit order!). It doesn't matter that the low bit gets
109 	// shifted out because the affected output bits will be ignored.
110 	// Below we add the implied high bit with "| 1" after the shifting
111 	// so that the high bits of the multiplication will be correct.
112 	//
113 	// p64 is shifted left by one so that the final multiplication
114 	// in Barrett reduction won't be misaligned by one bit. We could
115 	// use "(p64 << 1) | 1" instead of "p64 << 1" too but it makes
116 	// no difference as that bit won't affect the relevant output bits
117 	// (we only care about the lowest 64 bits of the result, that is,
118 	// lowest in the reversed bit order).
119 	//
120 	// NOTE: The 65rd bit of p64 gets shifted out. It needs to be
121 	// compensated with 64-bit shift and xor in the CRC64 code.
122 	printf("const __m128i mu_p = _mm_set_epi64x("
123 		"0x%016" PRIx64 ", 0x%016" PRIx64 ");\n",
124 		(calc_cldiv(p64) << 1) | 1,
125 		p64 << 1);
126 
127 	puts("");
128 
129 	puts("// CRC32");
130 
131 	printf("const __m128i fold512 = _mm_set_epi64x("
132 		"0x%08" PRIx64 ", 0x%08" PRIx64 ");\n",
133 		calc_clrem(p32, 4 * 128 - 64),
134 		calc_clrem(p32, 4 * 128));
135 
136 	printf("const __m128i fold128 = _mm_set_epi64x("
137 		"0x%08" PRIx64 ", 0x%08" PRIx64 ");\n",
138 		calc_clrem(p32, 128 - 64),
139 		calc_clrem(p32, 128));
140 
141 	// CRC32 calculation is done by modulus scaling it to a CRC64.
142 	// Since the CRC is in reversed representation, only the mu
143 	// constant changes with the modulus scaling. This method avoids
144 	// one additional constant and one additional clmul in the final
145 	// reduction steps, making the code both simpler and faster.
146 	//
147 	// p32 is shifted left by one so that the final multiplication
148 	// in Barrett reduction won't be misaligned by one bit. We could
149 	// use "(p32 << 1) | 1" instead of "p32 << 1" too but it makes
150 	// no difference as that bit won't affect the relevant output bits.
151 	//
152 	// NOTE: The 33-bit value fits in 64 bits so, unlike with CRC64,
153 	// there is no need to compensate for any missing bits in the code.
154 	printf("const __m128i mu_p = _mm_set_epi64x("
155 		"0x%016" PRIx64 ", 0x%" PRIx64 ");\n",
156 		(calc_cldiv(p32) << 1) | 1,
157 		p32 << 1);
158 
159 	return 0;
160 }
161