1 // SPDX-License-Identifier: 0BSD 2 3 /////////////////////////////////////////////////////////////////////////////// 4 // 5 /// \file crc_clmul_consts_gen.c 6 /// \brief Generate constants for CLMUL CRC code 7 /// 8 /// Compiling: gcc -std=c99 -o crc_clmul_consts_gen crc_clmul_consts_gen.c 9 /// 10 /// This is for CRCs that use reversed bit order (bit reflection). 11 /// The same CLMUL CRC code can be used with CRC64 and smaller ones like 12 /// CRC32 apart from one special case: CRC64 needs an extra step in the 13 /// Barrett reduction to handle the 65th bit; the smaller ones don't. 14 /// Otherwise it's enough to just change the polynomial and the derived 15 /// constants and use the same code. 16 /// 17 /// See the Intel white paper "Fast CRC Computation for Generic Polynomials 18 /// Using PCLMULQDQ Instruction" from 2009. 19 // 20 // Author: Lasse Collin 21 // 22 /////////////////////////////////////////////////////////////////////////////// 23 24 #include <inttypes.h> 25 #include <stdio.h> 26 27 28 /// CRC32 (Ethernet) polynomial in reversed representation 29 static const uint64_t p32 = 0xedb88320; 30 31 // CRC64 (ECMA-182) polynomial in reversed representation 32 static const uint64_t p64 = 0xc96c5795d7870f42; 33 34 35 /// Calculates floor(x^128 / p) where p is a CRC64 polynomial in 36 /// reversed representation. The result is in reversed representation too. 37 static uint64_t 38 calc_cldiv(uint64_t p) 39 { 40 // Quotient 41 uint64_t q = 0; 42 43 // Align the x^64 term with the x^128 (the implied high bits of the 44 // divisor and the dividend) and do the first step of polynomial long 45 // division, calculating the first remainder. The variable q remains 46 // zero because the highest bit of the quotient is an implied bit 1 47 // (we kind of set q = 1 << -1). 48 uint64_t r = p; 49 50 // Then process the remaining 64 terms. Note that r has no implied 51 // high bit, only q and p do. (And remember that a high bit in the 52 // polynomial is stored at a low bit in the variable due to the 53 // reversed bit order.) 54 for (unsigned i = 0; i < 64; ++i) { 55 q |= (r & 1) << i; 56 r = (r >> 1) ^ (r & 1 ? p : 0); 57 } 58 59 return q; 60 } 61 62 63 /// Calculate the remainder of carryless division: 64 /// 65 /// x^(bits + n - 1) % p, where n=64 (for CRC64) 66 /// 67 /// p must be in reversed representation which omits the bit of 68 /// the highest term of the polynomial. Instead, it is an implied bit 69 /// at kind of like "1 << -1" position, as if it had just been shifted out. 70 /// 71 /// The return value is in the reversed bit order. (There are no implied bits.) 72 static uint64_t 73 calc_clrem(uint64_t p, unsigned bits) 74 { 75 // Do the first step of polynomial long division. 76 uint64_t r = p; 77 78 // Then process the remaining terms. Start with i = 1 instead of i = 0 79 // to account for the -1 in x^(bits + n - 1). This -1 is convenient 80 // with the reversed bit order. See the "Bit-Reflection" section in 81 // the Intel white paper. 82 for (unsigned i = 1; i < bits; ++i) 83 r = (r >> 1) ^ (r & 1 ? p : 0); 84 85 return r; 86 } 87 88 89 extern int 90 main(void) 91 { 92 puts("// CRC64"); 93 94 // The order of the two 64-bit constants in a vector don't matter. 95 // It feels logical to put them in this order as it matches the 96 // order in which the input bytes are read. 97 printf("const __m128i fold512 = _mm_set_epi64x(" 98 "0x%016" PRIx64 ", 0x%016" PRIx64 ");\n", 99 calc_clrem(p64, 4 * 128 - 64), 100 calc_clrem(p64, 4 * 128)); 101 102 printf("const __m128i fold128 = _mm_set_epi64x(" 103 "0x%016" PRIx64 ", 0x%016" PRIx64 ");\n", 104 calc_clrem(p64, 128 - 64), 105 calc_clrem(p64, 128)); 106 107 // When we multiply by mu, we care about the high bits of the result 108 // (in reversed bit order!). It doesn't matter that the low bit gets 109 // shifted out because the affected output bits will be ignored. 110 // Below we add the implied high bit with "| 1" after the shifting 111 // so that the high bits of the multiplication will be correct. 112 // 113 // p64 is shifted left by one so that the final multiplication 114 // in Barrett reduction won't be misaligned by one bit. We could 115 // use "(p64 << 1) | 1" instead of "p64 << 1" too but it makes 116 // no difference as that bit won't affect the relevant output bits 117 // (we only care about the lowest 64 bits of the result, that is, 118 // lowest in the reversed bit order). 119 // 120 // NOTE: The 65rd bit of p64 gets shifted out. It needs to be 121 // compensated with 64-bit shift and xor in the CRC64 code. 122 printf("const __m128i mu_p = _mm_set_epi64x(" 123 "0x%016" PRIx64 ", 0x%016" PRIx64 ");\n", 124 (calc_cldiv(p64) << 1) | 1, 125 p64 << 1); 126 127 puts(""); 128 129 puts("// CRC32"); 130 131 printf("const __m128i fold512 = _mm_set_epi64x(" 132 "0x%08" PRIx64 ", 0x%08" PRIx64 ");\n", 133 calc_clrem(p32, 4 * 128 - 64), 134 calc_clrem(p32, 4 * 128)); 135 136 printf("const __m128i fold128 = _mm_set_epi64x(" 137 "0x%08" PRIx64 ", 0x%08" PRIx64 ");\n", 138 calc_clrem(p32, 128 - 64), 139 calc_clrem(p32, 128)); 140 141 // CRC32 calculation is done by modulus scaling it to a CRC64. 142 // Since the CRC is in reversed representation, only the mu 143 // constant changes with the modulus scaling. This method avoids 144 // one additional constant and one additional clmul in the final 145 // reduction steps, making the code both simpler and faster. 146 // 147 // p32 is shifted left by one so that the final multiplication 148 // in Barrett reduction won't be misaligned by one bit. We could 149 // use "(p32 << 1) | 1" instead of "p32 << 1" too but it makes 150 // no difference as that bit won't affect the relevant output bits. 151 // 152 // NOTE: The 33-bit value fits in 64 bits so, unlike with CRC64, 153 // there is no need to compensate for any missing bits in the code. 154 printf("const __m128i mu_p = _mm_set_epi64x(" 155 "0x%016" PRIx64 ", 0x%" PRIx64 ");\n", 156 (calc_cldiv(p32) << 1) | 1, 157 p32 << 1); 158 159 return 0; 160 } 161