/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Copyright 2025 Google LLC */

/*
 * This file is a "template" that generates a CRC function optimized using the
 * RISC-V Zbc (scalar carryless multiplication) extension. The includer of this
 * file must define the following parameters to specify the type of CRC:
 *
 *	crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
 *	LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural
 *		   mapping between bits and polynomial coefficients
 *		 1 for a lsb (least-significant-bit) first CRC, i.e. reflected
 *		   mapping between bits and polynomial coefficients
 */
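/*
 * A minimal sketch of how an includer might instantiate this template for a
 * 32-bit lsb-first CRC. The file, wrapper-function, and constant names below
 * are illustrative only; this template does not mandate them:
 *
 *	#define crc_t	u32
 *	#define LSB_CRC	1
 *	#include "crc-clmul-template.h"
 *
 *	u32 crc32_lsb_clmul(u32 crc, const void *p, size_t len,
 *			    const struct crc_clmul_consts *consts)
 *	{
 *		return crc_clmul(crc, p, len, consts);
 *	}
 */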
#include <asm/byteorder.h>
#include <linux/minmax.h>

#define CRC_BITS	(8 * sizeof(crc_t)) /* a.k.a. 'n' */

static inline unsigned long clmul(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmul %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}

static inline unsigned long clmulh(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmulh %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}

static inline unsigned long clmulr(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmulr %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}
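/*
 * Per the RISC-V Zbc specification, the three instructions above select
 * different windows of the full 2*XLEN-bit carry-less product of two XLEN-bit
 * operands: clmul returns bits XLEN-1:0, clmulh returns bits 2*XLEN-1:XLEN,
 * and clmulr returns bits 2*XLEN-2:XLEN-1, i.e. the product shifted right by
 * XLEN-1 bits. The Barrett reduction below relies on exactly these windows.
 */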
/*
 * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
 * polynomial whose bit order matches the CRC's bit order.
 */
#ifdef CONFIG_64BIT
#  if LSB_CRC
#    define crc_load_long(x)	le64_to_cpup(x)
#  else
#    define crc_load_long(x)	be64_to_cpup(x)
#  endif
#else
#  if LSB_CRC
#    define crc_load_long(x)	le32_to_cpup(x)
#  else
#    define crc_load_long(x)	be32_to_cpup(x)
#  endif
#endif

/* XOR @crc into the end of @msgpoly that represents the high-order terms. */
static inline unsigned long
crc_clmul_prep(crc_t crc, unsigned long msgpoly)
{
#if LSB_CRC
	return msgpoly ^ crc;
#else
	return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
#endif
}

/*
 * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
 * modulo the generator polynomial G. This gives the CRC of @msgpoly.
 */
static inline crc_t
crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
{
	unsigned long tmp;

	/*
	 * First step of Barrett reduction with integrated multiplication by
	 * x^n: calculate floor((msgpoly * x^n) / G). This is the value by
	 * which G needs to be multiplied to cancel out the x^n and higher
	 * terms of msgpoly * x^n. Do it using the following formula:
	 *
	 * msb-first:
	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
	 * lsb-first:
	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
	 *
	 * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
	 * which fits a long exactly. Using any lower power of x there would
	 * not carry enough precision through the calculation, while using any
	 * higher power of x would require extra instructions to handle a wider
	 * multiplication. In the msb-first case, using this power of x results
	 * in needing a floored division by x^(BITS_PER_LONG-1), which matches
	 * what clmulr produces. In the lsb-first case, a factor of x gets
	 * implicitly introduced by each carryless multiplication (shown as
	 * '* x' above), and the floored division instead needs to be by
	 * x^BITS_PER_LONG which matches what clmul produces.
	 */
#if LSB_CRC
	tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
#else
	tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
#endif

	/*
	 * Second step of Barrett reduction:
	 *
	 *    crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
	 *
	 * This reduces (msgpoly * x^n) modulo G by adding the appropriate
	 * multiple of G to it. The result uses only the x^0..x^(n-1) terms.
	 * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
	 * terms in the first place, it is more efficient to do the equivalent:
	 *
	 *    crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
	 *
	 * In the lsb-first case further modify it to the following which avoids
	 * a shift, as the crc ends up in the physically low n bits from clmulr:
	 *
	 *    product := ((G - x^n) * x^(BITS_PER_LONG - n)) *
	 *		 floor((msgpoly * x^n) / G) * x
	 *    crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
	 *
	 * barrett_reduction_const_2 contains the constant multiplier (G - x^n)
	 * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above. The
	 * cast of the result to crc_t is essential, as it applies the mod x^n!
	 */
#if LSB_CRC
	return clmulr(tmp, consts->barrett_reduction_const_2);
#else
	return clmul(tmp, consts->barrett_reduction_const_2);
#endif
}

/* Update @crc with the data from @msgpoly. */
static inline crc_t
crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
		      const struct crc_clmul_consts *consts)
{
	return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
}

/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */
static inline crc_t
crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
			 const struct crc_clmul_consts *consts)
{
	unsigned long msgpoly;
	size_t i;

#if LSB_CRC
	msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8));
#else
	msgpoly = p[0];
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly << 8) ^ p[i];
#endif

	if (len >= sizeof(crc_t)) {
#if LSB_CRC
		msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
#else
		msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
#endif
		return crc_clmul_long(msgpoly, consts);
	}
#if LSB_CRC
	msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
#else
	msgpoly ^= crc >> (CRC_BITS - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
#endif
}

static inline crc_t
crc_clmul(crc_t crc, const void *p, size_t len,
	  const struct crc_clmul_consts *consts)
{
	size_t align;

	/* This implementation assumes that the CRC fits in an unsigned long. */
	BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));

	/* If the buffer is not long-aligned, align it. */
	align = (unsigned long)p % sizeof(unsigned long);
	if (align && len) {
		align = min(sizeof(unsigned long) - align, len);
		crc = crc_clmul_update_partial(crc, p, align, consts);
		p += align;
		len -= align;
	}

	if (len >= 4 * sizeof(unsigned long)) {
		unsigned long m0, m1;

		m0 = crc_clmul_prep(crc, crc_load_long(p));
		m1 = crc_load_long(p + sizeof(unsigned long));
		p += 2 * sizeof(unsigned long);
		len -= 2 * sizeof(unsigned long);
		/*
		 * Main loop. Each iteration starts with a message polynomial
		 * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
		 * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
		 * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then
		 * "folds" that back into a congruent (modulo G) value that
		 * uses just m0 and m1 again. This is done by multiplying m0
		 * by the precomputed constant (x^(3*BITS_PER_LONG) mod G) and
		 * m1 by the precomputed constant (x^(2*BITS_PER_LONG) mod G),
		 * then adding the results to m2 and m3 as appropriate. Each
		 * such multiplication produces a result twice the length of a
		 * long, which in RISC-V is two instructions clmul and clmulh.
		 *
		 * This could be changed to fold across more than 2 longs at a
		 * time if there is a CPU that can take advantage of it.
		 */
		do {
			unsigned long p0, p1, p2, p3;

			p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
			p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
			p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
			p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
			m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
			m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
			     crc_load_long(p + sizeof(unsigned long));

			p += 2 * sizeof(unsigned long);
			len -= 2 * sizeof(unsigned long);
		} while (len >= 2 * sizeof(unsigned long));

		crc = crc_clmul_long(m0, consts);
		crc = crc_clmul_update_long(crc, m1, consts);
	}

	while (len >= sizeof(unsigned long)) {
		crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
		p += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}

	if (len)
		crc = crc_clmul_update_partial(crc, p, len, consts);

	return crc;
}
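/*
 * A sketch of how a caller might dispatch to the generated function at
 * runtime. The wrapper, fallback, and constant names are illustrative only
 * and are not defined by this template:
 *
 *	#include <asm/cpufeature.h>
 *
 *	u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 *	{
 *		if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
 *			return crc32_lsb_clmul(crc, p, len,
 *					       &crc32_lsb_consts);
 *		return crc32_le_base(crc, p, len);
 *	}
 *
 * where crc32_lsb_consts would hold the precomputed Barrett and folding
 * constants described in the comments above.
 */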