1*bbe2610bSEric Biggers /* SPDX-License-Identifier: GPL-2.0-or-later */ 2*bbe2610bSEric Biggers /* Copyright 2025 Google LLC */ 3*bbe2610bSEric Biggers 4*bbe2610bSEric Biggers /* 5*bbe2610bSEric Biggers * This file is a "template" that generates a CRC function optimized using the 6*bbe2610bSEric Biggers * RISC-V Zbc (scalar carryless multiplication) extension. The includer of this 7*bbe2610bSEric Biggers * file must define the following parameters to specify the type of CRC: 8*bbe2610bSEric Biggers * 9*bbe2610bSEric Biggers * crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC 10*bbe2610bSEric Biggers * LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural 11*bbe2610bSEric Biggers * mapping between bits and polynomial coefficients 12*bbe2610bSEric Biggers * 1 for a lsb (least-significant-bit) first CRC, i.e. reflected 13*bbe2610bSEric Biggers * mapping between bits and polynomial coefficients 14*bbe2610bSEric Biggers */ 15*bbe2610bSEric Biggers 16*bbe2610bSEric Biggers #include <asm/byteorder.h> 17*bbe2610bSEric Biggers #include <linux/minmax.h> 18*bbe2610bSEric Biggers 19*bbe2610bSEric Biggers #define CRC_BITS (8 * sizeof(crc_t)) /* a.k.a. 
'n' */ 20*bbe2610bSEric Biggers 21*bbe2610bSEric Biggers static inline unsigned long clmul(unsigned long a, unsigned long b) 22*bbe2610bSEric Biggers { 23*bbe2610bSEric Biggers unsigned long res; 24*bbe2610bSEric Biggers 25*bbe2610bSEric Biggers asm(".option push\n" 26*bbe2610bSEric Biggers ".option arch,+zbc\n" 27*bbe2610bSEric Biggers "clmul %0, %1, %2\n" 28*bbe2610bSEric Biggers ".option pop\n" 29*bbe2610bSEric Biggers : "=r" (res) : "r" (a), "r" (b)); 30*bbe2610bSEric Biggers return res; 31*bbe2610bSEric Biggers } 32*bbe2610bSEric Biggers 33*bbe2610bSEric Biggers static inline unsigned long clmulh(unsigned long a, unsigned long b) 34*bbe2610bSEric Biggers { 35*bbe2610bSEric Biggers unsigned long res; 36*bbe2610bSEric Biggers 37*bbe2610bSEric Biggers asm(".option push\n" 38*bbe2610bSEric Biggers ".option arch,+zbc\n" 39*bbe2610bSEric Biggers "clmulh %0, %1, %2\n" 40*bbe2610bSEric Biggers ".option pop\n" 41*bbe2610bSEric Biggers : "=r" (res) : "r" (a), "r" (b)); 42*bbe2610bSEric Biggers return res; 43*bbe2610bSEric Biggers } 44*bbe2610bSEric Biggers 45*bbe2610bSEric Biggers static inline unsigned long clmulr(unsigned long a, unsigned long b) 46*bbe2610bSEric Biggers { 47*bbe2610bSEric Biggers unsigned long res; 48*bbe2610bSEric Biggers 49*bbe2610bSEric Biggers asm(".option push\n" 50*bbe2610bSEric Biggers ".option arch,+zbc\n" 51*bbe2610bSEric Biggers "clmulr %0, %1, %2\n" 52*bbe2610bSEric Biggers ".option pop\n" 53*bbe2610bSEric Biggers : "=r" (res) : "r" (a), "r" (b)); 54*bbe2610bSEric Biggers return res; 55*bbe2610bSEric Biggers } 56*bbe2610bSEric Biggers 57*bbe2610bSEric Biggers /* 58*bbe2610bSEric Biggers * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a 59*bbe2610bSEric Biggers * polynomial whose bit order matches the CRC's bit order. 
/*
 * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
 * polynomial whose bit order matches the CRC's bit order.
 */
#ifdef CONFIG_64BIT
#  if LSB_CRC
#    define crc_load_long(x)	le64_to_cpup(x)
#  else
#    define crc_load_long(x)	be64_to_cpup(x)
#  endif
#else
#  if LSB_CRC
#    define crc_load_long(x)	le32_to_cpup(x)
#  else
#    define crc_load_long(x)	be32_to_cpup(x)
#  endif
#endif

/* XOR @crc into the end of @msgpoly that represents the high-order terms. */
static inline unsigned long
crc_clmul_prep(crc_t crc, unsigned long msgpoly)
{
#if LSB_CRC
	/* lsb-first: high-order polynomial terms are the physically low bits */
	return msgpoly ^ crc;
#else
	/* msb-first: high-order polynomial terms are the physically high bits */
	return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
#endif
}

/*
 * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
 * modulo the generator polynomial G.  This gives the CRC of @msgpoly.
 */
static inline crc_t
crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
{
	unsigned long tmp;

	/*
	 * First step of Barrett reduction with integrated multiplication by
	 * x^n: calculate floor((msgpoly * x^n) / G).  This is the value by
	 * which G needs to be multiplied to cancel out the x^n and higher terms
	 * of msgpoly * x^n.  Do it using the following formula:
	 *
	 * msb-first:
	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
	 * lsb-first:
	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
	 *
	 * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
	 * which fits a long exactly.  Using any lower power of x there would
	 * not carry enough precision through the calculation, while using any
	 * higher power of x would require extra instructions to handle a wider
	 * multiplication.  In the msb-first case, using this power of x results
	 * in needing a floored division by x^(BITS_PER_LONG-1), which matches
	 * what clmulr produces.  In the lsb-first case, a factor of x gets
	 * implicitly introduced by each carryless multiplication (shown as
	 * '* x' above), and the floored division instead needs to be by
	 * x^BITS_PER_LONG which matches what clmul produces.
	 */
#if LSB_CRC
	tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
#else
	tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
#endif

	/*
	 * Second step of Barrett reduction:
	 *
	 *    crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
	 *
	 * This reduces (msgpoly * x^n) modulo G by adding the appropriate
	 * multiple of G to it.  The result uses only the x^0..x^(n-1) terms.
	 * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
	 * terms in the first place, it is more efficient to do the equivalent:
	 *
	 *    crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
	 *
	 * In the lsb-first case further modify it to the following which avoids
	 * a shift, as the crc ends up in the physically low n bits from clmulr:
	 *
	 *    product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x
	 *    crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
	 *
	 * barrett_reduction_const_2 contains the constant multiplier (G - x^n)
	 * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above.  The
	 * cast of the result to crc_t is essential, as it applies the mod x^n!
	 */
#if LSB_CRC
	return clmulr(tmp, consts->barrett_reduction_const_2);
#else
	return clmul(tmp, consts->barrett_reduction_const_2);
#endif
}

/* Update @crc with the data from @msgpoly. */
static inline crc_t
crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
		      const struct crc_clmul_consts *consts)
{
	return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
}

/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */
static inline crc_t
crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
			 const struct crc_clmul_consts *consts)
{
	unsigned long msgpoly;
	size_t i;

	/* Assemble the message polynomial byte-by-byte, in the CRC's bit order. */
#if LSB_CRC
	msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8));
#else
	msgpoly = p[0];
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly << 8) ^ p[i];
#endif

	if (len >= sizeof(crc_t)) {
		/*
		 * The message is at least as long as the CRC, so the entire
		 * CRC can be XOR'ed into the high-order terms of msgpoly.
		 */
#if LSB_CRC
		msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
#else
		msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
#endif
		return crc_clmul_long(msgpoly, consts);
	}
	/*
	 * The message is shorter than the CRC.  XOR in the part of the CRC
	 * that overlaps the message, reduce, then XOR back in the remaining
	 * part of the CRC that extends beyond the message.
	 */
#if LSB_CRC
	msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
#else
	msgpoly ^= crc >> (CRC_BITS - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
#endif
}

/*
 * Update @crc with @len bytes of data from @p.  Handles an unaligned prefix,
 * a folding fast path for long buffers, long-at-a-time processing, and a
 * partial-long tail.
 */
static inline crc_t
crc_clmul(crc_t crc, const void *p, size_t len,
	  const struct crc_clmul_consts *consts)
{
	size_t align;

	/* This implementation assumes that the CRC fits in an unsigned long. */
	BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));

	/* If the buffer is not long-aligned, align it. */
	align = (unsigned long)p % sizeof(unsigned long);
	if (align && len) {
		align = min(sizeof(unsigned long) - align, len);
		crc = crc_clmul_update_partial(crc, p, align, consts);
		p += align;
		len -= align;
	}

	if (len >= 4 * sizeof(unsigned long)) {
		unsigned long m0, m1;

		m0 = crc_clmul_prep(crc, crc_load_long(p));
		m1 = crc_load_long(p + sizeof(unsigned long));
		p += 2 * sizeof(unsigned long);
		len -= 2 * sizeof(unsigned long);
		/*
		 * Main loop.  Each iteration starts with a message polynomial
		 * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
		 * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
		 * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then
		 * "folds" that back into a congruent (modulo G) value that uses
		 * just m0 and m1 again.  This is done by multiplying m0 by the
		 * precomputed constant (x^(3*BITS_PER_LONG) mod G) and m1 by
		 * the precomputed constant (x^(2*BITS_PER_LONG) mod G), then
		 * adding the results to m2 and m3 as appropriate.  Each such
		 * multiplication produces a result twice the length of a long,
		 * which in RISC-V is two instructions clmul and clmulh.
		 *
		 * This could be changed to fold across more than 2 longs at a
		 * time if there is a CPU that can take advantage of it.
		 */
		do {
			unsigned long p0, p1, p2, p3;

			p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
			p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
			p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
			p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
			/*
			 * lsb-first: clmul holds the low-order half of each
			 * double-wide product; msb-first: clmulh does.
			 */
			m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
			m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
			     crc_load_long(p + sizeof(unsigned long));

			p += 2 * sizeof(unsigned long);
			len -= 2 * sizeof(unsigned long);
		} while (len >= 2 * sizeof(unsigned long));

		/* Reduce the remaining two longs m0, m1 to a CRC value. */
		crc = crc_clmul_long(m0, consts);
		crc = crc_clmul_update_long(crc, m1, consts);
	}

	/* Process remaining whole longs one at a time. */
	while (len >= sizeof(unsigned long)) {
		crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
		p += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}

	/* Finally, the sub-long tail, if any. */
	if (len)
		crc = crc_clmul_update_partial(crc, p, len, consts);

	return crc;
}