xref: /linux/arch/riscv/lib/crc-clmul-template.h (revision 4f9786035f9e519db41375818e1d0b5f20da2f10)
1*bbe2610bSEric Biggers /* SPDX-License-Identifier: GPL-2.0-or-later */
2*bbe2610bSEric Biggers /* Copyright 2025 Google LLC */
3*bbe2610bSEric Biggers 
4*bbe2610bSEric Biggers /*
5*bbe2610bSEric Biggers  * This file is a "template" that generates a CRC function optimized using the
6*bbe2610bSEric Biggers  * RISC-V Zbc (scalar carryless multiplication) extension.  The includer of this
7*bbe2610bSEric Biggers  * file must define the following parameters to specify the type of CRC:
8*bbe2610bSEric Biggers  *
9*bbe2610bSEric Biggers  *	crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
10*bbe2610bSEric Biggers  *	LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural
11*bbe2610bSEric Biggers  *		 mapping between bits and polynomial coefficients
12*bbe2610bSEric Biggers  *	         1 for a lsb (least-significant-bit) first CRC, i.e. reflected
13*bbe2610bSEric Biggers  *	         mapping between bits and polynomial coefficients
14*bbe2610bSEric Biggers  */
15*bbe2610bSEric Biggers 
16*bbe2610bSEric Biggers #include <asm/byteorder.h>
17*bbe2610bSEric Biggers #include <linux/minmax.h>
18*bbe2610bSEric Biggers 
19*bbe2610bSEric Biggers #define CRC_BITS	(8 * sizeof(crc_t))	/* a.k.a. 'n' */
20*bbe2610bSEric Biggers 
/*
 * Carryless multiplication (low half): returns the low BITS_PER_LONG bits of
 * the (2*BITS_PER_LONG - 1)-bit carryless (GF(2) polynomial) product of
 * @a and @b, using the Zbc 'clmul' instruction.  The .option push/pop dance
 * enables +zbc just for this instruction without affecting the rest of the
 * translation unit's arch string.
 */
static inline unsigned long clmul(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmul %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}
32*bbe2610bSEric Biggers 
/*
 * Carryless multiplication (high half): returns bits
 * 2*BITS_PER_LONG-1 .. BITS_PER_LONG of the carryless product of @a and @b
 * (the top bit is always 0, as the full product has 2*BITS_PER_LONG - 1
 * bits), using the Zbc 'clmulh' instruction.
 */
static inline unsigned long clmulh(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmulh %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}
44*bbe2610bSEric Biggers 
/*
 * Carryless multiplication, reversed: returns bits
 * 2*BITS_PER_LONG-2 .. BITS_PER_LONG-1 of the carryless product of @a and @b,
 * using the Zbc 'clmulr' instruction.  Equivalently, this is the bit-reversal
 * of clmul() applied to the bit-reversals of @a and @b, which is what makes it
 * useful for lsb-first (reflected) CRCs and for msb-first floored divisions by
 * x^(BITS_PER_LONG-1) (see crc_clmul_long()).
 */
static inline unsigned long clmulr(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmulr %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}
56*bbe2610bSEric Biggers 
/*
 * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
 * polynomial whose bit order matches the CRC's bit order: a little-endian load
 * for lsb-first (reflected) CRCs, a big-endian load for msb-first CRCs.
 */
#ifdef CONFIG_64BIT
#  if LSB_CRC
#    define crc_load_long(x)	le64_to_cpup(x)
#  else
#    define crc_load_long(x)	be64_to_cpup(x)
#  endif
#else
#  if LSB_CRC
#    define crc_load_long(x)	le32_to_cpup(x)
#  else
#    define crc_load_long(x)	be32_to_cpup(x)
#  endif
#endif
74*bbe2610bSEric Biggers 
75*bbe2610bSEric Biggers /* XOR @crc into the end of @msgpoly that represents the high-order terms. */
76*bbe2610bSEric Biggers static inline unsigned long
77*bbe2610bSEric Biggers crc_clmul_prep(crc_t crc, unsigned long msgpoly)
78*bbe2610bSEric Biggers {
79*bbe2610bSEric Biggers #if LSB_CRC
80*bbe2610bSEric Biggers 	return msgpoly ^ crc;
81*bbe2610bSEric Biggers #else
82*bbe2610bSEric Biggers 	return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
83*bbe2610bSEric Biggers #endif
84*bbe2610bSEric Biggers }
85*bbe2610bSEric Biggers 
/*
 * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
 * modulo the generator polynomial G.  This gives the CRC of @msgpoly.
 * @consts supplies the two precomputed Barrett reduction constants.
 */
static inline crc_t
crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
{
	unsigned long tmp;	/* will hold floor((msgpoly * x^n) / G) */

	/*
	 * First step of Barrett reduction with integrated multiplication by
	 * x^n: calculate floor((msgpoly * x^n) / G).  This is the value by
	 * which G needs to be multiplied to cancel out the x^n and higher terms
	 * of msgpoly * x^n.  Do it using the following formula:
	 *
	 * msb-first:
	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
	 * lsb-first:
	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
	 *
	 * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
	 * which fits a long exactly.  Using any lower power of x there would
	 * not carry enough precision through the calculation, while using any
	 * higher power of x would require extra instructions to handle a wider
	 * multiplication.  In the msb-first case, using this power of x results
	 * in needing a floored division by x^(BITS_PER_LONG-1), which matches
	 * what clmulr produces.  In the lsb-first case, a factor of x gets
	 * implicitly introduced by each carryless multiplication (shown as
	 * '* x' above), and the floored division instead needs to be by
	 * x^BITS_PER_LONG which matches what clmul produces.
	 */
#if LSB_CRC
	tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
#else
	tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
#endif

	/*
	 * Second step of Barrett reduction:
	 *
	 *    crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
	 *
	 * This reduces (msgpoly * x^n) modulo G by adding the appropriate
	 * multiple of G to it.  (Note: in GF(2) arithmetic, addition is XOR,
	 * so '+' and '-' are the same operation.)  The result uses only the
	 * x^0..x^(n-1) terms.
	 * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
	 * terms in the first place, it is more efficient to do the equivalent:
	 *
	 *    crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
	 *
	 * In the lsb-first case further modify it to the following which avoids
	 * a shift, as the crc ends up in the physically low n bits from clmulr:
	 *
	 *    product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x
	 *    crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
	 *
	 * barrett_reduction_const_2 contains the constant multiplier (G - x^n)
	 * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above.  The
	 * cast of the result to crc_t is essential, as it applies the mod x^n!
	 */
#if LSB_CRC
	return clmulr(tmp, consts->barrett_reduction_const_2);
#else
	return clmul(tmp, consts->barrett_reduction_const_2);
#endif
}
151*bbe2610bSEric Biggers 
152*bbe2610bSEric Biggers /* Update @crc with the data from @msgpoly. */
153*bbe2610bSEric Biggers static inline crc_t
154*bbe2610bSEric Biggers crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
155*bbe2610bSEric Biggers 		      const struct crc_clmul_consts *consts)
156*bbe2610bSEric Biggers {
157*bbe2610bSEric Biggers 	return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
158*bbe2610bSEric Biggers }
159*bbe2610bSEric Biggers 
/*
 * Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data.
 * @p need not be aligned.
 */
static inline crc_t
crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
			 const struct crc_clmul_consts *consts)
{
	unsigned long msgpoly;
	size_t i;

	/*
	 * Accumulate the data bytes into a message polynomial with the CRC's
	 * bit order, placed at the end of the long that holds the high-order
	 * terms: the physically low end for lsb-first, high end for msb-first.
	 */
#if LSB_CRC
	msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8));
#else
	msgpoly = p[0];
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly << 8) ^ p[i];
#endif

	if (len >= sizeof(crc_t)) {
		/*
		 * The full n-bit CRC overlaps the 8*len data bits: XOR it in,
		 * aligned with the start of the data, and reduce as usual.
		 */
	#if LSB_CRC
		msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
	#else
		msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
	#endif
		return crc_clmul_long(msgpoly, consts);
	}
	/*
	 * 8*len < n: only the CRC's first 8*len bits overlap the data.  XOR
	 * in the overlapping part and reduce; the remaining CRC terms stay
	 * below x^n even after the 8*len-bit shift the new data implies, so
	 * they carry through unreduced and are simply XOR'ed (shifted into
	 * place) into the result.
	 */
#if LSB_CRC
	msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
#else
	msgpoly ^= crc >> (CRC_BITS - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
#endif
}
194*bbe2610bSEric Biggers 
/*
 * Compute the CRC of the @len bytes at @p, starting from the initial CRC
 * value @crc, using the precomputed constants in @consts.  This is the
 * top-level entry point generated by this template.
 */
static inline crc_t
crc_clmul(crc_t crc, const void *p, size_t len,
	  const struct crc_clmul_consts *consts)
{
	size_t align;

	/* This implementation assumes that the CRC fits in an unsigned long. */
	BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));

	/* If the buffer is not long-aligned, align it. */
	align = (unsigned long)p % sizeof(unsigned long);
	if (align && len) {
		align = min(sizeof(unsigned long) - align, len);
		crc = crc_clmul_update_partial(crc, p, align, consts);
		p += align;
		len -= align;
	}

	if (len >= 4 * sizeof(unsigned long)) {
		unsigned long m0, m1;

		/* Prime the 2-long working state, folding @crc into the first. */
		m0 = crc_clmul_prep(crc, crc_load_long(p));
		m1 = crc_load_long(p + sizeof(unsigned long));
		p += 2 * sizeof(unsigned long);
		len -= 2 * sizeof(unsigned long);
		/*
		 * Main loop.  Each iteration starts with a message polynomial
		 * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
		 * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
		 * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then
		 * "folds" that back into a congruent (modulo G) value that uses
		 * just m0 and m1 again.  This is done by multiplying m0 by the
		 * precomputed constant (x^(3*BITS_PER_LONG) mod G) and m1 by
		 * the precomputed constant (x^(2*BITS_PER_LONG) mod G), then
		 * adding the results to m2 and m3 as appropriate.  Each such
		 * multiplication produces a result twice the length of a long,
		 * which in RISC-V is two instructions clmul and clmulh.
		 *
		 * This could be changed to fold across more than 2 longs at a
		 * time if there is a CPU that can take advantage of it.
		 */
		do {
			unsigned long p0, p1, p2, p3;

			p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
			p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
			p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
			p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
			/*
			 * Which product half pairs with which bit order: for
			 * lsb-first the "low" clmul halves feed the high-order
			 * long, and vice versa for msb-first.
			 */
			m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
			m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
			     crc_load_long(p + sizeof(unsigned long));

			p += 2 * sizeof(unsigned long);
			len -= 2 * sizeof(unsigned long);
		} while (len >= 2 * sizeof(unsigned long));

		/* Reduce the final 2-long state down to an n-bit CRC. */
		crc = crc_clmul_long(m0, consts);
		crc = crc_clmul_update_long(crc, m1, consts);
	}

	/* Process any remaining whole longs, one at a time. */
	while (len >= sizeof(unsigned long)) {
		crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
		p += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}

	/* Process the tail of 1 to sizeof(unsigned long)-1 bytes, if any. */
	if (len)
		crc = crc_clmul_update_partial(crc, p, len, consts);

	return crc;
}
266