// SPDX-License-Identifier: GPL-2.0-only
/*
 * Accelerated CRC32 implementation with Zbc extension.
 *
 * Copyright (C) 2024 Intel Corporation
 */

#include <asm/hwcap.h>
#include <asm/alternative-macros.h>
#include <asm/byteorder.h>

#include <linux/types.h>
#include <linux/minmax.h>
#include <linux/crc32poly.h>
#include <linux/crc32.h>
#include <linux/byteorder/generic.h>
#include <linux/module.h>

/*
 * Refer to https://www.corsix.org/content/barrett-reduction-polynomials for
 * a better understanding of how this math works.
 *
 * let "+" denote polynomial add (XOR)
 * let "-" denote polynomial sub (XOR)
 * let "*" denote polynomial multiplication
 * let "/" denote polynomial floor division
 * let "S" denote source data, XLEN bits wide
 * let "P" denote the CRC32 polynomial
 * let "T" denote 2^(XLEN+32)
 * let "QT" denote the quotient of T/P, with the bit for 2^XLEN being implicit
 *
 * crc32(S, P)
 * => S * (2^32) - S * (2^32) / P * P
 * => lowest 32 bits of: S * (2^32) / P * P
 * => lowest 32 bits of: S * (2^32) * (T / P) / T * P
 * => lowest 32 bits of: S * (2^32) * quotient / T * P
 * => lowest 32 bits of: S * quotient / 2^XLEN * P
 * => lowest 32 bits of: (clmul_high_part(S, QT) + S) * P
 * => clmul_low_part(clmul_high_part(S, QT) + S, P)
 *
 * In the implementations below, the BE case is the more intuitive one, since
 * the higher-order bits sit in the more significant positions.
 */
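
/*
 * For reference only: a bit-at-a-time sketch of what one Zbc step below
 * computes for the LE (reflected) case, i.e. folding one naturally loaded
 * XLEN-bit word from a little-endian buffer into the running CRC.  The
 * helper name is made up for this illustration and nothing below calls it;
 * crc32c would use CRC32C_POLY_LE in the same way.
 */
static inline u32 crc32_le_word_ref(u32 crc, unsigned long data)
{
	unsigned long m = data ^ crc;

	/* Standard reflected shift register, one message bit per step */
	for (int i = 0; i < __riscv_xlen; i++)
		m = (m & 1) ? (m >> 1) ^ CRC32_POLY_LE : m >> 1;

	return (u32)m;
}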

#if __riscv_xlen == 64
/* Slide by XLEN bits per iteration */
# define STEP_ORDER 3

/* Each polynomial quotient below has an implicit bit for 2^XLEN */

/* Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in LE format */
# define CRC32_POLY_QT_LE 0x5a72d812fb808b20

/* Polynomial quotient of (2^(XLEN+32))/CRC32C_POLY, in LE format */
# define CRC32C_POLY_QT_LE 0xa434f61c6f5389f8

/*
 * Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in BE format; it should
 * match the bit-reversed version of CRC32_POLY_QT_LE.
 */
# define CRC32_POLY_QT_BE 0x04d101df481b4e5a
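
/*
 * The quotient constants can be reproduced with plain polynomial long
 * division, as sketched below (illustrative only, never called at run time;
 * the helper name is made up for this example).  Dividing x^(XLEN+32) by the
 * full 33-bit polynomial x^32 + poly_be yields an (XLEN+1)-bit quotient whose
 * 2^XLEN bit overflows out of the 64-bit accumulator, which is exactly the
 * "implicit bit" noted above.  crc32_poly_qt_be_ref(CRC32_POLY_BE) should
 * reproduce CRC32_POLY_QT_BE, and CRC32_POLY_QT_LE is its bit-reversed
 * counterpart, as documented above.
 */
static inline u64 crc32_poly_qt_be_ref(u32 poly_be)
{
	u64 quot = 0;
	u64 rem = 0;

	/* Schoolbook long division, bringing in dividend bits MSB first */
	for (int i = __riscv_xlen + 32; i >= 0; i--) {
		/* The dividend x^(XLEN+32) has only its top bit set */
		rem = (rem << 1) | (i == __riscv_xlen + 32);
		quot <<= 1;
		if (rem & (1ULL << 32)) {
			rem ^= (1ULL << 32) | poly_be;
			quot |= 1;
		}
	}

	return quot;
}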

static inline u64 crc32_le_prep(u32 crc, unsigned long const *ptr)
{
	return (u64)crc ^ (__force u64)__cpu_to_le64(*ptr);
}

static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt)
{
	u32 crc;

	/* We don't have a "clmulrh" insn, so use clmul + slli instead. */
	asm volatile (".option push\n"
		      ".option arch,+zbc\n"
		      /* reflected high half of S * QT ("clmulrh") */
		      "clmul %0, %1, %2\n"
		      "slli %0, %0, 1\n"
		      /* add S for QT's implicit 2^XLEN bit */
		      "xor %0, %0, %1\n"
		      /* low 32 bits of the product with P */
		      "clmulr %0, %0, %3\n"
		      "srli %0, %0, 32\n"
		      ".option pop\n"
		      : "=&r" (crc)
		      : "r" (s),
			"r" (poly_qt),
			"r" ((u64)poly << 32)
		      :);
	return crc;
}

static inline u64 crc32_be_prep(u32 crc, unsigned long const *ptr)
{
	return ((u64)crc << 32) ^ (__force u64)__cpu_to_be64(*ptr);
}

#elif __riscv_xlen == 32
# define STEP_ORDER 2
/* Each quotient should match the upper half of its analog in RV64 */
# define CRC32_POLY_QT_LE 0xfb808b20
# define CRC32C_POLY_QT_LE 0x6f5389f8
# define CRC32_POLY_QT_BE 0x04d101df

static inline u32 crc32_le_prep(u32 crc, unsigned long const *ptr)
{
	return crc ^ (__force u32)__cpu_to_le32(*ptr);
}

static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt)
{
	u32 crc;

	/* We don't have a "clmulrh" insn, so use clmul + slli instead. */
	asm volatile (".option push\n"
		      ".option arch,+zbc\n"
		      /* reflected high half of S * QT ("clmulrh") */
		      "clmul %0, %1, %2\n"
		      "slli %0, %0, 1\n"
		      /* add S for QT's implicit 2^XLEN bit */
		      "xor %0, %0, %1\n"
		      /* low 32 bits of the product with P */
		      "clmulr %0, %0, %3\n"
		      ".option pop\n"
		      : "=&r" (crc)
		      : "r" (s),
			"r" (poly_qt),
			"r" (poly)
		      :);
	return crc;
}

static inline u32 crc32_be_prep(u32 crc, unsigned long const *ptr)
{
	return crc ^ (__force u32)__cpu_to_be32(*ptr);
}

#else
# error "Unexpected __riscv_xlen"
#endif

static inline u32 crc32_be_zbc(unsigned long s)
{
	u32 crc;

	asm volatile (".option push\n"
		      ".option arch,+zbc\n"
		      /* high part of S * QT */
		      "clmulh %0, %1, %2\n"
		      /* add S for QT's implicit 2^XLEN bit */
		      "xor %0, %0, %1\n"
		      /* low part of the product with P; the CRC is its low 32 bits */
		      "clmul %0, %0, %3\n"
		      ".option pop\n"
		      : "=&r" (crc)
		      : "r" (s),
			"r" (CRC32_POLY_QT_BE),
			"r" (CRC32_POLY_BE)
		      :);
	return crc;
}

#define STEP (1 << STEP_ORDER)
#define OFFSET_MASK (STEP - 1)

typedef u32 (*fallback)(u32 crc, unsigned char const *p, size_t len);

static inline u32 crc32_le_unaligned(u32 crc, unsigned char const *p,
				     size_t len, u32 poly,
				     unsigned long poly_qt)
{
	size_t bits = len * 8;
	unsigned long s = 0;
	u32 crc_low = 0;

	/* Gather the bytes into the most significant "bits" bits of s */
	for (int i = 0; i < len; i++)
		s = ((unsigned long)*p++ << (__riscv_xlen - 8)) | (s >> 8);

	/* Fold in the old CRC; bits not fed to crc32_le_zbc() are added back below */
	s ^= (unsigned long)crc << (__riscv_xlen - bits);
	if (__riscv_xlen == 32 || len < sizeof(u32))
		crc_low = crc >> bits;

	crc = crc32_le_zbc(s, poly, poly_qt);
	crc ^= crc_low;

	return crc;
}

static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
					  size_t len, u32 poly,
					  unsigned long poly_qt,
					  fallback crc_fb)
{
	size_t offset, head_len, tail_len;
	unsigned long const *p_ul;
	unsigned long s;

	/* The jump is patched to a nop at boot when Zbc is available. */
	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
			     RISCV_ISA_EXT_ZBC, 1)
		 : : : : legacy);

	/* Handle the unaligned head. */
	offset = (unsigned long)p & OFFSET_MASK;
	if (offset && len) {
		head_len = min(STEP - offset, len);
		crc = crc32_le_unaligned(crc, p, head_len, poly, poly_qt);
		p += head_len;
		len -= head_len;
	}

	/* Process the aligned middle, one XLEN-bit word per iteration. */
	tail_len = len & OFFSET_MASK;
	len = len >> STEP_ORDER;
	p_ul = (unsigned long const *)p;

	for (int i = 0; i < len; i++) {
		s = crc32_le_prep(crc, p_ul);
		crc = crc32_le_zbc(s, poly, poly_qt);
		p_ul++;
	}

	/* Handle the tail bytes. */
	p = (unsigned char const *)p_ul;
	if (tail_len)
		crc = crc32_le_unaligned(crc, p, tail_len, poly, poly_qt);

	return crc;

legacy:
	return crc_fb(crc, p, len);
}

u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len)
{
	return crc32_le_generic(crc, p, len, CRC32_POLY_LE, CRC32_POLY_QT_LE,
				crc32_le_base);
}
EXPORT_SYMBOL(crc32_le_arch);

u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len)
{
	return crc32_le_generic(crc, p, len, CRC32C_POLY_LE,
				CRC32C_POLY_QT_LE, crc32c_le_base);
}
EXPORT_SYMBOL(crc32c_le_arch);

static inline u32 crc32_be_unaligned(u32 crc, unsigned char const *p,
				     size_t len)
{
	size_t bits = len * 8;
	unsigned long s = 0;
	u32 crc_low = 0;

	/* Gather the bytes into the least significant "bits" bits of s */
	for (int i = 0; i < len; i++)
		s = *p++ | (s << 8);

	/* Fold in the old CRC; bits not fed to crc32_be_zbc() are added back below */
	if (__riscv_xlen == 32 || len < sizeof(u32)) {
		s ^= crc >> (32 - bits);
		crc_low = crc << bits;
	} else {
		s ^= (unsigned long)crc << (bits - 32);
	}

	crc = crc32_be_zbc(s);
	crc ^= crc_low;

	return crc;
}

u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len)
{
	size_t offset, head_len, tail_len;
	unsigned long const *p_ul;
	unsigned long s;

	/* The jump is patched to a nop at boot when Zbc is available. */
	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
			     RISCV_ISA_EXT_ZBC, 1)
		 : : : : legacy);

	/* Handle the unaligned head. */
	offset = (unsigned long)p & OFFSET_MASK;
	if (offset && len) {
		head_len = min(STEP - offset, len);
		crc = crc32_be_unaligned(crc, p, head_len);
		p += head_len;
		len -= head_len;
	}

	/* Process the aligned middle, one XLEN-bit word per iteration. */
	tail_len = len & OFFSET_MASK;
	len = len >> STEP_ORDER;
	p_ul = (unsigned long const *)p;

	for (int i = 0; i < len; i++) {
		s = crc32_be_prep(crc, p_ul);
		crc = crc32_be_zbc(s);
		p_ul++;
	}

	/* Handle the tail bytes. */
	p = (unsigned char const *)p_ul;
	if (tail_len)
		crc = crc32_be_unaligned(crc, p, tail_len);

	return crc;

legacy:
	return crc32_be_base(crc, p, len);
}
EXPORT_SYMBOL(crc32_be_arch);
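
/*
 * Note: callers are not expected to use the *_arch() routines above directly.
 * The generic crc32_le(), crc32_be() and crc32c() helpers declared in
 * <linux/crc32.h> dispatch to them when this accelerated implementation is
 * built in, e.g. (illustrative only, buf/len being the caller's buffer):
 *
 *	u32 csum = crc32_le(~0, buf, len);
 */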

u32 crc32_optimizations(void)
{
	if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
		return CRC32_LE_OPTIMIZATION |
		       CRC32_BE_OPTIMIZATION |
		       CRC32C_OPTIMIZATION;
	return 0;
}
EXPORT_SYMBOL(crc32_optimizations);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Accelerated CRC32 implementation with Zbc extension");