/*
 * Derived from crc32c.c version 1.1 by Mark Adler.
 *
 * Copyright (C) 2013 Mark Adler
 *
 * This software is provided 'as-is', without any express or implied warranty.
 * In no event will the author be held liable for any damages arising from the
 * use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 *
 * Mark Adler
 * madler@alumni.caltech.edu
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * This file is compiled in userspace in order to run ATF unit tests.
 */
#ifdef USERSPACE_TESTING
#include <stdint.h>
#else
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/systm.h>
#endif

#include <nmmintrin.h>

/* CRC-32C (iSCSI) polynomial in reversed bit order. */
#define POLY	0x82f63b78

/*
 * Block sizes for three-way parallel crc computation.  LONG and SHORT must
 * both be powers of two.
 */
#define LONG	8192
#define SHORT	256

/* Tables for hardware crc that shift a crc by LONG and SHORT zero bytes. */
static uint32_t crc32c_long[4][256];
static uint32_t crc32c_short[4][256];

/*
 * Multiply a matrix by a vector over the Galois field of two elements,
 * GF(2).  Each element is a bit in an unsigned integer.  mat must have at
 * least as many entries as there are bits up to and including the most
 * significant one bit in vec.
 */
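/*
 * For example, gf2_matrix_times(mat, 0x5) evaluates to mat[0] ^ mat[2],
 * since bits 0 and 2 are the only bits set in the vector.
 */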
static inline uint32_t
gf2_matrix_times(uint32_t *mat, uint32_t vec)
{
	uint32_t sum;

	sum = 0;
	while (vec) {
		if (vec & 1)
			sum ^= *mat;
		vec >>= 1;
		mat++;
	}
	return (sum);
}

/*
 * Multiply a matrix by itself over GF(2).  Both mat and square must have 32
 * rows.
 */
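/*
 * Squaring composes the operator with itself, so an operator that appends
 * N zero bits to a crc becomes one that appends 2*N zero bits.
 */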
static inline void
gf2_matrix_square(uint32_t *square, uint32_t *mat)
{
	int n;

	for (n = 0; n < 32; n++)
		square[n] = gf2_matrix_times(mat, mat[n]);
}

/*
 * Construct an operator to apply len zero bytes to a crc.  len must be a
 * power of two.  If len is not a power of two, then the result is the same
 * as for the largest power of two less than len.  The result for len == 0
 * is the same as for len == 1.  A version of this routine could be easily
 * written for any len, but that is not needed for this application.
 */
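/*
 * For example, crc32c_zeros_op(even, 4) squares its way through operators
 * for 2, 4, 8, 16 and 32 zero bits, so even ends up holding the operator
 * that appends four zero bytes (32 zero bits) to a crc.
 */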
static void
crc32c_zeros_op(uint32_t *even, size_t len)
{
	uint32_t odd[32];       /* odd-power-of-two zeros operator */
	uint32_t row;
	int n;

	/* put operator for one zero bit in odd */
	odd[0] = POLY;              /* CRC-32C polynomial */
	row = 1;
	for (n = 1; n < 32; n++) {
		odd[n] = row;
		row <<= 1;
	}

	/* put operator for two zero bits in even */
	gf2_matrix_square(even, odd);

	/* put operator for four zero bits in odd */
	gf2_matrix_square(odd, even);

	/*
	 * first square will put the operator for one zero byte (eight zero
	 * bits) in even -- next square puts operator for two zero bytes in
	 * odd, and so on, until len has been shifted down to zero
	 */
	do {
		gf2_matrix_square(even, odd);
		len >>= 1;
		if (len == 0)
			return;
		gf2_matrix_square(odd, even);
		len >>= 1;
	} while (len);

	/* answer ended up in odd -- copy to even */
	for (n = 0; n < 32; n++)
		even[n] = odd[n];
}

/*
 * Take a length and build four lookup tables for applying the zeros operator
 * for that length, byte-by-byte on the operand.
 */
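/*
 * Splitting the operand into bytes works because the operator is linear
 * over GF(2): op(crc) == op(byte 0) ^ op(byte 1 << 8) ^ op(byte 2 << 16) ^
 * op(byte 3 << 24).  crc32c_shift() below combines the four table lookups
 * in exactly that way.
 */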
static void
crc32c_zeros(uint32_t zeros[][256], size_t len)
{
	uint32_t op[32];
	uint32_t n;

	crc32c_zeros_op(op, len);
	for (n = 0; n < 256; n++) {
		zeros[0][n] = gf2_matrix_times(op, n);
		zeros[1][n] = gf2_matrix_times(op, n << 8);
		zeros[2][n] = gf2_matrix_times(op, n << 16);
		zeros[3][n] = gf2_matrix_times(op, n << 24);
	}
}

/* Apply the zeros operator table to crc. */
static inline uint32_t
crc32c_shift(uint32_t zeros[][256], uint32_t crc)
{

	return (zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
	    zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24]);
}

/* Initialize tables for shifting crcs. */
static void
#ifdef USERSPACE_TESTING
__attribute__((__constructor__))
#endif
crc32c_init_hw(void)
{
	crc32c_zeros(crc32c_long, LONG);
	crc32c_zeros(crc32c_short, SHORT);
}
#ifdef _KERNEL
SYSINIT(crc32c_sse42, SI_SUB_LOCK, SI_ORDER_ANY, crc32c_init_hw, NULL);
#endif

/* Compute CRC-32C using the Intel hardware instruction. */
#ifdef USERSPACE_TESTING
uint32_t sse42_crc32c(uint32_t, const unsigned char *, unsigned);
#endif
uint32_t
sse42_crc32c(uint32_t crc, const unsigned char *buf, unsigned len)
{
#ifdef __amd64__
	const size_t align = 8;
#else
	const size_t align = 4;
#endif
	const unsigned char *next, *end;
	uint64_t crc0, crc1, crc2;      /* need to be 64 bits for crc32q */

	next = buf;
	crc0 = crc;

	/* Compute the crc to bring the data pointer to an aligned boundary. */
	while (len && ((uintptr_t)next & (align - 1)) != 0) {
		crc0 = _mm_crc32_u8(crc0, *next);
		next++;
		len--;
	}

	/*
	 * Compute the crc on sets of LONG*3 bytes, running three independent
	 * crc instruction streams, each over LONG bytes -- this is optimized
	 * for the Nehalem, Westmere, Sandy Bridge, and Ivy Bridge
	 * architectures, which have a throughput of one crc per cycle, but a
	 * latency of three cycles.
	 */
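	/*
	 * In the loop below, crc0, crc1 and crc2 accumulate the CRCs of the
	 * first, second and third LONG-byte block of each chunk.  Each
	 * crc32c_shift() call then advances a partial crc past LONG zero
	 * bytes, so the three lane results can be XORed into a single crc.
	 */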
	while (len >= LONG * 3) {
		crc1 = 0;
		crc2 = 0;
		end = next + LONG;
		do {
#ifdef __amd64__
			crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
			crc1 = _mm_crc32_u64(crc1,
			    *(const uint64_t *)(next + LONG));
			crc2 = _mm_crc32_u64(crc2,
			    *(const uint64_t *)(next + (LONG * 2)));
#else
			crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
			crc1 = _mm_crc32_u32(crc1,
			    *(const uint32_t *)(next + LONG));
			crc2 = _mm_crc32_u32(crc2,
			    *(const uint32_t *)(next + (LONG * 2)));
#endif
			next += align;
		} while (next < end);
		crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
		crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
		next += LONG * 2;
		len -= LONG * 3;
	}

	/*
	 * Do the same thing, but now on SHORT*3 blocks for the remaining data
	 * less than a LONG*3 block.
	 */
	while (len >= SHORT * 3) {
		crc1 = 0;
		crc2 = 0;
		end = next + SHORT;
		do {
#ifdef __amd64__
			crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
			crc1 = _mm_crc32_u64(crc1,
			    *(const uint64_t *)(next + SHORT));
			crc2 = _mm_crc32_u64(crc2,
			    *(const uint64_t *)(next + (SHORT * 2)));
#else
			crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
			crc1 = _mm_crc32_u32(crc1,
			    *(const uint32_t *)(next + SHORT));
			crc2 = _mm_crc32_u32(crc2,
			    *(const uint32_t *)(next + (SHORT * 2)));
#endif
			next += align;
		} while (next < end);
		crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
		crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
		next += SHORT * 2;
		len -= SHORT * 3;
	}

	/* Compute the crc on the remaining bytes at native word size. */
	end = next + (len - (len & (align - 1)));
	while (next < end) {
#ifdef __amd64__
		crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
#else
		crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
#endif
		next += align;
	}
	len &= (align - 1);

	/* Compute the crc for any trailing bytes. */
	while (len) {
		crc0 = _mm_crc32_u8(crc0, *next);
		next++;
		len--;
	}

	return ((uint32_t)crc0);
}
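
/*
 * Illustrative usage sketch (not part of the kernel or of the ATF tests):
 * assuming this file is compiled with -DUSERSPACE_TESTING and -msse4.2 and
 * linked into a small host program, a caller could exercise sse42_crc32c()
 * as below.  The CRC32C_USAGE_EXAMPLE guard and the main() harness are
 * hypothetical and exist only for this sketch; the seed and final inversion
 * follow the usual CRC-32C convention, under which the check value for
 * "123456789" is 0xe3069283.
 */
#ifdef CRC32C_USAGE_EXAMPLE
#include <stdio.h>
#include <string.h>

int
main(void)
{
	static const char check[] = "123456789";
	uint32_t crc;

	/* Seed with all ones and invert the result, per CRC-32C convention. */
	crc = sse42_crc32c(0xffffffff, (const unsigned char *)check,
	    strlen(check)) ^ 0xffffffff;
	printf("crc32c(\"%s\") = 0x%08x\n", check, crc);
	return (0);
}
#endif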