xref: /linux/arch/x86/lib/crc32c-3way.S (revision 37b33c68b00089a574ebd0a856a5d554eb3001b7)
/*
 * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
 *
 * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
 * downloaded from:
 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
 *
 * Copyright (C) 2012 Intel Corporation.
 * Copyright 2024 Google LLC
 *
 * Authors:
 *	Wajdi Feghali <wajdi.k.feghali@intel.com>
 *	James Guilford <james.guilford@intel.com>
 *	David Cote <david.m.cote@intel.com>
 *	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/linkage.h>

## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction

# Define threshold below which buffers are considered "small" and routed to
# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200

/*
 * u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
 *
 * Feed len bytes starting at buffer into the CRC value crc using the crc32
 * instruction and return the updated value.
 *
 * Buffers of at least SMALL_SIZE bytes are first advanced to an 8-byte
 * boundary and then consumed in blocks made of three equal-length lanes of
 * up to 128 qwords each.  The lanes are run through three independent crc32q
 * chains and then merged with two pclmulqdq multiplications by constants
 * taken from K_table.  Whatever is left, and any buffer shorter than
 * SMALL_SIZE, goes through a single crc32 chain.
 *
 * Syntax:   GNU as, AT&T operand order, preprocessed by cpp
 * In:       %edi = crc, %rsi = buffer, %rdx = len
 * Out:      %eax = updated crc
 * Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %xmm0-%xmm2, flags
 * Stack:    not used
 *
 * NOTE(review): the code uses crc32, pmovzxdq and pclmulqdq and writes
 * %xmm0-%xmm2 without saving them; presumably the caller checks the CPU
 * features and owns the FPU/SIMD state around the call - confirm against
 * the caller.
 */
.text
SYM_FUNC_START(crc32c_x86_3way)
#define crc0		%edi		/* running CRC; CRC of lane 0 */
#define crc0_q		%rdi
#define bufp		%rsi		/* read cursor into the buffer */
#define bufp_d		%esi
#define len		%rdx		/* bytes not yet consumed */
#define len_dw		%edx
#define n_misaligned	%ecx		/* overlaps chunk_bytes! */
#define n_misaligned_q	%rcx
#define chunk_bytes	%ecx		/* overlaps n_misaligned! */
#define chunk_bytes_q	%rcx
#define crc1		%r8		/* CRC of lane 1 */
#define crc2		%r9		/* CRC of lane 2 */

	cmp	$SMALL_SIZE, len
	jb	.Lsmall

	################################################################
	## 1) ALIGN:
	################################################################
	mov	bufp_d, n_misaligned
	neg	n_misaligned
	and	$7, n_misaligned	# (-address) & 7 is the byte count up
					# to the next 8-byte boundary
	je	.Laligned		# Skip if aligned

	# Process 1 <= n_misaligned <= 7 bytes individually in order to align
	# the remaining data to an 8-byte boundary.  A full qword is loaded
	# once (len >= SMALL_SIZE on this path, so the load stays inside the
	# buffer) and its low bytes are then fed to crc32b one at a time.
.Ldo_align:
	movq	(bufp), %rax
	add	n_misaligned_q, bufp
	sub	n_misaligned_q, len
.Lalign_loop:
	crc32b	%al, crc0		# compute crc32 of 1-byte
	shr	$8, %rax		# get next byte
	dec	n_misaligned
	jne	.Lalign_loop
.Laligned:

	################################################################
	## 2) PROCESS BLOCK:
	################################################################

	# On exit from this step %eax holds the number of qwords that each of
	# the three lanes contributes to the block (at most 128).
	cmp	$128*24, len
	jae	.Lfull_block

.Lpartial_block:
	# Compute floor(len / 24) to get num qwords to process from each lane.
	# Here len < 128*24, so the multiply-and-shift below fits in 32 bits.
	imul	$2731, len_dw, %eax	# 2731 = ceil(2^16 / 24)
	shr	$16, %eax
	jmp	.Lcrc_3lanes

.Lfull_block:
	# Processing 128 qwords from each lane.
	mov	$128, %eax

	################################################################
	## 3) CRC each of three lanes:
	################################################################

	# Lane 0 starts at bufp, lane 1 at bufp + chunk_bytes and lane 2 at
	# bufp + 2*chunk_bytes.  crc0 carries the running CRC; crc1 and crc2
	# start from zero.  %eax counts the qwords still to do per lane.
.Lcrc_3lanes:
	xor	crc1, crc1
	xor	crc2, crc2
	mov	%eax, chunk_bytes
	shl	$3, chunk_bytes		# num bytes to process from each lane
	sub	$5, %eax		# 4 for 4x_loop, 1 for special last iter
	jl	.Lcrc_3lanes_4x_done

	# Unroll the loop by a factor of 4 to reduce the overhead of the loop
	# bookkeeping instructions, which can compete with crc32q for the ALUs.
.Lcrc_3lanes_4x_loop:
	crc32q	(bufp), crc0_q
	crc32q	(bufp,chunk_bytes_q), crc1
	crc32q	(bufp,chunk_bytes_q,2), crc2
	crc32q	8(bufp), crc0_q
	crc32q	8(bufp,chunk_bytes_q), crc1
	crc32q	8(bufp,chunk_bytes_q,2), crc2
	crc32q	16(bufp), crc0_q
	crc32q	16(bufp,chunk_bytes_q), crc1
	crc32q	16(bufp,chunk_bytes_q,2), crc2
	crc32q	24(bufp), crc0_q
	crc32q	24(bufp,chunk_bytes_q), crc1
	crc32q	24(bufp,chunk_bytes_q,2), crc2
	add	$32, bufp
	sub	$4, %eax
	jge	.Lcrc_3lanes_4x_loop

.Lcrc_3lanes_4x_done:
	add	$4, %eax		# qwords left, not counting the last one
	jz	.Lcrc_3lanes_last_qword

.Lcrc_3lanes_1x_loop:
	crc32q	(bufp), crc0_q
	crc32q	(bufp,chunk_bytes_q), crc1
	crc32q	(bufp,chunk_bytes_q,2), crc2
	add	$8, bufp
	dec	%eax
	jnz	.Lcrc_3lanes_1x_loop

	# Last qword of each lane.  Lanes 0 and 1 are finished here; the last
	# qword of lane 2 is left out and consumed by the combine step below.
.Lcrc_3lanes_last_qword:
	crc32q	(bufp), crc0_q
	crc32q	(bufp,chunk_bytes_q), crc1
# SKIP  crc32q	(bufp,chunk_bytes_q,2), crc2	; Don't do this one yet

	################################################################
	## 4) Combine three results:
	################################################################

	# The K_table entry is selected by chunk_bytes (8 bytes per entry, one
	# entry per lane length in qwords).  The two carry-less products are
	# XORed together with the last qword of lane 2, and a single crc32q
	# seeded with crc2 turns that into the CRC of the whole block.
	lea	(K_table-8)(%rip), %rax		# first entry is for idx 1
	pmovzxdq (%rax,chunk_bytes_q), %xmm0	# 2 consts: K1:K2
	lea	(chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
	sub	%rax, len			# len -= chunk_bytes * 3

	movq	crc0_q, %xmm1			# CRC for block 1
	pclmulqdq $0x00, %xmm0, %xmm1		# Multiply by K2

	movq	crc1, %xmm2			# CRC for block 2
	pclmulqdq $0x10, %xmm0, %xmm2		# Multiply by K1

	pxor	%xmm2, %xmm1
	movq	%xmm1, %rax
	xor	(bufp,chunk_bytes_q,2), %rax	# last qword of lane 2
	mov	crc2, crc0_q
	crc32q	%rax, crc0_q
	lea	8(bufp,chunk_bytes_q,2), bufp	# bufp = end of lane 2

	################################################################
	## 5) If more blocks remain, goto (2):
	################################################################

	cmp	$128*24, len
	jae	.Lfull_block
	cmp	$SMALL_SIZE, len
	jae	.Lpartial_block

	#######################################################################
	## 6) Process any remainder without interleaving:
	#######################################################################

	# Every path that reaches this label has len < SMALL_SIZE, so the
	# 32-bit view len_dw holds the whole count.  Whole qwords go first,
	# then bits 2, 1 and 0 of len select a dword, a word and a byte.
.Lsmall:
	test	len_dw, len_dw
	jz	.Ldone
	mov	len_dw, %eax
	shr	$3, %eax		# number of whole qwords
	jz	.Ldo_dword
.Ldo_qwords:
	crc32q	(bufp), crc0_q
	add	$8, bufp
	dec	%eax
	jnz	.Ldo_qwords
.Ldo_dword:
	test	$4, len_dw
	jz	.Ldo_word
	crc32l	(bufp), crc0
	add	$4, bufp
.Ldo_word:
	test	$2, len_dw
	jz	.Ldo_byte
	crc32w	(bufp), crc0
	add	$2, bufp
.Ldo_byte:
	test	$1, len_dw
	jz	.Ldone
	crc32b	(bufp), crc0
.Ldone:
	mov	crc0, %eax		# return value
	RET
SYM_FUNC_END(crc32c_x86_3way)

.section	.rodata, "a", @progbits
	################################################################
	## PCLMULQDQ tables
	## Table is 128 entries x 2 dwords (8 bytes) each.
	##
	## Entry i (1 <= i <= 128) is the one crc32c_x86_3way loads when
	## each lane is i qwords long; it is addressed as
	## (K_table - 8) + 8*i.  The first dword of an entry multiplies the
	## CRC of lane 0 and the second dword multiplies the CRC of lane 1
	## (the function comments call them K2 and K1 respectively).
	################################################################
.align 8
K_table:
	.long 0x493c7d27, 0x00000001
	.long 0xba4fc28e, 0x493c7d27
	.long 0xddc0152b, 0xf20c0dfe
	.long 0x9e4addf8, 0xba4fc28e
	.long 0x39d3b296, 0x3da6d0cb
	.long 0x0715ce53, 0xddc0152b
	.long 0x47db8317, 0x1c291d04
	.long 0x0d3b6092, 0x9e4addf8
	.long 0xc96cfdc0, 0x740eef02
	.long 0x878a92a7, 0x39d3b296
	.long 0xdaece73e, 0x083a6eec
	.long 0xab7aff2a, 0x0715ce53
	.long 0x2162d385, 0xc49f4f67
	.long 0x83348832, 0x47db8317
	.long 0x299847d5, 0x2ad91c30
	.long 0xb9e02b86, 0x0d3b6092
	.long 0x18b33a4e, 0x6992cea2
	.long 0xb6dd949b, 0xc96cfdc0
	.long 0x78d9ccb7, 0x7e908048
	.long 0xbac2fd7b, 0x878a92a7
	.long 0xa60ce07b, 0x1b3d8f29
	.long 0xce7f39f4, 0xdaece73e
	.long 0x61d82e56, 0xf1d0f55e
	.long 0xd270f1a2, 0xab7aff2a
	.long 0xc619809d, 0xa87ab8a8
	.long 0x2b3cac5d, 0x2162d385
	.long 0x65863b64, 0x8462d800
	.long 0x1b03397f, 0x83348832
	.long 0xebb883bd, 0x71d111a8
	.long 0xb3e32c28, 0x299847d5
	.long 0x064f7f26, 0xffd852c6
	.long 0xdd7e3b0c, 0xb9e02b86
	.long 0xf285651c, 0xdcb17aa4
	.long 0x10746f3c, 0x18b33a4e
	.long 0xc7a68855, 0xf37c5aee
	.long 0x271d9844, 0xb6dd949b
	.long 0x8e766a0c, 0x6051d5a2
	.long 0x93a5f730, 0x78d9ccb7
	.long 0x6cb08e5c, 0x18b0d4ff
	.long 0x6b749fb2, 0xbac2fd7b
	.long 0x1393e203, 0x21f3d99c
	.long 0xcec3662e, 0xa60ce07b
	.long 0x96c515bb, 0x8f158014
	.long 0xe6fc4e6a, 0xce7f39f4
	.long 0x8227bb8a, 0xa00457f7
	.long 0xb0cd4768, 0x61d82e56
	.long 0x39c7ff35, 0x8d6d2c43
	.long 0xd7a4825c, 0xd270f1a2
	.long 0x0ab3844b, 0x00ac29cf
	.long 0x0167d312, 0xc619809d
	.long 0xf6076544, 0xe9adf796
	.long 0x26f6a60a, 0x2b3cac5d
	.long 0xa741c1bf, 0x96638b34
	.long 0x98d8d9cb, 0x65863b64
	.long 0x49c3cc9c, 0xe0e9f351
	.long 0x68bce87a, 0x1b03397f
	.long 0x57a3d037, 0x9af01f2d
	.long 0x6956fc3b, 0xebb883bd
	.long 0x42d98888, 0x2cff42cf
	.long 0x3771e98f, 0xb3e32c28
	.long 0xb42ae3d9, 0x88f25a3a
	.long 0x2178513a, 0x064f7f26
	.long 0xe0ac139e, 0x4e36f0b0
	.long 0x170076fa, 0xdd7e3b0c
	.long 0x444dd413, 0xbd6f81f8
	.long 0x6f345e45, 0xf285651c
	.long 0x41d17b64, 0x91c9bd4b
	.long 0xff0dba97, 0x10746f3c
	.long 0xa2b73df1, 0x885f087b
	.long 0xf872e54c, 0xc7a68855
	.long 0x1e41e9fc, 0x4c144932
	.long 0x86d8e4d2, 0x271d9844
	.long 0x651bd98b, 0x52148f02
	.long 0x5bb8f1bc, 0x8e766a0c
	.long 0xa90fd27a, 0xa3c6f37a
	.long 0xb3af077a, 0x93a5f730
	.long 0x4984d782, 0xd7c0557f
	.long 0xca6ef3ac, 0x6cb08e5c
	.long 0x234e0b26, 0x63ded06a
	.long 0xdd66cbbb, 0x6b749fb2
	.long 0x4597456a, 0x4d56973c
	.long 0xe9e28eb4, 0x1393e203
	.long 0x7b3ff57a, 0x9669c9df
	.long 0xc9c8b782, 0xcec3662e
	.long 0x3f70cc6f, 0xe417f38a
	.long 0x93e106a4, 0x96c515bb
	.long 0x62ec6c6d, 0x4b9e0f71
	.long 0xd813b325, 0xe6fc4e6a
	.long 0x0df04680, 0xd104b8fc
	.long 0x2342001e, 0x8227bb8a
	.long 0x0a2a8d7e, 0x5b397730
	.long 0x6d9a4957, 0xb0cd4768
	.long 0xe8b6368b, 0xe78eb416
	.long 0xd2c3ed1a, 0x39c7ff35
	.long 0x995a5724, 0x61ff0e01
	.long 0x9ef68d35, 0xd7a4825c
	.long 0x0c139b31, 0x8d96551c
	.long 0xf2271e60, 0x0ab3844b
	.long 0x0b0bf8ca, 0x0bf80dd2
	.long 0x2664fd8b, 0x0167d312
	.long 0xed64812d, 0x8821abed
	.long 0x02ee03b2, 0xf6076544
	.long 0x8604ae0f, 0x6a45d2b2
	.long 0x363bd6b3, 0x26f6a60a
	.long 0x135c83fd, 0xd8d26619
	.long 0x5fabe670, 0xa741c1bf
	.long 0x35ec3279, 0xde87806c
	.long 0x00bcf5f6, 0x98d8d9cb
	.long 0x8ae00689, 0x14338754
	.long 0x17f27698, 0x49c3cc9c
	.long 0x58ca5f00, 0x5bd2011f
	.long 0xaa7c7ad5, 0x68bce87a
	.long 0xb5cfca28, 0xdd07448e
	.long 0xded288f8, 0x57a3d037
	.long 0x59f229bc, 0xdde8f5b9
	.long 0x6d390dec, 0x6956fc3b
	.long 0x37170390, 0xa3e3e02c
	.long 0x6353c1cc, 0x42d98888
	.long 0xc4584f5c, 0xd73c7bea
	.long 0xf48642e9, 0x3771e98f
	.long 0x531377e2, 0x80ff0093
	.long 0xdd35bc8d, 0xb42ae3d9
	.long 0xb25b29f2, 0x8fe4c34d
	.long 0x9a5ede41, 0x2178513a
	.long 0xa563905d, 0xdf99fc11
	.long 0x45cddf4e, 0xe0ac139e
	.long 0xacfa3103, 0x6c23e841
	.long 0xa51b6135, 0x170076fa