/* xref: /linux/arch/x86/lib/crc32-pclmul.S (revision 37b33c68b00089a574ebd0a856a5d554eb3001b7) */
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using the hardware-provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is an instruction introduced with Intel SSE4.2; the reference can
 * be found at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *	      Alexander Boyko <Alexander_Boyko@xyratex.com>
 */
17*55d1ecceSEric Biggers
#include <linux/linkage.h>	/* SYM_FUNC_START, SYM_FUNC_END, RET */


.section .rodata
.align 16
/*
 * Fold multipliers derived from the bit-reflected CRC32 polynomial
 * P(x) = 0xEDB88320.  Each .octa packs two 64-bit constants: pclmulqdq
 * selects the low quadword with immediate $0x00 and the high quadword
 * with immediate $0x11.
 *
 * [(x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
 * #define CONSTANT_R1  0x154442bd4LL
 *
 * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
 * #define CONSTANT_R2  0x1c6e41596LL
 */
.Lconstant_R2R1:
	.octa 0x00000001c6e415960000000154442bd4
/*
 * Multipliers for folding a single 128-bit chunk forward by 128 bits.
 *
 * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
 * #define CONSTANT_R3  0x1751997d0LL
 *
 * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
 * #define CONSTANT_R4  0x0ccaa009eLL
 */
.Lconstant_R4R3:
	.octa 0x00000000ccaa009e00000001751997d0
/*
 * Multiplier for the final 64 -> 32 bit fold.
 *
 * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
 * #define CONSTANT_R5  0x163cd6124LL
 */
.Lconstant_R5:
	.octa 0x00000000000000000000000163cd6124
/* Mask that keeps only the low 32 bits of a quadword. */
.Lconstant_mask32:
	.octa 0x000000000000000000000000FFFFFFFF
/*
 * Barrett reduction pair: high qword = mu (quotient estimate constant),
 * low qword = the full polynomial.
 *
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
 * #define CONSTANT_RU  0x1F7011641LL
 */
.Lconstant_RUpoly:
	.octa 0x00000001F701164100000001DB710641
/* %xmm0 holds whichever fold/reduction constant pair is currently active. */
#define CONSTANT %xmm0

#ifdef __x86_64__
/* x86-64 SysV ABI: crc in %edi, buffer in %rsi, length in %rdx. */
#define CRC     %edi
#define BUF     %rsi
#define LEN     %rdx
#else
/*
 * 32-bit: args in %eax/%edx/%ecx — presumably the kernel's regparm(3)
 * register-argument convention; confirm against the C caller's prototype.
 */
#define CRC     %eax
#define BUF     %edx
#define LEN     %ecx
#endif



.text
/**
 *      Calculate crc32 (little-endian/reflected form) using PCLMULQDQ folding.
 *      CRC - initial crc32
 *      BUF - buffer (16 bytes aligned)
 *      LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63
 *      return %eax crc32
 *      u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len);
 *
 *      Clobbers: %xmm1-%xmm7 (and %xmm8 on x86-64), CONSTANT (%xmm0), flags.
 */

SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
	/* Load the first 64 bytes into xmm1..xmm4. */
	movdqa  (BUF), %xmm1
	movdqa  0x10(BUF), %xmm2
	movdqa  0x20(BUF), %xmm3
	movdqa  0x30(BUF), %xmm4
	/* Seed: XOR the initial CRC into the low 32 bits of the first chunk. */
	movd    CRC, CONSTANT
	pxor    CONSTANT, %xmm1
	sub     $0x40, LEN
	add     $0x40, BUF
	cmp     $0x40, LEN	/* another full 64-byte block available? */
	jb      .Lless_64

	/* R2:R1 fold each 128-bit lane forward by 512 bits (4 lanes). */
#ifdef __x86_64__
	movdqa .Lconstant_R2R1(%rip), CONSTANT
#else
	movdqa .Lconstant_R2R1, CONSTANT
#endif

.Lloop_64:/*  64 bytes Full cache line folding */
	prefetchnta    0x40(BUF)
	/* Save copies: each lane needs both a low-half and high-half product. */
	movdqa  %xmm1, %xmm5
	movdqa  %xmm2, %xmm6
	movdqa  %xmm3, %xmm7
#ifdef __x86_64__
	movdqa  %xmm4, %xmm8
#endif
	/* low qword of lane * R1 ... */
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x00, CONSTANT, %xmm2
	pclmulqdq $0x00, CONSTANT, %xmm3
#ifdef __x86_64__
	pclmulqdq $0x00, CONSTANT, %xmm4
#endif
	/* ... XORed with high qword of lane * R2. */
	pclmulqdq $0x11, CONSTANT, %xmm5
	pclmulqdq $0x11, CONSTANT, %xmm6
	pclmulqdq $0x11, CONSTANT, %xmm7
#ifdef __x86_64__
	pclmulqdq $0x11, CONSTANT, %xmm8
#endif
	pxor    %xmm5, %xmm1
	pxor    %xmm6, %xmm2
	pxor    %xmm7, %xmm3
#ifdef __x86_64__
	pxor    %xmm8, %xmm4
#else
	/* xmm8 unsupported for x32: process lane 4 serially through xmm5 */
	movdqa  %xmm4, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm4
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm4
#endif

	/* Absorb the next 64 bytes of input into the folded lanes. */
	pxor    (BUF), %xmm1
	pxor    0x10(BUF), %xmm2
	pxor    0x20(BUF), %xmm3
	pxor    0x30(BUF), %xmm4

	sub     $0x40, LEN
	add     $0x40, BUF
	cmp     $0x40, LEN
	jge     .Lloop_64
.Lless_64:/*  Folding cache line into 128bit */
	/* R4:R3 fold by 128 bits: collapse xmm1..xmm4 into xmm1 serially. */
#ifdef __x86_64__
	movdqa  .Lconstant_R4R3(%rip), CONSTANT
#else
	movdqa  .Lconstant_R4R3, CONSTANT
#endif
	prefetchnta     (BUF)

	/* xmm1 = fold(xmm1) ^ xmm2 */
	movdqa  %xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm2, %xmm1

	/* xmm1 = fold(xmm1) ^ xmm3 */
	movdqa  %xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm3, %xmm1

	/* xmm1 = fold(xmm1) ^ xmm4 */
	movdqa  %xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm4, %xmm1

	cmp     $0x10, LEN
	jb      .Lfold_64
.Lloop_16:/* Folding rest buffer into 128bit */
	/* Fold xmm1 forward 128 bits and absorb the next 16 input bytes. */
	movdqa  %xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    (BUF), %xmm1
	sub     $0x10, LEN
	add     $0x10, BUF
	cmp     $0x10, LEN
	jge     .Lloop_16

.Lfold_64:
	/* perform the last 64 bit fold, also adds 32 zeroes
	 * to the input stream */
	pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
	psrldq  $0x08, %xmm1		/* keep the high 64 bits */
	pxor    CONSTANT, %xmm1		/* 128 -> 64 bit remainder */

	/* final 32-bit fold: split into low 32 bits * R5 plus the rest */
	movdqa  %xmm1, %xmm2
#ifdef __x86_64__
	movdqa  .Lconstant_R5(%rip), CONSTANT
	movdqa  .Lconstant_mask32(%rip), %xmm3
#else
	movdqa  .Lconstant_R5, CONSTANT
	movdqa  .Lconstant_mask32, %xmm3
#endif
	psrldq  $0x04, %xmm2		/* upper bits, shifted down 32 */
	pand    %xmm3, %xmm1		/* low 32 bits only */
	pclmulqdq $0x00, CONSTANT, %xmm1
	pxor    %xmm2, %xmm1		/* 64 -> 32+32 bit remainder */

	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
#ifdef __x86_64__
	movdqa  .Lconstant_RUpoly(%rip), CONSTANT
#else
	movdqa  .Lconstant_RUpoly, CONSTANT
#endif
	movdqa  %xmm1, %xmm2
	pand    %xmm3, %xmm1		/* T1 = low 32 bits of remainder */
	pclmulqdq $0x10, CONSTANT, %xmm1 /* T1 * RU (quotient estimate) */
	pand    %xmm3, %xmm1
	pclmulqdq $0x00, CONSTANT, %xmm1 /* quotient * P(x) */
	pxor    %xmm2, %xmm1		/* subtract: final remainder */
	pextrd  $0x01, %xmm1, %eax	/* CRC result lives in dword 1 */

	RET
SYM_FUNC_END(crc32_pclmul_le_16)