//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd
// Copyright (C) 2019-2024 Google LLC
//
// Authors: Ard Biesheuvel <ardb@google.com>
//          Eric Biggers <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a+crypto

	init_crc	.req	w0
	buf		.req	x1
	len		.req	x2
	fold_consts_ptr	.req	x5

	fold_consts	.req	v10

	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22

	perm		.req	v27

	.macro		pmull16x64_p64, a16, b64, c64
	pmull2		\c64\().1q, \a16\().2d, \b64\().2d
	pmull		\b64\().1q, \a16\().1d, \b64\().1d
	.endm

	/*
	 * Pairwise long polynomial multiplication of two 16-bit values
	 *
	 *   { w0, w1 }, { y0, y1 }
	 *
	 * by two 64-bit values
	 *
	 *   { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
	 *
	 * where each vector element is a byte, ordered from least to most
	 * significant.
	 *
	 * This can be implemented using 8x8 long polynomial multiplication, by
	 * reorganizing the input so that each pairwise 8x8 multiplication
	 * produces one of the terms from the decomposition below, and
	 * combining the results of each rank and shifting them into place.
	 *
	 * Rank
	 *  0            w0*x0 ^              |        y0*z0 ^
	 *  1       (w0*x1 ^ w1*x0) <<  8 ^   |   (y0*z1 ^ y1*z0) <<  8 ^
	 *  2       (w0*x2 ^ w1*x1) << 16 ^   |   (y0*z2 ^ y1*z1) << 16 ^
	 *  3       (w0*x3 ^ w1*x2) << 24 ^   |   (y0*z3 ^ y1*z2) << 24 ^
	 *  4       (w0*x4 ^ w1*x3) << 32 ^   |   (y0*z4 ^ y1*z3) << 32 ^
	 *  5       (w0*x5 ^ w1*x4) << 40 ^   |   (y0*z5 ^ y1*z4) << 40 ^
	 *  6       (w0*x6 ^ w1*x5) << 48 ^   |   (y0*z6 ^ y1*z5) << 48 ^
	 *  7       (w0*x7 ^ w1*x6) << 56 ^   |   (y0*z7 ^ y1*z6) << 56 ^
	 *  8            w1*x7      << 64     |        y1*z7      << 64
	 *
	 * The inputs can be reorganized into
	 *
	 *   { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
	 *   { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
	 *
	 * and after performing 8x8->16 bit long polynomial multiplication of
	 * each of the halves of the first vector with those of the second one,
	 * we obtain the following four vectors of 16-bit elements:
	 *
	 *   a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
	 *   b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
	 *   c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
	 *   d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
	 *
	 * Results b and c can be XORed together, as the vector elements have
	 * matching ranks. Then, the final XOR (*) can be pulled forward, and
	 * applied between the halves of each of the remaining three vectors,
	 * which are then shifted into place, and combined to produce two
	 * 80-bit results.
	 *
	 * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
	 * to the 64x64 bit one above, but XOR'ing the outputs together will
	 * produce the expected result, and this is sufficient in the context of
	 * this algorithm.
	 */
	.macro		pmull16x64_p8, a16, b64, c64
	ext		t7.16b, \b64\().16b, \b64\().16b, #1
	tbl		t5.16b, {\a16\().16b}, perm.16b
	uzp1		t7.16b, \b64\().16b, t7.16b
	bl		__pmull_p8_16x64
	ext		\b64\().16b, t4.16b, t4.16b, #15
	eor		\c64\().16b, t8.16b, t5.16b
	.endm

SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
	ext		t6.16b, t5.16b, t5.16b, #8

	pmull		t3.8h, t7.8b, t5.8b
	pmull		t4.8h, t7.8b, t6.8b
	pmull2		t5.8h, t7.16b, t5.16b
	pmull2		t6.8h, t7.16b, t6.16b

	ext		t8.16b, t3.16b, t3.16b, #8
	eor		t4.16b, t4.16b, t6.16b
	ext		t7.16b, t5.16b, t5.16b, #8
	ext		t6.16b, t4.16b, t4.16b, #8
	eor		t8.8b, t8.8b, t3.8b
	eor		t5.8b, t5.8b, t7.8b
	eor		t4.8b, t4.8b, t6.8b
	ext		t5.16b, t5.16b, t5.16b, #14
	ret
SYM_FUNC_END(__pmull_p8_16x64)
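
// The 16x64 bit multiply implemented by pmull16x64_p8 and __pmull_p8_16x64
// can be modelled in C roughly as follows.  This is only an illustrative
// sketch (clmul8() and pmull16x64_model() are made-up helper names, and
// <stdint.h>/<string.h> are assumed); it builds each 16x64 bit product out
// of 8x8 bit carryless multiplies, one pair of products per rank, exactly
// as in the decomposition described above.
//
//	static uint16_t clmul8(uint8_t a, uint8_t b)
//	{
//		uint16_t r = 0;
//
//		for (int i = 0; i < 8; i++)
//			if (a & (1 << i))
//				r ^= (uint16_t)b << i;
//		return r;
//	}
//
//	/* 16x64 -> 80 bit carryless multiply; out[0] is the least
//	 * significant byte of the product. */
//	static void pmull16x64_model(uint16_t w, uint64_t x, uint8_t out[10])
//	{
//		memset(out, 0, 10);
//		for (int i = 0; i < 8; i++) {
//			uint8_t xi = (uint8_t)(x >> (8 * i));
//			uint16_t lo = clmul8((uint8_t)w, xi);
//			uint16_t hi = clmul8((uint8_t)(w >> 8), xi);
//
//			out[i + 0] ^= lo;		/* rank i	*/
//			out[i + 1] ^= lo >> 8;
//			out[i + 1] ^= hi;		/* rank i + 1	*/
//			out[i + 2] ^= hi >> 8;
//		}
//	}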


	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
	// into reg1, reg2.
	.macro		fold_32_bytes, p, reg1, reg2
	ldp		q11, q12, [buf], #0x20

	pmull16x64_\p	fold_consts, \reg1, v8

CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	pmull16x64_\p	fold_consts, \reg2, v9

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm

	// Fold src_reg into dst_reg, optionally loading the next fold constants
	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
	pmull16x64_\p	fold_consts, \src_reg, v8
	.ifnb		\load_next_consts
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	.endif
	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b
	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
	.endm
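
// One fold step, modelled in C (illustrative sketch only; it assumes
// <stdint.h> plus a compiler providing unsigned __int128, and the helper
// names are made up).  'acc' and 'blk' hold 16 bytes each in polynomial
// order, i.e. bit 127 is the coefficient of the highest power of x, matching
// the byte-reversed vector registers.  'acc' sits d bits ahead of 'blk' in
// the message, and k_lo = x^d mod G(x), k_hi = x^(d+64) mod G(x) are the two
// quadwords of the fold constant; the result is congruent to
// acc * x^d + blk modulo G(x).
//
//	static unsigned __int128 clmul_64x16(uint64_t a, uint16_t b)
//	{
//		unsigned __int128 r = 0;
//
//		for (int i = 0; i < 16; i++)
//			if (b & (1u << i))
//				r ^= (unsigned __int128)a << i;
//		return r;
//	}
//
//	static unsigned __int128 fold(unsigned __int128 acc,
//				      unsigned __int128 blk,
//				      uint16_t k_lo, uint16_t k_hi)
//	{
//		return clmul_64x16((uint64_t)(acc >> 64), k_hi) ^
//		       clmul_64x16((uint64_t)acc, k_lo) ^ blk;
//	}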

	.macro		crc_t10dif_pmull, p

	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp		len, #256
	b.lt		.Lless_than_256_bytes_\@

	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts

	// Load the first 128 data bytes.  Byte swapping is necessary to make
	// the bit order match the polynomial coefficient order.
	ldp		q0, q1, [buf]
	ldp		q2, q3, [buf, #0x20]
	ldp		q4, q5, [buf, #0x40]
	ldp		q6, q7, [buf, #0x60]
	add		buf, buf, #0x80
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	rev64		v2.16b, v2.16b			)
CPU_LE(	rev64		v3.16b, v3.16b			)
CPU_LE(	rev64		v4.16b, v4.16b			)
CPU_LE(	rev64		v5.16b, v5.16b			)
CPU_LE(	rev64		v6.16b, v6.16b			)
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v8.16b, #0
	mov		v8.h[7], init_crc
	eor		v0.16b, v0.16b, v8.16b

	// Load the constants for folding across 128 bytes.
	ld1		{fold_consts.2d}, [fold_consts_ptr]

	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
	// 128 to simplify the termination condition of the following loop.
	sub		len, len, #256

	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
	// bytes v0-v7 into them, storing the result back into v0-v7.
.Lfold_128_bytes_loop_\@:
	fold_32_bytes	\p, v0, v1
	fold_32_bytes	\p, v2, v3
	fold_32_bytes	\p, v4, v5
	fold_32_bytes	\p, v6, v7

	subs		len, len, #128
	b.ge		.Lfold_128_bytes_loop_\@

	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.

	// Fold across 64 bytes.
	add		fold_consts_ptr, fold_consts_ptr, #16
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	fold_16_bytes	\p, v0, v4
	fold_16_bytes	\p, v1, v5
	fold_16_bytes	\p, v2, v6
	fold_16_bytes	\p, v3, v7, 1
	// Fold across 32 bytes.
	fold_16_bytes	\p, v4, v6
	fold_16_bytes	\p, v5, v7, 1
	// Fold across 16 bytes.
	fold_16_bytes	\p, v6, v7

	// Add 128 to get the correct number of data bytes remaining in 0...127
	// (not counting v7), following the previous extra subtraction by 128.
	// Then subtract 16 to simplify the termination condition of the
	// following loop.
	adds		len, len, #(128-16)

	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
	// into them, storing the result back into v7.
	b.lt		.Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
	pmull16x64_\p	fold_consts, v7, v8
	eor		v7.16b, v7.16b, v8.16b
	ldr		q0, [buf], #16
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		len, len, #16
	b.ge		.Lfold_16_bytes_loop_\@

.Lfold_16_bytes_loop_done_\@:
	// Add 16 to get the correct number of data bytes remaining in 0...15
	// (not counting v7), following the previous extra subtraction by 16.
	adds		len, len, #16
	b.eq		.Lreduce_final_16_bytes_\@

.Lhandle_partial_segment_\@:
	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
	// 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
	// do this without needing a fold constant for each possible 'len',
	// redivide the bytes into a first chunk of 'len' bytes and a second
	// chunk of 16 bytes, then fold the first chunk into the second.

	// v0 = last 16 original data bytes
	add		buf, buf, len
	ldr		q0, [buf, #-16]
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)

	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
	adr_l		x4, .Lbyteshift_table + 16
	sub		x4, x4, len
	ld1		{v2.16b}, [x4]
	tbl		v1.16b, {v7.16b}, v2.16b

	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
	movi		v3.16b, #0x80
	eor		v2.16b, v2.16b, v3.16b
	tbl		v3.16b, {v7.16b}, v2.16b

	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
	sshr		v2.16b, v2.16b, #7

	// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
	// then '16-len' bytes from v1 (high-order bytes).
	bsl		v2.16b, v1.16b, v0.16b

	// Fold the first chunk into the second chunk, storing the result in v7.
	pmull16x64_\p	fold_consts, v3, v0
	eor		v7.16b, v3.16b, v0.16b
	eor		v7.16b, v7.16b, v2.16b
	b		.Lreduce_final_16_bytes_\@

.Lless_than_256_bytes_\@:
	// Checksumming a buffer of length 16...255 bytes

	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts

	// Load the first 16 data bytes.
	ldr		q7, [buf], #0x10
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v0.16b, #0
	mov		v0.h[7], init_crc
	eor		v7.16b, v7.16b, v0.16b

	// Load the fold-across-16-bytes constants.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16

	cmp		len, #16
	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
	subs		len, len, #32
	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
	add		len, len, #16
	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31

.Lreduce_final_16_bytes_\@:
	.endm

//
// void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
//			    u8 out[16]);
//
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
	frame_push	1

	// Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
	movi		perm.4h, #8, lsl #8
	orr		perm.2s, #1, lsl #16
	orr		perm.2s, #1, lsl #24
	zip1		perm.16b, perm.16b, perm.16b
	zip1		perm.16b, perm.16b, perm.16b

	crc_t10dif_pmull p8

CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
	str		q7, [x3]

	frame_pop
	ret
SYM_FUNC_END(crc_t10dif_pmull_p8)
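
// Unlike the p64 variant below, crc_t10dif_pmull_p8 stops after the folds
// and hands the 16 remaining bytes back through 'out' (x3); the final
// reduction to a 16-bit CRC is left to the caller.  A caller would use it
// roughly like this (sketch only; crc_t10dif_generic() is the generic
// helper declared in include/linux/crc-t10dif.h):
//
//	u8 out[16];
//
//	crc_t10dif_pmull_p8(crc, data, len, out);
//	crc = crc_t10dif_generic(0, out, sizeof(out));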

	.align		5
//
// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p64)
	crc_t10dif_pmull	p64

	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.

	movi		v2.16b, #0		// init zero register

	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16

	// Fold the high 64 bits into the low 64 bits, while also multiplying by
	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	// whose low 48 bits are 0.
	ext		v0.16b, v2.16b, v7.16b, #8
	pmull2		v7.1q, v7.2d, fold_consts.2d	// high bits * x^48 * (x^80 mod G(x))
	eor		v0.16b, v0.16b, v7.16b		// + low bits * x^64

	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
	mov		v0.s[3], v2.s[0]		// zero high 32 bits
	pmull		v1.1q, v1.1d, fold_consts.1d	// high 32 bits * x^48 * (x^48 mod G(x))
	eor		v0.16b, v0.16b, v1.16b		// + low bits

	// Load G(x) and floor(x^48 / G(x)).
	ld1		{fold_consts.2d}, [fold_consts_ptr]

	// Use Barrett reduction to compute the final CRC value.
	pmull2		v1.1q, v0.2d, fold_consts.2d	// high 32 bits * floor(x^48 / G(x))
	ushr		v1.2d, v1.2d, #32		// /= x^32
	pmull		v1.1q, v1.1d, fold_consts.1d	// *= G(x)
	ushr		v0.2d, v0.2d, #48
	eor		v0.16b, v0.16b, v1.16b		// + low 16 nonzero bits
	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

	umov		w0, v0.h[0]
	ret
SYM_FUNC_END(crc_t10dif_pmull_p64)
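
// For reference, the final value computed here should match this bit-serial
// model of CRC-T10DIF (illustrative C sketch only; assumes <stdint.h> and
// <stddef.h>):
//
//	static uint16_t crc_t10dif_bitwise(uint16_t crc, const uint8_t *p,
//					   size_t len)
//	{
//		while (len--) {
//			crc ^= (uint16_t)*p++ << 8;
//			for (int i = 0; i < 8; i++)
//				crc = (crc << 1) ^
//				      ((crc & 0x8000) ? 0x8bb7 : 0);
//		}
//		return crc;
//	}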

	.section	".rodata", "a"
	.align		4

// Fold constants precomputed from the polynomial 0x18bb7
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
.Lfold_across_128_bytes_consts:
	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
// .Lfold_across_64_bytes_consts:
	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
// .Lfold_across_32_bytes_consts:
	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
.Lfold_across_16_bytes_consts:
	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
// .Lfinal_fold_consts:
	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
// .Lbarrett_reduction_consts:
	.quad		0x0000000000018bb7	// G(x)
	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))
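
// The x^N mod G(x) constants above can be regenerated with a helper along
// these lines (illustrative C sketch only; the function name is made up):
//
//	static uint32_t xpow_mod_g(unsigned int n)	/* x^n mod G(x) */
//	{
//		uint32_t r = 1;			/* the polynomial '1' */
//
//		while (n--)			/* multiply by x, then reduce */
//			r = (r << 1) ^ ((r & 0x8000) ? 0x18bb7 : 0);
//		return r;
//	}
//
// For example, xpow_mod_g(8*128) should reproduce the first quadword of
// .Lfold_across_128_bytes_consts.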

// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0
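
// The single-register tbl lookups used with this table behave like the
// following C model (illustrative sketch only): out-of-range indices (here,
// any index with bit 7 set) select zero, which is what lets the same table
// serve both as the shift-left-by-'len' permutation and, after XORing each
// byte with 0x80, as the shift-right-by-'16 - len' permutation.
//
//	static void tbl16(uint8_t dst[16], const uint8_t src[16],
//			  const uint8_t idx[16])
//	{
//		for (int i = 0; i < 16; i++)
//			dst[i] = (idx[i] < 16) ? src[idx[i]] : 0;
//	}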