/* SPDX-License-Identifier: GPL-2.0-or-later */
//
// Template to generate [V]PCLMULQDQ-based CRC functions for x86
//
// Copyright 2025 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>

#include <linux/linkage.h>
#include <linux/objtool.h>

// Offsets within the generated constants table
.set OFFSETOF_BSWAP_MASK,			-5*16	// msb-first CRCs only
.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS,	-4*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS,	-3*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS,	-2*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS,	-1*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS,	0*16	// must be 0
.set OFFSETOF_SHUF_TABLE,			1*16
.set OFFSETOF_BARRETT_REDUCTION_CONSTS,		4*16

// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
// corresponding non-VEX instruction plus any needed moves.  The supported
// instruction formats are:
//
//     - Two-arg [src, dst], where the non-VEX format is the same.
//     - Three-arg [src1, src2, dst] where the non-VEX format is
//	 [src1, src2_and_dst].  If src2 != dst, then src1 must != dst too.
//
// \insn gives the instruction without a "v" prefix and including any immediate
// argument if needed to make the instruction follow one of the above formats.
// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
// it first; this is needed when \arg1 is an unaligned mem operand.
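//
// For example (illustrative only): "_cond_vex pxor, %xmm1, %xmm2, %xmm3"
// emits "vpxor %xmm1, %xmm2, %xmm3" when VEX coding is allowed, and
// "movdqa %xmm2, %xmm3; pxor %xmm1, %xmm3" when it is not.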
.macro	_cond_vex	insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
.if AVX_LEVEL == 0
  // VEX not allowed.  Emulate it.
  .ifnb \arg3 // Three-arg [src1, src2, dst]
    .ifc "\arg2", "\arg3" // src2 == dst?
      .ifnb \unaligned_mem_tmp
	movdqu		\arg1, \unaligned_mem_tmp
	\insn		\unaligned_mem_tmp, \arg3
      .else
	\insn		\arg1, \arg3
      .endif
    .else // src2 != dst
      .ifc "\arg1", "\arg3"
	.error "Can't have src1 == dst when src2 != dst"
      .endif
      .ifnb \unaligned_mem_tmp
	movdqu		\arg1, \unaligned_mem_tmp
	movdqa		\arg2, \arg3
	\insn		\unaligned_mem_tmp, \arg3
      .else
	movdqa		\arg2, \arg3
	\insn		\arg1, \arg3
      .endif
    .endif
  .else // Two-arg [src, dst]
    .ifnb \unaligned_mem_tmp
	movdqu		\arg1, \unaligned_mem_tmp
	\insn		\unaligned_mem_tmp, \arg2
    .else
	\insn		\arg1, \arg2
    .endif
  .endif
.else
  // VEX is allowed.  Emit the desired instruction directly.
  .ifnb \arg3
	v\insn		\arg1, \arg2, \arg3
  .else
	v\insn		\arg1, \arg2
  .endif
.endif
.endm

// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
// register of length VL.
.macro	_vbroadcast	src, dst
.if VL == 16
	_cond_vex movdqa,	\src, \dst
.elseif VL == 32
	vbroadcasti128		\src, \dst
.else
	vbroadcasti32x4		\src, \dst
.endif
.endm

// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
.macro	_load_data	vl, src, bswap_mask, dst
.if \vl < 64
	_cond_vex movdqu,	"\src", \dst
.else
	vmovdqu8		\src, \dst
.endif
.if !LSB_CRC
	_cond_vex pshufb,	\bswap_mask, \dst, \dst
.endif
.endm

.macro	_prepare_v0	vl, v0, v1, bswap_mask
.if LSB_CRC
  .if \vl < 64
	_cond_vex pxor,		(BUF), \v0, \v0, unaligned_mem_tmp=\v1
  .else
	vpxorq			(BUF), \v0, \v0
  .endif
.else
	_load_data		\vl, (BUF), \bswap_mask, \v1
  .if \vl < 64
	_cond_vex pxor,		\v1, \v0, \v0
  .else
	vpxorq			\v1, \v0, \v0
  .endif
.endif
.endm

// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
// msb-first order or the physically high qword for lsb-first order
#define LO64_TERMS 0

// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
// qword for msb-first order or the physically low qword for lsb-first order
#define HI64_TERMS 1

// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
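//
// For example (illustrative only): with an lsb-first CRC (LSB_CRC == 1),
// selecting HI64_TERMS of both sources yields the immediate
// ((1 ^ 1) << 4) ^ (1 ^ 1) = 0x00, so pclmulqdq multiplies the physically low
// qwords, which is where the x^64..x^127 terms live in lsb-first order.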
.macro	_pclmulqdq	src1, src1_terms, src2, src2_terms, dst
	_cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
		  \src1, \src2, \dst
.endm

// Fold \acc into \data and store the result back into \acc.  \data can be an
// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
// byte-reflection is needed; otherwise it must be a vector register.  \consts
// is a vector register containing the needed fold constants, and \tmp is a
// temporary vector register.  All arguments must be the same length.
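//
// In polynomial terms, this computes for each 128-bit lane (with + meaning XOR
// and * meaning carryless multiplication):
//
//	\acc := \data + (HI64_TERMS of \acc) * (HI64_TERMS of \consts)
//		      + (LO64_TERMS of \acc) * (LO64_TERMS of \consts)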
.macro	_fold_vec	acc, data, consts, tmp
	_pclmulqdq	\consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
	_pclmulqdq	\consts, LO64_TERMS, \acc, LO64_TERMS, \acc
.if AVX_LEVEL <= 2
	_cond_vex pxor,	\data, \tmp, \tmp
	_cond_vex pxor,	\tmp, \acc, \acc
.else
	vpternlogq	$0x96, \data, \tmp, \acc
.endif
.endm

// Fold \acc into \data and store the result back into \acc.  \data is an
// unaligned mem operand, \consts is a vector register containing the needed
// fold constants, \bswap_mask is a vector register containing the
// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
// temporary vector registers.  All arguments must have length \vl.
.macro	_fold_vec_mem	vl, acc, data, consts, bswap_mask, tmp1, tmp2
.if AVX_LEVEL == 0 || !LSB_CRC
	_load_data	\vl, \data, \bswap_mask, \tmp1
	_fold_vec	\acc, \tmp1, \consts, \tmp2
.else
	_fold_vec	\acc, \data, \consts, \tmp1
.endif
.endm

// Load the constants for folding across 2**i vectors of length VL at a time
// into all 128-bit lanes of the vector register CONSTS.
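//
// For example (illustrative only): with VL == 32 (LOG2_VL == 5) and \i == 2,
// the offset below is (4 - 5 - 2)*16 = -3*16, i.e.
// OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, matching a fold distance of
// 2**2 vectors * 32 bytes = 1024 bits.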
.macro	_load_vec_folding_consts	i
	_vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
		    CONSTS
.endm

// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
// the result back into \v0.  If the remaining length mod \vl is nonzero, also
// fold \vl data bytes from BUF.  For both operations the fold distance is \vl.
// \consts must be a register of length \vl containing the fold constants.
.macro	_fold_vec_final	vl, v0, v1, consts, bswap_mask, tmp1, tmp2
	_fold_vec	\v0, \v1, \consts, \tmp1
	test		$\vl, LEN8
	jz		.Lfold_vec_final_done\@
	_fold_vec_mem	\vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
	add		$\vl, BUF
.Lfold_vec_final_done\@:
.endm

// This macro generates the body of a CRC function with the following prototype:
//
// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
//
// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
// |buf| is the data to checksum.  |len| is the data length in bytes, which must
// be at least 16.  |consts| is a pointer to the fold_across_128_bits_consts
// field of the constants struct that was generated for the chosen CRC variant.
//
// Moving on to the macro parameters, \n is the number of bits in the CRC, e.g.
// 32 for a CRC-32.  Currently the supported values are 8, 16, 32, and 64.  If
// the file is compiled in i386 mode, then the maximum supported value is 32.
//
// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0.  \lsb_crc is 0
// if the CRC processes the most significant bit of each byte first, i.e. maps
// bit0 to x^0, bit1 to x^1, ..., bit7 to x^7.
//
// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
//
// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
// 512 for AVX512.
//
// If \vl == 16 && \avx_level == 0, the generated code requires:
// PCLMULQDQ && SSE4.1.  (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
//
// If \vl == 32 && \avx_level == 2, the generated code requires:
// VPCLMULQDQ && AVX2.
//
// If \vl == 64 && \avx_level == 512, the generated code requires:
// VPCLMULQDQ && AVX512BW && AVX512VL.
//
// Other \vl and \avx_level combinations are either not supported or not useful.
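//
// For example (illustrative only), an lsb-first CRC-32 function using only SSE
// would be generated by instantiating this macro as
// "_crc_pclmul n=32, lsb_crc=1, vl=16, avx_level=0"; see
// DEFINE_CRC_PCLMUL_FUNCS at the end of this file for how the variants are
// instantiated.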
.macro	_crc_pclmul	n, lsb_crc, vl, avx_level
	.set	LSB_CRC,	\lsb_crc
	.set	VL,		\vl
	.set	AVX_LEVEL,	\avx_level

	// Define aliases for the xmm, ymm, or zmm registers according to VL.
.irp i, 0,1,2,3,4,5,6,7
  .if VL == 16
	.set	V\i,		%xmm\i
	.set	LOG2_VL,	4
  .elseif VL == 32
	.set	V\i,		%ymm\i
	.set	LOG2_VL,	5
  .elseif VL == 64
	.set	V\i,		%zmm\i
	.set	LOG2_VL,	6
  .else
	.error "Unsupported vector length"
  .endif
.endr
	// Define aliases for the function parameters.
	// Note: when crc_t is shorter than u32, zero-extension to 32 bits is
	// guaranteed by the ABI.  Zero-extension to 64 bits is *not* guaranteed
	// when crc_t is shorter than u64.
#ifdef __x86_64__
.if \n <= 32
	.set	CRC,		%edi
.else
	.set	CRC,		%rdi
.endif
	.set	BUF,		%rsi
	.set	LEN,		%rdx
	.set	LEN32,		%edx
	.set	LEN8,		%dl
	.set	CONSTS_PTR,	%rcx
#else
	// 32-bit support, assuming -mregparm=3 and not including support for
	// CRC-64 (which would use both eax and edx to pass the crc parameter).
	.set	CRC,		%eax
	.set	BUF,		%edx
	.set	LEN,		%ecx
	.set	LEN32,		%ecx
	.set	LEN8,		%cl
	.set	CONSTS_PTR,	%ebx	// Passed on stack
#endif

	// Define aliases for some local variables.  V0-V5 are used without
	// aliases (for accumulators, data, temporary values, etc).  Staying
	// within the first 8 vector registers keeps the code 32-bit SSE
	// compatible and reduces the size of 64-bit SSE code slightly.
	.set	BSWAP_MASK,	V6
	.set	BSWAP_MASK_YMM,	%ymm6
	.set	BSWAP_MASK_XMM,	%xmm6
	.set	CONSTS,		V7
	.set	CONSTS_YMM,	%ymm7
	.set	CONSTS_XMM,	%xmm7

	// Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
	// functions generated by this macro are called only by static_call.
	ANNOTATE_NOENDBR

#ifdef __i386__
	push		CONSTS_PTR
	mov		8(%esp), CONSTS_PTR
#endif

	// Create a 128-bit vector that contains the initial CRC in the end
	// representing the high-order polynomial coefficients, and the rest 0.
	// If the CRC is msb-first, also load the byte-reflection table.
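	// For example (illustrative only): for an lsb-first CRC-32 the initial
	// CRC stays in bytes 0-3 of %xmm0, while for an msb-first CRC-32 the
	// pslldq below shifts it into bytes 12-15.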
.if \n <= 32
	_cond_vex movd,	CRC, %xmm0
.else
	_cond_vex movq,	CRC, %xmm0
.endif
.if !LSB_CRC
	_cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
	_vbroadcast	OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
.endif

	// Load the first vector of data and XOR the initial CRC into the
	// appropriate end of the first 128-bit lane of data.  If LEN < VL, then
	// use a short vector and jump ahead to the final reduction.  (LEN >= 16
	// is guaranteed here but not necessarily LEN >= VL.)
.if VL >= 32
	cmp		$VL, LEN
	jae		.Lat_least_1vec\@
  .if VL == 64
	cmp		$32, LEN32
	jb		.Lless_than_32bytes\@
	_prepare_v0	32, %ymm0, %ymm1, BSWAP_MASK_YMM
	add		$32, BUF
	jmp		.Lreduce_256bits_to_128bits\@
.Lless_than_32bytes\@:
  .endif
	_prepare_v0	16, %xmm0, %xmm1, BSWAP_MASK_XMM
	add		$16, BUF
	vmovdqa		OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
	jmp		.Lcheck_for_partial_block\@
.Lat_least_1vec\@:
.endif
	_prepare_v0	VL, V0, V1, BSWAP_MASK

	// Handle VL <= LEN < 4*VL.
	cmp		$4*VL-1, LEN
	ja		.Lat_least_4vecs\@
	add		$VL, BUF
	// If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
	// If VL==16 then load fold_across_128_bits_consts first, as the final
	// reduction depends on it and it won't be loaded anywhere else.
	cmp		$2*VL-1, LEN32
.if VL == 16
	_cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
.endif
	jbe		.Lreduce_1vec_to_128bits\@
	// Otherwise 2*VL <= LEN < 4*VL.  Load one more vector and jump ahead to
	// the reduction from 2 vectors.
	_load_data	VL, (BUF), BSWAP_MASK, V1
	add		$VL, BUF
	jmp		.Lreduce_2vecs_to_1\@

.Lat_least_4vecs\@:
	// Load 3 more vectors of data.
	_load_data	VL, 1*VL(BUF), BSWAP_MASK, V1
	_load_data	VL, 2*VL(BUF), BSWAP_MASK, V2
	_load_data	VL, 3*VL(BUF), BSWAP_MASK, V3
	sub		$-4*VL, BUF	// Shorter than 'add 4*VL' when VL=32
	add		$-4*VL, LEN	// Shorter than 'sub 4*VL' when VL=32
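	// (When VL=32, the immediate -4*VL = -128 fits in a sign-extended
	// 8-bit immediate, whereas +128 would need the 4-byte immediate form.)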

	// Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
	// 4 vectors of data and write the result back to V0-V3.
	cmp		$4*VL-1, LEN	// Shorter than 'cmp 4*VL' when VL=32
	jbe		.Lreduce_4vecs_to_2\@
	_load_vec_folding_consts	2
.Lfold_4vecs_loop\@:
	_fold_vec_mem	VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem	VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem	VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem	VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	sub		$-4*VL, BUF
	add		$-4*VL, LEN
	cmp		$4*VL-1, LEN
	ja		.Lfold_4vecs_loop\@

	// Fold V0,V1 into V2,V3 and write the result back to V0,V1.  Then fold
	// two more vectors of data from BUF, if at least that much remains.
.Lreduce_4vecs_to_2\@:
	_load_vec_folding_consts	1
	_fold_vec	V0, V2, CONSTS, V4
	_fold_vec	V1, V3, CONSTS, V4
	test		$2*VL, LEN8
	jz		.Lreduce_2vecs_to_1\@
	_fold_vec_mem	VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem	VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	sub		$-2*VL, BUF

	// Fold V0 into V1 and write the result back to V0.  Then fold one more
	// vector of data from BUF, if at least that much remains.
.Lreduce_2vecs_to_1\@:
	_load_vec_folding_consts	0
	_fold_vec_final	VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5

.Lreduce_1vec_to_128bits\@:
.if VL == 64
	// Reduce 512-bit %zmm0 to 256-bit %ymm0.  Then fold 256 more bits of
	// data from BUF, if at least that much remains.
	vbroadcasti128	OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
	vextracti64x4	$1, %zmm0, %ymm1
	_fold_vec_final	32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
.Lreduce_256bits_to_128bits\@:
.endif
.if VL >= 32
	// Reduce 256-bit %ymm0 to 128-bit %xmm0.  Then fold 128 more bits of
	// data from BUF, if at least that much remains.
	vmovdqa		OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
	vextracti128	$1, %ymm0, %xmm1
	_fold_vec_final	16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
.Lcheck_for_partial_block\@:
.endif
	and		$15, LEN32
	jz		.Lreduce_128bits_to_crc\@

	// 1 <= LEN <= 15 data bytes remain in BUF.  The polynomial is now
	// A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
	// and B is the polynomial of the remaining LEN data bytes.  To reduce
	// this to 128 bits without needing fold constants for each possible
	// LEN, rearrange this expression into C1*(x^128) + C2, where
	// C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
	// Then fold C1 into C2, which is just another fold across 128 bits.
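	//
	// For example (illustrative only): if LEN == 4, then
	// C1 = floor(A / x^96), i.e. the 32 highest-order terms of A, and
	// C2 = (A*x^32 mod x^128) + B, so that A*x^32 + B = C1*x^128 + C2.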

.if !LSB_CRC || AVX_LEVEL == 0
	// Load the last 16 data bytes.  Note that originally LEN was >= 16.
	_load_data	16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
.endif // Else will use vpblendvb mem operand later.
.if !LSB_CRC
	neg		LEN	// Needed for indexing shuf_table
.endif

	// tmp = A*x^(8*LEN) mod x^128
	// lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
	//	i.e. right-shift by LEN bytes.
	// msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
	//	i.e. left-shift by LEN bytes.
	_cond_vex movdqu,	"OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
	_cond_vex pshufb,	%xmm3, %xmm0, %xmm1

	// C1 = floor(A / x^(128 - 8*LEN))
	// lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
	//	i.e. left-shift by 16-LEN bytes.
	// msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
	//	i.e. right-shift by 16-LEN bytes.
	_cond_vex pshufb,	"OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
				%xmm0, %xmm0, unaligned_mem_tmp=%xmm4

	// C2 = tmp + B.  This is just a blend of tmp with the last 16 data
	// bytes (reflected if msb-first).  The blend mask is the shuffle table
	// that was used to create tmp.  0 selects tmp, and 1 selects the last
	// 16 data bytes.
.if AVX_LEVEL == 0
	movdqa		%xmm0, %xmm4
	movdqa		%xmm3, %xmm0
	pblendvb	%xmm2, %xmm1	// uses %xmm0 as implicit operand
	movdqa		%xmm4, %xmm0
.elseif LSB_CRC
	vpblendvb	%xmm3, -16(BUF,LEN), %xmm1, %xmm1
.else
	vpblendvb	%xmm3, %xmm2, %xmm1, %xmm1
.endif

	// Fold C1 into C2 and store the 128-bit result in %xmm0.
	_fold_vec	%xmm0, %xmm1, CONSTS_XMM, %xmm4

.Lreduce_128bits_to_crc\@:
	// Compute the CRC as %xmm0 * x^n mod G.  Here %xmm0 means the 128-bit
	// polynomial stored in %xmm0 (using either lsb-first or msb-first bit
	// order according to LSB_CRC), and G is the CRC's generator polynomial.

	// First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
	//
	//	t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
	//	      x^n * (%xmm0 mod x^64)
	//
	// Store t0 * x^(64-n) in %xmm0.  I.e., actually do:
	//
	//	%xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
	//		 x^64 * (%xmm0 mod x^64)
	//
	// The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
	// to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
	// select it.  The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
	// msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
	// (considering the extra factor of x that gets implicitly introduced by
	// each pclmulqdq when using lsb-first order), is identical to the
	// constant that was used earlier for folding the LO64_TERMS across 128
	// bits.  Thus it's already available in LO64_TERMS of CONSTS_XMM.
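	//
	// For example (illustrative only), for n == 32 this computes
	// t0 := (x^96 mod G) * floor(%xmm0 / x^64) + x^32 * (%xmm0 mod x^64),
	// which fits in 96 bits, with %xmm0 ending up holding t0 * x^32.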
	_pclmulqdq		CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
.if LSB_CRC
	_cond_vex psrldq,	$8, %xmm0, %xmm0  // x^64 * (%xmm0 mod x^64)
.else
	_cond_vex pslldq,	$8, %xmm0, %xmm0  // x^64 * (%xmm0 mod x^64)
.endif
	_cond_vex pxor,		%xmm1, %xmm0, %xmm0
	// The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
	// The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).

	// First step of Barrett reduction: Compute floor(t0 / G).  This is the
	// polynomial by which G needs to be multiplied to cancel out the x^n
	// and higher terms of t0, i.e. to reduce t0 mod G.  First do:
	//
	//	t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
	//
	// Then the desired value floor(t0 / G) is floor(t1 / x^64).  The 63 in
	// x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest
	// value that makes enough precision be carried through the calculation.
	//
	// The '* x' makes it so the result is floor(t1 / x^64) rather than
	// floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
	// can be extracted much more easily in the next step.  In the lsb-first
	// case the '* x' happens implicitly.  In the msb-first case it must be
	// done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
	// constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
	// the multiplication by the x^64 term is handled using a pxor.  The
	// pxor causes the low 64 terms of t1 to be wrong, but they are unused.
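	//
	// For example (illustrative only), for an msb-first CRC-32 the constant
	// is (floor(x^95 / G) * x) - x^64, which fits in 64 bits since
	// floor(x^95 / G) has degree 95 - 32 = 63.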
	_cond_vex movdqa,	OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
	_pclmulqdq		CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
.if !LSB_CRC
	_cond_vex pxor,		%xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
.endif
	// The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).

	// Second step of Barrett reduction: Cancel out the x^n and higher terms
	// of t0 by subtracting the needed multiple of G.  This gives the CRC:
	//
	//	crc := t0 - (G * floor(t0 / G))
	//
	// But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
	//
	//	crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
	//
	// Furthermore, since the resulting CRC is n-bit, if mod x^n is
	// explicitly applied to it then the x^n term of G makes no difference
	// in the result and can be omitted.  This helps keep the constant
	// multiplier in 64 bits in most cases.  This gives the following:
	//
	//	%xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
	//	crc := (%xmm0 / x^(64-n)) mod x^n
	//
	// In the lsb-first case, each pclmulqdq implicitly introduces
	// an extra factor of x, so in that case the constant that needs to be
	// passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
	// For lsb-first CRCs where n=64, the extra factor of x cannot be as
	// easily avoided.  In that case, instead pass '(G - x^n - x^0) / x' to
	// pclmulqdq and handle the x^0 term (i.e. 1) separately.  (All CRC
	// polynomials have nonzero x^n and x^0 terms.)  It works out as: the
	// CRC has to be XORed with the physically low qword of %xmm1, representing
	// floor(t0 / G).  The most efficient way to do that is to move it to
	// the physically high qword and use a ternlog to combine the two XORs.
.if LSB_CRC && \n == 64
	_cond_vex punpcklqdq,	%xmm1, %xmm2, %xmm2
	_pclmulqdq		CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
    .if AVX_LEVEL <= 2
	_cond_vex pxor,		%xmm2, %xmm0, %xmm0
	_cond_vex pxor,		%xmm1, %xmm0, %xmm0
    .else
	vpternlogq		$0x96, %xmm2, %xmm1, %xmm0
    .endif
	_cond_vex "pextrq $1,",	%xmm0, %rax  // (%xmm0 / x^0) mod x^64
.else
	_pclmulqdq		CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
	_cond_vex pxor,		%xmm1, %xmm0, %xmm0
  .if \n == 8
	_cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8
  .elseif \n == 16
	_cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16
  .elseif \n == 32
	_cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32
  .else // \n == 64 && !LSB_CRC
	_cond_vex movq,		%xmm0, %rax  // (%xmm0 / x^0) mod x^64
  .endif
.endif

.if VL > 16
	vzeroupper	// Needed when ymm or zmm registers may have been used.
.endif
#ifdef __i386__
	pop		CONSTS_PTR
#endif
	RET
.endm

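// DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) emits the function bodies for one
// CRC variant.  For example (illustrative only), a hypothetical
// DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1) would define
// crc32_lsb_pclmul_sse plus, when the assembler supports VPCLMULQDQ,
// crc32_lsb_vpclmul_avx2 and crc32_lsb_vpclmul_avx512.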
#ifdef CONFIG_AS_VPCLMULQDQ
#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb)			\
SYM_FUNC_START(prefix##_pclmul_sse);					\
	_crc_pclmul	n=bits, lsb_crc=lsb, vl=16, avx_level=0;	\
SYM_FUNC_END(prefix##_pclmul_sse);					\
									\
SYM_FUNC_START(prefix##_vpclmul_avx2);					\
	_crc_pclmul	n=bits, lsb_crc=lsb, vl=32, avx_level=2;	\
SYM_FUNC_END(prefix##_vpclmul_avx2);					\
									\
SYM_FUNC_START(prefix##_vpclmul_avx512);				\
	_crc_pclmul	n=bits, lsb_crc=lsb, vl=64, avx_level=512;	\
SYM_FUNC_END(prefix##_vpclmul_avx512);
#else
#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb)			\
SYM_FUNC_START(prefix##_pclmul_sse);					\
	_crc_pclmul	n=bits, lsb_crc=lsb, vl=16, avx_level=0;	\
SYM_FUNC_END(prefix##_pclmul_sse);
#endif // !CONFIG_AS_VPCLMULQDQ