/* SPDX-License-Identifier: GPL-2.0-or-later */
//
// Template to generate [V]PCLMULQDQ-based CRC functions for x86
//
// Copyright 2025 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>

#include <linux/linkage.h>
#include <linux/objtool.h>

// Offsets within the generated constants table
.set OFFSETOF_BSWAP_MASK,                    -5*16  // msb-first CRCs only
.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS,  -4*16  // must precede next
.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS,  -3*16  // must precede next
.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS,   -2*16  // must precede next
.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS,   -1*16  // must precede next
.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS,   0*16   // must be 0
.set OFFSETOF_SHUF_TABLE,                    1*16
.set OFFSETOF_BARRETT_REDUCTION_CONSTS,      4*16

// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
// corresponding non-VEX instruction plus any needed moves. The supported
// instruction formats are:
//
//  - Two-arg [src, dst], where the non-VEX format is the same.
//  - Three-arg [src1, src2, dst] where the non-VEX format is
//    [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too.
//
// \insn gives the instruction without a "v" prefix and including any immediate
// argument if needed to make the instruction follow one of the above formats.
// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
// it first; this is needed when \arg1 is an unaligned mem operand.
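//
// For example, "_cond_vex pxor, %xmm1, %xmm2, %xmm2" emits
// "vpxor %xmm1, %xmm2, %xmm2" when AVX is allowed and "pxor %xmm1, %xmm2"
// otherwise, while "_cond_vex pxor, %xmm1, %xmm2, %xmm3" emits
// "vpxor %xmm1, %xmm2, %xmm3" or the non-VEX sequence
// "movdqa %xmm2, %xmm3; pxor %xmm1, %xmm3".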
.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
.if AVX_LEVEL == 0
        // VEX not allowed. Emulate it.
        .ifnb \arg3 // Three-arg [src1, src2, dst]
                .ifc "\arg2", "\arg3" // src2 == dst?
                        .ifnb \unaligned_mem_tmp
                                movdqu \arg1, \unaligned_mem_tmp
                                \insn \unaligned_mem_tmp, \arg3
                        .else
                                \insn \arg1, \arg3
                        .endif
                .else // src2 != dst
                        .ifc "\arg1", "\arg3"
                                .error "Can't have src1 == dst when src2 != dst"
                        .endif
                        .ifnb \unaligned_mem_tmp
                                movdqu \arg1, \unaligned_mem_tmp
                                movdqa \arg2, \arg3
                                \insn \unaligned_mem_tmp, \arg3
                        .else
                                movdqa \arg2, \arg3
                                \insn \arg1, \arg3
                        .endif
                .endif
        .else // Two-arg [src, dst]
                .ifnb \unaligned_mem_tmp
                        movdqu \arg1, \unaligned_mem_tmp
                        \insn \unaligned_mem_tmp, \arg2
                .else
                        \insn \arg1, \arg2
                .endif
        .endif
.else
        // VEX is allowed. Emit the desired instruction directly.
        .ifnb \arg3
                v\insn \arg1, \arg2, \arg3
        .else
                v\insn \arg1, \arg2
        .endif
.endif
.endm

// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
// register of length VL.
.macro _vbroadcast src, dst
.if VL == 16
        _cond_vex movdqa, \src, \dst
.elseif VL == 32
        vbroadcasti128 \src, \dst
.else
        vbroadcasti32x4 \src, \dst
.endif
.endm

// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
.macro _load_data vl, src, bswap_mask, dst
.if \vl < 64
        _cond_vex movdqu, "\src", \dst
.else
        vmovdqu8 \src, \dst
.endif
.if !LSB_CRC
        _cond_vex pshufb, \bswap_mask, \dst, \dst
.endif
.endm

.macro _prepare_v0 vl, v0, v1, bswap_mask
.if LSB_CRC
        .if \vl < 64
                _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1
        .else
                vpxorq (BUF), \v0, \v0
        .endif
.else
        _load_data \vl, (BUF), \bswap_mask, \v1
        .if \vl < 64
                _cond_vex pxor, \v1, \v0, \v0
        .else
                vpxorq \v1, \v0, \v0
        .endif
.endif
.endm

// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
// msb-first order or the physically high qword for lsb-first order
#define LO64_TERMS 0

// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
// qword for msb-first order or the physically low qword for lsb-first order
#define HI64_TERMS 1

// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
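//
// For example, with an msb-first CRC (LSB_CRC == 0), selecting HI64_TERMS of
// \src1 and LO64_TERMS of \src2 gives the immediate (1 << 4) ^ 0 = 0x10. With
// an lsb-first CRC the physical qwords are swapped, so the same selection
// gives ((1 ^ 1) << 4) ^ (0 ^ 1) = 0x01.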
.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst
        _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
                  \src1, \src2, \dst
.endm

// Fold \acc into \data and store the result back into \acc. \data can be an
// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
// byte-reflection is needed; otherwise it must be a vector register. \consts
// is a vector register containing the needed fold constants, and \tmp is a
// temporary vector register. All arguments must be the same length.
.macro _fold_vec acc, data, consts, tmp
        _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
        _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc
.if AVX_LEVEL <= 2
        _cond_vex pxor, \data, \tmp, \tmp
        _cond_vex pxor, \tmp, \acc, \acc
.else
        vpternlogq $0x96, \data, \tmp, \acc
.endif
.endm

// Fold \acc into \data and store the result back into \acc. \data is an
// unaligned mem operand, \consts is a vector register containing the needed
// fold constants, \bswap_mask is a vector register containing the
// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
// temporary vector registers. All arguments must have length \vl.
.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2
.if AVX_LEVEL == 0 || !LSB_CRC
        _load_data \vl, \data, \bswap_mask, \tmp1
        _fold_vec \acc, \tmp1, \consts, \tmp2
.else
        _fold_vec \acc, \data, \consts, \tmp1
.endif
.endm

// Load the constants for folding across 2**i vectors of length VL at a time
// into all 128-bit lanes of the vector register CONSTS.
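//
// For example, with VL == 64 (LOG2_VL == 6) and \i == 2, folding is done
// across 4 vectors of 64 bytes = 2048 bits, and the offset below evaluates to
// OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS + (4 - 6 - 2)*16 = -4*16, i.e.
// OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS.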
.macro _load_vec_folding_consts i
        _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
                    CONSTS
.endm

// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
// the result back into \v0. If the remaining length mod \vl is nonzero, also
// fold \vl data bytes from BUF. For both operations the fold distance is \vl.
// \consts must be a register of length \vl containing the fold constants.
.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2
        _fold_vec \v0, \v1, \consts, \tmp1
        test $\vl, LEN8
        jz .Lfold_vec_final_done\@
        _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
        add $\vl, BUF
.Lfold_vec_final_done\@:
.endm

// This macro generates the body of a CRC function with the following prototype:
//
//      crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
//
// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
// |buf| is the data to checksum. |len| is the data length in bytes, which must
// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts
// field of the constants struct that was generated for the chosen CRC variant.
//
// Moving on to the macro parameters, \n is the number of bits in the CRC, e.g.
// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
// the file is compiled in i386 mode, then the maximum supported value is 32.
//
// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0
// if the CRC processes the most significant bit of each byte first, i.e. maps
// bit0 to x^0, bit1 to x^1, ..., bit7 to x^7.
//
// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
//
// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
// 512 for AVX512.
//
// If \vl == 16 && \avx_level == 0, the generated code requires:
// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
//
// If \vl == 32 && \avx_level == 2, the generated code requires:
// VPCLMULQDQ && AVX2.
//
// If \vl == 64 && \avx_level == 512, the generated code requires:
// VPCLMULQDQ && AVX512BW && AVX512VL.
//
// Other \vl and \avx_level combinations are either not supported or not useful.
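//
// For example, "_crc_pclmul n=32, lsb_crc=1, vl=64, avx_level=512" expands to
// the body of an lsb-first CRC-32 implementation that uses 512-bit vectors and
// requires VPCLMULQDQ && AVX512BW && AVX512VL.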
.macro _crc_pclmul n, lsb_crc, vl, avx_level
        .set LSB_CRC, \lsb_crc
        .set VL, \vl
        .set AVX_LEVEL, \avx_level

        // Define aliases for the xmm, ymm, or zmm registers according to VL.
.irp i, 0,1,2,3,4,5,6,7
        .if VL == 16
                .set V\i, %xmm\i
                .set LOG2_VL, 4
        .elseif VL == 32
                .set V\i, %ymm\i
                .set LOG2_VL, 5
        .elseif VL == 64
                .set V\i, %zmm\i
                .set LOG2_VL, 6
        .else
                .error "Unsupported vector length"
        .endif
.endr
        // Define aliases for the function parameters.
        // Note: when crc_t is shorter than u32, zero-extension to 32 bits is
        // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
        // when crc_t is shorter than u64.
#ifdef __x86_64__
.if \n <= 32
        .set CRC, %edi
.else
        .set CRC, %rdi
.endif
        .set BUF, %rsi
        .set LEN, %rdx
        .set LEN32, %edx
        .set LEN8, %dl
        .set CONSTS_PTR, %rcx
#else
        // 32-bit support, assuming -mregparm=3 and not including support for
        // CRC-64 (which would use both eax and edx to pass the crc parameter).
        .set CRC, %eax
        .set BUF, %edx
        .set LEN, %ecx
        .set LEN32, %ecx
        .set LEN8, %cl
        .set CONSTS_PTR, %ebx // Passed on stack
#endif

        // Define aliases for some local variables. V0-V5 are used without
        // aliases (for accumulators, data, temporary values, etc.). Staying
        // within the first 8 vector registers keeps the code 32-bit SSE
        // compatible and reduces the size of 64-bit SSE code slightly.
        .set BSWAP_MASK, V6
        .set BSWAP_MASK_YMM, %ymm6
        .set BSWAP_MASK_XMM, %xmm6
        .set CONSTS, V7
        .set CONSTS_YMM, %ymm7
        .set CONSTS_XMM, %xmm7

        // Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
        // functions generated by this macro are called only by static_call.
        ANNOTATE_NOENDBR

#ifdef __i386__
        push CONSTS_PTR
        mov 8(%esp), CONSTS_PTR
#endif

        // Create a 128-bit vector that contains the initial CRC in the end
        // representing the high-order polynomial coefficients, and the rest 0.
        // If the CRC is msb-first, also load the byte-reflection table.
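        // For example, for a 32-bit msb-first CRC, the movd below leaves the
        // CRC in bytes 0-3 of %xmm0 and the pslldq then moves it to bytes
        // 12-15, which hold the x^96..x^127 terms. For an lsb-first CRC the
        // high-order terms are at the physically low end, so the movd/movq
        // placement is already correct and no shift is needed.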
.if \n <= 32
        _cond_vex movd, CRC, %xmm0
.else
        _cond_vex movq, CRC, %xmm0
.endif
.if !LSB_CRC
        _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
        _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
.endif

        // Load the first vector of data and XOR the initial CRC into the
        // appropriate end of the first 128-bit lane of data. If LEN < VL, then
        // use a short vector and jump ahead to the final reduction. (LEN >= 16
        // is guaranteed here but not necessarily LEN >= VL.)
.if VL >= 32
        cmp $VL, LEN
        jae .Lat_least_1vec\@
        .if VL == 64
                cmp $32, LEN32
                jb .Lless_than_32bytes\@
                _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM
                add $32, BUF
                jmp .Lreduce_256bits_to_128bits\@
.Lless_than_32bytes\@:
        .endif
        _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM
        add $16, BUF
        vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
        jmp .Lcheck_for_partial_block\@
.Lat_least_1vec\@:
.endif
        _prepare_v0 VL, V0, V1, BSWAP_MASK

        // Handle VL <= LEN < 4*VL.
        cmp $4*VL-1, LEN
        ja .Lat_least_4vecs\@
        add $VL, BUF
        // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
        // If VL==16 then load fold_across_128_bits_consts first, as the final
        // reduction depends on it and it won't be loaded anywhere else.
        cmp $2*VL-1, LEN32
.if VL == 16
        _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
.endif
        jbe .Lreduce_1vec_to_128bits\@
        // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to
        // the reduction from 2 vectors.
        _load_data VL, (BUF), BSWAP_MASK, V1
        add $VL, BUF
        jmp .Lreduce_2vecs_to_1\@

.Lat_least_4vecs\@:
        // Load 3 more vectors of data.
        _load_data VL, 1*VL(BUF), BSWAP_MASK, V1
        _load_data VL, 2*VL(BUF), BSWAP_MASK, V2
        _load_data VL, 3*VL(BUF), BSWAP_MASK, V3
        sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32
        add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32
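        // (With VL=32, the immediate 4*VL = 128 doesn't fit in a sign-extended
        // 8-bit immediate, but -128 and 4*VL-1 = 127 do, so sub/add of -4*VL
        // and comparisons against 4*VL-1 get shorter encodings.)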

        // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
        // 4 vectors of data and write the result back to V0-V3.
        cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32
        jbe .Lreduce_4vecs_to_2\@
        _load_vec_folding_consts 2
.Lfold_4vecs_loop\@:
        _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        sub $-4*VL, BUF
        add $-4*VL, LEN
        cmp $4*VL-1, LEN
        ja .Lfold_4vecs_loop\@

        // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold
        // two more vectors of data from BUF, if at least that much remains.
.Lreduce_4vecs_to_2\@:
        _load_vec_folding_consts 1
        _fold_vec V0, V2, CONSTS, V4
        _fold_vec V1, V3, CONSTS, V4
        test $2*VL, LEN8
        jz .Lreduce_2vecs_to_1\@
        _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        sub $-2*VL, BUF

        // Fold V0 into V1 and write the result back to V0. Then fold one more
        // vector of data from BUF, if at least that much remains.
.Lreduce_2vecs_to_1\@:
        _load_vec_folding_consts 0
        _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5

.Lreduce_1vec_to_128bits\@:
.if VL == 64
        // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of
        // data from BUF, if at least that much remains.
        vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
        vextracti64x4 $1, %zmm0, %ymm1
        _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
.Lreduce_256bits_to_128bits\@:
.endif
.if VL >= 32
        // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of
        // data from BUF, if at least that much remains.
        vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
        vextracti128 $1, %ymm0, %xmm1
        _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
.Lcheck_for_partial_block\@:
.endif
        and $15, LEN32
        jz .Lreduce_128bits_to_crc\@

        // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now
        // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
        // and B is the polynomial of the remaining LEN data bytes. To reduce
        // this to 128 bits without needing fold constants for each possible
        // LEN, rearrange this expression into C1*(x^128) + C2, where
        // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
        // Then fold C1 into C2, which is just another fold across 128 bits.
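        // For example, with an lsb-first CRC and LEN == 4:
        // A*x^32 mod x^128 is A right-shifted by 4 bytes,
        // C1 = floor(A / x^96) is A left-shifted by 12 bytes, and C2 is the
        // right-shifted value with its top 4 bytes replaced by the 4
        // not-yet-processed data bytes.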

.if !LSB_CRC || AVX_LEVEL == 0
        // Load the last 16 data bytes. Note that originally LEN was >= 16.
        _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
.endif // Else will use vpblendvb mem operand later.
.if !LSB_CRC
        neg LEN // Needed for indexing shuf_table
.endif

        // tmp = A*x^(8*LEN) mod x^128
        //      lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
        //           i.e. right-shift by LEN bytes.
        //      msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
        //           i.e. left-shift by LEN bytes.
        _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
        _cond_vex pshufb, %xmm3, %xmm0, %xmm1

        // C1 = floor(A / x^(128 - 8*LEN))
        //      lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
        //           i.e. left-shift by 16-LEN bytes.
        //      msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
        //           i.e. right-shift by 16-LEN bytes.
        _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
                  %xmm0, %xmm0, unaligned_mem_tmp=%xmm4

        // C2 = tmp + B. This is just a blend of tmp with the last 16 data
        // bytes (reflected if msb-first). The blend mask is the shuffle table
        // that was used to create tmp. 0 selects tmp, and 1 selects the last
        // 16 data bytes.
.if AVX_LEVEL == 0
        movdqa %xmm0, %xmm4
        movdqa %xmm3, %xmm0
        pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand
        movdqa %xmm4, %xmm0
.elseif LSB_CRC
        vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1
.else
        vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
.endif

        // Fold C1 into C2 and store the 128-bit result in %xmm0.
        _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4

.Lreduce_128bits_to_crc\@:
        // Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
        // polynomial stored in %xmm0 (using either lsb-first or msb-first bit
        // order according to LSB_CRC), and G is the CRC's generator polynomial.
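        // This is done in two steps: first %xmm0 * x^n is reduced to 64+n
        // bits, then a Barrett reduction reduces that to the n-bit CRC.
        // (Integer analogy of Barrett reduction: to compute 1234 mod 97
        // without a division by 97, precompute m = floor(2^14 / 97) = 168;
        // then floor((1234 * 168) / 2^14) = 12 = floor(1234 / 97), and
        // 1234 - 12*97 = 70 is the remainder. The code below does the same
        // with polynomials, with G in place of 97 and powers of x in place of
        // powers of 2.)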

        // First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
        //
        //      t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
        //            x^n * (%xmm0 mod x^64)
        //
        // Store t0 * x^(64-n) in %xmm0. I.e., actually do:
        //
        //      %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
        //               x^64 * (%xmm0 mod x^64)
        //
        // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
        // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
        // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
        // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
        // (considering the extra factor of x that gets implicitly introduced by
        // each pclmulqdq when using lsb-first order), is identical to the
        // constant that was used earlier for folding the LO64_TERMS across 128
        // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM.
        _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
.if LSB_CRC
        _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
.else
        _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
.endif
        _cond_vex pxor, %xmm1, %xmm0, %xmm0
        // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
        // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).

        // First step of Barrett reduction: Compute floor(t0 / G). This is the
        // polynomial by which G needs to be multiplied to cancel out the x^n
        // and higher terms of t0, i.e. to reduce t0 mod G. First do:
        //
        //      t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
        //
        // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
        // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the
        // lowest value that lets enough precision be carried through the
        // calculation.
        //
        // The '* x' makes it so the result is floor(t1 / x^64) rather than
        // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
        // can be extracted much more easily in the next step. In the lsb-first
        // case the '* x' happens implicitly. In the msb-first case it must be
        // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
        // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
        // the multiplication by the x^64 term is handled using a pxor. The
        // pxor causes the low 64 terms of t1 to be wrong, but they are unused.
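        // For example, for a CRC-32 (n == 32): t0 has degree <= 95, so
        // floor(t0 / x^32) has degree <= 63 and floor(x^95 / G) * x has degree
        // 64; t1 therefore has degree <= 127 and floor(t1 / x^64) =
        // floor(t0 / G) fits in one qword.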
        _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
        _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
.if !LSB_CRC
        _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
.endif
        // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).

        // Second step of Barrett reduction: Cancel out the x^n and higher terms
        // of t0 by subtracting the needed multiple of G. This gives the CRC:
        //
        //      crc := t0 - (G * floor(t0 / G))
        //
        // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
        //
        //      crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
        //
        // Furthermore, since the resulting CRC is n-bit, if mod x^n is
        // explicitly applied to it then the x^n term of G makes no difference
        // in the result and can be omitted. This helps keep the constant
        // multiplier in 64 bits in most cases. This gives the following:
        //
        //      %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
        //      crc := (%xmm0 / x^(64-n)) mod x^n
        //
        // In the lsb-first case, each pclmulqdq implicitly introduces
        // an extra factor of x, so in that case the constant that needs to be
        // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
        // For lsb-first CRCs where n=64, the extra factor of x cannot be as
        // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to
        // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC
        // polynomials have nonzero x^n and x^0 terms.) It works out as: the
        // CRC has to be XORed with the physically low qword of %xmm1,
        // representing floor(t0 / G). The most efficient way to do that is to
        // move it to the physically high qword and use a ternlog to combine
        // the two XORs.
.if LSB_CRC && \n == 64
        _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2
        _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
        .if AVX_LEVEL <= 2
                _cond_vex pxor, %xmm2, %xmm0, %xmm0
                _cond_vex pxor, %xmm1, %xmm0, %xmm0
        .else
                vpternlogq $0x96, %xmm2, %xmm1, %xmm0
        .endif
        _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64
.else
        _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
        _cond_vex pxor, %xmm1, %xmm0, %xmm0
        .if \n == 8
                _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8
        .elseif \n == 16
                _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16
        .elseif \n == 32
                _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32
        .else // \n == 64 && !LSB_CRC
                _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64
        .endif
.endif

.if VL > 16
        vzeroupper // Needed when ymm or zmm registers may have been used.
.endif
#ifdef __i386__
        pop CONSTS_PTR
#endif
        RET
.endm

#ifdef CONFIG_AS_VPCLMULQDQ
#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
SYM_FUNC_START(prefix##_pclmul_sse); \
        _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
SYM_FUNC_END(prefix##_pclmul_sse); \
\
SYM_FUNC_START(prefix##_vpclmul_avx2); \
        _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \
SYM_FUNC_END(prefix##_vpclmul_avx2); \
\
SYM_FUNC_START(prefix##_vpclmul_avx512); \
        _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \
SYM_FUNC_END(prefix##_vpclmul_avx512);
#else
#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
SYM_FUNC_START(prefix##_pclmul_sse); \
        _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
SYM_FUNC_END(prefix##_pclmul_sse);
#endif // !CONFIG_AS_VPCLMULQDQ
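
// For example, a CRC-variant-specific file that includes this template might
// contain (the "crc32_lsb" prefix here is just illustrative):
//
//      DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, 32, 1)
//
// which generates crc32_lsb_pclmul_sse(), plus crc32_lsb_vpclmul_avx2() and
// crc32_lsb_vpclmul_avx512() when CONFIG_AS_VPCLMULQDQ is enabled.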