/* SPDX-License-Identifier: GPL-2.0-or-later */
//
// Template to generate [V]PCLMULQDQ-based CRC functions for x86
//
// Copyright 2025 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>

#include <linux/linkage.h>
#include <linux/objtool.h>

// Offsets within the generated constants table
.set OFFSETOF_BSWAP_MASK,                    -5*16  // msb-first CRCs only
.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS,  -4*16  // must precede next
.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS,  -3*16  // must precede next
.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS,   -2*16  // must precede next
.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS,   -1*16  // must precede next
.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS,   0*16   // must be 0
.set OFFSETOF_SHUF_TABLE,                    1*16
.set OFFSETOF_BARRETT_REDUCTION_CONSTS,      4*16

// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
// corresponding non-VEX instruction plus any needed moves. The supported
// instruction formats are:
//
//  - Two-arg [src, dst], where the non-VEX format is the same.
//  - Three-arg [src1, src2, dst] where the non-VEX format is
//    [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too.
//
// \insn gives the instruction without a "v" prefix and including any immediate
// argument if needed to make the instruction follow one of the above formats.
// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
// it first; this is needed when \arg1 is an unaligned mem operand.
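//
// For example, "_cond_vex pxor, %xmm1, %xmm2, %xmm2" emits
// "vpxor %xmm1, %xmm2, %xmm2" when AVX is allowed and "pxor %xmm1, %xmm2"
// otherwise, while "_cond_vex pxor, %xmm1, %xmm2, %xmm3" emits
// "vpxor %xmm1, %xmm2, %xmm3" or the non-VEX sequence
// "movdqa %xmm2, %xmm3; pxor %xmm1, %xmm3".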
.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
.if AVX_LEVEL == 0
        // VEX not allowed. Emulate it.
        .ifnb \arg3 // Three-arg [src1, src2, dst]
                .ifc "\arg2", "\arg3" // src2 == dst?
                        .ifnb \unaligned_mem_tmp
                                movdqu \arg1, \unaligned_mem_tmp
                                \insn \unaligned_mem_tmp, \arg3
                        .else
                                \insn \arg1, \arg3
                        .endif
                .else // src2 != dst
                        .ifc "\arg1", "\arg3"
                                .error "Can't have src1 == dst when src2 != dst"
                        .endif
                        .ifnb \unaligned_mem_tmp
                                movdqu \arg1, \unaligned_mem_tmp
                                movdqa \arg2, \arg3
                                \insn \unaligned_mem_tmp, \arg3
                        .else
                                movdqa \arg2, \arg3
                                \insn \arg1, \arg3
                        .endif
                .endif
        .else // Two-arg [src, dst]
                .ifnb \unaligned_mem_tmp
                        movdqu \arg1, \unaligned_mem_tmp
                        \insn \unaligned_mem_tmp, \arg2
                .else
                        \insn \arg1, \arg2
                .endif
        .endif
.else
        // VEX is allowed. Emit the desired instruction directly.
        .ifnb \arg3
                v\insn \arg1, \arg2, \arg3
        .else
                v\insn \arg1, \arg2
        .endif
.endif
.endm

// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
// register of length VL.
.macro _vbroadcast src, dst
.if VL == 16
        _cond_vex movdqa, \src, \dst
.elseif VL == 32
        vbroadcasti128 \src, \dst
.else
        vbroadcasti32x4 \src, \dst
.endif
.endm

// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
.macro _load_data vl, src, bswap_mask, dst
.if \vl < 64
        _cond_vex movdqu, "\src", \dst
.else
        vmovdqu8 \src, \dst
.endif
.if !LSB_CRC
        _cond_vex pshufb, \bswap_mask, \dst, \dst
.endif
.endm

.macro _prepare_v0 vl, v0, v1, bswap_mask
.if LSB_CRC
        .if \vl < 64
                _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1
        .else
                vpxorq (BUF), \v0, \v0
        .endif
.else
        _load_data \vl, (BUF), \bswap_mask, \v1
        .if \vl < 64
                _cond_vex pxor, \v1, \v0, \v0
        .else
                vpxorq \v1, \v0, \v0
        .endif
.endif
.endm

// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
// msb-first order or the physically high qword for lsb-first order
#define LO64_TERMS 0

// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
// qword for msb-first order or the physically low qword for lsb-first order
#define HI64_TERMS 1

// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
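//
// For example, with an msb-first CRC (LSB_CRC == 0), selecting HI64_TERMS of
// \src1 and LO64_TERMS of \src2 gives the immediate (1 << 4) ^ 0 = 0x10. With
// an lsb-first CRC the physical qwords are swapped, so the same selection
// gives ((1 ^ 1) << 4) ^ (0 ^ 1) = 0x01.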
.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst
        _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
                  \src1, \src2, \dst
.endm

// Fold \acc into \data and store the result back into \acc. \data can be an
// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
// byte-reflection is needed; otherwise it must be a vector register. \consts
// is a vector register containing the needed fold constants, and \tmp is a
// temporary vector register. All arguments must be the same length.
.macro _fold_vec acc, data, consts, tmp
        _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
        _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc
.if AVX_LEVEL <= 2
        _cond_vex pxor, \data, \tmp, \tmp
        _cond_vex pxor, \tmp, \acc, \acc
.else
        vpternlogq $0x96, \data, \tmp, \acc
.endif
.endm

// Fold \acc into \data and store the result back into \acc. \data is an
// unaligned mem operand, \consts is a vector register containing the needed
// fold constants, \bswap_mask is a vector register containing the
// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
// temporary vector registers. All arguments must have length \vl.
.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2
.if AVX_LEVEL == 0 || !LSB_CRC
        _load_data \vl, \data, \bswap_mask, \tmp1
        _fold_vec \acc, \tmp1, \consts, \tmp2
.else
        _fold_vec \acc, \data, \consts, \tmp1
.endif
.endm

// Load the constants for folding across 2**i vectors of length VL at a time
// into all 128-bit lanes of the vector register CONSTS.
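//
// For example, with VL == 64 (LOG2_VL == 6) and \i == 2, folding is done
// across 4 vectors of 64 bytes = 2048 bits, and the offset below evaluates to
// OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS + (4 - 6 - 2)*16 = -4*16, i.e.
// OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS.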
.macro _load_vec_folding_consts i
        _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
                    CONSTS
.endm

// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
// the result back into \v0. If the remaining length mod \vl is nonzero, also
// fold \vl data bytes from BUF. For both operations the fold distance is \vl.
// \consts must be a register of length \vl containing the fold constants.
.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2
        _fold_vec \v0, \v1, \consts, \tmp1
        test $\vl, LEN8
        jz .Lfold_vec_final_done\@
        _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
        add $\vl, BUF
.Lfold_vec_final_done\@:
.endm

// This macro generates the body of a CRC function with the following prototype:
//
//      crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
//
// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
// |buf| is the data to checksum. |len| is the data length in bytes, which must
// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts
// field of the constants struct that was generated for the chosen CRC variant.
//
// Moving on to the macro parameters, \n is the number of bits in the CRC, e.g.
// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
// the file is compiled in i386 mode, then the maximum supported value is 32.
//
// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0
// if the CRC processes the most significant bit of each byte first, i.e. maps
// bit0 to x^0, bit1 to x^1, ..., bit7 to x^7.
//
// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
//
// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
// 512 for AVX512.
//
// If \vl == 16 && \avx_level == 0, the generated code requires:
// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
//
// If \vl == 32 && \avx_level == 2, the generated code requires:
// VPCLMULQDQ && AVX2.
//
// If \vl == 64 && \avx_level == 512, the generated code requires:
// VPCLMULQDQ && AVX512BW && AVX512VL.
//
// Other \vl and \avx_level combinations are either not supported or not useful.
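//
// For example, "_crc_pclmul n=32, lsb_crc=1, vl=64, avx_level=512" expands to
// the body of an lsb-first CRC-32 implementation that uses 512-bit vectors and
// requires VPCLMULQDQ && AVX512BW && AVX512VL.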
.macro _crc_pclmul n, lsb_crc, vl, avx_level
        .set LSB_CRC, \lsb_crc
        .set VL, \vl
        .set AVX_LEVEL, \avx_level

        // Define aliases for the xmm, ymm, or zmm registers according to VL.
.irp i, 0,1,2,3,4,5,6,7
        .if VL == 16
                .set V\i, %xmm\i
                .set LOG2_VL, 4
        .elseif VL == 32
                .set V\i, %ymm\i
                .set LOG2_VL, 5
        .elseif VL == 64
                .set V\i, %zmm\i
                .set LOG2_VL, 6
        .else
                .error "Unsupported vector length"
        .endif
.endr
        // Define aliases for the function parameters.
        // Note: when crc_t is shorter than u32, zero-extension to 32 bits is
        // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
        // when crc_t is shorter than u64.
#ifdef __x86_64__
.if \n <= 32
        .set CRC, %edi
.else
        .set CRC, %rdi
.endif
        .set BUF, %rsi
        .set LEN, %rdx
        .set LEN32, %edx
        .set LEN8, %dl
        .set CONSTS_PTR, %rcx
#else
        // 32-bit support, assuming -mregparm=3 and not including support for
        // CRC-64 (which would use both eax and edx to pass the crc parameter).
        .set CRC, %eax
        .set BUF, %edx
        .set LEN, %ecx
        .set LEN32, %ecx
        .set LEN8, %cl
        .set CONSTS_PTR, %ebx // Passed on stack
#endif

        // Define aliases for some local variables. V0-V5 are used without
        // aliases (for accumulators, data, temporary values, etc.). Staying
        // within the first 8 vector registers keeps the code 32-bit SSE
        // compatible and reduces the size of 64-bit SSE code slightly.
        .set BSWAP_MASK, V6
        .set BSWAP_MASK_YMM, %ymm6
        .set BSWAP_MASK_XMM, %xmm6
        .set CONSTS, V7
        .set CONSTS_YMM, %ymm7
        .set CONSTS_XMM, %xmm7

        // Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
        // functions generated by this macro are called only by static_call.
        ANNOTATE_NOENDBR

#ifdef __i386__
        push CONSTS_PTR
        mov 8(%esp), CONSTS_PTR
#endif

        // Create a 128-bit vector that contains the initial CRC in the end
        // representing the high-order polynomial coefficients, and the rest 0.
        // If the CRC is msb-first, also load the byte-reflection table.
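        // For example, for a 32-bit msb-first CRC, the movd below leaves the
        // CRC in bytes 0-3 of %xmm0 and the pslldq then moves it to bytes
        // 12-15, which hold the x^96..x^127 terms. For an lsb-first CRC the
        // high-order terms are at the physically low end, so the movd/movq
        // placement is already correct and no shift is needed.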
.if \n <= 32
        _cond_vex movd, CRC, %xmm0
.else
        _cond_vex movq, CRC, %xmm0
.endif
.if !LSB_CRC
        _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
        _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
.endif

        // Load the first vector of data and XOR the initial CRC into the
        // appropriate end of the first 128-bit lane of data. If LEN < VL, then
        // use a short vector and jump ahead to the final reduction. (LEN >= 16
        // is guaranteed here but not necessarily LEN >= VL.)
.if VL >= 32
        cmp $VL, LEN
        jae .Lat_least_1vec\@
        .if VL == 64
                cmp $32, LEN32
                jb .Lless_than_32bytes\@
                _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM
                add $32, BUF
                jmp .Lreduce_256bits_to_128bits\@
.Lless_than_32bytes\@:
        .endif
        _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM
        add $16, BUF
        vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
        jmp .Lcheck_for_partial_block\@
.Lat_least_1vec\@:
.endif
        _prepare_v0 VL, V0, V1, BSWAP_MASK

        // Handle VL <= LEN < 4*VL.
        cmp $4*VL-1, LEN
        ja .Lat_least_4vecs\@
        add $VL, BUF
        // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
        // If VL==16 then load fold_across_128_bits_consts first, as the final
        // reduction depends on it and it won't be loaded anywhere else.
        cmp $2*VL-1, LEN32
.if VL == 16
        _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
.endif
        jbe .Lreduce_1vec_to_128bits\@
        // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to
        // the reduction from 2 vectors.
        _load_data VL, (BUF), BSWAP_MASK, V1
        add $VL, BUF
        jmp .Lreduce_2vecs_to_1\@

.Lat_least_4vecs\@:
        // Load 3 more vectors of data.
        _load_data VL, 1*VL(BUF), BSWAP_MASK, V1
        _load_data VL, 2*VL(BUF), BSWAP_MASK, V2
        _load_data VL, 3*VL(BUF), BSWAP_MASK, V3
        sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32
        add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32
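        // (With VL=32, the immediate 4*VL = 128 doesn't fit in a sign-extended
        // 8-bit immediate, but -128 and 4*VL-1 = 127 do, so sub/add of -4*VL
        // and comparisons against 4*VL-1 get shorter encodings.)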

        // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
        // 4 vectors of data and write the result back to V0-V3.
        cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32
        jbe .Lreduce_4vecs_to_2\@
        _load_vec_folding_consts 2
.Lfold_4vecs_loop\@:
        _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        sub $-4*VL, BUF
        add $-4*VL, LEN
        cmp $4*VL-1, LEN
        ja .Lfold_4vecs_loop\@

        // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold
        // two more vectors of data from BUF, if at least that much remains.
.Lreduce_4vecs_to_2\@:
        _load_vec_folding_consts 1
        _fold_vec V0, V2, CONSTS, V4
        _fold_vec V1, V3, CONSTS, V4
        test $2*VL, LEN8
        jz .Lreduce_2vecs_to_1\@
        _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        sub $-2*VL, BUF

        // Fold V0 into V1 and write the result back to V0. Then fold one more
        // vector of data from BUF, if at least that much remains.
.Lreduce_2vecs_to_1\@:
        _load_vec_folding_consts 0
        _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5

.Lreduce_1vec_to_128bits\@:
.if VL == 64
        // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of
        // data from BUF, if at least that much remains.
        vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
        vextracti64x4 $1, %zmm0, %ymm1
        _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
.Lreduce_256bits_to_128bits\@:
.endif
.if VL >= 32
        // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of
        // data from BUF, if at least that much remains.
        vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
        vextracti128 $1, %ymm0, %xmm1
        _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
.Lcheck_for_partial_block\@:
.endif
        and $15, LEN32
        jz .Lreduce_128bits_to_crc\@

        // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now
        // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
        // and B is the polynomial of the remaining LEN data bytes. To reduce
        // this to 128 bits without needing fold constants for each possible
        // LEN, rearrange this expression into C1*(x^128) + C2, where
        // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
        // Then fold C1 into C2, which is just another fold across 128 bits.
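        // For example, with an lsb-first CRC and LEN == 4:
        // A*x^32 mod x^128 is A right-shifted by 4 bytes,
        // C1 = floor(A / x^96) is A left-shifted by 12 bytes, and C2 is the
        // right-shifted value with its top 4 bytes replaced by the 4
        // not-yet-processed data bytes.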

.if !LSB_CRC || AVX_LEVEL == 0
        // Load the last 16 data bytes. Note that originally LEN was >= 16.
        _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
.endif // Else will use vpblendvb mem operand later.
.if !LSB_CRC
        neg LEN // Needed for indexing shuf_table
.endif

        // tmp = A*x^(8*LEN) mod x^128
        //      lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
        //           i.e. right-shift by LEN bytes.
        //      msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
        //           i.e. left-shift by LEN bytes.
        _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
        _cond_vex pshufb, %xmm3, %xmm0, %xmm1

        // C1 = floor(A / x^(128 - 8*LEN))
        //      lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
        //           i.e. left-shift by 16-LEN bytes.
        //      msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
        //           i.e. right-shift by 16-LEN bytes.
        _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
                  %xmm0, %xmm0, unaligned_mem_tmp=%xmm4

        // C2 = tmp + B. This is just a blend of tmp with the last 16 data
        // bytes (reflected if msb-first). The blend mask is the shuffle table
        // that was used to create tmp. 0 selects tmp, and 1 selects the last
        // 16 data bytes.
.if AVX_LEVEL == 0
        movdqa %xmm0, %xmm4
        movdqa %xmm3, %xmm0
        pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand
        movdqa %xmm4, %xmm0
.elseif LSB_CRC
        vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1
.else
        vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
.endif

        // Fold C1 into C2 and store the 128-bit result in %xmm0.
        _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4

.Lreduce_128bits_to_crc\@:
        // Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
        // polynomial stored in %xmm0 (using either lsb-first or msb-first bit
        // order according to LSB_CRC), and G is the CRC's generator polynomial.
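        // This is done in two steps: first %xmm0 * x^n is reduced to 64+n
        // bits, then a Barrett reduction reduces that to the n-bit CRC.
        // (Integer analogy of Barrett reduction: to compute 1234 mod 97
        // without a division by 97, precompute m = floor(2^14 / 97) = 168;
        // then floor((1234 * 168) / 2^14) = 12 = floor(1234 / 97), and
        // 1234 - 12*97 = 70 is the remainder. The code below does the same
        // with polynomials, with G in place of 97 and powers of x in place of
        // powers of 2.)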

        // First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
        //
        //      t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
        //            x^n * (%xmm0 mod x^64)
        //
        // Store t0 * x^(64-n) in %xmm0. I.e., actually do:
        //
        //      %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
        //               x^64 * (%xmm0 mod x^64)
        //
        // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
        // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
        // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
        // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
        // (considering the extra factor of x that gets implicitly introduced by
        // each pclmulqdq when using lsb-first order), is identical to the
        // constant that was used earlier for folding the LO64_TERMS across 128
        // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM.
        _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
.if LSB_CRC
        _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
.else
        _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
.endif
        _cond_vex pxor, %xmm1, %xmm0, %xmm0
        // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
        // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).

        // First step of Barrett reduction: Compute floor(t0 / G). This is the
        // polynomial by which G needs to be multiplied to cancel out the x^n
        // and higher terms of t0, i.e. to reduce t0 mod G. First do:
        //
        //      t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
        //
        // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
        // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the
        // lowest value that lets enough precision be carried through the
        // calculation.
        //
        // The '* x' makes it so the result is floor(t1 / x^64) rather than
        // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
        // can be extracted much more easily in the next step. In the lsb-first
        // case the '* x' happens implicitly. In the msb-first case it must be
        // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
        // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
        // the multiplication by the x^64 term is handled using a pxor. The
        // pxor causes the low 64 terms of t1 to be wrong, but they are unused.
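        // For example, for a CRC-32 (n == 32): t0 has degree <= 95, so
        // floor(t0 / x^32) has degree <= 63 and floor(x^95 / G) * x has degree
        // 64; t1 therefore has degree <= 127 and floor(t1 / x^64) =
        // floor(t0 / G) fits in one qword.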
        _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
        _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
.if !LSB_CRC
        _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
.endif
        // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).

        // Second step of Barrett reduction: Cancel out the x^n and higher terms
        // of t0 by subtracting the needed multiple of G. This gives the CRC:
        //
        //      crc := t0 - (G * floor(t0 / G))
        //
        // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
        //
        //      crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
        //
        // Furthermore, since the resulting CRC is n-bit, if mod x^n is
        // explicitly applied to it then the x^n term of G makes no difference
        // in the result and can be omitted. This helps keep the constant
        // multiplier in 64 bits in most cases. This gives the following:
        //
        //      %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
        //      crc := (%xmm0 / x^(64-n)) mod x^n
        //
        // In the lsb-first case, each pclmulqdq implicitly introduces
        // an extra factor of x, so in that case the constant that needs to be
        // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
        // For lsb-first CRCs where n=64, the extra factor of x cannot be as
        // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to
        // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC
        // polynomials have nonzero x^n and x^0 terms.) It works out as: the
        // CRC has to be XORed with the physically low qword of %xmm1,
        // representing floor(t0 / G). The most efficient way to do that is to
        // move it to the physically high qword and use a ternlog to combine
        // the two XORs.
.if LSB_CRC && \n == 64
        _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2
        _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
        .if AVX_LEVEL <= 2
                _cond_vex pxor, %xmm2, %xmm0, %xmm0
                _cond_vex pxor, %xmm1, %xmm0, %xmm0
        .else
                vpternlogq $0x96, %xmm2, %xmm1, %xmm0
        .endif
        _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64
.else
        _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
        _cond_vex pxor, %xmm1, %xmm0, %xmm0
        .if \n == 8
                _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8
        .elseif \n == 16
                _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16
        .elseif \n == 32
                _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32
        .else // \n == 64 && !LSB_CRC
                _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64
        .endif
.endif

.if VL > 16
        vzeroupper // Needed when ymm or zmm registers may have been used.
.endif
#ifdef __i386__
        pop CONSTS_PTR
#endif
        RET
.endm

#ifdef CONFIG_AS_VPCLMULQDQ
#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
SYM_FUNC_START(prefix##_pclmul_sse); \
        _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
SYM_FUNC_END(prefix##_pclmul_sse); \
\
SYM_FUNC_START(prefix##_vpclmul_avx2); \
        _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \
SYM_FUNC_END(prefix##_vpclmul_avx2); \
\
SYM_FUNC_START(prefix##_vpclmul_avx512); \
        _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \
SYM_FUNC_END(prefix##_vpclmul_avx512);
#else
#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
SYM_FUNC_START(prefix##_pclmul_sse); \
        _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
SYM_FUNC_END(prefix##_pclmul_sse);
#endif // !CONFIG_AS_VPCLMULQDQ
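
// For example, a CRC-variant-specific file that includes this template might
// contain (the "crc32_lsb" prefix here is just illustrative):
//
//      DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, 32, 1)
//
// which generates crc32_lsb_pclmul_sse(), plus crc32_lsb_vpclmul_avx2() and
// crc32_lsb_vpclmul_avx512() when CONFIG_AS_VPCLMULQDQ is enabled.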