/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are
 * instantiated by crc-pclmul-template.S
 *
 * Copyright 2025 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */
#ifndef _CRC_PCLMUL_TEMPLATE_H
#define _CRC_PCLMUL_TEMPLATE_H

#include <asm/cpufeatures.h>
#include <asm/simd.h>
#include <crypto/internal/simd.h>
#include <linux/static_call.h>
#include "crc-pclmul-consts.h"

#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t)				\
crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len,		\
			  const void *consts_ptr);			\
crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len,		\
			    const void *consts_ptr);			\
crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len,	\
			      const void *consts_ptr);			\
DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse)

#define INIT_CRC_PCLMUL(prefix)						\
do {									\
	if (IS_ENABLED(CONFIG_AS_VPCLMULQDQ) &&				\
	    boot_cpu_has(X86_FEATURE_VPCLMULQDQ) &&			\
	    boot_cpu_has(X86_FEATURE_AVX2) &&				\
	    cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL)) {		\
		if (boot_cpu_has(X86_FEATURE_AVX512BW) &&		\
		    boot_cpu_has(X86_FEATURE_AVX512VL) &&		\
		    !boot_cpu_has(X86_FEATURE_PREFER_YMM) &&		\
		    cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) {	\
			static_call_update(prefix##_pclmul,		\
					   prefix##_vpclmul_avx512);	\
		} else {						\
			static_call_update(prefix##_pclmul,		\
					   prefix##_vpclmul_avx2);	\
		}							\
	}								\
} while (0)

/*
 * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16
 * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD.
 *
 * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions.
 * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(),
 * varying by CPU and factors such as which parts of the "FPU" state userspace
 * has touched, which could make a larger cutoff better.  Indeed, a larger
 * cutoff is usually better for a *single* message.  However, the overhead of
 * the FPU section gets amortized if multiple FPU sections get executed before
 * returning to userspace, since the XSAVE and XRSTOR occur only once.
 * Considering that and the fact that the [V]PCLMULQDQ code is lighter on the
 * dcache than the table-based code is, a 16-byte cutoff seems to work well.
 */
#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq)		\
do {									\
	if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) &&	\
	    crypto_simd_usable()) {					\
		const void *consts_ptr;					\
									\
		consts_ptr = (consts).fold_across_128_bits_consts;	\
		kernel_fpu_begin();					\
		crc = static_call(prefix##_pclmul)((crc), (p), (len),	\
						   consts_ptr);		\
		kernel_fpu_end();					\
		return crc;						\
	}								\
} while (0)

#endif /* _CRC_PCLMUL_TEMPLATE_H */
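
/*
 * Example: wiring up one instantiation of the [V]PCLMULQDQ CRC functions.
 * This is an illustrative sketch, not part of the template itself; the
 * instantiation name crc32_lsb and the arch-init hook are assumed names.  The
 * static key gates CRC_PCLMUL() at the call site, while INIT_CRC_PCLMUL()
 * upgrades the static call to the widest [V]PCLMULQDQ variant the CPU
 * supports.
 *
 *	static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
 *
 *	DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
 *
 *	static void crc32_mod_init_arch(void)
 *	{
 *		if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
 *			static_branch_enable(&have_pclmulqdq);
 *			INIT_CRC_PCLMUL(crc32_lsb);
 *		}
 *	}
 */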
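
/*
 * Example: using CRC_PCLMUL() at a call site.  Also an illustrative sketch:
 * the macro returns from the enclosing function when the SIMD path is taken,
 * so the table-based fallback simply follows it.  The constants symbol and
 * crc32_le_base() are assumed names for the generated constants (see
 * crc-pclmul-consts.h) and the generic fallback, respectively.
 *
 *	static u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 *	{
 *		CRC_PCLMUL(crc, p, len, crc32_lsb,
 *			   crc32_lsb_0xedb88320_consts, have_pclmulqdq);
 *		return crc32_le_base(crc, p, len);
 *	}
 */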