/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are
 * instantiated by crc-pclmul-template.S
 *
 * Copyright 2025 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */
#ifndef _CRC_PCLMUL_TEMPLATE_H
#define _CRC_PCLMUL_TEMPLATE_H

#include <asm/cpufeatures.h>
#include <asm/simd.h>
#include <crypto/internal/simd.h>
#include <linux/static_call.h>
#include "crc-pclmul-consts.h"

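/*
 * Declare the [V]PCLMULQDQ-accelerated implementations of a CRC function
 * (SSE, AVX2, and AVX512), and define a static call for it that initially
 * targets the SSE implementation.
 */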
#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t)				\
crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len,		\
			  const void *consts_ptr);			\
crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len,		\
			    const void *consts_ptr);			\
crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len,	\
			      const void *consts_ptr);			\
DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse)

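/*
 * At init time, retarget the static call to the fastest implementation the
 * CPU and kernel support: the AVX512 version if the required AVX512 features
 * are available and 512-bit vectors aren't best avoided on this CPU
 * (X86_FEATURE_PREFER_YMM), otherwise the VPCLMULQDQ/AVX2 version, otherwise
 * leave it pointing at the SSE version.
 */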
#define INIT_CRC_PCLMUL(prefix)						\
do {									\
	if (IS_ENABLED(CONFIG_AS_VPCLMULQDQ) &&				\
	    boot_cpu_has(X86_FEATURE_VPCLMULQDQ) &&			\
	    boot_cpu_has(X86_FEATURE_AVX2) &&				\
	    cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL)) {		\
		if (boot_cpu_has(X86_FEATURE_AVX512BW) &&		\
		    boot_cpu_has(X86_FEATURE_AVX512VL) &&		\
		    !boot_cpu_has(X86_FEATURE_PREFER_YMM) &&		\
		    cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) {	\
			static_call_update(prefix##_pclmul,		\
					   prefix##_vpclmul_avx512);	\
		} else {						\
			static_call_update(prefix##_pclmul,		\
					   prefix##_vpclmul_avx2);	\
		}							\
	}								\
} while (0)

/*
 * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16
 * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD.
 *
 * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions.
 * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(),
 * varying by CPU and factors such as which parts of the "FPU" state userspace
 * has touched, which could result in a larger cutoff being better.  Indeed, a
 * larger cutoff is usually better for a *single* message.  However, the
 * overhead of the FPU section gets amortized if multiple FPU sections get
 * executed before returning to userspace, since the XSAVE and XRSTOR occur only
 * once.  Considering that and the fact that the [V]PCLMULQDQ code is lighter on
 * the dcache than the table-based code is, a 16-byte cutoff seems to work well.
 */
#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq)		\
do {									\
	if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) &&	\
	    crypto_simd_usable()) {					\
		const void *consts_ptr;					\
									\
		consts_ptr = (consts).fold_across_128_bits_consts;	\
		kernel_fpu_begin();					\
		crc = static_call(prefix##_pclmul)((crc), (p), (len),	\
						   consts_ptr);		\
		kernel_fpu_end();					\
		return crc;						\
	}								\
} while (0)
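
/*
 * Example usage (an illustrative sketch, not copied from any particular
 * caller; the "mycrc" names, the consts table, and the table-based fallback
 * are placeholders for whatever the glue code actually defines):
 *
 *	static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
 *
 *	DECLARE_CRC_PCLMUL_FUNCS(mycrc, u32);
 *
 *	u32 mycrc_arch(u32 crc, const u8 *p, size_t len)
 *	{
 *		CRC_PCLMUL(crc, p, len, mycrc, mycrc_consts, have_pclmulqdq);
 *		return mycrc_base(crc, p, len);
 *	}
 *
 *	static int __init mycrc_init(void)
 *	{
 *		if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
 *			static_branch_enable(&have_pclmulqdq);
 *			INIT_CRC_PCLMUL(mycrc);
 *		}
 *		return 0;
 *	}
 */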

#endif /* _CRC_PCLMUL_TEMPLATE_H */