xref: /linux/lib/crc/x86/crc32.h (revision a578dd095dfe8b56c167201d9aea43e47d27f807)
1b10749d8SEric Biggers // SPDX-License-Identifier: GPL-2.0-only
2b10749d8SEric Biggers /*
3b10749d8SEric Biggers  * x86-optimized CRC32 functions
4b10749d8SEric Biggers  *
5b10749d8SEric Biggers  * Copyright (C) 2008 Intel Corporation
6b10749d8SEric Biggers  * Copyright 2012 Xyratex Technology Limited
7b10749d8SEric Biggers  * Copyright 2024 Google LLC
8b10749d8SEric Biggers  */
9b10749d8SEric Biggers 
10b10749d8SEric Biggers #include "crc-pclmul-template.h"
11b10749d8SEric Biggers 
12b10749d8SEric Biggers static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
13b10749d8SEric Biggers static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
14*118da22eSEric Biggers static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);
15b10749d8SEric Biggers 
16b10749d8SEric Biggers DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
17b10749d8SEric Biggers 
crc32_le_arch(u32 crc,const u8 * p,size_t len)18b10749d8SEric Biggers static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
19b10749d8SEric Biggers {
20b10749d8SEric Biggers 	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
21b10749d8SEric Biggers 		   have_pclmulqdq);
22b10749d8SEric Biggers 	return crc32_le_base(crc, p, len);
23b10749d8SEric Biggers }
24b10749d8SEric Biggers 
25b10749d8SEric Biggers #ifdef CONFIG_X86_64
26b10749d8SEric Biggers #define CRC32_INST "crc32q %1, %q0"
27b10749d8SEric Biggers #else
28b10749d8SEric Biggers #define CRC32_INST "crc32l %1, %0"
29b10749d8SEric Biggers #endif
30b10749d8SEric Biggers 
31b10749d8SEric Biggers /*
32b10749d8SEric Biggers  * Use carryless multiply version of crc32c when buffer size is >= 512 to
33b10749d8SEric Biggers  * account for FPU state save/restore overhead.
34b10749d8SEric Biggers  */
35b10749d8SEric Biggers #define CRC32C_PCLMUL_BREAKEVEN	512
36b10749d8SEric Biggers 
37b10749d8SEric Biggers asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
38b10749d8SEric Biggers 
crc32c_arch(u32 crc,const u8 * p,size_t len)39b10749d8SEric Biggers static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
40b10749d8SEric Biggers {
41b10749d8SEric Biggers 	size_t num_longs;
42b10749d8SEric Biggers 
43b10749d8SEric Biggers 	if (!static_branch_likely(&have_crc32))
44b10749d8SEric Biggers 		return crc32c_base(crc, p, len);
45b10749d8SEric Biggers 
46b10749d8SEric Biggers 	if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
47b10749d8SEric Biggers 	    static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
48*118da22eSEric Biggers 		/*
49*118da22eSEric Biggers 		 * Long length, the vector registers are usable, and the CPU is
50*118da22eSEric Biggers 		 * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
51*118da22eSEric Biggers 		 * It is worthwhile to divide the data into multiple streams,
52*118da22eSEric Biggers 		 * CRC them independently, and combine them using PCLMULQDQ.
53*118da22eSEric Biggers 		 * crc32c_x86_3way() does this using 3 streams, which is the
54*118da22eSEric Biggers 		 * most that x86_64 CPUs have traditionally been capable of.
55*118da22eSEric Biggers 		 *
56*118da22eSEric Biggers 		 * However, due to improved VPCLMULQDQ performance on newer
57*118da22eSEric Biggers 		 * CPUs, use crc32_lsb_vpclmul_avx512() instead of
58*118da22eSEric Biggers 		 * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
59*118da22eSEric Biggers 		 * "good" implementation of AVX-512.
60*118da22eSEric Biggers 		 *
61*118da22eSEric Biggers 		 * Future work: the optimal strategy on Zen 3--5 is actually to
62*118da22eSEric Biggers 		 * use both crc32q and VPCLMULQDQ in parallel.  Unfortunately,
63*118da22eSEric Biggers 		 * different numbers of streams and vector lengths are optimal
64*118da22eSEric Biggers 		 * on each CPU microarchitecture, making it challenging to take
65*118da22eSEric Biggers 		 * advantage of this.  (Zen 5 even supports 7 parallel crc32q, a
66*118da22eSEric Biggers 		 * major upgrade.)  For now, just choose between
67*118da22eSEric Biggers 		 * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512().  The latter
68*118da22eSEric Biggers 		 * is needed anyway for crc32_le(), so we just reuse it here.
69*118da22eSEric Biggers 		 */
70b10749d8SEric Biggers 		kernel_fpu_begin();
71*118da22eSEric Biggers 		if (static_branch_likely(&have_vpclmul_avx512))
72*118da22eSEric Biggers 			crc = crc32_lsb_vpclmul_avx512(crc, p, len,
73*118da22eSEric Biggers 				       crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
74*118da22eSEric Biggers 		else
75b10749d8SEric Biggers 			crc = crc32c_x86_3way(crc, p, len);
76b10749d8SEric Biggers 		kernel_fpu_end();
77b10749d8SEric Biggers 		return crc;
78b10749d8SEric Biggers 	}
79b10749d8SEric Biggers 
80*118da22eSEric Biggers 	/*
81*118da22eSEric Biggers 	 * Short length, XMM registers unusable, or the CPU is 32-bit; but the
82*118da22eSEric Biggers 	 * CPU supports CRC32 instructions.  Just issue a single stream of CRC32
83*118da22eSEric Biggers 	 * instructions inline.  While this doesn't use the CPU's CRC32
84*118da22eSEric Biggers 	 * throughput very well, it avoids the need to combine streams.  Stream
85*118da22eSEric Biggers 	 * combination would be inefficient here.
86*118da22eSEric Biggers 	 */
87*118da22eSEric Biggers 
88b10749d8SEric Biggers 	for (num_longs = len / sizeof(unsigned long);
89b10749d8SEric Biggers 	     num_longs != 0; num_longs--, p += sizeof(unsigned long))
90b10749d8SEric Biggers 		asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
91b10749d8SEric Biggers 
92b10749d8SEric Biggers 	if (sizeof(unsigned long) > 4 && (len & 4)) {
93b10749d8SEric Biggers 		asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
94b10749d8SEric Biggers 		p += 4;
95b10749d8SEric Biggers 	}
96b10749d8SEric Biggers 	if (len & 2) {
97b10749d8SEric Biggers 		asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
98b10749d8SEric Biggers 		p += 2;
99b10749d8SEric Biggers 	}
100b10749d8SEric Biggers 	if (len & 1)
101b10749d8SEric Biggers 		asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));
102b10749d8SEric Biggers 
103b10749d8SEric Biggers 	return crc;
104b10749d8SEric Biggers }
105b10749d8SEric Biggers 
106b10749d8SEric Biggers #define crc32_be_arch crc32_be_base /* not implemented on this arch */
107b10749d8SEric Biggers 
108b10749d8SEric Biggers #define crc32_mod_init_arch crc32_mod_init_arch
crc32_mod_init_arch(void)109b10749d8SEric Biggers static inline void crc32_mod_init_arch(void)
110b10749d8SEric Biggers {
111b10749d8SEric Biggers 	if (boot_cpu_has(X86_FEATURE_XMM4_2))
112b10749d8SEric Biggers 		static_branch_enable(&have_crc32);
113b10749d8SEric Biggers 	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
114b10749d8SEric Biggers 		static_branch_enable(&have_pclmulqdq);
115110628e5SEric Biggers 		if (have_vpclmul()) {
116110628e5SEric Biggers 			if (have_avx512()) {
117110628e5SEric Biggers 				static_call_update(crc32_lsb_pclmul,
118110628e5SEric Biggers 						   crc32_lsb_vpclmul_avx512);
119*118da22eSEric Biggers 				static_branch_enable(&have_vpclmul_avx512);
120110628e5SEric Biggers 			} else {
121110628e5SEric Biggers 				static_call_update(crc32_lsb_pclmul,
122110628e5SEric Biggers 						   crc32_lsb_vpclmul_avx2);
123110628e5SEric Biggers 			}
124110628e5SEric Biggers 		}
125b10749d8SEric Biggers 	}
126b10749d8SEric Biggers }
127b10749d8SEric Biggers 
crc32_optimizations_arch(void)128b10749d8SEric Biggers static inline u32 crc32_optimizations_arch(void)
129b10749d8SEric Biggers {
130b10749d8SEric Biggers 	u32 optimizations = 0;
131b10749d8SEric Biggers 
132b10749d8SEric Biggers 	if (static_key_enabled(&have_crc32))
133b10749d8SEric Biggers 		optimizations |= CRC32C_OPTIMIZATION;
134b10749d8SEric Biggers 	if (static_key_enabled(&have_pclmulqdq))
135b10749d8SEric Biggers 		optimizations |= CRC32_LE_OPTIMIZATION;
136b10749d8SEric Biggers 	return optimizations;
137b10749d8SEric Biggers }
138