xref: /linux/lib/crc/x86/crc32.h (revision 3f2a5ba784b808109cac0aac921213e43143a216)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * x86-optimized CRC32 functions
4  *
5  * Copyright (C) 2008 Intel Corporation
6  * Copyright 2012 Xyratex Technology Limited
7  * Copyright 2024 Google LLC
8  */
9 
10 #include "crc-pclmul-template.h"
11 
12 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
13 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
14 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);
15 
16 DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
17 
18 static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
19 {
20 	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
21 		   have_pclmulqdq);
22 	return crc32_le_base(crc, p, len);
23 }
24 
25 #ifdef CONFIG_X86_64
26 #define CRC32_INST "crc32q %1, %q0"
27 #else
28 #define CRC32_INST "crc32l %1, %0"
29 #endif
30 
31 /*
32  * Use carryless multiply version of crc32c when buffer size is >= 512 to
33  * account for FPU state save/restore overhead.
34  */
35 #define CRC32C_PCLMUL_BREAKEVEN	512
36 
37 asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
38 
39 static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
40 {
41 	size_t num_longs;
42 
43 	if (!static_branch_likely(&have_crc32))
44 		return crc32c_base(crc, p, len);
45 
46 	if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
47 	    static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
48 		/*
49 		 * Long length, the vector registers are usable, and the CPU is
50 		 * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
51 		 * It is worthwhile to divide the data into multiple streams,
52 		 * CRC them independently, and combine them using PCLMULQDQ.
53 		 * crc32c_x86_3way() does this using 3 streams, which is the
54 		 * most that x86_64 CPUs have traditionally been capable of.
55 		 *
56 		 * However, due to improved VPCLMULQDQ performance on newer
57 		 * CPUs, use crc32_lsb_vpclmul_avx512() instead of
58 		 * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
59 		 * "good" implementation of AVX-512.
60 		 *
61 		 * Future work: the optimal strategy on Zen 3--5 is actually to
62 		 * use both crc32q and VPCLMULQDQ in parallel.  Unfortunately,
63 		 * different numbers of streams and vector lengths are optimal
64 		 * on each CPU microarchitecture, making it challenging to take
65 		 * advantage of this.  (Zen 5 even supports 7 parallel crc32q, a
66 		 * major upgrade.)  For now, just choose between
67 		 * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512().  The latter
68 		 * is needed anyway for crc32_le(), so we just reuse it here.
69 		 */
70 		kernel_fpu_begin();
71 		if (static_branch_likely(&have_vpclmul_avx512))
72 			crc = crc32_lsb_vpclmul_avx512(crc, p, len,
73 				       crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
74 		else
75 			crc = crc32c_x86_3way(crc, p, len);
76 		kernel_fpu_end();
77 		return crc;
78 	}
79 
80 	/*
81 	 * Short length, XMM registers unusable, or the CPU is 32-bit; but the
82 	 * CPU supports CRC32 instructions.  Just issue a single stream of CRC32
83 	 * instructions inline.  While this doesn't use the CPU's CRC32
84 	 * throughput very well, it avoids the need to combine streams.  Stream
85 	 * combination would be inefficient here.
86 	 */
87 
88 	for (num_longs = len / sizeof(unsigned long);
89 	     num_longs != 0; num_longs--, p += sizeof(unsigned long))
90 		asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
91 
92 	if (sizeof(unsigned long) > 4 && (len & 4)) {
93 		asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
94 		p += 4;
95 	}
96 	if (len & 2) {
97 		asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
98 		p += 2;
99 	}
100 	if (len & 1)
101 		asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));
102 
103 	return crc;
104 }
105 
/* crc32_be is not implemented on this arch: alias it to crc32_be_base. */
#define crc32_be_arch crc32_be_base
107 
108 #define crc32_mod_init_arch crc32_mod_init_arch
109 static inline void crc32_mod_init_arch(void)
110 {
111 	if (boot_cpu_has(X86_FEATURE_XMM4_2))
112 		static_branch_enable(&have_crc32);
113 	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
114 		static_branch_enable(&have_pclmulqdq);
115 		if (have_vpclmul()) {
116 			if (have_avx512()) {
117 				static_call_update(crc32_lsb_pclmul,
118 						   crc32_lsb_vpclmul_avx512);
119 				static_branch_enable(&have_vpclmul_avx512);
120 			} else {
121 				static_call_update(crc32_lsb_pclmul,
122 						   crc32_lsb_vpclmul_avx2);
123 			}
124 		}
125 	}
126 }
127 
128 static inline u32 crc32_optimizations_arch(void)
129 {
130 	u32 optimizations = 0;
131 
132 	if (static_key_enabled(&have_crc32))
133 		optimizations |= CRC32C_OPTIMIZATION;
134 	if (static_key_enabled(&have_pclmulqdq))
135 		optimizations |= CRC32_LE_OPTIMIZATION;
136 	return optimizations;
137 }
138