// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86-optimized CRC32 functions
 *
 * Copyright (C) 2008 Intel Corporation
 * Copyright 2012 Xyratex Technology Limited
 * Copyright 2024 Google LLC
 */

#include "crc-pclmul-template.h"

static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);

DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);

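/*
 * crc32_le(): least-significant-bit-first CRC32 with polynomial 0xedb88320,
 * the variant used by zlib/gzip and Ethernet.  CRC_PCLMUL() (from
 * crc-pclmul-template.h) returns early via the [V]PCLMULQDQ implementation
 * when the buffer is long enough and the kernel-mode FPU is usable;
 * otherwise we fall through to the generic table-based code.
 */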
static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
{
	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
		   have_pclmulqdq);
	return crc32_le_base(crc, p, len);
}

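/*
 * CRC32 instruction for the word-at-a-time loop in crc32c_arch(): 8 bytes
 * per instruction ("crc32q") on x86_64, 4 bytes ("crc32l") on 32-bit.  The
 * "%q" operand modifier selects the 64-bit form of the destination register.
 */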
#ifdef CONFIG_X86_64
#define CRC32_INST "crc32q %1, %q0"
#else
#define CRC32_INST "crc32l %1, %0"
#endif

/*
 * Use the carryless multiply version of crc32c when the buffer size is >= 512
 * to account for the FPU state save/restore overhead.
 */
#define CRC32C_PCLMUL_BREAKEVEN 512

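/*
 * Assembly implementation: computes CRC32C over three independent streams
 * with the crc32q instruction and combines the partial CRCs using PCLMULQDQ
 * (see the comment in crc32c_arch() below).
 */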
asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);

static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
{
	size_t num_longs;

	if (!static_branch_likely(&have_crc32))
		return crc32c_base(crc, p, len);

	if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
	    static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
		/*
		 * Long length, the vector registers are usable, and the CPU is
		 * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
		 * It is worthwhile to divide the data into multiple streams,
		 * CRC them independently, and combine them using PCLMULQDQ.
		 * crc32c_x86_3way() does this using 3 streams, which is the
		 * most that x86_64 CPUs have traditionally been capable of.
		 *
		 * However, due to improved VPCLMULQDQ performance on newer
		 * CPUs, use crc32_lsb_vpclmul_avx512() instead of
		 * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
		 * "good" implementation of AVX-512.
		 *
		 * Future work: the optimal strategy on Zen 3--5 is actually to
		 * use both crc32q and VPCLMULQDQ in parallel.  Unfortunately,
		 * different numbers of streams and vector lengths are optimal
		 * on each CPU microarchitecture, making it challenging to take
		 * advantage of this.  (Zen 5 even supports 7 parallel crc32q,
		 * a major upgrade.)  For now, just choose between
		 * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512().  The
		 * latter is needed anyway for crc32_le(), so we just reuse it
		 * here.
		 */
		kernel_fpu_begin();
		if (static_branch_likely(&have_vpclmul_avx512))
			crc = crc32_lsb_vpclmul_avx512(crc, p, len,
					crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
		else
			crc = crc32c_x86_3way(crc, p, len);
		kernel_fpu_end();
		return crc;
	}

	/*
	 * Short length, XMM registers unusable, or the CPU is 32-bit; but the
	 * CPU supports CRC32 instructions.  Just issue a single stream of
	 * CRC32 instructions inline.  While this doesn't use the CPU's CRC32
	 * throughput very well, it avoids the need to combine streams.
	 * Stream combination would be inefficient here.
	 */
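	/* Checksum the bulk of the data one unsigned long at a time. */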
	for (num_longs = len / sizeof(unsigned long);
	     num_longs != 0; num_longs--, p += sizeof(unsigned long))
		asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));

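	/*
	 * Then checksum the remaining bytes (up to 7 on 64-bit, up to 3 on
	 * 32-bit).  The sizeof() test is a compile-time constant, so the
	 * 4-byte step is compiled out on 32-bit, where the loop above already
	 * consumed all 4-byte chunks.
	 */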
	if (sizeof(unsigned long) > 4 && (len & 4)) {
		asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
		p += 4;
	}
	if (len & 2) {
		asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
		p += 2;
	}
	if (len & 1)
		asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));

	return crc;
}

#define crc32_be_arch crc32_be_base /* not implemented on this arch */

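/*
 * Boot-time CPU feature detection: SSE4.2 (X86_FEATURE_XMM4_2) provides the
 * crc32 instruction, and PCLMULQDQ enables the carryless-multiply paths.
 * When VPCLMULQDQ is also available, retarget the crc32_lsb_pclmul static
 * call to the AVX-512 or AVX2 implementation.
 */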
#define crc32_mod_init_arch crc32_mod_init_arch
static inline void crc32_mod_init_arch(void)
{
	if (boot_cpu_has(X86_FEATURE_XMM4_2))
		static_branch_enable(&have_crc32);
	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
		static_branch_enable(&have_pclmulqdq);
		if (have_vpclmul()) {
			if (have_avx512()) {
				static_call_update(crc32_lsb_pclmul,
						   crc32_lsb_vpclmul_avx512);
				static_branch_enable(&have_vpclmul_avx512);
			} else {
				static_call_update(crc32_lsb_pclmul,
						   crc32_lsb_vpclmul_avx2);
			}
		}
	}
}

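/* Report which CRC32 variants have an accelerated implementation on this CPU. */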
static inline u32 crc32_optimizations_arch(void)
{
	u32 optimizations = 0;

	if (static_key_enabled(&have_crc32))
		optimizations |= CRC32C_OPTIMIZATION;
	if (static_key_enabled(&have_pclmulqdq))
		optimizations |= CRC32_LE_OPTIMIZATION;
	return optimizations;
}