// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86-optimized CRC32 functions
 *
 * Copyright (C) 2008 Intel Corporation
 * Copyright 2012 Xyratex Technology Limited
 * Copyright 2024 Google LLC
 */

#include "crc-pclmul-template.h"

static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);

DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);

static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
{
	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
		   have_pclmulqdq);
	return crc32_le_base(crc, p, len);
}

#ifdef CONFIG_X86_64
#define CRC32_INST "crc32q %1, %q0"
#else
#define CRC32_INST "crc32l %1, %0"
#endif

/*
 * Use the carryless multiply version of crc32c only when the buffer size is
 * >= 512 bytes, so that the FPU state save/restore overhead is amortized.
 */
#define CRC32C_PCLMUL_BREAKEVEN	512

asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);

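/*
 * Background on the stream combination that crc32c_x86_3way() relies on (a
 * summary of the general technique, not a specification of that function's
 * exact internals): ignoring the bit-reflection and pre/post-inversion
 * conventions, CRC is linear over GF(2), so for a message split into two
 * chunks A and B,
 *
 *	crc(A || B) = crc(A || zeroes(len(B))) ^ crc(B)
 *
 * and advancing a CRC past n zero bytes is a multiplication by x^(8*n) mod
 * G(x), which a PCLMULQDQ carryless multiply plus a reduction computes
 * cheaply.  That is what makes it worthwhile to CRC several streams
 * independently and merge the partial results at the end.
 */
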
static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
{
	size_t num_longs;

	if (!static_branch_likely(&have_crc32))
		return crc32c_base(crc, p, len);

	if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
	    static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
		/*
		 * Long length, the vector registers are usable, and the CPU is
		 * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
		 * It is worthwhile to divide the data into multiple streams,
		 * CRC them independently, and combine them using PCLMULQDQ.
		 * crc32c_x86_3way() does this using 3 streams, which is the
		 * most that x86_64 CPUs have traditionally been capable of.
		 *
		 * However, due to improved VPCLMULQDQ performance on newer
		 * CPUs, use crc32_lsb_vpclmul_avx512() instead of
		 * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
		 * "good" implementation of AVX-512.
		 *
		 * Future work: the optimal strategy on Zen 3--5 is actually to
		 * use both crc32q and VPCLMULQDQ in parallel.  Unfortunately,
		 * different numbers of streams and vector lengths are optimal
		 * on each CPU microarchitecture, making it challenging to take
		 * advantage of this.  (Zen 5 even supports 7 parallel crc32q,
		 * a major upgrade.)  For now, just choose between
		 * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512().  The
		 * latter is needed anyway for crc32_le(), so we just reuse it
		 * here.
		 */
		kernel_fpu_begin();
		if (static_branch_likely(&have_vpclmul_avx512))
			crc = crc32_lsb_vpclmul_avx512(crc, p, len,
						       crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
		else
			crc = crc32c_x86_3way(crc, p, len);
		kernel_fpu_end();
		return crc;
	}

	/*
	 * Short length, XMM registers unusable, or the CPU is 32-bit; but the
	 * CPU supports CRC32 instructions.  Just issue a single stream of
	 * CRC32 instructions inline.  While this doesn't use the CPU's CRC32
	 * throughput very well, it avoids the need to combine streams, which
	 * would be inefficient here.
	 */

	/* Process the bulk of the data one unsigned long at a time. */
	for (num_longs = len / sizeof(unsigned long);
	     num_longs != 0; num_longs--, p += sizeof(unsigned long))
		asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));

	/*
	 * Consume the remaining bytes (fewer than sizeof(unsigned long)) in
	 * descending power-of-2 sized chunks.  The 4-byte step is compiled
	 * out on 32-bit, where the loop above already handled 4-byte units.
	 */
	if (sizeof(unsigned long) > 4 && (len & 4)) {
		asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
		p += 4;
	}
	if (len & 2) {
		asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
		p += 2;
	}
	if (len & 1)
		asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));

	return crc;
}

#define crc32_be_arch crc32_be_base /* not implemented on this arch */

#define crc32_mod_init_arch crc32_mod_init_arch
static inline void crc32_mod_init_arch(void)
{
	if (boot_cpu_has(X86_FEATURE_XMM4_2))
		static_branch_enable(&have_crc32);
	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
		static_branch_enable(&have_pclmulqdq);
		if (have_vpclmul()) {
			if (have_avx512()) {
				static_call_update(crc32_lsb_pclmul,
						   crc32_lsb_vpclmul_avx512);
				static_branch_enable(&have_vpclmul_avx512);
			} else {
				static_call_update(crc32_lsb_pclmul,
						   crc32_lsb_vpclmul_avx2);
			}
		}
	}
}

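/*
 * Report which optimized CRC-32 variants are live on this CPU.  The generic
 * CRC library exposes this information via crc32_optimizations(), so that
 * callers (e.g. drivers deciding whether a data digest is worth computing in
 * software) can check whether CRC32C is hardware-accelerated before opting
 * in.
 */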
static inline u32 crc32_optimizations_arch(void)
{
	u32 optimizations = 0;

	if (static_key_enabled(&have_crc32))
		optimizations |= CRC32C_OPTIMIZATION;
	if (static_key_enabled(&have_pclmulqdq))
		optimizations |= CRC32_LE_OPTIMIZATION;
	return optimizations;
}

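/*
 * Note: this header is meant to be included by the generic CRC-32 library
 * code rather than compiled standalone, which is why everything here is
 * static inline or a macro.  The generic code supplies the crc32c_base() and
 * crc32_le_base() fallbacks used above, and the
 * "#define crc32_mod_init_arch crc32_mod_init_arch" idiom lets it detect and
 * call the arch init hook at initialization time.
 */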