// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86-optimized CRC32 functions
 *
 * Copyright (C) 2008 Intel Corporation
 * Copyright 2012 Xyratex Technology Limited
 * Copyright 2024 Google LLC
 */

#include "crc-pclmul-template.h"

static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);

DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);

static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
{
	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
		   have_pclmulqdq);
	return crc32_le_base(crc, p, len);
}

#ifdef CONFIG_X86_64
#define CRC32_INST "crc32q %1, %q0"
#else
#define CRC32_INST "crc32l %1, %0"
#endif

/*
 * Use the carryless multiply version of crc32c only when the buffer size is
 * >= 512 bytes, to amortize the FPU state save/restore overhead.
 */
#define CRC32C_PCLMUL_BREAKEVEN	512

asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);

static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
{
	size_t num_longs;

	if (!static_branch_likely(&have_crc32))
		return crc32c_base(crc, p, len);

	if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
	    static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
		/*
		 * Long length, the vector registers are usable, and the CPU is
		 * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
		 * It is worthwhile to divide the data into multiple streams,
		 * CRC them independently, and combine them using PCLMULQDQ.
		 * crc32c_x86_3way() does this using 3 streams, which is the
		 * most that x86_64 CPUs have traditionally been capable of.
		 * (The combination identity itself is sketched at the end of
		 * this file.)
		 *
		 * However, due to improved VPCLMULQDQ performance on newer
		 * CPUs, use crc32_lsb_vpclmul_avx512() instead of
		 * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
		 * "good" implementation of AVX-512.
		 *
		 * Future work: the optimal strategy on Zen 3--5 is actually to
		 * use both crc32q and VPCLMULQDQ in parallel. Unfortunately,
		 * different numbers of streams and vector lengths are optimal
		 * on each CPU microarchitecture, making it challenging to take
		 * advantage of this. (Zen 5 even supports 7 parallel crc32q, a
		 * major upgrade.) For now, just choose between
		 * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512(). The latter
		 * is needed anyway for crc32_le(), so we just reuse it here.
		 */
		kernel_fpu_begin();
		if (static_branch_likely(&have_vpclmul_avx512))
			crc = crc32_lsb_vpclmul_avx512(crc, p, len,
				crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
		else
			crc = crc32c_x86_3way(crc, p, len);
		kernel_fpu_end();
		return crc;
	}

	/*
	 * Short length, XMM registers unusable, or the CPU is 32-bit; but the
	 * CPU supports CRC32 instructions. Just issue a single stream of CRC32
	 * instructions inline. While this doesn't use the CPU's CRC32
	 * throughput very well, it avoids the need to combine streams, which
	 * would be inefficient here. (A userspace sketch of this loop follows
	 * the function.)
	 */

	for (num_longs = len / sizeof(unsigned long);
	     num_longs != 0; num_longs--, p += sizeof(unsigned long))
		asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));

	if (sizeof(unsigned long) > 4 && (len & 4)) {
		asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
		p += 4;
	}
	if (len & 2) {
		asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
		p += 2;
	}
	if (len & 1)
		asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));

	return crc;
}
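/*
 * An illustrative userspace sketch of the single-stream fallback loop above;
 * it is not part of this file's interface. crc32c_demo() is a hypothetical
 * name, and it uses the SSE4.2 intrinsics from <nmmintrin.h> (compile with
 * -msse4.2) in place of the kernel's inline asm and ASM_INPUT_RM; memcpy()
 * stands in for the kernel's direct pointer casts to keep the loads
 * strict-aliasing safe in plain C. Like crc32c_arch(), it takes a
 * pre-inverted crc, so the standard CRC-32C of a buffer is
 * ~crc32c_demo(~0u, buf, len).
 */
#if 0
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <nmmintrin.h>

static uint32_t crc32c_demo(uint32_t crc, const uint8_t *p, size_t len)
{
	/* 8 bytes per crc32q, mirroring the unsigned-long loop above */
	for (; len >= 8; p += 8, len -= 8) {
		uint64_t v;

		memcpy(&v, p, sizeof(v));
		crc = (uint32_t)_mm_crc32_u64(crc, v);
	}
	/* then the 4-, 2-, and 1-byte tails, as in crc32c_arch() */
	if (len & 4) {
		uint32_t v;

		memcpy(&v, p, sizeof(v));
		crc = _mm_crc32_u32(crc, v);
		p += 4;
	}
	if (len & 2) {
		uint16_t v;

		memcpy(&v, p, sizeof(v));
		crc = _mm_crc32_u16(crc, v);
		p += 2;
	}
	if (len & 1)
		crc = _mm_crc32_u8(crc, *p);
	return crc;
}
#endif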
#define crc32_be_arch crc32_be_base /* not implemented on this arch */

#define crc32_mod_init_arch crc32_mod_init_arch
static inline void crc32_mod_init_arch(void)
{
	if (boot_cpu_has(X86_FEATURE_XMM4_2))
		static_branch_enable(&have_crc32);
	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
		static_branch_enable(&have_pclmulqdq);
		if (have_vpclmul()) {
			if (have_avx512()) {
				static_call_update(crc32_lsb_pclmul,
						   crc32_lsb_vpclmul_avx512);
				static_branch_enable(&have_vpclmul_avx512);
			} else {
				static_call_update(crc32_lsb_pclmul,
						   crc32_lsb_vpclmul_avx2);
			}
		}
	}
}

static inline u32 crc32_optimizations_arch(void)
{
	u32 optimizations = 0;

	if (static_key_enabled(&have_crc32))
		optimizations |= CRC32C_OPTIMIZATION;
	if (static_key_enabled(&have_pclmulqdq))
		optimizations |= CRC32_LE_OPTIMIZATION;
	return optimizations;
}
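/*
 * An illustrative sketch of the stream-combination math that
 * crc32c_x86_3way() relies on, using the hypothetical helper name
 * crc32c_shift(). Appending len_B bytes to a message multiplies its CRC by
 * x^(8*len_B) mod G(x) (G = 0x82f63b78, bit-reflected), which is the same as
 * running the CRC over len_B zero bytes. PCLMULQDQ performs this
 * multiplication in O(1) with precomputed constants; the bit-at-a-time loop
 * below shows only the math, in O(len_B) time.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

/* Multiply crc by x^(8*nbytes) mod G(x), one bit per iteration. */
static uint32_t crc32c_shift(uint32_t crc, size_t nbytes)
{
	while (nbytes--) {
		int i;

		for (i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82f63b78u & -(crc & 1));
	}
	return crc;
}

/*
 * With the usual init and final XOR of ~0 applied to both halves:
 *
 *	crc32c(A || B) == crc32c_shift(crc32c(A), len_B) ^ crc32c(B)
 *
 * so independent streams can be CRCed in parallel and merged afterwards.
 */
#endif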