// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ChaCha and HChaCha functions (x86_64 optimized)
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <asm/simd.h>
#include <crypto/chacha.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sizes.h>

/*
 * SIMD kernels implemented in assembly.  Each generates up to the given
 * number of keystream blocks and XORs them into dst; 'len' may cover a
 * partial trailing block.
 */
asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state,
				       u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state,
					u8 *dst, const u8 *src,
					unsigned int len, int nrounds);
asmlinkage void hchacha_block_ssse3(const struct chacha_state *state,
				    u32 out[HCHACHA_OUT_WORDS], int nrounds);

asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state,
				       u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state,
				       u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state,
				       u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);

asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state,
					   u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state,
					   u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state,
					   u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);

/* Set once at boot from CPU features; see chacha_simd_mod_init(). */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);

/*
 * Number of whole blocks the asm consumes for a 'len'-byte tail, capped at
 * 'maxblocks'; used to advance the block counter after a tail call.
 */
static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
{
	len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
	return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
}

static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
	if (static_branch_likely(&chacha_use_avx512vl)) {
		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
			chacha_8block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			bytes -= CHACHA_BLOCK_SIZE * 8;
			src += CHACHA_BLOCK_SIZE * 8;
			dst += CHACHA_BLOCK_SIZE * 8;
			state->x[12] += 8;	/* x[12] is the block counter */
		}
		/* The AVX-512VL kernels handle any tail, partial blocks included. */
		if (bytes > CHACHA_BLOCK_SIZE * 4) {
			chacha_8block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state->x[12] += chacha_advance(bytes, 8);
			return;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 2) {
			chacha_4block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state->x[12] += chacha_advance(bytes, 4);
			return;
		}
		if (bytes) {
			chacha_2block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state->x[12] += chacha_advance(bytes, 2);
			return;
		}
	}

	if (static_branch_likely(&chacha_use_avx2)) {
		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
			bytes -= CHACHA_BLOCK_SIZE * 8;
			src += CHACHA_BLOCK_SIZE * 8;
			dst += CHACHA_BLOCK_SIZE * 8;
			state->x[12] += 8;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 4) {
			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
			state->x[12] += chacha_advance(bytes, 8);
			return;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 2) {
			chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
			state->x[12] += chacha_advance(bytes, 4);
			return;
		}
		if (bytes > CHACHA_BLOCK_SIZE) {
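			/*
			 * Between one and two blocks remain.  The 2-block
			 * kernel copes with the partial length, and
			 * chacha_advance() converts the byte count into the
			 * matching block-counter increment; a tail of at most
			 * one block instead falls through to the single-block
			 * SSSE3 path below.
			 */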
			chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
			state->x[12] += chacha_advance(bytes, 2);
			return;
		}
	}

	/* SSSE3 fallback: full 4-block stretches, then the tail. */
	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		bytes -= CHACHA_BLOCK_SIZE * 4;
		src += CHACHA_BLOCK_SIZE * 4;
		dst += CHACHA_BLOCK_SIZE * 4;
		state->x[12] += 4;
	}
	if (bytes > CHACHA_BLOCK_SIZE) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		state->x[12] += chacha_advance(bytes, 4);
		return;
	}
	if (bytes) {
		chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
		state->x[12]++;
	}
}

void hchacha_block_arch(const struct chacha_state *state,
			u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
	if (!static_branch_likely(&chacha_use_simd)) {
		hchacha_block_generic(state, out, nrounds);
	} else {
		kernel_fpu_begin();
		hchacha_block_ssse3(state, out, nrounds);
		kernel_fpu_end();
	}
}
EXPORT_SYMBOL(hchacha_block_arch);

void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
		       unsigned int bytes, int nrounds)
{
	/* SIMD is not worth the FPU state switch for at most one block. */
	if (!static_branch_likely(&chacha_use_simd) ||
	    bytes <= CHACHA_BLOCK_SIZE)
		return chacha_crypt_generic(state, dst, src, bytes, nrounds);

	do {
		/*
		 * Limit each FPU section to 4 KiB so that preemption is
		 * never disabled for too long at a time.
		 */
		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);

		kernel_fpu_begin();
		chacha_dosimd(state, dst, src, todo, nrounds);
		kernel_fpu_end();

		bytes -= todo;
		src += todo;
		dst += todo;
	} while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);

bool chacha_is_arch_optimized(void)
{
	return static_key_enabled(&chacha_use_simd);
}
EXPORT_SYMBOL(chacha_is_arch_optimized);

static int __init chacha_simd_mod_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_SSSE3))
		return 0;

	static_branch_enable(&chacha_use_simd);

	/* AVX2 also requires OS support for the YMM register state. */
	if (boot_cpu_has(X86_FEATURE_AVX) &&
	    boot_cpu_has(X86_FEATURE_AVX2) &&
	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
		static_branch_enable(&chacha_use_avx2);

		if (boot_cpu_has(X86_FEATURE_AVX512VL) &&
		    boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
			static_branch_enable(&chacha_use_avx512vl);
	}
	return 0;
}
subsys_initcall(chacha_simd_mod_init);

static void __exit chacha_simd_mod_exit(void)
{
}
module_exit(chacha_simd_mod_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)");
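
/*
 * Usage sketch (illustrative only, not part of this file): callers normally
 * go through the generic wrappers declared in <crypto/chacha.h>, which
 * dispatch to chacha_crypt_arch()/hchacha_block_arch() above when the arch
 * implementation is built in:
 *
 *	struct chacha_state state;
 *
 *	chacha_init(&state, key, iv);	// 32-byte key, 16-byte counter+nonce
 *	chacha_crypt(&state, dst, src, len, 20);	// 20-round ChaCha20
 */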