1*74750aa7SEric Biggers // SPDX-License-Identifier: GPL-2.0-or-later 2*74750aa7SEric Biggers /* 3*74750aa7SEric Biggers * ChaCha and HChaCha functions (x86_64 optimized) 4*74750aa7SEric Biggers * 5*74750aa7SEric Biggers * Copyright (C) 2015 Martin Willi 6*74750aa7SEric Biggers */ 7*74750aa7SEric Biggers 8*74750aa7SEric Biggers #include <asm/simd.h> 9*74750aa7SEric Biggers #include <crypto/chacha.h> 10*74750aa7SEric Biggers #include <linux/jump_label.h> 11*74750aa7SEric Biggers #include <linux/kernel.h> 12*74750aa7SEric Biggers #include <linux/module.h> 13*74750aa7SEric Biggers #include <linux/sizes.h> 14*74750aa7SEric Biggers 15*74750aa7SEric Biggers asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state, 16*74750aa7SEric Biggers u8 *dst, const u8 *src, 17*74750aa7SEric Biggers unsigned int len, int nrounds); 18*74750aa7SEric Biggers asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state, 19*74750aa7SEric Biggers u8 *dst, const u8 *src, 20*74750aa7SEric Biggers unsigned int len, int nrounds); 21*74750aa7SEric Biggers asmlinkage void hchacha_block_ssse3(const struct chacha_state *state, 22*74750aa7SEric Biggers u32 out[HCHACHA_OUT_WORDS], int nrounds); 23*74750aa7SEric Biggers 24*74750aa7SEric Biggers asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state, 25*74750aa7SEric Biggers u8 *dst, const u8 *src, 26*74750aa7SEric Biggers unsigned int len, int nrounds); 27*74750aa7SEric Biggers asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state, 28*74750aa7SEric Biggers u8 *dst, const u8 *src, 29*74750aa7SEric Biggers unsigned int len, int nrounds); 30*74750aa7SEric Biggers asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state, 31*74750aa7SEric Biggers u8 *dst, const u8 *src, 32*74750aa7SEric Biggers unsigned int len, int nrounds); 33*74750aa7SEric Biggers 34*74750aa7SEric Biggers asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state, 35*74750aa7SEric Biggers u8 *dst, const u8 *src, 36*74750aa7SEric Biggers unsigned int len, int nrounds); 37*74750aa7SEric Biggers asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state, 38*74750aa7SEric Biggers u8 *dst, const u8 *src, 39*74750aa7SEric Biggers unsigned int len, int nrounds); 40*74750aa7SEric Biggers asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state, 41*74750aa7SEric Biggers u8 *dst, const u8 *src, 42*74750aa7SEric Biggers unsigned int len, int nrounds); 43*74750aa7SEric Biggers 44*74750aa7SEric Biggers static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); 45*74750aa7SEric Biggers static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); 46*74750aa7SEric Biggers static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); 47*74750aa7SEric Biggers 48*74750aa7SEric Biggers static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) 49*74750aa7SEric Biggers { 50*74750aa7SEric Biggers len = min(len, maxblocks * CHACHA_BLOCK_SIZE); 51*74750aa7SEric Biggers return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; 52*74750aa7SEric Biggers } 53*74750aa7SEric Biggers 54*74750aa7SEric Biggers static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src, 55*74750aa7SEric Biggers unsigned int bytes, int nrounds) 56*74750aa7SEric Biggers { 57*74750aa7SEric Biggers if (static_branch_likely(&chacha_use_avx512vl)) { 58*74750aa7SEric Biggers while (bytes >= CHACHA_BLOCK_SIZE * 8) { 59*74750aa7SEric Biggers chacha_8block_xor_avx512vl(state, dst, src, bytes, 60*74750aa7SEric Biggers nrounds); 61*74750aa7SEric Biggers bytes -= CHACHA_BLOCK_SIZE * 8; 62*74750aa7SEric Biggers src += CHACHA_BLOCK_SIZE * 8; 63*74750aa7SEric Biggers dst += CHACHA_BLOCK_SIZE * 8; 64*74750aa7SEric Biggers state->x[12] += 8; 65*74750aa7SEric Biggers } 66*74750aa7SEric Biggers if (bytes > CHACHA_BLOCK_SIZE * 4) { 67*74750aa7SEric Biggers chacha_8block_xor_avx512vl(state, dst, src, bytes, 68*74750aa7SEric Biggers nrounds); 69*74750aa7SEric Biggers state->x[12] += chacha_advance(bytes, 8); 70*74750aa7SEric Biggers return; 71*74750aa7SEric Biggers } 72*74750aa7SEric Biggers if (bytes > CHACHA_BLOCK_SIZE * 2) { 73*74750aa7SEric Biggers chacha_4block_xor_avx512vl(state, dst, src, bytes, 74*74750aa7SEric Biggers nrounds); 75*74750aa7SEric Biggers state->x[12] += chacha_advance(bytes, 4); 76*74750aa7SEric Biggers return; 77*74750aa7SEric Biggers } 78*74750aa7SEric Biggers if (bytes) { 79*74750aa7SEric Biggers chacha_2block_xor_avx512vl(state, dst, src, bytes, 80*74750aa7SEric Biggers nrounds); 81*74750aa7SEric Biggers state->x[12] += chacha_advance(bytes, 2); 82*74750aa7SEric Biggers return; 83*74750aa7SEric Biggers } 84*74750aa7SEric Biggers } 85*74750aa7SEric Biggers 86*74750aa7SEric Biggers if (static_branch_likely(&chacha_use_avx2)) { 87*74750aa7SEric Biggers while (bytes >= CHACHA_BLOCK_SIZE * 8) { 88*74750aa7SEric Biggers chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); 89*74750aa7SEric Biggers bytes -= CHACHA_BLOCK_SIZE * 8; 90*74750aa7SEric Biggers src += CHACHA_BLOCK_SIZE * 8; 91*74750aa7SEric Biggers dst += CHACHA_BLOCK_SIZE * 8; 92*74750aa7SEric Biggers state->x[12] += 8; 93*74750aa7SEric Biggers } 94*74750aa7SEric Biggers if (bytes > CHACHA_BLOCK_SIZE * 4) { 95*74750aa7SEric Biggers chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); 96*74750aa7SEric Biggers state->x[12] += chacha_advance(bytes, 8); 97*74750aa7SEric Biggers return; 98*74750aa7SEric Biggers } 99*74750aa7SEric Biggers if (bytes > CHACHA_BLOCK_SIZE * 2) { 100*74750aa7SEric Biggers chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); 101*74750aa7SEric Biggers state->x[12] += chacha_advance(bytes, 4); 102*74750aa7SEric Biggers return; 103*74750aa7SEric Biggers } 104*74750aa7SEric Biggers if (bytes > CHACHA_BLOCK_SIZE) { 105*74750aa7SEric Biggers chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); 106*74750aa7SEric Biggers state->x[12] += chacha_advance(bytes, 2); 107*74750aa7SEric Biggers return; 108*74750aa7SEric Biggers } 109*74750aa7SEric Biggers } 110*74750aa7SEric Biggers 111*74750aa7SEric Biggers while (bytes >= CHACHA_BLOCK_SIZE * 4) { 112*74750aa7SEric Biggers chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); 113*74750aa7SEric Biggers bytes -= CHACHA_BLOCK_SIZE * 4; 114*74750aa7SEric Biggers src += CHACHA_BLOCK_SIZE * 4; 115*74750aa7SEric Biggers dst += CHACHA_BLOCK_SIZE * 4; 116*74750aa7SEric Biggers state->x[12] += 4; 117*74750aa7SEric Biggers } 118*74750aa7SEric Biggers if (bytes > CHACHA_BLOCK_SIZE) { 119*74750aa7SEric Biggers chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); 120*74750aa7SEric Biggers state->x[12] += chacha_advance(bytes, 4); 121*74750aa7SEric Biggers return; 122*74750aa7SEric Biggers } 123*74750aa7SEric Biggers if (bytes) { 124*74750aa7SEric Biggers chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); 125*74750aa7SEric Biggers state->x[12]++; 126*74750aa7SEric Biggers } 127*74750aa7SEric Biggers } 128*74750aa7SEric Biggers 129*74750aa7SEric Biggers void hchacha_block_arch(const struct chacha_state *state, 130*74750aa7SEric Biggers u32 out[HCHACHA_OUT_WORDS], int nrounds) 131*74750aa7SEric Biggers { 132*74750aa7SEric Biggers if (!static_branch_likely(&chacha_use_simd)) { 133*74750aa7SEric Biggers hchacha_block_generic(state, out, nrounds); 134*74750aa7SEric Biggers } else { 135*74750aa7SEric Biggers kernel_fpu_begin(); 136*74750aa7SEric Biggers hchacha_block_ssse3(state, out, nrounds); 137*74750aa7SEric Biggers kernel_fpu_end(); 138*74750aa7SEric Biggers } 139*74750aa7SEric Biggers } 140*74750aa7SEric Biggers EXPORT_SYMBOL(hchacha_block_arch); 141*74750aa7SEric Biggers 142*74750aa7SEric Biggers void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, 143*74750aa7SEric Biggers unsigned int bytes, int nrounds) 144*74750aa7SEric Biggers { 145*74750aa7SEric Biggers if (!static_branch_likely(&chacha_use_simd) || 146*74750aa7SEric Biggers bytes <= CHACHA_BLOCK_SIZE) 147*74750aa7SEric Biggers return chacha_crypt_generic(state, dst, src, bytes, nrounds); 148*74750aa7SEric Biggers 149*74750aa7SEric Biggers do { 150*74750aa7SEric Biggers unsigned int todo = min_t(unsigned int, bytes, SZ_4K); 151*74750aa7SEric Biggers 152*74750aa7SEric Biggers kernel_fpu_begin(); 153*74750aa7SEric Biggers chacha_dosimd(state, dst, src, todo, nrounds); 154*74750aa7SEric Biggers kernel_fpu_end(); 155*74750aa7SEric Biggers 156*74750aa7SEric Biggers bytes -= todo; 157*74750aa7SEric Biggers src += todo; 158*74750aa7SEric Biggers dst += todo; 159*74750aa7SEric Biggers } while (bytes); 160*74750aa7SEric Biggers } 161*74750aa7SEric Biggers EXPORT_SYMBOL(chacha_crypt_arch); 162*74750aa7SEric Biggers 163*74750aa7SEric Biggers bool chacha_is_arch_optimized(void) 164*74750aa7SEric Biggers { 165*74750aa7SEric Biggers return static_key_enabled(&chacha_use_simd); 166*74750aa7SEric Biggers } 167*74750aa7SEric Biggers EXPORT_SYMBOL(chacha_is_arch_optimized); 168*74750aa7SEric Biggers 169*74750aa7SEric Biggers static int __init chacha_simd_mod_init(void) 170*74750aa7SEric Biggers { 171*74750aa7SEric Biggers if (!boot_cpu_has(X86_FEATURE_SSSE3)) 172*74750aa7SEric Biggers return 0; 173*74750aa7SEric Biggers 174*74750aa7SEric Biggers static_branch_enable(&chacha_use_simd); 175*74750aa7SEric Biggers 176*74750aa7SEric Biggers if (boot_cpu_has(X86_FEATURE_AVX) && 177*74750aa7SEric Biggers boot_cpu_has(X86_FEATURE_AVX2) && 178*74750aa7SEric Biggers cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { 179*74750aa7SEric Biggers static_branch_enable(&chacha_use_avx2); 180*74750aa7SEric Biggers 181*74750aa7SEric Biggers if (boot_cpu_has(X86_FEATURE_AVX512VL) && 182*74750aa7SEric Biggers boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ 183*74750aa7SEric Biggers static_branch_enable(&chacha_use_avx512vl); 184*74750aa7SEric Biggers } 185*74750aa7SEric Biggers return 0; 186*74750aa7SEric Biggers } 187*74750aa7SEric Biggers subsys_initcall(chacha_simd_mod_init); 188*74750aa7SEric Biggers 189*74750aa7SEric Biggers static void __exit chacha_simd_mod_exit(void) 190*74750aa7SEric Biggers { 191*74750aa7SEric Biggers } 192*74750aa7SEric Biggers module_exit(chacha_simd_mod_exit); 193*74750aa7SEric Biggers 194*74750aa7SEric Biggers MODULE_LICENSE("GPL"); 195*74750aa7SEric Biggers MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); 196*74750aa7SEric Biggers MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)"); 197