/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha and HChaCha functions (x86_64 optimized)
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <asm/simd.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/sizes.h>

/*
 * Assembly routines.  Each chacha_Nblock_xor_* variant keystream-XORs up to
 * N ChaCha blocks of 'src' into 'dst'; 'len' may be shorter than N full
 * blocks (partial tails are handled by the assembly).  The chacha_state is
 * not advanced by these routines — the callers below update the block
 * counter themselves.
 */
asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state,
				       u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state,
					u8 *dst, const u8 *src,
					unsigned int len, int nrounds);
asmlinkage void hchacha_block_ssse3(const struct chacha_state *state,
				    u32 out[HCHACHA_OUT_WORDS], int nrounds);

asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state,
				       u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state,
				       u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state,
				       u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);

asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state,
					   u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state,
					   u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state,
					   u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);

/* Best available implementation, selected once at boot in chacha_mod_init_arch(). */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);

/*
 * Number of whole ChaCha blocks consumed when an N-block assembly routine
 * (N == maxblocks) was handed 'len' bytes: 'len' capped at N blocks, then
 * rounded up so a partial trailing block still counts as one.
 */
static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
{
	len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
	return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
}

/*
 * Keystream-XOR 'bytes' bytes of 'src' into 'dst' using the widest SIMD
 * routine enabled at boot, advancing the block counter in state->x[12] as
 * blocks are consumed.  Must be called between kernel_fpu_begin() and
 * kernel_fpu_end() (see chacha_crypt_arch()).
 *
 * Dispatch strategy for each tier: bulk data goes through the widest
 * (8-block) routine in a loop, then the tail is handed to the narrowest
 * routine that still covers it.  Note the deliberate asymmetry in the
 * tails: the AVX512VL 2-block routine handles any remaining length
 * (if (bytes)), whereas the AVX2 tier only handles tails larger than one
 * block and lets a final <= 1-block remainder fall through to the SSSE3
 * single-block routine below.
 */
static void chacha_dosimd(struct chacha_state *state, u8 *dst,
			  const u8 *src,
			  unsigned int bytes, int nrounds)
{
	if (static_branch_likely(&chacha_use_avx512vl)) {
		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
			chacha_8block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			bytes -= CHACHA_BLOCK_SIZE * 8;
			src += CHACHA_BLOCK_SIZE * 8;
			dst += CHACHA_BLOCK_SIZE * 8;
			state->x[12] += 8;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 4) {
			chacha_8block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state->x[12] += chacha_advance(bytes, 8);
			return;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 2) {
			chacha_4block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state->x[12] += chacha_advance(bytes, 4);
			return;
		}
		if (bytes) {
			chacha_2block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state->x[12] += chacha_advance(bytes, 2);
			return;
		}
	}

	if (static_branch_likely(&chacha_use_avx2)) {
		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
			bytes -= CHACHA_BLOCK_SIZE * 8;
			src += CHACHA_BLOCK_SIZE * 8;
			dst += CHACHA_BLOCK_SIZE * 8;
			state->x[12] += 8;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 4) {
			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
			state->x[12] += chacha_advance(bytes, 8);
			return;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 2) {
			chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
			state->x[12] += chacha_advance(bytes, 4);
			return;
		}
		if (bytes > CHACHA_BLOCK_SIZE) {
			chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
			state->x[12] += chacha_advance(bytes, 2);
			return;
		}
	}

	/* SSSE3 baseline; also the <= 1-block tail path for the AVX2 tier. */
	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		bytes -= CHACHA_BLOCK_SIZE * 4;
		src += CHACHA_BLOCK_SIZE * 4;
		dst += CHACHA_BLOCK_SIZE * 4;
		state->x[12] += 4;
	}
	if (bytes > CHACHA_BLOCK_SIZE) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		state->x[12] += chacha_advance(bytes, 4);
		return;
	}
	if (bytes) {
		chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
		state->x[12]++;
	}
}

/*
 * HChaCha core: uses the SSSE3 routine inside an FPU section when SIMD is
 * available, otherwise falls back to the generic C implementation.
 */
static void hchacha_block_arch(const struct chacha_state *state,
			       u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
	if (!static_branch_likely(&chacha_use_simd)) {
		hchacha_block_generic(state, out, nrounds);
	} else {
		kernel_fpu_begin();
		hchacha_block_ssse3(state, out, nrounds);
		kernel_fpu_end();
	}
}

/*
 * Arch entry point for ChaCha en/decryption.  Falls back to the generic
 * implementation when SIMD is unavailable or for inputs of at most one
 * block, where the kernel_fpu_begin()/end() overhead isn't worthwhile.
 * Larger inputs are processed in SZ_4K chunks, each in its own FPU
 * section, to bound the time spent with the FPU claimed.
 */
static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
			      const u8 *src, unsigned int bytes, int nrounds)
{
	if (!static_branch_likely(&chacha_use_simd) ||
	    bytes <= CHACHA_BLOCK_SIZE)
		return chacha_crypt_generic(state, dst, src, bytes, nrounds);

	do {
		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);

		kernel_fpu_begin();
		chacha_dosimd(state, dst, src, todo, nrounds);
		kernel_fpu_end();

		bytes -= todo;
		src += todo;
		dst += todo;
	} while (bytes);
}

/*
 * Boot-time selection: enable the static keys for the best instruction-set
 * tier the CPU (and the OS's xsave state) supports.  SSSE3 is the minimum
 * requirement for any SIMD path.
 */
#define chacha_mod_init_arch chacha_mod_init_arch
static void chacha_mod_init_arch(void)
{
	if (!boot_cpu_has(X86_FEATURE_SSSE3))
		return;

	static_branch_enable(&chacha_use_simd);

	if (boot_cpu_has(X86_FEATURE_AVX) &&
	    boot_cpu_has(X86_FEATURE_AVX2) &&
	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
		static_branch_enable(&chacha_use_avx2);

		if (boot_cpu_has(X86_FEATURE_AVX512VL) &&
		    boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
			static_branch_enable(&chacha_use_avx512vl);
	}
}