1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * ChaCha and HChaCha functions (x86_64 optimized)
4 *
5 * Copyright (C) 2015 Martin Willi
6 */
7
8 #include <asm/simd.h>
9 #include <crypto/chacha.h>
10 #include <linux/jump_label.h>
11 #include <linux/kernel.h>
12 #include <linux/module.h>
13 #include <linux/sizes.h>
14
/*
 * Prototypes of the routines declared asmlinkage and implemented outside this
 * file.  Every *_xor_* routine takes the ChaCha state, a destination buffer,
 * a source buffer, a byte length and the number of rounds.
 *
 * NOTE(review): the "N" in "Nblock" presumably is the maximum number of
 * CHACHA_BLOCK_SIZE-byte blocks one call produces, and a short final block is
 * presumably handled inside the assembly -- confirm against the .S sources.
 */

/* SSSE3 variants; hchacha_block_ssse3() writes HCHACHA_OUT_WORDS words. */
asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state,
				       u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state,
					u8 *dst, const u8 *src,
					unsigned int len, int nrounds);
asmlinkage void hchacha_block_ssse3(const struct chacha_state *state,
				    u32 out[HCHACHA_OUT_WORDS], int nrounds);

/* AVX2 variants */
asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state,
				       u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state,
				       u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state,
				       u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);

/* AVX-512VL variants */
asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state,
					   u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state,
					   u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state,
					   u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);
43
/*
 * Static keys selecting the implementation tier.  All start out false and are
 * only switched on by chacha_simd_mod_init() according to the boot CPU's
 * features:
 *   chacha_use_simd     - SSSE3 is available (gates every SIMD path here)
 *   chacha_use_avx2     - additionally AVX/AVX2 with SSE|YMM xfeatures
 *   chacha_use_avx512vl - additionally AVX512VL and AVX512BW
 */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
47
chacha_advance(unsigned int len,unsigned int maxblocks)48 static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
49 {
50 len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
51 return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
52 }
53
/*
 * XOR @bytes bytes of @src into @dst using the widest implementation whose
 * static key is enabled, and advance state->x[12] by one for every block
 * (full or partial) that was processed.
 *
 * Each tier first loops over full batches (8 blocks for AVX-512VL and AVX2,
 * 4 blocks for SSSE3) and then issues a single call for whatever is left,
 * choosing the smallest routine that still covers the remainder.  The AVX2
 * tier leaves a remainder of at most one block to the SSSE3 code below it.
 *
 * The caller in this file invokes this function between kernel_fpu_begin()
 * and kernel_fpu_end().
 */
static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
	unsigned int maxblocks;

	if (static_branch_likely(&chacha_use_avx512vl)) {
		for (; bytes >= CHACHA_BLOCK_SIZE * 8;
		     bytes -= CHACHA_BLOCK_SIZE * 8) {
			chacha_8block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state->x[12] += 8;
			src += CHACHA_BLOCK_SIZE * 8;
			dst += CHACHA_BLOCK_SIZE * 8;
		}

		/* Any non-empty remainder is finished within this tier. */
		if (bytes) {
			if (bytes > CHACHA_BLOCK_SIZE * 4) {
				maxblocks = 8;
				chacha_8block_xor_avx512vl(state, dst, src,
							   bytes, nrounds);
			} else if (bytes > CHACHA_BLOCK_SIZE * 2) {
				maxblocks = 4;
				chacha_4block_xor_avx512vl(state, dst, src,
							   bytes, nrounds);
			} else {
				maxblocks = 2;
				chacha_2block_xor_avx512vl(state, dst, src,
							   bytes, nrounds);
			}
			state->x[12] += chacha_advance(bytes, maxblocks);
			return;
		}
	}

	if (static_branch_likely(&chacha_use_avx2)) {
		for (; bytes >= CHACHA_BLOCK_SIZE * 8;
		     bytes -= CHACHA_BLOCK_SIZE * 8) {
			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
			state->x[12] += 8;
			src += CHACHA_BLOCK_SIZE * 8;
			dst += CHACHA_BLOCK_SIZE * 8;
		}

		/* A remainder of one block or less falls through to SSSE3. */
		if (bytes > CHACHA_BLOCK_SIZE) {
			if (bytes > CHACHA_BLOCK_SIZE * 4) {
				maxblocks = 8;
				chacha_8block_xor_avx2(state, dst, src, bytes,
						       nrounds);
			} else if (bytes > CHACHA_BLOCK_SIZE * 2) {
				maxblocks = 4;
				chacha_4block_xor_avx2(state, dst, src, bytes,
						       nrounds);
			} else {
				maxblocks = 2;
				chacha_2block_xor_avx2(state, dst, src, bytes,
						       nrounds);
			}
			state->x[12] += chacha_advance(bytes, maxblocks);
			return;
		}
	}

	for (; bytes >= CHACHA_BLOCK_SIZE * 4; bytes -= CHACHA_BLOCK_SIZE * 4) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		state->x[12] += 4;
		src += CHACHA_BLOCK_SIZE * 4;
		dst += CHACHA_BLOCK_SIZE * 4;
	}

	if (bytes > CHACHA_BLOCK_SIZE) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		state->x[12] += chacha_advance(bytes, 4);
	} else if (bytes) {
		/* At most one block left: the single-block routine suffices. */
		chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
		state->x[12]++;
	}
}
128
/*
 * Compute an HChaCha block from @state into @out using @nrounds rounds.
 *
 * When the chacha_use_simd static key is off, the work is delegated to
 * hchacha_block_generic().  Otherwise the SSSE3 assembly routine is run
 * inside a kernel_fpu_begin()/kernel_fpu_end() section.
 */
void hchacha_block_arch(const struct chacha_state *state,
			u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
	if (!static_branch_likely(&chacha_use_simd)) {
		hchacha_block_generic(state, out, nrounds);
		return;
	}

	kernel_fpu_begin();
	hchacha_block_ssse3(state, out, nrounds);
	kernel_fpu_end();
}
EXPORT_SYMBOL(hchacha_block_arch);
141
/*
 * XOR @bytes bytes of @src into @dst with the ChaCha keystream derived from
 * @state, using @nrounds rounds.
 *
 * Requests of at most one block, and all requests when the chacha_use_simd
 * static key is off, are handed to chacha_crypt_generic().  Everything else
 * is processed by chacha_dosimd() in pieces of at most SZ_4K bytes, each
 * piece inside its own kernel_fpu_begin()/kernel_fpu_end() section.
 * NOTE(review): the 4 KiB cap presumably bounds how long a single FPU
 * section lasts -- confirm.
 */
void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
		       unsigned int bytes, int nrounds)
{
	unsigned int chunk;

	if (!static_branch_likely(&chacha_use_simd) ||
	    bytes <= CHACHA_BLOCK_SIZE) {
		chacha_crypt_generic(state, dst, src, bytes, nrounds);
		return;
	}

	/* bytes exceeds one block here, so the loop runs at least once. */
	for (; bytes; bytes -= chunk, src += chunk, dst += chunk) {
		chunk = min_t(unsigned int, bytes, SZ_4K);

		kernel_fpu_begin();
		chacha_dosimd(state, dst, src, chunk, nrounds);
		kernel_fpu_end();
	}
}
EXPORT_SYMBOL(chacha_crypt_arch);
162
chacha_is_arch_optimized(void)163 bool chacha_is_arch_optimized(void)
164 {
165 return static_key_enabled(&chacha_use_simd);
166 }
167 EXPORT_SYMBOL(chacha_is_arch_optimized);
168
chacha_simd_mod_init(void)169 static int __init chacha_simd_mod_init(void)
170 {
171 if (!boot_cpu_has(X86_FEATURE_SSSE3))
172 return 0;
173
174 static_branch_enable(&chacha_use_simd);
175
176 if (boot_cpu_has(X86_FEATURE_AVX) &&
177 boot_cpu_has(X86_FEATURE_AVX2) &&
178 cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
179 static_branch_enable(&chacha_use_avx2);
180
181 if (boot_cpu_has(X86_FEATURE_AVX512VL) &&
182 boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
183 static_branch_enable(&chacha_use_avx512vl);
184 }
185 return 0;
186 }
187 subsys_initcall(chacha_simd_mod_init);
188
/*
 * Module exit handler.  Intentionally empty: this file performs no teardown.
 * NOTE(review): presumably kept so that the module can still be unloaded --
 * confirm.
 */
static void __exit chacha_simd_mod_exit(void)
{
}
module_exit(chacha_simd_mod_exit);
193
/* Module metadata: license, author and one-line description. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)");
197