xref: /linux/lib/crypto/x86/chacha_glue.c (revision 13150742b09e720fdf021de14cd2b98b37415a89)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * ChaCha and HChaCha functions (x86_64 optimized)
4  *
5  * Copyright (C) 2015 Martin Willi
6  */
7 
8 #include <asm/simd.h>
9 #include <crypto/chacha.h>
10 #include <linux/jump_label.h>
11 #include <linux/kernel.h>
12 #include <linux/module.h>
13 #include <linux/sizes.h>
14 
15 asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state,
16 				       u8 *dst, const u8 *src,
17 				       unsigned int len, int nrounds);
18 asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state,
19 					u8 *dst, const u8 *src,
20 					unsigned int len, int nrounds);
21 asmlinkage void hchacha_block_ssse3(const struct chacha_state *state,
22 				    u32 out[HCHACHA_OUT_WORDS], int nrounds);
23 
24 asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state,
25 				       u8 *dst, const u8 *src,
26 				       unsigned int len, int nrounds);
27 asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state,
28 				       u8 *dst, const u8 *src,
29 				       unsigned int len, int nrounds);
30 asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state,
31 				       u8 *dst, const u8 *src,
32 				       unsigned int len, int nrounds);
33 
34 asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state,
35 					   u8 *dst, const u8 *src,
36 					   unsigned int len, int nrounds);
37 asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state,
38 					   u8 *dst, const u8 *src,
39 					   unsigned int len, int nrounds);
40 asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state,
41 					   u8 *dst, const u8 *src,
42 					   unsigned int len, int nrounds);
43 
44 static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
45 static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
46 static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
47 
chacha_advance(unsigned int len,unsigned int maxblocks)48 static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
49 {
50 	len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
51 	return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
52 }
53 
/*
 * XOR @bytes bytes of @src into @dst using the widest SIMD kernel enabled
 * at boot, advancing the block counter (state->x[12]) by the number of
 * ChaCha blocks consumed.  Caller must be inside a
 * kernel_fpu_begin()/kernel_fpu_end() section.
 *
 * Each multi-block assembly routine accepts a partial trailing length, so
 * for the final chunk a kernel is chosen that is wide enough to cover the
 * remainder while wasting at most half of its capacity (hence the strict
 * '>' comparisons below).
 */
static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
	if (static_branch_likely(&chacha_use_avx512vl)) {
		/* Full-width 8-block stripes first. */
		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
			chacha_8block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			bytes -= CHACHA_BLOCK_SIZE * 8;
			src += CHACHA_BLOCK_SIZE * 8;
			dst += CHACHA_BLOCK_SIZE * 8;
			state->x[12] += 8;
		}
		/* Tail: narrowest kernel that still covers the remainder. */
		if (bytes > CHACHA_BLOCK_SIZE * 4) {
			chacha_8block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state->x[12] += chacha_advance(bytes, 8);
			return;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 2) {
			chacha_4block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state->x[12] += chacha_advance(bytes, 4);
			return;
		}
		if (bytes) {
			chacha_2block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state->x[12] += chacha_advance(bytes, 2);
			return;
		}
		/* bytes == 0: fall through, the loops below are no-ops. */
	}

	if (static_branch_likely(&chacha_use_avx2)) {
		/* Full-width 8-block stripes first. */
		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
			bytes -= CHACHA_BLOCK_SIZE * 8;
			src += CHACHA_BLOCK_SIZE * 8;
			dst += CHACHA_BLOCK_SIZE * 8;
			state->x[12] += 8;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 4) {
			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
			state->x[12] += chacha_advance(bytes, 8);
			return;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 2) {
			chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
			state->x[12] += chacha_advance(bytes, 4);
			return;
		}
		if (bytes > CHACHA_BLOCK_SIZE) {
			chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
			state->x[12] += chacha_advance(bytes, 2);
			return;
		}
		/* <= 1 block left: the SSSE3 single-block kernel handles it. */
	}

	/* SSSE3 baseline: 4-block stripes, then a 4- or 1-block tail. */
	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		bytes -= CHACHA_BLOCK_SIZE * 4;
		src += CHACHA_BLOCK_SIZE * 4;
		dst += CHACHA_BLOCK_SIZE * 4;
		state->x[12] += 4;
	}
	if (bytes > CHACHA_BLOCK_SIZE) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		state->x[12] += chacha_advance(bytes, 4);
		return;
	}
	if (bytes) {
		chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
		state->x[12]++;
	}
}
128 
/*
 * HChaCha core function: run the SSSE3 implementation under FPU
 * protection when SIMD was enabled at boot, otherwise fall back to the
 * generic C implementation.
 */
void hchacha_block_arch(const struct chacha_state *state,
			u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
	if (!static_branch_likely(&chacha_use_simd)) {
		hchacha_block_generic(state, out, nrounds);
		return;
	}

	kernel_fpu_begin();
	hchacha_block_ssse3(state, out, nrounds);
	kernel_fpu_end();
}
EXPORT_SYMBOL(hchacha_block_arch);
141 
/*
 * ChaCha stream XOR of @src into @dst.  Requests of at most one block, or
 * any request when SIMD is unavailable, are handled by the generic code.
 * Larger requests are processed in chunks of at most SZ_4K so each
 * kernel_fpu_begin()/kernel_fpu_end() section stays short.
 */
void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
		       unsigned int bytes, int nrounds)
{
	if (!static_branch_likely(&chacha_use_simd) ||
	    bytes <= CHACHA_BLOCK_SIZE)
		return chacha_crypt_generic(state, dst, src, bytes, nrounds);

	while (bytes) {
		unsigned int chunk = min_t(unsigned int, bytes, SZ_4K);

		kernel_fpu_begin();
		chacha_dosimd(state, dst, src, chunk, nrounds);
		kernel_fpu_end();

		dst += chunk;
		src += chunk;
		bytes -= chunk;
	}
}
EXPORT_SYMBOL(chacha_crypt_arch);
162 
/*
 * Report whether the SIMD code paths were enabled at boot (i.e. the CPU
 * has at least SSSE3, see chacha_simd_mod_init()).
 */
bool chacha_is_arch_optimized(void)
{
	return static_key_enabled(&chacha_use_simd);
}
EXPORT_SYMBOL(chacha_is_arch_optimized);
168 
chacha_simd_mod_init(void)169 static int __init chacha_simd_mod_init(void)
170 {
171 	if (!boot_cpu_has(X86_FEATURE_SSSE3))
172 		return 0;
173 
174 	static_branch_enable(&chacha_use_simd);
175 
176 	if (boot_cpu_has(X86_FEATURE_AVX) &&
177 	    boot_cpu_has(X86_FEATURE_AVX2) &&
178 	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
179 		static_branch_enable(&chacha_use_avx2);
180 
181 		if (boot_cpu_has(X86_FEATURE_AVX512VL) &&
182 		    boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
183 			static_branch_enable(&chacha_use_avx512vl);
184 	}
185 	return 0;
186 }
187 subsys_initcall(chacha_simd_mod_init);
188 
/*
 * Nothing to tear down on module unload; the static keys simply keep the
 * state chosen at init.  The empty exit handler exists so the module can
 * be unloaded at all.
 */
static void __exit chacha_simd_mod_exit(void)
{
}
module_exit(chacha_simd_mod_exit);
193 
194 MODULE_LICENSE("GPL");
195 MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
196 MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)");
197