xref: /linux/lib/crypto/x86/chacha_glue.c (revision 13150742b09e720fdf021de14cd2b98b37415a89)
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ChaCha and HChaCha functions (x86_64 optimized)
 *
 * Copyright (C) 2015 Martin Willi
 */
7*74750aa7SEric Biggers 
8*74750aa7SEric Biggers #include <asm/simd.h>
9*74750aa7SEric Biggers #include <crypto/chacha.h>
10*74750aa7SEric Biggers #include <linux/jump_label.h>
11*74750aa7SEric Biggers #include <linux/kernel.h>
12*74750aa7SEric Biggers #include <linux/module.h>
13*74750aa7SEric Biggers #include <linux/sizes.h>
14*74750aa7SEric Biggers 
15*74750aa7SEric Biggers asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state,
16*74750aa7SEric Biggers 				       u8 *dst, const u8 *src,
17*74750aa7SEric Biggers 				       unsigned int len, int nrounds);
18*74750aa7SEric Biggers asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state,
19*74750aa7SEric Biggers 					u8 *dst, const u8 *src,
20*74750aa7SEric Biggers 					unsigned int len, int nrounds);
21*74750aa7SEric Biggers asmlinkage void hchacha_block_ssse3(const struct chacha_state *state,
22*74750aa7SEric Biggers 				    u32 out[HCHACHA_OUT_WORDS], int nrounds);
23*74750aa7SEric Biggers 
24*74750aa7SEric Biggers asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state,
25*74750aa7SEric Biggers 				       u8 *dst, const u8 *src,
26*74750aa7SEric Biggers 				       unsigned int len, int nrounds);
27*74750aa7SEric Biggers asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state,
28*74750aa7SEric Biggers 				       u8 *dst, const u8 *src,
29*74750aa7SEric Biggers 				       unsigned int len, int nrounds);
30*74750aa7SEric Biggers asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state,
31*74750aa7SEric Biggers 				       u8 *dst, const u8 *src,
32*74750aa7SEric Biggers 				       unsigned int len, int nrounds);
33*74750aa7SEric Biggers 
34*74750aa7SEric Biggers asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state,
35*74750aa7SEric Biggers 					   u8 *dst, const u8 *src,
36*74750aa7SEric Biggers 					   unsigned int len, int nrounds);
37*74750aa7SEric Biggers asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state,
38*74750aa7SEric Biggers 					   u8 *dst, const u8 *src,
39*74750aa7SEric Biggers 					   unsigned int len, int nrounds);
40*74750aa7SEric Biggers asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state,
41*74750aa7SEric Biggers 					   u8 *dst, const u8 *src,
42*74750aa7SEric Biggers 					   unsigned int len, int nrounds);
43*74750aa7SEric Biggers 
44*74750aa7SEric Biggers static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
45*74750aa7SEric Biggers static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
46*74750aa7SEric Biggers static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
47*74750aa7SEric Biggers 
chacha_advance(unsigned int len,unsigned int maxblocks)48*74750aa7SEric Biggers static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
49*74750aa7SEric Biggers {
50*74750aa7SEric Biggers 	len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
51*74750aa7SEric Biggers 	return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
52*74750aa7SEric Biggers }
53*74750aa7SEric Biggers 
/*
 * Process @bytes bytes from @src into @dst with the widest SIMD
 * implementation that has been enabled, advancing state->x[12] by the number
 * of blocks consumed (a trailing partial block counts as one).
 *
 * Each tier first consumes full runs of its largest multi-block routine and
 * then hands the remainder to the narrowest routine that still covers it.
 * The AVX2 tier leaves a remainder of at most one block to the SSSE3 tier.
 *
 * The caller is expected to have entered an FPU section (see
 * chacha_crypt_arch()).
 */
static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
	const unsigned int one_block = CHACHA_BLOCK_SIZE;
	const unsigned int two_blocks = CHACHA_BLOCK_SIZE * 2;
	const unsigned int four_blocks = CHACHA_BLOCK_SIZE * 4;
	const unsigned int eight_blocks = CHACHA_BLOCK_SIZE * 8;

	if (static_branch_likely(&chacha_use_avx512vl)) {
		/* Bulk: full groups of eight blocks. */
		while (bytes >= eight_blocks) {
			chacha_8block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state->x[12] += 8;
			dst += eight_blocks;
			src += eight_blocks;
			bytes -= eight_blocks;
		}

		/* Nothing left: every later path would be a no-op too. */
		if (!bytes)
			return;

		/* Tail: this tier handles any non-empty remainder itself. */
		if (bytes > four_blocks) {
			chacha_8block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state->x[12] += chacha_advance(bytes, 8);
		} else if (bytes > two_blocks) {
			chacha_4block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state->x[12] += chacha_advance(bytes, 4);
		} else {
			chacha_2block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state->x[12] += chacha_advance(bytes, 2);
		}
		return;
	}

	if (static_branch_likely(&chacha_use_avx2)) {
		/* Bulk: full groups of eight blocks. */
		while (bytes >= eight_blocks) {
			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
			state->x[12] += 8;
			dst += eight_blocks;
			src += eight_blocks;
			bytes -= eight_blocks;
		}

		/*
		 * Tail: only remainders larger than one block are handled
		 * here; anything smaller drops through to the SSSE3 code.
		 */
		if (bytes > one_block) {
			if (bytes > four_blocks) {
				chacha_8block_xor_avx2(state, dst, src, bytes,
						       nrounds);
				state->x[12] += chacha_advance(bytes, 8);
			} else if (bytes > two_blocks) {
				chacha_4block_xor_avx2(state, dst, src, bytes,
						       nrounds);
				state->x[12] += chacha_advance(bytes, 4);
			} else {
				chacha_2block_xor_avx2(state, dst, src, bytes,
						       nrounds);
				state->x[12] += chacha_advance(bytes, 2);
			}
			return;
		}
	}

	/* SSSE3 bulk: full groups of four blocks. */
	while (bytes >= four_blocks) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		state->x[12] += 4;
		dst += four_blocks;
		src += four_blocks;
		bytes -= four_blocks;
	}

	/* SSSE3 tail: more than one block, or a single (possibly partial) one. */
	if (bytes > one_block) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		state->x[12] += chacha_advance(bytes, 4);
	} else if (bytes) {
		chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
		state->x[12]++;
	}
}
128*74750aa7SEric Biggers 
/*
 * HChaCha entry point for x86: run the SSSE3 routine inside an FPU section
 * when SIMD has been enabled, otherwise defer to the generic implementation.
 * The result is written to @out (HCHACHA_OUT_WORDS 32-bit words).
 */
void hchacha_block_arch(const struct chacha_state *state,
			u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
	if (static_branch_likely(&chacha_use_simd)) {
		kernel_fpu_begin();
		hchacha_block_ssse3(state, out, nrounds);
		kernel_fpu_end();
		return;
	}

	hchacha_block_generic(state, out, nrounds);
}
EXPORT_SYMBOL(hchacha_block_arch);
141*74750aa7SEric Biggers 
/*
 * ChaCha entry point for x86.
 *
 * Requests of at most one block, and all requests when SIMD is not enabled,
 * are forwarded to the generic implementation.  Everything else is handed to
 * chacha_dosimd() in pieces of at most SZ_4K bytes, with a separate
 * kernel_fpu_begin()/kernel_fpu_end() pair around each piece.
 * NOTE(review): the 4K cap presumably bounds the time spent inside a single
 * FPU section -- confirm.
 */
void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
		       unsigned int bytes, int nrounds)
{
	unsigned int chunk;

	if (!static_branch_likely(&chacha_use_simd) ||
	    bytes <= CHACHA_BLOCK_SIZE) {
		chacha_crypt_generic(state, dst, src, bytes, nrounds);
		return;
	}

	/* bytes is larger than one block here, so the loop runs at least once. */
	while (bytes) {
		chunk = min_t(unsigned int, bytes, SZ_4K);

		kernel_fpu_begin();
		chacha_dosimd(state, dst, src, chunk, nrounds);
		kernel_fpu_end();

		dst += chunk;
		src += chunk;
		bytes -= chunk;
	}
}
EXPORT_SYMBOL(chacha_crypt_arch);
162*74750aa7SEric Biggers 
chacha_is_arch_optimized(void)163*74750aa7SEric Biggers bool chacha_is_arch_optimized(void)
164*74750aa7SEric Biggers {
165*74750aa7SEric Biggers 	return static_key_enabled(&chacha_use_simd);
166*74750aa7SEric Biggers }
167*74750aa7SEric Biggers EXPORT_SYMBOL(chacha_is_arch_optimized);
168*74750aa7SEric Biggers 
chacha_simd_mod_init(void)169*74750aa7SEric Biggers static int __init chacha_simd_mod_init(void)
170*74750aa7SEric Biggers {
171*74750aa7SEric Biggers 	if (!boot_cpu_has(X86_FEATURE_SSSE3))
172*74750aa7SEric Biggers 		return 0;
173*74750aa7SEric Biggers 
174*74750aa7SEric Biggers 	static_branch_enable(&chacha_use_simd);
175*74750aa7SEric Biggers 
176*74750aa7SEric Biggers 	if (boot_cpu_has(X86_FEATURE_AVX) &&
177*74750aa7SEric Biggers 	    boot_cpu_has(X86_FEATURE_AVX2) &&
178*74750aa7SEric Biggers 	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
179*74750aa7SEric Biggers 		static_branch_enable(&chacha_use_avx2);
180*74750aa7SEric Biggers 
181*74750aa7SEric Biggers 		if (boot_cpu_has(X86_FEATURE_AVX512VL) &&
182*74750aa7SEric Biggers 		    boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
183*74750aa7SEric Biggers 			static_branch_enable(&chacha_use_avx512vl);
184*74750aa7SEric Biggers 	}
185*74750aa7SEric Biggers 	return 0;
186*74750aa7SEric Biggers }
187*74750aa7SEric Biggers subsys_initcall(chacha_simd_mod_init);
188*74750aa7SEric Biggers 
chacha_simd_mod_exit(void)189*74750aa7SEric Biggers static void __exit chacha_simd_mod_exit(void)
190*74750aa7SEric Biggers {
191*74750aa7SEric Biggers }
192*74750aa7SEric Biggers module_exit(chacha_simd_mod_exit);
193*74750aa7SEric Biggers 
194*74750aa7SEric Biggers MODULE_LICENSE("GPL");
195*74750aa7SEric Biggers MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
196*74750aa7SEric Biggers MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)");
197