1*4a32e5dcSEric Biggers // SPDX-License-Identifier: GPL-2.0
2*4a32e5dcSEric Biggers /*
3*4a32e5dcSEric Biggers * ChaCha and HChaCha functions (ARM optimized)
4*4a32e5dcSEric Biggers *
5*4a32e5dcSEric Biggers * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
6*4a32e5dcSEric Biggers * Copyright (C) 2015 Martin Willi
7*4a32e5dcSEric Biggers */
8*4a32e5dcSEric Biggers
9*4a32e5dcSEric Biggers #include <crypto/chacha.h>
10*4a32e5dcSEric Biggers #include <crypto/internal/simd.h>
11*4a32e5dcSEric Biggers #include <linux/jump_label.h>
12*4a32e5dcSEric Biggers #include <linux/kernel.h>
13*4a32e5dcSEric Biggers #include <linux/module.h>
14*4a32e5dcSEric Biggers
15*4a32e5dcSEric Biggers #include <asm/cputype.h>
16*4a32e5dcSEric Biggers #include <asm/hwcap.h>
17*4a32e5dcSEric Biggers #include <asm/neon.h>
18*4a32e5dcSEric Biggers #include <asm/simd.h>
19*4a32e5dcSEric Biggers
20*4a32e5dcSEric Biggers asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
21*4a32e5dcSEric Biggers u8 *dst, const u8 *src, int nrounds);
22*4a32e5dcSEric Biggers asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
23*4a32e5dcSEric Biggers u8 *dst, const u8 *src,
24*4a32e5dcSEric Biggers int nrounds, unsigned int nbytes);
25*4a32e5dcSEric Biggers asmlinkage void hchacha_block_arm(const struct chacha_state *state,
26*4a32e5dcSEric Biggers u32 out[HCHACHA_OUT_WORDS], int nrounds);
27*4a32e5dcSEric Biggers asmlinkage void hchacha_block_neon(const struct chacha_state *state,
28*4a32e5dcSEric Biggers u32 out[HCHACHA_OUT_WORDS], int nrounds);
29*4a32e5dcSEric Biggers
30*4a32e5dcSEric Biggers asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
31*4a32e5dcSEric Biggers const struct chacha_state *state, int nrounds);
32*4a32e5dcSEric Biggers
33*4a32e5dcSEric Biggers static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
34*4a32e5dcSEric Biggers
neon_usable(void)35*4a32e5dcSEric Biggers static inline bool neon_usable(void)
36*4a32e5dcSEric Biggers {
37*4a32e5dcSEric Biggers return static_branch_likely(&use_neon) && crypto_simd_usable();
38*4a32e5dcSEric Biggers }
39*4a32e5dcSEric Biggers
/*
 * XOR @bytes of keystream derived from @state into @dst from @src using
 * the NEON implementation.  Caller must hold kernel_neon_begin().
 * Advances the block counter (state->x[12]) by the number of blocks
 * consumed, including a final partial block.
 */
static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
	u8 tail[CHACHA_BLOCK_SIZE];

	/* Bulk phase: hand up to 4 blocks at a time to the NEON core. */
	while (bytes > CHACHA_BLOCK_SIZE) {
		unsigned int len = min(bytes, CHACHA_BLOCK_SIZE * 4U);

		chacha_4block_xor_neon(state, dst, src, nrounds, len);
		state->x[12] += DIV_ROUND_UP(len, CHACHA_BLOCK_SIZE);
		bytes -= len;
		src += len;
		dst += len;
	}

	/* Tail phase: at most one (possibly partial) block remains. */
	if (bytes) {
		const u8 *in = src;
		u8 *out = dst;

		/* Partial block: bounce through a full-size stack buffer. */
		if (bytes != CHACHA_BLOCK_SIZE)
			in = out = memcpy(tail, src, bytes);
		chacha_block_xor_neon(state, out, in, nrounds);
		if (out != dst)
			memcpy(dst, tail, bytes);
		state->x[12]++;
	}
}
66*4a32e5dcSEric Biggers
/*
 * HChaCha core: derive HCHACHA_OUT_WORDS words of output from @state.
 * Uses the NEON implementation when the kernel supports it and the
 * current context allows SIMD; otherwise falls back to scalar ARM asm.
 */
void hchacha_block_arch(const struct chacha_state *state,
			u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && neon_usable()) {
		kernel_neon_begin();
		hchacha_block_neon(state, out, nrounds);
		kernel_neon_end();
	} else {
		hchacha_block_arm(state, out, nrounds);
	}
}
EXPORT_SYMBOL(hchacha_block_arch);
79*4a32e5dcSEric Biggers
/*
 * Encrypt/decrypt @bytes from @src to @dst with ChaCha.  Short inputs
 * (a single block or less) and non-NEON contexts use the scalar ARM
 * implementation; everything else is fed to the NEON code in chunks of
 * at most 4 KiB so NEON sections stay short and preemption latency is
 * bounded.
 */
void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
		       unsigned int bytes, int nrounds)
{
	bool scalar = !IS_ENABLED(CONFIG_KERNEL_MODE_NEON) ||
		      !neon_usable() || bytes <= CHACHA_BLOCK_SIZE;

	if (scalar) {
		chacha_doarm(dst, src, bytes, state, nrounds);
		state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
		return;
	}

	/* bytes > CHACHA_BLOCK_SIZE here, so the loop runs at least once. */
	while (bytes) {
		unsigned int chunk = min_t(unsigned int, bytes, SZ_4K);

		kernel_neon_begin();
		chacha_doneon(state, dst, src, chunk, nrounds);
		kernel_neon_end();

		bytes -= chunk;
		src += chunk;
		dst += chunk;
	}
}
EXPORT_SYMBOL(chacha_crypt_arch);
103*4a32e5dcSEric Biggers
/*
 * Report whether an arch-optimized ChaCha implementation is available.
 * On ARM this is unconditionally true: even without NEON, the scalar
 * ARM assembly implementation (chacha_doarm) is always usable.
 */
bool chacha_is_arch_optimized(void)
{
	/* We always can use at least the ARM scalar implementation. */
	return true;
}
EXPORT_SYMBOL(chacha_is_arch_optimized);
110*4a32e5dcSEric Biggers
chacha_arm_mod_init(void)111*4a32e5dcSEric Biggers static int __init chacha_arm_mod_init(void)
112*4a32e5dcSEric Biggers {
113*4a32e5dcSEric Biggers if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
114*4a32e5dcSEric Biggers switch (read_cpuid_part()) {
115*4a32e5dcSEric Biggers case ARM_CPU_PART_CORTEX_A7:
116*4a32e5dcSEric Biggers case ARM_CPU_PART_CORTEX_A5:
117*4a32e5dcSEric Biggers /*
118*4a32e5dcSEric Biggers * The Cortex-A7 and Cortex-A5 do not perform well with
119*4a32e5dcSEric Biggers * the NEON implementation but do incredibly with the
120*4a32e5dcSEric Biggers * scalar one and use less power.
121*4a32e5dcSEric Biggers */
122*4a32e5dcSEric Biggers break;
123*4a32e5dcSEric Biggers default:
124*4a32e5dcSEric Biggers static_branch_enable(&use_neon);
125*4a32e5dcSEric Biggers }
126*4a32e5dcSEric Biggers }
127*4a32e5dcSEric Biggers return 0;
128*4a32e5dcSEric Biggers }
129*4a32e5dcSEric Biggers subsys_initcall(chacha_arm_mod_init);
130*4a32e5dcSEric Biggers
/*
 * Nothing to undo here (the static key is simply left in place);
 * the empty exit handler presumably exists so the module remains
 * unloadable -- NOTE(review): confirm against module core requirements.
 */
static void __exit chacha_arm_mod_exit(void)
{
}
module_exit(chacha_arm_mod_exit);
135*4a32e5dcSEric Biggers
136*4a32e5dcSEric Biggers MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)");
137*4a32e5dcSEric Biggers MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
138*4a32e5dcSEric Biggers MODULE_LICENSE("GPL v2");
139