1*13cecc52SEric Biggers /* SPDX-License-Identifier: GPL-2.0 */
2*13cecc52SEric Biggers /*
3*13cecc52SEric Biggers * ChaCha and HChaCha functions (ARM optimized)
4*13cecc52SEric Biggers *
5*13cecc52SEric Biggers * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
6*13cecc52SEric Biggers * Copyright (C) 2015 Martin Willi
7*13cecc52SEric Biggers */
8*13cecc52SEric Biggers
9*13cecc52SEric Biggers #include <crypto/internal/simd.h>
10*13cecc52SEric Biggers #include <linux/jump_label.h>
11*13cecc52SEric Biggers #include <linux/kernel.h>
12*13cecc52SEric Biggers
13*13cecc52SEric Biggers #include <asm/cputype.h>
14*13cecc52SEric Biggers #include <asm/hwcap.h>
15*13cecc52SEric Biggers #include <asm/neon.h>
16*13cecc52SEric Biggers #include <asm/simd.h>
17*13cecc52SEric Biggers
18*13cecc52SEric Biggers asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
19*13cecc52SEric Biggers u8 *dst, const u8 *src, int nrounds);
20*13cecc52SEric Biggers asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
21*13cecc52SEric Biggers u8 *dst, const u8 *src,
22*13cecc52SEric Biggers int nrounds, unsigned int nbytes);
23*13cecc52SEric Biggers asmlinkage void hchacha_block_arm(const struct chacha_state *state,
24*13cecc52SEric Biggers u32 out[HCHACHA_OUT_WORDS], int nrounds);
25*13cecc52SEric Biggers asmlinkage void hchacha_block_neon(const struct chacha_state *state,
26*13cecc52SEric Biggers u32 out[HCHACHA_OUT_WORDS], int nrounds);
27*13cecc52SEric Biggers
28*13cecc52SEric Biggers asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
29*13cecc52SEric Biggers const struct chacha_state *state, int nrounds);
30*13cecc52SEric Biggers
31*13cecc52SEric Biggers static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
32*13cecc52SEric Biggers
neon_usable(void)33*13cecc52SEric Biggers static inline bool neon_usable(void)
34*13cecc52SEric Biggers {
35*13cecc52SEric Biggers return static_branch_likely(&use_neon) && crypto_simd_usable();
36*13cecc52SEric Biggers }
37*13cecc52SEric Biggers
/*
 * Process @bytes bytes from @src into @dst using the NEON routines.
 *
 * @state:   ChaCha state; word x[12] is advanced by one for every block
 *           processed, with a trailing partial block counted as one block.
 * @dst:     output buffer
 * @src:     input buffer
 * @bytes:   number of bytes to process
 * @nrounds: passed through unchanged to the NEON routines
 *
 * This function does not call kernel_neon_begin()/kernel_neon_end() itself.
 */
static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
	u8 bounce[CHACHA_BLOCK_SIZE];

	/* While more than one block remains, hand over up to four at a time. */
	while (bytes > CHACHA_BLOCK_SIZE) {
		unsigned int chunk = min(bytes, CHACHA_BLOCK_SIZE * 4U);

		chacha_4block_xor_neon(state, dst, src, nrounds, chunk);
		state->x[12] += DIV_ROUND_UP(chunk, CHACHA_BLOCK_SIZE);
		src += chunk;
		dst += chunk;
		bytes -= chunk;
	}

	if (!bytes)
		return;

	if (bytes == CHACHA_BLOCK_SIZE) {
		/* Exactly one block: operate on the caller's buffers directly. */
		chacha_block_xor_neon(state, dst, src, nrounds);
	} else {
		/*
		 * Partial block: the single-block routine takes no length, so
		 * run it in place on a block-sized bounce buffer and copy only
		 * the requested bytes back out.
		 */
		memcpy(bounce, src, bytes);
		chacha_block_xor_neon(state, bounce, bounce, nrounds);
		memcpy(dst, bounce, bytes);
	}
	state->x[12]++;
}
64*13cecc52SEric Biggers
/*
 * Compute HChaCha into @out, choosing between the two implementations.
 *
 * hchacha_block_neon() is used, bracketed by kernel_neon_begin() and
 * kernel_neon_end(), when CONFIG_KERNEL_MODE_NEON is enabled and
 * neon_usable() returns true; otherwise hchacha_block_arm() is used.
 */
static void hchacha_block_arch(const struct chacha_state *state,
			       u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && neon_usable()) {
		kernel_neon_begin();
		hchacha_block_neon(state, out, nrounds);
		kernel_neon_end();
		return;
	}

	hchacha_block_arm(state, out, nrounds);
}
76*13cecc52SEric Biggers
/*
 * Process @bytes bytes from @src into @dst, choosing between the NEON path
 * and chacha_doarm().
 *
 * The NEON path is taken only when CONFIG_KERNEL_MODE_NEON is enabled,
 * neon_usable() returns true and more than one block is requested.  In
 * either path state->x[12] ends up advanced by the number of blocks
 * processed, rounded up.
 */
static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
			      const u8 *src, unsigned int bytes, int nrounds)
{
	unsigned int step;

	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && neon_usable() &&
	    bytes > CHACHA_BLOCK_SIZE) {
		/*
		 * Work in pieces of at most SZ_4K, with a separate
		 * kernel_neon_begin()/kernel_neon_end() pair around each.
		 * bytes is non-zero on entry to this loop.
		 */
		while (bytes) {
			step = min_t(unsigned int, bytes, SZ_4K);

			kernel_neon_begin();
			chacha_doneon(state, dst, src, step, nrounds);
			kernel_neon_end();

			dst += step;
			src += step;
			bytes -= step;
		}
		return;
	}

	/* chacha_doarm() takes a const state, so bump the counter word here. */
	chacha_doarm(dst, src, bytes, state, nrounds);
	state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
}
99*13cecc52SEric Biggers
100*13cecc52SEric Biggers #define chacha_mod_init_arch chacha_mod_init_arch
chacha_mod_init_arch(void)101*13cecc52SEric Biggers static void chacha_mod_init_arch(void)
102*13cecc52SEric Biggers {
103*13cecc52SEric Biggers if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
104*13cecc52SEric Biggers switch (read_cpuid_part()) {
105*13cecc52SEric Biggers case ARM_CPU_PART_CORTEX_A7:
106*13cecc52SEric Biggers case ARM_CPU_PART_CORTEX_A5:
107*13cecc52SEric Biggers /*
108*13cecc52SEric Biggers * The Cortex-A7 and Cortex-A5 do not perform well with
109*13cecc52SEric Biggers * the NEON implementation but do incredibly with the
110*13cecc52SEric Biggers * scalar one and use less power.
111*13cecc52SEric Biggers */
112*13cecc52SEric Biggers break;
113*13cecc52SEric Biggers default:
114*13cecc52SEric Biggers static_branch_enable(&use_neon);
115*13cecc52SEric Biggers }
116*13cecc52SEric Biggers }
117*13cecc52SEric Biggers }
118