// SPDX-License-Identifier: GPL-2.0
/*
 * ChaCha and HChaCha functions (ARM optimized)
 *
 * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 * Copyright (C) 2015 Martin Willi
 */

#include <crypto/chacha.h>
#include <crypto/internal/simd.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>

#include <asm/cputype.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>

asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
				      u8 *dst, const u8 *src, int nrounds);
asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
				       u8 *dst, const u8 *src,
				       int nrounds, unsigned int nbytes);
asmlinkage void hchacha_block_arm(const struct chacha_state *state,
				  u32 out[HCHACHA_OUT_WORDS], int nrounds);
asmlinkage void hchacha_block_neon(const struct chacha_state *state,
				   u32 out[HCHACHA_OUT_WORDS], int nrounds);

asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
			     const struct chacha_state *state, int nrounds);

static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);

static inline bool neon_usable(void)
{
	return static_branch_likely(&use_neon) && crypto_simd_usable();
}

static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
	u8 buf[CHACHA_BLOCK_SIZE];

	while (bytes > CHACHA_BLOCK_SIZE) {
		unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);

		chacha_4block_xor_neon(state, dst, src, nrounds, l);
		bytes -= l;
		src += l;
		dst += l;
		state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
	}
	if (bytes) {
		const u8 *s = src;
		u8 *d = dst;

		if (bytes != CHACHA_BLOCK_SIZE)
			s = d = memcpy(buf, src, bytes);
		chacha_block_xor_neon(state, d, s, nrounds);
		if (d != dst)
			memcpy(dst, buf, bytes);
		state->x[12]++;
	}
}

void hchacha_block_arch(const struct chacha_state *state,
			u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
		hchacha_block_arm(state, out, nrounds);
	} else {
		kernel_neon_begin();
		hchacha_block_neon(state, out, nrounds);
		kernel_neon_end();
	}
}
EXPORT_SYMBOL(hchacha_block_arch);

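/*
 * Bulk encryption: requests larger than one ChaCha block are handled by the
 * NEON code, processed in chunks of at most SZ_4K bytes per
 * kernel_neon_begin()/kernel_neon_end() section so that a single call does
 * not keep NEON claimed for too long at a time.  Requests of at most one
 * block, or contexts where NEON may not be used, take the scalar ARM path.
 */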
void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
		       unsigned int bytes, int nrounds)
{
	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
	    bytes <= CHACHA_BLOCK_SIZE) {
		chacha_doarm(dst, src, bytes, state, nrounds);
		state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
		return;
	}

	do {
		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);

		kernel_neon_begin();
		chacha_doneon(state, dst, src, todo, nrounds);
		kernel_neon_end();

		bytes -= todo;
		src += todo;
		dst += todo;
	} while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);

bool chacha_is_arch_optimized(void)
{
	/* We can always use at least the ARM scalar implementation. */
	return true;
}
EXPORT_SYMBOL(chacha_is_arch_optimized);

static int __init chacha_arm_mod_init(void)
{
	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
		switch (read_cpuid_part()) {
		case ARM_CPU_PART_CORTEX_A7:
		case ARM_CPU_PART_CORTEX_A5:
			/*
			 * The Cortex-A7 and Cortex-A5 do not perform well with
			 * the NEON implementation but perform very well with
			 * the scalar one and use less power.
			 */
			break;
		default:
			static_branch_enable(&use_neon);
		}
	}
	return 0;
}
subsys_initcall(chacha_arm_mod_init);

static void __exit chacha_arm_mod_exit(void)
{
}
module_exit(chacha_arm_mod_exit);

MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
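
/*
 * Usage sketch (illustrative only): callers normally reach the functions
 * above through the generic ChaCha library interface in <crypto/chacha.h>,
 * e.g. chacha_init() followed by chacha_crypt(), which dispatches to
 * chacha_crypt_arch() on this architecture.  The helper names and signatures
 * assumed below should be checked against that header in the tree in use:
 *
 *	struct chacha_state state;
 *	u32 key[CHACHA_KEY_SIZE / sizeof(u32)];
 *	u8 iv[CHACHA_IV_SIZE];
 *
 *	chacha_init(&state, key, iv);
 *	chacha_crypt(&state, dst, src, len, 20);	/- 20-round ChaCha20 -/
 */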