// SPDX-License-Identifier: GPL-2.0

#include <linux/linkage.h>
#include <asm/cache.h>
#include <asm/assembler.h>

	.text

#define state0		v0
#define state1		v1
#define state2		v2
#define state3		v3
#define copy0		v4
#define copy0_q		q4
#define copy1		v5
#define copy2		v6
#define copy3		v7
#define copy3_d		d7
#define one_d		d16
#define one_q		q16
#define one_v		v16
#define tmp		v17
#define rot8		v18

/*
 * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive
 * number of blocks of output with a nonce of 0, taking an input key and an
 * 8-byte counter. Importantly, it does not spill to the stack.
 *
 * This implementation avoids d8-d15 because they are callee-saved in user
 * space.
 *
 * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
 *				       const uint8_t *key,
 *				       uint32_t *counter,
 *				       size_t nblocks)
 *
 *	x0: output bytes
 *	x1: 32-byte key input
 *	x2: 8-byte counter input/output
 *	x3: number of 64-byte blocks to write to output
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

	/* copy0 = "expand 32-byte k" */
	mov_q		x8, 0x3320646e61707865
	mov_q		x9, 0x6b20657479622d32
	mov		copy0.d[0], x8
	mov		copy0.d[1], x9

	/* copy1,copy2 = key */
	ld1		{ copy1.4s, copy2.4s }, [x1]
	/* copy3 = counter || zero nonce */
	ld1		{ copy3.2s }, [x2]

	/* one_d = 1, used later to increment the 64-bit block counter */
	movi		one_v.2s, #1
	uzp1		one_v.4s, one_v.4s, one_v.4s

.Lblock:
	/* copy state to auxiliary vectors for the final add after the permute. */
	mov		state0.16b, copy0.16b
	mov		state1.16b, copy1.16b
	mov		state2.16b, copy2.16b
	mov		state3.16b, copy3.16b

	mov		w4, 20
.Lpermute:
	/*
	 * Permute one 64-byte block where the state matrix is stored in the four
	 * NEON registers state0-state3. It performs matrix operations on four
	 * words in parallel, but requires shuffling to rearrange the words after
	 * each round.
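	 *
	 * Each pass of .Ldoubleround is one column round followed by one diagonal
	 * round: the quarter rounds are applied to the four columns, rows 1-3 are
	 * then rotated with 'ext' so the diagonals line up as columns, the quarter
	 * rounds are applied again, and the rows are rotated back. Rotations by
	 * 16 bits use 'rev32' on 16-bit lanes; the other rotations use a
	 * 'shl'/'sri' pair through the tmp register.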
	 */

.Ldoubleround:
	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	add		state0.4s, state0.4s, state1.4s
	eor		state3.16b, state3.16b, state0.16b
	rev32		state3.8h, state3.8h

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #12
	sri		state1.4s, tmp.4s, #20

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	add		state0.4s, state0.4s, state1.4s
	eor		tmp.16b, state3.16b, state0.16b
	shl		state3.4s, tmp.4s, #8
	sri		state3.4s, tmp.4s, #24

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #7
	sri		state1.4s, tmp.4s, #25

	/* state1[0,1,2,3] = state1[1,2,3,0] */
	ext		state1.16b, state1.16b, state1.16b, #4
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	ext		state2.16b, state2.16b, state2.16b, #8
	/* state3[0,1,2,3] = state3[3,0,1,2] */
	ext		state3.16b, state3.16b, state3.16b, #12

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	add		state0.4s, state0.4s, state1.4s
	eor		state3.16b, state3.16b, state0.16b
	rev32		state3.8h, state3.8h

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #12
	sri		state1.4s, tmp.4s, #20

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	add		state0.4s, state0.4s, state1.4s
	eor		tmp.16b, state3.16b, state0.16b
	shl		state3.4s, tmp.4s, #8
	sri		state3.4s, tmp.4s, #24

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #7
	sri		state1.4s, tmp.4s, #25

	/* state1[0,1,2,3] = state1[3,0,1,2] */
	ext		state1.16b, state1.16b, state1.16b, #12
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	ext		state2.16b, state2.16b, state2.16b, #8
	/* state3[0,1,2,3] = state3[1,2,3,0] */
	ext		state3.16b, state3.16b, state3.16b, #4

	subs		w4, w4, #2
	b.ne		.Ldoubleround

	/* output0 = state0 + copy0 */
	add		state0.4s, state0.4s, copy0.4s
	/* output1 = state1 + copy1 */
	add		state1.4s, state1.4s, copy1.4s
	/* output2 = state2 + copy2 */
	add		state2.4s, state2.4s, copy2.4s
	/* output3 = state3 + copy3 */
	add		state3.4s, state3.4s, copy3.4s
	st1		{ state0.16b - state3.16b }, [x0]

	/*
	 * ++copy3.counter, the 'add' clears the upper half of the SIMD register
	 * which is the expected behaviour here.
	 */
	add		copy3_d, copy3_d, one_d

	/* output += 64, --nblocks */
	add		x0, x0, 64
	subs		x3, x3, #1
	b.ne		.Lblock

	/* counter = copy3.counter */
	st1		{ copy3.2s }, [x2]

	/* Zero out the potentially sensitive regs, in case nothing uses these again. */
	movi		state0.16b, #0
	movi		state1.16b, #0
	movi		state2.16b, #0
	movi		state3.16b, #0
	movi		copy1.16b, #0
	movi		copy2.16b, #0
	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)

emit_aarch64_feature_1_and