// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
 */

#include <asm/asm.h>
#include <asm/regdef.h>
#include <linux/linkage.h>

.text

/*
 * ChaCha quarter-round on the 32-bit words a, b, c, d:
 *
 *	a += b; d ^= a; d = rol32(d, 16);
 *	c += d; b ^= c; b = rol32(b, 12);
 *	a += b; d ^= a; d = rol32(d, 8);
 *	c += d; b ^= c; b = rol32(b, 7);
 *
 * Only a rotate-right instruction is used, so every left rotation by n is
 * written as a right rotation by 32 - n.
 */
.macro	QR	a b c d
	add.w		\a, \a, \b
	xor		\d, \d, \a
	rotri.w		\d, \d, 16

	add.w		\c, \c, \d
	xor		\b, \b, \c
	rotri.w		\b, \b, 20

	add.w		\a, \a, \b
	xor		\d, \d, \a
	rotri.w		\d, \d, 24

	add.w		\c, \c, \d
	xor		\b, \b, \c
	rotri.w		\b, \b, 25
.endm

/*
 * Very basic LoongArch implementation of ChaCha20. Produces a given positive
 * number of blocks of output with a nonce of 0, taking an input key and
 * 8-byte counter. Importantly does not spill to the stack. Its arguments
 * are:
 *
 *	a0: output bytes
 *	a1: 32-byte key input
 *	a2: 8-byte counter input/output
 *	a3: number of 64-byte blocks to write to output
 *
 * The block count must be non-zero: the block loop only tests it after a
 * block has been produced.
 *
 * On return a1 and a2 are unchanged and s0-s9 are restored. a0, a3-a7 and
 * t0-t7 are clobbered.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* We don't need a frame pointer */
#define s9		fp

#define output		a0
#define key		a1
#define counter		a2
#define nblocks		a3
#define i		a4
#define state0		s0
#define state1		s1
#define state2		s2
#define state3		s3
#define state4		s4
#define state5		s5
#define state6		s6
#define state7		s7
#define state8		s8
#define state9		s9
#define state10		a5
#define state11		a6
#define state12		a7
#define state13		t0
#define state14		t1
#define state15		t2
#define cnt_lo		t3
#define cnt_hi		t4
#define copy0		t5
#define copy1		t6
#define copy2		t7

/* Reuse i as copy3 */
#define copy3		i

/* "expand 32-byte k" as four little-endian 32-bit words */
#define CHACHA_SIGMA0	0x61707865	/* "expa" */
#define CHACHA_SIGMA1	0x3320646e	/* "nd 3" */
#define CHACHA_SIGMA2	0x79622d32	/* "2-by" */
#define CHACHA_SIGMA3	0x6b206574	/* "te k" */

	/*
	 * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
	 * This does not violate the stack-less requirement: no sensitive data
	 * is spilled onto the stack.
	 */
	PTR_ADDI	sp, sp, (-SZREG * 10) & STACK_ALIGN
	REG_S		s0, sp, 0
	REG_S		s1, sp, SZREG
	REG_S		s2, sp, SZREG * 2
	REG_S		s3, sp, SZREG * 3
	REG_S		s4, sp, SZREG * 4
	REG_S		s5, sp, SZREG * 5
	REG_S		s6, sp, SZREG * 6
	REG_S		s7, sp, SZREG * 7
	REG_S		s8, sp, SZREG * 8
	REG_S		s9, sp, SZREG * 9

	/*
	 * copy[0,1,2] stay live across all blocks. copy[3] cannot be set up
	 * here because its register doubles as the round counter i.
	 */
	li.w		copy0, CHACHA_SIGMA0
	li.w		copy1, CHACHA_SIGMA1
	li.w		copy2, CHACHA_SIGMA2

	ld.w		cnt_lo, counter, 0
	ld.w		cnt_hi, counter, 4

.Lblock:
	/* state[0,1,2,3] = "expand 32-byte k" */
	move		state0, copy0
	move		state1, copy1
	move		state2, copy2
	li.w		state3, CHACHA_SIGMA3

	/* state[4,5,..,11] = key */
	ld.w		state4, key, 0
	ld.w		state5, key, 4
	ld.w		state6, key, 8
	ld.w		state7, key, 12
	ld.w		state8, key, 16
	ld.w		state9, key, 20
	ld.w		state10, key, 24
	ld.w		state11, key, 28

	/* state[12,13] = counter */
	move		state12, cnt_lo
	move		state13, cnt_hi

	/* state[14,15] = 0 */
	move		state14, zero
	move		state15, zero

	/* 10 iterations of (odd round + even round) = 20 rounds */
	li.w		i, 10
.Lpermute:
	/* odd round: the four columns of the 4x4 state */
	QR		state0, state4, state8, state12
	QR		state1, state5, state9, state13
	QR		state2, state6, state10, state14
	QR		state3, state7, state11, state15

	/* even round: the four diagonals of the 4x4 state */
	QR		state0, state5, state10, state15
	QR		state1, state6, state11, state12
	QR		state2, state7, state8, state13
	QR		state3, state4, state9, state14

	addi.w		i, i, -1
	bnez		i, .Lpermute

	/*
	 * copy[3] = "te k", materialize it here because copy[3] shares the
	 * same register with i which just became dead.
	 */
	li.w		copy3, CHACHA_SIGMA3

	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
	add.w		state0, state0, copy0
	add.w		state1, state1, copy1
	add.w		state2, state2, copy2
	add.w		state3, state3, copy3
	st.w		state0, output, 0
	st.w		state1, output, 4
	st.w		state2, output, 8
	st.w		state3, output, 12

	/* from now on state[0,1,2,3] are scratch registers */

	/* state[0,1,2,3] = key words 0-3 (first 16 bytes of the key) */
	ld.w		state0, key, 0
	ld.w		state1, key, 4
	ld.w		state2, key, 8
	ld.w		state3, key, 12

	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
	add.w		state4, state4, state0
	add.w		state5, state5, state1
	add.w		state6, state6, state2
	add.w		state7, state7, state3
	st.w		state4, output, 16
	st.w		state5, output, 20
	st.w		state6, output, 24
	st.w		state7, output, 28

	/* state[0,1,2,3] = key words 4-7 (last 16 bytes of the key) */
	ld.w		state0, key, 16
	ld.w		state1, key, 20
	ld.w		state2, key, 24
	ld.w		state3, key, 28

	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
	add.w		state8, state8, state0
	add.w		state9, state9, state1
	add.w		state10, state10, state2
	add.w		state11, state11, state3
	st.w		state8, output, 32
	st.w		state9, output, 36
	st.w		state10, output, 40
	st.w		state11, output, 44

	/*
	 * output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0]
	 * (state[14,15] started as zero, so they are stored as they are)
	 */
	add.w		state12, state12, cnt_lo
	add.w		state13, state13, cnt_hi
	st.w		state12, output, 48
	st.w		state13, output, 52
	st.w		state14, output, 56
	st.w		state15, output, 60

	/*
	 * ++counter: state0 = (cnt_lo == 0) after the increment, i.e. the
	 * carry out of the low word, which is then added to the high word.
	 */
	addi.w		cnt_lo, cnt_lo, 1
	sltui		state0, cnt_lo, 1
	add.w		cnt_hi, cnt_hi, state0

	/* output += 64 */
	PTR_ADDI	output, output, 64
	/* --nblocks */
	PTR_ADDI	nblocks, nblocks, -1
	bnez		nblocks, .Lblock

	/* counter = [cnt_lo, cnt_hi] */
	st.w		cnt_lo, counter, 0
	st.w		cnt_hi, counter, 4

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. At this point copy[0,1,2,3] just contain "expand 32-byte k"
	 * and state[0,...,9] are s0-s9, which we'll restore in the epilogue,
	 * so we only need to zero state[10,...,15].
	 */
	move		state10, zero
	move		state11, zero
	move		state12, zero
	move		state13, zero
	move		state14, zero
	move		state15, zero

	REG_L		s0, sp, 0
	REG_L		s1, sp, SZREG
	REG_L		s2, sp, SZREG * 2
	REG_L		s3, sp, SZREG * 3
	REG_L		s4, sp, SZREG * 4
	REG_L		s5, sp, SZREG * 5
	REG_L		s6, sp, SZREG * 6
	REG_L		s7, sp, SZREG * 7
	REG_L		s8, sp, SZREG * 8
	REG_L		s9, sp, SZREG * 9
	PTR_ADDI	sp, sp, -((-SZREG * 10) & STACK_ALIGN)

	jr		ra
SYM_FUNC_END(__arch_chacha20_blocks_nostack)