// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
 */

#include <asm/asm.h>
#include <asm/regdef.h>
#include <linux/linkage.h>

.text

/*
 * Apply the three-operand instruction \op lane-wise to four
 * destination/source pairs:
 *
 *	d[n] = d[n] \op s[n]	for n = 0, 1, 2, 3
 *
 * Each destination is also the first source operand.  The s[n] operands are
 * passed through verbatim, so they may be registers or immediates, whichever
 * \op accepts.
 */
.macro	OP_4REG	op d0 d1 d2 d3 s0 s1 s2 s3
	\op	\d0, \d0, \s0
	\op	\d1, \d1, \s1
	\op	\d2, \d2, \s2
	\op	\d3, \d3, \s3
.endm

/*
 * Very basic LoongArch implementation of ChaCha20. Produces a given positive
 * number of blocks of output with a nonce of 0, taking an input key and
 * 8-byte counter. Importantly does not spill to the stack. Its arguments
 * are:
 *
 *	a0: output bytes
 *	a1: 32-byte key input
 *	a2: 8-byte counter input/output
 *	a3: number of 64-byte blocks to write to output
 *
 * Register usage, as established by the code below:
 *
 *	- s0-s8 and fp (aliased to s9) are saved on entry and restored on exit;
 *	  they hold state[0..9] in between.
 *	- a4-a7 and t0-t7 are used as working registers and are not preserved.
 *	- a0 is advanced by 64 per block and a3 is counted down to zero.
 *	- The only stack traffic is the save/restore of s0-s9.
 *
 * The block loop is a do/while: a3 is only tested after a block has been
 * written, so a3 must be non-zero on entry.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* We don't need a frame pointer */
#define s9		fp

#define output		a0
#define key		a1
#define counter		a2
#define nblocks		a3
#define i		a4
#define state0		s0
#define state1		s1
#define state2		s2
#define state3		s3
#define state4		s4
#define state5		s5
#define state6		s6
#define state7		s7
#define state8		s8
#define state9		s9
#define state10		a5
#define state11		a6
#define state12		a7
#define state13		t0
#define state14		t1
#define state15		t2
#define cnt_lo		t3
#define cnt_hi		t4
#define copy0		t5
#define copy1		t6
#define copy2		t7

/* Reuse i as copy3 */
#define copy3		i

/* Packs to be used with OP_4REG */
#define line0		state0, state1, state2, state3
#define line1		state4, state5, state6, state7
#define line2		state8, state9, state10, state11
#define line3		state12, state13, state14, state15

/*
 * Rows 1, 2 and 3 rotated left by 1, 2 and 3 positions respectively, so that
 * the same column-wise OP_4REG sequence operates on the diagonals.
 */
#define	line1_perm	state5, state6, state7, state4
#define	line2_perm	state10, state11, state8, state9
#define	line3_perm	state15, state12, state13, state14

#define	copy		copy0, copy1, copy2, copy3

/*
 * Rotate amounts for rotri.w, which rotates right: 16, 20, 24 and 25 are the
 * right-rotate equivalents (32 - n) of ChaCha's left rotations by 16, 12, 8
 * and 7.
 */
#define _16		16, 16, 16, 16
#define _20		20, 20, 20, 20
#define _24		24, 24, 24, 24
#define _25		25, 25, 25, 25

	/*
	 * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
	 * This does not violate the stack-less requirement: no sensitive data
	 * is spilled onto the stack.
	 */
	PTR_ADDI	sp, sp, (-SZREG * 10) & STACK_ALIGN
	REG_S		s0, sp, 0
	REG_S		s1, sp, SZREG
	REG_S		s2, sp, SZREG * 2
	REG_S		s3, sp, SZREG * 3
	REG_S		s4, sp, SZREG * 4
	REG_S		s5, sp, SZREG * 5
	REG_S		s6, sp, SZREG * 6
	REG_S		s7, sp, SZREG * 7
	REG_S		s8, sp, SZREG * 8
	REG_S		s9, sp, SZREG * 9

	/*
	 * copy[0,1,2] = "expa", "nd 3", "2-by".  These stay live across all
	 * blocks; copy[3] cannot, because it shares a register with i.
	 */
	li.w		copy0, 0x61707865
	li.w		copy1, 0x3320646e
	li.w		copy2, 0x79622d32

	/* The 64-bit counter is kept as two 32-bit halves for the whole run */
	ld.w		cnt_lo, counter, 0
	ld.w		cnt_hi, counter, 4

.Lblock:
	/* state[0,1,2,3] = "expand 32-byte k" */
	move		state0, copy0
	move		state1, copy1
	move		state2, copy2
	li.w		state3, 0x6b206574

	/* state[4,5,..,11] = key */
	ld.w		state4, key, 0
	ld.w		state5, key, 4
	ld.w		state6, key, 8
	ld.w		state7, key, 12
	ld.w		state8, key, 16
	ld.w		state9, key, 20
	ld.w		state10, key, 24
	ld.w		state11, key, 28

	/* state[12,13] = counter */
	move		state12, cnt_lo
	move		state13, cnt_hi

	/* state[14,15] = 0 */
	move		state14, zero
	move		state15, zero

	/* 10 iterations of (odd round + even round) = 20 rounds */
	li.w		i, 10
.Lpermute:
	/* odd round */
	OP_4REG	add.w	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	rotri.w	line3, _16

	OP_4REG	add.w	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	rotri.w	line1, _20

	OP_4REG	add.w	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	rotri.w	line3, _24

	OP_4REG	add.w	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	rotri.w	line1, _25

	/* even round */
	OP_4REG	add.w	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	rotri.w	line3_perm, _16

	OP_4REG	add.w	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	rotri.w	line1_perm, _20

	OP_4REG	add.w	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	rotri.w	line3_perm, _24

	OP_4REG	add.w	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	rotri.w	line1_perm, _25

	addi.w		i, i, -1
	bnez		i, .Lpermute

	/*
	 * copy[3] = "te k", materialize it here because copy[3] shares the
	 * same register with i which just became dead.
	 */
	li.w		copy3, 0x6b206574

	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
	OP_4REG	add.w	line0, copy
	st.w		state0, output, 0
	st.w		state1, output, 4
	st.w		state2, output, 8
	st.w		state3, output, 12

	/* from now on state[0,1,2,3] are scratch registers */

	/* state[0,1,2,3] = key words 0..3 (key bytes 0..15), re-read from memory */
	ld.w		state0, key, 0
	ld.w		state1, key, 4
	ld.w		state2, key, 8
	ld.w		state3, key, 12

	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
	OP_4REG	add.w	line1, line0
	st.w		state4, output, 16
	st.w		state5, output, 20
	st.w		state6, output, 24
	st.w		state7, output, 28

	/* state[0,1,2,3] = key words 4..7 (key bytes 16..31), re-read from memory */
	ld.w		state0, key, 16
	ld.w		state1, key, 20
	ld.w		state2, key, 24
	ld.w		state3, key, 28

	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
	OP_4REG	add.w	line2, line0
	st.w		state8, output, 32
	st.w		state9, output, 36
	st.w		state10, output, 40
	st.w		state11, output, 44

	/*
	 * output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0]
	 * (state[14,15] started as zero, so they are stored without an add)
	 */
	add.w		state12, state12, cnt_lo
	add.w		state13, state13, cnt_hi
	st.w		state12, output, 48
	st.w		state13, output, 52
	st.w		state14, output, 56
	st.w		state15, output, 60

	/*
	 * ++counter: state0 (scratch) = 1 iff cnt_lo wrapped around to 0,
	 * which is the carry into cnt_hi.
	 */
	addi.w		cnt_lo, cnt_lo, 1
	sltui		state0, cnt_lo, 1
	add.w		cnt_hi, cnt_hi, state0

	/* output += 64 */
	PTR_ADDI	output, output, 64
	/* --nblocks */
	PTR_ADDI	nblocks, nblocks, -1
	bnez		nblocks, .Lblock

	/* counter = [cnt_lo, cnt_hi] */
	st.w		cnt_lo, counter, 0
	st.w		cnt_hi, counter, 4

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. At this point copy[0,1,2,3] just contains "expand 32-byte k"
	 * and state[0,...,9] are s0-s9, which we'll restore in the epilogue,
	 * so we only need to zero state[10,...,15].
	 */
	move		state10, zero
	move		state11, zero
	move		state12, zero
	move		state13, zero
	move		state14, zero
	move		state15, zero

	REG_L		s0, sp, 0
	REG_L		s1, sp, SZREG
	REG_L		s2, sp, SZREG * 2
	REG_L		s3, sp, SZREG * 3
	REG_L		s4, sp, SZREG * 4
	REG_L		s5, sp, SZREG * 5
	REG_L		s6, sp, SZREG * 6
	REG_L		s7, sp, SZREG * 7
	REG_L		s8, sp, SZREG * 8
	REG_L		s9, sp, SZREG * 9
	/* Undo the prologue's sp adjustment (same expression, negated) */
	PTR_ADDI	sp, sp, -((-SZREG * 10) & STACK_ALIGN)

	jr		ra
SYM_FUNC_END(__arch_chacha20_blocks_nostack)