/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
 *
 * Based on arch/loongarch/vdso/vgetrandom-chacha.S.
 */

#include <asm/asm.h>
#include <linux/linkage.h>

.text

/*
 * ROTRI rd, rs, imm
 *
 * Rotate the low 32 bits of \rs right by \imm bits and put the result in \rd.
 * \imm is expected to be in 1..31, so that 32 - \imm is a valid slliw shift
 * amount.  Clobbers t0, hence neither \rd nor \rs may be t0.
 */
.macro	ROTRI	rd rs imm
	slliw	t0, \rs, 32 - \imm
	srliw	\rd, \rs, \imm
	or	\rd, \rd, t0
.endm

/*
 * OP_4REG op, d0, d1, d2, d3, s0, s1, s2, s3
 *
 * Apply the three-operand instruction (or macro) \op to four
 * destination/source pairs: d<n> = op(d<n>, s<n>).  When \op is ROTRI the
 * "sources" are the immediate rotate counts.
 */
.macro	OP_4REG	op d0 d1 d2 d3 s0 s1 s2 s3
	\op	\d0, \d0, \s0
	\op	\d1, \d1, \s1
	\op	\d2, \d2, \s2
	\op	\d3, \d3, \s3
.endm

/*
 * Generate ChaCha20 blocks without spilling any cipher state to the stack.
 *
 *	a0: output bytes
 *	a1: 32-byte key input
 *	a2: 8-byte counter input/output
 *	a3: number of 64-byte blocks to write to output
 *
 * The block loop is bottom-tested: a3 is decremented and checked only after a
 * block has been written, so a3 == 0 is not handled here and must not be
 * passed in.  On return the counter at (a2) has been advanced by the number of
 * blocks produced.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

#define output		a0
#define key		a1
#define counter		a2
#define nblocks		a3
#define i		a4
#define state0		s0
#define state1		s1
#define state2		s2
#define state3		s3
#define state4		s4
#define state5		s5
#define state6		s6
#define state7		s7
#define state8		s8
#define state9		s9
#define state10		s10
#define state11		s11
#define state12		a5
#define state13		a6
#define state14		a7
#define state15		t1
#define cnt		t2
#define copy0		t3
#define copy1		t4
#define copy2		t5
#define copy3		t6

/* Packs to be used with OP_4REG */
#define line0		state0, state1, state2, state3
#define line1		state4, state5, state6, state7
#define line2		state8, state9, state10, state11
#define line3		state12, state13, state14, state15

/* Rows 1-3 rotated by 1, 2 and 3 columns: pairs them up along diagonals. */
#define line1_perm	state5, state6, state7, state4
#define line2_perm	state10, state11, state8, state9
#define line3_perm	state15, state12, state13, state14

#define copy		copy0, copy1, copy2, copy3

/* Right-rotate counts; equivalent to left rotates by 16, 12, 8 and 7. */
#define _16		16, 16, 16, 16
#define _20		20, 20, 20, 20
#define _24		24, 24, 24, 24
#define _25		25, 25, 25, 25

	/*
	 * The ABI requires s0-s11 saved.
	 * This does not violate the stack-less requirement: no sensitive data
	 * is spilled onto the stack, only the caller's register values.
	 */
	addi	sp, sp, -12*SZREG
	REG_S	s0, (sp)
	REG_S	s1, SZREG(sp)
	REG_S	s2, 2*SZREG(sp)
	REG_S	s3, 3*SZREG(sp)
	REG_S	s4, 4*SZREG(sp)
	REG_S	s5, 5*SZREG(sp)
	REG_S	s6, 6*SZREG(sp)
	REG_S	s7, 7*SZREG(sp)
	REG_S	s8, 8*SZREG(sp)
	REG_S	s9, 9*SZREG(sp)
	REG_S	s10, 10*SZREG(sp)
	REG_S	s11, 11*SZREG(sp)

	/* Keep the whole 64-bit block counter in one register. */
	ld	cnt, (counter)

	/* copy[0,1,2,3] = "expand 32-byte k", kept constant across blocks */
	li	copy0, 0x61707865
	li	copy1, 0x3320646e
	li	copy2, 0x79622d32
	li	copy3, 0x6b206574

.Lblock:
	/* state[0,1,2,3] = "expand 32-byte k" */
	mv	state0, copy0
	mv	state1, copy1
	mv	state2, copy2
	mv	state3, copy3

	/* state[4,5,..,11] = key */
	lw	state4, (key)
	lw	state5, 4(key)
	lw	state6, 8(key)
	lw	state7, 12(key)
	lw	state8, 16(key)
	lw	state9, 20(key)
	lw	state10, 24(key)
	lw	state11, 28(key)

	/*
	 * state[12,13] = counter
	 * state12 carries the full 64-bit value here; only its low 32 bits
	 * are ever consumed, because every later use goes through a *w
	 * instruction or a 32-bit store.
	 */
	mv	state12, cnt
	srli	state13, cnt, 32

	/* state[14,15] = 0 */
	mv	state14, zero
	mv	state15, zero

	/* 10 double rounds = 20 rounds */
	li	i, 10
.Lpermute:
	/* odd round: quarter rounds on the four columns */
	OP_4REG	addw	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	ROTRI	line3, _16

	OP_4REG	addw	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	ROTRI	line1, _20

	OP_4REG	addw	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	ROTRI	line3, _24

	OP_4REG	addw	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	ROTRI	line1, _25

	/* even round: quarter rounds on the four diagonals */
	OP_4REG	addw	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	ROTRI	line3_perm, _16

	OP_4REG	addw	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	ROTRI	line1_perm, _20

	OP_4REG	addw	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	ROTRI	line3_perm, _24

	OP_4REG	addw	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	ROTRI	line1_perm, _25

	addi	i, i, -1
	bnez	i, .Lpermute

	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
	OP_4REG	addw	line0, copy
	sw	state0, (output)
	sw	state1, 4(output)
	sw	state2, 8(output)
	sw	state3, 12(output)

	/* from now on state[0,1,2,3] are scratch registers */

	/* state[0,1,2,3] = lo(key) */
	lw	state0, (key)
	lw	state1, 4(key)
	lw	state2, 8(key)
	lw	state3, 12(key)

	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
	OP_4REG	addw	line1, line0
	sw	state4, 16(output)
	sw	state5, 20(output)
	sw	state6, 24(output)
	sw	state7, 28(output)

	/* state[0,1,2,3] = hi(key) */
	lw	state0, 16(key)
	lw	state1, 20(key)
	lw	state2, 24(key)
	lw	state3, 28(key)

	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
	OP_4REG	addw	line2, line0
	sw	state8, 32(output)
	sw	state9, 36(output)
	sw	state10, 40(output)
	sw	state11, 44(output)

	/*
	 * output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0]
	 * state[14,15] started as zero, so they are stored without an add.
	 */
	addw	state12, state12, cnt
	srli	state0, cnt, 32
	addw	state13, state13, state0
	sw	state12, 48(output)
	sw	state13, 52(output)
	sw	state14, 56(output)
	sw	state15, 60(output)

	/* ++counter (64-bit increment, carries from low into high word) */
	addi	cnt, cnt, 1

	/* output += 64 */
	addi	output, output, 64
	/* --nblocks */
	addi	nblocks, nblocks, -1
	bnez	nblocks, .Lblock

	/* counter = [cnt_lo, cnt_hi] */
	sd	cnt, (counter)

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. As at now copy[0,1,2,3] just contains "expand 32-byte k" and
	 * state[0,...,11] are s0-s11 those we'll restore in the epilogue, we
	 * only need to zero state[12,...,15] and t0.  t0 is the ROTRI scratch
	 * register and still holds a shifted copy of a state word from the
	 * final round.
	 */
	mv	state12, zero
	mv	state13, zero
	mv	state14, zero
	mv	state15, zero
	mv	t0, zero

	REG_L	s0, (sp)
	REG_L	s1, SZREG(sp)
	REG_L	s2, 2*SZREG(sp)
	REG_L	s3, 3*SZREG(sp)
	REG_L	s4, 4*SZREG(sp)
	REG_L	s5, 5*SZREG(sp)
	REG_L	s6, 6*SZREG(sp)
	REG_L	s7, 7*SZREG(sp)
	REG_L	s8, 8*SZREG(sp)
	REG_L	s9, 9*SZREG(sp)
	REG_L	s10, 10*SZREG(sp)
	REG_L	s11, 11*SZREG(sp)
	addi	sp, sp, 12*SZREG

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)