/* SPDX-License-Identifier: GPL-2.0 */

#include <linux/linkage.h>
#include <asm/alternative.h>
#include <asm/fpu-insn.h>

#define STATE0	%v0
#define STATE1	%v1
#define STATE2	%v2
#define STATE3	%v3
#define COPY0	%v4
#define COPY1	%v5
#define COPY2	%v6
#define COPY3	%v7
#define PERM4	%v16
#define PERM8	%v17
#define PERM12	%v18
#define BEPERM	%v19
#define TMP0	%v20
#define TMP1	%v21
#define TMP2	%v22
#define TMP3	%v23

	.section .rodata

	.balign 128
.Lconstants:
	.long	0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
	.long	0x04050607,0x08090a0b,0x0c0d0e0f,0x00010203 # rotl 4 bytes
	.long	0x08090a0b,0x0c0d0e0f,0x00010203,0x04050607 # rotl 8 bytes
	.long	0x0c0d0e0f,0x00010203,0x04050607,0x08090a0b # rotl 12 bytes
	.long	0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap

	.text
/*
 * s390 ChaCha20 implementation meant for vDSO. Produces a given positive
 * number of blocks of output with nonce 0, taking an input key and 8-byte
 * counter. Does not spill to the stack.
 *
 * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
 *				       const uint8_t *key,
 *				       uint32_t *counter,
 *				       size_t nblocks)
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
	larl	%r1,.Lconstants

	/* COPY0 = "expand 32-byte k" */
	VL	COPY0,0,,%r1

	/* PERM4-PERM12,BEPERM = byte selectors for VPERM */
	VLM	PERM4,BEPERM,16,%r1

	/* COPY1,COPY2 = key */
	VLM	COPY1,COPY2,0,%r3

	/* COPY3 = counter || zero nonce */
	lg	%r3,0(%r4)
	VZERO	COPY3
	VLVGG	COPY3,%r3,0

	/* %r1 = 0, also used by alcr below to propagate the counter carry */
	lghi	%r1,0
.Lblock:
	VLR	STATE0,COPY0
	VLR	STATE1,COPY1
	VLR	STATE2,COPY2
	VLR	STATE3,COPY3

	lghi	%r0,10
.Ldoubleround:
	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */
	VAF	STATE0,STATE0,STATE1
	VX	STATE3,STATE3,STATE0
	VERLLF	STATE3,STATE3,16

	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */
	VAF	STATE2,STATE2,STATE3
	VX	STATE1,STATE1,STATE2
	VERLLF	STATE1,STATE1,12

	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */
	VAF	STATE0,STATE0,STATE1
	VX	STATE3,STATE3,STATE0
	VERLLF	STATE3,STATE3,8

	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */
	VAF	STATE2,STATE2,STATE3
	VX	STATE1,STATE1,STATE2
	VERLLF	STATE1,STATE1,7

	/* STATE1[0,1,2,3] = STATE1[1,2,3,0] */
	VPERM	STATE1,STATE1,STATE1,PERM4
	/* STATE2[0,1,2,3] = STATE2[2,3,0,1] */
	VPERM	STATE2,STATE2,STATE2,PERM8
	/* STATE3[0,1,2,3] = STATE3[3,0,1,2] */
	VPERM	STATE3,STATE3,STATE3,PERM12

	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */
	VAF	STATE0,STATE0,STATE1
	VX	STATE3,STATE3,STATE0
	VERLLF	STATE3,STATE3,16

	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */
	VAF	STATE2,STATE2,STATE3
	VX	STATE1,STATE1,STATE2
	VERLLF	STATE1,STATE1,12

	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */
	VAF	STATE0,STATE0,STATE1
	VX	STATE3,STATE3,STATE0
	VERLLF	STATE3,STATE3,8

	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */
	VAF	STATE2,STATE2,STATE3
	VX	STATE1,STATE1,STATE2
	VERLLF	STATE1,STATE1,7

	/* STATE1[0,1,2,3] = STATE1[3,0,1,2] */
	VPERM	STATE1,STATE1,STATE1,PERM12
	/* STATE2[0,1,2,3] = STATE2[2,3,0,1] */
	VPERM	STATE2,STATE2,STATE2,PERM8
	/* STATE3[0,1,2,3] = STATE3[1,2,3,0] */
	VPERM	STATE3,STATE3,STATE3,PERM4
	brctg	%r0,.Ldoubleround

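	/*
	 * Finalize the block: the ChaCha20 output is the post-round
	 * working state added word-wise to the saved input state.
	 */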
	/* OUTPUT0 = STATE0 + COPY0 */
	VAF	STATE0,STATE0,COPY0
	/* OUTPUT1 = STATE1 + COPY1 */
	VAF	STATE1,STATE1,COPY1
	/* OUTPUT2 = STATE2 + COPY2 */
	VAF	STATE2,STATE2,COPY2
	/* OUTPUT3 = STATE3 + COPY3 */
	VAF	STATE3,STATE3,COPY3

	/*
	 * Store OUTPUT as 32-bit little-endian words. If the
	 * vector-enhancements facility 2 is not installed, use the
	 * slow path.
	 */
	ALTERNATIVE "brc 0xf,.Lstoreslow", "nop", ALT_FACILITY(148)
	VSTBRF	STATE0,0,,%r2
	VSTBRF	STATE1,16,,%r2
	VSTBRF	STATE2,32,,%r2
	VSTBRF	STATE3,48,,%r2
.Lstoredone:

	/*
	 * ++COPY3.COUNTER: %r3 holds the two little-endian counter words
	 * word-swapped by the big-endian load, so bump the high word and
	 * let alcr propagate the carry into the low word (%r1 is 0).
	 */
	/* alsih %r3,1 */
	.insn	rilu,0xcc0a00000000,%r3,1
	alcr	%r3,%r1
	VLVGG	COPY3,%r3,0

	/* OUTPUT += 64, --NBLOCKS */
	aghi	%r2,64
	brctg	%r5,.Lblock

	/* COUNTER = COPY3.COUNTER */
	stg	%r3,0(%r4)

	/* Zero out potentially sensitive regs */
	VZERO	STATE0
	VZERO	STATE1
	VZERO	STATE2
	VZERO	STATE3
	VZERO	COPY1
	VZERO	COPY2

	/* Early exit if TMP0-TMP3 have not been used */
	ALTERNATIVE "nopr", "br %r14", ALT_FACILITY(148)

	VZERO	TMP0
	VZERO	TMP1
	VZERO	TMP2
	VZERO	TMP3

	br	%r14

.Lstoreslow:
	/* Convert STATE to little-endian format and store to OUTPUT */
	VPERM	TMP0,STATE0,STATE0,BEPERM
	VPERM	TMP1,STATE1,STATE1,BEPERM
	VPERM	TMP2,STATE2,STATE2,BEPERM
	VPERM	TMP3,STATE3,STATE3,BEPERM
	VSTM	TMP0,TMP3,0,%r2
	j	.Lstoredone
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
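
/*
 * Usage sketch (illustrative only, not part of the kernel sources): a
 * caller in the vDSO getrandom() path is expected to invoke this roughly
 * as
 *
 *	uint32_t counter[2] = { 0, 0 };
 *	uint8_t key[32], out[4 * 64];
 *
 *	__arch_chacha20_blocks_nostack(out, key, counter, 4);
 *
 * which, given a 32-byte key, writes four consecutive 64-byte ChaCha20
 * blocks to out and advances the 64-bit block counter in counter[] by 4.
 */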