/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')

#include <linux/linkage.h>

.text
.option arch, +zvkb

#define KEYP		a0
#define INP		a1
#define OUTP		a2
#define LEN		a3
#define IVP		a4

#define CONSTS0		a5
#define CONSTS1		a6
#define CONSTS2		a7
#define CONSTS3		t0
#define TMP		t1
#define VL		t2
#define STRIDE		t3
#define NROUNDS		t4
#define KEY0		s0
#define KEY1		s1
#define KEY2		s2
#define KEY3		s3
#define KEY4		s4
#define KEY5		s5
#define KEY6		s6
#define KEY7		s7
#define COUNTER		s8
#define NONCE0		s9
#define NONCE1		s10
#define NONCE2		s11

.macro	chacha_round	a0, b0, c0, d0, a1, b1, c1, d1, \
			a2, b2, c2, d2, a3, b3, c3, d3
	// a += b; d ^= a; d = rol(d, 16);
	vadd.vv		\a0, \a0, \b0
	vadd.vv		\a1, \a1, \b1
	vadd.vv		\a2, \a2, \b2
	vadd.vv		\a3, \a3, \b3
	vxor.vv		\d0, \d0, \a0
	vxor.vv		\d1, \d1, \a1
	vxor.vv		\d2, \d2, \a2
	vxor.vv		\d3, \d3, \a3
	vror.vi		\d0, \d0, 32 - 16
	vror.vi		\d1, \d1, 32 - 16
	vror.vi		\d2, \d2, 32 - 16
	vror.vi		\d3, \d3, 32 - 16

	// c += d; b ^= c; b = rol(b, 12);
	vadd.vv		\c0, \c0, \d0
	vadd.vv		\c1, \c1, \d1
	vadd.vv		\c2, \c2, \d2
	vadd.vv		\c3, \c3, \d3
	vxor.vv		\b0, \b0, \c0
	vxor.vv		\b1, \b1, \c1
	vxor.vv		\b2, \b2, \c2
	vxor.vv		\b3, \b3, \c3
	vror.vi		\b0, \b0, 32 - 12
	vror.vi		\b1, \b1, 32 - 12
	vror.vi		\b2, \b2, 32 - 12
	vror.vi		\b3, \b3, 32 - 12

	// a += b; d ^= a; d = rol(d, 8);
	vadd.vv		\a0, \a0, \b0
	vadd.vv		\a1, \a1, \b1
	vadd.vv		\a2, \a2, \b2
	vadd.vv		\a3, \a3, \b3
	vxor.vv		\d0, \d0, \a0
	vxor.vv		\d1, \d1, \a1
	vxor.vv		\d2, \d2, \a2
	vxor.vv		\d3, \d3, \a3
	vror.vi		\d0, \d0, 32 - 8
	vror.vi		\d1, \d1, 32 - 8
	vror.vi		\d2, \d2, 32 - 8
	vror.vi		\d3, \d3, 32 - 8

	// c += d; b ^= c; b = rol(b, 7);
	vadd.vv		\c0, \c0, \d0
	vadd.vv		\c1, \c1, \d1
	vadd.vv		\c2, \c2, \d2
	vadd.vv		\c3, \c3, \d3
	vxor.vv		\b0, \b0, \c0
	vxor.vv		\b1, \b1, \c1
	vxor.vv		\b2, \b2, \c2
	vxor.vv		\b3, \b3, \c3
	vror.vi		\b0, \b0, 32 - 7
	vror.vi		\b1, \b1, 32 - 7
	vror.vi		\b2, \b2, 32 - 7
	vror.vi		\b3, \b3, 32 - 7
.endm

// void chacha20_zvkb(const u32 key[8], const u8 *in, u8 *out, size_t len,
//		      const u32 iv[4]);
//
// |len| must be nonzero and a multiple of 64 (CHACHA_BLOCK_SIZE).
// The counter is treated as 32-bit, following the RFC 7539 convention.
SYM_FUNC_START(chacha20_zvkb)
	srli	LEN, LEN, 6	// Bytes to blocks

	addi	sp, sp, -96
	sd	s0, 0(sp)
	sd	s1, 8(sp)
	sd	s2, 16(sp)
	sd	s3, 24(sp)
	sd	s4, 32(sp)
	sd	s5, 40(sp)
	sd	s6, 48(sp)
	sd	s7, 56(sp)
	sd	s8, 64(sp)
	sd	s9, 72(sp)
	sd	s10, 80(sp)
	sd	s11, 88(sp)

	li	STRIDE, 64

	// Set up the initial state matrix in scalar registers.
	li	CONSTS0, 0x61707865	// "expa" little endian
	li	CONSTS1, 0x3320646e	// "nd 3" little endian
	li	CONSTS2, 0x79622d32	// "2-by" little endian
	li	CONSTS3, 0x6b206574	// "te k" little endian
	lw	KEY0, 0(KEYP)
	lw	KEY1, 4(KEYP)
	lw	KEY2, 8(KEYP)
	lw	KEY3, 12(KEYP)
	lw	KEY4, 16(KEYP)
	lw	KEY5, 20(KEYP)
	lw	KEY6, 24(KEYP)
	lw	KEY7, 28(KEYP)
	lw	COUNTER, 0(IVP)
	lw	NONCE0, 4(IVP)
	lw	NONCE1, 8(IVP)
	lw	NONCE2, 12(IVP)

.Lblock_loop:
	// Set vl to the number of blocks to process in this iteration.
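	// (vsetvli with e32/m1 selects 32-bit elements at LMUL=1, so
	// VL = min(LEN, VLMAX); each vector element position corresponds to
	// one 64-byte ChaCha block. The tail-agnostic/mask-agnostic (ta, ma)
	// policies are safe because elements past VL are never stored.)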
	vsetvli	VL, LEN, e32, m1, ta, ma

	// Set up the initial state matrix for the next VL blocks in v0-v15.
	// v{i} holds the i'th 32-bit word of the state matrix for all blocks.
	// Note that only the counter word, at index 12, differs across blocks.
	vmv.v.x		v0, CONSTS0
	vmv.v.x		v1, CONSTS1
	vmv.v.x		v2, CONSTS2
	vmv.v.x		v3, CONSTS3
	vmv.v.x		v4, KEY0
	vmv.v.x		v5, KEY1
	vmv.v.x		v6, KEY2
	vmv.v.x		v7, KEY3
	vmv.v.x		v8, KEY4
	vmv.v.x		v9, KEY5
	vmv.v.x		v10, KEY6
	vmv.v.x		v11, KEY7
	vid.v		v12
	vadd.vx		v12, v12, COUNTER
	vmv.v.x		v13, NONCE0
	vmv.v.x		v14, NONCE1
	vmv.v.x		v15, NONCE2

	// Load the first half of the input data for each block into v16-v23.
	// v{16+i} holds the i'th 32-bit word for all blocks.
	vlsseg8e32.v	v16, (INP), STRIDE

	li	NROUNDS, 20
.Lnext_doubleround:
	addi	NROUNDS, NROUNDS, -2
	// column round
	chacha_round	v0, v4, v8, v12, v1, v5, v9, v13, \
			v2, v6, v10, v14, v3, v7, v11, v15
	// diagonal round
	chacha_round	v0, v5, v10, v15, v1, v6, v11, v12, \
			v2, v7, v8, v13, v3, v4, v9, v14
	bnez	NROUNDS, .Lnext_doubleround

	// Load the second half of the input data for each block into v24-v31.
	// v{24+i} holds the {8+i}'th 32-bit word for all blocks.
	addi	TMP, INP, 32
	vlsseg8e32.v	v24, (TMP), STRIDE

	// Finalize the first half of the keystream for each block.
	vadd.vx		v0, v0, CONSTS0
	vadd.vx		v1, v1, CONSTS1
	vadd.vx		v2, v2, CONSTS2
	vadd.vx		v3, v3, CONSTS3
	vadd.vx		v4, v4, KEY0
	vadd.vx		v5, v5, KEY1
	vadd.vx		v6, v6, KEY2
	vadd.vx		v7, v7, KEY3

	// Encrypt/decrypt the first half of the data for each block.
	vxor.vv		v16, v16, v0
	vxor.vv		v17, v17, v1
	vxor.vv		v18, v18, v2
	vxor.vv		v19, v19, v3
	vxor.vv		v20, v20, v4
	vxor.vv		v21, v21, v5
	vxor.vv		v22, v22, v6
	vxor.vv		v23, v23, v7

	// Store the first half of the output data for each block.
	vssseg8e32.v	v16, (OUTP), STRIDE

	// Finalize the second half of the keystream for each block.
	vadd.vx		v8, v8, KEY4
	vadd.vx		v9, v9, KEY5
	vadd.vx		v10, v10, KEY6
	vadd.vx		v11, v11, KEY7
	vid.v		v0
	vadd.vx		v12, v12, COUNTER
	vadd.vx		v13, v13, NONCE0
	vadd.vx		v14, v14, NONCE1
	vadd.vx		v15, v15, NONCE2
	vadd.vv		v12, v12, v0

	// Encrypt/decrypt the second half of the data for each block.
	vxor.vv		v24, v24, v8
	vxor.vv		v25, v25, v9
	vxor.vv		v26, v26, v10
	vxor.vv		v27, v27, v11
	vxor.vv		v28, v28, v12
	vxor.vv		v29, v29, v13
	vxor.vv		v30, v30, v14
	vxor.vv		v31, v31, v15

	// Store the second half of the output data for each block.
	addi	TMP, OUTP, 32
	vssseg8e32.v	v24, (TMP), STRIDE

	// Update the counter, the remaining number of blocks, and the input
	// and output pointers according to the number of blocks processed
	// (VL).
	add	COUNTER, COUNTER, VL
	sub	LEN, LEN, VL
	slli	TMP, VL, 6
	add	OUTP, OUTP, TMP
	add	INP, INP, TMP
	bnez	LEN, .Lblock_loop

	ld	s0, 0(sp)
	ld	s1, 8(sp)
	ld	s2, 16(sp)
	ld	s3, 24(sp)
	ld	s4, 32(sp)
	ld	s5, 40(sp)
	ld	s6, 48(sp)
	ld	s7, 56(sp)
	ld	s8, 64(sp)
	ld	s9, 72(sp)
	ld	s10, 80(sp)
	ld	s11, 88(sp)
	addi	sp, sp, 96
	ret
SYM_FUNC_END(chacha20_zvkb)
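
// Usage sketch (a hypothetical C caller, kept in a comment so this file
// stays pure assembly): in-kernel RISC-V vector code must run between
// kernel_vector_begin() and kernel_vector_end(). The wrapper name below is
// an assumption for illustration; only the chacha20_zvkb() prototype above
// is defined by this file.
//
//	static void chacha20_crypt_sketch(const u32 key[8], const u8 *in,
//					  u8 *out, size_t len, const u32 iv[4])
//	{
//		// len must be a nonzero multiple of CHACHA_BLOCK_SIZE (64).
//		kernel_vector_begin();
//		chacha20_zvkb(key, in, out, len, iv);
//		kernel_vector_end();
//	}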