/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
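//
// ChaCha stream cipher, vectorized for RISC-V: the state matrix for VL
// independent blocks is kept word-sliced across v0-v15, the rotates use the
// Zvkb vror instructions, and the data is gathered and scattered with strided
// segment loads and stores.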

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')

#include <linux/linkage.h>

.text
.option arch, +zvkb

#define STATEP		a0
#define INP		a1
#define OUTP		a2
#define NBLOCKS		a3
#define NROUNDS		a4

#define CONSTS0		a5
#define CONSTS1		a6
#define CONSTS2		a7
#define CONSTS3		t0
#define TMP		t1
#define VL		t2
#define STRIDE		t3
#define ROUND_CTR	t4
#define KEY0		s0
#define KEY1		s1
#define KEY2		s2
#define KEY3		s3
#define KEY4		s4
#define KEY5		s5
#define KEY6		s6
#define KEY7		s7
#define COUNTER		s8
#define NONCE0		s9
#define NONCE1		s10
#define NONCE2		s11

.macro	chacha_round	a0, b0, c0, d0, a1, b1, c1, d1, \
			a2, b2, c2, d2, a3, b3, c3, d3
	// a += b; d ^= a; d = rol(d, 16);
	vadd.vv		\a0, \a0, \b0
	vadd.vv		\a1, \a1, \b1
	vadd.vv		\a2, \a2, \b2
	vadd.vv		\a3, \a3, \b3
	vxor.vv		\d0, \d0, \a0
	vxor.vv		\d1, \d1, \a1
	vxor.vv		\d2, \d2, \a2
	vxor.vv		\d3, \d3, \a3
	vror.vi		\d0, \d0, 32 - 16
	vror.vi		\d1, \d1, 32 - 16
	vror.vi		\d2, \d2, 32 - 16
	vror.vi		\d3, \d3, 32 - 16

	// c += d; b ^= c; b = rol(b, 12);
	vadd.vv		\c0, \c0, \d0
	vadd.vv		\c1, \c1, \d1
	vadd.vv		\c2, \c2, \d2
	vadd.vv		\c3, \c3, \d3
	vxor.vv		\b0, \b0, \c0
	vxor.vv		\b1, \b1, \c1
	vxor.vv		\b2, \b2, \c2
	vxor.vv		\b3, \b3, \c3
	vror.vi		\b0, \b0, 32 - 12
	vror.vi		\b1, \b1, 32 - 12
	vror.vi		\b2, \b2, 32 - 12
	vror.vi		\b3, \b3, 32 - 12

	// a += b; d ^= a; d = rol(d, 8);
	vadd.vv		\a0, \a0, \b0
	vadd.vv		\a1, \a1, \b1
	vadd.vv		\a2, \a2, \b2
	vadd.vv		\a3, \a3, \b3
	vxor.vv		\d0, \d0, \a0
	vxor.vv		\d1, \d1, \a1
	vxor.vv		\d2, \d2, \a2
	vxor.vv		\d3, \d3, \a3
	vror.vi		\d0, \d0, 32 - 8
	vror.vi		\d1, \d1, 32 - 8
	vror.vi		\d2, \d2, 32 - 8
	vror.vi		\d3, \d3, 32 - 8

	// c += d; b ^= c; b = rol(b, 7);
	vadd.vv		\c0, \c0, \d0
	vadd.vv		\c1, \c1, \d1
	vadd.vv		\c2, \c2, \d2
	vadd.vv		\c3, \c3, \d3
	vxor.vv		\b0, \b0, \c0
	vxor.vv		\b1, \b1, \c1
	vxor.vv		\b2, \b2, \c2
	vxor.vv		\b3, \b3, \c3
	vror.vi		\b0, \b0, 32 - 7
	vror.vi		\b1, \b1, 32 - 7
	vror.vi		\b2, \b2, 32 - 7
	vror.vi		\b3, \b3, 32 - 7
.endm

// void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out,
//		    size_t nblocks, int nrounds);
//
// |nblocks| is the number of 64-byte blocks to process, and must be nonzero.
//
// |state| gives the ChaCha state matrix, including the 32-bit counter in
// state->x[12] following the RFC 7539 convention; note that this differs from
// the original ChaCha design, which uses a 64-bit counter in state->x[12..13].
// The updated 32-bit counter is written back to state->x[12] before returning.
SYM_FUNC_START(chacha_zvkb)
	addi		sp, sp, -96
	sd		s0, 0(sp)
	sd		s1, 8(sp)
	sd		s2, 16(sp)
	sd		s3, 24(sp)
	sd		s4, 32(sp)
	sd		s5, 40(sp)
	sd		s6, 48(sp)
	sd		s7, 56(sp)
	sd		s8, 64(sp)
	sd		s9, 72(sp)
	sd		s10, 80(sp)
	sd		s11, 88(sp)

	li		STRIDE, 64

	// Set up the initial state matrix in scalar registers.
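	// For reference, the loaded words form the usual 4x4 ChaCha matrix
	// (an illustrative refresher; byte offsets are into state->x[]):
	//
	//	CONSTS0	CONSTS1	CONSTS2	CONSTS3		("expand 32-byte k")
	//	KEY0	KEY1	KEY2	KEY3
	//	KEY4	KEY5	KEY6	KEY7
	//	COUNTER	NONCE0	NONCE1	NONCE2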
	lw		CONSTS0, 0(STATEP)
	lw		CONSTS1, 4(STATEP)
	lw		CONSTS2, 8(STATEP)
	lw		CONSTS3, 12(STATEP)
	lw		KEY0, 16(STATEP)
	lw		KEY1, 20(STATEP)
	lw		KEY2, 24(STATEP)
	lw		KEY3, 28(STATEP)
	lw		KEY4, 32(STATEP)
	lw		KEY5, 36(STATEP)
	lw		KEY6, 40(STATEP)
	lw		KEY7, 44(STATEP)
	lw		COUNTER, 48(STATEP)
	lw		NONCE0, 52(STATEP)
	lw		NONCE1, 56(STATEP)
	lw		NONCE2, 60(STATEP)

.Lblock_loop:
	// Set vl to the number of blocks to process in this iteration.
	vsetvli		VL, NBLOCKS, e32, m1, ta, ma

	// Set up the initial state matrix for the next VL blocks in v0-v15.
	// v{i} holds the i'th 32-bit word of the state matrix for all blocks.
	// Note that only the counter word, at index 12, differs across blocks.
	vmv.v.x		v0, CONSTS0
	vmv.v.x		v1, CONSTS1
	vmv.v.x		v2, CONSTS2
	vmv.v.x		v3, CONSTS3
	vmv.v.x		v4, KEY0
	vmv.v.x		v5, KEY1
	vmv.v.x		v6, KEY2
	vmv.v.x		v7, KEY3
	vmv.v.x		v8, KEY4
	vmv.v.x		v9, KEY5
	vmv.v.x		v10, KEY6
	vmv.v.x		v11, KEY7
	vid.v		v12
	vadd.vx		v12, v12, COUNTER
	vmv.v.x		v13, NONCE0
	vmv.v.x		v14, NONCE1
	vmv.v.x		v15, NONCE2

	// Load the first half of the input data for each block into v16-v23.
	// v{16+i} holds the i'th 32-bit word for all blocks.
	vlsseg8e32.v	v16, (INP), STRIDE

	mv		ROUND_CTR, NROUNDS
.Lnext_doubleround:
	addi		ROUND_CTR, ROUND_CTR, -2
	// column round
	chacha_round	v0, v4, v8, v12, v1, v5, v9, v13, \
			v2, v6, v10, v14, v3, v7, v11, v15
	// diagonal round
	chacha_round	v0, v5, v10, v15, v1, v6, v11, v12, \
			v2, v7, v8, v13, v3, v4, v9, v14
	bnez		ROUND_CTR, .Lnext_doubleround

	// Load the second half of the input data for each block into v24-v31.
	// v{24+i} holds the {8+i}'th 32-bit word for all blocks.
	addi		TMP, INP, 32
	vlsseg8e32.v	v24, (TMP), STRIDE

	// Finalize the first half of the keystream for each block.
	vadd.vx		v0, v0, CONSTS0
	vadd.vx		v1, v1, CONSTS1
	vadd.vx		v2, v2, CONSTS2
	vadd.vx		v3, v3, CONSTS3
	vadd.vx		v4, v4, KEY0
	vadd.vx		v5, v5, KEY1
	vadd.vx		v6, v6, KEY2
	vadd.vx		v7, v7, KEY3

	// Encrypt/decrypt the first half of the data for each block.
	vxor.vv		v16, v16, v0
	vxor.vv		v17, v17, v1
	vxor.vv		v18, v18, v2
	vxor.vv		v19, v19, v3
	vxor.vv		v20, v20, v4
	vxor.vv		v21, v21, v5
	vxor.vv		v22, v22, v6
	vxor.vv		v23, v23, v7

	// Store the first half of the output data for each block.
	vssseg8e32.v	v16, (OUTP), STRIDE

	// Finalize the second half of the keystream for each block.
	vadd.vx		v8, v8, KEY4
	vadd.vx		v9, v9, KEY5
	vadd.vx		v10, v10, KEY6
	vadd.vx		v11, v11, KEY7
	vid.v		v0
	vadd.vx		v12, v12, COUNTER
	vadd.vx		v13, v13, NONCE0
	vadd.vx		v14, v14, NONCE1
	vadd.vx		v15, v15, NONCE2
	vadd.vv		v12, v12, v0

	// Encrypt/decrypt the second half of the data for each block.
	vxor.vv		v24, v24, v8
	vxor.vv		v25, v25, v9
	vxor.vv		v26, v26, v10
	vxor.vv		v27, v27, v11
	vxor.vv		v28, v28, v12
	vxor.vv		v29, v29, v13
	vxor.vv		v30, v30, v14
	vxor.vv		v31, v31, v15

	// Store the second half of the output data for each block.
	addi		TMP, OUTP, 32
	vssseg8e32.v	v24, (TMP), STRIDE

	// Update the counter, the remaining number of blocks, and the input
	// and output pointers according to the number of blocks processed
	// (VL).
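	// In C terms, the bookkeeping below is roughly (illustrative only):
	//
	//	counter += vl;
	//	nblocks -= vl;
	//	out += vl * 64;		// the slli forms the byte offset vl * 64
	//	in += vl * 64;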
	add		COUNTER, COUNTER, VL
	sub		NBLOCKS, NBLOCKS, VL
	slli		TMP, VL, 6
	add		OUTP, OUTP, TMP
	add		INP, INP, TMP
	bnez		NBLOCKS, .Lblock_loop

	sw		COUNTER, 48(STATEP)
	ld		s0, 0(sp)
	ld		s1, 8(sp)
	ld		s2, 16(sp)
	ld		s3, 24(sp)
	ld		s4, 32(sp)
	ld		s5, 40(sp)
	ld		s6, 48(sp)
	ld		s7, 56(sp)
	ld		s8, 64(sp)
	ld		s9, 72(sp)
	ld		s10, 80(sp)
	ld		s11, 88(sp)
	addi		sp, sp, 96
	ret
SYM_FUNC_END(chacha_zvkb)
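
// Illustrative C-level usage, as a sketch: the wrapper name chacha_crypt_demo
// is hypothetical and not part of this file; the struct chacha_state layout
// (counter at x[12]) is assumed from the function comment above, and 20 is the
// standard ChaCha20 round count.
//
//	void chacha_crypt_demo(struct chacha_state *state, const u8 *src,
//			       u8 *dst, size_t nblocks)
//	{
//		if (nblocks)
//			chacha_zvkb(state, src, dst, nblocks, 20);
//	}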