/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')

#include <linux/linkage.h>

.text
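// Let the assembler accept Zvkb instructions (e.g. vror.vi) in this file,
// independently of the base architecture the rest of the kernel is built for.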
.option arch, +zvkb

#define STATEP		a0
#define INP		a1
#define OUTP		a2
#define NBLOCKS		a3
#define NROUNDS		a4

#define CONSTS0		a5
#define CONSTS1		a6
#define CONSTS2		a7
#define CONSTS3		t0
#define TMP		t1
#define VL		t2
#define STRIDE		t3
#define ROUND_CTR	t4
#define KEY0		s0
#define KEY1		s1
#define KEY2		s2
#define KEY3		s3
#define KEY4		s4
#define KEY5		s5
#define KEY6		s6
#define KEY7		s7
#define COUNTER		s8
#define NONCE0		s9
#define NONCE1		s10
#define NONCE2		s11

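// Perform one ChaCha round on four independent quarter-rounds at once.  The
// corresponding steps of the four quarter-rounds are interleaved so that the
// result of each vector instruction is not consumed by the instruction
// immediately following it.  Zvkb provides only a rotate-right, so each
// rotate-left by n is expressed as vror.vi by 32 - n.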
.macro	chacha_round	a0, b0, c0, d0,  a1, b1, c1, d1, \
			a2, b2, c2, d2,  a3, b3, c3, d3
	// a += b; d ^= a; d = rol(d, 16);
	vadd.vv		\a0, \a0, \b0
	vadd.vv		\a1, \a1, \b1
	vadd.vv		\a2, \a2, \b2
	vadd.vv		\a3, \a3, \b3
	vxor.vv		\d0, \d0, \a0
	vxor.vv		\d1, \d1, \a1
	vxor.vv		\d2, \d2, \a2
	vxor.vv		\d3, \d3, \a3
	vror.vi		\d0, \d0, 32 - 16
	vror.vi		\d1, \d1, 32 - 16
	vror.vi		\d2, \d2, 32 - 16
	vror.vi		\d3, \d3, 32 - 16

	// c += d; b ^= c; b = rol(b, 12);
	vadd.vv		\c0, \c0, \d0
	vadd.vv		\c1, \c1, \d1
	vadd.vv		\c2, \c2, \d2
	vadd.vv		\c3, \c3, \d3
	vxor.vv		\b0, \b0, \c0
	vxor.vv		\b1, \b1, \c1
	vxor.vv		\b2, \b2, \c2
	vxor.vv		\b3, \b3, \c3
	vror.vi		\b0, \b0, 32 - 12
	vror.vi		\b1, \b1, 32 - 12
	vror.vi		\b2, \b2, 32 - 12
	vror.vi		\b3, \b3, 32 - 12

	// a += b; d ^= a; d = rol(d, 8);
	vadd.vv		\a0, \a0, \b0
	vadd.vv		\a1, \a1, \b1
	vadd.vv		\a2, \a2, \b2
	vadd.vv		\a3, \a3, \b3
	vxor.vv		\d0, \d0, \a0
	vxor.vv		\d1, \d1, \a1
	vxor.vv		\d2, \d2, \a2
	vxor.vv		\d3, \d3, \a3
	vror.vi		\d0, \d0, 32 - 8
	vror.vi		\d1, \d1, 32 - 8
	vror.vi		\d2, \d2, 32 - 8
	vror.vi		\d3, \d3, 32 - 8

	// c += d; b ^= c; b = rol(b, 7);
	vadd.vv		\c0, \c0, \d0
	vadd.vv		\c1, \c1, \d1
	vadd.vv		\c2, \c2, \d2
	vadd.vv		\c3, \c3, \d3
	vxor.vv		\b0, \b0, \c0
	vxor.vv		\b1, \b1, \c1
	vxor.vv		\b2, \b2, \c2
	vxor.vv		\b3, \b3, \c3
	vror.vi		\b0, \b0, 32 - 7
	vror.vi		\b1, \b1, 32 - 7
	vror.vi		\b2, \b2, 32 - 7
	vror.vi		\b3, \b3, 32 - 7
.endm

// void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out,
//		    size_t nblocks, int nrounds);
//
// |nblocks| is the number of 64-byte blocks to process, and must be nonzero.
//
// |state| gives the ChaCha state matrix, including the 32-bit counter in
// state->x[12] following the RFC 7539 convention; note that this differs from
// the original ChaCha paper, which uses a 64-bit counter in state->x[12..13].
// The updated 32-bit counter is written back to state->x[12] before returning.
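//
// For reference only, the overall operation is equivalent to the following C
// sketch, where chacha_permute_block() stands in for a hypothetical scalar
// helper that runs |nrounds| rounds over one block, adds the input state, and
// serializes the result into a 64-byte keystream block:
//
//	for (i = 0; i < nblocks; i++) {
//		u8 keystream[64];
//
//		chacha_permute_block(state, keystream, nrounds);
//		for (j = 0; j < 64; j++)
//			out[64 * i + j] = in[64 * i + j] ^ keystream[j];
//		state->x[12]++;		/* advance the block counter */
//	}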
SYM_FUNC_START(chacha_zvkb)
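	// Save the callee-saved registers s0-s11; they hold the key, counter,
	// and nonce words across the block loop.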
	addi		sp, sp, -96
	sd		s0, 0(sp)
	sd		s1, 8(sp)
	sd		s2, 16(sp)
	sd		s3, 24(sp)
	sd		s4, 32(sp)
	sd		s5, 40(sp)
	sd		s6, 48(sp)
	sd		s7, 56(sp)
	sd		s8, 64(sp)
	sd		s9, 72(sp)
	sd		s10, 80(sp)
	sd		s11, 88(sp)

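	// Byte stride between the same word of consecutive blocks in memory,
	// i.e. the 64-byte ChaCha block size, used by the strided segment
	// loads and stores below.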
	li		STRIDE, 64

	// Set up the initial state matrix in scalar registers.
	lw		CONSTS0, 0(STATEP)
	lw		CONSTS1, 4(STATEP)
	lw		CONSTS2, 8(STATEP)
	lw		CONSTS3, 12(STATEP)
	lw		KEY0, 16(STATEP)
	lw		KEY1, 20(STATEP)
	lw		KEY2, 24(STATEP)
	lw		KEY3, 28(STATEP)
	lw		KEY4, 32(STATEP)
	lw		KEY5, 36(STATEP)
	lw		KEY6, 40(STATEP)
	lw		KEY7, 44(STATEP)
	lw		COUNTER, 48(STATEP)
	lw		NONCE0, 52(STATEP)
	lw		NONCE1, 56(STATEP)
	lw		NONCE2, 60(STATEP)

.Lblock_loop:
	// Set vl to the number of blocks to process in this iteration.
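	// With e32 (32-bit elements) and m1 (LMUL=1), VL is capped at VLEN/32,
	// so e.g. VLEN=128 processes up to 4 blocks per iteration.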
	vsetvli		VL, NBLOCKS, e32, m1, ta, ma

	// Set up the initial state matrix for the next VL blocks in v0-v15.
	// v{i} holds the i'th 32-bit word of the state matrix for all blocks.
	// Note that only the counter word, at index 12, differs across blocks.
	vmv.v.x		v0, CONSTS0
	vmv.v.x		v1, CONSTS1
	vmv.v.x		v2, CONSTS2
	vmv.v.x		v3, CONSTS3
	vmv.v.x		v4, KEY0
	vmv.v.x		v5, KEY1
	vmv.v.x		v6, KEY2
	vmv.v.x		v7, KEY3
	vmv.v.x		v8, KEY4
	vmv.v.x		v9, KEY5
	vmv.v.x		v10, KEY6
	vmv.v.x		v11, KEY7
	vid.v		v12
	vadd.vx		v12, v12, COUNTER
	vmv.v.x		v13, NONCE0
	vmv.v.x		v14, NONCE1
	vmv.v.x		v15, NONCE2

	// Load the first half of the input data for each block into v16-v23.
	// v{16+i} holds the i'th 32-bit word for all blocks.
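	// The strided segment load reads, for each of the VL blocks, 8
	// consecutive 32-bit words starting STRIDE (64) bytes apart, placing
	// word i of every block in v{16+i}.  The data is handled in two
	// 32-byte halves: v0-v15 hold the state, v16-v23 the first half of
	// the data, and v24-v31 (loaded later) the second half.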
	vlsseg8e32.v	v16, (INP), STRIDE

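	// Run the rounds.  Each loop iteration below performs two rounds (one
	// column round and one diagonal round), so NROUNDS must be even, e.g.
	// 20 for ChaCha20.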
	mv		ROUND_CTR, NROUNDS
.Lnext_doubleround:
	addi		ROUND_CTR, ROUND_CTR, -2
	// column round
	chacha_round	v0, v4, v8, v12, v1, v5, v9, v13, \
			v2, v6, v10, v14, v3, v7, v11, v15
	// diagonal round
	chacha_round	v0, v5, v10, v15, v1, v6, v11, v12, \
			v2, v7, v8, v13, v3, v4, v9, v14
	bnez		ROUND_CTR, .Lnext_doubleround

	// Load the second half of the input data for each block into v24-v31.
	// v{24+i} holds the {8+i}'th 32-bit word for all blocks.
	addi		TMP, INP, 32
	vlsseg8e32.v	v24, (TMP), STRIDE

	// Finalize the first half of the keystream for each block.
	vadd.vx		v0, v0, CONSTS0
	vadd.vx		v1, v1, CONSTS1
	vadd.vx		v2, v2, CONSTS2
	vadd.vx		v3, v3, CONSTS3
	vadd.vx		v4, v4, KEY0
	vadd.vx		v5, v5, KEY1
	vadd.vx		v6, v6, KEY2
	vadd.vx		v7, v7, KEY3

	// Encrypt/decrypt the first half of the data for each block.
	vxor.vv		v16, v16, v0
	vxor.vv		v17, v17, v1
	vxor.vv		v18, v18, v2
	vxor.vv		v19, v19, v3
	vxor.vv		v20, v20, v4
	vxor.vv		v21, v21, v5
	vxor.vv		v22, v22, v6
	vxor.vv		v23, v23, v7

	// Store the first half of the output data for each block.
	vssseg8e32.v	v16, (OUTP), STRIDE

	// Finalize the second half of the keystream for each block.
	vadd.vx		v8, v8, KEY4
	vadd.vx		v9, v9, KEY5
	vadd.vx		v10, v10, KEY6
	vadd.vx		v11, v11, KEY7
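	// v0 is no longer needed after the first-half XOR above, so reuse it
	// for the lane indices: block i must have COUNTER + i added to its
	// counter word in v12.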
	vid.v		v0
	vadd.vx		v12, v12, COUNTER
	vadd.vx		v13, v13, NONCE0
	vadd.vx		v14, v14, NONCE1
	vadd.vx		v15, v15, NONCE2
	vadd.vv		v12, v12, v0

	// Encrypt/decrypt the second half of the data for each block.
	vxor.vv		v24, v24, v8
	vxor.vv		v25, v25, v9
	vxor.vv		v26, v26, v10
	vxor.vv		v27, v27, v11
	vxor.vv		v28, v28, v12
	vxor.vv		v29, v29, v13
	vxor.vv		v30, v30, v14
	vxor.vv		v31, v31, v15

	// Store the second half of the output data for each block.
	addi		TMP, OUTP, 32
	vssseg8e32.v	v24, (TMP), STRIDE

	// Update the counter, the remaining number of blocks, and the input and
	// output pointers according to the number of blocks processed (VL).
	add		COUNTER, COUNTER, VL
	sub		NBLOCKS, NBLOCKS, VL
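	// Each block is 64 bytes, so advance the data pointers by VL * 64.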
	slli		TMP, VL, 6
	add		OUTP, OUTP, TMP
	add		INP, INP, TMP
	bnez		NBLOCKS, .Lblock_loop

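	// Write the updated 32-bit counter back to the state matrix, then
	// restore the callee-saved registers and return.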
	sw		COUNTER, 48(STATEP)
	ld		s0, 0(sp)
	ld		s1, 8(sp)
	ld		s2, 16(sp)
	ld		s3, 24(sp)
	ld		s4, 32(sp)
	ld		s5, 40(sp)
	ld		s6, 48(sp)
	ld		s7, 56(sp)
	ld		s8, 64(sp)
	ld		s9, 72(sp)
	ld		s10, 80(sp)
	ld		s11, 88(sp)
	addi		sp, sp, 96
	ret
SYM_FUNC_END(chacha_zvkb)