/* xref: /linux/lib/crypto/arm64/sha3-ce-core.S (revision 7fc2cd2e4b398c57c9cf961cfea05eadbf34c05c) */
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Core SHA-3 transform using v8.2 Crypto Extensions
 *
 * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
14
	/*
	 * Define symbols .Lv<n>.2d / .Lv<n>.16b with value <n> for every
	 * SIMD register, so the .inst-based macros below can turn a vector
	 * register operand written as e.g. "v17.16b" into its 5-bit
	 * encoding via .L\rd etc.
	 */
	.irp	b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
	.set	.Lv\b\().2d, \b
	.set	.Lv\b\().16b, \b
	.endr
19
	/*
	 * ARMv8.2 Crypto Extensions (FEAT_SHA3) instructions, emitted as
	 * raw opcodes via .inst so the file assembles even when the
	 * toolchain does not know the sha3 extension.  Register numbers
	 * come from the .Lv* symbols defined above.
	 */

	/* eor3: rd = rn ^ rm ^ ra (three-way exclusive OR) */
	.macro	eor3, rd, rn, rm, ra
	.inst	0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
	.endm

	/* rax1: rd = rn ^ rol64(rm, 1), per 64-bit lane */
	.macro	rax1, rd, rn, rm
	.inst	0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
	.endm

	/* bcax: rd = rn ^ (rm & ~ra) -- the Keccak chi primitive */
	.macro	bcax, rd, rn, rm, ra
	.inst	0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
	.endm

	/* xar: rd = ror64(rn ^ rm, imm6), per 64-bit lane */
	.macro	xar, rd, rn, rm, imm6
	.inst	0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
	.endm
38
	/*
	 * size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
	 *			    size_t nblocks, size_t block_size)
	 *
	 * block_size is assumed to be one of 72 (SHA3-512), 104 (SHA3-384), 136
	 * (SHA3-256 and SHAKE256), 144 (SHA3-224), or 168 (SHAKE128).
	 *
	 * Returns (in x0) the number of blocks left unprocessed: 0 normally,
	 * non-zero only when cond_yield cut the loop short so the caller can
	 * reschedule and call back in with the remaining data.
	 *
	 * Register conventions:
	 *   x0      state (25 u64 Keccak lanes), x1 input, x2 blocks remaining,
	 *   x3      block size (the sponge rate) in bytes
	 *   v0-v24  one Keccak lane each; only the low 64 bits are significant
	 *   v25-v31 scratch: input staging, theta parities, rho/pi temporaries,
	 *           and the per-round iota constant (v31)
	 *   w8      round down-counter, x9 round-constant pointer
	 */
	.text
SYM_FUNC_START(sha3_ce_transform)
	/* load state: the 25 lanes into the low 64 bits of v0-v24 */
	add	x8, x0, #32
	ld1	{ v0.1d- v3.1d}, [x0]
	ld1	{ v4.1d- v7.1d}, [x8], #32
	ld1	{ v8.1d-v11.1d}, [x8], #32
	ld1	{v12.1d-v15.1d}, [x8], #32
	ld1	{v16.1d-v19.1d}, [x8], #32
	ld1	{v20.1d-v23.1d}, [x8], #32
	ld1	{v24.1d}, [x8]

0:	sub	x2, x2, #1		/* account for the block absorbed below */
	mov	w8, #24			/* Keccak-f[1600] is 24 rounds */
	adr_l	x9, .Lsha3_rcon

	/*
	 * Absorb: xor block_size bytes of input into the first
	 * block_size / 8 lanes.  Every supported rate covers at least
	 * 9 lanes (72 bytes), so that much is unconditional; the rest
	 * is keyed off x3.
	 */
	/* load input */
	ld1	{v25.8b-v28.8b}, [x1], #32
	ld1	{v29.8b}, [x1], #8
	eor	v0.8b, v0.8b, v25.8b
	eor	v1.8b, v1.8b, v26.8b
	eor	v2.8b, v2.8b, v27.8b
	eor	v3.8b, v3.8b, v28.8b
	eor	v4.8b, v4.8b, v29.8b

	ld1	{v25.8b-v28.8b}, [x1], #32
	eor	v5.8b, v5.8b, v25.8b
	eor	v6.8b, v6.8b, v26.8b
	eor	v7.8b, v7.8b, v27.8b
	eor	v8.8b, v8.8b, v28.8b
	cmp	x3, #72
	b.eq	3f	/* SHA3-512 (block_size=72)? */

	ld1	{v25.8b-v28.8b}, [x1], #32
	eor	v9.8b, v9.8b, v25.8b
	eor	v10.8b, v10.8b, v26.8b
	eor	v11.8b, v11.8b, v27.8b
	eor	v12.8b, v12.8b, v28.8b
	cmp	x3, #104
	b.eq	3f	/* SHA3-384 (block_size=104)? */

	ld1	{v25.8b-v28.8b}, [x1], #32
	eor	v13.8b, v13.8b, v25.8b
	eor	v14.8b, v14.8b, v26.8b
	eor	v15.8b, v15.8b, v27.8b
	eor	v16.8b, v16.8b, v28.8b
	cmp	x3, #144
	b.lt	3f	/* SHA3-256 or SHAKE256 (block_size=136)? */
	b.eq	2f	/* SHA3-224 (block_size=144)? */

	/* SHAKE128 (block_size=168) */
	ld1	{v25.8b-v28.8b}, [x1], #32
	eor	v17.8b, v17.8b, v25.8b
	eor	v18.8b, v18.8b, v26.8b
	eor	v19.8b, v19.8b, v27.8b
	eor	v20.8b, v20.8b, v28.8b
	b	3f
2:
	/* SHA3-224 (block_size=144): one extra lane beyond SHA3-256 */
	ld1	{v25.8b}, [x1], #8
	eor	v17.8b, v17.8b, v25.8b

	/*
	 * One round of Keccak-f[1600], entirely in registers:
	 *   theta: column parities via eor3, then bc[x] = C[x-1] ^
	 *          rol(C[x+1], 1) via rax1;
	 *   rho+pi fused: xar xors a lane with its theta parity and
	 *          rotates it straight into its pi-permuted position;
	 *   chi:   bcax (a ^ (b & ~c)) over each row of five lanes;
	 *   iota:  xor the round constant into lane (0, 0).
	 */
3:	sub	w8, w8, #1		/* one fewer round to go */

	/* theta: v25-v29 = the five column parities C[0..4] */
	eor3	v29.16b,  v4.16b,  v9.16b, v14.16b
	eor3	v26.16b,  v1.16b,  v6.16b, v11.16b
	eor3	v28.16b,  v3.16b,  v8.16b, v13.16b
	eor3	v25.16b,  v0.16b,  v5.16b, v10.16b
	eor3	v27.16b,  v2.16b,  v7.16b, v12.16b
	eor3	v29.16b, v29.16b, v19.16b, v24.16b
	eor3	v26.16b, v26.16b, v16.16b, v21.16b
	eor3	v28.16b, v28.16b, v18.16b, v23.16b
	eor3	v25.16b, v25.16b, v15.16b, v20.16b
	eor3	v27.16b, v27.16b, v17.16b, v22.16b

	rax1	v30.2d, v29.2d, v26.2d	// bc[0]
	rax1	v26.2d, v26.2d, v28.2d	// bc[2]
	rax1	v28.2d, v28.2d, v25.2d	// bc[4]
	rax1	v25.2d, v25.2d, v27.2d	// bc[1]
	rax1	v27.2d, v27.2d, v29.2d	// bc[3]

	/* theta + rho + pi; lane (0,0) has rotation 0, hence plain eor */
	eor	 v0.16b,  v0.16b, v30.16b
	xar	 v29.2d,   v1.2d,  v25.2d, (64 - 1)
	xar	  v1.2d,   v6.2d,  v25.2d, (64 - 44)
	xar	  v6.2d,   v9.2d,  v28.2d, (64 - 20)
	xar	  v9.2d,  v22.2d,  v26.2d, (64 - 61)
	xar	 v22.2d,  v14.2d,  v28.2d, (64 - 39)
	xar	 v14.2d,  v20.2d,  v30.2d, (64 - 18)
	xar	 v31.2d,   v2.2d,  v26.2d, (64 - 62)
	xar	  v2.2d,  v12.2d,  v26.2d, (64 - 43)
	xar	 v12.2d,  v13.2d,  v27.2d, (64 - 25)
	xar	 v13.2d,  v19.2d,  v28.2d, (64 - 8)
	xar	 v19.2d,  v23.2d,  v27.2d, (64 - 56)
	xar	 v23.2d,  v15.2d,  v30.2d, (64 - 41)
	xar	 v15.2d,   v4.2d,  v28.2d, (64 - 27)
	xar	 v28.2d,  v24.2d,  v28.2d, (64 - 14)
	xar	 v24.2d,  v21.2d,  v25.2d, (64 - 2)
	xar	  v8.2d,   v8.2d,  v27.2d, (64 - 55)
	xar	  v4.2d,  v16.2d,  v25.2d, (64 - 45)
	xar	 v16.2d,   v5.2d,  v30.2d, (64 - 36)
	xar	  v5.2d,   v3.2d,  v27.2d, (64 - 28)
	xar	 v27.2d,  v18.2d,  v27.2d, (64 - 21)
	xar	  v3.2d,  v17.2d,  v26.2d, (64 - 15)
	xar	 v25.2d,  v11.2d,  v25.2d, (64 - 10)
	xar	 v26.2d,   v7.2d,  v26.2d, (64 - 6)
	xar	 v30.2d,  v10.2d,  v30.2d, (64 - 3)

	/* chi: one bcax group per row of five lanes */
	bcax	v20.16b, v31.16b, v22.16b,  v8.16b
	bcax	v21.16b,  v8.16b, v23.16b, v22.16b
	bcax	v22.16b, v22.16b, v24.16b, v23.16b
	bcax	v23.16b, v23.16b, v31.16b, v24.16b
	bcax	v24.16b, v24.16b,  v8.16b, v31.16b

	ld1r	{v31.2d}, [x9], #8	/* next iota round constant */

	bcax	v17.16b, v25.16b, v19.16b,  v3.16b
	bcax	v18.16b,  v3.16b, v15.16b, v19.16b
	bcax	v19.16b, v19.16b, v16.16b, v15.16b
	bcax	v15.16b, v15.16b, v25.16b, v16.16b
	bcax	v16.16b, v16.16b,  v3.16b, v25.16b

	bcax	v10.16b, v29.16b, v12.16b, v26.16b
	bcax	v11.16b, v26.16b, v13.16b, v12.16b
	bcax	v12.16b, v12.16b, v14.16b, v13.16b
	bcax	v13.16b, v13.16b, v29.16b, v14.16b
	bcax	v14.16b, v14.16b, v26.16b, v29.16b

	bcax	 v7.16b, v30.16b,  v9.16b,  v4.16b
	bcax	 v8.16b,  v4.16b,  v5.16b,  v9.16b
	bcax	 v9.16b,  v9.16b,  v6.16b,  v5.16b
	bcax	 v5.16b,  v5.16b, v30.16b,  v6.16b
	bcax	 v6.16b,  v6.16b,  v4.16b, v30.16b

	bcax	 v3.16b, v27.16b,  v0.16b, v28.16b
	bcax	 v4.16b, v28.16b,  v1.16b,  v0.16b
	bcax	 v0.16b,  v0.16b,  v2.16b,  v1.16b
	bcax	 v1.16b,  v1.16b, v27.16b,  v2.16b
	bcax	 v2.16b,  v2.16b, v28.16b, v27.16b

	eor	 v0.16b,  v0.16b, v31.16b	/* iota */

	cbnz	w8, 3b			/* next round */
	/* if rescheduling is due, save state and bail with x2 blocks left */
	cond_yield 4f, x8, x9
	cbnz	x2, 0b			/* next block */

	/* save state */
4:	st1	{ v0.1d- v3.1d}, [x0], #32
	st1	{ v4.1d- v7.1d}, [x0], #32
	st1	{ v8.1d-v11.1d}, [x0], #32
	st1	{v12.1d-v15.1d}, [x0], #32
	st1	{v16.1d-v19.1d}, [x0], #32
	st1	{v20.1d-v23.1d}, [x0], #32
	st1	{v24.1d}, [x0]
	mov	x0, x2			/* return blocks not yet processed */
	ret
SYM_FUNC_END(sha3_ce_transform)
202
	/*
	 * The 24 round constants used by the iota step of Keccak-f[1600],
	 * one 64-bit value per round, consumed in order via ld1r above.
	 */
	.section	".rodata", "a"
	.align		8
.Lsha3_rcon:
	.quad	0x0000000000000001, 0x0000000000008082, 0x800000000000808a
	.quad	0x8000000080008000, 0x000000000000808b, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000000008009, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000080008009, 0x000000008000000a
	.quad	0x000000008000808b, 0x800000000000008b, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008002, 0x8000000000000080
	.quad	0x000000000000800a, 0x800000008000000a, 0x8000000080008081
	.quad	0x8000000000008080, 0x0000000080000001, 0x8000000080008008
214