/* xref: /linux/arch/arm/crypto/blake2b-neon-core.S (revision 3d0fe49454652117522f60bfbefb978ba0e5300b) */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm, NEON accelerated
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	// The arguments to blake2b_compress_neon(), in prototype order:
	// (struct blake2b_state *state, const u8 *block, size_t nblocks, u32 inc)
	STATE		.req	r0
	BLOCK		.req	r1
	NBLOCKS		.req	r2
	INC		.req	r3

	// Pointers to the rotation tables (.Lror24_table and .Lror16_table).
	// They are set up once per call and read by every _blake2b_round.
	ROR24_TABLE	.req	r4
	ROR16_TABLE	.req	r5

	// The original stack pointer, kept while sp is realigned for the
	// 32-byte scratch buffer.
	ORIG_SP		.req	r6

	// NEON registers which contain the message words of the current block.
	// M_0-M_3 are occasionally used for other purposes too.
	//
	// d16-d31 are the halves of q8-q15, so M_(2n) and M_(2n+1) are the low
	// and high 64 bits of q(8+n).  In particular M_0-M_3 are exactly q8-q9.
	M_0		.req	d16
	M_1		.req	d17
	M_2		.req	d18
	M_3		.req	d19
	M_4		.req	d20
	M_5		.req	d21
	M_6		.req	d22
	M_7		.req	d23
	M_8		.req	d24
	M_9		.req	d25
	M_10		.req	d26
	M_11		.req	d27
	M_12		.req	d28
	M_13		.req	d29
	M_14		.req	d30
	M_15		.req	d31

47	.align		4
48	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
49	// instruction.  This is the most efficient way to implement these
50	// rotation amounts with NEON.  (On Cortex-A53 it's the same speed as
51	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
52.Lror24_table:
53	.byte		3, 4, 5, 6, 7, 0, 1, 2
54.Lror16_table:
55	.byte		2, 3, 4, 5, 6, 7, 0, 1
56	// The BLAKE2b initialization vector
57.Lblake2b_IV:
58	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
59	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
60	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
61	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
62
// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers.  The macro arguments s0-s15 give the order in which the message
// words are used in this round.  'final' is 1 if this is the final round.
//
// State layout, as used by the code below:
//	q0-q1 (d0-d3)	= v[0..3]	('a' in the step comments)
//	q2-q3 (d4-d7)	= v[4..7]	('b')
//	q4-q5 (d8-d11)	= v[8..11]	('c')
//	q6-q7 (d12-d15)	= v[12..15]	('d')
//
// ROR24_TABLE and ROR16_TABLE must already point to the vtbl.8 index tables.
// M_0 (d16) doubles as the vtbl index register and q8-q9 (M_0-M_3) as shift
// temporaries; they are reloaded from the stack copy before the macro ends,
// except when final=1.  Only NEON instructions are emitted, so no core
// register and no condition flag is modified.
.macro	_blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15, final=0

	// Mix the columns:
	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
	//
	// Each row of the matrix is a pair of q registers, so all four columns
	// are processed at once with 128-bit operations.

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s0
	vadd.u64	d1, d1, M_\s2
	vadd.u64	d2, d2, M_\s4
	vadd.u64	d3, d3, M_\s6

	// d = ror64(d ^ a, 32);
	//
	// Swapping the two 32-bit halves of each 64-bit lane is a rotation by 32.
	veor		q6, q6, q0
	veor		q7, q7, q1
	vrev64.32	q6, q6
	vrev64.32	q7, q7

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 24);
	//
	// M_0 is borrowed to hold the byte-permutation table for vtbl.8.
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		q2, q2, q4
	veor		q3, q3, q5
	vtbl.8		d4, {d4}, M_0
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// M_0 got clobbered above, so we have to reload it if any of the four
	// message words this step needs happens to be M_0.  Otherwise we don't
	// need to reload it here, as it will just get clobbered again below.
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s1
	vadd.u64	d1, d1, M_\s3
	vadd.u64	d2, d2, M_\s5
	vadd.u64	d3, d3, M_\s7

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		q6, q6, q0
	veor		q7, q7, q1
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 63);
	//
	// This rotation amount isn't a multiple of 8, so it has to be
	// implemented using a pair of shifts, which requires temporary
	// registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
	//
	// With x = b ^ c: vshr leaves x >> 63 in the destination, then vsli
	// inserts x << 1 above that bit, giving ror64(x, 63).
	veor		q8, q2, q4
	veor		q9, q3, q5
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	vld1.8		{q8-q9}, [sp, :256]

	// Mix the diagonals:
	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
	//
	// There are two possible ways to do this: use 'vext' instructions to
	// shift the rows of the matrix so that the diagonals become columns,
	// and undo it afterwards; or just use 64-bit operations on 'd'
	// registers instead of 128-bit operations on 'q' registers.  We use the
	// latter approach, as it performs much better on Cortex-A7.
	//
	// 'a' stays in d0-d3; the matching 'b' registers are d5, d6, d7, d4,
	// the 'c' registers are d10, d11, d8, d9, and the 'd' registers are
	// d15, d12, d13, d14.

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s8
	vadd.u64	d1, d1, M_\s10
	vadd.u64	d2, d2, M_\s12
	vadd.u64	d3, d3, M_\s14

	// d = ror64(d ^ a, 32);
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vrev64.32	d15, d15
	vrev64.32	d12, d12
	vrev64.32	d13, d13
	vrev64.32	d14, d14

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		d5, d5, d10
	veor		d6, d6, d11
	veor		d7, d7, d8
	veor		d4, d4, d9
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0
	vtbl.8		d4, {d4}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// As in the column step, M_0 currently holds the rotation table, so it
	// is reloaded only if this step uses message word 0.
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s9
	vadd.u64	d1, d1, M_\s11
	vadd.u64	d2, d2, M_\s13
	vadd.u64	d3, d3, M_\s15

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 63);
	//
	// The b ^ c values are written to d16-d19 in the order of d4-d7, so
	// that q2-q3 can be rebuilt with the same q-sized shift pair as above.
	veor		d16, d4, d9
	veor		d17, d5, d10
	veor		d18, d6, d11
	veor		d19, d7, d8
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	// Reloading q8-q9 can be skipped on the final round.
.if ! \final
	vld1.8		{q8-q9}, [sp, :256]
.endif
.endm

//
// void blake2b_compress_neon(struct blake2b_state *state,
//			      const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_state are used:
//	u64 h[8];	(inout)
//	u64 t[2];	(inout)
//	u64 f[2];	(in)
//
// For each block, 128 bytes are read from 'block' (which is advanced), 'inc'
// is added to the 128-bit counter t[0..1], 12 rounds are executed, and the
// resulting matrix is folded into h[0..7].
//
// In:		r0 = state, r1 = block, r2 = nblocks, r3 = inc
// Core regs:	r4-r10 are saved on entry and restored on exit; ip and the
//		condition flags are overwritten.
// NEON regs:	all of q0-q15 are overwritten and none is restored here.
// Stack:	7 saved words plus a 32-byte scratch buffer aligned to 32 bytes.
//
// NOTE(review): the block count is only tested after a block has been
// processed, so this assumes nblocks >= 1 -- confirm against the callers.
//
// Instructions indented by two extra spaces are core-register bookkeeping and
// state loads/stores interleaved with the main NEON instruction stream.
//
	.align		5
ENTRY(blake2b_compress_neon)
	push		{r4-r10}

	// Allocate a 32-byte stack buffer that is 32-byte aligned.
	// The unaligned sp is kept in ORIG_SP so it can be restored on exit.
	mov		ORIG_SP, sp
	sub		ip, sp, #32
	bic		ip, ip, #31
	mov		sp, ip

	adr		ROR24_TABLE, .Lror24_table
	adr		ROR16_TABLE, .Lror16_table

	// h[0..7] is loaded only once: at the bottom of the loop q0-q3 hold the
	// new h[0..7], which is also the initial v[0..7] for the next block.
	// Both here and at the bottom of the loop, ip is left pointing just
	// past h[7], i.e. at t[0].
	mov		ip, STATE
	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
.Lnext_block:
	  adr		r10, .Lblake2b_IV
	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
	  vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
	  adds		r7, r7, INC		// Increment counter
	bcs		.Lslow_inc_ctr
	// Fast path: no carry out of the low 32 bits, so only the low word of
	// t[0] changed.
	vmov.i32	d28[0], r7
	vst1.64		{d28}, [ip]		// Update t[0]
.Linc_ctr_done:

	// Load the next message block and finish initializing the state matrix
	// 'v'.  Fortunately, there are exactly enough NEON registers to fit the
	// entire state matrix in q0-q7 and the entire message block in q8-15.
	//
	// However, _blake2b_round also needs some extra registers for rotates,
	// so we have to spill some registers.  It's better to spill the message
	// registers than the state registers, as the message doesn't change.
	// Therefore we store a copy of the first 32 bytes of the message block
	// (q8-q9) in an aligned buffer on the stack so that they can be
	// reloaded when needed.  (We could just reload directly from the
	// message buffer, but it's faster to use aligned loads.)
	//
	// q14-q15 hold the updated t[0..1] and f[0..1] until they have been
	// XORed into v[12..15]; only then are they overwritten with the last
	// 32 bytes of the message block.
	vld1.8		{q8-q9}, [BLOCK]!
	  veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
	vld1.8		{q10-q11}, [BLOCK]!
	  veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
	vld1.8		{q12-q13}, [BLOCK]!
	vst1.8		{q8-q9}, [sp, :256]
	  mov		ip, STATE
	vld1.8		{q14-q15}, [BLOCK]!

	// Execute the rounds.  Each round is provided the order in which it
	// needs to use the message words.  The last two of the twelve rounds
	// reuse the orders of the first two.
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
			final=1

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	// The message registers are no longer needed, so q8-q11 are reused to
	// hold the old h[0..7].  The block count is decremented here, and the
	// vector instructions that follow leave the flags intact for the 'bne'.
	  vld1.64	{q8-q9}, [ip]!		// Load old h[0..3]
	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
	  vld1.64	{q10-q11}, [ip]		// Load old h[4..7]
	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
	  mov		ip, STATE
	  subs		NBLOCKS, NBLOCKS, #1	// nblocks--
	  vst1.64	{q0-q1}, [ip]!		// Store new h[0..3]
	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
	  vst1.64	{q2-q3}, [ip]!		// Store new h[4..7]

	// Advance to the next block, if there is one.
	bne		.Lnext_block		// nblocks != 0?

	mov		sp, ORIG_SP
	pop		{r4-r10}
	mov		pc, lr

.Lslow_inc_ctr:
	// Handle the case where the counter overflowed its low 32 bits, by
	// carrying the overflow bit into the full 128-bit counter.
	//
	// The vmov below does not alter the carry flag set by the 'adds', so
	// the adcs/adc chain propagates it through the upper word of t[0] (r8)
	// and both words of t[1] (r9, r10).
	vmov		r9, r10, d29
	adcs		r8, r8, #0
	adcs		r9, r9, #0
	adc		r10, r10, #0
	vmov		d28, r7, r8
	vmov		d29, r9, r10
	vst1.64		{q14}, [ip]		// Update t[0] and t[1]
	b		.Linc_ctr_done
ENDPROC(blake2b_compress_neon)
348