/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm optimized with ARM NEON instructions.  On ARM
 * processors that have NEON support but not the ARMv8 Crypto Extensions,
 * typically this BLAKE2b implementation is much faster than the SHA-2 family
 * and slightly faster than SHA-1.
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	// The arguments to blake2b_compress_neon()
	CTX		.req	r0
	DATA		.req	r1
	NBLOCKS		.req	r2
	INC		.req	r3

	// Pointers to the rotation tables
	ROR24_TABLE	.req	r4
	ROR16_TABLE	.req	r5

	// The original stack pointer
	ORIG_SP		.req	r6

	// NEON registers which contain the message words of the current block.
	// M_0-M_3 are occasionally used for other purposes too.
	M_0		.req	d16
	M_1		.req	d17
	M_2		.req	d18
	M_3		.req	d19
	M_4		.req	d20
	M_5		.req	d21
	M_6		.req	d22
	M_7		.req	d23
	M_8		.req	d24
	M_9		.req	d25
	M_10		.req	d26
	M_11		.req	d27
	M_12		.req	d28
	M_13		.req	d29
	M_14		.req	d30
	M_15		.req	d31

	.align		4
	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
	// instruction.  This is the most efficient way to implement these
	// rotation amounts with NEON.  (On Cortex-A53 it's the same speed as
	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
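	//
	// These byte permutations work because, for a u64 held little-endian
	// in a D register, rotating right by a multiple of 8 bits just
	// rotates the byte array: result byte i is source byte (i + n/8) % 8.
	// Illustrative C equivalent of what each vtbl.8 below computes
	// (helper name is for exposition only):
	//
	//	static inline u64 ror64_by_n(u64 x, int n)	/* n = 24 or 16 */
	//	{
	//		return (x >> n) | (x << (64 - n));
	//	}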
.Lror24_table:
	.byte		3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
	.byte		2, 3, 4, 5, 6, 7, 0, 1
	// The BLAKE2b initialization vector
.Lblake2b_IV:
	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179

// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers.  The macro arguments s0-s15 give the order in which the message
// words are used in this round.  'final' is 1 if this is the final round.
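//
// For reference, each column and diagonal mix below is the standard BLAKE2b
// G function (RFC 7693), shown here as illustrative C:
//
//	G(a, b, c, d, x, y):
//		a += b + x;  d = ror64(d ^ a, 32);
//		c += d;      b = ror64(b ^ c, 24);
//		a += b + y;  d = ror64(d ^ a, 16);
//		c += d;      b = ror64(b ^ c, 63);
//
// with (x, y) drawn from the message words in the order given by s0-s15.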
.macro	_blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15, final=0

	// Mix the columns:
	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s0
	vadd.u64	d1, d1, M_\s2
	vadd.u64	d2, d2, M_\s4
	vadd.u64	d3, d3, M_\s6

	// d = ror64(d ^ a, 32);
	veor		q6, q6, q0
	veor		q7, q7, q1
	vrev64.32	q6, q6
	vrev64.32	q7, q7

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		q2, q2, q4
	veor		q3, q3, q5
	vtbl.8		d4, {d4}, M_0
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// M_0 got clobbered above, so we have to reload it if any of the four
	// message words this step needs happens to be M_0.  Otherwise we don't
	// need to reload it here, as it will just get clobbered again below.
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s1
	vadd.u64	d1, d1, M_\s3
	vadd.u64	d2, d2, M_\s5
	vadd.u64	d3, d3, M_\s7

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		q6, q6, q0
	veor		q7, q7, q1
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 63);
	//
	// This rotation amount isn't a multiple of 8, so it has to be
	// implemented using a pair of shifts, which requires temporary
	// registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
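	//
	// The shift pair works because ror64(x, 63) == rol64(x, 1):
	// vshr.u64 #63 leaves just bit 63 of x in bit 0, and vsli.u64 #1
	// then inserts x << 1 above it, yielding (x >> 63) | (x << 1).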
	veor		q8, q2, q4
	veor		q9, q3, q5
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	vld1.8		{q8-q9}, [sp, :256]

	// Mix the diagonals:
	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
	//
	// There are two possible ways to do this: use 'vext' instructions to
	// shift the rows of the matrix so that the diagonals become columns,
	// and undo it afterwards; or just use 64-bit operations on 'd'
	// registers instead of 128-bit operations on 'q' registers.  We use the
	// latter approach, as it performs much better on Cortex-A7.
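	//
	// Since the state is laid out with v[i] in d(i) (each q(j) holds
	// v[2*j] and v[2*j+1]), the diagonal (v[0], v[5], v[10], v[15]) is
	// simply (d0, d5, d10, d15), and likewise for the other three.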

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s8
	vadd.u64	d1, d1, M_\s10
	vadd.u64	d2, d2, M_\s12
	vadd.u64	d3, d3, M_\s14

	// d = ror64(d ^ a, 32);
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vrev64.32	d15, d15
	vrev64.32	d12, d12
	vrev64.32	d13, d13
	vrev64.32	d14, d14

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		d5, d5, d10
	veor		d6, d6, d11
	veor		d7, d7, d8
	veor		d4, d4, d9
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0
	vtbl.8		d4, {d4}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s9
	vadd.u64	d1, d1, M_\s11
	vadd.u64	d2, d2, M_\s13
	vadd.u64	d3, d3, M_\s15

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 63);
	veor		d16, d4, d9
	veor		d17, d5, d10
	veor		d18, d6, d11
	veor		d19, d7, d8
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	// Reloading q8-q9 can be skipped on the final round.
.if ! \final
	vld1.8		{q8-q9}, [sp, :256]
.endif
.endm

//
// void blake2b_compress_neon(struct blake2b_ctx *ctx,
//			      const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_ctx are used:
//	u64 h[8];	(inout)
//	u64 t[2];	(inout)
//	u64 f[2];	(in)
//
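// 'inc' is the amount added to the 128-bit message byte counter t before each
// block is compressed (typically the 128-byte block size, or the number of
// bytes actually hashed in a final partial block).  A rough, illustrative C
// sketch of what each iteration of the loop below computes:
//
//	do {
//		t += inc;			// 128-bit add
//		v[0..7]   = h[0..7];
//		v[8..15]  = IV[0..7];
//		v[12..13] ^= t[0..1];
//		v[14..15] ^= f[0..1];
//		/* twelve _blake2b_round invocations on v[] and the block */
//		for (i = 0; i < 8; i++)
//			h[i] ^= v[i] ^ v[i + 8];
//		data += 128;
//	} while (--nblocks);
//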
	.align		5
ENTRY(blake2b_compress_neon)
	push		{r4-r10}

	// Allocate a 32-byte stack buffer that is 32-byte aligned.
	mov		ORIG_SP, sp
	sub		ip, sp, #32
	bic		ip, ip, #31
	mov		sp, ip

	adr		ROR24_TABLE, .Lror24_table
	adr		ROR16_TABLE, .Lror16_table

	mov		ip, CTX
	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
.Lnext_block:
	  adr		r10, .Lblake2b_IV
	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
	  vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
	  adds		r7, r7, INC		// Increment counter
	bcs		.Lslow_inc_ctr
	vmov.i32	d28[0], r7
	vst1.64		{d28}, [ip]		// Update t[0]
.Linc_ctr_done:

	// Load the next message block and finish initializing the state matrix
	// 'v'.  Fortunately, there are exactly enough NEON registers to fit the
	// entire state matrix in q0-q7 and the entire message block in q8-15.
	//
	// However, _blake2b_round also needs some extra registers for rotates,
	// so we have to spill some registers.  It's better to spill the message
	// registers than the state registers, as the message doesn't change.
	// Therefore we store a copy of the first 32 bytes of the message block
	// (q8-q9) in an aligned buffer on the stack so that they can be
	// reloaded when needed.  (We could just reload directly from the
	// message buffer, but it's faster to use aligned loads.)
	vld1.8		{q8-q9}, [DATA]!
	  veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
	vld1.8		{q10-q11}, [DATA]!
	  veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
	vld1.8		{q12-q13}, [DATA]!
	vst1.8		{q8-q9}, [sp, :256]
	  mov		ip, CTX
	vld1.8		{q14-q15}, [DATA]!

	// Execute the rounds.  Each round is provided the order in which it
	// needs to use the message words.
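	// (BLAKE2b runs 12 rounds but has only 10 sigma permutations, so the
	// last two rounds reuse the word orders of rounds 0 and 1, i.e. the
	// schedule is blake2b_sigma[round % 10].)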
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
			final=1

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	  vld1.64	{q8-q9}, [ip]!		// Load old h[0..3]
	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
	  vld1.64	{q10-q11}, [ip]		// Load old h[4..7]
	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
	  mov		ip, CTX
	  subs		NBLOCKS, NBLOCKS, #1	// nblocks--
	  vst1.64	{q0-q1}, [ip]!		// Store new h[0..3]
	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
	  vst1.64	{q2-q3}, [ip]!		// Store new h[4..7]

	// Advance to the next block, if there is one.
	bne		.Lnext_block		// nblocks != 0?

	mov		sp, ORIG_SP
	pop		{r4-r10}
	mov		pc, lr

.Lslow_inc_ctr:
	// Handle the case where the counter overflowed its low 32 bits, by
	// carrying the overflow bit into the full 128-bit counter.
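	//
	// Together with the 'adds' on the fast path, the net effect is a
	// plain 128-bit add, roughly (illustrative C):
	//
	//	t[0] += inc;
	//	if (t[0] < inc)		// 64-bit wraparound
	//		t[1]++;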
	vmov		r9, r10, d29
	adcs		r8, r8, #0
	adcs		r9, r9, #0
	adc		r10, r10, #0
	vmov		d28, r7, r8
	vmov		d29, r9, r10
	vst1.64		{q14}, [ip]		// Update t[0] and t[1]
	b		.Linc_ctr_done
ENDPROC(blake2b_compress_neon)
