xref: /linux/lib/crypto/arm64/sha256-ce.S (revision c4dde411bc366f568dbe33366253bbfea049e8ea)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Core SHA-224/SHA-256 transform using v8 Crypto Extensions
4 *
5 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
6 */
7
8#include <linux/linkage.h>
9#include <asm/assembler.h>
10
11	.text
12	.arch		armv8-a+crypto
13
/*
 * Register aliases for sha256_ce_transform.
 *
 * dgav/dgbv hold the two halves of the running hash state: they are loaded
 * from the state buffer, have the result of each block added to them, and
 * are stored back at the end.  dg0/dg1 are the working copies that the
 * sha256h/sha256h2 rounds update for the current block, and dg2 keeps the
 * pre-round value of dg0 that sha256h2 takes as an input.  t0 and t1
 * alternate as the "message schedule words + round constants" operand.
 *
 * Both a q-form and a v-form name exist for the state registers because the
 * SHA instructions take q operands while the SIMD instructions take v
 * operands with an arrangement suffix.  The q-form names dga and dgb are not
 * referenced by the code in this section.
 */
14	dga		.req	q20
15	dgav		.req	v20
16	dgb		.req	q21
17	dgbv		.req	v21
18
19	t0		.req	v22
20	t1		.req	v23
21
22	dg0q		.req	q24
23	dg0v		.req	v24
24	dg1q		.req	q25
25	dg1v		.req	v25
26	dg2q		.req	q26
27	dg2v		.req	v26
28
/*
 * add_only ev, rc, s0
 *
 * Run one sha256h/sha256h2 pair on the working state (dg0/dg1) and, in the
 * same step, prepare the "schedule words + round constants" operand for the
 * following invocation.
 *
 *   ev  0:        consume t0, write the next operand to t1
 *       non-zero: consume t1, write the next operand to t0
 *   rc  vector register holding the round constants for the following
 *       invocation
 *   s0  number of the v register holding the message schedule words for the
 *       following invocation.  When ev is non-zero it may be left blank, in
 *       which case no operand is prepared (used for the last invocation of a
 *       block).  When ev is 0 it must be given.
 *
 * Clobbers dg2, which receives a copy of dg0 before sha256h overwrites it.
 */
29	.macro		add_only, ev, rc, s0
30	mov		dg2v.16b, dg0v.16b
31	.ifeq		\ev
32	add		t1.4s, v\s0\().4s, \rc\().4s
33	sha256h		dg0q, dg1q, t0.4s
34	sha256h2	dg1q, dg2q, t0.4s
35	.else
36	.ifnb		\s0
37	add		t0.4s, v\s0\().4s, \rc\().4s
38	.endif
39	sha256h		dg0q, dg1q, t1.4s
40	sha256h2	dg1q, dg2q, t1.4s
41	.endif
42	.endm
43
/*
 * add_update ev, rc, s0, s1, s2, s3
 *
 * Same as add_only, but additionally advances the message schedule:
 * sha256su0 and sha256su1 rewrite v<s0> in place using v<s1>, v<s2> and
 * v<s3>.  Note that add_only is passed s1 rather than s0, so the operand
 * prepared for the following invocation is built from v<s1>.
 */
44	.macro		add_update, ev, rc, s0, s1, s2, s3
45	sha256su0	v\s0\().4s, v\s1\().4s
46	add_only	\ev, \rc, \s1
47	sha256su1	v\s0\().4s, v\s2\().4s, v\s3\().4s
48	.endm
49
50	/*
51	 * The SHA-256 round constants
	 *
	 * 64 32-bit words, laid out four per line so that each line corresponds
	 * to one 128-bit vector register when the table is loaded into v0-v15
	 * by load_round_constants.  The table lives in a read-only, allocatable
	 * section.
52	 */
53	.section	".rodata", "a"
	// .align takes a power-of-two exponent on AArch64: 16-byte alignment.
54	.align		4
55.Lsha2_rcon:
56	.word		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
57	.word		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
58	.word		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
59	.word		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
60	.word		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
61	.word		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
62	.word		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
63	.word		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
64	.word		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
65	.word		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
66	.word		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
67	.word		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
68	.word		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
69	.word		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
70	.word		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
71	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
72
/*
 * load_round_constants tmp
 *
 * Load all 64 round constants from .Lsha2_rcon into v0-v15, four words per
 * register.  tmp is a general-purpose register that is used as the table
 * pointer; it is clobbered (left pointing at the last 64 bytes of the table).
 *
 * NOTE(review): v8-v15 are overwritten here and are never saved or restored
 * anywhere in this file.  Presumably the callers bracket these routines with
 * the kernel-mode SIMD save/restore - confirm against the C glue code.
 */
73	.macro load_round_constants	tmp
74	adr_l		\tmp, .Lsha2_rcon
75	ld1		{ v0.4s- v3.4s}, [\tmp], #64
76	ld1		{ v4.4s- v7.4s}, [\tmp], #64
77	ld1		{ v8.4s-v11.4s}, [\tmp], #64
78	ld1		{v12.4s-v15.4s}, [\tmp]
79	.endm
80
81	/*
82	 * void sha256_ce_transform(struct sha256_block_state *state,
83	 *			    const u8 *data, size_t nblocks);
	 *
	 * x0 = state, x1 = data, x2 = nblocks.  Processes nblocks consecutive
	 * 64-byte blocks from data and updates the eight 32-bit state words at
	 * x0 in place.
	 *
	 * The block counter is only tested after a block has been processed,
	 * so nblocks must be at least 1; a value of 0 is not handled.
	 *
	 * Clobbers x1, x2, x8 and v0-v26.
84	 */
85	.text
86SYM_FUNC_START(sha256_ce_transform)
87
	/* v0-v15 = round constants, kept for the whole call */
88	load_round_constants	x8
89
90	/* load state */
91	ld1		{dgav.4s, dgbv.4s}, [x0]
92
93	/* load input */
940:	ld1		{v16.4s-v19.4s}, [x1], #64
	/* x2 = number of blocks still to do after this one */
95	sub		x2, x2, #1
96
	/* byte-swap every 32-bit word: the input words are big endian */
97	rev32		v16.16b, v16.16b
98	rev32		v17.16b, v17.16b
99	rev32		v18.16b, v18.16b
100	rev32		v19.16b, v19.16b
101
	/*
	 * Prime t0 with the first four schedule words plus the first four
	 * round constants, and start the working state (dg0/dg1) from the
	 * current hash state.
	 */
102	add		t0.4s, v16.4s, v0.4s
103	mov		dg0v.16b, dgav.16b
104	mov		dg1v.16b, dgbv.16b
105
	/*
	 * Rounds 0-47, four rounds per line.  Each line also prepares the
	 * operand for the next line (hence the constants start at v1) and
	 * extends the message schedule, cycling through v16-v19.
	 */
106	add_update	0,  v1, 16, 17, 18, 19
107	add_update	1,  v2, 17, 18, 19, 16
108	add_update	0,  v3, 18, 19, 16, 17
109	add_update	1,  v4, 19, 16, 17, 18
110
111	add_update	0,  v5, 16, 17, 18, 19
112	add_update	1,  v6, 17, 18, 19, 16
113	add_update	0,  v7, 18, 19, 16, 17
114	add_update	1,  v8, 19, 16, 17, 18
115
116	add_update	0,  v9, 16, 17, 18, 19
117	add_update	1, v10, 17, 18, 19, 16
118	add_update	0, v11, 18, 19, 16, 17
119	add_update	1, v12, 19, 16, 17, 18
120
	/*
	 * Rounds 48-63: the schedule is complete, so only the rounds are
	 * run.  The last line has no following operand to prepare.
	 */
121	add_only	0, v13, 17
122	add_only	1, v14, 18
123	add_only	0, v15, 19
124	add_only	1
125
126	/* update state */
127	add		dgav.4s, dgav.4s, dg0v.4s
128	add		dgbv.4s, dgbv.4s, dg1v.4s
129
130	/* handled all input blocks? */
131	cbnz		x2, 0b
132
133	/* store new state */
134	st1		{dgav.4s, dgbv.4s}, [x0]
135	ret
136SYM_FUNC_END(sha256_ce_transform)
137
/*
 * Release every alias that was defined for sha256_ce_transform.  Some of the
 * underlying registers (v24-v26) are given different names for
 * sha256_ce_finup2x further down in this file.
 */
138	.unreq dga
139	.unreq dgav
140	.unreq dgb
141	.unreq dgbv
142	.unreq t0
143	.unreq t1
144	.unreq dg0q
145	.unreq dg0v
146	.unreq dg1q
147	.unreq dg1v
148	.unreq dg2q
149	.unreq dg2v
150
151	// parameters for sha256_ce_finup2x()
	// len uses the 32-bit register view because the C parameter is an int;
	// it is sign-extended wherever it is combined with a 64-bit value.
152	ctx		.req	x0
153	data1		.req	x1
154	data2		.req	x2
155	len		.req	w3
156	out1		.req	x4
157	out2		.req	x5
158
159	// other scalar variables
	// count starts as ctx->bytecount + len, i.e. the total message length
	// in bytes, and is converted to a bit count when the length field of
	// the padding is written.
	// final_step records how far the padding has progressed (0, 1 or 2);
	// its values are described at the loop exit in sha256_ce_finup2x.
160	count		.req	x6
161	final_step	.req	w7
162
163	// x8-x9 are used as temporaries.
164
165	// v0-v15 are used to cache the SHA-256 round constants.
166	// v16-v19 are used for the message schedule for the first message.
167	// v20-v23 are used for the message schedule for the second message.
168	// v24-v31 are used for the state and temporaries as given below.
169	// *_a are for the first message and *_b for the second.
	// The *_q names are the q-form of the same registers, as required by
	// the operands of sha256h and sha256h2.
170	state0_a_q	.req	q24
171	state0_a	.req	v24
172	state1_a_q	.req	q25
173	state1_a	.req	v25
174	state0_b_q	.req	q26
175	state0_b	.req	v26
176	state1_b_q	.req	q27
177	state1_b	.req	v27
178	t0_a		.req	v28
179	t0_b		.req	v29
180	t1_a_q		.req	q30
181	t1_a		.req	v30
182	t1_b_q		.req	q31
183	t1_b		.req	v31
184
185#define OFFSETOF_BYTECOUNT	32 // offsetof(struct __sha256_ctx, bytecount)
186#define OFFSETOF_BUF		40 // offsetof(struct __sha256_ctx, buf)
187// offsetof(struct __sha256_ctx, state) is assumed to be 0.
// The layout this implies is the one sha256_ce_finup2x relies on: 32 bytes of
// state at offset 0 (read as two 4-word vectors), a 64-bit byte count at
// offset 32, and a 64-byte block buffer starting at offset 40.
// NOTE(review): the offsets are hard-coded and nothing in this file checks
// them against the C definition - confirm that the C side asserts the layout.
188
189	// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a
190	// and m0_b contain the current 4 message schedule words for the first
191	// and second message respectively.
192	//
193	// If not all the message schedule words have been computed yet, then
194	// this also computes 4 more message schedule words for each message.
195	// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
196	// the first message, and likewise m1_b-m3_b for the second.  After
197	// consuming the current value of m0_a, this macro computes the group
198	// after m3_a and writes it to m0_a, and likewise for *_b.  This means
199	// that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
200	// m3_a, m0_a), and likewise for *_b, so the caller must cycle through
201	// the registers accordingly.
	//
	// i is the index of the first of the 4 rounds and is only used to
	// decide whether more message schedule words are needed (i < 48).  k
	// is the vector register holding the round constants for these 4
	// rounds; the same constants are used for both messages.
	// Clobbers t0_a, t0_b, t1_a and t1_b.
202	.macro	do_4rounds_2x	i, k,  m0_a, m1_a, m2_a, m3_a,  \
203				       m0_b, m1_b, m2_b, m3_b
	// t0 = current message schedule words + round constants
204	add		t0_a\().4s, \m0_a\().4s, \k\().4s
205	add		t0_b\().4s, \m0_b\().4s, \k\().4s
	// The words in m0 have been consumed into t0, so m0 can now be
	// overwritten with the schedule words needed 16 rounds later.  The
	// last 16 rounds (i >= 48) have nothing left to compute.
206	.if \i < 48
207	sha256su0	\m0_a\().4s, \m1_a\().4s
208	sha256su0	\m0_b\().4s, \m1_b\().4s
209	sha256su1	\m0_a\().4s, \m2_a\().4s, \m3_a\().4s
210	sha256su1	\m0_b\().4s, \m2_b\().4s, \m3_b\().4s
211	.endif
	// sha256h overwrites state0, but sha256h2 still needs the old value,
	// so keep a copy of it in t1.
212	mov		t1_a.16b, state0_a.16b
213	mov		t1_b.16b, state0_b.16b
214	sha256h		state0_a_q, state1_a_q, t0_a\().4s
215	sha256h		state0_b_q, state1_b_q, t0_b\().4s
216	sha256h2	state1_a_q, t1_a_q, t0_a\().4s
217	sha256h2	state1_b_q, t1_b_q, t0_b\().4s
218	.endm
219
// Do 16 rounds of SHA-256 for each of the two messages, as four
// do_4rounds_2x groups.  i is the index of the first round and k0-k3 are the
// vector registers holding the round constants for the four groups.  The
// message schedule registers (v16-v19 for the first message, v20-v23 for the
// second) are rotated by one position per group, as do_4rounds_2x requires,
// so after the four groups they are back in their starting order.
220	.macro	do_16rounds_2x	i, k0, k1, k2, k3
221	do_4rounds_2x	\i + 0,  \k0,  v16, v17, v18, v19,  v20, v21, v22, v23
222	do_4rounds_2x	\i + 4,  \k1,  v17, v18, v19, v16,  v21, v22, v23, v20
223	do_4rounds_2x	\i + 8,  \k2,  v18, v19, v16, v17,  v22, v23, v20, v21
224	do_4rounds_2x	\i + 12, \k3,  v19, v16, v17, v18,  v23, v20, v21, v22
225	.endm
226
227//
228// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
229//			  const u8 *data1, const u8 *data2, int len,
230//			  u8 out1[SHA256_DIGEST_SIZE],
231//			  u8 out2[SHA256_DIGEST_SIZE]);
232//
233// This function computes the SHA-256 digests of two messages |data1| and
234// |data2| that are both |len| bytes long, starting from the initial context
235// |ctx|.  |len| must be at least SHA256_BLOCK_SIZE.
236//
237// The instructions for the two SHA-256 operations are interleaved.  On many
238// CPUs, this is almost twice as fast as hashing each message individually due
239// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
240//
// Register and stack usage: x0-x5 carry the arguments, x6-x9 are scratch, and
// all of v0-v31 are overwritten.  |ctx| is only read.  128 bytes of stack are
// reserved: the first 64 are used to assemble partial blocks and to save the
// state across the rounds, the last 64 hold padding bytes.
//
// Loop invariant: at .Lfinup2x_loop_have_data, len is the number of data
// bytes that remain after the block currently held in v16-v23.  After each
// block 64 is subtracted from it, and the main loop continues for as long as
// the result is non-negative (bit 31 clear).
//
241SYM_FUNC_START(sha256_ce_finup2x)
242	sub		sp, sp, #128
243	mov		final_step, #0
244	load_round_constants	x8
245
246	// Load the initial state from ctx->state.
247	ld1		{state0_a.4s-state1_a.4s}, [ctx]
248
249	// Load ctx->bytecount.  Take the mod 64 of it to get the number of
250	// bytes that are buffered in ctx->buf.  Also save it in a register with
251	// len added to it.
252	ldr		x8, [ctx, #OFFSETOF_BYTECOUNT]
253	add		count, x8, len, sxtw
254	and		x8, x8, #63
255	cbz		x8, .Lfinup2x_enter_loop	// No bytes buffered?
256
257	// x8 bytes (1 to 63) are currently buffered in ctx->buf.  Load them
258	// followed by the first 64 - x8 bytes of data.  Since len >= 64, we
259	// just load 64 bytes from each of ctx->buf, data1, and data2
260	// unconditionally and rearrange the data as needed.
	//
	// sp[0..63] receives ctx->buf; the data is then stored at sp[x8], which
	// keeps the x8 buffered bytes in front of it.  That store reaches at
	// most sp[126], so it stays inside the 128-byte scratch area.  The
	// first 64 bytes of the result are the block to hash.
261	add		x9, ctx, #OFFSETOF_BUF
262	ld1		{v16.16b-v19.16b}, [x9]
263	st1		{v16.16b-v19.16b}, [sp]
264
265	ld1		{v16.16b-v19.16b}, [data1], #64
266	add		x9, sp, x8
267	st1		{v16.16b-v19.16b}, [x9]
268	ld1		{v16.4s-v19.4s}, [sp]
269
	// The second message shares the buffered prefix, which is still intact
	// at sp[0..x8-1]; only the data part is replaced.
270	ld1		{v20.16b-v23.16b}, [data2], #64
271	st1		{v20.16b-v23.16b}, [x9]
272	ld1		{v20.4s-v23.4s}, [sp]
273
	// This block consumed only 64 - x8 data bytes: move the data pointers
	// back by x8 and set len to the bytes that remain after this block,
	// i.e. len - 64 + x8.
274	sub		len, len, #64
275	sub		data1, data1, x8
276	sub		data2, data2, x8
277	add		len, len, w8
	// Both messages start from the same state.
278	mov		state0_b.16b, state0_a.16b
279	mov		state1_b.16b, state1_a.16b
280	b		.Lfinup2x_loop_have_data
281
282.Lfinup2x_enter_loop:
	// Nothing buffered: len = bytes that remain after the first block.
283	sub		len, len, #64
284	mov		state0_b.16b, state0_a.16b
285	mov		state1_b.16b, state1_a.16b
286.Lfinup2x_loop:
287	// Load the next two data blocks.
288	ld1		{v16.4s-v19.4s}, [data1], #64
289	ld1		{v20.4s-v23.4s}, [data2], #64
290.Lfinup2x_loop_have_data:
291	// Convert the words of the data blocks from big endian.
292	rev32		v16.16b, v16.16b
293	rev32		v17.16b, v17.16b
294	rev32		v18.16b, v18.16b
295	rev32		v19.16b, v19.16b
296	rev32		v20.16b, v20.16b
297	rev32		v21.16b, v21.16b
298	rev32		v22.16b, v22.16b
299	rev32		v23.16b, v23.16b
300.Lfinup2x_loop_have_bswapped_data:
301
302	// Save the original state for each block.
	// This reuses sp[0..63]; whatever was assembled there is no longer
	// needed because the blocks are already in v16-v23.
303	st1		{state0_a.4s-state1_b.4s}, [sp]
304
305	// Do the SHA-256 rounds on each block.
306	do_16rounds_2x	0,  v0, v1, v2, v3
307	do_16rounds_2x	16, v4, v5, v6, v7
308	do_16rounds_2x	32, v8, v9, v10, v11
309	do_16rounds_2x	48, v12, v13, v14, v15
310
311	// Add the original state for each block.
	// The message schedule registers v16-v19 are free again at this point
	// and are used to reload the saved state.
312	ld1		{v16.4s-v19.4s}, [sp]
313	add		state0_a.4s, state0_a.4s, v16.4s
314	add		state1_a.4s, state1_a.4s, v17.4s
315	add		state0_b.4s, state0_b.4s, v18.4s
316	add		state1_b.4s, state1_b.4s, v19.4s
317
318	// Update len and loop back if more blocks remain.
319	sub		len, len, #64
320	tbz		len, #31, .Lfinup2x_loop	// len >= 0?
321
322	// Check if any final blocks need to be handled.
323	// final_step = 2: all done
324	// final_step = 1: need to do count-only padding block
325	// final_step = 0: need to do the block with 0x80 padding byte
326	tbnz		final_step, #1, .Lfinup2x_done
327	tbnz		final_step, #0, .Lfinup2x_finalize_countonly
	// Undo the last subtraction: len = number of leftover bytes (0 to 63).
328	add		len, len, #64
329	cbz		len, .Lfinup2x_finalize_blockaligned
330
331	// Not block-aligned; 1 <= len <= 63 data bytes remain.  Pad the block.
332	// To do this, write the padding starting with the 0x80 byte to
333	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
334	// and load from &sp[64 - len] to get the needed padding block.  This
335	// code relies on the data buffers being >= 64 bytes in length.
336	sub		w8, len, #64		// w8 = len - 64
337	add		data1, data1, w8, sxtw	// data1 += len - 64
338	add		data2, data2, w8, sxtw	// data2 += len - 64
	// q16 = {0x80, 0, 0, ...}: writing d16 also clears the upper half of
	// v16.  Together with the zeroed v17 this fills sp[64..127] with the
	// 0x80 byte followed by 63 zero bytes.
339	mov		x9, #0x80
340	fmov		d16, x9
341	movi		v17.16b, #0
342	stp		q16, q17, [sp, #64]
343	stp		q17, q17, [sp, #96]
344	sub		x9, sp, w8, sxtw	// x9 = &sp[64 - len]
345	cmp		len, #56
346	b.ge		1f		// will count spill into its own block?
	// The length fits into this block: store the message length in bits,
	// big endian, in the last 8 bytes of the block that starts at x9.
	// That location lies in the padding area above sp[64], so the data
	// copies below do not touch it.  count is not used again on this path.
347	lsl		count, count, #3
348	rev		count, count
349	str		count, [x9, #56]
350	mov		final_step, #2	// won't need count-only block
351	b		2f
3521:
353	mov		final_step, #1	// will need count-only block
3542:
	// For each message: copy its last 64 data bytes to sp[0..63], then
	// read the 64-byte block that starts len bytes before the 0x80 byte.
355	ld1		{v16.16b-v19.16b}, [data1]
356	st1		{v16.16b-v19.16b}, [sp]
357	ld1		{v16.4s-v19.4s}, [x9]
358	ld1		{v20.16b-v23.16b}, [data2]
359	st1		{v20.16b-v23.16b}, [sp]
360	ld1		{v20.4s-v23.4s}, [x9]
361	b		.Lfinup2x_loop_have_data
362
363	// Prepare a padding block, either:
364	//
365	//	{0x80, 0, 0, 0, ..., count (as __be64)}
366	//	This is for a block aligned message.
367	//
368	//	{   0, 0, 0, 0, ..., count (as __be64)}
369	//	This is for a message whose length mod 64 is >= 56.
370	//
371	// Pre-swap the endianness of the words.
	// The block is identical for both messages, so it is built once in
	// v16-v19 and copied to v20-v23.
372.Lfinup2x_finalize_countonly:
373	movi		v16.2d, #0
374	b		1f
375.Lfinup2x_finalize_blockaligned:
	// Word 0 = 0x80000000, i.e. the 0x80 byte in its byte-swapped
	// position.  Writing d16 also clears the rest of v16.
376	mov		x8, #0x80000000
377	fmov		d16, x8
3781:
379	movi		v17.2d, #0
380	movi		v18.2d, #0
	// The bit count occupies the last two words of the block, high word
	// first.  As the words are already in native order here, only the two
	// 32-bit halves of the 64-bit value have to be exchanged.
381	ror		count, count, #29	// ror(lsl(count, 3), 32)
382	mov		v19.d[0], xzr
383	mov		v19.d[1], count
384	mov		v20.16b, v16.16b
385	movi		v21.2d, #0
386	movi		v22.2d, #0
387	mov		v23.16b, v19.16b
388	mov		final_step, #2
389	b		.Lfinup2x_loop_have_bswapped_data
390
391.Lfinup2x_done:
392	// Write the two digests with all bytes in the correct order.
393	rev32		state0_a.16b, state0_a.16b
394	rev32		state1_a.16b, state1_a.16b
395	rev32		state0_b.16b, state0_b.16b
396	rev32		state1_b.16b, state1_b.16b
397	st1		{state0_a.4s-state1_a.4s}, [out1]
398	st1		{state0_b.4s-state1_b.4s}, [out2]
	// Release the scratch buffer.
399	add		sp, sp, #128
400	ret
401SYM_FUNC_END(sha256_ce_finup2x)
402
402