// xref: /freebsd/lib/libmd/aarch64/sha1block.S (revision f6210541f9e3c6cfda321e0ad98f277fb98a625b)
/*-
 * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * sha1block_sha1 implementation based on sha1-arm.c,
 * written and placed in public domain by Jeffrey Walton
 * based on code from ARM, and by Johannes Schneiders, Skip
 * Hovsmith and Barry O'Rourke for the mbedTLS project.
 */

12#include <machine/asm.h>
13
/*
 * Scalar SHA1 implementation.
 *
 * Due to the ample register file available on AArch64, the w array is
 * kept entirely in registers.  The saved a-e variables are instead kept
 * in memory as we don't have that many registers.
 */

/*
 * _libmd_sha1block_scalar -- scalar SHA1 block function,
 * called as sha1block(SHA1_CTX, buf, len).
 *
 * ABI:   AAPCS64
 * In:    x0 = ctx	context; the five 32 bit state words a-e are
 *			read from and written back to ctx+0 .. ctx+19
 *        x1 = buf	input data
 *        x2 = len	input length in bytes; rounded down to a
 *			multiple of the 64 byte block length
 * Out:   state words at ctx updated.  If len < 64 the function
 *        returns without touching ctx.
 * Uses:  x1, x2, w3-w17 and the condition flags as scratch;
 *        x19-x27 are saved on the stack and restored before return.
 *
 * The register aliases below are referenced by the macros and by the
 * function body that follow.
 */
ENTRY(_libmd_sha1block_scalar)
ctx	.req	x0		// SHA1 context
buf	.req	x1		// current input position
len	.req	x2		// bytes left to process
a	.req	w3		// SHA1 working variables a-e
b	.req	w4
c	.req	w5
d	.req	w6
e	.req	w7
k	.req	w8		// current round constant
f	.req	w9		// round function value, set by func1-func4
tmp	.req	w10		// scratch, clobbered by shuffle/func1/func3/mix
w_0	.req	w11		// w_0 .. w_15: sliding 16 word window of w[]
w_1	.req	w12
w_2	.req	w13
w_3	.req	w14
w_4	.req	w15
w_5	.req	w16
w_6	.req	w17
// w18 is the platform register
w_7	.req	w19		// w19-w27 are callee-saved, see function body
w_8	.req	w20
w_9	.req	w21
w_10	.req	w22
w_11	.req	w23
w_12	.req	w24
w_13	.req	w25
w_14	.req	w26
w_15	.req	w27

/*
 * shuffle w_i, w_i3, w_i8, w_i14
 *
 * Message schedule step.  On entry \w_i holds w[i-16] and \w_i3, \w_i8,
 * \w_i14 hold w[i-3], w[i-8], w[i-14].  On exit \w_i holds
 *
 *	w[i] = (w[i-16] ^ w[i-14] ^ w[i-8] ^ w[i-3]) rol 1
 *
 * Clobbers tmp.
 */
.macro	shuffle	w_i, w_i3, w_i8, w_i14
	eor	\w_i, \w_i, \w_i3
	eor	tmp, \w_i8, \w_i14
	eor	\w_i, \w_i, tmp		// w[i-16] ^ w[i-14] ^ w[i-8] ^ w[i-3]
	ror	\w_i, \w_i, #31		// w[i] = ... ror #31
.endm

/*
 * func1 a, b, c, d, e
 *
 * Round function 1: f = (b & c) | (~b & d).
 * Only \b, \c and \d are read; \a and \e are accepted so that all
 * funcN macros can be invoked with the same argument list by the
 * round macro.  Result in f, clobbers tmp.
 */
.macro	func1	a, b, c, d, e
	and	f, \c, \b		// b & c
	bic	tmp, \d, \b		// d & ~b
	orr	f, f, tmp
.endm

/*
 * func2 a, b, c, d, e
 *
 * Round function 2: f = b ^ c ^ d.
 * Only \b, \c and \d are read.  Result in f, tmp is left untouched.
 */
.macro	func2	a, b, c, d, e
	eor	f, \b, \c
	eor	f, f, \d
.endm

/*
 * func3 a, b, c, d, e
 *
 * Round function 3: f = (b & c) | ((b ^ c) & d), i.e. the bitwise
 * majority of b, c and d.
 * Only \b, \c and \d are read.  Result in f, clobbers tmp.
 */
.macro	func3	a, b, c, d, e
	eor	tmp, \b, \c
	and	f, \b, \c
	and	tmp, tmp, \d		// (b ^ c) & d
	orr	f, f, tmp
.endm

/*
 * func4 a, b, c, d, e
 *
 * Round function 4; identical to func2 (f = b ^ c ^ d).
 */
.macro	func4	a, b, c, d, e
	func2	\a, \b, \c, \d, \e
.endm

/*
 * mix a, b, c, d, e, w_i
 *
 * Common tail of every round:
 *
 *	\e += (\a rol 5) + f + k + \w_i
 *	\b  = \b rol 30
 *
 * f must already hold the round function value for this round (the
 * funcN macros read \b, so they have to run before \b is rotated here).
 * \c and \d are not accessed.  Clobbers tmp.
 */
.macro	mix	a, b, c, d, e, w_i
	ror	\b, \b, #2		// b rol 30
	ror	tmp, \a, #27		// a rol 5
	add	\e, \e, \w_i
	add	tmp, tmp, k
	add	\e, \e, f
	add	\e, \e, tmp		// (a ror 27) + e + f + k + w[i]
.endm

/*
 * round1 a, b, c, d, e, w_i
 *
 * One round using round function 1 on a message word that was just
 * loaded from the input buffer: \w_i is byte swapped in place (rev)
 * before it is mixed in, no message schedule step is performed.
 */
.macro	round1	a, b, c, d, e, w_i
	func1	\a, \b, \c, \d, \e
	rev	\w_i, \w_i
	mix	\a, \b, \c, \d, \e, \w_i
.endm

/*
 * round func, a, b, c, d, e, w_i, w_i3, w_i8, w_i14
 *
 * Generic round with message schedule: compute w[i] into \w_i with
 * shuffle, evaluate the round function \func (one of func1-func4)
 * into f, then mix.  See shuffle for the meaning of the w_* operands.
 */
.macro	round	func, a, b, c, d, e, w_i, w_i3, w_i8, w_i14
	shuffle	\w_i, \w_i3, \w_i8, \w_i14
	\func	\a, \b, \c, \d, \e
	mix	\a, \b, \c, \d, \e, \w_i
.endm

/*
 * round1x: round with message schedule using round function 1
 * (in contrast to round1, which takes its word straight from the input).
 */
.macro	round1x	a, b, c, d, e, w_i, w_i3, w_i8, w_i14
	round	func1, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14
.endm

/* round2: round with message schedule using round function 2 */
.macro	round2	a, b, c, d, e, w_i, w_i3, w_i8, w_i14
	round	func2, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14
.endm

/* round3: round with message schedule using round function 3 */
.macro	round3	a, b, c, d, e, w_i, w_i3, w_i8, w_i14
	round	func3, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14
.endm

/* round4: round with message schedule using round function 4 */
.macro	round4	a, b, c, d, e, w_i, w_i3, w_i8, w_i14
	round	func4, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14
.endm

/*
 * Function body of _libmd_sha1block_scalar.
 *
 * Stack frame, 24 + 9*8 = 96 bytes (keeps sp 16 byte aligned):
 *	sp +  0 .. 19	SHA1 state a-e as it was when the current
 *			block was started
 *	sp + 20 .. 23	unused
 *	sp + 24 .. 95	saved x19-x27 (they hold w_7 .. w_15)
 *
 * The rounds are fully unrolled.  Instead of moving values between
 * a-e after each round, the roles of the five registers are rotated
 * by one position from one macro invocation to the next; after every
 * five rounds the assignment is back to a, b, c, d, e.  w_0 .. w_15
 * form a 16 word window of w[]: round i overwrites w_(i mod 16).
 */
	ands	len, len, #~63		// take length in multiples of block length
	beq	1f			// bail out if input empty

	sub	sp, sp, #24+9*8		// allocate stack space
	str	x19, [sp, #24+0*8]	// save the callee-saved registers we use
	stp	x20, x21, [sp, #24+1*8]
	stp	x22, x23, [sp, #24+3*8]
	stp	x24, x25, [sp, #24+5*8]
	stp	x26, x27, [sp, #24+7*8]

	ldp	a, b, [ctx, #0]		// load SHA1 state from context
	ldp	c, d, [ctx, #8]
	ldr	e, [ctx, #16]

	// main loop: one 64 byte block per iteration
0:	stp	a, b, [sp, #0]		// save old SHA1 state
	stp	c, d, [sp, #8]
	str	e, [sp, #16]

	movz	k, #0x7999		// round constant 1
	movk	k, #0x5a82, lsl #16

	// rounds 0--15: message words come straight from the input
	ldp	w_0, w_1, [buf, #0*4]
	round1	a, b, c, d, e, w_0
	round1	e, a, b, c, d, w_1

	ldp	w_2, w_3, [buf, #2*4]
	round1	d, e, a, b, c, w_2
	round1	c, d, e, a, b, w_3

	ldp	w_4, w_5, [buf, #4*4]
	round1	b, c, d, e, a, w_4
	round1	a, b, c, d, e, w_5

	ldp	w_6, w_7, [buf, #6*4]
	round1	e, a, b, c, d, w_6
	round1	d, e, a, b, c, w_7

	ldp	w_8, w_9, [buf, #8*4]
	round1	c, d, e, a, b, w_8
	round1	b, c, d, e, a, w_9

	ldp	w_10, w_11, [buf, #10*4]
	round1	a, b, c, d, e, w_10
	round1	e, a, b, c, d, w_11

	ldp	w_12, w_13, [buf, #12*4]
	round1	d, e, a, b, c, w_12
	round1	c, d, e, a, b, w_13

	ldp	w_14, w_15, [buf, #14*4]
	round1	b, c, d, e, a, w_14
	round1	a, b, c, d, e, w_15

	// rounds 16--19: still round function 1, but with message schedule
	round1x	e, a, b, c, d, w_0,  w_13,  w_8,  w_2
	round1x	d, e, a, b, c, w_1,  w_14,  w_9,  w_3
	round1x	c, d, e, a, b, w_2,  w_15, w_10,  w_4
	round1x	b, c, d, e, a, w_3,  w_0,  w_11,  w_5

	movz	k, #0xeba1		// round constant 2
	movk	k, #0x6ed9, lsl #16

	// rounds 20--39
	round2	a, b, c, d, e, w_4,  w_1,  w_12,  w_6
	round2	e, a, b, c, d, w_5,  w_2,  w_13,  w_7
	round2	d, e, a, b, c, w_6,  w_3,  w_14,  w_8
	round2	c, d, e, a, b, w_7,  w_4,  w_15,  w_9
	round2	b, c, d, e, a, w_8,  w_5,  w_0,   w_10

	round2	a, b, c, d, e, w_9,  w_6,  w_1,   w_11
	round2	e, a, b, c, d, w_10, w_7,  w_2,   w_12
	round2	d, e, a, b, c, w_11, w_8,  w_3,   w_13
	round2	c, d, e, a, b, w_12, w_9,  w_4,   w_14
	round2	b, c, d, e, a, w_13, w_10, w_5,   w_15

	round2	a, b, c, d, e, w_14, w_11, w_6,   w_0
	round2	e, a, b, c, d, w_15, w_12, w_7,   w_1
	round2	d, e, a, b, c, w_0,  w_13, w_8,   w_2
	round2	c, d, e, a, b, w_1,  w_14, w_9,   w_3
	round2	b, c, d, e, a, w_2,  w_15, w_10,  w_4

	round2	a, b, c, d, e, w_3,  w_0,  w_11,  w_5
	round2	e, a, b, c, d, w_4,  w_1,  w_12,  w_6
	round2	d, e, a, b, c, w_5,  w_2,  w_13,  w_7
	round2	c, d, e, a, b, w_6,  w_3,  w_14,  w_8
	round2	b, c, d, e, a, w_7,  w_4,  w_15,  w_9

	movz	k, #0xbcdc		// round constant 3
	movk	k, #0x8f1b, lsl #16

	// rounds 40--59
	round3	a, b, c, d, e, w_8,  w_5,  w_0,  w_10
	round3	e, a, b, c, d, w_9,  w_6,  w_1,  w_11
	round3	d, e, a, b, c, w_10, w_7,  w_2,  w_12
	round3	c, d, e, a, b, w_11, w_8,  w_3,  w_13
	round3	b, c, d, e, a, w_12, w_9,  w_4,  w_14

	round3	a, b, c, d, e, w_13, w_10, w_5,  w_15
	round3	e, a, b, c, d, w_14, w_11, w_6,  w_0
	round3	d, e, a, b, c, w_15, w_12, w_7,  w_1
	round3	c, d, e, a, b, w_0,  w_13, w_8,  w_2
	round3	b, c, d, e, a, w_1,  w_14, w_9,  w_3

	round3	a, b, c, d, e, w_2,  w_15, w_10, w_4
	round3	e, a, b, c, d, w_3,  w_0,  w_11, w_5
	round3	d, e, a, b, c, w_4,  w_1,  w_12, w_6
	round3	c, d, e, a, b, w_5,  w_2,  w_13, w_7
	round3	b, c, d, e, a, w_6,  w_3,  w_14, w_8

	round3	a, b, c, d, e, w_7,  w_4,  w_15, w_9
	round3	e, a, b, c, d, w_8,  w_5,  w_0,  w_10
	round3	d, e, a, b, c, w_9,  w_6,  w_1,  w_11
	round3	c, d, e, a, b, w_10, w_7,  w_2,  w_12
	round3	b, c, d, e, a, w_11, w_8,  w_3,  w_13

	movz	k, #0xc1d6		// round constant 4
	movk	k, #0xca62, lsl #16

	// rounds 60--79
	round4	a, b, c, d, e, w_12, w_9,  w_4,  w_14
	round4	e, a, b, c, d, w_13, w_10, w_5,  w_15
	round4	d, e, a, b, c, w_14, w_11, w_6,  w_0
	round4	c, d, e, a, b, w_15, w_12, w_7,  w_1
	round4	b, c, d, e, a, w_0,  w_13, w_8,  w_2

	round4	a, b, c, d, e, w_1,  w_14, w_9,  w_3
	round4	e, a, b, c, d, w_2,  w_15, w_10, w_4
	round4	d, e, a, b, c, w_3,  w_0,  w_11, w_5
	round4	c, d, e, a, b, w_4,  w_1,  w_12, w_6
	round4	b, c, d, e, a, w_5,  w_2,  w_13, w_7

	round4	a, b, c, d, e, w_6,  w_3,  w_14, w_8
	round4	e, a, b, c, d, w_7,  w_4,  w_15, w_9
	round4	d, e, a, b, c, w_8,  w_5,  w_0,  w_10
	round4	c, d, e, a, b, w_9,  w_6,  w_1,  w_11
	round4	b, c, d, e, a, w_10, w_7,  w_2,  w_12

	round4	a, b, c, d, e, w_11, w_8,  w_3,  w_13
	round4	e, a, b, c, d, w_12, w_9,  w_4,  w_14
	round4	d, e, a, b, c, w_13, w_10, w_5,  w_15
	round4	c, d, e, a, b, w_14, w_11, w_6,  w_0
	round4	b, c, d, e, a, w_15, w_12, w_7,  w_1

	// the w window is dead now, reuse w_0 .. w_4 as temporaries
	ldp	w_0, w_1, [sp, #0]	// reload saved SHA1 state
	ldp	w_2, w_3, [sp, #8]
	ldr	w_4, [sp, #16]

	add	a, a, w_0		// new state = old state + round output
	add	b, b, w_1
	add	c, c, w_2
	add	d, d, w_3
	add	e, e, w_4

	add	buf, buf, #64		// advance to the next block
	subs	len, len, #64
	bhi	0b			// loop while input remains

	stp	a, b, [ctx, #0]		// write updated SHA1 state
	stp	c, d, [ctx, #8]
	str	e, [ctx, #16]

	ldr	x19, [sp, #24+0*8]	// restore callee-saved registers
	ldp	x20, x21, [sp, #24+1*8]
	ldp	x22, x23, [sp, #24+3*8]
	ldp	x24, x25, [sp, #24+5*8]
	ldp	x26, x27, [sp, #24+7*8]
	add	sp, sp, #24+9*8		// release stack space

1:	ret
END(_libmd_sha1block_scalar)

/*
 * SHA1 implementation using the SHA1 instruction set extension.
 *
 * ABI:   AAPCS64
 * In:    x0 = ctx, x1 = buf, x2 = len; same meaning as for
 *        sha1block_scalar, len is rounded down to a multiple of 64
 * Out:   state words at ctx+0 .. ctx+19 updated.  If len < 64 the
 *        function returns without touching ctx.
 * Uses:  x1-x3, v0-v6, v16-v23 and the condition flags; the stack is
 *        not used.
 */

	.arch_extension sha2		// enabled ahead of the sha1* instructions below

	// sha1block(SHA1_CTX, buf, len)
ENTRY(_libmd_sha1block_sha1)
	/* ctx, buf, len: same as for sha1block_scalar */
kaddr	.req	x3			// address of the k1234 table
abcd	.req	v0			// state words a-d, one per 32 bit lane
abcd_q	.req	q0			// alias for use with scalar instructions
abcd_s	.req	s0			// lowest lane of abcd, operand of sha1h
e0	.req	s1			// e0/e1: state word e, the two registers
e0_v	.req	v1			//   are used alternately (v1 = e0 as vector)
e1	.req	s2
abcd_saved .req	v3			// state at the start of the current block
e0_saved .req	v4
tmp0	.req	v5			// tmp0/tmp1: message words + round constant,
tmp1	.req	v6			//   prepared ahead of the rounds using them
msg0	.req	v16			// msg0 .. msg3: 16 word window of the
msg1	.req	v17			//   message schedule, four words each
msg2	.req	v18
msg3	.req	v19
k0	.req	v20			// k0 .. k3: the four round constants,
k1	.req	v21			//   each replicated into all four lanes
k2	.req	v22
k3	.req	v23

/*
 * Function body of _libmd_sha1block_sha1.
 *
 * Each group below performs four rounds.  The groups are software
 * pipelined:
 *  - e0 and e1 alternate: sha1h derives the e input of the next group
 *    from abcd before the sha1c/sha1p/sha1m of the current group
 *    overwrites abcd.
 *  - tmp0 and tmp1 alternate: each group consumes the sum (message
 *    words + round constant) that was prepared two groups earlier and
 *    prepares the sum for the group after next.
 *  - sha1su0/sha1su1 extend the message schedule in msg0 .. msg3 ahead
 *    of their use.
 */
	ands	len, len, #~63		// take length in multiples of block length
	beq	1f			// bail out if input empty

	ldr	abcd_q, [ctx, #0]	// load SHA1 state from context
	ldr	e0, [ctx, #16]

	adrp	kaddr, k1234
	add	kaddr, kaddr, #:lo12:k1234
	ld4r	{k0.4s, k1.4s, k2.4s, k3.4s}, [kaddr]	// broadcast the round constants

	// main loop: one 64 byte block per iteration
0:	mov	abcd_saved.16b, abcd.16b	// save old SHA1 state
	mov	e0_saved.16b, e0_v.16b

	ld1	{msg0.4s, msg1.4s, msg2.4s, msg3.4s}, [buf], #64	// load block, advance buf
	rev32	msg0.16b, msg0.16b	// byte swap each 32 bit message word
	rev32	msg1.16b, msg1.16b
	rev32	msg2.16b, msg2.16b
	rev32	msg3.16b, msg3.16b

	add	tmp0.4s, msg0.4s, k0.4s	// prepare w + k for rounds 0--3
	add	tmp1.4s, msg1.4s, k0.4s	// ... and for rounds 4--7

	/* rounds 0--3 */
	sha1h	e1, abcd_s
	sha1c	abcd_q, e0, tmp0.4s
	add	tmp0.4s, msg2.4s, k0.4s
	sha1su0	msg0.4s, msg1.4s, msg2.4s

	/* rounds 4--7 */
	sha1h	e0, abcd_s
	sha1c	abcd_q, e1, tmp1.4s
	add	tmp1.4s, msg3.4s, k0.4s
	sha1su1	msg0.4s, msg3.4s
	sha1su0	msg1.4s, msg2.4s, msg3.4s

	/* rounds 8--11 */
	sha1h	e1, abcd_s
	sha1c	abcd_q, e0, tmp0.4s
	add	tmp0.4s, msg0.4s, k0.4s
	sha1su1	msg1.4s, msg0.4s
	sha1su0	msg2.4s, msg3.4s, msg0.4s

	/* rounds 12--15 */
	sha1h	e0, abcd_s
	sha1c	abcd_q, e1, tmp1.4s
	add	tmp1.4s, msg1.4s, k1.4s
	sha1su1	msg2.4s, msg1.4s
	sha1su0	msg3.4s, msg0.4s, msg1.4s

	/* rounds 16--19 */
	sha1h	e1, abcd_s
	sha1c	abcd_q, e0, tmp0.4s
	add	tmp0.4s, msg2.4s, k1.4s
	sha1su1	msg3.4s, msg2.4s
	sha1su0	msg0.4s, msg1.4s, msg2.4s

	/* rounds 20--23 */
	sha1h	e0, abcd_s
	sha1p	abcd_q, e1, tmp1.4s
	add	tmp1.4s, msg3.4s, k1.4s
	sha1su1	msg0.4s, msg3.4s
	sha1su0	msg1.4s, msg2.4s, msg3.4s

	/* rounds 24--27 */
	sha1h	e1, abcd_s
	sha1p	abcd_q, e0, tmp0.4s
	add	tmp0.4s, msg0.4s, k1.4s
	sha1su1	msg1.4s, msg0.4s
	sha1su0	msg2.4s, msg3.4s, msg0.4s

	/* rounds 28--31 */
	sha1h	e0, abcd_s
	sha1p	abcd_q, e1, tmp1.4s
	add	tmp1.4s, msg1.4s, k1.4s
	sha1su1	msg2.4s, msg1.4s
	sha1su0	msg3.4s, msg0.4s, msg1.4s

	/* rounds 32--35 */
	sha1h	e1, abcd_s
	sha1p	abcd_q, e0, tmp0.4s
	add	tmp0.4s, msg2.4s, k2.4s
	sha1su1	msg3.4s, msg2.4s
	sha1su0	msg0.4s, msg1.4s, msg2.4s

	/* rounds 36--39 */
	sha1h	e0, abcd_s
	sha1p	abcd_q, e1, tmp1.4s
	add	tmp1.4s, msg3.4s, k2.4s
	sha1su1	msg0.4s, msg3.4s
	sha1su0	msg1.4s, msg2.4s, msg3.4s

	/* rounds 40--43 */
	sha1h	e1, abcd_s
	sha1m	abcd_q, e0, tmp0.4s
	add	tmp0.4s, msg0.4s, k2.4s
	sha1su1	msg1.4s, msg0.4s
	sha1su0	msg2.4s, msg3.4s, msg0.4s

	/* rounds 44--47 */
	sha1h	e0, abcd_s
	sha1m	abcd_q, e1, tmp1.4s
	add	tmp1.4s, msg1.4s, k2.4s
	sha1su1	msg2.4s, msg1.4s
	sha1su0	msg3.4s, msg0.4s, msg1.4s

	/* rounds 48--51 */
	sha1h	e1, abcd_s
	sha1m	abcd_q, e0, tmp0.4s
	add	tmp0.4s, msg2.4s, k2.4s
	sha1su1	msg3.4s, msg2.4s
	sha1su0	msg0.4s, msg1.4s, msg2.4s

	/* rounds 52--55 */
	sha1h	e0, abcd_s
	sha1m	abcd_q, e1, tmp1.4s
	add	tmp1.4s, msg3.4s, k3.4s
	sha1su1	msg0.4s, msg3.4s
	sha1su0	msg1.4s, msg2.4s, msg3.4s

	/* rounds 56--59 */
	sha1h	e1, abcd_s
	sha1m	abcd_q, e0, tmp0.4s
	add	tmp0.4s, msg0.4s, k3.4s
	sha1su1	msg1.4s, msg0.4s
	sha1su0	msg2.4s, msg3.4s, msg0.4s

	/* rounds 60--63 */
	sha1h	e0, abcd_s
	sha1p	abcd_q, e1, tmp1.4s
	add	tmp1.4s, msg1.4s, k3.4s
	sha1su1	msg2.4s, msg1.4s
	sha1su0	msg3.4s, msg0.4s, msg1.4s

	/*
	 * rounds 64--67
	 *
	 * msg3 (w[76..79]) is the last schedule vector that is needed.
	 * No further update of msg0 is started: it would yield
	 * w[80..83], which no round consumes.
	 */
	sha1h	e1, abcd_s
	sha1p	abcd_q, e0, tmp0.4s
	add	tmp0.4s, msg2.4s, k3.4s
	sha1su1	msg3.4s, msg2.4s

	/* rounds 68--71 */
	sha1h	e0, abcd_s
	sha1p	abcd_q, e1, tmp1.4s
	add	tmp1.4s, msg3.4s, k3.4s

	/* rounds 72--75 */
	sha1h	e1, abcd_s
	sha1p	abcd_q, e0, tmp0.4s

	/* rounds 76--79 */
	sha1h	e0, abcd_s
	sha1p	abcd_q, e1, tmp1.4s

	add	e0_v.4s, e0_v.4s, e0_saved.4s	// new state = old state + round output
	add	abcd.4s, abcd.4s, abcd_saved.4s

	subs	len, len, #64
	bhi	0b			// loop while input remains

	str	abcd_q, [ctx, #0]	// write updated SHA1 state
	str	e0, [ctx, #16]

1:	ret
END(_libmd_sha1block_sha1)

/*
 * The four SHA1 round constants.  _libmd_sha1block_sha1 loads them with
 * a single ld4r, so they must stay adjacent and in this order.
 */
	.section .rodata
	.balign	16
k1234:	.4byte	0x5a827999		// round constant 1
	.4byte	0x6ed9eba1		// round constant 2
	.4byte	0x8f1bbcdc		// round constant 3
	.4byte	0xca62c1d6		// round constant 4
	.size	k1234, .-k1234

/* empty GNU-stack note: this object does not need an executable stack */
	.section .note.GNU-stack,"",%progbits
