/*-
 * Copyright (c) 2024, 2025 Robert Clausecker <fuz@FreeBSD.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <machine/asm.h>

/* apply the round keys to the four round functions */
.macro	allrounds	rfn0, rfn1, rfn2, rfn3
	\rfn0	 0, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee
	\rfn0	 4, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501
	\rfn0	 8, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
	\rfn0	12, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821

	\rfn1	16, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
	\rfn1	20, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
	\rfn1	24, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
	\rfn1	28, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a

	\rfn2	32, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
	\rfn2	36, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
	\rfn2	40, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
	\rfn2	44, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665

	\rfn3	48, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
	\rfn3	52, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
	\rfn3	56, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
	\rfn3	60, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
.endm
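
/*
 * The constants above are the standard MD5 round keys from RFC 1321,
 * k[i] = floor(2^32 * abs(sin(i + 1))) for i = 0..63, grouped four per
 * line so that each \rfnN invocation covers one group of four rounds
 * starting at the given round index.
 */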

	// md5block(MD5_CTX, buf, len)
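	// per the SysV amd64 calling convention: %rdi = ctx (state words
	// A, B, C, D at offsets 0, 4, 8, 12), %rsi = buf, %rdx = len in bytes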
ENTRY(_libmd_md5block_baseline)
.macro	round	a, b, c, d, f, k, m, s
	\f	%ebp, \b, \c, \d
	add	$\k, \a			// a + k[i]
	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g]
	add	%ebp, \a		// a + k[i] + m[g] + f
	rol	$\s, \a
	add	\b, \a
.endm
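
	// each round thus computes a = b + rol(a + f(b,c,d) + m[g] + k[i], s),
	// with the round function \f leaving its result in %ebp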

	// f = b ? c : d
.macro	f0	f, b, c, d
	mov	\c, \f
	xor	\d, \f
	and	\b, \f
	xor	\d, \f
.endm
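
	// computed as ((c ^ d) & b) ^ d == (b & c) | (~b & d),
	// needing only one scratch register and no NOT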

	// f = d ? b : c
.macro	f1	f, b, c, d
	mov	\c, \f
	xor	\b, \f
	and	\d, \f
	xor	\c, \f
.endm
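
	// computed as ((c ^ b) & d) ^ c == (d & b) | (~d & c)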

	// f = b ^ c ^ d
.macro	f2	f, b, c, d
	mov	\c, \f
	xor	\d, \f
	xor	\b, \f
.endm

	// f = c ^ (b | ~d)
.macro	f3	f, b, c, d
	mov	$-1, \f
	xor	\d, \f
	or	\b, \f
	xor	\c, \f
.endm
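
	// computed as ((-1 ^ d) | b) ^ c: -1 ^ d is ~d, hence (b | ~d) ^ c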

	// do 4 rounds
.macro	rounds	f, p, q, s0, s1, s2, s3, k0, k1, k2, k3
	round	%eax, %ebx, %ecx, %edx, \f, \k0, \p*0+\q, \s0
	round	%edx, %eax, %ebx, %ecx, \f, \k1, \p*1+\q, \s1
	round	%ecx, %edx, %eax, %ebx, \f, \k2, \p*2+\q, \s2
	round	%ebx, %ecx, %edx, %eax, \f, \k3, \p*3+\q, \s3
.endm

	// do 4 rounds with f0, f1, f2, f3
.macro	rounds0	i, k0, k1, k2, k3
	rounds	f0, 1, \i, 7, 12, 17, 22, \k0, \k1, \k2, \k3
.endm

.macro	rounds1	i, k0, k1, k2, k3
	rounds	f1, 5, 5*\i+1, 5, 9, 14, 20, \k0, \k1, \k2, \k3
.endm

.macro	rounds2	i, k0, k1, k2, k3
	rounds	f2, 3, 3*\i+5, 4, 11, 16, 23, \k0, \k1, \k2, \k3
.endm

.macro	rounds3	i, k0, k1, k2, k3
	rounds	f3, 7, 7*\i, 6, 10, 15, 21, \k0, \k1, \k2, \k3
.endm
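
	/*
	 * The message word index g follows RFC 1321: g = i for rounds
	 * 0-15, (5i + 1) mod 16 for rounds 16-31, (3i + 5) mod 16 for
	 * rounds 32-47, and 7i mod 16 for rounds 48-63.  The p and q
	 * arguments encode this as g = (p*n + q) mod 16 for the n-th
	 * round within each group of four; the (\m)%16*4 displacement
	 * in the round macro takes care of the mod 16 and the scaling
	 * to bytes.
	 */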

	push	%rbx
	push	%rbp
	push	%r12

	and	$~63, %rdx		// round length down to whole blocks
	lea	(%rsi, %rdx, 1), %r12	// end pointer

	mov	(%rdi), %eax		// a
	mov	4(%rdi), %ebx		// b
	mov	8(%rdi), %ecx		// c
	mov	12(%rdi), %edx		// d

	cmp	%rsi, %r12		// any data to process?
	je	.Lend

	.balign	16
.Lloop:	mov	%eax, %r8d
	mov	%ebx, %r9d
	mov	%ecx, %r10d
	mov	%edx, %r11d

	allrounds	rounds0, rounds1, rounds2, rounds3

	add	%r8d, %eax
	add	%r9d, %ebx
	add	%r10d, %ecx
	add	%r11d, %edx

	add	$64, %rsi
	cmp	%rsi, %r12
	jne	.Lloop

	mov	%eax, (%rdi)
	mov	%ebx, 4(%rdi)
	mov	%ecx, 8(%rdi)
	mov	%edx, 12(%rdi)

.Lend:	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
END(_libmd_md5block_baseline)

	/*
	 * An implementation leveraging the ANDN instruction
	 * from BMI1 to shorten some dependency chains.
	 */
ENTRY(_libmd_md5block_bmi1)
	// special-cased round 1
	// f1 = d ? b : c = (d & b) + (~d & c)
.macro	round1	a, b, c, d, k, m, s
	andn	\c, \d, %edi		// ~d & c
	add	$\k, \a			// a + k[i]
	mov	\d, %ebp
	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g]
	and	\b, %ebp		// d & b
	add	%edi, \a		// a + k[i] + m[g] + (~d & c)
	add	%ebp, \a		// a + k[i] + m[g] + (~d & c) + (d & b)
	rol	$\s, \a
	add	\b, \a
.endm
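
	// ANDN forms ~d & c in a single instruction and without a
	// separate NOT, so the two halves of f1 can be computed in
	// parallel and added into the running sum independently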

	// special-cased round 3
	// f3 = c ^ (b | ~d) = ~(c ^ ~b & d) = -1 - (c ^ ~b & d)
.macro	round3	a, b, c, d, k, m, s
	andn	\d, \b, %ebp
	add	$\k - 1, \a		// a + k[i] - 1
	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g] - 1
	xor	\c, %ebp
	sub	%ebp, \a		// a + k[i] + m[g] + f
	rol	$\s, \a
	add	\b, \a
.endm
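
	// the complement in ~(c ^ (~b & d)) is folded into the constant:
	// as ~x == -1 - x, we add k[i] - 1 up front and then subtract
	// c ^ (~b & d) instead of adding f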

	.purgem	rounds1
.macro	rounds1	i, k0, k1, k2, k3
	round1	%eax, %ebx, %ecx, %edx, \k0, 5*\i+ 1,  5
	round1	%edx, %eax, %ebx, %ecx, \k1, 5*\i+ 6,  9
	round1	%ecx, %edx, %eax, %ebx, \k2, 5*\i+11, 14
	round1	%ebx, %ecx, %edx, %eax, \k3, 5*\i+16, 20
.endm

	.purgem	rounds3
.macro	rounds3	i, k0, k1, k2, k3
	round3	%eax, %ebx, %ecx, %edx, \k0, 7*\i+ 0,  6
	round3	%edx, %eax, %ebx, %ecx, \k1, 7*\i+ 7, 10
	round3	%ecx, %edx, %eax, %ebx, \k2, 7*\i+14, 15
	round3	%ebx, %ecx, %edx, %eax, \k3, 7*\i+21, 21
.endm
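
	// only rounds1 and rounds3 are redefined to use ANDN;
	// rounds0 and rounds2 are reused unchanged from the baseline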

	push	%rbx
	push	%rbp
	push	%r12

	and	$~63, %rdx		// round length down to whole blocks
	lea	(%rsi, %rdx, 1), %r12	// end pointer

	mov	(%rdi), %eax		// a
	mov	4(%rdi), %ebx		// b
	mov	8(%rdi), %ecx		// c
	mov	12(%rdi), %edx		// d

	cmp	%rsi, %r12		// any data to process?
	je	0f

	push	%rdi

	.balign	16
1:	mov	%eax, %r8d
	mov	%ebx, %r9d
	mov	%ecx, %r10d
	mov	%edx, %r11d

	allrounds	rounds0, rounds1, rounds2, rounds3

	add	%r8d, %eax
	add	%r9d, %ebx
	add	%r10d, %ecx
	add	%r11d, %edx

	add	$64, %rsi
	cmp	%rsi, %r12
	jne	1b

	pop	%rdi
	mov	%eax, (%rdi)
	mov	%ebx, 4(%rdi)
	mov	%ecx, 8(%rdi)
	mov	%edx, 12(%rdi)

0:	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
END(_libmd_md5block_bmi1)

#ifndef _KERNEL
	/*
	 * An implementation leveraging AVX-512 for its VPTERNLOGD
	 * instruction.  We're using only XMM registers here,
	 * avoiding costly thermal licensing.
	 */
ENTRY(_libmd_md5block_avx512)
.macro	vround		a, b, c, d, f, i, m, mi, s
	vmovdqa		\b, %xmm4
	vpternlogd	$\f, \d, \c, %xmm4
	vpaddd		4*(\i)(%rax){1to4}, \m, %xmm5 // m[g] + k[i]
.if	\mi != 0
	vpshufd		$0x55 * \mi, %xmm5, %xmm5	// broadcast to each dword
.endif
	vpaddd		%xmm5, \a, \a		// a + k[i] + m[g]
	vpaddd		%xmm4, \a, \a		// a + k[i] + m[g] + f
	vprold		$\s, \a, \a
	vpaddd		\b, \a, \a
.endm
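
	/*
	 * The state is kept in the low dword of %xmm0-%xmm3.  VPTERNLOGD
	 * evaluates the round function in a single instruction, with the
	 * immediate \f serving as its truth table (see below), and VPROLD
	 * rotates without needing an extra register.  The round key is
	 * added via an embedded broadcast from the keys table; the message
	 * word lives in element \mi of \m, so VPSHUFD with immediate
	 * 0x55 * \mi replicates that element into lane 0 (skipped when
	 * \mi is 0, as the word is already in place).
	 */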

.macro	vrounds		f, i, m0, i0, m1, i1, m2, i2, m3, i3, s0, s1, s2, s3
	vround		%xmm0, %xmm1, %xmm2, %xmm3, \f, \i+0, \m0, \i0, \s0
	vround		%xmm3, %xmm0, %xmm1, %xmm2, \f, \i+1, \m1, \i1, \s1
	vround		%xmm2, %xmm3, %xmm0, %xmm1, \f, \i+2, \m2, \i2, \s2
	vround		%xmm1, %xmm2, %xmm3, %xmm0, \f, \i+3, \m3, \i3, \s3
.endm

/*
 * d c b f0 f1 f2 f3
 * 0 0 0  0  0  0  1
 * 1 0 0  1  0  1  0
 * 0 1 0  0  1  1  0
 * 1 1 0  1  0  0  1
 * 0 0 1  0  0  1  1
 * 1 0 1  0  1  0  1
 * 0 1 1  1  1  0  0
 * 1 1 1  1  1  1  0
 */
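
/*
 * VPTERNLOGD indexes its immediate with 4b + 2c + d (destination
 * first, then the two sources), so each column above, read from the
 * bottom row (bit 7) up to the top row (bit 0), gives the immediate
 * used below: 0xca, 0xe4, 0x96, and 0x39.
 */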

.macro	vrounds0	i, m
	vrounds		0xca, \i, \m, 0, \m, 1, \m, 2, \m, 3, 7, 12, 17, 22
.endm

.macro	vrounds1	i, m0, i0, m1, i1, m2, i2, m3, i3
	vrounds		0xe4, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 5, 9, 14, 20
.endm

.macro	vrounds2	i, m0, i0, m1, i1, m2, i2, m3, i3
	vrounds		0x96, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 4, 11, 16, 23
.endm

.macro	vrounds3	i, m0, i0, m1, i1, m2, i2, m3, i3
	vrounds		0x39, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 6, 10, 15, 21
.endm
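
/*
 * The calls in the loop below pass, for each round, the XMM register
 * holding the aligned group of four message words that contains m[g]
 * (%xmm8 holds words 0-3, ..., %xmm11 holds words 12-15) along with
 * the element index g % 4, with g computed as in the scalar versions.
 */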

	and		$~63, %rdx		// round length down to whole blocks
	add		%rsi, %rdx		// end pointer

	vmovd		(%rdi), %xmm0		// a
	vmovd		4(%rdi), %xmm1		// b
	vmovd		8(%rdi), %xmm2		// c
	vmovd		12(%rdi), %xmm3		// d

	lea		keys(%rip), %rax

	cmp		%rsi, %rdx		// any data to process?
	je		0f

	.balign		16
1:	vmovdqu		0*4(%rsi), %xmm8	// message words
	vmovdqu		4*4(%rsi), %xmm9
	vmovdqu		8*4(%rsi), %xmm10
	vmovdqu		12*4(%rsi), %xmm11

	vmovdqa		%xmm0, %xmm12		// stash old state variables
	vmovdqa		%xmm1, %xmm13
	vmovdqa		%xmm2, %xmm14
	vmovdqa		%xmm3, %xmm15

	vrounds0	 0, %xmm8
	vrounds0	 4, %xmm9
	vrounds0	 8, %xmm10
	vrounds0	12, %xmm11

	vrounds1	16,  %xmm8, 1,  %xmm9, 2, %xmm10, 3,  %xmm8, 0
	vrounds1	20,  %xmm9, 1, %xmm10, 2, %xmm11, 3,  %xmm9, 0
	vrounds1	24, %xmm10, 1, %xmm11, 2,  %xmm8, 3, %xmm10, 0
	vrounds1	28, %xmm11, 1,  %xmm8, 2,  %xmm9, 3, %xmm11, 0

	vrounds2	32,  %xmm9, 1, %xmm10, 0, %xmm10, 3, %xmm11, 2
	vrounds2	36,  %xmm8, 1,  %xmm9, 0,  %xmm9, 3, %xmm10, 2
	vrounds2	40, %xmm11, 1,  %xmm8, 0,  %xmm8, 3,  %xmm9, 2
	vrounds2	44, %xmm10, 1, %xmm11, 0, %xmm11, 3,  %xmm8, 2

	vrounds3	48,  %xmm8, 0,  %xmm9, 3, %xmm11, 2,  %xmm9, 1
	vrounds3	52, %xmm11, 0,  %xmm8, 3, %xmm10, 2,  %xmm8, 1
	vrounds3	56, %xmm10, 0, %xmm11, 3,  %xmm9, 2, %xmm11, 1
	vrounds3	60,  %xmm9, 0, %xmm10, 3,  %xmm8, 2, %xmm10, 1

	vpaddd		%xmm12, %xmm0, %xmm0
	vpaddd		%xmm13, %xmm1, %xmm1
	vpaddd		%xmm14, %xmm2, %xmm2
	vpaddd		%xmm15, %xmm3, %xmm3

	add		$64, %rsi
	cmp		%rsi, %rdx
	jne		1b

	vmovd		%xmm0, (%rdi)
	vmovd		%xmm1, 4(%rdi)
	vmovd		%xmm2, 8(%rdi)
	vmovd		%xmm3, 12(%rdi)

0:	ret
END(_libmd_md5block_avx512)

	// round keys, for use in md5block_avx512
	.section	.rodata
	.balign		16

.macro	putkeys		i, a, b, c, d
	.4byte		\a, \b, \c, \d
.endm

keys:	allrounds	putkeys, putkeys, putkeys, putkeys
	.size		keys, .-keys
#endif /* !defined(_KERNEL) */

	.section .note.GNU-stack,"",%progbits