xref: /freebsd/lib/libc/amd64/string/memcmp.S (revision 8f7ed58a15556bf567ff876e1999e4fe4d684e1d)
1/*-
2 * Copyright (c) 2018, 2023 The FreeBSD Foundation
3 *
4 * This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
5 * under sponsorship from the FreeBSD Foundation.
6 *
7 * Portions of this software were developed by Robert Clausecker
8 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32#include <machine/asm.h>
33#include <machine/param.h>
34
35#include "amd64_archlevel.h"
36
37/*
38 * Note: this routine was written with kernel use in mind (read: no simd),
39 * it is only present in userspace as a temporary measure until something
40 * better gets imported.
41 */
42
43#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
44
45#ifdef BCMP
46#define memcmp bcmp
47#endif
48
49ARCHFUNCS(memcmp)
50	ARCHFUNC(memcmp, scalar)
51	ARCHFUNC(memcmp, baseline)
52ENDARCHFUNCS(memcmp)
53
54ARCHENTRY(memcmp, scalar)
55	xorl	%eax,%eax
5610:
57	cmpq	$16,%rdx
58	ja	101632f
59
60	cmpb	$8,%dl
61	jg	100816f
62
63	cmpb	$4,%dl
64	jg	100408f
65
66	cmpb	$2,%dl
67	jge	100204f
68
69	cmpb	$1,%dl
70	jl	100000f
71	movzbl	(%rdi),%eax
72	movzbl	(%rsi),%r8d
73	subl	%r8d,%eax
74100000:
75	ret
76
77	ALIGN_TEXT
78100816:
79	movq	(%rdi),%r8
80	movq	(%rsi),%r9
81	cmpq	%r8,%r9
82	jne	80f
83	movq	-8(%rdi,%rdx),%r8
84	movq	-8(%rsi,%rdx),%r9
85	cmpq	%r8,%r9
86	jne	10081608f
87	ret
88	ALIGN_TEXT
89100408:
90	movl	(%rdi),%r8d
91	movl	(%rsi),%r9d
92	cmpl	%r8d,%r9d
93	jne	80f
94	movl	-4(%rdi,%rdx),%r8d
95	movl	-4(%rsi,%rdx),%r9d
96	cmpl	%r8d,%r9d
97	jne	10040804f
98	ret
99	ALIGN_TEXT
100100204:
101	movzwl	(%rdi),%r8d
102	movzwl	(%rsi),%r9d
103	cmpl	%r8d,%r9d
104	jne	1f
105	movzwl	-2(%rdi,%rdx),%r8d
106	movzwl	-2(%rsi,%rdx),%r9d
107	cmpl	%r8d,%r9d
108	jne	1f
109	ret
110	ALIGN_TEXT
111101632:
112	cmpq	$32,%rdx
113	ja	103200f
114	movq	(%rdi),%r8
115	movq	(%rsi),%r9
116	cmpq	%r8,%r9
117	jne	80f
118	movq	8(%rdi),%r8
119	movq	8(%rsi),%r9
120	cmpq	%r8,%r9
121	jne	10163208f
122	movq	-16(%rdi,%rdx),%r8
123	movq	-16(%rsi,%rdx),%r9
124	cmpq	%r8,%r9
125	jne	10163216f
126	movq	-8(%rdi,%rdx),%r8
127	movq	-8(%rsi,%rdx),%r9
128	cmpq	%r8,%r9
129	jne	10163224f
130	ret
131	ALIGN_TEXT
132103200:
133	movq	(%rdi),%r8
134	movq	8(%rdi),%r9
135	subq	(%rsi),%r8
136	subq	8(%rsi),%r9
137	orq	%r8,%r9
138	jnz	10320000f
139
140	movq    16(%rdi),%r8
141	movq    24(%rdi),%r9
142	subq    16(%rsi),%r8
143	subq    24(%rsi),%r9
144	orq	%r8,%r9
145	jnz     10320016f
146
147	leaq	32(%rdi),%rdi
148	leaq	32(%rsi),%rsi
149	subq	$32,%rdx
150	cmpq	$32,%rdx
151	jae	103200b
152	cmpb	$0,%dl
153	jne	10b
154	ret
155
156/*
157 * Mismatch was found.
158 */
159#ifdef BCMP
160	ALIGN_TEXT
16110320016:
16210320000:
16310081608:
16410163224:
16510163216:
16610163208:
16710040804:
16880:
1691:
170	leal	1(%eax),%eax
171	ret
172#else
173/*
174 * We need to compute the difference between strings.
175 * Start with narrowing the range down (16 -> 8 -> 4 bytes).
176 */
177	ALIGN_TEXT
17810320016:
179	leaq	16(%rdi),%rdi
180	leaq	16(%rsi),%rsi
18110320000:
182	movq	(%rdi),%r8
183	movq	(%rsi),%r9
184	cmpq	%r8,%r9
185	jne	80f
186	leaq	8(%rdi),%rdi
187	leaq	8(%rsi),%rsi
188	jmp	80f
189	ALIGN_TEXT
19010081608:
19110163224:
192	leaq	-8(%rdi,%rdx),%rdi
193	leaq	-8(%rsi,%rdx),%rsi
194	jmp	80f
195	ALIGN_TEXT
19610163216:
197	leaq	-16(%rdi,%rdx),%rdi
198	leaq	-16(%rsi,%rdx),%rsi
199	jmp	80f
200	ALIGN_TEXT
20110163208:
202	leaq	8(%rdi),%rdi
203	leaq	8(%rsi),%rsi
204	jmp	80f
205	ALIGN_TEXT
20610040804:
207	leaq	-4(%rdi,%rdx),%rdi
208	leaq	-4(%rsi,%rdx),%rsi
209	jmp	1f
210
211	ALIGN_TEXT
21280:
213	movl	(%rdi),%r8d
214	movl	(%rsi),%r9d
215	cmpl	%r8d,%r9d
216	jne	1f
217	leaq	4(%rdi),%rdi
218	leaq	4(%rsi),%rsi
219
220/*
221 * We have up to 4 bytes to inspect.
222 */
2231:
224	movzbl	(%rdi),%eax
225	movzbl	(%rsi),%r8d
226	cmpb	%r8b,%al
227	jne	2f
228
229	movzbl	1(%rdi),%eax
230	movzbl	1(%rsi),%r8d
231	cmpb	%r8b,%al
232	jne	2f
233
234	movzbl	2(%rdi),%eax
235	movzbl	2(%rsi),%r8d
236	cmpb	%r8b,%al
237	jne	2f
238
239	movzbl	3(%rdi),%eax
240	movzbl	3(%rsi),%r8d
2412:
242	subl	%r8d,%eax
243	ret
244#endif
245ARCHEND(memcmp, scalar)
246
247ARCHENTRY(memcmp, baseline)
248	cmp		$32, %rdx		# enough to permit use of the long kernel?
249	ja		.Llong
250
251	test		%rdx, %rdx		# zero bytes buffer?
252	je		.L0
253
254	/*
255	 * Compare strings of 1--32 bytes.  We want to do this by
256	 * loading into two xmm registers and then comparing.  To avoid
257	 * crossing into unmapped pages, we either load 32 bytes from
258	 * the start of the buffer or 32 bytes before its end, depending
259	 * on whether there is a page boundary between the overread area
260	 * or not.
261	 */
262
263	/* check for page boundaries overreads */
264	lea		31(%rdi), %eax		# end of overread
265	lea		31(%rsi), %r8d
266	lea		-1(%rdi, %rdx, 1), %ecx	# last character in buffer
267	lea		-1(%rsi, %rdx, 1), %r9d
268	xor		%ecx, %eax
269	xor		%r9d, %r8d
270	test		$PAGE_SIZE, %eax	# are they on different pages?
271	jz		0f
272
273	/* fix up rdi */
274	movdqu		-32(%rdi, %rdx, 1), %xmm0
275	movdqu		-16(%rdi, %rdx, 1), %xmm1
276	lea		-8(%rsp), %rdi		# end of replacement buffer
277	sub		%rdx, %rdi		# start of replacement buffer
278	movdqa		%xmm0, -40(%rsp)	# copy to replacement buffer
279	movdqa		%xmm1, -24(%rsp)
280
2810:	test		$PAGE_SIZE, %r8d
282	jz		0f
283
284	/* fix up rsi */
285	movdqu		-32(%rsi, %rdx, 1), %xmm0
286	movdqu		-16(%rsi, %rdx, 1), %xmm1
287	lea		-40(%rsp), %rsi		# end of replacement buffer
288	sub		%rdx, %rsi		# start of replacement buffer
289	movdqa		%xmm0, -72(%rsp)	# copy to replacement buffer
290	movdqa		%xmm1, -56(%rsp)
291
292	/* load data and compare properly */
2930:	movdqu		16(%rdi), %xmm1
294	movdqu		16(%rsi), %xmm3
295	movdqu		(%rdi), %xmm0
296	movdqu		(%rsi), %xmm2
297	mov		%edx, %ecx
298	mov		$-1, %edx
299	shl		%cl, %rdx		# ones where the buffer is not
300	pcmpeqb		%xmm3, %xmm1
301	pcmpeqb		%xmm2, %xmm0
302	pmovmskb	%xmm1, %ecx
303	pmovmskb	%xmm0, %eax
304	shl		$16, %ecx
305	or		%ecx, %eax		# ones where the buffers match
306	or		%edx, %eax		# including where the buffer is not
307	not		%eax			# ones where there is a mismatch
308#ifndef BCMP
309	bsf		%eax, %edx		# location of the first mismatch
310	cmovz		%eax, %edx		# including if there is no mismatch
311	movzbl		(%rdi, %rdx, 1), %eax	# mismatching bytes
312	movzbl		(%rsi, %rdx, 1), %edx
313	sub		%edx, %eax
314#endif
315	ret
316
317	/* empty input */
318.L0:	xor		%eax, %eax
319	ret
320
321	/* compare 33+ bytes */
322	ALIGN_TEXT
323.Llong:	movdqu		(%rdi), %xmm0		# load head
324	movdqu		(%rsi), %xmm2
325	mov		%rdi, %rcx
326	sub		%rdi, %rsi		# express rsi as distance from rdi
327	and		$~0xf, %rdi		# align rdi to 16 bytes
328	movdqu		16(%rsi, %rdi, 1), %xmm1
329	pcmpeqb		16(%rdi), %xmm1		# compare second half of this iteration
330	add		%rcx, %rdx		# pointer to last byte in buffer
331	jc		.Loverflow		# did this overflow?
3320:	pcmpeqb		%xmm2, %xmm0
333	pmovmskb	%xmm0, %eax
334	xor		$0xffff, %eax		# any mismatch?
335	jne		.Lmismatch_head
336	add		$64, %rdi		# advance to next iteration
337	jmp		1f			# and get going with the loop
338
339	/*
340	 * If we got here, a buffer length was passed to memcmp(a, b, len)
341	 * such that a + len < a.  While this sort of usage is illegal,
342	 * it is plausible that a caller tries to do something like
343	 * memcmp(a, b, SIZE_MAX) if a and b are known to differ, intending
344	 * for memcmp() to stop comparing at the first mismatch.  This
345	 * behaviour is not guaranteed by any version of ISO/IEC 9899,
346	 * but usually works out in practice.  Let's try to make this
347	 * case work by comparing until the end of the address space.
348	 */
349.Loverflow:
350	mov		$-1, %rdx		# compare until the end of memory
351	jmp		0b
352
353	/* process buffer 32 bytes at a time */
354	ALIGN_TEXT
3550:	movdqu		-32(%rsi, %rdi, 1), %xmm0
356	movdqu		-16(%rsi, %rdi, 1), %xmm1
357	pcmpeqb		-32(%rdi), %xmm0
358	pcmpeqb		-16(%rdi), %xmm1
359	add		$32, %rdi		# advance to next iteration
3601:	pand		%xmm0, %xmm1		# 0xff where both halves matched
361	pmovmskb	%xmm1, %eax
362	cmp		$0xffff, %eax		# all bytes matched?
363	jne		.Lmismatch
364	cmp		%rdx, %rdi		# end of buffer reached?
365	jb		0b
366
367	/* less than 32 bytes left to compare */
368	movdqu		-16(%rdx), %xmm1	# load 32 byte tail through end pointer
369	movdqu		-16(%rdx, %rsi, 1), %xmm3
370	movdqu		-32(%rdx), %xmm0
371	movdqu		-32(%rdx, %rsi, 1), %xmm2
372	pcmpeqb		%xmm3, %xmm1
373	pcmpeqb		%xmm2, %xmm0
374	pmovmskb	%xmm1, %ecx
375	pmovmskb	%xmm0, %eax
376	shl		$16, %ecx
377	or		%ecx, %eax		# ones where the buffers match
378	not		%eax			# ones where there is a mismatch
379#ifndef BCMP
380	bsf		%eax, %ecx		# location of the first mismatch
381	cmovz		%eax, %ecx		# including if there is no mismatch
382	add		%rcx, %rdx		# pointer to potential mismatch
383	movzbl		-32(%rdx), %eax		# mismatching bytes
384	movzbl		-32(%rdx, %rsi, 1), %edx
385	sub		%edx, %eax
386#endif
387	ret
388
389#ifdef BCMP
390.Lmismatch:
391	mov		$1, %eax
392.Lmismatch_head:
393	ret
394#else /* memcmp */
395.Lmismatch_head:
396	tzcnt		%eax, %eax		# location of mismatch
397	add		%rax, %rcx		# pointer to mismatch
398	movzbl		(%rcx), %eax		# mismatching bytes
399	movzbl		(%rcx, %rsi, 1), %ecx
400	sub		%ecx, %eax
401	ret
402
403.Lmismatch:
404	movdqu		-48(%rsi, %rdi, 1), %xmm1
405	pcmpeqb		-48(%rdi), %xmm1	# reconstruct xmm1 before PAND
406	pmovmskb	%xmm0, %eax		# mismatches in first 16 bytes
407	pmovmskb	%xmm1, %edx		# mismatches in second 16 bytes
408	shl		$16, %edx
409	or		%edx, %eax		# mismatches in both
410	not		%eax			# matches in both
411	tzcnt		%eax, %eax		# location of mismatch
412	add		%rax, %rdi		# pointer to mismatch
413	movzbl		-64(%rdi), %eax		# mismatching bytes
414	movzbl		-64(%rdi, %rsi, 1), %ecx
415	sub		%ecx, %eax
416	ret
417#endif
418ARCHEND(memcmp, baseline)
419
420	.section .note.GNU-stack,"",%progbits
421