xref: /freebsd/lib/libc/amd64/string/strcmp.S (revision b9fa1500cb2265b95927e19b9d2119ca26d65be3)
1/*-
2 * Copyright (c) 2023, The FreeBSD Foundation
3 *
4 * SPDX-License-Expression: BSD-2-Clause
5 *
6 * Portions of this software were developed by Robert Clausecker
7 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
8 *
9 * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcmp.S
10 * written by J.T. Conklin <jtc@acorntoolworks.com> that was originally
11 * dedicated to the public domain.
12 */
13
14#include <machine/asm.h>
15#include <machine/param.h>
16
17#if 0
18	RCSID("$NetBSD: strcmp.S,v 1.3 2004/07/19 20:04:41 drochner Exp $")
19#endif
20
21#include "amd64_archlevel.h"
22
23#define ALIGN_TEXT	.p2align 4, 0x90
24
25ARCHFUNCS(strcmp)
26	ARCHFUNC(strcmp, scalar)
27	ARCHFUNC(strcmp, baseline)
28ENDARCHFUNCS(strcmp)
29
30ARCHENTRY(strcmp, scalar)
31	/*
32	 * Align s1 to word boundary.
33	 * Consider unrolling loop?
34	 */
35.Ls1align:
36	testb	$7,%dil
37	je	.Ls1aligned
38	movb	(%rdi),%al
39	incq	%rdi
40	movb	(%rsi),%dl
41	incq	%rsi
42	testb	%al,%al
43	je	.Ldone
44	cmpb	%al,%dl
45	je	.Ls1align
46	jmp	.Ldone
47
48	/*
49	 * Check whether s2 is aligned to a word boundary.  If it is, we
50	 * can compare by words.  Otherwise we have to compare by bytes.
51	 */
52.Ls1aligned:
53	testb	$7,%sil
54	jne	.Lbyte_loop
55
56	movabsq	$0x0101010101010101,%r8
57	subq	$8,%rdi
58	movabsq	$0x8080808080808080,%r9
59	subq	$8,%rsi
60
61	ALIGN_TEXT
62.Lword_loop:
63	movq	8(%rdi),%rax
64	addq	$8,%rdi
65	movq	8(%rsi),%rdx
66	addq	$8,%rsi
67	cmpq	%rax,%rdx
68	jne	.Lbyte_loop
69	subq	%r8,%rdx
70	notq	%rax
71	andq	%rax,%rdx
72	testq	%r9,%rdx
73	je	.Lword_loop
74
75	ALIGN_TEXT
76.Lbyte_loop:
77	movb	(%rdi),%al
78	incq	%rdi
79	movb	(%rsi),%dl
80	incq	%rsi
81	testb	%al,%al
82	je	.Ldone
83	cmpb	%al,%dl
84	je	.Lbyte_loop
85
86.Ldone:
87	movzbq	%al,%rax
88	movzbq	%dl,%rdx
89	subq	%rdx,%rax
90	ret
91ARCHEND(strcmp, scalar)
92
93ARCHENTRY(strcmp, baseline)
94	/* check if either string crosses a page in the head */
95	lea		15(%rdi), %r8d	# end of head
96	lea		15(%rsi), %r9d
97	mov		%edi, %eax
98	mov		%esi, %edx
99	xor		%edi, %r8d	# bits that changed between first and last byte
100	xor		%esi, %r9d
101	and		$~0xf, %rdi	# align heads to 16 bytes
102	and		$~0xf, %rsi
103	or		%r8d, %r9d	# in either RSI or RDI
104	and		$0xf, %eax	# offset from alignment
105	and		$0xf, %edx
106	pxor		%xmm1, %xmm1
107	test		$PAGE_SIZE, %r9d # did the page change?
108	jz		0f		# if not, take fast path
109
110	/* heads may cross page boundary, avoid unmapped loads */
111	movdqa		(%rdi), %xmm0	# load aligned heads
112	movdqa		(%rsi), %xmm2
113	mov		$-1, %r8d
114	mov		$-1, %r9d
115	mov		%eax, %ecx
116	shl		%cl, %r8d	# string head in XMM0
117	mov		%edx, %ecx
118	shl		%cl, %r9d	# string head in XMM2
119	movdqa		%xmm0, -40(%rsp) # stash copies of the heads on the stack
120	movdqa		%xmm2, -24(%rsp)
121	pcmpeqb		%xmm1, %xmm0
122	pcmpeqb		%xmm1, %xmm2
123	pmovmskb	%xmm0, %r10d
124	pmovmskb	%xmm2, %r11d
125	test		%r8d, %r10d	# NUL byte present in first string?
126	lea		-40(%rsp), %r8
127	cmovz		%rdi, %r8
128	test		%r9d, %r11d	# NUL byte present in second string?
129	lea		-24(%rsp), %r9
130	cmovz		%rsi, %r9
131	movdqu		(%r8, %rax, 1), %xmm0 # load true (or fake) heads
132	movdqu		(%r9, %rdx, 1), %xmm4
133	jmp		1f
134
1350:	movdqu		(%rdi, %rax, 1), %xmm0 # load true heads
136	movdqu		(%rsi, %rdx, 1), %xmm4
1371:	pxor		%xmm2, %xmm2
138	pcmpeqb		%xmm0, %xmm2	# NUL byte present?
139	pcmpeqb		%xmm0, %xmm4	# which bytes match?
140	pandn		%xmm4, %xmm2	# match and not NUL byte?
141	pmovmskb	%xmm2, %r9d
142	xor		$0xffff, %r9d	# mismatch or NUL byte?
143	jnz		.Lhead_mismatch
144
145	/* load head and second chunk */
146	movdqa		16(%rdi), %xmm2	# load second chunks
147	movdqa		16(%rsi), %xmm3
148	sub		%rdx, %rax	# is a&0xf >= b&0xf?
149	jb		.Lswapped	# if not, proceed with swapped operands
150
151	neg		%rax
152	movdqu		16(%rsi, %rax, 1), %xmm0
153	sub		%rdi, %rsi	# express RSI as distance from RDI
154	lea		(%rsi, %rax, 1), %rdx # point RDX to offset in second string
155	neg		%rax
156	pcmpeqb		%xmm3, %xmm1	# ... corresponding to RDI
157	pcmpeqb		%xmm2, %xmm0
158	pmovmskb	%xmm1, %r8d
159	pmovmskb	%xmm0, %r9d
160	add		$16, %rdi
161	test		%r8d, %r8d
162	jnz		.Lnul_found
163	xor		$0xffff, %r9d
164	jnz		.Lmismatch
165	add		$16, %rdi	# advance aligned pointers
166
167	/*
168	 * During the main loop, the layout of the two strings is something like:
169	 *
170	 *          v ------1------ v ------2------ v
171	 *     RDI:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
172	 *     RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
173	 *
174	 * where v indicates the alignment boundaries and corresponding chunks
175	 * of the strings have the same letters.  Chunk A has been checked in
176	 * the previous iteration.  This iteration, we first check that string
177	 * RSI doesn't end within region 2, then we compare chunk B between the
178	 * two strings.  As RSI is known not to hold a NUL byte in regsions 1
179	 * and 2 at this point, this also ensures that RDI has not ended yet.
180	 */
181	ALIGN_TEXT
1820:	movdqu		(%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
183	pxor		%xmm1, %xmm1
184	pcmpeqb		(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
185	pcmpeqb		(%rdi), %xmm0	# where do the chunks match?
186	pmovmskb	%xmm1, %r8d
187	pmovmskb	%xmm0, %r9d
188	test		%r8d, %r8d
189	jnz		.Lnul_found
190	xor		$0xffff, %r9d	# any mismatches?
191	jnz		.Lmismatch
192
193	/* main loop unrolled twice */
194	movdqu		16(%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
195	pxor		%xmm1, %xmm1
196	pcmpeqb		16(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
197	pcmpeqb		16(%rdi), %xmm0	# where do the chunks match?
198	pmovmskb	%xmm1, %r8d
199	pmovmskb	%xmm0, %r9d
200	add		$32, %rdi
201	test		%r8d, %r8d
202	jnz		.Lnul_found2
203	xor		$0xffff, %r9d	# any mismatches?
204	jz		0b
205
206	sub		$16, %rdi	# roll back second increment
207
208	/* a mismatch has been found between RDX and RSI */
209.Lmismatch:
210	tzcnt		%r9d, %r9d	# where is the mismatch?
211	add		%rdi, %rdx	# turn RDX from offset to pointer
212	movzbl		(%rdx, %r9, 1), %ecx
213	movzbl		(%rdi, %r9, 1), %eax
214	sub		%ecx, %eax	# difference of the mismatching chars
215	ret
216
217	/* mismatch in true heads */
218.Lhead_mismatch:
219	tzcnt		%r9d, %r9d	# where is the mismatch?
220	add		%rax, %rdi	# return to true heads
221	add		%rdx, %rsi
222	movzbl		(%rdi, %r9, 1), %eax # mismatching characters
223	movzbl		(%rsi, %r9, 1), %ecx
224	sub		%ecx, %eax
225	ret
226
227.Lnul_found2:
228	sub		$16, %rdi	# roll back second increment
229
230	/* a NUL has been found in RSI */
231.Lnul_found:
232	mov		%eax, %ecx
233	mov		%r8d, %r10d
234	shl		%cl, %r8w	# adjust NUL mask to positions in RDI/RDX
235	xor		$0xffff, %r9d	# mask of mismatches
236	or		%r8d, %r9d	# NUL bytes also count as mismatches
237	jnz		.Lmismatch
238
239	/*
240	 * (RDI) == (RSI) and NUL is past the string.
241	 * Compare (RSI) with the corresponding part
242	 * of the other string until the NUL byte.
243	 */
244	movdqu		(%rdi, %rax, 1), %xmm0
245	pcmpeqb		(%rdi, %rsi, 1), %xmm0
246	add		%rdi, %rsi	# restore RSI pointer
247	add		%rax, %rdi	# point RDI to chunk corresponding to (RSI)
248	pmovmskb	%xmm0, %ecx	# mask of matches
249	not		%ecx		# mask of mismatches
250	or		%r10d, %ecx	# mask of mismatches or NUL bytes
251	tzcnt		%ecx, %ecx	# location of first mismatch
252	movzbl		(%rdi, %rcx, 1), %eax
253	movzbl		(%rsi, %rcx, 1), %ecx
254	sub		%ecx, %eax
255	ret
256
257	/*
258	 * If (a&0xf) < (b&0xf), we do the same thing but with swapped
259	 * operands.  I found that this performs slightly better than
260	 * using conditional moves to do the swap branchless.
261	 */
262.Lswapped:
263	movdqu		16(%rdi, %rax, 1), %xmm0
264	sub		%rsi, %rdi	# express RDI as distance from RSI
265	lea		(%rdi, %rax, 1), %rdx # point RDX to offset in RDI corresponding to RSI
266	neg		%rax		# make difference positive
267	pcmpeqb		%xmm2, %xmm1
268	pcmpeqb		%xmm3, %xmm0
269	pmovmskb	%xmm1, %r8d
270	pmovmskb	%xmm0, %r9d
271	add		$16, %rsi	# advance aligned pointers
272	test		%r8d, %r8d
273	jnz		.Lnul_founds
274	xor		$0xffff, %r9d
275	jnz		.Lmismatchs
276	add		$16, %rsi
277
278	/*
279	 * During the main loop, the layout of the two strings is something like:
280	 *
281	 *          v ------1------ v ------2------ v
282	 *     RDI:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
283	 *     RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
284	 *
285	 * where v indicates the alignment boundaries and corresponding chunks
286	 * of the strings have the same letters.  Chunk A has been checked in
287	 * the previous iteration.  This iteration, we first check that string
288	 * RSI doesn't end within region 2, then we compare chunk B between the
289	 * two strings.  As RSI is known not to hold a NUL byte in regsions 1
290	 * and 2 at this point, this also ensures that RDI has not ended yet.
291	 */
292	ALIGN_TEXT
2930:	movdqu		(%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
294	pxor		%xmm1, %xmm1
295	pcmpeqb		(%rsi, %rdi, 1), %xmm1 # end of string in RSI?
296	pcmpeqb		(%rsi), %xmm0	# where do the chunks match?
297	pmovmskb	%xmm1, %r8d
298	pmovmskb	%xmm0, %r9d
299	test		%r8d, %r8d
300	jnz		.Lnul_founds
301	xor		$0xffff, %r9d	# any mismatches?
302	jnz		.Lmismatchs
303
304	/* main loop unrolled twice */
305	movdqu		16(%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
306	pxor		%xmm1, %xmm1
307	pcmpeqb		16(%rsi, %rdi, 1), %xmm1 # end of string in RSI?
308	pcmpeqb		16(%rsi), %xmm0	# where do the chunks match?
309	pmovmskb	%xmm1, %r8d
310	pmovmskb	%xmm0, %r9d
311	add		$32, %rsi
312	test		%r8d, %r8d
313	jnz		.Lnul_found2s
314	xor		$0xffff, %r9d	# any mismatches?
315	jz		0b
316
317	sub		$16, %rsi	# roll back second increment
318
319	/* a mismatch has been found between RDX and RDI */
320.Lmismatchs:
321	tzcnt		%r9d, %r9d	# where is the mismatch?
322	add		%rsi, %rdx	# turn RDX from offset to pointer
323	movzbl		(%rdx, %r9, 1), %eax
324	movzbl		(%rsi, %r9, 1), %ecx
325	sub		%ecx, %eax	# difference of the mismatching chars
326	ret
327
328.Lnul_found2s:
329	sub		$16, %rsi	# roll back second increment
330
331	/* a NUL has been found in RSI */
332.Lnul_founds:
333	mov		%eax, %ecx
334	mov		%r8d, %r10d
335	shl		%cl, %r8w	# adjust NUL mask to positions in RDI/RDX
336	xor		$0xffff, %r9d	# mask of mismatches
337	or		%r8d, %r9d	# NUL bytes also count as mismatches
338	jnz		.Lmismatchs
339
340	/*
341	 * (RDI) == (RSI) and NUL is past the string.
342	 * Compare (RSI) with the corresponding part
343	 * of the other string until the NUL byte.
344	 */
345	movdqu		(%rsi, %rax, 1), %xmm0
346	pcmpeqb		(%rsi, %rdi, 1), %xmm0
347	add		%rsi, %rdi	# restore RDI pointer
348	add		%rax, %rsi	# point RSI to chunk corresponding to (RDI)
349	pmovmskb	%xmm0, %ecx	# mask of matches
350	not		%ecx		# mask of mismatches
351	or		%r10d, %ecx	# mask of mismatches or NUL bytes
352	tzcnt		%ecx, %ecx	# location of first mismatch
353	movzbl		(%rdi, %rcx, 1), %eax
354	movzbl		(%rsi, %rcx, 1), %ecx
355	sub		%ecx, %eax
356	ret
357ARCHEND(strcmp, baseline)
358
359	.section .note.GNU-stack,"",%progbits
360