xref: /freebsd/lib/libc/amd64/string/strcmp.S (revision bca25680b91b3bea7faef615765806a04634eb23)
1*bca25680SRobert Clausecker/*-
2*bca25680SRobert Clausecker * Copyright (c) 2023, The FreeBSD Foundation
3*bca25680SRobert Clausecker *
4*bca25680SRobert Clausecker * SPDX-License-Expression: BSD-2-Clause
5*bca25680SRobert Clausecker *
6*bca25680SRobert Clausecker * Portions of this software were developed by Robert Clausecker
7*bca25680SRobert Clausecker * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
8*bca25680SRobert Clausecker *
9*bca25680SRobert Clausecker * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcmp.S
10*bca25680SRobert Clausecker * written by J.T. Conklin <jtc@acorntoolworks.com> that was originally
11*bca25680SRobert Clausecker * dedicated to the public domain.
12e5dd4df8SAlan Cox */
13e5dd4df8SAlan Cox
14e5dd4df8SAlan Cox#include <machine/asm.h>
15*bca25680SRobert Clausecker#include <machine/param.h>
16*bca25680SRobert Clausecker
17e5dd4df8SAlan Cox#if 0
18e5dd4df8SAlan Cox	RCSID("$NetBSD: strcmp.S,v 1.3 2004/07/19 20:04:41 drochner Exp $")
19e5dd4df8SAlan Cox#endif
20e5dd4df8SAlan Cox
21*bca25680SRobert Clausecker#include "amd64_archlevel.h"
22*bca25680SRobert Clausecker
/* align to the next 16-byte (2^4) boundary, padding with 0x90 (NOP) bytes */
23*bca25680SRobert Clausecker#define ALIGN_TEXT	.p2align 4, 0x90
24*bca25680SRobert Clausecker
/*
 * The two strcmp variants provided by this file; each one is defined
 * further down between ARCHENTRY and ARCHEND.  The ARCHFUNCS family of
 * macros comes from amd64_archlevel.h; presumably it builds the table
 * from which one variant is picked at run time -- confirm in that header.
 */
25*bca25680SRobert ClauseckerARCHFUNCS(strcmp)
26*bca25680SRobert Clausecker	ARCHFUNC(strcmp, scalar)
27*bca25680SRobert Clausecker	ARCHFUNC(strcmp, baseline)
28*bca25680SRobert ClauseckerENDARCHFUNCS(strcmp)
29*bca25680SRobert Clausecker
30*bca25680SRobert ClauseckerARCHENTRY(strcmp, scalar)
	/*
	 * Scalar strcmp.  The first string (s1) is read through RDI and the
	 * second string (s2) through RSI.  On arrival at .Ldone the current
	 * byte of s1 is in AL and that of s2 in DL; both are zero-extended
	 * and RAX receives their difference.
	 *
	 * Bytes are compared one at a time until RDI is 8-byte aligned.  If
	 * RSI is aligned as well at that point, the comparison continues
	 * eight bytes at a time, otherwise it stays bytewise.
	 */
31e5dd4df8SAlan Cox	/*
32e5dd4df8SAlan Cox	 * Align s1 to word boundary.
33e5dd4df8SAlan Cox	 * Consider unrolling loop?
34e5dd4df8SAlan Cox	 */
35e5dd4df8SAlan Cox.Ls1align:
36e5dd4df8SAlan Cox	testb	$7,%dil
37e5dd4df8SAlan Cox	je	.Ls1aligned
38e5dd4df8SAlan Cox	movb	(%rdi),%al
39e5dd4df8SAlan Cox	incq	%rdi
40e5dd4df8SAlan Cox	movb	(%rsi),%dl
41e5dd4df8SAlan Cox	incq	%rsi
42e5dd4df8SAlan Cox	testb	%al,%al	/* end of s1? */
43e5dd4df8SAlan Cox	je	.Ldone
44e5dd4df8SAlan Cox	cmpb	%al,%dl
45e5dd4df8SAlan Cox	je	.Ls1align
46e5dd4df8SAlan Cox	jmp	.Ldone
47e5dd4df8SAlan Cox
48e5dd4df8SAlan Cox	/*
4932223c1bSPedro F. Giffuni	 * Check whether s2 is aligned to a word boundary.  If it is, we
50e5dd4df8SAlan Cox	 * can compare by words.  Otherwise we have to compare by bytes.
51e5dd4df8SAlan Cox	 */
52e5dd4df8SAlan Cox.Ls1aligned:
53e5dd4df8SAlan Cox	testb	$7,%sil
54e5dd4df8SAlan Cox	jne	.Lbyte_loop
55e5dd4df8SAlan Cox
56e5dd4df8SAlan Cox	movabsq	$0x0101010101010101,%r8	/* 0x01 in every byte */
57e5dd4df8SAlan Cox	subq	$8,%rdi	/* bias: the loop loads from 8(%rdi) and 8(%rsi) */
58e5dd4df8SAlan Cox	movabsq	$0x8080808080808080,%r9	/* 0x80 in every byte */
59e5dd4df8SAlan Cox	subq	$8,%rsi
60e5dd4df8SAlan Cox
	/*
	 * Each round loads one word from either string and advances both
	 * pointers to the word just loaded, so that the byte loop below can
	 * re-examine it.  Equal words are tested for a zero byte: with x the
	 * common value, (x - 0x0101...) & ~x & 0x8080... is nonzero exactly
	 * when some byte of x is zero.
	 */
61*bca25680SRobert Clausecker	ALIGN_TEXT
62e5dd4df8SAlan Cox.Lword_loop:
63e5dd4df8SAlan Cox	movq	8(%rdi),%rax
64e5dd4df8SAlan Cox	addq	$8,%rdi
65e5dd4df8SAlan Cox	movq	8(%rsi),%rdx
66e5dd4df8SAlan Cox	addq	$8,%rsi
67e5dd4df8SAlan Cox	cmpq	%rax,%rdx
68e5dd4df8SAlan Cox	jne	.Lbyte_loop	/* words differ: find the byte below */
69e5dd4df8SAlan Cox	subq	%r8,%rdx
70e5dd4df8SAlan Cox	notq	%rax
71e5dd4df8SAlan Cox	andq	%rax,%rdx
72e5dd4df8SAlan Cox	testq	%r9,%rdx
73e5dd4df8SAlan Cox	je	.Lword_loop	/* no zero byte: next word */
74e5dd4df8SAlan Cox
	/* bytewise comparison, also used to finish off the last word */
75*bca25680SRobert Clausecker	ALIGN_TEXT
76e5dd4df8SAlan Cox.Lbyte_loop:
77e5dd4df8SAlan Cox	movb	(%rdi),%al
78e5dd4df8SAlan Cox	incq	%rdi
79e5dd4df8SAlan Cox	movb	(%rsi),%dl
80e5dd4df8SAlan Cox	incq	%rsi
81e5dd4df8SAlan Cox	testb	%al,%al	/* end of s1? */
82e5dd4df8SAlan Cox	je	.Ldone
83e5dd4df8SAlan Cox	cmpb	%al,%dl
84e5dd4df8SAlan Cox	je	.Lbyte_loop
85e5dd4df8SAlan Cox
	/* AL = byte of s1, DL = byte of s2: return their difference */
86e5dd4df8SAlan Cox.Ldone:
87e5dd4df8SAlan Cox	movzbq	%al,%rax
88e5dd4df8SAlan Cox	movzbq	%dl,%rdx
89e5dd4df8SAlan Cox	subq	%rdx,%rax
90e5dd4df8SAlan Cox	ret
91*bca25680SRobert ClauseckerARCHEND(strcmp, scalar)
92*bca25680SRobert Clausecker
93*bca25680SRobert ClauseckerARCHENTRY(strcmp, baseline)
	/*
	 * SSE2 strcmp.  The first string arrives in RDI and the second in
	 * RSI; EAX receives the difference of the first pair of bytes that
	 * differ or terminate the strings, each byte zero-extended.
	 *
	 * The first 16 bytes of each string (the "heads") are compared using
	 * unaligned loads.  After that, both pointers are rounded down to a
	 * multiple of 16.  The string with the larger offset from alignment
	 * is walked in aligned chunks and compared with the matching
	 * (generally unaligned) bytes of the other string, whose aligned
	 * chunks are scanned for NUL bytes.  The code from .Lswapped on is
	 * the same procedure with the roles of RDI and RSI exchanged.
	 */
94*bca25680SRobert Clausecker	/* check if either string crosses a page in the head */
95*bca25680SRobert Clausecker	lea		15(%rdi), %r8d	# end of head
96*bca25680SRobert Clausecker	lea		15(%rsi), %r9d
97*bca25680SRobert Clausecker	mov		%edi, %eax
98*bca25680SRobert Clausecker	mov		%esi, %edx
99*bca25680SRobert Clausecker	xor		%edi, %r8d	# bits that changed between first and last byte
100*bca25680SRobert Clausecker	xor		%esi, %r9d
101*bca25680SRobert Clausecker	and		$~0xf, %rdi	# align heads to 16 bytes
102*bca25680SRobert Clausecker	and		$~0xf, %rsi
103*bca25680SRobert Clausecker	or		%r8d, %r9d	# in either RSI or RDI
104*bca25680SRobert Clausecker	and		$0xf, %eax	# offset from alignment
105*bca25680SRobert Clausecker	and		$0xf, %edx
106*bca25680SRobert Clausecker	pxor		%xmm1, %xmm1
107*bca25680SRobert Clausecker	test		$PAGE_SIZE, %r9d # did the page change?
108*bca25680SRobert Clausecker	jz		0f		# if not, take fast path
109*bca25680SRobert Clausecker
110*bca25680SRobert Clausecker	/* heads may cross page boundary, avoid unmapped loads */
	/*
	 * The aligned chunks holding the starts of the strings are loaded
	 * and copied to the 32 bytes at -40(%rsp), below the stack pointer
	 * (RSP itself is not adjusted).  If a string has a NUL byte within
	 * its aligned chunk at or after its first byte, its head is read
	 * from the copy instead of from the string, so the 16-byte unaligned
	 * load does not reach past that chunk of the string.  The movdqa
	 * stores need both slots 16-byte aligned, which holds if RSP is 8
	 * modulo 16 on entry.
	 */
111*bca25680SRobert Clausecker	movdqa		(%rdi), %xmm0	# load aligned heads
112*bca25680SRobert Clausecker	movdqa		(%rsi), %xmm2
113*bca25680SRobert Clausecker	mov		$-1, %r8d
114*bca25680SRobert Clausecker	mov		$-1, %r9d
115*bca25680SRobert Clausecker	mov		%eax, %ecx
116*bca25680SRobert Clausecker	shl		%cl, %r8d	# mask of bytes at or after the string head in XMM0
117*bca25680SRobert Clausecker	mov		%edx, %ecx
118*bca25680SRobert Clausecker	shl		%cl, %r9d	# mask of bytes at or after the string head in XMM2
119*bca25680SRobert Clausecker	movdqa		%xmm0, -40(%rsp) # stash copies of the heads on the stack
120*bca25680SRobert Clausecker	movdqa		%xmm2, -24(%rsp)
121*bca25680SRobert Clausecker	pcmpeqb		%xmm1, %xmm0
122*bca25680SRobert Clausecker	pcmpeqb		%xmm1, %xmm2
123*bca25680SRobert Clausecker	pmovmskb	%xmm0, %r10d
124*bca25680SRobert Clausecker	pmovmskb	%xmm2, %r11d
125*bca25680SRobert Clausecker	test		%r8d, %r10d	# NUL byte present in first string?
126*bca25680SRobert Clausecker	lea		-40(%rsp), %r8
127*bca25680SRobert Clausecker	cmovz		%rdi, %r8	# if not, read the string itself
128*bca25680SRobert Clausecker	test		%r9d, %r11d	# NUL byte present in second string?
129*bca25680SRobert Clausecker	lea		-24(%rsp), %r9
130*bca25680SRobert Clausecker	cmovz		%rsi, %r9	# if not, read the string itself
131*bca25680SRobert Clausecker	movdqu		(%r8, %rax, 1), %xmm0 # load true (or fake) heads
132*bca25680SRobert Clausecker	movdqu		(%r9, %rdx, 1), %xmm4
133*bca25680SRobert Clausecker	jmp		1f
134*bca25680SRobert Clausecker
135*bca25680SRobert Clausecker0:	movdqu		(%rdi, %rax, 1), %xmm0 # load true heads
136*bca25680SRobert Clausecker	movdqu		(%rsi, %rdx, 1), %xmm4
137*bca25680SRobert Clausecker1:	pxor		%xmm2, %xmm2
138*bca25680SRobert Clausecker	pcmpeqb		%xmm0, %xmm2	# NUL byte present?
139*bca25680SRobert Clausecker	pcmpeqb		%xmm0, %xmm4	# which bytes match?
140*bca25680SRobert Clausecker	pandn		%xmm4, %xmm2	# match and not NUL byte?
141*bca25680SRobert Clausecker	pmovmskb	%xmm2, %r9d
142*bca25680SRobert Clausecker	xor		$0xffff, %r9d	# mismatch or NUL byte?
143*bca25680SRobert Clausecker	jnz		.Lhead_mismatch
144*bca25680SRobert Clausecker
145*bca25680SRobert Clausecker	/* load head and second chunk */
146*bca25680SRobert Clausecker	movdqa		16(%rdi), %xmm2	# load second chunks
147*bca25680SRobert Clausecker	movdqa		16(%rsi), %xmm3
148*bca25680SRobert Clausecker	sub		%rdx, %rax	# is a&0xf >= b&0xf?
149*bca25680SRobert Clausecker	jb		.Lswapped	# if not, proceed with swapped operands
150*bca25680SRobert Clausecker
	/*
	 * From here on RAX holds the (nonnegative) difference between the
	 * two offsets from alignment, RSI holds the distance from the
	 * aligned RDI to the aligned second string, and RDX holds the
	 * distance from RDI to the bytes of the second string that
	 * correspond to the chunk at RDI.
	 */
151*bca25680SRobert Clausecker	neg		%rax
152*bca25680SRobert Clausecker	movdqu		16(%rsi, %rax, 1), %xmm0
153*bca25680SRobert Clausecker	sub		%rdi, %rsi	# express RSI as distance from RDI
154*bca25680SRobert Clausecker	lea		(%rsi, %rax, 1), %rdx # point RDX to offset in second string
155*bca25680SRobert Clausecker	neg		%rax
156*bca25680SRobert Clausecker	pcmpeqb		%xmm3, %xmm1	# ... corresponding to RDI
157*bca25680SRobert Clausecker	pcmpeqb		%xmm2, %xmm0
158*bca25680SRobert Clausecker	pmovmskb	%xmm1, %r8d
159*bca25680SRobert Clausecker	pmovmskb	%xmm0, %r9d
160*bca25680SRobert Clausecker	add		$16, %rdi	# RDI now points to the second chunk
161*bca25680SRobert Clausecker	test		%r8d, %r8d
162*bca25680SRobert Clausecker	jnz		.Lnul_found
163*bca25680SRobert Clausecker	xor		$0xffff, %r9d
164*bca25680SRobert Clausecker	jnz		.Lmismatch
165*bca25680SRobert Clausecker	add		$16, %rdi	# advance aligned pointers
166*bca25680SRobert Clausecker
167*bca25680SRobert Clausecker	/*
168*bca25680SRobert Clausecker	 * During the main loop, the layout of the two strings is something like:
169*bca25680SRobert Clausecker	 *
170*bca25680SRobert Clausecker	 *          v ------1------ v ------2------ v
171*bca25680SRobert Clausecker	 *     RDI:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
172*bca25680SRobert Clausecker	 *     RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
173*bca25680SRobert Clausecker	 *
174*bca25680SRobert Clausecker	 * where v indicates the alignment boundaries and corresponding chunks
175*bca25680SRobert Clausecker	 * of the strings have the same letters.  Chunk A has been checked in
176*bca25680SRobert Clausecker	 * the previous iteration.  This iteration, we first check that string
177*bca25680SRobert Clausecker	 * RSI doesn't end within region 2, then we compare chunk B between the
178*bca25680SRobert Clausecker	 * two strings.  As RSI is known not to hold a NUL byte in regions 1
179*bca25680SRobert Clausecker	 * and 2 at this point, this also ensures that RDI has not ended yet.
180*bca25680SRobert Clausecker	 */
181*bca25680SRobert Clausecker	ALIGN_TEXT
182*bca25680SRobert Clausecker0:	movdqu		(%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
183*bca25680SRobert Clausecker	pxor		%xmm1, %xmm1
184*bca25680SRobert Clausecker	pcmpeqb		(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
185*bca25680SRobert Clausecker	pcmpeqb		(%rdi), %xmm0	# where do the chunks match?
186*bca25680SRobert Clausecker	pmovmskb	%xmm1, %r8d
187*bca25680SRobert Clausecker	pmovmskb	%xmm0, %r9d
188*bca25680SRobert Clausecker	test		%r8d, %r8d
189*bca25680SRobert Clausecker	jnz		.Lnul_found
190*bca25680SRobert Clausecker	xor		$0xffff, %r9d	# any mismatches?
191*bca25680SRobert Clausecker	jnz		.Lmismatch
192*bca25680SRobert Clausecker
193*bca25680SRobert Clausecker	/* main loop unrolled twice */
194*bca25680SRobert Clausecker	movdqu		16(%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
195*bca25680SRobert Clausecker	pxor		%xmm1, %xmm1
196*bca25680SRobert Clausecker	pcmpeqb		16(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
197*bca25680SRobert Clausecker	pcmpeqb		16(%rdi), %xmm0	# where do the chunks match?
198*bca25680SRobert Clausecker	pmovmskb	%xmm1, %r8d
199*bca25680SRobert Clausecker	pmovmskb	%xmm0, %r9d
200*bca25680SRobert Clausecker	add		$32, %rdi
201*bca25680SRobert Clausecker	test		%r8d, %r8d
202*bca25680SRobert Clausecker	jnz		.Lnul_found2
203*bca25680SRobert Clausecker	xor		$0xffff, %r9d	# any mismatches?
204*bca25680SRobert Clausecker	jz		0b
205*bca25680SRobert Clausecker
206*bca25680SRobert Clausecker	sub		$16, %rdi	# roll back second increment
207*bca25680SRobert Clausecker
208*bca25680SRobert Clausecker	/* a mismatch has been found between (RDI) and (RDI+RDX); R9D is the mismatch mask */
209*bca25680SRobert Clausecker.Lmismatch:
210*bca25680SRobert Clausecker	tzcnt		%r9d, %r9d	# where is the mismatch?
211*bca25680SRobert Clausecker	add		%rdi, %rdx	# turn RDX from offset to pointer
212*bca25680SRobert Clausecker	movzbl		(%rdx, %r9, 1), %ecx
213*bca25680SRobert Clausecker	movzbl		(%rdi, %r9, 1), %eax
214*bca25680SRobert Clausecker	sub		%ecx, %eax	# difference of the mismatching chars
215*bca25680SRobert Clausecker	ret
216*bca25680SRobert Clausecker
217*bca25680SRobert Clausecker	/* mismatch in true heads */
218*bca25680SRobert Clausecker.Lhead_mismatch:
219*bca25680SRobert Clausecker	tzcnt		%r9d, %r9d	# where is the mismatch?
220*bca25680SRobert Clausecker	add		%rax, %rdi	# return to true heads
221*bca25680SRobert Clausecker	add		%rdx, %rsi
222*bca25680SRobert Clausecker	movzbl		(%rdi, %r9, 1), %eax # mismatching characters
223*bca25680SRobert Clausecker	movzbl		(%rsi, %r9, 1), %ecx
224*bca25680SRobert Clausecker	sub		%ecx, %eax
225*bca25680SRobert Clausecker	ret
226*bca25680SRobert Clausecker
227*bca25680SRobert Clausecker.Lnul_found2:
228*bca25680SRobert Clausecker	sub		$16, %rdi	# roll back second increment
229*bca25680SRobert Clausecker
230*bca25680SRobert Clausecker	/*
	 * A NUL has been found in the aligned chunk of the second string at
	 * (RDI+RSI).  On entry EAX holds the alignment difference, R8D the
	 * mask of NUL bytes in that chunk and R9D the mask of matching bytes
	 * between (RDI) and (RDI+RDX).
	 */
231*bca25680SRobert Clausecker.Lnul_found:
232*bca25680SRobert Clausecker	mov		%eax, %ecx
233*bca25680SRobert Clausecker	mov		%r8d, %r10d	# keep the unshifted NUL mask for the tail
234*bca25680SRobert Clausecker	shl		%cl, %r8w	# adjust NUL mask to positions in RDI/RDX
235*bca25680SRobert Clausecker	xor		$0xffff, %r9d	# mask of mismatches
236*bca25680SRobert Clausecker	or		%r8d, %r9d	# NUL bytes also count as mismatches
237*bca25680SRobert Clausecker	jnz		.Lmismatch
238*bca25680SRobert Clausecker
239*bca25680SRobert Clausecker	/*
240*bca25680SRobert Clausecker	 * The chunk at (RDI) matches and the NUL lies past it.
241*bca25680SRobert Clausecker	 * Compare the chunk of the second string holding the NUL with
242*bca25680SRobert Clausecker	 * the corresponding part of the first string up to the NUL byte.
243*bca25680SRobert Clausecker	 */
244*bca25680SRobert Clausecker	movdqu		(%rdi, %rax, 1), %xmm0
245*bca25680SRobert Clausecker	pcmpeqb		(%rdi, %rsi, 1), %xmm0
246*bca25680SRobert Clausecker	add		%rdi, %rsi	# restore RSI pointer
247*bca25680SRobert Clausecker	add		%rax, %rdi	# point RDI to chunk corresponding to (RSI)
248*bca25680SRobert Clausecker	pmovmskb	%xmm0, %ecx	# mask of matches
249*bca25680SRobert Clausecker	not		%ecx		# mask of mismatches
250*bca25680SRobert Clausecker	or		%r10d, %ecx	# mask of mismatches or NUL bytes
251*bca25680SRobert Clausecker	tzcnt		%ecx, %ecx	# location of first mismatch
252*bca25680SRobert Clausecker	movzbl		(%rdi, %rcx, 1), %eax
253*bca25680SRobert Clausecker	movzbl		(%rsi, %rcx, 1), %ecx
254*bca25680SRobert Clausecker	sub		%ecx, %eax
255*bca25680SRobert Clausecker	ret
256*bca25680SRobert Clausecker
257*bca25680SRobert Clausecker	/*
258*bca25680SRobert Clausecker	 * If (a&0xf) < (b&0xf), we do the same thing but with swapped
259*bca25680SRobert Clausecker	 * operands.  I found that this performs slightly better than
260*bca25680SRobert Clausecker	 * using conditional moves to do the swap branchless.
	 *
	 * On entry RAX holds the (negative) difference of the offsets.
	 * Below, RSI is the aligned pointer that gets advanced, RDI holds
	 * the distance from RSI to the aligned first string, and RDX the
	 * distance from RSI to the bytes of the first string corresponding
	 * to the chunk at RSI.
261*bca25680SRobert Clausecker	 */
262*bca25680SRobert Clausecker.Lswapped:
263*bca25680SRobert Clausecker	movdqu		16(%rdi, %rax, 1), %xmm0
264*bca25680SRobert Clausecker	sub		%rsi, %rdi	# express RDI as distance from RSI
265*bca25680SRobert Clausecker	lea		(%rdi, %rax, 1), %rdx # point RDX to offset in RDI corresponding to RSI
266*bca25680SRobert Clausecker	neg		%rax		# make difference positive
267*bca25680SRobert Clausecker	pcmpeqb		%xmm2, %xmm1
268*bca25680SRobert Clausecker	pcmpeqb		%xmm3, %xmm0
269*bca25680SRobert Clausecker	pmovmskb	%xmm1, %r8d
270*bca25680SRobert Clausecker	pmovmskb	%xmm0, %r9d
271*bca25680SRobert Clausecker	add		$16, %rsi	# RSI now points to the second chunk
272*bca25680SRobert Clausecker	test		%r8d, %r8d
273*bca25680SRobert Clausecker	jnz		.Lnul_founds
274*bca25680SRobert Clausecker	xor		$0xffff, %r9d
275*bca25680SRobert Clausecker	jnz		.Lmismatchs
276*bca25680SRobert Clausecker	add		$16, %rsi	# advance aligned pointers
277*bca25680SRobert Clausecker
278*bca25680SRobert Clausecker	/*
279*bca25680SRobert Clausecker	 * During the main loop, the layout of the two strings is something like:
280*bca25680SRobert Clausecker	 *
281*bca25680SRobert Clausecker	 *          v ------1------ v ------2------ v
282*bca25680SRobert Clausecker	 *     RSI:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
283*bca25680SRobert Clausecker	 *     RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
284*bca25680SRobert Clausecker	 *
285*bca25680SRobert Clausecker	 * where v indicates the alignment boundaries and corresponding chunks
286*bca25680SRobert Clausecker	 * of the strings have the same letters.  Chunk A has been checked in
287*bca25680SRobert Clausecker	 * the previous iteration.  This iteration, we first check that string
288*bca25680SRobert Clausecker	 * RDI doesn't end within region 2, then we compare chunk B between the
289*bca25680SRobert Clausecker	 * two strings.  As RDI is known not to hold a NUL byte in regions 1
290*bca25680SRobert Clausecker	 * and 2 at this point, this also ensures that RSI has not ended yet.
291*bca25680SRobert Clausecker	 */
292*bca25680SRobert Clausecker	ALIGN_TEXT
293*bca25680SRobert Clausecker0:	movdqu		(%rsi, %rdx, 1), %xmm0 # chunk of 1st string corresponding to RSI?
294*bca25680SRobert Clausecker	pxor		%xmm1, %xmm1
295*bca25680SRobert Clausecker	pcmpeqb		(%rsi, %rdi, 1), %xmm1 # end of string in RDI?
296*bca25680SRobert Clausecker	pcmpeqb		(%rsi), %xmm0	# where do the chunks match?
297*bca25680SRobert Clausecker	pmovmskb	%xmm1, %r8d
298*bca25680SRobert Clausecker	pmovmskb	%xmm0, %r9d
299*bca25680SRobert Clausecker	test		%r8d, %r8d
300*bca25680SRobert Clausecker	jnz		.Lnul_founds
301*bca25680SRobert Clausecker	xor		$0xffff, %r9d	# any mismatches?
302*bca25680SRobert Clausecker	jnz		.Lmismatchs
303*bca25680SRobert Clausecker
304*bca25680SRobert Clausecker	/* main loop unrolled twice */
305*bca25680SRobert Clausecker	movdqu		16(%rsi, %rdx, 1), %xmm0 # chunk of 1st string corresponding to RSI?
306*bca25680SRobert Clausecker	pxor		%xmm1, %xmm1
307*bca25680SRobert Clausecker	pcmpeqb		16(%rsi, %rdi, 1), %xmm1 # end of string in RDI?
308*bca25680SRobert Clausecker	pcmpeqb		16(%rsi), %xmm0	# where do the chunks match?
309*bca25680SRobert Clausecker	pmovmskb	%xmm1, %r8d
310*bca25680SRobert Clausecker	pmovmskb	%xmm0, %r9d
311*bca25680SRobert Clausecker	add		$32, %rsi
312*bca25680SRobert Clausecker	test		%r8d, %r8d
313*bca25680SRobert Clausecker	jnz		.Lnul_found2s
314*bca25680SRobert Clausecker	xor		$0xffff, %r9d	# any mismatches?
315*bca25680SRobert Clausecker	jz		0b
316*bca25680SRobert Clausecker
317*bca25680SRobert Clausecker	sub		$16, %rsi	# roll back second increment
318*bca25680SRobert Clausecker
319*bca25680SRobert Clausecker	/* a mismatch has been found between (RSI+RDX) and (RSI); R9D is the mismatch mask */
320*bca25680SRobert Clausecker.Lmismatchs:
321*bca25680SRobert Clausecker	tzcnt		%r9d, %r9d	# where is the mismatch?
322*bca25680SRobert Clausecker	add		%rsi, %rdx	# turn RDX from offset to pointer
323*bca25680SRobert Clausecker	movzbl		(%rdx, %r9, 1), %eax
324*bca25680SRobert Clausecker	movzbl		(%rsi, %r9, 1), %ecx
325*bca25680SRobert Clausecker	sub		%ecx, %eax	# difference of the mismatching chars
326*bca25680SRobert Clausecker	ret
327*bca25680SRobert Clausecker
328*bca25680SRobert Clausecker.Lnul_found2s:
329*bca25680SRobert Clausecker	sub		$16, %rsi	# roll back second increment
330*bca25680SRobert Clausecker
331*bca25680SRobert Clausecker	/*
	 * A NUL has been found in the aligned chunk of the first string at
	 * (RSI+RDI).  On entry EAX holds the alignment difference, R8D the
	 * mask of NUL bytes in that chunk and R9D the mask of matching bytes
	 * between (RSI) and (RSI+RDX).
	 */
332*bca25680SRobert Clausecker.Lnul_founds:
333*bca25680SRobert Clausecker	mov		%eax, %ecx
334*bca25680SRobert Clausecker	mov		%r8d, %r10d	# keep the unshifted NUL mask for the tail
335*bca25680SRobert Clausecker	shl		%cl, %r8w	# adjust NUL mask to positions in RSI/RDX
336*bca25680SRobert Clausecker	xor		$0xffff, %r9d	# mask of mismatches
337*bca25680SRobert Clausecker	or		%r8d, %r9d	# NUL bytes also count as mismatches
338*bca25680SRobert Clausecker	jnz		.Lmismatchs
339*bca25680SRobert Clausecker
340*bca25680SRobert Clausecker	/*
341*bca25680SRobert Clausecker	 * The chunk at (RSI) matches and the NUL lies past it.
342*bca25680SRobert Clausecker	 * Compare the chunk of the first string holding the NUL with
343*bca25680SRobert Clausecker	 * the corresponding part of the second string up to the NUL byte.
344*bca25680SRobert Clausecker	 */
345*bca25680SRobert Clausecker	movdqu		(%rsi, %rax, 1), %xmm0
346*bca25680SRobert Clausecker	pcmpeqb		(%rsi, %rdi, 1), %xmm0
347*bca25680SRobert Clausecker	add		%rsi, %rdi	# restore RDI pointer
348*bca25680SRobert Clausecker	add		%rax, %rsi	# point RSI to chunk corresponding to (RDI)
349*bca25680SRobert Clausecker	pmovmskb	%xmm0, %ecx	# mask of matches
350*bca25680SRobert Clausecker	not		%ecx		# mask of mismatches
351*bca25680SRobert Clausecker	or		%r10d, %ecx	# mask of mismatches or NUL bytes
352*bca25680SRobert Clausecker	tzcnt		%ecx, %ecx	# location of first mismatch
353*bca25680SRobert Clausecker	movzbl		(%rdi, %rcx, 1), %eax
354*bca25680SRobert Clausecker	movzbl		(%rsi, %rcx, 1), %ecx
355*bca25680SRobert Clausecker	sub		%ecx, %eax
356*bca25680SRobert Clausecker	ret
357*bca25680SRobert ClauseckerARCHEND(strcmp, baseline)
35893ab7586SKonstantin Belousov
35993ab7586SKonstantin Belousov	.section .note.GNU-stack,"",%progbits
360