xref: /freebsd/lib/libc/amd64/string/strncmp.S (revision 14289e973f5c941e4502cc2b11265e4b3072839a)
1*14289e97SRobert Clausecker/*-
2*14289e97SRobert Clausecker * Copyright (c) 2023 The FreeBSD Foundation
3*14289e97SRobert Clausecker *
4*14289e97SRobert Clausecker * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
5*14289e97SRobert Clausecker * under sponsorship from the FreeBSD Foundation.
6*14289e97SRobert Clausecker *
7*14289e97SRobert Clausecker * Redistribution and use in source and binary forms, with or without
8*14289e97SRobert Clausecker * modification, are permitted provided that the following conditions
9*14289e97SRobert Clausecker * are met:
10*14289e97SRobert Clausecker * 1. Redistributions of source code must retain the above copyright
11*14289e97SRobert Clausecker *    notice, this list of conditions and the following disclaimer.
12*14289e97SRobert Clausecker * 2. Redistributions in binary form must reproduce the above copyright
13*14289e97SRobert Clausecker *    notice, this list of conditions and the following disclaimer in the
14*14289e97SRobert Clausecker *    documentation and/or other materials provided with the distribution.
15*14289e97SRobert Clausecker *
16*14289e97SRobert Clausecker * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
17*14289e97SRobert Clausecker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18*14289e97SRobert Clausecker * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19*14289e97SRobert Clausecker * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20*14289e97SRobert Clausecker * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21*14289e97SRobert Clausecker * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22*14289e97SRobert Clausecker * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23*14289e97SRobert Clausecker * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24*14289e97SRobert Clausecker * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25*14289e97SRobert Clausecker * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26*14289e97SRobert Clausecker * SUCH DAMAGE
27*14289e97SRobert Clausecker */
28*14289e97SRobert Clausecker
29*14289e97SRobert Clausecker#include <machine/asm.h>
30*14289e97SRobert Clausecker#include <machine/param.h>
31*14289e97SRobert Clausecker
32*14289e97SRobert Clausecker#include "amd64_archlevel.h"
33*14289e97SRobert Clausecker
/* Align the next instruction to 16 bytes (2^4), using 0x90 as the fill byte. */
34*14289e97SRobert Clausecker#define ALIGN_TEXT	.p2align 4, 0x90
35*14289e97SRobert Clausecker
/*
 * List of the strncmp variants implemented in this file: "scalar" and
 * "baseline".  ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS are provided by
 * amd64_archlevel.h; presumably they emit the table from which one
 * variant is selected at run time -- confirm against that header.
 */
36*14289e97SRobert ClauseckerARCHFUNCS(strncmp)
37*14289e97SRobert Clausecker	ARCHFUNC(strncmp, scalar)
38*14289e97SRobert Clausecker	ARCHFUNC(strncmp, baseline)
39*14289e97SRobert ClauseckerENDARCHFUNCS(strncmp)
40*14289e97SRobert Clausecker
/*
 * Scalar variant: a byte-at-a-time comparison, unrolled four times.
 *
 * In:      RDI, RSI = the two strings, RDX = maximum number of bytes
 *          to compare.
 * Out:     EAX = byte of the first string minus byte of the second
 *          string at the first position where the first string holds a
 *          NUL or the strings differ; 0 if the limit is reached first.
 * Scratch: ECX, flags.
 */

	/*
	 * Load byte \off of the first string into ECX and leave for
	 * \target if it is NUL or differs from byte \off of the second
	 * string.
	 */
	.macro	CHKBYTE off, target
	movzbl	\off(%rdi), %ecx
	test	%ecx, %ecx
	jz	\target
	cmpb	\off(%rsi), %cl
	jnz	\target
	.endm

ARCHENTRY(strncmp, scalar)
	xor	%eax, %eax		# result so far, later index of deciding byte
	sub	$4, %rdx		# more than 4 bytes to compare?
	jbe	.Lshort			# no: handle the last 0..4 bytes

	/* more than 4 bytes remain: check 4 bytes without limit tests */
	ALIGN_TEXT
.Lquad:
	CHKBYTE	0, .Lat0
	CHKBYTE	1, .Lat1
	CHKBYTE	2, .Lat2
	CHKBYTE	3, .Lat3

	add	$4, %rsi		# step both strings to the next group
	add	$4, %rdi
	sub	$4, %rdx
	ja	.Lquad			# still more than 4 bytes to go

	/*
	 * At most 4 bytes remain.  EDX holds (bytes remaining) - 4, a
	 * value from -4 to 0, so each test below asks whether the limit
	 * is reached before the next byte.
	 */
.Lshort:
	cmp	$-4, %edx		# nothing left to compare?
	jz	.Ldone
	CHKBYTE	0, .Lat0

	cmp	$-3, %edx		# limit reached after 1 byte?
	jz	.Ldone
	CHKBYTE	1, .Lat1

	cmp	$-2, %edx		# limit reached after 2 bytes?
	jz	.Ldone
	CHKBYTE	2, .Lat2

	cmp	$-1, %edx		# limit reached after 3 bytes?
	jz	.Ldone
	/* otherwise the fourth byte decides: fall through with index 3 */

	/* entry .LatN turns EAX (still 0) into index N of the deciding byte */
.Lat3:	inc	%eax
.Lat2:	inc	%eax
.Lat1:	inc	%eax
.Lat0:	movzbl	(%rsi, %rax), %ecx	# byte of the second string
	movzbl	(%rdi, %rax), %eax	# byte of the first string
	sub	%ecx, %eax
.Ldone:	ret				# EAX is still 0 on the limit-reached paths
ARCHEND(strncmp, scalar)
115*14289e97SRobert Clausecker
/*
 * strncmp variant working on 16-byte chunks with SSE2 instructions
 * (movdqa/movdqu, pcmpeqb, pmovmskb).
 *
 * In:   RDI = first string, RSI = second string, RDX = byte limit.
 * Out:  EAX = byte of the first string minus byte of the second string
 *       at the first position that differs, holds a NUL, or is the last
 *       byte permitted by the limit; 0 if the limit is 0.
 * Uses: RAX, RBX (saved on entry, restored before every ret), RCX, RDX,
 *       RSI, RDI, R8-R11, XMM0-XMM4, and the 32 bytes below RSP as
 *       scratch storage.
 *
 * Outline:
 *   1. Compare the (unaligned) first 16 bytes of both strings.  When a
 *      head might cross a page boundary, the aligned chunk containing
 *      its start is inspected first so that no load runs past a NUL
 *      byte (or, for short buffers, past the limit) into the next page.
 *   2. Continue in 16-byte steps.  .Lnormal handles
 *      (RDI & 0xf) >= (RSI & 0xf): NUL bytes and the limit are looked
 *      for in the aligned chunks of the second string, and the aligned
 *      chunks of the first string are compared with the corresponding
 *      unaligned bytes of the second string.  .Lswapped is the mirror
 *      image for the opposite case.
 */
116*14289e97SRobert ClauseckerARCHENTRY(strncmp, baseline)
117*14289e97SRobert Clausecker	push		%rbx
118*14289e97SRobert Clausecker	sub		$1, %rdx	# RDX--, so RDX points to the last byte to compare
119*14289e97SRobert Clausecker	jb		.Lempty		# were there any bytes to compare at all?
120*14289e97SRobert Clausecker
	/*
	 * R9D collects the address bits that differ between the first and
	 * the 16th byte of either head; its PAGE_SIZE bit is tested below.
	 * RDI/RSI are rounded down to a 16-byte boundary, EAX/EBX keep the
	 * offsets of the true string starts within those aligned chunks.
	 */
121*14289e97SRobert Clausecker	lea		15(%rdi), %r8d	# end of head
122*14289e97SRobert Clausecker	lea		15(%rsi), %r9d
123*14289e97SRobert Clausecker	mov		%edi, %eax
124*14289e97SRobert Clausecker	mov		%esi, %ebx
125*14289e97SRobert Clausecker	xor		%edi, %r8d	# bits that changed between first and last byte
126*14289e97SRobert Clausecker	xor		%esi, %r9d
127*14289e97SRobert Clausecker	and		$~0xf, %rdi	# align heads to 16 bytes
128*14289e97SRobert Clausecker	and		$~0xf, %rsi
129*14289e97SRobert Clausecker	or		%r8d, %r9d
130*14289e97SRobert Clausecker	and		$0xf, %eax	# offset from alignment
131*14289e97SRobert Clausecker	and		$0xf, %ebx
132*14289e97SRobert Clausecker	movdqa		(%rdi), %xmm0	# load aligned heads
133*14289e97SRobert Clausecker	movdqa		(%rsi), %xmm2
134*14289e97SRobert Clausecker	pxor		%xmm1, %xmm1
135*14289e97SRobert Clausecker	cmp		$16, %rdx	# last byte to compare within the first 16 bytes?
136*14289e97SRobert Clausecker	jb		.Llt16
137*14289e97SRobert Clausecker
138*14289e97SRobert Clausecker	test		$PAGE_SIZE, %r9d # did the page change?
139*14289e97SRobert Clausecker	jz		0f		# if not, take fast path
140*14289e97SRobert Clausecker
141*14289e97SRobert Clausecker
142*14289e97SRobert Clausecker	/* heads may cross page boundary, avoid unmapped loads */
	/*
	 * R8D/R9D become masks of the chunk positions at or after the start
	 * of each string, R10D/R11D masks of the NUL bytes in the aligned
	 * chunks.  If a string has a NUL within its aligned chunk, its head
	 * is read from the copy stashed below RSP instead of from memory;
	 * otherwise the unaligned load from the true address is used.
	 */
143*14289e97SRobert Clausecker	movdqa		%xmm0, -32(%rsp) # stash copies of the heads on the stack
144*14289e97SRobert Clausecker	movdqa		%xmm2, -16(%rsp)
145*14289e97SRobert Clausecker	mov		$-1, %r8d
146*14289e97SRobert Clausecker	mov		$-1, %r9d
147*14289e97SRobert Clausecker	mov		%eax, %ecx
148*14289e97SRobert Clausecker	shl		%cl, %r8d	# string head in XMM0
149*14289e97SRobert Clausecker	mov		%ebx, %ecx
150*14289e97SRobert Clausecker	shl		%cl, %r9d	# string head in XMM2
151*14289e97SRobert Clausecker	pcmpeqb		%xmm1, %xmm0
152*14289e97SRobert Clausecker	pcmpeqb		%xmm1, %xmm2
153*14289e97SRobert Clausecker	pmovmskb	%xmm0, %r10d
154*14289e97SRobert Clausecker	pmovmskb	%xmm2, %r11d
155*14289e97SRobert Clausecker	test		%r8d, %r10d	# NUL byte present in first string?
	/* lea leaves the flags alone, so cmovz still sees the test result */
156*14289e97SRobert Clausecker	lea		-32(%rsp), %r8
157*14289e97SRobert Clausecker	cmovz		%rdi, %r8
158*14289e97SRobert Clausecker	test		%r9d, %r11d	# NUL byte present in second string?
159*14289e97SRobert Clausecker	lea		-16(%rsp), %r9
160*14289e97SRobert Clausecker	cmovz		%rsi, %r9
161*14289e97SRobert Clausecker	movdqu		(%r8, %rax, 1), %xmm0 # load true (or fake) heads
162*14289e97SRobert Clausecker	movdqu		(%r9, %rbx, 1), %xmm4
163*14289e97SRobert Clausecker	jmp		1f
164*14289e97SRobert Clausecker
165*14289e97SRobert Clausecker	/* the length argument was 0 */
166*14289e97SRobert Clausecker.Lempty:
167*14289e97SRobert Clausecker	xor		%eax, %eax	# zero-length buffers compare equal
168*14289e97SRobert Clausecker	pop		%rbx
169*14289e97SRobert Clausecker	ret
170*14289e97SRobert Clausecker
171*14289e97SRobert Clausecker0:	movdqu		(%rdi, %rax, 1), %xmm0 # load true heads
172*14289e97SRobert Clausecker	movdqu		(%rsi, %rbx, 1), %xmm4
	/* R9D = mask of head positions that differ or hold a NUL byte */
173*14289e97SRobert Clausecker1:	pxor		%xmm2, %xmm2
174*14289e97SRobert Clausecker	pcmpeqb		%xmm0, %xmm2	# NUL byte present?
175*14289e97SRobert Clausecker	pcmpeqb		%xmm0, %xmm4	# which bytes match?
176*14289e97SRobert Clausecker	pandn		%xmm4, %xmm2	# match and not NUL byte?
177*14289e97SRobert Clausecker	pmovmskb	%xmm2, %r9d
178*14289e97SRobert Clausecker	xor		$0xffff, %r9d	# mismatch or NUL byte?
179*14289e97SRobert Clausecker	jnz		.Lhead_mismatch
180*14289e97SRobert Clausecker
181*14289e97SRobert Clausecker	/* heads are equal and free of NUL bytes: load the second aligned chunks */
182*14289e97SRobert Clausecker	movdqa		16(%rdi), %xmm2	# load second chunks
183*14289e97SRobert Clausecker	movdqa		16(%rsi), %xmm3
	/* RDX becomes the position of the last byte relative to 16(%rsi) */
184*14289e97SRobert Clausecker	lea		-16(%rdx, %rbx, 1), %rdx # account for length of RSI chunk
185*14289e97SRobert Clausecker	sub		%rbx, %rax	# is a&0xf >= b&0xf?
186*14289e97SRobert Clausecker	jb		.Lswapped	# if not, proceed with swapped operands
187*14289e97SRobert Clausecker	jmp		.Lnormal
188*14289e97SRobert Clausecker
189*14289e97SRobert Clausecker	/* buffer ends within the first 16 bytes */
190*14289e97SRobert Clausecker.Llt16:	test		$PAGE_SIZE, %r9d # did the page change?
191*14289e97SRobert Clausecker	jz		0f		# if not, take fast path
192*14289e97SRobert Clausecker
193*14289e97SRobert Clausecker	/* heads may cross page boundary */
	/*
	 * Same scheme as in the long case above, except that the position
	 * of the last buffer byte is also entered into the NUL masks and
	 * only the low 16 mask bits (the aligned chunk itself) are tested.
	 */
194*14289e97SRobert Clausecker	movdqa		%xmm0, -32(%rsp) # stash copies of the heads on the stack
195*14289e97SRobert Clausecker	movdqa		%xmm2, -16(%rsp)
196*14289e97SRobert Clausecker	mov		$-1, %r8d
197*14289e97SRobert Clausecker	mov		$-1, %r9d
198*14289e97SRobert Clausecker	mov		%eax, %ecx
199*14289e97SRobert Clausecker	shl		%cl, %r8d	# string head in XMM0
200*14289e97SRobert Clausecker	mov		%ebx, %ecx
201*14289e97SRobert Clausecker	shl		%cl, %r9d	# string head in XMM2
202*14289e97SRobert Clausecker	pcmpeqb		%xmm1, %xmm0
203*14289e97SRobert Clausecker	pcmpeqb		%xmm1, %xmm2
204*14289e97SRobert Clausecker	pmovmskb	%xmm0, %r10d
205*14289e97SRobert Clausecker	pmovmskb	%xmm2, %r11d
206*14289e97SRobert Clausecker	lea		(%rdx, %rax, 1), %ecx # location of last buffer byte in xmm0
207*14289e97SRobert Clausecker	bts		%ecx, %r10d	# treat as if NUL byte present
208*14289e97SRobert Clausecker	lea		(%rdx, %rbx, 1), %ecx
209*14289e97SRobert Clausecker	bts		%ecx, %r11d
210*14289e97SRobert Clausecker	test		%r8w, %r10w	# NUL byte present in first string head?
211*14289e97SRobert Clausecker	lea		-32(%rsp), %r8
212*14289e97SRobert Clausecker	cmovz		%rdi, %r8
213*14289e97SRobert Clausecker	test		%r9w, %r11w	# NUL byte present in second string head?
214*14289e97SRobert Clausecker	lea		-16(%rsp), %r9
215*14289e97SRobert Clausecker	cmovz		%rsi, %r9
216*14289e97SRobert Clausecker	movdqu		(%r8, %rax, 1), %xmm0 # load true (or fake) heads
217*14289e97SRobert Clausecker	movdqu		(%r9, %rbx, 1), %xmm4
218*14289e97SRobert Clausecker	jmp		1f
219*14289e97SRobert Clausecker
220*14289e97SRobert Clausecker0:	movdqu		(%rdi, %rax, 1), %xmm0 # load true heads
221*14289e97SRobert Clausecker	movdqu		(%rsi, %rbx, 1), %xmm4
222*14289e97SRobert Clausecker1:	pxor		%xmm2, %xmm2
223*14289e97SRobert Clausecker	pcmpeqb		%xmm0, %xmm2	# NUL byte present?
224*14289e97SRobert Clausecker	pcmpeqb		%xmm0, %xmm4	# which bytes match?
225*14289e97SRobert Clausecker	pandn		%xmm4, %xmm2	# match and not NUL byte?
226*14289e97SRobert Clausecker	pmovmskb	%xmm2, %r9d
227*14289e97SRobert Clausecker	btr		%edx, %r9d	# induce mismatch in last byte of buffer
228*14289e97SRobert Clausecker	not		%r9d		# mismatch or NUL byte?
	/*
	 * Fall through: bit EDX of R9D is set now, so the search below
	 * stops at the last byte of the buffer at the latest.
	 */
229*14289e97SRobert Clausecker
230*14289e97SRobert Clausecker	/* mismatch in true heads */
231*14289e97SRobert Clausecker	ALIGN_TEXT
232*14289e97SRobert Clausecker.Lhead_mismatch:
233*14289e97SRobert Clausecker	tzcnt		%r9d, %r9d	# where is the mismatch?
234*14289e97SRobert Clausecker	add		%rax, %rdi	# return to true heads
235*14289e97SRobert Clausecker	add		%rbx, %rsi
236*14289e97SRobert Clausecker	movzbl		(%rdi, %r9, 1), %eax # mismatching characters
237*14289e97SRobert Clausecker	movzbl		(%rsi, %r9, 1), %ecx
238*14289e97SRobert Clausecker	sub		%ecx, %eax
239*14289e97SRobert Clausecker	pop		%rbx
240*14289e97SRobert Clausecker	ret
241*14289e97SRobert Clausecker
242*14289e97SRobert Clausecker	/* rax >= 0 */
243*14289e97SRobert Clausecker	ALIGN_TEXT
244*14289e97SRobert Clausecker.Lnormal:
	/*
	 * Register roles established here (a, b = offsets of the original
	 * RDI, RSI within their aligned chunks):
	 *   RDI  aligned chunk of the first string
	 *   RSI  distance such that (RDI,RSI,1) is the aligned chunk of the
	 *        second string
	 *   RBX  distance such that (RDI,RBX,1) holds the bytes of the
	 *        second string corresponding to (RDI)
	 *   RAX  a - b
	 *   RDX  position of the last byte to compare, counted relative to
	 *        the chunks of the second string
	 * XMM2/XMM3 still hold the second aligned chunks of both strings.
	 */
245*14289e97SRobert Clausecker	neg		%rax
246*14289e97SRobert Clausecker	movdqu		16(%rsi, %rax, 1), %xmm0
247*14289e97SRobert Clausecker	sub		%rdi, %rsi	# express RSI as distance from RDI
248*14289e97SRobert Clausecker	lea		(%rsi, %rax, 1), %rbx # point RBX to offset in second string
249*14289e97SRobert Clausecker	neg		%rax		# ... corresponding to RDI
250*14289e97SRobert Clausecker	pcmpeqb		%xmm3, %xmm1	# NUL present?
251*14289e97SRobert Clausecker	pcmpeqb		%xmm2, %xmm0	# Mismatch between chunks?
252*14289e97SRobert Clausecker	pmovmskb	%xmm1, %r8d
253*14289e97SRobert Clausecker	pmovmskb	%xmm0, %r9d
254*14289e97SRobert Clausecker	mov		$16, %ecx
255*14289e97SRobert Clausecker	cmp		%rcx, %rdx	# does the buffer end within (RDI,RSI,1)?
256*14289e97SRobert Clausecker	cmovb		%edx, %ecx	# ECX = min(16, RDX)
257*14289e97SRobert Clausecker	add		$32, %rdi	# advance to next iteration
	/* a bit index of 16 lands outside R8W and is ignored by the test */
258*14289e97SRobert Clausecker	bts		%ecx, %r8d	# mark end-of-buffer as if there was a NUL byte
259*14289e97SRobert Clausecker	test		%r8w, %r8w	# NUL or end of buffer found?
260*14289e97SRobert Clausecker	jnz		.Lnul_found2
261*14289e97SRobert Clausecker	xor		$0xffff, %r9d
262*14289e97SRobert Clausecker	jnz		.Lmismatch2
263*14289e97SRobert Clausecker	sub		$48, %rdx	# end of buffer within first main loop iteration?
264*14289e97SRobert Clausecker	jb		.Ltail		# if yes, process tail
265*14289e97SRobert Clausecker
266*14289e97SRobert Clausecker	/*
267*14289e97SRobert Clausecker	 * During the main loop, the layout of the two strings is something like:
268*14289e97SRobert Clausecker	 *
269*14289e97SRobert Clausecker	 *          v ------1------ v ------2------ v
270*14289e97SRobert Clausecker	 *     RDI:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
271*14289e97SRobert Clausecker	 *     RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
272*14289e97SRobert Clausecker	 *
273*14289e97SRobert Clausecker	 * where v indicates the alignment boundaries and corresponding chunks
274*14289e97SRobert Clausecker	 * of the strings have the same letters.  Chunk A has been checked in
275*14289e97SRobert Clausecker	 * the previous iteration.  This iteration, we first check that string
276*14289e97SRobert Clausecker	 * RSI doesn't end within region 2, then we compare chunk B between the
277*14289e97SRobert Clausecker	 * two strings.  As RSI is known not to hold a NUL byte in regions 1
278*14289e97SRobert Clausecker	 * and 2 at this point, this also ensures that RDI has not ended yet.
279*14289e97SRobert Clausecker	 */
280*14289e97SRobert Clausecker	ALIGN_TEXT
281*14289e97SRobert Clausecker0:	movdqu		(%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI
282*14289e97SRobert Clausecker	pxor		%xmm1, %xmm1
283*14289e97SRobert Clausecker	pcmpeqb		(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
284*14289e97SRobert Clausecker	pcmpeqb		(%rdi), %xmm0	# where do the chunks match?
285*14289e97SRobert Clausecker	pmovmskb	%xmm1, %r8d
286*14289e97SRobert Clausecker	pmovmskb	%xmm0, %r9d
287*14289e97SRobert Clausecker	test		%r8d, %r8d
288*14289e97SRobert Clausecker	jnz		.Lnul_found
289*14289e97SRobert Clausecker	xor		$0xffff, %r9d	# any mismatches?
290*14289e97SRobert Clausecker	jnz		.Lmismatch
291*14289e97SRobert Clausecker
292*14289e97SRobert Clausecker	/* main loop unrolled twice */
293*14289e97SRobert Clausecker	movdqu		16(%rdi, %rbx, 1), %xmm0
294*14289e97SRobert Clausecker	pxor		%xmm1, %xmm1
295*14289e97SRobert Clausecker	pcmpeqb		16(%rdi, %rsi, 1), %xmm1
296*14289e97SRobert Clausecker	pcmpeqb		16(%rdi), %xmm0
297*14289e97SRobert Clausecker	pmovmskb	%xmm1, %r8d
298*14289e97SRobert Clausecker	pmovmskb	%xmm0, %r9d
299*14289e97SRobert Clausecker	add		$32, %rdi
300*14289e97SRobert Clausecker	test		%r8d, %r8d
301*14289e97SRobert Clausecker	jnz		.Lnul_found2
302*14289e97SRobert Clausecker	xor		$0xffff, %r9d
303*14289e97SRobert Clausecker	jnz		.Lmismatch2
304*14289e97SRobert Clausecker	sub		$32, %rdx	# end of buffer within next iteration?
305*14289e97SRobert Clausecker	jae		0b
306*14289e97SRobert Clausecker
307*14289e97SRobert Clausecker	/* end of buffer will occur in next 32 bytes */
	/*
	 * RDX is negative here (between -32 and -1).  bts with a register
	 * destination only uses the low five bits of the bit index, and the
	 * low five bits of EDX are the position of the last buffer byte
	 * within these 32 bytes.  A position of 16 or more sets a bit
	 * outside R8W and is handled in the second half.
	 */
308*14289e97SRobert Clausecker.Ltail:	movdqu		(%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI
309*14289e97SRobert Clausecker	pxor		%xmm1, %xmm1
310*14289e97SRobert Clausecker	pcmpeqb		(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
311*14289e97SRobert Clausecker	pcmpeqb		(%rdi), %xmm0	# where do the chunks match?
312*14289e97SRobert Clausecker	pmovmskb	%xmm1, %r8d
313*14289e97SRobert Clausecker	pmovmskb	%xmm0, %r9d
314*14289e97SRobert Clausecker	bts		%edx, %r8d	# indicate NUL byte at last byte in buffer
315*14289e97SRobert Clausecker	test		%r8w, %r8w	# NUL byte in first chunk?
316*14289e97SRobert Clausecker	jnz		.Lnul_found
317*14289e97SRobert Clausecker	xor		$0xffff, %r9d	# any mismatches?
318*14289e97SRobert Clausecker	jnz		.Lmismatch
319*14289e97SRobert Clausecker
320*14289e97SRobert Clausecker	/* main loop unrolled twice */
321*14289e97SRobert Clausecker	movdqu		16(%rdi, %rbx, 1), %xmm0
322*14289e97SRobert Clausecker	pxor		%xmm1, %xmm1
323*14289e97SRobert Clausecker	pcmpeqb		16(%rdi, %rsi, 1), %xmm1
324*14289e97SRobert Clausecker	pcmpeqb		16(%rdi), %xmm0
325*14289e97SRobert Clausecker	pmovmskb	%xmm1, %r8d
326*14289e97SRobert Clausecker	pmovmskb	%xmm0, %r9d
327*14289e97SRobert Clausecker	sub		$16, %edx	# take first half into account
328*14289e97SRobert Clausecker	bts		%edx, %r8d	# indicate NUL byte at last byte in buffer
329*14289e97SRobert Clausecker	add		$32, %rdi
	/* the buffer ends in this chunk, so fall through unconditionally */
330*14289e97SRobert Clausecker
	/* RDI was already advanced by 32: step back to the chunk just examined */
331*14289e97SRobert Clausecker.Lnul_found2:
332*14289e97SRobert Clausecker	sub		$16, %rdi
333*14289e97SRobert Clausecker
	/*
	 * R8D holds NUL/end-of-buffer positions within the chunk at
	 * (RDI,RSI,1), R9D the match mask for (RDI) against (RDI,RBX,1).
	 * The two windows are RAX bytes apart, hence the shift.  An
	 * unshifted copy of the NUL mask is kept in R10D.
	 */
334*14289e97SRobert Clausecker.Lnul_found:
335*14289e97SRobert Clausecker	mov		%eax, %ecx
336*14289e97SRobert Clausecker	mov		%r8d, %r10d
337*14289e97SRobert Clausecker	shl		%cl, %r8d	# adjust NUL mask to positions in RDI/RBX
338*14289e97SRobert Clausecker	not		%r9d		# mask of mismatches
339*14289e97SRobert Clausecker	or		%r8w, %r9w	# NUL bytes also count as mismatches
340*14289e97SRobert Clausecker	jnz		.Lmismatch
341*14289e97SRobert Clausecker
342*14289e97SRobert Clausecker	/*
343*14289e97SRobert Clausecker	 * The 16 bytes at (RDI) match and the NUL (or end of buffer) lies
344*14289e97SRobert Clausecker	 * beyond them.  Compare the chunk at (RDI,RSI,1) with the
345*14289e97SRobert Clausecker	 * corresponding part of the first string up to that position.
346*14289e97SRobert Clausecker	 */
347*14289e97SRobert Clausecker	movdqu		(%rdi, %rax, 1), %xmm0
348*14289e97SRobert Clausecker	pcmpeqb		(%rdi, %rsi, 1), %xmm0
349*14289e97SRobert Clausecker	add		%rdi, %rsi	# restore RSI pointer
350*14289e97SRobert Clausecker	add		%rax, %rdi	# point RDI to chunk corresponding to (RSI)
351*14289e97SRobert Clausecker	pmovmskb	%xmm0, %ecx	# mask of matches
352*14289e97SRobert Clausecker	not		%ecx		# mask of mismatches
353*14289e97SRobert Clausecker	or		%r10d, %ecx	# mask of mismatches or NUL bytes
354*14289e97SRobert Clausecker	tzcnt		%ecx, %ecx	# location of first mismatch
355*14289e97SRobert Clausecker	movzbl		(%rdi, %rcx, 1), %eax
356*14289e97SRobert Clausecker	movzbl		(%rsi, %rcx, 1), %ecx
357*14289e97SRobert Clausecker	sub		%ecx, %eax
358*14289e97SRobert Clausecker	pop		%rbx
359*14289e97SRobert Clausecker	ret
360*14289e97SRobert Clausecker
	/* RDI was already advanced by 32: step back to the chunk just examined */
361*14289e97SRobert Clausecker.Lmismatch2:
362*14289e97SRobert Clausecker	sub		$16, %rdi
363*14289e97SRobert Clausecker
364*14289e97SRobert Clausecker	/* a mismatch has been found between (RDI) and (RDI,RBX,1) */
365*14289e97SRobert Clausecker.Lmismatch:
366*14289e97SRobert Clausecker	tzcnt		%r9d, %r9d	# where is the mismatch?
367*14289e97SRobert Clausecker	add		%rdi, %rbx	# turn RBX from offset into pointer
368*14289e97SRobert Clausecker	movzbl		(%rbx, %r9, 1), %ecx
369*14289e97SRobert Clausecker	movzbl		(%rdi, %r9, 1), %eax
370*14289e97SRobert Clausecker	sub		%ecx, %eax
371*14289e97SRobert Clausecker	pop		%rbx
372*14289e97SRobert Clausecker	ret
373*14289e97SRobert Clausecker
374*14289e97SRobert Clausecker	/* rax < 0 */
375*14289e97SRobert Clausecker	ALIGN_TEXT
376*14289e97SRobert Clausecker.Lswapped:
	/*
	 * Mirror image of .Lnormal with the roles of the strings exchanged:
	 *   RSI  aligned chunk of the second string
	 *   RDI  distance such that (RSI,RDI,1) is the aligned chunk of the
	 *        first string
	 *   RBX  distance such that (RSI,RBX,1) holds the bytes of the
	 *        first string corresponding to (RSI)
	 *   RAX  b - a (after the neg below)
	 *   RDX  position of the last byte to compare, counted relative to
	 *        the chunks of the first string
	 */
377*14289e97SRobert Clausecker	movdqu		16(%rdi, %rax, 1), %xmm0
378*14289e97SRobert Clausecker	sub		%rsi, %rdi	# express RDI as distance from RSI
379*14289e97SRobert Clausecker	lea		(%rdi, %rax, 1), %rbx # point RBX to offset in first string
380*14289e97SRobert Clausecker	pcmpeqb		%xmm2, %xmm1	# NUL present?
381*14289e97SRobert Clausecker	pcmpeqb		%xmm3, %xmm0	# mismatch between chunks?
382*14289e97SRobert Clausecker	pmovmskb	%xmm1, %r8d
383*14289e97SRobert Clausecker	pmovmskb	%xmm0, %r9d
384*14289e97SRobert Clausecker	add		%rax, %rdx	# rebase RDX from the RSI chunk to the RDI chunk
385*14289e97SRobert Clausecker	neg		%rax		# RAX = (b&0xf) - (a&0xf), positive here
386*14289e97SRobert Clausecker	mov		$16, %ecx
387*14289e97SRobert Clausecker	cmp		%rcx, %rdx	# does the buffer end within (RSI,RDI,1)?
388*14289e97SRobert Clausecker	cmovb		%edx, %ecx	# ECX = min(16, RDX)
389*14289e97SRobert Clausecker	add		$32, %rsi
390*14289e97SRobert Clausecker	bts		%ecx, %r8d	# mark end-of-buffer as if there was a NUL byte
391*14289e97SRobert Clausecker	test		%r8w, %r8w	# NUL or end of buffer found?
392*14289e97SRobert Clausecker	jnz		.Lnul_found2s
393*14289e97SRobert Clausecker	xor		$0xffff, %r9d
394*14289e97SRobert Clausecker	jnz		.Lmismatch2s
395*14289e97SRobert Clausecker	sub		$48, %rdx	# end of buffer within first main loop iteration?
396*14289e97SRobert Clausecker	jb		.Ltails		# if yes, process tail
397*14289e97SRobert Clausecker
	/* main loop, same structure as above with RDI and RSI exchanged */
398*14289e97SRobert Clausecker	ALIGN_TEXT
399*14289e97SRobert Clausecker0:	movdqu		(%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI
400*14289e97SRobert Clausecker	pxor		%xmm1, %xmm1
401*14289e97SRobert Clausecker	pcmpeqb		(%rsi, %rdi, 1), %xmm1 # end of string in RDI?
402*14289e97SRobert Clausecker	pcmpeqb		(%rsi), %xmm0	# where do the chunks match?
403*14289e97SRobert Clausecker	pmovmskb	%xmm1, %r8d
404*14289e97SRobert Clausecker	pmovmskb	%xmm0, %r9d
405*14289e97SRobert Clausecker	test		%r8d, %r8d
406*14289e97SRobert Clausecker	jnz		.Lnul_founds
407*14289e97SRobert Clausecker	xor		$0xffff, %r9d	# any mismatches?
408*14289e97SRobert Clausecker	jnz		.Lmismatchs
409*14289e97SRobert Clausecker
410*14289e97SRobert Clausecker	/* main loop unrolled twice */
411*14289e97SRobert Clausecker	movdqu		16(%rsi, %rbx, 1), %xmm0
412*14289e97SRobert Clausecker	pxor		%xmm1, %xmm1
413*14289e97SRobert Clausecker	pcmpeqb		16(%rsi, %rdi, 1), %xmm1
414*14289e97SRobert Clausecker	pcmpeqb		16(%rsi), %xmm0
415*14289e97SRobert Clausecker	pmovmskb	%xmm1, %r8d
416*14289e97SRobert Clausecker	pmovmskb	%xmm0, %r9d
417*14289e97SRobert Clausecker	add		$32, %rsi
418*14289e97SRobert Clausecker	test		%r8d, %r8d
419*14289e97SRobert Clausecker	jnz		.Lnul_found2s
420*14289e97SRobert Clausecker	xor		$0xffff, %r9d
421*14289e97SRobert Clausecker	jnz		.Lmismatch2s
422*14289e97SRobert Clausecker	sub		$32, %rdx	# end of buffer within next iteration?
423*14289e97SRobert Clausecker	jae		0b
424*14289e97SRobert Clausecker
425*14289e97SRobert Clausecker	/* end of buffer will occur in next 32 bytes */
	/* as in .Ltail, only the low five bits of EDX matter to bts */
426*14289e97SRobert Clausecker.Ltails:
427*14289e97SRobert Clausecker	movdqu		(%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI
428*14289e97SRobert Clausecker	pxor		%xmm1, %xmm1
429*14289e97SRobert Clausecker	pcmpeqb		(%rsi, %rdi, 1), %xmm1 # end of string in RDI?
430*14289e97SRobert Clausecker	pcmpeqb		(%rsi), %xmm0	# where do the chunks match?
431*14289e97SRobert Clausecker	pmovmskb	%xmm1, %r8d
432*14289e97SRobert Clausecker	pmovmskb	%xmm0, %r9d
433*14289e97SRobert Clausecker	bts		%edx, %r8d	# indicate NUL byte at last byte in buffer
434*14289e97SRobert Clausecker	test		%r8w, %r8w	# NUL byte in first chunk?
435*14289e97SRobert Clausecker	jnz		.Lnul_founds
436*14289e97SRobert Clausecker	xor		$0xffff, %r9d	# any mismatches?
437*14289e97SRobert Clausecker	jnz		.Lmismatchs
438*14289e97SRobert Clausecker
439*14289e97SRobert Clausecker	/* main loop unrolled twice */
440*14289e97SRobert Clausecker	movdqu		16(%rsi, %rbx, 1), %xmm0
441*14289e97SRobert Clausecker	pxor		%xmm1, %xmm1
442*14289e97SRobert Clausecker	pcmpeqb		16(%rsi, %rdi, 1), %xmm1
443*14289e97SRobert Clausecker	pcmpeqb		16(%rsi), %xmm0
444*14289e97SRobert Clausecker	pmovmskb	%xmm1, %r8d
445*14289e97SRobert Clausecker	pmovmskb	%xmm0, %r9d
446*14289e97SRobert Clausecker	sub		$16, %edx	# take first half into account
447*14289e97SRobert Clausecker	bts		%edx, %r8d	# indicate NUL byte at last byte in buffer
448*14289e97SRobert Clausecker	add		$32, %rsi
449*14289e97SRobert Clausecker
	/* RSI was already advanced by 32: step back to the chunk just examined */
450*14289e97SRobert Clausecker.Lnul_found2s:
451*14289e97SRobert Clausecker	sub		$16, %rsi
452*14289e97SRobert Clausecker
	/*
	 * R8D holds NUL/end-of-buffer positions within the chunk at
	 * (RSI,RDI,1), R9D the match mask for (RSI) against (RSI,RBX,1);
	 * an unshifted copy of the NUL mask is kept in R10D.
	 */
453*14289e97SRobert Clausecker.Lnul_founds:
454*14289e97SRobert Clausecker	mov		%eax, %ecx
455*14289e97SRobert Clausecker	mov		%r8d, %r10d
456*14289e97SRobert Clausecker	shl		%cl, %r8d	# adjust NUL mask to positions in RSI/RBX
457*14289e97SRobert Clausecker	not		%r9d		# mask of mismatches
458*14289e97SRobert Clausecker	or		%r8w, %r9w	# NUL bytes also count as mismatches
459*14289e97SRobert Clausecker	jnz		.Lmismatchs
460*14289e97SRobert Clausecker
	/*
	 * The 16 bytes at (RSI) match and the NUL (or end of buffer) lies
	 * beyond them.  Compare the chunk at (RSI,RDI,1) with the
	 * corresponding part of the second string up to that position.
	 */
461*14289e97SRobert Clausecker	movdqu		(%rsi, %rax, 1), %xmm0
462*14289e97SRobert Clausecker	pcmpeqb		(%rsi, %rdi, 1), %xmm0
463*14289e97SRobert Clausecker	add		%rsi, %rdi	# restore RDI pointer
464*14289e97SRobert Clausecker	add		%rax, %rsi	# point RSI to chunk corresponding to (RDI)
465*14289e97SRobert Clausecker	pmovmskb	%xmm0, %ecx	# mask of matches
466*14289e97SRobert Clausecker	not		%ecx		# mask of mismatches
467*14289e97SRobert Clausecker	or		%r10d, %ecx	# mask of mismatches or NUL bytes
468*14289e97SRobert Clausecker	tzcnt		%ecx, %ecx	# location of first mismatch
469*14289e97SRobert Clausecker	movzbl		(%rdi, %rcx, 1), %eax
470*14289e97SRobert Clausecker	movzbl		(%rsi, %rcx, 1), %ecx
471*14289e97SRobert Clausecker	sub		%ecx, %eax
472*14289e97SRobert Clausecker	pop		%rbx
473*14289e97SRobert Clausecker	ret
474*14289e97SRobert Clausecker
	/* RSI was already advanced by 32: step back to the chunk just examined */
475*14289e97SRobert Clausecker.Lmismatch2s:
476*14289e97SRobert Clausecker	sub		$16, %rsi
477*14289e97SRobert Clausecker
	/* a mismatch has been found between (RSI,RBX,1) and (RSI) */
478*14289e97SRobert Clausecker.Lmismatchs:
479*14289e97SRobert Clausecker	tzcnt		%r9d, %r9d	# where is the mismatch?
480*14289e97SRobert Clausecker	add		%rsi, %rbx	# turn RBX from offset into pointer
481*14289e97SRobert Clausecker	movzbl		(%rbx, %r9, 1), %eax
482*14289e97SRobert Clausecker	movzbl		(%rsi, %r9, 1), %ecx
483*14289e97SRobert Clausecker	sub		%ecx, %eax
484*14289e97SRobert Clausecker	pop		%rbx
485*14289e97SRobert Clausecker	ret
486*14289e97SRobert ClauseckerARCHEND(strncmp, baseline)
487*14289e97SRobert Clausecker
488*14289e97SRobert Clausecker	.section .note.GNU-stack,"",%progbits
489