/* xref: /freebsd/lib/libc/amd64/string/strchrnul.S (revision 3d8ef251aa9dceabd57f7821a0e6749d35317db3) */
/*-
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

/* Pad with nop (0x90) bytes up to the next 16-byte boundary. */
#define ALIGN_TEXT	.p2align 4,0x90

3561f4c4d3SRobert Clausecker	.weak	strchrnul
3661f4c4d3SRobert Clausecker	.set	strchrnul, __strchrnul
3761f4c4d3SRobert Clausecker
3861f4c4d3SRobert ClauseckerARCHFUNCS(__strchrnul)
3961f4c4d3SRobert Clausecker	ARCHFUNC(__strchrnul, scalar)
4061f4c4d3SRobert Clausecker	ARCHFUNC(__strchrnul, baseline)
4161f4c4d3SRobert ClauseckerENDARCHFUNCS(__strchrnul)
4261f4c4d3SRobert Clausecker
/*
 * strchrnul(str, c), scalar variant
 *
 * This is implemented like strlen(str), but we check for the
 * presence of both NUL and c in each iteration.  The string is
 * read one aligned 8-byte word at a time.
 *
 * In:       %rdi = str, %sil = c (bits above the low byte are cleared)
 * Out:      %rax = pointer to the first byte that is c or NUL
 * Clobbers: %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags
 *
 * Register roles once set up:
 *   %rsi = c replicated into all 8 bytes
 *   %r8  = -0x0101010101010101, so that lea computes x - 0x01..01
 *   %r9  = 0x8080808080808080, selects the per-byte match bits
 *   %r10 = head mask, 0x01 in every byte preceding the string
 *   %rdi = address of the next word to load
 */
ARCHENTRY(__strchrnul, scalar)
	mov	%edi, %ecx		# keep low address bits for the head mask
	and	$~7, %rdi		# align to 8 byte
	movzbl	%sil, %esi		# clear stray high bits
	movabs	$0x0101010101010101, %r8
	mov	(%rdi), %rax		# load first word
	imul	%r8, %rsi		# replicate char 8 times

	/*
	 * Unaligned input: align to 8 bytes.  Then proceed the same
	 * way as with aligned input, but prevent matches before the
	 * beginning of the string.  This is achieved by oring 0x01
	 * into each byte of the buffer before the string.  Such a
	 * byte is nonzero both in str and in str ^ c (the mask is
	 * ored in after the xor), so neither test can fire on it.
	 */
	shl	$3, %ecx		# byte offset -> bit offset
	mov	%r8, %r10
	add	$8, %rdi		# %rdi now points past the head word
	shl	%cl, %r10		# 0x01 where the string is
	xor	%r8, %r10		# 0x01 where it is not
	neg	%r8			# negate 01..01 so we can use lea
	movabs	$0x8080808080808080, %r9

	/* check for match in head */
	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	or	%r10, %rax		# str without NUL bytes before it
	or	%r10, %rcx		# (str ^ c) without matches before it
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	lea	(%rcx, %r8, 1), %r11	# (str ^ c) - 0x01..01
	not	%rax			# ~str
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r11, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	or	%rcx, %rax		# matches for both
	and	%r9, %rax		# not including junk bytes
	jnz	1f			# %rdi is 8 past the head word

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	mov	(%rdi), %rax		# str
	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	lea	(%rcx, %r8, 1), %r11	# (str ^ c) - 0x01..01
	not	%rax			# ~str
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r11, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	or	%rcx, %rax		# matches for both
	and	%r9, %rax		# not including junk bits
	jnz	2f			# %rdi still points at this word

	mov	8(%rdi), %rax		# str
	add	$16, %rdi		# %rdi is now 8 past the word just loaded
	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	lea	(%rcx, %r8, 1), %r11	# (str ^ c) - 0x01..01
	not	%rax			# ~str
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r11, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	or	%rcx, %rax		# matches for both
	and	%r9, %rax		# not including junk bits
	jz	0b

	/*
	 * NUL or c found.  %rax is nonzero on every path that gets here.
	 * NOTE(review): on CPUs without BMI1 the tzcnt encoding is
	 * understood to execute as bsf, which should give the same
	 * result for a nonzero source -- confirm against the ISA manual.
	 */
1:	sub	$8, %rdi		# undo advance past buffer
2:	tzcnt	%rax, %rax		# first NUL or c byte match
	shr	$3, %eax		# scale from bit to byte index
	add	%rdi, %rax		# pointer to found c or NUL
	ret
ARCHEND(__strchrnul, scalar)

/*
 * strchrnul(str, c), baseline variant
 *
 * Scans the string in aligned 16-byte chunks, comparing every byte
 * against both NUL and c with XMM byte compares.
 *
 * In:       %rdi = str, %esi = c (only the low byte ends up being used)
 * Out:      %rax = pointer to the first byte that is c or NUL
 * Clobbers: %rax, %rcx, %rdx, %rdi, %xmm0-%xmm2, flags
 *
 * Register roles once set up:
 *   %xmm0 = c replicated into all 16 bytes
 *   %xmm1 = current chunk, then the combined per-byte match mask
 *   %xmm2 = zero, then the result of the NUL compare
 *   %edx  = head mask, bit i set if byte i of the first chunk is
 *           part of the string
 *   %rdi  = address of the next chunk to load
 */
ARCHENTRY(__strchrnul, baseline)
	mov		%edi, %ecx
	and		$~0xf, %rdi		# align to 16 byte
	movdqa		(%rdi), %xmm1		# load first chunk
	movd		%esi, %xmm0
	and		$0xf, %ecx		# distance from (%rdi) to start of string
	pxor		%xmm2, %xmm2		# all zero for the NUL compare
	mov		$-1, %edx
	punpcklbw	%xmm0, %xmm0		# c -> cc
	shl		%cl, %edx		# bits corresponding to bytes in the string
	punpcklwd	%xmm0, %xmm0		# cc -> cccc
	add		$16, %rdi		# %rdi now points past the head chunk

	/* check for match in head */
	pcmpeqb		%xmm1, %xmm2		# NUL bytes present?
	pshufd		$0, %xmm0, %xmm0	# cccc -> cccccccccccccccc
	pcmpeqb		%xmm0, %xmm1		# c present?
	por		%xmm2, %xmm1		# either present?
	pmovmskb	%xmm1, %eax		# one bit per byte
	and		%edx, %eax		# match in the string?
	jnz		1f			# %rdi is 16 past the head chunk

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	movdqa		(%rdi), %xmm1
	pxor		%xmm2, %xmm2		# previous compare overwrote the zero
	pcmpeqb		%xmm1, %xmm2		# NUL bytes present?
	pcmpeqb		%xmm0, %xmm1		# c present?
	por		%xmm2, %xmm1		# either present?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax		# match in the string?
	jnz		2f			# %rdi still points at this chunk

	movdqa		16(%rdi), %xmm1
	add		$32, %rdi		# %rdi is now 16 past the chunk just loaded
	pxor		%xmm2, %xmm2
	pcmpeqb		%xmm1, %xmm2		# NUL bytes present?
	pcmpeqb		%xmm0, %xmm1		# c present?
	por		%xmm2, %xmm1		# either present?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax		# match in the string?
	jz		0b

	/*
	 * NUL or c found.  %eax is nonzero on every path that gets here.
	 * NOTE(review): on CPUs without BMI1 the tzcnt encoding is
	 * understood to execute as bsf, which should give the same
	 * result for a nonzero source -- confirm against the ISA manual.
	 */
1:	sub		$16, %rdi		# undo advance past buffer
2:	tzcnt		%eax, %eax		# where is the match?
	add		%rdi, %rax		# pointer to found c or NUL
	ret
ARCHEND(__strchrnul, baseline)

17061f4c4d3SRobert Clausecker	.section .note.GNU-stack,"",%progbits
171