xref: /freebsd/lib/libc/amd64/string/memchr.S (revision b2618b651b28fd29e62a4e285f5be09ea30a85d4)
1de12a689SRobert Clausecker/*-
2de12a689SRobert Clausecker * Copyright (c) 2023 The FreeBSD Foundation
3de12a689SRobert Clausecker *
4de12a689SRobert Clausecker * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
5de12a689SRobert Clausecker * under sponsorship from the FreeBSD Foundation.
6de12a689SRobert Clausecker *
7de12a689SRobert Clausecker * Redistribution and use in source and binary forms, with or without
8de12a689SRobert Clausecker * modification, are permitted provided that the following conditions
9de12a689SRobert Clausecker * are met:
10de12a689SRobert Clausecker * 1. Redistributions of source code must retain the above copyright
11de12a689SRobert Clausecker *    notice, this list of conditions and the following disclaimer.
12de12a689SRobert Clausecker * 2. Redistributions in binary form must reproduce the above copyright
13de12a689SRobert Clausecker *    notice, this list of conditions and the following disclaimer in the
14de12a689SRobert Clausecker *    documentation and/or other materials provided with the distribution.
15de12a689SRobert Clausecker *
16de12a689SRobert Clausecker * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
17de12a689SRobert Clausecker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18de12a689SRobert Clausecker * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19de12a689SRobert Clausecker * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20de12a689SRobert Clausecker * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21de12a689SRobert Clausecker * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22de12a689SRobert Clausecker * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23de12a689SRobert Clausecker * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24de12a689SRobert Clausecker * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25de12a689SRobert Clausecker * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26de12a689SRobert Clausecker * SUCH DAMAGE
27de12a689SRobert Clausecker */
28de12a689SRobert Clausecker
29de12a689SRobert Clausecker#include <machine/asm.h>
30de12a689SRobert Clausecker
31de12a689SRobert Clausecker#include "amd64_archlevel.h"
32de12a689SRobert Clausecker
33de12a689SRobert Clausecker#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
34de12a689SRobert Clausecker
35de12a689SRobert Clausecker        .weak memchr
36de12a689SRobert Clausecker        .set memchr, __memchr
37de12a689SRobert ClauseckerARCHFUNCS(__memchr)
38de12a689SRobert Clausecker	ARCHFUNC(__memchr, scalar)
39de12a689SRobert Clausecker	ARCHFUNC(__memchr, baseline)
40de12a689SRobert ClauseckerENDARCHFUNCS(__memchr)
41de12a689SRobert Clausecker
42de12a689SRobert ClauseckerARCHENTRY(__memchr, scalar)
43de12a689SRobert Clausecker	test	%rdx, %rdx	# empty input?
44de12a689SRobert Clausecker	je	.Lnomatch
45de12a689SRobert Clausecker
46de12a689SRobert Clausecker	lea	(, %rdi, 8), %ecx
47*b2618b65SRobert Clausecker	mov	$-1, %rax
48*b2618b65SRobert Clausecker	add	%rdi, %rdx	# pointer to end of buffer or to end of
49*b2618b65SRobert Clausecker	cmovc	%rax, %rdx	# address space (whichever comes first)
50de12a689SRobert Clausecker	and	$~7, %rdi	# align to 8 bytes
51de12a689SRobert Clausecker	mov	(%rdi), %rax	# load first word
52de12a689SRobert Clausecker	movzbl	%sil, %esi	# clear stray high bits
53de12a689SRobert Clausecker	movabs	$0x0101010101010101, %r8
54de12a689SRobert Clausecker	imul	%r8, %rsi	# replicate char 8 times
55de12a689SRobert Clausecker
56de12a689SRobert Clausecker	/* compute head and tail masks */
57de12a689SRobert Clausecker	mov	%r8, %r10
58de12a689SRobert Clausecker	movabs	$0x8080808080808080, %r9
59de12a689SRobert Clausecker	shl	%cl, %r10	# 0x01 where string head is
60de12a689SRobert Clausecker	lea	(, %rdx, 8), %ecx
61de12a689SRobert Clausecker	xor	%r8, %r10	# 0x01 where it is not
62de12a689SRobert Clausecker	neg	%r8		# negate 01..01 so we can use lea
63de12a689SRobert Clausecker	mov	%r9, %r11
64de12a689SRobert Clausecker	xor	%rsi, %rax	# str ^ c (0x00 where str[i] == c)
65de12a689SRobert Clausecker	neg	%ecx
66de12a689SRobert Clausecker	or	%r10, %rax	# except before the string
67de12a689SRobert Clausecker	shr	%cl, %r11	# 0x80 where string tail is
68de12a689SRobert Clausecker
69de12a689SRobert Clausecker	add	$8, %rdi	# advance to next 8 bytes
70de12a689SRobert Clausecker	cmp	%rdx, %rdi	# end of buffer reached during head?
71de12a689SRobert Clausecker	jae	.Ltail		# and go to tail-processing code
72de12a689SRobert Clausecker
73de12a689SRobert Clausecker	/* main loop, unrolled twice */
74de12a689SRobert Clausecker	ALIGN_TEXT
75de12a689SRobert Clausecker0:	lea	(%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01
76de12a689SRobert Clausecker	not	%rax		# ~(str ^ c)
77de12a689SRobert Clausecker	and	%r9, %rax	# ((str^c) - 0x01..01) & ~(str^c)
78de12a689SRobert Clausecker	and	%rcx, %rax	# not including junk bytes
79de12a689SRobert Clausecker	jnz	.Lmatch
80de12a689SRobert Clausecker
81de12a689SRobert Clausecker	mov	(%rdi), %rax
82de12a689SRobert Clausecker	add	$8, %rdi
83de12a689SRobert Clausecker	xor	%rsi, %rax	# str ^ c
84de12a689SRobert Clausecker	cmp	%rdx, %rdi
85de12a689SRobert Clausecker	jae	.Ltail
86de12a689SRobert Clausecker
87de12a689SRobert Clausecker	lea	(%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01
88de12a689SRobert Clausecker	not	%rax		# ~(str ^ c)
89de12a689SRobert Clausecker	and	%r9, %rax	# ((str^c) - 0x01..01) & ~(str^c)
90de12a689SRobert Clausecker	and	%rcx, %rax	# not including junk bytes
91de12a689SRobert Clausecker	jnz	.Lmatch
92de12a689SRobert Clausecker
93de12a689SRobert Clausecker	mov	(%rdi), %rax
94de12a689SRobert Clausecker	add	$8, %rdi
95de12a689SRobert Clausecker	xor	%rsi, %rax	# str ^ c
96de12a689SRobert Clausecker	cmp	%rdx, %rdi
97de12a689SRobert Clausecker	jb	0b
98de12a689SRobert Clausecker
99de12a689SRobert Clausecker.Ltail:	lea	(%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01
100de12a689SRobert Clausecker	not	%rax		# ~(str ^ c)
101de12a689SRobert Clausecker	and	%r11, %rax	# ((str^c) - 0x01..01) & ~(str^c)
102de12a689SRobert Clausecker	and	%rcx, %rax	# not including junk bytes or bytes past buffer
103de12a689SRobert Clausecker	jz	.Lnomatch
104de12a689SRobert Clausecker
105de12a689SRobert Clausecker.Lmatch:
106de12a689SRobert Clausecker	tzcnt	%rax, %rax	# first match
107de12a689SRobert Clausecker	shr	$3, %eax	# scale from bit to byte index
108de12a689SRobert Clausecker	lea	-8(%rdi, %rax), %rax # pointer to found c
109de12a689SRobert Clausecker	ret
110de12a689SRobert Clausecker
111de12a689SRobert Clausecker	/* no match found */
112de12a689SRobert Clausecker.Lnomatch:
113de12a689SRobert Clausecker	xor	%eax, %eax	# return null pointer
114de12a689SRobert Clausecker	ret
115de12a689SRobert ClauseckerARCHEND(__memchr, scalar)
116de12a689SRobert Clausecker
117de12a689SRobert ClauseckerARCHENTRY(__memchr, baseline)
118de12a689SRobert Clausecker	test		%rdx, %rdx		# empty input?
119de12a689SRobert Clausecker	je		.Lnomatchb
120de12a689SRobert Clausecker
121de12a689SRobert Clausecker	movd		%esi, %xmm2
122de12a689SRobert Clausecker	mov		%edi, %ecx
123*b2618b65SRobert Clausecker	mov		$-1, %r9
124*b2618b65SRobert Clausecker	add		%rdi, %rdx		# pointer to end of buffer or to end of
125*b2618b65SRobert Clausecker	cmovc		%r9, %rdx		# address space (whichever comes first)
126de12a689SRobert Clausecker	and		$~0x1f, %rdi		# align to 32 bytes
127de12a689SRobert Clausecker	movdqa		(%rdi), %xmm0		# load first 32 bytes
128de12a689SRobert Clausecker	movdqa		16(%rdi), %xmm1
129de12a689SRobert Clausecker
130de12a689SRobert Clausecker	punpcklbw	%xmm2, %xmm2		# c -> cc
131de12a689SRobert Clausecker
132de12a689SRobert Clausecker	shl		%cl, %r9d		# mask with zeroes before the string
133de12a689SRobert Clausecker
134de12a689SRobert Clausecker	punpcklwd	%xmm2, %xmm2		# cc -> cccc
135de12a689SRobert Clausecker
136de12a689SRobert Clausecker	mov		$-1, %r8d
137de12a689SRobert Clausecker	xor		%ecx, %ecx
138de12a689SRobert Clausecker	sub		%edx, %ecx		# edx = -ecx
139de12a689SRobert Clausecker	shr		%cl, %r8d		# bytes in tail that are part of the buffer
140de12a689SRobert Clausecker
141de12a689SRobert Clausecker	pshufd		$0, %xmm2, %xmm2	# cccc -> cccccccccccccccc
142de12a689SRobert Clausecker
143de12a689SRobert Clausecker	add		$32, %rdi		# advance to next 32 bytes
144de12a689SRobert Clausecker	mov		$-1, %eax
145de12a689SRobert Clausecker	cmp		%rdx, %rdi		# end of buffer reached during head?
146de12a689SRobert Clausecker	cmovae		%r8d, %eax		# if yes, do combined head/tail processing
147de12a689SRobert Clausecker	and		%r9d, %eax		# mask of bytes in head part of string
148de12a689SRobert Clausecker
149de12a689SRobert Clausecker	/* process head */
150de12a689SRobert Clausecker	pcmpeqb		%xmm2, %xmm1
151de12a689SRobert Clausecker	pcmpeqb		%xmm2, %xmm0
152de12a689SRobert Clausecker	pmovmskb	%xmm1, %esi
153de12a689SRobert Clausecker	pmovmskb	%xmm0, %ecx
154de12a689SRobert Clausecker	shl		$16, %esi
155de12a689SRobert Clausecker	or		%esi, %ecx		# locations of matches
156de12a689SRobert Clausecker	and		%ecx, %eax		# any match inside buffer?
157de12a689SRobert Clausecker	jnz		.Lprecisematchb
158de12a689SRobert Clausecker
159de12a689SRobert Clausecker	cmp		%rdx, %rdi		# did the buffer end here?
160de12a689SRobert Clausecker	jae		.Lnomatchb		# if yes we are done
161de12a689SRobert Clausecker
162de12a689SRobert Clausecker	/* main loop */
163de12a689SRobert Clausecker	ALIGN_TEXT
164de12a689SRobert Clausecker0:	movdqa		(%rdi), %xmm0		# load next string chunk
165de12a689SRobert Clausecker	movdqa		16(%rdi), %xmm1
166de12a689SRobert Clausecker	add		$32, %rdi
167de12a689SRobert Clausecker	cmp		%rdx, %rdi		# ready for main loop?
168de12a689SRobert Clausecker	jae		.Ltailb
169de12a689SRobert Clausecker
170de12a689SRobert Clausecker	pcmpeqb		%xmm2, %xmm0
171de12a689SRobert Clausecker	pcmpeqb		%xmm2, %xmm1
172de12a689SRobert Clausecker	por		%xmm1, %xmm0		# match in either half?
173de12a689SRobert Clausecker	pmovmskb	%xmm0, %eax
174de12a689SRobert Clausecker	test		%eax, %eax
175de12a689SRobert Clausecker	jz		0b
176de12a689SRobert Clausecker
177de12a689SRobert Clausecker.Lmatchb:
178de12a689SRobert Clausecker	pcmpeqb		-32(%rdi), %xmm2	# redo comparison of first 16 bytes
179de12a689SRobert Clausecker	pmovmskb	%xmm1, %ecx
180de12a689SRobert Clausecker	pmovmskb	%xmm2, %eax
181de12a689SRobert Clausecker	shl		$16, %ecx
182de12a689SRobert Clausecker	or		%ecx, %eax		# location of matches
183de12a689SRobert Clausecker
184de12a689SRobert Clausecker.Lprecisematchb:
185de12a689SRobert Clausecker	tzcnt		%eax, %eax		# find location of match
186de12a689SRobert Clausecker	lea		-32(%rdi, %rax, 1), %rax # point to matching byte
187de12a689SRobert Clausecker	ret
188de12a689SRobert Clausecker
189de12a689SRobert Clausecker.Ltailb:
190de12a689SRobert Clausecker	pcmpeqb		%xmm2, %xmm1
191de12a689SRobert Clausecker	pcmpeqb		%xmm2, %xmm0
192de12a689SRobert Clausecker	pmovmskb	%xmm1, %edx
193de12a689SRobert Clausecker	pmovmskb	%xmm0, %eax
194de12a689SRobert Clausecker	shl		$16, %edx
195de12a689SRobert Clausecker	or		%edx, %eax		# location of matches
196de12a689SRobert Clausecker	and		%r8d, %eax		# mask out matches beyond buffer
197de12a689SRobert Clausecker	bsf		%eax, %edx		# location of match
198de12a689SRobert Clausecker	lea		-32(%rdi, %rdx, 1), %rdx # pointer to match (if any)
199de12a689SRobert Clausecker	cmovnz		%rdx, %rax		# point to match if present,
200de12a689SRobert Clausecker	ret					# else null pointer
201de12a689SRobert Clausecker
202de12a689SRobert Clausecker.Lnomatchb:
203de12a689SRobert Clausecker	xor	%eax, %eax	# return null pointer
204de12a689SRobert Clausecker	ret
205de12a689SRobert ClauseckerARCHEND(__memchr, baseline)
206de12a689SRobert Clausecker
207de12a689SRobert Clausecker	.section .note.GNU-stack,"",%progbits
208