xref: /freebsd/lib/libc/amd64/string/strcspn.S (revision e88e1272792e41cbf0a5af1f5f0a858afece0475)
1/*
2 * Copyright (c) 2023 The FreeBSD Foundation
3 *
4 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
5 * under sponsorship from the FreeBSD Foundation.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE
27 */
28
29#include <machine/asm.h>
30#include <machine/param.h>
31
32#include "amd64_archlevel.h"
33
34#define ALIGN_TEXT	.p2align 4,0x90 /* 16-byte alignment, nop filled */
35
36ARCHFUNCS(strcspn)
37	ARCHFUNC(strcspn, scalar)
38	NOARCHFUNC
39	ARCHFUNC(strcspn, x86_64_v2)
40ENDARCHFUNCS(strcspn)
41
42ARCHENTRY(strcspn, scalar)
43	push	%rbp			# align stack to enable function call
44	mov	%rsp, %rbp
45	sub	$256, %rsp		# allocate space for lookup table
46
47	/* check for special cases */
48	movzbl	(%rsi), %eax		# first character in the set
49	test	%eax, %eax
50	jz	.Lstrlen
51
52	movzbl	1(%rsi), %edx		# second character in the set
53	test	%edx, %edx
54	jz	.Lstrchr
55
56	/* no special case matches -- prepare lookup table */
57	xor	%r8d, %r8d
58	mov	$28, %ecx
590:	mov	%r8, (%rsp, %rcx, 8)
60	mov	%r8, 8(%rsp, %rcx, 8)
61	mov	%r8, 16(%rsp, %rcx, 8)
62	mov	%r8, 24(%rsp, %rcx, 8)
63	sub	$4, %ecx
64	jnc	0b
65
66	add	$2, %rsi
67	movb	$1, (%rsp, %rax, 1)	# register first chars in set
68	movb	$1, (%rsp, %rdx, 1)
69	mov	%rdi, %rax		# a copy of the source to iterate over
70
71	/* process remaining chars in set */
72	ALIGN_TEXT
730:	movzbl	(%rsi), %ecx
74	movb	$1, (%rsp, %rcx, 1)
75	test	%ecx, %ecx
76	jz	1f
77
78	movzbl	1(%rsi), %ecx
79	movb	$1, (%rsp, %rcx, 1)
80	test	%ecx, %ecx
81	jz	1f
82
83	add	$2, %rsi
84	jmp	0b
85
86	/* find match */
87	ALIGN_TEXT
881:	movzbl	(%rax), %ecx
89	cmpb	$0, (%rsp, %rcx, 1)
90	jne	2f
91
92	movzbl	1(%rax), %ecx
93	cmpb	$0, (%rsp, %rcx, 1)
94	jne	3f
95
96	movzbl	2(%rax), %ecx
97	cmpb	$0, (%rsp, %rcx, 1)
98	jne	4f
99
100	movzbl	3(%rax), %ecx
101	add	$4, %rax
102	cmpb	$0, (%rsp, %rcx, 1)
103	je	1b
104
105	sub	$3, %rax
1064:	dec	%rdi
1073:	inc	%rax
1082:	sub	%rdi, %rax		# number of characters preceding match
109	leave
110	ret
111
112	/* set is empty, degrades to strlen */
113.Lstrlen:
114	leave
115	jmp	CNAME(strlen)
116
117	/* just one character in set, degrades to strchr */
118.Lstrchr:
119	mov	%rdi, (%rsp)		# stash a copy of the string
120	mov	%eax, %esi		# find the character in the set
121	call	CNAME(strchrnul)
122	sub	(%rsp), %rax		# length of prefix before match
123	leave
124	ret
125ARCHEND(strcspn, scalar)
126
127	/*
128	 * This kernel uses pcmpistri to do the heavy lifting.
129	 * We provide five code paths, depending on set size:
130	 *
131	 *      0: call strlen()
132	 *      1: call strchr()
133	 *  2--16: one pcmpistri per 16 bytes of input
134	 * 17--32: two pcmpistri per 16 bytes of input
135	 *   >=33: fall back to look up table
136	 */
137ARCHENTRY(strcspn, x86_64_v2)
138	push		%rbp
139	mov		%rsp, %rbp
140	sub		$256, %rsp
141
142	/* check for special cases */
143	movzbl		(%rsi), %eax
144	test		%eax, %eax		# empty string?
145	jz		.Lstrlenv2
146
147	cmpb		$0, 1(%rsi)		# single character string?
148	jz		.Lstrchrv2
149
150	/* find set size and copy up to 32 bytes to (%rsp) */
151	mov		%esi, %ecx
152	and		$~0xf, %rsi		# align set pointer
153	movdqa		(%rsi), %xmm0
154	pxor		%xmm1, %xmm1
155	and		$0xf, %ecx		# amount of bytes rsi is past alignment
156	xor		%edx, %edx
157	pcmpeqb		%xmm0, %xmm1		# end of string reached?
158	movdqa		%xmm0, 32(%rsp)		# transfer head of set to stack
159	pmovmskb	%xmm1, %eax
160	shr		%cl, %eax		# clear out junk before string
161	test		%eax, %eax		# end of set reached?
162	jnz		0f
163
164	movdqa		16(%rsi), %xmm0		# second chunk of the set
165	mov		$16, %edx
166	sub		%ecx, %edx		# length of set preceding xmm0
167	pxor		%xmm1, %xmm1
168	pcmpeqb		%xmm0, %xmm1
169	movdqa		%xmm0, 48(%rsp)
170	movdqu		32(%rsp, %rcx, 1), %xmm2 # head of set
171	pmovmskb	%xmm1, %eax
172	test		%eax, %eax
173	jnz		1f
174
175	movdqa		32(%rsi), %xmm0		# third chunk
176	add		$16, %edx
177	pxor		%xmm1, %xmm1
178	pcmpeqb		%xmm0, %xmm1
179	movdqa		%xmm0, 64(%rsp)
180	pmovmskb	%xmm1, %eax
181	test		%eax, %eax		# still not done?
182	jz		.Lgt32v2
183
1840:	movdqu		32(%rsp, %rcx, 1), %xmm2 # head of set
1851:	tzcnt		%eax, %eax
186	add		%eax, %edx		# length of set (excluding NUL byte)
187	cmp		$32, %edx		# above 32 bytes?
188	ja		.Lgt32v2
189
190	/*
191	 * At this point we know that we want to use pcmpistri.
192	 * one last problem obtains: the head of the string is not
193	 * aligned and may cross a cacheline.  If this is the case,
194	 * we take the part before the page boundary and repeat the
195	 * last byte to fill up the xmm register.
196	 */
197	mov		%rdi, %rax		# save original string pointer
198	lea		15(%rdi), %esi		# last byte of the head
199	xor		%edi, %esi
200	test		$PAGE_SIZE, %esi	# does the head cross a page?
201	jz		0f
202
203	/* head crosses page: copy to stack to fix up */
204	and		$~0xf, %rax		# align head pointer temporarily
205	movzbl		15(%rax), %esi		# last head byte on the page
206	movdqa		(%rax), %xmm0
207	movabs		$0x0101010101010101, %r8
208	imul		%r8, %rsi		# repeated 8 times
209	movdqa		%xmm0, (%rsp)		# head word on stack
210	mov		%rsi, 16(%rsp)		# followed by filler (last byte x8)
211	mov		%rsi, 24(%rsp)
212	mov		%edi, %eax
213	and		$0xf, %eax		# offset of head from alignment
214	add		%rsp, %rax		# pointer to fake head
215
2160:	movdqu		(%rax), %xmm0		# load head (fake or real)
217	lea		16(%rdi), %rax
218	and		$~0xf, %rax		# second 16 bytes of string (aligned)
2191:	cmp		$16, %edx		# 16--32 bytes?
220	ja		.Lgt16v2
221
222
223	/* set is 2--16 bytes in size */
224
225	/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT */
226	pcmpistri	$0, %xmm0, %xmm2	# match in head?
227	jbe		.Lheadmatchv2
228
229	ALIGN_TEXT
2300:	pcmpistri	$0, (%rax), %xmm2
231	jbe		1f			# match or end of string?
232	pcmpistri	$0, 16(%rax), %xmm2
233	lea		32(%rax), %rax
234	ja		0b			# match or end of string?
235
2363:	lea		-16(%rax), %rax		# go back to second half
2371:	jc		2f			# jump if match found
238	movdqa		(%rax), %xmm0		# reload string piece
239	pxor		%xmm1, %xmm1
240	pcmpeqb		%xmm1, %xmm0		# where is the NUL byte?
241	pmovmskb	%xmm0, %ecx
242	tzcnt		%ecx, %ecx		# location of NUL byte in (%rax)
2432:	sub		%rdi, %rax		# offset of %xmm0 from beginning of string
244	add		%rcx, %rax		# prefix length before match/NUL
245	leave
246	ret
247
248.Lheadmatchv2:
249	jc		2f			# jump if match found
250	pxor		%xmm1, %xmm1
251	pcmpeqb		%xmm1, %xmm0
252	pmovmskb	%xmm0, %ecx
253	tzcnt		%ecx, %ecx		# location of NUL byte
2542:	mov		%ecx, %eax		# prefix length before match/NUL
255	leave
256	ret
257
258	/* match in first set half during head */
259.Lheadmatchv2first:
260	mov		%ecx, %eax
261	pcmpistri	$0, %xmm0, %xmm3	# match in second set half?
262	cmp		%ecx, %eax		# before the first half match?
263	cmova		%ecx, %eax		# use the earlier match
264	leave
265	ret
266
267.Lgt16v2:
268	movdqu		48(%rsp, %rcx, 1), %xmm3 # second part of set
269
270	/* set is 17--32 bytes in size */
271	pcmpistri	$0, %xmm0, %xmm2	# match in first set half?
272	jb		.Lheadmatchv2first
273	pcmpistri	$0, %xmm0, %xmm3	# match in second set half or end of string?
274	jbe		.Lheadmatchv2
275
276	ALIGN_TEXT
2770:	movdqa		(%rax), %xmm0
278	pcmpistri	$0, %xmm0, %xmm2
279	jb		4f			# match in first set half?
280	pcmpistri	$0, %xmm0, %xmm3
281	jbe		1f			# match in second set half or end of string?
282	movdqa		16(%rax), %xmm0
283	add		$32, %rax
284	pcmpistri	$0, %xmm0, %xmm2
285	jb		3f			# match in first set half?
286	pcmpistri	$0, %xmm0, %xmm3
287	ja		0b			# neither match in 2nd half nor string end?
288
289	/* match in second half or NUL */
290	lea		-16(%rax), %rax		# go back to second half
2911:	jc		2f			# jump if match found
292	pxor		%xmm1, %xmm1
293	pcmpeqb		%xmm1, %xmm0		# where is the NUL byte?
294	pmovmskb	%xmm0, %ecx
295	tzcnt		%ecx, %ecx		# location of NUL byte in (%rax)
2962:	sub		%rdi, %rax		# offset of %xmm0 from beginning of string
297	add		%rcx, %rax		# prefix length before match/NUL
298	leave
299	ret
300
301	/* match in first half */
3023:	sub		$16, %rax		# go back to second half
3034:	sub		%rdi, %rax		# offset of %xmm0 from beginning of string
304	mov		%ecx, %edx
305	pcmpistri	$0, %xmm0, %xmm3	# match in second set half?
306	cmp		%ecx, %edx		# before the first half match?
307	cmova		%ecx, %edx		# use the earlier match
308	add		%rdx, %rax		# return full ofset
309	leave
310	ret
311
312	/* set is empty, degrades to strlen */
313.Lstrlenv2:
314	leave
315	jmp	CNAME(strlen)
316
317	/* just one character in set, degrades to strchr */
318.Lstrchrv2:
319	mov	%rdi, (%rsp)		# stash a copy of the string
320	mov	%eax, %esi		# find this character
321	call	CNAME(strchrnul)
322	sub	(%rsp), %rax		# length of prefix before match
323	leave
324	ret
325
326	/* set is >=33 bytes in size */
327.Lgt32v2:
328	xorps	%xmm0, %xmm0
329	mov	$256-64, %edx
330
331	/* clear out look up table */
3320:	movaps	%xmm0, (%rsp, %rdx, 1)
333	movaps	%xmm0, 16(%rsp, %rdx, 1)
334	movaps	%xmm0, 32(%rsp, %rdx, 1)
335	movaps	%xmm0, 48(%rsp, %rdx, 1)
336	sub	$64, %edx
337	jnc	0b
338
339	add	%rcx, %rsi		# restore string pointer
340	mov	%rdi, %rax		# keep a copy of the string
341
342	/* initialise look up table */
343	ALIGN_TEXT
3440:	movzbl	(%rsi), %ecx
345	movb	$1, (%rsp, %rcx, 1)
346	test	%ecx, %ecx
347	jz	1f
348
349	movzbl	1(%rsi), %ecx
350	movb	$1, (%rsp, %rcx, 1)
351	test	%ecx, %ecx
352	jz	1f
353
354	movzbl	2(%rsi), %ecx
355	movb	$1, (%rsp, %rcx, 1)
356	test	%ecx, %ecx
357	jz	1f
358
359	movzbl	3(%rsi), %ecx
360	movb	$1, (%rsp, %rcx, 1)
361	test	%ecx, %ecx
362	jz	1f
363
364	add	$4, %rsi
365	jmp	0b
366
367	/* find match */
368	ALIGN_TEXT
3691:	movzbl	(%rax), %ecx
370	cmpb	$0, (%rsp, %rcx, 1)
371	jne	2f
372
373	movzbl	1(%rax), %ecx
374	cmpb	$0, (%rsp, %rcx, 1)
375	jne	3f
376
377	movzbl	2(%rax), %ecx
378	cmpb	$0, (%rsp, %rcx, 1)
379	jne	4f
380
381	movzbl	3(%rax), %ecx
382	add	$4, %rax
383	cmpb	$0, (%rsp, %rcx, 1)
384	je	1b
385
386	sub	$3, %rax
3874:	dec	%rdi
3883:	inc	%rax
3892:	sub	%rdi, %rax		# number of characters preceding match
390	leave
391	ret
392ARCHEND(strcspn, x86_64_v2)
393
394	.section .note.GNU-stack,"",%progbits
395