xref: /freebsd/lib/libc/amd64/string/strcspn.S (revision 0e867a49115687398fd486b3af67fbb41f48b8a8)
1/*
2 * Copyright (c) 2023 The FreeBSD Foundation
3 *
4 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
5 * under sponsorship from the FreeBSD Foundation.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE
27 */
28
29#include <machine/asm.h>
30#include <machine/param.h>
31
32#include "amd64_archlevel.h"
33
34#define ALIGN_TEXT	.p2align 4,0x90 /* 16-byte alignment, nop filled */
35
36	.weak strcspn
37	.set strcspn, __strcspn
38ARCHFUNCS(__strcspn)
39	ARCHFUNC(__strcspn, scalar)
40	NOARCHFUNC
41	ARCHFUNC(__strcspn, x86_64_v2)
42ENDARCHFUNCS(__strcspn)
43
44ARCHENTRY(__strcspn, scalar)
45	push	%rbp			# align stack to enable function call
46	mov	%rsp, %rbp
47	sub	$256, %rsp		# allocate space for lookup table
48
49	/* check for special cases */
50	movzbl	(%rsi), %eax		# first character in the set
51	test	%eax, %eax
52	jz	.Lstrlen
53
54	movzbl	1(%rsi), %edx		# second character in the set
55	test	%edx, %edx
56	jz	.Lstrchr
57
58	/* no special case matches -- prepare lookup table */
59	xor	%r8d, %r8d
60	mov	$28, %ecx
610:	mov	%r8, (%rsp, %rcx, 8)
62	mov	%r8, 8(%rsp, %rcx, 8)
63	mov	%r8, 16(%rsp, %rcx, 8)
64	mov	%r8, 24(%rsp, %rcx, 8)
65	sub	$4, %ecx
66	jnc	0b
67
68	add	$2, %rsi
69	movb	$1, (%rsp, %rax, 1)	# register first chars in set
70	movb	$1, (%rsp, %rdx, 1)
71	mov	%rdi, %rax		# a copy of the source to iterate over
72
73	/* process remaining chars in set */
74	ALIGN_TEXT
750:	movzbl	(%rsi), %ecx
76	movb	$1, (%rsp, %rcx, 1)
77	test	%ecx, %ecx
78	jz	1f
79
80	movzbl	1(%rsi), %ecx
81	movb	$1, (%rsp, %rcx, 1)
82	test	%ecx, %ecx
83	jz	1f
84
85	add	$2, %rsi
86	jmp	0b
87
88	/* find match */
89	ALIGN_TEXT
901:	movzbl	(%rax), %ecx
91	cmpb	$0, (%rsp, %rcx, 1)
92	jne	2f
93
94	movzbl	1(%rax), %ecx
95	cmpb	$0, (%rsp, %rcx, 1)
96	jne	3f
97
98	movzbl	2(%rax), %ecx
99	cmpb	$0, (%rsp, %rcx, 1)
100	jne	4f
101
102	movzbl	3(%rax), %ecx
103	add	$4, %rax
104	cmpb	$0, (%rsp, %rcx, 1)
105	je	1b
106
107	sub	$3, %rax
1084:	dec	%rdi
1093:	inc	%rax
1102:	sub	%rdi, %rax		# number of characters preceding match
111	leave
112	ret
113
114	/* set is empty, degrades to strlen */
115.Lstrlen:
116	leave
117	jmp	CNAME(strlen)
118
119	/* just one character in set, degrades to strchr */
120.Lstrchr:
121	mov	%rdi, (%rsp)		# stash a copy of the string
122	mov	%eax, %esi		# find the character in the set
123	call	CNAME(strchrnul)
124	sub	(%rsp), %rax		# length of prefix before match
125	leave
126	ret
127ARCHEND(__strcspn, scalar)
128
129	/*
130	 * This kernel uses pcmpistri to do the heavy lifting.
131	 * We provide five code paths, depending on set size:
132	 *
133	 *      0: call strlen()
134	 *      1: call strchr()
135	 *  2--16: one pcmpistri per 16 bytes of input
136	 * 17--32: two pcmpistri per 16 bytes of input
137	 *   >=33: fall back to look up table
138	 */
139ARCHENTRY(__strcspn, x86_64_v2)
140	push		%rbp
141	mov		%rsp, %rbp
142	sub		$256, %rsp
143
144	/* check for special cases */
145	movzbl		(%rsi), %eax
146	test		%eax, %eax		# empty string?
147	jz		.Lstrlenv2
148
149	cmpb		$0, 1(%rsi)		# single character string?
150	jz		.Lstrchrv2
151
152	/* find set size and copy up to 32 bytes to (%rsp) */
153	mov		%esi, %ecx
154	and		$~0xf, %rsi		# align set pointer
155	movdqa		(%rsi), %xmm0
156	pxor		%xmm1, %xmm1
157	and		$0xf, %ecx		# amount of bytes rsi is past alignment
158	xor		%edx, %edx
159	pcmpeqb		%xmm0, %xmm1		# end of string reached?
160	movdqa		%xmm0, 32(%rsp)		# transfer head of set to stack
161	pmovmskb	%xmm1, %eax
162	shr		%cl, %eax		# clear out junk before string
163	test		%eax, %eax		# end of set reached?
164	jnz		0f
165
166	movdqa		16(%rsi), %xmm0		# second chunk of the set
167	mov		$16, %edx
168	sub		%ecx, %edx		# length of set preceding xmm0
169	pxor		%xmm1, %xmm1
170	pcmpeqb		%xmm0, %xmm1
171	movdqa		%xmm0, 48(%rsp)
172	movdqu		32(%rsp, %rcx, 1), %xmm2 # head of set
173	pmovmskb	%xmm1, %eax
174	test		%eax, %eax
175	jnz		1f
176
177	movdqa		32(%rsi), %xmm0		# third chunk
178	add		$16, %edx
179	pxor		%xmm1, %xmm1
180	pcmpeqb		%xmm0, %xmm1
181	movdqa		%xmm0, 64(%rsp)
182	pmovmskb	%xmm1, %eax
183	test		%eax, %eax		# still not done?
184	jz		.Lgt32v2
185
1860:	movdqu		32(%rsp, %rcx, 1), %xmm2 # head of set
1871:	tzcnt		%eax, %eax
188	add		%eax, %edx		# length of set (excluding NUL byte)
189	cmp		$32, %edx		# above 32 bytes?
190	ja		.Lgt32v2
191
192	/*
193	 * At this point we know that we want to use pcmpistri.
194	 * one last problem obtains: the head of the string is not
195	 * aligned and may cross a cacheline.  If this is the case,
196	 * we take the part before the page boundary and repeat the
197	 * last byte to fill up the xmm register.
198	 */
199	mov		%rdi, %rax		# save original string pointer
200	lea		15(%rdi), %esi		# last byte of the head
201	xor		%edi, %esi
202	test		$PAGE_SIZE, %esi	# does the head cross a page?
203	jz		0f
204
205	/* head crosses page: copy to stack to fix up */
206	and		$~0xf, %rax		# align head pointer temporarily
207	movzbl		15(%rax), %esi		# last head byte on the page
208	movdqa		(%rax), %xmm0
209	movabs		$0x0101010101010101, %r8
210	imul		%r8, %rsi		# repeated 8 times
211	movdqa		%xmm0, (%rsp)		# head word on stack
212	mov		%rsi, 16(%rsp)		# followed by filler (last byte x8)
213	mov		%rsi, 24(%rsp)
214	mov		%edi, %eax
215	and		$0xf, %eax		# offset of head from alignment
216	add		%rsp, %rax		# pointer to fake head
217
2180:	movdqu		(%rax), %xmm0		# load head (fake or real)
219	lea		16(%rdi), %rax
220	and		$~0xf, %rax		# second 16 bytes of string (aligned)
2211:	cmp		$16, %edx		# 16--32 bytes?
222	ja		.Lgt16v2
223
224
225	/* set is 2--16 bytes in size */
226
227	/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT */
228	pcmpistri	$0, %xmm0, %xmm2	# match in head?
229	jbe		.Lheadmatchv2
230
231	ALIGN_TEXT
2320:	pcmpistri	$0, (%rax), %xmm2
233	jbe		1f			# match or end of string?
234	pcmpistri	$0, 16(%rax), %xmm2
235	lea		32(%rax), %rax
236	ja		0b			# match or end of string?
237
2383:	lea		-16(%rax), %rax		# go back to second half
2391:	jc		2f			# jump if match found
240	movdqa		(%rax), %xmm0		# reload string piece
241	pxor		%xmm1, %xmm1
242	pcmpeqb		%xmm1, %xmm0		# where is the NUL byte?
243	pmovmskb	%xmm0, %ecx
244	tzcnt		%ecx, %ecx		# location of NUL byte in (%rax)
2452:	sub		%rdi, %rax		# offset of %xmm0 from beginning of string
246	add		%rcx, %rax		# prefix length before match/NUL
247	leave
248	ret
249
250.Lheadmatchv2:
251	jc		2f			# jump if match found
252	pxor		%xmm1, %xmm1
253	pcmpeqb		%xmm1, %xmm0
254	pmovmskb	%xmm0, %ecx
255	tzcnt		%ecx, %ecx		# location of NUL byte
2562:	mov		%ecx, %eax		# prefix length before match/NUL
257	leave
258	ret
259
260	/* match in first set half during head */
261.Lheadmatchv2first:
262	mov		%ecx, %eax
263	pcmpistri	$0, %xmm0, %xmm3	# match in second set half?
264	cmp		%ecx, %eax		# before the first half match?
265	cmova		%ecx, %eax		# use the earlier match
266	leave
267	ret
268
269.Lgt16v2:
270	movdqu		48(%rsp, %rcx, 1), %xmm3 # second part of set
271
272	/* set is 17--32 bytes in size */
273	pcmpistri	$0, %xmm0, %xmm2	# match in first set half?
274	jb		.Lheadmatchv2first
275	pcmpistri	$0, %xmm0, %xmm3	# match in second set half or end of string?
276	jbe		.Lheadmatchv2
277
278	ALIGN_TEXT
2790:	movdqa		(%rax), %xmm0
280	pcmpistri	$0, %xmm0, %xmm2
281	jb		4f			# match in first set half?
282	pcmpistri	$0, %xmm0, %xmm3
283	jbe		1f			# match in second set half or end of string?
284	movdqa		16(%rax), %xmm0
285	add		$32, %rax
286	pcmpistri	$0, %xmm0, %xmm2
287	jb		3f			# match in first set half?
288	pcmpistri	$0, %xmm0, %xmm3
289	ja		0b			# neither match in 2nd half nor string end?
290
291	/* match in second half or NUL */
292	lea		-16(%rax), %rax		# go back to second half
2931:	jc		2f			# jump if match found
294	pxor		%xmm1, %xmm1
295	pcmpeqb		%xmm1, %xmm0		# where is the NUL byte?
296	pmovmskb	%xmm0, %ecx
297	tzcnt		%ecx, %ecx		# location of NUL byte in (%rax)
2982:	sub		%rdi, %rax		# offset of %xmm0 from beginning of string
299	add		%rcx, %rax		# prefix length before match/NUL
300	leave
301	ret
302
303	/* match in first half */
3043:	sub		$16, %rax		# go back to second half
3054:	sub		%rdi, %rax		# offset of %xmm0 from beginning of string
306	mov		%ecx, %edx
307	pcmpistri	$0, %xmm0, %xmm3	# match in second set half?
308	cmp		%ecx, %edx		# before the first half match?
309	cmova		%ecx, %edx		# use the earlier match
310	add		%rdx, %rax		# return full ofset
311	leave
312	ret
313
314	/* set is empty, degrades to strlen */
315.Lstrlenv2:
316	leave
317	jmp	CNAME(strlen)
318
319	/* just one character in set, degrades to strchr */
320.Lstrchrv2:
321	mov	%rdi, (%rsp)		# stash a copy of the string
322	mov	%eax, %esi		# find this character
323	call	CNAME(strchrnul)
324	sub	(%rsp), %rax		# length of prefix before match
325	leave
326	ret
327
328	/* set is >=33 bytes in size */
329.Lgt32v2:
330	xorps	%xmm0, %xmm0
331	mov	$256-64, %edx
332
333	/* clear out look up table */
3340:	movaps	%xmm0, (%rsp, %rdx, 1)
335	movaps	%xmm0, 16(%rsp, %rdx, 1)
336	movaps	%xmm0, 32(%rsp, %rdx, 1)
337	movaps	%xmm0, 48(%rsp, %rdx, 1)
338	sub	$64, %edx
339	jnc	0b
340
341	add	%rcx, %rsi		# restore string pointer
342	mov	%rdi, %rax		# keep a copy of the string
343
344	/* initialise look up table */
345	ALIGN_TEXT
3460:	movzbl	(%rsi), %ecx
347	movb	$1, (%rsp, %rcx, 1)
348	test	%ecx, %ecx
349	jz	1f
350
351	movzbl	1(%rsi), %ecx
352	movb	$1, (%rsp, %rcx, 1)
353	test	%ecx, %ecx
354	jz	1f
355
356	movzbl	2(%rsi), %ecx
357	movb	$1, (%rsp, %rcx, 1)
358	test	%ecx, %ecx
359	jz	1f
360
361	movzbl	3(%rsi), %ecx
362	movb	$1, (%rsp, %rcx, 1)
363	test	%ecx, %ecx
364	jz	1f
365
366	add	$4, %rsi
367	jmp	0b
368
369	/* find match */
370	ALIGN_TEXT
3711:	movzbl	(%rax), %ecx
372	cmpb	$0, (%rsp, %rcx, 1)
373	jne	2f
374
375	movzbl	1(%rax), %ecx
376	cmpb	$0, (%rsp, %rcx, 1)
377	jne	3f
378
379	movzbl	2(%rax), %ecx
380	cmpb	$0, (%rsp, %rcx, 1)
381	jne	4f
382
383	movzbl	3(%rax), %ecx
384	add	$4, %rax
385	cmpb	$0, (%rsp, %rcx, 1)
386	je	1b
387
388	sub	$3, %rax
3894:	dec	%rdi
3903:	inc	%rax
3912:	sub	%rdi, %rax		# number of characters preceding match
392	leave
393	ret
394ARCHEND(__strcspn, x86_64_v2)
395
396	.section .note.GNU-stack,"",%progbits
397