xref: /freebsd/lib/libc/amd64/string/strncmp.S (revision e9b261f297cac146f0c9f895c16debe1c4cf8978)
1/*-
2 * Copyright (c) 2023 The FreeBSD Foundation
3 *
4 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
5 * under sponsorship from the FreeBSD Foundation.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE
27 */
28
29#include <machine/asm.h>
30#include <machine/param.h>
31
32#include "amd64_archlevel.h"
33
34#define ALIGN_TEXT	.p2align 4, 0x90
35
36ARCHFUNCS(strncmp)
37	ARCHFUNC(strncmp, scalar)
38	ARCHFUNC(strncmp, baseline)
39ENDARCHFUNCS(strncmp)
40
41/*
42 * This is just the scalar loop unrolled a bunch of times.
43 */
44ARCHENTRY(strncmp, scalar)
45	xor	%eax, %eax
46	sub	$4, %rdx	# 4 chars left to compare?
47	jbe	1f
48
49	ALIGN_TEXT
500:	movzbl	(%rdi), %ecx
51	test	%ecx, %ecx	# NUL char in first string?
52	jz	.L0
53	cmpb	(%rsi), %cl	# mismatch between strings?
54	jnz	.L0
55
56	movzbl	1(%rdi), %ecx
57	test	%ecx, %ecx
58	jz	.L1
59	cmpb	1(%rsi), %cl
60	jnz	.L1
61
62	movzbl	2(%rdi), %ecx
63	test	%ecx, %ecx
64	jz	.L2
65	cmpb	2(%rsi), %cl
66	jnz	.L2
67
68	movzbl	3(%rdi), %ecx
69	test	%ecx, %ecx
70	jz	.L3
71	cmpb	3(%rsi), %cl
72	jnz	.L3
73
74	add	$4, %rdi	# advance to next iteration
75	add	$4, %rsi
76	sub	$4, %rdx
77	ja	0b
78
79	/* end of string within the next 4 characters */
801:	cmp	$-4, %edx	# end of string reached immediately?
81	jz	.Leq
82	movzbl	(%rdi), %ecx
83	test	%ecx, %ecx
84	jz	.L0
85	cmpb	(%rsi), %cl
86	jnz	.L0
87
88	cmp	$-3, %edx	# end of string reached after 1 char?
89	jz	.Leq
90	movzbl	1(%rdi), %ecx
91	test	%ecx, %ecx
92	jz	.L1
93	cmpb	1(%rsi), %cl
94	jnz	.L1
95
96	cmp	$-2, %edx
97	jz	.Leq
98	movzbl	2(%rdi), %ecx
99	test	%ecx, %ecx
100	jz	.L2
101	cmpb	2(%rsi), %cl
102	jnz	.L2
103
104	cmp	$-1, %edx	# either end of string after 3 chars,
105	jz	.Leq		# or it boils down to the last char
106
107.L3:	inc	%eax
108.L2:	inc	%eax
109.L1:	inc	%eax
110.L0:	movzbl	(%rsi, %rax, 1), %ecx
111	movzbl	(%rdi, %rax, 1), %eax
112	sub	%ecx, %eax
113.Leq:	ret
114ARCHEND(strncmp, scalar)
115
116ARCHENTRY(strncmp, baseline)
117	push		%rbx
118	sub		$1, %rdx	# RDX--, so RDX points to the last byte to compare
119	jb		.Lempty		# where there any bytes to compare at all?
120
121	lea		15(%rdi), %r8d	# end of head
122	lea		15(%rsi), %r9d
123	mov		%edi, %eax
124	mov		%esi, %ebx
125	xor		%edi, %r8d	# bits that changed between first and last byte
126	xor		%esi, %r9d
127	and		$~0xf, %rdi	# align heads to 16 bytes
128	and		$~0xf, %rsi
129	or		%r8d, %r9d
130	and		$0xf, %eax	# offset from alignment
131	and		$0xf, %ebx
132	movdqa		(%rdi), %xmm0	# load aligned heads
133	movdqa		(%rsi), %xmm2
134	pxor		%xmm1, %xmm1
135	cmp		$16, %rdx	# end of buffer within the first 32 bytes?
136	jb		.Llt16
137
138	test		$PAGE_SIZE, %r9d # did the page change?
139	jz		0f		# if not, take fast path
140
141
142	/* heads may cross page boundary, avoid unmapped loads */
143	movdqa		%xmm0, -32(%rsp) # stash copies of the heads on the stack
144	movdqa		%xmm2, -16(%rsp)
145	mov		$-1, %r8d
146	mov		$-1, %r9d
147	mov		%eax, %ecx
148	shl		%cl, %r8d	# string head in XMM0
149	mov		%ebx, %ecx
150	shl		%cl, %r9d	# string head in XMM2
151	pcmpeqb		%xmm1, %xmm0
152	pcmpeqb		%xmm1, %xmm2
153	pmovmskb	%xmm0, %r10d
154	pmovmskb	%xmm2, %r11d
155	test		%r8d, %r10d	# NUL byte present in first string?
156	lea		-32(%rsp), %r8
157	cmovz		%rdi, %r8
158	test		%r9d, %r11d	# NUL byte present in second string?
159	lea		-16(%rsp), %r9
160	cmovz		%rsi, %r9
161	movdqu		(%r8, %rax, 1), %xmm0 # load true (or fake) heads
162	movdqu		(%r9, %rbx, 1), %xmm4
163	jmp		1f
164
165	/* rdx == 0 */
166.Lempty:
167	xor		%eax, %eax	# zero-length buffers compare equal
168	pop		%rbx
169	ret
170
1710:	movdqu		(%rdi, %rax, 1), %xmm0 # load true heads
172	movdqu		(%rsi, %rbx, 1), %xmm4
1731:	pxor		%xmm2, %xmm2
174	pcmpeqb		%xmm0, %xmm2	# NUL byte present?
175	pcmpeqb		%xmm0, %xmm4	# which bytes match?
176	pandn		%xmm4, %xmm2	# match and not NUL byte?
177	pmovmskb	%xmm2, %r9d
178	xor		$0xffff, %r9d	# mismatch or NUL byte?
179	jnz		.Lhead_mismatch
180
181	/* load head and second chunk */
182	movdqa		16(%rdi), %xmm2	# load second chunks
183	movdqa		16(%rsi), %xmm3
184	lea		-16(%rdx, %rbx, 1), %rdx # account for length of RSI chunk
185	sub		%rbx, %rax	# is a&0xf >= b&0xf?
186	jb		.Lswapped	# if not, proceed with swapped operands
187	jmp		.Lnormal
188
189	/* buffer ends within the first 16 bytes */
190.Llt16:	test		$PAGE_SIZE, %r9d # did the page change?
191	jz		0f		# if not, take fast path
192
193	/* heads may cross page boundary */
194	movdqa		%xmm0, -32(%rsp) # stash copies of the heads on the stack
195	movdqa		%xmm2, -16(%rsp)
196	mov		$-1, %r8d
197	mov		$-1, %r9d
198	mov		%eax, %ecx
199	shl		%cl, %r8d	# string head in XMM0
200	mov		%ebx, %ecx
201	shl		%cl, %r9d	# string head in XMM2
202	pcmpeqb		%xmm1, %xmm0
203	pcmpeqb		%xmm1, %xmm2
204	pmovmskb	%xmm0, %r10d
205	pmovmskb	%xmm2, %r11d
206	lea		(%rdx, %rax, 1), %ecx # location of last buffer byte in xmm0
207	bts		%ecx, %r10d	# treat as if NUL byte present
208	lea		(%rdx, %rbx, 1), %ecx
209	bts		%ecx, %r11d
210	test		%r8w, %r10w	# NUL byte present in first string head?
211	lea		-32(%rsp), %r8
212	cmovz		%rdi, %r8
213	test		%r9w, %r11w	# NUL byte present in second string head?
214	lea		-16(%rsp), %r9
215	cmovz		%rsi, %r9
216	movdqu		(%r8, %rax, 1), %xmm0 # load true (or fake) heads
217	movdqu		(%r9, %rbx, 1), %xmm4
218	jmp		1f
219
2200:	movdqu		(%rdi, %rax, 1), %xmm0 # load true heads
221	movdqu		(%rsi, %rbx, 1), %xmm4
2221:	pxor		%xmm2, %xmm2
223	pcmpeqb		%xmm0, %xmm2	# NUL byte present?
224	pcmpeqb		%xmm0, %xmm4	# which bytes match?
225	pandn		%xmm4, %xmm2	# match and not NUL byte?
226	pmovmskb	%xmm2, %r9d
227	btr		%edx, %r9d	# induce mismatch in last byte of buffer
228	not		%r9d		# mismatch or NUL byte?
229
230	/* mismatch in true heads */
231	ALIGN_TEXT
232.Lhead_mismatch:
233	tzcnt		%r9d, %r9d	# where is the mismatch?
234	add		%rax, %rdi	# return to true heads
235	add		%rbx, %rsi
236	movzbl		(%rdi, %r9, 1), %eax # mismatching characters
237	movzbl		(%rsi, %r9, 1), %ecx
238	sub		%ecx, %eax
239	pop		%rbx
240	ret
241
242	/* rax >= 0 */
243	ALIGN_TEXT
244.Lnormal:
245	neg		%rax
246	movdqu		16(%rsi, %rax, 1), %xmm0
247	sub		%rdi, %rsi	# express RSI as distance from RDI
248	lea		(%rsi, %rax, 1), %rbx # point RBX to offset in second string
249	neg		%rax		# ... corresponding to RDI
250	pcmpeqb		%xmm3, %xmm1	# NUL present?
251	pcmpeqb		%xmm2, %xmm0	# Mismatch between chunks?
252	pmovmskb	%xmm1, %r8d
253	pmovmskb	%xmm0, %r9d
254	mov		$16, %ecx
255	cmp		%rcx, %rdx	# does the buffer end within (RDI,RSI,1)?
256	cmovb		%edx, %ecx	# ECX = min(16, RDX)
257	add		$32, %rdi	# advance to next iteration
258	bts		%ecx, %r8d	# mark end-of-buffer as if there was a NUL byte
259	test		%r8w, %r8w	# NUL or end of buffer found?
260	jnz		.Lnul_found2
261	xor		$0xffff, %r9d
262	jnz		.Lmismatch2
263	sub		$48, %rdx	# end of buffer within first main loop iteration?
264	jb		.Ltail		# if yes, process tail
265
266	/*
267	 * During the main loop, the layout of the two strings is something like:
268	 *
269	 *          v ------1------ v ------2------ v
270	 *     RDI:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
271	 *     RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
272	 *
273	 * where v indicates the alignment boundaries and corresponding chunks
274	 * of the strings have the same letters.  Chunk A has been checked in
275	 * the previous iteration.  This iteration, we first check that string
276	 * RSI doesn't end within region 2, then we compare chunk B between the
277	 * two strings.  As RSI is known not to hold a NUL byte in regsions 1
278	 * and 2 at this point, this also ensures that RDI has not ended yet.
279	 */
280	ALIGN_TEXT
2810:	movdqu		(%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI
282	pxor		%xmm1, %xmm1
283	pcmpeqb		(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
284	pcmpeqb		(%rdi), %xmm0	# where do the chunks match?
285	pmovmskb	%xmm1, %r8d
286	pmovmskb	%xmm0, %r9d
287	test		%r8d, %r8d
288	jnz		.Lnul_found
289	xor		$0xffff, %r9d	# any mismatches?
290	jnz		.Lmismatch
291
292	/* main loop unrolled twice */
293	movdqu		16(%rdi, %rbx, 1), %xmm0
294	pxor		%xmm1, %xmm1
295	pcmpeqb		16(%rdi, %rsi, 1), %xmm1
296	pcmpeqb		16(%rdi), %xmm0
297	pmovmskb	%xmm1, %r8d
298	pmovmskb	%xmm0, %r9d
299	add		$32, %rdi
300	test		%r8d, %r8d
301	jnz		.Lnul_found2
302	xor		$0xffff, %r9d
303	jnz		.Lmismatch2
304	sub		$32, %rdx	# end of buffer within next iteration?
305	jae		0b
306
307	/* end of buffer will occur in next 32 bytes */
308.Ltail:	movdqu		(%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI
309	pxor		%xmm1, %xmm1
310	pcmpeqb		(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
311	pcmpeqb		(%rdi), %xmm0	# where do the chunks match?
312	pmovmskb	%xmm1, %r8d
313	pmovmskb	%xmm0, %r9d
314	bts		%edx, %r8d	# indicate NUL byte at last byte in buffer
315	test		%r8w, %r8w	# NUL byte in first chunk?
316	jnz		.Lnul_found
317	xor		$0xffff, %r9d	# any mismatches?
318	jnz		.Lmismatch
319
320	/* main loop unrolled twice */
321	movdqu		16(%rdi, %rbx, 1), %xmm0
322	pxor		%xmm1, %xmm1
323	pcmpeqb		16(%rdi, %rsi, 1), %xmm1
324	pcmpeqb		16(%rdi), %xmm0
325	pmovmskb	%xmm1, %r8d
326	pmovmskb	%xmm0, %r9d
327	sub		$16, %edx	# take first half into account
328	bts		%edx, %r8d	# indicate NUL byte at last byte in buffer
329	add		$32, %rdi
330
331.Lnul_found2:
332	sub		$16, %rdi
333
334.Lnul_found:
335	mov		%eax, %ecx
336	mov		%r8d, %r10d
337	shl		%cl, %r8d	# adjust NUL mask to positions in RDI/RBX
338	not		%r9d		# mask of mismatches
339	or		%r8w, %r9w	# NUL bytes als count as mismatches
340	jnz		.Lmismatch
341
342	/*
343	 * (RDI) == (RSI) and NUL is past the string.
344	 * compare (RSI) with the corresponding part
345	 * of the other string until the NUL byte.
346	 */
347	movdqu		(%rdi, %rax, 1), %xmm0
348	pcmpeqb		(%rdi, %rsi, 1), %xmm0
349	add		%rdi, %rsi	# restore RSI pointer
350	add		%rax, %rdi	# point RDI to chunk corresponding to (RSI)
351	pmovmskb	%xmm0, %ecx	# mask of matches
352	not		%ecx		# mask of mismatches
353	or		%r10d, %ecx	# mask of mismatches or NUL bytes
354	tzcnt		%ecx, %ecx	# location of first mismatch
355	movzbl		(%rdi, %rcx, 1), %eax
356	movzbl		(%rsi, %rcx, 1), %ecx
357	sub		%ecx, %eax
358	pop		%rbx
359	ret
360
361.Lmismatch2:
362	sub		$16, %rdi
363
364	/* a mismatch has been found between RBX and RSI */
365.Lmismatch:
366	tzcnt		%r9d, %r9d	# where is the mismatch?
367	add		%rdi, %rbx	# turn RBX from offset into pointer
368	movzbl		(%rbx, %r9, 1), %ecx
369	movzbl		(%rdi, %r9, 1), %eax
370	sub		%ecx, %eax
371	pop		%rbx
372	ret
373
374	/* rax < 0 */
375	ALIGN_TEXT
376.Lswapped:
377	movdqu		16(%rdi, %rax, 1), %xmm0
378	sub		%rsi, %rdi	# express RDI as distance from RDI
379	lea		(%rdi, %rax, 1), %rbx # point RBX to offset in first string
380	pcmpeqb		%xmm2, %xmm1	# NUL present?
381	pcmpeqb		%xmm3, %xmm0	# mismatch between chunks?
382	pmovmskb	%xmm1, %r8d
383	pmovmskb	%xmm0, %r9d
384	add		%rax, %rdx	# RDX points to buffer end in RSI
385	neg		%rax		# ... corresponding to RSI
386	mov		$16, %ecx
387	cmp		%rcx, %rdx	# does the buffer end within (RSI,RDI,1)?
388	cmovb		%edx, %ecx	# ECX = min(16, RDX)
389	add		$32, %rsi
390	bts		%ecx, %r8d	# mark end-of-buffer as if there was a NUL byte
391	test		%r8w, %r8w	# NUL or end of buffer found?
392	jnz		.Lnul_found2s
393	xor		$0xffff, %r9d
394	jnz		.Lmismatch2s
395	sub		$48, %rdx	# end of buffer within first main loop iteration?
396	jb		.Ltails		# if yes, process tail
397
398	ALIGN_TEXT
3990:	movdqu		(%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI
400	pxor		%xmm1, %xmm1
401	pcmpeqb		(%rsi, %rdi, 1), %xmm1 # end of string in RDI?
402	pcmpeqb		(%rsi), %xmm0	# where do the chunks match?
403	pmovmskb	%xmm1, %r8d
404	pmovmskb	%xmm0, %r9d
405	test		%r8d, %r8d
406	jnz		.Lnul_founds
407	xor		$0xffff, %r9d	# any mismatches?
408	jnz		.Lmismatchs
409
410	/* main loop unrolled twice */
411	movdqu		16(%rsi, %rbx, 1), %xmm0
412	pxor		%xmm1, %xmm1
413	pcmpeqb		16(%rsi, %rdi, 1), %xmm1
414	pcmpeqb		16(%rsi), %xmm0
415	pmovmskb	%xmm1, %r8d
416	pmovmskb	%xmm0, %r9d
417	add		$32, %rsi
418	test		%r8d, %r8d
419	jnz		.Lnul_found2s
420	xor		$0xffff, %r9d
421	jnz		.Lmismatch2s
422	sub		$32, %rdx	# end of buffer within next iteration?
423	jae		0b
424
425	/* end of buffer will occur in next 32 bytes */
426.Ltails:
427	movdqu		(%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI
428	pxor		%xmm1, %xmm1
429	pcmpeqb		(%rsi, %rdi, 1), %xmm1 # end of string in RDI?
430	pcmpeqb		(%rsi), %xmm0	# where do the chunks match?
431	pmovmskb	%xmm1, %r8d
432	pmovmskb	%xmm0, %r9d
433	bts		%edx, %r8d	# indicate NUL byte at laste byte in buffer
434	test		%r8w, %r8w	# NUL byte in first chunk?
435	jnz		.Lnul_founds
436	xor		$0xffff, %r9d	# any mismatches?
437	jnz		.Lmismatchs
438
439	/* main loop unrolled twice */
440	movdqu		16(%rsi, %rbx, 1), %xmm0
441	pxor		%xmm1, %xmm1
442	pcmpeqb		16(%rsi, %rdi, 1), %xmm1
443	pcmpeqb		16(%rsi), %xmm0
444	pmovmskb	%xmm1, %r8d
445	pmovmskb	%xmm0, %r9d
446	sub		$16, %edx	# take first half into account
447	bts		%edx, %r8d	# indicate NUL byte at laste byte in buffer
448	add		$32, %rsi
449
450.Lnul_found2s:
451	sub		$16, %rsi
452
453.Lnul_founds:
454	mov		%eax, %ecx
455	mov		%r8d, %r10d
456	shl		%cl, %r8d	# adjust NUL mask to positions in RSI/RBX
457	not		%r9d		# mask of mismatches
458	or		%r8w, %r9w	# NUL bytes also count as mismatches
459	jnz		.Lmismatchs
460
461	movdqu		(%rsi, %rax, 1), %xmm0
462	pcmpeqb		(%rsi, %rdi, 1), %xmm0
463	add		%rsi, %rdi	# restore RDI pointer
464	add		%rax, %rsi	# point RSI to chunk corresponding to (RDI)
465	pmovmskb	%xmm0, %ecx	# mask of matches
466	not		%ecx		# mask of mismatches
467	or		%r10d, %ecx	# mask of mismatches or NUL bytes
468	tzcnt		%ecx, %ecx	# location of first mismatch
469	movzbl		(%rdi, %rcx, 1), %eax
470	movzbl		(%rsi, %rcx, 1), %ecx
471	sub		%ecx, %eax
472	pop		%rbx
473	ret
474
475.Lmismatch2s:
476	sub		$16, %rsi
477
478.Lmismatchs:
479	tzcnt		%r9d, %r9d	# where is the mismatch?
480	add		%rsi, %rbx	# turn RBX from offset into pointer
481	movzbl		(%rbx, %r9, 1), %eax
482	movzbl		(%rsi, %r9, 1), %ecx
483	sub		%ecx, %eax
484	pop		%rbx
485	ret
486ARCHEND(strncmp, baseline)
487
488	.section .note.GNU-stack,"",%progbits
489