/* xref: /freebsd/lib/libc/amd64/string/memrchr.S (revision b2d2a78ad80ec68d4a17f5aef97d21686cb1e29b) */
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023 Robert Clausecker
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

/*
 * Align the next instruction to a 16 byte boundary (2^4), filling the
 * gap with 0x90 bytes (the single-byte x86 nop).  Used in front of the
 * hot loop heads below.
 */
#define	ALIGN_TEXT	.p2align 4, 0x90

/*
 * Implementation table for memrchr: lists the two variants defined in
 * this file, scalar and baseline.
 * NOTE(review): ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS come from
 * amd64_archlevel.h, which is not visible here; presumably they register
 * the variants for selection by CPU feature level -- confirm there.
 */
ARCHFUNCS(memrchr)
	ARCHFUNC(memrchr, scalar)
	ARCHFUNC(memrchr, baseline)
ENDARCHFUNCS(memrchr)

/*
 * memrchr, scalar variant: find the last occurrence of a byte in a buffer
 * using only general purpose registers and conditional moves.
 *
 * Register use as visible in this code (the argument roles presumably
 * follow void *memrchr(const void *s, int c, size_t n) under the SysV
 * AMD64 calling convention -- confirm against the ARCHENTRY macro):
 *   %rdi	pointer to the next byte to examine
 *   %sil	byte value searched for
 *   %rdx	number of bytes left, biased by -4 once past the first sub
 *   %rax	result: address of the last match seen so far, or null
 *   %r8-%r11, %rcx	scratch
 *
 * The buffer is walked front to back.  Every match overwrites %rax, so
 * the value left at the end is the match with the highest address.  The
 * main loop handles four bytes per iteration without branching on the
 * data; a tail handles the remaining 0--3 bytes.
 */
ARCHENTRY(memrchr, scalar)
	xor	%eax, %eax		# prospective return value
	sub	$4, %rdx		# 4 bytes left to process?
	jb	1f			# no: only the tail, %edx is in -4..-1

	/*
	 * Main loop.  Within one group of four bytes the candidates are
	 * ranked by address: byte 3 beats byte 2, which beats byte 1,
	 * which beats byte 0.  %r11 ends up as the best candidate or null.
	 */
	ALIGN_TEXT
0:	xor	%r8, %r8
	lea	2(%rdi), %r10
	cmp	%sil, 2(%rdi)
	cmovne	%r8, %r10		# point to null if no match

	cmp	%sil, (%rdi)
	cmove	%rdi, %r8		# point to first char if match

	lea	1(%rdi), %r9
	cmp	%sil, 1(%rdi)
	cmovne	%r8, %r9		# point to first result if no match in second

	lea	3(%rdi), %r11
	cmp	%sil, 3(%rdi)
	cmovne	%r10, %r11		# fall back to third byte result if no match in fourth

	test	%r11, %r11
	cmovz	%r9, %r11		# take first pair match if none in second

	test	%r11, %r11
	cmovnz	%r11, %rax		# take match in current set if any

	add	$4, %rdi
	sub	$4, %rdx
	jae	0b			# loop while at least 4 more bytes remain

	/*
	 * Tail.  %edx is now -4, -3, -2 or -1, meaning 0, 1, 2 or 3 bytes
	 * remain.  The unsigned compares below test that bias directly.
	 */
1:	cmp	$-3, %edx		# at least one character left to process?
	jb	2f

	cmp	%sil, (%rdi)
	cmove	%rdi, %rax

	lea	1(%rdi), %rcx
	cmp	$-2, %edx		# at least two characters left to process?
	jb	2f

	cmp	%sil, 1(%rdi)
	cmove	%rcx, %rax

	lea	2(%rdi), %rcx
	cmp	$-1, %edx		# at least three characters left to process?
	jb	2f

	cmp	%sil, 2(%rdi)
	cmove	%rcx, %rax

2:	ret
ARCHEND(memrchr, scalar)

/*
 * memrchr, baseline variant: find the last occurrence of a byte in a
 * buffer, comparing 16 bytes at a time with SSE2 instructions.
 *
 * Register use as visible in this code (the argument roles presumably
 * follow void *memrchr(const void *s, int c, size_t n) under the SysV
 * AMD64 calling convention -- confirm against the ARCHENTRY macro):
 *   %rdi	buffer pointer on entry; rounded down to 16 bytes and then
 *		used as the address of the current chunk
 *   %esi	byte value on entry; %rsi is later reused as the address of
 *		the chunk that holds the last match found so far
 *   %rdx	byte count on entry; after the head it holds the number of
 *		valid bytes from %rdi to the end of the buffer minus 16
 *   %xmm4	search byte replicated into all 16 lanes
 *   %eax	match bit mask belonging to the chunk at %rsi
 *   %rax	result on return
 *
 * All loads cover whole aligned 16 byte chunks.  Bytes of the first and
 * last chunk that lie outside the buffer are therefore read as well and
 * then removed from the match mask.  The result is computed once at the
 * end as chunk address plus index of the highest set mask bit.
 */
ARCHENTRY(memrchr, baseline)
	movd		%esi, %xmm4
	test		%rdx, %rdx		# empty buffer?
	jz		.L0			# if yes, return immediately

	punpcklbw	%xmm4, %xmm4		# c -> cc
	mov		%edi, %ecx		# keep the low pointer bits before aligning
	punpcklwd	%xmm4, %xmm4		# cc -> cccc
	and		$~0xf, %rdi		# align source pointer
	pshufd		$0, %xmm4, %xmm4	# cccc -> cccccccccccccccc
	and		$0xf, %ecx		# offset of the buffer start within its chunk
	movdqa		%xmm4, %xmm0
	mov		$-1, %r8d
	pcmpeqb		(%rdi), %xmm0		# compare aligned head
	shl		%cl, %r8d		# mask of bytes in the head of the buffer
	pmovmskb	%xmm0, %eax

	/*
	 * %rdx becomes length + offset - 16.  The add carries exactly when
	 * that value is not negative; cmc inverts the carry so that jbe
	 * (carry or zero) is taken when the buffer ends in the head chunk.
	 */
	sub		$16, %rcx
	and		%r8d, %eax		# match mask
	add		%rcx, %rdx		# advance past head
	cmc
	jbe		.Lrunt			# does the buffer end within the head chunk?

	mov		%rdi, %rsi		# pointer to matching chunk
	add		$16, %rdi
	sub		$16, %rdx		# enough left for another round?
	jbe		1f			# no: the next chunk is already the tail

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	movdqa		%xmm4, %xmm0
	pcmpeqb		(%rdi), %xmm0
	pmovmskb	%xmm0, %r8d		# matches in first chunk

	cmp		$16, %rdx		# enough left for second chunk?
	jbe		2f			# no: second chunk is the partial tail

	movdqa		%xmm4, %xmm0
	pcmpeqb		16(%rdi), %xmm0
	pmovmskb	%xmm0, %ecx		# matches in second chunk

	lea		16(%rdi), %r9
	test		%ecx, %ecx		# match found in second chunk?
	cmovz		%r8d, %ecx		# if not, use match data from first chunk
	cmovz		%rdi, %r9

	test		%ecx, %ecx		# any match found?
	cmovnz		%ecx, %eax		# if yes, overwrite previously found match
	cmovnz		%r9, %rsi

	add		$32, %rdi		# advance to next iteration
	sub		$32, %rdx		# advance to next chunks
	ja		0b

	/*
	 * process remaining 1--16 bytes
	 *
	 * Here %edx is in -15..0, so its negation is the number of bytes at
	 * the top of this chunk that lie past the end of the buffer.
	 *
	 * NOTE(review): if no chunk matched, %eax is zero when bsr runs and
	 * the final cmovnz is not taken, so the value returned is whatever
	 * bsr leaves in %eax for a zero source.  The null return depends on
	 * that value being zero -- confirm this reliance is intended.
	 */
1:	pcmpeqb		(%rdi), %xmm4
	mov		$0xffff, %r8d
	xor		%ecx, %ecx
	sub		%edx, %ecx		# number of bytes to be masked out
	pmovmskb	%xmm4, %r9d
	shr		%cl, %r8d		# mask of bytes to be kept in the buffer
	and		%r9d, %r8d
	cmovnz		%r8d, %eax		# a match in the tail supersedes earlier ones
	cmovnz		%rdi, %rsi
	bsr		%eax, %eax		# index of the last matching byte in the chunk
	lea		(%rsi, %rax, 1), %rsi	# pointer to match (or junk)
	cmovnz		%rsi, %rax		# if any match was found, return it
	ret

	/* end of chunk reached within first half iteration */
2:	test		%r8d, %r8d		# match in the first chunk of this iteration?
	cmovnz		%r8d, %eax		# if yes, overwrite previous chunks
	cmovnz		%rdi, %rsi
	add		$16, %rdi		# point to tail
	sub		$16, %edx		# %edx is now in -15..0 as expected at 1
	jmp		1b			# handle tail the same otherwise

	/*
	 * runt: buffer ends within head, edx has negated amount of invalid
	 * head bytes.  The same review note about bsr with a zero source
	 * as in the tail code applies to the no-match case here.
	 */
.Lrunt:	mov		$0xffff, %r8d
	xor		%ecx, %ecx
	sub		%edx, %ecx		# number of bytes past the end of the buffer
	shr		%cl, %r8d		# mask of bytes before the end of the buffer
	and		%r8d, %eax
	bsr		%eax, %eax		# index of the last matching byte in the chunk
	lea		(%rdi, %rax, 1), %rdi	# pointer to match (or junk)
	cmovnz		%rdi, %rax		# if any match was found, return it
	ret

	/* empty buffer: return a null pointer */
.L0:	xor		%eax, %eax
	ret
ARCHEND(memrchr, baseline)

166	.section	.note.GNU-stack, "", %progbits
167