xref: /freebsd/lib/libc/amd64/string/memrchr.S (revision 24e4dcf4ba5e9dedcf89efd358ea3e1fe5867020)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023, 2025 Robert Clausecker <fuz@FreeBSD.org>
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define	ALIGN_TEXT	.p2align 4, 0x90

ARCHFUNCS(memrchr)
	ARCHFUNC(memrchr, scalar)
	ARCHFUNC(memrchr, baseline)
ENDARCHFUNCS(memrchr)

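/*
 * Scalar implementation: walk the buffer backwards from its last byte,
 * comparing four bytes per loop iteration.  Once fewer than four bytes
 * remain, the tail code checks the up to three leftover bytes.
 */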
ARCHENTRY(memrchr, scalar)
	lea		-1(%rdi, %rdx, 1), %rax	# point to last char in buffer
	sub		$4, %rdx		# 4 bytes left to process?
	jb		.Ltail

	ALIGN_TEXT
0:	cmp		%sil, (%rax)		# match at last entry?
	je		1f

	cmp		%sil, -1(%rax)		# match at second to last entry?
	je		2f

	cmp		%sil, -2(%rax)		# match at third to last entry?
	je		3f

	cmp		%sil, -3(%rax)		# match at fourth to last entry?
	je		4f

	sub		$4, %rax
	sub		$4, %rdx
	jae		0b

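	/*
	 * Fewer than 4 bytes remain.  %rdx holds the remaining length
	 * minus 4, i.e. a value in [-4, -1], so the unsigned comparisons
	 * below determine how many tail bytes still need to be checked.
	 */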
.Ltail:	cmp		$-3, %edx		# at least one character left to process?
	jb		.Lnotfound

	cmp		%sil, (%rax)
	je		1f

	cmp		$-2, %edx		# at least two characters left to process?
	jb		.Lnotfound

	cmp		%sil, -1(%rax)
	je		2f

	cmp		$-1, %edx		# at least three characters left to process?
	jb		.Lnotfound

	cmp		%sil, -2(%rax)
	je		3f

.Lnotfound:
	xor		%eax, %eax
	ret

	/* match found -- adjust rax to point to matching byte */
4:	dec		%rax
3:	dec		%rax
2:	dec		%rax
1:	ret
ARCHEND(memrchr, scalar)

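/*
 * Baseline (SSE2) implementation: scan the buffer in 32 byte chunks
 * loaded from 32 byte aligned addresses, starting at the end of the
 * buffer and working towards its beginning.  Match bits that fall
 * outside the buffer are masked out in the first (tail) and last
 * (head) chunk processed.
 */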
ARCHENTRY(memrchr, baseline)
	test		%rdx, %rdx		# empty input?
	je		.Lnomatchb

	lea		(%rdi, %rdx, 1), %ecx	# pointer to end of buffer
	lea		-1(%rdi, %rdx, 1), %rdx	# pointer to last char in buffer
	movd		%esi, %xmm2
	and		$~0x1f, %rdx		# pointer to final 32 buffer bytes
	movdqa		(%rdx), %xmm0		# load last 32 bytes
	movdqa		16(%rdx), %xmm1

	punpcklbw	%xmm2, %xmm2		# c -> cc

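	/*
	 * Build the two match masks: %r8d is clear for chunk bytes past
	 * the end of the buffer, %r9d is clear for chunk bytes before
	 * its beginning.  Only the low 5 bits of the shift counts are
	 * used, so truncating the end-of-buffer address to %ecx above
	 * is sufficient.
	 */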
	mov		$-1, %r8d
	neg		%ecx
	mov		%r8d, %r9d
	shr		%cl, %r8d		# mask with zeroes after the string

	punpcklwd	%xmm2, %xmm2		# cc -> cccc

	mov		%edi, %ecx
	mov		%r9d, %eax
	shl		%cl, %r9d		# mask with zeroes before the string

	pshufd		$0, %xmm2, %xmm2	# cccc -> cccccccccccccccc

	cmp		%rdx, %rdi		# tail is beginning of buffer?
	cmovae		%r9d, %eax		# if yes, do combined head/tail processing
	and		%r8d, %eax		# mask of bytes in tail part of string

	/* process tail */
	pcmpeqb		%xmm2, %xmm1
	pcmpeqb		%xmm2, %xmm0
	pmovmskb	%xmm1, %esi
	pmovmskb	%xmm0, %ecx
	shl		$16, %esi
	or		%esi, %ecx		# locations of matches
	and		%ecx, %eax		# any match inside buffer?
	jnz		.Lprecisematchb

	cmp		%rdx, %rdi		# did the buffer begin here?
	jae		.Lnomatchb		# if yes, we are done

	/* main loop */
	ALIGN_TEXT
0:	movdqa		-32(%rdx), %xmm0	# load previous string chunk
	movdqa		-16(%rdx), %xmm1
	sub		$32, %rdx		# beginning of string reached?
	cmp		%rdx, %rdi
	jae		.Ltailb

	pcmpeqb		%xmm2, %xmm0
	pcmpeqb		%xmm2, %xmm1
	por		%xmm1, %xmm0		# match in either half?
	pmovmskb	%xmm0, %eax
	test		%eax, %eax
	jz		0b

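	/*
	 * A match lies somewhere in the current chunk.  %xmm0 was
	 * clobbered by the POR above, so the comparison for the low 16
	 * bytes is redone; %xmm1 still holds the result for the high
	 * 16 bytes.
	 */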
.Lmatchb:
	pcmpeqb		(%rdx), %xmm2		# redo comparison of first 16 bytes
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm2, %eax
	shl		$16, %ecx
	or		%ecx, %eax		# location of matches

.Lprecisematchb:
	bsr		%eax, %eax		# find location of match
	add		%rdx, %rax		# point to matching byte
	ret

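	/*
	 * Beginning of the buffer reached.  After masking out match bits
	 * that precede the buffer, BSR sets ZF if no match is left;
	 * CMOVNZ then either installs the match address or keeps the
	 * zero left in %rax by the AND as the null return value.
	 */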
.Ltailb:
	pcmpeqb		%xmm2, %xmm1
	pcmpeqb		%xmm2, %xmm0
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm0, %eax
	shl		$16, %ecx
	or		%ecx, %eax		# location of matches
	and		%r9d, %eax		# mask out matches before buffer
	bsr		%eax, %edi		# location of match
	lea		(%rdx, %rdi, 1), %rdx	# pointer to match (if any)
	cmovnz		%rdx, %rax		# point to match if present,
	ret					# else null pointer

.Lnomatchb:
	xor		%eax, %eax		# return null pointer
	ret
ARCHEND(memrchr, baseline)

	.section	.note.GNU-stack, "", %progbits