xref: /freebsd/lib/libc/amd64/string/memrchr.S (revision 4b15965daa99044daf184221b7c283bf7f2d7e66)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023, 2025 Robert Clausecker <fuz@FreeBSD.org>
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define	ALIGN_TEXT	.p2align 4, 0x90

ARCHFUNCS(memrchr)
	ARCHFUNC(memrchr, scalar)
	ARCHFUNC(memrchr, baseline)
ENDARCHFUNCS(memrchr)

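/*
 * Scalar implementation: scan the buffer forwards four bytes per
 * iteration and use conditional moves to remember the address of the
 * rightmost match seen so far; the remaining 0--3 bytes are handled
 * after the loop.
 */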
ARCHENTRY(memrchr, scalar)
	xor	%eax, %eax		# prospective return value
	sub	$4, %rdx		# 4 bytes left to process?
	jb	1f

	ALIGN_TEXT
0:	xor	%r8, %r8
	lea	2(%rdi), %r10
	cmp	%sil, 2(%rdi)
	cmovne	%r8, %r10		# point to null if no match

	cmp	%sil, (%rdi)
	cmove	%rdi, %r8		# point to first char if match

	lea	1(%rdi), %r9
	cmp	%sil, 1(%rdi)
	cmovne	%r8, %r9		# point to first result if no match in second

	lea	3(%rdi), %r11
	cmp	%sil, 3(%rdi)
	cmovne	%r10, %r11		# point to third result if no match in fourth

	test	%r11, %r11
	cmovz	%r9, %r11		# take first pair match if none in second

	test	%r11, %r11
	cmovnz	%r11, %rax		# take match in current set if any

	add	$4, %rdi
	sub	$4, %rdx
	jae	0b

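	/* 0--3 bytes (%rdx + 4) remain to be processed at this point */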
1:	cmp	$-3, %edx		# at least one character left to process?
	jb	2f

	cmp	%sil, (%rdi)
	cmove	%rdi, %rax

	lea	1(%rdi), %rcx
	cmp	$-2, %edx		# at least two characters left to process?
	jb	2f

	cmp	%sil, 1(%rdi)
	cmove	%rcx, %rax

	lea	2(%rdi), %rcx
	cmp	$-1, %edx		# at least three characters left to process?
	jb	2f

	cmp	%sil, 2(%rdi)
	cmove	%rcx, %rax

2:	ret
ARCHEND(memrchr, scalar)

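/*
 * Baseline implementation: process the buffer backwards in aligned
 * 32 byte chunks, two 16 byte SSE registers per chunk.  Matches in
 * the bytes of the first and last chunk that lie outside the buffer
 * are masked out of the match bitmaps before a position is taken.
 */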
ARCHENTRY(memrchr, baseline)
	test		%rdx, %rdx		# empty input?
	je		.Lnomatchb


	lea		(%rdi, %rdx, 1), %ecx	# pointer to end of buffer
	lea		-1(%rdi, %rdx, 1), %rdx	# pointer to last char in buffer
	movd		%esi, %xmm2
	and		$~0x1f, %rdx		# pointer to final 32 buffer bytes
	movdqa		(%rdx), %xmm0		# load last 32 bytes
	movdqa		16(%rdx), %xmm1

	punpcklbw	%xmm2, %xmm2		# c -> cc

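	/*
	 * Compute two bit masks indexed by byte position within an
	 * aligned 32 byte chunk: %r8d marks the bytes of the final
	 * chunk that lie before the end of the buffer, %r9d marks the
	 * bytes of the first chunk at or after the start of the buffer.
	 */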
	mov		$-1, %r8d
	neg		%ecx
	mov		%r8d, %r9d
	shr		%cl, %r8d		# mask with zeroes after the string

	punpcklwd	%xmm2, %xmm2		# cc -> cccc

	mov		%edi, %ecx
	mov		%r9d, %eax
	shl		%cl, %r9d		# mask with zeroes before the string

	pshufd		$0, %xmm2, %xmm2	# cccc -> cccccccccccccccc

	cmp		%rdx, %rdi		# tail is beginning of buffer?
	cmovae		%r9d, %eax		# if yes, do combined head/tail processing
	and		%r8d, %eax		# mask of bytes in tail part of string

	/* process tail */
	pcmpeqb		%xmm2, %xmm1
	pcmpeqb		%xmm2, %xmm0
	pmovmskb	%xmm1, %esi
	pmovmskb	%xmm0, %ecx
	shl		$16, %esi
	or		%esi, %ecx		# locations of matches
	and		%ecx, %eax		# any match inside buffer?
	jnz		.Lprecisematchb

	cmp		%rdx, %rdi		# did the buffer begin here?
	jae		.Lnomatchb		# if yes, we are done

	/* main loop */
	ALIGN_TEXT
0:	movdqa		-32(%rdx), %xmm0	# load previous string chunk
	movdqa		-16(%rdx), %xmm1
	sub		$32, %rdx		# beginning of string reached?
	cmp		%rdx, %rdi
	jae		.Ltailb

	pcmpeqb		%xmm2, %xmm0
	pcmpeqb		%xmm2, %xmm1
	por		%xmm1, %xmm0		# match in either half?
	pmovmskb	%xmm0, %eax
	test		%eax, %eax
	jz		0b

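	/*
	 * %xmm1 still holds the comparison result for the upper 16
	 * bytes of the chunk; %xmm0 was folded together with %xmm1 by
	 * the POR above, so the lower 16 bytes are compared again.
	 */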
.Lmatchb:
	pcmpeqb		(%rdx), %xmm2		# redo comparison of first 16 bytes
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm2, %eax
	shl		$16, %ecx
	or		%ecx, %eax		# location of matches

.Lprecisematchb:
	bsr		%eax, %eax		# find location of match
	add		%rdx, %rax		# point to matching byte
	ret

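	/*
	 * The chunk just loaded contains the start of the buffer.
	 * Check it for matches, ignoring any match that lies before
	 * the start of the buffer.
	 */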
.Ltailb:
	pcmpeqb		%xmm2, %xmm1
	pcmpeqb		%xmm2, %xmm0
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm0, %eax
	shl		$16, %ecx
	or		%ecx, %eax		# location of matches
	and		%r9d, %eax		# mask out matches before buffer
	bsr		%eax, %edi		# location of match
	lea		(%rdx, %rdi, 1), %rdx	# pointer to match (if any)
	cmovnz		%rdx, %rax		# point to match if present,
	ret					# else null pointer

.Lnomatchb:
	xor		%eax, %eax		# return null pointer
	ret
ARCHEND(memrchr, baseline)

	.section	.note.GNU-stack, "", %progbits