xref: /freebsd/lib/libc/amd64/string/strrchr.S (revision 2ed514a220edbac6ca5ec9f40a3e0b3f2804796d)
1*2ed514a2SRobert Clausecker/*-
2*2ed514a2SRobert Clausecker * Copyright (c) 2023 The FreeBSD Foundation
3*2ed514a2SRobert Clausecker *
4*2ed514a2SRobert Clausecker * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
5*2ed514a2SRobert Clausecker * under sponsorship from the FreeBSD Foundation.
6*2ed514a2SRobert Clausecker *
7*2ed514a2SRobert Clausecker * Redistribution and use in source and binary forms, with or without
8*2ed514a2SRobert Clausecker * modification, are permitted provided that the following conditions
9*2ed514a2SRobert Clausecker * are met:
10*2ed514a2SRobert Clausecker * 1. Redistributions of source code must retain the above copyright
11*2ed514a2SRobert Clausecker *    notice, this list of conditions and the following disclaimer.
12*2ed514a2SRobert Clausecker * 2. Redistributions in binary form must reproduce the above copyright
13*2ed514a2SRobert Clausecker *    notice, this list of conditions and the following disclaimer in the
14*2ed514a2SRobert Clausecker *    documentation and/or other materials provided with the distribution.
15*2ed514a2SRobert Clausecker *
16*2ed514a2SRobert Clausecker * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
17*2ed514a2SRobert Clausecker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18*2ed514a2SRobert Clausecker * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19*2ed514a2SRobert Clausecker * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20*2ed514a2SRobert Clausecker * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21*2ed514a2SRobert Clausecker * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22*2ed514a2SRobert Clausecker * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23*2ed514a2SRobert Clausecker * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24*2ed514a2SRobert Clausecker * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25*2ed514a2SRobert Clausecker * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26*2ed514a2SRobert Clausecker * SUCH DAMAGE
27*2ed514a2SRobert Clausecker */
28*2ed514a2SRobert Clausecker
29*2ed514a2SRobert Clausecker#include <machine/asm.h>
30*2ed514a2SRobert Clausecker
31*2ed514a2SRobert Clausecker#include "amd64_archlevel.h"
32*2ed514a2SRobert Clausecker
33*2ed514a2SRobert Clausecker#define ALIGN_TEXT	.p2align 4,0x90	# 16-byte alignment, nop-filled
34*2ed514a2SRobert Clausecker
35*2ed514a2SRobert Clausecker	.weak	rindex
36*2ed514a2SRobert Clausecker	.set	rindex, strrchr
37*2ed514a2SRobert Clausecker
38*2ed514a2SRobert ClauseckerARCHFUNCS(strrchr)
39*2ed514a2SRobert Clausecker	ARCHFUNC(strrchr, scalar)
40*2ed514a2SRobert Clausecker	ARCHFUNC(strrchr, baseline)
41*2ed514a2SRobert ClauseckerENDARCHFUNCS(strrchr)
42*2ed514a2SRobert Clausecker
43*2ed514a2SRobert ClauseckerARCHENTRY(strrchr, scalar)
44*2ed514a2SRobert Clausecker	mov	%edi, %ecx
45*2ed514a2SRobert Clausecker	and	$~7, %rdi		# align to 8 byte
46*2ed514a2SRobert Clausecker	movzbl	%sil, %esi		# clear stray high bits
47*2ed514a2SRobert Clausecker	movabs	$0x0101010101010101, %r8
48*2ed514a2SRobert Clausecker	mov	(%rdi), %rax		# load first word
49*2ed514a2SRobert Clausecker	imul	%r8, %rsi		# replicate char 8 times
50*2ed514a2SRobert Clausecker
51*2ed514a2SRobert Clausecker	/*
52*2ed514a2SRobert Clausecker	 * Unaligned input: align to 8 bytes.  Then proceed the same
53*2ed514a2SRobert Clausecker	 * way as with aligned input, but prevent matches before the
54*2ed514a2SRobert Clausecker	 * beginning of the string.  This is achieved by oring 0x01
55*2ed514a2SRobert Clausecker	 * into each byte of the buffer before the string
56*2ed514a2SRobert Clausecker	 */
57*2ed514a2SRobert Clausecker	shl	$3, %ecx
58*2ed514a2SRobert Clausecker	mov	%r8, %r10
59*2ed514a2SRobert Clausecker	shl	%cl, %r10		# 0x01 where the string is
60*2ed514a2SRobert Clausecker	xor	%r8, %r10		# 0x01 where it is not
61*2ed514a2SRobert Clausecker	neg	%r8			# negate 01..01 so we can use lea
62*2ed514a2SRobert Clausecker	movabs	$0x8080808080808080, %r9
63*2ed514a2SRobert Clausecker
64*2ed514a2SRobert Clausecker	mov	%rsi, %rcx
65*2ed514a2SRobert Clausecker	xor	%rax, %rcx		# str ^ c
66*2ed514a2SRobert Clausecker	or	%r10, %rax		# ensure str != 0 before string
67*2ed514a2SRobert Clausecker	or	%r10, %rcx		# ensure str^c != 0 before string
68*2ed514a2SRobert Clausecker	bswap	%rcx			# in reverse order, to find last match
69*2ed514a2SRobert Clausecker	mov	%rdi, %r10		# location of initial mismatch (if any)
70*2ed514a2SRobert Clausecker	xor	%r11, %r11		# initial mismatch (none)
71*2ed514a2SRobert Clausecker	add	$8, %rdi		# advance to next iteration
72*2ed514a2SRobert Clausecker	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
73*2ed514a2SRobert Clausecker	not	%rax			# ~str
74*2ed514a2SRobert Clausecker	and	%rdx, %rax		# (str - 0x01..01) & ~str
75*2ed514a2SRobert Clausecker	and	%r9, %rax		# not including junk bits
76*2ed514a2SRobert Clausecker	jnz	1f			# end of string?
77*2ed514a2SRobert Clausecker
78*2ed514a2SRobert Clausecker	lea	(%rcx, %r8, 1), %rdx	# (str ^ c) - 0x01..01
79*2ed514a2SRobert Clausecker	not	%rcx			# ~(str ^ c)
80*2ed514a2SRobert Clausecker	and	%rdx, %rcx		# ((str ^ c - 0x01..01) & ~(str ^ c)
81*2ed514a2SRobert Clausecker	and	%r9, %rcx		# not including junk bits
82*2ed514a2SRobert Clausecker	mov	%rcx, %r11		# remember mismatch in head
83*2ed514a2SRobert Clausecker	jmp	0f
84*2ed514a2SRobert Clausecker
85*2ed514a2SRobert Clausecker	/* main loop unrolled twice */
86*2ed514a2SRobert Clausecker	ALIGN_TEXT
87*2ed514a2SRobert Clausecker3:	lea	(%rcx, %r8, 1), %rdx	# (str ^ c) - 0x01..01
88*2ed514a2SRobert Clausecker	not	%rcx			# ~(str ^ c)
89*2ed514a2SRobert Clausecker	and	%rdx, %rcx		# ((str ^ c - 0x01..01) & ~(str ^ c)
90*2ed514a2SRobert Clausecker	and	%r9, %rcx		# not including junk bits
91*2ed514a2SRobert Clausecker	lea	-8(%rdi), %rdx
92*2ed514a2SRobert Clausecker	cmovnz	%rdx, %r10		# remember location of current mismatch
93*2ed514a2SRobert Clausecker	cmovnz	%rcx, %r11
94*2ed514a2SRobert Clausecker
95*2ed514a2SRobert Clausecker0:	mov	(%rdi), %rax		# str
96*2ed514a2SRobert Clausecker	mov	%rsi, %rcx
97*2ed514a2SRobert Clausecker	xor	%rax, %rcx		# str ^ c
98*2ed514a2SRobert Clausecker	bswap	%rcx			# in reverse order, to find last match
99*2ed514a2SRobert Clausecker	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
100*2ed514a2SRobert Clausecker	not	%rax			# ~str
101*2ed514a2SRobert Clausecker	and	%rdx, %rax		# (str - 0x01..01) & ~str
102*2ed514a2SRobert Clausecker	and	%r9, %rax		# not including junk bits
103*2ed514a2SRobert Clausecker	jnz	2f			# end of string?
104*2ed514a2SRobert Clausecker
105*2ed514a2SRobert Clausecker	lea	(%rcx, %r8, 1), %rdx	# (str ^ c) - 0x01..01
106*2ed514a2SRobert Clausecker	not	%rcx			# ~(str ^ c)
107*2ed514a2SRobert Clausecker	and	%rdx, %rcx		# ((str ^ c - 0x01..01) & ~(str ^ c)
108*2ed514a2SRobert Clausecker	and	%r9, %rcx		# not including junk bits
109*2ed514a2SRobert Clausecker	cmovnz	%rdi, %r10		# remember location of current mismatch
110*2ed514a2SRobert Clausecker	cmovnz	%rcx, %r11
111*2ed514a2SRobert Clausecker
112*2ed514a2SRobert Clausecker	mov	8(%rdi), %rax		# str
113*2ed514a2SRobert Clausecker	add	$16, %rdi
114*2ed514a2SRobert Clausecker	mov	%rsi, %rcx
115*2ed514a2SRobert Clausecker	xor	%rax, %rcx		# str ^ c
116*2ed514a2SRobert Clausecker	bswap	%rcx
117*2ed514a2SRobert Clausecker	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
118*2ed514a2SRobert Clausecker	not	%rax			# ~str
119*2ed514a2SRobert Clausecker	and	%rdx, %rax		# (str - 0x01..01) & ~str
120*2ed514a2SRobert Clausecker	and	%r9, %rax		# not including junk bits
121*2ed514a2SRobert Clausecker	jz	3b			# end of string?
122*2ed514a2SRobert Clausecker
123*2ed514a2SRobert Clausecker	/* NUL found */
124*2ed514a2SRobert Clausecker1:	sub	$8, %rdi		# undo advance past buffer
125*2ed514a2SRobert Clausecker2:	lea	(%rcx, %r8, 1), %rdx	# (str ^ c) - 0x01..01
126*2ed514a2SRobert Clausecker	not	%rcx			# ~(str ^ c)
127*2ed514a2SRobert Clausecker	and	%rdx, %rcx		# ((str ^ c - 0x01..01) & ~(str ^ c)
128*2ed514a2SRobert Clausecker	and	%r9, %rcx		# not including junk bits
129*2ed514a2SRobert Clausecker	lea	-1(%rax), %rdx
130*2ed514a2SRobert Clausecker	xor	%rdx, %rax		# mask of bytes in the string
131*2ed514a2SRobert Clausecker	bswap	%rdx			# in reverse order
132*2ed514a2SRobert Clausecker	and	%rdx, %rcx		# c found in the tail?
133*2ed514a2SRobert Clausecker	cmovnz	%rdi, %r10
134*2ed514a2SRobert Clausecker	cmovnz	%rcx, %r11
135*2ed514a2SRobert Clausecker	bswap	%r11			# unreverse byte order
136*2ed514a2SRobert Clausecker	bsr	%r11, %rcx		# last location of c in (R10)
137*2ed514a2SRobert Clausecker	shr	$3, %rcx		# as byte offset
138*2ed514a2SRobert Clausecker	lea	(%r10, %rcx, 1), %rax	# pointer to match
139*2ed514a2SRobert Clausecker	test	%r11, %r11		# was there actually a match?
140*2ed514a2SRobert Clausecker	cmovz	%r11, %rax		# if not, return null pointer
141*2ed514a2SRobert Clausecker	ret
142*2ed514a2SRobert ClauseckerARCHEND(strrchr, scalar)
143*2ed514a2SRobert Clausecker
144*2ed514a2SRobert ClauseckerARCHENTRY(strrchr, baseline)
145*2ed514a2SRobert Clausecker	mov		%edi, %ecx
146*2ed514a2SRobert Clausecker	and		$~0xf, %rdi		# align to 16 bytes
147*2ed514a2SRobert Clausecker	movdqa		(%rdi), %xmm1
148*2ed514a2SRobert Clausecker	movd		%esi, %xmm0
149*2ed514a2SRobert Clausecker	and		$0xf, %ecx		# offset from alignment
150*2ed514a2SRobert Clausecker	pxor		%xmm2, %xmm2
151*2ed514a2SRobert Clausecker	mov		$-1, %edx
152*2ed514a2SRobert Clausecker	punpcklbw	%xmm0, %xmm0		# c -> cc
153*2ed514a2SRobert Clausecker	shl		%cl, %edx		# bits corresponding to bytes in the string
154*2ed514a2SRobert Clausecker	punpcklwd	%xmm0, %xmm0		# cc -> cccc
155*2ed514a2SRobert Clausecker	xor		%r8, %r8		# address of latest match
156*2ed514a2SRobert Clausecker	mov		$1, %esi		# bit mask of latest match
157*2ed514a2SRobert Clausecker	mov		%rdi, %r9		# candidate location for next match
158*2ed514a2SRobert Clausecker	add		$16, %rdi		# advance to next chunk
159*2ed514a2SRobert Clausecker
160*2ed514a2SRobert Clausecker	/* check for match in head */
161*2ed514a2SRobert Clausecker	pcmpeqb		%xmm1, %xmm2		# NUL byte present?
162*2ed514a2SRobert Clausecker	pshufd		$0, %xmm0, %xmm0	# cccc -> cccccccccccccccc
163*2ed514a2SRobert Clausecker	pcmpeqb		%xmm0, %xmm1		# c present?
164*2ed514a2SRobert Clausecker	pmovmskb	%xmm2, %eax
165*2ed514a2SRobert Clausecker	pmovmskb	%xmm1, %ecx
166*2ed514a2SRobert Clausecker	and		%edx, %ecx		# c present in the string?
167*2ed514a2SRobert Clausecker	and		%edx, %eax		# NUL present in the string?
168*2ed514a2SRobert Clausecker	jnz		.Lend2
169*2ed514a2SRobert Clausecker
170*2ed514a2SRobert Clausecker	/* main loop unrolled twice */
171*2ed514a2SRobert Clausecker	ALIGN_TEXT
172*2ed514a2SRobert Clausecker0:	movdqa		(%rdi), %xmm1
173*2ed514a2SRobert Clausecker	test		%ecx, %ecx		# was there a match in the last iter.?
174*2ed514a2SRobert Clausecker	cmovnz		%r9, %r8		# remember match if any
175*2ed514a2SRobert Clausecker	cmovnz		%ecx, %esi
176*2ed514a2SRobert Clausecker	pxor		%xmm2, %xmm2
177*2ed514a2SRobert Clausecker	pcmpeqb		%xmm1, %xmm2		# NUL byte present?
178*2ed514a2SRobert Clausecker	pcmpeqb		%xmm0, %xmm1		# c present?
179*2ed514a2SRobert Clausecker	pmovmskb	%xmm2, %eax
180*2ed514a2SRobert Clausecker	pmovmskb	%xmm1, %ecx
181*2ed514a2SRobert Clausecker	test		%eax, %eax		# end of string in first half?
182*2ed514a2SRobert Clausecker	jnz		.Lend
183*2ed514a2SRobert Clausecker
184*2ed514a2SRobert Clausecker	movdqa		16(%rdi), %xmm1
185*2ed514a2SRobert Clausecker	test		%ecx, %ecx		# was there a match in the last iter.?
186*2ed514a2SRobert Clausecker	cmovnz		%rdi, %r8		# remember match if any
187*2ed514a2SRobert Clausecker	cmovnz		%ecx, %esi
188*2ed514a2SRobert Clausecker	pxor		%xmm2, %xmm2
189*2ed514a2SRobert Clausecker	pcmpeqb		%xmm1, %xmm2		# NUL byte present?
190*2ed514a2SRobert Clausecker	pcmpeqb		%xmm0, %xmm1		# c present?
191*2ed514a2SRobert Clausecker	pmovmskb	%xmm2, %eax
192*2ed514a2SRobert Clausecker	pmovmskb	%xmm1, %ecx
193*2ed514a2SRobert Clausecker	lea		16(%rdi), %r9
194*2ed514a2SRobert Clausecker	add		$32, %rdi
195*2ed514a2SRobert Clausecker	test		%eax, %eax		# end of string in second half?
196*2ed514a2SRobert Clausecker	jz		0b
197*2ed514a2SRobert Clausecker
198*2ed514a2SRobert Clausecker	ALIGN_TEXT
199*2ed514a2SRobert Clausecker.Lend2:	sub		$16, %rdi
200*2ed514a2SRobert Clausecker.Lend:	lea 		-1(%rax), %edx
201*2ed514a2SRobert Clausecker	xor		%edx, %eax		# mask of bytes in the string
202*2ed514a2SRobert Clausecker	and		%eax, %ecx		# c found in the tail?
203*2ed514a2SRobert Clausecker	cmovnz		%rdi, %r8
204*2ed514a2SRobert Clausecker	cmovnz		%ecx, %esi
205*2ed514a2SRobert Clausecker	bsr		%esi, %esi		# last location of c in (R8)
206*2ed514a2SRobert Clausecker	lea		(%r8, %rsi, 1), %rax	# pointer to match
207*2ed514a2SRobert Clausecker	ret
208*2ed514a2SRobert ClauseckerARCHEND(strrchr, baseline)
209*2ed514a2SRobert Clausecker	.section .note.GNU-stack,"",%progbits
210