xref: /freebsd/lib/libc/amd64/string/memrchr.S (revision fb197a4f7751bb4e116989e57ba7fb12a981895f)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023 Robert Clausecker
 */
6*fb197a4fSRobert Clausecker
#include <machine/asm.h>

#include "amd64_archlevel.h"

/*
 * Alignment used in front of hot loop heads: align to 2^4 = 16 bytes,
 * filling the gap with 0x90 (the single-byte x86 NOP).
 */
#define	ALIGN_TEXT	.p2align 4, 0x90
12*fb197a4fSRobert Clausecker
/*
 * List of the memrchr implementations defined in this file.  The
 * ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS macros are not defined here (they are
 * expected to come from amd64_archlevel.h); presumably they build the
 * table from which one variant is picked at run time -- confirm there.
 */
ARCHFUNCS(memrchr)
	ARCHFUNC(memrchr, scalar)
	ARCHFUNC(memrchr, baseline)
ENDARCHFUNCS(memrchr)
17*fb197a4fSRobert Clausecker
/*
 * memrchr(b, c, len) -- scalar implementation
 *
 * Syntax:  AT&T.  Register use follows the System V AMD64 convention.
 * In:      %rdi = b   (start of the buffer)
 *          %sil = c   (only the low byte of the second argument is used)
 *          %rdx = len (number of bytes in the buffer)
 * Out:     %rax = address of the last byte equal to c, NULL if there is none
 * Scratch: %rcx, %r8-%r11, flags.  No stack and no callee-saved register
 *          is touched.
 *
 * The buffer is scanned front to back.  %rax always holds the address of
 * the most recent match seen so far (NULL initially), so when the end of
 * the buffer is reached it holds the last occurrence.
 */
ARCHENTRY(memrchr, scalar)
	xor	%eax, %eax		# prospective return value: NULL
	sub	$4, %rdx		# at least 4 bytes left to process?
	jb	1f			# if not, only the tail is left

	/*
	 * Main loop: 4 bytes per iteration, reduced to the highest matching
	 * address with conditional moves only (no data-dependent branches):
	 *
	 *   r8  = b[0] == c ? &b[0] : NULL
	 *   r9  = b[1] == c ? &b[1] : r8	(last match in first pair)
	 *   r10 = b[2] == c ? &b[2] : NULL
	 *   r11 = b[3] == c ? &b[3] : r10	(last match in second pair)
	 *
	 * Invariant: rdx = (bytes left, counted from rdi) - 4.
	 */
	ALIGN_TEXT
0:	xor	%r8d, %r8d		# r8 = NULL (32-bit write clears all of r8)
	lea	2(%rdi), %r10
	cmp	%sil, 2(%rdi)
	cmovne	%r8, %r10		# point to null if no match

	cmp	%sil, (%rdi)
	cmove	%rdi, %r8		# point to first char if match

	lea	1(%rdi), %r9
	cmp	%sil, 1(%rdi)
	cmovne	%r8, %r9		# point to first result if no match in second

	lea	3(%rdi), %r11
	cmp	%sil, 3(%rdi)
	cmovne	%r10, %r11		# point to third result if no match in fourth

	test	%r11, %r11
	cmovz	%r9, %r11		# take first pair match if none in second

	test	%r11, %r11		# r11 may just have changed: test again
	cmovnz	%r11, %rax		# take match in current set if any

	add	$4, %rdi
	sub	$4, %rdx
	jae	0b			# no borrow: another 4 bytes are available

	/*
	 * Tail: 0--3 bytes are left.  rdx holds (bytes left) - 4, i.e. one of
	 * -4, -3, -2, -1, so unsigned comparisons of edx against -3, -2 and -1
	 * tell whether at least 1, 2 or 3 bytes remain.  The bytes are checked
	 * in ascending order so that a later match overrides an earlier one.
	 */
1:	cmp	$-3, %edx		# at least one character left to process?
	jb	2f

	cmp	%sil, (%rdi)
	cmove	%rdi, %rax

	lea	1(%rdi), %rcx
	cmp	$-2, %edx		# at least two characters left to process?
	jb	2f

	cmp	%sil, 1(%rdi)
	cmove	%rcx, %rax

	lea	2(%rdi), %rcx
	cmp	$-1, %edx		# at least three characters left to process?
	jb	2f

	cmp	%sil, 2(%rdi)
	cmove	%rcx, %rax

2:	ret
ARCHEND(memrchr, scalar)
72*fb197a4fSRobert Clausecker
/*
 * memrchr(b, c, len) -- baseline (SSE2) implementation
 *
 * Syntax:  AT&T.  Register use follows the System V AMD64 convention.
 * In:      %rdi = b   (start of the buffer)
 *          %esi = c   (only the low byte is broadcast and compared)
 *          %rdx = len (number of bytes in the buffer)
 * Out:     %rax = address of the last byte equal to c, NULL if there is none
 * Scratch: %rcx, %rsi, %rdi, %r8, %r9, %xmm0, %xmm4, flags.  No stack and
 *          no callee-saved register is touched.
 *
 * The buffer is processed front to back in 16-byte chunks that are aligned
 * to 16 bytes.  The first and the last chunk may therefore include bytes
 * outside the buffer; an aligned 16-byte load never straddles a page, so
 * this cannot fault, and the corresponding bits are masked out of the
 * compare result.
 *
 * State carried through the function:
 *   %xmm4 = c replicated into all 16 bytes
 *   %eax  = match bitmask of the most recent chunk that had a match
 *           (bit i set <=> byte i of the chunk equals c), 0 if none so far
 *   %rsi  = address of the chunk %eax belongs to
 *   %rdi  = address of the chunk to be examined next
 *   %rdx  = (bytes left, counted from %rdi) - 16
 */
ARCHENTRY(memrchr, baseline)
	movd		%esi, %xmm4
	test		%rdx, %rdx		# empty buffer?
	jz		.L0			# if yes, return immediately

	punpcklbw	%xmm4, %xmm4		# c -> cc
	mov		%edi, %ecx
	punpcklwd	%xmm4, %xmm4		# cc -> cccc
	and		$~0xf, %rdi		# align source pointer
	pshufd		$0, %xmm4, %xmm4	# cccc -> cccccccccccccccc
	and		$0xf, %ecx		# ecx = misalignment of b (0--15)
	movdqa		%xmm4, %xmm0
	mov		$-1, %r8d
	pcmpeqb		(%rdi), %xmm0		# compare aligned head
	shl		%cl, %r8d		# mask of bytes in the head of the buffer
	pmovmskb	%xmm0, %eax

	sub		$16, %rcx		# rcx = -(bytes from b to the end of the head chunk)
	and		%r8d, %eax		# match mask
	add		%rcx, %rdx		# advance past head; carry <=> len covers the head
	cmc					# CF = 1 if the buffer ends before the chunk does
	jbe		.Lrunt			# did the string end in the buffer?

	mov		%rdi, %rsi		# pointer to matching chunk
	add		$16, %rdi
	sub		$16, %rdx		# enough left for another round?
	jbe		1f			# 1--16 bytes left: only the tail remains

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	movdqa		%xmm4, %xmm0
	pcmpeqb		(%rdi), %xmm0
	pmovmskb	%xmm0, %r8d		# r8d = matches in first chunk (fully valid)

	cmp		$16, %rdx		# enough left for second chunk?
	jbe		2f			# no: second chunk is the (partial) tail

	movdqa		%xmm4, %xmm0
	pcmpeqb		16(%rdi), %xmm0
	pmovmskb	%xmm0, %ecx		# ecx = matches in second chunk (fully valid)

	lea		16(%rdi), %r9
	test		%ecx, %ecx		# match found in second chunk?
	cmovz		%r8d, %ecx		# if not, use match data from first chunk
	cmovz		%rdi, %r9

	test		%ecx, %ecx		# any match found?
	cmovnz		%ecx, %eax		# if yes, overwrite previously found match
	cmovnz		%r9, %rsi

	add		$32, %rdi		# advance to next iteration
	sub		$32, %rdx		# advance to next chunks
	ja		0b

	/*
	 * process remaining 1--16 bytes
	 *
	 * edx = (bytes left) - 16 is in -15..0 here, so -edx is the number of
	 * bytes at the top of this chunk that lie beyond the buffer.
	 */
1:	pcmpeqb		(%rdi), %xmm4
	mov		$0xffff, %r8d
	xor		%ecx, %ecx
	sub		%edx, %ecx		# number of bytes to be masked out
	pmovmskb	%xmm4, %r9d
	shr		%cl, %r8d		# mask of bytes to be kept in the buffer
	and		%r9d, %r8d
	cmovnz		%r8d, %eax
	cmovnz		%rdi, %rsi
	/*
	 * bsr goes to a scratch register: with a zero source only ZF is
	 * architecturally defined, so the NULL return value must not come
	 * from the bsr destination.  rax is already 0 when eax is 0, as
	 * every write to eax above was a zero-extending 32-bit write.
	 */
	bsr		%eax, %ecx		# index of last match, ZF = 1 if none
	lea		(%rsi, %rcx, 1), %rcx	# pointer to match (or junk)
	cmovnz		%rcx, %rax		# if any match was found, return it
	ret

	/*
	 * buffer ends within the second chunk of this iteration: commit the
	 * (fully valid) first chunk, then treat the second chunk as the tail
	 */
2:	test		%r8d, %r8d		# match in previous chunk?
	cmovnz		%r8d, %eax		# if yes, overwrite previous chunks
	cmovnz		%rdi, %rsi
	add		$16, %rdi		# point to tail
	sub		$16, %edx		# only edx is read from here on
	jmp		1b			# handle tail the same otherwise

	/* runt: string ends within head, edx has negated amount of invalid head bytes */
.Lrunt:	mov		$0xffff, %r8d
	xor		%ecx, %ecx
	sub		%edx, %ecx		# number of bytes past the end of the buffer
	shr		%cl, %r8d		# mask of bytes before the end of the buffer
	and		%r8d, %eax		# eax was already masked for the buffer start
	bsr		%eax, %ecx		# scratch destination, see tail above
	lea		(%rdi, %rcx, 1), %rcx	# pointer to match (or junk)
	cmovnz		%rcx, %rax		# if no match, rax stays 0 (NULL)
	ret

	/* empty buffer: return a null pointer */
.L0:	xor		%eax, %eax
	ret
ARCHEND(memrchr, baseline)
165*fb197a4fSRobert Clausecker
166*fb197a4fSRobert Clausecker	.section	.note.GNU-stack, "", %progbits
167