xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strrchr.S (revision 072a4ba82a01476eaee33781ccd241033eefcf0b)
131914882SAlex Richardson/*
231914882SAlex Richardson * strrchr - find last position of a character in a string.
331914882SAlex Richardson *
4*072a4ba8SAndrew Turner * Copyright (c) 2014-2022, Arm Limited.
5*072a4ba8SAndrew Turner * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
631914882SAlex Richardson */
731914882SAlex Richardson
831914882SAlex Richardson/* Assumptions:
931914882SAlex Richardson *
1031914882SAlex Richardson * ARMv8-a, AArch64
1131914882SAlex Richardson * Neon Available.
1231914882SAlex Richardson */
1331914882SAlex Richardson
14*072a4ba8SAndrew Turner#include "asmdefs.h"
1531914882SAlex Richardson
1631914882SAlex Richardson/* Arguments and results.  */
/* srcin is the incoming string pointer and chrin the character to look
   for; only the low byte of chrin is used (it is broadcast with a
   byte-lane DUP).  result names the same register as srcin (x0).  */
1731914882SAlex Richardson#define srcin		x0
1831914882SAlex Richardson#define chrin		w1
1931914882SAlex Richardson
2031914882SAlex Richardson#define result		x0
2131914882SAlex Richardson
/* General-purpose scratch registers.
   src        - read pointer; post-incremented by 32 on every hunk load.
   tmp1       - offset of srcin inside its 32-byte hunk, then the shift
                count derived from it for the padding mask.
   wtmp2      - holds the 0x40100401 lane-mask constant.
   tmp3       - padding mask in the unaligned prologue; leading-zero
                count in the tail.
   src_match  - value of src (already past the hunk) at the time a match
                was last recorded.
   src_offset - character syndrome of that hunk; zero means no match has
                been recorded.
   const_m1   - all-ones value the padding mask is shifted out of.
   tmp4       - mask of syndrome bits up to and including the first NUL.
   nul_match  - NUL indicator / NUL syndrome of the current hunk.
   chr_match  - character syndrome of the current hunk.  */
2231914882SAlex Richardson#define src		x2
2331914882SAlex Richardson#define	tmp1		x3
2431914882SAlex Richardson#define wtmp2		w4
2531914882SAlex Richardson#define tmp3		x5
2631914882SAlex Richardson#define src_match	x6
2731914882SAlex Richardson#define src_offset	x7
2831914882SAlex Richardson#define const_m1	x8
2931914882SAlex Richardson#define tmp4		x9
3031914882SAlex Richardson#define nul_match	x10
3131914882SAlex Richardson#define chr_match	x11
3231914882SAlex Richardson
/* Vector registers.
   vrepchr     - search byte replicated into all 16 byte lanes.
   vdata1/2    - the two 16-byte halves of the current 32-byte hunk.
   vhas_nul1/2 - per-byte results of the compare against zero.
   vhas_chr1/2 - per-byte results of the compare against vrepchr.
   vrepmask_0  - vrepmask_c added to itself (0x80200802 per 32-bit lane);
                 selects the NUL bit of each 2-bit tuple.
   vrepmask_c  - 0x40100401 per 32-bit lane; selects the character bit of
                 each 2-bit tuple.
   vend1       - reduction result: NUL half in d[0], character half in
                 d[1].
   vend2       - not referenced by any instruction visible in this file.  */
3331914882SAlex Richardson#define vrepchr		v0
3431914882SAlex Richardson#define vdata1		v1
3531914882SAlex Richardson#define vdata2		v2
3631914882SAlex Richardson#define vhas_nul1	v3
3731914882SAlex Richardson#define vhas_nul2	v4
3831914882SAlex Richardson#define vhas_chr1	v5
3931914882SAlex Richardson#define vhas_chr2	v6
4031914882SAlex Richardson#define vrepmask_0	v7
4131914882SAlex Richardson#define vrepmask_c	v16
4231914882SAlex Richardson#define vend1		v17
4331914882SAlex Richardson#define vend2		v18
4431914882SAlex Richardson
4531914882SAlex Richardson/* Core algorithm.
4631914882SAlex Richardson
4731914882SAlex Richardson   For each 32-byte hunk we calculate a 64-bit syndrome value, with
4831914882SAlex Richardson   two bits per byte (LSB is always in bits 0 and 1, for both big
4931914882SAlex Richardson   and little-endian systems).  For each tuple, bit 0 is set iff
5031914882SAlex Richardson   the relevant byte matched the requested character; bit 1 is set
5131914882SAlex Richardson   iff the relevant byte matched the NUL end of string (we trigger
5231914882SAlex Richardson   off bit0 for the special case of looking for NUL).  Since the bits
5331914882SAlex Richardson   in the syndrome reflect exactly the order in which things occur
5431914882SAlex Richardson   in the original string a count_trailing_zeros() operation will
5531914882SAlex Richardson   identify exactly which byte is causing the termination, and why.  */
5631914882SAlex Richardson
/* __strrchr_aarch64 - locate the last occurrence of a byte in a string.

   In:    srcin (x0) = string pointer; chrin (w1) = character, of which
          only the low byte is used.
   Out:   result (x0) = address of the last byte equal to that character,
          searching up to and including the terminating NUL, or zero if
          there is no such byte.
   Uses:  x2-x11, v0-v7, v16, v17 and the condition flags.  The
          instructions visible here perform no stores.
   Note:  memory is always read as whole 32-byte aligned hunks, so bytes
          in front of srcin and behind the terminator inside those hunks
          are loaded as well.

   NOTE(review): ENTRY, END, L and PTR_ARG are not defined in this file
   (presumably they come from asmdefs.h) - confirm their expansion.  */
5731914882SAlex RichardsonENTRY (__strrchr_aarch64)
5831914882SAlex Richardson	PTR_ARG (0)
5931914882SAlex Richardson	/* Magic constant 0x40100401 to allow us to identify which lane
6031914882SAlex Richardson	   matches the requested byte.  Magic constant 0x80200802 used
6131914882SAlex Richardson	   similarly for NUL termination.  */
	/* Set-up, interleaved: build both lane masks, broadcast the search
	   byte, round src down to a 32-byte boundary, clear src_offset
	   (zero means "no match recorded") and test the alignment of
	   srcin.  An already aligned pointer skips the masked first hunk.  */
6231914882SAlex Richardson	mov	wtmp2, #0x0401
6331914882SAlex Richardson	movk	wtmp2, #0x4010, lsl #16
6431914882SAlex Richardson	dup	vrepchr.16b, chrin
6531914882SAlex Richardson	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
6631914882SAlex Richardson	dup	vrepmask_c.4s, wtmp2
6731914882SAlex Richardson	mov	src_offset, #0
6831914882SAlex Richardson	ands	tmp1, srcin, #31
6931914882SAlex Richardson	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
7031914882SAlex Richardson	b.eq	L(aligned)
7131914882SAlex Richardson
7231914882SAlex Richardson	/* Input string is not 32-byte aligned.  Rather than forcing
7331914882SAlex Richardson	   the padding bytes to a safe value, we calculate the syndrome
7431914882SAlex Richardson	   for all the bytes, but then mask off those bits of the
7531914882SAlex Richardson	   syndrome that are related to the padding.  */
7631914882SAlex Richardson	ld1	{vdata1.16b, vdata2.16b}, [src], #32
7731914882SAlex Richardson	neg	tmp1, tmp1
7831914882SAlex Richardson	cmeq	vhas_nul1.16b, vdata1.16b, #0
7931914882SAlex Richardson	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
8031914882SAlex Richardson	cmeq	vhas_nul2.16b, vdata2.16b, #0
8131914882SAlex Richardson	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Each compare result is all-ones or zero per byte.  ANDing with
	   the lane masks keeps one distinct bit per byte within every group
	   of four, so the two pairwise adds below cannot carry and pack 32
	   bytes into 64 bits, two bits per byte.  */
8231914882SAlex Richardson	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
8331914882SAlex Richardson	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
8431914882SAlex Richardson	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
8531914882SAlex Richardson	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
8631914882SAlex Richardson	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
8731914882SAlex Richardson	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
8831914882SAlex Richardson	addp	vend1.16b, vhas_nul1.16b, vhas_chr1.16b		// 128->64
	/* vend1.d[0] is the NUL syndrome, vend1.d[1] the character syndrome.
	   tmp1 becomes -2 * (srcin & 31); a register LSR uses its count
	   modulo 64, so tmp3 = ~0 >> (64 - 2 * misalignment), i.e. exactly
	   the syndrome bits of the bytes that lie in front of srcin.  */
8931914882SAlex Richardson	mov	nul_match, vend1.d[0]
9031914882SAlex Richardson	lsl	tmp1, tmp1, #1
9131914882SAlex Richardson	mov	const_m1, #~0
9231914882SAlex Richardson	lsr	tmp3, const_m1, tmp1
9331914882SAlex Richardson	mov	chr_match, vend1.d[1]
9431914882SAlex Richardson
9531914882SAlex Richardson	bic	nul_match, nul_match, tmp3	// Mask padding bits.
9631914882SAlex Richardson	bic	chr_match, chr_match, tmp3	// Mask padding bits.
	/* String ends inside the first hunk: nul_match is already an exact
	   syndrome, so go straight to the tail.  Otherwise fall into the
	   loop, which records this hunk's chr_match first.  */
9731914882SAlex Richardson	cbnz	nul_match, L(tail)
9831914882SAlex Richardson
	/* Main loop.  On entry at L(loop), chr_match belongs to the hunk
	   just processed and src already points past it; remember both if
	   that hunk contained a match.  Entry at L(aligned) skips this
	   because no hunk has been examined yet.  */
9931914882SAlex Richardson	.p2align 4
10031914882SAlex RichardsonL(loop):
10131914882SAlex Richardson	cmp	chr_match, #0
10231914882SAlex Richardson	csel	src_match, src, src_match, ne
10331914882SAlex Richardson	csel	src_offset, chr_match, src_offset, ne
10431914882SAlex RichardsonL(aligned):
	/* Cheap NUL test: UMINP takes pairwise minima over the 32 bytes, so
	   a zero byte in its result means one byte of that pair was NUL.
	   After the reduction nul_match is only a zero / non-zero indicator
	   here, not a positional syndrome; chr_match is a full syndrome.  */
10531914882SAlex Richardson	ld1	{vdata1.16b, vdata2.16b}, [src], #32
10631914882SAlex Richardson	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
10731914882SAlex Richardson	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
10831914882SAlex Richardson	uminp	vend1.16b, vdata1.16b, vdata2.16b
10931914882SAlex Richardson	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
11031914882SAlex Richardson	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
11131914882SAlex Richardson	cmeq	vend1.16b, vend1.16b, 0
11231914882SAlex Richardson	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
11331914882SAlex Richardson	addp	vend1.16b, vend1.16b, vhas_chr1.16b		// 128->64
11431914882SAlex Richardson	mov	nul_match, vend1.d[0]
11531914882SAlex Richardson	mov	chr_match, vend1.d[1]
11631914882SAlex Richardson	cbz	nul_match, L(loop)
11731914882SAlex Richardson
	/* A NUL is somewhere in this hunk (vdata1/vdata2 still hold it).
	   Compute the exact two-bits-per-byte NUL syndrome, which the loop
	   avoided doing on every iteration.  */
11831914882SAlex Richardson	cmeq	vhas_nul1.16b, vdata1.16b, #0
11931914882SAlex Richardson	cmeq	vhas_nul2.16b, vdata2.16b, #0
12031914882SAlex Richardson	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
12131914882SAlex Richardson	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
12231914882SAlex Richardson	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
12331914882SAlex Richardson	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
12431914882SAlex Richardson	mov	nul_match, vhas_nul1.d[0]
12531914882SAlex Richardson
12631914882SAlex RichardsonL(tail):
12731914882SAlex Richardson	/* Work out exactly where the string ends.  */
	/* (nul_match - 1) ^ nul_match sets every bit up to and including
	   the lowest set NUL bit.  A byte's NUL bit sits one position above
	   its character bit, so a character match on the terminator itself
	   survives the AND while matches behind the terminator are
	   dropped.  */
12831914882SAlex Richardson	sub	tmp4, nul_match, #1
12931914882SAlex Richardson	eor	tmp4, tmp4, nul_match
13031914882SAlex Richardson	ands	chr_match, chr_match, tmp4
13131914882SAlex Richardson	/* And pick the values corresponding to the last match.  */
13231914882SAlex Richardson	csel	src_match, src, src_match, ne
13331914882SAlex Richardson	csel	src_offset, chr_match, src_offset, ne
13431914882SAlex Richardson
13531914882SAlex Richardson	/* Count down from the top of the syndrome to find the last match.  */
13631914882SAlex Richardson	clz	tmp3, src_offset
13731914882SAlex Richardson	/* Src_match points beyond the word containing the match, so we can
13831914882SAlex Richardson	   simply subtract half the bit-offset into the syndrome.  Because
13931914882SAlex Richardson	   we are counting down, we need to go back one more character.  */
	/* The top set bit of src_offset is bit 2n for byte n of its hunk,
	   so clz = 63 - 2n and (clz + 2) >> 1 = 32 - n; src_match is the
	   hunk address plus 32, giving hunk + n.  If no match was ever
	   recorded src_match has never been written and this address is
	   meaningless; the final CSEL replaces it with zero.  */
14031914882SAlex Richardson	add	tmp3, tmp3, #2
14131914882SAlex Richardson	sub	result, src_match, tmp3, lsr #1
14231914882SAlex Richardson	/* But if the syndrome shows no match was found, then return NULL.  */
14331914882SAlex Richardson	cmp	src_offset, #0
14431914882SAlex Richardson	csel	result, result, xzr, ne
14531914882SAlex Richardson
14631914882SAlex Richardson	ret
14731914882SAlex Richardson
14831914882SAlex RichardsonEND (__strrchr_aarch64)
14931914882SAlex Richardson
150