/* xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strchr.S
   (revision f3087bef11543b42e0d69b708f367097a4118d24)  */
/*
 * strchr - find a character in a string
 *
 * Copyright (c) 2014-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "asmdefs.h"

/* Register aliases used by __strchr_aarch64.

   Every register named here (x0-x5, v0-v7, v16-v18) is caller-saved
   under AAPCS64.  The explanatory comments are kept on their own lines
   so that no comment text can become part of a macro body.  */

/* Arguments and results.
   srcin  - address of the string, as passed by the caller.
   chrin  - byte to search for; it is replicated into every lane of
	    vrepchr.
   result - return value; shares x0 with srcin.  */
#define srcin		x0
#define chrin		w1

#define result		x0

/* General-purpose scratch registers.
   src   - address of the next 32-byte hunk to load (post-incremented
	   by each load).
   tmp1  - alignment offset, then padding mask, then syndrome.
   wtmp2 - holds the 0xc0300c03 magic constant while it is built.
   tmp3  - all-ones constant, then the raw syndrome in the unaligned
	   prologue.  */
#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5

/* Vector registers.
   vrepchr      - the search byte in all 16 lanes.
   vdata1/2     - the two 16-byte halves of the current 32-byte hunk.
   vhas_nul1/2  - per-byte NUL flags for each half (in the main loop
		  these also flag a character match).
   vhas_chr1/2  - per-byte character-match flags for each half.
   vrepmask_c   - 0xc0300c03 in every 32-bit lane; selects the two
		  syndrome bits that belong to each byte.
   vrepmask_0   - vrepmask_c doubled; selects which bits are kept from
		  the NUL flags when they are merged with the match flags.
   vend1/2      - masked flags being reduced into the 64-bit syndrome.  */
#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask_0	v7
#define vrepmask_c	v16
#define vend1		v17
#define vend2		v18

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character; bit 1 is set
   if the relevant byte matched the NUL end of string (we trigger
   off bit0 for the special case of looking for NUL).  In the main
   loop bit 1 is additionally set on a character match; this is
   harmless because bit 0 of the same tuple is lower and is therefore
   seen first.  Since the bits in the syndrome reflect exactly the
   order in which things occur in the original string, a
   count_trailing_zeros() operation will identify exactly which byte
   is causing the termination, and why.  */

/* Locals and temporaries.  */

/* __strchr_aarch64 - find the first occurrence of a byte in a
   NUL-terminated string.

   In:	  srcin (x0) = address of the string.
	  chrin (w1) = byte to look for.  Only the low 8 bits take part:
		       they are replicated into the byte lanes of vrepchr.
   Out:	  result (x0) = address of the first matching byte, or 0 when
			the terminating NUL is reached first.  Searching
			for NUL itself returns the address of the
			terminator (the match bit is the lower one).
   Uses:  x2-x5, v0-v7, v16-v18 and the condition flags.  The code below
	  makes no calls and does not touch the stack.

   The string is always read as whole 32-byte hunks from 32-byte aligned
   addresses, so bytes in front of srcin and behind the terminator that
   share a hunk with the string are loaded and then ignored.

   ENTRY, END and L() are not defined in this file; presumably they are
   provided by asmdefs.h (TODO confirm what ENTRY/END expand to).  */
ENTRY (__strchr_aarch64)
	/* Magic constant 0xc0300c03 to allow us to identify which lane
	   matches the requested byte.  Even bits are set if the character
	   matches, odd bits if either the char is NUL or matches.
	   Seen as bytes in memory order the constant is 03 0c 30 c0, so
	   byte k of every group of four owns bits 2k and 2k+1; adding the
	   four bytes of a group later packs them into one syndrome byte.  */
	mov	wtmp2, 0x0c03
	movk	wtmp2, 0xc030, lsl 16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask_c.4s, wtmp2
	ands	tmp1, srcin, #31	/* tmp1 = offset in hunk; sets Z.  */
	/* The doubled mask (0x80601806 per lane) has a 1 at every odd bit
	   position of vrepmask_c; BIF below keeps the NUL flag there and
	   takes the match flag everywhere else.  */
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
	/* Flags are still those of the ANDS: go straight to the main loop
	   when the string starts on a 32-byte boundary.  */
	b.eq	L(loop)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1		/* tmp1 = -offset.  */
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Merge: odd syndrome bits from the NUL flags, even bits from the
	   match flags, then keep only the two bits owned by each byte.  */
	bif	vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
	bif	vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
	and	vend1.16b, vhas_nul1.16b, vrepmask_c.16b
	and	vend2.16b, vhas_nul2.16b, vrepmask_c.16b
	lsl	tmp1, tmp1, #1		/* tmp1 = -2 * offset.  */
	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
	mov	tmp3, #~0
	/* Only the low 64 bits of the next result are used.  */
	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
	/* The register shift amount is taken modulo 64, so this shifts
	   all-ones right by 64 - 2 * offset and leaves exactly the
	   2 * offset low bits set: the syndrome bits of the padding.  */
	lsr	tmp1, tmp3, tmp1

	mov	tmp3, vend1.d[0]
	bic	tmp1, tmp3, tmp1	// Mask padding bits.
	cbnz	tmp1, L(tail)

	/* Nothing found in the first hunk: fall through into the loop,
	   whose head is aligned to 16 bytes.  */
	.p2align 4
L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Unsigned (match flag >= data byte) is true when the data byte
	   is 0 or when the match flag is 0xff, so this single compare
	   flags "NUL or match" for every byte.  */
	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
	orr	vend1.16b, vhas_nul1.16b, vhas_nul2.16b
	/* Pairwise maximum folds the 16 flag bytes into the low 64 bits,
	   which are non-zero iff some byte of the hunk was flagged.  */
	umaxp	vend1.16b, vend1.16b, vend1.16b
	mov	tmp1, vend1.d[0]
	cbz	tmp1, L(loop)

	/* Termination condition found.  Now need to establish exactly why
	   we terminated.  Build the two-bits-per-byte syndrome in the same
	   way as the unaligned prologue does.  */
	bif	vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
	bif	vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
	and	vend1.16b, vhas_nul1.16b, vrepmask_c.16b
	and	vend2.16b, vhas_nul2.16b, vrepmask_c.16b
	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
	mov	tmp1, vend1.d[0]
L(tail):
	/* Here tmp1 is a non-zero syndrome and src points 32 bytes past
	   the start of the hunk it describes (on both paths the load
	   post-incremented src).  */
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source.  */
	sub	src, src, #32
	clz	tmp1, tmp1	/* And counting the leading zeros.  */
	/* Tmp1 is even if the target character was found first.  Otherwise
	   we've found the end of string and we weren't looking for NUL.  */
	tst	tmp1, #1
	/* Two syndrome bits per byte, so the byte index is tmp1 / 2.  */
	add	result, src, tmp1, lsr #1
	csel	result, result, xzr, eq	/* Odd bit index: return 0.  */
	ret

END (__strchr_aarch64)
