/* xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strrchr.S (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7) */
/*
 * strrchr - find last position of a character in a string.
 *
 * Copyright (c) 2014-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "asmdefs.h"

/* Register aliases used by __strrchr_aarch64.  Only x0-x11, v0-v7 and
   v16-v18 are named here.  */

/* Arguments and results.  */
#define srcin		x0	/* In:  pointer to the string.  */
#define chrin		w1	/* In:  character to find (low byte is used).  */

#define result		x0	/* Out: pointer to the last match, or NULL.  */

/* General-purpose working registers.  */
#define src		x2	/* Cursor; post-incremented by 32 on each load.  */
#define tmp1		x3	/* Misalignment of srcin, then a shift count.  */
#define wtmp2		w4	/* Holds the magic constant 0x40100401.  */
#define tmp3		x5	/* Padding mask; later the clz result.  */
#define src_match	x6	/* Value of src saved for the last hunk with a match.  */
#define src_offset	x7	/* Character syndrome of that hunk (0 = no match).  */
#define const_m1	x8	/* All ones.  */
#define tmp4		x9	/* Mask of syndrome bits up to the first NUL.  */
#define nul_match	x10	/* NUL syndrome / NUL-present indicator.  */
#define chr_match	x11	/* Character syndrome of the current hunk.  */

/* Vector working registers.  */
#define vrepchr		v0	/* chrin replicated into every byte lane.  */
#define vdata1		v1	/* First 16 bytes of the current hunk.  */
#define vdata2		v2	/* Second 16 bytes of the current hunk.  */
#define vhas_nul1	v3	/* Per-byte NUL compare results for vdata1.  */
#define vhas_nul2	v4	/* Per-byte NUL compare results for vdata2.  */
#define vhas_chr1	v5	/* Per-byte chrin compare results for vdata1.  */
#define vhas_chr2	v6	/* Per-byte chrin compare results for vdata2.  */
#define vrepmask_0	v7	/* 0x80200802 per word: NUL syndrome bits.  */
#define vrepmask_c	v16	/* 0x40100401 per word: character syndrome bits.  */
#define vend1		v17	/* Packed result: d[0] = NUL, d[1] = character.  */
#define vend2		v18	/* Not referenced by the code in this file.  */

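/* Core algorithm.

   For each 32-byte hunk we calculate 64-bit syndrome values with two
   bits per byte (the lowest-addressed byte is always in bits 0 and 1,
   for both big and little-endian systems).  In the character syndrome
   (chr_match), bit 0 of each tuple is set iff the relevant byte matched
   the requested character; in the NUL syndrome (nul_match), bit 1 of
   each tuple is set iff the relevant byte is the NUL end of string.
   The main loop only needs to know whether a hunk contains a NUL, so
   the exact NUL syndrome is computed only for the first (unaligned)
   hunk and for the hunk in which the string ends.

   Since the bits in the syndromes reflect exactly the order in which
   things occur in the original string, the lowest set bit of the NUL
   syndrome identifies where the string ends.  The character syndrome
   is masked down to the bytes up to and including that NUL (the
   character bit sits below the NUL bit of the same tuple, which covers
   the special case of looking for NUL), and a count_leading_zeros()
   operation on the result identifies the last matching byte.  */
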
/* char *strrchr (const char *s, int c)

   In:       srcin (x0) = s, chrin (w1) = c; only the low byte of c is
             compared, because it is replicated into byte lanes.
   Out:      result (x0) = address of the last byte equal to c at or
             before the terminating NUL, or NULL if there is none.
   Clobbers: x2-x11, v0-v7, v16, v17, condition flags.
   Memory:   the string is read in 32-byte hunks starting at srcin
             rounded down to a multiple of 32, so bytes before srcin and
             after the NUL inside those hunks are loaded (and ignored).
   Stack:    none.  */
ENTRY (__strrchr_aarch64)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the requested byte.  Magic constant 0x80200802 used
	   similarly for NUL termination.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask_c.4s, wtmp2
	mov	src_offset, #0		/* No match recorded yet.  */
	ands	tmp1, srcin, #31	/* tmp1 = bytes of padding before srcin.  */
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
	b.eq	L(aligned)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vhas_nul1.16b, vhas_chr1.16b		// 128->64
	mov	nul_match, vend1.d[0]
	/* tmp1 = -2 * padding.  A register-specified shift uses the count
	   modulo 64, so the lsr below shifts by 64 - 2 * padding and
	   leaves tmp3 with exactly the low 2 * padding bits set: the
	   syndrome bits of the bytes that precede srcin.  */
	lsl	tmp1, tmp1, #1
	mov	const_m1, #~0
	lsr	tmp3, const_m1, tmp1
	mov	chr_match, vend1.d[1]

	bic	nul_match, nul_match, tmp3	// Mask padding bits.
	bic	chr_match, chr_match, tmp3	// Mask padding bits.
	cbnz	nul_match, L(tail)

	.p2align 4
L(loop):
	/* The previous hunk contained no NUL.  If it contained a match,
	   remember it: src already points just past that hunk.  */
	cmp	chr_match, #0
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne
L(aligned):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* A pairwise minimum is zero iff either byte of the pair is NUL,
	   which is enough to detect the end of the string cheaply.  */
	uminp	vend1.16b, vdata1.16b, vdata2.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	cmeq	vend1.16b, vend1.16b, #0
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vend1.16b, vhas_chr1.16b		// 128->64
	mov	nul_match, vend1.d[0]	/* Non-zero iff the hunk has a NUL.  */
	mov	chr_match, vend1.d[1]	/* Character syndrome of the hunk.  */
	cbz	nul_match, L(loop)

	/* The string ends in this hunk.  The indicator above does not say
	   which byte is the NUL, so build the exact NUL syndrome now.  */
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
	mov	nul_match, vhas_nul1.d[0]

L(tail):
	/* Work out exactly where the string ends.  tmp4 becomes a mask of
	   every syndrome bit up to and including the lowest set bit of
	   nul_match; matches beyond the NUL are discarded.  */
	sub	tmp4, nul_match, #1
	eor	tmp4, tmp4, nul_match
	ands	chr_match, chr_match, tmp4
	/* And pick the values corresponding to the last match.  */
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne

	/* Count down from the top of the syndrome to find the last match.  */
	clz	tmp3, src_offset
	/* Src_match points beyond the word containing the match, so we can
	   simply subtract half the bit-offset into the syndrome.  Because
	   we are counting down, we need to go back one more character.
	   (A match in byte i sets bit 2i, so clz = 63 - 2i and
	   (clz + 2) >> 1 = 32 - i.)  */
	add	tmp3, tmp3, #2
	sub	result, src_match, tmp3, lsr #1
	/* But if the syndrome shows no match was found, then return NULL.
	   In that case src_match may never have been written, which is
	   harmless because the value computed from it is discarded.  */
	cmp	src_offset, #0
	csel	result, result, xzr, ne

	ret

END (__strrchr_aarch64)