xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strrchr.S (revision 19fae0f66023a97a9b464b3beeeabb2081f575b3)
1/*
2 * strrchr - find last position of a character in a string.
3 *
4 * Copyright (c) 2014-2022, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64
11 * Neon Available.
12 */
13
14#include "asmdefs.h"
15
16/* Arguments and results.  */
17#define srcin		x0	/* In: address of the string.  */
18#define chrin		w1	/* In: character to find.  */
19
20#define result		x0	/* Out: address of the last match, or 0.  */
21
/* General-purpose working registers used by __strrchr_aarch64.  */
22#define src		x2	/* Read pointer; 32-byte aligned, advanced past each hunk by the load.  */
23#define	tmp1		x3	/* Misalignment of srcin, then negated and doubled into a shift count.  */
24#define wtmp2		w4	/* Holds the 0x40100401 lane-mask constant.  */
25#define tmp3		x5	/* Padding mask on entry; leading-zero count in the tail.  */
26#define src_match	x6	/* Value of src after loading the latest hunk that held a match.  */
27#define src_offset	x7	/* Character syndrome of that hunk; 0 until a match is seen.  */
28#define const_m1	x8	/* All-ones constant used to build the padding mask.  */
29#define tmp4		x9	/* Mask of syndrome bits up to and including the terminator.  */
30#define nul_match	x10	/* NUL syndrome; inside the main loop only zero/non-zero is meaningful.  */
31#define chr_match	x11	/* Character-match syndrome of the current hunk.  */
32
/* Vector working registers used by __strrchr_aarch64.  */
33#define vrepchr		v0	/* Low byte of chrin replicated into all 16 byte lanes.  */
34#define vdata1		v1	/* First 16 bytes of the current hunk.  */
35#define vdata2		v2	/* Second 16 bytes of the current hunk.  */
36#define vhas_nul1	v3	/* NUL compare result for vdata1; also the reduction target.  */
37#define vhas_nul2	v4	/* NUL compare result for vdata2.  */
38#define vhas_chr1	v5	/* Character compare result for vdata1; also the reduction target.  */
39#define vhas_chr2	v6	/* Character compare result for vdata2.  */
40#define vrepmask_0	v7	/* 0x80200802 in each word: selects the NUL bit of each byte.  */
41#define vrepmask_c	v16	/* 0x40100401 in each word: selects the character bit of each byte.  */
42#define vend1		v17	/* Reduced result: d[0] is the NUL part, d[1] the character part.  */
43#define vend2		v18	/* Not referenced by __strrchr_aarch64.  */
44
45/* Core algorithm.
46
47   For each 32-byte hunk we calculate two 64-bit syndrome values, each
48   with two bits per byte (LSB is always in bits 0 and 1, for both big
49   and little-endian systems).  For each tuple, bit 0 of the character
50   syndrome is set iff the relevant byte matched the requested character;
51   bit 1 of the NUL syndrome is set iff the relevant byte is the NUL end
52   of string (so a search for NUL itself triggers off bit 0).  Since the
53   bits in each syndrome reflect the order of bytes in the original string,
54   the lowest set NUL bit marks the terminator, and count_leading_zeros()
55   on the character bits at or below it locates the last match.  */
56
/* __strrchr_aarch64: return the address of the last byte in a
   NUL-terminated string that equals a given character.

   In:	srcin (x0)  string address.
	chrin (w1)  character to find; only its low byte takes part, since
		    it is replicated into the byte lanes of vrepchr.
   Out:	result (x0) address of the last matching byte, or 0 when there is
		    none.  The terminating NUL is itself compared against
		    chrin, so searching for 0 yields the terminator.

   The visible instructions write x0, x2-x11, v0-v7, v16, v17 and the
   flags, and perform no stores.  Memory is read in 32-byte hunks starting
   at srcin rounded down to a 32-byte boundary, so bytes before the string
   and after its terminator, inside those hunks, are loaded and ignored.

   NOTE(review): ENTRY, END, PTR_ARG and L() are expected to come from
   asmdefs.h, which is not visible here -- confirm what they expand to.  */
57ENTRY (__strrchr_aarch64)
58	PTR_ARG (0)
59	/* Magic constant 0x40100401 to allow us to identify which lane
60	   matches the requested byte.  Magic constant 0x80200802 used
61	   similarly for NUL termination.  */
62	mov	wtmp2, #0x0401
63	movk	wtmp2, #0x4010, lsl #16		// wtmp2 = 0x40100401
64	dup	vrepchr.16b, chrin		// Low byte of chrin in every lane.
65	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
66	dup	vrepmask_c.4s, wtmp2		// Bytes of each word, low to high: 01 04 10 40.
67	mov	src_offset, #0			// Zero means no match has been seen yet.
68	ands	tmp1, srcin, #31		// tmp1 = misalignment; Z set when already aligned.
69	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
70	b.eq	L(aligned)
71
72	/* Input string is not 32-byte aligned.  Rather than forcing
73	   the padding bytes to a safe value, we calculate the syndrome
74	   for all the bytes, but then mask off those bits of the
75	   syndrome that are related to the padding.  */
76	ld1	{vdata1.16b, vdata2.16b}, [src], #32	// src now points past this hunk.
77	neg	tmp1, tmp1
78	cmeq	vhas_nul1.16b, vdata1.16b, #0
79	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
80	cmeq	vhas_nul2.16b, vdata2.16b, #0
81	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Keep one distinct bit per byte so that the pairwise adds below
	   merge lanes without any carry between them.  */
82	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
83	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
84	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
85	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
86	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
87	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
88	addp	vend1.16b, vhas_nul1.16b, vhas_chr1.16b		// 128->64
89	mov	nul_match, vend1.d[0]		// Low half: NUL syndrome.
90	lsl	tmp1, tmp1, #1			// tmp1 = -2 * misalignment (two bits per byte).
91	mov	const_m1, #~0
	/* A register shift count is taken modulo 64, so this shifts by
	   64 - 2 * misalignment and leaves exactly the padding bits set.  */
92	lsr	tmp3, const_m1, tmp1
93	mov	chr_match, vend1.d[1]		// High half: character syndrome.
94
95	bic	nul_match, nul_match, tmp3	// Mask padding bits.
96	bic	chr_match, chr_match, tmp3	// Mask padding bits.
97	cbnz	nul_match, L(tail)
98
	/* Reached only with no NUL in the hunk just examined: if that hunk
	   held a match, record it before loading the next one.  */
99	.p2align 4
100L(loop):
101	cmp	chr_match, #0
102	csel	src_match, src, src_match, ne	// End-of-hunk pointer of the latest matching hunk.
103	csel	src_offset, chr_match, src_offset, ne	// Its character syndrome.
104L(aligned):
105	ld1	{vdata1.16b, vdata2.16b}, [src], #32
106	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
107	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
108	uminp	vend1.16b, vdata1.16b, vdata2.16b	// A lane is 0 iff one of its two source bytes is 0.
109	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
110	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
111	cmeq	vend1.16b, vend1.16b, 0
112	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
113	addp	vend1.16b, vend1.16b, vhas_chr1.16b		// 128->64
114	mov	nul_match, vend1.d[0]		// Non-zero iff the hunk holds a NUL; not yet an exact syndrome.
115	mov	chr_match, vend1.d[1]		// Exact character syndrome.
116	cbz	nul_match, L(loop)
117
	/* The hunk contains the terminator: now compute the exact NUL
	   syndrome, which the loop skipped.  */
118	cmeq	vhas_nul1.16b, vdata1.16b, #0
119	cmeq	vhas_nul2.16b, vdata2.16b, #0
120	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
121	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
122	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
123	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
124	mov	nul_match, vhas_nul1.d[0]
125
126L(tail):
127	/* Work out exactly where the string ends.  */
128	sub	tmp4, nul_match, #1
129	eor	tmp4, tmp4, nul_match		// Bits 0 through the lowest set NUL bit.
	/* The mask includes the character bit of the NUL byte itself, so a
	   search for NUL matches the terminator.  Matches after the
	   terminator are discarded.  */
130	ands	chr_match, chr_match, tmp4
131	/* And pick the values corresponding to the last match.  */
132	csel	src_match, src, src_match, ne
133	csel	src_offset, chr_match, src_offset, ne
134
135	/* Count down from the top of the syndrome to find the last match.  */
136	clz	tmp3, src_offset
137	/* Src_match points beyond the word containing the match, so we can
138	   simply subtract half the bit-offset into the syndrome.  Because
139	   we are counting down, we need to go back one more character.  */
140	add	tmp3, tmp3, #2
141	sub	result, src_match, tmp3, lsr #1
142	/* But if the syndrome shows no match was found, then return NULL.  */
	/* src_match is never written when no match was seen, so the value
	   computed above is meaningless in that case and is replaced here.  */
143	cmp	src_offset, #0
144	csel	result, result, xzr, ne
145
146	ret
147
148END (__strrchr_aarch64)
149
150