/* Origin: /freebsd/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S (revision 3e8eb5c7f4909209c042403ddee340b2ee7003a5)  */
/*
 * strrchr - find last position of a character in a string.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

/* Register aliases used by __strrchr_aarch64_mte.

   Several names deliberately map to the same physical register (x3, x4
   and v5 below).  The routine never needs two values that share a
   register at the same time: the later name simply takes over once the
   earlier value is dead.  */

/* Arguments and return value; the result reuses the first argument
   register.  */
#define srcin		x0
#define chrin		w1
#define result		x0

/* src is the chunk pointer; every load post-increments it by 16.
   tmp, wtmp and synd are all views/uses of x3: wtmp holds the mask
   constants, synd the 64-bit syndrome, tmp short-lived intermediates.
   shift (first chunk only) and src_match (second loop only) share x4.  */
#define src		x2
#define tmp		x3
#define wtmp		w3
#define synd		x3
#define shift		x4
#define src_match	x4
#define nul_match	x5
#define chr_match	x6

/* vrepchr:   chrin replicated into all 16 byte lanes.
   vdata:     the 16-byte chunk currently being examined.
   vhas_nul:  per-byte NUL compare result, later the merged flag bytes.
   vhas_chr:  per-byte character compare result.
   vrepmask:  0x3003 in every halfword (bit positions for match flags).
   vrepmask2: 0xf00f in every halfword; only used for the first chunk.
   vend/dend: pairwise-reduced flags; dend is the low 64 bits that get
              moved to synd.  vend shares v5 with vrepmask2.  */
#define vrepchr		v0
#define vdata		v1
#define vhas_nul	v2
#define vhas_chr	v3
#define vrepmask	v4
#define vrepmask2	v5
#define vend		v5
#define dend		d5

/* Core algorithm.

   For each 16-byte chunk we calculate a 64-bit syndrome value with
   four bits per byte.  The lowest-addressed byte of the chunk always
   occupies the least-significant four bits, on both big- and
   little-endian systems.  Within each four-bit tuple, bits 0-1 are
   set if the corresponding byte matched the requested character and
   bits 2-3 are set if the corresponding byte is the NUL that ends
   the string.  */

/* __strrchr_aarch64_mte - return the address of the last byte equal to
   chrin in the NUL-terminated string at srcin.

   In:       x0 (srcin)  string pointer
	     w1 (chrin)  character to find; DUP replicates only its low
			 8 bits, so the upper bits are ignored
   Out:      x0 (result) address of the last match at or before the
			 terminating NUL (the NUL itself matches when the
			 low byte of chrin is 0), or 0 if there is none
   Clobbers: x2-x6, v0-v5 and the condition flags, plus whatever
	     PTR_ARG expands to (defined in asmdefs.h, not visible here).
	     No stack use and no calls.

   Memory is only read as 16-byte chunks at 16-byte aligned addresses,
   starting with the chunk that contains srcin and ending with the chunk
   that contains the NUL.

   Layout:
     entry     - handle a misaligned start, masking bytes before srcin
     L(loop1)  - no match recorded yet; find the first chunk holding a
		 match or the NUL
     L(tail)   - the NUL was found and no earlier chunk held a match
     L(loop2)  - a match is on record; remember the most recent matching
		 chunk until the NUL is found  */

ENTRY (__strrchr_aarch64_mte)
	PTR_ARG (0)
	/* src = srcin rounded down to a 16-byte boundary.  */
	bic	src, srcin, 15
	dup	vrepchr.16b, chrin
	/* 0x3003 per halfword selects bits 0-1 of every even byte and bits
	   4-5 of every odd byte: the places where BIT drops the
	   character-match flags.  */
	mov	wtmp, 0x3003
	dup	vrepmask.8h, wtmp
	/* Already aligned: no leading bytes to discard.  */
	tst	srcin, 15
	beq	L(loop1)

	/* Misaligned start: examine the aligned chunk containing srcin.  */
	ld1	{vdata.16b}, [src], 16
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	mov	wtmp, 0xf00f
	dup	vrepmask2.8h, wtmp
	/* Overlay the match flags on the NUL flags, then keep the low
	   nibble of even bytes and the high nibble of odd bytes so that
	   adding neighbours packs two bytes into one (4 bits per byte).  */
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	/* Four syndrome bits per byte.  Register shifts use the amount
	   modulo 64, so this is (srcin & 15) * 4.  */
	lsl	shift, srcin, 2
	fmov	synd, dend
	/* Shift out and back in: clears the tuples of bytes below srcin.  */
	lsr	synd, synd, shift
	lsl	synd, synd, shift
	/* 0xcc.. isolates the NUL flags (bits 2-3 of every tuple).  */
	ands	nul_match, synd, 0xcccccccccccccccc
	bne	L(tail)
	/* No NUL; a non-zero syndrome means a match to remember.  */
	cbnz	synd, L(loop2)
	/* Neither: fall through into the first loop.  */

	.p2align 5
L(loop1):
	ld1	{vdata.16b}, [src], 16
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* Unsigned vhas_chr >= vdata holds when the byte matched (0xff)
	   or when the byte is NUL (0 >= 0): one test covers both.  */
	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
	/* Pairwise maximum folds 128 bits to 64; zero iff no lane is set.  */
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop1)

	/* Something is in this chunk: build the exact syndrome.  */
	cmeq	vhas_nul.16b, vdata.16b, 0
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	/* Clear the low nibble of every odd byte.  Even bytes stay
	   unmasked: their high nibble is non-zero only when the even byte
	   is itself NUL, and then its own NUL flag is already the lowest
	   one of the pair, so the disturbed upper bits do not affect the
	   result.  */
	bic	vhas_nul.8h, 0x0f, lsl 8
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	ands	nul_match, synd, 0xcccccccccccccccc
	/* No NUL, so the chunk holds a match: go and remember it.  */
	beq	L(loop2)

L(tail):
	/* Subtracting 1 sets every bit below the lowest NUL flag.  */
	sub	nul_match, nul_match, 1
	/* 0x33.. isolates the match flags (bits 0-1 of every tuple).  */
	and	chr_match, synd, 0x3333333333333333
	/* Matches at or before the NUL; Z is set when there are none.  */
	ands	chr_match, chr_match, nul_match
	/* src was post-incremented, so src - 1 is the chunk's last byte.  */
	sub	result, src, 1
	/* Step back one byte for every empty tuple above the last match.  */
	clz	tmp, chr_match
	sub	result, result, tmp, lsr 2
	/* Flags still come from the ANDS: no match means return 0.  */
	csel	result, result, xzr, ne
	ret

	.p2align 4
L(loop2):
	/* On entry synd describes the chunk just examined (src already
	   points past it).  If it held a match, it becomes the record.  */
	cmp	synd, 0
	csel	src_match, src, src_match, ne
	csel	chr_match, synd, chr_match, ne
	ld1	{vdata.16b}, [src], 16
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	/* While no NUL is present each byte holds only its match flags
	   (0x03 even, 0x30 odd); the pairwise maximum keeps the
	   higher-addressed match of each pair, which is the one wanted.  */
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	tst	synd, 0xcccccccccccccccc
	beq	L(loop2)

	/* NUL found: build the exact syndrome for this final chunk.  */
	bic	vhas_nul.8h, 0x0f, lsl 8
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	and	nul_match, synd, 0xcccccccccccccccc
	sub	nul_match, nul_match, 1
	and	tmp, synd, 0x3333333333333333
	ands	tmp, tmp, nul_match
	/* Prefer a match in the final chunk (at or before the NUL);
	   otherwise keep the recorded one.  This loop is only entered with
	   a non-zero syndrome, so a record always exists and no zero
	   return is needed.  */
	csel	chr_match, tmp, chr_match, ne
	csel	src_match, src, src_match, ne
	/* Same pointer arithmetic as in L(tail).  */
	sub	src_match, src_match, 1
	clz	tmp, chr_match
	sub	result, src_match, tmp, lsr 2
	ret

END (__strrchr_aarch64_mte)
