xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S (revision 924226fba12cc9a228c73b956e1b7fa24c60b055)
1/*
2 * strchr - find a character in a string
3 *
4 * Copyright (c) 2020, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD.
11 * MTE compatible.
12 */
13
14#include "../asmdefs.h"
15
16#define srcin		x0	/* Incoming string pointer (arg 0).  */
17#define chrin		w1	/* Character to search for (arg 1).  */
18#define result		x0	/* Return value; aliases srcin.  */
19
20#define src		x2	/* 16-byte-aligned cursor into the string.  */
21#define tmp1		x1	/* Scratch; holds the 64-bit syndrome.  */
22#define wtmp2		w3	/* Scratch for building the repeat masks.  */
23#define tmp3		x3	/* Bit-shift count for the first chunk.  */
24
25#define vrepchr		v0	/* chrin replicated into all 16 byte lanes.  */
26#define vdata		v1	/* Current 16-byte chunk of the string.  */
27#define qdata		q1	/* Q-register view of vdata (loop load).  */
28#define vhas_nul	v2	/* NUL compare result / merged syndrome.  */
29#define vhas_chr	v3	/* Character-match compare result.  */
30#define vrepmask	v4	/* 0x3003 per halfword: merge select mask.  */
31#define vrepmask2	v5	/* 0xf00f per halfword: clears unused bits.  */
32#define vend		v6	/* Syndrome after 128->64 pairwise reduce.  */
33#define dend		d6	/* D-register view of vend (for fmov).  */
34
35/* Core algorithm.
36
37   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
38   per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
39   requested character, bits 2-3 are set if the byte is NUL (or matched), and
40   bits 4-7 are not used and must be zero if none of bits 0-3 are set. Odd
41   bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
42   in the syndrome reflect the order in which things occur in the original
43   string, counting trailing zeros identifies exactly which byte matched.  */
44
/* char *__strchr_aarch64_mte (const char *s, int c)
   In:  x0 = s (string pointer), w1 = c (character to find).
   Out: x0 = pointer to the first occurrence of c in s, or NULL if the
	terminating NUL is reached first.
   Clobbers: x1-x3, v0-v6, flags.  */
45ENTRY (__strchr_aarch64_mte)
46	PTR_ARG (0)
	/* Round src down to 16 bytes: the aligned load below stays inside
	   the 16-byte MTE tag granule containing srcin, so it cannot fault
	   past the buffer.  */
47	bic	src, srcin, 15
48	dup	vrepchr.16b, chrin		/* Replicate c to all lanes.  */
49	ld1	{vdata.16b}, [src]
50	mov	wtmp2, 0x3003
51	dup	vrepmask.8h, wtmp2
52	cmeq	vhas_nul.16b, vdata.16b, 0	/* 0xff per NUL byte.  */
53	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b	/* 0xff per match.  */
54	mov	wtmp2, 0xf00f
55	dup	vrepmask2.8h, wtmp2
56
	/* Build the 4-bits-per-byte syndrome described above: insert the
	   char-match bits selected by vrepmask into the NUL-match vector,
	   then clear the bits that must remain zero.  */
57	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
58	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
	/* Syndrome has 4 bits per byte, so the shift count for the bytes
	   before srcin is 4 * (srcin & 15); lsr below only uses the low
	   6 bits of tmp3, discarding the rest of 4 * srcin.  */
59	lsl	tmp3, srcin, 2
60	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
61
62	fmov	tmp1, dend
63	lsr	tmp1, tmp1, tmp3	/* Drop bytes before srcin.  */
64	cbz	tmp1, L(loop)		/* No match/NUL in first chunk.  */
65
	/* rbit + clz counts trailing zeros: the bit index of the first
	   match or NUL in string order.  */
66	rbit	tmp1, tmp1
67	clz	tmp1, tmp1
68	/* Tmp1 is an even multiple of 2 if the target character was
69	   found first. Otherwise we've found the end of string.  */
70	tst	tmp1, 2
71	add	result, srcin, tmp1, lsr 2	/* Byte index = tmp1 / 4.  */
72	csel	result, result, xzr, eq		/* NULL if NUL came first.  */
73	ret
74
75	.p2align 4
76L(loop):
	/* Main loop: one aligned 16-byte chunk per iteration, using a
	   cheap "anything interesting?" test before decoding.  */
77	ldr	qdata, [src, 16]!
78	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* vhas_chr bytes are 0x00 or 0xff, so unsigned vhas_chr >= vdata
	   holds exactly when the data byte matched c or is NUL.  */
79	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
80	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* Any byte set?  */
81	fmov	tmp1, dend
82	cbz	tmp1, L(loop)
83
	/* Found a match or NUL: rebuild the precise 4-bit-per-byte
	   syndrome for this chunk, as in the first-iteration path.  */
84#ifdef __AARCH64EB__
	/* Big-endian: bif (insert-if-false) merges with the inverted
	   select, and no rbit is needed before clz — the syndrome already
	   reads in string order after the fmov.  */
85	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
86	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
87	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
88	fmov	tmp1, dend
89#else
90	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
91	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
92	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
93	fmov	tmp1, dend
94	rbit	tmp1, tmp1		/* So clz counts trailing zeros.  */
95#endif
96	clz	tmp1, tmp1
97	/* Tmp1 is an even multiple of 2 if the target character was
98	   found first. Otherwise we've found the end of string.  */
99	tst	tmp1, 2
100	add	result, src, tmp1, lsr 2	/* src points at this chunk.  */
101	csel	result, result, xzr, eq		/* NULL if NUL came first.  */
102	ret
103
104END (__strchr_aarch64_mte)
105
106