xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S (revision 31ba4ce8898f9dfa5e7f054fdbc26e50a599a6e3)
/*
 * strchrnul - find a character or nul in a string
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define srcin		x0
#define chrin		w1
#define result		x0

#define src		x2
#define tmp1		x1
#define tmp2		x3
#define tmp2w		w3

#define vrepchr		v0
#define vdata		v1
#define qdata		q1
#define vhas_nul	v2
#define vhas_chr	v3
#define vrepmask	v4
#define vend		v5
#define dend		d5

/* Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
   set likewise for odd bytes so that adjacent bytes can be merged. Since the
   bits in the syndrome reflect the order in which things occur in the original
   string, counting trailing zeros identifies exactly which byte matched.  */

/* char *__strchrnul_aarch64_mte (const char *s, int c)

   Return a pointer to the first occurrence of C in S, or to the
   terminating NUL if C is not found.  Never returns NULL.

   In:    x0 = srcin (string pointer, may carry an MTE tag)
          w1 = chrin (character to find)
   Out:   x0 = result (pointer to match or to the NUL)
   Uses:  x1-x3, v0-v5; no stack, flags-free hot loop.  */
ENTRY (__strchrnul_aarch64_mte)
	PTR_ARG (0)
	/* Align the load pointer down to 16 so each whole-chunk load stays
	   within the 16-byte MTE granule(s) that the string itself occupies.  */
	bic	src, srcin, 15
	dup	vrepchr.16b, chrin
	ld1	{vdata.16b}, [src]
	mov	tmp2w, 0xf00f
	dup	vrepmask.8h, tmp2w	/* Per halfword: low nibble of the even
					   byte, high nibble of the odd byte,
					   giving 4 syndrome bits per byte.  */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* Fold NUL matches into the char matches: each lane is now 0xff
	   (char matched, and 0xff >= any byte) or 0x00 (no match, and
	   0x00 >= data byte only when that byte is itself NUL).  */
	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
	lsl	tmp2, srcin, 2		/* Shift = 4 bits per misaligned byte;
					   the lsr below uses only the low 6
					   bits, so shifting all of srcin is
					   harmless.  */
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	tmp1, dend
	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
	cbz	tmp1, L(loop)

	/* Match in the first chunk.  After the masking above the syndrome is
	   relative to srcin, so bit-index / 4 is the byte offset from srcin.
	   NOTE(review): unlike the loop tail below, this rbit has no
	   __AARCH64EB__ guard — matches upstream; confirm big-endian intent.  */
	rbit	tmp1, tmp1		/* Make clz count from the string start.  */
	clz	tmp1, tmp1
	add	result, srcin, tmp1, lsr 2
	ret

	.p2align 4
L(loop):
	ldr	qdata, [src, 16]!	/* Pre-increment: next 16-byte chunk.  */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b	/* Char or NUL, as above.  */
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* Cheap "any lane set?"
							   64-bit reduction.  */
	fmov	tmp1, dend
	cbz	tmp1, L(loop)

	/* Hit: build the precise 4-bit-per-byte syndrome only now, off the
	   hot loop path.  */
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	tmp1, dend
#ifndef __AARCH64EB__
	rbit	tmp1, tmp1		/* LE: first match is the lowest set bit.  */
#endif
	clz	tmp1, tmp1
	add	result, src, tmp1, lsr 2	/* Bit index / 4 = byte offset
						   within the current chunk.  */
	ret

END (__strchrnul_aarch64_mte)