xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S (revision e6bfd18d21b225af6a0ed67ceeaf1293b7b9eba5)
1/*
2 * memchr - find a character in a memory zone
3 *
4 * Copyright (c) 2020, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD.
11 * MTE compatible.
12 */
13
14#include "../asmdefs.h"
15
/* Incoming arguments. result shares x0 with srcin, so srcin is dead once
   result has been written.  */
16#define srcin		x0
17#define chrin		w1
18#define cntin		x2
19#define result		x0
20
/* Scalar working registers:
   src    - address of the 16-byte chunk currently being examined.
   cntrem - remaining-byte bookkeeping for the main loop / final bound check.
   synd   - 64-bit match syndrome moved out of the vector unit.
   shift  - bit shift used to drop syndrome bits for bytes before srcin.
   tmp/wtmp are the 64-bit and 32-bit names of the same scratch register.  */
21#define src		x3
22#define cntrem		x4
23#define synd		x5
24#define shift		x6
25#define	tmp		x7
26#define wtmp		w7
27
/* SIMD registers:
   vrepchr  - the search byte replicated into every byte lane.
   qdata/vdata - two names for the register holding the loaded chunk.
   vhas_chr - per-byte comparison result for the current chunk.
   vrepmask - per-lane mask used when building the syndrome.
   vend/dend - reduction result; dend is the low 64 bits of vend.  */
28#define vrepchr		v0
29#define qdata		q1
30#define vdata		v1
31#define vhas_chr	v2
32#define vrepmask	v3
33#define vend		v4
34#define dend		d4
35
36/*
37   Core algorithm:
38
39   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
40   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
41   requested character. Bits 4-7 must be zero. Bits 4-7 are set likewise for
42   odd bytes so that adjacent bytes can be merged. Since the bits in the
43   syndrome reflect the order in which bytes occur in the buffer being
44   searched, counting trailing zeros identifies exactly which byte matched.  */
45
/* __memchr_aarch64_mte
   In:   x0 (srcin) = start address, w1 (chrin) = byte to find (only its low
         8 bits are used), x2 (cntin) = number of bytes to search.
   Out:  x0 (result) = address of the first matching byte, or 0 when cntin
         is 0 or no byte in [srcin, srcin + cntin) matches.
   Writes x3-x7, v0-v4 and the condition flags.
   Every load is from a 16-byte aligned address; bytes of a chunk that lie
   outside the requested range are read but never reported as a match.  */
46ENTRY (__memchr_aarch64_mte)
	/* NOTE(review): PTR_ARG/SIZE_ARG are defined in asmdefs.h (not shown
	   here); presumably they sanitise x0/x2 for ILP32 builds - confirm.  */
47	PTR_ARG (0)
48	SIZE_ARG (2)
	/* src = srcin rounded down to a 16-byte boundary.  */
49	bic	src, srcin, 15
	/* Zero length: return 0 without touching memory.  */
50	cbz	cntin, L(nomatch)
	/* Load the aligned chunk that contains srcin; it may begin before
	   srcin.  */
51	ld1	{vdata.16b}, [src]
52	dup	vrepchr.16b, chrin
	/* Each halfword of vrepmask is 0xf00f: even byte lanes get 0x0f and
	   odd byte lanes get 0xf0.  */
53	mov	wtmp, 0xf00f
54	dup	vrepmask.8h, wtmp
	/* vhas_chr lane = 0xff where the byte equals chrin, else 0x00.  */
55	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* The register-controlled lsr below uses the shift amount modulo 64,
	   so the effective shift is (srcin & 15) * 4: four syndrome bits for
	   every byte that precedes srcin in the chunk.  */
56	lsl	shift, srcin, 2
	/* Keep one nibble per lane, then add adjacent lanes so the low 64
	   bits of vend hold four bits per input byte.  */
57	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
58	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
59	fmov	synd, dend
	/* Drop matches located before srcin.  */
60	lsr	synd, synd, shift
61	cbz	synd, L(start_loop)
62
	/* rbit + clz counts trailing zeros: synd becomes 4 * (offset of the
	   first match from srcin).
	   NOTE(review): unlike L(end), rbit is unconditional here; presumably
	   because ld1 {..16b} fills lanes in memory order on either
	   endianness while ldr q does not on big-endian - confirm.  */
63	rbit	synd, synd
64	clz	synd, synd
65	add	result, srcin, synd, lsr 2
	/* Return the match only if its offset is below cntin, else 0.  */
66	cmp	cntin, synd, lsr 2
67	csel	result, result, xzr, hi
68	ret
69
70L(start_loop):
	/* tmp = src + 16 - srcin: bytes of the range covered by the first
	   chunk.  If cntin does not extend past it, there is no match.  */
71	sub	tmp, src, srcin
72	add	tmp, tmp, 16
73	subs	cntrem, cntin, tmp
74	b.ls	L(nomatch)
75
76	/* Make sure that it won't overread by a 16-byte chunk */
	/* (cntrem + 15) >> 4 is the number of 16-byte chunks still to scan,
	   and bit 4 of the sum is that count's low bit.  When it is odd,
	   enter the two-chunk loop at its second half so that the length
	   check (done only in L(loop32_2)) lands on the final chunk.  */
77	add	tmp, cntrem, 15
78	tbnz	tmp, 4, L(loop32_2)
79
80	.p2align 4
81L(loop32):
	/* First half of the unrolled loop: advance src by 16 (pre-index
	   writeback) and load.  No length check is made here.  */
82	ldr	qdata, [src, 16]!
83	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* Pairwise max folds the 16 lanes into the low 64 bits of vend,
	   which are non-zero iff some lane matched.  */
84	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
85	fmov	synd, dend
86	cbnz	synd, L(end)
87
88L(loop32_2):
	/* Second half: load the next chunk and account for both chunks of
	   this iteration.  cmeq does not alter the flags set by subs, so
	   b.ls leaves the loop when this chunk is the last one; L(end) then
	   derives the syndrome from vhas_chr itself.  */
89	ldr	qdata, [src, 16]!
90	subs	cntrem, cntrem, 32
91	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
92	b.ls	L(end)
93	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
94	fmov	synd, dend
95	cbz	synd, L(loop32)
96L(end):
	/* Build the four-bits-per-byte syndrome for the chunk at src, the
	   same way as for the first chunk.  */
97	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
98	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
99	fmov	synd, dend
	/* cntrem = (srcin + cntin) - src: bytes of the requested range that
	   remain from the start of this chunk.  */
100	add	tmp, srcin, cntin
101	sub	cntrem, tmp, src
	/* The bit reversal is skipped when __AARCH64EB__ is defined, so clz
	   then counts from the top of the syndrome instead.  */
102#ifndef __AARCH64EB__
103	rbit	synd, synd
104#endif
	/* synd = 4 * (offset of the first match in this chunk).  If nothing
	   matched, synd was 0 and clz gives 64, i.e. offset 16, which is
	   expected to fail the bound check below on the last chunk.  */
105	clz	synd, synd
106	cmp	cntrem, synd, lsr 2
107	add	result, src, synd, lsr 2
	/* Return the match only if its offset is below cntrem, else 0.  */
108	csel	result, result, xzr, hi
109	ret
110
111L(nomatch):
	/* No match within the requested range: return 0.  */
112	mov	result, 0
113	ret
114
115END (__memchr_aarch64_mte)
116
117