xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/memchr.S (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2014-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "asmdefs.h"

/* Arguments and results (AAPCS64 argument registers).  */
#define srcin		x0
#define chrin		w1
#define cntin		x2

#define result		x0

/* Scratch registers — all caller-saved under AAPCS64.  */
#define src		x3
#define	tmp		x4
#define wtmp2		w5
#define synd		x6
#define soff		x9
#define cntrem		x10

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_chr1	v3
#define vhas_chr2	v4
#define vrepmask	v5
#define vend		v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32bit
 * syndrome). Since the bits in the syndrome reflect exactly the order in which
 * things occur in the original string, counting trailing zeros allows to
 * identify exactly which byte has matched.
 */

/*
 * void *__memchr_aarch64 (const void *srcin, int chrin, size_t cntin)
 *
 * In:   x0 = srcin (buffer), w1 = chrin (byte to find), x2 = cntin (length).
 * Out:  x0 = pointer to first matching byte, or NULL when not found.
 * Uses: x3-x6, x9, x10 and v0-v6 only (see #defines above) — all
 *       caller-saved under AAPCS64; leaf function, no stack usage.
 */
ENTRY (__memchr_aarch64)
	/* Do not dereference srcin if no bytes to compare.  */
	cbz	cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	ands	soff, srcin, #31	/* soff = misalignment within the chunk */
	and	cntrem, cntin, #31	/* cntrem = cntin % 32, used by masklast */
	b.eq	L(loop)

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32 bytes block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	sub	tmp, soff, #32
	/* cntin -= (32 - soff); flags feed the b.ls below (first block may
	   also be the last).  */
	adds	cntin, cntin, tmp
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits (bytes before srcin are irrelevant) */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/* The first block can also be the last */
	b.ls	L(masklast)
	/* Have we found something already? */
	cbnz	synd, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	subs	cntin, cntin, #32	/* flags feed b.ls/b.hs below */
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result */
	b.ls	L(end)
	/* Use a fast check for the termination condition */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
	mov	synd, vend.d[0]
	/* We're not out of data, loop if we haven't found the character */
	cbz	synd, L(loop)

L(end):
	/* Termination condition found, let's calculate the syndrome value */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Only do the clear for the last possible block */
	b.hs	L(tail)

L(masklast):
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits, i.e. the
	   syndrome bits for bytes past the end of the buffer.  */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

L(tail):
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate the last post-increment */
	sub	src, src, #32
	/* Check that we have found a character */
	cmp	synd, #0
	/* And count the leading zeros */
	clz	synd, synd
	/* Compute the potential result: two syndrome bits per byte, hence
	   the shift right by one.  */
	add	result, src, synd, lsr #1
	/* Select result or NULL */
	csel	result, xzr, result, eq
	ret

L(zero_length):
	mov	result, #0
	ret

END (__memchr_aarch64)