xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/memchr.S (revision d5b0e70f7e04d971691517ce1304d86a1e367e2e)
1/*
2 * memchr - find a character in a memory zone
3 *
4 * Copyright (c) 2014-2020, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64
11 * Neon Available.
12 */
13
14#include "../asmdefs.h"
15
16/* Arguments and results.  */
17#define srcin		x0
18#define chrin		w1
19#define cntin		x2
20
21#define result		x0
22
23#define src		x3
24#define	tmp		x4
25#define wtmp2		w5
26#define synd		x6
27#define soff		x9
28#define cntrem		x10
29
30#define vrepchr		v0
31#define vdata1		v1
32#define vdata2		v2
33#define vhas_chr1	v3
34#define vhas_chr2	v4
35#define vrepmask	v5
36#define vend		v6
37
38/*
39 * Core algorithm:
40 *
41 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
42 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
43 * requested character and bit 1 is not used (faster than using a 32bit
44 * syndrome). Since the bits in the syndrome reflect exactly the order in which
45 * things occur in the original string, counting trailing zeros allows to
46 * identify exactly which byte has matched.
47 */
48
49ENTRY (__memchr_aarch64)
50	PTR_ARG (0)
51	SIZE_ARG (2)
52	/* Do not dereference srcin if no bytes to compare.  */
53	cbz	cntin, L(zero_length)
54	/*
55	 * Magic constant 0x40100401 allows us to identify which lane matches
56	 * the requested byte.
57	 */
58	mov	wtmp2, #0x0401
59	movk	wtmp2, #0x4010, lsl #16
60	dup	vrepchr.16b, chrin
61	/* Work with aligned 32-byte chunks */
62	bic	src, srcin, #31
63	dup	vrepmask.4s, wtmp2
64	ands	soff, srcin, #31
65	and	cntrem, cntin, #31
66	b.eq	L(loop)
67
68	/*
69	 * Input string is not 32-byte aligned. We calculate the syndrome
70	 * value for the aligned 32 bytes block containing the first bytes
71	 * and mask the irrelevant part.
72	 */
73
74	ld1	{vdata1.16b, vdata2.16b}, [src], #32
75	sub	tmp, soff, #32
76	adds	cntin, cntin, tmp
77	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
78	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
79	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
80	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
81	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
82	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
83	mov	synd, vend.d[0]
84	/* Clear the soff*2 lower bits */
85	lsl	tmp, soff, #1
86	lsr	synd, synd, tmp
87	lsl	synd, synd, tmp
88	/* The first block can also be the last */
89	b.ls	L(masklast)
90	/* Have we found something already? */
91	cbnz	synd, L(tail)
92
93L(loop):
94	ld1	{vdata1.16b, vdata2.16b}, [src], #32
95	subs	cntin, cntin, #32
96	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
97	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
98	/* If we're out of data we finish regardless of the result */
99	b.ls	L(end)
100	/* Use a fast check for the termination condition */
101	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
102	addp	vend.2d, vend.2d, vend.2d
103	mov	synd, vend.d[0]
104	/* We're not out of data, loop if we haven't found the character */
105	cbz	synd, L(loop)
106
107L(end):
108	/* Termination condition found, let's calculate the syndrome value */
109	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
110	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
111	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
112	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
113	mov	synd, vend.d[0]
114	/* Only do the clear for the last possible block */
115	b.hs	L(tail)
116
117L(masklast):
118	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
119	add	tmp, cntrem, soff
120	and	tmp, tmp, #31
121	sub	tmp, tmp, #32
122	neg	tmp, tmp, lsl #1
123	lsl	synd, synd, tmp
124	lsr	synd, synd, tmp
125
126L(tail):
127	/* Count the trailing zeros using bit reversing */
128	rbit	synd, synd
129	/* Compensate the last post-increment */
130	sub	src, src, #32
131	/* Check that we have found a character */
132	cmp	synd, #0
133	/* And count the leading zeros */
134	clz	synd, synd
135	/* Compute the potential result */
136	add	result, src, synd, lsr #1
137	/* Select result or NULL */
138	csel	result, xzr, result, eq
139	ret
140
141L(zero_length):
142	mov	result, #0
143	ret
144
145END (__memchr_aarch64)
146
147