xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strlen.S (revision 072a4ba82a01476eaee33781ccd241033eefcf0b)
1/*
2 * strlen - calculate the length of a string.
3 *
4 * Copyright (c) 2020-2022, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
11 * Not MTE compatible.
12 */
13
14#include "asmdefs.h"
15
16#define srcin	x0
17#define len	x0
18
19#define src	x1
20#define data1	x2
21#define data2	x3
22#define has_nul1 x4
23#define has_nul2 x5
24#define tmp1	x4
25#define tmp2	x5
26#define tmp3	x6
27#define tmp4	x7
28#define zeroones x8
29
30#define maskv	v0
31#define maskd	d0
32#define dataq1	q1
33#define dataq2	q2
34#define datav1	v1
35#define datav2	v2
36#define tmp	x2
37#define tmpw	w2
38#define synd	x3
39#define syndw	w3
40#define shift	x4
41
42/* For the first 32 bytes, NUL detection works on the principle that
43   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
44   byte is zero, and can be done in parallel across the entire word.  */
45
46#define REP8_01 0x0101010101010101
47#define REP8_7f 0x7f7f7f7f7f7f7f7f
48
49/* To test the page crossing code path more thoroughly, compile with
50   -DTEST_PAGE_CROSS - this will force all calls through the slower
51   entry path.  This option is not intended for production use.  */
52
53#ifdef TEST_PAGE_CROSS
54# define MIN_PAGE_SIZE 32
55#else
56# define MIN_PAGE_SIZE 4096
57#endif
58
59/* Core algorithm:
60
61   Since strings are short on average, we check the first 32 bytes of the
62   string for a NUL character without aligning the string.  In order to use
63   unaligned loads safely we must do a page cross check first.
64
65   If there is a NUL byte we calculate the length from the 2 8-byte words
66   using conditional select to reduce branch mispredictions (it is unlikely
67   strlen will be repeatedly called on strings with the same length).
68
69   If the string is longer than 32 bytes, align src so we don't need further
70   page cross checks, and process 32 bytes per iteration using a fast SIMD
71   loop.
72
73   If the page cross check fails, we read 32 bytes from an aligned address,
74   and ignore any characters before the string.  If it contains a NUL
75   character, return the length, if not, continue in the main loop.  */
76
77ENTRY (__strlen_aarch64)
78	PTR_ARG (0)
79	and	tmp1, srcin, MIN_PAGE_SIZE - 1
80	cmp	tmp1, MIN_PAGE_SIZE - 32
81	b.hi	L(page_cross)
82
83	/* Look for a NUL byte in the first 16 bytes.  */
84	ldp	data1, data2, [srcin]
85	mov	zeroones, REP8_01
86
87#ifdef __AARCH64EB__
88	/* For big-endian, carry propagation (if the final byte in the
89	   string is 0x01) means we cannot use has_nul1/2 directly.
90	   Since we expect strings to be small and early-exit,
91	   byte-swap the data now so has_null1/2 will be correct.  */
92	rev	data1, data1
93	rev	data2, data2
94#endif
95	sub	tmp1, data1, zeroones
96	orr	tmp2, data1, REP8_7f
97	sub	tmp3, data2, zeroones
98	orr	tmp4, data2, REP8_7f
99	bics	has_nul1, tmp1, tmp2
100	bic	has_nul2, tmp3, tmp4
101	ccmp	has_nul2, 0, 0, eq
102	b.eq	L(bytes16_31)
103
104	/* Find the exact offset of the first NUL byte in the first 16 bytes
105	   from the string start.  Enter with C = has_nul1 == 0.  */
106	csel	has_nul1, has_nul1, has_nul2, cc
107	mov	len, 8
108	rev	has_nul1, has_nul1
109	csel	len, xzr, len, cc
110	clz	tmp1, has_nul1
111	add	len, len, tmp1, lsr 3
112	ret
113
114	/* Look for a NUL byte at offset 16..31 in the string.  */
115L(bytes16_31):
116	ldp	data1, data2, [srcin, 16]
117#ifdef __AARCH64EB__
118	rev	data1, data1
119	rev	data2, data2
120#endif
121	sub	tmp1, data1, zeroones
122	orr	tmp2, data1, REP8_7f
123	sub	tmp3, data2, zeroones
124	orr	tmp4, data2, REP8_7f
125	bics	has_nul1, tmp1, tmp2
126	bic	has_nul2, tmp3, tmp4
127	ccmp	has_nul2, 0, 0, eq
128	b.eq	L(loop_entry)
129
130	/* Find the exact offset of the first NUL byte at offset 16..31 from
131	   the string start.  Enter with C = has_nul1 == 0.  */
132	csel	has_nul1, has_nul1, has_nul2, cc
133	mov	len, 24
134	rev	has_nul1, has_nul1
135	mov	tmp3, 16
136	clz	tmp1, has_nul1
137	csel	len, tmp3, len, cc
138	add	len, len, tmp1, lsr 3
139	ret
140
141	nop
142L(loop_entry):
143	bic	src, srcin, 31
144
145	.p2align 5
146L(loop):
147	ldp	dataq1, dataq2, [src, 32]!
148	uminp	maskv.16b, datav1.16b, datav2.16b
149	uminp	maskv.16b, maskv.16b, maskv.16b
150	cmeq	maskv.8b, maskv.8b, 0
151	fmov	synd, maskd
152	cbz	synd, L(loop)
153
154	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
155	cmeq	maskv.16b, datav1.16b, 0
156	sub	len, src, srcin
157	cbnz	syndw, 1f
158	cmeq	maskv.16b, datav2.16b, 0
159	add	len, len, 16
1601:
161	/* Generate a bitmask and compute correct byte offset.  */
162	shrn	maskv.8b, maskv.8h, 4
163	fmov	synd, maskd
164#ifndef __AARCH64EB__
165	rbit	synd, synd
166#endif
167	clz	tmp, synd
168	add	len, len, tmp, lsr 2
169	ret
170
171L(page_cross):
172	bic	src, srcin, 31
173	mov	tmpw, 0x0c03
174	movk	tmpw, 0xc030, lsl 16
175	ld1	{datav1.16b, datav2.16b}, [src]
176	dup	maskv.4s, tmpw
177	cmeq	datav1.16b, datav1.16b, 0
178	cmeq	datav2.16b, datav2.16b, 0
179	and	datav1.16b, datav1.16b, maskv.16b
180	and	datav2.16b, datav2.16b, maskv.16b
181	addp	maskv.16b, datav1.16b, datav2.16b
182	addp	maskv.16b, maskv.16b, maskv.16b
183	fmov	synd, maskd
184	lsl	shift, srcin, 1
185	lsr	synd, synd, shift
186	cbz	synd, L(loop)
187
188	rbit	synd, synd
189	clz	len, synd
190	lsr	len, len, 1
191	ret
192
193END (__strlen_aarch64)
194