/* xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strlen.S
   (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7) */
/*
 * strlen - calculate the length of a string.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 * Not MTE compatible.
 */

14#include "asmdefs.h"
15
16#define srcin	x0
17#define len	x0
18
19#define src	x1
20#define data1	x2
21#define data2	x3
22#define has_nul1 x4
23#define has_nul2 x5
24#define tmp1	x4
25#define tmp2	x5
26#define tmp3	x6
27#define tmp4	x7
28#define zeroones x8
29
30#define maskv	v0
31#define maskd	d0
32#define dataq1	q1
33#define dataq2	q2
34#define datav1	v1
35#define datav2	v2
36#define tmp	x2
37#define tmpw	w2
38#define synd	x3
39#define syndw	w3
40#define shift	x4
41
42/* For the first 32 bytes, NUL detection works on the principle that
43   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
44   byte is zero, and can be done in parallel across the entire word.  */
45
46#define REP8_01 0x0101010101010101
47#define REP8_7f 0x7f7f7f7f7f7f7f7f
48
49/* To test the page crossing code path more thoroughly, compile with
50   -DTEST_PAGE_CROSS - this will force all calls through the slower
51   entry path.  This option is not intended for production use.  */
52
53#ifdef TEST_PAGE_CROSS
54# define MIN_PAGE_SIZE 32
55#else
56# define MIN_PAGE_SIZE 4096
57#endif

/* Core algorithm:

   Since strings are short on average, we check the first 32 bytes of the
   string for a NUL character without aligning the string.  In order to use
   unaligned loads safely we must do a page cross check first.

   If there is a NUL byte we calculate the length from the 2 8-byte words
   using conditional select to reduce branch mispredictions (it is unlikely
   strlen will be repeatedly called on strings with the same length).

   If the string is longer than 32 bytes, align src so we don't need further
   page cross checks, and process 32 bytes per iteration using a fast SIMD
   loop.

   If the page cross check fails, we read 32 bytes from an aligned address,
   and ignore any characters before the string.  If it contains a NUL
   character, return the length, if not, continue in the main loop.  */

77ENTRY (__strlen_aarch64)
78	and	tmp1, srcin, MIN_PAGE_SIZE - 1
79	cmp	tmp1, MIN_PAGE_SIZE - 32
80	b.hi	L(page_cross)
81
82	/* Look for a NUL byte in the first 16 bytes.  */
83	ldp	data1, data2, [srcin]
84	mov	zeroones, REP8_01
85
86#ifdef __AARCH64EB__
87	/* For big-endian, carry propagation (if the final byte in the
88	   string is 0x01) means we cannot use has_nul1/2 directly.
89	   Since we expect strings to be small and early-exit,
90	   byte-swap the data now so has_null1/2 will be correct.  */
91	rev	data1, data1
92	rev	data2, data2
93#endif
94	sub	tmp1, data1, zeroones
95	orr	tmp2, data1, REP8_7f
96	sub	tmp3, data2, zeroones
97	orr	tmp4, data2, REP8_7f
98	bics	has_nul1, tmp1, tmp2
99	bic	has_nul2, tmp3, tmp4
100	ccmp	has_nul2, 0, 0, eq
101	b.eq	L(bytes16_31)
102
103	/* Find the exact offset of the first NUL byte in the first 16 bytes
104	   from the string start.  Enter with C = has_nul1 == 0.  */
105	csel	has_nul1, has_nul1, has_nul2, cc
106	mov	len, 8
107	rev	has_nul1, has_nul1
108	csel	len, xzr, len, cc
109	clz	tmp1, has_nul1
110	add	len, len, tmp1, lsr 3
111	ret
112
113	/* Look for a NUL byte at offset 16..31 in the string.  */
114L(bytes16_31):
115	ldp	data1, data2, [srcin, 16]
116#ifdef __AARCH64EB__
117	rev	data1, data1
118	rev	data2, data2
119#endif
120	sub	tmp1, data1, zeroones
121	orr	tmp2, data1, REP8_7f
122	sub	tmp3, data2, zeroones
123	orr	tmp4, data2, REP8_7f
124	bics	has_nul1, tmp1, tmp2
125	bic	has_nul2, tmp3, tmp4
126	ccmp	has_nul2, 0, 0, eq
127	b.eq	L(loop_entry)
128
129	/* Find the exact offset of the first NUL byte at offset 16..31 from
130	   the string start.  Enter with C = has_nul1 == 0.  */
131	csel	has_nul1, has_nul1, has_nul2, cc
132	mov	len, 24
133	rev	has_nul1, has_nul1
134	mov	tmp3, 16
135	clz	tmp1, has_nul1
136	csel	len, tmp3, len, cc
137	add	len, len, tmp1, lsr 3
138	ret
139
140	nop
141L(loop_entry):
142	bic	src, srcin, 31
143
144	.p2align 5
145L(loop):
146	ldp	dataq1, dataq2, [src, 32]!
147	uminp	maskv.16b, datav1.16b, datav2.16b
148	uminp	maskv.16b, maskv.16b, maskv.16b
149	cmeq	maskv.8b, maskv.8b, 0
150	fmov	synd, maskd
151	cbz	synd, L(loop)
152
153	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
154	cmeq	maskv.16b, datav1.16b, 0
155	sub	len, src, srcin
156	cbnz	syndw, 1f
157	cmeq	maskv.16b, datav2.16b, 0
158	add	len, len, 16
1591:
160	/* Generate a bitmask and compute correct byte offset.  */
161	shrn	maskv.8b, maskv.8h, 4
162	fmov	synd, maskd
163#ifndef __AARCH64EB__
164	rbit	synd, synd
165#endif
166	clz	tmp, synd
167	add	len, len, tmp, lsr 2
168	ret
169
170L(page_cross):
171	bic	src, srcin, 31
172	mov	tmpw, 0x0c03
173	movk	tmpw, 0xc030, lsl 16
174	ld1	{datav1.16b, datav2.16b}, [src]
175	dup	maskv.4s, tmpw
176	cmeq	datav1.16b, datav1.16b, 0
177	cmeq	datav2.16b, datav2.16b, 0
178	and	datav1.16b, datav1.16b, maskv.16b
179	and	datav2.16b, datav2.16b, maskv.16b
180	addp	maskv.16b, datav1.16b, datav2.16b
181	addp	maskv.16b, maskv.16b, maskv.16b
182	fmov	synd, maskd
183	lsl	shift, srcin, 1
184	lsr	synd, synd, shift
185	cbz	synd, L(loop)
186
187	rbit	synd, synd
188	clz	len, synd
189	lsr	len, len, 1
190	ret
191
192END (__strlen_aarch64)
193