xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strlen.S (revision 9729f076e4d93c5a37e78d427bfe0f1ab99bbcc6)
1/*
2 * strlen - calculate the length of a string.
3 *
4 * Copyright (c) 2020, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
11 * Not MTE compatible.
12 */
13
14#include "../asmdefs.h"
15
16#define srcin	x0
17#define len	x0
18
19#define src	x1
20#define data1	x2
21#define data2	x3
22#define has_nul1 x4
23#define has_nul2 x5
24#define tmp1	x4
25#define tmp2	x5
26#define tmp3	x6
27#define tmp4	x7
28#define zeroones x8
29
30#define maskv	v0
31#define maskd	d0
32#define dataq1	q1
33#define dataq2	q2
34#define datav1	v1
35#define datav2	v2
36#define tmp	x2
37#define tmpw	w2
38#define synd	x3
39#define shift	x4
40
41/* For the first 32 bytes, NUL detection works on the principle that
42   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
43   byte is zero, and can be done in parallel across the entire word.  */
44
45#define REP8_01 0x0101010101010101
46#define REP8_7f 0x7f7f7f7f7f7f7f7f
47
48/* To test the page crossing code path more thoroughly, compile with
49   -DTEST_PAGE_CROSS - this will force all calls through the slower
50   entry path.  This option is not intended for production use.  */
51
52#ifdef TEST_PAGE_CROSS
53# define MIN_PAGE_SIZE 32
54#else
55# define MIN_PAGE_SIZE 4096
56#endif
57
58/* Core algorithm:
59
60   Since strings are short on average, we check the first 32 bytes of the
61   string for a NUL character without aligning the string.  In order to use
62   unaligned loads safely we must do a page cross check first.
63
64   If there is a NUL byte we calculate the length from the 2 8-byte words
65   using conditional select to reduce branch mispredictions (it is unlikely
66   strlen will be repeatedly called on strings with the same length).
67
68   If the string is longer than 32 bytes, align src so we don't need further
69   page cross checks, and process 32 bytes per iteration using a fast SIMD
70   loop.
71
72   If the page cross check fails, we read 32 bytes from an aligned address,
73   and ignore any characters before the string.  If it contains a NUL
74   character, return the length, if not, continue in the main loop.  */
75
76ENTRY (__strlen_aarch64)
77	PTR_ARG (0)
78	and	tmp1, srcin, MIN_PAGE_SIZE - 1
79	cmp	tmp1, MIN_PAGE_SIZE - 32
80	b.hi	L(page_cross)
81
82	/* Look for a NUL byte in the first 16 bytes.  */
83	ldp	data1, data2, [srcin]
84	mov	zeroones, REP8_01
85
86#ifdef __AARCH64EB__
87	/* For big-endian, carry propagation (if the final byte in the
88	   string is 0x01) means we cannot use has_nul1/2 directly.
89	   Since we expect strings to be small and early-exit,
90	   byte-swap the data now so has_null1/2 will be correct.  */
91	rev	data1, data1
92	rev	data2, data2
93#endif
94	sub	tmp1, data1, zeroones
95	orr	tmp2, data1, REP8_7f
96	sub	tmp3, data2, zeroones
97	orr	tmp4, data2, REP8_7f
98	bics	has_nul1, tmp1, tmp2
99	bic	has_nul2, tmp3, tmp4
100	ccmp	has_nul2, 0, 0, eq
101	b.eq	L(bytes16_31)
102
103	/* Find the exact offset of the first NUL byte in the first 16 bytes
104	   from the string start.  Enter with C = has_nul1 == 0.  */
105	csel	has_nul1, has_nul1, has_nul2, cc
106	mov	len, 8
107	rev	has_nul1, has_nul1
108	csel	len, xzr, len, cc
109	clz	tmp1, has_nul1
110	add	len, len, tmp1, lsr 3
111	ret
112
113	.p2align 3
114	/* Look for a NUL byte at offset 16..31 in the string.  */
115L(bytes16_31):
116	ldp	data1, data2, [srcin, 16]
117#ifdef __AARCH64EB__
118	rev	data1, data1
119	rev	data2, data2
120#endif
121	sub	tmp1, data1, zeroones
122	orr	tmp2, data1, REP8_7f
123	sub	tmp3, data2, zeroones
124	orr	tmp4, data2, REP8_7f
125	bics	has_nul1, tmp1, tmp2
126	bic	has_nul2, tmp3, tmp4
127	ccmp	has_nul2, 0, 0, eq
128	b.eq	L(loop_entry)
129
130	/* Find the exact offset of the first NUL byte at offset 16..31 from
131	   the string start.  Enter with C = has_nul1 == 0.  */
132	csel	has_nul1, has_nul1, has_nul2, cc
133	mov	len, 24
134	rev	has_nul1, has_nul1
135	mov	tmp3, 16
136	clz	tmp1, has_nul1
137	csel	len, tmp3, len, cc
138	add	len, len, tmp1, lsr 3
139	ret
140
141L(loop_entry):
142	bic	src, srcin, 31
143
144	.p2align 5
145L(loop):
146	ldp	dataq1, dataq2, [src, 32]!
147	uminp	maskv.16b, datav1.16b, datav2.16b
148	uminp	maskv.16b, maskv.16b, maskv.16b
149	cmeq	maskv.8b, maskv.8b, 0
150	fmov	synd, maskd
151	cbz	synd, L(loop)
152
153	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
154	cmeq	maskv.16b, datav1.16b, 0
155	sub	len, src, srcin
156	tst	synd, 0xffffffff
157	b.ne	1f
158	cmeq	maskv.16b, datav2.16b, 0
159	add	len, len, 16
1601:
161	/* Generate a bitmask and compute correct byte offset.  */
162#ifdef __AARCH64EB__
163	bic	maskv.8h, 0xf0
164#else
165	bic	maskv.8h, 0x0f, lsl 8
166#endif
167	umaxp	maskv.16b, maskv.16b, maskv.16b
168	fmov	synd, maskd
169#ifndef __AARCH64EB__
170	rbit	synd, synd
171#endif
172	clz	tmp, synd
173	add	len, len, tmp, lsr 2
174	ret
175
176        .p2align 4
177
178L(page_cross):
179	bic	src, srcin, 31
180	mov	tmpw, 0x0c03
181	movk	tmpw, 0xc030, lsl 16
182	ld1	{datav1.16b, datav2.16b}, [src]
183	dup	maskv.4s, tmpw
184	cmeq	datav1.16b, datav1.16b, 0
185	cmeq	datav2.16b, datav2.16b, 0
186	and	datav1.16b, datav1.16b, maskv.16b
187	and	datav2.16b, datav2.16b, maskv.16b
188	addp	maskv.16b, datav1.16b, datav2.16b
189	addp	maskv.16b, maskv.16b, maskv.16b
190	fmov	synd, maskd
191	lsl	shift, srcin, 1
192	lsr	synd, synd, shift
193	cbz	synd, L(loop)
194
195	rbit	synd, synd
196	clz	len, synd
197	lsr	len, len, 1
198	ret
199
200END (__strlen_aarch64)
201