xref: /freebsd/contrib/cortex-strings/src/aarch64/strlen.S (revision f6a3b357e9be4c6423c85eff9a847163a0d307c8)
1/* Copyright (c) 2013-2015, Linaro Limited
2   All rights reserved.
3
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions are met:
6       * Redistributions of source code must retain the above copyright
7	 notice, this list of conditions and the following disclaimer.
8       * Redistributions in binary form must reproduce the above copyright
9	 notice, this list of conditions and the following disclaimer in the
10	 documentation and/or other materials provided with the distribution.
11       * Neither the name of the Linaro nor the
12	 names of its contributors may be used to endorse or promote products
13	 derived from this software without specific prior written permission.
14
15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
26
27/* Assumptions:
28 *
29 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
30 */
31
32/* To test the page crossing code path more thoroughly, compile with
33   -DTEST_PAGE_CROSS - this will force all calls through the slower
34   entry path.  This option is not intended for production use.	 */
35
36/* Arguments and results.  */
37#define srcin		x0
38#define len		x0
39
40/* Locals and temporaries.  */
41#define src		x1
42#define data1		x2
43#define data2		x3
44#define has_nul1	x4
45#define has_nul2	x5
46#define tmp1		x4
47#define tmp2		x5
48#define tmp3		x6
49#define tmp4		x7
50#define zeroones	x8
51
52#define L(l) .L ## l
53
54	.macro def_fn f p2align=0
55	.text
56	.p2align \p2align
57	.global \f
58	.type \f, %function
59\f:
60	.endm
61
62	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
63	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
64	   can be done in parallel across the entire word. A faster check
65	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
66	   false hits for characters 129..255.	*/
67
68#define REP8_01 0x0101010101010101
69#define REP8_7f 0x7f7f7f7f7f7f7f7f
70#define REP8_80 0x8080808080808080
71
72#ifdef TEST_PAGE_CROSS
73# define MIN_PAGE_SIZE 15
74#else
75# define MIN_PAGE_SIZE 4096
76#endif
77
78	/* Since strings are short on average, we check the first 16 bytes
79	   of the string for a NUL character.  In order to do an unaligned ldp
80	   safely we have to do a page cross check first.  If there is a NUL
81	   byte we calculate the length from the 2 8-byte words using
82	   conditional select to reduce branch mispredictions (it is unlikely
83	   strlen will be repeatedly called on strings with the same length).
84
85	   If the string is longer than 16 bytes, we align src so don't need
86	   further page cross checks, and process 32 bytes per iteration
87	   using the fast NUL check.  If we encounter non-ASCII characters,
88	   fallback to a second loop using the full NUL check.
89
90	   If the page cross check fails, we read 16 bytes from an aligned
91	   address, remove any characters before the string, and continue
92	   in the main loop using aligned loads.  Since strings crossing a
93	   page in the first 16 bytes are rare (probability of
94	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
95
96	   AArch64 systems have a minimum page size of 4k.  We don't bother
97	   checking for larger page sizes - the cost of setting up the correct
98	   page size is just not worth the extra gain from a small reduction in
99	   the cases taking the slow path.  Note that we only care about
100	   whether the first fetch, which may be misaligned, crosses a page
101	   boundary.  */
102
103def_fn strlen p2align=6
104	and	tmp1, srcin, MIN_PAGE_SIZE - 1
105	mov	zeroones, REP8_01
106	cmp	tmp1, MIN_PAGE_SIZE - 16
107	b.gt	L(page_cross)
108	ldp	data1, data2, [srcin]
109#ifdef __AARCH64EB__
110	/* For big-endian, carry propagation (if the final byte in the
111	   string is 0x01) means we cannot use has_nul1/2 directly.
112	   Since we expect strings to be small and early-exit,
113	   byte-swap the data now so has_null1/2 will be correct.  */
114	rev	data1, data1
115	rev	data2, data2
116#endif
117	sub	tmp1, data1, zeroones
118	orr	tmp2, data1, REP8_7f
119	sub	tmp3, data2, zeroones
120	orr	tmp4, data2, REP8_7f
121	bics	has_nul1, tmp1, tmp2
122	bic	has_nul2, tmp3, tmp4
123	ccmp	has_nul2, 0, 0, eq
124	beq	L(main_loop_entry)
125
126	/* Enter with C = has_nul1 == 0.  */
127	csel	has_nul1, has_nul1, has_nul2, cc
128	mov	len, 8
129	rev	has_nul1, has_nul1
130	clz	tmp1, has_nul1
131	csel	len, xzr, len, cc
132	add	len, len, tmp1, lsr 3
133	ret
134
135	/* The inner loop processes 32 bytes per iteration and uses the fast
136	   NUL check.  If we encounter non-ASCII characters, use a second
137	   loop with the accurate NUL check.  */
138	.p2align 4
139L(main_loop_entry):
140	bic	src, srcin, 15
141	sub	src, src, 16
142L(main_loop):
143	ldp	data1, data2, [src, 32]!
144.Lpage_cross_entry:
145	sub	tmp1, data1, zeroones
146	sub	tmp3, data2, zeroones
147	orr	tmp2, tmp1, tmp3
148	tst	tmp2, zeroones, lsl 7
149	bne	1f
150	ldp	data1, data2, [src, 16]
151	sub	tmp1, data1, zeroones
152	sub	tmp3, data2, zeroones
153	orr	tmp2, tmp1, tmp3
154	tst	tmp2, zeroones, lsl 7
155	beq	L(main_loop)
156	add	src, src, 16
1571:
158	/* The fast check failed, so do the slower, accurate NUL check.	 */
159	orr	tmp2, data1, REP8_7f
160	orr	tmp4, data2, REP8_7f
161	bics	has_nul1, tmp1, tmp2
162	bic	has_nul2, tmp3, tmp4
163	ccmp	has_nul2, 0, 0, eq
164	beq	L(nonascii_loop)
165
166	/* Enter with C = has_nul1 == 0.  */
167L(tail):
168#ifdef __AARCH64EB__
169	/* For big-endian, carry propagation (if the final byte in the
170	   string is 0x01) means we cannot use has_nul1/2 directly.  The
171	   easiest way to get the correct byte is to byte-swap the data
172	   and calculate the syndrome a second time.  */
173	csel	data1, data1, data2, cc
174	rev	data1, data1
175	sub	tmp1, data1, zeroones
176	orr	tmp2, data1, REP8_7f
177	bic	has_nul1, tmp1, tmp2
178#else
179	csel	has_nul1, has_nul1, has_nul2, cc
180#endif
181	sub	len, src, srcin
182	rev	has_nul1, has_nul1
183	add	tmp2, len, 8
184	clz	tmp1, has_nul1
185	csel	len, len, tmp2, cc
186	add	len, len, tmp1, lsr 3
187	ret
188
189L(nonascii_loop):
190	ldp	data1, data2, [src, 16]!
191	sub	tmp1, data1, zeroones
192	orr	tmp2, data1, REP8_7f
193	sub	tmp3, data2, zeroones
194	orr	tmp4, data2, REP8_7f
195	bics	has_nul1, tmp1, tmp2
196	bic	has_nul2, tmp3, tmp4
197	ccmp	has_nul2, 0, 0, eq
198	bne	L(tail)
199	ldp	data1, data2, [src, 16]!
200	sub	tmp1, data1, zeroones
201	orr	tmp2, data1, REP8_7f
202	sub	tmp3, data2, zeroones
203	orr	tmp4, data2, REP8_7f
204	bics	has_nul1, tmp1, tmp2
205	bic	has_nul2, tmp3, tmp4
206	ccmp	has_nul2, 0, 0, eq
207	beq	L(nonascii_loop)
208	b	L(tail)
209
210	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
211	   srcin to 0x7f, so we ignore any NUL bytes before the string.
212	   Then continue in the aligned loop.  */
213L(page_cross):
214	bic	src, srcin, 15
215	ldp	data1, data2, [src]
216	lsl	tmp1, srcin, 3
217	mov	tmp4, -1
218#ifdef __AARCH64EB__
219	/* Big-endian.	Early bytes are at MSB.	 */
220	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
221#else
222	/* Little-endian.  Early bytes are at LSB.  */
223	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
224#endif
225	orr	tmp1, tmp1, REP8_80
226	orn	data1, data1, tmp1
227	orn	tmp2, data2, tmp1
228	tst	srcin, 8
229	csel	data1, data1, tmp4, eq
230	csel	data2, data2, tmp2, eq
231	b	L(page_cross_entry)
232
233	.size	strlen, . - strlen
234