xref: /freebsd/contrib/cortex-strings/src/aarch64/strcpy.S (revision 8c4282b370bd66908b45b6a223226a9fc2b69d57)
1/*
2   strcpy/stpcpy - copy a string returning pointer to start/end.
3
4   Copyright (c) 2013, 2014, 2015 ARM Ltd.
5   All Rights Reserved.
6
7   Redistribution and use in source and binary forms, with or without
8   modification, are permitted provided that the following conditions are met:
9       * Redistributions of source code must retain the above copyright
10         notice, this list of conditions and the following disclaimer.
11       * Redistributions in binary form must reproduce the above copyright
12         notice, this list of conditions and the following disclaimer in the
13         documentation and/or other materials provided with the distribution.
14       * Neither the name of the company nor the names of its contributors
15         may be used to endorse or promote products derived from this
16         software without specific prior written permission.
17
18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
29
30/* Assumptions:
31 *
32 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
33 */
34
35/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
36
37   To test the page crossing code path more thoroughly, compile with
38   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
39   entry path.  This option is not intended for production use.  */
40
41/* Arguments and results.  */
/* dstin is x0, which is also where the result is returned: the strcpy
   build leaves it untouched, the stpcpy build overwrites it just before
   each ret.  */
42#define dstin		x0
43#define srcin		x1
44
45/* Locals and temporaries.  */
/* All aliases below name registers in the range x2-x17, which AAPCS64
   treats as caller-saved, so the routine never saves or restores them.
   The xN/wN pairs (data1/data1w, data2/data2w) are the 64-bit and 32-bit
   views of the same register.  */
46#define src		x2
47#define dst		x3
48#define data1		x4
49#define data1w		w4
50#define data2		x5
51#define data2w		w5
52#define has_nul1	x6
53#define has_nul2	x7
54#define tmp1		x8
55#define tmp2		x9
56#define tmp3		x10
57#define tmp4		x11
58#define zeroones	x12
59#define data1a		x13
60#define data2a		x14
61#define pos		x15
/* NOTE(review): len is defined but not referenced by any instruction in
   this file.  len and to_align use x16/x17 (IP0/IP1), which linker veneers
   may clobber across a call; the routine in this file makes no calls.  */
62#define len		x16
63#define to_align	x17
64
65#ifdef BUILD_STPCPY
/* STRCPY is the name of the exported entry point.  With BUILD_STPCPY
   defined, the same source assembles as stpcpy and the BUILD_STPCPY-guarded
   instructions further down set x0 to the end of the copied string.  */
66#define STRCPY stpcpy
67#else
/* Default build: plain strcpy, x0 is returned as it was passed in.  */
68#define STRCPY strcpy
69#endif
70
71	.macro def_fn f p2align=0	/* f: symbol name; p2align: log2 of the entry alignment (default 0).  */
72	.text				/* Emit into the code section.  */
73	.p2align \p2align		/* Align the entry point to 2^p2align bytes.  */
74	.global \f			/* Export the symbol.  */
75	.type \f, %function		/* Mark the symbol as a function.  */
76\f:
77	.endm
78
79	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
80	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
81	   can be done in parallel across the entire word.  */
82
/* Byte patterns replicated across all eight bytes of a 64-bit register.
   In the code below REP8_01 is the value subtracted (the "X - 1" step,
   kept in the zeroones register) and REP8_7f is the value ORed in (the
   "X | 0x7f" step).  */
83#define REP8_01 0x0101010101010101
84#define REP8_7f 0x7f7f7f7f7f7f7f7f
/* NOTE(review): REP8_80 is not referenced by any instruction in this file.  */
85#define REP8_80 0x8080808080808080
86
87	/* AArch64 systems have a minimum page size of 4k.  We can do a quick
88	   page size check for crossing this boundary on entry and if we
89	   do not, then we can short-circuit much of the entry code.  We
90	   expect early page-crossing strings to be rare (probability of
91	   16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
92	   predictable, even with random strings.
93
94	   We don't bother checking for larger page sizes; the cost of setting
95	   up the correct page size is just not worth the extra gain from
96	   a small reduction in the cases taking the slow path.  Note that
97	   we only care about whether the first fetch, which may be
98	   misaligned, crosses a page boundary - after that we move to aligned
99	   fetches for the remainder of the string.  */
100
101#ifdef STRCPY_TEST_PAGE_CROSS
102	/* Make everything that isn't Qword aligned look like a page cross.  */
/* Test build: a 16-byte "page", so every source pointer that is not
   16-byte aligned is sent through .Lpage_cross.  */
103#define MIN_PAGE_P2 4
104#else
/* Normal build: log2 of the 4k minimum page size.  */
105#define MIN_PAGE_P2 12
106#endif
107
/* Page size in bytes; the entry code masks srcin with (MIN_PAGE_SIZE - 1)
   and compares the result against (MIN_PAGE_SIZE - 16).  */
108#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
109
110def_fn STRCPY p2align=6
	/* Register interface, as used below: x0 = dstin (destination) and
	   x1 = srcin (NUL-terminated source) on entry.  The strcpy build
	   never writes x0, so it returns dstin; the BUILD_STPCPY build sets
	   x0 to the address of the NUL it stored in the destination.  The
	   routine makes no calls, never touches the stack, and uses no
	   register outside x0-x17 (plus the condition flags).  */
111	/* For moderately short strings, the fastest way to do the copy is to
112	   calculate the length of the string in the same way as strlen, then
113	   essentially do a memcpy of the result.  This avoids the need for
114	   multiple byte copies and further means that by the time we
115	   reach the bulk copy loop we know we can always use DWord
116	   accesses.  We expect strcpy to rarely be called repeatedly
117	   with the same source string, so branch prediction is likely to
118	   always be difficult - we mitigate this by preferring
119	   conditional select operations over branches whenever this is
120	   feasible.  */
121	and	tmp2, srcin, #(MIN_PAGE_SIZE - 1)	/* Offset of srcin within its page.  */
122	mov	zeroones, #REP8_01
123	and	to_align, srcin, #15	/* Offset of srcin within its 16-byte block.  */
124	cmp	tmp2, #(MIN_PAGE_SIZE - 16)
125	neg	tmp1, to_align	/* Consumed only on the .Lpage_cross path.  */
126	/* The first fetch will straddle a (possible) page boundary iff
127	   srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
128	   aligned string will never fail the page align check, so will
129	   always take the fast path.  */
130	b.gt	.Lpage_cross
131
132.Lpage_cross_ok:
	/* Fast path.  Also re-entered from .Lpage_cross when the aligned
	   16-byte block containing srcin holds no NUL at or after srcin,
	   i.e. the string is known to continue past that block.  */
133	ldp	data1, data2, [srcin]
134#ifdef __AARCH64EB__
135	/* Because we expect the end to be found within 16 characters
136	   (profiling shows this is the most common case), it's worth
137	   swapping the bytes now to save having to recalculate the
138	   termination syndrome later.  We preserve data1 and data2
139	   so that we can re-use the values later on.  */
140	rev	tmp2, data1
141	sub	tmp1, tmp2, zeroones
142	orr	tmp2, tmp2, #REP8_7f
143	bics	has_nul1, tmp1, tmp2
144	b.ne	.Lfp_le8
145	rev	tmp4, data2
146	sub	tmp3, tmp4, zeroones
147	orr	tmp4, tmp4, #REP8_7f
148#else
149	sub	tmp1, data1, zeroones
150	orr	tmp2, data1, #REP8_7f
151	bics	has_nul1, tmp1, tmp2
152	b.ne	.Lfp_le8
153	sub	tmp3, data2, zeroones
154	orr	tmp4, data2, #REP8_7f
155#endif
156	bics	has_nul2, tmp3, tmp4
157	b.eq	.Lbulk_entry	/* No NUL in the first 16 bytes.  */
158
159	/* The string is short (<=16 bytes).  We don't know exactly how
160	   short though, yet.  Work out the exact length so that we can
161	   quickly select the optimal copy strategy.  */
162.Lfp_gt8:
	/* The NUL is in data2.  rev + clz turns the syndrome into
	   pos = 8 * (index of the first NUL byte within data2).  data2 is
	   then shifted so that the NUL becomes the last byte stored, and is
	   written so that this byte lands at dst + 8.  The zero bytes
	   shifted in fall inside the first eight destination bytes, so the
	   data1 store must come second to put the real data back.  */
163	rev	has_nul2, has_nul2
164	clz	pos, has_nul2
165	mov	tmp2, #56
166	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
167	sub	pos, tmp2, pos
168#ifdef __AARCH64EB__
169	lsr	data2, data2, pos
170#else
171	lsl	data2, data2, pos
172#endif
173	str	data2, [dst, #1]
174	str	data1, [dstin]
175#ifdef BUILD_STPCPY
176	add	dstin, dst, #8	/* Address of the NUL just stored.  */
177#endif
178	ret
179
180.Lfp_le8:
	/* The NUL is in data1; pos = 8 * (index of the first NUL byte).
	   tmp2 = pos - 24 is negative when that index is below 3.  */
181	rev	has_nul1, has_nul1
182	clz	pos, has_nul1
183	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
184	subs	tmp2, pos, #24			/* Pos in bits. */
185	b.lt	.Lfp_lt4
186#ifdef __AARCH64EB__
187	mov	tmp2, #56
188	sub	pos, tmp2, pos
189	lsr	data2, data1, pos
190	lsr	data1, data1, #32
191#else
192	lsr	data2, data1, tmp2
193#endif
194	/* 4->8 bytes to copy, counting the NUL (NUL index 3..7).  Done as
	   two 32-bit stores that may overlap: the four bytes ending at the
	   NUL, and the first four bytes.  */
195	str	data2w, [dst, #-3]
196	str	data1w, [dstin]
197#ifdef BUILD_STPCPY
198	mov	dstin, dst
199#endif
200	ret
201.Lfp_lt4:
202	cbz	pos, .Lfp_lt2	/* pos == 0: the first byte is the NUL.  */
203	/* 2->3 bytes to copy.  */
204#ifdef __AARCH64EB__
205	lsr	data1, data1, #48
206#endif
207	strh	data1w, [dstin]
208	/* Fall-through, one byte (max) to go.  */
209.Lfp_lt2:
210	/* Null-terminated string.  Last character must be zero!  */
211	strb	wzr, [dst]
212#ifdef BUILD_STPCPY
213	mov	dstin, dst
214#endif
215	ret
216
217	.p2align 6
218	/* Aligning here ensures that the entry code and main loop all lie
219	   within one 64-byte cache line.  */
220.Lbulk_entry:
	/* No NUL in the first 16 bytes.  Store them as loaded, then advance
	   src to the next 16-byte boundary above srcin and dst by the same
	   distance: to_align becomes (srcin & 15) - 16, which is minus that
	   distance.  The loop's loads are therefore 16-byte aligned; bytes
	   between the boundary and srcin + 16 are simply copied again.  */
221	sub	to_align, to_align, #16
222	stp	data1, data2, [dstin]
223	sub	src, srcin, to_align
224	sub	dst, dstin, to_align
225	b	.Lentry_no_page_cross
226
227	/* The inner loop deals with two Dwords at a time.  This has a
228	   slightly higher start-up cost, but we should win quite quickly,
229	   especially on cores with a high number of issue slots per
230	   cycle, as we get much better parallelism out of the operations.  */
231.Lmain_loop:
	/* Store the 16 bytes loaded on the previous pass, which are already
	   known to contain no NUL.  */
232	stp	data1, data2, [dst], #16
233.Lentry_no_page_cross:
234	ldp	data1, data2, [src], #16
235	sub	tmp1, data1, zeroones
236	orr	tmp2, data1, #REP8_7f
237	sub	tmp3, data2, zeroones
238	orr	tmp4, data2, #REP8_7f
239	bic	has_nul1, tmp1, tmp2
240	bics	has_nul2, tmp3, tmp4
	/* If has_nul2 is zero, compare has_nul1 with zero; otherwise force
	   the flags to "not equal".  Z ends up set only when neither Dword
	   contains a NUL.  */
241	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
242	b.eq	.Lmain_loop
243
244	/* Since we know we are copying at least 16 bytes, the fastest way
245	   to deal with the tail is to determine the location of the
246	   trailing NUL, then (re)copy the 16 bytes leading up to that.  */
247	cmp	has_nul1, #0	/* NE: the NUL is in data1.  These flags feed the csel on pos below.  */
248#ifdef __AARCH64EB__
249	/* For big-endian, carry propagation (if the final byte in the
250	   string is 0x01) means we cannot use has_nul directly.  The
251	   easiest way to get the correct byte is to byte-swap the data
252	   and calculate the syndrome a second time.  */
253	csel	data1, data1, data2, ne
254	rev	data1, data1
255	sub	tmp1, data1, zeroones
256	orr	tmp2, data1, #REP8_7f
257	bic	has_nul1, tmp1, tmp2
258#else
259	csel	has_nul1, has_nul1, has_nul2, ne
260#endif
261	rev	has_nul1, has_nul1
262	clz	pos, has_nul1
	/* pos + 8 is the bit count up to and including the NUL byte within
	   its Dword; a further 64 is added when the NUL was in data2.  No
	   instruction since the cmp above writes the flags.  */
263	add	tmp1, pos, #72
264	add	pos, pos, #8
265	csel	pos, pos, tmp1, ne
266	add	src, src, pos, lsr #3
267	add	dst, dst, pos, lsr #3
	/* src was already post-incremented past the final 16-byte block by
	   the load, dst was not (its store sits at the top of the loop),
	   hence -32 versus -16: both address the 16 bytes that end with the
	   NUL.  */
268	ldp	data1, data2, [src, #-32]
269	stp	data1, data2, [dst, #-16]
270#ifdef BUILD_STPCPY
271	sub	dstin, dst, #1	/* dst is one past the NUL.  */
272#endif
273	ret
274
275.Lpage_cross:
	/* Reached only when (srcin & (MIN_PAGE_SIZE - 1)) exceeds
	   MIN_PAGE_SIZE - 16, so to_align is non-zero here.  tmp1 still
	   holds -to_align from the entry sequence.  */
276	bic	src, srcin, #15
277	/* Start by loading two words at [srcin & ~15], then forcing the
278	   bytes that precede srcin to 0xff.  This means they never look
279	   like termination bytes.  */
280	ldp	data1, data2, [src]
281	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits.  */
282	tst	to_align, #7
283	csetm	tmp2, ne	/* All ones, or zero when to_align is a multiple of 8.  */
284#ifdef __AARCH64EB__
285	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
286#else
287	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
288#endif
	/* tmp2 now masks the (to_align & 7) bytes that precede srcin within
	   a Dword.  If srcin lies in the first Dword (to_align < 8) the mask
	   applies to data1 and data2 is used as loaded; otherwise all of
	   data1 precedes srcin and is forced to all ones, and the mask
	   applies to data2.  */
289	orr	data1, data1, tmp2
290	orr	data2a, data2, tmp2
291	cmp	to_align, #8
292	csinv	data1, data1, xzr, lt
293	csel	data2, data2, data2a, lt
294	sub	tmp1, data1, zeroones
295	orr	tmp2, data1, #REP8_7f
296	sub	tmp3, data2, zeroones
297	orr	tmp4, data2, #REP8_7f
298	bic	has_nul1, tmp1, tmp2
299	bics	has_nul2, tmp3, tmp4
300	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
	/* No NUL between srcin and the end of this aligned block: rejoin the
	   fast path and reload directly from srcin.  */
301	b.eq	.Lpage_cross_ok
302	/* We now need to make data1 and data2 look like they've been
303	   loaded directly from srcin.  Do a rotate on the 128-bit value.  */
304	lsl	tmp1, to_align, #3	/* Bytes->bits.  */
305	neg	tmp2, to_align, lsl #3
306#ifdef __AARCH64EB__
307	lsl	data1a, data1, tmp1
308	lsr	tmp4, data2, tmp2
309	lsl	data2, data2, tmp1
310	orr	tmp4, tmp4, data1a
311	cmp	to_align, #8
312	csel	data1, tmp4, data2, lt
313	rev	tmp2, data1
314	rev	tmp4, data2
315	sub	tmp1, tmp2, zeroones
316	orr	tmp2, tmp2, #REP8_7f
317	sub	tmp3, tmp4, zeroones
318	orr	tmp4, tmp4, #REP8_7f
319#else
320	lsr	data1a, data1, tmp1
321	lsl	tmp4, data2, tmp2
322	lsr	data2, data2, tmp1
323	orr	tmp4, tmp4, data1a
324	cmp	to_align, #8
325	csel	data1, tmp4, data2, lt
326	sub	tmp1, data1, zeroones
327	orr	tmp2, data1, #REP8_7f
328	sub	tmp3, data2, zeroones
329	orr	tmp4, data2, #REP8_7f
330#endif
	/* Recompute the syndromes on the realigned data and reuse the
	   short-string tails above.  A NUL was found in this block, so if it
	   is not in the realigned data1 it must be in data2.  */
331	bic	has_nul1, tmp1, tmp2
332	cbnz	has_nul1, .Lfp_le8
333	bic	has_nul2, tmp3, tmp4
334	b	.Lfp_gt8
335
336	.size	STRCPY, . - STRCPY
337