/* Origin: freebsd/contrib/cortex-strings/src/aarch64/memcpy.S
   (revision b2d2a78ad80ec68d4a17f5aef97d21686cb1e29b).  */

/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * Copyright (c) 2015 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses are supported.
 *
 */

/* Register roles.

   Arguments and pointers:
     dstin   destination pointer as passed in x0
     src     source pointer as passed in x1
     count   byte count as passed in x2
     dst     working destination pointer
     srcend  one past the last source byte
     dstend  one past the last destination byte

   Data registers: A_l/A_h .. D_l/D_h each form a 16-byte pair for
   ldp/stp.  The *w names are the 32-bit views of the same registers,
   used for 4-byte and 1-byte accesses.

   Deliberate aliasing, to keep register usage low:
     E_l/E_h reuse src/count and F_l/F_h reuse dst/srcend, so loading
       them destroys those pointers.  They may only be loaded once the
       aliased pointer is no longer needed on that path.
     tmp1 shares x9 with B_h, so tmp1 is dead as soon as B_h is loaded.  */
#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	dst
#define F_h	srcend
#define tmp1	x9

/* L(name) pastes ".L" in front of NAME.  GNU as treats symbols with the
   .L prefix as local labels that are not exported.  */
#define L(l) .L ## l

/* def_fn NAME [p2align=N]

   Opens a function definition: selects the .text section, aligns the
   location counter to 2^N bytes (N defaults to 0, i.e. no extra
   alignment), declares NAME as a global symbol of type %function and
   emits its entry label.  The macro does not emit a .size directive;
   the user of the macro provides it after the function body.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   Small and medium copies read all data before writing, allowing any
   kind of overlap, and memmove tailcalls memcpy for these cases as
   well as non-overlapping copies.
*/

/* void *memcpy (void *dstin, const void *src, size_t count)
 *
 * In:    x0 = dstin, x1 = src, x2 = count (bytes).
 * Out:   x0 = dstin; the register is never written, so it is returned
 *        unchanged.
 * Uses:  x1-x13 and the condition flags.  The stack is not accessed.
 *
 * Dispatch on count:
 *    0..16   L(copy16)     two possibly overlapping accesses, or 3 bytes
 *   17..64   fall through  16 or 32 bytes from each end
 *   65..96   L(copy96)     64 bytes from the start, 32 from the end
 *   97..     L(copy_long)  64-byte loop on a 16-byte aligned destination
 *
 * Every path except L(copy_long) issues all of its loads before its
 * first store.  The entry point is aligned to 64 bytes (p2align=6).
 */
def_fn memcpy p2align=6
	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  tmp1 = count - 1 lies in 16..95,
	   so bit 6 set means 65..96 bytes and, with bit 6 clear, bit 5 set
	   means 33..64 bytes.  tmp1 aliases B_h and is dead once B_h is
	   loaded, which only happens after the last test of tmp1.  */
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	/* 33..64 bytes: also copy the second and second-to-last 16 bytes.  */
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	/* First and last 16 bytes; the two stores overlap when count < 32.  */
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
	cmp	count, 8
	b.lo	1f
	/* 8..16 bytes: first and last 8 bytes, overlapping if count < 16.  */
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	/* count < 8 here; bit 2 set means 4..7 bytes: first and last 4
	   bytes, overlapping if count < 8.  */
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.
	   tmp1 = count / 2 is the offset of the middle byte.  */
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

	.p2align 4
	/* Copy 65..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  A_l/A_h already hold the first 16 bytes.
	   E_l/E_h alias src/count and F_l/F_h alias dst/srcend, so the E
	   and F loads must come last: after them src and srcend are gone
	   and only dstin/dstend are used for addressing.  */
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are more than 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration, with its loads running one
	   iteration ahead of its stores.

	   tmp1 = dstin & 15 is the misalignment of the destination.  dst is
	   dstin rounded down to 16 bytes, and src is moved back by the same
	   tmp1 bytes so that [src, N] and [dst, N] address corresponding
	   bytes.  tmp1 aliases B_h and is dead before B_h is loaded.  */

	.p2align 4
L(copy_long):
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]	/* First 16 bytes, unaligned store.  */
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	2f
1:
	/* Store the 64 bytes held in A..D and reload each pair from the
	   next 64 source bytes; the last store and load advance dst and
	   src by 64 through pre-index writeback.  */
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  Each pair is stored before it is
	   reloaded from the end of the source.  Loading E_l/E_h overwrites
	   src and count, which are no longer needed here.  */
2:
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.size	memcpy, . - memcpy
