xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/memcpy.S (revision 59c8e88e72633afbc47a4ace0d2170d00d51f7dc)
1/*
2 * memcpy - copy memory area
3 *
4 * Copyright (c) 2012-2022, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, unaligned accesses.
11 *
12 */
13
14#include "asmdefs.h"
15
/* Register aliases used by the copy routine below.  Every name maps to a
   register in the range x0-x17; the _lw names are the 32-bit (w) views of
   the corresponding _l registers.  */

/* dstin, src and count are the registers that the entry sequence tags
   with PTR_ARG (0), PTR_ARG (1) and SIZE_ARG (2).  */
16#define dstin	x0
17#define src	x1
18#define count	x2
/* dst is the 16-byte aligned destination cursor of the forward loop.
   srcend and dstend are set to src + count and dstin + count on entry.  */
19#define dst	x3
20#define srcend	x4
21#define dstend	x5
/* Data registers A..F.  Each _l/_h pair is moved by a single ldp/stp
   (16 bytes).  */
22#define A_l	x6
23#define A_lw	w6
24#define A_h	x7
25#define B_l	x8
26#define B_lw	w8
27#define B_h	x9
28#define C_l	x10
29#define C_lw	w10
30#define C_h	x11
31#define D_l	x12
32#define D_h	x13
33#define E_l	x14
34#define E_h	x15
35#define F_l	x16
36#define F_h	x17
/* G and H are not separate registers: they reuse count, dst, src and
   srcend, so loading G or H destroys those values.  */
37#define G_l	count
38#define G_h	dst
39#define H_l	src
40#define H_h	srcend
/* tmp1 is the same register as E_l (x14); the two must not be live at the
   same time.  */
41#define tmp1	x14
42
43/* This implementation handles overlaps and supports both memcpy and memmove
44   from a single entry point.  It uses unaligned accesses and branchless
45   sequences to keep the code small, simple and improve performance.
46
47   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
48   copies of up to 128 bytes, and large copies.  The overhead of the overlap
49   check is negligible since it is only required for large copies.
50
51   Large copies use a software pipelined loop processing 64 bytes per iteration.
52   The destination pointer is 16-byte aligned to minimize unaligned accesses.
53   The loop tail is handled by always copying 64 bytes from the end.
54*/
55
/* __memcpy_aarch64 / __memmove_aarch64: copy count bytes from src to dstin.

   In:   dstin (x0) = destination, src (x1) = source, count (x2) = length.
   Out:  none of the instructions below writes x0, so it still holds the
	 destination pointer at every ret.
   Regs: the instructions below write only x1-x17 and the condition flags;
	 sp is never referenced.

   Both names share this entry point.  For counts up to 128 every load is
   issued before the first store.  Longer copies choose a forward or a
   backward 64-byte loop from the relative position of the two buffers.  */
56ENTRY_ALIAS (__memmove_aarch64)
57ENTRY (__memcpy_aarch64)
	/* PTR_ARG and SIZE_ARG come from asmdefs.h, which is not visible
	   here; presumably they normalise the argument registers for the
	   target ABI - confirm there.  */
58	PTR_ARG (0)
59	PTR_ARG (1)
60	SIZE_ARG (2)
61	add	srcend, src, count	/* One past the last source byte.  */
62	add	dstend, dstin, count	/* One past the last destination byte.  */
63	cmp	count, 128
64	b.hi	L(copy_long)
65	cmp	count, 32
66	b.hi	L(copy32_128)
67
68	/* Small copies: 0..32 bytes.  */
69	cmp	count, 16
70	b.lo	L(copy16)
	/* 16..32 bytes: copy the first 16 and the last 16 bytes.  The two
	   chunks overlap whenever count < 32.  */
71	ldp	A_l, A_h, [src]
72	ldp	D_l, D_h, [srcend, -16]
73	stp	A_l, A_h, [dstin]
74	stp	D_l, D_h, [dstend, -16]
75	ret
76
77	/* Copy 8-15 bytes.  Reached with count < 16; bit 3 of count is clear
	   for counts below 8, which go on to copy8.  */
78L(copy16):
79	tbz	count, 3, L(copy8)
	/* First 8 and last 8 bytes, overlapping when count < 16.  */
80	ldr	A_l, [src]
81	ldr	A_h, [srcend, -8]
82	str	A_l, [dstin]
83	str	A_h, [dstend, -8]
84	ret
85
86	.p2align 3
87	/* Copy 4-7 bytes.  Reached with count < 8; bit 2 of count is clear
	   for counts below 4, which go on to copy4.  */
88L(copy8):
89	tbz	count, 2, L(copy4)
	/* First 4 and last 4 bytes, overlapping when count < 8.  */
90	ldr	A_lw, [src]
91	ldr	B_lw, [srcend, -4]
92	str	A_lw, [dstin]
93	str	B_lw, [dstend, -4]
94	ret
95
96	/* Copy 0..3 bytes using a branchless sequence.  */
97L(copy4):
98	cbz	count, L(copy0)
	/* tmp1 = count / 2, i.e. 0, 1, 1 for count = 1, 2, 3.  The bytes at
	   offsets 0, tmp1 and count - 1 together cover every byte of a
	   1..3 byte copy; some of them coincide for count < 3.  */
99	lsr	tmp1, count, 1
100	ldrb	A_lw, [src]
101	ldrb	C_lw, [srcend, -1]
102	ldrb	B_lw, [src, tmp1]
103	strb	A_lw, [dstin]
104	strb	B_lw, [dstin, tmp1]
105	strb	C_lw, [dstend, -1]
	/* Shared return: also the target for count == 0 and, from the long
	   path, for src == dstin.  */
106L(copy0):
107	ret
108
109	.p2align 4
110	/* Medium copies: 33..128 bytes.  */
111L(copy32_128):
	/* Load the first 32 bytes (A, B) and the last 32 bytes (C, D).  */
112	ldp	A_l, A_h, [src]
113	ldp	B_l, B_h, [src, 16]
114	ldp	C_l, C_h, [srcend, -32]
115	ldp	D_l, D_h, [srcend, -16]
116	cmp	count, 64
117	b.hi	L(copy128)
	/* 33..64 bytes: the two 32-byte chunks cover the whole range.  */
118	stp	A_l, A_h, [dstin]
119	stp	B_l, B_h, [dstin, 16]
120	stp	C_l, C_h, [dstend, -32]
121	stp	D_l, D_h, [dstend, -16]
122	ret
123
124	.p2align 4
125	/* Copy 65..128 bytes.  */
126L(copy128):
	/* E, F extend the head to the first 64 bytes.  */
127	ldp	E_l, E_h, [src, 32]
128	ldp	F_l, F_h, [src, 48]
129	cmp	count, 96
130	b.ls	L(copy96)
	/* 97..128 bytes: also copy the 32 bytes at srcend - 64.  G and H
	   overwrite count, dst, src and srcend; count has already been
	   tested and none of the four is read again on this path.  */
131	ldp	G_l, G_h, [srcend, -64]
132	ldp	H_l, H_h, [srcend, -48]
133	stp	G_l, G_h, [dstend, -64]
134	stp	H_l, H_h, [dstend, -48]
	/* Store the first 64 bytes and the last 32 bytes.  */
135L(copy96):
136	stp	A_l, A_h, [dstin]
137	stp	B_l, B_h, [dstin, 16]
138	stp	E_l, E_h, [dstin, 32]
139	stp	F_l, F_h, [dstin, 48]
140	stp	C_l, C_h, [dstend, -32]
141	stp	D_l, D_h, [dstend, -16]
142	ret
143
144	.p2align 4
145	/* Copy more than 128 bytes.  */
146L(copy_long):
147	/* Use backwards copy if there is an overlap.  */
	/* tmp1 = dstin - src.  Zero means both pointers are equal and there
	   is nothing to do.  The unsigned compare is true only when dstin
	   lies inside (src, src + count); if dstin is below src the
	   difference wraps to a large value and the forward copy is used.  */
148	sub	tmp1, dstin, src
149	cbz	tmp1, L(copy0)
150	cmp	tmp1, count
151	b.lo	L(copy_long_backwards)
152
153	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
154
	/* tmp1 = dstin & 15.  dst is dstin rounded down to 16 bytes and src
	   is moved back by the same amount, so equal offsets from src and
	   dst still name corresponding bytes.  */
155	ldp	D_l, D_h, [src]
156	and	tmp1, dstin, 15
157	bic	dst, dstin, 15
158	sub	src, src, tmp1
159	add	count, count, tmp1	/* Count is now 16 too large.  */
	/* Store the unaligned first 16 bytes and preload the next 64 bytes
	   into A..D.  The last load advances src by 64 (pre-index).  */
160	ldp	A_l, A_h, [src, 16]
161	stp	D_l, D_h, [dstin]
162	ldp	B_l, B_h, [src, 32]
163	ldp	C_l, C_h, [src, 48]
164	ldp	D_l, D_h, [src, 64]!
165	subs	count, count, 128 + 16	/* Test and readjust count.  */
	/* Taken when no more than 64 bytes lie beyond the 64 bytes that
	   are already loaded in A..D.  */
166	b.ls	L(copy64_from_end)
167
	/* Each iteration stores the 64 bytes held in A..D at dst + 16 and
	   reloads A..D with the next 64 source bytes.  The accesses through
	   D advance dst and src by 64 (pre-index).  dst stays 16-byte
	   aligned, so every store in the loop is aligned.  */
168L(loop64):
169	stp	A_l, A_h, [dst, 16]
170	ldp	A_l, A_h, [src, 16]
171	stp	B_l, B_h, [dst, 32]
172	ldp	B_l, B_h, [src, 32]
173	stp	C_l, C_h, [dst, 48]
174	ldp	C_l, C_h, [src, 48]
175	stp	D_l, D_h, [dst, 64]!
176	ldp	D_l, D_h, [src, 64]!
177	subs	count, count, 64
178	b.hi	L(loop64)
179
180	/* Write the last iteration and copy 64 bytes from the end.  */
	/* The stores of the pending A..D are interleaved with loads of the
	   final 64 source bytes into E, A, B, C, which are then written
	   relative to dstend.  */
181L(copy64_from_end):
182	ldp	E_l, E_h, [srcend, -64]
183	stp	A_l, A_h, [dst, 16]
184	ldp	A_l, A_h, [srcend, -48]
185	stp	B_l, B_h, [dst, 32]
186	ldp	B_l, B_h, [srcend, -32]
187	stp	C_l, C_h, [dst, 48]
188	ldp	C_l, C_h, [srcend, -16]
189	stp	D_l, D_h, [dst, 64]
190	stp	E_l, E_h, [dstend, -64]
191	stp	A_l, A_h, [dstend, -48]
192	stp	B_l, B_h, [dstend, -32]
193	stp	C_l, C_h, [dstend, -16]
194	ret
195
196	.p2align 4
197
198	/* Large backwards copy for overlapping copies.
199	   Copy 16 bytes and then align dstend to 16-byte alignment.  */
200L(copy_long_backwards):
	/* Mirror image of the forward path: tmp1 = dstend & 15, srcend and
	   count are reduced by tmp1, and dstend is reduced by the same
	   amount only after the unaligned last 16 bytes have been stored
	   through its original value.  */
201	ldp	D_l, D_h, [srcend, -16]
202	and	tmp1, dstend, 15
203	sub	srcend, srcend, tmp1
204	sub	count, count, tmp1
205	ldp	A_l, A_h, [srcend, -16]
206	stp	D_l, D_h, [dstend, -16]
207	ldp	B_l, B_h, [srcend, -32]
208	ldp	C_l, C_h, [srcend, -48]
209	ldp	D_l, D_h, [srcend, -64]!
210	sub	dstend, dstend, tmp1	/* dstend is now 16-byte aligned.  */
211	subs	count, count, 128
	/* Taken when no more than 64 bytes lie below the 64 bytes that are
	   already loaded in A..D.  */
212	b.ls	L(copy64_from_start)
213
	/* Each iteration stores the 64 bytes held in A..D just below dstend
	   and reloads A..D with the preceding 64 source bytes.  The accesses
	   through D move dstend and srcend down by 64 (pre-index).  */
214L(loop64_backwards):
215	stp	A_l, A_h, [dstend, -16]
216	ldp	A_l, A_h, [srcend, -16]
217	stp	B_l, B_h, [dstend, -32]
218	ldp	B_l, B_h, [srcend, -32]
219	stp	C_l, C_h, [dstend, -48]
220	ldp	C_l, C_h, [srcend, -48]
221	stp	D_l, D_h, [dstend, -64]!
222	ldp	D_l, D_h, [srcend, -64]!
223	subs	count, count, 64
224	b.hi	L(loop64_backwards)
225
226	/* Write the last iteration and copy 64 bytes from the start.  */
	/* G overwrites count and dst; neither is read again.  The first 64
	   source bytes are loaded into G, A, B, C while the pending A..D are
	   stored, then written relative to dstin.  */
227L(copy64_from_start):
228	ldp	G_l, G_h, [src, 48]
229	stp	A_l, A_h, [dstend, -16]
230	ldp	A_l, A_h, [src, 32]
231	stp	B_l, B_h, [dstend, -32]
232	ldp	B_l, B_h, [src, 16]
233	stp	C_l, C_h, [dstend, -48]
234	ldp	C_l, C_h, [src]
235	stp	D_l, D_h, [dstend, -64]
236	stp	G_l, G_h, [dstin, 48]
237	stp	A_l, A_h, [dstin, 32]
238	stp	B_l, B_h, [dstin, 16]
239	stp	C_l, C_h, [dstin]
240	ret
241
242END (__memcpy_aarch64)
243
244