/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into three main cases: small copies of up to 32 bytes,
   medium copies of up to 128 bytes, and large copies.  The overhead of the
   overlap check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.
*/
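
/* Size dispatch used by the entry sequence below, for reference:

     count <= 32  : small path, at most two 16-byte load/store pairs
     count <= 128 : medium path, 4 to 8 load/store pairs, no loop
     count >  128 : aligned 64-byte loop, backwards if dst overlaps src
*/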

ENTRY_ALIAS (__memmove_aarch64)
ENTRY (__memcpy_aarch64)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
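	/* The two 16-byte copies below, one from the start and one from
	   the end, overlap when count < 32 and together cover any length
	   from 16 to 32 bytes.  */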
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
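	/* count is 0..15 here: bit 3 of count distinguishes 8..15 bytes
	   (two possibly overlapping 8-byte copies) from 0..7 bytes.  */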
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
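	/* tmp1 = count / 2, so the byte copies at offsets 0, tmp1 and
	   count - 1 cover every case: count 1 writes byte 0 three times,
	   count 2 writes bytes 0 and 1, and count 3 writes bytes 0, 1
	   and 2.  */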
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
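	/* The first 32 and last 32 bytes are loaded before count is
	   tested, so for copies of at most 64 bytes the four (possibly
	   overlapping) stores are already sufficient.  */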
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
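	/* A/B (first 32 bytes) and C/D (last 32 bytes) are already loaded.
	   Load 32 more bytes from the start; for 65..96 bytes that is
	   enough, otherwise also copy the 32 bytes ending at srcend - 32.  */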
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
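	/* tmp1 = dstin - src.  An unsigned compare with count detects a
	   destination that starts inside the source buffer, in which case
	   a forward copy would overwrite source bytes before they are
	   read.  */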
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
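	/* src is moved back by the same amount as dst, so the offsets used
	   in the loop stay paired; the bytes between dstin and dst + 16
	   are covered by the initial unaligned 16-byte copy.  */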

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

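	/* Software pipelined loop: each iteration stores the 64 bytes
	   loaded by the previous iteration and issues the loads for the
	   next 64 bytes, so loads run one iteration ahead of the stores.  */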
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
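	/* A..D still hold the 64 bytes loaded by the final iteration;
	   store them, then copy the last 64 bytes directly from
	   srcend - 64 so no tail loop is needed.  */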
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dstend to 16-byte alignment.  */
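	/* Mirror image of the forward path: the copy proceeds from the end
	   towards the start, so when the destination overlaps and lies
	   above the source, source bytes are not overwritten before they
	   have been read.  */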
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
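	/* src and dstin still point at the start of the buffers, so the
	   lowest 64 bytes can be copied from there unconditionally,
	   mirroring L(copy64_from_end).  */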
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (__memcpy_aarch64)