/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The source pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/
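
/* Rough sketch of the dispatch below (informal pseudocode in a comment,
   illustration only, not assembled):

     if (count > 128)       copy_long: check for overlap, 16-byte align the
                            source, loop 64 bytes per iteration, and finish
                            by copying 64 bytes from the far end;
     else if (count > 32)   copy32_128: two to four 32-byte copies taken
                            from both ends of the buffer;
     else                   small copies of 0..32 bytes using overlapping
                            loads and stores from both ends.  */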

ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
	add	srcend, src, count
	cmp	count, 128
	b.hi	L(copy_long)
	add	dstend, dstin, count
	cmp	count, 32
	b.hi	L(copy32_128)
	nop

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
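	/* 16..32 bytes: copy 16 bytes from each end; the two stores may
	   overlap in the middle but together cover the whole range.  */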
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
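	/* 33..64 bytes: 32 bytes from each end; the two stores may overlap.  */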
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
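	/* 65..96 bytes: the 64 bytes from the start plus 32 bytes from the
	   end stored below cover the whole copy; the 97..128-byte case has
	   already stored its extra 64 bytes from the end above.  */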
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
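	/* The middle byte at offset count/2 coincides with the first or last
	   byte when count is 1 or 2, so the same three loads and stores
	   handle any count in 1..3.  */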
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 3
	/* Copy more than 128 bytes.  */
L(copy_long):
	add	dstend, dstin, count

	/* Use backwards copy if there is an overlap.  */
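	/* The unsigned compare of (dstin - src) with count is true when dstin
	   lies inside [src, src + count), i.e. when a forward copy would
	   overwrite source bytes before they are read.  */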
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
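	/* Subtract 16 for the over-count above and 128 for the 64 bytes
	   already loaded into A_q..D_q plus the 64 bytes that are always
	   copied from the end.  */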
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
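	/* Software-pipelined: each iteration stores the 64 bytes loaded on
	   the previous iteration while loading the next 64.  */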
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	.p2align 4
	nop

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
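	/* tmp1 is dstin - src; if it is zero the buffers are identical and
	   there is nothing left to do.  */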
	cbz	tmp1, L(copy0)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
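	/* Store the 64 bytes held in A_q..D_q below dstend (the writeback on
	   the last store moves dstend down by 64), then load the next 64
	   bytes below srcend.  */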
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
	ret

END (__memcpy_aarch64_simd)