xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1/*
2 * memcpy - copy memory area
3 *
4 * Copyright (c) 2019-2023, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
11 *
12 */
13
14#include "asmdefs.h"
15
/* Register aliases.  dstin, src and count name the three incoming argument
   registers x0-x2; every other alias names a scratch register.  */
16#define dstin	x0	/* Destination pointer argument; never written below.  */
17#define src	x1	/* Source pointer argument; aligned down in the forward large copy.  */
18#define count	x2	/* Byte count; reused as the loop counter in large copies.  */
19#define dst	x3	/* Running destination pointer of the forward large copy.  */
20#define srcend	x4	/* Initially src + count.  */
21#define dstend	x5	/* Initially dstin + count.  */
22#define A_l	x6	/* 64-bit temporary: first 8 bytes of an 8..15 byte copy.  */
23#define A_lw	w6	/* 32-bit view of A_l, used by the 0..7 byte copies.  */
24#define A_h	x7	/* 64-bit temporary: last 8 bytes of an 8..15 byte copy.  */
25#define B_l	x8	/* Not referenced by the visible code (only its view B_lw is).  */
26#define B_lw	w8	/* 32-bit temporary used by the 0..7 byte copies.  */
27#define B_h	x9	/* Not referenced by the visible code.  */
28#define C_lw	w10	/* 32-bit temporary: last byte of a 1..3 byte copy.  */
29#define tmp1	x14	/* General scratch: offsets, alignment remainders.  */
30
/* 128-bit SIMD temporaries; each holds 16 bytes of data in flight.  */
31#define A_q	q0
32#define B_q	q1
33#define C_q	q2
34#define D_q	q3
35#define E_q	q4
36#define F_q	q5
37#define G_q	q6
38#define H_q	q7
39
40/* This implementation handles overlaps and supports both memcpy and memmove
41   from a single entry point.  It uses unaligned accesses and branchless
42   sequences to keep the code small, simple and improve performance.
43
44   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
45   copies of up to 128 bytes, and large copies.  The overhead of the overlap
46   check is negligible since it is only required for large copies.
47
48   Large copies use a software pipelined loop processing 64 bytes per iteration.
49   The source pointer is 16-byte aligned to minimize unaligned accesses.
50   The loop tail is handled by always copying 64 bytes from the end.
51*/
52
/* memcpy/memmove for AArch64 using Advanced SIMD registers.

   __memmove_aarch64_simd is declared as an alias of __memcpy_aarch64_simd, so
   both names share this code.

   In:   dstin (x0) = destination, src (x1) = source, count (x2) = byte count.
   Out:  dstin is never written by the instructions below, so x0 still holds
	 the destination pointer at every ret.  (What PTR_ARG and SIZE_ARG
	 expand to is defined in asmdefs.h and is not visible here.)
   Uses: x1-x8, x10, x14 and q0-q7.  No instruction references sp.

   Overlap safety: on every path for count <= 128 all loads are issued before
   the first store, so those paths need no overlap check.  Only L(copy_long)
   compares the pointers and selects a forward or a backward loop.  */
53ENTRY_ALIAS (__memmove_aarch64_simd)
54ENTRY (__memcpy_aarch64_simd)
55	PTR_ARG (0)
56	PTR_ARG (1)
57	SIZE_ARG (2)
58	add	srcend, src, count	/* One past the last source byte.  */
59	cmp	count, 128
60	b.hi	L(copy_long)		/* Unsigned: count > 128.  */
61	add	dstend, dstin, count	/* One past the last destination byte.  */
62	cmp	count, 32
63	b.hi	L(copy32_128)		/* Unsigned: 33..128 bytes.  */
64	nop				/* NOTE(review): presumably alignment padding - confirm.  */
65
66	/* Small copies: 0..32 bytes.  Counts below 16 branch to L(copy16).
	   16..32 bytes are copied as the first 16 and the last 16 bytes; the
	   two accesses overlap when count < 32.  */
67	cmp	count, 16
68	b.lo	L(copy16)
69	ldr	A_q, [src]
70	ldr	B_q, [srcend, -16]
71	str	A_q, [dstin]
72	str	B_q, [dstend, -16]
73	ret
74
75	.p2align 4
76	/* Medium copies: 33..128 bytes.  The first 32 and the last 32 bytes are
	   loaded up front; for 33..64 bytes those two (possibly overlapping)
	   pieces are the whole copy.  */
77L(copy32_128):
78	ldp	A_q, B_q, [src]
79	ldp	C_q, D_q, [srcend, -32]
80	cmp	count, 64
81	b.hi	L(copy128)
82	stp	A_q, B_q, [dstin]
83	stp	C_q, D_q, [dstend, -32]
84	ret
85
86	.p2align 4
87	/* Copy 8-15 bytes.  Reached with count < 16, so bit 3 of count set means
	   8..15: copy the first 8 and the last 8 bytes.  Bit 3 clear means
	   count < 8.  */
88L(copy16):
89	tbz	count, 3, L(copy8)
90	ldr	A_l, [src]
91	ldr	A_h, [srcend, -8]
92	str	A_l, [dstin]
93	str	A_h, [dstend, -8]
94	ret
95
96	/* Copy 4-7 bytes.  Reached with count < 8, so bit 2 of count set means
	   4..7: copy the first 4 and the last 4 bytes.  Bit 2 clear means
	   count < 4.  */
97L(copy8):
98	tbz	count, 2, L(copy4)
99	ldr	A_lw, [src]
100	ldr	B_lw, [srcend, -4]
101	str	A_lw, [dstin]
102	str	B_lw, [dstend, -4]
103	ret
104
105	/* Copy 65..128 bytes.  A-D already hold the first and the last 32 bytes
	   (loaded at L(copy32_128)).  */
106L(copy128):
107	ldp	E_q, F_q, [src, 32]	/* Source bytes 32..63.  */
108	cmp	count, 96
109	b.ls	L(copy96)		/* 65..96 bytes: first 64 + last 32 suffice.  */
110	ldp	G_q, H_q, [srcend, -64]	/* 97..128 bytes: also need [end-64, end-32).  */
111	stp	G_q, H_q, [dstend, -64]
112L(copy96):
113	stp	A_q, B_q, [dstin]
114	stp	E_q, F_q, [dstin, 32]
115	stp	C_q, D_q, [dstend, -32]
116	ret
117
118	/* Copy 0..3 bytes using a branchless sequence.  After the zero check the
	   first byte, the byte at index count / 2 and the last byte are copied.
	   count / 2 is 0 for count 1 and 1 for count 2 or 3, so these three
	   accesses cover every byte for count 1..3.  */
119L(copy4):
120	cbz	count, L(copy0)
121	lsr	tmp1, count, 1
122	ldrb	A_lw, [src]
123	ldrb	C_lw, [srcend, -1]
124	ldrb	B_lw, [src, tmp1]
125	strb	A_lw, [dstin]
126	strb	B_lw, [dstin, tmp1]
127	strb	C_lw, [dstend, -1]
128L(copy0):
129	ret
130
131	.p2align 3
132	/* Copy more than 128 bytes.  */
133L(copy_long):
134	add	dstend, dstin, count
135
136	/* Use backwards copy if there is an overlap.  The unsigned test
	   (dstin - src) < count holds exactly when dstin lies inside
	   [src, src + count).  If dstin is below src the subtraction wraps to
	   a large unsigned value and the forward copy is used.  */
137	sub	tmp1, dstin, src
138	cmp	tmp1, count
139	b.lo	L(copy_long_backwards)
140
141	/* Copy 16 bytes and then align src to 16-byte alignment.  dst is moved
	   back by the same amount as src, so equal offsets from src and dst
	   still address corresponding bytes.  */
142	ldr	D_q, [src]
143	and	tmp1, src, 15		/* tmp1 = misalignment of src (0..15).  */
144	bic	src, src, 15
145	sub	dst, dstin, tmp1
146	add	count, count, tmp1	/* Count is now 16 too large.  */
147	ldp	A_q, B_q, [src, 16]
148	str	D_q, [dstin]		/* First 16 bytes, at the original unaligned position.  */
149	ldp	C_q, D_q, [src, 48]
150	subs	count, count, 128 + 16	/* Test and readjust count.  */
151	b.ls	L(copy64_from_end)
	/* Each iteration stores the 64 bytes loaded by the previous iteration
	   (or by the setup above) and loads the next 64.  */
152L(loop64):
153	stp	A_q, B_q, [dst, 16]
154	ldp	A_q, B_q, [src, 80]
155	stp	C_q, D_q, [dst, 48]
156	ldp	C_q, D_q, [src, 112]
157	add	src, src, 64
158	add	dst, dst, 64
159	subs	count, count, 64
160	b.hi	L(loop64)
161
162	/* Write the last iteration and copy 64 bytes from the end.  The final
	   64 bytes are taken relative to srcend/dstend, so they may overlap
	   bytes already written by the pending stores.  */
163L(copy64_from_end):
164	ldp	E_q, F_q, [srcend, -64]
165	stp	A_q, B_q, [dst, 16]
166	ldp	A_q, B_q, [srcend, -32]
167	stp	C_q, D_q, [dst, 48]
168	stp	E_q, F_q, [dstend, -64]
169	stp	A_q, B_q, [dstend, -32]
170	ret
171
172	.p2align 4
173	nop				/* NOTE(review): presumably alignment padding - confirm.  */
174
175	/* Large backwards copy for overlapping copies.
176	   Copy 16 bytes and then align srcend to 16-byte alignment.
	   On entry tmp1 = dstin - src, so tmp1 == 0 means source and
	   destination are the same address and nothing is copied.  src and
	   dstin are not modified on this path; L(copy64_from_start) relies on
	   them.  */
177L(copy_long_backwards):
178	cbz	tmp1, L(copy0)
179	ldr	D_q, [srcend, -16]
180	and	tmp1, srcend, 15	/* tmp1 = misalignment of srcend (0..15).  */
181	bic	srcend, srcend, 15
182	sub	count, count, tmp1
183	ldp	A_q, B_q, [srcend, -32]
184	str	D_q, [dstend, -16]	/* Last 16 bytes, at the original unaligned position.  */
185	ldp	C_q, D_q, [srcend, -64]
186	sub	dstend, dstend, tmp1	/* Move dstend back by the same amount as srcend.  */
187	subs	count, count, 128
188	b.ls	L(copy64_from_start)
189
	/* Mirror image of L(loop64): store the 64 bytes below dstend that were
	   loaded previously and load the next 64 bytes below srcend.  The
	   pre-indexed store of C_q also moves dstend down by 64.  */
190L(loop64_backwards):
191	str	B_q, [dstend, -16]
192	str	A_q, [dstend, -32]
193	ldp	A_q, B_q, [srcend, -96]
194	str	D_q, [dstend, -48]
195	str	C_q, [dstend, -64]!
196	ldp	C_q, D_q, [srcend, -128]
197	sub	srcend, srcend, 64
198	subs	count, count, 64
199	b.hi	L(loop64_backwards)
200
201	/* Write the last iteration and copy 64 bytes from the start.  The first
	   64 bytes are taken relative to src/dstin, so they may overlap bytes
	   already written by the pending stores.  */
202L(copy64_from_start):
203	ldp	E_q, F_q, [src, 32]
204	stp	A_q, B_q, [dstend, -32]
205	ldp	A_q, B_q, [src]
206	stp	C_q, D_q, [dstend, -64]
207	stp	E_q, F_q, [dstin, 32]
208	stp	A_q, B_q, [dstin]
209	ret
210
211END (__memcpy_aarch64_simd)
212
213