xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S (revision b5a3a89c50671a1ad29e7c43fe15e7b16feac239)
1/*
2 * memcpy - copy memory area
3 *
4 * Copyright (c) 2019-2022, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
11 *
12 */
13
14#include "asmdefs.h"
15
/* Register aliases used by the copy routine in this file.  Every register
   named here lies in x0-x17 or v0-v7, which AAPCS64 treats as caller-saved.  */
16#define dstin	x0	/* Destination pointer as passed in (first argument).  */
17#define src	x1	/* Source pointer (second argument).  */
18#define count	x2	/* Number of bytes to copy (third argument).  */
19#define dst	x3	/* Working destination pointer for the forward loop.  */
20#define srcend	x4	/* src + count: one past the last source byte.  */
21#define dstend	x5	/* dstin + count: one past the last destination byte.  */
22#define A_l	x6	/* A_l .. C_lw: integer data temporaries for copies  */
23#define A_lw	w6	/* shorter than 16 bytes.  The *_lw names are the  */
24#define A_h	x7	/* 32-bit views (w6, w8, w10) used for 4-byte and  */
25#define B_l	x8	/* single-byte accesses.  */
26#define B_lw	w8
27#define B_h	x9
28#define C_lw	w10
29#define tmp1	x14	/* Scratch: offsets, alignment remainder, overlap test.  */
30
/* 128-bit SIMD data temporaries; each one carries 16 bytes of payload.  */
31#define A_q	q0
32#define B_q	q1
33#define C_q	q2
34#define D_q	q3
35#define E_q	q4
36#define F_q	q5
37#define G_q	q6
38#define H_q	q7
39
40/* memcpy/memmove for AArch64 built on Advanced SIMD (q-register) accesses.

   In:    dstin (x0) = destination, src (x1) = source, count (x2) = bytes.
   Out:   none of the instructions below writes dstin, so x0 still holds the
          destination pointer at every ret.
   Uses:  x1-x8, x10, x14, q0-q7 and the condition flags.  The instructions
          below never reference sp.

   This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The small and medium paths
   issue all of their loads before their first store, so they need no overlap
   test.  The overhead of the overlap check is negligible since it is only
   required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The source pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.  The
   backwards variant mirrors this: it aligns the end of the source and
   finishes by copying 64 bytes from the start.
*/
52
53ENTRY_ALIAS (__memmove_aarch64_simd)
54ENTRY (__memcpy_aarch64_simd)
	/* NOTE(review): PTR_ARG and SIZE_ARG come from asmdefs.h, which is not
	   visible here; presumably they normalise the pointer and size
	   arguments for the target ABI -- confirm against asmdefs.h.  */
55	PTR_ARG (0)
56	PTR_ARG (1)
57	SIZE_ARG (2)
58	add	srcend, src, count	/* One past the last source byte.  */
59	add	dstend, dstin, count	/* One past the last destination byte.  */
60	cmp	count, 128
61	b.hi	L(copy_long)		/* count > 128.  */
62	cmp	count, 32
63	b.hi	L(copy32_128)		/* 33 <= count <= 128.  */
64
65	/* Small copies: 0..32 bytes.  For 16..32 bytes the first 16 and the
	   last 16 bytes are copied; the two chunks overlap when count < 32.
	   Both loads are issued before either store.  */
66	cmp	count, 16
67	b.lo	L(copy16)
68	ldr	A_q, [src]
69	ldr	B_q, [srcend, -16]
70	str	A_q, [dstin]
71	str	B_q, [dstend, -16]
72	ret
73
74	/* Copy 8-15 bytes.  count < 16 on entry, so a clear bit 3 means
	   count < 8.  Same first-chunk/last-chunk scheme with 8-byte accesses.  */
75L(copy16):
76	tbz	count, 3, L(copy8)
77	ldr	A_l, [src]
78	ldr	A_h, [srcend, -8]
79	str	A_l, [dstin]
80	str	A_h, [dstend, -8]
81	ret
82
83	.p2align 3
84	/* Copy 4-7 bytes.  count < 8 on entry, so a clear bit 2 means
	   count < 4.  */
85L(copy8):
86	tbz	count, 2, L(copy4)
87	ldr	A_lw, [src]
88	ldr	B_lw, [srcend, -4]
89	str	A_lw, [dstin]
90	str	B_lw, [dstend, -4]
91	ret
92
93	/* Copy 0..3 bytes using a branchless sequence.  The first byte, the
	   byte at index count / 2 and the last byte are copied:
	     count 1 -> indices 0, 0, 0
	     count 2 -> indices 0, 1, 1
	     count 3 -> indices 0, 1, 2
	   so every byte is written without testing count again.  */
94L(copy4):
95	cbz	count, L(copy0)		/* Nothing to copy.  */
96	lsr	tmp1, count, 1		/* tmp1 = count / 2.  */
97	ldrb	A_lw, [src]
98	ldrb	C_lw, [srcend, -1]
99	ldrb	B_lw, [src, tmp1]
100	strb	A_lw, [dstin]
101	strb	B_lw, [dstin, tmp1]
102	strb	C_lw, [dstend, -1]
	/* Shared return; L(copy_long_backwards) also branches here when
	   dstin == src.  */
103L(copy0):
104	ret
105
106	.p2align 4
107	/* Medium copies: 33..128 bytes.  The first 32 and the last 32 bytes
	   are loaded up front; for 33..64 bytes those two chunks already cover
	   the whole buffer.  */
108L(copy32_128):
109	ldp	A_q, B_q, [src]
110	ldp	C_q, D_q, [srcend, -32]
111	cmp	count, 64
112	b.hi	L(copy128)
113	stp	A_q, B_q, [dstin]
114	stp	C_q, D_q, [dstend, -32]
115	ret
116
117	.p2align 4
118	/* Copy 65..128 bytes.  A..D (first and last 32 bytes) are still live
	   from L(copy32_128).  Bytes 32..63 are loaded in addition; above 96
	   bytes the 32 bytes ending 32 before the end are needed as well.
	   Every load still precedes the first store.  */
119L(copy128):
120	ldp	E_q, F_q, [src, 32]
121	cmp	count, 96
122	b.ls	L(copy96)
123	ldp	G_q, H_q, [srcend, -64]
124	stp	G_q, H_q, [dstend, -64]
	/* Store the first 64 and the last 32 bytes.  */
125L(copy96):
126	stp	A_q, B_q, [dstin]
127	stp	E_q, F_q, [dstin, 32]
128	stp	C_q, D_q, [dstend, -32]
129	ret
130
131	/* Copy more than 128 bytes.  */
132L(copy_long):
133	/* Use backwards copy if there is an overlap.  The unsigned test
	   dstin - src < count holds only when dstin lies inside
	   [src, src + count), which includes dstin == src.  When dstin is
	   below src the subtraction wraps to a large unsigned value and the
	   forward copy is used.  */
134	sub	tmp1, dstin, src
135	cmp	tmp1, count
136	b.lo	L(copy_long_backwards)
137
138	/* Copy 16 bytes and then align src to 16-byte alignment.  dst is moved
	   back by the same amount (tmp1 = src & 15) so that [dst, k] keeps
	   pairing with [src, k].  */
139	ldr	D_q, [src]
140	and	tmp1, src, 15
141	bic	src, src, 15
142	sub	dst, dstin, tmp1
143	add	count, count, tmp1	/* Count is now 16 too large.  */
144	ldp	A_q, B_q, [src, 16]
145	str	D_q, [dstin]		/* D_q is reloaded by the next ldp.  */
146	ldp	C_q, D_q, [src, 48]
	/* 16 removes the excess noted above; 128 accounts for the 64 bytes
	   already held in A..D plus the 64 bytes the tail always copies.  */
147	subs	count, count, 128 + 16	/* Test and readjust count.  */
148	b.ls	L(copy64_from_end)	/* At most 64 bytes beyond A..D.  */
	/* Each iteration stores the 64 bytes loaded previously and loads the
	   next 64.  The load offsets are 64 higher than the store offsets
	   because src is only advanced afterwards.  */
149L(loop64):
150	stp	A_q, B_q, [dst, 16]
151	ldp	A_q, B_q, [src, 80]
152	stp	C_q, D_q, [dst, 48]
153	ldp	C_q, D_q, [src, 112]
154	add	src, src, 64
155	add	dst, dst, 64
156	subs	count, count, 64
157	b.hi	L(loop64)
158
159	/* Write the last iteration and copy 64 bytes from the end.  The final
	   64 bytes are addressed from srcend/dstend, so they may overlap the
	   block still pending in A..D.  A_q/B_q are reloaded only after their
	   pending contents have been stored.  */
160L(copy64_from_end):
161	ldp	E_q, F_q, [srcend, -64]
162	stp	A_q, B_q, [dst, 16]
163	ldp	A_q, B_q, [srcend, -32]
164	stp	C_q, D_q, [dst, 48]
165	stp	E_q, F_q, [dstend, -64]
166	stp	A_q, B_q, [dstend, -32]
167	ret
168
169	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.
	   On entry tmp1 = dstin - src.  src and dstin are not modified on
	   this path, so the tail can address the start of both buffers.  */
171L(copy_long_backwards):
172	cbz	tmp1, L(copy0)		/* dstin == src: nothing to do.  */
173	ldr	D_q, [srcend, -16]
174	and	tmp1, srcend, 15
175	bic	srcend, srcend, 15
176	sub	count, count, tmp1	/* Bytes from src to the aligned srcend.  */
177	ldp	A_q, B_q, [srcend, -32]
178	str	D_q, [dstend, -16]	/* Uses dstend before it is adjusted.  */
179	ldp	C_q, D_q, [srcend, -64]
180	sub	dstend, dstend, tmp1	/* [dstend, -k] now pairs with [srcend, -k].  */
181	subs	count, count, 128	/* 64 held in A..D + 64 for the tail.  */
182	b.ls	L(copy64_from_start)
183
	/* Mirror image of L(loop64): store the pending 64 bytes below dstend
	   and load the next 64 bytes below srcend.  */
184L(loop64_backwards):
185	str	B_q, [dstend, -16]
186	str	A_q, [dstend, -32]
187	ldp	A_q, B_q, [srcend, -96]
188	str	D_q, [dstend, -48]
189	str	C_q, [dstend, -64]!	/* Pre-index writeback: dstend -= 64.  */
190	ldp	C_q, D_q, [srcend, -128]
191	sub	srcend, srcend, 64
192	subs	count, count, 64
193	b.hi	L(loop64_backwards)
194
195	/* Write the last iteration and copy 64 bytes from the start.  The
	   first 64 bytes are addressed from src/dstin, so they may overlap the
	   block still pending in A..D.  */
196L(copy64_from_start):
197	ldp	E_q, F_q, [src, 32]
198	stp	A_q, B_q, [dstend, -32]
199	ldp	A_q, B_q, [src]
200	stp	C_q, D_q, [dstend, -64]
201	stp	E_q, F_q, [dstin, 32]
202	stp	A_q, B_q, [dstin]
203	ret
204
205END (__memcpy_aarch64_simd)
206
207