xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S (revision 1719886f6d08408b834d270c59ffcfd821c8f63a)
1/*
2 * memcpy - copy memory area
3 *
4 * Copyright (c) 2019-2023, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
11 *
12 */
13
14#include "asmdefs.h"
15
16#ifdef HAVE_SVE
17
18.arch armv8-a+sve
19
/* Register roles.  dstin, src and count are the three incoming arguments
   (x0, x1, x2).  dst, srcend and dstend are working pointers derived from
   them.  */
#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
/* tmp1 and vlen deliberately share x6: vlen is only used on the small-copy
   path and tmp1 only on the large-copy paths, so they are never live at the
   same time.  */
#define tmp1	x6
#define vlen	x6

/* 128-bit SIMD registers used as 16-byte data temporaries.  */
#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7
37
38/* This implementation handles overlaps and supports both memcpy and memmove
39   from a single entry point.  It uses unaligned accesses and branchless
40   sequences to keep the code small, simple and improve performance.
   SVE vectors are used to speed up small copies.
42
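   Copies are split into 3 main cases: small copies of up to two SVE vectors
   (32 bytes for 128-bit vectors, never more than 128 bytes), medium copies of
   up to 128 bytes, and large copies.  The overhead of the overlap check is
   negligible since it is only required for large copies: the small and medium
   paths load all of the data before storing any of it.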
46
47   Large copies use a software pipelined loop processing 64 bytes per iteration.
48   The source pointer is 16-byte aligned to minimize unaligned accesses.
49   The loop tail is handled by always copying 64 bytes from the end.
50*/
51
/* __memcpy_aarch64_sve / __memmove_aarch64_sve

   Copy count bytes from src to dstin.  Both symbols name the same code; the
   buffers may overlap.

   In:       x0 = dstin, x1 = src, x2 = count.
   Out:      x0 is not written by any instruction below, so it still holds
	     dstin on return.
   Clobbers: x3-x6, q0-q7 (z0/z1 on the small path), p0, p1, condition flags.

   PTR_ARG and SIZE_ARG come from asmdefs.h and are not visible here;
   presumably they normalise pointer/size arguments for ILP32 -- confirm
   against asmdefs.h.  */
ENTRY_ALIAS (__memmove_aarch64_sve)
ENTRY (__memcpy_aarch64_sve)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	/* More than 128 bytes always takes the large-copy path.  */
	cmp	count, 128
	b.hi	L(copy_long)
	/* vlen = SVE vector length in bytes.  Anything longer than two vectors
	   (but at most 128 bytes) takes the medium path.  */
	cntb	vlen
	cmp	count, vlen, lsl 1
	b.hi	L(copy32_128)

	/* Small copies: 0..2*vlen bytes.  p0 selects the byte lanes of the
	   first vector that are below count, p1 those of the second vector.
	   Only active lanes are loaded and stored, so nothing outside the
	   buffers is accessed and a zero count copies nothing.  Both loads are
	   issued before either store, which keeps overlapping buffers
	   correct.  */
	whilelo p0.b, xzr, count
	whilelo p1.b, vlen, count
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	/* Medium copies: 2*vlen+1..128 bytes (33..128 with 128-bit vectors).
	   Load the first and the last 32 bytes up front; for up to 64 bytes
	   these two (possibly overlapping) spans cover the whole buffer.  */
L(copy32_128):
	add	srcend, src, count
	add	dstend, dstin, count
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 65..128 bytes.  E/F hold source bytes 32..63.  Above 96 bytes
	   G/H additionally cover the 32 bytes ending 32 before the end.
	   Every load is issued before the first store, so overlapping buffers
	   are handled without an explicit check.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
	/* Store the first 64 bytes and the last 32 bytes.  */
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy more than 128 bytes.  */
L(copy_long):
	add	srcend, src, count
	add	dstend, dstin, count

	/* Use backwards copy if there is an overlap.  tmp1 = dstin - src as an
	   unsigned value: it is below count exactly when dstin lies inside
	   [src, src + count), including dstin == src.  */
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment.  dst is moved
	   back by the same amount (tmp1 = src & 15) so that dst and src keep
	   addressing corresponding bytes.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	/* Preload the first 64 aligned bytes into A..D.  The unaligned head in
	   D_q is stored before D_q is reused by the second load.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	/* 128 = the 64 bytes already held in A..D plus the 64 bytes copied from
	   the end; the extra 16 removes the excess noted above.  The loop is
	   skipped when nothing further remains.  */
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
	/* Software pipelined loop: each iteration stores the 64 bytes loaded
	   by the previous one and loads the next 64 bytes.  */
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  The final
	   64 bytes are addressed from srcend/dstend, so they may overlap bytes
	   that have already been written.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.
	   On entry tmp1 still holds dstin - src; zero means source and
	   destination are the same buffer and there is nothing to do.  */
L(copy_long_backwards):
	cbz	tmp1, L(return)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	/* Preload the last 64 aligned bytes into A..D.  The unaligned tail in
	   D_q is stored (at the original dstend) before D_q is reused, and
	   only then is dstend moved back by tmp1 to match the aligned
	   srcend.  */
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	/* 128 = the 64 bytes held in A..D plus the 64 bytes copied from the
	   start.  */
	subs	count, count, 128
	b.ls	L(copy64_from_start)

	/* Mirror image of the forward loop, walking down in memory.  The last
	   store uses pre-index writeback to move dstend down by 64.  */
L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  src and
	   dstin are unmodified on this path, so they still address the
	   beginning of the buffers.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(return):
	ret

END (__memcpy_aarch64_sve)
176
177#endif
178