/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2022, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#if __ARM_FEATURE_SVE

#include "../asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp1	x6
#define vlen	x6

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.
   SVE vectors are used to speed up small copies.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The source pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

ENTRY_ALIAS (__memmove_aarch64_sve)
ENTRY (__memcpy_aarch64_sve)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

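	/* Small copies: 0..32 bytes.  A single predicated SVE load/store covers
	   the whole copy unless bit 4 of the vector length is set (e.g. 16-byte
	   vectors), in which case two vectors are used at L(vlen128).  */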
	whilelo p0.b, xzr, count
	cntb	vlen
	tbnz	vlen, 4, L(vlen128)
	ld1b	z0.b, p0/z, [src]
	st1b	z0.b, p0, [dstin]
	ret

	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	add	srcend, src, count
	add	dstend, dstin, count
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy more than 128 bytes.  */
L(copy_long):
	add	srcend, src, count
	add	dstend, dstin, count

	/* Use backwards copy if there is an overlap.  */
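	/* (dstin - src) below count (unsigned) means dst starts inside src.  */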
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
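	/* Each iteration stores the 64 bytes loaded by the previous iteration
	   while loading the next 64 bytes.  */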
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

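	/* Small copies that need a second SVE vector (bit 4 of the vector length
	   is set, e.g. 16-byte vectors).  p0 covers the bytes below vlen and p1
	   any remaining bytes up to count.  */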
L(vlen128):
	whilelo p1.b, vlen, count
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
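	/* If tmp1 (dstin - src) is zero, src == dst and there is nothing to do.  */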
	cbz	tmp1, L(return)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

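	/* As in L(loop64), but moving backwards: store the 64 bytes loaded by
	   the previous iteration while loading the next 64 bytes below.  */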
L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(return):
	ret

END (__memcpy_aarch64_sve)
#endif