/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#include "asmdefs.h"

.arch armv8-a+sve

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp1	x6
#define vlen	x6

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.
   SVE vectors are used to speed up small copies.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The source pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/
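
/* Illustrative only: a rough C sketch of the dispatch described above.  The
   helper names copy_small_sve, copy_32_128, copy_large_fwd and copy_large_bwd
   are hypothetical stand-ins for the labelled blocks below; svcntb () is the
   ACLE intrinsic (<arm_sve.h>) corresponding to the cntb instruction.

	void
	memcpy_sve_sketch (uint8_t *dst, const uint8_t *src, uint64_t count)
	{
	  if (count > 128)			// large copies
	    {
	      // Go backwards only if a forward copy would clobber the source.
	      if ((uint64_t) (dst - src) < count)
		copy_large_bwd (dst, src, count);
	      else
		copy_large_fwd (dst, src, count);
	    }
	  else if (count > 2 * svcntb ())	// medium copies, at most 128 bytes
	    copy_32_128 (dst, src, count);
	  else					// small copies, up to 2 vectors
	    copy_small_sve (dst, src, count);
	}
*/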

ENTRY_ALIAS (__memmove_aarch64_sve)
ENTRY (__memcpy_aarch64_sve)
	cmp	count, 128
	b.hi	L(copy_long)
	cntb	vlen
	cmp	count, vlen, lsl 1
	b.hi	L(copy32_128)

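	/* whilelo sets the low byte lanes of p0 (up to count, capped at one
	   vector) and of p1 (up to count - vlen; none when count <= vlen).
	   Inactive lanes load zero and store nothing, so the predicated loads
	   and stores below copy any length from 0 to 2 * vlen bytes without a
	   branch or a tail loop.  Illustrative only: an equivalent C sketch
	   using ACLE intrinsics from <arm_sve.h>, assuming uint8_t *dst, *src
	   and uint64_t count:

		svbool_t p0 = svwhilelt_b8_u64 (0, count);
		svbool_t p1 = svwhilelt_b8_u64 (svcntb (), count);
		svst1_u8 (p0, dst, svld1_u8 (p0, src));
		svst1_u8 (p1, dst + svcntb (), svld1_u8 (p1, src + svcntb ()));  */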
	whilelo p0.b, xzr, count
	whilelo p1.b, vlen, count
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	add	srcend, src, count
	add	dstend, dstin, count
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy more than 128 bytes.  */
L(copy_long):
	add	srcend, src, count
	add	dstend, dstin, count

	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)
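	/* The subtraction above wraps modulo 2^64, so the unsigned comparison
	   "dstin - src < count" holds exactly when dstin points into
	   [src, src + count), i.e. when a forward copy would overwrite source
	   bytes before reading them.  In C terms (a sketch):
	   if ((uint64_t) (dstin - src) < count) the backwards path is taken.
	   dstin == src also takes that path and returns early via its cbz.  */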

	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret
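	/* When the loop above exits, at most 128 destination bytes are still
	   unwritten: the 64 already loaded into A_q..D_q plus at most 64 more.
	   Storing the registers and then an unconditional 64-byte block taken
	   from srcend - 64 therefore covers the whole tail without a byte loop;
	   any bytes covered by two of these stores receive the same data, so
	   rewriting them is harmless.  The backwards copy below mirrors this,
	   finishing with 64 bytes taken from the start.  */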

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	cbz	tmp1, L(return)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(return):
	ret

END (__memcpy_aarch64_sve)