xref: /linux/arch/arm64/lib/memcpy.S (revision 4d5e3b06e1fc1428be14cd4ebe3b37c1bb34f95d)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright (c) 2012-2021, Arm Limited.
4 *
5 * Adapted from the original at:
6 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
7 */
8
9#include <linux/linkage.h>
10#include <asm/assembler.h>
11
12/* Assumptions:
13 *
14 * ARMv8-a, AArch64, unaligned accesses.
15 *
16 */
17
/* L(label) pastes ".L" in front of the label name, so every label written
   as L(foo) in this file is the assembler-local symbol .Lfoo.  */
18#define L(label) .L ## label
19
/* Pointer and length names.  dstin/src/count are x0/x1/x2; dst, srcend and
   dstend are working registers (x3-x5) computed by the routine itself.  */
20#define dstin	x0
21#define src	x1
22#define count	x2
23#define dst	x3
24#define srcend	x4
25#define dstend	x5
/* Data staging registers, named as low/high halves of 16-byte pairs A..F
   (x6-x17).  The *_lw names are the 32-bit w views of A_l, B_l and C_l,
   used for the 4-byte and 1-byte accesses.  */
26#define A_l	x6
27#define A_lw	w6
28#define A_h	x7
29#define B_l	x8
30#define B_lw	w8
31#define B_h	x9
32#define C_l	x10
33#define C_lw	w10
34#define C_h	x11
35#define D_l	x12
36#define D_h	x13
37#define E_l	x14
38#define E_h	x15
39#define F_l	x16
40#define F_h	x17
/* G and H are not fresh registers: they reuse count, dst, src and srcend
   (x2, x3, x1, x4).  Loading into G_x or H_x therefore destroys the
   corresponding length/pointer value.  */
41#define G_l	count
42#define G_h	dst
43#define H_l	src
44#define H_h	srcend
/* tmp1 is x14, the same register as E_l; the two names must not be live at
   the same time.  */
45#define tmp1	x14
46
47/* This implementation handles overlaps and supports both memcpy and memmove
48   from a single entry point.  It uses unaligned accesses and branchless
49   sequences to keep the code small, simple and improve performance.
50
51   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
52   copies of up to 128 bytes, and large copies.  The overhead of the overlap
53   check is negligible since it is only required for large copies.
54
55   Large copies use a software pipelined loop processing 64 bytes per iteration.
56   The destination pointer is 16-byte aligned to minimize unaligned accesses.
57   The loop tail is handled by always copying 64 bytes from the end.
58*/
59
/*
 * __pi_memcpy: copy count (x2) bytes from src (x1) to dstin (x0).
 *
 * Register names are the #define aliases at the top of this file.
 * dstin (x0) is only ever used as a base address and is never written, so
 * it still holds its entry value at every ret.  Only x0-x17 are written
 * and no stack access is made.
 *
 * Dispatch on count (all comparisons are unsigned):
 *     0..3    L(copy4)
 *     4..7    L(copy8)
 *     8..15   L(copy16)
 *    16..32   inline, directly after the dispatch
 *    33..64   L(copy32_128)
 *    65..128  L(copy128) / L(copy96)
 *    > 128    L(copy_long), forwards or backwards
 *
 * In every path up to 128 bytes all loads are issued before the first
 * store.
 */
60SYM_FUNC_START(__pi_memcpy)
	/* srcend/dstend point one byte past the end of each buffer.  */
61	add	srcend, src, count
62	add	dstend, dstin, count
63	cmp	count, 128
64	b.hi	L(copy_long)
65	cmp	count, 32
66	b.hi	L(copy32_128)
67
68	/* Small copies: 0..32 bytes.  */
69	cmp	count, 16
70	b.lo	L(copy16)
	/* 16..32 bytes: copy the first 16 and the last 16 bytes.  The two
	   16-byte windows overlap whenever count is below 32.  */
71	ldp	A_l, A_h, [src]
72	ldp	D_l, D_h, [srcend, -16]
73	stp	A_l, A_h, [dstin]
74	stp	D_l, D_h, [dstend, -16]
75	ret
76
77	/* Copy 8-15 bytes.  */
78L(copy16):
	/* count is below 16 here, so bit 3 set means 8..15: copy the first
	   8 and the last 8 bytes.  Bit 3 clear means 0..7.  */
79	tbz	count, 3, L(copy8)
80	ldr	A_l, [src]
81	ldr	A_h, [srcend, -8]
82	str	A_l, [dstin]
83	str	A_h, [dstend, -8]
84	ret
85
86	.p2align 3
87	/* Copy 4-7 bytes.  */
88L(copy8):
	/* count is below 8 here, so bit 2 set means 4..7: copy the first
	   4 and the last 4 bytes using the 32-bit register views.  */
89	tbz	count, 2, L(copy4)
90	ldr	A_lw, [src]
91	ldr	B_lw, [srcend, -4]
92	str	A_lw, [dstin]
93	str	B_lw, [dstend, -4]
94	ret
95
96	/* Copy 0..3 bytes using a branchless sequence.  */
97L(copy4):
98	cbz	count, L(copy0)
	/* tmp1 = count / 2, i.e. 0 for count 1 and 1 for count 2 or 3.
	   The bytes at offsets 0, tmp1 and count - 1 are copied; together
	   they cover every byte for count 1, 2 and 3 (some more than once).  */
99	lsr	tmp1, count, 1
100	ldrb	A_lw, [src]
101	ldrb	C_lw, [srcend, -1]
102	ldrb	B_lw, [src, tmp1]
103	strb	A_lw, [dstin]
104	strb	B_lw, [dstin, tmp1]
105	strb	C_lw, [dstend, -1]
	/* Shared return, also the target for count == 0 and for
	   dstin == src in the large-copy path.  */
106L(copy0):
107	ret
108
109	.p2align 4
110	/* Medium copies: 33..128 bytes.  */
111L(copy32_128):
	/* Load the first 32 bytes (A, B) and the last 32 bytes (C, D).
	   For count <= 64 these two windows cover the whole buffer.  */
112	ldp	A_l, A_h, [src]
113	ldp	B_l, B_h, [src, 16]
114	ldp	C_l, C_h, [srcend, -32]
115	ldp	D_l, D_h, [srcend, -16]
116	cmp	count, 64
117	b.hi	L(copy128)
118	stp	A_l, A_h, [dstin]
119	stp	B_l, B_h, [dstin, 16]
120	stp	C_l, C_h, [dstend, -32]
121	stp	D_l, D_h, [dstend, -16]
122	ret
123
124	.p2align 4
125	/* Copy 65..128 bytes.  */
126L(copy128):
	/* A..D already hold the first 32 and last 32 bytes.  Add bytes
	   32..63 (E, F); for count <= 96 the first 64 plus the last 32
	   bytes cover everything.  */
127	ldp	E_l, E_h, [src, 32]
128	ldp	F_l, F_h, [src, 48]
129	cmp	count, 96
130	b.ls	L(copy96)
	/* 97..128 bytes: also copy the 32 bytes at srcend - 64.  G and H
	   are aliases of count/dst and src/srcend, so these two loads
	   overwrite those registers.  The second load uses srcend as its
	   base while also replacing it.  Nothing below reads count, dst,
	   src or srcend as a length or pointer again.  */
131	ldp	G_l, G_h, [srcend, -64]
132	ldp	H_l, H_h, [srcend, -48]
133	stp	G_l, G_h, [dstend, -64]
134	stp	H_l, H_h, [dstend, -48]
135L(copy96):
136	stp	A_l, A_h, [dstin]
137	stp	B_l, B_h, [dstin, 16]
138	stp	E_l, E_h, [dstin, 32]
139	stp	F_l, F_h, [dstin, 48]
140	stp	C_l, C_h, [dstend, -32]
141	stp	D_l, D_h, [dstend, -16]
142	ret
143
144	.p2align 4
145	/* Copy more than 128 bytes.  */
146L(copy_long):
147	/* Use backwards copy if there is an overlap.  */
	/* tmp1 = dstin - src, modulo 2^64.  Zero means source and
	   destination are the same address, so there is nothing to do.
	   The unsigned test tmp1 < count selects the backwards copy when
	   dstin lies inside [src, src + count).  */
148	sub	tmp1, dstin, src
149	cbz	tmp1, L(copy0)
150	cmp	tmp1, count
151	b.lo	L(copy_long_backwards)
152
153	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
154
	/* D holds the first 16 source bytes, stored unaligned at dstin
	   below.  tmp1 = dstin & 15; dst is dstin rounded down to 16 bytes
	   and src is moved back by the same tmp1, so src and dst stay the
	   same distance apart.  count grows by tmp1 and becomes
	   dstend - dst.  */
155	ldp	D_l, D_h, [src]
156	and	tmp1, dstin, 15
157	bic	dst, dstin, 15
158	sub	src, src, tmp1
159	add	count, count, tmp1	/* Count is now 16 too large.  */
	/* Preload the 64 bytes destined for dst + 16 .. dst + 79 into
	   A..D.  The last load uses pre-index writeback, so src advances
	   by 64.  */
160	ldp	A_l, A_h, [src, 16]
161	stp	D_l, D_h, [dstin]
162	ldp	B_l, B_h, [src, 32]
163	ldp	C_l, C_h, [src, 48]
164	ldp	D_l, D_h, [src, 64]!
	/* Subtract 144: 16 for the head adjustment above, 64 for the block
	   already held in A..D, and 64 for the tail that
	   L(copy64_from_end) always copies.  b.ls skips the loop when the
	   old count was <= 144.  */
165	subs	count, count, 128 + 16	/* Test and readjust count.  */
166	b.ls	L(copy64_from_end)
167
	/* Each iteration stores the 64 bytes loaded on the previous pass
	   and loads the next 64.  The writeback forms advance dst and src
	   by 64.  The loop repeats while count was above 64 before the
	   subtraction.  */
168L(loop64):
169	stp	A_l, A_h, [dst, 16]
170	ldp	A_l, A_h, [src, 16]
171	stp	B_l, B_h, [dst, 32]
172	ldp	B_l, B_h, [src, 32]
173	stp	C_l, C_h, [dst, 48]
174	ldp	C_l, C_h, [src, 48]
175	stp	D_l, D_h, [dst, 64]!
176	ldp	D_l, D_h, [src, 64]!
177	subs	count, count, 64
178	b.hi	L(loop64)
179
180	/* Write the last iteration and copy 64 bytes from the end.  */
181L(copy64_from_end):
	/* Stores of the 64 bytes still held in A..D are interleaved with
	   loads of the final 64 source bytes (srcend - 64 onwards), which
	   reuse E, A, B and C as each register is freed.  The final
	   64 bytes are then stored at dstend - 64.  */
182	ldp	E_l, E_h, [srcend, -64]
183	stp	A_l, A_h, [dst, 16]
184	ldp	A_l, A_h, [srcend, -48]
185	stp	B_l, B_h, [dst, 32]
186	ldp	B_l, B_h, [srcend, -32]
187	stp	C_l, C_h, [dst, 48]
188	ldp	C_l, C_h, [srcend, -16]
189	stp	D_l, D_h, [dst, 64]
190	stp	E_l, E_h, [dstend, -64]
191	stp	A_l, A_h, [dstend, -48]
192	stp	B_l, B_h, [dstend, -32]
193	stp	C_l, C_h, [dstend, -16]
194	ret
195
196	.p2align 4
197
198	/* Large backwards copy for overlapping copies.
199	   Copy 16 bytes and then align dst to 16-byte alignment.  */
200L(copy_long_backwards):
	/* Mirror image of the forward path, working down from the end.
	   D holds the last 16 source bytes and is stored unaligned at
	   dstend - 16 before dstend is adjusted.  tmp1 = dstend & 15 is
	   removed from srcend, count and (after that store) dstend, which
	   leaves dstend 16-byte aligned.  src and dstin are not modified
	   anywhere on this path.  */
201	ldp	D_l, D_h, [srcend, -16]
202	and	tmp1, dstend, 15
203	sub	srcend, srcend, tmp1
204	sub	count, count, tmp1
	/* Preload the 64 bytes just below the adjusted srcend into A..D.
	   The last load uses pre-index writeback, so srcend drops by 64.  */
205	ldp	A_l, A_h, [srcend, -16]
206	stp	D_l, D_h, [dstend, -16]
207	ldp	B_l, B_h, [srcend, -32]
208	ldp	C_l, C_h, [srcend, -48]
209	ldp	D_l, D_h, [srcend, -64]!
210	sub	dstend, dstend, tmp1
	/* Subtract 128: 64 for the block already held in A..D and 64 for
	   the head that L(copy64_from_start) always copies.  b.ls skips
	   the loop when the old count was <= 128.  */
211	subs	count, count, 128
212	b.ls	L(copy64_from_start)
213
	/* Each iteration stores the 64 bytes loaded on the previous pass
	   and loads the next lower 64.  The writeback forms move dstend
	   and srcend down by 64.  */
214L(loop64_backwards):
215	stp	A_l, A_h, [dstend, -16]
216	ldp	A_l, A_h, [srcend, -16]
217	stp	B_l, B_h, [dstend, -32]
218	ldp	B_l, B_h, [srcend, -32]
219	stp	C_l, C_h, [dstend, -48]
220	ldp	C_l, C_h, [srcend, -48]
221	stp	D_l, D_h, [dstend, -64]!
222	ldp	D_l, D_h, [srcend, -64]!
223	subs	count, count, 64
224	b.hi	L(loop64_backwards)
225
226	/* Write the last iteration and copy 64 bytes from the start.  */
227L(copy64_from_start):
	/* Stores of the 64 bytes still held in A..D are interleaved with
	   loads of the first 64 source bytes, which are then stored at
	   dstin.  G_l/G_h are aliases of count/dst, so the first load here
	   overwrites them; neither is read afterwards.  */
228	ldp	G_l, G_h, [src, 48]
229	stp	A_l, A_h, [dstend, -16]
230	ldp	A_l, A_h, [src, 32]
231	stp	B_l, B_h, [dstend, -32]
232	ldp	B_l, B_h, [src, 16]
233	stp	C_l, C_h, [dstend, -48]
234	ldp	C_l, C_h, [src]
235	stp	D_l, D_h, [dstend, -64]
236	stp	G_l, G_h, [dstin, 48]
237	stp	A_l, A_h, [dstin, 32]
238	stp	B_l, B_h, [dstin, 16]
239	stp	C_l, C_h, [dstin]
240	ret
241SYM_FUNC_END(__pi_memcpy)
242
/*
 * Additional names for the single routine above.  Alias chains as written:
 *   memcpy  -> __memcpy  -> __pi_memcpy
 *   memmove -> __memmove -> __pi_memmove -> __pi_memcpy
 * so memcpy and memmove share one body.
 *
 * NOTE(review): SYM_FUNC_ALIAS, SYM_FUNC_ALIAS_WEAK and EXPORT_SYMBOL are
 * defined outside this file.  Going by the names, the _WEAK form presumably
 * emits a weak symbol that another definition may override, and
 * EXPORT_SYMBOL presumably makes the name available to modules.  Confirm
 * against the headers.
 */
243SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
244EXPORT_SYMBOL(__memcpy)
245SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
246EXPORT_SYMBOL(memcpy)
247
248SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)
249
250SYM_FUNC_ALIAS(__memmove, __pi_memmove)
251EXPORT_SYMBOL(__memmove)
252SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
253EXPORT_SYMBOL(memmove)
254