/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2013-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

#include "../asmdefs.h"

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
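	/* Copy one 64-byte line with four D registers, interleaving stores
	   and loads so the FP pipeline stays busy.  The reload of \vreg at
	   base + prefetch_lines * 64 - 32 fetches the first doubleword of
	   the line prefetch_lines ahead and so stands in for a PLD.  */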
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

ENTRY (__memcpy_arm)

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bhs	L(cpy_not_short)
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

L(tail63unaligned):
#ifdef USE_NEON
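	/* Computed branch into the sequence below: each vld1/vst1 pair
	   copies 8 bytes and occupies 2 * INSN_SIZE bytes of code, so
	   skipping (56 - (count & 0x38)) bytes of code leaves exactly the
	   right number of pairs.  PC_OFFSET accounts for the PC reading
	   8 bytes ahead of the add in ARM state.  */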
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
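	/* Each word copied needs an ldr/str pair (2 * INSN_SIZE bytes of
	   code per 4 bytes of data), hence the extra lsl #1 here.  */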
	add	pc, pc, tmp1, lsl #1

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

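	/* Shifting count left by 31 puts bit 1 in the carry flag and bit 0
	   in the N/Z flags, so the CS and NE copies below handle the last
	   0-3 bytes.  */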
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

L(cpy_not_short):
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	L(cpy_notaligned)

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
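	/* dst << 29 leaves the low three address bits in bits 31:29, so Z
	   means dst is already 8-byte aligned.  After negation, MI selects
	   a word copy and, after a further shift, CS/NE select the halfword
	   and byte copies needed to reach alignment; count is reduced by
	   the same amount (tmp2 >> 29).  */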
	lsls	tmp2, dst, #29
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blo	L(tail63aligned)

	cmp	tmp2, #512
	bhs	L(cpy_body_long)

L(cpy_body_medium):			/* Count in tmp2.  */
#ifdef USE_VFP
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bhs	1b
	tst	tmp2, #0x3f
	beq	L(done)

L(tail63aligned):			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bhs	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	add	src, src, #8
	add	dst, dst, #8

L(tail63aligned):			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead. */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

L(done):
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

L(cpy_body_long):			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

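	/* Prime the pipeline: d3-d7 hold the leading doubleword of each of
	   the next five 64-byte lines, d0-d2 the rest of the first 32 bytes;
	   cpy_line_vfp keeps them topped up as the copy advances.  */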
	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blo	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bhs	1b

2:
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	L(cpy_body_medium)
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
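	/* B, C and D use callee-saved registers (r4-r9); their old values
	   are parked in the stack frame reserved at cpy_not_short and
	   restored once the copy loop is done.  */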
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	L(tail63aligned)
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

L(cpy_notaligned):
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
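	/* Same trick as in cpy_not_short: the low three bits of dst land in
	   bits 31:29 of tmp2, then MI/CS/NE select the word, halfword and
	   byte copies needed to align dst.  */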
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrlo	tmp2, [sp], #FRAME_SIZE
	blo	L(tail63unaligned)
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
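	/* DST is now 8-byte aligned, so the stores can carry the :64
	   alignment qualifier via the ALIGN macro; SRC stays unaligned.  */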
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	blo	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bhs	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	L(tail63unaligned)
	bx	lr

END (__memcpy_arm)