xref: /freebsd/contrib/cortex-strings/src/arm/memcpy.S (revision 05427f4639bcf2703329a9be9d25ec09bb782742)
/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

/* Build configuration.  This file is run through the C preprocessor
   before it is assembled; the `#' of every directive below must be the
   first non-blank character on its line.  */

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

/* Select one of three variants from the compiler's predefined macros:

     __ARM_NEON__      NEON + VFP code paths.  Only tmp2 is spilled, so
		       the stack frame is a single word.
     !__SOFTFP__       VFP for mutually aligned copies, GP registers
		       otherwise.
     otherwise	       GP registers only.

   The two non-NEON variants spill tmp2 and the B/C/D register pairs, so
   they need a 32-byte frame: tmp2 at [sp], the pairs at [sp, #8],
   [sp, #16] and [sp, #24].  */
#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

/* Constants used by the computed jumps (`add pc, pc, ...').  In ARM
   state a read of pc returns the address of the reading instruction
   plus 8.  */
#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4	/* Every ARM-state instruction is 4 bytes.  */

/* Call parameters.  */
#define dstin	r0		/* Never written: doubles as the return value.  */
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip		/* Running destination pointer.  */
#define tmp2	r10		/* Callee-saved: spilled to the frame before use.  */

#ifndef USE_NEON
/* For bulk copies using GP registers.  A_l/A_h alias count/tmp1, so the
   byte count must already live in tmp2 before pair A is loaded.  Pairs
   B, C and D are callee-saved and are spilled to the frame first.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
/* cpy_line_vfp vreg, base

   Software-pipelined copy of the 64-byte line at [dst, #base].  Every
   store writes data that an earlier load fetched, and is immediately
   followed by the load that refills the same register.

   Expected on entry (this is how the long-copy path of memcpy sets
   things up; the macro itself does not check it):
     - src runs 32 bytes ahead of dst;
     - \vreg holds the doubleword for [dst, #base];
     - d0, d1, d2 hold the doublewords for [dst, #base + 8/16/24].

   On exit d0-d2 hold the +8/+16/+24 doublewords of the following line,
   and \vreg has been reloaded from [src, #base + prefetch_lines * 64 - 32],
   i.e. the first doubleword of the line prefetch_lines further on.  That
   read-ahead load stands in for a PLD.

   Neither dst nor src is modified; only \vreg and d0-d2 are written.  */
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	/* Read ahead: first doubleword of a line prefetch_lines away.  */
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

/* cpy_tail_vfp vreg, base

   Same as cpy_line_vfp, except that \vreg is NOT reloaded after it has
   been stored to [dst, #base + 32].  Used to drain the pipeline at the
   end of a long copy so that nothing is read prefetch_lines ahead of
   the current position.  */
	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

/* def_fn f [p2align=N]

   Open the definition of global function F: switch to .text, align the
   location counter to a 2^N-byte boundary (N defaults to 0), export F
   with type %function and emit its label.  The caller supplies the body
   and the matching .size directive.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/*
 * void *memcpy (void *dstin, const void *src, size_t count)
 *
 * In:   r0 = dstin, r1 = src, r2 = count.
 * Out:  r0 = dstin.  r0 is never written, so it is returned unchanged.
 * Uses: r1-r3, ip and the flags as scratch, plus d0-d7 in the VFP/NEON
 *       code paths.  For copies of 64 bytes or more tmp2 (r10) is saved
 *       in a FRAME_SIZE-byte stack frame; the GP-register bulk loops also
 *       spill r4-r9 into that frame and restore them before returning.
 *
 * Structure:
 *   count < 64                      .Ltail63unaligned
 *   (src ^ dst) & 7 == 0            align dst, then .Lcpy_body_medium or
 *                                   .Lcpy_body_long, then .Ltail63aligned
 *   otherwise                       .Lcpy_notaligned, then .Ltail63unaligned
 *
 * COUNT is a size_t, so every test on the remaining length uses the
 * unsigned conditions (hs/lo/cs).  Signed conditions would see a length
 * of 2 GiB or more as negative and drop into the short-copy tail.
 *
 * The tails use computed jumps (`add pc, pc, ...') into straight-line
 * load/store sequences.  Do not insert or remove instructions between
 * such an add and the end of the sequence it jumps into.
 */
def_fn memcpy p2align=6

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bhs	.Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

.Ltail63unaligned:
	/* Copy the low six bits of COUNT bytes.  Higher bits of COUNT are
	   ignored, so this is also entered with COUNT biased or wrapped.  */
#ifdef USE_NEON
	/* Each 8 bytes of data take one load/store pair, i.e. 8 bytes of
	   code, so the number of bytes to skip is also the code offset.  */
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	/* Each 4 bytes of data take 8 bytes of code, hence the halved
	   constants here and the `lsl #1' in the jump.  */
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

	/* Bit 1 of COUNT goes to C (copy a halfword), bit 0 decides Z
	   (copy a final byte).  */
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!	/* Open the frame, save tmp2.  */
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	.Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
	lsls	tmp2, dst, #29		/* dst & 7 in the top three bits.  */
	beq	1f
	rsbs	tmp2, tmp2, #0		/* Top bits = bytes to alignment (1-7);
					   N = its 4s bit.  */
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2		/* C = 2s bit, NE if the 1s bit is set.  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blo	.Ltail63aligned

	cmp	tmp2, #512
	bhs	.Lcpy_body_long

.Lcpy_body_medium:			/* Count in tmp2.  */
	/* tmp2 holds (bytes remaining - 64).  Copy 64 bytes per iteration
	   until the subtraction borrows.  */
#ifdef USE_VFP
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bhs	1b			/* Flags still from the subs above.  */
	tst	tmp2, #0x3f
	beq	.Ldone

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 doublewords; the low six bits of tmp2 give the
	   number of bytes left.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	/* Bias both pointers by -8 so the last pair of each iteration can
	   use pre-indexed writeback.  */
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bhs	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	/* Undo the -8 bias before entering the tail.  */
	add	src, src, #8
	add	dst, dst, #8

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	/* Final 0-7 bytes: a word, then a halfword, then a byte.  */
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead. */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

.Ldone:
	ldr	tmp2, [sp], #FRAME_SIZE	/* Restore tmp2 and drop the frame.  */
	bx	lr

.Lcpy_body_long:			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

	/* Prime d3-d7 with the first doubleword of each of the next five
	   lines and d0-d2 with the rest of the first half line, then run
	   src 32 bytes ahead of dst as cpy_line_vfp expects.  */
	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	/* Each pass of the loop copies prefetch_lines lines while reading
	   a further prefetch_lines lines ahead, so two such groups must
	   remain for the loop to run.  */
	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blo	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bhs	1b

2:
	/* Drain the pipeline: store the prefetch_lines lines already
	   begun without reading any further ahead.  */
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96		/* 128 minus the 32-byte lead.  */
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	/* Back to (bytes remaining - 64) for the medium loop.  */
	add	tmp2, tmp2, #prefetch_lines * 64
	b	.Lcpy_body_medium
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	/* Spill each callee-saved pair just before its first use.  */
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b			/* Flags from the most recent subs.  */
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	.Ltail63aligned
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

.Lcpy_notaligned:
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29		/* dst & 7 in the top three bits.  */
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0		/* Top bits = bytes to alignment (1-7);
					   N = its 4s bit.  */
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2		/* C = 2s bit, NE if the 1s bit is set.  */
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	/* Fewer than 64 bytes left after aligning: drop the frame and
	   finish in the short tail, which only reads COUNT's low bits.  */
	ldrlo	tmp2, [sp], #FRAME_SIZE
	blo	.Ltail63unaligned
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
	/* Two 32-byte register groups in flight; COUNT stays biased by
	   the 64 bytes already loaded.  */
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	blo	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bhs	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	/* SRC may be unaligned, so it is read a word at a time; DST is
	   64-bit aligned, so it is written with STRD.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]		/* A_l is COUNT's register: COUNT is dead.  */
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b			/* Flags from the most recent subs.  */

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	/* LDR leaves the flags alone, so the BNE below still tests the
	   result of the ANDS above.  */
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	.Ltail63unaligned
	bx	lr

	.size	memcpy, . - memcpy
