xref: /freebsd/lib/libc/arm/string/memcpy.S (revision f81cdf24ba5436367377f7c8e8f51f6df2a75ca7)
1/*	$NetBSD: memcpy_xscale.S,v 1.1 2003/10/14 07:51:45 scw Exp $	*/
2
3/*
4 * Copyright 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Steve C. Woodford for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *      This product includes software developed for the NetBSD Project by
20 *      Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 *    or promote products derived from this software without specific prior
23 *    written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38#include <machine/asm.h>
39.syntax	unified
40
41/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
/*
 * memcpy entry point.
 *
 * In:   r0 = dst, r1 = src, r2 = len (see the LINTSTUB prototype).
 * Uses: r3 as the running destination pointer and ip as scratch; r0 is
 *       left unmodified so it still holds dst when the routine returns.
 *
 * Lengths of 12 or less are dispatched to .Lmemcpy_short.  For anything
 * longer, 1-3 leading bytes are copied so that the destination (r3) is
 * word aligned, and execution falls through to .Lmemcpy_wordaligned.
 *
 * NOTE(review): the ble below is a signed comparison, so a len with
 * bit 31 set also takes the short path.
 */
42ENTRY(memcpy)
43	pld	[r1]
44	cmp	r2, #0x0c
45	ble	.Lmemcpy_short		/* <= 12 bytes */
46	mov	r3, r0			/* We must not clobber r0 */
47
48	/* Word-align the destination buffer */
49	ands	ip, r3, #0x03		/* Already word aligned? */
50	beq	.Lmemcpy_wordaligned	/* Yup */
	/*
	 * ip = dst & 3 (1, 2 or 3), so 4 - ip bytes have to be copied.
	 * The flags set by this cmp stay live through the whole sequence
	 * (none of the loads, stores or plain subs below set flags):
	 * one byte is always copied, a second if ip <= 2, a third if ip < 2.
	 */
51	cmp	ip, #0x02
52	ldrb	ip, [r1], #0x01
53	sub	r2, r2, #0x01
54	strb	ip, [r3], #0x01
55	ldrble	ip, [r1], #0x01
56	suble	r2, r2, #0x01
57	strble	ip, [r3], #0x01
58	ldrblt	ip, [r1], #0x01
59	sublt	r2, r2, #0x01
60	strblt	ip, [r3], #0x01
61
62	/* Destination buffer is now word aligned */
/*
 * Destination (r3) is word aligned; r2 = bytes still to copy.
 * If the source is not word aligned as well, control passes to
 * .Lmemcpy_bad_align with ip = src & 3.  Otherwise r4-r9 are saved on
 * the stack and the data is moved with word loads and doubleword (strd)
 * stores: 128 bytes per iteration, then 32 bytes per iteration, then
 * 8-byte groups, one word, and finally up to 3 single bytes.
 * Every exit path restores r4-r9 before returning.
 */
63.Lmemcpy_wordaligned:
64	ands	ip, r1, #0x03		/* Is src also word-aligned? */
65	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */
66
67	/* Quad-align the destination buffer */
	/*
	 * If r3 is not 8-byte aligned, one word is copied to make it so.
	 * The stmfd in the middle does not change the flags, so ldrne,
	 * subne and strne all act on the result of the tst.
	 */
68	tst	r3, #0x07		/* Already quad aligned? */
69	ldrne	ip, [r1], #0x04
70	stmfd	sp!, {r4-r9}		/* Free up some registers */
71	subne	r2, r2, #0x04
72	strne	ip, [r3], #0x04
73
74	/* Destination buffer quad aligned, source is at least word aligned */
	/* r2 is biased by -0x80 for the duration of the 128-byte loop */
75	subs	r2, r2, #0x80
76	blt	.Lmemcpy_w_lessthan128
77
78	/* Copy 128 bytes at a time */
	/*
	 * Loads and stores are interleaved; each strd writes a register
	 * pair (r4/r5, r6/r7 or r8/r9) that was loaded earlier.  The
	 * LD:/ST: comments give the byte offsets within the 128-byte chunk.
	 */
79.Lmemcpy_w_loop128:
80	ldr	r4, [r1], #0x04		/* LD:00-03 */
81	ldr	r5, [r1], #0x04		/* LD:04-07 */
82	pld	[r1, #0x18]		/* Prefetch 0x20 */
83	ldr	r6, [r1], #0x04		/* LD:08-0b */
84	ldr	r7, [r1], #0x04		/* LD:0c-0f */
85	ldr	r8, [r1], #0x04		/* LD:10-13 */
86	ldr	r9, [r1], #0x04		/* LD:14-17 */
87	strd	r4, [r3], #0x08		/* ST:00-07 */
88	ldr	r4, [r1], #0x04		/* LD:18-1b */
89	ldr	r5, [r1], #0x04		/* LD:1c-1f */
90	strd	r6, [r3], #0x08		/* ST:08-0f */
91	ldr	r6, [r1], #0x04		/* LD:20-23 */
92	ldr	r7, [r1], #0x04		/* LD:24-27 */
93	pld	[r1, #0x18]		/* Prefetch 0x40 */
94	strd	r8, [r3], #0x08		/* ST:10-17 */
95	ldr	r8, [r1], #0x04		/* LD:28-2b */
96	ldr	r9, [r1], #0x04		/* LD:2c-2f */
97	strd	r4, [r3], #0x08		/* ST:18-1f */
98	ldr	r4, [r1], #0x04		/* LD:30-33 */
99	ldr	r5, [r1], #0x04		/* LD:34-37 */
100	strd	r6, [r3], #0x08		/* ST:20-27 */
101	ldr	r6, [r1], #0x04		/* LD:38-3b */
102	ldr	r7, [r1], #0x04		/* LD:3c-3f */
103	strd	r8, [r3], #0x08		/* ST:28-2f */
104	ldr	r8, [r1], #0x04		/* LD:40-43 */
105	ldr	r9, [r1], #0x04		/* LD:44-47 */
106	pld	[r1, #0x18]		/* Prefetch 0x60 */
107	strd	r4, [r3], #0x08		/* ST:30-37 */
108	ldr	r4, [r1], #0x04		/* LD:48-4b */
109	ldr	r5, [r1], #0x04		/* LD:4c-4f */
110	strd	r6, [r3], #0x08		/* ST:38-3f */
111	ldr	r6, [r1], #0x04		/* LD:50-53 */
112	ldr	r7, [r1], #0x04		/* LD:54-57 */
113	strd	r8, [r3], #0x08		/* ST:40-47 */
114	ldr	r8, [r1], #0x04		/* LD:58-5b */
115	ldr	r9, [r1], #0x04		/* LD:5c-5f */
116	strd	r4, [r3], #0x08		/* ST:48-4f */
117	ldr	r4, [r1], #0x04		/* LD:60-63 */
118	ldr	r5, [r1], #0x04		/* LD:64-67 */
119	pld	[r1, #0x18]		/* Prefetch 0x80 */
120	strd	r6, [r3], #0x08		/* ST:50-57 */
121	ldr	r6, [r1], #0x04		/* LD:68-6b */
122	ldr	r7, [r1], #0x04		/* LD:6c-6f */
123	strd	r8, [r3], #0x08		/* ST:58-5f */
124	ldr	r8, [r1], #0x04		/* LD:70-73 */
125	ldr	r9, [r1], #0x04		/* LD:74-77 */
126	strd	r4, [r3], #0x08		/* ST:60-67 */
127	ldr	r4, [r1], #0x04		/* LD:78-7b */
128	ldr	r5, [r1], #0x04		/* LD:7c-7f */
129	strd	r6, [r3], #0x08		/* ST:68-6f */
130	strd	r8, [r3], #0x08		/* ST:70-77 */
	/* The final strd does not set flags, so bge tests this subs */
131	subs	r2, r2, #0x80
132	strd	r4, [r3], #0x08		/* ST:78-7f */
133	bge	.Lmemcpy_w_loop128
134
135.Lmemcpy_w_lessthan128:
136	adds	r2, r2, #0x80		/* Adjust for extra sub */
137	ldmfdeq	sp!, {r4-r9}
138	bxeq	lr			/* Return now if done */
	/* r2 is biased by -0x20 for the duration of the 32-byte loop */
139	subs	r2, r2, #0x20
140	blt	.Lmemcpy_w_lessthan32
141
142	/* Copy 32 bytes at a time */
143.Lmemcpy_w_loop32:
144	ldr	r4, [r1], #0x04
145	ldr	r5, [r1], #0x04
146	pld	[r1, #0x18]
147	ldr	r6, [r1], #0x04
148	ldr	r7, [r1], #0x04
149	ldr	r8, [r1], #0x04
150	ldr	r9, [r1], #0x04
151	strd	r4, [r3], #0x08
152	ldr	r4, [r1], #0x04
153	ldr	r5, [r1], #0x04
154	strd	r6, [r3], #0x08
155	strd	r8, [r3], #0x08
156	subs	r2, r2, #0x20
157	strd	r4, [r3], #0x08
158	bge	.Lmemcpy_w_loop32
159
160.Lmemcpy_w_lessthan32:
161	adds	r2, r2, #0x20		/* Adjust for extra sub */
162	ldmfdeq	sp!, {r4-r9}
163	bxeq	lr			/* Return now if done */
164
	/*
	 * r2 is now 1..31.  r4 = 0x18 - (r2 & 0x18) is 8 times the number
	 * of 8-byte groups to skip.  Each group below is four instructions
	 * (16 bytes of code), hence the "lsl #1".  pc reads as the address
	 * of the addne plus 8, i.e. the first ldr after the nop.  When r4
	 * is zero the addne is not executed and all three groups run.
	 * The group layout below must not change without adjusting this.
	 */
165	and	r4, r2, #0x18
166	rsbs	r4, r4, #0x18
167	addne	pc, pc, r4, lsl #1
168	nop
169
170	/* At least 24 bytes remaining */
171	ldr	r4, [r1], #0x04
172	ldr	r5, [r1], #0x04
173	sub	r2, r2, #0x08
174	strd	r4, [r3], #0x08
175
176	/* At least 16 bytes remaining */
177	ldr	r4, [r1], #0x04
178	ldr	r5, [r1], #0x04
179	sub	r2, r2, #0x08
180	strd	r4, [r3], #0x08
181
182	/* At least 8 bytes remaining */
183	ldr	r4, [r1], #0x04
184	ldr	r5, [r1], #0x04
185	subs	r2, r2, #0x08
186	strd	r4, [r3], #0x08
187
188	/* Less than 8 bytes remaining */
	/*
	 * The flags tested by the bxeq come from the subs above, or, when
	 * all three groups were skipped, from the rsbs (non-zero result,
	 * so no early return).  ldmfd does not change them.
	 */
189	ldmfd	sp!, {r4-r9}
190	bxeq	lr			/* Return now if done */
	/* Copy one more word if at least 4 bytes are left */
191	subs	r2, r2, #0x04
192	ldrge	ip, [r1], #0x04
193	strge	ip, [r3], #0x04
194	bxeq	lr			/* Return now if done */
195	addlt	r2, r2, #0x04
	/*
	 * r2 is 1..3 here.  One byte is always copied, a second if
	 * r2 >= 2 and a third if r2 == 3.  Once the cmp has been done r2
	 * is reused as the holding register for the second byte.
	 */
196	ldrb	ip, [r1], #0x01
197	cmp	r2, #0x02
198	ldrbge	r2, [r1], #0x01
199	strb	ip, [r3], #0x01
200	ldrbgt	ip, [r1]
201	strbge	r2, [r3], #0x01
202	strbgt	ip, [r3]
203	bx	lr
204
205
206/*
207 * At this point, it has not been possible to word align both buffers.
208 * The destination buffer is word aligned, but the source buffer is not.
209 */
/*
 * On entry: r3 = word-aligned destination, r1 = source, r2 = bytes
 * remaining, ip = src & 3 (1, 2 or 3; left by the ands at
 * .Lmemcpy_wordaligned).  r4-r7 are saved here and restored on every
 * exit path.
 *
 * r1 is rounded down to a word boundary and only whole aligned words
 * are loaded.  ip always holds the most recently loaded source word;
 * each destination word is assembled from the not-yet-used high bytes
 * of one word (lsr) and the low bytes of the next (lsl).  Shifting in
 * that direction corresponds to little-endian byte order.  The three
 * variants bad1/bad2/bad3 differ only in the shift amounts (8/24,
 * 16/16, 24/8) and in how far r1 is backed up before the byte tail.
 */
210.Lmemcpy_bad_align:
211	stmfd	sp!, {r4-r7}
212	bic	r1, r1, #0x03
	/* The ldr overwrites ip but leaves the flags from the cmp intact */
213	cmp	ip, #2
214	ldr	ip, [r1], #0x04
215	bgt	.Lmemcpy_bad3
216	beq	.Lmemcpy_bad2
217	b	.Lmemcpy_bad1
218
	/* Source offset 1: copy 16 bytes per iteration */
219.Lmemcpy_bad1_loop16:
220	mov	r4, ip, lsr #8
221	ldr	r5, [r1], #0x04
222	pld	[r1, #0x018]
223	ldr	r6, [r1], #0x04
224	ldr	r7, [r1], #0x04
225	ldr	ip, [r1], #0x04
226	orr	r4, r4, r5, lsl #24
227	mov	r5, r5, lsr #8
228	orr	r5, r5, r6, lsl #24
229	mov	r6, r6, lsr #8
230	orr	r6, r6, r7, lsl #24
231	mov	r7, r7, lsr #8
232	orr	r7, r7, ip, lsl #24
233	str	r4, [r3], #0x04
234	str	r5, [r3], #0x04
235	str	r6, [r3], #0x04
236	str	r7, [r3], #0x04
	/* Loop entry point: r2 is tested before the first iteration */
237.Lmemcpy_bad1:
238	subs	r2, r2, #0x10
239	bge	.Lmemcpy_bad1_loop16
240
241	adds	r2, r2, #0x10
242	ldmfdeq	sp!, {r4-r7}
243	bxeq	lr			/* Return now if done */
	/*
	 * Fewer than 16 bytes left.  r2 is biased by -4 from here until
	 * .Lmemcpy_bad_done.  r1 points one word past the word held in ip,
	 * of which 3 bytes are still unused, so r1 - 3 is the next source
	 * byte to copy.
	 */
244	subs	r2, r2, #0x04
245	sublt	r1, r1, #0x03
246	blt	.Lmemcpy_bad_done
247
	/* Source offset 1: copy one word per iteration */
248.Lmemcpy_bad1_loop4:
249	mov	r4, ip, lsr #8
250	ldr	ip, [r1], #0x04
251	subs	r2, r2, #0x04
252	orr	r4, r4, ip, lsl #24
253	str	r4, [r3], #0x04
254	bge	.Lmemcpy_bad1_loop4
255	sub	r1, r1, #0x03
256	b	.Lmemcpy_bad_done
257
	/* Source offset 2: copy 16 bytes per iteration */
258.Lmemcpy_bad2_loop16:
259	mov	r4, ip, lsr #16
260	ldr	r5, [r1], #0x04
261	pld	[r1, #0x018]
262	ldr	r6, [r1], #0x04
263	ldr	r7, [r1], #0x04
264	ldr	ip, [r1], #0x04
265	orr	r4, r4, r5, lsl #16
266	mov	r5, r5, lsr #16
267	orr	r5, r5, r6, lsl #16
268	mov	r6, r6, lsr #16
269	orr	r6, r6, r7, lsl #16
270	mov	r7, r7, lsr #16
271	orr	r7, r7, ip, lsl #16
272	str	r4, [r3], #0x04
273	str	r5, [r3], #0x04
274	str	r6, [r3], #0x04
275	str	r7, [r3], #0x04
	/* Loop entry point: r2 is tested before the first iteration */
276.Lmemcpy_bad2:
277	subs	r2, r2, #0x10
278	bge	.Lmemcpy_bad2_loop16
279
280	adds	r2, r2, #0x10
281	ldmfdeq	sp!, {r4-r7}
282	bxeq	lr			/* Return now if done */
	/* As for bad1, but 2 bytes of the word in ip are still unused */
283	subs	r2, r2, #0x04
284	sublt	r1, r1, #0x02
285	blt	.Lmemcpy_bad_done
286
	/* Source offset 2: copy one word per iteration */
287.Lmemcpy_bad2_loop4:
288	mov	r4, ip, lsr #16
289	ldr	ip, [r1], #0x04
290	subs	r2, r2, #0x04
291	orr	r4, r4, ip, lsl #16
292	str	r4, [r3], #0x04
293	bge	.Lmemcpy_bad2_loop4
294	sub	r1, r1, #0x02
295	b	.Lmemcpy_bad_done
296
	/* Source offset 3: copy 16 bytes per iteration */
297.Lmemcpy_bad3_loop16:
298	mov	r4, ip, lsr #24
299	ldr	r5, [r1], #0x04
300	pld	[r1, #0x018]
301	ldr	r6, [r1], #0x04
302	ldr	r7, [r1], #0x04
303	ldr	ip, [r1], #0x04
304	orr	r4, r4, r5, lsl #8
305	mov	r5, r5, lsr #24
306	orr	r5, r5, r6, lsl #8
307	mov	r6, r6, lsr #24
308	orr	r6, r6, r7, lsl #8
309	mov	r7, r7, lsr #24
310	orr	r7, r7, ip, lsl #8
311	str	r4, [r3], #0x04
312	str	r5, [r3], #0x04
313	str	r6, [r3], #0x04
314	str	r7, [r3], #0x04
	/* Loop entry point: r2 is tested before the first iteration */
315.Lmemcpy_bad3:
316	subs	r2, r2, #0x10
317	bge	.Lmemcpy_bad3_loop16
318
319	adds	r2, r2, #0x10
320	ldmfdeq	sp!, {r4-r7}
321	bxeq	lr			/* Return now if done */
	/* As for bad1, but 1 byte of the word in ip is still unused */
322	subs	r2, r2, #0x04
323	sublt	r1, r1, #0x01
324	blt	.Lmemcpy_bad_done
325
	/* Source offset 3: copy one word per iteration */
326.Lmemcpy_bad3_loop4:
327	mov	r4, ip, lsr #24
328	ldr	ip, [r1], #0x04
329	subs	r2, r2, #0x04
330	orr	r4, r4, ip, lsl #8
331	str	r4, [r3], #0x04
332	bge	.Lmemcpy_bad3_loop4
333	sub	r1, r1, #0x01
334
	/*
	 * Common tail.  r1 again points at the next source byte and r2
	 * still carries the -4 bias; the adds removes it, leaving 0..3
	 * bytes which are copied one at a time (one byte always, a second
	 * if r2 >= 2, a third if r2 == 3; r2 is reused as a data register
	 * once the cmp has been done).
	 */
335.Lmemcpy_bad_done:
336	ldmfd	sp!, {r4-r7}
337	adds	r2, r2, #0x04
338	bxeq	lr
339	ldrb	ip, [r1], #0x01
340	cmp	r2, #0x02
341	ldrbge	r2, [r1], #0x01
342	strb	ip, [r3], #0x01
343	ldrbgt	ip, [r1]
344	strbge	r2, [r3], #0x01
345	strbgt	ip, [r3]
346	bx	lr
347
348
349/*
350 * Handle short copies (12 bytes or fewer), possibly misaligned.
351 * Some of these are *very* common, thanks to the network stack,
352 * and so are handled specially.
353 */
354.Lmemcpy_short:
355#ifndef _STANDALONE
	/*
	 * Jump table indexed by len (r2 = 0..12).  pc reads as the address
	 * of this add plus 8, which is the entry for length 0 (the nop only
	 * fills the slot in between); every entry is one 4-byte instruction.
	 * Lengths 4, 6, 8 and 12 have dedicated routines, length 0 returns
	 * at once, everything else is copied bytewise.
	 */
356	add	pc, pc, r2, lsl #2
357	nop
358	bx	lr			/* 0x00 */
359	b	.Lmemcpy_bytewise	/* 0x01 */
360	b	.Lmemcpy_bytewise	/* 0x02 */
361	b	.Lmemcpy_bytewise	/* 0x03 */
362	b	.Lmemcpy_4		/* 0x04 */
363	b	.Lmemcpy_bytewise	/* 0x05 */
364	b	.Lmemcpy_6		/* 0x06 */
365	b	.Lmemcpy_bytewise	/* 0x07 */
366	b	.Lmemcpy_8		/* 0x08 */
367	b	.Lmemcpy_bytewise	/* 0x09 */
368	b	.Lmemcpy_bytewise	/* 0x0a */
369	b	.Lmemcpy_bytewise	/* 0x0b */
370	b	.Lmemcpy_c		/* 0x0c */
#else
	/*
	 * Without the jump table nothing filters out a zero length before
	 * the byte loop.  That loop decrements r2 and stores a byte before
	 * it tests the count, so entering it with r2 == 0 would wrap the
	 * counter around instead of copying nothing.
	 */
	cmp	r2, #0x00
	bxeq	lr			/* Nothing to copy */
371#endif
	/*
	 * Byte-at-a-time copy of r2 bytes; r2 must be non-zero on entry.
	 * ip always holds the next byte to store, and the load of the
	 * following byte is skipped once the count has reached zero.
	 */
372.Lmemcpy_bytewise:
373	mov	r3, r0			/* We must not clobber r0 */
374	ldrb	ip, [r1], #0x01
3751:	subs	r2, r2, #0x01
376	strb	ip, [r3], #0x01
377	ldrbne	ip, [r1], #0x01
378	bne	1b
379	bx	lr
380
381#ifndef _STANDALONE
382/******************************************************************************
383 * Special case for 4 byte copies
384 */
385#define	LMEMCPY_4_LOG2	6	/* 64 bytes */
386#define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
387	LMEMCPY_4_PAD
/*
 * Copy exactly 4 bytes from r1 to r0, dispatching on the low two bits
 * of both pointers:
 *	r2 = ((dst & 3) << 2) | (src & 3)
 * The "sub r3, pc, #0x14" sits 0x0c bytes after the label and pc reads
 * as its own address plus 8, so r3 = address of .Lmemcpy_4.  Every case
 * is padded to (1 << LMEMCPY_4_LOG2) bytes, so case r2 starts at
 * r3 + (r2 << LMEMCPY_4_LOG2).  Case 0000 is never jumped to: it follows
 * the dispatch code directly, inside the first slot.
 * r0 is left unmodified by every case; r1, r2 and r3 are scratch.
 */
388.Lmemcpy_4:
389	and	r2, r1, #0x03
390	orr	r2, r2, r0, lsl #2
391	ands	r2, r2, #0x0f
392	sub	r3, pc, #0x14
393	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2
394
395/*
396 * 0000: dst is 32-bit aligned, src is 32-bit aligned
397 */
398	ldr	r2, [r1]
399	str	r2, [r0]
400	bx	lr
401	LMEMCPY_4_PAD
402
403/*
404 * 0001: dst is 32-bit aligned, src is 8-bit aligned
405 */
406	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
407	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
408	mov	r3, r3, lsr #8		/* r3 = .210 */
409	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
410	str	r3, [r0]
411	bx	lr
412	LMEMCPY_4_PAD
413
414/*
415 * 0010: dst is 32-bit aligned, src is 16-bit aligned
416 */
417	ldrh	r3, [r1, #0x02]
418	ldrh	r2, [r1]
419	orr	r3, r2, r3, lsl #16
420	str	r3, [r0]
421	bx	lr
422	LMEMCPY_4_PAD
423
424/*
425 * 0011: dst is 32-bit aligned, src is 8-bit aligned
426 */
427	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
428	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
429	mov	r3, r3, lsr #24		/* r3 = ...0 */
430	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
431	str	r3, [r0]
432	bx	lr
433	LMEMCPY_4_PAD
434
435/*
436 * 0100: dst is 8-bit aligned, src is 32-bit aligned
437 */
438	ldr	r2, [r1]
439	strb	r2, [r0]
440	mov	r3, r2, lsr #8
441	mov	r1, r2, lsr #24
442	strb	r1, [r0, #0x03]
443	strh	r3, [r0, #0x01]
444	bx	lr
445	LMEMCPY_4_PAD
446
447/*
448 * 0101: dst is 8-bit aligned, src is 8-bit aligned
449 */
450	ldrb	r2, [r1]
451	ldrh	r3, [r1, #0x01]
452	ldrb	r1, [r1, #0x03]
453	strb	r2, [r0]
454	strh	r3, [r0, #0x01]
455	strb	r1, [r0, #0x03]
456	bx	lr
457	LMEMCPY_4_PAD
458
459/*
460 * 0110: dst is 8-bit aligned, src is 16-bit aligned
461 */
462	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
463	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
464	strb	r2, [r0]
465	mov	r2, r2, lsr #8		/* r2 = ...1 */
466	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
467	mov	r3, r3, lsr #8		/* r3 = ...3 */
468	strh	r2, [r0, #0x01]
469	strb	r3, [r0, #0x03]
470	bx	lr
471	LMEMCPY_4_PAD
472
473/*
474 * 0111: dst is 8-bit aligned, src is 8-bit aligned
475 */
476	ldrb	r2, [r1]
477	ldrh	r3, [r1, #0x01]
478	ldrb	r1, [r1, #0x03]
479	strb	r2, [r0]
480	strh	r3, [r0, #0x01]
481	strb	r1, [r0, #0x03]
482	bx	lr
483	LMEMCPY_4_PAD
484
485/*
486 * 1000: dst is 16-bit aligned, src is 32-bit aligned
487 */
488	ldr	r2, [r1]
489	strh	r2, [r0]
490	mov	r3, r2, lsr #16
491	strh	r3, [r0, #0x02]
492	bx	 lr
493	LMEMCPY_4_PAD
494
495/*
496 * 1001: dst is 16-bit aligned, src is 8-bit aligned
497 */
498	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
499	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
500	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
501	strh	r1, [r0]
502	mov	r2, r2, lsr #24		/* r2 = ...2 */
503	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
504	strh	r2, [r0, #0x02]
505	bx	lr
506	LMEMCPY_4_PAD
507
508/*
509 * 1010: dst is 16-bit aligned, src is 16-bit aligned
510 */
511	ldrh	r2, [r1]
512	ldrh	r3, [r1, #0x02]
513	strh	r2, [r0]
514	strh	r3, [r0, #0x02]
515	bx	lr
516	LMEMCPY_4_PAD
517
518/*
519 * 1011: dst is 16-bit aligned, src is 8-bit aligned
520 */
521	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
522	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
523	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
524	strh	r1, [r0, #0x02]
525	mov	r3, r3, lsl #8		/* r3 = 321. */
526	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
527	strh	r3, [r0]
528	bx	lr
529	LMEMCPY_4_PAD
530
531/*
532 * 1100: dst is 8-bit aligned, src is 32-bit aligned
533 */
534	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
535	strb	r2, [r0]
536	mov	r3, r2, lsr #8
537	mov	r1, r2, lsr #24
538	strh	r3, [r0, #0x01]
539	strb	r1, [r0, #0x03]
540	bx	lr
541	LMEMCPY_4_PAD
542
543/*
544 * 1101: dst is 8-bit aligned, src is 8-bit aligned
545 */
546	ldrb	r2, [r1]
547	ldrh	r3, [r1, #0x01]
548	ldrb	r1, [r1, #0x03]
549	strb	r2, [r0]
550	strh	r3, [r0, #0x01]
551	strb	r1, [r0, #0x03]
552	bx	lr
553	LMEMCPY_4_PAD
554
555/*
556 * 1110: dst is 8-bit aligned, src is 16-bit aligned
557 */
558	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
559	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
560	strb	r2, [r0]
561	mov	r2, r2, lsr #8		/* r2 = ...1 */
562	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
563	strh	r2, [r0, #0x01]
564	mov	r3, r3, lsr #8		/* r3 = ...3 */
565	strb	r3, [r0, #0x03]
566	bx	lr
567	LMEMCPY_4_PAD
568
569/*
570 * 1111: dst is 8-bit aligned, src is 8-bit aligned
571 */
572	ldrb	r2, [r1]
573	ldrh	r3, [r1, #0x01]
574	ldrb	r1, [r1, #0x03]
575	strb	r2, [r0]
576	strh	r3, [r0, #0x01]
577	strb	r1, [r0, #0x03]
578	bx	lr
579	LMEMCPY_4_PAD
580
581
582/******************************************************************************
583 * Special case for 6 byte copies
584 */
585#define	LMEMCPY_6_LOG2	6	/* 64 bytes */
586#define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
587	LMEMCPY_6_PAD
/*
 * Copy exactly 6 bytes from r1 to r0, dispatching on the low two bits
 * of both pointers:
 *	r2 = ((dst & 3) << 2) | (src & 3)
 * The "sub r3, pc, #0x14" sits 0x0c bytes after the label and pc reads
 * as its own address plus 8, so r3 = address of .Lmemcpy_6.  Every case
 * is padded to (1 << LMEMCPY_6_LOG2) bytes, so case r2 starts at
 * r3 + (r2 << LMEMCPY_6_LOG2).  Case 0000 is never jumped to: it follows
 * the dispatch code directly, inside the first slot.
 * r0 is left unmodified by every case; r1, r2, r3 and ip are scratch.
 */
588.Lmemcpy_6:
589	and	r2, r1, #0x03
590	orr	r2, r2, r0, lsl #2
591	ands	r2, r2, #0x0f
592	sub	r3, pc, #0x14
593	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2
594
595/*
596 * 0000: dst is 32-bit aligned, src is 32-bit aligned
597 */
598	ldr	r2, [r1]
599	ldrh	r3, [r1, #0x04]
600	str	r2, [r0]
601	strh	r3, [r0, #0x04]
602	bx	lr
603	LMEMCPY_6_PAD
604
605/*
606 * 0001: dst is 32-bit aligned, src is 8-bit aligned
607 */
608	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
609	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
610	mov	r2, r2, lsr #8		/* r2 = .210 */
611	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
612	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
613	str	r2, [r0]
614	strh	r3, [r0, #0x04]
615	bx	lr
616	LMEMCPY_6_PAD
617
618/*
619 * 0010: dst is 32-bit aligned, src is 16-bit aligned
620 */
621	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
622	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
623	mov	r1, r3, lsr #16		/* r1 = ..54 */
624	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
625	str	r2, [r0]
626	strh	r1, [r0, #0x04]
627	bx	lr
628	LMEMCPY_6_PAD
629
630/*
631 * 0011: dst is 32-bit aligned, src is 8-bit aligned
632 */
633	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
634	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
635	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
636	mov	r2, r2, lsr #24		/* r2 = ...0 */
637	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
638	mov	r1, r1, lsl #8		/* r1 = xx5. */
639	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
640	str	r2, [r0]
641	strh	r1, [r0, #0x04]
642	bx	lr
643	LMEMCPY_6_PAD
644
645/*
646 * 0100: dst is 8-bit aligned, src is 32-bit aligned
647 */
648	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
649	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
650	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
651	strh	r1, [r0, #0x01]
652	strb	r3, [r0]
653	mov	r3, r3, lsr #24		/* r3 = ...3 */
654	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
655	mov	r2, r2, lsr #8		/* r2 = ...5 */
656	strh	r3, [r0, #0x03]
657	strb	r2, [r0, #0x05]
658	bx	lr
659	LMEMCPY_6_PAD
660
661/*
662 * 0101: dst is 8-bit aligned, src is 8-bit aligned
663 */
664	ldrb	r2, [r1]
665	ldrh	r3, [r1, #0x01]
666	ldrh	ip, [r1, #0x03]
667	ldrb	r1, [r1, #0x05]
668	strb	r2, [r0]
669	strh	r3, [r0, #0x01]
670	strh	ip, [r0, #0x03]
671	strb	r1, [r0, #0x05]
672	bx	lr
673	LMEMCPY_6_PAD
674
675/*
676 * 0110: dst is 8-bit aligned, src is 16-bit aligned
677 */
678	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
679	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
680	strb	r2, [r0]
681	mov	r3, r1, lsr #24
682	strb	r3, [r0, #0x05]
683	mov	r3, r1, lsr #8		/* r3 = .543 */
684	strh	r3, [r0, #0x03]
685	mov	r3, r2, lsr #8		/* r3 = ...1 */
686	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
687	strh	r3, [r0, #0x01]
688	bx	lr
689	LMEMCPY_6_PAD
690
691/*
692 * 0111: dst is 8-bit aligned, src is 8-bit aligned
693 */
694	ldrb	r2, [r1]
695	ldrh	r3, [r1, #0x01]
696	ldrh	ip, [r1, #0x03]
697	ldrb	r1, [r1, #0x05]
698	strb	r2, [r0]
699	strh	r3, [r0, #0x01]
700	strh	ip, [r0, #0x03]
701	strb	r1, [r0, #0x05]
702	bx	lr
703	LMEMCPY_6_PAD
704
705/*
706 * 1000: dst is 16-bit aligned, src is 32-bit aligned
707 */
708	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
709	ldr	r3, [r1]		/* r3 = 3210 */
710	mov	r2, r2, lsl #16		/* r2 = 54.. */
711	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
712	strh	r3, [r0]
713	str	r2, [r0, #0x02]
714	bx	lr
715	LMEMCPY_6_PAD
716
717/*
718 * 1001: dst is 16-bit aligned, src is 8-bit aligned
719 */
720	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
721	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
722	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
723	mov	r2, r2, lsl #8		/* r2 = 543. */
724	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
725	strh	r1, [r0]
726	str	r2, [r0, #0x02]
727	bx	lr
728	LMEMCPY_6_PAD
729
730/*
731 * 1010: dst is 16-bit aligned, src is 16-bit aligned
732 */
733	ldrh	r2, [r1]
734	ldr	r3, [r1, #0x02]
735	strh	r2, [r0]
736	str	r3, [r0, #0x02]
737	bx	lr
738	LMEMCPY_6_PAD
739
740/*
741 * 1011: dst is 16-bit aligned, src is 8-bit aligned
742 */
743	ldrb	r3, [r1]		/* r3 = ...0 */
744	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
745	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
746	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
747	mov	r1, r1, lsl #24		/* r1 = 5... */
748	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
749	strh	r3, [r0]
750	str	r1, [r0, #0x02]
751	bx	lr
752	LMEMCPY_6_PAD
753
754/*
755 * 1100: dst is 8-bit aligned, src is 32-bit aligned
756 */
757	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
758	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
759	strb	r2, [r0]
760	mov	r2, r2, lsr #8		/* r2 = .321 */
761	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
762	mov	r1, r1, lsr #8		/* r1 = ...5 */
763	str	r2, [r0, #0x01]
764	strb	r1, [r0, #0x05]
765	bx	lr
766	LMEMCPY_6_PAD
767
768/*
769 * 1101: dst is 8-bit aligned, src is 8-bit aligned
770 */
771	ldrb	r2, [r1]
772	ldrh	r3, [r1, #0x01]
773	ldrh	ip, [r1, #0x03]
774	ldrb	r1, [r1, #0x05]
775	strb	r2, [r0]
776	strh	r3, [r0, #0x01]
777	strh	ip, [r0, #0x03]
778	strb	r1, [r0, #0x05]
779	bx	lr
780	LMEMCPY_6_PAD
781
782/*
783 * 1110: dst is 8-bit aligned, src is 16-bit aligned
784 */
785	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
786	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
787	strb	r2, [r0]
788	mov	r2, r2, lsr #8		/* r2 = ...1 */
789	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
790	mov	r1, r1, lsr #24		/* r1 = ...5 */
791	str	r2, [r0, #0x01]
792	strb	r1, [r0, #0x05]
793	bx	lr
794	LMEMCPY_6_PAD
795
796/*
797 * 1111: dst is 8-bit aligned, src is 8-bit aligned
798 */
799	ldrb	r2, [r1]
800	ldr	r3, [r1, #0x01]
801	ldrb	r1, [r1, #0x05]
802	strb	r2, [r0]
803	str	r3, [r0, #0x01]
804	strb	r1, [r0, #0x05]
805	bx	lr
806	LMEMCPY_6_PAD
807
808
809/******************************************************************************
810 * Special case for 8 byte copies
811 */
812#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
813#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
814	LMEMCPY_8_PAD
/*
 * Copy exactly 8 bytes from r1 to r0, dispatching on the low two bits
 * of both pointers:
 *	r2 = ((dst & 3) << 2) | (src & 3)
 * The "sub r3, pc, #0x14" sits 0x0c bytes after the label and pc reads
 * as its own address plus 8, so r3 = address of .Lmemcpy_8.  Every case
 * is padded to (1 << LMEMCPY_8_LOG2) bytes, so case r2 starts at
 * r3 + (r2 << LMEMCPY_8_LOG2).  Case 0000 is never jumped to: it follows
 * the dispatch code directly, inside the first slot.
 * r0 is left unmodified by every case; r1, r2, r3 and ip are scratch.
 */
815.Lmemcpy_8:
816	and	r2, r1, #0x03
817	orr	r2, r2, r0, lsl #2
818	ands	r2, r2, #0x0f
819	sub	r3, pc, #0x14
820	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2
821
822/*
823 * 0000: dst is 32-bit aligned, src is 32-bit aligned
824 */
825	ldr	r2, [r1]
826	ldr	r3, [r1, #0x04]
827	str	r2, [r0]
828	str	r3, [r0, #0x04]
829	bx	lr
830	LMEMCPY_8_PAD
831
832/*
833 * 0001: dst is 32-bit aligned, src is 8-bit aligned
834 */
835	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
836	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
837	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
838	mov	r3, r3, lsr #8		/* r3 = .210 */
839	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
840	mov	r1, r1, lsl #24		/* r1 = 7... */
841	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
842	str	r3, [r0]
843	str	r2, [r0, #0x04]
844	bx	lr
845	LMEMCPY_8_PAD
846
847/*
848 * 0010: dst is 32-bit aligned, src is 16-bit aligned
849 */
850	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
851	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
852	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
853	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
854	mov	r3, r3, lsr #16		/* r3 = ..54 */
855	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
856	str	r2, [r0]
857	str	r3, [r0, #0x04]
858	bx	lr
859	LMEMCPY_8_PAD
860
861/*
862 * 0011: dst is 32-bit aligned, src is 8-bit aligned
863 */
864	ldrb	r3, [r1]		/* r3 = ...0 */
865	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
866	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
867	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
868	mov	r2, r2, lsr #24		/* r2 = ...4 */
869	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
870	str	r3, [r0]
871	str	r2, [r0, #0x04]
872	bx	lr
873	LMEMCPY_8_PAD
874
875/*
876 * 0100: dst is 8-bit aligned, src is 32-bit aligned
877 */
878	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
879	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
880	strb	r3, [r0]
881	mov	r1, r2, lsr #24		/* r1 = ...7 */
882	strb	r1, [r0, #0x07]
883	mov	r1, r3, lsr #8		/* r1 = .321 */
884	mov	r3, r3, lsr #24		/* r3 = ...3 */
885	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
886	strh	r1, [r0, #0x01]
887	str	r3, [r0, #0x03]
888	bx	lr
889	LMEMCPY_8_PAD
890
891/*
892 * 0101: dst is 8-bit aligned, src is 8-bit aligned
893 */
894	ldrb	r2, [r1]
895	ldrh	r3, [r1, #0x01]
896	ldr	ip, [r1, #0x03]
897	ldrb	r1, [r1, #0x07]
898	strb	r2, [r0]
899	strh	r3, [r0, #0x01]
900	str	ip, [r0, #0x03]
901	strb	r1, [r0, #0x07]
902	bx	lr
903	LMEMCPY_8_PAD
904
905/*
906 * 0110: dst is 8-bit aligned, src is 16-bit aligned
907 */
908	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
909	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
910	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
911	strb	r2, [r0]		/* 0 */
912	mov	ip, r1, lsr #8		/* ip = ...7 */
913	strb	ip, [r0, #0x07]		/* 7 */
914	mov	ip, r2, lsr #8		/* ip = ...1 */
915	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
916	mov	r3, r3, lsr #8		/* r3 = .543 */
917	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
918	strh	ip, [r0, #0x01]
919	str	r3, [r0, #0x03]
920	bx	lr
921	LMEMCPY_8_PAD
922
923/*
924 * 0111: dst is 8-bit aligned, src is 8-bit aligned
925 */
926	ldrb	r3, [r1]		/* r3 = ...0 */
927	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
928	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
929	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
930	strb	r3, [r0]
931	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
932	strh	ip, [r0, #0x01]
933	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
934	str	r2, [r0, #0x03]
935	strb	r1, [r0, #0x07]
936	bx	lr
937	LMEMCPY_8_PAD
938
939/*
940 * 1000: dst is 16-bit aligned, src is 32-bit aligned
941 */
942	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
943	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
944	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
945	strh	r2, [r0]
946	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
947	mov	r3, r3, lsr #16		/* r3 = ..76 */
948	str	r2, [r0, #0x02]
949	strh	r3, [r0, #0x06]
950	bx	lr
951	LMEMCPY_8_PAD
952
953/*
954 * 1001: dst is 16-bit aligned, src is 8-bit aligned
955 */
956	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
957	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
958	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
959	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
960	strh	r1, [r0]
961	mov	r1, r2, lsr #24		/* r1 = ...2 */
962	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
963	mov	r3, r3, lsr #24		/* r3 = ...6 */
964	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
965	str	r1, [r0, #0x02]
966	strh	r3, [r0, #0x06]
967	bx	lr
968	LMEMCPY_8_PAD
969
970/*
971 * 1010: dst is 16-bit aligned, src is 16-bit aligned
972 */
973	ldrh	r2, [r1]
974	ldr	ip, [r1, #0x02]
975	ldrh	r3, [r1, #0x06]
976	strh	r2, [r0]
977	str	ip, [r0, #0x02]
978	strh	r3, [r0, #0x06]
979	bx	lr
980	LMEMCPY_8_PAD
981
982/*
983 * 1011: dst is 16-bit aligned, src is 8-bit aligned
984 */
985	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
986	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
987	ldrb	ip, [r1]		/* ip = ...0 */
988	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
989	strh	r1, [r0, #0x06]
990	mov	r3, r3, lsl #24		/* r3 = 5... */
991	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
992	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
993	str	r3, [r0, #0x02]
994	strh	r2, [r0]
995	bx	lr
996	LMEMCPY_8_PAD
997
998/*
999 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1000 */
1001	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
1002	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1003	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
1004	strh	r1, [r0, #0x05]
1005	strb	r2, [r0]
1006	mov	r1, r3, lsr #24		/* r1 = ...7 */
1007	strb	r1, [r0, #0x07]
1008	mov	r2, r2, lsr #8		/* r2 = .321 */
1009	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
1010	str	r2, [r0, #0x01]
1011	bx	 lr
1012	LMEMCPY_8_PAD
1013
1014/*
1015 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1016 */
1017	ldrb	r3, [r1]		/* r3 = ...0 */
1018	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
1019	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
1020	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
1021	strb	r3, [r0]
1022	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
1023	strh	r3, [r0, #0x05]
1024	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
1025	str	r2, [r0, #0x01]
1026	strb	r1, [r0, #0x07]
1027	bx	lr
1028	LMEMCPY_8_PAD
1029
1030/*
1031 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1032 */
1033	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1034	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1035	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
1036	strb	r2, [r0]
1037	mov	ip, r2, lsr #8		/* ip = ...1 */
1038	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
1039	mov	r2, r1, lsr #8		/* r2 = ...7 */
1040	strb	r2, [r0, #0x07]
1041	mov	r1, r1, lsl #8		/* r1 = .76. */
1042	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
1043	str	ip, [r0, #0x01]
1044	strh	r1, [r0, #0x05]
1045	bx	lr
1046	LMEMCPY_8_PAD
1047
1048/*
1049 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1050 */
1051	ldrb	r2, [r1]
1052	ldr	ip, [r1, #0x01]
1053	ldrh	r3, [r1, #0x05]
1054	ldrb	r1, [r1, #0x07]
1055	strb	r2, [r0]
1056	str	ip, [r0, #0x01]
1057	strh	r3, [r0, #0x05]
1058	strb	r1, [r0, #0x07]
1059	bx	lr
1060	LMEMCPY_8_PAD
1061
1062/******************************************************************************
1063 * Special case for 12 byte copies
1064 */
1065#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
1066#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
1067	LMEMCPY_C_PAD
1068.Lmemcpy_c:
1069	and	r2, r1, #0x03
1070	orr	r2, r2, r0, lsl #2
1071	ands	r2, r2, #0x0f
1072	sub	r3, pc, #0x14
1073	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2
1074
1075/*
1076 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1077 */
1078	ldr	r2, [r1]
1079	ldr	r3, [r1, #0x04]
1080	ldr	r1, [r1, #0x08]
1081	str	r2, [r0]
1082	str	r3, [r0, #0x04]
1083	str	r1, [r0, #0x08]
1084	bx	lr
1085	LMEMCPY_C_PAD
1086
1087/*
1088 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1089 */
1090	ldrb	r2, [r1, #0xb]		/* r2 = ...B */
1091	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
1092	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
1093	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
1094	mov	r2, r2, lsl #24		/* r2 = B... */
1095	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
1096	str	r2, [r0, #0x08]
1097	mov	r2, ip, lsl #24		/* r2 = 7... */
1098	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
1099	mov	r1, r1, lsr #8		/* r1 = .210 */
1100	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
1101	str	r2, [r0, #0x04]
1102	str	r1, [r0]
1103	bx	lr
1104	LMEMCPY_C_PAD
1105
1106/*
1107 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1108 */
1109	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1110	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1111	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
1112	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
1113	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
1114	str	r2, [r0]
1115	mov	r3, r3, lsr #16		/* r3 = ..54 */
1116	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
1117	mov	r1, r1, lsl #16		/* r1 = BA.. */
1118	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
1119	str	r3, [r0, #0x04]
1120	str	r1, [r0, #0x08]
1121	bx	lr
1122	LMEMCPY_C_PAD
1123
1124/*
1125 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1126 */
1127	ldrb	r2, [r1]		/* r2 = ...0 */
1128	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
1129	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
1130	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
1131	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
1132	str	r2, [r0]
1133	mov	r3, r3, lsr #24		/* r3 = ...4 */
1134	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
1135	mov	r1, r1, lsl #8		/* r1 = BA9. */
1136	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
1137	str	r3, [r0, #0x04]
1138	str	r1, [r0, #0x08]
1139	bx	lr
1140	LMEMCPY_C_PAD
1141
1142/*
1143 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
	/*
	 * 12 source bytes are loaded as three aligned words and written
	 * out as byte, halfword, word, word, byte so that every store is
	 * naturally aligned for a destination at offset 1 within a word.
	 */
1144	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1145	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
1146	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
1147	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
1148	strh	r1, [r0, #0x01]
1149	strb	r2, [r0]
1150	mov	r1, r2, lsr #24		/* r1 = ...3 */
1151	orr	r2, r1, r3, lsl #8	/* r2 = 6543 */
1152	mov	r1, r3, lsr #24		/* r1 = ...7 */
1153	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
1154	mov	ip, ip, lsr #24		/* ip = ...B */
1155	str	r2, [r0, #0x03]
1156	str	r1, [r0, #0x07]
1157	strb	ip, [r0, #0x0b]
1158	bx	lr
1159	LMEMCPY_C_PAD
1160	LMEMCPY_C_PAD
1161
1162/*
1163 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
 *
 * Copies source bytes 0..B (12 bytes).  Source and destination share the
 * same misalignment, so no shifting is needed: the data moves as
 * byte/halfword/word/word/byte with matching offsets on both sides.
1164 */
1165	ldrb	r2, [r1]		/* r2 = byte 0 */
1166	ldrh	r3, [r1, #0x01]		/* r3 = bytes 1..2 */
1167	ldr	ip, [r1, #0x03]		/* ip = bytes 3..6 */
1168	strb	r2, [r0]		/* dst[0]; frees r2 for the next load */
1169	ldr	r2, [r1, #0x07]		/* r2 = bytes 7..A */
1170	ldrb	r1, [r1, #0x0b]		/* r1 = byte B (src pointer overwritten) */
1171	strh	r3, [r0, #0x01]		/* dst[1..2] */
1172	str	ip, [r0, #0x03]		/* dst[3..6] */
1173	str	r2, [r0, #0x07]		/* dst[7..A] */
1174	strb	r1, [r0, #0x0b]		/* dst[B] */
1175	bx	lr
1176	LMEMCPY_C_PAD
1177
1178/*
1179 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
 *
 * Copies source bytes 0..B (12 bytes).  The source is read as
 * halfword/word/word/halfword and written as byte/halfword/word/word/byte.
 * Values without a BE:/LE: prefix follow the LE column.
1180 */
1181	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1182	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1183	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
1184	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
1185	strb	r2, [r0]		/* dst[0] = low byte of r2 */
1186	mov	r2, r2, lsr #8		/* r2 = ...1 */
1187	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
1188	strh	r2, [r0, #0x01]		/* dst[1..2] = low halfword of r2 */
1189	mov	r2, r3, lsr #8		/* r2 = .543 */
1190	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
1191	mov	r2, ip, lsr #8		/* r2 = .987 */
1192	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
1193	mov	r1, r1, lsr #8		/* r1 = ...B */
1194	str	r3, [r0, #0x03]		/* dst[3..6] */
1195	str	r2, [r0, #0x07]		/* dst[7..A] */
1196	strb	r1, [r0, #0x0b]		/* dst[B] */
1197	bx	lr
1198	LMEMCPY_C_PAD
1199
1200/*
1201 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
 *
 * Copies source bytes 0..B (12 bytes).  Byte 0 is loaded on its own,
 * followed by three word loads.  The last word load also picks up one
 * byte past byte B ("x"); only the low byte of the final shifted value
 * is stored, so x never reaches the destination.
1202 */
1203	ldrb	r2, [r1]		/* r2 = ...0 */
1204	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
1205	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
1206	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
1207	strb	r2, [r0]		/* dst[0] */
1208	strh	r3, [r0, #0x01]		/* dst[1..2] = low halfword of r3 */
1209	mov	r3, r3, lsr #16		/* r3 = ..43 */
1210	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
1211	mov	ip, ip, lsr #16		/* ip = ..87 */
1212	orr	ip, ip, r1, lsl #16	/* ip = A987 */
1213	mov	r1, r1, lsr #16		/* r1 = ..xB */
1214	str	r3, [r0, #0x03]		/* dst[3..6] */
1215	str	ip, [r0, #0x07]		/* dst[7..A] */
1216	strb	r1, [r0, #0x0b]		/* dst[B] = low byte of r1 */
1217	bx	lr
1218	LMEMCPY_C_PAD
1219
1220/*
1221 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 *
 * Copies source bytes 0..B (12 bytes).  Three word loads are re-split
 * with 16-bit shifts and written as halfword/word/word/halfword.
 * Values without a BE:/LE: prefix follow the LE column.
1222 */
1223	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
1224	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
1225	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
1226	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
1227	strh	ip, [r0]		/* dst[0..1] = low halfword of ip */
1228	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
1229	mov	r3, r3, lsr #16		/* r3 = ..76 */
1230	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
1231	mov	r2, r2, lsr #16		/* r2 = ..BA */
1232	str	r1, [r0, #0x02]		/* dst[2..5] */
1233	str	r3, [r0, #0x06]		/* dst[6..9] */
1234	strh	r2, [r0, #0x0a]		/* dst[A..B] */
1235	bx	lr
1236	LMEMCPY_C_PAD
1237
1238/*
1239 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
 *
 * Copies source bytes 0..B (12 bytes).  The first word load starts at
 * src-1, so it also picks up the byte before byte 0 ("x"); that byte is
 * shifted out before anything is stored.  Byte B is loaded on its own.
1240 */
1241	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
1242	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
1243	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
1244	strh	ip, [r0]		/* dst[0..1] = low halfword of ip */
1245	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
1246	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
1247	mov	r2, r2, lsr #24		/* r2 = ...2 */
1248	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
1249	mov	r3, r3, lsr #24		/* r3 = ...6 */
1250	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
1251	mov	r1, r1, lsl #8		/* r1 = ..B. */
1252	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
1253	str	r2, [r0, #0x02]		/* dst[2..5] */
1254	str	r3, [r0, #0x06]		/* dst[6..9] */
1255	strh	r1, [r0, #0x0a]		/* dst[A..B] */
1256	bx	lr
1257	LMEMCPY_C_PAD
1258
1259/*
1260 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 *
 * Copies source bytes 0..B (12 bytes).  Source and destination share the
 * same misalignment, so the data moves unshifted as
 * halfword/word/word/halfword with matching offsets on both sides.
1261 */
1262	ldrh	r2, [r1]		/* r2 = bytes 0..1 */
1263	ldr	r3, [r1, #0x02]		/* r3 = bytes 2..5 */
1264	ldr	ip, [r1, #0x06]		/* ip = bytes 6..9 */
1265	ldrh	r1, [r1, #0x0a]		/* r1 = bytes A..B (src pointer overwritten) */
1266	strh	r2, [r0]		/* dst[0..1] */
1267	str	r3, [r0, #0x02]		/* dst[2..5] */
1268	str	ip, [r0, #0x06]		/* dst[6..9] */
1269	strh	r1, [r0, #0x0a]		/* dst[A..B] */
1270	bx	lr
1271	LMEMCPY_C_PAD
1272
1273/*
1274 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
 *
 * Copies source bytes 0..B (12 bytes), working from the end of the
 * buffer towards the start.  The word load at src+9 also picks up one
 * byte past byte B ("x"); it is dropped by the halfword store and by the
 * left shift that follows.
1275 */
1276	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
1277	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
1278	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
1279	strh	ip, [r0, #0x0a]		/* dst[A..B] = low halfword of ip */
1280	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
1281	ldrb	r1, [r1]		/* r1 = ...0 */
1282	mov	r2, r2, lsl #24		/* r2 = 9... */
1283	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
1284	mov	r3, r3, lsl #24		/* r3 = 5... */
1285	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
1286	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
1287	str	r2, [r0, #0x06]		/* dst[6..9] */
1288	str	r3, [r0, #0x02]		/* dst[2..5] */
1289	strh	r1, [r0]		/* dst[0..1] = low halfword of r1 */
1290	bx	lr
1291	LMEMCPY_C_PAD
1292
1293/*
1294 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
 *
 * Copies source bytes 0..B (12 bytes).  Three word loads are written
 * back as byte/word/word/halfword/byte: dst+1 and dst+5 are word aligned
 * and dst+9 is halfword aligned.  Values without a BE:/LE: prefix follow
 * the LE column.
1295 */
1296	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1297	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
1298	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
1299	strb	r2, [r0]		/* dst[0] = low byte of r2 */
1300	mov	r3, r2, lsr #8		/* r3 = .321 */
1301	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
1302	str	r3, [r0, #0x01]		/* dst[1..4] */
1303	mov	r3, ip, lsr #8		/* r3 = .765 */
1304	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
1305	str	r3, [r0, #0x05]		/* dst[5..8] */
1306	mov	r1, r1, lsr #8		/* r1 = .BA9 */
1307	strh	r1, [r0, #0x09]		/* dst[9..A] = low halfword of r1 */
1308	mov	r1, r1, lsr #16		/* r1 = ...B */
1309	strb	r1, [r0, #0x0b]		/* dst[B] */
1310	bx	lr
1311	LMEMCPY_C_PAD
1312
1313/*
1314 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
 *
 * Copies source bytes 0..B (12 bytes), working from the end of the
 * buffer towards the start.  The word load at src-1 also picks up the
 * byte before byte 0 ("x"); it is shifted out before the stores.
1315 */
1316	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
1317	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
1318	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
1319	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
1320	strb	r2, [r0, #0x0b]		/* dst[B] */
1321	mov	r2, r3, lsr #16		/* r2 = ..A9 */
1322	strh	r2, [r0, #0x09]		/* dst[9..A] */
1323	mov	r3, r3, lsl #16		/* r3 = 87.. */
1324	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
1325	mov	ip, ip, lsl #16		/* ip = 43.. */
1326	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
1327	mov	r1, r1, lsr #8		/* r1 = .210 */
1328	str	r3, [r0, #0x05]		/* dst[5..8] */
1329	str	ip, [r0, #0x01]		/* dst[1..4] */
1330	strb	r1, [r0]		/* dst[0] = low byte of r1 */
1331	bx	lr
1332	LMEMCPY_C_PAD
1333
1334/*
1335 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
 *
 * Copies source bytes 0..B (12 bytes).  The source is read as
 * halfword/word/word/halfword and written as byte/word/word/halfword/byte.
 * Digits in the comments name source bytes, most significant first.
1336 */
1337	ldrh	r2, [r1]		/* r2 = ..10 */
1338	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
1339	ldr	ip, [r1, #0x06]		/* ip = 9876 */
1340	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
1341	strb	r2, [r0]		/* dst[0] = low byte of r2 */
1342	mov	r2, r2, lsr #8		/* r2 = ...1 */
1343	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
1344	mov	r3, r3, lsr #24		/* r3 = ...5 */
1345	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
1346	mov	ip, ip, lsr #24		/* ip = ...9 */
1347	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
1348	mov	r1, r1, lsr #8		/* r1 = ...B */
1349	str	r2, [r0, #0x01]		/* dst[1..4] */
1350	str	r3, [r0, #0x05]		/* dst[5..8] */
1351	strh	ip, [r0, #0x09]		/* dst[9..A] = low halfword of ip */
1352	strb	r1, [r0, #0x0b]		/* dst[B] */
1353	bx	lr
1354	LMEMCPY_C_PAD
1355
1356/*
1357 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
 *
 * Copies source bytes 0..B (12 bytes).  Source and destination share the
 * same misalignment, so the data moves unshifted as
 * byte/word/word/halfword/byte with matching offsets on both sides.
1358 */
1359	ldrb	r2, [r1]		/* r2 = byte 0 */
1360	ldr	r3, [r1, #0x01]		/* r3 = bytes 1..4 */
1361	ldr	ip, [r1, #0x05]		/* ip = bytes 5..8 */
1362	strb	r2, [r0]		/* dst[0]; frees r2 for the next load */
1363	ldrh	r2, [r1, #0x09]		/* r2 = bytes 9..A */
1364	ldrb	r1, [r1, #0x0b]		/* r1 = byte B (src pointer overwritten) */
1365	str	r3, [r0, #0x01]		/* dst[1..4] */
1366	str	ip, [r0, #0x05]		/* dst[5..8] */
1367	strh	r2, [r0, #0x09]		/* dst[9..A] */
1368	strb	r1, [r0, #0x0b]		/* dst[B] */
1369	bx	lr
1370#endif	/* !_STANDALONE */
1371END(memcpy)
1372
1373	.section .note.GNU-stack,"",%progbits
1374