xref: /freebsd/sys/arm/arm/support.S (revision c7142afec42c4f0a3aa4da845f4c4e15b5e3f018)
1/*-
2 * Copyright (c) 2004 Olivier Houchard
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26/*
27 * Copyright 2003 Wasabi Systems, Inc.
28 * All rights reserved.
29 *
30 * Written by Steve C. Woodford for Wasabi Systems, Inc.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 *    notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 *    notice, this list of conditions and the following disclaimer in the
39 *    documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 *    must display the following acknowledgement:
42 *      This product includes software developed for the NetBSD Project by
43 *      Wasabi Systems, Inc.
44 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
45 *    or promote products derived from this software without specific prior
46 *    written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
50 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
51 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
52 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
53 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
54 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
55 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
56 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
57 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
58 * POSSIBILITY OF SUCH DAMAGE.
59 */
60/*
61 * Copyright (c) 1997 The NetBSD Foundation, Inc.
62 * All rights reserved.
63 *
64 * This code is derived from software contributed to The NetBSD Foundation
65 * by Neil A. Carson and Mark Brinicombe
66 *
67 * Redistribution and use in source and binary forms, with or without
68 * modification, are permitted provided that the following conditions
69 * are met:
70 * 1. Redistributions of source code must retain the above copyright
71 *    notice, this list of conditions and the following disclaimer.
72 * 2. Redistributions in binary form must reproduce the above copyright
73 *    notice, this list of conditions and the following disclaimer in the
74 *    documentation and/or other materials provided with the distribution.
75 *
76 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
77 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
78 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
79 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
80 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
81 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
82 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
83 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
84 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
85 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
86 * POSSIBILITY OF SUCH DAMAGE.
87 */
88
89#include <machine/asm.h>
90#include "assym.inc"
91
92	.syntax	unified
93
/*
 * memset: Sets a block of memory to the specified value
 *
 * On entry:
 *   r0 - dest address
 *   r1 - byte to write
 *   r2 - number of bytes to write
 *
 * On exit:
 *   r0 - dest address
 */
/* LINTSTUB: Func: void *memset(void *, int, size_t) */
ENTRY(memset)
	/*
	 * Register use throughout:
	 *   ip - destination cursor (r0 is preserved as the return value)
	 *   r3 - fill byte, widened to 16 then 32 bits for word stores
	 *   r1 - bytes remaining
	 *   r2 - scratch (alignment bits, then duplicate of r3 for strd)
	 */
	and	r3, r1, #0xff		/* We deal with bytes */
	mov	r1, r2
do_memset:
	cmp	r1, #0x04		/* Do we have less than 4 bytes */
	mov	ip, r0
	blt	.Lmemset_lessthanfour

	/* Ok first we will word align the address */
	ands	r2, ip, #0x03		/* Get the bottom two bits */
	bne	.Lmemset_wordunaligned	/* The address is not word aligned */

	/* We are now word aligned */
.Lmemset_wordaligned:
	orr	r3, r3, r3, lsl #8	/* Extend value to 16-bits */
	tst	ip, #0x04		/* Quad-align for armv5e */
	orr	r3, r3, r3, lsl #16	/* Extend value to 32-bits */
	subne	r1, r1, #0x04		/* Quad-align if necessary */
	strne	r3, [ip], #0x04
	cmp	r1, #0x10
	blt	.Lmemset_loop4		/* If less than 16 then use words */
	mov	r2, r3			/* Duplicate data for strd (r2:r3 pair) */
	cmp	r1, #0x80		/* If < 128 then skip the big loop */
	blt	.Lmemset_loop32

	/* Do 128 bytes at a time */
.Lmemset_loop128:
	subs	r1, r1, #0x80
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	bgt	.Lmemset_loop128
	RETeq			/* Zero length so just exit */

	add	r1, r1, #0x80		/* Adjust for extra sub */

	/* Do 32 bytes at a time */
.Lmemset_loop32:
	subs	r1, r1, #0x20
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	bgt	.Lmemset_loop32
	RETeq			/* Zero length so just exit */

	adds	r1, r1, #0x10		/* Partially adjust for extra sub */

	/* Deal with 16 bytes or more */
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	RETeq			/* Zero length so just exit */

	addlt	r1, r1, #0x10		/* Possibly adjust for extra sub */

	/* We have at least 4 bytes so copy as words */
.Lmemset_loop4:
	subs	r1, r1, #0x04
	strge	r3, [ip], #0x04
	bgt	.Lmemset_loop4
	RETeq			/* Zero length so just exit */

	/* Compensate for 64-bit alignment check */
	adds	r1, r1, #0x04
	RETeq
	cmp	r1, #2

	strb	r3, [ip], #0x01		/* Set 1 byte */
	strbge	r3, [ip], #0x01		/* Set another byte */
	strbgt	r3, [ip]		/* and a third */
	RET			/* Exit */

.Lmemset_wordunaligned:
	rsb	r2, r2, #0x004		/* r2 = bytes needed to word-align */
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r2, #0x02
	strbge	r3, [ip], #0x01		/* Set another byte */
	sub	r1, r1, r2
	strbgt	r3, [ip], #0x01		/* and a third */
	cmp	r1, #0x04		/* More than 4 bytes left? */
	bge	.Lmemset_wordaligned	/* Yup */

.Lmemset_lessthanfour:
	cmp	r1, #0x00
	RETeq			/* Zero length so exit */
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r1, #0x02
	strbge	r3, [ip], #0x01		/* Set another byte */
	strbgt	r3, [ip]		/* and a third */
	RET			/* Exit */
END(memset)
210
/*
 * int memcmp(const void *b1, const void *b2, size_t len)
 *
 * Register use:
 *   ip - b1 cursor (r0 is needed for the return value)
 *   r1 - b2 cursor
 *   r2 - bytes remaining
 *   r3 - scratch / loaded byte from b2
 * Returns <0, 0 or >0 in r0 per the usual memcmp contract.
 */
ENTRY(memcmp)
	mov	ip, r0
	cmp	r2, #0x06
	beq	.Lmemcmp_6bytes		/* Fast path: 6-byte compares (network stack) */
	mov	r0, #0x00

	/* Are both addresses aligned the same way? */
	cmp	r2, #0x00
	eorsne	r3, ip, r1
	RETeq			/* len == 0, or same addresses! */
	tst	r3, #0x03
	subne	r2, r2, #0x01
	bne	.Lmemcmp_bytewise2	/* Badly aligned. Do it the slow way */

	/*
	 * Word-align the addresses, if necessary.
	 * Computed jump: each pre-alignment byte compare below is 6
	 * instructions (24 bytes), so skip (r3 * 24) bytes into the
	 * sequence.  Do not add or remove instructions in these stanzas.
	 */
	sub	r3, r1, #0x05
	ands	r3, r3, #0x03
	add	r3, r3, r3, lsl #1
	addne	pc, pc, r3, lsl #3
	nop

	/* Compare up to 3 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare up to 2 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 1 byte */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 4 bytes at a time, if possible */
	subs	r2, r2, #0x04
	bcc	.Lmemcmp_bytewise
.Lmemcmp_word_aligned:
	ldr	r0, [ip], #0x04
	ldr	r3, [r1], #0x04
	subs	r2, r2, #0x04
	cmpcs	r0, r3
	beq	.Lmemcmp_word_aligned
	sub	r0, r0, r3

	/* Correct for extra subtraction, and check if done */
	adds	r2, r2, #0x04
	cmpeq	r0, #0x00		/* If done, did all bytes match? */
	RETeq			/* Yup. Just return */

	/* Re-do the final word byte-wise */
	sub	ip, ip, #0x04
	sub	r1, r1, #0x04

.Lmemcmp_bytewise:
	add	r2, r2, #0x03
.Lmemcmp_bytewise2:
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r2, r2, #0x01
	cmpcs	r0, r3
	beq	.Lmemcmp_bytewise2
	sub	r0, r0, r3
	RET

	/*
	 * 6 byte compares are very common, thanks to the network stack.
	 * This code is hand-scheduled to reduce the number of stalls for
	 * load results. Everything else being equal, this will be ~32%
	 * faster than a byte-wise memcmp.
	 */
	.align	5
.Lmemcmp_6bytes:
	ldrb	r3, [r1, #0x00]		/* r3 = b2#0 */
	ldrb	r0, [ip, #0x00]		/* r0 = b1#0 */
	ldrb	r2, [r1, #0x01]		/* r2 = b2#1 */
	subs	r0, r0, r3		/* r0 = b1#0 - b2#0 */
	ldrbeq	r3, [ip, #0x01]		/* r3 = b1#1 */
	RETne			/* Return if mismatch on #0 */
	subs	r0, r3, r2		/* r0 = b1#1 - b2#1 */
	ldrbeq	r3, [r1, #0x02]		/* r3 = b2#2 */
	ldrbeq	r0, [ip, #0x02]		/* r0 = b1#2 */
	RETne			/* Return if mismatch on #1 */
	ldrb	r2, [r1, #0x03]		/* r2 = b2#3 */
	subs	r0, r0, r3		/* r0 = b1#2 - b2#2 */
	ldrbeq	r3, [ip, #0x03]		/* r3 = b1#3 */
	RETne			/* Return if mismatch on #2 */
	subs	r0, r3, r2		/* r0 = b1#3 - b2#3 */
	ldrbeq	r3, [r1, #0x04]		/* r3 = b2#4 */
	ldrbeq	r0, [ip, #0x04]		/* r0 = b1#4 */
	RETne			/* Return if mismatch on #3 */
	ldrb	r2, [r1, #0x05]		/* r2 = b2#5 */
	subs	r0, r0, r3		/* r0 = b1#4 - b2#4 */
	ldrbeq	r3, [ip, #0x05]		/* r3 = b1#5 */
	RETne			/* Return if mismatch on #4 */
	sub	r0, r3, r2		/* r0 = b1#5 - b2#5 */
	RET
END(memcmp)
320
/*
 * void *memmove(void *dst, const void *src, size_t len)
 *
 * On entry:
 *   r0 - dst, r1 - src, r2 - len
 * Returns dst in r0.
 *
 * If the buffers do not overlap this tail-calls memcpy.  Otherwise a
 * forward copy (.Lmemmove_f*) or backward copy (.Lmemmove_b*) is chosen
 * so the overlapping region is never clobbered before it is read.  The
 * *srcul* paths handle a source that cannot be word-aligned together
 * with the destination: whole words are loaded and byte-shifted into
 * place (little-endian shift/or sequences).
 */
ENTRY(memmove)
	/* Do the buffers overlap? */
	cmp	r0, r1
	RETeq		/* Bail now if src/dst are the same */
	subcc	r3, r0, r1	/* if (dst > src) r3 = dst - src */
	subcs	r3, r1, r0	/* if (src > dst) r3 = src - dst */
	cmp	r3, r2		/* if (r3 < len) we have an overlap */
	bcc	PIC_SYM(_C_LABEL(memcpy), PLT)

	/* Determine copy direction */
	cmp	r1, r0
	bcc	.Lmemmove_backwards

	moveq	r0, #0			/* Quick abort for len=0 */
	RETeq

	stmdb	sp!, {r0, lr}		/* memmove() returns dest addr */
	subs	r2, r2, #4
	blt	.Lmemmove_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_fsrcul		/* oh unaligned source addr */

.Lmemmove_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemmove_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_floop32

	cmn	r2, #0x10
	ldmiage	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmiage	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemmove_fl32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemmove_floop12:
	ldmiage	r1!, {r3, r12, lr}
	stmiage	r0!, {r3, r12, lr}
	subsge	r2, r2, #0x0c
	bge	.Lmemmove_floop12

.Lmemmove_fl12:
	adds	r2, r2, #8
	blt	.Lmemmove_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmiage	r1!, {r3, r12}
	stmiage	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	ldmiaeq	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrbge	r3, [r1], #1
	strbge	r3, [r0], #1
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemmove_fdestul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrbge	r3, [r1], #1
	strbge	r3, [r0], #1
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemmove_fl4		/* less the 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemmove_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemmove_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemmove_fsrcul3
	beq	.Lmemmove_fsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul1l4

.Lmemmove_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul1loop4

.Lmemmove_fsrcul1l4:
	sub	r1, r1, #3
	b	.Lmemmove_fl4

.Lmemmove_fsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul2l4

.Lmemmove_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul2loop4

.Lmemmove_fsrcul2l4:
	sub	r1, r1, #2
	b	.Lmemmove_fl4

.Lmemmove_fsrcul3:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul3l4

.Lmemmove_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul3loop4

.Lmemmove_fsrcul3l4:
	sub	r1, r1, #1
	b	.Lmemmove_fl4

.Lmemmove_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	.Lmemmove_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_bdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_bsrcul		/* oh unaligned source addr */

.Lmemmove_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	.Lmemmove_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_bloop32

.Lmemmove_bl32:
	cmn	r2, #0x10
	ldmdbge	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmdbge	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmdbge	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmdbge	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, lr}

.Lmemmove_bl12:
	adds	r2, r2, #8
	blt	.Lmemmove_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmdbge	r1!, {r3, r12}
	stmdbge	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	RETeq			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrbge	r3, [r1, #-1]!
	strbge	r3, [r0, #-1]!
	ldrbgt	r3, [r1, #-1]!
	strbgt	r3, [r0, #-1]!
	RET

	/* erg - unaligned destination */
.Lmemmove_bdestul:
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrbge	r3, [r1, #-1]!
	strbge	r3, [r0, #-1]!
	ldrbgt	r3, [r1, #-1]!
	strbgt	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	.Lmemmove_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	.Lmemmove_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemmove_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	.Lmemmove_bsrcul1
	beq	.Lmemmove_bsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul3l4

.Lmemmove_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul3loop4

.Lmemmove_bsrcul3l4:
	add	r1, r1, #3
	b	.Lmemmove_bl4

.Lmemmove_bsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul2l4

.Lmemmove_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul2loop4

.Lmemmove_bsrcul2l4:
	add	r1, r1, #2
	b	.Lmemmove_bl4

.Lmemmove_bsrcul1:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul1loop32:
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul1l4

.Lmemmove_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul1loop4

.Lmemmove_bsrcul1l4:
	add	r1, r1, #1
	b	.Lmemmove_bl4
END(memmove)
727
728/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
729ENTRY(memcpy)
730	pld	[r1]
731	cmp	r2, #0x0c
732	ble	.Lmemcpy_short		/* <= 12 bytes */
733	mov	r3, r0			/* We must not clobber r0 */
734
735	/* Word-align the destination buffer */
736	ands	ip, r3, #0x03		/* Already word aligned? */
737	beq	.Lmemcpy_wordaligned	/* Yup */
738	cmp	ip, #0x02
739	ldrb	ip, [r1], #0x01
740	sub	r2, r2, #0x01
741	strb	ip, [r3], #0x01
742	ldrble	ip, [r1], #0x01
743	suble	r2, r2, #0x01
744	strble	ip, [r3], #0x01
745	ldrblt	ip, [r1], #0x01
746	sublt	r2, r2, #0x01
747	strblt	ip, [r3], #0x01
748
749	/* Destination buffer is now word aligned */
750.Lmemcpy_wordaligned:
751	ands	ip, r1, #0x03		/* Is src also word-aligned? */
752	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */
753
754	/* Quad-align the destination buffer */
755	tst	r3, #0x07		/* Already quad aligned? */
756	ldrne	ip, [r1], #0x04
757	stmfd	sp!, {r4-r9}		/* Free up some registers */
758	subne	r2, r2, #0x04
759	strne	ip, [r3], #0x04
760
761	/* Destination buffer quad aligned, source is at least word aligned */
762	subs	r2, r2, #0x80
763	blt	.Lmemcpy_w_lessthan128
764
765	/* Copy 128 bytes at a time */
766.Lmemcpy_w_loop128:
767	ldr	r4, [r1], #0x04		/* LD:00-03 */
768	ldr	r5, [r1], #0x04		/* LD:04-07 */
769	pld	[r1, #0x18]		/* Prefetch 0x20 */
770	ldr	r6, [r1], #0x04		/* LD:08-0b */
771	ldr	r7, [r1], #0x04		/* LD:0c-0f */
772	ldr	r8, [r1], #0x04		/* LD:10-13 */
773	ldr	r9, [r1], #0x04		/* LD:14-17 */
774	strd	r4, [r3], #0x08		/* ST:00-07 */
775	ldr	r4, [r1], #0x04		/* LD:18-1b */
776	ldr	r5, [r1], #0x04		/* LD:1c-1f */
777	strd	r6, [r3], #0x08		/* ST:08-0f */
778	ldr	r6, [r1], #0x04		/* LD:20-23 */
779	ldr	r7, [r1], #0x04		/* LD:24-27 */
780	pld	[r1, #0x18]		/* Prefetch 0x40 */
781	strd	r8, [r3], #0x08		/* ST:10-17 */
782	ldr	r8, [r1], #0x04		/* LD:28-2b */
783	ldr	r9, [r1], #0x04		/* LD:2c-2f */
784	strd	r4, [r3], #0x08		/* ST:18-1f */
785	ldr	r4, [r1], #0x04		/* LD:30-33 */
786	ldr	r5, [r1], #0x04		/* LD:34-37 */
787	strd	r6, [r3], #0x08		/* ST:20-27 */
788	ldr	r6, [r1], #0x04		/* LD:38-3b */
789	ldr	r7, [r1], #0x04		/* LD:3c-3f */
790	strd	r8, [r3], #0x08		/* ST:28-2f */
791	ldr	r8, [r1], #0x04		/* LD:40-43 */
792	ldr	r9, [r1], #0x04		/* LD:44-47 */
793	pld	[r1, #0x18]		/* Prefetch 0x60 */
794	strd	r4, [r3], #0x08		/* ST:30-37 */
795	ldr	r4, [r1], #0x04		/* LD:48-4b */
796	ldr	r5, [r1], #0x04		/* LD:4c-4f */
797	strd	r6, [r3], #0x08		/* ST:38-3f */
798	ldr	r6, [r1], #0x04		/* LD:50-53 */
799	ldr	r7, [r1], #0x04		/* LD:54-57 */
800	strd	r8, [r3], #0x08		/* ST:40-47 */
801	ldr	r8, [r1], #0x04		/* LD:58-5b */
802	ldr	r9, [r1], #0x04		/* LD:5c-5f */
803	strd	r4, [r3], #0x08		/* ST:48-4f */
804	ldr	r4, [r1], #0x04		/* LD:60-63 */
805	ldr	r5, [r1], #0x04		/* LD:64-67 */
806	pld	[r1, #0x18]		/* Prefetch 0x80 */
807	strd	r6, [r3], #0x08		/* ST:50-57 */
808	ldr	r6, [r1], #0x04		/* LD:68-6b */
809	ldr	r7, [r1], #0x04		/* LD:6c-6f */
810	strd	r8, [r3], #0x08		/* ST:58-5f */
811	ldr	r8, [r1], #0x04		/* LD:70-73 */
812	ldr	r9, [r1], #0x04		/* LD:74-77 */
813	strd	r4, [r3], #0x08		/* ST:60-67 */
814	ldr	r4, [r1], #0x04		/* LD:78-7b */
815	ldr	r5, [r1], #0x04		/* LD:7c-7f */
816	strd	r6, [r3], #0x08		/* ST:68-6f */
817	strd	r8, [r3], #0x08		/* ST:70-77 */
818	subs	r2, r2, #0x80
819	strd	r4, [r3], #0x08		/* ST:78-7f */
820	bge	.Lmemcpy_w_loop128
821
822.Lmemcpy_w_lessthan128:
823	adds	r2, r2, #0x80		/* Adjust for extra sub */
824	ldmfdeq	sp!, {r4-r9}
825	RETeq			/* Return now if done */
826	subs	r2, r2, #0x20
827	blt	.Lmemcpy_w_lessthan32
828
829	/* Copy 32 bytes at a time */
830.Lmemcpy_w_loop32:
831	ldr	r4, [r1], #0x04
832	ldr	r5, [r1], #0x04
833	pld	[r1, #0x18]
834	ldr	r6, [r1], #0x04
835	ldr	r7, [r1], #0x04
836	ldr	r8, [r1], #0x04
837	ldr	r9, [r1], #0x04
838	strd	r4, [r3], #0x08
839	ldr	r4, [r1], #0x04
840	ldr	r5, [r1], #0x04
841	strd	r6, [r3], #0x08
842	strd	r8, [r3], #0x08
843	subs	r2, r2, #0x20
844	strd	r4, [r3], #0x08
845	bge	.Lmemcpy_w_loop32
846
847.Lmemcpy_w_lessthan32:
848	adds	r2, r2, #0x20		/* Adjust for extra sub */
849	ldmfdeq	sp!, {r4-r9}
850	RETeq			/* Return now if done */
851
852	and	r4, r2, #0x18
853	rsbs	r4, r4, #0x18
854	addne	pc, pc, r4, lsl #1
855	nop
856
857	/* At least 24 bytes remaining */
858	ldr	r4, [r1], #0x04
859	ldr	r5, [r1], #0x04
860	sub	r2, r2, #0x08
861	strd	r4, [r3], #0x08
862
863	/* At least 16 bytes remaining */
864	ldr	r4, [r1], #0x04
865	ldr	r5, [r1], #0x04
866	sub	r2, r2, #0x08
867	strd	r4, [r3], #0x08
868
869	/* At least 8 bytes remaining */
870	ldr	r4, [r1], #0x04
871	ldr	r5, [r1], #0x04
872	subs	r2, r2, #0x08
873	strd	r4, [r3], #0x08
874
875	/* Less than 8 bytes remaining */
876	ldmfd	sp!, {r4-r9}
877	RETeq			/* Return now if done */
878	subs	r2, r2, #0x04
879	ldrge	ip, [r1], #0x04
880	strge	ip, [r3], #0x04
881	RETeq			/* Return now if done */
882	addlt	r2, r2, #0x04
883	ldrb	ip, [r1], #0x01
884	cmp	r2, #0x02
885	ldrbge	r2, [r1], #0x01
886	strb	ip, [r3], #0x01
887	ldrbgt	ip, [r1]
888	strbge	r2, [r3], #0x01
889	strbgt	ip, [r3]
890	RET
891/* Place a literal pool here for the above ldr instructions to use */
892.ltorg
893
894
895/*
896 * At this point, it has not been possible to word align both buffers.
897 * The destination buffer is word aligned, but the source buffer is not.
898 */
899.Lmemcpy_bad_align:
900	stmfd	sp!, {r4-r7}
901	bic	r1, r1, #0x03
902	cmp	ip, #2
903	ldr	ip, [r1], #0x04
904	bgt	.Lmemcpy_bad3
905	beq	.Lmemcpy_bad2
906	b	.Lmemcpy_bad1
907
908.Lmemcpy_bad1_loop16:
909	mov	r4, ip, lsr #8
910	ldr	r5, [r1], #0x04
911	pld	[r1, #0x018]
912	ldr	r6, [r1], #0x04
913	ldr	r7, [r1], #0x04
914	ldr	ip, [r1], #0x04
915	orr	r4, r4, r5, lsl #24
916	mov	r5, r5, lsr #8
917	orr	r5, r5, r6, lsl #24
918	mov	r6, r6, lsr #8
919	orr	r6, r6, r7, lsl #24
920	mov	r7, r7, lsr #8
921	orr	r7, r7, ip, lsl #24
922	str	r4, [r3], #0x04
923	str	r5, [r3], #0x04
924	str	r6, [r3], #0x04
925	str	r7, [r3], #0x04
926.Lmemcpy_bad1:
927	subs	r2, r2, #0x10
928	bge	.Lmemcpy_bad1_loop16
929
930	adds	r2, r2, #0x10
931	ldmfdeq	sp!, {r4-r7}
932	RETeq			/* Return now if done */
933	subs	r2, r2, #0x04
934	sublt	r1, r1, #0x03
935	blt	.Lmemcpy_bad_done
936
937.Lmemcpy_bad1_loop4:
938	mov	r4, ip, lsr #8
939	ldr	ip, [r1], #0x04
940	subs	r2, r2, #0x04
941	orr	r4, r4, ip, lsl #24
942	str	r4, [r3], #0x04
943	bge	.Lmemcpy_bad1_loop4
944	sub	r1, r1, #0x03
945	b	.Lmemcpy_bad_done
946
947.Lmemcpy_bad2_loop16:
948	mov	r4, ip, lsr #16
949	ldr	r5, [r1], #0x04
950	pld	[r1, #0x018]
951	ldr	r6, [r1], #0x04
952	ldr	r7, [r1], #0x04
953	ldr	ip, [r1], #0x04
954	orr	r4, r4, r5, lsl #16
955	mov	r5, r5, lsr #16
956	orr	r5, r5, r6, lsl #16
957	mov	r6, r6, lsr #16
958	orr	r6, r6, r7, lsl #16
959	mov	r7, r7, lsr #16
960	orr	r7, r7, ip, lsl #16
961	str	r4, [r3], #0x04
962	str	r5, [r3], #0x04
963	str	r6, [r3], #0x04
964	str	r7, [r3], #0x04
965.Lmemcpy_bad2:
966	subs	r2, r2, #0x10
967	bge	.Lmemcpy_bad2_loop16
968
969	adds	r2, r2, #0x10
970	ldmfdeq	sp!, {r4-r7}
971	RETeq			/* Return now if done */
972	subs	r2, r2, #0x04
973	sublt	r1, r1, #0x02
974	blt	.Lmemcpy_bad_done
975
976.Lmemcpy_bad2_loop4:
977	mov	r4, ip, lsr #16
978	ldr	ip, [r1], #0x04
979	subs	r2, r2, #0x04
980	orr	r4, r4, ip, lsl #16
981	str	r4, [r3], #0x04
982	bge	.Lmemcpy_bad2_loop4
983	sub	r1, r1, #0x02
984	b	.Lmemcpy_bad_done
985
986.Lmemcpy_bad3_loop16:
987	mov	r4, ip, lsr #24
988	ldr	r5, [r1], #0x04
989	pld	[r1, #0x018]
990	ldr	r6, [r1], #0x04
991	ldr	r7, [r1], #0x04
992	ldr	ip, [r1], #0x04
993	orr	r4, r4, r5, lsl #8
994	mov	r5, r5, lsr #24
995	orr	r5, r5, r6, lsl #8
996	mov	r6, r6, lsr #24
997	orr	r6, r6, r7, lsl #8
998	mov	r7, r7, lsr #24
999	orr	r7, r7, ip, lsl #8
1000	str	r4, [r3], #0x04
1001	str	r5, [r3], #0x04
1002	str	r6, [r3], #0x04
1003	str	r7, [r3], #0x04
1004.Lmemcpy_bad3:
1005	subs	r2, r2, #0x10
1006	bge	.Lmemcpy_bad3_loop16
1007
1008	adds	r2, r2, #0x10
1009	ldmfdeq	sp!, {r4-r7}
1010	RETeq			/* Return now if done */
1011	subs	r2, r2, #0x04
1012	sublt	r1, r1, #0x01
1013	blt	.Lmemcpy_bad_done
1014
1015.Lmemcpy_bad3_loop4:
1016	mov	r4, ip, lsr #24
1017	ldr	ip, [r1], #0x04
1018	subs	r2, r2, #0x04
1019	orr	r4, r4, ip, lsl #8
1020	str	r4, [r3], #0x04
1021	bge	.Lmemcpy_bad3_loop4
1022	sub	r1, r1, #0x01
1023
1024.Lmemcpy_bad_done:
1025	ldmfd	sp!, {r4-r7}
1026	adds	r2, r2, #0x04
1027	RETeq
1028	ldrb	ip, [r1], #0x01
1029	cmp	r2, #0x02
1030	ldrbge	r2, [r1], #0x01
1031	strb	ip, [r3], #0x01
1032	ldrbgt	ip, [r1]
1033	strbge	r2, [r3], #0x01
1034	strbgt	ip, [r3]
1035	RET
1036
1037
1038/*
1039 * Handle short copies (less than 16 bytes), possibly misaligned.
1040 * Some of these are *very* common, thanks to the network stack,
1041 * and so are handled specially.
1042 */
1043.Lmemcpy_short:
1044	add	pc, pc, r2, lsl #2
1045	nop
1046	RET			/* 0x00 */
1047	b	.Lmemcpy_bytewise	/* 0x01 */
1048	b	.Lmemcpy_bytewise	/* 0x02 */
1049	b	.Lmemcpy_bytewise	/* 0x03 */
1050	b	.Lmemcpy_4		/* 0x04 */
1051	b	.Lmemcpy_bytewise	/* 0x05 */
1052	b	.Lmemcpy_6		/* 0x06 */
1053	b	.Lmemcpy_bytewise	/* 0x07 */
1054	b	.Lmemcpy_8		/* 0x08 */
1055	b	.Lmemcpy_bytewise	/* 0x09 */
1056	b	.Lmemcpy_bytewise	/* 0x0a */
1057	b	.Lmemcpy_bytewise	/* 0x0b */
1058	b	.Lmemcpy_c		/* 0x0c */
1059.Lmemcpy_bytewise:
1060	mov	r3, r0			/* We must not clobber r0 */
1061	ldrb	ip, [r1], #0x01
10621:	subs	r2, r2, #0x01
1063	strb	ip, [r3], #0x01
1064	ldrbne	ip, [r1], #0x01
1065	bne	1b
1066	RET
1067
1068/******************************************************************************
1069 * Special case for 4 byte copies
1070 */
#define	LMEMCPY_4_LOG2	6	/* 64 bytes */
#define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
	LMEMCPY_4_PAD
/*
 * 4-byte copy, dispatched on the low two bits of dst (r0) and src (r1):
 * r2 = ((dst & 3) << 2) | (src & 3), a 4-bit case selector.  Each case
 * below is padded to a 64-byte (1 << LMEMCPY_4_LOG2) slot, so case n
 * lives at .Lmemcpy_4 + n*64.  "sub r3, pc, #0x14" recovers the address
 * of .Lmemcpy_4 itself (pc reads as . + 8 in ARM state; the sub is 12
 * bytes past the label, and 12 + 8 = 0x14).  Case 0 (both 32-bit
 * aligned) falls straight through.
 */
.Lmemcpy_4:
	and	r2, r1, #0x03		/* r2 = src & 3 */
	orr	r2, r2, r0, lsl #2	/* fold in dst alignment bits */
	ands	r2, r2, #0x0f		/* r2 = 4-bit case selector; Z set for case 0 */
	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_4 */
	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2	/* jump to 64-byte case slot */

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	str	r2, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r3, [r1, #0x02]
	ldrh	r2, [r1]
	orr	r3, r2, r3, lsl #16
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
	mov	r3, r3, lsr #24		/* r3 = ...0 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0, #0x03]
	strh	r3, [r0, #0x01]
	RET
	LMEMCPY_4_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	mov	r3, r3, lsr #8		/* r3 = ...3 */
	strh	r2, [r0, #0x01]
	strb	r3, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	strh	r2, [r0]
	mov	r3, r2, lsr #16
	strh	r3, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
	strh	r2, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldrh	r3, [r1, #0x02]
	strh	r2, [r0]
	strh	r3, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
	strh	r1, [r0, #0x02]
	mov	r3, r3, lsl #8		/* r3 = 321. */
	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
	strh	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsr #8		/* r3 = ...3 */
	strb	r3, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD
1266
1267
1268/******************************************************************************
1269 * Special case for 6 byte copies
1270 */
#define	LMEMCPY_6_LOG2	6	/* 64 bytes */
#define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
	LMEMCPY_6_PAD
/*
 * 6-byte copy, dispatched exactly like .Lmemcpy_4:
 * r2 = ((dst & 3) << 2) | (src & 3) selects one of 16 cases, each
 * padded to a 64-byte slot at .Lmemcpy_6 + n*64.  "sub r3, pc, #0x14"
 * recovers the .Lmemcpy_6 base (pc reads as . + 8); case 0 falls
 * through.
 */
.Lmemcpy_6:
	and	r2, r1, #0x03		/* r2 = src & 3 */
	orr	r2, r2, r0, lsl #2	/* fold in dst alignment bits */
	ands	r2, r2, #0x0f		/* r2 = 4-bit case selector; Z set for case 0 */
	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_6 */
	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2	/* jump to 64-byte case slot */

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldrh	r3, [r1, #0x04]
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
	mov	r2, r2, lsr #8		/* r2 = .210 */
	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	mov	r1, r3, lsr #16		/* r1 = ..54 */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	strh	r1, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
	mov	r2, r2, lsr #24		/* r2 = ...0 */
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	mov	r1, r1, lsl #8		/* r1 = xx5. */
	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
	str	r2, [r0]
	strh	r1, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
	strb	r3, [r0]
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
	mov	r2, r2, lsr #8		/* r2 = ...5 */
	strh	r3, [r0, #0x03]
	strb	r2, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
	strb	r2, [r0]
	mov	r3, r1, lsr #24		/* r3 = ...5 */
	strb	r3, [r0, #0x05]
	mov	r3, r1, lsr #8		/* r3 = .543 */
	strh	r3, [r0, #0x03]
	mov	r3, r2, lsr #8		/* r3 = ...1 */
	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
	strh	r3, [r0, #0x01]
	RET
	LMEMCPY_6_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
	ldr	r3, [r1]		/* r3 = 3210 */
	mov	r2, r2, lsl #16		/* r2 = 54.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
	strh	r3, [r0]
	str	r2, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	mov	r2, r2, lsl #8		/* r2 = 543. */
	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
	strh	r1, [r0]
	str	r2, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	r3, [r1, #0x02]
	strh	r2, [r0]
	str	r3, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 5... */
	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
	strh	r3, [r0]
	str	r1, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
	mov	r1, r1, lsr #8		/* r1 = ...5 */
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
	mov	r1, r1, lsr #24		/* r1 = ...5 */
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	str	r3, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD
1493
1494
1495/******************************************************************************
1496 * Special case for 8 byte copies
1497 */
#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
	LMEMCPY_8_PAD
/*
 * 8-byte copy, dispatched exactly like .Lmemcpy_4:
 * r2 = ((dst & 3) << 2) | (src & 3) selects one of 16 cases, each
 * padded to a 64-byte slot at .Lmemcpy_8 + n*64.  "sub r3, pc, #0x14"
 * recovers the .Lmemcpy_8 base (pc reads as . + 8); case 0 falls
 * through.
 */
.Lmemcpy_8:
	and	r2, r1, #0x03		/* r2 = src & 3 */
	orr	r2, r2, r0, lsl #2	/* fold in dst alignment bits */
	ands	r2, r2, #0x0f		/* r2 = 4-bit case selector; Z set for case 0 */
	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_8 */
	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2	/* jump to 64-byte case slot */

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldr	r3, [r1, #0x04]
	str	r2, [r0]
	str	r3, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 7... */
	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
	str	r3, [r0]
	str	r2, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
	str	r2, [r0]
	str	r3, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r2, r2, lsr #24		/* r2 = ...4 */
	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
	str	r3, [r0]
	str	r2, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
	strb	r3, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r1, r3, lsr #8		/* r1 = .321 */
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
	strh	r1, [r0, #0x01]
	str	r3, [r0, #0x03]
	RET
	LMEMCPY_8_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldr	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x07]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	str	ip, [r0, #0x03]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
	strb	r2, [r0]		/* 0 */
	mov	ip, r1, lsr #8		/* ip = ...7 */
	strb	ip, [r0, #0x07]		/* 7 */
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r3, r3, lsr #8		/* r3 = .543 */
	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
	strh	ip, [r0, #0x01]
	str	r3, [r0, #0x03]
	RET
	LMEMCPY_8_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	strb	r3, [r0]
	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
	strh	ip, [r0, #0x01]
	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
	str	r2, [r0, #0x03]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
	strh	r2, [r0]
	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
	str	r2, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...2 */
	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
	str	r1, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	ip, [r1, #0x02]
	ldrh	r3, [r1, #0x06]
	strh	r2, [r0]
	str	ip, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	ip, [r1]		/* ip = ...0 */
	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
	strh	r1, [r0, #0x06]
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
	str	r3, [r0, #0x02]
	strh	r2, [r0]
	RET
	LMEMCPY_8_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
	strh	r1, [r0, #0x05]
	strb	r2, [r0]
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
	str	r2, [r0, #0x01]
	RET
	LMEMCPY_8_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	strb	r3, [r0]
	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
	strh	r3, [r0, #0x05]
	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
	strb	r2, [r0]
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r2, r1, lsr #8		/* r2 = ...7 */
	strb	r2, [r0, #0x07]
	mov	r1, r1, lsl #8		/* r1 = .76. */
	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
	str	ip, [r0, #0x01]
	strh	r1, [r0, #0x05]
	RET
	LMEMCPY_8_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldr	ip, [r1, #0x01]
	ldrh	r3, [r1, #0x05]
	ldrb	r1, [r1, #0x07]
	strb	r2, [r0]
	str	ip, [r0, #0x01]
	strh	r3, [r0, #0x05]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD
1747
1748/******************************************************************************
1749 * Special case for 12 byte copies
1750 */
#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
	LMEMCPY_C_PAD
/*
 * 12-byte copy, dispatched like .Lmemcpy_4 but with 128-byte
 * (1 << LMEMCPY_C_LOG2) case slots since the cases are longer:
 * r2 = ((dst & 3) << 2) | (src & 3) selects one of 16 cases at
 * .Lmemcpy_c + n*128.  "sub r3, pc, #0x14" recovers the .Lmemcpy_c
 * base (pc reads as . + 8); case 0 falls through.
 */
.Lmemcpy_c:
	and	r2, r1, #0x03		/* r2 = src & 3 */
	orr	r2, r2, r0, lsl #2	/* fold in dst alignment bits */
	ands	r2, r2, #0x0f		/* r2 = 4-bit case selector; Z set for case 0 */
	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_c */
	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2	/* jump to 128-byte case slot */

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldr	r3, [r1, #0x04]
	ldr	r1, [r1, #0x08]
	str	r2, [r0]
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1, #0xb]		/* r2 = ...B */
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
	mov	r2, r2, lsl #24		/* r2 = B... */
	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
	str	r2, [r0, #0x08]
	mov	r2, ip, lsl #24		/* r2 = 7... */
	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
	mov	r1, r1, lsr #8		/* r1 = .210 */
	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
	str	r2, [r0, #0x04]
	str	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
	mov	r1, r1, lsl #16		/* r1 = BA.. */
	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]		/* r2 = ...0 */
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	str	r2, [r0]
	mov	r3, r3, lsr #24		/* r3 = ...4 */
	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
	mov	r1, r1, lsl #8		/* r1 = BA9. */
	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
	strb	r2, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...3 */
	orr	r2, r1, r3, lsl #8	/* r2 = 6543 */
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
	mov	ip, ip, lsr #24		/* ip = ...B */
	str	r2, [r0, #0x03]
	str	r1, [r0, #0x07]
	strb	ip, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldr	ip, [r1, #0x03]
	strb	r2, [r0]
	ldr	r2, [r1, #0x07]
	ldrb	r1, [r1, #0x0b]
	strh	r3, [r0, #0x01]
	str	ip, [r0, #0x03]
	str	r2, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	strh	r2, [r0, #0x01]
	mov	r2, r3, lsr #8		/* r2 = .543 */
	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
	mov	r2, ip, lsr #8		/* r2 = .987 */
	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
	str	r3, [r0, #0x03]
	str	r2, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	mov	r3, r3, lsr #16		/* r3 = ..43 */
	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
	mov	ip, ip, lsr #16		/* ip = ..87 */
	orr	ip, ip, r1, lsl #16	/* ip = A987 */
	mov	r1, r1, lsr #16		/* r1 = ..xB */
	str	r3, [r0, #0x03]
	str	ip, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
	strh	ip, [r0]
	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
	mov	r2, r2, lsr #16		/* r2 = ..BA */
	str	r1, [r0, #0x02]
	str	r3, [r0, #0x06]
	strh	r2, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
	strh	ip, [r0]
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
	mov	r1, r1, lsl #8		/* r1 = ..B. */
	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
	str	r2, [r0, #0x02]
	str	r3, [r0, #0x06]
	strh	r1, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	r3, [r1, #0x02]
	ldr	ip, [r1, #0x06]
	ldrh	r1, [r1, #0x0a]
	strh	r2, [r0]
	str	r3, [r0, #0x02]
	str	ip, [r0, #0x06]
	strh	r1, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
 */
	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
	strh	ip, [r0, #0x0a]
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrb	r1, [r1]		/* r1 = ...0 */
	mov	r2, r2, lsl #24		/* r2 = 9... */
	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
	str	r2, [r0, #0x06]
	str	r3, [r0, #0x02]
	strh	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
	strb	r2, [r0]
	mov	r3, r2, lsr #8		/* r3 = .321 */
	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
	str	r3, [r0, #0x01]
	mov	r3, ip, lsr #8		/* r3 = .765 */
	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
	str	r3, [r0, #0x05]
	mov	r1, r1, lsr #8		/* r1 = .BA9 */
	strh	r1, [r0, #0x09]
	mov	r1, r1, lsr #16		/* r1 = ...B */
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
	strb	r2, [r0, #0x0b]
	mov	r2, r3, lsr #16		/* r2 = ..A9 */
	strh	r2, [r0, #0x09]
	mov	r3, r3, lsl #16		/* r3 = 87.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
	mov	ip, ip, lsl #16		/* ip = 43.. */
	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
	mov	r1, r1, lsr #8		/* r1 = .210 */
	str	r3, [r0, #0x05]
	str	ip, [r0, #0x01]
	strb	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
	mov	ip, ip, lsr #24		/* ip = ...9 */
	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
	str	r2, [r0, #0x01]
	str	r3, [r0, #0x05]
	strh	ip, [r0, #0x09]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldr	ip, [r1, #0x05]
	strb	r2, [r0]
	ldrh	r2, [r1, #0x09]
	ldrb	r1, [r1, #0x0b]
	str	r3, [r0, #0x01]
	str	ip, [r0, #0x05]
	strh	r2, [r0, #0x09]
	strb	r1, [r0, #0x0b]
	RET
END(memcpy)
2057