/* xref: linux/arch/microblaze/lib/fastcopy.S (revision 02091cbe9cc4f18167208eec1d6de636cc731817) */
/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input :	Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer specified in bytes
 */

#include <linux/linkage.h>

/*
 * The unaligned paths below rebuild a destination word as
 * (earlier word << k) | (later word >> (32 - k)).  That is only the right
 * byte order when the lowest-addressed byte of a word is its most
 * significant byte, i.e. on a big-endian core.  Refuse to build for a
 * little-endian target instead of silently corrupting copies.
 */
#ifdef __MICROBLAZEEL__
#error fastcopy.S assumes big-endian byte order and must not be built for little-endian MicroBlaze
#endif

/*
 * memcpy - copy c bytes from s to d, walking upwards through memory.
 *
 *	In:	r5 = d (destination), r6 = s (source), r7 = c (byte count)
 *	Out:	r3 = destination address as passed in
 *	Uses:	r4 (n), r8 (as), r9 (t1), r10 (t2 / offset), r11 (t3 / h),
 *		r12 (t4 / v); r5, r6 and r7 are advanced/consumed.
 *
 * Stages:
 *	1. byte copies until d is word aligned (skipped when c < 4)
 *	2. 32-byte blocks, word loads/stores; if s is not word aligned the
 *	   words are read from the aligned address below s (as) and merged
 *	   with shifts
 *	3. remaining whole words, same aligned/unaligned split, indexed by
 *	   a running offset
 *	4. trailing bytes one at a time
 *
 * fast_memcpy_ascending is also the entry memmove uses when d <= s.
 * Branch mnemonics ending in 'd' execute the following instruction in the
 * delay slot; those instructions are marked (IN DELAY SLOT).
 */
	.text
	.globl	memcpy
	.type  memcpy, @function
	.ent	memcpy

memcpy:
fast_memcpy_ascending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, a_dalign_done
	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
	rsubi	r4, r4, 4
	rsub	r7, r4, r7		/* c = c - n adjust c */

a_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, a_dalign_done
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	addi	r6, r6, 1		/* s++ */
	addi	r5, r5, 1		/* d++ */
	brid	a_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

a_dalign_done:
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, a_block_done

a_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_block_unaligned

	/* source and destination both word aligned: 8 words per pass */
a_block_aligned:
	lwi	r9, r6, 0		/* t1 = *(s + 0) */
	lwi	r10, r6, 4		/* t2 = *(s + 4) */
	lwi	r11, r6, 8		/* t3 = *(s + 8) */
	lwi	r12, r6, 12		/* t4 = *(s + 12) */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	swi	r10, r5, 4		/* *(d + 4) = t2 */
	swi	r11, r5, 8		/* *(d + 8) = t3 */
	swi	r12, r5, 12		/* *(d + 12) = t4 */
	lwi	r9, r6, 16		/* t1 = *(s + 16) */
	lwi	r10, r6, 20		/* t2 = *(s + 20) */
	lwi	r11, r6, 24		/* t3 = *(s + 24) */
	lwi	r12, r6, 28		/* t4 = *(s + 28) */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	swi	r10, r5, 20		/* *(d + 20) = t2 */
	swi	r11, r5, 24		/* *(d + 24) = t3 */
	swi	r12, r5, 28		/* *(d + 28) = t4 */
	addi	r6, r6, 32		/* s = s + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_block_aligned	/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

	/*
	 * Source not word aligned.  s is advanced past the whole block run
	 * up front; the loops below walk the aligned pointer "as" instead
	 * and carry the not-yet-stored bytes of the previous word in h.
	 */
a_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	add	r6, r6, r4		/* s = s + n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */

	/* 3 byte offset: one byte from h, three bytes from the next word */
a_block_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_bu3_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu3_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

	/* 1 byte offset: three bytes from h, one byte from the next word */
a_block_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_bu1_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu1_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

	/* 2 byte offset: two bytes from h, two bytes from the next word */
a_block_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_bu2_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu2_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	/* falls through into a_block_done */

a_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

	/*
	 * Whole words left over after the block stage.  d and s stay put;
	 * r10 is a byte offset applied to both and folded in at a_word_done.
	 */
a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0		/* offset = 0 */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_word_unaligned

a_word_aligned:
	lw	r9, r6, r10		/* t1 = *(s + offset) */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	addi	r4, r4, -4		/* n = n - 4 */
	bneid	r4, a_word_aligned	/* while (n) loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0		/* h = *(as + 0) */
	addi	r8, r8, 4		/* as = as + 4 */

	addi	r9, r9, -1
	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */

a_word_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_wu3_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu3_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_wu1_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu1_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_wu2_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu2_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
	/* falls through into a_word_done */

a_word_done:
	add	r5, r5, r10	/* d = d + offset */
	add	r6, r6, r10	/* s = s + offset */
	rsub	r7, r10, r7	/* c = c - offset */

	/* byte-at-a-time tail; also the whole copy when c < 4 on entry */
a_xfer_end:
a_xfer_end_loop:
	beqi	r7, a_done		/* while (c) */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r6, r6, 1		/* s++ */
	sbi	r9, r5, 0		/* *d = t1 */
	addi	r7, r7, -1		/* c-- */
	brid	a_xfer_end_loop		/* loop */
	addi	r5, r5, 1		/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8
	nop

.size  memcpy, . - memcpy
.end memcpy
/*----------------------------------------------------------------------------*/
/*
 * memmove - copy c bytes from s to d; the regions may overlap.
 *
 *	In:	r5 = d (destination), r6 = s (source), r7 = c (byte count)
 *	Out:	r3 = destination address as passed in
 *	Uses:	r4 (n), r8 (as), r9 (t1), r10 (t2), r11 (t3 / h),
 *		r12 (t4 / v); r5, r6 and r7 are consumed.
 *
 * When d is not above s (unsigned compare) the work is handed to
 * fast_memcpy_ascending.  Otherwise the copy runs from the end of the
 * buffers downwards, so that source bytes overlapped by the destination
 * are read before they are overwritten:
 *	1. d and s are moved to one past the last byte
 *	2. byte copies until d is word aligned (skipped when c < 4)
 *	3. 32-byte blocks, highest word first; if s is not word aligned the
 *	   words are read from the aligned address at or below s (as) and
 *	   merged with shifts (big-endian byte order is assumed)
 *	4. remaining whole words, indexed by the shrinking count n
 *	5. leading bytes one at a time
 *
 * Branch mnemonics ending in 'd' execute the following instruction in the
 * delay slot; those instructions are marked (IN DELAY SLOT).
 */
	.globl	memmove
	.type  memmove, @function
	.ent	memmove

memmove:
	cmpu	r4, r5, r6	/* n = s - d  (unsigned compare) */
	bgei	r4, fast_memcpy_ascending

fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	add	r5, r5, r7	/* d = d + c */
	add	r6, r6, r7	/* s = s + c */

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, d_dalign_done
	rsub	r7, r4, r7		/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, d_dalign_done
	addi	r6, r6, -1		/* s-- */
	addi	r5, r5, -1		/* d-- */
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

d_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

	/* source and destination both word aligned: 8 words per pass */
d_block_aligned:
	addi	r6, r6, -32		/* s = s - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r9, r6, 28		/* t1 = *(s + 28) */
	lwi	r10, r6, 24		/* t2 = *(s + 24) */
	lwi	r11, r6, 20		/* t3 = *(s + 20) */
	lwi	r12, r6, 16		/* t4 = *(s + 16) */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	swi	r10, r5, 24		/* *(d + 24) = t2 */
	swi	r11, r5, 20		/* *(d + 20) = t3 */
	swi	r12, r5, 16		/* *(d + 16) = t4 */
	lwi	r9, r6, 12		/* t1 = *(s + 12) */
	lwi	r10, r6, 8		/* t2 = *(s + 8) */
	lwi	r11, r6, 4		/* t3 = *(s + 4) */
	lwi	r12, r6, 0		/* t4 = *(s + 0) */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	swi	r10, r5, 8		/* *(d + 8) = t2 */
	swi	r11, r5, 4		/* *(d + 4) = t3 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0		/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

	/*
	 * Source not word aligned.  s is moved below the whole block run up
	 * front; the loops below walk the aligned pointer "as" instead and
	 * carry the not-yet-stored bytes of the previous word in h.
	 */
d_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6		/* s = s - n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, d_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_block_u2		/* t1 was 2 => 2 byte offset */

	/* 3 byte offset: three bytes from h, one byte from the word below */
d_block_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

	/* 1 byte offset: one byte from h, three bytes from the word below */
d_block_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

	/* 2 byte offset: two bytes from h, two bytes from the word below */
d_block_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */
	/* falls through into d_block_done */

d_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

	/*
	 * Whole words left over after the block stage.  d and s are moved
	 * down by the full run first; n then doubles as the byte offset of
	 * the word being copied and counts down to zero.
	 */
d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5		/* d = d - n */
	rsub	r6, r4, r6		/* s = s - n */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4, -4		/* n = n - 4 */
	lw	r9, r6, r4		/* t1 = *(s + n) */
	bneid	r4, d_word_aligned	/* while (n) loop */
	sw	r9, r5, r4		/* *(d + n) = t1 (IN DELAY SLOT) */

	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4		/* h = *(as + n) */

	addi	r9, r9, -1
	beqi	r9, d_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_word_u2		/* t1 was 2 => 2 byte offset */

d_word_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */
	/* falls through into d_word_done */

d_word_done:

	/* byte-at-a-time tail; also the whole copy when c < 4 on entry */
d_xfer_end:
d_xfer_end_loop:
	/* leave through this function's own return rather than memcpy's */
	beqi	r7, d_done		/* while (c) */
	addi	r6, r6, -1		/* s-- */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r5, r5, -1		/* d-- */
	sbi	r9, r5, 0		/* *d = t1 */
	brid	d_xfer_end_loop		/* loop */
	addi	r7, r7, -1		/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8
	nop

.size  memmove, . - memmove
.end memmove