xref: /linux/arch/microblaze/lib/fastcopy.S (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input :	Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 */
31
32#ifdef __MICROBLAZEEL__
33#error Microblaze LE not support ASM optimized lib func. Disable OPT_LIB_ASM.
34#endif
35
36#include <linux/linkage.h>
/*
 * memcpy - copy c bytes from s to d, walking addresses upwards.
 *
 * In:   r5 = d (destination), r6 = s (source), r7 = c (byte count)
 * Out:  r3 = destination address as passed in
 * Uses: r4  (n, per-phase byte counter)
 *       r8  (as, source pointer rounded down to a word boundary)
 *       r9-r12 (data temporaries; r10 is the running offset in the
 *               word phase, r11 = h and r12 = v in the unaligned loops)
 *       r5, r6 and r7 are modified.  Returns with rtsd r15, 8.
 *
 * Phases:
 *   1. c < 4: go straight to the trailing byte loop.
 *   2. Copy 0-3 single bytes until d is word aligned.
 *   3. Copy as many 32-byte blocks as fit (loop unrolled 8 words).
 *   4. Copy the remaining whole words.
 *   5. Copy the trailing 0-3 bytes.
 *
 * In phases 3 and 4 the source may still be misaligned by 1, 2 or 3
 * bytes.  Each destination word is then assembled from two neighbouring
 * aligned source words: h carries the bytes left over from the previous
 * word (already shifted to the top), v is the next word.  The shift
 * directions used here are for big-endian byte order.
 *
 * fast_memcpy_ascending is a second label on the entry point; memmove
 * branches to it when a forward copy is safe.
 */
	.text
	.globl	memcpy
	.type  memcpy, @function
	.ent	memcpy

memcpy:
fast_memcpy_ascending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, a_dalign_done
	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
	rsubi	r4, r4, 4
	rsub	r7, r4, r7		/* c = c - n adjust c */

a_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, a_dalign_done
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	addi	r6, r6, 1		/* s++ */
	addi	r5, r5, 1		/* d++ */
	brid	a_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

	/* d is word aligned from here on */
a_dalign_done:
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, a_block_done

a_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_block_unaligned

	/* source and destination both aligned: 8 plain word copies per pass */
a_block_aligned:
	lwi	r9, r6, 0		/* t1 = *(s + 0) */
	lwi	r10, r6, 4		/* t2 = *(s + 4) */
	lwi	r11, r6, 8		/* t3 = *(s + 8) */
	lwi	r12, r6, 12		/* t4 = *(s + 12) */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	swi	r10, r5, 4		/* *(d + 4) = t2 */
	swi	r11, r5, 8		/* *(d + 8) = t3 */
	swi	r12, r5, 12		/* *(d + 12) = t4 */
	lwi	r9, r6, 16		/* t1 = *(s + 16) */
	lwi	r10, r6, 20		/* t2 = *(s + 20) */
	lwi	r11, r6, 24		/* t3 = *(s + 24) */
	lwi	r12, r6, 28		/* t4 = *(s + 28) */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	swi	r10, r5, 20		/* *(d + 20) = t2 */
	swi	r11, r5, 24		/* *(d + 24) = t3 */
	swi	r12, r5, 28		/* *(d + 28) = t4 */
	addi	r6, r6, 32		/* s = s + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_block_aligned	/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

	/*
	 * Source misaligned.  The loops below read through as (r8) only, so
	 * s is advanced past the whole block region once, up front.
	 */
a_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	add	r6, r6, r4		/* s = s + n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	/* dispatch on the source offset: 1, 2, or (by fall-through) 3 */
	addi	r9, r9, -1
	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */

	/* 3 byte offset: one byte carried in h, three taken from v */
a_block_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_bu3_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu3_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

	/* 1 byte offset: three bytes carried in h, one taken from v */
a_block_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_bu1_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu1_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

	/* 2 byte offset: two bytes carried in h, two taken from v */
a_block_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_bu2_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu2_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	/* last variant: falls through to a_block_done */

a_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

	/*
	 * Word phase: d and s stay fixed while offset (r10) counts up;
	 * the pointers and c are fixed up in a_word_done.
	 */
a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0		/* offset = 0 */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_word_unaligned

a_word_aligned:
	lw	r9, r6, r10		/* t1 = *(s+offset) */
	sw	r9, r5, r10		/* *(d+offset) = t1 */
	addi	r4, r4,-4		/* n = n - 4 */
	bneid	r4, a_word_aligned	/* loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0		/* h = *(as + 0) */
	addi	r8, r8, 4		/* as = as + 4 */

	/* dispatch on the source offset: 1, 2, or (by fall-through) 3 */
	addi	r9, r9, -1
	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */

a_word_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_wu3_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu3_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_wu1_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu1_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_wu2_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu2_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
	/* last variant: falls through to a_word_done */

	/* fold the bytes copied in the word phase back into d, s and c */
a_word_done:
	add	r5, r5, r10	/* d = d + offset */
	add	r6, r6, r10	/* s = s + offset */
	rsub	r7, r10, r7	/* c = c - offset */

	/* trailing bytes, one at a time */
a_xfer_end:
a_xfer_end_loop:
	beqi	r7, a_done		/* while (c) */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r6, r6, 1		/* s++ */
	sbi	r9, r5, 0		/* *d = t1 */
	addi	r7, r7, -1		/* c-- */
	brid	a_xfer_end_loop		/* loop */
	addi	r5, r5, 1		/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8
	nop

.size  memcpy, . - memcpy
.end memcpy
355/*----------------------------------------------------------------------------*/
/*
 * memmove - copy c bytes from s to d, choosing a direction that is safe
 * when the two regions overlap.
 *
 * In:   r5 = d (destination), r6 = s (source), r7 = c (byte count)
 * Out:  r3 = destination address as passed in
 * Uses: r4  (n, per-phase byte counter / downward index)
 *       r8  (as, source pointer rounded down to a word boundary)
 *       r9-r12 (data temporaries; r11 = h and r12 = v in the
 *               unaligned loops)
 *       r5, r6 and r7 are modified.  Returns with rtsd r15, 8.
 *
 * If the unsigned compare shows s >= d, control transfers to
 * fast_memcpy_ascending (the memcpy entry point) and never comes back
 * here.  Otherwise d and s are first moved to one past the end of their
 * regions and the copy runs from high addresses to low, using the same
 * phases as the ascending copy:
 *   1. c < 4: go straight to the trailing byte loop.
 *   2. Copy 0-3 single bytes until d is word aligned.
 *   3. Copy as many 32-byte blocks as fit (loop unrolled 8 words).
 *   4. Copy the remaining whole words.
 *   5. Copy the trailing 0-3 bytes.
 *
 * When the source is misaligned in phases 3 and 4, each destination word
 * is assembled from two neighbouring aligned source words: h carries the
 * bytes left over from the previous (higher) word, already shifted to the
 * bottom, and v is the next (lower) word.  The shift directions used here
 * are for big-endian byte order.
 */
	.globl	memmove
	.type  memmove, @function
	.ent	memmove

memmove:
	cmpu	r4, r5, r6	/* n = s - d */
	bgei	r4, fast_memcpy_ascending	/* s >= d: forward copy is safe */

fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	/* point both pointers one past the end; all accesses pre-decrement */
	add	r5, r5, r7	/* d = d + c */
	add	r6, r6, r7	/* s = s + c */

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, d_dalign_done
	rsub	r7, r4, r7		/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, d_dalign_done
	addi	r6, r6, -1		/* s-- */
	addi	r5, r5, -1		/* d-- */
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

	/* d is word aligned from here on */
d_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

	/* source and destination both aligned: 8 plain word copies per pass */
d_block_aligned:
	addi	r6, r6, -32		/* s = s - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r9, r6, 28		/* t1 = *(s + 28) */
	lwi	r10, r6, 24		/* t2 = *(s + 24) */
	lwi	r11, r6, 20		/* t3 = *(s + 20) */
	lwi	r12, r6, 16		/* t4 = *(s + 16) */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	swi	r10, r5, 24		/* *(d + 24) = t2 */
	swi	r11, r5, 20		/* *(d + 20) = t3 */
	swi	r12, r5, 16		/* *(d + 16) = t4 */
	lwi	r9, r6, 12		/* t1 = *(s + 12) */
	lwi	r10, r6, 8		/* t2 = *(s + 8) */
	lwi	r11, r6, 4		/* t3 = *(s + 4) */
	lwi	r12, r6, 0		/* t4 = *(s + 0) */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	swi	r10, r5, 8		/* *(d + 8) = t2 */
	swi	r11, r5, 4		/* *(d + 4) = t3 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0		/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

	/*
	 * Source misaligned.  The loops below read through as (r8) only, so
	 * s is moved below the whole block region once, up front.
	 */
d_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6		/* s = s - n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	/* dispatch on the source offset: 1, 2, or (by fall-through) 3 */
	addi	r9, r9, -1
	beqi	r9, d_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_block_u2		/* t1 was 2 => 2 byte offset */

	/* 3 byte offset: three bytes carried in h, one taken from v */
d_block_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

	/* 1 byte offset: one byte carried in h, three taken from v */
d_block_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

	/* 2 byte offset: two bytes carried in h, two taken from v */
d_block_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */
	/* last variant: falls through to d_block_done */

d_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

	/*
	 * Word phase: d, s and c are moved to their final values first;
	 * n (r4) then serves as a downward index from those bases.
	 */
d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5		/* d = d - n */
	rsub	r6, r4, r6		/* s = s - n */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4,-4		/* n = n - 4 */
	lw	r9, r6, r4		/* t1 = *(s+n) */
	bneid	r4, d_word_aligned	/* loop */
	sw	r9, r5, r4		/* *(d+n) = t1 (IN DELAY SLOT) */

	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4		/* h = *(as + n) */

	/* dispatch on the source offset: 1, 2, or (by fall-through) 3 */
	addi	r9, r9, -1
	beqi	r9, d_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_word_u2		/* t1 was 2 => 2 byte offset */

d_word_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */
	/* last variant: falls through to d_word_done */

d_word_done:

	/* trailing bytes, one at a time */
d_xfer_end:
d_xfer_end_loop:
	/* exit through this function's own return instead of memcpy's a_done */
	beqi	r7, d_done		/* while (c) */
	addi	r6, r6, -1		/* s-- */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r5, r5, -1		/* d-- */
	sbi	r9, r5, 0		/* *d = t1 */
	brid	d_xfer_end_loop		/* loop */
	addi	r7, r7, -1		/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8
	nop

.size  memmove, . - memmove
.end memmove
671