xref: /linux/arch/sparc/lib/memcpy.S (revision d3867f0483103b8ff7edfdea3ef1981c03d96891)
/* memcpy.S: Sparc optimized memcpy and memmove code
 * Hand optimized from GNU libc's memcpy and memmove
 * Copyright (C) 1991,1996 Free Software Foundation
 * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi)
 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
 * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
 */

#include <asm/export.h>
#define FUNC(x) 		\
	.globl	x;		\
	.type	x,@function;	\
	.align	4;		\
x:

/* Both these macros have to start with exactly the same insn */
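/* The shared first insn matters because memcpy below branches with
 * "be 82f + 4": the ldd that opens MOVE_BIGCHUNK sits in the branch
 * delay slot and is executed either way, and the "+ 4" target skips
 * the identical opening ldd of MOVE_BIGALIGNCHUNK.
 */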
#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	ldd	[%src + (offset) + 0x10], %t4; \
	ldd	[%src + (offset) + 0x18], %t6; \
	st	%t0, [%dst + (offset) + 0x00]; \
	st	%t1, [%dst + (offset) + 0x04]; \
	st	%t2, [%dst + (offset) + 0x08]; \
	st	%t3, [%dst + (offset) + 0x0c]; \
	st	%t4, [%dst + (offset) + 0x10]; \
	st	%t5, [%dst + (offset) + 0x14]; \
	st	%t6, [%dst + (offset) + 0x18]; \
	st	%t7, [%dst + (offset) + 0x1c];

#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	ldd	[%src + (offset) + 0x10], %t4; \
	ldd	[%src + (offset) + 0x18], %t6; \
	std	%t0, [%dst + (offset) + 0x00]; \
	std	%t2, [%dst + (offset) + 0x08]; \
	std	%t4, [%dst + (offset) + 0x10]; \
	std	%t6, [%dst + (offset) + 0x18];

#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src - (offset) - 0x10], %t0; \
	ldd	[%src - (offset) - 0x08], %t2; \
	st	%t0, [%dst - (offset) - 0x10]; \
	st	%t1, [%dst - (offset) - 0x0c]; \
	st	%t2, [%dst - (offset) - 0x08]; \
	st	%t3, [%dst - (offset) - 0x04];

#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src - (offset) - 0x10], %t0; \
	ldd	[%src - (offset) - 0x08], %t2; \
	std	%t0, [%dst - (offset) - 0x10]; \
	std	%t2, [%dst - (offset) - 0x08];

#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
	ldub	[%src - (offset) - 0x02], %t0; \
	ldub	[%src - (offset) - 0x01], %t1; \
	stb	%t0, [%dst - (offset) - 0x02]; \
	stb	%t1, [%dst - (offset) - 0x01];

/* Both these macros have to start with exactly the same insn */
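/* The RMOVE_* variants below walk the buffers from high addresses
 * down.  They are not referenced anywhere in this file; they look
 * like support for a block-wise backward memmove, while the current
 * overlapping-copy path just uses a byte-at-a-time reverse loop.
 */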
#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src - (offset) - 0x20], %t0; \
	ldd	[%src - (offset) - 0x18], %t2; \
	ldd	[%src - (offset) - 0x10], %t4; \
	ldd	[%src - (offset) - 0x08], %t6; \
	st	%t0, [%dst - (offset) - 0x20]; \
	st	%t1, [%dst - (offset) - 0x1c]; \
	st	%t2, [%dst - (offset) - 0x18]; \
	st	%t3, [%dst - (offset) - 0x14]; \
	st	%t4, [%dst - (offset) - 0x10]; \
	st	%t5, [%dst - (offset) - 0x0c]; \
	st	%t6, [%dst - (offset) - 0x08]; \
	st	%t7, [%dst - (offset) - 0x04];

#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src - (offset) - 0x20], %t0; \
	ldd	[%src - (offset) - 0x18], %t2; \
	ldd	[%src - (offset) - 0x10], %t4; \
	ldd	[%src - (offset) - 0x08], %t6; \
	std	%t0, [%dst - (offset) - 0x20]; \
	std	%t2, [%dst - (offset) - 0x18]; \
	std	%t4, [%dst - (offset) - 0x10]; \
	std	%t6, [%dst - (offset) - 0x08];

#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	st	%t0, [%dst + (offset) + 0x00]; \
	st	%t1, [%dst + (offset) + 0x04]; \
	st	%t2, [%dst + (offset) + 0x08]; \
	st	%t3, [%dst + (offset) + 0x0c];

#define RMOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
	ldub	[%src + (offset) + 0x00], %t0; \
	ldub	[%src + (offset) + 0x01], %t1; \
	stb	%t0, [%dst + (offset) + 0x00]; \
	stb	%t1, [%dst + (offset) + 0x01];

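/* SMOVE_CHUNK/SMOVE_ALIGNCHUNK shift-and-merge 16 source bytes with
 * the shil/shir amounts so that word-aligned loads can feed stores at
 * a different alignment.  Like the RMOVE_* macros they are not
 * referenced by the code in this file.
 */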
#define SMOVE_CHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	srl	%t0, shir, %t5; \
	srl	%t1, shir, %t6; \
	sll	%t0, shil, %t0; \
	or	%t5, %prev, %t5; \
	sll	%t1, shil, %prev; \
	or	%t6, %t0, %t0; \
	srl	%t2, shir, %t1; \
	srl	%t3, shir, %t6; \
	sll	%t2, shil, %t2; \
	or	%t1, %prev, %t1; \
	std	%t4, [%dst + (offset) + (offset2) - 0x04]; \
	std	%t0, [%dst + (offset) + (offset2) + 0x04]; \
	sll	%t3, shil, %prev; \
	or	%t6, %t2, %t4;

#define SMOVE_ALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	srl	%t0, shir, %t4;	\
	srl	%t1, shir, %t5;	\
	sll	%t0, shil, %t6;	\
	or	%t4, %prev, %t0; \
	sll	%t1, shil, %prev; \
	or	%t5, %t6, %t1; \
	srl	%t2, shir, %t4;	\
	srl	%t3, shir, %t5;	\
	sll	%t2, shil, %t6; \
	or	%t4, %prev, %t2; \
	sll	%t3, shil, %prev; \
	or	%t5, %t6, %t3; \
	std	%t0, [%dst + (offset) + (offset2) + 0x00]; \
	std	%t2, [%dst + (offset) + (offset2) + 0x08];

	.text
	.align	4

0:
	retl
	 nop		! Only bcopy returns here and it returns void...

#ifdef __KERNEL__
FUNC(amemmove)
FUNC(__memmove)
EXPORT_SYMBOL(__memmove)
#endif
FUNC(memmove)
EXPORT_SYMBOL(memmove)
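/* memmove(%o0=dst, %o1=src, %o2=len).  %g7 preserves dst so every
 * exit can return it.  If dst <= src, or if the areas do not overlap
 * (src + len <= dst), copying forwards is safe and we join the memcpy
 * path; otherwise fall through to the byte-at-a-time reverse copy.
 */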
	cmp		%o0, %o1
	mov		%o0, %g7
	bleu		9f
	 sub		%o0, %o1, %o4

	add		%o1, %o2, %o3
	cmp		%o3, %o0
	bleu		0f
	 andcc		%o4, 3, %o5

	add		%o1, %o2, %o1
	add		%o0, %o2, %o0
	sub		%o1, 1, %o1
	sub		%o0, 1, %o0

1:	/* reverse_bytes */

	ldub		[%o1], %o4
	subcc		%o2, 1, %o2
	stb		%o4, [%o0]
	sub		%o1, 1, %o1
	bne		1b
	 sub		%o0, 1, %o0

	retl
	 mov		%g7, %o0

/* NOTE: This code is executed only when %src (=%o1) & 3 != 0.
	 We need to align it to 4.  So, depending on (%src & 3):
	 1: ldub, then lduh
	 2: lduh
	 3: just ldub
	 so even if it looks weird, the branches are correct here. -jj
 */
78:	/* dword_align */

	andcc		%o1, 1, %g0
	be		4f
	 andcc		%o1, 2, %g0

	ldub		[%o1], %g2
	add		%o1, 1, %o1
	stb		%g2, [%o0]
	sub		%o2, 1, %o2
	bne		3f
	 add		%o0, 1, %o0
4:
	lduh		[%o1], %g2
	add		%o1, 2, %o1
	sth		%g2, [%o0]
	sub		%o2, 2, %o2
	b		3f
	 add		%o0, 2, %o0

FUNC(memcpy)	/* %o0=dst %o1=src %o2=len */
EXPORT_SYMBOL(memcpy)
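/* %o4 = dst - src decides the strategy: if the pointers differ in
 * their low two bits their alignments can never match, so take the
 * shift-and-merge path at 86f.  Copies of at most 15 bytes go to 90f.
 * Otherwise 78b first brings %o1 up to a 4-byte boundary and we fall
 * into the word/doubleword loops below.  %g7 holds dst for the
 * return value.
 */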

	sub		%o0, %o1, %o4
	mov		%o0, %g7
9:
	andcc		%o4, 3, %o5
0:
	bne		86f
	 cmp		%o2, 15

	bleu		90f
	 andcc		%o1, 3, %g0

	bne		78b
3:
	 andcc		%o1, 4, %g0

	be		2f
	 mov		%o2, %g1

	ld		[%o1], %o4
	sub		%g1, 4, %g1
	st		%o4, [%o0]
	add		%o1, 4, %o1
	add		%o0, 4, %o0
2:
	andcc		%g1, 0xffffff80, %g0
	be		3f
	 andcc		%o0, 4, %g0

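/* At least 128 bytes remain.  If dst is doubleword aligned, take the
 * std-based loop at 82f; the "+ 4" skips its first ldd because the
 * identical ldd at 5: has already been issued in the branch delay
 * slot (hence the requirement that MOVE_BIGCHUNK and
 * MOVE_BIGALIGNCHUNK start with the same insn).  Both loops move
 * 128 bytes per iteration.
 */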
	be		82f + 4
5:
	MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	sub		%g1, 128, %g1
	add		%o1, 128, %o1
	cmp		%g1, 128
	bge		5b
	 add		%o0, 128, %o0
3:
	andcc		%g1, 0x70, %g4
	be		80f
	 andcc		%g1, 8, %g0

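/* Copy the remaining 16-byte blocks (%g1 & 0x70) by jumping backwards
 * into the MOVE_LASTCHUNK table: each entry is 6 insns (24 bytes) and
 * copies 16 bytes, so the offset back from 80f is %g4 + %g4/2.  The
 * pointers are advanced first since the entries use negative offsets.
 */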
	sethi		%hi(80f), %o5
	srl		%g4, 1, %o4
	add		%g4, %o4, %o4
	add		%o1, %g4, %o1
	sub		%o5, %o4, %o5
	jmpl		%o5 + %lo(80f), %g0
	 add		%o0, %g4, %o0

79:	/* memcpy_table */

	MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

80:	/* memcpy_table_end */
	be		81f
	 andcc		%g1, 4, %g0

	ldd		[%o1], %g2
	add		%o0, 8, %o0
	st		%g2, [%o0 - 0x08]
	add		%o1, 8, %o1
	st		%g3, [%o0 - 0x04]

81:	/* memcpy_last7 */

	be		1f
	 andcc		%g1, 2, %g0

	ld		[%o1], %g2
	add		%o1, 4, %o1
	st		%g2, [%o0]
	add		%o0, 4, %o0
1:
	be		1f
	 andcc		%g1, 1, %g0

	lduh		[%o1], %g2
	add		%o1, 2, %o1
	sth		%g2, [%o0]
	add		%o0, 2, %o0
1:
	be		1f
	 nop

	ldub		[%o1], %g2
	stb		%g2, [%o0]
1:
	retl
	 mov		%g7, %o0

82:	/* ldd_std */
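/* Same 128-bytes-per-iteration loop as 5: above, used when both src
 * and dst are doubleword aligned so the 8-byte std stores are legal.
 */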
	MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	subcc		%g1, 128, %g1
	add		%o1, 128, %o1
	cmp		%g1, 128
	bge		82b
	 add		%o0, 128, %o0

	andcc		%g1, 0x70, %g4
	be		84f
	 andcc		%g1, 8, %g0

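/* Same trailing-block dispatch as for the memcpy_table above, but a
 * MOVE_LASTALIGNCHUNK entry is 4 insns (16 bytes) per 16 bytes
 * copied, so the offset back from 84f is simply %g4.
 */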
	sethi		%hi(84f), %o5
	add		%o1, %g4, %o1
	sub		%o5, %g4, %o5
	jmpl		%o5 + %lo(84f), %g0
	 add		%o0, %g4, %o0

83:	/* amemcpy_table */

	MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

84:	/* amemcpy_table_end */
	be		85f
	 andcc		%g1, 4, %g0

	ldd		[%o1], %g2
	add		%o0, 8, %o0
	std		%g2, [%o0 - 0x08]
	add		%o1, 8, %o1
85:	/* amemcpy_last7 */
	be		1f
	 andcc		%g1, 2, %g0

	ld		[%o1], %g2
	add		%o1, 4, %o1
	st		%g2, [%o0]
	add		%o0, 4, %o0
1:
	be		1f
	 andcc		%g1, 1, %g0

	lduh		[%o1], %g2
	add		%o1, 2, %o1
	sth		%g2, [%o0]
	add		%o0, 2, %o0
1:
	be		1f
	 nop

	ldub		[%o1], %g2
	stb		%g2, [%o0]
1:
	retl
	 mov		%g7, %o0

86:	/* non_aligned */
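/* src and dst disagree in their low two bits, so their alignments can
 * never be matched.  Up to 6 bytes are handled at 88f.  Longer copies
 * get a register window, align dst to a word boundary, then load
 * whole words from the rounded-down src and reassemble them with
 * sll/srl: %g4 = 8 * (src & 3) is the left shift, %l0 = 32 - %g4 the
 * matching right shift.
 */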
	cmp		%o2, 6
	bleu		88f
	 nop

	save		%sp, -96, %sp
	andcc		%i0, 3, %g0
	be		61f
	 andcc		%i0, 1, %g0
	be		60f
	 andcc		%i0, 2, %g0

	ldub		[%i1], %g5
	add		%i1, 1, %i1
	stb		%g5, [%i0]
	sub		%i2, 1, %i2
	bne		61f
	 add		%i0, 1, %i0
60:
	ldub		[%i1], %g3
	add		%i1, 2, %i1
	stb		%g3, [%i0]
	sub		%i2, 2, %i2
	ldub		[%i1 - 1], %g3
	add		%i0, 2, %i0
	stb		%g3, [%i0 - 1]
61:
	and		%i1, 3, %g2
	and		%i2, 0xc, %g3
	and		%i1, -4, %i1
	cmp		%g3, 4
	sll		%g2, 3, %g4
	mov		32, %g2
	be		4f
	 sub		%g2, %g4, %l0

	blu		3f
	 cmp		%g3, 0x8

	be		2f
	 srl		%i2, 2, %g3

	ld		[%i1], %i3
	add		%i0, -8, %i0
	ld		[%i1 + 4], %i4
	b		8f
	 add		%g3, 1, %g3
2:
	ld		[%i1], %i4
	add		%i0, -12, %i0
	ld		[%i1 + 4], %i5
	add		%g3, 2, %g3
	b		9f
	 add		%i1, -4, %i1
3:
	ld		[%i1], %g1
	add		%i0, -4, %i0
	ld		[%i1 + 4], %i3
	srl		%i2, 2, %g3
	b		7f
	 add		%i1, 4, %i1
4:
	ld		[%i1], %i5
	cmp		%i2, 7
	ld		[%i1 + 4], %g1
	srl		%i2, 2, %g3
	bleu		10f
	 add		%i1, 8, %i1

	ld		[%i1], %i3
	add		%g3, -1, %g3
5:
	sll		%i5, %g4, %g2
	srl		%g1, %l0, %g5
	or		%g2, %g5, %g2
	st		%g2, [%i0]
7:
	ld		[%i1 + 4], %i4
	sll		%g1, %g4, %g2
	srl		%i3, %l0, %g5
	or		%g2, %g5, %g2
	st		%g2, [%i0 + 4]
8:
	ld		[%i1 + 8], %i5
	sll		%i3, %g4, %g2
	srl		%i4, %l0, %g5
	or		%g2, %g5, %g2
	st		%g2, [%i0 + 8]
9:
	ld		[%i1 + 12], %g1
	sll		%i4, %g4, %g2
	srl		%i5, %l0, %g5
	addcc		%g3, -4, %g3
	or		%g2, %g5, %g2
	add		%i1, 16, %i1
	st		%g2, [%i0 + 12]
	add		%i0, 16, %i0
	bne,a		5b
	 ld		[%i1], %i3
10:
	sll		%i5, %g4, %g2
	srl		%g1, %l0, %g5
	srl		%l0, 3, %g3
	or		%g2, %g5, %g2
	sub		%i1, %g3, %i1
	andcc		%i2, 2, %g0
	st		%g2, [%i0]
	be		1f
	 andcc		%i2, 1, %g0

	ldub		[%i1], %g2
	add		%i1, 2, %i1
	stb		%g2, [%i0 + 4]
	add		%i0, 2, %i0
	ldub		[%i1 - 1], %g2
	stb		%g2, [%i0 + 3]
1:
	be		1f
	 nop
	ldub		[%i1], %g2
	stb		%g2, [%i0 + 4]
1:
	ret
	 restore	%g7, %g0, %o0

88:	/* short_end */
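/* Short copies: at most 15 bytes when coming from 90: below, at most
 * 6 bytes from the non_aligned check above.  Jump backwards into the
 * MOVE_SHORTCHUNK table: each entry is 4 insns (16 bytes) and moves
 * 2 bytes, hence the offset of (len & 0xe) * 8 back from 89f; a
 * final odd byte is handled after the table.
 */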

	and		%o2, 0xe, %o3
20:
	sethi		%hi(89f), %o5
	sll		%o3, 3, %o4
	add		%o0, %o3, %o0
	sub		%o5, %o4, %o5
	add		%o1, %o3, %o1
	jmpl		%o5 + %lo(89f), %g0
	 andcc		%o2, 1, %g0

	MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)

89:	/* short_table_end */

	be		1f
	 nop

	ldub		[%o1], %g2
	stb		%g2, [%o0]
1:
	retl
	 mov		%g7, %o0

90:	/* short_aligned_end */
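/* At most 15 bytes and (dst - src) & 3 == 0.  The condition codes
 * still hold %o1 & 3 from the check at memcpy, so if src is not word
 * aligned punt to the byte copier at 88b; otherwise move an 8-byte
 * block here and let 81b finish the remaining 0-7 bytes.
 */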
	bne		88b
	 andcc		%o2, 8, %g0

	be		1f
	 andcc		%o2, 4, %g0

	ld		[%o1 + 0x00], %g2
	ld		[%o1 + 0x04], %g3
	add		%o1, 8, %o1
	st		%g2, [%o0 + 0x00]
	st		%g3, [%o0 + 0x04]
	add		%o0, 8, %o0
1:
	b		81b
	 mov		%o2, %g1