xref: /titanic_52/usr/src/lib/libc/capabilities/sun4u-us3/common/memcpy.s (revision f936286c99fb83153e4bfd870eb2830a990a82c1)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26	.file	"memcpy.s"
27
28/*
29 * memcpy(s1, s2, len)
30 *
31 * Copy s2 to s1, always copy len bytes (n in the C code below).
32 * Note: this C code does not work for overlapped copies.
33 *       Memmove() and bcopy() do.
34 *
35 * Fast assembler language version of the following C-program for memcpy
36 * which represents the `standard' for the C-library.
37 *
38 *	void *
39 *	memcpy(void *s, const void *s0, size_t n)
40 *	{
41 *		if (n != 0) {
42 *	   	    char *s1 = s;
43 *		    const char *s2 = s0;
44 *		    do {
45 *			*s1++ = *s2++;
46 *		    } while (--n != 0);
47 *		}
48 *		return (s);
49 *	}
50 */
51
52#include <sys/asm_linkage.h>
53#include <sys/sun4asi.h>
54#include <sys/trap.h>
55
56#define	ICACHE_LINE_SIZE	64
57#define	BLOCK_SIZE	64
58#define	FPRS_FEF	0x4
59
60#define SHORTCOPY	3
61#define	SMALL_MAX	39
62#define	MEDIUM_MAX	255
63#define MED_WMAX	256	/* max copy for medium word-aligned case */
64#define MED_MAX		256	/* max copy for medium longword-aligned case */
65
66#ifndef BSTORE_SIZE
67#define BSTORE_SIZE	256	/* min copy size for block store */
68#endif
69
70	ANSI_PRAGMA_WEAK(memmove,function)
71	ANSI_PRAGMA_WEAK(memcpy,function)
72
/*
 * memmove(s1, s2, len)
 *
 * Register use as seen in the code below: %o0 = destination, %o1 = source,
 * %o2 = byte count.
 *
 * The copy can run forward when the source is at or above the destination,
 * or when the destination starts at least len bytes above the source.  Both
 * cases branch to .forcpy, the body of memcpy.  Otherwise execution falls
 * through to the backward copy at .ovbc.
 */
73	ENTRY(memmove)
74	cmp	%o1, %o0	! if from address is >= to use forward copy
75	bgeu	%ncc, .forcpy	! else use backward if ...
76	sub	%o0, %o1, %o4	! get difference of two addresses
77	cmp	%o2, %o4	! compare size and difference of addresses
78	bleu	%ncc, .forcpy	! if size is bigger, do overlapped copy
79	nop
80
81        !
82        ! an overlapped copy that must be done "backwards"
83        !
/*
 * From here on %g1 holds the original destination (the return value) and
 * %o0/%o1 point one byte past the last destination/source byte still to
 * be copied; both pointers move downward.  Counts of 24 or more go to
 * .dbalign.  Smaller counts are copied here: four bytes per iteration of
 * .byte4loop while at least four remain (the count is biased by -3 in the
 * delay slot of the blt and restored at .byte), then one byte per
 * iteration of .byteloop.
 *
 * The "dec %o1" at .byteloop is also the delay slot of the bz above it, so
 * it executes once even when the branch to .exit is taken; .exit does not
 * use %o1.
 */
84.ovbc:
85	mov	%o0, %g1		! save dest address for return val
86	add     %o1, %o2, %o1           ! get to end of source space
87        add     %o0, %o2, %o0           ! get to end of destination space
88
89	cmp	%o2, 24
90	bgeu,pn	%ncc, .dbalign
91	nop
92	cmp	%o2, 4
93	blt,pn	%ncc, .byte
94	sub	%o2, 3, %o2
95.byte4loop:
96	ldub	[%o1-1], %o3		! load last byte
97	stb	%o3, [%o0-1]		! store last byte
98	sub	%o1, 4, %o1
99	ldub	[%o1+2], %o3		! load 2nd from last byte
100	stb	%o3, [%o0-2]		! store 2nd from last byte
101	sub	%o0, 4, %o0
102	ldub	[%o1+1], %o3		! load 3rd from last byte
103	stb	%o3, [%o0+1]		! store 3rd from last byte
104	subcc	%o2, 4, %o2
105	ldub	[%o1], %o3		! load 4th from last byte
106	bgu,pt	%ncc, .byte4loop
107	stb	%o3, [%o0]		! store 4th from last byte
108.byte:
109	addcc	%o2, 3, %o2
110	bz,pt	%ncc, .exit
111.byteloop:
112	dec	%o1			! decrement src address
113	ldub	[%o1], %o3		! read a byte
114	dec	%o0			! decrement dst address
115	deccc	%o2			! decrement count
116	bgu,pt	%ncc, .byteloop		! loop until done
117	stb	%o3, [%o0]		! write byte
118.exit:
119	retl
120	mov	%g1, %o0
121
/*
 * Backward copy of 24 bytes or more.  %o5 = %o0 & 7 is the number of bytes
 * copied singly so that the descending destination pointer becomes 8-byte
 * aligned.  It is subtracted from the count in the delay slot of the bz,
 * which executes whether or not the branch is taken.  Execution continues
 * at .dbmed in both cases.
 */
122	.align	16
123.dbalign:
124	andcc   %o0, 7, %o5		! bytes till DST 8 byte aligned
125	bz,pt	%ncc, .dbmed
126	sub	%o2, %o5, %o2		! update count
127.dbalign1:
128	dec	%o1			! decrement src address
129	ldub	[%o1], %o3		! read a byte
130	dec	%o0			! decrement dst address
131	deccc	%o5			! decrement count
132	bgu,pt	%ncc, .dbalign1		! loop until done
133	stb	%o3, [%o0]		! store a byte
134
/*
 * The destination is 8-byte aligned here.  If the source is not, go to
 * .dbbck.  Otherwise copy 8-byte units backward:
 *   - counts of 4095 or more first run .dbmedl64, which moves 64 bytes per
 *     iteration through FP double registers and prefetches below the
 *     descending source pointer.  The previous %fprs value is read into
 *     %o3; after the loop it is masked with 0x4 (the FEF bit, cf. FPRS_FEF)
 *     and written back.
 *   - .dbmedl32 moves 32 bytes per iteration with ldx/stx.
 *   - .dbmedl31 and .dbmedl15 move one more 16-byte and 8-byte piece when
 *     available; a remainder of 1..7 bytes is finished at .dbremain.
 * Before each loop the count is biased downward (by 63 or 31) so the
 * condition codes set by subcc can be tested directly with bgu.
 */
135! check for src long word alignment
136.dbmed:
137	andcc	%o1, 7, %g0		! chk src long word alignment
138	bnz,pn	%ncc, .dbbck
139	nop
140!
141! Following code is for overlapping copies where src and dest
142! are long word aligned
143!
144	cmp	%o2, 4095
145	blt,pn	%ncc, .dbmedl32enter	! go to no prefetch code
146	nop
147	prefetch [%o1 - (1 * BLOCK_SIZE)], 20	! into the prefetch cache
148	sub	%o2, 63, %o2		! adjust length to allow cc test
149					! for end of loop
150	prefetch [%o1 - (2 * BLOCK_SIZE)], 20	! into the prefetch cache
151	rd	%fprs, %o3		! o3 = fprs
152	! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
153	! So set it anyway, without checking.
154	prefetch [%o1 - (3 * BLOCK_SIZE)], 20	! into the prefetch cache
155	wr      %g0, 0x4, %fprs         ! fprs.fef = 1
156	prefetch [%o1 - (4 * BLOCK_SIZE)], 20	! into the prefetch cache
157.dbmedl64:
158	prefetch [%o1 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
159	ldd	[%o1-8], %d4		! load
160	subcc	%o2, 64, %o2		! decrement length count
161	std	%d4, [%o0-8]		! and store
162	ldd	[%o1-16], %d2		! a block of 64 bytes
163	sub	%o1, 64, %o1		! decrease src ptr by 64
164	std	%d2, [%o0-16]
165	sub	%o0, 64, %o0		! decrease dst ptr by 64
166	ldd	[%o1+40], %d4
167	std	%d4, [%o0+40]
168	ldd	[%o1+32], %d2
169	std	%d2, [%o0+32]
170	ldd	[%o1+24], %d4
171	std	%d4, [%o0+24]
172	ldd	[%o1+16], %d2
173	std	%d2, [%o0+16]
174	ldd	[%o1+8], %d4
175	std	%d4, [%o0+8]
176	ldd	[%o1], %d2
177	bgu,pt	%ncc, .dbmedl64		! repeat if at least 64 bytes left
178	std	%d2, [%o0]
179	add	%o2, 63, %o2		! restore offset adjustment
180	and	%o3, 0x4, %o3           ! fprs.du = fprs.dl = 0
181	wr	%o3, %g0, %fprs         ! fprs = o3   restore fprs
182.dbmedl32enter:
183	subcc	%o2, 31, %o2		! adjust length to allow cc test
184					! for end of loop
185	ble,pt  %ncc, .dbmedl31		! skip big loop if less than 32
186	nop
187.dbmedl32:
188	ldx	[%o1-8], %o4		! load
189	subcc	%o2, 32, %o2		! decrement length count
190	stx	%o4, [%o0-8]		! and store
191	ldx	[%o1-16], %o3		! a block of 32 bytes
192	sub	%o1, 32, %o1		! decrease src ptr by 32
193	stx	%o3, [%o0-16]
194	ldx	[%o1+8], %o4
195	sub	%o0, 32, %o0		! decrease dst ptr by 32
196	stx	%o4, [%o0+8]
197	ldx	[%o1], %o3
198	bgu,pt	%ncc, .dbmedl32		! repeat if at least 32 bytes left
199	stx	%o3, [%o0]
200.dbmedl31:
201	addcc	%o2, 16, %o2		! adjust remaining count
202	ble,pt	%ncc, .dbmedl15		! skip if 15 or fewer bytes left
203	nop				!
204	ldx	[%o1-8], %o4		! load and store 16 bytes
205	sub	%o1, 16, %o1		! decrease src ptr by 16
206	stx	%o4, [%o0-8]		!
207	sub	%o2, 16, %o2		! decrease count by 16
208	ldx	[%o1], %o3		!
209	sub	%o0, 16, %o0		! decrease dst ptr by 16
210	stx	%o3, [%o0]
211.dbmedl15:
212	addcc	%o2, 15, %o2		! restore count
213	bz,pt	%ncc, .dbexit		! exit if finished
214	nop
215	cmp	%o2, 8
216	blt,pt	%ncc, .dbremain		! skip if 7 or fewer bytes left
217	nop
218	ldx	[%o1-8], %o4		! load 8 bytes
219	sub	%o1, 8, %o1		! decrease src ptr by 8
220	stx	%o4, [%o0-8]		! and store 8 bytes
221	subcc	%o2, 8, %o2		! decrease count by 8
222	bnz	%ncc, .dbremain		! bytes remain: finish them singly
223	sub	%o0, 8, %o0		! decrease dst ptr by 8
224	retl
225	mov	%g1, %o0
226
227!
228! Following code is for overlapping copies where src and dest
229! are not long word aligned
230!
/*
 * %o3 = saved %fprs; FEF is set unconditionally.  alignaddr rounds the
 * source end pointer down to an 8-byte boundary (result in %o5) for use
 * by the faligndata instructions below, and %d0 is primed with the
 * doubleword at %o5.  %o1 is moved down by the count rounded down to a
 * multiple of 8 so that it is correct for the byte-wise finish at
 * .dbremain.
 *
 * Loop selection: fewer than 32 bytes -> .dbmv8 (8 bytes per iteration);
 * fewer than 4095 -> .dbmv32enter (32 per iteration); otherwise .dbmv64
 * (64 per iteration, with prefetch) followed by the smaller loops.
 *
 * The blt to .dbmv32enter has the first ldd of .dbmv64 in its delay slot;
 * that performs the load of %d2 which entering past .dbmv32 skips.
 *
 * .dbmvfinish masks the saved %fprs to the FEF bit and writes it back in
 * the delay slot of the bz, i.e. on both the exit and the remainder path.
 */
231	.align	16
232.dbbck:
233	rd	%fprs, %o3		! o3 = fprs
234
235	! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
236	! So set it anyway, without checking.
237	wr      %g0, 0x4, %fprs         ! fprs.fef = 1
238
239	alignaddr %o1, %g0, %o5		! align src
240	ldd	[%o5], %d0		! get first 8 byte block
241	andn	%o2, 7, %o4		! prepare src ptr for finishup code
242	cmp	%o2, 32
243	blt,pn	%ncc, .dbmv8
244	sub	%o1, %o4, %o1		!
245	cmp	%o2, 4095		! check for short memmoves
246	blt,pn	%ncc, .dbmv32enter	! go to no prefetch code
247.dbmv64:
248	ldd	[%o5-8], %d2		! load 8 bytes
249	ldd	[%o5-16], %d4		! load 8 bytes
250	sub	%o5, 64, %o5		!
251	ldd	[%o5+40], %d6		! load 8 bytes
252	sub	%o0, 64, %o0		!
253	ldd	[%o5+32], %d8		! load 8 bytes
254	sub	%o2, 64, %o2		! 64 less bytes to copy
255	ldd	[%o5+24], %d18		! load 8 bytes
256	cmp	%o2, 64			! do we have < 64 bytes remaining
257	ldd	[%o5+16], %d28		! load 8 bytes
258	ldd	[%o5+8], %d30		! load 8 bytes
259	prefetch [%o5 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
260	faligndata %d2, %d0, %d10	! extract 8 bytes out
261	ldd	[%o5], %d0		! load 8 bytes
262	std	%d10, [%o0+56]		! store the current 8 bytes
263	faligndata %d4, %d2, %d12	! extract 8 bytes out
264	std	%d12, [%o0+48]		! store the current 8 bytes
265	faligndata %d6, %d4, %d14	! extract 8 bytes out
266	std	%d14, [%o0+40]		! store the current 8 bytes
267	faligndata %d8, %d6, %d16	! extract 8 bytes out
268	std	%d16, [%o0+32]		! store the current 8 bytes
269	faligndata %d18, %d8, %d20	! extract 8 bytes out
270	std	%d20, [%o0+24]		! store the current 8 bytes
271	faligndata %d28, %d18, %d22	! extract 8 bytes out
272	std	%d22, [%o0+16]		! store the current 8 bytes
273	faligndata %d30, %d28, %d24	! extract 8 bytes out
274	std	%d24, [%o0+8]		! store the current 8 bytes
275	faligndata %d0, %d30, %d26	! extract 8 bytes out
276	bgeu,pt	%ncc, .dbmv64
277	std	%d26, [%o0]		! store the current 8 bytes
278
279	cmp	%o2, 32
280	blt,pn	%ncc, .dbmvx
281	nop
282.dbmv32:
283	ldd	[%o5-8], %d2		! load 8 bytes
284.dbmv32enter:
285	ldd	[%o5-16], %d4		! load 8 bytes
286	sub	%o5, 32, %o5		!
287	ldd	[%o5+8], %d6		! load 8 bytes
288	sub	%o0, 32, %o0		!
289	faligndata %d2, %d0, %d10	! extract 8 bytes out
290	ldd	[%o5], %d0		! load 8 bytes
291	sub     %o2,32, %o2		! 32 less bytes to copy
292	std	%d10, [%o0+24]		! store the current 8 bytes
293	cmp	%o2, 32			! do we have < 32 bytes remaining
294	faligndata %d4, %d2, %d12	! extract 8 bytes out
295	std	%d12, [%o0+16]		! store the current 8 bytes
296	faligndata %d6, %d4, %d14	! extract 8 bytes out
297	std	%d14, [%o0+8]		! store the current 8 bytes
298	faligndata %d0, %d6, %d16	! extract 8 bytes out
299	bgeu,pt	%ncc, .dbmv32
300	std	%d16, [%o0]		! store the current 8 bytes
301.dbmvx:
302	cmp	%o2, 8			! do we have < 8 bytes remaining
303	blt,pt	%ncc, .dbmvfinish	! if yes, skip to finish up code
304	nop
305.dbmv8:
306	ldd	[%o5-8], %d2
307	sub	%o0, 8, %o0		! since we are at the end
308					! when we first enter the loop
309	sub     %o2, 8, %o2		! 8 less bytes to copy
310	sub	%o5, 8, %o5
311	cmp	%o2, 8			! do we have < 8 bytes remaining
312	faligndata %d2, %d0, %d8        ! extract 8 bytes out
313	std	%d8, [%o0]		! store the current 8 bytes
314	bgeu,pt	%ncc, .dbmv8
315	fmovd	%d2, %d0
316.dbmvfinish:
317	and	%o3, 0x4, %o3           ! fprs.du = fprs.dl = 0
318	tst	%o2
319	bz,pt	%ncc, .dbexit
320	wr	%o3, %g0, %fprs         ! fprs = o3   restore fprs
321
/*
 * Finish a backward copy: both callers above arrive with 1..7 bytes left.
 * Four bytes are copied if at least four remain, then single bytes at
 * .dbbyte.  The "dec %o1" at .dbbyte is also the delay slot of the bz
 * above it; it executes even when the branch to .dbexit is taken, and
 * .dbexit does not use %o1.  .dbexit returns the original destination
 * saved in %g1.
 */
322.dbremain:
323	cmp	%o2, 4
324	blt,pn	%ncc, .dbbyte
325	nop
326	ldub	[%o1-1], %o3		! load last byte
327	stb	%o3, [%o0-1]		! store last byte
328	sub	%o1, 4, %o1
329	ldub	[%o1+2], %o3		! load 2nd from last byte
330	stb	%o3, [%o0-2]		! store 2nd from last byte
331	sub	%o0, 4, %o0
332	ldub	[%o1+1], %o3		! load 3rd from last byte
333	stb	%o3, [%o0+1]		! store 3rd from last byte
334	subcc	%o2, 4, %o2
335	ldub	[%o1], %o3		! load 4th from last byte
336	stb	%o3, [%o0]		! store 4th from last byte
337	bz,pt	%ncc, .dbexit
338.dbbyte:
339	dec	%o1			! decrement src address
340	ldub	[%o1], %o3		! read a byte
341	dec	%o0			! decrement dst address
342	deccc	%o2			! decrement count
343	bgu,pt	%ncc, .dbbyte		! loop until done
344	stb	%o3, [%o0]		! write byte
345.dbexit:
346	retl
347        mov     %g1, %o0
348	SET_SIZE(memmove)
349
350
/*
 * memcpy(s1, s2, len)
 *
 * %o0 = destination, %o1 = source, %o2 = byte count.  The original
 * destination is kept in %g1 and moved back to %o0 on return.  .forcpy is
 * also the target of the forward-copy branches in memmove.
 *
 * Counts above SMALL_MAX go to .medium.  Counts of SHORTCOPY or less are
 * copied by the straight-line code at .smallleft.  Otherwise, if both
 * pointers are 4-byte aligned the word code at .smallword is used; if not,
 * .smallnotalign4 copies four bytes per iteration.  The count is biased by
 * -3 in the delay slot of the bz (on both paths) so the loops can end on a
 * condition-code test; it is restored before .smallleft.
 */
351	.align ICACHE_LINE_SIZE
352	ENTRY(memcpy)
353					! adjust instruction alignment
354	nop				! Do not remove, these nops affect
355	nop				! icache alignment and performance
356.forcpy:
357	cmp	%o2, SMALL_MAX		! check for not small case
358	bgu,pn	%ncc, .medium		! go to larger cases
359	mov	%o0, %g1		! save %o0
360	cmp	%o2, SHORTCOPY		! check for really short case
361	ble,pt	%ncc, .smallleft	!
362	or	%o0, %o1, %o3		! prepare alignment check
363	andcc	%o3, 0x3, %g0		! test for alignment
364	bz,pt	%ncc, .smallword	! branch to word aligned case
365	sub	%o2, 3, %o2		! adjust count to allow cc zero test
366.smallnotalign4:
367	ldub	[%o1], %o3		! read byte
368	subcc	%o2, 4, %o2		! reduce count by 4
369	stb	%o3, [%o0]		! write byte
370	ldub	[%o1+1], %o3		! repeat for a total of 4 bytes
371	add	%o1, 4, %o1		! advance SRC by 4
372	stb	%o3, [%o0+1]
373	ldub	[%o1-2], %o3
374	add	%o0, 4, %o0		! advance DST by 4
375	stb	%o3, [%o0-2]
376	ldub	[%o1-1], %o3
377	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
378	stb	%o3, [%o0-1]
379	add	%o2, 3, %o2		! restore count
380.smallleft:
381	tst	%o2
382	bz,pt	%ncc, .smallexit
383	nop
384.smallleft3:				! 1, 2, or 3 bytes remain
385	ldub	[%o1], %o3		! load one byte
386	deccc	%o2			! reduce count for cc test
387	bz,pt	%ncc, .smallexit
388	stb	%o3, [%o0]		! store one byte
389	ldub	[%o1+1], %o3		! load second byte
390	deccc	%o2
391	bz,pt	%ncc, .smallexit
392	stb	%o3, [%o0+1]		! store second byte
393	ldub	[%o1+2], %o3		! load third byte
394	stb	%o3, [%o0+2]		! store third byte
395	retl
396	mov	%g1, %o0		! restore %o0
397
/*
 * Small copy with both pointers 4-byte aligned.  Entry is at .smallword
 * with %o2 = count - 3.  Counts of 8 or more enter the loop at .smallwordx
 * with the first word already loaded by the lduw in the branch delay slot;
 * each iteration moves 8 bytes, and the tail moves one more word and/or
 * up to three bytes via .smallleft3.  Counts of 4..7 store the one word
 * and then up to three trailing bytes inline.
 */
398	.align	16
399	nop				! affects loop icache alignment
400.smallwords:
401	lduw	[%o1], %o3		! read word
402.smallwordx:
403	subcc	%o2, 8, %o2		! update count
404	stw	%o3, [%o0]		! write word
405	add	%o1, 8, %o1		! update SRC
406	lduw	[%o1-4], %o3		! read word
407	add	%o0, 8, %o0		! update DST
408	bgu,pt	%ncc, .smallwords	! loop until done
409	stw	%o3, [%o0-4]		! write word
410	addcc	%o2, 7, %o2		! restore count
411	bz,pt	%ncc, .smallexit	! check for completion
412	nop
413	cmp	%o2, 4			! check for 4 or more bytes left
414	blt	.smallleft3		! if not, go to finish up
415	nop
416	lduw	[%o1], %o3
417	add	%o1, 4, %o1
418	subcc	%o2, 4, %o2
419	stw	%o3, [%o0]
420	add	%o0, 4, %o0
421	bnz,pt	%ncc, .smallleft3
422	nop
423	retl
424	mov	%g1, %o0		! restore %o0
425
426.smallword:
427	subcc	%o2, 4, %o2		! update count
428	bgu,pt	%ncc, .smallwordx
429	lduw	[%o1], %o3		! read word
430	addcc	%o2, 3, %o2		! restore count
431	bz,pt	%ncc, .smallexit
432	stw	%o3, [%o0]		! write word
433	deccc	%o2			! reduce count for cc test
434	ldub	[%o1+4], %o3		! load one byte
435	bz,pt	%ncc, .smallexit
436	stb	%o3, [%o0+4]		! store one byte
437	ldub	[%o1+5], %o3		! load second byte
438	deccc	%o2
439	bz,pt	%ncc, .smallexit
440	stb	%o3, [%o0+5]		! store second byte
441	ldub	[%o1+6], %o3		! load third byte
442	stb	%o3, [%o0+6]		! store third byte
443.smallexit:
444	retl
445	mov	%g1, %o0		! restore %o0
/*
 * Count is greater than SMALL_MAX.  Single bytes are copied until the
 * destination is 8-byte aligned (%o5 = bytes needed; the loop at 1: is
 * skipped when it is zero).  %o3 is set, in the delay slot of the bz, to
 * %o5 minus the number of bytes the source needs to reach 8-byte
 * alignment; the FP copy code uses it later.
 *
 * At 2: a source that is not 4-byte aligned branches to .mediumsetup.
 * The prefetch in the delay slot is issued on both paths.
 */
446	.align 16
447.medium:
448	neg	%o0, %o5
449	neg	%o1, %o3
450	andcc	%o5, 7, %o5	! bytes till DST 8 byte aligned
451	and	%o3, 7, %o3	! bytes till SRC 8 byte aligned
452
453	bz	%ncc, 2f
454	sub	%o5, %o3, %o3	! -(bytes till SRC aligned after DST aligned)
455				! o3={-7, -6, ... 7}  o3>0 => SRC overaligned
456
457	sub	%o2, %o5, %o2	! update count
458
4591:
460	ldub	[%o1], %o4
461	deccc	%o5
462	inc	%o1
463	stb	%o4, [%o0]
464	bgu,pt	%ncc, 1b
465	inc	%o0
466
467	! Now DST is 8-byte aligned.  o0, o1, o2 are current.
468
4692:
470	andcc	%o1, 0x3, %g0		! test alignment
471	bnz,pt	%ncc, .mediumsetup	! branch to skip aligned cases
472					! if src, dst not aligned
473	prefetch [%o1 + (1 * BLOCK_SIZE)], 20
474
475/*
476 * Handle all cases where src and dest are aligned on word
477 * or long word boundaries.  Use unrolled loops for better
478 * performance.  This option wins over standard large data
479 * move when source and destination is in cache for medium
480 * to short data moves.
481 */
/*
 * An 8-byte aligned source branches to .medlword.  Past that branch the
 * source is 4-byte but not 8-byte aligned: counts above MED_WMAX rejoin
 * the FP path at .mediumrejoin; otherwise .medw16 moves 16 bytes per
 * iteration with word loads and stores (count biased by -15), and
 * .medw15, .medw7 and .medw3 finish a last 8, 4 and 1..3 bytes.
 * .medw7, .medw3 and .medwexit are also used by the .medlword code.
 */
482	andcc	%o1, 0x7, %g0		! test word alignment
483	bz,pt	%ncc, .medlword		! branch to long word aligned case
484	prefetch [%o1 + (2 * BLOCK_SIZE)], 20
485	cmp	%o2, MED_WMAX		! limit to store buffer size
486	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
487	nop
488	subcc	%o2, 15, %o2		! adjust length to allow cc test
489					! for end of loop
490	ble,pt	%ncc, .medw15		! skip big loop if less than 16
491	prefetch [%o1 + (3 * BLOCK_SIZE)], 20
492/*
493 * no need to put prefetch in loop as prefetches have
494 * already been issued for maximum loop size
495 */
496.medw16:
497	ld	[%o1], %o4		! load
498	subcc	%o2, 16, %o2		! decrement length count
499	stw	%o4, [%o0]		! and store
500	ld	[%o1+4], %o3		! a block of 16 bytes
501	add	%o1, 16, %o1		! increase src ptr by 16
502	stw	%o3, [%o0+4]
503	ld	[%o1-8], %o4
504	add	%o0, 16, %o0		! increase dst ptr by 16
505	stw	%o4, [%o0-8]
506	ld	[%o1-4], %o3
507	bgu,pt	%ncc, .medw16		! repeat if at least 16 bytes left
508	stw	%o3, [%o0-4]
509.medw15:
510	addcc	%o2, 15, %o2		! restore count
511	bz,pt	%ncc, .medwexit		! exit if finished
512	nop
513	cmp	%o2, 8
514	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
515	nop				!
516	ld	[%o1], %o4		! load 4 bytes
517	subcc	%o2, 8, %o2		! decrease count by 8
518	stw	%o4, [%o0]		! and store 4 bytes
519	add	%o1, 8, %o1		! increase src ptr by 8
520	ld	[%o1-4], %o3		! load 4 bytes
521	add	%o0, 8, %o0		! increase dst ptr by 8
522	stw	%o3, [%o0-4]		! and store 4 bytes
523	bz	%ncc, .medwexit		! exit if finished
524	nop
525.medw7:					! count is ge 1, less than 8
526	cmp	%o2, 3			! check for 4 bytes left
527	ble,pt	%ncc, .medw3		! skip if 3 or fewer bytes left
528	nop				!
529	ld	[%o1], %o4		! load 4 bytes
530	sub	%o2, 4, %o2		! decrease count by 4
531	add	%o1, 4, %o1		! increase src ptr by 4
532	stw	%o4, [%o0]		! and store 4 bytes
533	add	%o0, 4, %o0		! increase dst ptr by 4
534	tst	%o2			! check for zero bytes left
535	bz	%ncc, .medwexit		! exit if finished
536	nop
537.medw3:					! count is known to be 1, 2, or 3
538	deccc	%o2			! reduce count by one
539	ldub	[%o1], %o3		! load one byte
540	bz,pt	%ncc, .medwexit		! exit if last byte
541	stb	%o3, [%o0]		! store one byte
542	ldub	[%o1+1], %o3		! load second byte
543	deccc	%o2			! reduce count by one
544	bz,pt	%ncc, .medwexit		! exit if last byte
545	stb	%o3, [%o0+1]		! store second byte
546	ldub	[%o1+2], %o3		! load third byte
547	stb	%o3, [%o0+2]		! store third byte
548.medwexit:
549	retl
550	mov	%g1, %o0		! restore %o0
551
552/*
553 * Special case for handling when src and dest are both long word aligned
554 * and total data to move is between SMALL_MAX and MED_MAX bytes
555 */
/*
 * Counts above MED_MAX rejoin the FP path at .mediumrejoin.  .medl32 moves
 * 32 bytes per iteration with ldx/stx (count biased by -31), .medl31 moves
 * one more 16-byte piece when available and .medl15 one more 8-byte piece.
 * A remainder of 1..7 bytes is finished by .medw7 in the word-aligned code,
 * and the return is through .medwexit.
 */
556
557	.align 16
558	nop
559.medlword:				! long word aligned
560					! length > SMALL_MAX
561	cmp	%o2, MED_MAX		! limit to store buffer size
562	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
563	nop
564	subcc	%o2, 31, %o2		! adjust length to allow cc test
565					! for end of loop
566	ble,pt	%ncc, .medl31		! skip big loop if less than 32
567	prefetch [%o1 + (3 * BLOCK_SIZE)], 20	! into the l2 cache
568/*
569 * no need to put prefetch in loop as prefetches have
570 * already been issued for maximum loop size
571 */
572.medl32:
573	ldx	[%o1], %o4		! load
574	subcc	%o2, 32, %o2		! decrement length count
575	stx	%o4, [%o0]		! and store
576	ldx	[%o1+8], %o3		! a block of 32 bytes
577	add	%o1, 32, %o1		! increase src ptr by 32
578	stx	%o3, [%o0+8]
579	ldx	[%o1-16], %o4
580	add	%o0, 32, %o0		! increase dst ptr by 32
581	stx	%o4, [%o0-16]
582	ldx	[%o1-8], %o3
583	bgu,pt	%ncc, .medl32		! repeat if at least 32 bytes left
584	stx	%o3, [%o0-8]
585.medl31:
586	addcc	%o2, 16, %o2		! adjust remaining count
587	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
588	nop				!
589	ldx	[%o1], %o4		! load and store 16 bytes
590	add	%o1, 16, %o1		! increase src ptr by 16
591	stx	%o4, [%o0]		!
592	sub	%o2, 16, %o2		! decrease count by 16
593	ldx	[%o1-8], %o3		!
594	add	%o0, 16, %o0		! increase dst ptr by 16
595	stx	%o3, [%o0-8]
596.medl15:
597	addcc	%o2, 15, %o2		! restore count
598	bz,pt	%ncc, .medwexit		! exit if finished
599	nop
600	cmp	%o2, 8
601	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
602	nop
603	ldx	[%o1], %o4		! load 8 bytes
604	add	%o1, 8, %o1		! increase src ptr by 8
605	stx	%o4, [%o0]		! and store 8 bytes
606	subcc	%o2, 8, %o2		! decrease count by 8
607	bz	%ncc, .medwexit		! exit if finished
608	add	%o0, 8, %o0		! increase dst ptr by 8
609	ba	.medw7
610	nop
611
/*
 * Set-up for the FP/VIS copy path.
 *
 * %o4 = caller's %fprs masked to the FEF bit (written back at exit); FEF
 * is enabled by the annulled delay slot only when it was clear.  bmask
 * is given 0x1234567f ("For GSR.MASK") so that each bshuffle in loop 5:
 * shifts %d0 left one byte and ORs in the byte just loaded into %d2, as
 * the comment on that bshuffle states.
 *
 * The condition codes from "cmp %o2, MEDIUM_MAX" at 3: are consumed by the
 * bleu just before 4:; none of the instructions in between (bmask, or,
 * alignaddr, movrlz, add) is a cc-setting form.
 *
 * At 4: if %o3 >= 0 the annulled delay slot loads %d0 from [%o1-8] and
 * control goes straight to .beginmedloop; otherwise %o1 is backed up by
 * %o3 and loop 5: fills %d0 one byte at a time until %o1 is again 8-byte
 * aligned.
 */
612	.align 16
613	nop
614	nop
615	nop
616.mediumsetup:
617	prefetch [%o1 + (2 * BLOCK_SIZE)], 21
618.mediumrejoin:
619	rd	%fprs, %o4		! check for unused FPU
620
621	add	%o1, 8, %o1		! prepare to round SRC upward
622
623	sethi	%hi(0x1234567f), %o5	! For GSR.MASK
624	or	%o5, 0x67f, %o5
625
626	andcc	%o4, FPRS_FEF, %o4	! test FEF, fprs.du = fprs.dl = 0
627	bz,a	%ncc, 3f
628	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
6293:
630	cmp	%o2, MEDIUM_MAX
631	bmask	%o5, %g0, %g0
632
633	! Compute o5 (number of bytes that need copying using the main loop).
634	! First, compute for the medium case.
635	! Then, if large case, o5 is replaced by count for block alignment.
636	! Be careful not to read past end of SRC
637	! Currently, o2 is the actual count remaining
638	!            o3 is how much sooner we'll cross the alignment boundary
639	!                in SRC compared to in DST
640	!
641	! Examples:  Let # denote bytes that should not be accessed
642	!            Let x denote a byte already copied to align DST
643	!            Let . and - denote bytes not yet copied
644	!            Let | denote double alignment boundaries
645	!
646	!            DST:  ######xx|........|--------|..######   o2 = 18
647	!                          o0
648	!
649	!  o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
650	!                          o1
651	!
652	!  o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
653	!                                   o1
654	!
655	!  o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
656	!                                   o1
657
658	or	%g0, -8, %o5
659	alignaddr %o1, %g0, %o1		! set GSR.ALIGN and align o1
660
661	movrlz	%o3, %g0, %o5		! subtract 8 from o2+o3 only if o3>=0
662	add	%o5, %o2, %o5
663	add	%o5, %o3, %o5
664
665	bleu	%ncc, 4f
666	andn	%o5, 7, %o5		! 8 byte aligned count
667	neg	%o0, %o5		! 'large' case
668	and	%o5, BLOCK_SIZE-1, %o5  ! bytes till DST block aligned
6694:
670	brgez,a	%o3, .beginmedloop
671	ldd	[%o1-8], %d0
672
673	add	%o1, %o3, %o1		! back up o1
6745:
675	ldda	[%o1]ASI_FL8_P, %d2
676	inc	%o1
677	andcc	%o1, 7, %g0
678	bnz	%ncc, 5b
679	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
680
/*
 * Copy %o5 bytes (a multiple of 8) as aligned doubleword stores, merging
 * adjacent source doublewords with faligndata.  On entry %d0 holds the
 * source doubleword preceding %o1.  %o5 is subtracted from %o2 in the
 * delay slot of the bz, so %o2 is the count left after this loop.
 *
 * .medloop is unrolled twice and alternates %d0 and %d2 as the older
 * doubleword.  Both exits (1: after an odd number of doublewords, 2:
 * after an even number) leave the most recently loaded source doubleword
 * in %d0 with %o1 pointing just past it, which is what .endmedloop
 * expects.
 */
681.beginmedloop:
682	tst	%o5
683	bz	%ncc, .endmedloop
684	sub	%o2, %o5, %o2		! update count for later
685
686	! Main loop to write out doubles.  Note: o5 & 7 == 0
687
688	ldd	[%o1], %d2
689	subcc	%o5, 8, %o5		! update local count
690	bz,pn	%ncc, 1f
691	add	%o1, 8, %o1		! update SRC
692
693.medloop:
694	faligndata %d0, %d2, %d4
695	ldd	[%o1], %d0
696	subcc	%o5, 8, %o5		! update local count
697	add	%o1, 16, %o1		! update SRC
698	std	%d4, [%o0]
699	bz,pn	%ncc, 2f
700	faligndata %d2, %d0, %d6
701	ldd	[%o1 - 8], %d2
702	subcc	%o5, 8, %o5		! update local count
703	std	%d6, [%o0 + 8]
704	bnz,pt	%ncc, .medloop
705	add	%o0, 16, %o0		! update DST
706
7071:
708	faligndata %d0, %d2, %d4
709	fmovd	%d2, %d0
710	std	%d4, [%o0]
711	ba	.endmedloop
712	add	%o0, 8, %o0
713
7142:
715	std	%d6, [%o0 + 8]
716	sub	%o1, 8, %o1
717	add	%o0, 16, %o0
718
719
/*
 * Tail of the FP path, and the dispatch to .large.
 *
 * The bgu to .large has no instruction written directly after it.  With
 * the "#if 0" code compiled out, the andcc at the top of the #else branch
 * is the next instruction assembled and therefore fills the delay slot;
 * it executes on both paths.  .large reloads %o5 before using it.
 *
 * #else branch: when %d0 holds only some unwritten bytes and at least 8
 * bytes remain, loop 1: tops %d0 up one byte at a time with bshuffle;
 * 2: then stores %d0 as one doubleword.  When fewer than 8 bytes remain,
 * the annulled delay slot of the bl backs %o1 up to the first unwritten
 * byte and loop 3: copies the rest singly.  .mediumexit writes %o4 back
 * to %fprs and returns the original destination from %g1.
 */
720.endmedloop:
721	! Currently, o1 is pointing to the next double-aligned byte in SRC
722	! The 8 bytes starting at [o1-8] are available in d0
723	! At least one, and possibly all, of these need to be written.
724
725	cmp	%o2, BLOCK_SIZE
726	bgu	%ncc, .large		! otherwise, less than 16 bytes left
727
728#if 0
729
730	/* This code will use partial stores.  */
731
732	mov	%g0, %o5
733	and	%o3, 7, %o3		! Number of bytes needed to completely
734					! fill %d0 with good (unwritten) data.
735
736	subcc	%o2, 8, %o2		! update count (maybe too much)
737	movl	%ncc, %o2, %o5
738	addcc	%o3, %o5, %o5		! extra bytes we can stuff into %d0
739	sub	%o3, %o5, %o3		! update o3 (# bad bytes in %d0)
740
741	bz	%ncc, 2f
742	alignaddr %o3, %g0, %g0		! set GSR.ALIGN
743
7441:
745	deccc	%o5
746	ldda	[%o1]ASI_FL8_P, %d2
747	inc	%o1
748	bgu	%ncc, 1b
749	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
750
7512:
752	not     %o3
753	faligndata %d0, %d0, %d0	! shift bytes to the left
754	and	%o3, 7, %o3		! last byte to be stored in [%o0+%o3]
755	edge8n	%g0, %o3, %o5
756	stda	%d0, [%o0]%o5, ASI_PST8_P
757	brlez	%o2, .mediumexit
758	add	%o0, %o3, %o0		! update DST to last stored byte
7593:
760	inc	%o0
761	deccc	%o2
762	ldub	[%o1], %o3
763	stb	%o3, [%o0]
764	bgu	%ncc, 3b
765	inc	%o1
766
767#else
768
769	andcc	%o3, 7, %o5		! Number of bytes needed to completely
770					! fill %d0 with good (unwritten) data.
771	bz	%ncc, 2f
772	sub	%o5, 8, %o3		! -(number of good bytes in %d0)
773	cmp	%o2, 8
774	bl,a	%ncc, 3f		! Not enough bytes to fill %d0
775	add	%o1, %o3, %o1 		! Back up %o1
776
7771:
778	deccc	%o5
779	ldda	[%o1]ASI_FL8_P, %d2
780	inc	%o1
781	bgu	%ncc, 1b
782	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
783
7842:
785	subcc	%o2, 8, %o2
786	std	%d0, [%o0]
787	bz	%ncc, .mediumexit
788	add	%o0, 8, %o0
7893:
790	ldub	[%o1], %o3
791	deccc	%o2
792	inc	%o1
793	stb	%o3, [%o0]
794	bgu	%ncc, 3b
795	inc	%o0
796#endif
797
798.mediumexit:
799        wr      %o4, %g0, %fprs		! fprs = o4   restore fprs
800	retl
801        mov     %g1, %o0
802
803
/*
 * Large copy, more than BLOCK_SIZE bytes left and the destination
 * 64-byte aligned.  Counts above BSTORE_SIZE branch to .xlarge; the first
 * prefetch below sits in the delay slot of that bgu and is issued on both
 * paths.
 *
 * The loop at 1: is software pipelined: it finishes aligning and stores
 * the previous 64-byte block with eight std instructions while loading
 * the next one.  The block left in the pipeline when the loop ends is
 * written with a block store.  If exactly one block then remains and the
 * source is doubleword aligned (%o3 == 0), 3: copies it with a block
 * store and returns; otherwise 2: recomputes %o5 and finishes through
 * .beginmedloop.  Each block store is followed by a membar before the
 * code continues or returns.
 */
804	.align ICACHE_LINE_SIZE
805.large:
806	! The following test for BSTORE_SIZE is used to decide whether
807	! to store data with a block store or with individual stores.
808	! The block store wins when the amount of data is so large
809	! that it causes other application data to be moved out
810	! of the L1 or L2 cache.
811	! On a Panther, block store can lose more often because block
812	! store forces the stored data to be removed from the L3 cache.
813	!
814	sethi	%hi(BSTORE_SIZE),%o5
815	or	%o5,%lo(BSTORE_SIZE),%o5
816	cmp	%o2, %o5
817	bgu	%ncc, .xlarge
818
819	! %o0 I/O DST is 64-byte aligned
820	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
821	! %d0 I/O already loaded with SRC data from [%o1-8]
822	! %o2 I/O count (number of bytes that need to be written)
823	! %o3 I   Not written.  If zero, then SRC is double aligned.
824	! %o4 I   Not written.  Holds fprs.
825	! %o5   O The number of doubles that remain to be written.
826
827	! Load the rest of the current block
828	! Recall that %o1 is further into SRC than %o0 is into DST
829
830	prefetch [%o0 + (0 * BLOCK_SIZE)], 22
831	prefetch [%o0 + (1 * BLOCK_SIZE)], 22
832	prefetch [%o0 + (2 * BLOCK_SIZE)], 22
833	ldd	[%o1], %f2
834	prefetch [%o1 + (3 * BLOCK_SIZE)], 21
835	ldd	[%o1 + 0x8], %f4
836	faligndata %f0, %f2, %f32
837	ldd	[%o1 + 0x10], %f6
838	faligndata %f2, %f4, %f34
839	ldd	[%o1 + 0x18], %f8
840	faligndata %f4, %f6, %f36
841	ldd	[%o1 + 0x20], %f10
842        or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
843	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
844	faligndata %f6, %f8, %f38
845	ldd	[%o1 + 0x28], %f12
846	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed later)
847	faligndata %f8, %f10, %f40
848	ldd	[%o1 + 0x30], %f14
849	faligndata %f10, %f12, %f42
850	ldd	[%o1 + 0x38], %f0
851	sub	%o2, BLOCK_SIZE, %o2	! update count
852	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
853	add	%o1, BLOCK_SIZE, %o1		! update SRC
854
855	! Main loop.  Write previous block.  Load rest of current block.
856	! Some bytes will be loaded that won't yet be written.
8571:
858	ldd	[%o1], %f2
859	faligndata %f12, %f14, %f44
860	ldd	[%o1 + 0x8], %f4
861	faligndata %f14, %f0, %f46
862	std	%f32, [%o0]
863	std	%f34, [%o0+8]
864	std	%f36, [%o0+16]
865	std	%f38, [%o0+24]
866	std	%f40, [%o0+32]
867	std	%f42, [%o0+40]
868	std	%f44, [%o0+48]
869	std	%f46, [%o0+56]
870	sub	%o2, BLOCK_SIZE, %o2		! update count
871	prefetch [%o0 + (6 * BLOCK_SIZE)], 22
872	prefetch [%o0 + (3 * BLOCK_SIZE)], 22
873	add	%o0, BLOCK_SIZE, %o0		! update DST
874	ldd	[%o1 + 0x10], %f6
875	faligndata %f0, %f2, %f32
876	ldd	[%o1 + 0x18], %f8
877	faligndata %f2, %f4, %f34
878	ldd	[%o1 + 0x20], %f10
879	faligndata %f4, %f6, %f36
880	ldd	[%o1 + 0x28], %f12
881	faligndata %f6, %f8, %f38
882	ldd	[%o1 + 0x30], %f14
883	faligndata %f8, %f10, %f40
884	ldd	[%o1 + 0x38], %f0
885	faligndata %f10, %f12, %f42
886	cmp	%o2, BLOCK_SIZE + 8
887	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
888	bgu,pt	%ncc, 1b
889	add	%o1, BLOCK_SIZE, %o1	! update SRC
890	faligndata %f12, %f14, %f44
891	faligndata %f14, %f0, %f46
892	stda	%f32, [%o0]ASI_BLK_P		! store 64 bytes, bypass cache
893	cmp	%o2, BLOCK_SIZE
894	bne	%ncc, 2f		! exactly 1 block remaining?
895	add	%o0, BLOCK_SIZE, %o0	! update DST
896	brz,a	%o3, 3f			! is SRC double aligned?
897	ldd	[%o1], %f2
898
8992:
900	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
901	add	%o5, %o3, %o5
902
903	membar	#StoreLoad|#StoreStore
904
905	ba	.beginmedloop
906	andn	%o5, 7, %o5		! 8 byte aligned count
907
908
909	! This is when there is exactly 1 block remaining and SRC is aligned
9103:
911	ldd	[%o1 + 0x8], %f4
912	ldd	[%o1 + 0x10], %f6
913	fsrc1	%f0, %f32
914	ldd	[%o1 + 0x18], %f8
915	fsrc1	%f2, %f34
916	ldd	[%o1 + 0x20], %f10
917	fsrc1	%f4, %f36
918	ldd	[%o1 + 0x28], %f12
919	fsrc1	%f6, %f38
920	ldd	[%o1 + 0x30], %f14
921	fsrc1	%f8, %f40
922	fsrc1	%f10, %f42
923	fsrc1	%f12, %f44
924	fsrc1	%f14, %f46
925	stda	%f32, [%o0]ASI_BLK_P
926	membar	#StoreLoad|#StoreStore
927	wr	%o4, 0, %fprs
928	retl
929	mov	%g1, %o0
930
931
/*
 * Very large copy (count above BSTORE_SIZE).  The structure matches
 * .large, except that every 64-byte block is written with a block store
 * (stda ... ASI_BLK_P) and the source prefetches reach further ahead.
 * The tail is handled as in .large: 3: when exactly one block remains
 * and the source is doubleword aligned, otherwise 2: and .beginmedloop.
 */
932	.align 16
933	! two nops here causes loop starting at 1f below to be
934	! on a cache line boundary, improving performance
935	nop
936	nop
937.xlarge:
938	! %o0 I/O DST is 64-byte aligned
939	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
940	! %d0 I/O already loaded with SRC data from [%o1-8]
941	! %o2 I/O count (number of bytes that need to be written)
942	! %o3 I   Not written.  If zero, then SRC is double aligned.
943	! %o4 I   Not written.  Holds fprs.
944	! %o5   O The number of doubles that remain to be written.
945
946	! Load the rest of the current block
947	! Recall that %o1 is further into SRC than %o0 is into DST
948
949	! NOTE(review): the delay slot of the branch to .xlarge holds
950	! prefetch [%o0 + (0 * BLOCK_SIZE)], 22 (first instruction of .large),
	! not a prefetch of [%o1 + (3 * BLOCK_SIZE)]; confirm this is intended.
951	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
952	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
953	ldd	[%o1], %f2
954	prefetch [%o1 + (6 * BLOCK_SIZE)], 21
955	ldd	[%o1 + 0x8], %f4
956	faligndata %f0, %f2, %f32
957	ldd	[%o1 + 0x10], %f6
958	faligndata %f2, %f4, %f34
959	ldd	[%o1 + 0x18], %f8
960	faligndata %f4, %f6, %f36
961	ldd	[%o1 + 0x20], %f10
962        or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
963	faligndata %f6, %f8, %f38
964	ldd	[%o1 + 0x28], %f12
965	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed later)
966	faligndata %f8, %f10, %f40
967	ldd	[%o1 + 0x30], %f14
968	faligndata %f10, %f12, %f42
969	ldd	[%o1 + 0x38], %f0
970	sub	%o2, BLOCK_SIZE, %o2	! update count
971	prefetch [%o1 + (7 * BLOCK_SIZE)], 21
972	add	%o1, BLOCK_SIZE, %o1	! update SRC
973
974	! This point is 32-byte aligned since 24 instructions appear since
975	! the previous alignment directive.
976
977
978	! Main loop.  Write previous block.  Load rest of current block.
979	! Some bytes will be loaded that won't yet be written.
9801:
981	ldd	[%o1], %f2
982	faligndata %f12, %f14, %f44
983	ldd	[%o1 + 0x8], %f4
984	faligndata %f14, %f0, %f46
985	stda	%f32, [%o0]ASI_BLK_P
986	sub	%o2, BLOCK_SIZE, %o2		! update count
987	ldd	[%o1 + 0x10], %f6
988	faligndata %f0, %f2, %f32
989	ldd	[%o1 + 0x18], %f8
990	faligndata %f2, %f4, %f34
991	ldd	[%o1 + 0x20], %f10
992	faligndata %f4, %f6, %f36
993	ldd	[%o1 + 0x28], %f12
994	faligndata %f6, %f8, %f38
995	ldd	[%o1 + 0x30], %f14
996	faligndata %f8, %f10, %f40
997	ldd	[%o1 + 0x38], %f0
998	faligndata %f10, %f12, %f42
999	! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K
1000	prefetch [%o1 + (8 * BLOCK_SIZE) + 8], 21
1001	add	%o0, BLOCK_SIZE, %o0		! update DST
1002	cmp	%o2, BLOCK_SIZE + 8
1003	! second prefetch important to correct for occasional dropped
1004	! initial prefetches, 5*BLK works best over range of (src-dst) mod 1K
1005	! strong prefetch prevents drops on Panther, but Jaguar and earlier
1006	! US-III models treat strong prefetches as weak prefetches
1007	! to avoid regressions on customer hardware, we retain the prefetch
1008	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
1009	bgu,pt	%ncc, 1b
1010	add	%o1, BLOCK_SIZE, %o1		! update SRC
1011
1012	faligndata %f12, %f14, %f44
1013	faligndata %f14, %f0, %f46
1014	stda	%f32, [%o0]ASI_BLK_P		! store 64 bytes, bypass cache
1015	cmp	%o2, BLOCK_SIZE
1016	bne	%ncc, 2f		! exactly 1 block remaining?
1017	add	%o0, BLOCK_SIZE, %o0	! update DST
1018	brz,a	%o3, 3f			! is SRC double aligned?
1019	ldd	[%o1], %f2
1020
10212:
1022	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
1023	add	%o5, %o3, %o5
1024
1025	membar	#StoreLoad|#StoreStore
1026
1027	ba	.beginmedloop
1028	andn	%o5, 7, %o5		! 8 byte aligned count
1029
1030
1031	! This is when there is exactly 1 block remaining and SRC is aligned
10323:
1033	ldd	[%o1 + 0x8], %f4
1034	ldd	[%o1 + 0x10], %f6
1035	fsrc1	%f0, %f32
1036	ldd	[%o1 + 0x18], %f8
1037	fsrc1	%f2, %f34
1038	ldd	[%o1 + 0x20], %f10
1039	fsrc1	%f4, %f36
1040	ldd	[%o1 + 0x28], %f12
1041	fsrc1	%f6, %f38
1042	ldd	[%o1 + 0x30], %f14
1043	fsrc1	%f8, %f40
1044	fsrc1	%f10, %f42
1045	fsrc1	%f12, %f44
1046	fsrc1	%f14, %f46
1047	stda	%f32, [%o0]ASI_BLK_P
1048	membar	#StoreLoad|#StoreStore
1049	wr	%o4, 0, %fprs
1050	retl
1051	mov	%g1, %o0
1052
1053	SET_SIZE(memcpy)
1054