/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"memcpy.s"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *       Memmove() and bcopy() do.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *		    char *s1 = s;
 *		    const char *s2 = s0;
 *		    do {
 *			*s1++ = *s2++;
 *		    } while (--n != 0);
 *		}
 *		return (s);
 *	}
 */

#include <sys/asm_linkage.h>
#include <sys/sun4asi.h>
#include <sys/trap.h>

#define	ICACHE_LINE_SIZE	64
#define	BLOCK_SIZE	64
#define	FPRS_FEF	0x4

#define	SHORTCOPY	3
#define	SMALL_MAX	39
#define	MEDIUM_MAX	255
#define	MED_WMAX	256	/* max copy for medium word-aligned case */
#define	MED_MAX		256	/* max copy for medium longword-aligned case */

#ifndef BSTORE_SIZE
#define	BSTORE_SIZE	256	/* min copy size for block store */
#endif

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

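/*
 * memmove(s1, s2, len)
 *
 * Copy s2 to s1, handling overlap.  The entry point below chooses the
 * copy direction; the C program that follows is only an illustrative
 * sketch of that decision (it is not the source the assembler code was
 * generated from): copy forward when the source starts at or above the
 * destination, or when the regions cannot overlap destructively;
 * otherwise copy backward.  The first branch corresponds to .forcpy,
 * the second to .ovbc.
 *
 *	void *
 *	memmove(void *s, const void *s0, size_t n)
 *	{
 *		char *s1 = s;
 *		const char *s2 = s0;
 *		if (s2 >= s1 || (size_t)(s1 - s2) >= n) {
 *			while (n-- != 0)
 *				*s1++ = *s2++;
 *		} else {
 *			s1 += n;
 *			s2 += n;
 *			while (n-- != 0)
 *				*--s1 = *--s2;
 *		}
 *		return (s);
 *	}
 */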
	ENTRY(memmove)
	cmp	%o1, %o0	! if src address >= dst, forward copy is safe
	bgeu	%ncc, .forcpy	! so go use the forward copy code
	sub	%o0, %o1, %o4	! get difference of the two addresses
	cmp	%o2, %o4	! compare size with the address difference
	bleu	%ncc, .forcpy	! size fits in the gap, no destructive overlap
	nop

	!
	! an overlapped copy that must be done "backwards"
	!
.ovbc:
	mov	%o0, %g1		! save dest address for return val
	add	%o1, %o2, %o1		! get to end of source space
	add	%o0, %o2, %o0		! get to end of destination space

	cmp	%o2, 24
	bgeu,pn	%ncc, .dbalign
	nop
	cmp	%o2, 4
	blt,pn	%ncc, .byte
	sub	%o2, 3, %o2
.byte4loop:
	ldub	[%o1-1], %o3		! load last byte
	stb	%o3, [%o0-1]		! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3		! load 2nd from last byte
	stb	%o3, [%o0-2]		! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3		! load 3rd from last byte
	stb	%o3, [%o0+1]		! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3		! load 4th from last byte
	bgu,pt	%ncc, .byte4loop
	stb	%o3, [%o0]		! store 4th from last byte
.byte:
	addcc	%o2, 3, %o2
	bz,pt	%ncc, .exit
.byteloop:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o2			! decrement count
	bgu,pt	%ncc, .byteloop		! loop until done
	stb	%o3, [%o0]		! write byte
.exit:
	retl
	mov	%g1, %o0

	.align	16
.dbalign:
	andcc	%o0, 7, %o5		! bytes till DST 8 byte aligned
	bz,pt	%ncc, .dbmed
	sub	%o2, %o5, %o2		! update count
.dbalign1:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o5			! decrement count
	bgu,pt	%ncc, .dbalign1		! loop until done
	stb	%o3, [%o0]		! store a byte

! check for src long word alignment
.dbmed:
	andcc	%o1, 7, %g0		! chk src long word alignment
	bnz,pn	%ncc, .dbbck
	nop
!
! Following code is for overlapping copies where src and dest
! are long word aligned
!
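!
! In rough C terms (illustrative only; the real loop below is unrolled
! to 64 bytes per iteration and adds prefetching), and remembering that
! src and dst point just past the end of each region at this point:
!
!	while (n >= 8) {
!		src -= 8; dst -= 8; n -= 8;
!		*(uint64_t *)dst = *(uint64_t *)src;
!	}
!	then copy any trailing 1-7 bytes one at a time
!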
	cmp	%o2, 4095
	blt,pn	%ncc, .dbmedl32enter	! go to no prefetch code
	nop
	prefetch [%o1 - (1 * BLOCK_SIZE)], 20	! into the prefetch cache
	sub	%o2, 63, %o2		! adjust length to allow cc test
					! for end of loop
	prefetch [%o1 - (2 * BLOCK_SIZE)], 20	! into the prefetch cache
	rd	%fprs, %o3		! o3 = fprs
	! if fprs.fef == 0, set it. Checking it requires 2 instructions.
	! So set it anyway, without checking.
	prefetch [%o1 - (3 * BLOCK_SIZE)], 20	! into the prefetch cache
	wr	%g0, 0x4, %fprs		! fprs.fef = 1
	prefetch [%o1 - (4 * BLOCK_SIZE)], 20	! into the prefetch cache
.dbmedl64:
	prefetch [%o1 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
	ldd	[%o1-8], %d4		! load
	subcc	%o2, 64, %o2		! decrement length count
	std	%d4, [%o0-8]		! and store
	ldd	[%o1-16], %d2		! a block of 64 bytes
	sub	%o1, 64, %o1		! decrease src ptr by 64
	std	%d2, [%o0-16]
	sub	%o0, 64, %o0		! decrease dst ptr by 64
	ldd	[%o1+40], %d4
	std	%d4, [%o0+40]
	ldd	[%o1+32], %d2
	std	%d2, [%o0+32]
	ldd	[%o1+24], %d4
	std	%d4, [%o0+24]
	ldd	[%o1+16], %d2
	std	%d2, [%o0+16]
	ldd	[%o1+8], %d4
	std	%d4, [%o0+8]
	ldd	[%o1], %d2
	bgu,pt	%ncc, .dbmedl64		! repeat if at least 64 bytes left
	std	%d2, [%o0]
	add	%o2, 63, %o2		! restore offset adjustment
	and	%o3, 0x4, %o3		! fprs.du = fprs.dl = 0
	wr	%o3, %g0, %fprs		! fprs = o3   restore fprs
.dbmedl32enter:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .dbmedl31		! skip big loop if less than 32
	nop
.dbmedl32:
	ldx	[%o1-8], %o4		! load
	subcc	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0-8]		! and store
	ldx	[%o1-16], %o3		! a block of 32 bytes
	sub	%o1, 32, %o1		! decrease src ptr by 32
	stx	%o3, [%o0-16]
	ldx	[%o1+8], %o4
	sub	%o0, 32, %o0		! decrease dst ptr by 32
	stx	%o4, [%o0+8]
	ldx	[%o1], %o3
	bgu,pt	%ncc, .dbmedl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0]
.dbmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .dbmedl15		! skip if 15 or fewer bytes left
	nop				!
	ldx	[%o1-8], %o4		! load and store 16 bytes
	sub	%o1, 16, %o1		! decrease src ptr by 16
	stx	%o4, [%o0-8]		!
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1], %o3		!
	sub	%o0, 16, %o0		! decrease dst ptr by 16
	stx	%o3, [%o0]
.dbmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .dbexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .dbremain		! skip if 7 or fewer bytes left
	nop
	ldx	[%o1-8], %o4		! load 8 bytes
	sub	%o1, 8, %o1		! decrease src ptr by 8
	stx	%o4, [%o0-8]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz	%ncc, .dbremain		! exit if finished
	sub	%o0, 8, %o0		! decrease dst ptr by 8
	retl
	mov	%g1, %o0

!
! Following code is for overlapping copies where src and dest
! are not long word aligned
!
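!
! The code below uses alignaddr/faligndata: each 8-byte store is assembled
! from two adjacent aligned 8-byte loads.  An illustrative C sketch of one
! such extraction on this big-endian machine (names are hypothetical; off
! is SRC's offset within its doubleword, 1..7 on this path):
!
!	uint64_t w0 = ((uint64_t *)(src - off))[0];	earlier aligned word
!	uint64_t w1 = ((uint64_t *)(src - off))[1];	later aligned word
!	uint64_t chunk = (w0 << (off * 8)) | (w1 >> ((8 - off) * 8));
!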
	.align	16
.dbbck:
	rd	%fprs, %o3		! o3 = fprs

	! if fprs.fef == 0, set it. Checking it requires 2 instructions.
	! So set it anyway, without checking.
	wr	%g0, 0x4, %fprs		! fprs.fef = 1

	alignaddr %o1, %g0, %o5		! align src
	ldd	[%o5], %d0		! get first 8 byte block
	andn	%o2, 7, %o4		! prepare src ptr for finishup code
	cmp	%o2, 32
	blt,pn	%ncc, .dbmv8
	sub	%o1, %o4, %o1		!
	cmp	%o2, 4095		! check for short memmoves
	blt,pn	%ncc, .dbmv32enter	! go to no prefetch code
.dbmv64:
	ldd	[%o5-8], %d2		! load 8 bytes
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 64, %o5		!
	ldd	[%o5+40], %d6		! load 8 bytes
	sub	%o0, 64, %o0		!
	ldd	[%o5+32], %d8		! load 8 bytes
	sub	%o2, 64, %o2		! 64 less bytes to copy
	ldd	[%o5+24], %d18		! load 8 bytes
	cmp	%o2, 64			! do we have < 64 bytes remaining
	ldd	[%o5+16], %d28		! load 8 bytes
	ldd	[%o5+8], %d30		! load 8 bytes
	prefetch [%o5 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0		! load 8 bytes
	std	%d10, [%o0+56]		! store the current 8 bytes
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+48]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+40]		! store the current 8 bytes
	faligndata %d8, %d6, %d16	! extract 8 bytes out
	std	%d16, [%o0+32]		! store the current 8 bytes
	faligndata %d18, %d8, %d20	! extract 8 bytes out
	std	%d20, [%o0+24]		! store the current 8 bytes
	faligndata %d28, %d18, %d22	! extract 8 bytes out
	std	%d22, [%o0+16]		! store the current 8 bytes
	faligndata %d30, %d28, %d24	! extract 8 bytes out
	std	%d24, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d30, %d26	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv64
	std	%d26, [%o0]		! store the current 8 bytes

	cmp	%o2, 32
	blt,pn	%ncc, .dbmvx
	nop
.dbmv32:
	ldd	[%o5-8], %d2		! load 8 bytes
.dbmv32enter:
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 32, %o5		!
	ldd	[%o5+8], %d6		! load 8 bytes
	sub	%o0, 32, %o0		!
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0		! load 8 bytes
	sub	%o2,32, %o2		! 32 less bytes to copy
	std	%d10, [%o0+24]		! store the current 8 bytes
	cmp	%o2, 32			! do we have < 32 bytes remaining
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+16]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d6, %d16	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv32
	std	%d16, [%o0]		! store the current 8 bytes
.dbmvx:
	cmp	%o2, 8			! do we have < 8 bytes remaining
	blt,pt	%ncc, .dbmvfinish	! if yes, skip to finish up code
	nop
.dbmv8:
	ldd	[%o5-8], %d2
	sub	%o0, 8, %o0		! since we are at the end
					! when we first enter the loop
	sub	%o2, 8, %o2		! 8 less bytes to copy
	sub	%o5, 8, %o5
	cmp	%o2, 8			! do we have < 8 bytes remaining
	faligndata %d2, %d0, %d8	! extract 8 bytes out
	std	%d8, [%o0]		! store the current 8 bytes
	bgeu,pt	%ncc, .dbmv8
	fmovd	%d2, %d0
.dbmvfinish:
	and	%o3, 0x4, %o3		! fprs.du = fprs.dl = 0
	tst	%o2
	bz,pt	%ncc, .dbexit
	wr	%o3, %g0, %fprs		! fprs = o3   restore fprs

.dbremain:
	cmp	%o2, 4
	blt,pn	%ncc, .dbbyte
	nop
	ldub	[%o1-1], %o3		! load last byte
	stb	%o3, [%o0-1]		! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3		! load 2nd from last byte
	stb	%o3, [%o0-2]		! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3		! load 3rd from last byte
	stb	%o3, [%o0+1]		! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3		! load 4th from last byte
	stb	%o3, [%o0]		! store 4th from last byte
	bz,pt	%ncc, .dbexit
.dbbyte:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o2			! decrement count
	bgu,pt	%ncc, .dbbyte		! loop until done
	stb	%o3, [%o0]		! write byte
.dbexit:
	retl
	mov	%g1, %o0
	SET_SIZE(memmove)


	.align ICACHE_LINE_SIZE
	ENTRY(memcpy)
					! adjust instruction alignment
	nop				! Do not remove, these nops affect
	nop				! icache alignment and performance
.forcpy:
	cmp	%o2, SMALL_MAX		! check for not small case
	bgu,pn	%ncc, .medium		! go to larger cases
	mov	%o0, %g1		! save %o0
	cmp	%o2, SHORTCOPY		! check for really short case
	ble,pt	%ncc, .smallleft	!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .smallword	! branch to word aligned case
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.smallnotalign4:
	ldub	[%o1], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o3, [%o0]		! write byte
	ldub	[%o1+1], %o3		! repeat for a total of 4 bytes
	add	%o1, 4, %o1		! advance SRC by 4
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		! advance DST by 4
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o0-1]
	add	%o2, 3, %o2		! restore count
.smallleft:
	tst	%o2
	bz,pt	%ncc, .smallexit
	nop
.smallleft3:				! 1, 2, or 3 bytes remain
	ldub	[%o1], %o3		! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
	retl
	mov	%g1, %o0		! restore %o0

	.align	16
	nop				! affects loop icache alignment
.smallwords:
	lduw	[%o1], %o3		! read word
.smallwordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o0]		! write word
	add	%o1, 8, %o1		! update SRC
	lduw	[%o1-4], %o3		! read word
	add	%o0, 8, %o0		! update DST
	bgu,pt	%ncc, .smallwords	! loop until done
	stw	%o3, [%o0-4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .smallexit	! check for completion
	nop
	cmp	%o2, 4			! check for 4 or more bytes left
	blt	.smallleft3		! if not, go to finish up
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0		! restore %o0

.smallword:
	subcc	%o2, 4, %o2		! update count
	bgu,pt	%ncc, .smallwordx
	lduw	[%o1], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallexit
	stw	%o3, [%o0]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o1+4], %o3		! load one byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+4]		! store one byte
	ldub	[%o1+5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+5]		! store second byte
	ldub	[%o1+6], %o3		! load third byte
	stb	%o3, [%o0+6]		! store third byte
.smallexit:
	retl
	mov	%g1, %o0		! restore %o0
	.align 16
.medium:
	neg	%o0, %o5
	neg	%o1, %o3
	andcc	%o5, 7, %o5	! bytes till DST 8 byte aligned
	and	%o3, 7, %o3	! bytes till SRC 8 byte aligned

	bz	%ncc, 2f
	sub	%o5, %o3, %o3	! -(bytes till SRC aligned after DST aligned)
				! o3={-7, -6, ... 7}  o3>0 => SRC overaligned

	sub	%o2, %o5, %o2	! update count

1:
	ldub	[%o1], %o4
	deccc	%o5
	inc	%o1
	stb	%o4, [%o0]
	bgu,pt	%ncc, 1b
	inc	%o0

	! Now DST is 8-byte aligned.  o0, o1, o2 are current.

2:
	andcc	%o1, 0x3, %g0		! test alignment
	bnz,pt	%ncc, .mediumsetup	! branch to skip aligned cases
					! if src, dst not aligned
	prefetch [%o1 + (1 * BLOCK_SIZE)], 20

/*
 * Handle all cases where src and dest are aligned on word
 * or long word boundaries.  Use unrolled loops for better
 * performance.  This option wins over standard large data
 * move when source and destination is in cache for medium
 * to short data moves.
 */
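/*
 * An illustrative C sketch (not the generating source) of the word-aligned
 * loop at .medw16 below; the long-word-aligned loop at .medl32 is the same
 * idea with 8-byte loads and stores and a 32-byte unroll:
 *
 *	while (n >= 16) {
 *		((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
 *		((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
 *		((uint32_t *)dst)[2] = ((const uint32_t *)src)[2];
 *		((uint32_t *)dst)[3] = ((const uint32_t *)src)[3];
 *		src += 16; dst += 16; n -= 16;
 *	}
 */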
	andcc	%o1, 0x7, %g0		! test word alignment
	bz,pt	%ncc, .medlword		! branch to long word aligned case
	prefetch [%o1 + (2 * BLOCK_SIZE)], 20
	cmp	%o2, MED_WMAX		! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	nop
	subcc	%o2, 15, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medw15		! skip big loop if less than 16
	prefetch [%o1 + (3 * BLOCK_SIZE)], 20
/*
 * no need to put prefetch in loop as prefetches have
 * already been issued for maximum loop size
 */
.medw16:
	ld	[%o1], %o4		! load
	subcc	%o2, 16, %o2		! decrement length count
	stw	%o4, [%o0]		! and store
	ld	[%o1+4], %o3		! a block of 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stw	%o3, [%o0+4]
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		! increase dst ptr by 16
	stw	%o4, [%o0-8]
	ld	[%o1-4], %o3
	bgu,pt	%ncc, .medw16		! repeat if at least 16 bytes left
	stw	%o3, [%o0-4]
.medw15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	nop				!
	ld	[%o1], %o4		! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	ld	[%o1-4], %o3		! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	stw	%o3, [%o0-4]		! and store 4 bytes
	bz	%ncc, .medwexit		! exit if finished
	nop
.medw7:					! count is ge 1, less than 8
	cmp	%o2, 3			! check for 4 bytes left
	ble,pt	%ncc, .medw3		! skip if 3 or fewer bytes left
	nop				!
	ld	[%o1], %o4		! load 4 bytes
	sub	%o2, 4, %o2		! decrease count by 4
	add	%o1, 4, %o1		! increase src ptr by 4
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o0, 4, %o0		! increase dst ptr by 4
	tst	%o2			! check for zero bytes left
	bz	%ncc, .medwexit		! exit if finished
	nop
.medw3:					! count is known to be 1, 2, or 3
	deccc	%o2			! reduce count by one
	ldub	[%o1], %o3		! load one byte
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2			! reduce count by one
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
.medwexit:
	retl
	mov	%g1, %o0		! restore %o0

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is between SMALL_MAX and MED_MAX bytes
 */

	.align 16
	nop
.medlword:				! long word aligned
					! length > SMALL_MAX
	cmp	%o2, MED_MAX		! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	nop
	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medl31		! skip big loop if less than 32
	prefetch [%o1 + (3 * BLOCK_SIZE)], 20	! into the l2 cache
/*
 * no need to put prefetch in loop as prefetches have
 * already been issued for maximum loop size
 */
.medl32:
	ldx	[%o1], %o4		! load
	subcc	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 32 bytes
	add	%o1, 32, %o1		! increase src ptr by 32
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		! increase dst ptr by 32
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%ncc, .medl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0-8]
.medl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
	nop				!
	ldx	[%o1], %o4		! load and store 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stx	%o4, [%o0]		!
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1-8], %o3		!
	add	%o0, 16, %o0		! increase dst ptr by 16
	stx	%o3, [%o0-8]
.medl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	stx	%o4, [%o0]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bz	%ncc, .medwexit		! exit if finished
	add	%o0, 8, %o0		! increase dst ptr by 8
	ba	.medw7
	nop

	.align 16
	nop
	nop
	nop
.mediumsetup:
	prefetch [%o1 + (2 * BLOCK_SIZE)], 21
.mediumrejoin:
	rd	%fprs, %o4		! check for unused FPU

	add	%o1, 8, %o1		! prepare to round SRC upward

	sethi	%hi(0x1234567f), %o5	! For GSR.MASK
	or	%o5, 0x67f, %o5

	andcc	%o4, FPRS_FEF, %o4	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 3f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
3:
	cmp	%o2, MEDIUM_MAX
	bmask	%o5, %g0, %g0

	! Compute o5 (number of bytes that need copying using the main loop).
	! First, compute for the medium case.
	! Then, if large case, o5 is replaced by count for block alignment.
	! Be careful not to read past end of SRC
	! Currently, o2 is the actual count remaining
	!            o3 is how much sooner we'll cross the alignment boundary
	!                in SRC compared to in DST
	!
	! Examples:  Let # denote bytes that should not be accessed
	!            Let x denote a byte already copied to align DST
	!            Let . and - denote bytes not yet copied
	!            Let | denote double alignment boundaries
	!
	!            DST:  ######xx|........|--------|..######   o2 = 18
	!                          o0
	!
	!  o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
	!                          o1
	!
	!  o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
	!                                   o1
	!
	!  o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
	!                                   o1
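	!
	! In C-like terms (illustrative only), the instructions below compute
	! roughly
	!	o5 = (o2 + o3 + (o3 >= 0 ? -8 : 0)) & ~7
	! for the medium case: an 8-byte multiple, backed off by one extra
	! double when o3 >= 0 so the loop never reads past the end of SRC.
	! For the large case, o5 is then replaced by the number of bytes
	! until DST is 64-byte block aligned.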

	or	%g0, -8, %o5
	alignaddr %o1, %g0, %o1		! set GSR.ALIGN and align o1

	movrlz	%o3, %g0, %o5		! subtract 8 from o2+o3 only if o3>=0
	add	%o5, %o2, %o5
	add	%o5, %o3, %o5

	bleu	%ncc, 4f
	andn	%o5, 7, %o5		! 8 byte aligned count
	neg	%o0, %o5		! 'large' case
	and	%o5, BLOCK_SIZE-1, %o5	! bytes till DST block aligned
4:
	brgez,a	%o3, .beginmedloop
	ldd	[%o1-8], %d0

	add	%o1, %o3, %o1		! back up o1
5:
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	andcc	%o1, 7, %g0
	bnz	%ncc, 5b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

.beginmedloop:
	tst	%o5
	bz	%ncc, .endmedloop
	sub	%o2, %o5, %o2		! update count for later

	! Main loop to write out doubles.  Note: o5 & 7 == 0

	ldd	[%o1], %d2
	subcc	%o5, 8, %o5		! update local count
	bz,pn	%ncc, 1f
	add	%o1, 8, %o1		! update SRC

.medloop:
	faligndata %d0, %d2, %d4
	ldd	[%o1], %d0
	subcc	%o5, 8, %o5		! update local count
	add	%o1, 16, %o1		! update SRC
	std	%d4, [%o0]
	bz,pn	%ncc, 2f
	faligndata %d2, %d0, %d6
	ldd	[%o1 - 8], %d2
	subcc	%o5, 8, %o5		! update local count
	std	%d6, [%o0 + 8]
	bnz,pt	%ncc, .medloop
	add	%o0, 16, %o0		! update DST

1:
	faligndata %d0, %d2, %d4
	fmovd	%d2, %d0
	std	%d4, [%o0]
	ba	.endmedloop
	add	%o0, 8, %o0

2:
	std	%d6, [%o0 + 8]
	sub	%o1, 8, %o1
	add	%o0, 16, %o0


.endmedloop:
	! Currently, o1 is pointing to the next double-aligned byte in SRC
	! The 8 bytes starting at [o1-8] are available in d0
	! At least one, and possibly all, of these need to be written.

	cmp	%o2, BLOCK_SIZE
	bgu	%ncc, .large		! otherwise, less than 16 bytes left

#if 0

	/* This code will use partial stores.  */

	mov	%g0, %o5
	and	%o3, 7, %o3		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.

	subcc	%o2, 8, %o2		! update count (maybe too much)
	movl	%ncc, %o2, %o5
	addcc	%o3, %o5, %o5		! extra bytes we can stuff into %d0
	sub	%o3, %o5, %o3		! update o3 (# bad bytes in %d0)

	bz	%ncc, 2f
	alignaddr %o3, %g0, %g0		! set GSR.ALIGN

1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

2:
	not	%o3
	faligndata %d0, %d0, %d0	! shift bytes to the left
	and	%o3, 7, %o3		! last byte to be stored in [%o0+%o3]
	edge8n	%g0, %o3, %o5
	stda	%d0, [%o0]%o5, ASI_PST8_P
	brlez	%o2, .mediumexit
	add	%o0, %o3, %o0		! update DST to last stored byte
3:
	inc	%o0
	deccc	%o2
	ldub	[%o1], %o3
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o1

#else

	andcc	%o3, 7, %o5		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.
	bz	%ncc, 2f
	sub	%o5, 8, %o3		! -(number of good bytes in %d0)
	cmp	%o2, 8
	bl,a	%ncc, 3f		! Not enough bytes to fill %d0
	add	%o1, %o3, %o1 		! Back up %o1

1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

2:
	subcc	%o2, 8, %o2
	std	%d0, [%o0]
	bz	%ncc, .mediumexit
	add	%o0, 8, %o0
3:
	ldub	[%o1], %o3
	deccc	%o2
	inc	%o1
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o0
#endif

.mediumexit:
	wr	%o4, %g0, %fprs		! fprs = o4   restore fprs
	retl
	mov	%g1, %o0


	.align ICACHE_LINE_SIZE
.large:
	! The following test for BSTORE_SIZE is used to decide whether
	! to store data with a block store or with individual stores.
	! The block store wins when the amount of data is so large
	! that it causes other application data to be moved out
	! of the L1 or L2 cache.
	! On a Panther, block store can lose more often because block
	! store forces the stored data to be removed from the L3 cache.
	!
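	! In C-like terms (illustrative only):
	!	if (count > BSTORE_SIZE)
	!		goto xlarge;
	! .xlarge writes with block stores (stda ASI_BLK_P) that bypass the
	! caches; the path below keeps ordinary stores in its main loop so
	! the destination stays cache resident, and only the final full
	! block is written with a block store.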
	sethi	%hi(BSTORE_SIZE),%o5
	or	%o5,%lo(BSTORE_SIZE),%o5
	cmp	%o2, %o5
	bgu	%ncc, .xlarge

	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5   O The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST

	prefetch [%o0 + (0 * BLOCK_SIZE)], 22
	prefetch [%o0 + (1 * BLOCK_SIZE)], 22
	prefetch [%o0 + (2 * BLOCK_SIZE)], 22
	ldd	[%o1], %f2
	prefetch [%o1 + (3 * BLOCK_SIZE)], 21
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1		! update SRC

	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	std	%f32, [%o0]
	std	%f34, [%o0+8]
	std	%f36, [%o0+16]
	std	%f38, [%o0+24]
	std	%f40, [%o0+32]
	std	%f42, [%o0+40]
	std	%f44, [%o0+48]
	std	%f46, [%o0+56]
	sub	%o2, BLOCK_SIZE, %o2		! update count
	prefetch [%o0 + (6 * BLOCK_SIZE)], 22
	prefetch [%o0 + (3 * BLOCK_SIZE)], 22
	add	%o0, BLOCK_SIZE, %o0		! update DST
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	cmp	%o2, BLOCK_SIZE + 8
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P		! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2

2:
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore

	ba	.beginmedloop
	andn	%o5, 7, %o5		! 8 byte aligned count


	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs
	retl
	mov	%g1, %o0


	.align 16
	! two nops here cause the loop starting at 1f below to be
	! on a cache line boundary, improving performance
	nop
	nop
.xlarge:
	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5   O The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST

	! prefetch [%o1 + (3 * BLOCK_SIZE)], 21
	! executed in delay slot for branch to .xlarge
	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	ldd	[%o1], %f2
	prefetch [%o1 + (6 * BLOCK_SIZE)], 21
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (7 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	! This point is 32-byte aligned since 24 instructions appear since
	! the previous alignment directive.


	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P
	sub	%o2, BLOCK_SIZE, %o2		! update count
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K
	prefetch [%o1 + (8 * BLOCK_SIZE) + 8], 21
	add	%o0, BLOCK_SIZE, %o0		! update DST
	cmp	%o2, BLOCK_SIZE + 8
	! second prefetch important to correct for occasional dropped
	! initial prefetches, 5*BLK works best over range of (src-dst) mod 1K.
	! strong prefetch prevents drops on Panther, but Jaguar and earlier
	! US-III models treat strong prefetches as weak prefetches;
	! to avoid regressions on customer hardware, we retain the prefetch
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1		! update SRC

	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P		! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2

2:
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore

	ba	.beginmedloop
	andn	%o5, 7, %o5		! 8 byte aligned count


	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs
	retl
	mov	%g1, %o0

	SET_SIZE(memcpy)