1*5d9d9091SRichard Lowe/*
2*5d9d9091SRichard Lowe * CDDL HEADER START
3*5d9d9091SRichard Lowe *
4*5d9d9091SRichard Lowe * The contents of this file are subject to the terms of the
5*5d9d9091SRichard Lowe * Common Development and Distribution License (the "License").
6*5d9d9091SRichard Lowe * You may not use this file except in compliance with the License.
7*5d9d9091SRichard Lowe *
8*5d9d9091SRichard Lowe * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*5d9d9091SRichard Lowe * or http://www.opensolaris.org/os/licensing.
10*5d9d9091SRichard Lowe * See the License for the specific language governing permissions
11*5d9d9091SRichard Lowe * and limitations under the License.
12*5d9d9091SRichard Lowe *
13*5d9d9091SRichard Lowe * When distributing Covered Code, include this CDDL HEADER in each
14*5d9d9091SRichard Lowe * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*5d9d9091SRichard Lowe * If applicable, add the following below this CDDL HEADER, with the
16*5d9d9091SRichard Lowe * fields enclosed by brackets "[]" replaced with your own identifying
17*5d9d9091SRichard Lowe * information: Portions Copyright [yyyy] [name of copyright owner]
18*5d9d9091SRichard Lowe *
19*5d9d9091SRichard Lowe * CDDL HEADER END
20*5d9d9091SRichard Lowe */
21*5d9d9091SRichard Lowe
22*5d9d9091SRichard Lowe/*
23*5d9d9091SRichard Lowe * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24*5d9d9091SRichard Lowe */
25*5d9d9091SRichard Lowe
26*5d9d9091SRichard Lowe	.file	"memcpy.s"
27*5d9d9091SRichard Lowe
28*5d9d9091SRichard Lowe/*
29*5d9d9091SRichard Lowe * memcpy(s1, s2, len)
30*5d9d9091SRichard Lowe *
31*5d9d9091SRichard Lowe * Copy s2 to s1, always copy n bytes.
32*5d9d9091SRichard Lowe * Note: this C code does not work for overlapped copies.
33*5d9d9091SRichard Lowe *       Memmove() and bcopy() do.
34*5d9d9091SRichard Lowe *
35*5d9d9091SRichard Lowe * The additional entry point __align_cpy_1 is provided primarily for use by compilers.
36*5d9d9091SRichard Lowe *
37*5d9d9091SRichard Lowe * Fast assembler language version of the following C-program for memcpy
38*5d9d9091SRichard Lowe * which represents the `standard' for the C-library.
39*5d9d9091SRichard Lowe *
40*5d9d9091SRichard Lowe *	void *
41*5d9d9091SRichard Lowe *	memcpy(void *s, const void *s0, size_t n)
42*5d9d9091SRichard Lowe *	{
43*5d9d9091SRichard Lowe *		if (n != 0) {
44*5d9d9091SRichard Lowe *		    char *s1 = s;
45*5d9d9091SRichard Lowe *		    const char *s2 = s0;
46*5d9d9091SRichard Lowe *		    do {
47*5d9d9091SRichard Lowe *			*s1++ = *s2++;
48*5d9d9091SRichard Lowe *		    } while (--n != 0);
49*5d9d9091SRichard Lowe *		}
50*5d9d9091SRichard Lowe *		return (s);
51*5d9d9091SRichard Lowe *	}
52*5d9d9091SRichard Lowe *
53*5d9d9091SRichard Lowe *
54*5d9d9091SRichard Lowe * N1 Flow :
55*5d9d9091SRichard Lowe *
56*5d9d9091SRichard Lowe * if (count < 17) {
57*5d9d9091SRichard Lowe *	Do the byte copy
58*5d9d9091SRichard Lowe *	Return destination address
59*5d9d9091SRichard Lowe * }
60*5d9d9091SRichard Lowe * if (count < 128) {
61*5d9d9091SRichard Lowe *	Is the source aligned on a word boundary?
62*5d9d9091SRichard Lowe *	If not, align the source on a word boundary, then goto .ald
63*5d9d9091SRichard Lowe *	If yes, goto .ald
64*5d9d9091SRichard Lowe *	.ald:
65*5d9d9091SRichard Lowe *		Is the destination aligned on a word boundary?
66*5d9d9091SRichard Lowe *		Depending on destination offset (last 2 bits of destination)
67*5d9d9091SRichard Lowe *		copy data by shifting and merging.
68*5d9d9091SRichard Lowe *		Copy residue bytes as byte copy
69*5d9d9091SRichard Lowe *		Return destination address
70*5d9d9091SRichard Lowe * } else {
71*5d9d9091SRichard Lowe *	Align destination on block boundary
72*5d9d9091SRichard Lowe *	Depending on the source offset (last 4 bits of source address) align
73*5d9d9091SRichard Lowe *	the data and store to destination. Both the load and store are done
74*5d9d9091SRichard Lowe *	using ASI_BLK_INIT_ST_QUAD_LDD_P.
75*5d9d9091SRichard Lowe *	For remaining count copy as much data in 8-byte chunk from source to
76*5d9d9091SRichard Lowe *	destination.
77*5d9d9091SRichard Lowe *	Followed by trailing copy using byte copy.
78*5d9d9091SRichard Lowe *	Return saved destination address
79*5d9d9091SRichard Lowe * }
80*5d9d9091SRichard Lowe *
81*5d9d9091SRichard Lowe *
82*5d9d9091SRichard Lowe * N2 Flow :
84*5d9d9091SRichard Lowe *
85*5d9d9091SRichard Lowe * if (count < 128) {
86*5d9d9091SRichard Lowe *   if count < 3
87*5d9d9091SRichard Lowe *	copy bytes; exit with dst addr
88*5d9d9091SRichard Lowe *   if src & dst aligned on word boundary but not long word boundary,
89*5d9d9091SRichard Lowe *     copy with ldw/stw; branch to finish_up
90*5d9d9091SRichard Lowe *   if src & dst aligned on long word boundary
91*5d9d9091SRichard Lowe *     copy with ldx/stx; branch to finish_up
92*5d9d9091SRichard Lowe *   if src & dst not aligned and length <= 14
93*5d9d9091SRichard Lowe *     copy bytes; exit with dst addr
94*5d9d9091SRichard Lowe *   move enough bytes to get src to word boundary
95*5d9d9091SRichard Lowe *   if dst now on word boundary
96*5d9d9091SRichard Lowe * move_words:
97*5d9d9091SRichard Lowe *     copy words; branch to finish_up
98*5d9d9091SRichard Lowe *   if dst now on half word boundary
99*5d9d9091SRichard Lowe *     load words, shift half words, store words; branch to finish_up
100*5d9d9091SRichard Lowe *   if dst on byte 1
101*5d9d9091SRichard Lowe *     load words, shift 3 bytes, store words; branch to finish_up
102*5d9d9091SRichard Lowe *   if dst on byte 3
103*5d9d9091SRichard Lowe *     load words, shift 1 byte, store words; branch to finish_up
104*5d9d9091SRichard Lowe * finish_up:
105*5d9d9091SRichard Lowe *     copy bytes; exit with dst addr
106*5d9d9091SRichard Lowe * } else {                                         More than 128 bytes
107*5d9d9091SRichard Lowe *   move bytes until dst is on long word boundary
108*5d9d9091SRichard Lowe *   if( src is on long word boundary ) {
109*5d9d9091SRichard Lowe *     if (count < 512) {
110*5d9d9091SRichard Lowe * finish_long:				           src/dst aligned on 8 bytes
111*5d9d9091SRichard Lowe *       copy with ldx/stx in 8-way unrolled loop;
112*5d9d9091SRichard Lowe *       copy final 0-63 bytes; exit with dst addr
113*5d9d9091SRichard Lowe *     } else {                                 src/dst aligned; count > 512
114*5d9d9091SRichard Lowe *       align dst on 64 byte boundary; use 8-way test for each of 8 possible
115*5d9d9091SRichard Lowe *       src alignments relative to a 64 byte boundary to select the
116*5d9d9091SRichard Lowe *       16-way unrolled loop to use for
117*5d9d9091SRichard Lowe *       block load, fmovd, block-init-store, block-store, fmovd operations
118*5d9d9091SRichard Lowe *       then go to finish_long.
119*5d9d9091SRichard Lowe *     }
120*5d9d9091SRichard Lowe *   } else {                                   src/dst not aligned on 8 bytes
121*5d9d9091SRichard Lowe *     if src is word aligned and count < 512
122*5d9d9091SRichard Lowe *       move words in 8-way unrolled loop
123*5d9d9091SRichard Lowe *       move final 0-31 bytes; exit with dst addr
124*5d9d9091SRichard Lowe *     if count < 512
125*5d9d9091SRichard Lowe *       use alignaddr/faligndata combined with ldd/std in 8-way
126*5d9d9091SRichard Lowe *       unrolled loop to move data.
127*5d9d9091SRichard Lowe *       go to unalign_done
128*5d9d9091SRichard Lowe *     else
129*5d9d9091SRichard Lowe *       setup alignaddr for faligndata instructions
130*5d9d9091SRichard Lowe *       align dst on 64 byte boundary; use 8-way test for each of 8 possible
131*5d9d9091SRichard Lowe *       src alignments to nearest long word relative to 64 byte boundary to
132*5d9d9091SRichard Lowe *       select the 8-way unrolled loop to use for
133*5d9d9091SRichard Lowe *       block load, falign, fmovd, block-init-store, block-store loop
134*5d9d9091SRichard Lowe *	 (only use block-init-store when src/dst on 8 byte boundaries.)
135*5d9d9091SRichard Lowe * unalign_done:
136*5d9d9091SRichard Lowe *       move remaining bytes for unaligned cases. exit with dst addr.
137*5d9d9091SRichard Lowe * }
138*5d9d9091SRichard Lowe *
139*5d9d9091SRichard Lowe * Comment on N2 memmove and memcpy common code and block-store-init:
140*5d9d9091SRichard Lowe *   In the man page for memmove, it specifies that copying will take place
141*5d9d9091SRichard Lowe *   correctly between objects that overlap.  For memcpy, behavior is
142*5d9d9091SRichard Lowe *   undefined for objects that overlap.
143*5d9d9091SRichard Lowe *
144*5d9d9091SRichard Lowe *   In rare cases, some multi-threaded applications may attempt to examine
145*5d9d9091SRichard Lowe *   the copy destination buffer during the copy. Using the block-store-init
146*5d9d9091SRichard Lowe *   instruction can cause those applications to observe zeros in some
147*5d9d9091SRichard Lowe *   cache lines of the destination buffer for narrow windows. But
148*5d9d9091SRichard Lowe *   block-store-init provides memory throughput advantages for many
149*5d9d9091SRichard Lowe *   common applications. To meet both needs, those applications which need
150*5d9d9091SRichard Lowe *   the destination buffer to retain meaning during the copy should use
151*5d9d9091SRichard Lowe *   memmove instead of memcpy.  The memmove version duplicates the memcpy
152*5d9d9091SRichard Lowe *   algorithms except the memmove version does not use block-store-init
153*5d9d9091SRichard Lowe *   in those cases where memcpy does use block-store-init. Otherwise, when
154*5d9d9091SRichard Lowe *   memmove can determine the source and destination do not overlap,
155*5d9d9091SRichard Lowe *   memmove shares the memcpy code.
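 *
 *   For example (an illustration only; shared_buf and rec are hypothetical
 *   names, not part of this code): a producer that publishes data with
 *	memcpy(shared_buf, rec, len);
 *   while another thread concurrently polls shared_buf may transiently
 *   observe zeroed cache lines there; such a producer should call
 *	memmove(shared_buf, rec, len);
 *   instead, which takes the same code paths but without block-store-init.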
156*5d9d9091SRichard Lowe */
157*5d9d9091SRichard Lowe
158*5d9d9091SRichard Lowe#include <sys/asm_linkage.h>
159*5d9d9091SRichard Lowe#include <sys/niagaraasi.h>
160*5d9d9091SRichard Lowe#include <sys/asi.h>
161*5d9d9091SRichard Lowe#include <sys/trap.h>
162*5d9d9091SRichard Lowe
163*5d9d9091SRichard Lowe/* documented name for primary block initializing store */
164*5d9d9091SRichard Lowe#define	ASI_STBI_P	ASI_BLK_INIT_ST_QUAD_LDD_P
165*5d9d9091SRichard Lowe
166*5d9d9091SRichard Lowe#define	BLOCK_SIZE	64
167*5d9d9091SRichard Lowe#define	FPRS_FEF	0x4
168*5d9d9091SRichard Lowe
169*5d9d9091SRichard Lowe#define	SHORTCOPY	3
170*5d9d9091SRichard Lowe#define	SHORTCHECK	14
171*5d9d9091SRichard Lowe#define	SHORT_LONG	64	/* max copy for short longword-aligned case */
172*5d9d9091SRichard Lowe				/* must be at least 32 */
173*5d9d9091SRichard Lowe#define	SMALL_MAX	128
174*5d9d9091SRichard Lowe#define	MED_UMAX	512	/* max copy for medium un-aligned case */
175*5d9d9091SRichard Lowe#define	MED_WMAX	512	/* max copy for medium word-aligned case */
176*5d9d9091SRichard Lowe#define	MED_MAX		512	/* max copy for medium longword-aligned case */
177*5d9d9091SRichard Lowe
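/*
 * Rough C-level sketch (for illustration only; not the implementation) of
 * how the thresholds above steer the NIAGARA2_IMPL memcpy(dst, src, n)
 * paths described in the N2 flow comment:
 *
 *	if (n <= SHORTCOPY)
 *		byte copy tail (.smallfin);
 *	else if (n < SMALL_MAX)
 *		short word / long-word copies (.smallword, .smalllong);
 *	else if (src and dst are 8-byte aligned and n <= MED_MAX)
 *		8-way unrolled ldx/stx loop (.medlong);
 *	else
 *		block-load / block-init-store or faligndata loops
 *		(the unaligned cases are bounded by MED_UMAX / MED_WMAX);
 */
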
178*5d9d9091SRichard Lowe#ifdef NIAGARA2_IMPL
179*5d9d9091SRichard Lowe#include <sys/sun4asi.h>
180*5d9d9091SRichard Lowe
181*5d9d9091SRichard Lowe#else	/* NIAGARA2_IMPL */
182*5d9d9091SRichard Lowe/*
183*5d9d9091SRichard Lowe * This define aligns data for the unaligned source cases.
184*5d9d9091SRichard Lowe * data1, data2 and data3 are merged into data1 and data2.
185*5d9d9091SRichard Lowe * data3 is preserved for the next merge.
186*5d9d9091SRichard Lowe */
187*5d9d9091SRichard Lowe#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
188*5d9d9091SRichard Lowe	sllx	data1, lshift, data1				;\
189*5d9d9091SRichard Lowe	srlx	data2, rshift, tmp				;\
190*5d9d9091SRichard Lowe	or	data1, tmp, data1				;\
191*5d9d9091SRichard Lowe	sllx	data2, lshift, data2				;\
192*5d9d9091SRichard Lowe	srlx	data3, rshift, tmp				;\
193*5d9d9091SRichard Lowe	or	data2, tmp, data2
194*5d9d9091SRichard Lowe/*
195*5d9d9091SRichard Lowe * Align the data: merge data1 and data2 into data1.
196*5d9d9091SRichard Lowe */
197*5d9d9091SRichard Lowe#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
198*5d9d9091SRichard Lowe	sllx	data1, lshift, data1				;\
199*5d9d9091SRichard Lowe	srlx	data2, rshift, tmp				;\
200*5d9d9091SRichard Lowe	or	data1, tmp, data1
201*5d9d9091SRichard Lowe#endif	/* NIAGARA2_IMPL */
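
/*
 * Rough C equivalent of ALIGN_DATA above (a sketch for illustration only,
 * for the non-NIAGARA2 build): with w0, w1, w2 being three consecutive
 * aligned 8-byte loads that straddle the unaligned source, and taking
 * lshift = 8 * (src & 7), rshift = 64 - lshift (the byte-offset math used
 * elsewhere in this file),
 *
 *	d0 = (w0 << lshift) | (w1 >> rshift);
 *	d1 = (w1 << lshift) | (w2 >> rshift);
 *
 * d0 and d1 are 16 source-aligned bytes ready to store to the aligned
 * destination; w2 carries over into the next iteration.  ALIGN_DATA_EW
 * produces a single merged word the same way.
 */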
202*5d9d9091SRichard Lowe
203*5d9d9091SRichard Lowe
204*5d9d9091SRichard Lowe	ANSI_PRAGMA_WEAK(memmove,function)
205*5d9d9091SRichard Lowe	ANSI_PRAGMA_WEAK(memcpy,function)
206*5d9d9091SRichard Lowe
207*5d9d9091SRichard Lowe	ENTRY(memmove)
208*5d9d9091SRichard Lowe	cmp	%o1, %o0	! if from address is >= to address, use forward copy
209*5d9d9091SRichard Lowe	bgeu,pn	%ncc, .forcpy	! else check whether the regions overlap
210*5d9d9091SRichard Lowe	sub	%o0, %o1, %o4	! get difference of two addresses
211*5d9d9091SRichard Lowe	cmp	%o2, %o4	! compare size and difference of addresses
212*5d9d9091SRichard Lowe	bleu,pn	%ncc, .forcpy	! if size <= difference, no overlap: copy forward
213*5d9d9091SRichard Lowe	add	%o1, %o2, %o5	! get to end of source space
214*5d9d9091SRichard Lowe
215*5d9d9091SRichard Lowe	!
216*5d9d9091SRichard Lowe	! an overlapped copy that must be done "backwards"
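	! (in C terms, reached only when dst > src && (size_t)(dst - src) < len;
	! otherwise the forward copy at .forcpy is used)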
217*5d9d9091SRichard Lowe	!
218*5d9d9091SRichard Lowe.chksize:
219*5d9d9091SRichard Lowe	cmp	%o2, 8			! if size is less than 8 bytes, do byte copy
220*5d9d9091SRichard Lowe	blu,pt %ncc, 2f			! branch to byte copy; else continue
221*5d9d9091SRichard Lowe
222*5d9d9091SRichard Lowe	! Now size is at least 8
223*5d9d9091SRichard Lowe.dbalign:
224*5d9d9091SRichard Lowe	add	%o0, %o2, %g1		! get to end of dest space
225*5d9d9091SRichard Lowe	andcc	%g1, 7, %o3		! %o3 has bytes till dst 8 bytes aligned
226*5d9d9091SRichard Lowe	bz,a,pn	%ncc, .dbbck		! if dst is already 8 byte aligned, skip to .dbbck
227*5d9d9091SRichard Lowe	andn	%o2, 7, %o3		! %o3 = count rounded down to a multiple of 8
228*5d9d9091SRichard Lowe	sub	%o2, %o3, %o2		! update o2 with new count
229*5d9d9091SRichard Lowe
230*5d9d9091SRichard Lowe1:	dec	%o5			! decrement source
231*5d9d9091SRichard Lowe	ldub	[%o5], %g1		! load one byte
232*5d9d9091SRichard Lowe	deccc	%o3			! decrement count
233*5d9d9091SRichard Lowe	bgu,pt	%ncc, 1b		! if not done keep copying
234*5d9d9091SRichard Lowe	stb	%g1, [%o5+%o4]		! store one byte into dest
235*5d9d9091SRichard Lowe	andncc	%o2, 7, %o3		! %o3 = count rounded down to a multiple of 8
236*5d9d9091SRichard Lowe	bz,pn	%ncc, 2f		! if size < 8, move to byte copy
237*5d9d9091SRichard Lowe
238*5d9d9091SRichard Lowe	! Now Destination is 8 byte aligned
239*5d9d9091SRichard Lowe.dbbck:
240*5d9d9091SRichard Lowe	andcc	%o5, 7, %o0		! %o0 has src offset
241*5d9d9091SRichard Lowe	bz,a,pn	%ncc, .dbcopybc		! if src is aligned, do fast mem move
242*5d9d9091SRichard Lowe	sub	%o2, %o3, %o2		! Residue bytes in %o2
243*5d9d9091SRichard Lowe
244*5d9d9091SRichard Lowe.cpy_dbwdbc:				! alignment of src is needed
245*5d9d9091SRichard Lowe	sub	%o2, 8, %o2		! set size one loop ahead
246*5d9d9091SRichard Lowe	sll	%o0, 3, %g1		! %g1 is left shift
247*5d9d9091SRichard Lowe	mov	64, %g5			! init %g5 to be 64
248*5d9d9091SRichard Lowe	sub	%g5, %g1, %g5		! %g5 right shift = (64 - left shift)
249*5d9d9091SRichard Lowe	sub	%o5, %o0, %o5		! align the src at 8 bytes.
250*5d9d9091SRichard Lowe	add	%o4, %o0, %o4		! increase difference between src & dst
251*5d9d9091SRichard Lowe	ldx	[%o5], %o1		! load first 8 bytes
252*5d9d9091SRichard Lowe	srlx	%o1, %g5, %o1
253*5d9d9091SRichard Lowe1:	sub	%o5, 8, %o5		! subtract 8 from src
254*5d9d9091SRichard Lowe	ldx	[%o5], %o0		! load 8 byte
255*5d9d9091SRichard Lowe	sllx	%o0, %g1, %o3		! shift loaded 8 bytes left into tmp reg
256*5d9d9091SRichard Lowe	or	%o1, %o3, %o3		! align data
257*5d9d9091SRichard Lowe	stx	%o3, [%o5+%o4]		! store 8 byte
258*5d9d9091SRichard Lowe	subcc	%o2, 8, %o2		! subtract 8 byte from size
259*5d9d9091SRichard Lowe	bg,pt	%ncc, 1b		! if size > 0 continue
260*5d9d9091SRichard Lowe	srlx	%o0, %g5, %o1		! move extra byte for the next use
261*5d9d9091SRichard Lowe
262*5d9d9091SRichard Lowe	srl	%g1, 3, %o0		! restore %o0 value for alignment
263*5d9d9091SRichard Lowe	add	%o5, %o0, %o5		! restore src alignment
264*5d9d9091SRichard Lowe	sub	%o4, %o0, %o4		! restore difference between src & dest
265*5d9d9091SRichard Lowe
266*5d9d9091SRichard Lowe	ba	2f			! branch to the trailing byte copy
267*5d9d9091SRichard Lowe	add	%o2, 8, %o2		! restore size value
268*5d9d9091SRichard Lowe
269*5d9d9091SRichard Lowe.dbcopybc:				! alignment of src is not needed
270*5d9d9091SRichard Lowe1:	sub	%o5, 8, %o5		! subtract from src
271*5d9d9091SRichard Lowe	ldx	[%o5], %g1		! load 8 bytes
272*5d9d9091SRichard Lowe	subcc	%o3, 8, %o3		! subtract from size
273*5d9d9091SRichard Lowe	bgu,pt	%ncc, 1b		! if size is bigger than 0, continue
274*5d9d9091SRichard Lowe	stx	%g1, [%o5+%o4]		! store 8 bytes to destination
275*5d9d9091SRichard Lowe
276*5d9d9091SRichard Lowe	ba	2f
277*5d9d9091SRichard Lowe	nop
278*5d9d9091SRichard Lowe
279*5d9d9091SRichard Lowe.bcbyte:
280*5d9d9091SRichard Lowe1:	ldub	[%o5], %g1		! load one byte
281*5d9d9091SRichard Lowe	stb	%g1, [%o5+%o4]		! store one byte
282*5d9d9091SRichard Lowe2:	deccc	%o2			! decrement size
283*5d9d9091SRichard Lowe	bgeu,a,pt %ncc, 1b		! if size is >= 0 continue
284*5d9d9091SRichard Lowe	dec	%o5			! decrement from address
285*5d9d9091SRichard Lowe
286*5d9d9091SRichard Lowe.exitbc:				! exit from backward copy
287*5d9d9091SRichard Lowe	retl
288*5d9d9091SRichard Lowe	add	%o5, %o4, %o0		! restore dest addr
289*5d9d9091SRichard Lowe
290*5d9d9091SRichard Lowe#ifdef NIAGARA2_IMPL
291*5d9d9091SRichard Lowe	!
292*5d9d9091SRichard Lowe	! Check to see if memmove is large aligned copy
293*5d9d9091SRichard Lowe	! If so, use special version of copy that avoids
294*5d9d9091SRichard Lowe	! use of block store init
295*5d9d9091SRichard Lowe	!
296*5d9d9091SRichard Lowe.forcpy:
297*5d9d9091SRichard Lowe	cmp	%o2, SMALL_MAX		! check for not small case
298*5d9d9091SRichard Lowe	blt,pn	%ncc, .mv_short		! merge with memcpy
299*5d9d9091SRichard Lowe	mov	%o0, %g1		! save %o0
300*5d9d9091SRichard Lowe	neg	%o0, %o5
301*5d9d9091SRichard Lowe	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
302*5d9d9091SRichard Lowe	brz,pt	%o5, .mv_dst_aligned_on_8
303*5d9d9091SRichard Lowe
304*5d9d9091SRichard Lowe	! %o5 has the bytes to be written in partial store.
305*5d9d9091SRichard Lowe	sub	%o2, %o5, %o2
306*5d9d9091SRichard Lowe	sub	%o1, %o0, %o1		! %o1 gets the difference
307*5d9d9091SRichard Lowe7:					! dst aligning loop
308*5d9d9091SRichard Lowe	ldub	[%o1+%o0], %o4		! load one byte
309*5d9d9091SRichard Lowe	subcc	%o5, 1, %o5
310*5d9d9091SRichard Lowe	stb	%o4, [%o0]
311*5d9d9091SRichard Lowe	bgu,pt	%ncc, 7b
312*5d9d9091SRichard Lowe	add	%o0, 1, %o0		! advance dst
313*5d9d9091SRichard Lowe	add	%o1, %o0, %o1		! restore %o1
314*5d9d9091SRichard Lowe.mv_dst_aligned_on_8:
315*5d9d9091SRichard Lowe	andcc	%o1, 7, %o5
316*5d9d9091SRichard Lowe	brnz,pt	%o5, .src_dst_unaligned_on_8
317*5d9d9091SRichard Lowe	prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read
318*5d9d9091SRichard Lowe
319*5d9d9091SRichard Lowe.mv_src_dst_aligned_on_8:
320*5d9d9091SRichard Lowe	! check if we are copying MED_MAX or more bytes
321*5d9d9091SRichard Lowe	cmp	%o2, MED_MAX		! limit to store buffer size
322*5d9d9091SRichard Lowe	bleu,pt	%ncc, .medlong
323*5d9d9091SRichard Lowe	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
324*5d9d9091SRichard Lowe
325*5d9d9091SRichard Lowe/*
326*5d9d9091SRichard Lowe * The following memmove code mimics the memcpy code for large aligned copies,
327*5d9d9091SRichard Lowe * but does not use the ASI_STBI_P (block initializing store) performance
328*5d9d9091SRichard Lowe * optimization. See memmove rationale section in documentation
329*5d9d9091SRichard Lowe */
330*5d9d9091SRichard Lowe.mv_large_align8_copy:			! Src and dst share 8 byte alignment
331*5d9d9091SRichard Lowe	rd	%fprs, %g5		! check for unused fp
332*5d9d9091SRichard Lowe	! if fprs.fef == 0, set it.
333*5d9d9091SRichard Lowe	! Setting it when already set costs more than checking
334*5d9d9091SRichard Lowe	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
335*5d9d9091SRichard Lowe	bz,a	%ncc, 1f
336*5d9d9091SRichard Lowe	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
337*5d9d9091SRichard Lowe1:
338*5d9d9091SRichard Lowe	! align dst to 64 byte boundary
339*5d9d9091SRichard Lowe	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
340*5d9d9091SRichard Lowe	brz,pn	%o3, .mv_aligned_on_64
341*5d9d9091SRichard Lowe	sub	%o3, 64, %o3		! %o3 has negative bytes to move
342*5d9d9091SRichard Lowe	add	%o2, %o3, %o2		! adjust remaining count
343*5d9d9091SRichard Lowe.mv_align_to_64:
344*5d9d9091SRichard Lowe	ldx	[%o1], %o4
345*5d9d9091SRichard Lowe	add	%o1, 8, %o1		! increment src ptr
346*5d9d9091SRichard Lowe	addcc	%o3, 8, %o3
347*5d9d9091SRichard Lowe	stx	%o4, [%o0]
348*5d9d9091SRichard Lowe	brnz,pt	%o3, .mv_align_to_64
349*5d9d9091SRichard Lowe	add	%o0, 8, %o0		! increment dst ptr
350*5d9d9091SRichard Lowe
351*5d9d9091SRichard Lowe.mv_aligned_on_64:
352*5d9d9091SRichard Lowe	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
353*5d9d9091SRichard Lowe	mov	%asi,%o4		! save %asi
354*5d9d9091SRichard Lowe	! Determine the source alignment
355*5d9d9091SRichard Lowe	! to the nearest 8 byte offset within its 64 byte block
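	! (address bits 5:3 of %o1 select one of the eight loops
	! .mv_align_000 .. .mv_align_111 below; the label digits mirror
	! those three address bits)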
356*5d9d9091SRichard Lowe	andcc	%o1, 0x20, %o3
357*5d9d9091SRichard Lowe	brnz,pn	%o3, .mv_align_1
358*5d9d9091SRichard Lowe	mov	ASI_BLK_P, %asi		! setup %asi for block load/store
359*5d9d9091SRichard Lowe	andcc	%o1, 0x10, %o3
360*5d9d9091SRichard Lowe	brnz,pn	%o3, .mv_align_01
361*5d9d9091SRichard Lowe	nop
362*5d9d9091SRichard Lowe	andcc	%o1, 0x08, %o3
363*5d9d9091SRichard Lowe	brz,pn	%o3, .mv_align_000
364*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
365*5d9d9091SRichard Lowe	ba	.mv_align_001
366*5d9d9091SRichard Lowe	nop
367*5d9d9091SRichard Lowe.mv_align_01:
368*5d9d9091SRichard Lowe	andcc	%o1, 0x08, %o3
369*5d9d9091SRichard Lowe	brnz,pn	%o3, .mv_align_011
370*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
371*5d9d9091SRichard Lowe	ba	.mv_align_010
372*5d9d9091SRichard Lowe	nop
373*5d9d9091SRichard Lowe.mv_align_1:
374*5d9d9091SRichard Lowe	andcc	%o1, 0x10, %o3
375*5d9d9091SRichard Lowe	brnz,pn	%o3, .mv_align_11
376*5d9d9091SRichard Lowe	nop
377*5d9d9091SRichard Lowe	andcc	%o1, 0x08, %o3
378*5d9d9091SRichard Lowe	brnz,pn	%o3, .mv_align_101
379*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
380*5d9d9091SRichard Lowe	ba	.mv_align_100
381*5d9d9091SRichard Lowe	nop
382*5d9d9091SRichard Lowe.mv_align_11:
383*5d9d9091SRichard Lowe	andcc	%o1, 0x08, %o3
384*5d9d9091SRichard Lowe	brz,pn	%o3, .mv_align_110
385*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
386*5d9d9091SRichard Lowe
387*5d9d9091SRichard Lowe.mv_align_111:
388*5d9d9091SRichard Lowe! Alignment off by 8 bytes
389*5d9d9091SRichard Lowe	ldd	[%o1], %d0
390*5d9d9091SRichard Lowe	add	%o1, 8, %o1
391*5d9d9091SRichard Lowe	sub	%o2, 8, %o2
392*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
393*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
394*5d9d9091SRichard Lowe.mv_align_111_loop:
395*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
396*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
397*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d16		! block load
398*5d9d9091SRichard Lowe	fmovd	%d16, %d2
399*5d9d9091SRichard Lowe	fmovd	%d18, %d4
400*5d9d9091SRichard Lowe	fmovd	%d20, %d6
401*5d9d9091SRichard Lowe	fmovd	%d22, %d8
402*5d9d9091SRichard Lowe	fmovd	%d24, %d10
403*5d9d9091SRichard Lowe	fmovd	%d26, %d12
404*5d9d9091SRichard Lowe	fmovd	%d28, %d14
405*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
406*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
407*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
408*5d9d9091SRichard Lowe	fmovd	%d30, %d0
409*5d9d9091SRichard Lowe
410*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
411*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d16
412*5d9d9091SRichard Lowe	fmovd	%d16, %d2
413*5d9d9091SRichard Lowe	fmovd	%d18, %d4
414*5d9d9091SRichard Lowe	fmovd	%d20, %d6
415*5d9d9091SRichard Lowe	fmovd	%d22, %d8
416*5d9d9091SRichard Lowe	fmovd	%d24, %d10
417*5d9d9091SRichard Lowe	fmovd	%d26, %d12
418*5d9d9091SRichard Lowe	fmovd	%d28, %d14
419*5d9d9091SRichard Lowe	add	%o1, 128, %o1		! increment src
420*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
421*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
422*5d9d9091SRichard Lowe	fmovd	%d30, %d0
423*5d9d9091SRichard Lowe	bgt,pt	%ncc, .mv_align_111_loop
424*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
425*5d9d9091SRichard Lowe
426*5d9d9091SRichard Lowe	std	%d0, [%o0]
427*5d9d9091SRichard Lowe	ba	.remain_stuff
428*5d9d9091SRichard Lowe	add	%o0, 8, %o0
429*5d9d9091SRichard Lowe	! END OF mv_align_111
430*5d9d9091SRichard Lowe
431*5d9d9091SRichard Lowe.mv_align_110:
432*5d9d9091SRichard Lowe! Alignment off by 16 bytes
433*5d9d9091SRichard Lowe	ldd	[%o1], %d0
434*5d9d9091SRichard Lowe	ldd	[%o1+8], %d2
435*5d9d9091SRichard Lowe	add	%o1, 16, %o1
436*5d9d9091SRichard Lowe	sub	%o2, 16, %o2
437*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
438*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
439*5d9d9091SRichard Lowe.mv_align_110_loop:
440*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
441*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
442*5d9d9091SRichard Lowe
443*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d16		! block load
444*5d9d9091SRichard Lowe	fmovd	%d16, %d4
445*5d9d9091SRichard Lowe	fmovd	%d18, %d6
446*5d9d9091SRichard Lowe	fmovd	%d20, %d8
447*5d9d9091SRichard Lowe	fmovd	%d22, %d10
448*5d9d9091SRichard Lowe	fmovd	%d24, %d12
449*5d9d9091SRichard Lowe	fmovd	%d26, %d14
450*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
451*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
452*5d9d9091SRichard Lowe	fmovd	%d28, %d0
453*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
454*5d9d9091SRichard Lowe	fmovd	%d30, %d2
455*5d9d9091SRichard Lowe
456*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
457*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d16
458*5d9d9091SRichard Lowe	fmovd	%d16, %d4
459*5d9d9091SRichard Lowe	fmovd	%d18, %d6
460*5d9d9091SRichard Lowe	fmovd	%d20, %d8
461*5d9d9091SRichard Lowe	fmovd	%d22, %d10
462*5d9d9091SRichard Lowe	fmovd	%d24, %d12
463*5d9d9091SRichard Lowe	fmovd	%d26, %d14
464*5d9d9091SRichard Lowe	add	%o1, 128, %o1		! increment src
465*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
466*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
467*5d9d9091SRichard Lowe	fmovd	%d28, %d0
468*5d9d9091SRichard Lowe	fmovd	%d30, %d2
469*5d9d9091SRichard Lowe	bgt,pt	%ncc, .mv_align_110_loop
470*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
471*5d9d9091SRichard Lowe
472*5d9d9091SRichard Lowe	std	%d0, [%o0]
473*5d9d9091SRichard Lowe	std	%d2, [%o0+8]
474*5d9d9091SRichard Lowe	ba	.remain_stuff
475*5d9d9091SRichard Lowe	add	%o0, 16, %o0
476*5d9d9091SRichard Lowe	! END OF mv_align_110
477*5d9d9091SRichard Lowe
478*5d9d9091SRichard Lowe.mv_align_101:
479*5d9d9091SRichard Lowe! Alignment off by 24 bytes
480*5d9d9091SRichard Lowe	ldd	[%o1], %d0
481*5d9d9091SRichard Lowe	ldd	[%o1+8], %d2
482*5d9d9091SRichard Lowe	ldd	[%o1+16], %d4
483*5d9d9091SRichard Lowe	add	%o1, 24, %o1
484*5d9d9091SRichard Lowe	sub	%o2, 24, %o2
485*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
486*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
487*5d9d9091SRichard Lowe.mv_align_101_loop:
488*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
489*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
490*5d9d9091SRichard Lowe
491*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d16		! block load
492*5d9d9091SRichard Lowe	fmovd	%d16, %d6
493*5d9d9091SRichard Lowe	fmovd	%d18, %d8
494*5d9d9091SRichard Lowe	fmovd	%d20, %d10
495*5d9d9091SRichard Lowe	fmovd	%d22, %d12
496*5d9d9091SRichard Lowe	fmovd	%d24, %d14
497*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
498*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
499*5d9d9091SRichard Lowe	fmovd	%d26, %d0
500*5d9d9091SRichard Lowe	fmovd	%d28, %d2
501*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
502*5d9d9091SRichard Lowe	fmovd	%d30, %d4
503*5d9d9091SRichard Lowe
504*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
505*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d16
506*5d9d9091SRichard Lowe	fmovd	%d16, %d6
507*5d9d9091SRichard Lowe	fmovd	%d18, %d8
508*5d9d9091SRichard Lowe	fmovd	%d20, %d10
509*5d9d9091SRichard Lowe	fmovd	%d22, %d12
510*5d9d9091SRichard Lowe	fmovd	%d24, %d14
511*5d9d9091SRichard Lowe	add	%o1, 128, %o1		! increment src
512*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
513*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
514*5d9d9091SRichard Lowe	fmovd	%d26, %d0
515*5d9d9091SRichard Lowe	fmovd	%d28, %d2
516*5d9d9091SRichard Lowe	fmovd	%d30, %d4
517*5d9d9091SRichard Lowe	bgt,pt	%ncc, .mv_align_101_loop
518*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
519*5d9d9091SRichard Lowe
520*5d9d9091SRichard Lowe	std	%d0, [%o0]
521*5d9d9091SRichard Lowe	std	%d2, [%o0+8]
522*5d9d9091SRichard Lowe	std	%d4, [%o0+16]
523*5d9d9091SRichard Lowe	ba	.remain_stuff
524*5d9d9091SRichard Lowe	add	%o0, 24, %o0
525*5d9d9091SRichard Lowe	! END OF mv_align_101
526*5d9d9091SRichard Lowe
527*5d9d9091SRichard Lowe.mv_align_100:
528*5d9d9091SRichard Lowe! Alignment off by 32 bytes
529*5d9d9091SRichard Lowe	ldd	[%o1], %d0
530*5d9d9091SRichard Lowe	ldd	[%o1+8], %d2
531*5d9d9091SRichard Lowe	ldd	[%o1+16],%d4
532*5d9d9091SRichard Lowe	ldd	[%o1+24],%d6
533*5d9d9091SRichard Lowe	add	%o1, 32, %o1
534*5d9d9091SRichard Lowe	sub	%o2, 32, %o2
535*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
536*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
537*5d9d9091SRichard Lowe.mv_align_100_loop:
538*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
539*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
540*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d16		! block load
541*5d9d9091SRichard Lowe	fmovd	%d16, %d8
542*5d9d9091SRichard Lowe	fmovd	%d18, %d10
543*5d9d9091SRichard Lowe	fmovd	%d20, %d12
544*5d9d9091SRichard Lowe	fmovd	%d22, %d14
545*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
546*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
547*5d9d9091SRichard Lowe	fmovd	%d24, %d0
548*5d9d9091SRichard Lowe	fmovd	%d26, %d2
549*5d9d9091SRichard Lowe	fmovd	%d28, %d4
550*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
551*5d9d9091SRichard Lowe	fmovd	%d30, %d6
552*5d9d9091SRichard Lowe
553*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
554*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d16
555*5d9d9091SRichard Lowe	fmovd	%d16, %d8
556*5d9d9091SRichard Lowe	fmovd	%d18, %d10
557*5d9d9091SRichard Lowe	fmovd	%d20, %d12
558*5d9d9091SRichard Lowe	fmovd	%d22, %d14
559*5d9d9091SRichard Lowe	add	%o1, 128, %o1		! increment src
560*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
561*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
562*5d9d9091SRichard Lowe	fmovd	%d24, %d0
563*5d9d9091SRichard Lowe	fmovd	%d26, %d2
564*5d9d9091SRichard Lowe	fmovd	%d28, %d4
565*5d9d9091SRichard Lowe	fmovd	%d30, %d6
566*5d9d9091SRichard Lowe	bgt,pt	%ncc, .mv_align_100_loop
567*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
568*5d9d9091SRichard Lowe
569*5d9d9091SRichard Lowe	std	%d0, [%o0]
570*5d9d9091SRichard Lowe	std	%d2, [%o0+8]
571*5d9d9091SRichard Lowe	std	%d4, [%o0+16]
572*5d9d9091SRichard Lowe	std	%d6, [%o0+24]
573*5d9d9091SRichard Lowe	ba	.remain_stuff
574*5d9d9091SRichard Lowe	add	%o0, 32, %o0
575*5d9d9091SRichard Lowe	! END OF mv_align_100
576*5d9d9091SRichard Lowe
577*5d9d9091SRichard Lowe.mv_align_011:
578*5d9d9091SRichard Lowe! Alignment off by 40 bytes
579*5d9d9091SRichard Lowe	ldd	[%o1], %d0
580*5d9d9091SRichard Lowe	ldd	[%o1+8], %d2
581*5d9d9091SRichard Lowe	ldd	[%o1+16], %d4
582*5d9d9091SRichard Lowe	ldd	[%o1+24], %d6
583*5d9d9091SRichard Lowe	ldd	[%o1+32], %d8
584*5d9d9091SRichard Lowe	add	%o1, 40, %o1
585*5d9d9091SRichard Lowe	sub	%o2, 40, %o2
586*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
587*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
588*5d9d9091SRichard Lowe.mv_align_011_loop:
589*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
590*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
591*5d9d9091SRichard Lowe
592*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d16		! block load
593*5d9d9091SRichard Lowe	fmovd	%d16, %d10
594*5d9d9091SRichard Lowe	fmovd	%d18, %d12
595*5d9d9091SRichard Lowe	fmovd	%d20, %d14
596*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
597*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
598*5d9d9091SRichard Lowe	fmovd	%d22, %d0
599*5d9d9091SRichard Lowe	fmovd	%d24, %d2
600*5d9d9091SRichard Lowe	fmovd	%d26, %d4
601*5d9d9091SRichard Lowe	fmovd	%d28, %d6
602*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
603*5d9d9091SRichard Lowe	fmovd	%d30, %d8
604*5d9d9091SRichard Lowe
605*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
606*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d16
607*5d9d9091SRichard Lowe	fmovd	%d16, %d10
608*5d9d9091SRichard Lowe	fmovd	%d18, %d12
609*5d9d9091SRichard Lowe	fmovd	%d20, %d14
610*5d9d9091SRichard Lowe	add	%o1, 128, %o1		! increment src
611*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
612*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
613*5d9d9091SRichard Lowe	fmovd	%d22, %d0
614*5d9d9091SRichard Lowe	fmovd	%d24, %d2
615*5d9d9091SRichard Lowe	fmovd	%d26, %d4
616*5d9d9091SRichard Lowe	fmovd	%d28, %d6
617*5d9d9091SRichard Lowe	fmovd	%d30, %d8
618*5d9d9091SRichard Lowe	bgt,pt	%ncc, .mv_align_011_loop
619*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
620*5d9d9091SRichard Lowe
621*5d9d9091SRichard Lowe	std	%d0, [%o0]
622*5d9d9091SRichard Lowe	std	%d2, [%o0+8]
623*5d9d9091SRichard Lowe	std	%d4, [%o0+16]
624*5d9d9091SRichard Lowe	std	%d6, [%o0+24]
625*5d9d9091SRichard Lowe	std	%d8, [%o0+32]
626*5d9d9091SRichard Lowe	ba	.remain_stuff
627*5d9d9091SRichard Lowe	add	%o0, 40, %o0
628*5d9d9091SRichard Lowe	! END OF mv_align_011
629*5d9d9091SRichard Lowe
630*5d9d9091SRichard Lowe.mv_align_010:
631*5d9d9091SRichard Lowe! Alignment off by 48 bytes
632*5d9d9091SRichard Lowe	ldd	[%o1], %d0
633*5d9d9091SRichard Lowe	ldd	[%o1+8], %d2
634*5d9d9091SRichard Lowe	ldd	[%o1+16], %d4
635*5d9d9091SRichard Lowe	ldd	[%o1+24], %d6
636*5d9d9091SRichard Lowe	ldd	[%o1+32], %d8
637*5d9d9091SRichard Lowe	ldd	[%o1+40], %d10
638*5d9d9091SRichard Lowe	add	%o1, 48, %o1
639*5d9d9091SRichard Lowe	sub	%o2, 48, %o2
640*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
641*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
642*5d9d9091SRichard Lowe.mv_align_010_loop:
643*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
644*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
645*5d9d9091SRichard Lowe
646*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d16		! block load
647*5d9d9091SRichard Lowe	fmovd	%d16, %d12
648*5d9d9091SRichard Lowe	fmovd	%d18, %d14
649*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
650*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
651*5d9d9091SRichard Lowe	fmovd	%d20, %d0
652*5d9d9091SRichard Lowe	fmovd	%d22, %d2
653*5d9d9091SRichard Lowe	fmovd	%d24, %d4
654*5d9d9091SRichard Lowe	fmovd	%d26, %d6
655*5d9d9091SRichard Lowe	fmovd	%d28, %d8
656*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
657*5d9d9091SRichard Lowe	fmovd	%d30, %d10
658*5d9d9091SRichard Lowe
659*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
660*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d16
661*5d9d9091SRichard Lowe	fmovd	%d16, %d12
662*5d9d9091SRichard Lowe	fmovd	%d18, %d14
663*5d9d9091SRichard Lowe	add	%o1, 128, %o1	! increment src
664*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
665*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
666*5d9d9091SRichard Lowe	fmovd	%d20, %d0
667*5d9d9091SRichard Lowe	fmovd	%d22, %d2
668*5d9d9091SRichard Lowe	fmovd	%d24, %d4
669*5d9d9091SRichard Lowe	fmovd	%d26, %d6
670*5d9d9091SRichard Lowe	fmovd	%d28, %d8
671*5d9d9091SRichard Lowe	fmovd	%d30, %d10
672*5d9d9091SRichard Lowe	bgt,pt	%ncc, .mv_align_010_loop
673*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
674*5d9d9091SRichard Lowe
675*5d9d9091SRichard Lowe	std	%d0, [%o0]
676*5d9d9091SRichard Lowe	std	%d2, [%o0+8]
677*5d9d9091SRichard Lowe	std	%d4, [%o0+16]
678*5d9d9091SRichard Lowe	std	%d6, [%o0+24]
679*5d9d9091SRichard Lowe	std	%d8, [%o0+32]
680*5d9d9091SRichard Lowe	std	%d10, [%o0+40]
681*5d9d9091SRichard Lowe	ba	.remain_stuff
682*5d9d9091SRichard Lowe	add	%o0, 48, %o0
683*5d9d9091SRichard Lowe	! END OF mv_align_010
684*5d9d9091SRichard Lowe
685*5d9d9091SRichard Lowe.mv_align_001:
686*5d9d9091SRichard Lowe! Alignment off by 56 bytes
687*5d9d9091SRichard Lowe	ldd	[%o1], %d0
688*5d9d9091SRichard Lowe	ldd	[%o1+8], %d2
689*5d9d9091SRichard Lowe	ldd	[%o1+16], %d4
690*5d9d9091SRichard Lowe	ldd	[%o1+24], %d6
691*5d9d9091SRichard Lowe	ldd	[%o1+32], %d8
692*5d9d9091SRichard Lowe	ldd	[%o1+40], %d10
693*5d9d9091SRichard Lowe	ldd	[%o1+48], %d12
694*5d9d9091SRichard Lowe	add	%o1, 56, %o1
695*5d9d9091SRichard Lowe	sub	%o2, 56, %o2
696*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
697*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
698*5d9d9091SRichard Lowe.mv_align_001_loop:
699*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
700*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
701*5d9d9091SRichard Lowe
702*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d16		! block load
703*5d9d9091SRichard Lowe	fmovd	%d16, %d14
704*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
705*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
706*5d9d9091SRichard Lowe	fmovd	%d18, %d0
707*5d9d9091SRichard Lowe	fmovd	%d20, %d2
708*5d9d9091SRichard Lowe	fmovd	%d22, %d4
709*5d9d9091SRichard Lowe	fmovd	%d24, %d6
710*5d9d9091SRichard Lowe	fmovd	%d26, %d8
711*5d9d9091SRichard Lowe	fmovd	%d28, %d10
712*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
713*5d9d9091SRichard Lowe	fmovd	%d30, %d12
714*5d9d9091SRichard Lowe
715*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
716*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d16
717*5d9d9091SRichard Lowe	fmovd	%d16, %d14
718*5d9d9091SRichard Lowe	add	%o1, 128, %o1		! increment src
719*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
720*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
721*5d9d9091SRichard Lowe	fmovd	%d18, %d0
722*5d9d9091SRichard Lowe	fmovd	%d20, %d2
723*5d9d9091SRichard Lowe	fmovd	%d22, %d4
724*5d9d9091SRichard Lowe	fmovd	%d24, %d6
725*5d9d9091SRichard Lowe	fmovd	%d26, %d8
726*5d9d9091SRichard Lowe	fmovd	%d28, %d10
727*5d9d9091SRichard Lowe	fmovd	%d30, %d12
728*5d9d9091SRichard Lowe	bgt,pt	%ncc, .mv_align_001_loop
729*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
730*5d9d9091SRichard Lowe
731*5d9d9091SRichard Lowe	std	%d0, [%o0]
732*5d9d9091SRichard Lowe	std	%d2, [%o0+8]
733*5d9d9091SRichard Lowe	std	%d4, [%o0+16]
734*5d9d9091SRichard Lowe	std	%d6, [%o0+24]
735*5d9d9091SRichard Lowe	std	%d8, [%o0+32]
736*5d9d9091SRichard Lowe	std	%d10, [%o0+40]
737*5d9d9091SRichard Lowe	std	%d12, [%o0+48]
738*5d9d9091SRichard Lowe	ba	.remain_stuff
739*5d9d9091SRichard Lowe	add	%o0, 56, %o0
740*5d9d9091SRichard Lowe	! END OF mv_align_001
741*5d9d9091SRichard Lowe
742*5d9d9091SRichard Lowe.mv_align_000:
743*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
744*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
745*5d9d9091SRichard Lowe.mv_align_000_loop:
746*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
747*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
748*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d0
749*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
750*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
751*5d9d9091SRichard Lowe
752*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
753*5d9d9091SRichard Lowe	add	%o0, 64, %o0
754*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d0
755*5d9d9091SRichard Lowe	add	%o1, 128, %o1		! increment src
756*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
757*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! increment dst
758*5d9d9091SRichard Lowe	bgt,pt	%ncc, .mv_align_000_loop
759*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
760*5d9d9091SRichard Lowe	ba	.remain_stuff
761*5d9d9091SRichard Lowe	nop
762*5d9d9091SRichard Lowe
763*5d9d9091SRichard Lowe	! END OF mv_align_000
764*5d9d9091SRichard Lowe#else	/* NIAGARA2_IMPL */
765*5d9d9091SRichard Lowe#endif	/* NIAGARA2_IMPL */
766*5d9d9091SRichard Lowe
767*5d9d9091SRichard Lowe	SET_SIZE(memmove)
768*5d9d9091SRichard Lowe
769*5d9d9091SRichard Lowe	ENTRY(memcpy)
770*5d9d9091SRichard Lowe	ENTRY(__align_cpy_1)
771*5d9d9091SRichard Lowe#ifdef NIAGARA2_IMPL
772*5d9d9091SRichard Lowe	cmp	%o2, SMALL_MAX		! check for not small case
773*5d9d9091SRichard Lowe	bgeu,pn	%ncc, .medium		! go to larger cases
774*5d9d9091SRichard Lowe	mov	%o0, %g1		! save %o0
775*5d9d9091SRichard Lowe.mv_short:
776*5d9d9091SRichard Lowe	cmp	%o2, SHORTCOPY		! check for really short case
777*5d9d9091SRichard Lowe	ble,pt	%ncc, .smallfin
778*5d9d9091SRichard Lowe	or	%o0, %o1, %o4		! prepare alignment check
779*5d9d9091SRichard Lowe	andcc	%o4, 0x3, %o5		! test for alignment
780*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallword	! branch to word aligned case
781*5d9d9091SRichard Lowe	cmp	%o2, SHORTCHECK
782*5d9d9091SRichard Lowe	ble,pt	%ncc, .smallrest
783*5d9d9091SRichard Lowe	andcc	%o1, 0x3, %o5		! is src word aligned
784*5d9d9091SRichard Lowe	bz,pn	%ncc, .aldst
785*5d9d9091SRichard Lowe	cmp	%o5, 2			! is src half-word aligned
786*5d9d9091SRichard Lowe	be,pt	%ncc, .s2algn
787*5d9d9091SRichard Lowe	cmp	%o5, 3			! src is byte aligned
788*5d9d9091SRichard Lowe.s1algn:ldub	[%o1], %o3		! move 1 or 3 bytes to align it
789*5d9d9091SRichard Lowe	inc	1, %o1
790*5d9d9091SRichard Lowe	stb	%o3, [%o0]		! move a byte to align src
791*5d9d9091SRichard Lowe	inc	1, %o0
792*5d9d9091SRichard Lowe	bne,pt	%ncc, .s2algn
793*5d9d9091SRichard Lowe	dec	%o2
794*5d9d9091SRichard Lowe	b	.ald			! now go align dest
795*5d9d9091SRichard Lowe	andcc	%o0, 0x3, %o5
796*5d9d9091SRichard Lowe
797*5d9d9091SRichard Lowe.s2algn:lduh	[%o1], %o3		! know src is 2 byte aligned
798*5d9d9091SRichard Lowe	inc	2, %o1
799*5d9d9091SRichard Lowe	srl	%o3, 8, %o4
800*5d9d9091SRichard Lowe	stb	%o4, [%o0]		! have to do bytes,
801*5d9d9091SRichard Lowe	stb	%o3, [%o0 + 1]		! don't know dst alignment
802*5d9d9091SRichard Lowe	inc	2, %o0
803*5d9d9091SRichard Lowe	dec	2, %o2
804*5d9d9091SRichard Lowe
805*5d9d9091SRichard Lowe.aldst:	andcc	%o0, 0x3, %o5		! align the destination address
806*5d9d9091SRichard Lowe.ald:	bz,pn	%ncc, .w4cp
807*5d9d9091SRichard Lowe	cmp	%o5, 2
808*5d9d9091SRichard Lowe	be,pn	%ncc, .w2cp
809*5d9d9091SRichard Lowe	cmp	%o5, 3
810*5d9d9091SRichard Lowe.w3cp:	lduw	[%o1], %o4
811*5d9d9091SRichard Lowe	inc	4, %o1
812*5d9d9091SRichard Lowe	srl	%o4, 24, %o5
813*5d9d9091SRichard Lowe	stb	%o5, [%o0]
814*5d9d9091SRichard Lowe	bne,pt	%ncc, .w1cp
815*5d9d9091SRichard Lowe	inc	%o0
816*5d9d9091SRichard Lowe	dec	1, %o2
817*5d9d9091SRichard Lowe	andn	%o2, 3, %o3		! %o3 is aligned word count
818*5d9d9091SRichard Lowe	dec	4, %o3			! avoid reading beyond tail of src
819*5d9d9091SRichard Lowe	sub	%o1, %o0, %o1		! %o1 gets the difference
820*5d9d9091SRichard Lowe
821*5d9d9091SRichard Lowe1:	sll	%o4, 8, %g5		! save residual bytes
822*5d9d9091SRichard Lowe	lduw	[%o1+%o0], %o4
823*5d9d9091SRichard Lowe	deccc	4, %o3
824*5d9d9091SRichard Lowe	srl	%o4, 24, %o5		! merge with residual
825*5d9d9091SRichard Lowe	or	%o5, %g5, %g5
826*5d9d9091SRichard Lowe	st	%g5, [%o0]
827*5d9d9091SRichard Lowe	bnz,pt	%ncc, 1b
828*5d9d9091SRichard Lowe	inc	4, %o0
829*5d9d9091SRichard Lowe	sub	%o1, 3, %o1		! used one byte of last word read
830*5d9d9091SRichard Lowe	and	%o2, 3, %o2
831*5d9d9091SRichard Lowe	b	7f
832*5d9d9091SRichard Lowe	inc	4, %o2
833*5d9d9091SRichard Lowe
834*5d9d9091SRichard Lowe.w1cp:	srl	%o4, 8, %o5
835*5d9d9091SRichard Lowe	sth	%o5, [%o0]
836*5d9d9091SRichard Lowe	inc	2, %o0
837*5d9d9091SRichard Lowe	dec	3, %o2
838*5d9d9091SRichard Lowe	andn	%o2, 3, %o3		! %o3 is aligned word count
839*5d9d9091SRichard Lowe	dec	4, %o3			! avoid reading beyond tail of src
840*5d9d9091SRichard Lowe	sub	%o1, %o0, %o1		! %o1 gets the difference
841*5d9d9091SRichard Lowe
842*5d9d9091SRichard Lowe2:	sll	%o4, 24, %g5		! save residual bytes
843*5d9d9091SRichard Lowe	lduw	[%o1+%o0], %o4
844*5d9d9091SRichard Lowe	deccc	4, %o3
845*5d9d9091SRichard Lowe	srl	%o4, 8, %o5		! merge with residual
846*5d9d9091SRichard Lowe	or	%o5, %g5, %g5
847*5d9d9091SRichard Lowe	st	%g5, [%o0]
848*5d9d9091SRichard Lowe	bnz,pt	%ncc, 2b
849*5d9d9091SRichard Lowe	inc	4, %o0
850*5d9d9091SRichard Lowe	sub	%o1, 1, %o1		! used three bytes of last word read
851*5d9d9091SRichard Lowe	and	%o2, 3, %o2
852*5d9d9091SRichard Lowe	b	7f
853*5d9d9091SRichard Lowe	inc	4, %o2
854*5d9d9091SRichard Lowe
855*5d9d9091SRichard Lowe.w2cp:	lduw	[%o1], %o4
856*5d9d9091SRichard Lowe	inc	4, %o1
857*5d9d9091SRichard Lowe	srl	%o4, 16, %o5
858*5d9d9091SRichard Lowe	sth	%o5, [%o0]
859*5d9d9091SRichard Lowe	inc	2, %o0
860*5d9d9091SRichard Lowe	dec	2, %o2
861*5d9d9091SRichard Lowe	andn	%o2, 3, %o3		! %o3 is aligned word count
862*5d9d9091SRichard Lowe	dec	4, %o3			! avoid reading beyond tail of src
863*5d9d9091SRichard Lowe	sub	%o1, %o0, %o1		! %o1 gets the difference
864*5d9d9091SRichard Lowe
865*5d9d9091SRichard Lowe3:	sll	%o4, 16, %g5		! save residual bytes
866*5d9d9091SRichard Lowe	lduw	[%o1+%o0], %o4
867*5d9d9091SRichard Lowe	deccc	4, %o3
868*5d9d9091SRichard Lowe	srl	%o4, 16, %o5		! merge with residual
869*5d9d9091SRichard Lowe	or	%o5, %g5, %g5
870*5d9d9091SRichard Lowe	st	%g5, [%o0]
871*5d9d9091SRichard Lowe	bnz,pt	%ncc, 3b
872*5d9d9091SRichard Lowe	inc	4, %o0
873*5d9d9091SRichard Lowe	sub	%o1, 2, %o1		! used two bytes of last word read
874*5d9d9091SRichard Lowe	and	%o2, 3, %o2
875*5d9d9091SRichard Lowe	b	7f
876*5d9d9091SRichard Lowe	inc	4, %o2
877*5d9d9091SRichard Lowe
878*5d9d9091SRichard Lowe.w4cp:	andn	%o2, 3, %o3		! %o3 is aligned word count
879*5d9d9091SRichard Lowe	sub	%o1, %o0, %o1		! %o1 gets the difference
880*5d9d9091SRichard Lowe
881*5d9d9091SRichard Lowe1:	lduw	[%o1+%o0], %o4		! read from address
882*5d9d9091SRichard Lowe	deccc	4, %o3			! decrement count
883*5d9d9091SRichard Lowe	st	%o4, [%o0]		! write at destination address
884*5d9d9091SRichard Lowe	bgu,pt	%ncc, 1b
885*5d9d9091SRichard Lowe	inc	4, %o0			! increment to address
886*5d9d9091SRichard Lowe	and	%o2, 3, %o2		! number of leftover bytes, if any
887*5d9d9091SRichard Lowe
888*5d9d9091SRichard Lowe	! simple finish up byte copy, works with any alignment
889*5d9d9091SRichard Lowe7:
890*5d9d9091SRichard Lowe	add	%o1, %o0, %o1		! restore %o1
891*5d9d9091SRichard Lowe.smallrest:
892*5d9d9091SRichard Lowe	tst	%o2
893*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallx
894*5d9d9091SRichard Lowe	cmp	%o2, 4
895*5d9d9091SRichard Lowe	blt,pt	%ncc, .smallleft3
896*5d9d9091SRichard Lowe	nop
897*5d9d9091SRichard Lowe	sub	%o2, 3, %o2
898*5d9d9091SRichard Lowe.smallnotalign4:
899*5d9d9091SRichard Lowe	ldub	[%o1], %o3		! read byte
900*5d9d9091SRichard Lowe	subcc	%o2, 4, %o2		! reduce count by 4
901*5d9d9091SRichard Lowe	stb	%o3, [%o0]		! write byte
902*5d9d9091SRichard Lowe	ldub	[%o1+1], %o3		! repeat for total of 4 bytes
903*5d9d9091SRichard Lowe	add	%o1, 4, %o1		! advance SRC by 4
904*5d9d9091SRichard Lowe	stb	%o3, [%o0+1]
905*5d9d9091SRichard Lowe	ldub	[%o1-2], %o3
906*5d9d9091SRichard Lowe	add	%o0, 4, %o0		! advance DST by 4
907*5d9d9091SRichard Lowe	stb	%o3, [%o0-2]
908*5d9d9091SRichard Lowe	ldub	[%o1-1], %o3
909*5d9d9091SRichard Lowe	bgu,pt	%ncc, .smallnotalign4	! loop until 3 or fewer bytes remain
910*5d9d9091SRichard Lowe	stb	%o3, [%o0-1]
911*5d9d9091SRichard Lowe	addcc	%o2, 3, %o2		! restore count
912*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallx
913*5d9d9091SRichard Lowe.smallleft3:				! 1, 2, or 3 bytes remain
914*5d9d9091SRichard Lowe	subcc	%o2, 1, %o2
915*5d9d9091SRichard Lowe	ldub	[%o1], %o3		! load one byte
916*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallx
917*5d9d9091SRichard Lowe	stb	%o3, [%o0]		! store one byte
918*5d9d9091SRichard Lowe	ldub	[%o1+1], %o3		! load second byte
919*5d9d9091SRichard Lowe	subcc	%o2, 1, %o2
920*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallx
921*5d9d9091SRichard Lowe	stb	%o3, [%o0+1]		! store second byte
922*5d9d9091SRichard Lowe	ldub	[%o1+2], %o3		! load third byte
923*5d9d9091SRichard Lowe	stb	%o3, [%o0+2]		! store third byte
924*5d9d9091SRichard Lowe.smallx:
925*5d9d9091SRichard Lowe	retl
926*5d9d9091SRichard Lowe	mov	%g1, %o0		! restore %o0
927*5d9d9091SRichard Lowe
928*5d9d9091SRichard Lowe.smallfin:
929*5d9d9091SRichard Lowe	tst	%o2
930*5d9d9091SRichard Lowe	bnz,pt	%ncc, .smallleft3
931*5d9d9091SRichard Lowe	nop
932*5d9d9091SRichard Lowe	retl
933*5d9d9091SRichard Lowe	mov	%g1, %o0		! restore %o0
934*5d9d9091SRichard Lowe
935*5d9d9091SRichard Lowe	.align 16
936*5d9d9091SRichard Lowe.smallwords:
937*5d9d9091SRichard Lowe	lduw	[%o1], %o3		! read word
938*5d9d9091SRichard Lowe.smallwordx:
939*5d9d9091SRichard Lowe	subcc	%o2, 8, %o2		! update count
940*5d9d9091SRichard Lowe	stw	%o3, [%o0]		! write word
941*5d9d9091SRichard Lowe	add	%o1, 8, %o1		! update SRC
942*5d9d9091SRichard Lowe	lduw	[%o1-4], %o3		! read word
943*5d9d9091SRichard Lowe	add	%o0, 8, %o0		! update DST
944*5d9d9091SRichard Lowe	bgu,pt	%ncc, .smallwords	! loop until done
945*5d9d9091SRichard Lowe	stw	%o3, [%o0-4]		! write word
946*5d9d9091SRichard Lowe	addcc	%o2, 7, %o2		! restore count
947*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallexit	! check for completion
948*5d9d9091SRichard Lowe	cmp	%o2, 4			! check for 4 or more bytes left
949*5d9d9091SRichard Lowe	blt	%ncc, .smallleft3	! if not, go to finish up
950*5d9d9091SRichard Lowe	nop
951*5d9d9091SRichard Lowe	lduw	[%o1], %o3
952*5d9d9091SRichard Lowe	add	%o1, 4, %o1
953*5d9d9091SRichard Lowe	subcc	%o2, 4, %o2
954*5d9d9091SRichard Lowe	add	%o0, 4, %o0
955*5d9d9091SRichard Lowe	bnz,pt	%ncc, .smallleft3
956*5d9d9091SRichard Lowe	stw	%o3, [%o0-4]
957*5d9d9091SRichard Lowe	retl
958*5d9d9091SRichard Lowe	mov	%g1, %o0		! restore %o0
959*5d9d9091SRichard Lowe
960*5d9d9091SRichard Lowe! 8 or more bytes, src and dest start on word boundary
961*5d9d9091SRichard Lowe! %o4 contains (%o0 | %o1); %o3 contains first four bytes of src
962*5d9d9091SRichard Lowe.smalllong:
963*5d9d9091SRichard Lowe	andcc	%o4, 0x7, %o5		! test for long alignment
964*5d9d9091SRichard Lowe	bnz,pt	%ncc, .smallwordx	! branch to word aligned case
965*5d9d9091SRichard Lowe	cmp	%o2, SHORT_LONG-7
966*5d9d9091SRichard Lowe	bge,a	%ncc, .medl64		! annulled delay slot runs only if we branch
967*5d9d9091SRichard Lowe	sub	%o2,56,%o2		! adjust %o2 to -63 off count
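	! (%o2 arrived here as count - 7 from .smallword, so it is now
	! count - 63, the bias the .medl64 loop and .medl63 expect)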
968*5d9d9091SRichard Lowe	sub	%o1, %o0, %o1		! %o1 gets the difference
969*5d9d9091SRichard Lowe.small_long_l:
970*5d9d9091SRichard Lowe	ldx	[%o1+%o0], %o3
971*5d9d9091SRichard Lowe	subcc	%o2, 8, %o2
972*5d9d9091SRichard Lowe	add	%o0, 8, %o0
973*5d9d9091SRichard Lowe	bgu,pt	%ncc, .small_long_l	! loop until done
974*5d9d9091SRichard Lowe	stx	%o3, [%o0-8]		! write word
975*5d9d9091SRichard Lowe	add	%o1, %o0, %o1		! restore %o1
976*5d9d9091SRichard Lowe	addcc	%o2, 7, %o2		! restore %o2 to correct count
977*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallexit	! check for completion
978*5d9d9091SRichard Lowe	cmp	%o2, 4			! check for 4 or more bytes left
979*5d9d9091SRichard Lowe	blt,pt	%ncc, .smallleft3	! if not, go to finish up
980*5d9d9091SRichard Lowe	nop
981*5d9d9091SRichard Lowe	lduw	[%o1], %o3
982*5d9d9091SRichard Lowe	add	%o1, 4, %o1
983*5d9d9091SRichard Lowe	subcc	%o2, 4, %o2
984*5d9d9091SRichard Lowe	stw	%o3, [%o0]
985*5d9d9091SRichard Lowe	add	%o0, 4, %o0
986*5d9d9091SRichard Lowe	bnz,pt	%ncc, .smallleft3
987*5d9d9091SRichard Lowe	nop
988*5d9d9091SRichard Lowe	retl
989*5d9d9091SRichard Lowe	mov	%g1, %o0		! restore %o0
990*5d9d9091SRichard Lowe
991*5d9d9091SRichard Lowe	.align 16
992*5d9d9091SRichard Lowe! src and dest start on word boundary
993*5d9d9091SRichard Lowe.smallword:
994*5d9d9091SRichard Lowe	subcc	%o2, 7, %o2		! adjust count
995*5d9d9091SRichard Lowe	bgu,pt	%ncc, .smalllong
996*5d9d9091SRichard Lowe	lduw	[%o1], %o3		! read word
997*5d9d9091SRichard Lowe	addcc	%o2, 3, %o2		! restore count
998*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallexit
999*5d9d9091SRichard Lowe	stw	%o3, [%o0]		! write word
1000*5d9d9091SRichard Lowe	deccc	%o2			! reduce count for cc test
1001*5d9d9091SRichard Lowe	ldub	[%o1+4], %o3		! load one byte
1002*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallexit
1003*5d9d9091SRichard Lowe	stb	%o3, [%o0+4]		! store one byte
1004*5d9d9091SRichard Lowe	ldub	[%o1+5], %o3		! load second byte
1005*5d9d9091SRichard Lowe	deccc	%o2
1006*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallexit
1007*5d9d9091SRichard Lowe	stb	%o3, [%o0+5]		! store second byte
1008*5d9d9091SRichard Lowe	ldub	[%o1+6], %o3		! load third byte
1009*5d9d9091SRichard Lowe	stb	%o3, [%o0+6]		! store third byte
1010*5d9d9091SRichard Lowe.smallexit:
1011*5d9d9091SRichard Lowe	retl
1012*5d9d9091SRichard Lowe	mov	%g1, %o0		! restore %o0
1013*5d9d9091SRichard Lowe
1014*5d9d9091SRichard Lowe	.align 16
1015*5d9d9091SRichard Lowe.medium:
1016*5d9d9091SRichard Lowe	neg	%o0, %o5
1017*5d9d9091SRichard Lowe	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
1018*5d9d9091SRichard Lowe	brz,pt	%o5, .dst_aligned_on_8
1019*5d9d9091SRichard Lowe
1020*5d9d9091SRichard Lowe	! %o5 has the bytes to be written in partial store.
1021*5d9d9091SRichard Lowe	sub	%o2, %o5, %o2
1022*5d9d9091SRichard Lowe	sub	%o1, %o0, %o1		! %o1 gets the difference
1023*5d9d9091SRichard Lowe7:					! dst aligning loop
1024*5d9d9091SRichard Lowe	ldub	[%o1+%o0], %o4		! load one byte
1025*5d9d9091SRichard Lowe	subcc	%o5, 1, %o5
1026*5d9d9091SRichard Lowe	stb	%o4, [%o0]
1027*5d9d9091SRichard Lowe	bgu,pt	%ncc, 7b
1028*5d9d9091SRichard Lowe	add	%o0, 1, %o0		! advance dst
1029*5d9d9091SRichard Lowe	add	%o1, %o0, %o1		! restore %o1
1030*5d9d9091SRichard Lowe.dst_aligned_on_8:
1031*5d9d9091SRichard Lowe	andcc	%o1, 7, %o5
1032*5d9d9091SRichard Lowe	brnz,pt	%o5, .src_dst_unaligned_on_8
1033*5d9d9091SRichard Lowe	prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read
1034*5d9d9091SRichard Lowe
1035*5d9d9091SRichard Lowe.src_dst_aligned_on_8:
1036*5d9d9091SRichard Lowe	! check if we are copying MED_MAX or more bytes
1037*5d9d9091SRichard Lowe	cmp	%o2, MED_MAX		! limit to store buffer size
1038*5d9d9091SRichard Lowe	bgu,pt	%ncc, .large_align8_copy
1039*5d9d9091SRichard Lowe	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
1040*5d9d9091SRichard Lowe/*
1041*5d9d9091SRichard Lowe * Special case for handling when src and dest are both long word aligned
1042*5d9d9091SRichard Lowe * and total data to move is less than MED_MAX bytes
1043*5d9d9091SRichard Lowe */
1044*5d9d9091SRichard Lowe.medlong:
1045*5d9d9091SRichard Lowe	subcc	%o2, 63, %o2		! adjust length to allow cc test
1046*5d9d9091SRichard Lowe	ble,pt	%ncc, .medl63		! skip big loop if less than 64 bytes
1047*5d9d9091SRichard Lowe.medl64:
1048*5d9d9091SRichard Lowe	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read ! into the l2 cache
1049*5d9d9091SRichard Lowe	ldx	[%o1], %o4		! load
1050*5d9d9091SRichard Lowe	subcc	%o2, 64, %o2		! decrement length count
1051*5d9d9091SRichard Lowe	stx	%o4, [%o0]		! and store
1052*5d9d9091SRichard Lowe	ldx	[%o1+8], %o3		! a block of 64 bytes
1053*5d9d9091SRichard Lowe	stx	%o3, [%o0+8]
1054*5d9d9091SRichard Lowe	ldx	[%o1+16], %o4
1055*5d9d9091SRichard Lowe	stx	%o4, [%o0+16]
1056*5d9d9091SRichard Lowe	ldx	[%o1+24], %o3
1057*5d9d9091SRichard Lowe	stx	%o3, [%o0+24]
1058*5d9d9091SRichard Lowe	ldx	[%o1+32], %o4		! load
1059*5d9d9091SRichard Lowe	stx	%o4, [%o0+32]		! and store
1060*5d9d9091SRichard Lowe	ldx	[%o1+40], %o3		! a block of 64 bytes
1061*5d9d9091SRichard Lowe	add	%o1, 64, %o1		! increase src ptr by 64
1062*5d9d9091SRichard Lowe	stx	%o3, [%o0+40]
1063*5d9d9091SRichard Lowe	ldx	[%o1-16], %o4
1064*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! increase dst ptr by 64
1065*5d9d9091SRichard Lowe	stx	%o4, [%o0-16]
1066*5d9d9091SRichard Lowe	ldx	[%o1-8], %o3
1067*5d9d9091SRichard Lowe	bgu,pt	%ncc, .medl64		! repeat if at least 64 bytes left
1068*5d9d9091SRichard Lowe	stx	%o3, [%o0-8]
1069*5d9d9091SRichard Lowe.medl63:
1070*5d9d9091SRichard Lowe	addcc	%o2, 32, %o2		! adjust remaining count
1071*5d9d9091SRichard Lowe	ble,pt	%ncc, .medl31		! to skip if 31 or fewer bytes left
1072*5d9d9091SRichard Lowe	nop
1073*5d9d9091SRichard Lowe	ldx	[%o1], %o4		! load
1074*5d9d9091SRichard Lowe	sub	%o2, 32, %o2		! decrement length count
1075*5d9d9091SRichard Lowe	stx	%o4, [%o0]		! and store
1076*5d9d9091SRichard Lowe	ldx	[%o1+8], %o3		! a block of 32 bytes
1077*5d9d9091SRichard Lowe	add	%o1, 32, %o1		! increase src ptr by 32
1078*5d9d9091SRichard Lowe	stx	%o3, [%o0+8]
1079*5d9d9091SRichard Lowe	ldx	[%o1-16], %o4
1080*5d9d9091SRichard Lowe	add	%o0, 32, %o0		! increase dst ptr by 32
1081*5d9d9091SRichard Lowe	stx	%o4, [%o0-16]
1082*5d9d9091SRichard Lowe	ldx	[%o1-8], %o3
1083*5d9d9091SRichard Lowe	stx	%o3, [%o0-8]
1084*5d9d9091SRichard Lowe.medl31:
1085*5d9d9091SRichard Lowe	addcc	%o2, 16, %o2		! adjust remaining count
1086*5d9d9091SRichard Lowe	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
1087*5d9d9091SRichard Lowe	nop				!
1088*5d9d9091SRichard Lowe	ldx	[%o1], %o4		! load and store 16 bytes
1089*5d9d9091SRichard Lowe	add	%o1, 16, %o1		! increase src ptr by 16
1090*5d9d9091SRichard Lowe	stx	%o4, [%o0]		!
1091*5d9d9091SRichard Lowe	sub	%o2, 16, %o2		! decrease count by 16
1092*5d9d9091SRichard Lowe	ldx	[%o1-8], %o3		!
1093*5d9d9091SRichard Lowe	add	%o0, 16, %o0		! increase dst ptr by 16
1094*5d9d9091SRichard Lowe	stx	%o3, [%o0-8]
1095*5d9d9091SRichard Lowe.medl15:
1096*5d9d9091SRichard Lowe	addcc	%o2, 15, %o2		! restore count
1097*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallexit	! exit if finished
1098*5d9d9091SRichard Lowe	cmp	%o2, 8
1099*5d9d9091SRichard Lowe	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
1100*5d9d9091SRichard Lowe	tst	%o2
1101*5d9d9091SRichard Lowe	ldx	[%o1], %o4		! load 8 bytes
1102*5d9d9091SRichard Lowe	add	%o1, 8, %o1		! increase src ptr by 8
1103*5d9d9091SRichard Lowe	add	%o0, 8, %o0		! increase dst ptr by 8
1104*5d9d9091SRichard Lowe	subcc	%o2, 8, %o2		! decrease count by 8
1105*5d9d9091SRichard Lowe	bnz,pt	%ncc, .medw7
1106*5d9d9091SRichard Lowe	stx	%o4, [%o0-8]		! and store 8 bytes
1107*5d9d9091SRichard Lowe	retl
1108*5d9d9091SRichard Lowe	mov	%g1, %o0		! restore %o0
1109*5d9d9091SRichard Lowe
1110*5d9d9091SRichard Lowe	.align 16
1111*5d9d9091SRichard Lowe.src_dst_unaligned_on_8:
1112*5d9d9091SRichard Lowe	! DST is 8-byte aligned, src is not
1113*5d9d9091SRichard Lowe2:
1114*5d9d9091SRichard Lowe	andcc	%o1, 0x3, %o5		! test word alignment
1115*5d9d9091SRichard Lowe	bnz,pt	%ncc, .unalignsetup	! branch to skip if not word aligned
1116*5d9d9091SRichard Lowe	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
1117*5d9d9091SRichard Lowe
1118*5d9d9091SRichard Lowe/*
1119*5d9d9091SRichard Lowe * Handle all cases where src and dest are aligned on word
1120*5d9d9091SRichard Lowe * boundaries. Use unrolled loops for better performance.
1121*5d9d9091SRichard Lowe * This option wins over standard large data move when
1122*5d9d9091SRichard Lowe * source and destination are in cache for medium
1123*5d9d9091SRichard Lowe * to short data moves.
1124*5d9d9091SRichard Lowe */
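/*
 * Roughly, in C (sketch only; the real code below unrolls eight 4-byte
 * ld/stw pairs per iteration and adds 16/8/4-byte tails):
 *
 *	while (n >= 32) {			// one .medw32 iteration
 *		for (i = 0; i < 8; i++)
 *			((uint32_t *)dst)[i] = ((const uint32_t *)src)[i];
 *		dst += 32; src += 32; n -= 32;
 *	}
 */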
1125*5d9d9091SRichard Lowe	cmp	%o2, MED_WMAX		! limit to store buffer size
1126*5d9d9091SRichard Lowe	bge,pt	%ncc, .unalignrejoin	! rejoin main copy loop for large counts
1127*5d9d9091SRichard Lowe	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
1128*5d9d9091SRichard Lowe
1129*5d9d9091SRichard Lowe	subcc	%o2, 31, %o2		! adjust length to allow cc test
1130*5d9d9091SRichard Lowe					! for end of loop
1131*5d9d9091SRichard Lowe	ble,pt	%ncc, .medw31		! skip big loop if less than 32
1132*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1133*5d9d9091SRichard Lowe.medw32:
1134*5d9d9091SRichard Lowe	ld	[%o1], %o4		! move a block of 32 bytes
1135*5d9d9091SRichard Lowe	stw	%o4, [%o0]
1136*5d9d9091SRichard Lowe	ld	[%o1+4], %o3
1137*5d9d9091SRichard Lowe	stw	%o3, [%o0+4]
1138*5d9d9091SRichard Lowe	ld	[%o1+8], %o4
1139*5d9d9091SRichard Lowe	stw	%o4, [%o0+8]
1140*5d9d9091SRichard Lowe	ld	[%o1+12], %o3
1141*5d9d9091SRichard Lowe	stw	%o3, [%o0+12]
1142*5d9d9091SRichard Lowe	ld	[%o1+16], %o4
1143*5d9d9091SRichard Lowe	subcc	%o2, 32, %o2		! decrement length count
1144*5d9d9091SRichard Lowe	stw	%o4, [%o0+16]
1145*5d9d9091SRichard Lowe	ld	[%o1+20], %o3
1146*5d9d9091SRichard Lowe	add	%o1, 32, %o1		! increase src ptr by 32
1147*5d9d9091SRichard Lowe	stw	%o3, [%o0+20]
1148*5d9d9091SRichard Lowe	ld	[%o1-8], %o4
1149*5d9d9091SRichard Lowe	add	%o0, 32, %o0		! increase dst ptr by 32
1150*5d9d9091SRichard Lowe	stw	%o4, [%o0-8]
1151*5d9d9091SRichard Lowe	ld	[%o1-4], %o3
1152*5d9d9091SRichard Lowe	bgu,pt	%ncc, .medw32		! repeat if at least 32 bytes left
1153*5d9d9091SRichard Lowe	stw	%o3, [%o0-4]
1154*5d9d9091SRichard Lowe.medw31:
1155*5d9d9091SRichard Lowe	addcc	%o2, 31, %o2		! restore count
1156*5d9d9091SRichard Lowe
1157*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallexit	! exit if finished
1158*5d9d9091SRichard Lowe	nop
1159*5d9d9091SRichard Lowe	cmp	%o2, 16
1160*5d9d9091SRichard Lowe	blt,pt	%ncc, .medw15
1161*5d9d9091SRichard Lowe	nop
1162*5d9d9091SRichard Lowe	ld	[%o1], %o4		! move a block of 16 bytes
1163*5d9d9091SRichard Lowe	subcc	%o2, 16, %o2		! decrement length count
1164*5d9d9091SRichard Lowe	stw	%o4, [%o0]
1165*5d9d9091SRichard Lowe	ld	[%o1+4], %o3
1166*5d9d9091SRichard Lowe	add	%o1, 16, %o1		! increase src ptr by 16
1167*5d9d9091SRichard Lowe	stw	%o3, [%o0+4]
1168*5d9d9091SRichard Lowe	ld	[%o1-8], %o4
1169*5d9d9091SRichard Lowe	add	%o0, 16, %o0		! increase dst ptr by 16
1170*5d9d9091SRichard Lowe	stw	%o4, [%o0-8]
1171*5d9d9091SRichard Lowe	ld	[%o1-4], %o3
1172*5d9d9091SRichard Lowe	stw	%o3, [%o0-4]
1173*5d9d9091SRichard Lowe.medw15:
1174*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallexit	! exit if finished
1175*5d9d9091SRichard Lowe	cmp	%o2, 8
1176*5d9d9091SRichard Lowe	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
1177*5d9d9091SRichard Lowe	tst	%o2
1178*5d9d9091SRichard Lowe	ld	[%o1], %o4		! load 4 bytes
1179*5d9d9091SRichard Lowe	subcc	%o2, 8, %o2		! decrease count by 8
1180*5d9d9091SRichard Lowe	stw	%o4, [%o0]		! and store 4 bytes
1181*5d9d9091SRichard Lowe	add	%o1, 8, %o1		! increase src ptr by 8
1182*5d9d9091SRichard Lowe	ld	[%o1-4], %o3		! load 4 bytes
1183*5d9d9091SRichard Lowe	add	%o0, 8, %o0		! increase dst ptr by 8
1184*5d9d9091SRichard Lowe	stw	%o3, [%o0-4]		! and store 4 bytes
1185*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallexit	! exit if finished
1186*5d9d9091SRichard Lowe.medw7:					! count is ge 1, less than 8
1187*5d9d9091SRichard Lowe	cmp	%o2, 4			! check for 4 bytes left
1188*5d9d9091SRichard Lowe	blt,pt	%ncc, .smallleft3	! skip if 3 or fewer bytes left
1189*5d9d9091SRichard Lowe	nop				!
1190*5d9d9091SRichard Lowe	ld	[%o1], %o4		! load 4 bytes
1191*5d9d9091SRichard Lowe	add	%o1, 4, %o1		! increase src ptr by 4
1192*5d9d9091SRichard Lowe	add	%o0, 4, %o0		! increase dst ptr by 4
1193*5d9d9091SRichard Lowe	subcc	%o2, 4, %o2		! decrease count by 4
1194*5d9d9091SRichard Lowe	bnz	.smallleft3
1195*5d9d9091SRichard Lowe	stw	%o4, [%o0-4]		! and store 4 bytes
1196*5d9d9091SRichard Lowe	retl
1197*5d9d9091SRichard Lowe	mov	%g1, %o0		! restore %o0
1198*5d9d9091SRichard Lowe
1199*5d9d9091SRichard Lowe	.align	16
1200*5d9d9091SRichard Lowe.large_align8_copy:			! Src and dst share 8 byte alignment
1201*5d9d9091SRichard Lowe	rd	%fprs, %g5		! check for unused fp
1202*5d9d9091SRichard Lowe	! if fprs.fef == 0, set it.
1203*5d9d9091SRichard Lowe	! Setting it when already set costs more than checking
1204*5d9d9091SRichard Lowe	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
1205*5d9d9091SRichard Lowe	bz,a	%ncc, 1f
1206*5d9d9091SRichard Lowe	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1207*5d9d9091SRichard Lowe1:
1208*5d9d9091SRichard Lowe	! align dst to 64 byte boundary
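	/*
	 * Sketch of the peeling below: consume 8, then 16, then 32 bytes
	 * as dictated by the low bits of dst so that dst lands on a
	 * 64-byte boundary before the block loop.  Since src and dst
	 * share 8-byte alignment here, src stays 8-byte aligned:
	 *
	 *	if (dst & 0x08) { move  8 bytes; dst +=  8; src +=  8; n -=  8; }
	 *	if (dst & 0x10) { move 16 bytes; dst += 16; src += 16; n -= 16; }
	 *	if (dst & 0x20) { move 32 bytes; dst += 32; src += 32; n -= 32; }
	 */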
1209*5d9d9091SRichard Lowe	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
1210*5d9d9091SRichard Lowe	brz,pn	%o3, .aligned_to_64
1211*5d9d9091SRichard Lowe	andcc	%o0, 8, %o3		! odd long words to move?
1212*5d9d9091SRichard Lowe	brz,pt	%o3, .aligned_to_16
1213*5d9d9091SRichard Lowe	nop
1214*5d9d9091SRichard Lowe	ldx	[%o1], %o4
1215*5d9d9091SRichard Lowe	sub	%o2, 8, %o2
1216*5d9d9091SRichard Lowe	add	%o1, 8, %o1		! increment src ptr
1217*5d9d9091SRichard Lowe	add	%o0, 8, %o0		! increment dst ptr
1218*5d9d9091SRichard Lowe	stx	%o4, [%o0-8]
1219*5d9d9091SRichard Lowe.aligned_to_16:
1220*5d9d9091SRichard Lowe	andcc	%o0, 16, %o3		! pair of long words to move?
1221*5d9d9091SRichard Lowe	brz,pt	%o3, .aligned_to_32
1222*5d9d9091SRichard Lowe	nop
1223*5d9d9091SRichard Lowe	ldx	[%o1], %o4
1224*5d9d9091SRichard Lowe	sub	%o2, 16, %o2
1225*5d9d9091SRichard Lowe	stx	%o4, [%o0]
1226*5d9d9091SRichard Lowe	add	%o1, 16, %o1		! increment src ptr
1227*5d9d9091SRichard Lowe	ldx	[%o1-8], %o4
1228*5d9d9091SRichard Lowe	add	%o0, 16, %o0		! increment dst ptr
1229*5d9d9091SRichard Lowe	stx	%o4, [%o0-8]
1230*5d9d9091SRichard Lowe.aligned_to_32:
1231*5d9d9091SRichard Lowe	andcc	%o0, 32, %o3		! four long words to move?
1232*5d9d9091SRichard Lowe	brz,pt	%o3, .aligned_to_64
1233*5d9d9091SRichard Lowe	nop
1234*5d9d9091SRichard Lowe	ldx	[%o1], %o4
1235*5d9d9091SRichard Lowe	sub	%o2, 32, %o2
1236*5d9d9091SRichard Lowe	stx	%o4, [%o0]
1237*5d9d9091SRichard Lowe	ldx	[%o1+8], %o4
1238*5d9d9091SRichard Lowe	stx	%o4, [%o0+8]
1239*5d9d9091SRichard Lowe	ldx	[%o1+16], %o4
1240*5d9d9091SRichard Lowe	stx	%o4, [%o0+16]
1241*5d9d9091SRichard Lowe	add	%o1, 32, %o1		! increment src ptr
1242*5d9d9091SRichard Lowe	ldx	[%o1-8], %o4
1243*5d9d9091SRichard Lowe	add	%o0, 32, %o0		! increment dst ptr
1244*5d9d9091SRichard Lowe	stx	%o4, [%o0-8]
1245*5d9d9091SRichard Lowe.aligned_to_64:
1246*5d9d9091SRichard Lowe	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
1247*5d9d9091SRichard Lowe	mov	%asi,%o4		! save %asi
1248*5d9d9091SRichard Lowe	! Determine source alignment
1249*5d9d9091SRichard Lowe	! to correct 8 byte offset
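	/*
	 * The chained tests below decode bits <5:3> of the source
	 * address; in C terms (sketch only, b picks one of the eight
	 * .align_000 .. .align_111 cases):
	 *
	 *	b = ((uintptr_t)src >> 3) & 7;
	 *	if (b != 0)
	 *		preload the (8 - b) doublewords between src and
	 *		its next 64-byte boundary into %d0, %d2, ...;
	 *	run the block loop for case b;
	 *
	 * The pre-loaded doublewords let every loop iteration issue an
	 * aligned 64-byte block load while the leftovers are carried in
	 * floating point registers from one iteration to the next.
	 */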
1250*5d9d9091SRichard Lowe	andcc	%o1, 0x20, %o3
1251*5d9d9091SRichard Lowe	brnz,pn	%o3, .align_1
1252*5d9d9091SRichard Lowe	mov	ASI_BLK_P, %asi		! setup %asi for block load/store
1253*5d9d9091SRichard Lowe	andcc	%o1, 0x10, %o3
1254*5d9d9091SRichard Lowe	brnz,pn	%o3, .align_01
1255*5d9d9091SRichard Lowe	nop
1256*5d9d9091SRichard Lowe	andcc	%o1, 0x08, %o3
1257*5d9d9091SRichard Lowe	brz,pn	%o3, .align_000
1258*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1259*5d9d9091SRichard Lowe	ba	.align_001
1260*5d9d9091SRichard Lowe	nop
1261*5d9d9091SRichard Lowe.align_01:
1262*5d9d9091SRichard Lowe	andcc	%o1, 0x08, %o3
1263*5d9d9091SRichard Lowe	brnz,pn	%o3, .align_011
1264*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1265*5d9d9091SRichard Lowe	ba	.align_010
1266*5d9d9091SRichard Lowe	nop
1267*5d9d9091SRichard Lowe.align_1:
1268*5d9d9091SRichard Lowe	andcc	%o1, 0x10, %o3
1269*5d9d9091SRichard Lowe	brnz,pn	%o3, .align_11
1270*5d9d9091SRichard Lowe	nop
1271*5d9d9091SRichard Lowe	andcc	%o1, 0x08, %o3
1272*5d9d9091SRichard Lowe	brnz,pn	%o3, .align_101
1273*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1274*5d9d9091SRichard Lowe	ba	.align_100
1275*5d9d9091SRichard Lowe	nop
1276*5d9d9091SRichard Lowe.align_11:
1277*5d9d9091SRichard Lowe	andcc	%o1, 0x08, %o3
1278*5d9d9091SRichard Lowe	brz,pn	%o3, .align_110
1279*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1280*5d9d9091SRichard Lowe
1281*5d9d9091SRichard Lowe.align_111:
1282*5d9d9091SRichard Lowe! Alignment off by 8 bytes
1283*5d9d9091SRichard Lowe	ldd	[%o1], %d0
1284*5d9d9091SRichard Lowe	add	%o1, 8, %o1
1285*5d9d9091SRichard Lowe	sub	%o2, 8, %o2
1286*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1287*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
1288*5d9d9091SRichard Lowe.align_111_loop:
1289*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
1290*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
1291*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d16		! block load
1292*5d9d9091SRichard Lowe	fmovd	%d16, %d2
1293*5d9d9091SRichard Lowe	fmovd	%d18, %d4
1294*5d9d9091SRichard Lowe	fmovd	%d20, %d6
1295*5d9d9091SRichard Lowe	fmovd	%d22, %d8
1296*5d9d9091SRichard Lowe	fmovd	%d24, %d10
1297*5d9d9091SRichard Lowe	fmovd	%d26, %d12
1298*5d9d9091SRichard Lowe	fmovd	%d28, %d14
1299*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1300*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1301*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
1302*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1303*5d9d9091SRichard Lowe	fmovd	%d30, %d0
1304*5d9d9091SRichard Lowe
1305*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
1306*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d16
1307*5d9d9091SRichard Lowe	fmovd	%d16, %d2
1308*5d9d9091SRichard Lowe	fmovd	%d18, %d4
1309*5d9d9091SRichard Lowe	fmovd	%d20, %d6
1310*5d9d9091SRichard Lowe	fmovd	%d22, %d8
1311*5d9d9091SRichard Lowe	fmovd	%d24, %d10
1312*5d9d9091SRichard Lowe	fmovd	%d26, %d12
1313*5d9d9091SRichard Lowe	fmovd	%d28, %d14
1314*5d9d9091SRichard Lowe	add	%o1, 128, %o1		! increment src
1315*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1316*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1317*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
1318*5d9d9091SRichard Lowe	fmovd	%d30, %d0
1319*5d9d9091SRichard Lowe	bgt,pt	%ncc, .align_111_loop
1320*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1321*5d9d9091SRichard Lowe
1322*5d9d9091SRichard Lowe	std	%d0, [%o0]
1323*5d9d9091SRichard Lowe	ba	.remain_stuff
1324*5d9d9091SRichard Lowe	add	%o0, 8, %o0
1325*5d9d9091SRichard Lowe	! END OF align_111
1326*5d9d9091SRichard Lowe
1327*5d9d9091SRichard Lowe.align_110:
1328*5d9d9091SRichard Lowe! Alignment off by 16 bytes
1329*5d9d9091SRichard Lowe	ldd	[%o1], %d0
1330*5d9d9091SRichard Lowe	ldd	[%o1+8], %d2
1331*5d9d9091SRichard Lowe	add	%o1, 16, %o1
1332*5d9d9091SRichard Lowe	sub	%o2, 16, %o2
1333*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1334*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
1335*5d9d9091SRichard Lowe.align_110_loop:
1336*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
1337*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
1338*5d9d9091SRichard Lowe
1339*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d16		! block load
1340*5d9d9091SRichard Lowe	fmovd	%d16, %d4
1341*5d9d9091SRichard Lowe	fmovd	%d18, %d6
1342*5d9d9091SRichard Lowe	fmovd	%d20, %d8
1343*5d9d9091SRichard Lowe	fmovd	%d22, %d10
1344*5d9d9091SRichard Lowe	fmovd	%d24, %d12
1345*5d9d9091SRichard Lowe	fmovd	%d26, %d14
1346*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1347*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1348*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
1349*5d9d9091SRichard Lowe	fmovd	%d28, %d0
1350*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1351*5d9d9091SRichard Lowe	fmovd	%d30, %d2
1352*5d9d9091SRichard Lowe
1353*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
1354*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d16
1355*5d9d9091SRichard Lowe	fmovd	%d16, %d4
1356*5d9d9091SRichard Lowe	fmovd	%d18, %d6
1357*5d9d9091SRichard Lowe	fmovd	%d20, %d8
1358*5d9d9091SRichard Lowe	fmovd	%d22, %d10
1359*5d9d9091SRichard Lowe	fmovd	%d24, %d12
1360*5d9d9091SRichard Lowe	fmovd	%d26, %d14
1361*5d9d9091SRichard Lowe	add	%o1, 128, %o1		! increment src
1362*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1363*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1364*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
1365*5d9d9091SRichard Lowe	fmovd	%d28, %d0
1366*5d9d9091SRichard Lowe	fmovd	%d30, %d2
1367*5d9d9091SRichard Lowe	bgt,pt	%ncc, .align_110_loop
1368*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1369*5d9d9091SRichard Lowe
1370*5d9d9091SRichard Lowe	std	%d0, [%o0]
1371*5d9d9091SRichard Lowe	std	%d2, [%o0+8]
1372*5d9d9091SRichard Lowe	ba	.remain_stuff
1373*5d9d9091SRichard Lowe	add	%o0, 16, %o0
1374*5d9d9091SRichard Lowe	! END OF align_110
1375*5d9d9091SRichard Lowe
1376*5d9d9091SRichard Lowe.align_101:
1377*5d9d9091SRichard Lowe! Alignment off by 24 bytes
1378*5d9d9091SRichard Lowe	ldd	[%o1], %d0
1379*5d9d9091SRichard Lowe	ldd	[%o1+8], %d2
1380*5d9d9091SRichard Lowe	ldd	[%o1+16], %d4
1381*5d9d9091SRichard Lowe	add	%o1, 24, %o1
1382*5d9d9091SRichard Lowe	sub	%o2, 24, %o2
1383*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1384*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
1385*5d9d9091SRichard Lowe.align_101_loop:
1386*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
1387*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
1388*5d9d9091SRichard Lowe
1389*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d16		! block load
1390*5d9d9091SRichard Lowe	fmovd	%d16, %d6
1391*5d9d9091SRichard Lowe	fmovd	%d18, %d8
1392*5d9d9091SRichard Lowe	fmovd	%d20, %d10
1393*5d9d9091SRichard Lowe	fmovd	%d22, %d12
1394*5d9d9091SRichard Lowe	fmovd	%d24, %d14
1395*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1396*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1397*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
1398*5d9d9091SRichard Lowe	fmovd	%d26, %d0
1399*5d9d9091SRichard Lowe	fmovd	%d28, %d2
1400*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1401*5d9d9091SRichard Lowe	fmovd	%d30, %d4
1402*5d9d9091SRichard Lowe
1403*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
1404*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d16
1405*5d9d9091SRichard Lowe	fmovd	%d16, %d6
1406*5d9d9091SRichard Lowe	fmovd	%d18, %d8
1407*5d9d9091SRichard Lowe	fmovd	%d20, %d10
1408*5d9d9091SRichard Lowe	fmovd	%d22, %d12
1409*5d9d9091SRichard Lowe	fmovd	%d24, %d14
1410*5d9d9091SRichard Lowe	add	%o1, 128, %o1		! increment src
1411*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1412*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1413*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
1414*5d9d9091SRichard Lowe	fmovd	%d26, %d0
1415*5d9d9091SRichard Lowe	fmovd	%d28, %d2
1416*5d9d9091SRichard Lowe	fmovd	%d30, %d4
1417*5d9d9091SRichard Lowe	bgt,pt	%ncc, .align_101_loop
1418*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1419*5d9d9091SRichard Lowe
1420*5d9d9091SRichard Lowe	std	%d0, [%o0]
1421*5d9d9091SRichard Lowe	std	%d2, [%o0+8]
1422*5d9d9091SRichard Lowe	std	%d4, [%o0+16]
1423*5d9d9091SRichard Lowe	ba	.remain_stuff
1424*5d9d9091SRichard Lowe	add	%o0, 24, %o0
1425*5d9d9091SRichard Lowe	! END OF align_101
1426*5d9d9091SRichard Lowe
1427*5d9d9091SRichard Lowe.align_100:
1428*5d9d9091SRichard Lowe! Alignment off by 32 bytes
1429*5d9d9091SRichard Lowe	ldd	[%o1], %d0
1430*5d9d9091SRichard Lowe	ldd	[%o1+8], %d2
1431*5d9d9091SRichard Lowe	ldd	[%o1+16],%d4
1432*5d9d9091SRichard Lowe	ldd	[%o1+24],%d6
1433*5d9d9091SRichard Lowe	add	%o1, 32, %o1
1434*5d9d9091SRichard Lowe	sub	%o2, 32, %o2
1435*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1436*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
1437*5d9d9091SRichard Lowe.align_100_loop:
1438*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
1439*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
1440*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d16		! block load
1441*5d9d9091SRichard Lowe	fmovd	%d16, %d8
1442*5d9d9091SRichard Lowe	fmovd	%d18, %d10
1443*5d9d9091SRichard Lowe	fmovd	%d20, %d12
1444*5d9d9091SRichard Lowe	fmovd	%d22, %d14
1445*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1446*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1447*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
1448*5d9d9091SRichard Lowe	fmovd	%d24, %d0
1449*5d9d9091SRichard Lowe	fmovd	%d26, %d2
1450*5d9d9091SRichard Lowe	fmovd	%d28, %d4
1451*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1452*5d9d9091SRichard Lowe	fmovd	%d30, %d6
1453*5d9d9091SRichard Lowe
1454*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
1455*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d16
1456*5d9d9091SRichard Lowe	fmovd	%d16, %d8
1457*5d9d9091SRichard Lowe	fmovd	%d18, %d10
1458*5d9d9091SRichard Lowe	fmovd	%d20, %d12
1459*5d9d9091SRichard Lowe	fmovd	%d22, %d14
1460*5d9d9091SRichard Lowe	add	%o1, 128, %o1		! increment src
1461*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1462*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1463*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
1464*5d9d9091SRichard Lowe	fmovd	%d24, %d0
1465*5d9d9091SRichard Lowe	fmovd	%d26, %d2
1466*5d9d9091SRichard Lowe	fmovd	%d28, %d4
1467*5d9d9091SRichard Lowe	fmovd	%d30, %d6
1468*5d9d9091SRichard Lowe	bgt,pt	%ncc, .align_100_loop
1469*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1470*5d9d9091SRichard Lowe
1471*5d9d9091SRichard Lowe	std	%d0, [%o0]
1472*5d9d9091SRichard Lowe	std	%d2, [%o0+8]
1473*5d9d9091SRichard Lowe	std	%d4, [%o0+16]
1474*5d9d9091SRichard Lowe	std	%d6, [%o0+24]
1475*5d9d9091SRichard Lowe	ba	.remain_stuff
1476*5d9d9091SRichard Lowe	add	%o0, 32, %o0
1477*5d9d9091SRichard Lowe	! END OF align_100
1478*5d9d9091SRichard Lowe
1479*5d9d9091SRichard Lowe.align_011:
1480*5d9d9091SRichard Lowe! Alignment off by 40 bytes
1481*5d9d9091SRichard Lowe	ldd	[%o1], %d0
1482*5d9d9091SRichard Lowe	ldd	[%o1+8], %d2
1483*5d9d9091SRichard Lowe	ldd	[%o1+16], %d4
1484*5d9d9091SRichard Lowe	ldd	[%o1+24], %d6
1485*5d9d9091SRichard Lowe	ldd	[%o1+32], %d8
1486*5d9d9091SRichard Lowe	add	%o1, 40, %o1
1487*5d9d9091SRichard Lowe	sub	%o2, 40, %o2
1488*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1489*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
1490*5d9d9091SRichard Lowe.align_011_loop:
1491*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
1492*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
1493*5d9d9091SRichard Lowe
1494*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d16		! block load
1495*5d9d9091SRichard Lowe	fmovd	%d16, %d10
1496*5d9d9091SRichard Lowe	fmovd	%d18, %d12
1497*5d9d9091SRichard Lowe	fmovd	%d20, %d14
1498*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1499*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1500*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
1501*5d9d9091SRichard Lowe	fmovd	%d22, %d0
1502*5d9d9091SRichard Lowe	fmovd	%d24, %d2
1503*5d9d9091SRichard Lowe	fmovd	%d26, %d4
1504*5d9d9091SRichard Lowe	fmovd	%d28, %d6
1505*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1506*5d9d9091SRichard Lowe	fmovd	%d30, %d8
1507*5d9d9091SRichard Lowe
1508*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
1509*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d16
1510*5d9d9091SRichard Lowe	fmovd	%d16, %d10
1511*5d9d9091SRichard Lowe	fmovd	%d18, %d12
1512*5d9d9091SRichard Lowe	fmovd	%d20, %d14
1513*5d9d9091SRichard Lowe	add	%o1, 128, %o1		! increment src
1514*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1515*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1516*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
1517*5d9d9091SRichard Lowe	fmovd	%d22, %d0
1518*5d9d9091SRichard Lowe	fmovd	%d24, %d2
1519*5d9d9091SRichard Lowe	fmovd	%d26, %d4
1520*5d9d9091SRichard Lowe	fmovd	%d28, %d6
1521*5d9d9091SRichard Lowe	fmovd	%d30, %d8
1522*5d9d9091SRichard Lowe	bgt,pt	%ncc, .align_011_loop
1523*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1524*5d9d9091SRichard Lowe
1525*5d9d9091SRichard Lowe	std	%d0, [%o0]
1526*5d9d9091SRichard Lowe	std	%d2, [%o0+8]
1527*5d9d9091SRichard Lowe	std	%d4, [%o0+16]
1528*5d9d9091SRichard Lowe	std	%d6, [%o0+24]
1529*5d9d9091SRichard Lowe	std	%d8, [%o0+32]
1530*5d9d9091SRichard Lowe	ba	.remain_stuff
1531*5d9d9091SRichard Lowe	add	%o0, 40, %o0
1532*5d9d9091SRichard Lowe	! END OF align_011
1533*5d9d9091SRichard Lowe
1534*5d9d9091SRichard Lowe.align_010:
1535*5d9d9091SRichard Lowe! Alignment off by 48 bytes
1536*5d9d9091SRichard Lowe	ldd	[%o1], %d0
1537*5d9d9091SRichard Lowe	ldd	[%o1+8], %d2
1538*5d9d9091SRichard Lowe	ldd	[%o1+16], %d4
1539*5d9d9091SRichard Lowe	ldd	[%o1+24], %d6
1540*5d9d9091SRichard Lowe	ldd	[%o1+32], %d8
1541*5d9d9091SRichard Lowe	ldd	[%o1+40], %d10
1542*5d9d9091SRichard Lowe	add	%o1, 48, %o1
1543*5d9d9091SRichard Lowe	sub	%o2, 48, %o2
1544*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1545*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
1546*5d9d9091SRichard Lowe.align_010_loop:
1547*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
1548*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
1549*5d9d9091SRichard Lowe
1550*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d16		! block load
1551*5d9d9091SRichard Lowe	fmovd	%d16, %d12
1552*5d9d9091SRichard Lowe	fmovd	%d18, %d14
1553*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1554*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1555*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
1556*5d9d9091SRichard Lowe	fmovd	%d20, %d0
1557*5d9d9091SRichard Lowe	fmovd	%d22, %d2
1558*5d9d9091SRichard Lowe	fmovd	%d24, %d4
1559*5d9d9091SRichard Lowe	fmovd	%d26, %d6
1560*5d9d9091SRichard Lowe	fmovd	%d28, %d8
1561*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1562*5d9d9091SRichard Lowe	fmovd	%d30, %d10
1563*5d9d9091SRichard Lowe
1564*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
1565*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d16
1566*5d9d9091SRichard Lowe	fmovd	%d16, %d12
1567*5d9d9091SRichard Lowe	fmovd	%d18, %d14
1568*5d9d9091SRichard Lowe	add	%o1, 128, %o1	! increment src
1569*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1570*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1571*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
1572*5d9d9091SRichard Lowe	fmovd	%d20, %d0
1573*5d9d9091SRichard Lowe	fmovd	%d22, %d2
1574*5d9d9091SRichard Lowe	fmovd	%d24, %d4
1575*5d9d9091SRichard Lowe	fmovd	%d26, %d6
1576*5d9d9091SRichard Lowe	fmovd	%d28, %d8
1577*5d9d9091SRichard Lowe	fmovd	%d30, %d10
1578*5d9d9091SRichard Lowe	bgt,pt	%ncc, .align_010_loop
1579*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1580*5d9d9091SRichard Lowe
1581*5d9d9091SRichard Lowe	std	%d0, [%o0]
1582*5d9d9091SRichard Lowe	std	%d2, [%o0+8]
1583*5d9d9091SRichard Lowe	std	%d4, [%o0+16]
1584*5d9d9091SRichard Lowe	std	%d6, [%o0+24]
1585*5d9d9091SRichard Lowe	std	%d8, [%o0+32]
1586*5d9d9091SRichard Lowe	std	%d10, [%o0+40]
1587*5d9d9091SRichard Lowe	ba	.remain_stuff
1588*5d9d9091SRichard Lowe	add	%o0, 48, %o0
1589*5d9d9091SRichard Lowe	! END OF align_010
1590*5d9d9091SRichard Lowe
1591*5d9d9091SRichard Lowe.align_001:
1592*5d9d9091SRichard Lowe! Alignment off by 56 bytes
1593*5d9d9091SRichard Lowe	ldd	[%o1], %d0
1594*5d9d9091SRichard Lowe	ldd	[%o1+8], %d2
1595*5d9d9091SRichard Lowe	ldd	[%o1+16], %d4
1596*5d9d9091SRichard Lowe	ldd	[%o1+24], %d6
1597*5d9d9091SRichard Lowe	ldd	[%o1+32], %d8
1598*5d9d9091SRichard Lowe	ldd	[%o1+40], %d10
1599*5d9d9091SRichard Lowe	ldd	[%o1+48], %d12
1600*5d9d9091SRichard Lowe	add	%o1, 56, %o1
1601*5d9d9091SRichard Lowe	sub	%o2, 56, %o2
1602*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1603*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
1604*5d9d9091SRichard Lowe.align_001_loop:
1605*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
1606*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
1607*5d9d9091SRichard Lowe
1608*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d16		! block load
1609*5d9d9091SRichard Lowe	fmovd	%d16, %d14
1610*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1611*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1612*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
1613*5d9d9091SRichard Lowe	fmovd	%d18, %d0
1614*5d9d9091SRichard Lowe	fmovd	%d20, %d2
1615*5d9d9091SRichard Lowe	fmovd	%d22, %d4
1616*5d9d9091SRichard Lowe	fmovd	%d24, %d6
1617*5d9d9091SRichard Lowe	fmovd	%d26, %d8
1618*5d9d9091SRichard Lowe	fmovd	%d28, %d10
1619*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1620*5d9d9091SRichard Lowe	fmovd	%d30, %d12
1621*5d9d9091SRichard Lowe
1622*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
1623*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d16
1624*5d9d9091SRichard Lowe	fmovd	%d16, %d14
1625*5d9d9091SRichard Lowe	add	%o1, 128, %o1		! increment src
1626*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1627*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1628*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! advance dst
1629*5d9d9091SRichard Lowe	fmovd	%d18, %d0
1630*5d9d9091SRichard Lowe	fmovd	%d20, %d2
1631*5d9d9091SRichard Lowe	fmovd	%d22, %d4
1632*5d9d9091SRichard Lowe	fmovd	%d24, %d6
1633*5d9d9091SRichard Lowe	fmovd	%d26, %d8
1634*5d9d9091SRichard Lowe	fmovd	%d28, %d10
1635*5d9d9091SRichard Lowe	fmovd	%d30, %d12
1636*5d9d9091SRichard Lowe	bgt,pt	%ncc, .align_001_loop
1637*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1638*5d9d9091SRichard Lowe
1639*5d9d9091SRichard Lowe	std	%d0, [%o0]
1640*5d9d9091SRichard Lowe	std	%d2, [%o0+8]
1641*5d9d9091SRichard Lowe	std	%d4, [%o0+16]
1642*5d9d9091SRichard Lowe	std	%d6, [%o0+24]
1643*5d9d9091SRichard Lowe	std	%d8, [%o0+32]
1644*5d9d9091SRichard Lowe	std	%d10, [%o0+40]
1645*5d9d9091SRichard Lowe	std	%d12, [%o0+48]
1646*5d9d9091SRichard Lowe	ba	.remain_stuff
1647*5d9d9091SRichard Lowe	add	%o0, 56, %o0
1648*5d9d9091SRichard Lowe	! END OF align_001
1649*5d9d9091SRichard Lowe
1650*5d9d9091SRichard Lowe.align_000:
1651*5d9d9091SRichard Lowe	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1652*5d9d9091SRichard Lowe	and	%o2, 0x7f, %o2		! residue bytes in %o2
1653*5d9d9091SRichard Lowe.align_000_loop:
1654*5d9d9091SRichard Lowe	/* ---- copy line 1 of 2. ---- */
1655*5d9d9091SRichard Lowe	subcc	%o5, 128, %o5
1656*5d9d9091SRichard Lowe	ldda	[%o1]%asi,%d0
1657*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1658*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1659*5d9d9091SRichard Lowe	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1660*5d9d9091SRichard Lowe
1661*5d9d9091SRichard Lowe	/* ---- copy line 2 of 2. ---- */
1662*5d9d9091SRichard Lowe	add	%o0, 64, %o0
1663*5d9d9091SRichard Lowe	ldda	[%o1+64]%asi,%d0
1664*5d9d9091SRichard Lowe	add	%o1, 128, %o1		! increment src
1665*5d9d9091SRichard Lowe	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1666*5d9d9091SRichard Lowe	stda	%d0,[%o0]%asi
1667*5d9d9091SRichard Lowe	add	%o0, 64, %o0		! increment dst
1668*5d9d9091SRichard Lowe	bgt,pt	%ncc, .align_000_loop
1669*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1670*5d9d9091SRichard Lowe
1671*5d9d9091SRichard Lowe	! END OF align_000
1672*5d9d9091SRichard Lowe
1673*5d9d9091SRichard Lowe.remain_stuff:
1674*5d9d9091SRichard Lowe	mov	%o4, %asi		! restore %asi
1675*5d9d9091SRichard Lowe	brnz	%g5, .medlong
1676*5d9d9091SRichard Lowe	membar	#Sync
1677*5d9d9091SRichard Lowe	ba	.medlong
1678*5d9d9091SRichard Lowe	wr	%g5, %g0, %fprs
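/*
 * The fprs protocol above (shared with .unalignrejoin), written with
 * hypothetical rd_fprs()/wr_fprs() accessors purely for illustration:
 *
 *	saved = rd_fprs() & FPRS_FEF;		// kept in %g5
 *	if (saved == 0)
 *		wr_fprs(FPRS_FEF);		// enable FP for the block moves
 *	... block copy using %d0 - %d62 ...
 *	membar #Sync;
 *	if (saved == 0)
 *		wr_fprs(0);			// put fprs back as we found it
 *	goto medlong;				// integer copy of the residue
 */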
1679*5d9d9091SRichard Lowe
1680*5d9d9091SRichard Lowe	.align 16
1681*5d9d9091SRichard Lowe	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
1682*5d9d9091SRichard Lowe.unalignsetup:
1683*5d9d9091SRichard Lowe	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
1684*5d9d9091SRichard Lowe.unalignrejoin:
1685*5d9d9091SRichard Lowe	rd	%fprs, %g5		! check for unused fp
1686*5d9d9091SRichard Lowe	! if fprs.fef == 0, set it.
1687*5d9d9091SRichard Lowe	! Setting it when already set costs more than checking
1688*5d9d9091SRichard Lowe	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
1689*5d9d9091SRichard Lowe	bz,a	%ncc, 1f
1690*5d9d9091SRichard Lowe	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1691*5d9d9091SRichard Lowe1:
1692*5d9d9091SRichard Lowe	cmp	%o2, MED_UMAX		! check for medium unaligned limit
1693*5d9d9091SRichard Lowe	bge,pt	%ncc,.unalign_large
1694*5d9d9091SRichard Lowe	nop
1695*5d9d9091SRichard Lowe	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
1696*5d9d9091SRichard Lowe	and	%o2, 0x3f, %o2		! residue bytes in %o2
1697*5d9d9091SRichard Lowe	cmp	%o2, 8			! Ensure we don't load beyond
1698*5d9d9091SRichard Lowe	bgt	.unalign_adjust		! end of source buffer
1699*5d9d9091SRichard Lowe	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
1700*5d9d9091SRichard Lowe	add	%o2, 64, %o2		! adjust to leave loop
1701*5d9d9091SRichard Lowe	sub	%o5, 64, %o5		! early if necessary
1702*5d9d9091SRichard Lowe.unalign_adjust:
1703*5d9d9091SRichard Lowe	alignaddr %o1, %g0, %g0		! generate %gsr
1704*5d9d9091SRichard Lowe	add	%o1, %o5, %o1		! advance %o1 to after blocks
1705*5d9d9091SRichard Lowe	ldd	[%o4], %d0
1706*5d9d9091SRichard Lowe.unalign_loop:
1707*5d9d9091SRichard Lowe	ldd	[%o4+8], %d2
1708*5d9d9091SRichard Lowe	faligndata %d0, %d2, %d16
1709*5d9d9091SRichard Lowe	ldd	[%o4+16], %d4
1710*5d9d9091SRichard Lowe	std	%d16, [%o0]
1711*5d9d9091SRichard Lowe	faligndata %d2, %d4, %d18
1712*5d9d9091SRichard Lowe	ldd	[%o4+24], %d6
1713*5d9d9091SRichard Lowe	std	%d18, [%o0+8]
1714*5d9d9091SRichard Lowe	faligndata %d4, %d6, %d20
1715*5d9d9091SRichard Lowe	ldd	[%o4+32], %d8
1716*5d9d9091SRichard Lowe	std	%d20, [%o0+16]
1717*5d9d9091SRichard Lowe	faligndata %d6, %d8, %d22
1718*5d9d9091SRichard Lowe	ldd	[%o4+40], %d10
1719*5d9d9091SRichard Lowe	std	%d22, [%o0+24]
1720*5d9d9091SRichard Lowe	faligndata %d8, %d10, %d24
1721*5d9d9091SRichard Lowe	ldd	[%o4+48], %d12
1722*5d9d9091SRichard Lowe	std	%d24, [%o0+32]
1723*5d9d9091SRichard Lowe	faligndata %d10, %d12, %d26
1724*5d9d9091SRichard Lowe	ldd	[%o4+56], %d14
1725*5d9d9091SRichard Lowe	std	%d26, [%o0+40]
1726*5d9d9091SRichard Lowe	faligndata %d12, %d14, %d28
1727*5d9d9091SRichard Lowe	ldd	[%o4+64], %d0
1728*5d9d9091SRichard Lowe	std	%d28, [%o0+48]
1729*5d9d9091SRichard Lowe	faligndata %d14, %d0, %d30
1730*5d9d9091SRichard Lowe	add	%o4, BLOCK_SIZE, %o4
1731*5d9d9091SRichard Lowe	std	%d30, [%o0+56]
1732*5d9d9091SRichard Lowe	add	%o0, BLOCK_SIZE, %o0
1733*5d9d9091SRichard Lowe	subcc	%o5, BLOCK_SIZE, %o5
1734*5d9d9091SRichard Lowe	bgu,pt	%ncc, .unalign_loop
1735*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1736*5d9d9091SRichard Lowe	ba	.unalign_done
1737*5d9d9091SRichard Lowe	nop
1738*5d9d9091SRichard Lowe
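/*
 * A C model of the alignaddr/faligndata idiom used by the loop above
 * and by the .unalign_* block loops below (sketch only: big-endian,
 * no VIS, and it ignores the end-of-buffer trimming done around it):
 *
 *	off  = (uintptr_t)src & 7;	// what alignaddr latches in %gsr
 *	asrc = (const uint64_t *)((uintptr_t)src - off);
 *	prev = *asrc;
 *	while (n >= 8) {
 *		uint64_t next = *++asrc;
 *		// faligndata: the 8 bytes starting 'off' bytes into prev:next
 *		*(uint64_t *)dst = (off == 0) ? prev :
 *		    (prev << (off * 8)) | (next >> ((8 - off) * 8));
 *		prev = next;
 *		dst += 8; n -= 8;
 *	}
 */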
1739*5d9d9091SRichard Lowe.unalign_large:
1740*5d9d9091SRichard Lowe	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
1741*5d9d9091SRichard Lowe	bz	%ncc, .unalignsrc
1742*5d9d9091SRichard Lowe	sub	%o3, 64, %o3		! %o3 will be multiple of 8
1743*5d9d9091SRichard Lowe	neg	%o3			! bytes until dest is 64 byte aligned
1744*5d9d9091SRichard Lowe	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
1745*5d9d9091SRichard Lowe	! Move bytes according to source alignment
1746*5d9d9091SRichard Lowe	andcc	%o1, 0x1, %o5
1747*5d9d9091SRichard Lowe	bnz	%ncc, .unalignbyte	! check for byte alignment
1748*5d9d9091SRichard Lowe	nop
1749*5d9d9091SRichard Lowe	andcc	%o1, 2, %o5		! check for half word alignment
1750*5d9d9091SRichard Lowe	bnz	%ncc, .unalignhalf
1751*5d9d9091SRichard Lowe	nop
1752*5d9d9091SRichard Lowe	! Src is word aligned
1753*5d9d9091SRichard Lowe.unalignword:
1754*5d9d9091SRichard Lowe	ld	[%o1], %o4		! load 4 bytes
1755*5d9d9091SRichard Lowe	stw	%o4, [%o0]		! and store 4 bytes
1756*5d9d9091SRichard Lowe	ld	[%o1+4], %o4		! load 4 bytes
1757*5d9d9091SRichard Lowe	add	%o1, 8, %o1		! increase src ptr by 8
1758*5d9d9091SRichard Lowe	stw	%o4, [%o0+4]		! and store 4 bytes
1759*5d9d9091SRichard Lowe	subcc	%o3, 8, %o3		! decrease count by 8
1760*5d9d9091SRichard Lowe	bnz	%ncc, .unalignword
1761*5d9d9091SRichard Lowe	add	%o0, 8, %o0		! increase dst ptr by 8
1762*5d9d9091SRichard Lowe	ba	.unalignsrc
1763*5d9d9091SRichard Lowe	nop
1764*5d9d9091SRichard Lowe
1765*5d9d9091SRichard Lowe	! Src is half-word aligned
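	! In C terms one iteration below assembles (sketch, big-endian):
	!	x = ((uint64_t)half0 << 48) |	... lduh [src]
	!	    ((uint64_t)word1 << 16) |	... lduw [src+2]
	!	     (uint64_t)half2;		... lduh [src+6]
	!	*(uint64_t *)dst = x;
	! (half0/word1/half2 are just illustrative names; .unalignbyte
	! below does the same with a 1+2+2+2+1 byte pattern.)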
1766*5d9d9091SRichard Lowe.unalignhalf:
1767*5d9d9091SRichard Lowe	lduh	[%o1], %o4		! load 2 bytes
1768*5d9d9091SRichard Lowe	sllx	%o4, 32, %o5		! shift left
1769*5d9d9091SRichard Lowe	lduw	[%o1+2], %o4
1770*5d9d9091SRichard Lowe	or	%o4, %o5, %o5
1771*5d9d9091SRichard Lowe	sllx	%o5, 16, %o5
1772*5d9d9091SRichard Lowe	lduh	[%o1+6], %o4
1773*5d9d9091SRichard Lowe	or	%o4, %o5, %o5
1774*5d9d9091SRichard Lowe	stx	%o5, [%o0]
1775*5d9d9091SRichard Lowe	add	%o1, 8, %o1
1776*5d9d9091SRichard Lowe	subcc	%o3, 8, %o3
1777*5d9d9091SRichard Lowe	bnz	%ncc, .unalignhalf
1778*5d9d9091SRichard Lowe	add	%o0, 8, %o0
1779*5d9d9091SRichard Lowe	ba	.unalignsrc
1780*5d9d9091SRichard Lowe	nop
1781*5d9d9091SRichard Lowe
1782*5d9d9091SRichard Lowe	! Src is byte aligned
1783*5d9d9091SRichard Lowe.unalignbyte:
1784*5d9d9091SRichard Lowe	sub	%o0, %o1, %o0		! share pointer advance
1785*5d9d9091SRichard Lowe.unalignbyte_loop:
1786*5d9d9091SRichard Lowe	ldub	[%o1], %o4
1787*5d9d9091SRichard Lowe	sllx	%o4, 56, %o5
1788*5d9d9091SRichard Lowe	lduh	[%o1+1], %o4
1789*5d9d9091SRichard Lowe	sllx	%o4, 40, %o4
1790*5d9d9091SRichard Lowe	or	%o4, %o5, %o5
1791*5d9d9091SRichard Lowe	lduh	[%o1+3], %o4
1792*5d9d9091SRichard Lowe	sllx	%o4, 24, %o4
1793*5d9d9091SRichard Lowe	or	%o4, %o5, %o5
1794*5d9d9091SRichard Lowe	lduh	[%o1+5], %o4
1795*5d9d9091SRichard Lowe	sllx	%o4,  8, %o4
1796*5d9d9091SRichard Lowe	or	%o4, %o5, %o5
1797*5d9d9091SRichard Lowe	ldub	[%o1+7], %o4
1798*5d9d9091SRichard Lowe	or	%o4, %o5, %o5
1799*5d9d9091SRichard Lowe	stx	%o5, [%o0+%o1]
1800*5d9d9091SRichard Lowe	subcc	%o3, 8, %o3
1801*5d9d9091SRichard Lowe	bnz	%ncc, .unalignbyte_loop
1802*5d9d9091SRichard Lowe	add	%o1, 8, %o1
1803*5d9d9091SRichard Lowe	add	%o0, %o1, %o0		! restore pointer
1804*5d9d9091SRichard Lowe
1805*5d9d9091SRichard Lowe	! Destination is now block (64 byte) aligned
1806*5d9d9091SRichard Lowe.unalignsrc:
1807*5d9d9091SRichard Lowe	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
1808*5d9d9091SRichard Lowe	and	%o2, 0x3f, %o2		! residue bytes in %o2
1809*5d9d9091SRichard Lowe	add	%o2, 64, %o2		! Ensure we don't load beyond
1810*5d9d9091SRichard Lowe	sub	%o5, 64, %o5		! end of source buffer
1811*5d9d9091SRichard Lowe
1812*5d9d9091SRichard Lowe	andn	%o1, 0x3f, %o4		! %o4 has block aligned src address
1813*5d9d9091SRichard Lowe	prefetch [%o4 + (3 * BLOCK_SIZE)], #one_read
1814*5d9d9091SRichard Lowe	alignaddr %o1, %g0, %g0		! generate %gsr
1815*5d9d9091SRichard Lowe	add	%o1, %o5, %o1		! advance %o1 to after blocks
1816*5d9d9091SRichard Lowe	!
1817*5d9d9091SRichard Lowe	! Determine source alignment to correct 8 byte offset
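	! As in .aligned_to_64 above, the tests decode bits <5:3> of the
	! source address, call it b; in C terms (sketch only):
	!	b = ((uintptr_t)src >> 3) & 7;
	!	preload doublewords b..7 of the 64-byte block at %o4
	!	    (all eight for b == 0);
	!	run the .unalign_<b> loop;
	! so faligndata always has the "previous" doubleword on hand when
	! the loop block-loads the next 64 bytes.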
1818*5d9d9091SRichard Lowe	andcc	%o1, 0x20, %o3
1819*5d9d9091SRichard Lowe	brnz,pn	%o3, .unalign_1
1820*5d9d9091SRichard Lowe	nop
1821*5d9d9091SRichard Lowe	andcc	%o1, 0x10, %o3
1822*5d9d9091SRichard Lowe	brnz,pn	%o3, .unalign_01
1823*5d9d9091SRichard Lowe	nop
1824*5d9d9091SRichard Lowe	andcc	%o1, 0x08, %o3
1825*5d9d9091SRichard Lowe	brz,a	%o3, .unalign_000
1826*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1827*5d9d9091SRichard Lowe	ba	.unalign_001
1828*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1829*5d9d9091SRichard Lowe.unalign_01:
1830*5d9d9091SRichard Lowe	andcc	%o1, 0x08, %o3
1831*5d9d9091SRichard Lowe	brnz,a	%o3, .unalign_011
1832*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1833*5d9d9091SRichard Lowe	ba	.unalign_010
1834*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1835*5d9d9091SRichard Lowe.unalign_1:
1836*5d9d9091SRichard Lowe	andcc	%o1, 0x10, %o3
1837*5d9d9091SRichard Lowe	brnz,pn	%o3, .unalign_11
1838*5d9d9091SRichard Lowe	nop
1839*5d9d9091SRichard Lowe	andcc	%o1, 0x08, %o3
1840*5d9d9091SRichard Lowe	brnz,a	%o3, .unalign_101
1841*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1842*5d9d9091SRichard Lowe	ba	.unalign_100
1843*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1844*5d9d9091SRichard Lowe.unalign_11:
1845*5d9d9091SRichard Lowe	andcc	%o1, 0x08, %o3
1846*5d9d9091SRichard Lowe	brz,pn	%o3, .unalign_110
1847*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1848*5d9d9091SRichard Lowe
1849*5d9d9091SRichard Lowe.unalign_111:
1850*5d9d9091SRichard Lowe	ldd	[%o4+56], %d14
1851*5d9d9091SRichard Lowe.unalign_111_loop:
1852*5d9d9091SRichard Lowe	add	%o4, 64, %o4
1853*5d9d9091SRichard Lowe	ldda	[%o4]ASI_BLK_P, %d16
1854*5d9d9091SRichard Lowe	faligndata %d14, %d16, %d48
1855*5d9d9091SRichard Lowe	faligndata %d16, %d18, %d50
1856*5d9d9091SRichard Lowe	faligndata %d18, %d20, %d52
1857*5d9d9091SRichard Lowe	faligndata %d20, %d22, %d54
1858*5d9d9091SRichard Lowe	faligndata %d22, %d24, %d56
1859*5d9d9091SRichard Lowe	faligndata %d24, %d26, %d58
1860*5d9d9091SRichard Lowe	faligndata %d26, %d28, %d60
1861*5d9d9091SRichard Lowe	faligndata %d28, %d30, %d62
1862*5d9d9091SRichard Lowe	fmovd	%d30, %d14
1863*5d9d9091SRichard Lowe	stda	%d48, [%o0]ASI_BLK_P
1864*5d9d9091SRichard Lowe	subcc	%o5, 64, %o5
1865*5d9d9091SRichard Lowe	add	%o0, 64, %o0
1866*5d9d9091SRichard Lowe	bgu,pt	%ncc, .unalign_111_loop
1867*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1868*5d9d9091SRichard Lowe	ba	.unalign_done
1869*5d9d9091SRichard Lowe	membar	#Sync
1870*5d9d9091SRichard Lowe
1871*5d9d9091SRichard Lowe.unalign_110:
1872*5d9d9091SRichard Lowe	ldd	[%o4+48], %d12
1873*5d9d9091SRichard Lowe	ldd	[%o4+56], %d14
1874*5d9d9091SRichard Lowe.unalign_110_loop:
1875*5d9d9091SRichard Lowe	add	%o4, 64, %o4
1876*5d9d9091SRichard Lowe	ldda	[%o4]ASI_BLK_P, %d16
1877*5d9d9091SRichard Lowe	faligndata %d12, %d14, %d48
1878*5d9d9091SRichard Lowe	faligndata %d14, %d16, %d50
1879*5d9d9091SRichard Lowe	faligndata %d16, %d18, %d52
1880*5d9d9091SRichard Lowe	faligndata %d18, %d20, %d54
1881*5d9d9091SRichard Lowe	faligndata %d20, %d22, %d56
1882*5d9d9091SRichard Lowe	faligndata %d22, %d24, %d58
1883*5d9d9091SRichard Lowe	faligndata %d24, %d26, %d60
1884*5d9d9091SRichard Lowe	faligndata %d26, %d28, %d62
1885*5d9d9091SRichard Lowe	fmovd	%d28, %d12
1886*5d9d9091SRichard Lowe	fmovd	%d30, %d14
1887*5d9d9091SRichard Lowe	stda	%d48, [%o0]ASI_BLK_P
1888*5d9d9091SRichard Lowe	subcc	%o5, 64, %o5
1889*5d9d9091SRichard Lowe	add	%o0, 64, %o0
1890*5d9d9091SRichard Lowe	bgu,pt	%ncc, .unalign_110_loop
1891*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1892*5d9d9091SRichard Lowe	ba	.unalign_done
1893*5d9d9091SRichard Lowe	membar	#Sync
1894*5d9d9091SRichard Lowe
1895*5d9d9091SRichard Lowe.unalign_101:
1896*5d9d9091SRichard Lowe	ldd	[%o4+40], %d10
1897*5d9d9091SRichard Lowe	ldd	[%o4+48], %d12
1898*5d9d9091SRichard Lowe	ldd	[%o4+56], %d14
1899*5d9d9091SRichard Lowe.unalign_101_loop:
1900*5d9d9091SRichard Lowe	add	%o4, 64, %o4
1901*5d9d9091SRichard Lowe	ldda	[%o4]ASI_BLK_P, %d16
1902*5d9d9091SRichard Lowe	faligndata %d10, %d12, %d48
1903*5d9d9091SRichard Lowe	faligndata %d12, %d14, %d50
1904*5d9d9091SRichard Lowe	faligndata %d14, %d16, %d52
1905*5d9d9091SRichard Lowe	faligndata %d16, %d18, %d54
1906*5d9d9091SRichard Lowe	faligndata %d18, %d20, %d56
1907*5d9d9091SRichard Lowe	faligndata %d20, %d22, %d58
1908*5d9d9091SRichard Lowe	faligndata %d22, %d24, %d60
1909*5d9d9091SRichard Lowe	faligndata %d24, %d26, %d62
1910*5d9d9091SRichard Lowe	fmovd	%d26, %d10
1911*5d9d9091SRichard Lowe	fmovd	%d28, %d12
1912*5d9d9091SRichard Lowe	fmovd	%d30, %d14
1913*5d9d9091SRichard Lowe	stda	%d48, [%o0]ASI_BLK_P
1914*5d9d9091SRichard Lowe	subcc	%o5, 64, %o5
1915*5d9d9091SRichard Lowe	add	%o0, 64, %o0
1916*5d9d9091SRichard Lowe	bgu,pt	%ncc, .unalign_101_loop
1917*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1918*5d9d9091SRichard Lowe	ba	.unalign_done
1919*5d9d9091SRichard Lowe	membar	#Sync
1920*5d9d9091SRichard Lowe
1921*5d9d9091SRichard Lowe.unalign_100:
1922*5d9d9091SRichard Lowe	ldd	[%o4+32], %d8
1923*5d9d9091SRichard Lowe	ldd	[%o4+40], %d10
1924*5d9d9091SRichard Lowe	ldd	[%o4+48], %d12
1925*5d9d9091SRichard Lowe	ldd	[%o4+56], %d14
1926*5d9d9091SRichard Lowe.unalign_100_loop:
1927*5d9d9091SRichard Lowe	add	%o4, 64, %o4
1928*5d9d9091SRichard Lowe	ldda	[%o4]ASI_BLK_P, %d16
1929*5d9d9091SRichard Lowe	faligndata %d8, %d10, %d48
1930*5d9d9091SRichard Lowe	faligndata %d10, %d12, %d50
1931*5d9d9091SRichard Lowe	faligndata %d12, %d14, %d52
1932*5d9d9091SRichard Lowe	faligndata %d14, %d16, %d54
1933*5d9d9091SRichard Lowe	faligndata %d16, %d18, %d56
1934*5d9d9091SRichard Lowe	faligndata %d18, %d20, %d58
1935*5d9d9091SRichard Lowe	faligndata %d20, %d22, %d60
1936*5d9d9091SRichard Lowe	faligndata %d22, %d24, %d62
1937*5d9d9091SRichard Lowe	fmovd	%d24, %d8
1938*5d9d9091SRichard Lowe	fmovd	%d26, %d10
1939*5d9d9091SRichard Lowe	fmovd	%d28, %d12
1940*5d9d9091SRichard Lowe	fmovd	%d30, %d14
1941*5d9d9091SRichard Lowe	stda	%d48, [%o0]ASI_BLK_P
1942*5d9d9091SRichard Lowe	subcc	%o5, 64, %o5
1943*5d9d9091SRichard Lowe	add	%o0, 64, %o0
1944*5d9d9091SRichard Lowe	bgu,pt	%ncc, .unalign_100_loop
1945*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1946*5d9d9091SRichard Lowe	ba	.unalign_done
1947*5d9d9091SRichard Lowe	membar	#Sync
1948*5d9d9091SRichard Lowe
1949*5d9d9091SRichard Lowe.unalign_011:
1950*5d9d9091SRichard Lowe	ldd	[%o4+24], %d6
1951*5d9d9091SRichard Lowe	ldd	[%o4+32], %d8
1952*5d9d9091SRichard Lowe	ldd	[%o4+40], %d10
1953*5d9d9091SRichard Lowe	ldd	[%o4+48], %d12
1954*5d9d9091SRichard Lowe	ldd	[%o4+56], %d14
1955*5d9d9091SRichard Lowe.unalign_011_loop:
1956*5d9d9091SRichard Lowe	add	%o4, 64, %o4
1957*5d9d9091SRichard Lowe	ldda	[%o4]ASI_BLK_P, %d16
1958*5d9d9091SRichard Lowe	faligndata %d6, %d8, %d48
1959*5d9d9091SRichard Lowe	faligndata %d8, %d10, %d50
1960*5d9d9091SRichard Lowe	faligndata %d10, %d12, %d52
1961*5d9d9091SRichard Lowe	faligndata %d12, %d14, %d54
1962*5d9d9091SRichard Lowe	faligndata %d14, %d16, %d56
1963*5d9d9091SRichard Lowe	faligndata %d16, %d18, %d58
1964*5d9d9091SRichard Lowe	faligndata %d18, %d20, %d60
1965*5d9d9091SRichard Lowe	faligndata %d20, %d22, %d62
1966*5d9d9091SRichard Lowe	fmovd	%d22, %d6
1967*5d9d9091SRichard Lowe	fmovd	%d24, %d8
1968*5d9d9091SRichard Lowe	fmovd	%d26, %d10
1969*5d9d9091SRichard Lowe	fmovd	%d28, %d12
1970*5d9d9091SRichard Lowe	fmovd	%d30, %d14
1971*5d9d9091SRichard Lowe	stda	%d48, [%o0]ASI_BLK_P
1972*5d9d9091SRichard Lowe	subcc	%o5, 64, %o5
1973*5d9d9091SRichard Lowe	add	%o0, 64, %o0
1974*5d9d9091SRichard Lowe	bgu,pt	%ncc, .unalign_011_loop
1975*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1976*5d9d9091SRichard Lowe	ba	.unalign_done
1977*5d9d9091SRichard Lowe	membar	#Sync
1978*5d9d9091SRichard Lowe
1979*5d9d9091SRichard Lowe.unalign_010:
1980*5d9d9091SRichard Lowe	ldd	[%o4+16], %d4
1981*5d9d9091SRichard Lowe	ldd	[%o4+24], %d6
1982*5d9d9091SRichard Lowe	ldd	[%o4+32], %d8
1983*5d9d9091SRichard Lowe	ldd	[%o4+40], %d10
1984*5d9d9091SRichard Lowe	ldd	[%o4+48], %d12
1985*5d9d9091SRichard Lowe	ldd	[%o4+56], %d14
1986*5d9d9091SRichard Lowe.unalign_010_loop:
1987*5d9d9091SRichard Lowe	add	%o4, 64, %o4
1988*5d9d9091SRichard Lowe	ldda	[%o4]ASI_BLK_P, %d16
1989*5d9d9091SRichard Lowe	faligndata %d4, %d6, %d48
1990*5d9d9091SRichard Lowe	faligndata %d6, %d8, %d50
1991*5d9d9091SRichard Lowe	faligndata %d8, %d10, %d52
1992*5d9d9091SRichard Lowe	faligndata %d10, %d12, %d54
1993*5d9d9091SRichard Lowe	faligndata %d12, %d14, %d56
1994*5d9d9091SRichard Lowe	faligndata %d14, %d16, %d58
1995*5d9d9091SRichard Lowe	faligndata %d16, %d18, %d60
1996*5d9d9091SRichard Lowe	faligndata %d18, %d20, %d62
1997*5d9d9091SRichard Lowe	fmovd	%d20, %d4
1998*5d9d9091SRichard Lowe	fmovd	%d22, %d6
1999*5d9d9091SRichard Lowe	fmovd	%d24, %d8
2000*5d9d9091SRichard Lowe	fmovd	%d26, %d10
2001*5d9d9091SRichard Lowe	fmovd	%d28, %d12
2002*5d9d9091SRichard Lowe	fmovd	%d30, %d14
2003*5d9d9091SRichard Lowe	stda	%d48, [%o0]ASI_BLK_P
2004*5d9d9091SRichard Lowe	subcc	%o5, 64, %o5
2005*5d9d9091SRichard Lowe	add	%o0, 64, %o0
2006*5d9d9091SRichard Lowe	bgu,pt	%ncc, .unalign_010_loop
2007*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
2008*5d9d9091SRichard Lowe	ba	.unalign_done
2009*5d9d9091SRichard Lowe	membar	#Sync
2010*5d9d9091SRichard Lowe
2011*5d9d9091SRichard Lowe.unalign_001:
2012*5d9d9091SRichard Lowe	ldd	[%o4+8], %d2
2013*5d9d9091SRichard Lowe	ldd	[%o4+16], %d4
2014*5d9d9091SRichard Lowe	ldd	[%o4+24], %d6
2015*5d9d9091SRichard Lowe	ldd	[%o4+32], %d8
2016*5d9d9091SRichard Lowe	ldd	[%o4+40], %d10
2017*5d9d9091SRichard Lowe	ldd	[%o4+48], %d12
2018*5d9d9091SRichard Lowe	ldd	[%o4+56], %d14
2019*5d9d9091SRichard Lowe.unalign_001_loop:
2020*5d9d9091SRichard Lowe	add	%o4, 64, %o4
2021*5d9d9091SRichard Lowe	ldda	[%o4]ASI_BLK_P, %d16
2022*5d9d9091SRichard Lowe	faligndata %d2, %d4, %d48
2023*5d9d9091SRichard Lowe	faligndata %d4, %d6, %d50
2024*5d9d9091SRichard Lowe	faligndata %d6, %d8, %d52
2025*5d9d9091SRichard Lowe	faligndata %d8, %d10, %d54
2026*5d9d9091SRichard Lowe	faligndata %d10, %d12, %d56
2027*5d9d9091SRichard Lowe	faligndata %d12, %d14, %d58
2028*5d9d9091SRichard Lowe	faligndata %d14, %d16, %d60
2029*5d9d9091SRichard Lowe	faligndata %d16, %d18, %d62
2030*5d9d9091SRichard Lowe	fmovd	%d18, %d2
2031*5d9d9091SRichard Lowe	fmovd	%d20, %d4
2032*5d9d9091SRichard Lowe	fmovd	%d22, %d6
2033*5d9d9091SRichard Lowe	fmovd	%d24, %d8
2034*5d9d9091SRichard Lowe	fmovd	%d26, %d10
2035*5d9d9091SRichard Lowe	fmovd	%d28, %d12
2036*5d9d9091SRichard Lowe	fmovd	%d30, %d14
2037*5d9d9091SRichard Lowe	stda	%d48, [%o0]ASI_BLK_P
2038*5d9d9091SRichard Lowe	subcc	%o5, 64, %o5
2039*5d9d9091SRichard Lowe	add	%o0, 64, %o0
2040*5d9d9091SRichard Lowe	bgu,pt	%ncc, .unalign_001_loop
2041*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
2042*5d9d9091SRichard Lowe	ba	.unalign_done
2043*5d9d9091SRichard Lowe	membar	#Sync
2044*5d9d9091SRichard Lowe
2045*5d9d9091SRichard Lowe.unalign_000:
2046*5d9d9091SRichard Lowe	ldda	[%o4]ASI_BLK_P, %d0
2047*5d9d9091SRichard Lowe.unalign_000_loop:
2048*5d9d9091SRichard Lowe	add	%o4, 64, %o4
2049*5d9d9091SRichard Lowe	ldda	[%o4]ASI_BLK_P, %d16
2050*5d9d9091SRichard Lowe	faligndata %d0, %d2, %d48
2051*5d9d9091SRichard Lowe	faligndata %d2, %d4, %d50
2052*5d9d9091SRichard Lowe	faligndata %d4, %d6, %d52
2053*5d9d9091SRichard Lowe	faligndata %d6, %d8, %d54
2054*5d9d9091SRichard Lowe	faligndata %d8, %d10, %d56
2055*5d9d9091SRichard Lowe	faligndata %d10, %d12, %d58
2056*5d9d9091SRichard Lowe	faligndata %d12, %d14, %d60
2057*5d9d9091SRichard Lowe	faligndata %d14, %d16, %d62
2058*5d9d9091SRichard Lowe	fmovd	%d16, %d0
2059*5d9d9091SRichard Lowe	fmovd	%d18, %d2
2060*5d9d9091SRichard Lowe	fmovd	%d20, %d4
2061*5d9d9091SRichard Lowe	fmovd	%d22, %d6
2062*5d9d9091SRichard Lowe	fmovd	%d24, %d8
2063*5d9d9091SRichard Lowe	fmovd	%d26, %d10
2064*5d9d9091SRichard Lowe	fmovd	%d28, %d12
2065*5d9d9091SRichard Lowe	fmovd	%d30, %d14
2066*5d9d9091SRichard Lowe	stda	%d48, [%o0]ASI_BLK_P
2067*5d9d9091SRichard Lowe	subcc	%o5, 64, %o5
2068*5d9d9091SRichard Lowe	add	%o0, 64, %o0
2069*5d9d9091SRichard Lowe	bgu,pt	%ncc, .unalign_000_loop
2070*5d9d9091SRichard Lowe	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
2071*5d9d9091SRichard Lowe	membar	#Sync
2072*5d9d9091SRichard Lowe
2073*5d9d9091SRichard Lowe.unalign_done:
2074*5d9d9091SRichard Lowe	! Handle trailing bytes, 64 to 127
2075*5d9d9091SRichard Lowe	! Dest long word aligned, Src not long word aligned
2076*5d9d9091SRichard Lowe	cmp	%o2, 15
2077*5d9d9091SRichard Lowe	bleu	%ncc, .unalign_short
2078*5d9d9091SRichard Lowe
2079*5d9d9091SRichard Lowe	andn	%o2, 0x7, %o5		! %o5 is multiple of 8
2080*5d9d9091SRichard Lowe	and	%o2, 0x7, %o2		! residue bytes in %o2
2081*5d9d9091SRichard Lowe	add	%o2, 8, %o2
2082*5d9d9091SRichard Lowe	sub	%o5, 8, %o5		! ensure we don't load past end of src
2083*5d9d9091SRichard Lowe	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
2084*5d9d9091SRichard Lowe	add	%o1, %o5, %o1		! advance %o1 to after multiple of 8
2085*5d9d9091SRichard Lowe	ldd	[%o4], %d0		! fetch partial word
2086*5d9d9091SRichard Lowe.unalign_by8:
2087*5d9d9091SRichard Lowe	ldd	[%o4+8], %d2
2088*5d9d9091SRichard Lowe	add	%o4, 8, %o4
2089*5d9d9091SRichard Lowe	faligndata %d0, %d2, %d16
2090*5d9d9091SRichard Lowe	subcc	%o5, 8, %o5
2091*5d9d9091SRichard Lowe	std	%d16, [%o0]
2092*5d9d9091SRichard Lowe	fmovd	%d2, %d0
2093*5d9d9091SRichard Lowe	bgu,pt	%ncc, .unalign_by8
2094*5d9d9091SRichard Lowe	add	%o0, 8, %o0
2095*5d9d9091SRichard Lowe
2096*5d9d9091SRichard Lowe.unalign_short:
2097*5d9d9091SRichard Lowe	brnz	%g5, .smallrest
2098*5d9d9091SRichard Lowe	nop
2099*5d9d9091SRichard Lowe	ba	.smallrest
2100*5d9d9091SRichard Lowe	wr	%g5, %g0, %fprs
2101*5d9d9091SRichard Lowe#else	/* NIAGARA2_IMPL */
2102*5d9d9091SRichard Lowe.forcpy:
2103*5d9d9091SRichard Lowe	mov	%o0, %g5		! save des address for return val
2104*5d9d9091SRichard Lowe	cmp	%o2, 17			! for small counts copy bytes
2105*5d9d9091SRichard Lowe	bleu,pt	%ncc, .dbytecp
2106*5d9d9091SRichard Lowe	nop
2107*5d9d9091SRichard Lowe
2108*5d9d9091SRichard Lowe	cmp	%o2, 0x80		! For lengths less than 128 bytes do not
2109*5d9d9091SRichard Lowe	bleu,pn	%ncc, .no_blkcpy	! copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2110*5d9d9091SRichard Lowe
2111*5d9d9091SRichard Lowe	/*
2112*5d9d9091SRichard Lowe	 * Make sure that source and destination buffers are 64 bytes apart.
2113*5d9d9091SRichard Lowe	 * If they are not, do not use ASI_BLK_INIT_ST_QUAD_LDD_P asi to copy
2114*5d9d9091SRichard Lowe	 * the data.
2115*5d9d9091SRichard Lowe	 */
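	/*
	 * In C terms the test below is (sketch; unsigned pointer math):
	 *
	 *	if ((uintptr_t)src < (uintptr_t)dst ||
	 *	    (uintptr_t)src - (uintptr_t)dst >= 0x40)
	 *		goto blkalgndst;	// block-init copy is safe
	 *	goto no_blkcpy;			// dst <= src < dst + 64
	 *
	 * The initializing store touches a whole 64-byte destination line
	 * at a time, so the source must not still lie within the line
	 * being initialized.
	 */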
2116*5d9d9091SRichard Lowe	subcc	%o1, %o0, %o3
2117*5d9d9091SRichard Lowe	blu	%ncc, .blkalgndst
2118*5d9d9091SRichard Lowe	cmp	%o3, 0x40		! if src - dst >= 0x40
2119*5d9d9091SRichard Lowe	bgeu,pt	%ncc, .blkalgndst	! then use ASI_BLK_INIT_ST_QUAD_LDD_P
2120*5d9d9091SRichard Lowe.no_blkcpy:
2121*5d9d9091SRichard Lowe	andcc	%o1, 3, %o5		! is src word aligned
2122*5d9d9091SRichard Lowe	bz,pn	%ncc, .aldst
2123*5d9d9091SRichard Lowe	cmp	%o5, 2			! is src half-word aligned
2124*5d9d9091SRichard Lowe	be,pt	%ncc, .s2algn
2125*5d9d9091SRichard Lowe	cmp	%o5, 3			! src is byte aligned
2126*5d9d9091SRichard Lowe.s1algn:ldub	[%o1], %o3		! move 1 or 3 bytes to align it
2127*5d9d9091SRichard Lowe	inc	1, %o1
2128*5d9d9091SRichard Lowe	stb	%o3, [%g5]		! move a byte to align src
2129*5d9d9091SRichard Lowe	inc	1, %g5
2130*5d9d9091SRichard Lowe	bne,pt	%ncc, .s2algn
2131*5d9d9091SRichard Lowe	dec	%o2
2132*5d9d9091SRichard Lowe	b	.ald			! now go align dest
2133*5d9d9091SRichard Lowe	andcc	%g5, 3, %o5
2134*5d9d9091SRichard Lowe
2135*5d9d9091SRichard Lowe.s2algn:lduh	[%o1], %o3		! know src is 2 byte aligned
2136*5d9d9091SRichard Lowe	inc	2, %o1
2137*5d9d9091SRichard Lowe	srl	%o3, 8, %o4
2138*5d9d9091SRichard Lowe	stb	%o4, [%g5]		! have to do bytes,
2139*5d9d9091SRichard Lowe	stb	%o3, [%g5 + 1]		! don't know dst alignment
2140*5d9d9091SRichard Lowe	inc	2, %g5
2141*5d9d9091SRichard Lowe	dec	2, %o2
2142*5d9d9091SRichard Lowe
2143*5d9d9091SRichard Lowe.aldst:	andcc	%g5, 3, %o5		! align the destination address
2144*5d9d9091SRichard Lowe.ald:	bz,pn	%ncc, .w4cp
2145*5d9d9091SRichard Lowe	cmp	%o5, 2
2146*5d9d9091SRichard Lowe	bz,pn	%ncc, .w2cp
2147*5d9d9091SRichard Lowe	cmp	%o5, 3
2148*5d9d9091SRichard Lowe.w3cp:	lduw	[%o1], %o4
2149*5d9d9091SRichard Lowe	inc	4, %o1
2150*5d9d9091SRichard Lowe	srl	%o4, 24, %o5
2151*5d9d9091SRichard Lowe	stb	%o5, [%g5]
2152*5d9d9091SRichard Lowe	bne,pt	%ncc, .w1cp
2153*5d9d9091SRichard Lowe	inc	%g5
2154*5d9d9091SRichard Lowe	dec	1, %o2
2155*5d9d9091SRichard Lowe	andn	%o2, 3, %o3		! o3 is aligned word count
2156*5d9d9091SRichard Lowe	dec	4, %o3			! avoid reading beyond tail of src
2157*5d9d9091SRichard Lowe	sub	%o1, %g5, %o1		! o1 gets the difference
2158*5d9d9091SRichard Lowe
2159*5d9d9091SRichard Lowe1:	sll	%o4, 8, %g1		! save residual bytes
2160*5d9d9091SRichard Lowe	lduw	[%o1+%g5], %o4
2161*5d9d9091SRichard Lowe	deccc	4, %o3
2162*5d9d9091SRichard Lowe	srl	%o4, 24, %o5		! merge with residual
2163*5d9d9091SRichard Lowe	or	%o5, %g1, %g1
2164*5d9d9091SRichard Lowe	st	%g1, [%g5]
2165*5d9d9091SRichard Lowe	bnz,pt	%ncc, 1b
2166*5d9d9091SRichard Lowe	inc	4, %g5
2167*5d9d9091SRichard Lowe	sub	%o1, 3, %o1		! used one byte of last word read
2168*5d9d9091SRichard Lowe	and	%o2, 3, %o2
2169*5d9d9091SRichard Lowe	b	7f
2170*5d9d9091SRichard Lowe	inc	4, %o2
2171*5d9d9091SRichard Lowe
2172*5d9d9091SRichard Lowe.w1cp:	srl	%o4, 8, %o5
2173*5d9d9091SRichard Lowe	sth	%o5, [%g5]
2174*5d9d9091SRichard Lowe	inc	2, %g5
2175*5d9d9091SRichard Lowe	dec	3, %o2
2176*5d9d9091SRichard Lowe	andn	%o2, 3, %o3		! o3 is aligned word count
2177*5d9d9091SRichard Lowe	dec	4, %o3			! avoid reading beyond tail of src
2178*5d9d9091SRichard Lowe	sub	%o1, %g5, %o1		! o1 gets the difference
2179*5d9d9091SRichard Lowe
2180*5d9d9091SRichard Lowe2:	sll	%o4, 24, %g1		! save residual bytes
2181*5d9d9091SRichard Lowe	lduw	[%o1+%g5], %o4
2182*5d9d9091SRichard Lowe	deccc	4, %o3
2183*5d9d9091SRichard Lowe	srl	%o4, 8, %o5		! merge with residual
2184*5d9d9091SRichard Lowe	or	%o5, %g1, %g1
2185*5d9d9091SRichard Lowe	st	%g1, [%g5]
2186*5d9d9091SRichard Lowe	bnz,pt	%ncc, 2b
2187*5d9d9091SRichard Lowe	inc	4, %g5
2188*5d9d9091SRichard Lowe	sub	%o1, 1, %o1		! used three bytes of last word read
2189*5d9d9091SRichard Lowe	and	%o2, 3, %o2
2190*5d9d9091SRichard Lowe	b	7f
2191*5d9d9091SRichard Lowe	inc	4, %o2
2192*5d9d9091SRichard Lowe
2193*5d9d9091SRichard Lowe.w2cp:	lduw	[%o1], %o4
2194*5d9d9091SRichard Lowe	inc	4, %o1
2195*5d9d9091SRichard Lowe	srl	%o4, 16, %o5
2196*5d9d9091SRichard Lowe	sth	%o5, [%g5]
2197*5d9d9091SRichard Lowe	inc	2, %g5
2198*5d9d9091SRichard Lowe	dec	2, %o2
2199*5d9d9091SRichard Lowe	andn	%o2, 3, %o3		! o3 is aligned word count
2200*5d9d9091SRichard Lowe	dec	4, %o3			! avoid reading beyond tail of src
2201*5d9d9091SRichard Lowe	sub	%o1, %g5, %o1		! o1 gets the difference
2202*5d9d9091SRichard Lowe
2203*5d9d9091SRichard Lowe3:	sll	%o4, 16, %g1		! save residual bytes
2204*5d9d9091SRichard Lowe	lduw	[%o1+%g5], %o4
2205*5d9d9091SRichard Lowe	deccc	4, %o3
2206*5d9d9091SRichard Lowe	srl	%o4, 16, %o5		! merge with residual
2207*5d9d9091SRichard Lowe	or	%o5, %g1, %g1
2208*5d9d9091SRichard Lowe	st	%g1, [%g5]
2209*5d9d9091SRichard Lowe	bnz,pt	%ncc, 3b
2210*5d9d9091SRichard Lowe	inc	4, %g5
2211*5d9d9091SRichard Lowe	sub	%o1, 2, %o1		! used two bytes of last word read
2212*5d9d9091SRichard Lowe	and	%o2, 3, %o2
2213*5d9d9091SRichard Lowe	b	7f
2214*5d9d9091SRichard Lowe	inc	4, %o2
2215*5d9d9091SRichard Lowe
2216*5d9d9091SRichard Lowe.w4cp:	andn	%o2, 3, %o3		! o3 is aligned word count
2217*5d9d9091SRichard Lowe	sub	%o1, %g5, %o1		! o1 gets the difference
2218*5d9d9091SRichard Lowe
2219*5d9d9091SRichard Lowe1:	lduw	[%o1+%g5], %o4		! read from address
2220*5d9d9091SRichard Lowe	deccc	4, %o3			! decrement count
2221*5d9d9091SRichard Lowe	st	%o4, [%g5]		! write at destination address
2222*5d9d9091SRichard Lowe	bgu,pt	%ncc, 1b
2223*5d9d9091SRichard Lowe	inc	4, %g5			! increment to address
2224*5d9d9091SRichard Lowe	b	7f
2225*5d9d9091SRichard Lowe	and	%o2, 3, %o2		! number of leftover bytes, if any
2226*5d9d9091SRichard Lowe
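/*
 * The .w1cp/.w2cp/.w3cp paths above implement a shift-and-merge word
 * copy for the case where the source is word aligned but the
 * destination is not: a few leading bytes are stored to word-align
 * the destination, then each aligned destination word is built from
 * the low bytes of the previous source word and the high bytes of the
 * next one (.w4cp needs no merging).  A rough C sketch of the idea,
 * not the exact code; the shifts assume SPARC's big-endian byte order
 * and 0 < lsh < 32, where lsh is 8 times the number of bytes of the
 * previous word already stored:
 *
 *	void
 *	merge_copy(uint32_t *dst, const uint32_t *src, size_t nwords,
 *	    uint32_t first, unsigned lsh)
 *	{
 *		unsigned rsh = 32 - lsh;
 *		uint32_t prev = first;
 *
 *		while (nwords-- != 0) {
 *			uint32_t next = *src++;
 *			*dst++ = (prev << lsh) | (next >> rsh);
 *			prev = next;
 *		}
 *	}
 *
 * The remaining tail bytes are finished by the byte loop at label 7
 * below.
 */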
2227*5d9d9091SRichard Lowe	!
2228*5d9d9091SRichard Lowe	! differenced byte copy, works with any alignment
2229*5d9d9091SRichard Lowe	!
2230*5d9d9091SRichard Lowe.dbytecp:
2231*5d9d9091SRichard Lowe	b	7f
2232*5d9d9091SRichard Lowe	sub	%o1, %g5, %o1		! o1 gets the difference
2233*5d9d9091SRichard Lowe
2234*5d9d9091SRichard Lowe4:	stb	%o4, [%g5]		! write to address
2235*5d9d9091SRichard Lowe	inc	%g5			! inc to address
2236*5d9d9091SRichard Lowe7:	deccc	%o2			! decrement count
2237*5d9d9091SRichard Lowe	bgeu,a,pt %ncc, 4b		! loop till done
2238*5d9d9091SRichard Lowe	ldub	[%o1+%g5], %o4		! read from address
2239*5d9d9091SRichard Lowe	retl				! %o0 was preserved
2240*5d9d9091SRichard Lowe	nop
2241*5d9d9091SRichard Lowe
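/*
 * .dbytecp above is a "differenced" byte copy that works for any
 * alignment: %o1 is rewritten to hold (src - dst), so the loop
 * advances only the destination pointer in %g5 and recovers each
 * source address as [%o1 + %g5].  Roughly, in C (diff_copy and its
 * parameter names are illustrative only):
 *
 *	void
 *	diff_copy(char *dst, const char *src, size_t n)
 *	{
 *		ptrdiff_t diff = src - dst;
 *
 *		while (n-- != 0) {
 *			*dst = dst[diff];
 *			dst++;
 *		}
 *	}
 */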
2242*5d9d9091SRichard Lowe.blkalgndst:
2243*5d9d9091SRichard Lowe	save	%sp, -SA(MINFRAME), %sp
2244*5d9d9091SRichard Lowe
2245*5d9d9091SRichard Lowe	! Block (64 bytes) align the destination.
2246*5d9d9091SRichard Lowe	andcc	%i0, 0x3f, %i3		! is dst block aligned
2247*5d9d9091SRichard Lowe	bz	%ncc, .chksrc		! dst already block aligned
2248*5d9d9091SRichard Lowe	sub	%i3, 0x40, %i3
2249*5d9d9091SRichard Lowe	neg	%i3			! bytes till dst 64 bytes aligned
2250*5d9d9091SRichard Lowe	sub	%i2, %i3, %i2		! update i2 with new count
2251*5d9d9091SRichard Lowe
2252*5d9d9091SRichard Lowe	! Based on source and destination alignment, copy
2253*5d9d9091SRichard Lowe	! 8 bytes, 4 bytes, 2 bytes or 1 byte at a time.
2254*5d9d9091SRichard Lowe
2255*5d9d9091SRichard Lowe	! Is dst & src 8B aligned
2256*5d9d9091SRichard Lowe	or	%i0, %i1, %o2
2257*5d9d9091SRichard Lowe	andcc	%o2, 0x7, %g0
2258*5d9d9091SRichard Lowe	bz	%ncc, .alewdcp
2259*5d9d9091SRichard Lowe	nop
2260*5d9d9091SRichard Lowe
2261*5d9d9091SRichard Lowe	! Is dst & src 4B aligned
2262*5d9d9091SRichard Lowe	andcc	%o2, 0x3, %g0
2263*5d9d9091SRichard Lowe	bz	%ncc, .alwdcp
2264*5d9d9091SRichard Lowe	nop
2265*5d9d9091SRichard Lowe
2266*5d9d9091SRichard Lowe	! Is dst & src 2B aligned
2267*5d9d9091SRichard Lowe	andcc	%o2, 0x1, %g0
2268*5d9d9091SRichard Lowe	bz	%ncc, .alhlfwdcp
2269*5d9d9091SRichard Lowe	nop
2270*5d9d9091SRichard Lowe
2271*5d9d9091SRichard Lowe	! 1B aligned
2272*5d9d9091SRichard Lowe1:	ldub	[%i1], %o2
2273*5d9d9091SRichard Lowe	stb	%o2, [%i0]
2274*5d9d9091SRichard Lowe	inc	%i1
2275*5d9d9091SRichard Lowe	deccc	%i3
2276*5d9d9091SRichard Lowe	bgu,pt	%ncc, 1b
2277*5d9d9091SRichard Lowe	inc	%i0
2278*5d9d9091SRichard Lowe
2279*5d9d9091SRichard Lowe	ba	.chksrc
2280*5d9d9091SRichard Lowe	nop
2281*5d9d9091SRichard Lowe
2282*5d9d9091SRichard Lowe	! dst & src 4B aligned
2283*5d9d9091SRichard Lowe.alwdcp:
2284*5d9d9091SRichard Lowe	ld	[%i1], %o2
2285*5d9d9091SRichard Lowe	st	%o2, [%i0]
2286*5d9d9091SRichard Lowe	add	%i1, 0x4, %i1
2287*5d9d9091SRichard Lowe	subcc	%i3, 0x4, %i3
2288*5d9d9091SRichard Lowe	bgu,pt	%ncc, .alwdcp
2289*5d9d9091SRichard Lowe	add	%i0, 0x4, %i0
2290*5d9d9091SRichard Lowe
2291*5d9d9091SRichard Lowe	ba	.chksrc
2292*5d9d9091SRichard Lowe	nop
2293*5d9d9091SRichard Lowe
2294*5d9d9091SRichard Lowe	! dst & src 2B aligned
2295*5d9d9091SRichard Lowe.alhlfwdcp:
2296*5d9d9091SRichard Lowe	lduh	[%i1], %o2
2297*5d9d9091SRichard Lowe	stuh	%o2, [%i0]
2298*5d9d9091SRichard Lowe	add	%i1, 0x2, %i1
2299*5d9d9091SRichard Lowe	subcc	%i3, 0x2, %i3
2300*5d9d9091SRichard Lowe	bgu,pt	%ncc, .alhlfwdcp
2301*5d9d9091SRichard Lowe	add	%i0, 0x2, %i0
2302*5d9d9091SRichard Lowe
2303*5d9d9091SRichard Lowe	ba	.chksrc
2304*5d9d9091SRichard Lowe	nop
2305*5d9d9091SRichard Lowe
2306*5d9d9091SRichard Lowe	! dst & src 8B aligned
2307*5d9d9091SRichard Lowe.alewdcp:
2308*5d9d9091SRichard Lowe	ldx	[%i1], %o2
2309*5d9d9091SRichard Lowe	stx	%o2, [%i0]
2310*5d9d9091SRichard Lowe	add	%i1, 0x8, %i1
2311*5d9d9091SRichard Lowe	subcc	%i3, 0x8, %i3
2312*5d9d9091SRichard Lowe	bgu,pt	%ncc, .alewdcp
2313*5d9d9091SRichard Lowe	add	%i0, 0x8, %i0
2314*5d9d9091SRichard Lowe
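/*
 * The code above peels off just enough leading bytes to bring the
 * destination to a 64-byte block boundary, using the widest access
 * that both pointers allow.  A rough C rendering, where dst, src and
 * n stand for %i0, %i1 and %i2 (lead is the byte count computed into
 * %i3):
 *
 *	lead = (0x40 - ((uintptr_t)dst & 0x3f)) & 0x3f;
 *	n -= lead;
 *	if ((((uintptr_t)dst | (uintptr_t)src) & 0x7) == 0) {
 *		for (; lead != 0; lead -= 8, dst += 8, src += 8)
 *			*(uint64_t *)dst = *(const uint64_t *)src;
 *	} else if ((((uintptr_t)dst | (uintptr_t)src) & 0x3) == 0) {
 *		for (; lead != 0; lead -= 4, dst += 4, src += 4)
 *			*(uint32_t *)dst = *(const uint32_t *)src;
 *	} else if ((((uintptr_t)dst | (uintptr_t)src) & 0x1) == 0) {
 *		for (; lead != 0; lead -= 2, dst += 2, src += 2)
 *			*(uint16_t *)dst = *(const uint16_t *)src;
 *	} else {
 *		for (; lead != 0; lead--, dst++, src++)
 *			*dst = *src;
 *	}
 *
 * Because the destination is aligned to the chosen access size, lead
 * is always a multiple of that size.
 */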
2315*5d9d9091SRichard Lowe	! Now Destination is block (64 bytes) aligned
2316*5d9d9091SRichard Lowe.chksrc:
2317*5d9d9091SRichard Lowe	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
2318*5d9d9091SRichard Lowe	sub	%i2, %i3, %i2		! Residue bytes in %i2
2319*5d9d9091SRichard Lowe	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2320*5d9d9091SRichard Lowe	andcc	%i1, 0xf, %l1		! is src quadword aligned
2321*5d9d9091SRichard Lowe	bz,pn	%ncc, .blkcpy		! src offset in %l1
2322*5d9d9091SRichard Lowe	nop
2323*5d9d9091SRichard Lowe	cmp	%l1, 0x8
2324*5d9d9091SRichard Lowe	bgu	%ncc, .cpy_upper_double
2325*5d9d9091SRichard Lowe	nop
2326*5d9d9091SRichard Lowe	blu	%ncc, .cpy_lower_double
2327*5d9d9091SRichard Lowe	nop
2328*5d9d9091SRichard Lowe
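/*
 * At this point %i3 holds the byte count rounded down to a multiple
 * of 64 and %i2 the residue.  The block-copy strategy is chosen from
 * the source's offset within a 16-byte quadword (%l1); the shift
 * counts passed to the merging loops are in bits.  Roughly, in C
 * (blkcpy, merge_blkcpy and dword_blkcpy are illustrative names for
 * the paths below, not real functions):
 *
 *	off = (uintptr_t)src & 0xf;
 *	if (off == 0)
 *		blkcpy(dst, src, blk);			// .blkcpy
 *	else if (off < 8)
 *		merge_blkcpy(dst, src, blk, 8 * off);	// .cpy_lower_double
 *	else if (off == 8)
 *		dword_blkcpy(dst, src, blk);		// falls through below
 *	else
 *		merge_blkcpy(dst, src, blk,
 *		    8 * (off - 8));			// .cpy_upper_double
 */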
2329*5d9d9091SRichard Lowe	! Falls through when source offset is equal to 8 i.e.
2330*5d9d9091SRichard Lowe	! source is double word aligned.
2331*5d9d9091SRichard Lowe	! In this case no shift/merge of data is required
2332*5d9d9091SRichard Lowe	sub	%i1, %l1, %i1		! align the src at 16 bytes.
2333*5d9d9091SRichard Lowe	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
2334*5d9d9091SRichard Lowe	prefetch [%o0+0x0], #one_read
2335*5d9d9091SRichard Lowe	ldda	[%i1+0x0]%asi, %o2
2336*5d9d9091SRichard Loweloop0:
2337*5d9d9091SRichard Lowe	ldda	[%i1+0x10]%asi, %o4
2338*5d9d9091SRichard Lowe	prefetch [%o0+0x40], #one_read
2339*5d9d9091SRichard Lowe
2340*5d9d9091SRichard Lowe	stxa	%o3, [%i0+0x0]%asi
2341*5d9d9091SRichard Lowe	stxa	%o4, [%i0+0x8]%asi
2342*5d9d9091SRichard Lowe
2343*5d9d9091SRichard Lowe	ldda	[%i1+0x20]%asi, %o2
2344*5d9d9091SRichard Lowe	stxa	%o5, [%i0+0x10]%asi
2345*5d9d9091SRichard Lowe	stxa	%o2, [%i0+0x18]%asi
2346*5d9d9091SRichard Lowe
2347*5d9d9091SRichard Lowe	ldda	[%i1+0x30]%asi, %o4
2348*5d9d9091SRichard Lowe	stxa	%o3, [%i0+0x20]%asi
2349*5d9d9091SRichard Lowe	stxa	%o4, [%i0+0x28]%asi
2350*5d9d9091SRichard Lowe
2351*5d9d9091SRichard Lowe	ldda	[%i1+0x40]%asi, %o2
2352*5d9d9091SRichard Lowe	stxa	%o5, [%i0+0x30]%asi
2353*5d9d9091SRichard Lowe	stxa	%o2, [%i0+0x38]%asi
2354*5d9d9091SRichard Lowe
2355*5d9d9091SRichard Lowe	add	%o0, 0x40, %o0
2356*5d9d9091SRichard Lowe	add	%i1, 0x40, %i1
2357*5d9d9091SRichard Lowe	subcc	%i3, 0x40, %i3
2358*5d9d9091SRichard Lowe	bgu,pt	%ncc, loop0
2359*5d9d9091SRichard Lowe	add	%i0, 0x40, %i0
2360*5d9d9091SRichard Lowe	ba	.blkdone
2361*5d9d9091SRichard Lowe	add	%i1, %l1, %i1		! increment the source by src offset
2362*5d9d9091SRichard Lowe
2363*5d9d9091SRichard Lowe.cpy_lower_double:
2364*5d9d9091SRichard Lowe	sub	%i1, %l1, %i1		! align the src at 16 bytes.
2365*5d9d9091SRichard Lowe	sll	%l1, 3, %l2		! %l2 left shift
2366*5d9d9091SRichard Lowe	mov	0x40, %l3
2367*5d9d9091SRichard Lowe	sub	%l3, %l2, %l3		! %l3 right shift = (64 - left shift)
2368*5d9d9091SRichard Lowe	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
2369*5d9d9091SRichard Lowe	prefetch [%o0+0x0], #one_read
2370*5d9d9091SRichard Lowe	ldda	[%i1+0x0]%asi, %o2	! partial data in %o2; %o3 has
2371*5d9d9091SRichard Lowe					! complete data
2372*5d9d9091SRichard Loweloop1:
2373*5d9d9091SRichard Lowe	ldda	[%i1+0x10]%asi, %o4	! %o4 has partial data for this read.
2374*5d9d9091SRichard Lowe	ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)	! merge %o2, %o3 and %o4
2375*5d9d9091SRichard Lowe							! into %o2 and %o3
2376*5d9d9091SRichard Lowe	prefetch [%o0+0x40], #one_read
2377*5d9d9091SRichard Lowe	stxa	%o2, [%i0+0x0]%asi
2378*5d9d9091SRichard Lowe	stxa	%o3, [%i0+0x8]%asi
2379*5d9d9091SRichard Lowe
2380*5d9d9091SRichard Lowe	ldda	[%i1+0x20]%asi, %o2
2381*5d9d9091SRichard Lowe	ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)	! merge %o2 with %o5 and
2382*5d9d9091SRichard Lowe	stxa	%o4, [%i0+0x10]%asi			! %o4 from previous read
2383*5d9d9091SRichard Lowe	stxa	%o5, [%i0+0x18]%asi			! into %o4 and %o5
2384*5d9d9091SRichard Lowe
2385*5d9d9091SRichard Lowe	! Repeat the same for next 32 bytes.
2386*5d9d9091SRichard Lowe
2387*5d9d9091SRichard Lowe	ldda	[%i1+0x30]%asi, %o4
2388*5d9d9091SRichard Lowe	ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)
2389*5d9d9091SRichard Lowe	stxa	%o2, [%i0+0x20]%asi
2390*5d9d9091SRichard Lowe	stxa	%o3, [%i0+0x28]%asi
2391*5d9d9091SRichard Lowe
2392*5d9d9091SRichard Lowe	ldda	[%i1+0x40]%asi, %o2
2393*5d9d9091SRichard Lowe	ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)
2394*5d9d9091SRichard Lowe	stxa	%o4, [%i0+0x30]%asi
2395*5d9d9091SRichard Lowe	stxa	%o5, [%i0+0x38]%asi
2396*5d9d9091SRichard Lowe
2397*5d9d9091SRichard Lowe	add	%o0, 0x40, %o0
2398*5d9d9091SRichard Lowe	add	%i1, 0x40, %i1
2399*5d9d9091SRichard Lowe	subcc	%i3, 0x40, %i3
2400*5d9d9091SRichard Lowe	bgu,pt	%ncc, loop1
2401*5d9d9091SRichard Lowe	add	%i0, 0x40, %i0
2402*5d9d9091SRichard Lowe	ba	.blkdone
2403*5d9d9091SRichard Lowe	add	%i1, %l1, %i1		! increment the source by src offset
2404*5d9d9091SRichard Lowe
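/*
 * ALIGN_DATA, used by loop1 above and loop2 below, realigns source
 * data that is not doubleword aligned: each output doubleword is
 * assembled from the tail of one aligned 8-byte load and the head of
 * the next, using the left/right shift counts in %l2/%l3 (which sum
 * to 64).  A minimal C sketch of that merge, assuming 0 < lshift < 64
 * and big-endian byte order:
 *
 *	uint64_t
 *	merge64(uint64_t prev, uint64_t next, unsigned lshift)
 *	{
 *		return ((prev << lshift) | (next >> (64 - lshift)));
 *	}
 *
 * The loops keep the most recently loaded doublewords in registers so
 * every 8 bytes of source is loaded from memory only once.
 */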
2405*5d9d9091SRichard Lowe.cpy_upper_double:
2406*5d9d9091SRichard Lowe	sub	%i1, %l1, %i1		! align the src at 16 bytes.
2407*5d9d9091SRichard Lowe	mov	0x8, %l2
2408*5d9d9091SRichard Lowe	sub	%l1, %l2, %l2
2409*5d9d9091SRichard Lowe	sll	%l2, 3, %l2		! %l2 left shift
2410*5d9d9091SRichard Lowe	mov	0x40, %l3
2411*5d9d9091SRichard Lowe	sub	%l3, %l2, %l3		! %l3 right shift = (64 - left shift)
2412*5d9d9091SRichard Lowe	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
2413*5d9d9091SRichard Lowe	prefetch [%o0+0x0], #one_read
2414*5d9d9091SRichard Lowe	ldda	[%i1+0x0]%asi, %o2	! partial data in %o3 for this read and
2415*5d9d9091SRichard Lowe					! no data in %o2
2416*5d9d9091SRichard Loweloop2:
2417*5d9d9091SRichard Lowe	ldda	[%i1+0x10]%asi, %o4	! %o4 has complete data and %o5 has
2418*5d9d9091SRichard Lowe					! partial
2419*5d9d9091SRichard Lowe	ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)	! merge %o3, %o4 and %o5
2420*5d9d9091SRichard Lowe							! into %o3 and %o4
2421*5d9d9091SRichard Lowe	prefetch [%o0+0x40], #one_read
2422*5d9d9091SRichard Lowe	stxa	%o3, [%i0+0x0]%asi
2423*5d9d9091SRichard Lowe	stxa	%o4, [%i0+0x8]%asi
2424*5d9d9091SRichard Lowe
2425*5d9d9091SRichard Lowe	ldda	[%i1+0x20]%asi, %o2
2426*5d9d9091SRichard Lowe	ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)	! merge %o2 and %o3 with
2427*5d9d9091SRichard Lowe	stxa	%o5, [%i0+0x10]%asi			! %o5 from previous read
2428*5d9d9091SRichard Lowe	stxa	%o2, [%i0+0x18]%asi			! into %o5 and %o2
2429*5d9d9091SRichard Lowe
2430*5d9d9091SRichard Lowe	! Repeat the same for next 32 bytes.
2431*5d9d9091SRichard Lowe
2432*5d9d9091SRichard Lowe	ldda	[%i1+0x30]%asi, %o4
2433*5d9d9091SRichard Lowe	ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)
2434*5d9d9091SRichard Lowe	stxa	%o3, [%i0+0x20]%asi
2435*5d9d9091SRichard Lowe	stxa	%o4, [%i0+0x28]%asi
2436*5d9d9091SRichard Lowe
2437*5d9d9091SRichard Lowe	ldda	[%i1+0x40]%asi, %o2
2438*5d9d9091SRichard Lowe	ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)
2439*5d9d9091SRichard Lowe	stxa	%o5, [%i0+0x30]%asi
2440*5d9d9091SRichard Lowe	stxa	%o2, [%i0+0x38]%asi
2441*5d9d9091SRichard Lowe
2442*5d9d9091SRichard Lowe	add	%o0, 0x40, %o0
2443*5d9d9091SRichard Lowe	add	%i1, 0x40, %i1
2444*5d9d9091SRichard Lowe	subcc	%i3, 0x40, %i3
2445*5d9d9091SRichard Lowe	bgu,pt	%ncc, loop2
2446*5d9d9091SRichard Lowe	add	%i0, 0x40, %i0
2447*5d9d9091SRichard Lowe	ba	.blkdone
2448*5d9d9091SRichard Lowe	add	%i1, %l1, %i1		! increment the source by src offset
2449*5d9d9091SRichard Lowe
2450*5d9d9091SRichard Lowe	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2451*5d9d9091SRichard Lowe.blkcpy:
2452*5d9d9091SRichard Lowe	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
2453*5d9d9091SRichard Lowe	prefetch [%o0+0x0], #one_read
2454*5d9d9091SRichard Lowe1:
2455*5d9d9091SRichard Lowe	prefetch [%o0+0x40], #one_read
2456*5d9d9091SRichard Lowe
2457*5d9d9091SRichard Lowe	ldda	[%i1+0x0]%asi, %o2
2458*5d9d9091SRichard Lowe	ldda	[%i1+0x10]%asi, %o4
2459*5d9d9091SRichard Lowe
2460*5d9d9091SRichard Lowe	stxa	%o2, [%i0+0x0]%asi
2461*5d9d9091SRichard Lowe	stxa	%o3, [%i0+0x8]%asi
2462*5d9d9091SRichard Lowe	stxa	%o4, [%i0+0x10]%asi
2463*5d9d9091SRichard Lowe	stxa	%o5, [%i0+0x18]%asi
2464*5d9d9091SRichard Lowe
2465*5d9d9091SRichard Lowe	ldda	[%i1+0x20]%asi, %o2
2466*5d9d9091SRichard Lowe	ldda	[%i1+0x30]%asi, %o4
2467*5d9d9091SRichard Lowe
2468*5d9d9091SRichard Lowe	stxa	%o2, [%i0+0x20]%asi
2469*5d9d9091SRichard Lowe	stxa	%o3, [%i0+0x28]%asi
2470*5d9d9091SRichard Lowe	stxa	%o4, [%i0+0x30]%asi
2471*5d9d9091SRichard Lowe	stxa	%o5, [%i0+0x38]%asi
2472*5d9d9091SRichard Lowe
2473*5d9d9091SRichard Lowe	add	%o0, 0x40, %o0
2474*5d9d9091SRichard Lowe	add	%i1, 0x40, %i1
2475*5d9d9091SRichard Lowe	subcc	%i3, 0x40, %i3
2476*5d9d9091SRichard Lowe	bgu,pt	%ncc, 1b
2477*5d9d9091SRichard Lowe	add	%i0, 0x40, %i0
2478*5d9d9091SRichard Lowe
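/*
 * .blkcpy above (like loop0, for the doubleword-aligned source case)
 * is the fully aligned fast path: 64 bytes move per iteration through
 * 16-byte quad loads and 8-byte stores using
 * ASI_BLK_INIT_ST_QUAD_LDD_P, with the next block prefetched one
 * iteration ahead.  Ignoring the ASI and prefetch details, the
 * structure is simply:
 *
 *	while (blk != 0) {
 *		int i;
 *
 *		for (i = 0; i < 64; i += 8)
 *			*(uint64_t *)(dst + i) = *(const uint64_t *)(src + i);
 *		dst += 64;
 *		src += 64;
 *		blk -= 64;
 *	}
 *
 * The block-init store ASI allows the destination cache lines to be
 * installed without first fetching their old contents from memory.
 */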
2479*5d9d9091SRichard Lowe.blkdone:
2480*5d9d9091SRichard Lowe	membar	#Sync
2481*5d9d9091SRichard Lowe
2482*5d9d9091SRichard Lowe	mov	ASI_PNF, %asi		! restore %asi to default
2483*5d9d9091SRichard Lowe					! ASI_PRIMARY_NOFAULT value
2484*5d9d9091SRichard Lowe	tst	%i2
2485*5d9d9091SRichard Lowe	bz,pt	%ncc, .blkexit
2486*5d9d9091SRichard Lowe	nop
2487*5d9d9091SRichard Lowe
2488*5d9d9091SRichard Lowe	! Handle trailing bytes
2489*5d9d9091SRichard Lowe	cmp	%i2, 0x8
2490*5d9d9091SRichard Lowe	blu,pt	%ncc, .residue
2491*5d9d9091SRichard Lowe	nop
2492*5d9d9091SRichard Lowe
2493*5d9d9091SRichard Lowe	! Can we do some 8B ops
2494*5d9d9091SRichard Lowe	or	%i1, %i0, %o2
2495*5d9d9091SRichard Lowe	andcc	%o2, 0x7, %g0
2496*5d9d9091SRichard Lowe	bnz	%ncc, .last4
2497*5d9d9091SRichard Lowe	nop
2498*5d9d9091SRichard Lowe
2499*5d9d9091SRichard Lowe	! Do 8byte ops as long as possible
2500*5d9d9091SRichard Lowe.last8:
2501*5d9d9091SRichard Lowe	ldx	[%i1], %o2
2502*5d9d9091SRichard Lowe	stx	%o2, [%i0]
2503*5d9d9091SRichard Lowe	add	%i1, 0x8, %i1
2504*5d9d9091SRichard Lowe	sub	%i2, 0x8, %i2
2505*5d9d9091SRichard Lowe	cmp	%i2, 0x8
2506*5d9d9091SRichard Lowe	bgu,pt	%ncc, .last8
2507*5d9d9091SRichard Lowe	add	%i0, 0x8, %i0
2508*5d9d9091SRichard Lowe
2509*5d9d9091SRichard Lowe	tst	%i2
2510*5d9d9091SRichard Lowe	bz,pt	%ncc, .blkexit
2511*5d9d9091SRichard Lowe	nop
2512*5d9d9091SRichard Lowe
2513*5d9d9091SRichard Lowe	ba	.residue
2514*5d9d9091SRichard Lowe	nop
2515*5d9d9091SRichard Lowe
2516*5d9d9091SRichard Lowe.last4:
2517*5d9d9091SRichard Lowe	! Can we do 4B ops
2518*5d9d9091SRichard Lowe	andcc	%o2, 0x3, %g0
2519*5d9d9091SRichard Lowe	bnz	%ncc, .last2
2520*5d9d9091SRichard Lowe	nop
2521*5d9d9091SRichard Lowe1:
2522*5d9d9091SRichard Lowe	ld	[%i1], %o2
2523*5d9d9091SRichard Lowe	st	%o2, [%i0]
2524*5d9d9091SRichard Lowe	add	%i1, 0x4, %i1
2525*5d9d9091SRichard Lowe	sub	%i2, 0x4, %i2
2526*5d9d9091SRichard Lowe	cmp	%i2, 0x4
2527*5d9d9091SRichard Lowe	bgu,pt	%ncc, 1b
2528*5d9d9091SRichard Lowe	add	%i0, 0x4, %i0
2529*5d9d9091SRichard Lowe
2530*5d9d9091SRichard Lowe	cmp	%i2, 0
2531*5d9d9091SRichard Lowe	bz,pt	%ncc, .blkexit
2532*5d9d9091SRichard Lowe	nop
2533*5d9d9091SRichard Lowe
2534*5d9d9091SRichard Lowe	ba	.residue
2535*5d9d9091SRichard Lowe	nop
2536*5d9d9091SRichard Lowe
2537*5d9d9091SRichard Lowe.last2:
2538*5d9d9091SRichard Lowe	! Can we do 2B ops
2539*5d9d9091SRichard Lowe	andcc	%o2, 0x1, %g0
2540*5d9d9091SRichard Lowe	bnz	%ncc, .residue
2541*5d9d9091SRichard Lowe	nop
2542*5d9d9091SRichard Lowe
2543*5d9d9091SRichard Lowe1:
2544*5d9d9091SRichard Lowe	lduh	[%i1], %o2
2545*5d9d9091SRichard Lowe	stuh	%o2, [%i0]
2546*5d9d9091SRichard Lowe	add	%i1, 0x2, %i1
2547*5d9d9091SRichard Lowe	sub	%i2, 0x2, %i2
2548*5d9d9091SRichard Lowe	cmp	%i2, 0x2
2549*5d9d9091SRichard Lowe	bgu,pt	%ncc, 1b
2550*5d9d9091SRichard Lowe	add	%i0, 0x2, %i0
2551*5d9d9091SRichard Lowe
2552*5d9d9091SRichard Lowe	cmp	%i2, 0
2553*5d9d9091SRichard Lowe	bz,pt	%ncc, .blkexit
2554*5d9d9091SRichard Lowe	nop
2555*5d9d9091SRichard Lowe
2556*5d9d9091SRichard Lowe.residue:
2557*5d9d9091SRichard Lowe	ldub	[%i1], %o2
2558*5d9d9091SRichard Lowe	stb	%o2, [%i0]
2559*5d9d9091SRichard Lowe	inc	%i1
2560*5d9d9091SRichard Lowe	deccc	%i2
2561*5d9d9091SRichard Lowe	bgu,pt	%ncc, .residue
2562*5d9d9091SRichard Lowe	inc	%i0
2563*5d9d9091SRichard Lowe
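/*
 * The trailing bytes left after the block loops (%i2, always less
 * than 64 here) are handled above by stepping down through the widest
 * access both pointers still allow, finishing with .residue one byte
 * at a time.  Approximately, in C (dst, src and n stand for %i0, %i1
 * and %i2):
 *
 *	uintptr_t both = (uintptr_t)dst | (uintptr_t)src;
 *
 *	if (n >= 8 && (both & 0x7) == 0) {
 *		do {
 *			*(uint64_t *)dst = *(const uint64_t *)src;
 *			dst += 8; src += 8; n -= 8;
 *		} while (n > 8);
 *	} else if (n >= 8 && (both & 0x3) == 0) {
 *		do {
 *			*(uint32_t *)dst = *(const uint32_t *)src;
 *			dst += 4; src += 4; n -= 4;
 *		} while (n > 4);
 *	} else if (n >= 8 && (both & 0x1) == 0) {
 *		do {
 *			*(uint16_t *)dst = *(const uint16_t *)src;
 *			dst += 2; src += 2; n -= 2;
 *		} while (n > 2);
 *	}
 *	while (n-- != 0)
 *		*dst++ = *src++;
 */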
2564*5d9d9091SRichard Lowe.blkexit:
2565*5d9d9091SRichard Lowe
2566*5d9d9091SRichard Lowe	ret
2567*5d9d9091SRichard Lowe	restore	%g5, %g0, %o0
2568*5d9d9091SRichard Lowe
2569*5d9d9091SRichard Lowe#endif	/* NIAGARA2_IMPL */
2570*5d9d9091SRichard Lowe	SET_SIZE(memcpy)
2571*5d9d9091SRichard Lowe	SET_SIZE(__align_cpy_1)
2572