/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"memcpy.s"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *       Memmove() and bcopy() do.
 *
 * The added entry __align_cpy_1 is generally for use by the compilers.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *		    char *s1 = s;
 *		    const char *s2 = s0;
 *		    do {
 *			*s1++ = *s2++;
 *		    } while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 *
 * N1 Flow :
 *
 * if (count < 17) {
 *	Do the byte copy
 *	Return destination address
 * }
 * if (count < 128) {
 *	Is source aligned on word boundary
 *	If no then align source on word boundary then goto .ald
 *	If yes goto .ald
 *	.ald:
 *		Is destination aligned on word boundary
 *		Depending on destination offset (last 2 bits of destination)
 *		copy data by shifting and merging.
 *		Copy residue bytes as byte copy
 *		Return destination address
 * } else {
 *	Align destination on block boundary
 *	Depending on the source offset (last 4 bits of source address) align
 *	the data and store to destination. Both the load and store are done
 *	using ASI_BLK_INIT_ST_QUAD_LDD_P.
 *	For remaining count copy as much data in 8-byte chunk from source to
 *	destination.
 *	Followed by trailing copy using byte copy.
 *	Return saved destination address
 * }
 *
 *
 * N2 Flow :
 *
 * if (count < 128) {
 *   if count < 3
 *	copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *     copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *     copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= 14
 *     copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *     copy words; branch to finish_up
 *   if dst now on half word boundary
 *     load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *     load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *     load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *     copy bytes; exit with dst addr
 * } else {                                         More than 128 bytes
 *   move bytes until dst is on long word boundary
 *   if( src is on long word boundary ) {
 *     if (count < 512) {
 * finish_long:				           src/dst aligned on 8 bytes
 *       copy with ldx/stx in 8-way unrolled loop;
 *       copy final 0-63 bytes; exit with dst addr
 *     } else {                                 src/dst aligned; count > 512
 *       align dst on 64 byte boundary; use 8-way test for each of 8 possible
 *       src alignments relative to a 64 byte boundary to select the
 *       16-way unrolled loop to use for
 *       block load, fmovd, block-init-store, block-store, fmovd operations
 *       then go to finish_long.
 *     }
 *   } else {                                   src/dst not aligned on 8 bytes
 *     if src is word aligned and count < 512
 *       move words in 8-way unrolled loop
 *       move final 0-31 bytes; exit with dst addr
 *     if count < 512
 *       use alignaddr/faligndata combined with ldd/std in 8-way
 *       unrolled loop to move data.
 *       go to unalign_done
 *     else
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; use 8-way test for each of 8 possible
 *       src alignments to nearest long word relative to 64 byte boundary to
 *       select the 8-way unrolled loop to use for
 *       block load, falign, fmovd, block-init-store, block-store loop
 *	 (only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *       move remaining bytes for unaligned cases. exit with dst addr.
 * }
 *
 * Comment on N2 memmove and memcpy common code and block-store-init:
 *   The man page for memmove specifies that copying will take place
 *   correctly between objects that overlap.  For memcpy, behavior is
 *   undefined for objects that overlap.
 *
 *   In rare cases, some multi-threaded applications may attempt to examine
 *   the copy destination buffer during the copy. Using the block-store-init
 *   instruction can let those applications briefly observe zeros in some
 *   cache lines of the destination buffer while the copy is in progress.
 *   But block-store-init provides memory throughput advantages for many
 *   common applications. To meet both needs, applications which need
 *   the destination buffer to retain meaning during the copy should use
 *   memmove instead of memcpy.  The memmove version duplicates the memcpy
 *   algorithms except that it does not use block-store-init in those
 *   cases where memcpy does. Otherwise, when memmove can determine the
 *   source and destination do not overlap, memmove shares the memcpy code.
 */

#include <sys/asm_linkage.h>
#include <sys/niagaraasi.h>
#include <sys/asi.h>
#include <sys/trap.h>

/* documented name for primary block initializing store */
#define	ASI_STBI_P	ASI_BLK_INIT_ST_QUAD_LDD_P

#define	BLOCK_SIZE	64
#define	FPRS_FEF	0x4

#define	SHORTCOPY	3
#define	SHORTCHECK	14
#define	SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 32 */
#define	SMALL_MAX	128
#define	MED_UMAX	512	/* max copy for medium un-aligned case */
#define	MED_WMAX	512	/* max copy for medium word-aligned case */
#define	MED_MAX		512	/* max copy for medium longword-aligned case */

#ifdef NIAGARA2_IMPL
#include <sys/sun4asi.h>

#else	/* NIAGARA2_IMPL */
/*
 * This define aligns data for the unaligned source cases.
 * data1, data2 and data3 are merged into data1 and data2; data3 is
 * preserved for the next merge (see the C sketch after these macros).
 */
#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1				;\
	srlx	data2, rshift, tmp				;\
	or	data1, tmp, data1				;\
	sllx	data2, lshift, data2				;\
	srlx	data3, rshift, tmp				;\
	or	data2, tmp, data2
/*
 * Align the data. Merge the data1 and data2 into data1.
 */
#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1				;\
	srlx	data2, rshift, tmp				;\
	or	data1, tmp, data1
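
/*
 * Rough C equivalent of the macros above (an illustrative sketch only; the
 * "off" value below is an assumption, not a symbol used in this file).
 * With SPARC's big-endian 64-bit loads, a source that is "off" bytes past
 * an 8-byte boundary uses lshift = off * 8 and rshift = 64 - lshift:
 *
 *	data1 = (data1 << lshift) | (data2 >> rshift);
 *	data2 = (data2 << lshift) | (data3 >> rshift);	(ALIGN_DATA only)
 *
 * data3 is left untouched so it can serve as data1 for the next merge;
 * ALIGN_DATA_EW performs just the first of the two merges.
 */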
#endif	/* NIAGARA2_IMPL */


	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

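/*
 * The forward/backward decision made at the top of memmove can be written
 * in C roughly as follows (a sketch for illustration only; s1, s2 and n
 * stand for the dst, src and len arguments and are not names used below):
 *
 *	if (s2 >= s1 || n <= (size_t)(s1 - s2))
 *		copy forward (the .forcpy code shared with memcpy);
 *	else
 *		copy backward, starting from the ends of both buffers;
 */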
	ENTRY(memmove)
	cmp	%o1, %o0	! if from address is >= to use forward copy
	bgeu,pn	%ncc, .forcpy	! else use backward if ...
	sub	%o0, %o1, %o4	! get difference of two addresses
	cmp	%o2, %o4	! compare size and difference of addresses
	bleu,pn	%ncc, .forcpy	! if size is bigger, do overlapped copy
	add	%o1, %o2, %o5	! get to end of source space

	!
	! an overlapped copy that must be done "backwards"
	!
.chksize:
	cmp	%o2, 8			! less than 8 bytes, do byte copy
	blu,pt %ncc, 2f			! else continue

	! Now size is bigger than 8
.dbalign:
	add	%o0, %o2, %g1		! get to end of dest space
	andcc	%g1, 7, %o3		! %o3 has bytes till dst 8 bytes aligned
	bz,a,pn	%ncc, .dbbck		! if dst already 8 byte aligned, skip to .dbbck
	andn	%o2, 7, %o3		! %o3 count is multiple of 8 bytes size
	sub	%o2, %o3, %o2		! update o2 with new count

1:	dec	%o5			! decrement source
	ldub	[%o5], %g1		! load one byte
	deccc	%o3			! decrement count
	bgu,pt	%ncc, 1b		! if not done keep copying
	stb	%g1, [%o5+%o4]		! store one byte into dest
	andncc	%o2, 7, %o3		! %o3 count is multiple of 8 bytes size
	bz,pn	%ncc, 2f		! if size < 8, move to byte copy

	! Now Destination is 8 byte aligned
.dbbck:
	andcc	%o5, 7, %o0		! %o0 has src offset
	bz,a,pn	%ncc, .dbcopybc		! if src is aligned, go to fast mem move
	sub	%o2, %o3, %o2		! Residue bytes in %o2

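/*
 * Backward copy with a misaligned source: each aligned 8-byte store below
 * is assembled from two successive aligned 8-byte loads.  In rough C terms
 * (a sketch only; "cur" and "carry" are not registers used here, and
 * lshift/rshift correspond to %g1/%g5 below):
 *
 *	stored = (cur << lshift) | carry;
 *	carry  = cur >> rshift;		carried into the next store
 */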
.cpy_dbwdbc:				! alignment of src is needed
	sub	%o2, 8, %o2		! set size one loop ahead
	sll	%o0, 3, %g1		! %g1 is left shift
	mov	64, %g5			! init %g5 to be 64
	sub	%g5, %g1, %g5		! %g5 right shift = (64 - left shift)
	sub	%o5, %o0, %o5		! align the src at 8 bytes.
	add	%o4, %o0, %o4		! increase difference between src & dst
	ldx	[%o5], %o1		! load first 8 bytes
	srlx	%o1, %g5, %o1
1:	sub	%o5, 8, %o5		! subtract 8 from src
	ldx	[%o5], %o0		! load 8 byte
	sllx	%o0, %g1, %o3		! shift loaded 8 bytes left into tmp reg
	or	%o1, %o3, %o3		! align data
	stx	%o3, [%o5+%o4]		! store 8 byte
	subcc	%o2, 8, %o2		! subtract 8 byte from size
	bg,pt	%ncc, 1b		! if size > 0 continue
	srlx	%o0, %g5, %o1		! move extra byte for the next use

	srl	%g1, 3, %o0		! restore %o0 value for alignment
	add	%o5, %o0, %o5		! restore src alignment
	sub	%o4, %o0, %o4		! restore difference between src & dest

	ba	2f			! branch to the trailing byte copy
	add	%o2, 8, %o2		! restore size value

.dbcopybc:				! alignment of src is not needed
1:	sub	%o5, 8, %o5		! subtract from src
	ldx	[%o5], %g1		! load 8 bytes
	subcc	%o3, 8, %o3		! subtract from size
	bgu,pt	%ncc, 1b		! if size is bigger than 0 continue
	stx	%g1, [%o5+%o4]		! store 8 bytes to destination

	ba	2f
	nop

.bcbyte:
1:	ldub	[%o5], %g1		! load one byte
	stb	%g1, [%o5+%o4]		! store one byte
2:	deccc	%o2			! decrement size
	bgeu,a,pt %ncc, 1b		! if size is >= 0 continue
	dec	%o5			! decrement from address

.exitbc:				! exit from backward copy
	retl
	add	%o5, %o4, %o0		! restore dest addr

#ifdef NIAGARA2_IMPL
	!
	! Check to see if memmove is a large aligned copy
	! If so, use special version of copy that avoids
	! use of block store init
	!
.forcpy:
	cmp	%o2, SMALL_MAX		! check for not small case
	blt,pn	%ncc, .mv_short		! merge with memcpy
	mov	%o0, %g1		! save %o0
	neg	%o0, %o5
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	brz,pt	%o5, .mv_dst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		! %o1 gets the difference
7:					! dst aligning loop
	ldub	[%o1+%o0], %o4		! load one byte
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%ncc, 7b
	add	%o0, 1, %o0		! advance dst
	add	%o1, %o0, %o1		! restore %o1
.mv_dst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .src_dst_unaligned_on_8
	prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read

.mv_src_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	cmp	%o2, MED_MAX		! limit to store buffer size
	bleu,pt	%ncc, .medlong
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read

/*
 * The following memmove code mimics the memcpy code for large aligned copies,
 * but does not use the ASI_STBI_P (block initializing store) performance
 * optimization.  See the memmove rationale in the block comment above.
 */
.mv_large_align8_copy:			! Src and dst share 8 byte alignment
	rd	%fprs, %g5		! check for unused fp
	! if fprs.fef == 0, set it.
	! Setting it when already set costs more than checking
	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 1f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1:
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .mv_aligned_on_64
	sub	%o3, 64, %o3		! %o3 has negative bytes to move
	add	%o2, %o3, %o2		! adjust remaining count
.mv_align_to_64:
	ldx	[%o1], %o4
	add	%o1, 8, %o1		! increment src ptr
	addcc	%o3, 8, %o3
	stx	%o4, [%o0]
	brnz,pt	%o3, .mv_align_to_64
	add	%o0, 8, %o0		! increment dst ptr

.mv_aligned_on_64:
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
	mov	%asi,%o4		! save %asi
	! Determine source alignment
	! to correct 8 byte offset
	andcc	%o1, 0x20, %o3
	brnz,pn	%o3, .mv_align_1
	mov	ASI_BLK_P, %asi		! setup %asi for block load/store
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .mv_align_01
	nop
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .mv_align_000
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.mv_align_001
	nop
.mv_align_01:
	andcc	%o1, 0x08, %o3
	brnz,pn	%o3, .mv_align_011
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.mv_align_010
	nop
.mv_align_1:
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .mv_align_11
	nop
	andcc	%o1, 0x08, %o3
	brnz,pn	%o3, .mv_align_101
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.mv_align_100
	nop
.mv_align_11:
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .mv_align_110
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

.mv_align_111:
! Alignment off by 8 bytes
	ldd	[%o1], %d0
	add	%o1, 8, %o1
	sub	%o2, 8, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_111_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d0

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d30, %d0
	bgt,pt	%ncc, .mv_align_111_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	ba	.remain_stuff
	add	%o0, 8, %o0
	! END OF mv_align_111

.mv_align_110:
! Alignment off by 16 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	add	%o1, 16, %o1
	sub	%o2, 16, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_110_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d28, %d0
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d2

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d28, %d0
	fmovd	%d30, %d2
	bgt,pt	%ncc, .mv_align_110_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	ba	.remain_stuff
	add	%o0, 16, %o0
	! END OF mv_align_110

.mv_align_101:
! Alignment off by 24 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	add	%o1, 24, %o1
	sub	%o2, 24, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_101_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d4

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	fmovd	%d30, %d4
	bgt,pt	%ncc, .mv_align_101_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	ba	.remain_stuff
	add	%o0, 24, %o0
	! END OF mv_align_101

.mv_align_100:
! Alignment off by 32 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16],%d4
	ldd	[%o1+24],%d6
	add	%o1, 32, %o1
	sub	%o2, 32, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_100_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d6

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	fmovd	%d30, %d6
	bgt,pt	%ncc, .mv_align_100_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	ba	.remain_stuff
	add	%o0, 32, %o0
	! END OF mv_align_100

.mv_align_011:
! Alignment off by 40 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	add	%o1, 40, %o1
	sub	%o2, 40, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_011_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d8

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	fmovd	%d30, %d8
	bgt,pt	%ncc, .mv_align_011_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	ba	.remain_stuff
	add	%o0, 40, %o0
	! END OF mv_align_011

.mv_align_010:
! Alignment off by 48 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	ldd	[%o1+40], %d10
	add	%o1, 48, %o1
	sub	%o2, 48, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_010_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d10

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	add	%o1, 128, %o1	! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	fmovd	%d30, %d10
	bgt,pt	%ncc, .mv_align_010_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	std	%d10, [%o0+40]
	ba	.remain_stuff
	add	%o0, 48, %o0
	! END OF mv_align_010

.mv_align_001:
! Alignment off by 56 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	ldd	[%o1+40], %d10
	ldd	[%o1+48], %d12
	add	%o1, 56, %o1
	sub	%o2, 56, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_001_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d12

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	fmovd	%d30, %d12
	bgt,pt	%ncc, .mv_align_001_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	std	%d10, [%o0+40]
	std	%d12, [%o0+48]
	ba	.remain_stuff
	add	%o0, 56, %o0
	! END OF mv_align_001

.mv_align_000:
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_000_loop:
	/* ---- copy line 1 of 2. ---- */
	subcc	%o5, 128, %o5
	ldda	[%o1]%asi,%d0
	stda	%d0,[%o0]%asi
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read

	/* ---- copy line 2 of 2. ---- */
	add	%o0, 64, %o0
	ldda	[%o1+64]%asi,%d0
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! increment dst
	bgt,pt	%ncc, .mv_align_000_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.remain_stuff
	nop

	! END OF mv_align_000
#else	/* NIAGARA2_IMPL */
#endif	/* NIAGARA2_IMPL */

	SET_SIZE(memmove)

	ENTRY(memcpy)
	ENTRY(__align_cpy_1)
#ifdef NIAGARA2_IMPL
	cmp	%o2, SMALL_MAX		! check for not small case
	bgeu,pn	%ncc, .medium		! go to larger cases
	mov	%o0, %g1		! save %o0
.mv_short:
	cmp	%o2, SHORTCOPY		! check for really short case
	ble,pt	%ncc, .smallfin
	or	%o0, %o1, %o4		! prepare alignment check
	andcc	%o4, 0x3, %o5		! test for alignment
	bz,pt	%ncc, .smallword	! branch to word aligned case
	cmp	%o2, SHORTCHECK
	ble,pt	%ncc, .smallrest
	andcc	%o1, 0x3, %o5		! is src word aligned
	bz,pn	%ncc, .aldst
	cmp	%o5, 2			! is src half-word aligned
	be,pt	%ncc, .s2algn
	cmp	%o5, 3			! src is byte aligned
.s1algn:ldub	[%o1], %o3		! move 1 or 3 bytes to align it
	inc	1, %o1
	stb	%o3, [%o0]		! move a byte to align src
	inc	1, %o0
	bne,pt	%ncc, .s2algn
	dec	%o2
	b	.ald			! now go align dest
	andcc	%o0, 0x3, %o5

.s2algn:lduh	[%o1], %o3		! know src is 2 byte aligned
	inc	2, %o1
	srl	%o3, 8, %o4
	stb	%o4, [%o0]		! have to do bytes,
	stb	%o3, [%o0 + 1]		! don't know dst alignment
	inc	2, %o0
	dec	2, %o2

.aldst:	andcc	%o0, 0x3, %o5		! align the destination address
.ald:	bz,pn	%ncc, .w4cp
	cmp	%o5, 2
	be,pn	%ncc, .w2cp
	cmp	%o5, 3
.w3cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 24, %o5
	stb	%o5, [%o0]
	bne,pt	%ncc, .w1cp
	inc	%o0
	dec	1, %o2
	andn	%o2, 3, %o3		! %o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %o0, %o1		! %o1 gets the difference

1:	sll	%o4, 8, %g5		! save residual bytes
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 24, %o5		! merge with residual
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%ncc, 1b
	inc	4, %o0
	sub	%o1, 3, %o1		! used one byte of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w1cp:	srl	%o4, 8, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	3, %o2
	andn	%o2, 3, %o3		! %o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %o0, %o1		! %o1 gets the difference

2:	sll	%o4, 24, %g5		! save residual bytes
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 8, %o5		! merge with residual
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%ncc, 2b
	inc	4, %o0
	sub	%o1, 1, %o1		! used three bytes of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w2cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 16, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	2, %o2
	andn	%o2, 3, %o3		! %o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %o0, %o1		! %o1 gets the difference

3:	sll	%o4, 16, %g5		! save residual bytes
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 16, %o5		! merge with residual
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%ncc, 3b
	inc	4, %o0
	sub	%o1, 2, %o1		! used two bytes of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w4cp:	andn	%o2, 3, %o3		! %o3 is aligned word count
	sub	%o1, %o0, %o1		! %o1 gets the difference

1:	lduw	[%o1+%o0], %o4		! read from address
	deccc	4, %o3			! decrement count
	st	%o4, [%o0]		! write at destination address
	bgu,pt	%ncc, 1b
	inc	4, %o0			! increment to address
	and	%o2, 3, %o2		! number of leftover bytes, if any

	! simple finish up byte copy, works with any alignment
7:
	add	%o1, %o0, %o1		! restore %o1
.smallrest:
	tst	%o2
	bz,pt	%ncc, .smallx
	cmp	%o2, 4
	blt,pt	%ncc, .smallleft3
	nop
	sub	%o2, 3, %o2
.smallnotalign4:
	ldub	[%o1], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o3, [%o0]		! write byte
	ldub	[%o1+1], %o3		! repeat for total of 4 bytes
	add	%o1, 4, %o1		! advance SRC by 4
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		! advance DST by 4
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o0-1]
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallx
.smallleft3:				! 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2
	ldub	[%o1], %o3		! load one byte
	bz,pt	%ncc, .smallx
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%ncc, .smallx
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
.smallx:
	retl
	mov	%g1, %o0		! restore %o0

.smallfin:
	tst	%o2
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
.smallwords:
	lduw	[%o1], %o3		! read word
.smallwordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o0]		! write word
	add	%o1, 8, %o1		! update SRC
	lduw	[%o1-4], %o3		! read word
	add	%o0, 8, %o0		! update DST
	bgu,pt	%ncc, .smallwords	! loop until done
	stw	%o3, [%o0-4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .smallexit	! check for completion
	cmp	%o2, 4			! check for 4 or more bytes left
	blt	%ncc, .smallleft3	! if not, go to finish up
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	stw	%o3, [%o0-4]
	retl
	mov	%g1, %o0		! restore %o0

! 8 or more bytes, src and dest start on word boundary
! %o4 contains or %o0, %o1; %o3 contains first four bytes of src
.smalllong:
	andcc	%o4, 0x7, %o5		! test for long alignment
	bnz,pt	%ncc, .smallwordx	! branch to word aligned case
	cmp	%o2, SHORT_LONG-7
	bge,a	%ncc, .medl64		! if we branch
	sub	%o2,56,%o2		! adjust %o2 to -63 off count
	sub	%o1, %o0, %o1		! %o1 gets the difference
.small_long_l:
	ldx	[%o1+%o0], %o3
	subcc	%o2, 8, %o2
	add	%o0, 8, %o0
	bgu,pt	%ncc, .small_long_l	! loop until done
	stx	%o3, [%o0-8]		! write word
	add	%o1, %o0, %o1		! restore %o1
	addcc	%o2, 7, %o2		! restore %o2 to correct count
	bz,pt	%ncc, .smallexit	! check for completion
	cmp	%o2, 4			! check for 4 or more bytes left
	blt,pt	%ncc, .smallleft3	! if not, go to finish up
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
! src and dest start on word boundary
.smallword:
	subcc	%o2, 7, %o2		! adjust count
	bgu,pt	%ncc, .smalllong
	lduw	[%o1], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallexit
	stw	%o3, [%o0]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o1+4], %o3		! load one byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+4]		! store one byte
	ldub	[%o1+5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+5]		! store second byte
	ldub	[%o1+6], %o3		! load third byte
	stb	%o3, [%o0+6]		! store third byte
.smallexit:
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
.medium:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	brz,pt	%o5, .dst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		! %o1 gets the difference
7:					! dst aligning loop
	ldub	[%o1+%o0], %o4		! load one byte
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%ncc, 7b
	add	%o0, 1, %o0		! advance dst
	add	%o1, %o0, %o1		! restore %o1
.dst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .src_dst_unaligned_on_8
	prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read

.src_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	cmp	%o2, MED_MAX		! limit to store buffer size
	bgu,pt	%ncc, .large_align8_copy
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
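/*
 * Sketch of the path below in C (for illustration only; src, dst and n
 * stand for %o1, %o0 and %o2, and the uint64_t accesses model ldx/stx):
 *
 *	while (n >= 64) {		.medl64: eight ldx/stx pairs per pass
 *		((uint64_t *)dst)[0] = ((const uint64_t *)src)[0];
 *		...
 *		((uint64_t *)dst)[7] = ((const uint64_t *)src)[7];
 *		src += 64; dst += 64; n -= 64;
 *	}
 *	then 32-, 16- and 8-byte steps (.medl63 onward) and a byte tail.
 */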
.medlong:
	subcc	%o2, 63, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .medl63		! skip big loop if less than 64 bytes
.medl64:
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read ! into the l2 cache
	ldx	[%o1], %o4		! load
	subcc	%o2, 64, %o2		! decrement length count
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 64 bytes
	stx	%o3, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	ldx	[%o1+24], %o3
	stx	%o3, [%o0+24]
	ldx	[%o1+32], %o4		! load
	stx	%o4, [%o0+32]		! and store
	ldx	[%o1+40], %o3		! a block of 64 bytes
	add	%o1, 64, %o1		! increase src ptr by 64
	stx	%o3, [%o0+40]
	ldx	[%o1-16], %o4
	add	%o0, 64, %o0		! increase dst ptr by 64
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%ncc, .medl64		! repeat if at least 64 bytes left
	stx	%o3, [%o0-8]
.medl63:
	addcc	%o2, 32, %o2		! adjust remaining count
	ble,pt	%ncc, .medl31		! to skip if 31 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load
	sub	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 32 bytes
	add	%o1, 32, %o1		! increase src ptr by 32
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		! increase dst ptr by 32
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	stx	%o3, [%o0-8]
.medl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
	nop				!
	ldx	[%o1], %o4		! load and store 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stx	%o4, [%o0]		!
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1-8], %o3		!
	add	%o0, 16, %o0		! increase dst ptr by 16
	stx	%o3, [%o0-8]
.medl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .smallexit	! exit if finished
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	tst	%o2
	ldx	[%o1], %o4		! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	add	%o0, 8, %o0		! increase dst ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz,pt	%ncc, .medw7
	stx	%o4, [%o0-8]		! and store 8 bytes
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
.src_dst_unaligned_on_8:
	! DST is 8-byte aligned, src is not
2:
	andcc	%o1, 0x3, %o5		! test word alignment
	bnz,pt	%ncc, .unalignsetup	! branch to skip if not word aligned
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * the source and destination are in cache, for short to
 * medium data moves.
 */
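/*
 * Sketch of this word-aligned path in C (for illustration only; src, dst
 * and n stand for %o1, %o0 and %o2, and uint32_t accesses model ld/stw):
 *
 *	while (n >= 32) {		.medw32: eight ld/stw pairs per pass
 *		((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
 *		...
 *		((uint32_t *)dst)[7] = ((const uint32_t *)src)[7];
 *		src += 32; dst += 32; n -= 32;
 *	}
 *	then 16-, 8- and 4-byte steps, and .smallleft3 for the last bytes.
 */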
	cmp	%o2, MED_WMAX		! limit to store buffer size
	bge,pt	%ncc, .unalignrejoin	! otherwise rejoin main loop
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read

	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medw31		! skip big loop if less than 32
1132*1e49577aSRod Evans	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1133*1e49577aSRod Evans.medw32:
1134*1e49577aSRod Evans	ld	[%o1], %o4		! move a block of 32 bytes
1135*1e49577aSRod Evans	stw	%o4, [%o0]
1136*1e49577aSRod Evans	ld	[%o1+4], %o3
1137*1e49577aSRod Evans	stw	%o3, [%o0+4]
1138*1e49577aSRod Evans	ld	[%o1+8], %o4
1139*1e49577aSRod Evans	stw	%o4, [%o0+8]
1140*1e49577aSRod Evans	ld	[%o1+12], %o3
1141*1e49577aSRod Evans	stw	%o3, [%o0+12]
1142*1e49577aSRod Evans	ld	[%o1+16], %o4
1143*1e49577aSRod Evans	subcc	%o2, 32, %o2		! decrement length count
1144*1e49577aSRod Evans	stw	%o4, [%o0+16]
1145*1e49577aSRod Evans	ld	[%o1+20], %o3
1146*1e49577aSRod Evans	add	%o1, 32, %o1		! increase src ptr by 32
1147*1e49577aSRod Evans	stw	%o3, [%o0+20]
1148*1e49577aSRod Evans	ld	[%o1-8], %o4
1149*1e49577aSRod Evans	add	%o0, 32, %o0		! increase dst ptr by 32
1150*1e49577aSRod Evans	stw	%o4, [%o0-8]
1151*1e49577aSRod Evans	ld	[%o1-4], %o3
1152*1e49577aSRod Evans	bgu,pt	%ncc, .medw32		! repeat if at least 32 bytes left
1153*1e49577aSRod Evans	stw	%o3, [%o0-4]
1154*1e49577aSRod Evans.medw31:
1155*1e49577aSRod Evans	addcc	%o2, 31, %o2		! restore count
1156*1e49577aSRod Evans
1157*1e49577aSRod Evans	bz,pt	%ncc, .smallexit	! exit if finished
1158*1e49577aSRod Evans	nop
1159*1e49577aSRod Evans	cmp	%o2, 16
1160*1e49577aSRod Evans	blt,pt	%ncc, .medw15
1161*1e49577aSRod Evans	nop
1162*1e49577aSRod Evans	ld	[%o1], %o4		! move a block of 16 bytes
1163*1e49577aSRod Evans	subcc	%o2, 16, %o2		! decrement length count
1164*1e49577aSRod Evans	stw	%o4, [%o0]
1165*1e49577aSRod Evans	ld	[%o1+4], %o3
1166*1e49577aSRod Evans	add	%o1, 16, %o1		! increase src ptr by 16
1167*1e49577aSRod Evans	stw	%o3, [%o0+4]
1168*1e49577aSRod Evans	ld	[%o1-8], %o4
1169*1e49577aSRod Evans	add	%o0, 16, %o0		! increase dst ptr by 16
1170*1e49577aSRod Evans	stw	%o4, [%o0-8]
1171*1e49577aSRod Evans	ld	[%o1-4], %o3
1172*1e49577aSRod Evans	stw	%o3, [%o0-4]
1173*1e49577aSRod Evans.medw15:
1174*1e49577aSRod Evans	bz,pt	%ncc, .smallexit	! exit if finished
1175*1e49577aSRod Evans	cmp	%o2, 8
1176*1e49577aSRod Evans	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
1177*1e49577aSRod Evans	tst	%o2
1178*1e49577aSRod Evans	ld	[%o1], %o4		! load 4 bytes
1179*1e49577aSRod Evans	subcc	%o2, 8, %o2		! decrease count by 8
1180*1e49577aSRod Evans	stw	%o4, [%o0]		! and store 4 bytes
1181*1e49577aSRod Evans	add	%o1, 8, %o1		! increase src ptr by 8
1182*1e49577aSRod Evans	ld	[%o1-4], %o3		! load 4 bytes
1183*1e49577aSRod Evans	add	%o0, 8, %o0		! increase dst ptr by 8
1184*1e49577aSRod Evans	stw	%o3, [%o0-4]		! and store 4 bytes
1185*1e49577aSRod Evans	bz,pt	%ncc, .smallexit	! exit if finished
1186*1e49577aSRod Evans.medw7:					! count is ge 1, less than 8
1187*1e49577aSRod Evans	cmp	%o2, 4			! check for 4 bytes left
1188*1e49577aSRod Evans	blt,pt	%ncc, .smallleft3	! skip if 3 or fewer bytes left
1189*1e49577aSRod Evans	nop				!
1190*1e49577aSRod Evans	ld	[%o1], %o4		! load 4 bytes
1191*1e49577aSRod Evans	add	%o1, 4, %o1		! increase src ptr by 4
1192*1e49577aSRod Evans	add	%o0, 4, %o0		! increase dst ptr by 4
1193*1e49577aSRod Evans	subcc	%o2, 4, %o2		! decrease count by 4
1194*1e49577aSRod Evans	bnz	.smallleft3
1195*1e49577aSRod Evans	stw	%o4, [%o0-4]		! and store 4 bytes
1196*1e49577aSRod Evans	retl
1197*1e49577aSRod Evans	mov	%g1, %o0		! restore %o0
1198*1e49577aSRod Evans
1199*1e49577aSRod Evans	.align	16
1200*1e49577aSRod Evans.large_align8_copy:			! Src and dst share 8 byte alignment
1201*1e49577aSRod Evans	rd	%fprs, %g5		! check for unused fp
1202*1e49577aSRod Evans	! if fprs.fef == 0, set it.
1203*1e49577aSRod Evans	! Setting it when already set costs more than checking
1204*1e49577aSRod Evans	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
1205*1e49577aSRod Evans	bz,a	%ncc, 1f
1206*1e49577aSRod Evans	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1207*1e49577aSRod Evans1:
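	! At this point %g5 holds only the caller's FPRS_FEF bit (non-zero
	! means the FPU was already enabled).  .remain_stuff below uses it
	! to decide whether to turn the FPU back off when the copy is done.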
1208*1e49577aSRod Evans	! align dst to 64 byte boundary
1209*1e49577aSRod Evans	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
1210*1e49577aSRod Evans	brz,pn	%o3, .aligned_to_64
1211*1e49577aSRod Evans	andcc	%o0, 8, %o3		! odd long words to move?
1212*1e49577aSRod Evans	brz,pt	%o3, .aligned_to_16
1213*1e49577aSRod Evans	nop
1214*1e49577aSRod Evans	ldx	[%o1], %o4
1215*1e49577aSRod Evans	sub	%o2, 8, %o2
1216*1e49577aSRod Evans	add	%o1, 8, %o1		! increment src ptr
1217*1e49577aSRod Evans	add	%o0, 8, %o0		! increment dst ptr
1218*1e49577aSRod Evans	stx	%o4, [%o0-8]
1219*1e49577aSRod Evans.aligned_to_16:
1220*1e49577aSRod Evans	andcc	%o0, 16, %o3		! pair of long words to move?
1221*1e49577aSRod Evans	brz,pt	%o3, .aligned_to_32
1222*1e49577aSRod Evans	nop
1223*1e49577aSRod Evans	ldx	[%o1], %o4
1224*1e49577aSRod Evans	sub	%o2, 16, %o2
1225*1e49577aSRod Evans	stx	%o4, [%o0]
1226*1e49577aSRod Evans	add	%o1, 16, %o1		! increment src ptr
1227*1e49577aSRod Evans	ldx	[%o1-8], %o4
1228*1e49577aSRod Evans	add	%o0, 16, %o0		! increment dst ptr
1229*1e49577aSRod Evans	stx	%o4, [%o0-8]
1230*1e49577aSRod Evans.aligned_to_32:
1231*1e49577aSRod Evans	andcc	%o0, 32, %o3		! four long words to move?
1232*1e49577aSRod Evans	brz,pt	%o3, .aligned_to_64
1233*1e49577aSRod Evans	nop
1234*1e49577aSRod Evans	ldx	[%o1], %o4
1235*1e49577aSRod Evans	sub	%o2, 32, %o2
1236*1e49577aSRod Evans	stx	%o4, [%o0]
1237*1e49577aSRod Evans	ldx	[%o1+8], %o4
1238*1e49577aSRod Evans	stx	%o4, [%o0+8]
1239*1e49577aSRod Evans	ldx	[%o1+16], %o4
1240*1e49577aSRod Evans	stx	%o4, [%o0+16]
1241*1e49577aSRod Evans	add	%o1, 32, %o1		! increment src ptr
1242*1e49577aSRod Evans	ldx	[%o1-8], %o4
1243*1e49577aSRod Evans	add	%o0, 32, %o0		! increment dst ptr
1244*1e49577aSRod Evans	stx	%o4, [%o0-8]
1245*1e49577aSRod Evans.aligned_to_64:
1246*1e49577aSRod Evans	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
1247*1e49577aSRod Evans	mov	%asi,%o4		! save %asi
1248*1e49577aSRod Evans	! Determine source alignment
1249*1e49577aSRod Evans	! to correct 8 byte offset
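/*
 * The eight .align_xxx cases selected below all use the same scheme:
 * preload the doublewords that bring the source up to its next 64-byte
 * boundary into %d0-%d14, then, per 64-byte output block, do one block
 * load into %d16-%d30, splice it with the held data, block-store the
 * result to the (already 64-byte aligned) destination, and rotate the
 * tail of the block load into the low registers for the next pass.  A
 * simplified C model (a sketch: the prefetches and the ASI_STBI_P block
 * initializing stores, which let a destination line be established
 * without first reading it from memory, are omitted, and this loop
 * moves one 64-byte block per pass rather than two):
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *	#include <string.h>
 *
 *	static void
 *	block_copy_8aligned(uint64_t *dst, const uint64_t *src, size_t nblocks)
 *	{
 *		size_t lead = ((64 - ((uintptr_t)src & 0x3f)) & 0x3f) / 8;
 *		uint64_t hold[8], blk[8];
 *		size_t i;
 *
 *		for (i = 0; i < lead; i++)	// preload up to the next
 *			hold[i] = *src++;	// 64-byte source boundary
 *		while (nblocks-- > 0) {
 *			memcpy(blk, src, 64);	// block load (%d16-%d30)
 *			src += 8;
 *			for (i = lead; i < 8; i++)	// complete the block
 *				hold[i] = blk[i - lead];
 *			memcpy(dst, hold, 64);	// block store from %d0-%d14
 *			dst += 8;
 *			for (i = 0; i < lead; i++)	// rotate the tail
 *				hold[i] = blk[8 - lead + i];
 *		}
 *		for (i = 0; i < lead; i++)	// flush the held doublewords
 *			*dst++ = hold[i];
 *	}
 */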
1250*1e49577aSRod Evans	andcc	%o1, 0x20, %o3
1251*1e49577aSRod Evans	brnz,pn	%o3, .align_1
1252*1e49577aSRod Evans	mov	ASI_BLK_P, %asi		! setup %asi for block load/store
1253*1e49577aSRod Evans	andcc	%o1, 0x10, %o3
1254*1e49577aSRod Evans	brnz,pn	%o3, .align_01
1255*1e49577aSRod Evans	nop
1256*1e49577aSRod Evans	andcc	%o1, 0x08, %o3
1257*1e49577aSRod Evans	brz,pn	%o3, .align_000
1258*1e49577aSRod Evans	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1259*1e49577aSRod Evans	ba	.align_001
1260*1e49577aSRod Evans	nop
1261*1e49577aSRod Evans.align_01:
1262*1e49577aSRod Evans	andcc	%o1, 0x08, %o3
1263*1e49577aSRod Evans	brnz,pn	%o3, .align_011
1264*1e49577aSRod Evans	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1265*1e49577aSRod Evans	ba	.align_010
1266*1e49577aSRod Evans	nop
1267*1e49577aSRod Evans.align_1:
1268*1e49577aSRod Evans	andcc	%o1, 0x10, %o3
1269*1e49577aSRod Evans	brnz,pn	%o3, .align_11
1270*1e49577aSRod Evans	nop
1271*1e49577aSRod Evans	andcc	%o1, 0x08, %o3
1272*1e49577aSRod Evans	brnz,pn	%o3, .align_101
1273*1e49577aSRod Evans	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1274*1e49577aSRod Evans	ba	.align_100
1275*1e49577aSRod Evans	nop
1276*1e49577aSRod Evans.align_11:
1277*1e49577aSRod Evans	andcc	%o1, 0x08, %o3
1278*1e49577aSRod Evans	brz,pn	%o3, .align_110
1279*1e49577aSRod Evans	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1280*1e49577aSRod Evans
1281*1e49577aSRod Evans.align_111:
1282*1e49577aSRod Evans! Alignment off by 8 bytes
1283*1e49577aSRod Evans	ldd	[%o1], %d0
1284*1e49577aSRod Evans	add	%o1, 8, %o1
1285*1e49577aSRod Evans	sub	%o2, 8, %o2
1286*1e49577aSRod Evans	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1287*1e49577aSRod Evans	and	%o2, 0x7f, %o2		! residue bytes in %o2
1288*1e49577aSRod Evans.align_111_loop:
1289*1e49577aSRod Evans	subcc	%o5, 128, %o5
1290*1e49577aSRod Evans	/* ---- copy line 1 of 2. ---- */
1291*1e49577aSRod Evans	ldda	[%o1]%asi,%d16		! block load
1292*1e49577aSRod Evans	fmovd	%d16, %d2
1293*1e49577aSRod Evans	fmovd	%d18, %d4
1294*1e49577aSRod Evans	fmovd	%d20, %d6
1295*1e49577aSRod Evans	fmovd	%d22, %d8
1296*1e49577aSRod Evans	fmovd	%d24, %d10
1297*1e49577aSRod Evans	fmovd	%d26, %d12
1298*1e49577aSRod Evans	fmovd	%d28, %d14
1299*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1300*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1301*1e49577aSRod Evans	add	%o0, 64, %o0		! advance dst
1302*1e49577aSRod Evans	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1303*1e49577aSRod Evans	fmovd	%d30, %d0
1304*1e49577aSRod Evans
1305*1e49577aSRod Evans	/* ---- copy line 2 of 2. ---- */
1306*1e49577aSRod Evans	ldda	[%o1+64]%asi,%d16
1307*1e49577aSRod Evans	fmovd	%d16, %d2
1308*1e49577aSRod Evans	fmovd	%d18, %d4
1309*1e49577aSRod Evans	fmovd	%d20, %d6
1310*1e49577aSRod Evans	fmovd	%d22, %d8
1311*1e49577aSRod Evans	fmovd	%d24, %d10
1312*1e49577aSRod Evans	fmovd	%d26, %d12
1313*1e49577aSRod Evans	fmovd	%d28, %d14
1314*1e49577aSRod Evans	add	%o1, 128, %o1		! increment src
1315*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1316*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1317*1e49577aSRod Evans	add	%o0, 64, %o0		! advance dst
1318*1e49577aSRod Evans	fmovd	%d30, %d0
1319*1e49577aSRod Evans	bgt,pt	%ncc, .align_111_loop
1320*1e49577aSRod Evans	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1321*1e49577aSRod Evans
1322*1e49577aSRod Evans	std	%d0, [%o0]
1323*1e49577aSRod Evans	ba	.remain_stuff
1324*1e49577aSRod Evans	add	%o0, 8, %o0
1325*1e49577aSRod Evans	! END OF align_111
1326*1e49577aSRod Evans
1327*1e49577aSRod Evans.align_110:
1328*1e49577aSRod Evans! Alignment off by 16 bytes
1329*1e49577aSRod Evans	ldd	[%o1], %d0
1330*1e49577aSRod Evans	ldd	[%o1+8], %d2
1331*1e49577aSRod Evans	add	%o1, 16, %o1
1332*1e49577aSRod Evans	sub	%o2, 16, %o2
1333*1e49577aSRod Evans	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1334*1e49577aSRod Evans	and	%o2, 0x7f, %o2		! residue bytes in %o2
1335*1e49577aSRod Evans.align_110_loop:
1336*1e49577aSRod Evans	subcc	%o5, 128, %o5
1337*1e49577aSRod Evans	/* ---- copy line 1 of 2. ---- */
1338*1e49577aSRod Evans
1339*1e49577aSRod Evans	ldda	[%o1]%asi,%d16		! block load
1340*1e49577aSRod Evans	fmovd	%d16, %d4
1341*1e49577aSRod Evans	fmovd	%d18, %d6
1342*1e49577aSRod Evans	fmovd	%d20, %d8
1343*1e49577aSRod Evans	fmovd	%d22, %d10
1344*1e49577aSRod Evans	fmovd	%d24, %d12
1345*1e49577aSRod Evans	fmovd	%d26, %d14
1346*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1347*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1348*1e49577aSRod Evans	add	%o0, 64, %o0		! advance dst
1349*1e49577aSRod Evans	fmovd	%d28, %d0
1350*1e49577aSRod Evans	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1351*1e49577aSRod Evans	fmovd	%d30, %d2
1352*1e49577aSRod Evans
1353*1e49577aSRod Evans	/* ---- copy line 2 of 2. ---- */
1354*1e49577aSRod Evans	ldda	[%o1+64]%asi,%d16
1355*1e49577aSRod Evans	fmovd	%d16, %d4
1356*1e49577aSRod Evans	fmovd	%d18, %d6
1357*1e49577aSRod Evans	fmovd	%d20, %d8
1358*1e49577aSRod Evans	fmovd	%d22, %d10
1359*1e49577aSRod Evans	fmovd	%d24, %d12
1360*1e49577aSRod Evans	fmovd	%d26, %d14
1361*1e49577aSRod Evans	add	%o1, 128, %o1		! increment src
1362*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1363*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1364*1e49577aSRod Evans	add	%o0, 64, %o0		! advance dst
1365*1e49577aSRod Evans	fmovd	%d28, %d0
1366*1e49577aSRod Evans	fmovd	%d30, %d2
1367*1e49577aSRod Evans	bgt,pt	%ncc, .align_110_loop
1368*1e49577aSRod Evans	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1369*1e49577aSRod Evans
1370*1e49577aSRod Evans	std	%d0, [%o0]
1371*1e49577aSRod Evans	std	%d2, [%o0+8]
1372*1e49577aSRod Evans	ba	.remain_stuff
1373*1e49577aSRod Evans	add	%o0, 16, %o0
1374*1e49577aSRod Evans	! END OF align_110
1375*1e49577aSRod Evans
1376*1e49577aSRod Evans.align_101:
1377*1e49577aSRod Evans! Alignment off by 24 bytes
1378*1e49577aSRod Evans	ldd	[%o1], %d0
1379*1e49577aSRod Evans	ldd	[%o1+8], %d2
1380*1e49577aSRod Evans	ldd	[%o1+16], %d4
1381*1e49577aSRod Evans	add	%o1, 24, %o1
1382*1e49577aSRod Evans	sub	%o2, 24, %o2
1383*1e49577aSRod Evans	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1384*1e49577aSRod Evans	and	%o2, 0x7f, %o2		! residue bytes in %o2
1385*1e49577aSRod Evans.align_101_loop:
1386*1e49577aSRod Evans	subcc	%o5, 128, %o5
1387*1e49577aSRod Evans	/* ---- copy line 1 of 2. ---- */
1388*1e49577aSRod Evans
1389*1e49577aSRod Evans	ldda	[%o1]%asi,%d16		! block load
1390*1e49577aSRod Evans	fmovd	%d16, %d6
1391*1e49577aSRod Evans	fmovd	%d18, %d8
1392*1e49577aSRod Evans	fmovd	%d20, %d10
1393*1e49577aSRod Evans	fmovd	%d22, %d12
1394*1e49577aSRod Evans	fmovd	%d24, %d14
1395*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1396*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1397*1e49577aSRod Evans	add	%o0, 64, %o0		! advance dst
1398*1e49577aSRod Evans	fmovd	%d26, %d0
1399*1e49577aSRod Evans	fmovd	%d28, %d2
1400*1e49577aSRod Evans	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1401*1e49577aSRod Evans	fmovd	%d30, %d4
1402*1e49577aSRod Evans
1403*1e49577aSRod Evans	/* ---- copy line 2 of 2. ---- */
1404*1e49577aSRod Evans	ldda	[%o1+64]%asi,%d16
1405*1e49577aSRod Evans	fmovd	%d16, %d6
1406*1e49577aSRod Evans	fmovd	%d18, %d8
1407*1e49577aSRod Evans	fmovd	%d20, %d10
1408*1e49577aSRod Evans	fmovd	%d22, %d12
1409*1e49577aSRod Evans	fmovd	%d24, %d14
1410*1e49577aSRod Evans	add	%o1, 128, %o1		! increment src
1411*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1412*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1413*1e49577aSRod Evans	add	%o0, 64, %o0		! advance dst
1414*1e49577aSRod Evans	fmovd	%d26, %d0
1415*1e49577aSRod Evans	fmovd	%d28, %d2
1416*1e49577aSRod Evans	fmovd	%d30, %d4
1417*1e49577aSRod Evans	bgt,pt	%ncc, .align_101_loop
1418*1e49577aSRod Evans	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1419*1e49577aSRod Evans
1420*1e49577aSRod Evans	std	%d0, [%o0]
1421*1e49577aSRod Evans	std	%d2, [%o0+8]
1422*1e49577aSRod Evans	std	%d4, [%o0+16]
1423*1e49577aSRod Evans	ba	.remain_stuff
1424*1e49577aSRod Evans	add	%o0, 24, %o0
1425*1e49577aSRod Evans	! END OF align_101
1426*1e49577aSRod Evans
1427*1e49577aSRod Evans.align_100:
1428*1e49577aSRod Evans! Alignment off by 32 bytes
1429*1e49577aSRod Evans	ldd	[%o1], %d0
1430*1e49577aSRod Evans	ldd	[%o1+8], %d2
1431*1e49577aSRod Evans	ldd	[%o1+16],%d4
1432*1e49577aSRod Evans	ldd	[%o1+24],%d6
1433*1e49577aSRod Evans	add	%o1, 32, %o1
1434*1e49577aSRod Evans	sub	%o2, 32, %o2
1435*1e49577aSRod Evans	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1436*1e49577aSRod Evans	and	%o2, 0x7f, %o2		! residue bytes in %o2
1437*1e49577aSRod Evans.align_100_loop:
1438*1e49577aSRod Evans	subcc	%o5, 128, %o5
1439*1e49577aSRod Evans	/* ---- copy line 1 of 2. ---- */
1440*1e49577aSRod Evans	ldda	[%o1]%asi,%d16		! block load
1441*1e49577aSRod Evans	fmovd	%d16, %d8
1442*1e49577aSRod Evans	fmovd	%d18, %d10
1443*1e49577aSRod Evans	fmovd	%d20, %d12
1444*1e49577aSRod Evans	fmovd	%d22, %d14
1445*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1446*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1447*1e49577aSRod Evans	add	%o0, 64, %o0		! advance dst
1448*1e49577aSRod Evans	fmovd	%d24, %d0
1449*1e49577aSRod Evans	fmovd	%d26, %d2
1450*1e49577aSRod Evans	fmovd	%d28, %d4
1451*1e49577aSRod Evans	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1452*1e49577aSRod Evans	fmovd	%d30, %d6
1453*1e49577aSRod Evans
1454*1e49577aSRod Evans	/* ---- copy line 2 of 2. ---- */
1455*1e49577aSRod Evans	ldda	[%o1+64]%asi,%d16
1456*1e49577aSRod Evans	fmovd	%d16, %d8
1457*1e49577aSRod Evans	fmovd	%d18, %d10
1458*1e49577aSRod Evans	fmovd	%d20, %d12
1459*1e49577aSRod Evans	fmovd	%d22, %d14
1460*1e49577aSRod Evans	add	%o1, 128, %o1		! increment src
1461*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1462*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1463*1e49577aSRod Evans	add	%o0, 64, %o0		! advance dst
1464*1e49577aSRod Evans	fmovd	%d24, %d0
1465*1e49577aSRod Evans	fmovd	%d26, %d2
1466*1e49577aSRod Evans	fmovd	%d28, %d4
1467*1e49577aSRod Evans	fmovd	%d30, %d6
1468*1e49577aSRod Evans	bgt,pt	%ncc, .align_100_loop
1469*1e49577aSRod Evans	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1470*1e49577aSRod Evans
1471*1e49577aSRod Evans	std	%d0, [%o0]
1472*1e49577aSRod Evans	std	%d2, [%o0+8]
1473*1e49577aSRod Evans	std	%d4, [%o0+16]
1474*1e49577aSRod Evans	std	%d6, [%o0+24]
1475*1e49577aSRod Evans	ba	.remain_stuff
1476*1e49577aSRod Evans	add	%o0, 32, %o0
1477*1e49577aSRod Evans	! END OF align_100
1478*1e49577aSRod Evans
1479*1e49577aSRod Evans.align_011:
1480*1e49577aSRod Evans! Alignment off by 40 bytes
1481*1e49577aSRod Evans	ldd	[%o1], %d0
1482*1e49577aSRod Evans	ldd	[%o1+8], %d2
1483*1e49577aSRod Evans	ldd	[%o1+16], %d4
1484*1e49577aSRod Evans	ldd	[%o1+24], %d6
1485*1e49577aSRod Evans	ldd	[%o1+32], %d8
1486*1e49577aSRod Evans	add	%o1, 40, %o1
1487*1e49577aSRod Evans	sub	%o2, 40, %o2
1488*1e49577aSRod Evans	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1489*1e49577aSRod Evans	and	%o2, 0x7f, %o2		! residue bytes in %o2
1490*1e49577aSRod Evans.align_011_loop:
1491*1e49577aSRod Evans	subcc	%o5, 128, %o5
1492*1e49577aSRod Evans	/* ---- copy line 1 of 2. ---- */
1493*1e49577aSRod Evans
1494*1e49577aSRod Evans	ldda	[%o1]%asi,%d16		! block load
1495*1e49577aSRod Evans	fmovd	%d16, %d10
1496*1e49577aSRod Evans	fmovd	%d18, %d12
1497*1e49577aSRod Evans	fmovd	%d20, %d14
1498*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1499*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1500*1e49577aSRod Evans	add	%o0, 64, %o0		! advance dst
1501*1e49577aSRod Evans	fmovd	%d22, %d0
1502*1e49577aSRod Evans	fmovd	%d24, %d2
1503*1e49577aSRod Evans	fmovd	%d26, %d4
1504*1e49577aSRod Evans	fmovd	%d28, %d6
1505*1e49577aSRod Evans	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1506*1e49577aSRod Evans	fmovd	%d30, %d8
1507*1e49577aSRod Evans
1508*1e49577aSRod Evans	/* ---- copy line 2 of 2. ---- */
1509*1e49577aSRod Evans	ldda	[%o1+64]%asi,%d16
1510*1e49577aSRod Evans	fmovd	%d16, %d10
1511*1e49577aSRod Evans	fmovd	%d18, %d12
1512*1e49577aSRod Evans	fmovd	%d20, %d14
1513*1e49577aSRod Evans	add	%o1, 128, %o1		! increment src
1514*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1515*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1516*1e49577aSRod Evans	add	%o0, 64, %o0		! advance dst
1517*1e49577aSRod Evans	fmovd	%d22, %d0
1518*1e49577aSRod Evans	fmovd	%d24, %d2
1519*1e49577aSRod Evans	fmovd	%d26, %d4
1520*1e49577aSRod Evans	fmovd	%d28, %d6
1521*1e49577aSRod Evans	fmovd	%d30, %d8
1522*1e49577aSRod Evans	bgt,pt	%ncc, .align_011_loop
1523*1e49577aSRod Evans	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1524*1e49577aSRod Evans
1525*1e49577aSRod Evans	std	%d0, [%o0]
1526*1e49577aSRod Evans	std	%d2, [%o0+8]
1527*1e49577aSRod Evans	std	%d4, [%o0+16]
1528*1e49577aSRod Evans	std	%d6, [%o0+24]
1529*1e49577aSRod Evans	std	%d8, [%o0+32]
1530*1e49577aSRod Evans	ba	.remain_stuff
1531*1e49577aSRod Evans	add	%o0, 40, %o0
1532*1e49577aSRod Evans	! END OF align_011
1533*1e49577aSRod Evans
1534*1e49577aSRod Evans.align_010:
1535*1e49577aSRod Evans! Alignment off by 48 bytes
1536*1e49577aSRod Evans	ldd	[%o1], %d0
1537*1e49577aSRod Evans	ldd	[%o1+8], %d2
1538*1e49577aSRod Evans	ldd	[%o1+16], %d4
1539*1e49577aSRod Evans	ldd	[%o1+24], %d6
1540*1e49577aSRod Evans	ldd	[%o1+32], %d8
1541*1e49577aSRod Evans	ldd	[%o1+40], %d10
1542*1e49577aSRod Evans	add	%o1, 48, %o1
1543*1e49577aSRod Evans	sub	%o2, 48, %o2
1544*1e49577aSRod Evans	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1545*1e49577aSRod Evans	and	%o2, 0x7f, %o2		! residue bytes in %o2
1546*1e49577aSRod Evans.align_010_loop:
1547*1e49577aSRod Evans	subcc	%o5, 128, %o5
1548*1e49577aSRod Evans	/* ---- copy line 1 of 2. ---- */
1549*1e49577aSRod Evans
1550*1e49577aSRod Evans	ldda	[%o1]%asi,%d16		! block load
1551*1e49577aSRod Evans	fmovd	%d16, %d12
1552*1e49577aSRod Evans	fmovd	%d18, %d14
1553*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1554*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1555*1e49577aSRod Evans	add	%o0, 64, %o0		! advance dst
1556*1e49577aSRod Evans	fmovd	%d20, %d0
1557*1e49577aSRod Evans	fmovd	%d22, %d2
1558*1e49577aSRod Evans	fmovd	%d24, %d4
1559*1e49577aSRod Evans	fmovd	%d26, %d6
1560*1e49577aSRod Evans	fmovd	%d28, %d8
1561*1e49577aSRod Evans	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1562*1e49577aSRod Evans	fmovd	%d30, %d10
1563*1e49577aSRod Evans
1564*1e49577aSRod Evans	/* ---- copy line 2 of 2. ---- */
1565*1e49577aSRod Evans	ldda	[%o1+64]%asi,%d16
1566*1e49577aSRod Evans	fmovd	%d16, %d12
1567*1e49577aSRod Evans	fmovd	%d18, %d14
1568*1e49577aSRod Evans	add	%o1, 128, %o1	! increment src
1569*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1570*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1571*1e49577aSRod Evans	add	%o0, 64, %o0		! advance dst
1572*1e49577aSRod Evans	fmovd	%d20, %d0
1573*1e49577aSRod Evans	fmovd	%d22, %d2
1574*1e49577aSRod Evans	fmovd	%d24, %d4
1575*1e49577aSRod Evans	fmovd	%d26, %d6
1576*1e49577aSRod Evans	fmovd	%d28, %d8
1577*1e49577aSRod Evans	fmovd	%d30, %d10
1578*1e49577aSRod Evans	bgt,pt	%ncc, .align_010_loop
1579*1e49577aSRod Evans	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1580*1e49577aSRod Evans
1581*1e49577aSRod Evans	std	%d0, [%o0]
1582*1e49577aSRod Evans	std	%d2, [%o0+8]
1583*1e49577aSRod Evans	std	%d4, [%o0+16]
1584*1e49577aSRod Evans	std	%d6, [%o0+24]
1585*1e49577aSRod Evans	std	%d8, [%o0+32]
1586*1e49577aSRod Evans	std	%d10, [%o0+40]
1587*1e49577aSRod Evans	ba	.remain_stuff
1588*1e49577aSRod Evans	add	%o0, 48, %o0
1589*1e49577aSRod Evans	! END OF align_010
1590*1e49577aSRod Evans
1591*1e49577aSRod Evans.align_001:
1592*1e49577aSRod Evans! Alignment off by 56 bytes
1593*1e49577aSRod Evans	ldd	[%o1], %d0
1594*1e49577aSRod Evans	ldd	[%o1+8], %d2
1595*1e49577aSRod Evans	ldd	[%o1+16], %d4
1596*1e49577aSRod Evans	ldd	[%o1+24], %d6
1597*1e49577aSRod Evans	ldd	[%o1+32], %d8
1598*1e49577aSRod Evans	ldd	[%o1+40], %d10
1599*1e49577aSRod Evans	ldd	[%o1+48], %d12
1600*1e49577aSRod Evans	add	%o1, 56, %o1
1601*1e49577aSRod Evans	sub	%o2, 56, %o2
1602*1e49577aSRod Evans	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1603*1e49577aSRod Evans	and	%o2, 0x7f, %o2		! residue bytes in %o2
1604*1e49577aSRod Evans.align_001_loop:
1605*1e49577aSRod Evans	subcc	%o5, 128, %o5
1606*1e49577aSRod Evans	/* ---- copy line 1 of 2. ---- */
1607*1e49577aSRod Evans
1608*1e49577aSRod Evans	ldda	[%o1]%asi,%d16		! block load
1609*1e49577aSRod Evans	fmovd	%d16, %d14
1610*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1611*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1612*1e49577aSRod Evans	add	%o0, 64, %o0		! advance dst
1613*1e49577aSRod Evans	fmovd	%d18, %d0
1614*1e49577aSRod Evans	fmovd	%d20, %d2
1615*1e49577aSRod Evans	fmovd	%d22, %d4
1616*1e49577aSRod Evans	fmovd	%d24, %d6
1617*1e49577aSRod Evans	fmovd	%d26, %d8
1618*1e49577aSRod Evans	fmovd	%d28, %d10
1619*1e49577aSRod Evans	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1620*1e49577aSRod Evans	fmovd	%d30, %d12
1621*1e49577aSRod Evans
1622*1e49577aSRod Evans	/* ---- copy line 2 of 2. ---- */
1623*1e49577aSRod Evans	ldda	[%o1+64]%asi,%d16
1624*1e49577aSRod Evans	fmovd	%d16, %d14
1625*1e49577aSRod Evans	add	%o1, 128, %o1		! increment src
1626*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1627*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1628*1e49577aSRod Evans	add	%o0, 64, %o0		! advance dst
1629*1e49577aSRod Evans	fmovd	%d18, %d0
1630*1e49577aSRod Evans	fmovd	%d20, %d2
1631*1e49577aSRod Evans	fmovd	%d22, %d4
1632*1e49577aSRod Evans	fmovd	%d24, %d6
1633*1e49577aSRod Evans	fmovd	%d26, %d8
1634*1e49577aSRod Evans	fmovd	%d28, %d10
1635*1e49577aSRod Evans	fmovd	%d30, %d12
1636*1e49577aSRod Evans	bgt,pt	%ncc, .align_001_loop
1637*1e49577aSRod Evans	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1638*1e49577aSRod Evans
1639*1e49577aSRod Evans	std	%d0, [%o0]
1640*1e49577aSRod Evans	std	%d2, [%o0+8]
1641*1e49577aSRod Evans	std	%d4, [%o0+16]
1642*1e49577aSRod Evans	std	%d6, [%o0+24]
1643*1e49577aSRod Evans	std	%d8, [%o0+32]
1644*1e49577aSRod Evans	std	%d10, [%o0+40]
1645*1e49577aSRod Evans	std	%d12, [%o0+48]
1646*1e49577aSRod Evans	ba	.remain_stuff
1647*1e49577aSRod Evans	add	%o0, 56, %o0
1648*1e49577aSRod Evans	! END OF align_001
1649*1e49577aSRod Evans
1650*1e49577aSRod Evans.align_000:
1651*1e49577aSRod Evans	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
1652*1e49577aSRod Evans	and	%o2, 0x7f, %o2		! residue bytes in %o2
1653*1e49577aSRod Evans.align_000_loop:
1654*1e49577aSRod Evans	/* ---- copy line 1 of 2. ---- */
1655*1e49577aSRod Evans	subcc	%o5, 128, %o5
1656*1e49577aSRod Evans	ldda	[%o1]%asi,%d0
1657*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1658*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1659*1e49577aSRod Evans	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
1660*1e49577aSRod Evans
1661*1e49577aSRod Evans	/* ---- copy line 2 of 2. ---- */
1662*1e49577aSRod Evans	add	%o0, 64, %o0
1663*1e49577aSRod Evans	ldda	[%o1+64]%asi,%d0
1664*1e49577aSRod Evans	add	%o1, 128, %o1		! increment src
1665*1e49577aSRod Evans	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
1666*1e49577aSRod Evans	stda	%d0,[%o0]%asi
1667*1e49577aSRod Evans	add	%o0, 64, %o0		! increment dst
1668*1e49577aSRod Evans	bgt,pt	%ncc, .align_000_loop
1669*1e49577aSRod Evans	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
1670*1e49577aSRod Evans
1671*1e49577aSRod Evans	! END OF align_000
1672*1e49577aSRod Evans
1673*1e49577aSRod Evans.remain_stuff:
1674*1e49577aSRod Evans	mov	%o4, %asi		! restore %asi
1675*1e49577aSRod Evans	brnz	%g5, .medlong
1676*1e49577aSRod Evans	membar	#Sync
1677*1e49577aSRod Evans	ba	.medlong
1678*1e49577aSRod Evans	wr	%g5, %g0, %fprs
1679*1e49577aSRod Evans
1680*1e49577aSRod Evans	.align 16
1681*1e49577aSRod Evans	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
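/*
 * A minimal C model of the alignaddr/faligndata copy used on this path
 * (a sketch: big-endian byte order assumed, and the source is known
 * not to be 8-byte aligned here, so 1 <= off <= 7; the large-count
 * path below does the same merge 64 bytes at a time with block loads
 * and stores):
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	static void
 *	unaligned_src_copy(uint64_t *dst, const uint8_t *src, size_t ndwords)
 *	{
 *		const uint64_t *asrc =
 *		    (const uint64_t *)((uintptr_t)src & ~(uintptr_t)7);
 *		unsigned off = (uintptr_t)src & 7;	// alignaddr: %gsr.align
 *		uint64_t hi = *asrc++;			// first aligned dword
 *
 *		while (ndwords-- > 0) {
 *			uint64_t lo = *asrc++;
 *			// faligndata hi,lo: bytes off..off+7 of hi:lo
 *			*dst++ = (hi << (off * 8)) | (lo >> ((8 - off) * 8));
 *			hi = lo;
 *		}
 *	}
 */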
1682*1e49577aSRod Evans.unalignsetup:
1683*1e49577aSRod Evans	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
1684*1e49577aSRod Evans.unalignrejoin:
1685*1e49577aSRod Evans	rd	%fprs, %g5		! check for unused fp
1686*1e49577aSRod Evans	! if fprs.fef == 0, set it.
1687*1e49577aSRod Evans	! Setting it when already set costs more than checking
1688*1e49577aSRod Evans	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
1689*1e49577aSRod Evans	bz,a	%ncc, 1f
1690*1e49577aSRod Evans	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1691*1e49577aSRod Evans1:
1692*1e49577aSRod Evans	cmp	%o2, MED_UMAX		! check for medium unaligned limit
1693*1e49577aSRod Evans	bge,pt	%ncc,.unalign_large
1694*1e49577aSRod Evans	nop
1695*1e49577aSRod Evans	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
1696*1e49577aSRod Evans	and	%o2, 0x3f, %o2		! residue bytes in %o2
1697*1e49577aSRod Evans	cmp	%o2, 8			! Ensure we don't load beyond
1698*1e49577aSRod Evans	bgt	.unalign_adjust		! end of source buffer
1699*1e49577aSRod Evans	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
1700*1e49577aSRod Evans	add	%o2, 64, %o2		! adjust to leave loop
1701*1e49577aSRod Evans	sub	%o5, 64, %o5		! early if necessary
1702*1e49577aSRod Evans.unalign_adjust:
1703*1e49577aSRod Evans	alignaddr %o1, %g0, %g0		! generate %gsr
1704*1e49577aSRod Evans	add	%o1, %o5, %o1		! advance %o1 to after blocks
1705*1e49577aSRod Evans	ldd	[%o4], %d0
1706*1e49577aSRod Evans.unalign_loop:
1707*1e49577aSRod Evans	ldd	[%o4+8], %d2
1708*1e49577aSRod Evans	faligndata %d0, %d2, %d16
1709*1e49577aSRod Evans	ldd	[%o4+16], %d4
1710*1e49577aSRod Evans	std	%d16, [%o0]
1711*1e49577aSRod Evans	faligndata %d2, %d4, %d18
1712*1e49577aSRod Evans	ldd	[%o4+24], %d6
1713*1e49577aSRod Evans	std	%d18, [%o0+8]
1714*1e49577aSRod Evans	faligndata %d4, %d6, %d20
1715*1e49577aSRod Evans	ldd	[%o4+32], %d8
1716*1e49577aSRod Evans	std	%d20, [%o0+16]
1717*1e49577aSRod Evans	faligndata %d6, %d8, %d22
1718*1e49577aSRod Evans	ldd	[%o4+40], %d10
1719*1e49577aSRod Evans	std	%d22, [%o0+24]
1720*1e49577aSRod Evans	faligndata %d8, %d10, %d24
1721*1e49577aSRod Evans	ldd	[%o4+48], %d12
1722*1e49577aSRod Evans	std	%d24, [%o0+32]
1723*1e49577aSRod Evans	faligndata %d10, %d12, %d26
1724*1e49577aSRod Evans	ldd	[%o4+56], %d14
1725*1e49577aSRod Evans	std	%d26, [%o0+40]
1726*1e49577aSRod Evans	faligndata %d12, %d14, %d28
1727*1e49577aSRod Evans	ldd	[%o4+64], %d0
1728*1e49577aSRod Evans	std	%d28, [%o0+48]
1729*1e49577aSRod Evans	faligndata %d14, %d0, %d30
1730*1e49577aSRod Evans	add	%o4, BLOCK_SIZE, %o4
1731*1e49577aSRod Evans	std	%d30, [%o0+56]
1732*1e49577aSRod Evans	add	%o0, BLOCK_SIZE, %o0
1733*1e49577aSRod Evans	subcc	%o5, BLOCK_SIZE, %o5
1734*1e49577aSRod Evans	bgu,pt	%ncc, .unalign_loop
1735*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1736*1e49577aSRod Evans	ba	.unalign_done
1737*1e49577aSRod Evans	nop
1738*1e49577aSRod Evans
1739*1e49577aSRod Evans.unalign_large:
1740*1e49577aSRod Evans	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
1741*1e49577aSRod Evans	bz	%ncc, .unalignsrc
1742*1e49577aSRod Evans	sub	%o3, 64, %o3		! %o3 will be multiple of 8
1743*1e49577aSRod Evans	neg	%o3			! bytes until dest is 64 byte aligned
1744*1e49577aSRod Evans	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
1745*1e49577aSRod Evans	! Move bytes according to source alignment
1746*1e49577aSRod Evans	andcc	%o1, 0x1, %o5
1747*1e49577aSRod Evans	bnz	%ncc, .unalignbyte	! check for byte alignment
1748*1e49577aSRod Evans	nop
1749*1e49577aSRod Evans	andcc	%o1, 2, %o5		! check for half word alignment
1750*1e49577aSRod Evans	bnz	%ncc, .unalignhalf
1751*1e49577aSRod Evans	nop
1752*1e49577aSRod Evans	! Src is word aligned
1753*1e49577aSRod Evans.unalignword:
1754*1e49577aSRod Evans	ld	[%o1], %o4		! load 4 bytes
1755*1e49577aSRod Evans	stw	%o4, [%o0]		! and store 4 bytes
1756*1e49577aSRod Evans	ld	[%o1+4], %o4		! load 4 bytes
1757*1e49577aSRod Evans	add	%o1, 8, %o1		! increase src ptr by 8
1758*1e49577aSRod Evans	stw	%o4, [%o0+4]		! and store 4 bytes
1759*1e49577aSRod Evans	subcc	%o3, 8, %o3		! decrease count by 8
1760*1e49577aSRod Evans	bnz	%ncc, .unalignword
1761*1e49577aSRod Evans	add	%o0, 8, %o0		! increase dst ptr by 8
1762*1e49577aSRod Evans	ba	.unalignsrc
1763*1e49577aSRod Evans	nop
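/*
 * The half-word- and byte-aligned head loops that follow assemble each
 * 64-bit store from big-endian pieces before writing it with stx.  A C
 * sketch of the half-word case (names illustrative, big-endian order
 * assumed):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static uint64_t
 *	gather_halfword_aligned(const uint8_t *s)
 *	{
 *		uint16_t h0, h2;
 *		uint32_t w1;
 *
 *		memcpy(&h0, s, 2);		// lduh [%o1]
 *		memcpy(&w1, s + 2, 4);		// lduw [%o1+2]
 *		memcpy(&h2, s + 6, 2);		// lduh [%o1+6]
 *		return (((uint64_t)h0 << 48) | ((uint64_t)w1 << 16) | h2);
 *	}
 *
 * The byte-aligned case (.unalignbyte) gathers 1+2+2+2+1 bytes the same
 * way.
 */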
1764*1e49577aSRod Evans
1765*1e49577aSRod Evans	! Src is half-word aligned
1766*1e49577aSRod Evans.unalignhalf:
1767*1e49577aSRod Evans	lduh	[%o1], %o4		! load 2 bytes
1768*1e49577aSRod Evans	sllx	%o4, 32, %o5		! shift left
1769*1e49577aSRod Evans	lduw	[%o1+2], %o4
1770*1e49577aSRod Evans	or	%o4, %o5, %o5
1771*1e49577aSRod Evans	sllx	%o5, 16, %o5
1772*1e49577aSRod Evans	lduh	[%o1+6], %o4
1773*1e49577aSRod Evans	or	%o4, %o5, %o5
1774*1e49577aSRod Evans	stx	%o5, [%o0]
1775*1e49577aSRod Evans	add	%o1, 8, %o1
1776*1e49577aSRod Evans	subcc	%o3, 8, %o3
1777*1e49577aSRod Evans	bnz	%ncc, .unalignhalf
1778*1e49577aSRod Evans	add	%o0, 8, %o0
1779*1e49577aSRod Evans	ba	.unalignsrc
1780*1e49577aSRod Evans	nop
1781*1e49577aSRod Evans
1782*1e49577aSRod Evans	! Src is Byte aligned
1783*1e49577aSRod Evans.unalignbyte:
1784*1e49577aSRod Evans	sub	%o0, %o1, %o0		! %o0 = dst - src, so advancing %o1 advances both
1785*1e49577aSRod Evans.unalignbyte_loop:
1786*1e49577aSRod Evans	ldub	[%o1], %o4
1787*1e49577aSRod Evans	sllx	%o4, 56, %o5
1788*1e49577aSRod Evans	lduh	[%o1+1], %o4
1789*1e49577aSRod Evans	sllx	%o4, 40, %o4
1790*1e49577aSRod Evans	or	%o4, %o5, %o5
1791*1e49577aSRod Evans	lduh	[%o1+3], %o4
1792*1e49577aSRod Evans	sllx	%o4, 24, %o4
1793*1e49577aSRod Evans	or	%o4, %o5, %o5
1794*1e49577aSRod Evans	lduh	[%o1+5], %o4
1795*1e49577aSRod Evans	sllx	%o4,  8, %o4
1796*1e49577aSRod Evans	or	%o4, %o5, %o5
1797*1e49577aSRod Evans	ldub	[%o1+7], %o4
1798*1e49577aSRod Evans	or	%o4, %o5, %o5
1799*1e49577aSRod Evans	stx	%o5, [%o0+%o1]
1800*1e49577aSRod Evans	subcc	%o3, 8, %o3
1801*1e49577aSRod Evans	bnz	%ncc, .unalignbyte_loop
1802*1e49577aSRod Evans	add	%o1, 8, %o1
1803*1e49577aSRod Evans	add	%o0,%o1, %o0 		! restore pointer
1804*1e49577aSRod Evans
1805*1e49577aSRod Evans	! Destination is now block (64 byte) aligned
1806*1e49577aSRod Evans.unalignsrc:
1807*1e49577aSRod Evans	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
1808*1e49577aSRod Evans	and	%o2, 0x3f, %o2		! residue bytes in %o2
1809*1e49577aSRod Evans	add	%o2, 64, %o2		! Ensure we don't load beyond
1810*1e49577aSRod Evans	sub	%o5, 64, %o5		! end of source buffer
1811*1e49577aSRod Evans
1812*1e49577aSRod Evans	andn	%o1, 0x3f, %o4		! %o4 has block aligned src address
1813*1e49577aSRod Evans	prefetch [%o4 + (3 * BLOCK_SIZE)], #one_read
1814*1e49577aSRod Evans	alignaddr %o1, %g0, %g0		! generate %gsr
1815*1e49577aSRod Evans	add	%o1, %o5, %o1		! advance %o1 to after blocks
1816*1e49577aSRod Evans	!
1817*1e49577aSRod Evans	! Determine source alignment to correct 8 byte offset
1818*1e49577aSRod Evans	andcc	%o1, 0x20, %o3
1819*1e49577aSRod Evans	brnz,pn	%o3, .unalign_1
1820*1e49577aSRod Evans	nop
1821*1e49577aSRod Evans	andcc	%o1, 0x10, %o3
1822*1e49577aSRod Evans	brnz,pn	%o3, .unalign_01
1823*1e49577aSRod Evans	nop
1824*1e49577aSRod Evans	andcc	%o1, 0x08, %o3
1825*1e49577aSRod Evans	brz,a	%o3, .unalign_000
1826*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1827*1e49577aSRod Evans	ba	.unalign_001
1828*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1829*1e49577aSRod Evans.unalign_01:
1830*1e49577aSRod Evans	andcc	%o1, 0x08, %o3
1831*1e49577aSRod Evans	brnz,a	%o3, .unalign_011
1832*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1833*1e49577aSRod Evans	ba	.unalign_010
1834*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1835*1e49577aSRod Evans.unalign_1:
1836*1e49577aSRod Evans	andcc	%o1, 0x10, %o3
1837*1e49577aSRod Evans	brnz,pn	%o3, .unalign_11
1838*1e49577aSRod Evans	nop
1839*1e49577aSRod Evans	andcc	%o1, 0x08, %o3
1840*1e49577aSRod Evans	brnz,a	%o3, .unalign_101
1841*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1842*1e49577aSRod Evans	ba	.unalign_100
1843*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1844*1e49577aSRod Evans.unalign_11:
1845*1e49577aSRod Evans	andcc	%o1, 0x08, %o3
1846*1e49577aSRod Evans	brz,pn	%o3, .unalign_110
1847*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1848*1e49577aSRod Evans
1849*1e49577aSRod Evans.unalign_111:
1850*1e49577aSRod Evans	ldd	[%o4+56], %d14
1851*1e49577aSRod Evans.unalign_111_loop:
1852*1e49577aSRod Evans	add	%o4, 64, %o4
1853*1e49577aSRod Evans	ldda	[%o4]ASI_BLK_P, %d16
1854*1e49577aSRod Evans	faligndata %d14, %d16, %d48
1855*1e49577aSRod Evans	faligndata %d16, %d18, %d50
1856*1e49577aSRod Evans	faligndata %d18, %d20, %d52
1857*1e49577aSRod Evans	faligndata %d20, %d22, %d54
1858*1e49577aSRod Evans	faligndata %d22, %d24, %d56
1859*1e49577aSRod Evans	faligndata %d24, %d26, %d58
1860*1e49577aSRod Evans	faligndata %d26, %d28, %d60
1861*1e49577aSRod Evans	faligndata %d28, %d30, %d62
1862*1e49577aSRod Evans	fmovd	%d30, %d14
1863*1e49577aSRod Evans	stda	%d48, [%o0]ASI_BLK_P
1864*1e49577aSRod Evans	subcc	%o5, 64, %o5
1865*1e49577aSRod Evans	add	%o0, 64, %o0
1866*1e49577aSRod Evans	bgu,pt	%ncc, .unalign_111_loop
1867*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1868*1e49577aSRod Evans	ba	.unalign_done
1869*1e49577aSRod Evans	membar	#Sync
1870*1e49577aSRod Evans
1871*1e49577aSRod Evans.unalign_110:
1872*1e49577aSRod Evans	ldd	[%o4+48], %d12
1873*1e49577aSRod Evans	ldd	[%o4+56], %d14
1874*1e49577aSRod Evans.unalign_110_loop:
1875*1e49577aSRod Evans	add	%o4, 64, %o4
1876*1e49577aSRod Evans	ldda	[%o4]ASI_BLK_P, %d16
1877*1e49577aSRod Evans	faligndata %d12, %d14, %d48
1878*1e49577aSRod Evans	faligndata %d14, %d16, %d50
1879*1e49577aSRod Evans	faligndata %d16, %d18, %d52
1880*1e49577aSRod Evans	faligndata %d18, %d20, %d54
1881*1e49577aSRod Evans	faligndata %d20, %d22, %d56
1882*1e49577aSRod Evans	faligndata %d22, %d24, %d58
1883*1e49577aSRod Evans	faligndata %d24, %d26, %d60
1884*1e49577aSRod Evans	faligndata %d26, %d28, %d62
1885*1e49577aSRod Evans	fmovd	%d28, %d12
1886*1e49577aSRod Evans	fmovd	%d30, %d14
1887*1e49577aSRod Evans	stda	%d48, [%o0]ASI_BLK_P
1888*1e49577aSRod Evans	subcc	%o5, 64, %o5
1889*1e49577aSRod Evans	add	%o0, 64, %o0
1890*1e49577aSRod Evans	bgu,pt	%ncc, .unalign_110_loop
1891*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1892*1e49577aSRod Evans	ba	.unalign_done
1893*1e49577aSRod Evans	membar	#Sync
1894*1e49577aSRod Evans
1895*1e49577aSRod Evans.unalign_101:
1896*1e49577aSRod Evans	ldd	[%o4+40], %d10
1897*1e49577aSRod Evans	ldd	[%o4+48], %d12
1898*1e49577aSRod Evans	ldd	[%o4+56], %d14
1899*1e49577aSRod Evans.unalign_101_loop:
1900*1e49577aSRod Evans	add	%o4, 64, %o4
1901*1e49577aSRod Evans	ldda	[%o4]ASI_BLK_P, %d16
1902*1e49577aSRod Evans	faligndata %d10, %d12, %d48
1903*1e49577aSRod Evans	faligndata %d12, %d14, %d50
1904*1e49577aSRod Evans	faligndata %d14, %d16, %d52
1905*1e49577aSRod Evans	faligndata %d16, %d18, %d54
1906*1e49577aSRod Evans	faligndata %d18, %d20, %d56
1907*1e49577aSRod Evans	faligndata %d20, %d22, %d58
1908*1e49577aSRod Evans	faligndata %d22, %d24, %d60
1909*1e49577aSRod Evans	faligndata %d24, %d26, %d62
1910*1e49577aSRod Evans	fmovd	%d26, %d10
1911*1e49577aSRod Evans	fmovd	%d28, %d12
1912*1e49577aSRod Evans	fmovd	%d30, %d14
1913*1e49577aSRod Evans	stda	%d48, [%o0]ASI_BLK_P
1914*1e49577aSRod Evans	subcc	%o5, 64, %o5
1915*1e49577aSRod Evans	add	%o0, 64, %o0
1916*1e49577aSRod Evans	bgu,pt	%ncc, .unalign_101_loop
1917*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1918*1e49577aSRod Evans	ba	.unalign_done
1919*1e49577aSRod Evans	membar	#Sync
1920*1e49577aSRod Evans
1921*1e49577aSRod Evans.unalign_100:
1922*1e49577aSRod Evans	ldd	[%o4+32], %d8
1923*1e49577aSRod Evans	ldd	[%o4+40], %d10
1924*1e49577aSRod Evans	ldd	[%o4+48], %d12
1925*1e49577aSRod Evans	ldd	[%o4+56], %d14
1926*1e49577aSRod Evans.unalign_100_loop:
1927*1e49577aSRod Evans	add	%o4, 64, %o4
1928*1e49577aSRod Evans	ldda	[%o4]ASI_BLK_P, %d16
1929*1e49577aSRod Evans	faligndata %d8, %d10, %d48
1930*1e49577aSRod Evans	faligndata %d10, %d12, %d50
1931*1e49577aSRod Evans	faligndata %d12, %d14, %d52
1932*1e49577aSRod Evans	faligndata %d14, %d16, %d54
1933*1e49577aSRod Evans	faligndata %d16, %d18, %d56
1934*1e49577aSRod Evans	faligndata %d18, %d20, %d58
1935*1e49577aSRod Evans	faligndata %d20, %d22, %d60
1936*1e49577aSRod Evans	faligndata %d22, %d24, %d62
1937*1e49577aSRod Evans	fmovd	%d24, %d8
1938*1e49577aSRod Evans	fmovd	%d26, %d10
1939*1e49577aSRod Evans	fmovd	%d28, %d12
1940*1e49577aSRod Evans	fmovd	%d30, %d14
1941*1e49577aSRod Evans	stda	%d48, [%o0]ASI_BLK_P
1942*1e49577aSRod Evans	subcc	%o5, 64, %o5
1943*1e49577aSRod Evans	add	%o0, 64, %o0
1944*1e49577aSRod Evans	bgu,pt	%ncc, .unalign_100_loop
1945*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1946*1e49577aSRod Evans	ba	.unalign_done
1947*1e49577aSRod Evans	membar	#Sync
1948*1e49577aSRod Evans
1949*1e49577aSRod Evans.unalign_011:
1950*1e49577aSRod Evans	ldd	[%o4+24], %d6
1951*1e49577aSRod Evans	ldd	[%o4+32], %d8
1952*1e49577aSRod Evans	ldd	[%o4+40], %d10
1953*1e49577aSRod Evans	ldd	[%o4+48], %d12
1954*1e49577aSRod Evans	ldd	[%o4+56], %d14
1955*1e49577aSRod Evans.unalign_011_loop:
1956*1e49577aSRod Evans	add	%o4, 64, %o4
1957*1e49577aSRod Evans	ldda	[%o4]ASI_BLK_P, %d16
1958*1e49577aSRod Evans	faligndata %d6, %d8, %d48
1959*1e49577aSRod Evans	faligndata %d8, %d10, %d50
1960*1e49577aSRod Evans	faligndata %d10, %d12, %d52
1961*1e49577aSRod Evans	faligndata %d12, %d14, %d54
1962*1e49577aSRod Evans	faligndata %d14, %d16, %d56
1963*1e49577aSRod Evans	faligndata %d16, %d18, %d58
1964*1e49577aSRod Evans	faligndata %d18, %d20, %d60
1965*1e49577aSRod Evans	faligndata %d20, %d22, %d62
1966*1e49577aSRod Evans	fmovd	%d22, %d6
1967*1e49577aSRod Evans	fmovd	%d24, %d8
1968*1e49577aSRod Evans	fmovd	%d26, %d10
1969*1e49577aSRod Evans	fmovd	%d28, %d12
1970*1e49577aSRod Evans	fmovd	%d30, %d14
1971*1e49577aSRod Evans	stda	%d48, [%o0]ASI_BLK_P
1972*1e49577aSRod Evans	subcc	%o5, 64, %o5
1973*1e49577aSRod Evans	add	%o0, 64, %o0
1974*1e49577aSRod Evans	bgu,pt	%ncc, .unalign_011_loop
1975*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1976*1e49577aSRod Evans	ba	.unalign_done
1977*1e49577aSRod Evans	membar	#Sync
1978*1e49577aSRod Evans
1979*1e49577aSRod Evans.unalign_010:
1980*1e49577aSRod Evans	ldd	[%o4+16], %d4
1981*1e49577aSRod Evans	ldd	[%o4+24], %d6
1982*1e49577aSRod Evans	ldd	[%o4+32], %d8
1983*1e49577aSRod Evans	ldd	[%o4+40], %d10
1984*1e49577aSRod Evans	ldd	[%o4+48], %d12
1985*1e49577aSRod Evans	ldd	[%o4+56], %d14
1986*1e49577aSRod Evans.unalign_010_loop:
1987*1e49577aSRod Evans	add	%o4, 64, %o4
1988*1e49577aSRod Evans	ldda	[%o4]ASI_BLK_P, %d16
1989*1e49577aSRod Evans	faligndata %d4, %d6, %d48
1990*1e49577aSRod Evans	faligndata %d6, %d8, %d50
1991*1e49577aSRod Evans	faligndata %d8, %d10, %d52
1992*1e49577aSRod Evans	faligndata %d10, %d12, %d54
1993*1e49577aSRod Evans	faligndata %d12, %d14, %d56
1994*1e49577aSRod Evans	faligndata %d14, %d16, %d58
1995*1e49577aSRod Evans	faligndata %d16, %d18, %d60
1996*1e49577aSRod Evans	faligndata %d18, %d20, %d62
1997*1e49577aSRod Evans	fmovd	%d20, %d4
1998*1e49577aSRod Evans	fmovd	%d22, %d6
1999*1e49577aSRod Evans	fmovd	%d24, %d8
2000*1e49577aSRod Evans	fmovd	%d26, %d10
2001*1e49577aSRod Evans	fmovd	%d28, %d12
2002*1e49577aSRod Evans	fmovd	%d30, %d14
2003*1e49577aSRod Evans	stda	%d48, [%o0]ASI_BLK_P
2004*1e49577aSRod Evans	subcc	%o5, 64, %o5
2005*1e49577aSRod Evans	add	%o0, 64, %o0
2006*1e49577aSRod Evans	bgu,pt	%ncc, .unalign_010_loop
2007*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
2008*1e49577aSRod Evans	ba	.unalign_done
2009*1e49577aSRod Evans	membar	#Sync
2010*1e49577aSRod Evans
2011*1e49577aSRod Evans.unalign_001:
2012*1e49577aSRod Evans	ldd	[%o4+8], %d2
2013*1e49577aSRod Evans	ldd	[%o4+16], %d4
2014*1e49577aSRod Evans	ldd	[%o4+24], %d6
2015*1e49577aSRod Evans	ldd	[%o4+32], %d8
2016*1e49577aSRod Evans	ldd	[%o4+40], %d10
2017*1e49577aSRod Evans	ldd	[%o4+48], %d12
2018*1e49577aSRod Evans	ldd	[%o4+56], %d14
2019*1e49577aSRod Evans.unalign_001_loop:
2020*1e49577aSRod Evans	add	%o4, 64, %o4
2021*1e49577aSRod Evans	ldda	[%o4]ASI_BLK_P, %d16
2022*1e49577aSRod Evans	faligndata %d2, %d4, %d48
2023*1e49577aSRod Evans	faligndata %d4, %d6, %d50
2024*1e49577aSRod Evans	faligndata %d6, %d8, %d52
2025*1e49577aSRod Evans	faligndata %d8, %d10, %d54
2026*1e49577aSRod Evans	faligndata %d10, %d12, %d56
2027*1e49577aSRod Evans	faligndata %d12, %d14, %d58
2028*1e49577aSRod Evans	faligndata %d14, %d16, %d60
2029*1e49577aSRod Evans	faligndata %d16, %d18, %d62
2030*1e49577aSRod Evans	fmovd	%d18, %d2
2031*1e49577aSRod Evans	fmovd	%d20, %d4
2032*1e49577aSRod Evans	fmovd	%d22, %d6
2033*1e49577aSRod Evans	fmovd	%d24, %d8
2034*1e49577aSRod Evans	fmovd	%d26, %d10
2035*1e49577aSRod Evans	fmovd	%d28, %d12
2036*1e49577aSRod Evans	fmovd	%d30, %d14
2037*1e49577aSRod Evans	stda	%d48, [%o0]ASI_BLK_P
2038*1e49577aSRod Evans	subcc	%o5, 64, %o5
2039*1e49577aSRod Evans	add	%o0, 64, %o0
2040*1e49577aSRod Evans	bgu,pt	%ncc, .unalign_001_loop
2041*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
2042*1e49577aSRod Evans	ba	.unalign_done
2043*1e49577aSRod Evans	membar	#Sync
2044*1e49577aSRod Evans
2045*1e49577aSRod Evans.unalign_000:
2046*1e49577aSRod Evans	ldda	[%o4]ASI_BLK_P, %d0
2047*1e49577aSRod Evans.unalign_000_loop:
2048*1e49577aSRod Evans	add	%o4, 64, %o4
2049*1e49577aSRod Evans	ldda	[%o4]ASI_BLK_P, %d16
2050*1e49577aSRod Evans	faligndata %d0, %d2, %d48
2051*1e49577aSRod Evans	faligndata %d2, %d4, %d50
2052*1e49577aSRod Evans	faligndata %d4, %d6, %d52
2053*1e49577aSRod Evans	faligndata %d6, %d8, %d54
2054*1e49577aSRod Evans	faligndata %d8, %d10, %d56
2055*1e49577aSRod Evans	faligndata %d10, %d12, %d58
2056*1e49577aSRod Evans	faligndata %d12, %d14, %d60
2057*1e49577aSRod Evans	faligndata %d14, %d16, %d62
2058*1e49577aSRod Evans	fmovd	%d16, %d0
2059*1e49577aSRod Evans	fmovd	%d18, %d2
2060*1e49577aSRod Evans	fmovd	%d20, %d4
2061*1e49577aSRod Evans	fmovd	%d22, %d6
2062*1e49577aSRod Evans	fmovd	%d24, %d8
2063*1e49577aSRod Evans	fmovd	%d26, %d10
2064*1e49577aSRod Evans	fmovd	%d28, %d12
2065*1e49577aSRod Evans	fmovd	%d30, %d14
2066*1e49577aSRod Evans	stda	%d48, [%o0]ASI_BLK_P
2067*1e49577aSRod Evans	subcc	%o5, 64, %o5
2068*1e49577aSRod Evans	add	%o0, 64, %o0
2069*1e49577aSRod Evans	bgu,pt	%ncc, .unalign_000_loop
2070*1e49577aSRod Evans	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
2071*1e49577aSRod Evans	membar	#Sync
2072*1e49577aSRod Evans
2073*1e49577aSRod Evans.unalign_done:
2074*1e49577aSRod Evans	! Handle trailing bytes, 64 to 127
2075*1e49577aSRod Evans	! Dest long word aligned, Src not long word aligned
2076*1e49577aSRod Evans	cmp	%o2, 15
2077*1e49577aSRod Evans	bleu	%ncc, .unalign_short
2078*1e49577aSRod Evans
2079*1e49577aSRod Evans	andn	%o2, 0x7, %o5		! %o5 is multiple of 8
2080*1e49577aSRod Evans	and	%o2, 0x7, %o2		! residue bytes in %o2
2081*1e49577aSRod Evans	add	%o2, 8, %o2
2082*1e49577aSRod Evans	sub	%o5, 8, %o5		! ensure we don't load past end of src
2083*1e49577aSRod Evans	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
2084*1e49577aSRod Evans	add	%o1, %o5, %o1		! advance %o1 to after multiple of 8
2085*1e49577aSRod Evans	ldd	[%o4], %d0		! fetch partial word
2086*1e49577aSRod Evans.unalign_by8:
2087*1e49577aSRod Evans	ldd	[%o4+8], %d2
2088*1e49577aSRod Evans	add	%o4, 8, %o4
2089*1e49577aSRod Evans	faligndata %d0, %d2, %d16
2090*1e49577aSRod Evans	subcc	%o5, 8, %o5
2091*1e49577aSRod Evans	std	%d16, [%o0]
2092*1e49577aSRod Evans	fmovd	%d2, %d0
2093*1e49577aSRod Evans	bgu,pt	%ncc, .unalign_by8
2094*1e49577aSRod Evans	add	%o0, 8, %o0
2095*1e49577aSRod Evans
2096*1e49577aSRod Evans.unalign_short:
2097*1e49577aSRod Evans	brnz	%g5, .smallrest
2098*1e49577aSRod Evans	nop
2099*1e49577aSRod Evans	ba	.smallrest
2100*1e49577aSRod Evans	wr	%g5, %g0, %fprs
2101*1e49577aSRod Evans#else	/* NIAGARA2_IMPL */
2102*1e49577aSRod Evans.forcpy:
2103*1e49577aSRod Evans	mov	%o0, %g5		! save des address for return val
2104*1e49577aSRod Evans	cmp	%o2, 17			! for small counts copy bytes
2105*1e49577aSRod Evans	bleu,pt	%ncc, .dbytecp
2106*1e49577aSRod Evans	nop
2107*1e49577aSRod Evans
2108*1e49577aSRod Evans	cmp	%o2, 0x80		! For lengths of 128 bytes or less do not
2109*1e49577aSRod Evans	bleu,pn	%ncc, .no_blkcpy	! copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2110*1e49577aSRod Evans
2111*1e49577aSRod Evans	/*
2112*1e49577aSRod Evans	 * Make sure the source is either behind the destination or at least
2113*1e49577aSRod Evans	 * 64 bytes ahead of it.  If it is not, do not use the
2114*1e49577aSRod Evans	 * ASI_BLK_INIT_ST_QUAD_LDD_P ASI to copy the data.
2115*1e49577aSRod Evans	 */
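	/*
	 * In C terms the test below is roughly (a sketch; the unsigned
	 * subtraction sends the source-behind-destination case to the
	 * block copy as well):
	 *
	 *	#include <stdint.h>
	 *
	 *	static int
	 *	use_block_copy(const void *dst, const void *src)
	 *	{
	 *		uintptr_t diff = (uintptr_t)src - (uintptr_t)dst;
	 *
	 *		return ((uintptr_t)src < (uintptr_t)dst || diff >= 0x40);
	 *	}
	 */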
2116*1e49577aSRod Evans	subcc	%o1, %o0, %o3
2117*1e49577aSRod Evans	blu	%ncc, .blkalgndst
2118*1e49577aSRod Evans	cmp	%o3, 0x40		! if src - dst >= 0x40
2119*1e49577aSRod Evans	bgeu,pt	%ncc, .blkalgndst	! then use ASI_BLK_INIT_ST_QUAD_LDD_P
2120*1e49577aSRod Evans.no_blkcpy:
2121*1e49577aSRod Evans	andcc	%o1, 3, %o5		! is src word aligned
2122*1e49577aSRod Evans	bz,pn	%ncc, .aldst
2123*1e49577aSRod Evans	cmp	%o5, 2			! is src half-word aligned
2124*1e49577aSRod Evans	be,pt	%ncc, .s2algn
2125*1e49577aSRod Evans	cmp	%o5, 3			! src is byte aligned
2126*1e49577aSRod Evans.s1algn:ldub	[%o1], %o3		! move 1 or 3 bytes to align it
2127*1e49577aSRod Evans	inc	1, %o1
2128*1e49577aSRod Evans	stb	%o3, [%g5]		! move a byte to align src
2129*1e49577aSRod Evans	inc	1, %g5
2130*1e49577aSRod Evans	bne,pt	%ncc, .s2algn
2131*1e49577aSRod Evans	dec	%o2
2132*1e49577aSRod Evans	b	.ald			! now go align dest
2133*1e49577aSRod Evans	andcc	%g5, 3, %o5
2134*1e49577aSRod Evans
2135*1e49577aSRod Evans.s2algn:lduh	[%o1], %o3		! know src is 2 byte aligned
2136*1e49577aSRod Evans	inc	2, %o1
2137*1e49577aSRod Evans	srl	%o3, 8, %o4
2138*1e49577aSRod Evans	stb	%o4, [%g5]		! have to do bytes,
2139*1e49577aSRod Evans	stb	%o3, [%g5 + 1]		! don't know dst alignment
2140*1e49577aSRod Evans	inc	2, %g5
2141*1e49577aSRod Evans	dec	2, %o2
2142*1e49577aSRod Evans
2143*1e49577aSRod Evans.aldst:	andcc	%g5, 3, %o5		! align the destination address
2144*1e49577aSRod Evans.ald:	bz,pn	%ncc, .w4cp
2145*1e49577aSRod Evans	cmp	%o5, 2
2146*1e49577aSRod Evans	bz,pn	%ncc, .w2cp
2147*1e49577aSRod Evans	cmp	%o5, 3
2148*1e49577aSRod Evans.w3cp:	lduw	[%o1], %o4
2149*1e49577aSRod Evans	inc	4, %o1
2150*1e49577aSRod Evans	srl	%o4, 24, %o5
2151*1e49577aSRod Evans	stb	%o5, [%g5]
2152*1e49577aSRod Evans	bne,pt	%ncc, .w1cp
2153*1e49577aSRod Evans	inc	%g5
2154*1e49577aSRod Evans	dec	1, %o2
2155*1e49577aSRod Evans	andn	%o2, 3, %o3		! o3 is aligned word count
2156*1e49577aSRod Evans	dec	4, %o3			! avoid reading beyond tail of src
2157*1e49577aSRod Evans	sub	%o1, %g5, %o1		! o1 gets the difference
2158*1e49577aSRod Evans
2159*1e49577aSRod Evans1:	sll	%o4, 8, %g1		! save residual bytes
2160*1e49577aSRod Evans	lduw	[%o1+%g5], %o4
2161*1e49577aSRod Evans	deccc	4, %o3
2162*1e49577aSRod Evans	srl	%o4, 24, %o5		! merge with residual
2163*1e49577aSRod Evans	or	%o5, %g1, %g1
2164*1e49577aSRod Evans	st	%g1, [%g5]
2165*1e49577aSRod Evans	bnz,pt	%ncc, 1b
2166*1e49577aSRod Evans	inc	4, %g5
2167*1e49577aSRod Evans	sub	%o1, 3, %o1		! used one byte of last word read
2168*1e49577aSRod Evans	and	%o2, 3, %o2
2169*1e49577aSRod Evans	b	7f
2170*1e49577aSRod Evans	inc	4, %o2
2171*1e49577aSRod Evans
2172*1e49577aSRod Evans.w1cp:	srl	%o4, 8, %o5
2173*1e49577aSRod Evans	sth	%o5, [%g5]
2174*1e49577aSRod Evans	inc	2, %g5
2175*1e49577aSRod Evans	dec	3, %o2
2176*1e49577aSRod Evans	andn	%o2, 3, %o3		! o3 is aligned word count
2177*1e49577aSRod Evans	dec	4, %o3			! avoid reading beyond tail of src
2178*1e49577aSRod Evans	sub	%o1, %g5, %o1		! o1 gets the difference
2179*1e49577aSRod Evans
2180*1e49577aSRod Evans2:	sll	%o4, 24, %g1		! save residual bytes
2181*1e49577aSRod Evans	lduw	[%o1+%g5], %o4
2182*1e49577aSRod Evans	deccc	4, %o3
2183*1e49577aSRod Evans	srl	%o4, 8, %o5		! merge with residual
2184*1e49577aSRod Evans	or	%o5, %g1, %g1
2185*1e49577aSRod Evans	st	%g1, [%g5]
2186*1e49577aSRod Evans	bnz,pt	%ncc, 2b
2187*1e49577aSRod Evans	inc	4, %g5
2188*1e49577aSRod Evans	sub	%o1, 1, %o1		! used three bytes of last word read
2189*1e49577aSRod Evans	and	%o2, 3, %o2
2190*1e49577aSRod Evans	b	7f
2191*1e49577aSRod Evans	inc	4, %o2
2192*1e49577aSRod Evans
2193*1e49577aSRod Evans.w2cp:	lduw	[%o1], %o4
2194*1e49577aSRod Evans	inc	4, %o1
2195*1e49577aSRod Evans	srl	%o4, 16, %o5
2196*1e49577aSRod Evans	sth	%o5, [%g5]
2197*1e49577aSRod Evans	inc	2, %g5
2198*1e49577aSRod Evans	dec	2, %o2
2199*1e49577aSRod Evans	andn	%o2, 3, %o3		! o3 is aligned word count
2200*1e49577aSRod Evans	dec	4, %o3			! avoid reading beyond tail of src
2201*1e49577aSRod Evans	sub	%o1, %g5, %o1		! o1 gets the difference
2202*1e49577aSRod Evans
2203*1e49577aSRod Evans3:	sll	%o4, 16, %g1		! save residual bytes
2204*1e49577aSRod Evans	lduw	[%o1+%g5], %o4
2205*1e49577aSRod Evans	deccc	4, %o3
2206*1e49577aSRod Evans	srl	%o4, 16, %o5		! merge with residual
2207*1e49577aSRod Evans	or	%o5, %g1, %g1
2208*1e49577aSRod Evans	st	%g1, [%g5]
2209*1e49577aSRod Evans	bnz,pt	%ncc, 3b
2210*1e49577aSRod Evans	inc	4, %g5
2211*1e49577aSRod Evans	sub	%o1, 2, %o1		! used two bytes of last word read
2212*1e49577aSRod Evans	and	%o2, 3, %o2
2213*1e49577aSRod Evans	b	7f
2214*1e49577aSRod Evans	inc	4, %o2
2215*1e49577aSRod Evans
2216*1e49577aSRod Evans.w4cp:	andn	%o2, 3, %o3		! o3 is aligned word count
2217*1e49577aSRod Evans	sub	%o1, %g5, %o1		! o1 gets the difference
2218*1e49577aSRod Evans
2219*1e49577aSRod Evans1:	lduw	[%o1+%g5], %o4		! read from address
2220*1e49577aSRod Evans	deccc	4, %o3			! decrement count
2221*1e49577aSRod Evans	st	%o4, [%g5]		! write at destination address
2222*1e49577aSRod Evans	bgu,pt	%ncc, 1b
2223*1e49577aSRod Evans	inc	4, %g5			! increment to address
2224*1e49577aSRod Evans	b	7f
2225*1e49577aSRod Evans	and	%o2, 3, %o2		! number of leftover bytes, if any
2226*1e49577aSRod Evans
2227*1e49577aSRod Evans	!
2228*1e49577aSRod Evans	! differenced byte copy, works with any alignment
2229*1e49577aSRod Evans	!
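	!
	! A C model of this differenced copy (a sketch; the source pointer
	! is pre-biased by the destination so a single index walks both):
	!
	!	#include <stddef.h>
	!
	!	static void
	!	dbytecp(char *dst, const char *src, size_t n)
	!	{
	!		ptrdiff_t diff = src - dst;	/* o1 gets the difference */
	!
	!		while (n-- != 0) {
	!			*dst = dst[diff];	/* ldub [%o1+%g5] */
	!			dst++;			/* inc %g5 */
	!		}
	!	}
	!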
2230*1e49577aSRod Evans.dbytecp:
2231*1e49577aSRod Evans	b	7f
2232*1e49577aSRod Evans	sub	%o1, %g5, %o1		! o1 gets the difference
2233*1e49577aSRod Evans
2234*1e49577aSRod Evans4:	stb	%o4, [%g5]		! write to address
2235*1e49577aSRod Evans	inc	%g5			! inc to address
2236*1e49577aSRod Evans7:	deccc	%o2			! decrement count
2237*1e49577aSRod Evans	bgeu,a,pt %ncc,4b		! loop till done
2238*1e49577aSRod Evans	ldub	[%o1+%g5], %o4		! read from address
2239*1e49577aSRod Evans	retl				! %o0 was preserved
2240*1e49577aSRod Evans	nop
2241*1e49577aSRod Evans
2242*1e49577aSRod Evans.blkalgndst:
2243*1e49577aSRod Evans	save	%sp, -SA(MINFRAME), %sp
2244*1e49577aSRod Evans
2245*1e49577aSRod Evans	! Block (64 bytes) align the destination.
2246*1e49577aSRod Evans	andcc	%i0, 0x3f, %i3		! is dst block aligned
2247*1e49577aSRod Evans	bz	%ncc, .chksrc		! dst already block aligned
2248*1e49577aSRod Evans	sub	%i3, 0x40, %i3
2249*1e49577aSRod Evans	neg	%i3			! bytes till dst 64 bytes aligned
2250*1e49577aSRod Evans	sub	%i2, %i3, %i2		! update i2 with new count
2251*1e49577aSRod Evans
2252*1e49577aSRod Evans	! Based on source and destination alignment do
2253*1e49577aSRod Evans	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
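	! A sketch of that selection in C (illustrative only, not part of
	! the build):
	!
	!	#include <stdint.h>
	!	#include <stddef.h>
	!
	!	static size_t
	!	head_unit(uintptr_t dst, uintptr_t src)
	!	{
	!		uintptr_t both = dst | src;
	!
	!		if ((both & 0x7) == 0)
	!			return (8);	/* .alewdcp */
	!		if ((both & 0x3) == 0)
	!			return (4);	/* .alwdcp */
	!		if ((both & 0x1) == 0)
	!			return (2);	/* .alhlfwdcp */
	!		return (1);		/* byte loop below */
	!	}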
2254*1e49577aSRod Evans
2255*1e49577aSRod Evans	! Is dst & src 8B aligned
2256*1e49577aSRod Evans	or	%i0, %i1, %o2
2257*1e49577aSRod Evans	andcc	%o2, 0x7, %g0
2258*1e49577aSRod Evans	bz	%ncc, .alewdcp
2259*1e49577aSRod Evans	nop
2260*1e49577aSRod Evans
2261*1e49577aSRod Evans	! Is dst & src 4B aligned
2262*1e49577aSRod Evans	andcc	%o2, 0x3, %g0
2263*1e49577aSRod Evans	bz	%ncc, .alwdcp
2264*1e49577aSRod Evans	nop
2265*1e49577aSRod Evans
2266*1e49577aSRod Evans	! Is dst & src 2B aligned
2267*1e49577aSRod Evans	andcc	%o2, 0x1, %g0
2268*1e49577aSRod Evans	bz	%ncc, .alhlfwdcp
2269*1e49577aSRod Evans	nop
2270*1e49577aSRod Evans
2271*1e49577aSRod Evans	! 1B aligned
2272*1e49577aSRod Evans1:	ldub	[%i1], %o2
2273*1e49577aSRod Evans	stb	%o2, [%i0]
2274*1e49577aSRod Evans	inc	%i1
2275*1e49577aSRod Evans	deccc	%i3
2276*1e49577aSRod Evans	bgu,pt	%ncc, 1b
2277*1e49577aSRod Evans	inc	%i0
2278*1e49577aSRod Evans
2279*1e49577aSRod Evans	ba	.chksrc
2280*1e49577aSRod Evans	nop
2281*1e49577aSRod Evans
2282*1e49577aSRod Evans	! dst & src 4B aligned
2283*1e49577aSRod Evans.alwdcp:
2284*1e49577aSRod Evans	ld	[%i1], %o2
2285*1e49577aSRod Evans	st	%o2, [%i0]
2286*1e49577aSRod Evans	add	%i1, 0x4, %i1
2287*1e49577aSRod Evans	subcc	%i3, 0x4, %i3
2288*1e49577aSRod Evans	bgu,pt	%ncc, .alwdcp
2289*1e49577aSRod Evans	add	%i0, 0x4, %i0
2290*1e49577aSRod Evans
2291*1e49577aSRod Evans	ba	.chksrc
2292*1e49577aSRod Evans	nop
2293*1e49577aSRod Evans
2294*1e49577aSRod Evans	! dst & src 2B aligned
2295*1e49577aSRod Evans.alhlfwdcp:
2296*1e49577aSRod Evans	lduh	[%i1], %o2
2297*1e49577aSRod Evans	stuh	%o2, [%i0]
2298*1e49577aSRod Evans	add	%i1, 0x2, %i1
2299*1e49577aSRod Evans	subcc	%i3, 0x2, %i3
2300*1e49577aSRod Evans	bgu,pt	%ncc, .alhlfwdcp
2301*1e49577aSRod Evans	add	%i0, 0x2, %i0
2302*1e49577aSRod Evans
2303*1e49577aSRod Evans	ba	.chksrc
2304*1e49577aSRod Evans	nop
2305*1e49577aSRod Evans
2306*1e49577aSRod Evans	! dst & src 8B aligned
2307*1e49577aSRod Evans.alewdcp:
2308*1e49577aSRod Evans	ldx	[%i1], %o2
2309*1e49577aSRod Evans	stx	%o2, [%i0]
2310*1e49577aSRod Evans	add	%i1, 0x8, %i1
2311*1e49577aSRod Evans	subcc	%i3, 0x8, %i3
2312*1e49577aSRod Evans	bgu,pt	%ncc, .alewdcp
2313*1e49577aSRod Evans	add	%i0, 0x8, %i0
2314*1e49577aSRod Evans
2315*1e49577aSRod Evans	! Now Destination is block (64 bytes) aligned
2316*1e49577aSRod Evans.chksrc:
2317*1e49577aSRod Evans	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
2318*1e49577aSRod Evans	sub	%i2, %i3, %i2		! Residue bytes in %i2
2319*1e49577aSRod Evans	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2320*1e49577aSRod Evans	andcc	%i1, 0xf, %l1		! is src quadword aligned
2321*1e49577aSRod Evans	bz,pn	%ncc, .blkcpy		! src offset in %l1
2322*1e49577aSRod Evans	nop
2323*1e49577aSRod Evans	cmp	%l1, 0x8
2324*1e49577aSRod Evans	bgu	%ncc, .cpy_upper_double
2325*1e49577aSRod Evans	nop
2326*1e49577aSRod Evans	blu	%ncc, .cpy_lower_double
2327*1e49577aSRod Evans	nop
2328*1e49577aSRod Evans
2329*1e49577aSRod Evans	! Falls through when source offset is equal to 8 i.e.
2330*1e49577aSRod Evans	! source is double word aligned.
2331*1e49577aSRod Evans	! In this case no shift/merge of data is required
2332*1e49577aSRod Evans	sub	%i1, %l1, %i1		! align the src at 16 bytes.
2333*1e49577aSRod Evans	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
2334*1e49577aSRod Evans	prefetch [%o0+0x0], #one_read
2335*1e49577aSRod Evans	ldda	[%i1+0x0]%asi, %o2
2336*1e49577aSRod Evansloop0:
2337*1e49577aSRod Evans	ldda	[%i1+0x10]%asi, %o4
2338*1e49577aSRod Evans	prefetch [%o0+0x40], #one_read
2339*1e49577aSRod Evans
2340*1e49577aSRod Evans	stxa	%o3, [%i0+0x0]%asi
2341*1e49577aSRod Evans	stxa	%o4, [%i0+0x8]%asi
2342*1e49577aSRod Evans
2343*1e49577aSRod Evans	ldda	[%i1+0x20]%asi, %o2
2344*1e49577aSRod Evans	stxa	%o5, [%i0+0x10]%asi
2345*1e49577aSRod Evans	stxa	%o2, [%i0+0x18]%asi
2346*1e49577aSRod Evans
2347*1e49577aSRod Evans	ldda	[%i1+0x30]%asi, %o4
2348*1e49577aSRod Evans	stxa	%o3, [%i0+0x20]%asi
2349*1e49577aSRod Evans	stxa	%o4, [%i0+0x28]%asi
2350*1e49577aSRod Evans
2351*1e49577aSRod Evans	ldda	[%i1+0x40]%asi, %o2
2352*1e49577aSRod Evans	stxa	%o5, [%i0+0x30]%asi
2353*1e49577aSRod Evans	stxa	%o2, [%i0+0x38]%asi
2354*1e49577aSRod Evans
2355*1e49577aSRod Evans	add	%o0, 0x40, %o0
2356*1e49577aSRod Evans	add	%i1, 0x40, %i1
2357*1e49577aSRod Evans	subcc	%i3, 0x40, %i3
2358*1e49577aSRod Evans	bgu,pt	%ncc, loop0
2359*1e49577aSRod Evans	add	%i0, 0x40, %i0
2360*1e49577aSRod Evans	ba	.blkdone
2361*1e49577aSRod Evans	add	%i1, %l1, %i1		! increment the source by src offset
2362*1e49577aSRod Evans
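/*
 * Both .cpy_lower_double and .cpy_upper_double below realign the
 * 16-byte ldda data with the ALIGN_DATA macro (defined elsewhere);
 * conceptually it is the same shift-and-or merge of adjacent
 * doublewords as the faligndata model sketched earlier.  The shift
 * counts are derived from the source offset within its quadword; a
 * small sketch (names illustrative):
 *
 *	#include <stdint.h>
 *
 *	static void
 *	align_shifts(uintptr_t src, unsigned *lshift, unsigned *rshift)
 *	{
 *		unsigned off = src & 0xf;			// %l1
 *
 *		*lshift = (off < 8 ? off : off - 8) * 8;	// %l2
 *		*rshift = 64 - *lshift;				// %l3
 *	}
 */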
2363*1e49577aSRod Evans.cpy_lower_double:
2364*1e49577aSRod Evans	sub	%i1, %l1, %i1		! align the src at 16 bytes.
2365*1e49577aSRod Evans	sll	%l1, 3, %l2		! %l2 left shift
2366*1e49577aSRod Evans	mov	0x40, %l3
2367*1e49577aSRod Evans	sub	%l3, %l2, %l3		! %l3 right shift = (64 - left shift)
2368*1e49577aSRod Evans	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
2369*1e49577aSRod Evans	prefetch [%o0+0x0], #one_read
2370*1e49577aSRod Evans	ldda	[%i1+0x0]%asi, %o2	! %o2 has partial data for this
2371*1e49577aSRod Evans					! read; %o3 has complete data
2372*1e49577aSRod Evansloop1:
2373*1e49577aSRod Evans	ldda	[%i1+0x10]%asi, %o4	! %o4 has partial data for this read.
2374*1e49577aSRod Evans	ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)	! merge %o2, %o3 and %o4
2375*1e49577aSRod Evans							! into %o2 and %o3
2376*1e49577aSRod Evans	prefetch [%o0+0x40], #one_read
2377*1e49577aSRod Evans	stxa	%o2, [%i0+0x0]%asi
2378*1e49577aSRod Evans	stxa	%o3, [%i0+0x8]%asi
2379*1e49577aSRod Evans
2380*1e49577aSRod Evans	ldda	[%i1+0x20]%asi, %o2
2381*1e49577aSRod Evans	ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)	! merge %o2 with %o5 and
2382*1e49577aSRod Evans	stxa	%o4, [%i0+0x10]%asi			! %o4 from previous read
2383*1e49577aSRod Evans	stxa	%o5, [%i0+0x18]%asi			! into %o4 and %o5
2384*1e49577aSRod Evans
2385*1e49577aSRod Evans	! Repeat the same for next 32 bytes.
2386*1e49577aSRod Evans
2387*1e49577aSRod Evans	ldda	[%i1+0x30]%asi, %o4
2388*1e49577aSRod Evans	ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)
2389*1e49577aSRod Evans	stxa	%o2, [%i0+0x20]%asi
2390*1e49577aSRod Evans	stxa	%o3, [%i0+0x28]%asi
2391*1e49577aSRod Evans
2392*1e49577aSRod Evans	ldda	[%i1+0x40]%asi, %o2
2393*1e49577aSRod Evans	ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)
2394*1e49577aSRod Evans	stxa	%o4, [%i0+0x30]%asi
2395*1e49577aSRod Evans	stxa	%o5, [%i0+0x38]%asi
2396*1e49577aSRod Evans
2397*1e49577aSRod Evans	add	%o0, 0x40, %o0
2398*1e49577aSRod Evans	add	%i1, 0x40, %i1
2399*1e49577aSRod Evans	subcc	%i3, 0x40, %i3
2400*1e49577aSRod Evans	bgu,pt	%ncc, loop1
2401*1e49577aSRod Evans	add	%i0, 0x40, %i0
2402*1e49577aSRod Evans	ba	.blkdone
2403*1e49577aSRod Evans	add	%i1, %l1, %i1		! increment the source by src offset
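/*
 * loop1 above keeps a 16-byte window of unaligned source data in
 * registers and uses ALIGN_DATA to shift and merge neighbouring 8-byte
 * chunks before each store.  A hedged C sketch of a single merge step,
 * where lshift is (src_offset * 8) for .cpy_lower_double (and
 * (src_offset - 8) * 8 for loop2 in .cpy_upper_double), so both shifts
 * fall in the range 8..56 (the function name is invented):
 *
 *	#include <stdint.h>
 *
 *	static uint64_t
 *	merge_dword(uint64_t cur, uint64_t next, unsigned lshift)
 *	{
 *		// big-endian: the valid tail of 'cur' moves to the top of
 *		// the result and the head of 'next' fills the remainder
 *		return ((cur << lshift) | (next >> (64 - lshift)));
 *	}
 */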
2404*1e49577aSRod Evans
2405*1e49577aSRod Evans.cpy_upper_double:
2406*1e49577aSRod Evans	sub	%i1, %l1, %i1		! align the src at 16 bytes.
2407*1e49577aSRod Evans	mov	0x8, %l2
2408*1e49577aSRod Evans	sub	%l1, %l2, %l2
2409*1e49577aSRod Evans	sll	%l2, 3, %l2		! %l2 left shift
2410*1e49577aSRod Evans	mov	0x40, %l3
2411*1e49577aSRod Evans	sub	%l3, %l2, %l3		! %l3 right shift = (64 - left shift)
2412*1e49577aSRod Evans	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
2413*1e49577aSRod Evans	prefetch [%o0+0x0], #one_read
2414*1e49577aSRod Evans	ldda	[%i1+0x0]%asi, %o2	! partial data in %o3 for this read and
2415*1e49577aSRod Evans					! no data in %o2
2416*1e49577aSRod Evansloop2:
2417*1e49577aSRod Evans	ldda	[%i1+0x10]%asi, %o4	! %o4 has complete data and %o5 has
2418*1e49577aSRod Evans					! partial
2419*1e49577aSRod Evans	ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)	! merge %o3, %o4 and %o5
2420*1e49577aSRod Evans							! into %o3 and %o4
2421*1e49577aSRod Evans	prefetch [%o0+0x40], #one_read
2422*1e49577aSRod Evans	stxa	%o3, [%i0+0x0]%asi
2423*1e49577aSRod Evans	stxa	%o4, [%i0+0x8]%asi
2424*1e49577aSRod Evans
2425*1e49577aSRod Evans	ldda	[%i1+0x20]%asi, %o2
2426*1e49577aSRod Evans	ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)	! merge %o2 and %o3 with
2427*1e49577aSRod Evans	stxa	%o5, [%i0+0x10]%asi			! %o5 from previous read
2428*1e49577aSRod Evans	stxa	%o2, [%i0+0x18]%asi			! into %o5 and %o2
2429*1e49577aSRod Evans
2430*1e49577aSRod Evans	! Repeat the same for next 32 bytes.
2431*1e49577aSRod Evans
2432*1e49577aSRod Evans	ldda	[%i1+0x30]%asi, %o4
2433*1e49577aSRod Evans	ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)
2434*1e49577aSRod Evans	stxa	%o3, [%i0+0x20]%asi
2435*1e49577aSRod Evans	stxa	%o4, [%i0+0x28]%asi
2436*1e49577aSRod Evans
2437*1e49577aSRod Evans	ldda	[%i1+0x40]%asi, %o2
2438*1e49577aSRod Evans	ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)
2439*1e49577aSRod Evans	stxa	%o5, [%i0+0x30]%asi
2440*1e49577aSRod Evans	stxa	%o2, [%i0+0x38]%asi
2441*1e49577aSRod Evans
2442*1e49577aSRod Evans	add	%o0, 0x40, %o0
2443*1e49577aSRod Evans	add	%i1, 0x40, %i1
2444*1e49577aSRod Evans	subcc	%i3, 0x40, %i3
2445*1e49577aSRod Evans	bgu,pt	%ncc, loop2
2446*1e49577aSRod Evans	add	%i0, 0x40, %i0
2447*1e49577aSRod Evans	ba	.blkdone
2448*1e49577aSRod Evans	add	%i1, %l1, %i1		! increment the source by src offset
2449*1e49577aSRod Evans
2450*1e49577aSRod Evans	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2451*1e49577aSRod Evans.blkcpy:
2452*1e49577aSRod Evans	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
2453*1e49577aSRod Evans	prefetch [%o0+0x0], #one_read
2454*1e49577aSRod Evans1:
2455*1e49577aSRod Evans	prefetch [%o0+0x40], #one_read
2456*1e49577aSRod Evans
2457*1e49577aSRod Evans	ldda	[%i1+0x0]%asi, %o2
2458*1e49577aSRod Evans	ldda	[%i1+0x10]%asi, %o4
2459*1e49577aSRod Evans
2460*1e49577aSRod Evans	stxa	%o2, [%i0+0x0]%asi
2461*1e49577aSRod Evans	stxa	%o3, [%i0+0x8]%asi
2462*1e49577aSRod Evans	stxa	%o4, [%i0+0x10]%asi
2463*1e49577aSRod Evans	stxa	%o5, [%i0+0x18]%asi
2464*1e49577aSRod Evans
2465*1e49577aSRod Evans	ldda	[%i1+0x20]%asi, %o2
2466*1e49577aSRod Evans	ldda	[%i1+0x30]%asi, %o4
2467*1e49577aSRod Evans
2468*1e49577aSRod Evans	stxa	%o2, [%i0+0x20]%asi
2469*1e49577aSRod Evans	stxa	%o3, [%i0+0x28]%asi
2470*1e49577aSRod Evans	stxa	%o4, [%i0+0x30]%asi
2471*1e49577aSRod Evans	stxa	%o5, [%i0+0x38]%asi
2472*1e49577aSRod Evans
2473*1e49577aSRod Evans	add	%o0, 0x40, %o0
2474*1e49577aSRod Evans	add	%i1, 0x40, %i1
2475*1e49577aSRod Evans	subcc	%i3, 0x40, %i3
2476*1e49577aSRod Evans	bgu,pt	%ncc, 1b
2477*1e49577aSRod Evans	add	%i0, 0x40, %i0
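/*
 * With source and destination both quad-word aligned, the loop above
 * moves one 64-byte block per iteration: four 16-byte ldda loads through
 * ASI_BLK_INIT_ST_QUAD_LDD_P into register pairs, written back as eight
 * 8-byte stxa stores.  A hedged C sketch of the data movement only (the
 * ASI-specific store behaviour and the prefetches have no direct C
 * equivalent; the function name is invented):
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	static void
 *	copy_blocks(uint64_t *dst, const uint64_t *src, size_t nblocks)
 *	{
 *		while (nblocks-- != 0) {
 *			for (int i = 0; i < 8; i++)	// 8 x 8 bytes
 *				dst[i] = src[i];
 *			dst += 8;
 *			src += 8;
 *		}
 *	}
 */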
2478*1e49577aSRod Evans
2479*1e49577aSRod Evans.blkdone:
2480*1e49577aSRod Evans	membar	#Sync
2481*1e49577aSRod Evans
2482*1e49577aSRod Evans	mov	ASI_PNF, %asi		! restore %asi to default
2483*1e49577aSRod Evans					! ASI_PRIMARY_NOFAULT value
2484*1e49577aSRod Evans	tst	%i2
2485*1e49577aSRod Evans	bz,pt	%ncc, .blkexit
2486*1e49577aSRod Evans	nop
2487*1e49577aSRod Evans
2488*1e49577aSRod Evans	! Handle trailing bytes
2489*1e49577aSRod Evans	cmp	%i2, 0x8
2490*1e49577aSRod Evans	blu,pt	%ncc, .residue
2491*1e49577aSRod Evans	nop
2492*1e49577aSRod Evans
2493*1e49577aSRod Evans	! Can we do some 8B ops
2494*1e49577aSRod Evans	or	%i1, %i0, %o2
2495*1e49577aSRod Evans	andcc	%o2, 0x7, %g0
2496*1e49577aSRod Evans	bnz	%ncc, .last4
2497*1e49577aSRod Evans	nop
2498*1e49577aSRod Evans
2499*1e49577aSRod Evans	! Do 8-byte ops as long as possible
2500*1e49577aSRod Evans.last8:
2501*1e49577aSRod Evans	ldx	[%i1], %o2
2502*1e49577aSRod Evans	stx	%o2, [%i0]
2503*1e49577aSRod Evans	add	%i1, 0x8, %i1
2504*1e49577aSRod Evans	sub	%i2, 0x8, %i2
2505*1e49577aSRod Evans	cmp	%i2, 0x8
2506*1e49577aSRod Evans	bgu,pt	%ncc, .last8
2507*1e49577aSRod Evans	add	%i0, 0x8, %i0
2508*1e49577aSRod Evans
2509*1e49577aSRod Evans	tst	%i2
2510*1e49577aSRod Evans	bz,pt	%ncc, .blkexit
2511*1e49577aSRod Evans	nop
2512*1e49577aSRod Evans
2513*1e49577aSRod Evans	ba	.residue
2514*1e49577aSRod Evans	nop
2515*1e49577aSRod Evans
2516*1e49577aSRod Evans.last4:
2517*1e49577aSRod Evans	! Can we do 4B ops
2518*1e49577aSRod Evans	andcc	%o2, 0x3, %g0
2519*1e49577aSRod Evans	bnz	%ncc, .last2
2520*1e49577aSRod Evans	nop
2521*1e49577aSRod Evans1:
2522*1e49577aSRod Evans	ld	[%i1], %o2
2523*1e49577aSRod Evans	st	%o2, [%i0]
2524*1e49577aSRod Evans	add	%i1, 0x4, %i1
2525*1e49577aSRod Evans	sub	%i2, 0x4, %i2
2526*1e49577aSRod Evans	cmp	%i2, 0x4
2527*1e49577aSRod Evans	bgu,pt	%ncc, 1b
2528*1e49577aSRod Evans	add	%i0, 0x4, %i0
2529*1e49577aSRod Evans
2530*1e49577aSRod Evans	cmp	%i2, 0
2531*1e49577aSRod Evans	bz,pt	%ncc, .blkexit
2532*1e49577aSRod Evans	nop
2533*1e49577aSRod Evans
2534*1e49577aSRod Evans	ba	.residue
2535*1e49577aSRod Evans	nop
2536*1e49577aSRod Evans
2537*1e49577aSRod Evans.last2:
2538*1e49577aSRod Evans	! Can we do 2B ops
2539*1e49577aSRod Evans	andcc	%o2, 0x1, %g0
2540*1e49577aSRod Evans	bnz	%ncc, .residue
2541*1e49577aSRod Evans	nop
2542*1e49577aSRod Evans
2543*1e49577aSRod Evans1:
2544*1e49577aSRod Evans	lduh	[%i1], %o2
2545*1e49577aSRod Evans	stuh	%o2, [%i0]
2546*1e49577aSRod Evans	add	%i1, 0x2, %i1
2547*1e49577aSRod Evans	sub	%i2, 0x2, %i2
2548*1e49577aSRod Evans	cmp	%i2, 0x2
2549*1e49577aSRod Evans	bgu,pt	%ncc, 1b
2550*1e49577aSRod Evans	add	%i0, 0x2, %i0
2551*1e49577aSRod Evans
2552*1e49577aSRod Evans	cmp	%i2, 0
2553*1e49577aSRod Evans	bz,pt	%ncc, .blkexit
2554*1e49577aSRod Evans	nop
2555*1e49577aSRod Evans
2556*1e49577aSRod Evans.residue:
2557*1e49577aSRod Evans	ldub	[%i1], %o2
2558*1e49577aSRod Evans	stb	%o2, [%i0]
2559*1e49577aSRod Evans	inc	%i1
2560*1e49577aSRod Evans	deccc	%i2
2561*1e49577aSRod Evans	bgu,pt	%ncc, .residue
2562*1e49577aSRod Evans	inc	%i0
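/*
 * The trailing-byte code above (.last8, .last4, .last2, .residue) drains
 * whatever the block loop left over, using the widest chunk that both
 * pointers still allow and finishing one byte at a time.  A hedged C
 * sketch of the idea, with simplified loop bounds (names invented):
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	static void
 *	copy_tail(uint8_t *dst, const uint8_t *src, size_t resid)
 *	{
 *		uintptr_t both = (uintptr_t)dst | (uintptr_t)src;
 *
 *		if ((both & 0x7) == 0) {
 *			for (; resid >= 8; resid -= 8, dst += 8, src += 8)
 *				*(uint64_t *)dst = *(const uint64_t *)src;
 *		} else if ((both & 0x3) == 0) {
 *			for (; resid >= 4; resid -= 4, dst += 4, src += 4)
 *				*(uint32_t *)dst = *(const uint32_t *)src;
 *		} else if ((both & 0x1) == 0) {
 *			for (; resid >= 2; resid -= 2, dst += 2, src += 2)
 *				*(uint16_t *)dst = *(const uint16_t *)src;
 *		}
 *		while (resid-- != 0)		// .residue byte loop
 *			*dst++ = *src++;
 *	}
 */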
2563*1e49577aSRod Evans
2564*1e49577aSRod Evans.blkexit:
2565*1e49577aSRod Evans
2566*1e49577aSRod Evans	ret
2567*1e49577aSRod Evans	restore	%g5, %g0, %o0
2568*1e49577aSRod Evans
2569*1e49577aSRod Evans#endif	/* NIAGARA2_IMPL */
2570*1e49577aSRod Evans	SET_SIZE(memcpy)
2571*1e49577aSRod Evans	SET_SIZE(__align_cpy_1)
2572