/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"memcpy.s"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copying len bytes.
 * Note: this C code does not work for overlapped copies;
 *       memmove() and bcopy() do.
 *
 * The added entry __align_cpy_1 is generally for use by the compilers.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *		    char *s1 = s;
 *		    const char *s2 = s0;
 *		    do {
 *			*s1++ = *s2++;
 *		    } while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 *
 * N1 Flow :
 *
 * if (count < 17) {
 *	Do the byte copy
 *	Return destination address
 * }
 * if (count < 128) {
 *	Is the source aligned on a word boundary?
 *	If not, align the source on a word boundary, then goto .ald
 *	If so, goto .ald
 *	.ald:
 *		Is the destination aligned on a word boundary?
 *		Depending on the destination offset (last 2 bits of destination)
 *		copy the data by shifting and merging.
 *		Copy the residue bytes as a byte copy
 *		Return destination address
 * } else {
 *	Align destination on block boundary
 *	Depending on the source offset (last 4 bits of source address) align
 *	the data and store to destination. Both the load and store are done
 *	using ASI_BLK_INIT_ST_QUAD_LDD_P.
 *	For the remaining count copy as much data in 8-byte chunks from source
 *	to destination.
 *	Followed by a trailing copy using byte copy.
 *	Return saved destination address
 * }
 *
 *
 * N2 Flow :
 *
 * if (count < 128) {
 *   if count < 3
 *	copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *     copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *     copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= 14
 *     copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *     copy words; branch to finish_up
 *   if dst now on half word boundary
 *     load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *     load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *     load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *     copy bytes; exit with dst addr
 * } else {                                         More than 128 bytes
 *   move bytes until dst is on long word boundary
 *   if( src is on long word boundary ) {
 *     if (count < 512) {
 * finish_long:				           src/dst aligned on 8 bytes
 *       copy with ldx/stx in 8-way unrolled loop;
 *       copy final 0-63 bytes; exit with dst addr
 *     } else {                                 src/dst aligned; count >= 512
 *       align dst on 64 byte boundary; use 8-way test for each of 8 possible
 *       src alignments relative to a 64 byte boundary to select the
 *       16-way unrolled loop to use for
 *       block load, fmovd, block-init-store, block-store, fmovd operations
 *       then go to finish_long.
 *     }
 *   } else {                                   src/dst not aligned on 8 bytes
 *     if src is word aligned and count < 512
 *       move words in 8-way unrolled loop
 *       move final 0-31 bytes; exit with dst addr
 *     if count < 512
 *       use alignaddr/faligndata combined with ldd/std in 8-way
 *       unrolled loop to move data.
 *       go to unalign_done
 *     else
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; use 8-way test for each of 8 possible
 *       src alignments to nearest long word relative to 64 byte boundary to
 *       select the 8-way unrolled loop to use for
 *       block load, falign, fmovd, block-init-store, block-store loop
 *	 (only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *       move remaining bytes for unaligned cases. exit with dst addr.
 * }
 *
 * Comment on N2 memmove and memcpy common code and block-store-init:
 *   The man page for memmove specifies that copying will take place
 *   correctly between objects that overlap.  For memcpy, behavior is
 *   undefined for objects that overlap.
 *
 *   In rare cases, some multi-threaded applications may attempt to examine
 *   the copy destination buffer during the copy. Using the block-store-init
 *   instruction allows those applications to observe zeros in some
 *   cache lines of the destination buffer for narrow windows. But
 *   block-store-init provides memory throughput advantages for many
 *   common applications. To meet both needs, those applications which need
 *   the destination buffer to retain meaning during the copy should use
 *   memmove instead of memcpy.  The memmove version duplicates the memcpy
 *   algorithms except that the memmove version does not use block-store-init
 *   in those cases where memcpy does use block-store-init. Otherwise, when
 *   memmove can determine the source and destination do not overlap,
 *   memmove shares the memcpy code.
 */
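
/*
 * Illustrative C skeleton of the N2 dispatch described above (a sketch
 * only; small_copy, aligned_copy and unaligned_copy are hypothetical
 * helpers standing in for the labeled paths .mv_short/.smallword,
 * .medlong/.large_align8_copy and .unalignsetup below, each returning
 * the saved destination address):
 *
 *	void *
 *	memcpy(void *dst, const void *src, size_t n)
 *	{
 *		char *d = dst;
 *		const char *s = src;
 *
 *		if (n < SMALL_MAX)
 *			return (small_copy(d, s, n));
 *		while ((uintptr_t)d & 7) {
 *			*d++ = *s++;
 *			n--;
 *		}
 *		if (((uintptr_t)s & 7) == 0)
 *			return (aligned_copy(d, s, n));
 *		return (unaligned_copy(d, s, n));
 *	}
 */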

#include <sys/asm_linkage.h>
#include <sys/niagaraasi.h>
#include <sys/asi.h>
#include <sys/trap.h>

/* documented name for primary block initializing store */
#define	ASI_STBI_P	ASI_BLK_INIT_ST_QUAD_LDD_P

#define	BLOCK_SIZE	64
#define	FPRS_FEF	0x4	/* fprs.fef (enable floating-point) bit */

#define	SHORTCOPY	3
#define	SHORTCHECK	14
#define	SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 32 */
#define	SMALL_MAX	128
#define	MED_UMAX	512	/* max copy for medium un-aligned case */
#define	MED_WMAX	512	/* max copy for medium word-aligned case */
#define	MED_MAX		512	/* max copy for medium longword-aligned case */

#ifdef NIAGARA2_IMPL
#include <sys/sun4asi.h>

#else	/* NIAGARA2_IMPL */
/*
 * This define is used to align data for the unaligned source cases.
 * data1, data2 and data3 are merged into data1 and data2.
 * data3 is preserved for the next merge.
 */
#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1				;\
	srlx	data2, rshift, tmp				;\
	or	data1, tmp, data1				;\
	sllx	data2, lshift, data2				;\
	srlx	data3, rshift, tmp				;\
	or	data2, tmp, data2
/*
 * Align the data. Merge data1 and data2 into data1.
 */
#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1				;\
	srlx	data2, rshift, tmp				;\
	or	data1, tmp, data1
#endif	/* NIAGARA2_IMPL */
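
/*
 * Equivalent C for the merges the macros above perform (a sketch; d1..d3
 * are successive 64-bit words from the unaligned source and
 * lshift + rshift == 64):
 *
 *	d1 = (d1 << lshift) | (d2 >> rshift);
 *	d2 = (d2 << lshift) | (d3 >> rshift);
 *
 * ALIGN_DATA_EW performs only the first of the two merges.
 */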


	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

	ENTRY(memmove)
	cmp	%o1, %o0	! compare from and to addresses
	bgeu,pn	%ncc, .forcpy	! if from >= to, use forward copy
	sub	%o0, %o1, %o4	! get difference of two addresses
	cmp	%o2, %o4	! compare size and difference of addresses
	bleu,pn	%ncc, .forcpy	! if size <= difference, no overlap: forward
	add	%o1, %o2, %o5	! get to end of source space

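	!
	! In C terms the test above is (a sketch; the compares are unsigned):
	!
	!	if (src >= dst || (uintptr_t)dst - (uintptr_t)src >= len)
	!		copy forward (.forcpy);
	!	else
	!		copy backward, starting from src + len;
	!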
	!
	! an overlapped copy that must be done "backwards"
	!
.chksize:
	cmp	%o2, 8			! if size is less than 8
	blu,pt	%ncc, 2f		! do a byte copy

	! Now size is bigger than 8
.dbalign:
	add	%o0, %o2, %g1		! get to end of dest space
	andcc	%g1, 7, %o3		! %o3 has bytes till dst 8 bytes aligned
	bz,a,pn	%ncc, .dbbck		! if dst is already 8 byte aligned, skip
	andn	%o2, 7, %o3		! %o3 count is multiple of 8 bytes size
	sub	%o2, %o3, %o2		! update o2 with new count

1:	dec	%o5			! decrement source
	ldub	[%o5], %g1		! load one byte
	deccc	%o3			! decrement count
	bgu,pt	%ncc, 1b		! if not done keep copying
	stb	%g1, [%o5+%o4]		! store one byte into dest
	andncc	%o2, 7, %o3		! %o3 count is multiple of 8 bytes size
	bz,pn	%ncc, 2f		! if size < 8, move to byte copy

	! Now destination is 8 byte aligned
.dbbck:
	andcc	%o5, 7, %o0		! %o0 has src offset
	bz,a,pn	%ncc, .dbcopybc		! if src is aligned do fast mem move
	sub	%o2, %o3, %o2		! residue bytes in %o2

.cpy_dbwdbc:				! alignment of src is needed
	sub	%o2, 8, %o2		! set size one loop ahead
	sll	%o0, 3, %g1		! %g1 is left shift
	mov	64, %g5			! init %g5 to be 64
	sub	%g5, %g1, %g5		! %g5 right shift = (64 - left shift)
	sub	%o5, %o0, %o5		! align the src at 8 bytes.
	add	%o4, %o0, %o4		! increase difference between src & dst
	ldx	[%o5], %o1		! load first 8 bytes
	srlx	%o1, %g5, %o1
1:	sub	%o5, 8, %o5		! subtract 8 from src
	ldx	[%o5], %o0		! load 8 byte
	sllx	%o0, %g1, %o3		! shift loaded 8 bytes left into tmp reg
	or	%o1, %o3, %o3		! align data
	stx	%o3, [%o5+%o4]		! store 8 byte
	subcc	%o2, 8, %o2		! subtract 8 byte from size
	bg,pt	%ncc, 1b		! if size > 0 continue
	srlx	%o0, %g5, %o1		! move extra byte for the next use

	srl	%g1, 3, %o0		! restore %o0 value for alignment
	add	%o5, %o0, %o5		! restore src alignment
	sub	%o4, %o0, %o4		! restore difference between src & dest

	ba	2f			! branch to the trailing byte copy
	add	%o2, 8, %o2		! restore size value

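	!
	! The .cpy_dbwdbc loop above is, in essence (a C sketch; s is the
	! 8-byte-aligned source cursor moving downward, d the matching
	! destination cursor, and off the original source misalignment in
	! bits):
	!
	!	hi = *s >> (64 - off);
	!	do {
	!		lo = *--s;
	!		*--d = (lo << off) | hi;
	!		hi = lo >> (64 - off);
	!	} while ((cnt -= 8) > 0);
	!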
.dbcopybc:				! alignment of src is not needed
1:	sub	%o5, 8, %o5		! subtract from src
	ldx	[%o5], %g1		! load 8 bytes
	subcc	%o3, 8, %o3		! subtract from size
	bgu,pt	%ncc, 1b		! if size is bigger than 0 continue
	stx	%g1, [%o5+%o4]		! store 8 bytes to destination

	ba	2f
	nop

.bcbyte:
1:	ldub	[%o5], %g1		! load one byte
	stb	%g1, [%o5+%o4]		! store one byte
2:	deccc	%o2			! decrement size
	bgeu,a,pt %ncc, 1b		! if size is >= 0 continue
	dec	%o5			! decrement from address

.exitbc:				! exit from backward copy
	retl
	add	%o5, %o4, %o0		! restore dest addr

#ifdef NIAGARA2_IMPL
	!
	! Check to see if memmove is large aligned copy
	! If so, use special version of copy that avoids
	! use of block store init
	!
.forcpy:
	cmp	%o2, SMALL_MAX		! check for not small case
	blt,pn	%ncc, .mv_short		! merge with memcpy
	mov	%o0, %g1		! save %o0
	neg	%o0, %o5
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	brz,pt	%o5, .mv_dst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		! %o1 gets the difference
7:					! dst aligning loop
	ldub	[%o1+%o0], %o4		! load one byte
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%ncc, 7b
	add	%o0, 1, %o0		! advance dst
	add	%o1, %o0, %o1		! restore %o1
.mv_dst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .src_dst_unaligned_on_8
	prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read

.mv_src_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	cmp	%o2, MED_MAX		! limit to store buffer size
	bleu,pt	%ncc, .medlong
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read

/*
 * The following memmove code mimics the memcpy code for large aligned
 * copies, but does not use the ASI_STBI_P (block initializing store)
 * performance optimization. See the memmove rationale in the block
 * comment at the top of this file.
 */
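/*
 * Shape of the .mv_align_* loops below (a sketch): the source sits some
 * number k of doublewords short of a 64-byte boundary, so each 64-byte
 * destination line is assembled from the tail of one block load and the
 * head of the next.  Conceptually, with k preloaded doublewords
 * pre[0..k-1]:
 *
 *	for (each 64-byte destination line) {
 *		blk[0..7] = 64-byte block load (ldda);
 *		store line = { pre[0..k-1], blk[0..7-k] };
 *		pre[0..k-1] = blk[8-k..7];	carried into the next line
 *	}
 *
 * The fmovd instructions implement the register rotation.
 */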
.mv_large_align8_copy:			! Src and dst share 8 byte alignment
	rd	%fprs, %g5		! check for unused fp
	! if fprs.fef == 0, set it.
	! Setting it when already set costs more than checking
	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 1f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1:
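	! %g5 now records whether fprs.fef was already set; the exit path
	! at .remain_stuff uses it to decide whether to disable fp again.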
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .mv_aligned_on_64
	sub	%o3, 64, %o3		! %o3 has negative bytes to move
	add	%o2, %o3, %o2		! adjust remaining count
.mv_align_to_64:
	ldx	[%o1], %o4
	add	%o1, 8, %o1		! increment src ptr
	addcc	%o3, 8, %o3
	stx	%o4, [%o0]
	brnz,pt	%o3, .mv_align_to_64
	add	%o0, 8, %o0		! increment dst ptr

.mv_aligned_on_64:
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
	mov	%asi,%o4		! save %asi
	! Determine source alignment
	! to correct 8 byte offset
	andcc	%o1, 0x20, %o3
	brnz,pn	%o3, .mv_align_1
	mov	ASI_BLK_P, %asi		! setup %asi for block load/store
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .mv_align_01
	nop
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .mv_align_000
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.mv_align_001
	nop
.mv_align_01:
	andcc	%o1, 0x08, %o3
	brnz,pn	%o3, .mv_align_011
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.mv_align_010
	nop
.mv_align_1:
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .mv_align_11
	nop
	andcc	%o1, 0x08, %o3
	brnz,pn	%o3, .mv_align_101
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.mv_align_100
	nop
.mv_align_11:
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .mv_align_110
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

.mv_align_111:
! Alignment off by 8 bytes
	ldd	[%o1], %d0
	add	%o1, 8, %o1
	sub	%o2, 8, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_111_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d0

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d30, %d0
	bgt,pt	%ncc, .mv_align_111_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	ba	.remain_stuff
	add	%o0, 8, %o0
	! END OF mv_align_111

.mv_align_110:
! Alignment off by 16 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	add	%o1, 16, %o1
	sub	%o2, 16, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_110_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d28, %d0
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d2

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d28, %d0
	fmovd	%d30, %d2
	bgt,pt	%ncc, .mv_align_110_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	ba	.remain_stuff
	add	%o0, 16, %o0
	! END OF mv_align_110

.mv_align_101:
! Alignment off by 24 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	add	%o1, 24, %o1
	sub	%o2, 24, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_101_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d4

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	fmovd	%d30, %d4
	bgt,pt	%ncc, .mv_align_101_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	ba	.remain_stuff
	add	%o0, 24, %o0
	! END OF mv_align_101

.mv_align_100:
! Alignment off by 32 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16],%d4
	ldd	[%o1+24],%d6
	add	%o1, 32, %o1
	sub	%o2, 32, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_100_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d6

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	fmovd	%d30, %d6
	bgt,pt	%ncc, .mv_align_100_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	ba	.remain_stuff
	add	%o0, 32, %o0
	! END OF mv_align_100

.mv_align_011:
! Alignment off by 40 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	add	%o1, 40, %o1
	sub	%o2, 40, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_011_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d8

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	fmovd	%d30, %d8
	bgt,pt	%ncc, .mv_align_011_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	ba	.remain_stuff
	add	%o0, 40, %o0
	! END OF mv_align_011

.mv_align_010:
! Alignment off by 48 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	ldd	[%o1+40], %d10
	add	%o1, 48, %o1
	sub	%o2, 48, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_010_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d10

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	fmovd	%d30, %d10
	bgt,pt	%ncc, .mv_align_010_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	std	%d10, [%o0+40]
	ba	.remain_stuff
	add	%o0, 48, %o0
	! END OF mv_align_010

.mv_align_001:
! Alignment off by 56 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	ldd	[%o1+40], %d10
	ldd	[%o1+48], %d12
	add	%o1, 56, %o1
	sub	%o2, 56, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_001_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d12

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	fmovd	%d30, %d12
	bgt,pt	%ncc, .mv_align_001_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	std	%d10, [%o0+40]
	std	%d12, [%o0+48]
	ba	.remain_stuff
	add	%o0, 56, %o0
	! END OF mv_align_001

.mv_align_000:
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_000_loop:
	/* ---- copy line 1 of 2. ---- */
	subcc	%o5, 128, %o5
	ldda	[%o1]%asi,%d0
	stda	%d0,[%o0]%asi
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read

	/* ---- copy line 2 of 2. ---- */
	add	%o0, 64, %o0
	ldda	[%o1+64]%asi,%d0
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! increment dst
	bgt,pt	%ncc, .mv_align_000_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.remain_stuff
	nop

	! END OF mv_align_000
#else	/* NIAGARA2_IMPL */
#endif	/* NIAGARA2_IMPL */

	SET_SIZE(memmove)

	ENTRY(memcpy)
	ENTRY(__align_cpy_1)
#ifdef NIAGARA2_IMPL
	cmp	%o2, SMALL_MAX		! check for not small case
	bgeu,pn	%ncc, .medium		! go to larger cases
	mov	%o0, %g1		! save %o0
.mv_short:
	cmp	%o2, SHORTCOPY		! check for really short case
	ble,pt	%ncc, .smallfin
	or	%o0, %o1, %o4		! prepare alignment check
	andcc	%o4, 0x3, %o5		! test for alignment
	bz,pt	%ncc, .smallword	! branch to word aligned case
	cmp	%o2, SHORTCHECK
	ble,pt	%ncc, .smallrest
	andcc	%o1, 0x3, %o5		! is src word aligned
	bz,pn	%ncc, .aldst
	cmp	%o5, 2			! is src half-word aligned
	be,pt	%ncc, .s2algn
	cmp	%o5, 3			! src is byte aligned
.s1algn:ldub	[%o1], %o3		! move 1 or 3 bytes to align it
	inc	1, %o1
	stb	%o3, [%o0]		! move a byte to align src
	inc	1, %o0
	bne,pt	%ncc, .s2algn
	dec	%o2
	b	.ald			! now go align dest
	andcc	%o0, 0x3, %o5

.s2algn:lduh	[%o1], %o3		! know src is 2 byte aligned
	inc	2, %o1
	srl	%o3, 8, %o4
	stb	%o4, [%o0]		! have to do bytes,
	stb	%o3, [%o0 + 1]		! don't know dst alignment
	inc	2, %o0
	dec	2, %o2

.aldst:	andcc	%o0, 0x3, %o5		! align the destination address
.ald:	bz,pn	%ncc, .w4cp
	cmp	%o5, 2
	be,pn	%ncc, .w2cp
	cmp	%o5, 3
.w3cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 24, %o5
	stb	%o5, [%o0]
	bne,pt	%ncc, .w1cp
	inc	%o0
	dec	1, %o2
	andn	%o2, 3, %o3		! %o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %o0, %o1		! %o1 gets the difference

1:	sll	%o4, 8, %g5		! save residual bytes
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 24, %o5		! merge with residual
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%ncc, 1b
	inc	4, %o0
	sub	%o1, 3, %o1		! used one byte of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w1cp:	srl	%o4, 8, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	3, %o2
	andn	%o2, 3, %o3		! %o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %o0, %o1		! %o1 gets the difference

2:	sll	%o4, 24, %g5		! save residual bytes
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 8, %o5		! merge with residual
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%ncc, 2b
	inc	4, %o0
	sub	%o1, 1, %o1		! used three bytes of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w2cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 16, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	2, %o2
	andn	%o2, 3, %o3		! %o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %o0, %o1		! %o1 gets the difference

3:	sll	%o4, 16, %g5		! save residual bytes
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 16, %o5		! merge with residual
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%ncc, 3b
	inc	4, %o0
	sub	%o1, 2, %o1		! used two bytes of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w4cp:	andn	%o2, 3, %o3		! %o3 is aligned word count
	sub	%o1, %o0, %o1		! %o1 gets the difference

1:	lduw	[%o1+%o0], %o4		! read from address
	deccc	4, %o3			! decrement count
	st	%o4, [%o0]		! write at destination address
	bgu,pt	%ncc, 1b
	inc	4, %o0			! increment to address
	and	%o2, 3, %o2		! number of leftover bytes, if any

	! simple finish up byte copy, works with any alignment
7:
	add	%o1, %o0, %o1		! restore %o1
.smallrest:
	tst	%o2
	bz,pt	%ncc, .smallx
	cmp	%o2, 4
	blt,pt	%ncc, .smallleft3
	nop
	sub	%o2, 3, %o2
.smallnotalign4:
	ldub	[%o1], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o3, [%o0]		! write byte
	ldub	[%o1+1], %o3		! repeat for total of 4 bytes
	add	%o1, 4, %o1		! advance SRC by 4
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		! advance DST by 4
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%ncc, .smallnotalign4	! loop until 3 or fewer bytes remain
	stb	%o3, [%o0-1]
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallx
.smallleft3:				! 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2
	ldub	[%o1], %o3		! load one byte
	bz,pt	%ncc, .smallx
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%ncc, .smallx
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
.smallx:
	retl
	mov	%g1, %o0		! restore %o0

.smallfin:
	tst	%o2
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
.smallwords:
	lduw	[%o1], %o3		! read word
.smallwordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o0]		! write word
	add	%o1, 8, %o1		! update SRC
	lduw	[%o1-4], %o3		! read word
	add	%o0, 8, %o0		! update DST
	bgu,pt	%ncc, .smallwords	! loop until done
	stw	%o3, [%o0-4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .smallexit	! check for completion
	cmp	%o2, 4			! check for 4 or more bytes left
	blt	%ncc, .smallleft3	! if not, go to finish up
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	stw	%o3, [%o0-4]
	retl
	mov	%g1, %o0		! restore %o0

! 8 or more bytes, src and dest start on word boundary
! %o4 contains or %o0, %o1; %o3 contains first four bytes of src
.smalllong:
	andcc	%o4, 0x7, %o5		! test for long alignment
	bnz,pt	%ncc, .smallwordx	! branch to word aligned case
	cmp	%o2, SHORT_LONG-7
	bge,a	%ncc, .medl64		! if we branch
	sub	%o2, 56, %o2		! adjust %o2 to be 63 off count
	sub	%o1, %o0, %o1		! %o1 gets the difference
.small_long_l:
	ldx	[%o1+%o0], %o3
	subcc	%o2, 8, %o2
	add	%o0, 8, %o0
	bgu,pt	%ncc, .small_long_l	! loop until done
	stx	%o3, [%o0-8]		! write word
	add	%o1, %o0, %o1		! restore %o1
	addcc	%o2, 7, %o2		! restore %o2 to correct count
	bz,pt	%ncc, .smallexit	! check for completion
	cmp	%o2, 4			! check for 4 or more bytes left
	blt,pt	%ncc, .smallleft3	! if not, go to finish up
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
! src and dest start on word boundary
.smallword:
	subcc	%o2, 7, %o2		! adjust count
	bgu,pt	%ncc, .smalllong
	lduw	[%o1], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallexit
	stw	%o3, [%o0]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o1+4], %o3		! load one byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+4]		! store one byte
	ldub	[%o1+5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+5]		! store second byte
	ldub	[%o1+6], %o3		! load third byte
	stb	%o3, [%o0+6]		! store third byte
.smallexit:
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
.medium:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	brz,pt	%o5, .dst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		! %o1 gets the difference
7:					! dst aligning loop
	ldub	[%o1+%o0], %o4		! load one byte
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%ncc, 7b
	add	%o0, 1, %o0		! advance dst
	add	%o1, %o0, %o1		! restore %o1
.dst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .src_dst_unaligned_on_8
	prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read

.src_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	cmp	%o2, MED_MAX		! limit to store buffer size
	bgu,pt	%ncc, .large_align8_copy
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
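/*
 * Shape of .medl64 below as C (a sketch; the assembly interleaves the
 * eight load/store pairs and biases the count by -63 up front so the
 * loop can end on a simple signed test):
 *
 *	while (cnt >= 64) {
 *		((uint64_t *)dst)[0] = ((const uint64_t *)src)[0];
 *		...
 *		((uint64_t *)dst)[7] = ((const uint64_t *)src)[7];
 *		src += 64; dst += 64; cnt -= 64;
 *	}
 *
 * 32-, 16- and 8-byte steps and a byte tail then finish up.
 */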
.medlong:
	subcc	%o2, 63, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .medl63		! skip big loop if less than 64 bytes
.medl64:
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read ! into the l2 cache
	ldx	[%o1], %o4		! load
	subcc	%o2, 64, %o2		! decrement length count
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 64 bytes
	stx	%o3, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	ldx	[%o1+24], %o3
	stx	%o3, [%o0+24]
	ldx	[%o1+32], %o4		! load
	stx	%o4, [%o0+32]		! and store
	ldx	[%o1+40], %o3		! a block of 64 bytes
	add	%o1, 64, %o1		! increase src ptr by 64
	stx	%o3, [%o0+40]
	ldx	[%o1-16], %o4
	add	%o0, 64, %o0		! increase dst ptr by 64
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%ncc, .medl64		! repeat if at least 64 bytes left
	stx	%o3, [%o0-8]
.medl63:
	addcc	%o2, 32, %o2		! adjust remaining count
	ble,pt	%ncc, .medl31		! to skip if 31 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load
	sub	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 32 bytes
	add	%o1, 32, %o1		! increase src ptr by 32
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		! increase dst ptr by 32
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	stx	%o3, [%o0-8]
.medl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
	nop				!
	ldx	[%o1], %o4		! load and store 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stx	%o4, [%o0]		!
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1-8], %o3		!
	add	%o0, 16, %o0		! increase dst ptr by 16
	stx	%o3, [%o0-8]
.medl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .smallexit	! exit if finished
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	tst	%o2
	ldx	[%o1], %o4		! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	add	%o0, 8, %o0		! increase dst ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz,pt	%ncc, .medw7
	stx	%o4, [%o0-8]		! and store 8 bytes
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
.src_dst_unaligned_on_8:
	! DST is 8-byte aligned, src is not
2:
	andcc	%o1, 0x3, %o5		! test word alignment
	bnz,pt	%ncc, .unalignsetup	! branch to skip if not word aligned
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * the source and destination are in cache, for medium
 * to short data moves.
 */
	cmp	%o2, MED_WMAX		! limit to store buffer size
	bge,pt	%ncc, .unalignrejoin	! otherwise rejoin main loop
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read

	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medw31		! skip big loop if 31 or fewer bytes left
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
.medw32:
	ld	[%o1], %o4		! move a block of 32 bytes
	stw	%o4, [%o0]
	ld	[%o1+4], %o3
	stw	%o3, [%o0+4]
	ld	[%o1+8], %o4
	stw	%o4, [%o0+8]
	ld	[%o1+12], %o3
	stw	%o3, [%o0+12]
	ld	[%o1+16], %o4
	subcc	%o2, 32, %o2		! decrement length count
	stw	%o4, [%o0+16]
	ld	[%o1+20], %o3
	add	%o1, 32, %o1		! increase src ptr by 32
	stw	%o3, [%o0+20]
	ld	[%o1-8], %o4
	add	%o0, 32, %o0		! increase dst ptr by 32
	stw	%o4, [%o0-8]
	ld	[%o1-4], %o3
	bgu,pt	%ncc, .medw32		! repeat if at least 32 bytes left
	stw	%o3, [%o0-4]
.medw31:
	addcc	%o2, 31, %o2		! restore count

	bz,pt	%ncc, .smallexit	! exit if finished
	nop
	cmp	%o2, 16
	blt,pt	%ncc, .medw15
	nop
	ld	[%o1], %o4		! move a block of 16 bytes
	subcc	%o2, 16, %o2		! decrement length count
	stw	%o4, [%o0]
	ld	[%o1+4], %o3
	add	%o1, 16, %o1		! increase src ptr by 16
	stw	%o3, [%o0+4]
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		! increase dst ptr by 16
	stw	%o4, [%o0-8]
	ld	[%o1-4], %o3
	stw	%o3, [%o0-4]
.medw15:
	bz,pt	%ncc, .smallexit	! exit if finished
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	tst	%o2
	ld	[%o1], %o4		! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	ld	[%o1-4], %o3		! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	stw	%o3, [%o0-4]		! and store 4 bytes
	bz,pt	%ncc, .smallexit	! exit if finished
.medw7:					! count is ge 1, less than 8
	cmp	%o2, 4			! check for 4 bytes left
	blt,pt	%ncc, .smallleft3	! skip if 3 or fewer bytes left
	nop				!
	ld	[%o1], %o4		! load 4 bytes
	add	%o1, 4, %o1		! increase src ptr by 4
	add	%o0, 4, %o0		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.smallleft3
	stw	%o4, [%o0-4]		! and store 4 bytes
	retl
	mov	%g1, %o0		! restore %o0

	.align	16
.large_align8_copy:			! Src and dst share 8 byte alignment
	rd	%fprs, %g5		! check for unused fp
	! if fprs.fef == 0, set it.
	! Setting it when already set costs more than checking
	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 1f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1:
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .aligned_to_64
	andcc	%o0, 8, %o3		! odd long words to move?
	brz,pt	%o3, .aligned_to_16
	nop
	ldx	[%o1], %o4
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		! increment src ptr
	add	%o0, 8, %o0		! increment dst ptr
	stx	%o4, [%o0-8]
.aligned_to_16:
	andcc	%o0, 16, %o3		! pair of long words to move?
	brz,pt	%o3, .aligned_to_32
	nop
	ldx	[%o1], %o4
	sub	%o2, 16, %o2
	stx	%o4, [%o0]
	add	%o1, 16, %o1		! increment src ptr
	ldx	[%o1-8], %o4
	add	%o0, 16, %o0		! increment dst ptr
	stx	%o4, [%o0-8]
.aligned_to_32:
	andcc	%o0, 32, %o3		! four long words to move?
	brz,pt	%o3, .aligned_to_64
	nop
	ldx	[%o1], %o4
	sub	%o2, 32, %o2
	stx	%o4, [%o0]
	ldx	[%o1+8], %o4
	stx	%o4, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	add	%o1, 32, %o1		! increment src ptr
	ldx	[%o1-8], %o4
	add	%o0, 32, %o0		! increment dst ptr
	stx	%o4, [%o0-8]
.aligned_to_64:
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
	mov	%asi,%o4		! save %asi
	! Determine source alignment
	! to correct 8 byte offset
	andcc	%o1, 0x20, %o3
	brnz,pn	%o3, .align_1
	mov	ASI_BLK_P, %asi		! setup %asi for block load/store
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .align_01
	nop
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .align_000
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.align_001
	nop
.align_01:
	andcc	%o1, 0x08, %o3
	brnz,pn	%o3, .align_011
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.align_010
	nop
.align_1:
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .align_11
	nop
	andcc	%o1, 0x08, %o3
	brnz,pn	%o3, .align_101
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.align_100
	nop
.align_11:
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .align_110
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

.align_111:
! Alignment off by 8 bytes
	ldd	[%o1], %d0
	add	%o1, 8, %o1
	sub	%o2, 8, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_111_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d0

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d30, %d0
	bgt,pt	%ncc, .align_111_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	ba	.remain_stuff
	add	%o0, 8, %o0
	! END OF align_111

.align_110:
! Alignment off by 16 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	add	%o1, 16, %o1
	sub	%o2, 16, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_110_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d28, %d0
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d2

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d28, %d0
	fmovd	%d30, %d2
	bgt,pt	%ncc, .align_110_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	ba	.remain_stuff
	add	%o0, 16, %o0
	! END OF align_110

.align_101:
! Alignment off by 24 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	add	%o1, 24, %o1
	sub	%o2, 24, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_101_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d4

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	fmovd	%d30, %d4
	bgt,pt	%ncc, .align_101_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	ba	.remain_stuff
	add	%o0, 24, %o0
	! END OF align_101

.align_100:
! Alignment off by 32 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16],%d4
	ldd	[%o1+24],%d6
	add	%o1, 32, %o1
	sub	%o2, 32, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_100_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d6

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	fmovd	%d30, %d6
	bgt,pt	%ncc, .align_100_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	ba	.remain_stuff
	add	%o0, 32, %o0
	! END OF align_100

.align_011:
! Alignment off by 40 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	add	%o1, 40, %o1
	sub	%o2, 40, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_011_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d8

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	fmovd	%d30, %d8
	bgt,pt	%ncc, .align_011_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	ba	.remain_stuff
	add	%o0, 40, %o0
	! END OF align_011

.align_010:
! Alignment off by 48 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	ldd	[%o1+40], %d10
	add	%o1, 48, %o1
	sub	%o2, 48, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_010_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d10

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	fmovd	%d30, %d10
	bgt,pt	%ncc, .align_010_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	std	%d10, [%o0+40]
	ba	.remain_stuff
	add	%o0, 48, %o0
	! END OF align_010

.align_001:
! Alignment off by 56 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	ldd	[%o1+40], %d10
	ldd	[%o1+48], %d12
	add	%o1, 56, %o1
	sub	%o2, 56, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_001_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d14
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d12

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	fmovd	%d30, %d12
	bgt,pt	%ncc, .align_001_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	std	%d10, [%o0+40]
	std	%d12, [%o0+48]
	ba	.remain_stuff
	add	%o0, 56, %o0
	! END OF align_001

.align_000:
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_000_loop:
	/* ---- copy line 1 of 2. ---- */
	subcc	%o5, 128, %o5
	ldda	[%o1]%asi,%d0
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read

	/* ---- copy line 2 of 2. ---- */
	add	%o0, 64, %o0
	ldda	[%o1+64]%asi,%d0
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! increment dst
	bgt,pt	%ncc, .align_000_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	! END OF align_000

.remain_stuff:
	mov	%o4, %asi		! restore %asi
	brnz	%g5, .medlong		! original fprs.fef was set; leave fp on
	membar	#Sync
	ba	.medlong
	wr	%g5, %g0, %fprs		! %g5 is 0 here: disable fp again

	.align 16
	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.unalignsetup:
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
.unalignrejoin:
	rd	%fprs, %g5		! check for unused fp
	! if fprs.fef == 0, set it.
	! Setting it when already set costs more than checking
	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 1f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1:
	cmp	%o2, MED_UMAX		! check for medium unaligned limit
	bge,pt	%ncc, .unalign_large
	nop
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	cmp	%o2, 8			! Ensure we don't load beyond
	bgt	.unalign_adjust		! end of source buffer
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o2, 64, %o2		! adjust to leave loop
	sub	%o5, 64, %o5		! early if necessary
.unalign_adjust:
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks
	ldd	[%o4], %d0
.unalign_loop:
	ldd	[%o4+8], %d2
	faligndata %d0, %d2, %d16
	ldd	[%o4+16], %d4
	std	%d16, [%o0]
	faligndata %d2, %d4, %d18
	ldd	[%o4+24], %d6
	std	%d18, [%o0+8]
	faligndata %d4, %d6, %d20
	ldd	[%o4+32], %d8
	std	%d20, [%o0+16]
	faligndata %d6, %d8, %d22
	ldd	[%o4+40], %d10
	std	%d22, [%o0+24]
	faligndata %d8, %d10, %d24
	ldd	[%o4+48], %d12
	std	%d24, [%o0+32]
	faligndata %d10, %d12, %d26
	ldd	[%o4+56], %d14
	std	%d26, [%o0+40]
	faligndata %d12, %d14, %d28
	ldd	[%o4+64], %d0
	std	%d28, [%o0+48]
	faligndata %d14, %d0, %d30
	add	%o4, BLOCK_SIZE, %o4
	std	%d30, [%o0+56]
	add	%o0, BLOCK_SIZE, %o0
	subcc	%o5, BLOCK_SIZE, %o5
	bgu,pt	%ncc, .unalign_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_done
	nop

.unalign_large:
	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%ncc, .unalignsrc
	sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%o1, 0x1, %o5
	bnz	%ncc, .unalignbyte	! check for byte alignment
	nop
	andcc	%o1, 2, %o5		! check for half word alignment
	bnz	%ncc, .unalignhalf
	nop
	! Src is word aligned
.unalignword:
	ld	[%o1], %o4		! load 4 bytes
	stw	%o4, [%o0]		! and store 4 bytes
	ld	[%o1+4], %o4		! load 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	stw	%o4, [%o0+4]		! and store 4 bytes
	subcc	%o3, 8, %o3		! decrease count by 8
	bnz	%ncc, .unalignword
	add	%o0, 8, %o0		! increase dst ptr by 8
	ba	.unalignsrc
	nop

	! Src is half-word aligned
.unalignhalf:
	lduh	[%o1], %o4		! load 2 bytes
	sllx	%o4, 32, %o5		! shift left
	lduw	[%o1+2], %o4
	or	%o4, %o5, %o5
	sllx	%o5, 16, %o5
	lduh	[%o1+6], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0]
	add	%o1, 8, %o1
	subcc	%o3, 8, %o3
	bnz	%ncc, .unalignhalf
	add	%o0, 8, %o0
	ba	.unalignsrc
	nop
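	! .unalignhalf above assembles each 8-byte store from a 2/4/2-byte
	! read pattern; as a C sketch (big-endian byte order, with the
	! instruction names used as pseudo-functions):
	!
	!	x = ((uint64_t)lduh(src) << 48) |
	!	    ((uint64_t)lduw(src + 2) << 16) | lduh(src + 6);
	!	stx(dst, x);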

	! Src is Byte aligned
.unalignbyte:
	sub	%o0, %o1, %o0		! share pointer advance
.unalignbyte_loop:
	ldub	[%o1], %o4
	sllx	%o4, 56, %o5
	lduh	[%o1+1], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o5, %o5
	lduh	[%o1+3], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o5, %o5
	lduh	[%o1+5], %o4
	sllx	%o4,  8, %o4
	or	%o4, %o5, %o5
	ldub	[%o1+7], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0+%o1]
	subcc	%o3, 8, %o3
	bnz	%ncc, .unalignbyte_loop
	add	%o1, 8, %o1
	add	%o0, %o1, %o0		! restore pointer

	! Destination is now block (64 byte) aligned
.unalignsrc:
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	add	%o2, 64, %o2		! Ensure we don't load beyond
	sub	%o5, 64, %o5		! end of source buffer

	andn	%o1, 0x3f, %o4		! %o4 has block aligned src address
	prefetch [%o4 + (3 * BLOCK_SIZE)], #one_read
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks
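	!
	! The unalign_* loops below lean on alignaddr/faligndata: the
	! alignaddr above latched the source misalignment in %gsr, and each
	! faligndata(a, b) extracts the eight destination bytes straddling
	! consecutive aligned doublewords a and b.  As a C sketch, with
	! off = src & 7 (off == 0 degenerates to just a):
	!
	!	dst_word = (a << (8 * off)) | (b >> (64 - 8 * off));
	!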
1816	!
1817	! Determine source alignment to correct 8 byte offset
1818	andcc	%o1, 0x20, %o3
1819	brnz,pn	%o3, .unalign_1
1820	nop
1821	andcc	%o1, 0x10, %o3
1822	brnz,pn	%o3, .unalign_01
1823	nop
1824	andcc	%o1, 0x08, %o3
1825	brz,a	%o3, .unalign_000
1826	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1827	ba	.unalign_001
1828	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1829.unalign_01:
1830	andcc	%o1, 0x08, %o3
1831	brnz,a	%o3, .unalign_011
1832	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1833	ba	.unalign_010
1834	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1835.unalign_1:
1836	andcc	%o1, 0x10, %o3
1837	brnz,pn	%o3, .unalign_11
1838	nop
1839	andcc	%o1, 0x08, %o3
1840	brnz,a	%o3, .unalign_101
1841	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1842	ba	.unalign_100
1843	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1844.unalign_11:
1845	andcc	%o1, 0x08, %o3
1846	brz,pn	%o3, .unalign_110
1847	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1848
1849.unalign_111:
1850	ldd	[%o4+56], %d14
1851.unalign_111_loop:
1852	add	%o4, 64, %o4
1853	ldda	[%o4]ASI_BLK_P, %d16
1854	faligndata %d14, %d16, %d48
1855	faligndata %d16, %d18, %d50
1856	faligndata %d18, %d20, %d52
1857	faligndata %d20, %d22, %d54
1858	faligndata %d22, %d24, %d56
1859	faligndata %d24, %d26, %d58
1860	faligndata %d26, %d28, %d60
1861	faligndata %d28, %d30, %d62
1862	fmovd	%d30, %d14
1863	stda	%d48, [%o0]ASI_BLK_P
1864	subcc	%o5, 64, %o5
1865	add	%o0, 64, %o0
1866	bgu,pt	%ncc, .unalign_111_loop
1867	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1868	ba	.unalign_done
1869	membar	#Sync
1870
1871.unalign_110:
1872	ldd	[%o4+48], %d12
1873	ldd	[%o4+56], %d14
1874.unalign_110_loop:
1875	add	%o4, 64, %o4
1876	ldda	[%o4]ASI_BLK_P, %d16
1877	faligndata %d12, %d14, %d48
1878	faligndata %d14, %d16, %d50
1879	faligndata %d16, %d18, %d52
1880	faligndata %d18, %d20, %d54
1881	faligndata %d20, %d22, %d56
1882	faligndata %d22, %d24, %d58
1883	faligndata %d24, %d26, %d60
1884	faligndata %d26, %d28, %d62
1885	fmovd	%d28, %d12
1886	fmovd	%d30, %d14
1887	stda	%d48, [%o0]ASI_BLK_P
1888	subcc	%o5, 64, %o5
1889	add	%o0, 64, %o0
1890	bgu,pt	%ncc, .unalign_110_loop
1891	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
1892	ba	.unalign_done
1893	membar	#Sync
1894
.unalign_101:
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.unalign_101_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d10, %d12, %d48
	faligndata %d12, %d14, %d50
	faligndata %d14, %d16, %d52
	faligndata %d16, %d18, %d54
	faligndata %d18, %d20, %d56
	faligndata %d20, %d22, %d58
	faligndata %d22, %d24, %d60
	faligndata %d24, %d26, %d62
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%o0]ASI_BLK_P
	subcc	%o5, 64, %o5
	add	%o0, 64, %o0
	bgu,pt	%ncc, .unalign_101_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_done
	membar	#Sync

.unalign_100:
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.unalign_100_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d8, %d10, %d48
	faligndata %d10, %d12, %d50
	faligndata %d12, %d14, %d52
	faligndata %d14, %d16, %d54
	faligndata %d16, %d18, %d56
	faligndata %d18, %d20, %d58
	faligndata %d20, %d22, %d60
	faligndata %d22, %d24, %d62
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%o0]ASI_BLK_P
	subcc	%o5, 64, %o5
	add	%o0, 64, %o0
	bgu,pt	%ncc, .unalign_100_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_done
	membar	#Sync

.unalign_011:
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.unalign_011_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d6, %d8, %d48
	faligndata %d8, %d10, %d50
	faligndata %d10, %d12, %d52
	faligndata %d12, %d14, %d54
	faligndata %d14, %d16, %d56
	faligndata %d16, %d18, %d58
	faligndata %d18, %d20, %d60
	faligndata %d20, %d22, %d62
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%o0]ASI_BLK_P
	subcc	%o5, 64, %o5
	add	%o0, 64, %o0
	bgu,pt	%ncc, .unalign_011_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_done
	membar	#Sync

.unalign_010:
	ldd	[%o4+16], %d4
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.unalign_010_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d4, %d6, %d48
	faligndata %d6, %d8, %d50
	faligndata %d8, %d10, %d52
	faligndata %d10, %d12, %d54
	faligndata %d12, %d14, %d56
	faligndata %d14, %d16, %d58
	faligndata %d16, %d18, %d60
	faligndata %d18, %d20, %d62
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%o0]ASI_BLK_P
	subcc	%o5, 64, %o5
	add	%o0, 64, %o0
	bgu,pt	%ncc, .unalign_010_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_done
	membar	#Sync

.unalign_001:
	ldd	[%o4+8], %d2
	ldd	[%o4+16], %d4
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.unalign_001_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d2, %d4, %d48
	faligndata %d4, %d6, %d50
	faligndata %d6, %d8, %d52
	faligndata %d8, %d10, %d54
	faligndata %d10, %d12, %d56
	faligndata %d12, %d14, %d58
	faligndata %d14, %d16, %d60
	faligndata %d16, %d18, %d62
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%o0]ASI_BLK_P
	subcc	%o5, 64, %o5
	add	%o0, 64, %o0
	bgu,pt	%ncc, .unalign_001_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_done
	membar	#Sync

.unalign_000:
	ldda	[%o4]ASI_BLK_P, %d0
.unalign_000_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d0, %d2, %d48
	faligndata %d2, %d4, %d50
	faligndata %d4, %d6, %d52
	faligndata %d6, %d8, %d54
	faligndata %d8, %d10, %d56
	faligndata %d10, %d12, %d58
	faligndata %d12, %d14, %d60
	faligndata %d14, %d16, %d62
	fmovd	%d16, %d0
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%o0]ASI_BLK_P
	subcc	%o5, 64, %o5
	add	%o0, 64, %o0
	bgu,pt	%ncc, .unalign_000_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	membar	#Sync

.unalign_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp	%o2, 15
	bleu	%ncc, .unalign_short

	andn	%o2, 0x7, %o5		! %o5 is multiple of 8
	and	%o2, 0x7, %o2		! residue bytes in %o2
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		! ensure we don't load past end of src
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o1, %o5, %o1		! advance %o1 to after multiple of 8
	ldd	[%o4], %d0		! fetch partial word
.unalign_by8:
	ldd	[%o4+8], %d2
	add	%o4, 8, %o4
	faligndata %d0, %d2, %d16
	subcc	%o5, 8, %o5
	std	%d16, [%o0]
	fmovd	%d2, %d0
	bgu,pt	%ncc, .unalign_by8
	add	%o0, 8, %o0

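/*
 * The tail arithmetic above, as a C sketch: one 8-byte unit is held
 * back from the merge loop so its ldd pair can never read past the
 * end of the source buffer (hypothetical helper name):
 *
 *	#include <stddef.h>
 *
 *	static void
 *	split_tail(size_t n, size_t *bychunk, size_t *bybyte)
 *	{
 *		*bychunk = (n & ~(size_t)7) - 8;  // 8-byte merge loop
 *		*bybyte = (n & 7) + 8;            // byte copy at .smallrest
 *	}
 */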
.unalign_short:
	brnz	%g5, .smallrest		! %fprs was nonzero on entry, leave it
	nop
	ba	.smallrest
	wr	%g5, %g0, %fprs		! restore the saved (zero) %fprs
#else	/* NIAGARA2_IMPL */
.forcpy:
	mov	%o0, %g5		! save dest address for return val
	cmp	%o2, 17			! for small counts copy bytes
	bleu,pt	%ncc, .dbytecp
	nop

	cmp	%o2, 0x80		! for lengths less than 128 bytes do not
	bleu,pn	%ncc, .no_blkcpy	! copy using ASI_BLK_INIT_ST_QUAD_LDD_P

	/*
	 * Use the ASI_BLK_INIT_ST_QUAD_LDD_P asi only when the source
	 * is at least 64 bytes above the destination, or lies below it;
	 * otherwise the initializing block stores could overwrite
	 * source bytes that have not been loaded yet.
	 */
	subcc	%o1, %o0, %o3		! %o3 = src - dst
	blu	%ncc, .blkalgndst	! src below dst, always use block copy
	cmp	%o3, 0x40		! if src - dst >= 0x40
	bgeu,pt	%ncc, .blkalgndst	! then use ASI_BLK_INIT_ST_QUAD_LDD_P
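/*
 * The distance test above, in C (illustrative): the block-init path
 * is taken unless the source sits 0..63 bytes above the destination,
 * so the initializing 64-byte stores can never clobber source bytes
 * that are still to be read.
 *
 *	#include <stdint.h>
 *
 *	static int
 *	can_blkcpy(const void *src, const void *dst)
 *	{
 *		uintptr_t s = (uintptr_t)src, d = (uintptr_t)dst;
 *
 *		return (s < d || s - d >= 0x40);
 *	}
 */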
.no_blkcpy:
	andcc	%o1, 3, %o5		! is src word aligned
	bz,pn	%ncc, .aldst
	cmp	%o5, 2			! is src half-word aligned
	be,pt	%ncc, .s2algn
	cmp	%o5, 3			! src is byte aligned
.s1algn:ldub	[%o1], %o3		! move 1 or 3 bytes to align it
	inc	1, %o1
	stb	%o3, [%g5]		! move a byte to align src
	inc	1, %g5
	bne,pt	%ncc, .s2algn
	dec	%o2
	b	.ald			! now go align dest
	andcc	%g5, 3, %o5

.s2algn:lduh	[%o1], %o3		! know src is 2 byte aligned
	inc	2, %o1
	srl	%o3, 8, %o4
	stb	%o4, [%g5]		! have to do bytes,
	stb	%o3, [%g5 + 1]		! don't know dst alignment
	inc	2, %g5
	dec	2, %o2

.aldst:	andcc	%g5, 3, %o5		! align the destination address
.ald:	bz,pn	%ncc, .w4cp
	cmp	%o5, 2
	bz,pn	%ncc, .w2cp
	cmp	%o5, 3
.w3cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 24, %o5
	stb	%o5, [%g5]
	bne,pt	%ncc, .w1cp
	inc	%g5
	dec	1, %o2
	andn	%o2, 3, %o3		! o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %g5, %o1		! o1 gets the difference

1:	sll	%o4, 8, %g1		! save residual bytes
	lduw	[%o1+%g5], %o4
	deccc	4, %o3
	srl	%o4, 24, %o5		! merge with residual
	or	%o5, %g1, %g1
	st	%g1, [%g5]
	bnz,pt	%ncc, 1b
	inc	4, %g5
	sub	%o1, 3, %o1		! used one byte of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w1cp:	srl	%o4, 8, %o5
	sth	%o5, [%g5]
	inc	2, %g5
	dec	3, %o2
	andn	%o2, 3, %o3		! o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %g5, %o1		! o1 gets the difference

2:	sll	%o4, 24, %g1		! save residual bytes
	lduw	[%o1+%g5], %o4
	deccc	4, %o3
	srl	%o4, 8, %o5		! merge with residual
	or	%o5, %g1, %g1
	st	%g1, [%g5]
	bnz,pt	%ncc, 2b
	inc	4, %g5
	sub	%o1, 1, %o1		! used three bytes of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w2cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 16, %o5
	sth	%o5, [%g5]
	inc	2, %g5
	dec	2, %o2
	andn	%o2, 3, %o3		! o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %g5, %o1		! o1 gets the difference

3:	sll	%o4, 16, %g1		! save residual bytes
	lduw	[%o1+%g5], %o4
	deccc	4, %o3
	srl	%o4, 16, %o5		! merge with residual
	or	%o5, %g1, %g1
	st	%g1, [%g5]
	bnz,pt	%ncc, 3b
	inc	4, %g5
	sub	%o1, 2, %o1		! used two bytes of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w4cp:	andn	%o2, 3, %o3		! o3 is aligned word count
	sub	%o1, %g5, %o1		! o1 gets the difference

1:	lduw	[%o1+%g5], %o4		! read from address
	deccc	4, %o3			! decrement count
	st	%o4, [%g5]		! write at destination address
	bgu,pt	%ncc, 1b
	inc	4, %g5			! increment to address
	b	7f
	and	%o2, 3, %o2		! number of leftover bytes, if any

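/*
 * The three .wNcp loops above share one technique: the source is
 * already word aligned, a few leading bytes bring the destination to
 * word alignment, and then each output word merges the leftover bytes
 * of the previous source word with the next one.  A C sketch
 * (big-endian, illustrative; 'used' is the number of leading bytes of
 * the first word already stored, 1..3, and as in the code above the
 * caller holds back one word so the final read stays in bounds):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void
 *	merge_words(uint32_t *dst, const uint32_t *src, size_t nwords,
 *	    unsigned used)
 *	{
 *		uint32_t prev = *src++;
 *
 *		while (nwords--) {
 *			uint32_t next = *src++;
 *			*dst++ = (prev << (8 * used)) |
 *			    (next >> (8 * (4 - used)));
 *			prev = next;
 *		}
 *	}
 */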
	!
	! differenced byte copy, works with any alignment
	!
.dbytecp:
	b	7f
	sub	%o1, %g5, %o1		! o1 gets the difference

4:	stb	%o4, [%g5]		! write to address
	inc	%g5			! inc to address
7:	deccc	%o2			! decrement count
	bgeu,a,pt %ncc, 4b		! loop till done
	ldub	[%o1+%g5], %o4		! read from address
	retl				! %o0 was preserved
	nop

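/*
 * The differenced byte copy above keeps (src - dst) in %o1, so a
 * single increment advances both streams; it works for any alignment.
 * A C model (differencing pointers into separate objects is only a
 * model here, not strictly conforming C):
 *
 *	#include <stddef.h>
 *
 *	static void
 *	dbytecp(char *dst, const char *src, size_t n)
 *	{
 *		ptrdiff_t diff = src - dst;	// sub %o1, %g5, %o1
 *
 *		while (n-- != 0) {
 *			*dst = dst[diff];	// ldub [%o1+%g5] / stb [%g5]
 *			dst++;			// inc %g5
 *		}
 *	}
 */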
.blkalgndst:
	save	%sp, -SA(MINFRAME), %sp

	! Block (64 bytes) align the destination.
	andcc	%i0, 0x3f, %i3		! is dst block aligned
	bz	%ncc, .chksrc		! dst already block aligned
	sub	%i3, 0x40, %i3
	neg	%i3			! bytes till dst 64 bytes aligned
	sub	%i2, %i3, %i2		! update i2 with new count

	! Based on source and destination alignment do
	! either 8 bytes, 4 bytes, 2 bytes or byte copy.

	! Is dst & src 8B aligned
	or	%i0, %i1, %o2
	andcc	%o2, 0x7, %g0
	bz	%ncc, .alewdcp
	nop

	! Is dst & src 4B aligned
	andcc	%o2, 0x3, %g0
	bz	%ncc, .alwdcp
	nop

	! Is dst & src 2B aligned
	andcc	%o2, 0x1, %g0
	bz	%ncc, .alhlfwdcp
	nop

	! 1B aligned
1:	ldub	[%i1], %o2
	stb	%o2, [%i0]
	inc	%i1
	deccc	%i3
	bgu,pt	%ncc, 1b
	inc	%i0

	ba	.chksrc
	nop

	! dst & src 4B aligned
.alwdcp:
	ld	[%i1], %o2
	st	%o2, [%i0]
	add	%i1, 0x4, %i1
	subcc	%i3, 0x4, %i3
	bgu,pt	%ncc, .alwdcp
	add	%i0, 0x4, %i0

	ba	.chksrc
	nop

	! dst & src 2B aligned
.alhlfwdcp:
	lduh	[%i1], %o2
	stuh	%o2, [%i0]
	add	%i1, 0x2, %i1
	subcc	%i3, 0x2, %i3
	bgu,pt	%ncc, .alhlfwdcp
	add	%i0, 0x2, %i0

	ba	.chksrc
	nop

	! dst & src 8B aligned
.alewdcp:
	ldx	[%i1], %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	subcc	%i3, 0x8, %i3
	bgu,pt	%ncc, .alewdcp
	add	%i0, 0x8, %i0

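/*
 * The four loops above advance the destination to a 64-byte boundary
 * using the widest access the joint alignment allows, chosen by
 * testing (src | dst).  A C sketch of the selection (hypothetical
 * helper):
 *
 *	#include <stdint.h>
 *
 *	static unsigned
 *	align_step(uintptr_t src, uintptr_t dst)
 *	{
 *		uintptr_t both = src | dst;
 *
 *		if ((both & 0x7) == 0)
 *			return (8);	// ldx/stx
 *		if ((both & 0x3) == 0)
 *			return (4);	// ld/st
 *		if ((both & 0x1) == 0)
 *			return (2);	// lduh/stuh
 *		return (1);		// ldub/stb
 *	}
 */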
	! Now the destination is block (64 bytes) aligned
.chksrc:
	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
	sub	%i2, %i3, %i2		! residue bytes in %i2
	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
	andcc	%i1, 0xf, %l1		! is src quadword aligned
	bz,pn	%ncc, .blkcpy		! src offset in %l1
	nop
	cmp	%l1, 0x8
	bgu	%ncc, .cpy_upper_double
	nop
	blu	%ncc, .cpy_lower_double
	nop

	! Falls through when the source offset is exactly 8, i.e. the
	! source is double word aligned; no shift/merge of the data is
	! required in this case.
	sub	%i1, %l1, %i1		! align the src at 16 bytes.
	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
	prefetch [%o0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %o2
loop0:
	ldda	[%i1+0x10]%asi, %o4
	prefetch [%o0+0x40], #one_read

	stxa	%o3, [%i0+0x0]%asi
	stxa	%o4, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %o2
	stxa	%o5, [%i0+0x10]%asi
	stxa	%o2, [%i0+0x18]%asi

	ldda	[%i1+0x30]%asi, %o4
	stxa	%o3, [%i0+0x20]%asi
	stxa	%o4, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %o2
	stxa	%o5, [%i0+0x30]%asi
	stxa	%o2, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%ncc, loop0
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %l1, %i1		! increment the source by src offset

.cpy_lower_double:
	sub	%i1, %l1, %i1		! align the src at 16 bytes.
	sll	%l1, 3, %l2		! %l2 left shift
	mov	0x40, %l3
	sub	%l3, %l2, %l3		! %l3 right shift = (64 - left shift)
	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
	prefetch [%o0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %o2	! %o2 has partial data, %o3 has
					! complete data
loop1:
	ldda	[%i1+0x10]%asi, %o4	! %o4 has partial data for this read.
	ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)	! merge %o2, %o3 and %o4
							! into %o2 and %o3
	prefetch [%o0+0x40], #one_read
	stxa	%o2, [%i0+0x0]%asi
	stxa	%o3, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %o2
	ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)	! merge %o2 with %o5 and
	stxa	%o4, [%i0+0x10]%asi			! %o4 from previous read
	stxa	%o5, [%i0+0x18]%asi			! into %o4 and %o5

	! Repeat the same for next 32 bytes.

	ldda	[%i1+0x30]%asi, %o4
	ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)
	stxa	%o2, [%i0+0x20]%asi
	stxa	%o3, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %o2
	ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)
	stxa	%o4, [%i0+0x30]%asi
	stxa	%o5, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%ncc, loop1
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %l1, %i1		! increment the source by src offset

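/*
 * ALIGN_DATA merges quad-word reads with a left/right shift pair:
 * left = 8 * (source offset within the quad) and right = 64 - left,
 * as computed into %l2/%l3 above.  A C model of one merge step
 * (illustrative; lshift is 8..56 on this path, so neither shift is
 * ever 64):
 *
 *	#include <stdint.h>
 *
 *	static void
 *	align_data(uint64_t a, uint64_t b, uint64_t c,
 *	    unsigned lshift, uint64_t *out0, uint64_t *out1)
 *	{
 *		unsigned rshift = 64 - lshift;
 *
 *		*out0 = (a << lshift) | (b >> rshift);
 *		*out1 = (b << lshift) | (c >> rshift);
 *	}
 */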
.cpy_upper_double:
	sub	%i1, %l1, %i1		! align the src at 16 bytes.
	mov	0x8, %l2
	sub	%l1, %l2, %l2
	sll	%l2, 3, %l2		! %l2 left shift
	mov	0x40, %l3
	sub	%l3, %l2, %l3		! %l3 right shift = (64 - left shift)
	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
	prefetch [%o0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %o2	! partial data in %o3 for this read and
					! no data in %o2
loop2:
	ldda	[%i1+0x10]%asi, %o4	! %o4 has complete data and %o5 has
					! partial
	ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)	! merge %o3, %o4 and %o5
							! into %o3 and %o4
	prefetch [%o0+0x40], #one_read
	stxa	%o3, [%i0+0x0]%asi
	stxa	%o4, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %o2
	ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)	! merge %o2 and %o3 with
	stxa	%o5, [%i0+0x10]%asi			! %o5 from previous read
	stxa	%o2, [%i0+0x18]%asi			! into %o5 and %o2

	! Repeat the same for next 32 bytes.

	ldda	[%i1+0x30]%asi, %o4
	ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)
	stxa	%o3, [%i0+0x20]%asi
	stxa	%o4, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %o2
	ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)
	stxa	%o5, [%i0+0x30]%asi
	stxa	%o2, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%ncc, loop2
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %l1, %i1		! increment the source by src offset

	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.blkcpy:
	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
	prefetch [%o0+0x0], #one_read
1:
	prefetch [%o0+0x40], #one_read

	ldda	[%i1+0x0]%asi, %o2
	ldda	[%i1+0x10]%asi, %o4

	stxa	%o2, [%i0+0x0]%asi
	stxa	%o3, [%i0+0x8]%asi
	stxa	%o4, [%i0+0x10]%asi
	stxa	%o5, [%i0+0x18]%asi

	ldda	[%i1+0x20]%asi, %o2
	ldda	[%i1+0x30]%asi, %o4

	stxa	%o2, [%i0+0x20]%asi
	stxa	%o3, [%i0+0x28]%asi
	stxa	%o4, [%i0+0x30]%asi
	stxa	%o5, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%ncc, 1b
	add	%i0, 0x40, %i0

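/*
 * With the source quad (16-byte) aligned, no merging is needed: each
 * 64-byte block moves as four 16-byte ldda pairs stored back as eight
 * 8-byte stxa's.  Functionally it is just (C sketch):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void
 *	blkcpy(uint64_t *dst, const uint64_t *src, size_t nblocks)
 *	{
 *		while (nblocks--) {
 *			for (int i = 0; i < 8; i++)
 *				*dst++ = *src++;
 *		}
 *	}
 *
 * The block-init ASI allocates each destination line without fetching
 * its old contents, which is why .blkdone below issues membar #Sync
 * before any ordinary access touches the copied data.
 */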
.blkdone:
	membar	#Sync

	mov	ASI_PNF, %asi		! restore %asi to default
					! ASI_PRIMARY_NOFAULT value
	tst	%i2
	bz,pt	%ncc, .blkexit
	nop

	! Handle trailing bytes
	cmp	%i2, 0x8
	blu,pt	%ncc, .residue
	nop

	! Can we do some 8B ops
	or	%i1, %i0, %o2
	andcc	%o2, 0x7, %g0
	bnz	%ncc, .last4
	nop

	! Do 8-byte ops as long as possible
.last8:
	ldx	[%i1], %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	sub	%i2, 0x8, %i2
	cmp	%i2, 0x8
	bgu,pt	%ncc, .last8
	add	%i0, 0x8, %i0

	tst	%i2
	bz,pt	%ncc, .blkexit
	nop

	ba	.residue
	nop

.last4:
	! Can we do 4B ops
	andcc	%o2, 0x3, %g0
	bnz	%ncc, .last2
	nop
1:
	ld	[%i1], %o2
	st	%o2, [%i0]
	add	%i1, 0x4, %i1
	sub	%i2, 0x4, %i2
	cmp	%i2, 0x4
	bgu,pt	%ncc, 1b
	add	%i0, 0x4, %i0

	cmp	%i2, 0
	bz,pt	%ncc, .blkexit
	nop

	ba	.residue
	nop

.last2:
	! Can we do 2B ops
	andcc	%o2, 0x1, %g0
	bnz	%ncc, .residue
	nop

1:
	lduh	[%i1], %o2
	stuh	%o2, [%i0]
	add	%i1, 0x2, %i1
	sub	%i2, 0x2, %i2
	cmp	%i2, 0x2
	bgu,pt	%ncc, 1b
	add	%i0, 0x2, %i0

	cmp	%i2, 0
	bz,pt	%ncc, .blkexit
	nop

.residue:
	ldub	[%i1], %o2
	stb	%o2, [%i0]
	inc	%i1
	deccc	%i2
	bgu,pt	%ncc, .residue
	inc	%i0

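/*
 * The trailing-byte cascade above (.last8/.last4/.last2/.residue)
 * drains the post-block residue (< 64 bytes) with the widest ops the
 * joint alignment allows.  A C sketch of the whole cascade
 * (illustrative only):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void
 *	residue(uint8_t *dst, const uint8_t *src, size_t n)
 *	{
 *		uintptr_t both = (uintptr_t)src | (uintptr_t)dst;
 *
 *		if ((both & 0x7) == 0)
 *			for (; n >= 8; n -= 8, src += 8, dst += 8)
 *				*(uint64_t *)dst = *(const uint64_t *)src;
 *		else if ((both & 0x3) == 0)
 *			for (; n >= 4; n -= 4, src += 4, dst += 4)
 *				*(uint32_t *)dst = *(const uint32_t *)src;
 *		else if ((both & 0x1) == 0)
 *			for (; n >= 2; n -= 2, src += 2, dst += 2)
 *				*(uint16_t *)dst = *(const uint16_t *)src;
 *		while (n--)
 *			*dst++ = *src++;
 *	}
 */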
.blkexit:

	ret
	restore	%g5, %g0, %o0

#endif	/* NIAGARA2_IMPL */
	SET_SIZE(memcpy)
	SET_SIZE(__align_cpy_1)
