/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */


#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>
#include <sys/machasi.h>
#include <sys/niagaraasi.h>

#include "assym.h"


/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/kcopy routine.
 *
 *	! WARNING : <Register usage convention>
 *	! In kcopy(), %o5 holds the previous error handler and a flag,
 *	! LOFAULT_SET (low bits). %o5 is null in bcopy().
 *	! %o5 is not available for any other use.
 *
 * On entry:
 *	! Determine whether to use the FP register version or the
 *	! leaf routine version depending on the size of the copy.
 *	! Set up error handling accordingly.
 *	! The transition point depends on FP_COPY.
 *	! For both versions %o5 is reserved.
 *
 * kcopy():
 *	if (length > FP_COPY)
 *		go to regular_kcopy
 *
 *	! Setup_leaf_rtn_error_handler
 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
 *	%o5 |= LOFAULT_SET;			! ORed with LOFAULT_SET flag
 *	curthread->t_lofault = .sm_copyerr;
 *	goto small_bcopy();
 *
 * regular_kcopy:
 *	save_registers()
 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
 *	%o5 |= LOFAULT_SET;			! ORed with LOFAULT_SET flag
 *	curthread->t_lofault = .copyerr;
 *	goto do_copy();
 *
 * bcopy():
 *	if (length > FP_COPY)
 *		go to regular_bcopy
 *
 *	! Setup_leaf_rtn_error_handler
 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
 *	curthread->t_lofault = .sm_copyerr;
 *	goto small_bcopy();
 *
 * regular_bcopy:
 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
 *	curthread->t_lofault = .copyerr;
 *	goto do_copy();
 *
 * small_bcopy:
 *	! handle copies smaller than FP_COPY
 *	restore t_lofault handler
 *	exit
 *
 * do_copy:
 *	! handle copies larger than FP_COPY
 *	save fp_regs
 *	blockcopy;
 *	restore fp_regs
 *	restore t_lofault handler if we came from kcopy();
 *
 *
 * In the leaf lofault handler:
 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
 *	return (errno)
 *
 * In the lofault handler:
 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
 *	restore fp_regs
 *	return (errno)
 *
 *
 *
 * For all of bcopy/copyin/copyout the copy logic is specialized according
 * to how the src and dst are aligned and how much data needs to be moved.
 * The following comments apply to the N2/RF code (#if !defined(NIAGARA_IMPL)).
 *
 * N2/RF Flow :
 *
 * if (count < FP_COPY) {  (584 bytes)
 *   set small fault handler (no register window save/restore)
 *   if (count < SHORTCOPY)  (7 bytes)
 *	copy bytes; go to short_exit
 *   else
 *     determine dst alignment, move minimum bytes/halfwords to
 *     get dst aligned on long word boundary
 *     if (src is on long word boundary) {
 * medlong:					   src/dst aligned on 8 bytes
 *	 copy with ldx/stx in 4-way unrolled loop;
 *       copy final 0-31 bytes; go to short_exit
 *     } else {					src/dst not aligned on 8 bytes
 *       if src is word aligned, ld/st words in 32-byte chunks
 *       if src is half word aligned, ld half, ld word, ld half; pack
 *		into long word, store long words in 32-byte chunks
 *       if src is byte aligned, ld byte,half,word parts;  pack into long
 *	   word, store long words in 32-byte chunks
 *       move final 0-31 bytes according to src alignment;  go to short_exit
 *     }
 * short_exit:
 *     restore trap handler if needed, retl
 * } else {					   More than FP_COPY bytes
 *     set fault handler
 *     disable kernel preemption
 *     save registers, save FP registers if in use
 *     move bytes to align destination register on long word boundary
 *     if (src is on long word boundary) {	   src/dst aligned on 8 bytes
 *       align dst on 64 byte boundary;  use 8-way test for each of 8 possible
 *       src alignments relative to a 64 byte boundary to select the
 *       16-way unrolled loop (128 bytes) to use for
 *       block load, fmovd, block-init-store, block-store, fmovd operations
 *       then go to remain_stuff.
 * remain_stuff: move remaining bytes. go to long_exit
 *     } else {
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; use 8-way test for each of 8 possible
 *       src alignments to nearest long word relative to 64 byte boundary to
 *       select the 8-way unrolled loop (64 bytes) to use for
 *       block load, falign, fmovd, block-store loop
 *	 (only use block-init-store when src/dst on 8 byte boundaries.)
 *       goto unalign_done.
 * unalign_done:
 *       move remaining bytes for unaligned cases. go to long_exit
 * long_exit:
 *       restore %gsr, FP regs (either from stack or set to zero),
 *       restore trap handler, check for kernel preemption request,
 *       handle if needed, ret.
 *     }
 * }
 *
 * Other platforms include hw_bcopy_limit_[1248] to control the exact
 * point where the FP register code is used. On those platforms, the
 * FP register code did not leave data in the L2 cache, potentially affecting
 * performance more than the gain/loss from the algorithm difference.
 * For N2/RF, block store places data in the L2 cache, so use or non-use
 * of the FP registers has no effect on L2 cache behavior.
 * The cost of testing hw_bcopy_limit_* according to different
 * alignments exceeds 50 cycles for all cases, even when the hw_bcopy_limits
 * were not used. That cost was judged too high relative to the benefits,
 * so the hw_bcopy_limit option is omitted from this code.
 */
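
/*
 * For reference, the dispatch and fault-handling protocol above can be
 * summarized in C.  This is only an illustrative sketch of the flow in
 * this file (names mirror the labels above); it is not compiled code.
 *
 *	int
 *	kcopy_sketch(const void *from, void *to, size_t count)
 *	{
 *		caddr_t ofault = curthread->t_lofault;	// %o5 on entry
 *
 *		if (count <= FP_COPY)			// leaf routine path
 *			curthread->t_lofault = (caddr_t)sm_copyerr;
 *		else					// FP register path
 *			curthread->t_lofault = (caddr_t)copyerr;
 *		// ... copy; if a handler fires it restores ofault (and,
 *		// on the FP path, the saved FP state) and returns errno
 *		curthread->t_lofault = ofault;
 *		return (0);
 *	}
 */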

/*
 * Less than or equal to this number of bytes we will always copy
 * byte-for-byte.
 */
#define	SMALL_LIMIT	7

/*
 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that a t_lofault
 * handler was set
 */
#define	LOFAULT_SET 2

/*
 * This macro aligns data for the unaligned source cases.
 * data1, data2 and data3 are merged into data1 and data2;
 * data3 is preserved for the next merge.
 */
#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1				;\
	srlx	data2, rshift, tmp				;\
	or	data1, tmp, data1				;\
	sllx	data2, lshift, data2				;\
	srlx	data3, rshift, tmp				;\
	or	data2, tmp, data2
/*
 * This macro also aligns data; it merges
 * data1 and data2 to form a double word.
 */
#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1				;\
	srlx	data2, rshift, tmp				;\
	or	data1, tmp, data1
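
/*
 * In C terms the merge performed by ALIGN_DATA is (a sketch for
 * illustration only; lshift + rshift == 64):
 *
 *	data1 = (data1 << lshift) | (data2 >> rshift);
 *	data2 = (data2 << lshift) | (data3 >> rshift);
 *
 * and ALIGN_DATA_EW is the one-word tail of the same idea:
 *
 *	data1 = (data1 << lshift) | (data2 >> rshift);
 */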

#if !defined(NIAGARA_IMPL)
/*
 * Flags set in the lower bits of the t_lofault address:
 * FPUSED_FLAG: The FP registers were in use and must be restored
 * LOFAULT_SET: Set for bcopy calls, cleared for kcopy calls
 * COPY_FLAGS: Both of the above
 *
 * Other flags:
 * KPREEMPT_FLAG: kpreempt needs to be called
 */
#define	FPUSED_FLAG	1
#define	LOFAULT_SET	2
#define	COPY_FLAGS	(FPUSED_FLAG | LOFAULT_SET)
#define	KPREEMPT_FLAG	4
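
/*
 * The flags ride in the low bits of the saved t_lofault value, which
 * works because handler addresses are at least word aligned.  A C
 * sketch of the convention (illustration only):
 *
 *	saved = (uintptr_t)curthread->t_lofault | LOFAULT_SET;	// bcopy
 *	...
 *	curthread->t_lofault = (caddr_t)(saved & ~COPY_FLAGS);	// restore
 *
 * KPREEMPT_FLAG is never stored in t_lofault; it is carried in a
 * scratch register to note that kpreempt() must be called on exit.
 */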

#define	ALIGN_OFF_1_7			\
	faligndata %d0, %d2, %d48	;\
	faligndata %d2, %d4, %d50	;\
	faligndata %d4, %d6, %d52	;\
	faligndata %d6, %d8, %d54	;\
	faligndata %d8, %d10, %d56	;\
	faligndata %d10, %d12, %d58	;\
	faligndata %d12, %d14, %d60	;\
	faligndata %d14, %d16, %d62

#define	ALIGN_OFF_8_15			\
	faligndata %d2, %d4, %d48	;\
	faligndata %d4, %d6, %d50	;\
	faligndata %d6, %d8, %d52	;\
	faligndata %d8, %d10, %d54	;\
	faligndata %d10, %d12, %d56	;\
	faligndata %d12, %d14, %d58	;\
	faligndata %d14, %d16, %d60	;\
	faligndata %d16, %d18, %d62

#define	ALIGN_OFF_16_23			\
	faligndata %d4, %d6, %d48	;\
	faligndata %d6, %d8, %d50	;\
	faligndata %d8, %d10, %d52	;\
	faligndata %d10, %d12, %d54	;\
	faligndata %d12, %d14, %d56	;\
	faligndata %d14, %d16, %d58	;\
	faligndata %d16, %d18, %d60	;\
	faligndata %d18, %d20, %d62

#define	ALIGN_OFF_24_31			\
	faligndata %d6, %d8, %d48	;\
	faligndata %d8, %d10, %d50	;\
	faligndata %d10, %d12, %d52	;\
	faligndata %d12, %d14, %d54	;\
	faligndata %d14, %d16, %d56	;\
	faligndata %d16, %d18, %d58	;\
	faligndata %d18, %d20, %d60	;\
	faligndata %d20, %d22, %d62

#define	ALIGN_OFF_32_39			\
	faligndata %d8, %d10, %d48	;\
	faligndata %d10, %d12, %d50	;\
	faligndata %d12, %d14, %d52	;\
	faligndata %d14, %d16, %d54	;\
	faligndata %d16, %d18, %d56	;\
	faligndata %d18, %d20, %d58	;\
	faligndata %d20, %d22, %d60	;\
	faligndata %d22, %d24, %d62

#define	ALIGN_OFF_40_47			\
	faligndata %d10, %d12, %d48	;\
	faligndata %d12, %d14, %d50	;\
	faligndata %d14, %d16, %d52	;\
	faligndata %d16, %d18, %d54	;\
	faligndata %d18, %d20, %d56	;\
	faligndata %d20, %d22, %d58	;\
	faligndata %d22, %d24, %d60	;\
	faligndata %d24, %d26, %d62

#define	ALIGN_OFF_48_55			\
	faligndata %d12, %d14, %d48	;\
	faligndata %d14, %d16, %d50	;\
	faligndata %d16, %d18, %d52	;\
	faligndata %d18, %d20, %d54	;\
	faligndata %d20, %d22, %d56	;\
	faligndata %d22, %d24, %d58	;\
	faligndata %d24, %d26, %d60	;\
	faligndata %d26, %d28, %d62

#define	ALIGN_OFF_56_63			\
	faligndata %d14, %d16, %d48	;\
	faligndata %d16, %d18, %d50	;\
	faligndata %d18, %d20, %d52	;\
	faligndata %d20, %d22, %d54	;\
	faligndata %d22, %d24, %d56	;\
	faligndata %d24, %d26, %d58	;\
	faligndata %d26, %d28, %d60	;\
	faligndata %d28, %d30, %d62
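
/*
 * Each ALIGN_OFF_a_b macro above is one faligndata chain, selected by
 * how far (a to b bytes) the source lags a 64-byte boundary.
 * Conceptually, with off = GSR.align, faligndata computes (a sketch
 * only, big-endian byte numbering assumed):
 *
 *	uint64_t
 *	faligndata_sketch(uint64_t hi, uint64_t lo, uint_t off)
 *	{
 *		if (off == 0)
 *			return (hi);
 *		return ((hi << (8 * off)) | (lo >> (64 - 8 * off)));
 *	}
 *
 * so each chain turns nine consecutive input doublewords into 64
 * aligned output bytes in %d48..%d62.
 */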

/*
 * FP_COPY indicates the minimum number of bytes needed
 * to justify using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry.  Check that code carefully before
 * reducing FP_COPY below 256.
 */
#define FP_COPY			584
#define SHORTCOPY		7
#define ASI_STBI_P		ASI_BLK_INIT_ST_QUAD_LDD_P
#define ASI_STBI_AIUS		ASI_BLK_INIT_QUAD_LDD_AIUS
#define CACHE_LINE		64
#define	VIS_BLOCKSIZE		64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and two 64-bit temp locations.
 * All copy functions use three quadrants of fp registers; to guarantee
 * a block-aligned three-block buffer in which to save them we must
 * reserve four blocks on the stack.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 3 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 4 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs		    | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr		    | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 4)
#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 3) + 1)
#define SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
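
/*
 * Spelled out (a sketch of the arithmetic only), with a VIS_BLOCKSIZE
 * of 64 the frame reserves 4 * 64 bytes for the FP save area plus two
 * 8-byte slots, so HWCOPYFRAMESIZE is 272 and SAVED_FPREGS_OFFSET,
 * SAVED_FPRS_OFFSET and SAVED_GSR_OFFSET are 256, 264 and 272, all
 * subtracted from %fp + STACK_BIAS.  The save macros below derive a
 * block-aligned base with
 *
 *	base = (%fp + STACK_BIAS - SAVED_FPREGS_ADJUST) & -VIS_BLOCKSIZE;
 *
 * which always leaves 3 * VIS_BLOCKSIZE usable bytes inside the
 * 4-block area no matter how %fp itself is aligned.
 */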

/*
 * In FP copies, if we do not have preserved data to restore over
 * the fp regs we used, we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 */
#define	FZERO				\
	fzero	%f0			;\
	fzero	%f2			;\
	faddd	%f0, %f2, %f4		;\
	fmuld	%f0, %f2, %f6		;\
	faddd	%f0, %f2, %f8		;\
	fmuld	%f0, %f2, %f10		;\
	faddd	%f0, %f2, %f12		;\
	fmuld	%f0, %f2, %f14		;\
	faddd	%f0, %f2, %f16		;\
	fmuld	%f0, %f2, %f18		;\
	faddd	%f0, %f2, %f20		;\
	fmuld	%f0, %f2, %f22		;\
	faddd	%f0, %f2, %f24		;\
	fmuld	%f0, %f2, %f26		;\
	faddd	%f0, %f2, %f28		;\
	fmuld	%f0, %f2, %f30		;\
	faddd	%f0, %f2, %f48		;\
	fmuld	%f0, %f2, %f50		;\
	faddd	%f0, %f2, %f52		;\
	fmuld	%f0, %f2, %f54		;\
	faddd	%f0, %f2, %f56		;\
	fmuld	%f0, %f2, %f58		;\
	faddd	%f0, %f2, %f60		;\
	fmuld	%f0, %f2, %f62
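
/*
 * Note that faddd and fmuld with zero operands also produce zero, and
 * every op above reads only %f0 and %f2, so the zeroing writes are
 * mutually independent.  The chain clears the same three quadrants
 * that BST_FP_TOSTACK saves (the %f0, %f16 and %f48 blocks),
 * presumably issuing better than a long run of fzero ops would.
 */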

/*
 * Macros to save and restore fp registers to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP.
 */
#define BST_FP_TOSTACK(tmp1)					\
	/* membar #Sync	*/					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f16, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f48, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FP_FROMSTACK(tmp1)					\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f16				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f48				;\
	membar	#Sync

#endif /* !NIAGARA_IMPL */

/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all is OK.
 */

	.seg	".text"
	.align	4

	ENTRY(kcopy)
#if !defined(NIAGARA_IMPL)
	cmp	%o2, FP_COPY			! check for small copy/leaf case
	bgt,pt	%ncc, .kcopy_more		!
	nop
.kcopy_small:					! setup error handler
	sethi	%hi(.sm_copyerr), %o4
	or	%o4, %lo(.sm_copyerr), %o4	! .sm_copyerr is lofault value
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	! Note that we carefully do *not* flag the setting of
	! t_lofault.
	membar	#Sync				! sync error barrier
	b	.sm_do_copy			! common code
	stn	%o4, [THREAD_REG + T_LOFAULT]	! set t_lofault


.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! .copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	! Note that we carefully do *not* flag the setting of
	! t_lofault.
	membar	#Sync				! sync error barrier
	b	.do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

/*
 * We got here because of a fault during a small kcopy, or during a small
 * bcopy if a fault handler existed when bcopy was called.
 * No floating point registers are used by the small copies.
 * Small copies run as a leaf routine.
 * Errno value is in %g1.
 */
.sm_copyerr:
	! The kcopy will always set a t_lofault handler. If it fires,
	! we're expected to just return the error code and not to
	! invoke any existing error handler. As far as bcopy is concerned,
	! we only set t_lofault if there was an existing lofault handler.
	! In that case we're expected to invoke the previously existing
	! handler after resetting the t_lofault value.
	btst	LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	andn	%o5, LOFAULT_SET, %o5		! clear fault flag
	bnz,pn	%ncc, 3f
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0
3:
	! We're here via bcopy. There must have been an error handler
	! in place otherwise we would have died a nasty death already.
	jmp	%o5				! goto real handler
	mov	%g0, %o0
/*
 *  end of .sm_copyerr
 */

/*
 * We got here because of a fault during kcopy, or during bcopy if a fault
 * handler existed when bcopy was called.
 * The stack and fp registers need to be restored.
 * Errno value is in %g1.
 */
.copyerr:
	sethi	%hi(.copyerr2), %l1
	or	%l1, %lo(.copyerr2), %l1
	membar	#Sync				! sync error barrier
	stn	%l1, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %o5
	bz,pt	%xcc, 1f
	and	%o5, LOFAULT_SET, %l1	! copy flag to %l1

	membar	#Sync				! sync error barrier
	wr	%l5, 0, %gsr
	btst	FPRS_FEF, %g5
	bz,pt	%icc, 4f
	nop
	! restore fpregs from stack
	BLD_FP_FROMSTACK(%o2)
	ba,pt	%ncc, 2f
	wr	%g5, 0, %fprs		! restore fprs
4:
	FZERO
	wr	%g5, 0, %fprs		! restore fprs
2:
	ldn	[THREAD_REG + T_LWP], %o2
	brnz,pt	%o2, 1f
	nop

	ldsb	[THREAD_REG + T_PREEMPT], %l0
	deccc	%l0
	bnz,pn	%ncc, 1f
	stb	%l0, [THREAD_REG + T_PREEMPT]

	! Check for a kernel preemption request
	ldn	[THREAD_REG + T_CPU], %l0
	ldub	[%l0 + CPU_KPRUNRUN], %l0
	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
	or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag

	! The kcopy will always set a t_lofault handler. If it fires,
	! we're expected to just return the error code and not to
	! invoke any existing error handler. As far as bcopy is concerned,
	! we only set t_lofault if there was an existing lofault handler.
	! In that case we're expected to invoke the previously existing
	! handler after resetting the t_lofault value.
1:
	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault

	! call kpreempt if necessary
	btst	KPREEMPT_FLAG, %l1
	bz,pt	%icc, 2f
	nop
	call	kpreempt
	rdpr	%pil, %o0	! pass %pil
2:
	btst	LOFAULT_SET, %l1
	bnz,pn	%ncc, 3f
	nop
	ret
	restore	%g1, 0, %o0
3:
	! We're here via bcopy. There must have been an error handler
	! in place otherwise we would have died a nasty death already.
	jmp	%o5				! goto real handler
	restore	%g0, 0, %o0			! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	nop
/*
 *  end of .copyerr
 */

#else	/* NIAGARA_IMPL */
	save	%sp, -SA(MINFRAME), %sp
	set	.copyerr, %l7			! .copyerr is lofault value
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	or	%o5, LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	b	.do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

/*
 * We got here because of a fault during kcopy.
 * Errno value is in %g1.
 */
.copyerr:
	! The kcopy() *always* sets a t_lofault handler and ORs LOFAULT_SET
	! into %o5 to indicate that it has done so. We need to clear the
	! LOFAULT_SET flag before restoring the error handler.
	andn	%o5, LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	ret
	restore	%g1, 0, %o0
#endif	/* NIAGARA_IMPL */

	SET_SIZE(kcopy)


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 */

	ENTRY(bcopy)
#if !defined(NIAGARA_IMPL)
	cmp	%o2, FP_COPY			! check for small copy/leaf case
	bgt,pt	%ncc, .bcopy_more		!
	nop
.bcopy_small:					! setup error handler
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	tst	%o5
	bz,pt	%icc, .sm_do_copy
	sethi	%hi(.sm_copyerr), %o4
	or	%o4, %lo(.sm_copyerr), %o4	! .sm_copyerr is lofault value
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! set t_lofault
	or	%o5, LOFAULT_SET, %o5		! Error should trampoline
.sm_do_copy:
	mov	%o0, %g1		! save %o0
	cmp	%o2, SHORTCOPY		! make sure there is enough to align
	ble,pt	%ncc, .bc_smallest
	andcc	%o1, 0x7, %o3		! is dest long word aligned?
	bnz,pn	%ncc, .bc_align
	andcc	%o1, 1, %o3		! is dest byte aligned?

! Destination is long word aligned
.bc_al_src:
	andcc	%o0, 7, %o3
	brnz,pt	%o3, .bc_src_dst_unal8
	nop
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than FP_COPY bytes.
 * Also handles the finish-up for large block moves, so there may be
 * fewer than 32 bytes to move.
 */
.bc_medlong:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medl31
	nop
.bc_medl32:
	ldx	[%o0], %o4		! move 32 bytes
	subcc	%o2, 32, %o2		! decrement length count by 32
	stx	%o4, [%o1]
	ldx	[%o0+8], %o4
	stx	%o4, [%o1+8]
	ldx	[%o0+16], %o4
	add	%o0, 32, %o0		! increase src ptr by 32
	stx	%o4, [%o1+16]
	ldx	[%o0-8], %o4
	add	%o1, 32, %o1		! increase dst ptr by 32
	bgu,pt	%ncc, .bc_medl32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]
.bc_medl31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medl7		! skip if 7 or fewer bytes left
	nop
.bc_medl8:
	ldx	[%o0], %o4		! move 8 bytes
	add	%o0, 8, %o0		! increase src ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	add	%o1, 8, %o1		! increase dst ptr by 8
	bgu,pt	%ncc, .bc_medl8
	stx	%o4, [%o1-8]
.bc_medl7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bnz,pt	%ncc, .bc_small4	! do final bytes if not finished

.bc_smallx:				! finish up and exit
	tst	%o5
	bz,pt	%ncc, .bc_sm_done
	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
.bc_sm_done:
	retl
	mov	%g0, %o0

.bc_small4:
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	ld	[%o0], %o4		! move 4 bytes
	add	%o0, 4, %o0		! increase src ptr by 4
	add	%o1, 4, %o1		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bz,pt	%ncc, .bc_smallx
	stw	%o4, [%o1-4]

.bc_small3x:				! Exactly 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2		! reduce count for cc test
	ldub	[%o0], %o4		! load one byte
	bz,pt	%ncc, .bc_smallx
	stb	%o4, [%o1]		! store one byte
	ldub	[%o0+1], %o4		! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%ncc, .bc_smallx
	stb	%o4, [%o1+1]		! store second byte
	ldub	[%o0+2], %o4		! load third byte
	ba	.bc_smallx
	stb	%o4, [%o1+2]		! store third byte

.bc_smallest:				! 7 or fewer bytes remain
	tst	%o2
	bz,pt	%ncc, .bc_smallx
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x
	nop
	ldub	[%o0], %o4		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o4, [%o1]		! write byte
	ldub	[%o0+1], %o4		! repeat for total of 4 bytes
	add	%o0, 4, %o0		! advance src by 4
	stb	%o4, [%o1+1]
	ldub	[%o0-2], %o4
	add	%o1, 4, %o1		! advance dst by 4
	stb	%o4, [%o1-2]
	ldub	[%o0-1], %o4
	bnz,pt	%ncc, .bc_small3x
	stb	%o4, [%o1-1]
	ba	.bc_smallx
	nop

/*
 * Align destination to long word boundary
 */
.bc_align:				! byte align test in prior branch delay
	bnz,pt	%ncc, .bc_al_d1
.bc_al_d1f:				! dest is now half word aligned
	andcc	%o1, 2, %o3
	bnz,pt	%ncc, .bc_al_d2
.bc_al_d2f:				! dest is now word aligned
	andcc	%o1, 4, %o3		! is dest longword aligned?
	bz,pt	%ncc, .bc_al_src
	nop
.bc_al_d4:				! dest is word aligned;  src is unknown
	ldub	[%o0], %o4		! move a word (src align unknown)
	ldub	[%o0+1], %o3
	sll	%o4, 24, %o4		! position
	sll	%o3, 16, %o3		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%o0+2], %o4
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%o0+3], %o4
	or	%o4, %o3, %o4		! merge
	stw	%o4,[%o1]		! store four bytes
	add	%o0, 4, %o0		! adjust src by 4
	add	%o1, 4, %o1		! adjust dest by 4
	sub	%o2, 4, %o2		! adjust count by 4
	andcc	%o0, 7, %o3		! check for src long word alignment
	brz,pt	%o3, .bc_medlong
.bc_src_dst_unal8:
	! dst is 8-byte aligned, src is not
	! Size is less than FP_COPY
	! Following code is to select for alignment
	andcc	%o0, 0x3, %o3		! test word alignment
	bz,pt	%ncc, .bc_medword
	nop
	andcc	%o0, 0x1, %o3		! test halfword alignment
	bnz,pt	%ncc, .bc_med_byte	! go to byte move if not halfword
	andcc	%o0, 0x2, %o3		! test which byte alignment
	ba	.bc_medhalf
	nop
.bc_al_d1:				! align dest to half word
	ldub	[%o0], %o4		! move a byte
	add	%o0, 1, %o0
	stb	%o4, [%o1]
	add	%o1, 1, %o1
	andcc	%o1, 2, %o3
	bz,pt	%ncc, .bc_al_d2f
	sub	%o2, 1, %o2
.bc_al_d2:				! align dest to word
	ldub	[%o0], %o4		! move a half-word (src align unknown)
	ldub	[%o0+1], %o3
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o4		! merge
	sth	%o4, [%o1]
	add	%o0, 2, %o0
	add	%o1, 2, %o1
	andcc	%o1, 4, %o3		! is dest longword aligned?
	bz,pt	%ncc, .bc_al_src
	sub	%o2, 2, %o2
	ba	.bc_al_d4
	nop
/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * source and destination are in cache, for medium
 * to short data moves.
 */
.bc_medword:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medw31
	nop
.bc_medw32:
	ld	[%o0], %o4		! move a block of 32 bytes
	stw	%o4, [%o1]
	ld	[%o0+4], %o4
	stw	%o4, [%o1+4]
	ld	[%o0+8], %o4
	stw	%o4, [%o1+8]
	ld	[%o0+12], %o4
	stw	%o4, [%o1+12]
	ld	[%o0+16], %o4
	stw	%o4, [%o1+16]
	ld	[%o0+20], %o4
	subcc	%o2, 32, %o2		! decrement length count
	stw	%o4, [%o1+20]
	ld	[%o0+24], %o4
	add	%o0, 32, %o0		! increase src ptr by 32
	stw	%o4, [%o1+24]
	ld	[%o0-4], %o4
	add	%o1, 32, %o1		! increase dst ptr by 32
	bgu,pt	%ncc, .bc_medw32	! repeat if at least 32 bytes left
	stw	%o4, [%o1-4]
.bc_medw31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medw7		! skip if 7 or fewer bytes left
	nop				!
.bc_medw15:
	ld	[%o0], %o4		! move a block of 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	stw	%o4, [%o1]
	add	%o0, 8, %o0		! increase src ptr by 8
	ld	[%o0-4], %o4
	add	%o1, 8, %o1		! increase dst ptr by 8
	bgu,pt	%ncc, .bc_medw15
	stw	%o4, [%o1-4]
.bc_medw7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	ld	[%o0], %o4		! move 4 bytes
	add	%o0, 4, %o0		! increase src ptr by 4
	add	%o1, 4, %o1		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.bc_small3x
	stw	%o4, [%o1-4]
	ba	.bc_smallx
	nop

.bc_medhalf:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medh31
	nop
.bc_medh32:				! load and store block of 32 bytes
	subcc	%o2, 32, %o2		! decrement length count

	lduh	[%o0], %o4		! move 32 bytes
	lduw	[%o0+2], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0+6], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	lduh	[%o0+8], %o4
	lduw	[%o0+10], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0+14], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	lduh	[%o0+16], %o4
	lduw	[%o0+18], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0+22], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	lduh	[%o0-8], %o4
	lduw	[%o0-6], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0-2], %o4
	or	%o3, %o4, %o4
	bgu,pt	%ncc, .bc_medh32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.bc_medh31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medh7		! skip if 7 or fewer bytes left
	nop				!
.bc_medh15:
	lduh	[%o0], %o4		! move 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	lduw	[%o0+2], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	add	%o1, 8, %o1		! increase dst ptr by 8
	lduh	[%o0+6], %o4
	add	%o0, 8, %o0		! increase src ptr by 8
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medh15
	stx	%o4, [%o1-8]
.bc_medh7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	lduh	[%o0], %o4
	sll	%o4, 16, %o4
	lduh	[%o0+2], %o3
	or	%o3, %o4, %o4
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	add	%o1, 4, %o1
	bnz	.bc_small3x
	stw	%o4, [%o1-4]
	ba	.bc_smallx
	nop

	.align 16
.bc_med_byte:
	bnz,pt	%ncc, .bc_medbh32a	! go to correct byte move
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medb31
	nop
.bc_medb32:				! Alignment 1 or 5
	subcc	%o2, 32, %o2		! decrement length count

	ldub	[%o0], %o4		! load and store a block of 32 bytes
	sllx	%o4, 56, %o3
	lduh	[%o0+1], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+3], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	ldub	[%o0+8], %o4
	sllx	%o4, 56, %o3
	lduh	[%o0+9], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+11], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+15], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	ldub	[%o0+16], %o4
	sllx	%o4, 56, %o3
	lduh	[%o0+17], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+19], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+23], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	ldub	[%o0-8], %o4
	sllx	%o4, 56, %o3
	lduh	[%o0-7], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0-5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0-1], %o4
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medb32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.bc_medb31:				! 31 or fewer bytes remaining
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medb7		! skip if 7 or fewer bytes left
	nop				!
.bc_medb15:

	ldub	[%o0], %o4		! load and store a block of 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	sllx	%o4, 56, %o3
	lduh	[%o0+1], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+3], %o4
	add	%o1, 8, %o1		! increase dst ptr by 8
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	add	%o0, 8, %o0		! increase src ptr by 8
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medb15
	stx	%o4, [%o1-8]
.bc_medb7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	ldub	[%o0], %o4		! move 4 bytes
	sll	%o4, 24, %o3
	lduh	[%o0+1], %o4
	sll	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+3], %o4
	or	%o4, %o3, %o4
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	add	%o1, 4, %o1
	bnz	.bc_small3x
	stw	%o4, [%o1-4]
	ba	.bc_smallx
	nop

	.align 16
.bc_medbh32a:				! Alignment 3 or 7
	ble,pt	%ncc, .bc_medbh31
	nop
.bc_medbh32:				! Alignment 3 or 7
	subcc	%o2, 32, %o2		! decrement length count

	ldub	[%o0], %o4		! load and store a block of 32 bytes
	sllx	%o4, 56, %o3
	lduw	[%o0+1], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	ldub	[%o0+8], %o4
	sllx	%o4, 56, %o3
	lduw	[%o0+9], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+13], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+15], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	ldub	[%o0+16], %o4
	sllx	%o4, 56, %o3
	lduw	[%o0+17], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+21], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+23], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	ldub	[%o0-8], %o4
	sllx	%o4, 56, %o3
	lduw	[%o0-7], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0-3], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0-1], %o4
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medbh32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.bc_medbh31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medb7		! skip if 7 or fewer bytes left
	nop				!
.bc_medbh15:
	ldub	[%o0], %o4		! load and store a block of 8 bytes
	sllx	%o4, 56, %o3
	lduw	[%o0+1], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]
	subcc	%o2, 8, %o2		! decrement length count
	add	%o1, 8, %o1		! increase dst ptr by 8
	add	%o0, 8, %o0		! increase src ptr by 8
	bgu,pt	%ncc, .bc_medbh15
	stx	%o4, [%o1-8]
	ba	.bc_medb7
	nop

	SET_SIZE(bcopy)
/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 */
	ENTRY(bcopy_more)
.bcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	brz,pt	%o5, .do_copy
	nop
	sethi	%hi(.copyerr), %l7		! .copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	membar	#Sync				! sync error barrier
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
	! We've already captured whether t_lofault was zero on entry.
	! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path. If LOFAULT_SET is
	! set and the saved lofault was zero, we won't reset lofault on
	! returning.
	or	%o5, LOFAULT_SET, %o5
.do_copy:
	ldn	[THREAD_REG + T_LWP], %o3
	brnz,pt	%o3, 1f
	nop
/*
 * kpreempt_disable();
 */
	ldsb	[THREAD_REG + T_PREEMPT], %o3
	inc	%o3
	stb	%o3, [THREAD_REG + T_PREEMPT]
1:
/*
 * The following code is for large copies. We know there are at
 * least FP_COPY bytes available. FP regs are used, so
 * we save registers and fp regs before starting.
 */
	rd	%fprs, %g5		! check for unused fp
	or	%o5, FPUSED_FLAG, %o5
	! if fprs.fef == 0, set it.
	! Setting it when already set costs more than checking
	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
	bz,pt	%ncc, .bc_fp_unused
	prefetch [%i0 + (1 * CACHE_LINE)], #one_read
	BST_FP_TOSTACK(%o3)
	ba	.bc_fp_ready
.bc_fp_unused:
	andcc	%i1, 1, %o3		! is dest byte aligned?
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
.bc_fp_ready:
	rd	%gsr, %l5		! save %gsr value
	bnz,pt	%ncc, .bc_big_d1
.bc_big_d1f:				! dest is now half word aligned
	andcc	%i1, 2, %o3
	bnz,pt	%ncc, .bc_big_d2
.bc_big_d2f:				! dest is now word aligned
	andcc	%i1, 4, %o3
	bnz,pt	%ncc, .bc_big_d4
.bc_big_d4f:				! dest is now long word aligned
	andcc	%i0, 7, %o3		! is src long word aligned?
	brnz,pt	%o3, .bc_big_unal8
	prefetch [%i0 + (2 * CACHE_LINE)], #one_read

	! Src and dst are long word aligned
	! align dst to 64 byte boundary
	andcc	%i1, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .bc_al_to_64
	nop
	sub	%o3, 64, %o3		! %o3 has negative bytes to move
	add	%i2, %o3, %i2		! adjust remaining count
	andcc	%o3, 8, %o4		! odd long words to move?
	brz,pt	%o4, .bc_al_to_16
	nop
	add	%o3, 8, %o3
	ldx	[%i0], %o4
	add	%i0, 8, %i0		! increment src ptr
	add	%i1, 8, %i1		! increment dst ptr
	stx	%o4, [%i1-8]
! Dest is aligned on 16 bytes, src 8 byte aligned
.bc_al_to_16:
	andcc	%o3, 0x30, %o4		! pair of long words to move?
	brz,pt	%o4, .bc_al_to_64
	nop
.bc_al_mv_16:
	add	%o3, 16, %o3
	ldx	[%i0], %o4
	stx	%o4, [%i1]
	ldx	[%i0+8], %o4
	add	%i0, 16, %i0		! increment src ptr
	stx	%o4, [%i1+8]
	andcc	%o3, 48, %o4
	brnz,pt	%o4, .bc_al_mv_16
	add	%i1, 16, %i1		! increment dst ptr
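! The eight loops below are named .bc_aln_xyz after bits <5:3> of the
! source address (x = bit 5, y = bit 4, z = bit 3).  Each loop first
! pulls in the doublewords between src and the next 64-byte boundary
! with ldd, then stages each block load through fmovd ahead of the
! block store: .bc_aln_111 (src 8 bytes shy of a boundary) preloads
! one doubleword, .bc_aln_110 two, and so on down to .bc_aln_000,
! which needs none.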
! Dest is aligned on 64 bytes, src 8 byte aligned
.bc_al_to_64:
	! Determine source alignment
	! to correct 8 byte offset
	andcc	%i0, 32, %o3
	brnz,pn	%o3, .bc_aln_1
	andcc	%i0, 16, %o3
	brnz,pn	%o3, .bc_aln_01
	andcc	%i0, 8, %o3
	brz,pn	%o3, .bc_aln_000
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
	ba	.bc_aln_001
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read

.bc_aln_01:
	brnz,pn	%o3, .bc_aln_011
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
	ba	.bc_aln_010
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_1:
	andcc	%i0, 16, %o3
	brnz,pn	%o3, .bc_aln_11
	andcc	%i0, 8, %o3
	brnz,pn	%o3, .bc_aln_101
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
	ba	.bc_aln_100
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_11:
	brz,pn	%o3, .bc_aln_110
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read

.bc_aln_111:
! Alignment off by 8 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	add	%i0, 8, %i0
	sub	%i2, 8, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_111_loop:
	ldda	[%i0]ASI_BLK_P,%d16		! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d30, %d0
	bgt,pt	%ncc, .bc_aln_111_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	ba	.bc_remain_stuff
	add	%i1, 8, %i1
	! END OF aln_111

.bc_aln_110:
! Alignment off by 16 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	add	%i0, 16, %i0
	sub	%i2, 16, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_110_loop:
	ldda	[%i0]ASI_BLK_P,%d16		! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d28, %d0
	fmovd	%d30, %d2
	bgt,pt	%ncc, .bc_aln_110_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	ba	.bc_remain_stuff
	add	%i1, 16, %i1
	! END OF aln_110

.bc_aln_101:
! Alignment off by 24 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	add	%i0, 24, %i0
	sub	%i2, 24, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_101_loop:
	ldda	[%i0]ASI_BLK_P,%d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	fmovd	%d30, %d4
	bgt,pt	%ncc, .bc_aln_101_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	ba	.bc_remain_stuff
	add	%i1, 24, %i1
	! END OF aln_101

.bc_aln_100:
! Alignment off by 32 bytes
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16],%d4
	ldd	[%i0+24],%d6
	add	%i0, 32, %i0
	sub	%i2, 32, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_100_loop:
	ldda	[%i0]ASI_BLK_P,%d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	fmovd	%d30, %d6
	bgt,pt	%ncc, .bc_aln_100_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	ba	.bc_remain_stuff
	add	%i1, 32, %i1
	! END OF aln_100

.bc_aln_011:
! Alignment off by 40 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	ldd	[%i0+24], %d6
	ldd	[%i0+32], %d8
	add	%i0, 40, %i0
	sub	%i2, 40, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_011_loop:
	ldda	[%i0]ASI_BLK_P,%d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	fmovd	%d30, %d8
	bgt,pt	%ncc, .bc_aln_011_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	std	%d8, [%i1+32]
	ba	.bc_remain_stuff
	add	%i1, 40, %i1
	! END OF aln_011

.bc_aln_010:
! Alignment off by 48 bytes
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	ldd	[%i0+24], %d6
	ldd	[%i0+32], %d8
	ldd	[%i0+40], %d10
	add	%i0, 48, %i0
	sub	%i2, 48, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_010_loop:
	ldda	[%i0]ASI_BLK_P,%d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	fmovd	%d30, %d10
	bgt,pt	%ncc, .bc_aln_010_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	std	%d8, [%i1+32]
	std	%d10, [%i1+40]
	ba	.bc_remain_stuff
	add	%i1, 48, %i1
	! END OF aln_010

.bc_aln_001:
! Alignment off by 56 bytes
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	ldd	[%i0+24], %d6
	ldd	[%i0+32], %d8
	ldd	[%i0+40], %d10
	ldd	[%i0+48], %d12
	add	%i0, 56, %i0
	sub	%i2, 56, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_001_loop:
	ldda	[%i0]ASI_BLK_P,%d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d14
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	fmovd	%d30, %d12
	bgt,pt	%ncc, .bc_aln_001_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	std	%d8, [%i1+32]
	std	%d10, [%i1+40]
	std	%d12, [%i1+48]
	ba	.bc_remain_stuff
	add	%i1, 56, %i1
	! END OF aln_001

.bc_aln_000:
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_000_loop:
	ldda	[%i0]ASI_BLK_P,%d0
	subcc	%o3, 64, %o3
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	bgt,pt	%ncc, .bc_aln_000_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	! END OF aln_000

.bc_remain_stuff:
	subcc	%i2, 31, %i2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_aln_31
	nop
.bc_aln_32:
	ldx	[%i0], %o4		! move 32 bytes
	subcc	%i2, 32, %i2		! decrement length count by 32
	stx	%o4, [%i1]
	ldx	[%i0+8], %o4
	stx	%o4, [%i1+8]
	ldx	[%i0+16], %o4
	add	%i0, 32, %i0		! increase src ptr by 32
	stx	%o4, [%i1+16]
	ldx	[%i0-8], %o4
	add	%i1, 32, %i1		! increase dst ptr by 32
	bgu,pt	%ncc, .bc_aln_32	! repeat if at least 32 bytes left
	stx	%o4, [%i1-8]
.bc_aln_31:
	addcc	%i2, 24, %i2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_aln_7		! skip if 7 or fewer bytes left
	nop				!
.bc_aln_15:
	ldx	[%i0], %o4		! move 8 bytes
	add	%i0, 8, %i0		! increase src ptr by 8
	subcc	%i2, 8, %i2		! decrease count by 8
	add	%i1, 8, %i1		! increase dst ptr by 8
	bgu,pt	%ncc, .bc_aln_15
	stx	%o4, [%i1-8]		!
.bc_aln_7:
	addcc	%i2, 7, %i2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_exit		! exit if finished
	cmp	%i2, 4
	blt,pt	%ncc, .bc_unaln3x	! skip if less than 4 bytes left
	nop				!
	ld	[%i0], %o4		! move 4 bytes
	add	%i0, 4, %i0		! increase src ptr by 4
	add	%i1, 4, %i1		! increase dst ptr by 4
	subcc	%i2, 4, %i2		! decrease count by 4
	bnz	.bc_unaln3x
	stw	%o4, [%i1-4]
	ba	.bc_exit
	nop

	! destination alignment code
.bc_big_d1:
	ldub	[%i0], %o4		! move a byte
	add	%i0, 1, %i0
	stb	%o4, [%i1]
	add	%i1, 1, %i1
	andcc	%i1, 2, %o3
	bz,pt	%ncc, .bc_big_d2f
	sub	%i2, 1, %i2
.bc_big_d2:
	ldub	[%i0], %o4		! move a half-word (src align unknown)
	ldub	[%i0+1], %o3
	add	%i0, 2, %i0
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o4		! merge
	sth	%o4, [%i1]
	add	%i1, 2, %i1
	andcc	%i1, 4, %o3
	bz,pt	%ncc, .bc_big_d4f
	sub	%i2, 2, %i2
.bc_big_d4:
	ldub	[%i0], %o4		! move a word (src align unknown)
	ldub	[%i0+1], %o3
	sll	%o4, 24, %o4		! position
	sll	%o3, 16, %o3		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%i0+2], %o4
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%i0+3], %o4
	or	%o4, %o3, %o4		! merge
	stw	%o4,[%i1]		! store four bytes
	add	%i0, 4, %i0		! adjust src by 4
	add	%i1, 4, %i1		! adjust dest by 4
	ba	.bc_big_d4f
	sub	%i2, 4, %i2		! adjust count by 4


	! Dst is on 8 byte boundary; src is not;
.bc_big_unal8:
	andcc	%i1, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%ncc, .bc_unalnsrc
	sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%i2, %o3, %i2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%i0, 0x1, %o4
	bnz	%ncc, .bc_unalnbyte	! check for byte alignment
	nop
	andcc	%i0, 2, %o4		! check for half word alignment
	bnz	%ncc, .bc_unalnhalf
	nop
	! Src is word aligned, move bytes until dest 64 byte aligned
.bc_unalnword:
	ld	[%i0], %o4		! load 4 bytes
	stw	%o4, [%i1]		! and store 4 bytes
	ld	[%i0+4], %o4		! load 4 bytes
	add	%i0, 8, %i0		! increase src ptr by 8
	stw	%o4, [%i1+4]		! and store 4 bytes
	subcc	%o3, 8, %o3		! decrease count by 8
	bnz	%ncc, .bc_unalnword
	add	%i1, 8, %i1		! increase dst ptr by 8
	ba	.bc_unalnsrc
	nop

	! Src is half-word aligned, move bytes until dest 64 byte aligned
.bc_unalnhalf:
	lduh	[%i0], %o4		! load 2 bytes
	sllx	%o4, 32, %i3		! shift left
	lduw	[%i0+2], %o4
	or	%o4, %i3, %i3
	sllx	%i3, 16, %i3
	lduh	[%i0+6], %o4
	or	%o4, %i3, %i3
	stx	%i3, [%i1]
	add	%i0, 8, %i0
	subcc	%o3, 8, %o3
	bnz	%ncc, .bc_unalnhalf
	add	%i1, 8, %i1
	ba	.bc_unalnsrc
	nop

	! Src is byte aligned, move bytes until dest 64 byte aligned
.bc_unalnbyte:
	sub	%i1, %i0, %i1		! share pointer advance
.bc_unalnbyte_loop:
	ldub	[%i0], %o4
	sllx	%o4, 56, %i3
	lduh	[%i0+1], %o4
	sllx	%o4, 40, %o4
	or	%o4, %i3, %i3
	lduh	[%i0+3], %o4
	sllx	%o4, 24, %o4
	or	%o4, %i3, %i3
	lduh	[%i0+5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %i3, %i3
	ldub	[%i0+7], %o4
	or	%o4, %i3, %i3
	stx	%i3, [%i1+%i0]
	subcc	%o3, 8, %o3
	bnz	%ncc, .bc_unalnbyte_loop
	add	%i0, 8, %i0
	add	%i1, %i0, %i1		! restore pointer

	! Destination is now block (64 byte) aligned, src is not 8 byte aligned
.bc_unalnsrc:
	andn	%i2, 0x3f, %i3		! %i3 is multiple of block size
	and	%i2, 0x3f, %i2		! residue bytes in %i2
	add	%i2, 64, %i2		! ensure we don't load beyond
	sub	%i3, 64, %i3		! end of source buffer

	andn	%i0, 0x3f, %o4		! %o4 has block aligned src address
	prefetch [%o4 + (3 * CACHE_LINE)], #one_read
	alignaddr %i0, %g0, %g0		! generate %gsr
	add	%i0, %i3, %i0		! advance %i0 to after blocks
	!
	! Determine source alignment to correct 8 byte offset
	andcc	%i0, 0x20, %o3
	brnz,pn	%o3, .bc_unaln_1
	andcc	%i0, 0x10, %o3
	brnz,pn	%o3, .bc_unaln_01
	andcc	%i0, 0x08, %o3
	brz,a	%o3, .bc_unaln_000
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_001
	nop
.bc_unaln_01:
	brnz,a	%o3, .bc_unaln_011
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_010
	nop
.bc_unaln_1:
	brnz,pn	%o3, .bc_unaln_11
	andcc	%i0, 0x08, %o3
	brnz,a	%o3, .bc_unaln_101
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_100
	nop
.bc_unaln_11:
	brz,pn	%o3, .bc_unaln_110
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read

.bc_unaln_111:
	ldd	[%o4+56], %d14
.bc_unaln_111_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d14, %d16, %d48
	faligndata %d16, %d18, %d50
	faligndata %d18, %d20, %d52
	faligndata %d20, %d22, %d54
	faligndata %d22, %d24, %d56
	faligndata %d24, %d26, %d58
	faligndata %d26, %d28, %d60
	faligndata %d28, %d30, %d62
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_111_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_110:
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_110_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d12, %d14, %d48
	faligndata %d14, %d16, %d50
	faligndata %d16, %d18, %d52
	faligndata %d18, %d20, %d54
	faligndata %d20, %d22, %d56
	faligndata %d22, %d24, %d58
	faligndata %d24, %d26, %d60
	faligndata %d26, %d28, %d62
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_110_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_101:
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_101_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d10, %d12, %d48
	faligndata %d12, %d14, %d50
	faligndata %d14, %d16, %d52
	faligndata %d16, %d18, %d54
	faligndata %d18, %d20, %d56
	faligndata %d20, %d22, %d58
	faligndata %d22, %d24, %d60
	faligndata %d24, %d26, %d62
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_101_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_100:
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_100_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d8, %d10, %d48
	faligndata %d10, %d12, %d50
	faligndata %d12, %d14, %d52
	faligndata %d14, %d16, %d54
	faligndata %d16, %d18, %d56
	faligndata %d18, %d20, %d58
	faligndata %d20, %d22, %d60
	faligndata %d22, %d24, %d62
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_100_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_011:
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_011_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d6, %d8, %d48
	faligndata %d8, %d10, %d50
	faligndata %d10, %d12, %d52
	faligndata %d12, %d14, %d54
	faligndata %d14, %d16, %d56
	faligndata %d16, %d18, %d58
	faligndata %d18, %d20, %d60
	faligndata %d20, %d22, %d62
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_011_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_010:
	ldd	[%o4+16], %d4
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_010_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d4, %d6, %d48
	faligndata %d6, %d8, %d50
	faligndata %d8, %d10, %d52
	faligndata %d10, %d12, %d54
	faligndata %d12, %d14, %d56
	faligndata %d14, %d16, %d58
	faligndata %d16, %d18, %d60
	faligndata %d18, %d20, %d62
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_010_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_001:
	ldd	[%o4+8], %d2
	ldd	[%o4+16], %d4
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_001_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d2, %d4, %d48
	faligndata %d4, %d6, %d50
	faligndata %d6, %d8, %d52
	faligndata %d8, %d10, %d54
	faligndata %d10, %d12, %d56
	faligndata %d12, %d14, %d58
	faligndata %d14, %d16, %d60
	faligndata %d16, %d18, %d62
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_001_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_000:
	ldda	[%o4]ASI_BLK_P, %d0
.bc_unaln_000_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d0, %d2, %d48
	faligndata %d2, %d4, %d50
	faligndata %d4, %d6, %d52
	faligndata %d6, %d8, %d54
	faligndata %d8, %d10, %d56
	faligndata %d10, %d12, %d58
	faligndata %d12, %d14, %d60
	faligndata %d14, %d16, %d62
	fmovd	%d16, %d0
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_000_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read

1912.bc_unaln_done:
1913	! Handle trailing bytes, 64 to 127
1914	! Dest long word aligned, Src not long word aligned
1915	cmp	%i2, 15
1916	bleu	%ncc, .bc_unaln_short
1917
1918	andn	%i2, 0x7, %i3		! %i3 is multiple of 8
1919	and	%i2, 0x7, %i2		! residue bytes in %i2
1920	add	%i2, 8, %i2
1921	sub	%i3, 8, %i3		! ensure we don't load past end of src
1922	andn	%i0, 0x7, %o4		! %o4 has long word aligned src address
1923	add	%i0, %i3, %i0		! advance %i0 to after multiple of 8
1924	ldd	[%o4], %d0		! fetch partial word
1925.bc_unaln_by8:
1926	ldd	[%o4+8], %d2
1927	add	%o4, 8, %o4
1928	faligndata %d0, %d2, %d16
1929	subcc	%i3, 8, %i3
1930	std	%d16, [%i1]
1931	fmovd	%d2, %d0
1932	bgu,pt	%ncc, .bc_unaln_by8
1933	add	%i1, 8, %i1
1934
1935.bc_unaln_short:
1936	cmp	%i2, 8
1937	blt,pt	%ncc, .bc_unalnfin
1938	nop
1939	ldub	[%i0], %o4
1940	sll	%o4, 24, %o3
1941	ldub	[%i0+1], %o4
1942	sll	%o4, 16, %o4
1943	or	%o4, %o3, %o3
1944	ldub	[%i0+2], %o4
1945	sll	%o4, 8, %o4
1946	or	%o4, %o3, %o3
1947	ldub	[%i0+3], %o4
1948	or	%o4, %o3, %o3
1949	stw	%o3, [%i1]
1950	ldub	[%i0+4], %o4
1951	sll	%o4, 24, %o3
1952	ldub	[%i0+5], %o4
1953	sll	%o4, 16, %o4
1954	or	%o4, %o3, %o3
1955	ldub	[%i0+6], %o4
1956	sll	%o4, 8, %o4
1957	or	%o4, %o3, %o3
1958	ldub	[%i0+7], %o4
1959	or	%o4, %o3, %o3
1960	stw	%o3, [%i1+4]
1961	add	%i0, 8, %i0
1962	add	%i1, 8, %i1
1963	sub	%i2, 8, %i2
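	/*
	 * The two stw stores above assemble big-endian words one byte at
	 * a time.  In C terms (an illustrative sketch only; merge_be32 is
	 * an invented name, not part of this file):
	 *
	 *	static uint32_t
	 *	merge_be32(const uint8_t *s)
	 *	{
	 *		return (((uint32_t)s[0] << 24) | ((uint32_t)s[1] << 16) |
	 *		    ((uint32_t)s[2] << 8) | (uint32_t)s[3]);
	 *	}
	 */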
1964.bc_unalnfin:
1965	cmp	%i2, 4
1966	blt,pt	%ncc, .bc_unalnz
1967	tst	%i2
1968	ldub	[%i0], %o3		! read byte
1969	subcc	%i2, 4, %i2		! reduce count by 4
1970	sll	%o3, 24, %o3		! position
1971	ldub	[%i0+1], %o4
1972	sll	%o4, 16, %o4		! position
1973	or	%o4, %o3, %o3		! merge
1974	ldub	[%i0+2], %o4
1975	sll	%o4, 8, %o4		! position
1976	or	%o4, %o3, %o3		! merge
1977	add	%i1, 4, %i1		! advance dst by 4
1978	ldub	[%i0+3], %o4
1979	add	%i0, 4, %i0		! advance src by 4
1980	or	%o4, %o3, %o4		! merge
1981	bnz,pt	%ncc, .bc_unaln3x
1982	stw	%o4, [%i1-4]
1983	ba	.bc_exit
1984	nop
1985.bc_unalnz:
1986	bz,pt	%ncc, .bc_exit
1987.bc_unaln3x:				! Exactly 1, 2, or 3 bytes remain
1988	subcc	%i2, 1, %i2		! reduce count for cc test
1989	ldub	[%i0], %o4		! load one byte
1990	bz,pt	%ncc, .bc_exit
1991	stb	%o4, [%i1]		! store one byte
1992	ldub	[%i0+1], %o4		! load second byte
1993	subcc	%i2, 1, %i2
1994	bz,pt	%ncc, .bc_exit
1995	stb	%o4, [%i1+1]		! store second byte
1996	ldub	[%i0+2], %o4		! load third byte
1997	stb	%o4, [%i1+2]		! store third byte
1998.bc_exit:
1999	wr	%l5, %g0, %gsr		! restore %gsr
2000	brnz	%g5, .bc_fp_restore
2001	and	%o5, COPY_FLAGS, %l1	! save flags in %l1
2002	FZERO
2003	wr	%g5, %g0, %fprs
2004	ba,pt	%ncc, .bc_ex2
2005	nop
2006.bc_fp_restore:
2007	BLD_FP_FROMSTACK(%o4)
2008.bc_ex2:
2009	ldn	[THREAD_REG + T_LWP], %o2
2010	brnz,pt	%o2, 1f
2011	nop
2012
2013	ldsb	[THREAD_REG + T_PREEMPT], %l0
2014	deccc	%l0
2015	bnz,pn	%ncc, 1f
2016	stb	%l0, [THREAD_REG + T_PREEMPT]
2017
2018	! Check for a kernel preemption request
2019	ldn	[THREAD_REG + T_CPU], %l0
2020	ldub	[%l0 + CPU_KPRUNRUN], %l0
2021	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
2022	or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
20231:
2024	btst	LOFAULT_SET, %l1
2025	bz,pn	%icc, 3f
2026	andncc	%o5, COPY_FLAGS, %o5
2027	! Here via bcopy. Check to see if the handler was NULL.
2028	! If so, just return quietly. Otherwise, reset the
2029	! handler and return.
2030	bz,pn %ncc, 2f
2031	nop
2032	membar	#Sync
2033	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
20342:
2035	btst	KPREEMPT_FLAG, %l1
2036	bz,pt	%icc, 3f
2037	nop
2038	call	kpreempt
2039	rdpr	%pil, %o0		! pass %pil
20403:
2041	ret
2042	restore	%g0, 0, %o0
2043
2044	SET_SIZE(bcopy_more)
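/*
 * Rough pseudo-code for the .bc_exit path above, in the style of the
 * header comment (illustrative only; flag names as used in this file):
 *
 *	restore %gsr from %l5;
 *	if (%g5 != 0)			! fp regs were saved on entry
 *		restore fp regs from stack;
 *	else
 *		FZERO; restore %fprs;
 *	if (curthread->t_lwp == NULL &&
 *	    --curthread->t_preempt == 0 && CPU->cpu_kprunrun)
 *		flags |= KPREEMPT_FLAG;
 *	if (flags & LOFAULT_SET) {
 *		if ((old = (%o5 & ~COPY_FLAGS)) != NULL)
 *			curthread->t_lofault = old;	! restore handler
 *		if (flags & KPREEMPT_FLAG)
 *			kpreempt(%pil);
 *	}
 *	return (0);
 */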
2045
2046
2047#else	/* NIAGARA_IMPL */
2048	save	%sp, -SA(MINFRAME), %sp
2049	clr	%o5			! flag LOFAULT_SET is not set for bcopy
2050.do_copy:
2051	cmp	%i2, 12			! for small counts
2052	blu	%ncc, .bytecp		! just copy bytes
2053	.empty
2054
2055	cmp	%i2, 128		! for less than 128 bytes
2056	blu,pn	%ncc, .bcb_punt		! no block st/quad ld
2057	nop
2058
2059	set	use_hw_bcopy, %o2
2060	ld	[%o2], %o2
2061	brz,pn	%o2, .bcb_punt
2062	nop
2063
2064	subcc	%i1, %i0, %i3
2065	bneg,a,pn %ncc, 1f
2066	neg	%i3
20671:
2068	/*
2069	 * Compare against 256 since we should be checking block addresses
2070	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
2071	 * src = dest + (64 * 3) + 63.
2072	 */
2073	cmp	%i3, 256
2074	blu,pn	%ncc, .bcb_punt
2075	nop
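	/*
	 * In C terms the two punt checks reduce to roughly the following
	 * (illustrative sketch; can_block_copy is an invented name, and
	 * the use_hw_bcopy tunable check is omitted):
	 *
	 *	static int
	 *	can_block_copy(uintptr_t src, uintptr_t dst, size_t len)
	 *	{
	 *		uintptr_t dist = (dst > src) ? dst - src : src - dst;
	 *
	 *		return (len >= 128 && dist >= 256);
	 *	}
	 */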
2076
2077	/*
2078	 * Copies that reach here have at least 2 blocks of data to copy.
2079	 */
2080.do_blockcopy:
2081	! Swap src/dst since the code below is memcpy code
2082	! and memcpy/bcopy have different calling sequences
2083	mov	%i1, %i5
2084	mov	%i0, %i1
2085	mov	%i5, %i0
2086
2087	! Block (64 bytes) align the destination.
2088	andcc	%i0, 0x3f, %i3		! is dst aligned on a 64-byte boundary
2089	bz	%xcc, .chksrc		! dst is already block aligned
2090	sub	%i3, 0x40, %i3
2091	neg	%i3			! bytes till dst 64 bytes aligned
2092	sub	%i2, %i3, %i2		! update i2 with new count
2093
2094	! Based on source and destination alignment do
2095	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
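	/*
	 * The dispatch below, as an illustrative C sketch (copy_chunk is
	 * an invented name): pick the widest unit both pointers support.
	 *
	 *	static size_t
	 *	copy_chunk(uintptr_t src, uintptr_t dst)
	 *	{
	 *		uintptr_t a = src | dst;
	 *
	 *		if ((a & 0x7) == 0)
	 *			return (8);	! ldx/stx
	 *		if ((a & 0x3) == 0)
	 *			return (4);	! ld/st
	 *		if ((a & 0x1) == 0)
	 *			return (2);	! lduh/stuh
	 *		return (1);		! ldub/stb
	 *	}
	 */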
2096
2097	! Is dst & src 8B aligned
2098	or	%i0, %i1, %o2
2099	andcc	%o2, 0x7, %g0
2100	bz	%ncc, .alewdcp
2101	nop
2102
2103	! Is dst & src 4B aligned
2104	andcc	%o2, 0x3, %g0
2105	bz	%ncc, .alwdcp
2106	nop
2107
2108	! Is dst & src 2B aligned
2109	andcc	%o2, 0x1, %g0
2110	bz	%ncc, .alhlfwdcp
2111	nop
2112
2113	! 1B aligned
21141:	ldub	[%i1], %o2
2115	stb	%o2, [%i0]
2116	inc	%i1
2117	deccc	%i3
2118	bgu,pt	%ncc, 1b
2119	inc	%i0
2120
2121	ba	.chksrc
2122	nop
2123
2124	! dst & src 4B aligned
2125.alwdcp:
2126	ld	[%i1], %o2
2127	st	%o2, [%i0]
2128	add	%i1, 0x4, %i1
2129	subcc	%i3, 0x4, %i3
2130	bgu,pt	%ncc, .alwdcp
2131	add	%i0, 0x4, %i0
2132
2133	ba	.chksrc
2134	nop
2135
2136	! dst & src 2B aligned
2137.alhlfwdcp:
2138	lduh	[%i1], %o2
2139	stuh	%o2, [%i0]
2140	add	%i1, 0x2, %i1
2141	subcc	%i3, 0x2, %i3
2142	bgu,pt	%ncc, .alhlfwdcp
2143	add	%i0, 0x2, %i0
2144
2145	ba	.chksrc
2146	nop
2147
2148	! dst & src 8B aligned
2149.alewdcp:
2150	ldx	[%i1], %o2
2151	stx	%o2, [%i0]
2152	add	%i1, 0x8, %i1
2153	subcc	%i3, 0x8, %i3
2154	bgu,pt	%ncc, .alewdcp
2155	add	%i0, 0x8, %i0
2156
2157	! Now Destination is block (64 bytes) aligned
2158.chksrc:
2159	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
2160	sub	%i2, %i3, %i2		! Residue bytes in %i2
2161
2162	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2163
2164	andcc	%i1, 0xf, %o2		! is src quadword aligned
2165	bz,pn	%xcc, .blkcpy		! src offset in %o2
2166	nop
2167	cmp	%o2, 0x8
2168	bg	.cpy_upper_double
2169	nop
2170	bl	.cpy_lower_double
2171	nop
2172
2173	! Falls through when the source offset equals 8, i.e. the
2174	! source is double word aligned.
2175	! In this case no shift/merge of data is required.
2176	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2177	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2178	prefetch [%l0+0x0], #one_read
2179	ldda	[%i1+0x0]%asi, %l2
2180loop0:
2181	ldda	[%i1+0x10]%asi, %l4
2182	prefetch [%l0+0x40], #one_read
2183
2184	stxa	%l3, [%i0+0x0]%asi
2185	stxa	%l4, [%i0+0x8]%asi
2186
2187	ldda	[%i1+0x20]%asi, %l2
2188	stxa	%l5, [%i0+0x10]%asi
2189	stxa	%l2, [%i0+0x18]%asi
2190
2191	ldda	[%i1+0x30]%asi, %l4
2192	stxa	%l3, [%i0+0x20]%asi
2193	stxa	%l4, [%i0+0x28]%asi
2194
2195	ldda	[%i1+0x40]%asi, %l2
2196	stxa	%l5, [%i0+0x30]%asi
2197	stxa	%l2, [%i0+0x38]%asi
2198
2199	add	%l0, 0x40, %l0
2200	add	%i1, 0x40, %i1
2201	subcc	%i3, 0x40, %i3
2202	bgu,pt	%xcc, loop0
2203	add	%i0, 0x40, %i0
2204	ba	.blkdone
2205	add	%i1, %o2, %i1		! increment the source by src offset
2206					! the src offset was stored in %o2
2207
2208.cpy_lower_double:
2209	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2210	sll	%o2, 3, %o0		! %o0 left shift
2211	mov	0x40, %o1
2212	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2213	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2214	prefetch [%l0+0x0], #one_read
2215	ldda	[%i1+0x0]%asi, %l2	! partial data in %l2 and %l3 has
2216					! complete data
2217loop1:
2218	ldda	[%i1+0x10]%asi, %l4	! %l4 has partial data for this read.
2219	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
2220							! into %l2 and %l3
2221	prefetch [%l0+0x40], #one_read
2222	stxa	%l2, [%i0+0x0]%asi
2223	stxa	%l3, [%i0+0x8]%asi
2224
2225	ldda	[%i1+0x20]%asi, %l2
2226	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
2227	stxa	%l4, [%i0+0x10]%asi			! %l4 from previous read
2228	stxa	%l5, [%i0+0x18]%asi			! into %l4 and %l5
2229
2230	! Repeat the same for next 32 bytes.
2231
2232	ldda	[%i1+0x30]%asi, %l4
2233	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
2234	stxa	%l2, [%i0+0x20]%asi
2235	stxa	%l3, [%i0+0x28]%asi
2236
2237	ldda	[%i1+0x40]%asi, %l2
2238	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
2239	stxa	%l4, [%i0+0x30]%asi
2240	stxa	%l5, [%i0+0x38]%asi
2241
2242	add	%l0, 0x40, %l0
2243	add	%i1, 0x40, %i1
2244	subcc	%i3, 0x40, %i3
2245	bgu,pt	%xcc, loop1
2246	add	%i0, 0x40, %i0
2247	ba	.blkdone
2248	add	%i1, %o2, %i1		! increment the source by src offset
2249					! the src offset was stored in %o2
2250
2251.cpy_upper_double:
2252	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2253	mov	0x8, %o0
2254	sub	%o2, %o0, %o0
2255	sll	%o0, 3, %o0		! %o0 left shift
2256	mov	0x40, %o1
2257	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2258	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2259	prefetch [%l0+0x0], #one_read
2260	ldda	[%i1+0x0]%asi, %l2	! partial data in %l3 for this read and
2261					! no data in %l2
2262loop2:
2263	ldda	[%i1+0x10]%asi, %l4	! %l4 has complete data and %l5 has
2264					! partial
2265	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
2266							! into %l3 and %l4
2267	prefetch [%l0+0x40], #one_read
2268	stxa	%l3, [%i0+0x0]%asi
2269	stxa	%l4, [%i0+0x8]%asi
2270
2271	ldda	[%i1+0x20]%asi, %l2
2272	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
2273	stxa	%l5, [%i0+0x10]%asi			! %l5 from previous read
2274	stxa	%l2, [%i0+0x18]%asi			! into %l5 and %l2
2275
2276	! Repeat the same for next 32 bytes.
2277
2278	ldda	[%i1+0x30]%asi, %l4
2279	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
2280	stxa	%l3, [%i0+0x20]%asi
2281	stxa	%l4, [%i0+0x28]%asi
2282
2283	ldda	[%i1+0x40]%asi, %l2
2284	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
2285	stxa	%l5, [%i0+0x30]%asi
2286	stxa	%l2, [%i0+0x38]%asi
2287
2288	add	%l0, 0x40, %l0
2289	add	%i1, 0x40, %i1
2290	subcc	%i3, 0x40, %i3
2291	bgu,pt	%xcc, loop2
2292	add	%i0, 0x40, %i0
2293	ba	.blkdone
2294	add	%i1, %o2, %i1		! increment the source by src offset
2295					! the src offset was stored in %o2
2296
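	/*
	 * A hedged C sketch of what each ALIGN_DATA invocation above
	 * computes (the real macro is defined elsewhere; align_merge is
	 * an invented name).  It joins the tail of the previous 8 source
	 * bytes to the head of the next 8, using the left/right shift
	 * counts set up in %o0/%o1; assumes 0 < lshift < 64:
	 *
	 *	static uint64_t
	 *	align_merge(uint64_t prev, uint64_t next, unsigned lshift)
	 *	{
	 *		return ((prev << lshift) | (next >> (64 - lshift)));
	 *	}
	 */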
2297
2298	! Both Source and Destination are block aligned.
2299	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2300.blkcpy:
2301	prefetch [%i1+0x0], #one_read
23021:
2303	ldda	[%i1+0x0]%asi, %l0
2304	ldda	[%i1+0x10]%asi, %l2
2305	prefetch [%i1+0x40], #one_read
2306
2307	stxa	%l0, [%i0+0x0]%asi
2308	ldda	[%i1+0x20]%asi, %l4
2309	ldda	[%i1+0x30]%asi, %l6
2310
2311	stxa	%l1, [%i0+0x8]%asi
2312	stxa	%l2, [%i0+0x10]%asi
2313	stxa	%l3, [%i0+0x18]%asi
2314	stxa	%l4, [%i0+0x20]%asi
2315	stxa	%l5, [%i0+0x28]%asi
2316	stxa	%l6, [%i0+0x30]%asi
2317	stxa	%l7, [%i0+0x38]%asi
2318
2319	add	%i1, 0x40, %i1
2320	subcc	%i3, 0x40, %i3
2321	bgu,pt	%xcc, 1b
2322	add	%i0, 0x40, %i0
2323
2324.blkdone:
2325	membar	#Sync
2326
2327	brz,pt	%i2, .blkexit
2328	nop
2329
2330	! Handle trailing bytes
2331	cmp	%i2, 0x8
2332	blu,pt	%ncc, .residue
2333	nop
2334
2335	! Can we do some 8B ops
2336	or	%i1, %i0, %o2
2337	andcc	%o2, 0x7, %g0
2338	bnz	%ncc, .last4
2339	nop
2340
2341	! Do 8byte ops as long as possible
2342.last8:
2343	ldx	[%i1], %o2
2344	stx	%o2, [%i0]
2345	add	%i1, 0x8, %i1
2346	sub	%i2, 0x8, %i2
2347	cmp	%i2, 0x8
2348	bgu,pt	%ncc, .last8
2349	add	%i0, 0x8, %i0
2350
2351	brz,pt	%i2, .blkexit
2352	nop
2353
2354	ba	.residue
2355	nop
2356
2357.last4:
2358	! Can we do 4B ops
2359	andcc	%o2, 0x3, %g0
2360	bnz	%ncc, .last2
2361	nop
23621:
2363	ld	[%i1], %o2
2364	st	%o2, [%i0]
2365	add	%i1, 0x4, %i1
2366	sub	%i2, 0x4, %i2
2367	cmp	%i2, 0x4
2368	bgu,pt	%ncc, 1b
2369	add	%i0, 0x4, %i0
2370
2371	brz,pt	%i2, .blkexit
2372	nop
2373
2374	ba	.residue
2375	nop
2376
2377.last2:
2378	! Can we do 2B ops
2379	andcc	%o2, 0x1, %g0
2380	bnz	%ncc, .residue
2381	nop
2382
23831:
2384	lduh	[%i1], %o2
2385	stuh	%o2, [%i0]
2386	add	%i1, 0x2, %i1
2387	sub	%i2, 0x2, %i2
2388	cmp	%i2, 0x2
2389	bgu,pt	%ncc, 1b
2390	add	%i0, 0x2, %i0
2391
2392	brz,pt	%i2, .blkexit
2393	nop
2394
2395.residue:
2396	ldub	[%i1], %o2
2397	stb	%o2, [%i0]
2398	inc	%i1
2399	deccc	%i2
2400	bgu,pt	%ncc, .residue
2401	inc	%i0
2402
2403.blkexit:
2404
2405	membar	#Sync				! sync error barrier
2406	! Restore t_lofault handler if we came here from kcopy().
2407	tst	%o5
2408	bz	%ncc, 1f
2409	andn	%o5, LOFAULT_SET, %o5
2410	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
24111:
2412	ret
2413	restore	%g0, 0, %o0
2414
2415
2416.bcb_punt:
2417	!
2418	! use aligned transfers where possible
2419	!
2420	xor	%i0, %i1, %o4		! xor from and to address
2421	btst	7, %o4			! if lower three bits zero
2422	bz	.aldoubcp		! can align on double boundary
2423	.empty	! suppress assembler complaint about label in delay slot
2424
2425	xor	%i0, %i1, %o4		! xor from and to address
2426	btst	3, %o4			! if lower two bits zero
2427	bz	.alwordcp		! can align on word boundary
2428	btst	3, %i0			! delay slot, from address unaligned?
2429	!
2430	! use aligned reads and writes where possible
2431	! this differs from wordcp in that it copes
2432	! with odd alignment between source and destination
2433	! using word reads and writes with the proper shifts
2434	! in between to align transfers to and from memory
2435	! i0 - src address, i1 - dest address, i2 - count
2436	! i3, i4 - tmps used for generating a complete word
2437	! i5 (word to write)
2438	! l0 size in bits of upper part of source word (US)
2439	! l1 size in bits of lower part of source word (LS = 32 - US)
2440	! l2 size in bits of upper part of destination word (UD)
2441	! l3 size in bits of lower part of destination word (LD = 32 - UD)
2442	! l4 number of bytes leftover after aligned transfers complete
2443	! l5 the number 32
2444	!
2445	mov	32, %l5			! load an oft-needed constant
2446	bz	.align_dst_only
2447	btst	3, %i1			! is destination address aligned?
2448	clr	%i4			! clear registers used in either case
2449	bz	.align_src_only
2450	clr	%l0
2451	!
2452	! both source and destination addresses are unaligned
2453	!
24541:					! align source
2455	ldub	[%i0], %i3		! read a byte from source address
2456	add	%i0, 1, %i0		! increment source address
2457	or	%i4, %i3, %i4		! or in with previous bytes (if any)
2458	btst	3, %i0			! is source aligned?
2459	add	%l0, 8, %l0		! increment size of upper source (US)
2460	bnz,a	1b
2461	sll	%i4, 8, %i4		! make room for next byte
2462
2463	sub	%l5, %l0, %l1		! generate shift left count (LS)
2464	sll	%i4, %l1, %i4		! prepare to get rest
2465	ld	[%i0], %i3		! read a word
2466	add	%i0, 4, %i0		! increment source address
2467	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
2468	or	%i4, %i5, %i5		! merge
2469	mov	24, %l3			! align destination
24701:
2471	srl	%i5, %l3, %i4		! prepare to write a single byte
2472	stb	%i4, [%i1]		! write a byte
2473	add	%i1, 1, %i1		! increment destination address
2474	sub	%i2, 1, %i2		! decrement count
2475	btst	3, %i1			! is destination aligned?
2476	bnz,a	1b
2477	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
2478	sub	%l5, %l3, %l2		! generate shift left count (UD)
2479	sll	%i5, %l2, %i5		! move leftover into upper bytes
2480	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
2481	bgu	%ncc, .more_needed	! need more to fill than we have
2482	nop
2483
2484	sll	%i3, %l1, %i3		! clear upper used byte(s)
2485	srl	%i3, %l1, %i3
2486	! get the odd bytes between alignments
2487	sub	%l0, %l2, %l0		! regenerate shift count
2488	sub	%l5, %l0, %l1		! generate new shift left count (LS)
2489	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
2490	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
2491	srl	%i3, %l0, %i4
2492	or	%i5, %i4, %i5
2493	st	%i5, [%i1]		! write a word
2494	subcc	%i2, 4, %i2		! decrement count
2495	bz	%ncc, .unalign_out
2496	add	%i1, 4, %i1		! increment destination address
2497
2498	b	2f
2499	sll	%i3, %l1, %i5		! get leftover into upper bits
2500.more_needed:
2501	sll	%i3, %l0, %i3		! save remaining byte(s)
2502	srl	%i3, %l0, %i3
2503	sub	%l2, %l0, %l1		! regenerate shift count
2504	sub	%l5, %l1, %l0		! generate new shift left count
2505	sll	%i3, %l1, %i4		! move to fill empty space
2506	b	3f
2507	or	%i5, %i4, %i5		! merge to complete word
2508	!
2509	! the source address is aligned and destination is not
2510	!
2511.align_dst_only:
2512	ld	[%i0], %i4		! read a word
2513	add	%i0, 4, %i0		! increment source address
2514	mov	24, %l0			! initial shift alignment count
25151:
2516	srl	%i4, %l0, %i3		! prepare to write a single byte
2517	stb	%i3, [%i1]		! write a byte
2518	add	%i1, 1, %i1		! increment destination address
2519	sub	%i2, 1, %i2		! decrement count
2520	btst	3, %i1			! is destination aligned?
2521	bnz,a	1b
2522	sub	%l0, 8, %l0		! delay slot, decrement shift count
2523.xfer:
2524	sub	%l5, %l0, %l1		! generate shift left count
2525	sll	%i4, %l1, %i5		! get leftover
25263:
2527	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
2528	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
25292:
2530	ld	[%i0], %i3		! read a source word
2531	add	%i0, 4, %i0		! increment source address
2532	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
2533	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
2534	st	%i5, [%i1]		! write a destination word
2535	subcc	%i2, 4, %i2		! decrement count
2536	bz	%ncc, .unalign_out	! check if done
2537	add	%i1, 4, %i1		! increment destination address
2538	b	2b			! loop
2539	sll	%i3, %l1, %i5		! get leftover
2540.unalign_out:
2541	tst	%l4			! any bytes leftover?
2542	bz	%ncc, .cpdone
2543	.empty				! allow next instruction in delay slot
25441:
2545	sub	%l0, 8, %l0		! decrement shift
2546	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
2547	stb	%i4, [%i1]		! write a byte
2548	subcc	%l4, 1, %l4		! decrement count
2549	bz	%ncc, .cpdone		! done?
2550	add	%i1, 1, %i1		! increment destination
2551	tst	%l0			! any more previously read bytes
2552	bnz	%ncc, 1b		! we have leftover bytes
2553	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
2554	b	.dbytecp		! let dbytecp do the rest
2555	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
2556	!
2557	! the destination address is aligned and the source is not
2558	!
2559.align_src_only:
2560	ldub	[%i0], %i3		! read a byte from source address
2561	add	%i0, 1, %i0		! increment source address
2562	or	%i4, %i3, %i4		! or in with previous bytes (if any)
2563	btst	3, %i0			! is source aligned?
2564	add	%l0, 8, %l0		! increment shift count (US)
2565	bnz,a	.align_src_only
2566	sll	%i4, 8, %i4		! make room for next byte
2567	b,a	.xfer
2568	!
2569	! if the from address is unaligned for double-word moves,
2570	! move bytes till it is; if count is < 56 it could take
2571	! longer to align for doubles than to do the transfer
2572	! in word-size chunks right away
2573	!
2574.aldoubcp:
2575	cmp	%i2, 56			! if count < 56, use wordcp, it takes
2576	blu,a	%ncc, .alwordcp		! longer to align doubles than words
2577	mov	3, %o0			! mask for word alignment
2578	call	.alignit		! copy bytes until aligned
2579	mov	7, %o0			! mask for double alignment
2580	!
2581	! source and destination are now double-word aligned
2582	! i3 has aligned count returned by alignit
2583	!
2584	and	%i2, 7, %i2		! unaligned leftover count
2585	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
25865:
2587	ldx	[%i0+%i1], %o4		! read from address
2588	stx	%o4, [%i1]		! write at destination address
2589	subcc	%i3, 8, %i3		! dec count
2590	bgu	%ncc, 5b
2591	add	%i1, 8, %i1		! delay slot, inc to address
2592	cmp	%i2, 4			! see if we can copy a word
2593	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
2594	.empty
2595	!
2596	! for leftover bytes we fall into wordcp, if needed
2597	!
2598.wordcp:
2599	and	%i2, 3, %i2		! unaligned leftover count
26005:
2601	ld	[%i0+%i1], %o4		! read from address
2602	st	%o4, [%i1]		! write at destination address
2603	subcc	%i3, 4, %i3		! dec count
2604	bgu	%ncc, 5b
2605	add	%i1, 4, %i1		! delay slot, inc to address
2606	b,a	.dbytecp
2607
2608	! we come here to align copies on word boundaries
2609.alwordcp:
2610	call	.alignit		! go word-align it
2611	mov	3, %o0			! bits that must be zero to be aligned
2612	b	.wordcp
2613	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
2614
2615	!
2616	! byte copy, works with any alignment
2617	!
2618.bytecp:
2619	b	.dbytecp
2620	sub	%i0, %i1, %i0		! i0 gets difference of src and dst
2621
2622	!
2623	! differenced byte copy, works with any alignment
2624	! assumes dest in %i1 and (source - dest) in %i0
2625	!
26261:
2627	stb	%o4, [%i1]		! write to address
2628	inc	%i1			! inc to address
2629.dbytecp:
2630	deccc	%i2			! dec count
2631	bgeu,a	%ncc, 1b		! loop till done
2632	ldub	[%i0+%i1], %o4		! read from address
2633.cpdone:
2634
2635	membar	#Sync				! sync error barrier
2636	! Restore t_lofault handler if we came here from kcopy().
2637	tst	%o5
2638	bz	%ncc, 1f
2639	andn	%o5, LOFAULT_SET, %o5
2640	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
26411:
2642	ret
2643	restore %g0, 0, %o0		! return (0)
2644
2645/*
2646 * Common code used to align transfers on word and doubleword
2647 * boundaries.  Aligns source and destination and returns a count
2648 * of aligned bytes to transfer in %i3
2649 */
26501:
2651	inc	%i0			! inc from
2652	stb	%o4, [%i1]		! write a byte
2653	inc	%i1			! inc to
2654	dec	%i2			! dec count
2655.alignit:
2656	btst	%o0, %i0		! %o0 is bit mask to check for alignment
2657	bnz,a	1b
2658	ldub	[%i0], %o4		! read next byte
2659
2660	retl
2661	andn	%i2, %o0, %i3		! return size of aligned bytes
2662
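/*
 * Illustrative C sketch of .alignit (names invented): copy single bytes
 * until src is aligned to (mask + 1), then report how many of the
 * remaining bytes can move in aligned units (the count returned in %i3).
 *
 *	static size_t
 *	alignit_sketch(uint8_t **src, uint8_t **dst, size_t *cnt,
 *	    uintptr_t mask)
 *	{
 *		while (((uintptr_t)*src & mask) != 0) {
 *			*(*dst)++ = *(*src)++;
 *			(*cnt)--;
 *		}
 *		return (*cnt & ~mask);
 *	}
 */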
2663	SET_SIZE(bcopy)
2664
2665#endif	/* NIAGARA_IMPL */
2666
2667/*
2668 * Block copy with possibly overlapped operands.
2669 */
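/*
 * The direction choice below, as an illustrative C sketch (the name
 * ovbcopy_sketch is invented; assumes the usual bcopy prototype):
 *
 *	static void
 *	ovbcopy_sketch(const char *from, char *to, size_t count)
 *	{
 *		size_t dist = (from > to) ? (size_t)(from - to) :
 *		    (size_t)(to - from);
 *
 *		if (count == 0)
 *			return;
 *		if (count <= dist) {
 *			bcopy(from, to, count);		! no overlap
 *		} else if (from > to) {
 *			while (count--)			! copy forwards
 *				*to++ = *from++;
 *		} else {
 *			while (count--)			! copy backwards
 *				to[count] = from[count];
 *		}
 *	}
 */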
2670
2671	ENTRY(ovbcopy)
2672	tst	%o2			! check count
2673	bgu,a	%ncc, 1f		! count > 0; go check for overlap
2674	subcc	%o0, %o1, %o3		! delay slot, diff of from and to address
2675
2676	retl				! return; nothing to do or bad count
2677	nop
26781:
2679	bneg,a	%ncc, 2f
2680	neg	%o3			! if < 0, make it positive
26812:	cmp	%o2, %o3		! cmp size and abs(from - to)
2682	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
2683	.empty				!   no overlap
2684	cmp	%o0, %o1		! compare from and to addresses
2685	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
2686	nop
2687	!
2688	! Copy forwards.
2689	!
2690.ov_fwd:
2691	ldub	[%o0], %o3		! read from address
2692	inc	%o0			! inc from address
2693	stb	%o3, [%o1]		! write to address
2694	deccc	%o2			! dec count
2695	bgu	%ncc, .ov_fwd		! loop till done
2696	inc	%o1			! inc to address
2697
2698	retl				! return
2699	nop
2700	!
2701	! Copy backwards.
2702	!
2703.ov_bkwd:
2704	deccc	%o2			! dec count
2705	ldub	[%o0 + %o2], %o3	! get byte at end of src
2706	bgu	%ncc, .ov_bkwd		! loop till done
2707	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
2708
2709	retl				! return
2710	nop
2711	SET_SIZE(ovbcopy)
2712
2713/*
2714 * hwblkpagecopy()
2715 *
2716 * Copies exactly one page.  This routine assumes the caller (ppcopy)
2717 * has already disabled kernel preemption and has checked
2718 * use_hw_bcopy.
2719 */
2720	ENTRY(hwblkpagecopy)
2721	save	%sp, -SA(MINFRAME), %sp
2722
2723	! %i0 - source address (arg)
2724	! %i1 - destination address (arg)
2725	! %i2 - length of region (not arg)
2726
2727	set	PAGESIZE, %i2
2728
2729	/*
2730	 * Copy exactly one page; PAGESIZE is a multiple of 0x80.
2731	 */
2732	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2733	prefetch [%i0+0x0], #one_read
2734	prefetch [%i0+0x40], #one_read
27351:
2736	prefetch [%i0+0x80], #one_read
2737	prefetch [%i0+0xc0], #one_read
2738	ldda	[%i0+0x0]%asi, %l0
2739	ldda	[%i0+0x10]%asi, %l2
2740	ldda	[%i0+0x20]%asi, %l4
2741	ldda	[%i0+0x30]%asi, %l6
2742	stxa	%l0, [%i1+0x0]%asi
2743	stxa	%l1, [%i1+0x8]%asi
2744	stxa	%l2, [%i1+0x10]%asi
2745	stxa	%l3, [%i1+0x18]%asi
2746	stxa	%l4, [%i1+0x20]%asi
2747	stxa	%l5, [%i1+0x28]%asi
2748	stxa	%l6, [%i1+0x30]%asi
2749	stxa	%l7, [%i1+0x38]%asi
2750	ldda	[%i0+0x40]%asi, %l0
2751	ldda	[%i0+0x50]%asi, %l2
2752	ldda	[%i0+0x60]%asi, %l4
2753	ldda	[%i0+0x70]%asi, %l6
2754	stxa	%l0, [%i1+0x40]%asi
2755	stxa	%l1, [%i1+0x48]%asi
2756	stxa	%l2, [%i1+0x50]%asi
2757	stxa	%l3, [%i1+0x58]%asi
2758	stxa	%l4, [%i1+0x60]%asi
2759	stxa	%l5, [%i1+0x68]%asi
2760	stxa	%l6, [%i1+0x70]%asi
2761	stxa	%l7, [%i1+0x78]%asi
2762
2763	add	%i0, 0x80, %i0
2764	subcc	%i2, 0x80, %i2
2765	bgu,pt	%xcc, 1b
2766	add	%i1, 0x80, %i1
2767
2768	membar #Sync
2769	ret
2770	restore	%g0, 0, %o0
2771	SET_SIZE(hwblkpagecopy)
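/*
 * Illustrative caller contract for the routine above (hedged sketch;
 * the fallback branch is for exposition only):
 *
 *	kpreempt_disable();
 *	if (use_hw_bcopy)
 *		hwblkpagecopy(src_va, dst_va);	! exactly PAGESIZE bytes
 *	else
 *		bcopy(src_va, dst_va, PAGESIZE);
 *	kpreempt_enable();
 */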
2772
2773
2774/*
2775 * Transfer data to and from user space -
2776 * note that these routines can cause faults.
2777 * It is assumed that the kernel has nothing
2778 * below KERNELBASE in the virtual address space.
2779 *
2780 * Note that copyin(9F) and copyout(9F) are part of the
2781 * DDI/DKI which specifies that they return '-1' on "errors."
2782 *
2783 * Sigh.
2784 *
2785 * So there are two extremely similar routines - xcopyin() and xcopyout()
2786 * which return the errno that we've faithfully computed.  This
2787 * allows other callers (e.g. uiomove(9F)) to work correctly.
2788 * Given that these are used pretty heavily, we expand the calling
2789 * sequences inline for all flavours (rather than making wrappers).
2790 *
2791 * There are also stub routines for xcopyout_little and xcopyin_little,
2792 * which currently are intended to handle requests of <= 16 bytes from
2793 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
2794 * is left as an exercise...
2795 */
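/*
 * In C terms the two conventions look roughly like this to a caller
 * (illustrative only):
 *
 *	if (copyout(kaddr, uaddr, len) != 0)
 *		return (EFAULT);	! copyout collapses faults to -1
 *
 *	if ((error = xcopyout(kaddr, uaddr, len)) != 0)
 *		return (error);		! xcopyout preserves the errno
 */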
2796
2797/*
2798 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
2799 *
2800 * General theory of operation:
2801 *
2802 * None of the copyops routines grabs a window until it's decided that
2803 * we need to do a HW block copy operation. This saves a window
2804 * spill/fill when we're called during socket ops. The typical IO
2805 * path won't cause spill/fill traps.
2806 *
2807 * This code uses a set of 4 limits for the maximum size that will
2808 * be copied given a particular input/output address alignment.
2809 * The default limits are:
2810 *
2811 * single byte aligned - 256 (hw_copy_limit_1)
2812 * two byte aligned - 512 (hw_copy_limit_2)
2813 * four byte aligned - 1024 (hw_copy_limit_4)
2814 * eight byte aligned - 1024 (hw_copy_limit_8)
2815 *
2816 * If the value for a particular limit is zero, the copy will be done
2817 * via the copy loops rather than block store/quad load instructions.
2818 *
2819 * Flow:
2820 *
2821 * If count == zero return zero.
2822 *
2823 * Store the previous t_lofault handler into %g6.
2824 * Place our secondary lofault handler into %g5.
2825 * Place the address of our nowindow fault handler into %o3.
2826 * Place the address of the windowed fault handler into %o4.
2827 * --> We'll use this handler if we end up grabbing a window
2828 * --> before we use block initializing store and quad load ASIs
2829 *
2830 * If count is less than or equal to SMALL_LIMIT (7) we
2831 * always do a byte for byte copy.
2832 *
2833 * If count is > SMALL_LIMIT, we check the alignment of the input
2834 * and output pointers. Based on the alignment, we check count
2835 * against a limit for that detected alignment.  If count exceeds
2836 * the limit, we copy via block initializing store and quad
2837 * load instructions.
2838 *
2839 * If we don't exceed one of the limits, we store -count in %o3,
2840 * we store the number of chunks (8, 4, 2 or 1 byte) operated
2841 * on in our basic copy loop in %o2. Following this we branch
2842 * to the appropriate copy loop and copy that many chunks.
2843 * Since we've been adding the chunk size to %o3 each time through
2844 * as well as decrementing %o2, we can tell if any data is
2845 * left to be copied by examining %o3. If that is zero, we're
2846 * done and can go home. If not, we figure out what the largest
2847 * chunk size left to be copied is and branch to that copy loop
2848 * unless there's only one byte left. We load that as we're
2849 * branching to code that stores it just before we return.
2850 *
2851 * Fault handlers are invoked if we reference memory that has no
2852 * current mapping.  All forms share the same copyio_fault handler.
2853 * This routine handles fixing up the stack and general housecleaning.
2854 * Each copy operation has a simple fault handler that is then called
2855 * to do the work specific to the individual operation.  The handlers
2856 * for copyOP and xcopyOP are found at the end of each function.
2857 * The handlers for xcopyOP_little are found at the end of xcopyin_little.
2858 * The handlers for copyOP_noerr are found at the end of copyin_noerr.
2859 */
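/*
 * The limit selection above, as rough pseudo-code (illustrative;
 * helper names invented):
 *
 *	if (count == 0)
 *		return (0);
 *	if (count <= SMALL_LIMIT)
 *		byte_loop();
 *	else {
 *		limit = hw_copy_limit_N;	! N picked by alignment
 *		if (limit != 0 && count > limit)
 *			hw_block_copy();	! block init st / quad ld
 *		else
 *			chunk_loops();		! 8/4/2/1-byte loops
 *	}
 */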
2860
2861/*
2862 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
2863 */
2864
2865/*
2866 * We save the arguments in the following registers in case of a fault:
2867 * 	kaddr - %g2
2868 * 	uaddr - %g3
2869 * 	count - %g4
2870 */
2871#define	SAVE_SRC	%g2
2872#define	SAVE_DST	%g3
2873#define	SAVE_COUNT	%g4
2874
2875#define	REAL_LOFAULT		%g5
2876#define	SAVED_LOFAULT		%g6
2877
2878/*
2879 * Generic copyio fault handler.  This is the first line of defense when a
2880 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
2881 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
2882 * This allows us to share common code for all the flavors of the copy
2883 * operations, including the _noerr versions.
2884 *
2885 * Note that this function will restore the original input parameters before
2886 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
2887 * member of the t_copyop structure, if needed.
2888 */
2889	ENTRY(copyio_fault)
2890#if !defined(NIAGARA_IMPL)
2891	btst	FPUSED_FLAG, SAVED_LOFAULT
2892	bz	1f
2893	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2894
2895	wr	%l5, 0, %gsr		! restore gsr
2896
2897	btst	FPRS_FEF, %g1
2898	bz	%icc, 4f
2899	nop
2900
2901	! restore fpregs from stack
2902	BLD_FP_FROMSTACK(%o2)
2903
2904	ba,pt	%ncc, 1f
2905	nop
29064:
2907	FZERO				! zero all of the fpregs
2908	wr	%g1, %g0, %fprs		! restore fprs
29091:
2910	restore
2911	mov	SAVE_SRC, %o0
2912	mov	SAVE_DST, %o1
2913	jmp	REAL_LOFAULT
2914	mov	SAVE_COUNT, %o2
2915
2916#else	/* NIAGARA_IMPL */
2917	membar	#Sync
2918	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2919	restore
2920	mov	SAVE_SRC, %o0
2921	mov	SAVE_DST, %o1
2922	jmp	REAL_LOFAULT
2923	mov	SAVE_COUNT, %o2
2924
2925#endif	/* NIAGARA_IMPL */
2926
2927	SET_SIZE(copyio_fault)
2928
2929	ENTRY(copyio_fault_nowindow)
2930	membar	#Sync
2931	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2932
2933	mov	SAVE_SRC, %o0
2934	mov	SAVE_DST, %o1
2935	jmp	REAL_LOFAULT
2936	mov	SAVE_COUNT, %o2
2937	SET_SIZE(copyio_fault_nowindow)
2938
2939	ENTRY(copyout)
2940	sethi	%hi(.copyout_err), REAL_LOFAULT
2941	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
2942
2943#if !defined(NIAGARA_IMPL)
2944.do_copyout:
2945	tst	%o2			! check for zero count;  quick exit
2946	bz,pt	%ncc, .co_smallqx
2947	mov	%o0, SAVE_SRC
2948	mov	%o1, SAVE_DST
2949	mov	%o2, SAVE_COUNT
2950	cmp	%o2, FP_COPY		! check for small copy/leaf case
2951	bgt,pt	%ncc, .co_copy_more
2952	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
2953/*
2954 * Small copy out code
2955 *
2956 */
2957	sethi	%hi(copyio_fault_nowindow), %o3
2958	or	%o3, %lo(copyio_fault_nowindow), %o3
2959	membar	#Sync
2960	stn	%o3, [THREAD_REG + T_LOFAULT]
2961
2962	mov	ASI_USER, %asi
2963	cmp	%o2, SHORTCOPY		! make sure there is enough to align
2964	ble,pt	%ncc, .co_smallest
2965	andcc	%o1, 0x7, %o3		! is dest long word aligned
2966	bnz,pn	%ncc, .co_align
2967	andcc	%o1, 1, %o3		! is dest byte aligned
2968
2969! Destination is long word aligned
2970! 8 cases for src alignment; load parts, store long words
2971.co_al_src:
2972	andcc	%o0, 7, %o3
2973	brnz,pt	%o3, .co_src_dst_unal8
2974	nop
2975/*
2976 * Special case for handling when src and dest are both long word aligned
2977 * and total data to move is less than FP_COPY bytes
2978 * Also handles finish up for large block moves, so may be less than 32 bytes
2979 */
2980.co_medlong:
2981	subcc	%o2, 31, %o2		! adjust length to allow cc test
2982	ble,pt	%ncc, .co_medl31
2983	nop
2984.co_medl32:
2985	ldx	[%o0], %o4		! move 32 bytes
2986	subcc	%o2, 32, %o2		! decrement length count by 32
2987	stxa	%o4, [%o1]%asi
2988	ldx	[%o0+8], %o4
2989	stxa	%o4, [%o1+8]%asi
2990	ldx	[%o0+16], %o4
2991	add	%o0, 32, %o0		! increase src ptr by 32
2992	stxa	%o4, [%o1+16]%asi
2993	ldx	[%o0-8], %o4
2994	add	%o1, 32, %o1		! increase dst ptr by 32
2995	bgu,pt	%ncc, .co_medl32	! repeat if at least 32 bytes left
2996	stxa	%o4, [%o1-8]%asi
2997.co_medl31:
2998	addcc	%o2, 24, %o2		! adjust count to be off by 7
2999	ble,pt	%ncc, .co_medl7		! skip if 7 or fewer bytes left
3000	nop
3001.co_medl8:
3002	ldx	[%o0], %o4		! move 8 bytes
3003	add	%o0, 8, %o0		! increase src ptr by 8
3004	subcc	%o2, 8, %o2		! decrease count by 8
3005	add	%o1, 8, %o1		! increase dst ptr by 8
3006	bgu,pt	%ncc, .co_medl8
3007	stxa	%o4, [%o1-8]%asi
3008.co_medl7:
3009	addcc	%o2, 7, %o2		! finish adjustment of remaining count
3010	bnz,pt	%ncc, .co_small4	! do final bytes if not finished
3011
3012.co_smallx:				! finish up and exit
3013	membar	#Sync
3014	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3015.co_smallqx:
3016	retl
3017	mov	%g0, %o0
3018
3019.co_small4:
3020	cmp	%o2, 4
3021	blt,pt	%ncc, .co_small3x	! skip if less than 4 bytes left
3022	nop				!
3023	ld	[%o0], %o4		! move 4 bytes
3024	add	%o0, 4, %o0		! increase src ptr by 4
3025	add	%o1, 4, %o1		! increase dst ptr by 4
3026	subcc	%o2, 4, %o2		! decrease count by 4
3027	bz,pt	%ncc, .co_smallx
3028	stwa	%o4, [%o1-4]%asi
3029
3030.co_small3x:				! Exactly 1, 2, or 3 bytes remain
3031	subcc	%o2, 1, %o2		! reduce count for cc test
3032	ldub	[%o0], %o4		! load one byte
3033	bz,pt	%ncc, .co_smallx
3034	stba	%o4, [%o1]%asi		! store one byte
3035	ldub	[%o0+1], %o4		! load second byte
3036	subcc	%o2, 1, %o2
3037	bz,pt	%ncc, .co_smallx
3038	stba	%o4, [%o1+1]%asi	! store second byte
3039	ldub	[%o0+2], %o4		! load third byte
3040	ba	.co_smallx
3041	stba	%o4, [%o1+2]%asi	! store third byte
3042
3043.co_smallest:				! 7 or fewer bytes remain
3044	cmp	%o2, 4
3045	blt,pt	%ncc, .co_small3x
3046	nop
3047	ldub	[%o0], %o4		! read byte
3048	subcc	%o2, 4, %o2		! reduce count by 4
3049	stba	%o4, [%o1]%asi		! write byte
3050	ldub	[%o0+1], %o4		! repeat for total of 4 bytes
3051	add	%o0, 4, %o0		! advance src by 4
3052	stba	%o4, [%o1+1]%asi
3053	ldub	[%o0-2], %o4
3054	add	%o1, 4, %o1		! advance dst by 4
3055	stba	%o4, [%o1-2]%asi
3056	ldub	[%o0-1], %o4
3057	bnz,pt	%ncc, .co_small3x
3058	stba	%o4, [%o1-1]%asi
3059	membar	#Sync
3060	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3061	retl
3062	mov	%g0, %o0
3063
3064.co_align:				! byte align test in prior branch delay
3065	bnz,pt	%ncc, .co_al_d1
3066.co_al_d1f:				! dest is now half word aligned
3067	andcc	%o1, 2, %o3
3068	bnz,pt	%ncc, .co_al_d2
3069.co_al_d2f:				! dest is now word aligned
3070	andcc	%o1, 4, %o3		! is dest longword aligned?
3071	bz,pt	%ncc, .co_al_src
3072	nop
3073.co_al_d4:				! dest is word aligned;  src is unknown
3074	ldub	[%o0], %o4		! move a word (src align unknown)
3075	ldub	[%o0+1], %o3
3076	sll	%o4, 24, %o4		! position
3077	sll	%o3, 16, %o3		! position
3078	or	%o4, %o3, %o3		! merge
3079	ldub	[%o0+2], %o4
3080	sll	%o4, 8, %o4		! position
3081	or	%o4, %o3, %o3		! merge
3082	ldub	[%o0+3], %o4
3083	or	%o4, %o3, %o4		! merge
3084	stwa	%o4,[%o1]%asi		! store four bytes
3085	add	%o0, 4, %o0		! adjust src by 4
3086	add	%o1, 4, %o1		! adjust dest by 4
3087	sub	%o2, 4, %o2		! adjust count by 4
3088	andcc	%o0, 7, %o3		! check for src long word alignment
3089	brz,pt	%o3, .co_medlong
3090.co_src_dst_unal8:
3091	! dst is 8-byte aligned, src is not
3092	! Size is less than FP_COPY
3093	! Following code selects a copy loop based on src alignment
3094	andcc	%o0, 0x3, %o3		! test word alignment
3095	bz,pt	%ncc, .co_medword
3096	nop
3097	andcc	%o0, 0x1, %o3		! test halfword alignment
3098	bnz,pt	%ncc, .co_med_byte	! go to byte move if not halfword
3099	andcc	%o0, 0x2, %o3		! test which byte alignment
3100	ba	.co_medhalf
3101	nop
3102.co_al_d1:				! align dest to half word
3103	ldub	[%o0], %o4		! move a byte
3104	add	%o0, 1, %o0
3105	stba	%o4, [%o1]%asi
3106	add	%o1, 1, %o1
3107	andcc	%o1, 2, %o3
3108	bz,pt	%ncc, .co_al_d2f
3109	sub	%o2, 1, %o2
3110.co_al_d2:				! align dest to word
3111	ldub	[%o0], %o4		! move a half-word (src align unknown)
3112	ldub	[%o0+1], %o3
3113	sll	%o4, 8, %o4		! position
3114	or	%o4, %o3, %o4		! merge
3115	stha	%o4, [%o1]%asi
3116	add	%o0, 2, %o0
3117	add	%o1, 2, %o1
3118	andcc	%o1, 4, %o3		! is dest longword aligned?
3119	bz,pt	%ncc, .co_al_src
3120	sub	%o2, 2, %o2
3121	ba	.co_al_d4
3122	nop
3123/*
3124 * Handle all cases where src and dest are aligned on word
3125 * boundaries. Use unrolled loops for better performance.
3126 * This option wins over standard large data move when
3127 * source and destination is in cache for medium
3128 * to short data moves.
3129 */
3130.co_medword:
3131	subcc	%o2, 31, %o2		! adjust length to allow cc test
3132	ble,pt	%ncc, .co_medw31
3133	nop
3134.co_medw32:
3135	ld	[%o0], %o4		! move a block of 32 bytes
3136	stwa	%o4, [%o1]%asi
3137	ld	[%o0+4], %o4
3138	stwa	%o4, [%o1+4]%asi
3139	ld	[%o0+8], %o4
3140	stwa	%o4, [%o1+8]%asi
3141	ld	[%o0+12], %o4
3142	stwa	%o4, [%o1+12]%asi
3143	ld	[%o0+16], %o4
3144	stwa	%o4, [%o1+16]%asi
3145	ld	[%o0+20], %o4
3146	subcc	%o2, 32, %o2		! decrement length count
3147	stwa	%o4, [%o1+20]%asi
3148	ld	[%o0+24], %o4
3149	add	%o0, 32, %o0		! increase src ptr by 32
3150	stwa	%o4, [%o1+24]%asi
3151	ld	[%o0-4], %o4
3152	add	%o1, 32, %o1		! increase dst ptr by 32
3153	bgu,pt	%ncc, .co_medw32	! repeat if at least 32 bytes left
3154	stwa	%o4, [%o1-4]%asi
3155.co_medw31:
3156	addcc	%o2, 24, %o2		! adjust count to be off by 7
3157	ble,pt	%ncc, .co_medw7		! skip if 7 or fewer bytes left
3158	nop				!
3159.co_medw15:
3160	ld	[%o0], %o4		! move a block of 8 bytes
3161	subcc	%o2, 8, %o2		! decrement length count
3162	stwa	%o4, [%o1]%asi
3163	add	%o0, 8, %o0		! increase src ptr by 8
3164	ld	[%o0-4], %o4
3165	add	%o1, 8, %o1		! increase dst ptr by 8
3166	bgu,pt	%ncc, .co_medw15
3167	stwa	%o4, [%o1-4]%asi
3168.co_medw7:
3169	addcc	%o2, 7, %o2		! finish adjustment of remaining count
3170	bz,pt	%ncc, .co_smallx	! exit if finished
3171	cmp	%o2, 4
3172	blt,pt	%ncc, .co_small3x	! skip if less than 4 bytes left
3173	nop				!
3174	ld	[%o0], %o4		! move 4 bytes
3175	add	%o0, 4, %o0		! increase src ptr by 4
3176	add	%o1, 4, %o1		! increase dst ptr by 4
3177	subcc	%o2, 4, %o2		! decrease count by 4
3178	bnz	.co_small3x
3179	stwa	%o4, [%o1-4]%asi
3180	membar	#Sync
3181	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3182	retl
3183	mov	%g0, %o0
3184
3185.co_medhalf:
3186	subcc	%o2, 31, %o2		! adjust length to allow cc test
3187	ble,pt	%ncc, .co_medh31
3188	nop
3189.co_medh32:				! load and store block of 32 bytes
3190
3191	lduh	[%o0], %o4		! move 32 bytes
3192	subcc	%o2, 32, %o2		! decrement length count
3193	lduw	[%o0+2], %o3
3194	sllx	%o4, 48, %o4
3195	sllx	%o3, 16, %o3
3196	or	%o4, %o3, %o3
3197	lduh	[%o0+6], %o4
3198	or	%o4, %o3, %o4
3199	stxa	%o4, [%o1]%asi
3200
3201	lduh	[%o0+8], %o4
3202	lduw	[%o0+10], %o3
3203	sllx	%o4, 48, %o4
3204	sllx	%o3, 16, %o3
3205	or	%o4, %o3, %o3
3206	lduh	[%o0+14], %o4
3207	or	%o4, %o3, %o4
3208	stxa	%o4, [%o1+8]%asi
3209
3210	lduh	[%o0+16], %o4
3211	lduw	[%o0+18], %o3
3212	sllx	%o4, 48, %o4
3213	sllx	%o3, 16, %o3
3214	or	%o4, %o3, %o3
3215	lduh	[%o0+22], %o4
3216	or	%o4, %o3, %o4
3217	stxa	%o4, [%o1+16]%asi
3218
3219	add	%o0, 32, %o0		! increase src ptr by 32
3220	add	%o1, 32, %o1		! increase dst ptr by 32
3221
3222	lduh	[%o0-8], %o4
3223	lduw	[%o0-6], %o3
3224	sllx	%o4, 48, %o4
3225	sllx	%o3, 16, %o3
3226	or	%o4, %o3, %o3
3227	lduh	[%o0-2], %o4
3228	or	%o3, %o4, %o4
3229	bgu,pt	%ncc, .co_medh32	! repeat if at least 32 bytes left
3230	stxa	%o4, [%o1-8]%asi
3231
3232.co_medh31:
3233	addcc	%o2, 24, %o2		! adjust count to be off by 7
3234	ble,pt	%ncc, .co_medh7		! skip if 7 or fewer bytes left
3235	nop				!
3236.co_medh15:
3237	lduh	[%o0], %o4		! move 8 bytes
3238	subcc	%o2, 8, %o2		! decrement length count
3239	lduw	[%o0+2], %o3
3240	sllx	%o4, 48, %o4
3241	sllx	%o3, 16, %o3
3242	or	%o4, %o3, %o3
3243	add	%o1, 8, %o1		! increase dst ptr by 8
3244	lduh	[%o0+6], %o4
3245	add	%o0, 8, %o0		! increase src ptr by 8
3246	or	%o4, %o3, %o4
3247	bgu,pt	%ncc, .co_medh15
3248	stxa	%o4, [%o1-8]%asi
3249.co_medh7:
3250	addcc	%o2, 7, %o2		! finish adjustment of remaining count
3251	bz,pt	%ncc, .co_smallx	! exit if finished
3252	cmp	%o2, 4
3253	blt,pt	%ncc, .co_small3x	! skip if less than 4 bytes left
3254	nop				!
3255	lduh	[%o0], %o4
3256	sll	%o4, 16, %o4
3257	lduh	[%o0+2], %o3
3258	or	%o3, %o4, %o4
3259	subcc	%o2, 4, %o2
3260	add	%o0, 4, %o0
3261	add	%o1, 4, %o1
3262	bnz	.co_small3x
3263	stwa	%o4, [%o1-4]%asi
3264	membar	#Sync
3265	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3266	retl
3267	mov	%g0, %o0
3268
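/*
 * Illustrative C for the .co_medhalf merge above (merge_be64_half is an
 * invented name): build one big-endian 64-bit store from 2+4+2-byte
 * loads, assuming a big-endian machine and that s is halfword- but not
 * word-aligned, so s + 2 is word-aligned:
 *
 *	static uint64_t
 *	merge_be64_half(const uint8_t *s)
 *	{
 *		uint64_t w;
 *
 *		w  = (uint64_t)*(const uint16_t *)s << 48;
 *		w |= (uint64_t)*(const uint32_t *)(s + 2) << 16;
 *		w |= (uint64_t)*(const uint16_t *)(s + 6);
 *		return (w);
 *	}
 */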
3269	.align 16
3270.co_med_byte:
3271	bnz,pt	%ncc, .co_medbh32a	! go to correct byte move
3272	subcc	%o2, 31, %o2		! adjust length to allow cc test
3273	ble,pt	%ncc, .co_medb31
3274	nop
3275.co_medb32:				! Alignment 1 or 5
3276	subcc	%o2, 32, %o2		! decrement length count
3277
3278	ldub	[%o0], %o4		! load and store a block of 32 bytes
3279	sllx	%o4, 56, %o3
3280	lduh	[%o0+1], %o4
3281	sllx	%o4, 40, %o4
3282	or	%o4, %o3, %o3
3283	lduw	[%o0+3], %o4
3284	sllx	%o4, 8, %o4
3285	or	%o4, %o3, %o3
3286	ldub	[%o0+7], %o4
3287	or	%o4, %o3, %o4
3288	stxa	%o4, [%o1]%asi
3289
3290	ldub	[%o0+8], %o4
3291	sllx	%o4, 56, %o3
3292	lduh	[%o0+9], %o4
3293	sllx	%o4, 40, %o4
3294	or	%o4, %o3, %o3
3295	lduw	[%o0+11], %o4
3296	sllx	%o4, 8, %o4
3297	or	%o4, %o3, %o3
3298	ldub	[%o0+15], %o4
3299	or	%o4, %o3, %o4
3300	stxa	%o4, [%o1+8]%asi
3301
3302	ldub	[%o0+16], %o4
3303	sllx	%o4, 56, %o3
3304	lduh	[%o0+17], %o4
3305	sllx	%o4, 40, %o4
3306	or	%o4, %o3, %o3
3307	lduw	[%o0+19], %o4
3308	sllx	%o4, 8, %o4
3309	or	%o4, %o3, %o3
3310	ldub	[%o0+23], %o4
3311	or	%o4, %o3, %o4
3312	stxa	%o4, [%o1+16]%asi
3313
3314	add	%o0, 32, %o0		! increase src ptr by 32
3315	add	%o1, 32, %o1		! increase dst ptr by 32
3316
3317	ldub	[%o0-8], %o4
3318	sllx	%o4, 56, %o3
3319	lduh	[%o0-7], %o4
3320	sllx	%o4, 40, %o4
3321	or	%o4, %o3, %o3
3322	lduw	[%o0-5], %o4
3323	sllx	%o4, 8, %o4
3324	or	%o4, %o3, %o3
3325	ldub	[%o0-1], %o4
3326	or	%o4, %o3, %o4
3327	bgu,pt	%ncc, .co_medb32	! repeat if at least 32 bytes left
3328	stxa	%o4, [%o1-8]%asi
3329
3330.co_medb31:				! 31 or fewer bytes remaining
3331	addcc	%o2, 24, %o2		! adjust count to be off by 7
3332	ble,pt	%ncc, .co_medb7		! skip if 7 or fewer bytes left
3333	nop				!
3334.co_medb15:
3335
3336	ldub	[%o0], %o4		! load and store a block of 8 bytes
3337	subcc	%o2, 8, %o2		! decrement length count
3338	sllx	%o4, 56, %o3
3339	lduh	[%o0+1], %o4
3340	sllx	%o4, 40, %o4
3341	or	%o4, %o3, %o3
3342	lduw	[%o0+3], %o4
3343	add	%o1, 8, %o1		! increase dst ptr by 8
3344	sllx	%o4, 8, %o4
3345	or	%o4, %o3, %o3
3346	ldub	[%o0+7], %o4
3347	add	%o0, 8, %o0		! increase src ptr by 8
3348	or	%o4, %o3, %o4
3349	bgu,pt	%ncc, .co_medb15
3350	stxa	%o4, [%o1-8]%asi
3351.co_medb7:
3352	addcc	%o2, 7, %o2		! finish adjustment of remaining count
3353	bz,pt	%ncc, .co_smallx	! exit if finished
3354	cmp	%o2, 4
3355	blt,pt	%ncc, .co_small3x	! skip if less than 4 bytes left
3356	nop				!
3357	ldub	[%o0], %o4		! move 4 bytes
3358	sll	%o4, 24, %o3
3359	lduh	[%o0+1], %o4
3360	sll	%o4, 8, %o4
3361	or	%o4, %o3, %o3
3362	ldub	[%o0+3], %o4
3363	or	%o4, %o3, %o4
3364	subcc	%o2, 4, %o2
3365	add	%o0, 4, %o0
3366	add	%o1, 4, %o1
3367	bnz	.co_small3x
3368	stwa	%o4, [%o1-4]%asi
3369	membar	#Sync
3370	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3371	retl
3372	mov	%g0, %o0
3373
3374	.align 16
3375.co_medbh32a:
3376	ble,pt	%ncc, .co_medbh31
3377	nop
3378.co_medbh32:				! Alignment 3 or 7
3379	subcc	%o2, 32, %o2		! decrement length count
3380
3381	ldub	[%o0], %o4		! load and store a block of 32 bytes
3382	sllx	%o4, 56, %o3
3383	lduw	[%o0+1], %o4
3384	sllx	%o4, 24, %o4
3385	or	%o4, %o3, %o3
3386	lduh	[%o0+5], %o4
3387	sllx	%o4, 8, %o4
3388	or	%o4, %o3, %o3
3389	ldub	[%o0+7], %o4
3390	or	%o4, %o3, %o4
3391	stxa	%o4, [%o1]%asi
3392
3393	ldub	[%o0+8], %o4
3394	sllx	%o4, 56, %o3
3395	lduw	[%o0+9], %o4
3396	sllx	%o4, 24, %o4
3397	or	%o4, %o3, %o3
3398	lduh	[%o0+13], %o4
3399	sllx	%o4, 8, %o4
3400	or	%o4, %o3, %o3
3401	ldub	[%o0+15], %o4
3402	or	%o4, %o3, %o4
3403	stxa	%o4, [%o1+8]%asi
3404
3405	ldub	[%o0+16], %o4
3406	sllx	%o4, 56, %o3
3407	lduw	[%o0+17], %o4
3408	sllx	%o4, 24, %o4
3409	or	%o4, %o3, %o3
3410	lduh	[%o0+21], %o4
3411	sllx	%o4, 8, %o4
3412	or	%o4, %o3, %o3
3413	ldub	[%o0+23], %o4
3414	or	%o4, %o3, %o4
3415	stxa	%o4, [%o1+16]%asi
3416
3417	add	%o0, 32, %o0		! increase src ptr by 32
3418	add	%o1, 32, %o1		! increase dst ptr by 32
3419
3420	ldub	[%o0-8], %o4
3421	sllx	%o4, 56, %o3
3422	lduw	[%o0-7], %o4
3423	sllx	%o4, 24, %o4
3424	or	%o4, %o3, %o3
3425	lduh	[%o0-3], %o4
3426	sllx	%o4, 8, %o4
3427	or	%o4, %o3, %o3
3428	ldub	[%o0-1], %o4
3429	or	%o4, %o3, %o4
3430	bgu,pt	%ncc, .co_medbh32	! repeat if at least 32 bytes left
3431	stxa	%o4, [%o1-8]%asi
3432
3433.co_medbh31:
3434	addcc	%o2, 24, %o2		! adjust count to be off by 7
3435	ble,pt	%ncc, .co_medb7		! skip if 7 or fewer bytes left
3436	nop				!
3437.co_medbh15:
3438	ldub	[%o0], %o4		! load and store a block of 8 bytes
3439	sllx	%o4, 56, %o3
3440	lduw	[%o0+1], %o4
3441	sllx	%o4, 24, %o4
3442	or	%o4, %o3, %o3
3443	lduh	[%o0+5], %o4
3444	sllx	%o4, 8, %o4
3445	or	%o4, %o3, %o3
3446	ldub	[%o0+7], %o4
3447	or	%o4, %o3, %o4
3448	stxa	%o4, [%o1]%asi
3449	subcc	%o2, 8, %o2		! decrement length count
3450	add	%o1, 8, %o1		! increase dst ptr by 8
3451	add	%o0, 8, %o0		! increase src ptr by 8
3452	bgu,pt	%ncc, .co_medbh15
3453	stxa	%o4, [%o1-8]%asi
3454	ba	.co_medb7
3455	nop
3456/*
3457 * End of small copy (no window) code
3458 */
3459
3460/*
3461 * Long copy code
3462 */
3463.co_copy_more:
3464	sethi	%hi(copyio_fault), %o3
3465	or	%o3, %lo(copyio_fault), %o3
3466	membar	#Sync
3467	stn	%o3, [THREAD_REG + T_LOFAULT]
3468
3469/*
3470 * Following code is for large copies. We know there is at
3471 * least FP_COPY bytes available. FP regs are used, so
3472 *  we save registers and fp regs before starting
3473 */
3474	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3475	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3476	rd	%fprs, %g1		! check for unused fp
3477	! if fprs.fef == 0, set it.
3478	! Setting it when already set costs more than checking
3479	andcc	%g1, FPRS_FEF, %g1	! test FEF, fprs.du = fprs.dl = 0
3480	bz,pt	%ncc, .co_fp_unused
3481	mov	ASI_USER, %asi
3482	BST_FP_TOSTACK(%o3)
3483	ba	.co_fp_ready
3484.co_fp_unused:
3485	prefetch [%i0 + (1 * CACHE_LINE)], #one_read
3486	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
3487.co_fp_ready:
3488	rd	%gsr, %l5		! save %gsr value
3489	andcc	%i1, 1, %o3		! is dest byte aligned
3490	bnz,pt	%ncc, .co_big_d1
3491.co_big_d1f:				! dest is now half word aligned
3492	andcc	%i1, 2, %o3
3493	bnz,pt	%ncc, .co_big_d2
3494.co_big_d2f:				! dest is now word aligned
3495	andcc	%i1, 4, %o3		! is dest longword aligned
3496	bnz,pt	%ncc, .co_big_d4
3497.co_big_d4f:				! dest is now long word aligned
3498	andcc	%i0, 7, %o3		! is src long word aligned
3499	brnz,pt	%o3, .co_big_unal8
3500	prefetch [%i0 + (2 * CACHE_LINE)], #one_read
3501	! Src and dst are long word aligned
3502	! align dst to 64 byte boundary
3503	andcc	%i1, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
3504	brz,pn	%o3, .co_al_to_64
3505	nop
3506	sub	%o3, 64, %o3		! %o3 has negative bytes to move
3507	add	%i2, %o3, %i2		! adjust remaining count
3508	andcc	%o3, 8, %o4		! odd long words to move?
3509	brz,pt	%o4, .co_al_to_16
3510	nop
3511	add	%o3, 8, %o3
3512	ldx	[%i0], %o4
3513	add	%i0, 8, %i0		! increment src ptr
3514	stxa	%o4, [%i1]ASI_USER
3515	add	%i1, 8, %i1		! increment dst ptr
3516! Dest is aligned on 16 bytes, src 8 byte aligned
3517.co_al_to_16:
3518	andcc	%o3, 0x30, %o4		! 16-byte chunks to move?
3519	brz,pt	%o4, .co_al_to_64
3520	nop
3521.co_al_mv_16:
3522	add	%o3, 16, %o3
3523	ldx	[%i0], %o4
3524	stxa	%o4, [%i1]ASI_USER
3525	add	%i0, 16, %i0		! increment src ptr
3526	ldx	[%i0-8], %o4
3527	add	%i1, 8, %i1		! increment dst ptr
3528	stxa	%o4, [%i1]ASI_USER
3529	andcc	%o3, 0x30, %o4
3530	brnz,pt	%o4, .co_al_mv_16
3531	add	%i1, 8, %i1		! increment dst ptr
3532! Dest is aligned on 64 bytes, src 8 byte aligned
3533.co_al_to_64:
3534	! Determine the source alignment (mod 64)
3535	! to select the correct 8-byte offset case
3536	andcc	%i0, 32, %o3
3537	brnz,pn	%o3, .co_aln_1
3538	andcc	%i0, 16, %o3
3539	brnz,pn	%o3, .co_aln_01
3540	andcc	%i0, 8, %o3
3541	brz,pn	%o3, .co_aln_000
3542	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3543	ba	.co_aln_001
3544	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3545.co_aln_01:
3546	brnz,pn	%o3, .co_aln_011
3547	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3548	ba	.co_aln_010
3549	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3550.co_aln_1:
3551	andcc	%i0, 16, %o3
3552	brnz,pn	%o3, .co_aln_11
3553	andcc	%i0, 8, %o3
3554	brnz,pn	%o3, .co_aln_101
3555	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3556	ba	.co_aln_100
3557	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3558.co_aln_11:
3559	brz,pn	%o3, .co_aln_110
3560	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3561
3562.co_aln_111:
3563! Alignment off by 8 bytes
3564	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3565	ldd	[%i0], %d0
3566	add	%i0, 8, %i0
3567	sub	%i2, 8, %i2
3568	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3569	and	%i2, 0x7f, %i2		! residue bytes in %i2
3570	sub	%i1, %i0, %i1
3571.co_aln_111_loop:
3572	ldda	[%i0]ASI_BLK_P,%d16		! block load
3573	subcc	%o3, 64, %o3
3574	fmovd	%d16, %d2
3575	fmovd	%d18, %d4
3576	fmovd	%d20, %d6
3577	fmovd	%d22, %d8
3578	fmovd	%d24, %d10
3579	fmovd	%d26, %d12
3580	fmovd	%d28, %d14
3581	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3582	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3583	add	%i0, 64, %i0
3584	fmovd	%d30, %d0
3585	bgt,pt	%ncc, .co_aln_111_loop
3586	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3587	add	%i1, %i0, %i1
3588
3589	stda	%d0, [%i1]ASI_USER
3590	ba	.co_remain_stuff
3591	add	%i1, 8, %i1
3592	! END OF aln_111
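/*
 * Each .co_aln_* loop follows the same software pipeline; only the
 * number of doublewords carried between iterations (1 through 7)
 * differs per alignment case.  Rough pseudo-code (illustrative):
 *
 *	while (bytes > 0) {
 *		next = block_load(src);		! ldda [%i0]ASI_BLK_P
 *		out = concat(carry, head(next)); ! fmovd register shuffle
 *		block_init_store(dst, out);	! stxa ASI_STBI_AIUS +
 *						!   stda ASI_BLK_AIUS
 *		carry = tail(next);
 *		src += 64; bytes -= 64;		! dst tracked as dst - src
 *	}
 */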
3593
3594.co_aln_110:
3595! Alignment off by 16 bytes
3596	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3597	ldd	[%i0], %d0
3598	ldd	[%i0+8], %d2
3599	add	%i0, 16, %i0
3600	sub	%i2, 16, %i2
3601	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3602	and	%i2, 0x7f, %i2		! residue bytes in %i2
3603	sub	%i1, %i0, %i1
3604.co_aln_110_loop:
3605	ldda	[%i0]ASI_BLK_P,%d16		! block load
3606	subcc	%o3, 64, %o3
3607	fmovd	%d16, %d4
3608	fmovd	%d18, %d6
3609	fmovd	%d20, %d8
3610	fmovd	%d22, %d10
3611	fmovd	%d24, %d12
3612	fmovd	%d26, %d14
3613	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3614	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3615	add	%i0, 64, %i0
3616	fmovd	%d28, %d0
3617	fmovd	%d30, %d2
3618	bgt,pt	%ncc, .co_aln_110_loop
3619	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3620	add	%i1, %i0, %i1
3621
3622	stda	%d0, [%i1]%asi
3623	stda	%d2, [%i1+8]%asi
3624	ba	.co_remain_stuff
3625	add	%i1, 16, %i1
3626	! END OF aln_110
3627
3628.co_aln_101:
3629! Alignment off by 24 bytes
3630	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3631	ldd	[%i0], %d0
3632	ldd	[%i0+8], %d2
3633	ldd	[%i0+16], %d4
3634	add	%i0, 24, %i0
3635	sub	%i2, 24, %i2
3636	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3637	and	%i2, 0x7f, %i2		! residue bytes in %i2
3638	sub	%i1, %i0, %i1
3639.co_aln_101_loop:
3640	ldda	[%i0]ASI_BLK_P,%d16	! block load
3641	subcc	%o3, 64, %o3
3642	fmovd	%d16, %d6
3643	fmovd	%d18, %d8
3644	fmovd	%d20, %d10
3645	fmovd	%d22, %d12
3646	fmovd	%d24, %d14
3647	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3648	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3649	add	%i0, 64, %i0
3650	fmovd	%d26, %d0
3651	fmovd	%d28, %d2
3652	fmovd	%d30, %d4
3653	bgt,pt	%ncc, .co_aln_101_loop
3654	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3655	add	%i1, %i0, %i1
3656
3657	stda	%d0, [%i1]%asi
3658	stda	%d2, [%i1+8]%asi
3659	stda	%d4, [%i1+16]%asi
3660	ba	.co_remain_stuff
3661	add	%i1, 24, %i1
3662	! END OF aln_101
3663
3664.co_aln_100:
3665! Alignment off by 32 bytes
3666	ldd	[%i0], %d0
3667	ldd	[%i0+8], %d2
3668	ldd	[%i0+16],%d4
3669	ldd	[%i0+24],%d6
3670	add	%i0, 32, %i0
3671	sub	%i2, 32, %i2
3672	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3673	and	%i2, 0x7f, %i2		! residue bytes in %i2
3674	sub	%i1, %i0, %i1
3675.co_aln_100_loop:
3676	ldda	[%i0]ASI_BLK_P,%d16	! block load
3677	subcc	%o3, 64, %o3
3678	fmovd	%d16, %d8
3679	fmovd	%d18, %d10
3680	fmovd	%d20, %d12
3681	fmovd	%d22, %d14
3682	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3683	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3684	add	%i0, 64, %i0
3685	fmovd	%d24, %d0
3686	fmovd	%d26, %d2
3687	fmovd	%d28, %d4
3688	fmovd	%d30, %d6
3689	bgt,pt	%ncc, .co_aln_100_loop
3690	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3691	add	%i1, %i0, %i1
3692
3693	stda	%d0, [%i1]%asi
3694	stda	%d2, [%i1+8]%asi
3695	stda	%d4, [%i1+16]%asi
3696	stda	%d6, [%i1+24]%asi
3697	ba	.co_remain_stuff
3698	add	%i1, 32, %i1
3699	! END OF aln_100
3700
3701.co_aln_011:
3702! Alignment off by 40 bytes
3703	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3704	ldd	[%i0], %d0
3705	ldd	[%i0+8], %d2
3706	ldd	[%i0+16], %d4
3707	ldd	[%i0+24], %d6
3708	ldd	[%i0+32], %d8
3709	add	%i0, 40, %i0
3710	sub	%i2, 40, %i2
3711	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3712	and	%i2, 0x7f, %i2		! residue bytes in %i2
3713	sub	%i1, %i0, %i1
3714.co_aln_011_loop:
3715	ldda	[%i0]ASI_BLK_P,%d16	! block load
3716	subcc	%o3, 64, %o3
3717	fmovd	%d16, %d10
3718	fmovd	%d18, %d12
3719	fmovd	%d20, %d14
3720	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3721	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3722	add	%i0, 64, %i0
3723	fmovd	%d22, %d0
3724	fmovd	%d24, %d2
3725	fmovd	%d26, %d4
3726	fmovd	%d28, %d6
3727	fmovd	%d30, %d8
3728	bgt,pt	%ncc, .co_aln_011_loop
3729	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3730	add	%i1, %i0, %i1
3731
3732	stda	%d0, [%i1]%asi
3733	stda	%d2, [%i1+8]%asi
3734	stda	%d4, [%i1+16]%asi
3735	stda	%d6, [%i1+24]%asi
3736	stda	%d8, [%i1+32]%asi
3737	ba	.co_remain_stuff
3738	add	%i1, 40, %i1
3739	! END OF aln_011
3740
3741.co_aln_010:
3742! Alignment off by 48 bytes
3743	ldd	[%i0], %d0
3744	ldd	[%i0+8], %d2
3745	ldd	[%i0+16], %d4
3746	ldd	[%i0+24], %d6
3747	ldd	[%i0+32], %d8
3748	ldd	[%i0+40], %d10
3749	add	%i0, 48, %i0
3750	sub	%i2, 48, %i2
3751	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3752	and	%i2, 0x7f, %i2		! residue bytes in %i2
3753	sub	%i1, %i0, %i1
3754.co_aln_010_loop:
3755	ldda	[%i0]ASI_BLK_P,%d16	! block load
3756	subcc	%o3, 64, %o3
3757	fmovd	%d16, %d12
3758	fmovd	%d18, %d14
3759	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3760	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3761	add	%i0, 64, %i0
3762	fmovd	%d20, %d0
3763	fmovd	%d22, %d2
3764	fmovd	%d24, %d4
3765	fmovd	%d26, %d6
3766	fmovd	%d28, %d8
3767	fmovd	%d30, %d10
3768	bgt,pt	%ncc, .co_aln_010_loop
3769	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3770	add	%i1, %i0, %i1
3771
3772	stda	%d0, [%i1]%asi
3773	stda	%d2, [%i1+8]%asi
3774	stda	%d4, [%i1+16]%asi
3775	stda	%d6, [%i1+24]%asi
3776	stda	%d8, [%i1+32]%asi
3777	stda	%d10, [%i1+40]%asi
3778	ba	.co_remain_stuff
3779	add	%i1, 48, %i1
3780	! END OF aln_010
3781
3782.co_aln_001:
3783! Alignment off by 56 bytes
3784	ldd	[%i0], %d0
3785	ldd	[%i0+8], %d2
3786	ldd	[%i0+16], %d4
3787	ldd	[%i0+24], %d6
3788	ldd	[%i0+32], %d8
3789	ldd	[%i0+40], %d10
3790	ldd	[%i0+48], %d12
3791	add	%i0, 56, %i0
3792	sub	%i2, 56, %i2
3793	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3794	and	%i2, 0x7f, %i2		! residue bytes in %i2
3795	sub	%i1, %i0, %i1
3796.co_aln_001_loop:
3797	ldda	[%i0]ASI_BLK_P,%d16	! block load
3798	subcc	%o3, 64, %o3
3799	fmovd	%d16, %d14
3800	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3801	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3802	add	%i0, 64, %i0
3803	fmovd	%d18, %d0
3804	fmovd	%d20, %d2
3805	fmovd	%d22, %d4
3806	fmovd	%d24, %d6
3807	fmovd	%d26, %d8
3808	fmovd	%d28, %d10
3809	fmovd	%d30, %d12
3810	bgt,pt	%ncc, .co_aln_001_loop
3811	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3812	add	%i1, %i0, %i1
3813
3814	stda	%d0, [%i1]%asi
3815	stda	%d2, [%i1+8]%asi
3816	stda	%d4, [%i1+16]%asi
3817	stda	%d6, [%i1+24]%asi
3818	stda	%d8, [%i1+32]%asi
3819	stda	%d10, [%i1+40]%asi
3820	stda	%d12, [%i1+48]%asi
3821	ba	.co_remain_stuff
3822	add	%i1, 56, %i1
3823	! END OF aln_001
3824
3825.co_aln_000:
3826	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3827	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3828	and	%i2, 0x7f, %i2		! residue bytes in %i2
3829	sub	%i1, %i0, %i1
3830.co_aln_000_loop:
3831	ldda	[%i0]ASI_BLK_P,%d0
3832	subcc	%o3, 64, %o3
3833	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3834	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3835	add	%i0, 64, %i0
3836	bgt,pt	%ncc, .co_aln_000_loop
3837	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3838	add	%i1, %i0, %i1
3839
3840	! END OF aln_000
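/*
 * A note on the block loops above: the stxa of %g0 with the block
 * initializing ASI touches the destination line before the stda, which
 * on Niagara-class processors allocates the cache line without first
 * reading its old contents from memory; the stda then overwrites the
 * whole 64 byte block, so the read-for-ownership is avoided.
 */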
3841
3842.co_remain_stuff:
3843	subcc	%i2, 31, %i2		! adjust length to allow cc test
3844	ble,pt	%ncc, .co_aln_31
3845	nop
3846.co_aln_32:
3847	ldx	[%i0], %o4		! move 32 bytes
3848	subcc	%i2, 32, %i2		! decrement length count by 32
3849	stxa	%o4, [%i1]%asi
3850	ldx	[%i0+8], %o4
3851	stxa	%o4, [%i1+8]%asi
3852	ldx	[%i0+16], %o4
3853	add	%i0, 32, %i0		! increase src ptr by 32
3854	stxa	%o4, [%i1+16]%asi
3855	ldx	[%i0-8], %o4
3856	add	%i1, 32, %i1		! increase dst ptr by 32
3857	bgu,pt	%ncc, .co_aln_32	! repeat if at least 32 bytes left
3858	stxa	%o4, [%i1-8]%asi
3859.co_aln_31:
3860	addcc	%i2, 24, %i2		! adjust count to be off by 7
3861	ble,pt	%ncc, .co_aln_7		! skip if 7 or fewer bytes left
3862	nop				!
3863.co_aln_15:
3864	ldx	[%i0], %o4		! move 8 bytes
3865	add	%i0, 8, %i0		! increase src ptr by 8
3866	subcc	%i2, 8, %i2		! decrease count by 8
3867	add	%i1, 8, %i1		! increase dst ptr by 8
3868	bgu,pt	%ncc, .co_aln_15
3869	stxa	%o4, [%i1-8]%asi
3870.co_aln_7:
3871	addcc	%i2, 7, %i2		! finish adjustment of remaining count
3872	bz,pt	%ncc, .co_exit		! exit if finished
3873	cmp	%i2, 4
3874	blt,pt	%ncc, .co_unaln3x	! skip if less than 4 bytes left
3875	nop				!
3876	ld	[%i0], %o4		! move 4 bytes
3877	add	%i0, 4, %i0		! increase src ptr by 4
3878	add	%i1, 4, %i1		! increase dst ptr by 4
3879	subcc	%i2, 4, %i2		! decrease count by 4
3880	bnz	.co_unaln3x
3881	stwa	%o4, [%i1-4]%asi
3882	ba	.co_exit
3883	nop
3884
3885	! destination alignment code
3886.co_big_d1:
3887	ldub	[%i0], %o4		! move a byte
3888	add	%i0, 1, %i0
3889	stba	%o4, [%i1]ASI_USER
3890	add	%i1, 1, %i1
3891	andcc	%i1, 2, %o3
3892	bz,pt	%ncc, .co_big_d2f
3893	sub	%i2, 1, %i2
3894.co_big_d2:
3895	ldub	[%i0], %o4		! move a half-word (src align unknown)
3896	ldub	[%i0+1], %o3
3897	add	%i0, 2, %i0
3898	sll	%o4, 8, %o4		! position
3899	or	%o4, %o3, %o4		! merge
3900	stha	%o4, [%i1]ASI_USER
3901	add	%i1, 2, %i1
3902	andcc	%i1, 4, %o3		! is dest longword aligned
3903	bz,pt	%ncc, .co_big_d4f
3904	sub	%i2, 2, %i2
3905.co_big_d4:				! dest is at least word aligned
3906	nop
3907	ldub	[%i0], %o4		! move a word (src align unknown)
3908	ldub	[%i0+1], %o3
3909	sll	%o4, 24, %o4		! position
3910	sll	%o3, 16, %o3		! position
3911	or	%o4, %o3, %o3		! merge
3912	ldub	[%i0+2], %o4
3913	sll	%o4, 8, %o4		! position
3914	or	%o4, %o3, %o3		! merge
3915	ldub	[%i0+3], %o4
3916	or	%o4, %o3, %o4		! merge
3917	stwa	%o4,[%i1]ASI_USER	! store four bytes
3918	add	%i0, 4, %i0		! adjust src by 4
3919	add	%i1, 4, %i1		! adjust dest by 4
3920	ba	.co_big_d4f
3921	sub	%i2, 4, %i2		! adjust count by 4
3922
3923
3924	! Dst is on 8 byte boundary; src is not.
3925.co_big_unal8:
3926	andcc	%i1, 0x3f, %o3		! is dst 64-byte block aligned?
3927	bz	%ncc, .co_unalnsrc
3928	sub	%o3, 64, %o3		! %o3 will be multiple of 8
3929	neg	%o3			! bytes until dest is 64 byte aligned
3930	sub	%i2, %o3, %i2		! update cnt with bytes to be moved
3931	! Move bytes according to source alignment
3932	andcc	%i0, 0x1, %o4
3933	bnz	%ncc, .co_unalnbyte	! check for byte alignment
3934	nop
3935	andcc	%i0, 2, %o4		! check for half word alignment
3936	bnz	%ncc, .co_unalnhalf
3937	nop
3938	! Src is word aligned, move bytes until dest 64 byte aligned
3939.co_unalnword:
3940	ld	[%i0], %o4		! load 4 bytes
3941	stwa	%o4, [%i1]%asi		! and store 4 bytes
3942	ld	[%i0+4], %o4		! load 4 bytes
3943	add	%i0, 8, %i0		! increase src ptr by 8
3944	stwa	%o4, [%i1+4]%asi	! and store 4 bytes
3945	subcc	%o3, 8, %o3		! decrease count by 8
3946	bnz	%ncc, .co_unalnword
3947	add	%i1, 8, %i1		! increase dst ptr by 8
3948	ba	.co_unalnsrc
3949	nop
3950
3951	! Src is half-word aligned, move bytes until dest 64 byte aligned
3952.co_unalnhalf:
3953	lduh	[%i0], %o4		! load 2 bytes
3954	sllx	%o4, 32, %i3		! shift left
3955	lduw	[%i0+2], %o4
3956	or	%o4, %i3, %i3
3957	sllx	%i3, 16, %i3
3958	lduh	[%i0+6], %o4
3959	or	%o4, %i3, %i3
3960	stxa	%i3, [%i1]ASI_USER
3961	add	%i0, 8, %i0
3962	subcc	%o3, 8, %o3
3963	bnz	%ncc, .co_unalnhalf
3964	add	%i1, 8, %i1
3965	ba	.co_unalnsrc
3966	nop
3967
3968	! Src is byte aligned, move bytes until dest is 64 byte aligned
3969.co_unalnbyte:
3970	sub	%i1, %i0, %i1		! share pointer advance
3971.co_unalnbyte_loop:
3972	ldub	[%i0], %o4
3973	sllx	%o4, 56, %i3
3974	lduh	[%i0+1], %o4
3975	sllx	%o4, 40, %o4
3976	or	%o4, %i3, %i3
3977	lduh	[%i0+3], %o4
3978	sllx	%o4, 24, %o4
3979	or	%o4, %i3, %i3
3980	lduh	[%i0+5], %o4
3981	sllx	%o4, 8, %o4
3982	or	%o4, %i3, %i3
3983	ldub	[%i0+7], %o4
3984	or	%o4, %i3, %i3
3985	stxa	%i3, [%i1+%i0]ASI_USER
3986	subcc	%o3, 8, %o3
3987	bnz	%ncc, .co_unalnbyte_loop
3988	add	%i0, 8, %i0
3989	add	%i1,%i0, %i1		! restore pointer
3990
3991	! Destination is now block (64 byte) aligned, src is not 8 byte aligned
3992.co_unalnsrc:
3993	andn	%i2, 0x3f, %i3		! %i3 is multiple of block size
3994	and	%i2, 0x3f, %i2		! residue bytes in %i2
3995	add	%i2, 64, %i2		! Ensure we don't load beyond
3996	sub	%i3, 64, %i3		! end of source buffer
3997
3998	andn	%i0, 0x3f, %o4		! %o4 has block aligned src address
3999	prefetch [%o4 + (3 * CACHE_LINE)], #one_read
4000	alignaddr %i0, %g0, %g0		! generate %gsr
4001	add	%i0, %i3, %i0		! advance %i0 to after blocks
4002	!
4003	! Determine source alignment to select the correct 8 byte offset variant
4004	andcc	%i0, 0x20, %o3
4005	brnz,pn	%o3, .co_unaln_1
4006	andcc	%i0, 0x10, %o3
4007	brnz,pn	%o3, .co_unaln_01
4008	andcc	%i0, 0x08, %o3
4009	brz,a	%o3, .co_unaln_000
4010	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4011	ba	.co_unaln_001
4012	nop
4013.co_unaln_01:
4014	brnz,a	%o3, .co_unaln_011
4015	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4016	ba	.co_unaln_010
4017	nop
4018.co_unaln_1:
4019	brnz,pn	%o3, .co_unaln_11
4020	andcc	%i0, 0x08, %o3
4021	brnz,a	%o3, .co_unaln_101
4022	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4023	ba	.co_unaln_100
4024	nop
4025.co_unaln_11:
4026	brz,pn	%o3, .co_unaln_110
4027	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
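/*
 * The .co_unaln_* loops below software-pipeline the copy with
 * faligndata: each iteration block-loads 64 new source bytes, and
 * faligndata extracts 8 aligned output bytes from each adjacent pair
 * of doubles at the byte offset set up by alignaddr above (via %gsr),
 * conceptually out = align(d[i], d[i+1]).  The label suffix encodes
 * src address bits <5:3>, which select how far into the first block
 * the source starts and hence how many doubles are preloaded before
 * the loop.
 */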
4028
4029.co_unaln_111:
4030	ldd	[%o4+56], %d14
4031.co_unaln_111_loop:
4032	add	%o4, 64, %o4
4033	ldda	[%o4]ASI_BLK_P, %d16
4034	faligndata %d14, %d16, %d48
4035	faligndata %d16, %d18, %d50
4036	faligndata %d18, %d20, %d52
4037	faligndata %d20, %d22, %d54
4038	faligndata %d22, %d24, %d56
4039	faligndata %d24, %d26, %d58
4040	faligndata %d26, %d28, %d60
4041	faligndata %d28, %d30, %d62
4042	fmovd	%d30, %d14
4043	stda	%d48, [%i1]ASI_BLK_AIUS
4044	subcc	%i3, 64, %i3
4045	add	%i1, 64, %i1
4046	bgu,pt	%ncc, .co_unaln_111_loop
4047	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4048	ba	.co_unaln_done
4049	nop
4050
4051.co_unaln_110:
4052	ldd	[%o4+48], %d12
4053	ldd	[%o4+56], %d14
4054.co_unaln_110_loop:
4055	add	%o4, 64, %o4
4056	ldda	[%o4]ASI_BLK_P, %d16
4057	faligndata %d12, %d14, %d48
4058	faligndata %d14, %d16, %d50
4059	faligndata %d16, %d18, %d52
4060	faligndata %d18, %d20, %d54
4061	faligndata %d20, %d22, %d56
4062	faligndata %d22, %d24, %d58
4063	faligndata %d24, %d26, %d60
4064	faligndata %d26, %d28, %d62
4065	fmovd	%d28, %d12
4066	fmovd	%d30, %d14
4067	stda	%d48, [%i1]ASI_BLK_AIUS
4068	subcc	%i3, 64, %i3
4069	add	%i1, 64, %i1
4070	bgu,pt	%ncc, .co_unaln_110_loop
4071	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4072	ba	.co_unaln_done
4073	nop
4074
4075.co_unaln_101:
4076	ldd	[%o4+40], %d10
4077	ldd	[%o4+48], %d12
4078	ldd	[%o4+56], %d14
4079.co_unaln_101_loop:
4080	add	%o4, 64, %o4
4081	ldda	[%o4]ASI_BLK_P, %d16
4082	faligndata %d10, %d12, %d48
4083	faligndata %d12, %d14, %d50
4084	faligndata %d14, %d16, %d52
4085	faligndata %d16, %d18, %d54
4086	faligndata %d18, %d20, %d56
4087	faligndata %d20, %d22, %d58
4088	faligndata %d22, %d24, %d60
4089	faligndata %d24, %d26, %d62
4090	fmovd	%d26, %d10
4091	fmovd	%d28, %d12
4092	fmovd	%d30, %d14
4093	stda	%d48, [%i1]ASI_BLK_AIUS
4094	subcc	%i3, 64, %i3
4095	add	%i1, 64, %i1
4096	bgu,pt	%ncc, .co_unaln_101_loop
4097	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4098	ba	.co_unaln_done
4099	nop
4100
4101.co_unaln_100:
4102	ldd	[%o4+32], %d8
4103	ldd	[%o4+40], %d10
4104	ldd	[%o4+48], %d12
4105	ldd	[%o4+56], %d14
4106.co_unaln_100_loop:
4107	add	%o4, 64, %o4
4108	ldda	[%o4]ASI_BLK_P, %d16
4109	faligndata %d8, %d10, %d48
4110	faligndata %d10, %d12, %d50
4111	faligndata %d12, %d14, %d52
4112	faligndata %d14, %d16, %d54
4113	faligndata %d16, %d18, %d56
4114	faligndata %d18, %d20, %d58
4115	faligndata %d20, %d22, %d60
4116	faligndata %d22, %d24, %d62
4117	fmovd	%d24, %d8
4118	fmovd	%d26, %d10
4119	fmovd	%d28, %d12
4120	fmovd	%d30, %d14
4121	stda	%d48, [%i1]ASI_BLK_AIUS
4122	subcc	%i3, 64, %i3
4123	add	%i1, 64, %i1
4124	bgu,pt	%ncc, .co_unaln_100_loop
4125	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4126	ba	.co_unaln_done
4127	nop
4128
4129.co_unaln_011:
4130	ldd	[%o4+24], %d6
4131	ldd	[%o4+32], %d8
4132	ldd	[%o4+40], %d10
4133	ldd	[%o4+48], %d12
4134	ldd	[%o4+56], %d14
4135.co_unaln_011_loop:
4136	add	%o4, 64, %o4
4137	ldda	[%o4]ASI_BLK_P, %d16
4138	faligndata %d6, %d8, %d48
4139	faligndata %d8, %d10, %d50
4140	faligndata %d10, %d12, %d52
4141	faligndata %d12, %d14, %d54
4142	faligndata %d14, %d16, %d56
4143	faligndata %d16, %d18, %d58
4144	faligndata %d18, %d20, %d60
4145	faligndata %d20, %d22, %d62
4146	fmovd	%d22, %d6
4147	fmovd	%d24, %d8
4148	fmovd	%d26, %d10
4149	fmovd	%d28, %d12
4150	fmovd	%d30, %d14
4151	stda	%d48, [%i1]ASI_BLK_AIUS
4152	subcc	%i3, 64, %i3
4153	add	%i1, 64, %i1
4154	bgu,pt	%ncc, .co_unaln_011_loop
4155	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4156	ba	.co_unaln_done
4157	nop
4158
4159.co_unaln_010:
4160	ldd	[%o4+16], %d4
4161	ldd	[%o4+24], %d6
4162	ldd	[%o4+32], %d8
4163	ldd	[%o4+40], %d10
4164	ldd	[%o4+48], %d12
4165	ldd	[%o4+56], %d14
4166.co_unaln_010_loop:
4167	add	%o4, 64, %o4
4168	ldda	[%o4]ASI_BLK_P, %d16
4169	faligndata %d4, %d6, %d48
4170	faligndata %d6, %d8, %d50
4171	faligndata %d8, %d10, %d52
4172	faligndata %d10, %d12, %d54
4173	faligndata %d12, %d14, %d56
4174	faligndata %d14, %d16, %d58
4175	faligndata %d16, %d18, %d60
4176	faligndata %d18, %d20, %d62
4177	fmovd	%d20, %d4
4178	fmovd	%d22, %d6
4179	fmovd	%d24, %d8
4180	fmovd	%d26, %d10
4181	fmovd	%d28, %d12
4182	fmovd	%d30, %d14
4183	stda	%d48, [%i1]ASI_BLK_AIUS
4184	subcc	%i3, 64, %i3
4185	add	%i1, 64, %i1
4186	bgu,pt	%ncc, .co_unaln_010_loop
4187	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4188	ba	.co_unaln_done
4189	nop
4190
4191.co_unaln_001:
4192	ldd	[%o4+8], %d2
4193	ldd	[%o4+16], %d4
4194	ldd	[%o4+24], %d6
4195	ldd	[%o4+32], %d8
4196	ldd	[%o4+40], %d10
4197	ldd	[%o4+48], %d12
4198	ldd	[%o4+56], %d14
4199.co_unaln_001_loop:
4200	add	%o4, 64, %o4
4201	ldda	[%o4]ASI_BLK_P, %d16
4202	faligndata %d2, %d4, %d48
4203	faligndata %d4, %d6, %d50
4204	faligndata %d6, %d8, %d52
4205	faligndata %d8, %d10, %d54
4206	faligndata %d10, %d12, %d56
4207	faligndata %d12, %d14, %d58
4208	faligndata %d14, %d16, %d60
4209	faligndata %d16, %d18, %d62
4210	fmovd	%d18, %d2
4211	fmovd	%d20, %d4
4212	fmovd	%d22, %d6
4213	fmovd	%d24, %d8
4214	fmovd	%d26, %d10
4215	fmovd	%d28, %d12
4216	fmovd	%d30, %d14
4217	stda	%d48, [%i1]ASI_BLK_AIUS
4218	subcc	%i3, 64, %i3
4219	add	%i1, 64, %i1
4220	bgu,pt	%ncc, .co_unaln_001_loop
4221	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4222	ba	.co_unaln_done
4223	nop
4224
4225.co_unaln_000:
4226	ldda	[%o4]ASI_BLK_P, %d0
4227.co_unaln_000_loop:
4228	add	%o4, 64, %o4
4229	ldda	[%o4]ASI_BLK_P, %d16
4230	faligndata %d0, %d2, %d48
4231	faligndata %d2, %d4, %d50
4232	faligndata %d4, %d6, %d52
4233	faligndata %d6, %d8, %d54
4234	faligndata %d8, %d10, %d56
4235	faligndata %d10, %d12, %d58
4236	faligndata %d12, %d14, %d60
4237	faligndata %d14, %d16, %d62
4238	fmovd	%d16, %d0
4239	fmovd	%d18, %d2
4240	fmovd	%d20, %d4
4241	fmovd	%d22, %d6
4242	fmovd	%d24, %d8
4243	fmovd	%d26, %d10
4244	fmovd	%d28, %d12
4245	fmovd	%d30, %d14
4246	stda	%d48, [%i1]ASI_BLK_AIUS
4247	subcc	%i3, 64, %i3
4248	add	%i1, 64, %i1
4249	bgu,pt	%ncc, .co_unaln_000_loop
4250	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4251
4252.co_unaln_done:
4253	! Handle trailing bytes, 64 to 127
4254	! Dest long word aligned, Src not long word aligned
4255	cmp	%i2, 15
4256	bleu	%ncc, .co_unaln_short
4257
4258	andn	%i2, 0x7, %i3		! %i3 is multiple of 8
4259	and	%i2, 0x7, %i2		! residue bytes in %i2
4260	add	%i2, 8, %i2
4261	sub	%i3, 8, %i3		! ensure we don't load past end of src
4262	andn	%i0, 0x7, %o4		! %o4 has long word aligned src address
4263	add	%i0, %i3, %i0		! advance %i0 to after multiple of 8
4264	ldd	[%o4], %d0		! fetch partial word
4265.co_unaln_by8:
4266	ldd	[%o4+8], %d2
4267	add	%o4, 8, %o4
4268	faligndata %d0, %d2, %d16
4269	subcc	%i3, 8, %i3
4270	stda	%d16, [%i1]%asi
4271	fmovd	%d2, %d0
4272	bgu,pt	%ncc, .co_unaln_by8
4273	add	%i1, 8, %i1
4274
4275.co_unaln_short:
4276	cmp	%i2, 8
4277	blt,pt	%ncc, .co_unalnfin
4278	nop
4279	ldub	[%i0], %o4
4280	sll	%o4, 24, %o3
4281	ldub	[%i0+1], %o4
4282	sll	%o4, 16, %o4
4283	or	%o4, %o3, %o3
4284	ldub	[%i0+2], %o4
4285	sll	%o4, 8, %o4
4286	or	%o4, %o3, %o3
4287	ldub	[%i0+3], %o4
4288	or	%o4, %o3, %o3
4289	stwa	%o3, [%i1]%asi
4290	ldub	[%i0+4], %o4
4291	sll	%o4, 24, %o3
4292	ldub	[%i0+5], %o4
4293	sll	%o4, 16, %o4
4294	or	%o4, %o3, %o3
4295	ldub	[%i0+6], %o4
4296	sll	%o4, 8, %o4
4297	or	%o4, %o3, %o3
4298	ldub	[%i0+7], %o4
4299	or	%o4, %o3, %o3
4300	stwa	%o3, [%i1+4]%asi
4301	add	%i0, 8, %i0
4302	add	%i1, 8, %i1
4303	sub	%i2, 8, %i2
4304.co_unalnfin:
4305	cmp	%i2, 4
4306	blt,pt	%ncc, .co_unalnz
4307	tst	%i2
4308	ldub	[%i0], %o3		! read byte
4309	subcc	%i2, 4, %i2		! reduce count by 4
4310	sll	%o3, 24, %o3		! position
4311	ldub	[%i0+1], %o4
4312	sll	%o4, 16, %o4		! position
4313	or	%o4, %o3, %o3		! merge
4314	ldub	[%i0+2], %o4
4315	sll	%o4, 8, %o4		! position
4316	or	%o4, %o3, %o3		! merge
4317	add	%i1, 4, %i1		! advance dst by 4
4318	ldub	[%i0+3], %o4
4319	add	%i0, 4, %i0		! advance src by 4
4320	or	%o4, %o3, %o4		! merge
4321	bnz,pt	%ncc, .co_unaln3x
4322	stwa	%o4, [%i1-4]%asi
4323	ba	.co_exit
4324	nop
4325.co_unalnz:
4326	bz,pt	%ncc, .co_exit
4327	wr	%l5, %g0, %gsr		! restore %gsr
4328.co_unaln3x:				! Exactly 1, 2, or 3 bytes remain
4329	subcc	%i2, 1, %i2		! reduce count for cc test
4330	ldub	[%i0], %o4		! load one byte
4331	bz,pt	%ncc, .co_exit
4332	stba	%o4, [%i1]%asi		! store one byte
4333	ldub	[%i0+1], %o4		! load second byte
4334	subcc	%i2, 1, %i2
4335	bz,pt	%ncc, .co_exit
4336	stba	%o4, [%i1+1]%asi	! store second byte
4337	ldub	[%i0+2], %o4		! load third byte
4338	stba	%o4, [%i1+2]%asi	! store third byte
4339.co_exit:
4340	brnz	%g1, .co_fp_restore
4341	nop
4342	FZERO
4343	wr	%g1, %g0, %fprs
4344	ba,pt	%ncc, .co_ex2
4345	membar	#Sync
4346.co_fp_restore:
4347	BLD_FP_FROMSTACK(%o4)
4348.co_ex2:
4349	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
4350	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
4351	ret
4352	restore %g0, 0, %o0
4353
4354.copyout_err:
4355	ldn	[THREAD_REG + T_COPYOPS], %o4
4356	brz	%o4, 2f
4357	nop
4358	ldn	[%o4 + CP_COPYOUT], %g2
4359	jmp	%g2
4360	nop
43612:
4362	retl
4363	mov	-1, %o0
4364
4365#else	/* NIAGARA_IMPL */
4366.do_copyout:
4367	!
4368	! Check the length and bail if zero.
4369	!
4370	tst	%o2
4371	bnz,pt	%ncc, 1f
4372	nop
4373	retl
4374	clr	%o0
43751:
4376	sethi	%hi(copyio_fault), %o4
4377	or	%o4, %lo(copyio_fault), %o4
4378	sethi	%hi(copyio_fault_nowindow), %o3
4379	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
4380	or	%o3, %lo(copyio_fault_nowindow), %o3
4381	membar	#Sync
4382	stn	%o3, [THREAD_REG + T_LOFAULT]
4383
4384	mov	%o0, SAVE_SRC
4385	mov	%o1, SAVE_DST
4386	mov	%o2, SAVE_COUNT
4387
4388	!
4389	! Check to see if we're more than SMALL_LIMIT (7 bytes).
4390	! Run in leaf mode, using the %o regs as our input regs.
4391	!
4392	subcc	%o2, SMALL_LIMIT, %o3
4393	bgu,a,pt %ncc, .dco_ns
4394	or	%o0, %o1, %o3
4395	!
4396	! What was previously ".small_copyout"
4397	! Do full differenced copy.
4398	!
4399.dcobcp:
4400	sub	%g0, %o2, %o3		! negate count
4401	add	%o0, %o2, %o0		! make %o0 point at the end
4402	add	%o1, %o2, %o1		! make %o1 point at the end
4403	ba,pt	%ncc, .dcocl
4404	ldub	[%o0 + %o3], %o4	! load first byte
4405	!
4406	! %o0 and %o1 point at the end and remain pointing at the end
4407	! of their buffers. We pull things out by adding %o3 (which is
4408	! the negation of the length) to the buffer end, which gives us
4409	! the current location in the buffers. By incrementing %o3 we walk
4410	! through both buffers without having to bump each buffer's
4411	! pointer. A very fast 4 instruction loop.
4412	!
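	!
	! A C sketch of the loop below (illustrative only):
	!	off = -count;
	!	from += count;		! both now point one past the end
	!	to += count;
	!	do {
	!		to[off] = from[off];
	!	} while (++off < 0);
	! One load, one store, one increment/test and one branch per byte.
	!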
4413	.align 16
4414.dcocl:
4415	stba	%o4, [%o1 + %o3]ASI_USER
4416	inccc	%o3
4417	bl,a,pt	%ncc, .dcocl
4418	ldub	[%o0 + %o3], %o4
4419	!
4420	! We're done. Go home.
4421	!
4422	membar	#Sync
4423	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
4424	retl
4425	clr	%o0
4426	!
4427	! Try aligned copies from here.
4428	!
4429.dco_ns:
4430	! %o0 = kernel addr (to be copied from)
4431	! %o1 = user addr (to be copied to)
4432	! %o2 = length
4433	! %o3 = %o0 | %o1 (used for alignment checking)
4434	! %o4 is alternate lo_fault
4435	! %o5 is original lo_fault
4436	!
4437	! See if we're single byte aligned. If we are, check the
4438	! limit for single byte copies. If we're smaller or equal,
4439	! bounce to the byte for byte copy loop. Otherwise do it in
4440	! HW (if enabled).
4441	!
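	!
	! Each alignment case below applies the same limit test; a sketch
	! (hw_copy_limit_N are the tunables read from memory):
	!	if (hw_copy_limit_N == 0)	! HW copy disabled
	!		byte/aligned copy loop;
	!	else if (count <= hw_copy_limit_N)
	!		byte/aligned copy loop;
	!	else
	!		.big_copyout;		! HW assisted copy
	!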
4442	btst	1, %o3
4443	bz,pt	%icc, .dcoh8
4444	btst	7, %o3
4445	!
4446	! Single byte aligned. Do we do it via HW or
4447	! byte for byte? Do a quick check, with no memory
4448	! references, to pick up small copies.
4449	!
4450	sethi	%hi(hw_copy_limit_1), %o3
4451	!
4452	! Big enough that we need to check the HW limit for
4453	! this size copy.
4454	!
4455	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
4456	!
4457	! Is HW copy on? If not, do everything byte for byte.
4458	!
4459	tst	%o3
4460	bz,pn	%icc, .dcobcp
4461	subcc	%o3, %o2, %o3
4462	!
4463	! If we're less than or equal to the single byte copy limit,
4464	! bop to the copy loop.
4465	!
4466	bge,pt	%ncc, .dcobcp
4467	nop
4468	!
4469	! We're big enough and copy is on. Do it with HW.
4470	!
4471	ba,pt	%ncc, .big_copyout
4472	nop
4473.dcoh8:
4474	!
4475	! 8 byte aligned?
4476	!
4477	bnz,a	%ncc, .dcoh4
4478	btst	3, %o3
4479	!
4480	! See if we're in the "small range".
4481	! If so, go off and do the copy.
4482	! If not, load the hard limit. %o3 is
4483	! available for reuse.
4484	!
4485	sethi	%hi(hw_copy_limit_8), %o3
4486	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
4487	!
4488	! If it's zero, there's no HW bcopy.
4489	! Bop off to the aligned copy.
4490	!
4491	tst	%o3
4492	bz,pn	%icc, .dcos8
4493	subcc	%o3, %o2, %o3
4494	!
4495	! We're negative if our size is larger than hw_copy_limit_8.
4496	!
4497	bge,pt	%ncc, .dcos8
4498	nop
4499	!
4500	! HW assist is on and we're large enough. Do it.
4501	!
4502	ba,pt	%ncc, .big_copyout
4503	nop
4504.dcos8:
4505	!
4506	! Housekeeping for copy loops. Uses same idea as in the byte for
4507	! byte copy loop above.
4508	!
4509	add	%o0, %o2, %o0
4510	add	%o1, %o2, %o1
4511	sub	%g0, %o2, %o3
4512	ba,pt	%ncc, .dodebc
4513	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
4514	!
4515	! 4 byte aligned?
4516	!
4517.dcoh4:
4518	bnz,pn	%ncc, .dcoh2
4519	!
4520	! See if we're in the "small range".
4521	! If so, go off and do the copy.
4522	! If not, load the hard limit. %o3 is
4523	! available for reuse.
4524	!
4525	sethi	%hi(hw_copy_limit_4), %o3
4526	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
4527	!
4528	! If it's zero, there's no HW bcopy.
4529	! Bop off to the aligned copy.
4530	!
4531	tst	%o3
4532	bz,pn	%icc, .dcos4
4533	subcc	%o3, %o2, %o3
4534	!
4535	! We're negative if our size is larger than hw_copy_limit_4.
4536	!
4537	bge,pt	%ncc, .dcos4
4538	nop
4539	!
4540	! HW assist is on and we're large enough. Do it.
4541	!
4542	ba,pt	%ncc, .big_copyout
4543	nop
4544.dcos4:
4545	add	%o0, %o2, %o0
4546	add	%o1, %o2, %o1
4547	sub	%g0, %o2, %o3
4548	ba,pt	%ncc, .dodfbc
4549	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
4550	!
4551	! We must be 2 byte aligned. Off we go.
4552	! The check for small copies was done in the
4553	! delay at .dcoh4
4554	!
4555.dcoh2:
4556	ble	%ncc, .dcos2
4557	sethi	%hi(hw_copy_limit_2), %o3
4558	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
4559	tst	%o3
4560	bz,pn	%icc, .dcos2
4561	subcc	%o3, %o2, %o3
4562	bge,pt	%ncc, .dcos2
4563	nop
4564	!
4565	! HW is on and we're big enough. Do it.
4566	!
4567	ba,pt	%ncc, .big_copyout
4568	nop
4569.dcos2:
4570	add	%o0, %o2, %o0
4571	add	%o1, %o2, %o1
4572	sub	%g0, %o2, %o3
4573	ba,pt	%ncc, .dodtbc
4574	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
4575.small_copyout:
4576	!
4577	! Why are we doing this AGAIN? There are certain conditions in
4578	! big_copyout that will cause us to forego the HW assisted copies
4579	! and bounce back to a non-HW assisted copy. This dispatches those
4580	! copies. Note that we branch around this in the main line code.
4581	!
4582	! We make no check for limits or HW enablement here. We've
4583	! already been told that we're a poster child so just go off
4584	! and do it.
4585	!
4586	or	%o0, %o1, %o3
4587	btst	1, %o3
4588	bnz	%icc, .dcobcp		! Most likely
4589	btst	7, %o3
4590	bz	%icc, .dcos8
4591	btst	3, %o3
4592	bz	%icc, .dcos4
4593	nop
4594	ba,pt	%ncc, .dcos2
4595	nop
4596	.align 32
4597.dodebc:
4598	ldx	[%o0 + %o3], %o4
4599	deccc	%o2
4600	stxa	%o4, [%o1 + %o3]ASI_USER
4601	bg,pt	%ncc, .dodebc
4602	addcc	%o3, 8, %o3
4603	!
4604	! End of copy loop. Check to see if we're done. Most
4605	! eight byte aligned copies end here.
4606	!
4607	bz,pt	%ncc, .dcofh
4608	nop
4609	!
4610	! Something is left - do it byte for byte.
4611	!
4612	ba,pt	%ncc, .dcocl
4613	ldub	[%o0 + %o3], %o4	! load next byte
4614	!
4615	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
4616	!
4617	.align 32
4618.dodfbc:
4619	lduw	[%o0 + %o3], %o4
4620	deccc	%o2
4621	sta	%o4, [%o1 + %o3]ASI_USER
4622	bg,pt	%ncc, .dodfbc
4623	addcc	%o3, 4, %o3
4624	!
4625	! End of copy loop. Check to see if we're done. Most
4626	! four byte aligned copies end here.
4627	!
4628	bz,pt	%ncc, .dcofh
4629	nop
4630	!
4631	! Something is left. Do it byte for byte.
4632	!
4633	ba,pt	%ncc, .dcocl
4634	ldub	[%o0 + %o3], %o4	! load next byte
4635	!
4636	! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
4637	! copy.
4638	!
4639	.align 32
4640.dodtbc:
4641	lduh	[%o0 + %o3], %o4
4642	deccc	%o2
4643	stha	%o4, [%o1 + %o3]ASI_USER
4644	bg,pt	%ncc, .dodtbc
4645	addcc	%o3, 2, %o3
4646	!
4647	! End of copy loop. Anything left?
4648	!
4649	bz,pt	%ncc, .dcofh
4650	nop
4651	!
4652	! Deal with the last byte
4653	!
4654	ldub	[%o0 + %o3], %o4
4655	stba	%o4, [%o1 + %o3]ASI_USER
4656.dcofh:
4657	membar	#Sync
4658	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
4659	retl
4660	clr	%o0
4661
4662.big_copyout:
4663	! We're going to go off and do a block copy.
4664	! Switch fault handlers and grab a window. We
4665	! don't do a membar #Sync since we've touched only
4666	! kernel data to this point.
4667	stn	%o4, [THREAD_REG + T_LOFAULT]
4668
4669	! Copyouts that reach here are larger than 256 bytes. The
4670	! hw_copy_limit_1 is set to 256. Never set this limit to less
4671	! than 128 bytes.
4672	save	%sp, -SA(MINFRAME), %sp
4673.do_block_copyout:
4674
4675	! Swap src/dst since the code below is memcpy code
4676	! and memcpy/bcopy have different calling sequences
4677	mov	%i1, %i5
4678	mov	%i0, %i1
4679	mov	%i5, %i0
4680
4681	! Block (64 bytes) align the destination.
4682	andcc	%i0, 0x3f, %i3		! is dst block aligned
4683	bz	%ncc, copyout_blalign	! dst already block aligned
4684	sub	%i3, 0x40, %i3
4685	neg	%i3			! bytes until dst is 64 byte aligned
4686	sub	%i2, %i3, %i2		! update i2 with new count
4687
4688	! Based on source and destination alignment do
4689	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
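	! A sketch of this pre-alignment copy (illustrative only):
	!	n = 64 - (dst & 0x3f);		! bytes until dst block aligned
	!	count -= n;
	!	if (((src | dst) & 0x7) == 0)		copy n in 8 byte steps
	!	else if (((src | dst) & 0x3) == 0)	copy n in 4 byte steps
	!	else if (((src | dst) & 0x1) == 0)	copy n in 2 byte steps
	!	else					copy n byte by byte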
4690
4691	! Is dst & src 8B aligned
4692	or	%i0, %i1, %o2
4693	andcc	%o2, 0x7, %g0
4694	bz	%ncc, .co_alewdcp
4695	nop
4696
4697	! Is dst & src 4B aligned
4698	andcc	%o2, 0x3, %g0
4699	bz	%ncc, .co_alwdcp
4700	nop
4701
4702	! Is dst & src 2B aligned
4703	andcc	%o2, 0x1, %g0
4704	bz	%ncc, .co_alhlfwdcp
4705	nop
4706
4707	! 1B aligned
47081:	ldub	[%i1], %o2
4709	stba	%o2, [%i0]ASI_USER
4710	inc	%i1
4711	deccc	%i3
4712	bgu,pt	%ncc, 1b
4713	inc	%i0
4714
4715	ba	copyout_blalign
4716	nop
4717
4718	! dst & src 4B aligned
4719.co_alwdcp:
4720	ld	[%i1], %o2
4721	sta	%o2, [%i0]ASI_USER
4722	add	%i1, 0x4, %i1
4723	subcc	%i3, 0x4, %i3
4724	bgu,pt	%ncc, .co_alwdcp
4725	add	%i0, 0x4, %i0
4726
4727	ba	copyout_blalign
4728	nop
4729
4730	! dst & src 2B aligned
4731.co_alhlfwdcp:
4732	lduh	[%i1], %o2
4733	stuha	%o2, [%i0]ASI_USER
4734	add	%i1, 0x2, %i1
4735	subcc	%i3, 0x2, %i3
4736	bgu,pt	%ncc, .co_alhlfwdcp
4737	add	%i0, 0x2, %i0
4738
4739	ba	copyout_blalign
4740	nop
4741
4742	! dst & src 8B aligned
4743.co_alewdcp:
4744	ldx	[%i1], %o2
4745	stxa	%o2, [%i0]ASI_USER
4746	add	%i1, 0x8, %i1
4747	subcc	%i3, 0x8, %i3
4748	bgu,pt	%ncc, .co_alewdcp
4749	add	%i0, 0x8, %i0
4750
4751	! Now destination is block (64 bytes) aligned
4752copyout_blalign:
4753	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
4754	sub	%i2, %i3, %i2		! Residue bytes in %i2
4755
4756	mov	ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
4757
4758	andcc	%i1, 0xf, %o2		! is src quadword aligned
4759	bz,pn	%xcc, .co_blkcpy	! src offset in %o2 (low 4 bits)
4760	nop
4761	cmp	%o2, 0x8
4762	bg	.co_upper_double
4763	nop
4764	bl	.co_lower_double
4765	nop
4766	! Falls through when the source offset is equal to 8, i.e.
4767	! the source is double word aligned.
4768	! In this case no shift/merge of the data is required.
4769	! In this case no shift/merge of data is required
4770
4771	sub	%i1, %o2, %i1		! align the src at 16 bytes.
4772	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
4773	prefetch [%l0+0x0], #one_read
4774	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4775.co_loop0:
4776	add	%i1, 0x10, %i1
4777	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4778	prefetch [%l0+0x40], #one_read
4779
4780	stxa	%l3, [%i0+0x0]%asi
4781	stxa	%l4, [%i0+0x8]%asi
4782
4783	add	%i1, 0x10, %i1
4784	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4785
4786	stxa	%l5, [%i0+0x10]%asi
4787	stxa	%l2, [%i0+0x18]%asi
4788
4789	add	%i1, 0x10, %i1
4790	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4791
4792	stxa	%l3, [%i0+0x20]%asi
4793	stxa	%l4, [%i0+0x28]%asi
4794
4795	add	%i1, 0x10, %i1
4796	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4797
4798	stxa	%l5, [%i0+0x30]%asi
4799	stxa	%l2, [%i0+0x38]%asi
4800
4801	add	%l0, 0x40, %l0
4802	subcc	%i3, 0x40, %i3
4803	bgu,pt	%xcc, .co_loop0
4804	add	%i0, 0x40, %i0
4805	ba	.co_blkdone
4806	add	%i1, %o2, %i1		! increment the source by src offset
4807					! the src offset was stored in %o2
4808
4809.co_lower_double:
4810
4811	sub	%i1, %o2, %i1		! align the src at 16 bytes.
4812	sll	%o2, 3, %o0		! %o0 left shift
4813	mov	0x40, %o1
4814	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
4815	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
4816	prefetch [%l0+0x0], #one_read
4817	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l2;
4818					! %l3 has complete data
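/*
 * ALIGN_DATA conceptually shifts the doubles loaded so far left by the
 * source offset and fills the vacated low bytes from the next double,
 * roughly out = (prev << lshift) | (next >> rshift), with the bit
 * counts lshift in %o0 and rshift = 64 - lshift in %o1; each use of it
 * below yields two source-aligned doubles per 16 byte read.
 */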
4819.co_loop1:
4820	add	%i1, 0x10, %i1
4821	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has partial data
4822							! for this read.
4823	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
4824							! into %l2 and %l3
4825	prefetch [%l0+0x40], #one_read
4826
4827	stxa	%l2, [%i0+0x0]%asi
4828	stxa	%l3, [%i0+0x8]%asi
4829
4830	add	%i1, 0x10, %i1
4831	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4832	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
4833							! %l4 from previous read
4834							! into %l4 and %l5
4835	stxa	%l4, [%i0+0x10]%asi
4836	stxa	%l5, [%i0+0x18]%asi
4837
4838	! Repeat the same for next 32 bytes.
4839
4840	add	%i1, 0x10, %i1
4841	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4842	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
4843
4844	stxa	%l2, [%i0+0x20]%asi
4845	stxa	%l3, [%i0+0x28]%asi
4846
4847	add	%i1, 0x10, %i1
4848	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4849	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
4850
4851	stxa	%l4, [%i0+0x30]%asi
4852	stxa	%l5, [%i0+0x38]%asi
4853
4854	add	%l0, 0x40, %l0
4855	subcc	%i3, 0x40, %i3
4856	bgu,pt	%xcc, .co_loop1
4857	add	%i0, 0x40, %i0
4858	ba	.co_blkdone
4859	add	%i1, %o2, %i1		! increment the source by src offset
4860					! the src offset was stored in %o2
4861
4862.co_upper_double:
4863
4864	sub	%i1, %o2, %i1		! align the src at 16 bytes.
4865	sub	%o2, 0x8, %o0
4866	sll	%o0, 3, %o0		! %o0 left shift
4867	mov	0x40, %o1
4868	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
4869	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
4870	prefetch [%l0+0x0], #one_read
4871	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l3
4872							! for this read and
4873							! no data in %l2
4874.co_loop2:
4875	add	%i1, 0x10, %i1
4876	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has complete data
4877							! and %l5 has partial
4878	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
4879							! into %l3 and %l4
4880	prefetch [%l0+0x40], #one_read
4881
4882	stxa	%l3, [%i0+0x0]%asi
4883	stxa	%l4, [%i0+0x8]%asi
4884
4885	add	%i1, 0x10, %i1
4886	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4887	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
4888							! %l5 from previous read
4889							! into %l5 and %l2
4890
4891	stxa	%l5, [%i0+0x10]%asi
4892	stxa	%l2, [%i0+0x18]%asi
4893
4894	! Repeat the same for next 32 bytes.
4895
4896	add	%i1, 0x10, %i1
4897	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4898	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
4899
4900	stxa	%l3, [%i0+0x20]%asi
4901	stxa	%l4, [%i0+0x28]%asi
4902
4903	add	%i1, 0x10, %i1
4904	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4905	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
4906
4907	stxa	%l5, [%i0+0x30]%asi
4908	stxa	%l2, [%i0+0x38]%asi
4909
4910	add	%l0, 0x40, %l0
4911	subcc	%i3, 0x40, %i3
4912	bgu,pt	%xcc, .co_loop2
4913	add	%i0, 0x40, %i0
4914	ba	.co_blkdone
4915	add	%i1, %o2, %i1		! increment the source by src offset
4916					! the src offset was stored in %o2
4917
4918
4919	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
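	! Each ldda with the quad LDD ASI loads 16 bytes into an even/odd
	! register pair (e.g. %l0/%l1), so four loads fetch a whole 64
	! byte block; the block-init store ASI in %asi lets the eight
	! stxa's overwrite the destination line without first reading it
	! from memory.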
4920.co_blkcpy:
4921
4922	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
4923	prefetch [%o0+0x0], #one_read
49241:
4925	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
4926	add	%i1, 0x10, %i1
4927	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4928	add	%i1, 0x10, %i1
4929
4930	prefetch [%o0+0x40], #one_read
4931
4932	stxa	%l0, [%i0+0x0]%asi
4933
4934	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4935	add	%i1, 0x10, %i1
4936	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
4937	add	%i1, 0x10, %i1
4938
4939	stxa	%l1, [%i0+0x8]%asi
4940	stxa	%l2, [%i0+0x10]%asi
4941	stxa	%l3, [%i0+0x18]%asi
4942	stxa	%l4, [%i0+0x20]%asi
4943	stxa	%l5, [%i0+0x28]%asi
4944	stxa	%l6, [%i0+0x30]%asi
4945	stxa	%l7, [%i0+0x38]%asi
4946
4947	add	%o0, 0x40, %o0
4948	subcc	%i3, 0x40, %i3
4949	bgu,pt	%xcc, 1b
4950	add	%i0, 0x40, %i0
4951
4952.co_blkdone:
4953	membar	#Sync
4954
4955	brz,pt	%i2, .copyout_exit
4956	nop
4957
4958	! Handle trailing bytes
4959	cmp	%i2, 0x8
4960	blu,pt	%ncc, .co_residue
4961	nop
4962
4963	! Can we do some 8B ops
4964	or	%i1, %i0, %o2
4965	andcc	%o2, 0x7, %g0
4966	bnz	%ncc, .co_last4
4967	nop
4968
4969	! Do 8byte ops as long as possible
4970.co_last8:
4971	ldx	[%i1], %o2
4972	stxa	%o2, [%i0]ASI_USER
4973	add	%i1, 0x8, %i1
4974	sub	%i2, 0x8, %i2
4975	cmp	%i2, 0x8
4976	bgu,pt	%ncc, .co_last8
4977	add	%i0, 0x8, %i0
4978
4979	brz,pt	%i2, .copyout_exit
4980	nop
4981
4982	ba	.co_residue
4983	nop
4984
4985.co_last4:
4986	! Can we do 4B ops
4987	andcc	%o2, 0x3, %g0
4988	bnz	%ncc, .co_last2
4989	nop
49901:
4991	ld	[%i1], %o2
4992	sta	%o2, [%i0]ASI_USER
4993	add	%i1, 0x4, %i1
4994	sub	%i2, 0x4, %i2
4995	cmp	%i2, 0x4
4996	bgu,pt	%ncc, 1b
4997	add	%i0, 0x4, %i0
4998
4999	brz,pt	%i2, .copyout_exit
5000	nop
5001
5002	ba	.co_residue
5003	nop
5004
5005.co_last2:
5006	! Can we do 2B ops
5007	andcc	%o2, 0x1, %g0
5008	bnz	%ncc, .co_residue
5009	nop
5010
50111:
5012	lduh	[%i1], %o2
5013	stuha	%o2, [%i0]ASI_USER
5014	add	%i1, 0x2, %i1
5015	sub	%i2, 0x2, %i2
5016	cmp	%i2, 0x2
5017	bgu,pt	%ncc, 1b
5018	add	%i0, 0x2, %i0
5019
5020	brz,pt	%i2, .copyout_exit
5021	nop
5022
5023	! Copy the residue as byte copy
5024.co_residue:
5025	ldub	[%i1], %i4
5026	stba	%i4, [%i0]ASI_USER
5027	inc	%i1
5028	deccc	%i2
5029	bgu,pt	%xcc, .co_residue
5030	inc	%i0
5031
5032.copyout_exit:
5033	membar	#Sync
5034	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
5035	ret
5036	restore	%g0, 0, %o0
5037
5038.copyout_err:
5039	ldn	[THREAD_REG + T_COPYOPS], %o4
5040	brz	%o4, 2f
5041	nop
5042	ldn	[%o4 + CP_COPYOUT], %g2
5043	jmp	%g2
5044	nop
50452:
5046	retl
5047	mov	-1, %o0
5048#endif	/* NIAGARA_IMPL */
5049	SET_SIZE(copyout)
5050
5051
5052	ENTRY(xcopyout)
5053	sethi	%hi(.xcopyout_err), REAL_LOFAULT
5054	b	.do_copyout
5055	or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
5056.xcopyout_err:
5057	ldn	[THREAD_REG + T_COPYOPS], %o4
5058	brz	%o4, 2f
5059	nop
5060	ldn	[%o4 + CP_XCOPYOUT], %g2
5061	jmp	%g2
5062	nop
50632:
5064	retl
5065	mov	%g1, %o0
5066	SET_SIZE(xcopyout)
5067
5068	ENTRY(xcopyout_little)
5069	sethi	%hi(.little_err), %o4
5070	ldn	[THREAD_REG + T_LOFAULT], %o5
5071	or	%o4, %lo(.little_err), %o4
5072	membar	#Sync			! sync error barrier
5073	stn	%o4, [THREAD_REG + T_LOFAULT]
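	! Copy %o2 bytes to user space through the little-endian
	! secondary ASI, reversing the buffer as it is copied: %o3 runs
	! from -len up to 0 as the common index, while %o0 is biased and
	! decremented by 2 each pass so that [%o0 + %o3] steps backwards
	! one source byte per iteration, giving dst[i] = src[len - 1 - i].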
5074
5075	subcc	%g0, %o2, %o3
5076	add	%o0, %o2, %o0
5077	bz,pn	%ncc, 2f		! check for zero bytes
5078	sub	%o2, 1, %o4
5079	add	%o0, %o4, %o0		! start w/last byte
5080	add	%o1, %o2, %o1
5081	ldub	[%o0+%o3], %o4
5082
50831:	stba	%o4, [%o1+%o3]ASI_AIUSL
5084	inccc	%o3
5085	sub	%o0, 2, %o0		! step back to next src byte
5086	bcc,a,pt %ncc, 1b
5087	ldub	[%o0+%o3], %o4
5088
50892:	membar	#Sync			! sync error barrier
5090	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
5091	retl
5092	mov	%g0, %o0		! return (0)
5093	SET_SIZE(xcopyout_little)
5094
5095/*
5096 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
5097 */
5098
5099	ENTRY(copyin)
5100	sethi	%hi(.copyin_err), REAL_LOFAULT
5101	or	REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
5102
5103#if !defined(NIAGARA_IMPL)
5104.do_copyin:
5105	tst	%o2			! check for zero count;  quick exit
5106	bz,pt	%ncc, .ci_smallqx
5107	mov	%o0, SAVE_SRC
5108	mov	%o1, SAVE_DST
5109	mov	%o2, SAVE_COUNT
5110	cmp	%o2, FP_COPY		! check for small copy/leaf case
5111	bgt,pt	%ncc, .ci_copy_more
5112	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
5113/*
5114 * Small copy in code
5115 *
5116 */
5117	sethi	%hi(copyio_fault_nowindow), %o3
5118	or	%o3, %lo(copyio_fault_nowindow), %o3
5119	membar	#Sync
5120	stn	%o3, [THREAD_REG + T_LOFAULT]
5121
5122	mov	ASI_USER, %asi
5123	cmp	%o2, SHORTCOPY		! make sure there is enough to align
5124	ble,pt	%ncc, .ci_smallest
5125	andcc	%o1, 0x7, %o3		! is dest long word aligned
5126	bnz,pn	%ncc, .ci_align
5127	andcc	%o1, 1, %o3		! is dest byte aligned
5128
5129! Destination is long word aligned
5130.ci_al_src:
5131	andcc	%o0, 7, %o3
5132	brnz,pt	%o3, .ci_src_dst_unal8
5133	nop
5134/*
5135 * Special case for handling when src and dest are both long word aligned
5136 * and the total data to move is less than FP_COPY bytes.
5137 * Also handles finish up for large block moves, so the count may be less than 32 bytes.
5138 */
5139.ci_medlong:
5140	subcc	%o2, 31, %o2		! adjust length to allow cc test
5141	ble,pt	%ncc, .ci_medl31
5142	nop
5143.ci_medl32:
5144	ldxa	[%o0]%asi, %o4		! move 32 bytes
5145	subcc	%o2, 32, %o2		! decrement length count by 32
5146	stx	%o4, [%o1]
5147	ldxa	[%o0+8]%asi, %o4
5148	stx	%o4, [%o1+8]
5149	ldxa	[%o0+16]%asi, %o4
5150	add	%o0, 32, %o0		! increase src ptr by 32
5151	stx	%o4, [%o1+16]
5152	ldxa	[%o0-8]%asi, %o4
5153	add	%o1, 32, %o1		! increase dst ptr by 32
5154	bgu,pt	%ncc, .ci_medl32	! repeat if at least 32 bytes left
5155	stx	%o4, [%o1-8]
5156.ci_medl31:
5157	addcc	%o2, 24, %o2		! adjust count to be off by 7
5158	ble,pt	%ncc, .ci_medl7		! skip if 7 or fewer bytes left
5159	nop
5160.ci_medl8:
5161	ldxa	[%o0]%asi, %o4		! move 8 bytes
5162	add	%o0, 8, %o0		! increase src ptr by 8
5163	subcc	%o2, 8, %o2		! decrease count by 8
5164	add	%o1, 8, %o1		! increase dst ptr by 8
5165	bgu,pt	%ncc, .ci_medl8
5166	stx	%o4, [%o1-8]
5167.ci_medl7:
5168	addcc	%o2, 7, %o2		! finish adjustment of remaining count
5169	bnz,pt	%ncc, .ci_small4	! do final bytes if not finished
5170	nop
5171.ci_smallx:				! finish up and exit
5172	membar	#Sync
5173	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5174.ci_smallqx:
5175	retl
5176	mov	%g0, %o0
5177
5178.ci_small4:
5179	cmp	%o2, 4
5180	blt,pt	%ncc, .ci_small3x	! skip if less than 4 bytes left
5181	nop				!
5182	lda	[%o0]%asi, %o4		! move 4 bytes
5183	add	%o0, 4, %o0		! increase src ptr by 4
5184	add	%o1, 4, %o1		! increase dst ptr by 4
5185	subcc	%o2, 4, %o2		! decrease count by 4
5186	bz	%ncc, .ci_smallx
5187	stw	%o4, [%o1-4]
5188
5189.ci_small3x:				! Exactly 1, 2, or 3 bytes remain
5190	subcc	%o2, 1, %o2		! reduce count for cc test
5191	lduba	[%o0]%asi, %o4		! load one byte
5192	bz,pt	%ncc, .ci_smallx
5193	stb	%o4, [%o1]		! store one byte
5194	lduba	[%o0+1]%asi, %o4	! load second byte
5195	subcc	%o2, 1, %o2
5196	bz,pt	%ncc, .ci_smallx
5197	stb	%o4, [%o1+1]		! store second byte
5198	lduba	[%o0+2]%asi, %o4	! load third byte
5199	ba	.ci_smallx
5200	stb	%o4, [%o1+2]		! store third byte
5201
5202.ci_smallest:				! 7 or fewer bytes remain
5203	cmp	%o2, 4
5204	blt,pt	%ncc, .ci_small3x
5205	nop
5206	lduba	[%o0]%asi, %o4		! read byte
5207	subcc	%o2, 4, %o2		! reduce count by 4
5208	stb	%o4, [%o1]		! write byte
5209	lduba	[%o0+1]%asi, %o4	! repeat for total of 4 bytes
5210	add	%o0, 4, %o0		! advance src by 4
5211	stb	%o4, [%o1+1]
5212	lduba	[%o0-2]%asi, %o4
5213	add	%o1, 4, %o1		! advance dst by 4
5214	stb	%o4, [%o1-2]
5215	lduba	[%o0-1]%asi, %o4
5216	bnz,pt	%ncc, .ci_small3x
5217	stb	%o4, [%o1-1]
5218	membar	#Sync
5219	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5220	retl
5221	mov	%g0, %o0
5222
5223.ci_align:
5224	bnz,pt	%ncc, .ci_al_d1
5225.ci_al_d1f:				! dest is now half word aligned
5226	andcc	%o1, 2, %o3		! is dest word aligned
5227	bnz,pt	%ncc, .ci_al_d2
5228.ci_al_d2f:				! dest is now word aligned
5229	andcc	%o1, 4, %o3		! is dest longword aligned?
5230	bz,pt	%ncc, .ci_al_src
5231	nop
5232.ci_al_d4:				! dest is word aligned;  src is unknown
5233	lduba	[%o0]%asi, %o4		! move a word (src align unknown)
5234	lduba	[%o0+1]%asi, %o3
5235	sll	%o4, 24, %o4		! position
5236	sll	%o3, 16, %o3		! position
5237	or	%o4, %o3, %o3		! merge
5238	lduba	[%o0+2]%asi, %o4
5239	sll	%o4, 8, %o4		! position
5240	or	%o4, %o3, %o3		! merge
5241	lduba	[%o0+3]%asi, %o4
5242	or	%o4, %o3, %o4		! merge
5243	stw	%o4,[%o1]		! store four bytes
5244	add	%o0, 4, %o0		! adjust src by 4
5245	add	%o1, 4, %o1		! adjust dest by 4
5246	sub	%o2, 4, %o2		! adjust count by 4
5247	andcc	%o0, 7, %o3		! check for src long word alignment
5248	brz,pt	%o3, .ci_medlong
5249.ci_src_dst_unal8:
5250	! dst is 8-byte aligned, src is not
5251	! Size is less than FP_COPY
5252	! Following code is to select for alignment
5253	andcc	%o0, 0x3, %o3		! test word alignment
5254	bz,pt	%ncc, .ci_medword
5255	nop
5256	andcc	%o0, 0x1, %o3		! test halfword alignment
5257	bnz,pt	%ncc, .ci_med_byte	! go to byte move if not halfword
5258	andcc	%o0, 0x2, %o3		! test which byte alignment
5259	ba	.ci_medhalf
5260	nop
5261.ci_al_d1:				! align dest to half word
5262	lduba	[%o0]%asi, %o4		! move a byte
5263	add	%o0, 1, %o0
5264	stb	%o4, [%o1]
5265	add	%o1, 1, %o1
5266	andcc	%o1, 2, %o3		! is dest word aligned
5267	bz,pt	%ncc, .ci_al_d2f
5268	sub	%o2, 1, %o2
5269.ci_al_d2:				! align dest to word
5270	lduba	[%o0]%asi, %o4		! move a half-word (src align unknown)
5271	lduba	[%o0+1]%asi, %o3
5272	sll	%o4, 8, %o4		! position
5273	or	%o4, %o3, %o4		! merge
5274	sth	%o4, [%o1]
5275	add	%o0, 2, %o0
5276	add	%o1, 2, %o1
5277	andcc	%o1, 4, %o3		! is dest longword aligned?
5278	bz,pt	%ncc, .ci_al_src
5279	sub	%o2, 2, %o2
5280	ba	.ci_al_d4
5281	nop
5282/*
5283 * Handle all cases where src and dest are aligned on word
5284 * boundaries. Use unrolled loops for better performance.
5285 * This option wins over standard large data move when
5286 * source and destination is in cache for medium
5287 * source and destination are in cache for medium
5288 */
5289.ci_medword:
5290	subcc	%o2, 31, %o2		! adjust length to allow cc test
5291	ble,pt	%ncc, .ci_medw31
5292	nop
5293.ci_medw32:
5294	lda	[%o0]%asi, %o4		! move a block of 32 bytes
5295	stw	%o4, [%o1]
5296	lda	[%o0+4]%asi, %o4
5297	stw	%o4, [%o1+4]
5298	lda	[%o0+8]%asi, %o4
5299	stw	%o4, [%o1+8]
5300	lda	[%o0+12]%asi, %o4
5301	stw	%o4, [%o1+12]
5302	lda	[%o0+16]%asi, %o4
5303	stw	%o4, [%o1+16]
5304	lda	[%o0+20]%asi, %o4
5305	subcc	%o2, 32, %o2		! decrement length count
5306	stw	%o4, [%o1+20]
5307	lda	[%o0+24]%asi, %o4
5308	add	%o0, 32, %o0		! increase src ptr by 32
5309	stw	%o4, [%o1+24]
5310	lda	[%o0-4]%asi, %o4
5311	add	%o1, 32, %o1		! increase dst ptr by 32
5312	bgu,pt	%ncc, .ci_medw32	! repeat if at least 32 bytes left
5313	stw	%o4, [%o1-4]
5314.ci_medw31:
5315	addcc	%o2, 24, %o2		! adjust count to be off by 7
5316	ble,pt	%ncc, .ci_medw7		! skip if 7 or fewer bytes left
5317	nop				!
5318.ci_medw15:
5319	lda	[%o0]%asi, %o4		! move a block of 8 bytes
5320	subcc	%o2, 8, %o2		! decrement length count
5321	stw	%o4, [%o1]
5322	add	%o0, 8, %o0		! increase src ptr by 8
5323	lda	[%o0-4]%asi, %o4
5324	add	%o1, 8, %o1		! increase dst ptr by 8
5325	bgu,pt	%ncc, .ci_medw15
5326	stw	%o4, [%o1-4]
5327.ci_medw7:
5328	addcc	%o2, 7, %o2		! finish adjustment of remaining count
5329	bz,pt	%ncc, .ci_smallx	! exit if finished
5330	cmp	%o2, 4
5331	blt,pt	%ncc, .ci_small3x	! skip if less than 4 bytes left
5332	nop				!
5333	lda	[%o0]%asi, %o4		! move 4 bytes
5334	add	%o0, 4, %o0		! increase src ptr by 4
5335	add	%o1, 4, %o1		! increase dst ptr by 4
5336	subcc	%o2, 4, %o2		! decrease count by 4
5337	bnz	.ci_small3x
5338	stw	%o4, [%o1-4]
5339	membar	#Sync
5340	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5341	retl
5342	mov	%g0, %o0
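/*
 * Source is half word aligned: each aligned 8 byte store below is
 * assembled from a 2 + 4 + 2 byte read pattern, roughly:
 *	x  = (uint64_t)halfword(src) << 48;
 *	x |= (uint64_t)word(src + 2) << 16;
 *	x |= halfword(src + 6);
 *	store64(dst, x);
 */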
5343
5344.ci_medhalf:
5345	subcc	%o2, 31, %o2		! adjust length to allow cc test
5346	ble,pt	%ncc, .ci_medh31
5347	nop
5348.ci_medh32:				! load and store block of 32 bytes
5349	subcc	%o2, 32, %o2		! decrement length count
5350
5351	lduha	[%o0]%asi, %o4		! move 32 bytes
5352	lduwa	[%o0+2]%asi, %o3
5353	sllx	%o4, 48, %o4
5354	sllx	%o3, 16, %o3
5355	or	%o4, %o3, %o3
5356	lduha	[%o0+6]%asi, %o4
5357	or	%o4, %o3, %o4
5358	stx	%o4, [%o1]
5359
5360	lduha	[%o0+8]%asi, %o4
5361	lduwa	[%o0+10]%asi, %o3
5362	sllx	%o4, 48, %o4
5363	sllx	%o3, 16, %o3
5364	or	%o4, %o3, %o3
5365	lduha	[%o0+14]%asi, %o4
5366	or	%o4, %o3, %o4
5367	stx	%o4, [%o1+8]
5368
5369	lduha	[%o0+16]%asi, %o4
5370	lduwa	[%o0+18]%asi, %o3
5371	sllx	%o4, 48, %o4
5372	sllx	%o3, 16, %o3
5373	or	%o4, %o3, %o3
5374	lduha	[%o0+22]%asi, %o4
5375	or	%o4, %o3, %o4
5376	stx	%o4, [%o1+16]
5377
5378	add	%o0, 32, %o0		! increase src ptr by 32
5379	add	%o1, 32, %o1		! increase dst ptr by 32
5380
5381	lduha	[%o0-8]%asi, %o4
5382	lduwa	[%o0-6]%asi, %o3
5383	sllx	%o4, 48, %o4
5384	sllx	%o3, 16, %o3
5385	or	%o4, %o3, %o3
5386	lduha	[%o0-2]%asi, %o4
5387	or	%o3, %o4, %o4
5388	bgu,pt	%ncc, .ci_medh32	! repeat if at least 32 bytes left
5389	stx	%o4, [%o1-8]
5390
5391.ci_medh31:
5392	addcc	%o2, 24, %o2		! adjust count to be off by 7
5393	ble,pt	%ncc, .ci_medh7		! skip if 7 or fewer bytes left
5394	nop				!
5395.ci_medh15:
5396	lduha	[%o0]%asi, %o4		! move 8 bytes
5397	subcc	%o2, 8, %o2		! decrement length count
5398	lduwa	[%o0+2]%asi, %o3
5399	sllx	%o4, 48, %o4
5400	sllx	%o3, 16, %o3
5401	or	%o4, %o3, %o3
5402	add	%o1, 8, %o1		! increase dst ptr by 8
5403	lduha	[%o0+6]%asi, %o4
5404	add	%o0, 8, %o0		! increase src ptr by 8
5405	or	%o4, %o3, %o4
5406	bgu,pt	%ncc, .ci_medh15
5407	stx	%o4, [%o1-8]
5408.ci_medh7:
5409	addcc	%o2, 7, %o2		! finish adjustment of remaining count
5410	bz,pt	%ncc, .ci_smallx	! exit if finished
5411	cmp	%o2, 4
5412	blt,pt	%ncc, .ci_small3x	! skip if less than 4 bytes left
5413	nop				!
5414	lduha	[%o0]%asi, %o4
5415	sll	%o4, 16, %o4
5416	lduha	[%o0+2]%asi, %o3
5417	or	%o3, %o4, %o4
5418	subcc	%o2, 4, %o2
5419	add	%o0, 4, %o0
5420	add	%o1, 4, %o1
5421	bnz	.ci_small3x
5422	stw	%o4, [%o1-4]
5423	membar	#Sync
5424	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5425	retl
5426	mov	%g0, %o0
5427
5428	.align 16
5429.ci_med_byte:
5430	bnz,pt	%ncc, .ci_medbh32a	! go to correct byte move
5431	subcc	%o2, 31, %o2		! adjust length to allow cc test
5432	ble,pt	%ncc, .ci_medb31
5433	nop
5434.ci_medb32:				! Alignment 1 or 5
5435	subcc	%o2, 32, %o2		! decrement length count
5436
5437	lduba	[%o0]%asi, %o4		! load and store a block of 32 bytes
5438	sllx	%o4, 56, %o3
5439	lduha	[%o0+1]%asi, %o4
5440	sllx	%o4, 40, %o4
5441	or	%o4, %o3, %o3
5442	lduwa	[%o0+3]%asi, %o4
5443	sllx	%o4, 8, %o4
5444	or	%o4, %o3, %o3
5445	lduba	[%o0+7]%asi, %o4
5446	or	%o4, %o3, %o4
5447	stx	%o4, [%o1]
5448
5449	lduba	[%o0+8]%asi, %o4
5450	sllx	%o4, 56, %o3
5451	lduha	[%o0+9]%asi, %o4
5452	sllx	%o4, 40, %o4
5453	or	%o4, %o3, %o3
5454	lduwa	[%o0+11]%asi, %o4
5455	sllx	%o4, 8, %o4
5456	or	%o4, %o3, %o3
5457	lduba	[%o0+15]%asi, %o4
5458	or	%o4, %o3, %o4
5459	stx	%o4, [%o1+8]
5460
5461	lduba	[%o0+16]%asi, %o4
5462	sllx	%o4, 56, %o3
5463	lduha	[%o0+17]%asi, %o4
5464	sllx	%o4, 40, %o4
5465	or	%o4, %o3, %o3
5466	lduwa	[%o0+19]%asi, %o4
5467	sllx	%o4, 8, %o4
5468	or	%o4, %o3, %o3
5469	lduba	[%o0+23]%asi, %o4
5470	or	%o4, %o3, %o4
5471	stx	%o4, [%o1+16]
5472
5473	add	%o0, 32, %o0		! increase src ptr by 32
5474	add	%o1, 32, %o1		! increase dst ptr by 32
5475
5476	lduba	[%o0-8]%asi, %o4
5477	sllx	%o4, 56, %o3
5478	lduha	[%o0-7]%asi, %o4
5479	sllx	%o4, 40, %o4
5480	or	%o4, %o3, %o3
5481	lduwa	[%o0-5]%asi, %o4
5482	sllx	%o4, 8, %o4
5483	or	%o4, %o3, %o3
5484	lduba	[%o0-1]%asi, %o4
5485	or	%o4, %o3, %o4
5486	bgu,pt	%ncc, .ci_medb32	! repeat if at least 32 bytes left
5487	stx	%o4, [%o1-8]
5488
5489.ci_medb31:				! 31 or fewer bytes remaining
5490	addcc	%o2, 24, %o2		! adjust count to be off by 7
5491	ble,pt	%ncc, .ci_medb7		! skip if 7 or fewer bytes left
5492	nop				!
5493.ci_medb15:
5494
5495	lduba	[%o0]%asi, %o4		! load and store a block of 8 bytes
5496	subcc	%o2, 8, %o2		! decrement length count
5497	sllx	%o4, 56, %o3
5498	lduha	[%o0+1]%asi, %o4
5499	sllx	%o4, 40, %o4
5500	or	%o4, %o3, %o3
5501	lduwa	[%o0+3]%asi, %o4
5502	add	%o1, 8, %o1		! increase dst ptr by 8
5503	sllx	%o4, 8, %o4
5504	or	%o4, %o3, %o3
5505	lduba	[%o0+7]%asi, %o4
5506	add	%o0, 8, %o0		! increase src ptr by 8
5507	or	%o4, %o3, %o4
5508	bgu,pt	%ncc, .ci_medb15
5509	stx	%o4, [%o1-8]
5510.ci_medb7:
5511	addcc	%o2, 7, %o2		! finish adjustment of remaining count
5512	bz,pt	%ncc, .ci_smallx	! exit if finished
5513	cmp	%o2, 4
5514	blt,pt	%ncc, .ci_small3x	! skip if less than 4 bytes left
5515	nop				!
5516	lduba	[%o0]%asi, %o4		! move 4 bytes
5517	sll	%o4, 24, %o3
5518	lduha	[%o0+1]%asi, %o4
5519	sll	%o4, 8, %o4
5520	or	%o4, %o3, %o3
5521	lduba	[%o0+3]%asi, %o4
5522	or	%o4, %o3, %o4
5523	subcc	%o2, 4, %o2
5524	add	%o0, 4, %o0
5525	add	%o1, 4, %o1
5526	bnz	.ci_small3x
5527	stw	%o4, [%o1-4]
5528	membar	#Sync
5529	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5530	retl
5531	mov	%g0, %o0
5532
5533	.align 16
5534.ci_medbh32a:				! Alignment 3 or 7
5535	ble,pt	%ncc, .ci_medbh31
5536	nop
5537.ci_medbh32:				! Alignment 3 or 7
5538	subcc	%o2, 32, %o2		! decrement length count
5539
5540	lduba	[%o0]%asi, %o4		! load and store a block of 32 bytes
5541	sllx	%o4, 56, %o3
5542	lduwa	[%o0+1]%asi, %o4
5543	sllx	%o4, 24, %o4
5544	or	%o4, %o3, %o3
5545	lduha	[%o0+5]%asi, %o4
5546	sllx	%o4, 8, %o4
5547	or	%o4, %o3, %o3
5548	lduba	[%o0+7]%asi, %o4
5549	or	%o4, %o3, %o4
5550	stx	%o4, [%o1]
5551
5552	lduba	[%o0+8]%asi, %o4
5553	sllx	%o4, 56, %o3
5554	lduwa	[%o0+9]%asi, %o4
5555	sllx	%o4, 24, %o4
5556	or	%o4, %o3, %o3
5557	lduha	[%o0+13]%asi, %o4
5558	sllx	%o4, 8, %o4
5559	or	%o4, %o3, %o3
5560	lduba	[%o0+15]%asi, %o4
5561	or	%o4, %o3, %o4
5562	stx	%o4, [%o1+8]
5563
5564	lduba	[%o0+16]%asi, %o4
5565	sllx	%o4, 56, %o3
5566	lduwa	[%o0+17]%asi, %o4
5567	sllx	%o4, 24, %o4
5568	or	%o4, %o3, %o3
5569	lduha	[%o0+21]%asi, %o4
5570	sllx	%o4, 8, %o4
5571	or	%o4, %o3, %o3
5572	lduba	[%o0+23]%asi, %o4
5573	or	%o4, %o3, %o4
5574	stx	%o4, [%o1+16]
5575
5576	add	%o0, 32, %o0		! increase src ptr by 32
5577	add	%o1, 32, %o1		! increase dst ptr by 32
5578
5579	lduba	[%o0-8]%asi, %o4
5580	sllx	%o4, 56, %o3
5581	lduwa	[%o0-7]%asi, %o4
5582	sllx	%o4, 24, %o4
5583	or	%o4, %o3, %o3
5584	lduha	[%o0-3]%asi, %o4
5585	sllx	%o4, 8, %o4
5586	or	%o4, %o3, %o3
5587	lduba	[%o0-1]%asi, %o4
5588	or	%o4, %o3, %o4
5589	bgu,pt	%ncc, .ci_medbh32	! repeat if at least 32 bytes left
5590	stx	%o4, [%o1-8]
5591
5592.ci_medbh31:
5593	addcc	%o2, 24, %o2		! adjust count to be off by 7
5594	ble,pt	%ncc, .ci_medb7		! skip if 7 or fewer bytes left
5595	nop				!
5596.ci_medbh15:
5597	lduba	[%o0]%asi, %o4		! load and store a block of 8 bytes
5598	sllx	%o4, 56, %o3
5599	lduwa	[%o0+1]%asi, %o4
5600	sllx	%o4, 24, %o4
5601	or	%o4, %o3, %o3
5602	lduha	[%o0+5]%asi, %o4
5603	sllx	%o4, 8, %o4
5604	or	%o4, %o3, %o3
5605	lduba	[%o0+7]%asi, %o4
5606	or	%o4, %o3, %o4
5607	stx	%o4, [%o1]
5608	subcc	%o2, 8, %o2		! decrement length count
5609	add	%o1, 8, %o1		! increase dst ptr by 8
5610	add	%o0, 8, %o0		! increase src ptr by 8
5611	bgu,pt	%ncc, .ci_medbh15
5612	stx	%o4, [%o1-8]
5613	ba	.ci_medb7
5614	nop
5615
5616/*
5617 * End of small copy in code (no window)
5618 *
5619 */
5620
5621/*
5622 * Long copy in code (using register window and fp regs)
5623 *
5624 */
5625
5626.ci_copy_more:
5627	sethi	%hi(copyio_fault), %o3
5628	or	%o3, %lo(copyio_fault), %o3
5629	membar	#Sync
5630	stn	%o3, [THREAD_REG + T_LOFAULT]
5631/*
5632 * Following code is for large copies. We know there are at
5633 * least FP_COPY bytes available. FP regs are used, so
5634 * we save registers and fp regs before starting.
5635 */
5636	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
5637	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
5638	rd	%fprs, %g1		! check for unused fp
5639	! if fprs.fef == 0, set it.
5640	! Setting it when already set costs more than checking
5641	andcc	%g1, FPRS_FEF, %g1	! test FEF, fprs.du = fprs.dl = 0
5642	bz,pt	%ncc, .ci_fp_unused
5643	mov	ASI_USER, %asi
5644	BST_FP_TOSTACK(%o3)
5645	ba	.ci_fp_ready
5646.ci_fp_unused:
5647	prefetcha [%i0 + (1 * CACHE_LINE)]%asi, #one_read
5648	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
5649.ci_fp_ready:
5650	rd	%gsr, %l5		! save %gsr value
5651	andcc	%i1, 1, %o3		! is dest byte aligned
5652	bnz,pt	%ncc, .ci_big_d1
5653.ci_big_d1f:				! dest is now half word aligned
5654	andcc	%i1, 2, %o3
5655	bnz,pt	%ncc, .ci_big_d2
5656.ci_big_d2f:				! dest is now word aligned
5657	andcc	%i1, 4, %o3
5658	bnz,pt	%ncc, .ci_big_d4
5659.ci_big_d4f:				! dest is long word aligned
5660	andcc	%i0, 7, %o3		! is src long word aligned
5661	brnz,pt	%o3, .ci_big_unal8
5662	prefetcha [%i0 + (2 * CACHE_LINE)]%asi, #one_read
5663	! Src and dst are long word aligned
5664	! align dst to 64 byte boundary
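	! A sketch of the alignment step (illustrative only):
	!	n = 64 - (dst & 0x3f);	! bytes to move; count -= n
	!	if (n & 8)	move a single 8 byte long word
	!	while (n & 0x30)	move 16 bytes at a time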
5665	andcc	%i1, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
5666	brz,pn	%o3, .ci_al_to_64
5667	nop
5668	sub	%o3, 64, %o3		! %o3 has negative bytes to move
5669	add	%i2, %o3, %i2		! adjust remaining count
5670	andcc	%o3, 8, %o4		! odd long words to move?
5671	brz,pt	%o4, .ci_al_to_16
5672	nop
5673	add	%o3, 8, %o3
5674	ldxa	[%i0]%asi, %o4
5675	add	%i0, 8, %i0		! increment src ptr
5676	add	%i1, 8, %i1		! increment dst ptr
5677	stx	%o4, [%i1-8]
5678! Dest is aligned on 16 bytes, src 8 byte aligned
5679.ci_al_to_16:
5680	andcc	%o3, 0x30, %o4		! pair of long words to move?
5681	brz,pt	%o4, .ci_al_to_64
5682	nop
5683.ci_al_mv_16:
5684	add	%o3, 16, %o3
5685	ldxa	[%i0]%asi, %o4
5686	stx	%o4, [%i1]
5687	add	%i0, 16, %i0		! increment src ptr
5688	ldxa	[%i0-8]%asi, %o4
5689	stx	%o4, [%i1+8]
5690	andcc	%o3, 0x30, %o4
5691	brnz,pt	%o4, .ci_al_mv_16
5692	add	%i1, 16, %i1		! increment dst ptr
5693! Dest is aligned on 64 bytes, src 8 byte aligned
5694.ci_al_to_64:
5695	! Determine source alignment
5696	! to select the correct 8 byte offset variant
5697	andcc	%i0, 32, %o3
5698	brnz,pn	%o3, .ci_aln_1
5699	andcc	%i0, 16, %o3
5700	brnz,pn	%o3, .ci_aln_01
5701	andcc	%i0, 8, %o3
5702	brz,pn	%o3, .ci_aln_000
5703	prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5704	ba	.ci_aln_001
5705	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5706.ci_aln_01:
5707	brnz,pn	%o3, .ci_aln_011
5708	prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5709	ba	.ci_aln_010
5710	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5711.ci_aln_1:
5712	andcc	%i0, 16, %o3
5713	brnz,pn	%o3, .ci_aln_11
5714	andcc	%i0, 8, %o3
5715	brnz,pn	%o3, .ci_aln_101
5716	prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5717	ba	.ci_aln_100
5718	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5719.ci_aln_11:
5720	brz,pn	%o3, .ci_aln_110
5721	prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5722
5723.ci_aln_111:
5724! Alignment off by 8 bytes
5725	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5726	ldda	[%i0]%asi, %d0
5727	add	%i0, 8, %i0
5728	sub	%i2, 8, %i2
5729	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
5730	and	%i2, 0x7f, %i2		! residue bytes in %i2
5731	sub	%i1, %i0, %i1
5732.ci_aln_111_loop:
5733	ldda	[%i0]ASI_BLK_AIUS,%d16		! block load
5734	subcc	%o3, 64, %o3
5735	fmovd	%d16, %d2
5736	fmovd	%d18, %d4
5737	fmovd	%d20, %d6
5738	fmovd	%d22, %d8
5739	fmovd	%d24, %d10
5740	fmovd	%d26, %d12
5741	fmovd	%d28, %d14
5742	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
5743	stda	%d0,[%i0+%i1]ASI_BLK_P
5744	add	%i0, 64, %i0
5745	fmovd	%d30, %d0
5746	bgt,pt	%ncc, .ci_aln_111_loop
5747	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5748	add	%i1, %i0, %i1
5749
5750	std	%d0, [%i1]
5751	ba	.ci_remain_stuff
5752	add	%i1, 8, %i1
5753	! END OF aln_111
5754
5755.ci_aln_110:
5756! Alignment off by 16 bytes
5757	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5758	ldda	[%i0]%asi, %d0
5759	ldda	[%i0+8]%asi, %d2
5760	add	%i0, 16, %i0
5761	sub	%i2, 16, %i2
5762	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
5763	and	%i2, 0x7f, %i2		! residue bytes in %i2
5764	sub	%i1, %i0, %i1
5765.ci_aln_110_loop:
5766	ldda	[%i0]ASI_BLK_AIUS,%d16		! block load
5767	subcc	%o3, 64, %o3
5768	fmovd	%d16, %d4
5769	fmovd	%d18, %d6
5770	fmovd	%d20, %d8
5771	fmovd	%d22, %d10
5772	fmovd	%d24, %d12
5773	fmovd	%d26, %d14
5774	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
5775	stda	%d0,[%i0+%i1]ASI_BLK_P
5776	add	%i0, 64, %i0
5777	fmovd	%d28, %d0
5778	fmovd	%d30, %d2
5779	bgt,pt	%ncc, .ci_aln_110_loop
5780	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5781	add	%i1, %i0, %i1
5782
5783	std	%d0, [%i1]
5784	std	%d2, [%i1+8]
5785	ba	.ci_remain_stuff
5786	add	%i1, 16, %i1
5787	! END OF aln_110
5788
5789.ci_aln_101:
5790! Alignment off by 24 bytes
5791	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5792	ldda	[%i0]%asi, %d0
5793	ldda	[%i0+8]%asi, %d2
5794	ldda	[%i0+16]%asi, %d4
5795	add	%i0, 24, %i0
5796	sub	%i2, 24, %i2
5797	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
5798	and	%i2, 0x7f, %i2		! residue bytes in %i2
5799	sub	%i1, %i0, %i1
5800.ci_aln_101_loop:
5801	ldda	[%i0]ASI_BLK_AIUS,%d16	! block load
5802	subcc	%o3, 64, %o3
5803	fmovd	%d16, %d6
5804	fmovd	%d18, %d8
5805	fmovd	%d20, %d10
5806	fmovd	%d22, %d12
5807	fmovd	%d24, %d14
5808	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
5809	stda	%d0,[%i0+%i1]ASI_BLK_P
5810	add	%i0, 64, %i0
5811	fmovd	%d26, %d0
5812	fmovd	%d28, %d2
5813	fmovd	%d30, %d4
5814	bgt,pt	%ncc, .ci_aln_101_loop
5815	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5816	add	%i1, %i0, %i1
5817
5818	std	%d0, [%i1]
5819	std	%d2, [%i1+8]
5820	std	%d4, [%i1+16]
5821	ba	.ci_remain_stuff
5822	add	%i1, 24, %i1
5823	! END OF aln_101
5824
5825.ci_aln_100:
5826! Alignment off by 32 bytes
5827	ldda	[%i0]%asi, %d0
5828	ldda	[%i0+8]%asi, %d2
5829	ldda	[%i0+16]%asi,%d4
5830	ldda	[%i0+24]%asi,%d6
5831	add	%i0, 32, %i0
5832	sub	%i2, 32, %i2
5833	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
5834	and	%i2, 0x7f, %i2		! residue bytes in %i2
5835	sub	%i1, %i0, %i1
5836.ci_aln_100_loop:
5837	ldda	[%i0]ASI_BLK_AIUS,%d16	! block load
5838	subcc	%o3, 64, %o3
5839	fmovd	%d16, %d8
5840	fmovd	%d18, %d10
5841	fmovd	%d20, %d12
5842	fmovd	%d22, %d14
5843	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
5844	stda	%d0,[%i0+%i1]ASI_BLK_P
5845	add	%i0, 64, %i0
5846	fmovd	%d24, %d0
5847	fmovd	%d26, %d2
5848	fmovd	%d28, %d4
5849	fmovd	%d30, %d6
5850	bgt,pt	%ncc, .ci_aln_100_loop
5851	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5852	add	%i1, %i0, %i1
5853
5854	std	%d0, [%i1]
5855	std	%d2, [%i1+8]
5856	std	%d4, [%i1+16]
5857	std	%d6, [%i1+24]
5858	ba	.ci_remain_stuff
5859	add	%i1, 32, %i1
5860	! END OF aln_100
5861
5862.ci_aln_011:
5863! Alignment off by 40 bytes
5864	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5865	ldda	[%i0]%asi, %d0
5866	ldda	[%i0+8]%asi, %d2
5867	ldda	[%i0+16]%asi, %d4
5868	ldda	[%i0+24]%asi, %d6
5869	ldda	[%i0+32]%asi, %d8
5870	add	%i0, 40, %i0
5871	sub	%i2, 40, %i2
5872	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
5873	and	%i2, 0x7f, %i2		! residue bytes in %i2
5874	sub	%i1, %i0, %i1
5875.ci_aln_011_loop:
5876	ldda	[%i0]ASI_BLK_AIUS,%d16	! block load
5877	subcc	%o3, 64, %o3
5878	fmovd	%d16, %d10
5879	fmovd	%d18, %d12
5880	fmovd	%d20, %d14
5881	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
5882	stda	%d0,[%i0+%i1]ASI_BLK_P
5883	add	%i0, 64, %i0
5884	fmovd	%d22, %d0
5885	fmovd	%d24, %d2
5886	fmovd	%d26, %d4
5887	fmovd	%d28, %d6
5888	fmovd	%d30, %d8
5889	bgt,pt	%ncc, .ci_aln_011_loop
5890	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5891	add	%i1, %i0, %i1
5892
5893	std	%d0, [%i1]
5894	std	%d2, [%i1+8]
5895	std	%d4, [%i1+16]
5896	std	%d6, [%i1+24]
5897	std	%d8, [%i1+32]
5898	ba	.ci_remain_stuff
5899	add	%i1, 40, %i1
5900	! END OF aln_011
5901
5902.ci_aln_010:
5903! Alignment off by 48 bytes
5904	ldda	[%i0]%asi, %d0
5905	ldda	[%i0+8]%asi, %d2
5906	ldda	[%i0+16]%asi, %d4
5907	ldda	[%i0+24]%asi, %d6
5908	ldda	[%i0+32]%asi, %d8
5909	ldda	[%i0+40]%asi, %d10
5910	add	%i0, 48, %i0
5911	sub	%i2, 48, %i2
5912	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
5913	and	%i2, 0x7f, %i2		! residue bytes in %i2
5914	sub	%i1, %i0, %i1
5915.ci_aln_010_loop:
5916	ldda	[%i0]ASI_BLK_AIUS,%d16	! block load
5917	subcc	%o3, 64, %o3
5918	fmovd	%d16, %d12
5919	fmovd	%d18, %d14
5920	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
5921	stda	%d0,[%i0+%i1]ASI_BLK_P
5922	add	%i0, 64, %i0
5923	fmovd	%d20, %d0
5924	fmovd	%d22, %d2
5925	fmovd	%d24, %d4
5926	fmovd	%d26, %d6
5927	fmovd	%d28, %d8
5928	fmovd	%d30, %d10
5929	bgt,pt	%ncc, .ci_aln_010_loop
5930	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5931	add	%i1, %i0, %i1
5932
5933	std	%d0, [%i1]
5934	std	%d2, [%i1+8]
5935	std	%d4, [%i1+16]
5936	std	%d6, [%i1+24]
5937	std	%d8, [%i1+32]
5938	std	%d10, [%i1+40]
5939	ba	.ci_remain_stuff
5940	add	%i1, 48, %i1
5941	! END OF aln_010
5942
5943.ci_aln_001:
5944! Alignment off by 56 bytes
5945	ldda	[%i0]%asi, %d0
5946	ldda	[%i0+8]%asi, %d2
5947	ldda	[%i0+16]%asi, %d4
5948	ldda	[%i0+24]%asi, %d6
5949	ldda	[%i0+32]%asi, %d8
5950	ldda	[%i0+40]%asi, %d10
5951	ldda	[%i0+48]%asi, %d12
5952	add	%i0, 56, %i0
5953	sub	%i2, 56, %i2
5954	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
5955	and	%i2, 0x7f, %i2		! residue bytes in %i2
5956	sub	%i1, %i0, %i1
5957.ci_aln_001_loop:
5958	ldda	[%i0]ASI_BLK_AIUS,%d16	! block load
5959	subcc	%o3, 64, %o3
5960	fmovd	%d16, %d14
5961	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
5962	stda	%d0,[%i0+%i1]ASI_BLK_P
5963	add	%i0, 64, %i0
5964	fmovd	%d18, %d0
5965	fmovd	%d20, %d2
5966	fmovd	%d22, %d4
5967	fmovd	%d24, %d6
5968	fmovd	%d26, %d8
5969	fmovd	%d28, %d10
5970	fmovd	%d30, %d12
5971	bgt,pt	%ncc, .ci_aln_001_loop
5972	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5973	add	%i1, %i0, %i1
5974
5975	std	%d0, [%i1]
5976	std	%d2, [%i1+8]
5977	std	%d4, [%i1+16]
5978	std	%d6, [%i1+24]
5979	std	%d8, [%i1+32]
5980	std	%d10, [%i1+40]
5981	std	%d12, [%i1+48]
5982	ba	.ci_remain_stuff
5983	add	%i1, 56, %i1
5984	! END OF aln_001
5985
5986.ci_aln_000:
5987	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5988	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
5989	and	%i2, 0x7f, %i2		! residue bytes in %i2
5990	sub	%i1, %i0, %i1
5991.ci_aln_000_loop:
5992	ldda	[%i0]ASI_BLK_AIUS,%d0
5993	subcc	%o3, 64, %o3
5994	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
5995	stda	%d0,[%i0+%i1]ASI_BLK_P
5996	add	%i0, 64, %i0
5997	bgt,pt	%ncc, .ci_aln_000_loop
5998	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5999	add	%i1, %i0, %i1
6000
6001	! END OF aln_000
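	! A sketch of the software pipeline used by the .ci_aln_* loops
	! above (illustrative C-like pseudocode). With the source phase
	! off by K doublewords, K doublewords are preloaded; each
	! iteration block-loads 8 doublewords, slides them in behind the
	! carried ones, block-stores 8, and carries the last K forward:
	!
	!	preload d[0..K-1];
	!	while (blocks left) {
	!		load 8 doublewords;
	!		d[K..7] = first 8-K loaded;
	!		store d[0..7];		// 64-byte block store
	!		d[0..K-1] = last K loaded;
	!	}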
6002
6003.ci_remain_stuff:
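	! A C-like sketch of the count biasing used below (illustrative):
	!
	!	n -= 31;	// while (n > 0) move 32 bytes, n -= 32
	!	n += 24;	// net bias -7: while (n > 0) move 8 bytes
	!	n += 7;		// true residue, 0..7 bytes
	!
	! Biasing the count lets each loop test a simple condition
	! against zero instead of re-comparing against the chunk size.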
6004	subcc	%i2, 31, %i2		! adjust length to allow cc test
6005	ble,pt	%ncc, .ci_aln_31
6006	nop
6007.ci_aln_32:
6008	ldxa	[%i0]%asi, %o4		! move 32 bytes
6009	subcc	%i2, 32, %i2		! decrement length count by 32
6010	stx	%o4, [%i1]
6011	ldxa	[%i0+8]%asi, %o4
6012	stx	%o4, [%i1+8]
6013	ldxa	[%i0+16]%asi, %o4
6014	add	%i0, 32, %i0		! increase src ptr by 32
6015	stx	%o4, [%i1+16]
6016	ldxa	[%i0-8]%asi, %o4
6017	add	%i1, 32, %i1		! increase dst ptr by 32
6018	bgu,pt	%ncc, .ci_aln_32	! repeat if at least 32 bytes left
6019	stx	%o4, [%i1-8]
6020.ci_aln_31:
6021	addcc	%i2, 24, %i2		! adjust count to be off by 7
6022	ble,pt	%ncc, .ci_aln_7		! skip if 7 or fewer bytes left
6023	nop				!
6024.ci_aln_15:
6025	ldxa	[%i0]%asi, %o4		! move 8 bytes
6026	add	%i0, 8, %i0		! increase src ptr by 8
6027	subcc	%i2, 8, %i2		! decrease count by 8
6028	add	%i1, 8, %i1		! increase dst ptr by 8
6029	bgu,pt	%ncc, .ci_aln_15
6030	stx	%o4, [%i1-8]		!
6031.ci_aln_7:
6032	addcc	%i2, 7, %i2		! finish adjustment of remaining count
6033	bz,pt	%ncc, .ci_exit		! exit if finished
6034	cmp	%i2, 4
6035	blt,pt	%ncc, .ci_unaln3x	! skip if less than 4 bytes left
6036	nop				!
6037	lda	[%i0]%asi, %o4		! move 4 bytes
6038	add	%i0, 4, %i0		! increase src ptr by 4
6039	add	%i1, 4, %i1		! increase dst ptr by 4
6040	subcc	%i2, 4, %i2		! decrease count by 4
6041	bnz	.ci_unaln3x
6042	stw	%o4, [%i1-4]
6043	ba	.ci_exit
6044	nop
6045
6046	! destination alignment code
6047.ci_big_d1:
6048	lduba	[%i0]%asi, %o4		! move a byte
6049	add	%i0, 1, %i0
6050	stb	%o4, [%i1]
6051	add	%i1, 1, %i1
6052	andcc	%i1, 2, %o3
6053	bz,pt	%ncc, .ci_big_d2f
6054	sub	%i2, 1, %i2
6055.ci_big_d2:				! dest is now at least half word aligned
6056	lduba	[%i0]%asi, %o4		! move a half-word (src align unknown)
6057	lduba	[%i0+1]%asi, %o3
6058	add	%i0, 2, %i0
6059	sll	%o4, 8, %o4		! position
6060	or	%o4, %o3, %o4		! merge
6061	sth	%o4, [%i1]
6062	add	%i1, 2, %i1
6063	andcc	%i1, 4, %o3
6064	bz,pt	%ncc, .ci_big_d4f
6065	sub	%i2, 2, %i2
6066.ci_big_d4:				! dest is at least word aligned
6067	nop
6068	lduba	[%i0]%asi, %o4		! move a word (src align unknown)
6069	lduba	[%i0+1]%asi, %o3
6070	sll	%o4, 24, %o4		! position
6071	sll	%o3, 16, %o3		! position
6072	or	%o4, %o3, %o3		! merge
6073	lduba	[%i0+2]%asi, %o4
6074	sll	%o4, 8, %o4		! position
6075	or	%o4, %o3, %o3		! merge
6076	lduba	[%i0+3]%asi, %o4
6077	or	%o4, %o3, %o4		! merge
6078	stw	%o4,[%i1]		! store four bytes
6079	add	%i0, 4, %i0		! adjust src by 4
6080	add	%i1, 4, %i1		! adjust dest by 4
6081	ba	.ci_big_d4f
6082	sub	%i2, 4, %i2		! adjust count by 4
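
	! A C-like sketch of the unaligned word assembly above
	! (illustrative, big-endian):
	!
	!	uint32_t w = ((uint32_t)s[0] << 24) | ((uint32_t)s[1] << 16) |
	!	    ((uint32_t)s[2] << 8) | s[3];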
6083
6084
	! Dst is on an 8 byte boundary; src is not.
6086.ci_big_unal8:
6087	andcc	%i1, 0x3f, %o3		! is dst 64-byte block aligned?
6088	bz	%ncc, .ci_unalnsrc
6089	sub	%o3, 64, %o3		! %o3 will be multiple of 8
6090	neg	%o3			! bytes until dest is 64 byte aligned
6091	sub	%i2, %o3, %i2		! update cnt with bytes to be moved
6092	! Move bytes according to source alignment
6093	andcc	%i0, 0x1, %o4
6094	bnz	%ncc, .ci_unalnbyte	! check for byte alignment
6095	nop
6096	andcc	%i0, 2, %o4		! check for half word alignment
6097	bnz	%ncc, .ci_unalnhalf
6098	nop
6099	! Src is word aligned, move bytes until dest 64 byte aligned
6100.ci_unalnword:
6101	lda	[%i0]%asi, %o4		! load 4 bytes
6102	stw	%o4, [%i1]		! and store 4 bytes
6103	lda	[%i0+4]%asi, %o4	! load 4 bytes
6104	add	%i0, 8, %i0		! increase src ptr by 8
6105	stw	%o4, [%i1+4]		! and store 4 bytes
6106	subcc	%o3, 8, %o3		! decrease count by 8
6107	bnz	%ncc, .ci_unalnword
6108	add	%i1, 8, %i1		! increase dst ptr by 8
6109	ba	.ci_unalnsrc
6110	nop
6111
6112	! Src is half-word aligned, move bytes until dest 64 byte aligned
6113.ci_unalnhalf:
6114	lduha	[%i0]%asi, %o4		! load 2 bytes
6115	sllx	%o4, 32, %i3		! shift left
6116	lduwa	[%i0+2]%asi, %o4
6117	or	%o4, %i3, %i3
6118	sllx	%i3, 16, %i3
6119	lduha	[%i0+6]%asi, %o4
6120	or	%o4, %i3, %i3
6121	stx	%i3, [%i1]
6122	add	%i0, 8, %i0
6123	subcc	%o3, 8, %o3
6124	bnz	%ncc, .ci_unalnhalf
6125	add	%i1, 8, %i1
6126	ba	.ci_unalnsrc
6127	nop
6128
	! Src is byte aligned, move bytes until dest is 64 byte aligned
6130.ci_unalnbyte:
	sub	%i1, %i0, %i1		! bias dst by -src; share pointer advance
6132.ci_unalnbyte_loop:
6133	lduba	[%i0]%asi, %o4
6134	sllx	%o4, 56, %i3
6135	lduha	[%i0+1]%asi, %o4
6136	sllx	%o4, 40, %o4
6137	or	%o4, %i3, %i3
6138	lduha	[%i0+3]%asi, %o4
6139	sllx	%o4, 24, %o4
6140	or	%o4, %i3, %i3
6141	lduha	[%i0+5]%asi, %o4
6142	sllx	%o4, 8, %o4
6143	or	%o4, %i3, %i3
6144	lduba	[%i0+7]%asi, %o4
6145	or	%o4, %i3, %i3
6146	stx	%i3, [%i1+%i0]
6147	subcc	%o3, 8, %o3
6148	bnz	%ncc, .ci_unalnbyte_loop
6149	add	%i0, 8, %i0
	add	%i1, %i0, %i1		! restore dst pointer
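
	! A C-like sketch of the biased-pointer trick above
	! (illustrative; gather8() is a hypothetical stand-in for the
	! shift/merge byte gathering, not a real routine):
	!
	!	dst -= src;			// bias dst by -src
	!	do {
	!		*(uint64_t *)(dst + src) = gather8(src);
	!		src += 8;
	!	} while ((cnt -= 8) != 0);
	!	dst += src;			// recover the real dst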
6151
6152	! Destination is now block (64 byte aligned), src is not 8 byte aligned
6153.ci_unalnsrc:
6154	andn	%i2, 0x3f, %i3		! %i3 is multiple of block size
6155	and	%i2, 0x3f, %i2		! residue bytes in %i2
	add	%i2, 64, %i2		! ensure we don't load beyond
	sub	%i3, 64, %i3		! the end of the source buffer
6158
6159	andn	%i0, 0x3f, %o4		! %o4 has block aligned src address
6160	prefetcha [%o4 + (3 * CACHE_LINE)]%asi, #one_read
6161	alignaddr %i0, %g0, %g0		! generate %gsr
6162	add	%i0, %i3, %i0		! advance %i0 to after blocks
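	! A rough C model of the faligndata pipeline used below
	! (illustrative; ls is the bit shift implied by the %gsr
	! alignment just computed, ls = 8 * (src & 7), rs = 64 - ls):
	!
	!	uint64_t prev = <preloaded leading doublewords>;
	!	while (blocks left) {
	!		uint64_t cur = *asrc++;
	!		*dst++ = (prev << ls) | (cur >> rs);
	!		prev = cur;
	!	}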
6163	!
	! Determine source alignment to select the correct 8 byte offset case
6165	andcc	%i0, 0x20, %o3
6166	brnz,pn	%o3, .ci_unaln_1
6167	andcc	%i0, 0x10, %o3
6168	brnz,pn	%o3, .ci_unaln_01
6169	andcc	%i0, 0x08, %o3
6170	brz,a	%o3, .ci_unaln_000
6171	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6172	ba	.ci_unaln_001
6173	nop
6174.ci_unaln_01:
6175	brnz,a	%o3, .ci_unaln_011
6176	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6177	ba	.ci_unaln_010
6178	nop
6179.ci_unaln_1:
6180	brnz,pn	%o3, .ci_unaln_11
6181	andcc	%i0, 0x08, %o3
6182	brnz,a	%o3, .ci_unaln_101
6183	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6184	ba	.ci_unaln_100
6185	nop
6186.ci_unaln_11:
6187	brz,pn	%o3, .ci_unaln_110
6188	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6189
6190.ci_unaln_111:
6191	ldda	[%o4+56]%asi, %d14
6192.ci_unaln_111_loop:
6193	add	%o4, 64, %o4
6194	ldda	[%o4]ASI_BLK_AIUS, %d16
6195	faligndata %d14, %d16, %d48
6196	faligndata %d16, %d18, %d50
6197	faligndata %d18, %d20, %d52
6198	faligndata %d20, %d22, %d54
6199	faligndata %d22, %d24, %d56
6200	faligndata %d24, %d26, %d58
6201	faligndata %d26, %d28, %d60
6202	faligndata %d28, %d30, %d62
6203	fmovd	%d30, %d14
6204	stda	%d48, [%i1]ASI_BLK_P
6205	subcc	%i3, 64, %i3
6206	add	%i1, 64, %i1
6207	bgu,pt	%ncc, .ci_unaln_111_loop
6208	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6209	ba	.ci_unaln_done
6210	nop
6211
6212.ci_unaln_110:
6213	ldda	[%o4+48]%asi, %d12
6214	ldda	[%o4+56]%asi, %d14
6215.ci_unaln_110_loop:
6216	add	%o4, 64, %o4
6217	ldda	[%o4]ASI_BLK_AIUS, %d16
6218	faligndata %d12, %d14, %d48
6219	faligndata %d14, %d16, %d50
6220	faligndata %d16, %d18, %d52
6221	faligndata %d18, %d20, %d54
6222	faligndata %d20, %d22, %d56
6223	faligndata %d22, %d24, %d58
6224	faligndata %d24, %d26, %d60
6225	faligndata %d26, %d28, %d62
6226	fmovd	%d28, %d12
6227	fmovd	%d30, %d14
6228	stda	%d48, [%i1]ASI_BLK_P
6229	subcc	%i3, 64, %i3
6230	add	%i1, 64, %i1
6231	bgu,pt	%ncc, .ci_unaln_110_loop
6232	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6233	ba	.ci_unaln_done
6234	nop
6235
6236.ci_unaln_101:
6237	ldda	[%o4+40]%asi, %d10
6238	ldda	[%o4+48]%asi, %d12
6239	ldda	[%o4+56]%asi, %d14
6240.ci_unaln_101_loop:
6241	add	%o4, 64, %o4
6242	ldda	[%o4]ASI_BLK_AIUS, %d16
6243	faligndata %d10, %d12, %d48
6244	faligndata %d12, %d14, %d50
6245	faligndata %d14, %d16, %d52
6246	faligndata %d16, %d18, %d54
6247	faligndata %d18, %d20, %d56
6248	faligndata %d20, %d22, %d58
6249	faligndata %d22, %d24, %d60
6250	faligndata %d24, %d26, %d62
6251	fmovd	%d26, %d10
6252	fmovd	%d28, %d12
6253	fmovd	%d30, %d14
6254	stda	%d48, [%i1]ASI_BLK_P
6255	subcc	%i3, 64, %i3
6256	add	%i1, 64, %i1
6257	bgu,pt	%ncc, .ci_unaln_101_loop
6258	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6259	ba	.ci_unaln_done
6260	nop
6261
6262.ci_unaln_100:
6263	ldda	[%o4+32]%asi, %d8
6264	ldda	[%o4+40]%asi, %d10
6265	ldda	[%o4+48]%asi, %d12
6266	ldda	[%o4+56]%asi, %d14
6267.ci_unaln_100_loop:
6268	add	%o4, 64, %o4
6269	ldda	[%o4]ASI_BLK_AIUS, %d16
6270	faligndata %d8, %d10, %d48
6271	faligndata %d10, %d12, %d50
6272	faligndata %d12, %d14, %d52
6273	faligndata %d14, %d16, %d54
6274	faligndata %d16, %d18, %d56
6275	faligndata %d18, %d20, %d58
6276	faligndata %d20, %d22, %d60
6277	faligndata %d22, %d24, %d62
6278	fmovd	%d24, %d8
6279	fmovd	%d26, %d10
6280	fmovd	%d28, %d12
6281	fmovd	%d30, %d14
6282	stda	%d48, [%i1]ASI_BLK_P
6283	subcc	%i3, 64, %i3
6284	add	%i1, 64, %i1
6285	bgu,pt	%ncc, .ci_unaln_100_loop
6286	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6287	ba	.ci_unaln_done
6288	nop
6289
6290.ci_unaln_011:
6291	ldda	[%o4+24]%asi, %d6
6292	ldda	[%o4+32]%asi, %d8
6293	ldda	[%o4+40]%asi, %d10
6294	ldda	[%o4+48]%asi, %d12
6295	ldda	[%o4+56]%asi, %d14
6296.ci_unaln_011_loop:
6297	add	%o4, 64, %o4
6298	ldda	[%o4]ASI_BLK_AIUS, %d16
6299	faligndata %d6, %d8, %d48
6300	faligndata %d8, %d10, %d50
6301	faligndata %d10, %d12, %d52
6302	faligndata %d12, %d14, %d54
6303	faligndata %d14, %d16, %d56
6304	faligndata %d16, %d18, %d58
6305	faligndata %d18, %d20, %d60
6306	faligndata %d20, %d22, %d62
6307	fmovd	%d22, %d6
6308	fmovd	%d24, %d8
6309	fmovd	%d26, %d10
6310	fmovd	%d28, %d12
6311	fmovd	%d30, %d14
6312	stda	%d48, [%i1]ASI_BLK_P
6313	subcc	%i3, 64, %i3
6314	add	%i1, 64, %i1
6315	bgu,pt	%ncc, .ci_unaln_011_loop
6316	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6317	ba	.ci_unaln_done
6318	nop
6319
6320.ci_unaln_010:
6321	ldda	[%o4+16]%asi, %d4
6322	ldda	[%o4+24]%asi, %d6
6323	ldda	[%o4+32]%asi, %d8
6324	ldda	[%o4+40]%asi, %d10
6325	ldda	[%o4+48]%asi, %d12
6326	ldda	[%o4+56]%asi, %d14
6327.ci_unaln_010_loop:
6328	add	%o4, 64, %o4
6329	ldda	[%o4]ASI_BLK_AIUS, %d16
6330	faligndata %d4, %d6, %d48
6331	faligndata %d6, %d8, %d50
6332	faligndata %d8, %d10, %d52
6333	faligndata %d10, %d12, %d54
6334	faligndata %d12, %d14, %d56
6335	faligndata %d14, %d16, %d58
6336	faligndata %d16, %d18, %d60
6337	faligndata %d18, %d20, %d62
6338	fmovd	%d20, %d4
6339	fmovd	%d22, %d6
6340	fmovd	%d24, %d8
6341	fmovd	%d26, %d10
6342	fmovd	%d28, %d12
6343	fmovd	%d30, %d14
6344	stda	%d48, [%i1]ASI_BLK_P
6345	subcc	%i3, 64, %i3
6346	add	%i1, 64, %i1
6347	bgu,pt	%ncc, .ci_unaln_010_loop
6348	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6349	ba	.ci_unaln_done
6350	nop
6351
6352.ci_unaln_001:
6353	ldda	[%o4+8]%asi, %d2
6354	ldda	[%o4+16]%asi, %d4
6355	ldda	[%o4+24]%asi, %d6
6356	ldda	[%o4+32]%asi, %d8
6357	ldda	[%o4+40]%asi, %d10
6358	ldda	[%o4+48]%asi, %d12
6359	ldda	[%o4+56]%asi, %d14
6360.ci_unaln_001_loop:
6361	add	%o4, 64, %o4
6362	ldda	[%o4]ASI_BLK_AIUS, %d16
6363	faligndata %d2, %d4, %d48
6364	faligndata %d4, %d6, %d50
6365	faligndata %d6, %d8, %d52
6366	faligndata %d8, %d10, %d54
6367	faligndata %d10, %d12, %d56
6368	faligndata %d12, %d14, %d58
6369	faligndata %d14, %d16, %d60
6370	faligndata %d16, %d18, %d62
6371	fmovd	%d18, %d2
6372	fmovd	%d20, %d4
6373	fmovd	%d22, %d6
6374	fmovd	%d24, %d8
6375	fmovd	%d26, %d10
6376	fmovd	%d28, %d12
6377	fmovd	%d30, %d14
6378	stda	%d48, [%i1]ASI_BLK_P
6379	subcc	%i3, 64, %i3
6380	add	%i1, 64, %i1
6381	bgu,pt	%ncc, .ci_unaln_001_loop
6382	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6383	ba	.ci_unaln_done
6384	nop
6385
6386.ci_unaln_000:
6387	ldda	[%o4]ASI_BLK_AIUS, %d0
6388.ci_unaln_000_loop:
6389	add	%o4, 64, %o4
6390	ldda	[%o4]ASI_BLK_AIUS, %d16
6391	faligndata %d0, %d2, %d48
6392	faligndata %d2, %d4, %d50
6393	faligndata %d4, %d6, %d52
6394	faligndata %d6, %d8, %d54
6395	faligndata %d8, %d10, %d56
6396	faligndata %d10, %d12, %d58
6397	faligndata %d12, %d14, %d60
6398	faligndata %d14, %d16, %d62
6399	fmovd	%d16, %d0
6400	fmovd	%d18, %d2
6401	fmovd	%d20, %d4
6402	fmovd	%d22, %d6
6403	fmovd	%d24, %d8
6404	fmovd	%d26, %d10
6405	fmovd	%d28, %d12
6406	fmovd	%d30, %d14
6407	stda	%d48, [%i1]ASI_BLK_P
6408	subcc	%i3, 64, %i3
6409	add	%i1, 64, %i1
6410	bgu,pt	%ncc, .ci_unaln_000_loop
6411	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6412
6413.ci_unaln_done:
6414	! Handle trailing bytes, 64 to 127
	! Dest is long word aligned; src is not long word aligned
6416	cmp	%i2, 15
6417	bleu	%ncc, .ci_unaln_short
6418
6419	andn	%i2, 0x7, %i3		! %i3 is multiple of 8
6420	and	%i2, 0x7, %i2		! residue bytes in %i2
6421	add	%i2, 8, %i2
	sub	%i3, 8, %i3		! ensure we don't load past end of src
6423	andn	%i0, 0x7, %o4		! %o4 has long word aligned src address
6424	add	%i0, %i3, %i0		! advance %i0 to after multiple of 8
6425	ldda	[%o4]%asi, %d0		! fetch partial word
6426.ci_unaln_by8:
6427	ldda	[%o4+8]%asi, %d2
6428	add	%o4, 8, %o4
6429	faligndata %d0, %d2, %d16
6430	subcc	%i3, 8, %i3
6431	std	%d16, [%i1]
6432	fmovd	%d2, %d0
6433	bgu,pt	%ncc, .ci_unaln_by8
6434	add	%i1, 8, %i1
6435
6436.ci_unaln_short:
6437	cmp	%i2, 8
6438	blt,pt	%ncc, .ci_unalnfin
6439	nop
6440	lduba	[%i0]%asi, %o4
6441	sll	%o4, 24, %o3
6442	lduba	[%i0+1]%asi, %o4
6443	sll	%o4, 16, %o4
6444	or	%o4, %o3, %o3
6445	lduba	[%i0+2]%asi, %o4
6446	sll	%o4, 8, %o4
6447	or	%o4, %o3, %o3
6448	lduba	[%i0+3]%asi, %o4
6449	or	%o4, %o3, %o3
6450	stw	%o3, [%i1]
6451	lduba	[%i0+4]%asi, %o4
6452	sll	%o4, 24, %o3
6453	lduba	[%i0+5]%asi, %o4
6454	sll	%o4, 16, %o4
6455	or	%o4, %o3, %o3
6456	lduba	[%i0+6]%asi, %o4
6457	sll	%o4, 8, %o4
6458	or	%o4, %o3, %o3
6459	lduba	[%i0+7]%asi, %o4
6460	or	%o4, %o3, %o3
6461	stw	%o3, [%i1+4]
6462	add	%i0, 8, %i0
6463	add	%i1, 8, %i1
6464	sub	%i2, 8, %i2
6465.ci_unalnfin:
6466	cmp	%i2, 4
6467	blt,pt	%ncc, .ci_unalnz
6468	tst	%i2
6469	lduba	[%i0]%asi, %o3		! read byte
6470	subcc	%i2, 4, %i2		! reduce count by 4
6471	sll	%o3, 24, %o3		! position
6472	lduba	[%i0+1]%asi, %o4
6473	sll	%o4, 16, %o4		! position
6474	or	%o4, %o3, %o3		! merge
6475	lduba	[%i0+2]%asi, %o4
6476	sll	%o4, 8, %o4		! position
6477	or	%o4, %o3, %o3		! merge
6478	add	%i1, 4, %i1		! advance dst by 4
6479	lduba	[%i0+3]%asi, %o4
6480	add	%i0, 4, %i0		! advance src by 4
6481	or	%o4, %o3, %o4		! merge
6482	bnz,pt	%ncc, .ci_unaln3x
6483	stw	%o4, [%i1-4]
6484	ba	.ci_exit
6485	nop
6486.ci_unalnz:
6487	bz,pt	%ncc, .ci_exit
6488	wr	%l5, %g0, %gsr		! restore %gsr
6489.ci_unaln3x:				! Exactly 1, 2, or 3 bytes remain
6490	subcc	%i2, 1, %i2		! reduce count for cc test
6491	lduba	[%i0]%asi, %o4		! load one byte
6492	bz,pt	%ncc, .ci_exit
6493	stb	%o4, [%i1]		! store one byte
6494	lduba	[%i0+1]%asi, %o4	! load second byte
6495	subcc	%i2, 1, %i2
6496	bz,pt	%ncc, .ci_exit
6497	stb	%o4, [%i1+1]		! store second byte
6498	lduba	[%i0+2]%asi, %o4	! load third byte
6499	stb	%o4, [%i1+2]		! store third byte
6500.ci_exit:
6501	brnz	%g1, .ci_fp_restore
6502	nop
6503	FZERO
6504	wr	%g1, %g0, %fprs
6505	ba,pt	%ncc, .ci_ex2
6506	membar	#Sync
6507.ci_fp_restore:
6508	BLD_FP_FROMSTACK(%o4)
6509.ci_ex2:
6510	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
6511	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
6512	ret
6513	restore %g0, 0, %o0
6514
6515.copyin_err:
6516	ldn	[THREAD_REG + T_COPYOPS], %o4
6517	brz	%o4, 2f
6518	nop
6519	ldn	[%o4 + CP_COPYIN], %g2
6520	jmp	%g2
6521	nop
65222:
6523	retl
6524	mov	-1, %o0
6525
6526#else	/* NIAGARA_IMPL */
6527.do_copyin:
6528	!
6529	! Check the length and bail if zero.
6530	!
6531	tst	%o2
6532	bnz,pt	%ncc, 1f
6533	nop
6534	retl
6535	clr	%o0
65361:
6537	sethi	%hi(copyio_fault), %o4
6538	or	%o4, %lo(copyio_fault), %o4
6539	sethi	%hi(copyio_fault_nowindow), %o3
6540	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
6541	or	%o3, %lo(copyio_fault_nowindow), %o3
6542	membar	#Sync
6543	stn	%o3, [THREAD_REG + T_LOFAULT]
6544
6545	mov	%o0, SAVE_SRC
6546	mov	%o1, SAVE_DST
6547	mov	%o2, SAVE_COUNT
6548
6549	!
6550	! Check to see if we're more than SMALL_LIMIT.
6551	!
6552	subcc	%o2, SMALL_LIMIT, %o3
6553	bgu,a,pt %ncc, .dci_ns
6554	or	%o0, %o1, %o3
6555	!
6556	! What was previously ".small_copyin"
6557	!
6558.dcibcp:
6559	sub	%g0, %o2, %o3		! setup for copy loop
6560	add	%o0, %o2, %o0
6561	add	%o1, %o2, %o1
6562	ba,pt	%ncc, .dcicl
6563	lduba	[%o0 + %o3]ASI_USER, %o4
6564	!
6565	! %o0 and %o1 point at the end and remain pointing at the end
6566	! of their buffers. We pull things out by adding %o3 (which is
6567	! the negation of the length) to the buffer end which gives us
	! the current location in the buffers. By incrementing %o3 we walk
6569	! through both buffers without having to bump each buffer's
6570	! pointer. A very fast 4 instruction loop.
6571	!
6572	.align 16
6573.dcicl:
6574	stb	%o4, [%o1 + %o3]
6575	inccc	%o3
6576	bl,a,pt %ncc, .dcicl
6577	lduba	[%o0 + %o3]ASI_USER, %o4
6578	!
6579	! We're done. Go home.
6580	!
6581	membar	#Sync
6582	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
6583	retl
6584	clr	%o0
6585	!
6586	! Try aligned copies from here.
6587	!
6588.dci_ns:
6589	!
6590	! See if we're single byte aligned. If we are, check the
6591	! limit for single byte copies. If we're smaller, or equal,
6592	! bounce to the byte for byte copy loop. Otherwise do it in
6593	! HW (if enabled).
6594	!
6595	btst	1, %o3
6596	bz,a,pt	%icc, .dcih8
6597	btst	7, %o3
6598	!
6599	! We're single byte aligned.
6600	!
6601	sethi	%hi(hw_copy_limit_1), %o3
6602	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
6603	!
6604	! Is HW copy on? If not do everything byte for byte.
6605	!
6606	tst	%o3
6607	bz,pn	%icc, .dcibcp
6608	subcc	%o3, %o2, %o3
6609	!
6610	! Are we bigger than the HW limit? If not
6611	! go to byte for byte.
6612	!
6613	bge,pt	%ncc, .dcibcp
6614	nop
6615	!
6616	! We're big enough and copy is on. Do it with HW.
6617	!
6618	ba,pt	%ncc, .big_copyin
6619	nop
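	!
	! A C-like sketch of the dispatch just made (illustrative):
	!
	!	if (hw_copy_limit_1 == 0 || count <= hw_copy_limit_1)
	!		byte_for_byte_copy();		// .dcibcp
	!	else
	!		big_copyin();			// HW assisted
	!
	! The aligned cases below follow the same pattern with
	! hw_copy_limit_8, _4 and _2.
	!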
6620.dcih8:
6621	!
6622	! 8 byte aligned?
6623	!
6624	bnz,a	%ncc, .dcih4
6625	btst	3, %o3
6626	!
6627	! We're eight byte aligned.
6628	!
6629	sethi	%hi(hw_copy_limit_8), %o3
6630	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
6631	!
6632	! Is HW assist on? If not, do it with the aligned copy.
6633	!
6634	tst	%o3
6635	bz,pn	%icc, .dcis8
6636	subcc	%o3, %o2, %o3
6637	bge	%ncc, .dcis8
6638	nop
6639	ba,pt	%ncc, .big_copyin
6640	nop
6641.dcis8:
6642	!
6643	! Housekeeping for copy loops. Uses same idea as in the byte for
6644	! byte copy loop above.
6645	!
6646	add	%o0, %o2, %o0
6647	add	%o1, %o2, %o1
6648	sub	%g0, %o2, %o3
6649	ba,pt	%ncc, .didebc
6650	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
6651	!
6652	! 4 byte aligned?
6653	!
6654.dcih4:
6655	bnz	%ncc, .dcih2
6656	sethi	%hi(hw_copy_limit_4), %o3
6657	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
6658	!
6659	! Is HW assist on? If not, do it with the aligned copy.
6660	!
6661	tst	%o3
6662	bz,pn	%icc, .dcis4
6663	subcc	%o3, %o2, %o3
6664	!
	! %o3 is negative if our size is greater than hw_copy_limit_4.
6666	!
6667	bge	%ncc, .dcis4
6668	nop
6669	ba,pt	%ncc, .big_copyin
6670	nop
6671.dcis4:
6672	!
6673	! Housekeeping for copy loops. Uses same idea as in the byte
6674	! for byte copy loop above.
6675	!
6676	add	%o0, %o2, %o0
6677	add	%o1, %o2, %o1
6678	sub	%g0, %o2, %o3
6679	ba,pt	%ncc, .didfbc
6680	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
6681.dcih2:
6682	!
6683	! We're two byte aligned. Check for "smallness"
6684	! done in delay at .dcih4
6685	!
6686	bleu,pt	%ncc, .dcis2
6687	sethi	%hi(hw_copy_limit_2), %o3
6688	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
6689	!
6690	! Is HW assist on? If not, do it with the aligned copy.
6691	!
6692	tst	%o3
6693	bz,pn	%icc, .dcis2
6694	subcc	%o3, %o2, %o3
6695	!
6696	! Are we larger than the HW limit?
6697	!
6698	bge	%ncc, .dcis2
6699	nop
6700	!
6701	! HW assist is on and we're large enough to use it.
6702	!
6703	ba,pt	%ncc, .big_copyin
6704	nop
6705	!
6706	! Housekeeping for copy loops. Uses same idea as in the byte
6707	! for byte copy loop above.
6708	!
6709.dcis2:
6710	add	%o0, %o2, %o0
6711	add	%o1, %o2, %o1
6712	sub	%g0, %o2, %o3
6713	ba,pt	%ncc, .didtbc
6714	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
6715	!
6716.small_copyin:
6717	!
6718	! Why are we doing this AGAIN? There are certain conditions in
	! big copyin that will cause us to forgo the HW assisted copies
6720	! and bounce back to a non-hw assisted copy. This dispatches
6721	! those copies. Note that we branch around this in the main line
6722	! code.
6723	!
6724	! We make no check for limits or HW enablement here. We've
6725	! already been told that we're a poster child so just go off
6726	! and do it.
6727	!
6728	or	%o0, %o1, %o3
6729	btst	1, %o3
6730	bnz	%icc, .dcibcp		! Most likely
6731	btst	7, %o3
6732	bz	%icc, .dcis8
6733	btst	3, %o3
6734	bz	%icc, .dcis4
6735	nop
6736	ba,pt	%ncc, .dcis2
6737	nop
6738	!
6739	! Eight byte aligned copies. A steal from the original .small_copyin
6740	! with modifications. %o2 is number of 8 byte chunks to copy. When
6741	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
6742	! to copy.
6743	!
6744	.align 32
6745.didebc:
6746	ldxa	[%o0 + %o3]ASI_USER, %o4
6747	deccc	%o2
6748	stx	%o4, [%o1 + %o3]
6749	bg,pt	%ncc, .didebc
6750	addcc	%o3, 8, %o3
6751	!
6752	! End of copy loop. Most 8 byte aligned copies end here.
6753	!
6754	bz,pt	%ncc, .dcifh
6755	nop
6756	!
6757	! Something is left. Do it byte for byte.
6758	!
6759	ba,pt	%ncc, .dcicl
6760	lduba	[%o0 + %o3]ASI_USER, %o4
6761	!
6762	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
6763	!
6764	.align 32
6765.didfbc:
6766	lduwa	[%o0 + %o3]ASI_USER, %o4
6767	deccc	%o2
6768	st	%o4, [%o1 + %o3]
6769	bg,pt	%ncc, .didfbc
6770	addcc	%o3, 4, %o3
6771	!
6772	! End of copy loop. Most 4 byte aligned copies end here.
6773	!
6774	bz,pt	%ncc, .dcifh
6775	nop
6776	!
6777	! Something is left. Do it byte for byte.
6778	!
6779	ba,pt	%ncc, .dcicl
6780	lduba	[%o0 + %o3]ASI_USER, %o4
6781	!
6782	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
6783	! copy.
6784	!
6785	.align 32
6786.didtbc:
6787	lduha	[%o0 + %o3]ASI_USER, %o4
6788	deccc	%o2
6789	sth	%o4, [%o1 + %o3]
6790	bg,pt	%ncc, .didtbc
6791	addcc	%o3, 2, %o3
6792	!
6793	! End of copy loop. Most 2 byte aligned copies end here.
6794	!
6795	bz,pt	%ncc, .dcifh
6796	nop
6797	!
6798	! Deal with the last byte
6799	!
6800	lduba	[%o0 + %o3]ASI_USER, %o4
6801	stb	%o4, [%o1 + %o3]
6802.dcifh:
6803	membar	#Sync
6804	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
6805	retl
6806	clr	%o0
6807
6808.big_copyin:
	! We're going off to do a block copy.
	! Switch fault handlers and grab a window. We
	! don't do a membar #Sync since we've touched only
	! kernel data to this point.
6813	stn	%o4, [THREAD_REG + T_LOFAULT]
6814
	! Copies that reach here are larger than 256 bytes. The
	! hw_copy_limit_1 is set to 256. Never set this limit to
	! less than 128 bytes.
6818	save	%sp, -SA(MINFRAME), %sp
6819.do_blockcopyin:
6820
6821	! Swap src/dst since the code below is memcpy code
6822	! and memcpy/bcopy have different calling sequences
6823	mov	%i1, %i5
6824	mov	%i0, %i1
6825	mov	%i5, %i0
6826
6827	! Block (64 bytes) align the destination.
6828	andcc	%i0, 0x3f, %i3		! is dst block aligned
6829	bz	%ncc, copyin_blalign	! dst already block aligned
6830	sub	%i3, 0x40, %i3
	neg	%i3			! bytes till dst is 64 byte aligned
6832	sub	%i2, %i3, %i2		! update i2 with new count
6833
6834	! Based on source and destination alignment do
6835	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
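	!
	! A C-like sketch of this dispatch (illustrative):
	!
	!	m = dst | src;
	!	if ((m & 7) == 0)		copy 8 bytes at a time;
	!	else if ((m & 3) == 0)		copy 4 bytes at a time;
	!	else if ((m & 1) == 0)		copy 2 bytes at a time;
	!	else				copy 1 byte at a time;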
6836
6837	! Is dst & src 8B aligned
6838	or	%i0, %i1, %o2
6839	andcc	%o2, 0x7, %g0
6840	bz	%ncc, .ci_alewdcp
6841	nop
6842
6843	! Is dst & src 4B aligned
6844	andcc	%o2, 0x3, %g0
6845	bz	%ncc, .ci_alwdcp
6846	nop
6847
6848	! Is dst & src 2B aligned
6849	andcc	%o2, 0x1, %g0
6850	bz	%ncc, .ci_alhlfwdcp
6851	nop
6852
6853	! 1B aligned
68541:	lduba	[%i1]ASI_USER, %o2
6855	stb	%o2, [%i0]
6856	inc	%i1
6857	deccc	%i3
6858	bgu,pt	%ncc, 1b
6859	inc	%i0
6860
6861	ba	copyin_blalign
6862	nop
6863
6864	! dst & src 4B aligned
6865.ci_alwdcp:
6866	lda	[%i1]ASI_USER, %o2
6867	st	%o2, [%i0]
6868	add	%i1, 0x4, %i1
6869	subcc	%i3, 0x4, %i3
6870	bgu,pt	%ncc, .ci_alwdcp
6871	add	%i0, 0x4, %i0
6872
6873	ba	copyin_blalign
6874	nop
6875
6876	! dst & src 2B aligned
6877.ci_alhlfwdcp:
6878	lduha	[%i1]ASI_USER, %o2
6879	stuh	%o2, [%i0]
6880	add	%i1, 0x2, %i1
6881	subcc	%i3, 0x2, %i3
6882	bgu,pt	%ncc, .ci_alhlfwdcp
6883	add	%i0, 0x2, %i0
6884
6885	ba	copyin_blalign
6886	nop
6887
6888	! dst & src 8B aligned
6889.ci_alewdcp:
6890	ldxa	[%i1]ASI_USER, %o2
6891	stx	%o2, [%i0]
6892	add	%i1, 0x8, %i1
6893	subcc	%i3, 0x8, %i3
6894	bgu,pt	%ncc, .ci_alewdcp
6895	add	%i0, 0x8, %i0
6896
6897copyin_blalign:
6898	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
6899	sub	%i2, %i3, %i2		! Residue bytes in %i2
6900
6901	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
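
	! A C-like sketch of the quadword-offset dispatch below and of
	! the ALIGN_DATA merge it selects (illustrative):
	!
	!	off = src & 0xf;
	!	if (off == 0)		// quadword aligned: plain block copy
	!	else if (off == 8)	// doubleword aligned: no shift/merge
	!	else			// merge adjacent doublewords as
	!				// (a << ls) | (b >> (64 - ls)),
	!				// with ls = 8 * (off & 7) bits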
6902
6903	andcc	%i1, 0xf, %o2		! is src quadword aligned
	bz,pn	%xcc, .ci_blkcpy	! src offset in %o2 (low 4 bits)
6905	nop
6906	cmp	%o2, 0x8
6907	bg	.ci_upper_double
6908	nop
6909	bl	.ci_lower_double
6910	nop
6911
	! Falls through when the source offset is equal to 8, i.e.
	! the source is double word aligned.
	! In this case no shift/merge of data is required.
6915
6916	sub	%i1, %o2, %i1		! align the src at 16 bytes.
6917	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
6918	prefetcha [%l0]ASI_USER, #one_read
6919	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
6920	add	%l0, 0x40, %l0
6921.ci_loop0:
6922	add	%i1, 0x10, %i1
6923	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
6924
6925	prefetcha [%l0]ASI_USER, #one_read
6926
6927	stxa	%l3, [%i0+0x0]%asi
6928	stxa	%l4, [%i0+0x8]%asi
6929
6930	add	%i1, 0x10, %i1
6931	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
6932
6933	stxa	%l5, [%i0+0x10]%asi
6934	stxa	%l2, [%i0+0x18]%asi
6935
6936	add	%i1, 0x10, %i1
6937	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
6938
6939	stxa	%l3, [%i0+0x20]%asi
6940	stxa	%l4, [%i0+0x28]%asi
6941
6942	add	%i1, 0x10, %i1
6943	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
6944
6945	stxa	%l5, [%i0+0x30]%asi
6946	stxa	%l2, [%i0+0x38]%asi
6947
6948	add	%l0, 0x40, %l0
6949	subcc	%i3, 0x40, %i3
6950	bgu,pt	%xcc, .ci_loop0
6951	add	%i0, 0x40, %i0
6952	ba	.ci_blkdone
6953	add	%i1, %o2, %i1		! increment the source by src offset
6954					! the src offset was stored in %o2
6955
6956.ci_lower_double:
6957
6958	sub	%i1, %o2, %i1		! align the src at 16 bytes.
6959	sll	%o2, 3, %o0		! %o0 left shift
6960	mov	0x40, %o1
6961	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
6962	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
6963	prefetcha [%l0]ASI_USER, #one_read
6964	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l2
6965							! and %l3 has complete
6966							! data
6967	add	%l0, 0x40, %l0
6968.ci_loop1:
6969	add	%i1, 0x10, %i1
6970	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has partial data
6971							! for this read.
6972	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
6973							! into %l2 and %l3
6974
6975	prefetcha [%l0]ASI_USER, #one_read
6976
6977	stxa	%l2, [%i0+0x0]%asi
6978	stxa	%l3, [%i0+0x8]%asi
6979
6980	add	%i1, 0x10, %i1
6981	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
6982	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
6983							! %l4 from previous read
6984							! into %l4 and %l5
6985	stxa	%l4, [%i0+0x10]%asi
6986	stxa	%l5, [%i0+0x18]%asi
6987
6988	! Repeat the same for next 32 bytes.
6989
6990	add	%i1, 0x10, %i1
6991	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
6992	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
6993
6994	stxa	%l2, [%i0+0x20]%asi
6995	stxa	%l3, [%i0+0x28]%asi
6996
6997	add	%i1, 0x10, %i1
6998	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
6999	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
7000
7001	stxa	%l4, [%i0+0x30]%asi
7002	stxa	%l5, [%i0+0x38]%asi
7003
7004	add	%l0, 0x40, %l0
7005	subcc	%i3, 0x40, %i3
7006	bgu,pt	%xcc, .ci_loop1
7007	add	%i0, 0x40, %i0
7008	ba	.ci_blkdone
7009	add	%i1, %o2, %i1		! increment the source by src offset
7010					! the src offset was stored in %o2
7011
7012.ci_upper_double:
7013
7014	sub	%i1, %o2, %i1		! align the src at 16 bytes.
7015	sub	%o2, 0x8, %o0
7016	sll	%o0, 3, %o0		! %o0 left shift
7017	mov	0x40, %o1
7018	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
7019	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
7020	prefetcha [%l0]ASI_USER, #one_read
7021	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l3
7022							! for this read and
7023							! no data in %l2
7024	add	%l0, 0x40, %l0
7025.ci_loop2:
7026	add	%i1, 0x10, %i1
7027	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has complete data
7028							! and %l5 has partial
7029	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
7030							! into %l3 and %l4
7031	prefetcha [%l0]ASI_USER, #one_read
7032
7033	stxa	%l3, [%i0+0x0]%asi
7034	stxa	%l4, [%i0+0x8]%asi
7035
7036	add	%i1, 0x10, %i1
7037	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7038	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
7039							! %l5 from previous read
7040							! into %l5 and %l2
7041
7042	stxa	%l5, [%i0+0x10]%asi
7043	stxa	%l2, [%i0+0x18]%asi
7044
7045	! Repeat the same for next 32 bytes.
7046
7047	add	%i1, 0x10, %i1
7048	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7049	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
7050
7051	stxa	%l3, [%i0+0x20]%asi
7052	stxa	%l4, [%i0+0x28]%asi
7053
7054	add	%i1, 0x10, %i1
7055	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7056	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
7057
7058	stxa	%l5, [%i0+0x30]%asi
7059	stxa	%l2, [%i0+0x38]%asi
7060
7061	add	%l0, 0x40, %l0
7062	subcc	%i3, 0x40, %i3
7063	bgu,pt	%xcc, .ci_loop2
7064	add	%i0, 0x40, %i0
7065	ba	.ci_blkdone
7066	add	%i1, %o2, %i1		! increment the source by src offset
7067					! the src offset was stored in %o2
7068
7069
7070	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
7071.ci_blkcpy:
7072
7073	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
7074	prefetcha [%o0]ASI_USER, #one_read
7075	add	%o0, 0x40, %o0
70761:
7077	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
7078	add	%i1, 0x10, %i1
7079	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7080	add	%i1, 0x10, %i1
7081
7082	prefetcha [%o0]ASI_USER, #one_read
7083
7084	stxa	%l0, [%i0+0x0]%asi
7085
7086	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7087	add	%i1, 0x10, %i1
7088	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
7089	add	%i1, 0x10, %i1
7090
7091	stxa	%l1, [%i0+0x8]%asi
7092	stxa	%l2, [%i0+0x10]%asi
7093	stxa	%l3, [%i0+0x18]%asi
7094	stxa	%l4, [%i0+0x20]%asi
7095	stxa	%l5, [%i0+0x28]%asi
7096	stxa	%l6, [%i0+0x30]%asi
7097	stxa	%l7, [%i0+0x38]%asi
7098
7099	add	%o0, 0x40, %o0
7100	subcc	%i3, 0x40, %i3
7101	bgu,pt	%xcc, 1b
7102	add	%i0, 0x40, %i0
7103
7104.ci_blkdone:
7105	membar	#Sync
7106
7107	brz,pt	%i2, .copyin_exit
7108	nop
7109
7110	! Handle trailing bytes
7111	cmp	%i2, 0x8
7112	blu,pt	%ncc, .ci_residue
7113	nop
7114
7115	! Can we do some 8B ops
7116	or	%i1, %i0, %o2
7117	andcc	%o2, 0x7, %g0
7118	bnz	%ncc, .ci_last4
7119	nop
7120
7121	! Do 8byte ops as long as possible
7122.ci_last8:
7123	ldxa	[%i1]ASI_USER, %o2
7124	stx	%o2, [%i0]
7125	add	%i1, 0x8, %i1
7126	sub	%i2, 0x8, %i2
7127	cmp	%i2, 0x8
7128	bgu,pt	%ncc, .ci_last8
7129	add	%i0, 0x8, %i0
7130
7131	brz,pt	%i2, .copyin_exit
7132	nop
7133
7134	ba	.ci_residue
7135	nop
7136
7137.ci_last4:
7138	! Can we do 4B ops
7139	andcc	%o2, 0x3, %g0
7140	bnz	%ncc, .ci_last2
7141	nop
71421:
7143	lda	[%i1]ASI_USER, %o2
7144	st	%o2, [%i0]
7145	add	%i1, 0x4, %i1
7146	sub	%i2, 0x4, %i2
7147	cmp	%i2, 0x4
7148	bgu,pt	%ncc, 1b
7149	add	%i0, 0x4, %i0
7150
7151	brz,pt	%i2, .copyin_exit
7152	nop
7153
7154	ba	.ci_residue
7155	nop
7156
7157.ci_last2:
7158	! Can we do 2B ops
7159	andcc	%o2, 0x1, %g0
7160	bnz	%ncc, .ci_residue
7161	nop
7162
71631:
7164	lduha	[%i1]ASI_USER, %o2
7165	stuh	%o2, [%i0]
7166	add	%i1, 0x2, %i1
7167	sub	%i2, 0x2, %i2
7168	cmp	%i2, 0x2
7169	bgu,pt	%ncc, 1b
7170	add	%i0, 0x2, %i0
7171
7172	brz,pt	%i2, .copyin_exit
7173	nop
7174
7175	! Copy the residue as byte copy
7176.ci_residue:
7177	lduba	[%i1]ASI_USER, %i4
7178	stb	%i4, [%i0]
7179	inc	%i1
7180	deccc	%i2
7181	bgu,pt	%xcc, .ci_residue
7182	inc	%i0
7183
7184.copyin_exit:
7185	membar	#Sync
7186	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
7187	ret
7188	restore	%g0, 0, %o0
7189.copyin_err:
7190	ldn	[THREAD_REG + T_COPYOPS], %o4
7191	brz	%o4, 2f
7192	nop
7193	ldn	[%o4 + CP_COPYIN], %g2
7194	jmp	%g2
7195	nop
71962:
7197	retl
7198	mov	-1, %o0
7199#endif	/* NIAGARA_IMPL */
7200	SET_SIZE(copyin)
7201
7202	ENTRY(xcopyin)
7203	sethi	%hi(.xcopyin_err), REAL_LOFAULT
7204	b	.do_copyin
7205	or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
7206.xcopyin_err:
7207	ldn	[THREAD_REG + T_COPYOPS], %o4
7208	brz	%o4, 2f
7209	nop
7210	ldn	[%o4 + CP_XCOPYIN], %g2
7211	jmp	%g2
7212	nop
72132:
7214	retl
7215	mov	%g1, %o0
7216	SET_SIZE(xcopyin)
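
/*
 * A C-like sketch of xcopyin_little below (illustrative): bytes are
 * copied from the end of the user source toward the start, so the
 * buffer is byte-reversed as it is copied:
 *
 *	for (i = 0; i < n; i++)
 *		dst[i] = src[n - 1 - i];	// byte loads via ASI_AIUSL
 */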
7217
7218	ENTRY(xcopyin_little)
7219	sethi	%hi(.little_err), %o4
7220	ldn	[THREAD_REG + T_LOFAULT], %o5
7221	or	%o4, %lo(.little_err), %o4
7222	membar	#Sync				! sync error barrier
7223	stn	%o4, [THREAD_REG + T_LOFAULT]
7224
7225	subcc	%g0, %o2, %o3
7226	add	%o0, %o2, %o0
7227	bz,pn	%ncc, 2f		! check for zero bytes
7228	sub	%o2, 1, %o4
7229	add	%o0, %o4, %o0		! start w/last byte
7230	add	%o1, %o2, %o1
7231	lduba	[%o0+%o3]ASI_AIUSL, %o4
7232
72331:	stb	%o4, [%o1+%o3]
7234	inccc	%o3
7235	sub	%o0, 2, %o0		! get next byte
7236	bcc,a,pt %ncc, 1b
7237	lduba	[%o0+%o3]ASI_AIUSL, %o4
7238
72392:	membar	#Sync				! sync error barrier
7240	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
7241	retl
7242	mov	%g0, %o0		! return (0)
7243
7244.little_err:
7245	membar	#Sync				! sync error barrier
7246	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
7247	retl
7248	mov	%g1, %o0
7249	SET_SIZE(xcopyin_little)
7250
7251
7252/*
7253 * Copy a block of storage - must not overlap (from + len <= to).
7254 * No fault handler installed (to be called under on_fault())
7255 */
7256
7257	ENTRY(copyin_noerr)
7258	sethi	%hi(.copyio_noerr), REAL_LOFAULT
7259	b	.do_copyin
7260	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
7261.copyio_noerr:
7262	jmp	SAVED_LOFAULT
7263	nop
7264	SET_SIZE(copyin_noerr)
7265
7266/*
7267 * Copy a block of storage - must not overlap (from + len <= to).
7268 * No fault handler installed (to be called under on_fault())
7269 */
7270
7271	ENTRY(copyout_noerr)
7272	sethi	%hi(.copyio_noerr), REAL_LOFAULT
7273	b	.do_copyout
7274	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
7275	SET_SIZE(copyout_noerr)
7276
7277	.align	4
7278	DGDEF(use_hw_bcopy)
7279	.word	1
7280	DGDEF(use_hw_bzero)
7281	.word	1
7282	DGDEF(hw_copy_limit_1)
7283	.word	0x100
7284	DGDEF(hw_copy_limit_2)
7285	.word	0x200
7286	DGDEF(hw_copy_limit_4)
7287	.word	0x400
7288	DGDEF(hw_copy_limit_8)
7289	.word	0x400
7290
7291	.align	64
7292	.section ".text"
7293
7294/*
 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
 * at least 256 bytes long, using Niagara's block-initializing stores.
7297 * If the criteria for using this routine are not met then it calls bzero
7298 * and returns 1.  Otherwise 0 is returned indicating success.
7299 * Caller is responsible for ensuring use_hw_bzero is true and that
7300 * kpreempt_disable() has been called.
7301 */
7302	! %i0 - start address
7303	! %i1 - length of region (multiple of 64)
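	!
	! An illustrative C-level sketch of the caller contract spelled
	! out above (not taken from any particular caller):
	!
	!	kpreempt_disable();
	!	if (use_hw_bzero && hwblkclr(addr, len) == 0) {
	!		/* cleared with block-initializing stores */
	!	} else {
	!		/* hwblkclr punted to bzero() and returned 1 */
	!	}
	!	kpreempt_enable();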
7304
7305	ENTRY(hwblkclr)
7306	save	%sp, -SA(MINFRAME), %sp
7307
7308	! Must be block-aligned
7309	andcc	%i0, 0x3f, %g0
7310	bnz,pn	%ncc, 1f
7311	nop
7312
7313	! ... and must be 256 bytes or more
7314	cmp	%i1, 0x100
7315	blu,pn	%ncc, 1f
7316	nop
7317
7318	! ... and length must be a multiple of 64
7319	andcc	%i1, 0x3f, %g0
7320	bz,pn	%ncc, .pz_doblock
7321	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
7322
73231:	! punt, call bzero but notify the caller that bzero was used
7324	mov	%i0, %o0
7325	call	bzero
7326	mov	%i1, %o1
7327	ret
7328	restore	%g0, 1, %o0	! return (1) - did not use block operations
7329
7330	! Already verified that there are at least 256 bytes to set
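	! (Presumed rationale for the store order below: with the
	! block-initializing ASI the first store to a 64-byte line
	! allocates and zeroes the line, so offsets 0x0, 0x40, 0x80 and
	! 0xc0 are touched first and the remaining stores then fill in
	! already-initialized lines.)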
7331.pz_doblock:
7332	stxa	%g0, [%i0+0x0]%asi
7333	stxa	%g0, [%i0+0x40]%asi
7334	stxa	%g0, [%i0+0x80]%asi
7335	stxa	%g0, [%i0+0xc0]%asi
7336
7337	stxa	%g0, [%i0+0x8]%asi
7338	stxa	%g0, [%i0+0x10]%asi
7339	stxa	%g0, [%i0+0x18]%asi
7340	stxa	%g0, [%i0+0x20]%asi
7341	stxa	%g0, [%i0+0x28]%asi
7342	stxa	%g0, [%i0+0x30]%asi
7343	stxa	%g0, [%i0+0x38]%asi
7344
7345	stxa	%g0, [%i0+0x48]%asi
7346	stxa	%g0, [%i0+0x50]%asi
7347	stxa	%g0, [%i0+0x58]%asi
7348	stxa	%g0, [%i0+0x60]%asi
7349	stxa	%g0, [%i0+0x68]%asi
7350	stxa	%g0, [%i0+0x70]%asi
7351	stxa	%g0, [%i0+0x78]%asi
7352
7353	stxa	%g0, [%i0+0x88]%asi
7354	stxa	%g0, [%i0+0x90]%asi
7355	stxa	%g0, [%i0+0x98]%asi
7356	stxa	%g0, [%i0+0xa0]%asi
7357	stxa	%g0, [%i0+0xa8]%asi
7358	stxa	%g0, [%i0+0xb0]%asi
7359	stxa	%g0, [%i0+0xb8]%asi
7360
7361	stxa	%g0, [%i0+0xc8]%asi
7362	stxa	%g0, [%i0+0xd0]%asi
7363	stxa	%g0, [%i0+0xd8]%asi
7364	stxa	%g0, [%i0+0xe0]%asi
7365	stxa	%g0, [%i0+0xe8]%asi
7366	stxa	%g0, [%i0+0xf0]%asi
7367	stxa	%g0, [%i0+0xf8]%asi
7368
7369	sub	%i1, 0x100, %i1
7370	cmp	%i1, 0x100
7371	bgu,pt	%ncc, .pz_doblock
7372	add	%i0, 0x100, %i0
7373
73742:
	! Check if at least 64 bytes remain to be set
	cmp	%i1, 0x40
7377	blu	%ncc, .pz_finish
7378	nop
7379
73803:
7381	stxa	%g0, [%i0+0x0]%asi
7382	stxa	%g0, [%i0+0x8]%asi
7383	stxa	%g0, [%i0+0x10]%asi
7384	stxa	%g0, [%i0+0x18]%asi
7385	stxa	%g0, [%i0+0x20]%asi
7386	stxa	%g0, [%i0+0x28]%asi
7387	stxa	%g0, [%i0+0x30]%asi
7388	stxa	%g0, [%i0+0x38]%asi
7389
7390	subcc	%i1, 0x40, %i1
7391	bgu,pt	%ncc, 3b
7392	add	%i0, 0x40, %i0
7393
7394.pz_finish:
7395	membar	#Sync
7396	ret
	restore	%g0, 0, %o0		! return (0) - used block operations
7398	SET_SIZE(hwblkclr)
7399
7400	/*
7401	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
7402	 * using physical addresses.
7403	 */
7404	ENTRY_NP(hw_pa_bcopy32)
7405	rdpr	%pstate, %g1
7406	andn	%g1, PSTATE_IE, %g2
7407	wrpr	%g0, %g2, %pstate
7408
7409	ldxa	[%o0]ASI_MEM, %o2
7410	add	%o0, 8, %o0
7411	ldxa	[%o0]ASI_MEM, %o3
7412	add	%o0, 8, %o0
7413	ldxa	[%o0]ASI_MEM, %o4
7414	add	%o0, 8, %o0
7415	ldxa	[%o0]ASI_MEM, %o5
7416	stxa	%o2, [%o1]ASI_MEM
7417	add	%o1, 8, %o1
7418	stxa	%o3, [%o1]ASI_MEM
7419	add	%o1, 8, %o1
7420	stxa	%o4, [%o1]ASI_MEM
7421	add	%o1, 8, %o1
7422	stxa	%o5, [%o1]ASI_MEM
7423
7424	membar	#Sync
7425	retl
7426	wrpr	%g0, %g1, %pstate
7427	SET_SIZE(hw_pa_bcopy32)
7428
7429/*
7430 * Zero a block of storage.
7431 *
7432 * uzero is used by the kernel to zero a block in user address space.
7433 */
7434
7435/*
7436 * Control flow of the bzero/kzero/uzero routine.
7437 *
7438 *	For fewer than 7 bytes stores, bytes will be zeroed.
7439 *
7440 *	For less than 15 bytes stores, align the address on 4 byte boundary.
7441 *	Then store as many 4-byte chunks, followed by trailing bytes.
7442 *
7443 *	For sizes greater than 15 bytes, align the address on 8 byte boundary.
7444 *	if (count > 128) {
7445 *		store as many 8-bytes chunks to block align the address
7446 *		store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
7447 *		store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
7448 *	}
7449 *	Store as many 8-byte chunks, followed by trailing bytes.
7450 */
7451
7452	ENTRY(uzero)
7453	!
7454	! Set a new lo_fault handler only if we came in with one
7455	! already specified.
7456	!
7457	wr	%g0, ASI_USER, %asi
7458	ldn	[THREAD_REG + T_LOFAULT], %o5
7459	tst	%o5
7460	bz,pt	%ncc, .do_zero
7461	sethi	%hi(.zeroerr), %o2
7462	or	%o2, %lo(.zeroerr), %o2
7463	membar	#Sync
7464	ba,pt	%ncc, .do_zero
7465	stn	%o2, [THREAD_REG + T_LOFAULT]
7466
7467	ENTRY(kzero)
7468	!
7469	! Always set a lo_fault handler
7470	!
7471	wr	%g0, ASI_P, %asi
7472	ldn	[THREAD_REG + T_LOFAULT], %o5
7473	sethi	%hi(.zeroerr), %o2
7474	or	%o5, LOFAULT_SET, %o5
7475	or	%o2, %lo(.zeroerr), %o2
7476	membar	#Sync
7477	ba,pt	%ncc, .do_zero
7478	stn	%o2, [THREAD_REG + T_LOFAULT]
7479
7480/*
7481 * We got here because of a fault during kzero or if
7482 * uzero or bzero was called with t_lofault non-zero.
7483 * Otherwise we've already run screaming from the room.
7484 * Errno value is in %g1. Note that we're here iff
7485 * we did set t_lofault.
7486 */
7487.zeroerr:
7488	!
7489	! Undo asi register setting. Just set it to be the
7490	! kernel default without checking.
7491	!
7492	wr	%g0, ASI_P, %asi
7493
7494	!
7495	! We did set t_lofault. It may well have been zero coming in.
7496	!
74971:
7498	tst	%o5
7499	membar #Sync
7500	bne,pn	%ncc, 3f
7501	andncc	%o5, LOFAULT_SET, %o5
75022:
7503	!
7504	! Old handler was zero. Just return the error.
7505	!
7506	retl				! return
7507	mov	%g1, %o0		! error code from %g1
75083:
7509	!
7510	! We're here because %o5 was non-zero. It was non-zero
7511	! because either LOFAULT_SET was present, a previous fault
7512	! handler was present or both. In all cases we need to reset
7513	! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
7514	! before we either simply return the error or we invoke the
7515	! previously specified handler.
7516	!
7517	be	%ncc, 2b
7518	stn	%o5, [THREAD_REG + T_LOFAULT]
7519	jmp	%o5			! goto real handler
7520	nop
7521	SET_SIZE(kzero)
7522	SET_SIZE(uzero)
7523
7524/*
7525 * Zero a block of storage.
7526 */
7527
7528	ENTRY(bzero)
7529	wr	%g0, ASI_P, %asi
7530
7531	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
7532	tst	%o5
7533	bz,pt	%ncc, .do_zero
7534	sethi	%hi(.zeroerr), %o2
7535	or	%o2, %lo(.zeroerr), %o2
7536	membar	#Sync				! sync error barrier
7537	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
7538
7539.do_zero:
7540	cmp	%o1, 7
7541	blu,pn	%ncc, .byteclr
7542	nop
7543
7544	cmp	%o1, 15
7545	blu,pn	%ncc, .wdalign
7546	nop
7547
	andcc	%o0, 7, %o3		! is addr aligned on an 8 byte boundary
7549	bz,pt	%ncc, .blkalign		! already double aligned
7550	sub	%o3, 8, %o3		! -(bytes till double aligned)
7551	add	%o1, %o3, %o1		! update o1 with new count
7552
75531:
7554	stba	%g0, [%o0]%asi
7555	inccc	%o3
7556	bl,pt	%ncc, 1b
7557	inc	%o0
7558
7559	! Now address is double aligned
7560.blkalign:
7561	cmp	%o1, 0x80		! check if there are 128 bytes to set
7562	blu,pn	%ncc, .bzero_small
7563	mov	%o1, %o3
7564
7565	sethi	%hi(use_hw_bzero), %o2
7566	ld	[%o2 + %lo(use_hw_bzero)], %o2
7567	tst	%o2
7568	bz	%ncc, .bzero_small
7569	mov	%o1, %o3
7570
7571	rd	%asi, %o3
7572	wr	%g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
7573	cmp	%o3, ASI_P
7574	bne,a	%ncc, .algnblk
7575	wr	%g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
7576
7577.algnblk:
7578	andcc	%o0, 0x3f, %o3		! is block aligned?
7579	bz,pt	%ncc, .bzero_blk
7580	sub	%o3, 0x40, %o3		! -(bytes till block aligned)
7581	add	%o1, %o3, %o1		! o1 is the remainder
7582
7583	! Clear -(%o3) bytes till block aligned
75841:
7585	stxa	%g0, [%o0]%asi
7586	addcc	%o3, 8, %o3
7587	bl,pt	%ncc, 1b
7588	add	%o0, 8, %o0
7589
7590.bzero_blk:
7591	and	%o1, 0x3f, %o3		! calc bytes left after blk clear
7592	andn	%o1, 0x3f, %o4		! calc size of blocks in bytes
7593
7594	cmp	%o4, 0x100		! 256 bytes or more
7595	blu,pn	%ncc, 3f
7596	nop
7597
75982:
7599	stxa	%g0, [%o0+0x0]%asi
7600	stxa	%g0, [%o0+0x40]%asi
7601	stxa	%g0, [%o0+0x80]%asi
7602	stxa	%g0, [%o0+0xc0]%asi
7603
7604	stxa	%g0, [%o0+0x8]%asi
7605	stxa	%g0, [%o0+0x10]%asi
7606	stxa	%g0, [%o0+0x18]%asi
7607	stxa	%g0, [%o0+0x20]%asi
7608	stxa	%g0, [%o0+0x28]%asi
7609	stxa	%g0, [%o0+0x30]%asi
7610	stxa	%g0, [%o0+0x38]%asi
7611
7612	stxa	%g0, [%o0+0x48]%asi
7613	stxa	%g0, [%o0+0x50]%asi
7614	stxa	%g0, [%o0+0x58]%asi
7615	stxa	%g0, [%o0+0x60]%asi
7616	stxa	%g0, [%o0+0x68]%asi
7617	stxa	%g0, [%o0+0x70]%asi
7618	stxa	%g0, [%o0+0x78]%asi
7619
7620	stxa	%g0, [%o0+0x88]%asi
7621	stxa	%g0, [%o0+0x90]%asi
7622	stxa	%g0, [%o0+0x98]%asi
7623	stxa	%g0, [%o0+0xa0]%asi
7624	stxa	%g0, [%o0+0xa8]%asi
7625	stxa	%g0, [%o0+0xb0]%asi
7626	stxa	%g0, [%o0+0xb8]%asi
7627
7628	stxa	%g0, [%o0+0xc8]%asi
7629	stxa	%g0, [%o0+0xd0]%asi
7630	stxa	%g0, [%o0+0xd8]%asi
7631	stxa	%g0, [%o0+0xe0]%asi
7632	stxa	%g0, [%o0+0xe8]%asi
7633	stxa	%g0, [%o0+0xf0]%asi
7634	stxa	%g0, [%o0+0xf8]%asi
7635
7636	sub	%o4, 0x100, %o4
7637	cmp	%o4, 0x100
7638	bgu,pt	%ncc, 2b
7639	add	%o0, 0x100, %o0
7640
76413:
7642	! ... check if 64 bytes to set
7643	cmp	%o4, 0x40
7644	blu	%ncc, .bzero_blk_done
7645	nop
7646
76474:
7648	stxa	%g0, [%o0+0x0]%asi
7649	stxa	%g0, [%o0+0x8]%asi
7650	stxa	%g0, [%o0+0x10]%asi
7651	stxa	%g0, [%o0+0x18]%asi
7652	stxa	%g0, [%o0+0x20]%asi
7653	stxa	%g0, [%o0+0x28]%asi
7654	stxa	%g0, [%o0+0x30]%asi
7655	stxa	%g0, [%o0+0x38]%asi
7656
7657	subcc	%o4, 0x40, %o4
7658	bgu,pt	%ncc, 3b
7659	add	%o0, 0x40, %o0
7660
7661.bzero_blk_done:
7662	membar	#Sync
7663	!
7664	! Undo asi register setting.
7665	!
7666	rd	%asi, %o4
7667	wr	%g0, ASI_P, %asi
7668	cmp	%o4, ASI_BLK_INIT_ST_QUAD_LDD_P
7669	bne,a	%ncc, .bzero_small
7670	wr	%g0, ASI_USER, %asi
7671
7672.bzero_small:
7673	! Set the remaining doubles
7674	subcc	%o3, 8, %o3		! Can we store any doubles?
7675	blu,pn	%ncc, .byteclr
7676	and	%o1, 7, %o1		! calc bytes left after doubles
7677
7678.dbclr:
7679	stxa	%g0, [%o0]%asi		! Clear the doubles
7680	subcc	%o3, 8, %o3
7681	bgeu,pt	%ncc, .dbclr
7682	add	%o0, 8, %o0
7683
7684	ba	.byteclr
7685	nop
7686
7687.wdalign:
	andcc	%o0, 3, %o3		! is addr aligned on a word boundary
7689	bz,pn	%ncc, .wdclr
7690	andn	%o1, 3, %o3		! create word sized count in %o3
7691
7692	dec	%o1			! decrement count
7693	stba	%g0, [%o0]%asi		! clear a byte
7694	ba	.wdalign
7695	inc	%o0			! next byte
7696
7697.wdclr:
7698	sta	%g0, [%o0]%asi		! 4-byte clearing loop
7699	subcc	%o3, 4, %o3
7700	bnz,pt	%ncc, .wdclr
7701	inc	4, %o0
7702
7703	and	%o1, 3, %o1		! leftover count, if any
7704
7705.byteclr:
7706	! Set the leftover bytes
7707	brz	%o1, .bzero_exit
7708	nop
7709
77107:
7711	deccc	%o1			! byte clearing loop
7712	stba	%g0, [%o0]%asi
7713	bgu,pt	%ncc, 7b
7714	inc	%o0
7715
7716.bzero_exit:
7717	!
7718	! We're just concerned with whether t_lofault was set
7719	! when we came in. We end up here from either kzero()
7720	! or bzero(). kzero() *always* sets a lofault handler.
7721	! It ors LOFAULT_SET into %o5 to indicate it has done
7722	! this even if the value of %o5 is otherwise zero.
7723	! bzero() sets a lofault handler *only* if one was
7724	! previously set. Accordingly we need to examine
7725	! %o5 and if it is non-zero be sure to clear LOFAULT_SET
7726	! before resetting the error handler.
7727	!
7728	tst	%o5
7729	bz	%ncc, 1f
7730	andn	%o5, LOFAULT_SET, %o5
7731	membar	#Sync				! sync error barrier
7732	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
77331:
7734	retl
7735	clr	%o0			! return (0)
7736
7737	SET_SIZE(bzero)
7738