xref: /titanic_44/usr/src/uts/sun4v/cpu/niagara_copy.s (revision a547be5daca7e465ca82df6d179f6b1f8e0cda72)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25
26#include <sys/param.h>
27#include <sys/errno.h>
28#include <sys/asm_linkage.h>
29#include <sys/vtrace.h>
30#include <sys/machthread.h>
31#include <sys/clock.h>
32#include <sys/asi.h>
33#include <sys/fsr.h>
34#include <sys/privregs.h>
35#include <sys/machasi.h>
36#include <sys/niagaraasi.h>
37
38#if !defined(lint)
39#include "assym.h"
40#endif	/* lint */
41
42
43/*
44 * Pseudo-code to aid in understanding the control flow of the
45 * bcopy/kcopy routine.
46 *
47 *	! WARNING : <Register usage convention>
48 *	! In kcopy() the %o5, holds previous error handler and a flag
49 *	! LOFAULT_SET (low bits). The %o5 is null in bcopy().
50 *	! The %o5 is not available for any other use.
51 *
52 * On entry:
53 *	! Determine whether to use the FP register version or the
54 *	! leaf routine version depending on the size of the copy.
55 *	! Set up error handling accordingly.
56 *	! The transition point depends on FP_COPY
57 *	! For both versions %o5 is reserved
58 *
59 * kcopy():
60 *	if(length > FP_COPY)
61 *		go to regular_kcopy
62 *
63 *	! Setup_leaf_rtn_error_handler
64 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
65 *	%o5 |= LOFAULT_SET;			! ORed with LOFAULT_SET flag
66 *	curthread->t_lofault = .sm_copyerr;
67 *	goto small_bcopy();
68 *
69 * regular_kcopy:
70 *	save_registers()
71 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
72 *	%o5 |= LOFAULT_SET;			! ORed with LOFAULT_SET flag
73 *	curthread->t_lofault = .copyerr;
74 *	goto do_copy();
75 *
76 * bcopy():
77 *	if(length > FP_COPY)
78 *		go to regular_bcopy
79 *
80 *	! Setup_leaf_rtn_error_handler
81 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
82 *	curthread->t_lofault = .sm_copyerr;
83 *	goto small_bcopy();
84 *
85 * regular_bcopy:
86 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
87 *	curthread->t_lofault = .copyerr;
88 *	goto do_copy();
89 *
90 * small_bcopy:
91 *	! handle copies smaller than FP_COPY
92 *	restore t_lofault handler
93 *	exit
94 *
95 * do_copy:
96 *	! handle copies larger than FP_COPY
97 *	save fp_regs
98 * 	blockcopy;
99 *	restore fp_regs
100 *	restore t_lofault handler if came from kcopy();
101 *
102 *
103 * In leaf lofault handler:
104 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
105 *	return (errno)
106 *
107 * In lofault handler:
108 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
109 *	restore fp_regs
110 *	return (errno)
111 *
112 *
113 *
114 * For all of bcopy/copyin/copyout the copy logic is specialized according
115 * to how the src and dst is aligned and how much data needs to be moved.
116 * The following comments apply to the N2/RF code (#if !defined(NIAGARA_IMPL))
117 *
118 * N2/RF Flow :
119 *
120 * if (count <= FP_COPY) {  (584 bytes)
121 *   set small fault handler (no register window save/restore)
122 *   if count <= SHORTCOPY  (7 bytes)
123 *	copy bytes; go to short_exit
124 *   else
125 *   determine dst alignment, move minimum bytes/halfwords to
126 *   get dst aligned on long word boundary
127 *     if( src is on long word boundary ) {
128 * medlong:					   src/dst aligned on 8 bytes
129 *	 copy with ldx/stx in 4-way unrolled loop;
130 *       copy final 0-31 bytes; go to short_exit
131 *     } else {					src/dst not aligned on 8 bytes
132 *     if src is word aligned, ld/st words in 32-byte chunks
133 *     if src is half word aligned, ld half, ld word, ld half; pack
134 *		into long word, store long words in 32-byte chunks
135 *     if src is byte aligned, ld byte,half,word parts;  pack into long
136 *	   word, store long words in 32-byte chunks
137 *     move final 0-31 bytes according to src alignment;  go to short_exit
138 * short_exit:
139 *     restore trap handler if needed, retl
140 * else {					   More than FP_COPY bytes
141 *     set fault handler
142 *     disable kernel preemption
143 *     save registers, save FP registers if in use
144 *     move bytes to align destination register on long word boundary
145 *     if(src is on long word boundary) {	   src/dst aligned on 8 bytes
146 *       align dst on 64 byte boundary;  use 8-way test for each of 8 possible
147 *       src alignments relative to a 64 byte boundary to select the
148 *       16-way unrolled loop (128 bytes) to use for
149 *       block load, fmovd, block-init-store, block-store, fmovd operations
150 *       then go to remain_stuff.
151 * remain_stuff: move remaining bytes. go to long_exit
152 *     } else {
153 *       setup alignaddr for faligndata instructions
154 *       align dst on 64 byte boundary; use 8-way test for each of 8 possible
155 *       src alignments to nearest long word relative to 64 byte boundary to
156 *       select the 8-way unrolled loop (64 bytes) to use for
157 *       block load, falign, fmovd, block-store loop
158 *	 (only use block-init-store when src/dst on 8 byte boundaries.)
159 *       goto unalign_done.
160 * unalign_done:
161 *       move remaining bytes for unaligned cases. go to long_exit
162 * long_exit:
163 *       restore %gsr, FP regs (either from stack or set to zero),
164 *       restore trap handler, check for kernel preemption request,
165 *       handle if needed, ret.
166 * }
167 *
168 * Other platforms include hw_bcopy_limit_[1248] to control the exact
169 * point where the FP register code is used. On those platforms, the
170 * FP register code did not leave data in L2 cache, potentially affecting
171 * performance more than the gain/loss from the algorithm difference.
172 * For N2/RF, block store places data in the L2 cache, so use or non-use
173 * of the FP registers has no effect on L2 cache behavior.
174 * The cost for testing hw_bcopy_limit_* according to different
175 * alignments exceeds 50 cycles for all cases, even when hw_bcopy_limits
176 * were not used. That cost was judged too high relative to the benefits,
177 * so the hw_bcopy_limit option is omitted from this code.
178 */
179
180/*
181 * Less than or equal to this number of bytes we will always copy byte-for-byte
182 */
183#define	SMALL_LIMIT	7
184
185/*
186 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
187 * handler was set
 *
 * The flag is carried in the low bits of the saved handler address and is
 * masked off (andn) before that address is written back to t_lofault.
 * The !NIAGARA_IMPL section below defines LOFAULT_SET again with the same
 * value; in that code it is bcopy, not kcopy, that sets the flag.
188 */
189#define	LOFAULT_SET 2
190
191/*
192 * This define is to align data for the unaligned source cases.
193 * The data1, data2 and data3 is merged into data1 and data2.
194 * The data3 is preserved for next merge.
 *
 * In effect, using 64-bit shifts (sllx/srlx):
 *	data1 = (data1 << lshift) | (data2 >> rshift)
 *	data2 = (data2 << lshift) | (data3 >> rshift)
 * The first line uses the value of data2 from before it is shifted.
 * tmp is a scratch register and is overwritten.
195 */
196#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
197	sllx	data1, lshift, data1				;\
198	srlx	data2, rshift, tmp				;\
199	or	data1, tmp, data1				;\
200	sllx	data2, lshift, data2				;\
201	srlx	data3, rshift, tmp				;\
202	or	data2, tmp, data2
203/*
204 * This macro is to align the data. Basically it merges
205 * data1 and data2 to form double word.
 *
 * In effect, using 64-bit shifts (sllx/srlx):
 *	data1 = (data1 << lshift) | (data2 >> rshift)
 * data2 is left unmodified; tmp is a scratch register and is overwritten.
206 */
207#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
208	sllx	data1, lshift, data1				;\
209	srlx	data2, rshift, tmp				;\
210	or	data1, tmp, data1
211
212#if !defined(NIAGARA_IMPL)
213/*
214 * Flags set in the lower bits of the t_lofault address:
215 * FPUSED_FLAG: The FP registers were in use and must be restored
216 * LOFAULT_SET: Set for bcopy calls, cleared for kcopy calls
217 * COPY_FLAGS: Both of the above
218 *
219 * Other flags:
220 * KPREEMPT_FLAG: kpreempt needs to be called
 *
 * How the fault handlers in this file use them:
 * - .copyerr tests FPUSED_FLAG in the saved handler value (%o5); when it
 *   is set the handler rewrites %gsr and either reloads the FP registers
 *   from the stack or zeroes them before returning.
 * - .sm_copyerr and .copyerr test LOFAULT_SET to choose between returning
 *   the errno to the caller (clear) and jumping to the previously
 *   installed handler (set).
 * - COPY_FLAGS is the mask removed with andn before the saved value is
 *   stored back into t_lofault.
 * - KPREEMPT_FLAG is kept in a scratch register (%l1 in .copyerr), not in
 *   the saved t_lofault value.
221 */
222#define	FPUSED_FLAG	1
223#define	LOFAULT_SET	2
224#define	COPY_FLAGS	(FPUSED_FLAG | LOFAULT_SET)
225#define	KPREEMPT_FLAG	4
226
/*
 * ALIGN_OFF_<lo>_<hi>: eight faligndata operations each.  Every operation
 * combines two adjacent double registers from the range %d0-%d30 and
 * writes one register of the output block %d48-%d62, so one macro
 * produces eight doubles (64 bytes) of output.
 *
 * The eight variants differ only in the first input register:
 * ALIGN_OFF_1_7 starts at %d0, ALIGN_OFF_8_15 at %d2, and so on in steps
 * of one double register up to ALIGN_OFF_56_63, which starts at %d14.
 *
 * The <lo>_<hi> suffix presumably names the range of source byte offsets
 * within a 64-byte block that the variant handles; the users of these
 * macros are outside this part of the file - TODO confirm.
 */
227#define	ALIGN_OFF_1_7			\
228	faligndata %d0, %d2, %d48	;\
229	faligndata %d2, %d4, %d50	;\
230	faligndata %d4, %d6, %d52	;\
231	faligndata %d6, %d8, %d54	;\
232	faligndata %d8, %d10, %d56	;\
233	faligndata %d10, %d12, %d58	;\
234	faligndata %d12, %d14, %d60	;\
235	faligndata %d14, %d16, %d62
236
237#define	ALIGN_OFF_8_15			\
238	faligndata %d2, %d4, %d48	;\
239	faligndata %d4, %d6, %d50	;\
240	faligndata %d6, %d8, %d52	;\
241	faligndata %d8, %d10, %d54	;\
242	faligndata %d10, %d12, %d56	;\
243	faligndata %d12, %d14, %d58	;\
244	faligndata %d14, %d16, %d60	;\
245	faligndata %d16, %d18, %d62
246
247#define	ALIGN_OFF_16_23			\
248	faligndata %d4, %d6, %d48	;\
249	faligndata %d6, %d8, %d50	;\
250	faligndata %d8, %d10, %d52	;\
251	faligndata %d10, %d12, %d54	;\
252	faligndata %d12, %d14, %d56	;\
253	faligndata %d14, %d16, %d58	;\
254	faligndata %d16, %d18, %d60	;\
255	faligndata %d18, %d20, %d62
256
257#define	ALIGN_OFF_24_31			\
258	faligndata %d6, %d8, %d48	;\
259	faligndata %d8, %d10, %d50	;\
260	faligndata %d10, %d12, %d52	;\
261	faligndata %d12, %d14, %d54	;\
262	faligndata %d14, %d16, %d56	;\
263	faligndata %d16, %d18, %d58	;\
264	faligndata %d18, %d20, %d60	;\
265	faligndata %d20, %d22, %d62
266
267#define	ALIGN_OFF_32_39			\
268	faligndata %d8, %d10, %d48	;\
269	faligndata %d10, %d12, %d50	;\
270	faligndata %d12, %d14, %d52	;\
271	faligndata %d14, %d16, %d54	;\
272	faligndata %d16, %d18, %d56	;\
273	faligndata %d18, %d20, %d58	;\
274	faligndata %d20, %d22, %d60	;\
275	faligndata %d22, %d24, %d62
276
277#define	ALIGN_OFF_40_47			\
278	faligndata %d10, %d12, %d48	;\
279	faligndata %d12, %d14, %d50	;\
280	faligndata %d14, %d16, %d52	;\
281	faligndata %d16, %d18, %d54	;\
282	faligndata %d18, %d20, %d56	;\
283	faligndata %d20, %d22, %d58	;\
284	faligndata %d22, %d24, %d60	;\
285	faligndata %d24, %d26, %d62
286
287#define	ALIGN_OFF_48_55			\
288	faligndata %d12, %d14, %d48	;\
289	faligndata %d14, %d16, %d50	;\
290	faligndata %d16, %d18, %d52	;\
291	faligndata %d18, %d20, %d54	;\
292	faligndata %d20, %d22, %d56	;\
293	faligndata %d22, %d24, %d58	;\
294	faligndata %d24, %d26, %d60	;\
295	faligndata %d26, %d28, %d62
296
297#define	ALIGN_OFF_56_63			\
298	faligndata %d14, %d16, %d48	;\
299	faligndata %d16, %d18, %d50	;\
300	faligndata %d18, %d20, %d52	;\
301	faligndata %d20, %d22, %d54	;\
302	faligndata %d22, %d24, %d56	;\
303	faligndata %d24, %d26, %d58	;\
304	faligndata %d26, %d28, %d60	;\
305	faligndata %d28, %d30, %d62
306
307/*
308 * FP_COPY indicates the minimum number of bytes needed
309 * to justify using FP/VIS-accelerated memory operations.
310 * The FPBLK code assumes a minimum number of bytes are available
311 * to be moved on entry.  Check that code carefully before
312 * reducing FP_COPY below 256.
 *
 * kcopy and bcopy compare the byte count against FP_COPY and take the
 * FP path only when the count is strictly greater; counts equal to
 * FP_COPY use the leaf (non-FP) path.
 *
 * SHORTCOPY: in the leaf path, copies of this many bytes or fewer go
 * straight to the byte-at-a-time code (.bc_smallest) without first
 * aligning the destination.
 *
 * ASI_STBI_P / ASI_STBI_AIUS are short local names for the two
 * ASI_BLK_INIT_* ASIs named on their right-hand sides.
313 */
314#define FP_COPY			584
315#define SHORTCOPY		7
316#define ASI_STBI_P		ASI_BLK_INIT_ST_QUAD_LDD_P
317#define ASI_STBI_AIUS		ASI_BLK_INIT_QUAD_LDD_AIUS
318#define CACHE_LINE		64
319#define	VIS_BLOCKSIZE		64
320
321/*
322 * Size of stack frame in order to accommodate a 64-byte aligned
323 * floating-point register save area and 2 64-bit temp locations.
324 * All copy functions use three quadrants of fp registers; to assure a
325 * block-aligned three block buffer in which to save we must reserve
326 * four blocks on stack.
327 *
328 *    _______________________________________ <-- %fp + STACK_BIAS
329 *    | We may need to preserve 3 quadrants |
330 *    | of fp regs, but since we do so with |
331 *    | BST/BLD we need room in which to    |
332 *    | align to VIS_BLOCKSIZE bytes.  So   |
333 *    | this area is 4 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
334 *    |-------------------------------------|
335 *    | 8 bytes to save %fprs		    | <--  - SAVED_FPRS_OFFSET
336 *    |-------------------------------------|
337 *    | 8 bytes to save %gsr		    | <--  - SAVED_GSR_OFFSET
338 *    ---------------------------------------
 *
 * HWCOPYFRAMESIZE is the extra stack requested (on top of MINFRAME) by
 * the save instruction of the FP copy path.
 *
 * SAVED_FPREGS_ADJUST is used by BST_FP_TOSTACK and BLD_FP_FROMSTACK:
 * the save buffer starts at (%fp + STACK_BIAS - SAVED_FPREGS_ADJUST)
 * rounded down to a multiple of VIS_BLOCKSIZE, and three consecutive
 * VIS_BLOCKSIZE blocks are stored/loaded from there.
339 */
340#define HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
341#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 4)
342#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 3) + 1)
343#define SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
344#define SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
345
346/*
347 * In FP copies if we do not have preserved data to restore over
348 * the fp regs we used then we must zero those regs to avoid
349 * exposing portions of the data to later threads (data security).
 *
 * FZERO clears the even-numbered registers %f0-%f30 and %f48-%f62, the
 * same three groups that BST_FP_TOSTACK saves starting at %f0, %f16 and
 * %f48.  %f0 and %f2 are cleared with fzero; every other register is
 * written with the result of adding or multiplying those two zeroes.
350 */
351#define	FZERO				\
352	fzero	%f0			;\
353	fzero	%f2			;\
354	faddd	%f0, %f2, %f4		;\
355	fmuld	%f0, %f2, %f6		;\
356	faddd	%f0, %f2, %f8		;\
357	fmuld	%f0, %f2, %f10		;\
358	faddd	%f0, %f2, %f12		;\
359	fmuld	%f0, %f2, %f14		;\
360	faddd	%f0, %f2, %f16		;\
361	fmuld	%f0, %f2, %f18		;\
362	faddd	%f0, %f2, %f20		;\
363	fmuld	%f0, %f2, %f22		;\
364	faddd	%f0, %f2, %f24		;\
365	fmuld	%f0, %f2, %f26		;\
366	faddd	%f0, %f2, %f28		;\
367	fmuld	%f0, %f2, %f30		;\
368	faddd	%f0, %f2, %f48		;\
369	fmuld	%f0, %f2, %f50		;\
370	faddd	%f0, %f2, %f52		;\
371	fmuld	%f0, %f2, %f54		;\
372	faddd	%f0, %f2, %f56		;\
373	fmuld	%f0, %f2, %f58		;\
374	faddd	%f0, %f2, %f60		;\
375	fmuld	%f0, %f2, %f62
376
377#if !defined(lint)
378
379/*
380 * Macros to save and restore fp registers to/from the stack.
381 * Used to save and restore in-use fp registers when we want to use FP.
 *
 * BST_FP_TOSTACK(tmp1): computes the block-aligned save buffer address
 * (%fp + STACK_BIAS - SAVED_FPREGS_ADJUST, rounded down to a multiple of
 * VIS_BLOCKSIZE) in tmp1, then issues three block stores (ASI_BLK_P)
 * starting at %f0, %f16 and %f48 into consecutive VIS_BLOCKSIZE blocks,
 * followed by membar #Sync.  tmp1 is overwritten.
382 */
383#define BST_FP_TOSTACK(tmp1)					\
384	/* membar #Sync	*/					;\
385	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
386	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
387	stda	%f0, [tmp1]ASI_BLK_P				;\
388	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
389	stda	%f16, [tmp1]ASI_BLK_P				;\
390	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
391	stda	%f48, [tmp1]ASI_BLK_P				;\
392	membar	#Sync
393
/*
 * BLD_FP_FROMSTACK(tmp1): the inverse of BST_FP_TOSTACK.  Recomputes the
 * same buffer address in tmp1 and issues three block loads (ASI_BLK_P)
 * into %f0, %f16 and %f48, followed by membar #Sync.  tmp1 is
 * overwritten.
 */
394#define	BLD_FP_FROMSTACK(tmp1)					\
395	/* membar #Sync - provided at copy completion */	;\
396	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
397	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
398	ldda	[tmp1]ASI_BLK_P, %f0				;\
399	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
400	ldda	[tmp1]ASI_BLK_P, %f16				;\
401	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
402	ldda	[tmp1]ASI_BLK_P, %f48				;\
403	membar	#Sync
404#endif	/* lint */
405
406#endif	/* NIAGARA_IMPL */
407/*
408 * Copy a block of storage, returning an error code if `from' or
409 * `to' takes a kernel pagefault which cannot be resolved.
410 * Returns errno value on pagefault error, 0 if all ok
411 */
412
413#if defined(lint)
414
/*
 * Lint-only C stub: supplies the prototype of kcopy and always returns 0.
 * The real implementation is the assembly in the #else branch below.
 */
415/* ARGSUSED */
416int
417kcopy(const void *from, void *to, size_t count)
418{ return(0); }
419
420#else	/* lint */
421
422	.seg	".text"
423	.align	4
424
/*
 * kcopy(from, to, count): per the lint prototype, %o0 = from, %o1 = to,
 * %o2 = count.
 *
 * !NIAGARA_IMPL:
 *   count <= FP_COPY (signed compare): leaf path.  No register window is
 *   taken; .sm_copyerr becomes the t_lofault handler and the copy is done
 *   by .sm_do_copy, which is defined inside bcopy.
 *   count > FP_COPY: a new register window with HWCOPYFRAMESIZE bytes of
 *   extra stack is taken; .copyerr becomes the t_lofault handler and the
 *   copy is done by .do_copy.
 *   In both cases the previous t_lofault value is kept in %o5 WITHOUT
 *   LOFAULT_SET; the fault handlers use the absence of that flag to
 *   recognise a kcopy and return the errno instead of chaining.
 *
 * NIAGARA_IMPL:
 *   Always takes a new register window, keeps the previous t_lofault
 *   value in %o5 with LOFAULT_SET ORed in, installs .copyerr and branches
 *   to .do_copy.
 *
 * In every variant the store that installs the new handler sits in the
 * delay slot of the branch to the copy code.
 */
425	ENTRY(kcopy)
426#if !defined(NIAGARA_IMPL)
427	cmp	%o2, FP_COPY			! check for small copy/leaf case
428	bgt,pt	%ncc, .kcopy_more		!
429	nop
430.kcopy_small:					! setup error handler
431	sethi	%hi(.sm_copyerr), %o4
432	or	%o4, %lo(.sm_copyerr), %o4	! .sm_copyerr is lofault value
433	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
434	! Note that we carefully do *not* flag the setting of
435	! t_lofault.
436	membar	#Sync				! sync error barrier
437	b	.sm_do_copy			! common code
438	stn	%o4, [THREAD_REG + T_LOFAULT]	! set t_lofault
439
440
441.kcopy_more:
442	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
443	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
444	or	%l7, %lo(.copyerr), %l7
445	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
446	! Note that we carefully do *not* flag the setting of
447	! t_lofault.
448	membar	#Sync				! sync error barrier
449	b	.do_copy			! common code
450	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
451
452/*
453 * We got here because of a fault during a small kcopy, or during a
454 * small bcopy if a fault handler existed when bcopy was called.
455 * No floating point registers are used by the small copies.
456 * Small copies are from a leaf routine
457 * Errno value is in %g1.
 *
 * %o5 holds the previous t_lofault value, with LOFAULT_SET ORed in only
 * when the copy was started by bcopy.
458 */
459.sm_copyerr:
460	! The kcopy will always set a t_lofault handler. If it fires,
461	! we're expected to just return the error code and not to
462	! invoke any existing error handler. As far as bcopy is concerned,
463	! we only set t_lofault if there was an existing lofault handler.
464	! In that case we're expected to invoke the previously existing
465	! handler after resetting the t_lofault value.
/*
 * btst sets the condition codes tested by the bnz below; the membar and
 * andn in between do not modify them.  The stn in the delay slot restores
 * t_lofault on both paths.
 */
466	btst	LOFAULT_SET, %o5
467	membar	#Sync				! sync error barrier
468	andn	%o5, LOFAULT_SET, %o5		! clear fault flag
469	bnz,pn	%ncc, 3f
470	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
/* kcopy: return the errno from %g1 to the caller (leaf return). */
471	retl
472	mov	%g1, %o0
4733:
474	! We're here via bcopy. There must have been an error handler
475	! in place otherwise we would have died a nasty death already.
476	jmp	%o5				! goto real handler
477	mov	%g0, %o0
478/*
479 *  end of .sm_copyerr
480 */
481
482/*
483 * We got here because of a fault during kcopy or bcopy if a fault
484 * handler existed when bcopy was called.
485 * stack and fp registers need to be restored
486 * Errno value is in %g1.
487 */
488.copyerr:
/*
 * Point t_lofault at .copyerr2 while FP state is being restored, so that
 * a fault taken inside this handler panics (see .copyerr2 below).
 */
489	sethi	%hi(.copyerr2), %l1
490	or	%l1, %lo(.copyerr2), %l1
491	membar	#Sync				! sync error barrier
492	stn	%l1, [THREAD_REG + T_LOFAULT]	! set t_lofault
/*
 * If FPUSED_FLAG is clear, skip the FP restore and the preemption
 * bookkeeping.  The delay slot is not annulled, so %l1 receives the
 * LOFAULT_SET bit of %o5 on both paths.
 */
493	btst	FPUSED_FLAG, %o5
494	bz,pt	%xcc, 1f
495	and	%o5, LOFAULT_SET, %l1	! copy flag to %l1
496
/*
 * NOTE(review): %l5 is written to %gsr and %g5 to %fprs; presumably they
 * hold the values saved when the FP copy started.  The code that sets
 * them is outside this part of the file - confirm.
 *
 * If FPRS_FEF is set in %g5 the FP registers are reloaded from the stack
 * save area; otherwise the registers used by the copy are zeroed.
 */
497	membar	#Sync				! sync error barrier
498	wr	%l5, 0, %gsr
499	btst	FPRS_FEF, %g5
500	bz,pt	%icc, 4f
501	nop
502	! restore fpregs from stack
503	BLD_FP_FROMSTACK(%o2)
504	ba,pt	%ncc, 2f
505	wr	%g5, 0, %fprs		! restore fprs
5064:
507	FZERO
508	wr	%g5, 0, %fprs		! restore fprs
5092:
/*
 * Preemption bookkeeping, done only when [THREAD_REG + T_LWP] is zero:
 * decrement the byte at T_PREEMPT and store it back (the stb is in the
 * delay slot, so it always executes); if the new value is zero and the
 * CPU_KPRUNRUN byte of the thread's CPU is nonzero, set KPREEMPT_FLAG in
 * %l1.  The "or" is in an annulled delay slot and only executes when the
 * brnz is taken.
 */
510	ldn	[THREAD_REG + T_LWP], %o2
511	brnz,pt	%o2, 1f
512	nop
513
514	ldsb	[THREAD_REG + T_PREEMPT], %l0
515	deccc	%l0
516	bnz,pn	%ncc, 1f
517	stb	%l0, [THREAD_REG + T_PREEMPT]
518
519	! Check for a kernel preemption request
520	ldn	[THREAD_REG + T_CPU], %l0
521	ldub	[%l0 + CPU_KPRUNRUN], %l0
522	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
523	or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
524
525	! The kcopy will always set a t_lofault handler. If it fires,
526	! we're expected to just return the error code and not to
527	! invoke any existing error handler. As far as bcopy is concerned,
528	! we only set t_lofault if there was an existing lofault handler.
529	! In that case we're expected to invoke the previously existing
530	! handler after resetting the t_lofault value.
5311:
532	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
533	membar	#Sync				! sync error barrier
534	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
535
/*
 * NOTE(review): %o5 (previous handler address) and %g1 (errno) are both
 * read after the call to kpreempt below.  Neither is a callee-saved
 * register in the SPARC V9 calling convention - confirm that kpreempt
 * preserves them, or keep copies in %l registers across the call.
 */
536	! call kpreempt if necessary
537	btst	KPREEMPT_FLAG, %l1
538	bz,pt	%icc, 2f
539	nop
540	call	kpreempt
541	rdpr	%pil, %o0	! pass %pil
5422:
/*
 * LOFAULT_SET clear (kcopy): return the errno; the restore in the delay
 * slot moves %g1 into the caller's %o0 while releasing the window.
 */
543	btst	LOFAULT_SET, %l1
544	bnz,pn	%ncc, 3f
545	nop
546	ret
547	restore	%g1, 0, %o0
5483:
549	! We're here via bcopy. There must have been an error handler
550	! in place otherwise we would have died a nasty death already.
551	jmp	%o5				! goto real handler
552	restore	%g0, 0, %o0			! dispose of copy window
553
554/*
555 * We got here because of a fault in .copyerr.  We can't safely restore fp
556 * state, so we panic.
557 */
558fp_panic_msg:
559	.asciz	"Unable to restore fp state after copy operation"
560
561	.align	4
562.copyerr2:
563	set	fp_panic_msg, %o0
564	call	panic
565	nop
566/*
567 *  end of .copyerr
568 */
569
570#else	/* NIAGARA_IMPL */
571	save	%sp, -SA(MINFRAME), %sp
572	set	.copyerr, %l7			! copyerr is lofault value
573	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
574	or	%o5, LOFAULT_SET, %o5
575	membar	#Sync				! sync error barrier
576	b	.do_copy			! common code
577	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
578
579/*
580 * We got here because of a fault during kcopy.
581 * Errno value is in %g1.
582 */
583.copyerr:
584	! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
585	! into %o5 to indicate it has set t_lofault handler. Need to clear
586	! LOFAULT_SET flag before restoring the error handler.
587	andn	%o5, LOFAULT_SET, %o5
588	membar	#Sync				! sync error barrier
589	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
/* Return the errno from %g1 in the caller's %o0 and release the window. */
590	ret
591	restore	%g1, 0, %o0
592#endif	/* NIAGARA_IMPL */
593
594	SET_SIZE(kcopy)
595#endif	/* lint */
596
597
598/*
599 * Copy a block of storage - must not overlap (from + len <= to).
600 */
601#if defined(lint)
602
/*
 * Lint-only C stub: supplies the prototype of bcopy and does nothing.
 * The real implementation is the assembly in the #else branch below.
 */
603/* ARGSUSED */
604void
605bcopy(const void *from, void *to, size_t count)
606{}
607
608#else	/* lint */
609
610	ENTRY(bcopy)
611#if !defined(NIAGARA_IMPL)
612	cmp	%o2, FP_COPY			! check for small copy/leaf case
613	bgt,pt	%ncc, .bcopy_more		!
614	nop
615.bcopy_small:					! setup error handler
616	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
617	tst	%o5
618	bz,pt	%icc, .sm_do_copy
619	sethi	%hi(.sm_copyerr), %o4
620	or	%o4, %lo(.sm_copyerr), %o4	! .sm_copyerr is lofault value
621	membar	#Sync				! sync error barrier
622	stn	%o4, [THREAD_REG + T_LOFAULT]	! set t_lofault
623	or	%o5, LOFAULT_SET, %o5		! Error should trampoline
624.sm_do_copy:
625	mov	%o0, %g1		! save %o0
626	cmp	%o2, SHORTCOPY		! make sure there is enough to align
627	ble,pt	%ncc, .bc_smallest
628	andcc	%o1, 0x7, %o3		! is dest long aligned
629	bnz,pn	%ncc, .bc_align
630	andcc	%o1, 1, %o3		! is dest byte aligned
631
632! Destination is long word aligned
633.bc_al_src:
634	andcc	%o0, 7, %o3
635	brnz,pt	%o3, .bc_src_dst_unal8
636	nop
637/*
638 * Special case for handling when src and dest are both long word aligned
639 * and total data to move is less than FP_COPY bytes
640 * Also handles finish up for large block moves, so may be less than 32 bytes
641 */
642.bc_medlong:
643	subcc	%o2, 31, %o2		! adjust length to allow cc test
644	ble,pt	%ncc, .bc_medl31
645	nop
646.bc_medl32:
647	ldx	[%o0], %o4		! move 32 bytes
648	subcc	%o2, 32, %o2		! decrement length count by 32
649	stx	%o4, [%o1]
650	ldx	[%o0+8], %o4
651	stx	%o4, [%o1+8]
652	ldx	[%o0+16], %o4
653	add	%o0, 32, %o0		! increase src ptr by 32
654	stx	%o4, [%o1+16]
655	ldx	[%o0-8], %o4
656	add	%o1, 32, %o1		! increase dst ptr by 32
657	bgu,pt	%ncc, .bc_medl32	! repeat if at least 32 bytes left
658	stx	%o4, [%o1-8]
659.bc_medl31:
660	addcc	%o2, 24, %o2		! adjust count to be off by 7
661	ble,pt	%ncc, .bc_medl7		! skip if 7 or fewer bytes left
662	nop
663.bc_medl8:
664	ldx	[%o0], %o4		! move 8 bytes
665	add	%o0, 8, %o0		! increase src ptr by 8
666	subcc	%o2, 8, %o2		! decrease count by 8
667	add	%o1, 8, %o1		! increase dst ptr by 8
668	bgu,pt	%ncc, .bc_medl8
669	stx	%o4, [%o1-8]
670.bc_medl7:
671	addcc	%o2, 7, %o2		! finish adjustment of remaining count
672	bnz,pt	%ncc, .bc_small4	! do final bytes if not finished
673
674.bc_smallx:				! finish up and exit
675	tst	%o5
676	bz,pt	%ncc, .bc_sm_done
677	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
678	membar	#Sync			! sync error barrier
679	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
680.bc_sm_done:
681	retl
682	mov	%g0, %o0
683
684.bc_small4:
685	cmp	%o2, 4
686	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
687	nop				!
688	ld	[%o0], %o4		! move 4 bytes
689	add	%o0, 4, %o0		! increase src ptr by 4
690	add	%o1, 4, %o1		! increase dst ptr by 4
691	subcc	%o2, 4, %o2		! decrease count by 4
692	bz,pt	%ncc, .bc_smallx
693	stw	%o4, [%o1-4]
694
695.bc_small3x:				! Exactly 1, 2, or 3 bytes remain
696	subcc	%o2, 1, %o2		! reduce count for cc test
697	ldub	[%o0], %o4		! load one byte
698	bz,pt	%ncc, .bc_smallx
699	stb	%o4, [%o1]		! store one byte
700	ldub	[%o0+1], %o4		! load second byte
701	subcc	%o2, 1, %o2
702	bz,pt	%ncc, .bc_smallx
703	stb	%o4, [%o1+1]		! store second byte
704	ldub	[%o0+2], %o4		! load third byte
705	ba	.bc_smallx
706	stb	%o4, [%o1+2]		! store third byte
707
708.bc_smallest:				! 7 or fewer bytes remain
709	tst	%o2
710	bz,pt	%ncc, .bc_smallx
711	cmp	%o2, 4
712	blt,pt	%ncc, .bc_small3x
713	nop
714	ldub	[%o0], %o4		! read byte
715	subcc	%o2, 4, %o2		! reduce count by 4
716	stb	%o4, [%o1]		! write byte
717	ldub	[%o0+1], %o4		! repeat for total of 4 bytes
718	add	%o0, 4, %o0		! advance src by 4
719	stb	%o4, [%o1+1]
720	ldub	[%o0-2], %o4
721	add	%o1, 4, %o1		! advance dst by 4
722	stb	%o4, [%o1-2]
723	ldub	[%o0-1], %o4
724	bnz,pt	%ncc, .bc_small3x
725	stb	%o4, [%o1-1]
726	ba	.bc_smallx
727	nop
728
729/*
730 * Align destination to long word boundary
731 */
732.bc_align:				! byte align test in prior branch delay
733	bnz,pt	%ncc, .bc_al_d1
734.bc_al_d1f:				! dest is now half word aligned
735	andcc	%o1, 2, %o3
736	bnz,pt	%ncc, .bc_al_d2
737.bc_al_d2f:				! dest is now word aligned
738	andcc	%o1, 4, %o3		! is dest longword aligned?
739	bz,pt	%ncc, .bc_al_src
740	nop
741.bc_al_d4:				! dest is word aligned;  src is unknown
742	ldub	[%o0], %o4		! move a word (src align unknown)
743	ldub	[%o0+1], %o3
744	sll	%o4, 24, %o4		! position
745	sll	%o3, 16, %o3		! position
746	or	%o4, %o3, %o3		! merge
747	ldub	[%o0+2], %o4
748	sll	%o4, 8, %o4		! position
749	or	%o4, %o3, %o3		! merge
750	ldub	[%o0+3], %o4
751	or	%o4, %o3, %o4		! merge
752	stw	%o4,[%o1]		! store four bytes
753	add	%o0, 4, %o0		! adjust src by 4
754	add	%o1, 4, %o1		! adjust dest by 4
755	sub	%o2, 4, %o2		! adjust count by 4
756	andcc	%o0, 7, %o3		! check for src long word alignment
757	brz,pt	%o3, .bc_medlong
758.bc_src_dst_unal8:
759	! dst is 8-byte aligned, src is not
760	! Size is less than FP_COPY
761	! Following code is to select for alignment
762	andcc	%o0, 0x3, %o3		! test word alignment
763	bz,pt	%ncc, .bc_medword
764	nop
765	andcc	%o0, 0x1, %o3		! test halfword alignment
766	bnz,pt	%ncc, .bc_med_byte	! go to byte move if not halfword
767	andcc	%o0, 0x2, %o3		! test which byte alignment
768	ba	.bc_medhalf
769	nop
770.bc_al_d1:				! align dest to half word
771	ldub	[%o0], %o4		! move a byte
772	add	%o0, 1, %o0
773	stb	%o4, [%o1]
774	add	%o1, 1, %o1
775	andcc	%o1, 2, %o3
776	bz,pt	%ncc, .bc_al_d2f
777	sub	%o2, 1, %o2
778.bc_al_d2:				! align dest to word
779	ldub	[%o0], %o4		! move a half-word (src align unknown)
780	ldub	[%o0+1], %o3
781	sll	%o4, 8, %o4		! position
782	or	%o4, %o3, %o4		! merge
783	sth	%o4, [%o1]
784	add	%o0, 2, %o0
785	add	%o1, 2, %o1
786	andcc	%o1, 4, %o3		! is dest longword aligned?
787	bz,pt	%ncc, .bc_al_src
788	sub	%o2, 2, %o2
789	ba	.bc_al_d4
790	nop
791/*
792 * Handle all cases where src and dest are aligned on word
793 * boundaries. Use unrolled loops for better performance.
794 * This option wins over standard large data move when
795 * source and destination is in cache for medium
796 * to short data moves.
797 */
798.bc_medword:
799	subcc	%o2, 31, %o2		! adjust length to allow cc test
800	ble,pt	%ncc, .bc_medw31
801	nop
802.bc_medw32:
803	ld	[%o0], %o4		! move a block of 32 bytes
804	stw	%o4, [%o1]
805	ld	[%o0+4], %o4
806	stw	%o4, [%o1+4]
807	ld	[%o0+8], %o4
808	stw	%o4, [%o1+8]
809	ld	[%o0+12], %o4
810	stw	%o4, [%o1+12]
811	ld	[%o0+16], %o4
812	stw	%o4, [%o1+16]
813	ld	[%o0+20], %o4
814	subcc	%o2, 32, %o2		! decrement length count
815	stw	%o4, [%o1+20]
816	ld	[%o0+24], %o4
817	add	%o0, 32, %o0		! increase src ptr by 32
818	stw	%o4, [%o1+24]
819	ld	[%o0-4], %o4
820	add	%o1, 32, %o1		! increase dst ptr by 32
821	bgu,pt	%ncc, .bc_medw32	! repeat if at least 32 bytes left
822	stw	%o4, [%o1-4]
823.bc_medw31:
824	addcc	%o2, 24, %o2		! adjust count to be off by 7
825	ble,pt	%ncc, .bc_medw7		! skip if 7 or fewer bytes left
826	nop				!
827.bc_medw15:
828	ld	[%o0], %o4		! move a block of 8 bytes
829	subcc	%o2, 8, %o2		! decrement length count
830	stw	%o4, [%o1]
831	add	%o0, 8, %o0		! increase src ptr by 8
832	ld	[%o0-4], %o4
833	add	%o1, 8, %o1		! increase dst ptr by 8
834	bgu,pt	%ncc, .bc_medw15
835	stw	%o4, [%o1-4]
836.bc_medw7:
837	addcc	%o2, 7, %o2		! finish adjustment of remaining count
838	bz,pt	%ncc, .bc_smallx	! exit if finished
839	cmp	%o2, 4
840	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
841	nop				!
842	ld	[%o0], %o4		! move 4 bytes
843	add	%o0, 4, %o0		! increase src ptr by 4
844	add	%o1, 4, %o1		! increase dst ptr by 4
845	subcc	%o2, 4, %o2		! decrease count by 4
846	bnz	.bc_small3x
847	stw	%o4, [%o1-4]
848	ba	.bc_smallx
849	nop
850
851.bc_medhalf:
852	subcc	%o2, 31, %o2		! adjust length to allow cc test
853	ble,pt	%ncc, .bc_medh31
854	nop
855.bc_medh32:				! load and store block of 32 bytes
856	subcc	%o2, 32, %o2		! decrement length count
857
858	lduh	[%o0], %o4		! move 32 bytes
859	lduw	[%o0+2], %o3
860	sllx	%o4, 48, %o4
861	sllx	%o3, 16, %o3
862	or	%o4, %o3, %o3
863	lduh	[%o0+6], %o4
864	or	%o4, %o3, %o4
865	stx	%o4, [%o1]
866
867	lduh	[%o0+8], %o4
868	lduw	[%o0+10], %o3
869	sllx	%o4, 48, %o4
870	sllx	%o3, 16, %o3
871	or	%o4, %o3, %o3
872	lduh	[%o0+14], %o4
873	or	%o4, %o3, %o4
874	stx	%o4, [%o1+8]
875
876	lduh	[%o0+16], %o4
877	lduw	[%o0+18], %o3
878	sllx	%o4, 48, %o4
879	sllx	%o3, 16, %o3
880	or	%o4, %o3, %o3
881	lduh	[%o0+22], %o4
882	or	%o4, %o3, %o4
883	stx	%o4, [%o1+16]
884
885	add	%o0, 32, %o0		! increase src ptr by 32
886	add	%o1, 32, %o1		! increase dst ptr by 32
887
888	lduh	[%o0-8], %o4
889	lduw	[%o0-6], %o3
890	sllx	%o4, 48, %o4
891	sllx	%o3, 16, %o3
892	or	%o4, %o3, %o3
893	lduh	[%o0-2], %o4
894	or	%o3, %o4, %o4
895	bgu,pt	%ncc, .bc_medh32	! repeat if at least 32 bytes left
896	stx	%o4, [%o1-8]
897
898.bc_medh31:
899	addcc	%o2, 24, %o2		! adjust count to be off by 7
900	ble,pt	%ncc, .bc_medh7		! skip if 7 or fewer bytes left
901	nop				!
902.bc_medh15:
903	lduh	[%o0], %o4		! move 16 bytes
904	subcc	%o2, 8, %o2		! decrement length count
905	lduw	[%o0+2], %o3
906	sllx	%o4, 48, %o4
907	sllx	%o3, 16, %o3
908	or	%o4, %o3, %o3
909	add	%o1, 8, %o1		! increase dst ptr by 8
910	lduh	[%o0+6], %o4
911	add	%o0, 8, %o0		! increase src ptr by 8
912	or	%o4, %o3, %o4
913	bgu,pt	%ncc, .bc_medh15
914	stx	%o4, [%o1-8]
915.bc_medh7:
916	addcc	%o2, 7, %o2		! finish adjustment of remaining count
917	bz,pt	%ncc, .bc_smallx	! exit if finished
918	cmp	%o2, 4
919	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
920	nop				!
921	lduh	[%o0], %o4
922	sll	%o4, 16, %o4
923	lduh	[%o0+2], %o3
924	or	%o3, %o4, %o4
925	subcc	%o2, 4, %o2
926	add	%o0, 4, %o0
927	add	%o1, 4, %o1
928	bnz	.bc_small3x
929	stw	%o4, [%o1-4]
930	ba	.bc_smallx
931	nop
932
933	.align 16
934.bc_med_byte:
935	bnz,pt	%ncc, .bc_medbh32a	! go to correct byte move
936	subcc	%o2, 31, %o2		! adjust length to allow cc test
937	ble,pt	%ncc, .bc_medb31
938	nop
939.bc_medb32:				! Alignment 1 or 5
940	subcc	%o2, 32, %o2		! decrement length count
941
942	ldub	[%o0], %o4		! load and store a block of 32 bytes
943	sllx	%o4, 56, %o3
944	lduh	[%o0+1], %o4
945	sllx	%o4, 40, %o4
946	or	%o4, %o3, %o3
947	lduw	[%o0+3], %o4
948	sllx	%o4, 8, %o4
949	or	%o4, %o3, %o3
950	ldub	[%o0+7], %o4
951	or	%o4, %o3, %o4
952	stx	%o4, [%o1]
953
954	ldub	[%o0+8], %o4
955	sllx	%o4, 56, %o3
956	lduh	[%o0+9], %o4
957	sllx	%o4, 40, %o4
958	or	%o4, %o3, %o3
959	lduw	[%o0+11], %o4
960	sllx	%o4, 8, %o4
961	or	%o4, %o3, %o3
962	ldub	[%o0+15], %o4
963	or	%o4, %o3, %o4
964	stx	%o4, [%o1+8]
965
966	ldub	[%o0+16], %o4
967	sllx	%o4, 56, %o3
968	lduh	[%o0+17], %o4
969	sllx	%o4, 40, %o4
970	or	%o4, %o3, %o3
971	lduw	[%o0+19], %o4
972	sllx	%o4, 8, %o4
973	or	%o4, %o3, %o3
974	ldub	[%o0+23], %o4
975	or	%o4, %o3, %o4
976	stx	%o4, [%o1+16]
977
978	add	%o0, 32, %o0		! increase src ptr by 32
979	add	%o1, 32, %o1		! increase dst ptr by 32
980
981	ldub	[%o0-8], %o4
982	sllx	%o4, 56, %o3
983	lduh	[%o0-7], %o4
984	sllx	%o4, 40, %o4
985	or	%o4, %o3, %o3
986	lduw	[%o0-5], %o4
987	sllx	%o4, 8, %o4
988	or	%o4, %o3, %o3
989	ldub	[%o0-1], %o4
990	or	%o4, %o3, %o4
991	bgu,pt	%ncc, .bc_medb32	! repeat if at least 32 bytes left
992	stx	%o4, [%o1-8]
993
994.bc_medb31:				! 31 or fewer bytes remaining
995	addcc	%o2, 24, %o2		! adjust count to be off by 7
996	ble,pt	%ncc, .bc_medb7		! skip if 7 or fewer bytes left
997	nop				!
998.bc_medb15:
999
1000	ldub	[%o0], %o4		! load and store a block of 8 bytes
1001	subcc	%o2, 8, %o2		! decrement length count
1002	sllx	%o4, 56, %o3
1003	lduh	[%o0+1], %o4
1004	sllx	%o4, 40, %o4
1005	or	%o4, %o3, %o3
1006	lduw	[%o0+3], %o4
1007	add	%o1, 8, %o1		! increase dst ptr by 16
1008	sllx	%o4, 8, %o4
1009	or	%o4, %o3, %o3
1010	ldub	[%o0+7], %o4
1011	add	%o0, 8, %o0		! increase src ptr by 16
1012	or	%o4, %o3, %o4
1013	bgu,pt	%ncc, .bc_medb15
1014	stx	%o4, [%o1-8]
1015.bc_medb7:
1016	addcc	%o2, 7, %o2		! finish adjustment of remaining count
1017	bz,pt	%ncc, .bc_smallx	! exit if finished
1018	cmp	%o2, 4
1019	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
1020	nop				!
1021	ldub	[%o0], %o4		! move 4 bytes
1022	sll	%o4, 24, %o3
1023	lduh	[%o0+1], %o4
1024	sll	%o4, 8, %o4
1025	or	%o4, %o3, %o3
1026	ldub	[%o0+3], %o4
1027	or	%o4, %o3, %o4
1028	subcc	%o2, 4, %o2
1029	add	%o0, 4, %o0
1030	add	%o1, 4, %o1
1031	bnz	.bc_small3x
1032	stw	%o4, [%o1-4]
1033	ba	.bc_smallx
1034	nop
1035
1036	.align 16
1037.bc_medbh32a:				! Alignment 3 or 7
1038	ble,pt	%ncc, .bc_medbh31
1039	nop
1040.bc_medbh32:				! Alignment 3 or 7
1041	subcc	%o2, 32, %o2		! decrement length count
1042
1043	ldub	[%o0], %o4		! load and store a block of 32 bytes
1044	sllx	%o4, 56, %o3
1045	lduw	[%o0+1], %o4
1046	sllx	%o4, 24, %o4
1047	or	%o4, %o3, %o3
1048	lduh	[%o0+5], %o4
1049	sllx	%o4, 8, %o4
1050	or	%o4, %o3, %o3
1051	ldub	[%o0+7], %o4
1052	or	%o4, %o3, %o4
1053	stx	%o4, [%o1]
1054
1055	ldub	[%o0+8], %o4
1056	sllx	%o4, 56, %o3
1057	lduw	[%o0+9], %o4
1058	sllx	%o4, 24, %o4
1059	or	%o4, %o3, %o3
1060	lduh	[%o0+13], %o4
1061	sllx	%o4, 8, %o4
1062	or	%o4, %o3, %o3
1063	ldub	[%o0+15], %o4
1064	or	%o4, %o3, %o4
1065	stx	%o4, [%o1+8]
1066
1067	ldub	[%o0+16], %o4
1068	sllx	%o4, 56, %o3
1069	lduw	[%o0+17], %o4
1070	sllx	%o4, 24, %o4
1071	or	%o4, %o3, %o3
1072	lduh	[%o0+21], %o4
1073	sllx	%o4, 8, %o4
1074	or	%o4, %o3, %o3
1075	ldub	[%o0+23], %o4
1076	or	%o4, %o3, %o4
1077	stx	%o4, [%o1+16]
1078
1079	add	%o0, 32, %o0		! increase src ptr by 32
1080	add	%o1, 32, %o1		! increase dst ptr by 32
1081
1082	ldub	[%o0-8], %o4
1083	sllx	%o4, 56, %o3
1084	lduw	[%o0-7], %o4
1085	sllx	%o4, 24, %o4
1086	or	%o4, %o3, %o3
1087	lduh	[%o0-3], %o4
1088	sllx	%o4, 8, %o4
1089	or	%o4, %o3, %o3
1090	ldub	[%o0-1], %o4
1091	or	%o4, %o3, %o4
1092	bgu,pt	%ncc, .bc_medbh32	! repeat if at least 32 bytes left
1093	stx	%o4, [%o1-8]
1094
1095.bc_medbh31:
1096	addcc	%o2, 24, %o2		! adjust count to be off by 7
1097	ble,pt	%ncc, .bc_medb7		! skip if 7 or fewer bytes left
1098	nop				!
1099.bc_medbh15:
1100	ldub	[%o0], %o4		! load and store a block of 8 bytes
1101	sllx	%o4, 56, %o3
1102	lduw	[%o0+1], %o4
1103	sllx	%o4, 24, %o4
1104	or	%o4, %o3, %o3
1105	lduh	[%o0+5], %o4
1106	sllx	%o4, 8, %o4
1107	or	%o4, %o3, %o3
1108	ldub	[%o0+7], %o4
1109	or	%o4, %o3, %o4
1110	stx	%o4, [%o1]
1111	subcc	%o2, 8, %o2		! decrement length count
1112	add	%o1, 8, %o1		! increase dst ptr by 8
1113	add	%o0, 8, %o0		! increase src ptr by 8
1114	bgu,pt	%ncc, .bc_medbh15
1115	stx	%o4, [%o1-8]
1116	ba	.bc_medb7
1117	nop
1118
1119	SET_SIZE(bcopy)
1120/*
1121 * The _more entry points are not intended to be used directly by
1122 * any caller from outside this file.  They are provided to allow
1123 * profiling and dtrace of the portions of the copy code that uses
1124 * the floating point registers.
1125*/
1126	ENTRY(bcopy_more)
1127.bcopy_more:
/*
 * After the save below, the code reads the source address from %i0,
 * the destination address from %i1 and the byte count from %i2.
 *
 * Values set up here and consumed at .bc_exit:
 *	%o5	t_lofault as found on entry, with LOFAULT_SET (only when
 *		that value was non-zero) and FPUSED_FLAG or'ed in
 *	%g5	(%fprs & FPRS_FEF) as found on entry
 *	%l5	%gsr as found on entry
 */
1128	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1129	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
/*
 * .copyerr is installed as the fault handler only when a handler was
 * already present; with a zero t_lofault control goes straight to
 * .do_copy and t_lofault is left untouched.
 */
1130	brz,pt	%o5, .do_copy
1131	nop
1132	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
1133	or	%l7, %lo(.copyerr), %l7
1134	membar	#Sync				! sync error barrier
1135	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
1136	! We've already captured whether t_lofault was zero on entry.
1137	! We need to mark ourselves as being from bcopy since both
1138	! kcopy and bcopy use the same code path. If LOFAULT_SET is
1139	! set and the saved lofault was zero, we won't reset lofault on
1140	! returning.
1141	or	%o5, LOFAULT_SET, %o5
1142.do_copy:
/*
 * The t_preempt increment below is skipped when the thread's t_lwp
 * field is non-zero.
 */
1143	ldn	[THREAD_REG + T_LWP], %o3
1144	brnz,pt	%o3, 1f
1145	nop
1146/*
1147 * kpreempt_disable();
1148 */
1149	ldsb	[THREAD_REG +T_PREEMPT], %o3
1150	inc	%o3
1151	stb	%o3, [THREAD_REG + T_PREEMPT]
11521:
1153/*
1154 * Following code is for large copies. We know there is at
1155 * least FP_COPY bytes available. FP regs are used, so
1156 *  we save registers and fp regs before starting
1157 */
1158	rd	%fprs, %g5		! check for unused fp
1159	or	%o5,FPUSED_FLAG,%o5
1160	! if fprs.fef == 0, set it.
1161	! Setting it when already set costs more than checking
1162	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
1163	bz,pt	%ncc, .bc_fp_unused
1164	prefetch [%i0 + (1 * CACHE_LINE)], #one_read
/*
 * FEF was already set.  BST_FP_TOSTACK is a macro defined outside this
 * section; presumably it saves the FP registers this routine uses into
 * the HWCOPYFRAMESIZE area of the frame -- confirm against its definition.
 *
 * The ba that follows has the andcc at .bc_fp_unused in its delay slot,
 * so the "dest bit 0" test is performed on both paths and its condition
 * codes are what the bnz after .bc_fp_ready examines.
 */
1165	BST_FP_TOSTACK(%o3)
1166	ba	.bc_fp_ready
1167.bc_fp_unused:
1168	andcc	%i1, 1, %o3		! is dest byte aligned
1169	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1170.bc_fp_ready:
1171	rd	%gsr, %l5		! save %gsr value
/*
 * Destination alignment ladder.  Each bnz below has the next andcc as
 * its delay slot, so every branch tests the condition codes produced by
 * the andcc that precedes it, while the following test is already being
 * issued.  The .bc_big_d1/d2/d4 helpers copy 1, 2 or 4 bytes and come
 * back to the matching .bc_big_d?f label.
 */
1172	bnz,pt	%ncc, .bc_big_d1
1173.bc_big_d1f:				! dest is now half word aligned
1174	andcc	%i1, 2, %o3
1175	bnz,pt	%ncc, .bc_big_d2
1176.bc_big_d2f:				! dest is now word aligned
1177	andcc	%i1, 4, %o3
1178	bnz,pt	%ncc, .bc_big_d4
1179.bc_big_d4f:				! dest is now long word aligned
1180	andcc	%i0, 7, %o3		! is src long word aligned
1181	brnz,pt	%o3, .bc_big_unal8
1182	prefetch [%i0 + (2 * CACHE_LINE)], #one_read
1183
/*
 * Source and destination are both 8-byte aligned here.  The destination
 * is brought up to a 64-byte boundary with ldx/stx copies.  %o3 is set to
 * (dst & 0x3f) - 64, i.e. the negated number of bytes to the boundary;
 * it is stepped up by 8 once (if bit 3 is set) and then by 16 until bits
 * 4 and 5 are clear.  %i2 is reduced by the whole amount up front.
 */
1184	! Src and dst are long word aligned
1185	! align dst to 64 byte boundary
1186	andcc	%i1, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
1187	brz,pn	%o3, .bc_al_to_64
1188	nop
1189	sub	%o3, 64, %o3		! %o3 has negative bytes to move
1190	add	%i2, %o3, %i2		! adjust remaining count
1191	andcc	%o3, 8, %o4		! odd long words to move?
1192	brz,pt	%o4, .bc_al_to_16
1193	nop
1194	add	%o3, 8, %o3
1195	ldx	[%i0], %o4
1196	add	%i0, 8, %i0		! increment src ptr
1197	add	%i1, 8, %i1		! increment dst ptr
1198	stx	%o4, [%i1-8]
1199! Dest is aligned on 16 bytes, src 8 byte aligned
1200.bc_al_to_16:
1201	andcc	%o3, 0x30, %o4		! pair of long words to move?
1202	brz,pt	%o4, .bc_al_to_64
1203	nop
1204.bc_al_mv_16:
1205	add	%o3, 16, %o3
1206	ldx	[%i0], %o4
1207	stx	%o4, [%i1]
1208	ldx	[%i0+8], %o4
1209	add	%i0, 16, %i0		! increment src ptr
1210	stx	%o4, [%i1+8]
1211	andcc	%o3, 48, %o4
1212	brnz,pt	%o4, .bc_al_mv_16
1213	add	%i1, 16, %i1		! increment dst ptr
1214! Dest is aligned on 64 bytes, src 8 byte aligned
1215.bc_al_to_64:
/*
 * Pick one of eight copy loops from bits 5, 4 and 3 of the source
 * address; the three digits in each .bc_aln_xyz label are those bits in
 * that order.  Every branch in this ladder carries either the next andcc
 * or a prefetch in its delay slot, so the order of the instructions
 * below matters.
 */
1216	! Determine source alignment
1217	! to correct 8 byte offset
1218	andcc	%i0, 32, %o3
1219	brnz,pn	%o3, .bc_aln_1
1220	andcc	%i0, 16, %o3
1221	brnz,pn	%o3, .bc_aln_01
1222	andcc	%i0, 8, %o3
1223	brz,pn	%o3, .bc_aln_000
1224	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1225	ba	.bc_aln_001
1226	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1227
1228.bc_aln_01:
1229	brnz,pn	%o3, .bc_aln_011
1230	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1231	ba	.bc_aln_010
1232	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1233.bc_aln_1:
1234	andcc	%i0, 16, %o3
1235	brnz,pn	%o3, .bc_aln_11
1236	andcc	%i0, 8, %o3
1237	brnz,pn	%o3, .bc_aln_101
1238	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1239	ba	.bc_aln_100
1240	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1241.bc_aln_11:
/* falls through into .bc_aln_111 when bit 3 of the source is set */
1242	brz,pn	%o3, .bc_aln_110
1243	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1244
1245.bc_aln_111:
/*
 * One doubleword (8 bytes) is loaded into %d0 before the loop and the
 * source pointer and count are adjusted for it.  Each pass block-loads 64
 * bytes into %d16-%d30, copies the first seven doublewords into %d2-%d14
 * so that %d0-%d14 hold one full destination block, stores that block,
 * and then keeps the eighth doubleword (%d30) in %d0 for the next pass.
 * Inside the loop %i1 holds (dst - src), so [%i0+%i1] addresses the
 * destination; the add after the loop turns %i1 back into a pointer.
 * %o3 is the loop byte count, %i2 keeps the residue (count & 0x7f).
 * The doubleword still pending in %d0 is stored after the loop.
 */
1246! Alignment off by 8 bytes
1247	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1248	ldd	[%i0], %d0
1249	add	%i0, 8, %i0
1250	sub	%i2, 8, %i2
1251	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1252	and	%i2, 0x7f, %i2		! residue bytes in %i2
1253	sub	%i1, %i0, %i1
1254.bc_aln_111_loop:
1255	ldda	[%i0]ASI_BLK_P,%d16		! block load
1256	subcc	%o3, 64, %o3
1257	fmovd	%d16, %d2
1258	fmovd	%d18, %d4
1259	fmovd	%d20, %d6
1260	fmovd	%d22, %d8
1261	fmovd	%d24, %d10
1262	fmovd	%d26, %d12
1263	fmovd	%d28, %d14
1264	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1265	stda	%d0,[%i0+%i1]ASI_BLK_P
1266	add	%i0, 64, %i0
1267	fmovd	%d30, %d0
1268	bgt,pt	%ncc, .bc_aln_111_loop
1269	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1270	add	%i1, %i0, %i1
1271
1272	std	%d0, [%i1]
1273	ba	.bc_remain_stuff
1274	add	%i1, 8, %i1
1275	! END OF aln_111
1276
1277.bc_aln_110:
/*
 * Two doublewords (16 bytes) are preloaded into %d0/%d2.  Each pass
 * block-loads 64 bytes into %d16-%d30, copies the first six doublewords
 * into %d4-%d14 to complete a destination block in %d0-%d14, stores it,
 * and carries the last two doublewords (%d28, %d30) over in %d0/%d2.
 * %i1 holds (dst - src) inside the loop so [%i0+%i1] is the destination.
 * The two pending doublewords are stored after the loop.
 */
1278! Alignment off by 16 bytes
1279	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1280	ldd	[%i0], %d0
1281	ldd	[%i0+8], %d2
1282	add	%i0, 16, %i0
1283	sub	%i2, 16, %i2
1284	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1285	and	%i2, 0x7f, %i2		! residue bytes in %i2
1286	sub	%i1, %i0, %i1
1287.bc_aln_110_loop:
1288	ldda	[%i0]ASI_BLK_P,%d16		! block load
1289	subcc	%o3, 64, %o3
1290	fmovd	%d16, %d4
1291	fmovd	%d18, %d6
1292	fmovd	%d20, %d8
1293	fmovd	%d22, %d10
1294	fmovd	%d24, %d12
1295	fmovd	%d26, %d14
1296	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1297	stda	%d0,[%i0+%i1]ASI_BLK_P
1298	add	%i0, 64, %i0
1299	fmovd	%d28, %d0
1300	fmovd	%d30, %d2
1301	bgt,pt	%ncc, .bc_aln_110_loop
1302	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1303	add	%i1, %i0, %i1
1304
1305	std	%d0, [%i1]
1306	std	%d2, [%i1+8]
1307	ba	.bc_remain_stuff
1308	add	%i1, 16, %i1
1309	! END OF aln_110
1310
1311.bc_aln_101:
/*
 * Three doublewords (24 bytes) are preloaded into %d0-%d4.  Each pass
 * block-loads 64 bytes into %d16-%d30, copies the first five doublewords
 * into %d6-%d14 to complete a destination block in %d0-%d14, stores it,
 * and carries the last three doublewords (%d26-%d30) over in %d0-%d4.
 * %i1 holds (dst - src) inside the loop so [%i0+%i1] is the destination.
 * The three pending doublewords are stored after the loop.
 */
1312! Alignment off by 24 bytes
1313	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1314	ldd	[%i0], %d0
1315	ldd	[%i0+8], %d2
1316	ldd	[%i0+16], %d4
1317	add	%i0, 24, %i0
1318	sub	%i2, 24, %i2
1319	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1320	and	%i2, 0x7f, %i2		! residue bytes in %i2
1321	sub	%i1, %i0, %i1
1322.bc_aln_101_loop:
1323	ldda	[%i0]ASI_BLK_P,%d16	! block load
1324	subcc	%o3, 64, %o3
1325	fmovd	%d16, %d6
1326	fmovd	%d18, %d8
1327	fmovd	%d20, %d10
1328	fmovd	%d22, %d12
1329	fmovd	%d24, %d14
1330	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1331	stda	%d0,[%i0+%i1]ASI_BLK_P
1332	add	%i0, 64, %i0
1333	fmovd	%d26, %d0
1334	fmovd	%d28, %d2
1335	fmovd	%d30, %d4
1336	bgt,pt	%ncc, .bc_aln_101_loop
1337	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1338	add	%i1, %i0, %i1
1339
1340	std	%d0, [%i1]
1341	std	%d2, [%i1+8]
1342	std	%d4, [%i1+16]
1343	ba	.bc_remain_stuff
1344	add	%i1, 24, %i1
1345	! END OF aln_101
1346
1347.bc_aln_100:
/*
 * Four doublewords (32 bytes) are preloaded into %d0-%d6.  Each pass
 * block-loads 64 bytes into %d16-%d30, copies the first four doublewords
 * into %d8-%d14 to complete a destination block in %d0-%d14, stores it,
 * and carries the last four doublewords (%d24-%d30) over in %d0-%d6.
 * %i1 holds (dst - src) inside the loop so [%i0+%i1] is the destination.
 * The four pending doublewords are stored after the loop.
 */
1348! Alignment off by 32 bytes
1349	ldd	[%i0], %d0
1350	ldd	[%i0+8], %d2
1351	ldd	[%i0+16],%d4
1352	ldd	[%i0+24],%d6
1353	add	%i0, 32, %i0
1354	sub	%i2, 32, %i2
1355	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1356	and	%i2, 0x7f, %i2		! residue bytes in %i2
1357	sub	%i1, %i0, %i1
1358.bc_aln_100_loop:
1359	ldda	[%i0]ASI_BLK_P,%d16	! block load
1360	subcc	%o3, 64, %o3
1361	fmovd	%d16, %d8
1362	fmovd	%d18, %d10
1363	fmovd	%d20, %d12
1364	fmovd	%d22, %d14
1365	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1366	stda	%d0,[%i0+%i1]ASI_BLK_P
1367	add	%i0, 64, %i0
1368	fmovd	%d24, %d0
1369	fmovd	%d26, %d2
1370	fmovd	%d28, %d4
1371	fmovd	%d30, %d6
1372	bgt,pt	%ncc, .bc_aln_100_loop
1373	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1374	add	%i1, %i0, %i1
1375
1376	std	%d0, [%i1]
1377	std	%d2, [%i1+8]
1378	std	%d4, [%i1+16]
1379	std	%d6, [%i1+24]
1380	ba	.bc_remain_stuff
1381	add	%i1, 32, %i1
1382	! END OF aln_100
1383
1384.bc_aln_011:
/*
 * Five doublewords (40 bytes) are preloaded into %d0-%d8.  Each pass
 * block-loads 64 bytes into %d16-%d30, copies the first three doublewords
 * into %d10-%d14 to complete a destination block in %d0-%d14, stores it,
 * and carries the last five doublewords (%d22-%d30) over in %d0-%d8.
 * %i1 holds (dst - src) inside the loop so [%i0+%i1] is the destination.
 * The five pending doublewords are stored after the loop.
 */
1385! Alignment off by 40 bytes
1386	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1387	ldd	[%i0], %d0
1388	ldd	[%i0+8], %d2
1389	ldd	[%i0+16], %d4
1390	ldd	[%i0+24], %d6
1391	ldd	[%i0+32], %d8
1392	add	%i0, 40, %i0
1393	sub	%i2, 40, %i2
1394	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1395	and	%i2, 0x7f, %i2		! residue bytes in %i2
1396	sub	%i1, %i0, %i1
1397.bc_aln_011_loop:
1398	ldda	[%i0]ASI_BLK_P,%d16	! block load
1399	subcc	%o3, 64, %o3
1400	fmovd	%d16, %d10
1401	fmovd	%d18, %d12
1402	fmovd	%d20, %d14
1403	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1404	stda	%d0,[%i0+%i1]ASI_BLK_P
1405	add	%i0, 64, %i0
1406	fmovd	%d22, %d0
1407	fmovd	%d24, %d2
1408	fmovd	%d26, %d4
1409	fmovd	%d28, %d6
1410	fmovd	%d30, %d8
1411	bgt,pt	%ncc, .bc_aln_011_loop
1412	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1413	add	%i1, %i0, %i1
1414
1415	std	%d0, [%i1]
1416	std	%d2, [%i1+8]
1417	std	%d4, [%i1+16]
1418	std	%d6, [%i1+24]
1419	std	%d8, [%i1+32]
1420	ba	.bc_remain_stuff
1421	add	%i1, 40, %i1
1422	! END OF aln_011
1423
1424.bc_aln_010:
/*
 * Six doublewords (48 bytes) are preloaded into %d0-%d10.  Each pass
 * block-loads 64 bytes into %d16-%d30, copies the first two doublewords
 * into %d12/%d14 to complete a destination block in %d0-%d14, stores it,
 * and carries the last six doublewords (%d20-%d30) over in %d0-%d10.
 * %i1 holds (dst - src) inside the loop so [%i0+%i1] is the destination.
 * The six pending doublewords are stored after the loop.
 */
1425! Alignment off by 48 bytes
1426	ldd	[%i0], %d0
1427	ldd	[%i0+8], %d2
1428	ldd	[%i0+16], %d4
1429	ldd	[%i0+24], %d6
1430	ldd	[%i0+32], %d8
1431	ldd	[%i0+40], %d10
1432	add	%i0, 48, %i0
1433	sub	%i2, 48, %i2
1434	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1435	and	%i2, 0x7f, %i2		! residue bytes in %i2
1436	sub	%i1, %i0, %i1
1437.bc_aln_010_loop:
1438	ldda	[%i0]ASI_BLK_P,%d16	! block load
1439	subcc	%o3, 64, %o3
1440	fmovd	%d16, %d12
1441	fmovd	%d18, %d14
1442	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1443	stda	%d0,[%i0+%i1]ASI_BLK_P
1444	add	%i0, 64, %i0
1445	fmovd	%d20, %d0
1446	fmovd	%d22, %d2
1447	fmovd	%d24, %d4
1448	fmovd	%d26, %d6
1449	fmovd	%d28, %d8
1450	fmovd	%d30, %d10
1451	bgt,pt	%ncc, .bc_aln_010_loop
1452	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1453	add	%i1, %i0, %i1
1454
1455	std	%d0, [%i1]
1456	std	%d2, [%i1+8]
1457	std	%d4, [%i1+16]
1458	std	%d6, [%i1+24]
1459	std	%d8, [%i1+32]
1460	std	%d10, [%i1+40]
1461	ba	.bc_remain_stuff
1462	add	%i1, 48, %i1
1463	! END OF aln_010
1464
1465.bc_aln_001:
/*
 * Seven doublewords (56 bytes) are preloaded into %d0-%d12.  Each pass
 * block-loads 64 bytes into %d16-%d30, copies the first doubleword into
 * %d14 to complete a destination block in %d0-%d14, stores it, and
 * carries the last seven doublewords (%d18-%d30) over in %d0-%d12.
 * %i1 holds (dst - src) inside the loop so [%i0+%i1] is the destination.
 * The seven pending doublewords are stored after the loop.
 */
1466! Alignment off by 56 bytes
1467	ldd	[%i0], %d0
1468	ldd	[%i0+8], %d2
1469	ldd	[%i0+16], %d4
1470	ldd	[%i0+24], %d6
1471	ldd	[%i0+32], %d8
1472	ldd	[%i0+40], %d10
1473	ldd	[%i0+48], %d12
1474	add	%i0, 56, %i0
1475	sub	%i2, 56, %i2
1476	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1477	and	%i2, 0x7f, %i2		! residue bytes in %i2
1478	sub	%i1, %i0, %i1
1479.bc_aln_001_loop:
1480	ldda	[%i0]ASI_BLK_P,%d16	! block load
1481	subcc	%o3, 64, %o3
1482	fmovd	%d16, %d14
1483	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1484	stda	%d0,[%i0+%i1]ASI_BLK_P
1485	add	%i0, 64, %i0
1486	fmovd	%d18, %d0
1487	fmovd	%d20, %d2
1488	fmovd	%d22, %d4
1489	fmovd	%d24, %d6
1490	fmovd	%d26, %d8
1491	fmovd	%d28, %d10
1492	fmovd	%d30, %d12
1493	bgt,pt	%ncc, .bc_aln_001_loop
1494	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1495	add	%i1, %i0, %i1
1496
1497	std	%d0, [%i1]
1498	std	%d2, [%i1+8]
1499	std	%d4, [%i1+16]
1500	std	%d6, [%i1+24]
1501	std	%d8, [%i1+32]
1502	std	%d10, [%i1+40]
1503	std	%d12, [%i1+48]
1504	ba	.bc_remain_stuff
1505	add	%i1, 56, %i1
1506	! END OF aln_001
1507
1508.bc_aln_000:
/*
 * Reached when bits 5, 4 and 3 of the source address are all zero.
 * No data is preloaded or shuffled: each pass block-loads 64 bytes
 * straight into %d0-%d14 and block-stores them.  %i1 holds (dst - src)
 * inside the loop so [%i0+%i1] is the destination; it is converted back
 * to a pointer after the loop, and control then falls through into
 * .bc_remain_stuff with the residue in %i2.
 */
1509	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1510	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1511	and	%i2, 0x7f, %i2		! residue bytes in %i2
1512	sub	%i1, %i0, %i1
1513.bc_aln_000_loop:
1514	ldda	[%i0]ASI_BLK_P,%d0
1515	subcc	%o3, 64, %o3
1516	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1517	stda	%d0,[%i0+%i1]ASI_BLK_P
1518	add	%i0, 64, %i0
1519	bgt,pt	%ncc, .bc_aln_000_loop
1520	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1521	add	%i1, %i0, %i1
1522
1523	! END OF aln_000
1524
1525.bc_remain_stuff:
/*
 * Copy the residue counted in %i2 using 8-byte integer loads and stores:
 * 32 bytes per pass in .bc_aln_32, then 8 bytes per pass in .bc_aln_15,
 * then at most one 4-byte word.  Any 1-3 bytes left after that are
 * handed to .bc_unaln3x; a zero remainder goes to .bc_exit.
 *
 * The count is kept biased (-31 here, +24 at .bc_aln_31, +7 at
 * .bc_aln_7) so that the condition codes from each subcc/addcc decide
 * directly whether another full pass is needed.
 */
1526	subcc	%i2, 31, %i2		! adjust length to allow cc test
1527	ble,pt	%ncc, .bc_aln_31
1528	nop
1529.bc_aln_32:
1530	ldx	[%i0], %o4		! move 32 bytes
1531	subcc	%i2, 32, %i2		! decrement length count by 32
1532	stx	%o4, [%i1]
1533	ldx	[%i0+8], %o4
1534	stx	%o4, [%i1+8]
1535	ldx	[%i0+16], %o4
1536	add	%i0, 32, %i0		! increase src ptr by 32
1537	stx	%o4, [%i1+16]
1538	ldx	[%i0-8], %o4
1539	add	%i1, 32, %i1		! increase dst ptr by 32
1540	bgu,pt	%ncc, .bc_aln_32	! repeat if at least 32 bytes left
1541	stx	%o4, [%i1-8]
1542.bc_aln_31:
1543	addcc	%i2, 24, %i2		! adjust count to be off by 7
1544	ble,pt	%ncc, .bc_aln_7		! skip if 7 or fewer bytes left
1545	nop				!
1546.bc_aln_15:
1547	ldx	[%i0], %o4		! move 8 bytes
1548	add	%i0, 8, %i0		! increase src ptr by 8
1549	subcc	%i2, 8, %i2		! decrease count by 8
1550	add	%i1, 8, %i1		! increase dst ptr by 8
1551	bgu,pt	%ncc, .bc_aln_15
1552	stx	%o4, [%i1-8]		!
1553.bc_aln_7:
1554	addcc	%i2, 7, %i2		! finish adjustment of remaining count
1555	bz,pt	%ncc, .bc_exit		! exit if finished
/* the cmp below sits in the delay slot of the bz above */
1556	cmp	%i2, 4
1557	blt,pt	%ncc, .bc_unaln3x	! skip if less than 4 bytes left
1558	nop				!
1559	ld	[%i0], %o4		! move 4 bytes
1560	add	%i0, 4, %i0		! increase src ptr by 4
1561	add	%i1, 4, %i1		! increase dst ptr by 4
1562	subcc	%i2, 4, %i2		! decrease count by 4
1563	bnz	.bc_unaln3x
1564	stw	%o4, [%i1-4]
1565	ba	.bc_exit
1566	nop
1567
/*
 * Helpers for the destination alignment ladder that follows
 * .bc_fp_ready.  Each one copies 1, 2 or 4 bytes using byte loads (the
 * source alignment is not known yet), advances %i0/%i1, reduces %i2 in
 * the delay slot of its exit branch, and returns to the matching
 * .bc_big_d?f label.  .bc_big_d1 and .bc_big_d2 fall through to the next
 * helper when the freshly advanced destination still has the next
 * alignment bit set.
 */
1568	! destination alignment code
1569.bc_big_d1:
1570	ldub	[%i0], %o4		! move a byte
1571	add	%i0, 1, %i0
1572	stb	%o4, [%i1]
1573	add	%i1, 1, %i1
1574	andcc	%i1, 2, %o3
1575	bz,pt	%ncc, .bc_big_d2f
1576	sub	%i2, 1, %i2
1577.bc_big_d2:
1578	ldub	[%i0], %o4		! move a half-word (src align unknown)
1579	ldub	[%i0+1], %o3
1580	add	%i0, 2, %i0
1581	sll	%o4, 8, %o4		! position
1582	or	%o4, %o3, %o4		! merge
1583	sth	%o4, [%i1]
1584	add	%i1, 2, %i1
1585	andcc	%i1, 4, %o3
1586	bz,pt	%ncc, .bc_big_d4f
1587	sub	%i2, 2, %i2
1588.bc_big_d4:
1589	ldub	[%i0], %o4		! move a word (src align unknown)
1590	ldub	[%i0+1], %o3
1591	sll	%o4, 24, %o4		! position
1592	sll	%o3, 16, %o3		! position
1593	or	%o4, %o3, %o3		! merge
1594	ldub	[%i0+2], %o4
1595	sll	%o4, 8, %o4		! position
1596	or	%o4, %o3, %o3		! merge
1597	ldub	[%i0+3], %o4
1598	or	%o4, %o3, %o4		! merge
1599	stw	%o4,[%i1]		! store four bytes
1600	add	%i0, 4, %i0		! adjust src by 4
1601	add	%i1, 4, %i1		! adjust dest by 4
1602	ba	.bc_big_d4f
1603	sub	%i2, 4, %i2		! adjust count by 4
1604
1605
/*
 * Destination is 8-byte aligned, source is not.  If the destination is
 * not yet on a 64-byte boundary, %o3 is set to 64 - (dst & 0x3f), the
 * number of bytes needed to reach one, and %i2 is reduced by it.  Those
 * bytes are then copied 8 at a time by one of three loops chosen from
 * the low two bits of the source address:
 *	.bc_unalnword	two 4-byte loads, two 4-byte stores
 *	.bc_unalnhalf	2 + 4 + 2 byte loads merged into one 8-byte store
 *	.bc_unalnbyte	1 + 2 + 2 + 2 + 1 byte loads merged into one store
 * Note the sub in the delay slot of the first bz executes on both paths.
 */
1606	! Dst is on 8 byte boundary; src is not;
1607.bc_big_unal8:
1608	andcc	%i1, 0x3f, %o3		! is dst 64-byte block aligned?
1609	bz	%ncc, .bc_unalnsrc
1610	sub	%o3, 64, %o3		! %o3 will be multiple of 8
1611	neg	%o3			! bytes until dest is 64 byte aligned
1612	sub	%i2, %o3, %i2		! update cnt with bytes to be moved
1613	! Move bytes according to source alignment
1614	andcc	%i0, 0x1, %o4
1615	bnz	%ncc, .bc_unalnbyte	! check for byte alignment
1616	nop
1617	andcc	%i0, 2, %o4		! check for half word alignment
1618	bnz	%ncc, .bc_unalnhalf
1619	nop
1620	! Src is word aligned, move bytes until dest 64 byte aligned
1621.bc_unalnword:
1622	ld	[%i0], %o4		! load 4 bytes
1623	stw	%o4, [%i1]		! and store 4 bytes
1624	ld	[%i0+4], %o4		! load 4 bytes
1625	add	%i0, 8, %i0		! increase src ptr by 8
1626	stw	%o4, [%i1+4]		! and store 4 bytes
1627	subcc	%o3, 8, %o3		! decrease count by 8
1628	bnz	%ncc, .bc_unalnword
1629	add	%i1, 8, %i1		! increase dst ptr by 8
1630	ba	.bc_unalnsrc
1631	nop
1632
1633	! Src is half-word aligned, move bytes until dest 64 byte aligned
1634.bc_unalnhalf:
/* %i3 accumulates the 8 bytes: half-word, word, half-word, high to low */
1635	lduh	[%i0], %o4		! load 2 bytes
1636	sllx	%o4, 32, %i3		! shift left
1637	lduw	[%i0+2], %o4
1638	or	%o4, %i3, %i3
1639	sllx	%i3, 16, %i3
1640	lduh	[%i0+6], %o4
1641	or	%o4, %i3, %i3
1642	stx	%i3, [%i1]
1643	add	%i0, 8, %i0
1644	subcc	%o3, 8, %o3
1645	bnz	%ncc, .bc_unalnhalf
1646	add	%i1, 8, %i1
1647	ba	.bc_unalnsrc
1648	nop
1649
1650	! Src is Byte aligned, move bytes until dest 64 byte aligned
1651.bc_unalnbyte:
/*
 * %i1 is turned into (dst - src) for the duration of the loop so that
 * only %i0 has to be advanced; [%i1+%i0] then addresses the destination.
 */
1652	sub	%i1, %i0, %i1		! share pointer advance
1653.bc_unalnbyte_loop:
1654	ldub	[%i0], %o4
1655	sllx	%o4, 56, %i3
1656	lduh	[%i0+1], %o4
1657	sllx	%o4, 40, %o4
1658	or	%o4, %i3, %i3
1659	lduh	[%i0+3], %o4
1660	sllx	%o4, 24, %o4
1661	or	%o4, %i3, %i3
1662	lduh	[%i0+5], %o4
1663	sllx	%o4, 8, %o4
1664	or	%o4, %i3, %i3
1665	ldub	[%i0+7], %o4
1666	or	%o4, %i3, %i3
1667	stx	%i3, [%i1+%i0]
1668	subcc	%o3, 8, %o3
1669	bnz	%ncc, .bc_unalnbyte_loop
1670	add	%i0, 8, %i0
1671	add	%i1,%i0, %i1		! restore pointer
1672
/*
 * Set-up and dispatch for the unaligned-source block loops.
 *
 *	%i3	number of bytes the block loops will copy: the count
 *		rounded down to a multiple of 64, less one block
 *	%i2	bytes left for the tail code: (count & 0x3f) + 64
 *	%o4	source address rounded down to a 64-byte boundary; the
 *		block loops read through this cursor
 *	%i0	advanced by %i3, i.e. to the first source byte the block
 *		loops will NOT consume; the tail code continues from here
 *
 * alignaddr is issued with %g0 as destination, so its only effect is to
 * record the low three bits of the source address in %gsr for the
 * faligndata instructions in the loops.
 *
 * The loop is chosen from bits 5, 4 and 3 of the source address (the
 * digits of each .bc_unaln_xyz label, in that order).  %i3 is a multiple
 * of 64, so testing %i0 after the add sees the same three bits.  Each
 * branch carries either the next andcc or a prefetch in its delay slot;
 * the ",a" branches annul the prefetch when not taken.
 */
1673	! Destination is now block (64 byte aligned), src is not 8 byte aligned
1674.bc_unalnsrc:
1675	andn	%i2, 0x3f, %i3		! %i3 is multiple of block size
1676	and	%i2, 0x3f, %i2		! residue bytes in %i2
1677	add	%i2, 64, %i2		! Insure we don't load beyond
1678	sub	%i3, 64, %i3		! end of source buffer
1679
1680	andn	%i0, 0x3f, %o4		! %o4 has block aligned src address
1681	prefetch [%o4 + (3 * CACHE_LINE)], #one_read
1682	alignaddr %i0, %g0, %g0		! generate %gsr
1683	add	%i0, %i3, %i0		! advance %i0 to after blocks
1684	!
1685	! Determine source alignment to correct 8 byte offset
1686	andcc	%i0, 0x20, %o3
1687	brnz,pn	%o3, .bc_unaln_1
1688	andcc	%i0, 0x10, %o3
1689	brnz,pn	%o3, .bc_unaln_01
1690	andcc	%i0, 0x08, %o3
1691	brz,a	%o3, .bc_unaln_000
1692	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1693	ba	.bc_unaln_001
1694	nop
1695.bc_unaln_01:
1696	brnz,a	%o3, .bc_unaln_011
1697	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1698	ba	.bc_unaln_010
1699	nop
1700.bc_unaln_1:
1701	brnz,pn	%o3, .bc_unaln_11
1702	andcc	%i0, 0x08, %o3
1703	brnz,a	%o3, .bc_unaln_101
1704	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1705	ba	.bc_unaln_100
1706	nop
1707.bc_unaln_11:
/*
 * The prefetch is based on %o4, the block cursor, like every other
 * prefetch in this dispatch and in the loops.  %i0 must not be used
 * here: it already points past the whole block region, so a hint
 * relative to it would target data the loops are not about to read.
 * Falls through into .bc_unaln_111 when bit 3 of the source is set.
 */
1708	brz,pn	%o3, .bc_unaln_110
1709	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1710
1711.bc_unaln_111:
/*
 * Unaligned-source block loop.  %o4 is the 64-byte-aligned source
 * cursor; only the last doubleword of its first block is loaded (%d14).
 * Each pass block-loads the next 64 bytes into %d16-%d30, runs
 * faligndata over each adjacent pair of doublewords starting at %d14 to
 * build %d48-%d62, block-stores those to [%i1], and carries %d30 over in
 * %d14.  faligndata uses the byte offset already recorded in %gsr.
 * %i3 counts the block bytes still to copy.
 */
1712	ldd	[%o4+56], %d14
1713.bc_unaln_111_loop:
1714	add	%o4, 64, %o4
1715	ldda	[%o4]ASI_BLK_P, %d16
1716	faligndata %d14, %d16, %d48
1717	faligndata %d16, %d18, %d50
1718	faligndata %d18, %d20, %d52
1719	faligndata %d20, %d22, %d54
1720	faligndata %d22, %d24, %d56
1721	faligndata %d24, %d26, %d58
1722	faligndata %d26, %d28, %d60
1723	faligndata %d28, %d30, %d62
1724	fmovd	%d30, %d14
1725	stda	%d48, [%i1]ASI_BLK_P
1726	subcc	%i3, 64, %i3
1727	add	%i1, 64, %i1
1728	bgu,pt	%ncc, .bc_unaln_111_loop
1729	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1730	ba	.bc_unaln_done
1731	nop
1732
1733.bc_unaln_110:
/*
 * Unaligned-source block loop.  The last two doublewords of the first
 * source block at %o4 are loaded (%d12, %d14).  Each pass block-loads the
 * next 64 bytes into %d16-%d30, runs faligndata over adjacent doubleword
 * pairs starting at %d12 to build %d48-%d62, block-stores those to
 * [%i1], and carries %d28/%d30 over in %d12/%d14.  %i3 counts the block
 * bytes still to copy.
 */
1734	ldd	[%o4+48], %d12
1735	ldd	[%o4+56], %d14
1736.bc_unaln_110_loop:
1737	add	%o4, 64, %o4
1738	ldda	[%o4]ASI_BLK_P, %d16
1739	faligndata %d12, %d14, %d48
1740	faligndata %d14, %d16, %d50
1741	faligndata %d16, %d18, %d52
1742	faligndata %d18, %d20, %d54
1743	faligndata %d20, %d22, %d56
1744	faligndata %d22, %d24, %d58
1745	faligndata %d24, %d26, %d60
1746	faligndata %d26, %d28, %d62
1747	fmovd	%d28, %d12
1748	fmovd	%d30, %d14
1749	stda	%d48, [%i1]ASI_BLK_P
1750	subcc	%i3, 64, %i3
1751	add	%i1, 64, %i1
1752	bgu,pt	%ncc, .bc_unaln_110_loop
1753	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1754	ba	.bc_unaln_done
1755	nop
1756
1757.bc_unaln_101:
/*
 * Unaligned-source block loop.  The last three doublewords of the first
 * source block at %o4 are loaded (%d10-%d14).  Each pass block-loads the
 * next 64 bytes into %d16-%d30, runs faligndata over adjacent doubleword
 * pairs starting at %d10 to build %d48-%d62, block-stores those to
 * [%i1], and carries %d26-%d30 over in %d10-%d14.  %i3 counts the block
 * bytes still to copy.
 */
1758	ldd	[%o4+40], %d10
1759	ldd	[%o4+48], %d12
1760	ldd	[%o4+56], %d14
1761.bc_unaln_101_loop:
1762	add	%o4, 64, %o4
1763	ldda	[%o4]ASI_BLK_P, %d16
1764	faligndata %d10, %d12, %d48
1765	faligndata %d12, %d14, %d50
1766	faligndata %d14, %d16, %d52
1767	faligndata %d16, %d18, %d54
1768	faligndata %d18, %d20, %d56
1769	faligndata %d20, %d22, %d58
1770	faligndata %d22, %d24, %d60
1771	faligndata %d24, %d26, %d62
1772	fmovd	%d26, %d10
1773	fmovd	%d28, %d12
1774	fmovd	%d30, %d14
1775	stda	%d48, [%i1]ASI_BLK_P
1776	subcc	%i3, 64, %i3
1777	add	%i1, 64, %i1
1778	bgu,pt	%ncc, .bc_unaln_101_loop
1779	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1780	ba	.bc_unaln_done
1781	nop
1782
1783.bc_unaln_100:
/*
 * Unaligned-source block loop.  The last four doublewords of the first
 * source block at %o4 are loaded (%d8-%d14).  Each pass block-loads the
 * next 64 bytes into %d16-%d30, runs faligndata over adjacent doubleword
 * pairs starting at %d8 to build %d48-%d62, block-stores those to [%i1],
 * and carries %d24-%d30 over in %d8-%d14.  %i3 counts the block bytes
 * still to copy.
 */
1784	ldd	[%o4+32], %d8
1785	ldd	[%o4+40], %d10
1786	ldd	[%o4+48], %d12
1787	ldd	[%o4+56], %d14
1788.bc_unaln_100_loop:
1789	add	%o4, 64, %o4
1790	ldda	[%o4]ASI_BLK_P, %d16
1791	faligndata %d8, %d10, %d48
1792	faligndata %d10, %d12, %d50
1793	faligndata %d12, %d14, %d52
1794	faligndata %d14, %d16, %d54
1795	faligndata %d16, %d18, %d56
1796	faligndata %d18, %d20, %d58
1797	faligndata %d20, %d22, %d60
1798	faligndata %d22, %d24, %d62
1799	fmovd	%d24, %d8
1800	fmovd	%d26, %d10
1801	fmovd	%d28, %d12
1802	fmovd	%d30, %d14
1803	stda	%d48, [%i1]ASI_BLK_P
1804	subcc	%i3, 64, %i3
1805	add	%i1, 64, %i1
1806	bgu,pt	%ncc, .bc_unaln_100_loop
1807	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1808	ba	.bc_unaln_done
1809	nop
1810
1811.bc_unaln_011:
/*
 * Unaligned-source block loop.  The last five doublewords of the first
 * source block at %o4 are loaded (%d6-%d14).  Each pass block-loads the
 * next 64 bytes into %d16-%d30, runs faligndata over adjacent doubleword
 * pairs starting at %d6 to build %d48-%d62, block-stores those to [%i1],
 * and carries %d22-%d30 over in %d6-%d14.  %i3 counts the block bytes
 * still to copy.
 */
1812	ldd	[%o4+24], %d6
1813	ldd	[%o4+32], %d8
1814	ldd	[%o4+40], %d10
1815	ldd	[%o4+48], %d12
1816	ldd	[%o4+56], %d14
1817.bc_unaln_011_loop:
1818	add	%o4, 64, %o4
1819	ldda	[%o4]ASI_BLK_P, %d16
1820	faligndata %d6, %d8, %d48
1821	faligndata %d8, %d10, %d50
1822	faligndata %d10, %d12, %d52
1823	faligndata %d12, %d14, %d54
1824	faligndata %d14, %d16, %d56
1825	faligndata %d16, %d18, %d58
1826	faligndata %d18, %d20, %d60
1827	faligndata %d20, %d22, %d62
1828	fmovd	%d22, %d6
1829	fmovd	%d24, %d8
1830	fmovd	%d26, %d10
1831	fmovd	%d28, %d12
1832	fmovd	%d30, %d14
1833	stda	%d48, [%i1]ASI_BLK_P
1834	subcc	%i3, 64, %i3
1835	add	%i1, 64, %i1
1836	bgu,pt	%ncc, .bc_unaln_011_loop
1837	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1838	ba	.bc_unaln_done
1839	nop
1840
1841.bc_unaln_010:
/*
 * Unaligned-source block loop.  The last six doublewords of the first
 * source block at %o4 are loaded (%d4-%d14).  Each pass block-loads the
 * next 64 bytes into %d16-%d30, runs faligndata over adjacent doubleword
 * pairs starting at %d4 to build %d48-%d62, block-stores those to [%i1],
 * and carries %d20-%d30 over in %d4-%d14.  %i3 counts the block bytes
 * still to copy.
 */
1842	ldd	[%o4+16], %d4
1843	ldd	[%o4+24], %d6
1844	ldd	[%o4+32], %d8
1845	ldd	[%o4+40], %d10
1846	ldd	[%o4+48], %d12
1847	ldd	[%o4+56], %d14
1848.bc_unaln_010_loop:
1849	add	%o4, 64, %o4
1850	ldda	[%o4]ASI_BLK_P, %d16
1851	faligndata %d4, %d6, %d48
1852	faligndata %d6, %d8, %d50
1853	faligndata %d8, %d10, %d52
1854	faligndata %d10, %d12, %d54
1855	faligndata %d12, %d14, %d56
1856	faligndata %d14, %d16, %d58
1857	faligndata %d16, %d18, %d60
1858	faligndata %d18, %d20, %d62
1859	fmovd	%d20, %d4
1860	fmovd	%d22, %d6
1861	fmovd	%d24, %d8
1862	fmovd	%d26, %d10
1863	fmovd	%d28, %d12
1864	fmovd	%d30, %d14
1865	stda	%d48, [%i1]ASI_BLK_P
1866	subcc	%i3, 64, %i3
1867	add	%i1, 64, %i1
1868	bgu,pt	%ncc, .bc_unaln_010_loop
1869	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1870	ba	.bc_unaln_done
1871	nop
1872
1873.bc_unaln_001:
/*
 * Unaligned-source block loop.  The last seven doublewords of the first
 * source block at %o4 are loaded (%d2-%d14).  Each pass block-loads the
 * next 64 bytes into %d16-%d30, runs faligndata over adjacent doubleword
 * pairs starting at %d2 to build %d48-%d62, block-stores those to [%i1],
 * and carries %d18-%d30 over in %d2-%d14.  %i3 counts the block bytes
 * still to copy.
 */
1874	ldd	[%o4+8], %d2
1875	ldd	[%o4+16], %d4
1876	ldd	[%o4+24], %d6
1877	ldd	[%o4+32], %d8
1878	ldd	[%o4+40], %d10
1879	ldd	[%o4+48], %d12
1880	ldd	[%o4+56], %d14
1881.bc_unaln_001_loop:
1882	add	%o4, 64, %o4
1883	ldda	[%o4]ASI_BLK_P, %d16
1884	faligndata %d2, %d4, %d48
1885	faligndata %d4, %d6, %d50
1886	faligndata %d6, %d8, %d52
1887	faligndata %d8, %d10, %d54
1888	faligndata %d10, %d12, %d56
1889	faligndata %d12, %d14, %d58
1890	faligndata %d14, %d16, %d60
1891	faligndata %d16, %d18, %d62
1892	fmovd	%d18, %d2
1893	fmovd	%d20, %d4
1894	fmovd	%d22, %d6
1895	fmovd	%d24, %d8
1896	fmovd	%d26, %d10
1897	fmovd	%d28, %d12
1898	fmovd	%d30, %d14
1899	stda	%d48, [%i1]ASI_BLK_P
1900	subcc	%i3, 64, %i3
1901	add	%i1, 64, %i1
1902	bgu,pt	%ncc, .bc_unaln_001_loop
1903	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1904	ba	.bc_unaln_done
1905	nop
1906
1907.bc_unaln_000:
/*
 * Unaligned-source block loop.  The whole first source block at %o4 is
 * block-loaded into %d0-%d14.  Each pass block-loads the next 64 bytes
 * into %d16-%d30, runs faligndata over adjacent doubleword pairs
 * starting at %d0 to build %d48-%d62, block-stores those to [%i1], and
 * moves %d16-%d30 down into %d0-%d14 for the next pass.  %i3 counts the
 * block bytes still to copy.  When the loop ends control falls through
 * to .bc_unaln_done.
 */
1908	ldda	[%o4]ASI_BLK_P, %d0
1909.bc_unaln_000_loop:
1910	add	%o4, 64, %o4
1911	ldda	[%o4]ASI_BLK_P, %d16
1912	faligndata %d0, %d2, %d48
1913	faligndata %d2, %d4, %d50
1914	faligndata %d4, %d6, %d52
1915	faligndata %d6, %d8, %d54
1916	faligndata %d8, %d10, %d56
1917	faligndata %d10, %d12, %d58
1918	faligndata %d12, %d14, %d60
1919	faligndata %d14, %d16, %d62
1920	fmovd	%d16, %d0
1921	fmovd	%d18, %d2
1922	fmovd	%d20, %d4
1923	fmovd	%d22, %d6
1924	fmovd	%d24, %d8
1925	fmovd	%d26, %d10
1926	fmovd	%d28, %d12
1927	fmovd	%d30, %d14
1928	stda	%d48, [%i1]ASI_BLK_P
1929	subcc	%i3, 64, %i3
1930	add	%i1, 64, %i1
1931	bgu,pt	%ncc, .bc_unaln_000_loop
1932	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1933
1934.bc_unaln_done:
/*
 * The block loops are finished and %i2 holds the bytes still to copy.
 * With 15 or fewer left, go straight to the byte-wise tail.  Otherwise
 * %i3 = (%i2 rounded down to a multiple of 8) - 8 bytes are copied here
 * one doubleword at a time, leaving (%i2 & 7) + 8 bytes in %i2.
 *
 * The loop reads from %o4, the source address rounded down to 8 bytes,
 * and merges each pair of consecutive doublewords with faligndata; no
 * alignaddr is issued here, so it relies on the offset already in %gsr.
 * %i0 is advanced past the bytes copied here so the tail code can
 * continue from it.
 *
 * The andn after the bleu sits in its delay slot and therefore executes
 * on both paths; it only writes %i3.
 */
1935	! Handle trailing bytes, 64 to 127
1936	! Dest long word aligned, Src not long word aligned
1937	cmp	%i2, 15
1938	bleu	%ncc, .bc_unaln_short
1939
1940	andn	%i2, 0x7, %i3		! %i3 is multiple of 8
1941	and	%i2, 0x7, %i2		! residue bytes in %i2
1942	add	%i2, 8, %i2
1943	sub	%i3, 8, %i3		! insure we don't load past end of src
1944	andn	%i0, 0x7, %o4		! %o4 has long word aligned src address
1945	add	%i0, %i3, %i0		! advance %i0 to after multiple of 8
1946	ldd	[%o4], %d0		! fetch partial word
1947.bc_unaln_by8:
1948	ldd	[%o4+8], %d2
1949	add	%o4, 8, %o4
1950	faligndata %d0, %d2, %d16
1951	subcc	%i3, 8, %i3
1952	std	%d16, [%i1]
1953	fmovd	%d2, %d0
1954	bgu,pt	%ncc, .bc_unaln_by8
1955	add	%i1, 8, %i1
1956
1957.bc_unaln_short:
/*
 * Byte-wise tail.  All source accesses from here on are single-byte
 * loads, so no source alignment is required.
 *	.bc_unaln_short	if at least 8 bytes remain, copy 8 as two
 *			4-byte stores assembled from byte loads
 *	.bc_unalnfin	if at least 4 bytes remain, copy 4 the same way
 *	.bc_unaln3x	copy the final 1, 2 or 3 bytes one at a time
 * %i2 is the remaining count throughout.
 */
1958	cmp	%i2, 8
1959	blt,pt	%ncc, .bc_unalnfin
1960	nop
1961	ldub	[%i0], %o4
1962	sll	%o4, 24, %o3
1963	ldub	[%i0+1], %o4
1964	sll	%o4, 16, %o4
1965	or	%o4, %o3, %o3
1966	ldub	[%i0+2], %o4
1967	sll	%o4, 8, %o4
1968	or	%o4, %o3, %o3
1969	ldub	[%i0+3], %o4
1970	or	%o4, %o3, %o3
1971	stw	%o3, [%i1]
1972	ldub	[%i0+4], %o4
1973	sll	%o4, 24, %o3
1974	ldub	[%i0+5], %o4
1975	sll	%o4, 16, %o4
1976	or	%o4, %o3, %o3
1977	ldub	[%i0+6], %o4
1978	sll	%o4, 8, %o4
1979	or	%o4, %o3, %o3
1980	ldub	[%i0+7], %o4
1981	or	%o4, %o3, %o3
1982	stw	%o3, [%i1+4]
1983	add	%i0, 8, %i0
1984	add	%i1, 8, %i1
1985	sub	%i2, 8, %i2
1986.bc_unalnfin:
/*
 * The tst in the delay slot of the blt sets the condition codes that the
 * bz at .bc_unalnz examines when fewer than 4 bytes remain.
 */
1987	cmp	%i2, 4
1988	blt,pt	%ncc, .bc_unalnz
1989	tst	%i2
1990	ldub	[%i0], %o3		! read byte
1991	subcc	%i2, 4, %i2		! reduce count by 4
1992	sll	%o3, 24, %o3		! position
1993	ldub	[%i0+1], %o4
1994	sll	%o4, 16, %o4		! position
1995	or	%o4, %o3, %o3		! merge
1996	ldub	[%i0+2], %o4
1997	sll	%o4, 8, %o4		! position
1998	or	%o4, %o3, %o3		! merge
1999	add	%i1, 4, %i1		! advance dst by 4
2000	ldub	[%i0+3], %o4
2001	add	%i0, 4, %i0		! advance src by 4
2002	or	%o4, %o3, %o4		! merge
2003	bnz,pt	%ncc, .bc_unaln3x
2004	stw	%o4, [%i1-4]
2005	ba	.bc_exit
2006	nop
2007.bc_unalnz:
/*
 * The subcc at .bc_unaln3x is the delay slot of this bz, so it also
 * executes when the branch to .bc_exit is taken; %i2 is not read again
 * between .bc_exit and the final ret.
 */
2008	bz,pt	%ncc, .bc_exit
2009.bc_unaln3x:				! Exactly 1, 2, or 3 bytes remain
2010	subcc	%i2, 1, %i2		! reduce count for cc test
2011	ldub	[%i0], %o4		! load one byte
2012	bz,pt	%ncc, .bc_exit
2013	stb	%o4, [%i1]		! store one byte
2014	ldub	[%i0+1], %o4		! load second byte
2015	subcc	%i2, 1, %i2
2016	bz,pt	%ncc, .bc_exit
2017	stb	%o4, [%i1+1]		! store second byte
2018	ldub	[%i0+2], %o4		! load third byte
2019	stb	%o4, [%i1+2]		! store third byte
2020.bc_exit:
/*
 * Common exit of bcopy_more.  Undoes, in order, what the entry code set
 * up:
 *   - %gsr is restored from %l5.
 *   - FP state: %g5 holds (%fprs & FPRS_FEF) as sampled on entry.  If it
 *     is non-zero BLD_FP_FROMSTACK is used, otherwise FZERO followed by
 *     writing %g5 back to %fprs.  Both are macros defined outside this
 *     section; presumably they reload the saved FP registers and clear
 *     the FP registers respectively -- confirm against their definitions.
 *   - %l1 receives the flag bits of %o5 (%o5 & COPY_FLAGS).
 *   - t_preempt is decremented when t_lwp is zero, mirroring the entry
 *     code; if it reaches zero and the CPU's cpu_kprunrun byte is set,
 *     KPREEMPT_FLAG is recorded in %l1.
 *   - If LOFAULT_SET is in %l1 and the saved handler (the non-flag bits
 *     of %o5) is non-zero, it is written back to t_lofault.
 *   - kpreempt(%pil) is called when KPREEMPT_FLAG is set.
 * Returns 0 in the caller's %o0.
 *
 * NOTE(review): when LOFAULT_SET is clear the bz after label 1: goes
 * directly to 3:, so the KPREEMPT_FLAG test at 2: is not reached on that
 * path -- confirm this is intended.
 */
2021	wr	%l5, %g0, %gsr		! restore %gsr
2022	brnz	%g5, .bc_fp_restore
2023	and	%o5, COPY_FLAGS, %l1	! save flags in %l1
2024	FZERO
2025	wr	%g5, %g0, %fprs
2026	ba,pt	%ncc, .bc_ex2
2027	nop
2028.bc_fp_restore:
2029	BLD_FP_FROMSTACK(%o4)
2030.bc_ex2:
2031	ldn	[THREAD_REG + T_LWP], %o2
2032	brnz,pt	%o2, 1f
2033	nop
2034
2035	ldsb	[THREAD_REG + T_PREEMPT], %l0
2036	deccc	%l0
2037	bnz,pn	%ncc, 1f
2038	stb	%l0, [THREAD_REG + T_PREEMPT]
2039
2040	! Check for a kernel preemption request
2041	ldn	[THREAD_REG + T_CPU], %l0
2042	ldub	[%l0 + CPU_KPRUNRUN], %l0
/* annulled delay slot: the or executes only when the branch is taken */
2043	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
2044	or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
20451:
/*
 * The andncc in the delay slot strips the flag bits from %o5 and sets
 * the condition codes tested by the second bz (zero saved handler).
 */
2046	btst	LOFAULT_SET, %l1
2047	bz,pn	%icc, 3f
2048	andncc	%o5, COPY_FLAGS, %o5
2049	! Here via bcopy. Check to see if the handler was NULL.
2050	! If so, just return quietly. Otherwise, reset the
2051	! handler and return.
2052	bz,pn %ncc, 2f
2053	nop
2054	membar	#Sync
2055	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
20562:
2057	btst	KPREEMPT_FLAG, %l1
2058	bz,pt	%icc, 3f
2059	nop
2060	call	kpreempt
2061	rdpr	%pil, %o0		! pass %pil
20623:
2063	ret
2064	restore	%g0, 0, %o0
2065
2066	SET_SIZE(bcopy_more)
2067
2068
2069#else	/* NIAGARA_IMPL */
2070	save	%sp, -SA(MINFRAME), %sp
2071	clr	%o5			! flag LOFAULT_SET is not set for bcopy
2072.do_copy:
2073	cmp	%i2, 12			! for small counts
2074	blu	%ncc, .bytecp		! just copy bytes
2075	.empty
2076
2077	cmp	%i2, 128		! for less than 128 bytes
2078	blu,pn	%ncc, .bcb_punt		! no block st/quad ld
2079	nop
2080
2081	set	use_hw_bcopy, %o2
2082	ld	[%o2], %o2
2083	brz,pn	%o2, .bcb_punt
2084	nop
2085
2086	subcc	%i1, %i0, %i3
2087	bneg,a,pn %ncc, 1f
2088	neg	%i3
20891:
2090	/*
2091	 * Compare against 256 since we should be checking block addresses
2092	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
2093	 * src = dest + (64 * 3) + 63.
2094	 */
2095	cmp	%i3, 256
2096	blu,pn	%ncc, .bcb_punt
2097	nop
2098
2099	/*
2100	 * Copy that reach here have at least 2 blocks of data to copy.
2101	 */
2102.do_blockcopy:
2103	! Swap src/dst since the code below is memcpy code
2104	! and memcpy/bcopy have different calling sequences
2105	mov	%i1, %i5
2106	mov	%i0, %i1
2107	mov	%i5, %i0
2108
2109	! Block (64 bytes) align the destination.
2110	andcc	%i0, 0x3f, %i3		! is dst aligned on a 64 bytes
2111	bz	%xcc, .chksrc		! dst is already double aligned
2112	sub	%i3, 0x40, %i3
2113	neg	%i3			! bytes till dst 64 bytes aligned
2114	sub	%i2, %i3, %i2		! update i2 with new count
2115
2116	! Based on source and destination alignment do
2117	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
2118
2119	! Is dst & src 8B aligned
2120	or	%i0, %i1, %o2
2121	andcc	%o2, 0x7, %g0
2122	bz	%ncc, .alewdcp
2123	nop
2124
2125	! Is dst & src 4B aligned
2126	andcc	%o2, 0x3, %g0
2127	bz	%ncc, .alwdcp
2128	nop
2129
2130	! Is dst & src 2B aligned
2131	andcc	%o2, 0x1, %g0
2132	bz	%ncc, .alhlfwdcp
2133	nop
2134
2135	! 1B aligned
21361:	ldub	[%i1], %o2
2137	stb	%o2, [%i0]
2138	inc	%i1
2139	deccc	%i3
2140	bgu,pt	%ncc, 1b
2141	inc	%i0
2142
2143	ba	.chksrc
2144	nop
2145
2146	! dst & src 4B aligned
2147.alwdcp:
2148	ld	[%i1], %o2
2149	st	%o2, [%i0]
2150	add	%i1, 0x4, %i1
2151	subcc	%i3, 0x4, %i3
2152	bgu,pt	%ncc, .alwdcp
2153	add	%i0, 0x4, %i0
2154
2155	ba	.chksrc
2156	nop
2157
2158	! dst & src 2B aligned
2159.alhlfwdcp:
2160	lduh	[%i1], %o2
2161	stuh	%o2, [%i0]
2162	add	%i1, 0x2, %i1
2163	subcc	%i3, 0x2, %i3
2164	bgu,pt	%ncc, .alhlfwdcp
2165	add	%i0, 0x2, %i0
2166
2167	ba	.chksrc
2168	nop
2169
2170	! dst & src 8B aligned
2171.alewdcp:
2172	ldx	[%i1], %o2
2173	stx	%o2, [%i0]
2174	add	%i1, 0x8, %i1
2175	subcc	%i3, 0x8, %i3
2176	bgu,pt	%ncc, .alewdcp
2177	add	%i0, 0x8, %i0
2178
2179	! Now Destination is block (64 bytes) aligned
2180.chksrc:
2181	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
2182	sub	%i2, %i3, %i2		! Residue bytes in %i2
2183
2184	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2185
2186	andcc	%i1, 0xf, %o2		! is src quadword aligned
2187	bz,pn	%xcc, .blkcpy		! src offset in %o2
2188	nop
2189	cmp	%o2, 0x8
2190	bg	.cpy_upper_double
2191	nop
2192	bl	.cpy_lower_double
2193	nop
2194
2195	! Falls through when source offset is equal to 8 i.e.
2196	! source is double word aligned.
2197	! In this case no shift/merge of data is required
2198	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2199	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2200	prefetch [%l0+0x0], #one_read
2201	ldda	[%i1+0x0]%asi, %l2
2202loop0:
2203	ldda	[%i1+0x10]%asi, %l4
2204	prefetch [%l0+0x40], #one_read
2205
2206	stxa	%l3, [%i0+0x0]%asi
2207	stxa	%l4, [%i0+0x8]%asi
2208
2209	ldda	[%i1+0x20]%asi, %l2
2210	stxa	%l5, [%i0+0x10]%asi
2211	stxa	%l2, [%i0+0x18]%asi
2212
2213	ldda	[%i1+0x30]%asi, %l4
2214	stxa	%l3, [%i0+0x20]%asi
2215	stxa	%l4, [%i0+0x28]%asi
2216
2217	ldda	[%i1+0x40]%asi, %l2
2218	stxa	%l5, [%i0+0x30]%asi
2219	stxa	%l2, [%i0+0x38]%asi
2220
2221	add	%l0, 0x40, %l0
2222	add	%i1, 0x40, %i1
2223	subcc	%i3, 0x40, %i3
2224	bgu,pt	%xcc, loop0
2225	add	%i0, 0x40, %i0
2226	ba	.blkdone
2227	add	%i1, %o2, %i1		! increment the source by src offset
2228					! the src offset was stored in %o2
2229
2230.cpy_lower_double:
2231	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2232	sll	%o2, 3, %o0		! %o0 left shift
2233	mov	0x40, %o1
2234	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2235	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2236	prefetch [%l0+0x0], #one_read
2237	ldda	[%i1+0x0]%asi, %l2	! partial data in %l2 and %l3 has
2238					! complete data
2239loop1:
2240	ldda	[%i1+0x10]%asi, %l4	! %l4 has partial data for this read.
2241	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
2242							! into %l2 and %l3
2243	prefetch [%l0+0x40], #one_read
2244	stxa	%l2, [%i0+0x0]%asi
2245	stxa	%l3, [%i0+0x8]%asi
2246
2247	ldda	[%i1+0x20]%asi, %l2
2248	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
2249	stxa	%l4, [%i0+0x10]%asi			! %l4 from previous read
2250	stxa	%l5, [%i0+0x18]%asi			! into %l4 and %l5
2251
2252	! Repeat the same for next 32 bytes.
2253
2254	ldda	[%i1+0x30]%asi, %l4
2255	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
2256	stxa	%l2, [%i0+0x20]%asi
2257	stxa	%l3, [%i0+0x28]%asi
2258
2259	ldda	[%i1+0x40]%asi, %l2
2260	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
2261	stxa	%l4, [%i0+0x30]%asi
2262	stxa	%l5, [%i0+0x38]%asi
2263
2264	add	%l0, 0x40, %l0
2265	add	%i1, 0x40, %i1
2266	subcc	%i3, 0x40, %i3
2267	bgu,pt	%xcc, loop1
2268	add	%i0, 0x40, %i0
2269	ba	.blkdone
2270	add	%i1, %o2, %i1		! increment the source by src offset
2271					! the src offset was stored in %o2
2272
2273.cpy_upper_double:
2274	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2275	mov	0x8, %o0
2276	sub	%o2, %o0, %o0
2277	sll	%o0, 3, %o0		! %o0 left shift
2278	mov	0x40, %o1
2279	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2280	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2281	prefetch [%l0+0x0], #one_read
2282	ldda	[%i1+0x0]%asi, %l2	! partial data in %l3 for this read and
2283					! no data in %l2
2284loop2:
2285	ldda	[%i1+0x10]%asi, %l4	! %l4 has complete data and %l5 has
2286					! partial
2287	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
2288							! into %l3 and %l4
2289	prefetch [%l0+0x40], #one_read
2290	stxa	%l3, [%i0+0x0]%asi
2291	stxa	%l4, [%i0+0x8]%asi
2292
2293	ldda	[%i1+0x20]%asi, %l2
2294	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
2295	stxa	%l5, [%i0+0x10]%asi			! %l5 from previous read
2296	stxa	%l2, [%i0+0x18]%asi			! into %l5 and %l2
2297
2298	! Repeat the same for next 32 bytes.
2299
2300	ldda	[%i1+0x30]%asi, %l4
2301	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
2302	stxa	%l3, [%i0+0x20]%asi
2303	stxa	%l4, [%i0+0x28]%asi
2304
2305	ldda	[%i1+0x40]%asi, %l2
2306	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
2307	stxa	%l5, [%i0+0x30]%asi
2308	stxa	%l2, [%i0+0x38]%asi
2309
2310	add	%l0, 0x40, %l0
2311	add	%i1, 0x40, %i1
2312	subcc	%i3, 0x40, %i3
2313	bgu,pt	%xcc, loop2
2314	add	%i0, 0x40, %i0
2315	ba	.blkdone
2316	add	%i1, %o2, %i1		! increment the source by src offset
2317					! the src offset was stored in %o2
2318
2319
2320	! Both Source and Destination are block aligned.
2321	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2322.blkcpy:
2323	prefetch [%i1+0x0], #one_read
23241:
2325	ldda	[%i1+0x0]%asi, %l0
2326	ldda	[%i1+0x10]%asi, %l2
2327	prefetch [%i1+0x40], #one_read
2328
2329	stxa	%l0, [%i0+0x0]%asi
2330	ldda	[%i1+0x20]%asi, %l4
2331	ldda	[%i1+0x30]%asi, %l6
2332
2333	stxa	%l1, [%i0+0x8]%asi
2334	stxa	%l2, [%i0+0x10]%asi
2335	stxa	%l3, [%i0+0x18]%asi
2336	stxa	%l4, [%i0+0x20]%asi
2337	stxa	%l5, [%i0+0x28]%asi
2338	stxa	%l6, [%i0+0x30]%asi
2339	stxa	%l7, [%i0+0x38]%asi
2340
2341	add	%i1, 0x40, %i1
2342	subcc	%i3, 0x40, %i3
2343	bgu,pt	%xcc, 1b
2344	add	%i0, 0x40, %i0
2345
2346.blkdone:
2347	membar	#Sync
2348
2349	brz,pt	%i2, .blkexit
2350	nop
2351
2352	! Handle trailing bytes
2353	cmp	%i2, 0x8
2354	blu,pt	%ncc, .residue
2355	nop
2356
2357	! Can we do some 8B ops
2358	or	%i1, %i0, %o2
2359	andcc	%o2, 0x7, %g0
2360	bnz	%ncc, .last4
2361	nop
2362
2363	! Do 8byte ops as long as possible
2364.last8:
2365	ldx	[%i1], %o2
2366	stx	%o2, [%i0]
2367	add	%i1, 0x8, %i1
2368	sub	%i2, 0x8, %i2
2369	cmp	%i2, 0x8
2370	bgu,pt	%ncc, .last8
2371	add	%i0, 0x8, %i0
2372
2373	brz,pt	%i2, .blkexit
2374	nop
2375
2376	ba	.residue
2377	nop
2378
2379.last4:
2380	! Can we do 4B ops
2381	andcc	%o2, 0x3, %g0
2382	bnz	%ncc, .last2
2383	nop
23841:
2385	ld	[%i1], %o2
2386	st	%o2, [%i0]
2387	add	%i1, 0x4, %i1
2388	sub	%i2, 0x4, %i2
2389	cmp	%i2, 0x4
2390	bgu,pt	%ncc, 1b
2391	add	%i0, 0x4, %i0
2392
2393	brz,pt	%i2, .blkexit
2394	nop
2395
2396	ba	.residue
2397	nop
2398
2399.last2:
2400	! Can we do 2B ops
2401	andcc	%o2, 0x1, %g0
2402	bnz	%ncc, .residue
2403	nop
2404
24051:
2406	lduh	[%i1], %o2
2407	stuh	%o2, [%i0]
2408	add	%i1, 0x2, %i1
2409	sub	%i2, 0x2, %i2
2410	cmp	%i2, 0x2
2411	bgu,pt	%ncc, 1b
2412	add	%i0, 0x2, %i0
2413
2414	brz,pt	%i2, .blkexit
2415	nop
2416
2417.residue:
2418	ldub	[%i1], %o2
2419	stb	%o2, [%i0]
2420	inc	%i1
2421	deccc	%i2
2422	bgu,pt	%ncc, .residue
2423	inc	%i0
2424
2425.blkexit:
2426
2427	membar	#Sync				! sync error barrier
2428	! Restore t_lofault handler, if came here from kcopy().
2429	tst	%o5
2430	bz	%ncc, 1f
2431	andn	%o5, LOFAULT_SET, %o5
2432	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
24331:
2434	ret
2435	restore	%g0, 0, %o0
2436
2437
2438.bcb_punt:
2439	!
2440	! use aligned transfers where possible
2441	!
2442	xor	%i0, %i1, %o4		! xor from and to address
2443	btst	7, %o4			! if lower three bits zero
2444	bz	.aldoubcp		! can align on double boundary
2445	.empty	! assembler complaints about label
2446
2447	xor	%i0, %i1, %o4		! xor from and to address
2448	btst	3, %o4			! if lower two bits zero
2449	bz	.alwordcp		! can align on word boundary
2450	btst	3, %i0			! delay slot, from address unaligned?
2451	!
2452	! use aligned reads and writes where possible
2453	! this differs from wordcp in that it copes
2454	! with odd alignment between source and destination
2455	! using word reads and writes with the proper shifts
2456	! in between to align transfers to and from memory
2457	! i0 - src address, i1 - dest address, i2 - count
2458	! i3, i4 - tmps used for generating complete word
2459	! i5 (word to write)
2460	! l0 size in bits of upper part of source word (US)
2461	! l1 size in bits of lower part of source word (LS = 32 - US)
2462	! l2 size in bits of upper part of destination word (UD)
2463	! l3 size in bits of lower part of destination word (LD = 32 - UD)
2464	! l4 number of bytes leftover after aligned transfers complete
2465	! l5 the number 32
2466	!
2467	mov	32, %l5			! load an oft-needed constant
2468	bz	.align_dst_only
2469	btst	3, %i1			! is destnation address aligned?
2470	clr	%i4			! clear registers used in either case
2471	bz	.align_src_only
2472	clr	%l0
2473	!
2474	! both source and destination addresses are unaligned
2475	!
24761:					! align source
2477	ldub	[%i0], %i3		! read a byte from source address
2478	add	%i0, 1, %i0		! increment source address
2479	or	%i4, %i3, %i4		! or in with previous bytes (if any)
2480	btst	3, %i0			! is source aligned?
2481	add	%l0, 8, %l0		! increment size of upper source (US)
2482	bnz,a	1b
2483	sll	%i4, 8, %i4		! make room for next byte
2484
2485	sub	%l5, %l0, %l1		! generate shift left count (LS)
2486	sll	%i4, %l1, %i4		! prepare to get rest
2487	ld	[%i0], %i3		! read a word
2488	add	%i0, 4, %i0		! increment source address
2489	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
2490	or	%i4, %i5, %i5		! merge
2491	mov	24, %l3			! align destination
24921:
2493	srl	%i5, %l3, %i4		! prepare to write a single byte
2494	stb	%i4, [%i1]		! write a byte
2495	add	%i1, 1, %i1		! increment destination address
2496	sub	%i2, 1, %i2		! decrement count
2497	btst	3, %i1			! is destination aligned?
2498	bnz,a	1b
2499	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
2500	sub	%l5, %l3, %l2		! generate shift left count (UD)
2501	sll	%i5, %l2, %i5		! move leftover into upper bytes
2502	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
2503	bgu	%ncc, .more_needed	! need more to fill than we have
2504	nop
2505
2506	sll	%i3, %l1, %i3		! clear upper used byte(s)
2507	srl	%i3, %l1, %i3
2508	! get the odd bytes between alignments
2509	sub	%l0, %l2, %l0		! regenerate shift count
2510	sub	%l5, %l0, %l1		! generate new shift left count (LS)
2511	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
2512	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
2513	srl	%i3, %l0, %i4
2514	or	%i5, %i4, %i5
2515	st	%i5, [%i1]		! write a word
2516	subcc	%i2, 4, %i2		! decrement count
2517	bz	%ncc, .unalign_out
2518	add	%i1, 4, %i1		! increment destination address
2519
2520	b	2f
2521	sll	%i3, %l1, %i5		! get leftover into upper bits
2522.more_needed:
2523	sll	%i3, %l0, %i3		! save remaining byte(s)
2524	srl	%i3, %l0, %i3
2525	sub	%l2, %l0, %l1		! regenerate shift count
2526	sub	%l5, %l1, %l0		! generate new shift left count
2527	sll	%i3, %l1, %i4		! move to fill empty space
2528	b	3f
2529	or	%i5, %i4, %i5		! merge to complete word
2530	!
2531	! the source address is aligned and destination is not
2532	!
2533.align_dst_only:
2534	ld	[%i0], %i4		! read a word
2535	add	%i0, 4, %i0		! increment source address
2536	mov	24, %l0			! initial shift alignment count
25371:
2538	srl	%i4, %l0, %i3		! prepare to write a single byte
2539	stb	%i3, [%i1]		! write a byte
2540	add	%i1, 1, %i1		! increment destination address
2541	sub	%i2, 1, %i2		! decrement count
2542	btst	3, %i1			! is destination aligned?
2543	bnz,a	1b
2544	sub	%l0, 8, %l0		! delay slot, decrement shift count
2545.xfer:
2546	sub	%l5, %l0, %l1		! generate shift left count
2547	sll	%i4, %l1, %i5		! get leftover
25483:
2549	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
2550	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
25512:
2552	ld	[%i0], %i3		! read a source word
2553	add	%i0, 4, %i0		! increment source address
2554	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
2555	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
2556	st	%i5, [%i1]		! write a destination word
2557	subcc	%i2, 4, %i2		! decrement count
2558	bz	%ncc, .unalign_out	! check if done
2559	add	%i1, 4, %i1		! increment destination address
2560	b	2b			! loop
2561	sll	%i3, %l1, %i5		! get leftover
2562.unalign_out:
2563	tst	%l4			! any bytes leftover?
2564	bz	%ncc, .cpdone
2565	.empty				! allow next instruction in delay slot
25661:
2567	sub	%l0, 8, %l0		! decrement shift
2568	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
2569	stb	%i4, [%i1]		! write a byte
2570	subcc	%l4, 1, %l4		! decrement count
2571	bz	%ncc, .cpdone		! done?
2572	add	%i1, 1, %i1		! increment destination
2573	tst	%l0			! any more previously read bytes
2574	bnz	%ncc, 1b		! we have leftover bytes
2575	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
2576	b	.dbytecp		! let dbytecp do the rest
2577	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
2578	!
2579	! the destination address is aligned and the source is not
2580	!
2581.align_src_only:
2582	ldub	[%i0], %i3		! read a byte from source address
2583	add	%i0, 1, %i0		! increment source address
2584	or	%i4, %i3, %i4		! or in with previous bytes (if any)
2585	btst	3, %i0			! is source aligned?
2586	add	%l0, 8, %l0		! increment shift count (US)
2587	bnz,a	.align_src_only
2588	sll	%i4, 8, %i4		! make room for next byte
2589	b,a	.xfer
2590	!
2591	! if from address unaligned for double-word moves,
2592	! move bytes till it is, if count is < 56 it could take
2593	! longer to align the thing than to do the transfer
2594	! in word size chunks right away
2595	!
2596.aldoubcp:
2597	cmp	%i2, 56			! if count < 56, use wordcp, it takes
2598	blu,a	%ncc, .alwordcp		! longer to align doubles than words
2599	mov	3, %o0			! mask for word alignment
2600	call	.alignit		! copy bytes until aligned
2601	mov	7, %o0			! mask for double alignment
2602	!
2603	! source and destination are now double-word aligned
2604	! i3 has aligned count returned by alignit
2605	!
2606	and	%i2, 7, %i2		! unaligned leftover count
2607	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
26085:
2609	ldx	[%i0+%i1], %o4		! read from address
2610	stx	%o4, [%i1]		! write at destination address
2611	subcc	%i3, 8, %i3		! dec count
2612	bgu	%ncc, 5b
2613	add	%i1, 8, %i1		! delay slot, inc to address
2614	cmp	%i2, 4			! see if we can copy a word
2615	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
2616	.empty
2617	!
2618	! for leftover bytes we fall into wordcp, if needed
2619	!
2620.wordcp:
2621	and	%i2, 3, %i2		! unaligned leftover count
26225:
2623	ld	[%i0+%i1], %o4		! read from address
2624	st	%o4, [%i1]		! write at destination address
2625	subcc	%i3, 4, %i3		! dec count
2626	bgu	%ncc, 5b
2627	add	%i1, 4, %i1		! delay slot, inc to address
2628	b,a	.dbytecp
2629
2630	! we come here to align copies on word boundaries
2631.alwordcp:
2632	call	.alignit		! go word-align it
2633	mov	3, %o0			! bits that must be zero to be aligned
2634	b	.wordcp
2635	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
2636
2637	!
2638	! byte copy, works with any alignment
2639	!
2640.bytecp:
2641	b	.dbytecp
2642	sub	%i0, %i1, %i0		! i0 gets difference of src and dst
2643
2644	!
2645	! differenced byte copy, works with any alignment
2646	! assumes dest in %i1 and (source - dest) in %i0
2647	!
26481:
2649	stb	%o4, [%i1]		! write to address
2650	inc	%i1			! inc to address
2651.dbytecp:
2652	deccc	%i2			! dec count
2653	bgeu,a	%ncc, 1b		! loop till done
2654	ldub	[%i0+%i1], %o4		! read from address
2655.cpdone:
2656
2657	membar	#Sync				! sync error barrier
2658	! Restore t_lofault handler, if came here from kcopy().
2659	tst	%o5
2660	bz	%ncc, 1f
2661	andn	%o5, LOFAULT_SET, %o5
2662	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
26631:
2664	ret
2665	restore %g0, 0, %o0		! return (0)
2666
2667/*
2668 * Common code used to align transfers on word and doubleword
2669 * boundaries.  Aligns source and destination and returns a count
2670 * of aligned bytes to transfer in %i3
 *
 * Entered with "call .alignit" from inside bcopy's register window
 * (the return is a retl, i.e. through %o7).
 *
 * In:	%o0 - alignment mask (the visible callers pass 3 or 7)
 *	%i0 - source address
 *	%i1 - destination address
 *	%i2 - byte count
 * Out:	%i0, %i1 advanced and %i2 reduced by the number of bytes copied
 *	%i3 - %i2 with the mask bits cleared
 * Scratch: %o4
 *
 * Only the source address is tested against the mask.  The visible
 * callers branch here after checking that the corresponding low bits
 * of (%i0 ^ %i1) are zero, so the destination reaches the same
 * alignment at the same time.
 *
 * Note that the entry point is the .alignit label, not the top of the
 * loop body: the byte to store is loaded in the annulled delay slot of
 * the bnz,a below, and only then does control reach label 1.
2671 */
26721:
2673	inc	%i0			! inc from
2674	stb	%o4, [%i1]		! write a byte
2675	inc	%i1			! inc to
2676	dec	%i2			! dec count
2677.alignit:
2678	btst	%o0, %i0		! %o0 is bit mask to check for alignment
2679	bnz,a	1b
2680	ldub	[%i0], %o4		! read next byte
2681
	/*
	 * Source is aligned.  The andn in the retl delay slot produces
	 * the return value in %i3.
	 */
2682	retl
2683	andn	%i2, %o0, %i3		! return size of aligned bytes
2684
2685	SET_SIZE(bcopy)
2686
2687#endif	/* NIAGARA_IMPL */
2688
2689#endif	/* lint */
2690
2691/*
2692 * Block copy with possibly overlapped operands.
 *
 * void ovbcopy(const void *from, void *to, size_t count)
 *
 * Register use (no register window is acquired by this routine):
 *	%o0 - from address
 *	%o1 - to address
 *	%o2 - byte count
 *	%o3 - scratch: abs(from - to), then the byte being moved
 *
 * A zero count returns immediately.  When count <= abs(from - to)
 * the two regions cannot overlap, and the routine branches (it does
 * not call) to bcopy with the arguments still in %o0-%o2.  Otherwise
 * the copy is done one byte at a time: in ascending address order
 * when from >= to, and in descending address order when from < to.
2693 */
2694
2695#if defined(lint)
2696
2697/*ARGSUSED*/
2698void
2699ovbcopy(const void *from, void *to, size_t count)
2700{}
2701
2702#else	/* lint */
2703
2704	ENTRY(ovbcopy)
	/*
	 * The branch below is taken when count is non-zero.  It is
	 * annulling, so the subcc in its delay slot (which sets the
	 * condition codes tested at label 1) runs only when it is taken.
	 * The fall-through path is the count == 0 early return.
	 */
2705	tst	%o2			! check count
2706	bgu,a	%ncc, 1f		! nothing to do or bad arguments
2707	subcc	%o0, %o1, %o3		! difference of from and to address
2708
2709	retl				! return
2710	nop
27111:
	/*
	 * %o3 = from - to.  If it is negative, negate it (annulled delay
	 * slot) so that %o3 ends up as the distance between the regions.
	 */
2712	bneg,a	%ncc, 2f
2713	neg	%o3			! if < 0, make it positive
	/*
	 * The "cmp %o0, %o1" that follows the bleu sits in its delay slot;
	 * it only sets condition codes, which matter solely on the
	 * fall-through (overlapping) path.
	 */
27142:	cmp	%o2, %o3		! cmp size and abs(from - to)
2715	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
2716	.empty				!   no overlap
2717	cmp	%o0, %o1		! compare from and to addresses
2718	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
2719	nop
2720	!
2721	! Copy forwards.
2722	!
	/*
	 * One byte per iteration; %o2 counts down and the loop repeats
	 * while it is still non-zero after the decrement.  The
	 * destination pointer is advanced in the branch delay slot.
	 */
2723.ov_fwd:
2724	ldub	[%o0], %o3		! read from address
2725	inc	%o0			! inc from address
2726	stb	%o3, [%o1]		! write to address
2727	deccc	%o2			! dec count
2728	bgu	%ncc, .ov_fwd		! loop till done
2729	inc	%o1			! inc to address
2730
2731	retl				! return
2732	nop
2733	!
2734	! Copy backwards.
2735	!
	/*
	 * %o2 is used as a descending byte offset into both buffers, so
	 * the last byte is moved first and offset 0 is moved last.  The
	 * store is in the (non-annulled) delay slot, so it also executes
	 * on the final pass when the branch is not taken.
	 */
2736.ov_bkwd:
2737	deccc	%o2			! dec count
2738	ldub	[%o0 + %o2], %o3	! get byte at end of src
2739	bgu	%ncc, .ov_bkwd		! loop till done
2740	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
2741
2742	retl				! return
2743	nop
2744	SET_SIZE(ovbcopy)
2745
2746#endif	/* lint */
2747
2748/*
2749 * hwblkpagecopy()
2750 *
2751 * Copies exactly one page.  This routine assumes the caller (ppcopy)
2752 * has already disabled kernel preemption and has checked
2753 * use_hw_bcopy.
 *
 * void hwblkpagecopy(const void *src, void *dst)
 *
 * A new register window is taken; %l0-%l7 carry the data.  Every
 * load and store goes through %asi, which is set to
 * ASI_BLK_INIT_ST_QUAD_LDD_P.  The loop moves 0x80 bytes per
 * iteration and a membar #Sync is issued before returning.  No
 * alignment checks are made on either address here.
2754 */
2755#ifdef lint
2756/*ARGSUSED*/
2757void
2758hwblkpagecopy(const void *src, void *dst)
2759{ }
2760#else /* lint */
2761	ENTRY(hwblkpagecopy)
2762	save	%sp, -SA(MINFRAME), %sp
2763
2764	! %i0 - source address (arg)
2765	! %i1 - destination address (arg)
2766	! %i2 - length of region (not arg)
2767
2768	set	PAGESIZE, %i2
2769
2770	/*
2771	 * Copying exactly one page, and PAGESIZE is a multiple of 0x80.
2772	 */
2773	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
	/*
	 * Prefetch the two 0x40-byte blocks consumed by the first loop
	 * iteration; each iteration then prefetches the two blocks that
	 * the following iteration will load.
	 */
2774	prefetch [%i0+0x0], #one_read
2775	prefetch [%i0+0x40], #one_read
27761:
2777	prefetch [%i0+0x80], #one_read
2778	prefetch [%i0+0xc0], #one_read
	/*
	 * First 0x40 bytes.  Each ldda names the even register of a
	 * pair and is 0x10 bytes apart from the next; both registers of
	 * every pair (%l0/%l1 ... %l6/%l7) are stored below.
	 */
2779	ldda	[%i0+0x0]%asi, %l0
2780	ldda	[%i0+0x10]%asi, %l2
2781	ldda	[%i0+0x20]%asi, %l4
2782	ldda	[%i0+0x30]%asi, %l6
2783	stxa	%l0, [%i1+0x0]%asi
2784	stxa	%l1, [%i1+0x8]%asi
2785	stxa	%l2, [%i1+0x10]%asi
2786	stxa	%l3, [%i1+0x18]%asi
2787	stxa	%l4, [%i1+0x20]%asi
2788	stxa	%l5, [%i1+0x28]%asi
2789	stxa	%l6, [%i1+0x30]%asi
2790	stxa	%l7, [%i1+0x38]%asi
	/*
	 * Second 0x40 bytes, same pattern at offsets 0x40-0x78.
	 */
2791	ldda	[%i0+0x40]%asi, %l0
2792	ldda	[%i0+0x50]%asi, %l2
2793	ldda	[%i0+0x60]%asi, %l4
2794	ldda	[%i0+0x70]%asi, %l6
2795	stxa	%l0, [%i1+0x40]%asi
2796	stxa	%l1, [%i1+0x48]%asi
2797	stxa	%l2, [%i1+0x50]%asi
2798	stxa	%l3, [%i1+0x58]%asi
2799	stxa	%l4, [%i1+0x60]%asi
2800	stxa	%l5, [%i1+0x68]%asi
2801	stxa	%l6, [%i1+0x70]%asi
2802	stxa	%l7, [%i1+0x78]%asi
2803
	/*
	 * Advance both pointers by 0x80 (the destination in the branch
	 * delay slot) and loop while bytes remain in %i2.
	 */
2804	add	%i0, 0x80, %i0
2805	subcc	%i2, 0x80, %i2
2806	bgu,pt	%xcc, 1b
2807	add	%i1, 0x80, %i1
2808
	/*
	 * The restore in the ret delay slot pops the window and leaves 0
	 * in the caller's %o0, although the lint prototype above declares
	 * the routine void.
	 */
2809	membar #Sync
2810	ret
2811	restore	%g0, 0, %o0
2812	SET_SIZE(hwblkpagecopy)
2813#endif	/* lint */
2814
2815
2816/*
2817 * Transfer data to and from user space -
2818 * Note that these routines can cause faults
2819 * It is assumed that the kernel has nothing at
2820 * less than KERNELBASE in the virtual address space.
2821 *
2822 * Note that copyin(9F) and copyout(9F) are part of the
2823 * DDI/DKI which specifies that they return '-1' on "errors."
2824 *
2825 * Sigh.
2826 *
2827 * So there's two extremely similar routines - xcopyin() and xcopyout()
2828 * which return the errno that we've faithfully computed.  This
2829 * allows other callers (e.g. uiomove(9F)) to work correctly.
2830 * Given that these are used pretty heavily, we expand the calling
2831 * sequences inline for all flavours (rather than making wrappers).
2832 *
2833 * There are also stub routines for xcopyout_little and xcopyin_little,
2834 * which currently are intended to handle requests of <= 16 bytes from
2835 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
2836 * is left as an exercise...
2837 */
2838
2839/*
2840 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
2841 *
2842 * General theory of operation:
2843 *
2844 * None of the copyops routines grab a window until it's decided that
2845 * we need to do a HW block copy operation. This saves a window
2846 * spill/fill when we're called during socket ops. The typical IO
2847 * path won't cause spill/fill traps.
2848 *
2849 * This code uses a set of 4 limits for the maximum size that will
2850 * be copied given a particular input/output address alignment.
2851 * the default limits are:
2852 *
2853 * single byte aligned - 256 (hw_copy_limit_1)
2854 * two byte aligned - 512 (hw_copy_limit_2)
2855 * four byte aligned - 1024 (hw_copy_limit_4)
2856 * eight byte aligned - 1024 (hw_copy_limit_8)
2857 *
2858 * If the value for a particular limit is zero, the copy will be done
2859 * via the copy loops rather than block store/quad load instructions.
2860 *
2861 * Flow:
2862 *
2863 * If count == zero return zero.
2864 *
2865 * Store the previous lo_fault handler into %g6.
2866 * Place our secondary lofault handler into %g5.
2867 * Place the address of our nowindow fault handler into %o3.
2868 * Place the address of the windowed fault handler into %o4.
2869 * --> We'll use this handler if we end up grabbing a window
2870 * --> before we use block initializing store and quad load ASIs
2871 *
2872 * If count is less than or equal to SMALL_LIMIT (7) we
2873 * always do a byte for byte copy.
2874 *
2875 * If count is > SMALL_LIMIT, we check the alignment of the input
2876 * and output pointers. Based on the alignment we check count
2877 * against a limit based on detected alignment.  If we exceed the
2878 * alignment value we copy via block initializing store and quad
2879 * load instructions.
2880 *
2881 * If we don't exceed one of the limits, we store -count in %o3,
2882 * we store the number of chunks (8, 4, 2 or 1 byte) operated
2883 * on in our basic copy loop in %o2. Following this we branch
2884 * to the appropriate copy loop and copy that many chunks.
2885 * Since we've been adding the chunk size to %o3 each time through
2886 * as well as decrementing %o2, we can tell if any data is
2887 * left to be copied by examining %o3. If that is zero, we're
2888 * done and can go home. If not, we figure out what the largest
2889 * chunk size left to be copied is and branch to that copy loop
2890 * unless there's only one byte left. We load that as we're
2891 * branching to code that stores it just before we return.
2892 *
2893 * Fault handlers are invoked if we reference memory that has no
2894 * current mapping.  All forms share the same copyio_fault handler.
2895 * This routine handles fixing up the stack and general housecleaning.
2896 * Each copy operation has a simple fault handler that is then called
2897 * to do the work specific to the individual operation.  The handlers
2898 * for copyOP and xcopyOP are found at the end of each individual function.
2899 * The handlers for xcopyOP_little are found at the end of xcopyin_little.
2900 * The handlers for copyOP_noerr are found at the end of copyin_noerr.
2901 */
2902
2903/*
2904 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
2905 */
2906
2907#if defined(lint)
2908
2909/*ARGSUSED*/
2910int
2911copyout(const void *kaddr, void *uaddr, size_t count)
2912{ return (0); }
2913
2914#else	/* lint */
2915
2916/*
2917 * We save the arguments in the following registers in case of a fault:
2918 * 	kaddr - %g2
2919 * 	uaddr - %g3
2920 * 	count - %g4
2921 */
2922#define	SAVE_SRC	%g2
2923#define	SAVE_DST	%g3
2924#define	SAVE_COUNT	%g4
2925
2926#define	REAL_LOFAULT		%g5
2927#define	SAVED_LOFAULT		%g6
2928
2929/*
2930 * Generic copyio fault handler.  This is the first line of defense when a
2931 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
2932 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
2933 * This allows us to share common code for all the flavors of the copy
2934 * operations, including the _noerr versions.
2935 *
2936 * Note that this function will restore the original input parameters before
2937 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
2938 * member of the t_copyop structure, if needed.
 *
 * Inputs, all in global registers (see the #defines above):
 *	SAVE_SRC, SAVE_DST, SAVE_COUNT - the original arguments
 *	REAL_LOFAULT  - address jumped to once the state is unwound
 *	SAVED_LOFAULT - previous t_lofault value; in the !NIAGARA_IMPL
 *			build it also carries FPUSED_FLAG
 *
 * Both variants execute a "restore", so this entry point is for the
 * case where the copy routine is running in its own register window.
 * The arguments are reloaded into %o0-%o2 after the restore, i.e. into
 * the out registers of the window being returned to.
2939 */
2940	ENTRY(copyio_fault)
2941#if !defined(NIAGARA_IMPL)
	/*
	 * If FPUSED_FLAG is not set in SAVED_LOFAULT, skip the FP state
	 * handling.  The andn in the delay slot is not annulled, so the
	 * flag is cleared from SAVED_LOFAULT on both paths.
	 */
2942	btst	FPUSED_FLAG, SAVED_LOFAULT
2943	bz	1f
2944	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2945
2946	wr	%l5, 0, %gsr		! restore gsr
2947
	/*
	 * %g1 is treated as the saved %fprs value (it is what gets
	 * written back to %fprs at label 4).  If FPRS_FEF is set in it,
	 * reload the FP registers from the stack; otherwise zero them.
	 */
2948	btst	FPRS_FEF, %g1
2949	bz	%icc, 4f
2950	nop
2951
2952	! restore fpregs from stack
2953	BLD_FP_FROMSTACK(%o2)
2954
	/*
	 * NOTE(review): this path branches past the "wr ... %fprs" at
	 * label 4, so %fprs is written back only on the FZERO path.
	 * Confirm that this is intended.
	 */
2955	ba,pt	%ncc, 1f
2956	nop
29574:
2958	FZERO				! zero all of the fpregs
2959	wr	%g1, %g0, %fprs		! restore fprs
29601:
	/*
	 * Pop the register window, reload the original arguments and
	 * jump to the real handler; the last mov fills the jmp delay
	 * slot.  This variant does not store to T_LOFAULT.
	 */
2961	restore
2962	mov	SAVE_SRC, %o0
2963	mov	SAVE_DST, %o1
2964	jmp	REAL_LOFAULT
2965	mov	SAVE_COUNT, %o2
2966
2967#else	/* NIAGARA_IMPL */
	/*
	 * Reinstall the previous t_lofault value, pop the register
	 * window, reload the original arguments and jump to the real
	 * handler; the last mov fills the jmp delay slot.
	 */
2968	membar	#Sync
2969	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2970	restore
2971	mov	SAVE_SRC, %o0
2972	mov	SAVE_DST, %o1
2973	jmp	REAL_LOFAULT
2974	mov	SAVE_COUNT, %o2
2975
2976#endif	/* NIAGARA_IMPL */
2977
2978	SET_SIZE(copyio_fault)
2979
/*
 * copyio_fault_nowindow
 *
 * Fault handler counterpart of copyio_fault that does not execute a
 * "restore", i.e. for copy paths that have not taken a register window.
 * It reinstalls SAVED_LOFAULT as the thread's t_lofault, reloads the
 * original arguments from SAVE_SRC/SAVE_DST/SAVE_COUNT into %o0-%o2 and
 * jumps to REAL_LOFAULT (the last mov fills the jmp delay slot).
 */
2980	ENTRY(copyio_fault_nowindow)
2981	membar	#Sync
2982	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2983
2984	mov	SAVE_SRC, %o0
2985	mov	SAVE_DST, %o1
2986	jmp	REAL_LOFAULT
2987	mov	SAVE_COUNT, %o2
2988	SET_SIZE(copyio_fault_nowindow)
2989
2990	ENTRY(copyout)
2991	sethi	%hi(.copyout_err), REAL_LOFAULT
2992	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
2993
2994#if !defined(NIAGARA_IMPL)
2995.do_copyout:
2996	tst	%o2			! check for zero count;  quick exit
2997	bz,pt	%ncc, .co_smallqx
2998	mov	%o0, SAVE_SRC
2999	mov	%o1, SAVE_DST
3000	mov	%o2, SAVE_COUNT
3001	cmp	%o2, FP_COPY		! check for small copy/leaf case
3002	bgt,pt	%ncc, .co_copy_more
3003	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
3004/*
3005 * Small copy out code
3006 *
3007 */
3008	sethi	%hi(copyio_fault_nowindow), %o3
3009	or	%o3, %lo(copyio_fault_nowindow), %o3
3010	membar	#Sync
3011	stn	%o3, [THREAD_REG + T_LOFAULT]
3012
3013	mov	ASI_USER, %asi
3014	cmp	%o2, SHORTCOPY		! make sure there is enough to align
3015	ble,pt	%ncc, .co_smallest
3016	andcc	%o1, 0x7, %o3		! is dest long word aligned
3017	bnz,pn	%ncc, .co_align
3018	andcc	%o1, 1, %o3		! is dest byte aligned
3019
3020! Destination is long word aligned
3021! 8 cases for src alignment; load parts, store long words
3022.co_al_src:
3023	andcc	%o0, 7, %o3
3024	brnz,pt	%o3, .co_src_dst_unal8
3025	nop
3026/*
3027 * Special case for handling when src and dest are both long word aligned
3028 * and total data to move is less than FP_COPY bytes
3029 * Also handles finish up for large block moves, so may be less than 32 bytes
3030 */
3031.co_medlong:
3032	subcc	%o2, 31, %o2		! adjust length to allow cc test
3033	ble,pt	%ncc, .co_medl31
3034	nop
3035.co_medl32:
3036	ldx	[%o0], %o4		! move 32 bytes
3037	subcc	%o2, 32, %o2		! decrement length count by 32
3038	stxa	%o4, [%o1]%asi
3039	ldx	[%o0+8], %o4
3040	stxa	%o4, [%o1+8]%asi
3041	ldx	[%o0+16], %o4
3042	add	%o0, 32, %o0		! increase src ptr by 32
3043	stxa	%o4, [%o1+16]%asi
3044	ldx	[%o0-8], %o4
3045	add	%o1, 32, %o1		! increase dst ptr by 32
3046	bgu,pt	%ncc, .co_medl32	! repeat if at least 32 bytes left
3047	stxa	%o4, [%o1-8]%asi
3048.co_medl31:
3049	addcc	%o2, 24, %o2		! adjust count to be off by 7
3050	ble,pt	%ncc, .co_medl7		! skip if 7 or fewer bytes left
3051	nop
3052.co_medl8:
3053	ldx	[%o0], %o4		! move 8 bytes
3054	add	%o0, 8, %o0		! increase src ptr by 8
3055	subcc	%o2, 8, %o2		! decrease count by 8
3056	add	%o1, 8, %o1		! increase dst ptr by 8
3057	bgu,pt	%ncc, .co_medl8
3058	stxa	%o4, [%o1-8]%asi
3059.co_medl7:
3060	addcc	%o2, 7, %o2		! finish adjustment of remaining count
3061	bnz,pt	%ncc, .co_small4	! do final bytes if not finished
3062
3063.co_smallx:				! finish up and exit
3064	membar	#Sync
3065	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3066.co_smallqx:
3067	retl
3068	mov	%g0, %o0
3069
3070.co_small4:
3071	cmp	%o2, 4
3072	blt,pt	%ncc, .co_small3x	! skip if less than 4 bytes left
3073	nop				!
3074	ld	[%o0], %o4		! move 4 bytes
3075	add	%o0, 4, %o0		! increase src ptr by 4
3076	add	%o1, 4, %o1		! increase dst ptr by 4
3077	subcc	%o2, 4, %o2		! decrease count by 4
3078	bz,pt	%ncc, .co_smallx
3079	stwa	%o4, [%o1-4]%asi
3080
3081.co_small3x:				! Exactly 1, 2, or 3 bytes remain
3082	subcc	%o2, 1, %o2		! reduce count for cc test
3083	ldub	[%o0], %o4		! load one byte
3084	bz,pt	%ncc, .co_smallx
3085	stba	%o4, [%o1]%asi		! store one byte
3086	ldub	[%o0+1], %o4		! load second byte
3087	subcc	%o2, 1, %o2
3088	bz,pt	%ncc, .co_smallx
3089	stba	%o4, [%o1+1]%asi	! store second byte
3090	ldub	[%o0+2], %o4		! load third byte
3091	ba	.co_smallx
3092	stba	%o4, [%o1+2]%asi	! store third byte
3093
3094.co_smallest:				! 7 or fewer bytes remain
3095	cmp	%o2, 4
3096	blt,pt	%ncc, .co_small3x
3097	nop
3098	ldub	[%o0], %o4		! read byte
3099	subcc	%o2, 4, %o2		! reduce count by 4
3100	stba	%o4, [%o1]%asi		! write byte
3101	ldub	[%o0+1], %o4		! repeat for total of 4 bytes
3102	add	%o0, 4, %o0		! advance src by 4
3103	stba	%o4, [%o1+1]%asi
3104	ldub	[%o0-2], %o4
3105	add	%o1, 4, %o1		! advance dst by 4
3106	stba	%o4, [%o1-2]%asi
3107	ldub	[%o0-1], %o4
3108	bnz,pt	%ncc, .co_small3x
3109	stba	%o4, [%o1-1]%asi
3110	membar	#Sync
3111	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3112	retl
3113	mov	%g0, %o0
3114
3115.co_align:				! byte align test in prior branch delay
3116	bnz,pt	%ncc, .co_al_d1
3117.co_al_d1f:				! dest is now half word aligned
3118	andcc	%o1, 2, %o3
3119	bnz,pt	%ncc, .co_al_d2
3120.co_al_d2f:				! dest is now word aligned
3121	andcc	%o1, 4, %o3		! is dest longword aligned?
3122	bz,pt	%ncc, .co_al_src
3123	nop
3124.co_al_d4:				! dest is word aligned;  src is unknown
3125	ldub	[%o0], %o4		! move a word (src align unknown)
3126	ldub	[%o0+1], %o3
3127	sll	%o4, 24, %o4		! position
3128	sll	%o3, 16, %o3		! position
3129	or	%o4, %o3, %o3		! merge
3130	ldub	[%o0+2], %o4
3131	sll	%o4, 8, %o4		! position
3132	or	%o4, %o3, %o3		! merge
3133	ldub	[%o0+3], %o4
3134	or	%o4, %o3, %o4		! merge
3135	stwa	%o4,[%o1]%asi		! store four bytes
3136	add	%o0, 4, %o0		! adjust src by 4
3137	add	%o1, 4, %o1		! adjust dest by 4
3138	sub	%o2, 4, %o2		! adjust count by 4
3139	andcc	%o0, 7, %o3		! check for src long word alignment
3140	brz,pt	%o3, .co_medlong
3141.co_src_dst_unal8:
3142	! dst is 8-byte aligned, src is not
3143	! Size is less than FP_COPY
3144	! Following code is to select for alignment
3145	andcc	%o0, 0x3, %o3		! test word alignment
3146	bz,pt	%ncc, .co_medword
3147	nop
3148	andcc	%o0, 0x1, %o3		! test halfword alignment
3149	bnz,pt	%ncc, .co_med_byte	! go to byte move if not halfword
3150	andcc	%o0, 0x2, %o3		! test which byte alignment
3151	ba	.co_medhalf
3152	nop
3153.co_al_d1:				! align dest to half word
3154	ldub	[%o0], %o4		! move a byte
3155	add	%o0, 1, %o0
3156	stba	%o4, [%o1]%asi
3157	add	%o1, 1, %o1
3158	andcc	%o1, 2, %o3
3159	bz,pt	%ncc, .co_al_d2f
3160	sub	%o2, 1, %o2
3161.co_al_d2:				! align dest to word
3162	ldub	[%o0], %o4		! move a half-word (src align unknown)
3163	ldub	[%o0+1], %o3
3164	sll	%o4, 8, %o4		! position
3165	or	%o4, %o3, %o4		! merge
3166	stha	%o4, [%o1]%asi
3167	add	%o0, 2, %o0
3168	add	%o1, 2, %o1
3169	andcc	%o1, 4, %o3		! is dest longword aligned?
3170	bz,pt	%ncc, .co_al_src
3171	sub	%o2, 2, %o2
3172	ba	.co_al_d4
3173	nop
3174/*
3175 * Handle all cases where src and dest are aligned on word
3176 * boundaries. Use unrolled loops for better performance.
3177 * This option wins over standard large data move when
3178 * source and destination is in cache for medium
3179 * to short data moves.
3180 */
/*
 * .co_medword: copy loop for a 4-byte aligned source.  Words are read
 * with ld and written with stwa through %asi.
 *
 * The count in %o2 is kept biased so that each loop can branch directly
 * on the condition codes of its own decrement:
 *   -31 on entry      : a positive value means at least 32 bytes remain,
 *                       which gates the unrolled loop .co_medw32;
 *   +24 at .co_medw31 : net bias -7, a positive value means at least 8
 *                       bytes remain, which gates .co_medw15;
 *   +7  at .co_medw7  : the true remaining count is restored.
 *
 * In both loops the pointers are advanced before the last load/store of
 * the pass, which is why those accesses use a -4 offset.
 *
 * .co_medw7 copies one more word when at least 4 bytes remain and hands
 * any final 1..3 bytes to .co_small3x.  When nothing is left it writes
 * SAVED_LOFAULT back to the thread's T_LOFAULT slot and returns 0.
 */
3181.co_medword:
3182	subcc	%o2, 31, %o2		! adjust length to allow cc test
3183	ble,pt	%ncc, .co_medw31
3184	nop
3185.co_medw32:
3186	ld	[%o0], %o4		! move a block of 32 bytes
3187	stwa	%o4, [%o1]%asi
3188	ld	[%o0+4], %o4
3189	stwa	%o4, [%o1+4]%asi
3190	ld	[%o0+8], %o4
3191	stwa	%o4, [%o1+8]%asi
3192	ld	[%o0+12], %o4
3193	stwa	%o4, [%o1+12]%asi
3194	ld	[%o0+16], %o4
3195	stwa	%o4, [%o1+16]%asi
3196	ld	[%o0+20], %o4
3197	subcc	%o2, 32, %o2		! decrement length count
3198	stwa	%o4, [%o1+20]%asi
3199	ld	[%o0+24], %o4
3200	add	%o0, 32, %o0		! increase src ptr by 32
3201	stwa	%o4, [%o1+24]%asi
3202	ld	[%o0-4], %o4
3203	add	%o1, 32, %o1		! increase dst ptr by 32
3204	bgu,pt	%ncc, .co_medw32	! repeat if at least 32 bytes left
3205	stwa	%o4, [%o1-4]%asi
3206.co_medw31:
3207	addcc	%o2, 24, %o2		! adjust count to be off by 7
3208	ble,pt	%ncc, .co_medw7		! skip if 7 or fewer bytes left
3209	nop				!
3210.co_medw15:
3211	ld	[%o0], %o4		! move a block of 8 bytes
3212	subcc	%o2, 8, %o2		! decrement length count
3213	stwa	%o4, [%o1]%asi
3214	add	%o0, 8, %o0		! increase src ptr by 8
3215	ld	[%o0-4], %o4
3216	add	%o1, 8, %o1		! increase dst ptr by 8
3217	bgu,pt	%ncc, .co_medw15
3218	stwa	%o4, [%o1-4]%asi
3219.co_medw7:
3220	addcc	%o2, 7, %o2		! finish adjustment of remaining count
3221	bz,pt	%ncc, .co_smallx	! exit if finished
3222	cmp	%o2, 4
3223	blt,pt	%ncc, .co_small3x	! skip if less than 4 bytes left
3224	nop				!
3225	ld	[%o0], %o4		! move 4 bytes
3226	add	%o0, 4, %o0		! increase src ptr by 4
3227	add	%o1, 4, %o1		! increase dst ptr by 4
3228	subcc	%o2, 4, %o2		! decrease count by 4
3229	bnz	.co_small3x
3230	stwa	%o4, [%o1-4]%asi
3231	membar	#Sync
3232	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3233	retl
3234	mov	%g0, %o0
3235
/*
 * .co_medhalf: copy loop used by .co_src_dst_unal8 when the source is
 * 2-byte aligned but not 4-byte aligned.  Each 8-byte destination
 * doubleword is assembled from a halfword, a word and a halfword load at
 * offsets +0, +2 and +6:
 *	(half << 48) | (word << 16) | half
 * and written with a single stxa through %asi.
 *
 * %o2 is biased by -31 on entry so that bgu on the subcc result means
 * "at least 32 more bytes".  .co_medh32 moves 32 bytes per pass; the
 * pointers are advanced before the fourth group, which is why that group
 * uses negative offsets.
 */
3236.co_medhalf:
3237	subcc	%o2, 31, %o2		! adjust length to allow cc test
3238	ble,pt	%ncc, .co_medh31
3239	nop
3240.co_medh32:				! load and store block of 32 bytes
3241
3242	lduh	[%o0], %o4		! move 32 bytes
3243	subcc	%o2, 32, %o2		! decrement length count
3244	lduw	[%o0+2], %o3
3245	sllx	%o4, 48, %o4
3246	sllx	%o3, 16, %o3
3247	or	%o4, %o3, %o3
3248	lduh	[%o0+6], %o4
3249	or	%o4, %o3, %o4
3250	stxa	%o4, [%o1]%asi
3251
3252	lduh	[%o0+8], %o4
3253	lduw	[%o0+10], %o3
3254	sllx	%o4, 48, %o4
3255	sllx	%o3, 16, %o3
3256	or	%o4, %o3, %o3
3257	lduh	[%o0+14], %o4
3258	or	%o4, %o3, %o4
3259	stxa	%o4, [%o1+8]%asi
3260
3261	lduh	[%o0+16], %o4
3262	lduw	[%o0+18], %o3
3263	sllx	%o4, 48, %o4
3264	sllx	%o3, 16, %o3
3265	or	%o4, %o3, %o3
3266	lduh	[%o0+22], %o4
3267	or	%o4, %o3, %o4
3268	stxa	%o4, [%o1+16]%asi
3269
3270	add	%o0, 32, %o0		! increase src ptr by 32
3271	add	%o1, 32, %o1		! increase dst ptr by 32
3272
3273	lduh	[%o0-8], %o4
3274	lduw	[%o0-6], %o3
3275	sllx	%o4, 48, %o4
3276	sllx	%o3, 16, %o3
3277	or	%o4, %o3, %o3
3278	lduh	[%o0-2], %o4
3279	or	%o3, %o4, %o4
3280	bgu,pt	%ncc, .co_medh32	! repeat if at least 32 bytes left
3281	stxa	%o4, [%o1-8]%asi
3282
/*
 * .co_medh31: tail of the halfword-aligned-source copy.  Re-biases %o2
 * to (remaining - 7) and moves 8 bytes per pass in .co_medh15, using the
 * same half/word/half assembly as the 32-byte loop.  .co_medh7 then
 * restores the true count, copies one more word (built from two halfword
 * loads) when at least 4 bytes remain, and leaves any final 1..3 bytes
 * to .co_small3x.  With nothing left it writes SAVED_LOFAULT back to the
 * thread's T_LOFAULT slot and returns 0.
 */
3283.co_medh31:
3284	addcc	%o2, 24, %o2		! adjust count to be off by 7
3285	ble,pt	%ncc, .co_medh7		! skip if 7 or fewer bytes left
3286	nop				!
3287.co_medh15:
3288	lduh	[%o0], %o4		! move 8 bytes
3289	subcc	%o2, 8, %o2		! decrement length count
3290	lduw	[%o0+2], %o3
3291	sllx	%o4, 48, %o4
3292	sllx	%o3, 16, %o3
3293	or	%o4, %o3, %o3
3294	add	%o1, 8, %o1		! increase dst ptr by 8
3295	lduh	[%o0+6], %o4
3296	add	%o0, 8, %o0		! increase src ptr by 8
3297	or	%o4, %o3, %o4
3298	bgu,pt	%ncc, .co_medh15
3299	stxa	%o4, [%o1-8]%asi
3300.co_medh7:
3301	addcc	%o2, 7, %o2		! finish adjustment of remaining count
3302	bz,pt	%ncc, .co_smallx	! exit if finished
3303	cmp	%o2, 4
3304	blt,pt	%ncc, .co_small3x	! skip if less than 4 bytes left
3305	nop				!
3306	lduh	[%o0], %o4
3307	sll	%o4, 16, %o4
3308	lduh	[%o0+2], %o3
3309	or	%o3, %o4, %o4
3310	subcc	%o2, 4, %o2
3311	add	%o0, 4, %o0
3312	add	%o1, 4, %o1
3313	bnz	.co_small3x
3314	stwa	%o4, [%o1-4]%asi
3315	membar	#Sync
3316	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3317	retl
3318	mov	%g0, %o0
3319
/*
 * .co_med_byte: copy loops for an odd source address.  On entry the
 * condition codes hold the result of (src & 2), set in the delay slot of
 * the branch in .co_src_dst_unal8 that comes here.  Non-zero selects
 * .co_medbh32a (source offset 3 or 7 within a doubleword); zero falls
 * into .co_medb32 (offset 1 or 5).  The -31 count bias is applied in the
 * delay slot of that first bnz, so it is in effect on both paths and its
 * condition codes feed the ble that follows on each path.
 *
 * .co_medb32 assembles each destination doubleword from byte, halfword,
 * word and byte loads at offsets +0, +1, +3 and +7:
 *	(byte << 56) | (half << 40) | (word << 8) | byte
 * and writes it with one stxa through %asi.  The pointers are advanced
 * before the fourth group, which therefore uses negative offsets.
 */
3320	.align 16
3321.co_med_byte:
3322	bnz,pt	%ncc, .co_medbh32a	! go to correct byte move
3323	subcc	%o2, 31, %o2		! adjust length to allow cc test
3324	ble,pt	%ncc, .co_medb31
3325	nop
3326.co_medb32:				! Alignment 1 or 5
3327	subcc	%o2, 32, %o2		! decrement length count
3328
3329	ldub	[%o0], %o4		! load and store a block of 32 bytes
3330	sllx	%o4, 56, %o3
3331	lduh	[%o0+1], %o4
3332	sllx	%o4, 40, %o4
3333	or	%o4, %o3, %o3
3334	lduw	[%o0+3], %o4
3335	sllx	%o4, 8, %o4
3336	or	%o4, %o3, %o3
3337	ldub	[%o0+7], %o4
3338	or	%o4, %o3, %o4
3339	stxa	%o4, [%o1]%asi
3340
3341	ldub	[%o0+8], %o4
3342	sllx	%o4, 56, %o3
3343	lduh	[%o0+9], %o4
3344	sllx	%o4, 40, %o4
3345	or	%o4, %o3, %o3
3346	lduw	[%o0+11], %o4
3347	sllx	%o4, 8, %o4
3348	or	%o4, %o3, %o3
3349	ldub	[%o0+15], %o4
3350	or	%o4, %o3, %o4
3351	stxa	%o4, [%o1+8]%asi
3352
3353	ldub	[%o0+16], %o4
3354	sllx	%o4, 56, %o3
3355	lduh	[%o0+17], %o4
3356	sllx	%o4, 40, %o4
3357	or	%o4, %o3, %o3
3358	lduw	[%o0+19], %o4
3359	sllx	%o4, 8, %o4
3360	or	%o4, %o3, %o3
3361	ldub	[%o0+23], %o4
3362	or	%o4, %o3, %o4
3363	stxa	%o4, [%o1+16]%asi
3364
3365	add	%o0, 32, %o0		! increase src ptr by 32
3366	add	%o1, 32, %o1		! increase dst ptr by 32
3367
3368	ldub	[%o0-8], %o4
3369	sllx	%o4, 56, %o3
3370	lduh	[%o0-7], %o4
3371	sllx	%o4, 40, %o4
3372	or	%o4, %o3, %o3
3373	lduw	[%o0-5], %o4
3374	sllx	%o4, 8, %o4
3375	or	%o4, %o3, %o3
3376	ldub	[%o0-1], %o4
3377	or	%o4, %o3, %o4
3378	bgu,pt	%ncc, .co_medb32	! repeat if at least 32 bytes left
3379	stxa	%o4, [%o1-8]%asi
3380
/*
 * .co_medb31: tail of the odd-source copy for offset 1 or 5.  Re-biases
 * %o2 to (remaining - 7) and moves 8 bytes per pass in .co_medb15 with
 * the same byte/half/word/byte assembly as the 32-byte loop.
 *
 * .co_medb7 is also the tail used by the offset 3 or 7 loops
 * (.co_medbh31 and .co_medbh15 branch here).  It restores the true count,
 * copies one more word built from byte, halfword and byte loads at +0,
 * +1 and +3 when at least 4 bytes remain, and leaves any final 1..3
 * bytes to .co_small3x.  With nothing left it writes SAVED_LOFAULT back
 * to the thread's T_LOFAULT slot and returns 0.
 */
3381.co_medb31:				! 31 or fewer bytes remaining
3382	addcc	%o2, 24, %o2		! adjust count to be off by 7
3383	ble,pt	%ncc, .co_medb7		! skip if 7 or fewer bytes left
3384	nop				!
3385.co_medb15:
3386
3387	ldub	[%o0], %o4		! load and store a block of 8 bytes
3388	subcc	%o2, 8, %o2		! decrement length count
3389	sllx	%o4, 56, %o3
3390	lduh	[%o0+1], %o4
3391	sllx	%o4, 40, %o4
3392	or	%o4, %o3, %o3
3393	lduw	[%o0+3], %o4
3394	add	%o1, 8, %o1		! increase dst ptr by 8
3395	sllx	%o4, 8, %o4
3396	or	%o4, %o3, %o3
3397	ldub	[%o0+7], %o4
3398	add	%o0, 8, %o0		! increase src ptr by 8
3399	or	%o4, %o3, %o4
3400	bgu,pt	%ncc, .co_medb15
3401	stxa	%o4, [%o1-8]%asi
3402.co_medb7:
3403	addcc	%o2, 7, %o2		! finish adjustment of remaining count
3404	bz,pt	%ncc, .co_smallx	! exit if finished
3405	cmp	%o2, 4
3406	blt,pt	%ncc, .co_small3x	! skip if less than 4 bytes left
3407	nop				!
3408	ldub	[%o0], %o4		! move 4 bytes
3409	sll	%o4, 24, %o3
3410	lduh	[%o0+1], %o4
3411	sll	%o4, 8, %o4
3412	or	%o4, %o3, %o3
3413	ldub	[%o0+3], %o4
3414	or	%o4, %o3, %o4
3415	subcc	%o2, 4, %o2
3416	add	%o0, 4, %o0
3417	add	%o1, 4, %o1
3418	bnz	.co_small3x
3419	stwa	%o4, [%o1-4]%asi
3420	membar	#Sync
3421	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3422	retl
3423	mov	%g0, %o0
3424
/*
 * .co_medbh32a: odd source at offset 3 or 7 within a doubleword.  The
 * ble on entry consumes the condition codes of the "subcc %o2, 31" that
 * sits in the delay slot of the branch in .co_med_byte, so %o2 already
 * carries the -31 bias here.
 *
 * .co_medbh32 assembles each destination doubleword from byte, word,
 * halfword and byte loads at offsets +0, +1, +5 and +7:
 *	(byte << 56) | (word << 24) | (half << 8) | byte
 * and writes it with one stxa through %asi, 32 bytes per pass.  The
 * pointers are advanced before the fourth group, which therefore uses
 * negative offsets.
 */
3425	.align 16
3426.co_medbh32a:
3427	ble,pt	%ncc, .co_medbh31
3428	nop
3429.co_medbh32:				! Alignment 3 or 7
3430	subcc	%o2, 32, %o2		! decrement length count
3431
3432	ldub	[%o0], %o4		! load and store a block of 32 bytes
3433	sllx	%o4, 56, %o3
3434	lduw	[%o0+1], %o4
3435	sllx	%o4, 24, %o4
3436	or	%o4, %o3, %o3
3437	lduh	[%o0+5], %o4
3438	sllx	%o4, 8, %o4
3439	or	%o4, %o3, %o3
3440	ldub	[%o0+7], %o4
3441	or	%o4, %o3, %o4
3442	stxa	%o4, [%o1]%asi
3443
3444	ldub	[%o0+8], %o4
3445	sllx	%o4, 56, %o3
3446	lduw	[%o0+9], %o4
3447	sllx	%o4, 24, %o4
3448	or	%o4, %o3, %o3
3449	lduh	[%o0+13], %o4
3450	sllx	%o4, 8, %o4
3451	or	%o4, %o3, %o3
3452	ldub	[%o0+15], %o4
3453	or	%o4, %o3, %o4
3454	stxa	%o4, [%o1+8]%asi
3455
3456	ldub	[%o0+16], %o4
3457	sllx	%o4, 56, %o3
3458	lduw	[%o0+17], %o4
3459	sllx	%o4, 24, %o4
3460	or	%o4, %o3, %o3
3461	lduh	[%o0+21], %o4
3462	sllx	%o4, 8, %o4
3463	or	%o4, %o3, %o3
3464	ldub	[%o0+23], %o4
3465	or	%o4, %o3, %o4
3466	stxa	%o4, [%o1+16]%asi
3467
3468	add	%o0, 32, %o0		! increase src ptr by 32
3469	add	%o1, 32, %o1		! increase dst ptr by 32
3470
3471	ldub	[%o0-8], %o4
3472	sllx	%o4, 56, %o3
3473	lduw	[%o0-7], %o4
3474	sllx	%o4, 24, %o4
3475	or	%o4, %o3, %o3
3476	lduh	[%o0-3], %o4
3477	sllx	%o4, 8, %o4
3478	or	%o4, %o3, %o3
3479	ldub	[%o0-1], %o4
3480	or	%o4, %o3, %o4
3481	bgu,pt	%ncc, .co_medbh32	! repeat if at least 32 bytes left
3482	stxa	%o4, [%o1-8]%asi
3483
/*
 * .co_medbh31: tail of the odd-source copy for offset 3 or 7.  Re-biases
 * %o2 to (remaining - 7); with 7 or fewer bytes left it goes straight to
 * .co_medb7.  Otherwise .co_medbh15 moves 8 bytes per pass, assembling
 * each doubleword from byte, word, halfword and byte loads at offsets
 * +0, +1, +5 and +7:
 *	(byte << 56) | (word << 24) | (half << 8) | byte
 *
 * The doubleword is stored once per pass, from the delay slot of the
 * loop branch.  %o1 has already been advanced by then, hence the -8
 * offset; this is the same arrangement .co_medb15 uses.  The delay slot
 * is not annulled, so the store also executes on the final pass.
 * When the loop ends the remaining 0..7 bytes are handled by .co_medb7.
 */
3484.co_medbh31:
3485	addcc	%o2, 24, %o2		! adjust count to be off by 7
3486	ble,pt	%ncc, .co_medb7		! skip if 7 or fewer bytes left
3487	nop				!
3488.co_medbh15:
3489	ldub	[%o0], %o4		! load and store a block of 8 bytes
3490	sllx	%o4, 56, %o3
3491	lduw	[%o0+1], %o4
3492	sllx	%o4, 24, %o4
3493	or	%o4, %o3, %o3
3494	lduh	[%o0+5], %o4
3495	sllx	%o4, 8, %o4
3496	or	%o4, %o3, %o3
3497	ldub	[%o0+7], %o4
3498	or	%o4, %o3, %o4
3500	subcc	%o2, 8, %o2		! decrement length count
3501	add	%o1, 8, %o1		! increase dst ptr by 8
3502	add	%o0, 8, %o0		! increase src ptr by 8
3503	bgu,pt	%ncc, .co_medbh15
3504	stxa	%o4, [%o1-8]%asi	! single store of the doubleword
3505	ba	.co_medb7
3506	nop
3507/*
3508 * End of small copy (no window) code
3509 */
3510
3511/*
3512 * Long copy code
3513 */
/*
 * .co_copy_more: entry to the large-copy path.  Stores the address of
 * copyio_fault into the thread's T_LOFAULT slot, opens a new register
 * window with a frame of SA(MINFRAME + HWCOPYFRAMESIZE) bytes, and ORs
 * FPUSED_FLAG into SAVED_LOFAULT.  After the save the code addresses the
 * source as %i0, the destination as %i1 and the count as %i2.
 *
 * FPRS.FEF is tested: if it is already set, BST_FP_TOSTACK is invoked
 * (macro body not visible here; presumably it saves the FP registers
 * this path uses); if it is clear, FEF is turned on.  %asi is set to
 * ASI_USER in the delay slot of that test, so it is set on both paths.
 * "ba .co_fp_ready" has no nop: its delay slot is the prefetch at
 * .co_fp_unused, so only the wr is skipped on that path.  The incoming
 * %gsr is kept in %l5.
 *
 * The three bnz tests on the destination alignment also have no nops;
 * each delay slot is the following andcc.  .co_big_d1, .co_big_d2 and
 * .co_big_d4 return to the matching .co_big_d*f label.  Finally, a
 * source that is not 8-byte aligned is sent to .co_big_unal8.
 */
3514.co_copy_more:
3515	sethi	%hi(copyio_fault), %o3
3516	or	%o3, %lo(copyio_fault), %o3
3517	membar	#Sync
3518	stn	%o3, [THREAD_REG + T_LOFAULT]
3519
3520/*
3521 * Following code is for large copies. We know there is at
3522 * least FP_COPY bytes available. FP regs are used, so
3523 *  we save registers and fp regs before starting
3524 */
3525	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3526	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3527	rd	%fprs, %g1		! check for unused fp
3528	! if fprs.fef == 0, set it.
3529	! Setting it when already set costs more than checking
3530	andcc	%g1, FPRS_FEF, %g1	! test FEF, fprs.du = fprs.dl = 0
3531	bz,pt	%ncc, .co_fp_unused
3532	mov	ASI_USER, %asi
3533	BST_FP_TOSTACK(%o3)
3534	ba	.co_fp_ready
3535.co_fp_unused:
3536	prefetch [%i0 + (1 * CACHE_LINE)], #one_read
3537	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
3538.co_fp_ready:
3539	rd	%gsr, %l5		! save %gsr value
3540	andcc	%i1, 1, %o3		! is dest byte aligned
3541	bnz,pt	%ncc, .co_big_d1
3542.co_big_d1f:				! dest is now half word aligned
3543	andcc	%i1, 2, %o3
3544	bnz,pt	%ncc, .co_big_d2
3545.co_big_d2f:				! dest is now word aligned
3546	andcc	%i1, 4, %o3		! is dest longword aligned
3547	bnz,pt	%ncc, .co_big_d4
3548.co_big_d4f:				! dest is now long word aligned
3549	andcc	%i0, 7, %o3		! is src long word aligned
3550	brnz,pt	%o3, .co_big_unal8
3551	prefetch [%i0 + (2 * CACHE_LINE)], #one_read
/*
 * Source and destination are both 8-byte aligned.  Copy 8 bytes and then
 * 16 bytes at a time until the destination reaches a 64-byte boundary.
 *
 * %o3 is set to (dst & 0x3f) - 64, i.e. minus the number of bytes still
 * to move, and counts up towards zero.  The count in %i2 is reduced by
 * the whole amount up front.  An odd doubleword is moved first when bit
 * 3 of %o3 is set; .co_al_mv_16 then repeats while bits 5:4 of %o3 are
 * non-zero.  The second destination increment of each 16-byte pass sits
 * in the delay slot of the loop branch.
 */
3552	! Src and dst are long word aligned
3553	! align dst to 64 byte boundary
3554	andcc	%i1, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
3555	brz,pn	%o3, .co_al_to_64
3556	nop
3557	sub	%o3, 64, %o3		! %o3 has negative bytes to move
3558	add	%i2, %o3, %i2		! adjust remaining count
3559	andcc	%o3, 8, %o4		! odd long words to move?
3560	brz,pt	%o4, .co_al_to_16
3561	nop
3562	add	%o3, 8, %o3
3563	ldx	[%i0], %o4
3564	add	%i0, 8, %i0		! increment src ptr
3565	stxa	%o4, [%i1]ASI_USER
3566	add	%i1, 8, %i1		! increment dst ptr
3567! Dest is aligned on 16 bytes, src 8 byte aligned
3568.co_al_to_16:
3569	andcc	%o3, 0x30, %o4		! more to move?
3570	brz,pt	%o4, .co_al_to_64
3571	nop
3572.co_al_mv_16:
3573	add	%o3, 16, %o3
3574	ldx	[%i0], %o4
3575	stxa	%o4, [%i1]ASI_USER
3576	add	%i0, 16, %i0		! increment src ptr
3577	ldx	[%i0-8], %o4
3578	add	%i1, 8, %i1		! increment dst ptr
3579	stxa	%o4, [%i1]ASI_USER
3580	andcc	%o3, 0x30, %o4
3581	brnz,pt	%o4, .co_al_mv_16
3582	add	%i1, 8, %i1		! increment dst ptr
3583! Dest is aligned on 64 bytes, src 8 byte aligned
/*
 * .co_al_to_64: the destination is 64-byte aligned and the source is
 * 8-byte aligned.  Dispatch on bits 5:3 of the source address (%i0) to
 * the loop that matches its offset within a 64-byte block.  The label
 * suffix spells those three bits: .co_aln_101 is reached when bit 5 is
 * set, bit 4 is clear and bit 3 is set, and so on.
 *
 * Several of the andcc tests sit in the delay slot of the preceding
 * branch, so they execute on both the taken and the fall-through path;
 * the remaining delay slots are filled with source prefetches.  The
 * final brz falls through into .co_aln_111.
 */
3584.co_al_to_64:
3585	! Determine source alignment
3586	! to correct 8 byte offset
3587	andcc	%i0, 32, %o3
3588	brnz,pn	%o3, .co_aln_1
3589	andcc	%i0, 16, %o3
3590	brnz,pn	%o3, .co_aln_01
3591	andcc	%i0, 8, %o3
3592	brz,pn	%o3, .co_aln_000
3593	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3594	ba	.co_aln_001
3595	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3596.co_aln_01:
3597	brnz,pn	%o3, .co_aln_011
3598	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3599	ba	.co_aln_010
3600	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3601.co_aln_1:
3602	andcc	%i0, 16, %o3
3603	brnz,pn	%o3, .co_aln_11
3604	andcc	%i0, 8, %o3
3605	brnz,pn	%o3, .co_aln_101
3606	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3607	ba	.co_aln_100
3608	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3609.co_aln_11:
3610	brz,pn	%o3, .co_aln_110
3611	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3612
/*
 * .co_aln_111: destination 64-byte aligned, source 8-byte aligned with
 * address bits 5:3 = 111, i.e. 8 bytes below a 64-byte boundary.
 *
 * The first 8 source bytes are pre-loaded into %d0, which brings the
 * source onto a 64-byte boundary.  Each loop pass block-loads 64 source
 * bytes into %d16-%d30, copies %d16-%d28 into %d2-%d14 so that %d0-%d14
 * hold the next 64 destination bytes, block-stores them, and carries
 * %d30 forward in %d0.  The 8 carried bytes are stored after the loop.
 *
 * %o3 = remaining count rounded down to a multiple of 128; %i2 keeps the
 * residue.  %i1 holds (dst - src) while the loop runs, so [%i0 + %i1]
 * addresses the destination and only %i0 needs incrementing.
 * NOTE(review): the loop body always runs at least once, so this appears
 * to rely on %o3 being non-zero on entry -- presumably guaranteed by the
 * FP_COPY size threshold; confirm.
 */
3613.co_aln_111:
3614! Alignment off by 8 bytes
3615	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3616	ldd	[%i0], %d0
3617	add	%i0, 8, %i0
3618	sub	%i2, 8, %i2
3619	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3620	and	%i2, 0x7f, %i2		! residue bytes in %i2
3621	sub	%i1, %i0, %i1
3622.co_aln_111_loop:
3623	ldda	[%i0]ASI_BLK_P,%d16		! block load
3624	subcc	%o3, 64, %o3
3625	fmovd	%d16, %d2
3626	fmovd	%d18, %d4
3627	fmovd	%d20, %d6
3628	fmovd	%d22, %d8
3629	fmovd	%d24, %d10
3630	fmovd	%d26, %d12
3631	fmovd	%d28, %d14
3632	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3633	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3634	add	%i0, 64, %i0
3635	fmovd	%d30, %d0
3636	bgt,pt	%ncc, .co_aln_111_loop
3637	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3638	add	%i1, %i0, %i1
3639
3640	stda	%d0, [%i1]ASI_USER
3641	ba	.co_remain_stuff
3642	add	%i1, 8, %i1
3643	! END OF aln_111
3644
/*
 * .co_aln_110: destination 64-byte aligned, source 8-byte aligned with
 * address bits 5:3 = 110, i.e. 16 bytes below a 64-byte boundary.
 *
 * 16 source bytes are pre-loaded into %d0-%d2.  Each pass block-loads 64
 * source bytes into %d16-%d30, copies %d16-%d26 into %d4-%d14 to complete
 * a 64-byte output block in %d0-%d14, block-stores it, and carries
 * %d28-%d30 forward in %d0-%d2.  The 16 carried bytes are stored after
 * the loop.  %i1 holds (dst - src) while the loop runs; %o3 is the count
 * rounded down to a multiple of 128 and %i2 keeps the residue.
 */
3645.co_aln_110:
3646! Alignment off by 16 bytes
3647	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3648	ldd	[%i0], %d0
3649	ldd	[%i0+8], %d2
3650	add	%i0, 16, %i0
3651	sub	%i2, 16, %i2
3652	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3653	and	%i2, 0x7f, %i2		! residue bytes in %i2
3654	sub	%i1, %i0, %i1
3655.co_aln_110_loop:
3656	ldda	[%i0]ASI_BLK_P,%d16		! block load
3657	subcc	%o3, 64, %o3
3658	fmovd	%d16, %d4
3659	fmovd	%d18, %d6
3660	fmovd	%d20, %d8
3661	fmovd	%d22, %d10
3662	fmovd	%d24, %d12
3663	fmovd	%d26, %d14
3664	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3665	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3666	add	%i0, 64, %i0
3667	fmovd	%d28, %d0
3668	fmovd	%d30, %d2
3669	bgt,pt	%ncc, .co_aln_110_loop
3670	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3671	add	%i1, %i0, %i1
3672
3673	stda	%d0, [%i1]%asi
3674	stda	%d2, [%i1+8]%asi
3675	ba	.co_remain_stuff
3676	add	%i1, 16, %i1
3677	! END OF aln_110
3678
/*
 * .co_aln_101: destination 64-byte aligned, source 8-byte aligned with
 * address bits 5:3 = 101, i.e. 24 bytes below a 64-byte boundary.
 *
 * 24 source bytes are pre-loaded into %d0-%d4.  Each pass block-loads 64
 * source bytes into %d16-%d30, copies %d16-%d24 into %d6-%d14 to complete
 * a 64-byte output block in %d0-%d14, block-stores it, and carries
 * %d26-%d30 forward in %d0-%d4.  The 24 carried bytes are stored after
 * the loop.  %i1 holds (dst - src) while the loop runs; %o3 is the count
 * rounded down to a multiple of 128 and %i2 keeps the residue.
 */
3679.co_aln_101:
3680! Alignment off by 24 bytes
3681	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3682	ldd	[%i0], %d0
3683	ldd	[%i0+8], %d2
3684	ldd	[%i0+16], %d4
3685	add	%i0, 24, %i0
3686	sub	%i2, 24, %i2
3687	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3688	and	%i2, 0x7f, %i2		! residue bytes in %i2
3689	sub	%i1, %i0, %i1
3690.co_aln_101_loop:
3691	ldda	[%i0]ASI_BLK_P,%d16	! block load
3692	subcc	%o3, 64, %o3
3693	fmovd	%d16, %d6
3694	fmovd	%d18, %d8
3695	fmovd	%d20, %d10
3696	fmovd	%d22, %d12
3697	fmovd	%d24, %d14
3698	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3699	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3700	add	%i0, 64, %i0
3701	fmovd	%d26, %d0
3702	fmovd	%d28, %d2
3703	fmovd	%d30, %d4
3704	bgt,pt	%ncc, .co_aln_101_loop
3705	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3706	add	%i1, %i0, %i1
3707
3708	stda	%d0, [%i1]%asi
3709	stda	%d2, [%i1+8]%asi
3710	stda	%d4, [%i1+16]%asi
3711	ba	.co_remain_stuff
3712	add	%i1, 24, %i1
3713	! END OF aln_101
3714
/*
 * .co_aln_100: destination 64-byte aligned, source 8-byte aligned with
 * address bits 5:3 = 100, i.e. 32 bytes below a 64-byte boundary.
 *
 * 32 source bytes are pre-loaded into %d0-%d6.  Each pass block-loads 64
 * source bytes into %d16-%d30, copies %d16-%d22 into %d8-%d14 to complete
 * a 64-byte output block in %d0-%d14, block-stores it, and carries
 * %d24-%d30 forward in %d0-%d6.  The 32 carried bytes are stored after
 * the loop.  %i1 holds (dst - src) while the loop runs; %o3 is the count
 * rounded down to a multiple of 128 and %i2 keeps the residue.
 */
3715.co_aln_100:
3716! Alignment off by 32 bytes
3717	ldd	[%i0], %d0
3718	ldd	[%i0+8], %d2
3719	ldd	[%i0+16],%d4
3720	ldd	[%i0+24],%d6
3721	add	%i0, 32, %i0
3722	sub	%i2, 32, %i2
3723	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3724	and	%i2, 0x7f, %i2		! residue bytes in %i2
3725	sub	%i1, %i0, %i1
3726.co_aln_100_loop:
3727	ldda	[%i0]ASI_BLK_P,%d16	! block load
3728	subcc	%o3, 64, %o3
3729	fmovd	%d16, %d8
3730	fmovd	%d18, %d10
3731	fmovd	%d20, %d12
3732	fmovd	%d22, %d14
3733	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3734	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3735	add	%i0, 64, %i0
3736	fmovd	%d24, %d0
3737	fmovd	%d26, %d2
3738	fmovd	%d28, %d4
3739	fmovd	%d30, %d6
3740	bgt,pt	%ncc, .co_aln_100_loop
3741	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3742	add	%i1, %i0, %i1
3743
3744	stda	%d0, [%i1]%asi
3745	stda	%d2, [%i1+8]%asi
3746	stda	%d4, [%i1+16]%asi
3747	stda	%d6, [%i1+24]%asi
3748	ba	.co_remain_stuff
3749	add	%i1, 32, %i1
3750	! END OF aln_100
3751
/*
 * .co_aln_011: destination 64-byte aligned, source 8-byte aligned with
 * address bits 5:3 = 011, i.e. 40 bytes below a 64-byte boundary.
 *
 * 40 source bytes are pre-loaded into %d0-%d8.  Each pass block-loads 64
 * source bytes into %d16-%d30, copies %d16-%d20 into %d10-%d14 to
 * complete a 64-byte output block in %d0-%d14, block-stores it, and
 * carries %d22-%d30 forward in %d0-%d8.  The 40 carried bytes are stored
 * after the loop.  %i1 holds (dst - src) while the loop runs; %o3 is the
 * count rounded down to a multiple of 128 and %i2 keeps the residue.
 */
3752.co_aln_011:
3753! Alignment off by 40 bytes
3754	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3755	ldd	[%i0], %d0
3756	ldd	[%i0+8], %d2
3757	ldd	[%i0+16], %d4
3758	ldd	[%i0+24], %d6
3759	ldd	[%i0+32], %d8
3760	add	%i0, 40, %i0
3761	sub	%i2, 40, %i2
3762	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3763	and	%i2, 0x7f, %i2		! residue bytes in %i2
3764	sub	%i1, %i0, %i1
3765.co_aln_011_loop:
3766	ldda	[%i0]ASI_BLK_P,%d16	! block load
3767	subcc	%o3, 64, %o3
3768	fmovd	%d16, %d10
3769	fmovd	%d18, %d12
3770	fmovd	%d20, %d14
3771	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3772	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3773	add	%i0, 64, %i0
3774	fmovd	%d22, %d0
3775	fmovd	%d24, %d2
3776	fmovd	%d26, %d4
3777	fmovd	%d28, %d6
3778	fmovd	%d30, %d8
3779	bgt,pt	%ncc, .co_aln_011_loop
3780	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3781	add	%i1, %i0, %i1
3782
3783	stda	%d0, [%i1]%asi
3784	stda	%d2, [%i1+8]%asi
3785	stda	%d4, [%i1+16]%asi
3786	stda	%d6, [%i1+24]%asi
3787	stda	%d8, [%i1+32]%asi
3788	ba	.co_remain_stuff
3789	add	%i1, 40, %i1
3790	! END OF aln_011
3791
/*
 * .co_aln_010: destination 64-byte aligned, source 8-byte aligned with
 * address bits 5:3 = 010, i.e. 48 bytes below a 64-byte boundary.
 *
 * 48 source bytes are pre-loaded into %d0-%d10.  Each pass block-loads
 * 64 source bytes into %d16-%d30, copies %d16-%d18 into %d12-%d14 to
 * complete a 64-byte output block in %d0-%d14, block-stores it, and
 * carries %d20-%d30 forward in %d0-%d10.  The 48 carried bytes are
 * stored after the loop.  %i1 holds (dst - src) while the loop runs; %o3
 * is the count rounded down to a multiple of 128 and %i2 keeps the
 * residue.
 */
3792.co_aln_010:
3793! Alignment off by 48 bytes
3794	ldd	[%i0], %d0
3795	ldd	[%i0+8], %d2
3796	ldd	[%i0+16], %d4
3797	ldd	[%i0+24], %d6
3798	ldd	[%i0+32], %d8
3799	ldd	[%i0+40], %d10
3800	add	%i0, 48, %i0
3801	sub	%i2, 48, %i2
3802	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3803	and	%i2, 0x7f, %i2		! residue bytes in %i2
3804	sub	%i1, %i0, %i1
3805.co_aln_010_loop:
3806	ldda	[%i0]ASI_BLK_P,%d16	! block load
3807	subcc	%o3, 64, %o3
3808	fmovd	%d16, %d12
3809	fmovd	%d18, %d14
3810	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3811	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3812	add	%i0, 64, %i0
3813	fmovd	%d20, %d0
3814	fmovd	%d22, %d2
3815	fmovd	%d24, %d4
3816	fmovd	%d26, %d6
3817	fmovd	%d28, %d8
3818	fmovd	%d30, %d10
3819	bgt,pt	%ncc, .co_aln_010_loop
3820	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3821	add	%i1, %i0, %i1
3822
3823	stda	%d0, [%i1]%asi
3824	stda	%d2, [%i1+8]%asi
3825	stda	%d4, [%i1+16]%asi
3826	stda	%d6, [%i1+24]%asi
3827	stda	%d8, [%i1+32]%asi
3828	stda	%d10, [%i1+40]%asi
3829	ba	.co_remain_stuff
3830	add	%i1, 48, %i1
3831	! END OF aln_010
3832
/*
 * .co_aln_001: destination 64-byte aligned, source 8-byte aligned with
 * address bits 5:3 = 001, i.e. 56 bytes below a 64-byte boundary.
 *
 * 56 source bytes are pre-loaded into %d0-%d12.  Each pass block-loads
 * 64 source bytes into %d16-%d30, copies %d16 into %d14 to complete a
 * 64-byte output block in %d0-%d14, block-stores it, and carries
 * %d18-%d30 forward in %d0-%d12.  The 56 carried bytes are stored after
 * the loop.  %i1 holds (dst - src) while the loop runs; %o3 is the count
 * rounded down to a multiple of 128 and %i2 keeps the residue.
 */
3833.co_aln_001:
3834! Alignment off by 56 bytes
3835	ldd	[%i0], %d0
3836	ldd	[%i0+8], %d2
3837	ldd	[%i0+16], %d4
3838	ldd	[%i0+24], %d6
3839	ldd	[%i0+32], %d8
3840	ldd	[%i0+40], %d10
3841	ldd	[%i0+48], %d12
3842	add	%i0, 56, %i0
3843	sub	%i2, 56, %i2
3844	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3845	and	%i2, 0x7f, %i2		! residue bytes in %i2
3846	sub	%i1, %i0, %i1
3847.co_aln_001_loop:
3848	ldda	[%i0]ASI_BLK_P,%d16	! block load
3849	subcc	%o3, 64, %o3
3850	fmovd	%d16, %d14
3851	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3852	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3853	add	%i0, 64, %i0
3854	fmovd	%d18, %d0
3855	fmovd	%d20, %d2
3856	fmovd	%d22, %d4
3857	fmovd	%d24, %d6
3858	fmovd	%d26, %d8
3859	fmovd	%d28, %d10
3860	fmovd	%d30, %d12
3861	bgt,pt	%ncc, .co_aln_001_loop
3862	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3863	add	%i1, %i0, %i1
3864
3865	stda	%d0, [%i1]%asi
3866	stda	%d2, [%i1+8]%asi
3867	stda	%d4, [%i1+16]%asi
3868	stda	%d6, [%i1+24]%asi
3869	stda	%d8, [%i1+32]%asi
3870	stda	%d10, [%i1+40]%asi
3871	stda	%d12, [%i1+48]%asi
3872	ba	.co_remain_stuff
3873	add	%i1, 56, %i1
3874	! END OF aln_001
3875
/*
 * .co_aln_000: source and destination are both 64-byte aligned (source
 * address bits 5:3 = 000), so no register shuffling is needed.  Each
 * pass block-loads 64 source bytes into %d0-%d14 and block-stores them
 * to the destination.
 *
 * %o3 = count rounded down to a multiple of 128; %i2 keeps the residue.
 * %i1 holds (dst - src) while the loop runs and is turned back into the
 * destination pointer afterwards.  Control then falls through into
 * .co_remain_stuff.
 * NOTE(review): the loop body always runs at least once, so this appears
 * to rely on %o3 being non-zero on entry -- presumably guaranteed by the
 * FP_COPY size threshold; confirm.
 */
3876.co_aln_000:
3877	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3878	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3879	and	%i2, 0x7f, %i2		! residue bytes in %i2
3880	sub	%i1, %i0, %i1
3881.co_aln_000_loop:
3882	ldda	[%i0]ASI_BLK_P,%d0
3883	subcc	%o3, 64, %o3
3884	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3885	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3886	add	%i0, 64, %i0
3887	bgt,pt	%ncc, .co_aln_000_loop
3888	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3889	add	%i1, %i0, %i1
3890
3891	! END OF aln_000
3892
/*
 * .co_remain_stuff: copy the residue left in %i2 after the aligned block
 * loops.  Doublewords are read with ldx and written with stxa through
 * %asi.
 *
 * The count uses the same biasing scheme as the small-copy loops:
 *   -31 on entry     : positive means at least 32 bytes remain
 *                      (.co_aln_32, 32 bytes per pass);
 *   +24 at .co_aln_31: net bias -7, positive means at least 8 bytes
 *                      remain (.co_aln_15, 8 bytes per pass);
 *   +7  at .co_aln_7 : true remaining count restored.
 * .co_aln_7 copies one more word when at least 4 bytes remain, hands any
 * final 1..3 bytes to .co_unaln3x, and otherwise goes to .co_exit.
 */
3893.co_remain_stuff:
3894	subcc	%i2, 31, %i2		! adjust length to allow cc test
3895	ble,pt	%ncc, .co_aln_31
3896	nop
3897.co_aln_32:
3898	ldx	[%i0], %o4		! move 32 bytes
3899	subcc	%i2, 32, %i2		! decrement length count by 32
3900	stxa	%o4, [%i1]%asi
3901	ldx	[%i0+8], %o4
3902	stxa	%o4, [%i1+8]%asi
3903	ldx	[%i0+16], %o4
3904	add	%i0, 32, %i0		! increase src ptr by 32
3905	stxa	%o4, [%i1+16]%asi
3906	ldx	[%i0-8], %o4
3907	add	%i1, 32, %i1		! increase dst ptr by 32
3908	bgu,pt	%ncc, .co_aln_32	! repeat if at least 32 bytes left
3909	stxa	%o4, [%i1-8]%asi
3910.co_aln_31:
3911	addcc	%i2, 24, %i2		! adjust count to be off by 7
3912	ble,pt	%ncc, .co_aln_7		! skip if 7 or fewer bytes left
3913	nop				!
3914.co_aln_15:
3915	ldx	[%i0], %o4		! move 8 bytes
3916	add	%i0, 8, %i0		! increase src ptr by 8
3917	subcc	%i2, 8, %i2		! decrease count by 8
3918	add	%i1, 8, %i1		! increase dst ptr by 8
3919	bgu,pt	%ncc, .co_aln_15
3920	stxa	%o4, [%i1-8]%asi
3921.co_aln_7:
3922	addcc	%i2, 7, %i2		! finish adjustment of remaining count
3923	bz,pt	%ncc, .co_exit		! exit if finished
3924	cmp	%i2, 4
3925	blt,pt	%ncc, .co_unaln3x	! skip if less than 4 bytes left
3926	nop				!
3927	ld	[%i0], %o4		! move 4 bytes
3928	add	%i0, 4, %i0		! increase src ptr by 4
3929	add	%i1, 4, %i1		! increase dst ptr by 4
3930	subcc	%i2, 4, %i2		! decrease count by 4
3931	bnz	.co_unaln3x
3932	stwa	%o4, [%i1-4]%asi
3933	ba	.co_exit
3934	nop
3935
3936	! destination alignment code
/*
 * .co_big_d1 / .co_big_d2 / .co_big_d4: destination alignment steps for
 * the large-copy path, entered from the bnz tests after .co_fp_ready.
 * They copy 1, 2 and 4 bytes respectively so that the destination (%i1)
 * ends up 8-byte aligned.  The source alignment is unknown, so the
 * halfword and the word are assembled from single-byte loads.  Stores
 * use ASI_USER.
 *
 * .co_big_d1 and .co_big_d2 test the next alignment bit and return to
 * .co_big_d2f / .co_big_d4f when it is already clear; otherwise they
 * fall through to the next step.  .co_big_d4 always returns to
 * .co_big_d4f.  Each count update sits in a branch delay slot.
 */
3937.co_big_d1:
3938	ldub	[%i0], %o4		! move a byte
3939	add	%i0, 1, %i0
3940	stba	%o4, [%i1]ASI_USER
3941	add	%i1, 1, %i1
3942	andcc	%i1, 2, %o3
3943	bz,pt	%ncc, .co_big_d2f
3944	sub	%i2, 1, %i2
3945.co_big_d2:
3946	ldub	[%i0], %o4		! move a half-word (src align unknown)
3947	ldub	[%i0+1], %o3
3948	add	%i0, 2, %i0
3949	sll	%o4, 8, %o4		! position
3950	or	%o4, %o3, %o4		! merge
3951	stha	%o4, [%i1]ASI_USER
3952	add	%i1, 2, %i1
3953	andcc	%i1, 4, %o3		! is dest longword aligned
3954	bz,pt	%ncc, .co_big_d4f
3955	sub	%i2, 2, %i2
3956.co_big_d4:				! dest is at least word aligned
3957	nop
3958	ldub	[%i0], %o4		! move a word (src align unknown)
3959	ldub	[%i0+1], %o3
3960	sll	%o4, 24, %o4		! position
3961	sll	%o3, 16, %o3		! position
3962	or	%o4, %o3, %o3		! merge
3963	ldub	[%i0+2], %o4
3964	sll	%o4, 8, %o4		! position
3965	or	%o4, %o3, %o3		! merge
3966	ldub	[%i0+3], %o4
3967	or	%o4, %o3, %o4		! merge
3968	stwa	%o4,[%i1]ASI_USER	! store four bytes
3969	add	%i0, 4, %i0		! adjust src by 4
3970	add	%i1, 4, %i1		! adjust dest by 4
3971	ba	.co_big_d4f
3972	sub	%i2, 4, %i2		! adjust count by 4
3973
3974
3975	! Dst is on 8 byte boundary; src is not;
/*
 * .co_big_unal8: destination is 8-byte aligned, source is not.  Copy in
 * 8-byte steps until the destination reaches a 64-byte boundary, then
 * continue at .co_unalnsrc.
 *
 * If the destination is already 64-byte aligned the bz goes straight to
 * .co_unalnsrc.  Its delay slot still executes the sub, but %o3 is
 * recomputed after .co_unalnsrc before it is read.  Otherwise %o3
 * becomes 64 - (dst & 0x3f), the number of bytes to move here, and %i2
 * is reduced by that amount up front.
 *
 * The pre-alignment loop is chosen by source alignment: odd ->
 * .co_unalnbyte, 2-byte aligned -> .co_unalnhalf, 4-byte aligned ->
 * .co_unalnword (below), which moves two words per pass.
 */
3976.co_big_unal8:
3977	andcc	%i1, 0x3f, %o3		! is dst 64-byte block aligned?
3978	bz	%ncc, .co_unalnsrc
3979	sub	%o3, 64, %o3		! %o3 will be multiple of 8
3980	neg	%o3			! bytes until dest is 64 byte aligned
3981	sub	%i2, %o3, %i2		! update cnt with bytes to be moved
3982	! Move bytes according to source alignment
3983	andcc	%i0, 0x1, %o4
3984	bnz	%ncc, .co_unalnbyte	! check for byte alignment
3985	nop
3986	andcc	%i0, 2, %o4		! check for half word alignment
3987	bnz	%ncc, .co_unalnhalf
3988	nop
3989	! Src is word aligned, move bytes until dest 64 byte aligned
3990.co_unalnword:
3991	ld	[%i0], %o4		! load 4 bytes
3992	stwa	%o4, [%i1]%asi		! and store 4 bytes
3993	ld	[%i0+4], %o4		! load 4 bytes
3994	add	%i0, 8, %i0		! increase src ptr by 8
3995	stwa	%o4, [%i1+4]%asi	! and store 4 bytes
3996	subcc	%o3, 8, %o3		! decrease count by 8
3997	bnz	%ncc, .co_unalnword
3998	add	%i1, 8, %i1		! increase dst ptr by 8
3999	ba	.co_unalnsrc
4000	nop
4001
4002	! Src is half-word aligned, move bytes until dest 64 byte aligned
/*
 * .co_unalnhalf: pre-alignment loop for a 2-byte aligned source.  Moves
 * 8 bytes per pass until %o3 (bytes to the destination's 64-byte
 * boundary) reaches zero, then goes to .co_unalnsrc.
 *
 * Each doubleword is assembled in %i3 from halfword, word and halfword
 * loads at offsets +0, +2 and +6:
 *	(((half << 32) | word) << 16) | half
 * and stored with one stxa using ASI_USER.  The destination increment
 * sits in the delay slot of the loop branch.
 */
4003.co_unalnhalf:
4004	lduh	[%i0], %o4		! load 2 bytes
4005	sllx	%o4, 32, %i3		! shift left
4006	lduw	[%i0+2], %o4
4007	or	%o4, %i3, %i3
4008	sllx	%i3, 16, %i3
4009	lduh	[%i0+6], %o4
4010	or	%o4, %i3, %i3
4011	stxa	%i3, [%i1]ASI_USER
4012	add	%i0, 8, %i0
4013	subcc	%o3, 8, %o3
4014	bnz	%ncc, .co_unalnhalf
4015	add	%i1, 8, %i1
4016	ba	.co_unalnsrc
4017	nop
4018
4019	! Src is Byte aligned, move bytes until dest 64 byte aligned
/*
 * .co_unalnbyte: pre-alignment loop for an odd source address.  Moves 8
 * bytes per pass until %o3 (bytes to the destination's 64-byte boundary)
 * reaches zero, then falls through into .co_unalnsrc.
 *
 * %i1 is turned into (dst - src) for the duration of the loop so that
 * [%i1 + %i0] addresses the destination and only %i0 is incremented (in
 * the delay slot of the loop branch); %i1 is restored afterwards.
 *
 * Each doubleword is assembled in %i3 from a byte, three halfwords and a
 * byte at offsets +0, +1, +3, +5 and +7:
 *	(byte << 56) | (half << 40) | (half << 24) | (half << 8) | byte
 */
4020.co_unalnbyte:
4021	sub	%i1, %i0, %i1		! share pointer advance
4022.co_unalnbyte_loop:
4023	ldub	[%i0], %o4
4024	sllx	%o4, 56, %i3
4025	lduh	[%i0+1], %o4
4026	sllx	%o4, 40, %o4
4027	or	%o4, %i3, %i3
4028	lduh	[%i0+3], %o4
4029	sllx	%o4, 24, %o4
4030	or	%o4, %i3, %i3
4031	lduh	[%i0+5], %o4
4032	sllx	%o4, 8, %o4
4033	or	%o4, %i3, %i3
4034	ldub	[%i0+7], %o4
4035	or	%o4, %i3, %i3
4036	stxa	%i3, [%i1+%i0]ASI_USER
4037	subcc	%o3, 8, %o3
4038	bnz	%ncc, .co_unalnbyte_loop
4039	add	%i0, 8, %i0
4040	add	%i1,%i0, %i1		! restore pointer
4041
4042	! Destination is now block (64 byte aligned), src is not 8 byte aligned
/*
 * .co_unalnsrc: destination is 64-byte aligned, source is not 8-byte
 * aligned.  Set up and dispatch to the faligndata block loops.
 *
 *   %i3 = (count rounded down to a multiple of 64) - 64: block bytes to
 *         store in the loops;
 *   %i2 = (count & 0x3f) + 64: bytes left for .co_unaln_done.  One block
 *         is held back so the loops, which load one block ahead of what
 *         they store, do not read past the end of the source;
 *   %o4 = source address rounded down to 64 bytes;
 *   alignaddr records the low bits of the source address in %gsr for
 *         the later faligndata instructions;
 *   %i0 is advanced by %i3 to where the source will be after the loops.
 *
 * The dispatch is on bits 5:3 of %i0.  %i3 is a multiple of 64, so these
 * are the same bits as in the original source address.  The label suffix
 * spells the three bits.  Branches written with ",a" annul their delay
 * slot when not taken, so those prefetches run only on the taken path.
 * The last brz falls through into .co_unaln_111.
 */
4043.co_unalnsrc:
4044	andn	%i2, 0x3f, %i3		! %i3 is multiple of block size
4045	and	%i2, 0x3f, %i2		! residue bytes in %i2
4046	add	%i2, 64, %i2		! Insure we don't load beyond
4047	sub	%i3, 64, %i3		! end of source buffer
4048
4049	andn	%i0, 0x3f, %o4		! %o4 has block aligned src address
4050	prefetch [%o4 + (3 * CACHE_LINE)], #one_read
4051	alignaddr %i0, %g0, %g0		! generate %gsr
4052	add	%i0, %i3, %i0		! advance %i0 to after blocks
4053	!
4054	! Determine source alignment to correct 8 byte offset
4055	andcc	%i0, 0x20, %o3
4056	brnz,pn	%o3, .co_unaln_1
4057	andcc	%i0, 0x10, %o3
4058	brnz,pn	%o3, .co_unaln_01
4059	andcc	%i0, 0x08, %o3
4060	brz,a	%o3, .co_unaln_000
4061	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4062	ba	.co_unaln_001
4063	nop
4064.co_unaln_01:
4065	brnz,a	%o3, .co_unaln_011
4066	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4067	ba	.co_unaln_010
4068	nop
4069.co_unaln_1:
4070	brnz,pn	%o3, .co_unaln_11
4071	andcc	%i0, 0x08, %o3
4072	brnz,a	%o3, .co_unaln_101
4073	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4074	ba	.co_unaln_100
4075	nop
4076.co_unaln_11:
4077	brz,pn	%o3, .co_unaln_110
4078	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
4079
/*
 * .co_unaln_111: destination 64-byte aligned, source not 8-byte aligned,
 * source address bits 5:3 = 111.  %o4 is the source address rounded down
 * to 64 bytes, so the doubleword at %o4+56 is the aligned doubleword
 * that contains the first source byte; it is pre-loaded into %d14.
 *
 * Each pass block-loads the next 64 aligned source bytes into %d16-%d30
 * and runs faligndata over successive register pairs, starting at
 * (%d14, %d16), to form eight realigned doublewords in %d48-%d62.  These
 * are block-stored to the destination, and %d30 is carried into %d14 for
 * the next pass.  %i3 counts the block bytes still to be stored.
 */
4080.co_unaln_111:
4081	ldd	[%o4+56], %d14
4082.co_unaln_111_loop:
4083	add	%o4, 64, %o4
4084	ldda	[%o4]ASI_BLK_P, %d16
4085	faligndata %d14, %d16, %d48
4086	faligndata %d16, %d18, %d50
4087	faligndata %d18, %d20, %d52
4088	faligndata %d20, %d22, %d54
4089	faligndata %d22, %d24, %d56
4090	faligndata %d24, %d26, %d58
4091	faligndata %d26, %d28, %d60
4092	faligndata %d28, %d30, %d62
4093	fmovd	%d30, %d14
4094	stda	%d48, [%i1]ASI_BLK_AIUS
4095	subcc	%i3, 64, %i3
4096	add	%i1, 64, %i1
4097	bgu,pt	%ncc, .co_unaln_111_loop
4098	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4099	ba	.co_unaln_done
4100	nop
4101
/*
 * .co_unaln_110: destination 64-byte aligned, source not 8-byte aligned,
 * source address bits 5:3 = 110.  The aligned doublewords at %o4+48 and
 * %o4+56 (%o4 = source rounded down to 64 bytes) are pre-loaded into
 * %d12-%d14.
 *
 * Each pass block-loads the next 64 aligned source bytes into %d16-%d30
 * and runs faligndata over successive register pairs, starting at
 * (%d12, %d14), to form eight realigned doublewords in %d48-%d62, which
 * are block-stored to the destination.  %d28-%d30 are carried into
 * %d12-%d14 for the next pass.  %i3 counts the block bytes still to be
 * stored.
 */
4102.co_unaln_110:
4103	ldd	[%o4+48], %d12
4104	ldd	[%o4+56], %d14
4105.co_unaln_110_loop:
4106	add	%o4, 64, %o4
4107	ldda	[%o4]ASI_BLK_P, %d16
4108	faligndata %d12, %d14, %d48
4109	faligndata %d14, %d16, %d50
4110	faligndata %d16, %d18, %d52
4111	faligndata %d18, %d20, %d54
4112	faligndata %d20, %d22, %d56
4113	faligndata %d22, %d24, %d58
4114	faligndata %d24, %d26, %d60
4115	faligndata %d26, %d28, %d62
4116	fmovd	%d28, %d12
4117	fmovd	%d30, %d14
4118	stda	%d48, [%i1]ASI_BLK_AIUS
4119	subcc	%i3, 64, %i3
4120	add	%i1, 64, %i1
4121	bgu,pt	%ncc, .co_unaln_110_loop
4122	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4123	ba	.co_unaln_done
4124	nop
4125
/*
 * .co_unaln_101: destination 64-byte aligned, source not 8-byte aligned,
 * source address bits 5:3 = 101.  The aligned doublewords at %o4+40
 * through %o4+56 (%o4 = source rounded down to 64 bytes) are pre-loaded
 * into %d10-%d14.
 *
 * Each pass block-loads the next 64 aligned source bytes into %d16-%d30
 * and runs faligndata over successive register pairs, starting at
 * (%d10, %d12), to form eight realigned doublewords in %d48-%d62, which
 * are block-stored to the destination.  %d26-%d30 are carried into
 * %d10-%d14 for the next pass.  %i3 counts the block bytes still to be
 * stored.
 */
4126.co_unaln_101:
4127	ldd	[%o4+40], %d10
4128	ldd	[%o4+48], %d12
4129	ldd	[%o4+56], %d14
4130.co_unaln_101_loop:
4131	add	%o4, 64, %o4
4132	ldda	[%o4]ASI_BLK_P, %d16
4133	faligndata %d10, %d12, %d48
4134	faligndata %d12, %d14, %d50
4135	faligndata %d14, %d16, %d52
4136	faligndata %d16, %d18, %d54
4137	faligndata %d18, %d20, %d56
4138	faligndata %d20, %d22, %d58
4139	faligndata %d22, %d24, %d60
4140	faligndata %d24, %d26, %d62
4141	fmovd	%d26, %d10
4142	fmovd	%d28, %d12
4143	fmovd	%d30, %d14
4144	stda	%d48, [%i1]ASI_BLK_AIUS
4145	subcc	%i3, 64, %i3
4146	add	%i1, 64, %i1
4147	bgu,pt	%ncc, .co_unaln_101_loop
4148	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4149	ba	.co_unaln_done
4150	nop
4151
/*
 * .co_unaln_100: destination 64-byte aligned, source not 8-byte aligned,
 * source address bits 5:3 = 100.  The aligned doublewords at %o4+32
 * through %o4+56 (%o4 = source rounded down to 64 bytes) are pre-loaded
 * into %d8-%d14.
 *
 * Each pass block-loads the next 64 aligned source bytes into %d16-%d30
 * and runs faligndata over successive register pairs, starting at
 * (%d8, %d10), to form eight realigned doublewords in %d48-%d62, which
 * are block-stored to the destination.  %d24-%d30 are carried into
 * %d8-%d14 for the next pass.  %i3 counts the block bytes still to be
 * stored.
 */
4152.co_unaln_100:
4153	ldd	[%o4+32], %d8
4154	ldd	[%o4+40], %d10
4155	ldd	[%o4+48], %d12
4156	ldd	[%o4+56], %d14
4157.co_unaln_100_loop:
4158	add	%o4, 64, %o4
4159	ldda	[%o4]ASI_BLK_P, %d16
4160	faligndata %d8, %d10, %d48
4161	faligndata %d10, %d12, %d50
4162	faligndata %d12, %d14, %d52
4163	faligndata %d14, %d16, %d54
4164	faligndata %d16, %d18, %d56
4165	faligndata %d18, %d20, %d58
4166	faligndata %d20, %d22, %d60
4167	faligndata %d22, %d24, %d62
4168	fmovd	%d24, %d8
4169	fmovd	%d26, %d10
4170	fmovd	%d28, %d12
4171	fmovd	%d30, %d14
4172	stda	%d48, [%i1]ASI_BLK_AIUS
4173	subcc	%i3, 64, %i3
4174	add	%i1, 64, %i1
4175	bgu,pt	%ncc, .co_unaln_100_loop
4176	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4177	ba	.co_unaln_done
4178	nop
4179
/*
 * .co_unaln_011: destination 64-byte aligned, source not 8-byte aligned,
 * source address bits 5:3 = 011.  The aligned doublewords at %o4+24
 * through %o4+56 (%o4 = source rounded down to 64 bytes) are pre-loaded
 * into %d6-%d14.
 *
 * Each pass block-loads the next 64 aligned source bytes into %d16-%d30
 * and runs faligndata over successive register pairs, starting at
 * (%d6, %d8), to form eight realigned doublewords in %d48-%d62, which
 * are block-stored to the destination.  %d22-%d30 are carried into
 * %d6-%d14 for the next pass.  %i3 counts the block bytes still to be
 * stored.
 */
4180.co_unaln_011:
4181	ldd	[%o4+24], %d6
4182	ldd	[%o4+32], %d8
4183	ldd	[%o4+40], %d10
4184	ldd	[%o4+48], %d12
4185	ldd	[%o4+56], %d14
4186.co_unaln_011_loop:
4187	add	%o4, 64, %o4
4188	ldda	[%o4]ASI_BLK_P, %d16
4189	faligndata %d6, %d8, %d48
4190	faligndata %d8, %d10, %d50
4191	faligndata %d10, %d12, %d52
4192	faligndata %d12, %d14, %d54
4193	faligndata %d14, %d16, %d56
4194	faligndata %d16, %d18, %d58
4195	faligndata %d18, %d20, %d60
4196	faligndata %d20, %d22, %d62
4197	fmovd	%d22, %d6
4198	fmovd	%d24, %d8
4199	fmovd	%d26, %d10
4200	fmovd	%d28, %d12
4201	fmovd	%d30, %d14
4202	stda	%d48, [%i1]ASI_BLK_AIUS
4203	subcc	%i3, 64, %i3
4204	add	%i1, 64, %i1
4205	bgu,pt	%ncc, .co_unaln_011_loop
4206	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4207	ba	.co_unaln_done
4208	nop
4209
/*
 * .co_unaln_010: destination 64-byte aligned, source not 8-byte aligned,
 * source address bits 5:3 = 010.  The aligned doublewords at %o4+16
 * through %o4+56 (%o4 = source rounded down to 64 bytes) are pre-loaded
 * into %d4-%d14.
 *
 * Each pass block-loads the next 64 aligned source bytes into %d16-%d30
 * and runs faligndata over successive register pairs, starting at
 * (%d4, %d6), to form eight realigned doublewords in %d48-%d62, which
 * are block-stored to the destination.  %d20-%d30 are carried into
 * %d4-%d14 for the next pass.  %i3 counts the block bytes still to be
 * stored.
 */
4210.co_unaln_010:
4211	ldd	[%o4+16], %d4
4212	ldd	[%o4+24], %d6
4213	ldd	[%o4+32], %d8
4214	ldd	[%o4+40], %d10
4215	ldd	[%o4+48], %d12
4216	ldd	[%o4+56], %d14
4217.co_unaln_010_loop:
4218	add	%o4, 64, %o4
4219	ldda	[%o4]ASI_BLK_P, %d16
4220	faligndata %d4, %d6, %d48
4221	faligndata %d6, %d8, %d50
4222	faligndata %d8, %d10, %d52
4223	faligndata %d10, %d12, %d54
4224	faligndata %d12, %d14, %d56
4225	faligndata %d14, %d16, %d58
4226	faligndata %d16, %d18, %d60
4227	faligndata %d18, %d20, %d62
4228	fmovd	%d20, %d4
4229	fmovd	%d22, %d6
4230	fmovd	%d24, %d8
4231	fmovd	%d26, %d10
4232	fmovd	%d28, %d12
4233	fmovd	%d30, %d14
4234	stda	%d48, [%i1]ASI_BLK_AIUS
4235	subcc	%i3, 64, %i3
4236	add	%i1, 64, %i1
4237	bgu,pt	%ncc, .co_unaln_010_loop
4238	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4239	ba	.co_unaln_done
4240	nop
4241
/*
 * .co_unaln_001: destination 64-byte aligned, source not 8-byte aligned,
 * source address bits 5:3 = 001.  The aligned doublewords at %o4+8
 * through %o4+56 (%o4 = source rounded down to 64 bytes) are pre-loaded
 * into %d2-%d14.
 *
 * Each pass block-loads the next 64 aligned source bytes into %d16-%d30
 * and runs faligndata over successive register pairs, starting at
 * (%d2, %d4), to form eight realigned doublewords in %d48-%d62, which
 * are block-stored to the destination.  %d18-%d30 are carried into
 * %d2-%d14 for the next pass.  %i3 counts the block bytes still to be
 * stored.
 */
4242.co_unaln_001:
4243	ldd	[%o4+8], %d2
4244	ldd	[%o4+16], %d4
4245	ldd	[%o4+24], %d6
4246	ldd	[%o4+32], %d8
4247	ldd	[%o4+40], %d10
4248	ldd	[%o4+48], %d12
4249	ldd	[%o4+56], %d14
4250.co_unaln_001_loop:
4251	add	%o4, 64, %o4
4252	ldda	[%o4]ASI_BLK_P, %d16
4253	faligndata %d2, %d4, %d48
4254	faligndata %d4, %d6, %d50
4255	faligndata %d6, %d8, %d52
4256	faligndata %d8, %d10, %d54
4257	faligndata %d10, %d12, %d56
4258	faligndata %d12, %d14, %d58
4259	faligndata %d14, %d16, %d60
4260	faligndata %d16, %d18, %d62
4261	fmovd	%d18, %d2
4262	fmovd	%d20, %d4
4263	fmovd	%d22, %d6
4264	fmovd	%d24, %d8
4265	fmovd	%d26, %d10
4266	fmovd	%d28, %d12
4267	fmovd	%d30, %d14
4268	stda	%d48, [%i1]ASI_BLK_AIUS
4269	subcc	%i3, 64, %i3
4270	add	%i1, 64, %i1
4271	bgu,pt	%ncc, .co_unaln_001_loop
4272	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4273	ba	.co_unaln_done
4274	nop
4275
/*
 * .co_unaln_000: destination 64-byte aligned, source not 8-byte aligned,
 * source address bits 5:3 = 000.  The whole aligned 64-byte block at %o4
 * (source rounded down to 64 bytes) is block-loaded into %d0-%d14.
 *
 * Each pass block-loads the next 64 aligned source bytes into %d16-%d30
 * and runs faligndata over successive register pairs, starting at
 * (%d0, %d2), to form eight realigned doublewords in %d48-%d62, which
 * are block-stored to the destination.  %d16-%d30 are then copied into
 * %d0-%d14 for the next pass.  %i3 counts the block bytes still to be
 * stored.  When the loop ends control falls through into
 * .co_unaln_done.
 */
4276.co_unaln_000:
4277	ldda	[%o4]ASI_BLK_P, %d0
4278.co_unaln_000_loop:
4279	add	%o4, 64, %o4
4280	ldda	[%o4]ASI_BLK_P, %d16
4281	faligndata %d0, %d2, %d48
4282	faligndata %d2, %d4, %d50
4283	faligndata %d4, %d6, %d52
4284	faligndata %d6, %d8, %d54
4285	faligndata %d8, %d10, %d56
4286	faligndata %d10, %d12, %d58
4287	faligndata %d12, %d14, %d60
4288	faligndata %d14, %d16, %d62
4289	fmovd	%d16, %d0
4290	fmovd	%d18, %d2
4291	fmovd	%d20, %d4
4292	fmovd	%d22, %d6
4293	fmovd	%d24, %d8
4294	fmovd	%d26, %d10
4295	fmovd	%d28, %d12
4296	fmovd	%d30, %d14
4297	stda	%d48, [%i1]ASI_BLK_AIUS
4298	subcc	%i3, 64, %i3
4299	add	%i1, 64, %i1
4300	bgu,pt	%ncc, .co_unaln_000_loop
4301	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4302
/*
 * .co_unaln_done: copy the residue in %i2 after the unaligned block
 * loops.  With 15 or fewer bytes left it goes to .co_unaln_short.
 * bleu has no nop after it: the andn in its delay slot executes on both
 * paths, and .co_unaln_short does not read %i3.
 *
 * Otherwise the residue is split the same way the block setup was:
 *   %i3 = (count rounded down to a multiple of 8) - 8, bytes to move in
 *         .co_unaln_by8;
 *   %i2 = (count & 7) + 8, bytes left for the byte-wise tail, held back
 *         so the look-ahead load does not read past the source;
 *   %o4 = source rounded down to 8 bytes; %i0 is advanced by %i3.
 * .co_unaln_by8 keeps the previous aligned doubleword in %d0, loads the
 * next into %d2, and uses faligndata to form each output doubleword,
 * which is stored through %asi.
 */
4303.co_unaln_done:
4304	! Handle trailing bytes, 64 to 127
4305	! Dest long word aligned, Src not long word aligned
4306	cmp	%i2, 15
4307	bleu	%ncc, .co_unaln_short
4308
4309	andn	%i2, 0x7, %i3		! %i3 is multiple of 8
4310	and	%i2, 0x7, %i2		! residue bytes in %i2
4311	add	%i2, 8, %i2
4312	sub	%i3, 8, %i3		! insure we don't load past end of src
4313	andn	%i0, 0x7, %o4		! %o4 has long word aligned src address
4314	add	%i0, %i3, %i0		! advance %i0 to after multiple of 8
4315	ldd	[%o4], %d0		! fetch partial word
4316.co_unaln_by8:
4317	ldd	[%o4+8], %d2
4318	add	%o4, 8, %o4
4319	faligndata %d0, %d2, %d16
4320	subcc	%i3, 8, %i3
4321	stda	%d16, [%i1]%asi
4322	fmovd	%d2, %d0
4323	bgu,pt	%ncc, .co_unaln_by8
4324	add	%i1, 8, %i1
4325
4326.co_unaln_short:
4327	cmp	%i2, 8
4328	blt,pt	%ncc, .co_unalnfin
4329	nop
4330	ldub	[%i0], %o4
4331	sll	%o4, 24, %o3
4332	ldub	[%i0+1], %o4
4333	sll	%o4, 16, %o4
4334	or	%o4, %o3, %o3
4335	ldub	[%i0+2], %o4
4336	sll	%o4, 8, %o4
4337	or	%o4, %o3, %o3
4338	ldub	[%i0+3], %o4
4339	or	%o4, %o3, %o3
4340	stwa	%o3, [%i1]%asi
4341	ldub	[%i0+4], %o4
4342	sll	%o4, 24, %o3
4343	ldub	[%i0+5], %o4
4344	sll	%o4, 16, %o4
4345	or	%o4, %o3, %o3
4346	ldub	[%i0+6], %o4
4347	sll	%o4, 8, %o4
4348	or	%o4, %o3, %o3
4349	ldub	[%i0+7], %o4
4350	or	%o4, %o3, %o3
4351	stwa	%o3, [%i1+4]%asi
4352	add	%i0, 8, %i0
4353	add	%i1, 8, %i1
4354	sub	%i2, 8, %i2
4355.co_unalnfin:
4356	cmp	%i2, 4
4357	blt,pt	%ncc, .co_unalnz
4358	tst	%i2
4359	ldub	[%i0], %o3		! read byte
4360	subcc	%i2, 4, %i2		! reduce count by 4
4361	sll	%o3, 24, %o3		! position
4362	ldub	[%i0+1], %o4
4363	sll	%o4, 16, %o4		! position
4364	or	%o4, %o3, %o3		! merge
4365	ldub	[%i0+2], %o4
4366	sll	%o4, 8, %o4		! position
4367	or	%o4, %o3, %o3		! merge
4368	add	%i1, 4, %i1		! advance dst by 4
4369	ldub	[%i0+3], %o4
4370	add	%i0, 4, %i0		! advance src by 4
4371	or	%o4, %o3, %o4		! merge
4372	bnz,pt	%ncc, .co_unaln3x
4373	stwa	%o4, [%i1-4]%asi
4374	ba	.co_exit
4375	nop
4376.co_unalnz:
4377	bz,pt	%ncc, .co_exit
4378	wr	%l5, %g0, %gsr		! restore %gsr
4379.co_unaln3x:				! Exactly 1, 2, or 3 bytes remain
4380	subcc	%i2, 1, %i2		! reduce count for cc test
4381	ldub	[%i0], %o4		! load one byte
4382	bz,pt	%ncc, .co_exit
4383	stba	%o4, [%i1]%asi		! store one byte
4384	ldub	[%i0+1], %o4		! load second byte
4385	subcc	%i2, 1, %i2
4386	bz,pt	%ncc, .co_exit
4387	stba	%o4, [%i1+1]%asi	! store second byte
4388	ldub	[%i0+2], %o4		! load third byte
4389	stba	%o4, [%i1+2]%asi	! store third byte
4390.co_exit:
4391	brnz	%g1, .co_fp_restore
4392	nop
4393	FZERO
4394	wr	%g1, %g0, %fprs
4395	ba,pt	%ncc, .co_ex2
4396	membar	#Sync
4397.co_fp_restore:
4398	BLD_FP_FROMSTACK(%o4)
4399.co_ex2:
4400	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
4401	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
4402	ret
4403	restore %g0, 0, %o0
4404
4405.copyout_err:
4406	ldn	[THREAD_REG + T_COPYOPS], %o4
4407	brz	%o4, 2f
4408	nop
4409	ldn	[%o4 + CP_COPYOUT], %g2
4410	jmp	%g2
4411	nop
44122:
4413	retl
4414	mov	-1, %o0
4415
4416#else	/* NIAGARA_IMPL */
4417.do_copyout:
4418	!
4419	! Check the length and bail if zero.
4420	!
4421	tst	%o2
4422	bnz,pt	%ncc, 1f
4423	nop
4424	retl
4425	clr	%o0
44261:
4427	sethi	%hi(copyio_fault), %o4
4428	or	%o4, %lo(copyio_fault), %o4
4429	sethi	%hi(copyio_fault_nowindow), %o3
4430	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
4431	or	%o3, %lo(copyio_fault_nowindow), %o3
4432	membar	#Sync
4433	stn	%o3, [THREAD_REG + T_LOFAULT]
4434
4435	mov	%o0, SAVE_SRC
4436	mov	%o1, SAVE_DST
4437	mov	%o2, SAVE_COUNT
4438
4439	!
4440	! Check to see if we're more than SMALL_LIMIT (7 bytes).
4441	! Run in leaf mode, using the %o regs as our input regs.
4442	!
4443	subcc	%o2, SMALL_LIMIT, %o3
4444	bgu,a,pt %ncc, .dco_ns
4445	or	%o0, %o1, %o3
4446	!
4447	! What was previously ".small_copyout"
4448	! Do full differenced copy.
4449	!
4450.dcobcp:
4451	sub	%g0, %o2, %o3		! negate count
4452	add	%o0, %o2, %o0		! make %o0 point at the end
4453	add	%o1, %o2, %o1		! make %o1 point at the end
4454	ba,pt	%ncc, .dcocl
4455	ldub	[%o0 + %o3], %o4	! load first byte
4456	!
4457	! %o0 and %o1 point at the end and remain pointing at the end
4458	! of their buffers. We pull things out by adding %o3 (which is
4459	! the negation of the length) to the buffer end which gives us
4460	! the current location in the buffers. By incrementing %o3 we walk
4461	! through both buffers without having to bump each buffer's
4462	! pointer. A very fast 4 instruction loop.
4463	!
4464	.align 16
4465.dcocl:
4466	stba	%o4, [%o1 + %o3]ASI_USER
4467	inccc	%o3
4468	bl,a,pt	%ncc, .dcocl
4469	ldub	[%o0 + %o3], %o4
4470	!
4471	! We're done. Go home.
4472	!
4473	membar	#Sync
4474	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
4475	retl
4476	clr	%o0
4477	!
4478	! Try aligned copies from here.
4479	!
4480.dco_ns:
4481	! %o0 = kernel addr (to be copied from)
4482	! %o1 = user addr (to be copied to)
4483	! %o2 = length
4484	! %o3 = %o0 | %o1 (used for alignment checking)
4485	! %o4 is alternate lo_fault
4486	! %o5 is original lo_fault
4487	!
4488	! See if we're single byte aligned. If we are, check the
4489	! limit for single byte copies. If we're smaller or equal,
4490	! bounce to the byte for byte copy loop. Otherwise do it in
4491	! HW (if enabled).
4492	!
4493	btst	1, %o3
4494	bz,pt	%icc, .dcoh8
4495	btst	7, %o3
4496	!
4497	! Single byte aligned. Do we do it via HW or via
4498	! byte for byte? Do a quick no memory reference
4499	! check to pick up small copies.
4500	!
4501	sethi	%hi(hw_copy_limit_1), %o3
4502	!
4503	! Big enough that we need to check the HW limit for
4504	! this size copy.
4505	!
4506	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
4507	!
4508	! Is HW copy on? If not, do everything byte for byte.
4509	!
4510	tst	%o3
4511	bz,pn	%icc, .dcobcp
4512	subcc	%o3, %o2, %o3
4513	!
4514	! If we're less than or equal to the single byte copy limit,
4515	! bop to the copy loop.
4516	!
4517	bge,pt	%ncc, .dcobcp
4518	nop
4519	!
4520	! We're big enough and copy is on. Do it with HW.
4521	!
4522	ba,pt	%ncc, .big_copyout
4523	nop
4524.dcoh8:
4525	!
4526	! 8 byte aligned?
4527	!
4528	bnz,a	%ncc, .dcoh4
4529	btst	3, %o3
4530	!
4531	! See if we're in the "small range".
4532	! If so, go off and do the copy.
4533	! If not, load the hard limit. %o3 is
4534	! available for reuse.
4535	!
4536	sethi	%hi(hw_copy_limit_8), %o3
4537	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
4538	!
4539	! If it's zero, there's no HW bcopy.
4540	! Bop off to the aligned copy.
4541	!
4542	tst	%o3
4543	bz,pn	%icc, .dcos8
4544	subcc	%o3, %o2, %o3
4545	!
4546	! We're negative if our size is larger than hw_copy_limit_8.
4547	!
4548	bge,pt	%ncc, .dcos8
4549	nop
4550	!
4551	! HW assist is on and we're large enough. Do it.
4552	!
4553	ba,pt	%ncc, .big_copyout
4554	nop
4555.dcos8:
4556	!
4557	! Housekeeping for copy loops. Uses same idea as in the byte for
4558	! byte copy loop above.
4559	!
4560	add	%o0, %o2, %o0
4561	add	%o1, %o2, %o1
4562	sub	%g0, %o2, %o3
4563	ba,pt	%ncc, .dodebc
4564	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
4565	!
4566	! 4 byte aligned?
4567	!
4568.dcoh4:
4569	bnz,pn	%ncc, .dcoh2
4570	!
4571	! See if we're in the "small range".
4572	! If so, go off and do the copy.
4573	! If not, load the hard limit. %o3 is
4574	! available for reuse.
4575	!
4576	sethi	%hi(hw_copy_limit_4), %o3
4577	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
4578	!
4579	! If it's zero, there's no HW bcopy.
4580	! Bop off to the aligned copy.
4581	!
4582	tst	%o3
4583	bz,pn	%icc, .dcos4
4584	subcc	%o3, %o2, %o3
4585	!
4586	! We're negative if our size is larger than hw_copy_limit_4.
4587	!
4588	bge,pt	%ncc, .dcos4
4589	nop
4590	!
4591	! HW assist is on and we're large enough. Do it.
4592	!
4593	ba,pt	%ncc, .big_copyout
4594	nop
4595.dcos4:
4596	add	%o0, %o2, %o0
4597	add	%o1, %o2, %o1
4598	sub	%g0, %o2, %o3
4599	ba,pt	%ncc, .dodfbc
4600	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
4601	!
4602	! We must be 2 byte aligned. Off we go.
4603	! The check for small copies was done in the
4604	! delay at .dcoh4
4605	!
4606.dcoh2:
4607	ble	%ncc, .dcos2
4608	sethi	%hi(hw_copy_limit_2), %o3
4609	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
4610	tst	%o3
4611	bz,pn	%icc, .dcos2
4612	subcc	%o3, %o2, %o3
4613	bge,pt	%ncc, .dcos2
4614	nop
4615	!
4616	! HW is on and we're big enough. Do it.
4617	!
4618	ba,pt	%ncc, .big_copyout
4619	nop
4620.dcos2:
4621	add	%o0, %o2, %o0
4622	add	%o1, %o2, %o1
4623	sub	%g0, %o2, %o3
4624	ba,pt	%ncc, .dodtbc
4625	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
4626.small_copyout:
4627	!
4628	! Why are we doing this AGAIN? There are certain conditions in
4629	! big_copyout that will cause us to forego the HW assisted copies
4630	! and bounce back to a non-HW assisted copy. This dispatches those
4631	! copies. Note that we branch around this in the main line code.
4632	!
4633	! We make no check for limits or HW enablement here. We've
4634	! already been told that we're a poster child so just go off
4635	! and do it.
4636	!
4637	or	%o0, %o1, %o3
4638	btst	1, %o3
4639	bnz	%icc, .dcobcp		! Most likely
4640	btst	7, %o3
4641	bz	%icc, .dcos8
4642	btst	3, %o3
4643	bz	%icc, .dcos4
4644	nop
4645	ba,pt	%ncc, .dcos2
4646	nop
4647	.align 32
4648.dodebc:
4649	ldx	[%o0 + %o3], %o4
4650	deccc	%o2
4651	stxa	%o4, [%o1 + %o3]ASI_USER
4652	bg,pt	%ncc, .dodebc
4653	addcc	%o3, 8, %o3
4654	!
4655	! End of copy loop. Check to see if we're done. Most
4656	! eight byte aligned copies end here.
4657	!
4658	bz,pt	%ncc, .dcofh
4659	nop
4660	!
4661	! Something is left - do it byte for byte.
4662	!
4663	ba,pt	%ncc, .dcocl
4664	ldub	[%o0 + %o3], %o4	! load next byte
4665	!
4666	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
4667	!
4668	.align 32
4669.dodfbc:
4670	lduw	[%o0 + %o3], %o4
4671	deccc	%o2
4672	sta	%o4, [%o1 + %o3]ASI_USER
4673	bg,pt	%ncc, .dodfbc
4674	addcc	%o3, 4, %o3
4675	!
4676	! End of copy loop. Check to see if we're done. Most
4677	! four byte aligned copies end here.
4678	!
4679	bz,pt	%ncc, .dcofh
4680	nop
4681	!
4682	! Something is left. Do it byte for byte.
4683	!
4684	ba,pt	%ncc, .dcocl
4685	ldub	[%o0 + %o3], %o4	! load next byte
4686	!
4687	! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
4688	! copy.
4689	!
4690	.align 32
4691.dodtbc:
4692	lduh	[%o0 + %o3], %o4
4693	deccc	%o2
4694	stha	%o4, [%o1 + %o3]ASI_USER
4695	bg,pt	%ncc, .dodtbc
4696	addcc	%o3, 2, %o3
4697	!
4698	! End of copy loop. Anything left?
4699	!
4700	bz,pt	%ncc, .dcofh
4701	nop
4702	!
4703	! Deal with the last byte
4704	!
4705	ldub	[%o0 + %o3], %o4
4706	stba	%o4, [%o1 + %o3]ASI_USER
4707.dcofh:
4708	membar	#Sync
4709	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
4710	retl
4711	clr	%o0
4712
4713.big_copyout:
4714	! We're going to go off and do a block copy.
4715	! Switch fault handlers and grab a window. We
4716	! don't do a membar #Sync since we've done only
4717	! kernel data to this point.
4718	stn	%o4, [THREAD_REG + T_LOFAULT]
4719
4720	! Copyouts that reach here are larger than 256 bytes. The
4721	! hw_copy_limit_1 is set to 256. Never set this limit to less
4722	! than 128 bytes.
4723	save	%sp, -SA(MINFRAME), %sp
4724.do_block_copyout:
4725
4726	! Swap src/dst since the code below is memcpy code
4727	! and memcpy/bcopy have different calling sequences
4728	mov	%i1, %i5
4729	mov	%i0, %i1
4730	mov	%i5, %i0
4731
4732	! Block (64 bytes) align the destination.
4733	andcc	%i0, 0x3f, %i3		! is dst block aligned
4734	bz	%ncc, copyout_blalign	! dst already block aligned
4735	sub	%i3, 0x40, %i3
4736	neg	%i3			! bytes till dst 64 bytes aligned
4737	sub	%i2, %i3, %i2		! update i2 with new count
4738
4739	! Based on source and destination alignment do
4740	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
4741
4742	! Is dst & src 8B aligned
4743	or	%i0, %i1, %o2
4744	andcc	%o2, 0x7, %g0
4745	bz	%ncc, .co_alewdcp
4746	nop
4747
4748	! Is dst & src 4B aligned
4749	andcc	%o2, 0x3, %g0
4750	bz	%ncc, .co_alwdcp
4751	nop
4752
4753	! Is dst & src 2B aligned
4754	andcc	%o2, 0x1, %g0
4755	bz	%ncc, .co_alhlfwdcp
4756	nop
4757
4758	! 1B aligned
47591:	ldub	[%i1], %o2
4760	stba	%o2, [%i0]ASI_USER
4761	inc	%i1
4762	deccc	%i3
4763	bgu,pt	%ncc, 1b
4764	inc	%i0
4765
4766	ba	copyout_blalign
4767	nop
4768
4769	! dst & src 4B aligned
4770.co_alwdcp:
4771	ld	[%i1], %o2
4772	sta	%o2, [%i0]ASI_USER
4773	add	%i1, 0x4, %i1
4774	subcc	%i3, 0x4, %i3
4775	bgu,pt	%ncc, .co_alwdcp
4776	add	%i0, 0x4, %i0
4777
4778	ba	copyout_blalign
4779	nop
4780
4781	! dst & src 2B aligned
4782.co_alhlfwdcp:
4783	lduh	[%i1], %o2
4784	stuha	%o2, [%i0]ASI_USER
4785	add	%i1, 0x2, %i1
4786	subcc	%i3, 0x2, %i3
4787	bgu,pt	%ncc, .co_alhlfwdcp
4788	add	%i0, 0x2, %i0
4789
4790	ba	copyout_blalign
4791	nop
4792
4793	! dst & src 8B aligned
4794.co_alewdcp:
4795	ldx	[%i1], %o2
4796	stxa	%o2, [%i0]ASI_USER
4797	add	%i1, 0x8, %i1
4798	subcc	%i3, 0x8, %i3
4799	bgu,pt	%ncc, .co_alewdcp
4800	add	%i0, 0x8, %i0
4801
4802	! Now Destination is block (64 bytes) aligned
4803copyout_blalign:
4804	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
4805	sub	%i2, %i3, %i2		! Residue bytes in %i2
4806
4807	mov	ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
4808
4809	andcc	%i1, 0xf, %o2		! is src quadword aligned
4810	bz,pn	%xcc, .co_blkcpy	! src offset in %o2 (last 4-bits)
4811	nop
4812	cmp	%o2, 0x8
4813	bg	.co_upper_double
4814	nop
4815	bl	.co_lower_double
4816	nop
4817
4818	! Falls through when source offset is equal to 8 i.e.
4819	! source is double word aligned.
4820	! In this case no shift/merge of data is required
4821
4822	sub	%i1, %o2, %i1		! align the src at 16 bytes.
4823	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
4824	prefetch [%l0+0x0], #one_read
4825	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4826.co_loop0:
4827	add	%i1, 0x10, %i1
4828	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4829	prefetch [%l0+0x40], #one_read
4830
4831	stxa	%l3, [%i0+0x0]%asi
4832	stxa	%l4, [%i0+0x8]%asi
4833
4834	add	%i1, 0x10, %i1
4835	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4836
4837	stxa	%l5, [%i0+0x10]%asi
4838	stxa	%l2, [%i0+0x18]%asi
4839
4840	add	%i1, 0x10, %i1
4841	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4842
4843	stxa	%l3, [%i0+0x20]%asi
4844	stxa	%l4, [%i0+0x28]%asi
4845
4846	add	%i1, 0x10, %i1
4847	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4848
4849	stxa	%l5, [%i0+0x30]%asi
4850	stxa	%l2, [%i0+0x38]%asi
4851
4852	add	%l0, 0x40, %l0
4853	subcc	%i3, 0x40, %i3
4854	bgu,pt	%xcc, .co_loop0
4855	add	%i0, 0x40, %i0
4856	ba	.co_blkdone
4857	add	%i1, %o2, %i1		! increment the source by src offset
4858					! the src offset was stored in %o2
4859
4860.co_lower_double:
4861
4862	sub	%i1, %o2, %i1		! align the src at 16 bytes.
4863	sll	%o2, 3, %o0		! %o0 left shift
4864	mov	0x40, %o1
4865	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
4866	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
4867	prefetch [%l0+0x0], #one_read
4868	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l2 and %l3 has
4869					! complete data
4870.co_loop1:
4871	add	%i1, 0x10, %i1
4872	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has partial data
4873							! for this read.
4874	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
4875							! into %l2 and %l3
4876	prefetch [%l0+0x40], #one_read
4877
4878	stxa	%l2, [%i0+0x0]%asi
4879	stxa	%l3, [%i0+0x8]%asi
4880
4881	add	%i1, 0x10, %i1
4882	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4883	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
4884							! %l4 from previous read
4885							! into %l4 and %l5
4886	stxa	%l4, [%i0+0x10]%asi
4887	stxa	%l5, [%i0+0x18]%asi
4888
4889	! Repeat the same for next 32 bytes.
4890
4891	add	%i1, 0x10, %i1
4892	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4893	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
4894
4895	stxa	%l2, [%i0+0x20]%asi
4896	stxa	%l3, [%i0+0x28]%asi
4897
4898	add	%i1, 0x10, %i1
4899	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4900	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
4901
4902	stxa	%l4, [%i0+0x30]%asi
4903	stxa	%l5, [%i0+0x38]%asi
4904
4905	add	%l0, 0x40, %l0
4906	subcc	%i3, 0x40, %i3
4907	bgu,pt	%xcc, .co_loop1
4908	add	%i0, 0x40, %i0
4909	ba	.co_blkdone
4910	add	%i1, %o2, %i1		! increment the source by src offset
4911					! the src offset was stored in %o2
4912
4913.co_upper_double:
4914
4915	sub	%i1, %o2, %i1		! align the src at 16 bytes.
4916	sub	%o2, 0x8, %o0
4917	sll	%o0, 3, %o0		! %o0 left shift
4918	mov	0x40, %o1
4919	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
4920	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
4921	prefetch [%l0+0x0], #one_read
4922	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l3
4923							! for this read and
4924							! no data in %l2
4925.co_loop2:
4926	add	%i1, 0x10, %i1
4927	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has complete data
4928							! and %l5 has partial
4929	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
4930							! into %l3 and %l4
4931	prefetch [%l0+0x40], #one_read
4932
4933	stxa	%l3, [%i0+0x0]%asi
4934	stxa	%l4, [%i0+0x8]%asi
4935
4936	add	%i1, 0x10, %i1
4937	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4938	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
4939							! %l5 from previous read
4940							! into %l5 and %l2
4941
4942	stxa	%l5, [%i0+0x10]%asi
4943	stxa	%l2, [%i0+0x18]%asi
4944
4945	! Repeat the same for next 32 bytes.
4946
4947	add	%i1, 0x10, %i1
4948	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4949	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
4950
4951	stxa	%l3, [%i0+0x20]%asi
4952	stxa	%l4, [%i0+0x28]%asi
4953
4954	add	%i1, 0x10, %i1
4955	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4956	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
4957
4958	stxa	%l5, [%i0+0x30]%asi
4959	stxa	%l2, [%i0+0x38]%asi
4960
4961	add	%l0, 0x40, %l0
4962	subcc	%i3, 0x40, %i3
4963	bgu,pt	%xcc, .co_loop2
4964	add	%i0, 0x40, %i0
4965	ba	.co_blkdone
4966	add	%i1, %o2, %i1		! increment the source by src offset
4967					! the src offset was stored in %o2
4968
4969
4970	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
4971.co_blkcpy:
4972
4973	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
4974	prefetch [%o0+0x0], #one_read
49751:
4976	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
4977	add	%i1, 0x10, %i1
4978	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4979	add	%i1, 0x10, %i1
4980
4981	prefetch [%o0+0x40], #one_read
4982
4983	stxa	%l0, [%i0+0x0]%asi
4984
4985	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4986	add	%i1, 0x10, %i1
4987	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
4988	add	%i1, 0x10, %i1
4989
4990	stxa	%l1, [%i0+0x8]%asi
4991	stxa	%l2, [%i0+0x10]%asi
4992	stxa	%l3, [%i0+0x18]%asi
4993	stxa	%l4, [%i0+0x20]%asi
4994	stxa	%l5, [%i0+0x28]%asi
4995	stxa	%l6, [%i0+0x30]%asi
4996	stxa	%l7, [%i0+0x38]%asi
4997
4998	add	%o0, 0x40, %o0
4999	subcc	%i3, 0x40, %i3
5000	bgu,pt	%xcc, 1b
5001	add	%i0, 0x40, %i0
5002
5003.co_blkdone:
5004	membar	#Sync
5005
5006	brz,pt	%i2, .copyout_exit
5007	nop
5008
5009	! Handle trailing bytes
5010	cmp	%i2, 0x8
5011	blu,pt	%ncc, .co_residue
5012	nop
5013
5014	! Can we do some 8B ops
5015	or	%i1, %i0, %o2
5016	andcc	%o2, 0x7, %g0
5017	bnz	%ncc, .co_last4
5018	nop
5019
5020	! Do 8byte ops as long as possible
5021.co_last8:
5022	ldx	[%i1], %o2
5023	stxa	%o2, [%i0]ASI_USER
5024	add	%i1, 0x8, %i1
5025	sub	%i2, 0x8, %i2
5026	cmp	%i2, 0x8
5027	bgu,pt	%ncc, .co_last8
5028	add	%i0, 0x8, %i0
5029
5030	brz,pt	%i2, .copyout_exit
5031	nop
5032
5033	ba	.co_residue
5034	nop
5035
5036.co_last4:
5037	! Can we do 4B ops
5038	andcc	%o2, 0x3, %g0
5039	bnz	%ncc, .co_last2
5040	nop
50411:
5042	ld	[%i1], %o2
5043	sta	%o2, [%i0]ASI_USER
5044	add	%i1, 0x4, %i1
5045	sub	%i2, 0x4, %i2
5046	cmp	%i2, 0x4
5047	bgu,pt	%ncc, 1b
5048	add	%i0, 0x4, %i0
5049
5050	brz,pt	%i2, .copyout_exit
5051	nop
5052
5053	ba	.co_residue
5054	nop
5055
5056.co_last2:
5057	! Can we do 2B ops
5058	andcc	%o2, 0x1, %g0
5059	bnz	%ncc, .co_residue
5060	nop
5061
50621:
5063	lduh	[%i1], %o2
5064	stuha	%o2, [%i0]ASI_USER
5065	add	%i1, 0x2, %i1
5066	sub	%i2, 0x2, %i2
5067	cmp	%i2, 0x2
5068	bgu,pt	%ncc, 1b
5069	add	%i0, 0x2, %i0
5070
5071	brz,pt	%i2, .copyout_exit
5072	nop
5073
5074	! Copy the residue as byte copy
5075.co_residue:
5076	ldub	[%i1], %i4
5077	stba	%i4, [%i0]ASI_USER
5078	inc	%i1
5079	deccc	%i2
5080	bgu,pt	%xcc, .co_residue
5081	inc	%i0
5082
5083.copyout_exit:
5084	membar	#Sync
5085	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
5086	ret
5087	restore	%g0, 0, %o0
5088
5089.copyout_err:
5090	ldn	[THREAD_REG + T_COPYOPS], %o4
5091	brz	%o4, 2f
5092	nop
5093	ldn	[%o4 + CP_COPYOUT], %g2
5094	jmp	%g2
5095	nop
50962:
5097	retl
5098	mov	-1, %o0
5099#endif	/* NIAGARA_IMPL */
5100	SET_SIZE(copyout)
5101
5102#endif	/* lint */
5103
5104
5105#ifdef	lint
5106
/*
 * Lint-only stand-in for xcopyout(): built only when `lint' is defined,
 * so that lint sees a C signature. It ignores its arguments and returns 0.
 * The real routine is the assembly ENTRY(xcopyout) in the non-lint branch.
 */
5107/*ARGSUSED*/
5108int
5109xcopyout(const void *kaddr, void *uaddr, size_t count)
5110{ return (0); }
5111
5112#else	/* lint */
5113
/*
 * xcopyout(kaddr, uaddr, count)
 *
 * Thin entry point: it builds the address of .xcopyout_err in
 * REAL_LOFAULT and branches to .do_copyout, the copy body that lives
 * inside copyout. All of the copying is done there.
 *
 * .xcopyout_err is the error exit. If the current thread has a
 * t_copyops vector, control is handed to its CP_XCOPYOUT entry with a
 * jmp (no return here). Otherwise the value found in %g1 is returned
 * in %o0.
 * NOTE(review): %g1 is not set in this block; presumably the fault path
 * leaves the error code there before reaching .xcopyout_err -- confirm.
 */
5114	ENTRY(xcopyout)
5115	sethi	%hi(.xcopyout_err), REAL_LOFAULT	! upper bits of handler addr
5116	b	.do_copyout				! shared copy body
5117	or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT	! delay slot: low bits
5118.xcopyout_err:
5119	ldn	[THREAD_REG + T_COPYOPS], %o4	! %o4 = thread copyops vector
5120	brz	%o4, 2f				! none installed: plain error return
5121	nop
5122	ldn	[%o4 + CP_XCOPYOUT], %g2	! %g2 = copyops xcopyout entry
5123	jmp	%g2				! hand off; does not come back here
5124	nop
51252:
5126	retl
5127	mov	%g1, %o0			! delay slot: return value from %g1
5128	SET_SIZE(xcopyout)
5129
5130#endif	/* lint */
5131
5132#ifdef	lint
5133
/*
 * Lint-only stand-in for xcopyout_little(): built only when `lint' is
 * defined, so that lint sees a C signature. It ignores its arguments and
 * returns 0. The real routine is the assembly ENTRY(xcopyout_little) in
 * the non-lint branch.
 */
5134/*ARGSUSED*/
5135int
5136xcopyout_little(const void *kaddr, void *uaddr, size_t count)
5137{ return (0); }
5138
5139#else	/* lint */
5140
/*
 * xcopyout_little(kaddr, uaddr, count)
 *
 * In:	%o0 = kaddr (source), %o1 = uaddr (destination), %o2 = count
 * Out:	%o0 = 0 on the normal path
 *
 * Byte-at-a-time copy whose stores go through ASI_AIUSL. The source is
 * read from its last byte backward while the destination is written
 * from its first byte forward, so the bytes arrive in reverse order.
 *
 * t_lofault is pointed at .little_err for the duration of the copy and
 * the previous value is kept in %o5 and restored on exit. .little_err is
 * not defined in this block. A zero count skips the loop entirely.
 *
 * Register roles inside the loop:
 *	%o3 = running offset, starts at -count and counts up to 0
 *	%o0 + %o3 = current source byte (moves back one byte per pass)
 *	%o1 + %o3 = current destination byte (moves forward one per pass)
 *	%o4 = byte in flight
 */
5141	ENTRY(xcopyout_little)
5142	sethi	%hi(.little_err), %o4
5143	ldn	[THREAD_REG + T_LOFAULT], %o5	! %o5 = previous t_lofault
5144	or	%o4, %lo(.little_err), %o4	! %o4 = address of .little_err
5145	membar	#Sync			! sync error barrier
5146	stn	%o4, [THREAD_REG + T_LOFAULT]	! install .little_err
5147
5148	subcc	%g0, %o2, %o3		! %o3 = -count; Z set if count == 0
5149	add	%o0, %o2, %o0		! %o0 = kaddr + count
5150	bz,pn	%ncc, 2f		! check for zero bytes
5151	sub	%o2, 1, %o4		! delay slot: %o4 = count - 1
5152	add	%o0, %o4, %o0		! start w/last byte: %o0 + %o3 = kaddr + count - 1
5153	add	%o1, %o2, %o1		! %o1 + %o3 = uaddr (first dest byte)
5154	ldub	[%o0+%o3], %o4		! fetch last source byte
5155
51561:	stba	%o4, [%o1+%o3]ASI_AIUSL	! store byte to user address
5157	inccc	%o3			! next offset; carry set when it reaches 0
5158	sub	%o0, 2, %o0		! get next byte (net -1 once %o3 is added)
5159	bcc,a,pt %ncc, 1b		! loop while offset has not wrapped to 0
5160	ldub	[%o0+%o3], %o4		! annulled delay slot: runs only if looping
5161
51622:	membar	#Sync			! sync error barrier
5163	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
5164	retl
5165	mov	%g0, %o0		! return (0)
5166	SET_SIZE(xcopyout_little)
5167
5168#endif	/* lint */
5169
5170/*
5171 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
5172 */
5173
5174#if defined(lint)
5175
/*
 * Lint-only stand-in for copyin(): built only when `lint' is defined, so
 * that lint sees a C signature. It ignores its arguments and returns 0.
 * The real routine is the assembly ENTRY(copyin) in the non-lint branch.
 */
5176/*ARGSUSED*/
5177int
5178copyin(const void *uaddr, void *kaddr, size_t count)
5179{ return (0); }
5180
5181#else	/* lint */
5182
5183	ENTRY(copyin)
5184	sethi	%hi(.copyin_err), REAL_LOFAULT
5185	or	REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
5186
5187#if !defined(NIAGARA_IMPL)
5188.do_copyin:
5189	tst	%o2			! check for zero count;  quick exit
5190	bz,pt	%ncc, .ci_smallqx
5191	mov	%o0, SAVE_SRC
5192	mov	%o1, SAVE_DST
5193	mov	%o2, SAVE_COUNT
5194	cmp	%o2, FP_COPY		! check for small copy/leaf case
5195	bgt,pt	%ncc, .ci_copy_more
5196	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
5197/*
5198 * Small copy in code
5199 *
5200 */
5201	sethi	%hi(copyio_fault_nowindow), %o3
5202	or	%o3, %lo(copyio_fault_nowindow), %o3
5203	membar	#Sync
5204	stn	%o3, [THREAD_REG + T_LOFAULT]
5205
5206	mov	ASI_USER, %asi
5207	cmp	%o2, SHORTCOPY		! make sure there is enough to align
5208	ble,pt	%ncc, .ci_smallest
5209	andcc	%o1, 0x7, %o3		! is dest long word aligned
5210	bnz,pn	%ncc, .ci_align
5211	andcc	%o1, 1, %o3		! is dest byte aligned
5212
5213! Destination is long word aligned
5214.ci_al_src:
5215	andcc	%o0, 7, %o3
5216	brnz,pt	%o3, .ci_src_dst_unal8
5217	nop
5218/*
5219 * Special case for handling when src and dest are both long word aligned
5220 * and total data to move is less than FP_COPY bytes
5221 * Also handles finish up for large block moves, so may be less than 32 bytes
5222 */
5223.ci_medlong:
5224	subcc	%o2, 31, %o2		! adjust length to allow cc test
5225	ble,pt	%ncc, .ci_medl31
5226	nop
5227.ci_medl32:
5228	ldxa	[%o0]%asi, %o4		! move 32 bytes
5229	subcc	%o2, 32, %o2		! decrement length count by 32
5230	stx	%o4, [%o1]
5231	ldxa	[%o0+8]%asi, %o4
5232	stx	%o4, [%o1+8]
5233	ldxa	[%o0+16]%asi, %o4
5234	add	%o0, 32, %o0		! increase src ptr by 32
5235	stx	%o4, [%o1+16]
5236	ldxa	[%o0-8]%asi, %o4
5237	add	%o1, 32, %o1		! increase dst ptr by 32
5238	bgu,pt	%ncc, .ci_medl32	! repeat if at least 32 bytes left
5239	stx	%o4, [%o1-8]
5240.ci_medl31:
5241	addcc	%o2, 24, %o2		! adjust count to be off by 7
5242	ble,pt	%ncc, .ci_medl7		! skip if 7 or fewer bytes left
5243	nop
5244.ci_medl8:
5245	ldxa	[%o0]%asi, %o4		! move 8 bytes
5246	add	%o0, 8, %o0		! increase src ptr by 8
5247	subcc	%o2, 8, %o2		! decrease count by 8
5248	add	%o1, 8, %o1		! increase dst ptr by 8
5249	bgu,pt	%ncc, .ci_medl8
5250	stx	%o4, [%o1-8]
5251.ci_medl7:
5252	addcc	%o2, 7, %o2		! finish adjustment of remaining count
5253	bnz,pt	%ncc, .ci_small4	! do final bytes if not finished
5254	nop
5255.ci_smallx:				! finish up and exit
5256	membar	#Sync
5257	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5258.ci_smallqx:
5259	retl
5260	mov	%g0, %o0
5261
5262.ci_small4:
5263	cmp	%o2, 4
5264	blt,pt	%ncc, .ci_small3x	! skip if less than 4 bytes left
5265	nop				!
5266	lda	[%o0]%asi, %o4		! move 4 bytes
5267	add	%o0, 4, %o0		! increase src ptr by 4
5268	add	%o1, 4, %o1		! increase dst ptr by 4
5269	subcc	%o2, 4, %o2		! decrease count by 4
5270	bz	%ncc, .ci_smallx
5271	stw	%o4, [%o1-4]
5272
5273.ci_small3x:				! Exactly 1, 2, or 3 bytes remain
5274	subcc	%o2, 1, %o2		! reduce count for cc test
5275	lduba	[%o0]%asi, %o4		! load one byte
5276	bz,pt	%ncc, .ci_smallx
5277	stb	%o4, [%o1]		! store one byte
5278	lduba	[%o0+1]%asi, %o4	! load second byte
5279	subcc	%o2, 1, %o2
5280	bz,pt	%ncc, .ci_smallx
5281	stb	%o4, [%o1+1]		! store second byte
5282	lduba	[%o0+2]%asi, %o4	! load third byte
5283	ba	.ci_smallx
5284	stb	%o4, [%o1+2]		! store third byte
5285
5286.ci_smallest:				! 7 or fewer bytes remain
5287	cmp	%o2, 4
5288	blt,pt	%ncc, .ci_small3x
5289	nop
5290	lduba	[%o0]%asi, %o4		! read byte
5291	subcc	%o2, 4, %o2		! reduce count by 4
5292	stb	%o4, [%o1]		! write byte
5293	lduba	[%o0+1]%asi, %o4	! repeat for total of 4 bytes
5294	add	%o0, 4, %o0		! advance src by 4
5295	stb	%o4, [%o1+1]
5296	lduba	[%o0-2]%asi, %o4
5297	add	%o1, 4, %o1		! advance dst by 4
5298	stb	%o4, [%o1-2]
5299	lduba	[%o0-1]%asi, %o4
5300	bnz,pt	%ncc, .ci_small3x
5301	stb	%o4, [%o1-1]
5302	membar	#Sync
5303	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5304	retl
5305	mov	%g0, %o0
5306
5307.ci_align:
5308	bnz,pt	%ncc, .ci_al_d1
5309.ci_al_d1f:				! dest is now half word aligned
5310	andcc	%o1, 2, %o3		! is dest word aligned
5311	bnz,pt	%ncc, .ci_al_d2
5312.ci_al_d2f:				! dest is now word aligned
5313	andcc	%o1, 4, %o3		! is dest longword aligned?
5314	bz,pt	%ncc, .ci_al_src
5315	nop
5316.ci_al_d4:				! dest is word aligned;  src is unknown
5317	lduba	[%o0]%asi, %o4		! move a word (src align unknown)
5318	lduba	[%o0+1]%asi, %o3
5319	sll	%o4, 24, %o4		! position
5320	sll	%o3, 16, %o3		! position
5321	or	%o4, %o3, %o3		! merge
5322	lduba	[%o0+2]%asi, %o4
5323	sll	%o4, 8, %o4		! position
5324	or	%o4, %o3, %o3		! merge
5325	lduba	[%o0+3]%asi, %o4
5326	or	%o4, %o3, %o4		! merge
5327	stw	%o4,[%o1]		! store four bytes
5328	add	%o0, 4, %o0		! adjust src by 4
5329	add	%o1, 4, %o1		! adjust dest by 4
5330	sub	%o2, 4, %o2		! adjust count by 4
5331	andcc	%o0, 7, %o3		! check for src long word alignment
5332	brz,pt	%o3, .ci_medlong
5333.ci_src_dst_unal8:
5334	! dst is 8-byte aligned, src is not
5335	! Size is less than FP_COPY
5336	! Following code is to select for alignment
5337	andcc	%o0, 0x3, %o3		! test word alignment
5338	bz,pt	%ncc, .ci_medword
5339	nop
5340	andcc	%o0, 0x1, %o3		! test halfword alignment
5341	bnz,pt	%ncc, .ci_med_byte	! go to byte move if not halfword
5342	andcc	%o0, 0x2, %o3		! test which byte alignment
5343	ba	.ci_medhalf
5344	nop
5345.ci_al_d1:				! align dest to half word
5346	lduba	[%o0]%asi, %o4		! move a byte
5347	add	%o0, 1, %o0
5348	stb	%o4, [%o1]
5349	add	%o1, 1, %o1
5350	andcc	%o1, 2, %o3		! is dest word aligned
5351	bz,pt	%ncc, .ci_al_d2f
5352	sub	%o2, 1, %o2
5353.ci_al_d2:				! align dest to word
5354	lduba	[%o0]%asi, %o4		! move a half-word (src align unknown)
5355	lduba	[%o0+1]%asi, %o3
5356	sll	%o4, 8, %o4		! position
5357	or	%o4, %o3, %o4		! merge
5358	sth	%o4, [%o1]
5359	add	%o0, 2, %o0
5360	add	%o1, 2, %o1
5361	andcc	%o1, 4, %o3		! is dest longword aligned?
5362	bz,pt	%ncc, .ci_al_src
5363	sub	%o2, 2, %o2
5364	ba	.ci_al_d4
5365	nop
5366/*
5367 * Handle all cases where src and dest are aligned on word
5368 * boundaries. Use unrolled loops for better performance.
5369 * This option wins over standard large data move when
5370 * source and destination are in cache for medium
5371 * to short data moves.
5372 */
5373.ci_medword:
5374	subcc	%o2, 31, %o2		! adjust length to allow cc test
5375	ble,pt	%ncc, .ci_medw31
5376	nop
5377.ci_medw32:
5378	lda	[%o0]%asi, %o4		! move a block of 32 bytes
5379	stw	%o4, [%o1]
5380	lda	[%o0+4]%asi, %o4
5381	stw	%o4, [%o1+4]
5382	lda	[%o0+8]%asi, %o4
5383	stw	%o4, [%o1+8]
5384	lda	[%o0+12]%asi, %o4
5385	stw	%o4, [%o1+12]
5386	lda	[%o0+16]%asi, %o4
5387	stw	%o4, [%o1+16]
5388	lda	[%o0+20]%asi, %o4
5389	subcc	%o2, 32, %o2		! decrement length count
5390	stw	%o4, [%o1+20]
5391	lda	[%o0+24]%asi, %o4
5392	add	%o0, 32, %o0		! increase src ptr by 32
5393	stw	%o4, [%o1+24]
5394	lda	[%o0-4]%asi, %o4
5395	add	%o1, 32, %o1		! increase dst ptr by 32
5396	bgu,pt	%ncc, .ci_medw32	! repeat if at least 32 bytes left
5397	stw	%o4, [%o1-4]
5398.ci_medw31:
5399	addcc	%o2, 24, %o2		! adjust count to be off by 7
5400	ble,pt	%ncc, .ci_medw7		! skip if 7 or fewer bytes left
5401	nop				!
5402.ci_medw15:
5403	lda	[%o0]%asi, %o4		! move a block of 8 bytes
5404	subcc	%o2, 8, %o2		! decrement length count
5405	stw	%o4, [%o1]
5406	add	%o0, 8, %o0		! increase src ptr by 8
5407	lda	[%o0-4]%asi, %o4
5408	add	%o1, 8, %o1		! increase dst ptr by 8
5409	bgu,pt	%ncc, .ci_medw15
5410	stw	%o4, [%o1-4]
5411.ci_medw7:
5412	addcc	%o2, 7, %o2		! finish adjustment of remaining count
5413	bz,pt	%ncc, .ci_smallx	! exit if finished
5414	cmp	%o2, 4
5415	blt,pt	%ncc, .ci_small3x	! skip if less than 4 bytes left
5416	nop				!
5417	lda	[%o0]%asi, %o4		! move 4 bytes
5418	add	%o0, 4, %o0		! increase src ptr by 4
5419	add	%o1, 4, %o1		! increase dst ptr by 4
5420	subcc	%o2, 4, %o2		! decrease count by 4
5421	bnz	.ci_small3x
5422	stw	%o4, [%o1-4]
5423	membar	#Sync
5424	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5425	retl
5426	mov	%g0, %o0
5427
5428.ci_medhalf:
5429	subcc	%o2, 31, %o2		! adjust length to allow cc test
5430	ble,pt	%ncc, .ci_medh31
5431	nop
5432.ci_medh32:				! load and store block of 32 bytes
5433	subcc	%o2, 32, %o2		! decrement length count
5434
5435	lduha	[%o0]%asi, %o4		! move 32 bytes
5436	lduwa	[%o0+2]%asi, %o3
5437	sllx	%o4, 48, %o4
5438	sllx	%o3, 16, %o3
5439	or	%o4, %o3, %o3
5440	lduha	[%o0+6]%asi, %o4
5441	or	%o4, %o3, %o4
5442	stx	%o4, [%o1]
5443
5444	lduha	[%o0+8]%asi, %o4
5445	lduwa	[%o0+10]%asi, %o3
5446	sllx	%o4, 48, %o4
5447	sllx	%o3, 16, %o3
5448	or	%o4, %o3, %o3
5449	lduha	[%o0+14]%asi, %o4
5450	or	%o4, %o3, %o4
5451	stx	%o4, [%o1+8]
5452
5453	lduha	[%o0+16]%asi, %o4
5454	lduwa	[%o0+18]%asi, %o3
5455	sllx	%o4, 48, %o4
5456	sllx	%o3, 16, %o3
5457	or	%o4, %o3, %o3
5458	lduha	[%o0+22]%asi, %o4
5459	or	%o4, %o3, %o4
5460	stx	%o4, [%o1+16]
5461
5462	add	%o0, 32, %o0		! increase src ptr by 32
5463	add	%o1, 32, %o1		! increase dst ptr by 32
5464
5465	lduha	[%o0-8]%asi, %o4
5466	lduwa	[%o0-6]%asi, %o3
5467	sllx	%o4, 48, %o4
5468	sllx	%o3, 16, %o3
5469	or	%o4, %o3, %o3
5470	lduha	[%o0-2]%asi, %o4
5471	or	%o3, %o4, %o4
5472	bgu,pt	%ncc, .ci_medh32	! repeat if at least 32 bytes left
5473	stx	%o4, [%o1-8]
5474
! Tail of the halfword/word/halfword copy: moves 8 bytes at a time while
! more than 7 bytes remain, then an optional 4-byte group built from two
! halfwords, then hands any final 1-3 bytes to .ci_small3x.
! When nothing is left it restores SAVED_LOFAULT into the thread's
! t_lofault and returns 0 in %o0.
5475.ci_medh31:
5476	addcc	%o2, 24, %o2		! adjust count to be off by 7
5477	ble,pt	%ncc, .ci_medh7		! skip if 7 or fewer bytes left
5478	nop				!
5479.ci_medh15:
5480	lduha	[%o0]%asi, %o4		! move 8 bytes
5481	subcc	%o2, 8, %o2		! decrement length count
5482	lduwa	[%o0+2]%asi, %o3
5483	sllx	%o4, 48, %o4
5484	sllx	%o3, 16, %o3
5485	or	%o4, %o3, %o3
5486	add	%o1, 8, %o1		! increase dst ptr by 8
5487	lduha	[%o0+6]%asi, %o4
5488	add	%o0, 8, %o0		! increase src ptr by 8
5489	or	%o4, %o3, %o4
5490	bgu,pt	%ncc, .ci_medh15
5491	stx	%o4, [%o1-8]		! delay slot: dst ptr was already advanced
5492.ci_medh7:
5493	addcc	%o2, 7, %o2		! finish adjustment of remaining count
5494	bz,pt	%ncc, .ci_smallx	! exit if finished
5495	cmp	%o2, 4			! delay slot: sets cc for the blt below
5496	blt,pt	%ncc, .ci_small3x	! skip if less than 4 bytes left
5497	nop				!
5498	lduha	[%o0]%asi, %o4		! move 4 bytes as two halfwords
5499	sll	%o4, 16, %o4
5500	lduha	[%o0+2]%asi, %o3
5501	or	%o3, %o4, %o4
5502	subcc	%o2, 4, %o2
5503	add	%o0, 4, %o0
5504	add	%o1, 4, %o1
5505	bnz	.ci_small3x		! 1-3 bytes still to move
5506	stw	%o4, [%o1-4]		! delay slot: store the 4 bytes
5507	membar	#Sync
5508	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore saved t_lofault
5509	retl
5510	mov	%g0, %o0		! delay slot: return 0
5511
5512	.align 16
! Entry for the odd-address medium copy.  The condition codes set by the
! caller (outside this view) choose between the byte/half/word/byte load
! pattern below and the byte/word/half/byte pattern at .ci_medbh32a.
! The delay-slot subcc biases the count by 31 on both paths and sets the
! cc tested by the following ble (here or at .ci_medbh32a).
5513.ci_med_byte:
5514	bnz,pt	%ncc, .ci_medbh32a	! go to correct byte move
5515	subcc	%o2, 31, %o2		! adjust length to allow cc test
5516	ble,pt	%ncc, .ci_medb31
5517	nop
	! 32 bytes per pass; each doubleword is built from loads of
	! 1 + 2 + 4 + 1 bytes at offsets 0, 1, 3 and 7.
5518.ci_medb32:				! Alignment 1 or 5
5519	subcc	%o2, 32, %o2		! decrement length count
5520
5521	lduba	[%o0]%asi, %o4		! load and store a block of 32 bytes
5522	sllx	%o4, 56, %o3		! byte -> bits 63:56
5523	lduha	[%o0+1]%asi, %o4
5524	sllx	%o4, 40, %o4		! halfword -> bits 55:40
5525	or	%o4, %o3, %o3
5526	lduwa	[%o0+3]%asi, %o4
5527	sllx	%o4, 8, %o4		! word -> bits 39:8
5528	or	%o4, %o3, %o3
5529	lduba	[%o0+7]%asi, %o4	! last byte -> bits 7:0
5530	or	%o4, %o3, %o4
5531	stx	%o4, [%o1]
5532
5533	lduba	[%o0+8]%asi, %o4
5534	sllx	%o4, 56, %o3
5535	lduha	[%o0+9]%asi, %o4
5536	sllx	%o4, 40, %o4
5537	or	%o4, %o3, %o3
5538	lduwa	[%o0+11]%asi, %o4
5539	sllx	%o4, 8, %o4
5540	or	%o4, %o3, %o3
5541	lduba	[%o0+15]%asi, %o4
5542	or	%o4, %o3, %o4
5543	stx	%o4, [%o1+8]
5544
5545	lduba	[%o0+16]%asi, %o4
5546	sllx	%o4, 56, %o3
5547	lduha	[%o0+17]%asi, %o4
5548	sllx	%o4, 40, %o4
5549	or	%o4, %o3, %o3
5550	lduwa	[%o0+19]%asi, %o4
5551	sllx	%o4, 8, %o4
5552	or	%o4, %o3, %o3
5553	lduba	[%o0+23]%asi, %o4
5554	or	%o4, %o3, %o4
5555	stx	%o4, [%o1+16]
5556
5557	add	%o0, 32, %o0		! increase src ptr by 32
5558	add	%o1, 32, %o1		! increase dst ptr by 32
5559
	! Fourth group uses negative offsets: both pointers were already advanced.
5560	lduba	[%o0-8]%asi, %o4
5561	sllx	%o4, 56, %o3
5562	lduha	[%o0-7]%asi, %o4
5563	sllx	%o4, 40, %o4
5564	or	%o4, %o3, %o3
5565	lduwa	[%o0-5]%asi, %o4
5566	sllx	%o4, 8, %o4
5567	or	%o4, %o3, %o3
5568	lduba	[%o0-1]%asi, %o4
5569	or	%o4, %o3, %o4
5570	bgu,pt	%ncc, .ci_medb32	! repeat if at least 32 bytes left
5571	stx	%o4, [%o1-8]		! delay slot: store the fourth group
5572
! Tail of the byte/half/word/byte copy: 8 bytes per pass while more than
! 7 remain, then an optional 4-byte group (byte + halfword + byte), then
! .ci_small3x for a final 1-3 bytes.  .ci_medb7 is also the tail used by
! the .ci_medbh31/.ci_medbh15 code.  When nothing is left it restores
! SAVED_LOFAULT into the thread's t_lofault and returns 0 in %o0.
5573.ci_medb31:				! 31 or fewer bytes remaining
5574	addcc	%o2, 24, %o2		! adjust count to be off by 7
5575	ble,pt	%ncc, .ci_medb7		! skip if 7 or fewer bytes left
5576	nop				!
5577.ci_medb15:
5578
5579	lduba	[%o0]%asi, %o4		! load and store a block of 8 bytes
5580	subcc	%o2, 8, %o2		! decrement length count
5581	sllx	%o4, 56, %o3
5582	lduha	[%o0+1]%asi, %o4
5583	sllx	%o4, 40, %o4
5584	or	%o4, %o3, %o3
5585	lduwa	[%o0+3]%asi, %o4
5586	add	%o1, 8, %o1		! increase dst ptr by 8
5587	sllx	%o4, 8, %o4
5588	or	%o4, %o3, %o3
5589	lduba	[%o0+7]%asi, %o4
5590	add	%o0, 8, %o0		! increase src ptr by 8
5591	or	%o4, %o3, %o4
5592	bgu,pt	%ncc, .ci_medb15
5593	stx	%o4, [%o1-8]		! delay slot: dst ptr was already advanced
5594.ci_medb7:
5595	addcc	%o2, 7, %o2		! finish adjustment of remaining count
5596	bz,pt	%ncc, .ci_smallx	! exit if finished
5597	cmp	%o2, 4			! delay slot: sets cc for the blt below
5598	blt,pt	%ncc, .ci_small3x	! skip if less than 4 bytes left
5599	nop				!
5600	lduba	[%o0]%asi, %o4		! move 4 bytes
5601	sll	%o4, 24, %o3
5602	lduha	[%o0+1]%asi, %o4
5603	sll	%o4, 8, %o4
5604	or	%o4, %o3, %o3
5605	lduba	[%o0+3]%asi, %o4
5606	or	%o4, %o3, %o4
5607	subcc	%o2, 4, %o2
5608	add	%o0, 4, %o0
5609	add	%o1, 4, %o1
5610	bnz	.ci_small3x		! 1-3 bytes still to move
5611	stw	%o4, [%o1-4]		! delay slot: store the 4 bytes
5612	membar	#Sync
5613	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore saved t_lofault
5614	retl
5615	mov	%g0, %o0		! delay slot: return 0
5616
5617	.align 16
! Reached from .ci_med_byte with the count already biased by 31 (done in
! that branch's delay slot, which also set the cc tested by the ble here).
! Moves 32 bytes per pass; each doubleword is built from loads of
! 1 + 4 + 2 + 1 bytes at offsets 0, 1, 5 and 7.
5618.ci_medbh32a:				! Alignment 3 or 7
5619	ble,pt	%ncc, .ci_medbh31
5620	nop
5621.ci_medbh32:				! Alignment 3 or 7
5622	subcc	%o2, 32, %o2		! decrement length count
5623
5624	lduba	[%o0]%asi, %o4		! load and store a block of 32 bytes
5625	sllx	%o4, 56, %o3		! byte -> bits 63:56
5626	lduwa	[%o0+1]%asi, %o4
5627	sllx	%o4, 24, %o4		! word -> bits 55:24
5628	or	%o4, %o3, %o3
5629	lduha	[%o0+5]%asi, %o4
5630	sllx	%o4, 8, %o4		! halfword -> bits 23:8
5631	or	%o4, %o3, %o3
5632	lduba	[%o0+7]%asi, %o4	! last byte -> bits 7:0
5633	or	%o4, %o3, %o4
5634	stx	%o4, [%o1]
5635
5636	lduba	[%o0+8]%asi, %o4
5637	sllx	%o4, 56, %o3
5638	lduwa	[%o0+9]%asi, %o4
5639	sllx	%o4, 24, %o4
5640	or	%o4, %o3, %o3
5641	lduha	[%o0+13]%asi, %o4
5642	sllx	%o4, 8, %o4
5643	or	%o4, %o3, %o3
5644	lduba	[%o0+15]%asi, %o4
5645	or	%o4, %o3, %o4
5646	stx	%o4, [%o1+8]
5647
5648	lduba	[%o0+16]%asi, %o4
5649	sllx	%o4, 56, %o3
5650	lduwa	[%o0+17]%asi, %o4
5651	sllx	%o4, 24, %o4
5652	or	%o4, %o3, %o3
5653	lduha	[%o0+21]%asi, %o4
5654	sllx	%o4, 8, %o4
5655	or	%o4, %o3, %o3
5656	lduba	[%o0+23]%asi, %o4
5657	or	%o4, %o3, %o4
5658	stx	%o4, [%o1+16]
5659
5660	add	%o0, 32, %o0		! increase src ptr by 32
5661	add	%o1, 32, %o1		! increase dst ptr by 32
5662
	! Fourth group uses negative offsets: both pointers were already advanced.
5663	lduba	[%o0-8]%asi, %o4
5664	sllx	%o4, 56, %o3
5665	lduwa	[%o0-7]%asi, %o4
5666	sllx	%o4, 24, %o4
5667	or	%o4, %o3, %o3
5668	lduha	[%o0-3]%asi, %o4
5669	sllx	%o4, 8, %o4
5670	or	%o4, %o3, %o3
5671	lduba	[%o0-1]%asi, %o4
5672	or	%o4, %o3, %o4
5673	bgu,pt	%ncc, .ci_medbh32	! repeat if at least 32 bytes left
5674	stx	%o4, [%o1-8]		! delay slot: store the fourth group
5675
! Tail of the byte/word/half/byte copy: 8 bytes per pass while more than
! 7 remain, then joins the shared tail at .ci_medb7.
! Each doubleword is stored exactly once, from the branch delay slot after
! the destination pointer has been advanced (hence the -8 offset).  The
! delay slot is not annulled, so the store also happens on the final pass.
5676.ci_medbh31:
5677	addcc	%o2, 24, %o2		! adjust count to be off by 7
5678	ble,pt	%ncc, .ci_medb7		! skip if 7 or fewer bytes left
5679	nop				!
5680.ci_medbh15:
5681	lduba	[%o0]%asi, %o4		! load and store a block of 8 bytes
5682	sllx	%o4, 56, %o3
5683	lduwa	[%o0+1]%asi, %o4
5684	sllx	%o4, 24, %o4
5685	or	%o4, %o3, %o3
5686	lduha	[%o0+5]%asi, %o4
5687	sllx	%o4, 8, %o4
5688	or	%o4, %o3, %o3
5689	lduba	[%o0+7]%asi, %o4
5690	or	%o4, %o3, %o4
5692	subcc	%o2, 8, %o2		! decrement length count
5693	add	%o1, 8, %o1		! increase dst ptr by 8
5694	add	%o0, 8, %o0		! increase src ptr by 8
5695	bgu,pt	%ncc, .ci_medbh15
5696	stx	%o4, [%o1-8]		! delay slot: the single store of this group
5697	ba	.ci_medb7
5698	nop
5699
5700/*
5701 * End of small copy in code (no window)
5702 *
5703 */
5704
5705/*
5706 * Long copy in code (using register window and fp regs)
5707 *
5708 */
5709
! Start of the large-copy path.  Installs copyio_fault as t_lofault, opens
! a register window (so src/dst/count become %i0/%i1/%i2), marks
! SAVED_LOFAULT with FPUSED_FLAG, and makes the FP unit usable:
!   - %g1 = %fprs & FPRS_FEF; non-zero means FP was already enabled, in
!     which case BST_FP_TOSTACK is invoked (presumably it saves the FP
!     registers this code uses -- macro body is not visible here);
!   - otherwise FPRS_FEF is set.
! %l5 keeps the entry value of %gsr.  The code then tests destination
! address bits 0, 1 and 2 and branches to .ci_big_d1/.ci_big_d2/.ci_big_d4
! to align the destination to 8 bytes.  Each bnz has the next andcc in its
! delay slot, so that test has already run when the target is reached.
5710.ci_copy_more:
5711	sethi	%hi(copyio_fault), %o3
5712	or	%o3, %lo(copyio_fault), %o3
5713	membar	#Sync
5714	stn	%o3, [THREAD_REG + T_LOFAULT]
5715/*
5716 * Following code is for large copies. We know there is at
5717 * least FP_COPY bytes available. FP regs are used, so
5718 *  we save registers and fp regs before starting
5719 */
5720	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
5721	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
5722	rd	%fprs, %g1		! check for unused fp
5723	! if fprs.fef == 0, set it.
5724	! Setting it when already set costs more than checking
5725	andcc	%g1, FPRS_FEF, %g1	! test FEF, fprs.du = fprs.dl = 0
5726	bz,pt	%ncc, .ci_fp_unused
5727	mov	ASI_USER, %asi		! delay slot: %asi = ASI_USER on both paths
5728	BST_FP_TOSTACK(%o3)
	! The delay slot of this ba is the prefetcha at .ci_fp_unused, so the
	! prefetch is issued on both paths; only the wr %fprs is skipped.
5729	ba	.ci_fp_ready
5730.ci_fp_unused:
5731	prefetcha [%i0 + (1 * CACHE_LINE)]%asi, #one_read
5732	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
5733.ci_fp_ready:
5734	rd	%gsr, %l5		! save %gsr value
5735	andcc	%i1, 1, %o3		! is dest byte aligned
5736	bnz,pt	%ncc, .ci_big_d1
5737.ci_big_d1f:				! dest is now half word aligned
5738	andcc	%i1, 2, %o3
5739	bnz,pt	%ncc, .ci_big_d2
5740.ci_big_d2f:				! dest is now word aligned
5741	andcc	%i1, 4, %o3
5742	bnz,pt	%ncc, .ci_big_d4
5743.ci_big_d4f:				! dest is long word aligned
5744	andcc	%i0, 7, %o3		! is src long word aligned
5745	brnz,pt	%o3, .ci_big_unal8
5746	prefetcha [%i0 + (2 * CACHE_LINE)]%asi, #one_read
	! Both pointers are 8-byte aligned here.  Copy 8, then 16 bytes at a
	! time until the destination reaches a 64-byte boundary.  %o3 holds
	! the NEGATIVE number of bytes still needed to reach that boundary
	! and is stepped up towards zero; %i2 is reduced up front by the
	! whole amount.
5747	! Src and dst are long word aligned
5748	! align dst to 64 byte boundary
5749	andcc	%i1, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
5750	brz,pn	%o3, .ci_al_to_64
5751	nop
5752	sub	%o3, 64, %o3		! %o3 has negative bytes to move
5753	add	%i2, %o3, %i2		! adjust remaining count
5754	andcc	%o3, 8, %o4		! odd long words to move?
5755	brz,pt	%o4, .ci_al_to_16
5756	nop
5757	add	%o3, 8, %o3
5758	ldxa	[%i0]%asi, %o4
5759	add	%i0, 8, %i0		! increment src ptr
5760	add	%i1, 8, %i1		! increment dst ptr
5761	stx	%o4, [%i1-8]
5762! Dest is aligned on 16 bytes, src 8 byte aligned
5763.ci_al_to_16:
5764	andcc	%o3, 0x30, %o4		! pair of long words to move?
5765	brz,pt	%o4, .ci_al_to_64
5766	nop
5767.ci_al_mv_16:
5768	add	%o3, 16, %o3
5769	ldxa	[%i0]%asi, %o4
5770	stx	%o4, [%i1]
5771	add	%i0, 16, %i0		! increment src ptr
5772	ldxa	[%i0-8]%asi, %o4
5773	stx	%o4, [%i1+8]
5774	andcc	%o3, 0x30, %o4		! bits 5:4 clear => 64-byte boundary reached
5775	brnz,pt	%o4, .ci_al_mv_16
5776	add	%i1, 16, %i1		! increment dst ptr
5777! Dest is aligned on 64 bytes, src 8 byte aligned
! Dispatch on source address bits 5, 4 and 3 (masks 32, 16, 8) to one of
! eight block-copy loops named .ci_aln_<b5><b4><b3>.  The delay slot of
! each brnz/brz carries either the next andcc or a prefetch; none of them
! is annulled, so they execute whether or not the branch is taken.
5778.ci_al_to_64:
5779	! Determine source alignment
5780	! to correct 8 byte offset
5781	andcc	%i0, 32, %o3
5782	brnz,pn	%o3, .ci_aln_1
5783	andcc	%i0, 16, %o3
5784	brnz,pn	%o3, .ci_aln_01
5785	andcc	%i0, 8, %o3
5786	brz,pn	%o3, .ci_aln_000
5787	prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5788	ba	.ci_aln_001
5789	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5790.ci_aln_01:
	! %o3 holds (src & 8), computed in the delay slot of the branch here.
5791	brnz,pn	%o3, .ci_aln_011
5792	prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5793	ba	.ci_aln_010
5794	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5795.ci_aln_1:
5796	andcc	%i0, 16, %o3
5797	brnz,pn	%o3, .ci_aln_11
5798	andcc	%i0, 8, %o3
5799	brnz,pn	%o3, .ci_aln_101
5800	prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5801	ba	.ci_aln_100
5802	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5803.ci_aln_11:
	! %o3 holds (src & 8); falls through to .ci_aln_111 when it is set.
5804	brz,pn	%o3, .ci_aln_110
5805	prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5806
5806
! Source is 8 bytes short of a 64-byte boundary.  Preload that one
! doubleword into %d0; then each pass block-loads 64 source bytes into
! %d16-%d30, shifts them up behind the carried doubleword so %d0-%d14
! form one destination block, block-stores it, and carries %d30 over.
! Inside the loop %i1 holds (dst - src) so a single add advances both
! pointers.  The carried doubleword is flushed with std after the loop.
5807.ci_aln_111:
5808! Alignment off by 8 bytes
5809	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5810	ldda	[%i0]%asi, %d0
5811	add	%i0, 8, %i0
5812	sub	%i2, 8, %i2
5813	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
5814	and	%i2, 0x7f, %i2		! residue bytes in %i2
5815	sub	%i1, %i0, %i1		! %i1 = dst - src
5816.ci_aln_111_loop:
5817	ldda	[%i0]ASI_BLK_AIUS,%d16		! block load
5818	subcc	%o3, 64, %o3
5819	fmovd	%d16, %d2
5820	fmovd	%d18, %d4
5821	fmovd	%d20, %d6
5822	fmovd	%d22, %d8
5823	fmovd	%d24, %d10
5824	fmovd	%d26, %d12
5825	fmovd	%d28, %d14
5826	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
5827	stda	%d0,[%i0+%i1]ASI_BLK_P
5828	add	%i0, 64, %i0
5829	fmovd	%d30, %d0		! carry last doubleword into next block
5830	bgt,pt	%ncc, .ci_aln_111_loop
5831	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5832	add	%i1, %i0, %i1		! %i1 is an absolute dst pointer again
5833
5834	std	%d0, [%i1]		! flush the carried doubleword
5835	ba	.ci_remain_stuff
5836	add	%i1, 8, %i1
5837	! END OF aln_111
5838
! Source is 16 bytes short of a 64-byte boundary.  Same scheme as the
! other .ci_aln_* loops: two doublewords are preloaded into %d0-%d2 and
! two (%d28, %d30) are carried from each block load into the next
! destination block.  %i1 holds (dst - src) inside the loop.
5839.ci_aln_110:
5840! Alignment off by 16 bytes
5841	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5842	ldda	[%i0]%asi, %d0
5843	ldda	[%i0+8]%asi, %d2
5844	add	%i0, 16, %i0
5845	sub	%i2, 16, %i2
5846	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
5847	and	%i2, 0x7f, %i2		! residue bytes in %i2
5848	sub	%i1, %i0, %i1		! %i1 = dst - src
5849.ci_aln_110_loop:
5850	ldda	[%i0]ASI_BLK_AIUS,%d16		! block load
5851	subcc	%o3, 64, %o3
5852	fmovd	%d16, %d4
5853	fmovd	%d18, %d6
5854	fmovd	%d20, %d8
5855	fmovd	%d22, %d10
5856	fmovd	%d24, %d12
5857	fmovd	%d26, %d14
5858	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
5859	stda	%d0,[%i0+%i1]ASI_BLK_P
5860	add	%i0, 64, %i0
5861	fmovd	%d28, %d0		! carry the last two doublewords
5862	fmovd	%d30, %d2
5863	bgt,pt	%ncc, .ci_aln_110_loop
5864	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5865	add	%i1, %i0, %i1		! %i1 is an absolute dst pointer again
5866
5867	std	%d0, [%i1]		! flush the carried doublewords
5868	std	%d2, [%i1+8]
5869	ba	.ci_remain_stuff
5870	add	%i1, 16, %i1
5871	! END OF aln_110
5872
! Source is 24 bytes short of a 64-byte boundary.  Three doublewords are
! preloaded into %d0-%d4 and three (%d26-%d30) are carried from each
! block load into the next destination block.  %i1 holds (dst - src)
! inside the loop.
5873.ci_aln_101:
5874! Alignment off by 24 bytes
5875	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5876	ldda	[%i0]%asi, %d0
5877	ldda	[%i0+8]%asi, %d2
5878	ldda	[%i0+16]%asi, %d4
5879	add	%i0, 24, %i0
5880	sub	%i2, 24, %i2
5881	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
5882	and	%i2, 0x7f, %i2		! residue bytes in %i2
5883	sub	%i1, %i0, %i1		! %i1 = dst - src
5884.ci_aln_101_loop:
5885	ldda	[%i0]ASI_BLK_AIUS,%d16	! block load
5886	subcc	%o3, 64, %o3
5887	fmovd	%d16, %d6
5888	fmovd	%d18, %d8
5889	fmovd	%d20, %d10
5890	fmovd	%d22, %d12
5891	fmovd	%d24, %d14
5892	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
5893	stda	%d0,[%i0+%i1]ASI_BLK_P
5894	add	%i0, 64, %i0
5895	fmovd	%d26, %d0		! carry the last three doublewords
5896	fmovd	%d28, %d2
5897	fmovd	%d30, %d4
5898	bgt,pt	%ncc, .ci_aln_101_loop
5899	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5900	add	%i1, %i0, %i1		! %i1 is an absolute dst pointer again
5901
5902	std	%d0, [%i1]		! flush the carried doublewords
5903	std	%d2, [%i1+8]
5904	std	%d4, [%i1+16]
5905	ba	.ci_remain_stuff
5906	add	%i1, 24, %i1
5907	! END OF aln_101
5908
! Source is 32 bytes short of a 64-byte boundary.  Four doublewords are
! preloaded into %d0-%d6 and four (%d24-%d30) are carried from each block
! load into the next destination block.  %i1 holds (dst - src) inside the
! loop.
5909.ci_aln_100:
5910! Alignment off by 32 bytes
5911	ldda	[%i0]%asi, %d0
5912	ldda	[%i0+8]%asi, %d2
5913	ldda	[%i0+16]%asi,%d4
5914	ldda	[%i0+24]%asi,%d6
5915	add	%i0, 32, %i0
5916	sub	%i2, 32, %i2
5917	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
5918	and	%i2, 0x7f, %i2		! residue bytes in %i2
5919	sub	%i1, %i0, %i1		! %i1 = dst - src
5920.ci_aln_100_loop:
5921	ldda	[%i0]ASI_BLK_AIUS,%d16	! block load
5922	subcc	%o3, 64, %o3
5923	fmovd	%d16, %d8
5924	fmovd	%d18, %d10
5925	fmovd	%d20, %d12
5926	fmovd	%d22, %d14
5927	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
5928	stda	%d0,[%i0+%i1]ASI_BLK_P
5929	add	%i0, 64, %i0
5930	fmovd	%d24, %d0		! carry the last four doublewords
5931	fmovd	%d26, %d2
5932	fmovd	%d28, %d4
5933	fmovd	%d30, %d6
5934	bgt,pt	%ncc, .ci_aln_100_loop
5935	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5936	add	%i1, %i0, %i1		! %i1 is an absolute dst pointer again
5937
5938	std	%d0, [%i1]		! flush the carried doublewords
5939	std	%d2, [%i1+8]
5940	std	%d4, [%i1+16]
5941	std	%d6, [%i1+24]
5942	ba	.ci_remain_stuff
5943	add	%i1, 32, %i1
5944	! END OF aln_100
5945
! Source is 40 bytes short of a 64-byte boundary.  Five doublewords are
! preloaded into %d0-%d8 and five (%d22-%d30) are carried from each block
! load into the next destination block.  %i1 holds (dst - src) inside the
! loop.
5946.ci_aln_011:
5947! Alignment off by 40 bytes
5948	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5949	ldda	[%i0]%asi, %d0
5950	ldda	[%i0+8]%asi, %d2
5951	ldda	[%i0+16]%asi, %d4
5952	ldda	[%i0+24]%asi, %d6
5953	ldda	[%i0+32]%asi, %d8
5954	add	%i0, 40, %i0
5955	sub	%i2, 40, %i2
5956	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
5957	and	%i2, 0x7f, %i2		! residue bytes in %i2
5958	sub	%i1, %i0, %i1		! %i1 = dst - src
5959.ci_aln_011_loop:
5960	ldda	[%i0]ASI_BLK_AIUS,%d16	! block load
5961	subcc	%o3, 64, %o3
5962	fmovd	%d16, %d10
5963	fmovd	%d18, %d12
5964	fmovd	%d20, %d14
5965	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
5966	stda	%d0,[%i0+%i1]ASI_BLK_P
5967	add	%i0, 64, %i0
5968	fmovd	%d22, %d0		! carry the last five doublewords
5969	fmovd	%d24, %d2
5970	fmovd	%d26, %d4
5971	fmovd	%d28, %d6
5972	fmovd	%d30, %d8
5973	bgt,pt	%ncc, .ci_aln_011_loop
5974	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5975	add	%i1, %i0, %i1		! %i1 is an absolute dst pointer again
5976
5977	std	%d0, [%i1]		! flush the carried doublewords
5978	std	%d2, [%i1+8]
5979	std	%d4, [%i1+16]
5980	std	%d6, [%i1+24]
5981	std	%d8, [%i1+32]
5982	ba	.ci_remain_stuff
5983	add	%i1, 40, %i1
5984	! END OF aln_011
5985
! Source is 48 bytes short of a 64-byte boundary.  Six doublewords are
! preloaded into %d0-%d10 and six (%d20-%d30) are carried from each block
! load into the next destination block.  %i1 holds (dst - src) inside the
! loop.
5986.ci_aln_010:
5987! Alignment off by 48 bytes
5988	ldda	[%i0]%asi, %d0
5989	ldda	[%i0+8]%asi, %d2
5990	ldda	[%i0+16]%asi, %d4
5991	ldda	[%i0+24]%asi, %d6
5992	ldda	[%i0+32]%asi, %d8
5993	ldda	[%i0+40]%asi, %d10
5994	add	%i0, 48, %i0
5995	sub	%i2, 48, %i2
5996	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
5997	and	%i2, 0x7f, %i2		! residue bytes in %i2
5998	sub	%i1, %i0, %i1		! %i1 = dst - src
5999.ci_aln_010_loop:
6000	ldda	[%i0]ASI_BLK_AIUS,%d16	! block load
6001	subcc	%o3, 64, %o3
6002	fmovd	%d16, %d12
6003	fmovd	%d18, %d14
6004	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
6005	stda	%d0,[%i0+%i1]ASI_BLK_P
6006	add	%i0, 64, %i0
6007	fmovd	%d20, %d0		! carry the last six doublewords
6008	fmovd	%d22, %d2
6009	fmovd	%d24, %d4
6010	fmovd	%d26, %d6
6011	fmovd	%d28, %d8
6012	fmovd	%d30, %d10
6013	bgt,pt	%ncc, .ci_aln_010_loop
6014	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6015	add	%i1, %i0, %i1		! %i1 is an absolute dst pointer again
6016
6017	std	%d0, [%i1]		! flush the carried doublewords
6018	std	%d2, [%i1+8]
6019	std	%d4, [%i1+16]
6020	std	%d6, [%i1+24]
6021	std	%d8, [%i1+32]
6022	std	%d10, [%i1+40]
6023	ba	.ci_remain_stuff
6024	add	%i1, 48, %i1
6025	! END OF aln_010
6026
! Source is 56 bytes short of a 64-byte boundary.  Seven doublewords are
! preloaded into %d0-%d12 and seven (%d18-%d30) are carried from each
! block load into the next destination block.  %i1 holds (dst - src)
! inside the loop.
6027.ci_aln_001:
6028! Alignment off by 56 bytes
6029	ldda	[%i0]%asi, %d0
6030	ldda	[%i0+8]%asi, %d2
6031	ldda	[%i0+16]%asi, %d4
6032	ldda	[%i0+24]%asi, %d6
6033	ldda	[%i0+32]%asi, %d8
6034	ldda	[%i0+40]%asi, %d10
6035	ldda	[%i0+48]%asi, %d12
6036	add	%i0, 56, %i0
6037	sub	%i2, 56, %i2
6038	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
6039	and	%i2, 0x7f, %i2		! residue bytes in %i2
6040	sub	%i1, %i0, %i1		! %i1 = dst - src
6041.ci_aln_001_loop:
6042	ldda	[%i0]ASI_BLK_AIUS,%d16	! block load
6043	subcc	%o3, 64, %o3
6044	fmovd	%d16, %d14
6045	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
6046	stda	%d0,[%i0+%i1]ASI_BLK_P
6047	add	%i0, 64, %i0
6048	fmovd	%d18, %d0		! carry the last seven doublewords
6049	fmovd	%d20, %d2
6050	fmovd	%d22, %d4
6051	fmovd	%d24, %d6
6052	fmovd	%d26, %d8
6053	fmovd	%d28, %d10
6054	fmovd	%d30, %d12
6055	bgt,pt	%ncc, .ci_aln_001_loop
6056	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6057	add	%i1, %i0, %i1		! %i1 is an absolute dst pointer again
6058
6059	std	%d0, [%i1]		! flush the carried doublewords
6060	std	%d2, [%i1+8]
6061	std	%d4, [%i1+16]
6062	std	%d6, [%i1+24]
6063	std	%d8, [%i1+32]
6064	std	%d10, [%i1+40]
6065	std	%d12, [%i1+48]
6066	ba	.ci_remain_stuff
6067	add	%i1, 56, %i1
6068	! END OF aln_001
6069
! Source and destination are both 64-byte aligned: each pass block-loads
! 64 bytes straight into %d0-%d14 and block-stores them, with nothing to
! carry between passes.  %i1 holds (dst - src) inside the loop.  Falls
! through to .ci_remain_stuff.
6070.ci_aln_000:
6071	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6072	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
6073	and	%i2, 0x7f, %i2		! residue bytes in %i2
6074	sub	%i1, %i0, %i1		! %i1 = dst - src
6075.ci_aln_000_loop:
6076	ldda	[%i0]ASI_BLK_AIUS,%d0
6077	subcc	%o3, 64, %o3
6078	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
6079	stda	%d0,[%i0+%i1]ASI_BLK_P
6080	add	%i0, 64, %i0
6081	bgt,pt	%ncc, .ci_aln_000_loop
6082	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6083	add	%i1, %i0, %i1		! %i1 is an absolute dst pointer again
6084
6085	! END OF aln_000
6086
! Residue after the aligned block loops (%i2 = bytes left).  Both pointers
! are 8-byte aligned, so the data moves with ldxa/stx: 32 bytes per pass,
! then 8 bytes per pass, then an optional 4-byte word; a final 1-3 bytes
! go to .ci_unaln3x.  The count is biased (by 31, then by 7) so that the
! subcc results can be tested directly.
6087.ci_remain_stuff:
6088	subcc	%i2, 31, %i2		! adjust length to allow cc test
6089	ble,pt	%ncc, .ci_aln_31
6090	nop
6091.ci_aln_32:
6092	ldxa	[%i0]%asi, %o4		! move 32 bytes
6093	subcc	%i2, 32, %i2		! decrement length count by 32
6094	stx	%o4, [%i1]
6095	ldxa	[%i0+8]%asi, %o4
6096	stx	%o4, [%i1+8]
6097	ldxa	[%i0+16]%asi, %o4
6098	add	%i0, 32, %i0		! increase src ptr by 32
6099	stx	%o4, [%i1+16]
6100	ldxa	[%i0-8]%asi, %o4
6101	add	%i1, 32, %i1		! increase dst ptr by 32
6102	bgu,pt	%ncc, .ci_aln_32	! repeat if at least 32 bytes left
6103	stx	%o4, [%i1-8]
6104.ci_aln_31:
6105	addcc	%i2, 24, %i2		! adjust count to be off by 7
6106	ble,pt	%ncc, .ci_aln_7		! skip if 7 or fewer bytes left
6107	nop				!
6108.ci_aln_15:
6109	ldxa	[%i0]%asi, %o4		! move 8 bytes
6110	add	%i0, 8, %i0		! increase src ptr by 8
6111	subcc	%i2, 8, %i2		! decrease count by 8
6112	add	%i1, 8, %i1		! increase dst ptr by 8
6113	bgu,pt	%ncc, .ci_aln_15
6114	stx	%o4, [%i1-8]		!
6115.ci_aln_7:
6116	addcc	%i2, 7, %i2		! finish adjustment of remaining count
6117	bz,pt	%ncc, .ci_exit		! exit if finished
6118	cmp	%i2, 4			! delay slot: sets cc for the blt below
6119	blt,pt	%ncc, .ci_unaln3x	! skip if less than 4 bytes left
6120	nop				!
6121	lda	[%i0]%asi, %o4		! move 4 bytes
6122	add	%i0, 4, %i0		! increase src ptr by 4
6123	add	%i1, 4, %i1		! increase dst ptr by 4
6124	subcc	%i2, 4, %i2		! decrease count by 4
6125	bnz	.ci_unaln3x
6126	stw	%o4, [%i1-4]
6127	ba	.ci_exit
6128	nop
6129
	! Copies 1, 2 and/or 4 bytes so the destination becomes 8-byte
	! aligned.  Source alignment is unknown here, so the source is read
	! one byte at a time and merged in registers before a single aligned
	! store.  Each stage either falls into the next stage or branches
	! back to the matching .ci_big_d*f label in the dispatch sequence;
	! the count update sits in the branch delay slot.
6130	! destination alignment code
6131.ci_big_d1:
6132	lduba	[%i0]%asi, %o4		! move a byte
6133	add	%i0, 1, %i0
6134	stb	%o4, [%i1]
6135	add	%i1, 1, %i1
6136	andcc	%i1, 2, %o3
6137	bz,pt	%ncc, .ci_big_d2f
6138	sub	%i2, 1, %i2		! delay slot: count the byte just moved
6139.ci_big_d2:				! dest is now at least half word aligned
6140	lduba	[%i0]%asi, %o4		! move a half-word (src align unknown)
6141	lduba	[%i0+1]%asi, %o3
6142	add	%i0, 2, %i0
6143	sll	%o4, 8, %o4		! position
6144	or	%o4, %o3, %o4		! merge
6145	sth	%o4, [%i1]
6146	add	%i1, 2, %i1
6147	andcc	%i1, 4, %o3
6148	bz,pt	%ncc, .ci_big_d4f
6149	sub	%i2, 2, %i2		! delay slot: count the two bytes moved
6150.ci_big_d4:				! dest is at least word aligned
6151	nop
6152	lduba	[%i0]%asi, %o4		! move a word (src align unknown)
6153	lduba	[%i0+1]%asi, %o3
6154	sll	%o4, 24, %o4		! position
6155	sll	%o3, 16, %o3		! position
6156	or	%o4, %o3, %o3		! merge
6157	lduba	[%i0+2]%asi, %o4
6158	sll	%o4, 8, %o4		! position
6159	or	%o4, %o3, %o3		! merge
6160	lduba	[%i0+3]%asi, %o4
6161	or	%o4, %o3, %o4		! merge
6162	stw	%o4,[%i1]		! store four bytes
6163	add	%i0, 4, %i0		! adjust src by 4
6164	add	%i1, 4, %i1		! adjust dest by 4
6165	ba	.ci_big_d4f
6166	sub	%i2, 4, %i2		! adjust count by 4
6167
6168
	! Destination is 8-byte aligned but the source is not.  If the
	! destination is not yet on a 64-byte boundary, %o3 becomes the number
	! of bytes (a multiple of 8) needed to reach it, %i2 is reduced by
	! that amount, and one of three loops -- chosen by source alignment --
	! moves those bytes 8 at a time before joining .ci_unalnsrc.
6169	! Dst is on 8 byte boundary; src is not;
6170.ci_big_unal8:
6171	andcc	%i1, 0x3f, %o3		! is dst 64-byte block aligned?
6172	bz	%ncc, .ci_unalnsrc
6173	sub	%o3, 64, %o3		! %o3 will be multiple of 8
6174	neg	%o3			! bytes until dest is 64 byte aligned
6175	sub	%i2, %o3, %i2		! update cnt with bytes to be moved
6176	! Move bytes according to source alignment
6177	andcc	%i0, 0x1, %o4
6178	bnz	%ncc, .ci_unalnbyte	! check for byte alignment
6179	nop
6180	andcc	%i0, 2, %o4		! check for half word alignment
6181	bnz	%ncc, .ci_unalnhalf
6182	nop
6183	! Src is word aligned, move bytes until dest 64 byte aligned
6184.ci_unalnword:
6185	lda	[%i0]%asi, %o4		! load 4 bytes
6186	stw	%o4, [%i1]		! and store 4 bytes
6187	lda	[%i0+4]%asi, %o4	! load 4 bytes
6188	add	%i0, 8, %i0		! increase src ptr by 8
6189	stw	%o4, [%i1+4]		! and store 4 bytes
6190	subcc	%o3, 8, %o3		! decrease count by 8
6191	bnz	%ncc, .ci_unalnword
6192	add	%i1, 8, %i1		! increase dst ptr by 8
6193	ba	.ci_unalnsrc
6194	nop
6195
	! Each pass builds one doubleword in %i3 from halfword + word +
	! halfword loads at offsets 0, 2 and 6 and stores it with one stx.
	! %o3 counts down the bytes left to reach the 64-byte boundary.
6196	! Src is half-word aligned, move bytes until dest 64 byte aligned
6197.ci_unalnhalf:
6198	lduha	[%i0]%asi, %o4		! load 2 bytes
6199	sllx	%o4, 32, %i3		! shift left
6200	lduwa	[%i0+2]%asi, %o4
6201	or	%o4, %i3, %i3
6202	sllx	%i3, 16, %i3		! make room for the last halfword
6203	lduha	[%i0+6]%asi, %o4
6204	or	%o4, %i3, %i3
6205	stx	%i3, [%i1]
6206	add	%i0, 8, %i0
6207	subcc	%o3, 8, %o3
6208	bnz	%ncc, .ci_unalnhalf
6209	add	%i1, 8, %i1
6210	ba	.ci_unalnsrc
6211	nop
6212
	! Each pass builds one doubleword in %i3 from byte + three halfwords
	! + byte loads at offsets 0, 1, 3, 5 and 7.  %i1 temporarily holds
	! (dst - src) so only %i0 needs advancing inside the loop; it is
	! turned back into an absolute pointer afterwards.  Falls through to
	! .ci_unalnsrc.
6213	! Src is Byte aligned, move bytes until dest 64 byte aligned
6214.ci_unalnbyte:
6215	sub	%i1, %i0, %i1		! share pointer advance
6216.ci_unalnbyte_loop:
6217	lduba	[%i0]%asi, %o4
6218	sllx	%o4, 56, %i3
6219	lduha	[%i0+1]%asi, %o4
6220	sllx	%o4, 40, %o4
6221	or	%o4, %i3, %i3
6222	lduha	[%i0+3]%asi, %o4
6223	sllx	%o4, 24, %o4
6224	or	%o4, %i3, %i3
6225	lduha	[%i0+5]%asi, %o4
6226	sllx	%o4, 8, %o4
6227	or	%o4, %i3, %i3
6228	lduba	[%i0+7]%asi, %o4
6229	or	%o4, %i3, %i3
6230	stx	%i3, [%i1+%i0]
6231	subcc	%o3, 8, %o3
6232	bnz	%ncc, .ci_unalnbyte_loop
6233	add	%i0, 8, %i0		! delay slot: advance the shared pointer
6234	add	%i1,%i0, %i1		! restore pointer
6235
6235
	! Set-up for the faligndata block loops:
	!   %i3 = (count rounded down to 64) - 64   bytes moved by block loop
	!   %i2 = (count & 63) + 64                 bytes left for the tail
	!   %o4 = source address rounded down to 64 bytes
	! alignaddr records the source byte offset in %gsr for faligndata
	! (%gsr's entry value is held in %l5).  %i0 is advanced past the bytes
	! the block loop will move; since %i3 is a multiple of 64, its bits
	! 5:3 are unchanged and select one of the .ci_unaln_<b5><b4><b3> loops.
6236	! Destination is now block (64 byte aligned), src is not 8 byte aligned
6237.ci_unalnsrc:
6238	andn	%i2, 0x3f, %i3		! %i3 is multiple of block size
6239	and	%i2, 0x3f, %i2		! residue bytes in %i2
6240	add	%i2, 64, %i2		! Ensure we don't load beyond
6241	sub	%i3, 64, %i3		! end of source buffer
6242
6243	andn	%i0, 0x3f, %o4		! %o4 has block aligned src address
6244	prefetcha [%o4 + (3 * CACHE_LINE)]%asi, #one_read
6245	alignaddr %i0, %g0, %g0		! generate %gsr
6246	add	%i0, %i3, %i0		! advance %i0 to after blocks
6247	!
6248	! Determine source alignment to correct 8 byte offset
6249	andcc	%i0, 0x20, %o3
6250	brnz,pn	%o3, .ci_unaln_1
6251	andcc	%i0, 0x10, %o3
6252	brnz,pn	%o3, .ci_unaln_01
6253	andcc	%i0, 0x08, %o3
6254	brz,a	%o3, .ci_unaln_000
6255	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6256	ba	.ci_unaln_001
6257	nop
6258.ci_unaln_01:
	! %o3 holds (src & 8), computed in the delay slot of the branch here.
6259	brnz,a	%o3, .ci_unaln_011
6260	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6261	ba	.ci_unaln_010
6262	nop
6263.ci_unaln_1:
	! %o3 holds (src & 0x10), computed in the delay slot of the branch here.
6264	brnz,pn	%o3, .ci_unaln_11
6265	andcc	%i0, 0x08, %o3
6266	brnz,a	%o3, .ci_unaln_101
6267	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6268	ba	.ci_unaln_100
6269	nop
6270.ci_unaln_11:
	! %o3 holds (src & 8); falls through to .ci_unaln_111 when it is set.
6271	brz,pn	%o3, .ci_unaln_110
6272	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6273
6273
! Source lies in the last doubleword (offset 56) of its 64-byte block.
! Preload that doubleword into %d14; each pass block-loads the next 64
! source bytes into %d16-%d30, uses faligndata on adjacent register pairs
! to produce 64 aligned bytes in %d48-%d62, block-stores them to [%i1],
! and carries %d30 into %d14 for the next pass.  %i3 counts bytes left.
6274.ci_unaln_111:
6275	ldda	[%o4+56]%asi, %d14
6276.ci_unaln_111_loop:
6277	add	%o4, 64, %o4
6278	ldda	[%o4]ASI_BLK_AIUS, %d16
6279	faligndata %d14, %d16, %d48
6280	faligndata %d16, %d18, %d50
6281	faligndata %d18, %d20, %d52
6282	faligndata %d20, %d22, %d54
6283	faligndata %d22, %d24, %d56
6284	faligndata %d24, %d26, %d58
6285	faligndata %d26, %d28, %d60
6286	faligndata %d28, %d30, %d62
6287	fmovd	%d30, %d14		! carry last doubleword to next pass
6288	stda	%d48, [%i1]ASI_BLK_P
6289	subcc	%i3, 64, %i3
6290	add	%i1, 64, %i1
6291	bgu,pt	%ncc, .ci_unaln_111_loop
6292	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6293	ba	.ci_unaln_done
6294	nop
6295
! Source lies in the doubleword at offset 48 of its 64-byte block.
! Preload offsets 48 and 56 into %d12-%d14; each pass block-loads 64 more
! bytes into %d16-%d30, faligndata-merges adjacent pairs into %d48-%d62,
! block-stores them, and carries %d28/%d30 into %d12/%d14.
6296.ci_unaln_110:
6297	ldda	[%o4+48]%asi, %d12
6298	ldda	[%o4+56]%asi, %d14
6299.ci_unaln_110_loop:
6300	add	%o4, 64, %o4
6301	ldda	[%o4]ASI_BLK_AIUS, %d16
6302	faligndata %d12, %d14, %d48
6303	faligndata %d14, %d16, %d50
6304	faligndata %d16, %d18, %d52
6305	faligndata %d18, %d20, %d54
6306	faligndata %d20, %d22, %d56
6307	faligndata %d22, %d24, %d58
6308	faligndata %d24, %d26, %d60
6309	faligndata %d26, %d28, %d62
6310	fmovd	%d28, %d12		! carry the unused tail to next pass
6311	fmovd	%d30, %d14
6312	stda	%d48, [%i1]ASI_BLK_P
6313	subcc	%i3, 64, %i3
6314	add	%i1, 64, %i1
6315	bgu,pt	%ncc, .ci_unaln_110_loop
6316	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6317	ba	.ci_unaln_done
6318	nop
6319
! Source lies in the doubleword at offset 40 of its 64-byte block.
! Preload offsets 40-56 into %d10-%d14; each pass block-loads 64 more
! bytes into %d16-%d30, faligndata-merges adjacent pairs into %d48-%d62,
! block-stores them, and carries %d26-%d30 into %d10-%d14.
6320.ci_unaln_101:
6321	ldda	[%o4+40]%asi, %d10
6322	ldda	[%o4+48]%asi, %d12
6323	ldda	[%o4+56]%asi, %d14
6324.ci_unaln_101_loop:
6325	add	%o4, 64, %o4
6326	ldda	[%o4]ASI_BLK_AIUS, %d16
6327	faligndata %d10, %d12, %d48
6328	faligndata %d12, %d14, %d50
6329	faligndata %d14, %d16, %d52
6330	faligndata %d16, %d18, %d54
6331	faligndata %d18, %d20, %d56
6332	faligndata %d20, %d22, %d58
6333	faligndata %d22, %d24, %d60
6334	faligndata %d24, %d26, %d62
6335	fmovd	%d26, %d10		! carry the unused tail to next pass
6336	fmovd	%d28, %d12
6337	fmovd	%d30, %d14
6338	stda	%d48, [%i1]ASI_BLK_P
6339	subcc	%i3, 64, %i3
6340	add	%i1, 64, %i1
6341	bgu,pt	%ncc, .ci_unaln_101_loop
6342	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6343	ba	.ci_unaln_done
6344	nop
6345
! Source lies in the doubleword at offset 32 of its 64-byte block.
! Preload offsets 32-56 into %d8-%d14; each pass block-loads 64 more
! bytes into %d16-%d30, faligndata-merges adjacent pairs into %d48-%d62,
! block-stores them, and carries %d24-%d30 into %d8-%d14.
6346.ci_unaln_100:
6347	ldda	[%o4+32]%asi, %d8
6348	ldda	[%o4+40]%asi, %d10
6349	ldda	[%o4+48]%asi, %d12
6350	ldda	[%o4+56]%asi, %d14
6351.ci_unaln_100_loop:
6352	add	%o4, 64, %o4
6353	ldda	[%o4]ASI_BLK_AIUS, %d16
6354	faligndata %d8, %d10, %d48
6355	faligndata %d10, %d12, %d50
6356	faligndata %d12, %d14, %d52
6357	faligndata %d14, %d16, %d54
6358	faligndata %d16, %d18, %d56
6359	faligndata %d18, %d20, %d58
6360	faligndata %d20, %d22, %d60
6361	faligndata %d22, %d24, %d62
6362	fmovd	%d24, %d8		! carry the unused tail to next pass
6363	fmovd	%d26, %d10
6364	fmovd	%d28, %d12
6365	fmovd	%d30, %d14
6366	stda	%d48, [%i1]ASI_BLK_P
6367	subcc	%i3, 64, %i3
6368	add	%i1, 64, %i1
6369	bgu,pt	%ncc, .ci_unaln_100_loop
6370	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6371	ba	.ci_unaln_done
6372	nop
6373
! Source lies in the doubleword at offset 24 of its 64-byte block.
! Preload offsets 24-56 into %d6-%d14; each pass block-loads 64 more
! bytes into %d16-%d30, faligndata-merges adjacent pairs into %d48-%d62,
! block-stores them, and carries %d22-%d30 into %d6-%d14.
6374.ci_unaln_011:
6375	ldda	[%o4+24]%asi, %d6
6376	ldda	[%o4+32]%asi, %d8
6377	ldda	[%o4+40]%asi, %d10
6378	ldda	[%o4+48]%asi, %d12
6379	ldda	[%o4+56]%asi, %d14
6380.ci_unaln_011_loop:
6381	add	%o4, 64, %o4
6382	ldda	[%o4]ASI_BLK_AIUS, %d16
6383	faligndata %d6, %d8, %d48
6384	faligndata %d8, %d10, %d50
6385	faligndata %d10, %d12, %d52
6386	faligndata %d12, %d14, %d54
6387	faligndata %d14, %d16, %d56
6388	faligndata %d16, %d18, %d58
6389	faligndata %d18, %d20, %d60
6390	faligndata %d20, %d22, %d62
6391	fmovd	%d22, %d6		! carry the unused tail to next pass
6392	fmovd	%d24, %d8
6393	fmovd	%d26, %d10
6394	fmovd	%d28, %d12
6395	fmovd	%d30, %d14
6396	stda	%d48, [%i1]ASI_BLK_P
6397	subcc	%i3, 64, %i3
6398	add	%i1, 64, %i1
6399	bgu,pt	%ncc, .ci_unaln_011_loop
6400	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6401	ba	.ci_unaln_done
6402	nop
6403
! Source lies in the doubleword at offset 16 of its 64-byte block.
! Preload offsets 16-56 into %d4-%d14; each pass block-loads 64 more
! bytes into %d16-%d30, faligndata-merges adjacent pairs into %d48-%d62,
! block-stores them, and carries %d20-%d30 into %d4-%d14.
6404.ci_unaln_010:
6405	ldda	[%o4+16]%asi, %d4
6406	ldda	[%o4+24]%asi, %d6
6407	ldda	[%o4+32]%asi, %d8
6408	ldda	[%o4+40]%asi, %d10
6409	ldda	[%o4+48]%asi, %d12
6410	ldda	[%o4+56]%asi, %d14
6411.ci_unaln_010_loop:
6412	add	%o4, 64, %o4
6413	ldda	[%o4]ASI_BLK_AIUS, %d16
6414	faligndata %d4, %d6, %d48
6415	faligndata %d6, %d8, %d50
6416	faligndata %d8, %d10, %d52
6417	faligndata %d10, %d12, %d54
6418	faligndata %d12, %d14, %d56
6419	faligndata %d14, %d16, %d58
6420	faligndata %d16, %d18, %d60
6421	faligndata %d18, %d20, %d62
6422	fmovd	%d20, %d4		! carry the unused tail to next pass
6423	fmovd	%d22, %d6
6424	fmovd	%d24, %d8
6425	fmovd	%d26, %d10
6426	fmovd	%d28, %d12
6427	fmovd	%d30, %d14
6428	stda	%d48, [%i1]ASI_BLK_P
6429	subcc	%i3, 64, %i3
6430	add	%i1, 64, %i1
6431	bgu,pt	%ncc, .ci_unaln_010_loop
6432	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6433	ba	.ci_unaln_done
6434	nop
6435
! Source lies in the doubleword at offset 8 of its 64-byte block.
! Preload offsets 8-56 into %d2-%d14; each pass block-loads 64 more
! bytes into %d16-%d30, faligndata-merges adjacent pairs into %d48-%d62,
! block-stores them, and carries %d18-%d30 into %d2-%d14.
6436.ci_unaln_001:
6437	ldda	[%o4+8]%asi, %d2
6438	ldda	[%o4+16]%asi, %d4
6439	ldda	[%o4+24]%asi, %d6
6440	ldda	[%o4+32]%asi, %d8
6441	ldda	[%o4+40]%asi, %d10
6442	ldda	[%o4+48]%asi, %d12
6443	ldda	[%o4+56]%asi, %d14
6444.ci_unaln_001_loop:
6445	add	%o4, 64, %o4
6446	ldda	[%o4]ASI_BLK_AIUS, %d16
6447	faligndata %d2, %d4, %d48
6448	faligndata %d4, %d6, %d50
6449	faligndata %d6, %d8, %d52
6450	faligndata %d8, %d10, %d54
6451	faligndata %d10, %d12, %d56
6452	faligndata %d12, %d14, %d58
6453	faligndata %d14, %d16, %d60
6454	faligndata %d16, %d18, %d62
6455	fmovd	%d18, %d2		! carry the unused tail to next pass
6456	fmovd	%d20, %d4
6457	fmovd	%d22, %d6
6458	fmovd	%d24, %d8
6459	fmovd	%d26, %d10
6460	fmovd	%d28, %d12
6461	fmovd	%d30, %d14
6462	stda	%d48, [%i1]ASI_BLK_P
6463	subcc	%i3, 64, %i3
6464	add	%i1, 64, %i1
6465	bgu,pt	%ncc, .ci_unaln_001_loop
6466	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6467	ba	.ci_unaln_done
6468	nop
6469
! Source lies in the first doubleword (offset 0) of its 64-byte block, so
! the whole first block is block-loaded into %d0-%d14.  Each pass
! block-loads the next 64 bytes into %d16-%d30, faligndata-merges
! adjacent pairs into %d48-%d62, block-stores them, and moves all of
! %d16-%d30 down to %d0-%d14.  Falls through to .ci_unaln_done.
6470.ci_unaln_000:
6471	ldda	[%o4]ASI_BLK_AIUS, %d0
6472.ci_unaln_000_loop:
6473	add	%o4, 64, %o4
6474	ldda	[%o4]ASI_BLK_AIUS, %d16
6475	faligndata %d0, %d2, %d48
6476	faligndata %d2, %d4, %d50
6477	faligndata %d4, %d6, %d52
6478	faligndata %d6, %d8, %d54
6479	faligndata %d8, %d10, %d56
6480	faligndata %d10, %d12, %d58
6481	faligndata %d12, %d14, %d60
6482	faligndata %d14, %d16, %d62
6483	fmovd	%d16, %d0		! whole new block becomes the carried data
6484	fmovd	%d18, %d2
6485	fmovd	%d20, %d4
6486	fmovd	%d22, %d6
6487	fmovd	%d24, %d8
6488	fmovd	%d26, %d10
6489	fmovd	%d28, %d12
6490	fmovd	%d30, %d14
6491	stda	%d48, [%i1]ASI_BLK_P
6492	subcc	%i3, 64, %i3
6493	add	%i1, 64, %i1
6494	bgu,pt	%ncc, .ci_unaln_000_loop
6495	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6496
6496
! Tail after the faligndata block loops.  %i2 was set up in .ci_unalnsrc
! as (count & 63) + 64.  Moves doublewords with ldda + faligndata, always
! reading one aligned doubleword ahead, and holds back 8 bytes so the
! look-ahead load stays inside the source buffer:
!   %i3 = (%i2 rounded down to 8) - 8   bytes moved by this loop
!   %i2 = (%i2 & 7) + 8                 bytes left for .ci_unaln_short
6497.ci_unaln_done:
6498	! Handle trailing bytes, 64 to 127
6499	! Dest long word aligned, Src not long word aligned
6500	cmp	%i2, 15
6501	bleu	%ncc, .ci_unaln_short
6502
	! NOTE: the andn below sits in the delay slot of the bleu above, so it
	! executes even when the branch is taken.
6503	andn	%i2, 0x7, %i3		! %i3 is multiple of 8
6504	and	%i2, 0x7, %i2		! residue bytes in %i2
6505	add	%i2, 8, %i2
6506	sub	%i3, 8, %i3		! insure we don't load past end of src
6507	andn	%i0, 0x7, %o4		! %o4 has long word aligned src address
6508	add	%i0, %i3, %i0		! advance %i0 to after multiple of 8
6509	ldda	[%o4]%asi, %d0		! fetch partial word
6510.ci_unaln_by8:
6511	ldda	[%o4+8]%asi, %d2
6512	add	%o4, 8, %o4
6513	faligndata %d0, %d2, %d16
6514	subcc	%i3, 8, %i3
6515	std	%d16, [%i1]
6516	fmovd	%d2, %d0		! carry look-ahead doubleword
6517	bgu,pt	%ncc, .ci_unaln_by8
6518	add	%i1, 8, %i1
6519
6519
! If at least 8 bytes remain, move 8 of them: the source is read a byte at
! a time (its alignment is unknown) and merged into two 32-bit words that
! are stored to the destination.  Falls through to .ci_unalnfin.
6520.ci_unaln_short:
6521	cmp	%i2, 8
6522	blt,pt	%ncc, .ci_unalnfin
6523	nop
6524	lduba	[%i0]%asi, %o4
6525	sll	%o4, 24, %o3
6526	lduba	[%i0+1]%asi, %o4
6527	sll	%o4, 16, %o4
6528	or	%o4, %o3, %o3
6529	lduba	[%i0+2]%asi, %o4
6530	sll	%o4, 8, %o4
6531	or	%o4, %o3, %o3
6532	lduba	[%i0+3]%asi, %o4
6533	or	%o4, %o3, %o3
6534	stw	%o3, [%i1]		! first word
6535	lduba	[%i0+4]%asi, %o4
6536	sll	%o4, 24, %o3
6537	lduba	[%i0+5]%asi, %o4
6538	sll	%o4, 16, %o4
6539	or	%o4, %o3, %o3
6540	lduba	[%i0+6]%asi, %o4
6541	sll	%o4, 8, %o4
6542	or	%o4, %o3, %o3
6543	lduba	[%i0+7]%asi, %o4
6544	or	%o4, %o3, %o3
6545	stw	%o3, [%i1+4]		! second word
6546	add	%i0, 8, %i0
6547	add	%i1, 8, %i1
6548	sub	%i2, 8, %i2
! Final 0-7 bytes.  With 4 or more left, one word is assembled from four
! byte loads and stored; any remaining 1-3 bytes are moved one at a time
! at .ci_unaln3x (which the aligned tail code also branches to).
6549.ci_unalnfin:
6550	cmp	%i2, 4
6551	blt,pt	%ncc, .ci_unalnz
6552	tst	%i2			! delay slot: sets cc for the bz at .ci_unalnz
6553	lduba	[%i0]%asi, %o3		! read byte
6554	subcc	%i2, 4, %i2		! reduce count by 4
6555	sll	%o3, 24, %o3		! position
6556	lduba	[%i0+1]%asi, %o4
6557	sll	%o4, 16, %o4		! position
6558	or	%o4, %o3, %o3		! merge
6559	lduba	[%i0+2]%asi, %o4
6560	sll	%o4, 8, %o4		! position
6561	or	%o4, %o3, %o3		! merge
6562	add	%i1, 4, %i1		! advance dst by 4
6563	lduba	[%i0+3]%asi, %o4
6564	add	%i0, 4, %i0		! advance src by 4
6565	or	%o4, %o3, %o4		! merge
6566	bnz,pt	%ncc, .ci_unaln3x	! cc still from the subcc above
6567	stw	%o4, [%i1-4]
6568	ba	.ci_exit
6569	nop
6570.ci_unalnz:
	! The wr in the delay slot is not annulled, so %gsr is restored from
	! %l5 whether the branch is taken or control falls into .ci_unaln3x.
6571	bz,pt	%ncc, .ci_exit
6572	wr	%l5, %g0, %gsr		! restore %gsr
6573.ci_unaln3x:				! Exactly 1, 2, or 3 bytes remain
6574	subcc	%i2, 1, %i2		! reduce count for cc test
6575	lduba	[%i0]%asi, %o4		! load one byte
6576	bz,pt	%ncc, .ci_exit
6577	stb	%o4, [%i1]		! store one byte
6578	lduba	[%i0+1]%asi, %o4	! load second byte
6579	subcc	%i2, 1, %i2
6580	bz,pt	%ncc, .ci_exit
6581	stb	%o4, [%i1+1]		! store second byte
6582	lduba	[%i0+2]%asi, %o4	! load third byte
6583	stb	%o4, [%i1+2]		! store third byte
! Common successful exit of the large-copy path.
!   %l5 = %gsr value saved at .ci_fp_ready
!   %g1 = (%fprs & FPRS_FEF) sampled on entry; non-zero means the FP unit
!         was already in use and its registers were handed to
!         BST_FP_TOSTACK, so they are brought back with BLD_FP_FROMSTACK.
!         Otherwise FZERO is invoked and %fprs gets %g1 (zero) written back.
! %gsr is restored here, on every path, because the unaligned-source code
! changes it with alignaddr and not every path into .ci_exit passes the
! restore at .ci_unalnz.  The write sits in the brnz delay slot (not
! annulled), so it runs on both the restore and the no-restore path, and
! it runs before %fprs is rewritten.
! Finally FPUSED_FLAG is stripped from SAVED_LOFAULT, t_lofault is put
! back, and the routine returns 0.
6584.ci_exit:
6585	brnz	%g1, .ci_fp_restore
6586	wr	%l5, %g0, %gsr		! delay slot: restore %gsr on all paths
6587	FZERO
6588	wr	%g1, %g0, %fprs
6589	ba,pt	%ncc, .ci_ex2
6590	membar	#Sync
6591.ci_fp_restore:
6592	BLD_FP_FROMSTACK(%o4)
6593.ci_ex2:
6594	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
6595	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
6596	ret
6597	restore %g0, 0, %o0		! delay slot: pop window, return value 0
6598
! Error exit: if the current thread has a t_copyops vector, tail-jump to
! its CP_COPYIN entry; otherwise return -1 in %o0.  How control reaches
! this label is not visible in this part of the file.
6599.copyin_err:
6600	ldn	[THREAD_REG + T_COPYOPS], %o4
6601	brz	%o4, 2f			! no copyops installed
6602	nop
6603	ldn	[%o4 + CP_COPYIN], %g2
6604	jmp	%g2
6605	nop
66062:
6607	retl
6608	mov	-1, %o0			! delay slot: return -1
6609
6610#else	/* NIAGARA_IMPL */
6611.do_copyin:
6612	!
6613	! Check the length and bail if zero.
6614	!
6615	tst	%o2
6616	bnz,pt	%ncc, 1f
6617	nop
6618	retl
6619	clr	%o0
66201:
6621	sethi	%hi(copyio_fault), %o4
6622	or	%o4, %lo(copyio_fault), %o4
6623	sethi	%hi(copyio_fault_nowindow), %o3
6624	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
6625	or	%o3, %lo(copyio_fault_nowindow), %o3
6626	membar	#Sync
6627	stn	%o3, [THREAD_REG + T_LOFAULT]
6628
6629	mov	%o0, SAVE_SRC
6630	mov	%o1, SAVE_DST
6631	mov	%o2, SAVE_COUNT
6632
6633	!
6634	! Check to see if we're more than SMALL_LIMIT.
6635	!
6636	subcc	%o2, SMALL_LIMIT, %o3
6637	bgu,a,pt %ncc, .dci_ns
6638	or	%o0, %o1, %o3
6639	!
6640	! What was previously ".small_copyin"
6641	!
6642.dcibcp:
6643	sub	%g0, %o2, %o3		! setup for copy loop
6644	add	%o0, %o2, %o0
6645	add	%o1, %o2, %o1
6646	ba,pt	%ncc, .dcicl
6647	lduba	[%o0 + %o3]ASI_USER, %o4
6648	!
6649	! %o0 and %o1 point at the end and remain pointing at the end
6650	! of their buffers. We pull things out by adding %o3 (which is
6651	! the negation of the length) to the buffer end which gives us
6652	! the current location in the buffers. By incrementing %o3 we walk
6653	! through both buffers without having to bump each buffer's
6654	! pointer. A very fast 4 instruction loop.
6655	!
6656	.align 16
6657.dcicl:
6658	stb	%o4, [%o1 + %o3]
6659	inccc	%o3
6660	bl,a,pt %ncc, .dcicl
6661	lduba	[%o0 + %o3]ASI_USER, %o4
6662	!
6663	! We're done. Go home.
6664	!
6665	membar	#Sync
6666	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
6667	retl
6668	clr	%o0
6669	!
6670	! Try aligned copies from here.
6671	!
6672.dci_ns:
6673	!
6674	! See if we're single byte aligned. If we are, check the
6675	! limit for single byte copies. If we're smaller, or equal,
6676	! bounce to the byte for byte copy loop. Otherwise do it in
6677	! HW (if enabled).
6678	!
6679	btst	1, %o3
6680	bz,a,pt	%icc, .dcih8
6681	btst	7, %o3
6682	!
6683	! We're single byte aligned.
6684	!
6685	sethi	%hi(hw_copy_limit_1), %o3
6686	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
6687	!
6688	! Is HW copy on? If not do everything byte for byte.
6689	!
6690	tst	%o3
6691	bz,pn	%icc, .dcibcp
6692	subcc	%o3, %o2, %o3
6693	!
6694	! Are we bigger than the HW limit? If not
6695	! go to byte for byte.
6696	!
6697	bge,pt	%ncc, .dcibcp
6698	nop
6699	!
6700	! We're big enough and copy is on. Do it with HW.
6701	!
6702	ba,pt	%ncc, .big_copyin
6703	nop
6704.dcih8:
6705	!
6706	! 8 byte aligned?
6707	!
6708	bnz,a	%ncc, .dcih4
6709	btst	3, %o3
6710	!
6711	! We're eight byte aligned.
6712	!
6713	sethi	%hi(hw_copy_limit_8), %o3
6714	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
6715	!
6716	! Is HW assist on? If not, do it with the aligned copy.
6717	!
6718	tst	%o3
6719	bz,pn	%icc, .dcis8
6720	subcc	%o3, %o2, %o3
6721	bge	%ncc, .dcis8
6722	nop
6723	ba,pt	%ncc, .big_copyin
6724	nop
6725.dcis8:
6726	!
6727	! Housekeeping for copy loops. Uses same idea as in the byte for
6728	! byte copy loop above.
6729	!
6730	add	%o0, %o2, %o0
6731	add	%o1, %o2, %o1
6732	sub	%g0, %o2, %o3
6733	ba,pt	%ncc, .didebc
6734	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
6735	!
6736	! 4 byte aligned?
6737	!
6738.dcih4:
6739	bnz	%ncc, .dcih2
6740	sethi	%hi(hw_copy_limit_4), %o3
6741	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
6742	!
6743	! Is HW assist on? If not, do it with the aligned copy.
6744	!
6745	tst	%o3
6746	bz,pn	%icc, .dcis4
6747	subcc	%o3, %o2, %o3
6748	!
6749	! We're negative if our size is larger than hw_copy_limit_4.
6750	!
6751	bge	%ncc, .dcis4
6752	nop
6753	ba,pt	%ncc, .big_copyin
6754	nop
6755.dcis4:
6756	!
6757	! Housekeeping for copy loops. Uses same idea as in the byte
6758	! for byte copy loop above.
6759	!
6760	add	%o0, %o2, %o0
6761	add	%o1, %o2, %o1
6762	sub	%g0, %o2, %o3
6763	ba,pt	%ncc, .didfbc
6764	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
6765.dcih2:
6766	!
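6767	! We're two byte aligned. NOTE(review): the delay slot at .dcih4 is a
6768	! sethi, which sets no condition codes, so the bleu below still sees
	! the result of the "btst 3" that sent us here (C and Z both clear)
	! and does not appear to be taken; the "smallness" check looks
	! vestigial. Confirm before relying on it.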
6769	!
6770	bleu,pt	%ncc, .dcis2
6771	sethi	%hi(hw_copy_limit_2), %o3
6772	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
6773	!
6774	! Is HW assist on? If not, do it with the aligned copy.
6775	!
6776	tst	%o3
6777	bz,pn	%icc, .dcis2
6778	subcc	%o3, %o2, %o3
6779	!
6780	! Are we larger than the HW limit?
6781	!
6782	bge	%ncc, .dcis2
6783	nop
6784	!
6785	! HW assist is on and we're large enough to use it.
6786	!
6787	ba,pt	%ncc, .big_copyin
6788	nop
6789	!
6790	! Housekeeping for copy loops. Uses same idea as in the byte
6791	! for byte copy loop above.
6792	!
6793.dcis2:
6794	add	%o0, %o2, %o0
6795	add	%o1, %o2, %o1
6796	sub	%g0, %o2, %o3
6797	ba,pt	%ncc, .didtbc
6798	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
6799	!
6800.small_copyin:
6801	!
6802	! Why are we doing this AGAIN? There are certain conditions in
6803	! big copyin that will cause us to forgo the HW assisted copies
6804	! and bounce back to a non-hw assisted copy. This dispatches
6805	! those copies. Note that we branch around this in the main line
6806	! code.
6807	!
6808	! We make no check for limits or HW enablement here. We've
6809	! already been told that we're a poster child so just go off
6810	! and do it.
6811	!
6812	or	%o0, %o1, %o3
6813	btst	1, %o3
6814	bnz	%icc, .dcibcp		! Most likely
6815	btst	7, %o3
6816	bz	%icc, .dcis8
6817	btst	3, %o3
6818	bz	%icc, .dcis4
6819	nop
6820	ba,pt	%ncc, .dcis2
6821	nop
6822	!
6823	! Eight byte aligned copies. A steal from the original .small_copyin
6824	! with modifications. %o2 is number of 8 byte chunks to copy. When
6825	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
6826	! to copy.
6827	!
6828	.align 32
6829.didebc:
6830	ldxa	[%o0 + %o3]ASI_USER, %o4
6831	deccc	%o2
6832	stx	%o4, [%o1 + %o3]
6833	bg,pt	%ncc, .didebc
6834	addcc	%o3, 8, %o3
6835	!
6836	! End of copy loop. Most 8 byte aligned copies end here.
6837	!
6838	bz,pt	%ncc, .dcifh
6839	nop
6840	!
6841	! Something is left. Do it byte for byte.
6842	!
6843	ba,pt	%ncc, .dcicl
6844	lduba	[%o0 + %o3]ASI_USER, %o4
6845	!
6846	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
6847	!
6848	.align 32
6849.didfbc:
6850	lduwa	[%o0 + %o3]ASI_USER, %o4
6851	deccc	%o2
6852	st	%o4, [%o1 + %o3]
6853	bg,pt	%ncc, .didfbc
6854	addcc	%o3, 4, %o3
6855	!
6856	! End of copy loop. Most 4 byte aligned copies end here.
6857	!
6858	bz,pt	%ncc, .dcifh
6859	nop
6860	!
6861	! Something is left. Do it byte for byte.
6862	!
6863	ba,pt	%ncc, .dcicl
6864	lduba	[%o0 + %o3]ASI_USER, %o4
6865	!
6866	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
6867	! copy.
6868	!
6869	.align 32
6870.didtbc:
6871	lduha	[%o0 + %o3]ASI_USER, %o4
6872	deccc	%o2
6873	sth	%o4, [%o1 + %o3]
6874	bg,pt	%ncc, .didtbc
6875	addcc	%o3, 2, %o3
6876	!
6877	! End of copy loop. Most 2 byte aligned copies end here.
6878	!
6879	bz,pt	%ncc, .dcifh
6880	nop
6881	!
6882	! Deal with the last byte
6883	!
6884	lduba	[%o0 + %o3]ASI_USER, %o4
6885	stb	%o4, [%o1 + %o3]
6886.dcifh:
6887	membar	#Sync
6888	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
6889	retl
6890	clr	%o0
6891
6892.big_copyin:
6893	! We're going off to do a block copy.
6894	! Switch fault handlers and grab a window. We
6895	! don't do a membar #Sync since we've done only
6896	! kernel data to this point.
6897	stn	%o4, [THREAD_REG + T_LOFAULT]
6898
6899	! Copyins that reach here are larger than 256 bytes, since
6900	! hw_copy_limit_1 is set to 256. Never set this limit to less
6901	! than 128 bytes.
6902	save	%sp, -SA(MINFRAME), %sp
6903.do_blockcopyin:
6904
6905	! Swap src/dst since the code below is memcpy code
6906	! and memcpy/bcopy have different calling sequences
6907	mov	%i1, %i5
6908	mov	%i0, %i1
6909	mov	%i5, %i0
6910
6911	! Block (64 bytes) align the destination.
6912	andcc	%i0, 0x3f, %i3		! is dst block aligned
6913	bz	%ncc, copyin_blalign	! dst already block aligned
6914	sub	%i3, 0x40, %i3
6915	neg	%i3			! bytes till dst 64 bytes aligned
6916	sub	%i2, %i3, %i2		! update i2 with new count
6917
6918	! Based on source and destination alignment do
6919	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
6920
6921	! Is dst & src 8B aligned
6922	or	%i0, %i1, %o2
6923	andcc	%o2, 0x7, %g0
6924	bz	%ncc, .ci_alewdcp
6925	nop
6926
6927	! Is dst & src 4B aligned
6928	andcc	%o2, 0x3, %g0
6929	bz	%ncc, .ci_alwdcp
6930	nop
6931
6932	! Is dst & src 2B aligned
6933	andcc	%o2, 0x1, %g0
6934	bz	%ncc, .ci_alhlfwdcp
6935	nop
6936
6937	! 1B aligned
69381:	lduba	[%i1]ASI_USER, %o2
6939	stb	%o2, [%i0]
6940	inc	%i1
6941	deccc	%i3
6942	bgu,pt	%ncc, 1b
6943	inc	%i0
6944
6945	ba	copyin_blalign
6946	nop
6947
6948	! dst & src 4B aligned
6949.ci_alwdcp:
6950	lda	[%i1]ASI_USER, %o2
6951	st	%o2, [%i0]
6952	add	%i1, 0x4, %i1
6953	subcc	%i3, 0x4, %i3
6954	bgu,pt	%ncc, .ci_alwdcp
6955	add	%i0, 0x4, %i0
6956
6957	ba	copyin_blalign
6958	nop
6959
6960	! dst & src 2B aligned
6961.ci_alhlfwdcp:
6962	lduha	[%i1]ASI_USER, %o2
6963	stuh	%o2, [%i0]
6964	add	%i1, 0x2, %i1
6965	subcc	%i3, 0x2, %i3
6966	bgu,pt	%ncc, .ci_alhlfwdcp
6967	add	%i0, 0x2, %i0
6968
6969	ba	copyin_blalign
6970	nop
6971
6972	! dst & src 8B aligned
6973.ci_alewdcp:
6974	ldxa	[%i1]ASI_USER, %o2
6975	stx	%o2, [%i0]
6976	add	%i1, 0x8, %i1
6977	subcc	%i3, 0x8, %i3
6978	bgu,pt	%ncc, .ci_alewdcp
6979	add	%i0, 0x8, %i0
6980
6981copyin_blalign:
6982	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
6983	sub	%i2, %i3, %i2		! Residue bytes in %i2
6984
6985	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
6986
6987	andcc	%i1, 0xf, %o2		! is src quadword aligned
6988	bz,pn	%xcc, .ci_blkcpy	! src offset in %o2 (last 4-bits)
6989	nop
6990	cmp	%o2, 0x8
6991	bg	.ci_upper_double
6992	nop
6993	bl	.ci_lower_double
6994	nop
6995
6996	! Falls through when source offset is equal to 8 i.e.
6997	! source is double word aligned.
6998	! In this case no shift/merge of data is required
6999
7000	sub	%i1, %o2, %i1		! align the src at 16 bytes.
7001	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
7002	prefetcha [%l0]ASI_USER, #one_read
7003	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7004	add	%l0, 0x40, %l0
7005.ci_loop0:
7006	add	%i1, 0x10, %i1
7007	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7008
7009	prefetcha [%l0]ASI_USER, #one_read
7010
7011	stxa	%l3, [%i0+0x0]%asi
7012	stxa	%l4, [%i0+0x8]%asi
7013
7014	add	%i1, 0x10, %i1
7015	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7016
7017	stxa	%l5, [%i0+0x10]%asi
7018	stxa	%l2, [%i0+0x18]%asi
7019
7020	add	%i1, 0x10, %i1
7021	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7022
7023	stxa	%l3, [%i0+0x20]%asi
7024	stxa	%l4, [%i0+0x28]%asi
7025
7026	add	%i1, 0x10, %i1
7027	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7028
7029	stxa	%l5, [%i0+0x30]%asi
7030	stxa	%l2, [%i0+0x38]%asi
7031
7032	add	%l0, 0x40, %l0
7033	subcc	%i3, 0x40, %i3
7034	bgu,pt	%xcc, .ci_loop0
7035	add	%i0, 0x40, %i0
7036	ba	.ci_blkdone
7037	add	%i1, %o2, %i1		! increment the source by src offset
7038					! the src offset was stored in %o2
7039
7040.ci_lower_double:
7041
7042	sub	%i1, %o2, %i1		! align the src at 16 bytes.
7043	sll	%o2, 3, %o0		! %o0 left shift
7044	mov	0x40, %o1
7045	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
7046	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
7047	prefetcha [%l0]ASI_USER, #one_read
7048	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l2
7049							! and %l3 has complete
7050							! data
7051	add	%l0, 0x40, %l0
7052.ci_loop1:
7053	add	%i1, 0x10, %i1
7054	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has partial data
7055							! for this read.
7056	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
7057							! into %l2 and %l3
7058
7059	prefetcha [%l0]ASI_USER, #one_read
7060
7061	stxa	%l2, [%i0+0x0]%asi
7062	stxa	%l3, [%i0+0x8]%asi
7063
7064	add	%i1, 0x10, %i1
7065	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7066	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
7067							! %l4 from previous read
7068							! into %l4 and %l5
7069	stxa	%l4, [%i0+0x10]%asi
7070	stxa	%l5, [%i0+0x18]%asi
7071
7072	! Repeat the same for next 32 bytes.
7073
7074	add	%i1, 0x10, %i1
7075	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7076	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
7077
7078	stxa	%l2, [%i0+0x20]%asi
7079	stxa	%l3, [%i0+0x28]%asi
7080
7081	add	%i1, 0x10, %i1
7082	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7083	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
7084
7085	stxa	%l4, [%i0+0x30]%asi
7086	stxa	%l5, [%i0+0x38]%asi
7087
7088	add	%l0, 0x40, %l0
7089	subcc	%i3, 0x40, %i3
7090	bgu,pt	%xcc, .ci_loop1
7091	add	%i0, 0x40, %i0
7092	ba	.ci_blkdone
7093	add	%i1, %o2, %i1		! increment the source by src offset
7094					! the src offset was stored in %o2
7095
7096.ci_upper_double:
7097
7098	sub	%i1, %o2, %i1		! align the src at 16 bytes.
7099	sub	%o2, 0x8, %o0
7100	sll	%o0, 3, %o0		! %o0 left shift
7101	mov	0x40, %o1
7102	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
7103	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
7104	prefetcha [%l0]ASI_USER, #one_read
7105	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l3
7106							! for this read and
7107							! no data in %l2
7108	add	%l0, 0x40, %l0
7109.ci_loop2:
7110	add	%i1, 0x10, %i1
7111	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has complete data
7112							! and %l5 has partial
7113	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
7114							! into %l3 and %l4
7115	prefetcha [%l0]ASI_USER, #one_read
7116
7117	stxa	%l3, [%i0+0x0]%asi
7118	stxa	%l4, [%i0+0x8]%asi
7119
7120	add	%i1, 0x10, %i1
7121	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7122	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
7123							! %l5 from previous read
7124							! into %l5 and %l2
7125
7126	stxa	%l5, [%i0+0x10]%asi
7127	stxa	%l2, [%i0+0x18]%asi
7128
7129	! Repeat the same for next 32 bytes.
7130
7131	add	%i1, 0x10, %i1
7132	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7133	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
7134
7135	stxa	%l3, [%i0+0x20]%asi
7136	stxa	%l4, [%i0+0x28]%asi
7137
7138	add	%i1, 0x10, %i1
7139	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7140	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
7141
7142	stxa	%l5, [%i0+0x30]%asi
7143	stxa	%l2, [%i0+0x38]%asi
7144
7145	add	%l0, 0x40, %l0
7146	subcc	%i3, 0x40, %i3
7147	bgu,pt	%xcc, .ci_loop2
7148	add	%i0, 0x40, %i0
7149	ba	.ci_blkdone
7150	add	%i1, %o2, %i1		! increment the source by src offset
7151					! the src offset was stored in %o2
7152
7153
7154	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
7155.ci_blkcpy:
7156
7157	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
7158	prefetcha [%o0]ASI_USER, #one_read
7159	add	%o0, 0x40, %o0
71601:
7161	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
7162	add	%i1, 0x10, %i1
7163	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7164	add	%i1, 0x10, %i1
7165
7166	prefetcha [%o0]ASI_USER, #one_read
7167
7168	stxa	%l0, [%i0+0x0]%asi
7169
7170	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7171	add	%i1, 0x10, %i1
7172	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
7173	add	%i1, 0x10, %i1
7174
7175	stxa	%l1, [%i0+0x8]%asi
7176	stxa	%l2, [%i0+0x10]%asi
7177	stxa	%l3, [%i0+0x18]%asi
7178	stxa	%l4, [%i0+0x20]%asi
7179	stxa	%l5, [%i0+0x28]%asi
7180	stxa	%l6, [%i0+0x30]%asi
7181	stxa	%l7, [%i0+0x38]%asi
7182
7183	add	%o0, 0x40, %o0
7184	subcc	%i3, 0x40, %i3
7185	bgu,pt	%xcc, 1b
7186	add	%i0, 0x40, %i0
7187
7188.ci_blkdone:
7189	membar	#Sync
7190
7191	brz,pt	%i2, .copyin_exit
7192	nop
7193
7194	! Handle trailing bytes
7195	cmp	%i2, 0x8
7196	blu,pt	%ncc, .ci_residue
7197	nop
7198
7199	! Can we do some 8B ops
7200	or	%i1, %i0, %o2
7201	andcc	%o2, 0x7, %g0
7202	bnz	%ncc, .ci_last4
7203	nop
7204
7205	! Do 8byte ops as long as possible
7206.ci_last8:
7207	ldxa	[%i1]ASI_USER, %o2
7208	stx	%o2, [%i0]
7209	add	%i1, 0x8, %i1
7210	sub	%i2, 0x8, %i2
7211	cmp	%i2, 0x8
7212	bgu,pt	%ncc, .ci_last8
7213	add	%i0, 0x8, %i0
7214
7215	brz,pt	%i2, .copyin_exit
7216	nop
7217
7218	ba	.ci_residue
7219	nop
7220
7221.ci_last4:
7222	! Can we do 4B ops
7223	andcc	%o2, 0x3, %g0
7224	bnz	%ncc, .ci_last2
7225	nop
72261:
7227	lda	[%i1]ASI_USER, %o2
7228	st	%o2, [%i0]
7229	add	%i1, 0x4, %i1
7230	sub	%i2, 0x4, %i2
7231	cmp	%i2, 0x4
7232	bgu,pt	%ncc, 1b
7233	add	%i0, 0x4, %i0
7234
7235	brz,pt	%i2, .copyin_exit
7236	nop
7237
7238	ba	.ci_residue
7239	nop
7240
7241.ci_last2:
7242	! Can we do 2B ops
7243	andcc	%o2, 0x1, %g0
7244	bnz	%ncc, .ci_residue
7245	nop
7246
72471:
7248	lduha	[%i1]ASI_USER, %o2
7249	stuh	%o2, [%i0]
7250	add	%i1, 0x2, %i1
7251	sub	%i2, 0x2, %i2
7252	cmp	%i2, 0x2
7253	bgu,pt	%ncc, 1b
7254	add	%i0, 0x2, %i0
7255
7256	brz,pt	%i2, .copyin_exit
7257	nop
7258
7259	! Copy the residue as byte copy
7260.ci_residue:
7261	lduba	[%i1]ASI_USER, %i4
7262	stb	%i4, [%i0]
7263	inc	%i1
7264	deccc	%i2
7265	bgu,pt	%xcc, .ci_residue
7266	inc	%i0
7267
7268.copyin_exit:
7269	membar	#Sync
7270	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
7271	ret
7272	restore	%g0, 0, %o0
	!
	! copyin error exit. If the thread has a t_copyops vector, jump to
	! its CP_COPYIN entry; otherwise return -1 to the caller.
	! NOTE(review): presumably reached from the copyio fault path through
	! the handler address copyin loads before .do_copyin - the code that
	! gets us here is not visible in this part of the file.
	!
7273.copyin_err:
7274	ldn	[THREAD_REG + T_COPYOPS], %o4	! %o4 = curthread->t_copyops
7275	brz	%o4, 2f				! no copyops vector installed
7276	nop
7277	ldn	[%o4 + CP_COPYIN], %g2		! %g2 = copyops copyin entry
7278	jmp	%g2				! jump there, does not come back
7279	nop
72802:
7281	retl
7282	mov	-1, %o0				! delay slot: return (-1)
7283#endif	/* NIAGARA_IMPL */
7284	SET_SIZE(copyin)
7285
7286#endif	/* lint */
7287
7288#ifdef	lint
7289
7290/*ARGSUSED*/
7291int
7292xcopyin(const void *uaddr, void *kaddr, size_t count)
7293{ return (0); }
7294
7295#else	/* lint */
7296
	/*
	 * xcopyin enters the common copyin body at .do_copyin after loading
	 * the address of .xcopyin_err into REAL_LOFAULT.
	 * .xcopyin_err jumps to the CP_XCOPYIN entry of the thread's
	 * t_copyops vector when one is installed, and otherwise returns the
	 * value found in %g1.
	 * NOTE(review): %g1 is presumably the errno supplied by the fault
	 * path; the code that sets it is not visible here.
	 */
7297	ENTRY(xcopyin)
7298	sethi	%hi(.xcopyin_err), REAL_LOFAULT
7299	b	.do_copyin
7300	or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT	! delay slot
7301.xcopyin_err:
7302	ldn	[THREAD_REG + T_COPYOPS], %o4	! %o4 = curthread->t_copyops
7303	brz	%o4, 2f				! no copyops vector installed
7304	nop
7305	ldn	[%o4 + CP_XCOPYIN], %g2		! %g2 = copyops xcopyin entry
7306	jmp	%g2				! jump there, does not come back
7307	nop
73082:
7309	retl
7310	mov	%g1, %o0			! delay slot: return value in %g1
7311	SET_SIZE(xcopyin)
7312
7313#endif	/* lint */
7314
7315#ifdef	lint
7316
7317/*ARGSUSED*/
7318int
7319xcopyin_little(const void *uaddr, void *kaddr, size_t count)
7320{ return (0); }
7321
7322#else	/* lint */
7323
	/*
	 * xcopyin_little copies count (%o2) bytes from uaddr (%o0) to kaddr
	 * (%o1) one byte at a time, reading the source from its last byte
	 * down to its first while writing the destination from its first
	 * byte up, so the byte order is reversed. Source loads go through
	 * ASI_AIUSL.
	 * The previous t_lofault value is kept in %o5 and restored on both
	 * exits. Returns 0 on success; .little_err returns the value in %g1.
	 */
7324	ENTRY(xcopyin_little)
7325	sethi	%hi(.little_err), %o4
7326	ldn	[THREAD_REG + T_LOFAULT], %o5	! %o5 = previous t_lofault
7327	or	%o4, %lo(.little_err), %o4
7328	membar	#Sync				! sync error barrier
7329	stn	%o4, [THREAD_REG + T_LOFAULT]	! install .little_err
7330
7331	subcc	%g0, %o2, %o3		! %o3 = -count, Z set if count == 0
7332	add	%o0, %o2, %o0		! %o0 = src + count
7333	bz,pn	%ncc, 2f		! check for zero bytes
7334	sub	%o2, 1, %o4		! delay slot: %o4 = count - 1
7335	add	%o0, %o4, %o0		! start w/last byte
7336	add	%o1, %o2, %o1		! %o1 = dst + count
7337	lduba	[%o0+%o3]ASI_AIUSL, %o4	! %o0 + %o3 = src + count - 1
7338
73391:	stb	%o4, [%o1+%o3]		! dst index walks upward with %o3
7340	inccc	%o3			! carry sets when %o3 reaches zero
7341	sub	%o0, 2, %o0		! get next byte
7342	bcc,a,pt %ncc, 1b		! carry clear: more bytes to copy
7343	lduba	[%o0+%o3]ASI_AIUSL, %o4	! annulled when the loop ends
7344
73452:	membar	#Sync				! sync error barrier
7346	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
7347	retl
7348	mov	%g0, %o0		! return (0)
7349
7350.little_err:
7351	membar	#Sync				! sync error barrier
7352	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
7353	retl
7354	mov	%g1, %o0		! delay slot: return value in %g1
7355	SET_SIZE(xcopyin_little)
7356
7357#endif	/* lint */
7358
7359
7360/*
7361 * Copy a block of storage - must not overlap (from + len <= to).
7362 * No fault handler installed (to be called under on_fault())
7363 */
7364#if defined(lint)
7365
7366/* ARGSUSED */
7367void
7368copyin_noerr(const void *ufrom, void *kto, size_t count)
7369{}
7370
7371#else	/* lint */
7372
	/*
	 * copyin_noerr enters the common copyin body at .do_copyin with
	 * REAL_LOFAULT pointing at .copyio_noerr. .copyio_noerr simply jumps
	 * to the address held in SAVED_LOFAULT.
	 * NOTE(review): SAVED_LOFAULT presumably holds the t_lofault value
	 * the caller had installed (see the on_fault() remark above) - confirm
	 * against the fault path, which is not visible here.
	 */
7373	ENTRY(copyin_noerr)
7374	sethi	%hi(.copyio_noerr), REAL_LOFAULT
7375	b	.do_copyin
7376	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT	! delay slot
7377.copyio_noerr:
7378	jmp	SAVED_LOFAULT		! hand control to the saved handler
7379	nop
7380	SET_SIZE(copyin_noerr)
7381
7382#endif /* lint */
7383
7384/*
7385 * Copy a block of storage - must not overlap (from + len <= to).
7386 * No fault handler installed (to be called under on_fault())
7387 */
7388
7389#if defined(lint)
7390
7391/* ARGSUSED */
7392void
7393copyout_noerr(const void *kfrom, void *uto, size_t count)
7394{}
7395
7396#else	/* lint */
7397
	/*
	 * copyout_noerr enters the common copyout body at .do_copyout with
	 * REAL_LOFAULT pointing at .copyio_noerr, the handler it shares with
	 * copyin_noerr.
	 */
7398	ENTRY(copyout_noerr)
7399	sethi	%hi(.copyio_noerr), REAL_LOFAULT
7400	b	.do_copyout
7401	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT	! delay slot
7402	SET_SIZE(copyout_noerr)
7403
7404#endif /* lint */
7405
7406#if defined(lint)
7407
7408int use_hw_bcopy = 1;
7409int use_hw_bzero = 1;
7410uint_t hw_copy_limit_1 = 0x100;
7411uint_t hw_copy_limit_2 = 0x200;
7412uint_t hw_copy_limit_4 = 0x400;
7413uint_t hw_copy_limit_8 = 0x400;
7414
7415#else /* !lint */
7416
	/*
	 * Tunables, one 32-bit word each.
	 * use_hw_bzero is read by bzero; when it is zero the block
	 * initializing stores are not used.
	 * hw_copy_limit_1/2/4/8 are read by the copyin dispatch code for
	 * copies whose source and destination are 1, 2, 4 or 8 byte aligned.
	 * A zero limit, or a length that does not exceed the limit, keeps the
	 * copy on the non-block path.
	 * NOTE(review): use_hw_bcopy has no reader in this part of the file;
	 * presumably it gates the bcopy block path - confirm.
	 */
7417	.align	4
7418	DGDEF(use_hw_bcopy)
7419	.word	1
7420	DGDEF(use_hw_bzero)
7421	.word	1
7422	DGDEF(hw_copy_limit_1)
7423	.word	0x100
7424	DGDEF(hw_copy_limit_2)
7425	.word	0x200
7426	DGDEF(hw_copy_limit_4)
7427	.word	0x400
7428	DGDEF(hw_copy_limit_8)
7429	.word	0x400
7430
7431	.align	64
7432	.section ".text"
7433#endif /* !lint */
7434
7435/*
7436 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
7437 * at least 256 bytes in length using Niagara's block stores/quad store.
7438 * If the criteria for using this routine are not met then it calls bzero
7439 * and returns 1.  Otherwise 0 is returned indicating success.
7440 * Caller is responsible for ensuring use_hw_bzero is true and that
7441 * kpreempt_disable() has been called.
7442 */
7443#ifdef lint
7444/*ARGSUSED*/
7445int
7446hwblkclr(void *addr, size_t len)
7447{
7448	return(0);
7449}
7450#else /* lint */
	! hwblkclr(addr, len): zero a region with stores through
	! ASI_BLK_INIT_ST_QUAD_LDD_P when addr is 64 byte aligned and len is
	! a multiple of 64 that is at least 256. Otherwise call bzero.
	! Returns 0 when the block stores were used, 1 when bzero was called.
	!
7451	! %i0 - start address
7452	! %i1 - length of region (multiple of 64)
7453
7454	ENTRY(hwblkclr)
7455	save	%sp, -SA(MINFRAME), %sp
7456
7457	! Must be block-aligned
7458	andcc	%i0, 0x3f, %g0
7459	bnz,pn	%ncc, 1f
7460	nop
7461
7462	! ... and must be 256 bytes or more
7463	cmp	%i1, 0x100
7464	blu,pn	%ncc, 1f
7465	nop
7466
7467	! ... and length must be a multiple of 64
7468	andcc	%i1, 0x3f, %g0
7469	bz,pn	%ncc, .pz_doblock
7470	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi	! delay slot: runs on both paths
7471
74721:	! punt, call bzero but notify the caller that bzero was used
7473	mov	%i0, %o0
7474	call	bzero
7475	mov	%i1, %o1		! delay slot: second argument
7476	ret
7477	restore	%g0, 1, %o0	! return (1) - did not use block operations
7478
7479	! Already verified that there are at least 256 bytes to set
	! Each pass clears 256 bytes (four 64 byte blocks). The first
	! doubleword of every block is stored before the rest of the blocks
	! are filled in.
7480.pz_doblock:
7481	stxa	%g0, [%i0+0x0]%asi
7482	stxa	%g0, [%i0+0x40]%asi
7483	stxa	%g0, [%i0+0x80]%asi
7484	stxa	%g0, [%i0+0xc0]%asi
7485
7486	stxa	%g0, [%i0+0x8]%asi
7487	stxa	%g0, [%i0+0x10]%asi
7488	stxa	%g0, [%i0+0x18]%asi
7489	stxa	%g0, [%i0+0x20]%asi
7490	stxa	%g0, [%i0+0x28]%asi
7491	stxa	%g0, [%i0+0x30]%asi
7492	stxa	%g0, [%i0+0x38]%asi
7493
7494	stxa	%g0, [%i0+0x48]%asi
7495	stxa	%g0, [%i0+0x50]%asi
7496	stxa	%g0, [%i0+0x58]%asi
7497	stxa	%g0, [%i0+0x60]%asi
7498	stxa	%g0, [%i0+0x68]%asi
7499	stxa	%g0, [%i0+0x70]%asi
7500	stxa	%g0, [%i0+0x78]%asi
7501
7502	stxa	%g0, [%i0+0x88]%asi
7503	stxa	%g0, [%i0+0x90]%asi
7504	stxa	%g0, [%i0+0x98]%asi
7505	stxa	%g0, [%i0+0xa0]%asi
7506	stxa	%g0, [%i0+0xa8]%asi
7507	stxa	%g0, [%i0+0xb0]%asi
7508	stxa	%g0, [%i0+0xb8]%asi
7509
7510	stxa	%g0, [%i0+0xc8]%asi
7511	stxa	%g0, [%i0+0xd0]%asi
7512	stxa	%g0, [%i0+0xd8]%asi
7513	stxa	%g0, [%i0+0xe0]%asi
7514	stxa	%g0, [%i0+0xe8]%asi
7515	stxa	%g0, [%i0+0xf0]%asi
7516	stxa	%g0, [%i0+0xf8]%asi
7517
7518	sub	%i1, 0x100, %i1
7519	cmp	%i1, 0x100
7520	bgu,pt	%ncc, .pz_doblock	! repeat only while more than 256 remain
7521	add	%i0, 0x100, %i0		! delay slot: always advance address
7522
75232:
7524	! Check if at least 64 bytes remain to set
7525	cmp	%i1,0x40
7526	blu	%ncc, .pz_finish
7527	nop
7528
75293:
7530	stxa	%g0, [%i0+0x0]%asi
7531	stxa	%g0, [%i0+0x8]%asi
7532	stxa	%g0, [%i0+0x10]%asi
7533	stxa	%g0, [%i0+0x18]%asi
7534	stxa	%g0, [%i0+0x20]%asi
7535	stxa	%g0, [%i0+0x28]%asi
7536	stxa	%g0, [%i0+0x30]%asi
7537	stxa	%g0, [%i0+0x38]%asi
7538
7539	subcc	%i1, 0x40, %i1
7540	bgu,pt	%ncc, 3b		! one 64 byte block per pass
7541	add	%i0, 0x40, %i0		! delay slot: always advance address
7542
7543.pz_finish:
7544	membar	#Sync
7545	ret
7546	restore	%g0, 0, %o0		! return (0) - used block operations
7547	SET_SIZE(hwblkclr)
7548#endif	/* lint */
7549
7550#ifdef	lint
7551/* Copy 32 bytes of data from src to dst using physical addresses */
7552/*ARGSUSED*/
7553void
7554hw_pa_bcopy32(uint64_t src, uint64_t dst)
7555{}
7556#else	/*!lint */
7557
7558	/*
7559	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
7560	 * using physical addresses.
	 *
	 * All accesses go through ASI_MEM as four 8-byte loads followed by
	 * four 8-byte stores. PSTATE_IE is cleared for the duration and the
	 * entry %pstate value is written back in the delay slot of the
	 * return. Uses %g1, %g2 and %o2-%o5 as scratch and advances %o0 and
	 * %o1 by 24.
7561	 */
7562	ENTRY_NP(hw_pa_bcopy32)
7563	rdpr	%pstate, %g1		! %g1 = %pstate on entry
7564	andn	%g1, PSTATE_IE, %g2
7565	wrpr	%g0, %g2, %pstate	! same state with PSTATE_IE cleared
7566
7567	ldxa	[%o0]ASI_MEM, %o2
7568	add	%o0, 8, %o0
7569	ldxa	[%o0]ASI_MEM, %o3
7570	add	%o0, 8, %o0
7571	ldxa	[%o0]ASI_MEM, %o4
7572	add	%o0, 8, %o0
7573	ldxa	[%o0]ASI_MEM, %o5
7574	stxa	%o2, [%o1]ASI_MEM
7575	add	%o1, 8, %o1
7576	stxa	%o3, [%o1]ASI_MEM
7577	add	%o1, 8, %o1
7578	stxa	%o4, [%o1]ASI_MEM
7579	add	%o1, 8, %o1
7580	stxa	%o5, [%o1]ASI_MEM
7581
7582	membar	#Sync
7583	retl
7584	wrpr	%g0, %g1, %pstate	! delay slot: put back entry %pstate
7585	SET_SIZE(hw_pa_bcopy32)
7586#endif /* lint */
7587
7588/*
7589 * Zero a block of storage.
7590 *
7591 * uzero is used by the kernel to zero a block in user address space.
7592 */
7593
7594/*
7595 * Control flow of the bzero/kzero/uzero routine.
7596 *
7597 *	For fewer than 7 bytes stores, bytes will be zeroed.
7598 *
7599 *	For less than 15 bytes stores, align the address on 4 byte boundary.
7600 *	Then store as many 4-byte chunks, followed by trailing bytes.
7601 *
7602 *	For sizes of 15 bytes or more, align the address on 8 byte boundary.
7603 *	if (count >= 128 and use_hw_bzero is set) {
7604 *		store as many 8-bytes chunks to block align the address
7605 *		store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
7606 *		store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
7607 *	}
7608 *	Store as many 8-byte chunks, followed by trailing bytes.
7609 */
7610
7611#if defined(lint)
7612
7613/* ARGSUSED */
7614int
7615kzero(void *addr, size_t count)
7616{ return(0); }
7617
7618/* ARGSUSED */
7619void
7620uzero(void *addr, size_t count)
7621{}
7622
7623#else	/* lint */
7624
	!
	! uzero(addr, count): zero count bytes through ASI_USER.
	! kzero(addr, count): zero count bytes through ASI_P.
	! Both set %asi, leave the previous t_lofault value in %o5 and join
	! the common code at .do_zero with %o0 = addr and %o1 = count.
	! kzero additionally ors LOFAULT_SET into %o5.
	!
7625	ENTRY(uzero)
7626	!
7627	! Set a new lo_fault handler only if we came in with one
7628	! already specified.
7629	!
7630	wr	%g0, ASI_USER, %asi
7631	ldn	[THREAD_REG + T_LOFAULT], %o5	! %o5 = previous t_lofault
7632	tst	%o5
7633	bz,pt	%ncc, .do_zero			! none set: install nothing
7634	sethi	%hi(.zeroerr), %o2		! delay slot: harmless if taken
7635	or	%o2, %lo(.zeroerr), %o2
7636	membar	#Sync
7637	ba,pt	%ncc, .do_zero
7638	stn	%o2, [THREAD_REG + T_LOFAULT]	! delay slot: install .zeroerr
7639
7640	ENTRY(kzero)
7641	!
7642	! Always set a lo_fault handler
7643	!
7644	wr	%g0, ASI_P, %asi
7645	ldn	[THREAD_REG + T_LOFAULT], %o5	! %o5 = previous t_lofault
7646	sethi	%hi(.zeroerr), %o2
7647	or	%o5, LOFAULT_SET, %o5		! mark that a handler was installed
7648	or	%o2, %lo(.zeroerr), %o2
7649	membar	#Sync
7650	ba,pt	%ncc, .do_zero
7651	stn	%o2, [THREAD_REG + T_LOFAULT]	! delay slot: install .zeroerr
7652
7653/*
7654 * We got here because of a fault during kzero or if
7655 * uzero or bzero was called with t_lofault non-zero.
7656 * Otherwise we've already run screaming from the room.
7657 * Errno value is in %g1. Note that we're here iff
7658 * we did set t_lofault.
7659 */
7660.zeroerr:
7661	!
7662	! Undo asi register setting. Just set it to be the
7663	! kernel default without checking.
7664	!
7665	wr	%g0, ASI_P, %asi
7666
7667	!
7668	! We did set t_lofault. It may well have been zero coming in.
7669	!
76701:
7671	tst	%o5			! anything (handler or flag) in %o5?
7672	membar #Sync
7673	bne,pn	%ncc, 3f
7674	andncc	%o5, LOFAULT_SET, %o5	! delay slot: strip flag, set cc for 3:
76752:
7676	!
7677	! Old handler was zero. Just return the error.
7678	!
7679	retl				! return
7680	mov	%g1, %o0		! error code from %g1
76813:
7682	!
7683	! We're here because %o5 was non-zero. It was non-zero
7684	! because either LOFAULT_SET was present, a previous fault
7685	! handler was present or both. In all cases we need to reset
7686	! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
7687	! before we either simply return the error or we invoke the
7688	! previously specified handler.
7689	!
7690	be	%ncc, 2b		! only the flag was set: return error
7691	stn	%o5, [THREAD_REG + T_LOFAULT]	! delay slot: always restore
7692	jmp	%o5			! goto real handler
7693	nop
7694	SET_SIZE(kzero)
7695	SET_SIZE(uzero)
7696
7697#endif	/* lint */
7698
7699/*
7700 * Zero a block of storage.
7701 */
7702
7703#if defined(lint)
7704
7705/* ARGSUSED */
7706void
7707bzero(void *addr, size_t count)
7708{}
7709
7710#else	/* lint */
7711
7712	ENTRY(bzero)
7713	wr	%g0, ASI_P, %asi
7714
7715	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
7716	tst	%o5
7717	bz,pt	%ncc, .do_zero
7718	sethi	%hi(.zeroerr), %o2
7719	or	%o2, %lo(.zeroerr), %o2
7720	membar	#Sync				! sync error barrier
7721	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
7722
7723.do_zero:
7724	cmp	%o1, 7
7725	blu,pn	%ncc, .byteclr
7726	nop
7727
7728	cmp	%o1, 15
7729	blu,pn	%ncc, .wdalign
7730	nop
7731
7732	andcc	%o0, 7, %o3		! is add aligned on a 8 byte bound
7733	bz,pt	%ncc, .blkalign		! already double aligned
7734	sub	%o3, 8, %o3		! -(bytes till double aligned)
7735	add	%o1, %o3, %o1		! update o1 with new count
7736
77371:
7738	stba	%g0, [%o0]%asi
7739	inccc	%o3
7740	bl,pt	%ncc, 1b
7741	inc	%o0
7742
7743	! Now address is double aligned
7744.blkalign:
7745	cmp	%o1, 0x80		! check if there are 128 bytes to set
7746	blu,pn	%ncc, .bzero_small
7747	mov	%o1, %o3
7748
7749	sethi	%hi(use_hw_bzero), %o2
7750	ld	[%o2 + %lo(use_hw_bzero)], %o2
7751	tst	%o2
7752	bz	%ncc, .bzero_small
7753	mov	%o1, %o3
7754
7755	rd	%asi, %o3
7756	wr	%g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
7757	cmp	%o3, ASI_P
7758	bne,a	%ncc, .algnblk
7759	wr	%g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
7760
7761.algnblk:
7762	andcc	%o0, 0x3f, %o3		! is block aligned?
7763	bz,pt	%ncc, .bzero_blk
7764	sub	%o3, 0x40, %o3		! -(bytes till block aligned)
7765	add	%o1, %o3, %o1		! o1 is the remainder
7766
7767	! Clear -(%o3) bytes till block aligned
77681:
7769	stxa	%g0, [%o0]%asi
7770	addcc	%o3, 8, %o3
7771	bl,pt	%ncc, 1b
7772	add	%o0, 8, %o0
7773
7774.bzero_blk:
7775	and	%o1, 0x3f, %o3		! calc bytes left after blk clear
7776	andn	%o1, 0x3f, %o4		! calc size of blocks in bytes
7777
7778	cmp	%o4, 0x100		! 256 bytes or more
7779	blu,pn	%ncc, 3f
7780	nop
7781
77822:
7783	stxa	%g0, [%o0+0x0]%asi
7784	stxa	%g0, [%o0+0x40]%asi
7785	stxa	%g0, [%o0+0x80]%asi
7786	stxa	%g0, [%o0+0xc0]%asi
7787
7788	stxa	%g0, [%o0+0x8]%asi
7789	stxa	%g0, [%o0+0x10]%asi
7790	stxa	%g0, [%o0+0x18]%asi
7791	stxa	%g0, [%o0+0x20]%asi
7792	stxa	%g0, [%o0+0x28]%asi
7793	stxa	%g0, [%o0+0x30]%asi
7794	stxa	%g0, [%o0+0x38]%asi
7795
7796	stxa	%g0, [%o0+0x48]%asi
7797	stxa	%g0, [%o0+0x50]%asi
7798	stxa	%g0, [%o0+0x58]%asi
7799	stxa	%g0, [%o0+0x60]%asi
7800	stxa	%g0, [%o0+0x68]%asi
7801	stxa	%g0, [%o0+0x70]%asi
7802	stxa	%g0, [%o0+0x78]%asi
7803
7804	stxa	%g0, [%o0+0x88]%asi
7805	stxa	%g0, [%o0+0x90]%asi
7806	stxa	%g0, [%o0+0x98]%asi
7807	stxa	%g0, [%o0+0xa0]%asi
7808	stxa	%g0, [%o0+0xa8]%asi
7809	stxa	%g0, [%o0+0xb0]%asi
7810	stxa	%g0, [%o0+0xb8]%asi
7811
7812	stxa	%g0, [%o0+0xc8]%asi
7813	stxa	%g0, [%o0+0xd0]%asi
7814	stxa	%g0, [%o0+0xd8]%asi
7815	stxa	%g0, [%o0+0xe0]%asi
7816	stxa	%g0, [%o0+0xe8]%asi
7817	stxa	%g0, [%o0+0xf0]%asi
7818	stxa	%g0, [%o0+0xf8]%asi
7819
7820	sub	%o4, 0x100, %o4
7821	cmp	%o4, 0x100
7822	bgu,pt	%ncc, 2b
7823	add	%o0, 0x100, %o0
7824
78253:
7826	! ... check if 64 bytes to set
7827	cmp	%o4, 0x40
7828	blu	%ncc, .bzero_blk_done
7829	nop
7830
78314:
7832	stxa	%g0, [%o0+0x0]%asi
7833	stxa	%g0, [%o0+0x8]%asi
7834	stxa	%g0, [%o0+0x10]%asi
7835	stxa	%g0, [%o0+0x18]%asi
7836	stxa	%g0, [%o0+0x20]%asi
7837	stxa	%g0, [%o0+0x28]%asi
7838	stxa	%g0, [%o0+0x30]%asi
7839	stxa	%g0, [%o0+0x38]%asi
7840
7841	subcc	%o4, 0x40, %o4
7842	bgu,pt	%ncc, 3b
7843	add	%o0, 0x40, %o0
7844
7845.bzero_blk_done:
7846	membar	#Sync
7847	!
7848	! Undo asi register setting.
7849	!
7850	rd	%asi, %o4
7851	wr	%g0, ASI_P, %asi
7852	cmp	%o4, ASI_BLK_INIT_ST_QUAD_LDD_P
7853	bne,a	%ncc, .bzero_small
7854	wr	%g0, ASI_USER, %asi
7855
7856.bzero_small:
7857	! Set the remaining doubles
7858	subcc	%o3, 8, %o3		! Can we store any doubles?
7859	blu,pn	%ncc, .byteclr
7860	and	%o1, 7, %o1		! calc bytes left after doubles
7861
7862.dbclr:
7863	stxa	%g0, [%o0]%asi		! Clear the doubles
7864	subcc	%o3, 8, %o3
7865	bgeu,pt	%ncc, .dbclr
7866	add	%o0, 8, %o0
7867
7868	ba	.byteclr
7869	nop
7870
7871.wdalign:
7872	andcc	%o0, 3, %o3		! is add aligned on a word boundary
7873	bz,pn	%ncc, .wdclr
7874	andn	%o1, 3, %o3		! create word sized count in %o3
7875
7876	dec	%o1			! decrement count
7877	stba	%g0, [%o0]%asi		! clear a byte
7878	ba	.wdalign
7879	inc	%o0			! next byte
7880
7881.wdclr:
7882	sta	%g0, [%o0]%asi		! 4-byte clearing loop
7883	subcc	%o3, 4, %o3
7884	bnz,pt	%ncc, .wdclr
7885	inc	4, %o0
7886
7887	and	%o1, 3, %o1		! leftover count, if any
7888
7889.byteclr:
7890	! Set the leftover bytes
7891	brz	%o1, .bzero_exit
7892	nop
7893
78947:
7895	deccc	%o1			! byte clearing loop
7896	stba	%g0, [%o0]%asi
7897	bgu,pt	%ncc, 7b
7898	inc	%o0
7899
7900.bzero_exit:
7901	!
7902	! We're just concerned with whether t_lofault was set
7903	! when we came in. We end up here from either kzero()
7904	! or bzero(). kzero() *always* sets a lofault handler.
7905	! It ors LOFAULT_SET into %o5 to indicate it has done
7906	! this even if the value of %o5 is otherwise zero.
7907	! bzero() sets a lofault handler *only* if one was
7908	! previously set. Accordingly we need to examine
7909	! %o5 and if it is non-zero be sure to clear LOFAULT_SET
7910	! before resetting the error handler.
7911	!
7912	tst	%o5
7913	bz	%ncc, 1f
7914	andn	%o5, LOFAULT_SET, %o5
7915	membar	#Sync				! sync error barrier
7916	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
79171:
7918	retl
7919	clr	%o0			! return (0)
7920
7921	SET_SIZE(bzero)
7922#endif	/* lint */
7923