xref: /titanic_52/usr/src/uts/sun4v/cpu/niagara_copy.s (revision 59ac0c1669407488b67ae9e273667a340dccc611)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/param.h>
29#include <sys/errno.h>
30#include <sys/asm_linkage.h>
31#include <sys/vtrace.h>
32#include <sys/machthread.h>
33#include <sys/clock.h>
34#include <sys/asi.h>
35#include <sys/fsr.h>
36#include <sys/privregs.h>
37#include <sys/machasi.h>
38#include <sys/niagaraasi.h>
39
40#if !defined(lint)
41#include "assym.h"
42#endif	/* lint */
43
44
45/*
46 * Pseudo-code to aid in understanding the control flow of the
47 * bcopy/kcopy routine.
48 *
49 *	! WARNING : <Register usage convention>
50 *	! In kcopy(), %o5 holds the previous error handler ORed with the
51 *	! LOFAULT_SET flag (low bits). In bcopy(), %o5 is null.
52 *	! %o5 is not available for any other use.
53 *
54 * kcopy():
55 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
56 *	%o5 |= LOFAULT_SET;			! ORed with LOFAULT_SET flag
57 *	curthread->t_lofault = .copyerr;
58 *	Call bcopy();
59 *
60 * bcopy():
61 * 	if (length < 128)
62 * 		goto regular_copy;
63 *
64 * 	if (!use_hw_bcopy)
65 * 		goto regular_copy;
66 *
67 * 	blockcopy;
68 *	restore t_lofault handler if came from kcopy();
69 *
70 *	regular_copy;
71 *	restore t_lofault handler if came from kcopy();
72 *
73 * In lofault handler:
74 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
75 *	return (errno)
76 *
77 */
78
79/*
80 * Less than or equal to this number of bytes we will always copy byte-for-byte
81 */
82#define	SMALL_LIMIT	7
83
84/*
85 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
86 * handler was set
87 */
88#define	LOFAULT_SET 2
89
90/*
91 * This define is to align data for the unaligned source cases.
92 * The data1, data2 and data3 is merged into data1 and data2.
93 * The data3 is preserved for next merge.
 *
 * Written as register expressions, the macro computes:
 *	data1 = (data1 << lshift) | (data2 >> rshift)
 *	data2 = (data2 << lshift) | (data3 >> rshift)
 * The original data2 is read (srlx) before it is shifted left, so the
 * order of the instructions matters.  tmp is a scratch register and is
 * overwritten.  The callers visible in bcopy compute rshift as
 * (64 - lshift).
94 */
95#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
96	sllx	data1, lshift, data1				;\
97	srlx	data2, rshift, tmp				;\
98	or	data1, tmp, data1				;\
99	sllx	data2, lshift, data2				;\
100	srlx	data3, rshift, tmp				;\
101	or	data2, tmp, data2
102/*
103 * This macro is to align the data. Basically it merges
104 * data1 and data2 to form double word.
 *
 * Written as a register expression, the macro computes:
 *	data1 = (data1 << lshift) | (data2 >> rshift)
 * data2 is left unchanged.  tmp is a scratch register and is
 * overwritten.
105 */
106#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
107	sllx	data1, lshift, data1				;\
108	srlx	data2, rshift, tmp				;\
109	or	data1, tmp, data1
110
111#if !defined(NIAGARA_IMPL)
112/*
113 * Flags set in the lower bits of the t_lofault address:
114 * FPUSED_FLAG: The FP registers were in use and must be restored
115 * BCOPY_FLAG: Set for bcopy calls, cleared for kcopy calls
116 * COPY_FLAGS: Both of the above
117 *
118 * Other flags:
119 * KPREEMPT_FLAG: kpreempt needs to be called
 *
 * FPUSED_FLAG and BCOPY_FLAG are ORed into the saved handler value
 * held in %o5 and must be masked off (COPY_FLAGS) before that value is
 * written back to t_lofault.  bcopy only sets BCOPY_FLAG when a
 * t_lofault handler was already installed on entry.  KPREEMPT_FLAG is
 * never stored in %o5; it is kept in %l1 together with a copy of the
 * other flags.
120 */
121#define	FPUSED_FLAG	1
122#define	BCOPY_FLAG	2
123#define	COPY_FLAGS	(FPUSED_FLAG | BCOPY_FLAG)
124#define	KPREEMPT_FLAG	4
125
/*
 * ALIGN_OFF_<lo>_<hi>: realign one 64-byte block of source data.
 *
 * Each macro issues eight faligndata instructions over nine consecutive
 * double registers and leaves the eight results in %d48 through %d62.
 * The macros differ only in the first input register: ALIGN_OFF_1_7
 * starts at %d0, ALIGN_OFF_8_15 at %d2, and so on in steps of one
 * double register up to ALIGN_OFF_56_63, which starts at %d14.
 * The name gives the range of source byte offsets within a 64-byte
 * block handled by the bcopy loop that uses the macro.
 *
 * faligndata takes its byte alignment from %gsr; bcopy sets that up
 * with alignaddr before entering any of these loops.
 */
126#define	ALIGN_OFF_1_7			\
127	faligndata %d0, %d2, %d48	;\
128	faligndata %d2, %d4, %d50	;\
129	faligndata %d4, %d6, %d52	;\
130	faligndata %d6, %d8, %d54	;\
131	faligndata %d8, %d10, %d56	;\
132	faligndata %d10, %d12, %d58	;\
133	faligndata %d12, %d14, %d60	;\
134	faligndata %d14, %d16, %d62
135
136#define	ALIGN_OFF_8_15			\
137	faligndata %d2, %d4, %d48	;\
138	faligndata %d4, %d6, %d50	;\
139	faligndata %d6, %d8, %d52	;\
140	faligndata %d8, %d10, %d54	;\
141	faligndata %d10, %d12, %d56	;\
142	faligndata %d12, %d14, %d58	;\
143	faligndata %d14, %d16, %d60	;\
144	faligndata %d16, %d18, %d62
145
146#define	ALIGN_OFF_16_23			\
147	faligndata %d4, %d6, %d48	;\
148	faligndata %d6, %d8, %d50	;\
149	faligndata %d8, %d10, %d52	;\
150	faligndata %d10, %d12, %d54	;\
151	faligndata %d12, %d14, %d56	;\
152	faligndata %d14, %d16, %d58	;\
153	faligndata %d16, %d18, %d60	;\
154	faligndata %d18, %d20, %d62
155
156#define	ALIGN_OFF_24_31			\
157	faligndata %d6, %d8, %d48	;\
158	faligndata %d8, %d10, %d50	;\
159	faligndata %d10, %d12, %d52	;\
160	faligndata %d12, %d14, %d54	;\
161	faligndata %d14, %d16, %d56	;\
162	faligndata %d16, %d18, %d58	;\
163	faligndata %d18, %d20, %d60	;\
164	faligndata %d20, %d22, %d62
165
166#define	ALIGN_OFF_32_39			\
167	faligndata %d8, %d10, %d48	;\
168	faligndata %d10, %d12, %d50	;\
169	faligndata %d12, %d14, %d52	;\
170	faligndata %d14, %d16, %d54	;\
171	faligndata %d16, %d18, %d56	;\
172	faligndata %d18, %d20, %d58	;\
173	faligndata %d20, %d22, %d60	;\
174	faligndata %d22, %d24, %d62
175
176#define	ALIGN_OFF_40_47			\
177	faligndata %d10, %d12, %d48	;\
178	faligndata %d12, %d14, %d50	;\
179	faligndata %d14, %d16, %d52	;\
180	faligndata %d16, %d18, %d54	;\
181	faligndata %d18, %d20, %d56	;\
182	faligndata %d20, %d22, %d58	;\
183	faligndata %d22, %d24, %d60	;\
184	faligndata %d24, %d26, %d62
185
186#define	ALIGN_OFF_48_55			\
187	faligndata %d12, %d14, %d48	;\
188	faligndata %d14, %d16, %d50	;\
189	faligndata %d16, %d18, %d52	;\
190	faligndata %d18, %d20, %d54	;\
191	faligndata %d20, %d22, %d56	;\
192	faligndata %d22, %d24, %d58	;\
193	faligndata %d24, %d26, %d60	;\
194	faligndata %d26, %d28, %d62
195
196#define	ALIGN_OFF_56_63			\
197	faligndata %d14, %d16, %d48	;\
198	faligndata %d16, %d18, %d50	;\
199	faligndata %d18, %d20, %d52	;\
200	faligndata %d20, %d22, %d54	;\
201	faligndata %d22, %d24, %d56	;\
202	faligndata %d24, %d26, %d58	;\
203	faligndata %d26, %d28, %d60	;\
204	faligndata %d28, %d30, %d62
205
206#define	VIS_BLOCKSIZE		64
207
208/*
209 * Size of stack frame in order to accommodate a 64-byte aligned
210 * floating-point register save area and 2 64-bit temp locations.
211 * All copy functions use three quadrants of fp registers; to assure a
212 * block-aligned three block buffer in which to save we must reserve
213 * four blocks on stack.
214 *
215 *    _______________________________________ <-- %fp + STACK_BIAS
216 *    | We may need to preserve 3 quadrants |
217 *    | of fp regs, but since we do so with |
218 *    | BST/BLD we need room in which to    |
219 *    | align to VIS_BLOCKSIZE bytes.  So   |
220 *    | this area is 4 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
221 *    |-------------------------------------|
222 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
223 *    |-------------------------------------|
224 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
225 *    ---------------------------------------
 *
 * SAVED_FPREGS_ADJUST is the amount BST_FP_TOSTACK and BLD_FP_FROMSTACK
 * subtract from %fp + STACK_BIAS before rounding the result down to a
 * VIS_BLOCKSIZE boundary; the three saved blocks start at that rounded
 * address.
226 */
227#define HWCOPYFRAMESIZE         ((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
228#define SAVED_FPREGS_OFFSET     (VIS_BLOCKSIZE * 4)
229#define SAVED_FPREGS_ADJUST     ((VIS_BLOCKSIZE * 3) + 1)
230#define SAVED_FPRS_OFFSET       (SAVED_FPREGS_OFFSET + 8)
231#define SAVED_GSR_OFFSET        (SAVED_FPRS_OFFSET + 8)
232
233/*
234 * In FP copies if we do not have preserved data to restore over
235 * the fp regs we used then we must zero those regs to avoid
236 * exposing portions of the data to later threads (data security).
 *
 * FZERO clears the double registers %f0 through %f30 and %f48 through
 * %f62.  %f0 and %f2 are zeroed directly with fzero; every other
 * register is then written with the sum (faddd) or product (fmuld) of
 * those two zeros.  These are the same three register groups that
 * BST_FP_TOSTACK saves (starting at %f0, %f16 and %f48).
237 */
238#define	FZERO				\
239	fzero	%f0			;\
240	fzero	%f2			;\
241	faddd	%f0, %f2, %f4		;\
242	fmuld	%f0, %f2, %f6		;\
243	faddd	%f0, %f2, %f8		;\
244	fmuld	%f0, %f2, %f10		;\
245	faddd	%f0, %f2, %f12		;\
246	fmuld	%f0, %f2, %f14		;\
247	faddd	%f0, %f2, %f16		;\
248	fmuld	%f0, %f2, %f18		;\
249	faddd	%f0, %f2, %f20		;\
250	fmuld	%f0, %f2, %f22		;\
251	faddd	%f0, %f2, %f24		;\
252	fmuld	%f0, %f2, %f26		;\
253	faddd	%f0, %f2, %f28		;\
254	fmuld	%f0, %f2, %f30		;\
255	faddd	%f0, %f2, %f48		;\
256	fmuld	%f0, %f2, %f50		;\
257	faddd	%f0, %f2, %f52		;\
258	fmuld	%f0, %f2, %f54		;\
259	faddd	%f0, %f2, %f56		;\
260	fmuld	%f0, %f2, %f58		;\
261	faddd	%f0, %f2, %f60		;\
262	fmuld	%f0, %f2, %f62
263
264#if !defined(lint)
265
266/*
267 * Macros to save and restore fp registers to/from the stack.
268 * Used to save and restore in-use fp registers when we want to use FP.
 *
 * Both macros compute the same address: %fp + STACK_BIAS minus
 * SAVED_FPREGS_ADJUST, rounded down to a VIS_BLOCKSIZE boundary.
 * Starting there they transfer three consecutive VIS_BLOCKSIZE-byte
 * blocks with ASI_BLK_P, for the register groups beginning at %f0,
 * %f16 and %f48 (in that order), and finish with membar #Sync.
 * tmp1 is a scratch register; on exit it holds the address of the
 * third block.
269 */
270#define BST_FP_TOSTACK(tmp1)					\
271	/* membar #Sync	*/					;\
272	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
273	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
274	stda	%f0, [tmp1]ASI_BLK_P				;\
275	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
276	stda	%f16, [tmp1]ASI_BLK_P				;\
277	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
278	stda	%f48, [tmp1]ASI_BLK_P				;\
279	membar	#Sync
280
281#define	BLD_FP_FROMSTACK(tmp1)					\
282	/* membar #Sync - provided at copy completion */	;\
283	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
284	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
285	ldda	[tmp1]ASI_BLK_P, %f0				;\
286	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
287	ldda	[tmp1]ASI_BLK_P, %f16				;\
288	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
289	ldda	[tmp1]ASI_BLK_P, %f48				;\
290	membar	#Sync
291#endif	/* lint */
292
293#endif	/* NIAGARA_IMPL */
294/*
295 * Copy a block of storage, returning an error code if `from' or
296 * `to' takes a kernel pagefault which cannot be resolved.
297 * Returns errno value on pagefault error, 0 if all ok
 *
 * kcopy itself only opens a register window, installs .copyerr as the
 * thread's t_lofault handler (keeping the previous handler in %o5) and
 * branches to .do_copy inside bcopy, which performs the copy and the
 * normal-path cleanup.  The code that follows the branch here is the
 * fault path only.
298 */
299
300#if defined(lint)
301
/* C stub compiled only when lint is defined. */
302/* ARGSUSED */
303int
304kcopy(const void *from, void *to, size_t count)
305{ return(0); }
306
307#else	/* lint */
308
309	.seg	".text"
310	.align	4
311
312	ENTRY(kcopy)
313
314#if !defined(NIAGARA_IMPL)
	/*
	 * The frame reserves HWCOPYFRAMESIZE extra bytes for the fp
	 * register, %fprs and %gsr save area used by the block copy.
	 */
315	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
316	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
317	or	%l7, %lo(.copyerr), %l7
318	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
319	! Note that we carefully do *not* flag the setting of
320	! t_lofault.
321	membar	#Sync				! sync error barrier
	/* The store below sits in the delay slot of the branch. */
322	b	.do_copy			! common code
323	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
324
325/*
326 * We got here because of a fault during kcopy or bcopy if a fault
327 * handler existed when bcopy was called.
328 * Errno value is in %g1.
329 */
330.copyerr:
	/*
	 * Install .copyerr2 as the handler while we unwind, so that a
	 * second fault in this path ends in panic.
	 */
331	sethi	%hi(.copyerr2), %l1
332	or	%l1, %lo(.copyerr2), %l1
333	membar	#Sync				! sync error barrier
334	stn	%l1, [THREAD_REG + T_LOFAULT]	! set t_lofault
	/*
	 * Skip the fp restore when FPUSED_FLAG is clear in %o5.  The
	 * branch is not annulled, so the delay slot runs on both paths
	 * and leaves only BCOPY_FLAG (if set) in %l1.
	 */
335	btst	FPUSED_FLAG, %o5
336	bz,pt	%xcc, 1f
337	and	%o5, BCOPY_FLAG, %l1	! copy flag to %l1
338
339	membar	#Sync				! sync error barrier
340	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
341	wr	%o2, 0, %gsr
342
	/*
	 * %o3 = the %fprs value saved before the copy.  If FPRS_FEF was
	 * set then, the fp registers were saved on the stack and are
	 * reloaded; otherwise they are zeroed (label 4).
	 */
343	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
344	btst	FPRS_FEF, %o3
345	bz,pt	%icc, 4f
346	  nop
347
348	! restore fpregs from stack
349	BLD_FP_FROMSTACK(%o2)
350
351	ba,pt	%ncc, 2f
352	  wr	%o3, 0, %fprs		! restore fprs
353
3544:
355	FZERO
356	wr	%o3, 0, %fprs		! restore fprs
357
3582:
	/*
	 * Threads with a non-NULL t_lwp skip the preemption bookkeeping.
	 * Otherwise t_preempt is decremented (undoing the increment done
	 * in bcopy before the fp registers were taken over); if it
	 * reaches zero, cpu_kprunrun is checked for a pending request.
	 */
359	ldn	[THREAD_REG + T_LWP], %o2
360	brnz,pt	%o2, 1f
361	  nop
362
363	ldsb	[THREAD_REG + T_PREEMPT], %l0
364	deccc	%l0
365	bnz,pn	%ncc, 1f
366	  stb	%l0, [THREAD_REG + T_PREEMPT]
367
368	! Check for a kernel preemption request
369	ldn	[THREAD_REG + T_CPU], %l0
370	ldub	[%l0 + CPU_KPRUNRUN], %l0
	/* Annulled delay slot: the flag is set only if the branch is taken. */
371	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
372	  or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
373
374	! The kcopy will always set a t_lofault handler. If it fires,
375	! we're expected to just return the error code and not to
376	! invoke any existing error handler. As far as bcopy is concerned,
377	! we only set t_lofault if there was an existing lofault handler.
378	! In that case we're expected to invoke the previously existing
379	! handler after restting the t_lofault value.
3801:
381	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
382	membar	#Sync				! sync error barrier
383	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
384
385	! call kpreempt if necessary
386	btst	KPREEMPT_FLAG, %l1
387	bz,pt	%icc, 2f
388	  nop
389	call	kpreempt
390	  rdpr	%pil, %o0	! pass %pil
3912:
	/*
	 * BCOPY_FLAG in %l1 selects the exit: set means the copy was
	 * entered through bcopy, so chain to the saved handler (label 3);
	 * clear means kcopy, so return the errno value held in %g1.
	 */
392	btst	BCOPY_FLAG, %l1
393	bnz,pn	%ncc, 3f
394	nop
395	ret
396	restore	%g1, 0, %o0
397
3983:
399	! We're here via bcopy. There must have been an error handler
400	! in place otherwise we would have died a nasty death already.
401	jmp	%o5				! goto real handler
402	restore	%g0, 0, %o0			! dispose of copy window
403
404/*
405 * We got here because of a fault in .copyerr.  We can't safely restore fp
406 * state, so we panic.
407 */
408fp_panic_msg:
409	.asciz	"Unable to restore fp state after copy operation"
410
411	.align	4
412.copyerr2:
413	set	fp_panic_msg, %o0
414	call	panic
415	  nop
416#else	/* NIAGARA_IMPL */
	/*
	 * NIAGARA_IMPL variant: no fp save area is needed.  The saved
	 * handler in %o5 is tagged with LOFAULT_SET so the common exit
	 * code can tell that kcopy installed a handler.
	 */
417	save	%sp, -SA(MINFRAME), %sp
418	set	.copyerr, %l7			! copyerr is lofault value
419	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
420	or	%o5, LOFAULT_SET, %o5
421	membar	#Sync				! sync error barrier
	/* The store below sits in the delay slot of the branch. */
422	b	.do_copy			! common code
423	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
424
425/*
426 * We got here because of a fault during kcopy.
427 * Errno value is in %g1.
428 */
429.copyerr:
430	! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
431	! into %o5 to indicate it has set t_lofault handler. Need to clear
432	! LOFAULT_SET flag before restoring the error handler.
433	andn	%o5, LOFAULT_SET, %o5
434	membar	#Sync				! sync error barrier
435	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	/* Return the errno value from %g1 as the function result. */
436	ret
437	restore	%g1, 0, %o0
438#endif	/* NIAGARA_IMPL */
439
440	SET_SIZE(kcopy)
441#endif	/* lint */
442
443
444/*
445 * Copy a block of storage - must not overlap (from + len <= to).
446 */
447#if defined(lint)
448
449/* ARGSUSED */
450void
451bcopy(const void *from, void *to, size_t count)
452{}
453
454#else	/* lint */
455
456	ENTRY(bcopy)
457
458#if !defined(NIAGARA_IMPL)
459	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
460	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
461	brz,pt	%o5, .do_copy
462	  nop
463	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
464	or	%l7, %lo(.copyerr), %l7
465	membar	#Sync				! sync error barrier
466	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
467	! We've already captured whether t_lofault was zero on entry.
468	! We need to mark ourselves as being from bcopy since both
469	! kcopy and bcopy use the same code path. If BCOPY_FLAG is
470	! set and the saved lofault was zero, we won't reset lofault on
471	! returning.
472	or	%o5, BCOPY_FLAG, %o5
473#else	/* NIAGARA_IMPL */
474	save	%sp, -SA(MINFRAME), %sp
475	clr	%o5			! flag LOFAULT_SET is not set for bcopy
476#endif	/* NIAGARA_IMPL */
477
478.do_copy:
479	cmp	%i2, 12			! for small counts
480	blu	%ncc, .bytecp		! just copy bytes
481	  .empty
482
483	cmp	%i2, 128		! for less than 128 bytes
484	blu,pn	%ncc, .bcb_punt		! no block st/quad ld
485	  nop
486
487	set	use_hw_bcopy, %o2
488	ld	[%o2], %o2
489	brz,pn	%o2, .bcb_punt
490	  nop
491
492	subcc	%i1, %i0, %i3
493	bneg,a,pn %ncc, 1f
494	neg	%i3
4951:
496	/*
497	 * Compare against 256 since we should be checking block addresses
498	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
499	 * src = dest + (64 * 3) + 63.
500	 */
501	cmp	%i3, 256
502	blu,pn	%ncc, .bcb_punt
503	  nop
504
505	/*
506	 * Copies that reach here have at least 2 blocks of data to copy.
507	 */
508#if !defined(NIAGARA_IMPL)
509	ldn	[THREAD_REG + T_LWP], %o3
510	brnz,pt	%o3, 1f
511	  nop
512
513	! kpreempt_disable();
514	ldsb	[THREAD_REG + T_PREEMPT], %o2
515	inc	%o2
516	stb	%o2, [THREAD_REG + T_PREEMPT]
517
5181:
519	rd	%fprs, %o2              ! check for unused fp
520	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
521	btst	FPRS_FEF, %o2
522	bz,a,pt	%icc, .do_blockcopy
523	wr	%g0, FPRS_FEF, %fprs
524
525	! save in-use fpregs on stack
526	BST_FP_TOSTACK(%o2)
527#endif	/* NIAGARA_IMPL */
528
529.do_blockcopy:
530
531#if !defined(NIAGARA_IMPL)
532	rd	%gsr, %o2
533	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
534	or	%o5, FPUSED_FLAG, %o5		! fp regs are in use
535#endif	/* NIAGARA_IMPL */
536
537	! Swap src/dst since the code below is memcpy code
538	! and memcpy/bcopy have different calling sequences
539	mov	%i1, %i5
540	mov	%i0, %i1
541	mov	%i5, %i0
542
543	! Block (64 bytes) align the destination.
544	andcc	%i0, 0x3f, %i3		! is dst aligned on a 64 bytes
545	bz	%xcc, .chksrc		! dst is already double aligned
546	sub	%i3, 0x40, %i3
547	neg	%i3			! bytes till dst 64 bytes aligned
548	sub	%i2, %i3, %i2		! update i2 with new count
549
550	! Based on source and destination alignment do
551	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
552
553	! Is dst & src 8B aligned
554	or	%i0, %i1, %o2
555	andcc	%o2, 0x7, %g0
556	bz	%ncc, .alewdcp
557	nop
558
559	! Is dst & src 4B aligned
560	andcc	%o2, 0x3, %g0
561	bz	%ncc, .alwdcp
562	nop
563
564	! Is dst & src 2B aligned
565	andcc	%o2, 0x1, %g0
566	bz	%ncc, .alhlfwdcp
567	nop
568
569	! 1B aligned
5701:	ldub	[%i1], %o2
571	stb	%o2, [%i0]
572	inc	%i1
573	deccc	%i3
574	bgu,pt	%ncc, 1b
575	inc	%i0
576
577	ba	.chksrc
578	nop
579
580	! dst & src 4B aligned
581.alwdcp:
582	ld	[%i1], %o2
583	st	%o2, [%i0]
584	add	%i1, 0x4, %i1
585	subcc	%i3, 0x4, %i3
586	bgu,pt	%ncc, .alwdcp
587	add	%i0, 0x4, %i0
588
589	ba	.chksrc
590	nop
591
592	! dst & src 2B aligned
593.alhlfwdcp:
594	lduh	[%i1], %o2
595	stuh	%o2, [%i0]
596	add	%i1, 0x2, %i1
597	subcc	%i3, 0x2, %i3
598	bgu,pt	%ncc, .alhlfwdcp
599	add	%i0, 0x2, %i0
600
601	ba	.chksrc
602	nop
603
604	! dst & src 8B aligned
605.alewdcp:
606	ldx	[%i1], %o2
607	stx	%o2, [%i0]
608	add	%i1, 0x8, %i1
609	subcc	%i3, 0x8, %i3
610	bgu,pt	%ncc, .alewdcp
611	add	%i0, 0x8, %i0
612
613	! Now Destination is block (64 bytes) aligned
614.chksrc:
615	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
616	sub	%i2, %i3, %i2		! Residue bytes in %i2
617
618	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
619
620#if !defined(NIAGARA_IMPL)
621	andn	%i1, 0x3f, %l0		! %l0 has block aligned src address
622	prefetch [%l0+0x0], #one_read
623	andcc	%i1, 0x3f, %g0		! is src 64B aligned
624	bz,pn	%ncc, .blkcpy
625	nop
626
627	! handle misaligned source cases
628	alignaddr %i1, %g0, %g0		! generate %gsr
629
630	srl	%i1, 0x3, %l1		! src add bits 3, 4, 5 are now least
631					! significant in %l1
632	andcc	%l1, 0x7, %l2		! mask everything except bits 1, 2, 3
633	add	%i1, %i3, %i1
634
635	! switch statement to get to right 8 byte block within
636	! 64 byte block
637	cmp	 %l2, 0x4
638	bgeu,a	 hlf
639	cmp	 %l2, 0x6
640	cmp	 %l2, 0x2
641	bgeu,a	 sqtr
642	nop
643	cmp	 %l2, 0x1
644	be,a	 off15
645	nop
646	ba	 off7
647	nop
648sqtr:
649	be,a	 off23
650	nop
651	ba,a	 off31
652	nop
653
654hlf:
655	bgeu,a	 fqtr
656	nop
657	cmp	 %l2, 0x5
658	be,a	 off47
659	nop
660	ba	 off39
661	nop
662fqtr:
663	be,a	 off55
664	nop
665
666	! Falls through when the source offset is 56 or greater (56 - 63)
667	ldd	[%l0+0x38], %d14
668	prefetch [%l0+0x40], #one_read
669	prefetch [%l0+0x80], #one_read
6707:
671	add	%l0, 0x40, %l0
672	stxa	%g0, [%i0]%asi		! initialize the cache line
673
674	ldda	[%l0]ASI_BLK_P, %d16
675	ALIGN_OFF_56_63
676	fmovd	%d30, %d14
677
678	stda	%d48, [%i0]ASI_BLK_P
679	subcc	%i3, 0x40, %i3
680	add	%i0, 0x40, %i0
681	bgu,pt	%ncc, 7b
682	prefetch [%l0+0x80], #one_read
683	ba	.blkdone
684	membar	#Sync
685
686	! This copy case for source offset between 1 and 7
687off7:
688	ldda	[%l0]ASI_BLK_P, %d0
689	prefetch [%l0+0x40], #one_read
690	prefetch [%l0+0x80], #one_read
6910:
692	add	%l0, 0x40, %l0
693	stxa	%g0, [%i0]%asi		! initialize the cache line
694
695	ldda	[%l0]ASI_BLK_P, %d16
696	ALIGN_OFF_1_7
697	fmovd	%d16, %d0
698	fmovd	%d18, %d2
699	fmovd	%d20, %d4
700	fmovd	%d22, %d6
701	fmovd	%d24, %d8
702	fmovd	%d26, %d10
703	fmovd	%d28, %d12
704	fmovd	%d30, %d14
705
706	stda	%d48, [%i0]ASI_BLK_P
707	subcc	%i3, 0x40, %i3
708	add	%i0, 0x40, %i0
709	bgu,pt	%ncc, 0b
710	prefetch [%l0+0x80], #one_read
711	ba	.blkdone
712	membar	#Sync
713
714	! This copy case for source offset between 8 and 15
715off15:
716	ldd	[%l0+0x8], %d2
717	ldd	[%l0+0x10], %d4
718	ldd	[%l0+0x18], %d6
719	ldd	[%l0+0x20], %d8
720	ldd	[%l0+0x28], %d10
721	ldd	[%l0+0x30], %d12
722	ldd	[%l0+0x38], %d14
723	prefetch [%l0+0x40], #one_read
724	prefetch [%l0+0x80], #one_read
7251:
726	add	%l0, 0x40, %l0
727	stxa	%g0, [%i0]%asi		! initialize the cache line
728
729	ldda	[%l0]ASI_BLK_P, %d16
730	ALIGN_OFF_8_15
731	fmovd	%d18, %d2
732	fmovd	%d20, %d4
733	fmovd	%d22, %d6
734	fmovd	%d24, %d8
735	fmovd	%d26, %d10
736	fmovd	%d28, %d12
737	fmovd	%d30, %d14
738
739	stda	%d48, [%i0]ASI_BLK_P
740	subcc	%i3, 0x40, %i3
741	add	%i0, 0x40, %i0
742	bgu,pt	%ncc, 1b
743	prefetch [%l0+0x80], #one_read
744	ba	.blkdone
745	membar	#Sync
746
747	! This copy case for source offset between 16 and 23
748off23:
749	ldd	[%l0+0x10], %d4
750	ldd	[%l0+0x18], %d6
751	ldd	[%l0+0x20], %d8
752	ldd	[%l0+0x28], %d10
753	ldd	[%l0+0x30], %d12
754	ldd	[%l0+0x38], %d14
755	prefetch [%l0+0x40], #one_read
756	prefetch [%l0+0x80], #one_read
7572:
758	add	%l0, 0x40, %l0
759	stxa	%g0, [%i0]%asi		! initialize the cache line
760
761	ldda	[%l0]ASI_BLK_P, %d16
762	ALIGN_OFF_16_23
763	fmovd	%d20, %d4
764	fmovd	%d22, %d6
765	fmovd	%d24, %d8
766	fmovd	%d26, %d10
767	fmovd	%d28, %d12
768	fmovd	%d30, %d14
769
770	stda	%d48, [%i0]ASI_BLK_P
771	subcc	%i3, 0x40, %i3
772	add	%i0, 0x40, %i0
773	bgu,pt	%ncc, 2b
774	prefetch [%l0+0x80], #one_read
775	ba	.blkdone
776	membar	#Sync
777
778	! This copy case for source offset between 24 and 31
779off31:
780	ldd	[%l0+0x18], %d6
781	ldd	[%l0+0x20], %d8
782	ldd	[%l0+0x28], %d10
783	ldd	[%l0+0x30], %d12
784	ldd	[%l0+0x38], %d14
785	prefetch [%l0+0x40], #one_read
786	prefetch [%l0+0x80], #one_read
7873:
788	add	%l0, 0x40, %l0
789	stxa	%g0, [%i0]%asi		! initialize the cache line
790
791	ldda	[%l0]ASI_BLK_P, %d16
792	ALIGN_OFF_24_31
793	fmovd	%d22, %d6
794	fmovd	%d24, %d8
795	fmovd	%d26, %d10
796	fmovd	%d28, %d12
797	fmovd	%d30, %d14
798
799	stda	%d48, [%i0]ASI_BLK_P
800	subcc	%i3, 0x40, %i3
801	add	%i0, 0x40, %i0
802	bgu,pt	%ncc, 3b
803	prefetch [%l0+0x80], #one_read
804	ba	.blkdone
805	membar	#Sync
806
807	! This copy case for source offset between 32 and 39
808off39:
809	ldd	[%l0+0x20], %d8
810	ldd	[%l0+0x28], %d10
811	ldd	[%l0+0x30], %d12
812	ldd	[%l0+0x38], %d14
813	prefetch [%l0+0x40], #one_read
814	prefetch [%l0+0x80], #one_read
8154:
816	add	%l0, 0x40, %l0
817	stxa	%g0, [%i0]%asi		! initialize the cache line
818
819	ldda	[%l0]ASI_BLK_P, %d16
820	ALIGN_OFF_32_39
821	fmovd	%d24, %d8
822	fmovd	%d26, %d10
823	fmovd	%d28, %d12
824	fmovd	%d30, %d14
825
826	stda	%d48, [%i0]ASI_BLK_P
827	subcc	%i3, 0x40, %i3
828	add	%i0, 0x40, %i0
829	bgu,pt	%ncc, 4b
830	prefetch [%l0+0x80], #one_read
831	ba	.blkdone
832	membar	#Sync
833
834	! This copy case for source offset between 40 and 47
835off47:
836	ldd	[%l0+0x28], %d10
837	ldd	[%l0+0x30], %d12
838	ldd	[%l0+0x38], %d14
839	prefetch [%l0+0x40], #one_read
840	prefetch [%l0+0x80], #one_read
8415:
842	add	%l0, 0x40, %l0
843	stxa	%g0, [%i0]%asi		! initialize the cache line
844
845	ldda	[%l0]ASI_BLK_P, %d16
846	ALIGN_OFF_40_47
847	fmovd	%d26, %d10
848	fmovd	%d28, %d12
849	fmovd	%d30, %d14
850
851	stda	%d48, [%i0]ASI_BLK_P
852	subcc	%i3, 0x40, %i3
853	add	%i0, 0x40, %i0
854	bgu,pt	%ncc, 5b
855	prefetch [%l0+0x80], #one_read
856	ba	.blkdone
857	membar	#Sync
858
859	! This copy case for source offset between 48 and 55
860off55:
861	ldd	[%l0+0x30], %d12
862	ldd	[%l0+0x38], %d14
863	prefetch [%l0+0x40], #one_read
864	prefetch [%l0+0x80], #one_read
8656:
866	add	%l0, 0x40, %l0
867	stxa	%g0, [%i0]%asi		! initialize the cache line
868
869	ldda	[%l0]ASI_BLK_P, %d16
870	ALIGN_OFF_48_55
871	fmovd	%d28, %d12
872	fmovd	%d30, %d14
873
874	stda	%d48, [%i0]ASI_BLK_P
875	subcc	%i3, 0x40, %i3
876	add	%i0, 0x40, %i0
877	bgu,pt	%ncc, 6b
878	prefetch [%l0+0x80], #one_read
879	ba	.blkdone
880	membar	#Sync
881
882	! Both source and destination are block aligned.
883.blkcpy:
884	prefetch [%i1+0x40], #one_read
885	prefetch [%i1+0x80], #one_read
8868:
887	stxa	%g0, [%i0]%asi		! initialize the cache line
888	ldda	[%i1]ASI_BLK_P, %d0
889	stda	%d0, [%i0]ASI_BLK_P
890
891	add	%i1, 0x40, %i1
892	subcc	%i3, 0x40, %i3
893	add	%i0, 0x40, %i0
894	bgu,pt	%ncc, 8b
895	prefetch [%i1+0x80], #one_read
896	membar	#Sync
897
898.blkdone:
899#else	/* NIAGARA_IMPL */
900	andcc	%i1, 0xf, %o2		! is src quadword aligned
901	bz,pn	%xcc, .blkcpy		! src offset in %o2
902	nop
903	cmp	%o2, 0x8
904	bg	.cpy_upper_double
905	nop
906	bl	.cpy_lower_double
907	nop
908
909	! Falls through when source offset is equal to 8 i.e.
910	! source is double word aligned.
911	! In this case no shift/merge of data is required
912	sub	%i1, %o2, %i1		! align the src at 16 bytes.
913	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
914	prefetch [%l0+0x0], #one_read
915	ldda	[%i1+0x0]%asi, %l2
916loop0:
917	ldda	[%i1+0x10]%asi, %l4
918	prefetch [%l0+0x40], #one_read
919
920	stxa	%l3, [%i0+0x0]%asi
921	stxa	%l4, [%i0+0x8]%asi
922
923	ldda	[%i1+0x20]%asi, %l2
924	stxa	%l5, [%i0+0x10]%asi
925	stxa	%l2, [%i0+0x18]%asi
926
927	ldda	[%i1+0x30]%asi, %l4
928	stxa	%l3, [%i0+0x20]%asi
929	stxa	%l4, [%i0+0x28]%asi
930
931	ldda	[%i1+0x40]%asi, %l2
932	stxa	%l5, [%i0+0x30]%asi
933	stxa	%l2, [%i0+0x38]%asi
934
935	add	%l0, 0x40, %l0
936	add	%i1, 0x40, %i1
937	subcc	%i3, 0x40, %i3
938	bgu,pt	%xcc, loop0
939	add	%i0, 0x40, %i0
940	ba	.blkdone
941	add	%i1, %o2, %i1		! increment the source by src offset
942					! the src offset was stored in %o2
943
944.cpy_lower_double:
945	sub	%i1, %o2, %i1		! align the src at 16 bytes.
946	sll	%o2, 3, %o0		! %o0 left shift
947	mov	0x40, %o1
948	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
949	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
950	prefetch [%l0+0x0], #one_read
951	ldda	[%i1+0x0]%asi, %l2	! partial data in %l2 and %l3 has
952					! complete data
953loop1:
954	ldda	[%i1+0x10]%asi, %l4	! %l4 has partial data for this read.
955	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
956							! into %l2 and %l3
957	prefetch [%l0+0x40], #one_read
958	stxa	%l2, [%i0+0x0]%asi
959	stxa	%l3, [%i0+0x8]%asi
960
961	ldda	[%i1+0x20]%asi, %l2
962	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
963	stxa	%l4, [%i0+0x10]%asi			! %l4 from previous read
964	stxa	%l5, [%i0+0x18]%asi			! into %l4 and %l5
965
966	! Repeat the same for next 32 bytes.
967
968	ldda	[%i1+0x30]%asi, %l4
969	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
970	stxa	%l2, [%i0+0x20]%asi
971	stxa	%l3, [%i0+0x28]%asi
972
973	ldda	[%i1+0x40]%asi, %l2
974	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
975	stxa	%l4, [%i0+0x30]%asi
976	stxa	%l5, [%i0+0x38]%asi
977
978	add	%l0, 0x40, %l0
979	add	%i1, 0x40, %i1
980	subcc	%i3, 0x40, %i3
981	bgu,pt	%xcc, loop1
982	add	%i0, 0x40, %i0
983	ba	.blkdone
984	add	%i1, %o2, %i1		! increment the source by src offset
985					! the src offset was stored in %o2
986
987.cpy_upper_double:
988	sub	%i1, %o2, %i1		! align the src at 16 bytes.
989	mov	0x8, %o0
990	sub	%o2, %o0, %o0
991	sll	%o0, 3, %o0		! %o0 left shift
992	mov	0x40, %o1
993	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
994	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
995	prefetch [%l0+0x0], #one_read
996	ldda	[%i1+0x0]%asi, %l2	! partial data in %l3 for this read and
997					! no data in %l2
998loop2:
999	ldda	[%i1+0x10]%asi, %l4	! %l4 has complete data and %l5 has
1000					! partial
1001	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
1002							! into %l3 and %l4
1003	prefetch [%l0+0x40], #one_read
1004	stxa	%l3, [%i0+0x0]%asi
1005	stxa	%l4, [%i0+0x8]%asi
1006
1007	ldda	[%i1+0x20]%asi, %l2
1008	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
1009	stxa	%l5, [%i0+0x10]%asi			! %l5 from previous read
1010	stxa	%l2, [%i0+0x18]%asi			! into %l5 and %l2
1011
1012	! Repeat the same for next 32 bytes.
1013
1014	ldda	[%i1+0x30]%asi, %l4
1015	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
1016	stxa	%l3, [%i0+0x20]%asi
1017	stxa	%l4, [%i0+0x28]%asi
1018
1019	ldda	[%i1+0x40]%asi, %l2
1020	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
1021	stxa	%l5, [%i0+0x30]%asi
1022	stxa	%l2, [%i0+0x38]%asi
1023
1024	add	%l0, 0x40, %l0
1025	add	%i1, 0x40, %i1
1026	subcc	%i3, 0x40, %i3
1027	bgu,pt	%xcc, loop2
1028	add	%i0, 0x40, %i0
1029	ba	.blkdone
1030	add	%i1, %o2, %i1		! increment the source by src offset
1031					! the src offset was stored in %o2
1032
1033
1034	! Both Source and Destination are block aligned.
1035	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
1036.blkcpy:
1037	prefetch [%i1+0x0], #one_read
10381:
1039	ldda	[%i1+0x0]%asi, %l0
1040	ldda	[%i1+0x10]%asi, %l2
1041	prefetch [%i1+0x40], #one_read
1042
1043	stxa	%l0, [%i0+0x0]%asi
1044	ldda	[%i1+0x20]%asi, %l4
1045	ldda	[%i1+0x30]%asi, %l6
1046
1047	stxa	%l1, [%i0+0x8]%asi
1048	stxa	%l2, [%i0+0x10]%asi
1049	stxa	%l3, [%i0+0x18]%asi
1050	stxa	%l4, [%i0+0x20]%asi
1051	stxa	%l5, [%i0+0x28]%asi
1052	stxa	%l6, [%i0+0x30]%asi
1053	stxa	%l7, [%i0+0x38]%asi
1054
1055	add	%i1, 0x40, %i1
1056	subcc	%i3, 0x40, %i3
1057	bgu,pt	%xcc, 1b
1058	add	%i0, 0x40, %i0
1059
1060.blkdone:
1061	membar	#Sync
1062#endif	/* NIAGARA_IMPL */
1063
1064	brz,pt	%i2, .blkexit
1065	nop
1066
1067	! Handle trailing bytes
1068	cmp	%i2, 0x8
1069	blu,pt	%ncc, .residue
1070	nop
1071
1072	! Can we do some 8B ops
1073	or	%i1, %i0, %o2
1074	andcc	%o2, 0x7, %g0
1075	bnz	%ncc, .last4
1076	nop
1077
1078	! Do 8byte ops as long as possible
1079.last8:
1080	ldx	[%i1], %o2
1081	stx	%o2, [%i0]
1082	add	%i1, 0x8, %i1
1083	sub	%i2, 0x8, %i2
1084	cmp	%i2, 0x8
1085	bgu,pt	%ncc, .last8
1086	add	%i0, 0x8, %i0
1087
1088	brz,pt	%i2, .blkexit
1089	nop
1090
1091	ba	.residue
1092	nop
1093
1094.last4:
1095	! Can we do 4B ops
1096	andcc	%o2, 0x3, %g0
1097	bnz	%ncc, .last2
1098	nop
10991:
1100	ld	[%i1], %o2
1101	st	%o2, [%i0]
1102	add	%i1, 0x4, %i1
1103	sub	%i2, 0x4, %i2
1104	cmp	%i2, 0x4
1105	bgu,pt	%ncc, 1b
1106	add	%i0, 0x4, %i0
1107
1108	brz,pt	%i2, .blkexit
1109	nop
1110
1111	ba	.residue
1112	nop
1113
1114.last2:
1115	! Can we do 2B ops
1116	andcc	%o2, 0x1, %g0
1117	bnz	%ncc, .residue
1118	nop
1119
11201:
1121	lduh	[%i1], %o2
1122	stuh	%o2, [%i0]
1123	add	%i1, 0x2, %i1
1124	sub	%i2, 0x2, %i2
1125	cmp	%i2, 0x2
1126	bgu,pt	%ncc, 1b
1127	add	%i0, 0x2, %i0
1128
1129	brz,pt	%i2, .blkexit
1130	nop
1131
1132.residue:
1133	ldub	[%i1], %o2
1134	stb	%o2, [%i0]
1135	inc	%i1
1136	deccc	%i2
1137	bgu,pt	%ncc, .residue
1138	inc	%i0
1139
1140.blkexit:
1141#if !defined(NIAGARA_IMPL)
1142	btst	FPUSED_FLAG, %o5
1143	bz	%icc, 1f
1144	  and	%o5,  COPY_FLAGS, %l1	! Store flags in %l1
1145					! We can't clear the flags from %o5 yet
1146					! If there's an error, .copyerr will
1147					! need them
1148
1149	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
1150	wr	%o2, 0, %gsr
1151
1152	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1153	btst	FPRS_FEF, %o3
1154	bz,pt	%icc, 4f
1155	  nop
1156
1157	! restore fpregs from stack
1158	BLD_FP_FROMSTACK(%o2)
1159
1160	ba,pt	%ncc, 2f
1161	  wr	%o3, 0, %fprs		! restore fprs
1162
11634:
1164	FZERO
1165	wr	%o3, 0, %fprs		! restore fprs
1166
11672:
1168	ldn	[THREAD_REG + T_LWP], %o2
1169	brnz,pt	%o2, 1f
1170	  nop
1171
1172	ldsb	[THREAD_REG + T_PREEMPT], %l0
1173	deccc	%l0
1174	bnz,pn	%ncc, 1f
1175	  stb	%l0, [THREAD_REG + T_PREEMPT]
1176
1177	! Check for a kernel preemption request
1178	ldn	[THREAD_REG + T_CPU], %l0
1179	ldub	[%l0 + CPU_KPRUNRUN], %l0
1180	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
1181	  or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
1182
11831:
1184	btst	BCOPY_FLAG, %l1
1185	bz,pn	%icc, 3f
1186	andncc	%o5, COPY_FLAGS, %o5
1187
1188	! Here via bcopy. Check to see if the handler was NULL.
1189	! If so, just return quietly. Otherwise, reset the
1190	! handler and go home.
1191	bnz,pn	%ncc, 3f
1192	nop
1193
1194	! Null handler.
1195	btst	KPREEMPT_FLAG, %l1
1196	bz,pt	%icc, 2f
1197	  nop
1198	call	kpreempt
1199	  rdpr	%pil, %o0	! pass %pil
12002:
1201
1202	ret
1203	restore	%g0, 0, %o0
1204
1205	! Here via kcopy or bcopy with a handler.
1206	! Reset the fault handler.
12073:
1208	membar	#Sync
1209	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1210
1211	! call kpreempt if necessary
1212	btst	KPREEMPT_FLAG, %l1
1213	bz,pt	%icc, 4f
1214	  nop
1215	call	kpreempt
1216	  rdpr	%pil, %o0
12174:
1218#else	/* NIAGARA_IMPL */
1219	membar	#Sync				! sync error barrier
1220	! Restore t_lofault handler, if came here from kcopy().
1221	tst	%o5
1222	bz	%ncc, 1f
1223	andn	%o5, LOFAULT_SET, %o5
1224	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
12251:
1226#endif	/* NIAGARA_IMPL */
1227	ret
1228	restore	%g0, 0, %o0
1229
1230.bcb_punt:
1231	!
1232	! use aligned transfers where possible
1233	!
1234	xor	%i0, %i1, %o4		! xor from and to address
1235	btst	7, %o4			! if lower three bits zero
1236	bz	.aldoubcp		! can align on double boundary
1237	.empty	! assembler complaints about label
1238
1239	xor	%i0, %i1, %o4		! xor from and to address
1240	btst	3, %o4			! if lower two bits zero
1241	bz	.alwordcp		! can align on word boundary
1242	btst	3, %i0			! delay slot, from address unaligned?
1243	!
	! use aligned reads and writes where possible
	! this differs from wordcp in that it copes
	! with odd alignment between source and destination
	! using word reads and writes with the proper shifts
	! in between to align transfers to and from memory
	! i0 - src address, i1 - dest address, i2 - count
	! i3, i4 - tmps used for generating complete word
1251	! i5 (word to write)
1252	! l0 size in bits of upper part of source word (US)
1253	! l1 size in bits of lower part of source word (LS = 32 - US)
1254	! l2 size in bits of upper part of destination word (UD)
1255	! l3 size in bits of lower part of destination word (LD = 32 - UD)
1256	! l4 number of bytes leftover after aligned transfers complete
1257	! l5 the number 32
1258	!
1259	mov	32, %l5			! load an oft-needed constant
1260	bz	.align_dst_only
1261	btst	3, %i1			! is destnation address aligned?
1262	clr	%i4			! clear registers used in either case
1263	bz	.align_src_only
1264	clr	%l0
1265	!
1266	! both source and destination addresses are unaligned
1267	!
12681:					! align source
1269	ldub	[%i0], %i3		! read a byte from source address
1270	add	%i0, 1, %i0		! increment source address
1271	or	%i4, %i3, %i4		! or in with previous bytes (if any)
1272	btst	3, %i0			! is source aligned?
1273	add	%l0, 8, %l0		! increment size of upper source (US)
1274	bnz,a	1b
1275	sll	%i4, 8, %i4		! make room for next byte
1276
1277	sub	%l5, %l0, %l1		! generate shift left count (LS)
1278	sll	%i4, %l1, %i4		! prepare to get rest
1279	ld	[%i0], %i3		! read a word
1280	add	%i0, 4, %i0		! increment source address
1281	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
1282	or	%i4, %i5, %i5		! merge
1283	mov	24, %l3			! align destination
12841:
1285	srl	%i5, %l3, %i4		! prepare to write a single byte
1286	stb	%i4, [%i1]		! write a byte
1287	add	%i1, 1, %i1		! increment destination address
1288	sub	%i2, 1, %i2		! decrement count
1289	btst	3, %i1			! is destination aligned?
1290	bnz,a	1b
1291	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
1292	sub	%l5, %l3, %l2		! generate shift left count (UD)
1293	sll	%i5, %l2, %i5		! move leftover into upper bytes
1294	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
1295	bgu	%ncc, .more_needed	! need more to fill than we have
1296	nop
1297
1298	sll	%i3, %l1, %i3		! clear upper used byte(s)
1299	srl	%i3, %l1, %i3
1300	! get the odd bytes between alignments
1301	sub	%l0, %l2, %l0		! regenerate shift count
1302	sub	%l5, %l0, %l1		! generate new shift left count (LS)
1303	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
1304	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
1305	srl	%i3, %l0, %i4
1306	or	%i5, %i4, %i5
1307	st	%i5, [%i1]		! write a word
1308	subcc	%i2, 4, %i2		! decrement count
1309	bz	%ncc, .unalign_out
1310	add	%i1, 4, %i1		! increment destination address
1311
1312	b	2f
1313	sll	%i3, %l1, %i5		! get leftover into upper bits
1314.more_needed:
1315	sll	%i3, %l0, %i3		! save remaining byte(s)
1316	srl	%i3, %l0, %i3
1317	sub	%l2, %l0, %l1		! regenerate shift count
1318	sub	%l5, %l1, %l0		! generate new shift left count
1319	sll	%i3, %l1, %i4		! move to fill empty space
1320	b	3f
1321	or	%i5, %i4, %i5		! merge to complete word
1322	!
1323	! the source address is aligned and destination is not
1324	!
1325.align_dst_only:
1326	ld	[%i0], %i4		! read a word
1327	add	%i0, 4, %i0		! increment source address
1328	mov	24, %l0			! initial shift alignment count
13291:
1330	srl	%i4, %l0, %i3		! prepare to write a single byte
1331	stb	%i3, [%i1]		! write a byte
1332	add	%i1, 1, %i1		! increment destination address
1333	sub	%i2, 1, %i2		! decrement count
1334	btst	3, %i1			! is destination aligned?
1335	bnz,a	1b
1336	sub	%l0, 8, %l0		! delay slot, decrement shift count
1337.xfer:
1338	sub	%l5, %l0, %l1		! generate shift left count
1339	sll	%i4, %l1, %i5		! get leftover
13403:
1341	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
1342	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
13432:
1344	ld	[%i0], %i3		! read a source word
1345	add	%i0, 4, %i0		! increment source address
1346	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
1347	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
1348	st	%i5, [%i1]		! write a destination word
1349	subcc	%i2, 4, %i2		! decrement count
1350	bz	%ncc, .unalign_out	! check if done
1351	add	%i1, 4, %i1		! increment destination address
1352	b	2b			! loop
1353	sll	%i3, %l1, %i5		! get leftover
1354.unalign_out:
1355	tst	%l4			! any bytes leftover?
1356	bz	%ncc, .cpdone
1357	.empty				! allow next instruction in delay slot
13581:
1359	sub	%l0, 8, %l0		! decrement shift
1360	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
1361	stb	%i4, [%i1]		! write a byte
1362	subcc	%l4, 1, %l4		! decrement count
1363	bz	%ncc, .cpdone		! done?
1364	add	%i1, 1, %i1		! increment destination
1365	tst	%l0			! any more previously read bytes
1366	bnz	%ncc, 1b		! we have leftover bytes
1367	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
1368	b	.dbytecp		! let dbytecp do the rest
1369	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
1370	!
1371	! the destination address is aligned and the source is not
1372	!
1373.align_src_only:
1374	ldub	[%i0], %i3		! read a byte from source address
1375	add	%i0, 1, %i0		! increment source address
1376	or	%i4, %i3, %i4		! or in with previous bytes (if any)
1377	btst	3, %i0			! is source aligned?
1378	add	%l0, 8, %l0		! increment shift count (US)
1379	bnz,a	.align_src_only
1380	sll	%i4, 8, %i4		! make room for next byte
1381	b,a	.xfer
1382	!
1383	! if from address unaligned for double-word moves,
1384	! move bytes till it is, if count is < 56 it could take
1385	! longer to align the thing than to do the transfer
1386	! in word size chunks right away
1387	!
1388.aldoubcp:
1389	cmp	%i2, 56			! if count < 56, use wordcp, it takes
1390	blu,a	%ncc, .alwordcp		! longer to align doubles than words
1391	mov	3, %o0			! mask for word alignment
1392	call	.alignit		! copy bytes until aligned
1393	mov	7, %o0			! mask for double alignment
1394	!
1395	! source and destination are now double-word aligned
1396	! i3 has aligned count returned by alignit
1397	!
1398	and	%i2, 7, %i2		! unaligned leftover count
1399	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
14005:
1401	ldx	[%i0+%i1], %o4		! read from address
1402	stx	%o4, [%i1]		! write at destination address
1403	subcc	%i3, 8, %i3		! dec count
1404	bgu	%ncc, 5b
1405	add	%i1, 8, %i1		! delay slot, inc to address
1406	cmp	%i2, 4			! see if we can copy a word
1407	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
1408	.empty
1409	!
1410	! for leftover bytes we fall into wordcp, if needed
1411	!
1412.wordcp:
1413	and	%i2, 3, %i2		! unaligned leftover count
14145:
1415	ld	[%i0+%i1], %o4		! read from address
1416	st	%o4, [%i1]		! write at destination address
1417	subcc	%i3, 4, %i3		! dec count
1418	bgu	%ncc, 5b
1419	add	%i1, 4, %i1		! delay slot, inc to address
1420	b,a	.dbytecp
1421
1422	! we come here to align copies on word boundaries
1423.alwordcp:
1424	call	.alignit		! go word-align it
1425	mov	3, %o0			! bits that must be zero to be aligned
1426	b	.wordcp
1427	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
1428
1429	!
1430	! byte copy, works with any alignment
1431	!
1432.bytecp:
1433	b	.dbytecp
1434	sub	%i0, %i1, %i0		! i0 gets difference of src and dst
1435
1436	!
1437	! differenced byte copy, works with any alignment
1438	! assumes dest in %i1 and (source - dest) in %i0
1439	!
14401:
1441	stb	%o4, [%i1]		! write to address
1442	inc	%i1			! inc to address
1443.dbytecp:
1444	deccc	%i2			! dec count
1445	bgeu,a	%ncc, 1b		! loop till done
1446	ldub	[%i0+%i1], %o4		! read from address
1447.cpdone:
1448#if !defined(NIAGARA_IMPL)
1449	! FPUSED_FLAG will not have been set in any path leading to
1450	! this point. No need to deal with it.
1451	btst	BCOPY_FLAG, %o5
1452	bz,pn	%icc, 2f
1453	andcc	%o5, BCOPY_FLAG, %o5
1454	! Here via bcopy. Check to see if the handler was NULL.
1455	! If so, just return quietly. Otherwise, reset the
1456	! handler and go home.
1457	bnz,pn	%ncc, 2f
1458	nop
1459	!
1460	! Null handler.
1461	!
1462	ret
1463	restore %g0, 0, %o0
1464	! Here via kcopy or bcopy with a handler.
1465	! Reset the fault handler.
14662:
1467	membar	#Sync
1468	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1469#else	/* NIAGARA_IMPL */
1470	membar	#Sync				! sync error barrier
1471	! Restore t_lofault handler, if came here from kcopy().
1472	tst	%o5
1473	bz	%ncc, 1f
1474	andn	%o5, LOFAULT_SET, %o5
1475	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
14761:
1477#endif	/* NIAGARA_IMPL */
1478	ret
1479	restore %g0, 0, %o0		! return (0)
1480
1481/*
 * Common code used to align transfers on word and doubleword
 * boundaries.  Aligns source and destination and returns a count
 * of aligned bytes to transfer in %i3
1485 */
14861:
1487	inc	%i0			! inc from
1488	stb	%o4, [%i1]		! write a byte
1489	inc	%i1			! inc to
1490	dec	%i2			! dec count
1491.alignit:
1492	btst	%o0, %i0		! %o0 is bit mask to check for alignment
1493	bnz,a	1b
1494	ldub	[%i0], %o4		! read next byte
1495
1496	retl
1497	andn	%i2, %o0, %i3		! return size of aligned bytes
1498	SET_SIZE(bcopy)
1499
1500#endif	/* lint */
1501
1502/*
1503 * Block copy with possibly overlapped operands.
1504 */
1505
#if defined(lint)

/*ARGSUSED*/
void
ovbcopy(const void *from, void *to, size_t count)
{
	/* lint-only stub; the implementation is the assembly below */
}

#else	/* lint */

	ENTRY(ovbcopy)
	!
	! Leaf routine, no register window is taken.
	!	%o0 = from, %o1 = to, %o2 = count
	!	%o3 = scratch: |from - to| first, then the byte in flight
	!
	orcc	%g0, %o2, %g0		! is count zero?
	bgu,a	%ncc, 3f		! no: go classify the copy
	subcc	%o0, %o1, %o3		! (taken path only) %o3 = from - to

	retl				! count was zero: nothing to copy
	nop
3:
	bneg,a	%ncc, 4f		! negative difference?
	sub	%g0, %o3, %o3		! (taken path only) negate it
4:	cmp	%o2, %o3		! count against abs(from - to)
	bleu	%ncc, bcopy		! count <= abs(diff): regions are
	.empty				!   disjoint, let bcopy do the work
	cmp	%o0, %o1		! executed on both paths: from vs. to
	blu	%ncc, .ov_bkwd		! from < to: must run back to front
	nop
	!
	! from > to with overlap: ascending byte copy.
	!
.ov_fwd:
	ldub	[%o0], %o3		! fetch next source byte
	stb	%o3, [%o1]		! and put it at the destination
	add	%o0, 1, %o0		! advance source
	subcc	%o2, 1, %o2		! one byte fewer to go
	bgu	%ncc, .ov_fwd		! keep going while bytes remain
	add	%o1, 1, %o1		! delay slot: advance destination

	retl				! done
	nop
	!
	! from < to with overlap: descending byte copy, using the
	! decremented count as the index into both buffers.
	!
.ov_bkwd:
	subcc	%o2, 1, %o2		! index of the last uncopied byte
	ldub	[%o0 + %o2], %o3	! fetch it from the source
	bgu	%ncc, .ov_bkwd		! more left if the old count was > 1
	stb	%o3, [%o1 + %o2]	! delay slot: store at same index in dst

	retl				! done
	nop
	SET_SIZE(ovbcopy)

#endif	/* lint */
1558
1559/*
1560 * hwblkpagecopy()
1561 *
1562 * Copies exactly one page.  This routine assumes the caller (ppcopy)
1563 * has already disabled kernel preemption and has checked
1564 * use_hw_bcopy.
1565 */
#ifdef lint
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{
	/* lint-only stub; the implementation is the assembly below */
}
#else /* lint */
	ENTRY(hwblkpagecopy)
	save	%sp, -SA(MINFRAME), %sp

	! Register usage after the save:
	!	%i0 - source address (arg), advanced 0x80 per pass
	!	%i1 - destination address (arg), advanced 0x80 per pass
	!	%i2 - bytes still to copy (not an arg)
	!	%l0-%l7 - 64 bytes of data in flight

	set	PAGESIZE, %i2

	/*
	 * Copying exactly one page and PAGESIZE is a multiple of 0x80,
	 * so the loop below moves 0x80 bytes per pass with no tail handling.
	 */
	wr	%g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
	prefetch [%i0+0x0], #one_read
	prefetch [%i0+0x40], #one_read
2:
	! Prefetch the two 64-byte lines of the next pass.
	prefetch [%i0+0x80], #one_read
	prefetch [%i0+0xc0], #one_read

	! First 64 bytes: each ldda fills an even/odd register pair
	! (16 bytes), then eight 8-byte stores write them out in order.
	ldda	[%i0+0x0]%asi, %l0
	ldda	[%i0+0x10]%asi, %l2
	ldda	[%i0+0x20]%asi, %l4
	ldda	[%i0+0x30]%asi, %l6
	stxa	%l0, [%i1+0x0]%asi
	stxa	%l1, [%i1+0x8]%asi
	stxa	%l2, [%i1+0x10]%asi
	stxa	%l3, [%i1+0x18]%asi
	stxa	%l4, [%i1+0x20]%asi
	stxa	%l5, [%i1+0x28]%asi
	stxa	%l6, [%i1+0x30]%asi
	stxa	%l7, [%i1+0x38]%asi

	! Second 64 bytes, same pattern at offsets 0x40-0x78.
	ldda	[%i0+0x40]%asi, %l0
	ldda	[%i0+0x50]%asi, %l2
	ldda	[%i0+0x60]%asi, %l4
	ldda	[%i0+0x70]%asi, %l6
	stxa	%l0, [%i1+0x40]%asi
	stxa	%l1, [%i1+0x48]%asi
	stxa	%l2, [%i1+0x50]%asi
	stxa	%l3, [%i1+0x58]%asi
	stxa	%l4, [%i1+0x60]%asi
	stxa	%l5, [%i1+0x68]%asi
	stxa	%l6, [%i1+0x70]%asi
	stxa	%l7, [%i1+0x78]%asi

	subcc	%i2, 0x80, %i2		! 0x80 fewer bytes remaining
	add	%i0, 0x80, %i0		! advance source (leaves cc alone)
	bgu,pt	%xcc, 2b		! loop until the page is done
	add	%i1, 0x80, %i1		! delay slot: advance destination

	membar #Sync			! sync after the %asi stores
	ret
	restore	%g0, 0, %o0		! pop window; caller's %o0 = 0
	SET_SIZE(hwblkpagecopy)
#endif	/* lint */
1625
1626
1627/*
1628 * Transfer data to and from user space -
1629 * Note that these routines can cause faults
1630 * It is assumed that the kernel has nothing at
1631 * less than KERNELBASE in the virtual address space.
1632 *
1633 * Note that copyin(9F) and copyout(9F) are part of the
1634 * DDI/DKI which specifies that they return '-1' on "errors."
1635 *
1636 * Sigh.
1637 *
1638 * So there's two extremely similar routines - xcopyin() and xcopyout()
1639 * which return the errno that we've faithfully computed.  This
1640 * allows other callers (e.g. uiomove(9F)) to work correctly.
1641 * Given that these are used pretty heavily, we expand the calling
1642 * sequences inline for all flavours (rather than making wrappers).
1643 *
1644 * There are also stub routines for xcopyout_little and xcopyin_little,
1645 * which currently are intended to handle requests of <= 16 bytes from
1646 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1647 * is left as an exercise...
1648 */
1649
1650/*
1651 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
1652 *
1653 * General theory of operation:
1654 *
1655 * None of the copyops routines grab a window until it's decided that
1656 * we need to do a HW block copy operation. This saves a window
1657 * spill/fill when we're called during socket ops. The typical IO
1658 * path won't cause spill/fill traps.
1659 *
1660 * This code uses a set of 4 limits for the maximum size that will
1661 * be copied given a particular input/output address alignment.
1662 * the default limits are:
1663 *
1664 * single byte aligned - 256 (hw_copy_limit_1)
1665 * two byte aligned - 512 (hw_copy_limit_2)
1666 * four byte aligned - 1024 (hw_copy_limit_4)
1667 * eight byte aligned - 1024 (hw_copy_limit_8)
1668 *
1669 * If the value for a particular limit is zero, the copy will be done
1670 * via the copy loops rather than block store/quad load instructions.
1671 *
1672 * Flow:
1673 *
1674 * If count == zero return zero.
1675 *
1676 * Store the previous lo_fault handler into %g6.
1677 * Place our secondary lofault handler into %g5.
1678 * Place the address of our nowindow fault handler into %o3.
1679 * Place the address of the windowed fault handler into %o4.
1680 * --> We'll use this handler if we end up grabbing a window
1681 * --> before we use block initializing store and quad load ASIs
1682 *
1683 * If count is less than or equal to SMALL_LIMIT (7) we
1684 * always do a byte for byte copy.
1685 *
1686 * If count is > SMALL_LIMIT, we check the alignment of the input
1687 * and output pointers. Based on the alignment we check count
1688 * against a limit based on detected alignment.  If we exceed the
1689 * alignment value we copy via block initializing store and quad
1690 * load instructions.
1691 *
1692 * If we don't exceed one of the limits, we store -count in %o3,
1693 * we store the number of chunks (8, 4, 2 or 1 byte) operated
1694 * on in our basic copy loop in %o2. Following this we branch
1695 * to the appropriate copy loop and copy that many chunks.
1696 * Since we've been adding the chunk size to %o3 each time through
 * as well as decrementing %o2, we can tell if any data is
 * left to be copied by examining %o3. If that is zero, we're
1699 * done and can go home. If not, we figure out what the largest
1700 * chunk size left to be copied is and branch to that copy loop
1701 * unless there's only one byte left. We load that as we're
1702 * branching to code that stores it just before we return.
1703 *
1704 * Fault handlers are invoked if we reference memory that has no
1705 * current mapping.  All forms share the same copyio_fault handler.
1706 * This routine handles fixing up the stack and general housecleaning.
1707 * Each copy operation has a simple fault handler that is then called
 * to do the work specific to the individual operation.  The handlers
 * for copyOP and xcopyOP are found at the end of each individual function.
1710 * The handlers for xcopyOP_little are found at the end of xcopyin_little.
1711 * The handlers for copyOP_noerr are found at the end of copyin_noerr.
1712 */
1713
1714/*
1715 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1716 */
1717
1718#if defined(lint)
1719
/*ARGSUSED*/
int
copyout(const void *kaddr, void *uaddr, size_t count)
{
	/* lint-only stub: always reports success */
	return (0);
}
1724
1725#else	/* lint */
1726
1727/*
1728 * We save the arguments in the following registers in case of a fault:
1729 * 	kaddr - %g2
1730 * 	uaddr - %g3
1731 * 	count - %g4
1732 */
1733#define	SAVE_SRC	%g2
1734#define	SAVE_DST	%g3
1735#define	SAVE_COUNT	%g4
1736
1737#define	REAL_LOFAULT		%g5
1738#define	SAVED_LOFAULT		%g6
1739
1740/*
1741 * Generic copyio fault handler.  This is the first line of defense when a
1742 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
1743 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1744 * This allows us to share common code for all the flavors of the copy
1745 * operations, including the _noerr versions.
1746 *
1747 * Note that this function will restore the original input parameters before
1748 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
1749 * member of the t_copyop structure, if needed.
1750 */
1751	ENTRY(copyio_fault)
1752#if !defined(NIAGARA_IMPL)
1753	btst	FPUSED_FLAG, SAVED_LOFAULT
1754	bz	1f
1755	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
1756
1757	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1758	wr	%o2, 0, %gsr		! restore gsr
1759
1760	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1761	btst	FPRS_FEF, %o3
1762	bz	%icc, 4f
1763	  nop
1764
1765	! restore fpregs from stack
1766	BLD_FP_FROMSTACK(%o2)
1767
1768	ba,pt	%ncc, 1f
1769	  wr	%o3, 0, %fprs		! restore fprs
1770
17714:
1772	FZERO				! zero all of the fpregs
1773	wr	%o3, 0, %fprs		! restore fprs
1774
17751:
1776#else	/* NIAGARA_IMPL */
1777	membar	#Sync
1778	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1779#endif	/* NIAGARA_IMPL */
1780
1781	restore
1782
1783	mov	SAVE_SRC, %o0
1784	mov	SAVE_DST, %o1
1785	jmp	REAL_LOFAULT
1786	  mov	SAVE_COUNT, %o2
1787	SET_SIZE(copyio_fault)
1788
1789	ENTRY(copyio_fault_nowindow)
1790	membar	#Sync
1791	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1792
1793	mov	SAVE_SRC, %o0
1794	mov	SAVE_DST, %o1
1795	jmp	REAL_LOFAULT
1796	  mov	SAVE_COUNT, %o2
1797	SET_SIZE(copyio_fault_nowindow)
1798
1799	ENTRY(copyout)
1800	sethi	%hi(.copyout_err), REAL_LOFAULT
1801	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
1802
1803.do_copyout:
1804	!
1805	! Check the length and bail if zero.
1806	!
1807	tst	%o2
1808	bnz,pt	%ncc, 1f
1809	  nop
1810	retl
1811	  clr	%o0
18121:
1813	sethi	%hi(copyio_fault), %o4
1814	or	%o4, %lo(copyio_fault), %o4
1815	sethi	%hi(copyio_fault_nowindow), %o3
1816	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
1817	or	%o3, %lo(copyio_fault_nowindow), %o3
1818	membar	#Sync
1819	stn	%o3, [THREAD_REG + T_LOFAULT]
1820
1821	mov	%o0, SAVE_SRC
1822	mov	%o1, SAVE_DST
1823	mov	%o2, SAVE_COUNT
1824
1825	!
1826	! Check to see if we're more than SMALL_LIMIT (7 bytes).
1827	! Run in leaf mode, using the %o regs as our input regs.
1828	!
1829	subcc	%o2, SMALL_LIMIT, %o3
1830	bgu,a,pt %ncc, .dco_ns
1831	or	%o0, %o1, %o3
1832	!
1833	! What was previously ".small_copyout"
1834	! Do full differenced copy.
1835	!
1836.dcobcp:
1837	sub	%g0, %o2, %o3		! negate count
1838	add	%o0, %o2, %o0		! make %o0 point at the end
1839	add	%o1, %o2, %o1		! make %o1 point at the end
1840	ba,pt	%ncc, .dcocl
1841	ldub	[%o0 + %o3], %o4	! load first byte
1842	!
	! %o0 and %o1 point at the end and remain pointing at the end
	! of their buffers. We pull things out by adding %o3 (which is
	! the negation of the length) to the buffer end which gives us
	! the current location in the buffers. By incrementing %o3 we walk
	! through both buffers without having to bump each buffer's
	! pointer. A very fast 4 instruction loop.
1849	!
1850	.align 16
1851.dcocl:
1852	stba	%o4, [%o1 + %o3]ASI_USER
1853	inccc	%o3
1854	bl,a,pt	%ncc, .dcocl
1855	ldub	[%o0 + %o3], %o4
1856	!
1857	! We're done. Go home.
1858	!
1859	membar	#Sync
1860	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
1861	retl
1862	clr	%o0
1863	!
1864	! Try aligned copies from here.
1865	!
1866.dco_ns:
1867	! %o0 = kernel addr (to be copied from)
1868	! %o1 = user addr (to be copied to)
1869	! %o2 = length
	! %o3 = %o0 | %o1 (used for alignment checking)
	! %o4 is alternate lo_fault (copyio_fault)
	! SAVED_LOFAULT (%g6) is original lo_fault
1873	!
1874	! See if we're single byte aligned. If we are, check the
1875	! limit for single byte copies. If we're smaller or equal,
1876	! bounce to the byte for byte copy loop. Otherwise do it in
1877	! HW (if enabled).
1878	!
1879	btst	1, %o3
1880	bz,pt	%icc, .dcoh8
1881	btst	7, %o3
1882	!
1883	! Single byte aligned. Do we do it via HW or via
1884	! byte for byte? Do a quick no memory reference
1885	! check to pick up small copies.
1886	!
1887	sethi	%hi(hw_copy_limit_1), %o3
1888	!
1889	! Big enough that we need to check the HW limit for
1890	! this size copy.
1891	!
1892	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
1893	!
1894	! Is HW copy on? If not, do everything byte for byte.
1895	!
1896	tst	%o3
1897	bz,pn	%icc, .dcobcp
1898	subcc	%o3, %o2, %o3
1899	!
1900	! If we're less than or equal to the single byte copy limit,
1901	! bop to the copy loop.
1902	!
1903	bge,pt	%ncc, .dcobcp
1904	nop
1905	!
1906	! We're big enough and copy is on. Do it with HW.
1907	!
1908	ba,pt	%ncc, .big_copyout
1909	nop
1910.dcoh8:
1911	!
1912	! 8 byte aligned?
1913	!
1914	bnz,a	%ncc, .dcoh4
1915	btst	3, %o3
1916	!
1917	! See if we're in the "small range".
1918	! If so, go off and do the copy.
1919	! If not, load the hard limit. %o3 is
1920	! available for reuse.
1921	!
1922	sethi	%hi(hw_copy_limit_8), %o3
1923	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
1924	!
1925	! If it's zero, there's no HW bcopy.
1926	! Bop off to the aligned copy.
1927	!
1928	tst	%o3
1929	bz,pn	%icc, .dcos8
1930	subcc	%o3, %o2, %o3
1931	!
1932	! We're negative if our size is larger than hw_copy_limit_8.
1933	!
1934	bge,pt	%ncc, .dcos8
1935	nop
1936	!
1937	! HW assist is on and we're large enough. Do it.
1938	!
1939	ba,pt	%ncc, .big_copyout
1940	nop
1941.dcos8:
1942	!
1943	! Housekeeping for copy loops. Uses same idea as in the byte for
1944	! byte copy loop above.
1945	!
1946	add	%o0, %o2, %o0
1947	add	%o1, %o2, %o1
1948	sub	%g0, %o2, %o3
1949	ba,pt	%ncc, .dodebc
1950	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
1951	!
1952	! 4 byte aligned?
1953	!
1954.dcoh4:
1955	bnz,pn	%ncc, .dcoh2
1956	!
1957	! See if we're in the "small range".
1958	! If so, go off an do the copy.
1959	! If not, load the hard limit. %o3 is
1960	! available for reuse.
1961	!
1962	sethi	%hi(hw_copy_limit_4), %o3
1963	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
1964	!
1965	! If it's zero, there's no HW bcopy.
1966	! Bop off to the aligned copy.
1967	!
1968	tst	%o3
1969	bz,pn	%icc, .dcos4
1970	subcc	%o3, %o2, %o3
1971	!
1972	! We're negative if our size is larger than hw_copy_limit_4.
1973	!
1974	bge,pt	%ncc, .dcos4
1975	nop
1976	!
1977	! HW assist is on and we're large enough. Do it.
1978	!
1979	ba,pt	%ncc, .big_copyout
1980	nop
1981.dcos4:
1982	add	%o0, %o2, %o0
1983	add	%o1, %o2, %o1
1984	sub	%g0, %o2, %o3
1985	ba,pt	%ncc, .dodfbc
1986	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
1987	!
1988	! We must be 2 byte aligned. Off we go.
1989	! The check for small copies was done in the
1990	! delay at .dcoh4
1991	!
1992.dcoh2:
1993	ble	%ncc, .dcos2
1994	sethi	%hi(hw_copy_limit_2), %o3
1995	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
1996	tst	%o3
1997	bz,pn	%icc, .dcos2
1998	subcc	%o3, %o2, %o3
1999	bge,pt	%ncc, .dcos2
2000	nop
2001	!
2002	! HW is on and we're big enough. Do it.
2003	!
2004	ba,pt	%ncc, .big_copyout
2005	nop
2006.dcos2:
2007	add	%o0, %o2, %o0
2008	add	%o1, %o2, %o1
2009	sub	%g0, %o2, %o3
2010	ba,pt	%ncc, .dodtbc
2011	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
2012.small_copyout:
2013	!
2014	! Why are we doing this AGAIN? There are certain conditions in
2015	! big_copyout that will cause us to forego the HW assisted copies
2016	! and bounce back to a non-HW assisted copy. This dispatches those
2017	! copies. Note that we branch around this in the main line code.
2018	!
2019	! We make no check for limits or HW enablement here. We've
2020	! already been told that we're a poster child so just go off
2021	! and do it.
2022	!
2023	or	%o0, %o1, %o3
2024	btst	1, %o3
2025	bnz	%icc, .dcobcp		! Most likely
2026	btst	7, %o3
2027	bz	%icc, .dcos8
2028	btst	3, %o3
2029	bz	%icc, .dcos4
2030	nop
2031	ba,pt	%ncc, .dcos2
2032	nop
2033	.align 32
2034.dodebc:
2035	ldx	[%o0 + %o3], %o4
2036	deccc	%o2
2037	stxa	%o4, [%o1 + %o3]ASI_USER
2038	bg,pt	%ncc, .dodebc
2039	addcc	%o3, 8, %o3
2040	!
2041	! End of copy loop. Check to see if we're done. Most
2042	! eight byte aligned copies end here.
2043	!
2044	bz,pt	%ncc, .dcofh
2045	nop
2046	!
2047	! Something is left - do it byte for byte.
2048	!
2049	ba,pt	%ncc, .dcocl
2050	ldub	[%o0 + %o3], %o4	! load next byte
2051	!
2052	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
2053	!
2054	.align 32
2055.dodfbc:
2056	lduw	[%o0 + %o3], %o4
2057	deccc	%o2
2058	sta	%o4, [%o1 + %o3]ASI_USER
2059	bg,pt	%ncc, .dodfbc
2060	addcc	%o3, 4, %o3
2061	!
2062	! End of copy loop. Check to see if we're done. Most
2063	! four byte aligned copies end here.
2064	!
2065	bz,pt	%ncc, .dcofh
2066	nop
2067	!
2068	! Something is left. Do it byte for byte.
2069	!
2070	ba,pt	%ncc, .dcocl
2071	ldub	[%o0 + %o3], %o4	! load next byte
2072	!
2073	! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
2074	! copy.
2075	!
2076	.align 32
2077.dodtbc:
2078	lduh	[%o0 + %o3], %o4
2079	deccc	%o2
2080	stha	%o4, [%o1 + %o3]ASI_USER
2081	bg,pt	%ncc, .dodtbc
2082	addcc	%o3, 2, %o3
2083	!
2084	! End of copy loop. Anything left?
2085	!
2086	bz,pt	%ncc, .dcofh
2087	nop
2088	!
2089	! Deal with the last byte
2090	!
2091	ldub	[%o0 + %o3], %o4
2092	stba	%o4, [%o1 + %o3]ASI_USER
2093.dcofh:
2094	membar	#Sync
2095	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2096	retl
2097	clr	%o0
2098
2099.big_copyout:
2100	! We're going to go off and do a block copy.
2101	! Switch fault handlers and grab a window. We
2102	! don't do a membar #Sync since we've done only
2103	! kernel data to this point.
2104	stn	%o4, [THREAD_REG + T_LOFAULT]
2105
	! Copyouts that reach here are larger than 256 bytes. The
	! hw_copy_limit_1 is set to 256. Never set this limit to less
	! than 128 bytes.
2109#if !defined(NIAGARA_IMPL)
2110	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2111
2112	rd	%fprs, %o2			! check for unused fp
2113	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save %fprs
2114	btst	FPRS_FEF, %o2
2115	bz,a,pt	%icc, .do_block_copyout
2116	wr	%g0, FPRS_FEF, %fprs
2117
2118	! save in-use fpregs on stack
2119	BST_FP_TOSTACK(%o2)
2120#else	/* NIAGARA_IMPL */
2121	save	%sp, -SA(MINFRAME), %sp
2122#endif	/* NIAGARA_IMPL */
2123
2124.do_block_copyout:
2125
2126#if !defined(NIAGARA_IMPL)
2127	rd	%gsr, %o2
2128	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2129	! set the lower bit saved t_lofault to indicate that we need
2130	! clear %fprs register on the way out
2131	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2132#endif	/* NIAGARA_IMPL */
2133
2134	! Swap src/dst since the code below is memcpy code
2135	! and memcpy/bcopy have different calling sequences
2136	mov	%i1, %i5
2137	mov	%i0, %i1
2138	mov	%i5, %i0
2139
2140	! Block (64 bytes) align the destination.
2141	andcc	%i0, 0x3f, %i3		! is dst block aligned
2142	bz	%ncc, copyout_blalign	! dst already block aligned
2143	sub	%i3, 0x40, %i3
2144	neg	%i3			! bytes till dst 64 bytes aligned
2145	sub	%i2, %i3, %i2		! update i2 with new count
2146
2147	! Based on source and destination alignment do
2148	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
2149
2150	! Is dst & src 8B aligned
2151	or	%i0, %i1, %o2
2152	andcc	%o2, 0x7, %g0
2153	bz	%ncc, .co_alewdcp
2154	nop
2155
2156	! Is dst & src 4B aligned
2157	andcc	%o2, 0x3, %g0
2158	bz	%ncc, .co_alwdcp
2159	nop
2160
2161	! Is dst & src 2B aligned
2162	andcc	%o2, 0x1, %g0
2163	bz	%ncc, .co_alhlfwdcp
2164	nop
2165
2166	! 1B aligned
21671:	ldub	[%i1], %o2
2168	stba	%o2, [%i0]ASI_USER
2169	inc	%i1
2170	deccc	%i3
2171	bgu,pt	%ncc, 1b
2172	inc	%i0
2173
2174	ba	copyout_blalign
2175	nop
2176
2177	! dst & src 4B aligned
2178.co_alwdcp:
2179	ld	[%i1], %o2
2180	sta	%o2, [%i0]ASI_USER
2181	add	%i1, 0x4, %i1
2182	subcc	%i3, 0x4, %i3
2183	bgu,pt	%ncc, .co_alwdcp
2184	add	%i0, 0x4, %i0
2185
2186	ba	copyout_blalign
2187	nop
2188
2189	! dst & src 2B aligned
2190.co_alhlfwdcp:
2191	lduh	[%i1], %o2
2192	stuha	%o2, [%i0]ASI_USER
2193	add	%i1, 0x2, %i1
2194	subcc	%i3, 0x2, %i3
2195	bgu,pt	%ncc, .co_alhlfwdcp
2196	add	%i0, 0x2, %i0
2197
2198	ba	copyout_blalign
2199	nop
2200
2201	! dst & src 8B aligned
2202.co_alewdcp:
2203	ldx	[%i1], %o2
2204	stxa	%o2, [%i0]ASI_USER
2205	add	%i1, 0x8, %i1
2206	subcc	%i3, 0x8, %i3
2207	bgu,pt	%ncc, .co_alewdcp
2208	add	%i0, 0x8, %i0
2209
2210	! Now Destination is block (64 bytes) aligned
2211copyout_blalign:
2212	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
2213	sub	%i2, %i3, %i2		! Residue bytes in %i2
2214
2215	mov	ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
2216
2217#if !defined(NIAGARA_IMPL)
2218	andn	%i1, 0x3f, %l0		! %l0 has block aligned src address
2219	prefetch [%l0+0x0], #one_read
2220	andcc	%i1, 0x3f, %g0		! is src 64B aligned
2221	bz,pn	%ncc, .co_blkcpy
2222	nop
2223
2224	! handle misaligned source cases
2225	alignaddr %i1, %g0, %g0		! generate %gsr
2226
2227	srl	%i1, 0x3, %l1		! src add bits 3, 4, 5 are now least
2228					! significant in %l1
2229	andcc	%l1, 0x7, %l2		! mask everything except bits 1, 2, 3
2230	add	%i1, %i3, %i1
2231
2232	! switch statement to get to right 8 byte block within
2233	! 64 byte block
2234	cmp	 %l2, 0x4
2235	bgeu,a	 co_hlf
2236	cmp	 %l2, 0x6
2237	cmp	 %l2, 0x2
2238	bgeu,a	 co_sqtr
2239	nop
2240	cmp	 %l2, 0x1
2241	be,a	 co_off15
2242	nop
2243	ba	 co_off7
2244	nop
2245co_sqtr:
2246	be,a	 co_off23
2247	nop
2248	ba,a	 co_off31
2249	nop
2250
2251co_hlf:
2252	bgeu,a	 co_fqtr
2253	nop
2254	cmp	 %l2, 0x5
2255	be,a	 co_off47
2256	nop
2257	ba	 co_off39
2258	nop
2259co_fqtr:
2260	be,a	 co_off55
2261	nop
2262
2263	ldd	[%l0+0x38], %d14
2264	prefetch [%l0+0x40], #one_read
2265	prefetch [%l0+0x80], #one_read
22667:
2267	add	%l0, 0x40, %l0
2268	stxa	%g0, [%i0]%asi		! initialize the cache line
2269
2270	ldda	[%l0]ASI_BLK_P, %d16
2271	ALIGN_OFF_56_63
2272	fmovd	%d30, %d14
2273
2274	stda	%d48, [%i0]ASI_BLK_AIUS
2275	subcc	%i3, 0x40, %i3
2276	add	%i0, 0x40, %i0
2277	bgu,pt	%ncc, 7b
2278	prefetch [%l0+0x80], #one_read
2279	ba	.co_blkdone
2280	membar	#Sync
2281
2282co_off7:
2283	ldda	[%l0]ASI_BLK_P, %d0
2284	prefetch [%l0+0x40], #one_read
2285	prefetch [%l0+0x80], #one_read
22860:
2287	add	%l0, 0x40, %l0
2288	stxa	%g0, [%i0]%asi		! initialize the cache line
2289
2290	ldda	[%l0]ASI_BLK_P, %d16
2291	ALIGN_OFF_1_7
2292	fmovd	%d16, %d0
2293	fmovd	%d18, %d2
2294	fmovd	%d20, %d4
2295	fmovd	%d22, %d6
2296	fmovd	%d24, %d8
2297	fmovd	%d26, %d10
2298	fmovd	%d28, %d12
2299	fmovd	%d30, %d14
2300
2301	stda	%d48, [%i0]ASI_BLK_AIUS
2302	subcc	%i3, 0x40, %i3
2303	add	%i0, 0x40, %i0
2304	bgu,pt	%ncc, 0b
2305	prefetch [%l0+0x80], #one_read
2306	ba	.co_blkdone
2307	membar	#Sync
2308
2309co_off15:
2310	ldd	[%l0+0x8], %d2
2311	ldd	[%l0+0x10], %d4
2312	ldd	[%l0+0x18], %d6
2313	ldd	[%l0+0x20], %d8
2314	ldd	[%l0+0x28], %d10
2315	ldd	[%l0+0x30], %d12
2316	ldd	[%l0+0x38], %d14
2317	prefetch [%l0+0x40], #one_read
2318	prefetch [%l0+0x80], #one_read
23191:
2320	add	%l0, 0x40, %l0
2321	stxa	%g0, [%i0]%asi		! initialize the cache line
2322
2323	ldda	[%l0]ASI_BLK_P, %d16
2324	ALIGN_OFF_8_15
2325	fmovd	%d18, %d2
2326	fmovd	%d20, %d4
2327	fmovd	%d22, %d6
2328	fmovd	%d24, %d8
2329	fmovd	%d26, %d10
2330	fmovd	%d28, %d12
2331	fmovd	%d30, %d14
2332
2333	stda	%d48, [%i0]ASI_BLK_AIUS
2334	subcc	%i3, 0x40, %i3
2335	add	%i0, 0x40, %i0
2336	bgu,pt	%ncc, 1b
2337	prefetch [%l0+0x80], #one_read
2338	ba	.co_blkdone
2339	membar	#Sync
2340
2341co_off23:
2342	ldd	[%l0+0x10], %d4
2343	ldd	[%l0+0x18], %d6
2344	ldd	[%l0+0x20], %d8
2345	ldd	[%l0+0x28], %d10
2346	ldd	[%l0+0x30], %d12
2347	ldd	[%l0+0x38], %d14
2348	prefetch [%l0+0x40], #one_read
2349	prefetch [%l0+0x80], #one_read
23502:
2351	add	%l0, 0x40, %l0
2352	stxa	%g0, [%i0]%asi		! initialize the cache line
2353
2354	ldda	[%l0]ASI_BLK_P, %d16
2355	ALIGN_OFF_16_23
2356	fmovd	%d20, %d4
2357	fmovd	%d22, %d6
2358	fmovd	%d24, %d8
2359	fmovd	%d26, %d10
2360	fmovd	%d28, %d12
2361	fmovd	%d30, %d14
2362
2363	stda	%d48, [%i0]ASI_BLK_AIUS
2364	subcc	%i3, 0x40, %i3
2365	add	%i0, 0x40, %i0
2366	bgu,pt	%ncc, 2b
2367	prefetch [%l0+0x80], #one_read
2368	ba	.co_blkdone
2369	membar	#Sync
2370
2371co_off31:
2372	ldd	[%l0+0x18], %d6
2373	ldd	[%l0+0x20], %d8
2374	ldd	[%l0+0x28], %d10
2375	ldd	[%l0+0x30], %d12
2376	ldd	[%l0+0x38], %d14
2377	prefetch [%l0+0x40], #one_read
2378	prefetch [%l0+0x80], #one_read
23793:
2380	add	%l0, 0x40, %l0
2381	stxa	%g0, [%i0]%asi		! initialize the cache line
2382
2383	ldda	[%l0]ASI_BLK_P, %d16
2384	ALIGN_OFF_24_31
2385	fmovd	%d22, %d6
2386	fmovd	%d24, %d8
2387	fmovd	%d26, %d10
2388	fmovd	%d28, %d12
2389	fmovd	%d30, %d14
2390
2391	stda	%d48, [%i0]ASI_BLK_AIUS
2392	subcc	%i3, 0x40, %i3
2393	add	%i0, 0x40, %i0
2394	bgu,pt	%ncc, 3b
2395	prefetch [%l0+0x80], #one_read
2396	ba	.co_blkdone
2397	membar	#Sync
2398
2399co_off39:
2400	ldd	[%l0+0x20], %d8
2401	ldd	[%l0+0x28], %d10
2402	ldd	[%l0+0x30], %d12
2403	ldd	[%l0+0x38], %d14
2404	prefetch [%l0+0x40], #one_read
2405	prefetch [%l0+0x80], #one_read
24064:
2407	add	%l0, 0x40, %l0
2408	stxa	%g0, [%i0]%asi		! initialize the cache line
2409
2410	ldda	[%l0]ASI_BLK_P, %d16
2411	ALIGN_OFF_32_39
2412	fmovd	%d24, %d8
2413	fmovd	%d26, %d10
2414	fmovd	%d28, %d12
2415	fmovd	%d30, %d14
2416
2417	stda	%d48, [%i0]ASI_BLK_AIUS
2418	subcc	%i3, 0x40, %i3
2419	add	%i0, 0x40, %i0
2420	bgu,pt	%ncc, 4b
2421	prefetch [%l0+0x80], #one_read
2422	ba	.co_blkdone
2423	membar	#Sync
2424
2425co_off47:
2426	ldd	[%l0+0x28], %d10
2427	ldd	[%l0+0x30], %d12
2428	ldd	[%l0+0x38], %d14
2429	prefetch [%l0+0x40], #one_read
2430	prefetch [%l0+0x80], #one_read
24315:
2432	add	%l0, 0x40, %l0
2433	stxa	%g0, [%i0]%asi		! initialize the cache line
2434
2435	ldda	[%l0]ASI_BLK_P, %d16
2436	ALIGN_OFF_40_47
2437	fmovd	%d26, %d10
2438	fmovd	%d28, %d12
2439	fmovd	%d30, %d14
2440
2441	stda	%d48, [%i0]ASI_BLK_AIUS
2442	subcc	%i3, 0x40, %i3
2443	add	%i0, 0x40, %i0
2444	bgu,pt	%ncc, 5b
2445	prefetch [%l0+0x80], #one_read
2446	ba	.co_blkdone
2447	membar	#Sync
2448
2449co_off55:
2450	ldd	[%l0+0x30], %d12
2451	ldd	[%l0+0x38], %d14
2452	prefetch [%l0+0x40], #one_read
2453	prefetch [%l0+0x80], #one_read
24546:
2455	add	%l0, 0x40, %l0
2456	stxa	%g0, [%i0]%asi		! initialize the cache line
2457
2458	ldda	[%l0]ASI_BLK_P, %d16
2459	ALIGN_OFF_48_55
2460	fmovd	%d28, %d12
2461	fmovd	%d30, %d14
2462
2463	stda	%d48, [%i0]ASI_BLK_AIUS
2464	subcc	%i3, 0x40, %i3
2465	add	%i0, 0x40, %i0
2466	bgu,pt	%ncc, 6b
2467	prefetch [%l0+0x80], #one_read
2468	ba	.co_blkdone
2469	membar	#Sync
2470
2471.co_blkcpy:
2472	prefetch [%i1+0x40], #one_read
2473	prefetch [%i1+0x80], #one_read
24748:
2475	stxa	%g0, [%i0]%asi		! initialize the cache line
2476	ldda	[%i1]ASI_BLK_P, %d0
2477	stda	%d0, [%i0]ASI_BLK_AIUS
2478
2479	add	%i1, 0x40, %i1
2480	subcc	%i3, 0x40, %i3
2481	add	%i0, 0x40, %i0
2482	bgu,pt	%ncc, 8b
2483	prefetch [%i1+0x80], #one_read
2484	membar	#Sync
2485
2486.co_blkdone:
2487#else	/* NIAGARA_IMPL */
2488	andcc	%i1, 0xf, %o2		! is src quadword aligned
2489	bz,pn	%xcc, .co_blkcpy	! src offset in %o2 (last 4-bits)
2490	nop
2491	cmp	%o2, 0x8
2492	bg	.co_upper_double
2493	nop
2494	bl	.co_lower_double
2495	nop
2496
2497	! Falls through when source offset is equal to 8 i.e.
2498	! source is double word aligned.
2499	! In this case no shift/merge of data is required
2500
2501	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2502	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2503	prefetch [%l0+0x0], #one_read
2504	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2505.co_loop0:
2506	add	%i1, 0x10, %i1
2507	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
2508	prefetch [%l0+0x40], #one_read
2509
2510	stxa	%l3, [%i0+0x0]%asi
2511	stxa	%l4, [%i0+0x8]%asi
2512
2513	add	%i1, 0x10, %i1
2514	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2515
2516	stxa	%l5, [%i0+0x10]%asi
2517	stxa	%l2, [%i0+0x18]%asi
2518
2519	add	%i1, 0x10, %i1
2520	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
2521
2522	stxa	%l3, [%i0+0x20]%asi
2523	stxa	%l4, [%i0+0x28]%asi
2524
2525	add	%i1, 0x10, %i1
2526	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2527
2528	stxa	%l5, [%i0+0x30]%asi
2529	stxa	%l2, [%i0+0x38]%asi
2530
2531	add	%l0, 0x40, %l0
2532	subcc	%i3, 0x40, %i3
2533	bgu,pt	%xcc, .co_loop0
2534	add	%i0, 0x40, %i0
2535	ba	.co_blkdone
2536	add	%i1, %o2, %i1		! increment the source by src offset
2537					! the src offset was stored in %o2
2538
2539.co_lower_double:
2540
2541	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2542	sll	%o2, 3, %o0		! %o0 left shift
2543	mov	0x40, %o1
2544	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2545	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2546	prefetch [%l0+0x0], #one_read
2547	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l2 and %l3 has
2548					! complete data
2549.co_loop1:
2550	add	%i1, 0x10, %i1
2551	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has partial data
2552							! for this read.
2553	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
2554							! into %l2 and %l3
2555	prefetch [%l0+0x40], #one_read
2556
2557	stxa	%l2, [%i0+0x0]%asi
2558	stxa	%l3, [%i0+0x8]%asi
2559
2560	add	%i1, 0x10, %i1
2561	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2562	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
2563							! %l4 from previous read
2564							! into %l4 and %l5
2565	stxa	%l4, [%i0+0x10]%asi
2566	stxa	%l5, [%i0+0x18]%asi
2567
2568	! Repeat the same for next 32 bytes.
2569
2570	add	%i1, 0x10, %i1
2571	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
2572	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
2573
2574	stxa	%l2, [%i0+0x20]%asi
2575	stxa	%l3, [%i0+0x28]%asi
2576
2577	add	%i1, 0x10, %i1
2578	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2579	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
2580
2581	stxa	%l4, [%i0+0x30]%asi
2582	stxa	%l5, [%i0+0x38]%asi
2583
2584	add	%l0, 0x40, %l0
2585	subcc	%i3, 0x40, %i3
2586	bgu,pt	%xcc, .co_loop1
2587	add	%i0, 0x40, %i0
2588	ba	.co_blkdone
2589	add	%i1, %o2, %i1		! increment the source by src offset
2590					! the src offset was stored in %o2
2591
2592.co_upper_double:
2593
2594	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2595	sub	%o2, 0x8, %o0
2596	sll	%o0, 3, %o0		! %o0 left shift
2597	mov	0x40, %o1
2598	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2599	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2600	prefetch [%l0+0x0], #one_read
2601	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l3
2602							! for this read and
2603							! no data in %l2
2604.co_loop2:
2605	add	%i1, 0x10, %i1
2606	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has complete data
2607							! and %l5 has partial
2608	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
2609							! into %l3 and %l4
2610	prefetch [%l0+0x40], #one_read
2611
2612	stxa	%l3, [%i0+0x0]%asi
2613	stxa	%l4, [%i0+0x8]%asi
2614
2615	add	%i1, 0x10, %i1
2616	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2617	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
2618							! %l5 from previous read
2619							! into %l5 and %l2
2620
2621	stxa	%l5, [%i0+0x10]%asi
2622	stxa	%l2, [%i0+0x18]%asi
2623
2624	! Repeat the same for next 32 bytes.
2625
2626	add	%i1, 0x10, %i1
2627	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
2628	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
2629
2630	stxa	%l3, [%i0+0x20]%asi
2631	stxa	%l4, [%i0+0x28]%asi
2632
2633	add	%i1, 0x10, %i1
2634	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2635	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
2636
2637	stxa	%l5, [%i0+0x30]%asi
2638	stxa	%l2, [%i0+0x38]%asi
2639
2640	add	%l0, 0x40, %l0
2641	subcc	%i3, 0x40, %i3
2642	bgu,pt	%xcc, .co_loop2
2643	add	%i0, 0x40, %i0
2644	ba	.co_blkdone
2645	add	%i1, %o2, %i1		! increment the source by src offset
2646					! the src offset was stored in %o2
2647
2648
2649	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2650.co_blkcpy:
2651
2652	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
2653	prefetch [%o0+0x0], #one_read
26541:
2655	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
2656	add	%i1, 0x10, %i1
2657	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2658	add	%i1, 0x10, %i1
2659
2660	prefetch [%o0+0x40], #one_read
2661
2662	stxa	%l0, [%i0+0x0]%asi
2663
2664	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
2665	add	%i1, 0x10, %i1
2666	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
2667	add	%i1, 0x10, %i1
2668
2669	stxa	%l1, [%i0+0x8]%asi
2670	stxa	%l2, [%i0+0x10]%asi
2671	stxa	%l3, [%i0+0x18]%asi
2672	stxa	%l4, [%i0+0x20]%asi
2673	stxa	%l5, [%i0+0x28]%asi
2674	stxa	%l6, [%i0+0x30]%asi
2675	stxa	%l7, [%i0+0x38]%asi
2676
2677	add	%o0, 0x40, %o0
2678	subcc	%i3, 0x40, %i3
2679	bgu,pt	%xcc, 1b
2680	add	%i0, 0x40, %i0
2681
2682.co_blkdone:
2683	membar	#Sync
2684#endif	/* NIAGARA_IMPL */
2685
2686	brz,pt	%i2, .copyout_exit
2687	nop
2688
2689	! Handle trailing bytes
2690	cmp	%i2, 0x8
2691	blu,pt	%ncc, .co_residue
2692	nop
2693
2694	! Can we do some 8B ops
2695	or	%i1, %i0, %o2
2696	andcc	%o2, 0x7, %g0
2697	bnz	%ncc, .co_last4
2698	nop
2699
2700	! Do 8byte ops as long as possible
2701.co_last8:
2702	ldx	[%i1], %o2
2703	stxa	%o2, [%i0]ASI_USER
2704	add	%i1, 0x8, %i1
2705	sub	%i2, 0x8, %i2
2706	cmp	%i2, 0x8
2707	bgu,pt	%ncc, .co_last8
2708	add	%i0, 0x8, %i0
2709
2710	brz,pt	%i2, .copyout_exit
2711	nop
2712
2713	ba	.co_residue
2714	nop
2715
2716.co_last4:
2717	! Can we do 4B ops
2718	andcc	%o2, 0x3, %g0
2719	bnz	%ncc, .co_last2
2720	nop
27211:
2722	ld	[%i1], %o2
2723	sta	%o2, [%i0]ASI_USER
2724	add	%i1, 0x4, %i1
2725	sub	%i2, 0x4, %i2
2726	cmp	%i2, 0x4
2727	bgu,pt	%ncc, 1b
2728	add	%i0, 0x4, %i0
2729
2730	brz,pt	%i2, .copyout_exit
2731	nop
2732
2733	ba	.co_residue
2734	nop
2735
2736.co_last2:
2737	! Can we do 2B ops
2738	andcc	%o2, 0x1, %g0
2739	bnz	%ncc, .co_residue
2740	nop
2741
27421:
2743	lduh	[%i1], %o2
2744	stuha	%o2, [%i0]ASI_USER
2745	add	%i1, 0x2, %i1
2746	sub	%i2, 0x2, %i2
2747	cmp	%i2, 0x2
2748	bgu,pt	%ncc, 1b
2749	add	%i0, 0x2, %i0
2750
2751	brz,pt	%i2, .copyout_exit
2752	nop
2753
2754	! Copy the residue as byte copy
2755.co_residue:
2756	ldub	[%i1], %i4
2757	stba	%i4, [%i0]ASI_USER
2758	inc	%i1
2759	deccc	%i2
2760	bgu,pt	%xcc, .co_residue
2761	inc	%i0
2762
2763.copyout_exit:
2764#if !defined(NIAGARA_IMPL)
2765	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2766	wr	%o2, 0, %gsr		! restore gsr
2767
2768	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2769	btst	FPRS_FEF, %o3
2770	bz	%icc, 4f
2771	  nop
2772
2773	! restore fpregs from stack
2774	BLD_FP_FROMSTACK(%o2)
2775
2776	ba,pt	%ncc, 2f
2777	  wr	%o3, 0, %fprs		! restore fprs
2778
27794:
2780	FZERO				! zero all of the fpregs
2781	wr	%o3, 0, %fprs		! restore fprs
2782
27832:
2784	membar	#Sync
2785	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2786#else	/* NIAGARA_IMPL */
2787	membar	#Sync
2788#endif	/* NIAGARA_IMPL */
2789	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2790	ret
2791	restore	%g0, 0, %o0
2792
2793.copyout_err:
2794	ldn	[THREAD_REG + T_COPYOPS], %o4
2795	brz	%o4, 2f
2796	nop
2797	ldn	[%o4 + CP_COPYOUT], %g2
2798	jmp	%g2
2799	nop
28002:
2801	retl
2802	mov	-1, %o0
2803	SET_SIZE(copyout)
2804
2805#endif	/* lint */
2806
2807
#ifdef	lint

/*
 * Lint-only stand-in for xcopyout(); the real routine is the assembly
 * in the other arm of this conditional.
 */
/*ARGSUSED*/
int
xcopyout(const void *kaddr, void *uaddr, size_t count)
{
	return (0);
}

#else	/* lint */

/*
 * xcopyout
 *
 * Entry: put the address of .xcopyout_err in REAL_LOFAULT and continue
 * at .do_copyout, which performs the actual copy.
 *
 * .xcopyout_err: if the pointer at THREAD_REG + T_COPYOPS is non-NULL,
 * jump to the routine stored at its CP_XCOPYOUT slot; otherwise return
 * the value found in %g1.
 * NOTE(review): %g1 is presumably the error code left by the fault
 * path - confirm against copyio_fault.
 */
	ENTRY(xcopyout)
	set	.xcopyout_err, REAL_LOFAULT	! handler for the shared copy code
	ba	.do_copyout
	nop

.xcopyout_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brnz,a	%o4, 1f				! copyops installed?
	ldn	[%o4 + CP_XCOPYOUT], %g2	! (annulled when %o4 is NULL)
	retl					! no copyops: hand back %g1
	mov	%g1, %o0
1:
	jmp	%g2				! dispatch to the copyops routine
	nop
	SET_SIZE(xcopyout)

#endif	/* lint */
2833#endif	/* lint */
2834
#ifdef	lint

/*
 * Lint-only stand-in for xcopyout_little(); the real routine is the
 * assembly in the other arm of this conditional.
 */
/*ARGSUSED*/
int
xcopyout_little(const void *kaddr, void *uaddr, size_t count)
{
	return (0);
}

#else	/* lint */

/*
 * xcopyout_little(kaddr = %o0, uaddr = %o1, count = %o2)
 *
 * Stores count bytes to uaddr through ASI_AIUSL, taking the source
 * bytes in reverse order: the byte at kaddr + count - 1 is written to
 * uaddr, the one before it to uaddr + 1, and so on.  Returns 0.
 *
 * t_lofault is pointed at .little_err for the duration of the copy.
 * The previous t_lofault value lives in %o5 throughout and is written
 * back on the normal exit path.
 * NOTE(review): .little_err is defined elsewhere; it presumably also
 * restores t_lofault from %o5 - confirm before reusing %o5 here.
 *
 * Loop registers:
 *	%o3	index, runs from -count up to 0
 *	%o0	starts at kaddr + 2 * count - 1 and drops by 2 per byte,
 *		so %o0 + %o3 walks the source backwards one byte at a time
 *	%o1	uaddr + count, so %o1 + %o3 walks the destination forwards
 *	%o4	byte in flight
 */
	ENTRY(xcopyout_little)
	ldn	[THREAD_REG + T_LOFAULT], %o5	! remember current t_lofault
	set	.little_err, %o4
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! install .little_err

	add	%o0, %o2, %o0			! kaddr + count
	subcc	%g0, %o2, %o3			! %o3 = -count, Z set if count == 0
	bz,pn	%ncc, .xcol_done		! nothing to copy
	sub	%o2, 1, %o4			! (delay) count - 1
	add	%o1, %o2, %o1			! uaddr + count
	add	%o0, %o4, %o0			! kaddr + 2 * count - 1
	ldub	[%o0 + %o3], %o4		! fetch the last source byte

.xcol_loop:
	stba	%o4, [%o1 + %o3]ASI_AIUSL
	addcc	%o3, 1, %o3			! carry is set once the index hits 0
	sub	%o0, 2, %o0			! net effect: source moves back by 1
	bcc,a,pt %ncc, .xcol_loop
	ldub	[%o0 + %o3], %o4		! (delay, annulled on loop exit)

.xcol_done:
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! put the old t_lofault back
	retl
	clr	%o0				! return (0)
	SET_SIZE(xcopyout_little)

#endif	/* lint */
2872
2873/*
2874 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2875 */
2876
2877#if defined(lint)
2878
2879/*ARGSUSED*/
2880int
2881copyin(const void *uaddr, void *kaddr, size_t count)
2882{ return (0); }
2883
2884#else	/* lint */
2885
2886	ENTRY(copyin)
2887	sethi	%hi(.copyin_err), REAL_LOFAULT
2888	or	REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
2889
2890.do_copyin:
2891	!
2892	! Check the length and bail if zero.
2893	!
2894	tst	%o2
2895	bnz,pt	%ncc, 1f
2896	  nop
2897	retl
2898	  clr	%o0
28991:
2900	sethi	%hi(copyio_fault), %o4
2901	or	%o4, %lo(copyio_fault), %o4
2902	sethi	%hi(copyio_fault_nowindow), %o3
2903	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
2904	or	%o3, %lo(copyio_fault_nowindow), %o3
2905	membar	#Sync
2906	stn	%o3, [THREAD_REG + T_LOFAULT]
2907
2908	mov	%o0, SAVE_SRC
2909	mov	%o1, SAVE_DST
2910	mov	%o2, SAVE_COUNT
2911
2912	!
2913	! Check to see if we're more than SMALL_LIMIT.
2914	!
2915	subcc	%o2, SMALL_LIMIT, %o3
2916	bgu,a,pt %ncc, .dci_ns
2917	or	%o0, %o1, %o3
2918	!
2919	! What was previously ".small_copyin"
2920	!
2921.dcibcp:
2922	sub	%g0, %o2, %o3		! setup for copy loop
2923	add	%o0, %o2, %o0
2924	add	%o1, %o2, %o1
2925	ba,pt	%ncc, .dcicl
2926	lduba	[%o0 + %o3]ASI_USER, %o4
2927	!
2928	! %o0 and %o1 point at the end and remain pointing at the end
2929	! of their buffers. We pull things out by adding %o3 (which is
2930	! the negation of the length) to the buffer end which gives us
	! the current location in the buffers. By incrementing %o3 we walk
2932	! through both buffers without having to bump each buffer's
2933	! pointer. A very fast 4 instruction loop.
2934	!
2935	.align 16
2936.dcicl:
2937	stb	%o4, [%o1 + %o3]
2938	inccc	%o3
2939	bl,a,pt %ncc, .dcicl
2940	lduba	[%o0 + %o3]ASI_USER, %o4
2941	!
2942	! We're done. Go home.
2943	!
2944	membar	#Sync
2945	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
2946	retl
2947	clr	%o0
2948	!
2949	! Try aligned copies from here.
2950	!
2951.dci_ns:
2952	!
2953	! See if we're single byte aligned. If we are, check the
2954	! limit for single byte copies. If we're smaller, or equal,
2955	! bounce to the byte for byte copy loop. Otherwise do it in
2956	! HW (if enabled).
2957	!
2958	btst	1, %o3
2959	bz,a,pt	%icc, .dcih8
2960	btst	7, %o3
2961	!
2962	! We're single byte aligned.
2963	!
2964	sethi	%hi(hw_copy_limit_1), %o3
2965	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2966	!
2967	! Is HW copy on? If not do everything byte for byte.
2968	!
2969	tst	%o3
2970	bz,pn	%icc, .dcibcp
2971	subcc	%o3, %o2, %o3
2972	!
2973	! Are we bigger than the HW limit? If not
2974	! go to byte for byte.
2975	!
2976	bge,pt	%ncc, .dcibcp
2977	nop
2978	!
2979	! We're big enough and copy is on. Do it with HW.
2980	!
2981	ba,pt	%ncc, .big_copyin
2982	nop
2983.dcih8:
2984	!
2985	! 8 byte aligned?
2986	!
2987	bnz,a	%ncc, .dcih4
2988	btst	3, %o3
2989	!
2990	! We're eight byte aligned.
2991	!
2992	sethi	%hi(hw_copy_limit_8), %o3
2993	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2994	!
2995	! Is HW assist on? If not, do it with the aligned copy.
2996	!
2997	tst	%o3
2998	bz,pn	%icc, .dcis8
2999	subcc	%o3, %o2, %o3
3000	bge	%ncc, .dcis8
3001	nop
3002	ba,pt	%ncc, .big_copyin
3003	nop
3004.dcis8:
3005	!
3006	! Housekeeping for copy loops. Uses same idea as in the byte for
3007	! byte copy loop above.
3008	!
3009	add	%o0, %o2, %o0
3010	add	%o1, %o2, %o1
3011	sub	%g0, %o2, %o3
3012	ba,pt	%ncc, .didebc
3013	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
3014	!
3015	! 4 byte aligned?
3016	!
3017.dcih4:
3018	bnz	%ncc, .dcih2
3019	sethi	%hi(hw_copy_limit_4), %o3
3020	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3021	!
3022	! Is HW assist on? If not, do it with the aligned copy.
3023	!
3024	tst	%o3
3025	bz,pn	%icc, .dcis4
3026	subcc	%o3, %o2, %o3
3027	!
	! We're negative if our size is larger than hw_copy_limit_4.
3029	!
3030	bge	%ncc, .dcis4
3031	nop
3032	ba,pt	%ncc, .big_copyin
3033	nop
3034.dcis4:
3035	!
3036	! Housekeeping for copy loops. Uses same idea as in the byte
3037	! for byte copy loop above.
3038	!
3039	add	%o0, %o2, %o0
3040	add	%o1, %o2, %o1
3041	sub	%g0, %o2, %o3
3042	ba,pt	%ncc, .didfbc
3043	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
3044.dcih2:
3045	!
3046	! We're two byte aligned. Check for "smallness"
3047	! done in delay at .dcih4
3048	!
3049	bleu,pt	%ncc, .dcis2
3050	sethi	%hi(hw_copy_limit_2), %o3
3051	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3052	!
3053	! Is HW assist on? If not, do it with the aligned copy.
3054	!
3055	tst	%o3
3056	bz,pn	%icc, .dcis2
3057	subcc	%o3, %o2, %o3
3058	!
3059	! Are we larger than the HW limit?
3060	!
3061	bge	%ncc, .dcis2
3062	nop
3063	!
3064	! HW assist is on and we're large enough to use it.
3065	!
3066	ba,pt	%ncc, .big_copyin
3067	nop
3068	!
3069	! Housekeeping for copy loops. Uses same idea as in the byte
3070	! for byte copy loop above.
3071	!
3072.dcis2:
3073	add	%o0, %o2, %o0
3074	add	%o1, %o2, %o1
3075	sub	%g0, %o2, %o3
3076	ba,pt	%ncc, .didtbc
3077	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
3078	!
3079.small_copyin:
3080	!
3081	! Why are we doing this AGAIN? There are certain conditions in
	! big copyin that will cause us to forgo the HW assisted copies
3083	! and bounce back to a non-hw assisted copy. This dispatches
3084	! those copies. Note that we branch around this in the main line
3085	! code.
3086	!
3087	! We make no check for limits or HW enablement here. We've
3088	! already been told that we're a poster child so just go off
3089	! and do it.
3090	!
3091	or	%o0, %o1, %o3
3092	btst	1, %o3
3093	bnz	%icc, .dcibcp		! Most likely
3094	btst	7, %o3
3095	bz	%icc, .dcis8
3096	btst	3, %o3
3097	bz	%icc, .dcis4
3098	nop
3099	ba,pt	%ncc, .dcis2
3100	nop
3101	!
3102	! Eight byte aligned copies. A steal from the original .small_copyin
3103	! with modifications. %o2 is number of 8 byte chunks to copy. When
3104	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
3105	! to copy.
3106	!
3107	.align 32
3108.didebc:
3109	ldxa	[%o0 + %o3]ASI_USER, %o4
3110	deccc	%o2
3111	stx	%o4, [%o1 + %o3]
3112	bg,pt	%ncc, .didebc
3113	addcc	%o3, 8, %o3
3114	!
3115	! End of copy loop. Most 8 byte aligned copies end here.
3116	!
3117	bz,pt	%ncc, .dcifh
3118	nop
3119	!
3120	! Something is left. Do it byte for byte.
3121	!
3122	ba,pt	%ncc, .dcicl
3123	lduba	[%o0 + %o3]ASI_USER, %o4
3124	!
3125	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
3126	!
3127	.align 32
3128.didfbc:
3129	lduwa	[%o0 + %o3]ASI_USER, %o4
3130	deccc	%o2
3131	st	%o4, [%o1 + %o3]
3132	bg,pt	%ncc, .didfbc
3133	addcc	%o3, 4, %o3
3134	!
3135	! End of copy loop. Most 4 byte aligned copies end here.
3136	!
3137	bz,pt	%ncc, .dcifh
3138	nop
3139	!
3140	! Something is left. Do it byte for byte.
3141	!
3142	ba,pt	%ncc, .dcicl
3143	lduba	[%o0 + %o3]ASI_USER, %o4
3144	!
3145	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
3146	! copy.
3147	!
3148	.align 32
3149.didtbc:
3150	lduha	[%o0 + %o3]ASI_USER, %o4
3151	deccc	%o2
3152	sth	%o4, [%o1 + %o3]
3153	bg,pt	%ncc, .didtbc
3154	addcc	%o3, 2, %o3
3155	!
3156	! End of copy loop. Most 2 byte aligned copies end here.
3157	!
3158	bz,pt	%ncc, .dcifh
3159	nop
3160	!
3161	! Deal with the last byte
3162	!
3163	lduba	[%o0 + %o3]ASI_USER, %o4
3164	stb	%o4, [%o1 + %o3]
3165.dcifh:
3166	membar	#Sync
3167	stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3168	retl
3169	clr	%o0
3170
3171.big_copyin:
3172	! We're going off to do a block copy.
	! Switch fault handlers and grab a window. We
3174	! don't do a membar #Sync since we've done only
3175	! kernel data to this point.
3176	stn	%o4, [THREAD_REG + T_LOFAULT]
3177
	! Copies that reach here are larger than 256 bytes. The
	! hw_copy_limit_1 is set to 256. Never set this limit to less
	! than 128 bytes.
3181#if !defined(NIAGARA_IMPL)
3182	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3183
3184	rd	%fprs, %o2			! check for unused fp
3185	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save %fprs
3186	btst	FPRS_FEF, %o2
3187	bz,a,pt	%icc, .do_blockcopyin
3188	wr	%g0, FPRS_FEF, %fprs
3189
3190	! save in-use fpregs on stack
3191	BST_FP_TOSTACK(%o2)
3192#else	/* NIAGARA_IMPL */
3193	save	%sp, -SA(MINFRAME), %sp
3194#endif	/* NIAGARA_IMPL */
3195
3196.do_blockcopyin:
3197
3198#if !defined(NIAGARA_IMPL)
3199	rd	%gsr, %o2
3200	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
3201	! set the lower bit saved t_lofault to indicate that we need
3202	! clear %fprs register on the way out
3203	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3204#endif	/* NIAGARA_IMPL */
3205
3206	! Swap src/dst since the code below is memcpy code
3207	! and memcpy/bcopy have different calling sequences
3208	mov	%i1, %i5
3209	mov	%i0, %i1
3210	mov	%i5, %i0
3211
3212	! Block (64 bytes) align the destination.
3213	andcc	%i0, 0x3f, %i3		! is dst block aligned
3214	bz	%ncc, copyin_blalign	! dst already block aligned
3215	sub	%i3, 0x40, %i3
3216	neg	%i3			! bytes till dst 64 bytes aligned
3217	sub	%i2, %i3, %i2		! update i2 with new count
3218
3219	! Based on source and destination alignment do
3220	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
3221
3222	! Is dst & src 8B aligned
3223	or	%i0, %i1, %o2
3224	andcc	%o2, 0x7, %g0
3225	bz	%ncc, .ci_alewdcp
3226	nop
3227
3228	! Is dst & src 4B aligned
3229	andcc	%o2, 0x3, %g0
3230	bz	%ncc, .ci_alwdcp
3231	nop
3232
3233	! Is dst & src 2B aligned
3234	andcc	%o2, 0x1, %g0
3235	bz	%ncc, .ci_alhlfwdcp
3236	nop
3237
3238	! 1B aligned
32391:	lduba	[%i1]ASI_USER, %o2
3240	stb	%o2, [%i0]
3241	inc	%i1
3242	deccc	%i3
3243	bgu,pt	%ncc, 1b
3244	inc	%i0
3245
3246	ba	copyin_blalign
3247	nop
3248
3249	! dst & src 4B aligned
3250.ci_alwdcp:
3251	lda	[%i1]ASI_USER, %o2
3252	st	%o2, [%i0]
3253	add	%i1, 0x4, %i1
3254	subcc	%i3, 0x4, %i3
3255	bgu,pt	%ncc, .ci_alwdcp
3256	add	%i0, 0x4, %i0
3257
3258	ba	copyin_blalign
3259	nop
3260
3261	! dst & src 2B aligned
3262.ci_alhlfwdcp:
3263	lduha	[%i1]ASI_USER, %o2
3264	stuh	%o2, [%i0]
3265	add	%i1, 0x2, %i1
3266	subcc	%i3, 0x2, %i3
3267	bgu,pt	%ncc, .ci_alhlfwdcp
3268	add	%i0, 0x2, %i0
3269
3270	ba	copyin_blalign
3271	nop
3272
3273	! dst & src 8B aligned
3274.ci_alewdcp:
3275	ldxa	[%i1]ASI_USER, %o2
3276	stx	%o2, [%i0]
3277	add	%i1, 0x8, %i1
3278	subcc	%i3, 0x8, %i3
3279	bgu,pt	%ncc, .ci_alewdcp
3280	add	%i0, 0x8, %i0
3281
3282copyin_blalign:
3283	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
3284	sub	%i2, %i3, %i2		! Residue bytes in %i2
3285
3286#if !defined(NIAGARA_IMPL)
3287	mov	ASI_USER, %asi
3288
3289	andn	%i1, 0x3f, %l0		! %l0 has block aligned src address
3290	prefetch [%l0+0x0], #one_read
3291	andcc	%i1, 0x3f, %g0		! is src 64B aligned
3292	bz,pn	%ncc, .ci_blkcpy
3293	nop
3294
3295	! handle misaligned source cases
3296	alignaddr %i1, %g0, %g0		! generate %gsr
3297
3298	srl	%i1, 0x3, %l1		! src add bits 3, 4, 5 are now least
3299					! significant in %l1
3300	andcc	%l1, 0x7, %l2		! mask everything except bits 1, 2, 3
3301	add	%i1, %i3, %i1
3302
3303	! switch statement to get to right 8 byte block within
3304	! 64 byte block
3305	cmp	 %l2, 0x4
3306	bgeu,a	 ci_hlf
3307	cmp	 %l2, 0x6
3308	cmp	 %l2, 0x2
3309	bgeu,a	 ci_sqtr
3310	nop
3311	cmp	 %l2, 0x1
3312	be,a	 ci_off15
3313	nop
3314	ba	 ci_off7
3315	nop
3316ci_sqtr:
3317	be,a	 ci_off23
3318	nop
3319	ba,a	 ci_off31
3320	nop
3321
3322ci_hlf:
3323	bgeu,a	 ci_fqtr
3324	nop
3325	cmp	 %l2, 0x5
3326	be,a	 ci_off47
3327	nop
3328	ba	 ci_off39
3329	nop
3330ci_fqtr:
3331	be,a	 ci_off55
3332	nop
3333
3334	ldda	[%l0+0x38]%asi, %d14
3335	prefetch [%l0+0x40], #one_read
3336	prefetch [%l0+0x80], #one_read
33377:
3338	add	%l0, 0x40, %l0
3339	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3340
3341	ldda	[%l0]ASI_BLK_AIUS, %d16
3342	ALIGN_OFF_56_63
3343	fmovd	%d30, %d14
3344
3345	stda	%d48, [%i0]ASI_BLK_P
3346	subcc	%i3, 0x40, %i3
3347	add	%i0, 0x40, %i0
3348	bgu,pt	%ncc, 7b
3349	prefetch [%l0+0x80], #one_read
3350	ba	.ci_blkdone
3351	membar	#Sync
3352
3353ci_off7:
3354	ldda	[%l0]ASI_BLK_AIUS, %d0
3355	prefetch [%l0+0x40], #one_read
3356	prefetch [%l0+0x80], #one_read
33570:
3358	add	%l0, 0x40, %l0
3359	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3360
3361	ldda	[%l0]ASI_BLK_AIUS, %d16
3362	ALIGN_OFF_1_7
3363	fmovd	%d16, %d0
3364	fmovd	%d18, %d2
3365	fmovd	%d20, %d4
3366	fmovd	%d22, %d6
3367	fmovd	%d24, %d8
3368	fmovd	%d26, %d10
3369	fmovd	%d28, %d12
3370	fmovd	%d30, %d14
3371
3372	stda	%d48, [%i0]ASI_BLK_P
3373	subcc	%i3, 0x40, %i3
3374	add	%i0, 0x40, %i0
3375	bgu,pt	%ncc, 0b
3376	prefetch [%l0+0x80], #one_read
3377	ba	.ci_blkdone
3378	membar	#Sync
3379
3380ci_off15:
3381	ldda	[%l0+0x8]%asi, %d2
3382	ldda	[%l0+0x10]%asi, %d4
3383	ldda	[%l0+0x18]%asi, %d6
3384	ldda	[%l0+0x20]%asi, %d8
3385	ldda	[%l0+0x28]%asi, %d10
3386	ldda	[%l0+0x30]%asi, %d12
3387	ldda	[%l0+0x38]%asi, %d14
3388	prefetch [%l0+0x40], #one_read
3389	prefetch [%l0+0x80], #one_read
33901:
3391	add	%l0, 0x40, %l0
3392	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3393
3394	ldda	[%l0]ASI_BLK_AIUS, %d16
3395	ALIGN_OFF_8_15
3396	fmovd	%d18, %d2
3397	fmovd	%d20, %d4
3398	fmovd	%d22, %d6
3399	fmovd	%d24, %d8
3400	fmovd	%d26, %d10
3401	fmovd	%d28, %d12
3402	fmovd	%d30, %d14
3403
3404	stda	%d48, [%i0]ASI_BLK_P
3405	subcc	%i3, 0x40, %i3
3406	add	%i0, 0x40, %i0
3407	bgu,pt	%ncc, 1b
3408	prefetch [%l0+0x80], #one_read
3409	ba	.ci_blkdone
3410	membar	#Sync
3411
3412ci_off23:
3413	ldda	[%l0+0x10]%asi, %d4
3414	ldda	[%l0+0x18]%asi, %d6
3415	ldda	[%l0+0x20]%asi, %d8
3416	ldda	[%l0+0x28]%asi, %d10
3417	ldda	[%l0+0x30]%asi, %d12
3418	ldda	[%l0+0x38]%asi, %d14
3419	prefetch [%l0+0x40], #one_read
3420	prefetch [%l0+0x80], #one_read
34212:
3422	add	%l0, 0x40, %l0
3423	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3424
3425	ldda	[%l0]ASI_BLK_AIUS, %d16
3426	ALIGN_OFF_16_23
3427	fmovd	%d20, %d4
3428	fmovd	%d22, %d6
3429	fmovd	%d24, %d8
3430	fmovd	%d26, %d10
3431	fmovd	%d28, %d12
3432	fmovd	%d30, %d14
3433
3434	stda	%d48, [%i0]ASI_BLK_P
3435	subcc	%i3, 0x40, %i3
3436	add	%i0, 0x40, %i0
3437	bgu,pt	%ncc, 2b
3438	prefetch [%l0+0x80], #one_read
3439	ba	.ci_blkdone
3440	membar	#Sync
3441
3442ci_off31:
3443	ldda	[%l0+0x18]%asi, %d6
3444	ldda	[%l0+0x20]%asi, %d8
3445	ldda	[%l0+0x28]%asi, %d10
3446	ldda	[%l0+0x30]%asi, %d12
3447	ldda	[%l0+0x38]%asi, %d14
3448	prefetch [%l0+0x40], #one_read
3449	prefetch [%l0+0x80], #one_read
34503:
3451	add	%l0, 0x40, %l0
3452	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3453
3454	ldda	[%l0]ASI_BLK_AIUS, %d16
3455	ALIGN_OFF_24_31
3456	fmovd	%d22, %d6
3457	fmovd	%d24, %d8
3458	fmovd	%d26, %d10
3459	fmovd	%d28, %d12
3460	fmovd	%d30, %d14
3461
3462	stda	%d48, [%i0]ASI_BLK_P
3463	subcc	%i3, 0x40, %i3
3464	add	%i0, 0x40, %i0
3465	bgu,pt	%ncc, 3b
3466	prefetch [%l0+0x80], #one_read
3467	ba	.ci_blkdone
3468	membar	#Sync
3469
3470ci_off39:
3471	ldda	[%l0+0x20]%asi, %d8
3472	ldda	[%l0+0x28]%asi, %d10
3473	ldda	[%l0+0x30]%asi, %d12
3474	ldda	[%l0+0x38]%asi, %d14
3475	prefetch [%l0+0x40], #one_read
3476	prefetch [%l0+0x80], #one_read
34774:
3478	add	%l0, 0x40, %l0
3479	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3480
3481	ldda	[%l0]ASI_BLK_AIUS, %d16
3482	ALIGN_OFF_32_39
3483	fmovd	%d24, %d8
3484	fmovd	%d26, %d10
3485	fmovd	%d28, %d12
3486	fmovd	%d30, %d14
3487
3488	stda	%d48, [%i0]ASI_BLK_P
3489	subcc	%i3, 0x40, %i3
3490	add	%i0, 0x40, %i0
3491	bgu,pt	%ncc, 4b
3492	prefetch [%l0+0x80], #one_read
3493	ba	.ci_blkdone
3494	membar	#Sync
3495
3496ci_off47:
3497	ldda	[%l0+0x28]%asi, %d10
3498	ldda	[%l0+0x30]%asi, %d12
3499	ldda	[%l0+0x38]%asi, %d14
3500	prefetch [%l0+0x40], #one_read
3501	prefetch [%l0+0x80], #one_read
35025:
3503	add	%l0, 0x40, %l0
3504	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3505
3506	ldda	[%l0]ASI_BLK_AIUS, %d16
3507	ALIGN_OFF_40_47
3508	fmovd	%d26, %d10
3509	fmovd	%d28, %d12
3510	fmovd	%d30, %d14
3511
3512	stda	%d48, [%i0]ASI_BLK_P
3513	subcc	%i3, 0x40, %i3
3514	add	%i0, 0x40, %i0
3515	bgu,pt	%ncc, 5b
3516	prefetch [%l0+0x80], #one_read
3517	ba	.ci_blkdone
3518	membar	#Sync
3519
3520ci_off55:
3521	ldda	[%l0+0x30]%asi, %d12
3522	ldda	[%l0+0x38]%asi, %d14
3523	prefetch [%l0+0x40], #one_read
3524	prefetch [%l0+0x80], #one_read
35256:
3526	add	%l0, 0x40, %l0
3527	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3528
3529	ldda	[%l0]ASI_BLK_AIUS, %d16
3530	ALIGN_OFF_48_55
3531	fmovd	%d28, %d12
3532	fmovd	%d30, %d14
3533
3534	stda	%d48, [%i0]ASI_BLK_P
3535	subcc	%i3, 0x40, %i3
3536	add	%i0, 0x40, %i0
3537	bgu,pt	%ncc, 6b
3538	prefetch [%l0+0x80], #one_read
3539	ba	.ci_blkdone
3540	membar	#Sync
3541
3542.ci_blkcpy:
3543	prefetch [%i1+0x40], #one_read
3544	prefetch [%i1+0x80], #one_read
35458:
3546	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3547	ldda	[%i1]ASI_BLK_AIUS, %d0
3548	stda	%d0, [%i0]ASI_BLK_P
3549
3550	add	%i1, 0x40, %i1
3551	subcc	%i3, 0x40, %i3
3552	add	%i0, 0x40, %i0
3553	bgu,pt	%ncc, 8b
3554	prefetch [%i1+0x80], #one_read
3555	membar	#Sync
3556
3557.ci_blkdone:
3558#else	/* NIAGARA_IMPL */
3559	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
3560
3561	andcc	%i1, 0xf, %o2		! is src quadword aligned
3562	bz,pn	%xcc, .ci_blkcpy	! src offset in %o2 (last 4-bits)
3563	nop
3564	cmp	%o2, 0x8
3565	bg	.ci_upper_double
3566	nop
3567	bl	.ci_lower_double
3568	nop
3569
3570	! Falls through when source offset is equal to 8 i.e.
3571	! source is double word aligned.
3572	! In this case no shift/merge of data is required
3573
3574	sub	%i1, %o2, %i1		! align the src at 16 bytes.
3575	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
3576	prefetch [%l0+0x0], #one_read
3577	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3578.ci_loop0:
3579	add	%i1, 0x10, %i1
3580	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
3581
3582	prefetch [%l0+0x40], #one_read
3583
3584	stxa	%l3, [%i0+0x0]%asi
3585	stxa	%l4, [%i0+0x8]%asi
3586
3587	add	%i1, 0x10, %i1
3588	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3589
3590	stxa	%l5, [%i0+0x10]%asi
3591	stxa	%l2, [%i0+0x18]%asi
3592
3593	add	%i1, 0x10, %i1
3594	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
3595
3596	stxa	%l3, [%i0+0x20]%asi
3597	stxa	%l4, [%i0+0x28]%asi
3598
3599	add	%i1, 0x10, %i1
3600	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3601
3602	stxa	%l5, [%i0+0x30]%asi
3603	stxa	%l2, [%i0+0x38]%asi
3604
3605	add	%l0, 0x40, %l0
3606	subcc	%i3, 0x40, %i3
3607	bgu,pt	%xcc, .ci_loop0
3608	add	%i0, 0x40, %i0
3609	ba	.ci_blkdone
3610	add	%i1, %o2, %i1		! increment the source by src offset
3611					! the src offset was stored in %o2
3612
3613.ci_lower_double:
3614
3615	sub	%i1, %o2, %i1		! align the src at 16 bytes.
3616	sll	%o2, 3, %o0		! %o0 left shift
3617	mov	0x40, %o1
3618	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
3619	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
3620	prefetch [%l0+0x0], #one_read
3621	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l2
3622							! and %l3 has complete
3623							! data
3624.ci_loop1:
3625	add	%i1, 0x10, %i1
3626	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has partial data
3627							! for this read.
3628	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
3629							! into %l2 and %l3
3630
3631	prefetch [%l0+0x40], #one_read
3632
3633	stxa	%l2, [%i0+0x0]%asi
3634	stxa	%l3, [%i0+0x8]%asi
3635
3636	add	%i1, 0x10, %i1
3637	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3638	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
3639							! %l4 from previous read
3640							! into %l4 and %l5
3641	stxa	%l4, [%i0+0x10]%asi
3642	stxa	%l5, [%i0+0x18]%asi
3643
3644	! Repeat the same for next 32 bytes.
3645
3646	add	%i1, 0x10, %i1
3647	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
3648	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
3649
3650	stxa	%l2, [%i0+0x20]%asi
3651	stxa	%l3, [%i0+0x28]%asi
3652
3653	add	%i1, 0x10, %i1
3654	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3655	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
3656
3657	stxa	%l4, [%i0+0x30]%asi
3658	stxa	%l5, [%i0+0x38]%asi
3659
3660	add	%l0, 0x40, %l0
3661	subcc	%i3, 0x40, %i3
3662	bgu,pt	%xcc, .ci_loop1
3663	add	%i0, 0x40, %i0
3664	ba	.ci_blkdone
3665	add	%i1, %o2, %i1		! increment the source by src offset
3666					! the src offset was stored in %o2
3667
3668.ci_upper_double:
3669
3670	sub	%i1, %o2, %i1		! align the src at 16 bytes.
3671	sub	%o2, 0x8, %o0
3672	sll	%o0, 3, %o0		! %o0 left shift
3673	mov	0x40, %o1
3674	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
3675	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
3676	prefetch [%l0+0x0], #one_read
3677	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l3
3678							! for this read and
3679							! no data in %l2
3680.ci_loop2:
3681	add	%i1, 0x10, %i1
3682	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has complete data
3683							! and %l5 has partial
3684	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
3685							! into %l3 and %l4
3686	prefetch [%l0+0x40], #one_read
3687
3688	stxa	%l3, [%i0+0x0]%asi
3689	stxa	%l4, [%i0+0x8]%asi
3690
3691	add	%i1, 0x10, %i1
3692	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3693	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
3694							! %l5 from previous read
3695							! into %l5 and %l2
3696
3697	stxa	%l5, [%i0+0x10]%asi
3698	stxa	%l2, [%i0+0x18]%asi
3699
3700	! Repeat the same for next 32 bytes.
3701
3702	add	%i1, 0x10, %i1
3703	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
3704	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
3705
3706	stxa	%l3, [%i0+0x20]%asi
3707	stxa	%l4, [%i0+0x28]%asi
3708
3709	add	%i1, 0x10, %i1
3710	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3711	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
3712
3713	stxa	%l5, [%i0+0x30]%asi
3714	stxa	%l2, [%i0+0x38]%asi
3715
3716	add	%l0, 0x40, %l0
3717	subcc	%i3, 0x40, %i3
3718	bgu,pt	%xcc, .ci_loop2
3719	add	%i0, 0x40, %i0
3720	ba	.ci_blkdone
3721	add	%i1, %o2, %i1		! increment the source by src offset
3722					! the src offset was stored in %o2
3723
3724
3725	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
3726.ci_blkcpy:
3727
3728	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
3729	prefetch [%o0+0x0], #one_read
37301:
3731	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
3732	add	%i1, 0x10, %i1
3733	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3734	add	%i1, 0x10, %i1
3735
3736	prefetch [%o0+0x40], #one_read
3737
3738	stxa	%l0, [%i0+0x0]%asi
3739
3740	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
3741	add	%i1, 0x10, %i1
3742	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
3743	add	%i1, 0x10, %i1
3744
3745	stxa	%l1, [%i0+0x8]%asi
3746	stxa	%l2, [%i0+0x10]%asi
3747	stxa	%l3, [%i0+0x18]%asi
3748	stxa	%l4, [%i0+0x20]%asi
3749	stxa	%l5, [%i0+0x28]%asi
3750	stxa	%l6, [%i0+0x30]%asi
3751	stxa	%l7, [%i0+0x38]%asi
3752
3753	add	%o0, 0x40, %o0
3754	subcc	%i3, 0x40, %i3
3755	bgu,pt	%xcc, 1b
3756	add	%i0, 0x40, %i0
3757
3758.ci_blkdone:
3759	membar	#Sync
3760#endif	/* NIAGARA_IMPL */
3761
3762	brz,pt	%i2, .copyin_exit
3763	nop
3764
3765	! Handle trailing bytes
3766	cmp	%i2, 0x8
3767	blu,pt	%ncc, .ci_residue
3768	nop
3769
3770	! Can we do some 8B ops
3771	or	%i1, %i0, %o2
3772	andcc	%o2, 0x7, %g0
3773	bnz	%ncc, .ci_last4
3774	nop
3775
3776	! Do 8byte ops as long as possible
3777.ci_last8:
3778	ldxa	[%i1]ASI_USER, %o2
3779	stx	%o2, [%i0]
3780	add	%i1, 0x8, %i1
3781	sub	%i2, 0x8, %i2
3782	cmp	%i2, 0x8
3783	bgu,pt	%ncc, .ci_last8
3784	add	%i0, 0x8, %i0
3785
3786	brz,pt	%i2, .copyin_exit
3787	nop
3788
3789	ba	.ci_residue
3790	nop
3791
3792.ci_last4:
3793	! Can we do 4B ops
3794	andcc	%o2, 0x3, %g0
3795	bnz	%ncc, .ci_last2
3796	nop
37971:
3798	lda	[%i1]ASI_USER, %o2
3799	st	%o2, [%i0]
3800	add	%i1, 0x4, %i1
3801	sub	%i2, 0x4, %i2
3802	cmp	%i2, 0x4
3803	bgu,pt	%ncc, 1b
3804	add	%i0, 0x4, %i0
3805
3806	brz,pt	%i2, .copyin_exit
3807	nop
3808
3809	ba	.ci_residue
3810	nop
3811
3812.ci_last2:
3813	! Can we do 2B ops
3814	andcc	%o2, 0x1, %g0
3815	bnz	%ncc, .ci_residue
3816	nop
3817
38181:
3819	lduha	[%i1]ASI_USER, %o2
3820	stuh	%o2, [%i0]
3821	add	%i1, 0x2, %i1
3822	sub	%i2, 0x2, %i2
3823	cmp	%i2, 0x2
3824	bgu,pt	%ncc, 1b
3825	add	%i0, 0x2, %i0
3826
3827	brz,pt	%i2, .copyin_exit
3828	nop
3829
3830	! Copy the residue as byte copy
3831.ci_residue:
3832	lduba	[%i1]ASI_USER, %i4
3833	stb	%i4, [%i0]
3834	inc	%i1
3835	deccc	%i2
3836	bgu,pt	%xcc, .ci_residue
3837	inc	%i0
3838
3839.copyin_exit:
3840#if !defined(NIAGARA_IMPL)
3841	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
3842	wr	%o2, 0, %gsr		! restore gsr
3843
3844	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3845	btst	FPRS_FEF, %o3
3846	bz	%icc, 4f
3847	  nop
3848
3849	! restore fpregs from stack
3850	BLD_FP_FROMSTACK(%o2)
3851
3852	ba,pt	%ncc, 2f
3853	  wr	%o3, 0, %fprs		! restore fprs
3854
38554:
3856	FZERO				! zero all of the fpregs
3857	wr	%o3, 0, %fprs		! restore fprs
3858
38592:
3860	membar	#Sync			! sync error barrier
3861	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3862#else	/* NIAGARA_IMPL */
3863	membar	#Sync
3864#endif	/* NIAGARA_IMPL */
3865	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3866	ret
3867	restore	%g0, 0, %o0
3868.copyin_err:
3869	ldn	[THREAD_REG + T_COPYOPS], %o4
3870	brz	%o4, 2f
3871	nop
3872	ldn	[%o4 + CP_COPYIN], %g2
3873	jmp	%g2
3874	nop
38752:
3876	retl
3877	mov	-1, %o0
3878	SET_SIZE(copyin)
3879
3880#endif	/* lint */
3881
3882#ifdef	lint
3883
3884/*ARGSUSED*/
3885int
3886xcopyin(const void *uaddr, void *kaddr, size_t count)
3887{ return (0); }
3888
3889#else	/* lint */
3890
	!
	! xcopyin(uaddr, kaddr, count)
	!
	! Entry stub: loads the address of .xcopyin_err into REAL_LOFAULT
	! and branches to .do_copyin, which is defined outside this part
	! of the file (presumably the common copyin body, which installs
	! REAL_LOFAULT as the fault handler -- confirm there).
	!
	! .xcopyin_err: if the thread has a t_copyops vector, jump to its
	! CP_XCOPYIN entry; otherwise return the value found in %g1
	! (presumably the errno supplied by the fault path -- confirm).
	!
3891	ENTRY(xcopyin)
3892	sethi	%hi(.xcopyin_err), REAL_LOFAULT
3893	b	.do_copyin
3894	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT	! delay slot: finish handler address
3895.xcopyin_err:
3896	ldn	[THREAD_REG + T_COPYOPS], %o4	! %o4 = curthread->t_copyops
3897	brz	%o4, 2f				! no copyops vector: return error
3898	nop
3899	ldn	[%o4 + CP_XCOPYIN], %g2		! %g2 = copyops xcopyin entry
3900	jmp	%g2				! jump (no link) to that routine
3901	nop
39022:
3903	retl
3904	mov	%g1, %o0			! delay slot: return value = %g1
3905	SET_SIZE(xcopyin)
3906
3907#endif	/* lint */
3908
3909#ifdef	lint
3910
3911/*ARGSUSED*/
3912int
3913xcopyin_little(const void *uaddr, void *kaddr, size_t count)
3914{ return (0); }
3915
3916#else	/* lint */
3917
	!
	! xcopyin_little(uaddr, kaddr, count)
	!	%o0 = uaddr, %o1 = kaddr, %o2 = count
	!
	! Byte-at-a-time copy from the user buffer (read through
	! ASI_AIUSL) into the kernel buffer in reversed byte order:
	! the last source byte is stored first, at kaddr[0].
	!
	! Index arithmetic used by the loop:
	!	%o3 runs from -count up to 0,
	!	%o1 = kaddr + count, so [%o1+%o3] walks kaddr upward,
	!	%o0 = uaddr + 2*count - 1 and drops by 2 per iteration, so
	!	      [%o0+%o3] walks uaddr downward from its last byte.
	!
	! Installs .little_err as t_lofault for the duration of the copy
	! and restores the previous handler (kept in %o5) on both exits.
	! Returns 0 on success, or the value in %g1 on a fault.
	!
3918	ENTRY(xcopyin_little)
3919	sethi	%hi(.little_err), %o4
3920	ldn	[THREAD_REG + T_LOFAULT], %o5	! %o5 = previous t_lofault
3921	or	%o4, %lo(.little_err), %o4
3922	membar	#Sync				! sync error barrier
3923	stn	%o4, [THREAD_REG + T_LOFAULT]	! install .little_err
3924
3925	subcc	%g0, %o2, %o3		! %o3 = -count; sets Z if count == 0
3926	add	%o0, %o2, %o0		! %o0 = uaddr + count
3927	bz,pn	%ncc, 2f		! check for zero bytes
3928	sub	%o2, 1, %o4		! delay slot: %o4 = count - 1
3929	add	%o0, %o4, %o0		! start w/last byte
3930	add	%o1, %o2, %o1		! %o1 = kaddr + count
3931	lduba	[%o0+%o3]ASI_AIUSL, %o4	! fetch last user byte
3932
39331:	stb	%o4, [%o1+%o3]		! store to next kernel byte
3934	inccc	%o3			! carry is set when %o3 wraps to 0
3935	sub	%o0, 2, %o0		! get next byte
3936	bcc,a,pt %ncc, 1b		! loop while no carry (bytes remain)
3937	  lduba	[%o0+%o3]ASI_AIUSL, %o4	! annulled slot: load only if looping
3938
39392:	membar	#Sync				! sync error barrier
3940	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3941	retl
3942	mov	%g0, %o0		! return (0)
3943
3944.little_err:
3945	membar	#Sync				! sync error barrier
3946	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3947	retl
3948	mov	%g1, %o0		! return the value left in %g1
3949	SET_SIZE(xcopyin_little)
3950
3951#endif	/* lint */
3952
3953
3954/*
3955 * Copy a block of storage - must not overlap (from + len <= to).
3956 * No fault handler installed (to be called under on_fault())
3957 */
3958#if defined(lint)
3959
3960/* ARGSUSED */
3961void
3962copyin_noerr(const void *ufrom, void *kto, size_t count)
3963{}
3964
3965#else	/* lint */
3966
	!
	! copyin_noerr(ufrom, kto, count)
	!
	! Entry stub: loads the address of .copyio_noerr into REAL_LOFAULT
	! and branches to .do_copyin (defined outside this part of the
	! file).
	!
	! .copyio_noerr is shared with copyout_noerr. It jumps straight to
	! the address held in SAVED_LOFAULT (presumably the handler the
	! caller had installed, e.g. via on_fault() -- confirm against
	! .do_copyin/.do_copyout).
	!
3967	ENTRY(copyin_noerr)
3968	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3969	b	.do_copyin
3970	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT	! delay slot: finish handler address
3971.copyio_noerr:
3972	jmp	SAVED_LOFAULT		! hand the fault to the saved handler
3973	  nop
3974	SET_SIZE(copyin_noerr)
3975
3976#endif /* lint */
3977
3978/*
3979 * Copy a block of storage - must not overlap (from + len <= to).
3980 * No fault handler installed (to be called under on_fault())
3981 */
3982
3983#if defined(lint)
3984
3985/* ARGSUSED */
3986void
3987copyout_noerr(const void *kfrom, void *uto, size_t count)
3988{}
3989
3990#else	/* lint */
3991
	!
	! copyout_noerr(kfrom, uto, count)
	!
	! Entry stub: loads the address of .copyio_noerr (the label that
	! lives inside copyin_noerr) into REAL_LOFAULT and branches to
	! .do_copyout, which is defined outside this part of the file.
	!
3992	ENTRY(copyout_noerr)
3993	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3994	b	.do_copyout
3995	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT	! delay slot: finish handler address
3996	SET_SIZE(copyout_noerr)
3997
3998#endif /* lint */
3999
4000#if defined(lint)
4001
/*
 * Lint-visible declarations of the tunables defined in assembly below.
 * The initial values here mirror the .word values in the !lint branch.
 */
4002int use_hw_bcopy = 1;
4003int use_hw_bzero = 1;
4004uint_t hw_copy_limit_1 = 0x100;
4005uint_t hw_copy_limit_2 = 0x200;
4006uint_t hw_copy_limit_4 = 0x400;
4007uint_t hw_copy_limit_8 = 0x400;
4008
4009#else /* !lint */
4010
	!
	! Tunables, each a 32-bit word.  In this part of the file only
	! use_hw_bzero is read: bzero skips its block-init store path
	! when it is zero.  The users of use_hw_bcopy and hw_copy_limit_N
	! are not visible here (presumably size thresholds for the copy
	! routines -- confirm against their readers).
	!
4011	.align	4
4012	DGDEF(use_hw_bcopy)
4013	.word	1
4014	DGDEF(use_hw_bzero)
4015	.word	1
4016	DGDEF(hw_copy_limit_1)
4017	.word	0x100
4018	DGDEF(hw_copy_limit_2)
4019	.word	0x200
4020	DGDEF(hw_copy_limit_4)
4021	.word	0x400
4022	DGDEF(hw_copy_limit_8)
4023	.word	0x400
4024
	! Switch back to the text section, 64-byte aligned, for the
	! routines that follow.
4025	.align	64
4026	.section ".text"
4027#endif /* !lint */
4028
4029/*
4030 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
4031 * at least 256 bytes in length using Niagara's block stores/quad store.
4032 * If the criteria for using this routine are not met then it calls bzero
4033 * and returns 1.  Otherwise 0 is returned indicating success.
4034 * Caller is responsible for ensuring use_hw_bzero is true and that
4035 * kpreempt_disable() has been called.
4036 */
4037#ifdef lint
4038/*ARGSUSED*/
4039int
4040hwblkclr(void *addr, size_t len)
4041{
4042	return(0);
4043}
4044#else /* lint */
4045	! %i0 - start address
4046	! %i1 - length of region (multiple of 64)
	!
	! Returns 0 when the region was cleared here with stores through
	! ASI_BLK_INIT_ST_QUAD_LDD_P, or 1 when the request did not
	! qualify and was handed to bzero instead.
	!
	! NOTE(review): %asi is written here and not restored before
	! returning; presumably callers treat %asi as volatile -- confirm.
4047
4048	ENTRY(hwblkclr)
4049	save	%sp, -SA(MINFRAME), %sp
4050
4051	! Must be block-aligned
4052	andcc	%i0, 0x3f, %g0
4053	bnz,pn	%ncc, 1f
4054	  nop
4055
4056	! ... and must be 256 bytes or more
4057	cmp	%i1, 0x100
4058	blu,pn	%ncc, 1f
4059	  nop
4060
4061	! ... and length must be a multiple of 64
4062	andcc	%i1, 0x3f, %g0
4063	bz,pn	%ncc, .pz_doblock
4064	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi	! delay slot (not annulled): runs on both paths
4065
40661:	! punt, call bzero but notify the caller that bzero was used
4067	mov	%i0, %o0
4068	call	bzero
4069	  mov	%i1, %o1
4070	ret
4071	restore	%g0, 1, %o0	! return (1) - did not use block operations
4072
4073	! Already verified that there are at least 256 bytes to set
	!
	! Clear 256 bytes (four 64-byte blocks) per iteration.  The first
	! doubleword of each of the four blocks is stored first, then the
	! remaining seven doublewords of each block.
4074.pz_doblock:
4075	stxa	%g0, [%i0+0x0]%asi
4076	stxa	%g0, [%i0+0x40]%asi
4077	stxa	%g0, [%i0+0x80]%asi
4078	stxa	%g0, [%i0+0xc0]%asi
4079
4080	stxa	%g0, [%i0+0x8]%asi
4081	stxa	%g0, [%i0+0x10]%asi
4082	stxa	%g0, [%i0+0x18]%asi
4083	stxa	%g0, [%i0+0x20]%asi
4084	stxa	%g0, [%i0+0x28]%asi
4085	stxa	%g0, [%i0+0x30]%asi
4086	stxa	%g0, [%i0+0x38]%asi
4087
4088	stxa	%g0, [%i0+0x48]%asi
4089	stxa	%g0, [%i0+0x50]%asi
4090	stxa	%g0, [%i0+0x58]%asi
4091	stxa	%g0, [%i0+0x60]%asi
4092	stxa	%g0, [%i0+0x68]%asi
4093	stxa	%g0, [%i0+0x70]%asi
4094	stxa	%g0, [%i0+0x78]%asi
4095
4096	stxa	%g0, [%i0+0x88]%asi
4097	stxa	%g0, [%i0+0x90]%asi
4098	stxa	%g0, [%i0+0x98]%asi
4099	stxa	%g0, [%i0+0xa0]%asi
4100	stxa	%g0, [%i0+0xa8]%asi
4101	stxa	%g0, [%i0+0xb0]%asi
4102	stxa	%g0, [%i0+0xb8]%asi
4103
4104	stxa	%g0, [%i0+0xc8]%asi
4105	stxa	%g0, [%i0+0xd0]%asi
4106	stxa	%g0, [%i0+0xd8]%asi
4107	stxa	%g0, [%i0+0xe0]%asi
4108	stxa	%g0, [%i0+0xe8]%asi
4109	stxa	%g0, [%i0+0xf0]%asi
4110	stxa	%g0, [%i0+0xf8]%asi
4111
4112	sub	%i1, 0x100, %i1		! 256 bytes done
4113	cmp	%i1, 0x100
4114	bgu,pt	%ncc, .pz_doblock	! loop while more than 256 bytes remain
4115	add	%i0, 0x100, %i0		! delay slot: advance the address
4116
	! At most 256 bytes (a multiple of 64, possibly 0) remain here.
41172:
4118	! Check if at least 64 bytes remain to set
4119	cmp	%i1,0x40
4120	blu	%ncc, .pz_finish
4121	nop
4122
	! Clear one 64-byte block per iteration
41233:
4124	stxa	%g0, [%i0+0x0]%asi
4125	stxa	%g0, [%i0+0x8]%asi
4126	stxa	%g0, [%i0+0x10]%asi
4127	stxa	%g0, [%i0+0x18]%asi
4128	stxa	%g0, [%i0+0x20]%asi
4129	stxa	%g0, [%i0+0x28]%asi
4130	stxa	%g0, [%i0+0x30]%asi
4131	stxa	%g0, [%i0+0x38]%asi
4132
4133	subcc	%i1, 0x40, %i1
4134	bgu,pt	%ncc, 3b		! loop while bytes remain
4135	add	%i0, 0x40, %i0		! delay slot: advance the address
4136
4137.pz_finish:
4138	membar	#Sync
4139	ret
4140	restore	%g0, 0, %o0		! return (0) - block operations were used
4141	SET_SIZE(hwblkclr)
4142#endif	/* lint */
4143
4144#ifdef	lint
4145/* Copy 32 bytes of data from src to dst using physical addresses */
4146/*ARGSUSED*/
4147void
4148hw_pa_bcopy32(uint64_t src, uint64_t dst)
4149{}
4150#else	/*!lint */
4151
4152	/*
4153	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
4154	 * using physical addresses.
	 *
	 * PSTATE_IE is cleared for the duration of the copy and the
	 * original %pstate (kept in %g1) is written back in the delay
	 * slot of the return.  All four doublewords are loaded through
	 * ASI_MEM before any of them is stored.  %o0-%o5, %g1 and %g2
	 * are overwritten.
4155	 */
4156	ENTRY_NP(hw_pa_bcopy32)
4157	rdpr    %pstate, %g1		! %g1 = original pstate
4158	andn    %g1, PSTATE_IE, %g2	! %g2 = pstate with IE cleared
4159	wrpr    %g0, %g2, %pstate
4160
4161	ldxa    [%o0]ASI_MEM, %o2	! src + 0
4162	add     %o0, 8, %o0
4163	ldxa    [%o0]ASI_MEM, %o3	! src + 8
4164	add     %o0, 8, %o0
4165	ldxa    [%o0]ASI_MEM, %o4	! src + 16
4166	add     %o0, 8, %o0
4167	ldxa    [%o0]ASI_MEM, %o5	! src + 24
4168	stxa    %o2, [%o1]ASI_MEM	! dst + 0
4169	add     %o1, 8, %o1
4170	stxa    %o3, [%o1]ASI_MEM	! dst + 8
4171	add     %o1, 8, %o1
4172	stxa    %o4, [%o1]ASI_MEM	! dst + 16
4173	add     %o1, 8, %o1
4174	stxa    %o5, [%o1]ASI_MEM	! dst + 24
4175
4176	membar	#Sync
4177	retl
4178	  wrpr    %g0, %g1, %pstate	! delay slot: restore original pstate
4179	SET_SIZE(hw_pa_bcopy32)
4180#endif /* lint */
4181
4182/*
4183 * Zero a block of storage.
4184 *
4185 * uzero is used by the kernel to zero a block in user address space.
4186 */
4187
4188/*
4189 * Control flow of the bzero/kzero/uzero routine.
4190 *
4191 *	For stores of fewer than 7 bytes, the bytes are zeroed one at a time.
4192 *
4193 *	For stores of fewer than 15 bytes, align the address on a 4-byte
4194 *	boundary, then store as many 4-byte chunks, followed by trailing bytes.
4195 *
4196 *	For sizes of 15 bytes or more, align the address on 8 byte boundary.
4197 *	if (remaining count >= 128 && use_hw_bzero) {
4198 *		store as many 8-bytes chunks to block align the address
4199 *		store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
4200 *		store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
4201 *	}
4202 *	Store as many 8-byte chunks, followed by trailing bytes.
4203 */
4204
4205#if defined(lint)
4206
4207/* ARGSUSED */
4208int
4209kzero(void *addr, size_t count)
4210{ return(0); }
4211
4212/* ARGSUSED */
4213void
4214uzero(void *addr, size_t count)
4215{}
4216
4217#else	/* lint */
4218
	!
	! uzero and kzero are entry stubs for the common zeroing body at
	! .do_zero (inside bzero).  Each one selects the address space via
	! %asi (ASI_USER for uzero, ASI_P for kzero) and leaves the
	! previous t_lofault value in %o5 before branching to .do_zero.
	!
4219	ENTRY(uzero)
4220	!
4221	! Set a new lo_fault handler only if we came in with one
4222	! already specified.
4223	!
4224	wr	%g0, ASI_USER, %asi
4225	ldn	[THREAD_REG + T_LOFAULT], %o5	! %o5 = previous t_lofault
4226	tst	%o5
4227	bz,pt	%ncc, .do_zero			! none set: leave t_lofault alone
4228	sethi	%hi(.zeroerr), %o2		! delay slot: start handler address
4229	or	%o2, %lo(.zeroerr), %o2
4230	membar	#Sync
4231	ba,pt	%ncc, .do_zero
4232	stn	%o2, [THREAD_REG + T_LOFAULT]	! delay slot: install .zeroerr
4233
4234	ENTRY(kzero)
4235	!
4236	! Always set a lo_fault handler
4237	!
4238	wr	%g0, ASI_P, %asi
4239	ldn	[THREAD_REG + T_LOFAULT], %o5	! %o5 = previous t_lofault
4240	sethi	%hi(.zeroerr), %o2
4241	or	%o5, LOFAULT_SET, %o5		! flag that a handler was installed
4242	or	%o2, %lo(.zeroerr), %o2
4243	membar	#Sync
4244	ba,pt	%ncc, .do_zero
4245	stn	%o2, [THREAD_REG + T_LOFAULT]	! delay slot: install .zeroerr
4246
4247/*
4248 * We got here because of a fault during kzero or if
4249 * uzero or bzero was called with t_lofault non-zero.
4250 * Otherwise we've already run screaming from the room.
4251 * Errno value is in %g1. Note that we're here iff
4252 * we did set t_lofault.
4253 */
4254.zeroerr:
4255	!
4256	! Undo asi register setting. Just set it to be the
4257	! kernel default without checking.
4258	!
4259	wr	%g0, ASI_P, %asi
4260
4261	!
4262	! We did set t_lofault. It may well have been zero coming in.
4263	!
	! The bne below tests the condition codes from tst (%o5 as saved,
	! including LOFAULT_SET).  The andncc in its delay slot strips
	! LOFAULT_SET and sets fresh condition codes, which the be at
	! label 3 then tests.
42641:
4265	tst	%o5
4266	membar #Sync
4267	bne,pn	%ncc, 3f
4268	andncc	%o5, LOFAULT_SET, %o5
42692:
4270	!
4271	! Old handler was zero. Just return the error.
4272	!
4273	retl				! return
4274	mov	%g1, %o0		! error code from %g1
42753:
4276	!
4277	! We're here because %o5 was non-zero. It was non-zero
4278	! because either LOFAULT_SET was present, a previous fault
4279	! handler was present or both. In all cases we need to reset
4280	! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
4281	! before we either simply return the error or we invoke the
4282	! previously specified handler.
4283	!
4284	be	%ncc, 2b		! no previous handler: return error
4285	stn	%o5, [THREAD_REG + T_LOFAULT]	! delay slot: runs on both paths
4286	jmp	%o5			! goto real handler
4287	  nop
4288	SET_SIZE(kzero)
4289	SET_SIZE(uzero)
4290
4291#endif	/* lint */
4292
4293/*
4294 * Zero a block of storage.
4295 */
4296
4297#if defined(lint)
4298
4299/* ARGSUSED */
4300void
4301bzero(void *addr, size_t count)
4302{}
4303
4304#else	/* lint */
4305
	!
	! bzero(addr, count)
	!
	! Register usage from .do_zero onward (shared with uzero/kzero,
	! which branch to .do_zero):
	!	%o0	current address
	!	%o1	bytes remaining
	!	%o2	scratch
	!	%o3	scratch counter for the current phase
	!	%o4	bytes to clear in whole 64-byte blocks
	!	%o5	t_lofault value on entry (kzero also ORs in
	!		LOFAULT_SET); non-zero means .zeroerr was installed
	!	%asi	address space for the stores, set by the entry
	!		point (ASI_P here and in kzero, ASI_USER in uzero)
	!
	! Returns 0 in %o0.
	!
4306	ENTRY(bzero)
4307	wr	%g0, ASI_P, %asi
4308
4309	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
4310	tst	%o5
4311	bz,pt	%ncc, .do_zero			! none set: do not install one
4312	sethi	%hi(.zeroerr), %o2		! delay slot: start handler address
4313	or	%o2, %lo(.zeroerr), %o2
4314	membar	#Sync				! sync error barrier
4315	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
4316
4317.do_zero:
4318	cmp	%o1, 7
4319	blu,pn	%ncc, .byteclr		! fewer than 7 bytes: byte stores only
4320	nop
4321
4322	cmp	%o1, 15
4323	blu,pn	%ncc, .wdalign		! fewer than 15 bytes: word stores
4324	nop
4325
4326	andcc	%o0, 7, %o3		! is add aligned on a 8 byte bound
4327	bz,pt	%ncc, .blkalign		! already double aligned
4328	sub	%o3, 8, %o3		! -(bytes till double aligned)
4329	add	%o1, %o3, %o1		! update o1 with new count
4330
	! Clear single bytes until %o3 counts up to zero
43311:
4332	stba	%g0, [%o0]%asi
4333	inccc	%o3
4334	bl,pt	%ncc, 1b
4335	inc	%o0
4336
4337	! Now address is double aligned
4338.blkalign:
4339	cmp	%o1, 0x80		! check if there are 128 bytes to set
4340	blu,pn	%ncc, .bzero_small
4341	mov	%o1, %o3		! delay slot: %o3 = count for .bzero_small
4342
4343	sethi	%hi(use_hw_bzero), %o2
4344	ld	[%o2 + %lo(use_hw_bzero)], %o2
4345	tst	%o2
4346	bz	%ncc, .bzero_small	! block-init stores disabled
4347	mov	%o1, %o3		! delay slot: %o3 = count for .bzero_small
4348
	! Switch %asi to the block-init flavor matching the caller: the
	! _P variant if %asi was ASI_P, otherwise the annulled delay slot
	! below selects the _AIUS variant.
4349	rd	%asi, %o3
4350	wr	%g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
4351	cmp	%o3, ASI_P
4352	bne,a	%ncc, .algnblk
4353	wr	%g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
4354
4355.algnblk:
4356	andcc	%o0, 0x3f, %o3		! is block aligned?
4357	bz,pt	%ncc, .bzero_blk
4358	sub	%o3, 0x40, %o3		! -(bytes till block aligned)
4359	add	%o1, %o3, %o1		! o1 is the remainder
4360
4361	! Clear -(%o3) bytes till block aligned
43621:
4363	stxa	%g0, [%o0]%asi
4364	addcc	%o3, 8, %o3
4365	bl,pt	%ncc, 1b
4366	add	%o0, 8, %o0
4367
4368.bzero_blk:
4369	and	%o1, 0x3f, %o3		! calc bytes left after blk clear
4370	andn	%o1, 0x3f, %o4		! calc size of blocks in bytes
4371
4372	cmp	%o4, 0x100		! 256 bytes or more
4373	blu,pn	%ncc, 3f
4374	nop
4375
	! Clear 256 bytes (four 64-byte blocks) per iteration.  The first
	! doubleword of each of the four blocks is stored first, then the
	! remaining seven doublewords of each block.
43762:
4377	stxa	%g0, [%o0+0x0]%asi
4378	stxa	%g0, [%o0+0x40]%asi
4379	stxa	%g0, [%o0+0x80]%asi
4380	stxa	%g0, [%o0+0xc0]%asi
4381
4382	stxa	%g0, [%o0+0x8]%asi
4383	stxa	%g0, [%o0+0x10]%asi
4384	stxa	%g0, [%o0+0x18]%asi
4385	stxa	%g0, [%o0+0x20]%asi
4386	stxa	%g0, [%o0+0x28]%asi
4387	stxa	%g0, [%o0+0x30]%asi
4388	stxa	%g0, [%o0+0x38]%asi
4389
4390	stxa	%g0, [%o0+0x48]%asi
4391	stxa	%g0, [%o0+0x50]%asi
4392	stxa	%g0, [%o0+0x58]%asi
4393	stxa	%g0, [%o0+0x60]%asi
4394	stxa	%g0, [%o0+0x68]%asi
4395	stxa	%g0, [%o0+0x70]%asi
4396	stxa	%g0, [%o0+0x78]%asi
4397
4398	stxa	%g0, [%o0+0x88]%asi
4399	stxa	%g0, [%o0+0x90]%asi
4400	stxa	%g0, [%o0+0x98]%asi
4401	stxa	%g0, [%o0+0xa0]%asi
4402	stxa	%g0, [%o0+0xa8]%asi
4403	stxa	%g0, [%o0+0xb0]%asi
4404	stxa	%g0, [%o0+0xb8]%asi
4405
4406	stxa	%g0, [%o0+0xc8]%asi
4407	stxa	%g0, [%o0+0xd0]%asi
4408	stxa	%g0, [%o0+0xd8]%asi
4409	stxa	%g0, [%o0+0xe0]%asi
4410	stxa	%g0, [%o0+0xe8]%asi
4411	stxa	%g0, [%o0+0xf0]%asi
4412	stxa	%g0, [%o0+0xf8]%asi
4413
4414	sub	%o4, 0x100, %o4
4415	cmp	%o4, 0x100
4416	bgu,pt	%ncc, 2b		! loop while more than 256 bytes remain
4417	add	%o0, 0x100, %o0
4418
44193:
4420	! ... check if 64 bytes to set
4421	cmp	%o4, 0x40
4422	blu	%ncc, .bzero_blk_done
4423	nop
4424
	! Clear one 64-byte block per iteration.  %o4 is always a multiple
	! of 64 here, so when bytes remain after the subcc there are at
	! least 64 of them; the loop therefore branches straight back to
	! the stores at 4 rather than repeating the size check at 3.
44254:
4426	stxa	%g0, [%o0+0x0]%asi
4427	stxa	%g0, [%o0+0x8]%asi
4428	stxa	%g0, [%o0+0x10]%asi
4429	stxa	%g0, [%o0+0x18]%asi
4430	stxa	%g0, [%o0+0x20]%asi
4431	stxa	%g0, [%o0+0x28]%asi
4432	stxa	%g0, [%o0+0x30]%asi
4433	stxa	%g0, [%o0+0x38]%asi
4434
4435	subcc	%o4, 0x40, %o4
4436	bgu,pt	%ncc, 4b
4437	add	%o0, 0x40, %o0
4438
4439.bzero_blk_done:
4440	membar	#Sync
4441	!
4442	! Undo asi register setting.
4443	!
	! Back to ASI_P if the _P block-init ASI was in use; otherwise the
	! annulled delay slot below selects ASI_USER.
4444	rd	%asi, %o4
4445	wr	%g0, ASI_P, %asi
4446	cmp	%o4, ASI_BLK_INIT_ST_QUAD_LDD_P
4447	bne,a	%ncc, .bzero_small
4448	wr	%g0, ASI_USER, %asi
4449
4450.bzero_small:
4451	! Set the remaining doubles
4452	subcc	%o3, 8, %o3		! Can we store any doubles?
4453	blu,pn	%ncc, .byteclr
4454	and	%o1, 7, %o1		! calc bytes left after doubles
4455
4456.dbclr:
4457	stxa	%g0, [%o0]%asi		! Clear the doubles
4458	subcc	%o3, 8, %o3
4459	bgeu,pt	%ncc, .dbclr
4460	add	%o0, 8, %o0
4461
4462	ba	.byteclr
4463	nop
4464
4465.wdalign:
4466	andcc	%o0, 3, %o3		! is add aligned on a word boundary
4467	bz,pn	%ncc, .wdclr
4468	andn	%o1, 3, %o3		! create word sized count in %o3
4469
4470	dec	%o1			! decrement count
4471	stba	%g0, [%o0]%asi		! clear a byte
4472	ba	.wdalign
4473	inc	%o0			! next byte
4474
4475.wdclr:
4476	sta	%g0, [%o0]%asi		! 4-byte clearing loop
4477	subcc	%o3, 4, %o3
4478	bnz,pt	%ncc, .wdclr
4479	inc	4, %o0
4480
4481	and	%o1, 3, %o1		! leftover count, if any
4482
4483.byteclr:
4484	! Set the leftover bytes
4485	brz	%o1, .bzero_exit
4486	nop
4487
44887:
4489	deccc	%o1			! byte clearing loop
4490	stba	%g0, [%o0]%asi
4491	bgu,pt	%ncc, 7b
4492	inc	%o0
4493
4494.bzero_exit:
4495	!
4496	! We're just concerned with whether t_lofault was set
4497	! when we came in. We end up here from kzero(), uzero()
4498	! or bzero(). kzero() *always* sets a lofault handler.
4499	! It ors LOFAULT_SET into %o5 to indicate it has done
4500	! this even if the value of %o5 is otherwise zero.
4501	! bzero() and uzero() set a lofault handler *only* if one
4502	! was previously set. Accordingly we need to examine
4503	! %o5 and if it is non-zero be sure to clear LOFAULT_SET
4504	! before resetting the error handler.
4505	!
4506	tst	%o5
4507	bz	%ncc, 1f
4508	andn	%o5, LOFAULT_SET, %o5
4509	membar	#Sync				! sync error barrier
4510	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
45111:
4512	retl
4513	clr	%o0			! return (0)
4514
4515	SET_SIZE(bzero)
4516#endif	/* lint */
4517